Initial commit

Bole Ma
2026-02-05 23:18:26 +01:00
commit 747c92ac6b
31 changed files with 4220 additions and 0 deletions

408
.gitignore vendored Normal file

@@ -0,0 +1,408 @@
# READ THIS BEFORE YOU REFACTOR ME
#
# setup.py uses the list of patterns in this file to decide
# what to delete, but it's not 100% sound. So, for example,
# if you delete aten/build/ because it's redundant with build/,
# aten/build/ will stop being cleaned. So be careful when
# refactoring this file!
## Model cache
.md
model_cache/
## PyTorch
.coverage
coverage.xml
.dmypy.json
.gradle
.hypothesis
.mypy_cache
.additional_ci_files
.lintrunner.private.toml
/.extracted_scripts/
**/.pytorch_specified_test_cases.csv
**/.pytorch-disabled-tests.json
*/*.pyc
*/*.so*
*/**/__pycache__
*/**/*.dylib*
*/**/*.pyc
*/**/*.pyd
*/**/*.so*
*/**/**/*.pyc
*/**/**/**/*.pyc
*/**/**/**/**/*.pyc
aten/build/
aten/src/ATen/Config.h
aten/src/ATen/cuda/CUDAConfig.h
aten/src/ATen/hip/HIPConfig.h
benchmarks/.data
caffe2/cpp_test/
dist/
docs/build/
docs/cpp/src
docs/src/**/*
docs/cpp/build
docs/cpp/source/api
docs/cpp/source/html/
docs/cpp/source/latex/
docs/source/compile/generated/
docs/source/generated/
log
usage_log.txt
usage_log*
test-reports/
test/*.bak
test/**/*.bak
test/.coverage
test/.hypothesis/
test/cpp/api/mnist
test/custom_operator/model.pt
test/debug/
test/jit_hooks/*.pt
test/data/legacy_modules.t7
test/data/*.pt
test/forward_backward_compatibility/nightly_schemas.txt
dropout_model.pt
test/generated_type_hints_smoketest.py
test/htmlcov
test/cpp_extensions/**/install
test/kernel.errors.txt
third_party/build/
third_party/nccl/
tools/coverage_plugins_package/pip-wheel-metadata/
tools/shared/_utils_internal.py
tools/fast_nvcc/wrap_nvcc.sh
tools/fast_nvcc/wrap_nvcc.bat
tools/fast_nvcc/tmp/
torch.egg-info/
torch/_C/__init__.pyi
torch/_C/_nn.pyi
torch/_C/_VariableFunctions.pyi
torch/_VF.pyi
torch/return_types.pyi
torch/nn/functional.pyi
torch/utils/data/datapipes/datapipe.pyi
torch/csrc/autograd/generated/*
torch/csrc/functionalization/generated/*
torch/csrc/lazy/generated/*.[!m]*
torch_compile_debug/
# Listed manually because some files in this directory are not generated
torch/testing/_internal/generated/annotated_fn_args.py
torch/testing/_internal/data/*.pt
torch/headeronly/version.h
torch/csrc/cudnn/cuDNN.cpp
torch/csrc/generated
torch/csrc/generic/TensorMethods.cpp
torch/csrc/inductor/aoti_torch/generated/*.cpp
torch/csrc/inductor/aoti_torch/generated/extend/*
torch/csrc/jit/generated/*
torch/csrc/jit/fuser/config.h
torch/csrc/nn/THCUNN.cpp
torch/csrc/nn/THCUNN.cwrap
torch/bin/
torch/cmake/
torch/lib/*.a*
torch/lib/*.dll*
torch/lib/*.exe*
torch/lib/*.dylib*
torch/lib/*.h
torch/lib/*.lib
torch/lib/*.pdb
torch/lib/*.so*
torch/lib/protobuf*.pc
torch/lib/build
torch/lib/caffe2/
torch/lib/cmake
torch/lib/include
torch/lib/pkgconfig
torch/lib/protoc
torch/lib/protobuf/
torch/lib/tmp_install
torch/lib/torch_shm_manager
torch/lib/site-packages/
torch/lib/python*
torch/lib64
torch/include/
torch/share/
torch/test/
torch/utils/benchmark/utils/valgrind_wrapper/callgrind.h
torch/utils/benchmark/utils/valgrind_wrapper/valgrind.h
torch/version.py
torch/_inductor/kernel/vendored_templates/*
test/inductor/test_tlx*
minifier_launcher.py
aten/src/ATen/native/transformers/hip/flash_attn/ck/fav_v3/asm_fmha_v3_bwd_configs.hpp
aten/src/ATen/native/transformers/hip/flash_attn/ck/fav_v3/mha_bwd.hip
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_fwd_d*
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd_d*
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd_convert*
aten/src/ATen/native/transformers/hip/flash_attn/ck/fwd_blob*
aten/src/ATen/native/transformers/hip/flash_attn/ck/bwd_blob*
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_fwd_api*
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd_api*
# Root level file used in CI to specify certain env configs.
# E.g., see .circleci/config.yaml
env
.circleci/scripts/COMMIT_MSG
scripts/release_notes/*.json
sccache-stats*.json
lint.json
merge_record.json
.github/scripts/nightly_source_matrix.json
# These files get copied over on invoking setup.py
torchgen/packaged/*
!torchgen/packaged/README.md
# This file is injected by ROCm build scripts to bootstrap in torch/__init__.py.
torch/_rocm_init.py
# IPython notebook checkpoints
.ipynb_checkpoints
# Editor temporaries
*.swa
*.swb
*.swc
*.swd
*.swe
*.swf
*.swg
*.swh
*.swi
*.swj
*.swk
*.swl
*.swm
*.swn
*.swo
*.swp
*~
.~lock.*
# macOS dir files
.DS_Store
# Ninja files
.ninja_deps
.ninja_log
compile_commands.json
*.egg-info/
docs/source/scripts/activation_images/
docs/source/scripts/quantization_backend_configs/
docs/source/scripts/lr_scheduler_images/
## General
# Compiled Object files
*.slo
*.lo
*.o
*.cuo
*.obj
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Compiled protocol buffers
*.pb.h
*.pb.cc
*_pb2.py
# Compiled python
*.pyc
*.pyd
# Compiled MATLAB
*.mex*
# NFS handle files
**/.nfs*
# Sublime Text settings
*.sublime-workspace
*.sublime-project
# Eclipse Project settings
*.*project
.settings
# QtCreator files
*.user
# PyCharm files
.idea
# GDB history
.gdb_history
## Caffe2
# build, distribute, and bins (+ python proto bindings)
build/
# Allow tools/build/ for build support.
!tools/build/
build_host_protoc
build_android
build_ios
.build_debug/*
.build_release/*
.build_profile/*
distribute/*
*.testbin
*.bin
cmake_build
.cmake_build
gen
.setuptools-cmake-build
.pytest_cache
aten/build/*
# Linker scripts for prioritized text optimization
cmake/linker_script.ld
# Bram
plsdontbreak
# Generated documentation
docs/_site
docs/gathered
_site
doxygen
docs/dev
# LevelDB files
*.sst
*.ldb
LOCK
CURRENT
MANIFEST-*
# generated version file
caffe2/version.py
# setup.py intermediates
.eggs
caffe2.egg-info
MANIFEST
# Atom/Watchman required file
.watchmanconfig
.watchman
# Files generated by CLion
cmake-build-debug
# BEGIN NOT-CLEAN-FILES (setup.py handles this marker. Do not change.)
#
# Below files are not deleted by "setup.py clean".
# Downloaded bazel
tools/bazel
# Visual Studio Code files
.vs
/.vscode/*
!/.vscode/extensions.json
!/.vscode/settings_recommended.json
# YouCompleteMe config file
.ycm_extra_conf.py
# Files generated when a patch is rejected
*.orig
*.rej
# Files generated by ctags
CTAGS
GTAGS
GRTAGS
GSYMS
GPATH
tags
TAGS
# ccls file
.ccls-cache/
# clang tooling storage location
.clang-format-bin
.clang-tidy-bin
.lintbin
# clangd background index
.clangd/
.cache/
# bazel symlinks
bazel-*
# xla repo
xla/
# direnv, posh-direnv
.env
.envrc
.psenvrc
# generated shellcheck directories
.shellcheck_generated*/
# zip archives
*.zip
# core dump files
**/core.[1-9]*
# Generated if you use the pre-commit script for clang-tidy
pr.diff
# coverage files
*/**/.coverage.*
# buck generated files
.buckd/
.lsp-buck-out/
.lsp.buckd/
buck-out/
# Downloaded libraries
third_party/ruy/
third_party/glog/
# Virtualenv
.venv/
venv/
# Log files
*.log
sweep/
# Android build artifacts
android/pytorch_android/.cxx
android/pytorch_android_torchvision/.cxx
# Pyre configs (for internal usage)
.pyre_configuration
.pyre_configuration.codenav
.arcconfig
.stable_pyre_client
.pyre_client
# Claude Code local configuration
CLAUDE.local.md
/test_*.py
/debug_*.py
CLAUDE_CONTEXT/
/.claude/settings.local.json

100
AMD_FIX_SUMMARY.md Normal file

@@ -0,0 +1,100 @@
# AMD GPU Monitoring Fix Summary
## Issue
The AMDMonitor class was using incorrect pyrsmi API calls. The implementation attempted to use the low-level `rocmsmi` module, which has complex initialization and function signatures.
## Solution
Updated to use the correct `rocml` high-level API from pyrsmi, based on the official example at:
`/anvme/workspace/ihpc125h-llm-profiles/pyrsmi/examples/llm_monitoring/monitor_llm_inference.py`
## Changes Made
### 1. Fixed AMDMonitor Class
**Before** (incorrect):
```python
from pyrsmi import rocmsmi
ret = self.rocmsmi.rsmi_init(0)
power_uw = self.rocmsmi.rsmi_dev_power_ave_get(self.device_id)
```
**After** (correct):
```python
from pyrsmi import rocml
self.rocml.smi_initialize()
power_watts = self.rocml.smi_get_device_average_power(self.device_id)
```
**Key API Functions**:
- `rocml.smi_initialize()` - Initialize monitoring
- `rocml.smi_get_device_average_power(device_id)` - Get power in Watts (not microwatts!)
- `rocml.smi_get_device_utilization(device_id)` - Get GPU utilization %
- `rocml.smi_get_device_memory_used(device_id)` - Get memory used in bytes
- `rocml.smi_get_device_memory_total(device_id)` - Get total memory in bytes
- `rocml.smi_get_device_temperature(device_id)` - Get temperature
- `rocml.smi_get_device_name(device_id)` - Get device name
- `rocml.smi_shutdown()` - Cleanup
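For reference, here is a minimal monitoring sketch assembled only from the functions listed above. It is an illustration under the assumption that these calls behave as described; it is not the actual `AMDMonitor` implementation in `utils/gpu_monitor.py`, and the class and method names below are made up:
```python
from pyrsmi import rocml


class MinimalAMDMonitor:
    """Illustrative wrapper around the rocml high-level API (sketch only)."""

    def __init__(self, device_id: int = 0):
        self.device_id = device_id
        rocml.smi_initialize()  # must be called before any query

    def snapshot(self) -> dict:
        used = rocml.smi_get_device_memory_used(self.device_id)    # bytes
        total = rocml.smi_get_device_memory_total(self.device_id)  # bytes
        return {
            "name": rocml.smi_get_device_name(self.device_id),
            "power_watts": rocml.smi_get_device_average_power(self.device_id),  # already in Watts
            "gpu_util_percent": rocml.smi_get_device_utilization(self.device_id),
            "memory_used_gb": used / 1e9,
            "memory_total_gb": total / 1e9,
        }

    def cleanup(self):
        rocml.smi_shutdown()


if __name__ == "__main__":
    m = MinimalAMDMonitor(0)
    print(m.snapshot())
    m.cleanup()
```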
### 2. Updated All SLURM Scripts for Apptainer
All GPU benchmark scripts now run inside the apptainer container:
**A100, H100, H200** (NVIDIA):
```bash
APPTAINER_IMAGE="/anvme/workspace/ihpc125h-llm-profiles/pytorch_25.10_tilelang.sif"
apptainer exec --nv $APPTAINER_IMAGE python run_benchmark.py ...
```
**MI300X** (AMD):
```bash
APPTAINER_IMAGE="/anvme/workspace/ihpc125h-llm-profiles/pytorch_25.10_tilelang.sif"
apptainer exec --rocm $APPTAINER_IMAGE python run_benchmark.py ...
```
Note: `--nv` for NVIDIA, `--rocm` for AMD
### 3. Updated Documentation
- README.md now mentions apptainer usage
- Updated setup instructions to use apptainer for model caching
- Added notes about container flags (--nv vs --rocm)
## Testing
To verify the AMD monitoring works:
```bash
# Inside apptainer on MI300X node
apptainer exec --rocm pytorch_25.10_tilelang.sif python -c "
from utils.gpu_monitor import AMDMonitor
m = AMDMonitor(0)
print(f'GPU: {m.get_device_name()}')
metrics = m.get_metrics()
print(f'Power: {metrics.power_watts:.2f} W')
print(f'Utilization: {metrics.gpu_utilization_percent:.1f}%')
print(f'Memory: {metrics.memory_used_gb:.2f} / {metrics.memory_total_gb:.2f} GB')
m.cleanup()
"
```
## Files Modified
1. `/anvme/workspace/ihpc125h-llm-profiles/llm-benchmark/utils/gpu_monitor.py` - Fixed AMDMonitor class
2. `/anvme/workspace/ihpc125h-llm-profiles/llm-benchmark/slurm_a100.sh` - Added apptainer
3. `/anvme/workspace/ihpc125h-llm-profiles/llm-benchmark/slurm_h100.sh` - Added apptainer
4. `/anvme/workspace/ihpc125h-llm-profiles/llm-benchmark/slurm_h200.sh` - Added apptainer
5. `/anvme/workspace/ihpc125h-llm-profiles/llm-benchmark/slurm_mi300x.sh` - Added apptainer with --rocm
6. `/anvme/workspace/ihpc125h-llm-profiles/llm-benchmark/README.md` - Updated documentation
## Key Differences: rocml vs rocmsmi
| Feature | rocml (High-level) | rocmsmi (Low-level) |
|---------|-------------------|---------------------|
| API Style | Simple functions | Complex C-style API |
| Initialization | `smi_initialize()` | `rsmi_init(0)` + error codes |
| Power | Returns Watts | Returns microwatts |
| Memory | Returns bytes | Returns bytes via enums |
| Error Handling | Returns -1 on error | Returns error codes |
| Ease of Use | Much easier | Complex |
The `rocml` module is the recommended high-level Python API for pyrsmi.

311
README.md Normal file

@@ -0,0 +1,311 @@
# LLM Benchmark Suite
A comprehensive benchmarking suite for comparing LLM performance (Qwen3-4B) across different GPU architectures: **MI300X**, **A100 80G**, **H100**, and **H200**.
## Features
- **Pretraining Benchmarks**: Separate metrics for forward, backward, and optimizer stages
- **Inference Benchmarks**: Separate metrics for prefill (TTFT) and decode (ITL) stages
- **Energy Monitoring**: GPU-specific energy and power measurement
- NVIDIA: pynvml
- AMD: pyrsmi
- **Attention Implementations**:
- FlashAttention-2 (A100, MI300X)
- FlashAttention-3 Hopper (H100, H200)
- Configurable via CLI
- **Comprehensive Metrics**:
- Tokens per second
- Energy per token
- Time to First Token (TTFT)
- Inter-Token Latency (ITL)
- End-to-End Request Latency
- GPU utilization and memory usage
## Directory Structure
```
llm-benchmark/
├── cache_model.py # Model caching script
├── benchmark_pretrain.py # Pretraining benchmark
├── benchmark_inference.py # Inference benchmark
├── run_benchmark.py # Main orchestration script
├── requirements.txt # Python dependencies
├── utils/
│ ├── gpu_monitor.py # GPU monitoring (NVIDIA & AMD)
│ ├── metrics.py # Metrics collection and reporting
│ └── attention.py # Attention implementation helpers
├── configs/
│ ├── a100.yaml
│ ├── h100.yaml
│ ├── h200.yaml
│ └── mi300x.yaml
└── results/ # Benchmark results (JSON)
```
## Setup
### 1. Container Environment
All benchmarks should be run inside the apptainer container:
```bash
# Container is located at:
/anvme/workspace/ihpc125h-llm-profiles/pytorch_25.10_tilelang.sif
```
### 2. Install Dependencies (if not using apptainer)
If you want to run directly without apptainer:
```bash
# Install Python dependencies
pip install -r requirements.txt
# For AMD GPUs, ensure ROCm and pyrsmi are installed
# For NVIDIA GPUs, ensure CUDA and pynvml are installed
```
### 3. Cache Model (Run on Head Node)
**IMPORTANT**: Run this on the head node BEFORE allocating compute nodes, as compute nodes are typically offline.
```bash
# Using apptainer (recommended)
apptainer exec --nv pytorch_25.10_tilelang.sif python cache_model.py \
--model-name Qwen/Qwen3-4B \
--cache-dir ./model_cache
# Or directly (if dependencies installed)
python cache_model.py --model-name Qwen/Qwen3-4B --cache-dir ./model_cache
```
The model will be cached to `./model_cache` in the current directory (avoiding slow NFS $HOME).
## Usage
### Quick Start
```bash
# Run both pretraining and inference benchmarks
python run_benchmark.py --mode both --model-path ./model_cache
# Run only pretraining
python run_benchmark.py --mode pretrain --num-steps 20
# Run only inference
python run_benchmark.py --mode inference --num-requests 20
```
### Detailed Usage
#### List Available GPUs
```bash
python run_benchmark.py --list-gpus
```
#### Pretraining Benchmark
```bash
python benchmark_pretrain.py \
--model-path ./model_cache \
--model-name Qwen/Qwen3-4B \
--attn-implementation auto \
--batch-size 8 \
--sequence-length 8192 \
--num-steps 10 \
--warmup-steps 3 \
--output-dir ./results
```
**Metrics Reported** (per stage: forward, backward, optimizer):
- Duration (ms)
- Tokens processed
- Throughput (tokens/s)
- Energy (J)
- Energy per token (J/token)
- Average power (W)
- Peak memory (GB)
- GPU utilization (%)
#### Inference Benchmark
```bash
python benchmark_inference.py \
--model-path ./model_cache \
--model-name Qwen/Qwen3-4B \
--attn-implementation auto \
--num-requests 10 \
--prompt-length 512 \
--generation-length 100 \
--warmup-requests 2 \
--output-dir ./results
```
**Metrics Reported**:
- **Prefill**: TTFT, throughput, energy per token
- **Decode**: ITL, throughput, energy per token
- **End-to-End**: Request latency, total throughput, total energy
### Attention Implementations
The benchmark automatically selects the optimal attention implementation based on GPU:
- **A100, MI300X**: `flash_attention_2`
- **H100, H200**: `flash_attention_3_hopper`
Override with `--attn-implementation`:
```bash
# Force FlashAttention-3 Hopper on H100
python run_benchmark.py --attn-implementation flash_attention_3_hopper
# Use SDPA instead
python run_benchmark.py --attn-implementation sdpa
```
Available options:
- `auto` - Auto-detect based on GPU
- `flash_attention_2` - FlashAttention-2 (all GPUs)
- `flash_attention_3_hopper` - FlashAttention-3 for H100/H200
- `sdpa` - PyTorch Scaled Dot Product Attention
- `eager` - Standard PyTorch attention
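For clarity, a minimal sketch of what the `auto` rule amounts to, based on the mapping above (the real selection logic is `get_default_attention` in `utils/attention.py` and is not reproduced here; the function name below is illustrative):
```python
def pick_default_attention(gpu_name: str) -> str:
    """Map a GPU product name to an attention backend (sketch of the rule above)."""
    name = gpu_name.upper()
    if "H100" in name or "H200" in name:
        return "flash_attention_3_hopper"  # Hopper GPUs
    if "A100" in name or "MI300X" in name:
        return "flash_attention_2"
    return "sdpa"  # conservative fallback for anything else
```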
## Running on SLURM
All SLURM scripts are configured to run inside the apptainer container. First cache the model on the head node:
```bash
# On head node (with internet access)
apptainer exec --nv pytorch_25.10_tilelang.sif python cache_model.py \
--model-name Qwen/Qwen3-4B \
--cache-dir ./model_cache
```
Then submit jobs:
```bash
# A100
sbatch slurm_a100.sh
# H100
sbatch slurm_h100.sh
# H200
sbatch slurm_h200.sh
# MI300X
sbatch slurm_mi300x.sh
```
**Note**:
- NVIDIA GPUs use `--nv` flag
- AMD GPUs use `--rocm` flag
## Output
Results are saved to the `--output-dir` directory (default: `./results/`):
- `pretrain_<GPU>_<ATTENTION>.json` - Pretraining metrics
- `inference_<GPU>_<ATTENTION>.json` - Inference metrics
Example output:
```
===============================================================================
PRETRAINING BENCHMARK RESULTS
===============================================================================
Model: Qwen/Qwen3-4B
GPU: NVIDIA A100 80GB
Attention: flash_attention_2
Batch Size: 8
Sequence Length: 8192
Training Steps: 10
-------------------------------------------------------------------------------
STAGE BREAKDOWN
-------------------------------------------------------------------------------
[1] FORWARD PASS
Duration: 1005.23 ms
Tokens: 163,840
Throughput: 163,012.45 tokens/s
Energy: 253.0 J
Energy per Token: 1.5443 mJ/token
[2] BACKWARD PASS
Duration: 2052.11 ms
Tokens: 163,840
Throughput: 79,857.23 tokens/s
Energy: 516.2 J
Energy per Token: 3.1513 mJ/token
[3] OPTIMIZER STEP
Duration: 153.42 ms
Tokens: 163,840
Throughput: 1,068,012.34 tokens/s
Energy: 38.4 J
Energy per Token: 0.2344 mJ/token
-------------------------------------------------------------------------------
OVERALL METRICS
-------------------------------------------------------------------------------
Total Duration: 3210.76 ms
Total Tokens: 163,840
Throughput: 51,012.45 tokens/s
Total Energy: 807.6 J
Energy per Token: 4.9300 mJ/token
===============================================================================
```
## Key Metrics Reference
### Pretraining
- **Forward**: Input processing and loss calculation
- **Backward**: Gradient computation
- **Optimizer**: Weight updates
### Inference
- **TTFT (Time to First Token)**: Prefill latency
- **ITL (Inter-Token Latency)**: Average decode time per token
- **E2E Latency**: Total request time (prefill + decode)
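Since ITL is the average decode time per generated token, the three are related (approximately) by `E2E latency ≈ TTFT + generation_length × ITL`.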
### Energy
- **Energy (J)**: Total energy consumed
- **Energy per Token (mJ/token)**: Energy efficiency metric
- **Average Power (W)**: Power consumption during stage
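As a worked example from the sample pretraining output above: 807.6 J over 163,840 tokens gives 807.6 / 163,840 ≈ 0.00493 J/token, i.e. 4.93 mJ/token, and dividing the same energy by the 3.21 s total duration gives an average power of roughly 252 W.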
## Troubleshooting
### Model Not Found
Ensure you've cached the model first:
```bash
python cache_model.py --model-name Qwen/Qwen3-4B --cache-dir ./model_cache
```
### GPU Monitoring Errors
- **NVIDIA**: Install pynvml: `pip install pynvml`
- **AMD**: Install pyrsmi: `pip install pyrsmi`
### FlashAttention-3 Not Found
For H100/H200, ensure FlashAttention-3 is installed. If not available, use:
```bash
python run_benchmark.py --attn-implementation flash_attention_2
```
### Out of Memory
Reduce batch size or sequence length:
```bash
python run_benchmark.py --batch-size 4 --sequence-length 1024
```
## Citation
If you use this benchmark suite, please cite:
- [FlashAttention-2](https://github.com/Dao-AILab/flash-attention)
- [FlashAttention-3](https://github.com/Dao-AILab/flash-attention) (for Hopper)
- [Qwen Models](https://huggingface.co/Qwen)
## License
MIT License - see LICENSE file for details

417
benchmark_inference.py Executable file

@@ -0,0 +1,417 @@
#!/usr/bin/env python3
"""
Inference Benchmark for LLM Performance Evaluation
Measures performance and energy metrics for inference workloads with
separate measurements for prefill and decode stages.
"""
import argparse
import os
import sys
import time
from pathlib import Path
from typing import Optional
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
# Add utils to path
sys.path.insert(0, str(Path(__file__).parent))
from utils.gpu_monitor import get_gpu_monitor
from utils.metrics import StageMetrics, InferenceMetrics, MetricsReporter
from utils.attention import get_default_attention, configure_model_attention, validate_attention_for_gpu
def benchmark_inference(
model_name_or_path: str,
attn_implementation: str = "auto",
num_requests: int = 10,
prompt_length: int = 512,
generation_length: int = 100,
warmup_requests: int = 2,
device: str = "cuda",
device_id: int = 0,
output_dir: Optional[str] = None,
verbose: bool = True,
):
"""
Run inference benchmark.
Args:
model_name_or_path: Path to model or HuggingFace identifier
attn_implementation: Attention implementation to use
num_requests: Number of inference requests to measure
prompt_length: Length of input prompt
generation_length: Number of tokens to generate
warmup_requests: Number of warmup requests
device: Device to use
device_id: GPU device ID
output_dir: Directory to save results
verbose: Print verbose output
"""
print("=" * 80)
print("INFERENCE BENCHMARK")
print("=" * 80)
# Initialize GPU monitor
if verbose:
print("\n[1/7] Initializing GPU monitor...")
monitor = get_gpu_monitor(device_id)
gpu_name = monitor.get_device_name()
if verbose:
print(f" GPU: {gpu_name}")
# Determine attention implementation
if attn_implementation == "auto":
attn_implementation = get_default_attention(gpu_name)
if verbose:
print(f" Auto-selected attention: {attn_implementation}")
# Validate attention for GPU
valid, warning = validate_attention_for_gpu(attn_implementation, gpu_name)
if warning and verbose:
print(f"{warning}")
# Load model
if verbose:
print(f"\n[2/7] Loading model: {model_name_or_path}")
# Determine attn_implementation parameter for model loading
load_attn = "flash_attention_2" if attn_implementation in ["flash_attention_2", "flash_attention_3_hopper"] else attn_implementation
try:
model = AutoModelForCausalLM.from_pretrained(
model_name_or_path,
torch_dtype=torch.bfloat16,
attn_implementation=load_attn,
trust_remote_code=True,
)
model = model.to(device)
# Configure attention (patch if needed for FA3)
model = configure_model_attention(model, attn_implementation, verbose=verbose)
if verbose:
total_params = sum(p.numel() for p in model.parameters())
print(f" Total parameters: {total_params:,} ({total_params/1e9:.2f}B)")
except Exception as e:
print(f"✗ Error loading model: {e}")
sys.exit(1)
# Load tokenizer
if verbose:
print(f"\n[3/7] Loading tokenizer...")
try:
tokenizer = AutoTokenizer.from_pretrained(
model_name_or_path,
trust_remote_code=True
)
except Exception as e:
print(f"✗ Error loading tokenizer: {e}")
sys.exit(1)
# Generate synthetic prompts
if verbose:
print(f"\n[4/7] Generating synthetic prompts...")
print(f" Prompt length: {prompt_length}")
print(f" Generation length: {generation_length}")
# Create random input_ids (synthetic prompts)
vocab_size = model.config.vocab_size
# We'll create one prompt and reuse it
prompt_ids = torch.randint(0, vocab_size, (1, prompt_length), device=device)
# Warmup
if verbose:
print(f"\n[5/7] Running warmup ({warmup_requests} requests)...")
model.eval()
with torch.no_grad():
for _ in range(warmup_requests):
_ = model.generate(
prompt_ids,
max_new_tokens=generation_length,
do_sample=False,
pad_token_id=tokenizer.eos_token_id
)
# Synchronize before benchmarking
torch.cuda.synchronize()
# Benchmark
if verbose:
print(f"\n[6/7] Running benchmark ({num_requests} requests)...")
# Storage for per-request metrics
prefill_times = []
decode_times = []
e2e_times = []
prefill_energies = []
decode_energies = []
e2e_energies = []
prefill_powers = []
decode_powers = []
memory_usage = []
gpu_utils = []
# For inference, we separate prefill (first token) from decode (remaining tokens)
# We'll use a custom generation loop to measure them separately
for req_idx in tqdm(range(num_requests), desc="Benchmarking"):
# === PREFILL PHASE (Time to First Token) ===
# This is the forward pass with the prompt to get the first token
monitor.start_monitoring()
torch.cuda.synchronize()
prefill_start = time.perf_counter()
with torch.no_grad():
# Forward pass with prompt
outputs = model(input_ids=prompt_ids, use_cache=True)
logits = outputs.logits
past_key_values = outputs.past_key_values
# Get first generated token
next_token_logits = logits[:, -1, :]
next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
torch.cuda.synchronize()
prefill_time = time.perf_counter() - prefill_start
prefill_energy = monitor.get_energy_consumed()
prefill_power = monitor.get_average_power()
prefill_times.append(prefill_time * 1000) # Convert to ms
prefill_energies.append(prefill_energy)
prefill_powers.append(prefill_power)
# === DECODE PHASE (Inter-Token Latency) ===
# Generate remaining tokens one by one
monitor.start_monitoring()
torch.cuda.synchronize()
decode_start = time.perf_counter()
generated_tokens = [next_token]
with torch.no_grad():
for _ in range(generation_length - 1):
# Forward pass with single token using cached keys/values
outputs = model(
input_ids=next_token,
past_key_values=past_key_values,
use_cache=True
)
logits = outputs.logits
past_key_values = outputs.past_key_values
# Get next token
next_token_logits = logits[:, -1, :]
next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
generated_tokens.append(next_token)
torch.cuda.synchronize()
decode_time = time.perf_counter() - decode_start
decode_energy = monitor.get_energy_consumed()
decode_power = monitor.get_average_power()
decode_times.append(decode_time * 1000) # Convert to ms
decode_energies.append(decode_energy)
decode_powers.append(decode_power)
# End-to-end metrics
e2e_time = prefill_time + decode_time
e2e_energy = prefill_energy + decode_energy
e2e_times.append(e2e_time * 1000) # Convert to ms
e2e_energies.append(e2e_energy)
# Get memory and utilization
metrics = monitor.get_metrics()
memory_usage.append(metrics.memory_used_gb)
gpu_utils.append(metrics.gpu_utilization_percent)
# Compute aggregated metrics
# Prefill metrics (TTFT)
prefill_duration_ms = sum(prefill_times)
prefill_energy_j = sum(prefill_energies)
prefill_tokens = prompt_length * num_requests
prefill_tps = prefill_tokens / (prefill_duration_ms / 1000)
prefill_ept = prefill_energy_j / prefill_tokens
avg_ttft_ms = sum(prefill_times) / len(prefill_times)
prefill_metrics = StageMetrics(
stage_name="prefill",
duration_ms=prefill_duration_ms,
tokens_processed=prefill_tokens,
tokens_per_second=prefill_tps,
energy_joules=prefill_energy_j,
energy_per_token=prefill_ept,
avg_power_watts=sum(prefill_powers) / len(prefill_powers),
peak_memory_gb=max(memory_usage),
avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils)
)
# Decode metrics (ITL)
decode_duration_ms = sum(decode_times)
decode_energy_j = sum(decode_energies)
decode_tokens = generation_length * num_requests
decode_tps = decode_tokens / (decode_duration_ms / 1000)
decode_ept = decode_energy_j / decode_tokens
avg_itl_ms = sum(decode_times) / len(decode_times) / generation_length
decode_metrics = StageMetrics(
stage_name="decode",
duration_ms=decode_duration_ms,
tokens_processed=decode_tokens,
tokens_per_second=decode_tps,
energy_joules=decode_energy_j,
energy_per_token=decode_ept,
avg_power_watts=sum(decode_powers) / len(decode_powers),
peak_memory_gb=max(memory_usage),
avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils)
)
# End-to-end metrics
e2e_latency_ms = sum(e2e_times) / len(e2e_times)
e2e_energy_j = sum(e2e_energies)
total_tokens = (prompt_length + generation_length) * num_requests
e2e_tps = total_tokens / (sum(e2e_times) / 1000)
e2e_ept = e2e_energy_j / total_tokens
# Create metrics object
metrics = InferenceMetrics(
model_name=model_name_or_path,
gpu_name=gpu_name,
attention_implementation=attn_implementation,
num_requests=num_requests,
prompt_length=prompt_length,
generation_length=generation_length,
prefill=prefill_metrics,
decode=decode_metrics,
e2e_latency_ms=e2e_latency_ms,
e2e_tokens_per_second=e2e_tps,
e2e_energy_joules=e2e_energy_j,
e2e_energy_per_token=e2e_ept,
ttft_ms=avg_ttft_ms,
itl_ms=avg_itl_ms
)
# Print results
if verbose:
print()
MetricsReporter.print_inference_metrics(metrics, verbose=verbose)
# Save results
if output_dir:
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
# Save JSON
json_path = output_path / f"inference_{gpu_name.replace(' ', '_')}_{attn_implementation}.json"
MetricsReporter.save_json(metrics, json_path)
# Cleanup
monitor.cleanup()
del model
torch.cuda.empty_cache()
return metrics
def main():
parser = argparse.ArgumentParser(
description="LLM Inference Benchmark",
formatter_class=argparse.RawDescriptionHelpFormatter
)
parser.add_argument(
"--model-path",
type=str,
default="./model_cache",
help="Path to cached model"
)
parser.add_argument(
"--model-name",
type=str,
default="Qwen/Qwen3-4B",
help="Model name (for reporting)"
)
parser.add_argument(
"--attn-implementation",
type=str,
default="auto",
choices=["auto", "flash_attention_2", "flash_attention_3_hopper", "sdpa", "eager"],
help="Attention implementation to use"
)
parser.add_argument(
"--num-requests",
type=int,
default=10,
help="Number of inference requests"
)
parser.add_argument(
"--prompt-length",
type=int,
default=512,
help="Prompt length in tokens"
)
parser.add_argument(
"--generation-length",
type=int,
default=100,
help="Number of tokens to generate"
)
parser.add_argument(
"--warmup-requests",
type=int,
default=2,
help="Number of warmup requests"
)
parser.add_argument(
"--device-id",
type=int,
default=0,
help="GPU device ID"
)
parser.add_argument(
"--output-dir",
type=str,
default="./results",
help="Output directory for results"
)
args = parser.parse_args()
# Set environment variables for HuggingFace cache
if Path(args.model_path).exists():
os.environ['HF_HOME'] = args.model_path
benchmark_inference(
model_name_or_path=args.model_name,
attn_implementation=args.attn_implementation,
num_requests=args.num_requests,
prompt_length=args.prompt_length,
generation_length=args.generation_length,
warmup_requests=args.warmup_requests,
device="cuda",
device_id=args.device_id,
output_dir=args.output_dir,
verbose=True
)
if __name__ == "__main__":
main()

406
benchmark_pretrain.py Executable file

@@ -0,0 +1,406 @@
#!/usr/bin/env python3
"""
Pretraining Benchmark for LLM Performance Evaluation
Measures performance and energy metrics for pretraining workloads with
separate measurements for forward, backward, and optimizer stages.
"""
import argparse
import os
import sys
import time
from pathlib import Path
from typing import Optional
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
# Add utils to path
sys.path.insert(0, str(Path(__file__).parent))
from utils.gpu_monitor import get_gpu_monitor
from utils.metrics import StageMetrics, PretrainMetrics, MetricsReporter
from utils.attention import get_default_attention, configure_model_attention, validate_attention_for_gpu
def benchmark_pretrain(
model_name_or_path: str,
attn_implementation: str = "auto",
batch_size: int = 8,
sequence_length: int = 2048,
num_steps: int = 10,
warmup_steps: int = 3,
device: str = "cuda",
device_id: int = 0,
output_dir: Optional[str] = None,
verbose: bool = True,
):
"""
Run pretraining benchmark.
Args:
model_name_or_path: Path to model or HuggingFace identifier
attn_implementation: Attention implementation to use
batch_size: Batch size for training
sequence_length: Sequence length
num_steps: Number of training steps to measure
warmup_steps: Number of warmup steps before measurement
device: Device to use
device_id: GPU device ID
output_dir: Directory to save results
verbose: Print verbose output
"""
print("=" * 80)
print("PRETRAINING BENCHMARK")
print("=" * 80)
# Initialize GPU monitor
if verbose:
print("\n[1/6] Initializing GPU monitor...")
monitor = get_gpu_monitor(device_id)
gpu_name = monitor.get_device_name()
if verbose:
print(f" GPU: {gpu_name}")
# Determine attention implementation
if attn_implementation == "auto":
attn_implementation = get_default_attention(gpu_name)
if verbose:
print(f" Auto-selected attention: {attn_implementation}")
# Validate attention for GPU
valid, warning = validate_attention_for_gpu(attn_implementation, gpu_name)
if warning and verbose:
print(f"{warning}")
# Load model
if verbose:
print(f"\n[2/6] Loading model: {model_name_or_path}")
# Determine attn_implementation parameter for model loading
load_attn = "flash_attention_2" if attn_implementation in ["flash_attention_2", "flash_attention_3_hopper"] else attn_implementation
try:
model = AutoModelForCausalLM.from_pretrained(
model_name_or_path,
torch_dtype=torch.bfloat16,
attn_implementation=load_attn,
trust_remote_code=True,
)
model = model.to(device)
# Configure attention (patch if needed for FA3)
model = configure_model_attention(model, attn_implementation, verbose=verbose)
if verbose:
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f" Total parameters: {total_params:,} ({total_params/1e9:.2f}B)")
print(f" Trainable parameters: {trainable_params:,}")
except Exception as e:
print(f"✗ Error loading model: {e}")
sys.exit(1)
# Setup optimizer
if verbose:
print(f"\n[3/6] Setting up optimizer...")
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
# Generate synthetic training data
if verbose:
print(f"\n[4/6] Generating synthetic training data...")
print(f" Batch size: {batch_size}")
print(f" Sequence length: {sequence_length}")
# Create random input_ids (synthetic data)
vocab_size = model.config.vocab_size
input_ids = torch.randint(0, vocab_size, (batch_size, sequence_length), device=device)
labels = input_ids.clone()
# Warmup
if verbose:
print(f"\n[5/6] Running warmup ({warmup_steps} steps)...")
model.train()
for _ in range(warmup_steps):
optimizer.zero_grad()
outputs = model(input_ids=input_ids, labels=labels)
loss = outputs.loss
loss.backward()
optimizer.step()
# Synchronize before benchmarking
torch.cuda.synchronize()
# Benchmark
if verbose:
print(f"\n[6/6] Running benchmark ({num_steps} steps)...")
# Storage for per-step metrics
forward_times = []
backward_times = []
optimizer_times = []
forward_energies = []
backward_energies = []
optimizer_energies = []
forward_powers = []
backward_powers = []
optimizer_powers = []
memory_usage = []
gpu_utils = []
total_tokens = batch_size * sequence_length * num_steps
for step in tqdm(range(num_steps), desc="Benchmarking"):
# === FORWARD PASS ===
monitor.start_monitoring()
torch.cuda.synchronize()
start_time = time.perf_counter()
optimizer.zero_grad()
outputs = model(input_ids=input_ids, labels=labels)
loss = outputs.loss
torch.cuda.synchronize()
forward_time = time.perf_counter() - start_time
forward_energy = monitor.get_energy_consumed()
forward_power = monitor.get_average_power()
forward_times.append(forward_time * 1000) # Convert to ms
forward_energies.append(forward_energy)
forward_powers.append(forward_power)
# === BACKWARD PASS ===
monitor.start_monitoring()
torch.cuda.synchronize()
start_time = time.perf_counter()
loss.backward()
torch.cuda.synchronize()
backward_time = time.perf_counter() - start_time
backward_energy = monitor.get_energy_consumed()
backward_power = monitor.get_average_power()
backward_times.append(backward_time * 1000) # Convert to ms
backward_energies.append(backward_energy)
backward_powers.append(backward_power)
# === OPTIMIZER STEP ===
monitor.start_monitoring()
torch.cuda.synchronize()
start_time = time.perf_counter()
optimizer.step()
torch.cuda.synchronize()
optimizer_time = time.perf_counter() - start_time
optimizer_energy = monitor.get_energy_consumed()
optimizer_power = monitor.get_average_power()
optimizer_times.append(optimizer_time * 1000) # Convert to ms
optimizer_energies.append(optimizer_energy)
optimizer_powers.append(optimizer_power)
# Get memory and utilization
metrics = monitor.get_metrics()
memory_usage.append(metrics.memory_used_gb)
gpu_utils.append(metrics.gpu_utilization_percent)
# Compute aggregated metrics
tokens_per_step = batch_size * sequence_length
# Forward metrics
forward_duration_ms = sum(forward_times)
forward_energy_j = sum(forward_energies)
forward_tokens = tokens_per_step * num_steps
forward_tps = forward_tokens / (forward_duration_ms / 1000)
forward_ept = forward_energy_j / forward_tokens
forward_metrics = StageMetrics(
stage_name="forward",
duration_ms=forward_duration_ms,
tokens_processed=forward_tokens,
tokens_per_second=forward_tps,
energy_joules=forward_energy_j,
energy_per_token=forward_ept,
avg_power_watts=sum(forward_powers) / len(forward_powers),
peak_memory_gb=max(memory_usage),
avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils)
)
# Backward metrics
backward_duration_ms = sum(backward_times)
backward_energy_j = sum(backward_energies)
backward_tokens = tokens_per_step * num_steps
backward_tps = backward_tokens / (backward_duration_ms / 1000)
backward_ept = backward_energy_j / backward_tokens
backward_metrics = StageMetrics(
stage_name="backward",
duration_ms=backward_duration_ms,
tokens_processed=backward_tokens,
tokens_per_second=backward_tps,
energy_joules=backward_energy_j,
energy_per_token=backward_ept,
avg_power_watts=sum(backward_powers) / len(backward_powers),
peak_memory_gb=max(memory_usage),
avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils)
)
# Optimizer metrics
optimizer_duration_ms = sum(optimizer_times)
optimizer_energy_j = sum(optimizer_energies)
optimizer_tokens = tokens_per_step * num_steps
optimizer_tps = optimizer_tokens / (optimizer_duration_ms / 1000)
optimizer_ept = optimizer_energy_j / optimizer_tokens
optimizer_metrics = StageMetrics(
stage_name="optimizer",
duration_ms=optimizer_duration_ms,
tokens_processed=optimizer_tokens,
tokens_per_second=optimizer_tps,
energy_joules=optimizer_energy_j,
energy_per_token=optimizer_ept,
avg_power_watts=sum(optimizer_powers) / len(optimizer_powers),
peak_memory_gb=max(memory_usage),
avg_gpu_util_percent=sum(gpu_utils) / len(gpu_utils)
)
# Overall metrics
total_duration_ms = forward_duration_ms + backward_duration_ms + optimizer_duration_ms
total_energy_j = forward_energy_j + backward_energy_j + optimizer_energy_j
total_tps = total_tokens / (total_duration_ms / 1000)
total_ept = total_energy_j / total_tokens
# Create metrics object
metrics = PretrainMetrics(
model_name=model_name_or_path,
gpu_name=gpu_name,
attention_implementation=attn_implementation,
batch_size=batch_size,
sequence_length=sequence_length,
num_steps=num_steps,
forward=forward_metrics,
backward=backward_metrics,
optimizer=optimizer_metrics,
total_duration_ms=total_duration_ms,
total_tokens=total_tokens,
total_tokens_per_second=total_tps,
total_energy_joules=total_energy_j,
total_energy_per_token=total_ept
)
# Print results
MetricsReporter.print_pretrain_metrics(metrics, verbose=verbose)
# Save results
if output_dir:
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
# Save JSON
json_path = output_path / f"pretrain_{gpu_name.replace(' ', '_')}_{attn_implementation}.json"
MetricsReporter.save_json(metrics, json_path)
# Cleanup
monitor.cleanup()
del model
torch.cuda.empty_cache()
return metrics
def main():
parser = argparse.ArgumentParser(
description="LLM Pretraining Benchmark",
formatter_class=argparse.RawDescriptionHelpFormatter
)
parser.add_argument(
"--model-path",
type=str,
default="./model_cache",
help="Path to cached model"
)
parser.add_argument(
"--model-name",
type=str,
default="Qwen/Qwen3-4B",
help="Model name (for reporting)"
)
parser.add_argument(
"--attn-implementation",
type=str,
default="auto",
choices=["auto", "flash_attention_2", "flash_attention_3_hopper", "sdpa", "eager"],
help="Attention implementation to use"
)
parser.add_argument(
"--batch-size",
type=int,
default=8,
help="Batch size"
)
parser.add_argument(
"--sequence-length",
type=int,
default=8192,
help="Sequence length"
)
parser.add_argument(
"--num-steps",
type=int,
default=10,
help="Number of training steps"
)
parser.add_argument(
"--warmup-steps",
type=int,
default=3,
help="Number of warmup steps"
)
parser.add_argument(
"--device-id",
type=int,
default=0,
help="GPU device ID"
)
parser.add_argument(
"--output-dir",
type=str,
default="./results",
help="Output directory for results"
)
args = parser.parse_args()
# Set environment variables for HuggingFace cache
if Path(args.model_path).exists():
os.environ['HF_HOME'] = args.model_path
benchmark_pretrain(
model_name_or_path=args.model_name,
attn_implementation=args.attn_implementation,
batch_size=args.batch_size,
sequence_length=args.sequence_length,
num_steps=args.num_steps,
warmup_steps=args.warmup_steps,
device="cuda",
device_id=args.device_id,
output_dir=args.output_dir,
verbose=True
)
if __name__ == "__main__":
main()

151
cache_model.py Executable file

@@ -0,0 +1,151 @@
#!/usr/bin/env python3
"""
Model Caching Script for LLM Benchmarking
This script downloads and caches the Qwen3-4B model from HuggingFace
before running benchmarks on offline compute nodes.
"""
import argparse
import os
import sys
from pathlib import Path
def cache_model(model_name: str, cache_dir: str, force: bool = False):
"""
Download and cache a HuggingFace model.
Args:
model_name: HuggingFace model identifier (e.g., "Qwen/Qwen3-4B-Instruct-2507")
cache_dir: Local directory to cache the model
force: Force re-download even if model exists
"""
try:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
except ImportError:
print("Error: transformers library not found. Please install it:")
print(" pip install transformers")
sys.exit(1)
# Create cache directory
cache_path = Path(cache_dir).resolve()
cache_path.mkdir(parents=True, exist_ok=True)
print(f"Caching model: {model_name}")
print(f"Cache directory: {cache_path}")
print("-" * 60)
# Set HuggingFace cache directory
os.environ['HF_HOME'] = str(cache_path)
# Check if model already exists
model_path = cache_path / model_name.replace("/", "--")
if model_path.exists() and not force:
print(f"Model already cached at: {model_path}")
print("Use --force to re-download")
return str(cache_path)
try:
# Download config
print("\n[1/3] Downloading model config...")
config = AutoConfig.from_pretrained(
model_name,
cache_dir=cache_path,
trust_remote_code=True
)
print(f" ✓ Config downloaded")
print(f" - Model type: {config.model_type}")
print(f" - Hidden size: {config.hidden_size}")
print(f" - Num layers: {config.num_hidden_layers}")
print(f" - Num attention heads: {config.num_attention_heads}")
# Download tokenizer
print("\n[2/3] Downloading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
model_name,
cache_dir=cache_path,
trust_remote_code=True
)
print(f" ✓ Tokenizer downloaded")
print(f" - Vocab size: {len(tokenizer)}")
print(f" - Model max length: {tokenizer.model_max_length}")
# Download model weights
print("\n[3/3] Downloading model weights...")
print(" (This may take several minutes depending on connection speed)")
model = AutoModelForCausalLM.from_pretrained(
model_name,
cache_dir=cache_path,
trust_remote_code=True,
torch_dtype="auto",
low_cpu_mem_usage=True
)
print(f" ✓ Model weights downloaded")
# Calculate total parameters
total_params = sum(p.numel() for p in model.parameters())
print(f" - Total parameters: {total_params:,} ({total_params/1e9:.2f}B)")
# Clean up model from memory
del model
print("\n" + "=" * 60)
print("✓ Model successfully cached!")
print("=" * 60)
print(f"\nCache location: {cache_path}")
print(f"\nTo use in benchmarks, set:")
print(f" --model-path {cache_path}")
print(f"\nOr set environment variable:")
print(f" export HF_HOME={cache_path}")
return str(cache_path)
except Exception as e:
print(f"\n✗ Error downloading model: {e}", file=sys.stderr)
sys.exit(1)
def main():
parser = argparse.ArgumentParser(
description="Cache HuggingFace model for offline use",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Cache model to default location
python cache_model.py
# Cache model to custom directory
python cache_model.py --cache-dir /path/to/cache
# Force re-download
python cache_model.py --force
"""
)
parser.add_argument(
"--model-name",
type=str,
default="Qwen/Qwen3-4B",
help="HuggingFace model identifier (default: Qwen/Qwen3-4B)"
)
parser.add_argument(
"--cache-dir",
type=str,
default="./model_cache",
help="Directory to cache model (default: ./model_cache in current directory)"
)
parser.add_argument(
"--force",
action="store_true",
help="Force re-download even if model exists"
)
args = parser.parse_args()
cache_model(args.model_name, args.cache_dir, args.force)
if __name__ == "__main__":
main()

26
configs/a100.yaml Normal file

@@ -0,0 +1,26 @@
# A100 Configuration
gpu_type: a100
gpu_model: "NVIDIA A100 80GB"
# Default attention implementation
default_attention: flash_attention_2
# Pretraining defaults
pretrain:
batch_size: 8
sequence_length: 8192
num_steps: 10
warmup_steps: 3
# Inference defaults
inference:
num_requests: 10
prompt_length: 512
generation_length: 100
warmup_requests: 2
# Hardware specs (for reference)
hardware:
memory_gb: 80
tdp_watts: 400
compute_capability: "8.0"

26
configs/h100.yaml Normal file

@@ -0,0 +1,26 @@
# H100 Configuration
gpu_type: h100
gpu_model: "NVIDIA H100 80GB"
# Default attention implementation
default_attention: flash_attention_3_hopper
# Pretraining defaults
pretrain:
batch_size: 8
sequence_length: 8192
num_steps: 10
warmup_steps: 3
# Inference defaults
inference:
num_requests: 10
prompt_length: 512
generation_length: 100
warmup_requests: 2
# Hardware specs (for reference)
hardware:
memory_gb: 80
tdp_watts: 700
compute_capability: "9.0"

26
configs/h200.yaml Normal file

@@ -0,0 +1,26 @@
# H200 Configuration
gpu_type: h200
gpu_model: "NVIDIA H200 141GB"
# Default attention implementation
default_attention: flash_attention_3_hopper
# Pretraining defaults
pretrain:
batch_size: 8
sequence_length: 8192
num_steps: 10
warmup_steps: 3
# Inference defaults
inference:
num_requests: 10
prompt_length: 512
generation_length: 100
warmup_requests: 2
# Hardware specs (for reference)
hardware:
memory_gb: 141
tdp_watts: 700
compute_capability: "9.0"

26
configs/mi300x.yaml Normal file

@@ -0,0 +1,26 @@
# MI300X Configuration
gpu_type: mi300x
gpu_model: "AMD Instinct MI300X"
# Default attention implementation
default_attention: flash_attention_2
# Pretraining defaults
pretrain:
batch_size: 8
sequence_length: 8192
num_steps: 10
warmup_steps: 3
# Inference defaults
inference:
num_requests: 10
prompt_length: 512
generation_length: 100
warmup_requests: 2
# Hardware specs (for reference)
hardware:
memory_gb: 192
tdp_watts: 750
compute_capability: "gfx940"

122
quick_start.sh Executable file

@@ -0,0 +1,122 @@
#!/bin/bash
# Quick Start Script for LLM Benchmark Suite
#
# This script helps you get started quickly with the benchmark suite.
# It will:
# 1. Check dependencies
# 2. Cache the model if needed
# 3. Run a quick test benchmark
#
# Usage: ./quick_start.sh [--skip-cache]
set -e # Exit on error
echo "========================================="
echo "LLM Benchmark Suite - Quick Start"
echo "========================================="
# Parse arguments
SKIP_CACHE=false
if [[ "$1" == "--skip-cache" ]]; then
SKIP_CACHE=true
fi
# Check Python
echo ""
echo "[1/5] Checking Python..."
if ! command -v python &> /dev/null; then
echo "✗ Python not found. Please install Python 3.8+"
exit 1
fi
PYTHON_VERSION=$(python --version 2>&1 | awk '{print $2}')
echo " ✓ Python $PYTHON_VERSION found"
# Check dependencies
echo ""
echo "[2/5] Checking dependencies..."
MISSING_DEPS=()
if ! python -c "import torch" 2>/dev/null; then
MISSING_DEPS+=("torch")
fi
if ! python -c "import transformers" 2>/dev/null; then
MISSING_DEPS+=("transformers")
fi
if [ ${#MISSING_DEPS[@]} -gt 0 ]; then
echo " ⚠ Missing dependencies: ${MISSING_DEPS[*]}"
echo " Installing dependencies..."
pip install -r requirements.txt
else
echo " ✓ All dependencies installed"
fi
# Check GPU
echo ""
echo "[3/5] Checking GPU..."
if python -c "import torch; assert torch.cuda.is_available()" 2>/dev/null; then
GPU_NAME=$(python -c "import torch; print(torch.cuda.get_device_name(0))")
echo " ✓ GPU found: $GPU_NAME"
else
echo " ✗ No GPU found or CUDA not available"
echo " This benchmark requires a GPU to run."
exit 1
fi
# Cache model
if [ "$SKIP_CACHE" = false ]; then
echo ""
echo "[4/5] Caching model..."
if [ -d "./model_cache" ] && [ "$(ls -A ./model_cache)" ]; then
echo " ✓ Model cache already exists at ./model_cache"
echo " To re-download, remove the directory and run again."
else
echo " Downloading Qwen/Qwen3-4B..."
echo " (This may take several minutes depending on your connection)"
python cache_model.py --model-name Qwen/Qwen3-4B --cache-dir ./model_cache
fi
else
echo ""
echo "[4/5] Skipping model cache (--skip-cache specified)"
fi
# Run quick test
echo ""
echo "[5/5] Running quick test benchmark..."
echo " This will run a minimal benchmark to verify everything works."
echo " Parameters: 2 steps, batch size 2, sequence length 512"
echo ""
python run_benchmark.py \
--mode both \
--model-path ./model_cache \
--model-name Qwen/Qwen3-4B \
--batch-size 2 \
--sequence-length 512 \
--num-steps 2 \
--num-requests 2 \
--prompt-length 256 \
--generation-length 20 \
--output-dir ./results/test
echo ""
echo "========================================="
echo "Quick Start Complete!"
echo "========================================="
echo ""
echo "Next steps:"
echo " 1. Run full benchmarks:"
echo " python run_benchmark.py --mode both"
echo ""
echo " 2. Run on different GPUs using SLURM:"
echo " sbatch slurm_a100.sh"
echo " sbatch slurm_h100.sh"
echo " sbatch slurm_h200.sh"
echo " sbatch slurm_mi300x.sh"
echo ""
echo " 3. View results:"
echo " ls -l results/"
echo ""
echo "For more information, see README.md"
echo ""

22
requirements.txt Normal file

@@ -0,0 +1,22 @@
# LLM Benchmark Suite - Requirements
# Core dependencies
torch>=2.0.0
transformers>=4.35.0
accelerate>=0.24.0
tokenizers>=0.14.0
# Attention implementations
flash-attn>=2.0.0
# GPU monitoring
pynvml>=11.5.0 # NVIDIA GPU monitoring
pyrsmi>=1.0.0 # AMD GPU monitoring
# Utilities
numpy>=1.24.0
pyyaml>=6.0
tqdm>=4.65.0
# Optional: for better performance
triton>=2.0.0


@@ -0,0 +1,37 @@
{
"model_name": "Qwen/Qwen3-4B",
"gpu_name": "NVIDIA A100-SXM4-80GB",
"attention_implementation": "flash_attention_2",
"num_requests": 10,
"prompt_length": 512,
"generation_length": 100,
"prefill": {
"stage_name": "prefill",
"duration_ms": 475.62581300735474,
"tokens_processed": 5120,
"tokens_per_second": 10764.76477932628,
"energy_joules": 21.409000039100647,
"energy_per_token": 0.004181445320136845,
"avg_power_watts": 68.91171083870925,
"peak_memory_gb": 45.87115478515625,
"avg_gpu_util_percent": 38.1
},
"decode": {
"stage_name": "decode",
"duration_ms": 41460.768724791706,
"tokens_processed": 1000,
"tokens_per_second": 24.119186179055195,
"energy_joules": 4684.697999954224,
"energy_per_token": 4.684697999954223,
"avg_power_watts": 112.85507087682042,
"peak_memory_gb": 45.87115478515625,
"avg_gpu_util_percent": 38.1
},
"e2e_latency_ms": 4193.639453779906,
"e2e_tokens_per_second": 145.93529242204605,
"e2e_energy_joules": 4706.106999993324,
"e2e_energy_per_token": 0.768971732025053,
"ttft_ms": 47.562581300735474,
"itl_ms": 41.460768724791706,
"timestamp": 1768519487.5402663
}


@@ -0,0 +1,47 @@
{
"model_name": "Qwen/Qwen3-4B",
"gpu_name": "NVIDIA A100-SXM4-80GB",
"attention_implementation": "flash_attention_2",
"batch_size": 3,
"sequence_length": 2048,
"num_steps": 10,
"forward": {
"stage_name": "forward",
"duration_ms": 3359.0412912890315,
"tokens_processed": 61440,
"tokens_per_second": 18290.933237210196,
"energy_joules": 1292.2280000448227,
"energy_per_token": 0.021032356771562868,
"avg_power_watts": 387.19580415542595,
"peak_memory_gb": 79.66021728515625,
"avg_gpu_util_percent": 97.8
},
"backward": {
"stage_name": "backward",
"duration_ms": 6954.944152384996,
"tokens_processed": 61440,
"tokens_per_second": 8834.003358449821,
"energy_joules": 2729.588000059128,
"energy_per_token": 0.0444268880217957,
"avg_power_watts": 394.24766095856324,
"peak_memory_gb": 79.66021728515625,
"avg_gpu_util_percent": 97.8
},
"optimizer": {
"stage_name": "optimizer",
"duration_ms": 1153.845101594925,
"tokens_processed": 61440,
"tokens_per_second": 53248.048559614595,
"energy_joules": 362.6529998779297,
"energy_per_token": 0.005902555336554845,
"avg_power_watts": 299.1223537953503,
"peak_memory_gb": 79.66021728515625,
"avg_gpu_util_percent": 97.8
},
"total_duration_ms": 11467.830545268953,
"total_tokens": 61440,
"total_tokens_per_second": 5357.595733340081,
"total_energy_joules": 4384.46899998188,
"total_energy_per_token": 0.07136180012991342,
"timestamp": 1768519431.5985208
}


@@ -0,0 +1,37 @@
{
"model_name": "Qwen/Qwen3-4B",
"gpu_name": "NVIDIA H100",
"attention_implementation": "flash_attention_3_hopper",
"num_requests": 10,
"prompt_length": 512,
"generation_length": 100,
"prefill": {
"stage_name": "prefill",
"duration_ms": 323.99015384726226,
"tokens_processed": 5120,
"tokens_per_second": 15802.949377324925,
"energy_joules": 17.092000007629395,
"energy_per_token": 0.0033382812514901163,
"avg_power_watts": 93.64442380045372,
"peak_memory_gb": 46.02825927734375,
"avg_gpu_util_percent": 40.0
},
"decode": {
"stage_name": "decode",
"duration_ms": 30513.75844143331,
"tokens_processed": 1000,
"tokens_per_second": 32.772101867403634,
"energy_joules": 4915.5139999985695,
"energy_per_token": 4.915513999998569,
"avg_power_watts": 161.199160874206,
"peak_memory_gb": 46.02825927734375,
"avg_gpu_util_percent": 40.0
},
"e2e_latency_ms": 3083.7748595280573,
"e2e_tokens_per_second": 198.4580677506596,
"e2e_energy_joules": 4932.606000006199,
"e2e_energy_per_token": 0.8059813725500325,
"ttft_ms": 32.399015384726226,
"itl_ms": 30.51375844143331,
"timestamp": 1768541839.3186588
}


@@ -0,0 +1,47 @@
{
"model_name": "Qwen/Qwen3-4B",
"gpu_name": "NVIDIA H100",
"attention_implementation": "flash_attention_3_hopper",
"batch_size": 3,
"sequence_length": 2048,
"num_steps": 10,
"forward": {
"stage_name": "forward",
"duration_ms": 1748.5067250672728,
"tokens_processed": 61440,
"tokens_per_second": 35138.55515633555,
"energy_joules": 946.9269999563694,
"energy_per_token": 0.015412223306581534,
"avg_power_watts": 501.76439870614394,
"peak_memory_gb": 76.45208740234375,
"avg_gpu_util_percent": 97.0
},
"backward": {
"stage_name": "backward",
"duration_ms": 3761.718863155693,
"tokens_processed": 61440,
"tokens_per_second": 16332.959010248362,
"energy_joules": 1904.104000031948,
"energy_per_token": 0.030991276042186655,
"avg_power_watts": 491.250130606127,
"peak_memory_gb": 76.45208740234375,
"avg_gpu_util_percent": 97.0
},
"optimizer": {
"stage_name": "optimizer",
"duration_ms": 896.0564862936735,
"tokens_processed": 61440,
"tokens_per_second": 68567.1059133025,
"energy_joules": 349.722000002861,
"energy_per_token": 0.0056920898437965665,
"avg_power_watts": 356.92130879075387,
"peak_memory_gb": 76.45208740234375,
"avg_gpu_util_percent": 97.0
},
"total_duration_ms": 6406.282074516639,
"total_tokens": 61440,
"total_tokens_per_second": 9590.586128637759,
"total_energy_joules": 3200.7529999911785,
"total_energy_per_token": 0.052095589192564754,
"timestamp": 1768541796.4011748
}


@@ -0,0 +1,37 @@
{
"model_name": "Qwen/Qwen3-4B",
"gpu_name": "NVIDIA H100",
"attention_implementation": "sdpa",
"num_requests": 10,
"prompt_length": 512,
"generation_length": 100,
"prefill": {
"stage_name": "prefill",
"duration_ms": 253.97859653458,
"tokens_processed": 5120,
"tokens_per_second": 20159.179040517676,
"energy_joules": 0.0,
"energy_per_token": 0.0,
"avg_power_watts": 0.0,
"peak_memory_gb": 46.01458740234375,
"avg_gpu_util_percent": 48.8
},
"decode": {
"stage_name": "decode",
"duration_ms": 23519.252635538578,
"tokens_processed": 1000,
"tokens_per_second": 42.51835785330007,
"energy_joules": 4544.901999980211,
"energy_per_token": 4.544901999980211,
"avg_power_watts": 192.5432634001641,
"peak_memory_gb": 46.01458740234375,
"avg_gpu_util_percent": 48.8
},
"e2e_latency_ms": 2377.323123207316,
"e2e_tokens_per_second": 257.43240118504923,
"e2e_energy_joules": 4544.901999980211,
"e2e_energy_per_token": 0.7426310457484006,
"ttft_ms": 25.397859653458,
"itl_ms": 23.519252635538578,
"timestamp": 1769149269.5228984
}


@@ -0,0 +1,47 @@
{
"model_name": "Qwen/Qwen3-4B",
"gpu_name": "NVIDIA H100",
"attention_implementation": "sdpa",
"batch_size": 3,
"sequence_length": 2048,
"num_steps": 10,
"forward": {
"stage_name": "forward",
"duration_ms": 1790.2467511594296,
"tokens_processed": 61440,
"tokens_per_second": 34319.29143857359,
"energy_joules": 981.029000043869,
"energy_per_token": 0.01596726888092235,
"avg_power_watts": 520.9058508009567,
"peak_memory_gb": 76.45208740234375,
"avg_gpu_util_percent": 100.0
},
"backward": {
"stage_name": "backward",
"duration_ms": 3854.5540031045675,
"tokens_processed": 61440,
"tokens_per_second": 15939.587290906931,
"energy_joules": 1953.71099999547,
"energy_per_token": 0.03179868164055127,
"avg_power_watts": 491.5443624439596,
"peak_memory_gb": 76.45208740234375,
"avg_gpu_util_percent": 100.0
},
"optimizer": {
"stage_name": "optimizer",
"duration_ms": 899.9840868636966,
"tokens_processed": 61440,
"tokens_per_second": 68267.87372886644,
"energy_joules": 365.9209999740124,
"energy_per_token": 0.005955745442285358,
"avg_power_watts": 377.8756124501158,
"peak_memory_gb": 76.45208740234375,
"avg_gpu_util_percent": 100.0
},
"total_duration_ms": 6544.784841127694,
"total_tokens": 61440,
"total_tokens_per_second": 9387.627170553957,
"total_energy_joules": 3300.6610000133514,
"total_energy_per_token": 0.053721695963758975,
"timestamp": 1769149234.99943
}

View File

@@ -0,0 +1,37 @@
{
"model_name": "Qwen/Qwen3-4B",
"gpu_name": "NVIDIA H200",
"attention_implementation": "flash_attention_3_hopper",
"num_requests": 10,
"prompt_length": 512,
"generation_length": 100,
"prefill": {
"stage_name": "prefill",
"duration_ms": 323.8773119999223,
"tokens_processed": 5120,
"tokens_per_second": 15808.455270868828,
"energy_joules": 98.1449999999968,
"energy_per_token": 0.019168945312499373,
"avg_power_watts": 250.96736239598317,
"peak_memory_gb": 46.1302490234375,
"avg_gpu_util_percent": 32.2
},
"decode": {
"stage_name": "decode",
"duration_ms": 30558.618001000013,
"tokens_processed": 1000,
"tokens_per_second": 32.72399294913388,
"energy_joules": 4828.459999999999,
"energy_per_token": 4.828459999999999,
"avg_power_watts": 157.61927190444868,
"peak_memory_gb": 46.1302490234375,
"avg_gpu_util_percent": 32.2
},
"e2e_latency_ms": 3088.2495312999936,
"e2e_tokens_per_second": 198.17051497855476,
"e2e_energy_joules": 4926.604999999996,
"e2e_energy_per_token": 0.8050008169934634,
"ttft_ms": 32.38773119999223,
"itl_ms": 30.558618001000013,
"timestamp": 1768541964.4743361
}

View File

@@ -0,0 +1,47 @@
{
"model_name": "Qwen/Qwen3-4B",
"gpu_name": "NVIDIA H200",
"attention_implementation": "flash_attention_3_hopper",
"batch_size": 3,
"sequence_length": 2048,
"num_steps": 10,
"forward": {
"stage_name": "forward",
"duration_ms": 1605.9521619997668,
"tokens_processed": 61440,
"tokens_per_second": 38257.67756587068,
"energy_joules": 817.7539999999863,
"energy_per_token": 0.01330979817708311,
"avg_power_watts": 476.6091506406698,
"peak_memory_gb": 76.5540771484375,
"avg_gpu_util_percent": 95.1
},
"backward": {
"stage_name": "backward",
"duration_ms": 3448.8081949999696,
"tokens_processed": 61440,
"tokens_per_second": 17814.849804948502,
"energy_joules": 1765.182000000008,
"energy_per_token": 0.02873017578125013,
"avg_power_watts": 498.84691252245983,
"peak_memory_gb": 76.5540771484375,
"avg_gpu_util_percent": 95.1
},
"optimizer": {
"stage_name": "optimizer",
"duration_ms": 545.701982000196,
"tokens_processed": 61440,
"tokens_per_second": 112588.92587268984,
"energy_joules": 332.4770000000135,
"energy_per_token": 0.005411409505208553,
"avg_power_watts": 521.4900438388863,
"peak_memory_gb": 76.5540771484375,
"avg_gpu_util_percent": 95.1
},
"total_duration_ms": 5600.462338999932,
"total_tokens": 61440,
"total_tokens_per_second": 10970.522839186035,
"total_energy_joules": 2915.4130000000077,
"total_energy_per_token": 0.047451383463541795,
"timestamp": 1768541921.6000674
}

View File

@@ -0,0 +1,37 @@
{
"model_name": "Qwen/Qwen3-4B",
"gpu_name": "NVIDIA H200",
"attention_implementation": "sdpa",
"num_requests": 10,
"prompt_length": 512,
"generation_length": 100,
"prefill": {
"stage_name": "prefill",
"duration_ms": 247.9969559935853,
"tokens_processed": 5120,
"tokens_per_second": 20645.414696672466,
"energy_joules": 73.83399999141693,
"energy_per_token": 0.014420703123323619,
"avg_power_watts": 222.33737204549297,
"peak_memory_gb": 46.1165771484375,
"avg_gpu_util_percent": 40.0
},
"decode": {
"stage_name": "decode",
"duration_ms": 23003.622506046668,
"tokens_processed": 1000,
"tokens_per_second": 43.47141411041425,
"energy_joules": 4033.3500000089407,
"energy_per_token": 4.033350000008941,
"avg_power_watts": 174.6335604209662,
"peak_memory_gb": 46.1165771484375,
"avg_gpu_util_percent": 40.0
},
"e2e_latency_ms": 2325.1619462040253,
"e2e_tokens_per_second": 263.20747292425324,
"e2e_energy_joules": 4107.184000000358,
"e2e_energy_per_token": 0.6711084967320846,
"ttft_ms": 24.79969559935853,
"itl_ms": 23.003622506046668,
"timestamp": 1769149520.7919798
}

View File

@@ -0,0 +1,47 @@
{
"model_name": "Qwen/Qwen3-4B",
"gpu_name": "NVIDIA H200",
"attention_implementation": "sdpa",
"batch_size": 3,
"sequence_length": 2048,
"num_steps": 10,
"forward": {
"stage_name": "forward",
"duration_ms": 1615.8598741167225,
"tokens_processed": 61440,
"tokens_per_second": 38023.09902248482,
"energy_joules": 873.9250000119209,
"energy_per_token": 0.014224039713735693,
"avg_power_watts": 541.9081076256928,
"peak_memory_gb": 76.5540771484375,
"avg_gpu_util_percent": 100.0
},
"backward": {
"stage_name": "backward",
"duration_ms": 3462.180594098754,
"tokens_processed": 61440,
"tokens_per_second": 17746.04135460864,
"energy_joules": 1696.024000003934,
"energy_per_token": 0.027604557291730693,
"avg_power_watts": 472.8399628680292,
"peak_memory_gb": 76.5540771484375,
"avg_gpu_util_percent": 100.0
},
"optimizer": {
"stage_name": "optimizer",
"duration_ms": 551.849422918167,
"tokens_processed": 61440,
"tokens_per_second": 111334.71821915968,
"energy_joules": 316.88299998641014,
"energy_per_token": 0.005157600911237144,
"avg_power_watts": 499.2301039455484,
"peak_memory_gb": 76.5540771484375,
"avg_gpu_util_percent": 100.0
},
"total_duration_ms": 5629.889891133644,
"total_tokens": 61440,
"total_tokens_per_second": 10913.179687005982,
"total_energy_joules": 2886.832000002265,
"total_energy_per_token": 0.04698619791670353,
"timestamp": 1769149487.0005488
}

248
run_benchmark.py Executable file
View File

@@ -0,0 +1,248 @@
#!/usr/bin/env python3
"""
Main LLM Benchmark Runner
Orchestrates pretraining and inference benchmarks with auto-detection
of GPU type and configuration.
"""
import argparse
import sys
from pathlib import Path
# Import benchmark functions
import benchmark_pretrain
import benchmark_inference
from utils.gpu_monitor import get_gpu_monitor, list_available_gpus
from utils.metrics import MetricsReporter
def main():
parser = argparse.ArgumentParser(
description="LLM Benchmark Suite - Compare GPU performance for pretraining and inference",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Run both pretrain and inference benchmarks
python run_benchmark.py --mode both
# Run only pretraining benchmark
python run_benchmark.py --mode pretrain --num-steps 20
# Run inference with custom settings
python run_benchmark.py --mode inference --num-requests 20 --generation-length 200
# Use specific attention implementation
python run_benchmark.py --attn-implementation flash_attention_3_hopper
"""
)
# Model configuration
parser.add_argument(
"--model-path",
type=str,
default="./model_cache",
help="Path to cached model directory"
)
parser.add_argument(
"--model-name",
type=str,
default="Qwen/Qwen3-4B",
help="Model name for reporting"
)
# Benchmark mode
parser.add_argument(
"--mode",
type=str,
default="both",
choices=["pretrain", "inference", "both"],
help="Benchmark mode to run"
)
# Attention configuration
parser.add_argument(
"--attn-implementation",
type=str,
default="auto",
choices=["auto", "flash_attention_2", "flash_attention_3_hopper", "sdpa", "eager"],
help="Attention implementation (auto selects based on GPU)"
)
# Pretraining parameters
pretrain_group = parser.add_argument_group("pretraining parameters")
pretrain_group.add_argument(
"--batch-size",
type=int,
default=3,
help="Batch size for pretraining"
)
pretrain_group.add_argument(
"--sequence-length",
type=int,
default=2048,
help="Sequence length for pretraining"
)
pretrain_group.add_argument(
"--num-steps",
type=int,
default=10,
help="Number of training steps"
)
pretrain_group.add_argument(
"--warmup-steps",
type=int,
default=3,
help="Number of warmup steps"
)
# Inference parameters
inference_group = parser.add_argument_group("inference parameters")
inference_group.add_argument(
"--num-requests",
type=int,
default=10,
help="Number of inference requests"
)
inference_group.add_argument(
"--prompt-length",
type=int,
default=512,
help="Prompt length in tokens"
)
inference_group.add_argument(
"--generation-length",
type=int,
default=100,
help="Number of tokens to generate"
)
inference_group.add_argument(
"--warmup-requests",
type=int,
default=2,
help="Number of warmup requests"
)
# General parameters
parser.add_argument(
"--device-id",
type=int,
default=0,
help="GPU device ID"
)
parser.add_argument(
"--output-dir",
type=str,
default="./results",
help="Output directory for results"
)
parser.add_argument(
"--list-gpus",
action="store_true",
help="List available GPUs and exit"
)
args = parser.parse_args()
# List GPUs if requested
if args.list_gpus:
print("Available GPUs:")
gpus = list_available_gpus()
if not gpus:
print(" No GPUs found!")
else:
for gpu in gpus:
print(f" {gpu}")
return
# Print header
print("=" * 80)
print("LLM BENCHMARK SUITE")
print("=" * 80)
print(f"\nModel: {args.model_name}")
print(f"Model Path: {args.model_path}")
print(f"Mode: {args.mode}")
print(f"Attention: {args.attn_implementation}")
print(f"Output Directory: {args.output_dir}")
# Detect GPU
print("\nDetecting GPU...")
try:
monitor = get_gpu_monitor(args.device_id)
gpu_name = monitor.get_device_name()
print(f" GPU {args.device_id}: {gpu_name}")
monitor.cleanup()
except Exception as e:
print(f"✗ Error detecting GPU: {e}")
sys.exit(1)
# Create output directory
output_path = Path(args.output_dir)
output_path.mkdir(parents=True, exist_ok=True)
# Run benchmarks
pretrain_metrics = None
inference_metrics = None
if args.mode in ["pretrain", "both"]:
print("\n" + "=" * 80)
print("Running Pretraining Benchmark...")
print("=" * 80)
pretrain_metrics = benchmark_pretrain.benchmark_pretrain(
model_name_or_path=args.model_name,
attn_implementation=args.attn_implementation,
batch_size=args.batch_size,
sequence_length=args.sequence_length,
num_steps=args.num_steps,
warmup_steps=args.warmup_steps,
device="cuda",
device_id=args.device_id,
output_dir=args.output_dir,
verbose=True
)
if args.mode in ["inference", "both"]:
print("\n" + "=" * 80)
print("Running Inference Benchmark...")
print("=" * 80)
inference_metrics = benchmark_inference.benchmark_inference(
model_name_or_path=args.model_name,
attn_implementation=args.attn_implementation,
num_requests=args.num_requests,
prompt_length=args.prompt_length,
generation_length=args.generation_length,
warmup_requests=args.warmup_requests,
device="cuda",
device_id=args.device_id,
output_dir=args.output_dir,
verbose=True
)
# Summary
print("\n" + "=" * 80)
print("BENCHMARK COMPLETE")
print("=" * 80)
print(f"\nResults saved to: {output_path}")
if pretrain_metrics:
print(f"\nPretraining:")
print(f" Duration: {pretrain_metrics.total_duration_ms:.2f} ms")
print(f" Throughput: {pretrain_metrics.total_tokens_per_second:.2f} tokens/s")
print(f" Energy: {pretrain_metrics.total_energy_joules:.2f} J")
print(f" Energy/token: {pretrain_metrics.total_energy_per_token*1000:.4f} mJ/token")
if inference_metrics:
print(f"\nInference:")
print(f" TTFT: {inference_metrics.ttft_ms:.2f} ms")
print(f" ITL: {inference_metrics.itl_ms:.2f} ms/token")
print(f" Throughput: {inference_metrics.e2e_tokens_per_second:.2f} tokens/s")
print(f" Energy: {inference_metrics.e2e_energy_joules:.2f} J")
print(f" Energy/token: {inference_metrics.e2e_energy_per_token*1000:.4f} mJ/token")
if __name__ == "__main__":
main()

45
slurm_a100.sh Executable file
View File

@@ -0,0 +1,45 @@
#!/bin/bash
#SBATCH --job-name=llm_bench_a100
#SBATCH --partition=a100 # Adjust to your A100 partition name
#SBATCH --nodes=1
#SBATCH --gres=gpu:a100:1 # Request 1 A100 GPU
#SBATCH -C a100_80
#SBATCH --time=02:00:00
#SBATCH --output=logs/benchmark_a100_sdpa_%j.out
#SBATCH --error=logs/benchmark_a100_sdpa_%j.err
# Create logs directory
mkdir -p logs
# Print job info
echo "========================================="
echo "Job ID: $SLURM_JOB_ID"
echo "Job Name: $SLURM_JOB_NAME"
echo "Node: $SLURM_NODELIST"
echo "Date: $(date)"
echo "========================================="
# Set cache paths
export TRANSFORMERS_CACHE=$(pwd)/model_cache
export HF_HOME=$(pwd)/model_cache
# Path to apptainer image
APPTAINER_IMAGE="/anvme/workspace/ihpc125h-llm-profiles/pytorch_25.10_updated_ao.sif"
# Run benchmark inside apptainer
apptainer exec --nv $APPTAINER_IMAGE python run_benchmark.py \
--mode both \
--model-path ./model_cache \
--model-name Qwen/Qwen3-4B \
--attn-implementation sdpa \
--batch-size 3 \
--sequence-length 2048 \
--num-steps 10 \
--num-requests 10 \
--prompt-length 512 \
--generation-length 100 \
--output-dir ./results/a100
echo "========================================="
echo "Benchmark Complete!"
echo "========================================="

46
slurm_h100.sh Executable file
View File

@@ -0,0 +1,46 @@
#!/bin/bash
#SBATCH --job-name=llm_bench_h100
#SBATCH --partition=h100 # Adjust to your H100 partition name
#SBATCH --nodes=1
#SBATCH --gres=gpu:h100:1 # Request 1 H100 GPU
#SBATCH --time=02:00:00
#SBATCH --output=logs/benchmark_h100_%j.out
#SBATCH --error=logs/benchmark_h100_%j.err
# Create logs directory
mkdir -p logs
# Print job info
echo "========================================="
echo "Job ID: $SLURM_JOB_ID"
echo "Job Name: $SLURM_JOB_NAME"
echo "Node: $SLURM_NODELIST"
echo "Date: $(date)"
echo "========================================="
# Set cache paths
export TRANSFORMERS_CACHE=$(pwd)/model_cache
export HF_HOME=$(pwd)/model_cache
# Path to apptainer image
APPTAINER_IMAGE="/hnvme/workspace/ihpc125h-llm-profiler/pytorch_25.10_updated_ao.sif"
# Run benchmark inside apptainer (SDPA attention; the flash_attention_3_hopper flag is kept commented out below)
apptainer exec --nv $APPTAINER_IMAGE python run_benchmark.py \
--mode both \
--model-path ./model_cache \
--model-name Qwen/Qwen3-4B \
--attn-implementation sdpa \
--batch-size 3 \
--sequence-length 2048 \
--num-steps 10 \
--num-requests 10 \
--prompt-length 512 \
--generation-length 100 \
--output-dir ./results/h100_sdpa
# --attn-implementation flash_attention_3_hopper \
echo "========================================="
echo "Benchmark Complete!"
echo "========================================="

45
slurm_h200.sh Executable file
View File

@@ -0,0 +1,45 @@
#!/bin/bash
#SBATCH --job-name=llm_bench_h200
#SBATCH --partition=h200 # Adjust to your H200 partition name
#SBATCH --nodes=1
#SBATCH --gres=gpu:h200:1 # Request 1 H200 GPU
#SBATCH --time=02:00:00
#SBATCH --output=logs/benchmark_h200_%j.out
#SBATCH --error=logs/benchmark_h200_%j.err
# Create logs directory
mkdir -p logs
# Print job info
echo "========================================="
echo "Job ID: $SLURM_JOB_ID"
echo "Job Name: $SLURM_JOB_NAME"
echo "Node: $SLURM_NODELIST"
echo "Date: $(date)"
echo "========================================="
# Set cache paths
export TRANSFORMERS_CACHE=$(pwd)/model_cache
export HF_HOME=$(pwd)/model_cache
# Path to apptainer image
APPTAINER_IMAGE="/hnvme/workspace/ihpc125h-llm-profiler/pytorch_25.10_updated_ao.sif"
# Run benchmark inside apptainer (SDPA attention; the flash_attention_3_hopper flag is kept commented out below)
apptainer exec --nv $APPTAINER_IMAGE python run_benchmark.py \
--mode both \
--model-path ./model_cache \
--model-name Qwen/Qwen3-4B \
--attn-implementation sdpa \
--batch-size 3 \
--sequence-length 2048 \
--num-steps 10 \
--num-requests 10 \
--prompt-length 512 \
--generation-length 100 \
--output-dir ./results/h200_sdpa
# --attn-implementation flash_attention_3_hopper \
echo "========================================="
echo "Benchmark Complete!"
echo "========================================="

42
slurm_mi300x.sh Executable file
View File

@@ -0,0 +1,42 @@
#!/bin/bash
#SBATCH --job-name=llm_bench_mi300x
#SBATCH --nodes=1
#SBATCH --nodelist=aquavan1          # Request the MI300X node
#SBATCH --time=02:00:00
#SBATCH --output=logs/benchmark_mi300x_%j.out
#SBATCH --error=logs/benchmark_mi300x_%j.err
# Create logs directory
mkdir -p logs
# Print job info
echo "========================================="
echo "Job ID: $SLURM_JOB_ID"
echo "Job Name: $SLURM_JOB_NAME"
echo "Node: $SLURM_NODELIST"
echo "Date: $(date)"
echo "========================================="
# Set cache paths
export TRANSFORMERS_CACHE=$(pwd)/models
export HF_HOME=$(pwd)/models
# Apptainer image path (kept for reference; a writable ROCm sandbox is used below instead)
#APPTAINER_IMAGE="/home/woody/ihpc/ihpc125h/pytorch_25.10_updated_ao.sif"
apptainer exec --writable ../rocm_sandbox/ python run_benchmark.py \
--mode both \
--model-path ./model_cache \
--model-name Qwen/Qwen3-4B \
--attn-implementation sdpa \
--batch-size 3 \
--sequence-length 2048 \
--num-steps 10 \
--num-requests 10 \
--prompt-length 512 \
--generation-length 100 \
--output-dir ./results/mi300x_sdpa
echo "========================================="
echo "Benchmark Complete!"
echo "========================================="

3
utils/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
"""Utility package for LLM benchmarking."""
__version__ = "1.0.0"

295
utils/attention.py Normal file
View File

@@ -0,0 +1,295 @@
"""
Attention Implementation Helpers for LLM Benchmarking
Provides functions for configuring different attention implementations
based on GPU type.
"""
from typing import Optional
import warnings
def get_default_attention(gpu_name: str) -> str:
"""
Get default attention implementation for GPU type.
Args:
gpu_name: GPU device name (from monitoring)
Returns:
Attention implementation string
"""
gpu_lower = gpu_name.lower()
# H100/H200: FlashAttention-3 Hopper
if 'h100' in gpu_lower or 'h200' in gpu_lower:
return "flash_attention_3_hopper"
# A100, MI300X, other: FlashAttention-2
return "flash_attention_2"
def configure_model_attention(model, attn_implementation: str, verbose: bool = True):
"""
Configure model to use specified attention implementation.
This function patches the model if needed to use the specified attention.
For standard implementations like flash_attention_2, the model should already
be loaded with the correct implementation via AutoModelForCausalLM.from_pretrained().
For FlashAttention-3 Hopper, this patches the model's attention modules.
Args:
model: The loaded model
attn_implementation: Attention implementation to use
verbose: Print configuration messages
Returns:
Configured model
"""
if verbose:
print(f"Configuring attention: {attn_implementation}")
if attn_implementation == "flash_attention_3_hopper":
# Patch model to use FlashAttention-3 Hopper
try:
import flash_attn_interface
except ImportError:
raise ImportError(
"flash_attn_interface not found. This is required for FlashAttention-3.\n"
"Install with appropriate method for your system."
)
# Patch the model's attention function
_patch_fa3_hopper(model, verbose=verbose)
elif attn_implementation == "flash_attention_2":
# Model should already be loaded with FA2
if verbose:
print(" Using FlashAttention-2 (configured during model loading)")
elif attn_implementation == "sdpa":
# PyTorch Scaled Dot Product Attention
if verbose:
print(" Using PyTorch SDPA")
elif attn_implementation == "eager":
# Standard PyTorch attention
if verbose:
print(" Using eager attention")
else:
warnings.warn(f"Unknown attention implementation: {attn_implementation}")
return model
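# Usage sketch (illustrative; the model id and dtype below are assumptions,
# not necessarily what run_benchmark.py passes). Standard implementations are
# selected when the model is loaded, while FlashAttention-3 Hopper is patched
# in afterwards by this function:
#
#   import torch
#   from transformers import AutoModelForCausalLM
#   model = AutoModelForCausalLM.from_pretrained(
#       "Qwen/Qwen3-4B", attn_implementation="sdpa", torch_dtype=torch.bfloat16
#   )
#   model = configure_model_attention(model, "flash_attention_3_hopper")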
def _patch_fa3_hopper(model, verbose: bool = True):
"""
Patch model to use FlashAttention-3 Hopper.
This replaces the attention computation in the model's attention layers
with calls to flash_attn_interface.flash_attn_func().
Args:
model: The model to patch
verbose: Print patching messages
"""
import flash_attn_interface
import torch
# Counter for patched modules
num_patched = 0
# Iterate through all modules in the model
for name, module in model.named_modules():
# Look for attention modules (this will vary by model architecture)
# Common names: "self_attn", "attn", "attention"
if any(attn_name in name.lower() for attn_name in ['self_attn', 'attention']):
# Check if module has a forward method we can patch
if hasattr(module, 'forward'):
# Save original forward
original_forward = module.forward
# Create patched forward function
# Bind orig_forward and module explicitly so each patched forward closes over
# its own attention module (avoids the late-binding bug where every closure
# would otherwise see the last module visited by the loop).
def create_patched_forward(orig_forward, module):
def patched_forward(hidden_states, *args, **kwargs):
# Check if this is an attention computation
# For Qwen models, attention modules typically have q, k, v projections
if hasattr(module, 'q_proj') and hasattr(module, 'k_proj') and hasattr(module, 'v_proj'):
# Extract batch, seq_len, hidden_dim
batch_size, seq_len, hidden_dim = hidden_states.shape
# Compute Q, K, V
q = module.q_proj(hidden_states)
k = module.k_proj(hidden_states)
v = module.v_proj(hidden_states)
# Reshape for multi-head attention
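# NOTE (assumption): this reshape assumes standard multi-head attention,
# i.e. Q, K and V all use num_heads heads of size hidden_dim // num_heads.
# Models with grouped-query attention (e.g. the Qwen3 family) project K/V
# to fewer heads, so k and v would need their own head count here.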
num_heads = module.num_heads
head_dim = hidden_dim // num_heads
q = q.view(batch_size, seq_len, num_heads, head_dim)
k = k.view(batch_size, seq_len, num_heads, head_dim)
v = v.view(batch_size, seq_len, num_heads, head_dim)
# Call FlashAttention-3
# Note: flash_attn_func expects (batch, seqlen, nheads, headdim)
attn_output = flash_attn_interface.flash_attn_func(
q, k, v,
dropout_p=0.0,
softmax_scale=None, # Will use default 1/sqrt(head_dim)
causal=True, # For causal LM
)
# Reshape back
attn_output = attn_output.view(batch_size, seq_len, hidden_dim)
# Apply output projection if it exists
if hasattr(module, 'o_proj'):
attn_output = module.o_proj(attn_output)
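# Note: the original forward is called below only to learn how many
# elements its output tuple has; this re-runs the stock attention path on
# top of the FlashAttention-3 call, roughly doubling the attention work
# per layer, which will show up in timing and energy measurements.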
return (attn_output,) + (None,) * (len(orig_forward(hidden_states, *args, **kwargs)) - 1)
else:
# Not an attention module we can patch, use original
return orig_forward(hidden_states, *args, **kwargs)
return patched_forward
# Apply patch
module.forward = create_patched_forward(original_forward, module)
num_patched += 1
if verbose:
if num_patched > 0:
print(f" ✓ Patched {num_patched} attention modules to use FlashAttention-3 Hopper")
else:
warnings.warn(" ⚠ No attention modules found to patch for FlashAttention-3")
def get_attention_info(attn_implementation: str) -> dict:
"""
Get information about an attention implementation.
Args:
attn_implementation: Attention implementation string
Returns:
Dictionary with info about the implementation
"""
info = {
"flash_attention_2": {
"name": "FlashAttention-2",
"description": "Optimized attention for A100 and other GPUs",
"gpu_support": ["A100", "MI300X", "V100", "RTX"],
"memory_efficient": True,
"requires_cuda": True,
},
"flash_attention_3_hopper": {
"name": "FlashAttention-3 Hopper",
"description": "Optimized attention for H100/H200 Hopper architecture",
"gpu_support": ["H100", "H200"],
"memory_efficient": True,
"requires_cuda": True,
},
"sdpa": {
"name": "PyTorch SDPA",
"description": "PyTorch Scaled Dot Product Attention",
"gpu_support": ["All"],
"memory_efficient": True,
"requires_cuda": False,
},
"eager": {
"name": "Eager Attention",
"description": "Standard PyTorch attention implementation",
"gpu_support": ["All"],
"memory_efficient": False,
"requires_cuda": False,
},
}
return info.get(attn_implementation, {
"name": attn_implementation,
"description": "Unknown attention implementation",
"gpu_support": ["Unknown"],
"memory_efficient": False,
"requires_cuda": False,
})
def validate_attention_for_gpu(attn_implementation: str, gpu_name: str) -> tuple[bool, Optional[str]]:
"""
Validate if attention implementation is suitable for GPU.
Args:
attn_implementation: Attention implementation
gpu_name: GPU device name
Returns:
Tuple of (is_valid, warning_message)
"""
gpu_lower = gpu_name.lower()
# FlashAttention-3 Hopper validation
if attn_implementation == "flash_attention_3_hopper":
if 'h100' not in gpu_lower and 'h200' not in gpu_lower:
return False, (
f"FlashAttention-3 Hopper is optimized for H100/H200. "
f"Current GPU: {gpu_name}. Consider using flash_attention_2 instead."
)
# FlashAttention-2 on Hopper GPUs
if attn_implementation == "flash_attention_2":
if 'h100' in gpu_lower or 'h200' in gpu_lower:
return True, (
f"FlashAttention-2 will work on {gpu_name}, but FlashAttention-3 Hopper "
f"may provide better performance."
)
return True, None
if __name__ == "__main__":
"""Test attention configuration."""
print("=" * 60)
print("Attention Implementation Test")
print("=" * 60)
# Test getting default attention for different GPUs
test_gpus = [
"NVIDIA A100 80GB",
"NVIDIA H100 80GB",
"NVIDIA H200 141GB",
"AMD Instinct MI300X",
]
print("\nDefault attention implementations:")
for gpu in test_gpus:
attn = get_default_attention(gpu)
print(f" {gpu:30s}{attn}")
# Test validation
print("\nValidation tests:")
test_cases = [
("flash_attention_3_hopper", "NVIDIA H100 80GB"),
("flash_attention_3_hopper", "NVIDIA A100 80GB"),
("flash_attention_2", "NVIDIA H100 80GB"),
("flash_attention_2", "NVIDIA A100 80GB"),
]
for attn, gpu in test_cases:
valid, warning = validate_attention_for_gpu(attn, gpu)
status = "" if valid else ""
print(f" {status} {attn:30s} on {gpu:25s}")
if warning:
print(f"{warning}")
# Test getting info
print("\nAttention implementation info:")
for attn in ["flash_attention_2", "flash_attention_3_hopper", "sdpa"]:
info = get_attention_info(attn)
print(f"\n {info['name']}:")
print(f" Description: {info['description']}")
print(f" GPU Support: {', '.join(info['gpu_support'])}")
print(f" Memory Efficient: {info['memory_efficient']}")

562
utils/gpu_monitor.py Normal file
View File

@@ -0,0 +1,562 @@
"""
GPU Monitoring Infrastructure for LLM Benchmarking
Provides unified interface for monitoring both NVIDIA and AMD GPUs.
"""
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Optional, List
import warnings
@dataclass
class GPUMetrics:
"""Container for GPU metrics."""
timestamp: float
power_watts: float
gpu_utilization_percent: float
memory_used_gb: float
memory_total_gb: float
temperature_celsius: Optional[float] = None
energy_joules: Optional[float] = None # Cumulative energy
class GPUMonitor(ABC):
"""Abstract base class for GPU monitoring."""
def __init__(self, device_id: int = 0):
"""
Initialize GPU monitor.
Args:
device_id: GPU device ID to monitor
"""
self.device_id = device_id
self.start_time = None
self.start_energy = None
self.last_metrics = None
@abstractmethod
def get_metrics(self) -> GPUMetrics:
"""Get current GPU metrics."""
pass
@abstractmethod
def get_device_name(self) -> str:
"""Get GPU device name."""
pass
@abstractmethod
def cleanup(self):
"""Cleanup resources."""
pass
def start_monitoring(self):
"""Start energy monitoring session."""
self.start_time = time.time()
metrics = self.get_metrics()
self.start_energy = metrics.energy_joules if metrics.energy_joules is not None else 0.0
self.last_metrics = metrics
def get_energy_consumed(self) -> float:
"""
Get energy consumed since start_monitoring() was called.
Returns:
Energy in Joules
"""
if self.start_time is None:
raise RuntimeError("Must call start_monitoring() first")
current_metrics = self.get_metrics()
if current_metrics.energy_joules is not None:
# If GPU provides cumulative energy, use it
return current_metrics.energy_joules - self.start_energy
else:
# Otherwise, integrate power over time
elapsed_time = time.time() - self.start_time
# Use average of start and current power
avg_power = (self.last_metrics.power_watts + current_metrics.power_watts) / 2.0
return avg_power * elapsed_time
def get_average_power(self) -> float:
"""
Get average power consumption since start_monitoring().
Returns:
Average power in Watts
"""
if self.start_time is None:
raise RuntimeError("Must call start_monitoring() first")
elapsed_time = time.time() - self.start_time
if elapsed_time == 0:
return 0.0
energy = self.get_energy_consumed()
return energy / elapsed_time
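# Typical measurement pattern (sketch; mirrors the self-test at the bottom of
# this file):
#
#   monitor = get_gpu_monitor(device_id=0)
#   monitor.start_monitoring()
#   ...                                   # run the workload being measured
#   energy_j = monitor.get_energy_consumed()
#   avg_power_w = monitor.get_average_power()
#   monitor.cleanup()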
class NVIDIAMonitor(GPUMonitor):
"""NVIDIA GPU monitor using pynvml."""
def __init__(self, device_id: int = 0):
"""Initialize NVIDIA monitor."""
try:
import pynvml
self.pynvml = pynvml
except ImportError:
raise ImportError(
"pynvml not found. Install with: pip install pynvml"
)
try:
self.pynvml.nvmlInit()
self.handle = self.pynvml.nvmlDeviceGetHandleByIndex(device_id)
except Exception as e:
raise RuntimeError(f"Failed to initialize NVIDIA GPU {device_id}: {e}")
super().__init__(device_id)
def get_metrics(self) -> GPUMetrics:
"""Get current NVIDIA GPU metrics."""
try:
# Power (in milliwatts)
power_mw = self.pynvml.nvmlDeviceGetPowerUsage(self.handle)
power_watts = power_mw / 1000.0
# Utilization
util = self.pynvml.nvmlDeviceGetUtilizationRates(self.handle)
gpu_util = util.gpu
# Memory
mem_info = self.pynvml.nvmlDeviceGetMemoryInfo(self.handle)
memory_used_gb = mem_info.used / (1024**3)
memory_total_gb = mem_info.total / (1024**3)
# Temperature
try:
temp = self.pynvml.nvmlDeviceGetTemperature(
self.handle,
self.pynvml.NVML_TEMPERATURE_GPU
)
except:
temp = None
# Try to get cumulative energy (newer GPUs)
energy_joules = None
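# nvmlDeviceGetTotalEnergyConsumption returns cumulative energy in
# millijoules since the driver was last loaded (Volta and newer
# data-center GPUs); older devices raise here and fall back to power
# integration in get_energy_consumed().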
try:
energy_mj = self.pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle)
energy_joules = energy_mj / 1000.0
except:
# Not supported on this GPU, will use power integration
pass
return GPUMetrics(
timestamp=time.time(),
power_watts=power_watts,
gpu_utilization_percent=gpu_util,
memory_used_gb=memory_used_gb,
memory_total_gb=memory_total_gb,
temperature_celsius=temp,
energy_joules=energy_joules
)
except Exception as e:
raise RuntimeError(f"Failed to get NVIDIA GPU metrics: {e}")
def get_device_name(self) -> str:
"""Get NVIDIA GPU device name."""
try:
name = self.pynvml.nvmlDeviceGetName(self.handle)
if isinstance(name, bytes):
name = name.decode('utf-8')
return name
except:
return f"NVIDIA GPU {self.device_id}"
def cleanup(self):
"""Cleanup NVIDIA resources."""
try:
self.pynvml.nvmlShutdown()
except:
pass
class AMDMonitor(GPUMonitor):
"""AMD GPU monitor using rocm-smi command line tool."""
def __init__(self, device_id: int = 0):
"""Initialize AMD monitor."""
import subprocess
import shutil
# Check if rocm-smi is available
if shutil.which('rocm-smi') is None:
raise RuntimeError("rocm-smi command not found. Make sure ROCm is installed and in PATH.")
self.device_id = device_id
# Verify device exists
try:
result = subprocess.run(
['rocm-smi', '--showid'],
capture_output=True,
text=True,
timeout=5
)
if result.returncode != 0:
raise RuntimeError(f"rocm-smi failed: {result.stderr}")
except subprocess.TimeoutExpired:
raise RuntimeError("rocm-smi command timed out")
except Exception as e:
raise RuntimeError(f"Failed to initialize AMD GPU {device_id}: {e}")
super().__init__(device_id)
def _parse_detailed_output(self, output: str) -> dict:
"""Parse rocm-smi detailed output format."""
lines = output.strip().split('\n')
# Parse detailed format: GPU[X] : Metric : Value
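# Illustrative line (exact labels vary across ROCm releases):
#   GPU[0]    : Temperature (Sensor junction) (C): 41.0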
metrics = {
'temperature': None,
'power': None,
'vram_percent': None,
'gpu_percent': None,
}
device_prefix = f"GPU[{self.device_id}]"
for line in lines:
if not line.strip() or not line.startswith(device_prefix):
continue
# Split by colon
parts = line.split(':')
if len(parts) < 3:
continue
metric_name = parts[1].strip().lower()
value_str = parts[2].strip()
try:
# Temperature (Sensor junction)
if 'temperature' in metric_name and 'junction' in metric_name:
metrics['temperature'] = float(value_str)
# Power consumption
elif 'power' in metric_name and 'package' in metric_name:
metrics['power'] = float(value_str)
# GPU utilization
elif 'gpu use' in metric_name:
metrics['gpu_percent'] = float(value_str)
# VRAM usage percentage
elif 'memory allocated' in metric_name and 'vram%' in metric_name:
metrics['vram_percent'] = float(value_str)
except (ValueError, IndexError):
continue
# Validate we got the required metrics
if metrics['temperature'] is None:
raise ValueError(f"Could not find temperature for GPU[{self.device_id}]")
if metrics['power'] is None:
raise ValueError(f"Could not find power for GPU[{self.device_id}]")
if metrics['gpu_percent'] is None:
metrics['gpu_percent'] = 0.0
if metrics['vram_percent'] is None:
metrics['vram_percent'] = 0.0
return metrics
def _get_memory_info(self) -> tuple:
"""Get memory usage in GB using rocm-smi --showmeminfo."""
import subprocess
try:
result = subprocess.run(
['rocm-smi', '--showmeminfo', 'vram', '-d', str(self.device_id)],
capture_output=True,
text=True,
timeout=5
)
if result.returncode != 0:
return 0.0, 0.0
# Parse output for memory info
# Looking for lines like "GPU memory used: X MiB" and "GPU memory total: Y MiB"
used_gb = 0.0
total_gb = 0.0
for line in result.stdout.split('\n'):
if 'Used' in line or 'used' in line:
# Extract number
parts = line.split()
for i, part in enumerate(parts):
if part.replace('.', '').isdigit():
used_bytes = float(part)
# Check if next part indicates unit
if i + 1 < len(parts):
unit = parts[i + 1].lower()
if 'mb' in unit or 'mib' in unit:
used_gb = used_bytes / 1024
elif 'gb' in unit or 'gib' in unit:
used_gb = used_bytes
elif 'kb' in unit or 'kib' in unit:
used_gb = used_bytes / (1024 * 1024)
break
if 'Total' in line or 'total' in line:
parts = line.split()
for i, part in enumerate(parts):
if part.replace('.', '').isdigit():
total_bytes = float(part)
if i + 1 < len(parts):
unit = parts[i + 1].lower()
if 'mb' in unit or 'mib' in unit:
total_gb = total_bytes / 1024
elif 'gb' in unit or 'gib' in unit:
total_gb = total_bytes
elif 'kb' in unit or 'kib' in unit:
total_gb = total_bytes / (1024 * 1024)
break
return used_gb, total_gb
except Exception:
return 0.0, 0.0
def get_metrics(self) -> GPUMetrics:
"""Get current AMD GPU metrics."""
import subprocess
try:
# Query temperature, power, utilization, and memory-use percentage via rocm-smi
result = subprocess.run(
['rocm-smi', '--showid', '--showtemp', '--showpower', '--showuse', '--showmemuse'],
capture_output=True,
text=True,
timeout=5
)
if result.returncode != 0:
raise RuntimeError(f"rocm-smi failed: {result.stderr}")
metrics = self._parse_detailed_output(result.stdout)
# Get detailed memory info
memory_used_gb, memory_total_gb = self._get_memory_info()
# If we couldn't get absolute memory, estimate from percentage
if memory_total_gb == 0.0:
# MI300X has ~192GB, MI250X has ~128GB - use a reasonable default
memory_total_gb = 192.0 # Assume MI300X
memory_used_gb = memory_total_gb * (metrics['vram_percent'] / 100.0)
return GPUMetrics(
timestamp=time.time(),
power_watts=metrics['power'],
gpu_utilization_percent=metrics['gpu_percent'],
memory_used_gb=memory_used_gb,
memory_total_gb=memory_total_gb,
temperature_celsius=metrics['temperature'],
energy_joules=None # Will use power integration
)
except subprocess.TimeoutExpired:
raise RuntimeError("rocm-smi command timed out")
except Exception as e:
raise RuntimeError(f"Failed to get AMD GPU metrics: {e}")
def get_device_name(self) -> str:
"""Get AMD GPU device name."""
import subprocess
try:
result = subprocess.run(
['rocm-smi', '--showproductname', '-d', str(self.device_id)],
capture_output=True,
text=True,
timeout=5
)
if result.returncode == 0:
# Parse output to find device name
for line in result.stdout.split('\n'):
if 'Card series' in line or 'Card model' in line or 'name' in line.lower():
parts = line.split(':')
if len(parts) > 1:
return parts[1].strip()
except Exception:
pass
return f"AMD GPU {self.device_id}"
def cleanup(self):
"""Cleanup AMD resources."""
# No cleanup needed for command-line tool
pass
def get_gpu_monitor(device_id: int = 0) -> GPUMonitor:
"""
Factory function to automatically detect and create appropriate GPU monitor.
Args:
device_id: GPU device ID to monitor
Returns:
GPUMonitor instance (NVIDIAMonitor or AMDMonitor)
Raises:
RuntimeError: If no supported GPU is found
"""
# Try AMD first (rocm-smi based) as it's more commonly available
try:
return AMDMonitor(device_id)
except:
pass
# Try NVIDIA if AMD fails
try:
return NVIDIAMonitor(device_id)
except:
pass
# Try to import torch to detect GPU type as last resort
try:
import torch
if torch.cuda.is_available():
# Check if it's NVIDIA or AMD
device_name = torch.cuda.get_device_name(device_id).lower()
if 'nvidia' in device_name or 'tesla' in device_name or 'geforce' in device_name:
return NVIDIAMonitor(device_id)
elif 'amd' in device_name or 'radeon' in device_name or 'mi300' in device_name or 'mi200' in device_name:
return AMDMonitor(device_id)
except:
pass
raise RuntimeError(
"No supported GPU found. Make sure either ROCm (rocm-smi) or NVIDIA (pynvml) drivers are installed."
)
def list_available_gpus() -> List[str]:
"""
List all available GPUs.
Returns:
List of GPU names
"""
gpus = []
# Try NVIDIA
try:
import pynvml
pynvml.nvmlInit()
device_count = pynvml.nvmlDeviceGetCount()
for i in range(device_count):
handle = pynvml.nvmlDeviceGetHandleByIndex(i)
name = pynvml.nvmlDeviceGetName(handle)
if isinstance(name, bytes):
name = name.decode('utf-8')
gpus.append(f"GPU {i}: {name} (NVIDIA)")
pynvml.nvmlShutdown()
except:
pass
# Try AMD with rocm-smi
try:
import subprocess
import shutil
if shutil.which('rocm-smi'):
result = subprocess.run(
['rocm-smi', '--showid'],
capture_output=True,
text=True,
timeout=5
)
if result.returncode == 0:
# Parse device IDs from output
for line in result.stdout.split('\n'):
if not line.strip() or line.startswith('=') or 'Device' in line or 'ROCm' in line:
continue
parts = line.split()
if parts and parts[0].isdigit():
device_id = int(parts[0])
# Try to get device name
name_result = subprocess.run(
['rocm-smi', '--showproductname', '-d', str(device_id)],
capture_output=True,
text=True,
timeout=5
)
name = f"AMD GPU"
if name_result.returncode == 0:
for name_line in name_result.stdout.split('\n'):
if 'Card' in name_line or 'name' in name_line.lower():
parts_name = name_line.split(':')
if len(parts_name) > 1:
name = parts_name[1].strip()
break
gpus.append(f"GPU {device_id}: {name} (AMD)")
except:
pass
return gpus
if __name__ == "__main__":
"""Test GPU monitoring."""
print("=" * 60)
print("GPU Monitoring Test")
print("=" * 60)
# List available GPUs
print("\nAvailable GPUs:")
gpus = list_available_gpus()
if not gpus:
print(" No GPUs found!")
exit(1)
for gpu in gpus:
print(f" {gpu}")
# Test monitoring
print("\nTesting GPU 0 monitoring...")
try:
monitor = get_gpu_monitor(0)
print(f" Device: {monitor.get_device_name()}")
# Get metrics
metrics = monitor.get_metrics()
print(f"\nCurrent Metrics:")
print(f" Power: {metrics.power_watts:.2f} W")
print(f" GPU Utilization: {metrics.gpu_utilization_percent:.1f}%")
print(f" Memory: {metrics.memory_used_gb:.2f} / {metrics.memory_total_gb:.2f} GB")
if metrics.temperature_celsius:
print(f" Temperature: {metrics.temperature_celsius:.1f}°C")
# Test energy monitoring
print("\nTesting energy monitoring (5 seconds)...")
monitor.start_monitoring()
time.sleep(5)
energy = monitor.get_energy_consumed()
avg_power = monitor.get_average_power()
print(f" Energy consumed: {energy:.2f} J")
print(f" Average power: {avg_power:.2f} W")
monitor.cleanup()
print("\n✓ Monitoring test successful!")
except Exception as e:
print(f"\n✗ Error: {e}")
exit(1)

473
utils/metrics.py Normal file
View File

@@ -0,0 +1,473 @@
"""
Metrics Collection and Reporting for LLM Benchmarking
Provides centralized metrics collection, aggregation, and reporting.
"""
import json
import csv
from dataclasses import dataclass, asdict, field
from typing import Dict, List, Optional, Any
from pathlib import Path
import time
@dataclass
class StageMetrics:
"""Metrics for a specific stage (e.g., forward pass, prefill, etc.)."""
stage_name: str
duration_ms: float
tokens_processed: int
tokens_per_second: float
energy_joules: float
energy_per_token: float
avg_power_watts: float
peak_memory_gb: float
avg_gpu_util_percent: float
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary."""
return asdict(self)
@dataclass
class PretrainMetrics:
"""Metrics for pretraining benchmark."""
model_name: str
gpu_name: str
attention_implementation: str
batch_size: int
sequence_length: int
num_steps: int
# Stage-specific metrics
forward: StageMetrics
backward: StageMetrics
optimizer: StageMetrics
# Overall metrics
total_duration_ms: float
total_tokens: int
total_tokens_per_second: float
total_energy_joules: float
total_energy_per_token: float
timestamp: float = field(default_factory=time.time)
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary."""
return {
"model_name": self.model_name,
"gpu_name": self.gpu_name,
"attention_implementation": self.attention_implementation,
"batch_size": self.batch_size,
"sequence_length": self.sequence_length,
"num_steps": self.num_steps,
"forward": self.forward.to_dict(),
"backward": self.backward.to_dict(),
"optimizer": self.optimizer.to_dict(),
"total_duration_ms": self.total_duration_ms,
"total_tokens": self.total_tokens,
"total_tokens_per_second": self.total_tokens_per_second,
"total_energy_joules": self.total_energy_joules,
"total_energy_per_token": self.total_energy_per_token,
"timestamp": self.timestamp,
}
@dataclass
class InferenceMetrics:
"""Metrics for inference benchmark."""
model_name: str
gpu_name: str
attention_implementation: str
num_requests: int
prompt_length: int
generation_length: int
# Stage-specific metrics
prefill: StageMetrics # Time to First Token
decode: StageMetrics # Inter-Token Latency
# End-to-end metrics
e2e_latency_ms: float
e2e_tokens_per_second: float
e2e_energy_joules: float
e2e_energy_per_token: float
# Additional metrics
ttft_ms: float # Time to First Token (prefill duration per request)
itl_ms: float # Inter-Token Latency (decode duration / num_tokens)
timestamp: float = field(default_factory=time.time)
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary."""
return {
"model_name": self.model_name,
"gpu_name": self.gpu_name,
"attention_implementation": self.attention_implementation,
"num_requests": self.num_requests,
"prompt_length": self.prompt_length,
"generation_length": self.generation_length,
"prefill": self.prefill.to_dict(),
"decode": self.decode.to_dict(),
"e2e_latency_ms": self.e2e_latency_ms,
"e2e_tokens_per_second": self.e2e_tokens_per_second,
"e2e_energy_joules": self.e2e_energy_joules,
"e2e_energy_per_token": self.e2e_energy_per_token,
"ttft_ms": self.ttft_ms,
"itl_ms": self.itl_ms,
"timestamp": self.timestamp,
}
class MetricsCollector:
"""Collects metrics during benchmark runs."""
def __init__(self):
"""Initialize metrics collector."""
self.metrics_history: List[Dict[str, Any]] = []
def add_pretrain_metrics(self, metrics: PretrainMetrics):
"""Add pretraining metrics."""
self.metrics_history.append({
"type": "pretrain",
"metrics": metrics.to_dict()
})
def add_inference_metrics(self, metrics: InferenceMetrics):
"""Add inference metrics."""
self.metrics_history.append({
"type": "inference",
"metrics": metrics.to_dict()
})
def get_all_metrics(self) -> List[Dict[str, Any]]:
"""Get all collected metrics."""
return self.metrics_history
def clear(self):
"""Clear all metrics."""
self.metrics_history.clear()
class MetricsReporter:
"""Formats and outputs benchmark results."""
@staticmethod
def print_pretrain_metrics(metrics: PretrainMetrics, verbose: bool = True):
"""Print pretraining metrics to console."""
print("\n" + "=" * 80)
print("PRETRAINING BENCHMARK RESULTS")
print("=" * 80)
print(f"\nModel: {metrics.model_name}")
print(f"GPU: {metrics.gpu_name}")
print(f"Attention: {metrics.attention_implementation}")
print(f"Batch Size: {metrics.batch_size}")
print(f"Sequence Length: {metrics.sequence_length}")
print(f"Training Steps: {metrics.num_steps}")
print("\n" + "-" * 80)
print("STAGE BREAKDOWN")
print("-" * 80)
# Forward pass
print(f"\n[1] FORWARD PASS")
MetricsReporter._print_stage_metrics(metrics.forward, verbose)
# Backward pass
print(f"\n[2] BACKWARD PASS")
MetricsReporter._print_stage_metrics(metrics.backward, verbose)
# Optimizer step
print(f"\n[3] OPTIMIZER STEP")
MetricsReporter._print_stage_metrics(metrics.optimizer, verbose)
# Overall
print("\n" + "-" * 80)
print("OVERALL METRICS")
print("-" * 80)
print(f" Total Duration: {metrics.total_duration_ms:>10.2f} ms")
print(f" Total Tokens: {metrics.total_tokens:>10,}")
print(f" Throughput: {metrics.total_tokens_per_second:>10.2f} tokens/s")
print(f" Total Energy: {metrics.total_energy_joules:>10.2f} J")
print(f" Energy per Token: {metrics.total_energy_per_token*1000:>10.4f} mJ/token")
print("=" * 80 + "\n")
@staticmethod
def print_inference_metrics(metrics: InferenceMetrics, verbose: bool = True):
"""Print inference metrics to console."""
print("\n" + "=" * 80)
print("INFERENCE BENCHMARK RESULTS")
print("=" * 80)
print(f"\nModel: {metrics.model_name}")
print(f"GPU: {metrics.gpu_name}")
print(f"Attention: {metrics.attention_implementation}")
print(f"Requests: {metrics.num_requests}")
print(f"Prompt Length: {metrics.prompt_length}")
print(f"Generation Length: {metrics.generation_length}")
print("\n" + "-" * 80)
print("STAGE BREAKDOWN")
print("-" * 80)
# Prefill
print(f"\n[1] PREFILL (Time to First Token)")
MetricsReporter._print_stage_metrics(metrics.prefill, verbose)
print(f" TTFT: {metrics.ttft_ms:>10.2f} ms")
# Decode
print(f"\n[2] DECODE (Inter-Token Latency)")
MetricsReporter._print_stage_metrics(metrics.decode, verbose)
print(f" ITL: {metrics.itl_ms:>10.2f} ms/token")
# End-to-end
print("\n" + "-" * 80)
print("END-TO-END METRICS")
print("-" * 80)
print(f" Request Latency: {metrics.e2e_latency_ms:>10.2f} ms")
print(f" Throughput: {metrics.e2e_tokens_per_second:>10.2f} tokens/s")
print(f" Total Energy: {metrics.e2e_energy_joules:>10.2f} J")
print(f" Energy per Token: {metrics.e2e_energy_per_token*1000:>10.4f} mJ/token")
print("=" * 80 + "\n")
@staticmethod
def _print_stage_metrics(stage: StageMetrics, verbose: bool = True):
"""Print metrics for a single stage."""
print(f" Duration: {stage.duration_ms:>10.2f} ms")
print(f" Tokens: {stage.tokens_processed:>10,}")
print(f" Throughput: {stage.tokens_per_second:>10.2f} tokens/s")
print(f" Energy: {stage.energy_joules:>10.2f} J")
print(f" Energy per Token: {stage.energy_per_token*1000:>10.4f} mJ/token")
if verbose:
print(f" Avg Power: {stage.avg_power_watts:>10.2f} W")
print(f" Peak Memory: {stage.peak_memory_gb:>10.2f} GB")
print(f" Avg GPU Utilization: {stage.avg_gpu_util_percent:>10.1f} %")
@staticmethod
def save_json(metrics: Any, output_path: Path):
"""
Save metrics to JSON file.
Args:
metrics: PretrainMetrics or InferenceMetrics object
output_path: Path to output JSON file
"""
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, 'w') as f:
json.dump(metrics.to_dict(), f, indent=2)
print(f"Metrics saved to: {output_path}")
@staticmethod
def save_csv(metrics_list: List[Any], output_path: Path, benchmark_type: str = "pretrain"):
"""
Save multiple metrics to CSV file for comparison.
Args:
metrics_list: List of PretrainMetrics or InferenceMetrics objects
output_path: Path to output CSV file
benchmark_type: "pretrain" or "inference"
"""
if not metrics_list:
print("No metrics to save")
return
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, 'w', newline='') as f:
if benchmark_type == "pretrain":
MetricsReporter._save_pretrain_csv(metrics_list, f)
else:
MetricsReporter._save_inference_csv(metrics_list, f)
print(f"CSV saved to: {output_path}")
@staticmethod
def _save_pretrain_csv(metrics_list: List[PretrainMetrics], file):
"""Save pretraining metrics to CSV."""
fieldnames = [
'gpu_name', 'attention_implementation', 'batch_size', 'sequence_length', 'num_steps',
'forward_duration_ms', 'forward_tokens_per_sec', 'forward_energy_j', 'forward_energy_per_token_mj',
'backward_duration_ms', 'backward_tokens_per_sec', 'backward_energy_j', 'backward_energy_per_token_mj',
'optimizer_duration_ms', 'optimizer_tokens_per_sec', 'optimizer_energy_j', 'optimizer_energy_per_token_mj',
'total_duration_ms', 'total_tokens_per_sec', 'total_energy_j', 'total_energy_per_token_mj',
'timestamp'
]
writer = csv.DictWriter(file, fieldnames=fieldnames)
writer.writeheader()
for m in metrics_list:
writer.writerow({
'gpu_name': m.gpu_name,
'attention_implementation': m.attention_implementation,
'batch_size': m.batch_size,
'sequence_length': m.sequence_length,
'num_steps': m.num_steps,
'forward_duration_ms': m.forward.duration_ms,
'forward_tokens_per_sec': m.forward.tokens_per_second,
'forward_energy_j': m.forward.energy_joules,
'forward_energy_per_token_mj': m.forward.energy_per_token * 1000,
'backward_duration_ms': m.backward.duration_ms,
'backward_tokens_per_sec': m.backward.tokens_per_second,
'backward_energy_j': m.backward.energy_joules,
'backward_energy_per_token_mj': m.backward.energy_per_token * 1000,
'optimizer_duration_ms': m.optimizer.duration_ms,
'optimizer_tokens_per_sec': m.optimizer.tokens_per_second,
'optimizer_energy_j': m.optimizer.energy_joules,
'optimizer_energy_per_token_mj': m.optimizer.energy_per_token * 1000,
'total_duration_ms': m.total_duration_ms,
'total_tokens_per_sec': m.total_tokens_per_second,
'total_energy_j': m.total_energy_joules,
'total_energy_per_token_mj': m.total_energy_per_token * 1000,
'timestamp': m.timestamp,
})
@staticmethod
def _save_inference_csv(metrics_list: List[InferenceMetrics], file):
"""Save inference metrics to CSV."""
fieldnames = [
'gpu_name', 'attention_implementation', 'num_requests', 'prompt_length', 'generation_length',
'prefill_duration_ms', 'prefill_tokens_per_sec', 'prefill_energy_j', 'prefill_energy_per_token_mj',
'ttft_ms',
'decode_duration_ms', 'decode_tokens_per_sec', 'decode_energy_j', 'decode_energy_per_token_mj',
'itl_ms',
'e2e_latency_ms', 'e2e_tokens_per_sec', 'e2e_energy_j', 'e2e_energy_per_token_mj',
'timestamp'
]
writer = csv.DictWriter(file, fieldnames=fieldnames)
writer.writeheader()
for m in metrics_list:
writer.writerow({
'gpu_name': m.gpu_name,
'attention_implementation': m.attention_implementation,
'num_requests': m.num_requests,
'prompt_length': m.prompt_length,
'generation_length': m.generation_length,
'prefill_duration_ms': m.prefill.duration_ms,
'prefill_tokens_per_sec': m.prefill.tokens_per_second,
'prefill_energy_j': m.prefill.energy_joules,
'prefill_energy_per_token_mj': m.prefill.energy_per_token * 1000,
'ttft_ms': m.ttft_ms,
'decode_duration_ms': m.decode.duration_ms,
'decode_tokens_per_sec': m.decode.tokens_per_second,
'decode_energy_j': m.decode.energy_joules,
'decode_energy_per_token_mj': m.decode.energy_per_token * 1000,
'itl_ms': m.itl_ms,
'e2e_latency_ms': m.e2e_latency_ms,
'e2e_tokens_per_sec': m.e2e_tokens_per_second,
'e2e_energy_j': m.e2e_energy_joules,
'e2e_energy_per_token_mj': m.e2e_energy_per_token * 1000,
'timestamp': m.timestamp,
})
if __name__ == "__main__":
"""Test metrics reporting."""
# Create sample pretraining metrics
forward = StageMetrics(
stage_name="forward",
duration_ms=100.5,
tokens_processed=1024,
tokens_per_second=10189.3,
energy_joules=25.3,
energy_per_token=0.0247,
avg_power_watts=251.7,
peak_memory_gb=45.2,
avg_gpu_util_percent=95.3
)
backward = StageMetrics(
stage_name="backward",
duration_ms=205.2,
tokens_processed=1024,
tokens_per_second=4991.2,
energy_joules=51.6,
energy_per_token=0.0504,
avg_power_watts=251.5,
peak_memory_gb=48.6,
avg_gpu_util_percent=97.1
)
optimizer = StageMetrics(
stage_name="optimizer",
duration_ms=15.3,
tokens_processed=1024,
tokens_per_second=66928.1,
energy_joules=3.8,
energy_per_token=0.0037,
avg_power_watts=248.4,
peak_memory_gb=48.6,
avg_gpu_util_percent=42.1
)
pretrain_metrics = PretrainMetrics(
model_name="Qwen/Qwen2.5-3B-Instruct",
gpu_name="NVIDIA A100 80GB",
attention_implementation="flash_attention_2",
batch_size=8,
sequence_length=2048,
num_steps=10,
forward=forward,
backward=backward,
optimizer=optimizer,
total_duration_ms=321.0,
total_tokens=10240,
total_tokens_per_second=31900.3,
total_energy_joules=80.7,
total_energy_per_token=0.00788
)
# Print pretrain metrics
MetricsReporter.print_pretrain_metrics(pretrain_metrics)
# Create sample inference metrics
prefill = StageMetrics(
stage_name="prefill",
duration_ms=45.2,
tokens_processed=512,
tokens_per_second=11327.4,
energy_joules=11.3,
energy_per_token=0.0221,
avg_power_watts=250.0,
peak_memory_gb=42.1,
avg_gpu_util_percent=89.2
)
decode = StageMetrics(
stage_name="decode",
duration_ms=223.5,
tokens_processed=100,
tokens_per_second=447.4,
energy_joules=55.9,
energy_per_token=0.559,
avg_power_watts=250.1,
peak_memory_gb=42.1,
avg_gpu_util_percent=62.3
)
inference_metrics = InferenceMetrics(
model_name="Qwen/Qwen2.5-3B-Instruct",
gpu_name="NVIDIA A100 80GB",
attention_implementation="flash_attention_2",
num_requests=10,
prompt_length=512,
generation_length=100,
prefill=prefill,
decode=decode,
e2e_latency_ms=268.7,
e2e_tokens_per_second=2277.9,
e2e_energy_joules=67.2,
e2e_energy_per_token=0.110,
ttft_ms=45.2,
itl_ms=2.235
)
# Print inference metrics
MetricsReporter.print_inference_metrics(inference_metrics)