Files
cocogoat/slurm_a100.sh
2026-02-05 23:18:26 +01:00

46 lines
1.3 KiB
Bash
Executable File

#!/bin/bash
#
# Slurm batch script: runs run_benchmark.py for Qwen/Qwen3-4B with the SDPA
# attention implementation on a single A100 GPU, inside an Apptainer container.
#
# Usage: submit with `sbatch` from the directory that contains
#        run_benchmark.py; all relative paths below (logs/, model_cache/,
#        results/) are resolved against the working directory.
#
# Exit status: 0 when the benchmark succeeds; otherwise the benchmark's own
#              non-zero exit code, so that the failure is visible to Slurm.
#
# NOTE: the #SBATCH directives must stay ahead of the first executable command.
#SBATCH --job-name=llm_bench_a100
#SBATCH --partition=a100 # Adjust to your A100 partition name
#SBATCH --nodes=1
#SBATCH --gres=gpu:a100:1 # Request 1 A100 GPU
#SBATCH -C a100_80
#SBATCH --time=02:00:00
#SBATCH --output=logs/benchmark_a100_sdpa_%j.out
#SBATCH --error=logs/benchmark_a100_sdpa_%j.err

# Abort on unhandled errors, unset variables and failing pipeline stages.
set -euo pipefail

# Create logs directory.
# NOTE(review): Slurm opens the --output/--error files before this script
# starts, so logs/ presumably has to exist already at submission time; this
# mkdir only helps subsequent submissions and direct (non-Slurm) runs.
mkdir -p logs

# Print job info. The SLURM_* variables are unset when the script is run
# outside of Slurm, hence the empty defaults (required under `set -u`).
echo "========================================="
echo "Job ID: ${SLURM_JOB_ID:-}"
echo "Job Name: ${SLURM_JOB_NAME:-}"
echo "Node: ${SLURM_NODELIST:-}"
echo "Date: $(date)"
echo "========================================="

# Set cache paths. The directory is computed in a separate assignment so that
# a failing `pwd` is not masked by `export`.
# NOTE(review): TRANSFORMERS_CACHE appears to be deprecated in favour of
# HF_HOME in recent transformers releases; both are exported as before.
cache_dir="$(pwd)/model_cache"
export TRANSFORMERS_CACHE="${cache_dir}"
export HF_HOME="${cache_dir}"

# Path to apptainer image
readonly APPTAINER_IMAGE="/anvme/workspace/ihpc125h-llm-profiles/pytorch_25.10_updated_ao.sif"

# Fail early with a clear message instead of a container-runtime error.
if [[ ! -e "${APPTAINER_IMAGE}" ]]; then
  echo "Apptainer image not found: ${APPTAINER_IMAGE}" >&2
  exit 1
fi

# Benchmark arguments, kept in an array so that no line-continuation
# backslashes are needed and every value stays a single word.
benchmark_args=(
  --mode both
  --model-path ./model_cache
  --model-name Qwen/Qwen3-4B
  --attn-implementation sdpa
  --batch-size 3
  --sequence-length 2048
  --num-steps 10
  --num-requests 10
  --prompt-length 512
  --generation-length 100
  --output-dir ./results/a100
)

# Run benchmark inside apptainer. The exit status is captured explicitly so a
# failure can be reported and propagated rather than hidden by the final echo.
benchmark_status=0
apptainer exec --nv "${APPTAINER_IMAGE}" \
  python run_benchmark.py "${benchmark_args[@]}" || benchmark_status=$?

if (( benchmark_status != 0 )); then
  echo "=========================================" >&2
  echo "Benchmark FAILED (exit code ${benchmark_status})" >&2
  echo "=========================================" >&2
  exit "${benchmark_status}"
fi

echo "========================================="
echo "Benchmark Complete!"
echo "========================================="