{ "model_name": "Qwen/Qwen3-4B", "gpu_name": "NVIDIA H100", "attention_implementation": "flash_attention_3_hopper", "batch_size": 3, "sequence_length": 2048, "num_steps": 10, "forward": { "stage_name": "forward", "duration_ms": 1748.5067250672728, "tokens_processed": 61440, "tokens_per_second": 35138.55515633555, "energy_joules": 946.9269999563694, "energy_per_token": 0.015412223306581534, "avg_power_watts": 501.76439870614394, "peak_memory_gb": 76.45208740234375, "avg_gpu_util_percent": 97.0 }, "backward": { "stage_name": "backward", "duration_ms": 3761.718863155693, "tokens_processed": 61440, "tokens_per_second": 16332.959010248362, "energy_joules": 1904.104000031948, "energy_per_token": 0.030991276042186655, "avg_power_watts": 491.250130606127, "peak_memory_gb": 76.45208740234375, "avg_gpu_util_percent": 97.0 }, "optimizer": { "stage_name": "optimizer", "duration_ms": 896.0564862936735, "tokens_processed": 61440, "tokens_per_second": 68567.1059133025, "energy_joules": 349.722000002861, "energy_per_token": 0.0056920898437965665, "avg_power_watts": 356.92130879075387, "peak_memory_gb": 76.45208740234375, "avg_gpu_util_percent": 97.0 }, "total_duration_ms": 6406.282074516639, "total_tokens": 61440, "total_tokens_per_second": 9590.586128637759, "total_energy_joules": 3200.7529999911785, "total_energy_per_token": 0.052095589192564754, "timestamp": 1768541796.4011748 }