{ "model_name": "Qwen/Qwen3-4B", "gpu_name": "NVIDIA A100-SXM4-80GB", "attention_implementation": "flash_attention_2", "batch_size": 3, "sequence_length": 2048, "num_steps": 10, "forward": { "stage_name": "forward", "duration_ms": 3359.0412912890315, "tokens_processed": 61440, "tokens_per_second": 18290.933237210196, "energy_joules": 1292.2280000448227, "energy_per_token": 0.021032356771562868, "avg_power_watts": 387.19580415542595, "peak_memory_gb": 79.66021728515625, "avg_gpu_util_percent": 97.8 }, "backward": { "stage_name": "backward", "duration_ms": 6954.944152384996, "tokens_processed": 61440, "tokens_per_second": 8834.003358449821, "energy_joules": 2729.588000059128, "energy_per_token": 0.0444268880217957, "avg_power_watts": 394.24766095856324, "peak_memory_gb": 79.66021728515625, "avg_gpu_util_percent": 97.8 }, "optimizer": { "stage_name": "optimizer", "duration_ms": 1153.845101594925, "tokens_processed": 61440, "tokens_per_second": 53248.048559614595, "energy_joules": 362.6529998779297, "energy_per_token": 0.005902555336554845, "avg_power_watts": 299.1223537953503, "peak_memory_gb": 79.66021728515625, "avg_gpu_util_percent": 97.8 }, "total_duration_ms": 11467.830545268953, "total_tokens": 61440, "total_tokens_per_second": 5357.595733340081, "total_energy_joules": 4384.46899998188, "total_energy_per_token": 0.07136180012991342, "timestamp": 1768519431.5985208 }