{ "model_name": "Qwen/Qwen3-4B", "gpu_name": "NVIDIA H200", "attention_implementation": "flash_attention_3_hopper", "batch_size": 3, "sequence_length": 2048, "num_steps": 10, "forward": { "stage_name": "forward", "duration_ms": 1605.9521619997668, "tokens_processed": 61440, "tokens_per_second": 38257.67756587068, "energy_joules": 817.7539999999863, "energy_per_token": 0.01330979817708311, "avg_power_watts": 476.6091506406698, "peak_memory_gb": 76.5540771484375, "avg_gpu_util_percent": 95.1 }, "backward": { "stage_name": "backward", "duration_ms": 3448.8081949999696, "tokens_processed": 61440, "tokens_per_second": 17814.849804948502, "energy_joules": 1765.182000000008, "energy_per_token": 0.02873017578125013, "avg_power_watts": 498.84691252245983, "peak_memory_gb": 76.5540771484375, "avg_gpu_util_percent": 95.1 }, "optimizer": { "stage_name": "optimizer", "duration_ms": 545.701982000196, "tokens_processed": 61440, "tokens_per_second": 112588.92587268984, "energy_joules": 332.4770000000135, "energy_per_token": 0.005411409505208553, "avg_power_watts": 521.4900438388863, "peak_memory_gb": 76.5540771484375, "avg_gpu_util_percent": 95.1 }, "total_duration_ms": 5600.462338999932, "total_tokens": 61440, "total_tokens_per_second": 10970.522839186035, "total_energy_joules": 2915.4130000000077, "total_energy_per_token": 0.047451383463541795, "timestamp": 1768541921.6000674 }