diff --git a/arch_analysis/iaca_output_force_soa.txt b/arch_analysis/iaca_output_force_soa.txt new file mode 100644 index 0000000..451061a --- /dev/null +++ b/arch_analysis/iaca_output_force_soa.txt @@ -0,0 +1,104 @@ +iwia021h@testfront1:~/MD-Bench/ICC$ iaca -arch SKX force.o +Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45 +Analyzed File - force.o +Binary Format - 64Bit +Architecture - SKX +Analysis Type - Throughput + +Throughput Analysis Report +-------------------------- +Block Throughput: 36.70 Cycles Throughput Bottleneck: Backend +Loop Count: 23 +Port Binding In Cycles Per Iteration: +-------------------------------------------------------------------------------------------------- +| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | +-------------------------------------------------------------------------------------------------- +| Cycles | 17.5 0.0 | 11.0 | 20.5 17.0 | 20.5 17.0 | 7.0 | 20.5 | 7.0 | 0.0 | +-------------------------------------------------------------------------------------------------- + +DV - Divider pipe (on port 0) +D - Data fetch pipe (on ports 2 and 3) +F - Macro Fusion with the previous instruction occurred +* - instruction micro-ops not bound to a port +^ - Micro Fusion occurred +# - ESP Tracking sync uop was issued +@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected +X - instruction not supported, was not accounted in Analysis + +| Num Of | Ports pressure in cycles | | +| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | +----------------------------------------------------------------------------------------- +| 1* | | | | | | | | | mov r13, r8 +| 1 | | 1.0 | | | | | | | imul r13, rcx +| 1 | | | | | | 1.0 | | | vbroadcastsd zmm2, xmm6 +| 1 | | | | | | 1.0 | | | vbroadcastsd zmm1, xmm7 +| 1 | | | | | | 1.0 | | | vbroadcastsd zmm0, xmm12 +| 1 | | | | | | | 1.0 | | movsxd rbx, r12d +| 1 | | | | | | | 1.0 | | add r13, r10 +| 2^ | | | 0.5 | 0.5 | 1.0 | | | | mov qword ptr [rsp-0x40], rax +| 2^ | | | 0.5 | 0.5 | 1.0 | | | | mov qword ptr [rsp-0x38], r8 +| 2^ | | | 0.5 | 0.5 | 1.0 | | | | mov qword ptr [rsp-0x30], r10 +| 2^ | | | 0.5 | 0.5 | 1.0 | | | | mov qword ptr [rsp-0x28], rsi +| 2^ | | | 0.5 | 0.5 | 1.0 | | | | mov qword ptr [rsp-0x20], rcx +| 2^ | | | 0.5 | 0.5 | 1.0 | | | | mov qword ptr [rsp-0x50], r9 +| 2^ | | | 0.5 | 0.5 | 1.0 | | | | mov qword ptr [rsp-0x48], rdx +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovdqu ymm3, ymmword ptr [r13+rbx*4] +| 1 | | 1.0 | | | | | | | vpaddd ymm4, ymm3, ymm3 +| 1 | | 1.0 | | | | | | | vpaddd ymm3, ymm3, ymm4 +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov r10d, dword ptr [r13+rbx*4] +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov r9d, dword ptr [r13+rbx*4+0x4] +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov r8d, dword ptr [r13+rbx*4+0x8] +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov esi, dword ptr [r13+rbx*4+0xc] +| 1 | | 1.0 | | | | | | | lea r10d, ptr [r10+r10*2] +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov ecx, dword ptr [r13+rbx*4+0x10] +| 1 | | 1.0 | | | | | | | lea r9d, ptr [r9+r9*2] +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov edx, dword ptr [r13+rbx*4+0x14] +| 1 | | 1.0 | | | | | | | lea r8d, ptr [r8+r8*2] +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov eax, dword ptr [r13+rbx*4+0x18] +| 1 | | 1.0 | | | | | | | lea esi, ptr [rsi+rsi*2] +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov r15d, dword ptr [r13+rbx*4+0x1c] +| 1 | | 1.0 | | | | | | | lea ecx, ptr [rcx+rcx*2] +| 1 | | 1.0 | | | | | | | lea edx, ptr [rdx+rdx*2] +| 1 | | 1.0 | | | | | | | lea eax, ptr [rax+rax*2] +| 1 | | 1.0 | | | | | | | lea r15d, ptr [r15+r15*2] +| 1 | | | | | | 1.0 | | | vpcmpeqb k1, xmm0, xmm0 +| 1 | | | | | | 1.0 | | | vpcmpeqb k2, xmm0, xmm0 +| 1 | | | | | | 1.0 | | | vpcmpeqb k3, xmm0, xmm0 +| 1* | | | | | | | | | vpxord zmm4, zmm4, zmm4 +| 1* | | | | | | | | | vpxord zmm17, zmm17, zmm17 +| 1* | | | | | | | | | vpxord zmm18, zmm18, zmm18 +| 5^ | 2.0 | | 4.0 4.0 | 4.0 4.0 | | | 1.0 | | vgatherdpd zmm4, k1, zmmword ptr [rdi+ymm3*8+0x10] +| 5^ | 1.5 | | 4.0 4.0 | 4.0 4.0 | | 0.5 | 1.0 | | vgatherdpd zmm17, k2, zmmword ptr [rdi+ymm3*8+0x8] +| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm18, k3, zmmword ptr [rdi+ymm3*8] +| 1 | | | | | | | 1.0 | | add r12d, 0x8 +| 1 | | | | | | | 1.0 | | add rbx, 0x8 +| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm26, zmm0, zmm4 +| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm24, zmm1, zmm17 +| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm23, zmm2, zmm18 +| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm3, zmm24, zmm24 +| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm3, zmm23, zmm23 +| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm3, zmm26, zmm26 +| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm22, zmm3 +| 1 | | | | | | 1.0 | | | vcmppd k2, zmm3, zmm14, 0x1 +| 1 | | | | | | 1.0 | | | vfpclasspd k0, zmm22, 0x1e +| 2^ | 1.0 | | 0.5 0.5 | 0.5 0.5 | | | | | vfnmadd213pd zmm3, zmm22, qword ptr [rip]{1to8} +| 1 | 1.0 | | | | | | | | knotw k1, k0 +| 1 | | | | | | 1.0 | | | vmulpd zmm4, zmm3, zmm3 +| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm22{k1}, zmm3, zmm22 +| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm22{k1}, zmm4, zmm22 +| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm17, zmm22, zmm13 +| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm19, zmm22, zmm10 +| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm20, zmm22, zmm17 +| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm18, zmm22, zmm20 +| 1 | 0.5 | | | | | 0.5 | | | vfmsub213pd zmm22, zmm20, zmm5 +| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm21, zmm18, zmm19 +| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm25, zmm21, zmm22 +| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm9{k2}, zmm25, zmm23 +| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm8{k2}, zmm25, zmm24 +| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm11{k2}, zmm25, zmm26 +| 1* | | | | | | | | | cmp r12d, r14d +| 0*F | | | | | | | | | jb 0xfffffffffffffed3 +Total Num Of Uops: 91 +Analysis Notes: +Backend allocation was stalled due to unavailable allocation resources. +There were bubbles in the frontend.