diff --git a/static_analysis/jan/analyses/gromacs-icc-avx512-dp-iaca.out b/static_analysis/jan/analyses/gromacs-icc-avx512-dp-iaca.out new file mode 100644 index 0000000..922b350 --- /dev/null +++ b/static_analysis/jan/analyses/gromacs-icc-avx512-dp-iaca.out @@ -0,0 +1,198 @@ +Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45 +Analyzed File - gromacs-icc-avx512-dp.o +Binary Format - 64Bit +Architecture - SKX +Analysis Type - Throughput + +Throughput Analysis Report +-------------------------- +Block Throughput: 62.00 Cycles Throughput Bottleneck: Backend +Loop Count: 22 +Port Binding In Cycles Per Iteration: +-------------------------------------------------------------------------------------------------- +| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | +-------------------------------------------------------------------------------------------------- +| Cycles | 58.0 0.0 | 16.0 | 16.0 15.0 | 16.0 15.0 | 2.0 | 58.0 | 16.0 | 0.0 | +-------------------------------------------------------------------------------------------------- + +DV - Divider pipe (on port 0) +D - Data fetch pipe (on ports 2 and 3) +F - Macro Fusion with the previous instruction occurred +* - instruction micro-ops not bound to a port +^ - Micro Fusion occurred +# - ESP Tracking sync uop was issued +@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected +X - instruction not supported, was not accounted in Analysis + +| Num Of | Ports pressure in cycles | | +| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | +----------------------------------------------------------------------------------------- +| 1 | | | 1.0 1.0 | | | | | | mov edx, dword ptr [r10+rsi*4] +| 1 | | | | | | | 1.0 | | inc rsi +| 1 | | | | 1.0 1.0 | | | | | vmovups zmm20, zmmword ptr [rsp+0x380] +| 1 | | | 1.0 1.0 | | | | | | vmovups zmm25, zmmword ptr [rsp+0x340] +| 1 | | | | 1.0 1.0 | | | | | vmovups zmm24, zmmword ptr [rsp+0x1c0] +| 1 | | | 1.0 1.0 | | | | | | vmovups zmm23, zmmword ptr [rsp+0x2c0] +| 1 | | | | 1.0 1.0 | | | | | vmovups zmm16, zmmword ptr [rsp+0x3c0] +| 1 | | | 1.0 1.0 | | | | | | vmovups zmm14, zmmword ptr [rsp+0x300] +| 1 | | | | 1.0 1.0 | | | | | vmovups zmm15, zmmword ptr [rsp+0x240] +| 1 | | | 1.0 1.0 | | | | | | vmovups zmm12, zmmword ptr [rsp+0x180] +| 1 | | | | 1.0 1.0 | | | | | vmovups zmm21, zmmword ptr [rsp+0x200] +| 1 | | | 1.0 1.0 | | | | | | vmovups zmm18, zmmword ptr [rsp+0x140] +| 1 | | | | 1.0 1.0 | | | | | vmovups zmm22, zmmword ptr [rsp+0x100] +| 1 | | | 1.0 1.0 | | | | | | vmovups zmm17, zmmword ptr [rsp+0x280] +| 1 | | 1.0 | | | | | | | lea r12d, ptr [rdx+rdx*2] +| 1 | | | | | | | 1.0 | | shl r12d, 0x3 +| 1 | | 1.0 | | | | | | | lea r13d, ptr [rdx+rdx*1] +| 1 | | | | | | | 1.0 | | movsxd r12, r12d +| 1 | | 1.0 | | | | | | | cmp r13d, r11d +| 1 | | 1.0 | | | | | | | lea eax, ptr [rdx+rdx*1+0x1] +| 1 | | | | | | | 1.0 | | mov edx, 0x0 +| 1 | | | | | | | 1.0 | | setz dl +| 1 | | 1.0 | | | | | | | cmp eax, r11d +| 1 | | | | | | | 1.0 | | mov eax, 0x0 +| 1* | | | | | | | | | mov r13d, edx +| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm29, zmm20, zmmword ptr [r8+r12*8+0x80] +| 1 | | | | | | | 1.0 | | setz al +| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm26, zmm25, zmmword ptr [r8+r12*8+0x80] +| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm25, zmm24, zmmword ptr [r8+r12*8] +| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm24, zmm23, zmmword ptr [r8+r12*8+0x40] +| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm23, zmm16, zmmword ptr 
[r8+r12*8+0x80] +| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm20, zmm14, zmmword ptr [r8+r12*8+0x80] +| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm27, zmm12, zmmword ptr [r8+r12*8+0x40] +| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm28, zmm15, zmmword ptr [r8+r12*8] +| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm30, zmm21, zmmword ptr [r8+r12*8+0x40] +| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm21, zmm18, zmmword ptr [r8+r12*8+0x40] +| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm31, zmm22, zmmword ptr [r8+r12*8] +| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm22, zmm17, zmmword ptr [r8+r12*8] +| 1 | 1.0 | | | | | | | | vmulpd zmm13, zmm29, zmm29 +| 1 | | | | | | 1.0 | | | vmulpd zmm15, zmm26, zmm26 +| 1 | 1.0 | | | | | | | | vmulpd zmm12, zmm23, zmm23 +| 1 | | | | | | 1.0 | | | vmulpd zmm14, zmm20, zmm20 +| 1 | | | | 1.0 1.0 | | | | | vmovups zmm16, zmmword ptr [rsp+0xc0] +| 1 | 1.0 | | | | | | | | vfmadd231pd zmm13, zmm30, zmm30 +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm15, zmm27, zmm27 +| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12, zmm24, zmm24 +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm14, zmm21, zmm21 +| 1 | 1.0 | | | | | | | | vfmadd231pd zmm13, zmm31, zmm31 +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm15, zmm28, zmm28 +| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12, zmm25, zmm25 +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm14, zmm22, zmm22 +| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm19, zmm13 +| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm18, zmm15 +| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm17, zmm12 +| 1 | | | | | | 1.0 | | | vcmppd k1, zmm13, zmm16, 0x11 +| 1 | | | | | | 1.0 | | | vcmppd k6, zmm15, zmm16, 0x11 +| 1 | | | | | | 1.0 | | | vcmppd k7, zmm12, zmm16, 0x11 +| 1 | | | | | | 1.0 | | | vcmppd k0, zmm14, zmm16, 0x11 +| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm15, zmm14 +| 1 | | | 1.0 1.0 | | | | | | vmovups zmm16, zmmword ptr [rsp+0x40] +| 1 | | | | 1.0 1.0 | | | | | vmovups zmm12, zmmword ptr [rsp+0x80] +| 1 | 1.0 | | | | | | | | vmulpd zmm13, zmm19, zmm16 +| 1 | | | | | | 1.0 | | | vmulpd zmm13, zmm19, zmm13 +| 1 | | 1.0 | | | | | | | neg r13d +| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm19, zmm13 +| 1* | | | | | | | | | mov r12d, eax +| 1 | | | | | | 1.0 | | | vfmsub213pd zmm13, zmm19, zmm1 +| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm19, zmm12 +| 1 | | | | | | 1.0 | | | vmulpd zmm13, zmm13, zmm19 +| 1 | | 1.0 | | | | | | | add r13d, 0xff +| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm14, zmm13 +| 1 | | | | | | | 1.0 | | nop +| 1 | | | 1.0 1.0 | | | | | | vmovups zmm13, zmmword ptr [rsp+0x400] +| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm10, zmm14 +| 1 | | | | | | | 1.0 | | shl r12d, 0x4 +| 1 | | 1.0 | | | | | | | sub r13d, r12d +| 1 | | | | | | 1.0 | | | kmovb k5, r13d +| 1 | 1.0 | | | | | | | | kmovw r13d, k1 +| 1 | 1.0 | | | | | | | | kmovb r12d, k5 +| 1 | | | | | | 1.0 | | | kmovb k5, r12d +| 1 | | | | | | 1.0 | | | kmovb k1, r13d +| 1* | | | | | | | | | mov r13d, eax +| 1 | 1.0 | | | | | | | | kandb k5, k5, k1 +| 1 | 1.0 | | | | | | | | kmovb r12d, k5 +| 1 | | | | | | 1.0 | | | kmovw k5, r12d +| 1 | | 1.0 | | | | | | | lea r12d, ptr [rdx+rdx*1] +| 1 | 1.0 | | | | | | | | vfmadd231pd zmm9{k5}, zmm19, zmm29 +| 1 | | | | | | | 1.0 | | neg r12d +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm13{k5}, zmm19, zmm31 +| 1 | | | | 1.0 1.0 | | | | | vmovups zmm31, zmmword ptr [rsp+0x440] +| 1 | 1.0 | | | | | | | | vmulpd zmm29, zmm18, zmm16 +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm31{k5}, zmm19, zmm30 +| 2^ | | | 1.0 | | 1.0 | | | | vmovups zmmword ptr [rsp+0x400], zmm13 +| 1 | 1.0 | | | | | | | | vmulpd zmm30, 
zmm18, zmm29 +| 2^ | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [rsp+0x440], zmm31 +| 1 | | | | | | 1.0 | | | vmulpd zmm13, zmm18, zmm30 +| 1 | 1.0 | | | | | | | | vfmsub213pd zmm30, zmm18, zmm1 +| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm18, zmm12 +| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm30, zmm18 +| 1 | | 1.0 | | | | | | | add r12d, 0xff +| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm13, zmm14 +| 1 | 1.0 | | | | | | | | vmulpd zmm29, zmm10, zmm19 +| 1 | | | | | | | 1.0 | | shl r13d, 0x5 +| 1 | | 1.0 | | | | | | | sub r12d, r13d +| 1 | | | | | | 1.0 | | | kmovb k1, r12d +| 1 | 1.0 | | | | | | | | kmovw r12d, k6 +| 1 | 1.0 | | | | | | | | kmovb r13d, k1 +| 1 | | | | | | 1.0 | | | kmovb k1, r13d +| 1 | | | | | | 1.0 | | | kmovb k6, r12d +| 1* | | | | | | | | | mov r12d, eax +| 1 | 1.0 | | | | | | | | kandb k1, k1, k6 +| 1 | 1.0 | | | | | | | | kmovb r13d, k1 +| 1 | | | | | | 1.0 | | | kmovw k1, r13d +| 1 | | 1.0 | | | | | | | lea r13d, ptr [rdx*4] +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm6{k1}, zmm29, zmm26 +| 1 | | | | | | | 1.0 | | neg r13d +| 1 | 1.0 | | | | | | | | vfmadd231pd zmm7{k1}, zmm29, zmm27 +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm8{k1}, zmm29, zmm28 +| 1 | 1.0 | | | | | | | | vmulpd zmm26, zmm17, zmm16 +| 1 | | | | | | 1.0 | | | vmulpd zmm28, zmm17, zmm12 +| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm15, zmm16 +| 1 | | | | | | 1.0 | | | vmulpd zmm12, zmm15, zmm12 +| 1 | 1.0 | | | | | | | | vmulpd zmm27, zmm17, zmm26 +| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm15, zmm16 +| 1 | 1.0 | | | | | | | | vmulpd zmm13, zmm17, zmm27 +| 1 | | | | | | 1.0 | | | vfmsub213pd zmm27, zmm17, zmm1 +| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm27, zmm28 +| 1 | | | | | | | 1.0 | | add r13d, 0xff +| 1 | | | | | | 1.0 | | | vmulpd zmm17, zmm13, zmm14 +| 1 | | | | | | | 1.0 | | shl edx, 0x3 +| 1 | | | | | | | 1.0 | | shl r12d, 0x6 +| 1 | | 1.0 | | | | | | | neg edx +| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm10, zmm17 +| 1 | | 1.0 | | | | | | | sub r13d, r12d +| 1 | | | | | | 1.0 | | | kmovb k6, r13d +| 1 | | 1.0 | | | | | | | add edx, 0xff +| 1 | | | | | | | 1.0 | | shl eax, 0x7 +| 1 | | 1.0 | | | | | | | sub edx, eax +| 1 | 1.0 | | | | | | | | kmovb eax, k6 +| 1 | | | | | | 1.0 | | | kmovb k6, eax +| 1 | 1.0 | | | | | | | | kmovw eax, k7 +| 1 | | | | | | 1.0 | | | kmovb k7, eax +| 1 | 1.0 | | | | | | | | kandb k7, k6, k7 +| 1 | | | | | | 1.0 | | | kmovb k6, edx +| 1 | 1.0 | | | | | | | | kmovb edx, k7 +| 1 | | | | | | 1.0 | | | kmovw k7, edx +| 1 | 1.0 | | | | | | | | kmovw edx, k0 +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm11{k7}, zmm18, zmm23 +| 1 | 1.0 | | | | | | | | vfmadd231pd zmm4{k7}, zmm18, zmm24 +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm5{k7}, zmm18, zmm25 +| 1 | 1.0 | | | | | | | | vmulpd zmm23, zmm15, zmm19 +| 1 | | | | | | 1.0 | | | vfmsub213pd zmm19, zmm15, zmm1 +| 1 | 1.0 | | | | | | | | vmulpd zmm15, zmm19, zmm12 +| 1 | | | | | | 1.0 | | | vmulpd zmm24, zmm23, zmm15 +| 1 | 1.0 | | | | | | | | vmulpd zmm25, zmm10, zmm24 +| 1 | 1.0 | | | | | | | | kmovb eax, k6 +| 1 | | | | | | 1.0 | | | kmovb k6, eax +| 1 | | | | | | 1.0 | | | kmovb k0, edx +| 1 | 1.0 | | | | | | | | kandb k0, k6, k0 +| 1 | 1.0 | | | | | | | | kmovb r12d, k0 +| 1 | | | | | | 1.0 | | | kmovw k6, r12d +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm3{k6}, zmm25, zmm22 +| 1 | 1.0 | | | | | | | | vfmadd231pd zmm2{k6}, zmm25, zmm21 +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm0{k6}, zmm25, zmm20 +| 1* | | | | | | | | | cmp rsi, rdi +| 0*F | | | | | | | | | jl 0xfffffffffffffc6f +Total Num Of Uops: 187 +Analysis Notes: +Backend 
allocation was stalled due to unavailable allocation resources. diff --git a/static_analysis/jan/analyses/gromacs-icc-avx512-sp-iaca.out b/static_analysis/jan/analyses/gromacs-icc-avx512-sp-iaca.out new file mode 100644 index 0000000..3e68cec --- /dev/null +++ b/static_analysis/jan/analyses/gromacs-icc-avx512-sp-iaca.out @@ -0,0 +1,152 @@ +Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45 +Analyzed File - gromacs-icc-avx512-sp.o +Binary Format - 64Bit +Architecture - SKX +Analysis Type - Throughput + +Throughput Analysis Report +-------------------------- +Block Throughput: 51.00 Cycles Throughput Bottleneck: Backend +Loop Count: 22 +Port Binding In Cycles Per Iteration: +-------------------------------------------------------------------------------------------------- +| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | +-------------------------------------------------------------------------------------------------- +| Cycles | 47.5 0.0 | 9.0 | 11.0 11.0 | 11.0 8.0 | 3.0 | 47.5 | 9.0 | 0.0 | +-------------------------------------------------------------------------------------------------- + +DV - Divider pipe (on port 0) +D - Data fetch pipe (on ports 2 and 3) +F - Macro Fusion with the previous instruction occurred +* - instruction micro-ops not bound to a port +^ - Micro Fusion occurred +# - ESP Tracking sync uop was issued +@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected +X - instruction not supported, was not accounted in Analysis + +| Num Of | Ports pressure in cycles | | +| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | +----------------------------------------------------------------------------------------- +| 1 | | | 1.0 1.0 | | | | | | mov edi, dword ptr [rcx+rax*4] +| 1* | | | | | | | | | mov r12d, r13d +| 1 | | | | | | | 1.0 | | movsxd rdi, edi +| 1 | | 1.0 | | | | | | | inc rax +| 1 | | | | 1.0 1.0 | | | | | vmovups zmm10, zmmword ptr [rsp+0x140] +| 1 | | | | | | | 1.0 | | test edi, 0x7fffffff +| 1 | | | 1.0 1.0 | | | | | | vmovups zmm11, zmmword ptr [rsp+0x100] +| 1 | | | | 1.0 1.0 | | | | | vmovups zmm9, zmmword ptr [rsp+0xc0] +| 1 | | | | | | | 1.0 | | setz r12b +| 1 | | 1.0 | | | | | | | lea r14, ptr [rdi+rdi*2] +| 1 | | | | | | | 1.0 | | shl r14, 0x5 +| 1* | | | | | | | | | mov r8d, r12d +| 1 | | 1.0 | | | | | | | neg r8d +| 1* | | | | | | | | | mov r11d, r12d +| 1 | | 1.0 | | | | | | | add r8d, 0xff +| 1 | | | | | | 1.0 | | | kmovw k0, r8d +| 1 | | 1.0 | | | | | | | lea r9d, ptr [r12+r12*2] +| 2 | 1.0 | | 1.0 1.0 | | | | | | vsubps zmm3, zmm13, zmmword ptr [r14+rbx*1+0x40] +| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm4, zmm10, zmmword ptr [r14+rbx*1] +| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm10, zmm11, zmmword ptr [r14+rbx*1+0x40] +| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm5, zmm17, zmmword ptr [r14+rbx*1+0x20] +| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm27, zmm19, zmmword ptr [r14+rbx*1] +| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm8, zmm15, zmmword ptr [r14+rbx*1+0x20] +| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm11, zmm0, zmmword ptr [r14+rbx*1+0x40] +| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm7, zmm9, zmmword ptr [r14+rbx*1] +| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm9, zmm14, zmmword ptr [r14+rbx*1+0x20] +| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm29, zmm12, zmmword ptr [r14+rbx*1+0x40] +| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm28, zmm16, zmmword ptr [r14+rbx*1+0x20] +| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps 
zmm25, zmm18, zmmword ptr [r14+rbx*1] +| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm2, zmm3, zmm3 +| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm10, zmm10 +| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm11, zmm11 +| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm26, zmm29, zmm29 +| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm2, zmm5, zmm5 +| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30, zmm8, zmm8 +| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm1, zmm9, zmm9 +| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm26, zmm28, zmm28 +| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm2, zmm27, zmm27 +| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30, zmm4, zmm4 +| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm1, zmm7, zmm7 +| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm26, zmm25, zmm25 +| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm31, zmm2 +| 1 | | | | | | 1.0 | | | vcmpps k7, zmm30, zmm24, 0x11 +| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm6, zmm30 +| 1 | | | | | | 1.0 | | | vcmpps k3, zmm2, zmm24, 0x11 +| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm2, zmm1 +| 1 | | | | | | 1.0 | | | vcmpps k5, zmm26, zmm24, 0x11 +| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm26, zmm26 +| 1 | | | | | | 1.0 | | | vmulps zmm30, zmm31, zmm23 +| 1 | 1.0 | | | | | | | | kandw k2, k0, k3 +| 1 | | | | | | 1.0 | | | vcmpps k3, zmm1, zmm24, 0x11 +| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm31, zmm30 +| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm31, zmm1 +| 1 | | | | | | | 1.0 | | neg r9d +| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm1, zmm31, zmm20 +| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm31, zmm22 +| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm1, zmm31 +| 1 | | | | | | | 1.0 | | add r9d, 0xff +| 1 | | | | | | 1.0 | | | kmovw k4, r9d +| 1 | 1.0 | | | | | | | | vmulps zmm30, zmm30, zmm31 +| 1 | 1.0 | | | | | | | | kandw k1, k4, k5 +| 1 | | | | | | 1.0 | | | vmulps zmm1, zmm21, zmm30 +| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm26, zmm23 +| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm26, zmm30 +| 1 | | 1.0 | | | | | | | lea r10d, ptr [r12*8] +| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm26, zmm31 +| 1 | | | | | | | 1.0 | | neg r10d +| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm31, zmm26, zmm20 +| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm26, zmm26, zmm22 +| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm31, zmm26 +| 1 | | 1.0 | | | | | | | add r10d, r12d +| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm30, zmm31 +| 1 | | | | | | | 1.0 | | add r10d, 0xff +| 1 | | | | | | 1.0 | | | kmovw k6, r10d +| 1 | 1.0 | | | | | | | | vmulps zmm26, zmm21, zmm30 +| 1 | 1.0 | | | | | | | | kandw k4, k6, k7 +| 1 | | | | | | 1.0 | | | vmulps zmm25{k1}{z}, zmm25, zmm26 +| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31{k1}{z}, zmm28, zmm26 +| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm28, zmm6, zmm23 +| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30{k1}{z}, zmm29, zmm26 +| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm29, zmm2, zmm23 +| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm25{k2}, zmm27, zmm1 +| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm31{k2}, zmm5, zmm1 +| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm27, zmm6, zmm28 +| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30{k2}, zmm3, zmm1 +| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm2, zmm29 +| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm5, zmm6, zmm27 +| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm27, zmm6, zmm20 +| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm6, zmm6, zmm22 +| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm3, zmm2, zmm1 +| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm1, zmm2, zmm20 +| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm2, zmm2, zmm22 +| 1 | 0.5 | | | | | 0.5 | 
| | vmulps zmm26, zmm27, zmm6 +| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm1, zmm2 +| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm5, zmm5, zmm26 +| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm3, zmm3, zmm1 +| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm6, zmm21, zmm5 +| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm27, zmm21, zmm3 +| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm25{k4}, zmm4, zmm6 +| 1 | | | 1.0 1.0 | | | | | | vmovups zmm4, zmmword ptr [r14+rsi*1] +| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm31{k4}, zmm8, zmm6 +| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30{k4}, zmm10, zmm6 +| 1 | | | | | | | 1.0 | | shl r11d, 0x4 +| 1 | | 1.0 | | | | | | | sub r12d, r11d +| 1 | | 1.0 | | | | | | | add r12d, 0xff +| 1 | | | | | | 1.0 | | | kmovw k0, r12d +| 1 | 1.0 | | | | | | | | kandw k5, k0, k3 +| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm25{k5}, zmm7, zmm27 +| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm31{k5}, zmm9, zmm27 +| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30{k5}, zmm11, zmm27 +| 1 | 0.5 | | | | | 0.5 | | | vsubps zmm7, zmm4, zmm25 +| 2 | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [r14+rsi*1], zmm7 +| 1 | | | 1.0 1.0 | | | | | | vmovups zmm8, zmmword ptr [r14+rsi*1+0x20] +| 1 | 0.5 | | | | | 0.5 | | | vsubps zmm4, zmm8, zmm31 +| 2 | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [r14+rsi*1+0x20], zmm4 +| 1 | | | 1.0 1.0 | | | | | | vmovups zmm1, zmmword ptr [r14+rsi*1+0x40] +| 1 | 0.5 | | | | | 0.5 | | | vsubps zmm2, zmm1, zmm30 +| 2 | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [r14+rsi*1+0x40], zmm2 +| 1* | | | | | | | | | cmp rax, rdx +| 0*F | | | | | | | | | jb 0xfffffffffffffd30 +Total Num Of Uops: 142 +Analysis Notes: +Backend allocation was stalled due to unavailable allocation resources. diff --git a/static_analysis/jan/analyses/gromacs-icx-avx512-dp-iaca.out b/static_analysis/jan/analyses/gromacs-icx-avx512-dp-iaca.out new file mode 100644 index 0000000..fc903d9 --- /dev/null +++ b/static_analysis/jan/analyses/gromacs-icx-avx512-dp-iaca.out @@ -0,0 +1,154 @@ +Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45 +Analyzed File - gromacs-icx-avx512-dp.o +Binary Format - 64Bit +Architecture - SKX +Analysis Type - Throughput + +Throughput Analysis Report +-------------------------- +Block Throughput: 49.26 Cycles Throughput Bottleneck: Backend +Loop Count: 22 +Port Binding In Cycles Per Iteration: +-------------------------------------------------------------------------------------------------- +| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | +-------------------------------------------------------------------------------------------------- +| Cycles | 44.0 0.0 | 13.5 | 5.5 5.5 | 5.5 5.5 | 0.0 | 44.0 | 13.5 | 0.0 | +-------------------------------------------------------------------------------------------------- + +DV - Divider pipe (on port 0) +D - Data fetch pipe (on ports 2 and 3) +F - Macro Fusion with the previous instruction occurred +* - instruction micro-ops not bound to a port +^ - Micro Fusion occurred +# - ESP Tracking sync uop was issued +@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected +X - instruction not supported, was not accounted in Analysis + +| Num Of | Ports pressure in cycles | | +| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | +----------------------------------------------------------------------------------------- +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | movsxd rcx, dword ptr [r10+rbx*4] +| 1 | | 1.0 | | | | | | | lea rdx, ptr [rcx+rcx*2] +| 1 | | | | | | | 1.0 | | shl 
rdx, 0x6 +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm28, zmmword ptr [rsi+rdx*1] +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm29, zmmword ptr [rsi+rdx*1+0x40] +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm30, zmmword ptr [rsi+rdx*1+0x80] +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm3, zmmword ptr [rsp+0x10] +| 1 | | | | | | 1.0 | | | vsubpd zmm3, zmm3, zmm28 +| 1 | 1.0 | | | | | | | | vsubpd zmm31, zmm24, zmm30 +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm16, zmmword ptr [rsp+0x150] +| 1 | | | | | | 1.0 | | | vsubpd zmm16, zmm16, zmm29 +| 1 | 1.0 | | | | | | | | vmulpd zmm17, zmm31, zmm31 +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm17, zmm16, zmm16 +| 1 | 1.0 | | | | | | | | vfmadd231pd zmm17, zmm3, zmm3 +| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm18, zmm17 +| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm21, zmm18 +| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm18, zmm19 +| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm18, zmm19 +| 1 | | | | | | 1.0 | | | vaddpd zmm20, zmm19, zmm1 +| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm22, zmm18 +| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm18, zmm20 +| 1 | 1.0 | | | | | | | | vsubpd zmm20, zmm25, zmm30 +| 1 | | 1.0 | | | | | | | lea edx, ptr [rcx+rcx*1] +| 1 | | | | | | | 1.0 | | cmp r11, rdx +| 1 | | | | | | | 1.0 | | setnz dl +| 1 | | | | | | | 1.0 | | setz al +| 1 | | 1.0 | | | | | | | add ecx, ecx +| 1 | | 1.0 | | | | | | | inc ecx +| 1 | | 0.5 | | | | | 0.5 | | cmp r11, rcx +| 1 | | | | | | | 1.0 | | setz cl +| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm19, zmm18 +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm19, zmmword ptr [rsp+0x210] +| 1 | 1.0 | | | | | | | | vsubpd zmm19, zmm19, zmm28 +| 1 | | | | | | | 1.0 | | setnz dil +| 1* | | | | | | | | | mov ebp, edi +| 1 | | | | | | | 1.0 | | shl bpl, 0x4 +| 1 | | 1.0 | | | | | | | sub bpl, al +| 1 | | 1.0 | | | | | | | add bpl, 0xef +| 1 | | | | | | 1.0 | | | kmovd k1, ebp +| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm17, zmm0, 0x1 +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm17, zmmword ptr [rsp+0x110] +| 1 | 1.0 | | | | | | | | vsubpd zmm17, zmm17, zmm29 +| 1 | | 1.0 | | | | | | | lea eax, ptr [rdx+rdx*1] +| 1* | | | | | | | | | mov ebp, edi +| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm18, zmm2 +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm14{k1}, zmm3, zmm18 +| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm20, zmm20 +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm3, zmm17, zmm17 +| 1 | 1.0 | | | | | | | | vfmadd231pd zmm3, zmm19, zmm19 +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm11{k1}, zmm16, zmm18 +| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm16, zmm3 +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm7{k1}, zmm31, zmm18 +| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm21, zmm16 +| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm16, zmm18 +| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm16, zmm18 +| 1 | | | | | | 1.0 | | | vaddpd zmm31, zmm18, zmm1 +| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm22, zmm16 +| 1 | | | | | | 1.0 | | | vmulpd zmm16, zmm16, zmm31 +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm31, zmmword ptr [rsp+0x1d0] +| 1 | 1.0 | | | | | | | | vsubpd zmm31, zmm31, zmm28 +| 1 | | | | | | | 1.0 | | shl bpl, 0x5 +| 1 | | 1.0 | | | | | | | or bpl, al +| 1 | | 0.5 | | | | | 0.5 | | or bpl, 0xdd +| 1 | | | | | | 1.0 | | | kmovd k1, ebp +| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm3, zmm0, 0x1 +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm3, zmmword ptr [rsp+0xd0] +| 1 | 1.0 | | | | | | | | vsubpd zmm3, zmm3, zmm29 +| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm18, zmm16 +| 1 | | | | | | 1.0 | | | vsubpd zmm18, zmm26, 
zmm30 +| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm16, zmm2 +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm15{k1}, zmm19, zmm16 +| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm18, zmm18 +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm19, zmm3, zmm3 +| 1 | 1.0 | | | | | | | | vfmadd231pd zmm19, zmm31, zmm31 +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm10{k1}, zmm17, zmm16 +| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm17, zmm19 +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm6{k1}, zmm20, zmm16 +| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm21, zmm17 +| 1 | | | | | | 1.0 | | | vmulpd zmm16, zmm17, zmm16 +| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm17, zmm16 +| 1 | | | | | | 1.0 | | | vaddpd zmm20, zmm16, zmm1 +| 1 | 1.0 | | | | | | | | vmulpd zmm17, zmm22, zmm17 +| 1 | | | | | | 1.0 | | | vmulpd zmm17, zmm17, zmm20 +| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm16, zmm17 +| 1 | | 1.0 | | | | | | | lea eax, ptr [rdx*4] +| 1 | | | | | | | 1.0 | | shl dil, 0x6 +| 1 | | 0.5 | | | | | 0.5 | | or dil, al +| 1 | | 0.5 | | | | | 0.5 | | or dil, 0xbb +| 1 | | | | | | 1.0 | | | kmovd k1, edi +| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm19, zmm0, 0x1 +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm17, zmmword ptr [rsp+0x190] +| 1 | 1.0 | | | | | | | | vsubpd zmm17, zmm17, zmm28 +| 1 | 1.0 | | | | | | | | vsubpd zmm19, zmm23, zmm29 +| 1 | | | | | | 1.0 | | | vsubpd zmm20, zmm27, zmm30 +| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm16, zmm2 +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm13{k1}, zmm31, zmm16 +| 1 | 1.0 | | | | | | | | vmulpd zmm28, zmm20, zmm20 +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm28, zmm19, zmm19 +| 1 | 1.0 | | | | | | | | vfmadd231pd zmm28, zmm17, zmm17 +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm9{k1}, zmm3, zmm16 +| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm3, zmm28 +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm5{k1}, zmm18, zmm16 +| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm21, zmm3 +| 1 | | | | | | 1.0 | | | vmulpd zmm16, zmm3, zmm16 +| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm3, zmm16 +| 1 | | | | | | 1.0 | | | vaddpd zmm18, zmm16, zmm1 +| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm22, zmm3 +| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm3, zmm18 +| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm16, zmm3 +| 1 | | | | | | | 1.0 | | shl dl, 0x3 +| 1 | | | | | | | 1.0 | | shl cl, 0x7 +| 1 | | 1.0 | | | | | | | or cl, dl +| 1 | | 1.0 | | | | | | | add cl, 0xf7 +| 1 | | | | | | 1.0 | | | kmovd k1, ecx +| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm28, zmm0, 0x1 +| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm3, zmm2 +| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12{k1}, zmm17, zmm3 +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm8{k1}, zmm19, zmm3 +| 1 | 1.0 | | | | | | | | vfmadd231pd zmm4{k1}, zmm20, zmm3 +| 1 | | 0.5 | | | | | 0.5 | | inc rbx +| 1* | | | | | | | | | cmp r9, rbx +| 0*F | | | | | | | | | jnz 0xfffffffffffffd5a +Total Num Of Uops: 129 +Analysis Notes: +Backend allocation was stalled due to unavailable allocation resources. 
diff --git a/static_analysis/jan/analyses/gromacs-icx-avx512-dp-mca.out b/static_analysis/jan/analyses/gromacs-icx-avx512-dp-mca.out new file mode 100644 index 0000000..7708a0d --- /dev/null +++ b/static_analysis/jan/analyses/gromacs-icx-avx512-dp-mca.out @@ -0,0 +1,288 @@ + +[0] Code Region + +Iterations: 100 +Instructions: 12200 +Total Cycles: 4745 +Total uOps: 14000 + +Dispatch Width: 6 +uOps Per Cycle: 2.95 +IPC: 2.57 +Block RThroughput: 34.0 + + +Instruction Info: +[1]: #uOps +[2]: Latency +[3]: RThroughput +[4]: MayLoad +[5]: MayStore +[6]: HasSideEffects (U) + +[1] [2] [3] [4] [5] [6] Instructions: + 1 5 0.50 * movslq (%r10,%rbx,4), %rcx + 1 1 0.50 leaq (%rcx,%rcx,2), %rdx + 1 1 0.50 shlq $6, %rdx + 2 8 0.50 * vmovupd (%rsi,%rdx), %zmm28 + 2 8 0.50 * vmovupd 64(%rsi,%rdx), %zmm29 + 2 8 0.50 * vmovupd 128(%rsi,%rdx), %zmm30 + 2 8 0.50 * vmovupd 16(%rsp), %zmm3 + 1 4 0.50 vsubpd %zmm28, %zmm3, %zmm3 + 1 4 0.50 vsubpd %zmm30, %zmm24, %zmm31 + 2 8 0.50 * vmovupd 336(%rsp), %zmm16 + 1 4 0.50 vsubpd %zmm29, %zmm16, %zmm16 + 1 4 0.50 vmulpd %zmm31, %zmm31, %zmm17 + 1 4 0.50 vfmadd231pd %zmm16, %zmm16, %zmm17 + 1 4 0.50 vfmadd231pd %zmm3, %zmm3, %zmm17 + 3 4 2.00 vrcp14pd %zmm17, %zmm18 + 1 4 0.50 vmulpd %zmm18, %zmm21, %zmm19 + 1 4 0.50 vmulpd %zmm19, %zmm18, %zmm19 + 1 4 0.50 vmulpd %zmm19, %zmm18, %zmm19 + 1 4 0.50 vaddpd %zmm1, %zmm19, %zmm20 + 1 4 0.50 vmulpd %zmm18, %zmm22, %zmm18 + 1 4 0.50 vmulpd %zmm20, %zmm18, %zmm18 + 1 4 0.50 vsubpd %zmm30, %zmm25, %zmm20 + 1 1 0.50 leal (%rcx,%rcx), %edx + 1 1 0.25 cmpq %rdx, %r11 + 1 1 0.50 setne %dl + 1 1 0.50 sete %al + 1 1 0.25 addl %ecx, %ecx + 1 1 0.25 incl %ecx + 1 1 0.25 cmpq %rcx, %r11 + 1 1 0.50 sete %cl + 1 4 0.50 vmulpd %zmm18, %zmm19, %zmm18 + 2 8 0.50 * vmovupd 528(%rsp), %zmm19 + 1 4 0.50 vsubpd %zmm28, %zmm19, %zmm19 + 1 1 0.50 setne %dil + 1 1 0.25 movl %edi, %ebp + 1 1 0.50 shlb $4, %bpl + 1 1 0.25 subb %al, %bpl + 1 1 0.25 addb $-17, %bpl + 1 1 1.00 kmovd %ebp, %k1 + 1 4 1.00 vcmpltpd %zmm0, %zmm17, %k1 {%k1} + 2 8 0.50 * vmovupd 272(%rsp), %zmm17 + 1 4 0.50 vsubpd %zmm29, %zmm17, %zmm17 + 1 1 0.50 leal (%rdx,%rdx), %eax + 1 1 0.25 movl %edi, %ebp + 1 4 0.50 vmulpd %zmm2, %zmm18, %zmm18 + 1 4 0.50 vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} + 1 4 0.50 vmulpd %zmm20, %zmm20, %zmm3 + 1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm3 + 1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm3 + 1 4 0.50 vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} + 3 4 2.00 vrcp14pd %zmm3, %zmm16 + 1 4 0.50 vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} + 1 4 0.50 vmulpd %zmm16, %zmm21, %zmm18 + 1 4 0.50 vmulpd %zmm18, %zmm16, %zmm18 + 1 4 0.50 vmulpd %zmm18, %zmm16, %zmm18 + 1 4 0.50 vaddpd %zmm1, %zmm18, %zmm31 + 1 4 0.50 vmulpd %zmm16, %zmm22, %zmm16 + 1 4 0.50 vmulpd %zmm31, %zmm16, %zmm16 + 2 8 0.50 * vmovupd 464(%rsp), %zmm31 + 1 4 0.50 vsubpd %zmm28, %zmm31, %zmm31 + 1 1 0.50 shlb $5, %bpl + 1 1 0.25 orb %al, %bpl + 1 1 0.25 orb $-35, %bpl + 1 1 1.00 kmovd %ebp, %k1 + 1 4 1.00 vcmpltpd %zmm0, %zmm3, %k1 {%k1} + 2 8 0.50 * vmovupd 208(%rsp), %zmm3 + 1 4 0.50 vsubpd %zmm29, %zmm3, %zmm3 + 1 4 0.50 vmulpd %zmm16, %zmm18, %zmm16 + 1 4 0.50 vsubpd %zmm30, %zmm26, %zmm18 + 1 4 0.50 vmulpd %zmm2, %zmm16, %zmm16 + 1 4 0.50 vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} + 1 4 0.50 vmulpd %zmm18, %zmm18, %zmm19 + 1 4 0.50 vfmadd231pd %zmm3, %zmm3, %zmm19 + 1 4 0.50 vfmadd231pd %zmm31, %zmm31, %zmm19 + 1 4 0.50 vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} + 3 4 2.00 vrcp14pd %zmm19, %zmm17 + 1 4 0.50 vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} + 1 4 0.50 vmulpd %zmm17, %zmm21, %zmm16 + 1 4 0.50 
vmulpd %zmm16, %zmm17, %zmm16 + 1 4 0.50 vmulpd %zmm16, %zmm17, %zmm16 + 1 4 0.50 vaddpd %zmm1, %zmm16, %zmm20 + 1 4 0.50 vmulpd %zmm17, %zmm22, %zmm17 + 1 4 0.50 vmulpd %zmm20, %zmm17, %zmm17 + 1 4 0.50 vmulpd %zmm17, %zmm16, %zmm16 + 1 1 0.50 leal (,%rdx,4), %eax + 1 1 0.50 shlb $6, %dil + 1 1 0.25 orb %al, %dil + 1 1 0.25 orb $-69, %dil + 1 1 1.00 kmovd %edi, %k1 + 1 4 1.00 vcmpltpd %zmm0, %zmm19, %k1 {%k1} + 2 8 0.50 * vmovupd 400(%rsp), %zmm17 + 1 4 0.50 vsubpd %zmm28, %zmm17, %zmm17 + 1 4 0.50 vsubpd %zmm29, %zmm23, %zmm19 + 1 4 0.50 vsubpd %zmm30, %zmm27, %zmm20 + 1 4 0.50 vmulpd %zmm2, %zmm16, %zmm16 + 1 4 0.50 vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} + 1 4 0.50 vmulpd %zmm20, %zmm20, %zmm28 + 1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm28 + 1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm28 + 1 4 0.50 vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} + 3 4 2.00 vrcp14pd %zmm28, %zmm3 + 1 4 0.50 vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} + 1 4 0.50 vmulpd %zmm3, %zmm21, %zmm16 + 1 4 0.50 vmulpd %zmm16, %zmm3, %zmm16 + 1 4 0.50 vmulpd %zmm16, %zmm3, %zmm16 + 1 4 0.50 vaddpd %zmm1, %zmm16, %zmm18 + 1 4 0.50 vmulpd %zmm3, %zmm22, %zmm3 + 1 4 0.50 vmulpd %zmm18, %zmm3, %zmm3 + 1 4 0.50 vmulpd %zmm3, %zmm16, %zmm3 + 1 1 0.50 shlb $3, %dl + 1 1 0.50 shlb $7, %cl + 1 1 0.25 orb %dl, %cl + 1 1 0.25 addb $-9, %cl + 1 1 1.00 kmovd %ecx, %k1 + 1 4 1.00 vcmpltpd %zmm0, %zmm28, %k1 {%k1} + 1 4 0.50 vmulpd %zmm2, %zmm3, %zmm3 + 1 4 0.50 vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} + 1 4 0.50 vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} + 1 4 0.50 vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} + 1 1 0.25 incq %rbx + 1 1 0.25 cmpq %rbx, %r9 + 1 1 0.50 jne .LBB5_12 + + +Resources: +[0] - SKXDivider +[1] - SKXFPDivider +[2] - SKXPort0 +[3] - SKXPort1 +[4] - SKXPort2 +[5] - SKXPort3 +[6] - SKXPort4 +[7] - SKXPort5 +[8] - SKXPort6 +[9] - SKXPort7 + + +Resource pressure per iteration: +[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] + - - 45.53 20.45 5.50 5.50 - 44.64 18.38 - + +Resource pressure by instruction: +[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: + - - - - 0.50 0.50 - - - - movslq (%r10,%rbx,4), %rcx + - - - 0.99 - - - 0.01 - - leaq (%rcx,%rcx,2), %rdx + - - 0.01 - - - - - 0.99 - shlq $6, %rdx + - - 0.01 0.99 0.49 0.51 - - - - vmovupd (%rsi,%rdx), %zmm28 + - - 0.01 0.91 0.51 0.49 - 0.08 - - vmovupd 64(%rsi,%rdx), %zmm29 + - - 0.01 0.56 0.49 0.51 - 0.43 - - vmovupd 128(%rsi,%rdx), %zmm30 + - - - 0.99 0.50 0.50 - 0.01 - - vmovupd 16(%rsp), %zmm3 + - - 0.95 - - - - 0.05 - - vsubpd %zmm28, %zmm3, %zmm3 + - - 0.48 - - - - 0.52 - - vsubpd %zmm30, %zmm24, %zmm31 + - - - 1.00 0.50 0.50 - - - - vmovupd 336(%rsp), %zmm16 + - - 0.49 - - - - 0.51 - - vsubpd %zmm29, %zmm16, %zmm16 + - - 0.48 - - - - 0.52 - - vmulpd %zmm31, %zmm31, %zmm17 + - - 0.49 - - - - 0.51 - - vfmadd231pd %zmm16, %zmm16, %zmm17 + - - 0.04 - - - - 0.96 - - vfmadd231pd %zmm3, %zmm3, %zmm17 + - - 2.00 - - - - 1.00 - - vrcp14pd %zmm17, %zmm18 + - - 1.00 - - - - - - - vmulpd %zmm18, %zmm21, %zmm19 + - - 0.51 - - - - 0.49 - - vmulpd %zmm19, %zmm18, %zmm19 + - - 0.49 - - - - 0.51 - - vmulpd %zmm19, %zmm18, %zmm19 + - - 0.07 - - - - 0.93 - - vaddpd %zmm1, %zmm19, %zmm20 + - - - - - - - 1.00 - - vmulpd %zmm18, %zmm22, %zmm18 + - - 0.95 - - - - 0.05 - - vmulpd %zmm20, %zmm18, %zmm18 + - - 0.92 - - - - 0.08 - - vsubpd %zmm30, %zmm25, %zmm20 + - - - 0.94 - - - 0.06 - - leal (%rcx,%rcx), %edx + - - - - - - - - 1.00 - cmpq %rdx, %r11 + - - - - - - - - 1.00 - setne %dl + - - 0.44 - - - - - 0.56 - sete %al + - - - 0.07 - - - 0.02 0.91 - addl %ecx, %ecx + - - - 0.53 - - - 0.46 0.01 - incl 
%ecx + - - - 0.51 - - - 0.46 0.03 - cmpq %rcx, %r11 + - - 0.02 - - - - - 0.98 - sete %cl + - - 0.94 - - - - 0.06 - - vmulpd %zmm18, %zmm19, %zmm18 + - - 0.01 0.99 0.51 0.49 - - - - vmovupd 528(%rsp), %zmm19 + - - 0.47 - - - - 0.53 - - vsubpd %zmm28, %zmm19, %zmm19 + - - 0.04 - - - - - 0.96 - setne %dil + - - - 0.95 - - - 0.02 0.03 - movl %edi, %ebp + - - 0.01 - - - - - 0.99 - shlb $4, %bpl + - - - 0.96 - - - - 0.04 - subb %al, %bpl + - - - 0.06 - - - - 0.94 - addb $-17, %bpl + - - - - - - - 1.00 - - kmovd %ebp, %k1 + - - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm17, %k1 {%k1} + - - 0.02 0.97 0.50 0.50 - 0.01 - - vmovupd 272(%rsp), %zmm17 + - - 0.96 - - - - 0.04 - - vsubpd %zmm29, %zmm17, %zmm17 + - - - 1.00 - - - - - - leal (%rdx,%rdx), %eax + - - - 0.05 - - - - 0.95 - movl %edi, %ebp + - - 0.51 - - - - 0.49 - - vmulpd %zmm2, %zmm18, %zmm18 + - - 0.53 - - - - 0.47 - - vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} + - - 0.45 - - - - 0.55 - - vmulpd %zmm20, %zmm20, %zmm3 + - - 0.94 - - - - 0.06 - - vfmadd231pd %zmm17, %zmm17, %zmm3 + - - 0.03 - - - - 0.97 - - vfmadd231pd %zmm19, %zmm19, %zmm3 + - - 0.47 - - - - 0.53 - - vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} + - - 2.00 - - - - 1.00 - - vrcp14pd %zmm3, %zmm16 + - - 0.53 - - - - 0.47 - - vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} + - - 0.99 - - - - 0.01 - - vmulpd %zmm16, %zmm21, %zmm18 + - - 0.99 - - - - 0.01 - - vmulpd %zmm18, %zmm16, %zmm18 + - - 0.97 - - - - 0.03 - - vmulpd %zmm18, %zmm16, %zmm18 + - - 0.52 - - - - 0.48 - - vaddpd %zmm1, %zmm18, %zmm31 + - - 0.01 - - - - 0.99 - - vmulpd %zmm16, %zmm22, %zmm16 + - - 0.52 - - - - 0.48 - - vmulpd %zmm31, %zmm16, %zmm16 + - - - 0.99 0.50 0.50 - 0.01 - - vmovupd 464(%rsp), %zmm31 + - - 0.03 - - - - 0.97 - - vsubpd %zmm28, %zmm31, %zmm31 + - - 0.01 - - - - - 0.99 - shlb $5, %bpl + - - - 0.94 - - - - 0.06 - orb %al, %bpl + - - - 0.04 - - - - 0.96 - orb $-35, %bpl + - - - - - - - 1.00 - - kmovd %ebp, %k1 + - - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm3, %k1 {%k1} + - - - 0.99 0.50 0.50 - 0.01 - - vmovupd 208(%rsp), %zmm3 + - - 0.95 - - - - 0.05 - - vsubpd %zmm29, %zmm3, %zmm3 + - - 0.51 - - - - 0.49 - - vmulpd %zmm16, %zmm18, %zmm16 + - - 0.01 - - - - 0.99 - - vsubpd %zmm30, %zmm26, %zmm18 + - - 0.52 - - - - 0.48 - - vmulpd %zmm2, %zmm16, %zmm16 + - - 1.00 - - - - - - - vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} + - - 0.03 - - - - 0.97 - - vmulpd %zmm18, %zmm18, %zmm19 + - - 0.06 - - - - 0.94 - - vfmadd231pd %zmm3, %zmm3, %zmm19 + - - 0.50 - - - - 0.50 - - vfmadd231pd %zmm31, %zmm31, %zmm19 + - - - - - - - 1.00 - - vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} + - - 2.00 - - - - 1.00 - - vrcp14pd %zmm19, %zmm17 + - - 1.00 - - - - - - - vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} + - - 0.07 - - - - 0.93 - - vmulpd %zmm17, %zmm21, %zmm16 + - - 0.50 - - - - 0.50 - - vmulpd %zmm16, %zmm17, %zmm16 + - - 0.09 - - - - 0.91 - - vmulpd %zmm16, %zmm17, %zmm16 + - - 0.07 - - - - 0.93 - - vaddpd %zmm1, %zmm16, %zmm20 + - - 0.93 - - - - 0.07 - - vmulpd %zmm17, %zmm22, %zmm17 + - - 0.94 - - - - 0.06 - - vmulpd %zmm20, %zmm17, %zmm17 + - - 0.51 - - - - 0.49 - - vmulpd %zmm17, %zmm16, %zmm16 + - - - 1.00 - - - - - - leal (,%rdx,4), %eax + - - - - - - - - 1.00 - shlb $6, %dil + - - - 0.02 - - - - 0.98 - orb %al, %dil + - - - 0.48 - - - - 0.52 - orb $-69, %dil + - - - - - - - 1.00 - - kmovd %edi, %k1 + - - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm19, %k1 {%k1} + - - - 1.00 0.50 0.50 - - - - vmovupd 400(%rsp), %zmm17 + - - 0.49 - - - - 0.51 - - vsubpd %zmm28, %zmm17, %zmm17 + - - 0.49 - - - - 0.51 - - vsubpd %zmm29, %zmm23, %zmm19 + - - 
0.02 - - - - 0.98 - - vsubpd %zmm30, %zmm27, %zmm20 + - - 0.98 - - - - 0.02 - - vmulpd %zmm2, %zmm16, %zmm16 + - - 0.50 - - - - 0.50 - - vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} + - - 0.94 - - - - 0.06 - - vmulpd %zmm20, %zmm20, %zmm28 + - - 0.04 - - - - 0.96 - - vfmadd231pd %zmm19, %zmm19, %zmm28 + - - 0.07 - - - - 0.93 - - vfmadd231pd %zmm17, %zmm17, %zmm28 + - - 0.50 - - - - 0.50 - - vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} + - - 2.00 - - - - 1.00 - - vrcp14pd %zmm28, %zmm3 + - - 0.50 - - - - 0.50 - - vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} + - - 1.00 - - - - - - - vmulpd %zmm3, %zmm21, %zmm16 + - - 0.55 - - - - 0.45 - - vmulpd %zmm16, %zmm3, %zmm16 + - - 0.99 - - - - 0.01 - - vmulpd %zmm16, %zmm3, %zmm16 + - - 0.99 - - - - 0.01 - - vaddpd %zmm1, %zmm16, %zmm18 + - - - - - - - 1.00 - - vmulpd %zmm3, %zmm22, %zmm3 + - - 0.52 - - - - 0.48 - - vmulpd %zmm18, %zmm3, %zmm3 + - - 0.99 - - - - 0.01 - - vmulpd %zmm3, %zmm16, %zmm3 + - - - - - - - - 1.00 - shlb $3, %dl + - - - - - - - - 1.00 - shlb $7, %cl + - - - 1.00 - - - - - - orb %dl, %cl + - - - 0.52 - - - - 0.48 - addb $-9, %cl + - - - - - - - 1.00 - - kmovd %ecx, %k1 + - - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm28, %k1 {%k1} + - - 0.98 - - - - 0.02 - - vmulpd %zmm2, %zmm3, %zmm3 + - - 0.97 - - - - 0.03 - - vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} + - - 0.03 - - - - 0.97 - - vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} + - - 0.97 - - - - 0.03 - - vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} + - - - 0.48 - - - - 0.52 - incq %rbx + - - - 0.52 - - - - 0.48 - cmpq %rbx, %r9 + - - - - - - - - 1.00 - jne .LBB5_12 diff --git a/static_analysis/jan/analyses/gromacs-icx-avx512-dp-osaca.out b/static_analysis/jan/analyses/gromacs-icx-avx512-dp-osaca.out new file mode 100644 index 0000000..d4c0cba --- /dev/null +++ b/static_analysis/jan/analyses/gromacs-icx-avx512-dp-osaca.out @@ -0,0 +1,167 @@ +Open Source Architecture Code Analyzer (OSACA) - 0.4.12 +Analyzed file: gromacs-icx-avx512-dp.s +Architecture: CSX +Timestamp: 2023-02-10 16:30:53 + + + P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction + * - Instruction micro-ops not bound to a port + X - No throughput/latency information for this instruction in data file + + +Combined Analysis Report +------------------------ + Port pressure in cycles + | 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD | +-------------------------------------------------------------------------------------------------- +2241 | | | | | | | | || | | # pointer_increment=64 da67166e5736661e6b03ea29ee7bfd67 +2242 | | | | | | | | || | | # LLVM-MCA-BEGIN +2243 | | | | | | | | || | | .LBB5_12: # Parent Loop BB5_7 Depth=1 +2244 | | | | | | | | || | | # => This Inner Loop Header: Depth=2 +2245 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | movslq (%r10,%rbx,4), %rcx +2246 | | 1.00 | | | | 0.00 | | || 1.0 | | leaq (%rcx,%rcx,2), %rdx +2247 | 0.00 | | | | | | 1.00 | || 1.0 | | shlq $6, %rdx +2248 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd (%rsi,%rdx), %zmm28 # AlignMOV convert to UnAlignMOV +2249 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 64(%rsi,%rdx), %zmm29 # AlignMOV convert to UnAlignMOV +2250 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovupd 128(%rsi,%rdx), %zmm30 # AlignMOV convert to UnAlignMOV +2251 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 16(%rsp), %zmm3 # 64-byte Reload +2252 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm28, %zmm3, %zmm3 +2253 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm30, %zmm24, %zmm31 +2254 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | 
vmovupd 336(%rsp), %zmm16 # 64-byte Reload +2255 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm16, %zmm16 +2256 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm31, %zmm31, %zmm17 +2257 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm16, %zmm16, %zmm17 # zmm17 = (zmm16 * zmm16) + zmm17 +2258 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm3, %zmm3, %zmm17 # zmm17 = (zmm3 * zmm3) + zmm17 +2259 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm17, %zmm18 +2260 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm18, %zmm21, %zmm19 +2261 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm19, %zmm18, %zmm19 +2262 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm19, %zmm18, %zmm19 +2263 | 0.50 | | | | | 0.50 | | || 4.0 | | vaddpd %zmm1, %zmm19, %zmm20 +2264 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm22, %zmm18 +2265 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm18, %zmm18 +2266 | 1.00 | | | | | 0.00 | | || | | vsubpd %zmm30, %zmm25, %zmm20 +2267 | | 1.00 | | | | 0.00 | | || | | leal (%rcx,%rcx), %edx +2268 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rdx, %r11 +2269 | 0.00 | | | | | | 1.00 | || | | setne %dl +2270 | 0.00 | | | | | | 1.00 | || | | sete %al +2271 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | addl %ecx, %ecx +2272 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | incl %ecx +2273 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rcx, %r11 +2274 | 0.00 | | | | | | 1.00 | || | | sete %cl +2275 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm18, %zmm19, %zmm18 +2276 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 528(%rsp), %zmm19 # 64-byte Reload +2277 | 1.00 | | | | | 0.00 | | || | | vsubpd %zmm28, %zmm19, %zmm19 +2278 | 0.00 | | | | | | 1.00 | || | | setne %dil +2279 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %edi, %ebp +2280 | 0.00 | | | | | | 1.00 | || | 1.0 | shlb $4, %bpl +2281 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | 1.0 | subb %al, %bpl +2282 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | 1.0 | addb $-17, %bpl +2283 | 1.00 | | | | | | | || | | kmovd %ebp, %k1 +2284 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm17, %k1 {%k1} +2285 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 272(%rsp), %zmm17 # 64-byte Reload +2286 | 0.25 | | | | | 0.75 | | || | | vsubpd %zmm29, %zmm17, %zmm17 +2287 | | 1.00 | | | | 0.00 | | || | | leal (%rdx,%rdx), %eax +2288 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %edi, %ebp +2289 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm2, %zmm18, %zmm18 +2290 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14 +2291 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm20, %zmm3 +2292 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm17, %zmm17, %zmm3 # zmm3 = (zmm17 * zmm17) + zmm3 +2293 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm19, %zmm19, %zmm3 # zmm3 = (zmm19 * zmm19) + zmm3 +2294 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11 +2295 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm3, %zmm16 +2296 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7 +2297 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm21, %zmm18 +2298 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm16, %zmm18 +2299 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm16, %zmm18 +2300 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm1, %zmm18, %zmm31 +2301 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm22, %zmm16 +2302 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm31, 
%zmm16, %zmm16 +2303 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 464(%rsp), %zmm31 # 64-byte Reload +2304 | 0.75 | | | | | 0.25 | | || | | vsubpd %zmm28, %zmm31, %zmm31 +2305 | 0.00 | | | | | | 1.00 | || | 1.0 | shlb $5, %bpl +2306 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | 1.0 | orb %al, %bpl +2307 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | 1.0 | orb $-35, %bpl +2308 | 1.00 | | | | | | | || | | kmovd %ebp, %k1 +2309 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm3, %k1 {%k1} +2310 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 208(%rsp), %zmm3 # 64-byte Reload +2311 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm3, %zmm3 +2312 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm18, %zmm16 +2313 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm26, %zmm18 +2314 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm2, %zmm16, %zmm16 +2315 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15 +2316 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm18, %zmm19 +2317 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm3, %zmm3, %zmm19 # zmm19 = (zmm3 * zmm3) + zmm19 +2318 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm31, %zmm31, %zmm19 # zmm19 = (zmm31 * zmm31) + zmm19 +2319 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10 +2320 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm19, %zmm17 +2321 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6 +2322 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm21, %zmm16 +2323 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm17, %zmm16 +2324 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm17, %zmm16 +2325 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm1, %zmm16, %zmm20 +2326 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm22, %zmm17 +2327 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm17, %zmm17 +2328 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm16, %zmm16 +2329 | | 1.00 | | | | 0.00 | | || | | leal (,%rdx,4), %eax +2330 | 0.00 | | | | | | 1.00 | || | | shlb $6, %dil +2331 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | orb %al, %dil +2332 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | orb $-69, %dil +2333 | 1.00 | | | | | | | || | | kmovd %edi, %k1 +2334 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm19, %k1 {%k1} +2335 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 400(%rsp), %zmm17 # 64-byte Reload +2336 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm28, %zmm17, %zmm17 +2337 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm23, %zmm19 +2338 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm27, %zmm20 +2339 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm2, %zmm16, %zmm16 +2340 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13 +2341 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm20, %zmm28 +2342 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm19, %zmm19, %zmm28 # zmm28 = (zmm19 * zmm19) + zmm28 +2343 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm17, %zmm17, %zmm28 # zmm28 = (zmm17 * zmm17) + zmm28 +2344 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9 +2345 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm28, %zmm3 +2346 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5 +2347 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm21, %zmm16 +2348 | 0.00 | | | | | 1.00 | 
| || | | vmulpd %zmm16, %zmm3, %zmm16 +2349 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm16, %zmm3, %zmm16 +2350 | 0.00 | | | | | 1.00 | | || | | vaddpd %zmm1, %zmm16, %zmm18 +2351 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm22, %zmm3 +2352 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm18, %zmm3, %zmm3 +2353 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm16, %zmm3 +2354 | 0.00 | | | | | | 1.00 | || | | shlb $3, %dl +2355 | 0.00 | | | | | | 1.00 | || | | shlb $7, %cl +2356 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | | orb %dl, %cl +2357 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | addb $-9, %cl +2358 | 1.00 | | | | | | | || | | kmovd %ecx, %k1 +2359 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm28, %k1 {%k1} +2360 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm2, %zmm3, %zmm3 +2361 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12 +2362 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8 +2363 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4 +2364 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | incq %rbx +2365 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rbx, %r9 +2366 | | | | | | | | || | | * jne .LBB5_12 +2367 | | | | | | | | || | | # LLVM-MCA-END + + 44.0 15.0 5.50 5.50 5.50 5.50 44.0 15.0 66.0 6.0 + + + + +Loop-Carried Dependencies Analysis Report +----------------------------------------- +2280 | 6.0 | shlb $4, %bpl | [2280, 2281, 2282, 2305, 2306, 2307] +2363 | 4.0 | vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4| [2363] +2362 | 4.0 | vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8| [2362] +2361 | 4.0 | vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12| [2361] +2346 | 4.0 | vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5| [2346] +2344 | 4.0 | vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9| [2344] +2340 | 4.0 | vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13| [2340] +2321 | 4.0 | vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6| [2321] +2319 | 4.0 | vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10| [2319] +2315 | 4.0 | vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15| [2315] +2296 | 4.0 | vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7| [2296] +2294 | 4.0 | vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11| [2294] +2290 | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14| [2290] +2330 | 3.0 | shlb $6, %dil | [2330, 2331, 2332] +2364 | 1.0 | incq %rbx | [2364] + diff --git a/static_analysis/jan/analyses/gromacs-icx-avx512-sp-iaca.out b/static_analysis/jan/analyses/gromacs-icx-avx512-sp-iaca.out new file mode 100644 index 0000000..3a67dc4 --- /dev/null +++ b/static_analysis/jan/analyses/gromacs-icx-avx512-sp-iaca.out @@ -0,0 +1,162 @@ +Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45 +Analyzed File - gromacs-icx-avx512-sp.o +Binary Format - 64Bit +Architecture - SKX +Analysis Type - Throughput + +Throughput Analysis Report +-------------------------- +Block Throughput: 64.00 Cycles Throughput Bottleneck: Backend +Loop Count: 22 +Port Binding In Cycles Per Iteration: 
+-------------------------------------------------------------------------------------------------- +| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | +-------------------------------------------------------------------------------------------------- +| Cycles | 50.0 0.0 | 7.0 | 9.5 8.1 | 9.5 7.9 | 3.0 | 50.0 | 7.0 | 0.0 | +-------------------------------------------------------------------------------------------------- + +DV - Divider pipe (on port 0) +D - Data fetch pipe (on ports 2 and 3) +F - Macro Fusion with the previous instruction occurred +* - instruction micro-ops not bound to a port +^ - Micro Fusion occurred +# - ESP Tracking sync uop was issued +@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected +X - instruction not supported, was not accounted in Analysis + +| Num Of | Ports pressure in cycles | | +| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | +----------------------------------------------------------------------------------------- +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | movsxd rax, dword ptr [r11+rdx*4] +| 1* | | | | | | | | | mov rsi, rax +| 1 | | | | | | | 1.0 | | shl rsi, 0x5 +| 1 | | 1.0 | | | | | | | lea rbx, ptr [rsi+rsi*2] +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm15, zmmword ptr [rdi+rbx*1] +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm16, zmmword ptr [rdi+rbx*1+0x20] +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm27, zmmword ptr [rdi+rbx*1+0x40] +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x80] +| 1 | | | | | | 1.0 | | | vsubps zmm24, zmm1, zmm15 +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x140] +| 1 | 1.0 | | | | | | | | vsubps zmm25, zmm1, zmm16 +| 1 | | | | | | 1.0 | | | vsubps zmm26, zmm9, zmm27 +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp] +| 1 | 1.0 | | | | | | | | vsubps zmm21, zmm1, zmm15 +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x100] +| 1 | | | | | | 1.0 | | | vsubps zmm22, zmm1, zmm16 +| 1 | 1.0 | | | | | | | | vsubps zmm23, zmm10, zmm27 +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x1c0] +| 1 | | | | | | 1.0 | | | vsubps zmm17, zmm1, zmm15 +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0xc0] +| 1 | 1.0 | | | | | | | | vsubps zmm19, zmm1, zmm16 +| 1 | | | | | | 1.0 | | | vsubps zmm20, zmm11, zmm27 +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x180] +| 1 | 1.0 | | | | | | | | vsubps zmm18, zmm1, zmm15 +| 1 | | | | | | 1.0 | | | vsubps zmm16, zmm8, zmm16 +| 1 | 1.0 | | | | | | | | vsubps zmm15, zmm12, zmm27 +| 1 | | | | | | 1.0 | | | vmulps zmm27, zmm26, zmm26 +| 1 | 1.0 | | | | | | | | vfmadd231ps zmm27, zmm25, zmm25 +| 1 | | | | | | 1.0 | | | vfmadd231ps zmm27, zmm24, zmm24 +| 1 | 1.0 | | | | | | | | vmulps zmm28, zmm23, zmm23 +| 1 | | | | | | 1.0 | | | vfmadd231ps zmm28, zmm22, zmm22 +| 1 | 1.0 | | | | | | | | vfmadd231ps zmm28, zmm21, zmm21 +| 1 | | | | | | 1.0 | | | vmulps zmm29, zmm20, zmm20 +| 1 | 1.0 | | | | | | | | vfmadd231ps zmm29, zmm19, zmm19 +| 1 | | | | | | 1.0 | | | vfmadd231ps zmm29, zmm17, zmm17 +| 1 | 1.0 | | | | | | | | vmulps zmm30, zmm15, zmm15 +| 1 | | | | | | 1.0 | | | vfmadd231ps zmm30, zmm16, zmm16 +| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm31, zmm27 +| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm1, zmm28 +| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm2, zmm29 +| 1 | | | | | | 1.0 | | | vfmadd231ps zmm30, zmm18, zmm18 +| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm3, zmm30 +| 1 | | | | | | 1.0 | 
| | vmulps zmm4, zmm6, zmm31 +| 1 | | | | | | 1.0 | | | vmulps zmm4, zmm31, zmm4 +| 1 | | | | | | 1.0 | | | vmulps zmm4, zmm31, zmm4 +| 1 | 1.0 | | | | | | | | vaddps zmm5, zmm4, zmm13 +| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm7, zmm31 +| 1 | 1.0 | | | | | | | | vmulps zmm5, zmm31, zmm5 +| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm6, zmm1 +| 1 | 1.0 | | | | | | | | vmulps zmm31, zmm1, zmm31 +| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm1, zmm31 +| 1 | 1.0 | | | | | | | | vmulps zmm4, zmm4, zmm5 +| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm31, zmm13 +| 1 | 1.0 | | | | | | | | vmulps zmm1, zmm7, zmm1 +| 1 | | | | | | 1.0 | | | vmulps zmm1, zmm1, zmm5 +| 1 | 1.0 | | | | | | | | vmulps zmm5, zmm6, zmm2 +| 1 | | | | | | 1.0 | | | vmulps zmm5, zmm2, zmm5 +| 1 | 1.0 | | | | | | | | vmulps zmm5, zmm2, zmm5 +| 1 | | | | | | 1.0 | | | vmulps zmm1, zmm31, zmm1 +| 1 | 1.0 | | | | | | | | vaddps zmm31, zmm5, zmm13 +| 1 | | | | | | 1.0 | | | vmulps zmm2, zmm7, zmm2 +| 1 | 1.0 | | | | | | | | vmulps zmm2, zmm2, zmm31 +| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm6, zmm3 +| 1 | 1.0 | | | | | | | | vmulps zmm31, zmm3, zmm31 +| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm3, zmm31 +| 1 | 1.0 | | | | | | | | vmulps zmm2, zmm5, zmm2 +| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm31, zmm13 +| 1 | 1.0 | | | | | | | | vmulps zmm3, zmm7, zmm3 +| 1 | | | | | | 1.0 | | | vmulps zmm3, zmm3, zmm5 +| 1 | 1.0 | | | | | | | | vmulps zmm3, zmm31, zmm3 +| 1* | | | | | | | | | xor esi, esi +| 1* | | | | | | | | | xor edi, edi +| 1 | | 1.0 | | | | | | | test eax, 0x7fffffff +| 1 | | | | | | | 1.0 | | setz sil +| 1 | | | | | | | 1.0 | | setnz dil +| 1 | | 1.0 | | | | | | | mov eax, 0xff +| 1 | | | | | | | 1.0 | | cmovz eax, r8d +| 1 | | 1.0 | | | | | | | mov ecx, 0xff +| 1 | | | | | | | 1.0 | | cmovz ecx, r9d +| 1 | | 1.0 | | | | | | | xor esi, 0xff +| 1 | | | | | | 1.0 | | | kmovd k1, esi +| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm27, zmm0, 0x1 +| 1 | 1.0 | | | | | | | | vmulps zmm4, zmm4, zmm14 +| 1 | 1.0 | | | | | | | | vmulps zmm5{k1}{z}, zmm24, zmm4 +| 1 | | | | | | 1.0 | | | vmulps zmm24{k1}{z}, zmm25, zmm4 +| 1 | 1.0 | | | | | | | | vmulps zmm4{k1}{z}, zmm26, zmm4 +| 1 | | 1.0 | | | | | | | lea esi, ptr [rdi+rdi*2] +| 1 | | | | | | | 1.0 | | or esi, 0xfc +| 1 | | | | | | 1.0 | | | kmovd k1, esi +| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm28, zmm0, 0x1 +| 1 | 1.0 | | | | | | | | vmulps zmm1, zmm1, zmm14 +| 1 | 1.0 | | | | | | | | vmulps zmm21{k1}{z}, zmm21, zmm1 +| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm5, zmm21 +| 1 | 1.0 | | | | | | | | vmulps zmm21{k1}{z}, zmm22, zmm1 +| 1 | | | | | | 1.0 | | | vaddps zmm21, zmm24, zmm21 +| 1 | 1.0 | | | | | | | | vmulps zmm1{k1}{z}, zmm23, zmm1 +| 1 | | | | | | 1.0 | | | vaddps zmm1, zmm4, zmm1 +| 1 | | | | | | 1.0 | | | kmovd k1, eax +| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm29, zmm0, 0x1 +| 1 | 1.0 | | | | | | | | vmulps zmm2, zmm2, zmm14 +| 1 | 1.0 | | | | | | | | vmulps zmm4{k1}{z}, zmm17, zmm2 +| 1 | 1.0 | | | | | | | | vmulps zmm17{k1}{z}, zmm19, zmm2 +| 1 | | | | | | 1.0 | | | vmulps zmm2{k1}{z}, zmm20, zmm2 +| 1 | | | | | | 1.0 | | | kmovd k1, ecx +| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm30, zmm0, 0x1 +| 1 | 1.0 | | | | | | | | vmulps zmm3, zmm3, zmm14 +| 1 | 1.0 | | | | | | | | vmulps zmm18{k1}{z}, zmm18, zmm3 +| 1 | 1.0 | | | | | | | | vaddps zmm4, zmm4, zmm18 +| 1 | | | | | | 1.0 | | | vaddps zmm4, zmm5, zmm4 +| 1 | 1.0 | | | | | | | | vmulps zmm5{k1}{z}, zmm16, zmm3 +| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm17, zmm5 +| 1 | 1.0 | | | | | | | | vaddps zmm5, zmm21, zmm5 +| 
1 | | | | | | 1.0 | | | vmulps zmm3{k1}{z}, zmm15, zmm3 +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov rax, qword ptr [r15+0xb0] +| 1 | 1.0 | | | | | | | | vaddps zmm2, zmm2, zmm3 +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm3, zmmword ptr [rax+rbx*1] +| 1 | | | | | | 1.0 | | | vsubps zmm3, zmm3, zmm4 +| 2 | | | 0.5 | 0.5 | 1.0 | | | | vmovups zmmword ptr [rax+rbx*1], zmm3 +| 1 | 1.0 | | | | | | | | vaddps zmm1, zmm1, zmm2 +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm2, zmmword ptr [rax+rbx*1+0x20] +| 1 | | | | | | 1.0 | | | vsubps zmm2, zmm2, zmm5 +| 2 | | | 0.5 | 0.5 | 1.0 | | | | vmovups zmmword ptr [rax+rbx*1+0x20], zmm2 +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm2, zmmword ptr [rax+rbx*1+0x40] +| 1 | 1.0 | | | | | | | | vsubps zmm1, zmm2, zmm1 +| 2 | | | 0.5 | 0.5 | 1.0 | | | | vmovups zmmword ptr [rax+rbx*1+0x40], zmm1 +| 1* | | | | | | | | | cmp r10, rdx +| 0*F | | | | | | | | | jz 0x34 +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov rdi, qword ptr [r15+0xa0] +| 1 | | 1.0 | | | | | | | inc rdx +| 1 | | | | | | | 1.0 | | jmp 0xfffffffffffffcfc +Total Num Of Uops: 140 +Analysis Notes: +Backend allocation was stalled due to unavailable allocation resources. diff --git a/static_analysis/jan/analyses/gromacs-icx-avx512-sp-mca.out b/static_analysis/jan/analyses/gromacs-icx-avx512-sp-mca.out new file mode 100644 index 0000000..72dd89b --- /dev/null +++ b/static_analysis/jan/analyses/gromacs-icx-avx512-sp-mca.out @@ -0,0 +1,304 @@ + +[0] Code Region + +Iterations: 100 +Instructions: 13000 +Total Cycles: 5640 +Total uOps: 15400 + +Dispatch Width: 6 +uOps Per Cycle: 2.73 +IPC: 2.30 +Block RThroughput: 40.0 + + +Instruction Info: +[1]: #uOps +[2]: Latency +[3]: RThroughput +[4]: MayLoad +[5]: MayStore +[6]: HasSideEffects (U) + +[1] [2] [3] [4] [5] [6] Instructions: + 1 5 0.50 * movslq (%r11,%rdx,4), %rax + 1 1 0.25 movq %rax, %rsi + 1 1 0.50 shlq $5, %rsi + 1 1 0.50 leaq (%rsi,%rsi,2), %rbx + 2 8 0.50 * vmovups (%rdi,%rbx), %zmm15 + 2 8 0.50 * vmovups 32(%rdi,%rbx), %zmm16 + 2 8 0.50 * vmovups 64(%rdi,%rbx), %zmm27 + 2 8 0.50 * vmovups 128(%rsp), %zmm1 + 1 4 0.50 vsubps %zmm15, %zmm1, %zmm24 + 2 8 0.50 * vmovups 320(%rsp), %zmm1 + 1 4 0.50 vsubps %zmm16, %zmm1, %zmm25 + 1 4 0.50 vsubps %zmm27, %zmm9, %zmm26 + 2 8 0.50 * vmovups (%rsp), %zmm1 + 1 4 0.50 vsubps %zmm15, %zmm1, %zmm21 + 2 8 0.50 * vmovups 256(%rsp), %zmm1 + 1 4 0.50 vsubps %zmm16, %zmm1, %zmm22 + 1 4 0.50 vsubps %zmm27, %zmm10, %zmm23 + 2 8 0.50 * vmovups 448(%rsp), %zmm1 + 1 4 0.50 vsubps %zmm15, %zmm1, %zmm17 + 2 8 0.50 * vmovups 192(%rsp), %zmm1 + 1 4 0.50 vsubps %zmm16, %zmm1, %zmm19 + 1 4 0.50 vsubps %zmm27, %zmm11, %zmm20 + 2 8 0.50 * vmovups 384(%rsp), %zmm1 + 1 4 0.50 vsubps %zmm15, %zmm1, %zmm18 + 1 4 0.50 vsubps %zmm16, %zmm8, %zmm16 + 1 4 0.50 vsubps %zmm27, %zmm12, %zmm15 + 1 4 0.50 vmulps %zmm26, %zmm26, %zmm27 + 1 4 0.50 vfmadd231ps %zmm25, %zmm25, %zmm27 + 1 4 0.50 vfmadd231ps %zmm24, %zmm24, %zmm27 + 1 4 0.50 vmulps %zmm23, %zmm23, %zmm28 + 1 4 0.50 vfmadd231ps %zmm22, %zmm22, %zmm28 + 1 4 0.50 vfmadd231ps %zmm21, %zmm21, %zmm28 + 1 4 0.50 vmulps %zmm20, %zmm20, %zmm29 + 1 4 0.50 vfmadd231ps %zmm19, %zmm19, %zmm29 + 1 4 0.50 vfmadd231ps %zmm17, %zmm17, %zmm29 + 1 4 0.50 vmulps %zmm15, %zmm15, %zmm30 + 1 4 0.50 vfmadd231ps %zmm16, %zmm16, %zmm30 + 3 4 2.00 vrcp14ps %zmm27, %zmm31 + 3 4 2.00 vrcp14ps %zmm28, %zmm1 + 3 4 2.00 vrcp14ps %zmm29, %zmm2 + 1 4 0.50 vfmadd231ps %zmm18, %zmm18, %zmm30 + 3 4 2.00 vrcp14ps %zmm30, %zmm3 + 1 4 0.50 vmulps %zmm31, %zmm6, %zmm4 + 1 4 0.50 vmulps %zmm4, %zmm31, %zmm4 + 1 4 
0.50 vmulps %zmm4, %zmm31, %zmm4 + 1 4 0.50 vaddps %zmm13, %zmm4, %zmm5 + 1 4 0.50 vmulps %zmm31, %zmm7, %zmm31 + 1 4 0.50 vmulps %zmm5, %zmm31, %zmm5 + 1 4 0.50 vmulps %zmm1, %zmm6, %zmm31 + 1 4 0.50 vmulps %zmm31, %zmm1, %zmm31 + 1 4 0.50 vmulps %zmm31, %zmm1, %zmm31 + 1 4 0.50 vmulps %zmm5, %zmm4, %zmm4 + 1 4 0.50 vaddps %zmm13, %zmm31, %zmm5 + 1 4 0.50 vmulps %zmm1, %zmm7, %zmm1 + 1 4 0.50 vmulps %zmm5, %zmm1, %zmm1 + 1 4 0.50 vmulps %zmm2, %zmm6, %zmm5 + 1 4 0.50 vmulps %zmm5, %zmm2, %zmm5 + 1 4 0.50 vmulps %zmm5, %zmm2, %zmm5 + 1 4 0.50 vmulps %zmm1, %zmm31, %zmm1 + 1 4 0.50 vaddps %zmm13, %zmm5, %zmm31 + 1 4 0.50 vmulps %zmm2, %zmm7, %zmm2 + 1 4 0.50 vmulps %zmm31, %zmm2, %zmm2 + 1 4 0.50 vmulps %zmm3, %zmm6, %zmm31 + 1 4 0.50 vmulps %zmm31, %zmm3, %zmm31 + 1 4 0.50 vmulps %zmm31, %zmm3, %zmm31 + 1 4 0.50 vmulps %zmm2, %zmm5, %zmm2 + 1 4 0.50 vaddps %zmm13, %zmm31, %zmm5 + 1 4 0.50 vmulps %zmm3, %zmm7, %zmm3 + 1 4 0.50 vmulps %zmm5, %zmm3, %zmm3 + 1 4 0.50 vmulps %zmm3, %zmm31, %zmm3 + 1 0 0.17 xorl %esi, %esi + 1 0 0.17 xorl %edi, %edi + 1 1 0.25 testl $2147483647, %eax + 1 1 0.50 sete %sil + 1 1 0.50 setne %dil + 1 1 0.25 movl $255, %eax + 1 1 0.50 cmovel %r8d, %eax + 1 1 0.25 movl $255, %ecx + 1 1 0.50 cmovel %r9d, %ecx + 1 1 0.25 xorl $255, %esi + 1 1 1.00 kmovd %esi, %k1 + 1 4 1.00 vcmpltps %zmm0, %zmm27, %k1 {%k1} + 1 4 0.50 vmulps %zmm14, %zmm4, %zmm4 + 1 4 0.50 vmulps %zmm4, %zmm24, %zmm5 {%k1} {z} + 1 4 0.50 vmulps %zmm4, %zmm25, %zmm24 {%k1} {z} + 1 4 0.50 vmulps %zmm4, %zmm26, %zmm4 {%k1} {z} + 1 1 0.50 leal (%rdi,%rdi,2), %esi + 1 1 0.25 orl $252, %esi + 1 1 1.00 kmovd %esi, %k1 + 1 4 1.00 vcmpltps %zmm0, %zmm28, %k1 {%k1} + 1 4 0.50 vmulps %zmm14, %zmm1, %zmm1 + 1 4 0.50 vmulps %zmm1, %zmm21, %zmm21 {%k1} {z} + 1 4 0.50 vaddps %zmm21, %zmm5, %zmm5 + 1 4 0.50 vmulps %zmm1, %zmm22, %zmm21 {%k1} {z} + 1 4 0.50 vaddps %zmm21, %zmm24, %zmm21 + 1 4 0.50 vmulps %zmm1, %zmm23, %zmm1 {%k1} {z} + 1 4 0.50 vaddps %zmm1, %zmm4, %zmm1 + 1 1 1.00 kmovd %eax, %k1 + 1 4 1.00 vcmpltps %zmm0, %zmm29, %k1 {%k1} + 1 4 0.50 vmulps %zmm14, %zmm2, %zmm2 + 1 4 0.50 vmulps %zmm2, %zmm17, %zmm4 {%k1} {z} + 1 4 0.50 vmulps %zmm2, %zmm19, %zmm17 {%k1} {z} + 1 4 0.50 vmulps %zmm2, %zmm20, %zmm2 {%k1} {z} + 1 1 1.00 kmovd %ecx, %k1 + 1 4 1.00 vcmpltps %zmm0, %zmm30, %k1 {%k1} + 1 4 0.50 vmulps %zmm14, %zmm3, %zmm3 + 1 4 0.50 vmulps %zmm3, %zmm18, %zmm18 {%k1} {z} + 1 4 0.50 vaddps %zmm18, %zmm4, %zmm4 + 1 4 0.50 vaddps %zmm4, %zmm5, %zmm4 + 1 4 0.50 vmulps %zmm3, %zmm16, %zmm5 {%k1} {z} + 1 4 0.50 vaddps %zmm5, %zmm17, %zmm5 + 1 4 0.50 vaddps %zmm5, %zmm21, %zmm5 + 1 4 0.50 vmulps %zmm3, %zmm15, %zmm3 {%k1} {z} + 1 5 0.50 * movq 176(%r15), %rax + 1 4 0.50 vaddps %zmm3, %zmm2, %zmm2 + 2 8 0.50 * vmovups (%rax,%rbx), %zmm3 + 1 4 0.50 vsubps %zmm4, %zmm3, %zmm3 + 2 1 1.00 * vmovups %zmm3, (%rax,%rbx) + 1 4 0.50 vaddps %zmm2, %zmm1, %zmm1 + 2 8 0.50 * vmovups 32(%rax,%rbx), %zmm2 + 1 4 0.50 vsubps %zmm5, %zmm2, %zmm2 + 2 1 1.00 * vmovups %zmm2, 32(%rax,%rbx) + 2 8 0.50 * vmovups 64(%rax,%rbx), %zmm2 + 1 4 0.50 vsubps %zmm1, %zmm2, %zmm1 + 2 1 1.00 * vmovups %zmm1, 64(%rax,%rbx) + 1 1 0.25 cmpq %rdx, %r10 + 1 1 0.50 je .LBB4_18 + 1 5 0.50 * movq 160(%r15), %rdi + 1 1 0.25 incq %rdx + 1 1 0.50 jmp .LBB4_8 + + +Resources: +[0] - SKXDivider +[1] - SKXFPDivider +[2] - SKXPort0 +[3] - SKXPort1 +[4] - SKXPort2 +[5] - SKXPort3 +[6] - SKXPort4 +[7] - SKXPort5 +[8] - SKXPort6 +[9] - SKXPort7 + + +Resource pressure per iteration: +[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] + - - 52.01 14.97 8.49 8.51 3.00 52.02 11.00 
2.00 + +Resource pressure by instruction: +[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: + - - - - 0.49 0.51 - - - - movslq (%r11,%rdx,4), %rax + - - - - - - - - 1.00 - movq %rax, %rsi + - - - - - - - - 1.00 - shlq $5, %rsi + - - - 1.00 - - - - - - leaq (%rsi,%rsi,2), %rbx + - - 0.01 0.99 0.50 0.50 - - - - vmovups (%rdi,%rbx), %zmm15 + - - - - 0.50 0.50 - 1.00 - - vmovups 32(%rdi,%rbx), %zmm16 + - - - 1.00 0.50 0.50 - - - - vmovups 64(%rdi,%rbx), %zmm27 + - - - 0.99 0.51 0.49 - 0.01 - - vmovups 128(%rsp), %zmm1 + - - 0.01 - - - - 0.99 - - vsubps %zmm15, %zmm1, %zmm24 + - - - 1.00 0.49 0.51 - - - - vmovups 320(%rsp), %zmm1 + - - 0.99 - - - - 0.01 - - vsubps %zmm16, %zmm1, %zmm25 + - - 0.01 - - - - 0.99 - - vsubps %zmm27, %zmm9, %zmm26 + - - 0.01 0.99 0.51 0.49 - - - - vmovups (%rsp), %zmm1 + - - - - - - - 1.00 - - vsubps %zmm15, %zmm1, %zmm21 + - - - - 0.49 0.51 - 1.00 - - vmovups 256(%rsp), %zmm1 + - - 0.01 - - - - 0.99 - - vsubps %zmm16, %zmm1, %zmm22 + - - 0.99 - - - - 0.01 - - vsubps %zmm27, %zmm10, %zmm23 + - - - 1.00 0.51 0.49 - - - - vmovups 448(%rsp), %zmm1 + - - 0.01 - - - - 0.99 - - vsubps %zmm15, %zmm1, %zmm17 + - - 0.01 - 0.49 0.51 - 0.99 - - vmovups 192(%rsp), %zmm1 + - - - - - - - 1.00 - - vsubps %zmm16, %zmm1, %zmm19 + - - 0.01 - - - - 0.99 - - vsubps %zmm27, %zmm11, %zmm20 + - - 0.99 - 0.50 0.50 - 0.01 - - vmovups 384(%rsp), %zmm1 + - - - - - - - 1.00 - - vsubps %zmm15, %zmm1, %zmm18 + - - 0.01 - - - - 0.99 - - vsubps %zmm16, %zmm8, %zmm16 + - - - - - - - 1.00 - - vsubps %zmm27, %zmm12, %zmm15 + - - 1.00 - - - - - - - vmulps %zmm26, %zmm26, %zmm27 + - - 1.00 - - - - - - - vfmadd231ps %zmm25, %zmm25, %zmm27 + - - 0.99 - - - - 0.01 - - vfmadd231ps %zmm24, %zmm24, %zmm27 + - - - - - - - 1.00 - - vmulps %zmm23, %zmm23, %zmm28 + - - - - - - - 1.00 - - vfmadd231ps %zmm22, %zmm22, %zmm28 + - - 0.01 - - - - 0.99 - - vfmadd231ps %zmm21, %zmm21, %zmm28 + - - 0.01 - - - - 0.99 - - vmulps %zmm20, %zmm20, %zmm29 + - - 0.01 - - - - 0.99 - - vfmadd231ps %zmm19, %zmm19, %zmm29 + - - - - - - - 1.00 - - vfmadd231ps %zmm17, %zmm17, %zmm29 + - - 0.01 - - - - 0.99 - - vmulps %zmm15, %zmm15, %zmm30 + - - 0.01 - - - - 0.99 - - vfmadd231ps %zmm16, %zmm16, %zmm30 + - - 2.00 - - - - 1.00 - - vrcp14ps %zmm27, %zmm31 + - - 2.00 - - - - 1.00 - - vrcp14ps %zmm28, %zmm1 + - - 2.00 - - - - 1.00 - - vrcp14ps %zmm29, %zmm2 + - - 1.00 - - - - - - - vfmadd231ps %zmm18, %zmm18, %zmm30 + - - 2.00 - - - - 1.00 - - vrcp14ps %zmm30, %zmm3 + - - - - - - - 1.00 - - vmulps %zmm31, %zmm6, %zmm4 + - - 1.00 - - - - - - - vmulps %zmm4, %zmm31, %zmm4 + - - 1.00 - - - - - - - vmulps %zmm4, %zmm31, %zmm4 + - - 1.00 - - - - - - - vaddps %zmm13, %zmm4, %zmm5 + - - - - - - - 1.00 - - vmulps %zmm31, %zmm7, %zmm31 + - - 1.00 - - - - - - - vmulps %zmm5, %zmm31, %zmm5 + - - 1.00 - - - - - - - vmulps %zmm1, %zmm6, %zmm31 + - - 1.00 - - - - - - - vmulps %zmm31, %zmm1, %zmm31 + - - 0.99 - - - - 0.01 - - vmulps %zmm31, %zmm1, %zmm31 + - - 1.00 - - - - - - - vmulps %zmm5, %zmm4, %zmm4 + - - - - - - - 1.00 - - vaddps %zmm13, %zmm31, %zmm5 + - - - - - - - 1.00 - - vmulps %zmm1, %zmm7, %zmm1 + - - - - - - - 1.00 - - vmulps %zmm5, %zmm1, %zmm1 + - - - - - - - 1.00 - - vmulps %zmm2, %zmm6, %zmm5 + - - - - - - - 1.00 - - vmulps %zmm5, %zmm2, %zmm5 + - - - - - - - 1.00 - - vmulps %zmm5, %zmm2, %zmm5 + - - 0.99 - - - - 0.01 - - vmulps %zmm1, %zmm31, %zmm1 + - - - - - - - 1.00 - - vaddps %zmm13, %zmm5, %zmm31 + - - 1.00 - - - - - - - vmulps %zmm2, %zmm7, %zmm2 + - - - - - - - 1.00 - - vmulps %zmm31, %zmm2, %zmm2 + - - - - - - - 1.00 - - 
vmulps %zmm3, %zmm6, %zmm31 + - - 1.00 - - - - - - - vmulps %zmm31, %zmm3, %zmm31 + - - 1.00 - - - - - - - vmulps %zmm31, %zmm3, %zmm31 + - - 1.00 - - - - - - - vmulps %zmm2, %zmm5, %zmm2 + - - 1.00 - - - - - - - vaddps %zmm13, %zmm31, %zmm5 + - - 1.00 - - - - - - - vmulps %zmm3, %zmm7, %zmm3 + - - 1.00 - - - - - - - vmulps %zmm5, %zmm3, %zmm3 + - - 1.00 - - - - - - - vmulps %zmm3, %zmm31, %zmm3 + - - - - - - - - - - xorl %esi, %esi + - - - - - - - - - - xorl %edi, %edi + - - - - - - - - 1.00 - testl $2147483647, %eax + - - - - - - - - 1.00 - sete %sil + - - - - - - - - 1.00 - setne %dil + - - - 1.00 - - - - - - movl $255, %eax + - - - - - - - - 1.00 - cmovel %r8d, %eax + - - - 1.00 - - - - - - movl $255, %ecx + - - - - - - - - 1.00 - cmovel %r9d, %ecx + - - - 1.00 - - - - - - xorl $255, %esi + - - - - - - - 1.00 - - kmovd %esi, %k1 + - - - - - - - 1.00 - - vcmpltps %zmm0, %zmm27, %k1 {%k1} + - - - - - - - 1.00 - - vmulps %zmm14, %zmm4, %zmm4 + - - 1.00 - - - - - - - vmulps %zmm4, %zmm24, %zmm5 {%k1} {z} + - - - - - - - 1.00 - - vmulps %zmm4, %zmm25, %zmm24 {%k1} {z} + - - 1.00 - - - - - - - vmulps %zmm4, %zmm26, %zmm4 {%k1} {z} + - - - 1.00 - - - - - - leal (%rdi,%rdi,2), %esi + - - - - - - - - 1.00 - orl $252, %esi + - - - - - - - 1.00 - - kmovd %esi, %k1 + - - - - - - - 1.00 - - vcmpltps %zmm0, %zmm28, %k1 {%k1} + - - 0.99 - - - - 0.01 - - vmulps %zmm14, %zmm1, %zmm1 + - - 0.99 - - - - 0.01 - - vmulps %zmm1, %zmm21, %zmm21 {%k1} {z} + - - 1.00 - - - - - - - vaddps %zmm21, %zmm5, %zmm5 + - - 0.01 - - - - 0.99 - - vmulps %zmm1, %zmm22, %zmm21 {%k1} {z} + - - - - - - - 1.00 - - vaddps %zmm21, %zmm24, %zmm21 + - - 0.99 - - - - 0.01 - - vmulps %zmm1, %zmm23, %zmm1 {%k1} {z} + - - 1.00 - - - - - - - vaddps %zmm1, %zmm4, %zmm1 + - - - - - - - 1.00 - - kmovd %eax, %k1 + - - - - - - - 1.00 - - vcmpltps %zmm0, %zmm29, %k1 {%k1} + - - - - - - - 1.00 - - vmulps %zmm14, %zmm2, %zmm2 + - - 1.00 - - - - - - - vmulps %zmm2, %zmm17, %zmm4 {%k1} {z} + - - - - - - - 1.00 - - vmulps %zmm2, %zmm19, %zmm17 {%k1} {z} + - - 1.00 - - - - - - - vmulps %zmm2, %zmm20, %zmm2 {%k1} {z} + - - - - - - - 1.00 - - kmovd %ecx, %k1 + - - - - - - - 1.00 - - vcmpltps %zmm0, %zmm30, %k1 {%k1} + - - 1.00 - - - - - - - vmulps %zmm14, %zmm3, %zmm3 + - - - - - - - 1.00 - - vmulps %zmm3, %zmm18, %zmm18 {%k1} {z} + - - 0.99 - - - - 0.01 - - vaddps %zmm18, %zmm4, %zmm4 + - - 1.00 - - - - - - - vaddps %zmm4, %zmm5, %zmm4 + - - - - - - - 1.00 - - vmulps %zmm3, %zmm16, %zmm5 {%k1} {z} + - - 1.00 - - - - - - - vaddps %zmm5, %zmm17, %zmm5 + - - 0.99 - - - - 0.01 - - vaddps %zmm5, %zmm21, %zmm5 + - - 1.00 - - - - - - - vmulps %zmm3, %zmm15, %zmm3 {%k1} {z} + - - - - 1.00 - - - - - movq 176(%r15), %rax + - - 0.99 - - - - 0.01 - - vaddps %zmm3, %zmm2, %zmm2 + - - - 1.00 0.50 0.50 - - - - vmovups (%rax,%rbx), %zmm3 + - - 0.99 - - - - 0.01 - - vsubps %zmm4, %zmm3, %zmm3 + - - - - - - 1.00 - - 1.00 vmovups %zmm3, (%rax,%rbx) + - - 1.00 - - - - - - - vaddps %zmm2, %zmm1, %zmm1 + - - - 1.00 0.50 0.50 - - - - vmovups 32(%rax,%rbx), %zmm2 + - - 1.00 - - - - - - - vsubps %zmm5, %zmm2, %zmm2 + - - - - - - 1.00 - - 1.00 vmovups %zmm2, 32(%rax,%rbx) + - - - 1.00 0.50 0.50 - - - - vmovups 64(%rax,%rbx), %zmm2 + - - 0.99 - - - - 0.01 - - vsubps %zmm1, %zmm2, %zmm1 + - - - - - 1.00 1.00 - - - vmovups %zmm1, 64(%rax,%rbx) + - - - - - - - - 1.00 - cmpq %rdx, %r10 + - - - - - - - - 1.00 - je .LBB4_18 + - - - - 0.50 0.50 - - - - movq 160(%r15), %rdi + - - - 1.00 - - - - - - incq %rdx + - - - - - - - - 1.00 - jmp .LBB4_8 diff --git 
a/static_analysis/jan/analyses/gromacs-icx-avx512-sp-osaca.out b/static_analysis/jan/analyses/gromacs-icx-avx512-sp-osaca.out new file mode 100644 index 0000000..e3f5617 --- /dev/null +++ b/static_analysis/jan/analyses/gromacs-icx-avx512-sp-osaca.out @@ -0,0 +1,161 @@ +Open Source Architecture Code Analyzer (OSACA) - 0.4.12 +Analyzed file: gromacs-icx-avx512-sp.s +Architecture: CSX +Timestamp: 2023-02-10 16:31:04 + + + P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction + * - Instruction micro-ops not bound to a port + X - No throughput/latency information for this instruction in data file + + +Combined Analysis Report +------------------------ + Port pressure in cycles + | 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD | +-------------------------------------------------------------------------------------------------- +1662 | | | | | | | | || | | # pointer_increment=64 0f91ac4f7fe1a70d0c899f7f3e745649 +1663 | | | | | | | | || | | # LLVM-MCA-BEGIN +1664 | | | | | | | | || | | .LBB4_8: # =>This Inner Loop Header: Depth=1 +1665 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | movslq (%r11,%rdx,4), %rax +1666 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || 1.0 | | movq %rax, %rsi +1667 | 0.00 | | | | | | 1.00 | || 1.0 | | shlq $5, %rsi +1668 | | 1.00 | | | | 0.00 | | || 1.0 | | leaq (%rsi,%rsi,2), %rbx +1669 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%rdi,%rbx), %zmm15 # AlignMOV convert to UnAlignMOV +1670 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 32(%rdi,%rbx), %zmm16 # AlignMOV convert to UnAlignMOV +1671 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovups 64(%rdi,%rbx), %zmm27 # AlignMOV convert to UnAlignMOV +1672 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 128(%rsp), %zmm1 # 64-byte Reload +1673 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm24 +1674 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 320(%rsp), %zmm1 # 64-byte Reload +1675 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm1, %zmm25 +1676 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubps %zmm27, %zmm9, %zmm26 +1677 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%rsp), %zmm1 # 64-byte Reload +1678 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm21 +1679 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 256(%rsp), %zmm1 # 64-byte Reload +1680 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm1, %zmm22 +1681 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm27, %zmm10, %zmm23 +1682 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 448(%rsp), %zmm1 # 64-byte Reload +1683 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm17 +1684 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 192(%rsp), %zmm1 # 64-byte Reload +1685 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm1, %zmm19 +1686 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm27, %zmm11, %zmm20 +1687 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 384(%rsp), %zmm1 # 64-byte Reload +1688 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm18 +1689 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm8, %zmm16 +1690 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm27, %zmm12, %zmm15 +1691 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm26, %zmm26, %zmm27 +1692 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231ps %zmm25, %zmm25, %zmm27 # zmm27 = (zmm25 * zmm25) + zmm27 +1693 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231ps %zmm24, %zmm24, %zmm27 # zmm27 = (zmm24 * zmm24) + zmm27 +1694 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm23, %zmm23, %zmm28 +1695 | 0.50 | | | | | 0.50 | | || 
| | vfmadd231ps %zmm22, %zmm22, %zmm28 # zmm28 = (zmm22 * zmm22) + zmm28 +1696 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm21, %zmm21, %zmm28 # zmm28 = (zmm21 * zmm21) + zmm28 +1697 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm20, %zmm20, %zmm29 +1698 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm19, %zmm19, %zmm29 # zmm29 = (zmm19 * zmm19) + zmm29 +1699 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm17, %zmm17, %zmm29 # zmm29 = (zmm17 * zmm17) + zmm29 +1700 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm15, %zmm15, %zmm30 +1701 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm16, %zmm16, %zmm30 # zmm30 = (zmm16 * zmm16) + zmm30 +1702 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14ps %zmm27, %zmm31 +1703 | 2.50 | | | | | 0.50 | | || | | vrcp14ps %zmm28, %zmm1 +1704 | 2.50 | | | | | 0.50 | | || | | vrcp14ps %zmm29, %zmm2 +1705 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm18, %zmm18, %zmm30 # zmm30 = (zmm18 * zmm18) + zmm30 +1706 | 2.50 | | | | | 0.50 | | || | | vrcp14ps %zmm30, %zmm3 +1707 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm31, %zmm6, %zmm4 +1708 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm4, %zmm31, %zmm4 +1709 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm4, %zmm31, %zmm4 +1710 | 0.50 | | | | | 0.50 | | || 4.0 | | vaddps %zmm13, %zmm4, %zmm5 +1711 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm7, %zmm31 +1712 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm5, %zmm31, %zmm5 +1713 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm6, %zmm31 +1714 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm1, %zmm31 +1715 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm1, %zmm31 +1716 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm5, %zmm4, %zmm4 +1717 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm13, %zmm31, %zmm5 +1718 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm7, %zmm1 +1719 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm5, %zmm1, %zmm1 +1720 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm6, %zmm5 +1721 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm5, %zmm2, %zmm5 +1722 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm5, %zmm2, %zmm5 +1723 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm31, %zmm1 +1724 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm13, %zmm5, %zmm31 +1725 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm7, %zmm2 +1726 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm2, %zmm2 +1727 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm3, %zmm6, %zmm31 +1728 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm3, %zmm31 +1729 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm3, %zmm31 +1730 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm5, %zmm2 +1731 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm13, %zmm31, %zmm5 +1732 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm3, %zmm7, %zmm3 +1733 | 1.00 | | | | | 0.00 | | || | | vmulps %zmm5, %zmm3, %zmm3 +1734 | 1.00 | | | | | 0.00 | | || | | vmulps %zmm3, %zmm31, %zmm3 +1735 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | xorl %esi, %esi +1736 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | xorl %edi, %edi +1737 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | testl $2147483647, %eax # imm = 0x7FFFFFFF +1738 | 0.00 | | | | | | 1.00 | || | | sete %sil +1739 | 0.00 | | | | | | 1.00 | || | | setne %dil +1740 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl $255, %eax +1741 | 0.00 | | | | | | 1.00 | || | | cmovel %r8d, %eax +1742 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl $255, %ecx +1743 | 0.00 | | | | | | 1.00 | || | | cmovel %r9d, %ecx +1744 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | | xorl $255, %esi +1745 | 1.00 | | | | | 
| | || | | kmovd %esi, %k1 +1746 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm27, %k1 {%k1} +1747 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm14, %zmm4, %zmm4 +1748 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm4, %zmm24, %zmm5 {%k1} {z} +1749 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm4, %zmm25, %zmm24 {%k1} {z} +1750 | 0.25 | | | | | 0.75 | | || | | vmulps %zmm4, %zmm26, %zmm4 {%k1} {z} +1751 | | 1.00 | | | | 0.00 | | || | | leal (%rdi,%rdi,2), %esi +1752 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | orl $252, %esi +1753 | 1.00 | | | | | | | || | | kmovd %esi, %k1 +1754 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm28, %k1 {%k1} +1755 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm14, %zmm1, %zmm1 +1756 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm21, %zmm21 {%k1} {z} +1757 | 0.50 | | | | | 0.50 | | || 4.0 | | vaddps %zmm21, %zmm5, %zmm5 +1758 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm22, %zmm21 {%k1} {z} +1759 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm21, %zmm24, %zmm21 +1760 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm23, %zmm1 {%k1} {z} +1761 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm1, %zmm4, %zmm1 +1762 | 1.00 | | | | | | | || | | kmovd %eax, %k1 +1763 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm29, %k1 {%k1} +1764 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm14, %zmm2, %zmm2 +1765 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm17, %zmm4 {%k1} {z} +1766 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm19, %zmm17 {%k1} {z} +1767 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm20, %zmm2 {%k1} {z} +1768 | 1.00 | | | | | | | || | | kmovd %ecx, %k1 +1769 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm30, %k1 {%k1} +1770 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm14, %zmm3, %zmm3 +1771 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm3, %zmm18, %zmm18 {%k1} {z} +1772 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm18, %zmm4, %zmm4 +1773 | 0.25 | | | | | 0.75 | | || 4.0 | | vaddps %zmm4, %zmm5, %zmm4 +1774 | 0.00 | | | | | 1.00 | | || | | vmulps %zmm3, %zmm16, %zmm5 {%k1} {z} +1775 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm5, %zmm17, %zmm5 +1776 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm5, %zmm21, %zmm5 +1777 | 0.00 | | | | | 1.00 | | || | | vmulps %zmm3, %zmm15, %zmm3 {%k1} {z} +1778 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | movq 176(%r15), %rax +1779 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm3, %zmm2, %zmm2 +1780 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%rax,%rbx), %zmm3 # AlignMOV convert to UnAlignMOV +1781 | 0.00 | | | | | 1.00 | | || 4.0 | | vsubps %zmm4, %zmm3, %zmm3 +1782 | | | 0.50 | 0.50 | 1.00 | | | || 0.0 | | vmovups %zmm3, (%rax,%rbx) # AlignMOV convert to UnAlignMOV +1783 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm2, %zmm1, %zmm1 +1784 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 32(%rax,%rbx), %zmm2 # AlignMOV convert to UnAlignMOV +1785 | 0.00 | | | | | 1.00 | | || | | vsubps %zmm5, %zmm2, %zmm2 +1786 | | | 0.50 | 0.50 | 1.00 | | | || | | vmovups %zmm2, 32(%rax,%rbx) # AlignMOV convert to UnAlignMOV +1787 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 64(%rax,%rbx), %zmm2 # AlignMOV convert to UnAlignMOV +1788 | 0.00 | | | | | 1.00 | | || | | vsubps %zmm1, %zmm2, %zmm1 +1789 | | | 0.50 | 0.50 | 1.00 | | | || | | vmovups %zmm1, 64(%rax,%rbx) # AlignMOV convert to UnAlignMOV +1790 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rdx, %r10 +1791 | | | | | | | | || | | * je .LBB4_18 +1792 | | | | | | | | || | | # %bb.9: # in Loop: Header=BB4_8 Depth=1 +1793 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | movq 
160(%r15), %rdi +1794 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | 1.0 | incq %rdx +1795 | 0.00 | | | | | | 1.00 | || | | jmp .LBB4_8 +1796 | | | | | | | | || | | # LLVM-MCA-END + + 50.0 9.00 9.50 8.00 9.50 8.00 3.00 50.0 9.00 79.0 1.0 + + + + +Loop-Carried Dependencies Analysis Report +----------------------------------------- +1794 | 1.0 | incq %rdx | [1794] + diff --git a/static_analysis/jan/analyses/lammps-icc-avx2-iaca.out b/static_analysis/jan/analyses/lammps-icc-avx2-iaca.out new file mode 100644 index 0000000..2e5b07d --- /dev/null +++ b/static_analysis/jan/analyses/lammps-icc-avx2-iaca.out @@ -0,0 +1,88 @@ +Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45 +Analyzed File - lammps-icc-avx2.o +Binary Format - 64Bit +Architecture - SKX +Analysis Type - Throughput + +Throughput Analysis Report +-------------------------- +Block Throughput: 25.58 Cycles Throughput Bottleneck: Backend +Loop Count: 22 +Port Binding In Cycles Per Iteration: +-------------------------------------------------------------------------------------------------- +| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | +-------------------------------------------------------------------------------------------------- +| Cycles | 13.7 8.0 | 13.6 | 5.5 5.5 | 5.5 5.5 | 0.0 | 13.7 | 7.0 | 0.0 | +-------------------------------------------------------------------------------------------------- + +DV - Divider pipe (on port 0) +D - Data fetch pipe (on ports 2 and 3) +F - Macro Fusion with the previous instruction occurred +* - instruction micro-ops not bound to a port +^ - Micro Fusion occurred +# - ESP Tracking sync uop was issued +@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected +X - instruction not supported, was not accounted in Analysis + +| Num Of | Ports pressure in cycles | | +| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | +----------------------------------------------------------------------------------------- +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovdqu xmm0, xmmword ptr [rbx+rdx*4] +| 1 | 1.0 | | | | | | | | vmovq rcx, xmm0 +| 1 | | | | | | 1.0 | | | vpunpckhqdq xmm2, xmm0, xmm0 +| 1 | 1.0 | | | | | | | | vmovq r15, xmm2 +| 1* | | | | | | | | | mov r8d, ecx +| 1 | | | | | | | 1.0 | | shr rcx, 0x20 +| 1 | | | | | | 1.0 | | | lea r14d, ptr [rcx+rcx*2] +| 1 | | | | | | 1.0 | | | lea r8d, ptr [r8+r8*2] +| 1 | | | | | | | 1.0 | | movsxd rcx, r8d +| 1 | | | | | | | 1.0 | | movsxd r8, r14d +| 1* | | | | | | | | | mov r14d, r15d +| 1 | | | | | | | 1.0 | | shr r15, 0x20 +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups xmm7, xmmword ptr [r11+rcx*8] +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups xmm6, xmmword ptr [r11+r8*8] +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovq xmm14, qword ptr [r11+rcx*8+0x10] +| 1 | | 0.3 | | | | 0.7 | | | lea r14d, ptr [r14+r14*2] +| 1 | | | | | | | 1.0 | | movsxd r14, r14d +| 1 | | 0.7 | | | | 0.3 | | | lea r15d, ptr [r15+r15*2] +| 1 | | | | | | | 1.0 | | movsxd r15, r15d +| 2 | | | 0.5 0.5 | 0.5 0.5 | | 1.0 | | | vmovhpd xmm15, xmm14, qword ptr [r11+r8*8+0x10] +| 2 | | 1.0 | 0.5 0.5 | 0.5 0.5 | | | | | vinsertf128 ymm1, ymm7, xmmword ptr [r11+r14*8], 0x1 +| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovq xmm0, qword ptr [r11+r14*8+0x10] +| 2 | | 0.3 | 0.5 0.5 | 0.5 0.5 | | 0.7 | | | vinsertf128 ymm6, ymm6, xmmword ptr [r11+r15*8], 0x1 +| 2 | | | 0.5 0.5 | 0.5 0.5 | | 1.0 | | | vmovhpd xmm2, xmm0, qword ptr [r11+r15*8+0x10] +| 1 | | | | | | 1.0 | | | vunpcklpd ymm14, ymm1, ymm6 +| 1 | | | | | | 1.0 | | | 
vunpckhpd ymm1, ymm1, ymm6 +| 1 | 0.7 | 0.3 | | | | | | | vsubpd ymm6, ymm10, ymm14 +| 1 | | | | | | 1.0 | | | vinsertf128 ymm7, ymm15, xmm2, 0x1 +| 1 | 0.3 | 0.7 | | | | | | | vsubpd ymm2, ymm9, ymm1 +| 1 | 0.7 | 0.3 | | | | | | | vsubpd ymm0, ymm8, ymm7 +| 1 | 0.3 | 0.7 | | | | | | | vmulpd ymm14, ymm2, ymm2 +| 1 | 0.7 | 0.3 | | | | | | | vfmadd231pd ymm14, ymm6, ymm6 +| 1 | 0.3 | 0.7 | | | | | | | vfmadd231pd ymm14, ymm0, ymm0 +| 1 | 0.7 | 0.3 | | | | | | | vcmppd ymm1, ymm14, ymm5, 0x1 +| 1 | 0.3 | 0.7 | | | | | | | vpcmpeqd ymm7, ymm7, ymm7 +| 2 | 1.0 | | | | | 1.0 | | | vptest ymm1, ymm7 +| 1 | 1.0 8.0 | | | | | | | | vdivpd ymm7, ymm4, ymm14 +| 2^ | | 1.0 | 0.5 0.5 | 0.5 0.5 | | | | | vmulpd ymm14, ymm7, ymmword ptr [rsp+0x60] +| 1 | | 1.0 | | | | | | | vmulpd ymm14, ymm7, ymm14 +| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm15, ymm7, ymm14 +| 1 | 0.3 | 0.7 | | | | | | | vfmsub213pd ymm14, ymm7, ymm3 +| 2^ | 0.7 | 0.3 | 0.5 0.5 | 0.5 0.5 | | | | | vmulpd ymm7, ymm7, ymmword ptr [rsp+0x40] +| 1 | 0.3 | 0.7 | | | | | | | vmulpd ymm15, ymm15, ymm7 +| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm7, ymm15, ymm14 +| 1 | 0.3 | 0.7 | | | | | | | vmulpd ymm6, ymm6, ymm7 +| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm2, ymm2, ymm7 +| 1 | | | | | | 1.0 | | | vandpd ymm6, ymm1, ymm6 +| 1 | 0.3 | 0.7 | | | | | | | vaddpd ymm13, ymm13, ymm6 +| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm6, ymm0, ymm7 +| 1 | | | | | | 1.0 | | | vandpd ymm0, ymm1, ymm2 +| 1 | | | | | | 1.0 | | | vandpd ymm1, ymm1, ymm6 +| 1 | 0.3 | 0.7 | | | | | | | vaddpd ymm12, ymm12, ymm0 +| 1 | 0.7 | 0.3 | | | | | | | vaddpd ymm11, ymm11, ymm1 +| 1 | | | | | | | 1.0 | | add rdx, 0x4 +| 1* | | | | | | | | | cmp rdx, rsi +| 0*F | | | | | | | | | jb 0xffffffffffffff02 +Total Num Of Uops: 62 +Analysis Notes: +Backend allocation was stalled due to unavailable allocation resources. 
diff --git a/static_analysis/jan/analyses/lammps-icc-avx2-mca-csx.out b/static_analysis/jan/analyses/lammps-icc-avx2-mca-csx.out new file mode 100644 index 0000000..1dd97c8 --- /dev/null +++ b/static_analysis/jan/analyses/lammps-icc-avx2-mca-csx.out @@ -0,0 +1,156 @@ + +[0] Code Region + +Iterations: 100 +Instructions: 5600 +Total Cycles: 2352 +Total uOps: 6300 + +Dispatch Width: 6 +uOps Per Cycle: 2.68 +IPC: 2.38 +Block RThroughput: 10.5 + + +Instruction Info: +[1]: #uOps +[2]: Latency +[3]: RThroughput +[4]: MayLoad +[5]: MayStore +[6]: HasSideEffects (U) + +[1] [2] [3] [4] [5] [6] Instructions: + 1 6 0.50 * vmovdqu (%rbx,%rdx,4), %xmm0 + 1 2 1.00 vmovq %xmm0, %rcx + 1 1 1.00 vpunpckhqdq %xmm0, %xmm0, %xmm2 + 1 2 1.00 vmovq %xmm2, %r15 + 1 1 0.25 movl %ecx, %r8d + 1 1 0.50 shrq $32, %rcx + 1 1 0.50 leal (%rcx,%rcx,2), %r14d + 1 1 0.50 leal (%r8,%r8,2), %r8d + 1 1 0.25 movslq %r8d, %rcx + 1 1 0.25 movslq %r14d, %r8 + 1 1 0.25 movl %r15d, %r14d + 1 1 0.50 shrq $32, %r15 + 1 6 0.50 * vmovups (%r11,%rcx,8), %xmm7 + 1 6 0.50 * vmovups (%r11,%r8,8), %xmm6 + 1 5 0.50 * vmovq 16(%r11,%rcx,8), %xmm14 + 1 1 0.50 leal (%r14,%r14,2), %r14d + 1 1 0.25 movslq %r14d, %r14 + 1 1 0.50 leal (%r15,%r15,2), %r15d + 1 1 0.25 movslq %r15d, %r15 + 2 6 1.00 * vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15 + 2 7 0.50 * vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1 + 1 5 0.50 * vmovq 16(%r11,%r14,8), %xmm0 + 2 7 0.50 * vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6 + 2 6 1.00 * vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2 + 1 1 1.00 vunpcklpd %ymm6, %ymm1, %ymm14 + 1 1 1.00 vunpckhpd %ymm6, %ymm1, %ymm1 + 1 4 0.50 vsubpd %ymm14, %ymm10, %ymm6 + 1 3 1.00 vinsertf128 $1, %xmm2, %ymm15, %ymm7 + 1 4 0.50 vsubpd %ymm1, %ymm9, %ymm2 + 1 4 0.50 vsubpd %ymm7, %ymm8, %ymm0 + 1 4 0.50 vmulpd %ymm2, %ymm2, %ymm14 + 1 4 0.50 vfmadd231pd %ymm6, %ymm6, %ymm14 + 1 4 0.50 vfmadd231pd %ymm0, %ymm0, %ymm14 + 1 4 0.50 vcmpltpd %ymm5, %ymm14, %ymm1 + 1 1 0.50 vpcmpeqd %ymm7, %ymm7, %ymm7 + 2 3 1.00 vptest %ymm7, %ymm1 + 1 14 5.00 vdivpd %ymm14, %ymm4, %ymm7 + 2 11 0.50 * vmulpd 96(%rsp), %ymm7, %ymm14 + 1 4 0.50 vmulpd %ymm14, %ymm7, %ymm14 + 1 4 0.50 vmulpd %ymm14, %ymm7, %ymm15 + 1 4 0.50 vfmsub213pd %ymm3, %ymm7, %ymm14 + 2 11 0.50 * vmulpd 64(%rsp), %ymm7, %ymm7 + 1 4 0.50 vmulpd %ymm7, %ymm15, %ymm15 + 1 4 0.50 vmulpd %ymm14, %ymm15, %ymm7 + 1 4 0.50 vmulpd %ymm7, %ymm6, %ymm6 + 1 4 0.50 vmulpd %ymm7, %ymm2, %ymm2 + 1 1 0.33 vandpd %ymm6, %ymm1, %ymm6 + 1 4 0.50 vaddpd %ymm6, %ymm13, %ymm13 + 1 4 0.50 vmulpd %ymm7, %ymm0, %ymm6 + 1 1 0.33 vandpd %ymm2, %ymm1, %ymm0 + 1 1 0.33 vandpd %ymm6, %ymm1, %ymm1 + 1 4 0.50 vaddpd %ymm0, %ymm12, %ymm12 + 1 4 0.50 vaddpd %ymm1, %ymm11, %ymm11 + 1 1 0.25 addq $4, %rdx + 1 1 0.25 cmpq %rsi, %rdx + 1 1 0.50 jb ..B1.22 + + +Resources: +[0] - SKXDivider +[1] - SKXFPDivider +[2] - SKXPort0 +[3] - SKXPort1 +[4] - SKXPort2 +[5] - SKXPort3 +[6] - SKXPort4 +[7] - SKXPort5 +[8] - SKXPort6 +[9] - SKXPort7 + + +Resource pressure per iteration: +[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] + - 5.00 16.00 14.12 5.50 5.50 - 13.47 8.41 - + +Resource pressure by instruction: +[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: + - - - - 0.50 0.50 - - - - vmovdqu (%rbx,%rdx,4), %xmm0 + - - 1.00 - - - - - - - vmovq %xmm0, %rcx + - - - - - - - 1.00 - - vpunpckhqdq %xmm0, %xmm0, %xmm2 + - - 1.00 - - - - - - - vmovq %xmm2, %r15 + - - - - - - - - 1.00 - movl %ecx, %r8d + - - 0.06 - - - - - 0.94 - shrq $32, %rcx + - - - 0.02 - - - 0.98 - - leal (%rcx,%rcx,2), %r14d + - - - 0.02 - - - 0.98 - - leal (%r8,%r8,2), %r8d + - - 0.47 0.02 - - - 
- 0.51 - movslq %r8d, %rcx + - - 0.46 0.02 - - - 0.01 0.51 - movslq %r14d, %r8 + - - 0.03 0.01 - - - 0.45 0.51 - movl %r15d, %r14d + - - 0.51 - - - - - 0.49 - shrq $32, %r15 + - - - - 0.49 0.51 - - - - vmovups (%r11,%rcx,8), %xmm7 + - - - - 0.49 0.51 - - - - vmovups (%r11,%r8,8), %xmm6 + - - - - 0.52 0.48 - - - - vmovq 16(%r11,%rcx,8), %xmm14 + - - - 0.02 - - - 0.98 - - leal (%r14,%r14,2), %r14d + - - 0.01 0.01 - - - 0.01 0.97 - movslq %r14d, %r14 + - - - 0.03 - - - 0.97 - - leal (%r15,%r15,2), %r15d + - - 0.04 - - - - - 0.96 - movslq %r15d, %r15 + - - - - 0.07 0.93 - 1.00 - - vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15 + - - 0.03 0.46 0.49 0.51 - 0.51 - - vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1 + - - - - 0.51 0.49 - - - - vmovq 16(%r11,%r14,8), %xmm0 + - - 0.47 0.02 0.93 0.07 - 0.51 - - vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6 + - - - - 0.50 0.50 - 1.00 - - vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2 + - - - - - - - 1.00 - - vunpcklpd %ymm6, %ymm1, %ymm14 + - - - - - - - 1.00 - - vunpckhpd %ymm6, %ymm1, %ymm1 + - - 0.01 0.99 - - - - - - vsubpd %ymm14, %ymm10, %ymm6 + - - - - - - - 1.00 - - vinsertf128 $1, %xmm2, %ymm15, %ymm7 + - - 0.96 0.04 - - - - - - vsubpd %ymm1, %ymm9, %ymm2 + - - 0.49 0.51 - - - - - - vsubpd %ymm7, %ymm8, %ymm0 + - - 0.48 0.52 - - - - - - vmulpd %ymm2, %ymm2, %ymm14 + - - 0.03 0.97 - - - - - - vfmadd231pd %ymm6, %ymm6, %ymm14 + - - 0.94 0.06 - - - - - - vfmadd231pd %ymm0, %ymm0, %ymm14 + - - 0.47 0.53 - - - - - - vcmpltpd %ymm5, %ymm14, %ymm1 + - - 0.96 0.04 - - - - - - vpcmpeqd %ymm7, %ymm7, %ymm7 + - - 1.00 - - - - 1.00 - - vptest %ymm7, %ymm1 + - 5.00 1.00 - - - - - - - vdivpd %ymm14, %ymm4, %ymm7 + - - 0.93 0.07 0.49 0.51 - - - - vmulpd 96(%rsp), %ymm7, %ymm14 + - - 0.05 0.95 - - - - - - vmulpd %ymm14, %ymm7, %ymm14 + - - 0.02 0.98 - - - - - - vmulpd %ymm14, %ymm7, %ymm15 + - - 0.98 0.02 - - - - - - vfmsub213pd %ymm3, %ymm7, %ymm14 + - - 0.07 0.93 0.51 0.49 - - - - vmulpd 64(%rsp), %ymm7, %ymm7 + - - 0.01 0.99 - - - - - - vmulpd %ymm7, %ymm15, %ymm15 + - - 0.01 0.99 - - - - - - vmulpd %ymm14, %ymm15, %ymm7 + - - 0.03 0.97 - - - - - - vmulpd %ymm7, %ymm6, %ymm6 + - - 0.97 0.03 - - - - - - vmulpd %ymm7, %ymm2, %ymm2 + - - 0.03 0.90 - - - 0.07 - - vandpd %ymm6, %ymm1, %ymm6 + - - 0.06 0.94 - - - - - - vaddpd %ymm6, %ymm13, %ymm13 + - - 0.03 0.97 - - - - - - vmulpd %ymm7, %ymm0, %ymm6 + - - 0.46 0.08 - - - 0.46 - - vandpd %ymm2, %ymm1, %ymm0 + - - 0.47 0.01 - - - 0.52 - - vandpd %ymm6, %ymm1, %ymm1 + - - 0.48 0.52 - - - - - - vaddpd %ymm0, %ymm12, %ymm12 + - - 0.52 0.48 - - - - - - vaddpd %ymm1, %ymm11, %ymm11 + - - 0.01 - - - - - 0.99 - addq $4, %rdx + - - - - - - - 0.02 0.98 - cmpq %rsi, %rdx + - - 0.45 - - - - - 0.55 - jb ..B1.22 diff --git a/static_analysis/jan/analyses/lammps-icc-avx2-mca-icx.out b/static_analysis/jan/analyses/lammps-icc-avx2-mca-icx.out new file mode 100644 index 0000000..4d7d2b2 --- /dev/null +++ b/static_analysis/jan/analyses/lammps-icc-avx2-mca-icx.out @@ -0,0 +1,158 @@ + +[0] Code Region + +Iterations: 100 +Instructions: 5600 +Total Cycles: 2306 +Total uOps: 6300 + +Dispatch Width: 6 +uOps Per Cycle: 2.73 +IPC: 2.43 +Block RThroughput: 10.5 + + +Instruction Info: +[1]: #uOps +[2]: Latency +[3]: RThroughput +[4]: MayLoad +[5]: MayStore +[6]: HasSideEffects (U) + +[1] [2] [3] [4] [5] [6] Instructions: + 1 6 0.50 * vmovdqu (%rbx,%rdx,4), %xmm0 + 1 2 1.00 vmovq %xmm0, %rcx + 1 1 0.50 vpunpckhqdq %xmm0, %xmm0, %xmm2 + 1 2 1.00 vmovq %xmm2, %r15 + 1 1 0.25 movl %ecx, %r8d + 1 1 0.50 shrq $32, %rcx + 1 1 0.50 leal (%rcx,%rcx,2), %r14d + 1 1 0.50 leal 
(%r8,%r8,2), %r8d + 1 1 0.25 movslq %r8d, %rcx + 1 1 0.25 movslq %r14d, %r8 + 1 1 0.25 movl %r15d, %r14d + 1 1 0.50 shrq $32, %r15 + 1 6 0.50 * vmovups (%r11,%rcx,8), %xmm7 + 1 6 0.50 * vmovups (%r11,%r8,8), %xmm6 + 1 5 0.50 * vmovq 16(%r11,%rcx,8), %xmm14 + 1 1 0.50 leal (%r14,%r14,2), %r14d + 1 1 0.25 movslq %r14d, %r14 + 1 1 0.50 leal (%r15,%r15,2), %r15d + 1 1 0.25 movslq %r15d, %r15 + 2 6 1.00 * vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15 + 2 7 0.50 * vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1 + 1 5 0.50 * vmovq 16(%r11,%r14,8), %xmm0 + 2 7 0.50 * vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6 + 2 6 1.00 * vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2 + 1 1 1.00 vunpcklpd %ymm6, %ymm1, %ymm14 + 1 1 1.00 vunpckhpd %ymm6, %ymm1, %ymm1 + 1 4 0.50 vsubpd %ymm14, %ymm10, %ymm6 + 1 3 1.00 vinsertf128 $1, %xmm2, %ymm15, %ymm7 + 1 4 0.50 vsubpd %ymm1, %ymm9, %ymm2 + 1 4 0.50 vsubpd %ymm7, %ymm8, %ymm0 + 1 4 0.50 vmulpd %ymm2, %ymm2, %ymm14 + 1 4 0.50 vfmadd231pd %ymm6, %ymm6, %ymm14 + 1 4 0.50 vfmadd231pd %ymm0, %ymm0, %ymm14 + 1 4 0.50 vcmpltpd %ymm5, %ymm14, %ymm1 + 1 1 0.50 vpcmpeqd %ymm7, %ymm7, %ymm7 + 2 3 1.00 vptest %ymm7, %ymm1 + 1 14 5.00 vdivpd %ymm14, %ymm4, %ymm7 + 2 11 0.50 * vmulpd 96(%rsp), %ymm7, %ymm14 + 1 4 0.50 vmulpd %ymm14, %ymm7, %ymm14 + 1 4 0.50 vmulpd %ymm14, %ymm7, %ymm15 + 1 4 0.50 vfmsub213pd %ymm3, %ymm7, %ymm14 + 2 11 0.50 * vmulpd 64(%rsp), %ymm7, %ymm7 + 1 4 0.50 vmulpd %ymm7, %ymm15, %ymm15 + 1 4 0.50 vmulpd %ymm14, %ymm15, %ymm7 + 1 4 0.50 vmulpd %ymm7, %ymm6, %ymm6 + 1 4 0.50 vmulpd %ymm7, %ymm2, %ymm2 + 1 1 0.33 vandpd %ymm6, %ymm1, %ymm6 + 1 4 0.50 vaddpd %ymm6, %ymm13, %ymm13 + 1 4 0.50 vmulpd %ymm7, %ymm0, %ymm6 + 1 1 0.33 vandpd %ymm2, %ymm1, %ymm0 + 1 1 0.33 vandpd %ymm6, %ymm1, %ymm1 + 1 4 0.50 vaddpd %ymm0, %ymm12, %ymm12 + 1 4 0.50 vaddpd %ymm1, %ymm11, %ymm11 + 1 1 0.25 addq $4, %rdx + 1 1 0.25 cmpq %rsi, %rdx + 1 1 0.50 jb ..B1.22 + + +Resources: +[0] - ICXDivider +[1] - ICXFPDivider +[2] - ICXPort0 +[3] - ICXPort1 +[4] - ICXPort2 +[5] - ICXPort3 +[6] - ICXPort4 +[7] - ICXPort5 +[8] - ICXPort6 +[9] - ICXPort7 +[10] - ICXPort8 +[11] - ICXPort9 + + +Resource pressure per iteration: +[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] + - 5.00 15.12 15.03 5.50 5.50 - 13.45 8.40 - - - + +Resource pressure by instruction: +[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions: + - - - - 0.50 0.50 - - - - - - vmovdqu (%rbx,%rdx,4), %xmm0 + - - 1.00 - - - - - - - - - vmovq %xmm0, %rcx + - - - 0.46 - - - 0.54 - - - - vpunpckhqdq %xmm0, %xmm0, %xmm2 + - - 1.00 - - - - - - - - - vmovq %xmm2, %r15 + - - - - - - - - 1.00 - - - movl %ecx, %r8d + - - 0.96 - - - - - 0.04 - - - shrq $32, %rcx + - - - 0.01 - - - 0.99 - - - - leal (%rcx,%rcx,2), %r14d + - - - 0.03 - - - 0.97 - - - - leal (%r8,%r8,2), %r8d + - - 0.48 0.01 - - - - 0.51 - - - movslq %r8d, %rcx + - - 0.02 0.02 - - - 0.01 0.95 - - - movslq %r14d, %r8 + - - 0.02 - - - - - 0.98 - - - movl %r15d, %r14d + - - 0.52 - - - - - 0.48 - - - shrq $32, %r15 + - - - - 0.49 0.51 - - - - - - vmovups (%r11,%rcx,8), %xmm7 + - - - - 0.49 0.51 - - - - - - vmovups (%r11,%r8,8), %xmm6 + - - - - 0.52 0.48 - - - - - - vmovq 16(%r11,%rcx,8), %xmm14 + - - - 0.47 - - - 0.53 - - - - leal (%r14,%r14,2), %r14d + - - 0.01 0.01 - - - 0.01 0.97 - - - movslq %r14d, %r14 + - - - 0.04 - - - 0.96 - - - - leal (%r15,%r15,2), %r15d + - - 0.48 - - - - 0.01 0.51 - - - movslq %r15d, %r15 + - - - - 0.51 0.49 - 1.00 - - - - vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15 + - - 0.02 0.01 0.95 0.05 - 0.97 - - - - vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1 + - - - 
- 0.05 0.95 - - - - - - vmovq 16(%r11,%r14,8), %xmm0 + - - 0.02 0.49 0.49 0.51 - 0.49 - - - - vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6 + - - - - 0.50 0.50 - 1.00 - - - - vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2 + - - - - - - - 1.00 - - - - vunpcklpd %ymm6, %ymm1, %ymm14 + - - - - - - - 1.00 - - - - vunpckhpd %ymm6, %ymm1, %ymm1 + - - 0.47 0.53 - - - - - - - - vsubpd %ymm14, %ymm10, %ymm6 + - - - - - - - 1.00 - - - - vinsertf128 $1, %xmm2, %ymm15, %ymm7 + - - 0.50 0.50 - - - - - - - - vsubpd %ymm1, %ymm9, %ymm2 + - - 0.94 0.06 - - - - - - - - vsubpd %ymm7, %ymm8, %ymm0 + - - 0.06 0.94 - - - - - - - - vmulpd %ymm2, %ymm2, %ymm14 + - - 0.04 0.96 - - - - - - - - vfmadd231pd %ymm6, %ymm6, %ymm14 + - - 0.95 0.05 - - - - - - - - vfmadd231pd %ymm0, %ymm0, %ymm14 + - - 0.02 0.98 - - - - - - - - vcmpltpd %ymm5, %ymm14, %ymm1 + - - 0.05 0.95 - - - - - - - - vpcmpeqd %ymm7, %ymm7, %ymm7 + - - 1.00 - - - - 1.00 - - - - vptest %ymm7, %ymm1 + - 5.00 1.00 - - - - - - - - - vdivpd %ymm14, %ymm4, %ymm7 + - - 0.51 0.49 0.49 0.51 - - - - - - vmulpd 96(%rsp), %ymm7, %ymm14 + - - 0.04 0.96 - - - - - - - - vmulpd %ymm14, %ymm7, %ymm14 + - - 0.01 0.99 - - - - - - - - vmulpd %ymm14, %ymm7, %ymm15 + - - 0.99 0.01 - - - - - - - - vfmsub213pd %ymm3, %ymm7, %ymm14 + - - 0.49 0.51 0.51 0.49 - - - - - - vmulpd 64(%rsp), %ymm7, %ymm7 + - - 0.01 0.99 - - - - - - - - vmulpd %ymm7, %ymm15, %ymm15 + - - 0.01 0.99 - - - - - - - - vmulpd %ymm14, %ymm15, %ymm7 + - - 0.48 0.52 - - - - - - - - vmulpd %ymm7, %ymm6, %ymm6 + - - 0.52 0.48 - - - - - - - - vmulpd %ymm7, %ymm2, %ymm2 + - - 0.46 0.02 - - - 0.52 - - - - vandpd %ymm6, %ymm1, %ymm6 + - - 0.49 0.51 - - - - - - - - vaddpd %ymm6, %ymm13, %ymm13 + - - 0.48 0.52 - - - - - - - - vmulpd %ymm7, %ymm0, %ymm6 + - - 0.02 0.52 - - - 0.46 - - - - vandpd %ymm2, %ymm1, %ymm0 + - - 0.02 - - - - 0.98 - - - - vandpd %ymm6, %ymm1, %ymm1 + - - 0.49 0.51 - - - - - - - - vaddpd %ymm0, %ymm12, %ymm12 + - - 0.51 0.49 - - - - - - - - vaddpd %ymm1, %ymm11, %ymm11 + - - 0.01 - - - - - 0.99 - - - addq $4, %rdx + - - 0.01 - - - - 0.01 0.98 - - - cmpq %rsi, %rdx + - - 0.01 - - - - - 0.99 - - - jb ..B1.22 diff --git a/static_analysis/jan/analyses/lammps-icc-avx2-osaca-csx.out b/static_analysis/jan/analyses/lammps-icc-avx2-osaca-csx.out new file mode 100644 index 0000000..30304a5 --- /dev/null +++ b/static_analysis/jan/analyses/lammps-icc-avx2-osaca-csx.out @@ -0,0 +1,97 @@ +Open Source Architecture Code Analyzer (OSACA) - 0.4.12 +Analyzed file: lammps-icc-avx2.s +Architecture: CSX +Timestamp: 2023-02-10 16:29:58 + + + P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction + * - Instruction micro-ops not bound to a port + X - No throughput/latency information for this instruction in data file + + +Combined Analysis Report +------------------------ + Port pressure in cycles + | 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD | +---------------------------------------------------------------------------------------------------- + 256 | | | | | | | | || | | # pointer_increment=32 724d27eafcb27eabca1528ddfdbdba3e + 257 | | | | | | | | || | | # LLVM-MCA-BEGIN + 258 | | | | | | | | || | | ..B1.22: # Preds ..B1.24 ..B1.21 + 259 | | | | | | | | || | | # Execution count [2.50e+01] + 260 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovdqu (%rbx,%rdx,4), %xmm0 #60.21 + 261 | 1.00 | | | | | | | || 1.0 | | vmovq %xmm0, %rcx #60.21 + 262 | | | | | | 1.000 | | || | | vpunpckhqdq %xmm0, %xmm0, %xmm2 #60.21 + 263 | 1.00 | | | | | | | || | | vmovq %xmm2, %r15 #60.21 + 264 | 
0.00 | 0.000 | | | | 0.000 | 1.00 | || 1.0 | | movl %ecx, %r8d #60.21 + 265 | 0.00 | | | | | | 1.00 | || | | shrq $32, %rcx #60.21 + 266 | | 0.500 | | | | 0.500 | | || | | lea (%rcx,%rcx,2), %r14d #61.36 + 267 | | 0.500 | | | | 0.500 | | || 1.0 | | lea (%r8,%r8,2), %r8d #61.36 + 268 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || 1.0 | | movslq %r8d, %rcx #61.36 + 269 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movslq %r14d, %r8 #61.36 + 270 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movl %r15d, %r14d #60.21 + 271 | 0.00 | | | | | | 1.00 | || | | shrq $32, %r15 #60.21 + 272 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovups (%r11,%rcx,8), %xmm7 #61.36 + 273 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%r11,%r8,8), %xmm6 #61.36 + 274 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovq 16(%r11,%rcx,8), %xmm14 #61.36 + 275 | | 0.500 | | | | 0.500 | | || | | lea (%r14,%r14,2), %r14d #61.36 + 276 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movslq %r14d, %r14 #61.36 + 277 | | 0.500 | | | | 0.500 | | || | | lea (%r15,%r15,2), %r15d #61.36 + 278 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movslq %r15d, %r15 #61.36 + 279 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || | | vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15 #61.36 + 280 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || 3.0 | | vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1 #61.36 + 281 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovq 16(%r11,%r14,8), %xmm0 #61.36 + 282 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || | | vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6 #61.36 + 283 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || | | vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2 #61.36 + 284 | | | | | | 1.000 | | || | | vunpcklpd %ymm6, %ymm1, %ymm14 #61.36 + 285 | | | | | | 1.000 | | || 1.0 | | vunpckhpd %ymm6, %ymm1, %ymm1 #61.36 + 286 | 0.50 | 0.500 | | | | | | || | | vsubpd %ymm14, %ymm10, %ymm6 #61.36 + 287 | | | | | | 1.000 | | || | | vinsertf128 $1, %xmm2, %ymm15, %ymm7 #61.36 + 288 | 0.50 | 0.500 | | | | | | || 4.0 | | vsubpd %ymm1, %ymm9, %ymm2 #62.36 + 289 | 0.50 | 0.500 | | | | | | || | | vsubpd %ymm7, %ymm8, %ymm0 #63.36 + 290 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm2, %ymm2, %ymm14 #64.49 + 291 | 0.50 | 0.500 | | | | | | || 4.0 | | vfmadd231pd %ymm6, %ymm6, %ymm14 #64.49 + 292 | 0.50 | 0.500 | | | | | | || 4.0 | | vfmadd231pd %ymm0, %ymm0, %ymm14 #64.63 + 293 | | | | | | 1.000 | | || | | vcmpltpd %ymm5, %ymm14, %ymm1 #74.22 + 294 | 0.50 | 0.500 | | | | | | || | | vpcmpeqd %ymm7, %ymm7, %ymm7 #74.22 + 295 | 1.00 | | | | | 1.000 | | || | | vptest %ymm7, %ymm1 #74.22 + 296 | | | | | | | | || | | #je ..B1.24 # Prob 50% #74.22 + 297 | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ymm14 + 298 | | | | | | | | || | | ..B1.23: # Preds ..B1.22 + 299 | | | | | | | | || | | # Execution count [1.25e+01] + 300 | 1.00 8.00 | | | | | | | || 15.0 | | vdivpd %ymm14, %ymm4, %ymm7 #75.39 + 301 | 0.50 | 0.500 | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmulpd 96(%rsp), %ymm7, %ymm14 #76.38[spill] + 302 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm14 #76.44 + 303 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm15 #76.50 + 304 | 0.50 | 0.500 | | | | | | || | | vfmsub213pd %ymm3, %ymm7, %ymm14 #77.55 + 305 | 0.50 | 0.500 | 0.50 0.50 | 0.50 0.50 | | | | || | | vmulpd 64(%rsp), %ymm7, %ymm7 #77.55[spill] + 306 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm7, %ymm15, %ymm15 #77.64 + 307 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm14, %ymm15, 
%ymm7 #77.70 + 308 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm7, %ymm6, %ymm6 #78.31 + 309 | 0.50 | 0.500 | | | | | | || | | vmulpd %ymm7, %ymm2, %ymm2 #79.31 + 310 | 0.25 | 0.253 | | | | 0.493 | | || 1.0 | | vandpd %ymm6, %ymm1, %ymm6 #78.31 + 311 | 0.50 | 0.500 | | | | | | || 4.0 | | vaddpd %ymm6, %ymm13, %ymm13 #78.17 + 312 | 0.25 | 0.750 | | | | | | || | | vmulpd %ymm7, %ymm0, %ymm6 #80.31 + 313 | 0.16 | 0.417 | | | | 0.423 | | || | | vandpd %ymm2, %ymm1, %ymm0 #79.31 + 314 | 0.00 | 0.250 | | | | 0.750 | | || | | vandpd %ymm6, %ymm1, %ymm1 #80.31 + 315 | 0.00 | 1.000 | | | | | | || | | vaddpd %ymm0, %ymm12, %ymm12 #79.17 + 316 | 0.50 | 0.500 | | | | | | || | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17 + 317 | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm3 ymm4 ymm5 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 + 318 | | | | | | | | || | | ..B1.24: # Preds ..B1.23 ..B1.22 + 319 | | | | | | | | || | | # Execution count [2.50e+01] + 320 | 0.00 | 0.000 | | | | -0.01 | 1.00 | || | | addq $4, %rdx #59.9 + 321 | 0.00 | -0.01 | | | | 0.000 | 1.00 | || | | cmpq %rsi, %rdx #59.9 + 322 | | | | | | | | || | | * jb ..B1.22 # Prob 82% #59.9 + 323 | | | | | | | | || | | # LLVM-MCA-END + + 13.7 8.00 13.66 5.50 5.50 5.50 5.50 13.66 10.0 76.0 4.0 + + + + +Loop-Carried Dependencies Analysis Report +----------------------------------------- + 316 | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17| [316] + 315 | 4.0 | vaddpd %ymm0, %ymm12, %ymm12 #79.17| [315] + 311 | 4.0 | vaddpd %ymm6, %ymm13, %ymm13 #78.17| [311] + 320 | 1.0 | addq $4, %rdx #59.9| [320] + diff --git a/static_analysis/jan/analyses/lammps-icc-avx2-osaca-icx.out b/static_analysis/jan/analyses/lammps-icc-avx2-osaca-icx.out new file mode 100644 index 0000000..c06ace3 --- /dev/null +++ b/static_analysis/jan/analyses/lammps-icc-avx2-osaca-icx.out @@ -0,0 +1,97 @@ +Open Source Architecture Code Analyzer (OSACA) - 0.4.12 +Analyzed file: lammps-icc-avx2.s +Architecture: ICX +Timestamp: 2023-02-10 16:29:48 + + + P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction + * - Instruction micro-ops not bound to a port + X - No throughput/latency information for this instruction in data file + + +Combined Analysis Report +------------------------ + Port pressure in cycles + | 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD | +----------------------------------------------------------------------------------------------------------------------- + 256 | | | | | | | | | | || | | # pointer_increment=32 724d27eafcb27eabca1528ddfdbdba3e + 257 | | | | | | | | | | || | | # LLVM-MCA-BEGIN + 258 | | | | | | | | | | || | | ..B1.22: # Preds ..B1.24 ..B1.21 + 259 | | | | | | | | | | || | | # Execution count [2.50e+01] + 260 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovdqu (%rbx,%rdx,4), %xmm0 #60.21 + 261 | 1.00 | | | | | | | | | || 1.0 | | vmovq %xmm0, %rcx #60.21 + 262 | | 0.50 | | | | 0.50 | | | | || | | vpunpckhqdq %xmm0, %xmm0, %xmm2 #60.21 + 263 | 1.00 | | | | | | | | | || | | vmovq %xmm2, %r15 #60.21 + 264 | 0.37 | 0.00 | | | | 0.25 | 0.38 | | | || 1.0 | | movl %ecx, %r8d #60.21 + 265 | 0.50 | | | | | | 0.50 | | | || | | shrq $32, %rcx #60.21 + 266 | 0.13 | 0.00 | | | | 0.00 | 0.87 | | | || | | lea (%rcx,%rcx,2), %r14d #61.36 + 267 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || 6.0 | | lea (%r8,%r8,2), %r8d #61.36 + 268 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || 1.0 | | movslq %r8d, %rcx #61.36 + 269 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movslq %r14d, %r8 #61.36 + 270 | 0.00 | 
0.00 | | | | 0.00 | 1.00 | | | || | | movl %r15d, %r14d #60.21 + 271 | 0.00 | | | | | | 1.00 | | | || | | shrq $32, %r15 #60.21 + 272 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovups (%r11,%rcx,8), %xmm7 #61.36 + 273 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovups (%r11,%r8,8), %xmm6 #61.36 + 274 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovq 16(%r11,%rcx,8), %xmm14 #61.36 + 275 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | lea (%r14,%r14,2), %r14d #61.36 + 276 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movslq %r14d, %r14 #61.36 + 277 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | lea (%r15,%r15,2), %r15d #61.36 + 278 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movslq %r15d, %r15 #61.36 + 279 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || | | vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15 #61.36 + 280 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || 3.0 | | vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1 #61.36 + 281 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovq 16(%r11,%r14,8), %xmm0 #61.36 + 282 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || | | vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6 #61.36 + 283 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || | | vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2 #61.36 + 284 | | | | | | 1.00 | | | | || | | vunpcklpd %ymm6, %ymm1, %ymm14 #61.36 + 285 | | | | | | 1.00 | | | | || 1.0 | | vunpckhpd %ymm6, %ymm1, %ymm1 #61.36 + 286 | 0.50 | 0.50 | | | | | | | | || | | vsubpd %ymm14, %ymm10, %ymm6 #61.36 + 287 | | | | | | 1.00 | | | | || | | vinsertf128 $1, %xmm2, %ymm15, %ymm7 #61.36 + 288 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vsubpd %ymm1, %ymm9, %ymm2 #62.36 + 289 | 0.50 | 0.50 | | | | | | | | || | | vsubpd %ymm7, %ymm8, %ymm0 #63.36 + 290 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm2, %ymm2, %ymm14 #64.49 + 291 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vfmadd231pd %ymm6, %ymm6, %ymm14 #64.49 + 292 | 0.75 | 0.25 | | | | | | | | || 4.0 | | vfmadd231pd %ymm0, %ymm0, %ymm14 #64.63 + 293 | 0.00 | | | | | 1.00 | | | | || | | vcmpltpd %ymm5, %ymm14, %ymm1 #74.22 + 294 | 0.50 | 0.50 | | | | | | | | || | | vpcmpeqd %ymm7, %ymm7, %ymm7 #74.22 + 295 | 1.00 | | | | | 1.00 | | | | || | | vptest %ymm7, %ymm1 #74.22 + 296 | | | | | | | | | | || | | #je ..B1.24 # Prob 50% #74.22 + 297 | | | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ymm14 + 298 | | | | | | | | | | || | | ..B1.23: # Preds ..B1.22 + 299 | | | | | | | | | | || | | # Execution count [1.25e+01] + 300 | 1.00 8.00 | | | | | | | | | || 13.0 | | vdivpd %ymm14, %ymm4, %ymm7 #75.39 + 301 | 0.50 | 0.50 | 0.50 0.50 | 0.50 0.50 | | | | | | || 4.0 | | vmulpd 96(%rsp), %ymm7, %ymm14 #76.38[spill] + 302 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm14 #76.44 + 303 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm15 #76.50 + 304 | 0.50 | 0.50 | | | | | | | | || | | vfmsub213pd %ymm3, %ymm7, %ymm14 #77.55 + 305 | 0.50 | 0.50 | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmulpd 64(%rsp), %ymm7, %ymm7 #77.55[spill] + 306 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm7, %ymm15, %ymm15 #77.64 + 307 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm14, %ymm15, %ymm7 #77.70 + 308 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm7, %ymm6, %ymm6 #78.31 + 309 | 0.00 | 1.00 | | | | | | | | || | | vmulpd %ymm7, %ymm2, %ymm2 #79.31 + 310 | 0.00 | 0.00 | | | | 1.00 | | | | || 1.0 | | vandpd %ymm6, %ymm1, %ymm6 #78.31 + 311 | 0.00 | 1.00 | | | | | | | | || 4.0 | | 
vaddpd %ymm6, %ymm13, %ymm13 #78.17 + 312 | 0.00 | 1.00 | | | | | | | | || | | vmulpd %ymm7, %ymm0, %ymm6 #80.31 + 313 | 0.00 | 0.00 | | | | 1.00 | | | | || | | vandpd %ymm2, %ymm1, %ymm0 #79.31 + 314 | 0.00 | 0.00 | | | | 1.00 | | | | || | | vandpd %ymm6, %ymm1, %ymm1 #80.31 + 315 | 0.00 | 1.00 | | | | | | | | || | | vaddpd %ymm0, %ymm12, %ymm12 #79.17 + 316 | 0.00 | 1.00 | | | | | | | | || | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17 + 317 | | | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm3 ymm4 ymm5 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 + 318 | | | | | | | | | | || | | ..B1.24: # Preds ..B1.23 ..B1.22 + 319 | | | | | | | | | | || | | # Execution count [2.50e+01] + 320 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | addq $4, %rdx #59.9 + 321 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | cmpq %rsi, %rdx #59.9 + 322 | | | | | | | | | | || | | * jb ..B1.22 # Prob 82% #59.9 + 323 | | | | | | | | | | || | | # LLVM-MCA-END + + 12.8 8.00 12.8 5.50 5.50 5.50 5.50 12.8 12.8 81 4.0 + + + + +Loop-Carried Dependencies Analysis Report +----------------------------------------- + 316 | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17| [316] + 315 | 4.0 | vaddpd %ymm0, %ymm12, %ymm12 #79.17| [315] + 311 | 4.0 | vaddpd %ymm6, %ymm13, %ymm13 #78.17| [311] + 320 | 1.0 | addq $4, %rdx #59.9| [320] + diff --git a/static_analysis/jan/analyses/lammps-icc-avx512-iaca.out b/static_analysis/jan/analyses/lammps-icc-avx512-iaca.out new file mode 100644 index 0000000..875f7b4 --- /dev/null +++ b/static_analysis/jan/analyses/lammps-icc-avx512-iaca.out @@ -0,0 +1,75 @@ +Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45 +Analyzed File - lammps-icc-avx512.o +Binary Format - 64Bit +Architecture - SKX +Analysis Type - Throughput + +Throughput Analysis Report +-------------------------- +Block Throughput: 30.89 Cycles Throughput Bottleneck: Backend +Loop Count: 22 +Port Binding In Cycles Per Iteration: +-------------------------------------------------------------------------------------------------- +| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | +-------------------------------------------------------------------------------------------------- +| Cycles | 19.0 0.0 | 4.0 | 13.0 13.0 | 13.0 13.0 | 0.0 | 17.0 | 4.0 | 0.0 | +-------------------------------------------------------------------------------------------------- + +DV - Divider pipe (on port 0) +D - Data fetch pipe (on ports 2 and 3) +F - Macro Fusion with the previous instruction occurred +* - instruction micro-ops not bound to a port +^ - Micro Fusion occurred +# - ESP Tracking sync uop was issued +@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected +X - instruction not supported, was not accounted in Analysis + +| Num Of | Ports pressure in cycles | | +| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | +----------------------------------------------------------------------------------------- +| 1 | | | | | | 1.0 | | | vpcmpgtd k5, ymm3, ymm4 +| 1 | | 1.0 | | | | | | | vpaddd ymm4, ymm4, ymm15 +| 2 | | 1.0 | 1.0 1.0 | | | | | | vmovdqu32 ymm17{k5}{z}, ymmword ptr [r10+r15*4] +| 1 | | 1.0 | | | | | | | vpaddd ymm18, ymm17, ymm17 +| 1 | | | | | | | 1.0 | | add r15, 0x8 +| 1 | | 1.0 | | | | | | | vpaddd ymm19, ymm17, ymm18 +| 1 | 1.0 | | | | | | | | kmovw k2, k5 +| 1 | 1.0 | | | | | | | | kmovw k3, k5 +| 1 | 1.0 | | | | | | | | kmovw k1, k5 +| 1* | | | | | | | | | vpxord zmm21, zmm21, zmm21 +| 1* | | | | | | | | | vpxord zmm20, zmm20, zmm20 +| 1* | | | | 
| | | | | vpxord zmm22, zmm22, zmm22 +| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm21, k2, zmmword ptr [rbx+ymm19*8+0x8] +| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm20, k3, zmmword ptr [rbx+ymm19*8] +| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm22, k1, zmmword ptr [rbx+ymm19*8+0x10] +| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm18, zmm1, zmm21 +| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm17, zmm2, zmm20 +| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm19, zmm0, zmm22 +| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm31, zmm18, zmm18 +| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm31, zmm17, zmm17 +| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm31, zmm19, zmm19 +| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm30, zmm31 +| 1 | | | | | | 1.0 | | | vcmppd k6{k5}, zmm31, zmm14, 0x1 +| 1 | | | | | | 1.0 | | | vfpclasspd k0, zmm30, 0x1e +| 1* | | | | | | | | | vmovaps zmm23, zmm31 +| 2^ | 1.0 | | | 1.0 1.0 | | | | | vfnmadd213pd zmm23, zmm30, qword ptr [rip]{1to8} +| 1 | 1.0 | | | | | | | | knotw k4, k0 +| 1 | | | | | | 1.0 | | | vmulpd zmm24, zmm23, zmm23 +| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm30{k4}, zmm23, zmm30 +| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm30{k4}, zmm24, zmm30 +| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm25, zmm30, zmm13 +| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm27, zmm30, zmm12 +| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm28, zmm30, zmm25 +| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm26, zmm30, zmm28 +| 1 | 0.5 | | | | | 0.5 | | | vfmsub213pd zmm30, zmm28, zmm5 +| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm29, zmm26, zmm27 +| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm23, zmm29, zmm30 +| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm10{k6}, zmm23, zmm17 +| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm9{k6}, zmm23, zmm18 +| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm8{k6}, zmm23, zmm19 +| 1* | | | | | | | | | cmp r15, r14 +| 0*F | | | | | | | | | jb 0xffffffffffffff0c +Total Num Of Uops: 57 +Analysis Notes: +Backend allocation was stalled due to unavailable allocation resources. +There were bubbles in the frontend. 
diff --git a/static_analysis/jan/analyses/lammps-icc-avx512-mca-csx.out b/static_analysis/jan/analyses/lammps-icc-avx512-mca-csx.out new file mode 100644 index 0000000..282e79b --- /dev/null +++ b/static_analysis/jan/analyses/lammps-icc-avx512-mca-csx.out @@ -0,0 +1,128 @@ + +[0] Code Region + +Iterations: 100 +Instructions: 4200 +Total Cycles: 2465 +Total uOps: 5800 + +Dispatch Width: 6 +uOps Per Cycle: 2.35 +IPC: 1.70 +Block RThroughput: 13.0 + + +Instruction Info: +[1]: #uOps +[2]: Latency +[3]: RThroughput +[4]: MayLoad +[5]: MayStore +[6]: HasSideEffects (U) + +[1] [2] [3] [4] [5] [6] Instructions: + 1 4 1.00 vpcmpgtd %ymm4, %ymm3, %k5 + 1 1 0.33 vpaddd %ymm15, %ymm4, %ymm4 + 2 8 0.50 * vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z} + 1 1 0.33 vpaddd %ymm17, %ymm17, %ymm18 + 1 1 0.25 addq $8, %r15 + 1 1 0.33 vpaddd %ymm18, %ymm17, %ymm19 + 1 1 1.00 kmovw %k5, %k2 + 1 1 1.00 kmovw %k5, %k3 + 1 1 1.00 kmovw %k5, %k1 + 1 0 0.17 vpxord %zmm21, %zmm21, %zmm21 + 1 0 0.17 vpxord %zmm20, %zmm20, %zmm20 + 1 0 0.17 vpxord %zmm22, %zmm22, %zmm22 + 5 21 4.00 * vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2} + 5 21 4.00 * vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3} + 5 21 4.00 * vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1} + 1 4 0.50 vsubpd %zmm21, %zmm1, %zmm18 + 1 4 0.50 vsubpd %zmm20, %zmm2, %zmm17 + 1 4 0.50 vsubpd %zmm22, %zmm0, %zmm19 + 1 4 0.50 vmulpd %zmm18, %zmm18, %zmm31 + 1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm31 + 1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm31 + 3 4 2.00 vrcp14pd %zmm31, %zmm30 + 1 4 1.00 vcmpltpd %zmm14, %zmm31, %k6 {%k5} + 1 4 1.00 vfpclasspd $30, %zmm30, %k0 + 1 1 0.50 vmovaps %zmm31, %zmm23 + 2 11 0.50 * vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23 + 1 1 1.00 knotw %k0, %k4 + 1 4 0.50 vmulpd %zmm23, %zmm23, %zmm24 + 1 4 0.50 vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4} + 1 4 0.50 vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4} + 1 4 0.50 vmulpd %zmm13, %zmm30, %zmm25 + 1 4 0.50 vmulpd %zmm12, %zmm30, %zmm27 + 1 4 0.50 vmulpd %zmm25, %zmm30, %zmm28 + 1 4 0.50 vmulpd %zmm28, %zmm30, %zmm26 + 1 4 0.50 vfmsub213pd %zmm5, %zmm28, %zmm30 + 1 4 0.50 vmulpd %zmm27, %zmm26, %zmm29 + 1 4 0.50 vmulpd %zmm30, %zmm29, %zmm23 + 1 4 0.50 vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6} + 1 4 0.50 vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6} + 1 4 0.50 vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6} + 1 1 0.25 cmpq %r14, %r15 + 1 1 0.50 jb ..B1.16 + + +Resources: +[0] - SKXDivider +[1] - SKXFPDivider +[2] - SKXPort0 +[3] - SKXPort1 +[4] - SKXPort2 +[5] - SKXPort3 +[6] - SKXPort4 +[7] - SKXPort5 +[8] - SKXPort6 +[9] - SKXPort7 + + +Resource pressure per iteration: +[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] + - - 19.02 6.79 12.64 13.36 - 16.03 5.16 - + +Resource pressure by instruction: +[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: + - - - - - - - 1.00 - - vpcmpgtd %ymm4, %ymm3, %k5 + - - 0.28 0.72 - - - - - - vpaddd %ymm15, %ymm4, %ymm4 + - - 0.14 0.71 0.55 0.45 - 0.15 - - vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z} + - - - 0.97 - - - 0.03 - - vpaddd %ymm17, %ymm17, %ymm18 + - - 0.14 0.41 - - - 0.13 0.32 - addq $8, %r15 + - - - 0.99 - - - 0.01 - - vpaddd %ymm18, %ymm17, %ymm19 + - - 1.00 - - - - - - - kmovw %k5, %k2 + - - 1.00 - - - - - - - kmovw %k5, %k3 + - - 1.00 - - - - - - - kmovw %k5, %k1 + - - - - - - - - - - vpxord %zmm21, %zmm21, %zmm21 + - - - - - - - - - - vpxord %zmm20, %zmm20, %zmm20 + - - - - - - - - - - vpxord %zmm22, %zmm22, %zmm22 + - - 1.00 0.99 3.52 4.48 - 0.01 1.00 - vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2} + - - 1.00 0.99 4.48 3.52 - 0.01 1.00 - vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3} 
+ - - 1.00 1.00 3.52 4.48 - - 1.00 - vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1} + - - 0.02 - - - - 0.98 - - vsubpd %zmm21, %zmm1, %zmm18 + - - 0.17 - - - - 0.83 - - vsubpd %zmm20, %zmm2, %zmm17 + - - 0.18 - - - - 0.82 - - vsubpd %zmm22, %zmm0, %zmm19 + - - 0.01 - - - - 0.99 - - vmulpd %zmm18, %zmm18, %zmm31 + - - 0.69 - - - - 0.31 - - vfmadd231pd %zmm17, %zmm17, %zmm31 + - - 0.68 - - - - 0.32 - - vfmadd231pd %zmm19, %zmm19, %zmm31 + - - 2.00 - - - - 1.00 - - vrcp14pd %zmm31, %zmm30 + - - - - - - - 1.00 - - vcmpltpd %zmm14, %zmm31, %k6 {%k5} + - - - - - - - 1.00 - - vfpclasspd $30, %zmm30, %k0 + - - 0.83 - - - - 0.17 - - vmovaps %zmm31, %zmm23 + - - 1.00 - 0.57 0.43 - - - - vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23 + - - 1.00 - - - - - - - knotw %k0, %k4 + - - 0.44 - - - - 0.56 - - vmulpd %zmm23, %zmm23, %zmm24 + - - 0.56 - - - - 0.44 - - vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4} + - - 0.55 - - - - 0.45 - - vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4} + - - 0.69 - - - - 0.31 - - vmulpd %zmm13, %zmm30, %zmm25 + - - 0.31 - - - - 0.69 - - vmulpd %zmm12, %zmm30, %zmm27 + - - 0.56 - - - - 0.44 - - vmulpd %zmm25, %zmm30, %zmm28 + - - 0.02 - - - - 0.98 - - vmulpd %zmm28, %zmm30, %zmm26 + - - 0.98 - - - - 0.02 - - vfmsub213pd %zmm5, %zmm28, %zmm30 + - - 0.30 - - - - 0.70 - - vmulpd %zmm27, %zmm26, %zmm29 + - - 0.16 - - - - 0.84 - - vmulpd %zmm30, %zmm29, %zmm23 + - - 0.17 - - - - 0.83 - - vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6} + - - 0.83 - - - - 0.17 - - vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6} + - - 0.17 - - - - 0.83 - - vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6} + - - - 0.01 - - - 0.01 0.98 - cmpq %r14, %r15 + - - 0.14 - - - - - 0.86 - jb ..B1.16 diff --git a/static_analysis/jan/analyses/lammps-icc-avx512-mca-icx.out b/static_analysis/jan/analyses/lammps-icc-avx512-mca-icx.out new file mode 100644 index 0000000..f7286c6 --- /dev/null +++ b/static_analysis/jan/analyses/lammps-icc-avx512-mca-icx.out @@ -0,0 +1,130 @@ + +[0] Code Region + +Iterations: 100 +Instructions: 4200 +Total Cycles: 2465 +Total uOps: 5800 + +Dispatch Width: 6 +uOps Per Cycle: 2.35 +IPC: 1.70 +Block RThroughput: 13.0 + + +Instruction Info: +[1]: #uOps +[2]: Latency +[3]: RThroughput +[4]: MayLoad +[5]: MayStore +[6]: HasSideEffects (U) + +[1] [2] [3] [4] [5] [6] Instructions: + 1 4 1.00 vpcmpgtd %ymm4, %ymm3, %k5 + 1 1 0.33 vpaddd %ymm15, %ymm4, %ymm4 + 2 8 0.50 * vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z} + 1 1 0.33 vpaddd %ymm17, %ymm17, %ymm18 + 1 1 0.25 addq $8, %r15 + 1 1 0.33 vpaddd %ymm18, %ymm17, %ymm19 + 1 1 1.00 kmovw %k5, %k2 + 1 1 1.00 kmovw %k5, %k3 + 1 1 1.00 kmovw %k5, %k1 + 1 0 0.17 vpxord %zmm21, %zmm21, %zmm21 + 1 0 0.17 vpxord %zmm20, %zmm20, %zmm20 + 1 0 0.17 vpxord %zmm22, %zmm22, %zmm22 + 5 21 4.00 * vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2} + 5 21 4.00 * vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3} + 5 21 4.00 * vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1} + 1 4 0.50 vsubpd %zmm21, %zmm1, %zmm18 + 1 4 0.50 vsubpd %zmm20, %zmm2, %zmm17 + 1 4 0.50 vsubpd %zmm22, %zmm0, %zmm19 + 1 4 0.50 vmulpd %zmm18, %zmm18, %zmm31 + 1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm31 + 1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm31 + 3 4 2.00 vrcp14pd %zmm31, %zmm30 + 1 4 1.00 vcmpltpd %zmm14, %zmm31, %k6 {%k5} + 1 4 1.00 vfpclasspd $30, %zmm30, %k0 + 1 1 0.50 vmovaps %zmm31, %zmm23 + 2 11 0.50 * vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23 + 1 1 1.00 knotw %k0, %k4 + 1 4 0.50 vmulpd %zmm23, %zmm23, %zmm24 + 1 4 0.50 vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4} + 1 4 0.50 vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4} 
+ 1 4 0.50 vmulpd %zmm13, %zmm30, %zmm25 + 1 4 0.50 vmulpd %zmm12, %zmm30, %zmm27 + 1 4 0.50 vmulpd %zmm25, %zmm30, %zmm28 + 1 4 0.50 vmulpd %zmm28, %zmm30, %zmm26 + 1 4 0.50 vfmsub213pd %zmm5, %zmm28, %zmm30 + 1 4 0.50 vmulpd %zmm27, %zmm26, %zmm29 + 1 4 0.50 vmulpd %zmm30, %zmm29, %zmm23 + 1 4 0.50 vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6} + 1 4 0.50 vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6} + 1 4 0.50 vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6} + 1 1 0.25 cmpq %r14, %r15 + 1 1 0.50 jb ..B1.16 + + +Resources: +[0] - ICXDivider +[1] - ICXFPDivider +[2] - ICXPort0 +[3] - ICXPort1 +[4] - ICXPort2 +[5] - ICXPort3 +[6] - ICXPort4 +[7] - ICXPort5 +[8] - ICXPort6 +[9] - ICXPort7 +[10] - ICXPort8 +[11] - ICXPort9 + + +Resource pressure per iteration: +[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] + - - 19.02 6.79 12.64 13.36 - 16.03 5.16 - - - + +Resource pressure by instruction: +[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions: + - - - - - - - 1.00 - - - - vpcmpgtd %ymm4, %ymm3, %k5 + - - 0.28 0.72 - - - - - - - - vpaddd %ymm15, %ymm4, %ymm4 + - - 0.14 0.71 0.55 0.45 - 0.15 - - - - vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z} + - - - 0.97 - - - 0.03 - - - - vpaddd %ymm17, %ymm17, %ymm18 + - - 0.14 0.41 - - - 0.13 0.32 - - - addq $8, %r15 + - - - 0.99 - - - 0.01 - - - - vpaddd %ymm18, %ymm17, %ymm19 + - - 1.00 - - - - - - - - - kmovw %k5, %k2 + - - 1.00 - - - - - - - - - kmovw %k5, %k3 + - - 1.00 - - - - - - - - - kmovw %k5, %k1 + - - - - - - - - - - - - vpxord %zmm21, %zmm21, %zmm21 + - - - - - - - - - - - - vpxord %zmm20, %zmm20, %zmm20 + - - - - - - - - - - - - vpxord %zmm22, %zmm22, %zmm22 + - - 1.00 0.99 3.52 4.48 - 0.01 1.00 - - - vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2} + - - 1.00 0.99 4.48 3.52 - 0.01 1.00 - - - vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3} + - - 1.00 1.00 3.52 4.48 - - 1.00 - - - vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1} + - - 0.02 - - - - 0.98 - - - - vsubpd %zmm21, %zmm1, %zmm18 + - - 0.17 - - - - 0.83 - - - - vsubpd %zmm20, %zmm2, %zmm17 + - - 0.18 - - - - 0.82 - - - - vsubpd %zmm22, %zmm0, %zmm19 + - - 0.01 - - - - 0.99 - - - - vmulpd %zmm18, %zmm18, %zmm31 + - - 0.69 - - - - 0.31 - - - - vfmadd231pd %zmm17, %zmm17, %zmm31 + - - 0.68 - - - - 0.32 - - - - vfmadd231pd %zmm19, %zmm19, %zmm31 + - - 2.00 - - - - 1.00 - - - - vrcp14pd %zmm31, %zmm30 + - - - - - - - 1.00 - - - - vcmpltpd %zmm14, %zmm31, %k6 {%k5} + - - - - - - - 1.00 - - - - vfpclasspd $30, %zmm30, %k0 + - - 0.83 - - - - 0.17 - - - - vmovaps %zmm31, %zmm23 + - - 1.00 - 0.57 0.43 - - - - - - vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23 + - - 1.00 - - - - - - - - - knotw %k0, %k4 + - - 0.44 - - - - 0.56 - - - - vmulpd %zmm23, %zmm23, %zmm24 + - - 0.56 - - - - 0.44 - - - - vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4} + - - 0.55 - - - - 0.45 - - - - vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4} + - - 0.69 - - - - 0.31 - - - - vmulpd %zmm13, %zmm30, %zmm25 + - - 0.31 - - - - 0.69 - - - - vmulpd %zmm12, %zmm30, %zmm27 + - - 0.56 - - - - 0.44 - - - - vmulpd %zmm25, %zmm30, %zmm28 + - - 0.02 - - - - 0.98 - - - - vmulpd %zmm28, %zmm30, %zmm26 + - - 0.98 - - - - 0.02 - - - - vfmsub213pd %zmm5, %zmm28, %zmm30 + - - 0.30 - - - - 0.70 - - - - vmulpd %zmm27, %zmm26, %zmm29 + - - 0.16 - - - - 0.84 - - - - vmulpd %zmm30, %zmm29, %zmm23 + - - 0.17 - - - - 0.83 - - - - vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6} + - - 0.83 - - - - 0.17 - - - - vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6} + - - 0.17 - - - - 0.83 - - - - vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6} + - - - 0.01 - - - 0.01 0.98 - - - cmpq %r14, %r15 + - - 
0.14 - - - - - 0.86 - - - jb ..B1.16 diff --git a/static_analysis/jan/analyses/lammps-icc-avx512-osaca-csx.out b/static_analysis/jan/analyses/lammps-icc-avx512-osaca-csx.out new file mode 100644 index 0000000..cc463eb --- /dev/null +++ b/static_analysis/jan/analyses/lammps-icc-avx512-osaca-csx.out @@ -0,0 +1,77 @@ +Open Source Architecture Code Analyzer (OSACA) - 0.4.12 +Analyzed file: lammps-icc-avx512.s +Architecture: CSX +Timestamp: 2023-02-10 16:30:08 + + + P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction + * - Instruction micro-ops not bound to a port + X - No throughput/latency information for this instruction in data file + + +Combined Analysis Report +------------------------ + Port pressure in cycles + | 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD | +-------------------------------------------------------------------------------------------------- + 200 | | | | | | | | || | | # pointer_increment=64 1303ca335e79351a96cfc07b8b9ec9d4 + 201 | | | | | | | | || | | # LLVM-MCA-BEGIN + 202 | | | | | | | | || | | ..B1.16: # Preds ..B1.16 ..B1.15 + 203 | | | | | | | | || | | # Execution count [2.50e+01] + 204 | | | | | | 1.00 | | || | | vpcmpgtd %ymm4, %ymm3, %k5 #59.9 + 205 | 0.00 | 1.00 | | | | 0.00 | | || | | vpaddd %ymm15, %ymm4, %ymm4 #59.9 + 206 | 0.00 | 1.00 | 0.50 0.50 | 0.50 0.50 | | 0.00 | | || 0.0 | | vmovdqu32 (%r10,%r15,4), %ymm17{%k5}{z} #60.21 + 207 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm17, %ymm17, %ymm18 #61.36 + 208 | 0.00 | 0.16 | | | | 0.00 | 0.84 | || | | addq $8, %r15 #59.9 + 209 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm18, %ymm17, %ymm19 #61.36 + 210 | 1.00 | | | | | | | || | | kmovw %k5, %k2 #61.36 + 211 | 1.00 | | | | | | | || | | kmovw %k5, %k3 #61.36 + 212 | 1.00 | | | | | | | || | | kmovw %k5, %k1 #61.36 + 213 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm21, %zmm21, %zmm21 #61.36 + 214 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm20, %zmm20, %zmm20 #61.36 + 215 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm22, %zmm22, %zmm22 #61.36 + 216 | 1.25 | 0.75 | 5.00 5.00 | 5.00 5.00 | | 0.25 | 0.75 | || 24.0 | | vgatherdpd 8(%rbx,%ymm19,8), %zmm21{%k2} #61.36 + 217 | 1.25 | 0.25 | 5.00 5.00 | 5.00 5.00 | | 0.25 | 1.25 | || | | vgatherdpd (%rbx,%ymm19,8), %zmm20{%k3} #61.36 + 218 | 1.25 | 0.09 | 5.00 5.00 | 5.00 5.00 | | 0.25 | 1.41 | || | | vgatherdpd 16(%rbx,%ymm19,8), %zmm22{%k1} #61.36 + 219 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm21, %zmm1, %zmm18 #62.36 + 220 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm20, %zmm2, %zmm17 #61.36 + 221 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm22, %zmm0, %zmm19 #63.36 + 222 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm18, %zmm18, %zmm31 #64.49 + 223 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm17, %zmm17, %zmm31 #64.49 + 224 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm19, %zmm19, %zmm31 #64.63 + 225 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm31, %zmm30 #75.39 + 226 | | | | | | 1.00 | | || | | vcmppd $1, %zmm14, %zmm31, %k6{%k5} #74.22 + 227 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm30, %k0 #75.39 + 228 | | | | | | | | || | | * vmovaps %zmm31, %zmm23 #75.39 + 229 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23 #75.39 + 230 | 1.00 | | | | | | | || | | knotw %k0, %k4 #75.39 + 231 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm23, %zmm23, %zmm24 #75.39 + 232 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm30, %zmm23, %zmm30{%k4} #75.39 + 233 | 0.50 | 
| | | | 0.50 | | || 4.0 | | vfmadd213pd %zmm30, %zmm24, %zmm30{%k4} #75.39 + 234 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm13, %zmm30, %zmm25 #76.38 + 235 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm12, %zmm30, %zmm27 #77.55 + 236 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm25, %zmm30, %zmm28 #76.44 + 237 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm28, %zmm30, %zmm26 #76.50 + 238 | 0.00 | | | | | 1.00 | | || | | vfmsub213pd %zmm5, %zmm28, %zmm30 #77.55 + 239 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm27, %zmm26, %zmm29 #77.64 + 240 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm30, %zmm29, %zmm23 #77.70 + 241 | 0.00 | | | | | 1.00 | | || 4.0 | | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17 + 242 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17 + 243 | 0.00 | | | | | 1.00 | | || | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17 + 244 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || | | cmpq %r14, %r15 #59.9 + 245 | | | | | | | | || | | * jb ..B1.16 # Prob 82% #59.9 + 246 | | | | | | | | || | | # LLVM-MCA-END + + 18.8 5.25 16.0 16.0 16.0 16.0 18.8 5.25 86.0 4.0 + + + + +Loop-Carried Dependencies Analysis Report +----------------------------------------- + 243 | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17| [243] + 242 | 4.0 | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17| [242] + 241 | 4.0 | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17| [241] + 208 | 1.0 | addq $8, %r15 #59.9| [208] + 205 | 1.0 | vpaddd %ymm15, %ymm4, %ymm4 #59.9| [205] + diff --git a/static_analysis/jan/analyses/lammps-icc-avx512-osaca-icx.out b/static_analysis/jan/analyses/lammps-icc-avx512-osaca-icx.out new file mode 100644 index 0000000..1a96553 --- /dev/null +++ b/static_analysis/jan/analyses/lammps-icc-avx512-osaca-icx.out @@ -0,0 +1,77 @@ +Open Source Architecture Code Analyzer (OSACA) - 0.4.12 +Analyzed file: lammps-icc-avx512.s +Architecture: ICX +Timestamp: 2023-02-10 16:29:42 + + + P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction + * - Instruction micro-ops not bound to a port + X - No throughput/latency information for this instruction in data file + + +Combined Analysis Report +------------------------ + Port pressure in cycles + | 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD | +------------------------------------------------------------------------------------------------------------------------ + 200 | | | | | | | | | | || | | # pointer_increment=64 1303ca335e79351a96cfc07b8b9ec9d4 + 201 | | | | | | | | | | || | | # LLVM-MCA-BEGIN + 202 | | | | | | | | | | || | | ..B1.16: # Preds ..B1.16 ..B1.15 + 203 | | | | | | | | | | || | | # Execution count [2.50e+01] + 204 | | | | | | 1.000 | | | | || | | vpcmpgtd %ymm4, %ymm3, %k5 #59.9 + 205 | 0.00 | 1.00 | | | | 0.000 | | | | || | | vpaddd %ymm15, %ymm4, %ymm4 #59.9 + 206 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovdqu32 (%r10,%r15,4), %ymm17{%k5}{z} #60.21 + 207 | 0.00 | 1.00 | | | | 0.000 | | | | || 1.0 | | vpaddd %ymm17, %ymm17, %ymm18 #61.36 + 208 | 0.00 | 0.00 | | | | 0.000 | 1.00 | | | || | | addq $8, %r15 #59.9 + 209 | 0.00 | 1.00 | | | | 0.000 | | | | || 1.0 | | vpaddd %ymm18, %ymm17, %ymm19 #61.36 + 210 | 1.00 | | | | | | | | | || | | kmovw %k5, %k2 #61.36 + 211 | 1.00 | | | | | | | | | || | | kmovw %k5, %k3 #61.36 + 212 | 1.00 | | | | | | | | | || | | kmovw %k5, %k1 #61.36 + 213 | 0.50 | | | | | 0.500 | | | | || | | vpxord %zmm21, %zmm21, %zmm21 #61.36 + 214 | 0.24 | | | | | 0.760 | | | | || | | vpxord %zmm20, %zmm20, %zmm20 
#61.36 + 215 | 0.50 | | | | | 0.500 | | | | || | | vpxord %zmm22, %zmm22, %zmm22 #61.36 + 216 | 0.67 | 2.33 | 7.00 7.00 | 7.00 7.00 | | 0.000 | | | | || 24.0 | | vgatherdpd 8(%rbx,%ymm19,8), %zmm21{%k2} #61.36 + 217 | 0.67 | 2.33 | 7.00 7.00 | 7.00 7.00 | | 0.000 | | | | || | | vgatherdpd (%rbx,%ymm19,8), %zmm20{%k3} #61.36 + 218 | 0.67 | 2.33 | 7.00 7.00 | 7.00 7.00 | | 0.000 | | | | || | | vgatherdpd 16(%rbx,%ymm19,8), %zmm22{%k1} #61.36 + 219 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vsubpd %zmm21, %zmm1, %zmm18 #62.36 + 220 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm20, %zmm2, %zmm17 #61.36 + 221 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm22, %zmm0, %zmm19 #63.36 + 222 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm18, %zmm18, %zmm31 #64.49 + 223 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm17, %zmm17, %zmm31 #64.49 + 224 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm19, %zmm19, %zmm31 #64.63 + 225 | 2.50 | | | | | 0.500 | | | | || 6.0 | | vrcp14pd %zmm31, %zmm30 #75.39 + 226 | | | | | | 1.000 | | | | || | | vcmppd $1, %zmm14, %zmm31, %k6{%k5} #74.22 + 227 | | | | | | 1.000 | | | | || | | vfpclasspd $30, %zmm30, %k0 #75.39 + 228 | 0.50 | | | | | 0.500 | | | | || | | vmovaps %zmm31, %zmm23 #75.39 + 229 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.500 | | | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23 #75.39 + 230 | 1.00 | | | | | | | | | || | | knotw %k0, %k4 #75.39 + 231 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm23, %zmm23, %zmm24 #75.39 + 232 | 0.50 | | | | | 0.500 | | | | || | | vfmadd213pd %zmm30, %zmm23, %zmm30{%k4} #75.39 + 233 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd213pd %zmm30, %zmm24, %zmm30{%k4} #75.39 + 234 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm13, %zmm30, %zmm25 #76.38 + 235 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm12, %zmm30, %zmm27 #77.55 + 236 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm25, %zmm30, %zmm28 #76.44 + 237 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm28, %zmm30, %zmm26 #76.50 + 238 | 0.50 | | | | | 0.500 | | | | || | | vfmsub213pd %zmm5, %zmm28, %zmm30 #77.55 + 239 | 0.25 | | | | | 0.750 | | | | || 4.0 | | vmulpd %zmm27, %zmm26, %zmm29 #77.64 + 240 | 0.00 | | | | | 1.000 | | | | || 4.0 | | vmulpd %zmm30, %zmm29, %zmm23 #77.70 + 241 | 0.00 | | | | | 1.000 | | | | || 4.0 | | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17 + 242 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17 + 243 | 0.00 | | | | | 1.000 | | | | || | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17 + 244 | 0.00 | 0.00 | | | | -0.01 | 1.00 | | | || | | cmpq %r14, %r15 #59.9 + 245 | | | | | | | | | | || | | * jb ..B1.16 # Prob 82% #59.9 + 246 | | | | | | | | | | || | | # LLVM-MCA-END + + 18.0 9.98 22.0 22.0 22.0 22.0 18.00 2.00 89 4.0 + + + + +Loop-Carried Dependencies Analysis Report +----------------------------------------- + 243 | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17| [243] + 242 | 4.0 | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17| [242] + 241 | 4.0 | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17| [241] + 208 | 1.0 | addq $8, %r15 #59.9| [208] + 205 | 1.0 | vpaddd %ymm15, %ymm4, %ymm4 #59.9| [205] + diff --git a/static_analysis/jan/analyses/lammps-icx-avx2zen-mca.out b/static_analysis/jan/analyses/lammps-icx-avx2zen-mca.out new file mode 100644 index 0000000..2d91079 --- /dev/null +++ b/static_analysis/jan/analyses/lammps-icx-avx2zen-mca.out @@ -0,0 +1,197 @@ + +[0] Code Region + +Iterations: 100 +Instructions: 
7000 +Total Cycles: 3866 +Total uOps: 7900 + +Dispatch Width: 6 +uOps Per Cycle: 2.04 +IPC: 1.81 +Block RThroughput: 21.5 + + +Instruction Info: +[1]: #uOps +[2]: Latency +[3]: RThroughput +[4]: MayLoad +[5]: MayStore +[6]: HasSideEffects (U) + +[1] [2] [3] [4] [5] [6] Instructions: + 1 8 0.50 * vpbroadcastd .LCPI0_1(%rip), %xmm1 + 1 10 0.50 * vpmulld (%r11,%rbp,4), %xmm1, %xmm11 + 2 4 1.50 vpmovsxdq %xmm11, %ymm1 + 1 1 0.50 vpsllq $3, %ymm1, %ymm1 + 1 1 0.25 vpaddq %ymm1, %ymm3, %ymm1 + 1 1 1.00 vmovq %xmm1, %r14 + 2 1 1.00 vpextrq $1, %xmm1, %r9 + 1 4 1.00 vextracti128 $1, %ymm1, %xmm1 + 1 8 0.50 * vmovsd (%r14), %xmm2 + 1 8 0.50 * vpsubd .LCPI0_5, %xmm11, %xmm6 + 2 4 1.50 vpmovsxdq %xmm6, %ymm6 + 1 1 0.50 vpsllq $3, %ymm6, %ymm6 + 1 1 1.00 vmovq %xmm1, %rdi + 1 1 0.25 vpaddq %ymm6, %ymm3, %ymm6 + 1 1 1.00 vmovq %xmm6, %rcx + 2 1 1.00 vpextrq $1, %xmm1, %rbx + 2 1 1.00 vpextrq $1, %xmm6, %rax + 1 4 1.00 vextracti128 $1, %ymm6, %xmm1 + 1 8 0.50 * vmovsd (%rdi), %xmm6 + 1 1 1.00 vmovq %xmm1, %rdi + 2 1 1.00 vpextrq $1, %xmm1, %rsi + 1 8 0.50 * vmovsd (%rdi), %xmm1 + 1 8 0.50 * vmovsd (%rcx), %xmm7 + 1 8 0.50 * vpbroadcastd .LCPI0_2(%rip), %xmm12 + 1 8 0.50 * vmovhpd (%r9), %xmm2, %xmm2 + 1 1 0.25 vpaddd %xmm12, %xmm11, %xmm4 + 2 4 1.50 vpmovsxdq %xmm4, %ymm4 + 1 8 0.50 * vmovhpd (%rax), %xmm7, %xmm7 + 1 1 0.50 vpsllq $3, %ymm4, %ymm4 + 1 1 0.25 vpaddq %ymm4, %ymm3, %ymm4 + 1 8 0.50 * vmovhpd (%rbx), %xmm6, %xmm6 + 2 1 1.00 vpextrq $1, %xmm4, %rax + 1 8 0.50 * vmovhpd (%rsi), %xmm1, %xmm1 + 1 1 1.00 vmovq %xmm4, %rcx + 1 4 1.00 vextracti128 $1, %ymm4, %xmm4 + 1 1 1.00 vmovq %xmm4, %rsi + 1 2 1.00 vinsertf128 $1, %xmm6, %ymm2, %ymm2 + 2 1 1.00 vpextrq $1, %xmm4, %rdi + 1 8 0.50 * vmovsd (%rsi), %xmm4 + 1 3 0.50 vsubpd %ymm2, %ymm14, %ymm2 + 1 8 0.50 * vmovhpd (%rdi), %xmm4, %xmm4 + 1 8 0.50 * vmovsd (%rcx), %xmm6 + 1 2 1.00 vinsertf128 $1, %xmm1, %ymm7, %ymm1 + 1 8 0.50 * vmovhpd (%rax), %xmm6, %xmm6 + 1 2 1.00 vinsertf128 $1, %xmm4, %ymm6, %ymm4 + 1 3 0.50 vsubpd %ymm1, %ymm5, %ymm1 + 1 3 0.50 vsubpd %ymm4, %ymm10, %ymm4 + 1 3 0.50 vmulpd %ymm2, %ymm2, %ymm6 + 1 4 1.00 vfmadd231pd %ymm1, %ymm1, %ymm6 + 1 4 1.00 vfmadd231pd %ymm4, %ymm4, %ymm6 + 1 8 0.50 * vbroadcastsd .LCPI0_3(%rip), %ymm7 + 1 13 5.00 vdivpd %ymm6, %ymm7, %ymm7 + 1 3 0.50 vmulpd %ymm7, %ymm7, %ymm11 + 1 3 0.50 vmulpd %ymm9, %ymm11, %ymm11 + 1 8 0.50 * vbroadcastsd .LCPI0_4(%rip), %ymm12 + 1 3 0.50 vmulpd %ymm7, %ymm11, %ymm11 + 1 3 0.50 vaddpd %ymm12, %ymm11, %ymm12 + 1 10 0.50 * vmulpd 128(%rsp), %ymm7, %ymm7 + 1 3 0.50 vmulpd %ymm7, %ymm11, %ymm7 + 1 3 0.50 vmulpd %ymm7, %ymm12, %ymm7 + 1 1 0.50 vcmpltpd %ymm8, %ymm6, %ymm6 + 1 4 1.00 vfmadd213pd %ymm0, %ymm7, %ymm2 + 1 1 0.50 vblendvpd %ymm6, %ymm2, %ymm0, %ymm0 + 1 4 1.00 vfmadd213pd %ymm15, %ymm7, %ymm1 + 1 4 1.00 vfmadd213pd %ymm13, %ymm7, %ymm4 + 1 1 0.50 vblendvpd %ymm6, %ymm1, %ymm15, %ymm15 + 1 1 0.50 vblendvpd %ymm6, %ymm4, %ymm13, %ymm13 + 1 1 0.25 addq $4, %rbp + 1 1 0.25 cmpq %rdx, %rbp + 1 1 0.50 jb .LBB0_9 + + +Resources: +[0] - Zn3AGU0 +[1] - Zn3AGU1 +[2] - Zn3AGU2 +[3] - Zn3ALU0 +[4] - Zn3ALU1 +[5] - Zn3ALU2 +[6] - Zn3ALU3 +[7] - Zn3BRU1 +[8] - Zn3FPP0 +[9] - Zn3FPP1 +[10] - Zn3FPP2 +[11] - Zn3FPP3 +[12.0] - Zn3FPP45 +[12.1] - Zn3FPP45 +[13] - Zn3FPSt +[14.0] - Zn3LSU +[14.1] - Zn3LSU +[14.2] - Zn3LSU +[15.0] - Zn3Load +[15.1] - Zn3Load +[15.2] - Zn3Load +[16.0] - Zn3Store +[16.1] - Zn3Store + + +Resource pressure per iteration: +[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] + - 
- - 0.60 0.60 0.60 0.60 0.60 16.84 23.53 16.30 7.33 21.50 21.50 - 6.33 6.33 6.34 6.33 6.33 6.34 - - + +Resource pressure by instruction: +[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions: + - - - - - - - - - 0.03 0.97 - 0.51 0.49 - 0.34 0.33 0.33 0.34 0.33 0.33 - - vpbroadcastd .LCPI0_1(%rip), %xmm1 + - - - - - - - - 0.65 - - 0.35 0.34 0.66 - 0.49 0.05 0.46 0.49 0.05 0.46 - - vpmulld (%r11,%rbp,4), %xmm1, %xmm11 + - - - - - - - - - 0.06 2.94 - - - - - - - - - - - - vpmovsxdq %xmm11, %ymm1 + - - - - - - - - - 0.65 0.35 - - - - - - - - - - - - vpsllq $3, %ymm1, %ymm1 + - - - - - - - - - - - 1.00 - - - - - - - - - - - vpaddq %ymm1, %ymm3, %ymm1 + - - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vmovq %xmm1, %r14 + - - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm1, %r9 + - - - - - - - - 1.00 - - - - - - - - - - - - - - vextracti128 $1, %ymm1, %xmm1 + - - - - - - - - - - - - 0.50 0.50 - 0.48 0.35 0.17 0.48 0.35 0.17 - - vmovsd (%r14), %xmm2 + - - - - - - - - 0.01 0.18 0.17 0.64 0.47 0.53 - 0.34 0.33 0.33 0.34 0.33 0.33 - - vpsubd .LCPI0_5, %xmm11, %xmm6 + - - - - - - - - - 1.92 1.08 - - - - - - - - - - - - vpmovsxdq %xmm6, %ymm6 + - - - - - - - - - 0.32 0.68 - - - - - - - - - - - - vpsllq $3, %ymm6, %ymm6 + - - - - - - - - - - - - 1.30 0.70 - - - - - - - - - vmovq %xmm1, %rdi + - - - - - - - - - - 0.32 0.68 - - - - - - - - - - - vpaddq %ymm6, %ymm3, %ymm6 + - - - - - - - - - - - - 1.34 0.66 - - - - - - - - - vmovq %xmm6, %rcx + - - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm1, %rbx + - - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm6, %rax + - - - - - - - - 1.00 - - - - - - - - - - - - - - vextracti128 $1, %ymm6, %xmm1 + - - - - - - - - - - - - 0.50 0.50 - 0.03 0.65 0.32 0.03 0.65 0.32 - - vmovsd (%rdi), %xmm6 + - - - - - - - - - - - - 0.36 1.64 - - - - - - - - - vmovq %xmm1, %rdi + - - - - - - - - - - - - 1.64 0.36 - - - - - - - - - vpextrq $1, %xmm1, %rsi + - - - - - - - - - - - - 0.32 0.68 - 0.51 0.33 0.16 0.51 0.33 0.16 - - vmovsd (%rdi), %xmm1 + - - - - - - - - - - - - 0.68 0.32 - 0.49 0.01 0.50 0.49 0.01 0.50 - - vmovsd (%rcx), %xmm7 + - - - - - - - - - 0.48 0.52 - 0.67 0.33 - 0.17 0.62 0.21 0.17 0.62 0.21 - - vpbroadcastd .LCPI0_2(%rip), %xmm12 + - - - - - - - - - 0.01 0.99 - 0.17 0.83 - 0.02 0.64 0.34 0.02 0.64 0.34 - - vmovhpd (%r9), %xmm2, %xmm2 + - - - - - - - - 0.01 - - 0.99 - - - - - - - - - - - vpaddd %xmm12, %xmm11, %xmm4 + - - - - - - - - - 0.57 2.43 - - - - - - - - - - - - vpmovsxdq %xmm4, %ymm4 + - - - - - - - - - 0.34 0.66 - 0.82 0.18 - 0.49 0.35 0.16 0.49 0.35 0.16 - - vmovhpd (%rax), %xmm7, %xmm7 + - - - - - - - - - 0.34 0.66 - - - - - - - - - - - - vpsllq $3, %ymm4, %ymm4 + - - - - - - - - - - 0.01 0.99 - - - - - - - - - - - vpaddq %ymm4, %ymm3, %ymm4 + - - - - - - - - - 0.51 0.49 - 0.49 0.51 - 0.35 0.16 0.49 0.35 0.16 0.49 - - vmovhpd (%rbx), %xmm6, %xmm6 + - - - - - - - - - - - - 1.04 0.96 - - - - - - - - - vpextrq $1, %xmm4, %rax + - - - - - - - - - 0.49 0.51 - 0.17 0.83 - 0.16 0.49 0.35 0.16 0.49 0.35 - - vmovhpd (%rsi), %xmm1, %xmm1 + - - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vmovq %xmm4, %rcx + - - - - - - - - 1.00 - - - - - - - - - - - - - - vextracti128 $1, %ymm4, %xmm4 + - - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vmovq %xmm4, %rsi + - - - - - - - - - 1.00 - - - - - - - - - - - - - vinsertf128 $1, %xmm6, %ymm2, %ymm2 + - - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm4, %rdi + - 
- - - - - - - - - - - 0.50 0.50 - 0.49 0.35 0.16 0.49 0.35 0.16 - - vmovsd (%rsi), %xmm4 + - - - - - - - - - - 0.31 0.69 - - - - - - - - - - - vsubpd %ymm2, %ymm14, %ymm2 + - - - - - - - - - 0.49 0.51 - 0.48 0.52 - 0.35 0.16 0.49 0.35 0.16 0.49 - - vmovhpd (%rdi), %xmm4, %xmm4 + - - - - - - - - - - - - 0.52 0.48 - 0.16 0.49 0.35 0.16 0.49 0.35 - - vmovsd (%rcx), %xmm6 + - - - - - - - - - 1.00 - - - - - - - - - - - - - vinsertf128 $1, %xmm1, %ymm7, %ymm1 + - - - - - - - - - 0.35 0.65 - 0.50 0.50 - 0.47 0.35 0.18 0.47 0.35 0.18 - - vmovhpd (%rax), %xmm6, %xmm6 + - - - - - - - - - 1.00 - - - - - - - - - - - - - vinsertf128 $1, %xmm4, %ymm6, %ymm4 + - - - - - - - - - - 0.33 0.67 - - - - - - - - - - - vsubpd %ymm1, %ymm5, %ymm1 + - - - - - - - - - - 0.51 0.49 - - - - - - - - - - - vsubpd %ymm4, %ymm10, %ymm4 + - - - - - - - - 0.52 0.48 - - - - - - - - - - - - - vmulpd %ymm2, %ymm2, %ymm6 + - - - - - - - - 1.00 1.00 - - - - - - - - - - - - - vfmadd231pd %ymm1, %ymm1, %ymm6 + - - - - - - - - 1.34 0.66 - - - - - - - - - - - - - vfmadd231pd %ymm4, %ymm4, %ymm6 + - - - - - - - - - 0.66 0.34 - 0.51 0.49 - 0.19 0.32 0.49 0.19 0.32 0.49 - - vbroadcastsd .LCPI0_3(%rip), %ymm7 + - - - - - - - - - 5.00 - - - - - - - - - - - - - vdivpd %ymm6, %ymm7, %ymm7 + - - - - - - - - 0.50 0.50 - - - - - - - - - - - - - vmulpd %ymm7, %ymm7, %ymm11 + - - - - - - - - 0.99 0.01 - - - - - - - - - - - - - vmulpd %ymm9, %ymm11, %ymm11 + - - - - - - - - - 0.30 0.70 - 0.49 0.51 - 0.34 0.33 0.33 0.34 0.33 0.33 - - vbroadcastsd .LCPI0_4(%rip), %ymm12 + - - - - - - - - 0.82 0.18 - - - - - - - - - - - - - vmulpd %ymm7, %ymm11, %ymm11 + - - - - - - - - - - 0.17 0.83 - - - - - - - - - - - vaddpd %ymm12, %ymm11, %ymm12 + - - - - - - - - 0.01 0.99 - - 0.18 0.82 - 0.46 0.02 0.52 0.46 0.02 0.52 - - vmulpd 128(%rsp), %ymm7, %ymm7 + - - - - - - - - 0.99 0.01 - - - - - - - - - - - - - vmulpd %ymm7, %ymm11, %ymm7 + - - - - - - - - 0.67 0.33 - - - - - - - - - - - - - vmulpd %ymm7, %ymm12, %ymm7 + - - - - - - - - 1.00 - - - - - - - - - - - - - - vcmpltpd %ymm8, %ymm6, %ymm6 + - - - - - - - - 1.34 0.66 - - - - - - - - - - - - - vfmadd213pd %ymm0, %ymm7, %ymm2 + - - - - - - - - 0.66 0.34 - - - - - - - - - - - - - vblendvpd %ymm6, %ymm2, %ymm0, %ymm0 + - - - - - - - - 0.66 1.34 - - - - - - - - - - - - - vfmadd213pd %ymm15, %ymm7, %ymm1 + - - - - - - - - 1.34 0.66 - - - - - - - - - - - - - vfmadd213pd %ymm13, %ymm7, %ymm4 + - - - - - - - - 0.34 0.66 - - - - - - - - - - - - - vblendvpd %ymm6, %ymm1, %ymm15, %ymm15 + - - - - - - - - 0.99 0.01 - - - - - - - - - - - - - vblendvpd %ymm6, %ymm4, %ymm13, %ymm13 + - - - - 0.40 0.20 0.40 - - - - - - - - - - - - - - - - addq $4, %rbp + - - - 0.20 0.20 0.40 0.20 - - - - - - - - - - - - - - - - cmpq %rdx, %rbp + - - - 0.40 - - - 0.60 - - - - - - - - - - - - - - - jb .LBB0_9 diff --git a/static_analysis/jan/analyses/lammps-icx-avx2zen-osaca.out b/static_analysis/jan/analyses/lammps-icx-avx2zen-osaca.out new file mode 100644 index 0000000..54716eb --- /dev/null +++ b/static_analysis/jan/analyses/lammps-icx-avx2zen-osaca.out @@ -0,0 +1,108 @@ +Open Source Architecture Code Analyzer (OSACA) - 0.4.12 +Analyzed file: lammps-icx-avx2zen.s +Architecture: ZEN3 +Timestamp: 2023-02-10 16:31:30 + + + P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction + * - Instruction micro-ops not bound to a port + X - No throughput/latency information for this instruction in data file + + +Combined Analysis Report +------------------------ + Port pressure in cycles + | 0 | 1 | 2 | 3 | DV0 | DV1 | 
4 | 5 | 6 | 7 | 8 - 8DV | 9 | 10 | 11 | 12 | 13 || CP | LCD | +-------------------------------------------------------------------------------------------------------------------------------------------- + 175 | | | | | | | | | | | | | | | | || | | # pointer_increment=16 e95035fc9e97f63299dd5188a0872bfc + 176 | | | | | | | | | | | | | | | | || | | # LLVM-MCA-BEGIN + 177 | | | | | | | | | | | | | | | | || | | .LBB0_9: # + 178 | | | | | | | | | | | | | | | | || | | # Parent Loop BB0_6 Depth=1 + 179 | | | | | | | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2 + 180 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || 1.0 | | vpbroadcastd .LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3] + 181 | 0.00 | | | 1.00 | | | | | | | | | | 0.50 | 0.50 | || 3.0 | | vpmulld (%r11,%rbp,4), %xmm1, %xmm11 + 182 | 0.00 | 0.75 | 0.38 | 0.87 | | | | | | | | | | | | || 4.0 | | vpmovsxdq %xmm11, %ymm1 + 183 | | 0.00 | 1.00 | | | | | | | | | | | | | || 1.0 | | vpsllq $3, %ymm1, %ymm1 + 184 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || 1.0 | | vpaddq %ymm1, %ymm3, %ymm1 + 185 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vmovq %xmm1, %r14 + 186 | 0.12 | 1.88 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm1, %r9 + 187 | | 1.00 | | | | | | | | | | | | | | || 3.0 | | vextracti128 $1, %ymm1, %xmm1 + 188 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%r14), %xmm2 # xmm2 = mem[0],zero + 189 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vpsubd .LCPI0_5, %xmm11, %xmm6 + 190 | 0.00 | 0.75 | 0.38 | 0.87 | | | | | | | | | | | | || | | vpmovsxdq %xmm6, %ymm6 + 191 | | 0.00 | 1.00 | | | | | | | | | | | | | || | | vpsllq $3, %ymm6, %ymm6 + 192 | 0.00 | 0.00 | 0.49 | 0.51 | | | | | | | | | | | | || | | vmovq %xmm1, %rdi + 193 | 0.00 | 0.00 | 0.51 | 0.49 | | | | | | | | | | | | || | | vpaddq %ymm6, %ymm3, %ymm6 + 194 | 0.00 | 0.00 | 0.49 | 0.51 | | | | | | | | | | | | || | | vmovq %xmm6, %rcx + 195 | 0.13 | 1.87 | | | | | | | | | | | | | | || 6.0 | | vpextrq $1, %xmm1, %rbx + 196 | 0.00 | 2.00 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm6, %rax + 197 | | 1.00 | | | | | | | | | | | | | | || | | vextracti128 $1, %ymm6, %xmm1 + 198 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rdi), %xmm6 # xmm6 = mem[0],zero + 199 | 0.00 | 0.00 | 0.00 | 1.00 | | | | | | | | | | | | || | | vmovq %xmm1, %rdi + 200 | 0.00 | 2.00 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm1, %rsi + 201 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero + 202 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rcx), %xmm7 # xmm7 = mem[0],zero + 203 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vpbroadcastd .LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2] + 204 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%r9), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0] + 205 | 0.00 | 0.00 | 0.63 | 0.37 | | | | | | | | | | | | || | | vpaddd %xmm12, %xmm11, %xmm4 + 206 | 0.00 | 0.75 | 0.00 | 1.25 | | | | | | | | | | | | || | | vpmovsxdq %xmm4, %ymm4 + 207 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rax), %xmm7, %xmm7 # xmm7 = xmm7[0],mem[0] + 208 | | 0.00 | 1.00 | | | | | | | | | | | | | || | | vpsllq $3, %ymm4, %ymm4 + 209 | 0.00 | 0.00 | 0.00 | 1.00 | | | | | | | | | | | | || | | vpaddq %ymm4, %ymm3, %ymm4 + 210 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || 5.0 | | vmovhpd (%rbx), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0] + 211 | 0.75 | 1.25 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm4, %rax + 212 | 
| 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rsi), %xmm1, %xmm1 # xmm1 = xmm1[0],mem[0] + 213 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vmovq %xmm4, %rcx + 214 | | 1.00 | | | | | | | | | | | | | | || | | vextracti128 $1, %ymm4, %xmm4 + 215 | 0.00 | 0.00 | 0.00 | 1.00 | | | | | | | | | | | | || | | vmovq %xmm4, %rsi + 216 | | 1.00 | | | | | | | | | | | | | | || 1.0 | | vinsertf128 $1, %xmm6, %ymm2, %ymm2 + 217 | 1.00 | 1.00 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm4, %rdi + 218 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rsi), %xmm4 # xmm4 = mem[0],zero + 219 | | | 0.00 | 1.00 | | | | | | | | | | | | || 3.0 | | vsubpd %ymm2, %ymm14, %ymm2 + 220 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rdi), %xmm4, %xmm4 # xmm4 = xmm4[0],mem[0] + 221 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rcx), %xmm6 # xmm6 = mem[0],zero + 222 | | 1.00 | | | | | | | | | | | | | | || | | vinsertf128 $1, %xmm1, %ymm7, %ymm1 + 223 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rax), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0] + 224 | | 1.00 | | | | | | | | | | | | | | || | | vinsertf128 $1, %xmm4, %ymm6, %ymm4 + 225 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vsubpd %ymm1, %ymm5, %ymm1 + 226 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vsubpd %ymm4, %ymm10, %ymm4 + 227 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm2, %ymm2, %ymm6 + 228 | 1.00 | 0.00 | | | | | | | | | | | | | | || 4.0 | | vfmadd231pd %ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6 + 229 | 1.00 | 0.00 | | | | | | | | | | | | | | || 4.0 | | vfmadd231pd %ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6 + 230 | 1.00 | | | | | | | | | | | | | 0.50 | 0.50 | || | | vbroadcastsd .LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] + 231 | | | | | 4.50 | 4.50 | | | | | | | | | | || 13.0 | | vdivpd %ymm6, %ymm7, %ymm7 + 232 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm7, %ymm11 + 233 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm9, %ymm11, %ymm11 + 234 | 1.00 | | | | | | | | | | | | | 0.50 | 0.50 | || | | vbroadcastsd .LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] + 235 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm11, %ymm11 + 236 | | | 0.00 | 1.00 | | | | | | | | | | | | || 3.0 | | vaddpd %ymm12, %ymm11, %ymm12 + 237 | 1.00 | 0.00 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload + 238 | 1.00 | 0.00 | | | | | | | | | | | | | | || | | vmulpd %ymm7, %ymm11, %ymm7 + 239 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm12, %ymm7 + 240 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vcmpltpd %ymm8, %ymm6, %ymm6 + 241 | 1.00 | 0.00 | | | | | | | | | | | | | | || 4.0 | | vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0 + 242 | 1.00 | 0.00 | | | | | | | | | | | | | | || 1.0 | | vblendvpd %ymm6, %ymm2, %ymm0, %ymm0 + 243 | 1.00 | 0.00 | | | | | | | | | | | | | | || | | vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15 + 244 | 1.00 | 0.00 | | | | | | | | | | | | | | || | 4.0 | vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13 + 245 | 1.00 | 0.00 | | | | | | | | | | | | | | || | | vblendvpd %ymm6, %ymm1, %ymm15, %ymm15 + 246 | 0.75 | 0.25 | | | | | | | | | | | | | | || | 1.0 | vblendvpd %ymm6, %ymm4, %ymm13, %ymm13 + 247 | | | | | | | | | 0.25 | 0.25 | 0.25 | 0.25 | | | | || | | addq $4, %rbp + 248 | | | | | | | | | 0.25 | 
0.25 | 0.25 | 0.25 | | | | || | | cmpq %rdx, %rbp + 249 | | | | | | | | | 0.00 | | | | 1.00 | | | || | | jb .LBB0_9 + 250 | | | | | | | | | | | | | | | | || | | # LLVM-MCA-END + + 18.8 18.5 15.9 15.9 4.50 4.50 0.50 0.50 0.50 0.50 9.00 9.00 72 5.0 + + + + +Loop-Carried Dependencies Analysis Report +----------------------------------------- + 244 | 5.0 | vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13| [244, 246] + 243 | 5.0 | vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15| [243, 245] + 241 | 5.0 | vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0| [241, 242] + 247 | 1.0 | addq $4, %rbp | [247] + 246 | 1.0 | vblendvpd %ymm6, %ymm4, %ymm13, %ymm13| [246] + 245 | 1.0 | vblendvpd %ymm6, %ymm1, %ymm15, %ymm15| [245] + 242 | 1.0 | vblendvpd %ymm6, %ymm2, %ymm0, %ymm0| [242] + diff --git a/static_analysis/jan/gromacs-icc-avx512-dp.o b/static_analysis/jan/gromacs-icc-avx512-dp.o new file mode 100644 index 0000000..cc857d4 Binary files /dev/null and b/static_analysis/jan/gromacs-icc-avx512-dp.o differ diff --git a/static_analysis/jan/gromacs-icc-avx512-dp.s b/static_analysis/jan/gromacs-icc-avx512-dp.s new file mode 100644 index 0000000..40b09dd --- /dev/null +++ b/static_analysis/jan/gromacs-icc-avx512-dp.s @@ -0,0 +1,4334 @@ +# mark_description "Intel(R) C Intel(R) 64 Compiler Classic for applications running on Intel(R) 64, Version 2021.6.0 Build 2022"; +# mark_description "0226_000000"; +# mark_description "-I/apps/likwid/5.2.2/include -I././gromacs/includes -I././common/includes -S -std=c11 -pedantic-errors -D_GN"; +# mark_description "U_SOURCE -DLIKWID_PERFMON -DAOS -DPRECISION=2 -DCOMPUTE_STATS -DVECTOR_WIDTH=8 -D__SIMD_KERNEL__ -D__ISA_AVX"; +# mark_description "512__ -DENABLE_OMP_SIMD -DALIGNMENT=64 -restrict -Ofast -xCORE-AVX512 -qopt-zmm-usage=high -o build-gromacs-"; +# mark_description "ICC-AVX512-DP/force_lj.s"; + .file "force_lj.c" + .text +..TXTST0: +.L_2__routine_start_computeForceLJ_ref_0: +# -- Begin computeForceLJ_ref + .text +# mark_begin; + .align 16,0x90 + .globl computeForceLJ_ref +# --- computeForceLJ_ref(Parameter *, Atom *, Neighbor *, Stats *) +computeForceLJ_ref: +# parameter 1: %rdi +# parameter 2: %rsi +# parameter 3: %rdx +# parameter 4: %rcx +..B1.1: # Preds ..B1.0 + # Execution count [1.00e+00] + .cfi_startproc +..___tag_value_computeForceLJ_ref.1: +..L2: + #19.91 + pushq %rbp #19.91 + .cfi_def_cfa_offset 16 + movq %rsp, %rbp #19.91 + .cfi_def_cfa 6, 16 + .cfi_offset 6, -16 + andq $-64, %rsp #19.91 + pushq %r12 #19.91 + pushq %r13 #19.91 + pushq %r14 #19.91 + pushq %r15 #19.91 + pushq %rbx #19.91 + subq $152, %rsp #19.91 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + movq %rdi, %r15 #19.91 + movl $.L_2__STRING.0, %edi #20.5 + xorl %eax, %eax #20.5 + movq %rcx, %r13 #19.91 + movq %rdx, %rbx #19.91 + movq %rsi, %r14 #19.91 +..___tag_value_computeForceLJ_ref.11: +# debug_printf(const char *, ...) 
+ call debug_printf #20.5 +..___tag_value_computeForceLJ_ref.12: + # LOE rbx r12 r13 r14 r15 +..B1.2: # Preds ..B1.1 + # Execution count [1.00e+00] + vmovsd 144(%r15), %xmm16 #23.27 + xorl %ecx, %ecx #30.5 + vmulsd %xmm16, %xmm16, %xmm0 #23.45 + xorl %esi, %esi #32.27 + vmovsd 56(%r15), %xmm1 #24.23 + vmovsd 40(%r15), %xmm2 #25.24 + movl 20(%r14), %edx #30.26 + vmovsd %xmm0, 8(%rsp) #23.45[spill] + vmovsd %xmm1, 16(%rsp) #24.23[spill] + vmovsd %xmm2, 24(%rsp) #25.24[spill] + testl %edx, %edx #30.26 + jle ..B1.24 # Prob 9% #30.26 + # LOE rbx rsi r12 r13 r14 edx ecx +..B1.3: # Preds ..B1.2 + # Execution count [9.00e-01] + movq 176(%r14), %rdi #32.27 + movq 192(%r14), %rax #33.32 + vxorpd %ymm2, %ymm2, %ymm2 #34.39 + vmovdqu .L_2il0floatpacket.0(%rip), %xmm1 #33.9 + vmovdqu .L_2il0floatpacket.1(%rip), %xmm0 #33.9 + # LOE rax rbx rsi rdi r13 r14 edx ecx xmm0 xmm1 ymm2 +..B1.4: # Preds ..B1.22 ..B1.3 + # Execution count [5.00e+00] + movl %ecx, %r8d #31.27 + movl %ecx, %r9d #31.27 + sarl $1, %r8d #31.27 + andl $1, %r9d #31.27 + shll $2, %r9d #31.27 + lea (%r8,%r8,2), %r10d #31.27 + lea (%r9,%r10,8), %r11d #31.27 + movslq %r11d, %r11 #32.27 + lea (%rdi,%r11,8), %r12 #32.27 + movl (%rsi,%rax), %r11d #33.32 + testl %r11d, %r11d #33.32 + jle ..B1.22 # Prob 50% #33.32 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r11d xmm0 xmm1 ymm2 +..B1.5: # Preds ..B1.4 + # Execution count [4.50e+00] + cmpl $16, %r11d #33.9 + jl ..B1.153 # Prob 10% #33.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r11d xmm0 xmm1 ymm2 +..B1.6: # Preds ..B1.5 + # Execution count [4.50e+00] + lea 128(%r12), %r8 #36.13 + andq $63, %r8 #33.9 + testl $7, %r8d #33.9 + je ..B1.8 # Prob 50% #33.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r11d xmm0 xmm1 ymm2 +..B1.7: # Preds ..B1.6 + # Execution count [2.25e+00] + xorl %r8d, %r8d #33.9 + jmp ..B1.10 # Prob 100% #33.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r11d xmm0 xmm1 ymm2 +..B1.8: # Preds ..B1.6 + # Execution count [2.25e+00] + testl %r8d, %r8d #33.9 + je ..B1.10 # Prob 50% #33.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r11d xmm0 xmm1 ymm2 +..B1.9: # Preds ..B1.8 + # Execution count [2.50e+01] + negl %r8d #33.9 + addl $64, %r8d #33.9 + shrl $3, %r8d #33.9 + cmpl %r8d, %r11d #33.9 + cmovl %r11d, %r8d #33.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r11d xmm0 xmm1 ymm2 +..B1.10: # Preds ..B1.7 ..B1.9 ..B1.8 + # Execution count [5.00e+00] + movl %r11d, %r10d #33.9 + subl %r8d, %r10d #33.9 + andl $15, %r10d #33.9 + negl %r10d #33.9 + addl %r11d, %r10d #33.9 + cmpl $1, %r8d #33.9 + jb ..B1.14 # Prob 50% #33.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r10d r11d xmm0 xmm1 ymm2 +..B1.11: # Preds ..B1.10 + # Execution count [4.50e+00] + vpbroadcastd %r8d, %xmm3 #33.9 + xorl %r15d, %r15d #33.9 + vmovdqa %xmm0, %xmm4 #33.9 + movslq %r8d, %r9 #33.9 + # LOE rax rbx rsi rdi r9 r12 r13 r14 r15 edx ecx r8d r10d r11d xmm0 xmm1 xmm3 xmm4 ymm2 +..B1.12: # Preds ..B1.12 ..B1.11 + # Execution count [2.50e+01] + vpcmpgtd %xmm4, %xmm3, %k1 #33.9 + vpaddd %xmm1, %xmm4, %xmm4 #33.9 + vmovupd %ymm2, (%r12,%r15,8){%k1} #34.13 + vmovupd %ymm2, 64(%r12,%r15,8){%k1} #35.13 + vmovupd %ymm2, 128(%r12,%r15,8){%k1} #36.13 + addq $4, %r15 #33.9 + cmpq %r9, %r15 #33.9 + jb ..B1.12 # Prob 82% #33.9 + # LOE rax rbx rsi rdi r9 r12 r13 r14 r15 edx ecx r8d r10d r11d xmm0 xmm1 xmm3 xmm4 ymm2 +..B1.13: # Preds ..B1.12 + # Execution count [4.50e+00] + cmpl %r8d, %r11d #33.9 + je ..B1.22 # Prob 10% #33.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r10d r11d xmm0 xmm1 ymm2 +..B1.14: # Preds ..B1.13 
..B1.10 + # Execution count [2.50e+01] + lea 16(%r8), %r9d #33.9 + cmpl %r9d, %r10d #33.9 + jl ..B1.18 # Prob 50% #33.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r10d r11d xmm0 xmm1 ymm2 +..B1.15: # Preds ..B1.14 + # Execution count [4.50e+00] + movslq %r8d, %r8 #33.9 + movslq %r10d, %r9 #33.9 + .align 16,0x90 + # LOE rax rbx rsi rdi r8 r9 r12 r13 r14 edx ecx r10d r11d xmm0 xmm1 ymm2 +..B1.16: # Preds ..B1.16 ..B1.15 + # Execution count [2.50e+01] + vmovupd %ymm2, (%r12,%r8,8) #34.13 + vmovupd %ymm2, 32(%r12,%r8,8) #34.13 + vmovupd %ymm2, 64(%r12,%r8,8) #34.13 + vmovupd %ymm2, 128(%r12,%r8,8) #35.13 + vmovupd %ymm2, 192(%r12,%r8,8) #36.13 + vmovupd %ymm2, 96(%r12,%r8,8) #34.13 + vmovupd %ymm2, 160(%r12,%r8,8) #35.13 + vmovupd %ymm2, 224(%r12,%r8,8) #36.13 + addq $16, %r8 #33.9 + cmpq %r9, %r8 #33.9 + jb ..B1.16 # Prob 82% #33.9 + # LOE rax rbx rsi rdi r8 r9 r12 r13 r14 edx ecx r10d r11d xmm0 xmm1 ymm2 +..B1.18: # Preds ..B1.16 ..B1.14 ..B1.153 + # Execution count [5.00e+00] + lea 1(%r10), %r8d #33.9 + cmpl %r11d, %r8d #33.9 + ja ..B1.22 # Prob 50% #33.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r10d r11d xmm0 xmm1 ymm2 +..B1.19: # Preds ..B1.18 + # Execution count [4.50e+00] + movslq %r10d, %r9 #34.13 + negl %r10d #33.9 + addl %r11d, %r10d #33.9 + xorl %r8d, %r8d #33.9 + movslq %r11d, %r11 #33.9 + vmovdqa %xmm0, %xmm4 #33.9 + vpbroadcastd %r10d, %xmm3 #33.9 + subq %r9, %r11 #33.9 + lea (%r12,%r9,8), %r12 #34.13 + # LOE rax rbx rsi rdi r8 r11 r12 r13 r14 edx ecx xmm0 xmm1 xmm3 xmm4 ymm2 +..B1.20: # Preds ..B1.20 ..B1.19 + # Execution count [2.50e+01] + vpcmpgtd %xmm4, %xmm3, %k1 #33.9 + vpaddd %xmm1, %xmm4, %xmm4 #33.9 + vmovupd %ymm2, (%r12,%r8,8){%k1} #34.13 + vmovupd %ymm2, 64(%r12,%r8,8){%k1} #35.13 + vmovupd %ymm2, 128(%r12,%r8,8){%k1} #36.13 + addq $4, %r8 #33.9 + cmpq %r11, %r8 #33.9 + jb ..B1.20 # Prob 82% #33.9 + # LOE rax rbx rsi rdi r8 r11 r12 r13 r14 edx ecx xmm0 xmm1 xmm3 xmm4 ymm2 +..B1.22: # Preds ..B1.20 ..B1.4 ..B1.13 ..B1.18 + # Execution count [5.00e+00] + incl %ecx #30.5 + addq $56, %rsi #30.5 + cmpl %edx, %ecx #30.5 + jb ..B1.4 # Prob 82% #30.5 + # LOE rax rbx rsi rdi r13 r14 edx ecx xmm0 xmm1 ymm2 +..B1.24: # Preds ..B1.22 ..B1.2 + # Execution count [1.00e+00] + xorl %eax, %eax #40.16 + vzeroupper #40.16 +..___tag_value_computeForceLJ_ref.16: +# getTimeStamp() + call getTimeStamp #40.16 +..___tag_value_computeForceLJ_ref.17: + # LOE rbx r12 r13 r14 xmm0 +..B1.156: # Preds ..B1.24 + # Execution count [1.00e+00] + vmovsd %xmm0, (%rsp) #40.16[spill] + # LOE rbx r12 r13 r14 +..B1.25: # Preds ..B1.156 + # Execution count [1.00e+00] + movl $.L_2__STRING.1, %edi #44.5 +..___tag_value_computeForceLJ_ref.19: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #44.5 +..___tag_value_computeForceLJ_ref.20: + # LOE rbx r12 r13 r14 +..B1.26: # Preds ..B1.25 + # Execution count [9.00e-01] + movl 20(%r14), %eax #47.26 + movl %eax, 56(%rsp) #47.26[spill] + testl %eax, %eax #47.26 + jle ..B1.149 # Prob 0% #47.26 + # LOE rbx r12 r13 r14 +..B1.27: # Preds ..B1.26 + # Execution count [9.00e-01] + movq 160(%r14), %r9 #51.27 + xorl %edx, %edx #47.5 + movq 176(%r14), %r8 #52.27 + movq 8(%rbx), %rsi #53.19 + movslq 16(%rbx), %rdi #53.44 + movq 24(%rbx), %r11 #54.25 + movl 32(%rbx), %r10d #77.28 + movq (%r13), %rcx #122.9 + movq 8(%r13), %rbx #123.9 + movq 16(%r13), %rax #124.9 + movl 56(%rsp), %r14d #47.5[spill] + # LOE rax rcx rbx rsi rdi r8 r9 r11 r12 r13 edx r10d r14d +..B1.28: # Preds ..B1.28 ..B1.27 + # Execution count [5.00e+00] + incl %edx #47.5 + incq %rcx 
#122.9 + cmpl %r14d, %edx #47.5 + jb ..B1.28 # Prob 82% #47.5 + # LOE rax rcx rbx rsi rdi r8 r9 r11 r12 r13 edx r10d r14d +..B1.29: # Preds ..B1.28 + # Execution count [9.00e-01] + vmovsd 24(%rsp), %xmm0 #91.54[spill] + xorl %edx, %edx #48.22 + vmovsd 16(%rsp), %xmm7 #48.22[spill] + vmovsd 8(%rsp), %xmm10 #48.22[spill] + movq %rcx, (%r13) #122.9 + xorl %ecx, %ecx #47.5 + vmovsd .L_2il0floatpacket.2(%rip), %xmm9 #91.67 + vmovsd .L_2il0floatpacket.4(%rip), %xmm8 #89.44 + vmulsd .L_2il0floatpacket.3(%rip), %xmm0, %xmm3 #91.54 + movl %r10d, 32(%rsp) #48.22[spill] + movq %r11, 64(%rsp) #48.22[spill] + movq %r13, 72(%rsp) #48.22[spill] + # LOE rax rdx rbx rsi rdi r8 r9 ecx xmm3 xmm7 xmm8 xmm9 xmm10 +..B1.30: # Preds ..B1.147 ..B1.29 + # Execution count [5.00e+00] + movl %ecx, %r13d #48.22 + movl %ecx, %r15d #50.27 + sarl $1, %r13d #48.22 + andl $1, %r15d #50.27 + shll $2, %r15d #50.27 + movq 64(%rsp), %r10 #54.25[spill] + lea (%r13,%r13,2), %r11d #50.27 + movslq (%r10,%rdx,4), %r12 #54.25 + lea (%r15,%r11,8), %r14d #50.27 + movslq %r14d, %r14 #50.27 + xorl %r10d, %r10d #56.9 + lea (%r9,%r14,8), %r11 #51.27 + lea (%r8,%r14,8), %r14 #52.27 + testq %r12, %r12 #56.28 + jle ..B1.147 # Prob 10% #56.28 + # LOE rax rdx rbx rsi rdi r8 r9 r10 r11 r12 r14 ecx r13d r15d xmm3 xmm7 xmm8 xmm9 xmm10 +..B1.31: # Preds ..B1.30 + # Execution count [4.50e+00] + movq %rax, 24(%rsp) #[spill] + movq %rdx, 48(%rsp) #[spill] + movl %ecx, 40(%rsp) #[spill] + movq %rbx, 16(%rsp) #[spill] + movq %rdi, 8(%rsp) #[spill] + movq %r8, 80(%rsp) #[spill] + movq %r9, 88(%rsp) #[spill] + movl 32(%rsp), %eax #[spill] + # LOE rsi r10 r11 r12 r14 eax r13d r15d xmm3 xmm7 xmm8 xmm9 xmm10 +..B1.32: # Preds ..B1.145 ..B1.31 + # Execution count [2.50e+01] + movl (%rsi,%r10,4), %r9d #57.22 + xorb %dl, %dl #59.21 + movslq %r9d, %r9 #58.31 + xorb %cl, %cl #63.13 + movq %r10, 112(%rsp) #63.13[spill] + movl %r15d, %r8d #63.13 + movq %r12, 104(%rsp) #63.13[spill] + xorl %ebx, %ebx #63.13 + movq %rsi, 96(%rsp) #63.13[spill] + movq 80(%rsp), %rsi #63.13[spill] + lea (%r9,%r9,2), %rdi #60.28 + movq 88(%rsp), %r10 #63.13[spill] + movq 72(%rsp), %r12 #63.13[spill] + shlq $6, %rdi #60.28 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm3 xmm7 xmm8 xmm9 xmm10 +..B1.33: # Preds ..B1.144 ..B1.32 + # Execution count [1.00e+02] + vmovsd (%r11,%rbx,8), %xmm6 #64.33 + vxorpd %xmm2, %xmm2, %xmm2 #67.30 + vmovapd %xmm2, %xmm1 #68.30 + vmovsd 64(%r11,%rbx,8), %xmm5 #65.33 + vmovapd %xmm1, %xmm0 #69.30 + vmovsd 128(%r11,%rbx,8), %xmm4 #66.33 + testl %eax, %eax #77.28 + je ..B1.38 # Prob 50% #77.28 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.34: # Preds ..B1.33 + # Execution count [5.00e+01] + cmpl %r9d, %r13d #77.62 + jne ..B1.40 # Prob 50% #77.62 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.35: # Preds ..B1.34 + # Execution count [2.50e+01] + testl %r8d, %r8d #77.99 + jl ..B1.40 # Prob 50% #77.99 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.36: # Preds ..B1.35 + # Execution count [6.25e+00] + jle ..B1.54 # Prob 50% #77.99 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.37: # Preds ..B1.36 + # Execution count [3.12e+00] + cmpl $2, %r8d #77.99 + jl ..B1.70 # Prob 50% #77.99 + jmp ..B1.49 # Prob 100% #77.99 + # LOE rbx rsi rdi r10 
r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.38: # Preds ..B1.33 + # Execution count [5.00e+01] + cmpl %r9d, %r13d #78.62 + jne ..B1.40 # Prob 50% #78.62 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.39: # Preds ..B1.38 + # Execution count [2.50e+01] + testl %r8d, %r8d #78.100 + je ..B1.54 # Prob 50% #78.100 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.40: # Preds ..B1.38 ..B1.39 ..B1.34 ..B1.35 + # Execution count [5.00e+01] + vsubsd 64(%rdi,%r10), %xmm5, %xmm16 #85.48 + vsubsd (%rdi,%r10), %xmm6, %xmm15 #84.48 + vsubsd 128(%rdi,%r10), %xmm4, %xmm17 #86.48 + vmulsd %xmm16, %xmm16, %xmm11 #87.61 + vfmadd231sd %xmm15, %xmm15, %xmm11 #87.75 + vfmadd231sd %xmm17, %xmm17, %xmm11 #87.75 + vcomisd %xmm11, %xmm10 #88.34 + jbe ..B1.44 # Prob 50% #88.34 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm15 xmm16 xmm17 +..B1.41: # Preds ..B1.40 + # Execution count [2.50e+01] + vdivsd %xmm11, %xmm8, %xmm2 #89.51 + vmulsd %xmm2, %xmm7, %xmm0 #90.50 + vmulsd %xmm3, %xmm2, %xmm11 #91.67 + vmulsd %xmm2, %xmm0, %xmm1 #90.56 + vmulsd %xmm2, %xmm1, %xmm12 #90.62 + vmulsd %xmm12, %xmm11, %xmm13 #91.76 + vsubsd %xmm9, %xmm12, %xmm14 #91.67 + vmulsd %xmm14, %xmm13, %xmm13 #91.82 + vmulsd %xmm13, %xmm15, %xmm2 #94.67 + testl %eax, %eax #93.32 + je ..B1.43 # Prob 50% #93.32 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d cl xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm13 xmm16 xmm17 +..B1.42: # Preds ..B1.41 + # Execution count [1.25e+01] + vmovsd (%rdi,%rsi), %xmm1 #94.33 + movb $1, %dl #102.29 + vmovsd 64(%rdi,%rsi), %xmm11 #95.33 + vsubsd %xmm2, %xmm1, %xmm0 #94.33 + vmulsd %xmm13, %xmm16, %xmm1 #95.67 + vfnmadd213sd %xmm11, %xmm13, %xmm16 #95.33 + vmovsd 128(%rdi,%rsi), %xmm12 #96.33 + vmovsd %xmm0, (%rdi,%rsi) #94.33 + vmulsd %xmm13, %xmm17, %xmm0 #96.67 + vfnmadd213sd %xmm12, %xmm17, %xmm13 #96.33 + vmovsd %xmm16, 64(%rdi,%rsi) #95.33 + vmovsd %xmm13, 128(%rdi,%rsi) #96.33 + incq 24(%r12) #103.29 + jmp ..B1.45 # Prob 100% #103.29 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.43: # Preds ..B1.41 + # Execution count [1.25e+01] + vmulsd %xmm13, %xmm16, %xmm1 #100.43 + movb $1, %dl #102.29 + vmulsd %xmm13, %xmm17, %xmm0 #101.43 + incq 24(%r12) #103.29 + jmp ..B1.52 # Prob 100% #103.29 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.44: # Preds ..B1.40 + # Execution count [2.50e+01] + incq 32(%r12) #105.29 + testl %eax, %eax #77.28 + je ..B1.52 # Prob 50% #77.28 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.45: # Preds ..B1.42 ..B1.44 + # Execution count [3.75e+01] + cmpl %r9d, %r13d #77.62 + jne ..B1.54 # Prob 50% #77.62 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.46: # Preds ..B1.45 + # Execution count [1.88e+01] + testl %r8d, %r8d #77.99 + jle ..B1.54 # Prob 50% #77.99 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.47: # Preds ..B1.46 + # Execution count [0.00e+00] + cmpl $2, %r8d #77.99 + jl ..B1.70 # Prob 50% 
#77.99 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.48: # Preds ..B1.47 + # Execution count [0.00e+00] + testl %eax, %eax #77.28 + je ..B1.82 # Prob 50% #77.28 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.49: # Preds ..B1.37 ..B1.48 + # Execution count [0.00e+00] + cmpl $3, %r8d #77.99 + jl ..B1.83 # Prob 50% #77.99 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.50: # Preds ..B1.49 + # Execution count [6.25e+00] + cmpl %r9d, %r13d #77.62 + jne ..B1.96 # Prob 50% #77.62 + jmp ..B1.64 # Prob 100% #77.62 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.52: # Preds ..B1.43 ..B1.44 + # Execution count [3.75e+01] + cmpl %r9d, %r13d #78.62 + jne ..B1.54 # Prob 50% #78.62 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.53: # Preds ..B1.52 + # Execution count [1.88e+01] + cmpl $1, %r8d #78.100 + je ..B1.70 # Prob 50% #78.100 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.54: # Preds ..B1.53 ..B1.46 ..B1.45 ..B1.52 ..B1.36 + # ..B1.39 + # Execution count [5.00e+01] + vsubsd 72(%rdi,%r10), %xmm5, %xmm19 #85.48 + vsubsd 8(%rdi,%r10), %xmm6, %xmm18 #84.48 + vsubsd 136(%rdi,%r10), %xmm4, %xmm20 #86.48 + vmulsd %xmm19, %xmm19, %xmm11 #87.61 + vfmadd231sd %xmm18, %xmm18, %xmm11 #87.75 + vfmadd231sd %xmm20, %xmm20, %xmm11 #87.75 + vcomisd %xmm11, %xmm10 #88.34 + jbe ..B1.59 # Prob 50% #88.34 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm18 xmm19 xmm20 +..B1.55: # Preds ..B1.54 + # Execution count [2.50e+01] + vdivsd %xmm11, %xmm8, %xmm13 #89.51 + vmulsd %xmm13, %xmm7, %xmm11 #90.50 + vmulsd %xmm3, %xmm13, %xmm14 #91.67 + vmulsd %xmm13, %xmm11, %xmm12 #90.56 + vmulsd %xmm13, %xmm12, %xmm15 #90.62 + vmulsd %xmm15, %xmm14, %xmm16 #91.76 + vsubsd %xmm9, %xmm15, %xmm17 #91.67 + vmulsd %xmm17, %xmm16, %xmm15 #91.82 + vmulsd %xmm15, %xmm18, %xmm17 #94.67 + testl %eax, %eax #93.32 + je ..B1.57 # Prob 50% #93.32 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm15 xmm17 xmm19 xmm20 +..B1.56: # Preds ..B1.55 + # Execution count [1.25e+01] + vmovsd 8(%rdi,%rsi), %xmm11 #94.33 + vmovsd 72(%rdi,%rsi), %xmm13 #95.33 + vsubsd %xmm17, %xmm11, %xmm12 #94.33 + vmulsd %xmm15, %xmm19, %xmm11 #95.67 + vmovsd %xmm12, 8(%rdi,%rsi) #94.33 + vsubsd %xmm11, %xmm13, %xmm14 #95.33 + vmulsd %xmm15, %xmm20, %xmm12 #96.67 + vmovsd 136(%rdi,%rsi), %xmm15 #96.33 + vmovsd %xmm14, 72(%rdi,%rsi) #95.33 + vsubsd %xmm12, %xmm15, %xmm16 #96.33 + vmovsd %xmm16, 136(%rdi,%rsi) #96.33 + jmp ..B1.58 # Prob 100% #96.33 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm17 +..B1.57: # Preds ..B1.55 + # Execution count [1.25e+01] + vmulsd %xmm15, %xmm19, %xmm11 #100.43 + vmulsd %xmm15, %xmm20, %xmm12 #101.43 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm17 +..B1.58: # Preds ..B1.56 ..B1.57 + # Execution count [2.50e+01] + incq 24(%r12) #103.29 + movb $1, %dl #102.29 + 
vaddsd %xmm17, %xmm2, %xmm2 #99.29 + vaddsd %xmm11, %xmm1, %xmm1 #100.29 + vaddsd %xmm12, %xmm0, %xmm0 #101.29 + jmp ..B1.60 # Prob 100% #101.29 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.59: # Preds ..B1.54 + # Execution count [2.50e+01] + incq 32(%r12) #105.29 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.60: # Preds ..B1.58 ..B1.59 + # Execution count [7.50e+01] + testl %eax, %eax #77.28 + je ..B1.68 # Prob 50% #77.28 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.61: # Preds ..B1.60 + # Execution count [3.75e+01] + cmpl %r9d, %r13d #77.62 + jne ..B1.70 # Prob 50% #77.62 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.62: # Preds ..B1.61 + # Execution count [1.88e+01] + cmpl $2, %r8d #77.99 + jl ..B1.70 # Prob 50% #77.99 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.63: # Preds ..B1.62 + # Execution count [6.25e+00] + cmpl $3, %r8d #77.99 + jl ..B1.83 # Prob 50% #77.99 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.64: # Preds ..B1.63 ..B1.50 + # Execution count [3.91e+00] + cmpl $4, %r8d #77.99 + jl ..B1.96 # Prob 50% #77.99 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.65: # Preds ..B1.64 + # Execution count [0.00e+00] + testl %eax, %eax #77.28 + jne ..B1.80 # Prob 50% #77.28 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.66: # Preds ..B1.65 + # Execution count [7.81e+00] + cmpl %r9d, %r13d #78.62 + jne ..B1.110 # Prob 50% #78.62 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.67: # Preds ..B1.66 + # Execution count [3.91e+00] + cmpl $5, %r8d #78.100 + jne ..B1.110 # Prob 50% #78.100 + jmp ..B1.107 # Prob 100% #78.100 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.68: # Preds ..B1.60 + # Execution count [3.75e+01] + cmpl %r9d, %r13d #78.62 + jne ..B1.70 # Prob 50% #78.62 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.69: # Preds ..B1.68 + # Execution count [1.88e+01] + cmpl $2, %r8d #78.100 + je ..B1.83 # Prob 50% #78.100 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.70: # Preds ..B1.37 ..B1.62 ..B1.69 ..B1.61 ..B1.68 + # ..B1.47 ..B1.53 + # Execution count [5.00e+01] + vsubsd 80(%rdi,%r10), %xmm5, %xmm19 #85.48 + vsubsd 16(%rdi,%r10), %xmm6, %xmm18 #84.48 + vsubsd 144(%rdi,%r10), %xmm4, %xmm20 #86.48 + vmulsd %xmm19, %xmm19, %xmm11 #87.61 + vfmadd231sd %xmm18, %xmm18, %xmm11 #87.75 + vfmadd231sd %xmm20, %xmm20, %xmm11 #87.75 + vcomisd %xmm11, %xmm10 #88.34 + jbe ..B1.75 # Prob 50% #88.34 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm18 xmm19 xmm20 +..B1.71: # Preds ..B1.70 + # Execution count [2.50e+01] + vdivsd %xmm11, %xmm8, %xmm13 #89.51 + vmulsd %xmm13, %xmm7, 
%xmm11 #90.50 + vmulsd %xmm3, %xmm13, %xmm14 #91.67 + vmulsd %xmm13, %xmm11, %xmm12 #90.56 + vmulsd %xmm13, %xmm12, %xmm15 #90.62 + vmulsd %xmm15, %xmm14, %xmm16 #91.76 + vsubsd %xmm9, %xmm15, %xmm17 #91.67 + vmulsd %xmm17, %xmm16, %xmm15 #91.82 + vmulsd %xmm15, %xmm18, %xmm17 #94.67 + testl %eax, %eax #93.32 + je ..B1.73 # Prob 50% #93.32 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm15 xmm17 xmm19 xmm20 +..B1.72: # Preds ..B1.71 + # Execution count [1.25e+01] + vmovsd 16(%rdi,%rsi), %xmm11 #94.33 + vmovsd 80(%rdi,%rsi), %xmm13 #95.33 + vsubsd %xmm17, %xmm11, %xmm12 #94.33 + vmulsd %xmm15, %xmm19, %xmm11 #95.67 + vmovsd %xmm12, 16(%rdi,%rsi) #94.33 + vsubsd %xmm11, %xmm13, %xmm14 #95.33 + vmulsd %xmm15, %xmm20, %xmm12 #96.67 + vmovsd 144(%rdi,%rsi), %xmm15 #96.33 + vmovsd %xmm14, 80(%rdi,%rsi) #95.33 + vsubsd %xmm12, %xmm15, %xmm16 #96.33 + vmovsd %xmm16, 144(%rdi,%rsi) #96.33 + jmp ..B1.74 # Prob 100% #96.33 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm17 +..B1.73: # Preds ..B1.71 + # Execution count [1.25e+01] + vmulsd %xmm15, %xmm19, %xmm11 #100.43 + vmulsd %xmm15, %xmm20, %xmm12 #101.43 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm17 +..B1.74: # Preds ..B1.72 ..B1.73 + # Execution count [2.50e+01] + incq 24(%r12) #103.29 + movb $1, %dl #102.29 + vaddsd %xmm17, %xmm2, %xmm2 #99.29 + vaddsd %xmm11, %xmm1, %xmm1 #100.29 + vaddsd %xmm12, %xmm0, %xmm0 #101.29 + jmp ..B1.76 # Prob 100% #101.29 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.75: # Preds ..B1.70 + # Execution count [2.50e+01] + incq 32(%r12) #105.29 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.76: # Preds ..B1.74 ..B1.75 + # Execution count [7.50e+01] + testl %eax, %eax #77.28 + je ..B1.81 # Prob 50% #77.28 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.77: # Preds ..B1.76 + # Execution count [3.75e+01] + cmpl %r9d, %r13d #77.62 + jne ..B1.83 # Prob 50% #77.62 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.78: # Preds ..B1.77 + # Execution count [1.88e+01] + cmpl $3, %r8d #77.99 + jl ..B1.83 # Prob 50% #77.99 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.79: # Preds ..B1.78 + # Execution count [2.34e+00] + cmpl $4, %r8d #77.99 + jl ..B1.96 # Prob 50% #77.99 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.80: # Preds ..B1.79 ..B1.65 + # Execution count [7.81e+00] + cmpl %r9d, %r13d #77.62 + jne ..B1.110 # Prob 50% #77.62 + jmp ..B1.92 # Prob 100% #77.62 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.81: # Preds ..B1.76 + # Execution count [3.75e+01] + cmpl %r9d, %r13d #78.62 + jne ..B1.83 # Prob 50% #78.62 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.82: # Preds ..B1.48 ..B1.81 + # Execution count [1.88e+01] + cmpl $3, %r8d #78.100 + je ..B1.96 # Prob 50% 
#78.100 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.83: # Preds ..B1.49 ..B1.78 ..B1.82 ..B1.77 ..B1.81 + # ..B1.63 ..B1.69 + # Execution count [5.00e+01] + vsubsd 88(%rdi,%r10), %xmm5, %xmm19 #85.48 + vsubsd 24(%rdi,%r10), %xmm6, %xmm18 #84.48 + vsubsd 152(%rdi,%r10), %xmm4, %xmm20 #86.48 + vmulsd %xmm19, %xmm19, %xmm11 #87.61 + vfmadd231sd %xmm18, %xmm18, %xmm11 #87.75 + vfmadd231sd %xmm20, %xmm20, %xmm11 #87.75 + vcomisd %xmm11, %xmm10 #88.34 + jbe ..B1.88 # Prob 50% #88.34 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm18 xmm19 xmm20 +..B1.84: # Preds ..B1.83 + # Execution count [2.50e+01] + vdivsd %xmm11, %xmm8, %xmm13 #89.51 + vmulsd %xmm13, %xmm7, %xmm11 #90.50 + vmulsd %xmm3, %xmm13, %xmm14 #91.67 + vmulsd %xmm13, %xmm11, %xmm12 #90.56 + vmulsd %xmm13, %xmm12, %xmm15 #90.62 + vmulsd %xmm15, %xmm14, %xmm16 #91.76 + vsubsd %xmm9, %xmm15, %xmm17 #91.67 + vmulsd %xmm17, %xmm16, %xmm15 #91.82 + vmulsd %xmm15, %xmm18, %xmm17 #94.67 + testl %eax, %eax #93.32 + je ..B1.86 # Prob 50% #93.32 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm15 xmm17 xmm19 xmm20 +..B1.85: # Preds ..B1.84 + # Execution count [1.25e+01] + vmovsd 24(%rdi,%rsi), %xmm11 #94.33 + vmovsd 88(%rdi,%rsi), %xmm13 #95.33 + vsubsd %xmm17, %xmm11, %xmm12 #94.33 + vmulsd %xmm15, %xmm19, %xmm11 #95.67 + vmovsd %xmm12, 24(%rdi,%rsi) #94.33 + vsubsd %xmm11, %xmm13, %xmm14 #95.33 + vmulsd %xmm15, %xmm20, %xmm12 #96.67 + vmovsd 152(%rdi,%rsi), %xmm15 #96.33 + vmovsd %xmm14, 88(%rdi,%rsi) #95.33 + vsubsd %xmm12, %xmm15, %xmm16 #96.33 + vmovsd %xmm16, 152(%rdi,%rsi) #96.33 + jmp ..B1.87 # Prob 100% #96.33 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm17 +..B1.86: # Preds ..B1.84 + # Execution count [1.25e+01] + vmulsd %xmm15, %xmm19, %xmm11 #100.43 + vmulsd %xmm15, %xmm20, %xmm12 #101.43 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm17 +..B1.87: # Preds ..B1.85 ..B1.86 + # Execution count [2.50e+01] + incq 24(%r12) #103.29 + movb $1, %dl #102.29 + vaddsd %xmm17, %xmm2, %xmm2 #99.29 + vaddsd %xmm11, %xmm1, %xmm1 #100.29 + vaddsd %xmm12, %xmm0, %xmm0 #101.29 + jmp ..B1.89 # Prob 100% #101.29 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.88: # Preds ..B1.83 + # Execution count [2.50e+01] + incq 32(%r12) #105.29 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.89: # Preds ..B1.87 ..B1.88 + # Execution count [7.50e+01] + testl %eax, %eax #77.28 + je ..B1.94 # Prob 50% #77.28 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.90: # Preds ..B1.89 + # Execution count [3.75e+01] + cmpl %r9d, %r13d #77.62 + jne ..B1.96 # Prob 50% #77.62 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.91: # Preds ..B1.90 + # Execution count [1.88e+01] + cmpl $4, %r8d #77.99 + jl ..B1.96 # Prob 50% #77.99 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.92: # Preds ..B1.91 ..B1.80 
+ # Execution count [6.25e+00] + cmpl $5, %r8d #77.99 + jl ..B1.110 # Prob 50% #77.99 + jmp ..B1.107 # Prob 100% #77.99 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.94: # Preds ..B1.89 + # Execution count [3.75e+01] + cmpl %r9d, %r13d #78.62 + jne ..B1.96 # Prob 50% #78.62 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.95: # Preds ..B1.94 + # Execution count [1.88e+01] + cmpl $4, %r8d #78.100 + je ..B1.110 # Prob 50% #78.100 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.96: # Preds ..B1.82 ..B1.79 ..B1.91 ..B1.95 ..B1.90 + # ..B1.94 ..B1.50 ..B1.64 + # Execution count [5.00e+01] + vsubsd 96(%rdi,%r10), %xmm5, %xmm19 #85.48 + vsubsd 32(%rdi,%r10), %xmm6, %xmm18 #84.48 + vsubsd 160(%rdi,%r10), %xmm4, %xmm20 #86.48 + vmulsd %xmm19, %xmm19, %xmm11 #87.61 + vfmadd231sd %xmm18, %xmm18, %xmm11 #87.75 + vfmadd231sd %xmm20, %xmm20, %xmm11 #87.75 + vcomisd %xmm11, %xmm10 #88.34 + jbe ..B1.101 # Prob 50% #88.34 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm18 xmm19 xmm20 +..B1.97: # Preds ..B1.96 + # Execution count [2.50e+01] + vdivsd %xmm11, %xmm8, %xmm13 #89.51 + vmulsd %xmm13, %xmm7, %xmm11 #90.50 + vmulsd %xmm3, %xmm13, %xmm14 #91.67 + vmulsd %xmm13, %xmm11, %xmm12 #90.56 + vmulsd %xmm13, %xmm12, %xmm15 #90.62 + vmulsd %xmm15, %xmm14, %xmm16 #91.76 + vsubsd %xmm9, %xmm15, %xmm17 #91.67 + vmulsd %xmm17, %xmm16, %xmm15 #91.82 + vmulsd %xmm15, %xmm18, %xmm17 #94.67 + testl %eax, %eax #93.32 + je ..B1.99 # Prob 50% #93.32 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm15 xmm17 xmm19 xmm20 +..B1.98: # Preds ..B1.97 + # Execution count [1.25e+01] + vmovsd 32(%rdi,%rsi), %xmm11 #94.33 + vmovsd 96(%rdi,%rsi), %xmm13 #95.33 + vsubsd %xmm17, %xmm11, %xmm12 #94.33 + vmulsd %xmm15, %xmm19, %xmm11 #95.67 + vmovsd %xmm12, 32(%rdi,%rsi) #94.33 + vsubsd %xmm11, %xmm13, %xmm14 #95.33 + vmulsd %xmm15, %xmm20, %xmm12 #96.67 + vmovsd 160(%rdi,%rsi), %xmm15 #96.33 + vmovsd %xmm14, 96(%rdi,%rsi) #95.33 + vsubsd %xmm12, %xmm15, %xmm16 #96.33 + vmovsd %xmm16, 160(%rdi,%rsi) #96.33 + jmp ..B1.100 # Prob 100% #96.33 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm17 +..B1.99: # Preds ..B1.97 + # Execution count [1.25e+01] + vmulsd %xmm15, %xmm19, %xmm11 #100.43 + vmulsd %xmm15, %xmm20, %xmm12 #101.43 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm17 +..B1.100: # Preds ..B1.98 ..B1.99 + # Execution count [2.50e+01] + incq 24(%r12) #103.29 + movb $1, %dl #102.29 + vaddsd %xmm17, %xmm2, %xmm2 #99.29 + vaddsd %xmm11, %xmm1, %xmm1 #100.29 + vaddsd %xmm12, %xmm0, %xmm0 #101.29 + jmp ..B1.102 # Prob 100% #101.29 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.101: # Preds ..B1.96 + # Execution count [2.50e+01] + incq 32(%r12) #105.29 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.102: # Preds ..B1.100 ..B1.101 + # Execution count [7.50e+01] + testl %eax, %eax #77.28 + je ..B1.105 # Prob 50% #77.28 + # LOE rbx rsi rdi 
r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.103: # Preds ..B1.102 + # Execution count [3.75e+01] + cmpl %r9d, %r13d #77.62 + jne ..B1.110 # Prob 50% #77.62 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.104: # Preds ..B1.103 + # Execution count [1.88e+01] + cmpl $5, %r8d #77.99 + jl ..B1.110 # Prob 50% #77.99 + jmp ..B1.107 # Prob 100% #77.99 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.105: # Preds ..B1.102 + # Execution count [3.75e+01] + cmpl %r9d, %r13d #78.62 + jne ..B1.110 # Prob 50% #78.62 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.106: # Preds ..B1.105 + # Execution count [1.88e+01] + cmpl $5, %r8d #78.100 + jne ..B1.110 # Prob 50% #78.100 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.107: # Preds ..B1.92 ..B1.67 ..B1.106 ..B1.104 + # Execution count [9.38e+00] + testl %eax, %eax #77.28 + jne ..B1.117 # Prob 50% #77.28 + jmp ..B1.120 # Prob 100% #77.28 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.110: # Preds ..B1.95 ..B1.104 ..B1.106 ..B1.103 ..B1.105 + # ..B1.80 ..B1.92 ..B1.66 ..B1.67 + # Execution count [5.00e+01] + vsubsd 104(%rdi,%r10), %xmm5, %xmm19 #85.48 + vsubsd 40(%rdi,%r10), %xmm6, %xmm18 #84.48 + vsubsd 168(%rdi,%r10), %xmm4, %xmm20 #86.48 + vmulsd %xmm19, %xmm19, %xmm11 #87.61 + vfmadd231sd %xmm18, %xmm18, %xmm11 #87.75 + vfmadd231sd %xmm20, %xmm20, %xmm11 #87.75 + vcomisd %xmm11, %xmm10 #88.34 + jbe ..B1.115 # Prob 50% #88.34 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm18 xmm19 xmm20 +..B1.111: # Preds ..B1.110 + # Execution count [2.50e+01] + vdivsd %xmm11, %xmm8, %xmm13 #89.51 + vmulsd %xmm13, %xmm7, %xmm11 #90.50 + vmulsd %xmm3, %xmm13, %xmm14 #91.67 + vmulsd %xmm13, %xmm11, %xmm12 #90.56 + vmulsd %xmm13, %xmm12, %xmm15 #90.62 + vmulsd %xmm15, %xmm14, %xmm16 #91.76 + vsubsd %xmm9, %xmm15, %xmm17 #91.67 + vmulsd %xmm17, %xmm16, %xmm15 #91.82 + vmulsd %xmm15, %xmm18, %xmm17 #94.67 + testl %eax, %eax #93.32 + je ..B1.113 # Prob 50% #93.32 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm15 xmm17 xmm19 xmm20 +..B1.112: # Preds ..B1.111 + # Execution count [1.25e+01] + vmovsd 40(%rdi,%rsi), %xmm11 #94.33 + vmovsd 104(%rdi,%rsi), %xmm13 #95.33 + vsubsd %xmm17, %xmm11, %xmm12 #94.33 + vmulsd %xmm15, %xmm19, %xmm11 #95.67 + vmovsd %xmm12, 40(%rdi,%rsi) #94.33 + vsubsd %xmm11, %xmm13, %xmm14 #95.33 + vmulsd %xmm15, %xmm20, %xmm12 #96.67 + vmovsd 168(%rdi,%rsi), %xmm15 #96.33 + vmovsd %xmm14, 104(%rdi,%rsi) #95.33 + vsubsd %xmm12, %xmm15, %xmm16 #96.33 + vmovsd %xmm16, 168(%rdi,%rsi) #96.33 + jmp ..B1.114 # Prob 100% #96.33 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm17 +..B1.113: # Preds ..B1.111 + # Execution count [1.25e+01] + vmulsd %xmm15, %xmm19, %xmm11 #100.43 + vmulsd %xmm15, %xmm20, %xmm12 #101.43 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm17 +..B1.114: # Preds ..B1.112 ..B1.113 + 
# Execution count [2.50e+01] + incq 24(%r12) #103.29 + movb $1, %dl #102.29 + vaddsd %xmm17, %xmm2, %xmm2 #99.29 + vaddsd %xmm11, %xmm1, %xmm1 #100.29 + vaddsd %xmm12, %xmm0, %xmm0 #101.29 + jmp ..B1.116 # Prob 100% #101.29 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.115: # Preds ..B1.110 + # Execution count [2.50e+01] + incq 32(%r12) #105.29 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.116: # Preds ..B1.114 ..B1.115 + # Execution count [7.50e+01] + testl %eax, %eax #77.28 + je ..B1.120 # Prob 50% #77.28 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.117: # Preds ..B1.107 ..B1.116 + # Execution count [3.75e+01] + cmpl %r9d, %r13d #77.62 + jne ..B1.123 # Prob 50% #77.62 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.119: # Preds ..B1.117 + # Execution count [2.50e+01] + cmpl $6, %r8d #77.99 + jl ..B1.123 # Prob 50% #77.99 + jmp ..B1.130 # Prob 100% #77.99 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.120: # Preds ..B1.107 ..B1.116 + # Execution count [3.75e+01] + cmpl %r9d, %r13d #78.62 + jne ..B1.123 # Prob 50% #78.62 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.122: # Preds ..B1.120 + # Execution count [2.50e+01] + cmpl $6, %r8d #78.100 + je ..B1.130 # Prob 50% #78.100 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.123: # Preds ..B1.117 ..B1.120 ..B1.119 ..B1.122 + # Execution count [5.00e+01] + vsubsd 112(%rdi,%r10), %xmm5, %xmm19 #85.48 + vsubsd 48(%rdi,%r10), %xmm6, %xmm18 #84.48 + vsubsd 176(%rdi,%r10), %xmm4, %xmm20 #86.48 + vmulsd %xmm19, %xmm19, %xmm11 #87.61 + vfmadd231sd %xmm18, %xmm18, %xmm11 #87.75 + vfmadd231sd %xmm20, %xmm20, %xmm11 #87.75 + vcomisd %xmm11, %xmm10 #88.34 + jbe ..B1.128 # Prob 50% #88.34 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm18 xmm19 xmm20 +..B1.124: # Preds ..B1.123 + # Execution count [2.50e+01] + vdivsd %xmm11, %xmm8, %xmm13 #89.51 + vmulsd %xmm13, %xmm7, %xmm11 #90.50 + vmulsd %xmm3, %xmm13, %xmm14 #91.67 + vmulsd %xmm13, %xmm11, %xmm12 #90.56 + vmulsd %xmm13, %xmm12, %xmm15 #90.62 + vmulsd %xmm15, %xmm14, %xmm16 #91.76 + vsubsd %xmm9, %xmm15, %xmm17 #91.67 + vmulsd %xmm17, %xmm16, %xmm15 #91.82 + vmulsd %xmm15, %xmm18, %xmm17 #94.67 + testl %eax, %eax #93.32 + je ..B1.126 # Prob 50% #93.32 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm15 xmm17 xmm19 xmm20 +..B1.125: # Preds ..B1.124 + # Execution count [1.25e+01] + vmovsd 48(%rdi,%rsi), %xmm11 #94.33 + vmovsd 112(%rdi,%rsi), %xmm13 #95.33 + vsubsd %xmm17, %xmm11, %xmm12 #94.33 + vmulsd %xmm15, %xmm19, %xmm11 #95.67 + vmovsd %xmm12, 48(%rdi,%rsi) #94.33 + vsubsd %xmm11, %xmm13, %xmm14 #95.33 + vmulsd %xmm15, %xmm20, %xmm12 #96.67 + vmovsd 176(%rdi,%rsi), %xmm15 #96.33 + vmovsd %xmm14, 112(%rdi,%rsi) #95.33 + vsubsd %xmm12, %xmm15, %xmm16 #96.33 + vmovsd %xmm16, 176(%rdi,%rsi) #96.33 + jmp ..B1.127 # Prob 100% #96.33 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d cl xmm0 xmm1 xmm2 
xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm17 +..B1.126: # Preds ..B1.124 + # Execution count [1.25e+01] + vmulsd %xmm15, %xmm19, %xmm11 #100.43 + vmulsd %xmm15, %xmm20, %xmm12 #101.43 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm17 +..B1.127: # Preds ..B1.125 ..B1.126 + # Execution count [2.50e+01] + incq 24(%r12) #103.29 + movb $1, %dl #102.29 + vaddsd %xmm17, %xmm2, %xmm2 #99.29 + vaddsd %xmm11, %xmm1, %xmm1 #100.29 + vaddsd %xmm12, %xmm0, %xmm0 #101.29 + jmp ..B1.130 # Prob 100% #101.29 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.128: # Preds ..B1.123 + # Execution count [2.50e+01] + incq 32(%r12) #105.29 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.130: # Preds ..B1.128 ..B1.127 ..B1.122 ..B1.119 + # Execution count [1.25e+01] + testl %eax, %eax #77.28 + je ..B1.133 # Prob 50% #77.28 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.131: # Preds ..B1.130 + # Execution count [5.00e+01] + cmpl %r9d, %r13d #77.62 + jne ..B1.135 # Prob 50% #77.62 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.132: # Preds ..B1.131 + # Execution count [2.50e+01] + cmpl $7, %r8d #77.99 + jl ..B1.135 # Prob 50% #77.99 + jmp ..B1.141 # Prob 100% #77.99 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.133: # Preds ..B1.130 + # Execution count [5.00e+01] + cmpl %r9d, %r13d #78.62 + jne ..B1.135 # Prob 50% #78.62 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.134: # Preds ..B1.133 + # Execution count [2.50e+01] + cmpl $7, %r8d #78.100 + je ..B1.141 # Prob 50% #78.100 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.135: # Preds ..B1.131 ..B1.132 ..B1.133 ..B1.134 + # Execution count [5.00e+01] + vsubsd 120(%rdi,%r10), %xmm5, %xmm16 #85.48 + vsubsd 56(%rdi,%r10), %xmm6, %xmm15 #84.48 + vsubsd 184(%rdi,%r10), %xmm4, %xmm17 #86.48 + vmulsd %xmm16, %xmm16, %xmm4 #87.61 + vfmadd231sd %xmm15, %xmm15, %xmm4 #87.75 + vfmadd231sd %xmm17, %xmm17, %xmm4 #87.75 + vcomisd %xmm4, %xmm10 #88.34 + jbe ..B1.140 # Prob 50% #88.34 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm7 xmm8 xmm9 xmm10 xmm15 xmm16 xmm17 +..B1.136: # Preds ..B1.135 + # Execution count [2.50e+01] + vdivsd %xmm4, %xmm8, %xmm6 #89.51 + vmulsd %xmm6, %xmm7, %xmm4 #90.50 + vmulsd %xmm3, %xmm6, %xmm11 #91.67 + vmulsd %xmm6, %xmm4, %xmm5 #90.56 + vmulsd %xmm6, %xmm5, %xmm12 #90.62 + vmulsd %xmm12, %xmm11, %xmm13 #91.76 + vsubsd %xmm9, %xmm12, %xmm14 #91.67 + vmulsd %xmm14, %xmm13, %xmm12 #91.82 + vmulsd %xmm12, %xmm15, %xmm14 #94.67 + testl %eax, %eax #93.32 + je ..B1.138 # Prob 50% #93.32 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm7 xmm8 xmm9 xmm10 xmm12 xmm14 xmm16 xmm17 +..B1.137: # Preds ..B1.136 + # Execution count [1.25e+01] + vmovsd 56(%rdi,%rsi), %xmm4 #94.33 + vmovsd 120(%rdi,%rsi), %xmm6 #95.33 + vsubsd %xmm14, %xmm4, %xmm5 #94.33 + vmulsd %xmm12, %xmm16, %xmm4 #95.67 + vmovsd %xmm5, 56(%rdi,%rsi) #94.33 + vsubsd %xmm4, %xmm6, 
%xmm11 #95.33 + vmulsd %xmm12, %xmm17, %xmm5 #96.67 + vmovsd 184(%rdi,%rsi), %xmm12 #96.33 + vmovsd %xmm11, 120(%rdi,%rsi) #95.33 + vsubsd %xmm5, %xmm12, %xmm13 #96.33 + vmovsd %xmm13, 184(%rdi,%rsi) #96.33 + jmp ..B1.139 # Prob 100% #96.33 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm7 xmm8 xmm9 xmm10 xmm14 +..B1.138: # Preds ..B1.136 + # Execution count [1.25e+01] + vmulsd %xmm12, %xmm16, %xmm4 #100.43 + vmulsd %xmm12, %xmm17, %xmm5 #101.43 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm7 xmm8 xmm9 xmm10 xmm14 +..B1.139: # Preds ..B1.137 ..B1.138 + # Execution count [2.50e+01] + incq 24(%r12) #103.29 + movb $1, %dl #102.29 + vaddsd %xmm14, %xmm2, %xmm2 #99.29 + vaddsd %xmm4, %xmm1, %xmm1 #100.29 + vaddsd %xmm5, %xmm0, %xmm0 #101.29 + jmp ..B1.142 # Prob 100% #101.29 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm7 xmm8 xmm9 xmm10 +..B1.140: # Preds ..B1.135 + # Execution count [2.50e+01] + incq 32(%r12) #105.29 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm7 xmm8 xmm9 xmm10 +..B1.141: # Preds ..B1.132 ..B1.134 ..B1.140 + # Execution count [7.50e+01] + testb %dl, %dl #110.27 + je ..B1.143 # Prob 50% #110.27 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm7 xmm8 xmm9 xmm10 +..B1.142: # Preds ..B1.139 ..B1.141 + # Execution count [5.00e+01] + incq 40(%r12) #111.21 + jmp ..B1.144 # Prob 100% #111.21 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm7 xmm8 xmm9 xmm10 +..B1.143: # Preds ..B1.141 + # Execution count [5.00e+01] + incq 48(%r12) #113.21 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm7 xmm8 xmm9 xmm10 +..B1.144: # Preds ..B1.142 ..B1.143 + # Execution count [1.00e+02] + incb %cl #63.13 + incl %r8d #63.13 + vaddsd (%r14,%rbx,8), %xmm2, %xmm2 #116.17 + vaddsd 64(%r14,%rbx,8), %xmm1, %xmm1 #117.17 + vaddsd 128(%r14,%rbx,8), %xmm0, %xmm0 #118.17 + vmovsd %xmm2, (%r14,%rbx,8) #116.17 + vmovsd %xmm1, 64(%r14,%rbx,8) #117.17 + vmovsd %xmm0, 128(%r14,%rbx,8) #118.17 + incq %rbx #63.13 + cmpb $4, %cl #63.13 + jb ..B1.33 # Prob 75% #63.13 + # LOE rbx rsi rdi r10 r11 r12 r14 eax r8d r9d r13d r15d dl cl xmm3 xmm7 xmm8 xmm9 xmm10 +..B1.145: # Preds ..B1.144 + # Execution count [2.50e+01] + movq 112(%rsp), %r10 #[spill] + incq %r10 #56.9 + movq 104(%rsp), %r12 #[spill] + movq 96(%rsp), %rsi #[spill] + cmpq %r12, %r10 #56.9 + jb ..B1.32 # Prob 82% #56.9 + # LOE rsi r10 r11 r12 r14 eax r13d r15d xmm3 xmm7 xmm8 xmm9 xmm10 +..B1.146: # Preds ..B1.145 + # Execution count [4.50e+00] + movq 48(%rsp), %rdx #[spill] + movl 40(%rsp), %ecx #[spill] + movq 24(%rsp), %rax #[spill] + movq 16(%rsp), %rbx #[spill] + movq 8(%rsp), %rdi #[spill] + movq 80(%rsp), %r8 #[spill] + movq 88(%rsp), %r9 #[spill] + # LOE rax rdx rbx rsi rdi r8 r9 r12 ecx xmm3 xmm7 xmm8 xmm9 xmm10 +..B1.147: # Preds ..B1.146 ..B1.30 + # Execution count [5.00e+00] + vxorpd %xmm16, %xmm16, %xmm16 #124.9 + addq %r12, %rbx #123.9 + vcvtsi2sd %r12d, %xmm16, %xmm16 #124.9 + vmulsd %xmm16, %xmm9, %xmm0 #124.9 + incl %ecx #47.5 + vcvttsd2si %xmm0, %r10 #124.9 + incq %rdx #47.5 + addq %r10, %rax #124.9 + lea (%rsi,%rdi,4), %rsi #47.5 + cmpl 56(%rsp), %ecx #47.5[spill] + jb ..B1.30 # Prob 82% #47.5 + # LOE rax rdx rbx rsi rdi r8 r9 ecx xmm3 xmm7 xmm8 xmm9 xmm10 +..B1.148: # Preds ..B1.147 + # Execution count [9.00e-01] + movq 72(%rsp), %r13 #[spill] + movq %rax, 
16(%r13) #124.9 + movq %rbx, 8(%r13) #123.9 + # LOE r12 +..B1.149: # Preds ..B1.26 ..B1.148 + # Execution count [1.00e+00] + movl $.L_2__STRING.1, %edi #127.5 +..___tag_value_computeForceLJ_ref.56: +# likwid_markerStopRegion(const char *) + call likwid_markerStopRegion #127.5 +..___tag_value_computeForceLJ_ref.57: + # LOE r12 +..B1.150: # Preds ..B1.149 + # Execution count [1.00e+00] + xorl %eax, %eax #130.16 +..___tag_value_computeForceLJ_ref.58: +# getTimeStamp() + call getTimeStamp #130.16 +..___tag_value_computeForceLJ_ref.59: + # LOE r12 xmm0 +..B1.157: # Preds ..B1.150 + # Execution count [1.00e+00] + vmovsd %xmm0, 8(%rsp) #130.16[spill] + # LOE r12 +..B1.151: # Preds ..B1.157 + # Execution count [1.00e+00] + movl $.L_2__STRING.2, %edi #131.5 + xorl %eax, %eax #131.5 +..___tag_value_computeForceLJ_ref.61: +# debug_printf(const char *, ...) + call debug_printf #131.5 +..___tag_value_computeForceLJ_ref.62: + # LOE r12 +..B1.152: # Preds ..B1.151 + # Execution count [1.00e+00] + vmovsd 8(%rsp), %xmm0 #132.14[spill] + vsubsd (%rsp), %xmm0, %xmm0 #132.14[spill] + addq $152, %rsp #132.14 + .cfi_restore 3 + popq %rbx #132.14 + .cfi_restore 15 + popq %r15 #132.14 + .cfi_restore 14 + popq %r14 #132.14 + .cfi_restore 13 + popq %r13 #132.14 + .cfi_restore 12 + popq %r12 #132.14 + movq %rbp, %rsp #132.14 + popq %rbp #132.14 + .cfi_def_cfa 7, 8 + .cfi_restore 6 + ret #132.14 + .cfi_def_cfa 6, 16 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22 + .cfi_offset 6, -16 + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + # LOE +..B1.153: # Preds ..B1.5 + # Execution count [4.50e-01]: Infreq + xorl %r10d, %r10d #33.9 + jmp ..B1.18 # Prob 100% #33.9 + .align 16,0x90 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r10d r11d xmm0 xmm1 ymm2 + .cfi_endproc +# mark_end; + .type computeForceLJ_ref,@function + .size computeForceLJ_ref,.-computeForceLJ_ref +..LNcomputeForceLJ_ref.0: + .data +# -- End computeForceLJ_ref + .text +.L_2__routine_start_computeForceLJ_2xnn_full_1: +# -- Begin computeForceLJ_2xnn_full + .text +# mark_begin; + .align 16,0x90 + .globl computeForceLJ_2xnn_full +# --- computeForceLJ_2xnn_full(Parameter *, Atom *, Neighbor *, Stats *) +computeForceLJ_2xnn_full: +# parameter 1: %rdi +# parameter 2: %rsi +# parameter 3: %rdx +# parameter 4: %rcx +..B2.1: # Preds ..B2.0 + # Execution count [1.00e+00] + .cfi_startproc +..___tag_value_computeForceLJ_2xnn_full.80: +..L81: + #287.97 + pushq %rbp #287.97 + .cfi_def_cfa_offset 16 + movq %rsp, %rbp #287.97 + .cfi_def_cfa 6, 16 + .cfi_offset 6, -16 + andq $-64, %rsp #287.97 + pushq %r12 #287.97 + pushq %r13 #287.97 + pushq %r14 #287.97 + pushq %r15 #287.97 + pushq %rbx #287.97 + subq $216, %rsp #287.97 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 
0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + movq %rdi, %r15 #287.97 + movl $.L_2__STRING.3, %edi #288.5 + xorl %eax, %eax #288.5 + movq %rcx, %r14 #287.97 + movq %rdx, %r13 #287.97 + movq %rsi, %rbx #287.97 +..___tag_value_computeForceLJ_2xnn_full.90: +# debug_printf(const char *, ...) + call debug_printf #288.5 +..___tag_value_computeForceLJ_2xnn_full.91: + # LOE rbx r12 r13 r14 r15 +..B2.2: # Preds ..B2.1 + # Execution count [1.00e+00] + vmovsd 144(%r15), %xmm0 #291.27 + xorl %ecx, %ecx #301.5 + vmulsd %xmm0, %xmm0, %xmm1 #294.36 + xorl %esi, %esi #303.27 + vbroadcastsd 56(%r15), %zmm3 #295.32 + vbroadcastsd 40(%r15), %zmm4 #296.29 + vbroadcastsd %xmm1, %zmm2 #294.36 + vmovups %zmm3, 128(%rsp) #295.32[spill] + vmovups %zmm4, (%rsp) #296.29[spill] + vmovups %zmm2, 64(%rsp) #294.36[spill] + movl 20(%rbx), %edx #301.26 + testl %edx, %edx #301.26 + jle ..B2.24 # Prob 9% #301.26 + # LOE rbx rsi r12 r13 r14 edx ecx +..B2.3: # Preds ..B2.2 + # Execution count [9.00e-01] + movq 176(%rbx), %rdi #303.27 + movq 192(%rbx), %rax #304.32 + vxorpd %ymm2, %ymm2, %ymm2 #305.39 + vmovdqu .L_2il0floatpacket.0(%rip), %xmm1 #304.9 + vmovdqu .L_2il0floatpacket.1(%rip), %xmm0 #304.9 + # LOE rax rbx rsi rdi r13 r14 edx ecx xmm0 xmm1 ymm2 +..B2.4: # Preds ..B2.22 ..B2.3 + # Execution count [5.00e+00] + movl %ecx, %r8d #302.27 + movl %ecx, %r9d #302.27 + sarl $1, %r8d #302.27 + andl $1, %r9d #302.27 + shll $2, %r9d #302.27 + lea (%r8,%r8,2), %r10d #302.27 + lea (%r9,%r10,8), %r11d #302.27 + movslq %r11d, %r11 #303.27 + lea (%rdi,%r11,8), %r12 #303.27 + movl (%rsi,%rax), %r11d #304.32 + testl %r11d, %r11d #304.32 + jle ..B2.22 # Prob 50% #304.32 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r11d xmm0 xmm1 ymm2 +..B2.5: # Preds ..B2.4 + # Execution count [4.50e+00] + cmpl $16, %r11d #304.9 + jl ..B2.38 # Prob 10% #304.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r11d xmm0 xmm1 ymm2 +..B2.6: # Preds ..B2.5 + # Execution count [4.50e+00] + lea 128(%r12), %r8 #307.13 + andq $63, %r8 #304.9 + testl $7, %r8d #304.9 + je ..B2.8 # Prob 50% #304.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r11d xmm0 xmm1 ymm2 +..B2.7: # Preds ..B2.6 + # Execution count [2.25e+00] + xorl %r8d, %r8d #304.9 + jmp ..B2.10 # Prob 100% #304.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r11d xmm0 xmm1 ymm2 +..B2.8: # Preds ..B2.6 + # Execution count [2.25e+00] + testl %r8d, %r8d #304.9 + je ..B2.10 # Prob 50% #304.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r11d xmm0 xmm1 ymm2 +..B2.9: # Preds ..B2.8 + # Execution count [2.50e+01] + negl %r8d #304.9 + addl $64, %r8d #304.9 + shrl $3, %r8d #304.9 + cmpl %r8d, %r11d #304.9 + cmovl %r11d, %r8d #304.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r11d xmm0 xmm1 ymm2 +..B2.10: # Preds ..B2.7 ..B2.9 ..B2.8 + # Execution count [5.00e+00] + movl %r11d, %r10d #304.9 + subl %r8d, %r10d #304.9 + andl $15, %r10d #304.9 + negl %r10d #304.9 + addl %r11d, %r10d #304.9 + cmpl $1, %r8d #304.9 + jb ..B2.14 # Prob 50% #304.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r10d r11d xmm0 xmm1 ymm2 +..B2.11: # Preds ..B2.10 + # Execution count [4.50e+00] + vpbroadcastd %r8d, %xmm3 #304.9 + xorl %r15d, %r15d #304.9 + vmovdqa %xmm0, %xmm4 #304.9 + movslq %r8d, %r9 #304.9 + # LOE rax rbx rsi rdi r9 r12 r13 r14 r15 edx ecx r8d r10d r11d xmm0 xmm1 xmm3 xmm4 ymm2 +..B2.12: # Preds ..B2.12 ..B2.11 + # Execution count [2.50e+01] + vpcmpgtd %xmm4, %xmm3, %k1 #304.9 + 
vpaddd %xmm1, %xmm4, %xmm4 #304.9 + vmovupd %ymm2, (%r12,%r15,8){%k1} #305.13 + vmovupd %ymm2, 64(%r12,%r15,8){%k1} #306.13 + vmovupd %ymm2, 128(%r12,%r15,8){%k1} #307.13 + addq $4, %r15 #304.9 + cmpq %r9, %r15 #304.9 + jb ..B2.12 # Prob 82% #304.9 + # LOE rax rbx rsi rdi r9 r12 r13 r14 r15 edx ecx r8d r10d r11d xmm0 xmm1 xmm3 xmm4 ymm2 +..B2.13: # Preds ..B2.12 + # Execution count [4.50e+00] + cmpl %r8d, %r11d #304.9 + je ..B2.22 # Prob 10% #304.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r10d r11d xmm0 xmm1 ymm2 +..B2.14: # Preds ..B2.10 ..B2.13 + # Execution count [2.50e+01] + lea 16(%r8), %r9d #304.9 + cmpl %r9d, %r10d #304.9 + jl ..B2.18 # Prob 50% #304.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r10d r11d xmm0 xmm1 ymm2 +..B2.15: # Preds ..B2.14 + # Execution count [4.50e+00] + movslq %r8d, %r8 #304.9 + movslq %r10d, %r9 #304.9 + .align 16,0x90 + # LOE rax rbx rsi rdi r8 r9 r12 r13 r14 edx ecx r10d r11d xmm0 xmm1 ymm2 +..B2.16: # Preds ..B2.16 ..B2.15 + # Execution count [2.50e+01] + vmovupd %ymm2, (%r12,%r8,8) #305.13 + vmovupd %ymm2, 32(%r12,%r8,8) #305.13 + vmovupd %ymm2, 64(%r12,%r8,8) #305.13 + vmovupd %ymm2, 128(%r12,%r8,8) #306.13 + vmovupd %ymm2, 192(%r12,%r8,8) #307.13 + vmovupd %ymm2, 96(%r12,%r8,8) #305.13 + vmovupd %ymm2, 160(%r12,%r8,8) #306.13 + vmovupd %ymm2, 224(%r12,%r8,8) #307.13 + addq $16, %r8 #304.9 + cmpq %r9, %r8 #304.9 + jb ..B2.16 # Prob 82% #304.9 + # LOE rax rbx rsi rdi r8 r9 r12 r13 r14 edx ecx r10d r11d xmm0 xmm1 ymm2 +..B2.18: # Preds ..B2.16 ..B2.14 ..B2.38 + # Execution count [5.00e+00] + lea 1(%r10), %r8d #304.9 + cmpl %r11d, %r8d #304.9 + ja ..B2.22 # Prob 50% #304.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r10d r11d xmm0 xmm1 ymm2 +..B2.19: # Preds ..B2.18 + # Execution count [4.50e+00] + movslq %r10d, %r9 #305.13 + negl %r10d #304.9 + addl %r11d, %r10d #304.9 + xorl %r8d, %r8d #304.9 + movslq %r11d, %r11 #304.9 + vmovdqa %xmm0, %xmm4 #304.9 + vpbroadcastd %r10d, %xmm3 #304.9 + subq %r9, %r11 #304.9 + lea (%r12,%r9,8), %r12 #305.13 + # LOE rax rbx rsi rdi r8 r11 r12 r13 r14 edx ecx xmm0 xmm1 xmm3 xmm4 ymm2 +..B2.20: # Preds ..B2.20 ..B2.19 + # Execution count [2.50e+01] + vpcmpgtd %xmm4, %xmm3, %k1 #304.9 + vpaddd %xmm1, %xmm4, %xmm4 #304.9 + vmovupd %ymm2, (%r12,%r8,8){%k1} #305.13 + vmovupd %ymm2, 64(%r12,%r8,8){%k1} #306.13 + vmovupd %ymm2, 128(%r12,%r8,8){%k1} #307.13 + addq $4, %r8 #304.9 + cmpq %r11, %r8 #304.9 + jb ..B2.20 # Prob 82% #304.9 + # LOE rax rbx rsi rdi r8 r11 r12 r13 r14 edx ecx xmm0 xmm1 xmm3 xmm4 ymm2 +..B2.22: # Preds ..B2.20 ..B2.4 ..B2.13 ..B2.18 + # Execution count [5.00e+00] + incl %ecx #301.5 + addq $56, %rsi #301.5 + cmpl %edx, %ecx #301.5 + jb ..B2.4 # Prob 82% #301.5 + # LOE rax rbx rsi rdi r13 r14 edx ecx xmm0 xmm1 ymm2 +..B2.24: # Preds ..B2.22 ..B2.2 + # Execution count [1.00e+00] + xorl %eax, %eax #311.16 + vzeroupper #311.16 +..___tag_value_computeForceLJ_2xnn_full.95: +# getTimeStamp() + call getTimeStamp #311.16 +..___tag_value_computeForceLJ_2xnn_full.96: + # LOE rbx r12 r13 r14 xmm0 +..B2.41: # Preds ..B2.24 + # Execution count [1.00e+00] + vmovsd %xmm0, 192(%rsp) #311.16[spill] + # LOE rbx r12 r13 r14 +..B2.25: # Preds ..B2.41 + # Execution count [1.00e+00] + movl $.L_2__STRING.1, %edi #315.5 +..___tag_value_computeForceLJ_2xnn_full.98: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #315.5 +..___tag_value_computeForceLJ_2xnn_full.99: + # LOE rbx r12 r13 r14 +..B2.26: # Preds ..B2.25 + # Execution count [1.00e+00] + xorl %edx, %edx #318.16 + xorl %eax, %eax #318.16 + 
cmpl $0, 20(%rbx) #318.26 + jle ..B2.34 # Prob 10% #318.26 + # LOE rax rbx r12 r13 r14 edx +..B2.27: # Preds ..B2.26 + # Execution count [9.00e-01] + movl $65484, %ecx #406.9 + kmovw %ecx, %k2 #406.9 + movl $65450, %ecx #406.9 + kmovw %ecx, %k1 #406.9 + vmovups (%rsp), %zmm25 #406.9[spill] + vmovups 128(%rsp), %zmm26 #406.9[spill] + vmovups 64(%rsp), %zmm27 #406.9[spill] + vbroadcastsd .L_2il0floatpacket.2(%rip), %zmm28 #406.9 + vbroadcastsd .L_2il0floatpacket.3(%rip), %zmm24 #406.9 + vpxord %zmm8, %zmm8, %zmm8 #335.30 + # LOE rax rbx r13 r14 edx zmm8 zmm24 zmm25 zmm26 zmm27 zmm28 k1 k2 +..B2.28: # Preds ..B2.32 ..B2.27 + # Execution count [5.00e+00] + movl %edx, %r8d #323.27 + movl %edx, %r12d #323.27 + sarl $1, %r8d #323.27 + andl $1, %r12d #323.27 + shll $2, %r12d #323.27 + xorl %r9d, %r9d #342.19 + movl 16(%r13), %ecx #326.44 + imull %edx, %ecx #326.44 + movq 160(%rbx), %r11 #324.27 + lea (%r8,%r8,2), %r15d #323.27 + vmovaps %zmm8, %zmm16 #335.30 + lea (%r12,%r15,8), %r8d #323.27 + movslq %r8d, %r8 #323.27 + vmovaps %zmm16, %zmm15 #336.30 + movslq %ecx, %rcx #326.19 + vmovaps %zmm15, %zmm14 #337.30 + vbroadcastsd 8(%r11,%r8,8), %ymm20 #329.33 + vbroadcastsd 24(%r11,%r8,8), %ymm18 #330.33 + vbroadcastsd 72(%r11,%r8,8), %ymm0 #331.33 + vbroadcastsd 88(%r11,%r8,8), %ymm2 #332.33 + vbroadcastsd 136(%r11,%r8,8), %ymm4 #333.33 + vbroadcastsd 152(%r11,%r8,8), %ymm6 #334.33 + vbroadcastsd 128(%r11,%r8,8), %zmm3 #333.33 + vbroadcastsd 64(%r11,%r8,8), %zmm17 #331.33 + vbroadcastsd (%r11,%r8,8), %zmm21 #329.33 + vbroadcastsd 16(%r11,%r8,8), %zmm19 #330.33 + vbroadcastsd 80(%r11,%r8,8), %zmm1 #332.33 + vbroadcastsd 144(%r11,%r8,8), %zmm5 #334.33 + vinsertf64x4 $1, %ymm20, %zmm21, %zmm21 #329.33 + vinsertf64x4 $1, %ymm18, %zmm19, %zmm20 #330.33 + vinsertf64x4 $1, %ymm0, %zmm17, %zmm19 #331.33 + vinsertf64x4 $1, %ymm2, %zmm1, %zmm18 #332.33 + vinsertf64x4 $1, %ymm4, %zmm3, %zmm17 #333.33 + vinsertf64x4 $1, %ymm6, %zmm5, %zmm23 #334.33 + movq 24(%r13), %rdi #327.25 + movq 8(%r13), %rsi #326.19 + vmovaps %zmm14, %zmm13 #338.30 + vmovaps %zmm13, %zmm12 #339.30 + movslq (%rdi,%rax,4), %r10 #327.25 + lea (%rsi,%rcx,4), %r15 #326.19 + vmovaps %zmm12, %zmm22 #340.30 + movq 176(%rbx), %r12 #325.27 + testq %r10, %r10 #342.28 + jle ..B2.32 # Prob 10% #342.28 + # LOE rax rbx r8 r9 r10 r11 r12 r13 r14 r15 edx zmm8 zmm12 zmm13 zmm14 zmm15 zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 zmm24 zmm25 zmm26 zmm27 zmm28 k1 k2 +..B2.29: # Preds ..B2.28 + # Execution count [4.50e+00] + movq %r13, 8(%rsp) #[spill] + movq %r14, (%rsp) #[spill] + # LOE rax rbx r8 r9 r10 r11 r12 r15 edx zmm8 zmm12 zmm13 zmm14 zmm15 zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 zmm24 zmm25 zmm26 zmm27 zmm28 k1 k2 +..B2.30: # Preds ..B2.30 ..B2.29 + # Execution count [2.50e+01] + movl (%r15,%r9,4), %edi #343.22 + incq %r9 #342.39 + lea (%rdi,%rdi,2), %r13d #344.31 + shll $3, %r13d #344.31 + lea (%rdi,%rdi), %r14d #365.56 + movslq %r13d, %r13 #345.31 + cmpl %edx, %r14d #365.66 + lea 1(%rdi,%rdi), %ecx #366.61 + movl $0, %edi #365.66 + sete %dil #365.66 + cmpl %edx, %ecx #366.66 + movl $0, %ecx #366.66 + vbroadcastf64x4 128(%r11,%r13,8), %zmm31 #350.36 + sete %cl #366.66 + vbroadcastf64x4 64(%r11,%r13,8), %zmm30 #349.36 + vbroadcastf64x4 (%r11,%r13,8), %zmm29 #348.36 + vsubpd %zmm31, %zmm17, %zmm10 #353.35 + vsubpd %zmm31, %zmm23, %zmm5 #356.35 + vsubpd %zmm30, %zmm19, %zmm9 #352.35 + vsubpd %zmm30, %zmm18, %zmm6 #355.35 + vsubpd %zmm29, %zmm21, %zmm11 #351.35 + vsubpd %zmm29, %zmm20, %zmm7 #354.35 + vmulpd %zmm10, %zmm10, %zmm0 #383.80 + 
vmulpd %zmm5, %zmm5, %zmm1 #384.80 + vfmadd231pd %zmm9, %zmm9, %zmm0 #383.57 + vfmadd231pd %zmm6, %zmm6, %zmm1 #384.57 + vfmadd231pd %zmm11, %zmm11, %zmm0 #383.34 + vfmadd231pd %zmm7, %zmm7, %zmm1 #384.34 + vrcp14pd %zmm0, %zmm4 #389.35 + vrcp14pd %zmm1, %zmm3 #390.35 + vcmppd $17, %zmm27, %zmm1, %k0 #387.67 + vcmppd $17, %zmm27, %zmm0, %k4 #386.67 + vmulpd %zmm26, %zmm4, %zmm2 #392.67 + vmulpd %zmm26, %zmm3, %zmm29 #393.67 + vmulpd %zmm2, %zmm4, %zmm30 #392.51 + vmulpd %zmm29, %zmm3, %zmm1 #393.51 + vmulpd %zmm30, %zmm4, %zmm2 #392.35 + vmulpd %zmm1, %zmm3, %zmm0 #393.35 + vfmsub213pd %zmm28, %zmm4, %zmm30 #395.79 + vfmsub213pd %zmm28, %zmm3, %zmm1 #396.79 + vmulpd %zmm25, %zmm4, %zmm4 #395.105 + vmulpd %zmm25, %zmm3, %zmm3 #396.105 + vmulpd %zmm4, %zmm30, %zmm31 #395.70 + vmulpd %zmm3, %zmm1, %zmm1 #396.70 + vmulpd %zmm31, %zmm2, %zmm2 #395.54 + vmulpd %zmm1, %zmm0, %zmm0 #396.54 + vmulpd %zmm2, %zmm24, %zmm4 #395.36 + vmulpd %zmm0, %zmm24, %zmm2 #396.36 + movl %ecx, %r13d #380.39 + lea (%rdi,%rdi), %esi #380.39 + shll $5, %r13d #380.39 + negl %esi #380.39 + subl %r13d, %esi #380.39 + movl %edi, %r13d #380.39 + movl %ecx, %r14d #380.39 + negl %r13d #380.39 + shll $4, %r14d #380.39 + shll $4, %esi #380.39 + subl %r14d, %r13d #380.39 + addl $4080, %esi #380.39 + addl $255, %r13d #380.39 + orl %r13d, %esi #380.39 + movl %ecx, %r14d #381.39 + kmovb %esi, %k3 #380.39 + kmovb %k3, %esi #380.39 + kmovb %esi, %k5 #386.41 + kmovw %k4, %esi #386.67 + kmovb %esi, %k6 #386.41 + lea (,%rdi,8), %esi #381.39 + shll $2, %edi #381.39 + negl %esi #381.39 + shll $7, %r14d #381.39 + negl %edi #381.39 + shll $6, %ecx #381.39 + subl %r14d, %esi #381.39 + shll $4, %esi #381.39 + subl %ecx, %edi #381.39 + addl $4080, %esi #381.39 + addl $255, %edi #381.39 + orl %edi, %esi #381.39 + kmovb %esi, %k4 #381.39 + kmovw %k0, %edi #387.67 + kmovb %k4, %ecx #381.39 + kandb %k6, %k5, %k7 #386.41 + kmovb %ecx, %k5 #387.41 + kmovb %edi, %k0 #387.41 + kandb %k0, %k5, %k6 #387.41 + kmovb %k7, %r13d #386.41 + kmovb %k6, %ecx #387.41 + kmovw %r13d, %k3 #398.20 + kmovw %ecx, %k7 #401.20 + vfmadd231pd %zmm11, %zmm4, %zmm16{%k3} #398.20 + vfmadd231pd %zmm9, %zmm4, %zmm15{%k3} #399.20 + vfmadd231pd %zmm10, %zmm4, %zmm14{%k3} #400.20 + vfmadd231pd %zmm7, %zmm2, %zmm13{%k7} #401.20 + vfmadd231pd %zmm6, %zmm2, %zmm12{%k7} #402.20 + vfmadd231pd %zmm5, %zmm2, %zmm22{%k7} #403.20 + cmpq %r10, %r9 #342.28 + jl ..B2.30 # Prob 82% #342.28 + # LOE rax rbx r8 r9 r10 r11 r12 r15 edx zmm8 zmm12 zmm13 zmm14 zmm15 zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 zmm24 zmm25 zmm26 zmm27 zmm28 k1 k2 +..B2.31: # Preds ..B2.30 + # Execution count [4.50e+00] + movq 8(%rsp), %r13 #[spill] + movq (%rsp), %r14 #[spill] + # LOE rax rbx r8 r10 r12 r13 r14 edx zmm8 zmm12 zmm13 zmm14 zmm15 zmm16 zmm22 zmm24 zmm25 zmm26 zmm27 zmm28 k1 k2 +..B2.32: # Preds ..B2.31 ..B2.28 + # Execution count [5.00e+00] + vpermpd $78, %zmm16, %zmm17 #406.9 + vxorpd %xmm7, %xmm7, %xmm7 #412.9 + vpermpd $78, %zmm15, %zmm23 #407.9 + vpermpd $78, %zmm14, %zmm2 #408.9 + vaddpd %zmm16, %zmm17, %zmm18 #406.9 + vpermpd $78, %zmm13, %zmm16 #406.9 + vaddpd %zmm15, %zmm23, %zmm30 #407.9 + vpermpd $78, %zmm12, %zmm29 #407.9 + vaddpd %zmm14, %zmm2, %zmm4 #408.9 + vpermpd $78, %zmm22, %zmm3 #408.9 + vaddpd %zmm16, %zmm13, %zmm18{%k2} #406.9 + vaddpd %zmm29, %zmm12, %zmm30{%k2} #407.9 + vaddpd %zmm3, %zmm22, %zmm4{%k2} #408.9 + vpermpd $177, %zmm18, %zmm19 #406.9 + vpermpd $177, %zmm30, %zmm31 #407.9 + vpermpd $177, %zmm4, %zmm22 #408.9 + vaddpd %zmm19, %zmm18, %zmm20 #406.9 + vaddpd %zmm31, 
%zmm30, %zmm0 #407.9 + vaddpd %zmm22, %zmm4, %zmm5 #408.9 + vshuff64x2 $238, %zmm20, %zmm20, %zmm20{%k1} #406.9 + vshuff64x2 $238, %zmm0, %zmm0, %zmm0{%k1} #407.9 + vshuff64x2 $238, %zmm5, %zmm5, %zmm5{%k1} #408.9 + incl %edx #318.49 + incq %rax #318.49 + vaddpd (%r12,%r8,8), %ymm20, %ymm21 #406.9 + vaddpd 64(%r12,%r8,8), %ymm0, %ymm1 #407.9 + vaddpd 128(%r12,%r8,8), %ymm5, %ymm6 #408.9 + vmovupd %ymm21, (%r12,%r8,8) #406.9 + vmovupd %ymm1, 64(%r12,%r8,8) #407.9 + vmovupd %ymm6, 128(%r12,%r8,8) #408.9 + addq %r10, 8(%r14) #411.9 + vcvtsi2sd %r10d, %xmm7, %xmm7 #412.9 + vcvttsd2si %xmm7, %rcx #412.9 + incq (%r14) #410.9 + addq %rcx, 16(%r14) #412.9 + cmpl 20(%rbx), %edx #318.26 + jl ..B2.28 # Prob 82% #318.26 + # LOE rax rbx r13 r14 edx zmm8 zmm24 zmm25 zmm26 zmm27 zmm28 k1 k2 +..B2.34: # Preds ..B2.32 ..B2.26 + # Execution count [1.00e+00] + movl $.L_2__STRING.1, %edi #416.5 + vzeroupper #416.5 +..___tag_value_computeForceLJ_2xnn_full.107: +# likwid_markerStopRegion(const char *) + call likwid_markerStopRegion #416.5 +..___tag_value_computeForceLJ_2xnn_full.108: + # LOE r12 +..B2.35: # Preds ..B2.34 + # Execution count [1.00e+00] + xorl %eax, %eax #419.16 +..___tag_value_computeForceLJ_2xnn_full.109: +# getTimeStamp() + call getTimeStamp #419.16 +..___tag_value_computeForceLJ_2xnn_full.110: + # LOE r12 xmm0 +..B2.42: # Preds ..B2.35 + # Execution count [1.00e+00] + vmovsd %xmm0, (%rsp) #419.16[spill] + # LOE r12 +..B2.36: # Preds ..B2.42 + # Execution count [1.00e+00] + movl $.L_2__STRING.4, %edi #420.5 + xorl %eax, %eax #420.5 +..___tag_value_computeForceLJ_2xnn_full.112: +# debug_printf(const char *, ...) + call debug_printf #420.5 +..___tag_value_computeForceLJ_2xnn_full.113: + # LOE r12 +..B2.37: # Preds ..B2.36 + # Execution count [1.00e+00] + vmovsd (%rsp), %xmm0 #421.14[spill] + vsubsd 192(%rsp), %xmm0, %xmm0 #421.14[spill] + addq $216, %rsp #421.14 + .cfi_restore 3 + popq %rbx #421.14 + .cfi_restore 15 + popq %r15 #421.14 + .cfi_restore 14 + popq %r14 #421.14 + .cfi_restore 13 + popq %r13 #421.14 + .cfi_restore 12 + popq %r12 #421.14 + movq %rbp, %rsp #421.14 + popq %rbp #421.14 + .cfi_def_cfa 7, 8 + .cfi_restore 6 + ret #421.14 + .cfi_def_cfa 6, 16 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22 + .cfi_offset 6, -16 + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + # LOE +..B2.38: # Preds ..B2.5 + # Execution count [4.50e-01]: Infreq + xorl %r10d, %r10d #304.9 + jmp ..B2.18 # Prob 100% #304.9 + .align 16,0x90 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r10d r11d xmm0 xmm1 ymm2 + .cfi_endproc +# mark_end; + .type computeForceLJ_2xnn_full,@function + .size computeForceLJ_2xnn_full,.-computeForceLJ_2xnn_full +..LNcomputeForceLJ_2xnn_full.1: + .data +# -- End computeForceLJ_2xnn_full + .text +.L_2__routine_start_computeForceLJ_2xnn_2: +# -- Begin computeForceLJ_2xnn + .text +# mark_begin; + .align 16,0x90 + .globl computeForceLJ_2xnn +# --- computeForceLJ_2xnn(Parameter *, Atom *, Neighbor *, Stats *) +computeForceLJ_2xnn: +# parameter 1: %rdi +# parameter 2: %rsi +# parameter 3: %rdx +# parameter 4: %rcx +..B3.1: 
# Preds ..B3.0 + # Execution count [1.00e+00] + .cfi_startproc +..___tag_value_computeForceLJ_2xnn.131: +..L132: + #424.92 + pushq %rbp #424.92 + .cfi_def_cfa_offset 16 + movq %rsp, %rbp #424.92 + .cfi_def_cfa 6, 16 + .cfi_offset 6, -16 + andq $-64, %rsp #424.92 + pushq %r13 #424.92 + pushq %r14 #424.92 + pushq %r15 #424.92 + pushq %rbx #424.92 + subq $224, %rsp #424.92 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + movq %rdx, %r13 #424.92 + movq %rcx, %r14 #424.92 + movq %rsi, %rbx #424.92 + movq %rdi, %r15 #424.92 + cmpl $0, 32(%r13) #425.8 + je ..B3.4 # Prob 50% #425.8 + # LOE rbx r12 r13 r14 r15 +..B3.2: # Preds ..B3.1 + # Execution count [5.00e-01] + movq %r15, %rdi #426.16 + movq %rbx, %rsi #426.16 + movq %r13, %rdx #426.16 + movq %r14, %rcx #426.16 + addq $224, %rsp #426.16 + .cfi_restore 3 + popq %rbx #426.16 + .cfi_restore 15 + popq %r15 #426.16 + .cfi_restore 14 + popq %r14 #426.16 + .cfi_restore 13 + popq %r13 #426.16 + movq %rbp, %rsp #426.16 + popq %rbp #426.16 + .cfi_def_cfa 7, 8 + .cfi_restore 6 +# computeForceLJ_2xnn_half(Parameter *, Atom *, Neighbor *, Stats *) + jmp computeForceLJ_2xnn_half #426.16 + .cfi_def_cfa 6, 16 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + .cfi_offset 6, -16 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + # LOE +..B3.4: # Preds ..B3.1 + # Execution count [5.00e-01] + movl $.L_2__STRING.3, %edi #429.12 + xorl %eax, %eax #429.12 +..___tag_value_computeForceLJ_2xnn.152: +# debug_printf(const char *, ...) 
+ call debug_printf #429.12 +..___tag_value_computeForceLJ_2xnn.153: + # LOE rbx r12 r13 r14 r15 +..B3.5: # Preds ..B3.4 + # Execution count [5.00e-01] + vmovsd 144(%r15), %xmm0 #429.12 + xorl %r8d, %r8d #429.12 + vmulsd %xmm0, %xmm0, %xmm1 #429.12 + xorl %r9d, %r9d #429.12 + vbroadcastsd 56(%r15), %zmm3 #429.12 + vbroadcastsd 40(%r15), %zmm4 #429.12 + vbroadcastsd %xmm1, %zmm2 #429.12 + vmovups %zmm3, (%rsp) #429.12[spill] + vmovups %zmm4, 64(%rsp) #429.12[spill] + vmovups %zmm2, 128(%rsp) #429.12[spill] + movl 20(%rbx), %edi #429.12 + testl %edi, %edi #429.12 + jle ..B3.27 # Prob 9% #429.12 + # LOE rbx r9 r12 r13 r14 edi r8d +..B3.6: # Preds ..B3.5 + # Execution count [4.50e-01] + movq 176(%rbx), %r10 #429.12 + movq 192(%rbx), %rax #429.12 + vxorpd %ymm2, %ymm2, %ymm2 #429.12 + vmovdqu .L_2il0floatpacket.0(%rip), %xmm1 #429.12 + vmovdqu .L_2il0floatpacket.1(%rip), %xmm0 #429.12 + movq %r12, 192(%rsp) #429.12[spill] + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x22 + # LOE rax rbx r9 r10 r13 r14 edi r8d xmm0 xmm1 ymm2 +..B3.7: # Preds ..B3.25 ..B3.6 + # Execution count [2.50e+00] + movl %r8d, %r11d #429.12 + movl %r8d, %r12d #429.12 + sarl $1, %r11d #429.12 + andl $1, %r12d #429.12 + shll $2, %r12d #429.12 + movl (%r9,%rax), %edx #429.12 + lea (%r11,%r11,2), %r15d #429.12 + lea (%r12,%r15,8), %r11d #429.12 + movslq %r11d, %r11 #429.12 + lea (%r10,%r11,8), %rcx #429.12 + testl %edx, %edx #429.12 + jle ..B3.25 # Prob 50% #429.12 + # LOE rax rcx rbx r9 r10 r13 r14 edx edi r8d xmm0 xmm1 ymm2 +..B3.8: # Preds ..B3.7 + # Execution count [2.25e+00] + cmpl $16, %edx #429.12 + jl ..B3.41 # Prob 10% #429.12 + # LOE rax rcx rbx r9 r10 r13 r14 edx edi r8d xmm0 xmm1 ymm2 +..B3.9: # Preds ..B3.8 + # Execution count [2.25e+00] + lea 128(%rcx), %r11 #429.12 + andq $63, %r11 #429.12 + testl $7, %r11d #429.12 + je ..B3.11 # Prob 50% #429.12 + # LOE rax rcx rbx r9 r10 r13 r14 edx edi r8d r11d xmm0 xmm1 ymm2 +..B3.10: # Preds ..B3.9 + # Execution count [1.12e+00] + xorl %r11d, %r11d #429.12 + jmp ..B3.13 # Prob 100% #429.12 + # LOE rax rcx rbx r9 r10 r13 r14 edx edi r8d r11d xmm0 xmm1 ymm2 +..B3.11: # Preds ..B3.9 + # Execution count [1.12e+00] + testl %r11d, %r11d #429.12 + je ..B3.13 # Prob 50% #429.12 + # LOE rax rcx rbx r9 r10 r13 r14 edx edi r8d r11d xmm0 xmm1 ymm2 +..B3.12: # Preds ..B3.11 + # Execution count [1.25e+01] + negl %r11d #429.12 + addl $64, %r11d #429.12 + shrl $3, %r11d #429.12 + cmpl %r11d, %edx #429.12 + cmovl %edx, %r11d #429.12 + # LOE rax rcx rbx r9 r10 r13 r14 edx edi r8d r11d xmm0 xmm1 ymm2 +..B3.13: # Preds ..B3.10 ..B3.12 ..B3.11 + # Execution count [2.50e+00] + movl %edx, %r15d #429.12 + subl %r11d, %r15d #429.12 + andl $15, %r15d #429.12 + negl %r15d #429.12 + addl %edx, %r15d #429.12 + cmpl $1, %r11d #429.12 + jb ..B3.17 # Prob 50% #429.12 + # LOE rax rcx rbx r9 r10 r13 r14 edx edi r8d r11d r15d xmm0 xmm1 ymm2 +..B3.14: # Preds ..B3.13 + # Execution count [2.25e+00] + vpbroadcastd %r11d, %xmm3 #429.12 + xorl %esi, %esi #429.12 + vmovdqa %xmm0, %xmm4 #429.12 + movslq %r11d, %r12 #429.12 + # LOE rax rcx rbx rsi r9 r10 r12 r13 r14 edx edi r8d r11d r15d xmm0 xmm1 xmm3 xmm4 ymm2 +..B3.15: # Preds ..B3.15 ..B3.14 + # Execution count [1.25e+01] + vpcmpgtd %xmm4, %xmm3, %k1 #429.12 + vpaddd %xmm1, %xmm4, %xmm4 #429.12 + vmovupd %ymm2, (%rcx,%rsi,8){%k1} #429.12 + vmovupd %ymm2, 64(%rcx,%rsi,8){%k1} #429.12 + vmovupd %ymm2, 128(%rcx,%rsi,8){%k1} #429.12 + addq $4, %rsi #429.12 + cmpq %r12, %rsi #429.12 + jb ..B3.15 # Prob 
82% #429.12 + # LOE rax rcx rbx rsi r9 r10 r12 r13 r14 edx edi r8d r11d r15d xmm0 xmm1 xmm3 xmm4 ymm2 +..B3.16: # Preds ..B3.15 + # Execution count [2.25e+00] + cmpl %r11d, %edx #429.12 + je ..B3.25 # Prob 10% #429.12 + # LOE rax rcx rbx r9 r10 r13 r14 edx edi r8d r11d r15d xmm0 xmm1 ymm2 +..B3.17: # Preds ..B3.13 ..B3.16 + # Execution count [1.25e+01] + lea 16(%r11), %r12d #429.12 + cmpl %r12d, %r15d #429.12 + jl ..B3.21 # Prob 50% #429.12 + # LOE rax rcx rbx r9 r10 r13 r14 edx edi r8d r11d r15d xmm0 xmm1 ymm2 +..B3.18: # Preds ..B3.17 + # Execution count [2.25e+00] + movslq %r11d, %r11 #429.12 + movslq %r15d, %r12 #429.12 + .align 16,0x90 + # LOE rax rcx rbx r9 r10 r11 r12 r13 r14 edx edi r8d r15d xmm0 xmm1 ymm2 +..B3.19: # Preds ..B3.19 ..B3.18 + # Execution count [1.25e+01] + vmovupd %ymm2, (%rcx,%r11,8) #429.12 + vmovupd %ymm2, 32(%rcx,%r11,8) #429.12 + vmovupd %ymm2, 64(%rcx,%r11,8) #429.12 + vmovupd %ymm2, 128(%rcx,%r11,8) #429.12 + vmovupd %ymm2, 192(%rcx,%r11,8) #429.12 + vmovupd %ymm2, 96(%rcx,%r11,8) #429.12 + vmovupd %ymm2, 160(%rcx,%r11,8) #429.12 + vmovupd %ymm2, 224(%rcx,%r11,8) #429.12 + addq $16, %r11 #429.12 + cmpq %r12, %r11 #429.12 + jb ..B3.19 # Prob 82% #429.12 + # LOE rax rcx rbx r9 r10 r11 r12 r13 r14 edx edi r8d r15d xmm0 xmm1 ymm2 +..B3.21: # Preds ..B3.19 ..B3.17 ..B3.41 + # Execution count [2.50e+00] + lea 1(%r15), %r11d #429.12 + cmpl %edx, %r11d #429.12 + ja ..B3.25 # Prob 50% #429.12 + # LOE rax rcx rbx r9 r10 r13 r14 edx edi r8d r15d xmm0 xmm1 ymm2 +..B3.22: # Preds ..B3.21 + # Execution count [2.25e+00] + movslq %r15d, %r12 #429.12 + negl %r15d #429.12 + addl %edx, %r15d #429.12 + xorl %r11d, %r11d #429.12 + movslq %edx, %rdx #429.12 + vmovdqa %xmm0, %xmm4 #429.12 + vpbroadcastd %r15d, %xmm3 #429.12 + subq %r12, %rdx #429.12 + lea (%rcx,%r12,8), %rcx #429.12 + # LOE rax rdx rcx rbx r9 r10 r11 r13 r14 edi r8d xmm0 xmm1 xmm3 xmm4 ymm2 +..B3.23: # Preds ..B3.23 ..B3.22 + # Execution count [1.25e+01] + vpcmpgtd %xmm4, %xmm3, %k1 #429.12 + vpaddd %xmm1, %xmm4, %xmm4 #429.12 + vmovupd %ymm2, (%rcx,%r11,8){%k1} #429.12 + vmovupd %ymm2, 64(%rcx,%r11,8){%k1} #429.12 + vmovupd %ymm2, 128(%rcx,%r11,8){%k1} #429.12 + addq $4, %r11 #429.12 + cmpq %rdx, %r11 #429.12 + jb ..B3.23 # Prob 82% #429.12 + # LOE rax rdx rcx rbx r9 r10 r11 r13 r14 edi r8d xmm0 xmm1 xmm3 xmm4 ymm2 +..B3.25: # Preds ..B3.23 ..B3.7 ..B3.16 ..B3.21 + # Execution count [2.50e+00] + incl %r8d #429.12 + addq $56, %r9 #429.12 + cmpl %edi, %r8d #429.12 + jb ..B3.7 # Prob 82% #429.12 + # LOE rax rbx r9 r10 r13 r14 edi r8d xmm0 xmm1 ymm2 +..B3.26: # Preds ..B3.25 + # Execution count [4.50e-01] + movq 192(%rsp), %r12 #[spill] + .cfi_restore 12 + # LOE rbx r12 r13 r14 +..B3.27: # Preds ..B3.5 ..B3.26 + # Execution count [5.00e-01] + xorl %eax, %eax #429.12 + vzeroupper #429.12 +..___tag_value_computeForceLJ_2xnn.160: +# getTimeStamp() + call getTimeStamp #429.12 +..___tag_value_computeForceLJ_2xnn.161: + # LOE rbx r12 r13 r14 xmm0 +..B3.45: # Preds ..B3.27 + # Execution count [5.00e-01] + vmovsd %xmm0, 200(%rsp) #429.12[spill] + # LOE rbx r12 r13 r14 +..B3.28: # Preds ..B3.45 + # Execution count [5.00e-01] + movl $.L_2__STRING.1, %edi #429.12 +..___tag_value_computeForceLJ_2xnn.163: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #429.12 +..___tag_value_computeForceLJ_2xnn.164: + # LOE rbx r12 r13 r14 +..B3.29: # Preds ..B3.28 + # Execution count [5.00e-01] + xorl %edi, %edi #429.12 + xorl %eax, %eax #429.12 + cmpl $0, 20(%rbx) #429.12 + jle ..B3.37 # Prob 10% #429.12 + # LOE rax 
rbx r12 r13 r14 edi +..B3.30: # Preds ..B3.29 + # Execution count [4.50e-01] + movl $65484, %edx #429.12 + kmovw %edx, %k2 #429.12 + movl $65450, %edx #429.12 + kmovw %edx, %k1 #429.12 + vmovups 64(%rsp), %zmm25 #429.12[spill] + vmovups (%rsp), %zmm26 #429.12[spill] + vmovups 128(%rsp), %zmm27 #429.12[spill] + vbroadcastsd .L_2il0floatpacket.2(%rip), %zmm28 #429.12 + vbroadcastsd .L_2il0floatpacket.3(%rip), %zmm24 #429.12 + movq %r12, 192(%rsp) #429.12[spill] + vpxord %zmm8, %zmm8, %zmm8 #429.12 + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x22 + # LOE rax rbx r13 r14 edi zmm8 zmm24 zmm25 zmm26 zmm27 zmm28 k1 k2 +..B3.31: # Preds ..B3.35 ..B3.30 + # Execution count [2.50e+00] + movl %edi, %r9d #429.12 + movl %edi, %r15d #429.12 + sarl $1, %r9d #429.12 + andl $1, %r15d #429.12 + shll $2, %r15d #429.12 + movl 16(%r13), %r10d #429.12 + imull %edi, %r10d #429.12 + movq 160(%rbx), %r12 #429.12 + lea (%r9,%r9,2), %r8d #429.12 + vmovaps %zmm8, %zmm16 #429.12 + lea (%r15,%r8,8), %r9d #429.12 + movslq %r9d, %r9 #429.12 + vmovaps %zmm16, %zmm15 #429.12 + movslq %r10d, %r10 #429.12 + vmovaps %zmm15, %zmm14 #429.12 + vbroadcastsd 8(%r12,%r9,8), %ymm20 #429.12 + vbroadcastsd 24(%r12,%r9,8), %ymm18 #429.12 + vbroadcastsd 72(%r12,%r9,8), %ymm0 #429.12 + vbroadcastsd 88(%r12,%r9,8), %ymm2 #429.12 + vbroadcastsd 136(%r12,%r9,8), %ymm4 #429.12 + vbroadcastsd 152(%r12,%r9,8), %ymm6 #429.12 + vbroadcastsd 128(%r12,%r9,8), %zmm3 #429.12 + vbroadcastsd 64(%r12,%r9,8), %zmm17 #429.12 + vbroadcastsd (%r12,%r9,8), %zmm21 #429.12 + vbroadcastsd 16(%r12,%r9,8), %zmm19 #429.12 + vbroadcastsd 80(%r12,%r9,8), %zmm1 #429.12 + vbroadcastsd 144(%r12,%r9,8), %zmm5 #429.12 + vinsertf64x4 $1, %ymm20, %zmm21, %zmm21 #429.12 + vinsertf64x4 $1, %ymm18, %zmm19, %zmm20 #429.12 + vinsertf64x4 $1, %ymm0, %zmm17, %zmm19 #429.12 + vinsertf64x4 $1, %ymm2, %zmm1, %zmm18 #429.12 + vinsertf64x4 $1, %ymm4, %zmm3, %zmm17 #429.12 + vinsertf64x4 $1, %ymm6, %zmm5, %zmm23 #429.12 + movq 8(%r13), %r11 #429.12 + movq 24(%r13), %r8 #429.12 + vmovaps %zmm14, %zmm13 #429.12 + vmovaps %zmm13, %zmm12 #429.12 + lea (%r11,%r10,4), %rsi #429.12 + movslq (%r8,%rax,4), %r11 #429.12 + xorl %r10d, %r10d #429.12 + vmovaps %zmm12, %zmm22 #429.12 + movq 176(%rbx), %r15 #429.12 + testq %r11, %r11 #429.12 + jle ..B3.35 # Prob 10% #429.12 + # LOE rax rbx rsi r9 r10 r11 r12 r13 r14 r15 edi zmm8 zmm12 zmm13 zmm14 zmm15 zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 zmm24 zmm25 zmm26 zmm27 zmm28 k1 k2 +..B3.32: # Preds ..B3.31 + # Execution count [2.25e+00] + movq %r13, 8(%rsp) #[spill] + movq %r14, (%rsp) #[spill] + # LOE rax rbx rsi r9 r10 r11 r12 r15 edi zmm8 zmm12 zmm13 zmm14 zmm15 zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 zmm24 zmm25 zmm26 zmm27 zmm28 k1 k2 +..B3.33: # Preds ..B3.33 ..B3.32 + # Execution count [1.25e+01] + movl (%rsi,%r10,4), %r8d #429.12 + incq %r10 #429.12 + lea (%r8,%r8,2), %r13d #429.12 + shll $3, %r13d #429.12 + lea (%r8,%r8), %r14d #429.12 + movslq %r13d, %r13 #429.12 + cmpl %edi, %r14d #429.12 + lea 1(%r8,%r8), %edx #429.12 + movl $0, %r8d #429.12 + sete %r8b #429.12 + cmpl %edi, %edx #429.12 + movl $0, %edx #429.12 + vbroadcastf64x4 128(%r12,%r13,8), %zmm31 #429.12 + sete %dl #429.12 + vbroadcastf64x4 64(%r12,%r13,8), %zmm30 #429.12 + vbroadcastf64x4 (%r12,%r13,8), %zmm29 #429.12 + vsubpd %zmm31, %zmm17, %zmm10 #429.12 + vsubpd %zmm31, %zmm23, %zmm5 #429.12 + vsubpd %zmm30, %zmm19, %zmm9 #429.12 + vsubpd %zmm30, %zmm18, %zmm6 #429.12 + vsubpd %zmm29, %zmm21, %zmm11 
#429.12 + vsubpd %zmm29, %zmm20, %zmm7 #429.12 + vmulpd %zmm10, %zmm10, %zmm0 #429.12 + vmulpd %zmm5, %zmm5, %zmm1 #429.12 + vfmadd231pd %zmm9, %zmm9, %zmm0 #429.12 + vfmadd231pd %zmm6, %zmm6, %zmm1 #429.12 + vfmadd231pd %zmm11, %zmm11, %zmm0 #429.12 + vfmadd231pd %zmm7, %zmm7, %zmm1 #429.12 + vrcp14pd %zmm0, %zmm4 #429.12 + vrcp14pd %zmm1, %zmm3 #429.12 + vcmppd $17, %zmm27, %zmm1, %k0 #429.12 + vcmppd $17, %zmm27, %zmm0, %k4 #429.12 + vmulpd %zmm4, %zmm26, %zmm2 #429.12 + vmulpd %zmm3, %zmm26, %zmm29 #429.12 + vmulpd %zmm2, %zmm4, %zmm30 #429.12 + vmulpd %zmm29, %zmm3, %zmm1 #429.12 + vmulpd %zmm30, %zmm4, %zmm2 #429.12 + vmulpd %zmm1, %zmm3, %zmm0 #429.12 + vfmsub213pd %zmm28, %zmm4, %zmm30 #429.12 + vfmsub213pd %zmm28, %zmm3, %zmm1 #429.12 + vmulpd %zmm4, %zmm25, %zmm4 #429.12 + vmulpd %zmm3, %zmm25, %zmm3 #429.12 + vmulpd %zmm4, %zmm30, %zmm31 #429.12 + vmulpd %zmm3, %zmm1, %zmm1 #429.12 + vmulpd %zmm31, %zmm2, %zmm2 #429.12 + vmulpd %zmm1, %zmm0, %zmm0 #429.12 + vmulpd %zmm2, %zmm24, %zmm4 #429.12 + vmulpd %zmm0, %zmm24, %zmm2 #429.12 + movl %edx, %r13d #429.12 + lea (%r8,%r8), %ecx #429.12 + shll $5, %r13d #429.12 + negl %ecx #429.12 + subl %r13d, %ecx #429.12 + movl %r8d, %r13d #429.12 + movl %edx, %r14d #429.12 + negl %r13d #429.12 + shll $4, %r14d #429.12 + shll $4, %ecx #429.12 + subl %r14d, %r13d #429.12 + addl $4080, %ecx #429.12 + addl $255, %r13d #429.12 + orl %r13d, %ecx #429.12 + movl %edx, %r14d #429.12 + kmovb %ecx, %k3 #429.12 + kmovb %k3, %ecx #429.12 + kmovb %ecx, %k5 #429.12 + kmovw %k4, %ecx #429.12 + kmovb %ecx, %k6 #429.12 + lea (,%r8,8), %ecx #429.12 + shll $2, %r8d #429.12 + negl %ecx #429.12 + shll $7, %r14d #429.12 + negl %r8d #429.12 + shll $6, %edx #429.12 + subl %r14d, %ecx #429.12 + shll $4, %ecx #429.12 + subl %edx, %r8d #429.12 + addl $4080, %ecx #429.12 + addl $255, %r8d #429.12 + orl %r8d, %ecx #429.12 + kmovb %ecx, %k4 #429.12 + kmovw %k0, %r8d #429.12 + kmovb %k4, %edx #429.12 + kandb %k6, %k5, %k7 #429.12 + kmovb %edx, %k5 #429.12 + kmovb %r8d, %k0 #429.12 + kandb %k0, %k5, %k6 #429.12 + kmovb %k7, %r13d #429.12 + kmovb %k6, %edx #429.12 + kmovw %r13d, %k3 #429.12 + kmovw %edx, %k7 #429.12 + vfmadd231pd %zmm11, %zmm4, %zmm16{%k3} #429.12 + vfmadd231pd %zmm9, %zmm4, %zmm15{%k3} #429.12 + vfmadd231pd %zmm10, %zmm4, %zmm14{%k3} #429.12 + vfmadd231pd %zmm7, %zmm2, %zmm13{%k7} #429.12 + vfmadd231pd %zmm6, %zmm2, %zmm12{%k7} #429.12 + vfmadd231pd %zmm5, %zmm2, %zmm22{%k7} #429.12 + cmpq %r11, %r10 #429.12 + jl ..B3.33 # Prob 82% #429.12 + # LOE rax rbx rsi r9 r10 r11 r12 r15 edi zmm8 zmm12 zmm13 zmm14 zmm15 zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 zmm24 zmm25 zmm26 zmm27 zmm28 k1 k2 +..B3.34: # Preds ..B3.33 + # Execution count [2.25e+00] + movq 8(%rsp), %r13 #[spill] + movq (%rsp), %r14 #[spill] + # LOE rax rbx r9 r11 r13 r14 r15 edi zmm8 zmm12 zmm13 zmm14 zmm15 zmm16 zmm22 zmm24 zmm25 zmm26 zmm27 zmm28 k1 k2 +..B3.35: # Preds ..B3.34 ..B3.31 + # Execution count [2.50e+00] + vpermpd $78, %zmm16, %zmm17 #429.12 + vxorpd %xmm7, %xmm7, %xmm7 #429.12 + vpermpd $78, %zmm15, %zmm23 #429.12 + vpermpd $78, %zmm14, %zmm2 #429.12 + vaddpd %zmm16, %zmm17, %zmm18 #429.12 + vpermpd $78, %zmm13, %zmm16 #429.12 + vaddpd %zmm15, %zmm23, %zmm30 #429.12 + vpermpd $78, %zmm12, %zmm29 #429.12 + vaddpd %zmm14, %zmm2, %zmm4 #429.12 + vpermpd $78, %zmm22, %zmm3 #429.12 + vaddpd %zmm16, %zmm13, %zmm18{%k2} #429.12 + vaddpd %zmm29, %zmm12, %zmm30{%k2} #429.12 + vaddpd %zmm3, %zmm22, %zmm4{%k2} #429.12 + vpermpd $177, %zmm18, %zmm19 #429.12 + vpermpd $177, %zmm30, %zmm31 
#429.12 + vpermpd $177, %zmm4, %zmm22 #429.12 + vaddpd %zmm19, %zmm18, %zmm20 #429.12 + vaddpd %zmm31, %zmm30, %zmm0 #429.12 + vaddpd %zmm22, %zmm4, %zmm5 #429.12 + vshuff64x2 $238, %zmm20, %zmm20, %zmm20{%k1} #429.12 + vshuff64x2 $238, %zmm0, %zmm0, %zmm0{%k1} #429.12 + vshuff64x2 $238, %zmm5, %zmm5, %zmm5{%k1} #429.12 + incl %edi #429.12 + incq %rax #429.12 + vaddpd (%r15,%r9,8), %ymm20, %ymm21 #429.12 + vaddpd 64(%r15,%r9,8), %ymm0, %ymm1 #429.12 + vaddpd 128(%r15,%r9,8), %ymm5, %ymm6 #429.12 + vmovupd %ymm21, (%r15,%r9,8) #429.12 + vmovupd %ymm1, 64(%r15,%r9,8) #429.12 + vmovupd %ymm6, 128(%r15,%r9,8) #429.12 + addq %r11, 8(%r14) #429.12 + vcvtsi2sd %r11d, %xmm7, %xmm7 #429.12 + vcvttsd2si %xmm7, %r8 #429.12 + incq (%r14) #429.12 + addq %r8, 16(%r14) #429.12 + cmpl 20(%rbx), %edi #429.12 + jl ..B3.31 # Prob 82% #429.12 + # LOE rax rbx r13 r14 edi zmm8 zmm24 zmm25 zmm26 zmm27 zmm28 k1 k2 +..B3.36: # Preds ..B3.35 + # Execution count [4.50e-01] + movq 192(%rsp), %r12 #[spill] + .cfi_restore 12 + # LOE r12 +..B3.37: # Preds ..B3.36 ..B3.29 + # Execution count [5.00e-01] + movl $.L_2__STRING.1, %edi #429.12 + vzeroupper #429.12 +..___tag_value_computeForceLJ_2xnn.175: +# likwid_markerStopRegion(const char *) + call likwid_markerStopRegion #429.12 +..___tag_value_computeForceLJ_2xnn.176: + # LOE r12 +..B3.38: # Preds ..B3.37 + # Execution count [5.00e-01] + xorl %eax, %eax #429.12 +..___tag_value_computeForceLJ_2xnn.177: +# getTimeStamp() + call getTimeStamp #429.12 +..___tag_value_computeForceLJ_2xnn.178: + # LOE r12 xmm0 +..B3.46: # Preds ..B3.38 + # Execution count [5.00e-01] + vmovsd %xmm0, (%rsp) #429.12[spill] + # LOE r12 +..B3.39: # Preds ..B3.46 + # Execution count [5.00e-01] + movl $.L_2__STRING.4, %edi #429.12 + xorl %eax, %eax #429.12 +..___tag_value_computeForceLJ_2xnn.180: +# debug_printf(const char *, ...) 
+ call debug_printf #429.12 +..___tag_value_computeForceLJ_2xnn.181: + # LOE r12 +..B3.40: # Preds ..B3.39 + # Execution count [5.00e-01] + vmovsd (%rsp), %xmm0 #429.12[spill] + vsubsd 200(%rsp), %xmm0, %xmm0 #429.12[spill] + addq $224, %rsp #429.12 + .cfi_restore 3 + popq %rbx #429.12 + .cfi_restore 15 + popq %r15 #429.12 + .cfi_restore 14 + popq %r14 #429.12 + .cfi_restore 13 + popq %r13 #429.12 + movq %rbp, %rsp #429.12 + popq %rbp #429.12 + .cfi_def_cfa 7, 8 + .cfi_restore 6 + ret #429.12 + .cfi_def_cfa 6, 16 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + .cfi_offset 6, -16 + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + # LOE +..B3.41: # Preds ..B3.8 + # Execution count [2.25e-01]: Infreq + xorl %r15d, %r15d #429.12 + jmp ..B3.21 # Prob 100% #429.12 + .align 16,0x90 + # LOE rax rcx rbx r9 r10 r13 r14 edx edi r8d r15d xmm0 xmm1 ymm2 + .cfi_endproc +# mark_end; + .type computeForceLJ_2xnn,@function + .size computeForceLJ_2xnn,.-computeForceLJ_2xnn +..LNcomputeForceLJ_2xnn.2: + .data +# -- End computeForceLJ_2xnn + .text +.L_2__routine_start_computeForceLJ_2xnn_half_3: +# -- Begin computeForceLJ_2xnn_half + .text +# mark_begin; + .align 16,0x90 + .globl computeForceLJ_2xnn_half +# --- computeForceLJ_2xnn_half(Parameter *, Atom *, Neighbor *, Stats *) +computeForceLJ_2xnn_half: +# parameter 1: %rdi +# parameter 2: %rsi +# parameter 3: %rdx +# parameter 4: %rcx +..B4.1: # Preds ..B4.0 + # Execution count [1.00e+00] + .cfi_startproc +..___tag_value_computeForceLJ_2xnn_half.198: +..L199: + #135.97 + pushq %rbp #135.97 + .cfi_def_cfa_offset 16 + movq %rsp, %rbp #135.97 + .cfi_def_cfa 6, 16 + .cfi_offset 6, -16 + andq $-64, %rsp #135.97 + pushq %r12 #135.97 + pushq %r13 #135.97 + pushq %r14 #135.97 + pushq %r15 #135.97 + pushq %rbx #135.97 + subq $280, %rsp #135.97 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + movq %rdi, %r15 #135.97 + movl $.L_2__STRING.3, %edi #136.5 + xorl %eax, %eax #136.5 + movq %rcx, %r13 #135.97 + movq %rdx, %r14 #135.97 + movq %rsi, %rbx #135.97 +..___tag_value_computeForceLJ_2xnn_half.208: +# debug_printf(const char *, ...) 
+ call debug_printf #136.5 +..___tag_value_computeForceLJ_2xnn_half.209: + # LOE rbx r12 r13 r14 r15 +..B4.2: # Preds ..B4.1 + # Execution count [1.00e+00] + vmovsd 144(%r15), %xmm0 #139.27 + xorl %ecx, %ecx #149.5 + vmulsd %xmm0, %xmm0, %xmm1 #142.36 + xorl %esi, %esi #151.27 + vbroadcastsd 56(%r15), %zmm3 #143.32 + vbroadcastsd 40(%r15), %zmm4 #144.29 + vbroadcastsd %xmm1, %zmm2 #142.36 + vbroadcastsd .L_2il0floatpacket.2(%rip), %zmm5 #146.29 + vmovups %zmm3, 128(%rsp) #143.32[spill] + vmovups %zmm2, 64(%rsp) #142.36[spill] + vmovups %zmm4, (%rsp) #144.29[spill] + vmovups %zmm5, 192(%rsp) #146.29[spill] + movl 20(%rbx), %edx #149.26 + testl %edx, %edx #149.26 + jle ..B4.24 # Prob 9% #149.26 + # LOE rbx rsi r12 r13 r14 edx ecx +..B4.3: # Preds ..B4.2 + # Execution count [9.00e-01] + movq 176(%rbx), %rdi #151.27 + movq 192(%rbx), %rax #152.32 + vxorpd %ymm2, %ymm2, %ymm2 #153.39 + vmovdqu .L_2il0floatpacket.0(%rip), %xmm1 #152.9 + vmovdqu .L_2il0floatpacket.1(%rip), %xmm0 #152.9 + # LOE rax rbx rsi rdi r13 r14 edx ecx xmm0 xmm1 ymm2 +..B4.4: # Preds ..B4.22 ..B4.3 + # Execution count [5.00e+00] + movl %ecx, %r8d #150.27 + movl %ecx, %r9d #150.27 + sarl $1, %r8d #150.27 + andl $1, %r9d #150.27 + shll $2, %r9d #150.27 + lea (%r8,%r8,2), %r10d #150.27 + lea (%r9,%r10,8), %r11d #150.27 + movslq %r11d, %r11 #151.27 + lea (%rdi,%r11,8), %r12 #151.27 + movl (%rsi,%rax), %r11d #152.32 + testl %r11d, %r11d #152.32 + jle ..B4.22 # Prob 50% #152.32 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r11d xmm0 xmm1 ymm2 +..B4.5: # Preds ..B4.4 + # Execution count [4.50e+00] + cmpl $16, %r11d #152.9 + jl ..B4.38 # Prob 10% #152.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r11d xmm0 xmm1 ymm2 +..B4.6: # Preds ..B4.5 + # Execution count [4.50e+00] + lea 128(%r12), %r8 #155.13 + andq $63, %r8 #152.9 + testl $7, %r8d #152.9 + je ..B4.8 # Prob 50% #152.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r11d xmm0 xmm1 ymm2 +..B4.7: # Preds ..B4.6 + # Execution count [2.25e+00] + xorl %r8d, %r8d #152.9 + jmp ..B4.10 # Prob 100% #152.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r11d xmm0 xmm1 ymm2 +..B4.8: # Preds ..B4.6 + # Execution count [2.25e+00] + testl %r8d, %r8d #152.9 + je ..B4.10 # Prob 50% #152.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r11d xmm0 xmm1 ymm2 +..B4.9: # Preds ..B4.8 + # Execution count [2.50e+01] + negl %r8d #152.9 + addl $64, %r8d #152.9 + shrl $3, %r8d #152.9 + cmpl %r8d, %r11d #152.9 + cmovl %r11d, %r8d #152.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r11d xmm0 xmm1 ymm2 +..B4.10: # Preds ..B4.7 ..B4.9 ..B4.8 + # Execution count [5.00e+00] + movl %r11d, %r10d #152.9 + subl %r8d, %r10d #152.9 + andl $15, %r10d #152.9 + negl %r10d #152.9 + addl %r11d, %r10d #152.9 + cmpl $1, %r8d #152.9 + jb ..B4.14 # Prob 50% #152.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r10d r11d xmm0 xmm1 ymm2 +..B4.11: # Preds ..B4.10 + # Execution count [4.50e+00] + vpbroadcastd %r8d, %xmm3 #152.9 + xorl %r15d, %r15d #152.9 + vmovdqa %xmm0, %xmm4 #152.9 + movslq %r8d, %r9 #152.9 + # LOE rax rbx rsi rdi r9 r12 r13 r14 r15 edx ecx r8d r10d r11d xmm0 xmm1 xmm3 xmm4 ymm2 +..B4.12: # Preds ..B4.12 ..B4.11 + # Execution count [2.50e+01] + vpcmpgtd %xmm4, %xmm3, %k1 #152.9 + vpaddd %xmm1, %xmm4, %xmm4 #152.9 + vmovupd %ymm2, (%r12,%r15,8){%k1} #153.13 + vmovupd %ymm2, 64(%r12,%r15,8){%k1} #154.13 + vmovupd %ymm2, 128(%r12,%r15,8){%k1} #155.13 + addq $4, %r15 #152.9 + cmpq %r9, %r15 #152.9 + jb ..B4.12 # Prob 82% #152.9 + # LOE rax rbx rsi rdi r9 r12 r13 r14 r15 edx ecx r8d r10d r11d xmm0 xmm1 xmm3 
xmm4 ymm2 +..B4.13: # Preds ..B4.12 + # Execution count [4.50e+00] + cmpl %r8d, %r11d #152.9 + je ..B4.22 # Prob 10% #152.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r10d r11d xmm0 xmm1 ymm2 +..B4.14: # Preds ..B4.10 ..B4.13 + # Execution count [2.50e+01] + lea 16(%r8), %r9d #152.9 + cmpl %r9d, %r10d #152.9 + jl ..B4.18 # Prob 50% #152.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r10d r11d xmm0 xmm1 ymm2 +..B4.15: # Preds ..B4.14 + # Execution count [4.50e+00] + movslq %r8d, %r8 #152.9 + movslq %r10d, %r9 #152.9 + .align 16,0x90 + # LOE rax rbx rsi rdi r8 r9 r12 r13 r14 edx ecx r10d r11d xmm0 xmm1 ymm2 +..B4.16: # Preds ..B4.16 ..B4.15 + # Execution count [2.50e+01] + vmovupd %ymm2, (%r12,%r8,8) #153.13 + vmovupd %ymm2, 32(%r12,%r8,8) #153.13 + vmovupd %ymm2, 64(%r12,%r8,8) #153.13 + vmovupd %ymm2, 128(%r12,%r8,8) #154.13 + vmovupd %ymm2, 192(%r12,%r8,8) #155.13 + vmovupd %ymm2, 96(%r12,%r8,8) #153.13 + vmovupd %ymm2, 160(%r12,%r8,8) #154.13 + vmovupd %ymm2, 224(%r12,%r8,8) #155.13 + addq $16, %r8 #152.9 + cmpq %r9, %r8 #152.9 + jb ..B4.16 # Prob 82% #152.9 + # LOE rax rbx rsi rdi r8 r9 r12 r13 r14 edx ecx r10d r11d xmm0 xmm1 ymm2 +..B4.18: # Preds ..B4.16 ..B4.14 ..B4.38 + # Execution count [5.00e+00] + lea 1(%r10), %r8d #152.9 + cmpl %r11d, %r8d #152.9 + ja ..B4.22 # Prob 50% #152.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r10d r11d xmm0 xmm1 ymm2 +..B4.19: # Preds ..B4.18 + # Execution count [4.50e+00] + movslq %r10d, %r9 #153.13 + negl %r10d #152.9 + addl %r11d, %r10d #152.9 + xorl %r8d, %r8d #152.9 + movslq %r11d, %r11 #152.9 + vmovdqa %xmm0, %xmm4 #152.9 + vpbroadcastd %r10d, %xmm3 #152.9 + subq %r9, %r11 #152.9 + lea (%r12,%r9,8), %r12 #153.13 + # LOE rax rbx rsi rdi r8 r11 r12 r13 r14 edx ecx xmm0 xmm1 xmm3 xmm4 ymm2 +..B4.20: # Preds ..B4.20 ..B4.19 + # Execution count [2.50e+01] + vpcmpgtd %xmm4, %xmm3, %k1 #152.9 + vpaddd %xmm1, %xmm4, %xmm4 #152.9 + vmovupd %ymm2, (%r12,%r8,8){%k1} #153.13 + vmovupd %ymm2, 64(%r12,%r8,8){%k1} #154.13 + vmovupd %ymm2, 128(%r12,%r8,8){%k1} #155.13 + addq $4, %r8 #152.9 + cmpq %r11, %r8 #152.9 + jb ..B4.20 # Prob 82% #152.9 + # LOE rax rbx rsi rdi r8 r11 r12 r13 r14 edx ecx xmm0 xmm1 xmm3 xmm4 ymm2 +..B4.22: # Preds ..B4.20 ..B4.4 ..B4.13 ..B4.18 + # Execution count [5.00e+00] + incl %ecx #149.5 + addq $56, %rsi #149.5 + cmpl %edx, %ecx #149.5 + jb ..B4.4 # Prob 82% #149.5 + # LOE rax rbx rsi rdi r13 r14 edx ecx xmm0 xmm1 ymm2 +..B4.24: # Preds ..B4.22 ..B4.2 + # Execution count [1.00e+00] + xorl %eax, %eax #159.16 + vzeroupper #159.16 +..___tag_value_computeForceLJ_2xnn_half.214: +# getTimeStamp() + call getTimeStamp #159.16 +..___tag_value_computeForceLJ_2xnn_half.215: + # LOE rbx r12 r13 r14 xmm0 +..B4.41: # Preds ..B4.24 + # Execution count [1.00e+00] + vmovsd %xmm0, 256(%rsp) #159.16[spill] + # LOE rbx r12 r13 r14 +..B4.25: # Preds ..B4.41 + # Execution count [1.00e+00] + movl $.L_2__STRING.1, %edi #163.5 +..___tag_value_computeForceLJ_2xnn_half.217: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #163.5 +..___tag_value_computeForceLJ_2xnn_half.218: + # LOE rbx r12 r13 r14 +..B4.26: # Preds ..B4.25 + # Execution count [1.00e+00] + xorl %r9d, %r9d #166.16 + xorl %r10d, %r10d #166.16 + cmpl $0, 20(%rbx) #166.26 + jle ..B4.34 # Prob 10% #166.26 + # LOE rbx r10 r12 r13 r14 r9d +..B4.27: # Preds ..B4.26 + # Execution count [9.00e-01] + movl $65484, %eax #270.9 + kmovw %eax, %k2 #270.9 + movl $65450, %eax #270.9 + vmovsd .L_2il0floatpacket.2(%rip), %xmm15 #270.9 + kmovw %eax, %k1 #270.9 + vmovups 192(%rsp), 
%zmm21 #270.9[spill] + vmovups (%rsp), %zmm17 #270.9[spill] + vmovups 128(%rsp), %zmm18 #270.9[spill] + vmovups 64(%rsp), %zmm19 #270.9[spill] + vbroadcastsd .L_2il0floatpacket.3(%rip), %zmm16 #270.9 + vpxord %zmm1, %zmm1, %zmm1 #183.30 + # LOE rbx r10 r13 r14 r9d xmm15 zmm1 zmm16 zmm17 zmm18 zmm19 zmm21 k1 k2 +..B4.28: # Preds ..B4.32 ..B4.27 + # Execution count [5.00e+00] + movl %r9d, %ecx #171.27 + movl %r9d, %edi #171.27 + sarl $1, %ecx #171.27 + andl $1, %edi #171.27 + shll $2, %edi #171.27 + movl 16(%r14), %edx #174.44 + imull %r9d, %edx #174.44 + movq 160(%rbx), %r11 #172.27 + lea (%rcx,%rcx,2), %r8d #171.27 + vmovaps %zmm1, %zmm22 #183.30 + lea (%rdi,%r8,8), %r8d #171.27 + movslq %r8d, %r8 #171.27 + vmovaps %zmm22, %zmm23 #184.30 + movslq %edx, %rdx #174.19 + vmovaps %zmm23, %zmm24 #185.30 + vbroadcastsd 8(%r11,%r8,8), %ymm12 #177.33 + vbroadcastsd 24(%r11,%r8,8), %ymm10 #178.33 + vbroadcastsd 72(%r11,%r8,8), %ymm0 #179.33 + vbroadcastsd 88(%r11,%r8,8), %ymm3 #180.33 + vbroadcastsd 136(%r11,%r8,8), %ymm5 #181.33 + vbroadcastsd 152(%r11,%r8,8), %ymm7 #182.33 + vbroadcastsd 128(%r11,%r8,8), %zmm4 #181.33 + vbroadcastsd 64(%r11,%r8,8), %zmm9 #179.33 + vbroadcastsd (%r11,%r8,8), %zmm13 #177.33 + vbroadcastsd 16(%r11,%r8,8), %zmm11 #178.33 + vbroadcastsd 80(%r11,%r8,8), %zmm2 #180.33 + vbroadcastsd 144(%r11,%r8,8), %zmm6 #182.33 + vinsertf64x4 $1, %ymm12, %zmm13, %zmm13 #177.33 + vinsertf64x4 $1, %ymm10, %zmm11, %zmm12 #178.33 + vinsertf64x4 $1, %ymm0, %zmm9, %zmm11 #179.33 + vinsertf64x4 $1, %ymm3, %zmm2, %zmm10 #180.33 + vinsertf64x4 $1, %ymm5, %zmm4, %zmm9 #181.33 + vinsertf64x4 $1, %ymm7, %zmm6, %zmm6 #182.33 + movq 24(%r14), %rsi #175.25 + movq 8(%r14), %rax #174.19 + vmovaps %zmm24, %zmm25 #186.30 + vmovaps %zmm25, %zmm26 #187.30 + movslq (%rsi,%r10,4), %rsi #175.25 + lea (%rax,%rdx,4), %rdx #174.19 + movq 176(%rbx), %rcx #173.27 + movq %rcx, %rdi #173.27 + vmovaps %zmm26, %zmm14 #188.30 + xorl %eax, %eax #190.19 + testq %rsi, %rsi #190.28 + jle ..B4.32 # Prob 10% #190.28 + # LOE rax rdx rcx rbx rsi rdi r8 r10 r11 r13 r14 r9d xmm15 zmm1 zmm6 zmm9 zmm10 zmm11 zmm12 zmm13 zmm14 zmm16 zmm17 zmm18 zmm19 zmm21 zmm22 zmm23 zmm24 zmm25 zmm26 k1 k2 +..B4.29: # Preds ..B4.28 + # Execution count [4.50e+00] + vmovups .L_2il0floatpacket.5(%rip), %zmm20 #266.13 + movq %r10, 16(%rsp) #266.13[spill] + movq %r14, 8(%rsp) #266.13[spill] + movq %r13, (%rsp) #266.13[spill] + # LOE rax rdx rcx rbx rsi rdi r8 r11 r9d xmm15 zmm1 zmm6 zmm9 zmm10 zmm11 zmm12 zmm13 zmm14 zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 zmm24 zmm25 zmm26 k1 k2 +..B4.30: # Preds ..B4.43 ..B4.29 + # Execution count [2.50e+01] + movl (%rdx,%rax,4), %r15d #191.22 + xorl %r13d, %r13d #214.66 + movslq %r15d, %r15 #192.31 + incq %rax #190.39 + lea (%r15,%r15), %r10d #214.56 + cmpl %r9d, %r10d #214.66 + lea 1(%r15,%r15), %r12d #215.61 + sete %r13b #214.66 + lea (%r15,%r15,2), %r14 #193.31 + shlq $6, %r14 #193.31 + cmpl %r9d, %r12d #215.66 + movl $0, %r12d #215.66 + movl %r13d, %r15d #229.39 + sete %r12b #215.66 + negl %r15d #229.39 + movl %r12d, %r10d #229.39 + vbroadcastf64x4 128(%r14,%r11), %zmm28 #199.36 + vbroadcastf64x4 64(%r14,%r11), %zmm27 #198.36 + vbroadcastf64x4 (%r14,%r11), %zmm3 #197.36 + vsubpd %zmm28, %zmm9, %zmm5 #202.35 + vsubpd %zmm28, %zmm6, %zmm4 #205.35 + vsubpd %zmm3, %zmm13, %zmm0 #200.35 + vsubpd %zmm27, %zmm11, %zmm7 #201.35 + vsubpd %zmm3, %zmm12, %zmm8 #203.35 + vsubpd %zmm27, %zmm10, %zmm3 #204.35 + vmulpd %zmm5, %zmm5, %zmm31 #232.80 + vmulpd %zmm4, %zmm4, %zmm30 #233.80 + vfmadd231pd %zmm7, %zmm7, 
%zmm31 #232.57 + vfmadd231pd %zmm3, %zmm3, %zmm30 #233.57 + vfmadd231pd %zmm0, %zmm0, %zmm31 #232.34 + vfmadd231pd %zmm8, %zmm8, %zmm30 #233.34 + vrcp14pd %zmm31, %zmm29 #238.35 + vrcp14pd %zmm30, %zmm27 #239.35 + vcmppd $17, %zmm19, %zmm30, %k0 #236.67 + vcmppd $17, %zmm19, %zmm31, %k4 #235.67 + vmulpd %zmm18, %zmm29, %zmm2 #241.67 + vmulpd %zmm18, %zmm27, %zmm28 #242.67 + vmulpd %zmm2, %zmm29, %zmm30 #241.51 + vmulpd %zmm28, %zmm27, %zmm28 #242.51 + vmulpd %zmm30, %zmm29, %zmm2 #241.35 + vmulpd %zmm28, %zmm27, %zmm31 #242.35 + vfmsub213pd %zmm21, %zmm29, %zmm30 #244.79 + vfmsub213pd %zmm21, %zmm27, %zmm28 #245.79 + vmulpd %zmm17, %zmm29, %zmm29 #244.105 + vmulpd %zmm17, %zmm27, %zmm27 #245.105 + vmulpd %zmm29, %zmm30, %zmm30 #244.70 + vmulpd %zmm27, %zmm28, %zmm27 #245.70 + vmovupd 64(%r14,%rdi), %ymm28 #266.13 + vmulpd %zmm30, %zmm2, %zmm2 #244.54 + vmulpd %zmm27, %zmm31, %zmm31 #245.54 + vmovupd 128(%r14,%rdi), %ymm27 #266.13 + vmulpd %zmm2, %zmm16, %zmm2 #244.36 + vmulpd %zmm31, %zmm16, %zmm30 #245.36 + vmovupd (%r14,%rdi), %ymm31 #266.13 + shll $6, %r10d #229.39 + lea (%r13,%r13,2), %r11d #229.39 + negl %r10d #229.39 + negl %r11d #229.39 + addl %r12d, %r10d #229.39 + addl %r10d, %r11d #229.39 + movl %r12d, %r10d #229.39 + shll $5, %r10d #229.39 + negl %r10d #229.39 + addl %r12d, %r10d #229.39 + shll $4, %r11d #229.39 + addl $4080, %r11d #229.39 + lea 255(%r15,%r10), %r15d #229.39 + kmovw %k4, %r10d #235.67 + orl %r15d, %r11d #229.39 + movl %r12d, %r15d #230.39 + kmovb %r11d, %k3 #229.39 + kmovb %r10d, %k6 #235.41 + movl %r13d, %r10d #230.39 + kmovb %k3, %r11d #229.39 + shll $4, %r10d #230.39 + shll $8, %r15d #230.39 + negl %r10d #230.39 + kmovb %r11d, %k5 #235.41 + negl %r15d #230.39 + kandb %k6, %k5, %k7 #235.41 + addl %r13d, %r10d #230.39 + addl %r12d, %r15d #230.39 + kmovb %k7, %r11d #235.41 + addl %r15d, %r10d #230.39 + movl %r12d, %r15d #230.39 + kmovw %r11d, %k3 #247.33 + lea (,%r13,8), %r11d #230.39 + shll $7, %r15d #230.39 + subl %r11d, %r13d #230.39 + subl %r15d, %r12d #230.39 + shll $4, %r10d #230.39 + addl $4080, %r10d #230.39 + kmovw %k0, %r11d #236.67 + vmulpd %zmm2, %zmm0, %zmm29{%k3}{z} #247.33 + vmulpd %zmm2, %zmm7, %zmm0{%k3}{z} #248.33 + vmulpd %zmm2, %zmm5, %zmm5{%k3}{z} #249.33 + vaddpd %zmm22, %zmm29, %zmm22 #254.20 + vaddpd %zmm23, %zmm0, %zmm23 #255.20 + vaddpd %zmm24, %zmm5, %zmm24 #256.20 + kmovb %r11d, %k0 #236.41 + lea 255(%r13,%r12), %r13d #230.39 + orl %r13d, %r10d #230.39 + kmovb %r10d, %k4 #230.39 + kmovb %k4, %r10d #230.39 + kmovb %r10d, %k5 #236.41 + kandb %k0, %k5, %k6 #236.41 + kmovb %k6, %r12d #236.41 + kmovw %r12d, %k7 #250.33 + vmulpd %zmm30, %zmm8, %zmm7{%k7}{z} #250.33 + vmulpd %zmm30, %zmm3, %zmm8{%k7}{z} #251.33 + vmulpd %zmm30, %zmm4, %zmm2{%k7}{z} #252.33 + vaddpd %zmm7, %zmm29, %zmm3 #266.38 + vaddpd %zmm8, %zmm0, %zmm4 #266.49 + vaddpd %zmm2, %zmm5, %zmm29 #266.60 + vaddpd %zmm26, %zmm8, %zmm26 #258.20 + vaddpd %zmm25, %zmm7, %zmm25 #257.20 + vaddpd %zmm14, %zmm2, %zmm14 #259.20 + vpermd %zmm3, %zmm20, %zmm0 #266.13 + vpermd %zmm4, %zmm20, %zmm8 #266.13 + vpermd %zmm29, %zmm20, %zmm30 #266.13 + vaddpd %zmm3, %zmm0, %zmm5 #266.13 + vaddpd %zmm4, %zmm8, %zmm4 #266.13 + vaddpd %zmm29, %zmm30, %zmm29 #266.13 + vsubpd %ymm5, %ymm31, %ymm7 #266.13 + vsubpd %ymm4, %ymm28, %ymm28 #266.13 + vsubpd %ymm29, %ymm27, %ymm27 #266.13 + vmovupd %ymm7, (%r14,%rdi) #266.13 + vmovupd %ymm28, 64(%r14,%rdi) #266.13 + vmovupd %ymm27, 128(%r14,%rdi) #266.13 + cmpq %rsi, %rax #190.28 + jge ..B4.31 # Prob 18% #190.28 + # LOE rax rdx rcx rbx rsi r8 r9d xmm15 zmm1 
zmm6 zmm9 zmm10 zmm11 zmm12 zmm13 zmm14 zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 zmm24 zmm25 zmm26 k1 k2 +..B4.43: # Preds ..B4.30 + # Execution count [2.05e+01] + movq 176(%rbx), %rdi #151.27 + movq 160(%rbx), %r11 #172.27 + jmp ..B4.30 # Prob 100% #172.27 + # LOE rax rdx rcx rbx rsi rdi r8 r11 r9d xmm15 zmm1 zmm6 zmm9 zmm10 zmm11 zmm12 zmm13 zmm14 zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 zmm24 zmm25 zmm26 k1 k2 +..B4.31: # Preds ..B4.30 + # Execution count [4.50e+00] + movq 16(%rsp), %r10 #[spill] + movq 8(%rsp), %r14 #[spill] + movq (%rsp), %r13 #[spill] + # LOE rcx rbx rsi r8 r10 r13 r14 r9d xmm15 zmm1 zmm14 zmm16 zmm17 zmm18 zmm19 zmm21 zmm22 zmm23 zmm24 zmm25 zmm26 k1 k2 +..B4.32: # Preds ..B4.31 ..B4.28 + # Execution count [5.00e+00] + vpermpd $78, %zmm22, %zmm20 #270.9 + vxorpd %xmm8, %xmm8, %xmm8 #276.9 + vpermpd $78, %zmm23, %zmm30 #271.9 + vpermpd $78, %zmm24, %zmm3 #272.9 + vaddpd %zmm22, %zmm20, %zmm27 #270.9 + vpermpd $78, %zmm25, %zmm22 #270.9 + vaddpd %zmm23, %zmm30, %zmm31 #271.9 + vpermpd $78, %zmm26, %zmm23 #271.9 + vaddpd %zmm24, %zmm3, %zmm4 #272.9 + vpermpd $78, %zmm14, %zmm24 #272.9 + vaddpd %zmm22, %zmm25, %zmm27{%k2} #270.9 + vaddpd %zmm23, %zmm26, %zmm31{%k2} #271.9 + vaddpd %zmm24, %zmm14, %zmm4{%k2} #272.9 + vpermpd $177, %zmm27, %zmm25 #270.9 + vpermpd $177, %zmm31, %zmm26 #271.9 + vpermpd $177, %zmm4, %zmm5 #272.9 + vaddpd %zmm25, %zmm27, %zmm28 #270.9 + vaddpd %zmm26, %zmm31, %zmm0 #271.9 + vaddpd %zmm5, %zmm4, %zmm6 #272.9 + vshuff64x2 $238, %zmm28, %zmm28, %zmm28{%k1} #270.9 + vshuff64x2 $238, %zmm0, %zmm0, %zmm0{%k1} #271.9 + vshuff64x2 $238, %zmm6, %zmm6, %zmm6{%k1} #272.9 + incl %r9d #166.49 + incq %r10 #166.49 + vaddpd (%rcx,%r8,8), %ymm28, %ymm29 #270.9 + vaddpd 64(%rcx,%r8,8), %ymm0, %ymm2 #271.9 + vaddpd 128(%rcx,%r8,8), %ymm6, %ymm7 #272.9 + vmovupd %ymm29, (%rcx,%r8,8) #270.9 + vmovupd %ymm2, 64(%rcx,%r8,8) #271.9 + vmovupd %ymm7, 128(%rcx,%r8,8) #272.9 + addq %rsi, 8(%r13) #275.9 + vcvtsi2sd %esi, %xmm8, %xmm8 #276.9 + vmulsd %xmm8, %xmm15, %xmm9 #276.9 + vcvttsd2si %xmm9, %rax #276.9 + incq (%r13) #274.9 + addq %rax, 16(%r13) #276.9 + cmpl 20(%rbx), %r9d #166.26 + jl ..B4.28 # Prob 82% #166.26 + # LOE rbx r10 r13 r14 r9d xmm15 zmm1 zmm16 zmm17 zmm18 zmm19 zmm21 k1 k2 +..B4.34: # Preds ..B4.32 ..B4.26 + # Execution count [1.00e+00] + movl $.L_2__STRING.1, %edi #279.5 + vzeroupper #279.5 +..___tag_value_computeForceLJ_2xnn_half.229: +# likwid_markerStopRegion(const char *) + call likwid_markerStopRegion #279.5 +..___tag_value_computeForceLJ_2xnn_half.230: + # LOE r12 +..B4.35: # Preds ..B4.34 + # Execution count [1.00e+00] + xorl %eax, %eax #282.16 +..___tag_value_computeForceLJ_2xnn_half.231: +# getTimeStamp() + call getTimeStamp #282.16 +..___tag_value_computeForceLJ_2xnn_half.232: + # LOE r12 xmm0 +..B4.42: # Preds ..B4.35 + # Execution count [1.00e+00] + vmovsd %xmm0, (%rsp) #282.16[spill] + # LOE r12 +..B4.36: # Preds ..B4.42 + # Execution count [1.00e+00] + movl $.L_2__STRING.4, %edi #283.5 + xorl %eax, %eax #283.5 +..___tag_value_computeForceLJ_2xnn_half.234: +# debug_printf(const char *, ...) 
+ call debug_printf #283.5 +..___tag_value_computeForceLJ_2xnn_half.235: + # LOE r12 +..B4.37: # Preds ..B4.36 + # Execution count [1.00e+00] + vmovsd (%rsp), %xmm0 #284.14[spill] + vsubsd 256(%rsp), %xmm0, %xmm0 #284.14[spill] + addq $280, %rsp #284.14 + .cfi_restore 3 + popq %rbx #284.14 + .cfi_restore 15 + popq %r15 #284.14 + .cfi_restore 14 + popq %r14 #284.14 + .cfi_restore 13 + popq %r13 #284.14 + .cfi_restore 12 + popq %r12 #284.14 + movq %rbp, %rsp #284.14 + popq %rbp #284.14 + .cfi_def_cfa 7, 8 + .cfi_restore 6 + ret #284.14 + .cfi_def_cfa 6, 16 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22 + .cfi_offset 6, -16 + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + # LOE +..B4.38: # Preds ..B4.5 + # Execution count [4.50e-01]: Infreq + xorl %r10d, %r10d #152.9 + jmp ..B4.18 # Prob 100% #152.9 + .align 16,0x90 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r10d r11d xmm0 xmm1 ymm2 + .cfi_endproc +# mark_end; + .type computeForceLJ_2xnn_half,@function + .size computeForceLJ_2xnn_half,.-computeForceLJ_2xnn_half +..LNcomputeForceLJ_2xnn_half.3: + .data +# -- End computeForceLJ_2xnn_half + .text +.L_2__routine_start_computeForceLJ_4xn_4: +# -- Begin computeForceLJ_4xn + .text +# mark_begin; + .align 16,0x90 + .globl computeForceLJ_4xn +# --- computeForceLJ_4xn(Parameter *, Atom *, Neighbor *, Stats *) +computeForceLJ_4xn: +# parameter 1: %rdi +# parameter 2: %rsi +# parameter 3: %rdx +# parameter 4: %rcx +..B5.1: # Preds ..B5.0 + # Execution count [1.00e+00] + .cfi_startproc +..___tag_value_computeForceLJ_4xn.253: +..L254: + #787.91 + cmpl $0, 32(%rdx) #788.8 + je ..B5.4 # Prob 50% #788.8 + # LOE rdx rcx rbx rbp rsi rdi r12 r13 r14 r15 +..B5.2: # Preds ..B5.1 + # Execution count [5.00e-01] +# computeForceLJ_4xn_half(Parameter *, Atom *, Neighbor *, Stats *) + jmp computeForceLJ_4xn_half #789.16 + # LOE +..B5.4: # Preds ..B5.1 + # Execution count [5.00e-01] +# computeForceLJ_4xn_full(Parameter *, Atom *, Neighbor *, Stats *) + jmp computeForceLJ_4xn_full #792.12 + .align 16,0x90 + # LOE + .cfi_endproc +# mark_end; + .type computeForceLJ_4xn,@function + .size computeForceLJ_4xn,.-computeForceLJ_4xn +..LNcomputeForceLJ_4xn.4: + .data +# -- End computeForceLJ_4xn + .text +.L_2__routine_start_computeForceLJ_4xn_half_5: +# -- Begin computeForceLJ_4xn_half + .text +# mark_begin; + .align 16,0x90 + .globl computeForceLJ_4xn_half +# --- computeForceLJ_4xn_half(Parameter *, Atom *, Neighbor *, Stats *) +computeForceLJ_4xn_half: +# parameter 1: %rdi +# parameter 2: %rsi +# parameter 3: %rdx +# parameter 4: %rcx +..B6.1: # Preds ..B6.0 + # Execution count [1.00e+00] + .cfi_startproc +..___tag_value_computeForceLJ_4xn_half.256: +..L257: + #432.96 + pushq %rbp #432.96 + .cfi_def_cfa_offset 16 + movq %rsp, %rbp #432.96 + .cfi_def_cfa 6, 16 + .cfi_offset 6, -16 + andq $-64, %rsp #432.96 + pushq %r12 #432.96 + pushq %r13 #432.96 + pushq %r14 #432.96 + pushq %r15 #432.96 + pushq %rbx #432.96 + subq $1176, %rsp #432.96 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22 + 
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + movq %rdi, %r15 #432.96 + movl $.L_2__STRING.5, %edi #433.5 + xorl %eax, %eax #433.5 + movq %rcx, %r14 #432.96 + movq %rdx, %r13 #432.96 + movq %rsi, %rbx #432.96 +..___tag_value_computeForceLJ_4xn_half.266: +# debug_printf(const char *, ...) + call debug_printf #433.5 +..___tag_value_computeForceLJ_4xn_half.267: + # LOE rbx r12 r13 r14 r15 +..B6.2: # Preds ..B6.1 + # Execution count [1.00e+00] + vmovsd 144(%r15), %xmm0 #436.27 + xorl %ecx, %ecx #445.5 + vmulsd %xmm0, %xmm0, %xmm1 #439.36 + xorl %esi, %esi #447.27 + vbroadcastsd 56(%r15), %zmm3 #440.32 + vbroadcastsd 40(%r15), %zmm4 #441.29 + vbroadcastsd %xmm1, %zmm2 #439.36 + vbroadcastsd .L_2il0floatpacket.2(%rip), %zmm5 #443.29 + vmovups %zmm3, 256(%rsp) #440.32[spill] + vmovups %zmm2, 128(%rsp) #439.36[spill] + vmovups %zmm4, 192(%rsp) #441.29[spill] + vmovups %zmm5, (%rsp) #443.29[spill] + movl 20(%rbx), %edx #445.26 + testl %edx, %edx #445.26 + jle ..B6.24 # Prob 9% #445.26 + # LOE rbx rsi r12 r13 r14 edx ecx +..B6.3: # Preds ..B6.2 + # Execution count [9.00e-01] + movq 176(%rbx), %rdi #447.27 + movq 192(%rbx), %rax #448.32 + vxorpd %ymm2, %ymm2, %ymm2 #449.39 + vmovdqu .L_2il0floatpacket.0(%rip), %xmm1 #448.9 + vmovdqu .L_2il0floatpacket.1(%rip), %xmm0 #448.9 + # LOE rax rbx rsi rdi r13 r14 edx ecx xmm0 xmm1 ymm2 +..B6.4: # Preds ..B6.22 ..B6.3 + # Execution count [5.00e+00] + movl %ecx, %r8d #446.27 + movl %ecx, %r9d #446.27 + sarl $1, %r8d #446.27 + andl $1, %r9d #446.27 + shll $2, %r9d #446.27 + lea (%r8,%r8,2), %r10d #446.27 + lea (%r9,%r10,8), %r11d #446.27 + movslq %r11d, %r11 #447.27 + lea (%rdi,%r11,8), %r12 #447.27 + movl (%rsi,%rax), %r11d #448.32 + testl %r11d, %r11d #448.32 + jle ..B6.22 # Prob 50% #448.32 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r11d xmm0 xmm1 ymm2 +..B6.5: # Preds ..B6.4 + # Execution count [4.50e+00] + cmpl $16, %r11d #448.9 + jl ..B6.38 # Prob 10% #448.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r11d xmm0 xmm1 ymm2 +..B6.6: # Preds ..B6.5 + # Execution count [4.50e+00] + lea 128(%r12), %r8 #451.13 + andq $63, %r8 #448.9 + testl $7, %r8d #448.9 + je ..B6.8 # Prob 50% #448.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r11d xmm0 xmm1 ymm2 +..B6.7: # Preds ..B6.6 + # Execution count [2.25e+00] + xorl %r8d, %r8d #448.9 + jmp ..B6.10 # Prob 100% #448.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r11d xmm0 xmm1 ymm2 +..B6.8: # Preds ..B6.6 + # Execution count [2.25e+00] + testl %r8d, %r8d #448.9 + je ..B6.10 # Prob 50% #448.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r11d xmm0 xmm1 ymm2 +..B6.9: # Preds ..B6.8 + # Execution count [2.50e+01] + negl %r8d #448.9 + addl $64, %r8d #448.9 + shrl $3, %r8d #448.9 + cmpl %r8d, %r11d #448.9 + cmovl %r11d, %r8d #448.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r11d xmm0 xmm1 ymm2 +..B6.10: # Preds ..B6.7 ..B6.9 ..B6.8 + # Execution count [5.00e+00] + movl %r11d, %r10d #448.9 + subl %r8d, %r10d #448.9 + andl $15, %r10d #448.9 + negl %r10d #448.9 + addl %r11d, %r10d #448.9 + cmpl $1, %r8d #448.9 + jb ..B6.14 # Prob 50% #448.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r10d r11d xmm0 xmm1 
ymm2 +..B6.11: # Preds ..B6.10 + # Execution count [4.50e+00] + vpbroadcastd %r8d, %xmm3 #448.9 + xorl %r15d, %r15d #448.9 + vmovdqa %xmm0, %xmm4 #448.9 + movslq %r8d, %r9 #448.9 + # LOE rax rbx rsi rdi r9 r12 r13 r14 r15 edx ecx r8d r10d r11d xmm0 xmm1 xmm3 xmm4 ymm2 +..B6.12: # Preds ..B6.12 ..B6.11 + # Execution count [2.50e+01] + vpcmpgtd %xmm4, %xmm3, %k1 #448.9 + vpaddd %xmm1, %xmm4, %xmm4 #448.9 + vmovupd %ymm2, (%r12,%r15,8){%k1} #449.13 + vmovupd %ymm2, 64(%r12,%r15,8){%k1} #450.13 + vmovupd %ymm2, 128(%r12,%r15,8){%k1} #451.13 + addq $4, %r15 #448.9 + cmpq %r9, %r15 #448.9 + jb ..B6.12 # Prob 82% #448.9 + # LOE rax rbx rsi rdi r9 r12 r13 r14 r15 edx ecx r8d r10d r11d xmm0 xmm1 xmm3 xmm4 ymm2 +..B6.13: # Preds ..B6.12 + # Execution count [4.50e+00] + cmpl %r8d, %r11d #448.9 + je ..B6.22 # Prob 10% #448.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r10d r11d xmm0 xmm1 ymm2 +..B6.14: # Preds ..B6.10 ..B6.13 + # Execution count [2.50e+01] + lea 16(%r8), %r9d #448.9 + cmpl %r9d, %r10d #448.9 + jl ..B6.18 # Prob 50% #448.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r10d r11d xmm0 xmm1 ymm2 +..B6.15: # Preds ..B6.14 + # Execution count [4.50e+00] + movslq %r8d, %r8 #448.9 + movslq %r10d, %r9 #448.9 + .align 16,0x90 + # LOE rax rbx rsi rdi r8 r9 r12 r13 r14 edx ecx r10d r11d xmm0 xmm1 ymm2 +..B6.16: # Preds ..B6.16 ..B6.15 + # Execution count [2.50e+01] + vmovupd %ymm2, (%r12,%r8,8) #449.13 + vmovupd %ymm2, 32(%r12,%r8,8) #449.13 + vmovupd %ymm2, 64(%r12,%r8,8) #449.13 + vmovupd %ymm2, 128(%r12,%r8,8) #450.13 + vmovupd %ymm2, 192(%r12,%r8,8) #451.13 + vmovupd %ymm2, 96(%r12,%r8,8) #449.13 + vmovupd %ymm2, 160(%r12,%r8,8) #450.13 + vmovupd %ymm2, 224(%r12,%r8,8) #451.13 + addq $16, %r8 #448.9 + cmpq %r9, %r8 #448.9 + jb ..B6.16 # Prob 82% #448.9 + # LOE rax rbx rsi rdi r8 r9 r12 r13 r14 edx ecx r10d r11d xmm0 xmm1 ymm2 +..B6.18: # Preds ..B6.16 ..B6.14 ..B6.38 + # Execution count [5.00e+00] + lea 1(%r10), %r8d #448.9 + cmpl %r11d, %r8d #448.9 + ja ..B6.22 # Prob 50% #448.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r10d r11d xmm0 xmm1 ymm2 +..B6.19: # Preds ..B6.18 + # Execution count [4.50e+00] + movslq %r10d, %r9 #449.13 + negl %r10d #448.9 + addl %r11d, %r10d #448.9 + xorl %r8d, %r8d #448.9 + movslq %r11d, %r11 #448.9 + vmovdqa %xmm0, %xmm4 #448.9 + vpbroadcastd %r10d, %xmm3 #448.9 + subq %r9, %r11 #448.9 + lea (%r12,%r9,8), %r12 #449.13 + # LOE rax rbx rsi rdi r8 r11 r12 r13 r14 edx ecx xmm0 xmm1 xmm3 xmm4 ymm2 +..B6.20: # Preds ..B6.20 ..B6.19 + # Execution count [2.50e+01] + vpcmpgtd %xmm4, %xmm3, %k1 #448.9 + vpaddd %xmm1, %xmm4, %xmm4 #448.9 + vmovupd %ymm2, (%r12,%r8,8){%k1} #449.13 + vmovupd %ymm2, 64(%r12,%r8,8){%k1} #450.13 + vmovupd %ymm2, 128(%r12,%r8,8){%k1} #451.13 + addq $4, %r8 #448.9 + cmpq %r11, %r8 #448.9 + jb ..B6.20 # Prob 82% #448.9 + # LOE rax rbx rsi rdi r8 r11 r12 r13 r14 edx ecx xmm0 xmm1 xmm3 xmm4 ymm2 +..B6.22: # Preds ..B6.20 ..B6.4 ..B6.13 ..B6.18 + # Execution count [5.00e+00] + incl %ecx #445.5 + addq $56, %rsi #445.5 + cmpl %edx, %ecx #445.5 + jb ..B6.4 # Prob 82% #445.5 + # LOE rax rbx rsi rdi r13 r14 edx ecx xmm0 xmm1 ymm2 +..B6.24: # Preds ..B6.22 ..B6.2 + # Execution count [1.00e+00] + xorl %eax, %eax #455.16 + vzeroupper #455.16 +..___tag_value_computeForceLJ_4xn_half.272: +# getTimeStamp() + call getTimeStamp #455.16 +..___tag_value_computeForceLJ_4xn_half.273: + # LOE rbx r12 r13 r14 xmm0 +..B6.41: # Preds ..B6.24 + # Execution count [1.00e+00] + vmovsd %xmm0, 64(%rsp) #455.16[spill] + # LOE rbx r12 r13 r14 +..B6.25: # Preds ..B6.41 + # 
Execution count [1.00e+00] + movl $.L_2__STRING.1, %edi #459.5 +..___tag_value_computeForceLJ_4xn_half.275: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #459.5 +..___tag_value_computeForceLJ_4xn_half.276: + # LOE rbx r12 r13 r14 +..B6.26: # Preds ..B6.25 + # Execution count [1.00e+00] + xorl %r10d, %r10d #462.16 + xorl %r9d, %r9d #462.16 + cmpl $0, 20(%rbx) #462.26 + jle ..B6.34 # Prob 10% #462.26 + # LOE rbx r9 r12 r13 r14 r10d +..B6.27: # Preds ..B6.26 + # Execution count [9.00e-01] + movl $65450, %eax #605.9 + kmovw %eax, %k3 #605.9 + movl $65520, %eax #605.9 + kmovw %eax, %k4 #605.9 + vmovsd .L_2il0floatpacket.2(%rip), %xmm3 #605.9 + movl $12, %eax #605.9 + kmovw %eax, %k2 #605.9 + vmovups (%rsp), %zmm23 #605.9[spill] + vbroadcastsd .L_2il0floatpacket.3(%rip), %zmm24 #605.9 + vpxord %zmm1, %zmm1, %zmm1 #485.30 + # LOE rbx r9 r13 r14 r10d xmm3 zmm1 zmm23 zmm24 k2 k3 k4 +..B6.28: # Preds ..B6.32 ..B6.27 + # Execution count [5.00e+00] + vmovaps %zmm1, %zmm22 #485.30 + movl %r10d, %eax #467.27 + vmovaps %zmm22, %zmm21 #486.30 + movl %r10d, %ecx #467.27 + vmovaps %zmm21, %zmm2 #487.30 + andl $1, %ecx #467.27 + sarl $1, %eax #467.27 + movl 16(%r13), %edi #470.44 + vmovaps %zmm2, %zmm0 #488.30 + imull %r10d, %edi #470.44 + shll $2, %ecx #467.27 + lea (%rax,%rax,2), %esi #467.27 + vmovaps %zmm0, %zmm20 #489.30 + vmovaps %zmm20, %zmm19 #490.30 + vmovaps %zmm19, %zmm18 #491.30 + vmovaps %zmm18, %zmm17 #492.30 + lea (%rcx,%rsi,8), %eax #467.27 + movslq %edi, %rdi #470.19 + movslq %eax, %rax #467.27 + movq 8(%r13), %rdx #470.19 + movq 24(%r13), %r8 #471.25 + vmovaps %zmm17, %zmm25 #493.30 + movq 160(%rbx), %r11 #468.27 + lea (%rdx,%rdi,4), %rdi #470.19 + vmovaps %zmm25, %zmm26 #494.30 + vmovaps %zmm26, %zmm27 #495.30 + movslq (%r8,%r9,4), %rdx #471.25 + xorl %r8d, %r8d #498.19 + movq 176(%rbx), %rsi #469.27 + movq %rsi, %rcx #469.27 + vmovaps %zmm27, %zmm16 #496.30 + vbroadcastsd (%r11,%rax,8), %zmm15 #473.33 + vbroadcastsd 8(%r11,%rax,8), %zmm14 #474.33 + vbroadcastsd 16(%r11,%rax,8), %zmm13 #475.33 + vbroadcastsd 24(%r11,%rax,8), %zmm12 #476.33 + vbroadcastsd 64(%r11,%rax,8), %zmm11 #477.33 + vbroadcastsd 72(%r11,%rax,8), %zmm10 #478.33 + vbroadcastsd 80(%r11,%rax,8), %zmm9 #479.33 + vbroadcastsd 88(%r11,%rax,8), %zmm8 #480.33 + vbroadcastsd 128(%r11,%rax,8), %zmm7 #481.33 + vbroadcastsd 136(%r11,%rax,8), %zmm6 #482.33 + vbroadcastsd 144(%r11,%rax,8), %zmm5 #483.33 + vbroadcastsd 152(%r11,%rax,8), %zmm4 #484.33 + testq %rdx, %rdx #498.28 + jle ..B6.32 # Prob 10% #498.28 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r11 r13 r14 r10d xmm3 zmm0 zmm1 zmm2 zmm4 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm11 zmm12 zmm13 zmm14 zmm15 zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 zmm24 zmm25 zmm26 zmm27 k2 k3 k4 +..B6.29: # Preds ..B6.28 + # Execution count [4.50e+00] + vmovups %zmm0, 1024(%rsp) #[spill] + vmovups %zmm2, 1088(%rsp) #[spill] + vmovups %zmm4, 960(%rsp) #[spill] + vmovups %zmm5, 576(%rsp) #[spill] + vmovups %zmm6, 768(%rsp) #[spill] + vmovups %zmm7, 832(%rsp) #[spill] + vmovups %zmm8, 448(%rsp) #[spill] + vmovups %zmm9, 704(%rsp) #[spill] + vmovups %zmm10, 640(%rsp) #[spill] + vmovups %zmm11, 320(%rsp) #[spill] + vmovups %zmm12, 896(%rsp) #[spill] + vmovups %zmm13, (%rsp) #[spill] + vmovups %zmm14, 512(%rsp) #[spill] + vmovups %zmm15, 384(%rsp) #[spill] + movq %r13, 80(%rsp) #[spill] + movq %r14, 72(%rsp) #[spill] + # LOE rax rdx rcx rbx rsi rdi r8 r9 r11 r10d zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 zmm24 zmm25 zmm26 zmm27 k2 k3 k4 +..B6.30: # Preds ..B6.43 ..B6.29 + # 
Execution count [2.50e+01] + movl (%rdi,%r8,4), %r14d #499.22 + xorl %r12d, %r12d #526.66 + movslq %r14d, %r14 #500.31 + incq %r8 #498.39 + vmovups 832(%rsp), %zmm12 #508.35[spill] + lea (%r14,%r14), %r15d #526.56 + vmovups (%rsp), %zmm8 #512.35[spill] + vmovups 576(%rsp), %zmm7 #514.35[spill] + vmovups 512(%rsp), %zmm10 #509.35[spill] + vmovups 768(%rsp), %zmm13 #511.35[spill] + vmovups 640(%rsp), %zmm11 #510.35[spill] + vmovups 320(%rsp), %zmm15 #507.35[spill] + vmovups 704(%rsp), %zmm9 #513.35[spill] + vmovups 384(%rsp), %zmm14 #506.35[spill] + vmovups 960(%rsp), %zmm30 #517.35[spill] + vmovups 448(%rsp), %zmm29 #516.35[spill] + vmovups 896(%rsp), %zmm5 #515.35[spill] + cmpl %r10d, %r15d #526.66 + lea (%r14,%r14,2), %r13 #501.31 + sete %r12b #526.66 + shlq $6, %r13 #501.31 + vsubpd 128(%r13,%r11), %zmm12, %zmm1 #508.35 + vsubpd (%r13,%r11), %zmm8, %zmm6 #512.35 + vsubpd 128(%r13,%r11), %zmm7, %zmm8 #514.35 + vsubpd (%r13,%r11), %zmm10, %zmm2 #509.35 + vsubpd 128(%r13,%r11), %zmm13, %zmm4 #511.35 + vsubpd 64(%r13,%r11), %zmm11, %zmm3 #510.35 + vsubpd 64(%r13,%r11), %zmm15, %zmm0 #507.35 + vsubpd 64(%r13,%r11), %zmm9, %zmm9 #513.35 + vsubpd (%r13,%r11), %zmm14, %zmm28 #506.35 + vsubpd 128(%r13,%r11), %zmm30, %zmm14 #517.35 + vsubpd 64(%r13,%r11), %zmm29, %zmm15 #516.35 + vsubpd (%r13,%r11), %zmm5, %zmm12 #515.35 + vmulpd %zmm1, %zmm1, %zmm10 #541.80 + vmulpd %zmm8, %zmm8, %zmm13 #543.80 + vmulpd %zmm4, %zmm4, %zmm11 #542.80 + vmulpd %zmm14, %zmm14, %zmm30 #544.80 + vmovups 128(%rsp), %zmm29 #546.67[spill] + vfmadd231pd %zmm0, %zmm0, %zmm10 #541.57 + vfmadd231pd %zmm9, %zmm9, %zmm13 #543.57 + vfmadd231pd %zmm3, %zmm3, %zmm11 #542.57 + vfmadd231pd %zmm15, %zmm15, %zmm30 #544.57 + vfmadd231pd %zmm28, %zmm28, %zmm10 #541.34 + vfmadd231pd %zmm6, %zmm6, %zmm13 #543.34 + vfmadd231pd %zmm2, %zmm2, %zmm11 #542.34 + vfmadd231pd %zmm12, %zmm12, %zmm30 #544.34 + vrcp14pd %zmm10, %zmm31 #551.35 + vrcp14pd %zmm13, %zmm7 #553.35 + vrcp14pd %zmm11, %zmm5 #552.35 + vcmppd $17, %zmm29, %zmm13, %k7 #548.67 + vcmppd $17, %zmm29, %zmm11, %k6 #547.67 + vcmppd $17, %zmm29, %zmm10, %k1 #546.67 + vcmppd $17, %zmm29, %zmm30, %k0 #549.67 + vrcp14pd %zmm30, %zmm10 #554.35 + vmovups 256(%rsp), %zmm13 #556.67[spill] + vmulpd %zmm13, %zmm31, %zmm11 #556.67 + lea 1(%r14,%r14), %r11d #527.61 + vmulpd %zmm11, %zmm31, %zmm29 #556.51 + cmpl %r10d, %r11d #527.66 + vmovups 192(%rsp), %zmm11 #561.105[spill] + vmulpd %zmm29, %zmm31, %zmm30 #556.35 + vfmsub213pd %zmm23, %zmm31, %zmm29 #561.79 + vmulpd %zmm11, %zmm31, %zmm31 #561.105 + vmulpd %zmm31, %zmm29, %zmm29 #561.70 + movl $0, %r11d #527.66 + vmulpd %zmm29, %zmm30, %zmm30 #561.54 + sete %r11b #527.66 + vmulpd %zmm13, %zmm5, %zmm29 #557.67 + vmulpd %zmm30, %zmm24, %zmm31 #561.36 + movl %r11d, %r15d #528.39 + movl %r12d, %r14d #528.39 + shll $5, %r15d #528.39 + negl %r14d #528.39 + negl %r15d #528.39 + addl %r11d, %r15d #528.39 + lea 255(%r15,%r14), %r15d #528.39 + kmovb %r15d, %k5 #528.39 + kmovw %k1, %r15d #546.67 + kmovb %k5, %r14d #528.39 + kmovb %r14d, %k5 #546.41 + kmovb %r15d, %k1 #546.41 + movl %r11d, %r15d #529.39 + kandb %k1, %k5, %k5 #546.41 + kmovb %k5, %r14d #546.41 + kmovw %r14d, %k5 #566.33 + lea (%r12,%r12,2), %r14d #529.39 + vmulpd %zmm31, %zmm28, %zmm28{%k5}{z} #566.33 + negl %r14d #529.39 + vmulpd %zmm31, %zmm0, %zmm0{%k5}{z} #567.33 + vmulpd %zmm31, %zmm1, %zmm31{%k5}{z} #568.33 + vaddpd %zmm22, %zmm28, %zmm22 #579.20 + vaddpd %zmm21, %zmm0, %zmm21 #580.20 + vaddpd 1088(%rsp), %zmm31, %zmm1 #581.20[spill] + vmovups %zmm1, 1088(%rsp) #581.20[spill] + 
vmulpd %zmm29, %zmm5, %zmm1 #557.51 + vmulpd %zmm1, %zmm5, %zmm30 #557.35 + vfmsub213pd %zmm23, %zmm5, %zmm1 #562.79 + vmulpd %zmm11, %zmm5, %zmm5 #562.105 + vmulpd %zmm5, %zmm1, %zmm29 #562.70 + vmulpd %zmm29, %zmm30, %zmm30 #562.54 + shll $6, %r15d #529.39 + negl %r15d #529.39 + addl %r11d, %r15d #529.39 + vmulpd %zmm30, %zmm24, %zmm29 #562.36 + lea 255(%r15,%r14), %r15d #529.39 + kmovb %r15d, %k1 #529.39 + kmovw %k6, %r15d #547.67 + kmovb %k1, %r14d #529.39 + kmovb %r14d, %k1 #547.41 + kmovb %r15d, %k6 #547.41 + movl %r11d, %r15d #530.39 + kandb %k6, %k1, %k1 #547.41 + kmovb %k1, %r14d #547.41 + kmovw %r14d, %k1 #569.33 + lea (,%r12,8), %r14d #530.39 + vmulpd %zmm29, %zmm2, %zmm2{%k1}{z} #569.33 + negl %r14d #530.39 + vmulpd %zmm29, %zmm3, %zmm3{%k1}{z} #570.33 + vmulpd %zmm29, %zmm4, %zmm30{%k1}{z} #571.33 + vaddpd %zmm2, %zmm28, %zmm29 #599.83 + vaddpd %zmm3, %zmm0, %zmm28 #600.83 + vaddpd 1024(%rsp), %zmm2, %zmm4 #582.20[spill] + vaddpd %zmm20, %zmm3, %zmm20 #583.20 + vaddpd %zmm19, %zmm30, %zmm19 #584.20 + vaddpd %zmm30, %zmm31, %zmm31 #601.83 + vmulpd %zmm13, %zmm7, %zmm0 #558.67 + vmovups %zmm4, 1024(%rsp) #582.20[spill] + vmulpd %zmm0, %zmm7, %zmm1 #558.51 + vmulpd %zmm1, %zmm7, %zmm2 #558.35 + addl %r12d, %r14d #530.39 + vfmsub213pd %zmm23, %zmm7, %zmm1 #563.79 + vmulpd %zmm11, %zmm7, %zmm7 #563.105 + vmulpd %zmm7, %zmm1, %zmm3 #563.70 + vmulpd %zmm3, %zmm2, %zmm4 #563.54 + vmulpd %zmm13, %zmm10, %zmm2 #559.67 + vmulpd %zmm4, %zmm24, %zmm5 #563.36 + vmulpd %zmm2, %zmm10, %zmm3 #559.51 + shll $7, %r15d #530.39 + negl %r15d #530.39 + addl %r11d, %r15d #530.39 + vmulpd %zmm3, %zmm10, %zmm4 #559.35 + vfmsub213pd %zmm23, %zmm10, %zmm3 #564.79 + vmulpd %zmm11, %zmm10, %zmm10 #564.105 + lea 255(%r15,%r14), %r15d #530.39 + kmovb %r15d, %k6 #530.39 + kmovw %k7, %r15d #548.67 + kmovb %k6, %r14d #530.39 + kmovb %r14d, %k6 #548.41 + kmovb %r15d, %k7 #548.41 + movl %r12d, %r15d #531.39 + kandb %k7, %k6, %k7 #548.41 + kmovb %k7, %r14d #548.41 + kmovw %r14d, %k7 #572.33 + movl %r11d, %r14d #531.39 + vmulpd %zmm5, %zmm6, %zmm6{%k7}{z} #572.33 + vmulpd %zmm5, %zmm9, %zmm0{%k7}{z} #573.33 + vmulpd %zmm5, %zmm8, %zmm1{%k7}{z} #574.33 + vmulpd %zmm10, %zmm3, %zmm5 #564.70 + vaddpd %zmm18, %zmm6, %zmm18 #585.20 + vaddpd %zmm29, %zmm6, %zmm30 #599.89 + vaddpd %zmm28, %zmm0, %zmm29 #600.89 + vaddpd %zmm31, %zmm1, %zmm28 #601.89 + vaddpd %zmm17, %zmm0, %zmm17 #586.20 + vaddpd %zmm25, %zmm1, %zmm25 #587.20 + vmulpd %zmm5, %zmm4, %zmm6 #564.54 + shll $4, %r15d #531.39 + shll $8, %r14d #531.39 + subl %r15d, %r12d #531.39 + subl %r14d, %r11d #531.39 + vmulpd %zmm6, %zmm24, %zmm7 #564.36 + lea 255(%r11,%r12), %r12d #531.39 + kmovb %r12d, %k6 #531.39 + kmovw %k0, %r12d #549.67 + kmovb %k6, %r11d #531.39 + kmovb %r11d, %k6 #549.41 + kmovb %r12d, %k0 #549.41 + kandb %k0, %k6, %k0 #549.41 + kmovb %k0, %r14d #549.41 + kmovw %r14d, %k6 #575.33 + vmulpd %zmm7, %zmm12, %zmm8{%k6}{z} #575.33 + vmulpd %zmm7, %zmm14, %zmm11{%k6}{z} #577.33 + vmulpd %zmm7, %zmm15, %zmm9{%k6}{z} #576.33 + vmovups 64(%r13,%rcx), %zmm14 #600.44 + vaddpd %zmm30, %zmm8, %zmm12 #599.95 + vaddpd %zmm28, %zmm11, %zmm31 #601.95 + vaddpd %zmm29, %zmm9, %zmm15 #600.95 + vaddpd %zmm26, %zmm8, %zmm26 #588.20 + vaddpd %zmm27, %zmm9, %zmm27 #589.20 + vsubpd %zmm15, %zmm14, %zmm29 #600.95 + vaddpd %zmm16, %zmm11, %zmm16 #590.20 + vmovups (%r13,%rcx), %zmm28 #599.44 + vmovups 128(%r13,%rcx), %zmm30 #601.44 + vmovups %zmm29, 64(%r13,%rcx) #600.13 + vsubpd %zmm12, %zmm28, %zmm13 #599.95 + vsubpd %zmm31, %zmm30, %zmm0 #601.95 + vmovups %zmm13, (%r13,%rcx) 
#599.13 + vmovups %zmm0, 128(%r13,%rcx) #601.13 + cmpq %rdx, %r8 #498.28 + jge ..B6.31 # Prob 18% #498.28 + # LOE rax rdx rbx rsi rdi r8 r9 r10d zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 zmm24 zmm25 zmm26 zmm27 k2 k3 k4 +..B6.43: # Preds ..B6.30 + # Execution count [2.05e+01] + movq 176(%rbx), %rcx #447.27 + movq 160(%rbx), %r11 #468.27 + jmp ..B6.30 # Prob 100% #468.27 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r11 r10d zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 zmm24 zmm25 zmm26 zmm27 k2 k3 k4 +..B6.31: # Preds ..B6.30 + # Execution count [4.50e+00] + vmovups 1024(%rsp), %zmm0 #[spill] + vmovups 1088(%rsp), %zmm2 #[spill] + vmovsd .L_2il0floatpacket.2(%rip), %xmm3 # + movq 80(%rsp), %r13 #[spill] + movq 72(%rsp), %r14 #[spill] + vpxord %zmm1, %zmm1, %zmm1 # + # LOE rax rdx rbx rsi r9 r13 r14 r10d xmm3 zmm0 zmm1 zmm2 zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 zmm24 zmm25 zmm26 zmm27 k2 k3 k4 +..B6.32: # Preds ..B6.31 ..B6.28 + # Execution count [5.00e+00] + vpermilpd $85, %zmm21, %zmm8 #606.9 + incl %r10d #462.49 + vaddpd %zmm21, %zmm8, %zmm9 #606.9 + incq %r9 #462.49 + vpermilpd $85, %zmm17, %zmm21 #606.9 + vpermilpd $85, %zmm22, %zmm28 #605.9 + vaddpd %zmm17, %zmm21, %zmm10 #606.9 + vaddpd %zmm22, %zmm28, %zmm30 #605.9 + vpermilpd $85, %zmm20, %zmm17 #606.9 + vaddpd %zmm17, %zmm20, %zmm9{%k3} #606.9 + vpermilpd $85, %zmm2, %zmm17 #607.9 + vpermilpd $85, %zmm18, %zmm22 #605.9 + vaddpd %zmm2, %zmm17, %zmm17 #607.9 + vaddpd %zmm18, %zmm22, %zmm31 #605.9 + vxorpd %xmm22, %xmm22, %xmm22 #611.9 + vpermilpd $85, %zmm0, %zmm18 #605.9 + vpermilpd $85, %zmm25, %zmm2 #607.9 + vaddpd %zmm18, %zmm0, %zmm30{%k3} #605.9 + vaddpd %zmm25, %zmm2, %zmm18 #607.9 + vpermilpd $85, %zmm19, %zmm25 #607.9 + vaddpd %zmm25, %zmm19, %zmm17{%k3} #607.9 + vpermilpd $85, %zmm26, %zmm29 #605.9 + vpermilpd $85, %zmm27, %zmm20 #606.9 + vpermilpd $85, %zmm16, %zmm19 #607.9 + vaddpd %zmm29, %zmm26, %zmm31{%k3} #605.9 + valignd $8, %zmm30, %zmm30, %zmm26 #605.9 + vaddpd %zmm20, %zmm27, %zmm10{%k3} #606.9 + valignd $8, %zmm9, %zmm9, %zmm27 #606.9 + vaddpd %zmm19, %zmm16, %zmm18{%k3} #607.9 + valignd $8, %zmm17, %zmm17, %zmm16 #607.9 + vaddpd %zmm26, %zmm30, %zmm4 #605.9 + valignd $8, %zmm31, %zmm31, %zmm0 #605.9 + vaddpd %zmm27, %zmm9, %zmm12 #606.9 + valignd $8, %zmm10, %zmm10, %zmm11 #606.9 + vaddpd %zmm16, %zmm17, %zmm17 #607.9 + valignd $8, %zmm18, %zmm18, %zmm16 #607.9 + vaddpd %zmm0, %zmm31, %zmm4{%k4} #605.9 + vaddpd %zmm11, %zmm10, %zmm12{%k4} #606.9 + vaddpd %zmm16, %zmm18, %zmm17{%k4} #607.9 + vshuff64x2 $177, %zmm4, %zmm4, %zmm5 #605.9 + vshuff64x2 $177, %zmm12, %zmm12, %zmm13 #606.9 + vshuff64x2 $177, %zmm17, %zmm17, %zmm19 #607.9 + vaddpd %zmm5, %zmm4, %zmm6 #605.9 + vaddpd %zmm13, %zmm12, %zmm14 #606.9 + vaddpd %zmm19, %zmm17, %zmm20 #607.9 + vshuff64x2 $238, %zmm6, %zmm6, %zmm6{%k2} #605.9 + vshuff64x2 $238, %zmm14, %zmm14, %zmm14{%k2} #606.9 + vshuff64x2 $238, %zmm20, %zmm20, %zmm20{%k2} #607.9 + vaddpd (%rsi,%rax,8), %ymm6, %ymm7 #605.9 + vaddpd 64(%rsi,%rax,8), %ymm14, %ymm15 #606.9 + vaddpd 128(%rsi,%rax,8), %ymm20, %ymm21 #607.9 + vmovupd %ymm7, (%rsi,%rax,8) #605.9 + vmovupd %ymm15, 64(%rsi,%rax,8) #606.9 + vmovupd %ymm21, 128(%rsi,%rax,8) #607.9 + addq %rdx, 8(%r14) #610.9 + vcvtsi2sd %edx, %xmm22, %xmm22 #611.9 + vmulsd %xmm22, %xmm3, %xmm0 #611.9 + vcvttsd2si %xmm0, %rax #611.9 + incq (%r14) #609.9 + addq %rax, 16(%r14) #611.9 + cmpl 20(%rbx), %r10d #462.26 + jl ..B6.28 # Prob 82% #462.26 + # LOE rbx r9 r13 r14 r10d xmm3 zmm1 zmm23 zmm24 k2 k3 k4 +..B6.34: # Preds ..B6.32 ..B6.26 + # 
Execution count [1.00e+00] + movl $.L_2__STRING.1, %edi #614.5 + vzeroupper #614.5 +..___tag_value_computeForceLJ_4xn_half.317: +# likwid_markerStopRegion(const char *) + call likwid_markerStopRegion #614.5 +..___tag_value_computeForceLJ_4xn_half.318: + # LOE r12 +..B6.35: # Preds ..B6.34 + # Execution count [1.00e+00] + xorl %eax, %eax #617.16 +..___tag_value_computeForceLJ_4xn_half.319: +# getTimeStamp() + call getTimeStamp #617.16 +..___tag_value_computeForceLJ_4xn_half.320: + # LOE r12 xmm0 +..B6.42: # Preds ..B6.35 + # Execution count [1.00e+00] + vmovsd %xmm0, (%rsp) #617.16[spill] + # LOE r12 +..B6.36: # Preds ..B6.42 + # Execution count [1.00e+00] + movl $.L_2__STRING.6, %edi #618.5 + xorl %eax, %eax #618.5 +..___tag_value_computeForceLJ_4xn_half.322: +# debug_printf(const char *, ...) + call debug_printf #618.5 +..___tag_value_computeForceLJ_4xn_half.323: + # LOE r12 +..B6.37: # Preds ..B6.36 + # Execution count [1.00e+00] + vmovsd (%rsp), %xmm0 #619.14[spill] + vsubsd 64(%rsp), %xmm0, %xmm0 #619.14[spill] + addq $1176, %rsp #619.14 + .cfi_restore 3 + popq %rbx #619.14 + .cfi_restore 15 + popq %r15 #619.14 + .cfi_restore 14 + popq %r14 #619.14 + .cfi_restore 13 + popq %r13 #619.14 + .cfi_restore 12 + popq %r12 #619.14 + movq %rbp, %rsp #619.14 + popq %rbp #619.14 + .cfi_def_cfa 7, 8 + .cfi_restore 6 + ret #619.14 + .cfi_def_cfa 6, 16 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22 + .cfi_offset 6, -16 + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + # LOE +..B6.38: # Preds ..B6.5 + # Execution count [4.50e-01]: Infreq + xorl %r10d, %r10d #448.9 + jmp ..B6.18 # Prob 100% #448.9 + .align 16,0x90 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r10d r11d xmm0 xmm1 ymm2 + .cfi_endproc +# mark_end; + .type computeForceLJ_4xn_half,@function + .size computeForceLJ_4xn_half,.-computeForceLJ_4xn_half +..LNcomputeForceLJ_4xn_half.5: + .data +# -- End computeForceLJ_4xn_half + .text +.L_2__routine_start_computeForceLJ_4xn_full_6: +# -- Begin computeForceLJ_4xn_full + .text +# mark_begin; + .align 16,0x90 + .globl computeForceLJ_4xn_full +# --- computeForceLJ_4xn_full(Parameter *, Atom *, Neighbor *, Stats *) +computeForceLJ_4xn_full: +# parameter 1: %rdi +# parameter 2: %rsi +# parameter 3: %rdx +# parameter 4: %rcx +..B7.1: # Preds ..B7.0 + # Execution count [1.00e+00] + .cfi_startproc +..___tag_value_computeForceLJ_4xn_full.341: +..L342: + #622.96 + pushq %rbp #622.96 + .cfi_def_cfa_offset 16 + movq %rsp, %rbp #622.96 + .cfi_def_cfa 6, 16 + .cfi_offset 6, -16 + andq $-64, %rsp #622.96 + pushq %r12 #622.96 + pushq %r13 #622.96 + pushq %r14 #622.96 + pushq %r15 #622.96 + pushq %rbx #622.96 + subq $1176, %rsp #622.96 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 
0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + movq %rdi, %r15 #622.96 + movl $.L_2__STRING.5, %edi #623.5 + xorl %eax, %eax #623.5 + movq %rcx, %r13 #622.96 + movq %rdx, %r14 #622.96 + movq %rsi, %rbx #622.96 +..___tag_value_computeForceLJ_4xn_full.351: +# debug_printf(const char *, ...) + call debug_printf #623.5 +..___tag_value_computeForceLJ_4xn_full.352: + # LOE rbx r12 r13 r14 r15 +..B7.2: # Preds ..B7.1 + # Execution count [1.00e+00] + vmovsd 144(%r15), %xmm0 #626.27 + xorl %ecx, %ecx #635.5 + vmulsd %xmm0, %xmm0, %xmm1 #629.36 + xorl %esi, %esi #637.27 + vbroadcastsd 56(%r15), %zmm3 #630.32 + vbroadcastsd 40(%r15), %zmm4 #631.29 + vbroadcastsd %xmm1, %zmm2 #629.36 + vmovups %zmm3, 64(%rsp) #630.32[spill] + vmovups %zmm4, 128(%rsp) #631.29[spill] + vmovups %zmm2, 192(%rsp) #629.36[spill] + movl 20(%rbx), %edx #635.26 + testl %edx, %edx #635.26 + jle ..B7.24 # Prob 9% #635.26 + # LOE rbx rsi r12 r13 r14 edx ecx +..B7.3: # Preds ..B7.2 + # Execution count [9.00e-01] + movq 176(%rbx), %rdi #637.27 + movq 192(%rbx), %rax #638.32 + vxorpd %ymm2, %ymm2, %ymm2 #639.39 + vmovdqu .L_2il0floatpacket.0(%rip), %xmm1 #638.9 + vmovdqu .L_2il0floatpacket.1(%rip), %xmm0 #638.9 + # LOE rax rbx rsi rdi r13 r14 edx ecx xmm0 xmm1 ymm2 +..B7.4: # Preds ..B7.22 ..B7.3 + # Execution count [5.00e+00] + movl %ecx, %r8d #636.27 + movl %ecx, %r9d #636.27 + sarl $1, %r8d #636.27 + andl $1, %r9d #636.27 + shll $2, %r9d #636.27 + lea (%r8,%r8,2), %r10d #636.27 + lea (%r9,%r10,8), %r11d #636.27 + movslq %r11d, %r11 #637.27 + lea (%rdi,%r11,8), %r12 #637.27 + movl (%rsi,%rax), %r11d #638.32 + testl %r11d, %r11d #638.32 + jle ..B7.22 # Prob 50% #638.32 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r11d xmm0 xmm1 ymm2 +..B7.5: # Preds ..B7.4 + # Execution count [4.50e+00] + cmpl $16, %r11d #638.9 + jl ..B7.38 # Prob 10% #638.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r11d xmm0 xmm1 ymm2 +..B7.6: # Preds ..B7.5 + # Execution count [4.50e+00] + lea 128(%r12), %r8 #641.13 + andq $63, %r8 #638.9 + testl $7, %r8d #638.9 + je ..B7.8 # Prob 50% #638.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r11d xmm0 xmm1 ymm2 +..B7.7: # Preds ..B7.6 + # Execution count [2.25e+00] + xorl %r8d, %r8d #638.9 + jmp ..B7.10 # Prob 100% #638.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r11d xmm0 xmm1 ymm2 +..B7.8: # Preds ..B7.6 + # Execution count [2.25e+00] + testl %r8d, %r8d #638.9 + je ..B7.10 # Prob 50% #638.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r11d xmm0 xmm1 ymm2 +..B7.9: # Preds ..B7.8 + # Execution count [2.50e+01] + negl %r8d #638.9 + addl $64, %r8d #638.9 + shrl $3, %r8d #638.9 + cmpl %r8d, %r11d #638.9 + cmovl %r11d, %r8d #638.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r11d xmm0 xmm1 ymm2 +..B7.10: # Preds ..B7.7 ..B7.9 ..B7.8 + # Execution count [5.00e+00] + movl %r11d, %r10d #638.9 + subl %r8d, %r10d #638.9 + andl $15, %r10d #638.9 + negl %r10d #638.9 + addl %r11d, %r10d #638.9 + cmpl $1, %r8d #638.9 + jb ..B7.14 # Prob 50% #638.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r10d r11d xmm0 xmm1 ymm2 +..B7.11: # Preds ..B7.10 + # Execution count [4.50e+00] + vpbroadcastd %r8d, %xmm3 #638.9 + xorl %r15d, %r15d #638.9 + vmovdqa %xmm0, %xmm4 #638.9 + movslq %r8d, %r9 #638.9 + # LOE rax rbx rsi rdi r9 r12 r13 r14 r15 edx ecx r8d r10d r11d xmm0 xmm1 xmm3 xmm4 ymm2 +..B7.12: # Preds ..B7.12 ..B7.11 + # Execution count [2.50e+01] + vpcmpgtd %xmm4, %xmm3, %k1 #638.9 + vpaddd %xmm1, %xmm4, 
%xmm4 #638.9 + vmovupd %ymm2, (%r12,%r15,8){%k1} #639.13 + vmovupd %ymm2, 64(%r12,%r15,8){%k1} #640.13 + vmovupd %ymm2, 128(%r12,%r15,8){%k1} #641.13 + addq $4, %r15 #638.9 + cmpq %r9, %r15 #638.9 + jb ..B7.12 # Prob 82% #638.9 + # LOE rax rbx rsi rdi r9 r12 r13 r14 r15 edx ecx r8d r10d r11d xmm0 xmm1 xmm3 xmm4 ymm2 +..B7.13: # Preds ..B7.12 + # Execution count [4.50e+00] + cmpl %r8d, %r11d #638.9 + je ..B7.22 # Prob 10% #638.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r10d r11d xmm0 xmm1 ymm2 +..B7.14: # Preds ..B7.10 ..B7.13 + # Execution count [2.50e+01] + lea 16(%r8), %r9d #638.9 + cmpl %r9d, %r10d #638.9 + jl ..B7.18 # Prob 50% #638.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r8d r10d r11d xmm0 xmm1 ymm2 +..B7.15: # Preds ..B7.14 + # Execution count [4.50e+00] + movslq %r8d, %r8 #638.9 + movslq %r10d, %r9 #638.9 + .align 16,0x90 + # LOE rax rbx rsi rdi r8 r9 r12 r13 r14 edx ecx r10d r11d xmm0 xmm1 ymm2 +..B7.16: # Preds ..B7.16 ..B7.15 + # Execution count [2.50e+01] + vmovupd %ymm2, (%r12,%r8,8) #639.13 + vmovupd %ymm2, 32(%r12,%r8,8) #639.13 + vmovupd %ymm2, 64(%r12,%r8,8) #639.13 + vmovupd %ymm2, 128(%r12,%r8,8) #640.13 + vmovupd %ymm2, 192(%r12,%r8,8) #641.13 + vmovupd %ymm2, 96(%r12,%r8,8) #639.13 + vmovupd %ymm2, 160(%r12,%r8,8) #640.13 + vmovupd %ymm2, 224(%r12,%r8,8) #641.13 + addq $16, %r8 #638.9 + cmpq %r9, %r8 #638.9 + jb ..B7.16 # Prob 82% #638.9 + # LOE rax rbx rsi rdi r8 r9 r12 r13 r14 edx ecx r10d r11d xmm0 xmm1 ymm2 +..B7.18: # Preds ..B7.16 ..B7.14 ..B7.38 + # Execution count [5.00e+00] + lea 1(%r10), %r8d #638.9 + cmpl %r11d, %r8d #638.9 + ja ..B7.22 # Prob 50% #638.9 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r10d r11d xmm0 xmm1 ymm2 +..B7.19: # Preds ..B7.18 + # Execution count [4.50e+00] + movslq %r10d, %r9 #639.13 + negl %r10d #638.9 + addl %r11d, %r10d #638.9 + xorl %r8d, %r8d #638.9 + movslq %r11d, %r11 #638.9 + vmovdqa %xmm0, %xmm4 #638.9 + vpbroadcastd %r10d, %xmm3 #638.9 + subq %r9, %r11 #638.9 + lea (%r12,%r9,8), %r12 #639.13 + # LOE rax rbx rsi rdi r8 r11 r12 r13 r14 edx ecx xmm0 xmm1 xmm3 xmm4 ymm2 +..B7.20: # Preds ..B7.20 ..B7.19 + # Execution count [2.50e+01] + vpcmpgtd %xmm4, %xmm3, %k1 #638.9 + vpaddd %xmm1, %xmm4, %xmm4 #638.9 + vmovupd %ymm2, (%r12,%r8,8){%k1} #639.13 + vmovupd %ymm2, 64(%r12,%r8,8){%k1} #640.13 + vmovupd %ymm2, 128(%r12,%r8,8){%k1} #641.13 + addq $4, %r8 #638.9 + cmpq %r11, %r8 #638.9 + jb ..B7.20 # Prob 82% #638.9 + # LOE rax rbx rsi rdi r8 r11 r12 r13 r14 edx ecx xmm0 xmm1 xmm3 xmm4 ymm2 +..B7.22: # Preds ..B7.20 ..B7.4 ..B7.13 ..B7.18 + # Execution count [5.00e+00] + incl %ecx #635.5 + addq $56, %rsi #635.5 + cmpl %edx, %ecx #635.5 + jb ..B7.4 # Prob 82% #635.5 + # LOE rax rbx rsi rdi r13 r14 edx ecx xmm0 xmm1 ymm2 +..B7.24: # Preds ..B7.22 ..B7.2 + # Execution count [1.00e+00] + xorl %eax, %eax #645.16 + vzeroupper #645.16 +..___tag_value_computeForceLJ_4xn_full.356: +# getTimeStamp() + call getTimeStamp #645.16 +..___tag_value_computeForceLJ_4xn_full.357: + # LOE rbx r12 r13 r14 xmm0 +..B7.41: # Preds ..B7.24 + # Execution count [1.00e+00] + vmovsd %xmm0, (%rsp) #645.16[spill] + # LOE rbx r12 r13 r14 +..B7.25: # Preds ..B7.41 + # Execution count [1.00e+00] + movl $.L_2__STRING.1, %edi #649.5 +..___tag_value_computeForceLJ_4xn_full.359: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #649.5 +..___tag_value_computeForceLJ_4xn_full.360: + # LOE rbx r12 r13 r14 +..B7.26: # Preds ..B7.25 + # Execution count [1.00e+00] + xorl %r11d, %r11d #652.16 + xorl %r15d, %r15d #652.16 + cmpl $0, 20(%rbx) 
#652.26 + jle ..B7.34 # Prob 10% #652.26 + # LOE rbx r12 r13 r14 r15 r11d +..B7.27: # Preds ..B7.26 + # Execution count [9.00e-01] + movl $65450, %eax #769.9 + kmovw %eax, %k3 #769.9 + movl $65520, %eax #769.9 + kmovw %eax, %k4 #769.9 + vbroadcastsd .L_2il0floatpacket.2(%rip), %zmm1 #769.9 + vbroadcastsd .L_2il0floatpacket.3(%rip), %zmm10 #769.9 + movl $12, %eax #769.9 + kmovw %eax, %k2 #769.9 + vpxord %zmm13, %zmm13, %zmm13 #675.30 + # LOE rbx r13 r14 r15 r11d zmm1 zmm10 zmm13 k2 k3 k4 +..B7.28: # Preds ..B7.32 ..B7.27 + # Execution count [5.00e+00] + vmovaps %zmm13, %zmm26 #675.30 + movl %r11d, %ecx #657.27 + vmovaps %zmm26, %zmm12 #676.30 + movl %r11d, %r9d #657.27 + vmovaps %zmm12, %zmm9 #677.30 + andl $1, %r9d #657.27 + sarl $1, %ecx #657.27 + vmovaps %zmm9, %zmm8 #678.30 + shll $2, %r9d #657.27 + movl 16(%r14), %eax #660.44 + imull %r11d, %eax #660.44 + lea (%rcx,%rcx,2), %r10d #657.27 + vmovaps %zmm8, %zmm7 #679.30 + lea (%r9,%r10,8), %ecx #657.27 + vmovaps %zmm7, %zmm6 #680.30 + vmovaps %zmm6, %zmm5 #681.30 + vmovaps %zmm5, %zmm4 #682.30 + movslq %ecx, %rcx #657.27 + movslq %eax, %rax #660.19 + movq 24(%r14), %rsi #661.25 + vmovaps %zmm4, %zmm11 #683.30 + movq 8(%r14), %rdx #660.19 + movq 160(%rbx), %r8 #658.27 + vmovaps %zmm11, %zmm3 #684.30 + vmovaps %zmm3, %zmm2 #685.30 + lea (%rdx,%rax,4), %r10 #660.19 + movslq (%rsi,%r15,4), %rdi #661.25 + xorl %esi, %esi #688.19 + vmovaps %zmm2, %zmm0 #686.30 + vbroadcastsd (%r8,%rcx,8), %zmm25 #663.33 + vbroadcastsd 8(%r8,%rcx,8), %zmm24 #664.33 + vbroadcastsd 16(%r8,%rcx,8), %zmm23 #665.33 + vbroadcastsd 24(%r8,%rcx,8), %zmm22 #666.33 + vbroadcastsd 64(%r8,%rcx,8), %zmm21 #667.33 + vbroadcastsd 72(%r8,%rcx,8), %zmm20 #668.33 + vbroadcastsd 80(%r8,%rcx,8), %zmm19 #669.33 + vbroadcastsd 88(%r8,%rcx,8), %zmm18 #670.33 + vbroadcastsd 128(%r8,%rcx,8), %zmm17 #671.33 + vbroadcastsd 136(%r8,%rcx,8), %zmm16 #672.33 + vbroadcastsd 144(%r8,%rcx,8), %zmm15 #673.33 + vbroadcastsd 152(%r8,%rcx,8), %zmm14 #674.33 + movq 176(%rbx), %r9 #659.27 + testq %rdi, %rdi #688.28 + jle ..B7.32 # Prob 10% #688.28 + # LOE rcx rbx rsi rdi r8 r9 r10 r13 r14 r15 r11d zmm0 zmm1 zmm2 zmm3 zmm4 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm11 zmm12 zmm13 zmm14 zmm15 zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 zmm24 zmm25 zmm26 k2 k3 k4 +..B7.29: # Preds ..B7.28 + # Execution count [4.50e+00] + vmovups %zmm12, 1088(%rsp) #[spill] + vmovups %zmm26, 1024(%rsp) #[spill] + vmovups %zmm14, 768(%rsp) #[spill] + vmovups %zmm15, 960(%rsp) #[spill] + vmovups %zmm16, 832(%rsp) #[spill] + vmovups %zmm17, 896(%rsp) #[spill] + vmovups %zmm18, 320(%rsp) #[spill] + vmovups %zmm19, 704(%rsp) #[spill] + vmovups %zmm20, 384(%rsp) #[spill] + vmovups %zmm21, 512(%rsp) #[spill] + vmovups %zmm22, 640(%rsp) #[spill] + vmovups %zmm23, 448(%rsp) #[spill] + vmovups %zmm24, 576(%rsp) #[spill] + vmovups %zmm25, 256(%rsp) #[spill] + movq %r13, 8(%rsp) #[spill] + # LOE rcx rbx rsi rdi r8 r9 r10 r14 r15 r11d zmm0 zmm1 zmm2 zmm3 zmm4 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm11 k2 k3 k4 +movl $111, %ebx # OSACA START MARKER +.byte 100 # OSACA START MARKER +.byte 103 # OSACA START MARKER +.byte 144 # OSACA START MARKER +# pointer_increment=64 682a899d0c46457e098cba76bedab789 +# LLVM-MCA-BEGIN +..B7.30: # Preds ..B7.30 ..B7.29 + # Execution count [2.50e+01] + movl (%r10,%rsi,4), %edx #689.22 + incq %rsi #688.39 + vmovups 896(%rsp), %zmm20 #697.35[spill] + vmovups 832(%rsp), %zmm25 #700.35[spill] + vmovups 448(%rsp), %zmm24 #701.35[spill] + vmovups 704(%rsp), %zmm23 #702.35[spill] + vmovups 960(%rsp), %zmm16 #703.35[spill] 
+ vmovups 768(%rsp), %zmm14 #706.35[spill] + vmovups 576(%rsp), %zmm15 #698.35[spill] + vmovups 384(%rsp), %zmm12 #699.35[spill] + vmovups 512(%rsp), %zmm21 #696.35[spill] + vmovups 320(%rsp), %zmm18 #705.35[spill] + vmovups 256(%rsp), %zmm22 #695.35[spill] + vmovups 640(%rsp), %zmm17 #704.35[spill] + lea (%rdx,%rdx,2), %r12d #690.31 + shll $3, %r12d #690.31 + lea (%rdx,%rdx), %r13d #715.56 + movslq %r12d, %r12 #691.31 + cmpl %r11d, %r13d #715.66 + lea 1(%rdx,%rdx), %eax #716.61 + movl $0, %edx #715.66 + sete %dl #715.66 + cmpl %r11d, %eax #716.66 + movl $0, %eax #716.66 + movl %edx, %r13d #717.39 + vsubpd 128(%r8,%r12,8), %zmm20, %zmm29 #697.35 + sete %al #716.66 + vsubpd 128(%r8,%r12,8), %zmm25, %zmm26 #700.35 + vsubpd (%r8,%r12,8), %zmm24, %zmm25 #701.35 + vsubpd 64(%r8,%r12,8), %zmm23, %zmm24 #702.35 + vsubpd 128(%r8,%r12,8), %zmm16, %zmm23 #703.35 + vsubpd 128(%r8,%r12,8), %zmm14, %zmm20 #706.35 + vsubpd 64(%r8,%r12,8), %zmm12, %zmm27 #699.35 + vsubpd (%r8,%r12,8), %zmm15, %zmm28 #698.35 + vsubpd 64(%r8,%r12,8), %zmm21, %zmm30 #696.35 + vsubpd 64(%r8,%r12,8), %zmm18, %zmm21 #705.35 + vsubpd (%r8,%r12,8), %zmm22, %zmm31 #695.35 + vsubpd (%r8,%r12,8), %zmm17, %zmm22 #704.35 + vmulpd %zmm29, %zmm29, %zmm13 #730.80 + vmulpd %zmm26, %zmm26, %zmm15 #731.80 + vmulpd %zmm23, %zmm23, %zmm12 #732.80 + vmulpd %zmm20, %zmm20, %zmm14 #733.80 + vmovups 192(%rsp), %zmm16 #735.67[spill] + vfmadd231pd %zmm30, %zmm30, %zmm13 #730.57 + vfmadd231pd %zmm27, %zmm27, %zmm15 #731.57 + vfmadd231pd %zmm24, %zmm24, %zmm12 #732.57 + vfmadd231pd %zmm21, %zmm21, %zmm14 #733.57 + vfmadd231pd %zmm31, %zmm31, %zmm13 #730.34 + vfmadd231pd %zmm28, %zmm28, %zmm15 #731.34 + vfmadd231pd %zmm25, %zmm25, %zmm12 #732.34 + vfmadd231pd %zmm22, %zmm22, %zmm14 #733.34 + vrcp14pd %zmm13, %zmm19 #740.35 + vrcp14pd %zmm15, %zmm18 #741.35 + vrcp14pd %zmm12, %zmm17 #742.35 + vcmppd $17, %zmm16, %zmm13, %k1 #735.67 + vcmppd $17, %zmm16, %zmm15, %k6 #736.67 + vcmppd $17, %zmm16, %zmm12, %k7 #737.67 + vcmppd $17, %zmm16, %zmm14, %k0 #738.67 + vrcp14pd %zmm14, %zmm15 #743.35 + vmovups 64(%rsp), %zmm16 #745.67[spill] + vmovups 128(%rsp), %zmm12 #750.105[spill] + vmulpd %zmm16, %zmm19, %zmm13 #745.67 + vmulpd %zmm13, %zmm19, %zmm13 #745.51 + negl %r13d #717.39 + vmulpd %zmm13, %zmm19, %zmm14 #745.35 + movl %eax, %r12d #717.39 + vfmsub213pd %zmm1, %zmm19, %zmm13 #750.79 + vmulpd %zmm12, %zmm19, %zmm19 #750.105 + vmulpd %zmm19, %zmm13, %zmm13 #750.70 + addl $255, %r13d #717.39 + vmulpd %zmm13, %zmm14, %zmm14 #750.54 + .byte 144 #755.20 + vmovups 1024(%rsp), %zmm13 #755.20[spill] + vmulpd %zmm14, %zmm10, %zmm19 #750.36 + shll $4, %r12d #717.39 + subl %r12d, %r13d #717.39 + kmovb %r13d, %k5 #717.39 + kmovw %k1, %r13d #735.67 + kmovb %k5, %r12d #717.39 + kmovb %r12d, %k5 #735.41 + kmovb %r13d, %k1 #735.41 + movl %eax, %r13d #718.39 + kandb %k1, %k5, %k5 #735.41 + kmovb %k5, %r12d #735.41 + kmovw %r12d, %k5 #755.20 + lea (%rdx,%rdx), %r12d #718.39 + vfmadd231pd %zmm29, %zmm19, %zmm9{%k5} #757.20 + negl %r12d #718.39 + vfmadd231pd %zmm31, %zmm19, %zmm13{%k5} #755.20 + vmovups 1088(%rsp), %zmm31 #756.20[spill] + vmulpd %zmm16, %zmm18, %zmm29 #746.67 + vfmadd231pd %zmm30, %zmm19, %zmm31{%k5} #756.20 + vmovups %zmm13, 1024(%rsp) #755.20[spill] + vmulpd %zmm29, %zmm18, %zmm30 #746.51 + vmovups %zmm31, 1088(%rsp) #756.20[spill] + vmulpd %zmm30, %zmm18, %zmm13 #746.35 + vfmsub213pd %zmm1, %zmm18, %zmm30 #751.79 + vmulpd %zmm12, %zmm18, %zmm18 #751.105 + vmulpd %zmm18, %zmm30, %zmm14 #751.70 + addl $255, %r12d #718.39 + vmulpd %zmm14, %zmm13, %zmm19 
#751.54 + vmulpd %zmm19, %zmm10, %zmm29 #751.36 + shll $5, %r13d #718.39 + subl %r13d, %r12d #718.39 + kmovb %r12d, %k1 #718.39 + kmovw %k6, %r12d #736.67 + kmovb %k1, %r13d #718.39 + kmovb %r13d, %k1 #736.41 + kmovb %r12d, %k6 #736.41 + movl %eax, %r12d #719.39 + kandb %k6, %k1, %k1 #736.41 + kmovb %k1, %r13d #736.41 + kmovw %r13d, %k1 #758.20 + lea (,%rdx,4), %r13d #719.39 + vfmadd231pd %zmm26, %zmm29, %zmm6{%k1} #760.20 + negl %r13d #719.39 + vfmadd231pd %zmm27, %zmm29, %zmm7{%k1} #759.20 + vfmadd231pd %zmm28, %zmm29, %zmm8{%k1} #758.20 + vmulpd %zmm16, %zmm17, %zmm26 #747.67 + vmulpd %zmm12, %zmm17, %zmm28 #752.105 + vmulpd %zmm16, %zmm15, %zmm16 #748.67 + vmulpd %zmm12, %zmm15, %zmm12 #753.105 + vmulpd %zmm26, %zmm17, %zmm27 #747.51 + vmulpd %zmm16, %zmm15, %zmm19 #748.51 + vmulpd %zmm27, %zmm17, %zmm13 #747.35 + vfmsub213pd %zmm1, %zmm17, %zmm27 #752.79 + vmulpd %zmm28, %zmm27, %zmm14 #752.70 + addl $255, %r13d #719.39 + vmulpd %zmm14, %zmm13, %zmm17 #752.54 + shll $3, %edx #720.39 + shll $6, %r12d #719.39 + negl %edx #720.39 + vmulpd %zmm17, %zmm10, %zmm18 #752.36 + subl %r12d, %r13d #719.39 + kmovb %r13d, %k6 #719.39 + addl $255, %edx #720.39 + shll $7, %eax #720.39 + subl %eax, %edx #720.39 + kmovb %k6, %eax #719.39 + kmovb %eax, %k6 #737.41 + kmovw %k7, %eax #737.67 + kmovb %eax, %k7 #737.41 + kandb %k7, %k6, %k7 #737.41 + kmovb %edx, %k6 #720.39 + kmovb %k7, %edx #737.41 + kmovw %edx, %k7 #761.20 + kmovw %k0, %edx #738.67 + vfmadd231pd %zmm23, %zmm18, %zmm11{%k7} #763.20 + vfmadd231pd %zmm24, %zmm18, %zmm4{%k7} #762.20 + vfmadd231pd %zmm25, %zmm18, %zmm5{%k7} #761.20 + vmulpd %zmm19, %zmm15, %zmm23 #748.35 + vfmsub213pd %zmm1, %zmm15, %zmm19 #753.79 + vmulpd %zmm12, %zmm19, %zmm15 #753.70 + vmulpd %zmm15, %zmm23, %zmm24 #753.54 + vmulpd %zmm24, %zmm10, %zmm25 #753.36 + kmovb %k6, %eax #720.39 + kmovb %eax, %k6 #738.41 + kmovb %edx, %k0 #738.41 + kandb %k0, %k6, %k0 #738.41 + kmovb %k0, %r12d #738.41 + kmovw %r12d, %k6 #764.20 + vfmadd231pd %zmm22, %zmm25, %zmm3{%k6} #764.20 + vfmadd231pd %zmm21, %zmm25, %zmm2{%k6} #765.20 + vfmadd231pd %zmm20, %zmm25, %zmm0{%k6} #766.20 + cmpq %rdi, %rsi #688.28 + jl ..B7.30 # Prob 82% #688.28 +# LLVM-MCA-END +movl $222, %ebx # OSACA END MARKER +.byte 100 # OSACA END MARKER +.byte 103 # OSACA END MARKER +.byte 144 # OSACA END MARKER + # LOE rcx rbx rsi rdi r8 r9 r10 r14 r15 r11d zmm0 zmm1 zmm2 zmm3 zmm4 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm11 zmm31 k2 k3 k4 +..B7.31: # Preds ..B7.30 + # Execution count [4.50e+00] + vmovaps %zmm31, %zmm12 # + vmovups 1024(%rsp), %zmm26 #[spill] + movq 8(%rsp), %r13 #[spill] + vpxord %zmm13, %zmm13, %zmm13 # + # LOE rcx rbx rdi r9 r13 r14 r15 r11d zmm0 zmm1 zmm2 zmm3 zmm4 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm11 zmm12 zmm13 zmm26 k2 k3 k4 +..B7.32: # Preds ..B7.31 ..B7.28 + # Execution count [5.00e+00] + vpermilpd $85, %zmm5, %zmm17 #769.9 + incl %r11d #652.49 + vaddpd %zmm5, %zmm17, %zmm22 #769.9 + incq %r15 #652.49 + vpermilpd $85, %zmm26, %zmm16 #769.9 + vpermilpd $85, %zmm3, %zmm19 #769.9 + vpermilpd $85, %zmm12, %zmm28 #770.9 + vpermilpd $85, %zmm9, %zmm15 #771.9 + vaddpd %zmm26, %zmm16, %zmm20 #769.9 + vaddpd %zmm19, %zmm3, %zmm22{%k3} #769.9 + vaddpd %zmm12, %zmm28, %zmm3 #770.9 + vaddpd %zmm9, %zmm15, %zmm16 #771.9 + valignd $8, %zmm22, %zmm22, %zmm23 #769.9 + vpermilpd $85, %zmm11, %zmm9 #771.9 + vpermilpd $85, %zmm8, %zmm18 #769.9 + vpermilpd $85, %zmm4, %zmm29 #770.9 + vpermilpd $85, %zmm7, %zmm30 #770.9 + vaddpd %zmm11, %zmm9, %zmm17 #771.9 + vaddpd %zmm18, %zmm8, %zmm20{%k3} #769.9 + vaddpd %zmm4, %zmm29, 
%zmm4 #770.9 + vaddpd %zmm30, %zmm7, %zmm3{%k3} #770.9 + valignd $8, %zmm20, %zmm20, %zmm21 #769.9 + vpermilpd $85, %zmm6, %zmm11 #771.9 + vaddpd %zmm11, %zmm6, %zmm16{%k3} #771.9 + vaddpd %zmm21, %zmm20, %zmm24 #769.9 + vpermilpd $85, %zmm2, %zmm31 #770.9 + vpermilpd $85, %zmm0, %zmm6 #771.9 + vaddpd %zmm31, %zmm2, %zmm4{%k3} #770.9 + valignd $8, %zmm3, %zmm3, %zmm2 #770.9 + vaddpd %zmm6, %zmm0, %zmm17{%k3} #771.9 + valignd $8, %zmm16, %zmm16, %zmm0 #771.9 + vaddpd %zmm2, %zmm3, %zmm7 #770.9 + valignd $8, %zmm4, %zmm4, %zmm5 #770.9 + vaddpd %zmm0, %zmm16, %zmm18 #771.9 + vxorpd %xmm0, %xmm0, %xmm0 #775.9 + valignd $8, %zmm17, %zmm17, %zmm16 #771.9 + vaddpd %zmm23, %zmm22, %zmm24{%k4} #769.9 + vaddpd %zmm5, %zmm4, %zmm7{%k4} #770.9 + vaddpd %zmm16, %zmm17, %zmm18{%k4} #771.9 + vshuff64x2 $177, %zmm24, %zmm24, %zmm25 #769.9 + vshuff64x2 $177, %zmm7, %zmm7, %zmm8 #770.9 + vshuff64x2 $177, %zmm18, %zmm18, %zmm19 #771.9 + vaddpd %zmm25, %zmm24, %zmm26 #769.9 + vaddpd %zmm8, %zmm7, %zmm12 #770.9 + vaddpd %zmm19, %zmm18, %zmm20 #771.9 + vshuff64x2 $238, %zmm26, %zmm26, %zmm26{%k2} #769.9 + vshuff64x2 $238, %zmm12, %zmm12, %zmm12{%k2} #770.9 + vshuff64x2 $238, %zmm20, %zmm20, %zmm20{%k2} #771.9 + vaddpd (%r9,%rcx,8), %ymm26, %ymm27 #769.9 + vaddpd 64(%r9,%rcx,8), %ymm12, %ymm14 #770.9 + vaddpd 128(%r9,%rcx,8), %ymm20, %ymm21 #771.9 + vmovupd %ymm27, (%r9,%rcx,8) #769.9 + vmovupd %ymm14, 64(%r9,%rcx,8) #770.9 + vmovupd %ymm21, 128(%r9,%rcx,8) #771.9 + addq %rdi, 8(%r13) #774.9 + vcvtsi2sd %edi, %xmm0, %xmm0 #775.9 + vcvttsd2si %xmm0, %rax #775.9 + incq (%r13) #773.9 + addq %rax, 16(%r13) #775.9 + cmpl 20(%rbx), %r11d #652.26 + jl ..B7.28 # Prob 82% #652.26 + # LOE rbx r13 r14 r15 r11d zmm1 zmm10 zmm13 k2 k3 k4 +..B7.34: # Preds ..B7.32 ..B7.26 + # Execution count [1.00e+00] + movl $.L_2__STRING.1, %edi #779.5 + vzeroupper #779.5 +..___tag_value_computeForceLJ_4xn_full.397: +# likwid_markerStopRegion(const char *) + call likwid_markerStopRegion #779.5 +..___tag_value_computeForceLJ_4xn_full.398: + # LOE r12 +..B7.35: # Preds ..B7.34 + # Execution count [1.00e+00] + xorl %eax, %eax #782.16 +..___tag_value_computeForceLJ_4xn_full.399: +# getTimeStamp() + call getTimeStamp #782.16 +..___tag_value_computeForceLJ_4xn_full.400: + # LOE r12 xmm0 +..B7.42: # Preds ..B7.35 + # Execution count [1.00e+00] + vmovsd %xmm0, 8(%rsp) #782.16[spill] + # LOE r12 +..B7.36: # Preds ..B7.42 + # Execution count [1.00e+00] + movl $.L_2__STRING.6, %edi #783.5 + xorl %eax, %eax #783.5 +..___tag_value_computeForceLJ_4xn_full.402: +# debug_printf(const char *, ...) 
+ call debug_printf #783.5 +..___tag_value_computeForceLJ_4xn_full.403: + # LOE r12 +..B7.37: # Preds ..B7.36 + # Execution count [1.00e+00] + vmovsd 8(%rsp), %xmm0 #784.14[spill] + vsubsd (%rsp), %xmm0, %xmm0 #784.14[spill] + addq $1176, %rsp #784.14 + .cfi_restore 3 + popq %rbx #784.14 + .cfi_restore 15 + popq %r15 #784.14 + .cfi_restore 14 + popq %r14 #784.14 + .cfi_restore 13 + popq %r13 #784.14 + .cfi_restore 12 + popq %r12 #784.14 + movq %rbp, %rsp #784.14 + popq %rbp #784.14 + .cfi_def_cfa 7, 8 + .cfi_restore 6 + ret #784.14 + .cfi_def_cfa 6, 16 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22 + .cfi_offset 6, -16 + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + # LOE +..B7.38: # Preds ..B7.5 + # Execution count [4.50e-01]: Infreq + xorl %r10d, %r10d #638.9 + jmp ..B7.18 # Prob 100% #638.9 + .align 16,0x90 + # LOE rax rbx rsi rdi r12 r13 r14 edx ecx r10d r11d xmm0 xmm1 ymm2 + .cfi_endproc +# mark_end; + .type computeForceLJ_4xn_full,@function + .size computeForceLJ_4xn_full,.-computeForceLJ_4xn_full +..LNcomputeForceLJ_4xn_full.6: + .data +# -- End computeForceLJ_4xn_full + .section .rodata, "a" + .align 64 + .align 64 +.L_2il0floatpacket.5: + .long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f + .type .L_2il0floatpacket.5,@object + .size .L_2il0floatpacket.5,64 + .align 16 +.L_2il0floatpacket.0: + .long 0x00000004,0x00000004,0x00000004,0x00000004 + .type .L_2il0floatpacket.0,@object + .size .L_2il0floatpacket.0,16 + .align 16 +.L_2il0floatpacket.1: + .long 0x00000000,0x00000001,0x00000002,0x00000003 + .type .L_2il0floatpacket.1,@object + .size .L_2il0floatpacket.1,16 + .align 8 +.L_2il0floatpacket.2: + .long 0x00000000,0x3fe00000 + .type .L_2il0floatpacket.2,@object + .size .L_2il0floatpacket.2,8 + .align 8 +.L_2il0floatpacket.3: + .long 0x00000000,0x40480000 + .type .L_2il0floatpacket.3,@object + .size .L_2il0floatpacket.3,8 + .align 8 +.L_2il0floatpacket.4: + .long 0x00000000,0x3ff00000 + .type .L_2il0floatpacket.4,@object + .size .L_2il0floatpacket.4,8 + .section .rodata.str1.4, "aMS",@progbits,1 + .align 4 + .align 4 +.L_2__STRING.0: + .long 1886220131 + .long 1181054069 + .long 1701016175 + .long 1646283340 + .long 1852401509 + .word 10 + .type .L_2__STRING.0,@object + .size .L_2__STRING.0,22 + .space 2, 0x00 # pad + .align 4 +.L_2__STRING.1: + .long 1668444006 + .word 101 + .type .L_2__STRING.1,@object + .size .L_2__STRING.1,6 + .space 2, 0x00 # pad + .align 4 +.L_2__STRING.2: + .long 1886220131 + .long 1181054069 + .long 1701016175 + .long 1696614988 + .long 681070 + .type .L_2__STRING.2,@object + .size .L_2__STRING.2,20 + .align 4 +.L_2__STRING.3: + .long 1886220131 + .long 1181054069 + .long 1701016175 + .long 845105740 + .long 544108152 + .long 1768383842 + .word 2670 + .byte 0 + .type .L_2__STRING.3,@object + .size .L_2__STRING.3,27 + .space 1, 0x00 # pad + .align 4 +.L_2__STRING.4: + .long 1886220131 + .long 1181054069 + .long 1701016175 + .long 845105740 + .long 544108152 
+ .long 174354021 + .byte 0 + .type .L_2__STRING.4,@object + .size .L_2__STRING.4,25 + .space 3, 0x00 # pad + .align 4 +.L_2__STRING.5: + .long 1886220131 + .long 1181054069 + .long 1701016175 + .long 878660172 + .long 1646292600 + .long 1852401509 + .word 10 + .type .L_2__STRING.5,@object + .size .L_2__STRING.5,26 + .space 2, 0x00 # pad + .align 4 +.L_2__STRING.6: + .long 1886220131 + .long 1181054069 + .long 1701016175 + .long 878660172 + .long 1696624248 + .long 681070 + .type .L_2__STRING.6,@object + .size .L_2__STRING.6,24 + .data + .section .note.GNU-stack, "" +# End diff --git a/static_analysis/jan/gromacs-icc-avx512-sp.o b/static_analysis/jan/gromacs-icc-avx512-sp.o new file mode 100644 index 0000000..44b01df Binary files /dev/null and b/static_analysis/jan/gromacs-icc-avx512-sp.o differ diff --git a/static_analysis/jan/gromacs-icc-avx512-sp.s b/static_analysis/jan/gromacs-icc-avx512-sp.s new file mode 100644 index 0000000..5cc82ef --- /dev/null +++ b/static_analysis/jan/gromacs-icc-avx512-sp.s @@ -0,0 +1,4018 @@ +# mark_description "Intel(R) C Intel(R) 64 Compiler Classic for applications running on Intel(R) 64, Version 2021.6.0 Build 2022"; +# mark_description "0226_000000"; +# mark_description "-I/apps/likwid/5.2.2/include -I././gromacs/includes -I././common/includes -S -std=c11 -pedantic-errors -D_GN"; +# mark_description "U_SOURCE -DLIKWID_PERFMON -DAOS -DPRECISION=1 -DCOMPUTE_STATS -DVECTOR_WIDTH=16 -D__ISA_AVX512__ -DENABLE_OM"; +# mark_description "P_SIMD -DALIGNMENT=64 -restrict -Ofast -xCORE-AVX512 -qopt-zmm-usage=high -o build-gromacs-ICC-AVX512-SP/for"; +# mark_description "ce_lj.s"; + .file "force_lj.c" + .text +..TXTST0: +.L_2__routine_start_computeForceLJ_ref_0: +# -- Begin computeForceLJ_ref + .text +# mark_begin; + .align 16,0x90 + .globl computeForceLJ_ref +# --- computeForceLJ_ref(Parameter *, Atom *, Neighbor *, Stats *) +computeForceLJ_ref: +# parameter 1: %rdi +# parameter 2: %rsi +# parameter 3: %rdx +# parameter 4: %rcx +..B1.1: # Preds ..B1.0 + # Execution count [1.00e+00] + .cfi_startproc +..___tag_value_computeForceLJ_ref.1: +..L2: + #19.91 + pushq %rbp #19.91 + .cfi_def_cfa_offset 16 + movq %rsp, %rbp #19.91 + .cfi_def_cfa 6, 16 + .cfi_offset 6, -16 + andq $-64, %rsp #19.91 + pushq %r12 #19.91 + pushq %r13 #19.91 + pushq %r14 #19.91 + pushq %r15 #19.91 + pushq %rbx #19.91 + subq $152, %rsp #19.91 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + movq %rdi, %r15 #19.91 + movl $.L_2__STRING.1, %edi #20.5 + xorl %eax, %eax #20.5 + movq %rcx, %rbx #19.91 + movq %rdx, %r13 #19.91 + movq %rsi, %r14 #19.91 +..___tag_value_computeForceLJ_ref.11: +# debug_printf(const char *, ...) 
+ call debug_printf #20.5 +..___tag_value_computeForceLJ_ref.12: + # LOE rbx r12 r13 r14 r15 +..B1.2: # Preds ..B1.1 + # Execution count [1.00e+00] + vmovss 108(%r15), %xmm16 #23.27 + xorl %ecx, %ecx #30.5 + vmulss %xmm16, %xmm16, %xmm0 #23.45 + xorl %esi, %esi #32.27 + vmovss 48(%r15), %xmm1 #24.23 + vmovss 40(%r15), %xmm2 #25.24 + movl 20(%r14), %edx #30.26 + vmovss %xmm0, 16(%rsp) #23.45[spill] + vmovss %xmm1, 8(%rsp) #24.23[spill] + vmovss %xmm2, 24(%rsp) #25.24[spill] + testl %edx, %edx #30.26 + jle ..B1.23 # Prob 9% #30.26 + # LOE rbx rsi r12 r13 r14 edx ecx +..B1.3: # Preds ..B1.2 + # Execution count [9.00e-01] + movq 176(%r14), %rdi #32.27 + xorl %r12d, %r12d #33.32 + movq 192(%r14), %rax #33.32 + vxorps %xmm2, %xmm2, %xmm2 #34.39 + vmovdqu .L_2il0floatpacket.0(%rip), %xmm1 #33.9 + vmovdqu .L_2il0floatpacket.1(%rip), %xmm0 #33.9 + # LOE rax rbx rsi rdi r13 r14 edx ecx r12d xmm0 xmm1 xmm2 +..B1.4: # Preds ..B1.21 ..B1.3 + # Execution count [5.00e+00] + movl %ecx, %r8d #31.27 + movl %ecx, %r9d #31.27 + sarl $1, %r8d #31.27 + andl $1, %r9d #31.27 + shll $2, %r9d #31.27 + lea (%r8,%r8,2), %r10d #31.27 + lea (%r9,%r10,8), %r11d #31.27 + movslq %r11d, %r11 #32.27 + lea (%rdi,%r11,4), %r15 #32.27 + movl (%rsi,%rax), %r11d #33.32 + testl %r11d, %r11d #33.32 + jle ..B1.21 # Prob 50% #33.32 + # LOE rax rbx rsi rdi r13 r14 r15 edx ecx r11d r12d xmm0 xmm1 xmm2 +..B1.5: # Preds ..B1.4 + # Execution count [4.50e+00] + cmpl $8, %r11d #33.9 + jl ..B1.152 # Prob 10% #33.9 + # LOE rax rbx rsi rdi r13 r14 r15 edx ecx r11d r12d xmm0 xmm1 xmm2 +..B1.6: # Preds ..B1.5 + # Execution count [4.50e+00] + lea 64(%r15), %r8 #36.13 + andq $15, %r8 #33.9 + testl $3, %r8d #33.9 + je ..B1.8 # Prob 50% #33.9 + # LOE rax rbx rsi rdi r13 r14 r15 edx ecx r8d r11d r12d xmm0 xmm1 xmm2 +..B1.7: # Preds ..B1.6 + # Execution count [2.25e+00] + movl %r12d, %r8d #33.9 + jmp ..B1.9 # Prob 100% #33.9 + # LOE rax rbx rsi rdi r8 r13 r14 r15 edx ecx r11d r12d xmm0 xmm1 xmm2 +..B1.8: # Preds ..B1.6 + # Execution count [2.25e+00] + movl %r8d, %r9d #33.9 + negl %r9d #33.9 + addl $16, %r9d #33.9 + shrl $2, %r9d #33.9 + testl %r8d, %r8d #33.9 + cmovne %r9d, %r8d #33.9 + # LOE rax rbx rsi rdi r8 r13 r14 r15 edx ecx r11d r12d xmm0 xmm1 xmm2 +..B1.9: # Preds ..B1.7 ..B1.8 + # Execution count [4.50e+00] + lea 8(%r8), %r9d #33.9 + cmpl %r9d, %r11d #33.9 + jl ..B1.152 # Prob 10% #33.9 + # LOE rax rbx rsi rdi r8 r13 r14 r15 edx ecx r11d r12d xmm0 xmm1 xmm2 +..B1.10: # Preds ..B1.9 + # Execution count [5.00e+00] + movl %r11d, %r10d #33.9 + xorl %r9d, %r9d #33.9 + subl %r8d, %r10d #33.9 + andl $7, %r10d #33.9 + negl %r10d #33.9 + addl %r11d, %r10d #33.9 + cmpl $1, %r8d #33.9 + jb ..B1.14 # Prob 10% #33.9 + # LOE rax rbx rsi rdi r8 r9 r13 r14 r15 edx ecx r10d r11d r12d xmm0 xmm1 xmm2 +..B1.12: # Preds ..B1.10 ..B1.12 + # Execution count [2.50e+01] + movl %r12d, (%r15,%r9,4) #34.13 + movl %r12d, 32(%r15,%r9,4) #35.13 + movl %r12d, 64(%r15,%r9,4) #36.13 + incq %r9 #33.9 + cmpq %r8, %r9 #33.9 + jb ..B1.12 # Prob 82% #33.9 + # LOE rax rbx rsi rdi r8 r9 r13 r14 r15 edx ecx r10d r11d r12d xmm0 xmm1 xmm2 +..B1.14: # Preds ..B1.12 ..B1.10 + # Execution count [4.50e+00] + movslq %r10d, %r9 #33.9 + # LOE rax rbx rsi rdi r8 r9 r13 r14 r15 edx ecx r10d r11d r12d xmm0 xmm1 xmm2 +..B1.15: # Preds ..B1.15 ..B1.14 + # Execution count [2.50e+01] + vmovups %xmm2, (%r15,%r8,4) #34.13 + vmovups %xmm2, 32(%r15,%r8,4) #35.13 + vmovups %xmm2, 64(%r15,%r8,4) #36.13 + vmovups %xmm2, 16(%r15,%r8,4) #34.13 + vmovups %xmm2, 48(%r15,%r8,4) #35.13 + vmovups %xmm2, 
80(%r15,%r8,4) #36.13 + addq $8, %r8 #33.9 + cmpq %r9, %r8 #33.9 + jb ..B1.15 # Prob 82% #33.9 + # LOE rax rbx rsi rdi r8 r9 r13 r14 r15 edx ecx r10d r11d r12d xmm0 xmm1 xmm2 +..B1.17: # Preds ..B1.15 ..B1.152 + # Execution count [5.00e+00] + lea 1(%r10), %r8d #33.9 + cmpl %r11d, %r8d #33.9 + ja ..B1.21 # Prob 50% #33.9 + # LOE rax rbx rsi rdi r13 r14 r15 edx ecx r10d r11d r12d xmm0 xmm1 xmm2 +..B1.18: # Preds ..B1.17 + # Execution count [4.50e+00] + movslq %r10d, %r9 #34.13 + negl %r10d #33.9 + addl %r11d, %r10d #33.9 + xorl %r8d, %r8d #33.9 + movslq %r11d, %r11 #33.9 + vmovdqa %xmm0, %xmm4 #33.9 + vpbroadcastd %r10d, %xmm3 #33.9 + subq %r9, %r11 #33.9 + lea (%r15,%r9,4), %r15 #34.13 + # LOE rax rbx rsi rdi r8 r11 r13 r14 r15 edx ecx r12d xmm0 xmm1 xmm2 xmm3 xmm4 +..B1.19: # Preds ..B1.19 ..B1.18 + # Execution count [2.50e+01] + vpcmpgtd %xmm4, %xmm3, %k1 #33.9 + vpaddd %xmm1, %xmm4, %xmm4 #33.9 + vmovups %xmm2, (%r15,%r8,4){%k1} #34.13 + vmovups %xmm2, 32(%r15,%r8,4){%k1} #35.13 + vmovups %xmm2, 64(%r15,%r8,4){%k1} #36.13 + addq $4, %r8 #33.9 + cmpq %r11, %r8 #33.9 + jb ..B1.19 # Prob 82% #33.9 + # LOE rax rbx rsi rdi r8 r11 r13 r14 r15 edx ecx r12d xmm0 xmm1 xmm2 xmm3 xmm4 +..B1.21: # Preds ..B1.19 ..B1.4 ..B1.17 + # Execution count [5.00e+00] + incl %ecx #30.5 + addq $28, %rsi #30.5 + cmpl %edx, %ecx #30.5 + jb ..B1.4 # Prob 82% #30.5 + # LOE rax rbx rsi rdi r13 r14 edx ecx r12d xmm0 xmm1 xmm2 +..B1.23: # Preds ..B1.21 ..B1.2 + # Execution count [1.00e+00] + xorl %eax, %eax #40.16 +..___tag_value_computeForceLJ_ref.16: +# getTimeStamp() + call getTimeStamp #40.16 +..___tag_value_computeForceLJ_ref.17: + # LOE rbx r12 r13 r14 xmm0 +..B1.156: # Preds ..B1.23 + # Execution count [1.00e+00] + vmovsd %xmm0, (%rsp) #40.16[spill] + # LOE rbx r12 r13 r14 +..B1.24: # Preds ..B1.156 + # Execution count [1.00e+00] + movl $.L_2__STRING.2, %edi #44.5 +..___tag_value_computeForceLJ_ref.19: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #44.5 +..___tag_value_computeForceLJ_ref.20: + # LOE rbx r12 r13 r14 +..B1.25: # Preds ..B1.24 + # Execution count [9.00e-01] + movl 20(%r14), %eax #47.26 + movl %eax, 56(%rsp) #47.26[spill] + testl %eax, %eax #47.26 + jle ..B1.148 # Prob 0% #47.26 + # LOE rbx r12 r13 r14 +..B1.26: # Preds ..B1.25 + # Execution count [9.00e-01] + xorl %edx, %edx #47.5 + movq 160(%r14), %r10 #51.27 + movq 176(%r14), %r9 #52.27 + movq 8(%r13), %rdi #53.19 + movslq 16(%r13), %r8 #53.44 + movq 24(%r13), %r14 #54.25 + movl 32(%r13), %r11d #77.28 + movq (%rbx), %rcx #122.9 + movq 8(%rbx), %rsi #123.9 + movq 16(%rbx), %rax #124.9 + movl 56(%rsp), %r13d #47.5[spill] + # LOE rax rcx rbx rsi rdi r8 r9 r10 r12 r14 edx r11d r13d +..B1.27: # Preds ..B1.27 ..B1.26 + # Execution count [5.00e+00] + incl %edx #47.5 + incq %rcx #122.9 + cmpl %r13d, %edx #47.5 + jb ..B1.27 # Prob 82% #47.5 + # LOE rax rcx rbx rsi rdi r8 r9 r10 r12 r14 edx r11d r13d +..B1.28: # Preds ..B1.27 + # Execution count [9.00e-01] + movq %rcx, (%rbx) #122.9 + xorl %ecx, %ecx #47.5 + vmovss 24(%rsp), %xmm0 #91.54[spill] + xorl %edx, %edx #48.22 + vmovss 8(%rsp), %xmm8 #48.22[spill] + vmovss 16(%rsp), %xmm11 #48.22[spill] + vmovsd .L_2il0floatpacket.2(%rip), %xmm9 #124.9 + vmovss .L_2il0floatpacket.5(%rip), %xmm10 #89.44 + vmulss .L_2il0floatpacket.3(%rip), %xmm0, %xmm7 #91.54 + vmovss .L_2il0floatpacket.4(%rip), %xmm3 #91.67 + movl %r11d, 32(%rsp) #48.22[spill] + movq %r14, 64(%rsp) #48.22[spill] + movq %rbx, 72(%rsp) #48.22[spill] + # LOE rax rdx rsi rdi r8 r9 r10 ecx xmm3 xmm7 xmm8 xmm9 xmm10 xmm11 
+..B1.29: # Preds ..B1.146 ..B1.28 + # Execution count [5.00e+00] + movl %ecx, %r13d #48.22 + movl %ecx, %r15d #50.27 + sarl $1, %r13d #48.22 + andl $1, %r15d #50.27 + shll $2, %r15d #50.27 + movq 64(%rsp), %rbx #54.25[spill] + lea (%r13,%r13,2), %r11d #50.27 + movslq (%rbx,%rdx,4), %r12 #54.25 + lea (%r15,%r11,8), %r14d #50.27 + movslq %r14d, %r14 #50.27 + xorl %ebx, %ebx #56.9 + lea (%r10,%r14,4), %r11 #51.27 + lea (%r9,%r14,4), %r14 #52.27 + testq %r12, %r12 #56.28 + jle ..B1.146 # Prob 10% #56.28 + # LOE rax rdx rbx rsi rdi r8 r9 r10 r11 r12 r14 ecx r13d r15d xmm3 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.30: # Preds ..B1.29 + # Execution count [4.50e+00] + movq %rax, 24(%rsp) #[spill] + movq %rdx, 48(%rsp) #[spill] + movl %ecx, 40(%rsp) #[spill] + movq %rsi, 16(%rsp) #[spill] + movq %r8, 8(%rsp) #[spill] + movq %r9, 80(%rsp) #[spill] + movq %r10, 88(%rsp) #[spill] + movl 32(%rsp), %eax #[spill] + # LOE rbx rdi r11 r12 r14 eax r13d r15d xmm3 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.31: # Preds ..B1.144 ..B1.30 + # Execution count [2.50e+01] + movl (%rdi,%rbx,4), %r10d #57.22 + xorb %dl, %dl #59.21 + movslq %r10d, %r10 #58.31 + xorb %cl, %cl #63.13 + movq %rbx, 112(%rsp) #63.13[spill] + movl %r15d, %r9d #63.13 + movq %r12, 104(%rsp) #63.13[spill] + xorl %esi, %esi #63.13 + movq %rdi, 96(%rsp) #63.13[spill] + movq 80(%rsp), %rbx #63.13[spill] + lea (%r10,%r10,2), %r8 #60.28 + movq 88(%rsp), %rdi #63.13[spill] + movq 72(%rsp), %r12 #63.13[spill] + shlq $5, %r8 #60.28 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm3 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.32: # Preds ..B1.143 ..B1.31 + # Execution count [1.00e+02] + vmovss (%r11,%rsi,4), %xmm6 #64.33 + vxorps %xmm2, %xmm2, %xmm2 #67.30 + vmovaps %xmm2, %xmm1 #68.30 + vmovss 32(%r11,%rsi,4), %xmm5 #65.33 + vmovaps %xmm1, %xmm0 #69.30 + vmovss 64(%r11,%rsi,4), %xmm4 #66.33 + testl %eax, %eax #77.28 + je ..B1.37 # Prob 50% #77.28 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.33: # Preds ..B1.32 + # Execution count [5.00e+01] + cmpl %r10d, %r13d #77.62 + jne ..B1.39 # Prob 50% #77.62 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.34: # Preds ..B1.33 + # Execution count [2.50e+01] + testl %r9d, %r9d #77.99 + jl ..B1.39 # Prob 50% #77.99 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.35: # Preds ..B1.34 + # Execution count [6.25e+00] + jle ..B1.53 # Prob 50% #77.99 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.36: # Preds ..B1.35 + # Execution count [3.12e+00] + cmpl $2, %r9d #77.99 + jl ..B1.69 # Prob 50% #77.99 + jmp ..B1.48 # Prob 100% #77.99 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.37: # Preds ..B1.32 + # Execution count [5.00e+01] + cmpl %r10d, %r13d #78.62 + jne ..B1.39 # Prob 50% #78.62 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.38: # Preds ..B1.37 + # Execution count [2.50e+01] + testl %r9d, %r9d #78.100 + je ..B1.53 # Prob 50% #78.100 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.39: # Preds ..B1.37 ..B1.38 ..B1.33 ..B1.34 + # Execution count 
[5.00e+01] + vsubss 32(%r8,%rdi), %xmm5, %xmm17 #85.48 + vsubss (%r8,%rdi), %xmm6, %xmm16 #84.48 + vsubss 64(%r8,%rdi), %xmm4, %xmm18 #86.48 + vmulss %xmm17, %xmm17, %xmm12 #87.61 + vfmadd231ss %xmm16, %xmm16, %xmm12 #87.75 + vfmadd231ss %xmm18, %xmm18, %xmm12 #87.75 + vcomiss %xmm12, %xmm11 #88.34 + jbe ..B1.43 # Prob 50% #88.34 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm16 xmm17 xmm18 +..B1.40: # Preds ..B1.39 + # Execution count [2.50e+01] + vdivss %xmm12, %xmm10, %xmm2 #89.51 + vmulss %xmm2, %xmm8, %xmm0 #90.50 + vmulss %xmm7, %xmm2, %xmm12 #91.67 + vmulss %xmm2, %xmm0, %xmm1 #90.56 + vmulss %xmm2, %xmm1, %xmm13 #90.62 + vmulss %xmm13, %xmm12, %xmm14 #91.76 + vsubss %xmm3, %xmm13, %xmm15 #91.67 + vmulss %xmm15, %xmm14, %xmm14 #91.82 + vmulss %xmm14, %xmm16, %xmm2 #94.67 + testl %eax, %eax #93.32 + je ..B1.42 # Prob 50% #93.32 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d cl xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm14 xmm17 xmm18 +..B1.41: # Preds ..B1.40 + # Execution count [1.25e+01] + vmovss (%r8,%rbx), %xmm1 #94.33 + movb $1, %dl #102.29 + vmovss 32(%r8,%rbx), %xmm12 #95.33 + vsubss %xmm2, %xmm1, %xmm0 #94.33 + vmulss %xmm14, %xmm17, %xmm1 #95.67 + vfnmadd213ss %xmm12, %xmm14, %xmm17 #95.33 + vmovss 64(%r8,%rbx), %xmm13 #96.33 + vmovss %xmm0, (%r8,%rbx) #94.33 + vmulss %xmm14, %xmm18, %xmm0 #96.67 + vfnmadd213ss %xmm13, %xmm18, %xmm14 #96.33 + vmovss %xmm17, 32(%r8,%rbx) #95.33 + vmovss %xmm14, 64(%r8,%rbx) #96.33 + incq 24(%r12) #103.29 + jmp ..B1.44 # Prob 100% #103.29 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.42: # Preds ..B1.40 + # Execution count [1.25e+01] + vmulss %xmm14, %xmm17, %xmm1 #100.43 + movb $1, %dl #102.29 + vmulss %xmm14, %xmm18, %xmm0 #101.43 + incq 24(%r12) #103.29 + jmp ..B1.51 # Prob 100% #103.29 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.43: # Preds ..B1.39 + # Execution count [2.50e+01] + incq 32(%r12) #105.29 + testl %eax, %eax #77.28 + je ..B1.51 # Prob 50% #77.28 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.44: # Preds ..B1.41 ..B1.43 + # Execution count [3.75e+01] + cmpl %r10d, %r13d #77.62 + jne ..B1.53 # Prob 50% #77.62 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.45: # Preds ..B1.44 + # Execution count [1.88e+01] + testl %r9d, %r9d #77.99 + jle ..B1.53 # Prob 50% #77.99 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.46: # Preds ..B1.45 + # Execution count [0.00e+00] + cmpl $2, %r9d #77.99 + jl ..B1.69 # Prob 50% #77.99 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.47: # Preds ..B1.46 + # Execution count [0.00e+00] + testl %eax, %eax #77.28 + je ..B1.81 # Prob 50% #77.28 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.48: # Preds ..B1.36 ..B1.47 + # Execution count [0.00e+00] + cmpl $3, %r9d #77.99 + jl ..B1.82 # Prob 50% #77.99 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 
xmm10 xmm11 +..B1.49: # Preds ..B1.48 + # Execution count [6.25e+00] + cmpl %r10d, %r13d #77.62 + jne ..B1.95 # Prob 50% #77.62 + jmp ..B1.63 # Prob 100% #77.62 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.51: # Preds ..B1.42 ..B1.43 + # Execution count [3.75e+01] + cmpl %r10d, %r13d #78.62 + jne ..B1.53 # Prob 50% #78.62 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.52: # Preds ..B1.51 + # Execution count [1.88e+01] + cmpl $1, %r9d #78.100 + je ..B1.69 # Prob 50% #78.100 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.53: # Preds ..B1.52 ..B1.45 ..B1.44 ..B1.51 ..B1.35 + # ..B1.38 + # Execution count [5.00e+01] + vsubss 36(%r8,%rdi), %xmm5, %xmm20 #85.48 + vsubss 4(%r8,%rdi), %xmm6, %xmm19 #84.48 + vsubss 68(%r8,%rdi), %xmm4, %xmm21 #86.48 + vmulss %xmm20, %xmm20, %xmm12 #87.61 + vfmadd231ss %xmm19, %xmm19, %xmm12 #87.75 + vfmadd231ss %xmm21, %xmm21, %xmm12 #87.75 + vcomiss %xmm12, %xmm11 #88.34 + jbe ..B1.58 # Prob 50% #88.34 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm19 xmm20 xmm21 +..B1.54: # Preds ..B1.53 + # Execution count [2.50e+01] + vdivss %xmm12, %xmm10, %xmm14 #89.51 + vmulss %xmm14, %xmm8, %xmm12 #90.50 + vmulss %xmm7, %xmm14, %xmm15 #91.67 + vmulss %xmm14, %xmm12, %xmm13 #90.56 + vmulss %xmm14, %xmm13, %xmm16 #90.62 + vmulss %xmm16, %xmm15, %xmm17 #91.76 + vsubss %xmm3, %xmm16, %xmm18 #91.67 + vmulss %xmm18, %xmm17, %xmm16 #91.82 + vmulss %xmm16, %xmm19, %xmm18 #94.67 + testl %eax, %eax #93.32 + je ..B1.56 # Prob 50% #93.32 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm16 xmm18 xmm20 xmm21 +..B1.55: # Preds ..B1.54 + # Execution count [1.25e+01] + vmovss 4(%r8,%rbx), %xmm12 #94.33 + vmovss 36(%r8,%rbx), %xmm14 #95.33 + vsubss %xmm18, %xmm12, %xmm13 #94.33 + vmulss %xmm16, %xmm20, %xmm12 #95.67 + vmovss %xmm13, 4(%r8,%rbx) #94.33 + vsubss %xmm12, %xmm14, %xmm15 #95.33 + vmulss %xmm16, %xmm21, %xmm13 #96.67 + vmovss 68(%r8,%rbx), %xmm16 #96.33 + vmovss %xmm15, 36(%r8,%rbx) #95.33 + vsubss %xmm13, %xmm16, %xmm17 #96.33 + vmovss %xmm17, 68(%r8,%rbx) #96.33 + jmp ..B1.57 # Prob 100% #96.33 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm18 +..B1.56: # Preds ..B1.54 + # Execution count [1.25e+01] + vmulss %xmm16, %xmm20, %xmm12 #100.43 + vmulss %xmm16, %xmm21, %xmm13 #101.43 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm18 +..B1.57: # Preds ..B1.55 ..B1.56 + # Execution count [2.50e+01] + incq 24(%r12) #103.29 + movb $1, %dl #102.29 + vaddss %xmm18, %xmm2, %xmm2 #99.29 + vaddss %xmm12, %xmm1, %xmm1 #100.29 + vaddss %xmm13, %xmm0, %xmm0 #101.29 + jmp ..B1.59 # Prob 100% #101.29 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.58: # Preds ..B1.53 + # Execution count [2.50e+01] + incq 32(%r12) #105.29 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.59: # Preds ..B1.57 ..B1.58 + # Execution count [7.50e+01] + testl %eax, 
%eax #77.28 + je ..B1.67 # Prob 50% #77.28 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.60: # Preds ..B1.59 + # Execution count [3.75e+01] + cmpl %r10d, %r13d #77.62 + jne ..B1.69 # Prob 50% #77.62 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.61: # Preds ..B1.60 + # Execution count [1.88e+01] + cmpl $2, %r9d #77.99 + jl ..B1.69 # Prob 50% #77.99 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.62: # Preds ..B1.61 + # Execution count [6.25e+00] + cmpl $3, %r9d #77.99 + jl ..B1.82 # Prob 50% #77.99 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.63: # Preds ..B1.62 ..B1.49 + # Execution count [3.91e+00] + cmpl $4, %r9d #77.99 + jl ..B1.95 # Prob 50% #77.99 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.64: # Preds ..B1.63 + # Execution count [0.00e+00] + testl %eax, %eax #77.28 + jne ..B1.79 # Prob 50% #77.28 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.65: # Preds ..B1.64 + # Execution count [7.81e+00] + cmpl %r10d, %r13d #78.62 + jne ..B1.109 # Prob 50% #78.62 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.66: # Preds ..B1.65 + # Execution count [3.91e+00] + cmpl $5, %r9d #78.100 + jne ..B1.109 # Prob 50% #78.100 + jmp ..B1.106 # Prob 100% #78.100 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.67: # Preds ..B1.59 + # Execution count [3.75e+01] + cmpl %r10d, %r13d #78.62 + jne ..B1.69 # Prob 50% #78.62 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.68: # Preds ..B1.67 + # Execution count [1.88e+01] + cmpl $2, %r9d #78.100 + je ..B1.82 # Prob 50% #78.100 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.69: # Preds ..B1.36 ..B1.61 ..B1.68 ..B1.60 ..B1.67 + # ..B1.46 ..B1.52 + # Execution count [5.00e+01] + vsubss 40(%r8,%rdi), %xmm5, %xmm20 #85.48 + vsubss 8(%r8,%rdi), %xmm6, %xmm19 #84.48 + vsubss 72(%r8,%rdi), %xmm4, %xmm21 #86.48 + vmulss %xmm20, %xmm20, %xmm12 #87.61 + vfmadd231ss %xmm19, %xmm19, %xmm12 #87.75 + vfmadd231ss %xmm21, %xmm21, %xmm12 #87.75 + vcomiss %xmm12, %xmm11 #88.34 + jbe ..B1.74 # Prob 50% #88.34 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm19 xmm20 xmm21 +..B1.70: # Preds ..B1.69 + # Execution count [2.50e+01] + vdivss %xmm12, %xmm10, %xmm14 #89.51 + vmulss %xmm14, %xmm8, %xmm12 #90.50 + vmulss %xmm7, %xmm14, %xmm15 #91.67 + vmulss %xmm14, %xmm12, %xmm13 #90.56 + vmulss %xmm14, %xmm13, %xmm16 #90.62 + vmulss %xmm16, %xmm15, %xmm17 #91.76 + vsubss %xmm3, %xmm16, %xmm18 #91.67 + vmulss %xmm18, %xmm17, %xmm16 #91.82 + vmulss %xmm16, %xmm19, %xmm18 #94.67 + testl %eax, %eax #93.32 + je ..B1.72 # Prob 50% #93.32 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm16 xmm18 xmm20 
xmm21 +..B1.71: # Preds ..B1.70 + # Execution count [1.25e+01] + vmovss 8(%r8,%rbx), %xmm12 #94.33 + vmovss 40(%r8,%rbx), %xmm14 #95.33 + vsubss %xmm18, %xmm12, %xmm13 #94.33 + vmulss %xmm16, %xmm20, %xmm12 #95.67 + vmovss %xmm13, 8(%r8,%rbx) #94.33 + vsubss %xmm12, %xmm14, %xmm15 #95.33 + vmulss %xmm16, %xmm21, %xmm13 #96.67 + vmovss 72(%r8,%rbx), %xmm16 #96.33 + vmovss %xmm15, 40(%r8,%rbx) #95.33 + vsubss %xmm13, %xmm16, %xmm17 #96.33 + vmovss %xmm17, 72(%r8,%rbx) #96.33 + jmp ..B1.73 # Prob 100% #96.33 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm18 +..B1.72: # Preds ..B1.70 + # Execution count [1.25e+01] + vmulss %xmm16, %xmm20, %xmm12 #100.43 + vmulss %xmm16, %xmm21, %xmm13 #101.43 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm18 +..B1.73: # Preds ..B1.71 ..B1.72 + # Execution count [2.50e+01] + incq 24(%r12) #103.29 + movb $1, %dl #102.29 + vaddss %xmm18, %xmm2, %xmm2 #99.29 + vaddss %xmm12, %xmm1, %xmm1 #100.29 + vaddss %xmm13, %xmm0, %xmm0 #101.29 + jmp ..B1.75 # Prob 100% #101.29 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.74: # Preds ..B1.69 + # Execution count [2.50e+01] + incq 32(%r12) #105.29 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.75: # Preds ..B1.73 ..B1.74 + # Execution count [7.50e+01] + testl %eax, %eax #77.28 + je ..B1.80 # Prob 50% #77.28 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.76: # Preds ..B1.75 + # Execution count [3.75e+01] + cmpl %r10d, %r13d #77.62 + jne ..B1.82 # Prob 50% #77.62 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.77: # Preds ..B1.76 + # Execution count [1.88e+01] + cmpl $3, %r9d #77.99 + jl ..B1.82 # Prob 50% #77.99 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.78: # Preds ..B1.77 + # Execution count [2.34e+00] + cmpl $4, %r9d #77.99 + jl ..B1.95 # Prob 50% #77.99 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.79: # Preds ..B1.78 ..B1.64 + # Execution count [7.81e+00] + cmpl %r10d, %r13d #77.62 + jne ..B1.109 # Prob 50% #77.62 + jmp ..B1.91 # Prob 100% #77.62 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.80: # Preds ..B1.75 + # Execution count [3.75e+01] + cmpl %r10d, %r13d #78.62 + jne ..B1.82 # Prob 50% #78.62 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.81: # Preds ..B1.47 ..B1.80 + # Execution count [1.88e+01] + cmpl $3, %r9d #78.100 + je ..B1.95 # Prob 50% #78.100 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.82: # Preds ..B1.48 ..B1.77 ..B1.81 ..B1.76 ..B1.80 + # ..B1.62 ..B1.68 + # Execution count [5.00e+01] + vsubss 44(%r8,%rdi), %xmm5, %xmm20 #85.48 + vsubss 12(%r8,%rdi), %xmm6, %xmm19 #84.48 + vsubss 76(%r8,%rdi), %xmm4, %xmm21 #86.48 + vmulss %xmm20, %xmm20, %xmm12 #87.61 + vfmadd231ss 
%xmm19, %xmm19, %xmm12 #87.75 + vfmadd231ss %xmm21, %xmm21, %xmm12 #87.75 + vcomiss %xmm12, %xmm11 #88.34 + jbe ..B1.87 # Prob 50% #88.34 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm19 xmm20 xmm21 +..B1.83: # Preds ..B1.82 + # Execution count [2.50e+01] + vdivss %xmm12, %xmm10, %xmm14 #89.51 + vmulss %xmm14, %xmm8, %xmm12 #90.50 + vmulss %xmm7, %xmm14, %xmm15 #91.67 + vmulss %xmm14, %xmm12, %xmm13 #90.56 + vmulss %xmm14, %xmm13, %xmm16 #90.62 + vmulss %xmm16, %xmm15, %xmm17 #91.76 + vsubss %xmm3, %xmm16, %xmm18 #91.67 + vmulss %xmm18, %xmm17, %xmm16 #91.82 + vmulss %xmm16, %xmm19, %xmm18 #94.67 + testl %eax, %eax #93.32 + je ..B1.85 # Prob 50% #93.32 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm16 xmm18 xmm20 xmm21 +..B1.84: # Preds ..B1.83 + # Execution count [1.25e+01] + vmovss 12(%r8,%rbx), %xmm12 #94.33 + vmovss 44(%r8,%rbx), %xmm14 #95.33 + vsubss %xmm18, %xmm12, %xmm13 #94.33 + vmulss %xmm16, %xmm20, %xmm12 #95.67 + vmovss %xmm13, 12(%r8,%rbx) #94.33 + vsubss %xmm12, %xmm14, %xmm15 #95.33 + vmulss %xmm16, %xmm21, %xmm13 #96.67 + vmovss 76(%r8,%rbx), %xmm16 #96.33 + vmovss %xmm15, 44(%r8,%rbx) #95.33 + vsubss %xmm13, %xmm16, %xmm17 #96.33 + vmovss %xmm17, 76(%r8,%rbx) #96.33 + jmp ..B1.86 # Prob 100% #96.33 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm18 +..B1.85: # Preds ..B1.83 + # Execution count [1.25e+01] + vmulss %xmm16, %xmm20, %xmm12 #100.43 + vmulss %xmm16, %xmm21, %xmm13 #101.43 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm18 +..B1.86: # Preds ..B1.84 ..B1.85 + # Execution count [2.50e+01] + incq 24(%r12) #103.29 + movb $1, %dl #102.29 + vaddss %xmm18, %xmm2, %xmm2 #99.29 + vaddss %xmm12, %xmm1, %xmm1 #100.29 + vaddss %xmm13, %xmm0, %xmm0 #101.29 + jmp ..B1.88 # Prob 100% #101.29 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.87: # Preds ..B1.82 + # Execution count [2.50e+01] + incq 32(%r12) #105.29 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.88: # Preds ..B1.86 ..B1.87 + # Execution count [7.50e+01] + testl %eax, %eax #77.28 + je ..B1.93 # Prob 50% #77.28 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.89: # Preds ..B1.88 + # Execution count [3.75e+01] + cmpl %r10d, %r13d #77.62 + jne ..B1.95 # Prob 50% #77.62 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.90: # Preds ..B1.89 + # Execution count [1.88e+01] + cmpl $4, %r9d #77.99 + jl ..B1.95 # Prob 50% #77.99 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.91: # Preds ..B1.90 ..B1.79 + # Execution count [6.25e+00] + cmpl $5, %r9d #77.99 + jl ..B1.109 # Prob 50% #77.99 + jmp ..B1.106 # Prob 100% #77.99 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.93: # Preds ..B1.88 + # Execution count [3.75e+01] + cmpl %r10d, %r13d #78.62 + jne ..B1.95 # Prob 50% #78.62 + # LOE rbx rsi 
rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.94: # Preds ..B1.93 + # Execution count [1.88e+01] + cmpl $4, %r9d #78.100 + je ..B1.109 # Prob 50% #78.100 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.95: # Preds ..B1.81 ..B1.78 ..B1.90 ..B1.94 ..B1.89 + # ..B1.93 ..B1.49 ..B1.63 + # Execution count [5.00e+01] + vsubss 48(%r8,%rdi), %xmm5, %xmm20 #85.48 + vsubss 16(%r8,%rdi), %xmm6, %xmm19 #84.48 + vsubss 80(%r8,%rdi), %xmm4, %xmm21 #86.48 + vmulss %xmm20, %xmm20, %xmm12 #87.61 + vfmadd231ss %xmm19, %xmm19, %xmm12 #87.75 + vfmadd231ss %xmm21, %xmm21, %xmm12 #87.75 + vcomiss %xmm12, %xmm11 #88.34 + jbe ..B1.100 # Prob 50% #88.34 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm19 xmm20 xmm21 +..B1.96: # Preds ..B1.95 + # Execution count [2.50e+01] + vdivss %xmm12, %xmm10, %xmm14 #89.51 + vmulss %xmm14, %xmm8, %xmm12 #90.50 + vmulss %xmm7, %xmm14, %xmm15 #91.67 + vmulss %xmm14, %xmm12, %xmm13 #90.56 + vmulss %xmm14, %xmm13, %xmm16 #90.62 + vmulss %xmm16, %xmm15, %xmm17 #91.76 + vsubss %xmm3, %xmm16, %xmm18 #91.67 + vmulss %xmm18, %xmm17, %xmm16 #91.82 + vmulss %xmm16, %xmm19, %xmm18 #94.67 + testl %eax, %eax #93.32 + je ..B1.98 # Prob 50% #93.32 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm16 xmm18 xmm20 xmm21 +..B1.97: # Preds ..B1.96 + # Execution count [1.25e+01] + vmovss 16(%r8,%rbx), %xmm12 #94.33 + vmovss 48(%r8,%rbx), %xmm14 #95.33 + vsubss %xmm18, %xmm12, %xmm13 #94.33 + vmulss %xmm16, %xmm20, %xmm12 #95.67 + vmovss %xmm13, 16(%r8,%rbx) #94.33 + vsubss %xmm12, %xmm14, %xmm15 #95.33 + vmulss %xmm16, %xmm21, %xmm13 #96.67 + vmovss 80(%r8,%rbx), %xmm16 #96.33 + vmovss %xmm15, 48(%r8,%rbx) #95.33 + vsubss %xmm13, %xmm16, %xmm17 #96.33 + vmovss %xmm17, 80(%r8,%rbx) #96.33 + jmp ..B1.99 # Prob 100% #96.33 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm18 +..B1.98: # Preds ..B1.96 + # Execution count [1.25e+01] + vmulss %xmm16, %xmm20, %xmm12 #100.43 + vmulss %xmm16, %xmm21, %xmm13 #101.43 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm18 +..B1.99: # Preds ..B1.97 ..B1.98 + # Execution count [2.50e+01] + incq 24(%r12) #103.29 + movb $1, %dl #102.29 + vaddss %xmm18, %xmm2, %xmm2 #99.29 + vaddss %xmm12, %xmm1, %xmm1 #100.29 + vaddss %xmm13, %xmm0, %xmm0 #101.29 + jmp ..B1.101 # Prob 100% #101.29 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.100: # Preds ..B1.95 + # Execution count [2.50e+01] + incq 32(%r12) #105.29 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.101: # Preds ..B1.99 ..B1.100 + # Execution count [7.50e+01] + testl %eax, %eax #77.28 + je ..B1.104 # Prob 50% #77.28 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.102: # Preds ..B1.101 + # Execution count [3.75e+01] + cmpl %r10d, %r13d #77.62 + jne ..B1.109 # Prob 50% #77.62 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 
xmm10 xmm11 +..B1.103: # Preds ..B1.102 + # Execution count [1.88e+01] + cmpl $5, %r9d #77.99 + jl ..B1.109 # Prob 50% #77.99 + jmp ..B1.106 # Prob 100% #77.99 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.104: # Preds ..B1.101 + # Execution count [3.75e+01] + cmpl %r10d, %r13d #78.62 + jne ..B1.109 # Prob 50% #78.62 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.105: # Preds ..B1.104 + # Execution count [1.88e+01] + cmpl $5, %r9d #78.100 + jne ..B1.109 # Prob 50% #78.100 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.106: # Preds ..B1.91 ..B1.66 ..B1.105 ..B1.103 + # Execution count [9.38e+00] + testl %eax, %eax #77.28 + jne ..B1.116 # Prob 50% #77.28 + jmp ..B1.119 # Prob 100% #77.28 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.109: # Preds ..B1.94 ..B1.103 ..B1.105 ..B1.102 ..B1.104 + # ..B1.79 ..B1.91 ..B1.65 ..B1.66 + # Execution count [5.00e+01] + vsubss 52(%r8,%rdi), %xmm5, %xmm20 #85.48 + vsubss 20(%r8,%rdi), %xmm6, %xmm19 #84.48 + vsubss 84(%r8,%rdi), %xmm4, %xmm21 #86.48 + vmulss %xmm20, %xmm20, %xmm12 #87.61 + vfmadd231ss %xmm19, %xmm19, %xmm12 #87.75 + vfmadd231ss %xmm21, %xmm21, %xmm12 #87.75 + vcomiss %xmm12, %xmm11 #88.34 + jbe ..B1.114 # Prob 50% #88.34 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm19 xmm20 xmm21 +..B1.110: # Preds ..B1.109 + # Execution count [2.50e+01] + vdivss %xmm12, %xmm10, %xmm14 #89.51 + vmulss %xmm14, %xmm8, %xmm12 #90.50 + vmulss %xmm7, %xmm14, %xmm15 #91.67 + vmulss %xmm14, %xmm12, %xmm13 #90.56 + vmulss %xmm14, %xmm13, %xmm16 #90.62 + vmulss %xmm16, %xmm15, %xmm17 #91.76 + vsubss %xmm3, %xmm16, %xmm18 #91.67 + vmulss %xmm18, %xmm17, %xmm16 #91.82 + vmulss %xmm16, %xmm19, %xmm18 #94.67 + testl %eax, %eax #93.32 + je ..B1.112 # Prob 50% #93.32 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm16 xmm18 xmm20 xmm21 +..B1.111: # Preds ..B1.110 + # Execution count [1.25e+01] + vmovss 20(%r8,%rbx), %xmm12 #94.33 + vmovss 52(%r8,%rbx), %xmm14 #95.33 + vsubss %xmm18, %xmm12, %xmm13 #94.33 + vmulss %xmm16, %xmm20, %xmm12 #95.67 + vmovss %xmm13, 20(%r8,%rbx) #94.33 + vsubss %xmm12, %xmm14, %xmm15 #95.33 + vmulss %xmm16, %xmm21, %xmm13 #96.67 + vmovss 84(%r8,%rbx), %xmm16 #96.33 + vmovss %xmm15, 52(%r8,%rbx) #95.33 + vsubss %xmm13, %xmm16, %xmm17 #96.33 + vmovss %xmm17, 84(%r8,%rbx) #96.33 + jmp ..B1.113 # Prob 100% #96.33 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm18 +..B1.112: # Preds ..B1.110 + # Execution count [1.25e+01] + vmulss %xmm16, %xmm20, %xmm12 #100.43 + vmulss %xmm16, %xmm21, %xmm13 #101.43 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm18 +..B1.113: # Preds ..B1.111 ..B1.112 + # Execution count [2.50e+01] + incq 24(%r12) #103.29 + movb $1, %dl #102.29 + vaddss %xmm18, %xmm2, %xmm2 #99.29 + vaddss %xmm12, %xmm1, %xmm1 #100.29 + vaddss %xmm13, %xmm0, %xmm0 #101.29 + jmp ..B1.115 # Prob 100% #101.29 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl 
xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.114: # Preds ..B1.109 + # Execution count [2.50e+01] + incq 32(%r12) #105.29 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.115: # Preds ..B1.113 ..B1.114 + # Execution count [7.50e+01] + testl %eax, %eax #77.28 + je ..B1.119 # Prob 50% #77.28 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.116: # Preds ..B1.106 ..B1.115 + # Execution count [3.75e+01] + cmpl %r10d, %r13d #77.62 + jne ..B1.122 # Prob 50% #77.62 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.118: # Preds ..B1.116 + # Execution count [2.50e+01] + cmpl $6, %r9d #77.99 + jl ..B1.122 # Prob 50% #77.99 + jmp ..B1.129 # Prob 100% #77.99 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.119: # Preds ..B1.106 ..B1.115 + # Execution count [3.75e+01] + cmpl %r10d, %r13d #78.62 + jne ..B1.122 # Prob 50% #78.62 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.121: # Preds ..B1.119 + # Execution count [2.50e+01] + cmpl $6, %r9d #78.100 + je ..B1.129 # Prob 50% #78.100 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.122: # Preds ..B1.116 ..B1.119 ..B1.118 ..B1.121 + # Execution count [5.00e+01] + vsubss 56(%r8,%rdi), %xmm5, %xmm20 #85.48 + vsubss 24(%r8,%rdi), %xmm6, %xmm19 #84.48 + vsubss 88(%r8,%rdi), %xmm4, %xmm21 #86.48 + vmulss %xmm20, %xmm20, %xmm12 #87.61 + vfmadd231ss %xmm19, %xmm19, %xmm12 #87.75 + vfmadd231ss %xmm21, %xmm21, %xmm12 #87.75 + vcomiss %xmm12, %xmm11 #88.34 + jbe ..B1.127 # Prob 50% #88.34 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm19 xmm20 xmm21 +..B1.123: # Preds ..B1.122 + # Execution count [2.50e+01] + vdivss %xmm12, %xmm10, %xmm14 #89.51 + vmulss %xmm14, %xmm8, %xmm12 #90.50 + vmulss %xmm7, %xmm14, %xmm15 #91.67 + vmulss %xmm14, %xmm12, %xmm13 #90.56 + vmulss %xmm14, %xmm13, %xmm16 #90.62 + vmulss %xmm16, %xmm15, %xmm17 #91.76 + vsubss %xmm3, %xmm16, %xmm18 #91.67 + vmulss %xmm18, %xmm17, %xmm16 #91.82 + vmulss %xmm16, %xmm19, %xmm18 #94.67 + testl %eax, %eax #93.32 + je ..B1.125 # Prob 50% #93.32 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm16 xmm18 xmm20 xmm21 +..B1.124: # Preds ..B1.123 + # Execution count [1.25e+01] + vmovss 24(%r8,%rbx), %xmm12 #94.33 + vmovss 56(%r8,%rbx), %xmm14 #95.33 + vsubss %xmm18, %xmm12, %xmm13 #94.33 + vmulss %xmm16, %xmm20, %xmm12 #95.67 + vmovss %xmm13, 24(%r8,%rbx) #94.33 + vsubss %xmm12, %xmm14, %xmm15 #95.33 + vmulss %xmm16, %xmm21, %xmm13 #96.67 + vmovss 88(%r8,%rbx), %xmm16 #96.33 + vmovss %xmm15, 56(%r8,%rbx) #95.33 + vsubss %xmm13, %xmm16, %xmm17 #96.33 + vmovss %xmm17, 88(%r8,%rbx) #96.33 + jmp ..B1.126 # Prob 100% #96.33 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm18 +..B1.125: # Preds ..B1.123 + # Execution count [1.25e+01] + vmulss %xmm16, %xmm20, %xmm12 #100.43 + vmulss %xmm16, %xmm21, %xmm13 #101.43 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d 
r10d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm18 +..B1.126: # Preds ..B1.124 ..B1.125 + # Execution count [2.50e+01] + incq 24(%r12) #103.29 + movb $1, %dl #102.29 + vaddss %xmm18, %xmm2, %xmm2 #99.29 + vaddss %xmm12, %xmm1, %xmm1 #100.29 + vaddss %xmm13, %xmm0, %xmm0 #101.29 + jmp ..B1.129 # Prob 100% #101.29 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.127: # Preds ..B1.122 + # Execution count [2.50e+01] + incq 32(%r12) #105.29 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.129: # Preds ..B1.127 ..B1.126 ..B1.121 ..B1.118 + # Execution count [1.25e+01] + testl %eax, %eax #77.28 + je ..B1.132 # Prob 50% #77.28 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.130: # Preds ..B1.129 + # Execution count [5.00e+01] + cmpl %r10d, %r13d #77.62 + jne ..B1.134 # Prob 50% #77.62 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.131: # Preds ..B1.130 + # Execution count [2.50e+01] + cmpl $7, %r9d #77.99 + jl ..B1.134 # Prob 50% #77.99 + jmp ..B1.140 # Prob 100% #77.99 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.132: # Preds ..B1.129 + # Execution count [5.00e+01] + cmpl %r10d, %r13d #78.62 + jne ..B1.134 # Prob 50% #78.62 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.133: # Preds ..B1.132 + # Execution count [2.50e+01] + cmpl $7, %r9d #78.100 + je ..B1.140 # Prob 50% #78.100 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.134: # Preds ..B1.130 ..B1.131 ..B1.132 ..B1.133 + # Execution count [5.00e+01] + vsubss 60(%r8,%rdi), %xmm5, %xmm17 #85.48 + vsubss 28(%r8,%rdi), %xmm6, %xmm16 #84.48 + vsubss 92(%r8,%rdi), %xmm4, %xmm18 #86.48 + vmulss %xmm17, %xmm17, %xmm4 #87.61 + vfmadd231ss %xmm16, %xmm16, %xmm4 #87.75 + vfmadd231ss %xmm18, %xmm18, %xmm4 #87.75 + vcomiss %xmm4, %xmm11 #88.34 + jbe ..B1.139 # Prob 50% #88.34 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm7 xmm8 xmm9 xmm10 xmm11 xmm16 xmm17 xmm18 +..B1.135: # Preds ..B1.134 + # Execution count [2.50e+01] + vdivss %xmm4, %xmm10, %xmm6 #89.51 + vmulss %xmm6, %xmm8, %xmm4 #90.50 + vmulss %xmm7, %xmm6, %xmm12 #91.67 + vmulss %xmm6, %xmm4, %xmm5 #90.56 + vmulss %xmm6, %xmm5, %xmm13 #90.62 + vmulss %xmm13, %xmm12, %xmm14 #91.76 + vsubss %xmm3, %xmm13, %xmm15 #91.67 + vmulss %xmm15, %xmm14, %xmm13 #91.82 + vmulss %xmm13, %xmm16, %xmm15 #94.67 + testl %eax, %eax #93.32 + je ..B1.137 # Prob 50% #93.32 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm15 xmm17 xmm18 +..B1.136: # Preds ..B1.135 + # Execution count [1.25e+01] + vmovss 28(%r8,%rbx), %xmm4 #94.33 + vmovss 60(%r8,%rbx), %xmm6 #95.33 + vsubss %xmm15, %xmm4, %xmm5 #94.33 + vmulss %xmm13, %xmm17, %xmm4 #95.67 + vmovss %xmm5, 28(%r8,%rbx) #94.33 + vsubss %xmm4, %xmm6, %xmm12 #95.33 + vmulss %xmm13, %xmm18, %xmm5 #96.67 + vmovss 92(%r8,%rbx), %xmm13 #96.33 + vmovss %xmm12, 60(%r8,%rbx) #95.33 + vsubss %xmm5, %xmm13, %xmm14 #96.33 + vmovss %xmm14, 
92(%r8,%rbx) #96.33 + jmp ..B1.138 # Prob 100% #96.33 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm7 xmm8 xmm9 xmm10 xmm11 xmm15 +..B1.137: # Preds ..B1.135 + # Execution count [1.25e+01] + vmulss %xmm13, %xmm17, %xmm4 #100.43 + vmulss %xmm13, %xmm18, %xmm5 #101.43 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d cl xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm7 xmm8 xmm9 xmm10 xmm11 xmm15 +..B1.138: # Preds ..B1.136 ..B1.137 + # Execution count [2.50e+01] + incq 24(%r12) #103.29 + movb $1, %dl #102.29 + vaddss %xmm15, %xmm2, %xmm2 #99.29 + vaddss %xmm4, %xmm1, %xmm1 #100.29 + vaddss %xmm5, %xmm0, %xmm0 #101.29 + jmp ..B1.141 # Prob 100% #101.29 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.139: # Preds ..B1.134 + # Execution count [2.50e+01] + incq 32(%r12) #105.29 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.140: # Preds ..B1.131 ..B1.133 ..B1.139 + # Execution count [7.50e+01] + testb %dl, %dl #110.27 + je ..B1.142 # Prob 50% #110.27 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.141: # Preds ..B1.138 ..B1.140 + # Execution count [5.00e+01] + incq 40(%r12) #111.21 + jmp ..B1.143 # Prob 100% #111.21 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.142: # Preds ..B1.140 + # Execution count [5.00e+01] + incq 48(%r12) #113.21 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm0 xmm1 xmm2 xmm3 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.143: # Preds ..B1.141 ..B1.142 + # Execution count [1.00e+02] + incb %cl #63.13 + incl %r9d #63.13 + vaddss (%r14,%rsi,4), %xmm2, %xmm2 #116.17 + vaddss 32(%r14,%rsi,4), %xmm1, %xmm1 #117.17 + vaddss 64(%r14,%rsi,4), %xmm0, %xmm0 #118.17 + vmovss %xmm2, (%r14,%rsi,4) #116.17 + vmovss %xmm1, 32(%r14,%rsi,4) #117.17 + vmovss %xmm0, 64(%r14,%rsi,4) #118.17 + incq %rsi #63.13 + cmpb $4, %cl #63.13 + jb ..B1.32 # Prob 75% #63.13 + # LOE rbx rsi rdi r8 r11 r12 r14 eax r9d r10d r13d r15d dl cl xmm3 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.144: # Preds ..B1.143 + # Execution count [2.50e+01] + movq 112(%rsp), %rbx #[spill] + incq %rbx #56.9 + movq 104(%rsp), %r12 #[spill] + movq 96(%rsp), %rdi #[spill] + cmpq %r12, %rbx #56.9 + jb ..B1.31 # Prob 82% #56.9 + # LOE rbx rdi r11 r12 r14 eax r13d r15d xmm3 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.145: # Preds ..B1.144 + # Execution count [4.50e+00] + movq 48(%rsp), %rdx #[spill] + movl 40(%rsp), %ecx #[spill] + movq 24(%rsp), %rax #[spill] + movq 16(%rsp), %rsi #[spill] + movq 8(%rsp), %r8 #[spill] + movq 80(%rsp), %r9 #[spill] + movq 88(%rsp), %r10 #[spill] + # LOE rax rdx rsi rdi r8 r9 r10 r12 ecx xmm3 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.146: # Preds ..B1.145 ..B1.29 + # Execution count [5.00e+00] + vxorpd %xmm16, %xmm16, %xmm16 #124.9 + addq %r12, %rsi #123.9 + vcvtsi2sd %r12d, %xmm16, %xmm16 #124.9 + vmulsd %xmm16, %xmm9, %xmm0 #124.9 + incl %ecx #47.5 + vcvttsd2si %xmm0, %rbx #124.9 + incq %rdx #47.5 + addq %rbx, %rax #124.9 + lea (%rdi,%r8,4), %rdi #47.5 + cmpl 56(%rsp), %ecx #47.5[spill] + jb ..B1.29 # Prob 82% #47.5 + # LOE rax rdx rsi rdi r8 r9 r10 ecx xmm3 xmm7 xmm8 xmm9 xmm10 xmm11 +..B1.147: # Preds ..B1.146 + # Execution count [9.00e-01] + movq 72(%rsp), %rbx #[spill] + movq %rax, 16(%rbx) #124.9 + movq %rsi, 8(%rbx) #123.9 + # LOE r12 +..B1.148: # Preds ..B1.25 ..B1.147 + # Execution count [1.00e+00] 
+ movl $.L_2__STRING.2, %edi #127.5 +..___tag_value_computeForceLJ_ref.56: +# likwid_markerStopRegion(const char *) + call likwid_markerStopRegion #127.5 +..___tag_value_computeForceLJ_ref.57: + # LOE r12 +..B1.149: # Preds ..B1.148 + # Execution count [1.00e+00] + xorl %eax, %eax #130.16 +..___tag_value_computeForceLJ_ref.58: +# getTimeStamp() + call getTimeStamp #130.16 +..___tag_value_computeForceLJ_ref.59: + # LOE r12 xmm0 +..B1.157: # Preds ..B1.149 + # Execution count [1.00e+00] + vmovsd %xmm0, 8(%rsp) #130.16[spill] + # LOE r12 +..B1.150: # Preds ..B1.157 + # Execution count [1.00e+00] + movl $.L_2__STRING.3, %edi #131.5 + xorl %eax, %eax #131.5 +..___tag_value_computeForceLJ_ref.61: +# debug_printf(const char *, ...) + call debug_printf #131.5 +..___tag_value_computeForceLJ_ref.62: + # LOE r12 +..B1.151: # Preds ..B1.150 + # Execution count [1.00e+00] + vmovsd 8(%rsp), %xmm0 #132.14[spill] + vsubsd (%rsp), %xmm0, %xmm0 #132.14[spill] + addq $152, %rsp #132.14 + .cfi_restore 3 + popq %rbx #132.14 + .cfi_restore 15 + popq %r15 #132.14 + .cfi_restore 14 + popq %r14 #132.14 + .cfi_restore 13 + popq %r13 #132.14 + .cfi_restore 12 + popq %r12 #132.14 + movq %rbp, %rsp #132.14 + popq %rbp #132.14 + .cfi_def_cfa 7, 8 + .cfi_restore 6 + ret #132.14 + .cfi_def_cfa 6, 16 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22 + .cfi_offset 6, -16 + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + # LOE +..B1.152: # Preds ..B1.5 ..B1.9 + # Execution count [4.50e-01]: Infreq + movl %r12d, %r10d #33.9 + jmp ..B1.17 # Prob 100% #33.9 + .align 16,0x90 + # LOE rax rbx rsi rdi r13 r14 r15 edx ecx r10d r11d r12d xmm0 xmm1 xmm2 + .cfi_endproc +# mark_end; + .type computeForceLJ_ref,@function + .size computeForceLJ_ref,.-computeForceLJ_ref +..LNcomputeForceLJ_ref.0: + .data +# -- End computeForceLJ_ref + .text +.L_2__routine_start_computeForceLJ_2xnn_full_1: +# -- Begin computeForceLJ_2xnn_full + .text +# mark_begin; + .align 16,0x90 + .globl computeForceLJ_2xnn_full +# --- computeForceLJ_2xnn_full(Parameter *, Atom *, Neighbor *, Stats *) +computeForceLJ_2xnn_full: +# parameter 1: %rdi +# parameter 2: %rsi +# parameter 3: %rdx +# parameter 4: %rcx +..B2.1: # Preds ..B2.0 + # Execution count [1.00e+00] + .cfi_startproc +..___tag_value_computeForceLJ_2xnn_full.80: +..L81: + #287.97 + pushq %rbp #287.97 + .cfi_def_cfa_offset 16 + movq %rsp, %rbp #287.97 + .cfi_def_cfa 6, 16 + .cfi_offset 6, -16 + andq $-64, %rsp #287.97 + pushq %r12 #287.97 + pushq %r13 #287.97 + pushq %r14 #287.97 + pushq %r15 #287.97 + pushq %rbx #287.97 + subq $216, %rsp #287.97 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 
0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + movq %rdi, %rbx #287.97 + movl $.L_2__STRING.4, %edi #288.5 + xorl %eax, %eax #288.5 + movq %rcx, %r15 #287.97 + movq %rdx, %r14 #287.97 + movq %rsi, %r13 #287.97 +..___tag_value_computeForceLJ_2xnn_full.90: +# debug_printf(const char *, ...) + call debug_printf #288.5 +..___tag_value_computeForceLJ_2xnn_full.91: + # LOE rbx r12 r13 r14 r15 +..B2.2: # Preds ..B2.1 + # Execution count [1.00e+00] + vmovss 108(%rbx), %xmm0 #291.27 + xorl %edi, %edi #301.5 + vmulss %xmm0, %xmm0, %xmm1 #294.36 + xorl %ecx, %ecx #303.27 + vbroadcastss 48(%rbx), %zmm3 #295.32 + vbroadcastss 40(%rbx), %zmm4 #296.29 + vbroadcastss %xmm1, %zmm2 #294.36 + vmovups %zmm3, 128(%rsp) #295.32[spill] + vmovups %zmm4, 64(%rsp) #296.29[spill] + vmovups %zmm2, (%rsp) #294.36[spill] + movl 20(%r13), %edx #301.26 + xorl %ebx, %ebx #301.5 + testl %edx, %edx #301.26 + jle ..B2.23 # Prob 9% #301.26 + # LOE rcx r12 r13 r14 r15 edx ebx edi +..B2.3: # Preds ..B2.2 + # Execution count [9.00e-01] + movq 176(%r13), %rsi #303.27 + movq 192(%r13), %rax #304.32 + vxorps %xmm2, %xmm2, %xmm2 #305.39 + vmovdqu .L_2il0floatpacket.0(%rip), %xmm1 #304.9 + vmovdqu .L_2il0floatpacket.1(%rip), %xmm0 #304.9 + # LOE rax rcx rsi r13 r14 r15 edx ebx edi xmm0 xmm1 xmm2 +..B2.4: # Preds ..B2.21 ..B2.3 + # Execution count [5.00e+00] + movl %edi, %r8d #302.27 + movl %edi, %r9d #302.27 + sarl $1, %r8d #302.27 + andl $1, %r9d #302.27 + shll $2, %r9d #302.27 + lea (%r8,%r8,2), %r10d #302.27 + lea (%r9,%r10,8), %r11d #302.27 + movslq %r11d, %r11 #303.27 + lea (%rsi,%r11,4), %r12 #303.27 + movl (%rcx,%rax), %r11d #304.32 + testl %r11d, %r11d #304.32 + jle ..B2.21 # Prob 50% #304.32 + # LOE rax rcx rsi r12 r13 r14 r15 edx ebx edi r11d xmm0 xmm1 xmm2 +..B2.5: # Preds ..B2.4 + # Execution count [4.50e+00] + cmpl $8, %r11d #304.9 + jl ..B2.37 # Prob 10% #304.9 + # LOE rax rcx rsi r12 r13 r14 r15 edx ebx edi r11d xmm0 xmm1 xmm2 +..B2.6: # Preds ..B2.5 + # Execution count [4.50e+00] + lea 64(%r12), %r8 #307.13 + andq $15, %r8 #304.9 + testl $3, %r8d #304.9 + je ..B2.8 # Prob 50% #304.9 + # LOE rax rcx rsi r12 r13 r14 r15 edx ebx edi r8d r11d xmm0 xmm1 xmm2 +..B2.7: # Preds ..B2.6 + # Execution count [2.25e+00] + movl %ebx, %r8d #304.9 + jmp ..B2.9 # Prob 100% #304.9 + # LOE rax rcx rsi r8 r12 r13 r14 r15 edx ebx edi r11d xmm0 xmm1 xmm2 +..B2.8: # Preds ..B2.6 + # Execution count [2.25e+00] + movl %r8d, %r9d #304.9 + negl %r9d #304.9 + addl $16, %r9d #304.9 + shrl $2, %r9d #304.9 + testl %r8d, %r8d #304.9 + cmovne %r9d, %r8d #304.9 + # LOE rax rcx rsi r8 r12 r13 r14 r15 edx ebx edi r11d xmm0 xmm1 xmm2 +..B2.9: # Preds ..B2.7 ..B2.8 + # Execution count [4.50e+00] + lea 8(%r8), %r9d #304.9 + cmpl %r9d, %r11d #304.9 + jl ..B2.37 # Prob 10% #304.9 + # LOE rax rcx rsi r8 r12 r13 r14 r15 edx ebx edi r11d xmm0 xmm1 xmm2 +..B2.10: # Preds ..B2.9 + # Execution count [5.00e+00] + movl %r11d, %r10d #304.9 + xorl %r9d, %r9d #304.9 + subl %r8d, %r10d #304.9 + andl $7, %r10d #304.9 + negl %r10d #304.9 + addl %r11d, %r10d #304.9 + cmpl $1, %r8d #304.9 + jb ..B2.14 # Prob 10% #304.9 + # LOE rax rcx rsi r8 r9 r12 r13 r14 r15 edx ebx edi r10d r11d xmm0 xmm1 xmm2 +..B2.12: # Preds ..B2.10 ..B2.12 + # Execution count [2.50e+01] + movl %ebx, (%r12,%r9,4) #305.13 + movl %ebx, 32(%r12,%r9,4) #306.13 + movl %ebx, 64(%r12,%r9,4) #307.13 + incq %r9 #304.9 + cmpq %r8, %r9 #304.9 + jb ..B2.12 # Prob 82% #304.9 + # LOE rax rcx rsi r8 r9 r12 r13 r14 r15 edx ebx edi r10d r11d xmm0 xmm1 xmm2 +..B2.14: # Preds ..B2.12 ..B2.10 + 
# Execution count [4.50e+00] + movslq %r10d, %r9 #304.9 + # LOE rax rcx rsi r8 r9 r12 r13 r14 r15 edx ebx edi r10d r11d xmm0 xmm1 xmm2 +..B2.15: # Preds ..B2.15 ..B2.14 + # Execution count [2.50e+01] + vmovups %xmm2, (%r12,%r8,4) #305.13 + vmovups %xmm2, 32(%r12,%r8,4) #306.13 + vmovups %xmm2, 64(%r12,%r8,4) #307.13 + vmovups %xmm2, 16(%r12,%r8,4) #305.13 + vmovups %xmm2, 48(%r12,%r8,4) #306.13 + vmovups %xmm2, 80(%r12,%r8,4) #307.13 + addq $8, %r8 #304.9 + cmpq %r9, %r8 #304.9 + jb ..B2.15 # Prob 82% #304.9 + # LOE rax rcx rsi r8 r9 r12 r13 r14 r15 edx ebx edi r10d r11d xmm0 xmm1 xmm2 +..B2.17: # Preds ..B2.15 ..B2.37 + # Execution count [5.00e+00] + lea 1(%r10), %r8d #304.9 + cmpl %r11d, %r8d #304.9 + ja ..B2.21 # Prob 50% #304.9 + # LOE rax rcx rsi r12 r13 r14 r15 edx ebx edi r10d r11d xmm0 xmm1 xmm2 +..B2.18: # Preds ..B2.17 + # Execution count [4.50e+00] + movslq %r10d, %r9 #305.13 + negl %r10d #304.9 + addl %r11d, %r10d #304.9 + xorl %r8d, %r8d #304.9 + movslq %r11d, %r11 #304.9 + vmovdqa %xmm0, %xmm4 #304.9 + vpbroadcastd %r10d, %xmm3 #304.9 + subq %r9, %r11 #304.9 + lea (%r12,%r9,4), %r12 #305.13 + # LOE rax rcx rsi r8 r11 r12 r13 r14 r15 edx ebx edi xmm0 xmm1 xmm2 xmm3 xmm4 +..B2.19: # Preds ..B2.19 ..B2.18 + # Execution count [2.50e+01] + vpcmpgtd %xmm4, %xmm3, %k1 #304.9 + vpaddd %xmm1, %xmm4, %xmm4 #304.9 + vmovups %xmm2, (%r12,%r8,4){%k1} #305.13 + vmovups %xmm2, 32(%r12,%r8,4){%k1} #306.13 + vmovups %xmm2, 64(%r12,%r8,4){%k1} #307.13 + addq $4, %r8 #304.9 + cmpq %r11, %r8 #304.9 + jb ..B2.19 # Prob 82% #304.9 + # LOE rax rcx rsi r8 r11 r12 r13 r14 r15 edx ebx edi xmm0 xmm1 xmm2 xmm3 xmm4 +..B2.21: # Preds ..B2.19 ..B2.4 ..B2.17 + # Execution count [5.00e+00] + incl %edi #301.5 + addq $28, %rcx #301.5 + cmpl %edx, %edi #301.5 + jb ..B2.4 # Prob 82% #301.5 + # LOE rax rcx rsi r13 r14 r15 edx ebx edi xmm0 xmm1 xmm2 +..B2.23: # Preds ..B2.21 ..B2.2 + # Execution count [1.00e+00] + xorl %eax, %eax #311.16 + vzeroupper #311.16 +..___tag_value_computeForceLJ_2xnn_full.95: +# getTimeStamp() + call getTimeStamp #311.16 +..___tag_value_computeForceLJ_2xnn_full.96: + # LOE r12 r13 r14 r15 ebx xmm0 +..B2.41: # Preds ..B2.23 + # Execution count [1.00e+00] + vmovsd %xmm0, 192(%rsp) #311.16[spill] + # LOE r12 r13 r14 r15 ebx +..B2.24: # Preds ..B2.41 + # Execution count [1.00e+00] + movl $.L_2__STRING.2, %edi #315.5 +..___tag_value_computeForceLJ_2xnn_full.98: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #315.5 +..___tag_value_computeForceLJ_2xnn_full.99: + # LOE r12 r13 r14 r15 ebx +..B2.25: # Preds ..B2.24 + # Execution count [1.00e+00] + movl %ebx, %r9d #318.16 + xorl %r10d, %r10d #318.16 + cmpl $0, 20(%r13) #318.26 + jle ..B2.33 # Prob 10% #318.26 + # LOE r10 r12 r13 r14 r15 ebx r9d +..B2.26: # Preds ..B2.25 + # Execution count [9.00e-01] + movl $4369, %eax #406.9 + kmovw %eax, %k1 #406.9 + vmovups .L_2il0floatpacket.7(%rip), %zmm28 #406.9 + vmovups .L_2il0floatpacket.6(%rip), %zmm24 #406.9 + vmovups 64(%rsp), %zmm25 #406.9[spill] + vmovups 128(%rsp), %zmm26 #406.9[spill] + vmovups (%rsp), %zmm27 #406.9[spill] + vpxord %zmm8, %zmm8, %zmm8 #335.30 + # LOE r10 r13 r14 r15 ebx r9d zmm8 zmm24 zmm25 zmm26 zmm27 zmm28 k1 +..B2.27: # Preds ..B2.31 ..B2.26 + # Execution count [5.00e+00] + movl %r9d, %eax #323.27 + movl %r9d, %ecx #323.27 + sarl $1, %eax #323.27 + andl $1, %ecx #323.27 + shll $2, %ecx #323.27 + movl 16(%r14), %edi #326.44 + imull %r9d, %edi #326.44 + movq 160(%r13), %rsi #324.27 + lea (%rax,%rax,2), %edx #323.27 + vmovaps %zmm8, %zmm16 #335.30 + lea 
(%rcx,%rdx,8), %eax #323.27 + movslq %eax, %rax #323.27 + vmovaps %zmm16, %zmm15 #336.30 + movslq %edi, %rdi #326.19 + vmovaps %zmm15, %zmm14 #337.30 + movq 8(%r14), %r8 #326.19 + movq 24(%r14), %r11 #327.25 + vbroadcastss 4(%rsi,%rax,4), %zmm21 #329.33 + vbroadcastss 12(%rsi,%rax,4), %zmm19 #330.33 + vbroadcastss 36(%rsi,%rax,4), %zmm17 #331.33 + vbroadcastss 44(%rsi,%rax,4), %zmm1 #332.33 + vbroadcastss 68(%rsi,%rax,4), %zmm3 #333.33 + vbroadcastss 76(%rsi,%rax,4), %zmm5 #334.33 + vbroadcastss 32(%rsi,%rax,4), %zmm0 #331.33 + vbroadcastss (%rsi,%rax,4), %zmm20 #329.33 + vbroadcastss 8(%rsi,%rax,4), %zmm18 #330.33 + vbroadcastss 40(%rsi,%rax,4), %zmm2 #332.33 + vbroadcastss 64(%rsi,%rax,4), %zmm4 #333.33 + vbroadcastss 72(%rsi,%rax,4), %zmm6 #334.33 + vmovaps %zmm14, %zmm13 #338.30 + lea (%r8,%rdi,4), %rdx #326.19 + vmovaps %zmm13, %zmm12 #339.30 + xorl %r8d, %r8d #342.19 + movslq (%r11,%r10,4), %rdi #327.25 + vmovaps %zmm12, %zmm22 #340.30 + movq 176(%r13), %rcx #325.27 + vinsertf64x4 $1, %ymm21, %zmm20, %zmm21 #329.33 + vinsertf64x4 $1, %ymm19, %zmm18, %zmm20 #330.33 + vinsertf64x4 $1, %ymm17, %zmm0, %zmm19 #331.33 + vinsertf64x4 $1, %ymm1, %zmm2, %zmm18 #332.33 + vinsertf64x4 $1, %ymm3, %zmm4, %zmm17 #333.33 + vinsertf64x4 $1, %ymm5, %zmm6, %zmm23 #334.33 + testq %rdi, %rdi #342.28 + jle ..B2.31 # Prob 10% #342.28 + # LOE rax rdx rcx rsi rdi r8 r10 r13 r14 r15 ebx r9d zmm8 zmm12 zmm13 zmm14 zmm15 zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 zmm24 zmm25 zmm26 zmm27 zmm28 k1 +..B2.28: # Preds ..B2.27 + # Execution count [4.50e+00] + movq %r13, 16(%rsp) #[spill] + movq %r14, 8(%rsp) #[spill] + movq %r15, (%rsp) #[spill] + # LOE rax rdx rcx rsi rdi r8 r10 ebx r9d zmm8 zmm12 zmm13 zmm14 zmm15 zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 zmm24 zmm25 zmm26 zmm27 zmm28 k1 +..B2.29: # Preds ..B2.29 ..B2.28 + # Execution count [2.50e+01] + movl (%rdx,%r8,4), %r13d #343.22 + incq %r8 #342.39 + lea (%r13,%r13,2), %r14d #344.31 + shll $3, %r14d #344.31 + lea (%r13,%r13), %r15d #365.56 + movslq %r14d, %r14 #345.31 + cmpl %r9d, %r15d #365.66 + lea 1(%r13,%r13), %r11d #366.61 + movl %ebx, %r13d #365.66 + sete %r13b #365.66 + cmpl %r9d, %r11d #366.66 + movl %ebx, %r11d #366.66 + vbroadcastf64x4 64(%rsi,%r14,4), %zmm31 #350.36 + sete %r11b #366.66 + vbroadcastf64x4 32(%rsi,%r14,4), %zmm30 #349.36 + vbroadcastf64x4 (%rsi,%r14,4), %zmm29 #348.36 + vsubps %zmm31, %zmm17, %zmm10 #353.35 + vsubps %zmm31, %zmm23, %zmm5 #356.35 + vsubps %zmm30, %zmm19, %zmm9 #352.35 + vsubps %zmm30, %zmm18, %zmm6 #355.35 + vsubps %zmm29, %zmm21, %zmm11 #351.35 + vsubps %zmm29, %zmm20, %zmm7 #354.35 + vmulps %zmm10, %zmm10, %zmm0 #383.80 + vmulps %zmm5, %zmm5, %zmm1 #384.80 + vfmadd231ps %zmm9, %zmm9, %zmm0 #383.57 + vfmadd231ps %zmm6, %zmm6, %zmm1 #384.57 + vfmadd231ps %zmm11, %zmm11, %zmm0 #383.34 + vfmadd231ps %zmm7, %zmm7, %zmm1 #384.34 + vrcp14ps %zmm0, %zmm4 #389.35 + vrcp14ps %zmm1, %zmm3 #390.35 + vcmpps $17, %zmm27, %zmm1, %k5 #387.67 + vcmpps $17, %zmm27, %zmm0, %k2 #386.67 + vmulps %zmm26, %zmm4, %zmm2 #392.67 + vmulps %zmm26, %zmm3, %zmm29 #393.67 + vmulps %zmm2, %zmm4, %zmm30 #392.51 + vmulps %zmm29, %zmm3, %zmm1 #393.51 + vmulps %zmm30, %zmm4, %zmm2 #392.35 + vmulps %zmm1, %zmm3, %zmm0 #393.35 + vfmsub213ps %zmm28, %zmm4, %zmm30 #395.79 + vfmsub213ps %zmm28, %zmm3, %zmm1 #396.79 + vmulps %zmm25, %zmm4, %zmm4 #395.105 + vmulps %zmm25, %zmm3, %zmm3 #396.105 + vmulps %zmm4, %zmm30, %zmm31 #395.70 + vmulps %zmm3, %zmm1, %zmm1 #396.70 + vmulps %zmm31, %zmm2, %zmm2 #395.54 + vmulps %zmm1, %zmm0, %zmm0 #396.54 + 
vmulps %zmm2, %zmm24, %zmm4 #395.36 + vmulps %zmm0, %zmm24, %zmm2 #396.36 + movl %r11d, %r14d #380.39 + lea (%r13,%r13), %r12d #380.39 + shll $5, %r14d #380.39 + negl %r12d #380.39 + subl %r14d, %r12d #380.39 + movl %r13d, %r14d #380.39 + movl %r11d, %r15d #380.39 + negl %r14d #380.39 + shll $4, %r15d #380.39 + shll $8, %r12d #380.39 + subl %r15d, %r14d #380.39 + addl $-256, %r12d #380.39 + addl $255, %r14d #380.39 + orl %r14d, %r12d #380.39 + lea (,%r13,8), %r14d #381.39 + kmovw %r12d, %k0 #386.41 + movl %r11d, %r12d #381.39 + shll $2, %r13d #381.39 + negl %r14d #381.39 + shll $7, %r12d #381.39 + negl %r13d #381.39 + shll $6, %r11d #381.39 + subl %r12d, %r14d #381.39 + shll $8, %r14d #381.39 + subl %r11d, %r13d #381.39 + addl $-256, %r14d #381.39 + addl $255, %r13d #381.39 + orl %r13d, %r14d #381.39 + kmovw %r14d, %k4 #387.41 + kandw %k2, %k0, %k3 #386.41 + kandw %k5, %k4, %k6 #387.41 + vfmadd231ps %zmm11, %zmm4, %zmm16{%k3} #398.20 + vfmadd231ps %zmm9, %zmm4, %zmm15{%k3} #399.20 + vfmadd231ps %zmm10, %zmm4, %zmm14{%k3} #400.20 + vfmadd231ps %zmm7, %zmm2, %zmm13{%k6} #401.20 + vfmadd231ps %zmm6, %zmm2, %zmm12{%k6} #402.20 + vfmadd231ps %zmm5, %zmm2, %zmm22{%k6} #403.20 + cmpq %rdi, %r8 #342.28 + jl ..B2.29 # Prob 82% #342.28 + # LOE rax rdx rcx rsi rdi r8 r10 ebx r9d zmm8 zmm12 zmm13 zmm14 zmm15 zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 zmm24 zmm25 zmm26 zmm27 zmm28 k1 +..B2.30: # Preds ..B2.29 + # Execution count [4.50e+00] + movq 16(%rsp), %r13 #[spill] + movq 8(%rsp), %r14 #[spill] + movq (%rsp), %r15 #[spill] + # LOE rax rcx rdi r10 r13 r14 r15 ebx r9d zmm8 zmm12 zmm13 zmm14 zmm15 zmm16 zmm22 zmm24 zmm25 zmm26 zmm27 zmm28 k1 +..B2.31: # Preds ..B2.30 ..B2.27 + # Execution count [5.00e+00] + vshuff32x4 $136, %zmm13, %zmm16, %zmm17 #406.9 + incl %r9d #318.49 + vshuff32x4 $221, %zmm13, %zmm16, %zmm16 #406.9 + vshuff32x4 $136, %zmm12, %zmm15, %zmm31 #407.9 + vshuff32x4 $221, %zmm12, %zmm15, %zmm0 #407.9 + vshuff32x4 $136, %zmm22, %zmm14, %zmm9 #408.9 + vshuff32x4 $221, %zmm22, %zmm14, %zmm22 #408.9 + vaddps %zmm16, %zmm17, %zmm19 #406.9 + vaddps %zmm0, %zmm31, %zmm2 #407.9 + vxorpd %xmm0, %xmm0, %xmm0 #412.9 + vaddps %zmm22, %zmm9, %zmm11 #408.9 + vpermilps $78, %zmm19, %zmm18 #406.9 + incq %r10 #318.49 + vpermilps $78, %zmm2, %zmm1 #407.9 + vpermilps $78, %zmm11, %zmm10 #408.9 + vaddps %zmm19, %zmm18, %zmm21 #406.9 + vaddps %zmm2, %zmm1, %zmm4 #407.9 + vaddps %zmm11, %zmm10, %zmm13 #408.9 + vpermilps $177, %zmm21, %zmm20 #406.9 + vpermilps $177, %zmm4, %zmm3 #407.9 + vpermilps $177, %zmm13, %zmm12 #408.9 + vaddps %zmm21, %zmm20, %zmm23 #406.9 + vaddps %zmm4, %zmm3, %zmm5 #407.9 + vaddps %zmm13, %zmm12, %zmm14 #408.9 + vcompressps %zmm23, %zmm29{%k1}{z} #406.9 + vcompressps %zmm5, %zmm6{%k1}{z} #407.9 + vcompressps %zmm14, %zmm15{%k1}{z} #408.9 + vaddps (%rcx,%rax,4), %xmm29, %xmm30 #406.9 + vaddps 32(%rcx,%rax,4), %xmm6, %xmm7 #407.9 + vaddps 64(%rcx,%rax,4), %xmm15, %xmm16 #408.9 + vmovups %xmm30, (%rcx,%rax,4) #406.9 + vmovups %xmm7, 32(%rcx,%rax,4) #407.9 + vmovups %xmm16, 64(%rcx,%rax,4) #408.9 + addq %rdi, 8(%r15) #411.9 + vcvtsi2sd %edi, %xmm0, %xmm0 #412.9 + vcvttsd2si %xmm0, %rax #412.9 + incq (%r15) #410.9 + addq %rax, 16(%r15) #412.9 + cmpl 20(%r13), %r9d #318.26 + jl ..B2.27 # Prob 82% #318.26 + # LOE r10 r13 r14 r15 ebx r9d zmm8 zmm24 zmm25 zmm26 zmm27 zmm28 k1 +..B2.33: # Preds ..B2.31 ..B2.25 + # Execution count [1.00e+00] + movl $.L_2__STRING.2, %edi #416.5 + vzeroupper #416.5 +..___tag_value_computeForceLJ_2xnn_full.109: +# likwid_markerStopRegion(const char *) + 
call likwid_markerStopRegion #416.5 +..___tag_value_computeForceLJ_2xnn_full.110: + # LOE r12 +..B2.34: # Preds ..B2.33 + # Execution count [1.00e+00] + xorl %eax, %eax #419.16 +..___tag_value_computeForceLJ_2xnn_full.111: +# getTimeStamp() + call getTimeStamp #419.16 +..___tag_value_computeForceLJ_2xnn_full.112: + # LOE r12 xmm0 +..B2.42: # Preds ..B2.34 + # Execution count [1.00e+00] + vmovsd %xmm0, (%rsp) #419.16[spill] + # LOE r12 +..B2.35: # Preds ..B2.42 + # Execution count [1.00e+00] + movl $.L_2__STRING.5, %edi #420.5 + xorl %eax, %eax #420.5 +..___tag_value_computeForceLJ_2xnn_full.114: +# debug_printf(const char *, ...) + call debug_printf #420.5 +..___tag_value_computeForceLJ_2xnn_full.115: + # LOE r12 +..B2.36: # Preds ..B2.35 + # Execution count [1.00e+00] + vmovsd (%rsp), %xmm0 #421.14[spill] + vsubsd 192(%rsp), %xmm0, %xmm0 #421.14[spill] + addq $216, %rsp #421.14 + .cfi_restore 3 + popq %rbx #421.14 + .cfi_restore 15 + popq %r15 #421.14 + .cfi_restore 14 + popq %r14 #421.14 + .cfi_restore 13 + popq %r13 #421.14 + .cfi_restore 12 + popq %r12 #421.14 + movq %rbp, %rsp #421.14 + popq %rbp #421.14 + .cfi_def_cfa 7, 8 + .cfi_restore 6 + ret #421.14 + .cfi_def_cfa 6, 16 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22 + .cfi_offset 6, -16 + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + # LOE +..B2.37: # Preds ..B2.5 ..B2.9 + # Execution count [4.50e-01]: Infreq + movl %ebx, %r10d #304.9 + jmp ..B2.17 # Prob 100% #304.9 + .align 16,0x90 + # LOE rax rcx rsi r12 r13 r14 r15 edx ebx edi r10d r11d xmm0 xmm1 xmm2 + .cfi_endproc +# mark_end; + .type computeForceLJ_2xnn_full,@function + .size computeForceLJ_2xnn_full,.-computeForceLJ_2xnn_full +..LNcomputeForceLJ_2xnn_full.1: + .data +# -- End computeForceLJ_2xnn_full + .text +.L_2__routine_start_computeForceLJ_2xnn_2: +# -- Begin computeForceLJ_2xnn + .text +# mark_begin; + .align 16,0x90 + .globl computeForceLJ_2xnn +# --- computeForceLJ_2xnn(Parameter *, Atom *, Neighbor *, Stats *) +computeForceLJ_2xnn: +# parameter 1: %rdi +# parameter 2: %rsi +# parameter 3: %rdx +# parameter 4: %rcx +..B3.1: # Preds ..B3.0 + # Execution count [1.00e+00] + .cfi_startproc +..___tag_value_computeForceLJ_2xnn.133: +..L134: + #424.92 + pushq %rbp #424.92 + .cfi_def_cfa_offset 16 + movq %rsp, %rbp #424.92 + .cfi_def_cfa 6, 16 + .cfi_offset 6, -16 + andq $-64, %rsp #424.92 + pushq %r13 #424.92 + pushq %r14 #424.92 + pushq %r15 #424.92 + pushq %rbx #424.92 + subq $224, %rsp #424.92 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + movq %rdx, %r14 #424.92 + movq %rcx, %r15 #424.92 + movq %rsi, %r13 #424.92 + movq %rdi, %rbx #424.92 + cmpl $0, 32(%r14) #425.8 + je ..B3.4 # Prob 50% 
#425.8 + # LOE rbx r12 r13 r14 r15 +..B3.2: # Preds ..B3.1 + # Execution count [5.00e-01] + movq %rbx, %rdi #426.16 + movq %r13, %rsi #426.16 + movq %r14, %rdx #426.16 + movq %r15, %rcx #426.16 + addq $224, %rsp #426.16 + .cfi_restore 3 + popq %rbx #426.16 + .cfi_restore 15 + popq %r15 #426.16 + .cfi_restore 14 + popq %r14 #426.16 + .cfi_restore 13 + popq %r13 #426.16 + movq %rbp, %rsp #426.16 + popq %rbp #426.16 + .cfi_def_cfa 7, 8 + .cfi_restore 6 +# computeForceLJ_2xnn_half(Parameter *, Atom *, Neighbor *, Stats *) + jmp computeForceLJ_2xnn_half #426.16 + .cfi_def_cfa 6, 16 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + .cfi_offset 6, -16 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + # LOE +..B3.4: # Preds ..B3.1 + # Execution count [5.00e-01] + movl $.L_2__STRING.4, %edi #429.12 + xorl %eax, %eax #429.12 +..___tag_value_computeForceLJ_2xnn.154: +# debug_printf(const char *, ...) + call debug_printf #429.12 +..___tag_value_computeForceLJ_2xnn.155: + # LOE rbx r12 r13 r14 r15 +..B3.5: # Preds ..B3.4 + # Execution count [5.00e-01] + vmovss 108(%rbx), %xmm0 #429.12 + xorl %r10d, %r10d #429.12 + vmulss %xmm0, %xmm0, %xmm1 #429.12 + xorl %r8d, %r8d #429.12 + vbroadcastss 48(%rbx), %zmm3 #429.12 + vbroadcastss 40(%rbx), %zmm4 #429.12 + vbroadcastss %xmm1, %zmm2 #429.12 + vmovups %zmm3, (%rsp) #429.12[spill] + vmovups %zmm4, 128(%rsp) #429.12[spill] + vmovups %zmm2, 64(%rsp) #429.12[spill] + movl 20(%r13), %edi #429.12 + xorl %ebx, %ebx #429.12 + testl %edi, %edi #429.12 + jle ..B3.26 # Prob 9% #429.12 + # LOE r8 r12 r13 r14 r15 ebx edi r10d +..B3.6: # Preds ..B3.5 + # Execution count [4.50e-01] + movq 176(%r13), %r9 #429.12 + movq 192(%r13), %rax #429.12 + vxorps %xmm2, %xmm2, %xmm2 #429.12 + vmovdqu .L_2il0floatpacket.0(%rip), %xmm1 #429.12 + vmovdqu .L_2il0floatpacket.1(%rip), %xmm0 #429.12 + movq %r12, 192(%rsp) #429.12[spill] + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x22 + # LOE rax r8 r9 r13 r14 r15 ebx edi r10d xmm0 xmm1 xmm2 +..B3.7: # Preds ..B3.24 ..B3.6 + # Execution count [2.50e+00] + movl %r10d, %r11d #429.12 + movl %r10d, %r12d #429.12 + sarl $1, %r11d #429.12 + andl $1, %r12d #429.12 + shll $2, %r12d #429.12 + movl (%r8,%rax), %ecx #429.12 + lea (%r11,%r11,2), %r11d #429.12 + lea (%r12,%r11,8), %r11d #429.12 + movslq %r11d, %r11 #429.12 + lea (%r9,%r11,4), %rsi #429.12 + testl %ecx, %ecx #429.12 + jle ..B3.24 # Prob 50% #429.12 + # LOE rax rsi r8 r9 r13 r14 r15 ecx ebx edi r10d xmm0 xmm1 xmm2 +..B3.8: # Preds ..B3.7 + # Execution count [2.25e+00] + cmpl $8, %ecx #429.12 + jl ..B3.40 # Prob 10% #429.12 + # LOE rax rsi r8 r9 r13 r14 r15 ecx ebx edi r10d xmm0 xmm1 xmm2 +..B3.9: # Preds ..B3.8 + # Execution count [2.25e+00] + lea 64(%rsi), %r11 #429.12 + andq $15, %r11 #429.12 + testl $3, %r11d #429.12 + je ..B3.11 # Prob 50% #429.12 + # LOE rax rsi r8 r9 r13 r14 r15 ecx ebx edi r10d r11d xmm0 xmm1 xmm2 +..B3.10: # Preds ..B3.9 + # Execution count [1.12e+00] + movl %ebx, %r11d #429.12 + jmp ..B3.12 # Prob 100% #429.12 + # LOE rax rsi r8 r9 r11 r13 r14 r15 ecx ebx edi r10d xmm0 xmm1 xmm2 +..B3.11: # Preds ..B3.9 + # Execution count [1.12e+00] + movl 
%r11d, %r12d #429.12 + negl %r12d #429.12 + addl $16, %r12d #429.12 + shrl $2, %r12d #429.12 + testl %r11d, %r11d #429.12 + cmovne %r12d, %r11d #429.12 + # LOE rax rsi r8 r9 r11 r13 r14 r15 ecx ebx edi r10d xmm0 xmm1 xmm2 +..B3.12: # Preds ..B3.10 ..B3.11 + # Execution count [2.25e+00] + lea 8(%r11), %r12d #429.12 + cmpl %r12d, %ecx #429.12 + jl ..B3.40 # Prob 10% #429.12 + # LOE rax rsi r8 r9 r11 r13 r14 r15 ecx ebx edi r10d xmm0 xmm1 xmm2 +..B3.13: # Preds ..B3.12 + # Execution count [2.50e+00] + movl %ecx, %edx #429.12 + xorl %r12d, %r12d #429.12 + subl %r11d, %edx #429.12 + andl $7, %edx #429.12 + negl %edx #429.12 + addl %ecx, %edx #429.12 + cmpl $1, %r11d #429.12 + jb ..B3.17 # Prob 10% #429.12 + # LOE rax rsi r8 r9 r11 r12 r13 r14 r15 edx ecx ebx edi r10d xmm0 xmm1 xmm2 +..B3.15: # Preds ..B3.13 ..B3.15 + # Execution count [1.25e+01] + movl %ebx, (%rsi,%r12,4) #429.12 + movl %ebx, 32(%rsi,%r12,4) #429.12 + movl %ebx, 64(%rsi,%r12,4) #429.12 + incq %r12 #429.12 + cmpq %r11, %r12 #429.12 + jb ..B3.15 # Prob 82% #429.12 + # LOE rax rsi r8 r9 r11 r12 r13 r14 r15 edx ecx ebx edi r10d xmm0 xmm1 xmm2 +..B3.17: # Preds ..B3.15 ..B3.13 + # Execution count [2.25e+00] + movslq %edx, %r12 #429.12 + # LOE rax rsi r8 r9 r11 r12 r13 r14 r15 edx ecx ebx edi r10d xmm0 xmm1 xmm2 +..B3.18: # Preds ..B3.18 ..B3.17 + # Execution count [1.25e+01] + vmovups %xmm2, (%rsi,%r11,4) #429.12 + vmovups %xmm2, 32(%rsi,%r11,4) #429.12 + vmovups %xmm2, 64(%rsi,%r11,4) #429.12 + vmovups %xmm2, 16(%rsi,%r11,4) #429.12 + vmovups %xmm2, 48(%rsi,%r11,4) #429.12 + vmovups %xmm2, 80(%rsi,%r11,4) #429.12 + addq $8, %r11 #429.12 + cmpq %r12, %r11 #429.12 + jb ..B3.18 # Prob 82% #429.12 + # LOE rax rsi r8 r9 r11 r12 r13 r14 r15 edx ecx ebx edi r10d xmm0 xmm1 xmm2 +..B3.20: # Preds ..B3.18 ..B3.40 + # Execution count [2.50e+00] + lea 1(%rdx), %r11d #429.12 + cmpl %ecx, %r11d #429.12 + ja ..B3.24 # Prob 50% #429.12 + # LOE rax rsi r8 r9 r13 r14 r15 edx ecx ebx edi r10d xmm0 xmm1 xmm2 +..B3.21: # Preds ..B3.20 + # Execution count [2.25e+00] + movslq %edx, %r12 #429.12 + negl %edx #429.12 + addl %ecx, %edx #429.12 + xorl %r11d, %r11d #429.12 + movslq %ecx, %rcx #429.12 + vmovdqa %xmm0, %xmm4 #429.12 + vpbroadcastd %edx, %xmm3 #429.12 + subq %r12, %rcx #429.12 + lea (%rsi,%r12,4), %rsi #429.12 + # LOE rax rcx rsi r8 r9 r11 r13 r14 r15 ebx edi r10d xmm0 xmm1 xmm2 xmm3 xmm4 +..B3.22: # Preds ..B3.22 ..B3.21 + # Execution count [1.25e+01] + vpcmpgtd %xmm4, %xmm3, %k1 #429.12 + vpaddd %xmm1, %xmm4, %xmm4 #429.12 + vmovups %xmm2, (%rsi,%r11,4){%k1} #429.12 + vmovups %xmm2, 32(%rsi,%r11,4){%k1} #429.12 + vmovups %xmm2, 64(%rsi,%r11,4){%k1} #429.12 + addq $4, %r11 #429.12 + cmpq %rcx, %r11 #429.12 + jb ..B3.22 # Prob 82% #429.12 + # LOE rax rcx rsi r8 r9 r11 r13 r14 r15 ebx edi r10d xmm0 xmm1 xmm2 xmm3 xmm4 +..B3.24: # Preds ..B3.22 ..B3.7 ..B3.20 + # Execution count [2.50e+00] + incl %r10d #429.12 + addq $28, %r8 #429.12 + cmpl %edi, %r10d #429.12 + jb ..B3.7 # Prob 82% #429.12 + # LOE rax r8 r9 r13 r14 r15 ebx edi r10d xmm0 xmm1 xmm2 +..B3.25: # Preds ..B3.24 + # Execution count [4.50e-01] + movq 192(%rsp), %r12 #[spill] + .cfi_restore 12 + # LOE r12 r13 r14 r15 ebx +..B3.26: # Preds ..B3.5 ..B3.25 + # Execution count [5.00e-01] + xorl %eax, %eax #429.12 + vzeroupper #429.12 +..___tag_value_computeForceLJ_2xnn.162: +# getTimeStamp() + call getTimeStamp #429.12 +..___tag_value_computeForceLJ_2xnn.163: + # LOE r12 r13 r14 r15 ebx xmm0 +..B3.45: # Preds ..B3.26 + # Execution count [5.00e-01] + vmovsd %xmm0, 200(%rsp) #429.12[spill] + # 
LOE r12 r13 r14 r15 ebx +..B3.27: # Preds ..B3.45 + # Execution count [5.00e-01] + movl $.L_2__STRING.2, %edi #429.12 +..___tag_value_computeForceLJ_2xnn.165: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #429.12 +..___tag_value_computeForceLJ_2xnn.166: + # LOE r12 r13 r14 r15 ebx +..B3.28: # Preds ..B3.27 + # Execution count [5.00e-01] + movl %ebx, %edx #429.12 + xorl %ecx, %ecx #429.12 + cmpl $0, 20(%r13) #429.12 + jle ..B3.36 # Prob 10% #429.12 + # LOE rcx r12 r13 r14 r15 edx ebx +..B3.29: # Preds ..B3.28 + # Execution count [4.50e-01] + movl $4369, %eax #429.12 + kmovw %eax, %k1 #429.12 + vmovups .L_2il0floatpacket.7(%rip), %zmm28 #429.12 + vmovups .L_2il0floatpacket.6(%rip), %zmm24 #429.12 + vmovups 128(%rsp), %zmm25 #429.12[spill] + vmovups (%rsp), %zmm26 #429.12[spill] + vmovups 64(%rsp), %zmm27 #429.12[spill] + movq %r12, 192(%rsp) #429.12[spill] + vpxord %zmm8, %zmm8, %zmm8 #429.12 + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x22 + # LOE rcx r13 r14 r15 edx ebx zmm8 zmm24 zmm25 zmm26 zmm27 zmm28 k1 +..B3.30: # Preds ..B3.34 ..B3.29 + # Execution count [2.50e+00] + movl %edx, %eax #429.12 + movl %edx, %r8d #429.12 + sarl $1, %eax #429.12 + andl $1, %r8d #429.12 + shll $2, %r8d #429.12 + movl 16(%r14), %r10d #429.12 + imull %edx, %r10d #429.12 + movq 160(%r13), %r9 #429.12 + lea (%rax,%rax,2), %edi #429.12 + vmovaps %zmm8, %zmm16 #429.12 + lea (%r8,%rdi,8), %eax #429.12 + movslq %eax, %rax #429.12 + vmovaps %zmm16, %zmm15 #429.12 + movslq %r10d, %r10 #429.12 + vmovaps %zmm15, %zmm14 #429.12 + movq 8(%r14), %r11 #429.12 + movq 24(%r14), %r12 #429.12 + vbroadcastss 4(%r9,%rax,4), %zmm21 #429.12 + vbroadcastss 12(%r9,%rax,4), %zmm19 #429.12 + vbroadcastss 36(%r9,%rax,4), %zmm17 #429.12 + vbroadcastss 44(%r9,%rax,4), %zmm1 #429.12 + vbroadcastss 68(%r9,%rax,4), %zmm3 #429.12 + vbroadcastss 76(%r9,%rax,4), %zmm5 #429.12 + vbroadcastss 32(%r9,%rax,4), %zmm0 #429.12 + vbroadcastss (%r9,%rax,4), %zmm20 #429.12 + vbroadcastss 8(%r9,%rax,4), %zmm18 #429.12 + vbroadcastss 40(%r9,%rax,4), %zmm2 #429.12 + vbroadcastss 64(%r9,%rax,4), %zmm4 #429.12 + vbroadcastss 72(%r9,%rax,4), %zmm6 #429.12 + vmovaps %zmm14, %zmm13 #429.12 + lea (%r11,%r10,4), %rdi #429.12 + vmovaps %zmm13, %zmm12 #429.12 + xorl %r11d, %r11d #429.12 + movslq (%r12,%rcx,4), %r10 #429.12 + vmovaps %zmm12, %zmm22 #429.12 + movq 176(%r13), %r8 #429.12 + vinsertf64x4 $1, %ymm21, %zmm20, %zmm21 #429.12 + vinsertf64x4 $1, %ymm19, %zmm18, %zmm20 #429.12 + vinsertf64x4 $1, %ymm17, %zmm0, %zmm19 #429.12 + vinsertf64x4 $1, %ymm1, %zmm2, %zmm18 #429.12 + vinsertf64x4 $1, %ymm3, %zmm4, %zmm17 #429.12 + vinsertf64x4 $1, %ymm5, %zmm6, %zmm23 #429.12 + testq %r10, %r10 #429.12 + jle ..B3.34 # Prob 10% #429.12 + # LOE rax rcx rdi r8 r9 r10 r11 r13 r14 r15 edx ebx zmm8 zmm12 zmm13 zmm14 zmm15 zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 zmm24 zmm25 zmm26 zmm27 zmm28 k1 +..B3.31: # Preds ..B3.30 + # Execution count [2.25e+00] + movq %r13, 16(%rsp) #[spill] + movq %r14, 8(%rsp) #[spill] + movq %r15, (%rsp) #[spill] + # LOE rax rcx rdi r8 r9 r10 r11 edx ebx zmm8 zmm12 zmm13 zmm14 zmm15 zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 zmm24 zmm25 zmm26 zmm27 zmm28 k1 +..B3.32: # Preds ..B3.32 ..B3.31 + # Execution count [1.25e+01] + movl (%rdi,%r11,4), %r13d #429.12 + incq %r11 #429.12 + lea (%r13,%r13,2), %r14d #429.12 + shll $3, %r14d #429.12 + lea (%r13,%r13), %r15d #429.12 + movslq %r14d, %r14 #429.12 + cmpl %edx, %r15d #429.12 + lea 1(%r13,%r13), 
%esi #429.12 + movl %ebx, %r13d #429.12 + sete %r13b #429.12 + cmpl %edx, %esi #429.12 + movl %ebx, %esi #429.12 + vbroadcastf64x4 64(%r9,%r14,4), %zmm31 #429.12 + sete %sil #429.12 + vbroadcastf64x4 32(%r9,%r14,4), %zmm30 #429.12 + vbroadcastf64x4 (%r9,%r14,4), %zmm29 #429.12 + vsubps %zmm31, %zmm17, %zmm10 #429.12 + vsubps %zmm31, %zmm23, %zmm5 #429.12 + vsubps %zmm30, %zmm19, %zmm9 #429.12 + vsubps %zmm30, %zmm18, %zmm6 #429.12 + vsubps %zmm29, %zmm21, %zmm11 #429.12 + vsubps %zmm29, %zmm20, %zmm7 #429.12 + vmulps %zmm10, %zmm10, %zmm0 #429.12 + vmulps %zmm5, %zmm5, %zmm1 #429.12 + vfmadd231ps %zmm9, %zmm9, %zmm0 #429.12 + vfmadd231ps %zmm6, %zmm6, %zmm1 #429.12 + vfmadd231ps %zmm11, %zmm11, %zmm0 #429.12 + vfmadd231ps %zmm7, %zmm7, %zmm1 #429.12 + vrcp14ps %zmm0, %zmm4 #429.12 + vrcp14ps %zmm1, %zmm3 #429.12 + vcmpps $17, %zmm27, %zmm1, %k5 #429.12 + vcmpps $17, %zmm27, %zmm0, %k2 #429.12 + vmulps %zmm4, %zmm26, %zmm2 #429.12 + vmulps %zmm3, %zmm26, %zmm29 #429.12 + vmulps %zmm2, %zmm4, %zmm30 #429.12 + vmulps %zmm29, %zmm3, %zmm1 #429.12 + vmulps %zmm30, %zmm4, %zmm2 #429.12 + vmulps %zmm1, %zmm3, %zmm0 #429.12 + vfmsub213ps %zmm28, %zmm4, %zmm30 #429.12 + vfmsub213ps %zmm28, %zmm3, %zmm1 #429.12 + vmulps %zmm4, %zmm25, %zmm4 #429.12 + vmulps %zmm3, %zmm25, %zmm3 #429.12 + vmulps %zmm4, %zmm30, %zmm31 #429.12 + vmulps %zmm3, %zmm1, %zmm1 #429.12 + vmulps %zmm31, %zmm2, %zmm2 #429.12 + vmulps %zmm1, %zmm0, %zmm0 #429.12 + vmulps %zmm2, %zmm24, %zmm4 #429.12 + vmulps %zmm0, %zmm24, %zmm2 #429.12 + movl %esi, %r14d #429.12 + lea (%r13,%r13), %r12d #429.12 + shll $5, %r14d #429.12 + negl %r12d #429.12 + subl %r14d, %r12d #429.12 + movl %r13d, %r14d #429.12 + movl %esi, %r15d #429.12 + negl %r14d #429.12 + shll $4, %r15d #429.12 + shll $8, %r12d #429.12 + subl %r15d, %r14d #429.12 + addl $-256, %r12d #429.12 + addl $255, %r14d #429.12 + orl %r14d, %r12d #429.12 + lea (,%r13,8), %r14d #429.12 + kmovw %r12d, %k0 #429.12 + movl %esi, %r12d #429.12 + shll $2, %r13d #429.12 + negl %r14d #429.12 + shll $7, %r12d #429.12 + negl %r13d #429.12 + shll $6, %esi #429.12 + subl %r12d, %r14d #429.12 + shll $8, %r14d #429.12 + subl %esi, %r13d #429.12 + addl $-256, %r14d #429.12 + addl $255, %r13d #429.12 + orl %r13d, %r14d #429.12 + kmovw %r14d, %k4 #429.12 + kandw %k2, %k0, %k3 #429.12 + kandw %k5, %k4, %k6 #429.12 + vfmadd231ps %zmm11, %zmm4, %zmm16{%k3} #429.12 + vfmadd231ps %zmm9, %zmm4, %zmm15{%k3} #429.12 + vfmadd231ps %zmm10, %zmm4, %zmm14{%k3} #429.12 + vfmadd231ps %zmm7, %zmm2, %zmm13{%k6} #429.12 + vfmadd231ps %zmm6, %zmm2, %zmm12{%k6} #429.12 + vfmadd231ps %zmm5, %zmm2, %zmm22{%k6} #429.12 + cmpq %r10, %r11 #429.12 + jl ..B3.32 # Prob 82% #429.12 + # LOE rax rcx rdi r8 r9 r10 r11 edx ebx zmm8 zmm12 zmm13 zmm14 zmm15 zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 zmm24 zmm25 zmm26 zmm27 zmm28 k1 +..B3.33: # Preds ..B3.32 + # Execution count [2.25e+00] + movq 16(%rsp), %r13 #[spill] + movq 8(%rsp), %r14 #[spill] + movq (%rsp), %r15 #[spill] + # LOE rax rcx r8 r10 r13 r14 r15 edx ebx zmm8 zmm12 zmm13 zmm14 zmm15 zmm16 zmm22 zmm24 zmm25 zmm26 zmm27 zmm28 k1 +..B3.34: # Preds ..B3.33 ..B3.30 + # Execution count [2.50e+00] + vshuff32x4 $136, %zmm13, %zmm16, %zmm17 #429.12 + incl %edx #429.12 + vshuff32x4 $221, %zmm13, %zmm16, %zmm16 #429.12 + vshuff32x4 $136, %zmm12, %zmm15, %zmm31 #429.12 + vshuff32x4 $221, %zmm12, %zmm15, %zmm0 #429.12 + vshuff32x4 $136, %zmm22, %zmm14, %zmm9 #429.12 + vshuff32x4 $221, %zmm22, %zmm14, %zmm22 #429.12 + vaddps %zmm16, %zmm17, %zmm19 #429.12 + vaddps %zmm0, %zmm31, 
%zmm2 #429.12 + vxorpd %xmm0, %xmm0, %xmm0 #429.12 + vaddps %zmm22, %zmm9, %zmm11 #429.12 + vpermilps $78, %zmm19, %zmm18 #429.12 + incq %rcx #429.12 + vpermilps $78, %zmm2, %zmm1 #429.12 + vpermilps $78, %zmm11, %zmm10 #429.12 + vaddps %zmm19, %zmm18, %zmm21 #429.12 + vaddps %zmm2, %zmm1, %zmm4 #429.12 + vaddps %zmm11, %zmm10, %zmm13 #429.12 + vpermilps $177, %zmm21, %zmm20 #429.12 + vpermilps $177, %zmm4, %zmm3 #429.12 + vpermilps $177, %zmm13, %zmm12 #429.12 + vaddps %zmm21, %zmm20, %zmm23 #429.12 + vaddps %zmm4, %zmm3, %zmm5 #429.12 + vaddps %zmm13, %zmm12, %zmm14 #429.12 + vcompressps %zmm23, %zmm29{%k1}{z} #429.12 + vcompressps %zmm5, %zmm6{%k1}{z} #429.12 + vcompressps %zmm14, %zmm15{%k1}{z} #429.12 + vaddps (%r8,%rax,4), %xmm29, %xmm30 #429.12 + vaddps 32(%r8,%rax,4), %xmm6, %xmm7 #429.12 + vaddps 64(%r8,%rax,4), %xmm15, %xmm16 #429.12 + vmovups %xmm30, (%r8,%rax,4) #429.12 + vmovups %xmm7, 32(%r8,%rax,4) #429.12 + vmovups %xmm16, 64(%r8,%rax,4) #429.12 + addq %r10, 8(%r15) #429.12 + vcvtsi2sd %r10d, %xmm0, %xmm0 #429.12 + vcvttsd2si %xmm0, %rax #429.12 + incq (%r15) #429.12 + addq %rax, 16(%r15) #429.12 + cmpl 20(%r13), %edx #429.12 + jl ..B3.30 # Prob 82% #429.12 + # LOE rcx r13 r14 r15 edx ebx zmm8 zmm24 zmm25 zmm26 zmm27 zmm28 k1 +..B3.35: # Preds ..B3.34 + # Execution count [4.50e-01] + movq 192(%rsp), %r12 #[spill] + .cfi_restore 12 + # LOE r12 +..B3.36: # Preds ..B3.35 ..B3.28 + # Execution count [5.00e-01] + movl $.L_2__STRING.2, %edi #429.12 + vzeroupper #429.12 +..___tag_value_computeForceLJ_2xnn.179: +# likwid_markerStopRegion(const char *) + call likwid_markerStopRegion #429.12 +..___tag_value_computeForceLJ_2xnn.180: + # LOE r12 +..B3.37: # Preds ..B3.36 + # Execution count [5.00e-01] + xorl %eax, %eax #429.12 +..___tag_value_computeForceLJ_2xnn.181: +# getTimeStamp() + call getTimeStamp #429.12 +..___tag_value_computeForceLJ_2xnn.182: + # LOE r12 xmm0 +..B3.46: # Preds ..B3.37 + # Execution count [5.00e-01] + vmovsd %xmm0, (%rsp) #429.12[spill] + # LOE r12 +..B3.38: # Preds ..B3.46 + # Execution count [5.00e-01] + movl $.L_2__STRING.5, %edi #429.12 + xorl %eax, %eax #429.12 +..___tag_value_computeForceLJ_2xnn.184: +# debug_printf(const char *, ...) 
+ call debug_printf #429.12 +..___tag_value_computeForceLJ_2xnn.185: + # LOE r12 +..B3.39: # Preds ..B3.38 + # Execution count [5.00e-01] + vmovsd (%rsp), %xmm0 #429.12[spill] + vsubsd 200(%rsp), %xmm0, %xmm0 #429.12[spill] + addq $224, %rsp #429.12 + .cfi_restore 3 + popq %rbx #429.12 + .cfi_restore 15 + popq %r15 #429.12 + .cfi_restore 14 + popq %r14 #429.12 + .cfi_restore 13 + popq %r13 #429.12 + movq %rbp, %rsp #429.12 + popq %rbp #429.12 + .cfi_def_cfa 7, 8 + .cfi_restore 6 + ret #429.12 + .cfi_def_cfa 6, 16 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + .cfi_offset 6, -16 + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + # LOE +..B3.40: # Preds ..B3.8 ..B3.12 + # Execution count [2.25e-01]: Infreq + movl %ebx, %edx #429.12 + jmp ..B3.20 # Prob 100% #429.12 + .align 16,0x90 + # LOE rax rsi r8 r9 r13 r14 r15 edx ecx ebx edi r10d xmm0 xmm1 xmm2 + .cfi_endproc +# mark_end; + .type computeForceLJ_2xnn,@function + .size computeForceLJ_2xnn,.-computeForceLJ_2xnn +..LNcomputeForceLJ_2xnn.2: + .data +# -- End computeForceLJ_2xnn + .text +.L_2__routine_start_computeForceLJ_2xnn_half_3: +# -- Begin computeForceLJ_2xnn_half + .text +# mark_begin; + .align 16,0x90 + .globl computeForceLJ_2xnn_half +# --- computeForceLJ_2xnn_half(Parameter *, Atom *, Neighbor *, Stats *) +computeForceLJ_2xnn_half: +# parameter 1: %rdi +# parameter 2: %rsi +# parameter 3: %rdx +# parameter 4: %rcx +..B4.1: # Preds ..B4.0 + # Execution count [1.00e+00] + .cfi_startproc +..___tag_value_computeForceLJ_2xnn_half.202: +..L203: + #135.97 + pushq %rbp #135.97 + .cfi_def_cfa_offset 16 + movq %rsp, %rbp #135.97 + .cfi_def_cfa 6, 16 + .cfi_offset 6, -16 + andq $-64, %rsp #135.97 + pushq %r12 #135.97 + pushq %r13 #135.97 + pushq %r14 #135.97 + pushq %r15 #135.97 + pushq %rbx #135.97 + subq $216, %rsp #135.97 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + movq %rdi, %rbx #135.97 + movl $.L_2__STRING.4, %edi #136.5 + xorl %eax, %eax #136.5 + movq %rcx, %r15 #135.97 + movq %rdx, %r14 #135.97 + movq %rsi, %r13 #135.97 +..___tag_value_computeForceLJ_2xnn_half.212: +# debug_printf(const char *, ...) 
+ call debug_printf #136.5 +..___tag_value_computeForceLJ_2xnn_half.213: + # LOE rbx r12 r13 r14 r15 +..B4.2: # Preds ..B4.1 + # Execution count [1.00e+00] + vmovss 108(%rbx), %xmm0 #139.27 + xorl %edi, %edi #149.5 + vmulss %xmm0, %xmm0, %xmm1 #142.36 + xorl %ecx, %ecx #151.27 + vbroadcastss 48(%rbx), %zmm3 #143.32 + vbroadcastss 40(%rbx), %zmm4 #144.29 + vbroadcastss %xmm1, %zmm2 #142.36 + vmovups %zmm3, (%rsp) #143.32[spill] + vmovups %zmm4, 128(%rsp) #144.29[spill] + vmovups %zmm2, 64(%rsp) #142.36[spill] + movl 20(%r13), %edx #149.26 + xorl %ebx, %ebx #149.5 + testl %edx, %edx #149.26 + jle ..B4.23 # Prob 9% #149.26 + # LOE rcx r12 r13 r14 r15 edx ebx edi +..B4.3: # Preds ..B4.2 + # Execution count [9.00e-01] + movq 176(%r13), %rsi #151.27 + movq 192(%r13), %rax #152.32 + vxorps %xmm2, %xmm2, %xmm2 #153.39 + vmovdqu .L_2il0floatpacket.0(%rip), %xmm1 #152.9 + vmovdqu .L_2il0floatpacket.1(%rip), %xmm0 #152.9 + # LOE rax rcx rsi r13 r14 r15 edx ebx edi xmm0 xmm1 xmm2 +..B4.4: # Preds ..B4.21 ..B4.3 + # Execution count [5.00e+00] + movl %edi, %r8d #150.27 + movl %edi, %r9d #150.27 + sarl $1, %r8d #150.27 + andl $1, %r9d #150.27 + shll $2, %r9d #150.27 + lea (%r8,%r8,2), %r10d #150.27 + lea (%r9,%r10,8), %r11d #150.27 + movslq %r11d, %r11 #151.27 + lea (%rsi,%r11,4), %r12 #151.27 + movl (%rcx,%rax), %r11d #152.32 + testl %r11d, %r11d #152.32 + jle ..B4.21 # Prob 50% #152.32 + # LOE rax rcx rsi r12 r13 r14 r15 edx ebx edi r11d xmm0 xmm1 xmm2 +..B4.5: # Preds ..B4.4 + # Execution count [4.50e+00] + cmpl $8, %r11d #152.9 + jl ..B4.37 # Prob 10% #152.9 + # LOE rax rcx rsi r12 r13 r14 r15 edx ebx edi r11d xmm0 xmm1 xmm2 +..B4.6: # Preds ..B4.5 + # Execution count [4.50e+00] + lea 64(%r12), %r8 #155.13 + andq $15, %r8 #152.9 + testl $3, %r8d #152.9 + je ..B4.8 # Prob 50% #152.9 + # LOE rax rcx rsi r12 r13 r14 r15 edx ebx edi r8d r11d xmm0 xmm1 xmm2 +..B4.7: # Preds ..B4.6 + # Execution count [2.25e+00] + movl %ebx, %r8d #152.9 + jmp ..B4.9 # Prob 100% #152.9 + # LOE rax rcx rsi r8 r12 r13 r14 r15 edx ebx edi r11d xmm0 xmm1 xmm2 +..B4.8: # Preds ..B4.6 + # Execution count [2.25e+00] + movl %r8d, %r9d #152.9 + negl %r9d #152.9 + addl $16, %r9d #152.9 + shrl $2, %r9d #152.9 + testl %r8d, %r8d #152.9 + cmovne %r9d, %r8d #152.9 + # LOE rax rcx rsi r8 r12 r13 r14 r15 edx ebx edi r11d xmm0 xmm1 xmm2 +..B4.9: # Preds ..B4.7 ..B4.8 + # Execution count [4.50e+00] + lea 8(%r8), %r9d #152.9 + cmpl %r9d, %r11d #152.9 + jl ..B4.37 # Prob 10% #152.9 + # LOE rax rcx rsi r8 r12 r13 r14 r15 edx ebx edi r11d xmm0 xmm1 xmm2 +..B4.10: # Preds ..B4.9 + # Execution count [5.00e+00] + movl %r11d, %r10d #152.9 + xorl %r9d, %r9d #152.9 + subl %r8d, %r10d #152.9 + andl $7, %r10d #152.9 + negl %r10d #152.9 + addl %r11d, %r10d #152.9 + cmpl $1, %r8d #152.9 + jb ..B4.14 # Prob 10% #152.9 + # LOE rax rcx rsi r8 r9 r12 r13 r14 r15 edx ebx edi r10d r11d xmm0 xmm1 xmm2 +..B4.12: # Preds ..B4.10 ..B4.12 + # Execution count [2.50e+01] + movl %ebx, (%r12,%r9,4) #153.13 + movl %ebx, 32(%r12,%r9,4) #154.13 + movl %ebx, 64(%r12,%r9,4) #155.13 + incq %r9 #152.9 + cmpq %r8, %r9 #152.9 + jb ..B4.12 # Prob 82% #152.9 + # LOE rax rcx rsi r8 r9 r12 r13 r14 r15 edx ebx edi r10d r11d xmm0 xmm1 xmm2 +..B4.14: # Preds ..B4.12 ..B4.10 + # Execution count [4.50e+00] + movslq %r10d, %r9 #152.9 + # LOE rax rcx rsi r8 r9 r12 r13 r14 r15 edx ebx edi r10d r11d xmm0 xmm1 xmm2 +..B4.15: # Preds ..B4.15 ..B4.14 + # Execution count [2.50e+01] + vmovups %xmm2, (%r12,%r8,4) #153.13 + vmovups %xmm2, 32(%r12,%r8,4) #154.13 + vmovups %xmm2, 64(%r12,%r8,4) 
#155.13 + vmovups %xmm2, 16(%r12,%r8,4) #153.13 + vmovups %xmm2, 48(%r12,%r8,4) #154.13 + vmovups %xmm2, 80(%r12,%r8,4) #155.13 + addq $8, %r8 #152.9 + cmpq %r9, %r8 #152.9 + jb ..B4.15 # Prob 82% #152.9 + # LOE rax rcx rsi r8 r9 r12 r13 r14 r15 edx ebx edi r10d r11d xmm0 xmm1 xmm2 +..B4.17: # Preds ..B4.15 ..B4.37 + # Execution count [5.00e+00] + lea 1(%r10), %r8d #152.9 + cmpl %r11d, %r8d #152.9 + ja ..B4.21 # Prob 50% #152.9 + # LOE rax rcx rsi r12 r13 r14 r15 edx ebx edi r10d r11d xmm0 xmm1 xmm2 +..B4.18: # Preds ..B4.17 + # Execution count [4.50e+00] + movslq %r10d, %r9 #153.13 + negl %r10d #152.9 + addl %r11d, %r10d #152.9 + xorl %r8d, %r8d #152.9 + movslq %r11d, %r11 #152.9 + vmovdqa %xmm0, %xmm4 #152.9 + vpbroadcastd %r10d, %xmm3 #152.9 + subq %r9, %r11 #152.9 + lea (%r12,%r9,4), %r12 #153.13 + # LOE rax rcx rsi r8 r11 r12 r13 r14 r15 edx ebx edi xmm0 xmm1 xmm2 xmm3 xmm4 +..B4.19: # Preds ..B4.19 ..B4.18 + # Execution count [2.50e+01] + vpcmpgtd %xmm4, %xmm3, %k1 #152.9 + vpaddd %xmm1, %xmm4, %xmm4 #152.9 + vmovups %xmm2, (%r12,%r8,4){%k1} #153.13 + vmovups %xmm2, 32(%r12,%r8,4){%k1} #154.13 + vmovups %xmm2, 64(%r12,%r8,4){%k1} #155.13 + addq $4, %r8 #152.9 + cmpq %r11, %r8 #152.9 + jb ..B4.19 # Prob 82% #152.9 + # LOE rax rcx rsi r8 r11 r12 r13 r14 r15 edx ebx edi xmm0 xmm1 xmm2 xmm3 xmm4 +..B4.21: # Preds ..B4.19 ..B4.4 ..B4.17 + # Execution count [5.00e+00] + incl %edi #149.5 + addq $28, %rcx #149.5 + cmpl %edx, %edi #149.5 + jb ..B4.4 # Prob 82% #149.5 + # LOE rax rcx rsi r13 r14 r15 edx ebx edi xmm0 xmm1 xmm2 +..B4.23: # Preds ..B4.21 ..B4.2 + # Execution count [1.00e+00] + xorl %eax, %eax #159.16 + vzeroupper #159.16 +..___tag_value_computeForceLJ_2xnn_half.217: +# getTimeStamp() + call getTimeStamp #159.16 +..___tag_value_computeForceLJ_2xnn_half.218: + # LOE r12 r13 r14 r15 ebx xmm0 +..B4.41: # Preds ..B4.23 + # Execution count [1.00e+00] + vmovsd %xmm0, 192(%rsp) #159.16[spill] + # LOE r12 r13 r14 r15 ebx +..B4.24: # Preds ..B4.41 + # Execution count [1.00e+00] + movl $.L_2__STRING.2, %edi #163.5 +..___tag_value_computeForceLJ_2xnn_half.220: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #163.5 +..___tag_value_computeForceLJ_2xnn_half.221: + # LOE r12 r13 r14 r15 ebx +..B4.25: # Preds ..B4.24 + # Execution count [1.00e+00] + movl %ebx, %r8d #166.16 + xorl %r10d, %r10d #166.16 + cmpl $0, 20(%r13) #166.26 + jle ..B4.33 # Prob 10% #166.26 + # LOE r10 r12 r13 r14 r15 ebx r8d +..B4.26: # Preds ..B4.25 + # Execution count [9.00e-01] + movl $4369, %eax #270.9 + vmovsd .L_2il0floatpacket.2(%rip), %xmm21 #276.9 + kmovw %eax, %k1 #270.9 + vmovups .L_2il0floatpacket.7(%rip), %zmm20 #270.9 + vmovups .L_2il0floatpacket.6(%rip), %zmm16 #270.9 + vmovups 128(%rsp), %zmm17 #270.9[spill] + vmovups (%rsp), %zmm18 #270.9[spill] + vmovups 64(%rsp), %zmm19 #270.9[spill] + vpxord %zmm1, %zmm1, %zmm1 #183.30 + # LOE r10 r13 r14 r15 ebx r8d xmm21 zmm1 zmm16 zmm17 zmm18 zmm19 zmm20 k1 +..B4.27: # Preds ..B4.31 ..B4.26 + # Execution count [5.00e+00] + movl %r8d, %ecx #171.27 + movl %r8d, %edi #171.27 + sarl $1, %ecx #171.27 + andl $1, %edi #171.27 + shll $2, %edi #171.27 + movl 16(%r14), %edx #174.44 + imull %r8d, %edx #174.44 + movq 160(%r13), %r11 #172.27 + lea (%rcx,%rcx,2), %r9d #171.27 + vmovaps %zmm1, %zmm22 #183.30 + lea (%rdi,%r9,8), %r9d #171.27 + movslq %r9d, %r9 #171.27 + vmovaps %zmm22, %zmm23 #184.30 + movslq %edx, %rdx #174.19 + vmovaps %zmm23, %zmm24 #185.30 + movq 24(%r14), %rsi #175.25 + vbroadcastss 4(%r11,%r9,4), %zmm13 #177.33 + vbroadcastss 
12(%r11,%r9,4), %zmm11 #178.33 + vbroadcastss 36(%r11,%r9,4), %zmm9 #179.33 + vbroadcastss 44(%r11,%r9,4), %zmm2 #180.33 + vbroadcastss 68(%r11,%r9,4), %zmm4 #181.33 + vbroadcastss 76(%r11,%r9,4), %zmm6 #182.33 + vbroadcastss 32(%r11,%r9,4), %zmm0 #179.33 + vbroadcastss (%r11,%r9,4), %zmm12 #177.33 + vbroadcastss 8(%r11,%r9,4), %zmm10 #178.33 + vbroadcastss 40(%r11,%r9,4), %zmm3 #180.33 + vbroadcastss 64(%r11,%r9,4), %zmm5 #181.33 + vbroadcastss 72(%r11,%r9,4), %zmm7 #182.33 + movq 8(%r14), %rax #174.19 + vmovaps %zmm24, %zmm25 #186.30 + vmovaps %zmm25, %zmm26 #187.30 + movslq (%rsi,%r10,4), %rsi #175.25 + lea (%rax,%rdx,4), %rdx #174.19 + movq 176(%r13), %rcx #173.27 + movq %rcx, %rdi #173.27 + vmovaps %zmm26, %zmm14 #188.30 + xorl %eax, %eax #190.19 + vinsertf64x4 $1, %ymm13, %zmm12, %zmm13 #177.33 + vinsertf64x4 $1, %ymm11, %zmm10, %zmm12 #178.33 + vinsertf64x4 $1, %ymm9, %zmm0, %zmm11 #179.33 + vinsertf64x4 $1, %ymm2, %zmm3, %zmm10 #180.33 + vinsertf64x4 $1, %ymm4, %zmm5, %zmm9 #181.33 + vinsertf64x4 $1, %ymm6, %zmm7, %zmm6 #182.33 + testq %rsi, %rsi #190.28 + jle ..B4.31 # Prob 10% #190.28 + # LOE rax rdx rcx rsi rdi r9 r10 r11 r13 r14 r15 ebx r8d xmm21 zmm1 zmm6 zmm9 zmm10 zmm11 zmm12 zmm13 zmm14 zmm16 zmm17 zmm18 zmm19 zmm20 zmm22 zmm23 zmm24 zmm25 zmm26 k1 +..B4.28: # Preds ..B4.27 + # Execution count [4.50e+00] + vmovups .L_2il0floatpacket.8(%rip), %zmm15 #266.13 + movq %r9, 24(%rsp) #266.13[spill] + movq %r10, 16(%rsp) #266.13[spill] + movq %r14, 8(%rsp) #266.13[spill] + movq %r15, (%rsp) #266.13[spill] + # LOE rax rdx rcx rsi rdi r11 r13 ebx r8d xmm21 zmm1 zmm6 zmm9 zmm10 zmm11 zmm12 zmm13 zmm14 zmm15 zmm16 zmm17 zmm18 zmm19 zmm20 zmm22 zmm23 zmm24 zmm25 zmm26 k1 +..B4.29: # Preds ..B4.43 ..B4.28 + # Execution count [2.50e+01] + movl (%rdx,%rax,4), %r15d #191.22 + movl %ebx, %r12d #214.66 + movslq %r15d, %r15 #192.31 + incq %rax #190.39 + lea (%r15,%r15), %r9d #214.56 + cmpl %r8d, %r9d #214.66 + sete %r12b #214.66 + lea (%r15,%r15,2), %r14 #193.31 + shlq $5, %r14 #193.31 + lea (%r12,%r12,2), %r10d #229.39 + vbroadcastf64x4 64(%r14,%r11), %zmm28 #199.36 + negl %r10d #229.39 + vbroadcastf64x4 (%r14,%r11), %zmm3 #197.36 + vbroadcastf64x4 32(%r14,%r11), %zmm27 #198.36 + vsubps %zmm28, %zmm9, %zmm5 #202.35 + vsubps %zmm28, %zmm6, %zmm4 #205.35 + vsubps %zmm3, %zmm13, %zmm0 #200.35 + vsubps %zmm27, %zmm11, %zmm7 #201.35 + vsubps %zmm3, %zmm12, %zmm8 #203.35 + vsubps %zmm27, %zmm10, %zmm3 #204.35 + vmulps %zmm5, %zmm5, %zmm31 #232.80 + vmulps %zmm4, %zmm4, %zmm30 #233.80 + vfmadd231ps %zmm7, %zmm7, %zmm31 #232.57 + vfmadd231ps %zmm3, %zmm3, %zmm30 #233.57 + vfmadd231ps %zmm0, %zmm0, %zmm31 #232.34 + vfmadd231ps %zmm8, %zmm8, %zmm30 #233.34 + vrcp14ps %zmm31, %zmm29 #238.35 + vrcp14ps %zmm30, %zmm27 #239.35 + vcmpps $17, %zmm19, %zmm30, %k5 #236.67 + vcmpps $17, %zmm19, %zmm31, %k2 #235.67 + vmulps %zmm18, %zmm29, %zmm2 #241.67 + vmulps %zmm18, %zmm27, %zmm28 #242.67 + vmulps %zmm2, %zmm29, %zmm30 #241.51 + vmulps %zmm28, %zmm27, %zmm28 #242.51 + vmulps %zmm30, %zmm29, %zmm2 #241.35 + vmulps %zmm28, %zmm27, %zmm31 #242.35 + vfmsub213ps %zmm20, %zmm29, %zmm30 #244.79 + vfmsub213ps %zmm20, %zmm27, %zmm28 #245.79 + vmulps %zmm17, %zmm29, %zmm29 #244.105 + vmulps %zmm17, %zmm27, %zmm27 #245.105 + vmulps %zmm29, %zmm30, %zmm30 #244.70 + vmulps %zmm27, %zmm28, %zmm27 #245.70 + vmovups 32(%r14,%rdi), %ymm28 #266.13 + vmulps %zmm30, %zmm2, %zmm2 #244.54 + vmulps %zmm27, %zmm31, %zmm31 #245.54 + vmovups 64(%r14,%rdi), %ymm27 #266.13 + vmulps %zmm2, %zmm16, %zmm2 #244.36 + vmulps %zmm31, %zmm16, 
%zmm30 #245.36 + vmovups (%r14,%rdi), %ymm31 #266.13 + lea 1(%r15,%r15), %r11d #215.61 + movl %r12d, %r15d #229.39 + cmpl %r8d, %r11d #215.66 + movl %ebx, %r11d #215.66 + sete %r11b #215.66 + negl %r15d #229.39 + movl %r11d, %r9d #229.39 + shll $6, %r9d #229.39 + negl %r9d #229.39 + addl %r11d, %r9d #229.39 + addl %r9d, %r10d #229.39 + movl %r11d, %r9d #229.39 + shll $5, %r9d #229.39 + negl %r9d #229.39 + addl %r11d, %r9d #229.39 + shll $8, %r10d #229.39 + addl $-256, %r10d #229.39 + lea 255(%r15,%r9), %r15d #229.39 + movl %r12d, %r9d #230.39 + orl %r15d, %r10d #229.39 + movl %r11d, %r15d #230.39 + kmovw %r10d, %k0 #235.41 + movl %r11d, %r10d #230.39 + shll $4, %r9d #230.39 + shll $8, %r10d #230.39 + negl %r9d #230.39 + negl %r10d #230.39 + addl %r12d, %r9d #230.39 + addl %r11d, %r10d #230.39 + shll $7, %r15d #230.39 + addl %r10d, %r9d #230.39 + subl %r15d, %r11d #230.39 + lea (,%r12,8), %r10d #230.39 + subl %r10d, %r12d #230.39 + shll $8, %r9d #230.39 + addl $-256, %r9d #230.39 + kandw %k2, %k0, %k3 #235.41 + vmulps %zmm2, %zmm0, %zmm29{%k3}{z} #247.33 + lea 255(%r12,%r11), %r12d #230.39 + vmulps %zmm2, %zmm7, %zmm0{%k3}{z} #248.33 + vmulps %zmm2, %zmm5, %zmm5{%k3}{z} #249.33 + vaddps %zmm22, %zmm29, %zmm22 #254.20 + vaddps %zmm23, %zmm0, %zmm23 #255.20 + vaddps %zmm24, %zmm5, %zmm24 #256.20 + orl %r12d, %r9d #230.39 + kmovw %r9d, %k4 #236.41 + kandw %k5, %k4, %k6 #236.41 + vmulps %zmm30, %zmm8, %zmm7{%k6}{z} #250.33 + vmulps %zmm30, %zmm3, %zmm8{%k6}{z} #251.33 + vmulps %zmm30, %zmm4, %zmm2{%k6}{z} #252.33 + vaddps %zmm7, %zmm29, %zmm3 #266.38 + vaddps %zmm8, %zmm0, %zmm4 #266.49 + vaddps %zmm2, %zmm5, %zmm29 #266.60 + vaddps %zmm26, %zmm8, %zmm26 #258.20 + vaddps %zmm25, %zmm7, %zmm25 #257.20 + vaddps %zmm14, %zmm2, %zmm14 #259.20 + vpermd %zmm3, %zmm15, %zmm0 #266.13 + vpermd %zmm4, %zmm15, %zmm8 #266.13 + vpermd %zmm29, %zmm15, %zmm30 #266.13 + vaddps %zmm3, %zmm0, %zmm5 #266.13 + vaddps %zmm4, %zmm8, %zmm4 #266.13 + vaddps %zmm29, %zmm30, %zmm29 #266.13 + vsubps %ymm5, %ymm31, %ymm7 #266.13 + vsubps %ymm4, %ymm28, %ymm28 #266.13 + vsubps %ymm29, %ymm27, %ymm27 #266.13 + vmovups %ymm7, (%r14,%rdi) #266.13 + vmovups %ymm28, 32(%r14,%rdi) #266.13 + vmovups %ymm27, 64(%r14,%rdi) #266.13 + cmpq %rsi, %rax #190.28 + jge ..B4.30 # Prob 18% #190.28 + # LOE rax rdx rcx rsi r13 ebx r8d xmm21 zmm1 zmm6 zmm9 zmm10 zmm11 zmm12 zmm13 zmm14 zmm15 zmm16 zmm17 zmm18 zmm19 zmm20 zmm22 zmm23 zmm24 zmm25 zmm26 k1 +..B4.43: # Preds ..B4.29 + # Execution count [2.05e+01] + movq 176(%r13), %rdi #151.27 + movq 160(%r13), %r11 #172.27 + jmp ..B4.29 # Prob 100% #172.27 + # LOE rax rdx rcx rsi rdi r11 r13 ebx r8d xmm21 zmm1 zmm6 zmm9 zmm10 zmm11 zmm12 zmm13 zmm14 zmm15 zmm16 zmm17 zmm18 zmm19 zmm20 zmm22 zmm23 zmm24 zmm25 zmm26 k1 +..B4.30: # Preds ..B4.29 + # Execution count [4.50e+00] + movq 24(%rsp), %r9 #[spill] + movq 16(%rsp), %r10 #[spill] + movq 8(%rsp), %r14 #[spill] + movq (%rsp), %r15 #[spill] + # LOE rcx rsi r9 r10 r13 r14 r15 ebx r8d xmm21 zmm1 zmm14 zmm16 zmm17 zmm18 zmm19 zmm20 zmm22 zmm23 zmm24 zmm25 zmm26 k1 +..B4.31: # Preds ..B4.30 ..B4.27 + # Execution count [5.00e+00] + vshuff32x4 $136, %zmm25, %zmm22, %zmm27 #270.9 + incl %r8d #166.49 + vshuff32x4 $221, %zmm25, %zmm22, %zmm22 #270.9 + vshuff32x4 $136, %zmm26, %zmm23, %zmm3 #271.9 + vshuff32x4 $221, %zmm26, %zmm23, %zmm23 #271.9 + vshuff32x4 $136, %zmm14, %zmm24, %zmm10 #272.9 + vshuff32x4 $221, %zmm14, %zmm24, %zmm24 #272.9 + vaddps %zmm22, %zmm27, %zmm28 #270.9 + vaddps %zmm23, %zmm3, %zmm4 #271.9 + vaddps %zmm24, %zmm10, %zmm12 #272.9 + 
vxorpd %xmm24, %xmm24, %xmm24 #276.9 + vpermilps $78, %zmm28, %zmm25 #270.9 + incq %r10 #166.49 + vpermilps $78, %zmm4, %zmm26 #271.9 + vpermilps $78, %zmm12, %zmm11 #272.9 + vaddps %zmm28, %zmm25, %zmm30 #270.9 + vaddps %zmm4, %zmm26, %zmm6 #271.9 + vaddps %zmm12, %zmm11, %zmm14 #272.9 + vpermilps $177, %zmm30, %zmm29 #270.9 + vpermilps $177, %zmm6, %zmm5 #271.9 + vpermilps $177, %zmm14, %zmm13 #272.9 + vaddps %zmm30, %zmm29, %zmm31 #270.9 + vaddps %zmm6, %zmm5, %zmm7 #271.9 + vaddps %zmm14, %zmm13, %zmm15 #272.9 + vcompressps %zmm31, %zmm0{%k1}{z} #270.9 + vcompressps %zmm7, %zmm8{%k1}{z} #271.9 + vcompressps %zmm15, %zmm22{%k1}{z} #272.9 + vaddps (%rcx,%r9,4), %xmm0, %xmm2 #270.9 + vaddps 32(%rcx,%r9,4), %xmm8, %xmm9 #271.9 + vaddps 64(%rcx,%r9,4), %xmm22, %xmm23 #272.9 + vmovups %xmm2, (%rcx,%r9,4) #270.9 + vmovups %xmm9, 32(%rcx,%r9,4) #271.9 + vmovups %xmm23, 64(%rcx,%r9,4) #272.9 + addq %rsi, 8(%r15) #275.9 + vcvtsi2sd %esi, %xmm24, %xmm24 #276.9 + vmulsd %xmm24, %xmm21, %xmm0 #276.9 + vcvttsd2si %xmm0, %rax #276.9 + incq (%r15) #274.9 + addq %rax, 16(%r15) #276.9 + cmpl 20(%r13), %r8d #166.26 + jl ..B4.27 # Prob 82% #166.26 + # LOE r10 r13 r14 r15 ebx r8d xmm21 zmm1 zmm16 zmm17 zmm18 zmm19 zmm20 k1 +..B4.33: # Preds ..B4.31 ..B4.25 + # Execution count [1.00e+00] + movl $.L_2__STRING.2, %edi #279.5 + vzeroupper #279.5 +..___tag_value_computeForceLJ_2xnn_half.233: +# likwid_markerStopRegion(const char *) + call likwid_markerStopRegion #279.5 +..___tag_value_computeForceLJ_2xnn_half.234: + # LOE r12 +..B4.34: # Preds ..B4.33 + # Execution count [1.00e+00] + xorl %eax, %eax #282.16 +..___tag_value_computeForceLJ_2xnn_half.235: +# getTimeStamp() + call getTimeStamp #282.16 +..___tag_value_computeForceLJ_2xnn_half.236: + # LOE r12 xmm0 +..B4.42: # Preds ..B4.34 + # Execution count [1.00e+00] + vmovsd %xmm0, (%rsp) #282.16[spill] + # LOE r12 +..B4.35: # Preds ..B4.42 + # Execution count [1.00e+00] + movl $.L_2__STRING.5, %edi #283.5 + xorl %eax, %eax #283.5 +..___tag_value_computeForceLJ_2xnn_half.238: +# debug_printf(const char *, ...) 
+ call debug_printf #283.5 +..___tag_value_computeForceLJ_2xnn_half.239: + # LOE r12 +..B4.36: # Preds ..B4.35 + # Execution count [1.00e+00] + vmovsd (%rsp), %xmm0 #284.14[spill] + vsubsd 192(%rsp), %xmm0, %xmm0 #284.14[spill] + addq $216, %rsp #284.14 + .cfi_restore 3 + popq %rbx #284.14 + .cfi_restore 15 + popq %r15 #284.14 + .cfi_restore 14 + popq %r14 #284.14 + .cfi_restore 13 + popq %r13 #284.14 + .cfi_restore 12 + popq %r12 #284.14 + movq %rbp, %rsp #284.14 + popq %rbp #284.14 + .cfi_def_cfa 7, 8 + .cfi_restore 6 + ret #284.14 + .cfi_def_cfa 6, 16 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22 + .cfi_offset 6, -16 + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + # LOE +..B4.37: # Preds ..B4.5 ..B4.9 + # Execution count [4.50e-01]: Infreq + movl %ebx, %r10d #152.9 + jmp ..B4.17 # Prob 100% #152.9 + .align 16,0x90 + # LOE rax rcx rsi r12 r13 r14 r15 edx ebx edi r10d r11d xmm0 xmm1 xmm2 + .cfi_endproc +# mark_end; + .type computeForceLJ_2xnn_half,@function + .size computeForceLJ_2xnn_half,.-computeForceLJ_2xnn_half +..LNcomputeForceLJ_2xnn_half.3: + .data +# -- End computeForceLJ_2xnn_half + .text +.L_2__routine_start_computeForceLJ_4xn_full_4: +# -- Begin computeForceLJ_4xn_full + .text +# mark_begin; + .align 16,0x90 + .globl computeForceLJ_4xn_full +# --- computeForceLJ_4xn_full(Parameter *, Atom *, Neighbor *, Stats *) +computeForceLJ_4xn_full: +# parameter 1: %rdi +# parameter 2: %rsi +# parameter 3: %rdx +# parameter 4: %rcx +..B5.1: # Preds ..B5.0 + # Execution count [1.00e+00] + .cfi_startproc +..___tag_value_computeForceLJ_4xn_full.257: +..L258: + #622.96 + pushq %rbp #622.96 + .cfi_def_cfa_offset 16 + movq %rsp, %rbp #622.96 + .cfi_def_cfa 6, 16 + .cfi_offset 6, -16 + andq $-64, %rsp #622.96 + pushq %rbx #622.96 + subq $56, %rsp #622.96 + movl $.L_2__STRING.6, %edi #623.5 + xorl %eax, %eax #623.5 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + movq %rsi, %rbx #622.96 +..___tag_value_computeForceLJ_4xn_full.263: +# debug_printf(const char *, ...) 
+ call debug_printf #623.5 +..___tag_value_computeForceLJ_4xn_full.264: + # LOE rbx r12 r13 r14 r15 +..B5.2: # Preds ..B5.1 + # Execution count [1.00e+00] + xorl %r8d, %r8d #635.5 + xorl %esi, %esi #637.27 + xorl %ecx, %ecx #635.5 + movl 20(%rbx), %edx #635.26 + testl %edx, %edx #635.26 + jle ..B5.23 # Prob 9% #635.26 + # LOE rbx rsi r12 r13 r14 r15 edx ecx r8d +..B5.3: # Preds ..B5.2 + # Execution count [9.00e-01] + movq 176(%rbx), %rdi #637.27 + movq 192(%rbx), %rax #638.32 + vxorps %xmm2, %xmm2, %xmm2 #639.39 + vmovdqu .L_2il0floatpacket.0(%rip), %xmm1 #638.9 + vmovdqu .L_2il0floatpacket.1(%rip), %xmm0 #638.9 + movq %r14, (%rsp) #638.9[spill] + movq %r15, 8(%rsp) #638.9[spill] + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc8, 0xff, 0xff, 0xff, 0x22 + # LOE rax rbx rsi rdi r12 r13 edx ecx r8d xmm0 xmm1 xmm2 +..B5.4: # Preds ..B5.21 ..B5.3 + # Execution count [5.00e+00] + movl %r8d, %r9d #636.27 + movl %r8d, %r10d #636.27 + sarl $1, %r9d #636.27 + andl $1, %r10d #636.27 + shll $2, %r10d #636.27 + lea (%r9,%r9,2), %r11d #636.27 + lea (%r10,%r11,8), %r14d #636.27 + movl (%rsi,%rax), %r11d #638.32 + movslq %r14d, %r14 #637.27 + lea (%rdi,%r14,4), %r15 #637.27 + testl %r11d, %r11d #638.32 + jle ..B5.21 # Prob 50% #638.32 + # LOE rax rbx rsi rdi r12 r13 r15 edx ecx r8d r11d xmm0 xmm1 xmm2 +..B5.5: # Preds ..B5.4 + # Execution count [4.50e+00] + cmpl $8, %r11d #638.9 + jl ..B5.32 # Prob 10% #638.9 + # LOE rax rbx rsi rdi r12 r13 r15 edx ecx r8d r11d xmm0 xmm1 xmm2 +..B5.6: # Preds ..B5.5 + # Execution count [4.50e+00] + lea 64(%r15), %r9 #641.13 + andq $15, %r9 #638.9 + testl $3, %r9d #638.9 + je ..B5.8 # Prob 50% #638.9 + # LOE rax rbx rsi rdi r12 r13 r15 edx ecx r8d r9d r11d xmm0 xmm1 xmm2 +..B5.7: # Preds ..B5.6 + # Execution count [2.25e+00] + movl %ecx, %r9d #638.9 + jmp ..B5.9 # Prob 100% #638.9 + # LOE rax rbx rsi rdi r9 r12 r13 r15 edx ecx r8d r11d xmm0 xmm1 xmm2 +..B5.8: # Preds ..B5.6 + # Execution count [2.25e+00] + movl %r9d, %r10d #638.9 + negl %r10d #638.9 + addl $16, %r10d #638.9 + shrl $2, %r10d #638.9 + testl %r9d, %r9d #638.9 + cmovne %r10d, %r9d #638.9 + # LOE rax rbx rsi rdi r9 r12 r13 r15 edx ecx r8d r11d xmm0 xmm1 xmm2 +..B5.9: # Preds ..B5.7 ..B5.8 + # Execution count [4.50e+00] + lea 8(%r9), %r10d #638.9 + cmpl %r10d, %r11d #638.9 + jl ..B5.32 # Prob 10% #638.9 + # LOE rax rbx rsi rdi r9 r12 r13 r15 edx ecx r8d r11d xmm0 xmm1 xmm2 +..B5.10: # Preds ..B5.9 + # Execution count [5.00e+00] + movl %r11d, %r14d #638.9 + xorl %r10d, %r10d #638.9 + subl %r9d, %r14d #638.9 + andl $7, %r14d #638.9 + negl %r14d #638.9 + addl %r11d, %r14d #638.9 + cmpl $1, %r9d #638.9 + jb ..B5.14 # Prob 10% #638.9 + # LOE rax rbx rsi rdi r9 r10 r12 r13 r15 edx ecx r8d r11d r14d xmm0 xmm1 xmm2 +..B5.12: # Preds ..B5.10 ..B5.12 + # Execution count [2.50e+01] + movl %ecx, (%r15,%r10,4) #639.13 + movl %ecx, 32(%r15,%r10,4) #640.13 + movl %ecx, 64(%r15,%r10,4) #641.13 + incq %r10 #638.9 + cmpq %r9, %r10 #638.9 + jb ..B5.12 # Prob 82% #638.9 + # LOE rax rbx rsi rdi r9 r10 r12 r13 r15 edx ecx r8d r11d r14d xmm0 xmm1 xmm2 +..B5.14: # Preds ..B5.12 ..B5.10 + # Execution count [4.50e+00] + movslq %r14d, %r10 #638.9 + # LOE rax rbx rsi rdi r9 r10 r12 r13 r15 edx ecx r8d r11d r14d xmm0 xmm1 xmm2 +..B5.15: # Preds ..B5.15 ..B5.14 + # Execution count [2.50e+01] + vmovups %xmm2, (%r15,%r9,4) #639.13 + vmovups %xmm2, 32(%r15,%r9,4) #640.13 + vmovups %xmm2, 
64(%r15,%r9,4) #641.13 + vmovups %xmm2, 16(%r15,%r9,4) #639.13 + vmovups %xmm2, 48(%r15,%r9,4) #640.13 + vmovups %xmm2, 80(%r15,%r9,4) #641.13 + addq $8, %r9 #638.9 + cmpq %r10, %r9 #638.9 + jb ..B5.15 # Prob 82% #638.9 + # LOE rax rbx rsi rdi r9 r10 r12 r13 r15 edx ecx r8d r11d r14d xmm0 xmm1 xmm2 +..B5.17: # Preds ..B5.15 ..B5.32 + # Execution count [5.00e+00] + lea 1(%r14), %r9d #638.9 + cmpl %r11d, %r9d #638.9 + ja ..B5.21 # Prob 50% #638.9 + # LOE rax rbx rsi rdi r12 r13 r15 edx ecx r8d r11d r14d xmm0 xmm1 xmm2 +..B5.18: # Preds ..B5.17 + # Execution count [4.50e+00] + movslq %r14d, %r10 #639.13 + negl %r14d #638.9 + addl %r11d, %r14d #638.9 + xorl %r9d, %r9d #638.9 + movslq %r11d, %r11 #638.9 + vmovdqa %xmm0, %xmm4 #638.9 + vpbroadcastd %r14d, %xmm3 #638.9 + subq %r10, %r11 #638.9 + lea (%r15,%r10,4), %r15 #639.13 + # LOE rax rbx rsi rdi r9 r11 r12 r13 r15 edx ecx r8d xmm0 xmm1 xmm2 xmm3 xmm4 +..B5.19: # Preds ..B5.19 ..B5.18 + # Execution count [2.50e+01] + vpcmpgtd %xmm4, %xmm3, %k1 #638.9 + vpaddd %xmm1, %xmm4, %xmm4 #638.9 + vmovups %xmm2, (%r15,%r9,4){%k1} #639.13 + vmovups %xmm2, 32(%r15,%r9,4){%k1} #640.13 + vmovups %xmm2, 64(%r15,%r9,4){%k1} #641.13 + addq $4, %r9 #638.9 + cmpq %r11, %r9 #638.9 + jb ..B5.19 # Prob 82% #638.9 + # LOE rax rbx rsi rdi r9 r11 r12 r13 r15 edx ecx r8d xmm0 xmm1 xmm2 xmm3 xmm4 +..B5.21: # Preds ..B5.19 ..B5.4 ..B5.17 + # Execution count [5.00e+00] + incl %r8d #635.5 + addq $28, %rsi #635.5 + cmpl %edx, %r8d #635.5 + jb ..B5.4 # Prob 82% #635.5 + # LOE rax rbx rsi rdi r12 r13 edx ecx r8d xmm0 xmm1 xmm2 +..B5.22: # Preds ..B5.21 + # Execution count [9.00e-01] + movq (%rsp), %r14 #[spill] + .cfi_restore 14 + movq 8(%rsp), %r15 #[spill] + .cfi_restore 15 + # LOE rbx r12 r13 r14 r15 +..B5.23: # Preds ..B5.2 ..B5.22 + # Execution count [1.00e+00] + xorl %eax, %eax #645.16 +..___tag_value_computeForceLJ_4xn_full.272: +# getTimeStamp() + call getTimeStamp #645.16 +..___tag_value_computeForceLJ_4xn_full.273: + # LOE rbx r12 r13 r14 r15 xmm0 +..B5.36: # Preds ..B5.23 + # Execution count [1.00e+00] + vmovsd %xmm0, 16(%rsp) #645.16[spill] + # LOE rbx r12 r13 r14 r15 +..B5.24: # Preds ..B5.36 + # Execution count [1.00e+00] + movl $.L_2__STRING.2, %edi #649.5 +..___tag_value_computeForceLJ_4xn_full.275: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #649.5 +..___tag_value_computeForceLJ_4xn_full.276: + # LOE rbx r12 r13 r14 r15 +..B5.25: # Preds ..B5.24 + # Execution count [1.00e+00] + cmpl $0, 20(%rbx) #652.26 + jle ..B5.28 # Prob 10% #652.26 + # LOE r12 r13 r14 r15 +..B5.26: # Preds ..B5.25 + # Execution count [5.00e+00] + movl $il0_peep_printf_format_0, %edi #769.9 + movq stderr(%rip), %rsi #769.9 + call fputs #769.9 + # LOE +..B5.27: # Preds ..B5.26 + # Execution count [5.00e+00] + movl $-1, %edi #769.9 +# exit(int) + call exit #769.9 + # LOE +..B5.28: # Preds ..B5.25 + # Execution count [1.00e+00]: Infreq + movl $.L_2__STRING.2, %edi #779.5 +..___tag_value_computeForceLJ_4xn_full.277: +# likwid_markerStopRegion(const char *) + call likwid_markerStopRegion #779.5 +..___tag_value_computeForceLJ_4xn_full.278: + # LOE r12 r13 r14 r15 +..B5.29: # Preds ..B5.28 + # Execution count [1.00e+00]: Infreq + xorl %eax, %eax #782.16 +..___tag_value_computeForceLJ_4xn_full.279: +# getTimeStamp() + call getTimeStamp #782.16 +..___tag_value_computeForceLJ_4xn_full.280: + # LOE r12 r13 r14 r15 xmm0 +..B5.37: # Preds ..B5.29 + # Execution count [1.00e+00]: Infreq + vmovsd %xmm0, 24(%rsp) #782.16[spill] + # LOE r12 r13 r14 r15 +..B5.30: # Preds 
..B5.37 + # Execution count [1.00e+00]: Infreq + movl $.L_2__STRING.7, %edi #783.5 + xorl %eax, %eax #783.5 +..___tag_value_computeForceLJ_4xn_full.282: +# debug_printf(const char *, ...) + call debug_printf #783.5 +..___tag_value_computeForceLJ_4xn_full.283: + # LOE r12 r13 r14 r15 +..B5.31: # Preds ..B5.30 + # Execution count [1.00e+00]: Infreq + vmovsd 24(%rsp), %xmm0 #784.14[spill] + vsubsd 16(%rsp), %xmm0, %xmm0 #784.14[spill] + addq $56, %rsp #784.14 + .cfi_restore 3 + popq %rbx #784.14 + movq %rbp, %rsp #784.14 + popq %rbp #784.14 + .cfi_def_cfa 7, 8 + .cfi_restore 6 + ret #784.14 + .cfi_def_cfa 6, 16 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_offset 6, -16 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc8, 0xff, 0xff, 0xff, 0x22 + # LOE +..B5.32: # Preds ..B5.5 ..B5.9 + # Execution count [4.50e-01]: Infreq + movl %ecx, %r14d #638.9 + jmp ..B5.17 # Prob 100% #638.9 + .align 16,0x90 + # LOE rax rbx rsi rdi r12 r13 r15 edx ecx r8d r11d r14d xmm0 xmm1 xmm2 + .cfi_endproc +# mark_end; + .type computeForceLJ_4xn_full,@function + .size computeForceLJ_4xn_full,.-computeForceLJ_4xn_full +..LNcomputeForceLJ_4xn_full.4: + .section .rodata.str1.32, "aMS",@progbits,1 + .align 32 + .align 32 +il0_peep_printf_format_0: + .long 1684892019 + .long 1918855263 + .long 1668637797 + .long 1970495333 + .long 975775853 + .long 1818313504 + .long 543450476 + .long 1752459639 + .long 1482047776 + .long 540160309 + .long 1920233065 + .long 1769172585 + .long 1629516643 + .long 1931502702 + .long 1818717801 + .long 1919954277 + .long 1936286565 + .long 544108393 + .long 1667852407 + .long 1936269416 + .long 1953459744 + .long 1818326560 + .long 169960553 + .byte 0 + .data +# -- End computeForceLJ_4xn_full + .text +.L_2__routine_start_computeForceLJ_4xn_5: +# -- Begin computeForceLJ_4xn + .text +# mark_begin; + .align 16,0x90 + .globl computeForceLJ_4xn +# --- computeForceLJ_4xn(Parameter *, Atom *, Neighbor *, Stats *) +computeForceLJ_4xn: +# parameter 1: %rdi +# parameter 2: %rsi +# parameter 3: %rdx +# parameter 4: %rcx +..B6.1: # Preds ..B6.0 + # Execution count [1.00e+00] + .cfi_startproc +..___tag_value_computeForceLJ_4xn.295: +..L296: + #787.91 + pushq %rbp #787.91 + .cfi_def_cfa_offset 16 + movq %rsp, %rbp #787.91 + .cfi_def_cfa 6, 16 + .cfi_offset 6, -16 + andq $-64, %rsp #787.91 + pushq %rbx #787.91 + subq $56, %rsp #787.91 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + movq %rsi, %rbx #787.91 + cmpl $0, 32(%rdx) #788.8 + je ..B6.4 # Prob 50% #788.8 + # LOE rdx rcx rbx rdi r12 r13 r14 r15 +..B6.2: # Preds ..B6.1 + # Execution count [5.00e-01] + movq %rbx, %rsi #789.16 + addq $56, %rsp #789.16 + .cfi_restore 3 + popq %rbx #789.16 + movq %rbp, %rsp #789.16 + popq %rbp #789.16 + .cfi_def_cfa 7, 8 + .cfi_restore 6 +# computeForceLJ_4xn_half(Parameter *, Atom *, Neighbor *, Stats *) + jmp computeForceLJ_4xn_half #789.16 + .cfi_def_cfa 6, 16 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_offset 6, -16 + # LOE +..B6.4: # Preds ..B6.1 + # Execution count [5.00e-01] + movl $.L_2__STRING.6, %edi #792.12 + xorl %eax, %eax #792.12 +..___tag_value_computeForceLJ_4xn.307: +# debug_printf(const char *, ...) 
+ call debug_printf #792.12 +..___tag_value_computeForceLJ_4xn.308: + # LOE rbx r12 r13 r14 r15 +..B6.5: # Preds ..B6.4 + # Execution count [5.00e-01] + xorl %r9d, %r9d #792.12 + xorl %edi, %edi #792.12 + movl 20(%rbx), %edx #792.12 + xorl %ecx, %ecx #792.12 + testl %edx, %edx #792.12 + jle ..B6.26 # Prob 9% #792.12 + # LOE rbx rdi r12 r13 r14 r15 edx ecx r9d +..B6.6: # Preds ..B6.5 + # Execution count [4.50e-01] + movq 176(%rbx), %r8 #792.12 + movq 192(%rbx), %rax #792.12 + vxorps %xmm2, %xmm2, %xmm2 #792.12 + vmovdqu .L_2il0floatpacket.0(%rip), %xmm1 #792.12 + vmovdqu .L_2il0floatpacket.1(%rip), %xmm0 #792.12 + movq %r14, (%rsp) #792.12[spill] + movq %r15, 8(%rsp) #792.12[spill] + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc8, 0xff, 0xff, 0xff, 0x22 + # LOE rax rbx rdi r8 r12 r13 edx ecx r9d xmm0 xmm1 xmm2 +..B6.7: # Preds ..B6.24 ..B6.6 + # Execution count [2.50e+00] + movl %r9d, %r10d #792.12 + movl %r9d, %r11d #792.12 + sarl $1, %r10d #792.12 + andl $1, %r11d #792.12 + shll $2, %r11d #792.12 + lea (%r10,%r10,2), %r14d #792.12 + lea (%r11,%r14,8), %r15d #792.12 + movl (%rdi,%rax), %r14d #792.12 + movslq %r15d, %r15 #792.12 + lea (%r8,%r15,4), %rsi #792.12 + testl %r14d, %r14d #792.12 + jle ..B6.24 # Prob 50% #792.12 + # LOE rax rbx rsi rdi r8 r12 r13 edx ecx r9d r14d xmm0 xmm1 xmm2 +..B6.8: # Preds ..B6.7 + # Execution count [2.25e+00] + cmpl $8, %r14d #792.12 + jl ..B6.35 # Prob 10% #792.12 + # LOE rax rbx rsi rdi r8 r12 r13 edx ecx r9d r14d xmm0 xmm1 xmm2 +..B6.9: # Preds ..B6.8 + # Execution count [2.25e+00] + lea 64(%rsi), %r10 #792.12 + andq $15, %r10 #792.12 + testl $3, %r10d #792.12 + je ..B6.11 # Prob 50% #792.12 + # LOE rax rbx rsi rdi r8 r12 r13 edx ecx r9d r10d r14d xmm0 xmm1 xmm2 +..B6.10: # Preds ..B6.9 + # Execution count [1.12e+00] + movl %ecx, %r10d #792.12 + jmp ..B6.12 # Prob 100% #792.12 + # LOE rax rbx rsi rdi r8 r10 r12 r13 edx ecx r9d r14d xmm0 xmm1 xmm2 +..B6.11: # Preds ..B6.9 + # Execution count [1.12e+00] + movl %r10d, %r11d #792.12 + negl %r11d #792.12 + addl $16, %r11d #792.12 + shrl $2, %r11d #792.12 + testl %r10d, %r10d #792.12 + cmovne %r11d, %r10d #792.12 + # LOE rax rbx rsi rdi r8 r10 r12 r13 edx ecx r9d r14d xmm0 xmm1 xmm2 +..B6.12: # Preds ..B6.10 ..B6.11 + # Execution count [2.25e+00] + lea 8(%r10), %r11d #792.12 + cmpl %r11d, %r14d #792.12 + jl ..B6.35 # Prob 10% #792.12 + # LOE rax rbx rsi rdi r8 r10 r12 r13 edx ecx r9d r14d xmm0 xmm1 xmm2 +..B6.13: # Preds ..B6.12 + # Execution count [2.50e+00] + movl %r14d, %r15d #792.12 + xorl %r11d, %r11d #792.12 + subl %r10d, %r15d #792.12 + andl $7, %r15d #792.12 + negl %r15d #792.12 + addl %r14d, %r15d #792.12 + cmpl $1, %r10d #792.12 + jb ..B6.17 # Prob 10% #792.12 + # LOE rax rbx rsi rdi r8 r10 r11 r12 r13 edx ecx r9d r14d r15d xmm0 xmm1 xmm2 +..B6.15: # Preds ..B6.13 ..B6.15 + # Execution count [1.25e+01] + movl %ecx, (%rsi,%r11,4) #792.12 + movl %ecx, 32(%rsi,%r11,4) #792.12 + movl %ecx, 64(%rsi,%r11,4) #792.12 + incq %r11 #792.12 + cmpq %r10, %r11 #792.12 + jb ..B6.15 # Prob 82% #792.12 + # LOE rax rbx rsi rdi r8 r10 r11 r12 r13 edx ecx r9d r14d r15d xmm0 xmm1 xmm2 +..B6.17: # Preds ..B6.15 ..B6.13 + # Execution count [2.25e+00] + movslq %r15d, %r11 #792.12 + # LOE rax rbx rsi rdi r8 r10 r11 r12 r13 edx ecx r9d r14d r15d xmm0 xmm1 xmm2 +..B6.18: # Preds ..B6.18 ..B6.17 + # Execution count [1.25e+01] + vmovups %xmm2, (%rsi,%r10,4) #792.12 + vmovups 
%xmm2, 32(%rsi,%r10,4) #792.12 + vmovups %xmm2, 64(%rsi,%r10,4) #792.12 + vmovups %xmm2, 16(%rsi,%r10,4) #792.12 + vmovups %xmm2, 48(%rsi,%r10,4) #792.12 + vmovups %xmm2, 80(%rsi,%r10,4) #792.12 + addq $8, %r10 #792.12 + cmpq %r11, %r10 #792.12 + jb ..B6.18 # Prob 82% #792.12 + # LOE rax rbx rsi rdi r8 r10 r11 r12 r13 edx ecx r9d r14d r15d xmm0 xmm1 xmm2 +..B6.20: # Preds ..B6.18 ..B6.35 + # Execution count [2.50e+00] + lea 1(%r15), %r10d #792.12 + cmpl %r14d, %r10d #792.12 + ja ..B6.24 # Prob 50% #792.12 + # LOE rax rbx rsi rdi r8 r12 r13 edx ecx r9d r14d r15d xmm0 xmm1 xmm2 +..B6.21: # Preds ..B6.20 + # Execution count [2.25e+00] + movslq %r15d, %r11 #792.12 + negl %r15d #792.12 + addl %r14d, %r15d #792.12 + xorl %r10d, %r10d #792.12 + movslq %r14d, %r14 #792.12 + vmovdqa %xmm0, %xmm4 #792.12 + vpbroadcastd %r15d, %xmm3 #792.12 + subq %r11, %r14 #792.12 + lea (%rsi,%r11,4), %rsi #792.12 + # LOE rax rbx rsi rdi r8 r10 r12 r13 r14 edx ecx r9d xmm0 xmm1 xmm2 xmm3 xmm4 +..B6.22: # Preds ..B6.22 ..B6.21 + # Execution count [1.25e+01] + vpcmpgtd %xmm4, %xmm3, %k1 #792.12 + vpaddd %xmm1, %xmm4, %xmm4 #792.12 + vmovups %xmm2, (%rsi,%r10,4){%k1} #792.12 + vmovups %xmm2, 32(%rsi,%r10,4){%k1} #792.12 + vmovups %xmm2, 64(%rsi,%r10,4){%k1} #792.12 + addq $4, %r10 #792.12 + cmpq %r14, %r10 #792.12 + jb ..B6.22 # Prob 82% #792.12 + # LOE rax rbx rsi rdi r8 r10 r12 r13 r14 edx ecx r9d xmm0 xmm1 xmm2 xmm3 xmm4 +..B6.24: # Preds ..B6.22 ..B6.7 ..B6.20 + # Execution count [2.50e+00] + incl %r9d #792.12 + addq $28, %rdi #792.12 + cmpl %edx, %r9d #792.12 + jb ..B6.7 # Prob 82% #792.12 + # LOE rax rbx rdi r8 r12 r13 edx ecx r9d xmm0 xmm1 xmm2 +..B6.25: # Preds ..B6.24 + # Execution count [4.50e-01] + movq (%rsp), %r14 #[spill] + .cfi_restore 14 + movq 8(%rsp), %r15 #[spill] + .cfi_restore 15 + # LOE rbx r12 r13 r14 r15 +..B6.26: # Preds ..B6.5 ..B6.25 + # Execution count [5.00e-01] + xorl %eax, %eax #792.12 +..___tag_value_computeForceLJ_4xn.316: +# getTimeStamp() + call getTimeStamp #792.12 +..___tag_value_computeForceLJ_4xn.317: + # LOE rbx r12 r13 r14 r15 xmm0 +..B6.40: # Preds ..B6.26 + # Execution count [5.00e-01] + vmovsd %xmm0, 16(%rsp) #792.12[spill] + # LOE rbx r12 r13 r14 r15 +..B6.27: # Preds ..B6.40 + # Execution count [5.00e-01] + movl $.L_2__STRING.2, %edi #792.12 +..___tag_value_computeForceLJ_4xn.319: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #792.12 +..___tag_value_computeForceLJ_4xn.320: + # LOE rbx r12 r13 r14 r15 +..B6.28: # Preds ..B6.27 + # Execution count [5.00e-01] + cmpl $0, 20(%rbx) #792.12 + jle ..B6.31 # Prob 10% #792.12 + # LOE r12 r13 r14 r15 +..B6.29: # Preds ..B6.28 + # Execution count [2.50e+00] + movl $il0_peep_printf_format_1, %edi #792.12 + movq stderr(%rip), %rsi #792.12 + call fputs #792.12 + # LOE +..B6.30: # Preds ..B6.29 + # Execution count [2.50e+00] + movl $-1, %edi #792.12 +# exit(int) + call exit #792.12 + # LOE +..B6.31: # Preds ..B6.28 + # Execution count [5.00e-01]: Infreq + movl $.L_2__STRING.2, %edi #792.12 +..___tag_value_computeForceLJ_4xn.321: +# likwid_markerStopRegion(const char *) + call likwid_markerStopRegion #792.12 +..___tag_value_computeForceLJ_4xn.322: + # LOE r12 r13 r14 r15 +..B6.32: # Preds ..B6.31 + # Execution count [5.00e-01]: Infreq + xorl %eax, %eax #792.12 +..___tag_value_computeForceLJ_4xn.323: +# getTimeStamp() + call getTimeStamp #792.12 +..___tag_value_computeForceLJ_4xn.324: + # LOE r12 r13 r14 r15 xmm0 +..B6.41: # Preds ..B6.32 + # Execution count [5.00e-01]: Infreq + vmovsd %xmm0, 24(%rsp) 
#792.12[spill] + # LOE r12 r13 r14 r15 +..B6.33: # Preds ..B6.41 + # Execution count [5.00e-01]: Infreq + movl $.L_2__STRING.7, %edi #792.12 + xorl %eax, %eax #792.12 +..___tag_value_computeForceLJ_4xn.326: +# debug_printf(const char *, ...) + call debug_printf #792.12 +..___tag_value_computeForceLJ_4xn.327: + # LOE r12 r13 r14 r15 +..B6.34: # Preds ..B6.33 + # Execution count [5.00e-01]: Infreq + vmovsd 24(%rsp), %xmm0 #792.12[spill] + vsubsd 16(%rsp), %xmm0, %xmm0 #792.12[spill] + addq $56, %rsp #792.12 + .cfi_restore 3 + popq %rbx #792.12 + movq %rbp, %rsp #792.12 + popq %rbp #792.12 + .cfi_def_cfa 7, 8 + .cfi_restore 6 + ret #792.12 + .cfi_def_cfa 6, 16 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_offset 6, -16 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xc8, 0xff, 0xff, 0xff, 0x22 + # LOE +..B6.35: # Preds ..B6.8 ..B6.12 + # Execution count [2.25e-01]: Infreq + movl %ecx, %r15d #792.12 + jmp ..B6.20 # Prob 100% #792.12 + .align 16,0x90 + # LOE rax rbx rsi rdi r8 r12 r13 edx ecx r9d r14d r15d xmm0 xmm1 xmm2 + .cfi_endproc +# mark_end; + .type computeForceLJ_4xn,@function + .size computeForceLJ_4xn,.-computeForceLJ_4xn +..LNcomputeForceLJ_4xn.5: + .section .rodata.str1.32, "aMS",@progbits,1 + .space 3, 0x00 # pad + .align 32 +il0_peep_printf_format_1: + .long 1684892019 + .long 1918855263 + .long 1668637797 + .long 1970495333 + .long 975775853 + .long 1818313504 + .long 543450476 + .long 1752459639 + .long 1482047776 + .long 540160309 + .long 1920233065 + .long 1769172585 + .long 1629516643 + .long 1931502702 + .long 1818717801 + .long 1919954277 + .long 1936286565 + .long 544108393 + .long 1667852407 + .long 1936269416 + .long 1953459744 + .long 1818326560 + .long 169960553 + .byte 0 + .data +# -- End computeForceLJ_4xn + .text +.L_2__routine_start_computeForceLJ_4xn_half_6: +# -- Begin computeForceLJ_4xn_half + .text +# mark_begin; + .align 16,0x90 + .globl computeForceLJ_4xn_half +# --- computeForceLJ_4xn_half(Parameter *, Atom *, Neighbor *, Stats *) +computeForceLJ_4xn_half: +# parameter 1: %rdi +# parameter 2: %rsi +# parameter 3: %rdx +# parameter 4: %rcx +..B7.1: # Preds ..B7.0 + # Execution count [1.00e+00] + .cfi_startproc +..___tag_value_computeForceLJ_4xn_half.339: +..L340: + #432.96 + pushq %rbp #432.96 + .cfi_def_cfa_offset 16 + movq %rsp, %rbp #432.96 + .cfi_def_cfa 6, 16 + .cfi_offset 6, -16 + andq $-64, %rsp #432.96 + pushq %r13 #432.96 + pushq %r14 #432.96 + pushq %r15 #432.96 + subq $424, %rsp #432.96 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + movq %rdi, %r13 #432.96 + movl $.L_2__STRING.6, %edi #433.5 + xorl %eax, %eax #433.5 + movq %rdx, %r15 #432.96 + movq %rsi, %r14 #432.96 +..___tag_value_computeForceLJ_4xn_half.347: +# debug_printf(const char *, ...) 
+ call debug_printf #433.5 +..___tag_value_computeForceLJ_4xn_half.348: + # LOE rbx r12 r13 r14 r15 +..B7.2: # Preds ..B7.1 + # Execution count [1.00e+00] + vmovss 108(%r13), %xmm0 #436.27 + xorl %edi, %edi #445.5 + vmulss %xmm0, %xmm0, %xmm1 #439.36 + xorl %ecx, %ecx #447.27 + vbroadcastss 48(%r13), %zmm3 #440.32 + vbroadcastss 40(%r13), %zmm4 #441.29 + vbroadcastss %xmm1, %zmm2 #439.36 + vmovups %zmm3, 64(%rsp) #440.32[spill] + vmovups %zmm4, (%rsp) #441.29[spill] + vmovups %zmm2, 128(%rsp) #439.36[spill] + movl 20(%r14), %edx #445.26 + xorl %r13d, %r13d #445.5 + testl %edx, %edx #445.26 + jle ..B7.23 # Prob 9% #445.26 + # LOE rcx rbx r12 r14 r15 edx edi r13d +..B7.3: # Preds ..B7.2 + # Execution count [9.00e-01] + movq 176(%r14), %rsi #447.27 + movq 192(%r14), %rax #448.32 + vxorps %xmm2, %xmm2, %xmm2 #449.39 + vmovdqu .L_2il0floatpacket.0(%rip), %xmm1 #448.9 + vmovdqu .L_2il0floatpacket.1(%rip), %xmm0 #448.9 + movq %rbx, 192(%rsp) #448.9[spill] + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x00, 0xff, 0xff, 0xff, 0x22 + # LOE rax rcx rsi r12 r14 r15 edx edi r13d xmm0 xmm1 xmm2 +..B7.4: # Preds ..B7.21 ..B7.3 + # Execution count [5.00e+00] + movl %edi, %ebx #446.27 + movl %edi, %r8d #446.27 + sarl $1, %ebx #446.27 + andl $1, %r8d #446.27 + shll $2, %r8d #446.27 + lea (%rbx,%rbx,2), %r9d #446.27 + lea (%r8,%r9,8), %r10d #446.27 + movslq %r10d, %r10 #447.27 + lea (%rsi,%r10,4), %r11 #447.27 + movl (%rcx,%rax), %r10d #448.32 + testl %r10d, %r10d #448.32 + jle ..B7.21 # Prob 50% #448.32 + # LOE rax rcx rsi r11 r12 r14 r15 edx edi r10d r13d xmm0 xmm1 xmm2 +..B7.5: # Preds ..B7.4 + # Execution count [4.50e+00] + cmpl $8, %r10d #448.9 + jl ..B7.36 # Prob 10% #448.9 + # LOE rax rcx rsi r11 r12 r14 r15 edx edi r10d r13d xmm0 xmm1 xmm2 +..B7.6: # Preds ..B7.5 + # Execution count [4.50e+00] + lea 64(%r11), %rbx #451.13 + andq $15, %rbx #448.9 + testb $3, %bl #448.9 + je ..B7.8 # Prob 50% #448.9 + # LOE rax rcx rsi r11 r12 r14 r15 edx ebx edi r10d r13d xmm0 xmm1 xmm2 +..B7.7: # Preds ..B7.6 + # Execution count [2.25e+00] + movl %r13d, %ebx #448.9 + jmp ..B7.9 # Prob 100% #448.9 + # LOE rax rcx rbx rsi r11 r12 r14 r15 edx edi r10d r13d xmm0 xmm1 xmm2 +..B7.8: # Preds ..B7.6 + # Execution count [2.25e+00] + movl %ebx, %r8d #448.9 + negl %r8d #448.9 + addl $16, %r8d #448.9 + shrl $2, %r8d #448.9 + testl %ebx, %ebx #448.9 + cmovne %r8d, %ebx #448.9 + # LOE rax rcx rbx rsi r11 r12 r14 r15 edx edi r10d r13d xmm0 xmm1 xmm2 +..B7.9: # Preds ..B7.7 ..B7.8 + # Execution count [4.50e+00] + lea 8(%rbx), %r8d #448.9 + cmpl %r8d, %r10d #448.9 + jl ..B7.36 # Prob 10% #448.9 + # LOE rax rcx rbx rsi r11 r12 r14 r15 edx edi r10d r13d xmm0 xmm1 xmm2 +..B7.10: # Preds ..B7.9 + # Execution count [5.00e+00] + movl %r10d, %r9d #448.9 + xorl %r8d, %r8d #448.9 + subl %ebx, %r9d #448.9 + andl $7, %r9d #448.9 + negl %r9d #448.9 + addl %r10d, %r9d #448.9 + cmpl $1, %ebx #448.9 + jb ..B7.14 # Prob 10% #448.9 + # LOE rax rcx rbx rsi r8 r11 r12 r14 r15 edx edi r9d r10d r13d xmm0 xmm1 xmm2 +..B7.12: # Preds ..B7.10 ..B7.12 + # Execution count [2.50e+01] + movl %r13d, (%r11,%r8,4) #449.13 + movl %r13d, 32(%r11,%r8,4) #450.13 + movl %r13d, 64(%r11,%r8,4) #451.13 + incq %r8 #448.9 + cmpq %rbx, %r8 #448.9 + jb ..B7.12 # Prob 82% #448.9 + # LOE rax rcx rbx rsi r8 r11 r12 r14 r15 edx edi r9d r10d r13d xmm0 xmm1 xmm2 +..B7.14: # Preds ..B7.12 ..B7.10 + # Execution count [4.50e+00] + movslq %r9d, %r8 #448.9 + # LOE rax rcx rbx rsi r8 r11 r12 r14 r15 edx edi r9d r10d r13d xmm0 xmm1 xmm2 +..B7.15: # 
Preds ..B7.15 ..B7.14 + # Execution count [2.50e+01] + vmovups %xmm2, (%r11,%rbx,4) #449.13 + vmovups %xmm2, 32(%r11,%rbx,4) #450.13 + vmovups %xmm2, 64(%r11,%rbx,4) #451.13 + vmovups %xmm2, 16(%r11,%rbx,4) #449.13 + vmovups %xmm2, 48(%r11,%rbx,4) #450.13 + vmovups %xmm2, 80(%r11,%rbx,4) #451.13 + addq $8, %rbx #448.9 + cmpq %r8, %rbx #448.9 + jb ..B7.15 # Prob 82% #448.9 + # LOE rax rcx rbx rsi r8 r11 r12 r14 r15 edx edi r9d r10d r13d xmm0 xmm1 xmm2 +..B7.17: # Preds ..B7.15 ..B7.36 + # Execution count [5.00e+00] + lea 1(%r9), %ebx #448.9 + cmpl %r10d, %ebx #448.9 + ja ..B7.21 # Prob 50% #448.9 + # LOE rax rcx rsi r11 r12 r14 r15 edx edi r9d r10d r13d xmm0 xmm1 xmm2 +..B7.18: # Preds ..B7.17 + # Execution count [4.50e+00] + movslq %r9d, %r8 #449.13 + negl %r9d #448.9 + addl %r10d, %r9d #448.9 + xorl %ebx, %ebx #448.9 + movslq %r10d, %r10 #448.9 + vmovdqa %xmm0, %xmm4 #448.9 + vpbroadcastd %r9d, %xmm3 #448.9 + subq %r8, %r10 #448.9 + lea (%r11,%r8,4), %r11 #449.13 + # LOE rax rcx rbx rsi r10 r11 r12 r14 r15 edx edi r13d xmm0 xmm1 xmm2 xmm3 xmm4 +..B7.19: # Preds ..B7.19 ..B7.18 + # Execution count [2.50e+01] + vpcmpgtd %xmm4, %xmm3, %k1 #448.9 + vpaddd %xmm1, %xmm4, %xmm4 #448.9 + vmovups %xmm2, (%r11,%rbx,4){%k1} #449.13 + vmovups %xmm2, 32(%r11,%rbx,4){%k1} #450.13 + vmovups %xmm2, 64(%r11,%rbx,4){%k1} #451.13 + addq $4, %rbx #448.9 + cmpq %r10, %rbx #448.9 + jb ..B7.19 # Prob 82% #448.9 + # LOE rax rcx rbx rsi r10 r11 r12 r14 r15 edx edi r13d xmm0 xmm1 xmm2 xmm3 xmm4 +..B7.21: # Preds ..B7.19 ..B7.4 ..B7.17 + # Execution count [5.00e+00] + incl %edi #445.5 + addq $28, %rcx #445.5 + cmpl %edx, %edi #445.5 + jb ..B7.4 # Prob 82% #445.5 + # LOE rax rcx rsi r12 r14 r15 edx edi r13d xmm0 xmm1 xmm2 +..B7.22: # Preds ..B7.21 + # Execution count [9.00e-01] + movq 192(%rsp), %rbx #[spill] + .cfi_restore 3 + # LOE rbx r12 r14 r15 r13d +..B7.23: # Preds ..B7.2 ..B7.22 + # Execution count [1.00e+00] + xorl %eax, %eax #455.16 + vzeroupper #455.16 +..___tag_value_computeForceLJ_4xn_half.355: +# getTimeStamp() + call getTimeStamp #455.16 +..___tag_value_computeForceLJ_4xn_half.356: + # LOE rbx r12 r14 r15 r13d xmm0 +..B7.40: # Preds ..B7.23 + # Execution count [1.00e+00] + vmovsd %xmm0, 200(%rsp) #455.16[spill] + # LOE rbx r12 r14 r15 r13d +..B7.24: # Preds ..B7.40 + # Execution count [1.00e+00] + movl $.L_2__STRING.2, %edi #459.5 +..___tag_value_computeForceLJ_4xn_half.358: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #459.5 +..___tag_value_computeForceLJ_4xn_half.359: + # LOE rbx r12 r14 r15 r13d +..B7.25: # Preds ..B7.24 + # Execution count [1.00e+00] + cmpl $0, 20(%r14) #462.26 + jle ..B7.32 # Prob 10% #462.26 + # LOE rbx r12 r14 r15 r13d +..B7.26: # Preds ..B7.25 + # Execution count [5.00e+00] + movq 24(%r15), %rax #471.25 + movq 160(%r14), %rbx #468.27 + movq 8(%r15), %rcx #470.19 + movslq (%rax), %rdx #471.25 + xorl %eax, %eax #498.9 + vbroadcastss (%rbx), %zmm19 #473.33 + vbroadcastss 4(%rbx), %zmm18 #474.33 + vbroadcastss 8(%rbx), %zmm3 #475.33 + vbroadcastss 12(%rbx), %zmm2 #476.33 + vbroadcastss 32(%rbx), %zmm17 #477.33 + vbroadcastss 36(%rbx), %zmm16 #478.33 + vbroadcastss 40(%rbx), %zmm15 #479.33 + vbroadcastss 44(%rbx), %zmm14 #480.33 + vbroadcastss 64(%rbx), %zmm13 #481.33 + vbroadcastss 68(%rbx), %zmm12 #482.33 + vbroadcastss 72(%rbx), %zmm1 #483.33 + vbroadcastss 76(%rbx), %zmm0 #484.33 + testq %rdx, %rdx #498.28 + jle ..B7.30 # Prob 10% #498.28 + # LOE rax rdx rcx rbx r14 r13d zmm0 zmm1 zmm2 zmm3 zmm12 zmm13 zmm14 zmm15 zmm16 zmm17 zmm18 zmm19 +..B7.27: # 
Preds ..B7.26 + # Execution count [4.50e+00] + movq 176(%r14), %rsi #502.31 + vmovups %zmm1, 256(%rsp) #502.31[spill] + vmovups %zmm2, 192(%rsp) #502.31[spill] + vmovups %zmm3, 320(%rsp) #502.31[spill] + vmovups .L_2il0floatpacket.7(%rip), %zmm20 #502.31 + vmovups .L_2il0floatpacket.6(%rip), %zmm21 #502.31 + vmovups (%rsp), %zmm22 #502.31[spill] + vmovups 64(%rsp), %zmm23 #502.31[spill] + vmovups 128(%rsp), %zmm24 #502.31[spill] + # LOE rax rdx rcx rbx rsi r13d zmm0 zmm12 zmm13 zmm14 zmm15 zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 zmm24 +movl $111, %ebx # OSACA START MARKER +.byte 100 # OSACA START MARKER +.byte 103 # OSACA START MARKER +.byte 144 # OSACA START MARKER +# pointer_increment=64 7fb3a6c85239682e1bd1d29c30267e79 +# LLVM-MCA-BEGIN +..B7.28: # Preds ..B7.28 ..B7.27 + # Execution count [2.50e+01] + movl (%rcx,%rax,4), %edi #499.22 + movl %r13d, %r12d #528.39 + movslq %edi, %rdi #500.31 + incq %rax #498.9 + vmovups 320(%rsp), %zmm10 #512.35[spill] + testl $2147483647, %edi #526.56 + vmovups 256(%rsp), %zmm11 #514.35[spill] + vmovups 192(%rsp), %zmm9 #515.35[spill] + sete %r12b #528.39 + lea (%rdi,%rdi,2), %r14 #501.31 + shlq $5, %r14 #501.31 + movl %r12d, %r8d #528.39 + negl %r8d #528.39 + movl %r12d, %r11d #531.39 + addl $255, %r8d #528.39 + kmovw %r8d, %k0 #546.41 + lea (%r12,%r12,2), %r9d #529.39 + vsubps 64(%r14,%rbx), %zmm13, %zmm3 #508.35 + vsubps (%r14,%rbx), %zmm10, %zmm4 #512.35 + vsubps 64(%r14,%rbx), %zmm11, %zmm10 #514.35 + vsubps 32(%r14,%rbx), %zmm17, %zmm5 #507.35 + vsubps (%r14,%rbx), %zmm19, %zmm27 #506.35 + vsubps 32(%r14,%rbx), %zmm15, %zmm8 #513.35 + vsubps 64(%r14,%rbx), %zmm0, %zmm11 #517.35 + vsubps (%r14,%rbx), %zmm9, %zmm7 #515.35 + vsubps 32(%r14,%rbx), %zmm14, %zmm9 #516.35 + vsubps 64(%r14,%rbx), %zmm12, %zmm29 #511.35 + vsubps 32(%r14,%rbx), %zmm16, %zmm28 #510.35 + vsubps (%r14,%rbx), %zmm18, %zmm25 #509.35 + vmulps %zmm3, %zmm3, %zmm2 #541.80 + vmulps %zmm10, %zmm10, %zmm30 #543.80 + vmulps %zmm11, %zmm11, %zmm1 #544.80 + vmulps %zmm29, %zmm29, %zmm26 #542.80 + vfmadd231ps %zmm5, %zmm5, %zmm2 #541.57 + vfmadd231ps %zmm8, %zmm8, %zmm30 #543.57 + vfmadd231ps %zmm9, %zmm9, %zmm1 #544.57 + vfmadd231ps %zmm28, %zmm28, %zmm26 #542.57 + vfmadd231ps %zmm27, %zmm27, %zmm2 #541.34 + vfmadd231ps %zmm4, %zmm4, %zmm30 #543.34 + vfmadd231ps %zmm7, %zmm7, %zmm1 #544.34 + vfmadd231ps %zmm25, %zmm25, %zmm26 #542.34 + vrcp14ps %zmm2, %zmm31 #551.35 + vcmpps $17, %zmm24, %zmm30, %k7 #548.67 + vrcp14ps %zmm30, %zmm6 #553.35 + vcmpps $17, %zmm24, %zmm2, %k3 #546.67 + vrcp14ps %zmm1, %zmm2 #554.35 + vcmpps $17, %zmm24, %zmm26, %k5 #547.67 + vrcp14ps %zmm26, %zmm26 #552.35 + vmulps %zmm23, %zmm31, %zmm30 #556.67 + kandw %k3, %k0, %k2 #546.41 + vcmpps $17, %zmm24, %zmm1, %k3 #549.67 + vmulps %zmm30, %zmm31, %zmm1 #556.51 + vmulps %zmm1, %zmm31, %zmm30 #556.35 + negl %r9d #529.39 + vfmsub213ps %zmm20, %zmm31, %zmm1 #561.79 + vmulps %zmm22, %zmm31, %zmm31 #561.105 + vmulps %zmm31, %zmm1, %zmm31 #561.70 + addl $255, %r9d #529.39 + kmovw %r9d, %k4 #547.41 + vmulps %zmm31, %zmm30, %zmm30 #561.54 + kandw %k5, %k4, %k1 #547.41 + vmulps %zmm30, %zmm21, %zmm1 #561.36 + vmulps %zmm23, %zmm26, %zmm30 #557.67 + vmulps %zmm30, %zmm26, %zmm31 #557.51 + lea (,%r12,8), %r10d #530.39 + vmulps %zmm31, %zmm26, %zmm30 #557.35 + negl %r10d #530.39 + vfmsub213ps %zmm20, %zmm26, %zmm31 #562.79 + vmulps %zmm22, %zmm26, %zmm26 #562.105 + vmulps %zmm26, %zmm31, %zmm31 #562.70 + addl %r12d, %r10d #530.39 + vmulps %zmm31, %zmm30, %zmm30 #562.54 + addl $255, %r10d #530.39 + kmovw %r10d, %k6 
#548.41 + vmulps %zmm30, %zmm21, %zmm26 #562.36 + kandw %k7, %k6, %k4 #548.41 + vmulps %zmm26, %zmm25, %zmm25{%k1}{z} #569.33 + vmulps %zmm26, %zmm28, %zmm31{%k1}{z} #570.33 + vmulps %zmm23, %zmm6, %zmm28 #558.67 + vmulps %zmm26, %zmm29, %zmm30{%k1}{z} #571.33 + vmulps %zmm23, %zmm2, %zmm29 #559.67 + vfmadd231ps %zmm1, %zmm27, %zmm25{%k2} #599.83 + vfmadd231ps %zmm1, %zmm5, %zmm31{%k2} #600.83 + vmulps %zmm28, %zmm6, %zmm27 #558.51 + vfmadd231ps %zmm1, %zmm3, %zmm30{%k2} #601.83 + vmulps %zmm29, %zmm2, %zmm1 #559.51 + vmulps %zmm27, %zmm6, %zmm5 #558.35 + vfmsub213ps %zmm20, %zmm6, %zmm27 #563.79 + vmulps %zmm22, %zmm6, %zmm6 #563.105 + vmulps %zmm1, %zmm2, %zmm3 #559.35 + vfmsub213ps %zmm20, %zmm2, %zmm1 #564.79 + vmulps %zmm22, %zmm2, %zmm2 #564.105 + vmulps %zmm6, %zmm27, %zmm26 #563.70 + vmulps %zmm2, %zmm1, %zmm1 #564.70 + vmulps %zmm26, %zmm5, %zmm5 #563.54 + vmulps %zmm1, %zmm3, %zmm3 #564.54 + vmulps %zmm5, %zmm21, %zmm6 #563.36 + vmulps %zmm3, %zmm21, %zmm27 #564.36 + vfmadd231ps %zmm6, %zmm4, %zmm25{%k4} #599.89 + vmovups (%r14,%rsi), %zmm4 #599.44 + vfmadd231ps %zmm6, %zmm8, %zmm31{%k4} #600.89 + vfmadd231ps %zmm6, %zmm10, %zmm30{%k4} #601.89 + shll $4, %r11d #531.39 + subl %r11d, %r12d #531.39 + addl $255, %r12d #531.39 + kmovw %r12d, %k0 #549.41 + kandw %k3, %k0, %k5 #549.41 + vfmadd231ps %zmm27, %zmm7, %zmm25{%k5} #599.95 + vfmadd231ps %zmm27, %zmm9, %zmm31{%k5} #600.95 + vfmadd231ps %zmm27, %zmm11, %zmm30{%k5} #601.95 + vsubps %zmm25, %zmm4, %zmm7 #599.95 + vmovups %zmm7, (%r14,%rsi) #599.13 + vmovups 32(%r14,%rsi), %zmm8 #600.44 + vsubps %zmm31, %zmm8, %zmm4 #600.95 + vmovups %zmm4, 32(%r14,%rsi) #600.13 + vmovups 64(%r14,%rsi), %zmm1 #601.44 + vsubps %zmm30, %zmm1, %zmm2 #601.95 + vmovups %zmm2, 64(%r14,%rsi) #601.13 + cmpq %rdx, %rax #498.9 + jb ..B7.28 # Prob 82% #498.9 +# LLVM-MCA-END +movl $222, %ebx # OSACA END MARKER +.byte 100 # OSACA END MARKER +.byte 103 # OSACA END MARKER +.byte 144 # OSACA END MARKER + # LOE rax rdx rcx rbx rsi r13d zmm0 zmm12 zmm13 zmm14 zmm15 zmm16 zmm17 zmm18 zmm19 zmm20 zmm21 zmm22 zmm23 zmm24 +..B7.30: # Preds ..B7.28 ..B7.26 + # Execution count [5.00e+00] + movl $il0_peep_printf_format_2, %edi #605.9 + movq stderr(%rip), %rsi #605.9 + vzeroupper #605.9 + call fputs #605.9 + # LOE +..B7.31: # Preds ..B7.30 + # Execution count [5.00e+00] + movl $-1, %edi #605.9 +# exit(int) + call exit #605.9 + # LOE +..B7.32: # Preds ..B7.25 + # Execution count [1.00e+00]: Infreq + movl $.L_2__STRING.2, %edi #614.5 +..___tag_value_computeForceLJ_4xn_half.369: +# likwid_markerStopRegion(const char *) + call likwid_markerStopRegion #614.5 +..___tag_value_computeForceLJ_4xn_half.370: + # LOE rbx r12 +..B7.33: # Preds ..B7.32 + # Execution count [1.00e+00]: Infreq + xorl %eax, %eax #617.16 +..___tag_value_computeForceLJ_4xn_half.371: +# getTimeStamp() + call getTimeStamp #617.16 +..___tag_value_computeForceLJ_4xn_half.372: + # LOE rbx r12 xmm0 +..B7.41: # Preds ..B7.33 + # Execution count [1.00e+00]: Infreq + vmovsd %xmm0, (%rsp) #617.16[spill] + # LOE rbx r12 +..B7.34: # Preds ..B7.41 + # Execution count [1.00e+00]: Infreq + movl $.L_2__STRING.7, %edi #618.5 + xorl %eax, %eax #618.5 +..___tag_value_computeForceLJ_4xn_half.374: +# debug_printf(const char *, ...) 
+ call debug_printf #618.5 +..___tag_value_computeForceLJ_4xn_half.375: + # LOE rbx r12 +..B7.35: # Preds ..B7.34 + # Execution count [1.00e+00]: Infreq + vmovsd (%rsp), %xmm0 #619.14[spill] + vsubsd 200(%rsp), %xmm0, %xmm0 #619.14[spill] + addq $424, %rsp #619.14 + .cfi_restore 15 + popq %r15 #619.14 + .cfi_restore 14 + popq %r14 #619.14 + .cfi_restore 13 + popq %r13 #619.14 + movq %rbp, %rsp #619.14 + popq %rbp #619.14 + .cfi_def_cfa 7, 8 + .cfi_restore 6 + ret #619.14 + .cfi_def_cfa 6, 16 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x00, 0xff, 0xff, 0xff, 0x22 + .cfi_offset 6, -16 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + # LOE +..B7.36: # Preds ..B7.5 ..B7.9 + # Execution count [4.50e-01]: Infreq + movl %r13d, %r9d #448.9 + jmp ..B7.17 # Prob 100% #448.9 + .align 16,0x90 + # LOE rax rcx rsi r11 r12 r14 r15 edx edi r9d r10d r13d xmm0 xmm1 xmm2 + .cfi_endproc +# mark_end; + .type computeForceLJ_4xn_half,@function + .size computeForceLJ_4xn_half,.-computeForceLJ_4xn_half +..LNcomputeForceLJ_4xn_half.6: + .section .rodata.str1.32, "aMS",@progbits,1 + .space 3, 0x00 # pad + .align 32 +il0_peep_printf_format_2: + .long 1684892019 + .long 1918855263 + .long 1668637797 + .long 1970495333 + .long 975775853 + .long 1818313504 + .long 543450476 + .long 1752459639 + .long 1482047776 + .long 540160309 + .long 1920233065 + .long 1769172585 + .long 1629516643 + .long 1931502702 + .long 1818717801 + .long 1919954277 + .long 1936286565 + .long 544108393 + .long 1667852407 + .long 1936269416 + .long 1953459744 + .long 1818326560 + .long 169960553 + .byte 0 + .data +# -- End computeForceLJ_4xn_half + .section .rodata, "a" + .align 64 + .align 64 +.L_2il0floatpacket.6: + .long 0x42400000,0x42400000,0x42400000,0x42400000,0x42400000,0x42400000,0x42400000,0x42400000,0x42400000,0x42400000,0x42400000,0x42400000,0x42400000,0x42400000,0x42400000,0x42400000 + .type .L_2il0floatpacket.6,@object + .size .L_2il0floatpacket.6,64 + .align 64 +.L_2il0floatpacket.7: + .long 0x3f000000,0x3f000000,0x3f000000,0x3f000000,0x3f000000,0x3f000000,0x3f000000,0x3f000000,0x3f000000,0x3f000000,0x3f000000,0x3f000000,0x3f000000,0x3f000000,0x3f000000,0x3f000000 + .type .L_2il0floatpacket.7,@object + .size .L_2il0floatpacket.7,64 + .align 64 +.L_2il0floatpacket.8: + .long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f + .type .L_2il0floatpacket.8,@object + .size .L_2il0floatpacket.8,64 + .align 16 +.L_2il0floatpacket.0: + .long 0x00000004,0x00000004,0x00000004,0x00000004 + .type .L_2il0floatpacket.0,@object + .size .L_2il0floatpacket.0,16 + .align 16 +.L_2il0floatpacket.1: + .long 0x00000000,0x00000001,0x00000002,0x00000003 + .type .L_2il0floatpacket.1,@object + .size .L_2il0floatpacket.1,16 + .align 8 +.L_2il0floatpacket.2: + .long 0x00000000,0x3fe00000 + .type .L_2il0floatpacket.2,@object + .size .L_2il0floatpacket.2,8 + .align 4 +.L_2il0floatpacket.3: + .long 0x42400000 + .type .L_2il0floatpacket.3,@object + .size .L_2il0floatpacket.3,4 + .align 4 +.L_2il0floatpacket.4: + .long 0x3f000000 + .type .L_2il0floatpacket.4,@object + .size .L_2il0floatpacket.4,4 + .align 4 
+.L_2il0floatpacket.5: + .long 0x3f800000 + .type .L_2il0floatpacket.5,@object + .size .L_2il0floatpacket.5,4 + .section .rodata.str1.4, "aMS",@progbits,1 + .align 4 + .align 4 +.L_2__STRING.1: + .long 1886220131 + .long 1181054069 + .long 1701016175 + .long 1646283340 + .long 1852401509 + .word 10 + .type .L_2__STRING.1,@object + .size .L_2__STRING.1,22 + .space 2, 0x00 # pad + .align 4 +.L_2__STRING.2: + .long 1668444006 + .word 101 + .type .L_2__STRING.2,@object + .size .L_2__STRING.2,6 + .space 2, 0x00 # pad + .align 4 +.L_2__STRING.3: + .long 1886220131 + .long 1181054069 + .long 1701016175 + .long 1696614988 + .long 681070 + .type .L_2__STRING.3,@object + .size .L_2__STRING.3,20 + .align 4 +.L_2__STRING.4: + .long 1886220131 + .long 1181054069 + .long 1701016175 + .long 845105740 + .long 544108152 + .long 1768383842 + .word 2670 + .byte 0 + .type .L_2__STRING.4,@object + .size .L_2__STRING.4,27 + .space 1, 0x00 # pad + .align 4 +.L_2__STRING.5: + .long 1886220131 + .long 1181054069 + .long 1701016175 + .long 845105740 + .long 544108152 + .long 174354021 + .byte 0 + .type .L_2__STRING.5,@object + .size .L_2__STRING.5,25 + .space 3, 0x00 # pad + .align 4 +.L_2__STRING.6: + .long 1886220131 + .long 1181054069 + .long 1701016175 + .long 878660172 + .long 1646292600 + .long 1852401509 + .word 10 + .type .L_2__STRING.6,@object + .size .L_2__STRING.6,26 + .space 2, 0x00 # pad + .align 4 +.L_2__STRING.7: + .long 1886220131 + .long 1181054069 + .long 1701016175 + .long 878660172 + .long 1696624248 + .long 681070 + .type .L_2__STRING.7,@object + .size .L_2__STRING.7,24 + .data + .section .note.GNU-stack, "" +# End diff --git a/static_analysis/jan/gromacs-icx-avx512-dp.o b/static_analysis/jan/gromacs-icx-avx512-dp.o new file mode 100644 index 0000000..ac5e533 Binary files /dev/null and b/static_analysis/jan/gromacs-icx-avx512-dp.o differ diff --git a/static_analysis/jan/icx-icx-gromacs-avx512.s b/static_analysis/jan/gromacs-icx-avx512-dp.s similarity index 99% rename from static_analysis/jan/icx-icx-gromacs-avx512.s rename to static_analysis/jan/gromacs-icx-avx512-dp.s index f9863ad..bd35fde 100644 --- a/static_analysis/jan/icx-icx-gromacs-avx512.s +++ b/static_analysis/jan/gromacs-icx-avx512-dp.s @@ -2238,8 +2238,8 @@ movl $111, %ebx # OSACA START MARKER .byte 100 # OSACA START MARKER .byte 103 # OSACA START MARKER .byte 144 # OSACA START MARKER +# pointer_increment=64 da67166e5736661e6b03ea29ee7bfd67 # LLVM-MCA-BEGIN -# pointer_increment=256 da67166e5736661e6b03ea29ee7bfd67 .LBB5_12: # Parent Loop BB5_7 Depth=1 # => This Inner Loop Header: Depth=2 movslq (%r10,%rbx,4), %rcx diff --git a/static_analysis/jan/gromacs-icx-avx512-sp.o b/static_analysis/jan/gromacs-icx-avx512-sp.o new file mode 100644 index 0000000..119d7f9 Binary files /dev/null and b/static_analysis/jan/gromacs-icx-avx512-sp.o differ diff --git a/static_analysis/jan/icx-icx-gromacs-avx512-sp.s b/static_analysis/jan/gromacs-icx-avx512-sp.s similarity index 84% rename from static_analysis/jan/icx-icx-gromacs-avx512-sp.s rename to static_analysis/jan/gromacs-icx-avx512-sp.s index 69698ef..b1506e1 100644 --- a/static_analysis/jan/icx-icx-gromacs-avx512-sp.s +++ b/static_analysis/jan/gromacs-icx-avx512-sp.s @@ -1,17 +1,17 @@ .text .file "force_lj.c" - .section .rodata.cst8,"aM",@progbits,8 - .p2align 3 # -- Begin function computeForceLJ_ref -.LCPI0_0: - .quad 0x4048000000000000 # 48 -.LCPI0_2: - .quad 0xbfe0000000000000 # -0.5 -.LCPI0_3: - .quad 0x3fe0000000000000 # 0.5 .section .rodata.cst4,"aM",@progbits,4 - .p2align 2 + .p2align 2 
# -- Begin function computeForceLJ_ref +.LCPI0_0: + .long 0x42400000 # 48 .LCPI0_1: .long 0x3f800000 # 1 +.LCPI0_2: + .long 0xbf000000 # -0.5 + .section .rodata.cst8,"aM",@progbits,8 + .p2align 3 +.LCPI0_3: + .quad 0x3fe0000000000000 # 0.5 .text .globl computeForceLJ_ref .p2align 4, 0x90 @@ -146,8 +146,7 @@ computeForceLJ_ref: # leaq 32(%rax), %r15 vmovss (%rsp), %xmm0 # 4-byte Reload # xmm0 = mem[0],zero,zero,zero - vcvtss2sd %xmm0, %xmm0, %xmm0 - vmulsd .LCPI0_0(%rip), %xmm0, %xmm12 + vmulss .LCPI0_0(%rip), %xmm0, %xmm12 leaq 24(%rax), %rdx movq 160(%r14), %rdi movq 176(%r14), %rbp @@ -170,7 +169,7 @@ computeForceLJ_ref: # movq %rax, 120(%rsp) # 8-byte Spill xorl %edi, %edi vmovss .LCPI0_1(%rip), %xmm10 # xmm10 = mem[0],zero,zero,zero - vmovsd .LCPI0_2(%rip), %xmm11 # xmm11 = mem[0],zero + vmovss .LCPI0_2(%rip), %xmm11 # xmm11 = mem[0],zero,zero,zero vmovsd .LCPI0_3(%rip), %xmm8 # xmm8 = mem[0],zero vmovss 28(%rsp), %xmm20 # 4-byte Reload # xmm20 = mem[0],zero,zero,zero @@ -259,13 +258,10 @@ computeForceLJ_ref: # vmulss %xmm20, %xmm3, %xmm14 vmulss %xmm3, %xmm3, %xmm4 vmulss %xmm4, %xmm14, %xmm4 - vcvtss2sd %xmm4, %xmm4, %xmm4 - vaddsd %xmm4, %xmm11, %xmm14 - vcvtss2sd %xmm3, %xmm3, %xmm3 - vmulsd %xmm3, %xmm12, %xmm3 - vmulsd %xmm4, %xmm14, %xmm4 - vmulsd %xmm3, %xmm4, %xmm3 - vcvtsd2ss %xmm3, %xmm3, %xmm3 + vaddss %xmm4, %xmm11, %xmm14 + vmulss %xmm3, %xmm12, %xmm3 + vmulss %xmm4, %xmm14, %xmm4 + vmulss %xmm3, %xmm4, %xmm3 vfmadd231ss %xmm2, %xmm3, %xmm7 # xmm7 = (xmm3 * xmm2) + xmm7 vfmadd231ss %xmm1, %xmm3, %xmm5 # xmm5 = (xmm3 * xmm1) + xmm5 vfmadd231ss %xmm0, %xmm3, %xmm19 # xmm19 = (xmm3 * xmm0) + xmm19 @@ -311,25 +307,22 @@ computeForceLJ_ref: # vmulss %xmm20, %xmm3, %xmm4 vmulss %xmm3, %xmm3, %xmm6 vmulss %xmm4, %xmm6, %xmm4 - vcvtss2sd %xmm4, %xmm4, %xmm4 - vaddsd %xmm4, %xmm11, %xmm6 - vcvtss2sd %xmm3, %xmm3, %xmm3 - vmulsd %xmm3, %xmm12, %xmm3 - vmulsd %xmm4, %xmm6, %xmm4 - vmulsd %xmm3, %xmm4, %xmm3 - vcvtsd2ss %xmm3, %xmm3, %xmm3 - vmovss -32(%rsi,%rbx,4), %xmm4 # xmm4 = mem[0],zero,zero,zero - vmovss -64(%rsi,%rbx,4), %xmm6 # xmm6 = mem[0],zero,zero,zero - vfnmadd231ss %xmm2, %xmm3, %xmm6 # xmm6 = -(xmm3 * xmm2) + xmm6 - vmovss %xmm6, -64(%rsi,%rbx,4) - vfnmadd231ss %xmm0, %xmm3, %xmm4 # xmm4 = -(xmm3 * xmm0) + xmm4 - vmovss %xmm4, -32(%rsi,%rbx,4) + vaddss %xmm4, %xmm11, %xmm6 + vmulss %xmm3, %xmm12, %xmm3 + vmulss %xmm4, %xmm6, %xmm4 + vmovss -32(%rsi,%rbx,4), %xmm6 # xmm6 = mem[0],zero,zero,zero + vmulss %xmm3, %xmm4, %xmm3 + vmovss -64(%rsi,%rbx,4), %xmm4 # xmm4 = mem[0],zero,zero,zero + vfnmadd231ss %xmm3, %xmm2, %xmm4 # xmm4 = -(xmm2 * xmm3) + xmm4 + vmovss %xmm4, -64(%rsi,%rbx,4) + vfnmadd231ss %xmm3, %xmm0, %xmm6 # xmm6 = -(xmm0 * xmm3) + xmm6 + vmovss %xmm6, -32(%rsi,%rbx,4) vmovss (%rsi,%rbx,4), %xmm4 # xmm4 = mem[0],zero,zero,zero - vfnmadd231ss %xmm1, %xmm3, %xmm4 # xmm4 = -(xmm3 * xmm1) + xmm4 + vfnmadd231ss %xmm3, %xmm1, %xmm4 # xmm4 = -(xmm1 * xmm3) + xmm4 vmovss %xmm4, (%rsi,%rbx,4) vfmadd231ss %xmm2, %xmm3, %xmm7 # xmm7 = (xmm3 * xmm2) + xmm7 vfmadd231ss %xmm0, %xmm3, %xmm5 # xmm5 = (xmm3 * xmm0) + xmm5 - vfmadd231ss %xmm1, %xmm3, %xmm19 # xmm19 = (xmm3 * xmm1) + xmm19 + vfmadd231ss %xmm3, %xmm1, %xmm19 # xmm19 = (xmm1 * xmm3) + xmm19 movl $1, %r14d movq %rdx, %rcx .LBB0_71: # in Loop: Header=BB0_12 Depth=4 @@ -392,16 +385,13 @@ computeForceLJ_ref: # vmulss %xmm20, %xmm0, %xmm3 vmulss %xmm0, %xmm0, %xmm5 vmulss %xmm3, %xmm5, %xmm3 - vcvtss2sd %xmm3, %xmm3, %xmm3 - vaddsd %xmm3, %xmm11, %xmm5 - vcvtss2sd %xmm0, %xmm0, %xmm0 - vmulsd %xmm0, %xmm12, %xmm0 - 
vmulsd %xmm3, %xmm5, %xmm3 - vmulsd %xmm0, %xmm3, %xmm0 - vcvtsd2ss %xmm0, %xmm0, %xmm0 + vaddss %xmm3, %xmm11, %xmm5 + vmulss %xmm0, %xmm12, %xmm0 + vmulss %xmm3, %xmm5, %xmm3 + vmulss %xmm0, %xmm3, %xmm0 vmulss %xmm0, %xmm15, %xmm7 - vmulss %xmm0, %xmm1, %xmm5 - vmulss %xmm0, %xmm2, %xmm19 + vmulss %xmm1, %xmm0, %xmm5 + vmulss %xmm2, %xmm0, %xmm19 movl $1, %r14d movq %rdx, %rbx .LBB0_26: # in Loop: Header=BB0_10 Depth=3 @@ -425,13 +415,10 @@ computeForceLJ_ref: # vmulss %xmm20, %xmm0, %xmm17 vmulss %xmm0, %xmm0, %xmm3 vmulss %xmm17, %xmm3, %xmm3 - vcvtss2sd %xmm3, %xmm3, %xmm3 - vaddsd %xmm11, %xmm3, %xmm17 - vcvtss2sd %xmm0, %xmm0, %xmm0 - vmulsd %xmm0, %xmm12, %xmm0 - vmulsd %xmm3, %xmm17, %xmm3 - vmulsd %xmm0, %xmm3, %xmm0 - vcvtsd2ss %xmm0, %xmm0, %xmm0 + vaddss %xmm11, %xmm3, %xmm17 + vmulss %xmm0, %xmm12, %xmm0 + vmulss %xmm3, %xmm17, %xmm3 + vmulss %xmm0, %xmm3, %xmm0 vfmadd231ss %xmm2, %xmm0, %xmm7 # xmm7 = (xmm0 * xmm2) + xmm7 vfmadd231ss %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5 vfmadd231ss %xmm15, %xmm0, %xmm19 # xmm19 = (xmm0 * xmm15) + xmm19 @@ -463,13 +450,10 @@ computeForceLJ_ref: # vmulss %xmm20, %xmm0, %xmm17 vmulss %xmm0, %xmm0, %xmm3 vmulss %xmm17, %xmm3, %xmm3 - vcvtss2sd %xmm3, %xmm3, %xmm3 - vaddsd %xmm11, %xmm3, %xmm17 - vcvtss2sd %xmm0, %xmm0, %xmm0 - vmulsd %xmm0, %xmm12, %xmm0 - vmulsd %xmm3, %xmm17, %xmm3 - vmulsd %xmm0, %xmm3, %xmm0 - vcvtsd2ss %xmm0, %xmm0, %xmm0 + vaddss %xmm11, %xmm3, %xmm17 + vmulss %xmm0, %xmm12, %xmm0 + vmulss %xmm3, %xmm17, %xmm3 + vmulss %xmm0, %xmm3, %xmm0 vfmadd231ss %xmm2, %xmm0, %xmm7 # xmm7 = (xmm0 * xmm2) + xmm7 vfmadd231ss %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5 vfmadd231ss %xmm15, %xmm0, %xmm19 # xmm19 = (xmm0 * xmm15) + xmm19 @@ -501,13 +485,10 @@ computeForceLJ_ref: # vmulss %xmm20, %xmm0, %xmm17 vmulss %xmm0, %xmm0, %xmm3 vmulss %xmm17, %xmm3, %xmm3 - vcvtss2sd %xmm3, %xmm3, %xmm3 - vaddsd %xmm11, %xmm3, %xmm17 - vcvtss2sd %xmm0, %xmm0, %xmm0 - vmulsd %xmm0, %xmm12, %xmm0 - vmulsd %xmm3, %xmm17, %xmm3 - vmulsd %xmm0, %xmm3, %xmm0 - vcvtsd2ss %xmm0, %xmm0, %xmm0 + vaddss %xmm11, %xmm3, %xmm17 + vmulss %xmm0, %xmm12, %xmm0 + vmulss %xmm3, %xmm17, %xmm3 + vmulss %xmm0, %xmm3, %xmm0 vfmadd231ss %xmm2, %xmm0, %xmm7 # xmm7 = (xmm0 * xmm2) + xmm7 vfmadd231ss %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5 vfmadd231ss %xmm15, %xmm0, %xmm19 # xmm19 = (xmm0 * xmm15) + xmm19 @@ -539,13 +520,10 @@ computeForceLJ_ref: # vmulss %xmm20, %xmm0, %xmm17 vmulss %xmm0, %xmm0, %xmm3 vmulss %xmm17, %xmm3, %xmm3 - vcvtss2sd %xmm3, %xmm3, %xmm3 - vaddsd %xmm11, %xmm3, %xmm17 - vcvtss2sd %xmm0, %xmm0, %xmm0 - vmulsd %xmm0, %xmm12, %xmm0 - vmulsd %xmm3, %xmm17, %xmm3 - vmulsd %xmm0, %xmm3, %xmm0 - vcvtsd2ss %xmm0, %xmm0, %xmm0 + vaddss %xmm11, %xmm3, %xmm17 + vmulss %xmm0, %xmm12, %xmm0 + vmulss %xmm3, %xmm17, %xmm3 + vmulss %xmm0, %xmm3, %xmm0 vfmadd231ss %xmm2, %xmm0, %xmm7 # xmm7 = (xmm0 * xmm2) + xmm7 vfmadd231ss %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5 vfmadd231ss %xmm15, %xmm0, %xmm19 # xmm19 = (xmm0 * xmm15) + xmm19 @@ -577,13 +555,10 @@ computeForceLJ_ref: # vmulss %xmm20, %xmm0, %xmm17 vmulss %xmm0, %xmm0, %xmm3 vmulss %xmm17, %xmm3, %xmm3 - vcvtss2sd %xmm3, %xmm3, %xmm3 - vaddsd %xmm11, %xmm3, %xmm17 - vcvtss2sd %xmm0, %xmm0, %xmm0 - vmulsd %xmm0, %xmm12, %xmm0 - vmulsd %xmm3, %xmm17, %xmm3 - vmulsd %xmm0, %xmm3, %xmm0 - vcvtsd2ss %xmm0, %xmm0, %xmm0 + vaddss %xmm11, %xmm3, %xmm17 + vmulss %xmm0, %xmm12, %xmm0 + vmulss %xmm3, %xmm17, %xmm3 + vmulss %xmm0, %xmm3, %xmm0 vfmadd231ss %xmm2, %xmm0, %xmm7 # xmm7 = 
(xmm0 * xmm2) + xmm7 vfmadd231ss %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5 vfmadd231ss %xmm15, %xmm0, %xmm19 # xmm19 = (xmm0 * xmm15) + xmm19 @@ -615,13 +590,10 @@ computeForceLJ_ref: # vmulss %xmm20, %xmm0, %xmm17 vmulss %xmm0, %xmm0, %xmm3 vmulss %xmm17, %xmm3, %xmm3 - vcvtss2sd %xmm3, %xmm3, %xmm3 - vaddsd %xmm11, %xmm3, %xmm17 - vcvtss2sd %xmm0, %xmm0, %xmm0 - vmulsd %xmm0, %xmm12, %xmm0 - vmulsd %xmm3, %xmm17, %xmm3 - vmulsd %xmm0, %xmm3, %xmm0 - vcvtsd2ss %xmm0, %xmm0, %xmm0 + vaddss %xmm11, %xmm3, %xmm17 + vmulss %xmm0, %xmm12, %xmm0 + vmulss %xmm3, %xmm17, %xmm3 + vmulss %xmm0, %xmm3, %xmm0 vfmadd231ss %xmm2, %xmm0, %xmm7 # xmm7 = (xmm0 * xmm2) + xmm7 vfmadd231ss %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5 vfmadd231ss %xmm15, %xmm0, %xmm19 # xmm19 = (xmm0 * xmm15) + xmm19 @@ -704,8 +676,6 @@ computeForceLJ_ref: # .p2align 3 .LCPI1_3: .quad 0x3fe0000000000000 # 0.5 -.LCPI1_4: - .quad 0x41cdcd6500000000 # 1.0E+9 .text .globl computeForceLJ_2xnn_half .p2align 4, 0x90 @@ -725,27 +695,25 @@ computeForceLJ_2xnn_half: # .cfi_def_cfa_offset 48 pushq %rbx .cfi_def_cfa_offset 56 - subq $232, %rsp - .cfi_def_cfa_offset 288 + subq $216, %rsp + .cfi_def_cfa_offset 272 .cfi_offset %rbx, -56 .cfi_offset %r12, -48 .cfi_offset %r13, -40 .cfi_offset %r14, -32 .cfi_offset %r15, -24 .cfi_offset %rbp, -16 - movq %rcx, 48(%rsp) # 8-byte Spill - movq %rdx, %r12 + movq %rcx, 32(%rsp) # 8-byte Spill + movq %rdx, %r14 movq %rsi, %r15 - movq %rdi, %rbp - xorl %ebx, %ebx + movq %rdi, %r12 movl $.L.str.3, %edi xorl %eax, %eax callq debug_printf - vmovss 108(%rbp), %xmm0 # xmm0 = mem[0],zero,zero,zero + vmovss 108(%r12), %xmm0 # xmm0 = mem[0],zero,zero,zero vmovss %xmm0, (%rsp) # 4-byte Spill - vbroadcastss 48(%rbp), %zmm2 - movq %rbp, 32(%rsp) # 8-byte Spill - vpbroadcastd 40(%rbp), %zmm3 + vbroadcastss 48(%r12), %zmm2 + vpbroadcastd 40(%r12), %zmm3 movl 20(%r15), %r11d testl %r11d, %r11d jle .LBB1_5 @@ -769,70 +737,71 @@ computeForceLJ_2xnn_half: # leaq (%rdi,%rdi,8), %rax leaq (%rax,%rax,2), %rax addq %rdi, %rax - movl (%r10,%rax), %ecx - testl %ecx, %ecx + movl (%r10,%rax), %esi + testl %esi, %esi jle .LBB1_21 # %bb.3: # in Loop: Header=BB1_2 Depth=1 - leal (,%rdi,4), %esi - movl %esi, %eax + leal (,%rdi,4), %ebx + movl %ebx, %eax andl $2147483640, %eax # imm = 0x7FFFFFF8 leal (%rax,%rax,2), %eax - andl $4, %esi - orl %eax, %esi - cmpl $7, %ecx + andl $4, %ebx + orl %eax, %ebx + cmpl $7, %esi ja .LBB1_15 # %bb.4: # in Loop: Header=BB1_2 Depth=1 - movl %ecx, %ebp + movl %esi, %ebp andl $-8, %ebp - cmpq %rcx, %rbp + cmpq %rsi, %rbp jae .LBB1_21 jmp .LBB1_19 .p2align 5, 0x90 .LBB1_15: # in Loop: Header=BB1_2 Depth=1 - leaq (,%rcx,4), %rbp + leaq (,%rsi,4), %rbp andq $-32, %rbp - movl %esi, %r14d - leaq (%r9,%r14,4), %rax - xorl %edx, %edx + movl %ebx, %ecx + leaq (%r9,%rcx,4), %rdx + xorl %eax, %eax .p2align 4, 0x90 .LBB1_16: # Parent Loop BB1_2 Depth=1 # => This Inner Loop Header: Depth=2 - vmovups %ymm0, (%rax,%rdx) - addq $32, %rdx - cmpq %rdx, %rbp + vmovups %ymm0, (%rdx,%rax) + addq $32, %rax + cmpq %rax, %rbp jne .LBB1_16 # %bb.17: # in Loop: Header=BB1_2 Depth=1 - movl %ecx, %ebp + movl %esi, %ebp andl $-8, %ebp - addq %rbp, %r14 - vmovups %zmm1, (%r9,%r14,4) - cmpq %rcx, %rbp + addq %rbp, %rcx + vmovups %zmm1, (%r9,%rcx,4) + cmpq %rsi, %rbp jae .LBB1_21 .LBB1_19: # in Loop: Header=BB1_2 Depth=1 - movl %esi, %eax - leaq (%r8,%rax,4), %rdx + movl %ebx, %eax + leaq (%r8,%rax,4), %rcx .p2align 4, 0x90 .LBB1_20: # Parent Loop BB1_2 Depth=1 # => This Inner Loop Header: Depth=2 - movl $0, 
-64(%rdx,%rbp,4) - movl $0, -32(%rdx,%rbp,4) - movl $0, (%rdx,%rbp,4) + movl $0, -64(%rcx,%rbp,4) + movl $0, -32(%rcx,%rbp,4) + movl $0, (%rcx,%rbp,4) incq %rbp - cmpq %rbp, %rcx + cmpq %rbp, %rsi jne .LBB1_20 jmp .LBB1_21 .p2align 5, 0x90 .LBB1_5: + xorl %r13d, %r13d xorl %eax, %eax - vmovups %zmm2, 160(%rsp) # 64-byte Spill - vmovdqu64 %zmm3, 96(%rsp) # 64-byte Spill + vmovups %zmm2, 144(%rsp) # 64-byte Spill + vmovdqu64 %zmm3, 80(%rsp) # 64-byte Spill vzeroupper callq getTimeStamp - vmovsd %xmm0, 24(%rsp) # 8-byte Spill + vmovsd %xmm0, 16(%rsp) # 8-byte Spill movl $.L.str.1, %edi callq likwid_markerStartRegion - vmovups 96(%rsp), %zmm31 # 64-byte Reload - vmovups 160(%rsp), %zmm30 # 64-byte Reload + vmovups 80(%rsp), %zmm31 # 64-byte Reload + vmovups 144(%rsp), %zmm30 # 64-byte Reload cmpl $0, 20(%r15) jle .LBB1_10 # %bb.6: @@ -840,27 +809,25 @@ computeForceLJ_2xnn_half: # # xmm0 = mem[0],zero,zero,zero vmulss %xmm0, %xmm0, %xmm0 vbroadcastss %xmm0, %zmm0 - xorl %r11d, %r11d vbroadcastss .LCPI1_0(%rip), %zmm1 # zmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] vbroadcastss .LCPI1_1(%rip), %zmm2 # zmm2 = [4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1] - movw $4369, %cx # imm = 0x1111 - kmovw %ecx, %k1 + movw $4369, %ax # imm = 0x1111 + kmovw %eax, %k1 vmovdqu .LCPI1_2(%rip), %xmm3 # xmm3 = <1,u> # AlignMOV convert to UnAlignMOV vmovsd .LCPI1_3(%rip), %xmm4 # xmm4 = mem[0],zero - xorl %ebx, %ebx - movq %r12, 40(%rsp) # 8-byte Spill - movq %r15, 16(%rsp) # 8-byte Spill + movq %r14, 24(%rsp) # 8-byte Spill + movq %r15, 8(%rsp) # 8-byte Spill + movl $248, %ebp jmp .LBB1_7 .p2align 5, 0x90 .LBB1_13: # in Loop: Header=BB1_7 Depth=1 - movl 12(%rsp), %ebx # 4-byte Reload - movq 40(%rsp), %r12 # 8-byte Reload - movq 16(%rsp), %r15 # 8-byte Reload - movq 80(%rsp), %rax # 8-byte Reload - movq 72(%rsp), %rsi # 8-byte Reload - movq 64(%rsp), %r10 # 8-byte Reload - movq 56(%rsp), %rcx # 8-byte Reload + movq 24(%rsp), %r14 # 8-byte Reload + movq 8(%rsp), %r15 # 8-byte Reload + movq 64(%rsp), %rcx # 8-byte Reload + movq 56(%rsp), %rdx # 8-byte Reload + movq 48(%rsp), %rsi # 8-byte Reload + movq 40(%rsp), %rax # 8-byte Reload .LBB1_9: # in Loop: Header=BB1_7 Depth=1 vshuff64x2 $136, %zmm14, %zmm12, %zmm7 # zmm7 = zmm12[0,1,4,5],zmm14[0,1,4,5] vshuff64x2 $221, %zmm14, %zmm12, %zmm10 # zmm10 = zmm12[2,3,6,7],zmm14[2,3,6,7] @@ -870,8 +837,8 @@ computeForceLJ_2xnn_half: # vpermilps $177, %zmm7, %zmm10 # zmm10 = zmm7[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] vaddps %zmm10, %zmm7, %zmm7 vcompressps %zmm7, %zmm7 {%k1} {z} - vaddps (%r10,%rax,4), %xmm7, %xmm7 - vmovups %xmm7, (%r10,%rax,4) # AlignMOV convert to UnAlignMOV + vaddps (%rsi,%rcx,4), %xmm7, %xmm7 + vmovups %xmm7, (%rsi,%rcx,4) # AlignMOV convert to UnAlignMOV vshuff64x2 $136, %zmm9, %zmm8, %zmm7 # zmm7 = zmm8[0,1,4,5],zmm9[0,1,4,5] vshuff64x2 $221, %zmm9, %zmm8, %zmm8 # zmm8 = zmm8[2,3,6,7],zmm9[2,3,6,7] vaddps %zmm8, %zmm7, %zmm7 @@ -880,8 +847,8 @@ computeForceLJ_2xnn_half: # vpermilps $177, %zmm7, %zmm8 # zmm8 = zmm7[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] vaddps %zmm8, %zmm7, %zmm7 vcompressps %zmm7, %zmm7 {%k1} {z} - vaddps 32(%r10,%rax,4), %xmm7, %xmm7 - vmovups %xmm7, 32(%r10,%rax,4) # AlignMOV convert to UnAlignMOV + vaddps 32(%rsi,%rcx,4), %xmm7, %xmm7 + vmovups %xmm7, 32(%rsi,%rcx,4) # AlignMOV convert to UnAlignMOV vshuff64x2 $136, %zmm5, %zmm6, %zmm7 # zmm7 = zmm6[0,1,4,5],zmm5[0,1,4,5] 
vshuff64x2 $221, %zmm5, %zmm6, %zmm5 # zmm5 = zmm6[2,3,6,7],zmm5[2,3,6,7] vaddps %zmm5, %zmm7, %zmm5 @@ -890,70 +857,69 @@ computeForceLJ_2xnn_half: # vpermilps $177, %zmm5, %zmm6 # zmm6 = zmm5[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] vaddps %zmm6, %zmm5, %zmm5 vcompressps %zmm5, %zmm5 {%k1} {z} - vaddps 64(%r10,%rax,4), %xmm5, %xmm5 - vmovups %xmm5, 64(%r10,%rax,4) # AlignMOV convert to UnAlignMOV - vpinsrq $1, %rcx, %xmm3, %xmm5 - movq 48(%rsp), %rdx # 8-byte Reload - vpaddq (%rdx), %xmm5, %xmm5 - vmovdqu %xmm5, (%rdx) - addl %esi, %ebx - vcvtsi2sd %esi, %xmm11, %xmm5 + vaddps 64(%rsi,%rcx,4), %xmm5, %xmm5 + vmovups %xmm5, 64(%rsi,%rcx,4) # AlignMOV convert to UnAlignMOV + vpinsrq $1, %rax, %xmm3, %xmm5 + movq 32(%rsp), %rcx # 8-byte Reload + vpaddq (%rcx), %xmm5, %xmm5 + vmovdqu %xmm5, (%rcx) + vcvtsi2sd %edx, %xmm11, %xmm5 vmulsd %xmm4, %xmm5, %xmm5 - vcvttsd2si %xmm5, %rcx - addq %rcx, 16(%rdx) - incq %r11 - movslq 20(%r15), %rcx - cmpq %rcx, %r11 + vcvttsd2si %xmm5, %rax + addq %rax, 16(%rcx) + incq %r13 + movslq 20(%r15), %rax + cmpq %rax, %r13 jge .LBB1_10 .LBB1_7: # =>This Loop Header: Depth=1 # Child Loop BB1_12 Depth 2 - leal (,%r11,4), %eax - movl %eax, %ecx - andl $2147483640, %ecx # imm = 0x7FFFFFF8 - leal (%rcx,%rcx,2), %ecx - andl $4, %eax - orl %ecx, %eax - movq 176(%r15), %r10 - movq 24(%r12), %rcx - movslq (%rcx,%r11,4), %rsi - testq %rsi, %rsi + leal (,%r13,4), %ecx + movl %ecx, %eax + andl $2147483640, %eax # imm = 0x7FFFFFF8 + leal (%rax,%rax,2), %eax + andl $4, %ecx + orl %eax, %ecx + movq 176(%r15), %rsi + movq 24(%r14), %rax + movslq (%rax,%r13,4), %rdx + testq %rdx, %rdx jle .LBB1_8 # %bb.11: # in Loop: Header=BB1_7 Depth=1 - movl %ebx, 12(%rsp) # 4-byte Spill - movq 160(%r15), %r15 - vbroadcastss (%r15,%rax,4), %ymm5 - movq 8(%r12), %rcx - vbroadcastss 4(%r15,%rax,4), %ymm6 + movq 160(%r15), %r9 + vbroadcastss (%r9,%rcx,4), %ymm5 + movq 8(%r14), %rax + vbroadcastss 4(%r9,%rcx,4), %ymm6 vinsertf64x4 $1, %ymm6, %zmm5, %zmm7 - vbroadcastss 8(%r15,%rax,4), %ymm5 - vbroadcastss 12(%r15,%rax,4), %ymm6 - vbroadcastss 32(%r15,%rax,4), %ymm8 - vbroadcastss 36(%r15,%rax,4), %ymm9 + vbroadcastss 8(%r9,%rcx,4), %ymm5 + vbroadcastss 12(%r9,%rcx,4), %ymm6 + vbroadcastss 32(%r9,%rcx,4), %ymm8 + vbroadcastss 36(%r9,%rcx,4), %ymm9 vinsertf64x4 $1, %ymm6, %zmm5, %zmm10 vinsertf64x4 $1, %ymm9, %zmm8, %zmm11 - vbroadcastss 40(%r15,%rax,4), %ymm5 - vbroadcastss 44(%r15,%rax,4), %ymm6 + vbroadcastss 40(%r9,%rcx,4), %ymm5 + vbroadcastss 44(%r9,%rcx,4), %ymm6 vinsertf64x4 $1, %ymm6, %zmm5, %zmm13 - vbroadcastss 64(%r15,%rax,4), %ymm5 - vbroadcastss 68(%r15,%rax,4), %ymm6 + vbroadcastss 64(%r9,%rcx,4), %ymm5 + vbroadcastss 68(%r9,%rcx,4), %ymm6 vinsertf64x4 $1, %ymm6, %zmm5, %zmm15 - vbroadcastss 72(%r15,%rax,4), %ymm5 - movq %rax, 80(%rsp) # 8-byte Spill - vbroadcastss 76(%r15,%rax,4), %ymm6 + vbroadcastss 72(%r9,%rcx,4), %ymm5 + movq %rcx, 64(%rsp) # 8-byte Spill + vbroadcastss 76(%r9,%rcx,4), %ymm6 vinsertf64x4 $1, %ymm6, %zmm5, %zmm16 - movq %rsi, 72(%rsp) # 8-byte Spill - movl %esi, %eax - movl 16(%r12), %edx - imull %r11d, %edx - movslq %edx, %rdx - leaq (%rcx,%rdx,4), %rcx - movq %rcx, (%rsp) # 8-byte Spill - movq %rax, 56(%rsp) # 8-byte Spill - decq %rax - movq %rax, 88(%rsp) # 8-byte Spill + movq %rdx, 56(%rsp) # 8-byte Spill + movl %edx, %edx + movl 16(%r14), %ecx + imull %r13d, %ecx + movslq %ecx, %rcx + leaq (%rax,%rcx,4), %rax + movq %rax, (%rsp) # 8-byte Spill + movq %rdx, 40(%rsp) # 8-byte Spill + leaq -1(%rdx), %rax + movq %rax, 72(%rsp) # 8-byte Spill vxorps %xmm12, 
%xmm12, %xmm12 - movq %r10, 64(%rsp) # 8-byte Spill - xorl %ecx, %ecx + movq %rsi, 48(%rsp) # 8-byte Spill + movq %rsi, %rcx + xorl %r11d, %r11d vxorps %xmm8, %xmm8, %xmm8 vxorps %xmm6, %xmm6, %xmm6 vxorps %xmm14, %xmm14, %xmm14 @@ -963,47 +929,46 @@ computeForceLJ_2xnn_half: # .LBB1_12: # Parent Loop BB1_7 Depth=1 # => This Inner Loop Header: Depth=2 movq (%rsp), %rax # 8-byte Reload - movslq (%rax,%rcx,4), %rdx + movslq (%rax,%r11,4), %rdx leal (%rdx,%rdx), %esi - xorl %ebx, %ebx - cmpq %rsi, %r11 - leal 1(%rdx,%rdx), %edi - setne %bl - leal (%rbx,%rbx,2), %ebx - movl $255, %ebp - movl $248, %eax - cmovel %eax, %ebp - orl $252, %ebx - leal -127(%rbp), %r8d - cmpq %rdi, %r11 - cmovnel %ebp, %r8d - leal 193(%rbx), %r14d - xorl %r13d, %r13d - cmpq %rdi, %r11 - cmovnel %ebx, %r14d - sete %r13b - movl $0, %r9d - movl $-31, %eax - cmovel %eax, %r9d - leal 240(%r13), %edi - addl $255, %r13d - xorl %ebx, %ebx - cmpq %rsi, %r11 - cmovel %edi, %r13d - sete %bl + xorl %edi, %edi + cmpq %rsi, %r13 + leal 1(%rdx,%rdx), %ebx + setne %dil + leal (%rdi,%rdi,2), %edi + movl $255, %eax + cmovel %ebp, %eax + orl $252, %edi + leal -127(%rax), %r15d + cmpq %rbx, %r13 + cmovnel %eax, %r15d + leal 193(%rdi), %r10d + xorl %r8d, %r8d + cmpq %rbx, %r13 + cmovnel %edi, %r10d + sete %r8b + movl $0, %r12d + movl $-31, %edi + cmovel %edi, %r12d + leal 240(%r8), %eax + addl $255, %r8d + xorl %edi, %edi + cmpq %rsi, %r13 + cmovel %eax, %r8d + sete %dil shlq $5, %rdx - leaq (%rdx,%rdx,2), %r12 - vmovupd (%r15,%r12), %zmm17 - vbroadcastf64x4 (%r15,%r12), %zmm18 # zmm18 = mem[0,1,2,3,0,1,2,3] - vbroadcastf64x4 64(%r15,%r12), %zmm19 # zmm19 = mem[0,1,2,3,0,1,2,3] - subl %ebx, %r9d - addl $255, %r9d - shll $8, %r14d - orl %r9d, %r14d - kmovd %r14d, %k2 - shll $8, %r13d - orl %r8d, %r13d - kmovd %r13d, %k3 + leaq (%rdx,%rdx,2), %r14 + vmovupd (%r9,%r14), %zmm17 + vbroadcastf64x4 (%r9,%r14), %zmm18 # zmm18 = mem[0,1,2,3,0,1,2,3] + vbroadcastf64x4 64(%r9,%r14), %zmm19 # zmm19 = mem[0,1,2,3,0,1,2,3] + subl %edi, %r12d + addl $255, %r12d + shll $8, %r10d + orl %r12d, %r10d + kmovd %r10d, %k2 + shll $8, %r8d + orl %r15d, %r8d + kmovd %r8d, %k3 vshuff64x2 $238, %zmm17, %zmm17, %zmm17 # zmm17 = zmm17[4,5,6,7,4,5,6,7] vsubps %zmm18, %zmm7, %zmm20 vsubps %zmm17, %zmm11, %zmm21 @@ -1047,38 +1012,38 @@ computeForceLJ_2xnn_half: # vaddps %zmm17, %zmm21, %zmm24 vextractf64x4 $1, %zmm23, %ymm25 vaddps %ymm25, %ymm23, %ymm23 - vmovups (%r10,%r12), %ymm25 # AlignMOV convert to UnAlignMOV + vmovups (%rcx,%r14), %ymm25 # AlignMOV convert to UnAlignMOV vsubps %ymm23, %ymm25, %ymm23 - vmovups 32(%r10,%r12), %ymm25 # AlignMOV convert to UnAlignMOV - vmovups 64(%r10,%r12), %ymm26 # AlignMOV convert to UnAlignMOV - vmovups %ymm23, (%r10,%r12) # AlignMOV convert to UnAlignMOV + vmovups 32(%rcx,%r14), %ymm25 # AlignMOV convert to UnAlignMOV + vmovups 64(%rcx,%r14), %ymm26 # AlignMOV convert to UnAlignMOV + vmovups %ymm23, (%rcx,%r14) # AlignMOV convert to UnAlignMOV vaddps %zmm19, %zmm22, %zmm23 vextractf64x4 $1, %zmm24, %ymm27 vaddps %ymm27, %ymm24, %ymm24 vsubps %ymm24, %ymm25, %ymm24 - vmovups %ymm24, 32(%r10,%r12) # AlignMOV convert to UnAlignMOV + vmovups %ymm24, 32(%rcx,%r14) # AlignMOV convert to UnAlignMOV vextractf64x4 $1, %zmm23, %ymm24 vaddps %ymm24, %ymm23, %ymm23 vsubps %ymm23, %ymm26, %ymm23 - vmovups %ymm23, 64(%r10,%r12) # AlignMOV convert to UnAlignMOV + vmovups %ymm23, 64(%rcx,%r14) # AlignMOV convert to UnAlignMOV vaddps %zmm20, %zmm12, %zmm12 vaddps %zmm21, %zmm8, %zmm8 vaddps %zmm22, %zmm6, %zmm6 vaddps %zmm18, %zmm14, %zmm14 
vaddps %zmm17, %zmm9, %zmm9 vaddps %zmm19, %zmm5, %zmm5 - cmpq %rcx, 88(%rsp) # 8-byte Folded Reload + cmpq %r11, 72(%rsp) # 8-byte Folded Reload je .LBB1_13 # %bb.14: # in Loop: Header=BB1_12 Depth=2 - movq 16(%rsp), %rdx # 8-byte Reload - movq 160(%rdx), %r15 - movq 176(%rdx), %r10 - incq %rcx + movq 8(%rsp), %rax # 8-byte Reload + movq 160(%rax), %r9 + movq 176(%rax), %rcx + incq %r11 jmp .LBB1_12 .p2align 5, 0x90 .LBB1_8: # in Loop: Header=BB1_7 Depth=1 vxorps %xmm5, %xmm5, %xmm5 - movq %rsi, %rcx + movq %rdx, %rax vxorps %xmm9, %xmm9, %xmm9 vxorps %xmm14, %xmm14, %xmm14 vxorps %xmm6, %xmm6, %xmm6 @@ -1092,25 +1057,14 @@ computeForceLJ_2xnn_half: # callq likwid_markerStopRegion xorl %eax, %eax callq getTimeStamp - movq 32(%rsp), %rax # 8-byte Reload - vmovsd 184(%rax), %xmm3 # xmm3 = mem[0],zero - vsubsd 24(%rsp), %xmm0, %xmm1 # 8-byte Folded Reload - vmovsd %xmm1, (%rsp) # 8-byte Spill - vmulsd .LCPI1_4(%rip), %xmm3, %xmm0 - vmulsd %xmm1, %xmm0, %xmm0 - vcvtusi2sd %ebx, %xmm11, %xmm2 - vdivsd %xmm2, %xmm0, %xmm2 + vmovsd %xmm0, (%rsp) # 8-byte Spill movl $.L.str.4, %edi - movl %ebx, %esi - vmovapd %xmm3, %xmm0 - movb $3, %al - callq printf - movl $.L.str.5, %edi xorl %eax, %eax callq debug_printf vmovsd (%rsp), %xmm0 # 8-byte Reload # xmm0 = mem[0],zero - addq $232, %rsp + vsubsd 16(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload + addq $216, %rsp .cfi_def_cfa_offset 56 popq %rbx .cfi_def_cfa_offset 48 @@ -1377,12 +1331,6 @@ computeForceLJ_2xnn_full: # vxorps %xmm8, %xmm8, %xmm8 vxorps %xmm4, %xmm4, %xmm4 .p2align 4, 0x90 -movl $111, %ebx # OSACA START MARKER -.byte 100 # OSACA START MARKER -.byte 103 # OSACA START MARKER -.byte 144 # OSACA START MARKER -# LLVM-MCA-BEGIN -# pointer_increment=256 a23042eac7d8a1e13e9ff886fc02a80e .LBB2_12: # Parent Loop BB2_7 Depth=1 # => This Inner Loop Header: Depth=2 movslq (%r11,%rax,4), %rcx @@ -1464,11 +1412,6 @@ movl $111, %ebx # OSACA START MARKER incq %rax cmpq %rax, %r10 jne .LBB2_12 -# LLVM-MCA-END -movl $222, %ebx # OSACA END MARKER -.byte 100 # OSACA END MARKER -.byte 103 # OSACA END MARKER -.byte 144 # OSACA END MARKER # %bb.13: # in Loop: Header=BB2_7 Depth=1 movq %r15, %r14 movq 8(%rsp), %rbp # 8-byte Reload @@ -1481,7 +1424,7 @@ movl $222, %ebx # OSACA END MARKER xorl %eax, %eax callq getTimeStamp vmovsd %xmm0, 8(%rsp) # 8-byte Spill - movl $.L.str.5, %edi + movl $.L.str.4, %edi xorl %eax, %eax callq debug_printf vmovsd 8(%rsp), %xmm0 # 8-byte Reload @@ -1523,15 +1466,11 @@ computeForceLJ_2xnn: # .size computeForceLJ_2xnn, .Lfunc_end3-computeForceLJ_2xnn .cfi_endproc # -- End function - .section .rodata.cst8,"aM",@progbits,8 - .p2align 3 # -- Begin function computeForceLJ_4xn_half -.LCPI4_0: - .quad 0x41cdcd6500000000 # 1.0E+9 .section .rodata.cst4,"aM",@progbits,4 - .p2align 2 -.LCPI4_1: + .p2align 2 # -- Begin function computeForceLJ_4xn_half +.LCPI4_0: .long 0xbf000000 # -0.5 -.LCPI4_2: +.LCPI4_1: .long 0x42400000 # 48 .text .globl computeForceLJ_4xn_half @@ -1544,31 +1483,28 @@ computeForceLJ_4xn_half: # .cfi_def_cfa_offset 16 pushq %r14 .cfi_def_cfa_offset 24 - pushq %r13 - .cfi_def_cfa_offset 32 pushq %r12 - .cfi_def_cfa_offset 40 + .cfi_def_cfa_offset 32 pushq %rbx - .cfi_def_cfa_offset 48 - subq $576, %rsp # imm = 0x240 + .cfi_def_cfa_offset 40 + subq $584, %rsp # imm = 0x248 .cfi_def_cfa_offset 624 - .cfi_offset %rbx, -48 - .cfi_offset %r12, -40 - .cfi_offset %r13, -32 + .cfi_offset %rbx, -40 + .cfi_offset %r12, -32 .cfi_offset %r14, -24 .cfi_offset %r15, -16 movq %rdx, %r14 movq %rsi, %r15 - movq %rdi, %r12 - movl $.L.str.6, %edi + 
movq %rdi, %rbx + movl $.L.str.5, %edi xorl %eax, %eax callq debug_printf - vmovss 108(%r12), %xmm0 # xmm0 = mem[0],zero,zero,zero - vmovss %xmm0, 64(%rsp) # 4-byte Spill - vbroadcastss 48(%r12), %zmm0 + vmovss 108(%rbx), %xmm0 # xmm0 = mem[0],zero,zero,zero + vmovss %xmm0, (%rsp) # 4-byte Spill + vbroadcastss 48(%rbx), %zmm0 + vmovups %zmm0, 64(%rsp) # 64-byte Spill + vbroadcastss 40(%rbx), %zmm0 vmovups %zmm0, 512(%rsp) # 64-byte Spill - vbroadcastss 40(%r12), %zmm0 - vmovupd %zmm0, 448(%rsp) # 64-byte Spill movl 20(%r15), %r11d testl %r11d, %r11d jle .LBB4_5 @@ -1578,7 +1514,7 @@ computeForceLJ_4xn_half: # decq %r11 leaq 64(%r9), %r8 xorl %edi, %edi - vxorpd %xmm0, %xmm0, %xmm0 + vxorps %xmm0, %xmm0, %xmm0 vxorps %xmm1, %xmm1, %xmm1 jmp .LBB4_2 .p2align 5, 0x90 @@ -1614,21 +1550,21 @@ computeForceLJ_4xn_half: # .LBB4_10: # in Loop: Header=BB4_2 Depth=1 leaq (,%rcx,4), %rbx andq $-32, %rbx - movl %esi, %r13d - leaq (%r9,%r13,4), %rax + movl %esi, %r12d + leaq (%r9,%r12,4), %rax xorl %edx, %edx .p2align 4, 0x90 .LBB4_11: # Parent Loop BB4_2 Depth=1 # => This Inner Loop Header: Depth=2 - vmovupd %ymm0, (%rax,%rdx) + vmovups %ymm0, (%rax,%rdx) addq $32, %rdx cmpq %rdx, %rbx jne .LBB4_11 # %bb.12: # in Loop: Header=BB4_2 Depth=1 movl %ecx, %ebx andl $-8, %ebx - addq %rbx, %r13 - vmovups %zmm1, (%r9,%r13,4) + addq %rbx, %r12 + vmovups %zmm1, (%r9,%r12,4) cmpq %rcx, %rbx jae .LBB4_16 .LBB4_14: # in Loop: Header=BB4_2 Depth=1 @@ -1649,7 +1585,7 @@ computeForceLJ_4xn_half: # xorl %eax, %eax vzeroupper callq getTimeStamp - vmovsd %xmm0, (%rsp) # 8-byte Spill + vmovsd %xmm0, 128(%rsp) # 8-byte Spill movl $.L.str.1, %edi callq likwid_markerStartRegion cmpl $0, 20(%r15) @@ -1659,30 +1595,18 @@ computeForceLJ_4xn_half: # callq likwid_markerStopRegion xorl %eax, %eax callq getTimeStamp - vmovsd 184(%r12), %xmm3 # xmm3 = mem[0],zero - vsubsd (%rsp), %xmm0, %xmm1 # 8-byte Folded Reload - vmovsd %xmm1, (%rsp) # 8-byte Spill - vmulsd .LCPI4_0(%rip), %xmm3, %xmm0 - vmulsd %xmm1, %xmm0, %xmm0 - vxorpd %xmm2, %xmm2, %xmm2 - vdivsd %xmm2, %xmm0, %xmm2 - movl $.L.str.4, %edi - xorl %esi, %esi - vmovapd %xmm3, %xmm0 - movb $3, %al - callq printf - movl $.L.str.7, %edi + vmovsd %xmm0, 64(%rsp) # 8-byte Spill + movl $.L.str.6, %edi xorl %eax, %eax callq debug_printf - vmovsd (%rsp), %xmm0 # 8-byte Reload + vmovsd 64(%rsp), %xmm0 # 8-byte Reload # xmm0 = mem[0],zero - addq $576, %rsp # imm = 0x240 - .cfi_def_cfa_offset 48 - popq %rbx + vsubsd 128(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload + addq $584, %rsp # imm = 0x248 .cfi_def_cfa_offset 40 - popq %r12 + popq %rbx .cfi_def_cfa_offset 32 - popq %r13 + popq %r12 .cfi_def_cfa_offset 24 popq %r14 .cfi_def_cfa_offset 16 @@ -1697,26 +1621,26 @@ computeForceLJ_4xn_half: # testl %r10d, %r10d jle .LBB4_18 # %bb.7: - vmovss 64(%rsp), %xmm0 # 4-byte Reload + vmovss (%rsp), %xmm0 # 4-byte Reload # xmm0 = mem[0],zero,zero,zero vmulss %xmm0, %xmm0, %xmm0 vbroadcastss %xmm0, %zmm0 movq 160(%r15), %rdi movq 8(%r14), %r11 vbroadcastss (%rdi), %zmm1 - vmovups %zmm1, (%rsp) # 64-byte Spill - vbroadcastss 4(%rdi), %zmm1 - vmovups %zmm1, 64(%rsp) # 64-byte Spill - vbroadcastss 8(%rdi), %zmm1 - vmovups %zmm1, 384(%rsp) # 64-byte Spill - vbroadcastss 12(%rdi), %zmm1 - vmovups %zmm1, 320(%rsp) # 64-byte Spill - vbroadcastss 32(%rdi), %zmm1 - vmovups %zmm1, 256(%rsp) # 64-byte Spill - vbroadcastss 36(%rdi), %zmm1 - vmovups %zmm1, 192(%rsp) # 64-byte Spill - vbroadcastss 40(%rdi), %zmm1 vmovups %zmm1, 128(%rsp) # 64-byte Spill + vbroadcastss 4(%rdi), %zmm1 + vmovups %zmm1, (%rsp) # 
64-byte Spill + vbroadcastss 8(%rdi), %zmm1 + vmovups %zmm1, 448(%rsp) # 64-byte Spill + vbroadcastss 12(%rdi), %zmm1 + vmovups %zmm1, 384(%rsp) # 64-byte Spill + vbroadcastss 32(%rdi), %zmm1 + vmovups %zmm1, 320(%rsp) # 64-byte Spill + vbroadcastss 36(%rdi), %zmm1 + vmovups %zmm1, 256(%rsp) # 64-byte Spill + vbroadcastss 40(%rdi), %zmm1 + vmovups %zmm1, 192(%rsp) # 64-byte Spill vbroadcastss 44(%rdi), %zmm8 vbroadcastss 64(%rdi), %zmm9 vbroadcastss 68(%rdi), %zmm10 @@ -1726,11 +1650,17 @@ computeForceLJ_4xn_half: # xorl %edx, %edx movl $248, %r8d movl $240, %r9d - vbroadcastss .LCPI4_1(%rip), %zmm13 # zmm13 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] - vbroadcastss .LCPI4_2(%rip), %zmm14 # zmm14 = [4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1] - vmovups 512(%rsp), %zmm6 # 64-byte Reload - vmovups 448(%rsp), %zmm7 # 64-byte Reload + vbroadcastss .LCPI4_0(%rip), %zmm13 # zmm13 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] + vbroadcastss .LCPI4_1(%rip), %zmm14 # zmm14 = [4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1] + vmovups 64(%rsp), %zmm6 # 64-byte Reload + vmovups 512(%rsp), %zmm7 # 64-byte Reload .p2align 4, 0x90 +movl $111, %ebx # OSACA START MARKER +.byte 100 # OSACA START MARKER +.byte 103 # OSACA START MARKER +.byte 144 # OSACA START MARKER +# pointer_increment=64 0f91ac4f7fe1a70d0c899f7f3e745649 +# LLVM-MCA-BEGIN .LBB4_8: # =>This Inner Loop Header: Depth=1 movslq (%r11,%rdx,4), %rax movq %rax, %rsi @@ -1739,22 +1669,22 @@ computeForceLJ_4xn_half: # vmovups (%rdi,%rbx), %zmm15 # AlignMOV convert to UnAlignMOV vmovups 32(%rdi,%rbx), %zmm16 # AlignMOV convert to UnAlignMOV vmovups 64(%rdi,%rbx), %zmm27 # AlignMOV convert to UnAlignMOV - vmovups (%rsp), %zmm1 # 64-byte Reload + vmovups 128(%rsp), %zmm1 # 64-byte Reload vsubps %zmm15, %zmm1, %zmm24 - vmovups 256(%rsp), %zmm1 # 64-byte Reload + vmovups 320(%rsp), %zmm1 # 64-byte Reload vsubps %zmm16, %zmm1, %zmm25 vsubps %zmm27, %zmm9, %zmm26 - vmovups 64(%rsp), %zmm1 # 64-byte Reload + vmovups (%rsp), %zmm1 # 64-byte Reload vsubps %zmm15, %zmm1, %zmm21 - vmovups 192(%rsp), %zmm1 # 64-byte Reload + vmovups 256(%rsp), %zmm1 # 64-byte Reload vsubps %zmm16, %zmm1, %zmm22 vsubps %zmm27, %zmm10, %zmm23 - vmovups 384(%rsp), %zmm1 # 64-byte Reload + vmovups 448(%rsp), %zmm1 # 64-byte Reload vsubps %zmm15, %zmm1, %zmm17 - vmovups 128(%rsp), %zmm1 # 64-byte Reload + vmovups 192(%rsp), %zmm1 # 64-byte Reload vsubps %zmm16, %zmm1, %zmm19 vsubps %zmm27, %zmm11, %zmm20 - vmovups 320(%rsp), %zmm1 # 64-byte Reload + vmovups 384(%rsp), %zmm1 # 64-byte Reload vsubps %zmm15, %zmm1, %zmm18 vsubps %zmm16, %zmm8, %zmm16 vsubps %zmm27, %zmm12, %zmm15 @@ -1863,6 +1793,11 @@ computeForceLJ_4xn_half: # movq 160(%r15), %rdi incq %rdx jmp .LBB4_8 +# LLVM-MCA-END +movl $222, %ebx # OSACA END MARKER +.byte 100 # OSACA END MARKER +.byte 103 # OSACA END MARKER +.byte 144 # OSACA END MARKER .p2align 5, 0x90 .LBB4_18: vzeroupper @@ -1879,7 +1814,7 @@ simd_incr_reduced_sum: # pushq %rax .cfi_def_cfa_offset 16 movq stderr(%rip), %rcx - movl $.L.str.8, %edi + movl $.L.str.7, %edi movl $92, %esi movl $1, %edx callq fwrite@PLT @@ -1889,12 +1824,7 @@ simd_incr_reduced_sum: # .size simd_incr_reduced_sum, .Lfunc_end5-simd_incr_reduced_sum .cfi_endproc # -- End function - 
.section .rodata.cst8,"aM",@progbits,8 - .p2align 3 # -- Begin function computeForceLJ_4xn_full -.LCPI6_0: - .quad 0x41cdcd6500000000 # 1.0E+9 - .text - .globl computeForceLJ_4xn_full + .globl computeForceLJ_4xn_full # -- Begin function computeForceLJ_4xn_full .p2align 4, 0x90 .type computeForceLJ_4xn_full,@function computeForceLJ_4xn_full: # @@ -1904,31 +1834,27 @@ computeForceLJ_4xn_full: # .cfi_def_cfa_offset 16 pushq %r14 .cfi_def_cfa_offset 24 - pushq %r12 - .cfi_def_cfa_offset 32 pushq %rbx - .cfi_def_cfa_offset 40 - pushq %rax + .cfi_def_cfa_offset 32 + subq $16, %rsp .cfi_def_cfa_offset 48 - .cfi_offset %rbx, -40 - .cfi_offset %r12, -32 + .cfi_offset %rbx, -32 .cfi_offset %r14, -24 .cfi_offset %r15, -16 - movq %rsi, %r15 - movq %rdi, %r14 - movl $.L.str.6, %edi + movq %rsi, %r14 + movl $.L.str.5, %edi xorl %eax, %eax callq debug_printf - movl 20(%r15), %r11d + movl 20(%r14), %r11d testl %r11d, %r11d jle .LBB6_5 # %bb.1: - movq 176(%r15), %r9 - movq 192(%r15), %r10 + movq 176(%r14), %r9 + movq 192(%r14), %r10 decq %r11 leaq 64(%r9), %r8 xorl %edi, %edi - vxorpd %xmm0, %xmm0, %xmm0 + vxorps %xmm0, %xmm0, %xmm0 vxorps %xmm1, %xmm1, %xmm1 jmp .LBB6_2 .p2align 5, 0x90 @@ -1964,21 +1890,21 @@ computeForceLJ_4xn_full: # .LBB6_7: # in Loop: Header=BB6_2 Depth=1 leaq (,%rcx,4), %rbx andq $-32, %rbx - movl %esi, %r12d - leaq (%r9,%r12,4), %rax + movl %esi, %r15d + leaq (%r9,%r15,4), %rax xorl %edx, %edx .p2align 4, 0x90 .LBB6_8: # Parent Loop BB6_2 Depth=1 # => This Inner Loop Header: Depth=2 - vmovupd %ymm0, (%rax,%rdx) + vmovups %ymm0, (%rax,%rdx) addq $32, %rdx cmpq %rdx, %rbx jne .LBB6_8 # %bb.9: # in Loop: Header=BB6_2 Depth=1 movl %ecx, %ebx andl $-8, %ebx - addq %rbx, %r12 - vmovups %zmm1, (%r9,%r12,4) + addq %rbx, %r15 + vmovups %zmm1, (%r9,%r15,4) cmpq %rcx, %rbx jae .LBB6_13 .LBB6_11: # in Loop: Header=BB6_2 Depth=1 @@ -1999,38 +1925,26 @@ computeForceLJ_4xn_full: # xorl %eax, %eax vzeroupper callq getTimeStamp - vmovsd %xmm0, (%rsp) # 8-byte Spill + vmovsd %xmm0, 8(%rsp) # 8-byte Spill movl $.L.str.1, %edi callq likwid_markerStartRegion - cmpl $0, 20(%r15) + cmpl $0, 20(%r14) jg .LBB6_6 # %bb.14: movl $.L.str.1, %edi callq likwid_markerStopRegion xorl %eax, %eax callq getTimeStamp - vmovsd 184(%r14), %xmm3 # xmm3 = mem[0],zero - vsubsd (%rsp), %xmm0, %xmm1 # 8-byte Folded Reload - vmovsd %xmm1, (%rsp) # 8-byte Spill - vmulsd .LCPI6_0(%rip), %xmm3, %xmm0 - vmulsd %xmm1, %xmm0, %xmm0 - vxorpd %xmm2, %xmm2, %xmm2 - vdivsd %xmm2, %xmm0, %xmm2 - movl $.L.str.4, %edi - xorl %esi, %esi - vmovapd %xmm3, %xmm0 - movb $3, %al - callq printf - movl $.L.str.7, %edi + vmovsd %xmm0, (%rsp) # 8-byte Spill + movl $.L.str.6, %edi xorl %eax, %eax callq debug_printf vmovsd (%rsp), %xmm0 # 8-byte Reload # xmm0 = mem[0],zero - addq $8, %rsp - .cfi_def_cfa_offset 40 - popq %rbx + vsubsd 8(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload + addq $16, %rsp .cfi_def_cfa_offset 32 - popq %r12 + popq %rbx .cfi_def_cfa_offset 24 popq %r14 .cfi_def_cfa_offset 16 @@ -2081,23 +1995,19 @@ computeForceLJ_4xn: # .size .L.str.3, 27 .type .L.str.4,@object # .L.str.4: - .asciz "Its: %u Freq: %f Time: %f\nCy/it: %f\n" - .size .L.str.4, 39 + .asciz "computeForceLJ_2xnn end\n" + .size .L.str.4, 25 .type .L.str.5,@object # .L.str.5: - .asciz "computeForceLJ_2xnn end\n" - .size .L.str.5, 25 + .asciz "computeForceLJ_4xn begin\n" + .size .L.str.5, 26 .type .L.str.6,@object # .L.str.6: - .asciz "computeForceLJ_4xn begin\n" - .size .L.str.6, 26 + .asciz "computeForceLJ_4xn end\n" + .size .L.str.6, 24 .type .L.str.7,@object # 
.L.str.7: - .asciz "computeForceLJ_4xn end\n" - .size .L.str.7, 24 - .type .L.str.8,@object # -.L.str.8: .asciz "simd_h_reduce_sum(): Called with AVX512 intrinsics and single-precision which is not valid!\n" - .size .L.str.8, 93 + .size .L.str.7, 93 .ident "Intel(R) oneAPI DPC++/C++ Compiler 2022.1.0 (2022.1.0.20220316)" .section ".note.GNU-stack","",@progbits diff --git a/static_analysis/jan/icx-icc-lammps-novec.s b/static_analysis/jan/icx-icc-lammps-novec.s deleted file mode 100644 index 0b316cc..0000000 --- a/static_analysis/jan/icx-icc-lammps-novec.s +++ /dev/null @@ -1,1310 +0,0 @@ -# mark_description "Intel(R) C Intel(R) 64 Compiler Classic for applications running on Intel(R) 64, Version 2021.6.0 Build 2022"; -# mark_description "0226_000000"; -# mark_description "-I/apps/likwid/5.2.2/include -I././lammps/includes -I././common/includes -S -std=c11 -pedantic-errors -D_GNU"; -# mark_description "_SOURCE -DLIKWID_PERFMON -DAOS -DPRECISION=2 -DCOMPUTE_STATS -DVECTOR_WIDTH=1 -DENABLE_OMP_SIMD -DALIGNMENT="; -# mark_description "64 -restrict -Ofast -no-vec -o build-lammps-ICC-NOVEC-DP/force_lj.s"; - .file "force_lj.c" - .text -..TXTST0: -.L_2__routine_start_computeForceLJFullNeigh_plain_c_0: -# -- Begin computeForceLJFullNeigh_plain_c - .text -# mark_begin; - .align 16,0x90 - .globl computeForceLJFullNeigh_plain_c -# --- computeForceLJFullNeigh_plain_c(Parameter *, Atom *, Neighbor *, Stats *) -computeForceLJFullNeigh_plain_c: -# parameter 1: %rdi -# parameter 2: %rsi -# parameter 3: %rdx -# parameter 4: %rcx -..B1.1: # Preds ..B1.0 - # Execution count [1.00e+00] - .cfi_startproc -..___tag_value_computeForceLJFullNeigh_plain_c.1: -..L2: - #23.104 - pushq %r12 #23.104 - .cfi_def_cfa_offset 16 - .cfi_offset 12, -16 - pushq %r13 #23.104 - .cfi_def_cfa_offset 24 - .cfi_offset 13, -24 - pushq %r14 #23.104 - .cfi_def_cfa_offset 32 - .cfi_offset 14, -32 - pushq %r15 #23.104 - .cfi_def_cfa_offset 40 - .cfi_offset 15, -40 - pushq %rbx #23.104 - .cfi_def_cfa_offset 48 - .cfi_offset 3, -48 - pushq %rbp #23.104 - .cfi_def_cfa_offset 56 - .cfi_offset 6, -56 - subq $56, %rsp #23.104 - .cfi_def_cfa_offset 112 - movq %rdi, %rbp #23.104 - movq %rsi, %r15 #23.104 - movq %rcx, %r13 #23.104 - movq %rdx, %r12 #23.104 - movsd 144(%rbp), %xmm0 #27.27 - mulsd %xmm0, %xmm0 #27.45 - movsd 56(%rbp), %xmm1 #28.23 - movsd 40(%rbp), %xmm2 #29.24 - movl 4(%r15), %r14d #24.18 - movsd %xmm0, 32(%rsp) #27.45[spill] - movsd %xmm1, 24(%rsp) #28.23[spill] - movsd %xmm2, 40(%rsp) #29.24[spill] - testl %r14d, %r14d #32.24 - jle ..B1.16 # Prob 50% #32.24 - # LOE rbp r12 r13 r15 r14d -..B1.2: # Preds ..B1.1 - # Execution count [5.00e-03] - movq 64(%r15), %rdi #33.9 - lea (%r14,%r14,2), %eax #24.18 - cmpl $12, %eax #32.5 - jle ..B1.23 # Prob 0% #32.5 - # LOE rbp rdi r12 r13 r15 eax r14d -..B1.3: # Preds ..B1.2 - # Execution count [1.00e+00] - movslq %r14d, %r14 #32.5 - xorl %esi, %esi #32.5 - lea (%r14,%r14,2), %rdx #32.5 - shlq $3, %rdx #32.5 - call _intel_fast_memset #32.5 - # LOE rbp r12 r13 r14 r15 -..B1.5: # Preds ..B1.3 ..B1.28 ..B1.34 - # Execution count [1.00e+00] - xorl %ebx, %ebx #37.22 - xorl %eax, %eax #38.16 -..___tag_value_computeForceLJFullNeigh_plain_c.19: -# getTimeStamp() - call getTimeStamp #38.16 -..___tag_value_computeForceLJFullNeigh_plain_c.20: - # LOE rbx rbp r12 r13 r14 r15 xmm0 -..B1.31: # Preds ..B1.5 - # Execution count [1.00e+00] - movsd %xmm0, 16(%rsp) #38.16[spill] - # LOE rbx rbp r12 r13 r14 r15 -..B1.6: # Preds ..B1.31 - # Execution count [5.00e-01] - movl $.L_2__STRING.0, %edi #39.5 
-..___tag_value_computeForceLJFullNeigh_plain_c.22: -# likwid_markerStartRegion(const char *) - call likwid_markerStartRegion #39.5 -..___tag_value_computeForceLJFullNeigh_plain_c.23: - # LOE rbx rbp r12 r13 r14 r15 -..B1.7: # Preds ..B1.6 - # Execution count [9.00e-01] - xorl %ecx, %ecx #41.15 - movsd .L_2il0floatpacket.0(%rip), %xmm11 #77.41 - xorl %edx, %edx #41.5 - movq 16(%r12), %rax #42.19 - xorl %r11d, %r11d #41.5 - movslq 8(%r12), %rdi #42.43 - movq 24(%r12), %rsi #43.25 - mulsd 40(%rsp), %xmm11 #77.41[spill] - movsd 24(%rsp), %xmm12 #41.5[spill] - movsd 32(%rsp), %xmm13 #41.5[spill] - movsd .L_2il0floatpacket.1(%rip), %xmm1 #77.54 - shlq $2, %rdi #25.5 - movq 16(%r15), %r12 #44.25 - movq 64(%r15), %r8 #89.9 - xorl %r15d, %r15d #41.5 - movq (%r13), %r9 #93.9 - movq 8(%r13), %r10 #94.9 - movq %rbp, 8(%rsp) #41.5[spill] - movq %r13, (%rsp) #41.5[spill] - # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r14 r15 xmm1 xmm11 xmm12 xmm13 -..B1.8: # Preds ..B1.14 ..B1.7 - # Execution count [5.00e+00] - movslq (%rsi,%rdx,4), %r13 #43.25 - movq %r15, %rbp #56.9 - pxor %xmm2, %xmm2 #47.22 - movaps %xmm2, %xmm4 #48.22 - movsd (%r11,%r12), %xmm9 #44.25 - movaps %xmm4, %xmm10 #49.22 - movsd 8(%r11,%r12), %xmm8 #45.25 - movsd 16(%r11,%r12), %xmm5 #46.25 - testq %r13, %r13 #56.28 - jle ..B1.14 # Prob 10% #56.28 - # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r10 r11 r12 r13 r14 r15 xmm1 xmm2 xmm4 xmm5 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -..B1.9: # Preds ..B1.8 - # Execution count [4.50e+00] - imulq %rdi, %rcx #42.43 - addq %rax, %rcx #25.5 - # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r10 r11 r12 r13 r14 xmm1 xmm2 xmm4 xmm5 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -# LLVM-MCA-BEGIN -# OSACA-BEGIN -..B1.10: # Preds ..B1.12 ..B1.9 - # Execution count [2.50e+01] - movl (%rcx,%rbp,4), %r15d #57.21 - movaps %xmm9, %xmm3 #58.36 - movaps %xmm8, %xmm7 #59.36 - movaps %xmm5, %xmm6 #60.36 - lea (%r15,%r15,2), %r15d #58.36 - movslq %r15d, %r15 #58.36 - subsd (%r12,%r15,8), %xmm3 #58.36 - subsd 8(%r12,%r15,8), %xmm7 #59.36 - subsd 16(%r12,%r15,8), %xmm6 #60.36 - movaps %xmm3, %xmm0 #61.35 - movaps %xmm7, %xmm14 #61.49 - mulsd %xmm3, %xmm0 #61.35 - movaps %xmm6, %xmm15 #61.63 - mulsd %xmm7, %xmm14 #61.49 - mulsd %xmm6, %xmm15 #61.63 - addsd %xmm14, %xmm0 #61.49 - addsd %xmm15, %xmm0 #61.63 - comisd %xmm0, %xmm13 #71.22 - jbe ..B1.12 # Prob 50% #71.22 - # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r10 r11 r12 r13 r14 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -..B1.11: # Preds ..B1.10 - # Execution count [1.25e+01] - movsd .L_2il0floatpacket.3(%rip), %xmm14 #75.38 - incl %ebx #73.17 - divsd %xmm0, %xmm14 #75.38 - movaps %xmm12, %xmm0 #76.38 - mulsd %xmm14, %xmm0 #76.38 - mulsd %xmm14, %xmm0 #76.44 - mulsd %xmm14, %xmm0 #76.50 - mulsd %xmm11, %xmm14 #77.54 - mulsd %xmm0, %xmm14 #77.61 - subsd %xmm1, %xmm0 #77.54 - mulsd %xmm0, %xmm14 #77.67 - mulsd %xmm14, %xmm3 #78.31 - mulsd %xmm14, %xmm7 #79.31 - mulsd %xmm14, %xmm6 #80.31 - addsd %xmm3, %xmm2 #78.17 - addsd %xmm7, %xmm4 #79.17 - addsd %xmm6, %xmm10 #80.17 - # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r10 r11 r12 r13 r14 xmm1 xmm2 xmm4 xmm5 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -..B1.12: # Preds ..B1.11 ..B1.10 - # Execution count [2.50e+01] - incq %rbp #56.9 - cmpq %r13, %rbp #56.9 - jb ..B1.10 # Prob 82% #56.9 -# OSACA-END -# LLVM-MCA-END - # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r10 r11 r12 r13 r14 xmm1 xmm2 xmm4 xmm5 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -..B1.13: # Preds ..B1.12 - # Execution count [4.50e+00] - xorl %r15d, %r15d # - # LOE rax rdx rbx rsi rdi 
r8 r9 r10 r11 r12 r13 r14 r15 xmm1 xmm2 xmm4 xmm10 xmm11 xmm12 xmm13 -..B1.14: # Preds ..B1.13 ..B1.8 - # Execution count [5.00e+00] - movslq %edx, %rcx #41.32 - incq %rdx #41.5 - addq %r13, %r9 #93.9 - addq %r13, %r10 #94.9 - incq %rcx #41.32 - addsd (%r11,%r8), %xmm2 #89.9 - addsd 8(%r11,%r8), %xmm4 #90.9 - addsd 16(%r11,%r8), %xmm10 #91.9 - movsd %xmm2, (%r11,%r8) #89.9 - movsd %xmm4, 8(%r11,%r8) #90.9 - movsd %xmm10, 16(%r11,%r8) #91.9 - addq $24, %r11 #41.5 - cmpq %r14, %rdx #41.5 - jb ..B1.8 # Prob 82% #41.5 - # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r14 r15 xmm1 xmm11 xmm12 xmm13 -..B1.15: # Preds ..B1.14 - # Execution count [9.00e-01] - movq (%rsp), %r13 #[spill] - movq 8(%rsp), %rbp #[spill] - movq %r9, (%r13) #93.9 - movq %r10, 8(%r13) #94.9 - jmp ..B1.19 # Prob 100% #94.9 - # LOE rbx rbp -..B1.16: # Preds ..B1.1 - # Execution count [5.00e-01] - xorl %ebx, %ebx #37.22 - xorl %eax, %eax #38.16 -..___tag_value_computeForceLJFullNeigh_plain_c.31: -# getTimeStamp() - call getTimeStamp #38.16 -..___tag_value_computeForceLJFullNeigh_plain_c.32: - # LOE rbx rbp xmm0 -..B1.32: # Preds ..B1.16 - # Execution count [5.00e-01] - movsd %xmm0, 16(%rsp) #38.16[spill] - # LOE rbx rbp -..B1.17: # Preds ..B1.32 - # Execution count [5.00e-01] - movl $.L_2__STRING.0, %edi #39.5 -..___tag_value_computeForceLJFullNeigh_plain_c.34: -# likwid_markerStartRegion(const char *) - call likwid_markerStartRegion #39.5 -..___tag_value_computeForceLJFullNeigh_plain_c.35: - # LOE rbx rbp -..B1.19: # Preds ..B1.17 ..B1.15 - # Execution count [1.00e+00] - movl $.L_2__STRING.0, %edi #97.5 -..___tag_value_computeForceLJFullNeigh_plain_c.36: -# likwid_markerStopRegion(const char *) - call likwid_markerStopRegion #97.5 -..___tag_value_computeForceLJFullNeigh_plain_c.37: - # LOE rbx rbp -..B1.20: # Preds ..B1.19 - # Execution count [1.00e+00] - xorl %eax, %eax #98.16 -..___tag_value_computeForceLJFullNeigh_plain_c.38: -# getTimeStamp() - call getTimeStamp #98.16 -..___tag_value_computeForceLJFullNeigh_plain_c.39: - # LOE rbx rbp xmm0 -..B1.33: # Preds ..B1.20 - # Execution count [1.00e+00] - movaps %xmm0, %xmm1 #98.16 - # LOE rbx rbp xmm1 -..B1.21: # Preds ..B1.33 - # Execution count [1.00e+00] - pxor %xmm3, %xmm3 #100.5 - cvtsi2sdq %rbx, %xmm3 #100.5 - subsd 16(%rsp), %xmm1 #100.91[spill] - movsd .L_2il0floatpacket.2(%rip), %xmm2 #100.5 - movl $.L_2__STRING.1, %edi #100.5 - divsd %xmm3, %xmm2 #100.5 - mulsd %xmm1, %xmm2 #100.5 - movl %ebx, %esi #100.5 - movsd 264(%rbp), %xmm0 #100.71 - movl $3, %eax #100.5 - mulsd %xmm0, %xmm2 #100.5 - movsd %xmm1, (%rsp) #100.5[spill] -..___tag_value_computeForceLJFullNeigh_plain_c.41: -# printf(const char *__restrict__, ...) 
- call printf #100.5 -..___tag_value_computeForceLJFullNeigh_plain_c.42: - # LOE -..B1.22: # Preds ..B1.21 - # Execution count [1.00e+00] - movsd (%rsp), %xmm1 #[spill] - movaps %xmm1, %xmm0 #102.14 - addq $56, %rsp #102.14 - .cfi_def_cfa_offset 56 - .cfi_restore 6 - popq %rbp #102.14 - .cfi_def_cfa_offset 48 - .cfi_restore 3 - popq %rbx #102.14 - .cfi_def_cfa_offset 40 - .cfi_restore 15 - popq %r15 #102.14 - .cfi_def_cfa_offset 32 - .cfi_restore 14 - popq %r14 #102.14 - .cfi_def_cfa_offset 24 - .cfi_restore 13 - popq %r13 #102.14 - .cfi_def_cfa_offset 16 - .cfi_restore 12 - popq %r12 #102.14 - .cfi_def_cfa_offset 8 - ret #102.14 - .cfi_def_cfa_offset 112 - .cfi_offset 3, -48 - .cfi_offset 6, -56 - .cfi_offset 12, -16 - .cfi_offset 13, -24 - .cfi_offset 14, -32 - .cfi_offset 15, -40 - # LOE -..B1.23: # Preds ..B1.2 - # Execution count [1.11e+00]: Infreq - movl %eax, %edx #32.5 - xorl %ebx, %ebx #32.5 - movl $1, %esi #32.5 - xorl %ecx, %ecx #32.5 - shrl $1, %edx #32.5 - je ..B1.27 # Prob 10% #32.5 - # LOE rdx rcx rbx rbp rdi r12 r13 r15 eax esi r14d -..B1.24: # Preds ..B1.23 - # Execution count [1.00e+00]: Infreq - xorl %esi, %esi #32.5 - # LOE rdx rcx rbx rbp rsi rdi r12 r13 r15 eax r14d -..B1.25: # Preds ..B1.25 ..B1.24 - # Execution count [2.78e+00]: Infreq - incq %rbx #32.5 - movq %rsi, (%rcx,%rdi) #33.9 - movq %rsi, 8(%rcx,%rdi) #33.9 - addq $16, %rcx #32.5 - cmpq %rdx, %rbx #32.5 - jb ..B1.25 # Prob 64% #32.5 - # LOE rdx rcx rbx rbp rsi rdi r12 r13 r15 eax r14d -..B1.26: # Preds ..B1.25 - # Execution count [1.00e+00]: Infreq - lea 1(%rbx,%rbx), %esi #33.9 - # LOE rbp rdi r12 r13 r15 eax esi r14d -..B1.27: # Preds ..B1.23 ..B1.26 - # Execution count [1.11e+00]: Infreq - lea -1(%rsi), %edx #32.5 - cmpl %eax, %edx #32.5 - jae ..B1.34 # Prob 10% #32.5 - # LOE rbp rdi r12 r13 r15 esi r14d -..B1.28: # Preds ..B1.27 - # Execution count [1.00e+00]: Infreq - movslq %esi, %rsi #32.5 - movslq %r14d, %r14 #32.5 - movq $0, -8(%rdi,%rsi,8) #33.9 - jmp ..B1.5 # Prob 100% #33.9 - # LOE rbp r12 r13 r14 r15 -..B1.34: # Preds ..B1.27 - # Execution count [1.11e-01]: Infreq - movslq %r14d, %r14 #32.5 - jmp ..B1.5 # Prob 100% #32.5 - .align 16,0x90 - # LOE rbp r12 r13 r14 r15 - .cfi_endproc -# mark_end; - .type computeForceLJFullNeigh_plain_c,@function - .size computeForceLJFullNeigh_plain_c,.-computeForceLJFullNeigh_plain_c -..LNcomputeForceLJFullNeigh_plain_c.0: - .data -# -- End computeForceLJFullNeigh_plain_c - .text -.L_2__routine_start_computeForceLJHalfNeigh_1: -# -- Begin computeForceLJHalfNeigh - .text -# mark_begin; - .align 16,0x90 - .globl computeForceLJHalfNeigh -# --- computeForceLJHalfNeigh(Parameter *, Atom *, Neighbor *, Stats *) -computeForceLJHalfNeigh: -# parameter 1: %rdi -# parameter 2: %rsi -# parameter 3: %rdx -# parameter 4: %rcx -..B2.1: # Preds ..B2.0 - # Execution count [1.00e+00] - .cfi_startproc -..___tag_value_computeForceLJHalfNeigh.66: -..L67: - #105.96 - pushq %r12 #105.96 - .cfi_def_cfa_offset 16 - .cfi_offset 12, -16 - pushq %r13 #105.96 - .cfi_def_cfa_offset 24 - .cfi_offset 13, -24 - pushq %r14 #105.96 - .cfi_def_cfa_offset 32 - .cfi_offset 14, -32 - pushq %r15 #105.96 - .cfi_def_cfa_offset 40 - .cfi_offset 15, -40 - pushq %rbx #105.96 - .cfi_def_cfa_offset 48 - .cfi_offset 3, -48 - pushq %rbp #105.96 - .cfi_def_cfa_offset 56 - .cfi_offset 6, -56 - subq $216, %rsp #105.96 - .cfi_def_cfa_offset 272 - movq %rdi, %r14 #105.96 - movq %rsi, %r15 #105.96 - movq %rcx, %r12 #105.96 - movq %rdx, 32(%rsp) #105.96[spill] - movsd 144(%r14), %xmm0 #109.27 - mulsd %xmm0, %xmm0 
#109.45 - movsd 56(%r14), %xmm1 #110.23 - movsd 40(%r14), %xmm2 #111.24 - movl 4(%r15), %ebp #106.18 - movsd %xmm0, 48(%rsp) #109.45[spill] - movsd %xmm1, 40(%rsp) #110.23[spill] - movsd %xmm2, 24(%rsp) #111.24[spill] - testl %ebp, %ebp #114.24 - jle ..B2.51 # Prob 50% #114.24 - # LOE r12 r14 r15 ebp -..B2.2: # Preds ..B2.1 - # Execution count [5.00e-03] - movq 64(%r15), %rdi #115.9 - lea (%rbp,%rbp,2), %ebx #106.18 - cmpl $12, %ebx #114.5 - jle ..B2.59 # Prob 0% #114.5 - # LOE rdi r12 r14 r15 ebx ebp -..B2.3: # Preds ..B2.2 - # Execution count [1.00e+00] - movslq %ebp, %r13 #114.5 - xorl %esi, %esi #114.5 - lea (%r13,%r13,2), %rdx #114.5 - shlq $3, %rdx #114.5 - call _intel_fast_memset #114.5 - # LOE r12 r13 r14 r15 ebp -..B2.5: # Preds ..B2.3 ..B2.64 ..B2.70 - # Execution count [1.00e+00] - xorl %ebx, %ebx #120.22 - xorl %eax, %eax #121.16 -..___tag_value_computeForceLJHalfNeigh.85: -# getTimeStamp() - call getTimeStamp #121.16 -..___tag_value_computeForceLJHalfNeigh.86: - # LOE r12 r13 r14 r15 ebx ebp xmm0 -..B2.67: # Preds ..B2.5 - # Execution count [1.00e+00] - movsd %xmm0, 16(%rsp) #121.16[spill] - # LOE r12 r13 r14 r15 ebx ebp -..B2.6: # Preds ..B2.67 - # Execution count [5.00e-01] - movl $.L_2__STRING.2, %edi #122.5 -..___tag_value_computeForceLJHalfNeigh.88: -# likwid_markerStartRegion(const char *) - call likwid_markerStartRegion #122.5 -..___tag_value_computeForceLJHalfNeigh.89: - # LOE r12 r13 r14 r15 ebx ebp -..B2.7: # Preds ..B2.6 - # Execution count [9.00e-01] - movsd .L_2il0floatpacket.0(%rip), %xmm13 #161.41 - movd %ebp, %xmm0 #106.18 - mulsd 24(%rsp), %xmm13 #161.41[spill] - xorl %r9d, %r9d #124.15 - movq 32(%rsp), %rdx #125.19[spill] - movaps %xmm13, %xmm2 #161.41 - movsd 40(%rsp), %xmm3 #110.21[spill] - xorl %r8d, %r8d #124.5 - movsd 48(%rsp), %xmm6 #109.25[spill] - xorl %esi, %esi #124.5 - unpcklpd %xmm3, %xmm3 #110.21 - unpcklpd %xmm2, %xmm2 #161.41 - pshufd $0, %xmm0, %xmm0 #106.18 - movq 16(%rdx), %rdi #125.19 - movslq 8(%rdx), %rax #125.43 - movq 24(%rdx), %rcx #126.25 - movq 16(%r15), %rdx #127.25 - movq 64(%r15), %r15 #168.21 - unpcklpd %xmm6, %xmm6 #109.25 - movups .L_2il0floatpacket.7(%rip), %xmm1 #161.54 - movsd .L_2il0floatpacket.1(%rip), %xmm7 #161.54 - shlq $2, %rax #107.5 - movq (%r12), %r10 #179.9 - movq 8(%r12), %r11 #180.9 - movdqu %xmm0, 160(%rsp) #124.5[spill] - movups %xmm2, 192(%rsp) #124.5[spill] - movups %xmm3, 176(%rsp) #124.5[spill] - movq %rdi, 56(%rsp) #124.5[spill] - movl %ebp, 64(%rsp) #124.5[spill] - movq %r14, (%rsp) #124.5[spill] - movq %r12, 8(%rsp) #124.5[spill] - movsd 40(%rsp), %xmm10 #124.5[spill] - movsd 48(%rsp), %xmm12 #124.5[spill] - # LOE rax rdx rcx rsi r8 r9 r10 r11 r13 r15 ebx xmm6 xmm7 xmm10 xmm12 xmm13 -..B2.8: # Preds ..B2.49 ..B2.7 - # Execution count [5.00e+00] - movl (%rcx,%r8,4), %r14d #126.25 - addl %r14d, %ebx #138.9 - pxor %xmm5, %xmm5 #130.22 - movaps %xmm5, %xmm4 #131.22 - movsd (%rsi,%rdx), %xmm9 #127.25 - movaps %xmm4, %xmm0 #132.22 - movsd 8(%rsi,%rdx), %xmm8 #128.25 - movsd 16(%rsi,%rdx), %xmm11 #129.25 - testl %r14d, %r14d #143.9 - jle ..B2.48 # Prob 50% #143.9 - # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r13 r15 r14d xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -..B2.9: # Preds ..B2.8 - # Execution count [2.50e+00] - jbe ..B2.48 # Prob 50% #143.9 - # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r13 r15 r14d xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -..B2.10: # Preds ..B2.9 - # Execution count [2.25e+00] - cmpl $2, %r14d #143.9 - jb ..B2.58 # Prob 10% #143.9 - # LOE rax rdx rcx rbx rsi r8 
r9 r10 r11 r13 r15 r14d xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -..B2.11: # Preds ..B2.10 - # Execution count [2.25e+00] - movq %rax, %rdi #125.43 - movl %r14d, %r12d #143.9 - imulq %r9, %rdi #125.43 - pxor %xmm5, %xmm5 #130.22 - movaps %xmm9, %xmm1 #127.23 - movaps %xmm5, %xmm4 #131.22 - movaps %xmm8, %xmm2 #128.23 - movaps %xmm11, %xmm3 #129.23 - andl $-2, %r12d #143.9 - movsd %xmm11, 120(%rsp) #143.9[spill] - addq 56(%rsp), %rdi #107.5[spill] - xorl %ebp, %ebp #143.9 - unpcklpd %xmm1, %xmm1 #127.23 - movaps %xmm4, %xmm0 #132.22 - unpcklpd %xmm2, %xmm2 #128.23 - unpcklpd %xmm3, %xmm3 #129.23 - movslq %r12d, %r12 #143.9 - movsd %xmm8, 128(%rsp) #143.9[spill] - movsd %xmm9, 136(%rsp) #143.9[spill] - movsd %xmm13, 144(%rsp) #143.9[spill] - movl %r14d, 24(%rsp) #143.9[spill] - movq %rsi, 32(%rsp) #143.9[spill] - movq %rax, 72(%rsp) #143.9[spill] - movq %r11, 80(%rsp) #143.9[spill] - movq %r10, 88(%rsp) #143.9[spill] - movq %rcx, 96(%rsp) #143.9[spill] - movq %r8, 104(%rsp) #143.9[spill] - movq %r13, 112(%rsp) #143.9[spill] - movdqu .L_2il0floatpacket.5(%rip), %xmm11 #143.9 - movdqu .L_2il0floatpacket.4(%rip), %xmm12 #143.9 - # LOE rdx rbx rbp rdi r9 r12 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm11 xmm12 -..B2.12: # Preds ..B2.38 ..B2.11 - # Execution count [1.25e+01] - movq (%rdi,%rbp,4), %xmm10 #144.21 - movdqa %xmm12, %xmm15 #146.36 - movdqa %xmm10, %xmm7 #145.36 - paddd %xmm10, %xmm7 #145.36 - paddd %xmm10, %xmm7 #145.36 - movdqa %xmm7, %xmm9 #145.36 - paddd %xmm7, %xmm15 #146.36 - movd %xmm7, %r13d #145.36 - paddd %xmm11, %xmm7 #147.36 - psrldq $4, %xmm9 #145.36 - movd %xmm9, %r11d #145.36 - movaps %xmm1, %xmm9 #145.36 - movd %xmm15, %r10d #146.36 - psrldq $4, %xmm15 #146.36 - movd %xmm15, %r8d #146.36 - movd %xmm7, %ecx #147.36 - psrldq $4, %xmm7 #147.36 - movd %xmm7, %eax #147.36 - movaps %xmm3, %xmm7 #147.36 - movslq %r13d, %r13 #145.36 - movslq %r11d, %r11 #145.36 - movslq %r10d, %r10 #146.36 - movslq %r8d, %r8 #146.36 - movsd (%rdx,%r13,8), %xmm8 #145.36 - movhpd (%rdx,%r11,8), %xmm8 #145.36 - movsd (%rdx,%r10,8), %xmm13 #146.36 - subpd %xmm8, %xmm9 #145.36 - movhpd (%rdx,%r8,8), %xmm13 #146.36 - movaps %xmm2, %xmm8 #146.36 - movslq %ecx, %rcx #147.36 - movaps %xmm9, %xmm15 #148.35 - subpd %xmm13, %xmm8 #146.36 - mulpd %xmm9, %xmm15 #148.35 - movslq %eax, %rax #147.36 - movaps %xmm8, %xmm13 #148.49 - movsd (%rdx,%rcx,8), %xmm14 #147.36 - mulpd %xmm8, %xmm13 #148.49 - movhpd (%rdx,%rax,8), %xmm14 #147.36 - subpd %xmm14, %xmm7 #147.36 - addpd %xmm13, %xmm15 #148.49 - movaps %xmm7, %xmm14 #148.63 - mulpd %xmm7, %xmm14 #148.63 - addpd %xmm14, %xmm15 #148.63 - movaps %xmm15, %xmm13 #158.22 - cmpltpd %xmm6, %xmm13 #158.22 - movmskpd %xmm13, %r14d #158.22 - testl %r14d, %r14d #158.22 - je ..B2.38 # Prob 50% #158.22 - # LOE rax rdx rcx rbx rbp rdi r8 r9 r10 r11 r12 r13 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm15 -..B2.13: # Preds ..B2.12 - # Execution count [6.25e+00] - movups .L_2il0floatpacket.6(%rip), %xmm14 #159.38 - divpd %xmm15, %xmm14 #159.38 - movdqu 160(%rsp), %xmm15 #167.24[spill] - pcmpgtd %xmm10, %xmm15 #167.24 - movups 176(%rsp), %xmm10 #160.38[spill] - mulpd %xmm14, %xmm10 #160.38 - mulpd %xmm14, %xmm10 #160.44 - mulpd %xmm14, %xmm10 #160.50 - mulpd 192(%rsp), %xmm14 #161.54[spill] - mulpd %xmm10, %xmm14 #161.61 - subpd .L_2il0floatpacket.7(%rip), %xmm10 #161.54 - mulpd %xmm10, %xmm14 #161.67 - mulpd %xmm14, %xmm9 #162.31 - mulpd %xmm14, %xmm8 #163.31 - mulpd %xmm14, %xmm7 #164.31 - punpckldq %xmm15, %xmm15 #167.24 - 
movaps %xmm13, %xmm14 #162.31 - andps %xmm13, %xmm15 #167.24 - movaps %xmm13, %xmm10 #163.31 - movmskpd %xmm15, %esi #167.24 - andps %xmm9, %xmm14 #162.31 - andps %xmm8, %xmm10 #163.31 - andps %xmm7, %xmm13 #164.31 - addpd %xmm14, %xmm5 #162.17 - addpd %xmm10, %xmm4 #163.17 - addpd %xmm13, %xmm0 #164.17 - testl %esi, %esi #167.24 - je ..B2.38 # Prob 50% #167.24 - # LOE rax rdx rcx rbx rbp rdi r8 r9 r10 r11 r12 r13 r15 esi xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm11 xmm12 -..B2.14: # Preds ..B2.13 - # Execution count [3.12e+00] - movl %esi, %r14d #168.21 - andl $2, %r14d #168.21 - andl $1, %esi #168.21 - je ..B2.17 # Prob 40% #168.21 - # LOE rax rdx rcx rbx rbp rdi r8 r9 r10 r11 r12 r13 r15 esi r14d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm11 xmm12 -..B2.15: # Preds ..B2.14 - # Execution count [3.12e+00] - movsd (%r15,%r13,8), %xmm10 #168.21 - testl %r14d, %r14d #168.21 - jne ..B2.18 # Prob 60% #168.21 - # LOE rax rdx rcx rbx rbp rdi r8 r9 r10 r11 r12 r13 r15 esi r14d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 -..B2.16: # Preds ..B2.15 - # Execution count [1.25e+00] - pxor %xmm13, %xmm13 #168.21 - unpcklpd %xmm13, %xmm10 #168.21 - subpd %xmm9, %xmm10 #168.21 - jmp ..B2.31 # Prob 100% #168.21 - # LOE rdx rcx rbx rbp rdi r9 r10 r12 r13 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 -..B2.17: # Preds ..B2.14 - # Execution count [3.12e+00] - pxor %xmm10, %xmm10 #168.21 - testl %r14d, %r14d #168.21 - je ..B2.30 # Prob 40% #168.21 - # LOE rax rdx rcx rbx rbp rdi r8 r9 r10 r11 r12 r13 r15 esi r14d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 -..B2.18: # Preds ..B2.15 ..B2.17 - # Execution count [3.12e+00] - movhpd (%r15,%r11,8), %xmm10 #168.21 - subpd %xmm9, %xmm10 #168.21 - testl %esi, %esi #168.21 - je ..B2.20 # Prob 40% #168.21 - # LOE rax rdx rcx rbx rbp rdi r8 r9 r10 r11 r12 r13 r15 esi r14d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 -..B2.19: # Preds ..B2.18 - # Execution count [1.88e+00] - movsd %xmm10, (%r15,%r13,8) #168.21 - psrldq $8, %xmm10 #168.21 - movsd %xmm10, (%r15,%r11,8) #168.21 - movsd (%r15,%r10,8), %xmm10 #169.21 - jmp ..B2.21 # Prob 100% #169.21 - # LOE rax rdx rcx rbx rbp rdi r8 r9 r10 r12 r15 esi r14d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 -..B2.20: # Preds ..B2.18 - # Execution count [1.25e+00] - psrldq $8, %xmm10 #168.21 - movsd %xmm10, (%r15,%r11,8) #168.21 - pxor %xmm10, %xmm10 #169.21 - # LOE rax rdx rcx rbx rbp rdi r8 r9 r10 r12 r15 esi r14d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 -..B2.21: # Preds ..B2.19 ..B2.20 - # Execution count [1.88e+00] - testl %r14d, %r14d #169.21 - je ..B2.72 # Prob 40% #169.21 - # LOE rax rdx rcx rbx rbp rdi r8 r9 r10 r12 r15 esi r14d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 -..B2.22: # Preds ..B2.21 - # Execution count [3.12e+00] - movhpd (%r15,%r8,8), %xmm10 #169.21 - subpd %xmm8, %xmm10 #169.21 - testl %esi, %esi #169.21 - je ..B2.24 # Prob 40% #169.21 - # LOE rax rdx rcx rbx rbp rdi r8 r9 r10 r12 r15 esi r14d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 -..B2.23: # Preds ..B2.22 - # Execution count [1.88e+00] - movsd %xmm10, (%r15,%r10,8) #169.21 - psrldq $8, %xmm10 #169.21 - movsd %xmm10, (%r15,%r8,8) #169.21 - movsd (%r15,%rcx,8), %xmm9 #170.21 - jmp ..B2.25 # Prob 100% #170.21 - # LOE rax rdx rcx rbx rbp rdi r9 r12 r15 esi r14d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm9 xmm11 xmm12 -..B2.24: # Preds ..B2.22 - # Execution count [1.25e+00] - 
psrldq $8, %xmm10 #169.21 - movsd %xmm10, (%r15,%r8,8) #169.21 - pxor %xmm9, %xmm9 #170.21 - # LOE rax rdx rcx rbx rbp rdi r9 r12 r15 esi r14d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm9 xmm11 xmm12 -..B2.25: # Preds ..B2.23 ..B2.24 - # Execution count [1.88e+00] - testl %r14d, %r14d #170.21 - je ..B2.71 # Prob 40% #170.21 - # LOE rax rdx rcx rbx rbp rdi r9 r12 r15 esi xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm9 xmm11 xmm12 -..B2.26: # Preds ..B2.25 - # Execution count [3.12e+00] - movhpd (%r15,%rax,8), %xmm9 #170.21 - subpd %xmm7, %xmm9 #170.21 - testl %esi, %esi #170.21 - je ..B2.28 # Prob 40% #170.21 - # LOE rax rdx rcx rbx rbp rdi r9 r12 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm9 xmm11 xmm12 -..B2.27: # Preds ..B2.26 - # Execution count [1.88e+00] - movsd %xmm9, (%r15,%rcx,8) #170.21 - psrldq $8, %xmm9 #170.21 - jmp ..B2.29 # Prob 100% #170.21 - # LOE rax rdx rbx rbp rdi r9 r12 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm9 xmm11 xmm12 -..B2.28: # Preds ..B2.26 - # Execution count [1.25e+00] - psrldq $8, %xmm9 #170.21 - # LOE rax rdx rbx rbp rdi r9 r12 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm9 xmm11 xmm12 -..B2.29: # Preds ..B2.27 ..B2.28 - # Execution count [3.12e+00] - movsd %xmm9, (%r15,%rax,8) #170.21 - jmp ..B2.38 # Prob 100% #170.21 - # LOE rdx rbx rbp rdi r9 r12 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm11 xmm12 -..B2.30: # Preds ..B2.17 - # Execution count [1.88e+00] - pxor %xmm10, %xmm10 #168.21 - subpd %xmm9, %xmm10 #168.21 - testl %esi, %esi #168.21 - je ..B2.32 # Prob 40% #168.21 - # LOE rdx rcx rbx rbp rdi r9 r10 r12 r13 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 -..B2.31: # Preds ..B2.16 ..B2.30 - # Execution count [1.25e+00] - movsd %xmm10, (%r15,%r13,8) #168.21 - movsd (%r15,%r10,8), %xmm10 #169.21 - pxor %xmm9, %xmm9 #169.21 - unpcklpd %xmm9, %xmm10 #169.21 - subpd %xmm8, %xmm10 #169.21 - jmp ..B2.34 # Prob 100% #169.21 - # LOE rdx rcx rbx rbp rdi r9 r10 r12 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 -..B2.32: # Preds ..B2.30 - # Execution count [0.00e+00] - pxor %xmm10, %xmm10 #169.21 - jmp ..B2.33 # Prob 100% #169.21 - # LOE rdx rcx rbx rbp rdi r9 r10 r12 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 -..B2.72: # Preds ..B2.21 - # Execution count [7.50e-01] - testl %esi, %esi #168.21 - # LOE rdx rcx rbx rbp rdi r9 r10 r12 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 -..B2.33: # Preds ..B2.32 ..B2.72 - # Execution count [2.67e+00] - pxor %xmm9, %xmm9 #169.21 - unpcklpd %xmm9, %xmm10 #169.21 - subpd %xmm8, %xmm10 #169.21 - je ..B2.35 # Prob 40% #169.21 - # LOE rdx rcx rbx rbp rdi r9 r10 r12 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 -..B2.34: # Preds ..B2.31 ..B2.33 - # Execution count [1.25e+00] - movsd %xmm10, (%r15,%r10,8) #169.21 - movsd (%r15,%rcx,8), %xmm9 #170.21 - pxor %xmm8, %xmm8 #170.21 - unpcklpd %xmm8, %xmm9 #170.21 - subpd %xmm7, %xmm9 #170.21 - jmp ..B2.37 # Prob 100% #170.21 - # LOE rdx rcx rbx rbp rdi r9 r12 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm9 xmm11 xmm12 -..B2.35: # Preds ..B2.33 - # Execution count [0.00e+00] - pxor %xmm9, %xmm9 #170.21 - jmp ..B2.36 # Prob 100% #170.21 - # LOE rdx rcx rbx rbp rdi r9 r12 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm9 xmm11 xmm12 -..B2.71: # Preds ..B2.25 - # Execution count [7.50e-01] - testl %esi, %esi #168.21 - # LOE rdx rcx rbx rbp rdi r9 r12 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm9 xmm11 xmm12 -..B2.36: # Preds ..B2.35 ..B2.71 - # Execution count [2.67e+00] - pxor %xmm8, %xmm8 #170.21 - 
unpcklpd %xmm8, %xmm9 #170.21 - subpd %xmm7, %xmm9 #170.21 - je ..B2.38 # Prob 40% #170.21 - # LOE rdx rcx rbx rbp rdi r9 r12 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm9 xmm11 xmm12 -..B2.37: # Preds ..B2.34 ..B2.36 - # Execution count [1.25e+00] - movsd %xmm9, (%r15,%rcx,8) #170.21 - # LOE rdx rbx rbp rdi r9 r12 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm11 xmm12 -..B2.38: # Preds ..B2.36 ..B2.29 ..B2.37 ..B2.13 ..B2.12 - # - # Execution count [1.25e+01] - addq $2, %rbp #143.9 - cmpq %r12, %rbp #143.9 - jb ..B2.12 # Prob 82% #143.9 - # LOE rdx rbx rbp rdi r9 r12 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm11 xmm12 -..B2.39: # Preds ..B2.38 - # Execution count [2.25e+00] - movaps %xmm0, %xmm1 #132.22 - movaps %xmm4, %xmm2 #131.22 - movaps %xmm5, %xmm3 #130.22 - unpckhpd %xmm0, %xmm1 #132.22 - unpckhpd %xmm4, %xmm2 #131.22 - addsd %xmm1, %xmm0 #132.22 - addsd %xmm2, %xmm4 #131.22 - unpckhpd %xmm5, %xmm3 #130.22 - movsd 120(%rsp), %xmm11 #[spill] - addsd %xmm3, %xmm5 #130.22 - movsd 128(%rsp), %xmm8 #[spill] - movsd 136(%rsp), %xmm9 #[spill] - movsd 144(%rsp), %xmm13 #[spill] - movsd 40(%rsp), %xmm10 #[spill] - movsd 48(%rsp), %xmm12 #[spill] - movl 24(%rsp), %r14d #[spill] - movq 32(%rsp), %rsi #[spill] - movq 72(%rsp), %rax #[spill] - movq 80(%rsp), %r11 #[spill] - movq 88(%rsp), %r10 #[spill] - movq 96(%rsp), %rcx #[spill] - movq 104(%rsp), %r8 #[spill] - movq 112(%rsp), %r13 #[spill] - movsd .L_2il0floatpacket.1(%rip), %xmm7 # - # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r12 r13 r15 r14d xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -..B2.40: # Preds ..B2.39 ..B2.58 - # Execution count [2.50e+00] - movslq %r14d, %r14 #143.9 - cmpq %r14, %r12 #143.9 - jae ..B2.49 # Prob 10% #143.9 - # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r12 r13 r14 r15 xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -..B2.41: # Preds ..B2.40 - # Execution count [2.25e+00] - imulq %rax, %r9 #125.43 - addq 56(%rsp), %r9 #107.5[spill] - movl 64(%rsp), %ebp #107.5[spill] - movq %r13, 112(%rsp) #107.5[spill] - # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r12 r14 r15 ebp xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -..B2.42: # Preds ..B2.45 ..B2.41 - # Execution count [1.25e+01] - movl (%r9,%r12,4), %edi #144.21 - movaps %xmm9, %xmm14 #145.36 - movaps %xmm8, %xmm3 #146.36 - movaps %xmm11, %xmm2 #147.36 - lea (%rdi,%rdi,2), %r13d #145.36 - movslq %r13d, %r13 #145.36 - subsd (%rdx,%r13,8), %xmm14 #145.36 - subsd 8(%rdx,%r13,8), %xmm3 #146.36 - subsd 16(%rdx,%r13,8), %xmm2 #147.36 - movaps %xmm14, %xmm15 #148.35 - movaps %xmm3, %xmm1 #148.49 - mulsd %xmm14, %xmm15 #148.35 - mulsd %xmm3, %xmm1 #148.49 - addsd %xmm1, %xmm15 #148.49 - movaps %xmm2, %xmm1 #148.63 - mulsd %xmm2, %xmm1 #148.63 - addsd %xmm1, %xmm15 #148.63 - comisd %xmm15, %xmm12 #158.22 - jbe ..B2.45 # Prob 50% #158.22 - # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r12 r13 r14 r15 ebp edi xmm0 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15 -..B2.43: # Preds ..B2.42 - # Execution count [6.25e+00] - movsd .L_2il0floatpacket.3(%rip), %xmm1 #159.38 - divsd %xmm15, %xmm1 #159.38 - movaps %xmm10, %xmm15 #160.38 - mulsd %xmm1, %xmm15 #160.38 - mulsd %xmm1, %xmm15 #160.44 - mulsd %xmm1, %xmm15 #160.50 - mulsd %xmm13, %xmm1 #161.54 - mulsd %xmm15, %xmm1 #161.61 - subsd %xmm7, %xmm15 #161.54 - mulsd %xmm15, %xmm1 #161.67 - mulsd %xmm1, %xmm14 #162.31 - mulsd %xmm1, %xmm3 #163.31 - mulsd %xmm1, %xmm2 #164.31 - addsd %xmm14, %xmm5 #162.17 - addsd %xmm3, %xmm4 #163.17 - addsd %xmm2, %xmm0 #164.17 - cmpl %ebp, %edi #167.24 - jge 
..B2.45 # Prob 50% #167.24 - # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r12 r13 r14 r15 ebp xmm0 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 -..B2.44: # Preds ..B2.43 - # Execution count [3.12e+00] - movsd (%r15,%r13,8), %xmm1 #168.21 - subsd %xmm14, %xmm1 #168.21 - movsd 8(%r15,%r13,8), %xmm14 #169.21 - subsd %xmm3, %xmm14 #169.21 - movsd 16(%r15,%r13,8), %xmm3 #170.21 - movsd %xmm1, (%r15,%r13,8) #168.21 - subsd %xmm2, %xmm3 #170.21 - movsd %xmm14, 8(%r15,%r13,8) #169.21 - movsd %xmm3, 16(%r15,%r13,8) #170.21 - # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r12 r14 r15 ebp xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -..B2.45: # Preds ..B2.44 ..B2.43 ..B2.42 - # Execution count [1.25e+01] - incq %r12 #143.9 - cmpq %r14, %r12 #143.9 - jb ..B2.42 # Prob 82% #143.9 - # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r12 r14 r15 ebp xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -..B2.46: # Preds ..B2.45 - # Execution count [2.25e+00] - movq 112(%rsp), %r13 #[spill] - jmp ..B2.49 # Prob 100% # - # LOE rax rdx rcx rbx rsi r8 r10 r11 r13 r14 r15 xmm0 xmm4 xmm5 xmm6 xmm7 xmm10 xmm12 xmm13 -..B2.48: # Preds ..B2.9 ..B2.8 - # Execution count [2.50e+00] - movslq %r14d, %r14 #179.9 - # LOE rax rdx rcx rbx rsi r8 r10 r11 r13 r14 r15 xmm0 xmm4 xmm5 xmm6 xmm7 xmm10 xmm12 xmm13 -..B2.49: # Preds ..B2.46 ..B2.40 ..B2.48 - # Execution count [5.00e+00] - movslq %r8d, %r9 #124.32 - incq %r8 #124.5 - addq %r14, %r10 #179.9 - addq %r14, %r11 #180.9 - incq %r9 #124.32 - addsd (%rsi,%r15), %xmm5 #175.9 - addsd 8(%rsi,%r15), %xmm4 #176.9 - addsd 16(%rsi,%r15), %xmm0 #177.9 - movsd %xmm5, (%rsi,%r15) #175.9 - movsd %xmm4, 8(%rsi,%r15) #176.9 - movsd %xmm0, 16(%rsi,%r15) #177.9 - addq $24, %rsi #124.5 - cmpq %r13, %r8 #124.5 - jb ..B2.8 # Prob 82% #124.5 - # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r13 r15 xmm6 xmm7 xmm10 xmm12 xmm13 -..B2.50: # Preds ..B2.49 - # Execution count [9.00e-01] - movq 8(%rsp), %r12 #[spill] - movq (%rsp), %r14 #[spill] - movq %r10, (%r12) #179.9 - movq %r11, 8(%r12) #180.9 - jmp ..B2.54 # Prob 100% #180.9 - # LOE rbx r14 -..B2.51: # Preds ..B2.1 - # Execution count [5.00e-01] - xorl %ebx, %ebx #120.22 - xorl %eax, %eax #121.16 -..___tag_value_computeForceLJHalfNeigh.139: -# getTimeStamp() - call getTimeStamp #121.16 -..___tag_value_computeForceLJHalfNeigh.140: - # LOE rbx r14 xmm0 -..B2.68: # Preds ..B2.51 - # Execution count [5.00e-01] - movsd %xmm0, 16(%rsp) #121.16[spill] - # LOE rbx r14 -..B2.52: # Preds ..B2.68 - # Execution count [5.00e-01] - movl $.L_2__STRING.2, %edi #122.5 -..___tag_value_computeForceLJHalfNeigh.142: -# likwid_markerStartRegion(const char *) - call likwid_markerStartRegion #122.5 -..___tag_value_computeForceLJHalfNeigh.143: - # LOE rbx r14 -..B2.54: # Preds ..B2.52 ..B2.50 - # Execution count [1.00e+00] - movl $.L_2__STRING.2, %edi #183.5 -..___tag_value_computeForceLJHalfNeigh.144: -# likwid_markerStopRegion(const char *) - call likwid_markerStopRegion #183.5 -..___tag_value_computeForceLJHalfNeigh.145: - # LOE rbx r14 -..B2.55: # Preds ..B2.54 - # Execution count [1.00e+00] - xorl %eax, %eax #184.16 -..___tag_value_computeForceLJHalfNeigh.146: -# getTimeStamp() - call getTimeStamp #184.16 -..___tag_value_computeForceLJHalfNeigh.147: - # LOE rbx r14 xmm0 -..B2.69: # Preds ..B2.55 - # Execution count [1.00e+00] - movaps %xmm0, %xmm1 #184.16 - # LOE rbx r14 xmm1 -..B2.56: # Preds ..B2.69 - # Execution count [1.00e+00] - pxor %xmm3, %xmm3 #185.5 - cvtsi2sdq %rbx, %xmm3 #185.5 - subsd 16(%rsp), %xmm1 #185.94[spill] - movsd 
.L_2il0floatpacket.2(%rip), %xmm2 #185.5 - movl $.L_2__STRING.3, %edi #185.5 - divsd %xmm3, %xmm2 #185.5 - mulsd %xmm1, %xmm2 #185.5 - movl %ebx, %esi #185.5 - movsd 264(%r14), %xmm0 #185.74 - movl $3, %eax #185.5 - mulsd %xmm0, %xmm2 #185.5 - movsd %xmm1, (%rsp) #185.5[spill] -..___tag_value_computeForceLJHalfNeigh.149: -# printf(const char *__restrict__, ...) - call printf #185.5 -..___tag_value_computeForceLJHalfNeigh.150: - # LOE -..B2.57: # Preds ..B2.56 - # Execution count [1.00e+00] - movsd (%rsp), %xmm1 #[spill] - movaps %xmm1, %xmm0 #186.14 - addq $216, %rsp #186.14 - .cfi_def_cfa_offset 56 - .cfi_restore 6 - popq %rbp #186.14 - .cfi_def_cfa_offset 48 - .cfi_restore 3 - popq %rbx #186.14 - .cfi_def_cfa_offset 40 - .cfi_restore 15 - popq %r15 #186.14 - .cfi_def_cfa_offset 32 - .cfi_restore 14 - popq %r14 #186.14 - .cfi_def_cfa_offset 24 - .cfi_restore 13 - popq %r13 #186.14 - .cfi_def_cfa_offset 16 - .cfi_restore 12 - popq %r12 #186.14 - .cfi_def_cfa_offset 8 - ret #186.14 - .cfi_def_cfa_offset 272 - .cfi_offset 3, -48 - .cfi_offset 6, -56 - .cfi_offset 12, -16 - .cfi_offset 13, -24 - .cfi_offset 14, -32 - .cfi_offset 15, -40 - # LOE -..B2.58: # Preds ..B2.10 - # Execution count [2.25e-01]: Infreq - xorl %r12d, %r12d #143.9 - jmp ..B2.40 # Prob 100% #143.9 - # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r12 r13 r15 r14d xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -..B2.59: # Preds ..B2.2 - # Execution count [1.11e+00]: Infreq - movl %ebx, %eax #114.5 - xorl %ecx, %ecx #114.5 - movl $1, %esi #114.5 - xorl %edx, %edx #114.5 - shrl $1, %eax #114.5 - je ..B2.63 # Prob 10% #114.5 - # LOE rax rdx rcx rdi r12 r14 r15 ebx ebp esi -..B2.60: # Preds ..B2.59 - # Execution count [1.00e+00]: Infreq - xorl %esi, %esi #114.5 - # LOE rax rdx rcx rsi rdi r12 r14 r15 ebx ebp -..B2.61: # Preds ..B2.61 ..B2.60 - # Execution count [2.78e+00]: Infreq - incq %rcx #114.5 - movq %rsi, (%rdx,%rdi) #115.9 - movq %rsi, 8(%rdx,%rdi) #115.9 - addq $16, %rdx #114.5 - cmpq %rax, %rcx #114.5 - jb ..B2.61 # Prob 64% #114.5 - # LOE rax rdx rcx rsi rdi r12 r14 r15 ebx ebp -..B2.62: # Preds ..B2.61 - # Execution count [1.00e+00]: Infreq - lea 1(%rcx,%rcx), %esi #115.9 - # LOE rdi r12 r14 r15 ebx ebp esi -..B2.63: # Preds ..B2.59 ..B2.62 - # Execution count [1.11e+00]: Infreq - lea -1(%rsi), %eax #114.5 - cmpl %ebx, %eax #114.5 - jae ..B2.70 # Prob 10% #114.5 - # LOE rdi r12 r14 r15 ebp esi -..B2.64: # Preds ..B2.63 - # Execution count [1.00e+00]: Infreq - movslq %esi, %rsi #114.5 - movslq %ebp, %r13 #114.5 - movq $0, -8(%rdi,%rsi,8) #115.9 - jmp ..B2.5 # Prob 100% #115.9 - # LOE r12 r13 r14 r15 ebp -..B2.70: # Preds ..B2.63 - # Execution count [1.11e-01]: Infreq - movslq %ebp, %r13 #114.5 - jmp ..B2.5 # Prob 100% #114.5 - .align 16,0x90 - # LOE r12 r13 r14 r15 ebp - .cfi_endproc -# mark_end; - .type computeForceLJHalfNeigh,@function - .size computeForceLJHalfNeigh,.-computeForceLJHalfNeigh -..LNcomputeForceLJHalfNeigh.1: - .data -# -- End computeForceLJHalfNeigh - .text -.L_2__routine_start_computeForceLJFullNeigh_simd_2: -# -- Begin computeForceLJFullNeigh_simd - .text -# mark_begin; - .align 16,0x90 - .globl computeForceLJFullNeigh_simd -# --- computeForceLJFullNeigh_simd(Parameter *, Atom *, Neighbor *, Stats *) -computeForceLJFullNeigh_simd: -# parameter 1: %rdi -# parameter 2: %rsi -# parameter 3: %rdx -# parameter 4: %rcx -..B3.1: # Preds ..B3.0 - # Execution count [1.00e+00] - .cfi_startproc -..___tag_value_computeForceLJFullNeigh_simd.174: -..L175: - #189.101 - pushq %rsi #189.101 - 
.cfi_def_cfa_offset 16 - movl 4(%rsi), %edx #190.18 - testl %edx, %edx #196.24 - jle ..B3.4 # Prob 50% #196.24 - # LOE rbx rbp rsi r12 r13 r14 r15 edx -..B3.2: # Preds ..B3.1 - # Execution count [5.00e-03] - movq 64(%rsi), %rdi #197.9 - lea (%rdx,%rdx,2), %eax #190.18 - cmpl $12, %eax #196.5 - jle ..B3.8 # Prob 0% #196.5 - # LOE rbx rbp rdi r12 r13 r14 r15 eax edx -..B3.3: # Preds ..B3.2 - # Execution count [1.00e+00] - movslq %edx, %rdx #196.5 - xorl %esi, %esi #196.5 - lea (%rdx,%rdx,2), %rdx #196.5 - shlq $3, %rdx #196.5 - call _intel_fast_memset #196.5 - # LOE rbx rbp r12 r13 r14 r15 -..B3.4: # Preds ..B3.1 ..B3.12 ..B3.3 ..B3.13 - # Execution count [1.00e+00] - xorl %eax, %eax #203.16 -..___tag_value_computeForceLJFullNeigh_simd.177: -# getTimeStamp() - call getTimeStamp #203.16 -..___tag_value_computeForceLJFullNeigh_simd.178: - # LOE rbx rbp r12 r13 r14 r15 -..B3.5: # Preds ..B3.4 - # Execution count [1.00e+00] - movl $.L_2__STRING.0, %edi #204.5 -..___tag_value_computeForceLJFullNeigh_simd.179: -# likwid_markerStartRegion(const char *) - call likwid_markerStartRegion #204.5 -..___tag_value_computeForceLJFullNeigh_simd.180: - # LOE -..B3.6: # Preds ..B3.5 - # Execution count [1.00e+00] - movl $il0_peep_printf_format_0, %edi #207.5 - movq stderr(%rip), %rsi #207.5 - call fputs #207.5 - # LOE -..B3.7: # Preds ..B3.6 - # Execution count [1.00e+00] - movl $-1, %edi #208.5 -# exit(int) - call exit #208.5 - # LOE -..B3.8: # Preds ..B3.2 - # Execution count [1.11e+00]: Infreq - movl %eax, %edx #196.5 - xorl %r8d, %r8d #196.5 - movl $1, %r9d #196.5 - xorl %esi, %esi #196.5 - xorl %ecx, %ecx #196.5 - shrl $1, %edx #196.5 - je ..B3.12 # Prob 10% #196.5 - # LOE rdx rcx rbx rbp rsi rdi r8 r12 r13 r14 r15 eax r9d -..B3.10: # Preds ..B3.8 ..B3.10 - # Execution count [2.78e+00]: Infreq - incq %r8 #196.5 - movq %rsi, (%rcx,%rdi) #197.9 - movq %rsi, 8(%rcx,%rdi) #197.9 - addq $16, %rcx #196.5 - cmpq %rdx, %r8 #196.5 - jb ..B3.10 # Prob 64% #196.5 - # LOE rdx rcx rbx rbp rsi rdi r8 r12 r13 r14 r15 eax -..B3.11: # Preds ..B3.10 - # Execution count [1.00e+00]: Infreq - lea 1(%r8,%r8), %r9d #197.9 - # LOE rbx rbp rdi r12 r13 r14 r15 eax r9d -..B3.12: # Preds ..B3.11 ..B3.8 - # Execution count [1.11e+00]: Infreq - lea -1(%r9), %edx #196.5 - cmpl %eax, %edx #196.5 - jae ..B3.4 # Prob 10% #196.5 - # LOE rbx rbp rdi r12 r13 r14 r15 r9d -..B3.13: # Preds ..B3.12 - # Execution count [1.00e+00]: Infreq - movslq %r9d, %r9 #196.5 - movq $0, -8(%rdi,%r9,8) #197.9 - jmp ..B3.4 # Prob 100% #197.9 - .align 16,0x90 - # LOE rbx rbp r12 r13 r14 r15 - .cfi_endproc -# mark_end; - .type computeForceLJFullNeigh_simd,@function - .size computeForceLJFullNeigh_simd,.-computeForceLJFullNeigh_simd -..LNcomputeForceLJFullNeigh_simd.2: - .section .rodata.str1.32, "aMS",@progbits,1 - .align 32 - .align 32 -il0_peep_printf_format_0: - .long 1869771333 - .long 1394621042 - .long 541347145 - .long 1852990827 - .long 1847618661 - .long 1763734639 - .long 1701605485 - .long 1953391981 - .long 1713398885 - .long 1931506287 - .long 1768121712 - .long 1684367718 - .long 1936615712 - .long 1668641396 - .long 1852795252 - .long 1952805664 - .word 33 - .data -# -- End computeForceLJFullNeigh_simd - .section .rodata, "a" - .align 16 - .align 16 -.L_2il0floatpacket.4: - .long 0x00000001,0x00000001,0x00000001,0x00000001 - .type .L_2il0floatpacket.4,@object - .size .L_2il0floatpacket.4,16 - .align 16 -.L_2il0floatpacket.5: - .long 0x00000002,0x00000002,0x00000002,0x00000002 - .type .L_2il0floatpacket.5,@object - .size .L_2il0floatpacket.5,16 - 
.align 16 -.L_2il0floatpacket.6: - .long 0x00000000,0x3ff00000,0x00000000,0x3ff00000 - .type .L_2il0floatpacket.6,@object - .size .L_2il0floatpacket.6,16 - .align 16 -.L_2il0floatpacket.7: - .long 0x00000000,0x3fe00000,0x00000000,0x3fe00000 - .type .L_2il0floatpacket.7,@object - .size .L_2il0floatpacket.7,16 - .align 8 -.L_2il0floatpacket.0: - .long 0x00000000,0x40480000 - .type .L_2il0floatpacket.0,@object - .size .L_2il0floatpacket.0,8 - .align 8 -.L_2il0floatpacket.1: - .long 0x00000000,0x3fe00000 - .type .L_2il0floatpacket.1,@object - .size .L_2il0floatpacket.1,8 - .align 8 -.L_2il0floatpacket.2: - .long 0x00000000,0x41cdcd65 - .type .L_2il0floatpacket.2,@object - .size .L_2il0floatpacket.2,8 - .align 8 -.L_2il0floatpacket.3: - .long 0x00000000,0x3ff00000 - .type .L_2il0floatpacket.3,@object - .size .L_2il0floatpacket.3,8 - .section .rodata.str1.4, "aMS",@progbits,1 - .align 4 - .align 4 -.L_2__STRING.0: - .long 1668444006 - .word 101 - .type .L_2__STRING.0,@object - .size .L_2__STRING.0,6 - .space 2, 0x00 # pad - .align 4 -.L_2__STRING.1: - .long 980644937 - .long 544548128 - .long 1701987872 - .long 622869105 - .long 1411391590 - .long 979725673 - .long 174466336 - .long 1764718915 - .long 622869108 - .long 1881677926 - .long 1852399980 - .long 170484575 - .byte 0 - .type .L_2__STRING.1,@object - .size .L_2__STRING.1,49 - .space 3, 0x00 # pad - .align 4 -.L_2__STRING.2: - .long 1668444006 - .long 759843941 - .long 1718378856 - .long 1734960494 - .word 104 - .type .L_2__STRING.2,@object - .size .L_2__STRING.2,18 - .space 2, 0x00 # pad - .align 4 -.L_2__STRING.3: - .long 980644937 - .long 544548128 - .long 1701987872 - .long 622869105 - .long 1411391590 - .long 979725673 - .long 174466336 - .long 1764718915 - .long 622869108 - .long 1747460198 - .long 761687137 - .long 1734960494 - .long 665960 - .type .L_2__STRING.3,@object - .size .L_2__STRING.3,52 - .data - .section .note.GNU-stack, "" -# End diff --git a/static_analysis/jan/icx-icc-lammps-sse.s b/static_analysis/jan/icx-icc-lammps-sse.s deleted file mode 100644 index 22783ab..0000000 --- a/static_analysis/jan/icx-icc-lammps-sse.s +++ /dev/null @@ -1,1522 +0,0 @@ -# mark_description "Intel(R) C Intel(R) 64 Compiler Classic for applications running on Intel(R) 64, Version 2021.6.0 Build 2022"; -# mark_description "0226_000000"; -# mark_description "-I/apps/likwid/5.2.2/include -I././lammps/includes -I././common/includes -S -std=c11 -pedantic-errors -D_GNU"; -# mark_description "_SOURCE -DLIKWID_PERFMON -DAOS -DPRECISION=2 -DCOMPUTE_STATS -DVECTOR_WIDTH=2 -DENABLE_OMP_SIMD -DALIGNMENT="; -# mark_description "64 -restrict -Ofast -xSSE4.2 -o build-lammps-ICC-SSE-DP/force_lj.s"; - .file "force_lj.c" - .text -..TXTST0: -.L_2__routine_start_computeForceLJFullNeigh_plain_c_0: -# -- Begin computeForceLJFullNeigh_plain_c - .text -# mark_begin; - .align 16,0x90 - .globl computeForceLJFullNeigh_plain_c -# --- computeForceLJFullNeigh_plain_c(Parameter *, Atom *, Neighbor *, Stats *) -computeForceLJFullNeigh_plain_c: -# parameter 1: %rdi -# parameter 2: %rsi -# parameter 3: %rdx -# parameter 4: %rcx -..B1.1: # Preds ..B1.0 - # Execution count [1.00e+00] - .cfi_startproc -..___tag_value_computeForceLJFullNeigh_plain_c.1: -..L2: - #23.104 - pushq %r14 #23.104 - .cfi_def_cfa_offset 16 - .cfi_offset 14, -16 - pushq %r15 #23.104 - .cfi_def_cfa_offset 24 - .cfi_offset 15, -24 - pushq %rbx #23.104 - .cfi_def_cfa_offset 32 - .cfi_offset 3, -32 - pushq %rbp #23.104 - .cfi_def_cfa_offset 40 - .cfi_offset 6, -40 - subq $136, %rsp #23.104 - 
.cfi_def_cfa_offset 176 - movq %rsi, %r14 #23.104 - movsd 144(%rdi), %xmm0 #27.27 - movq %rcx, %rbp #23.104 - mulsd %xmm0, %xmm0 #27.45 - movq %rdx, %r15 #23.104 - movsd 56(%rdi), %xmm1 #28.23 - movsd 40(%rdi), %xmm2 #29.24 - movl 4(%r14), %eax #24.18 - movsd %xmm0, 64(%rsp) #27.45[spill] - movsd %xmm1, 40(%rsp) #28.23[spill] - movsd %xmm2, 32(%rsp) #29.24[spill] - testl %eax, %eax #32.24 - jle ..B1.23 # Prob 50% #32.24 - # LOE rbp r12 r13 r14 r15 eax -..B1.2: # Preds ..B1.1 - # Execution count [5.00e-03] - movslq %eax, %rbx #24.18 - lea (%rax,%rax,2), %eax #24.18 - movq 64(%r14), %rdi #33.9 - cmpl $12, %eax #32.5 - jle ..B1.29 # Prob 0% #32.5 - # LOE rbx rbp rdi r12 r13 r14 r15 -..B1.3: # Preds ..B1.2 - # Execution count [1.00e+00] - xorl %esi, %esi #32.5 - lea (%rbx,%rbx,2), %rdx #32.5 - shlq $3, %rdx #32.5 - call _intel_fast_memset #32.5 - # LOE rbx rbp r12 r13 r14 r15 -..B1.5: # Preds ..B1.43 ..B1.3 ..B1.41 - # Execution count [1.00e+00] - xorl %eax, %eax #38.16 -..___tag_value_computeForceLJFullNeigh_plain_c.15: -# getTimeStamp() - call getTimeStamp #38.16 -..___tag_value_computeForceLJFullNeigh_plain_c.16: - # LOE rbx rbp r12 r13 r14 r15 xmm0 -..B1.50: # Preds ..B1.5 - # Execution count [1.00e+00] - movsd %xmm0, 24(%rsp) #38.16[spill] - # LOE rbx rbp r12 r13 r14 r15 -..B1.6: # Preds ..B1.50 - # Execution count [5.00e-01] - movl $.L_2__STRING.0, %edi #39.5 -..___tag_value_computeForceLJFullNeigh_plain_c.18: -# likwid_markerStartRegion(const char *) - call likwid_markerStartRegion #39.5 -..___tag_value_computeForceLJFullNeigh_plain_c.19: - # LOE rbx rbp r12 r13 r14 r15 -..B1.7: # Preds ..B1.6 - # Execution count [9.00e-01] - movsd .L_2il0floatpacket.3(%rip), %xmm13 #77.41 - xorl %eax, %eax #41.15 - mulsd 32(%rsp), %xmm13 #77.41[spill] - xorl %ecx, %ecx #41.5 - movddup 64(%rsp), %xmm3 #27.25[spill] - xorl %edi, %edi #41.5 - movddup %xmm13, %xmm1 #77.41 - movq 16(%r15), %rdx #42.19 - movslq 8(%r15), %rsi #42.43 - movq 24(%r15), %r15 #43.25 - movups .L_2il0floatpacket.2(%rip), %xmm2 #75.32 - movddup 40(%rsp), %xmm5 #28.21[spill] - movsd 40(%rsp), %xmm10 #41.5[spill] - movsd 64(%rsp), %xmm12 #41.5[spill] - movsd .L_2il0floatpacket.5(%rip), %xmm7 #77.54 - shlq $2, %rsi #25.5 - movq 16(%r14), %r11 #44.25 - movq 64(%r14), %r8 #89.9 - movq (%rbp), %r9 #93.9 - movq 8(%rbp), %r10 #94.9 - movups %xmm1, 112(%rsp) #41.5[spill] - movups %xmm3, 48(%rsp) #41.5[spill] - movq %rbx, 104(%rsp) #41.5[spill] - movq %rbp, (%rsp) #41.5[spill] - movq %r12, 8(%rsp) #41.5[spill] - movq %r13, 16(%rsp) #41.5[spill] - .cfi_offset 12, -168 - .cfi_offset 13, -160 - # LOE rax rdx rcx rsi rdi r8 r9 r10 r11 r15 xmm5 xmm7 xmm10 xmm12 xmm13 -..B1.8: # Preds ..B1.21 ..B1.7 - # Execution count [5.00e+00] - movl (%r15,%rcx,4), %ebx #43.25 - xorps %xmm6, %xmm6 #47.22 - movaps %xmm6, %xmm4 #48.22 - movsd (%rdi,%r11), %xmm9 #44.25 - movaps %xmm4, %xmm0 #49.22 - movsd 8(%rdi,%r11), %xmm8 #45.25 - movsd 16(%rdi,%r11), %xmm11 #46.25 - movslq %ebx, %r13 #56.9 - testl %ebx, %ebx #56.28 - jle ..B1.21 # Prob 50% #56.28 - # LOE rax rdx rcx rsi rdi r8 r9 r10 r11 r13 r15 ebx xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -..B1.9: # Preds ..B1.8 - # Execution count [4.50e+00] - cmpq $2, %r13 #56.9 - jl ..B1.28 # Prob 10% #56.9 - # LOE rax rdx rcx rsi rdi r8 r9 r10 r11 r13 r15 ebx xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -..B1.10: # Preds ..B1.9 - # Execution count [4.50e+00] - movq %rsi, %r14 #42.43 - movl %ebx, %ebp #56.9 - imulq %rax, %r14 #42.43 - xorps %xmm6, %xmm6 #47.22 - andl $-2, %ebp #56.9 - movaps 
%xmm6, %xmm4 #48.22 - movsd %xmm8, 80(%rsp) #71.22[spill] - movaps %xmm4, %xmm0 #49.22 - movsd %xmm9, 88(%rsp) #71.22[spill] - xorl %r12d, %r12d #56.9 - movslq %ebp, %rbp #56.9 - addq %rdx, %r14 #25.5 - movddup %xmm9, %xmm1 #44.23 - movddup %xmm8, %xmm2 #45.23 - movddup %xmm11, %xmm3 #46.23 - movsd %xmm11, 72(%rsp) #71.22[spill] - movsd %xmm13, 96(%rsp) #71.22[spill] - movq %rcx, 32(%rsp) #71.22[spill] - movups 48(%rsp), %xmm8 #71.22[spill] - movdqu .L_2il0floatpacket.1(%rip), %xmm9 #71.22 - movdqu .L_2il0floatpacket.0(%rip), %xmm10 #71.22 - # LOE rax rdx rbp rsi rdi r8 r9 r10 r11 r12 r13 r14 r15 ebx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm9 xmm10 -# LLVM-MCA-BEGIN -# OSACA-BEGIN -..B1.11: # Preds ..B1.13 ..B1.10 - # Execution count [2.50e+01] - movq (%r14,%r12,4), %xmm15 #57.21 - movdqa %xmm10, %xmm13 #59.36 - movdqa %xmm15, %xmm7 #58.36 - paddd %xmm15, %xmm7 #58.36 - paddd %xmm7, %xmm15 #58.36 - movaps %xmm1, %xmm7 #58.36 - movd %xmm15, %ecx #58.36 - paddd %xmm15, %xmm13 #59.36 - pshufd $57, %xmm15, %xmm11 #58.36 - paddd %xmm9, %xmm15 #60.36 - pshufd $57, %xmm13, %xmm12 #59.36 - movslq %ecx, %rcx #58.36 - movsd (%r11,%rcx,8), %xmm14 #58.36 - movd %xmm11, %ecx #58.36 - movaps %xmm2, %xmm11 #59.36 - movslq %ecx, %rcx #58.36 - movhpd (%r11,%rcx,8), %xmm14 #58.36 - movd %xmm13, %ecx #59.36 - subpd %xmm14, %xmm7 #58.36 - movslq %ecx, %rcx #59.36 - movsd (%r11,%rcx,8), %xmm14 #59.36 - movd %xmm12, %ecx #59.36 - movslq %ecx, %rcx #59.36 - movhpd (%r11,%rcx,8), %xmm14 #59.36 - movd %xmm15, %ecx #60.36 - pshufd $57, %xmm15, %xmm15 #60.36 - subpd %xmm14, %xmm11 #59.36 - movslq %ecx, %rcx #60.36 - movaps %xmm7, %xmm14 #61.35 - movaps %xmm11, %xmm12 #61.49 - mulpd %xmm7, %xmm14 #61.35 - mulpd %xmm11, %xmm12 #61.49 - movsd (%r11,%rcx,8), %xmm13 #60.36 - movd %xmm15, %ecx #60.36 - movaps %xmm3, %xmm15 #60.36 - addpd %xmm12, %xmm14 #61.49 - movslq %ecx, %rcx #60.36 - pcmpeqd %xmm12, %xmm12 #71.22 - movhpd (%r11,%rcx,8), %xmm13 #60.36 - subpd %xmm13, %xmm15 #60.36 - movaps %xmm15, %xmm13 #61.63 - mulpd %xmm15, %xmm13 #61.63 - addpd %xmm13, %xmm14 #61.63 - movaps %xmm14, %xmm13 #71.22 - cmpltpd %xmm8, %xmm13 #71.22 - ptest %xmm12, %xmm13 #71.22 - je ..B1.13 # Prob 50% #71.22 - # LOE rax rdx rbp rsi rdi r8 r9 r10 r11 r12 r13 r14 r15 ebx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 xmm15 -..B1.12: # Preds ..B1.11 - # Execution count [1.25e+01] - movups .L_2il0floatpacket.2(%rip), %xmm12 #75.38 - divpd %xmm14, %xmm12 #75.38 - movaps %xmm5, %xmm14 #76.38 - mulpd %xmm12, %xmm14 #76.38 - mulpd %xmm12, %xmm14 #76.44 - mulpd %xmm12, %xmm14 #76.50 - mulpd 112(%rsp), %xmm12 #77.54[spill] - mulpd %xmm14, %xmm12 #77.61 - subpd .L_2il0floatpacket.4(%rip), %xmm14 #77.54 - mulpd %xmm14, %xmm12 #77.67 - mulpd %xmm12, %xmm7 #78.31 - mulpd %xmm12, %xmm11 #79.31 - mulpd %xmm12, %xmm15 #80.31 - andps %xmm13, %xmm7 #78.31 - andps %xmm13, %xmm11 #79.31 - andps %xmm15, %xmm13 #80.31 - addpd %xmm7, %xmm6 #78.17 - addpd %xmm11, %xmm4 #79.17 - addpd %xmm13, %xmm0 #80.17 - # LOE rax rdx rbp rsi rdi r8 r9 r10 r11 r12 r13 r14 r15 ebx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm9 xmm10 -..B1.13: # Preds ..B1.12 ..B1.11 - # Execution count [2.50e+01] - addq $2, %r12 #56.9 - cmpq %rbp, %r12 #56.9 - jb ..B1.11 # Prob 82% #56.9 -# OSACA-END -# LLVM-MCA-END - # LOE rax rdx rbp rsi rdi r8 r9 r10 r11 r12 r13 r14 r15 ebx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm9 xmm10 -..B1.14: # Preds ..B1.13 - # Execution count [4.50e+00] - movaps %xmm0, %xmm1 #49.22 - movaps %xmm4, %xmm2 #48.22 - movaps %xmm6, %xmm3 
#47.22 - unpckhpd %xmm0, %xmm1 #49.22 - unpckhpd %xmm4, %xmm2 #48.22 - addsd %xmm1, %xmm0 #49.22 - addsd %xmm2, %xmm4 #48.22 - unpckhpd %xmm6, %xmm3 #47.22 - movsd 72(%rsp), %xmm11 #[spill] - addsd %xmm3, %xmm6 #47.22 - movsd 80(%rsp), %xmm8 #[spill] - movsd 88(%rsp), %xmm9 #[spill] - movsd 96(%rsp), %xmm13 #[spill] - movsd 40(%rsp), %xmm10 #[spill] - movsd 64(%rsp), %xmm12 #[spill] - movq 32(%rsp), %rcx #[spill] - movsd .L_2il0floatpacket.5(%rip), %xmm7 # - # LOE rax rdx rcx rbp rsi rdi r8 r9 r10 r11 r13 r15 ebx xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -..B1.15: # Preds ..B1.14 ..B1.28 - # Execution count [5.00e+00] - cmpq %r13, %rbp #56.9 - jae ..B1.21 # Prob 10% #56.9 - # LOE rax rdx rcx rbp rsi rdi r8 r9 r10 r11 r13 r15 ebx xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -..B1.16: # Preds ..B1.15 - # Execution count [4.50e+00] - imulq %rsi, %rax #42.43 - addq %rdx, %rax #25.5 - # LOE rax rdx rcx rbp rsi rdi r8 r9 r10 r11 r13 r15 ebx xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -..B1.17: # Preds ..B1.19 ..B1.16 - # Execution count [2.50e+01] - movl (%rax,%rbp,4), %r12d #57.21 - movaps %xmm9, %xmm14 #58.36 - movaps %xmm8, %xmm3 #59.36 - movaps %xmm11, %xmm2 #60.36 - lea (%r12,%r12,2), %r14d #58.36 - movslq %r14d, %r14 #58.36 - subsd (%r11,%r14,8), %xmm14 #58.36 - subsd 8(%r11,%r14,8), %xmm3 #59.36 - subsd 16(%r11,%r14,8), %xmm2 #60.36 - movaps %xmm14, %xmm15 #61.35 - movaps %xmm3, %xmm1 #61.49 - mulsd %xmm14, %xmm15 #61.35 - mulsd %xmm3, %xmm1 #61.49 - addsd %xmm1, %xmm15 #61.49 - movaps %xmm2, %xmm1 #61.63 - mulsd %xmm2, %xmm1 #61.63 - addsd %xmm1, %xmm15 #61.63 - comisd %xmm15, %xmm12 #71.22 - jbe ..B1.19 # Prob 50% #71.22 - # LOE rax rdx rcx rbp rsi rdi r8 r9 r10 r11 r13 r15 ebx xmm0 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15 -..B1.18: # Preds ..B1.17 - # Execution count [1.25e+01] - movsd .L_2il0floatpacket.6(%rip), %xmm1 #75.38 - divsd %xmm15, %xmm1 #75.38 - movaps %xmm10, %xmm15 #76.38 - mulsd %xmm1, %xmm15 #76.38 - mulsd %xmm1, %xmm15 #76.44 - mulsd %xmm1, %xmm15 #76.50 - mulsd %xmm13, %xmm1 #77.54 - mulsd %xmm15, %xmm1 #77.61 - subsd %xmm7, %xmm15 #77.54 - mulsd %xmm15, %xmm1 #77.67 - mulsd %xmm1, %xmm14 #78.31 - mulsd %xmm1, %xmm3 #79.31 - mulsd %xmm1, %xmm2 #80.31 - addsd %xmm14, %xmm6 #78.17 - addsd %xmm3, %xmm4 #79.17 - addsd %xmm2, %xmm0 #80.17 - # LOE rax rdx rcx rbp rsi rdi r8 r9 r10 r11 r13 r15 ebx xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -..B1.19: # Preds ..B1.18 ..B1.17 - # Execution count [2.50e+01] - incq %rbp #56.9 - cmpq %r13, %rbp #56.9 - jb ..B1.17 # Prob 82% #56.9 - # LOE rax rdx rcx rbp rsi rdi r8 r9 r10 r11 r13 r15 ebx xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -..B1.21: # Preds ..B1.19 ..B1.8 ..B1.15 - # Execution count [5.00e+00] - addq %r13, %r9 #93.9 - lea 1(%rbx), %eax #94.9 - shrl $31, %eax #94.9 - addsd (%rdi,%r8), %xmm6 #89.9 - addsd 8(%rdi,%r8), %xmm4 #90.9 - addsd 16(%rdi,%r8), %xmm0 #91.9 - movsd %xmm6, (%rdi,%r8) #89.9 - lea 1(%rbx,%rax), %ebx #94.9 - sarl $1, %ebx #94.9 - movslq %ebx, %rbx #94.9 - movslq %ecx, %rax #41.32 - incq %rcx #41.5 - movsd %xmm4, 8(%rdi,%r8) #90.9 - addq %rbx, %r10 #94.9 - movsd %xmm0, 16(%rdi,%r8) #91.9 - addq $24, %rdi #41.5 - incq %rax #41.32 - cmpq 104(%rsp), %rcx #41.5[spill] - jb ..B1.8 # Prob 82% #41.5 - # LOE rax rdx rcx rsi rdi r8 r9 r10 r11 r15 xmm5 xmm7 xmm10 xmm12 xmm13 -..B1.22: # Preds ..B1.21 - # Execution count [9.00e-01] - movq (%rsp), %rbp #[spill] - movq 8(%rsp), %r12 #[spill] - .cfi_restore 12 - 
movq 16(%rsp), %r13 #[spill] - .cfi_restore 13 - movq %r9, (%rbp) #93.9 - movq %r10, 8(%rbp) #94.9 - jmp ..B1.25 # Prob 100% #94.9 - # LOE r12 r13 -..B1.23: # Preds ..B1.1 - # Execution count [5.00e-01] - xorl %eax, %eax #38.16 -..___tag_value_computeForceLJFullNeigh_plain_c.53: -# getTimeStamp() - call getTimeStamp #38.16 -..___tag_value_computeForceLJFullNeigh_plain_c.54: - # LOE r12 r13 xmm0 -..B1.51: # Preds ..B1.23 - # Execution count [5.00e-01] - movsd %xmm0, 24(%rsp) #38.16[spill] - # LOE r12 r13 -..B1.24: # Preds ..B1.51 - # Execution count [5.00e-01] - movl $.L_2__STRING.0, %edi #39.5 -..___tag_value_computeForceLJFullNeigh_plain_c.56: -# likwid_markerStartRegion(const char *) - call likwid_markerStartRegion #39.5 -..___tag_value_computeForceLJFullNeigh_plain_c.57: - # LOE r12 r13 -..B1.25: # Preds ..B1.22 ..B1.24 - # Execution count [1.00e+00] - movl $.L_2__STRING.0, %edi #97.5 -..___tag_value_computeForceLJFullNeigh_plain_c.58: -# likwid_markerStopRegion(const char *) - call likwid_markerStopRegion #97.5 -..___tag_value_computeForceLJFullNeigh_plain_c.59: - # LOE r12 r13 -..B1.26: # Preds ..B1.25 - # Execution count [1.00e+00] - xorl %eax, %eax #98.16 -..___tag_value_computeForceLJFullNeigh_plain_c.60: -# getTimeStamp() - call getTimeStamp #98.16 -..___tag_value_computeForceLJFullNeigh_plain_c.61: - # LOE r12 r13 xmm0 -..B1.27: # Preds ..B1.26 - # Execution count [1.00e+00] - subsd 24(%rsp), %xmm0 #102.14[spill] - addq $136, %rsp #102.14 - .cfi_def_cfa_offset 40 - .cfi_restore 6 - popq %rbp #102.14 - .cfi_def_cfa_offset 32 - .cfi_restore 3 - popq %rbx #102.14 - .cfi_def_cfa_offset 24 - .cfi_restore 15 - popq %r15 #102.14 - .cfi_def_cfa_offset 16 - .cfi_restore 14 - popq %r14 #102.14 - .cfi_def_cfa_offset 8 - ret #102.14 - .cfi_def_cfa_offset 176 - .cfi_offset 3, -32 - .cfi_offset 6, -40 - .cfi_offset 12, -168 - .cfi_offset 13, -160 - .cfi_offset 14, -16 - .cfi_offset 15, -24 - # LOE -..B1.28: # Preds ..B1.9 - # Execution count [4.50e-01]: Infreq - xorl %ebp, %ebp #56.9 - jmp ..B1.15 # Prob 100% #56.9 - .cfi_restore 12 - .cfi_restore 13 - # LOE rax rdx rcx rbp rsi rdi r8 r9 r10 r11 r13 r15 ebx xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -..B1.29: # Preds ..B1.2 - # Execution count [1.00e+00]: Infreq - lea (%rbx,%rbx,2), %rdx #24.18 - cmpq $4, %rdx #32.5 - jl ..B1.45 # Prob 10% #32.5 - # LOE rdx rbx rbp rdi r12 r13 r14 r15 -..B1.30: # Preds ..B1.29 - # Execution count [1.00e+00]: Infreq - movq %rdi, %rcx #32.5 - andq $15, %rcx #32.5 - testl %ecx, %ecx #32.5 - je ..B1.33 # Prob 50% #32.5 - # LOE rdx rbx rbp rdi r12 r13 r14 r15 ecx -..B1.31: # Preds ..B1.30 - # Execution count [1.00e+00]: Infreq - testb $7, %cl #32.5 - jne ..B1.45 # Prob 10% #32.5 - # LOE rdx rbx rbp rdi r12 r13 r14 r15 -..B1.32: # Preds ..B1.31 - # Execution count [5.00e-01]: Infreq - movl $1, %ecx #32.5 - # LOE rdx rbx rbp rdi r12 r13 r14 r15 ecx -..B1.33: # Preds ..B1.32 ..B1.30 - # Execution count [1.00e+00]: Infreq - movl %ecx, %eax #32.5 - lea 4(%rax), %rsi #32.5 - cmpq %rsi, %rdx #32.5 - jl ..B1.45 # Prob 10% #32.5 - # LOE rax rdx rbx rbp rdi r12 r13 r14 r15 ecx -..B1.34: # Preds ..B1.33 - # Execution count [1.11e+00]: Infreq - movl %edx, %r9d #32.5 - movl %r9d, %esi #32.5 - subl %ecx, %esi #32.5 - andl $3, %esi #32.5 - subl %esi, %r9d #32.5 - xorl %esi, %esi #32.5 - xorl %r8d, %r8d #33.22 - testl %ecx, %ecx #32.5 - movslq %r9d, %rcx #32.5 - jbe ..B1.38 # Prob 10% #32.5 - # LOE rax rdx rcx rbx rbp rsi rdi r8 r12 r13 r14 r15 -..B1.36: # Preds ..B1.34 ..B1.36 - # Execution count [5.56e+00]: 
Infreq - movq %r8, (%rdi,%rsi,8) #33.9 - incq %rsi #32.5 - cmpq %rax, %rsi #32.5 - jb ..B1.36 # Prob 82% #32.5 - # LOE rax rdx rcx rbx rbp rsi rdi r8 r12 r13 r14 r15 -..B1.38: # Preds ..B1.36 ..B1.34 - # Execution count [1.00e+00]: Infreq - xorps %xmm0, %xmm0 #33.22 - # LOE rax rdx rcx rbx rbp rdi r12 r13 r14 r15 xmm0 -..B1.39: # Preds ..B1.39 ..B1.38 - # Execution count [5.56e+00]: Infreq - movups %xmm0, (%rdi,%rax,8) #33.9 - movups %xmm0, 16(%rdi,%rax,8) #33.9 - addq $4, %rax #32.5 - cmpq %rcx, %rax #32.5 - jb ..B1.39 # Prob 82% #32.5 - # LOE rax rdx rcx rbx rbp rdi r12 r13 r14 r15 xmm0 -..B1.41: # Preds ..B1.39 ..B1.45 - # Execution count [1.11e+00]: Infreq - cmpq %rdx, %rcx #32.5 - jae ..B1.5 # Prob 10% #32.5 - # LOE rdx rcx rbx rbp rdi r12 r13 r14 r15 -..B1.43: # Preds ..B1.41 ..B1.43 - # Execution count [5.56e+00]: Infreq - movq $0, (%rdi,%rcx,8) #33.9 - incq %rcx #32.5 - cmpq %rdx, %rcx #32.5 - jb ..B1.43 # Prob 82% #32.5 - jmp ..B1.5 # Prob 100% #32.5 - # LOE rdx rcx rbx rbp rdi r12 r13 r14 r15 -..B1.45: # Preds ..B1.29 ..B1.31 ..B1.33 - # Execution count [1.00e-01]: Infreq - xorl %ecx, %ecx #32.5 - jmp ..B1.41 # Prob 100% #32.5 - .align 16,0x90 - # LOE rdx rcx rbx rbp rdi r12 r13 r14 r15 - .cfi_endproc -# mark_end; - .type computeForceLJFullNeigh_plain_c,@function - .size computeForceLJFullNeigh_plain_c,.-computeForceLJFullNeigh_plain_c -..LNcomputeForceLJFullNeigh_plain_c.0: - .data -# -- End computeForceLJFullNeigh_plain_c - .text -.L_2__routine_start_computeForceLJHalfNeigh_1: -# -- Begin computeForceLJHalfNeigh - .text -# mark_begin; - .align 16,0x90 - .globl computeForceLJHalfNeigh -# --- computeForceLJHalfNeigh(Parameter *, Atom *, Neighbor *, Stats *) -computeForceLJHalfNeigh: -# parameter 1: %rdi -# parameter 2: %rsi -# parameter 3: %rdx -# parameter 4: %rcx -..B2.1: # Preds ..B2.0 - # Execution count [1.00e+00] - .cfi_startproc -..___tag_value_computeForceLJHalfNeigh.82: -..L83: - #105.96 - pushq %r12 #105.96 - .cfi_def_cfa_offset 16 - .cfi_offset 12, -16 - pushq %r13 #105.96 - .cfi_def_cfa_offset 24 - .cfi_offset 13, -24 - pushq %r14 #105.96 - .cfi_def_cfa_offset 32 - .cfi_offset 14, -32 - pushq %r15 #105.96 - .cfi_def_cfa_offset 40 - .cfi_offset 15, -40 - pushq %rbx #105.96 - .cfi_def_cfa_offset 48 - .cfi_offset 3, -48 - pushq %rbp #105.96 - .cfi_def_cfa_offset 56 - .cfi_offset 6, -56 - subq $216, %rsp #105.96 - .cfi_def_cfa_offset 272 - movq %rdi, %r15 #105.96 - movq %rsi, %r12 #105.96 - movq %rcx, %r14 #105.96 - movq %rdx, %r13 #105.96 - movsd 144(%r15), %xmm0 #109.27 - mulsd %xmm0, %xmm0 #109.45 - movsd 56(%r15), %xmm1 #110.23 - movsd 40(%r15), %xmm2 #111.24 - movl 4(%r12), %ebp #106.18 - movsd %xmm0, 48(%rsp) #109.45[spill] - movsd %xmm1, 40(%rsp) #110.23[spill] - movsd %xmm2, 24(%rsp) #111.24[spill] - testl %ebp, %ebp #114.24 - jle ..B2.51 # Prob 50% #114.24 - # LOE r12 r13 r14 r15 ebp -..B2.2: # Preds ..B2.1 - # Execution count [5.00e-03] - movslq %ebp, %rbp #106.18 - movq 64(%r12), %rdi #115.9 - lea (%rbp,%rbp,2), %eax #106.18 - movq %rbp, 32(%rsp) #106.18[spill] - cmpl $12, %eax #114.5 - jle ..B2.59 # Prob 0% #114.5 - # LOE rbp rdi r12 r13 r14 r15 ebp -..B2.3: # Preds ..B2.2 - # Execution count [1.00e+00] - movq %rbp, %rax #114.5 - xorl %esi, %esi #114.5 - lea (%rax,%rax,2), %rdx #114.5 - shlq $3, %rdx #114.5 - call _intel_fast_memset #114.5 - # LOE r12 r13 r14 r15 ebp -..B2.5: # Preds ..B2.73 ..B2.3 ..B2.71 - # Execution count [1.00e+00] - xorl %ebx, %ebx #120.22 - xorl %eax, %eax #121.16 -..___tag_value_computeForceLJHalfNeigh.101: -# getTimeStamp() - call 
getTimeStamp #121.16 -..___tag_value_computeForceLJHalfNeigh.102: - # LOE r12 r13 r14 r15 ebx ebp xmm0 -..B2.80: # Preds ..B2.5 - # Execution count [1.00e+00] - movsd %xmm0, 16(%rsp) #121.16[spill] - # LOE r12 r13 r14 r15 ebx ebp -..B2.6: # Preds ..B2.80 - # Execution count [5.00e-01] - movl $.L_2__STRING.1, %edi #122.5 -..___tag_value_computeForceLJHalfNeigh.104: -# likwid_markerStartRegion(const char *) - call likwid_markerStartRegion #122.5 -..___tag_value_computeForceLJHalfNeigh.105: - # LOE r12 r13 r14 r15 ebx ebp -..B2.7: # Preds ..B2.6 - # Execution count [9.00e-01] - movsd .L_2il0floatpacket.3(%rip), %xmm10 #161.41 - movd %ebp, %xmm0 #106.18 - mulsd 24(%rsp), %xmm10 #161.41[spill] - xorl %eax, %eax #124.15 - movslq 8(%r13), %rdx #125.43 - xorl %edi, %edi #124.5 - shlq $2, %rdx #107.5 - xorl %r11d, %r11d #124.5 - movddup 40(%rsp), %xmm3 #110.21[spill] - movddup %xmm10, %xmm2 #161.41 - pshufd $0, %xmm0, %xmm0 #106.18 - movq 16(%r13), %rcx #125.19 - movq %rdx, 56(%rsp) #124.5[spill] - movddup 48(%rsp), %xmm6 #109.25[spill] - movsd 40(%rsp), %xmm8 #124.5[spill] - movsd 48(%rsp), %xmm13 #124.5[spill] - movq 32(%rsp), %rdx #124.5[spill] - movups .L_2il0floatpacket.4(%rip), %xmm1 #161.54 - movsd .L_2il0floatpacket.5(%rip), %xmm7 #161.54 - movq 24(%r13), %r13 #126.25 - movq 16(%r12), %rsi #127.25 - movq 64(%r12), %r8 #168.21 - movq (%r14), %r9 #179.9 - movq 8(%r14), %r10 #180.9 - movdqu %xmm0, 160(%rsp) #124.5[spill] - movups %xmm2, 192(%rsp) #124.5[spill] - movups %xmm3, 176(%rsp) #124.5[spill] - movl %ebp, 64(%rsp) #124.5[spill] - movq %r15, (%rsp) #124.5[spill] - movq %r14, 8(%rsp) #124.5[spill] - # LOE rax rdx rcx rsi rdi r8 r9 r10 r11 r13 ebx xmm6 xmm7 xmm8 xmm10 xmm13 -..B2.8: # Preds ..B2.49 ..B2.7 - # Execution count [5.00e+00] - movl (%r13,%rdi,4), %ebp #126.25 - addl %ebp, %ebx #138.9 - xorps %xmm5, %xmm5 #130.22 - movaps %xmm5, %xmm4 #131.22 - movsd (%r11,%rsi), %xmm12 #127.25 - movaps %xmm4, %xmm2 #132.22 - movsd 8(%r11,%rsi), %xmm11 #128.25 - movsd 16(%r11,%rsi), %xmm9 #129.25 - testl %ebp, %ebp #143.9 - jle ..B2.48 # Prob 50% #143.9 - # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r13 ebp xmm2 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -..B2.9: # Preds ..B2.8 - # Execution count [2.50e+00] - jbe ..B2.48 # Prob 50% #143.9 - # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r13 ebp xmm2 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -..B2.10: # Preds ..B2.9 - # Execution count [2.25e+00] - cmpl $2, %ebp #143.9 - jb ..B2.58 # Prob 10% #143.9 - # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r13 ebp xmm2 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -..B2.11: # Preds ..B2.10 - # Execution count [2.25e+00] - movq 56(%rsp), %r14 #125.43[spill] - movl %ebp, %r12d #143.9 - imulq %rax, %r14 #125.43 - xorps %xmm5, %xmm5 #130.22 - andl $-2, %r12d #143.9 - movaps %xmm5, %xmm4 #131.22 - movsd %xmm11, 128(%rsp) #143.9[spill] - movaps %xmm4, %xmm2 #132.22 - movsd %xmm12, 136(%rsp) #143.9[spill] - movddup %xmm12, %xmm1 #127.23 - xorl %r15d, %r15d #143.9 - movddup %xmm11, %xmm0 #128.23 - addq %rcx, %r14 #107.5 - movddup %xmm9, %xmm3 #129.23 - movslq %r12d, %r12 #143.9 - movsd %xmm9, 120(%rsp) #143.9[spill] - movsd %xmm10, 144(%rsp) #143.9[spill] - movl %ebp, 24(%rsp) #143.9[spill] - movq %r11, 72(%rsp) #143.9[spill] - movq %r10, 80(%rsp) #143.9[spill] - movq %r9, 88(%rsp) #143.9[spill] - movq %r13, 96(%rsp) #143.9[spill] - movq %rcx, 104(%rsp) #143.9[spill] - movq %rdi, 112(%rsp) #143.9[spill] - movdqu .L_2il0floatpacket.1(%rip), %xmm11 #143.9 - movdqu .L_2il0floatpacket.0(%rip), 
%xmm12 #143.9 - # LOE rax rbx rsi r8 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm11 xmm12 -..B2.12: # Preds ..B2.38 ..B2.11 - # Execution count [1.25e+01] - movq (%r14,%r15,4), %xmm7 #144.21 - movdqa %xmm12, %xmm15 #146.36 - movdqa %xmm7, %xmm8 #145.36 - paddd %xmm7, %xmm8 #145.36 - paddd %xmm7, %xmm8 #145.36 - movd %xmm8, %r9d #145.36 - paddd %xmm8, %xmm15 #146.36 - pshufd $57, %xmm8, %xmm10 #145.36 - paddd %xmm11, %xmm8 #147.36 - pshufd $57, %xmm15, %xmm13 #146.36 - movd %xmm10, %edi #145.36 - movaps %xmm1, %xmm10 #145.36 - movd %xmm15, %ebp #146.36 - movd %xmm13, %ecx #146.36 - movd %xmm8, %edx #147.36 - pshufd $57, %xmm8, %xmm8 #147.36 - movd %xmm8, %r10d #147.36 - movaps %xmm3, %xmm8 #147.36 - movslq %r9d, %r9 #145.36 - movslq %edi, %rdi #145.36 - movslq %ebp, %rbp #146.36 - movslq %ecx, %rcx #146.36 - movsd (%rsi,%r9,8), %xmm9 #145.36 - movhpd (%rsi,%rdi,8), %xmm9 #145.36 - movsd (%rsi,%rbp,8), %xmm14 #146.36 - subpd %xmm9, %xmm10 #145.36 - movhpd (%rsi,%rcx,8), %xmm14 #146.36 - movaps %xmm0, %xmm9 #146.36 - movslq %edx, %rdx #147.36 - subpd %xmm14, %xmm9 #146.36 - movslq %r10d, %r10 #147.36 - movaps %xmm9, %xmm13 #148.49 - movsd (%rsi,%rdx,8), %xmm15 #147.36 - mulpd %xmm9, %xmm13 #148.49 - movhpd (%rsi,%r10,8), %xmm15 #147.36 - subpd %xmm15, %xmm8 #147.36 - movaps %xmm10, %xmm15 #148.35 - movaps %xmm8, %xmm14 #148.63 - mulpd %xmm10, %xmm15 #148.35 - mulpd %xmm8, %xmm14 #148.63 - addpd %xmm13, %xmm15 #148.49 - addpd %xmm14, %xmm15 #148.63 - movaps %xmm15, %xmm13 #158.22 - pcmpeqd %xmm14, %xmm14 #158.22 - cmpltpd %xmm6, %xmm13 #158.22 - ptest %xmm14, %xmm13 #158.22 - je ..B2.38 # Prob 50% #158.22 - # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r10 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm15 -..B2.13: # Preds ..B2.12 - # Execution count [6.25e+00] - movups .L_2il0floatpacket.2(%rip), %xmm14 #159.38 - divpd %xmm15, %xmm14 #159.38 - movdqu 160(%rsp), %xmm15 #167.24[spill] - pcmpgtd %xmm7, %xmm15 #167.24 - pmovsxdq %xmm15, %xmm15 #167.24 - pcmpeqd %xmm7, %xmm7 #167.24 - andps %xmm13, %xmm15 #167.24 - ptest %xmm7, %xmm15 #167.24 - movups 176(%rsp), %xmm7 #160.38[spill] - mulpd %xmm14, %xmm7 #160.38 - mulpd %xmm14, %xmm7 #160.44 - mulpd %xmm14, %xmm7 #160.50 - mulpd 192(%rsp), %xmm14 #161.54[spill] - mulpd %xmm7, %xmm14 #161.61 - subpd .L_2il0floatpacket.4(%rip), %xmm7 #161.54 - mulpd %xmm7, %xmm14 #161.67 - mulpd %xmm14, %xmm10 #162.31 - mulpd %xmm14, %xmm9 #163.31 - mulpd %xmm14, %xmm8 #164.31 - movaps %xmm13, %xmm14 #162.31 - movaps %xmm13, %xmm7 #163.31 - andps %xmm10, %xmm14 #162.31 - andps %xmm9, %xmm7 #163.31 - andps %xmm8, %xmm13 #164.31 - addpd %xmm14, %xmm5 #162.17 - addpd %xmm7, %xmm4 #163.17 - addpd %xmm13, %xmm2 #164.17 - je ..B2.38 # Prob 50% #167.24 - # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r10 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm9 xmm10 xmm11 xmm12 xmm15 -..B2.14: # Preds ..B2.13 - # Execution count [3.12e+00] - movmskpd %xmm15, %r13d #168.21 - movl %r13d, %r11d #168.21 - andl $2, %r11d #168.21 - andl $1, %r13d #168.21 - je ..B2.17 # Prob 40% #168.21 - # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r10 r12 r14 r15 r11d r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm9 xmm10 xmm11 xmm12 -..B2.15: # Preds ..B2.14 - # Execution count [3.12e+00] - movsd (%r8,%r9,8), %xmm7 #168.21 - testl %r11d, %r11d #168.21 - jne ..B2.18 # Prob 60% #168.21 - # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r10 r12 r14 r15 r11d r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 -..B2.16: # Preds ..B2.15 - # Execution count 
[1.25e+00] - xorps %xmm13, %xmm13 #168.21 - unpcklpd %xmm13, %xmm7 #168.21 - subpd %xmm10, %xmm7 #168.21 - jmp ..B2.31 # Prob 100% #168.21 - # LOE rax rdx rbx rbp rsi r8 r9 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm11 xmm12 -..B2.17: # Preds ..B2.14 - # Execution count [3.12e+00] - testl %r11d, %r11d #168.21 - xorps %xmm7, %xmm7 #168.21 - je ..B2.30 # Prob 40% #168.21 - # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r10 r12 r14 r15 r11d r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 -..B2.18: # Preds ..B2.15 ..B2.17 - # Execution count [3.12e+00] - testl %r13d, %r13d #168.21 - movhpd (%r8,%rdi,8), %xmm7 #168.21 - subpd %xmm10, %xmm7 #168.21 - je ..B2.20 # Prob 40% #168.21 - # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r10 r12 r14 r15 r11d r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm11 xmm12 -..B2.19: # Preds ..B2.18 - # Execution count [1.88e+00] - pshufd $14, %xmm7, %xmm10 #168.21 - movsd %xmm7, (%r8,%r9,8) #168.21 - movsd %xmm10, (%r8,%rdi,8) #168.21 - movsd (%r8,%rbp,8), %xmm13 #169.21 - jmp ..B2.21 # Prob 100% #169.21 - # LOE rax rdx rcx rbx rbp rsi r8 r10 r12 r14 r15 r11d r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm9 xmm11 xmm12 xmm13 -..B2.20: # Preds ..B2.18 - # Execution count [1.25e+00] - pshufd $14, %xmm7, %xmm7 #168.21 - movsd %xmm7, (%r8,%rdi,8) #168.21 - xorps %xmm13, %xmm13 #169.21 - # LOE rax rdx rcx rbx rbp rsi r8 r10 r12 r14 r15 r11d r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm9 xmm11 xmm12 xmm13 -..B2.21: # Preds ..B2.19 ..B2.20 - # Execution count [1.88e+00] - testl %r11d, %r11d #169.21 - je ..B2.84 # Prob 40% #169.21 - # LOE rax rdx rcx rbx rbp rsi r8 r10 r12 r14 r15 r11d r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm9 xmm11 xmm12 xmm13 -..B2.22: # Preds ..B2.21 - # Execution count [3.12e+00] - testl %r13d, %r13d #169.21 - movhpd (%r8,%rcx,8), %xmm13 #169.21 - subpd %xmm9, %xmm13 #169.21 - je ..B2.24 # Prob 40% #169.21 - # LOE rax rdx rcx rbx rbp rsi r8 r10 r12 r14 r15 r11d r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm11 xmm12 xmm13 -..B2.23: # Preds ..B2.22 - # Execution count [1.88e+00] - pshufd $14, %xmm13, %xmm7 #169.21 - movsd %xmm13, (%r8,%rbp,8) #169.21 - movsd %xmm7, (%r8,%rcx,8) #169.21 - movsd (%r8,%rdx,8), %xmm9 #170.21 - jmp ..B2.25 # Prob 100% #170.21 - # LOE rax rdx rbx rsi r8 r10 r12 r14 r15 r11d r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm9 xmm11 xmm12 -..B2.24: # Preds ..B2.22 - # Execution count [1.25e+00] - pshufd $14, %xmm13, %xmm7 #169.21 - movsd %xmm7, (%r8,%rcx,8) #169.21 - xorps %xmm9, %xmm9 #170.21 - # LOE rax rdx rbx rsi r8 r10 r12 r14 r15 r11d r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm9 xmm11 xmm12 -..B2.25: # Preds ..B2.23 ..B2.24 - # Execution count [1.88e+00] - testl %r11d, %r11d #170.21 - je ..B2.83 # Prob 40% #170.21 - # LOE rax rdx rbx rsi r8 r10 r12 r14 r15 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm9 xmm11 xmm12 -..B2.26: # Preds ..B2.25 - # Execution count [3.12e+00] - testl %r13d, %r13d #170.21 - movhpd (%r8,%r10,8), %xmm9 #170.21 - subpd %xmm8, %xmm9 #170.21 - je ..B2.28 # Prob 40% #170.21 - # LOE rax rdx rbx rsi r8 r10 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm9 xmm11 xmm12 -..B2.27: # Preds ..B2.26 - # Execution count [1.88e+00] - movsd %xmm9, (%r8,%rdx,8) #170.21 - pshufd $14, %xmm9, %xmm7 #170.21 - jmp ..B2.29 # Prob 100% #170.21 - # LOE rax rbx rsi r8 r10 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm11 xmm12 -..B2.28: # Preds ..B2.26 - # Execution count [1.25e+00] - pshufd $14, %xmm9, %xmm7 #170.21 - # LOE rax rbx rsi r8 r10 r12 r14 
r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm11 xmm12 -..B2.29: # Preds ..B2.27 ..B2.28 - # Execution count [3.12e+00] - movsd %xmm7, (%r8,%r10,8) #170.21 - jmp ..B2.38 # Prob 100% #170.21 - # LOE rax rbx rsi r8 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm11 xmm12 -..B2.30: # Preds ..B2.17 - # Execution count [1.88e+00] - testl %r13d, %r13d #168.21 - xorps %xmm7, %xmm7 #168.21 - subpd %xmm10, %xmm7 #168.21 - je ..B2.32 # Prob 40% #168.21 - # LOE rax rdx rbx rbp rsi r8 r9 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm11 xmm12 -..B2.31: # Preds ..B2.16 ..B2.30 - # Execution count [1.25e+00] - movsd %xmm7, (%r8,%r9,8) #168.21 - movsd (%r8,%rbp,8), %xmm13 #169.21 - xorps %xmm10, %xmm10 #169.21 - unpcklpd %xmm10, %xmm13 #169.21 - subpd %xmm9, %xmm13 #169.21 - jmp ..B2.34 # Prob 100% #169.21 - # LOE rax rdx rbx rbp rsi r8 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm11 xmm12 xmm13 -..B2.32: # Preds ..B2.30 - # Execution count [0.00e+00] - xorps %xmm13, %xmm13 #169.21 - jmp ..B2.33 # Prob 100% #169.21 - # LOE rax rdx rbx rbp rsi r8 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm9 xmm11 xmm12 xmm13 -..B2.84: # Preds ..B2.21 - # Execution count [7.50e-01] - testl %r13d, %r13d #168.21 - # LOE rax rdx rbx rbp rsi r8 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm9 xmm11 xmm12 xmm13 -..B2.33: # Preds ..B2.32 ..B2.84 - # Execution count [2.67e+00] - xorps %xmm7, %xmm7 #169.21 - unpcklpd %xmm7, %xmm13 #169.21 - subpd %xmm9, %xmm13 #169.21 - je ..B2.35 # Prob 40% #169.21 - # LOE rax rdx rbx rbp rsi r8 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm11 xmm12 xmm13 -..B2.34: # Preds ..B2.31 ..B2.33 - # Execution count [1.25e+00] - movsd %xmm13, (%r8,%rbp,8) #169.21 - movsd (%r8,%rdx,8), %xmm9 #170.21 - xorps %xmm7, %xmm7 #170.21 - unpcklpd %xmm7, %xmm9 #170.21 - subpd %xmm8, %xmm9 #170.21 - jmp ..B2.37 # Prob 100% #170.21 - # LOE rax rdx rbx rsi r8 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm9 xmm11 xmm12 -..B2.35: # Preds ..B2.33 - # Execution count [0.00e+00] - xorps %xmm9, %xmm9 #170.21 - jmp ..B2.36 # Prob 100% #170.21 - # LOE rax rdx rbx rsi r8 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm9 xmm11 xmm12 -..B2.83: # Preds ..B2.25 - # Execution count [7.50e-01] - testl %r13d, %r13d #168.21 - # LOE rax rdx rbx rsi r8 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm9 xmm11 xmm12 -..B2.36: # Preds ..B2.35 ..B2.83 - # Execution count [2.67e+00] - xorps %xmm7, %xmm7 #170.21 - unpcklpd %xmm7, %xmm9 #170.21 - subpd %xmm8, %xmm9 #170.21 - je ..B2.38 # Prob 40% #170.21 - # LOE rax rdx rbx rsi r8 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm9 xmm11 xmm12 -..B2.37: # Preds ..B2.34 ..B2.36 - # Execution count [1.25e+00] - movsd %xmm9, (%r8,%rdx,8) #170.21 - # LOE rax rbx rsi r8 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm11 xmm12 -..B2.38: # Preds ..B2.36 ..B2.29 ..B2.37 ..B2.13 ..B2.12 - # - # Execution count [1.25e+01] - addq $2, %r15 #143.9 - cmpq %r12, %r15 #143.9 - jb ..B2.12 # Prob 82% #143.9 - # LOE rax rbx rsi r8 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm11 xmm12 -..B2.39: # Preds ..B2.38 - # Execution count [2.25e+00] - movaps %xmm2, %xmm0 #132.22 - movaps %xmm4, %xmm1 #131.22 - movaps %xmm5, %xmm3 #130.22 - unpckhpd %xmm2, %xmm0 #132.22 - unpckhpd %xmm4, %xmm1 #131.22 - addsd %xmm0, %xmm2 #132.22 - addsd %xmm1, %xmm4 #131.22 - unpckhpd %xmm5, %xmm3 #130.22 - movsd 120(%rsp), %xmm9 #[spill] - addsd %xmm3, %xmm5 #130.22 - movsd 128(%rsp), %xmm11 #[spill] - movsd 136(%rsp), %xmm12 #[spill] - movsd 144(%rsp), 
%xmm10 #[spill] - movsd 40(%rsp), %xmm8 #[spill] - movsd 48(%rsp), %xmm13 #[spill] - movl 24(%rsp), %ebp #[spill] - movq 72(%rsp), %r11 #[spill] - movq 80(%rsp), %r10 #[spill] - movq 88(%rsp), %r9 #[spill] - movq 96(%rsp), %r13 #[spill] - movq 104(%rsp), %rcx #[spill] - movq 112(%rsp), %rdi #[spill] - movq 32(%rsp), %rdx #[spill] - movsd .L_2il0floatpacket.5(%rip), %xmm7 # - # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r13 ebp xmm2 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -..B2.40: # Preds ..B2.39 ..B2.58 - # Execution count [2.50e+00] - movslq %ebp, %r14 #143.9 - cmpq %r14, %r12 #143.9 - jae ..B2.49 # Prob 10% #143.9 - # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r13 r14 ebp xmm2 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -..B2.41: # Preds ..B2.40 - # Execution count [2.25e+00] - imulq 56(%rsp), %rax #125.43[spill] - movl 64(%rsp), %edx #107.5[spill] - addq %rcx, %rax #107.5 - movq %rdi, 112(%rsp) #107.5[spill] - # LOE rax rcx rbx rsi r8 r9 r10 r11 r12 r13 r14 edx ebp xmm2 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -..B2.42: # Preds ..B2.45 ..B2.41 - # Execution count [1.25e+01] - movl (%rax,%r12,4), %edi #144.21 - movaps %xmm12, %xmm14 #145.36 - movaps %xmm11, %xmm3 #146.36 - movaps %xmm9, %xmm1 #147.36 - lea (%rdi,%rdi,2), %r15d #145.36 - movslq %r15d, %r15 #145.36 - subsd (%rsi,%r15,8), %xmm14 #145.36 - subsd 8(%rsi,%r15,8), %xmm3 #146.36 - subsd 16(%rsi,%r15,8), %xmm1 #147.36 - movaps %xmm14, %xmm15 #148.35 - movaps %xmm3, %xmm0 #148.49 - mulsd %xmm14, %xmm15 #148.35 - mulsd %xmm3, %xmm0 #148.49 - addsd %xmm0, %xmm15 #148.49 - movaps %xmm1, %xmm0 #148.63 - mulsd %xmm1, %xmm0 #148.63 - addsd %xmm0, %xmm15 #148.63 - comisd %xmm15, %xmm13 #158.22 - jbe ..B2.45 # Prob 50% #158.22 - # LOE rax rcx rbx rsi r8 r9 r10 r11 r12 r13 r14 r15 edx ebp edi xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15 -..B2.43: # Preds ..B2.42 - # Execution count [6.25e+00] - movsd .L_2il0floatpacket.6(%rip), %xmm0 #159.38 - divsd %xmm15, %xmm0 #159.38 - movaps %xmm8, %xmm15 #160.38 - mulsd %xmm0, %xmm15 #160.38 - mulsd %xmm0, %xmm15 #160.44 - mulsd %xmm0, %xmm15 #160.50 - mulsd %xmm10, %xmm0 #161.54 - mulsd %xmm15, %xmm0 #161.61 - subsd %xmm7, %xmm15 #161.54 - mulsd %xmm15, %xmm0 #161.67 - mulsd %xmm0, %xmm14 #162.31 - mulsd %xmm0, %xmm3 #163.31 - mulsd %xmm0, %xmm1 #164.31 - addsd %xmm14, %xmm5 #162.17 - addsd %xmm3, %xmm4 #163.17 - addsd %xmm1, %xmm2 #164.17 - cmpl %edx, %edi #167.24 - jge ..B2.45 # Prob 50% #167.24 - # LOE rax rcx rbx rsi r8 r9 r10 r11 r12 r13 r14 r15 edx ebp xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 -..B2.44: # Preds ..B2.43 - # Execution count [3.12e+00] - movsd (%r8,%r15,8), %xmm0 #168.21 - subsd %xmm14, %xmm0 #168.21 - movsd 8(%r8,%r15,8), %xmm14 #169.21 - subsd %xmm3, %xmm14 #169.21 - movsd 16(%r8,%r15,8), %xmm3 #170.21 - movsd %xmm0, (%r8,%r15,8) #168.21 - subsd %xmm1, %xmm3 #170.21 - movsd %xmm14, 8(%r8,%r15,8) #169.21 - movsd %xmm3, 16(%r8,%r15,8) #170.21 - # LOE rax rcx rbx rsi r8 r9 r10 r11 r12 r13 r14 edx ebp xmm2 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -..B2.45: # Preds ..B2.44 ..B2.43 ..B2.42 - # Execution count [1.25e+01] - incq %r12 #143.9 - cmpq %r14, %r12 #143.9 - jb ..B2.42 # Prob 82% #143.9 - # LOE rax rcx rbx rsi r8 r9 r10 r11 r12 r13 r14 edx ebp xmm2 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -..B2.46: # Preds ..B2.45 - # Execution count [2.25e+00] - movq 112(%rsp), %rdi #[spill] - movq 32(%rsp), %rdx #[spill] - jmp ..B2.49 # Prob 100% # - # 
LOE rdx rcx rbx rsi rdi r8 r9 r10 r11 r13 r14 ebp xmm2 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm13 -..B2.48: # Preds ..B2.9 ..B2.8 - # Execution count [2.50e+00] - movslq %ebp, %r14 #179.9 - # LOE rdx rcx rbx rsi rdi r8 r9 r10 r11 r13 r14 ebp xmm2 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm13 -..B2.49: # Preds ..B2.46 ..B2.40 ..B2.48 - # Execution count [5.00e+00] - addq %r14, %r9 #179.9 - lea 1(%rbp), %eax #180.9 - shrl $31, %eax #180.9 - addsd (%r11,%r8), %xmm5 #175.9 - addsd 8(%r11,%r8), %xmm4 #176.9 - addsd 16(%r11,%r8), %xmm2 #177.9 - movsd %xmm5, (%r11,%r8) #175.9 - lea 1(%rbp,%rax), %ebp #180.9 - sarl $1, %ebp #180.9 - movslq %ebp, %rbp #180.9 - movslq %edi, %rax #124.32 - incq %rdi #124.5 - movsd %xmm4, 8(%r11,%r8) #176.9 - addq %rbp, %r10 #180.9 - movsd %xmm2, 16(%r11,%r8) #177.9 - addq $24, %r11 #124.5 - incq %rax #124.32 - cmpq %rdx, %rdi #124.5 - jb ..B2.8 # Prob 82% #124.5 - # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r13 xmm6 xmm7 xmm8 xmm10 xmm13 -..B2.50: # Preds ..B2.49 - # Execution count [9.00e-01] - movq 8(%rsp), %r14 #[spill] - movq (%rsp), %r15 #[spill] - movq %r9, (%r14) #179.9 - movq %r10, 8(%r14) #180.9 - jmp ..B2.54 # Prob 100% #180.9 - # LOE rbx r15 -..B2.51: # Preds ..B2.1 - # Execution count [5.00e-01] - xorl %ebx, %ebx #120.22 - xorl %eax, %eax #121.16 -..___tag_value_computeForceLJHalfNeigh.155: -# getTimeStamp() - call getTimeStamp #121.16 -..___tag_value_computeForceLJHalfNeigh.156: - # LOE rbx r15 xmm0 -..B2.81: # Preds ..B2.51 - # Execution count [5.00e-01] - movsd %xmm0, 16(%rsp) #121.16[spill] - # LOE rbx r15 -..B2.52: # Preds ..B2.81 - # Execution count [5.00e-01] - movl $.L_2__STRING.1, %edi #122.5 -..___tag_value_computeForceLJHalfNeigh.158: -# likwid_markerStartRegion(const char *) - call likwid_markerStartRegion #122.5 -..___tag_value_computeForceLJHalfNeigh.159: - # LOE rbx r15 -..B2.54: # Preds ..B2.52 ..B2.50 - # Execution count [1.00e+00] - movl $.L_2__STRING.1, %edi #183.5 -..___tag_value_computeForceLJHalfNeigh.160: -# likwid_markerStopRegion(const char *) - call likwid_markerStopRegion #183.5 -..___tag_value_computeForceLJHalfNeigh.161: - # LOE rbx r15 -..B2.55: # Preds ..B2.54 - # Execution count [1.00e+00] - xorl %eax, %eax #184.16 -..___tag_value_computeForceLJHalfNeigh.162: -# getTimeStamp() - call getTimeStamp #184.16 -..___tag_value_computeForceLJHalfNeigh.163: - # LOE rbx r15 xmm0 -..B2.82: # Preds ..B2.55 - # Execution count [1.00e+00] - movaps %xmm0, %xmm1 #184.16 - # LOE rbx r15 xmm1 -..B2.56: # Preds ..B2.82 - # Execution count [1.00e+00] - xorps %xmm3, %xmm3 #185.5 - cvtsi2sdq %rbx, %xmm3 #185.5 - subsd 16(%rsp), %xmm1 #185.94[spill] - movsd .L_2il0floatpacket.7(%rip), %xmm2 #185.5 - movl $.L_2__STRING.2, %edi #185.5 - divsd %xmm3, %xmm2 #185.5 - mulsd %xmm1, %xmm2 #185.5 - movl %ebx, %esi #185.5 - movsd 264(%r15), %xmm0 #185.74 - movl $3, %eax #185.5 - mulsd %xmm0, %xmm2 #185.5 - movsd %xmm1, (%rsp) #185.5[spill] -..___tag_value_computeForceLJHalfNeigh.165: -# printf(const char *__restrict__, ...) 
- call printf #185.5 -..___tag_value_computeForceLJHalfNeigh.166: - # LOE -..B2.57: # Preds ..B2.56 - # Execution count [1.00e+00] - movsd (%rsp), %xmm1 #[spill] - movaps %xmm1, %xmm0 #186.14 - addq $216, %rsp #186.14 - .cfi_def_cfa_offset 56 - .cfi_restore 6 - popq %rbp #186.14 - .cfi_def_cfa_offset 48 - .cfi_restore 3 - popq %rbx #186.14 - .cfi_def_cfa_offset 40 - .cfi_restore 15 - popq %r15 #186.14 - .cfi_def_cfa_offset 32 - .cfi_restore 14 - popq %r14 #186.14 - .cfi_def_cfa_offset 24 - .cfi_restore 13 - popq %r13 #186.14 - .cfi_def_cfa_offset 16 - .cfi_restore 12 - popq %r12 #186.14 - .cfi_def_cfa_offset 8 - ret #186.14 - .cfi_def_cfa_offset 272 - .cfi_offset 3, -48 - .cfi_offset 6, -56 - .cfi_offset 12, -16 - .cfi_offset 13, -24 - .cfi_offset 14, -32 - .cfi_offset 15, -40 - # LOE -..B2.58: # Preds ..B2.10 - # Execution count [2.25e-01]: Infreq - xorl %r12d, %r12d #143.9 - jmp ..B2.40 # Prob 100% #143.9 - # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r13 ebp xmm2 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 -..B2.59: # Preds ..B2.2 - # Execution count [1.00e+00]: Infreq - movq %rbp, %rax #106.18 - lea (%rax,%rax,2), %rax #106.18 - cmpq $4, %rax #114.5 - jl ..B2.75 # Prob 10% #114.5 - # LOE rax rdi r12 r13 r14 r15 ebp -..B2.60: # Preds ..B2.59 - # Execution count [1.00e+00]: Infreq - movq %rdi, %rdx #114.5 - andq $15, %rdx #114.5 - testl %edx, %edx #114.5 - je ..B2.63 # Prob 50% #114.5 - # LOE rax rdi r12 r13 r14 r15 edx ebp -..B2.61: # Preds ..B2.60 - # Execution count [1.00e+00]: Infreq - testb $7, %dl #114.5 - jne ..B2.75 # Prob 10% #114.5 - # LOE rax rdi r12 r13 r14 r15 ebp -..B2.62: # Preds ..B2.61 - # Execution count [5.00e-01]: Infreq - movl $1, %edx #114.5 - # LOE rax rdi r12 r13 r14 r15 edx ebp -..B2.63: # Preds ..B2.62 ..B2.60 - # Execution count [1.00e+00]: Infreq - movl %edx, %esi #114.5 - lea 4(%rsi), %rcx #114.5 - cmpq %rcx, %rax #114.5 - jl ..B2.75 # Prob 10% #114.5 - # LOE rax rsi rdi r12 r13 r14 r15 edx ebp -..B2.64: # Preds ..B2.63 - # Execution count [1.11e+00]: Infreq - movl %eax, %r8d #114.5 - movl %r8d, %ecx #114.5 - subl %edx, %ecx #114.5 - andl $3, %ecx #114.5 - subl %ecx, %r8d #114.5 - xorl %ecx, %ecx #114.5 - xorl %ebx, %ebx #115.22 - testl %edx, %edx #114.5 - movslq %r8d, %rdx #114.5 - jbe ..B2.68 # Prob 10% #114.5 - # LOE rax rdx rcx rbx rsi rdi r12 r13 r14 r15 ebp -..B2.66: # Preds ..B2.64 ..B2.66 - # Execution count [5.56e+00]: Infreq - movq %rbx, (%rdi,%rcx,8) #115.9 - incq %rcx #114.5 - cmpq %rsi, %rcx #114.5 - jb ..B2.66 # Prob 82% #114.5 - # LOE rax rdx rcx rbx rsi rdi r12 r13 r14 r15 ebp -..B2.68: # Preds ..B2.66 ..B2.64 - # Execution count [1.00e+00]: Infreq - xorps %xmm0, %xmm0 #115.22 - # LOE rax rdx rsi rdi r12 r13 r14 r15 ebp xmm0 -..B2.69: # Preds ..B2.69 ..B2.68 - # Execution count [5.56e+00]: Infreq - movups %xmm0, (%rdi,%rsi,8) #115.9 - movups %xmm0, 16(%rdi,%rsi,8) #115.9 - addq $4, %rsi #114.5 - cmpq %rdx, %rsi #114.5 - jb ..B2.69 # Prob 82% #114.5 - # LOE rax rdx rsi rdi r12 r13 r14 r15 ebp xmm0 -..B2.71: # Preds ..B2.69 ..B2.75 - # Execution count [1.11e+00]: Infreq - cmpq %rax, %rdx #114.5 - jae ..B2.5 # Prob 10% #114.5 - # LOE rax rdx rdi r12 r13 r14 r15 ebp -..B2.73: # Preds ..B2.71 ..B2.73 - # Execution count [5.56e+00]: Infreq - movq $0, (%rdi,%rdx,8) #115.9 - incq %rdx #114.5 - cmpq %rax, %rdx #114.5 - jb ..B2.73 # Prob 82% #114.5 - jmp ..B2.5 # Prob 100% #114.5 - # LOE rax rdx rdi r12 r13 r14 r15 ebp -..B2.75: # Preds ..B2.59 ..B2.61 ..B2.63 - # Execution count [1.00e-01]: Infreq - xorl %edx, %edx #114.5 - jmp ..B2.71 # 
Prob 100% #114.5 - .align 16,0x90 - # LOE rax rdx rdi r12 r13 r14 r15 ebp - .cfi_endproc -# mark_end; - .type computeForceLJHalfNeigh,@function - .size computeForceLJHalfNeigh,.-computeForceLJHalfNeigh -..LNcomputeForceLJHalfNeigh.1: - .data -# -- End computeForceLJHalfNeigh - .text -.L_2__routine_start_computeForceLJFullNeigh_simd_2: -# -- Begin computeForceLJFullNeigh_simd - .text -# mark_begin; - .align 16,0x90 - .globl computeForceLJFullNeigh_simd -# --- computeForceLJFullNeigh_simd(Parameter *, Atom *, Neighbor *, Stats *) -computeForceLJFullNeigh_simd: -# parameter 1: %rdi -# parameter 2: %rsi -# parameter 3: %rdx -# parameter 4: %rcx -..B3.1: # Preds ..B3.0 - # Execution count [1.00e+00] - .cfi_startproc -..___tag_value_computeForceLJFullNeigh_simd.190: -..L191: - #189.101 - pushq %rsi #189.101 - .cfi_def_cfa_offset 16 - movl 4(%rsi), %edx #190.18 - testl %edx, %edx #196.24 - jle ..B3.4 # Prob 50% #196.24 - # LOE rbx rbp rsi r12 r13 r14 r15 edx -..B3.2: # Preds ..B3.1 - # Execution count [5.00e-03] - movq 64(%rsi), %rdi #197.9 - lea (%rdx,%rdx,2), %eax #190.18 - cmpl $12, %eax #196.5 - jle ..B3.8 # Prob 0% #196.5 - # LOE rbx rbp rdi r12 r13 r14 r15 edx -..B3.3: # Preds ..B3.2 - # Execution count [1.00e+00] - movslq %edx, %rdx #196.5 - xorl %esi, %esi #196.5 - lea (%rdx,%rdx,2), %rdx #196.5 - shlq $3, %rdx #196.5 - call _intel_fast_memset #196.5 - # LOE rbx rbp r12 r13 r14 r15 -..B3.4: # Preds ..B3.22 ..B3.1 ..B3.20 ..B3.3 - # Execution count [1.00e+00] - xorl %eax, %eax #203.16 -..___tag_value_computeForceLJFullNeigh_simd.193: -# getTimeStamp() - call getTimeStamp #203.16 -..___tag_value_computeForceLJFullNeigh_simd.194: - # LOE rbx rbp r12 r13 r14 r15 -..B3.5: # Preds ..B3.4 - # Execution count [1.00e+00] - movl $.L_2__STRING.0, %edi #204.5 -..___tag_value_computeForceLJFullNeigh_simd.195: -# likwid_markerStartRegion(const char *) - call likwid_markerStartRegion #204.5 -..___tag_value_computeForceLJFullNeigh_simd.196: - # LOE -..B3.6: # Preds ..B3.5 - # Execution count [1.00e+00] - movl $il0_peep_printf_format_0, %edi #207.5 - movq stderr(%rip), %rsi #207.5 - call fputs #207.5 - # LOE -..B3.7: # Preds ..B3.6 - # Execution count [1.00e+00] - movl $-1, %edi #208.5 -# exit(int) - call exit #208.5 - # LOE -..B3.8: # Preds ..B3.2 - # Execution count [1.00e+00]: Infreq - movslq %edx, %rdx #196.5 - lea (%rdx,%rdx,2), %rcx #190.18 - cmpq $4, %rcx #196.5 - jl ..B3.24 # Prob 10% #196.5 - # LOE rcx rbx rbp rdi r12 r13 r14 r15 -..B3.9: # Preds ..B3.8 - # Execution count [1.00e+00]: Infreq - movq %rdi, %rdx #196.5 - andq $15, %rdx #196.5 - testl %edx, %edx #196.5 - je ..B3.12 # Prob 50% #196.5 - # LOE rcx rbx rbp rdi r12 r13 r14 r15 edx -..B3.10: # Preds ..B3.9 - # Execution count [1.00e+00]: Infreq - testb $7, %dl #196.5 - jne ..B3.24 # Prob 10% #196.5 - # LOE rcx rbx rbp rdi r12 r13 r14 r15 -..B3.11: # Preds ..B3.10 - # Execution count [5.00e-01]: Infreq - movl $1, %edx #196.5 - # LOE rcx rbx rbp rdi r12 r13 r14 r15 edx -..B3.12: # Preds ..B3.11 ..B3.9 - # Execution count [1.00e+00]: Infreq - movl %edx, %eax #196.5 - lea 4(%rax), %rsi #196.5 - cmpq %rsi, %rcx #196.5 - jl ..B3.24 # Prob 10% #196.5 - # LOE rax rcx rbx rbp rdi r12 r13 r14 r15 edx -..B3.13: # Preds ..B3.12 - # Execution count [1.11e+00]: Infreq - movl %ecx, %r8d #196.5 - xorl %r9d, %r9d #196.5 - movl %r8d, %esi #196.5 - subl %edx, %esi #196.5 - andl $3, %esi #196.5 - subl %esi, %r8d #196.5 - xorl %esi, %esi #196.5 - movslq %r8d, %r8 #196.5 - testl %edx, %edx #196.5 - jbe ..B3.17 # Prob 10% #196.5 - # LOE rax rcx rbx rbp rsi rdi 
r8 r9 r12 r13 r14 r15 -..B3.15: # Preds ..B3.13 ..B3.15 - # Execution count [5.56e+00]: Infreq - movq %rsi, (%rdi,%r9,8) #197.9 - incq %r9 #196.5 - cmpq %rax, %r9 #196.5 - jb ..B3.15 # Prob 82% #196.5 - # LOE rax rcx rbx rbp rsi rdi r8 r9 r12 r13 r14 r15 -..B3.17: # Preds ..B3.15 ..B3.13 - # Execution count [1.00e+00]: Infreq - xorps %xmm0, %xmm0 #197.22 - # LOE rax rcx rbx rbp rsi rdi r8 r12 r13 r14 r15 xmm0 -..B3.18: # Preds ..B3.18 ..B3.17 - # Execution count [5.56e+00]: Infreq - movups %xmm0, (%rdi,%rax,8) #197.9 - movups %xmm0, 16(%rdi,%rax,8) #197.9 - addq $4, %rax #196.5 - cmpq %r8, %rax #196.5 - jb ..B3.18 # Prob 82% #196.5 - # LOE rax rcx rbx rbp rsi rdi r8 r12 r13 r14 r15 xmm0 -..B3.20: # Preds ..B3.18 ..B3.24 - # Execution count [1.11e+00]: Infreq - cmpq %rcx, %r8 #196.5 - jae ..B3.4 # Prob 10% #196.5 - # LOE rcx rbx rbp rsi rdi r8 r12 r13 r14 r15 -..B3.22: # Preds ..B3.20 ..B3.22 - # Execution count [5.56e+00]: Infreq - movq %rsi, (%rdi,%r8,8) #197.9 - incq %r8 #196.5 - cmpq %rcx, %r8 #196.5 - jb ..B3.22 # Prob 82% #196.5 - jmp ..B3.4 # Prob 100% #196.5 - # LOE rcx rbx rbp rsi rdi r8 r12 r13 r14 r15 -..B3.24: # Preds ..B3.8 ..B3.10 ..B3.12 - # Execution count [1.00e-01]: Infreq - xorl %r8d, %r8d #196.5 - xorl %esi, %esi #196.5 - jmp ..B3.20 # Prob 100% #196.5 - .align 16,0x90 - # LOE rcx rbx rbp rsi rdi r8 r12 r13 r14 r15 - .cfi_endproc -# mark_end; - .type computeForceLJFullNeigh_simd,@function - .size computeForceLJFullNeigh_simd,.-computeForceLJFullNeigh_simd -..LNcomputeForceLJFullNeigh_simd.2: - .section .rodata.str1.32, "aMS",@progbits,1 - .align 32 - .align 32 -il0_peep_printf_format_0: - .long 1869771333 - .long 1394621042 - .long 541347145 - .long 1852990827 - .long 1847618661 - .long 1763734639 - .long 1701605485 - .long 1953391981 - .long 1713398885 - .long 1931506287 - .long 1768121712 - .long 1684367718 - .long 1936615712 - .long 1668641396 - .long 1852795252 - .long 1952805664 - .word 33 - .data -# -- End computeForceLJFullNeigh_simd - .section .rodata, "a" - .align 16 - .align 16 -.L_2il0floatpacket.0: - .long 0x00000001,0x00000001,0x00000001,0x00000001 - .type .L_2il0floatpacket.0,@object - .size .L_2il0floatpacket.0,16 - .align 16 -.L_2il0floatpacket.1: - .long 0x00000002,0x00000002,0x00000002,0x00000002 - .type .L_2il0floatpacket.1,@object - .size .L_2il0floatpacket.1,16 - .align 16 -.L_2il0floatpacket.2: - .long 0x00000000,0x3ff00000,0x00000000,0x3ff00000 - .type .L_2il0floatpacket.2,@object - .size .L_2il0floatpacket.2,16 - .align 16 -.L_2il0floatpacket.4: - .long 0x00000000,0x3fe00000,0x00000000,0x3fe00000 - .type .L_2il0floatpacket.4,@object - .size .L_2il0floatpacket.4,16 - .align 8 -.L_2il0floatpacket.3: - .long 0x00000000,0x40480000 - .type .L_2il0floatpacket.3,@object - .size .L_2il0floatpacket.3,8 - .align 8 -.L_2il0floatpacket.5: - .long 0x00000000,0x3fe00000 - .type .L_2il0floatpacket.5,@object - .size .L_2il0floatpacket.5,8 - .align 8 -.L_2il0floatpacket.6: - .long 0x00000000,0x3ff00000 - .type .L_2il0floatpacket.6,@object - .size .L_2il0floatpacket.6,8 - .align 8 -.L_2il0floatpacket.7: - .long 0x00000000,0x41cdcd65 - .type .L_2il0floatpacket.7,@object - .size .L_2il0floatpacket.7,8 - .section .rodata.str1.4, "aMS",@progbits,1 - .align 4 - .align 4 -.L_2__STRING.0: - .long 1668444006 - .word 101 - .type .L_2__STRING.0,@object - .size .L_2__STRING.0,6 - .space 2, 0x00 # pad - .align 4 -.L_2__STRING.1: - .long 1668444006 - .long 759843941 - .long 1718378856 - .long 1734960494 - .word 104 - .type .L_2__STRING.1,@object - .size .L_2__STRING.1,18 - 
.space 2, 0x00 # pad - .align 4 -.L_2__STRING.2: - .long 980644937 - .long 544548128 - .long 1701987872 - .long 622869105 - .long 1411391590 - .long 979725673 - .long 174466336 - .long 1764718915 - .long 622869108 - .long 1747460198 - .long 761687137 - .long 1734960494 - .long 665960 - .type .L_2__STRING.2,@object - .size .L_2__STRING.2,52 - .data - .section .note.GNU-stack, "" -# End diff --git a/static_analysis/jan/lammps-icc-avx2.o b/static_analysis/jan/lammps-icc-avx2.o new file mode 100644 index 0000000..a84e105 Binary files /dev/null and b/static_analysis/jan/lammps-icc-avx2.o differ diff --git a/static_analysis/jan/icx-icc-lammps-avx2.s b/static_analysis/jan/lammps-icc-avx2.s similarity index 70% rename from static_analysis/jan/icx-icc-lammps-avx2.s rename to static_analysis/jan/lammps-icc-avx2.s index 8f54169..000d54f 100644 --- a/static_analysis/jan/icx-icc-lammps-avx2.s +++ b/static_analysis/jan/lammps-icc-avx2.s @@ -23,50 +23,50 @@ computeForceLJFullNeigh_plain_c: .cfi_startproc ..___tag_value_computeForceLJFullNeigh_plain_c.1: ..L2: - #23.104 - pushq %rbp #23.104 + #21.104 + pushq %rbp #21.104 .cfi_def_cfa_offset 16 - movq %rsp, %rbp #23.104 + movq %rsp, %rbp #21.104 .cfi_def_cfa 6, 16 .cfi_offset 6, -16 - andq $-32, %rsp #23.104 - pushq %r13 #23.104 - pushq %r14 #23.104 - pushq %r15 #23.104 - pushq %rbx #23.104 - subq $224, %rsp #23.104 + andq $-32, %rsp #21.104 + pushq %r13 #21.104 + pushq %r14 #21.104 + pushq %r15 #21.104 + pushq %rbx #21.104 + subq $224, %rsp #21.104 .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 - movq %rsi, %r15 #23.104 - vmovsd 144(%rdi), %xmm0 #27.27 - movq %rcx, %r13 #23.104 - vmulsd %xmm0, %xmm0, %xmm1 #27.45 - movq %rdx, %r14 #23.104 - vmovsd 56(%rdi), %xmm2 #28.23 - vmovsd 40(%rdi), %xmm3 #29.24 - movl 4(%r15), %eax #24.18 - vmovsd %xmm1, 128(%rsp) #27.45[spill] - vmovsd %xmm2, 136(%rsp) #28.23[spill] - vmovsd %xmm3, 24(%rsp) #29.24[spill] - testl %eax, %eax #32.24 - jle ..B1.34 # Prob 50% #32.24 + movq %rsi, %r15 #21.104 + vmovsd 144(%rdi), %xmm0 #25.27 + movq %rcx, %r13 #21.104 + vmulsd %xmm0, %xmm0, %xmm1 #25.45 + movq %rdx, %r14 #21.104 + vmovsd 56(%rdi), %xmm2 #26.23 + vmovsd 40(%rdi), %xmm3 #27.24 + movl 4(%r15), %eax #22.18 + vmovsd %xmm1, 128(%rsp) #25.45[spill] + vmovsd %xmm2, 136(%rsp) #26.23[spill] + vmovsd %xmm3, 24(%rsp) #27.24[spill] + testl %eax, %eax #33.24 + jle ..B1.34 # Prob 50% #33.24 # LOE r12 r13 r14 r15 eax ..B1.2: # Preds ..B1.1 # Execution count [5.00e-03] - movslq %eax, %rbx #24.18 - lea (%rax,%rax,2), %eax #24.18 - movq 64(%r15), %rdi #33.9 - cmpl $12, %eax #32.5 - jle ..B1.43 # Prob 0% #32.5 + movslq %eax, %rbx #22.18 + lea (%rax,%rax,2), %eax #22.18 + movq 64(%r15), %rdi #34.9 + cmpl $12, %eax #33.5 + jle ..B1.43 # Prob 0% #33.5 # LOE rbx rdi r12 r13 r14 r15 ..B1.3: # Preds ..B1.2 # Execution count [1.00e+00] - xorl %esi, %esi #32.5 - lea (%rbx,%rbx,2), %rdx #32.5 - shlq $3, %rdx #32.5 - call __intel_avx_rep_memset #32.5 + xorl %esi, %esi #33.5 + lea (%rbx,%rbx,2), %rdx #33.5 + shlq $3, %rdx #33.5 + call __intel_avx_rep_memset #33.5 # LOE rbx r12 r13 r14 r15 ..B1.5: # Preds ..B1.49 ..B1.3 ..B1.47 # Execution count 
[1.00e+00] @@ -83,106 +83,106 @@ computeForceLJFullNeigh_plain_c: # LOE rbx r12 r13 r14 r15 ..B1.6: # Preds ..B1.54 # Execution count [5.00e-01] - movl $.L_2__STRING.0, %edi #39.5 + movl $.L_2__STRING.0, %edi #42.5 ..___tag_value_computeForceLJFullNeigh_plain_c.16: # likwid_markerStartRegion(const char *) - call likwid_markerStartRegion #39.5 + call likwid_markerStartRegion #42.5 ..___tag_value_computeForceLJFullNeigh_plain_c.17: # LOE rbx r12 r13 r14 r15 ..B1.7: # Preds ..B1.6 # Execution count [9.00e-01] - vmovsd 24(%rsp), %xmm0 #77.41[spill] - xorl %eax, %eax #41.15 - vmulsd .L_2il0floatpacket.0(%rip), %xmm0, %xmm4 #77.41 - xorl %ecx, %ecx #41.5 - vbroadcastsd 128(%rsp), %ymm6 #27.25[spill] - vbroadcastsd %xmm4, %ymm7 #77.41 - vbroadcastsd 136(%rsp), %ymm2 #28.21[spill] + vmovsd 24(%rsp), %xmm0 #77.42[spill] + xorl %eax, %eax #45.15 + vmulsd .L_2il0floatpacket.0(%rip), %xmm0, %xmm4 #77.42 + xorl %ecx, %ecx #45.5 + vbroadcastsd 128(%rsp), %ymm6 #25.25[spill] + vbroadcastsd %xmm4, %ymm7 #77.42 + vbroadcastsd 136(%rsp), %ymm2 #26.21[spill] vmovsd .L_2il0floatpacket.4(%rip), %xmm5 #75.32 - vmovsd .L_2il0floatpacket.1(%rip), %xmm0 #77.54 - vmovupd %ymm6, 32(%rsp) #41.5[spill] - vmovupd %ymm7, 64(%rsp) #41.5[spill] - vmovsd 136(%rsp), %xmm6 #41.5[spill] - vmovsd 128(%rsp), %xmm7 #41.5[spill] - vmovupd %ymm2, 96(%rsp) #41.5[spill] - movslq 8(%r14), %rsi #42.43 - xorl %edi, %edi #41.5 - movq 16(%r14), %rdx #42.19 - shlq $2, %rsi #25.5 - movq 24(%r14), %r14 #43.25 - movq 16(%r15), %r11 #44.25 + vmovsd .L_2il0floatpacket.1(%rip), %xmm0 #77.55 + vmovupd %ymm6, 32(%rsp) #45.5[spill] + vmovupd %ymm7, 64(%rsp) #45.5[spill] + vmovsd 136(%rsp), %xmm6 #45.5[spill] + vmovsd 128(%rsp), %xmm7 #45.5[spill] + vmovupd %ymm2, 96(%rsp) #45.5[spill] + movslq 8(%r14), %rsi #46.43 + xorl %edi, %edi #45.5 + movq 16(%r14), %rdx #46.19 + shlq $2, %rsi #23.5 + movq 24(%r14), %r14 #47.25 + movq 16(%r15), %r11 #48.25 movq 64(%r15), %r8 #89.9 movq (%r13), %r9 #93.9 movq 8(%r13), %r10 #94.9 - movq %rsi, 144(%rsp) #41.5[spill] - movq %rdx, 152(%rsp) #41.5[spill] - movq %rbx, 208(%rsp) #41.5[spill] - movq %r13, (%rsp) #41.5[spill] - movq %r12, 8(%rsp) #41.5[spill] + movq %rsi, 144(%rsp) #45.5[spill] + movq %rdx, 152(%rsp) #45.5[spill] + movq %rbx, 208(%rsp) #45.5[spill] + movq %r13, (%rsp) #45.5[spill] + movq %r12, 8(%rsp) #45.5[spill] .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x08, 0xff, 0xff, 0xff, 0x22 # LOE rax rcx rdi r8 r9 r10 r11 r14 xmm0 xmm4 xmm5 xmm6 xmm7 ..B1.8: # Preds ..B1.32 ..B1.7 # Execution count [5.00e+00] - movl (%r14,%rcx,4), %r13d #43.25 - testl %r13d, %r13d #56.28 - vxorpd %xmm8, %xmm8, %xmm8 #47.22 - vmovapd %xmm8, %xmm9 #48.22 - vmovsd (%rdi,%r11), %xmm3 #44.25 - vmovapd %xmm9, %xmm10 #49.22 - vmovsd 8(%rdi,%r11), %xmm2 #45.25 - vmovsd 16(%rdi,%r11), %xmm1 #46.25 - movslq %r13d, %r12 #56.9 - jle ..B1.32 # Prob 50% #56.28 + movl (%r14,%rcx,4), %r13d #47.25 + testl %r13d, %r13d #59.28 + vxorpd %xmm8, %xmm8, %xmm8 #51.22 + vmovapd %xmm8, %xmm9 #52.22 + vmovsd (%rdi,%r11), %xmm3 #48.25 + vmovapd %xmm9, %xmm10 #53.22 + vmovsd 8(%rdi,%r11), %xmm2 #49.25 + vmovsd 16(%rdi,%r11), %xmm1 #50.25 + movslq %r13d, %r12 #59.9 + jle ..B1.32 # Prob 50% #59.28 # LOE rax rcx rdi r8 r9 r10 r11 r12 r14 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 ..B1.9: # Preds ..B1.8 # Execution count [4.50e+00] - cmpq $4, %r12 #56.9 - jl ..B1.39 # Prob 10% #56.9 + cmpq $4, %r12 #59.9 + jl ..B1.39 # Prob 10% #59.9 # LOE rax rcx rdi r8 r9 r10 r11 r12 r14 r13d xmm0 xmm1 xmm2 xmm3 
xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 ..B1.10: # Preds ..B1.9 # Execution count [4.50e+00] - movq 144(%rsp), %rbx #42.43[spill] - imulq %rax, %rbx #42.43 - addq 152(%rsp), %rbx #25.5[spill] - cmpq $600, %r12 #56.9 - jl ..B1.41 # Prob 10% #56.9 + movq 144(%rsp), %rbx #46.43[spill] + imulq %rax, %rbx #46.43 + addq 152(%rsp), %rbx #23.5[spill] + cmpq $600, %r12 #59.9 + jl ..B1.41 # Prob 10% #59.9 # LOE rax rcx rbx rdi r8 r9 r10 r11 r12 r14 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 ..B1.11: # Preds ..B1.10 # Execution count [4.50e+00] - movq %rbx, %r15 #56.9 - andq $31, %r15 #56.9 - testl %r15d, %r15d #56.9 - je ..B1.14 # Prob 50% #56.9 + movq %rbx, %r15 #59.9 + andq $31, %r15 #59.9 + testl %r15d, %r15d #59.9 + je ..B1.14 # Prob 50% #59.9 # LOE rax rcx rbx rdi r8 r9 r10 r11 r12 r14 r13d r15d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 ..B1.12: # Preds ..B1.11 # Execution count [4.50e+00] - testl $3, %r15d #56.9 - jne ..B1.39 # Prob 10% #56.9 + testl $3, %r15d #59.9 + jne ..B1.39 # Prob 10% #59.9 # LOE rax rcx rbx rdi r8 r9 r10 r11 r12 r14 r13d r15d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 ..B1.13: # Preds ..B1.12 # Execution count [2.25e+00] - negl %r15d #56.9 - addl $32, %r15d #56.9 - shrl $2, %r15d #56.9 + negl %r15d #59.9 + addl $32, %r15d #59.9 + shrl $2, %r15d #59.9 # LOE rax rcx rbx rdi r8 r9 r10 r11 r12 r14 r13d r15d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 ..B1.14: # Preds ..B1.13 ..B1.11 # Execution count [4.50e+00] - movl %r15d, %edx #56.9 - lea 4(%rdx), %rsi #56.9 - cmpq %rsi, %r12 #56.9 - jl ..B1.39 # Prob 10% #56.9 + movl %r15d, %edx #59.9 + lea 4(%rdx), %rsi #59.9 + cmpq %rsi, %r12 #59.9 + jl ..B1.39 # Prob 10% #59.9 # LOE rax rdx rcx rbx rdi r8 r9 r10 r11 r12 r14 r13d r15d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 ..B1.15: # Preds ..B1.14 # Execution count [5.00e+00] - movl %r13d, %esi #56.9 - subl %r15d, %esi #56.9 - andl $3, %esi #56.9 - negl %esi #56.9 - addl %r13d, %esi #56.9 - movslq %esi, %rsi #56.9 - testl %r15d, %r15d #56.9 - movl $0, %r15d #56.9 - jbe ..B1.21 # Prob 10% #56.9 + movl %r13d, %esi #59.9 + subl %r15d, %esi #59.9 + andl $3, %esi #59.9 + negl %esi #59.9 + addl %r13d, %esi #59.9 + movslq %esi, %rsi #59.9 + testl %r15d, %r15d #59.9 + movl $0, %r15d #59.9 + jbe ..B1.21 # Prob 10% #59.9 # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r14 r15 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 ..B1.16: # Preds ..B1.15 # Execution count [4.50e+00] @@ -190,37 +190,37 @@ computeForceLJFullNeigh_plain_c: # LOE rax rdx rbx rsi rdi r8 r9 r10 r11 r12 r14 r15 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 ..B1.17: # Preds ..B1.19 ..B1.16 # Execution count [2.50e+01] - movl (%rbx,%r15,4), %ecx #57.21 - lea (%rcx,%rcx,2), %ecx #58.36 - movslq %ecx, %rcx #58.36 - vsubsd 8(%r11,%rcx,8), %xmm2, %xmm13 #59.36 - vsubsd (%r11,%rcx,8), %xmm3, %xmm12 #58.36 - vsubsd 16(%r11,%rcx,8), %xmm1, %xmm11 #60.36 - vmulsd %xmm13, %xmm13, %xmm14 #61.49 - vfmadd231sd %xmm12, %xmm12, %xmm14 #61.63 - vfmadd231sd %xmm11, %xmm11, %xmm14 #61.63 - vcomisd %xmm14, %xmm7 #71.22 - jbe ..B1.19 # Prob 50% #71.22 + movl (%rbx,%r15,4), %ecx #60.21 + lea (%rcx,%rcx,2), %ecx #61.36 + movslq %ecx, %rcx #61.36 + vsubsd 8(%r11,%rcx,8), %xmm2, %xmm13 #62.36 + vsubsd (%r11,%rcx,8), %xmm3, %xmm12 #61.36 + vsubsd 16(%r11,%rcx,8), %xmm1, %xmm11 #63.36 + vmulsd %xmm13, %xmm13, %xmm14 #64.49 + vfmadd231sd %xmm12, %xmm12, %xmm14 #64.63 + vfmadd231sd %xmm11, %xmm11, %xmm14 #64.63 + vcomisd %xmm14, %xmm7 #74.22 + jbe ..B1.19 # 
Prob 50% #74.22 # LOE rax rdx rbx rsi rdi r8 r9 r10 r11 r12 r14 r15 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 ..B1.18: # Preds ..B1.17 # Execution count [1.25e+01] - vdivsd %xmm14, %xmm5, %xmm15 #75.38 + vdivsd %xmm14, %xmm5, %xmm15 #75.39 vmulsd %xmm15, %xmm6, %xmm14 #76.38 vmulsd %xmm15, %xmm14, %xmm14 #76.44 vmulsd %xmm15, %xmm14, %xmm14 #76.50 - vmulsd %xmm4, %xmm15, %xmm15 #77.54 - vmulsd %xmm14, %xmm15, %xmm15 #77.61 - vsubsd %xmm0, %xmm14, %xmm14 #77.54 - vmulsd %xmm14, %xmm15, %xmm15 #77.67 + vmulsd %xmm4, %xmm15, %xmm15 #77.55 + vmulsd %xmm14, %xmm15, %xmm15 #77.64 + vsubsd %xmm0, %xmm14, %xmm14 #77.55 + vmulsd %xmm14, %xmm15, %xmm15 #77.70 vfmadd231sd %xmm12, %xmm15, %xmm8 #78.17 vfmadd231sd %xmm15, %xmm13, %xmm9 #79.17 vfmadd231sd %xmm15, %xmm11, %xmm10 #80.17 # LOE rax rdx rbx rsi rdi r8 r9 r10 r11 r12 r14 r15 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 ..B1.19: # Preds ..B1.18 ..B1.17 # Execution count [2.50e+01] - incq %r15 #56.9 - cmpq %rdx, %r15 #56.9 - jb ..B1.17 # Prob 82% #56.9 + incq %r15 #59.9 + cmpq %rdx, %r15 #59.9 + jb ..B1.17 # Prob 82% #59.9 # LOE rax rdx rbx rsi rdi r8 r9 r10 r11 r12 r14 r15 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 ..B1.20: # Preds ..B1.19 # Execution count [4.50e+00] @@ -228,79 +228,83 @@ computeForceLJFullNeigh_plain_c: # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r14 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 ..B1.21: # Preds ..B1.20 ..B1.15 ..B1.41 # Execution count [4.50e+00] - vmovsd %xmm3, 192(%rsp) #71.22[spill] - vxorpd %xmm11, %xmm11, %xmm11 #47.22 - vmovsd %xmm8, %xmm11, %xmm13 #47.22 - vmovsd %xmm9, %xmm11, %xmm12 #48.22 - vmovsd %xmm10, %xmm11, %xmm11 #49.22 - vmovsd %xmm4, 200(%rsp) #71.22[spill] - vbroadcastsd %xmm3, %ymm10 #44.23 - vmovsd %xmm1, 176(%rsp) #71.22[spill] - vmovsd %xmm2, 184(%rsp) #71.22[spill] - vmovupd .L_2il0floatpacket.3(%rip), %ymm3 #71.22 - vmovupd .L_2il0floatpacket.2(%rip), %ymm4 #71.22 - vmovupd 32(%rsp), %ymm5 #71.22[spill] - vbroadcastsd %xmm2, %ymm9 #45.23 - vbroadcastsd %xmm1, %ymm8 #46.23 - movq %r8, 160(%rsp) #71.22[spill] - movq %r14, 168(%rsp) #71.22[spill] - movq %rcx, 24(%rsp) #71.22[spill] - vmovaps %xmm13, %xmm13 #47.22 - vmovaps %xmm12, %xmm12 #48.22 - vmovaps %xmm11, %xmm11 #49.22 + vmovsd %xmm3, 192(%rsp) #74.22[spill] + vxorpd %xmm11, %xmm11, %xmm11 #51.22 + vmovsd %xmm8, %xmm11, %xmm13 #51.22 + vmovsd %xmm9, %xmm11, %xmm12 #52.22 + vmovsd %xmm10, %xmm11, %xmm11 #53.22 + vmovsd %xmm4, 200(%rsp) #74.22[spill] + vbroadcastsd %xmm3, %ymm10 #48.23 + vmovsd %xmm1, 176(%rsp) #74.22[spill] + vmovsd %xmm2, 184(%rsp) #74.22[spill] + vmovupd .L_2il0floatpacket.3(%rip), %ymm3 #74.22 + vmovupd .L_2il0floatpacket.2(%rip), %ymm4 #74.22 + vmovupd 32(%rsp), %ymm5 #74.22[spill] + vbroadcastsd %xmm2, %ymm9 #49.23 + vbroadcastsd %xmm1, %ymm8 #50.23 + movq %r8, 160(%rsp) #74.22[spill] + movq %r14, 168(%rsp) #74.22[spill] + movq %rcx, 24(%rsp) #74.22[spill] + vmovaps %xmm13, %xmm13 #51.22 + vmovaps %xmm12, %xmm12 #52.22 + vmovaps %xmm11, %xmm11 #53.22 # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm3 ymm4 ymm5 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 +movl $111, %ebx # OSACA START MARKER +.byte 100 # OSACA START MARKER +.byte 103 # OSACA START MARKER +.byte 144 # OSACA START MARKER +# pointer_increment=32 724d27eafcb27eabca1528ddfdbdba3e # LLVM-MCA-BEGIN -# OSACA-BEGIN ..B1.22: # Preds ..B1.24 ..B1.21 # Execution count [2.50e+01] - vmovdqu (%rbx,%rdx,4), %xmm0 #57.21 - vmovq %xmm0, %rcx #57.21 - vpunpckhqdq %xmm0, 
%xmm0, %xmm2 #57.21 - vmovq %xmm2, %r15 #57.21 - movl %ecx, %r8d #57.21 - shrq $32, %rcx #57.21 - lea (%rcx,%rcx,2), %r14d #58.36 - lea (%r8,%r8,2), %r8d #58.36 - movslq %r8d, %rcx #58.36 - movslq %r14d, %r8 #58.36 - movl %r15d, %r14d #57.21 - shrq $32, %r15 #57.21 - vmovups (%r11,%rcx,8), %xmm7 #58.36 - vmovups (%r11,%r8,8), %xmm6 #58.36 - vmovq 16(%r11,%rcx,8), %xmm14 #58.36 - lea (%r14,%r14,2), %r14d #58.36 - movslq %r14d, %r14 #58.36 - lea (%r15,%r15,2), %r15d #58.36 - movslq %r15d, %r15 #58.36 - vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15 #58.36 - vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1 #58.36 - vmovq 16(%r11,%r14,8), %xmm0 #58.36 - vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6 #58.36 - vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2 #58.36 - vunpcklpd %ymm6, %ymm1, %ymm14 #58.36 - vunpckhpd %ymm6, %ymm1, %ymm1 #58.36 - vsubpd %ymm14, %ymm10, %ymm6 #58.36 - vinsertf128 $1, %xmm2, %ymm15, %ymm7 #58.36 - vsubpd %ymm1, %ymm9, %ymm2 #59.36 - vsubpd %ymm7, %ymm8, %ymm0 #60.36 - vmulpd %ymm2, %ymm2, %ymm14 #61.49 - vfmadd231pd %ymm6, %ymm6, %ymm14 #61.49 - vfmadd231pd %ymm0, %ymm0, %ymm14 #61.63 - vcmpltpd %ymm5, %ymm14, %ymm1 #71.22 - vpcmpeqd %ymm7, %ymm7, %ymm7 #71.22 - vptest %ymm7, %ymm1 #71.22 - je ..B1.24 # Prob 50% #71.22 + vmovdqu (%rbx,%rdx,4), %xmm0 #60.21 + vmovq %xmm0, %rcx #60.21 + vpunpckhqdq %xmm0, %xmm0, %xmm2 #60.21 + vmovq %xmm2, %r15 #60.21 + movl %ecx, %r8d #60.21 + shrq $32, %rcx #60.21 + lea (%rcx,%rcx,2), %r14d #61.36 + lea (%r8,%r8,2), %r8d #61.36 + movslq %r8d, %rcx #61.36 + movslq %r14d, %r8 #61.36 + movl %r15d, %r14d #60.21 + shrq $32, %r15 #60.21 + vmovups (%r11,%rcx,8), %xmm7 #61.36 + vmovups (%r11,%r8,8), %xmm6 #61.36 + vmovq 16(%r11,%rcx,8), %xmm14 #61.36 + lea (%r14,%r14,2), %r14d #61.36 + movslq %r14d, %r14 #61.36 + lea (%r15,%r15,2), %r15d #61.36 + movslq %r15d, %r15 #61.36 + vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15 #61.36 + vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1 #61.36 + vmovq 16(%r11,%r14,8), %xmm0 #61.36 + vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6 #61.36 + vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2 #61.36 + vunpcklpd %ymm6, %ymm1, %ymm14 #61.36 + vunpckhpd %ymm6, %ymm1, %ymm1 #61.36 + vsubpd %ymm14, %ymm10, %ymm6 #61.36 + vinsertf128 $1, %xmm2, %ymm15, %ymm7 #61.36 + vsubpd %ymm1, %ymm9, %ymm2 #62.36 + vsubpd %ymm7, %ymm8, %ymm0 #63.36 + vmulpd %ymm2, %ymm2, %ymm14 #64.49 + vfmadd231pd %ymm6, %ymm6, %ymm14 #64.49 + vfmadd231pd %ymm0, %ymm0, %ymm14 #64.63 + vcmpltpd %ymm5, %ymm14, %ymm1 #74.22 + vpcmpeqd %ymm7, %ymm7, %ymm7 #74.22 + vptest %ymm7, %ymm1 #74.22 + #je ..B1.24 # Prob 50% #74.22 # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ymm14 ..B1.23: # Preds ..B1.22 # Execution count [1.25e+01] - vdivpd %ymm14, %ymm4, %ymm7 #75.38 + vdivpd %ymm14, %ymm4, %ymm7 #75.39 vmulpd 96(%rsp), %ymm7, %ymm14 #76.38[spill] vmulpd %ymm14, %ymm7, %ymm14 #76.44 vmulpd %ymm14, %ymm7, %ymm15 #76.50 - vfmsub213pd %ymm3, %ymm7, %ymm14 #77.54 - vmulpd 64(%rsp), %ymm7, %ymm7 #77.54[spill] - vmulpd %ymm7, %ymm15, %ymm15 #77.61 - vmulpd %ymm14, %ymm15, %ymm7 #77.67 + vfmsub213pd %ymm3, %ymm7, %ymm14 #77.55 + vmulpd 64(%rsp), %ymm7, %ymm7 #77.55[spill] + vmulpd %ymm7, %ymm15, %ymm15 #77.64 + vmulpd %ymm14, %ymm15, %ymm7 #77.70 vmulpd %ymm7, %ymm6, %ymm6 #78.31 vmulpd %ymm7, %ymm2, %ymm2 #79.31 vandpd %ymm6, %ymm1, %ymm6 #78.31 @@ -313,21 +317,24 @@ computeForceLJFullNeigh_plain_c: # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm3 ymm4 ymm5 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ..B1.24: # Preds ..B1.23 ..B1.22 # Execution 
count [2.50e+01] - addq $4, %rdx #56.9 - cmpq %rsi, %rdx #56.9 - jb ..B1.22 # Prob 82% #56.9 -# OSACA-END + addq $4, %rdx #59.9 + cmpq %rsi, %rdx #59.9 + jb ..B1.22 # Prob 82% #59.9 # LLVM-MCA-END +movl $222, %ebx # OSACA END MARKER +.byte 100 # OSACA END MARKER +.byte 103 # OSACA END MARKER +.byte 144 # OSACA END MARKER # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm3 ymm4 ymm5 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ..B1.25: # Preds ..B1.24 # Execution count [4.50e+00] - vextractf128 $1, %ymm11, %xmm10 #49.22 + vextractf128 $1, %ymm11, %xmm10 #53.22 vmovsd 176(%rsp), %xmm1 #[spill] vmovsd 184(%rsp), %xmm2 #[spill] - vaddpd %xmm10, %xmm11, %xmm9 #49.22 - vunpckhpd %xmm9, %xmm9, %xmm8 #49.22 + vaddpd %xmm10, %xmm11, %xmm9 #53.22 + vunpckhpd %xmm9, %xmm9, %xmm8 #53.22 vmovsd 192(%rsp), %xmm3 #[spill] - vaddsd %xmm8, %xmm9, %xmm10 #49.22 + vaddsd %xmm8, %xmm9, %xmm10 #53.22 vmovsd 200(%rsp), %xmm4 #[spill] vmovsd 136(%rsp), %xmm6 #[spill] vmovsd 128(%rsp), %xmm7 #[spill] @@ -336,58 +343,58 @@ computeForceLJFullNeigh_plain_c: movq 24(%rsp), %rcx #[spill] vmovsd .L_2il0floatpacket.1(%rip), %xmm0 # vmovsd .L_2il0floatpacket.4(%rip), %xmm5 # - vextractf128 $1, %ymm12, %xmm14 #48.22 - vextractf128 $1, %ymm13, %xmm8 #47.22 - vaddpd %xmm14, %xmm12, %xmm15 #48.22 - vaddpd %xmm8, %xmm13, %xmm11 #47.22 - vunpckhpd %xmm15, %xmm15, %xmm9 #48.22 - vunpckhpd %xmm11, %xmm11, %xmm12 #47.22 - vaddsd %xmm9, %xmm15, %xmm9 #48.22 - vaddsd %xmm12, %xmm11, %xmm8 #47.22 + vextractf128 $1, %ymm12, %xmm14 #52.22 + vextractf128 $1, %ymm13, %xmm8 #51.22 + vaddpd %xmm14, %xmm12, %xmm15 #52.22 + vaddpd %xmm8, %xmm13, %xmm11 #51.22 + vunpckhpd %xmm15, %xmm15, %xmm9 #52.22 + vunpckhpd %xmm11, %xmm11, %xmm12 #51.22 + vaddsd %xmm9, %xmm15, %xmm9 #52.22 + vaddsd %xmm12, %xmm11, %xmm8 #51.22 # LOE rax rcx rsi rdi r8 r9 r10 r11 r12 r14 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 ..B1.26: # Preds ..B1.25 ..B1.39 # Execution count [5.00e+00] - cmpq %r12, %rsi #56.9 - jae ..B1.32 # Prob 10% #56.9 + cmpq %r12, %rsi #59.9 + jae ..B1.32 # Prob 10% #59.9 # LOE rax rcx rsi rdi r8 r9 r10 r11 r12 r14 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 ..B1.27: # Preds ..B1.26 # Execution count [4.50e+00] - imulq 144(%rsp), %rax #42.43[spill] - addq 152(%rsp), %rax #25.5[spill] + imulq 144(%rsp), %rax #46.43[spill] + addq 152(%rsp), %rax #23.5[spill] # LOE rax rcx rsi rdi r8 r9 r10 r11 r12 r14 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 ..B1.28: # Preds ..B1.30 ..B1.27 # Execution count [2.50e+01] - movl (%rax,%rsi,4), %edx #57.21 - lea (%rdx,%rdx,2), %ebx #58.36 - movslq %ebx, %rbx #58.36 - vsubsd 8(%r11,%rbx,8), %xmm2, %xmm13 #59.36 - vsubsd (%r11,%rbx,8), %xmm3, %xmm12 #58.36 - vsubsd 16(%r11,%rbx,8), %xmm1, %xmm11 #60.36 - vmulsd %xmm13, %xmm13, %xmm14 #61.49 - vfmadd231sd %xmm12, %xmm12, %xmm14 #61.63 - vfmadd231sd %xmm11, %xmm11, %xmm14 #61.63 - vcomisd %xmm14, %xmm7 #71.22 - jbe ..B1.30 # Prob 50% #71.22 + movl (%rax,%rsi,4), %edx #60.21 + lea (%rdx,%rdx,2), %ebx #61.36 + movslq %ebx, %rbx #61.36 + vsubsd 8(%r11,%rbx,8), %xmm2, %xmm13 #62.36 + vsubsd (%r11,%rbx,8), %xmm3, %xmm12 #61.36 + vsubsd 16(%r11,%rbx,8), %xmm1, %xmm11 #63.36 + vmulsd %xmm13, %xmm13, %xmm14 #64.49 + vfmadd231sd %xmm12, %xmm12, %xmm14 #64.63 + vfmadd231sd %xmm11, %xmm11, %xmm14 #64.63 + vcomisd %xmm14, %xmm7 #74.22 + jbe ..B1.30 # Prob 50% #74.22 # LOE rax rcx rsi rdi r8 r9 r10 r11 r12 r14 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 ..B1.29: # Preds ..B1.28 # Execution count 
[1.25e+01] - vdivsd %xmm14, %xmm5, %xmm15 #75.38 + vdivsd %xmm14, %xmm5, %xmm15 #75.39 vmulsd %xmm15, %xmm6, %xmm14 #76.38 vmulsd %xmm15, %xmm14, %xmm14 #76.44 vmulsd %xmm15, %xmm14, %xmm14 #76.50 - vmulsd %xmm4, %xmm15, %xmm15 #77.54 - vmulsd %xmm14, %xmm15, %xmm15 #77.61 - vsubsd %xmm0, %xmm14, %xmm14 #77.54 - vmulsd %xmm14, %xmm15, %xmm15 #77.67 + vmulsd %xmm4, %xmm15, %xmm15 #77.55 + vmulsd %xmm14, %xmm15, %xmm15 #77.64 + vsubsd %xmm0, %xmm14, %xmm14 #77.55 + vmulsd %xmm14, %xmm15, %xmm15 #77.70 vfmadd231sd %xmm12, %xmm15, %xmm8 #78.17 vfmadd231sd %xmm15, %xmm13, %xmm9 #79.17 vfmadd231sd %xmm15, %xmm11, %xmm10 #80.17 # LOE rax rcx rsi rdi r8 r9 r10 r11 r12 r14 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 ..B1.30: # Preds ..B1.29 ..B1.28 # Execution count [2.50e+01] - incq %rsi #56.9 - cmpq %r12, %rsi #56.9 - jb ..B1.28 # Prob 82% #56.9 + incq %rsi #59.9 + cmpq %r12, %rsi #59.9 + jb ..B1.28 # Prob 82% #59.9 # LOE rax rcx rsi rdi r8 r9 r10 r11 r12 r14 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 ..B1.32: # Preds ..B1.30 ..B1.8 ..B1.26 # Execution count [5.00e+00] @@ -401,16 +408,16 @@ computeForceLJFullNeigh_plain_c: vmovsd %xmm1, (%rdi,%r8) #89.9 vmovsd %xmm2, 8(%rdi,%r8) #90.9 vmovsd %xmm3, 16(%rdi,%r8) #91.9 - addq $24, %rdi #41.5 + addq $24, %rdi #45.5 lea 3(%rax,%r13), %edx #94.9 - movslq %ecx, %rax #41.32 + movslq %ecx, %rax #45.32 sarl $2, %edx #94.9 - incq %rcx #41.5 + incq %rcx #45.5 movslq %edx, %rdx #94.9 - incq %rax #41.32 + incq %rax #45.32 addq %rdx, %r10 #94.9 - cmpq 208(%rsp), %rcx #41.5[spill] - jb ..B1.8 # Prob 82% #41.5 + cmpq 208(%rsp), %rcx #45.5[spill] + jb ..B1.8 # Prob 82% #45.5 # LOE rax rcx rdi r8 r9 r10 r11 r14 xmm0 xmm4 xmm5 xmm6 xmm7 ..B1.33: # Preds ..B1.32 # Execution count [9.00e-01] @@ -435,10 +442,10 @@ computeForceLJFullNeigh_plain_c: # LOE r12 ..B1.35: # Preds ..B1.55 # Execution count [5.00e-01] - movl $.L_2__STRING.0, %edi #39.5 + movl $.L_2__STRING.0, %edi #42.5 ..___tag_value_computeForceLJFullNeigh_plain_c.64: # likwid_markerStartRegion(const char *) - call likwid_markerStartRegion #39.5 + call likwid_markerStartRegion #42.5 ..___tag_value_computeForceLJFullNeigh_plain_c.65: # LOE r12 ..B1.36: # Preds ..B1.33 ..B1.35 @@ -452,29 +459,29 @@ computeForceLJFullNeigh_plain_c: # LOE r12 ..B1.37: # Preds ..B1.36 # Execution count [1.00e+00] - xorl %eax, %eax #98.16 + xorl %eax, %eax #100.16 ..___tag_value_computeForceLJFullNeigh_plain_c.68: # getTimeStamp() - call getTimeStamp #98.16 + call getTimeStamp #100.16 ..___tag_value_computeForceLJFullNeigh_plain_c.69: # LOE r12 xmm0 ..B1.38: # Preds ..B1.37 # Execution count [1.00e+00] - vsubsd 16(%rsp), %xmm0, %xmm0 #102.14[spill] - addq $224, %rsp #102.14 + vsubsd 16(%rsp), %xmm0, %xmm0 #101.14[spill] + addq $224, %rsp #101.14 .cfi_restore 3 - popq %rbx #102.14 + popq %rbx #101.14 .cfi_restore 15 - popq %r15 #102.14 + popq %r15 #101.14 .cfi_restore 14 - popq %r14 #102.14 + popq %r14 #101.14 .cfi_restore 13 - popq %r13 #102.14 - movq %rbp, %rsp #102.14 - popq %rbp #102.14 + popq %r13 #101.14 + movq %rbp, %rsp #101.14 + popq %rbp #101.14 .cfi_def_cfa 7, 8 .cfi_restore 6 - ret #102.14 + ret #101.14 .cfi_def_cfa 6, 16 .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 .cfi_offset 6, -16 @@ -485,44 +492,44 @@ computeForceLJFullNeigh_plain_c: # LOE ..B1.39: # Preds ..B1.9 ..B1.12 ..B1.14 # Execution count [4.50e-01]: Infreq - xorl %esi, %esi #56.9 - jmp ..B1.26 # Prob 100% #56.9 + xorl %esi, %esi #59.9 + jmp ..B1.26 # Prob 
100% #59.9 # LOE rax rcx rsi rdi r8 r9 r10 r11 r12 r14 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 ..B1.41: # Preds ..B1.10 # Execution count [4.50e-01]: Infreq - movl %r13d, %esi #56.9 - xorl %edx, %edx #56.9 - andl $-4, %esi #56.9 - movslq %esi, %rsi #56.9 - jmp ..B1.21 # Prob 100% #56.9 + movl %r13d, %esi #59.9 + xorl %edx, %edx #59.9 + andl $-4, %esi #59.9 + movslq %esi, %rsi #59.9 + jmp ..B1.21 # Prob 100% #59.9 .cfi_restore 12 # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r14 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 ..B1.43: # Preds ..B1.2 # Execution count [1.00e+00]: Infreq - lea (%rbx,%rbx,2), %rcx #24.18 - cmpq $8, %rcx #32.5 - jl ..B1.51 # Prob 10% #32.5 + lea (%rbx,%rbx,2), %rcx #22.18 + cmpq $8, %rcx #33.5 + jl ..B1.51 # Prob 10% #33.5 # LOE rcx rbx rdi r12 r13 r14 r15 ..B1.44: # Preds ..B1.43 # Execution count [1.00e+00]: Infreq - movl %ecx, %eax #32.5 - xorl %edx, %edx #32.5 - andl $-8, %eax #32.5 - movslq %eax, %rax #32.5 - vxorpd %ymm0, %ymm0, %ymm0 #33.22 + movl %ecx, %eax #33.5 + xorl %edx, %edx #33.5 + andl $-8, %eax #33.5 + movslq %eax, %rax #33.5 + vxorpd %ymm0, %ymm0, %ymm0 #34.22 # LOE rax rdx rcx rbx rdi r12 r13 r14 r15 ymm0 ..B1.45: # Preds ..B1.45 ..B1.44 # Execution count [5.56e+00]: Infreq - vmovupd %ymm0, (%rdi,%rdx,8) #33.9 - vmovupd %ymm0, 32(%rdi,%rdx,8) #33.9 - addq $8, %rdx #32.5 - cmpq %rax, %rdx #32.5 - jb ..B1.45 # Prob 82% #32.5 + vmovupd %ymm0, (%rdi,%rdx,8) #34.9 + vmovupd %ymm0, 32(%rdi,%rdx,8) #34.9 + addq $8, %rdx #33.5 + cmpq %rax, %rdx #33.5 + jb ..B1.45 # Prob 82% #33.5 # LOE rax rdx rcx rbx rdi r12 r13 r14 r15 ymm0 ..B1.47: # Preds ..B1.45 ..B1.51 # Execution count [1.11e+00]: Infreq - cmpq %rcx, %rax #32.5 - jae ..B1.5 # Prob 10% #32.5 + cmpq %rcx, %rax #33.5 + jae ..B1.5 # Prob 10% #33.5 # LOE rax rcx rbx rdi r12 r13 r14 r15 ..B1.48: # Preds ..B1.47 # Execution count [1.00e+00]: Infreq @@ -530,16 +537,16 @@ computeForceLJFullNeigh_plain_c: # LOE rax rdx rcx rbx rdi r12 r13 r14 r15 ..B1.49: # Preds ..B1.48 ..B1.49 # Execution count [5.56e+00]: Infreq - movq %rdx, (%rdi,%rax,8) #33.9 - incq %rax #32.5 - cmpq %rcx, %rax #32.5 - jb ..B1.49 # Prob 82% #32.5 - jmp ..B1.5 # Prob 100% #32.5 + movq %rdx, (%rdi,%rax,8) #34.9 + incq %rax #33.5 + cmpq %rcx, %rax #33.5 + jb ..B1.49 # Prob 82% #33.5 + jmp ..B1.5 # Prob 100% #33.5 # LOE rax rdx rcx rbx rdi r12 r13 r14 r15 ..B1.51: # Preds ..B1.43 # Execution count [1.00e-01]: Infreq - xorl %eax, %eax #32.5 - jmp ..B1.47 # Prob 100% #32.5 + xorl %eax, %eax #33.5 + jmp ..B1.47 # Prob 100% #33.5 .align 16,0x90 # LOE rax rcx rbx rdi r12 r13 r14 r15 .cfi_endproc @@ -567,609 +574,579 @@ computeForceLJHalfNeigh: .cfi_startproc ..___tag_value_computeForceLJHalfNeigh.86: ..L87: - #105.96 - pushq %rbp #105.96 + #104.96 + pushq %rbp #104.96 .cfi_def_cfa_offset 16 - movq %rsp, %rbp #105.96 + movq %rsp, %rbp #104.96 .cfi_def_cfa 6, 16 .cfi_offset 6, -16 - andq $-32, %rsp #105.96 - pushq %r12 #105.96 - pushq %r13 #105.96 - pushq %r14 #105.96 - pushq %r15 #105.96 - pushq %rbx #105.96 - subq $248, %rsp #105.96 + andq $-32, %rsp #104.96 + pushq %r12 #104.96 + pushq %r13 #104.96 + pushq %r14 #104.96 + pushq %r15 #104.96 + pushq %rbx #104.96 + subq $216, %rsp #104.96 .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 
0xff, 0xff, 0x22 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 - movq %rdi, %r12 #105.96 - movq %rsi, %r14 #105.96 - movq %rcx, %r15 #105.96 - movq %rdx, 32(%rsp) #105.96[spill] - vmovsd 144(%r12), %xmm0 #109.27 - vmulsd %xmm0, %xmm0, %xmm1 #109.45 - vmovsd 56(%r12), %xmm2 #110.23 - vmovsd 40(%r12), %xmm3 #111.24 - movl 4(%r14), %r13d #106.18 - vmovsd %xmm1, 56(%rsp) #109.45[spill] - vmovsd %xmm2, 48(%rsp) #110.23[spill] - vmovsd %xmm3, 24(%rsp) #111.24[spill] - testl %r13d, %r13d #114.24 - jle ..B2.51 # Prob 50% #114.24 - # LOE r12 r14 r15 r13d + movq %rsi, %r13 #104.96 + vmovsd 144(%rdi), %xmm0 #108.27 + movq %rcx, %r12 #104.96 + vmulsd %xmm0, %xmm0, %xmm1 #108.45 + movq %rdx, %r14 #104.96 + vmovsd 56(%rdi), %xmm2 #109.23 + vmovsd 40(%rdi), %xmm3 #110.24 + movl 4(%r13), %r15d #105.18 + vmovsd %xmm1, 32(%rsp) #108.45[spill] + vmovsd %xmm2, 24(%rsp) #109.23[spill] + vmovsd %xmm3, 16(%rsp) #110.24[spill] + testl %r15d, %r15d #116.24 + jle ..B2.51 # Prob 50% #116.24 + # LOE r12 r13 r14 r15d ..B2.2: # Preds ..B2.1 # Execution count [5.00e-03] - movslq %r13d, %r13 #106.18 - movq 64(%r14), %rdi #115.9 - lea (%r13,%r13,2), %eax #106.18 - movq %r13, 40(%rsp) #106.18[spill] - cmpl $12, %eax #114.5 - jle ..B2.59 # Prob 0% #114.5 - # LOE rdi r12 r13 r14 r15 r13d + movq 64(%r13), %rdi #117.9 + lea (%r15,%r15,2), %eax #105.18 + movslq %r15d, %rbx #105.18 + cmpl $12, %eax #116.5 + jle ..B2.57 # Prob 0% #116.5 + # LOE rbx rdi r12 r13 r14 r15d ..B2.3: # Preds ..B2.2 # Execution count [1.00e+00] - movq %r13, %rax #114.5 - xorl %esi, %esi #114.5 - lea (%rax,%rax,2), %rdx #114.5 - shlq $3, %rdx #114.5 - call __intel_avx_rep_memset #114.5 - # LOE r12 r14 r15 r13d -..B2.5: # Preds ..B2.65 ..B2.3 ..B2.63 + xorl %esi, %esi #116.5 + lea (%rbx,%rbx,2), %rdx #116.5 + shlq $3, %rdx #116.5 + call __intel_avx_rep_memset #116.5 + # LOE rbx r12 r13 r14 r15d +..B2.5: # Preds ..B2.63 ..B2.3 ..B2.61 # Execution count [1.00e+00] - xorl %ebx, %ebx #120.22 - xorl %eax, %eax #121.16 - vzeroupper #121.16 -..___tag_value_computeForceLJHalfNeigh.101: + xorl %eax, %eax #122.16 + vzeroupper #122.16 +..___tag_value_computeForceLJHalfNeigh.99: # getTimeStamp() - call getTimeStamp #121.16 -..___tag_value_computeForceLJHalfNeigh.102: - # LOE r12 r14 r15 ebx r13d xmm0 -..B2.70: # Preds ..B2.5 + call getTimeStamp #122.16 +..___tag_value_computeForceLJHalfNeigh.100: + # LOE rbx r12 r13 r14 r15d xmm0 +..B2.68: # Preds ..B2.5 # Execution count [1.00e+00] - vmovsd %xmm0, 16(%rsp) #121.16[spill] - # LOE r12 r14 r15 ebx r13d -..B2.6: # Preds ..B2.70 + vmovsd %xmm0, 8(%rsp) #122.16[spill] + # LOE rbx r12 r13 r14 r15d +..B2.6: # Preds ..B2.68 # Execution count [5.00e-01] - movl $.L_2__STRING.1, %edi #122.5 -..___tag_value_computeForceLJHalfNeigh.104: + movl $.L_2__STRING.1, %edi #126.5 +..___tag_value_computeForceLJHalfNeigh.102: # likwid_markerStartRegion(const char *) - call likwid_markerStartRegion #122.5 -..___tag_value_computeForceLJHalfNeigh.105: - # LOE r12 r14 r15 ebx r13d + call likwid_markerStartRegion #126.5 +..___tag_value_computeForceLJHalfNeigh.103: + # LOE rbx r12 r13 r14 r15d ..B2.7: # Preds ..B2.6 # Execution count [9.00e-01] - vmovsd 24(%rsp), %xmm5 #161.41[spill] - vmovd %r13d, %xmm0 #106.18 - vmulsd .L_2il0floatpacket.0(%rip), %xmm5, %xmm5 #161.41 - xorl %r9d, %r9d #124.15 - movq 32(%rsp), %rdx #125.19[spill] - xorl %r8d, %r8d #124.5 - vmovddup 
56(%rsp), %xmm8 #109.25[spill] - xorl %esi, %esi #124.5 - vmovddup 48(%rsp), %xmm4 #110.21[spill] - movslq 8(%rdx), %rax #125.43 - shlq $2, %rax #107.5 - movq 16(%rdx), %rdi #125.19 - vmovddup %xmm5, %xmm3 #161.41 - vpbroadcastd %xmm0, %xmm1 #106.18 - movq 24(%rdx), %rcx #126.25 - movq 16(%r14), %rdx #127.25 - movq %rax, 64(%rsp) #124.5[spill] - vmovsd .L_2il0floatpacket.4(%rip), %xmm7 #159.32 - vmovdqu .L_2il0floatpacket.6(%rip), %xmm9 #147.36 - vmovsd .L_2il0floatpacket.1(%rip), %xmm0 #161.54 - movq 64(%r14), %r14 #168.21 - movq (%r15), %r11 #179.9 - movq 8(%r15), %r10 #180.9 - vmovdqu %xmm1, 192(%rsp) #124.5[spill] - vmovupd %xmm3, 176(%rsp) #124.5[spill] - vmovupd %xmm4, 160(%rsp) #124.5[spill] - vmovupd %xmm8, 208(%rsp) #124.5[spill] - movq %rdi, 72(%rsp) #124.5[spill] - movl %r13d, 80(%rsp) #124.5[spill] - movq %r12, (%rsp) #124.5[spill] - movq %r15, 8(%rsp) #124.5[spill] - vmovsd 48(%rsp), %xmm6 #124.5[spill] - vmovsd 56(%rsp), %xmm2 #124.5[spill] - movq 40(%rsp), %rax #124.5[spill] - # LOE rax rdx rcx rsi r8 r9 r10 r11 r14 ebx xmm0 xmm2 xmm5 xmm6 xmm7 + vmovsd 16(%rsp), %xmm6 #165.42[spill] + vmovd %r15d, %xmm0 #105.18 + vmulsd .L_2il0floatpacket.0(%rip), %xmm6, %xmm6 #165.42 + xorl %eax, %eax #129.15 + vmovddup 32(%rsp), %xmm8 #108.25[spill] + xorl %ecx, %ecx #129.5 + vmovddup 24(%rsp), %xmm4 #109.21[spill] + xorl %r9d, %r9d #129.5 + vmovddup %xmm6, %xmm3 #165.42 + vpbroadcastd %xmm0, %xmm1 #105.18 + movq 16(%r14), %rdx #130.19 + movslq 8(%r14), %rsi #130.43 + movq 24(%r14), %r11 #131.25 + vmovdqu .L_2il0floatpacket.6(%rip), %xmm9 #151.36 + vmovsd .L_2il0floatpacket.4(%rip), %xmm5 #163.32 + vmovsd .L_2il0floatpacket.1(%rip), %xmm0 #165.55 + shlq $2, %rsi #106.5 + movq 16(%r13), %r14 #132.25 + movq 64(%r13), %rdi #172.21 + movq (%r12), %r10 #183.9 + movq 8(%r12), %r8 #184.9 + vmovdqu %xmm1, 176(%rsp) #129.5[spill] + vmovupd %xmm3, 160(%rsp) #129.5[spill] + vmovupd %xmm4, 144(%rsp) #129.5[spill] + vmovupd %xmm8, 192(%rsp) #129.5[spill] + movq %rdx, 40(%rsp) #129.5[spill] + movl %r15d, 48(%rsp) #129.5[spill] + movq %r12, (%rsp) #129.5[spill] + vmovsd 24(%rsp), %xmm7 #129.5[spill] + vmovsd 32(%rsp), %xmm2 #129.5[spill] + # LOE rax rcx rbx rsi rdi r8 r9 r10 r11 r14 xmm0 xmm2 xmm5 xmm6 xmm7 ..B2.8: # Preds ..B2.49 ..B2.7 # Execution count [5.00e+00] - movl (%rcx,%r8,4), %edi #126.25 - addl %edi, %ebx #138.9 - vxorpd %xmm10, %xmm10, %xmm10 #130.22 - testl %edi, %edi #143.9 - vmovapd %xmm10, %xmm11 #131.22 - vmovsd (%rsi,%rdx), %xmm4 #127.25 - vmovapd %xmm11, %xmm12 #132.22 - vmovsd 8(%rsi,%rdx), %xmm3 #128.25 - vmovsd 16(%rsi,%rdx), %xmm1 #129.25 - jle ..B2.48 # Prob 50% #143.9 - # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r14 edi xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 + movl (%r11,%rcx,4), %edx #131.25 + testl %edx, %edx #147.9 + vxorpd %xmm10, %xmm10, %xmm10 #135.22 + vmovapd %xmm10, %xmm11 #136.22 + vmovsd (%r9,%r14), %xmm4 #132.25 + vmovapd %xmm11, %xmm12 #137.22 + vmovsd 8(%r9,%r14), %xmm3 #133.25 + vmovsd 16(%r9,%r14), %xmm1 #134.25 + jle ..B2.48 # Prob 50% #147.9 + # LOE rax rcx rbx rsi rdi r8 r9 r10 r11 r14 edx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 ..B2.9: # Preds ..B2.8 # Execution count [2.50e+00] - jbe ..B2.48 # Prob 50% #143.9 - # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r14 edi xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 + jbe ..B2.48 # Prob 50% #147.9 + # LOE rax rcx rbx rsi rdi r8 r9 r10 r11 r14 edx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 ..B2.10: # Preds ..B2.9 # Execution count [2.25e+00] - cmpl $2, %edi 
#143.9 - jb ..B2.58 # Prob 10% #143.9 - # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r14 edi xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 + cmpl $2, %edx #147.9 + jb ..B2.56 # Prob 10% #147.9 + # LOE rax rcx rbx rsi rdi r8 r9 r10 r11 r14 edx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 ..B2.11: # Preds ..B2.10 # Execution count [2.25e+00] - movq 64(%rsp), %r13 #125.43[spill] - movl %edi, %r12d #143.9 - imulq %r9, %r13 #125.43 - vxorpd %xmm14, %xmm14, %xmm14 #130.22 - andl $-2, %r12d #143.9 - vmovapd %xmm14, %xmm13 #131.22 - addq 72(%rsp), %r13 #107.5[spill] - xorl %r15d, %r15d #143.9 - vmovddup %xmm4, %xmm10 #127.23 - vmovapd %xmm13, %xmm11 #132.22 - vmovddup %xmm3, %xmm9 #128.23 - vmovddup %xmm1, %xmm8 #129.23 - movslq %r12d, %r12 #143.9 - vmovsd %xmm1, 128(%rsp) #143.9[spill] - vmovsd %xmm3, 136(%rsp) #143.9[spill] - vmovsd %xmm4, 144(%rsp) #143.9[spill] - vmovsd %xmm5, 152(%rsp) #143.9[spill] - movq %r9, 24(%rsp) #143.9[spill] - movl %edi, 32(%rsp) #143.9[spill] - movq %rsi, 88(%rsp) #143.9[spill] - movq %r10, 96(%rsp) #143.9[spill] - movq %r11, 104(%rsp) #143.9[spill] - movq %rcx, 112(%rsp) #143.9[spill] - movq %r8, 120(%rsp) #143.9[spill] - vmovdqu .L_2il0floatpacket.6(%rip), %xmm6 #143.9 - vmovdqu .L_2il0floatpacket.5(%rip), %xmm7 #143.9 - # LOE rdx rbx r12 r13 r14 r15 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 + movq %rsi, %r13 #130.43 + movl %edx, %r12d #147.9 + imulq %rax, %r13 #130.43 + vxorpd %xmm14, %xmm14, %xmm14 #135.22 + andl $-2, %r12d #147.9 + vmovapd %xmm14, %xmm13 #136.22 + vmovsd %xmm6, 136(%rsp) #147.9[spill] + vmovapd %xmm13, %xmm11 #137.22 + addq 40(%rsp), %r13 #106.5[spill] + xorl %r15d, %r15d #147.9 + vmovddup %xmm4, %xmm10 #132.23 + vmovddup %xmm3, %xmm9 #133.23 + vmovddup %xmm1, %xmm8 #134.23 + movslq %r12d, %r12 #147.9 + vmovsd %xmm1, 112(%rsp) #147.9[spill] + vmovsd %xmm3, 120(%rsp) #147.9[spill] + vmovsd %xmm4, 128(%rsp) #147.9[spill] + movl %edx, 16(%rsp) #147.9[spill] + movq %r9, 56(%rsp) #147.9[spill] + movq %rsi, 64(%rsp) #147.9[spill] + movq %r8, 72(%rsp) #147.9[spill] + movq %r10, 80(%rsp) #147.9[spill] + movq %r11, 88(%rsp) #147.9[spill] + movq %rcx, 96(%rsp) #147.9[spill] + movq %rbx, 104(%rsp) #147.9[spill] + vmovdqu .L_2il0floatpacket.6(%rip), %xmm6 #147.9 + vmovdqu .L_2il0floatpacket.5(%rip), %xmm7 #147.9 + # LOE rax rdi r12 r13 r14 r15 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 ..B2.12: # Preds ..B2.38 ..B2.11 # Execution count [1.25e+01] - vmovq (%r13,%r15,4), %xmm4 #144.21 - vpaddd %xmm4, %xmm4, %xmm0 #145.36 - vpaddd %xmm0, %xmm4, %xmm1 #145.36 - vmovd %xmm1, %r9d #145.36 - vpaddd %xmm7, %xmm1, %xmm12 #146.36 - vpshufd $57, %xmm1, %xmm2 #145.36 - vpshufd $57, %xmm12, %xmm15 #146.36 - vmovd %xmm2, %r8d #145.36 - vmovd %xmm12, %edi #146.36 - vmovd %xmm15, %ecx #146.36 - movslq %r9d, %r9 #145.36 - movslq %r8d, %r8 #145.36 - movslq %edi, %rdi #146.36 - movslq %ecx, %rcx #146.36 - vmovsd (%rdx,%r9,8), %xmm3 #145.36 - vmovhpd (%rdx,%r8,8), %xmm3, %xmm5 #145.36 - vsubpd %xmm5, %xmm10, %xmm0 #145.36 - vpaddd %xmm6, %xmm1, %xmm5 #147.36 - vmovd %xmm5, %eax #147.36 - vpshufd $57, %xmm5, %xmm1 #147.36 - vmovsd (%rdx,%rdi,8), %xmm2 #146.36 - vmovd %xmm1, %r10d #147.36 - vmovhpd (%rdx,%rcx,8), %xmm2, %xmm3 #146.36 - vpcmpeqd %xmm1, %xmm1, %xmm1 #158.22 - vsubpd %xmm3, %xmm9, %xmm2 #146.36 - movslq %eax, %rax #147.36 - movslq %r10d, %r10 #147.36 - vmovsd (%rdx,%rax,8), %xmm12 #147.36 - vmovhpd (%rdx,%r10,8), %xmm12, %xmm15 #147.36 - vsubpd %xmm15, %xmm8, %xmm3 #147.36 - vmulpd %xmm2, %xmm2, %xmm15 #148.49 - vfmadd231pd %xmm0, %xmm0, 
%xmm15 #148.49 - vfmadd231pd %xmm3, %xmm3, %xmm15 #148.63 - vcmpltpd 208(%rsp), %xmm15, %xmm5 #158.22[spill] - vptest %xmm1, %xmm5 #158.22 - je ..B2.38 # Prob 50% #158.22 - # LOE rax rdx rcx rbx rdi r8 r9 r10 r12 r13 r14 r15 xmm0 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 xmm15 + vmovq (%r13,%r15,4), %xmm4 #148.21 + vpaddd %xmm4, %xmm4, %xmm0 #149.36 + vpaddd %xmm0, %xmm4, %xmm1 #149.36 + vmovd %xmm1, %r8d #149.36 + vpaddd %xmm7, %xmm1, %xmm12 #150.36 + vpshufd $57, %xmm1, %xmm2 #149.36 + vpshufd $57, %xmm12, %xmm15 #150.36 + vmovd %xmm2, %esi #149.36 + vmovd %xmm12, %ebx #150.36 + vmovd %xmm15, %ecx #150.36 + movslq %r8d, %r8 #149.36 + movslq %esi, %rsi #149.36 + movslq %ebx, %rbx #150.36 + movslq %ecx, %rcx #150.36 + vmovsd (%r14,%r8,8), %xmm3 #149.36 + vmovhpd (%r14,%rsi,8), %xmm3, %xmm5 #149.36 + vsubpd %xmm5, %xmm10, %xmm0 #149.36 + vpaddd %xmm6, %xmm1, %xmm5 #151.36 + vmovd %xmm5, %edx #151.36 + vpshufd $57, %xmm5, %xmm1 #151.36 + vmovsd (%r14,%rbx,8), %xmm2 #150.36 + vmovd %xmm1, %r9d #151.36 + vmovhpd (%r14,%rcx,8), %xmm2, %xmm3 #150.36 + vpcmpeqd %xmm1, %xmm1, %xmm1 #162.22 + vsubpd %xmm3, %xmm9, %xmm2 #150.36 + movslq %edx, %rdx #151.36 + movslq %r9d, %r9 #151.36 + vmovsd (%r14,%rdx,8), %xmm12 #151.36 + vmovhpd (%r14,%r9,8), %xmm12, %xmm15 #151.36 + vsubpd %xmm15, %xmm8, %xmm3 #151.36 + vmulpd %xmm2, %xmm2, %xmm15 #152.49 + vfmadd231pd %xmm0, %xmm0, %xmm15 #152.49 + vfmadd231pd %xmm3, %xmm3, %xmm15 #152.63 + vcmpltpd 192(%rsp), %xmm15, %xmm5 #162.22[spill] + vptest %xmm1, %xmm5 #162.22 + je ..B2.38 # Prob 50% #162.22 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r12 r13 r14 r15 xmm0 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 xmm15 ..B2.13: # Preds ..B2.12 # Execution count [6.25e+00] - vmovupd .L_2il0floatpacket.7(%rip), %xmm12 #159.38 - vdivpd %xmm15, %xmm12, %xmm1 #159.38 - vmovdqu 192(%rsp), %xmm12 #167.24[spill] - vpcmpeqd %xmm15, %xmm15, %xmm15 #167.24 - vpcmpgtd %xmm4, %xmm12, %xmm4 #167.24 - vmulpd 160(%rsp), %xmm1, %xmm12 #160.38[spill] - vmulpd %xmm12, %xmm1, %xmm12 #160.44 - vpmovsxdq %xmm4, %xmm4 #167.24 - vandpd %xmm4, %xmm5, %xmm4 #167.24 - vptest %xmm15, %xmm4 #167.24 - vmulpd %xmm12, %xmm1, %xmm15 #160.50 - vfmsub213pd .L_2il0floatpacket.8(%rip), %xmm1, %xmm12 #161.54 - vmulpd 176(%rsp), %xmm1, %xmm1 #161.54[spill] - vmulpd %xmm1, %xmm15, %xmm1 #161.61 - vmulpd %xmm12, %xmm1, %xmm15 #161.67 - vmulpd %xmm15, %xmm0, %xmm12 #162.31 - vmulpd %xmm15, %xmm2, %xmm1 #163.31 - vmulpd %xmm15, %xmm3, %xmm0 #164.31 - vandpd %xmm12, %xmm5, %xmm2 #162.31 - vandpd %xmm1, %xmm5, %xmm3 #163.31 - vandpd %xmm0, %xmm5, %xmm5 #164.31 - vaddpd %xmm2, %xmm14, %xmm14 #162.17 - vaddpd %xmm3, %xmm13, %xmm13 #163.17 - vaddpd %xmm5, %xmm11, %xmm11 #164.17 - je ..B2.38 # Prob 50% #167.24 - # LOE rax rdx rcx rbx rdi r8 r9 r10 r12 r13 r14 r15 xmm0 xmm1 xmm4 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 + vmovupd .L_2il0floatpacket.7(%rip), %xmm12 #163.39 + vdivpd %xmm15, %xmm12, %xmm1 #163.39 + vmovdqu 176(%rsp), %xmm12 #171.24[spill] + vpcmpeqd %xmm15, %xmm15, %xmm15 #171.24 + vpcmpgtd %xmm4, %xmm12, %xmm4 #171.24 + vmulpd 144(%rsp), %xmm1, %xmm12 #164.38[spill] + vmulpd %xmm12, %xmm1, %xmm12 #164.44 + vpmovsxdq %xmm4, %xmm4 #171.24 + vandpd %xmm4, %xmm5, %xmm4 #171.24 + vptest %xmm15, %xmm4 #171.24 + vmulpd %xmm12, %xmm1, %xmm15 #164.50 + vfmsub213pd .L_2il0floatpacket.8(%rip), %xmm1, %xmm12 #165.55 + vmulpd 160(%rsp), %xmm1, %xmm1 #165.55[spill] + vmulpd %xmm1, %xmm15, %xmm1 #165.64 + vmulpd %xmm12, %xmm1, %xmm15 #165.70 + vmulpd %xmm15, %xmm0, %xmm12 
#166.31 + vmulpd %xmm15, %xmm2, %xmm1 #167.31 + vmulpd %xmm15, %xmm3, %xmm0 #168.31 + vandpd %xmm12, %xmm5, %xmm2 #166.31 + vandpd %xmm1, %xmm5, %xmm3 #167.31 + vandpd %xmm0, %xmm5, %xmm5 #168.31 + vaddpd %xmm2, %xmm14, %xmm14 #166.17 + vaddpd %xmm3, %xmm13, %xmm13 #167.17 + vaddpd %xmm5, %xmm11, %xmm11 #168.17 + je ..B2.38 # Prob 50% #171.24 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r12 r13 r14 r15 xmm0 xmm1 xmm4 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 ..B2.14: # Preds ..B2.13 # Execution count [3.12e+00] - vmovmskpd %xmm4, %esi #168.21 - movl %esi, %r11d #168.21 - andl $2, %r11d #168.21 - andl $1, %esi #168.21 - je ..B2.17 # Prob 40% #168.21 - # LOE rax rdx rcx rbx rdi r8 r9 r10 r12 r13 r14 r15 esi r11d xmm0 xmm1 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 + vmovmskpd %xmm4, %r11d #172.21 + movl %r11d, %r10d #172.21 + andl $2, %r10d #172.21 + andl $1, %r11d #172.21 + je ..B2.17 # Prob 40% #172.21 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r12 r13 r14 r15 r10d r11d xmm0 xmm1 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 ..B2.15: # Preds ..B2.14 # Execution count [3.12e+00] - vmovsd (%r14,%r9,8), %xmm2 #168.21 - testl %r11d, %r11d #168.21 - jne ..B2.18 # Prob 60% #168.21 - # LOE rax rdx rcx rbx rdi r8 r9 r10 r12 r13 r14 r15 esi r11d xmm0 xmm1 xmm2 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 + vmovsd (%rdi,%r8,8), %xmm2 #172.21 + testl %r10d, %r10d #172.21 + jne ..B2.18 # Prob 60% #172.21 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r12 r13 r14 r15 r10d r11d xmm0 xmm1 xmm2 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 ..B2.16: # Preds ..B2.15 # Execution count [1.25e+00] - vxorpd %xmm3, %xmm3, %xmm3 #168.21 - vunpcklpd %xmm3, %xmm2, %xmm4 #168.21 - vsubpd %xmm12, %xmm4, %xmm2 #168.21 - jmp ..B2.31 # Prob 100% #168.21 - # LOE rax rdx rbx rdi r9 r12 r13 r14 r15 xmm0 xmm1 xmm2 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 + vxorpd %xmm3, %xmm3, %xmm3 #172.21 + vunpcklpd %xmm3, %xmm2, %xmm4 #172.21 + vsubpd %xmm12, %xmm4, %xmm2 #172.21 + jmp ..B2.31 # Prob 100% #172.21 + # LOE rax rdx rbx rdi r8 r12 r13 r14 r15 xmm0 xmm1 xmm2 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 ..B2.17: # Preds ..B2.14 # Execution count [3.12e+00] - testl %r11d, %r11d #168.21 - vxorpd %xmm2, %xmm2, %xmm2 #168.21 - je ..B2.30 # Prob 40% #168.21 - # LOE rax rdx rcx rbx rdi r8 r9 r10 r12 r13 r14 r15 esi r11d xmm0 xmm1 xmm2 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 + testl %r10d, %r10d #172.21 + vxorpd %xmm2, %xmm2, %xmm2 #172.21 + je ..B2.30 # Prob 40% #172.21 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r12 r13 r14 r15 r10d r11d xmm0 xmm1 xmm2 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 ..B2.18: # Preds ..B2.15 ..B2.17 # Execution count [3.12e+00] - vmovhpd (%r14,%r8,8), %xmm2, %xmm3 #168.21 - testl %esi, %esi #168.21 - vsubpd %xmm12, %xmm3, %xmm2 #168.21 - je ..B2.20 # Prob 40% #168.21 - # LOE rax rdx rcx rbx rdi r8 r9 r10 r12 r13 r14 r15 esi r11d xmm0 xmm1 xmm2 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 + vmovhpd (%rdi,%rsi,8), %xmm2, %xmm3 #172.21 + testl %r11d, %r11d #172.21 + vsubpd %xmm12, %xmm3, %xmm2 #172.21 + je ..B2.20 # Prob 40% #172.21 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r12 r13 r14 r15 r10d r11d xmm0 xmm1 xmm2 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 ..B2.19: # Preds ..B2.18 # Execution count [1.88e+00] - vpshufd $14, %xmm2, %xmm3 #168.21 - vmovsd %xmm2, (%r14,%r9,8) #168.21 - vmovsd %xmm3, (%r14,%r8,8) #168.21 - vmovsd (%r14,%rdi,8), %xmm2 #169.21 - jmp ..B2.21 # Prob 100% #169.21 - # LOE rax rdx rcx rbx rdi r10 r12 r13 r14 r15 esi r11d xmm0 xmm1 xmm2 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 
xmm13 xmm14 + vpshufd $14, %xmm2, %xmm3 #172.21 + vmovsd %xmm2, (%rdi,%r8,8) #172.21 + vmovsd %xmm3, (%rdi,%rsi,8) #172.21 + vmovsd (%rdi,%rbx,8), %xmm2 #173.21 + jmp ..B2.21 # Prob 100% #173.21 + # LOE rax rdx rcx rbx rdi r9 r12 r13 r14 r15 r10d r11d xmm0 xmm1 xmm2 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 ..B2.20: # Preds ..B2.18 # Execution count [1.25e+00] - vpshufd $14, %xmm2, %xmm2 #168.21 - vmovsd %xmm2, (%r14,%r8,8) #168.21 - vxorpd %xmm2, %xmm2, %xmm2 #169.21 - # LOE rax rdx rcx rbx rdi r10 r12 r13 r14 r15 esi r11d xmm0 xmm1 xmm2 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 + vpshufd $14, %xmm2, %xmm2 #172.21 + vmovsd %xmm2, (%rdi,%rsi,8) #172.21 + vxorpd %xmm2, %xmm2, %xmm2 #173.21 + # LOE rax rdx rcx rbx rdi r9 r12 r13 r14 r15 r10d r11d xmm0 xmm1 xmm2 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 ..B2.21: # Preds ..B2.19 ..B2.20 # Execution count [1.88e+00] - testl %r11d, %r11d #169.21 - je ..B2.74 # Prob 40% #169.21 - # LOE rax rdx rcx rbx rdi r10 r12 r13 r14 r15 esi r11d xmm0 xmm1 xmm2 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 + testl %r10d, %r10d #173.21 + je ..B2.72 # Prob 40% #173.21 + # LOE rax rdx rcx rbx rdi r9 r12 r13 r14 r15 r10d r11d xmm0 xmm1 xmm2 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 ..B2.22: # Preds ..B2.21 # Execution count [3.12e+00] - vmovhpd (%r14,%rcx,8), %xmm2, %xmm3 #169.21 - testl %esi, %esi #169.21 - vsubpd %xmm1, %xmm3, %xmm1 #169.21 - je ..B2.24 # Prob 40% #169.21 - # LOE rax rdx rcx rbx rdi r10 r12 r13 r14 r15 esi r11d xmm0 xmm1 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 + vmovhpd (%rdi,%rcx,8), %xmm2, %xmm3 #173.21 + testl %r11d, %r11d #173.21 + vsubpd %xmm1, %xmm3, %xmm1 #173.21 + je ..B2.24 # Prob 40% #173.21 + # LOE rax rdx rcx rbx rdi r9 r12 r13 r14 r15 r10d r11d xmm0 xmm1 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 ..B2.23: # Preds ..B2.22 # Execution count [1.88e+00] - vpshufd $14, %xmm1, %xmm2 #169.21 - vmovsd %xmm1, (%r14,%rdi,8) #169.21 - vmovsd %xmm2, (%r14,%rcx,8) #169.21 - vmovsd (%r14,%rax,8), %xmm1 #170.21 - jmp ..B2.25 # Prob 100% #170.21 - # LOE rax rdx rbx r10 r12 r13 r14 r15 esi r11d xmm0 xmm1 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 + vpshufd $14, %xmm1, %xmm2 #173.21 + vmovsd %xmm1, (%rdi,%rbx,8) #173.21 + vmovsd %xmm2, (%rdi,%rcx,8) #173.21 + vmovsd (%rdi,%rdx,8), %xmm1 #174.21 + jmp ..B2.25 # Prob 100% #174.21 + # LOE rax rdx rdi r9 r12 r13 r14 r15 r10d r11d xmm0 xmm1 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 ..B2.24: # Preds ..B2.22 # Execution count [1.25e+00] - vpshufd $14, %xmm1, %xmm1 #169.21 - vmovsd %xmm1, (%r14,%rcx,8) #169.21 - vxorpd %xmm1, %xmm1, %xmm1 #170.21 - # LOE rax rdx rbx r10 r12 r13 r14 r15 esi r11d xmm0 xmm1 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 + vpshufd $14, %xmm1, %xmm1 #173.21 + vmovsd %xmm1, (%rdi,%rcx,8) #173.21 + vxorpd %xmm1, %xmm1, %xmm1 #174.21 + # LOE rax rdx rdi r9 r12 r13 r14 r15 r10d r11d xmm0 xmm1 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 ..B2.25: # Preds ..B2.23 ..B2.24 # Execution count [1.88e+00] - testl %r11d, %r11d #170.21 - je ..B2.73 # Prob 40% #170.21 - # LOE rax rdx rbx r10 r12 r13 r14 r15 esi xmm0 xmm1 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 + testl %r10d, %r10d #174.21 + je ..B2.71 # Prob 40% #174.21 + # LOE rax rdx rdi r9 r12 r13 r14 r15 r11d xmm0 xmm1 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 ..B2.26: # Preds ..B2.25 # Execution count [3.12e+00] - vmovhpd (%r14,%r10,8), %xmm1, %xmm2 #170.21 - testl %esi, %esi #170.21 - vsubpd %xmm0, %xmm2, %xmm0 #170.21 - je ..B2.28 # Prob 40% #170.21 - # LOE rax rdx rbx r10 r12 r13 r14 r15 xmm0 xmm6 xmm7 xmm8 xmm9 xmm10 
xmm11 xmm13 xmm14 + vmovhpd (%rdi,%r9,8), %xmm1, %xmm2 #174.21 + testl %r11d, %r11d #174.21 + vsubpd %xmm0, %xmm2, %xmm0 #174.21 + je ..B2.28 # Prob 40% #174.21 + # LOE rax rdx rdi r9 r12 r13 r14 r15 xmm0 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 ..B2.27: # Preds ..B2.26 # Execution count [1.88e+00] - vmovsd %xmm0, (%r14,%rax,8) #170.21 - vpshufd $14, %xmm0, %xmm0 #170.21 - jmp ..B2.29 # Prob 100% #170.21 - # LOE rdx rbx r10 r12 r13 r14 r15 xmm0 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 + vmovsd %xmm0, (%rdi,%rdx,8) #174.21 + vpshufd $14, %xmm0, %xmm0 #174.21 + jmp ..B2.29 # Prob 100% #174.21 + # LOE rax rdi r9 r12 r13 r14 r15 xmm0 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 ..B2.28: # Preds ..B2.26 # Execution count [1.25e+00] - vpshufd $14, %xmm0, %xmm0 #170.21 - # LOE rdx rbx r10 r12 r13 r14 r15 xmm0 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 + vpshufd $14, %xmm0, %xmm0 #174.21 + # LOE rax rdi r9 r12 r13 r14 r15 xmm0 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 ..B2.29: # Preds ..B2.27 ..B2.28 # Execution count [3.12e+00] - vmovsd %xmm0, (%r14,%r10,8) #170.21 - jmp ..B2.38 # Prob 100% #170.21 - # LOE rdx rbx r12 r13 r14 r15 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 + vmovsd %xmm0, (%rdi,%r9,8) #174.21 + jmp ..B2.38 # Prob 100% #174.21 + # LOE rax rdi r12 r13 r14 r15 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 ..B2.30: # Preds ..B2.17 # Execution count [1.88e+00] - testl %esi, %esi #168.21 - vxorpd %xmm2, %xmm2, %xmm2 #168.21 - vsubpd %xmm12, %xmm2, %xmm2 #168.21 - je ..B2.32 # Prob 40% #168.21 - # LOE rax rdx rbx rdi r9 r12 r13 r14 r15 xmm0 xmm1 xmm2 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 + testl %r11d, %r11d #172.21 + vxorpd %xmm2, %xmm2, %xmm2 #172.21 + vsubpd %xmm12, %xmm2, %xmm2 #172.21 + je ..B2.32 # Prob 40% #172.21 + # LOE rax rdx rbx rdi r8 r12 r13 r14 r15 xmm0 xmm1 xmm2 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 ..B2.31: # Preds ..B2.16 ..B2.30 # Execution count [1.25e+00] - vmovsd %xmm2, (%r14,%r9,8) #168.21 - vmovsd (%r14,%rdi,8), %xmm3 #169.21 - vxorpd %xmm4, %xmm4, %xmm4 #169.21 - vunpcklpd %xmm4, %xmm3, %xmm5 #169.21 - vsubpd %xmm1, %xmm5, %xmm1 #169.21 - jmp ..B2.34 # Prob 100% #169.21 + vmovsd %xmm2, (%rdi,%r8,8) #172.21 + vmovsd (%rdi,%rbx,8), %xmm3 #173.21 + vxorpd %xmm4, %xmm4, %xmm4 #173.21 + vunpcklpd %xmm4, %xmm3, %xmm5 #173.21 + vsubpd %xmm1, %xmm5, %xmm1 #173.21 + jmp ..B2.34 # Prob 100% #173.21 # LOE rax rdx rbx rdi r12 r13 r14 r15 xmm0 xmm1 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 ..B2.32: # Preds ..B2.30 # Execution count [0.00e+00] - vxorpd %xmm2, %xmm2, %xmm2 #169.21 - jmp ..B2.33 # Prob 100% #169.21 + vxorpd %xmm2, %xmm2, %xmm2 #173.21 + jmp ..B2.33 # Prob 100% #173.21 # LOE rax rdx rbx rdi r12 r13 r14 r15 xmm0 xmm1 xmm2 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 -..B2.74: # Preds ..B2.21 +..B2.72: # Preds ..B2.21 # Execution count [7.50e-01] - testl %esi, %esi #168.21 + testl %r11d, %r11d #172.21 # LOE rax rdx rbx rdi r12 r13 r14 r15 xmm0 xmm1 xmm2 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 -..B2.33: # Preds ..B2.32 ..B2.74 +..B2.33: # Preds ..B2.32 ..B2.72 # Execution count [2.67e+00] - vxorpd %xmm3, %xmm3, %xmm3 #169.21 - vunpcklpd %xmm3, %xmm2, %xmm4 #169.21 - vsubpd %xmm1, %xmm4, %xmm1 #169.21 - je ..B2.35 # Prob 40% #169.21 + vxorpd %xmm3, %xmm3, %xmm3 #173.21 + vunpcklpd %xmm3, %xmm2, %xmm4 #173.21 + vsubpd %xmm1, %xmm4, %xmm1 #173.21 + je ..B2.35 # Prob 40% #173.21 # LOE rax rdx rbx rdi r12 r13 r14 r15 xmm0 xmm1 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 ..B2.34: # Preds ..B2.31 ..B2.33 # Execution count [1.25e+00] - vmovsd %xmm1, 
(%r14,%rdi,8) #169.21 - vmovsd (%r14,%rax,8), %xmm2 #170.21 - vxorpd %xmm3, %xmm3, %xmm3 #170.21 - vunpcklpd %xmm3, %xmm2, %xmm4 #170.21 - vsubpd %xmm0, %xmm4, %xmm0 #170.21 - jmp ..B2.37 # Prob 100% #170.21 - # LOE rax rdx rbx r12 r13 r14 r15 xmm0 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 + vmovsd %xmm1, (%rdi,%rbx,8) #173.21 + vmovsd (%rdi,%rdx,8), %xmm2 #174.21 + vxorpd %xmm3, %xmm3, %xmm3 #174.21 + vunpcklpd %xmm3, %xmm2, %xmm4 #174.21 + vsubpd %xmm0, %xmm4, %xmm0 #174.21 + jmp ..B2.37 # Prob 100% #174.21 + # LOE rax rdx rdi r12 r13 r14 r15 xmm0 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 ..B2.35: # Preds ..B2.33 # Execution count [0.00e+00] - vxorpd %xmm1, %xmm1, %xmm1 #170.21 - jmp ..B2.36 # Prob 100% #170.21 - # LOE rax rdx rbx r12 r13 r14 r15 xmm0 xmm1 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 -..B2.73: # Preds ..B2.25 + vxorpd %xmm1, %xmm1, %xmm1 #174.21 + jmp ..B2.36 # Prob 100% #174.21 + # LOE rax rdx rdi r12 r13 r14 r15 xmm0 xmm1 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 +..B2.71: # Preds ..B2.25 # Execution count [7.50e-01] - testl %esi, %esi #168.21 - # LOE rax rdx rbx r12 r13 r14 r15 xmm0 xmm1 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 -..B2.36: # Preds ..B2.35 ..B2.73 + testl %r11d, %r11d #172.21 + # LOE rax rdx rdi r12 r13 r14 r15 xmm0 xmm1 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 +..B2.36: # Preds ..B2.35 ..B2.71 # Execution count [2.67e+00] - vxorpd %xmm2, %xmm2, %xmm2 #170.21 - vunpcklpd %xmm2, %xmm1, %xmm3 #170.21 - vsubpd %xmm0, %xmm3, %xmm0 #170.21 - je ..B2.38 # Prob 40% #170.21 - # LOE rax rdx rbx r12 r13 r14 r15 xmm0 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 + vxorpd %xmm2, %xmm2, %xmm2 #174.21 + vunpcklpd %xmm2, %xmm1, %xmm3 #174.21 + vsubpd %xmm0, %xmm3, %xmm0 #174.21 + je ..B2.38 # Prob 40% #174.21 + # LOE rax rdx rdi r12 r13 r14 r15 xmm0 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 ..B2.37: # Preds ..B2.34 ..B2.36 # Execution count [1.25e+00] - vmovsd %xmm0, (%r14,%rax,8) #170.21 - # LOE rdx rbx r12 r13 r14 r15 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 + vmovsd %xmm0, (%rdi,%rdx,8) #174.21 + # LOE rax rdi r12 r13 r14 r15 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 ..B2.38: # Preds ..B2.36 ..B2.29 ..B2.37 ..B2.13 ..B2.12 # # Execution count [1.25e+01] - addq $2, %r15 #143.9 - cmpq %r12, %r15 #143.9 - jb ..B2.12 # Prob 82% #143.9 - # LOE rdx rbx r12 r13 r14 r15 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 + addq $2, %r15 #147.9 + cmpq %r12, %r15 #147.9 + jb ..B2.12 # Prob 82% #147.9 + # LOE rax rdi r12 r13 r14 r15 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 ..B2.39: # Preds ..B2.38 # Execution count [2.25e+00] - vunpckhpd %xmm11, %xmm11, %xmm12 #132.22 - vunpckhpd %xmm14, %xmm14, %xmm8 #130.22 - vaddsd %xmm12, %xmm11, %xmm12 #132.22 - vaddsd %xmm8, %xmm14, %xmm10 #130.22 - vunpckhpd %xmm13, %xmm13, %xmm11 #131.22 - vmovsd 128(%rsp), %xmm1 #[spill] - vaddsd %xmm11, %xmm13, %xmm11 #131.22 - vmovsd 136(%rsp), %xmm3 #[spill] - vmovsd 144(%rsp), %xmm4 #[spill] - vmovsd 152(%rsp), %xmm5 #[spill] - vmovsd 48(%rsp), %xmm6 #[spill] - vmovsd 56(%rsp), %xmm2 #[spill] - movq 24(%rsp), %r9 #[spill] - movl 32(%rsp), %edi #[spill] - movq 88(%rsp), %rsi #[spill] - movq 96(%rsp), %r10 #[spill] - movq 104(%rsp), %r11 #[spill] - movq 112(%rsp), %rcx #[spill] - movq 120(%rsp), %r8 #[spill] - movq 40(%rsp), %rax #[spill] + vunpckhpd %xmm11, %xmm11, %xmm12 #137.22 + vunpckhpd %xmm14, %xmm14, %xmm8 #135.22 + vaddsd %xmm12, %xmm11, %xmm12 #137.22 + vaddsd %xmm8, %xmm14, %xmm10 #135.22 + vunpckhpd %xmm13, %xmm13, %xmm11 #136.22 + vmovsd 112(%rsp), %xmm1 #[spill] + vaddsd 
%xmm11, %xmm13, %xmm11 #136.22 + vmovsd 120(%rsp), %xmm3 #[spill] + vmovsd 128(%rsp), %xmm4 #[spill] + vmovsd 136(%rsp), %xmm6 #[spill] + vmovsd 24(%rsp), %xmm7 #[spill] + vmovsd 32(%rsp), %xmm2 #[spill] + movl 16(%rsp), %edx #[spill] + movq 56(%rsp), %r9 #[spill] + movq 64(%rsp), %rsi #[spill] + movq 72(%rsp), %r8 #[spill] + movq 80(%rsp), %r10 #[spill] + movq 88(%rsp), %r11 #[spill] + movq 96(%rsp), %rcx #[spill] + movq 104(%rsp), %rbx #[spill] vmovsd .L_2il0floatpacket.1(%rip), %xmm0 # - vmovsd .L_2il0floatpacket.4(%rip), %xmm7 # - # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r12 r14 edi xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 -..B2.40: # Preds ..B2.39 ..B2.58 + vmovsd .L_2il0floatpacket.4(%rip), %xmm5 # + # LOE rax rcx rbx rsi rdi r8 r9 r10 r11 r12 r14 edx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 +..B2.40: # Preds ..B2.39 ..B2.56 # Execution count [2.50e+00] - movslq %edi, %r13 #143.9 - cmpq %r13, %r12 #143.9 - jae ..B2.49 # Prob 10% #143.9 - # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r12 r13 r14 edi xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 + movslq %edx, %r13 #147.9 + cmpq %r13, %r12 #147.9 + jae ..B2.49 # Prob 10% #147.9 + # LOE rax rcx rbx rsi rdi r8 r9 r10 r11 r12 r13 r14 edx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 ..B2.41: # Preds ..B2.40 # Execution count [2.25e+00] - imulq 64(%rsp), %r9 #125.43[spill] - addq 72(%rsp), %r9 #107.5[spill] - movl 80(%rsp), %eax #107.5[spill] - movq %r8, 120(%rsp) #107.5[spill] - # LOE rdx rcx rbx rsi r9 r10 r11 r12 r13 r14 eax edi xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 + imulq %rsi, %rax #130.43 + movq %rcx, 96(%rsp) #106.5[spill] + addq 40(%rsp), %rax #106.5[spill] + movl 48(%rsp), %ecx #106.5[spill] + movq %rbx, 104(%rsp) #106.5[spill] + # LOE rax rsi rdi r8 r9 r10 r11 r12 r13 r14 edx ecx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 ..B2.42: # Preds ..B2.45 ..B2.41 # Execution count [1.25e+01] - movl (%r9,%r12,4), %r8d #144.21 - lea (%r8,%r8,2), %r15d #145.36 - movslq %r15d, %r15 #145.36 - vsubsd 8(%rdx,%r15,8), %xmm3, %xmm9 #146.36 - vsubsd (%rdx,%r15,8), %xmm4, %xmm14 #145.36 - vsubsd 16(%rdx,%r15,8), %xmm1, %xmm8 #147.36 - vmulsd %xmm9, %xmm9, %xmm13 #148.49 - vfmadd231sd %xmm14, %xmm14, %xmm13 #148.63 - vfmadd231sd %xmm8, %xmm8, %xmm13 #148.63 - vcomisd %xmm13, %xmm2 #158.22 - jbe ..B2.45 # Prob 50% #158.22 - # LOE rdx rcx rbx rsi r9 r10 r11 r12 r13 r14 r15 eax edi r8d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 + movl (%rax,%r12,4), %ebx #148.21 + lea (%rbx,%rbx,2), %r15d #149.36 + movslq %r15d, %r15 #149.36 + vsubsd 8(%r14,%r15,8), %xmm3, %xmm9 #150.36 + vsubsd (%r14,%r15,8), %xmm4, %xmm14 #149.36 + vsubsd 16(%r14,%r15,8), %xmm1, %xmm8 #151.36 + vmulsd %xmm9, %xmm9, %xmm13 #152.49 + vfmadd231sd %xmm14, %xmm14, %xmm13 #152.63 + vfmadd231sd %xmm8, %xmm8, %xmm13 #152.63 + vcomisd %xmm13, %xmm2 #162.22 + jbe ..B2.45 # Prob 50% #162.22 + # LOE rax rsi rdi r8 r9 r10 r11 r12 r13 r14 r15 edx ecx ebx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 ..B2.43: # Preds ..B2.42 # Execution count [6.25e+00] - vdivsd %xmm13, %xmm7, %xmm15 #159.38 - vmulsd %xmm15, %xmm6, %xmm13 #160.38 - vmulsd %xmm15, %xmm13, %xmm13 #160.44 - vmulsd %xmm15, %xmm13, %xmm13 #160.50 - vmulsd %xmm5, %xmm15, %xmm15 #161.54 - vmulsd %xmm13, %xmm15, %xmm15 #161.61 - vsubsd %xmm0, %xmm13, %xmm13 #161.54 - vmulsd %xmm13, %xmm15, %xmm15 #161.67 - vmulsd %xmm15, %xmm14, %xmm13 #162.31 - vmulsd %xmm15, %xmm9, %xmm9 #163.31 - 
vmulsd %xmm15, %xmm8, %xmm8 #164.31 - vaddsd %xmm13, %xmm10, %xmm10 #162.17 - vaddsd %xmm9, %xmm11, %xmm11 #163.17 - vaddsd %xmm8, %xmm12, %xmm12 #164.17 - cmpl %eax, %r8d #167.24 - jge ..B2.45 # Prob 50% #167.24 - # LOE rdx rcx rbx rsi r9 r10 r11 r12 r13 r14 r15 eax edi xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 + vdivsd %xmm13, %xmm5, %xmm15 #163.39 + vmulsd %xmm15, %xmm7, %xmm13 #164.38 + vmulsd %xmm15, %xmm13, %xmm13 #164.44 + vmulsd %xmm15, %xmm13, %xmm13 #164.50 + vmulsd %xmm6, %xmm15, %xmm15 #165.55 + vmulsd %xmm13, %xmm15, %xmm15 #165.64 + vsubsd %xmm0, %xmm13, %xmm13 #165.55 + vmulsd %xmm13, %xmm15, %xmm15 #165.70 + vmulsd %xmm15, %xmm14, %xmm13 #166.31 + vmulsd %xmm15, %xmm9, %xmm9 #167.31 + vmulsd %xmm15, %xmm8, %xmm8 #168.31 + vaddsd %xmm13, %xmm10, %xmm10 #166.17 + vaddsd %xmm9, %xmm11, %xmm11 #167.17 + vaddsd %xmm8, %xmm12, %xmm12 #168.17 + cmpl %ecx, %ebx #171.24 + jge ..B2.45 # Prob 50% #171.24 + # LOE rax rsi rdi r8 r9 r10 r11 r12 r13 r14 r15 edx ecx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 ..B2.44: # Preds ..B2.43 # Execution count [3.12e+00] - vmovsd 8(%r14,%r15,8), %xmm15 #169.21 - vmovsd (%r14,%r15,8), %xmm14 #168.21 - vsubsd %xmm9, %xmm15, %xmm9 #169.21 - vsubsd %xmm13, %xmm14, %xmm13 #168.21 - vmovsd %xmm9, 8(%r14,%r15,8) #169.21 - vmovsd 16(%r14,%r15,8), %xmm9 #170.21 - vmovsd %xmm13, (%r14,%r15,8) #168.21 - vsubsd %xmm8, %xmm9, %xmm8 #170.21 - vmovsd %xmm8, 16(%r14,%r15,8) #170.21 - # LOE rdx rcx rbx rsi r9 r10 r11 r12 r13 r14 eax edi xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 + vmovsd 8(%rdi,%r15,8), %xmm15 #173.21 + vmovsd (%rdi,%r15,8), %xmm14 #172.21 + vsubsd %xmm9, %xmm15, %xmm9 #173.21 + vsubsd %xmm13, %xmm14, %xmm13 #172.21 + vmovsd %xmm9, 8(%rdi,%r15,8) #173.21 + vmovsd 16(%rdi,%r15,8), %xmm9 #174.21 + vmovsd %xmm13, (%rdi,%r15,8) #172.21 + vsubsd %xmm8, %xmm9, %xmm8 #174.21 + vmovsd %xmm8, 16(%rdi,%r15,8) #174.21 + # LOE rax rsi rdi r8 r9 r10 r11 r12 r13 r14 edx ecx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 ..B2.45: # Preds ..B2.44 ..B2.43 ..B2.42 # Execution count [1.25e+01] - incq %r12 #143.9 - cmpq %r13, %r12 #143.9 - jb ..B2.42 # Prob 82% #143.9 - # LOE rdx rcx rbx rsi r9 r10 r11 r12 r13 r14 eax edi xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 + incq %r12 #147.9 + cmpq %r13, %r12 #147.9 + jb ..B2.42 # Prob 82% #147.9 + # LOE rax rsi rdi r8 r9 r10 r11 r12 r13 r14 edx ecx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 ..B2.46: # Preds ..B2.45 # Execution count [2.25e+00] - movq 120(%rsp), %r8 #[spill] - movq 40(%rsp), %rax #[spill] + movq 96(%rsp), %rcx #[spill] + movq 104(%rsp), %rbx #[spill] jmp ..B2.49 # Prob 100% # - # LOE rax rdx rcx rbx rsi r8 r10 r11 r13 r14 edi xmm0 xmm2 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 + # LOE rcx rbx rsi rdi r8 r9 r10 r11 r13 r14 edx xmm0 xmm2 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 ..B2.48: # Preds ..B2.9 ..B2.8 # Execution count [2.50e+00] - movslq %edi, %r13 #179.9 - # LOE rax rdx rcx rbx rsi r8 r10 r11 r13 r14 edi xmm0 xmm2 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 + movslq %edx, %r13 #183.9 + # LOE rcx rbx rsi rdi r8 r9 r10 r11 r13 r14 edx xmm0 xmm2 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 ..B2.49: # Preds ..B2.46 ..B2.40 ..B2.48 # Execution count [5.00e+00] - addq %r13, %r11 #179.9 - lea 3(%rdi), %r9d #180.9 - sarl $1, %r9d #180.9 - vaddsd (%rsi,%r14), %xmm10, %xmm1 #175.9 - vaddsd 8(%rsi,%r14), %xmm11, %xmm3 #176.9 - vaddsd 16(%rsi,%r14), %xmm12, %xmm4 #177.9 - shrl $30, %r9d #180.9 - vmovsd %xmm1, (%rsi,%r14) #175.9 - vmovsd 
%xmm3, 8(%rsi,%r14) #176.9 - vmovsd %xmm4, 16(%rsi,%r14) #177.9 - addq $24, %rsi #124.5 - lea 3(%r9,%rdi), %edi #180.9 - movslq %r8d, %r9 #124.32 - sarl $2, %edi #180.9 - incq %r8 #124.5 - movslq %edi, %rdi #180.9 - incq %r9 #124.32 - addq %rdi, %r10 #180.9 - cmpq %rax, %r8 #124.5 - jb ..B2.8 # Prob 82% #124.5 - # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r14 xmm0 xmm2 xmm5 xmm6 xmm7 + addq %r13, %r10 #183.9 + lea 3(%rdx), %eax #184.9 + sarl $1, %eax #184.9 + vaddsd (%r9,%rdi), %xmm10, %xmm1 #179.9 + vaddsd 8(%r9,%rdi), %xmm11, %xmm3 #180.9 + vaddsd 16(%r9,%rdi), %xmm12, %xmm4 #181.9 + shrl $30, %eax #184.9 + vmovsd %xmm1, (%r9,%rdi) #179.9 + vmovsd %xmm3, 8(%r9,%rdi) #180.9 + vmovsd %xmm4, 16(%r9,%rdi) #181.9 + addq $24, %r9 #129.5 + lea 3(%rax,%rdx), %edx #184.9 + movslq %ecx, %rax #129.32 + sarl $2, %edx #184.9 + incq %rcx #129.5 + movslq %edx, %rdx #184.9 + incq %rax #129.32 + addq %rdx, %r8 #184.9 + cmpq %rbx, %rcx #129.5 + jb ..B2.8 # Prob 82% #129.5 + # LOE rax rcx rbx rsi rdi r8 r9 r10 r11 r14 xmm0 xmm2 xmm5 xmm6 xmm7 ..B2.50: # Preds ..B2.49 # Execution count [9.00e-01] - movq 8(%rsp), %r15 #[spill] movq (%rsp), %r12 #[spill] - movq %r11, (%r15) #179.9 - movq %r10, 8(%r15) #180.9 - jmp ..B2.54 # Prob 100% #180.9 - # LOE rbx r12 + movq %r10, (%r12) #183.9 + movq %r8, 8(%r12) #184.9 + jmp ..B2.53 # Prob 100% #184.9 + # LOE ..B2.51: # Preds ..B2.1 # Execution count [5.00e-01] - xorl %ebx, %ebx #120.22 - xorl %eax, %eax #121.16 + xorl %eax, %eax #122.16 +..___tag_value_computeForceLJHalfNeigh.154: +# getTimeStamp() + call getTimeStamp #122.16 +..___tag_value_computeForceLJHalfNeigh.155: + # LOE xmm0 +..B2.69: # Preds ..B2.51 + # Execution count [5.00e-01] + vmovsd %xmm0, 8(%rsp) #122.16[spill] + # LOE +..B2.52: # Preds ..B2.69 + # Execution count [5.00e-01] + movl $.L_2__STRING.1, %edi #126.5 +..___tag_value_computeForceLJHalfNeigh.157: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #126.5 +..___tag_value_computeForceLJHalfNeigh.158: + # LOE +..B2.53: # Preds ..B2.50 ..B2.52 + # Execution count [1.00e+00] + movl $.L_2__STRING.1, %edi #187.5 +..___tag_value_computeForceLJHalfNeigh.159: +# likwid_markerStopRegion(const char *) + call likwid_markerStopRegion #187.5 +..___tag_value_computeForceLJHalfNeigh.160: + # LOE +..B2.54: # Preds ..B2.53 + # Execution count [1.00e+00] + xorl %eax, %eax #190.16 ..___tag_value_computeForceLJHalfNeigh.161: # getTimeStamp() - call getTimeStamp #121.16 + call getTimeStamp #190.16 ..___tag_value_computeForceLJHalfNeigh.162: - # LOE rbx r12 xmm0 -..B2.71: # Preds ..B2.51 - # Execution count [5.00e-01] - vmovsd %xmm0, 16(%rsp) #121.16[spill] - # LOE rbx r12 -..B2.52: # Preds ..B2.71 - # Execution count [5.00e-01] - movl $.L_2__STRING.1, %edi #122.5 -..___tag_value_computeForceLJHalfNeigh.164: -# likwid_markerStartRegion(const char *) - call likwid_markerStartRegion #122.5 -..___tag_value_computeForceLJHalfNeigh.165: - # LOE rbx r12 -..B2.54: # Preds ..B2.52 ..B2.50 - # Execution count [1.00e+00] - movl $.L_2__STRING.1, %edi #183.5 -..___tag_value_computeForceLJHalfNeigh.166: -# likwid_markerStopRegion(const char *) - call likwid_markerStopRegion #183.5 -..___tag_value_computeForceLJHalfNeigh.167: - # LOE rbx r12 + # LOE xmm0 ..B2.55: # Preds ..B2.54 # Execution count [1.00e+00] - xorl %eax, %eax #184.16 -..___tag_value_computeForceLJHalfNeigh.168: -# getTimeStamp() - call getTimeStamp #184.16 -..___tag_value_computeForceLJHalfNeigh.169: - # LOE rbx r12 xmm0 -..B2.56: # Preds ..B2.55 - # Execution count [1.00e+00] - vxorpd %xmm4, 
%xmm4, %xmm4 #185.5 - vcvtsi2sdq %rbx, %xmm4, %xmm4 #185.5 - vsubsd 16(%rsp), %xmm0, %xmm1 #185.94[spill] - vmovsd .L_2il0floatpacket.9(%rip), %xmm3 #185.5 - movl $.L_2__STRING.2, %edi #185.5 - vdivsd %xmm4, %xmm3, %xmm5 #185.5 - vmulsd %xmm1, %xmm5, %xmm6 #185.5 - movl %ebx, %esi #185.5 - vmovsd 264(%r12), %xmm7 #185.74 - movl $3, %eax #185.5 - vmulsd %xmm7, %xmm6, %xmm2 #185.5 - vmovapd %xmm7, %xmm0 #185.5 - vmovsd %xmm1, (%rsp) #185.5[spill] -..___tag_value_computeForceLJHalfNeigh.171: -# printf(const char *__restrict__, ...) - call printf #185.5 -..___tag_value_computeForceLJHalfNeigh.172: - # LOE -..B2.57: # Preds ..B2.56 - # Execution count [1.00e+00] - vmovsd (%rsp), %xmm1 #[spill] - vmovapd %xmm1, %xmm0 #186.14 - addq $248, %rsp #186.14 + vsubsd 8(%rsp), %xmm0, %xmm0 #191.14[spill] + addq $216, %rsp #191.14 .cfi_restore 3 - popq %rbx #186.14 + popq %rbx #191.14 .cfi_restore 15 - popq %r15 #186.14 + popq %r15 #191.14 .cfi_restore 14 - popq %r14 #186.14 + popq %r14 #191.14 .cfi_restore 13 - popq %r13 #186.14 + popq %r13 #191.14 .cfi_restore 12 - popq %r12 #186.14 - movq %rbp, %rsp #186.14 - popq %rbp #186.14 + popq %r12 #191.14 + movq %rbp, %rsp #191.14 + popq %rbp #191.14 .cfi_def_cfa 7, 8 .cfi_restore 6 - ret #186.14 + ret #191.14 .cfi_def_cfa 6, 16 .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22 .cfi_offset 6, -16 @@ -1178,57 +1155,56 @@ computeForceLJHalfNeigh: .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 # LOE -..B2.58: # Preds ..B2.10 +..B2.56: # Preds ..B2.10 # Execution count [2.25e-01]: Infreq - xorl %r12d, %r12d #143.9 - jmp ..B2.40 # Prob 100% #143.9 - # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r12 r14 edi xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 -..B2.59: # Preds ..B2.2 + xorl %r12d, %r12d #147.9 + jmp ..B2.40 # Prob 100% #147.9 + # LOE rax rcx rbx rsi rdi r8 r9 r10 r11 r12 r14 edx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 +..B2.57: # Preds ..B2.2 # Execution count [1.00e+00]: Infreq - movq %r13, %rax #106.18 - lea (%rax,%rax,2), %rcx #106.18 - cmpq $8, %rcx #114.5 - jl ..B2.67 # Prob 10% #114.5 - # LOE rcx rdi r12 r14 r15 r13d -..B2.60: # Preds ..B2.59 + lea (%rbx,%rbx,2), %rcx #105.18 + cmpq $8, %rcx #116.5 + jl ..B2.65 # Prob 10% #116.5 + # LOE rcx rbx rdi r12 r13 r14 r15d +..B2.58: # Preds ..B2.57 # Execution count [1.00e+00]: Infreq - movl %ecx, %eax #114.5 - xorl %edx, %edx #114.5 - andl $-8, %eax #114.5 - movslq %eax, %rax #114.5 - vxorpd %ymm0, %ymm0, %ymm0 #115.22 - # LOE rax rdx rcx rdi r12 r14 r15 r13d ymm0 -..B2.61: # Preds ..B2.61 ..B2.60 + movl %ecx, %eax #116.5 + xorl %edx, %edx #116.5 + andl $-8, %eax #116.5 + movslq %eax, %rax #116.5 + vxorpd %ymm0, %ymm0, %ymm0 #117.22 + # LOE rax rdx rcx rbx rdi r12 r13 r14 r15d ymm0 +..B2.59: # Preds ..B2.59 ..B2.58 # Execution count [5.56e+00]: Infreq - vmovupd %ymm0, (%rdi,%rdx,8) #115.9 - vmovupd %ymm0, 32(%rdi,%rdx,8) #115.9 - addq $8, %rdx #114.5 - cmpq %rax, %rdx #114.5 - jb ..B2.61 # Prob 82% #114.5 - # LOE rax rdx rcx rdi r12 r14 r15 r13d ymm0 -..B2.63: # Preds ..B2.61 ..B2.67 + vmovupd %ymm0, (%rdi,%rdx,8) #117.9 + vmovupd %ymm0, 32(%rdi,%rdx,8) #117.9 + addq $8, %rdx #116.5 + cmpq %rax, %rdx #116.5 + jb ..B2.59 # Prob 82% #116.5 + # LOE rax rdx rcx rbx rdi r12 r13 r14 r15d ymm0 +..B2.61: # Preds ..B2.59 ..B2.65 # Execution count 
[1.11e+00]: Infreq - cmpq %rcx, %rax #114.5 - jae ..B2.5 # Prob 10% #114.5 - # LOE rax rcx rdi r12 r14 r15 r13d -..B2.64: # Preds ..B2.63 + cmpq %rcx, %rax #116.5 + jae ..B2.5 # Prob 10% #116.5 + # LOE rax rcx rbx rdi r12 r13 r14 r15d +..B2.62: # Preds ..B2.61 # Execution count [1.00e+00]: Infreq xorl %edx, %edx # - # LOE rax rdx rcx rdi r12 r14 r15 r13d -..B2.65: # Preds ..B2.64 ..B2.65 + # LOE rax rdx rcx rbx rdi r12 r13 r14 r15d +..B2.63: # Preds ..B2.62 ..B2.63 # Execution count [5.56e+00]: Infreq - movq %rdx, (%rdi,%rax,8) #115.9 - incq %rax #114.5 - cmpq %rcx, %rax #114.5 - jb ..B2.65 # Prob 82% #114.5 - jmp ..B2.5 # Prob 100% #114.5 - # LOE rax rdx rcx rdi r12 r14 r15 r13d -..B2.67: # Preds ..B2.59 + movq %rdx, (%rdi,%rax,8) #117.9 + incq %rax #116.5 + cmpq %rcx, %rax #116.5 + jb ..B2.63 # Prob 82% #116.5 + jmp ..B2.5 # Prob 100% #116.5 + # LOE rax rdx rcx rbx rdi r12 r13 r14 r15d +..B2.65: # Preds ..B2.57 # Execution count [1.00e-01]: Infreq - xorl %eax, %eax #114.5 - jmp ..B2.63 # Prob 100% #114.5 + xorl %eax, %eax #116.5 + jmp ..B2.61 # Prob 100% #116.5 .align 16,0x90 - # LOE rax rcx rdi r12 r14 r15 r13d + # LOE rax rcx rbx rdi r12 r13 r14 r15d .cfi_endproc # mark_end; .type computeForceLJHalfNeigh,@function @@ -1252,105 +1228,97 @@ computeForceLJFullNeigh_simd: ..B3.1: # Preds ..B3.0 # Execution count [1.00e+00] .cfi_startproc -..___tag_value_computeForceLJFullNeigh_simd.190: -..L191: - #189.101 - pushq %rbp #189.101 +..___tag_value_computeForceLJFullNeigh_simd.179: +..L180: + #194.101 + pushq %rbp #194.101 .cfi_def_cfa_offset 16 - movq %rsp, %rbp #189.101 + movq %rsp, %rbp #194.101 .cfi_def_cfa 6, 16 .cfi_offset 6, -16 - andq $-32, %rsp #189.101 - movl 4(%rsi), %edx #190.18 - testl %edx, %edx #196.24 - jle ..B3.4 # Prob 50% #196.24 + andq $-32, %rsp #194.101 + movl 4(%rsi), %edx #195.18 + testl %edx, %edx #201.24 + jle ..B3.4 # Prob 50% #201.24 # LOE rbx rsi r12 r13 r14 r15 edx ..B3.2: # Preds ..B3.1 # Execution count [5.00e-03] - movq 64(%rsi), %rdi #197.9 - lea (%rdx,%rdx,2), %eax #190.18 - cmpl $12, %eax #196.5 - jle ..B3.8 # Prob 0% #196.5 + movq 64(%rsi), %rdi #202.9 + lea (%rdx,%rdx,2), %eax #195.18 + cmpl $12, %eax #201.5 + jle ..B3.7 # Prob 0% #201.5 # LOE rbx rdi r12 r13 r14 r15 edx ..B3.3: # Preds ..B3.2 # Execution count [1.00e+00] - movslq %edx, %rdx #196.5 - xorl %esi, %esi #196.5 - lea (%rdx,%rdx,2), %rdx #196.5 - shlq $3, %rdx #196.5 - call __intel_avx_rep_memset #196.5 + movslq %edx, %rdx #201.5 + xorl %esi, %esi #201.5 + lea (%rdx,%rdx,2), %rdx #201.5 + shlq $3, %rdx #201.5 + call __intel_avx_rep_memset #201.5 # LOE rbx r12 r13 r14 r15 -..B3.4: # Preds ..B3.14 ..B3.1 ..B3.12 ..B3.3 +..B3.4: # Preds ..B3.13 ..B3.1 ..B3.11 ..B3.3 # Execution count [1.00e+00] - xorl %eax, %eax #203.16 - vzeroupper #203.16 -..___tag_value_computeForceLJFullNeigh_simd.195: + xorl %eax, %eax #207.16 + vzeroupper #207.16 +..___tag_value_computeForceLJFullNeigh_simd.184: # getTimeStamp() - call getTimeStamp #203.16 -..___tag_value_computeForceLJFullNeigh_simd.196: - # LOE rbx r12 r13 r14 r15 + call getTimeStamp #207.16 +..___tag_value_computeForceLJFullNeigh_simd.185: + # LOE ..B3.5: # Preds ..B3.4 # Execution count [1.00e+00] - movl $.L_2__STRING.0, %edi #204.5 -..___tag_value_computeForceLJFullNeigh_simd.197: -# likwid_markerStartRegion(const char *) - call likwid_markerStartRegion #204.5 -..___tag_value_computeForceLJFullNeigh_simd.198: + movl $il0_peep_printf_format_0, %edi #210.5 + movq stderr(%rip), %rsi #210.5 + call fputs #210.5 # LOE ..B3.6: # Preds ..B3.5 # Execution count 
[1.00e+00]
- movl $il0_peep_printf_format_0, %edi #207.5
- movq stderr(%rip), %rsi #207.5
- call fputs #207.5
- # LOE
-..B3.7: # Preds ..B3.6
- # Execution count [1.00e+00]
- movl $-1, %edi #208.5
+ movl $-1, %edi #211.5
# exit(int)
- call exit #208.5
+ call exit #211.5
# LOE
-..B3.8: # Preds ..B3.2
+..B3.7: # Preds ..B3.2
# Execution count [1.00e+00]: Infreq
- movslq %edx, %rdx #196.5
- lea (%rdx,%rdx,2), %rsi #190.18
- cmpq $8, %rsi #196.5
- jl ..B3.16 # Prob 10% #196.5
+ movslq %edx, %rdx #201.5
+ lea (%rdx,%rdx,2), %rsi #195.18
+ cmpq $8, %rsi #201.5
+ jl ..B3.15 # Prob 10% #201.5
# LOE rbx rsi rdi r12 r13 r14 r15
-..B3.9: # Preds ..B3.8
+..B3.8: # Preds ..B3.7
# Execution count [1.00e+00]: Infreq
- movl %esi, %edx #196.5
- xorl %ecx, %ecx #196.5
- andl $-8, %edx #196.5
- xorl %eax, %eax #196.5
- movslq %edx, %rdx #196.5
- vxorpd %ymm0, %ymm0, %ymm0 #197.22
+ movl %esi, %edx #201.5
+ xorl %ecx, %ecx #201.5
+ andl $-8, %edx #201.5
+ xorl %eax, %eax #201.5
+ movslq %edx, %rdx #201.5
+ vxorpd %ymm0, %ymm0, %ymm0 #202.22
# LOE rax rdx rcx rbx rsi rdi r12 r13 r14 r15 ymm0
-..B3.10: # Preds ..B3.10 ..B3.9
+..B3.9: # Preds ..B3.9 ..B3.8
# Execution count [5.56e+00]: Infreq
- vmovupd %ymm0, (%rdi,%rcx,8) #197.9
- vmovupd %ymm0, 32(%rdi,%rcx,8) #197.9
- addq $8, %rcx #196.5
- cmpq %rdx, %rcx #196.5
- jb ..B3.10 # Prob 82% #196.5
+ vmovupd %ymm0, (%rdi,%rcx,8) #202.9
+ vmovupd %ymm0, 32(%rdi,%rcx,8) #202.9
+ addq $8, %rcx #201.5
+ cmpq %rdx, %rcx #201.5
+ jb ..B3.9 # Prob 82% #201.5
# LOE rax rdx rcx rbx rsi rdi r12 r13 r14 r15 ymm0
-..B3.12: # Preds ..B3.10 ..B3.16
+..B3.11: # Preds ..B3.9 ..B3.15
# Execution count [1.11e+00]: Infreq
- cmpq %rsi, %rdx #196.5
- jae ..B3.4 # Prob 10% #196.5
+ cmpq %rsi, %rdx #201.5
+ jae ..B3.4 # Prob 10% #201.5
# LOE rax rdx rbx rsi rdi r12 r13 r14 r15
-..B3.14: # Preds ..B3.12 ..B3.14
+..B3.13: # Preds ..B3.11 ..B3.13
# Execution count [5.56e+00]: Infreq
- movq %rax, (%rdi,%rdx,8) #197.9
- incq %rdx #196.5
- cmpq %rsi, %rdx #196.5
- jb ..B3.14 # Prob 82% #196.5
- jmp ..B3.4 # Prob 100% #196.5
+ movq %rax, (%rdi,%rdx,8) #202.9
+ incq %rdx #201.5
+ cmpq %rsi, %rdx #201.5
+ jb ..B3.13 # Prob 82% #201.5
+ jmp ..B3.4 # Prob 100% #201.5
# LOE rax rdx rbx rsi rdi r12 r13 r14 r15
-..B3.16: # Preds ..B3.8
+..B3.15: # Preds ..B3.7
# Execution count [1.00e-01]: Infreq
- xorl %edx, %edx #196.5
- xorl %eax, %eax #196.5
- jmp ..B3.12 # Prob 100% #196.5
+ xorl %edx, %edx #201.5
+ xorl %eax, %eax #201.5
+ jmp ..B3.11 # Prob 100% #201.5
.align 16,0x90
# LOE rax rdx rbx rsi rdi r12 r13 r14 r15
.cfi_endproc
@@ -1428,11 +1396,6 @@ il0_peep_printf_format_0:
.long 0x00000000,0x3ff00000
.type .L_2il0floatpacket.4,@object
.size .L_2il0floatpacket.4,8
- .align 8
-.L_2il0floatpacket.9:
- .long 0x00000000,0x41cdcd65
- .type .L_2il0floatpacket.9,@object
- .size .L_2il0floatpacket.9,8
.section .rodata.str1.4, "aMS",@progbits,1
.align 4
.align 4
@@ -1451,24 +1414,6 @@ il0_peep_printf_format_0:
.word 104
.type .L_2__STRING.1,@object
.size .L_2__STRING.1,18
- .space 2, 0x00 # pad
- .align 4
-.L_2__STRING.2:
- .long 980644937
- .long 544548128
- .long 1701987872
- .long 622869105
- .long 1411391590
- .long 979725673
- .long 174466336
- .long 1764718915
- .long 622869108
- .long 1747460198
- .long 761687137
- .long 1734960494
- .long 665960
- .type .L_2__STRING.2,@object
- .size .L_2__STRING.2,52
.data
.section .note.GNU-stack, ""
# End
diff --git a/static_analysis/jan/lammps-icc-avx512.o b/static_analysis/jan/lammps-icc-avx512.o
new file mode 100644
index 0000000..ecf4a08
Binary files /dev/null and b/static_analysis/jan/lammps-icc-avx512.o differ
diff --git a/static_analysis/jan/icx-icc-lammps-avx512.s b/static_analysis/jan/lammps-icc-avx512.s
similarity index 67%
rename from static_analysis/jan/icx-icc-lammps-avx512.s
rename to static_analysis/jan/lammps-icc-avx512.s
index 47960ac..bb29ece 100644
--- a/static_analysis/jan/icx-icc-lammps-avx512.s
+++ b/static_analysis/jan/lammps-icc-avx512.s
@@ -24,49 +24,49 @@ computeForceLJFullNeigh_plain_c:
.cfi_startproc
..___tag_value_computeForceLJFullNeigh_plain_c.1:
..L2:
- #23.104
- pushq %rbp #23.104
.cfi_def_cfa_offset 16
- movq %rsp, %rbp #23.104
.cfi_def_cfa 6, 16
.cfi_offset 6, -16
- andq $-64, %rsp #23.104
- pushq %r13 #23.104
- pushq %r14 #23.104
- pushq %r15 #23.104
- pushq %rbx #23.104
- subq $96, %rsp #23.104
+ #21.104
+ pushq %rbp #21.104
.cfi_def_cfa_offset 16
+ movq %rsp, %rbp #21.104
.cfi_def_cfa 6, 16
.cfi_offset 6, -16
+ andq $-64, %rsp #21.104
+ pushq %r13 #21.104
+ pushq %r14 #21.104
+ pushq %r15 #21.104
+ pushq %rbx #21.104
+ subq $96, %rsp #21.104
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
- movq %rsi, %r13 #23.104
- vmovsd 144(%rdi), %xmm0 #27.27
- movq %rcx, %r14 #23.104
- vmovsd 56(%rdi), %xmm1 #28.23
- movq %rdx, %rbx #23.104
- vmovsd 40(%rdi), %xmm2 #29.24
- movl 4(%r13), %r15d #24.18
- vmovsd %xmm0, 32(%rsp) #27.27[spill]
- vmovsd %xmm1, 16(%rsp) #28.23[spill]
- vmovsd %xmm2, 24(%rsp) #29.24[spill]
- testl %r15d, %r15d #32.24
- jle ..B1.27 # Prob 50% #32.24
+ movq %rsi, %r13 #21.104
+ vmovsd 144(%rdi), %xmm0 #25.27
+ movq %rcx, %r14 #21.104
+ vmovsd 56(%rdi), %xmm1 #26.23
+ movq %rdx, %rbx #21.104
+ vmovsd 40(%rdi), %xmm2 #27.24
+ movl 4(%r13), %r15d #22.18
+ vmovsd %xmm0, 32(%rsp) #25.27[spill]
+ vmovsd %xmm1, 16(%rsp) #26.23[spill]
+ vmovsd %xmm2, 24(%rsp) #27.24[spill]
+ testl %r15d, %r15d #33.24
+ jle ..B1.27 # Prob 50% #33.24
# LOE rbx r12 r13 r14 r15d
..B1.2: # Preds ..B1.1
# Execution count [5.00e-03]
- movq 64(%r13), %rdi #33.9
- lea (%r15,%r15,2), %esi #24.18
- cmpl $12, %esi #32.5
- jle ..B1.34 # Prob 0% #32.5
+ movq 64(%r13), %rdi #34.9
+ lea (%r15,%r15,2), %esi #22.18
+ cmpl $12, %esi #33.5
+ jle ..B1.34 # Prob 0% #33.5
# LOE rbx rdi r12 r13 r14 esi r15d
..B1.3: # Preds ..B1.2
# Execution count [1.00e+00]
- movslq %r15d, %r15 #32.5
- xorl %esi, %esi #32.5
- lea (%r15,%r15,2), %rdx #32.5
- shlq $3, %rdx #32.5
- call __intel_skx_avx512_memset #32.5
+ movslq %r15d, %r15 #33.5
+ xorl %esi, %esi #33.5
+ lea (%r15,%r15,2), %rdx #33.5
+ shlq $3, %rdx #33.5
+ call __intel_skx_avx512_memset #33.5
# LOE rbx r12 r13 r14 r15
..B1.4: # Preds ..B1.3 ..B1.46 ..B1.39
# Execution count [1.00e+00]
@@ -83,166 +83,166 @@ computeForceLJFullNeigh_plain_c:
# LOE rbx r12 r13 r14 r15
..B1.5: # Preds ..B1.43
# Execution count [5.00e-01]
- movl $.L_2__STRING.0, %edi #39.5
+ movl $.L_2__STRING.0, %edi #42.5
..___tag_value_computeForceLJFullNeigh_plain_c.16:
# likwid_markerStartRegion(const char *)
- call likwid_markerStartRegion #39.5
+ call likwid_markerStartRegion #42.5
..___tag_value_computeForceLJFullNeigh_plain_c.17:
# LOE rbx r12 r13 r14 r15
..B1.6: # Preds ..B1.5
# Execution count [9.00e-01]
- vmovsd 32(%rsp), %xmm13 #27.45[spill]
- xorl %esi, %esi #41.15
- vmovsd
24(%rsp), %xmm0 #77.41[spill] - xorl %edi, %edi #41.5 - vmulsd %xmm13, %xmm13, %xmm14 #27.45 - xorl %eax, %eax #41.5 - vmovdqu32 .L_2il0floatpacket.0(%rip), %ymm16 #56.9 - vmulsd .L_2il0floatpacket.3(%rip), %xmm0, %xmm1 #77.41 - vmovdqu .L_2il0floatpacket.1(%rip), %ymm15 #56.9 - vmovups .L_2il0floatpacket.4(%rip), %zmm5 #77.54 - vbroadcastsd %xmm14, %zmm14 #27.25 - vbroadcastsd 16(%rsp), %zmm13 #28.21[spill] - vbroadcastsd %xmm1, %zmm12 #77.41 - movq 24(%rbx), %r11 #43.25 + vmovsd 32(%rsp), %xmm13 #25.45[spill] + xorl %esi, %esi #45.15 + vmovsd 24(%rsp), %xmm0 #77.42[spill] + xorl %edi, %edi #45.5 + vmulsd %xmm13, %xmm13, %xmm14 #25.45 + xorl %eax, %eax #45.5 + vmovdqu32 .L_2il0floatpacket.0(%rip), %ymm16 #59.9 + vmulsd .L_2il0floatpacket.3(%rip), %xmm0, %xmm1 #77.42 + vmovdqu .L_2il0floatpacket.1(%rip), %ymm15 #59.9 + vmovups .L_2il0floatpacket.4(%rip), %zmm5 #77.55 + vbroadcastsd %xmm14, %zmm14 #25.25 + vbroadcastsd 16(%rsp), %zmm13 #26.21[spill] + vbroadcastsd %xmm1, %zmm12 #77.42 + movq 24(%rbx), %r11 #47.25 movq 64(%r13), %r10 #89.9 - movq 16(%rbx), %r9 #42.19 - movslq 8(%rbx), %r8 #42.43 - shlq $2, %r8 #25.5 - movq 16(%r13), %rbx #44.25 + movq 16(%rbx), %r9 #46.19 + movslq 8(%rbx), %r8 #46.43 + shlq $2, %r8 #23.5 + movq 16(%r13), %rbx #48.25 movq (%r14), %rdx #93.9 movq 8(%r14), %rcx #94.9 - movq %r10, 48(%rsp) #41.5[spill] - movq %r11, 56(%rsp) #41.5[spill] - movq %r15, 64(%rsp) #41.5[spill] - movq %r14, (%rsp) #41.5[spill] - movq %r12, 8(%rsp) #41.5[spill] + movq %r10, 48(%rsp) #45.5[spill] + movq %r11, 56(%rsp) #45.5[spill] + movq %r15, 64(%rsp) #45.5[spill] + movq %r14, (%rsp) #45.5[spill] + movq %r12, 8(%rsp) #45.5[spill] .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22 # LOE rax rdx rcx rbx rsi rdi r8 r9 ymm15 ymm16 zmm5 zmm12 zmm13 zmm14 ..B1.7: # Preds ..B1.25 ..B1.6 # Execution count [5.00e+00] - movq 56(%rsp), %r10 #43.25[spill] - vxorpd %xmm24, %xmm24, %xmm24 #47.22 - vmovapd %xmm24, %xmm18 #48.22 - movl (%r10,%rdi,4), %r13d #43.25 - vmovapd %xmm18, %xmm4 #49.22 - vmovsd (%rax,%rbx), %xmm11 #44.25 - vmovsd 8(%rax,%rbx), %xmm6 #45.25 - vmovsd 16(%rax,%rbx), %xmm7 #46.25 - testl %r13d, %r13d #56.28 - jle ..B1.25 # Prob 50% #56.28 + movq 56(%rsp), %r10 #47.25[spill] + vxorpd %xmm24, %xmm24, %xmm24 #51.22 + vmovapd %xmm24, %xmm18 #52.22 + movl (%r10,%rdi,4), %r13d #47.25 + vmovapd %xmm18, %xmm4 #53.22 + vmovsd (%rax,%rbx), %xmm11 #48.25 + vmovsd 8(%rax,%rbx), %xmm6 #49.25 + vmovsd 16(%rax,%rbx), %xmm7 #50.25 + testl %r13d, %r13d #59.28 + jle ..B1.25 # Prob 50% #59.28 # LOE rax rdx rcx rbx rsi rdi r8 r9 r13d xmm4 xmm6 xmm7 xmm11 xmm18 xmm24 ymm15 ymm16 zmm5 zmm12 zmm13 zmm14 ..B1.8: # Preds ..B1.7 # Execution count [4.50e+00] - vpxord %zmm10, %zmm10, %zmm10 #47.22 - vmovaps %zmm10, %zmm9 #48.22 - vmovaps %zmm9, %zmm8 #49.22 - cmpl $8, %r13d #56.9 - jl ..B1.33 # Prob 10% #56.9 + vpxord %zmm10, %zmm10, %zmm10 #51.22 + vmovaps %zmm10, %zmm9 #52.22 + vmovaps %zmm9, %zmm8 #53.22 + cmpl $8, %r13d #59.9 + jl ..B1.33 # Prob 10% #59.9 # LOE rax rdx rcx rbx rsi rdi r8 r9 r13d xmm6 xmm7 xmm11 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 ..B1.9: # Preds ..B1.8 # Execution count [4.50e+00] - cmpl $1200, %r13d #56.9 - jl ..B1.32 # Prob 10% #56.9 + cmpl $1200, %r13d #59.9 + jl ..B1.32 # Prob 10% #59.9 # LOE rax rdx rcx rbx rsi rdi r8 r9 r13d xmm6 xmm7 xmm11 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 ..B1.10: # Preds ..B1.9 # Execution count [4.50e+00] - movq %r8, %r10 #42.43 - imulq %rsi, %r10 #42.43 - addq %r9, 
%r10 #25.5 - movq %r10, %r12 #56.9 - andq $63, %r12 #56.9 - testl $3, %r12d #56.9 - je ..B1.12 # Prob 50% #56.9 + movq %r8, %r10 #46.43 + imulq %rsi, %r10 #46.43 + addq %r9, %r10 #23.5 + movq %r10, %r12 #59.9 + andq $63, %r12 #59.9 + testl $3, %r12d #59.9 + je ..B1.12 # Prob 50% #59.9 # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r12d r13d xmm6 xmm7 xmm11 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 ..B1.11: # Preds ..B1.10 # Execution count [2.25e+00] - xorl %r12d, %r12d #56.9 - jmp ..B1.14 # Prob 100% #56.9 + xorl %r12d, %r12d #59.9 + jmp ..B1.14 # Prob 100% #59.9 # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r12d r13d xmm6 xmm7 xmm11 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 ..B1.12: # Preds ..B1.10 # Execution count [2.25e+00] - testl %r12d, %r12d #56.9 - je ..B1.14 # Prob 50% #56.9 + testl %r12d, %r12d #59.9 + je ..B1.14 # Prob 50% #59.9 # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r12d r13d xmm6 xmm7 xmm11 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 ..B1.13: # Preds ..B1.12 # Execution count [2.50e+01] - negl %r12d #56.9 - addl $64, %r12d #56.9 - shrl $2, %r12d #56.9 - cmpl %r12d, %r13d #56.9 - cmovl %r13d, %r12d #56.9 + negl %r12d #59.9 + addl $64, %r12d #59.9 + shrl $2, %r12d #59.9 + cmpl %r12d, %r13d #59.9 + cmovl %r13d, %r12d #59.9 # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r12d r13d xmm6 xmm7 xmm11 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 ..B1.14: # Preds ..B1.11 ..B1.13 ..B1.12 # Execution count [5.00e+00] - movl %r13d, %r11d #56.9 - subl %r12d, %r11d #56.9 - andl $7, %r11d #56.9 - negl %r11d #56.9 - addl %r13d, %r11d #56.9 - cmpl $1, %r12d #56.9 - jb ..B1.18 # Prob 50% #56.9 + movl %r13d, %r11d #59.9 + subl %r12d, %r11d #59.9 + andl $7, %r11d #59.9 + negl %r11d #59.9 + addl %r13d, %r11d #59.9 + cmpl $1, %r12d #59.9 + jb ..B1.18 # Prob 50% #59.9 # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11d r12d r13d xmm6 xmm7 xmm11 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 ..B1.15: # Preds ..B1.14 # Execution count [4.50e+00] - vmovdqa32 %ymm16, %ymm4 #56.9 - xorl %r15d, %r15d #56.9 - vpbroadcastd %r12d, %ymm3 #56.9 - vbroadcastsd %xmm11, %zmm2 #44.23 - vbroadcastsd %xmm6, %zmm1 #45.23 - vbroadcastsd %xmm7, %zmm0 #46.23 - movslq %r12d, %r14 #56.9 + vmovdqa32 %ymm16, %ymm4 #59.9 + xorl %r15d, %r15d #59.9 + vpbroadcastd %r12d, %ymm3 #59.9 + vbroadcastsd %xmm11, %zmm2 #48.23 + vbroadcastsd %xmm6, %zmm1 #49.23 + vbroadcastsd %xmm7, %zmm0 #50.23 + movslq %r12d, %r14 #59.9 # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r14 r15 r11d r12d r13d xmm6 xmm7 xmm11 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 movl $111, %ebx # OSACA START MARKER .byte 100 # OSACA START MARKER .byte 103 # OSACA START MARKER .byte 144 # OSACA START MARKER +# pointer_increment=64 1303ca335e79351a96cfc07b8b9ec9d4 # LLVM-MCA-BEGIN -# pointer_increment=64 55c62179dea305ceefda0dbc87792a60 ..B1.16: # Preds ..B1.16 ..B1.15 # Execution count [2.50e+01] - vpcmpgtd %ymm4, %ymm3, %k5 #56.9 - vpaddd %ymm15, %ymm4, %ymm4 #56.9 - vmovdqu32 (%r10,%r15,4), %ymm17{%k5}{z} #57.21 - vpaddd %ymm17, %ymm17, %ymm18 #58.36 - addq $8, %r15 #56.9 - vpaddd %ymm18, %ymm17, %ymm19 #58.36 - kmovw %k5, %k2 #58.36 - kmovw %k5, %k3 #58.36 - kmovw %k5, %k1 #58.36 - vpxord %zmm21, %zmm21, %zmm21 #58.36 - vpxord %zmm20, %zmm20, %zmm20 #58.36 - vpxord %zmm22, %zmm22, %zmm22 #58.36 - vgatherdpd 8(%rbx,%ymm19,8), %zmm21{%k2} #58.36 - vgatherdpd (%rbx,%ymm19,8), %zmm20{%k3} #58.36 - vgatherdpd 16(%rbx,%ymm19,8), %zmm22{%k1} #58.36 - vsubpd %zmm21, %zmm1, %zmm18 #59.36 - vsubpd %zmm20, %zmm2, %zmm17 #58.36 - vsubpd 
%zmm22, %zmm0, %zmm19 #60.36 - vmulpd %zmm18, %zmm18, %zmm31 #61.49 - vfmadd231pd %zmm17, %zmm17, %zmm31 #61.49 - vfmadd231pd %zmm19, %zmm19, %zmm31 #61.63 - vrcp14pd %zmm31, %zmm30 #75.38 - vcmppd $1, %zmm14, %zmm31, %k6{%k5} #71.22 - vfpclasspd $30, %zmm30, %k0 #75.38 - vmovaps %zmm31, %zmm23 #75.38 - vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23 #75.38 - knotw %k0, %k4 #75.38 - vmulpd %zmm23, %zmm23, %zmm24 #75.38 - vfmadd213pd %zmm30, %zmm23, %zmm30{%k4} #75.38 - vfmadd213pd %zmm30, %zmm24, %zmm30{%k4} #75.38 + vpcmpgtd %ymm4, %ymm3, %k5 #59.9 + vpaddd %ymm15, %ymm4, %ymm4 #59.9 + vmovdqu32 (%r10,%r15,4), %ymm17{%k5}{z} #60.21 + vpaddd %ymm17, %ymm17, %ymm18 #61.36 + addq $8, %r15 #59.9 + vpaddd %ymm18, %ymm17, %ymm19 #61.36 + kmovw %k5, %k2 #61.36 + kmovw %k5, %k3 #61.36 + kmovw %k5, %k1 #61.36 + vpxord %zmm21, %zmm21, %zmm21 #61.36 + vpxord %zmm20, %zmm20, %zmm20 #61.36 + vpxord %zmm22, %zmm22, %zmm22 #61.36 + vgatherdpd 8(%rbx,%ymm19,8), %zmm21{%k2} #61.36 + vgatherdpd (%rbx,%ymm19,8), %zmm20{%k3} #61.36 + vgatherdpd 16(%rbx,%ymm19,8), %zmm22{%k1} #61.36 + vsubpd %zmm21, %zmm1, %zmm18 #62.36 + vsubpd %zmm20, %zmm2, %zmm17 #61.36 + vsubpd %zmm22, %zmm0, %zmm19 #63.36 + vmulpd %zmm18, %zmm18, %zmm31 #64.49 + vfmadd231pd %zmm17, %zmm17, %zmm31 #64.49 + vfmadd231pd %zmm19, %zmm19, %zmm31 #64.63 + vrcp14pd %zmm31, %zmm30 #75.39 + vcmppd $1, %zmm14, %zmm31, %k6{%k5} #74.22 + vfpclasspd $30, %zmm30, %k0 #75.39 + vmovaps %zmm31, %zmm23 #75.39 + vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23 #75.39 + knotw %k0, %k4 #75.39 + vmulpd %zmm23, %zmm23, %zmm24 #75.39 + vfmadd213pd %zmm30, %zmm23, %zmm30{%k4} #75.39 + vfmadd213pd %zmm30, %zmm24, %zmm30{%k4} #75.39 vmulpd %zmm13, %zmm30, %zmm25 #76.38 - vmulpd %zmm12, %zmm30, %zmm27 #77.54 + vmulpd %zmm12, %zmm30, %zmm27 #77.55 vmulpd %zmm25, %zmm30, %zmm28 #76.44 vmulpd %zmm28, %zmm30, %zmm26 #76.50 - vfmsub213pd %zmm5, %zmm28, %zmm30 #77.54 - vmulpd %zmm27, %zmm26, %zmm29 #77.61 - vmulpd %zmm30, %zmm29, %zmm23 #77.67 + vfmsub213pd %zmm5, %zmm28, %zmm30 #77.55 + vmulpd %zmm27, %zmm26, %zmm29 #77.64 + vmulpd %zmm30, %zmm29, %zmm23 #77.70 vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17 vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17 vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17 - cmpq %r14, %r15 #56.9 - jb ..B1.16 # Prob 82% #56.9 + cmpq %r14, %r15 #59.9 + jb ..B1.16 # Prob 82% #59.9 # LLVM-MCA-END movl $222, %ebx # OSACA END MARKER .byte 100 # OSACA END MARKER @@ -251,145 +251,145 @@ movl $222, %ebx # OSACA END MARKER # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r14 r15 r11d r12d r13d xmm6 xmm7 xmm11 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 ..B1.17: # Preds ..B1.16 # Execution count [4.50e+00] - cmpl %r12d, %r13d #56.9 - je ..B1.24 # Prob 10% #56.9 + cmpl %r12d, %r13d #59.9 + je ..B1.24 # Prob 10% #59.9 # LOE rax rdx rcx rbx rsi rdi r8 r9 r11d r12d r13d xmm6 xmm7 xmm11 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 ..B1.18: # Preds ..B1.17 ..B1.14 ..B1.32 # Execution count [2.50e+01] - lea 8(%r12), %r10d #56.9 - cmpl %r10d, %r11d #56.9 - jl ..B1.22 # Prob 50% #56.9 + lea 8(%r12), %r10d #59.9 + cmpl %r10d, %r11d #59.9 + jl ..B1.22 # Prob 50% #59.9 # LOE rax rdx rcx rbx rsi rdi r8 r9 r11d r12d r13d xmm6 xmm7 xmm11 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 ..B1.19: # Preds ..B1.18 # Execution count [4.50e+00] - movq %r8, %r10 #42.43 - imulq %rsi, %r10 #42.43 - vbroadcastsd %xmm11, %zmm2 #44.23 - vbroadcastsd %xmm6, %zmm1 #45.23 - vbroadcastsd %xmm7, %zmm0 #46.23 - movslq %r12d, %r14 
#56.9 - addq %r9, %r10 #25.5 + movq %r8, %r10 #46.43 + imulq %rsi, %r10 #46.43 + vbroadcastsd %xmm11, %zmm2 #48.23 + vbroadcastsd %xmm6, %zmm1 #49.23 + vbroadcastsd %xmm7, %zmm0 #50.23 + movslq %r12d, %r14 #59.9 + addq %r9, %r10 #23.5 # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r14 r11d r12d r13d xmm6 xmm7 xmm11 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 ..B1.20: # Preds ..B1.20 ..B1.19 # Execution count [2.50e+01] - vmovdqu (%r10,%r14,4), %ymm3 #57.21 - addl $8, %r12d #56.9 - vpcmpeqb %xmm0, %xmm0, %k2 #58.36 - vpcmpeqb %xmm0, %xmm0, %k3 #58.36 - vpcmpeqb %xmm0, %xmm0, %k1 #58.36 - vpaddd %ymm3, %ymm3, %ymm4 #58.36 - vpaddd %ymm4, %ymm3, %ymm17 #58.36 - addq $8, %r14 #56.9 - vpxord %zmm19, %zmm19, %zmm19 #58.36 - vpxord %zmm18, %zmm18, %zmm18 #58.36 - vpxord %zmm20, %zmm20, %zmm20 #58.36 - vgatherdpd 8(%rbx,%ymm17,8), %zmm19{%k2} #58.36 - vgatherdpd (%rbx,%ymm17,8), %zmm18{%k3} #58.36 - vgatherdpd 16(%rbx,%ymm17,8), %zmm20{%k1} #58.36 - vsubpd %zmm19, %zmm1, %zmm30 #59.36 - vsubpd %zmm18, %zmm2, %zmm29 #58.36 - vsubpd %zmm20, %zmm0, %zmm3 #60.36 - vmulpd %zmm30, %zmm30, %zmm21 #61.49 - vfmadd231pd %zmm29, %zmm29, %zmm21 #61.49 - vfmadd231pd %zmm3, %zmm3, %zmm21 #61.63 - vrcp14pd %zmm21, %zmm28 #75.38 - vcmppd $1, %zmm14, %zmm21, %k5 #71.22 - vfpclasspd $30, %zmm28, %k0 #75.38 - vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm28, %zmm21 #75.38 - knotw %k0, %k4 #75.38 - vmulpd %zmm21, %zmm21, %zmm22 #75.38 - vfmadd213pd %zmm28, %zmm21, %zmm28{%k4} #75.38 - vfmadd213pd %zmm28, %zmm22, %zmm28{%k4} #75.38 + vmovdqu (%r10,%r14,4), %ymm3 #60.21 + addl $8, %r12d #59.9 + vpcmpeqb %xmm0, %xmm0, %k2 #61.36 + vpcmpeqb %xmm0, %xmm0, %k3 #61.36 + vpcmpeqb %xmm0, %xmm0, %k1 #61.36 + vpaddd %ymm3, %ymm3, %ymm4 #61.36 + vpaddd %ymm4, %ymm3, %ymm17 #61.36 + addq $8, %r14 #59.9 + vpxord %zmm19, %zmm19, %zmm19 #61.36 + vpxord %zmm18, %zmm18, %zmm18 #61.36 + vpxord %zmm20, %zmm20, %zmm20 #61.36 + vgatherdpd 8(%rbx,%ymm17,8), %zmm19{%k2} #61.36 + vgatherdpd (%rbx,%ymm17,8), %zmm18{%k3} #61.36 + vgatherdpd 16(%rbx,%ymm17,8), %zmm20{%k1} #61.36 + vsubpd %zmm19, %zmm1, %zmm30 #62.36 + vsubpd %zmm18, %zmm2, %zmm29 #61.36 + vsubpd %zmm20, %zmm0, %zmm3 #63.36 + vmulpd %zmm30, %zmm30, %zmm21 #64.49 + vfmadd231pd %zmm29, %zmm29, %zmm21 #64.49 + vfmadd231pd %zmm3, %zmm3, %zmm21 #64.63 + vrcp14pd %zmm21, %zmm28 #75.39 + vcmppd $1, %zmm14, %zmm21, %k5 #74.22 + vfpclasspd $30, %zmm28, %k0 #75.39 + vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm28, %zmm21 #75.39 + knotw %k0, %k4 #75.39 + vmulpd %zmm21, %zmm21, %zmm22 #75.39 + vfmadd213pd %zmm28, %zmm21, %zmm28{%k4} #75.39 + vfmadd213pd %zmm28, %zmm22, %zmm28{%k4} #75.39 vmulpd %zmm13, %zmm28, %zmm23 #76.38 - vmulpd %zmm12, %zmm28, %zmm25 #77.54 + vmulpd %zmm12, %zmm28, %zmm25 #77.55 vmulpd %zmm23, %zmm28, %zmm26 #76.44 vmulpd %zmm26, %zmm28, %zmm24 #76.50 - vfmsub213pd %zmm5, %zmm26, %zmm28 #77.54 - vmulpd %zmm25, %zmm24, %zmm27 #77.61 - vmulpd %zmm28, %zmm27, %zmm31 #77.67 + vfmsub213pd %zmm5, %zmm26, %zmm28 #77.55 + vmulpd %zmm25, %zmm24, %zmm27 #77.64 + vmulpd %zmm28, %zmm27, %zmm31 #77.70 vfmadd231pd %zmm29, %zmm31, %zmm10{%k5} #78.17 vfmadd231pd %zmm30, %zmm31, %zmm9{%k5} #79.17 vfmadd231pd %zmm3, %zmm31, %zmm8{%k5} #80.17 - cmpl %r11d, %r12d #56.9 - jb ..B1.20 # Prob 82% #56.9 + cmpl %r11d, %r12d #59.9 + jb ..B1.20 # Prob 82% #59.9 # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r14 r11d r12d r13d xmm6 xmm7 xmm11 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 ..B1.22: # Preds ..B1.20 ..B1.18 ..B1.33 # Execution count [5.00e+00] - 
lea 1(%r11), %r10d #56.9 - cmpl %r13d, %r10d #56.9 - ja ..B1.24 # Prob 50% #56.9 + lea 1(%r11), %r10d #59.9 + cmpl %r13d, %r10d #59.9 + ja ..B1.24 # Prob 50% #59.9 # LOE rax rdx rcx rbx rsi rdi r8 r9 r11d r13d xmm6 xmm7 xmm11 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 ..B1.23: # Preds ..B1.22 # Execution count [2.50e+01] - imulq %r8, %rsi #42.43 - vbroadcastsd %xmm7, %zmm17 #46.23 - vbroadcastsd %xmm6, %zmm4 #45.23 - vbroadcastsd %xmm11, %zmm2 #44.23 - movl %r13d, %r10d #56.9 - addq %r9, %rsi #25.5 - subl %r11d, %r10d #56.9 - vpbroadcastd %r10d, %ymm7 #56.9 - vpcmpgtd %ymm16, %ymm7, %k5 #56.9 - movslq %r11d, %r11 #56.9 - kmovw %k5, %k2 #58.36 - kmovw %k5, %k3 #58.36 - kmovw %k5, %k1 #58.36 - vmovdqu32 (%rsi,%r11,4), %ymm6{%k5}{z} #57.21 - vpaddd %ymm6, %ymm6, %ymm0 #58.36 - vpaddd %ymm0, %ymm6, %ymm1 #58.36 - vpxord %zmm11, %zmm11, %zmm11 #58.36 - vpxord %zmm3, %zmm3, %zmm3 #58.36 - vpxord %zmm18, %zmm18, %zmm18 #58.36 - vgatherdpd 8(%rbx,%ymm1,8), %zmm11{%k2} #58.36 - vgatherdpd (%rbx,%ymm1,8), %zmm3{%k3} #58.36 - vgatherdpd 16(%rbx,%ymm1,8), %zmm18{%k1} #58.36 - vsubpd %zmm11, %zmm4, %zmm29 #59.36 - vsubpd %zmm3, %zmm2, %zmm28 #58.36 - vsubpd %zmm18, %zmm17, %zmm31 #60.36 - vmulpd %zmm29, %zmm29, %zmm27 #61.49 - vfmadd231pd %zmm28, %zmm28, %zmm27 #61.49 - vfmadd231pd %zmm31, %zmm31, %zmm27 #61.63 - vrcp14pd %zmm27, %zmm26 #75.38 - vcmppd $1, %zmm14, %zmm27, %k6{%k5} #71.22 - vfpclasspd $30, %zmm26, %k0 #75.38 - vmovaps %zmm27, %zmm19 #75.38 - vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm26, %zmm19 #75.38 - knotw %k0, %k4 #75.38 - vmulpd %zmm19, %zmm19, %zmm20 #75.38 - vfmadd213pd %zmm26, %zmm19, %zmm26{%k4} #75.38 - vfmadd213pd %zmm26, %zmm20, %zmm26{%k4} #75.38 + imulq %r8, %rsi #46.43 + vbroadcastsd %xmm7, %zmm17 #50.23 + vbroadcastsd %xmm6, %zmm4 #49.23 + vbroadcastsd %xmm11, %zmm2 #48.23 + movl %r13d, %r10d #59.9 + addq %r9, %rsi #23.5 + subl %r11d, %r10d #59.9 + vpbroadcastd %r10d, %ymm7 #59.9 + vpcmpgtd %ymm16, %ymm7, %k5 #59.9 + movslq %r11d, %r11 #59.9 + kmovw %k5, %k2 #61.36 + kmovw %k5, %k3 #61.36 + kmovw %k5, %k1 #61.36 + vmovdqu32 (%rsi,%r11,4), %ymm6{%k5}{z} #60.21 + vpaddd %ymm6, %ymm6, %ymm0 #61.36 + vpaddd %ymm0, %ymm6, %ymm1 #61.36 + vpxord %zmm11, %zmm11, %zmm11 #61.36 + vpxord %zmm3, %zmm3, %zmm3 #61.36 + vpxord %zmm18, %zmm18, %zmm18 #61.36 + vgatherdpd 8(%rbx,%ymm1,8), %zmm11{%k2} #61.36 + vgatherdpd (%rbx,%ymm1,8), %zmm3{%k3} #61.36 + vgatherdpd 16(%rbx,%ymm1,8), %zmm18{%k1} #61.36 + vsubpd %zmm11, %zmm4, %zmm29 #62.36 + vsubpd %zmm3, %zmm2, %zmm28 #61.36 + vsubpd %zmm18, %zmm17, %zmm31 #63.36 + vmulpd %zmm29, %zmm29, %zmm27 #64.49 + vfmadd231pd %zmm28, %zmm28, %zmm27 #64.49 + vfmadd231pd %zmm31, %zmm31, %zmm27 #64.63 + vrcp14pd %zmm27, %zmm26 #75.39 + vcmppd $1, %zmm14, %zmm27, %k6{%k5} #74.22 + vfpclasspd $30, %zmm26, %k0 #75.39 + vmovaps %zmm27, %zmm19 #75.39 + vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm26, %zmm19 #75.39 + knotw %k0, %k4 #75.39 + vmulpd %zmm19, %zmm19, %zmm20 #75.39 + vfmadd213pd %zmm26, %zmm19, %zmm26{%k4} #75.39 + vfmadd213pd %zmm26, %zmm20, %zmm26{%k4} #75.39 vmulpd %zmm13, %zmm26, %zmm21 #76.38 - vmulpd %zmm12, %zmm26, %zmm23 #77.54 + vmulpd %zmm12, %zmm26, %zmm23 #77.55 vmulpd %zmm21, %zmm26, %zmm24 #76.44 vmulpd %zmm24, %zmm26, %zmm22 #76.50 - vfmsub213pd %zmm5, %zmm24, %zmm26 #77.54 - vmulpd %zmm23, %zmm22, %zmm25 #77.61 - vmulpd %zmm26, %zmm25, %zmm30 #77.67 + vfmsub213pd %zmm5, %zmm24, %zmm26 #77.55 + vmulpd %zmm23, %zmm22, %zmm25 #77.64 + vmulpd %zmm26, %zmm25, %zmm30 #77.70 vfmadd231pd %zmm28, %zmm30, %zmm10{%k6} 
#78.17 vfmadd231pd %zmm29, %zmm30, %zmm9{%k6} #79.17 vfmadd231pd %zmm31, %zmm30, %zmm8{%k6} #80.17 # LOE rax rdx rcx rbx rdi r8 r9 r13d ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 ..B1.24: # Preds ..B1.17 ..B1.23 ..B1.22 # Execution count [4.50e+00] - vmovups .L_2il0floatpacket.6(%rip), %zmm19 #49.22 - vpermd %zmm8, %zmm19, %zmm0 #49.22 - vpermd %zmm9, %zmm19, %zmm6 #48.22 - vpermd %zmm10, %zmm19, %zmm20 #47.22 - vaddpd %zmm8, %zmm0, %zmm8 #49.22 - vaddpd %zmm9, %zmm6, %zmm9 #48.22 - vaddpd %zmm10, %zmm20, %zmm10 #47.22 - vpermpd $78, %zmm8, %zmm1 #49.22 - vpermpd $78, %zmm9, %zmm7 #48.22 - vpermpd $78, %zmm10, %zmm21 #47.22 - vaddpd %zmm1, %zmm8, %zmm2 #49.22 - vaddpd %zmm7, %zmm9, %zmm11 #48.22 - vaddpd %zmm21, %zmm10, %zmm22 #47.22 - vpermpd $177, %zmm2, %zmm3 #49.22 - vpermpd $177, %zmm11, %zmm17 #48.22 - vpermpd $177, %zmm22, %zmm23 #47.22 - vaddpd %zmm3, %zmm2, %zmm4 #49.22 - vaddpd %zmm17, %zmm11, %zmm18 #48.22 - vaddpd %zmm23, %zmm22, %zmm24 #47.22 + vmovups .L_2il0floatpacket.6(%rip), %zmm19 #53.22 + vpermd %zmm8, %zmm19, %zmm0 #53.22 + vpermd %zmm9, %zmm19, %zmm6 #52.22 + vpermd %zmm10, %zmm19, %zmm20 #51.22 + vaddpd %zmm8, %zmm0, %zmm8 #53.22 + vaddpd %zmm9, %zmm6, %zmm9 #52.22 + vaddpd %zmm10, %zmm20, %zmm10 #51.22 + vpermpd $78, %zmm8, %zmm1 #53.22 + vpermpd $78, %zmm9, %zmm7 #52.22 + vpermpd $78, %zmm10, %zmm21 #51.22 + vaddpd %zmm1, %zmm8, %zmm2 #53.22 + vaddpd %zmm7, %zmm9, %zmm11 #52.22 + vaddpd %zmm21, %zmm10, %zmm22 #51.22 + vpermpd $177, %zmm2, %zmm3 #53.22 + vpermpd $177, %zmm11, %zmm17 #52.22 + vpermpd $177, %zmm22, %zmm23 #51.22 + vaddpd %zmm3, %zmm2, %zmm4 #53.22 + vaddpd %zmm17, %zmm11, %zmm18 #52.22 + vaddpd %zmm23, %zmm22, %zmm24 #51.22 # LOE rax rdx rcx rbx rdi r8 r9 r13d xmm4 xmm18 xmm24 ymm15 ymm16 zmm5 zmm12 zmm13 zmm14 ..B1.25: # Preds ..B1.24 ..B1.7 # Execution count [5.00e+00] @@ -407,14 +407,14 @@ movl $222, %ebx # OSACA END MARKER sarl $3, %r11d #94.9 vmovsd %xmm1, 8(%rax,%rsi) #90.9 vmovsd %xmm2, 16(%rax,%rsi) #91.9 - addq $24, %rax #41.5 + addq $24, %rax #45.5 movslq %r11d, %r11 #94.9 - movslq %edi, %rsi #41.32 - incq %rdi #41.5 + movslq %edi, %rsi #45.32 + incq %rdi #45.5 addq %r11, %rcx #94.9 - incq %rsi #41.32 - cmpq 64(%rsp), %rdi #41.5[spill] - jb ..B1.7 # Prob 82% #41.5 + incq %rsi #45.32 + cmpq 64(%rsp), %rdi #45.5[spill] + jb ..B1.7 # Prob 82% #45.5 # LOE rax rdx rcx rbx rsi rdi r8 r9 ymm15 ymm16 zmm5 zmm12 zmm13 zmm14 ..B1.26: # Preds ..B1.25 # Execution count [9.00e-01] @@ -439,10 +439,10 @@ movl $222, %ebx # OSACA END MARKER # LOE r12 ..B1.28: # Preds ..B1.44 # Execution count [5.00e-01] - movl $.L_2__STRING.0, %edi #39.5 + movl $.L_2__STRING.0, %edi #42.5 ..___tag_value_computeForceLJFullNeigh_plain_c.36: # likwid_markerStartRegion(const char *) - call likwid_markerStartRegion #39.5 + call likwid_markerStartRegion #42.5 ..___tag_value_computeForceLJFullNeigh_plain_c.37: # LOE r12 ..B1.29: # Preds ..B1.26 ..B1.28 @@ -456,29 +456,29 @@ movl $222, %ebx # OSACA END MARKER # LOE r12 ..B1.30: # Preds ..B1.29 # Execution count [1.00e+00] - xorl %eax, %eax #98.16 + xorl %eax, %eax #100.16 ..___tag_value_computeForceLJFullNeigh_plain_c.40: # getTimeStamp() - call getTimeStamp #98.16 + call getTimeStamp #100.16 ..___tag_value_computeForceLJFullNeigh_plain_c.41: # LOE r12 xmm0 ..B1.31: # Preds ..B1.30 # Execution count [1.00e+00] - vsubsd 40(%rsp), %xmm0, %xmm0 #102.14[spill] - addq $96, %rsp #102.14 + vsubsd 40(%rsp), %xmm0, %xmm0 #101.14[spill] + addq $96, %rsp #101.14 .cfi_restore 3 - popq %rbx #102.14 + popq %rbx #101.14 .cfi_restore 15 - 
popq %r15 #102.14 + popq %r15 #101.14 .cfi_restore 14 - popq %r14 #102.14 + popq %r14 #101.14 .cfi_restore 13 - popq %r13 #102.14 - movq %rbp, %rsp #102.14 - popq %rbp #102.14 + popq %r13 #101.14 + movq %rbp, %rsp #101.14 + popq %rbp #101.14 .cfi_def_cfa 7, 8 .cfi_restore 6 - ret #102.14 + ret #101.14 .cfi_def_cfa 6, 16 .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 .cfi_offset 6, -16 @@ -489,63 +489,63 @@ movl $222, %ebx # OSACA END MARKER # LOE ..B1.32: # Preds ..B1.9 # Execution count [4.50e-01]: Infreq - movl %r13d, %r11d #56.9 - xorl %r12d, %r12d #56.9 - andl $-8, %r11d #56.9 - jmp ..B1.18 # Prob 100% #56.9 + movl %r13d, %r11d #59.9 + xorl %r12d, %r12d #59.9 + andl $-8, %r11d #59.9 + jmp ..B1.18 # Prob 100% #59.9 # LOE rax rdx rcx rbx rsi rdi r8 r9 r11d r12d r13d xmm6 xmm7 xmm11 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 ..B1.33: # Preds ..B1.8 # Execution count [4.50e-01]: Infreq - xorl %r11d, %r11d #56.9 - jmp ..B1.22 # Prob 100% #56.9 + xorl %r11d, %r11d #59.9 + jmp ..B1.22 # Prob 100% #59.9 .cfi_restore 12 # LOE rax rdx rcx rbx rsi rdi r8 r9 r11d r13d xmm6 xmm7 xmm11 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 ..B1.34: # Preds ..B1.2 # Execution count [1.00e+00]: Infreq - cmpl $8, %esi #32.5 - jl ..B1.40 # Prob 10% #32.5 + cmpl $8, %esi #33.5 + jl ..B1.40 # Prob 10% #33.5 # LOE rbx rdi r12 r13 r14 esi r15d ..B1.35: # Preds ..B1.34 # Execution count [1.00e+00]: Infreq - movl %esi, %eax #32.5 - xorl %ecx, %ecx #32.5 - andl $-8, %eax #32.5 - movslq %eax, %rdx #32.5 - vpxord %zmm0, %zmm0, %zmm0 #33.22 + movl %esi, %eax #33.5 + xorl %ecx, %ecx #33.5 + andl $-8, %eax #33.5 + movslq %eax, %rdx #33.5 + vpxord %zmm0, %zmm0, %zmm0 #34.22 # LOE rdx rcx rbx rdi r12 r13 r14 eax esi r15d zmm0 ..B1.36: # Preds ..B1.36 ..B1.35 # Execution count [5.56e+00]: Infreq - vmovupd %zmm0, (%rdi,%rcx,8) #33.9 - addq $8, %rcx #32.5 - cmpq %rdx, %rcx #32.5 - jb ..B1.36 # Prob 82% #32.5 + vmovupd %zmm0, (%rdi,%rcx,8) #34.9 + addq $8, %rcx #33.5 + cmpq %rdx, %rcx #33.5 + jb ..B1.36 # Prob 82% #33.5 # LOE rdx rcx rbx rdi r12 r13 r14 eax esi r15d zmm0 ..B1.38: # Preds ..B1.36 ..B1.40 # Execution count [1.11e+00]: Infreq - lea 1(%rax), %edx #32.5 - cmpl %esi, %edx #32.5 - ja ..B1.46 # Prob 50% #32.5 + lea 1(%rax), %edx #33.5 + cmpl %esi, %edx #33.5 + ja ..B1.46 # Prob 50% #33.5 # LOE rbx rdi r12 r13 r14 eax esi r15d ..B1.39: # Preds ..B1.38 # Execution count [5.56e+00]: Infreq - subl %eax, %esi #32.5 - vpbroadcastd %esi, %ymm0 #32.5 - vpcmpgtd .L_2il0floatpacket.0(%rip), %ymm0, %k1 #32.5 - movslq %eax, %rax #32.5 - movslq %r15d, %r15 #32.5 - vpxord %zmm1, %zmm1, %zmm1 #33.22 - vmovupd %zmm1, (%rdi,%rax,8){%k1} #33.9 - jmp ..B1.4 # Prob 100% #33.9 + subl %eax, %esi #33.5 + vpbroadcastd %esi, %ymm0 #33.5 + vpcmpgtd .L_2il0floatpacket.0(%rip), %ymm0, %k1 #33.5 + movslq %eax, %rax #33.5 + movslq %r15d, %r15 #33.5 + vpxord %zmm1, %zmm1, %zmm1 #34.22 + vmovupd %zmm1, (%rdi,%rax,8){%k1} #34.9 + jmp ..B1.4 # Prob 100% #34.9 # LOE rbx r12 r13 r14 r15 ..B1.40: # Preds ..B1.34 # Execution count [1.00e-01]: Infreq - xorl %eax, %eax #32.5 - jmp ..B1.38 # Prob 100% #32.5 + xorl %eax, %eax #33.5 + jmp ..B1.38 # Prob 100% #33.5 # LOE rbx rdi r12 r13 r14 eax esi r15d ..B1.46: # Preds ..B1.38 # Execution count [5.56e-01]: Infreq - movslq %r15d, %r15 #32.5 - jmp ..B1.4 # Prob 100% #32.5 + movslq %r15d, %r15 #33.5 + jmp ..B1.4 # Prob 100% #33.5 .align 16,0x90 # LOE rbx r12 r13 r14 r15 .cfi_endproc @@ -573,568 +573,532 @@ computeForceLJHalfNeigh: 
.cfi_startproc ..___tag_value_computeForceLJHalfNeigh.58: ..L59: - #105.96 - pushq %rbp #105.96 + #104.96 + pushq %rbp #104.96 .cfi_def_cfa_offset 16 - movq %rsp, %rbp #105.96 + movq %rsp, %rbp #104.96 .cfi_def_cfa 6, 16 .cfi_offset 6, -16 - andq $-64, %rsp #105.96 - pushq %r12 #105.96 - pushq %r13 #105.96 - pushq %r14 #105.96 - pushq %r15 #105.96 - pushq %rbx #105.96 - subq $88, %rsp #105.96 + andq $-64, %rsp #104.96 + pushq %r12 #104.96 + pushq %r13 #104.96 + pushq %r14 #104.96 + pushq %r15 #104.96 + pushq %rbx #104.96 + subq $88, %rsp #104.96 .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 - movq %rsi, %rbx #105.96 - vmovsd 144(%rdi), %xmm0 #109.27 - movq %rcx, %r13 #105.96 - vmovsd 56(%rdi), %xmm1 #110.23 - vmovsd 40(%rdi), %xmm2 #111.24 - movl 4(%rbx), %r15d #106.18 - movq %rdx, 48(%rsp) #105.96[spill] - movq %rdi, 16(%rsp) #105.96[spill] - vmovsd %xmm0, 32(%rsp) #109.27[spill] - vmovsd %xmm1, 24(%rsp) #110.23[spill] - vmovsd %xmm2, 40(%rsp) #111.24[spill] - testl %r15d, %r15d #114.24 - jle ..B2.28 # Prob 50% #114.24 - # LOE rbx r13 r15d + movq %rsi, %r13 #104.96 + vmovsd 144(%rdi), %xmm0 #108.27 + movq %rcx, %r14 #104.96 + vmovsd 56(%rdi), %xmm1 #109.23 + movq %rdx, %r12 #104.96 + vmovsd 40(%rdi), %xmm2 #110.24 + movl 4(%r13), %ebx #105.18 + vmovsd %xmm0, 24(%rsp) #108.27[spill] + vmovsd %xmm1, 32(%rsp) #109.23[spill] + vmovsd %xmm2, 16(%rsp) #110.24[spill] + testl %ebx, %ebx #113.24 + jle ..B2.28 # Prob 50% #113.24 + # LOE r12 r13 r14 ebx ..B2.2: # Preds ..B2.1 # Execution count [5.00e-03] - movq 64(%rbx), %rdi #115.9 - lea (%r15,%r15,2), %esi #106.18 - cmpl $12, %esi #114.5 - jle ..B2.36 # Prob 0% #114.5 - # LOE rbx rdi r13 esi r15d + movq 64(%r13), %rdi #114.9 + lea (%rbx,%rbx,2), %esi #105.18 + cmpl $12, %esi #113.5 + jle ..B2.35 # Prob 0% #113.5 + # LOE rdi r12 r13 r14 ebx esi ..B2.3: # Preds ..B2.2 # Execution count [1.00e+00] - movslq %r15d, %r14 #114.5 - xorl %esi, %esi #114.5 - lea (%r14,%r14,2), %rdx #114.5 - shlq $3, %rdx #114.5 - call __intel_skx_avx512_memset #114.5 - # LOE rbx r13 r14 r15d -..B2.4: # Preds ..B2.3 ..B2.48 ..B2.41 + movslq %ebx, %r15 #113.5 + xorl %esi, %esi #113.5 + lea (%r15,%r15,2), %rdx #113.5 + shlq $3, %rdx #113.5 + call __intel_skx_avx512_memset #113.5 + # LOE r12 r13 r14 r15 ebx +..B2.4: # Preds ..B2.3 ..B2.47 ..B2.40 # Execution count [1.00e+00] - xorl %r12d, %r12d #120.22 - xorl %eax, %eax #121.16 - vzeroupper #121.16 -..___tag_value_computeForceLJHalfNeigh.73: + xorl %eax, %eax #119.16 + vzeroupper #119.16 +..___tag_value_computeForceLJHalfNeigh.71: # getTimeStamp() - call getTimeStamp #121.16 -..___tag_value_computeForceLJHalfNeigh.74: - # LOE rbx r13 r14 r12d r15d xmm0 -..B2.45: # Preds ..B2.4 + call getTimeStamp #119.16 +..___tag_value_computeForceLJHalfNeigh.72: + # LOE r12 r13 r14 r15 ebx xmm0 +..B2.44: # Preds ..B2.4 # Execution count [1.00e+00] - vmovsd %xmm0, 8(%rsp) #121.16[spill] - # LOE rbx r13 r14 r12d r15d -..B2.5: # Preds ..B2.45 + vmovsd %xmm0, 8(%rsp) #119.16[spill] + # LOE r12 r13 r14 r15 ebx +..B2.5: # Preds ..B2.44 # 
Execution count [5.00e-01] - movl $.L_2__STRING.1, %edi #122.5 -..___tag_value_computeForceLJHalfNeigh.76: + movl $.L_2__STRING.1, %edi #123.5 +..___tag_value_computeForceLJHalfNeigh.74: # likwid_markerStartRegion(const char *) - call likwid_markerStartRegion #122.5 -..___tag_value_computeForceLJHalfNeigh.77: - # LOE rbx r13 r14 r12d r15d + call likwid_markerStartRegion #123.5 +..___tag_value_computeForceLJHalfNeigh.75: + # LOE r12 r13 r14 r15 ebx ..B2.6: # Preds ..B2.5 # Execution count [9.00e-01] - vmovsd 32(%rsp), %xmm9 #109.45[spill] - xorl %edi, %edi #124.15 - vmovsd 40(%rsp), %xmm0 #161.41[spill] - xorl %r9d, %r9d #124.5 - vmulsd %xmm9, %xmm9, %xmm10 #109.45 - vmovdqu .L_2il0floatpacket.0(%rip), %ymm14 #143.9 - vmulsd .L_2il0floatpacket.3(%rip), %xmm0, %xmm1 #161.41 - vmovdqu .L_2il0floatpacket.1(%rip), %ymm13 #143.9 - vmovdqu .L_2il0floatpacket.7(%rip), %ymm12 #146.36 - vmovdqu .L_2il0floatpacket.8(%rip), %ymm11 #147.36 - vmovups .L_2il0floatpacket.4(%rip), %zmm5 #161.54 - vpbroadcastd %r15d, %ymm4 #106.18 - vbroadcastsd %xmm10, %zmm10 #109.25 - vbroadcastsd 24(%rsp), %zmm9 #110.21[spill] - vbroadcastsd %xmm1, %zmm7 #161.41 - movq 48(%rsp), %rax #125.19[spill] - movq 16(%rbx), %r11 #127.25 - movq 64(%rbx), %rdx #168.21 - movq 24(%rax), %r15 #126.25 - movslq 8(%rax), %r8 #125.43 - movq 16(%rax), %r10 #125.19 - xorl %eax, %eax #124.5 - shlq $2, %r8 #107.5 - movq (%r13), %rcx #179.9 - movq 8(%r13), %rbx #180.9 - movq %r15, 56(%rsp) #124.5[spill] - movq %r14, 64(%rsp) #124.5[spill] - movq %r13, (%rsp) #124.5[spill] - vpxord %zmm15, %zmm15, %zmm15 #124.5 - # LOE rax rdx rcx rbx rdi r8 r9 r10 r11 r12d ymm4 ymm11 ymm12 ymm13 ymm14 zmm5 zmm7 zmm9 zmm10 zmm15 + vmovsd 24(%rsp), %xmm9 #108.45[spill] + xorl %r10d, %r10d #126.15 + vmovsd 16(%rsp), %xmm0 #162.41[spill] + xorl %r9d, %r9d #126.5 + vmulsd %xmm9, %xmm9, %xmm10 #108.45 + xorl %eax, %eax #126.5 + vmovdqu .L_2il0floatpacket.0(%rip), %ymm14 #144.9 + vmulsd .L_2il0floatpacket.3(%rip), %xmm0, %xmm1 #162.41 + vmovdqu .L_2il0floatpacket.1(%rip), %ymm13 #144.9 + vmovdqu .L_2il0floatpacket.7(%rip), %ymm12 #147.36 + vmovdqu .L_2il0floatpacket.8(%rip), %ymm11 #148.36 + vmovups .L_2il0floatpacket.4(%rip), %zmm5 #162.54 + vpbroadcastd %ebx, %ymm4 #105.18 + vbroadcastsd %xmm10, %zmm10 #108.25 + vbroadcastsd 32(%rsp), %zmm9 #109.21[spill] + vbroadcastsd %xmm1, %zmm7 #162.41 + movq 24(%r12), %r11 #128.25 + movslq 8(%r12), %rdi #127.43 + movq 16(%r12), %r8 #127.19 + shlq $2, %rdi #106.5 + movq 16(%r13), %rsi #129.25 + movq 64(%r13), %rdx #169.21 + movq (%r14), %rcx #180.9 + movq 8(%r14), %rbx #181.9 + movq %r11, 40(%rsp) #126.5[spill] + movq %r15, 48(%rsp) #126.5[spill] + movq %r14, (%rsp) #126.5[spill] + vpxord %zmm15, %zmm15, %zmm15 #126.5 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 ymm4 ymm11 ymm12 ymm13 ymm14 zmm5 zmm7 zmm9 zmm10 zmm15 ..B2.7: # Preds ..B2.26 ..B2.6 # Execution count [5.00e+00] - movq 56(%rsp), %r13 #126.25[spill] - vxorpd %xmm27, %xmm27, %xmm27 #130.22 - vmovapd %xmm27, %xmm21 #131.22 - movl (%r13,%r9,4), %r13d #126.25 - addl %r13d, %r12d #138.9 - vmovsd (%rax,%r11), %xmm1 #127.25 - vmovapd %xmm21, %xmm3 #132.22 - vmovsd 8(%rax,%r11), %xmm0 #128.25 - vmovsd 16(%rax,%r11), %xmm2 #129.25 - testl %r13d, %r13d #143.9 - jle ..B2.26 # Prob 50% #143.9 - # LOE rax rdx rcx rbx rdi r8 r9 r10 r11 r12d r13d xmm0 xmm1 xmm2 xmm3 xmm21 xmm27 ymm4 ymm11 ymm12 ymm13 ymm14 zmm5 zmm7 zmm9 zmm10 zmm15 + movq 40(%rsp), %r11 #128.25[spill] + vxorpd %xmm27, %xmm27, %xmm27 #132.22 + vmovapd %xmm27, %xmm21 #133.22 + movl (%r11,%r9,4), %r11d #128.25 + 
vmovapd %xmm21, %xmm3 #134.22 + vmovsd (%rax,%rsi), %xmm1 #129.25 + vmovsd 8(%rax,%rsi), %xmm0 #130.25 + vmovsd 16(%rax,%rsi), %xmm2 #131.25 + testl %r11d, %r11d #144.9 + jle ..B2.26 # Prob 50% #144.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11d xmm0 xmm1 xmm2 xmm3 xmm21 xmm27 ymm4 ymm11 ymm12 ymm13 ymm14 zmm5 zmm7 zmm9 zmm10 zmm15 ..B2.8: # Preds ..B2.7 # Execution count [2.50e+00] - jbe ..B2.26 # Prob 50% #143.9 - # LOE rax rdx rcx rbx rdi r8 r9 r10 r11 r12d r13d xmm0 xmm1 xmm2 xmm3 xmm21 xmm27 ymm4 ymm11 ymm12 ymm13 ymm14 zmm5 zmm7 zmm9 zmm10 zmm15 + jbe ..B2.26 # Prob 50% #144.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11d xmm0 xmm1 xmm2 xmm3 xmm21 xmm27 ymm4 ymm11 ymm12 ymm13 ymm14 zmm5 zmm7 zmm9 zmm10 zmm15 ..B2.9: # Preds ..B2.8 # Execution count [2.25e+00] - vmovaps %zmm15, %zmm8 #130.22 - vmovaps %zmm8, %zmm6 #131.22 - vmovaps %zmm6, %zmm3 #132.22 - cmpl $8, %r13d #143.9 - jb ..B2.35 # Prob 10% #143.9 - # LOE rax rdx rcx rbx rdi r8 r9 r10 r11 r12d r13d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 + vmovaps %zmm15, %zmm8 #132.22 + vmovaps %zmm8, %zmm6 #133.22 + vmovaps %zmm6, %zmm3 #134.22 + cmpl $8, %r11d #144.9 + jb ..B2.34 # Prob 10% #144.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 ..B2.10: # Preds ..B2.9 # Execution count [2.25e+00] - cmpl $1200, %r13d #143.9 - jb ..B2.34 # Prob 10% #143.9 - # LOE rax rdx rcx rbx rdi r8 r9 r10 r11 r12d r13d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 + cmpl $1200, %r11d #144.9 + jb ..B2.33 # Prob 10% #144.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 ..B2.11: # Preds ..B2.10 # Execution count [2.25e+00] - movq %r8, %rsi #125.43 - imulq %rdi, %rsi #125.43 - addq %r10, %rsi #107.5 - movq %rsi, %r14 #143.9 - andq $63, %r14 #143.9 - testl $3, %r14d #143.9 - je ..B2.13 # Prob 50% #143.9 - # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12d r13d r14d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 + movq %rdi, %r15 #127.43 + imulq %r10, %r15 #127.43 + addq %r8, %r15 #106.5 + movq %r15, %r12 #144.9 + andq $63, %r12 #144.9 + testl $3, %r12d #144.9 + je ..B2.13 # Prob 50% #144.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r15 r11d r12d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 ..B2.12: # Preds ..B2.11 # Execution count [1.12e+00] - movl %r13d, %r15d #143.9 - xorl %r14d, %r14d #143.9 - andl $7, %r15d #143.9 - negl %r15d #143.9 - addl %r13d, %r15d #143.9 - jmp ..B2.19 # Prob 100% #143.9 - # LOE rax rdx rcx rbx rdi r8 r9 r10 r11 r12d r13d r14d r15d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 + movl %r11d, %r14d #144.9 + xorl %r12d, %r12d #144.9 + andl $7, %r14d #144.9 + negl %r14d #144.9 + addl %r11d, %r14d #144.9 + jmp ..B2.19 # Prob 100% #144.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11d r12d r14d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 ..B2.13: # Preds ..B2.11 # Execution count [1.12e+00] - testl %r14d, %r14d #143.9 - je ..B2.18 # Prob 50% #143.9 - # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12d r13d r14d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 + testl %r12d, %r12d #144.9 + je ..B2.18 # Prob 50% #144.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r15 r11d r12d xmm0 xmm1 xmm2 ymm4 
ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 ..B2.14: # Preds ..B2.13 # Execution count [1.25e+01] - negl %r14d #143.9 - movl %r13d, %r15d #143.9 - addl $64, %r14d #143.9 - shrl $2, %r14d #143.9 - cmpl %r14d, %r13d #143.9 - cmovb %r13d, %r14d #143.9 - subl %r14d, %r15d #143.9 - andl $7, %r15d #143.9 - negl %r15d #143.9 - addl %r13d, %r15d #143.9 - cmpl $1, %r14d #143.9 - jb ..B2.19 # Prob 50% #143.9 - # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12d r13d r14d r15d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 + negl %r12d #144.9 + movl %r11d, %r14d #144.9 + addl $64, %r12d #144.9 + shrl $2, %r12d #144.9 + cmpl %r12d, %r11d #144.9 + cmovb %r11d, %r12d #144.9 + subl %r12d, %r14d #144.9 + andl $7, %r14d #144.9 + negl %r14d #144.9 + addl %r11d, %r14d #144.9 + cmpl $1, %r12d #144.9 + jb ..B2.19 # Prob 50% #144.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r15 r11d r12d r14d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 ..B2.15: # Preds ..B2.14 # Execution count [2.25e+00] - vpbroadcastd %r14d, %ymm28 #143.9 - vbroadcastsd %xmm1, %zmm27 #127.23 - vbroadcastsd %xmm0, %zmm26 #128.23 - vbroadcastsd %xmm2, %zmm25 #129.23 - movslq %r14d, %r14 #143.9 - movq $0, 40(%rsp) #143.9[spill] - movq %r9, 24(%rsp) #143.9[spill] - movq %rdi, 32(%rsp) #143.9[spill] - vmovdqa32 %ymm14, %ymm29 #143.9 - movq %r14, %rdi #143.9 - movq 40(%rsp), %r9 #143.9[spill] - # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12d r13d r14d r15d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 ymm28 ymm29 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 zmm25 zmm26 zmm27 + vpbroadcastd %r12d, %ymm28 #144.9 + xorl %r13d, %r13d #144.9 + vbroadcastsd %xmm1, %zmm27 #129.23 + vbroadcastsd %xmm0, %zmm26 #130.23 + vbroadcastsd %xmm2, %zmm25 #131.23 + movslq %r12d, %r12 #144.9 + movq %r10, 16(%rsp) #144.9[spill] + vmovdqa32 %ymm14, %ymm29 #144.9 + movq %r12, %r10 #144.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r13 r15 r11d r12d r14d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 ymm28 ymm29 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 zmm25 zmm26 zmm27 ..B2.16: # Preds ..B2.16 ..B2.15 # Execution count [1.25e+01] - vpcmpud $1, %ymm28, %ymm29, %k4 #143.9 - vpaddd %ymm13, %ymm29, %ymm29 #143.9 - vmovdqu32 (%rsi,%r9,4), %ymm21{%k4}{z} #144.21 - vpaddd %ymm21, %ymm21, %ymm30 #145.36 - addq $8, %r9 #143.9 - vpcmpgtd %ymm21, %ymm4, %k6 #167.24 - vpaddd %ymm30, %ymm21, %ymm24 #145.36 - kmovw %k4, %k2 #145.36 - kmovw %k4, %k3 #145.36 - kmovw %k4, %k1 #145.36 - vpxord %zmm16, %zmm16, %zmm16 #145.36 - vpxord %zmm31, %zmm31, %zmm31 #145.36 - vpxord %zmm20, %zmm20, %zmm20 #145.36 - vpaddd %ymm12, %ymm24, %ymm17 #146.36 - vgatherdpd 8(%r11,%ymm24,8), %zmm16{%k2} #145.36 - vgatherdpd (%r11,%ymm24,8), %zmm31{%k3} #145.36 - vgatherdpd 16(%r11,%ymm24,8), %zmm20{%k1} #145.36 - vsubpd %zmm16, %zmm26, %zmm22 #146.36 - vsubpd %zmm31, %zmm27, %zmm23 #145.36 - vsubpd %zmm20, %zmm25, %zmm20 #147.36 - vmulpd %zmm22, %zmm22, %zmm18 #148.49 - vpaddd %ymm11, %ymm24, %ymm16 #147.36 - vfmadd231pd %zmm23, %zmm23, %zmm18 #148.49 - vfmadd231pd %zmm20, %zmm20, %zmm18 #148.63 - vrcp14pd %zmm18, %zmm19 #159.38 - vcmppd $1, %zmm10, %zmm18, %k7{%k4} #158.22 - vfpclasspd $30, %zmm19, %k0 #159.38 - vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm19, %zmm18 #159.38 - knotw %k0, %k5 #159.38 - kandw %k6, %k7, %k6 #167.24 - vmulpd %zmm18, %zmm18, %zmm21 #159.38 - vfmadd213pd %zmm19, %zmm18, %zmm19{%k5} #159.38 - vfmadd213pd %zmm19, %zmm21, %zmm19{%k5} #159.38 - vmulpd %zmm9, %zmm19, %zmm30 #160.38 - 
vmulpd %zmm30, %zmm19, %zmm18 #160.44 - vmulpd %zmm18, %zmm19, %zmm31 #160.50 - vfmsub213pd %zmm5, %zmm19, %zmm18 #161.54 - vmulpd %zmm7, %zmm19, %zmm19 #161.54 - vmulpd %zmm19, %zmm31, %zmm19 #161.61 - vmulpd %zmm18, %zmm19, %zmm21 #161.67 - vmovaps %zmm15, %zmm18 #168.21 - kmovw %k6, %k1 #168.21 - vfmadd231pd %zmm23, %zmm21, %zmm8{%k7} #162.17 - vfmadd231pd %zmm22, %zmm21, %zmm6{%k7} #163.17 - vfmadd231pd %zmm20, %zmm21, %zmm3{%k7} #164.17 - .byte 144 #168.21 - vgatherdpd (%rdx,%ymm24,8), %zmm18{%k1} #168.21 - vfnmadd213pd %zmm18, %zmm21, %zmm23 #168.21 - kmovw %k6, %k2 #168.21 - vscatterdpd %zmm23, (%rdx,%ymm24,8){%k2} #168.21 - vmovaps %zmm15, %zmm23 #169.21 - kmovw %k6, %k3 #169.21 - kmovw %k6, %k4 #169.21 - kmovw %k6, %k5 #170.21 - vgatherdpd (%rdx,%ymm17,8), %zmm23{%k3} #169.21 - vfnmadd213pd %zmm23, %zmm21, %zmm22 #169.21 - vscatterdpd %zmm22, (%rdx,%ymm17,8){%k4} #169.21 - vmovaps %zmm15, %zmm17 #170.21 - vgatherdpd (%rdx,%ymm16,8), %zmm17{%k5} #170.21 - vfnmadd213pd %zmm17, %zmm21, %zmm20 #170.21 - vscatterdpd %zmm20, (%rdx,%ymm16,8){%k6} #170.21 - cmpq %rdi, %r9 #143.9 - jb ..B2.16 # Prob 82% #143.9 - # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12d r13d r14d r15d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 ymm28 ymm29 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 zmm25 zmm26 zmm27 + vpcmpud $1, %ymm28, %ymm29, %k4 #144.9 + vpaddd %ymm13, %ymm29, %ymm29 #144.9 + vmovdqu32 (%r15,%r13,4), %ymm21{%k4}{z} #145.21 + vpaddd %ymm21, %ymm21, %ymm30 #146.36 + addq $8, %r13 #144.9 + vpcmpgtd %ymm21, %ymm4, %k6 #168.24 + vpaddd %ymm30, %ymm21, %ymm24 #146.36 + kmovw %k4, %k2 #146.36 + kmovw %k4, %k3 #146.36 + kmovw %k4, %k1 #146.36 + vpxord %zmm16, %zmm16, %zmm16 #146.36 + vpxord %zmm31, %zmm31, %zmm31 #146.36 + vpxord %zmm20, %zmm20, %zmm20 #146.36 + vpaddd %ymm12, %ymm24, %ymm17 #147.36 + vgatherdpd 8(%rsi,%ymm24,8), %zmm16{%k2} #146.36 + vgatherdpd (%rsi,%ymm24,8), %zmm31{%k3} #146.36 + vgatherdpd 16(%rsi,%ymm24,8), %zmm20{%k1} #146.36 + vsubpd %zmm16, %zmm26, %zmm22 #147.36 + vsubpd %zmm31, %zmm27, %zmm23 #146.36 + vsubpd %zmm20, %zmm25, %zmm20 #148.36 + vmulpd %zmm22, %zmm22, %zmm18 #149.49 + vpaddd %ymm11, %ymm24, %ymm16 #148.36 + vfmadd231pd %zmm23, %zmm23, %zmm18 #149.49 + vfmadd231pd %zmm20, %zmm20, %zmm18 #149.63 + vrcp14pd %zmm18, %zmm19 #160.38 + vcmppd $1, %zmm10, %zmm18, %k7{%k4} #159.22 + vfpclasspd $30, %zmm19, %k0 #160.38 + vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm19, %zmm18 #160.38 + knotw %k0, %k5 #160.38 + kandw %k6, %k7, %k6 #168.24 + vmulpd %zmm18, %zmm18, %zmm21 #160.38 + vfmadd213pd %zmm19, %zmm18, %zmm19{%k5} #160.38 + vfmadd213pd %zmm19, %zmm21, %zmm19{%k5} #160.38 + vmulpd %zmm9, %zmm19, %zmm30 #161.38 + vmulpd %zmm30, %zmm19, %zmm18 #161.44 + vmulpd %zmm18, %zmm19, %zmm31 #161.50 + vfmsub213pd %zmm5, %zmm19, %zmm18 #162.54 + vmulpd %zmm7, %zmm19, %zmm19 #162.54 + vmulpd %zmm19, %zmm31, %zmm19 #162.61 + vmulpd %zmm18, %zmm19, %zmm21 #162.67 + vmovaps %zmm15, %zmm18 #169.21 + kmovw %k6, %k1 #169.21 + vfmadd231pd %zmm23, %zmm21, %zmm8{%k7} #163.17 + vfmadd231pd %zmm22, %zmm21, %zmm6{%k7} #164.17 + vfmadd231pd %zmm20, %zmm21, %zmm3{%k7} #165.17 + .byte 144 #169.21 + vgatherdpd (%rdx,%ymm24,8), %zmm18{%k1} #169.21 + vfnmadd213pd %zmm18, %zmm21, %zmm23 #169.21 + kmovw %k6, %k2 #169.21 + vscatterdpd %zmm23, (%rdx,%ymm24,8){%k2} #169.21 + vmovaps %zmm15, %zmm23 #170.21 + kmovw %k6, %k3 #170.21 + kmovw %k6, %k4 #170.21 + kmovw %k6, %k5 #171.21 + vgatherdpd (%rdx,%ymm17,8), %zmm23{%k3} #170.21 + vfnmadd213pd %zmm23, %zmm21, %zmm22 #170.21 + vscatterdpd %zmm22, 
(%rdx,%ymm17,8){%k4} #170.21 + vmovaps %zmm15, %zmm17 #171.21 + vgatherdpd (%rdx,%ymm16,8), %zmm17{%k5} #171.21 + vfnmadd213pd %zmm17, %zmm21, %zmm20 #171.21 + vscatterdpd %zmm20, (%rdx,%ymm16,8){%k6} #171.21 + cmpq %r10, %r13 #144.9 + jb ..B2.16 # Prob 82% #144.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r13 r15 r11d r12d r14d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 ymm28 ymm29 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 zmm25 zmm26 zmm27 ..B2.17: # Preds ..B2.16 # Execution count [2.25e+00] - movq 24(%rsp), %r9 #[spill] - movq 32(%rsp), %rdi #[spill] - cmpl %r14d, %r13d #143.9 - je ..B2.25 # Prob 10% #143.9 - jmp ..B2.19 # Prob 100% #143.9 - # LOE rax rdx rcx rbx rdi r8 r9 r10 r11 r12d r13d r14d r15d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 + movq 16(%rsp), %r10 #[spill] + cmpl %r12d, %r11d #144.9 + je ..B2.25 # Prob 10% #144.9 + jmp ..B2.19 # Prob 100% #144.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11d r12d r14d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 ..B2.18: # Preds ..B2.13 # Execution count [5.62e-01] - movl %r13d, %r15d #143.9 - andl $7, %r15d #143.9 - negl %r15d #143.9 - addl %r13d, %r15d #143.9 - # LOE rax rdx rcx rbx rdi r8 r9 r10 r11 r12d r13d r14d r15d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 -..B2.19: # Preds ..B2.12 ..B2.18 ..B2.17 ..B2.14 ..B2.34 + movl %r11d, %r14d #144.9 + andl $7, %r14d #144.9 + negl %r14d #144.9 + addl %r11d, %r14d #144.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11d r12d r14d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 +..B2.19: # Preds ..B2.12 ..B2.18 ..B2.17 ..B2.14 ..B2.33 # # Execution count [1.25e+01] - lea 8(%r14), %esi #143.9 - cmpl %esi, %r15d #143.9 - jb ..B2.23 # Prob 50% #143.9 - # LOE rax rdx rcx rbx rdi r8 r9 r10 r11 r12d r13d r14d r15d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 + lea 8(%r12), %r13d #144.9 + cmpl %r13d, %r14d #144.9 + jb ..B2.23 # Prob 50% #144.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11d r12d r14d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 ..B2.20: # Preds ..B2.19 # Execution count [2.25e+00] - movq %r8, %rsi #125.43 - imulq %rdi, %rsi #125.43 - vbroadcastsd %xmm1, %zmm26 #127.23 - vbroadcastsd %xmm0, %zmm25 #128.23 - vbroadcastsd %xmm2, %zmm23 #129.23 - movslq %r14d, %r14 #143.9 - addq %r10, %rsi #107.5 - movq %rdi, 32(%rsp) #107.5[spill] - movq %r14, %rdi #107.5 - # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12d r13d r14d r15d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 zmm23 zmm25 zmm26 + movq %rdi, %r13 #127.43 + imulq %r10, %r13 #127.43 + vbroadcastsd %xmm1, %zmm26 #129.23 + vbroadcastsd %xmm0, %zmm25 #130.23 + vbroadcastsd %xmm2, %zmm23 #131.23 + movslq %r12d, %r15 #144.9 + addq %r8, %r13 #106.5 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r13 r15 r11d r12d r14d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 zmm23 zmm25 zmm26 ..B2.21: # Preds ..B2.21 ..B2.20 # Execution count [1.25e+01] - vmovdqu32 (%rsi,%rdi,4), %ymm24 #144.21 - addl $8, %r14d #143.9 - vpcmpeqb %xmm0, %xmm0, %k2 #145.36 - vpcmpeqb %xmm0, %xmm0, %k3 #145.36 - vpcmpeqb %xmm0, %xmm0, %k1 #145.36 - vpcmpgtd %ymm24, %ymm4, %k6 #167.24 - vpaddd %ymm24, %ymm24, %ymm27 #145.36 - vpaddd %ymm27, %ymm24, %ymm20 #145.36 - addq $8, %rdi #143.9 - vpxord %zmm29, %zmm29, %zmm29 #145.36 - vpxord %zmm28, %zmm28, %zmm28 
#145.36 - vpxord %zmm30, %zmm30, %zmm30 #145.36 - vpaddd %ymm20, %ymm12, %ymm21 #146.36 - vpaddd %ymm20, %ymm11, %ymm18 #147.36 - vgatherdpd 8(%r11,%ymm20,8), %zmm29{%k2} #145.36 - vgatherdpd (%r11,%ymm20,8), %zmm28{%k3} #145.36 - vgatherdpd 16(%r11,%ymm20,8), %zmm30{%k1} #145.36 - vsubpd %zmm29, %zmm25, %zmm19 #146.36 - vsubpd %zmm28, %zmm26, %zmm22 #145.36 - vsubpd %zmm30, %zmm23, %zmm17 #147.36 - vmulpd %zmm19, %zmm19, %zmm31 #148.49 - vfmadd231pd %zmm22, %zmm22, %zmm31 #148.49 - vfmadd231pd %zmm17, %zmm17, %zmm31 #148.63 - vrcp14pd %zmm31, %zmm16 #159.38 - vcmppd $1, %zmm10, %zmm31, %k5 #158.22 - vfpclasspd $30, %zmm16, %k0 #159.38 - vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm16, %zmm31 #159.38 - knotw %k0, %k4 #159.38 - vmulpd %zmm31, %zmm31, %zmm27 #159.38 - vfmadd213pd %zmm16, %zmm31, %zmm16{%k4} #159.38 - vfmadd213pd %zmm16, %zmm27, %zmm16{%k4} #159.38 - vmulpd %zmm9, %zmm16, %zmm28 #160.38 - vmulpd %zmm7, %zmm16, %zmm24 #161.54 - vmulpd %zmm28, %zmm16, %zmm30 #160.44 - vmulpd %zmm30, %zmm16, %zmm29 #160.50 - vfmsub213pd %zmm5, %zmm30, %zmm16 #161.54 - vmulpd %zmm24, %zmm29, %zmm31 #161.61 - vmulpd %zmm16, %zmm31, %zmm24 #161.67 - vfmadd231pd %zmm22, %zmm24, %zmm8{%k5} #162.17 - vfmadd231pd %zmm19, %zmm24, %zmm6{%k5} #163.17 - vfmadd231pd %zmm17, %zmm24, %zmm3{%k5} #164.17 - kandw %k6, %k5, %k5 #167.24 - vmovaps %zmm15, %zmm16 #168.21 - kmovw %k5, %k7 #168.21 - kmovw %k5, %k1 #168.21 - kmovw %k5, %k2 #169.21 - kmovw %k5, %k3 #169.21 - kmovw %k5, %k4 #170.21 - vgatherdpd (%rdx,%ymm20,8), %zmm16{%k7} #168.21 - vfnmadd213pd %zmm16, %zmm24, %zmm22 #168.21 - vscatterdpd %zmm22, (%rdx,%ymm20,8){%k1} #168.21 - vmovaps %zmm15, %zmm20 #169.21 - vgatherdpd (%rdx,%ymm21,8), %zmm20{%k2} #169.21 - vfnmadd213pd %zmm20, %zmm24, %zmm19 #169.21 - vscatterdpd %zmm19, (%rdx,%ymm21,8){%k3} #169.21 - vmovaps %zmm15, %zmm19 #170.21 - vgatherdpd (%rdx,%ymm18,8), %zmm19{%k4} #170.21 - vfnmadd213pd %zmm19, %zmm24, %zmm17 #170.21 - vscatterdpd %zmm17, (%rdx,%ymm18,8){%k5} #170.21 - cmpl %r15d, %r14d #143.9 - jb ..B2.21 # Prob 82% #143.9 - # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12d r13d r14d r15d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 zmm23 zmm25 zmm26 -..B2.22: # Preds ..B2.21 - # Execution count [2.25e+00] - movq 32(%rsp), %rdi #[spill] - # LOE rax rdx rcx rbx rdi r8 r9 r10 r11 r12d r13d r15d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 -..B2.23: # Preds ..B2.22 ..B2.19 ..B2.35 + vmovdqu32 (%r13,%r15,4), %ymm24 #145.21 + addl $8, %r12d #144.9 + vpcmpeqb %xmm0, %xmm0, %k2 #146.36 + vpcmpeqb %xmm0, %xmm0, %k3 #146.36 + vpcmpeqb %xmm0, %xmm0, %k1 #146.36 + vpcmpgtd %ymm24, %ymm4, %k6 #168.24 + vpaddd %ymm24, %ymm24, %ymm27 #146.36 + vpaddd %ymm27, %ymm24, %ymm20 #146.36 + addq $8, %r15 #144.9 + vpxord %zmm29, %zmm29, %zmm29 #146.36 + vpxord %zmm28, %zmm28, %zmm28 #146.36 + vpxord %zmm30, %zmm30, %zmm30 #146.36 + vpaddd %ymm20, %ymm12, %ymm21 #147.36 + vpaddd %ymm20, %ymm11, %ymm18 #148.36 + vgatherdpd 8(%rsi,%ymm20,8), %zmm29{%k2} #146.36 + vgatherdpd (%rsi,%ymm20,8), %zmm28{%k3} #146.36 + vgatherdpd 16(%rsi,%ymm20,8), %zmm30{%k1} #146.36 + vsubpd %zmm29, %zmm25, %zmm19 #147.36 + vsubpd %zmm28, %zmm26, %zmm22 #146.36 + vsubpd %zmm30, %zmm23, %zmm17 #148.36 + vmulpd %zmm19, %zmm19, %zmm31 #149.49 + vfmadd231pd %zmm22, %zmm22, %zmm31 #149.49 + vfmadd231pd %zmm17, %zmm17, %zmm31 #149.63 + vrcp14pd %zmm31, %zmm16 #160.38 + vcmppd $1, %zmm10, %zmm31, %k5 #159.22 + vfpclasspd $30, %zmm16, %k0 #160.38 + vfnmadd213pd 
.L_2il0floatpacket.5(%rip){1to8}, %zmm16, %zmm31 #160.38 + knotw %k0, %k4 #160.38 + vmulpd %zmm31, %zmm31, %zmm27 #160.38 + vfmadd213pd %zmm16, %zmm31, %zmm16{%k4} #160.38 + vfmadd213pd %zmm16, %zmm27, %zmm16{%k4} #160.38 + vmulpd %zmm9, %zmm16, %zmm28 #161.38 + vmulpd %zmm7, %zmm16, %zmm24 #162.54 + vmulpd %zmm28, %zmm16, %zmm30 #161.44 + vmulpd %zmm30, %zmm16, %zmm29 #161.50 + vfmsub213pd %zmm5, %zmm30, %zmm16 #162.54 + vmulpd %zmm24, %zmm29, %zmm31 #162.61 + vmulpd %zmm16, %zmm31, %zmm24 #162.67 + vfmadd231pd %zmm22, %zmm24, %zmm8{%k5} #163.17 + vfmadd231pd %zmm19, %zmm24, %zmm6{%k5} #164.17 + vfmadd231pd %zmm17, %zmm24, %zmm3{%k5} #165.17 + kandw %k6, %k5, %k5 #168.24 + vmovaps %zmm15, %zmm16 #169.21 + kmovw %k5, %k7 #169.21 + kmovw %k5, %k1 #169.21 + kmovw %k5, %k2 #170.21 + kmovw %k5, %k3 #170.21 + kmovw %k5, %k4 #171.21 + vgatherdpd (%rdx,%ymm20,8), %zmm16{%k7} #169.21 + vfnmadd213pd %zmm16, %zmm24, %zmm22 #169.21 + vscatterdpd %zmm22, (%rdx,%ymm20,8){%k1} #169.21 + vmovaps %zmm15, %zmm20 #170.21 + vgatherdpd (%rdx,%ymm21,8), %zmm20{%k2} #170.21 + vfnmadd213pd %zmm20, %zmm24, %zmm19 #170.21 + vscatterdpd %zmm19, (%rdx,%ymm21,8){%k3} #170.21 + vmovaps %zmm15, %zmm19 #171.21 + vgatherdpd (%rdx,%ymm18,8), %zmm19{%k4} #171.21 + vfnmadd213pd %zmm19, %zmm24, %zmm17 #171.21 + vscatterdpd %zmm17, (%rdx,%ymm18,8){%k5} #171.21 + cmpl %r14d, %r12d #144.9 + jb ..B2.21 # Prob 82% #144.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r13 r15 r11d r12d r14d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 zmm23 zmm25 zmm26 +..B2.23: # Preds ..B2.21 ..B2.19 ..B2.34 # Execution count [2.50e+00] - lea 1(%r15), %r14d #143.9 - cmpl %r13d, %r14d #143.9 - ja ..B2.25 # Prob 50% #143.9 - # LOE rax rdx rcx rbx rdi r8 r9 r10 r11 r12d r13d r15d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 + lea 1(%r14), %r12d #144.9 + cmpl %r11d, %r12d #144.9 + ja ..B2.25 # Prob 50% #144.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11d r14d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 ..B2.24: # Preds ..B2.23 # Execution count [1.25e+01] - imulq %r8, %rdi #125.43 - vbroadcastsd %xmm0, %zmm24 #128.23 - vbroadcastsd %xmm1, %zmm22 #127.23 - vbroadcastsd %xmm2, %zmm26 #129.23 - movl %r13d, %r14d #143.9 - addq %r10, %rdi #107.5 - subl %r15d, %r14d #143.9 - vpbroadcastd %r14d, %ymm20 #143.9 - vpcmpud $1, %ymm20, %ymm14, %k5 #143.9 - movslq %r15d, %r15 #143.9 - kmovw %k5, %k2 #145.36 - kmovw %k5, %k3 #145.36 - kmovw %k5, %k1 #145.36 - vmovdqu32 (%rdi,%r15,4), %ymm19{%k5}{z} #144.21 - vpaddd %ymm19, %ymm19, %ymm21 #145.36 - vpcmpgtd %ymm19, %ymm4, %k7 #167.24 - vpaddd %ymm21, %ymm19, %ymm18 #145.36 - vmovaps %zmm15, %zmm19 #168.21 - vpxord %zmm25, %zmm25, %zmm25 #145.36 - vpxord %zmm23, %zmm23, %zmm23 #145.36 - vpxord %zmm27, %zmm27, %zmm27 #145.36 - vpaddd %ymm18, %ymm12, %ymm16 #146.36 - vpaddd %ymm18, %ymm11, %ymm0 #147.36 - vgatherdpd 8(%r11,%ymm18,8), %zmm25{%k2} #145.36 - vgatherdpd (%r11,%ymm18,8), %zmm23{%k3} #145.36 - vgatherdpd 16(%r11,%ymm18,8), %zmm27{%k1} #145.36 - vsubpd %zmm25, %zmm24, %zmm1 #146.36 - vsubpd %zmm23, %zmm22, %zmm17 #145.36 - vsubpd %zmm27, %zmm26, %zmm2 #147.36 - vmulpd %zmm1, %zmm1, %zmm21 #148.49 - vfmadd231pd %zmm17, %zmm17, %zmm21 #148.49 - vfmadd231pd %zmm2, %zmm2, %zmm21 #148.63 - vrcp14pd %zmm21, %zmm20 #159.38 - vcmppd $1, %zmm10, %zmm21, %k6{%k5} #158.22 - vfpclasspd $30, %zmm20, %k0 #159.38 - vmovaps %zmm21, %zmm28 #159.38 - vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm20, 
%zmm28 #159.38 - knotw %k0, %k4 #159.38 - vmulpd %zmm28, %zmm28, %zmm29 #159.38 - vfmadd213pd %zmm20, %zmm28, %zmm20{%k4} #159.38 - vfmadd213pd %zmm20, %zmm29, %zmm20{%k4} #159.38 - vmulpd %zmm9, %zmm20, %zmm30 #160.38 - vmulpd %zmm7, %zmm20, %zmm28 #161.54 - vmulpd %zmm30, %zmm20, %zmm29 #160.44 - vmulpd %zmm29, %zmm20, %zmm31 #160.50 - vfmsub213pd %zmm5, %zmm29, %zmm20 #161.54 - vmulpd %zmm28, %zmm31, %zmm30 #161.61 - vmulpd %zmm20, %zmm30, %zmm22 #161.67 - vfmadd231pd %zmm17, %zmm22, %zmm8{%k6} #162.17 - vfmadd231pd %zmm1, %zmm22, %zmm6{%k6} #163.17 - vfmadd231pd %zmm2, %zmm22, %zmm3{%k6} #164.17 - kandw %k7, %k6, %k6 #167.24 - kmovw %k6, %k1 #168.21 - kmovw %k6, %k2 #168.21 - kmovw %k6, %k3 #169.21 - kmovw %k6, %k4 #169.21 - kmovw %k6, %k5 #170.21 - vgatherdpd (%rdx,%ymm18,8), %zmm19{%k1} #168.21 - vfnmadd213pd %zmm19, %zmm22, %zmm17 #168.21 - vscatterdpd %zmm17, (%rdx,%ymm18,8){%k2} #168.21 - vmovaps %zmm15, %zmm17 #169.21 - vgatherdpd (%rdx,%ymm16,8), %zmm17{%k3} #169.21 - vfnmadd213pd %zmm17, %zmm22, %zmm1 #169.21 - vscatterdpd %zmm1, (%rdx,%ymm16,8){%k4} #169.21 - vmovaps %zmm15, %zmm1 #170.21 - vgatherdpd (%rdx,%ymm0,8), %zmm1{%k5} #170.21 - vfnmadd213pd %zmm1, %zmm22, %zmm2 #170.21 - vscatterdpd %zmm2, (%rdx,%ymm0,8){%k6} #170.21 - # LOE rax rdx rcx rbx r8 r9 r10 r11 r12d r13d ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 + imulq %rdi, %r10 #127.43 + vbroadcastsd %xmm0, %zmm24 #130.23 + vbroadcastsd %xmm1, %zmm22 #129.23 + vbroadcastsd %xmm2, %zmm26 #131.23 + movl %r11d, %r12d #144.9 + addq %r8, %r10 #106.5 + subl %r14d, %r12d #144.9 + vpbroadcastd %r12d, %ymm20 #144.9 + vpcmpud $1, %ymm20, %ymm14, %k5 #144.9 + movslq %r14d, %r14 #144.9 + kmovw %k5, %k2 #146.36 + kmovw %k5, %k3 #146.36 + kmovw %k5, %k1 #146.36 + vmovdqu32 (%r10,%r14,4), %ymm19{%k5}{z} #145.21 + vpaddd %ymm19, %ymm19, %ymm21 #146.36 + vpcmpgtd %ymm19, %ymm4, %k7 #168.24 + vpaddd %ymm21, %ymm19, %ymm18 #146.36 + vmovaps %zmm15, %zmm19 #169.21 + vpxord %zmm25, %zmm25, %zmm25 #146.36 + vpxord %zmm23, %zmm23, %zmm23 #146.36 + vpxord %zmm27, %zmm27, %zmm27 #146.36 + vpaddd %ymm18, %ymm12, %ymm16 #147.36 + vpaddd %ymm18, %ymm11, %ymm0 #148.36 + vgatherdpd 8(%rsi,%ymm18,8), %zmm25{%k2} #146.36 + vgatherdpd (%rsi,%ymm18,8), %zmm23{%k3} #146.36 + vgatherdpd 16(%rsi,%ymm18,8), %zmm27{%k1} #146.36 + vsubpd %zmm25, %zmm24, %zmm1 #147.36 + vsubpd %zmm23, %zmm22, %zmm17 #146.36 + vsubpd %zmm27, %zmm26, %zmm2 #148.36 + vmulpd %zmm1, %zmm1, %zmm21 #149.49 + vfmadd231pd %zmm17, %zmm17, %zmm21 #149.49 + vfmadd231pd %zmm2, %zmm2, %zmm21 #149.63 + vrcp14pd %zmm21, %zmm20 #160.38 + vcmppd $1, %zmm10, %zmm21, %k6{%k5} #159.22 + vfpclasspd $30, %zmm20, %k0 #160.38 + vmovaps %zmm21, %zmm28 #160.38 + vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm20, %zmm28 #160.38 + knotw %k0, %k4 #160.38 + vmulpd %zmm28, %zmm28, %zmm29 #160.38 + vfmadd213pd %zmm20, %zmm28, %zmm20{%k4} #160.38 + vfmadd213pd %zmm20, %zmm29, %zmm20{%k4} #160.38 + vmulpd %zmm9, %zmm20, %zmm30 #161.38 + vmulpd %zmm7, %zmm20, %zmm28 #162.54 + vmulpd %zmm30, %zmm20, %zmm29 #161.44 + vmulpd %zmm29, %zmm20, %zmm31 #161.50 + vfmsub213pd %zmm5, %zmm29, %zmm20 #162.54 + vmulpd %zmm28, %zmm31, %zmm30 #162.61 + vmulpd %zmm20, %zmm30, %zmm22 #162.67 + vfmadd231pd %zmm17, %zmm22, %zmm8{%k6} #163.17 + vfmadd231pd %zmm1, %zmm22, %zmm6{%k6} #164.17 + vfmadd231pd %zmm2, %zmm22, %zmm3{%k6} #165.17 + kandw %k7, %k6, %k6 #168.24 + kmovw %k6, %k1 #169.21 + kmovw %k6, %k2 #169.21 + kmovw %k6, %k3 #170.21 + kmovw %k6, %k4 #170.21 + kmovw %k6, %k5 #171.21 + vgatherdpd 
(%rdx,%ymm18,8), %zmm19{%k1} #169.21 + vfnmadd213pd %zmm19, %zmm22, %zmm17 #169.21 + vscatterdpd %zmm17, (%rdx,%ymm18,8){%k2} #169.21 + vmovaps %zmm15, %zmm17 #170.21 + vgatherdpd (%rdx,%ymm16,8), %zmm17{%k3} #170.21 + vfnmadd213pd %zmm17, %zmm22, %zmm1 #170.21 + vscatterdpd %zmm1, (%rdx,%ymm16,8){%k4} #170.21 + vmovaps %zmm15, %zmm1 #171.21 + vgatherdpd (%rdx,%ymm0,8), %zmm1{%k5} #171.21 + vfnmadd213pd %zmm1, %zmm22, %zmm2 #171.21 + vscatterdpd %zmm2, (%rdx,%ymm0,8){%k6} #171.21 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r11d ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 ..B2.25: # Preds ..B2.17 ..B2.24 ..B2.23 # Execution count [2.25e+00] - vmovups .L_2il0floatpacket.6(%rip), %zmm22 #132.22 - vpermd %zmm3, %zmm22, %zmm0 #132.22 - vpermd %zmm6, %zmm22, %zmm17 #131.22 - vpermd %zmm8, %zmm22, %zmm23 #130.22 - vaddpd %zmm3, %zmm0, %zmm3 #132.22 - vaddpd %zmm6, %zmm17, %zmm6 #131.22 - vaddpd %zmm8, %zmm23, %zmm8 #130.22 - vpermpd $78, %zmm3, %zmm1 #132.22 - vpermpd $78, %zmm6, %zmm18 #131.22 - vpermpd $78, %zmm8, %zmm24 #130.22 - vaddpd %zmm1, %zmm3, %zmm2 #132.22 - vaddpd %zmm18, %zmm6, %zmm19 #131.22 - vaddpd %zmm24, %zmm8, %zmm25 #130.22 - vpermpd $177, %zmm2, %zmm16 #132.22 - vpermpd $177, %zmm19, %zmm20 #131.22 - vpermpd $177, %zmm25, %zmm26 #130.22 - vaddpd %zmm16, %zmm2, %zmm3 #132.22 - vaddpd %zmm20, %zmm19, %zmm21 #131.22 - vaddpd %zmm26, %zmm25, %zmm27 #130.22 - # LOE rax rdx rcx rbx r8 r9 r10 r11 r12d r13d xmm3 xmm21 xmm27 ymm4 ymm11 ymm12 ymm13 ymm14 zmm5 zmm7 zmm9 zmm10 zmm15 + vmovups .L_2il0floatpacket.6(%rip), %zmm22 #134.22 + vpermd %zmm3, %zmm22, %zmm0 #134.22 + vpermd %zmm6, %zmm22, %zmm17 #133.22 + vpermd %zmm8, %zmm22, %zmm23 #132.22 + vaddpd %zmm3, %zmm0, %zmm3 #134.22 + vaddpd %zmm6, %zmm17, %zmm6 #133.22 + vaddpd %zmm8, %zmm23, %zmm8 #132.22 + vpermpd $78, %zmm3, %zmm1 #134.22 + vpermpd $78, %zmm6, %zmm18 #133.22 + vpermpd $78, %zmm8, %zmm24 #132.22 + vaddpd %zmm1, %zmm3, %zmm2 #134.22 + vaddpd %zmm18, %zmm6, %zmm19 #133.22 + vaddpd %zmm24, %zmm8, %zmm25 #132.22 + vpermpd $177, %zmm2, %zmm16 #134.22 + vpermpd $177, %zmm19, %zmm20 #133.22 + vpermpd $177, %zmm25, %zmm26 #132.22 + vaddpd %zmm16, %zmm2, %zmm3 #134.22 + vaddpd %zmm20, %zmm19, %zmm21 #133.22 + vaddpd %zmm26, %zmm25, %zmm27 #132.22 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r11d xmm3 xmm21 xmm27 ymm4 ymm11 ymm12 ymm13 ymm14 zmm5 zmm7 zmm9 zmm10 zmm15 ..B2.26: # Preds ..B2.25 ..B2.8 ..B2.7 # Execution count [5.00e+00] - movslq %r13d, %r13 #179.9 - vaddsd (%rax,%rdx), %xmm27, %xmm0 #175.9 - vaddsd 8(%rax,%rdx), %xmm21, %xmm1 #176.9 - vaddsd 16(%rax,%rdx), %xmm3, %xmm2 #177.9 - vmovsd %xmm0, (%rax,%rdx) #175.9 - lea 7(%r13), %edi #180.9 - sarl $2, %edi #180.9 - addq %r13, %rcx #179.9 - shrl $29, %edi #180.9 - vmovsd %xmm1, 8(%rax,%rdx) #176.9 - vmovsd %xmm2, 16(%rax,%rdx) #177.9 - addq $24, %rax #124.5 - lea 7(%rdi,%r13), %r14d #180.9 - movslq %r9d, %rdi #124.32 - sarl $3, %r14d #180.9 - incq %r9 #124.5 - movslq %r14d, %r14 #180.9 - incq %rdi #124.32 - addq %r14, %rbx #180.9 - cmpq 64(%rsp), %r9 #124.5[spill] - jb ..B2.7 # Prob 82% #124.5 - # LOE rax rdx rcx rbx rdi r8 r9 r10 r11 r12d ymm4 ymm11 ymm12 ymm13 ymm14 zmm5 zmm7 zmm9 zmm10 zmm15 + movslq %r11d, %r11 #180.9 + vaddsd (%rax,%rdx), %xmm27, %xmm0 #176.9 + vaddsd 8(%rax,%rdx), %xmm21, %xmm1 #177.9 + vaddsd 16(%rax,%rdx), %xmm3, %xmm2 #178.9 + vmovsd %xmm0, (%rax,%rdx) #176.9 + lea 7(%r11), %r10d #181.9 + sarl $2, %r10d #181.9 + addq %r11, %rcx #180.9 + shrl $29, %r10d #181.9 + vmovsd %xmm1, 8(%rax,%rdx) #177.9 + vmovsd %xmm2, 16(%rax,%rdx) 
#178.9 + addq $24, %rax #126.5 + lea 7(%r10,%r11), %r12d #181.9 + movslq %r9d, %r10 #126.32 + sarl $3, %r12d #181.9 + incq %r9 #126.5 + movslq %r12d, %r12 #181.9 + incq %r10 #126.32 + addq %r12, %rbx #181.9 + cmpq 48(%rsp), %r9 #126.5[spill] + jb ..B2.7 # Prob 82% #126.5 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 ymm4 ymm11 ymm12 ymm13 ymm14 zmm5 zmm7 zmm9 zmm10 zmm15 ..B2.27: # Preds ..B2.26 # Execution count [9.00e-01] - movq (%rsp), %r13 #[spill] - movq %rcx, (%r13) #179.9 - movq %rbx, 8(%r13) #180.9 - jmp ..B2.30 # Prob 100% #180.9 - # LOE r12d + movq (%rsp), %r14 #[spill] + movq %rcx, (%r14) #180.9 + movq %rbx, 8(%r14) #181.9 + jmp ..B2.30 # Prob 100% #181.9 + # LOE ..B2.28: # Preds ..B2.1 # Execution count [5.00e-01] - xorl %r12d, %r12d #120.22 - xorl %eax, %eax #121.16 -..___tag_value_computeForceLJHalfNeigh.96: + xorl %eax, %eax #119.16 +..___tag_value_computeForceLJHalfNeigh.87: # getTimeStamp() - call getTimeStamp #121.16 -..___tag_value_computeForceLJHalfNeigh.97: - # LOE r12d xmm0 -..B2.46: # Preds ..B2.28 + call getTimeStamp #119.16 +..___tag_value_computeForceLJHalfNeigh.88: + # LOE xmm0 +..B2.45: # Preds ..B2.28 # Execution count [5.00e-01] - vmovsd %xmm0, 8(%rsp) #121.16[spill] - # LOE r12d -..B2.29: # Preds ..B2.46 + vmovsd %xmm0, 8(%rsp) #119.16[spill] + # LOE +..B2.29: # Preds ..B2.45 # Execution count [5.00e-01] - movl $.L_2__STRING.1, %edi #122.5 -..___tag_value_computeForceLJHalfNeigh.99: + movl $.L_2__STRING.1, %edi #123.5 +..___tag_value_computeForceLJHalfNeigh.90: # likwid_markerStartRegion(const char *) - call likwid_markerStartRegion #122.5 -..___tag_value_computeForceLJHalfNeigh.100: - # LOE r12d + call likwid_markerStartRegion #123.5 +..___tag_value_computeForceLJHalfNeigh.91: + # LOE ..B2.30: # Preds ..B2.27 ..B2.29 # Execution count [1.00e+00] - movl $.L_2__STRING.1, %edi #183.5 - vzeroupper #183.5 -..___tag_value_computeForceLJHalfNeigh.101: + movl $.L_2__STRING.1, %edi #184.5 + vzeroupper #184.5 +..___tag_value_computeForceLJHalfNeigh.92: # likwid_markerStopRegion(const char *) - call likwid_markerStopRegion #183.5 -..___tag_value_computeForceLJHalfNeigh.102: - # LOE r12d + call likwid_markerStopRegion #184.5 +..___tag_value_computeForceLJHalfNeigh.93: + # LOE ..B2.31: # Preds ..B2.30 # Execution count [1.00e+00] - xorl %eax, %eax #184.16 -..___tag_value_computeForceLJHalfNeigh.103: + xorl %eax, %eax #187.16 +..___tag_value_computeForceLJHalfNeigh.94: # getTimeStamp() - call getTimeStamp #184.16 -..___tag_value_computeForceLJHalfNeigh.104: - # LOE r12d xmm0 + call getTimeStamp #187.16 +..___tag_value_computeForceLJHalfNeigh.95: + # LOE xmm0 ..B2.32: # Preds ..B2.31 # Execution count [1.00e+00] - vxorpd %xmm4, %xmm4, %xmm4 #185.5 - movl $.L_2__STRING.2, %edi #185.5 - vmovsd .L_2il0floatpacket.9(%rip), %xmm3 #185.5 - movl %r12d, %esi #185.5 - movq 16(%rsp), %rax #185.74[spill] - vsubsd 8(%rsp), %xmm0, %xmm1 #185.94[spill] - vmovsd 264(%rax), %xmm7 #185.74 - movl $3, %eax #185.5 - vcvtusi2sdl %r12d, %xmm4, %xmm4 #185.5 - vdivsd %xmm4, %xmm3, %xmm5 #185.5 - vmulsd %xmm1, %xmm5, %xmm6 #185.5 - vmulsd %xmm7, %xmm6, %xmm2 #185.5 - vmovapd %xmm7, %xmm0 #185.5 - vmovsd %xmm1, (%rsp) #185.5[spill] -..___tag_value_computeForceLJHalfNeigh.107: -# printf(const char *__restrict__, ...) 
- call printf #185.5 -..___tag_value_computeForceLJHalfNeigh.108: - # LOE -..B2.33: # Preds ..B2.32 - # Execution count [1.00e+00] - vmovsd (%rsp), %xmm1 #[spill] - vmovapd %xmm1, %xmm0 #186.14 - addq $88, %rsp #186.14 + vsubsd 8(%rsp), %xmm0, %xmm0 #188.14[spill] + addq $88, %rsp #188.14 .cfi_restore 3 - popq %rbx #186.14 + popq %rbx #188.14 .cfi_restore 15 - popq %r15 #186.14 + popq %r15 #188.14 .cfi_restore 14 - popq %r14 #186.14 + popq %r14 #188.14 .cfi_restore 13 - popq %r13 #186.14 + popq %r13 #188.14 .cfi_restore 12 - popq %r12 #186.14 - movq %rbp, %rsp #186.14 - popq %rbp #186.14 + popq %r12 #188.14 + movq %rbp, %rsp #188.14 + popq %rbp #188.14 .cfi_def_cfa 7, 8 .cfi_restore 6 - ret #186.14 + ret #188.14 .cfi_def_cfa 6, 16 .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22 .cfi_offset 6, -16 @@ -1143,66 +1107,66 @@ computeForceLJHalfNeigh: .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 # LOE -..B2.34: # Preds ..B2.10 +..B2.33: # Preds ..B2.10 # Execution count [2.25e-01]: Infreq - movl %r13d, %r15d #143.9 - xorl %r14d, %r14d #143.9 - andl $-8, %r15d #143.9 - jmp ..B2.19 # Prob 100% #143.9 - # LOE rax rdx rcx rbx rdi r8 r9 r10 r11 r12d r13d r14d r15d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 -..B2.35: # Preds ..B2.9 + movl %r11d, %r14d #144.9 + xorl %r12d, %r12d #144.9 + andl $-8, %r14d #144.9 + jmp ..B2.19 # Prob 100% #144.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11d r12d r14d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 +..B2.34: # Preds ..B2.9 # Execution count [2.25e-01]: Infreq - xorl %r15d, %r15d #143.9 - jmp ..B2.23 # Prob 100% #143.9 - # LOE rax rdx rcx rbx rdi r8 r9 r10 r11 r12d r13d r15d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 -..B2.36: # Preds ..B2.2 + xorl %r14d, %r14d #144.9 + jmp ..B2.23 # Prob 100% #144.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11d r14d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 +..B2.35: # Preds ..B2.2 # Execution count [1.00e+00]: Infreq - cmpl $8, %esi #114.5 - jl ..B2.42 # Prob 10% #114.5 - # LOE rbx rdi r13 esi r15d -..B2.37: # Preds ..B2.36 + cmpl $8, %esi #113.5 + jl ..B2.41 # Prob 10% #113.5 + # LOE rdi r12 r13 r14 ebx esi +..B2.36: # Preds ..B2.35 # Execution count [1.00e+00]: Infreq - movl %esi, %eax #114.5 - xorl %ecx, %ecx #114.5 - andl $-8, %eax #114.5 - movslq %eax, %rdx #114.5 - vpxord %zmm0, %zmm0, %zmm0 #114.5 - # LOE rdx rcx rbx rdi r13 eax esi r15d zmm0 -..B2.38: # Preds ..B2.38 ..B2.37 + movl %esi, %eax #113.5 + xorl %ecx, %ecx #113.5 + andl $-8, %eax #113.5 + movslq %eax, %rdx #113.5 + vpxord %zmm0, %zmm0, %zmm0 #113.5 + # LOE rdx rcx rdi r12 r13 r14 eax ebx esi zmm0 +..B2.37: # Preds ..B2.37 ..B2.36 # Execution count [5.56e+00]: Infreq - vmovupd %zmm0, (%rdi,%rcx,8) #115.9 - addq $8, %rcx #114.5 - cmpq %rdx, %rcx #114.5 - jb ..B2.38 # Prob 82% #114.5 - # LOE rdx rcx rbx rdi r13 eax esi r15d zmm0 -..B2.40: # Preds ..B2.38 ..B2.42 + vmovupd %zmm0, (%rdi,%rcx,8) #114.9 + addq $8, %rcx #113.5 + cmpq %rdx, %rcx #113.5 + jb ..B2.37 # Prob 82% #113.5 + # LOE rdx rcx rdi r12 r13 r14 eax ebx esi zmm0 +..B2.39: # Preds ..B2.37 ..B2.41 # Execution count [1.11e+00]: Infreq - lea 1(%rax), %edx #114.5 - cmpl %esi, %edx #114.5 - 
ja ..B2.48 # Prob 50% #114.5 - # LOE rbx rdi r13 eax esi r15d -..B2.41: # Preds ..B2.40 + lea 1(%rax), %edx #113.5 + cmpl %esi, %edx #113.5 + ja ..B2.47 # Prob 50% #113.5 + # LOE rdi r12 r13 r14 eax ebx esi +..B2.40: # Preds ..B2.39 # Execution count [5.56e+00]: Infreq - subl %eax, %esi #114.5 - vpbroadcastd %esi, %ymm0 #114.5 - vpcmpgtd .L_2il0floatpacket.0(%rip), %ymm0, %k1 #114.5 - movslq %eax, %rax #114.5 - movslq %r15d, %r14 #114.5 - vpxord %zmm1, %zmm1, %zmm1 #115.9 - vmovupd %zmm1, (%rdi,%rax,8){%k1} #115.9 - jmp ..B2.4 # Prob 100% #115.9 - # LOE rbx r13 r14 r15d -..B2.42: # Preds ..B2.36 + subl %eax, %esi #113.5 + vpbroadcastd %esi, %ymm0 #113.5 + vpcmpgtd .L_2il0floatpacket.0(%rip), %ymm0, %k1 #113.5 + movslq %eax, %rax #113.5 + movslq %ebx, %r15 #113.5 + vpxord %zmm1, %zmm1, %zmm1 #114.9 + vmovupd %zmm1, (%rdi,%rax,8){%k1} #114.9 + jmp ..B2.4 # Prob 100% #114.9 + # LOE r12 r13 r14 r15 ebx +..B2.41: # Preds ..B2.35 # Execution count [1.00e-01]: Infreq - xorl %eax, %eax #114.5 - jmp ..B2.40 # Prob 100% #114.5 - # LOE rbx rdi r13 eax esi r15d -..B2.48: # Preds ..B2.40 + xorl %eax, %eax #113.5 + jmp ..B2.39 # Prob 100% #113.5 + # LOE rdi r12 r13 r14 eax ebx esi +..B2.47: # Preds ..B2.39 # Execution count [5.56e-01]: Infreq - movslq %r15d, %r14 #114.5 - jmp ..B2.4 # Prob 100% #114.5 + movslq %ebx, %r15 #113.5 + jmp ..B2.4 # Prob 100% #113.5 .align 16,0x90 - # LOE rbx r13 r14 r15d + # LOE r12 r13 r14 r15 ebx .cfi_endproc # mark_end; .type computeForceLJHalfNeigh,@function @@ -1226,311 +1190,286 @@ computeForceLJFullNeigh_simd: ..B3.1: # Preds ..B3.0 # Execution count [1.00e+00] .cfi_startproc -..___tag_value_computeForceLJFullNeigh_simd.126: -..L127: - #189.101 - pushq %rbp #189.101 +..___tag_value_computeForceLJFullNeigh_simd.112: +..L113: + #191.101 + pushq %rbp #191.101 .cfi_def_cfa_offset 16 - movq %rsp, %rbp #189.101 + movq %rsp, %rbp #191.101 .cfi_def_cfa 6, 16 .cfi_offset 6, -16 - andq $-64, %rsp #189.101 - pushq %r12 #189.101 - pushq %r13 #189.101 - pushq %r14 #189.101 - pushq %r15 #189.101 - pushq %rbx #189.101 - subq $216, %rsp #189.101 - .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22 - .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 - .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 - .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 - .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 - movq %rdi, %rbx #189.101 - movq %rsi, %r13 #189.101 - movq %rdx, %r12 #189.101 - vmovsd 144(%rbx), %xmm0 #192.27 - vmovsd 56(%rbx), %xmm1 #193.23 - vmovsd 40(%rbx), %xmm2 #194.24 - movl 4(%r13), %r14d #190.18 - vmovsd %xmm0, 16(%rsp) #192.27[spill] - vmovsd %xmm1, 8(%rsp) #193.23[spill] - vmovsd %xmm2, (%rsp) #194.24[spill] - testl %r14d, %r14d #196.24 - jle ..B3.9 # Prob 50% #196.24 - # LOE rbx r12 r13 r14d + andq $-64, %rsp #191.101 + pushq %r14 #191.101 + pushq %r15 #191.101 + pushq %rbx #191.101 + subq $232, %rsp #191.101 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + movq %rsi, 
%r14 #191.101 + vmovsd 144(%rdi), %xmm0 #194.27 + movq %rdx, %r15 #191.101 + vmovsd 56(%rdi), %xmm1 #195.23 + vmovsd 40(%rdi), %xmm2 #196.24 + movl 4(%r14), %ebx #192.18 + vmovsd %xmm0, 216(%rsp) #194.27[spill] + vmovsd %xmm1, 200(%rsp) #195.23[spill] + vmovsd %xmm2, 208(%rsp) #196.24[spill] + testl %ebx, %ebx #198.24 + jle ..B3.8 # Prob 50% #198.24 + # LOE r12 r13 r14 r15 ebx ..B3.2: # Preds ..B3.1 # Execution count [1.00e+00] - movl %r14d, %ecx #196.5 - xorl %edx, %edx #196.5 - movl $1, %esi #196.5 - xorl %eax, %eax #196.5 - shrl $1, %ecx #196.5 - je ..B3.6 # Prob 9% #196.5 - # LOE rax rdx rcx rbx r12 r13 esi r14d + movl %ebx, %eax #198.5 + xorl %edx, %edx #198.5 + movl $1, %ecx #198.5 + xorl %esi, %esi #198.5 + shrl $1, %eax #198.5 + je ..B3.6 # Prob 9% #198.5 + # LOE rax rdx rsi r12 r13 r14 r15 ecx ebx ..B3.3: # Preds ..B3.2 # Execution count [9.00e-01] - xorl %r15d, %r15d #196.5 + xorl %ecx, %ecx #198.5 .align 16,0x90 - # LOE rax rdx rcx rbx r12 r13 r15 r14d + # LOE rax rdx rcx rsi r12 r13 r14 r15 ebx ..B3.4: # Preds ..B3.4 ..B3.3 # Execution count [2.50e+00] - movq 64(%r13), %rsi #197.9 - incq %rdx #196.5 - movq %r15, (%rsi,%rax) #197.9 - movq 64(%r13), %rdi #198.9 - movq %r15, 8(%rdi,%rax) #198.9 - movq 64(%r13), %r8 #199.9 - movq %r15, 16(%r8,%rax) #199.9 - movq 64(%r13), %r9 #197.9 - movq %r15, 24(%r9,%rax) #197.9 - movq 64(%r13), %r10 #198.9 - movq %r15, 32(%r10,%rax) #198.9 - movq 64(%r13), %r11 #199.9 - movq %r15, 40(%r11,%rax) #199.9 - addq $48, %rax #196.5 - cmpq %rcx, %rdx #196.5 - jb ..B3.4 # Prob 63% #196.5 - # LOE rax rdx rcx rbx r12 r13 r15 r14d + movq 64(%r14), %rdi #199.9 + incq %rdx #198.5 + movq %rcx, (%rdi,%rsi) #199.9 + movq 64(%r14), %r8 #200.9 + movq %rcx, 8(%r8,%rsi) #200.9 + movq 64(%r14), %r9 #201.9 + movq %rcx, 16(%r9,%rsi) #201.9 + movq 64(%r14), %r10 #199.9 + movq %rcx, 24(%r10,%rsi) #199.9 + movq 64(%r14), %r11 #200.9 + movq %rcx, 32(%r11,%rsi) #200.9 + movq 64(%r14), %rdi #201.9 + movq %rcx, 40(%rdi,%rsi) #201.9 + addq $48, %rsi #198.5 + cmpq %rax, %rdx #198.5 + jb ..B3.4 # Prob 63% #198.5 + # LOE rax rdx rcx rsi r12 r13 r14 r15 ebx ..B3.5: # Preds ..B3.4 # Execution count [9.00e-01] - lea 1(%rdx,%rdx), %esi #197.9 - # LOE rbx r12 r13 esi r14d + lea 1(%rdx,%rdx), %ecx #199.9 + # LOE r12 r13 r14 r15 ecx ebx ..B3.6: # Preds ..B3.5 ..B3.2 # Execution count [1.00e+00] - lea -1(%rsi), %eax #196.5 - cmpl %r14d, %eax #196.5 - jae ..B3.9 # Prob 9% #196.5 - # LOE rbx r12 r13 esi r14d + lea -1(%rcx), %eax #198.5 + cmpl %ebx, %eax #198.5 + jae ..B3.8 # Prob 9% #198.5 + # LOE r12 r13 r14 r15 ecx ebx ..B3.7: # Preds ..B3.6 # Execution count [9.00e-01] - movslq %esi, %rsi #197.9 - xorl %ecx, %ecx #197.9 - movq 64(%r13), %rax #197.9 - lea (%rsi,%rsi,2), %r8 #197.9 - movq %rcx, -24(%rax,%r8,8) #197.9 - movq 64(%r13), %rdx #198.9 - movq %rcx, -16(%rdx,%r8,8) #198.9 - movq 64(%r13), %rdi #199.9 - movq %rcx, -8(%rdi,%r8,8) #199.9 - # LOE rbx r12 r13 r14d -..B3.9: # Preds ..B3.7 ..B3.6 ..B3.1 - # Execution count [5.00e-01] - xorl %eax, %eax #203.16 + movslq %ecx, %rcx #199.9 + xorl %esi, %esi #199.9 + movq 64(%r14), %rax #199.9 + lea (%rcx,%rcx,2), %r8 #199.9 + movq %rsi, -24(%rax,%r8,8) #199.9 + movq 64(%r14), %rdx #200.9 + movq %rsi, -16(%rdx,%r8,8) #200.9 + movq 64(%r14), %rdi #201.9 + movq %rsi, -8(%rdi,%r8,8) #201.9 + # LOE r12 r13 r14 r15 ebx +..B3.8: # Preds ..B3.1 ..B3.6 ..B3.7 + # Execution count [1.00e+00] + xorl %eax, %eax #204.16 +..___tag_value_computeForceLJFullNeigh_simd.123: +# getTimeStamp() + call getTimeStamp #204.16 
+..___tag_value_computeForceLJFullNeigh_simd.124: + # LOE r12 r13 r14 r15 ebx xmm0 +..B3.23: # Preds ..B3.8 + # Execution count [1.00e+00] + vmovsd %xmm0, 192(%rsp) #204.16[spill] + # LOE r12 r13 r14 r15 ebx +..B3.9: # Preds ..B3.23 + # Execution count [1.00e+00] + vmovsd 216(%rsp), %xmm0 #210.36[spill] + movl $.L_2__STRING.0, %edi #219.5 + vmulsd %xmm0, %xmm0, %xmm1 #210.36 + vbroadcastsd 200(%rsp), %zmm3 #211.32[spill] + vbroadcastsd 208(%rsp), %zmm4 #212.29[spill] + vbroadcastsd %xmm1, %zmm2 #210.36 + vmovups %zmm3, 64(%rsp) #211.32[spill] + vmovups %zmm4, 128(%rsp) #212.29[spill] + vmovups %zmm2, (%rsp) #210.36[spill] + vzeroupper #219.5 +..___tag_value_computeForceLJFullNeigh_simd.132: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #219.5 +..___tag_value_computeForceLJFullNeigh_simd.133: + # LOE r12 r13 r14 r15 ebx +..B3.10: # Preds ..B3.9 + # Execution count [1.00e+00] + xorl %ecx, %ecx #223.9 + xorl %r10d, %r10d #222.5 + xorl %edx, %edx #223.9 + testl %ebx, %ebx #222.24 + jle ..B3.18 # Prob 9% #222.24 + # LOE rdx rcx r12 r13 r14 r15 ebx r10d +..B3.11: # Preds ..B3.10 + # Execution count [9.00e-01] + vmovdqu .L_2il0floatpacket.10(%rip), %ymm13 #235.101 + vmovups .L_2il0floatpacket.6(%rip), %zmm0 #258.23 + vmovups 128(%rsp), %zmm10 #258.23[spill] + vmovups 64(%rsp), %zmm11 #258.23[spill] + vmovups (%rsp), %zmm12 #258.23[spill] + vbroadcastsd .L_2il0floatpacket.9(%rip), %zmm8 #258.23 + vbroadcastsd .L_2il0floatpacket.3(%rip), %zmm9 #258.23 + vpxord %zmm1, %zmm1, %zmm1 #229.29 + # LOE rdx rcx r12 r13 r14 r15 ebx r10d ymm13 zmm0 zmm1 zmm8 zmm9 zmm10 zmm11 zmm12 +..B3.12: # Preds ..B3.16 ..B3.11 + # Execution count [5.00e+00] + movl %r10d, %edi #223.43 + xorl %r9d, %r9d #233.9 + imull 8(%r15), %edi #223.43 + movslq %edi, %rdi #223.19 + movq 24(%r15), %rsi #224.25 + movq 16(%r15), %rax #223.19 + movq 16(%r14), %r8 #226.45 + vmovaps %zmm1, %zmm3 #229.29 + vmovaps %zmm3, %zmm2 #230.29 + lea (%rax,%rdi,4), %rdi #223.19 + movl (%rsi,%rcx,4), %esi #224.25 + xorl %eax, %eax #235.78 + vmovaps %zmm2, %zmm7 #231.29 + vpbroadcastd %esi, %ymm6 #225.37 + vbroadcastsd (%r8,%rdx,8), %zmm5 #226.30 + vbroadcastsd 8(%r8,%rdx,8), %zmm4 #227.30 + vbroadcastsd 16(%r8,%rdx,8), %zmm14 #228.30 + testl %esi, %esi #233.28 + jle ..B3.16 # Prob 10% #233.28 + # LOE rdx rcx rdi r8 r12 r13 r14 r15 eax ebx esi r9d r10d ymm6 ymm13 zmm0 zmm1 zmm2 zmm3 zmm4 zmm5 zmm7 zmm8 zmm9 zmm10 zmm11 zmm12 zmm14 +..B3.13: # Preds ..B3.12 + # Execution count [4.50e+00] + addl $7, %esi #224.25 + shrl $3, %esi #224.25 + # LOE rdx rcx rdi r8 r12 r13 r14 r15 eax ebx esi r9d r10d ymm6 ymm13 zmm0 zmm1 zmm2 zmm3 zmm4 zmm5 zmm7 zmm8 zmm9 zmm10 zmm11 zmm12 zmm14 +..B3.14: # Preds ..B3.14 ..B3.13 + # Execution count [2.50e+01] + vpbroadcastd %eax, %ymm15 #235.78 + incl %r9d #233.9 + vpcmpeqb %xmm0, %xmm0, %k4 #241.41 + vpcmpeqb %xmm0, %xmm0, %k3 #240.41 + vpcmpeqb %xmm0, %xmm0, %k2 #239.41 + vpaddd %ymm13, %ymm15, %ymm16 #235.65 + vpcmpgtd %ymm16, %ymm6, %k1 #235.43 + movslq %eax, %rax #236.29 + kmovw %k1, %r11d #235.43 + kmovb %r11d, %k5 #248.40 + vmovdqu32 (%rdi,%rax,4), %ymm18{%k1}{z} #236.29 + addl $8, %eax #233.9 + vpaddd %ymm18, %ymm18, %ymm17 #238.43 + vpaddd %ymm18, %ymm17, %ymm19 #238.30 + vpxord %zmm22, %zmm22, %zmm22 #241.41 + vpxord %zmm21, %zmm21, %zmm21 #240.41 + vpxord %zmm20, %zmm20, %zmm20 #239.41 + vgatherdpd 16(%r8,%ymm19,8), %zmm22{%k4} #241.41 + vgatherdpd 8(%r8,%ymm19,8), %zmm21{%k3} #240.41 + vgatherdpd (%r8,%ymm19,8), %zmm20{%k2} #239.41 + vsubpd %zmm22, %zmm14, %zmm16 #241.41 + vsubpd %zmm21, 
%zmm4, %zmm15 #240.41
+ vsubpd %zmm20, %zmm5, %zmm31 #239.41
+ vmulpd %zmm16, %zmm16, %zmm23 #247.75
+ vfmadd231pd %zmm15, %zmm15, %zmm23 #247.54
+ vfmadd231pd %zmm31, %zmm31, %zmm23 #247.33
+ vrcp14pd %zmm23, %zmm25 #249.33
+ vcmppd $17, %zmm12, %zmm23, %k0 #248.70
+ vmulpd %zmm11, %zmm25, %zmm24 #250.61
+ vmulpd %zmm10, %zmm25, %zmm27 #251.100
+ kmovw %k0, %r11d #248.70
+ vmulpd %zmm24, %zmm25, %zmm26 #250.47
+ vmulpd %zmm26, %zmm25, %zmm28 #250.33
+ vfmsub213pd %zmm8, %zmm25, %zmm26 #251.76
+ vmulpd %zmm27, %zmm26, %zmm29 #251.67
+ vmulpd %zmm29, %zmm28, %zmm30 #251.53
+ vmulpd %zmm30, %zmm9, %zmm23 #251.35
+ kmovb %r11d, %k6 #248.40
+ kandb %k6, %k5, %k7 #248.40
+ kmovb %k7, %r11d #248.40
+ kmovw %r11d, %k1 #253.19
+ vfmadd231pd %zmm31, %zmm23, %zmm3{%k1} #253.19
+ vfmadd231pd %zmm15, %zmm23, %zmm2{%k1} #254.19
+ vfmadd231pd %zmm16, %zmm23, %zmm7{%k1} #255.19
+ cmpl %esi, %r9d #233.9
+ jb ..B3.14 # Prob 82% #233.9
+ # LOE rdx rcx rdi r8 r12 r13 r14 r15 eax ebx esi r9d r10d ymm6 ymm13 zmm0 zmm1 zmm2 zmm3 zmm4 zmm5 zmm7 zmm8 zmm9 zmm10 zmm11 zmm12 zmm14
+..B3.16: # Preds ..B3.14 ..B3.12
+ # Execution count [5.00e+00]
+ vpermd %zmm3, %zmm0, %zmm4 #258.23
+ incl %r10d #222.5
+ vpermd %zmm2, %zmm0, %zmm18 #259.23
+ vpermd %zmm7, %zmm0, %zmm25 #260.23
+ vaddpd %zmm3, %zmm4, %zmm5 #258.23
+ vaddpd %zmm2, %zmm18, %zmm19 #259.23
+ vaddpd %zmm7, %zmm25, %zmm26 #260.23
+ vshuff64x2 $17, %zmm5, %zmm5, %zmm3 #258.23
+ vshuff64x2 $17, %zmm19, %zmm19, %zmm2 #259.23
+ vshuff64x2 $17, %zmm26, %zmm26, %zmm7 #260.23
+ vaddpd %zmm5, %zmm3, %zmm14 #258.23
+ vaddpd %zmm19, %zmm2, %zmm21 #259.23
+ vaddpd %zmm26, %zmm7, %zmm28 #260.23
+ vpermilpd $1, %zmm14, %zmm6 #258.23
+ incq %rcx #222.5
+ vaddpd %zmm14, %zmm6, %zmm15 #258.23
+ vmovups %zmm15, (%rsp) #258.23
+ movq 64(%r14), %rax #258.9
+ vpermilpd $1, %zmm21, %zmm20 #259.23
+ vaddpd %zmm21, %zmm20, %zmm22 #259.23
+ vmovsd (%rax,%rdx,8), %xmm16 #258.9
+ vaddsd (%rsp), %xmm16, %xmm17 #258.9
+ vmovups %zmm22, 64(%rsp) #259.23
+ vmovsd %xmm17, (%rax,%rdx,8) #258.9
+ movq 64(%r14), %rsi #259.9
+ vpermilpd $1, %zmm28, %zmm27 #260.23
+ vaddpd %zmm28, %zmm27, %zmm29 #260.23
+ vmovsd 8(%rsi,%rdx,8), %xmm23 #259.9
+ vaddsd 64(%rsp), %xmm23, %xmm24 #259.9
+ vmovups %zmm29, 128(%rsp) #260.23
+ vmovsd %xmm24, 8(%rsi,%rdx,8) #259.9
+ movq 64(%r14), %rdi #260.9
+ vmovsd 16(%rdi,%rdx,8), %xmm30 #260.9
+ vaddsd 128(%rsp), %xmm30, %xmm31 #260.9
+ vmovsd %xmm31, 16(%rdi,%rdx,8) #260.9
+ addq $3, %rdx #222.5
+ cmpl %ebx, %r10d #222.5
+ jb ..B3.12 # Prob 82% #222.5
+ # LOE rdx rcx r12 r13 r14 r15 ebx r10d ymm13 zmm0 zmm1 zmm8 zmm9 zmm10 zmm11 zmm12
+..B3.18: # Preds ..B3.16 ..B3.10
+ # Execution count [1.00e+00]
+ movl $.L_2__STRING.0, %edi #263.5
+ vzeroupper #263.5
+..___tag_value_computeForceLJFullNeigh_simd.137:
+# likwid_markerStopRegion(const char *)
+ call likwid_markerStopRegion #263.5
+..___tag_value_computeForceLJFullNeigh_simd.138:
+ # LOE r12 r13
+..B3.19: # Preds ..B3.18
+ # Execution count [1.00e+00]
+ xorl %eax, %eax #267.16
 ..___tag_value_computeForceLJFullNeigh_simd.139:
 # getTimeStamp()
- call getTimeStamp #203.16
+ call getTimeStamp #267.16
 ..___tag_value_computeForceLJFullNeigh_simd.140:
- # LOE rbx r12 r13 r14d xmm0
-..B3.26: # Preds ..B3.9
- # Execution count [5.00e-01]
- vmovsd %xmm0, 192(%rsp) #203.16[spill]
- # LOE rbx r12 r13 r14d
-..B3.10: # Preds ..B3.26
- # Execution count [1.00e+00]
- movl $.L_2__STRING.0, %edi #204.5
- xorl %r15d, %r15d #204.5
-..___tag_value_computeForceLJFullNeigh_simd.142:
-# likwid_markerStartRegion(const char *)
- call likwid_markerStartRegion #204.5
-..___tag_value_computeForceLJFullNeigh_simd.143:
- # LOE rbx r12 r13 r14d r15d
-..B3.11: # Preds ..B3.10
- # Execution count [1.00e+00]
- vmovsd 16(%rsp), %xmm0 #210.36[spill]
- xorl %edi, %edi #217.9
- vmulsd %xmm0, %xmm0, %xmm1 #210.36
- xorl %r11d, %r11d #216.5
- vbroadcastsd 8(%rsp), %zmm10 #211.32[spill]
- vbroadcastsd (%rsp), %zmm9 #212.29[spill]
- vbroadcastsd %xmm1, %zmm11 #210.36
- vbroadcastsd .L_2il0floatpacket.3(%rip), %zmm8 #213.29
- vbroadcastsd .L_2il0floatpacket.10(%rip), %zmm13 #214.29
- xorl %edx, %edx #217.9
- testl %r14d, %r14d #216.24
- jle ..B3.19 # Prob 9% #216.24
- # LOE rdx rbx rdi r12 r13 r11d r14d r15d zmm8 zmm9 zmm10 zmm11 zmm13
-..B3.12: # Preds ..B3.11
- # Execution count [9.00e-01]
- vmovdqu .L_2il0floatpacket.11(%rip), %ymm12 #230.101
- vmovups .L_2il0floatpacket.6(%rip), %zmm0 #253.23
- vpxord %zmm1, %zmm1, %zmm1 #223.29
- # LOE rdx rbx rdi r12 r13 r11d r14d r15d ymm12 zmm0 zmm1 zmm8 zmm9 zmm10 zmm11 zmm13
-..B3.13: # Preds ..B3.17 ..B3.12
- # Execution count [5.00e+00]
- movl %r11d, %r8d #217.43
- xorl %r10d, %r10d #228.9
- imull 8(%r12), %r8d #217.43
- movslq %r8d, %r8 #217.19
- movq 24(%r12), %rcx #218.25
- movq 16(%r12), %rax #217.19
- movq 16(%r13), %r9 #220.45
- vmovaps %zmm1, %zmm3 #223.29
- vmovaps %zmm3, %zmm2 #224.29
- lea (%rax,%r8,4), %r8 #217.19
- movl (%rcx,%rdi,4), %ecx #218.25
- addl %ecx, %r15d #227.9
- vmovaps %zmm2, %zmm7 #225.29
- xorl %eax, %eax #230.78
- vpbroadcastd %ecx, %ymm6 #219.37
- vbroadcastsd (%r9,%rdx,8), %zmm5 #220.30
- vbroadcastsd 8(%r9,%rdx,8), %zmm4 #221.30
- vbroadcastsd 16(%r9,%rdx,8), %zmm14 #222.30
- testl %ecx, %ecx #228.28
- jle ..B3.17 # Prob 10% #228.28
- # LOE rdx rbx rdi r8 r9 r12 r13 eax ecx r10d r11d r14d r15d ymm6 ymm12 zmm0 zmm1 zmm2 zmm3 zmm4 zmm5 zmm7 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14
-..B3.14: # Preds ..B3.13
- # Execution count [4.50e+00]
- addl $7, %ecx #218.25
- shrl $3, %ecx #218.25
- # LOE rdx rbx rdi r8 r9 r12 r13 eax ecx r10d r11d r14d r15d ymm6 ymm12 zmm0 zmm1 zmm2 zmm3 zmm4 zmm5 zmm7 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14
-..B3.15: # Preds ..B3.15 ..B3.14
- # Execution count [2.50e+01]
- vpbroadcastd %eax, %ymm15 #230.78
- incl %r10d #228.9
- vpcmpeqb %xmm0, %xmm0, %k4 #236.41
- vpcmpeqb %xmm0, %xmm0, %k3 #235.41
- vpcmpeqb %xmm0, %xmm0, %k2 #234.41
- vpaddd %ymm12, %ymm15, %ymm16 #230.65
- vpcmpgtd %ymm16, %ymm6, %k1 #230.43
- movslq %eax, %rax #231.29
- kmovw %k1, %esi #230.43
- kmovb %esi, %k5 #243.40
- vmovdqu32 (%r8,%rax,4), %ymm18{%k1}{z} #231.29
- addl $8, %eax #228.9
- vpaddd %ymm18, %ymm18, %ymm17 #233.43
- vpaddd %ymm18, %ymm17, %ymm19 #233.30
- vpxord %zmm22, %zmm22, %zmm22 #236.41
- vpxord %zmm21, %zmm21, %zmm21 #235.41
- vpxord %zmm20, %zmm20, %zmm20 #234.41
- vgatherdpd 16(%r9,%ymm19,8), %zmm22{%k4} #236.41
- vgatherdpd 8(%r9,%ymm19,8), %zmm21{%k3} #235.41
- vgatherdpd (%r9,%ymm19,8), %zmm20{%k2} #234.41
- vsubpd %zmm22, %zmm14, %zmm16 #236.41
- vsubpd %zmm21, %zmm4, %zmm15 #235.41
- vsubpd %zmm20, %zmm5, %zmm31 #234.41
- vmulpd %zmm16, %zmm16, %zmm23 #242.75
- vfmadd231pd %zmm15, %zmm15, %zmm23 #242.54
- vfmadd231pd %zmm31, %zmm31, %zmm23 #242.33
- vrcp14pd %zmm23, %zmm25 #244.33
- vcmppd $17, %zmm11, %zmm23, %k0 #243.70
- vmulpd %zmm10, %zmm25, %zmm24 #245.61
- vmulpd %zmm9, %zmm25, %zmm27 #246.100
- kmovw %k0, %esi #243.70
- vmulpd %zmm24, %zmm25, %zmm26 #245.47
- vmulpd %zmm26, %zmm25, %zmm28 #245.33
- vfmsub213pd %zmm13, %zmm25, %zmm26 #246.76
- vmulpd %zmm27, %zmm26, %zmm29 #246.67
- vmulpd %zmm29, %zmm28, %zmm30 #246.53
- vmulpd %zmm30, %zmm8, %zmm23 #246.35
- kmovb %esi, %k6 #243.40
- kandb %k6, %k5, %k7 #243.40
- kmovb %k7, %esi #243.40
- kmovw %esi, %k1 #248.19
- vfmadd231pd %zmm31, %zmm23, %zmm3{%k1} #248.19
- vfmadd231pd %zmm15, %zmm23, %zmm2{%k1} #249.19
- vfmadd231pd %zmm16, %zmm23, %zmm7{%k1} #250.19
- cmpl %ecx, %r10d #228.9
- jb ..B3.15 # Prob 82% #228.9
- # LOE rdx rbx rdi r8 r9 r12 r13 eax ecx r10d r11d r14d r15d ymm6 ymm12 zmm0 zmm1 zmm2 zmm3 zmm4 zmm5 zmm7 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14
-..B3.17: # Preds ..B3.15 ..B3.13
- # Execution count [5.00e+00]
- vpermd %zmm3, %zmm0, %zmm4 #253.23
- incl %r11d #216.5
- vpermd %zmm2, %zmm0, %zmm16 #254.23
- vpermd %zmm7, %zmm0, %zmm21 #255.23
- vaddpd %zmm3, %zmm4, %zmm5 #253.23
- vaddpd %zmm2, %zmm16, %zmm17 #254.23
- vaddpd %zmm7, %zmm21, %zmm22 #255.23
- vshuff64x2 $17, %zmm5, %zmm5, %zmm3 #253.23
- vshuff64x2 $17, %zmm17, %zmm17, %zmm2 #254.23
- vshuff64x2 $17, %zmm22, %zmm22, %zmm7 #255.23
- vaddpd %zmm5, %zmm3, %zmm14 #253.23
- vaddpd %zmm17, %zmm2, %zmm19 #254.23
- vaddpd %zmm22, %zmm7, %zmm24 #255.23
- vpermilpd $1, %zmm14, %zmm6 #253.23
- incq %rdi #216.5
- vaddpd %zmm14, %zmm6, %zmm15 #253.23
- vmovups %zmm15, (%rsp) #253.23
- movq 64(%r13), %rax #253.9
- vpermilpd $1, %zmm19, %zmm18 #254.23
- vaddpd %zmm19, %zmm18, %zmm20 #254.23
- vmovsd (%rax,%rdx,8), %xmm26 #253.9
- vaddsd (%rsp), %xmm26, %xmm27 #253.9
- vmovups %zmm20, 64(%rsp) #254.23
- vmovsd %xmm27, (%rax,%rdx,8) #253.9
- movq 64(%r13), %rcx #254.9
- vpermilpd $1, %zmm24, %zmm23 #255.23
- vaddpd %zmm24, %zmm23, %zmm25 #255.23
- vmovsd 8(%rcx,%rdx,8), %xmm28 #254.9
- vaddsd 64(%rsp), %xmm28, %xmm29 #254.9
- vmovups %zmm25, 128(%rsp) #255.23
- vmovsd %xmm29, 8(%rcx,%rdx,8) #254.9
- movq 64(%r13), %r8 #255.9
- vmovsd 16(%r8,%rdx,8), %xmm30 #255.9
- vaddsd 128(%rsp), %xmm30, %xmm31 #255.9
- vmovsd %xmm31, 16(%r8,%rdx,8) #255.9
- addq $3, %rdx #216.5
- cmpl %r14d, %r11d #216.5
- jb ..B3.13 # Prob 82% #216.5
- # LOE rdx rbx rdi r12 r13 r11d r14d r15d ymm12 zmm0 zmm1 zmm8 zmm9 zmm10 zmm11 zmm13
-..B3.19: # Preds ..B3.17 ..B3.11
- # Execution count [1.00e+00]
- movl $.L_2__STRING.0, %edi #259.5
- vzeroupper #259.5
-..___tag_value_computeForceLJFullNeigh_simd.147:
-# likwid_markerStopRegion(const char *)
- call likwid_markerStopRegion #259.5
-..___tag_value_computeForceLJFullNeigh_simd.148:
- # LOE rbx r15d
+ # LOE r12 r13 xmm0
 ..B3.20: # Preds ..B3.19
 # Execution count [1.00e+00]
- xorl %eax, %eax #260.16
-..___tag_value_computeForceLJFullNeigh_simd.149:
-# getTimeStamp()
- call getTimeStamp #260.16
-..___tag_value_computeForceLJFullNeigh_simd.150:
- # LOE rbx r15d xmm0
-..B3.21: # Preds ..B3.20
- # Execution count [1.00e+00]
- vxorpd %xmm4, %xmm4, %xmm4 #261.5
- movl $.L_2__STRING.3, %edi #261.5
- vmovsd .L_2il0floatpacket.9(%rip), %xmm3 #261.5
- movl %r15d, %esi #261.5
- vmovsd 264(%rbx), %xmm7 #261.68
- movl $3, %eax #261.5
- vsubsd 192(%rsp), %xmm0, %xmm1 #261.88[spill]
- vcvtusi2sdl %r15d, %xmm4, %xmm4 #261.5
- vdivsd %xmm4, %xmm3, %xmm5 #261.5
- vmulsd %xmm1, %xmm5, %xmm6 #261.5
- vmulsd %xmm7, %xmm6, %xmm2 #261.5
- vmovapd %xmm7, %xmm0 #261.5
- vmovsd %xmm1, (%rsp) #261.5[spill]
-..___tag_value_computeForceLJFullNeigh_simd.152:
-# printf(const char *__restrict__, ...)
- call printf #261.5
-..___tag_value_computeForceLJFullNeigh_simd.153:
- # LOE
-..B3.22: # Preds ..B3.21
- # Execution count [1.00e+00]
- vmovsd (%rsp), %xmm1 #[spill]
- vmovapd %xmm1, %xmm0 #262.14
- addq $216, %rsp #262.14
+ vsubsd 192(%rsp), %xmm0, %xmm0 #268.14[spill]
+ addq $232, %rsp #268.14
 .cfi_restore 3
- popq %rbx #262.14
+ popq %rbx #268.14
 .cfi_restore 15
- popq %r15 #262.14
+ popq %r15 #268.14
 .cfi_restore 14
- popq %r14 #262.14
- .cfi_restore 13
- popq %r13 #262.14
- .cfi_restore 12
- popq %r12 #262.14
- movq %rbp, %rsp #262.14
- popq %rbp #262.14
+ popq %r14 #268.14
+ movq %rbp, %rsp #268.14
+ popq %rbp #268.14
 .cfi_def_cfa 7, 8
 .cfi_restore 6
- ret #262.14
+ ret #268.14
 .align 16,0x90
 # LOE
 .cfi_endproc
@@ -1578,10 +1517,10 @@ computeForceLJFullNeigh_simd:
 .type .L_2il0floatpacket.8,@object
 .size .L_2il0floatpacket.8,32
 .align 32
-.L_2il0floatpacket.11:
+.L_2il0floatpacket.10:
 .long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
- .type .L_2il0floatpacket.11,@object
- .size .L_2il0floatpacket.11,32
+ .type .L_2il0floatpacket.10,@object
+ .size .L_2il0floatpacket.10,32
 .align 8
 .L_2il0floatpacket.3:
 .long 0x00000000,0x40480000
@@ -1594,14 +1533,9 @@ computeForceLJFullNeigh_simd:
 .size .L_2il0floatpacket.5,8
 .align 8
 .L_2il0floatpacket.9:
- .long 0x00000000,0x41cdcd65
+ .long 0x00000000,0x3fe00000
 .type .L_2il0floatpacket.9,@object
 .size .L_2il0floatpacket.9,8
- .align 8
-.L_2il0floatpacket.10:
- .long 0x00000000,0x3fe00000
- .type .L_2il0floatpacket.10,@object
- .size .L_2il0floatpacket.10,8
 .section .rodata.str1.4, "aMS",@progbits,1
 .align 4
 .align 4
@@ -1620,40 +1554,6 @@ computeForceLJFullNeigh_simd:
 .word 104
 .type .L_2__STRING.1,@object
 .size .L_2__STRING.1,18
- .space 2, 0x00 # pad
- .align 4
-.L_2__STRING.2:
- .long 980644937
- .long 544548128
- .long 1701987872
- .long 622869105
- .long 1411391590
- .long 979725673
- .long 174466336
- .long 1764718915
- .long 622869108
- .long 1747460198
- .long 761687137
- .long 1734960494
- .long 665960
- .type .L_2__STRING.2,@object
- .size .L_2__STRING.2,52
- .align 4
-.L_2__STRING.3:
- .long 980644937
- .long 544548128
- .long 1701987872
- .long 622869105
- .long 1411391590
- .long 979725673
- .long 174466336
- .long 1764718915
- .long 622869108
- .long 1932009574
- .long 694447465
- .word 10
- .type .L_2__STRING.3,@object
- .size .L_2__STRING.3,46
 .data
 .section .note.GNU-stack, ""
# End
diff --git a/static_analysis/jan/lammps-icx-avx2zen.o b/static_analysis/jan/lammps-icx-avx2zen.o
new file mode 100644
index 0000000..291f54c
Binary files /dev/null and b/static_analysis/jan/lammps-icx-avx2zen.o differ
diff --git a/static_analysis/jan/zen-icx-lammps-avx2.s b/static_analysis/jan/lammps-icx-avx2zen.s
similarity index 85%
rename from static_analysis/jan/zen-icx-lammps-avx2.s
rename to static_analysis/jan/lammps-icx-avx2zen.s
index c340cad..fb50b6d 100644
--- a/static_analysis/jan/zen-icx-lammps-avx2.s
+++ b/static_analysis/jan/lammps-icx-avx2zen.s
@@ -172,8 +172,8 @@ movl $111, %ebx # OSACA START MARKER
 .byte 100 # OSACA START MARKER
 .byte 103 # OSACA START MARKER
 .byte 144 # OSACA START MARKER
-# LLVM-MCA-BEGIN # pointer_increment=16 e95035fc9e97f63299dd5188a0872bfc
+# LLVM-MCA-BEGIN
 .LBB0_9: #
 # Parent Loop BB0_6 Depth=1
 # => This Inner Loop Header: Depth=2
@@ -386,8 +386,6 @@ movl $222, %ebx # OSACA END MARKER
 .quad 4607182418800017408 # 1
 .LCPI1_2:
 .quad -4620693217682128896 # -0.5
-.LCPI1_3:
- .quad 4741671816366391296 # 1.0E+9
 .text
 .globl computeForceLJHalfNeigh
 .p2align 4, 0x90
@@ -408,128 +406,125 @@ computeForceLJHalfNeigh: #
 .cfi_def_cfa_offset 48
 pushq %rbx
 .cfi_def_cfa_offset 56
- subq $56, %rsp
- .cfi_def_cfa_offset 112
+ subq $40, %rsp
+ .cfi_def_cfa_offset 96
 .cfi_offset %rbx, -56
 .cfi_offset %r12, -48
 .cfi_offset %r13, -40
 .cfi_offset %r14, -32
 .cfi_offset %r15, -24
 .cfi_offset %rbp, -16
- movq %rcx, 24(%rsp) # 8-byte Spill
- movq %rdx, %r12
- movq %rsi, %r13
- movl 4(%rsi), %r15d
+ movq %rcx, 16(%rsp) # 8-byte Spill
+ movq %rdx, %r15
+ movq %rsi, %r12
+ movl 4(%rsi), %r13d
 vmovsd 144(%rdi), %xmm0 # xmm0 = mem[0],zero
- vmovsd %xmm0, (%rsp) # 8-byte Spill
- vmovsd 40(%rdi), %xmm0 # xmm0 = mem[0],zero
 vmovsd %xmm0, 8(%rsp) # 8-byte Spill
- movq %rdi, 40(%rsp) # 8-byte Spill
+ vmovsd 40(%rdi), %xmm0 # xmm0 = mem[0],zero
+ vmovsd %xmm0, (%rsp) # 8-byte Spill
 vmovsd 56(%rdi), %xmm0 # xmm0 = mem[0],zero
- vmovsd %xmm0, 16(%rsp) # 8-byte Spill
- testl %r15d, %r15d
+ vmovsd %xmm0, 32(%rsp) # 8-byte Spill
+ testl %r13d, %r13d
 jle .LBB1_2
# %bb.1: #
- movq 64(%r13), %rdi
- leaq (,%r15,8), %rax
+ movq 64(%r12), %rdi
+ leaq (,%r13,8), %rax
 leaq (%rax,%rax,2), %rdx
 xorl %esi, %esi
 callq _intel_fast_memset
 .LBB1_2: #
- xorl %r14d, %r14d
 xorl %eax, %eax
 callq getTimeStamp
- vmovsd %xmm0, 32(%rsp) # 8-byte Spill
+ vmovsd %xmm0, 24(%rsp) # 8-byte Spill
 movl $.L.str.1, %edi
 callq likwid_markerStartRegion
- testl %r15d, %r15d
+ testl %r13d, %r13d
 jle .LBB1_8
# %bb.3: #
- vmovsd (%rsp), %xmm0 # 8-byte Reload
- # xmm0 = mem[0],zero
- vmulsd %xmm0, %xmm0, %xmm12
- movq 16(%r12), %rax
- movq 24(%r12), %rcx
- movq %rcx, (%rsp) # 8-byte Spill
- movslq 8(%r12), %rdx
- movq 16(%r13), %rsi
- movq 64(%r13), %rdi
 vmovsd 8(%rsp), %xmm0 # 8-byte Reload
 # xmm0 = mem[0],zero
+ vmulsd %xmm0, %xmm0, %xmm12
+ movq 16(%r15), %rax
+ movq 24(%r15), %rcx
+ movq %rcx, 8(%rsp) # 8-byte Spill
+ movslq 8(%r15), %rdx
+ movq 16(%r12), %rsi
+ movq 64(%r12), %rdi
+ vmovsd (%rsp), %xmm0 # 8-byte Reload
+ # xmm0 = mem[0],zero
 vmulsd .LCPI1_0(%rip), %xmm0, %xmm11
- movq 24(%rsp), %rcx # 8-byte Reload
+ movq 16(%rsp), %rcx # 8-byte Reload
 vmovdqu (%rcx), %xmm10
 shlq $2, %rdx
- movq %rdx, 48(%rsp) # 8-byte Spill
- xorl %r13d, %r13d
- xorl %r14d, %r14d
+ movq %rdx, (%rsp) # 8-byte Spill
+ xorl %r12d, %r12d
 jmp .LBB1_4
 .p2align 4, 0x90
-.LBB1_14: #
+.LBB1_5: #
 # in Loop: Header=BB1_4 Depth=1
- movq 8(%rsp), %rbp # 8-byte Reload
+ vxorpd %xmm13, %xmm13, %xmm13
+ movq %r9, %rdx
+ vxorpd %xmm9, %xmm9, %xmm9
+ vxorpd %xmm14, %xmm14, %xmm14
 .LBB1_6: #
 # in Loop: Header=BB1_4 Depth=1
- addl %r10d, %r14d
- vaddsd (%rdi,%r12,8), %xmm14, %xmm0
- vmovsd %xmm0, (%rdi,%r12,8)
- vaddsd (%rdi,%rbp,8), %xmm15, %xmm0
- vmovsd %xmm0, (%rdi,%rbp,8)
+ vaddsd (%rdi,%r15,8), %xmm14, %xmm0
+ vmovsd %xmm0, (%rdi,%r15,8)
+ vaddsd (%rdi,%r10,8), %xmm9, %xmm0
+ vmovsd %xmm0, (%rdi,%r10,8)
 vaddsd (%rdi,%r11,8), %xmm13, %xmm0
 vmovsd %xmm0, (%rdi,%r11,8)
- leal 3(%r10), %ecx
- addl $6, %r10d
+ leal 3(%r9), %ecx
+ addl $6, %r9d
 testl %ecx, %ecx
- cmovnsl %ecx, %r10d
- sarl $2, %r10d
- movslq %r10d, %rcx
+ cmovnsl %ecx, %r9d
+ sarl $2, %r9d
+ movslq %r9d, %rcx
 vmovq %rcx, %xmm0
 vmovq %rdx, %xmm1
 vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
 vpaddq %xmm0, %xmm10, %xmm10
- incq %r13
- addq 48(%rsp), %rax # 8-byte Folded Reload
- cmpq %r15, %r13
+ incq %r12
+ addq (%rsp), %rax # 8-byte Folded Reload
+ cmpq %r13, %r12
 je .LBB1_7
 .LBB1_4: #
 # =>This Loop Header: Depth=1
 # Child Loop BB1_10 Depth 2
- movq 8(%rsp), %rcx # 8-byte Reload
+ movslq (%rcx,%r12,4), %r9
+ leaq (%r12,%r12,2), %rcx
+ leal 1(%rcx), %r10d
 leal 2(%rcx), %r11d
- movl %ecx, %r12d
+ movl %ecx, %r15d
- testq %r10, %r10
+ testq %r9, %r9
 jle .LBB1_5
# %bb.9: #
 # in Loop: Header=BB1_4 Depth=1
- vmovsd (%rsi,%r12,8), %xmm9 # xmm9 = mem[0],zero
- movq %rbp, 8(%rsp) # 8-byte Spill
- vmovsd (%rsi,%rbp,8), %xmm4 # xmm4 = mem[0],zero
+ vmovsd (%rsi,%r15,8), %xmm15 # xmm15 = mem[0],zero
+ vmovsd (%rsi,%r10,8), %xmm4 # xmm4 = mem[0],zero
 vmovsd (%rsi,%r11,8), %xmm1 # xmm1 = mem[0],zero
- movl %r10d, %edx
+ movl %r9d, %edx
 vxorpd %xmm14, %xmm14, %xmm14
- xorl %ebx, %ebx
+ xorl %ecx, %ecx
- vxorpd %xmm15, %xmm15, %xmm15
+ vxorpd %xmm9, %xmm9, %xmm9
 vxorpd %xmm13, %xmm13, %xmm13
 jmp .LBB1_10
 .p2align 4, 0x90
 .LBB1_13: #
 # in Loop: Header=BB1_10 Depth=2
- incq %rbx
- cmpq %rbx, %rdx
- je .LBB1_14
+ incq %rcx
+ cmpq %rcx, %rdx
+ je .LBB1_6
 .LBB1_10: #
 # Parent Loop BB1_4 Depth=1
 # => This Inner Loop Header: Depth=2
- movslq (%rax,%rbx,4), %r9
- leaq (%r9,%r9,2), %r8
- vsubsd (%rsi,%r8,8), %xmm9, %xmm2
- movslq %r8d, %rcx
- vsubsd 8(%rsi,%rcx,8), %xmm4, %xmm5
- vsubsd 16(%rsi,%rcx,8), %xmm1, %xmm0
+ movslq (%rax,%rcx,4), %r8
+ leaq (%r8,%r8,2), %r14
+ vsubsd (%rsi,%r14,8), %xmm15, %xmm2
+ movslq %r14d, %rbp
+ vsubsd 8(%rsi,%rbp,8), %xmm4, %xmm5
+ vsubsd 16(%rsi,%rbp,8), %xmm1, %xmm0
 vmulsd %xmm2, %xmm2, %xmm6
 vfmadd231sd %xmm5, %xmm5, %xmm6 # xmm6 = (xmm5 * xmm5) + xmm6
 vfmadd231sd %xmm0, %xmm0, %xmm6 # xmm6 = (xmm0 * xmm0) + xmm6
@@ -539,70 +534,45 @@ computeForceLJHalfNeigh: #
 # in Loop: Header=BB1_10 Depth=2
 vmovsd .LCPI1_1(%rip), %xmm3 # xmm3 = mem[0],zero
 vdivsd %xmm6, %xmm3, %xmm6
- vmulsd 16(%rsp), %xmm6, %xmm7 # 8-byte Folded Reload
+ vmulsd 32(%rsp), %xmm6, %xmm3 # 8-byte Folded Reload
 vmulsd %xmm6, %xmm6, %xmm8
- vmulsd %xmm7, %xmm8, %xmm7
- vaddsd .LCPI1_2(%rip), %xmm7, %xmm3
+ vmulsd %xmm3, %xmm8, %xmm3
+ vaddsd .LCPI1_2(%rip), %xmm3, %xmm7
 vmulsd %xmm6, %xmm11, %xmm6
- vmulsd %xmm7, %xmm6, %xmm6
 vmulsd %xmm3, %xmm6, %xmm3
+ vmulsd %xmm7, %xmm3, %xmm3
 vmulsd %xmm2, %xmm3, %xmm6
 vaddsd %xmm6, %xmm14, %xmm14
 vmulsd %xmm5, %xmm3, %xmm2
- vaddsd %xmm2, %xmm15, %xmm15
+ vaddsd %xmm2, %xmm9, %xmm9
 vmulsd %xmm0, %xmm3, %xmm0
 vaddsd %xmm0, %xmm13, %xmm13
- cmpl %r15d, %r9d
+ cmpl %r13d, %r8d
 jge .LBB1_13
# %bb.12: #
 # in Loop: Header=BB1_10 Depth=2
- leaq 1(%rcx), %rbp
- addq $2, %rcx
- vmovsd (%rdi,%r8,8), %xmm3 # xmm3 = mem[0],zero
+ leaq 1(%rbp), %rbx
+ addq $2, %rbp
+ vmovsd (%rdi,%r14,8), %xmm3 # xmm3 = mem[0],zero
 vsubsd %xmm6, %xmm3, %xmm3
- vmovsd %xmm3, (%rdi,%r8,8)
- vmovsd (%rdi,%rbp,8), %xmm3 # xmm3 = mem[0],zero
+ vmovsd %xmm3, (%rdi,%r14,8)
+ vmovsd (%rdi,%rbx,8), %xmm3 # xmm3 = mem[0],zero
 vsubsd %xmm2, %xmm3, %xmm2
- vmovsd %xmm2, (%rdi,%rbp,8)
- vmovsd (%rdi,%rcx,8), %xmm2 # xmm2 = mem[0],zero
+ vmovsd %xmm2, (%rdi,%rbx,8)
+ vmovsd (%rdi,%rbp,8), %xmm2 # xmm2 = mem[0],zero
 vsubsd %xmm0, %xmm2, %xmm0
- vmovsd %xmm0, (%rdi,%rcx,8)
+ vmovsd %xmm0, (%rdi,%rbp,8)
 jmp .LBB1_13
- .p2align 4, 0x90
-.LBB1_5: #
- # in Loop: Header=BB1_4 Depth=1
- vxorpd %xmm13, %xmm13, %xmm13
- movq %r10, %rdx
- vxorpd %xmm15, %xmm15, %xmm15
- vxorpd %xmm14, %xmm14, %xmm14
- jmp .LBB1_6
 .LBB1_7: #
- movq 24(%rsp), %rax # 8-byte Reload
+ movq 16(%rsp), %rax # 8-byte Reload
 vmovdqu %xmm10, (%rax)
 .LBB1_8: #
 movl $.L.str.1, %edi
 callq likwid_markerStopRegion
 xorl %eax, %eax
 callq getTimeStamp
- movq 40(%rsp), %rax # 8-byte Reload
- vmovsd 264(%rax), %xmm3 # xmm3 = mem[0],zero
- vsubsd 32(%rsp), %xmm0, %xmm2 # 8-byte Folded Reload
- vmulsd .LCPI1_3(%rip), %xmm3, %xmm0
- vmulsd %xmm2, %xmm0, %xmm0
- vmovapd %xmm2, %xmm1
- vmovsd %xmm2, 16(%rsp) # 8-byte Spill
- movl %r14d, %eax
- vxorps %xmm12, %xmm12, %xmm12
- vcvtsi2sd %rax, %xmm12, %xmm2
- vdivsd %xmm2, %xmm0, %xmm2
- movl $.L.str.2, %edi
- movl %r14d, %esi
- vmovapd %xmm3, %xmm0
- movb $3, %al
- callq printf
- vmovsd 16(%rsp), %xmm0 # 8-byte Reload
- # xmm0 = mem[0],zero
- addq $56, %rsp
+ vsubsd 24(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload
+ addq $40, %rsp
 .cfi_def_cfa_offset 56
 popq %rbx
 .cfi_def_cfa_offset 48
@@ -642,10 +612,8 @@ computeForceLJFullNeigh_simd: #
 .LBB2_2: #
 xorl %eax, %eax
 callq getTimeStamp
- movl $.L.str, %edi
- callq likwid_markerStartRegion
 movq stderr(%rip), %rcx
- movl $.L.str.3, %edi
+ movl $.L.str.2, %edi
 movl $65, %esi
 movl $1, %edx
 callq fwrite
@@ -666,11 +634,7 @@ computeForceLJFullNeigh_simd: #
 .size .L.str.1, 18
 .type .L.str.2,@object #
 .L.str.2:
- .asciz "Its: %u Freq: %f Time: %f\nCy/it: %f (half-neigh)\n"
- .size .L.str.2, 52
- .type .L.str.3,@object #
-.L.str.3:
 .asciz "Error: SIMD kernel not implemented for specified instruction set!"
- .size .L.str.3, 66
+ .size .L.str.2, 66
 .ident "Intel(R) oneAPI DPC++ Compiler 2021.1-beta05 (2020.2.0.0304)"
 .section ".note.GNU-stack","",@progbits