.intel_syntax noprefix .text .align 16,0x90 .globl computeForce computeForce: # parameter 1: rdi Parameter* # parameter 2: rsi Atom* # parameter 3: rdx Neighbor* push rbp push r12 push r13 push r14 push r15 push rbx mov r9d, DWORD PTR [4+rsi] # r9d <- atom->Nlocal vmovsd xmm2, QWORD PTR [72+rdi] # xmm2 <- param->cutforce vmovsd xmm1, QWORD PTR [8+rdi] # xmm1 <- param->sigma6 vmovsd xmm0, QWORD PTR [rdi] # xmm0 <- param->epsilon mov r13, QWORD PTR [64+rsi] # r13 <- atom->fx mov r14, QWORD PTR [72+rsi] # r14 <- atom->fy mov rdi, QWORD PTR [80+rsi] # rdi <- atom->fz test r9d, r9d # atom->Nlocal <= 0 jle ..atom_loop_exit xor r10d, r10d # r10d <- 0 mov ecx, r9d # ecx <- atom->Nlocal xor r8d, r8d # r8d <- 0 mov r11d, 1 # r11d <- 1 xor eax, eax # eax <- 0 shr ecx, 1 # ecx <- atom->Nlocal >> 1 je ..zero_last_element # ecx == 0 # Init forces to zero loop (unroll factor = 2) ..init_force_loop: mov QWORD PTR [r8+r13], rax # fx[i] <- 0 mov QWORD PTR [r8+r14], rax # fy[i] <- 0 mov QWORD PTR [r8+rdi], rax # fz[i] <- 0 mov QWORD PTR [8+r8+r13], rax # fx[i] <- 0 mov QWORD PTR [8+r8+r14], rax # fy[i] <- 0 mov QWORD PTR [8+r8+rdi], rax # fz[i] <- 0 add r8, 16 # i++ inc r10 # i++ cmp r10, rcx # i < Nlocal jb ..init_force_loop # Trick to make r11d contain value of last element to be zeroed plus 1 # Maybe we can directly put r10+10 here and zero r11d above, then remove the -1 below lea r11d, DWORD PTR [1+r10+r10] # r11d <- i * 2 + 1 ..zero_last_element: lea ecx, DWORD PTR [-1+r11] # ecx <- i * 2 cmp ecx, r9d # i >= Nlocal jae ..before_atom_loop # Set last element to zero movsxd r11, r11d # r11 <- i * 2 mov QWORD PTR [-8+r13+r11*8], rax # fx[i] <- 0 mov QWORD PTR [-8+r14+r11*8], rax # fy[i] <- 0 mov QWORD PTR [-8+rdi+r11*8], rax # fz[i] <- 0 # Initialize registers to be used within atom loop ..before_atom_loop: vmulsd xmm15, xmm2, xmm2 # xmm15 <- cutforcesq vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip] # ymm18 <- [8, ...] vmulsd xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip] # xmm0 <- 48 * epsilon vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip] # ymm17 <- [0..7] vmovups zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip] # zmm7 <- [0.5, ...] vbroadcastsd zmm16, xmm15 # zmm16 <- [cutforcesq, ...] vbroadcastsd zmm15, xmm1 # zmm15 <- [param->sigma6, ...] vbroadcastsd zmm14, xmm0 # zmm14 <- [48 * epsilon, ...] movsxd r9, r9d # r9 <- atom->Nlocal xor r10d, r10d # r10d <- 0 (i) mov rcx, QWORD PTR [24+rdx] # rcx <- neighbor->numneigh mov r11, QWORD PTR [8+rdx] # r11 <- neighbor->neighbors movsxd r12, DWORD PTR [16+rdx] # r12 <- neighbor->maxneighs mov rdx, QWORD PTR [16+rsi] # rdx <- atom->x ### AOS xor eax, eax ### SOA #mov rax, QWORD PTR [24+rsi] # rax <- atom->y #mov rsi, QWORD PTR [32+rsi] # rsi <- atom->z ### shl r12, 2 # r12 <- neighbor->maxneighs * 4 # Register spilling mov QWORD PTR [-32+rsp], r9 # [-32+rsp] <- atom->Nlocal mov QWORD PTR [-24+rsp], rcx # [-24+rsp] <- neighbor->numneigh mov QWORD PTR [-16+rsp], r14 # [-16+rsp] <- atom->fy mov QWORD PTR [-8+rsp], r13 # [-8+rsp] <- atom->fx mov QWORD PTR [-40+rsp], r15 # [-40+rsp] <- r15 mov QWORD PTR [-48+rsp], rbx # [-48+rsp] <- rbx ..atom_loop_begin: mov rcx, QWORD PTR [-24+rsp] # rcx <- neighbor->numneigh vxorpd xmm25, xmm25, xmm25 # xmm25 <- 0 (fix) vmovapd xmm20, xmm25 # xmm20 <- 0 (fiy) mov r13d, DWORD PTR [rcx+r10*4] # r13d <- neighbor->numneigh[i] (numneighs) vmovapd xmm4, xmm20 # xmm4 <- 0 (fiz) ### AOS vmovsd xmm8, QWORD PTR[rdx+rax] # xmm8 <- atom->x[i * 3] vmovsd xmm9, QWORD PTR[8+rdx+rax] # xmm9 <- atom->x[i * 3 + 1] vmovsd xmm10, QWORD PTR[16+rdx+rax] # xmm10 <- atom->x[i * 3 + 2] ### SOA #vmovsd xmm8, QWORD PTR [rdx+r10*8] # xmm8 <- atom->x[i] #vmovsd xmm9, QWORD PTR [rax+r10*8] # xmm9 <- atom->y[i] #vmovsd xmm10, QWORD PTR [rsi+r10*8] # xmm10 <- atom->z[i] ### vbroadcastsd zmm0, xmm8 # zmm0 <- atom_x(i) vbroadcastsd zmm1, xmm9 # zmm1 <- atom_y(i) vbroadcastsd zmm2, xmm10 # zmm2 <- atom_z(i) test r13d, r13d # numneighs <= 0 jle ..atom_loop_exit vpxord zmm13, zmm13, zmm13 # zmm13 <- 0 (fix) vmovaps zmm12, zmm13 # zmm12 <- 0 (fiy) vmovaps zmm11, zmm12 # zmm11 <- 0 (fiz) mov rcx, r12 # rcx <- neighbor->maxneighs * 4 imul rcx, r10 # rcx <- neighbor->maxneighs * 4 * i add rcx, r11 # rcx <- &neighbor->neighbors[neighbor->maxneighs * i] xor r9d, r9d # r9d <- 0 (k) mov r14d, r13d # r14d <- numneighs cmp r14d, 8 jl ..compute_forces_remainder # and r14d, -8 # r14d <- numneighs & (-8) # lea r9d, DWORD PTR [8+r11] # r9d <- 8 (why lea?) # cmp r14d, r9d # r14d < r9d # jl ..B1.33 # cmp r13d, 8 # numneighs < 8 # jl ..B1.32 #..B1.11: # cmp r13d, 1200 # numneighs < 1200 # jl ..B1.31 #..B1.12: # mov r9, rcx # r9 <- neighs # and r9, 63 # r9 <- neighs & 63 # test r9d, 3 # (r9d & 3) == 0 => r9d divisible by 8 # je ..B1.14 #..B1.13: # xor r9d, r9d # r9d <- 0 # jmp ..B1.16 #..B1.14: # test r9d, r9d # r9d == 0 # je ..B1.16 #..B1.15: # neg r9d # add r9d, 64 # shr r9d, 2 # r9d <- (64 - r9d) / 4 # cmp r13d, r9d # numneighs < r9d # cmovl r9d, r13d # r9d <- MIN(numneighs, r9d) #..B1.16: # mov ebx, r13d # sub ebx, r9d # and ebx, 7 # neg ebx # add ebx, r13d # ebx <- -((numneighs - r9d) & 7) + numneighs # cmp r9d, 1 # r9d < 1 # jb ..B1.20 #..B1.20: # lea ecx, DWORD PTR [8+r9] # ecx <- r9d[1] # cmp ebx, ecx # -((numneighs - r9d) & 7) + numneighs < neighs # jl ..B1.24 ..compute_forces: vpcmpeqb k1, xmm0, xmm0 vpcmpeqb k2, xmm0, xmm0 vpcmpeqb k3, xmm0, xmm0 vmovdqu ymm3, YMMWORD PTR [rcx+r9*4] vpxord zmm5, zmm5, zmm5 vpxord zmm6, zmm6, zmm6 ### AOS vpaddd ymm4, ymm3, ymm3 vpaddd ymm3, ymm3, ymm4 vpxord zmm4, zmm4, zmm4 vgatherdpd zmm4{k1}, QWORD PTR [rdx+ymm3*8] vgatherdpd zmm5{k2}, QWORD PTR [8+rdx+ymm3*8] vgatherdpd zmm6{k3}, QWORD PTR [16+rdx+ymm3*8] ### SOA #vpxord zmm4, zmm4, zmm4 #vgatherdpd zmm5{k2}, QWORD PTR [rax+ymm3*8] #vgatherdpd zmm4{k1}, QWORD PTR [rdx+ymm3*8] #vgatherdpd zmm6{k3}, QWORD PTR [rsi+ymm3*8] ### vsubpd zmm29, zmm1, zmm5 # zmm29 <- atom_y(i) - atom_y(j) -- dely vsubpd zmm28, zmm0, zmm4 # zmm28 <- atom_x(i) - atom_x(j) -- delx vsubpd zmm31, zmm2, zmm6 # zmm31 <- atom_z(i) - atom_z(j) -- delz vmulpd zmm20, zmm29, zmm29 # zmm20 <- dely * dely vfmadd231pd zmm20, zmm28, zmm28 # zmm20 <- dely * dely + delx * delx vfmadd231pd zmm20, zmm31, zmm31 # zmm20 <- zmm20 + delz * delz -- rsq # Cutoff radius condition vrcp14pd zmm27, zmm20 # zmm27 <- 1.0 / rsq (sr2) vcmppd k5, zmm20, zmm16, 1 # k5 <- rsq < cutforcesq vmulpd zmm22, zmm27, zmm15 # zmm22 <- sr2 * sigma6 vmulpd zmm24, zmm27, zmm14 # zmm24 <- 48.0 * epsilon * sr2 vmulpd zmm25, zmm27, zmm22 # zmm25 <- sr2 * sigma6 * sr2 vmulpd zmm23, zmm27, zmm25 # zmm23 <- sr2 * sigma6 * sr2 * sr2 vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma * sr2 * sr2 - 0.5 vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2 vmulpd zmm30, zmm26, zmm27 # zmm30 <- force vfmadd231pd zmm13{k5}, zmm30, zmm28 # fix += force * delx vfmadd231pd zmm12{k5}, zmm30, zmm29 # fiy += force * dely vfmadd231pd zmm11{k5}, zmm30, zmm31 # fiz += force * delz sub r14, 8 add r9, 8 cmp r14, 8 jge ..compute_forces # Check if there are remaining neighbors to be computed ..compute_forces_remainder: cmp r14, 1 jl ..sum_up_forces vpbroadcastd ymm0, r14d vpcmpgtd k1, ymm0, ymm17 kmovw r15d, k1 vmovdqu ymm3{k3}{z}, YMMWORD PTR [rcx+r9*4] kmov k2, k1 kmov k3, k1 vpxord zmm5, zmm5, zmm5 vpxord zmm6, zmm6, zmm6 ### AOS vpaddd ymm4, ymm3, ymm3 vpaddd ymm3, ymm3, ymm4 vpxord zmm4, zmm4, zmm4 vgatherdpd zmm4{k1}, QWORD PTR [rdx+ymm3*8] vgatherdpd zmm5{k2}, QWORD PTR [8+rdx+ymm3*8] vgatherdpd zmm6{k3}, QWORD PTR [16+rdx+ymm3*8] ### SOA #vpxord zmm4, zmm4, zmm4 #vgatherdpd zmm5{k2}, QWORD PTR [rax+ymm3*8] #vgatherdpd zmm4{k1}, QWORD PTR [rdx+ymm3*8] #vgatherdpd zmm6{k3}, QWORD PTR [rsi+ymm3*8] ### vsubpd zmm29, zmm1, zmm5 # zmm29 <- atom_y(i) - atom_y(j) -- dely vsubpd zmm28, zmm0, zmm4 # zmm28 <- atom_x(i) - atom_x(j) -- delx vsubpd zmm31, zmm2, zmm6 # zmm31 <- atom_z(i) - atom_z(j) -- delz vmulpd zmm20, zmm29, zmm29 # zmm20 <- dely * dely vfmadd231pd zmm20, zmm28, zmm28 # zmm20 <- dely * dely + delx * delx vfmadd231pd zmm20, zmm31, zmm31 # zmm20 <- zmm20 + delz * delz -- rsq # Cutoff radius condition vrcp14pd zmm27, zmm20 # zmm27 <- 1.0 / rsq (sr2) vcmppd k5, zmm20, zmm16, 1 # k5 <- rsq < cutforcesq kmovw r9d, k5 # r9d <- rsq < cutforcesq and r15d, r9d # r15d <- rsq < cutforcesq && k < numneighs kmovw k3, r15d # k3 <- rsq < cutforcesq && k < numneighs vmulpd zmm22, zmm27, zmm15 # zmm22 <- sr2 * sigma6 vmulpd zmm24, zmm27, zmm14 # zmm24 <- 48.0 * epsilon * sr2 vmulpd zmm25, zmm27, zmm22 # zmm25 <- sr2 * sigma6 * sr2 vmulpd zmm23, zmm27, zmm25 # zmm23 <- sr2 * sigma6 * sr2 * sr2 vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma * sr2 * sr2 - 0.5 vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2 vmulpd zmm30, zmm26, zmm27 # zmm30 <- force vfmadd231pd zmm13{k5}, zmm30, zmm28 # fix += force * delx vfmadd231pd zmm12{k5}, zmm30, zmm29 # fiy += force * dely vfmadd231pd zmm11{k5}, zmm30, zmm31 # fiz += force * delz # Forces are currently separated in different lanes of zmm registers, hence it is necessary to permutate # and add them (reduction) to obtain the final contribution for the current atom ..sum_up_forces: vmovups zmm10, ZMMWORD PTR .L_2il0floatpacket.6[rip] vpermd zmm0, zmm10, zmm11 vpermd zmm5, zmm10, zmm12 vpermd zmm21, zmm10, zmm13 vaddpd zmm11, zmm0, zmm11 vaddpd zmm12, zmm5, zmm12 vaddpd zmm13, zmm21, zmm13 vpermpd zmm1, zmm11, 78 vpermpd zmm6, zmm12, 78 vpermpd zmm22, zmm13, 78 vaddpd zmm2, zmm11, zmm1 vaddpd zmm8, zmm12, zmm6 vaddpd zmm23, zmm13, zmm22 vpermpd zmm3, zmm2, 177 vpermpd zmm9, zmm8, 177 vpermpd zmm24, zmm23, 177 vaddpd zmm4, zmm2, zmm3 vaddpd zmm20, zmm8, zmm9 vaddpd zmm25, zmm23, zmm24 ..atom_loop_exit: mov rcx, QWORD PTR [-8+rsp] #84.9[spill] mov rbx, QWORD PTR [-16+rsp] #85.9[spill] ### AOS add rax, 24 ### vaddsd xmm0, xmm25, QWORD PTR [rcx+r10*8] #84.9 vmovsd QWORD PTR [rcx+r10*8], xmm0 #84.9 vaddsd xmm1, xmm20, QWORD PTR [rbx+r10*8] #85.9 vmovsd QWORD PTR [rbx+r10*8], xmm1 #85.9 vaddsd xmm2, xmm4, QWORD PTR [rdi+r10*8] #86.9 vmovsd QWORD PTR [rdi+r10*8], xmm2 #86.9 inc r10 #55.5 cmp r10, QWORD PTR [-32+rsp] #55.5[spill] jb ..atom_loop_begin vzeroupper #93.12 vxorpd xmm0, xmm0, xmm0 #93.12 pop rbx pop r15 pop r14 #93.12 pop r13 #93.12 pop r12 #93.12 pop rbp #93.12 ret #93.12 .type computeForce,@function .size computeForce,.-computeForce ..LNcomputeForce.0: .data # -- End computeForce .section .rodata, "a" .align 64 .align 64 .L_2il0floatpacket.2: .long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000 .type .L_2il0floatpacket.2,@object .size .L_2il0floatpacket.2,64 .align 64 .L_2il0floatpacket.4: .long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000 .type .L_2il0floatpacket.4,@object .size .L_2il0floatpacket.4,64 .align 64 .L_2il0floatpacket.6: .long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f .type .L_2il0floatpacket.6,@object .size .L_2il0floatpacket.6,64 .align 32 .L_2il0floatpacket.0: .long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008 .type .L_2il0floatpacket.0,@object .size .L_2il0floatpacket.0,32 .align 32 .L_2il0floatpacket.1: .long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007 .type .L_2il0floatpacket.1,@object .size .L_2il0floatpacket.1,32 .align 8 .L_2il0floatpacket.3: .long 0x00000000,0x40480000 .type .L_2il0floatpacket.3,@object .size .L_2il0floatpacket.3,8 .align 8 .L_2il0floatpacket.5: .long 0x00000000,0x3ff00000 .type .L_2il0floatpacket.5,@object .size .L_2il0floatpacket.5,8 .data .section .note.GNU-stack, "" # End