From 9b615cf0b3e3727646d443073be04a87ed8d4447 Mon Sep 17 00:00:00 2001 From: Rafael Ravedutti Date: Mon, 8 Nov 2021 14:05:29 +0100 Subject: [PATCH] Update force kernel to be integrated Signed-off-by: Rafael Ravedutti --- asm/unused/force.s | 366 +++++++++++++++++++-------------------------- 1 file changed, 151 insertions(+), 215 deletions(-) diff --git a/asm/unused/force.s b/asm/unused/force.s index 11cf275..1bd5518 100644 --- a/asm/unused/force.s +++ b/asm/unused/force.s @@ -7,227 +7,156 @@ computeForce: # parameter 1: rdi Parameter* # parameter 2: rsi Atom* # parameter 3: rdx Neighbor* -push r12 -push r13 -push r14 - -# r9d <- atom->Nlocal - mov r9d, DWORD PTR [4+rsi] -# xmm2 <- param->cutforce - vmovsd xmm2, QWORD PTR [72+rdi] -# xmm1 <- param->sigma6 - vmovsd xmm1, QWORD PTR [8+rdi] -# xmm0 <- param->epsilon - vmovsd xmm0, QWORD PTR [rdi] - -# r13 <- atom->fx - mov r13, QWORD PTR [64+rsi] -# r14 <- atom->fy - mov r14, QWORD PTR [72+rsi] -# rdi <- atom->fz - mov rdi, QWORD PTR [80+rsi] - -# atom->Nlocal <= 0 - test r9d, r9d - jle ..B1.30 + push r12 + push r13 + push r14 + mov r9d, DWORD PTR [4+rsi] # r9d <- atom->Nlocal + vmovsd xmm2, QWORD PTR [72+rdi] # xmm2 <- param->cutforce + vmovsd xmm1, QWORD PTR [8+rdi] # xmm1 <- param->sigma6 + vmovsd xmm0, QWORD PTR [rdi] # xmm0 <- param->epsilon + mov r13, QWORD PTR [64+rsi] # r13 <- atom->fx + mov r14, QWORD PTR [72+rsi] # r14 <- atom->fy + mov rdi, QWORD PTR [80+rsi] # rdi <- atom->fz + test r9d, r9d # atom->Nlocal <= 0 + jle ..exit_func ..B1.2: -# r10d <- 0 - xor r10d, r10d -# ecx <- atom->Nlocal - mov ecx, r9d -# r8d <- 0 - xor r8d, r8d -# r11d <- 1 - mov r11d, 1 -# eax <- 0 - xor eax, eax -# ecx <- atom->Nlocal >> 1 - shr ecx, 1 - je ..B1.6 + xor r10d, r10d # r10d <- 0 + mov ecx, r9d # ecx <- atom->Nlocal + xor r8d, r8d # r8d <- 0 + mov r11d, 1 # r11d <- 1 + xor eax, eax # eax <- 0 + shr ecx, 1 # ecx <- atom->Nlocal >> 1 + je ..B1.6 # ecx == 0 # Init forces to zero loop ..B1.4: -# fx[i] <- 0 - mov QWORD PTR [r8+r13], rax -# i++ - inc r10 -# fy[i] <- 0 - mov QWORD PTR [r8+r14], rax -# fz[i] <- 0 - mov QWORD PTR [r8+rdi], rax -# fx[i] <- 0 - mov QWORD PTR [8+r8+r13], rax -# fy[i] <- 0 - mov QWORD PTR [8+r8+r14], rax -# fz[i] <- 0 - mov QWORD PTR [8+r8+rdi], rax -# i++ - add r8, 16 -# i < Nlocal - cmp r10, rcx + mov QWORD PTR [r8+r13], rax # fx[i] <- 0 + mov QWORD PTR [r8+r14], rax # fy[i] <- 0 + mov QWORD PTR [r8+rdi], rax # fz[i] <- 0 + mov QWORD PTR [8+r8+r13], rax # fx[i] <- 0 + mov QWORD PTR [8+r8+r14], rax # fy[i] <- 0 + mov QWORD PTR [8+r8+rdi], rax # fz[i] <- 0 + add r8, 16 # i++ + inc r10 # i++ + cmp r10, rcx # i < Nlocal jb ..B1.4 ..B1.5: -# r11d <- i * 2 + 1 - lea r11d, DWORD PTR [1+r10+r10] + lea r11d, DWORD PTR [1+r10+r10] # r11d <- i * 2 + 1 ..B1.6: -# r11d <- i * 2 - lea ecx, DWORD PTR [-1+r11] -# i < Nlocal - cmp ecx, r9d + lea ecx, DWORD PTR [-1+r11] # r11d <- i * 2 + cmp ecx, r9d # i < Nlocal jae ..B1.8 ..B1.7: - -# r11 <- i * 2 - movsxd r11, r11d -# fx[i] <- 0 - mov QWORD PTR [-8+r13+r11*8], rax -# fy[i] <- 0 - mov QWORD PTR [-8+r14+r11*8], rax -# fz[i] <- 0 - mov QWORD PTR [-8+rdi+r11*8], rax + movsxd r11, r11d # r11 <- i * 2 + mov QWORD PTR [-8+r13+r11*8], rax # fx[i] <- 0 + mov QWORD PTR [-8+r14+r11*8], rax # fy[i] <- 0 + mov QWORD PTR [-8+rdi+r11*8], rax # fz[i] <- 0 ..B1.8: + vmulsd xmm15, xmm2, xmm2 # xmm15 <- cutforcesq + xor r8d, r8d # r8d <- 0 + vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip] # ymm18 <- [8, ...] + vmulsd xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip] # xmm0 <- 48 * epsilon + vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip] # ymm17 <- [0..7] + vmovups zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip] # zmm7 <- [0.5, ...] + vbroadcastsd zmm16, xmm15 # zmm16 <- [cutforcesq, ...] + vbroadcastsd zmm15, xmm1 # zmm15 <- [param->sigma6, ...] + vbroadcastsd zmm14, xmm0 # zmm16 <- [48 * epsilon, ...] + movsxd r9, r9d # r9 <- atom->Nlocal + xor r10d, r10d # r10d <- 0 (i) + mov rcx, QWORD PTR [24+rdx] # rcx <- neighbor->numneigh + mov r11, QWORD PTR [8+rdx] # r11 <- neighbor->neighbors + movsxd r12, DWORD PTR [16+rdx] # r12 <- neighbor->maxneighs + mov rdx, QWORD PTR [16+rsi] # rdx <- atom->x -# xmm15 <- cutforcesq - vmulsd xmm15, xmm2, xmm2 -# r8d <- 0 - xor r8d, r8d -# ymm18 <- 8 - vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip] -# xmm0 <- 48 * epsilon - vmulsd xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip] -# ymm17 <- [0..7] - vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip] -# zmm7 <- 0.5 - vmovups zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip] -# zmm16 <- cutforcesq - vbroadcastsd zmm16, xmm15 -# zmm15 <- param->sigma6 - vbroadcastsd zmm15, xmm1 -# zmm16 <- 48 * epsilon - vbroadcastsd zmm14, xmm0 -# r9 <- atom->Nlocal - movsxd r9, r9d -# r10d <- 0 (i) - xor r10d, r10d -# rcx <- neighbor->numneigh - mov rcx, QWORD PTR [24+rdx] -# r11 <- neighbor->neighbors - mov r11, QWORD PTR [8+rdx] -# r12 <- neighbor->maxneighs - movsxd r12, DWORD PTR [16+rdx] + ### AOS + xor eax, eax + ### SOA + #mov rax, QWORD PTR [24+rsi] # rax <- atom->y + #mov rsi, QWORD PTR [32+rsi] # rsi <- atom->z + ### -# rdx <- atom->x - mov rdx, QWORD PTR [16+rsi] -# rax <- atom->y - mov rax, QWORD PTR [24+rsi] -# rsi <- atom->z - mov rsi, QWORD PTR [32+rsi] - -# r12 <- neighbor->maxneighs * 4 - shl r12, 2 -# [-32+rsp] <- atom->Nlocal - mov QWORD PTR [-32+rsp], r9 -# [-24+rsp] <- neighbor->numneigh - mov QWORD PTR [-24+rsp], rcx -# [-16+rsp] <- atom->fy - mov QWORD PTR [-16+rsp], r14 -# [-8+rsp] <- atom->fx - mov QWORD PTR [-8+rsp], r13 -# [-40+rsp] <- r15 - mov QWORD PTR [-40+rsp], r15 -# [-48+rsp] <- rbx - mov QWORD PTR [-48+rsp], rbx -# zmm19 <- 0 - vpxord zmm19, zmm19, zmm19 + shl r12, 2 # r12 <- neighbor->maxneighs * 4 + mov QWORD PTR [-32+rsp], r9 # [-32+rsp] <- atom->Nlocal + mov QWORD PTR [-24+rsp], rcx # [-24+rsp] <- neighbor->numneigh + mov QWORD PTR [-16+rsp], r14 # [-16+rsp] <- atom->fy + mov QWORD PTR [-8+rsp], r13 # [-8+rsp] <- atom->fx + mov QWORD PTR [-40+rsp], r15 # [-40+rsp] <- r15 + mov QWORD PTR [-48+rsp], rbx # [-48+rsp] <- rbx # Loop over all atoms ..B1.9: + mov rcx, QWORD PTR [-24+rsp] # rcx <- neighbor->numneigh + vxorpd xmm25, xmm25, xmm25 # xmm25 <- 0 + vmovapd xmm20, xmm25 # xmm20 <- 0 + mov r13d, DWORD PTR [rcx+r10*4] # r13d <- neighbor->numneigh[i] (numneighs) + vmovapd xmm4, xmm20 # xmm4 <- 0 -# rcx <- neighbor->numneigh - mov rcx, QWORD PTR [-24+rsp] -# xmm25 <- 0 - vxorpd xmm25, xmm25, xmm25 -# xmm20 <- 0 - vmovapd xmm20, xmm25 -# r13d <- neighbor->numneigh[i] (numneighs) - mov r13d, DWORD PTR [rcx+r10*4] -# xmm4 <- 0 - vmovapd xmm4, xmm20 -# xmm8 <- atom->x[i] - vmovsd xmm8, QWORD PTR [rdx+r10*8] -# xmm9 <- atom->y[i] - vmovsd xmm9, QWORD PTR [rax+r10*8] -# xmm9 <- atom->z[i] - vmovsd xmm10, QWORD PTR [rsi+r10*8] -# numneighs <= 0 - test r13d, r13d - jle ..B1.27 + ### AOS + vmovsd xmm8, QWORD PTR[rdx+rax] # xmm8 <- atom->x[i * 3] + vmovsd xmm9, QWORD PTR[8+rdx+rax] # xmm9 <- atom->x[i * 3 + 1] + vmovsd xmm10, QWORD PTR[16+rdx+rax] # xmm10 <- atom->x[i * 3 + 2] + ### SOA + #vmovsd xmm8, QWORD PTR [rdx+r10*8] # xmm8 <- atom->x[i] + #vmovsd xmm9, QWORD PTR [rax+r10*8] # xmm9 <- atom->y[i] + #vmovsd xmm10, QWORD PTR [rsi+r10*8] # xmm10 <- atom->z[i] + ### + + test r13d, r13d # numneighs <= 0 + jle ..exit_func ..B1.10: + vpxord zmm13, zmm13, zmm13 # zmm13 <- 0 (fix) + vmovaps zmm12, zmm13 # zmm12 <- 0 (fiy) + vmovaps zmm11, zmm12 # zmm11 <- 0 (fiz) -# zmm13 <- 0 - vmovaps zmm13, zmm19 -# zmm12 <- 0 - vmovaps zmm12, zmm13 -# zmm11 <- 0 - vmovaps zmm11, zmm12 + mov r14d, r13d # r14d <- numneighs + xor r11d, r11d # r11d <- 0 + and r14d, -8 # r14d <- numneighs & (-8) + lea r9d, DWORD PTR [8+r11] # r9d <- 8 (why lea?) + cmp r14d, r9d # r14d < r9d + jl ..B1.33 -# numneighs < 8 - cmp r13d, 8 - jl ..B1.32 - -..B1.11: - -# numneighs < 1200 - cmp r13d, 1200 - jl ..B1.31 - -..B1.12: - - mov rcx, r12 - imul rcx, r8 - add rcx, r11 - mov r9, rcx - and r9, 63 - test r9d, 3 - je ..B1.14 - -..B1.13: - - xor r9d, r9d - jmp ..B1.16 - -..B1.14: - - test r9d, r9d - je ..B1.16 - -..B1.15: - - neg r9d - add r9d, 64 - shr r9d, 2 - cmp r13d, r9d - cmovl r9d, r13d - -..B1.16: - - mov ebx, r13d - sub ebx, r9d - and ebx, 7 - neg ebx - add ebx, r13d - cmp r9d, 1 - jb ..B1.20 - -..B1.20: - lea ecx, DWORD PTR [8+r9] - cmp ebx, ecx - jl ..B1.24 +# cmp r13d, 8 # numneighs < 8 +# jl ..B1.32 +#..B1.11: +# cmp r13d, 1200 # numneighs < 1200 +# jl ..B1.31 +#..B1.12: +# mov rcx, r12 +# imul rcx, r8 +# add rcx, r11 # rcx <- &neighbor->neighbors[neighbor->maxneighs * i (r8)] +# mov r9, rcx # r9 <- neighs +# and r9, 63 # r9 <- neighs & 63 +# test r9d, 3 # (r9d & 3) == 0 => r9d divisible by 8 +# je ..B1.14 +#..B1.13: +# xor r9d, r9d # r9d <- 0 +# jmp ..B1.16 +#..B1.14: +# test r9d, r9d # r9d == 0 +# je ..B1.16 +#..B1.15: +# neg r9d +# add r9d, 64 +# shr r9d, 2 # r9d <- (64 - r9d) / 4 +# cmp r13d, r9d # numneighs < r9d +# cmovl r9d, r13d # r9d <- MIN(numneighs, r9d) +#..B1.16: +# mov ebx, r13d +# sub ebx, r9d +# and ebx, 7 +# neg ebx +# add ebx, r13d # ebx <- -((numneighs - r9d) & 7) + numneighs +# cmp r9d, 1 # r9d < 1 +# jb ..B1.20 +#..B1.20: +# lea ecx, DWORD PTR [8+r9] # ecx <- r9d[1] +# cmp ebx, ecx # -((numneighs - r9d) & 7) + numneighs < neighs +# jl ..B1.24 ..B1.21: mov rcx, r12 @@ -246,17 +175,25 @@ push r14 vmovdqu ymm3, YMMWORD PTR [rcx+r14*4] add r14, 8 vpxord zmm5, zmm5, zmm5 - vpxord zmm4, zmm4, zmm4 vpxord zmm6, zmm6, zmm6 - vgatherdpd zmm5{k2}, QWORD PTR [rax+ymm3*8] + ### AOS + vpaddd ymm4, ymm3, ymm3 + vpaddd ymm3, ymm3, ymm4 + vpxord zmm4, zmm4, zmm4 vgatherdpd zmm4{k1}, QWORD PTR [rdx+ymm3*8] - vgatherdpd zmm6{k3}, QWORD PTR [rsi+ymm3*8] + vgatherdpd zmm5{k2}, QWORD PTR [8+rdx+ymm3*8] + vgatherdpd zmm6{k3}, QWORD PTR [16+rdx+ymm3*8] + ### SOA + #vpxord zmm4, zmm4, zmm4 + #vgatherdpd zmm5{k2}, QWORD PTR [rax+ymm3*8] + #vgatherdpd zmm4{k1}, QWORD PTR [rdx+ymm3*8] + #vgatherdpd zmm6{k3}, QWORD PTR [rsi+ymm3*8] + ### vsubpd zmm29, zmm1, zmm5 vsubpd zmm28, zmm0, zmm4 vsubpd zmm31, zmm2, zmm6 - vmulpd zmm20, zmm29, zmm29 vfmadd231pd zmm20, zmm28, zmm28 vfmadd231pd zmm20, zmm31, zmm31 @@ -264,17 +201,13 @@ push r14 # if condition cutoff radius vrcp14pd zmm27, zmm20 #-> sr2 vcmppd k5, zmm20, zmm16, 1 - - vmulpd zmm22, zmm27, zmm15 #-> sr2 * sigma6 - vmulpd zmm24, zmm27, zmm14 #-> 48.0 * epsilon * sr2 - - vmulpd zmm25, zmm27, zmm22 #-> sr2 * sigma6 * sr2 - vmulpd zmm23, zmm27, zmm25 #-> sr2 * sigma6 * sr2 * sr2 - - vfmsub213pd zmm27, zmm25, zmm7 #-> sr2 * sigma * sr2 * sr2 - 0.5 - vmulpd zmm26, zmm23, zmm24 #-> 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2 - vmulpd zmm30, zmm26, zmm27 #-> - + vmulpd zmm22, zmm27, zmm15 # zmm22 <- sr2 * sigma6 + vmulpd zmm24, zmm27, zmm14 # zmm24 <- 48.0 * epsilon * sr2 + vmulpd zmm25, zmm27, zmm22 # zmm25 <- sr2 * sigma6 * sr2 + vmulpd zmm23, zmm27, zmm25 # zmm23 <- sr2 * sigma6 * sr2 * sr2 + vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma * sr2 * sr2 - 0.5 + vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2 + vmulpd zmm30, zmm26, zmm27 # zmm30 <- force vfmadd231pd zmm13{k5}, zmm30, zmm28 vfmadd231pd zmm12{k5}, zmm30, zmm29 vfmadd231pd zmm11{k5}, zmm30, zmm31 @@ -304,9 +237,14 @@ push r14 vaddpd zmm25, zmm23, zmm24 #exit function -..B1.27: +..exit_func: mov rcx, QWORD PTR [-8+rsp] #84.9[spill] mov rbx, QWORD PTR [-16+rsp] #85.9[spill] + + ### AOS + add rax, 24 + ### + movsxd r8, r10d #55.32 inc r8 #55.32 vaddsd xmm0, xmm25, QWORD PTR [rcx+r10*8] #84.9 @@ -318,8 +256,6 @@ push r14 inc r10 #55.5 cmp r10, QWORD PTR [-32+rsp] #55.5[spill] jb ..B1.9 - - vzeroupper #93.12 vxorpd xmm0, xmm0, xmm0 #93.12 pop r14 #93.12