Update force kernel to be integrated

Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
This commit is contained in:
Rafael Ravedutti 2021-11-08 14:05:29 +01:00
parent ec556eb117
commit 9b615cf0b3

View File

@ -7,227 +7,156 @@ computeForce:
# parameter 1: rdi Parameter* # parameter 1: rdi Parameter*
# parameter 2: rsi Atom* # parameter 2: rsi Atom*
# parameter 3: rdx Neighbor* # parameter 3: rdx Neighbor*
push r12 push r12
push r13 push r13
push r14 push r14
mov r9d, DWORD PTR [4+rsi] # r9d <- atom->Nlocal
# r9d <- atom->Nlocal vmovsd xmm2, QWORD PTR [72+rdi] # xmm2 <- param->cutforce
mov r9d, DWORD PTR [4+rsi] vmovsd xmm1, QWORD PTR [8+rdi] # xmm1 <- param->sigma6
# xmm2 <- param->cutforce vmovsd xmm0, QWORD PTR [rdi] # xmm0 <- param->epsilon
vmovsd xmm2, QWORD PTR [72+rdi] mov r13, QWORD PTR [64+rsi] # r13 <- atom->fx
# xmm1 <- param->sigma6 mov r14, QWORD PTR [72+rsi] # r14 <- atom->fy
vmovsd xmm1, QWORD PTR [8+rdi] mov rdi, QWORD PTR [80+rsi] # rdi <- atom->fz
# xmm0 <- param->epsilon test r9d, r9d # atom->Nlocal <= 0
vmovsd xmm0, QWORD PTR [rdi] jle ..exit_func
# r13 <- atom->fx
mov r13, QWORD PTR [64+rsi]
# r14 <- atom->fy
mov r14, QWORD PTR [72+rsi]
# rdi <- atom->fz
mov rdi, QWORD PTR [80+rsi]
# atom->Nlocal <= 0
test r9d, r9d
jle ..B1.30
..B1.2: ..B1.2:
# r10d <- 0 xor r10d, r10d # r10d <- 0
xor r10d, r10d mov ecx, r9d # ecx <- atom->Nlocal
# ecx <- atom->Nlocal xor r8d, r8d # r8d <- 0
mov ecx, r9d mov r11d, 1 # r11d <- 1
# r8d <- 0 xor eax, eax # eax <- 0
xor r8d, r8d shr ecx, 1 # ecx <- atom->Nlocal >> 1
# r11d <- 1 je ..B1.6 # ecx == 0
mov r11d, 1
# eax <- 0
xor eax, eax
# ecx <- atom->Nlocal >> 1
shr ecx, 1
je ..B1.6
# Init forces to zero loop # Init forces to zero loop
..B1.4: ..B1.4:
# fx[i] <- 0 mov QWORD PTR [r8+r13], rax # fx[2*i] <- 0
mov QWORD PTR [r8+r13], rax mov QWORD PTR [r8+r14], rax # fy[2*i] <- 0
# i++ mov QWORD PTR [r8+rdi], rax # fz[2*i] <- 0
inc r10 mov QWORD PTR [8+r8+r13], rax # fx[2*i + 1] <- 0
# fy[i] <- 0 mov QWORD PTR [8+r8+r14], rax # fy[2*i + 1] <- 0
mov QWORD PTR [r8+r14], rax mov QWORD PTR [8+r8+rdi], rax # fz[2*i + 1] <- 0
# fz[i] <- 0 add r8, 16 # r8 (byte offset) += 16, i.e. two 8-byte elements
mov QWORD PTR [r8+rdi], rax inc r10 # i++ (i counts pairs of elements)
# fx[i] <- 0 cmp r10, rcx # i < (Nlocal >> 1)
mov QWORD PTR [8+r8+r13], rax
# fy[i] <- 0
mov QWORD PTR [8+r8+r14], rax
# fz[i] <- 0
mov QWORD PTR [8+r8+rdi], rax
# i++
add r8, 16
# i < Nlocal
cmp r10, rcx
jb ..B1.4 jb ..B1.4
..B1.5: ..B1.5:
# r11d <- i * 2 + 1 lea r11d, DWORD PTR [1+r10+r10] # r11d <- i * 2 + 1
lea r11d, DWORD PTR [1+r10+r10]
..B1.6: ..B1.6:
# r11d <- i * 2 lea ecx, DWORD PTR [-1+r11] # ecx <- r11d - 1 (= i * 2)
lea ecx, DWORD PTR [-1+r11] cmp ecx, r9d # i < Nlocal
# i < Nlocal
cmp ecx, r9d
jae ..B1.8 jae ..B1.8
..B1.7: ..B1.7:
movsxd r11, r11d # r11 <- i * 2 + 1 (sign-extended)
# r11 <- i * 2 mov QWORD PTR [-8+r13+r11*8], rax # fx[r11 - 1] <- 0 (leftover element when Nlocal is odd)
movsxd r11, r11d mov QWORD PTR [-8+r14+r11*8], rax # fy[r11 - 1] <- 0
# fx[i] <- 0 mov QWORD PTR [-8+rdi+r11*8], rax # fz[r11 - 1] <- 0
mov QWORD PTR [-8+r13+r11*8], rax
# fy[i] <- 0
mov QWORD PTR [-8+r14+r11*8], rax
# fz[i] <- 0
mov QWORD PTR [-8+rdi+r11*8], rax
..B1.8: ..B1.8:
vmulsd xmm15, xmm2, xmm2 # xmm15 <- cutforcesq
xor r8d, r8d # r8d <- 0
vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip] # ymm18 <- [8, ...]
vmulsd xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip] # xmm0 <- 48 * epsilon
vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip] # ymm17 <- [0..7]
vmovups zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip] # zmm7 <- [0.5, ...]
vbroadcastsd zmm16, xmm15 # zmm16 <- [cutforcesq, ...]
vbroadcastsd zmm15, xmm1 # zmm15 <- [param->sigma6, ...]
vbroadcastsd zmm14, xmm0 # zmm14 <- [48 * epsilon, ...]
movsxd r9, r9d # r9 <- atom->Nlocal
xor r10d, r10d # r10d <- 0 (i)
mov rcx, QWORD PTR [24+rdx] # rcx <- neighbor->numneigh
mov r11, QWORD PTR [8+rdx] # r11 <- neighbor->neighbors
movsxd r12, DWORD PTR [16+rdx] # r12 <- neighbor->maxneighs
mov rdx, QWORD PTR [16+rsi] # rdx <- atom->x
# xmm15 <- cutforcesq ### AOS
vmulsd xmm15, xmm2, xmm2 xor eax, eax
# r8d <- 0 ### SOA
xor r8d, r8d #mov rax, QWORD PTR [24+rsi] # rax <- atom->y
# ymm18 <- 8 #mov rsi, QWORD PTR [32+rsi] # rsi <- atom->z
vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip] ###
# xmm0 <- 48 * epsilon
vmulsd xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip]
# ymm17 <- [0..7]
vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip]
# zmm7 <- 0.5
vmovups zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip]
# zmm16 <- cutforcesq
vbroadcastsd zmm16, xmm15
# zmm15 <- param->sigma6
vbroadcastsd zmm15, xmm1
# zmm16 <- 48 * epsilon
vbroadcastsd zmm14, xmm0
# r9 <- atom->Nlocal
movsxd r9, r9d
# r10d <- 0 (i)
xor r10d, r10d
# rcx <- neighbor->numneigh
mov rcx, QWORD PTR [24+rdx]
# r11 <- neighbor->neighbors
mov r11, QWORD PTR [8+rdx]
# r12 <- neighbor->maxneighs
movsxd r12, DWORD PTR [16+rdx]
# rdx <- atom->x shl r12, 2 # r12 <- neighbor->maxneighs * 4
mov rdx, QWORD PTR [16+rsi] mov QWORD PTR [-32+rsp], r9 # [-32+rsp] <- atom->Nlocal
# rax <- atom->y mov QWORD PTR [-24+rsp], rcx # [-24+rsp] <- neighbor->numneigh
mov rax, QWORD PTR [24+rsi] mov QWORD PTR [-16+rsp], r14 # [-16+rsp] <- atom->fy
# rsi <- atom->z mov QWORD PTR [-8+rsp], r13 # [-8+rsp] <- atom->fx
mov rsi, QWORD PTR [32+rsi] mov QWORD PTR [-40+rsp], r15 # [-40+rsp] <- r15
mov QWORD PTR [-48+rsp], rbx # [-48+rsp] <- rbx
# r12 <- neighbor->maxneighs * 4
shl r12, 2
# [-32+rsp] <- atom->Nlocal
mov QWORD PTR [-32+rsp], r9
# [-24+rsp] <- neighbor->numneigh
mov QWORD PTR [-24+rsp], rcx
# [-16+rsp] <- atom->fy
mov QWORD PTR [-16+rsp], r14
# [-8+rsp] <- atom->fx
mov QWORD PTR [-8+rsp], r13
# [-40+rsp] <- r15
mov QWORD PTR [-40+rsp], r15
# [-48+rsp] <- rbx
mov QWORD PTR [-48+rsp], rbx
# zmm19 <- 0
vpxord zmm19, zmm19, zmm19
# Loop over all atoms # Loop over all atoms
..B1.9: ..B1.9:
mov rcx, QWORD PTR [-24+rsp] # rcx <- neighbor->numneigh
vxorpd xmm25, xmm25, xmm25 # xmm25 <- 0
vmovapd xmm20, xmm25 # xmm20 <- 0
mov r13d, DWORD PTR [rcx+r10*4] # r13d <- neighbor->numneigh[i] (numneighs)
vmovapd xmm4, xmm20 # xmm4 <- 0
# rcx <- neighbor->numneigh ### AOS
mov rcx, QWORD PTR [-24+rsp] vmovsd xmm8, QWORD PTR[rdx+rax] # xmm8 <- atom->x[i * 3]
# xmm25 <- 0 vmovsd xmm9, QWORD PTR[8+rdx+rax] # xmm9 <- atom->x[i * 3 + 1]
vxorpd xmm25, xmm25, xmm25 vmovsd xmm10, QWORD PTR[16+rdx+rax] # xmm10 <- atom->x[i * 3 + 2]
# xmm20 <- 0 ### SOA
vmovapd xmm20, xmm25 #vmovsd xmm8, QWORD PTR [rdx+r10*8] # xmm8 <- atom->x[i]
# r13d <- neighbor->numneigh[i] (numneighs) #vmovsd xmm9, QWORD PTR [rax+r10*8] # xmm9 <- atom->y[i]
mov r13d, DWORD PTR [rcx+r10*4] #vmovsd xmm10, QWORD PTR [rsi+r10*8] # xmm10 <- atom->z[i]
# xmm4 <- 0 ###
vmovapd xmm4, xmm20
# xmm8 <- atom->x[i] test r13d, r13d # numneighs <= 0
vmovsd xmm8, QWORD PTR [rdx+r10*8] jle ..exit_func
# xmm9 <- atom->y[i]
vmovsd xmm9, QWORD PTR [rax+r10*8]
# xmm9 <- atom->z[i]
vmovsd xmm10, QWORD PTR [rsi+r10*8]
# numneighs <= 0
test r13d, r13d
jle ..B1.27
..B1.10: ..B1.10:
vpxord zmm13, zmm13, zmm13 # zmm13 <- 0 (fix)
vmovaps zmm12, zmm13 # zmm12 <- 0 (fiy)
vmovaps zmm11, zmm12 # zmm11 <- 0 (fiz)
# zmm13 <- 0 mov r14d, r13d # r14d <- numneighs
vmovaps zmm13, zmm19 xor r11d, r11d # r11d <- 0
# zmm12 <- 0 and r14d, -8 # r14d <- numneighs & (-8)
vmovaps zmm12, zmm13 lea r9d, DWORD PTR [8+r11] # r9d <- 8 (why lea?)
# zmm11 <- 0 cmp r14d, r9d # r14d < r9d
vmovaps zmm11, zmm12 jl ..B1.33
# numneighs < 8 # cmp r13d, 8 # numneighs < 8
cmp r13d, 8 # jl ..B1.32
jl ..B1.32 #..B1.11:
# cmp r13d, 1200 # numneighs < 1200
..B1.11: # jl ..B1.31
#..B1.12:
# numneighs < 1200 # mov rcx, r12
cmp r13d, 1200 # imul rcx, r8
jl ..B1.31 # add rcx, r11 # rcx <- &neighbor->neighbors[neighbor->maxneighs * i (r8)]
# mov r9, rcx # r9 <- neighs
..B1.12: # and r9, 63 # r9 <- neighs & 63
# test r9d, 3 # (r9d & 3) == 0 => r9d divisible by 4
mov rcx, r12 # je ..B1.14
imul rcx, r8 #..B1.13:
add rcx, r11 # xor r9d, r9d # r9d <- 0
mov r9, rcx # jmp ..B1.16
and r9, 63 #..B1.14:
test r9d, 3 # test r9d, r9d # r9d == 0
je ..B1.14 # je ..B1.16
#..B1.15:
..B1.13: # neg r9d
# add r9d, 64
xor r9d, r9d # shr r9d, 2 # r9d <- (64 - r9d) / 4
jmp ..B1.16 # cmp r13d, r9d # numneighs < r9d
# cmovl r9d, r13d # r9d <- MIN(numneighs, r9d)
..B1.14: #..B1.16:
# mov ebx, r13d
test r9d, r9d # sub ebx, r9d
je ..B1.16 # and ebx, 7
# neg ebx
..B1.15: # add ebx, r13d # ebx <- -((numneighs - r9d) & 7) + numneighs
# cmp r9d, 1 # r9d < 1
neg r9d # jb ..B1.20
add r9d, 64 #..B1.20:
shr r9d, 2 # lea ecx, DWORD PTR [8+r9] # ecx <- r9d + 8
cmp r13d, r9d # cmp ebx, ecx # -((numneighs - r9d) & 7) + numneighs < neighs
cmovl r9d, r13d # jl ..B1.24
..B1.16:
mov ebx, r13d
sub ebx, r9d
and ebx, 7
neg ebx
add ebx, r13d
cmp r9d, 1
jb ..B1.20
..B1.20:
lea ecx, DWORD PTR [8+r9]
cmp ebx, ecx
jl ..B1.24
..B1.21: ..B1.21:
mov rcx, r12 mov rcx, r12
@ -246,17 +175,25 @@ push r14
vmovdqu ymm3, YMMWORD PTR [rcx+r14*4] vmovdqu ymm3, YMMWORD PTR [rcx+r14*4]
add r14, 8 add r14, 8
vpxord zmm5, zmm5, zmm5 vpxord zmm5, zmm5, zmm5
vpxord zmm4, zmm4, zmm4
vpxord zmm6, zmm6, zmm6 vpxord zmm6, zmm6, zmm6
vgatherdpd zmm5{k2}, QWORD PTR [rax+ymm3*8] ### AOS
vpaddd ymm4, ymm3, ymm3
vpaddd ymm3, ymm3, ymm4
vpxord zmm4, zmm4, zmm4
vgatherdpd zmm4{k1}, QWORD PTR [rdx+ymm3*8] vgatherdpd zmm4{k1}, QWORD PTR [rdx+ymm3*8]
vgatherdpd zmm6{k3}, QWORD PTR [rsi+ymm3*8] vgatherdpd zmm5{k2}, QWORD PTR [8+rdx+ymm3*8]
vgatherdpd zmm6{k3}, QWORD PTR [16+rdx+ymm3*8]
### SOA
#vpxord zmm4, zmm4, zmm4
#vgatherdpd zmm5{k2}, QWORD PTR [rax+ymm3*8]
#vgatherdpd zmm4{k1}, QWORD PTR [rdx+ymm3*8]
#vgatherdpd zmm6{k3}, QWORD PTR [rsi+ymm3*8]
###
vsubpd zmm29, zmm1, zmm5 vsubpd zmm29, zmm1, zmm5
vsubpd zmm28, zmm0, zmm4 vsubpd zmm28, zmm0, zmm4
vsubpd zmm31, zmm2, zmm6 vsubpd zmm31, zmm2, zmm6
vmulpd zmm20, zmm29, zmm29 vmulpd zmm20, zmm29, zmm29
vfmadd231pd zmm20, zmm28, zmm28 vfmadd231pd zmm20, zmm28, zmm28
vfmadd231pd zmm20, zmm31, zmm31 vfmadd231pd zmm20, zmm31, zmm31
@ -264,17 +201,13 @@ push r14
# if condition cutoff radius # if condition cutoff radius
vrcp14pd zmm27, zmm20 #-> sr2 vrcp14pd zmm27, zmm20 #-> sr2
vcmppd k5, zmm20, zmm16, 1 vcmppd k5, zmm20, zmm16, 1
vmulpd zmm22, zmm27, zmm15 # zmm22 <- sr2 * sigma6
vmulpd zmm22, zmm27, zmm15 #-> sr2 * sigma6 vmulpd zmm24, zmm27, zmm14 # zmm24 <- 48.0 * epsilon * sr2
vmulpd zmm24, zmm27, zmm14 #-> 48.0 * epsilon * sr2 vmulpd zmm25, zmm27, zmm22 # zmm25 <- sr2 * sigma6 * sr2
vmulpd zmm23, zmm27, zmm25 # zmm23 <- sr2 * sigma6 * sr2 * sr2
vmulpd zmm25, zmm27, zmm22 #-> sr2 * sigma6 * sr2 vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma6 * sr2 * sr2 - 0.5
vmulpd zmm23, zmm27, zmm25 #-> sr2 * sigma6 * sr2 * sr2 vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * (sr2 * sigma6 * sr2 * sr2)
vmulpd zmm30, zmm26, zmm27 # zmm30 <- force
vfmsub213pd zmm27, zmm25, zmm7 #-> sr2 * sigma * sr2 * sr2 - 0.5
vmulpd zmm26, zmm23, zmm24 #-> 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
vmulpd zmm30, zmm26, zmm27 #->
vfmadd231pd zmm13{k5}, zmm30, zmm28 vfmadd231pd zmm13{k5}, zmm30, zmm28
vfmadd231pd zmm12{k5}, zmm30, zmm29 vfmadd231pd zmm12{k5}, zmm30, zmm29
vfmadd231pd zmm11{k5}, zmm30, zmm31 vfmadd231pd zmm11{k5}, zmm30, zmm31
@ -304,9 +237,14 @@ push r14
vaddpd zmm25, zmm23, zmm24 vaddpd zmm25, zmm23, zmm24
#exit function #exit function
..B1.27: ..exit_func:
mov rcx, QWORD PTR [-8+rsp] #84.9[spill] mov rcx, QWORD PTR [-8+rsp] #84.9[spill]
mov rbx, QWORD PTR [-16+rsp] #85.9[spill] mov rbx, QWORD PTR [-16+rsp] #85.9[spill]
### AOS
add rax, 24
###
movsxd r8, r10d #55.32 movsxd r8, r10d #55.32
inc r8 #55.32 inc r8 #55.32
vaddsd xmm0, xmm25, QWORD PTR [rcx+r10*8] #84.9 vaddsd xmm0, xmm25, QWORD PTR [rcx+r10*8] #84.9
@ -318,8 +256,6 @@ push r14
inc r10 #55.5 inc r10 #55.5
cmp r10, QWORD PTR [-32+rsp] #55.5[spill] cmp r10, QWORD PTR [-32+rsp] #55.5[spill]
jb ..B1.9 jb ..B1.9
vzeroupper #93.12 vzeroupper #93.12
vxorpd xmm0, xmm0, xmm0 #93.12 vxorpd xmm0, xmm0, xmm0 #93.12
pop r14 #93.12 pop r14 #93.12