Add working version of force.s
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
This commit is contained in:
parent
b2dada6179
commit
f8b9a095cf
@ -122,46 +122,6 @@ computeForce:
|
|||||||
cmp r14d, 8
|
cmp r14d, 8
|
||||||
jl ..compute_forces_remainder
|
jl ..compute_forces_remainder
|
||||||
|
|
||||||
# and r14d, -8 # r14d <- numneighs & (-8)
|
|
||||||
# lea r9d, DWORD PTR [8+r11] # r9d <- 8 (why lea?)
|
|
||||||
# cmp r14d, r9d # r14d < r9d
|
|
||||||
# jl ..B1.33
|
|
||||||
|
|
||||||
# cmp r13d, 8 # numneighs < 8
|
|
||||||
# jl ..B1.32
|
|
||||||
#..B1.11:
|
|
||||||
# cmp r13d, 1200 # numneighs < 1200
|
|
||||||
# jl ..B1.31
|
|
||||||
#..B1.12:
|
|
||||||
# mov r9, rcx # r9 <- neighs
|
|
||||||
# and r9, 63 # r9 <- neighs & 63
|
|
||||||
# test r9d, 3 # (r9d & 3) == 0 => r9d divisible by 4
|
|
||||||
# je ..B1.14
|
|
||||||
#..B1.13:
|
|
||||||
# xor r9d, r9d # r9d <- 0
|
|
||||||
# jmp ..B1.16
|
|
||||||
#..B1.14:
|
|
||||||
# test r9d, r9d # r9d == 0
|
|
||||||
# je ..B1.16
|
|
||||||
#..B1.15:
|
|
||||||
# neg r9d
|
|
||||||
# add r9d, 64
|
|
||||||
# shr r9d, 2 # r9d <- (64 - r9d) / 4
|
|
||||||
# cmp r13d, r9d # numneighs < r9d
|
|
||||||
# cmovl r9d, r13d # r9d <- MIN(numneighs, r9d)
|
|
||||||
#..B1.16:
|
|
||||||
# mov ebx, r13d
|
|
||||||
# sub ebx, r9d
|
|
||||||
# and ebx, 7
|
|
||||||
# neg ebx
|
|
||||||
# add ebx, r13d # ebx <- -((numneighs - r9d) & 7) + numneighs
|
|
||||||
# cmp r9d, 1 # r9d < 1
|
|
||||||
# jb ..B1.20
|
|
||||||
#..B1.20:
|
|
||||||
# lea ecx, DWORD PTR [8+r9] # ecx <- r9d + 8
|
|
||||||
# cmp ebx, ecx # -((numneighs - r9d) & 7) + numneighs < neighs
|
|
||||||
# jl ..B1.24
|
|
||||||
|
|
||||||
..compute_forces:
|
..compute_forces:
|
||||||
vpcmpeqb k1, xmm0, xmm0
|
vpcmpeqb k1, xmm0, xmm0
|
||||||
vpcmpeqb k2, xmm0, xmm0
|
vpcmpeqb k2, xmm0, xmm0
|
||||||
@ -204,20 +164,20 @@ computeForce:
|
|||||||
vfmadd231pd zmm13{k5}, zmm30, zmm28 # fix += force * delx
|
vfmadd231pd zmm13{k5}, zmm30, zmm28 # fix += force * delx
|
||||||
vfmadd231pd zmm12{k5}, zmm30, zmm29 # fiy += force * dely
|
vfmadd231pd zmm12{k5}, zmm30, zmm29 # fiy += force * dely
|
||||||
vfmadd231pd zmm11{k5}, zmm30, zmm31 # fiz += force * delz
|
vfmadd231pd zmm11{k5}, zmm30, zmm31 # fiz += force * delz
|
||||||
sub r14, 8
|
sub r14d, 8
|
||||||
add r9, 8
|
add r9, 8
|
||||||
cmp r14, 8
|
cmp r14d, 8
|
||||||
jge ..compute_forces
|
jge ..compute_forces
|
||||||
|
|
||||||
# Check if there are remaining neighbors to be computed
|
# Check if there are remaining neighbors to be computed
|
||||||
..compute_forces_remainder:
|
..compute_forces_remainder:
|
||||||
cmp r14, 1
|
test r14d, r14d
|
||||||
jl ..sum_up_forces
|
jle ..sum_up_forces
|
||||||
|
|
||||||
vpbroadcastd ymm0, r14d
|
vpbroadcastd ymm4, r14d
|
||||||
vpcmpgtd k1, ymm0, ymm17
|
vpcmpgtd k1, ymm4, ymm17
|
||||||
kmovw r15d, k1
|
kmovw r15d, k1
|
||||||
vmovdqu32 ymm3{k3}{z}, YMMWORD PTR [rcx+r9*4]
|
vmovdqu32 ymm3{k1}{z}, YMMWORD PTR [rcx+r9*4]
|
||||||
kmovw k2, k1
|
kmovw k2, k1
|
||||||
kmovw k3, k1
|
kmovw k3, k1
|
||||||
vpxord zmm5, zmm5, zmm5
|
vpxord zmm5, zmm5, zmm5
|
||||||
@ -230,7 +190,7 @@ computeForce:
|
|||||||
vgatherdpd zmm4{k1}, [rdx+ymm3*8]
|
vgatherdpd zmm4{k1}, [rdx+ymm3*8]
|
||||||
vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]
|
vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]
|
||||||
vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]
|
vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]
|
||||||
### SOA
|
#### SOA
|
||||||
#vpxord zmm4, zmm4, zmm4
|
#vpxord zmm4, zmm4, zmm4
|
||||||
#vgatherdpd zmm5{k2}, [rax+ymm3*8]
|
#vgatherdpd zmm5{k2}, [rax+ymm3*8]
|
||||||
#vgatherdpd zmm4{k1}, [rdx+ymm3*8]
|
#vgatherdpd zmm4{k1}, [rdx+ymm3*8]
|
||||||
@ -257,9 +217,9 @@ computeForce:
|
|||||||
vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma * sr2 * sr2 - 0.5
|
vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma * sr2 * sr2 - 0.5
|
||||||
vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
|
vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
|
||||||
vmulpd zmm30, zmm26, zmm27 # zmm30 <- force
|
vmulpd zmm30, zmm26, zmm27 # zmm30 <- force
|
||||||
vfmadd231pd zmm13{k5}, zmm30, zmm28 # fix += force * delx
|
vfmadd231pd zmm13{k3}, zmm30, zmm28 # fix += force * delx
|
||||||
vfmadd231pd zmm12{k5}, zmm30, zmm29 # fiy += force * dely
|
vfmadd231pd zmm12{k3}, zmm30, zmm29 # fiy += force * dely
|
||||||
vfmadd231pd zmm11{k5}, zmm30, zmm31 # fiz += force * delz
|
vfmadd231pd zmm11{k3}, zmm30, zmm31 # fiz += force * delz
|
||||||
|
|
||||||
# Forces are currently separated in different lanes of zmm registers, hence it is necessary to permute
|
# Forces are currently separated in different lanes of zmm registers, hence it is necessary to permute
|
||||||
# and add them (reduction) to obtain the final contribution for the current atom
|
# and add them (reduction) to obtain the final contribution for the current atom
|
||||||
|
Loading…
Reference in New Issue
Block a user