Add first version with more than one optimization scheme
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
This commit is contained in:
326
asm/unused/force_lj.s
Normal file
326
asm/unused/force_lj.s
Normal file
@@ -0,0 +1,326 @@
|
||||
.intel_syntax noprefix
|
||||
|
||||
.text
|
||||
.align 16,0x90
|
||||
.globl computeForceLJ
|
||||
computeForceLJ:
|
||||
# parameter 1: rdi Parameter*
|
||||
# parameter 2: rsi Atom*
|
||||
# parameter 3: rdx Neighbor*
|
||||
push rbp
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
push rbx
|
||||
mov r9d, DWORD PTR [4+rsi] # r9d <- atom->Nlocal
|
||||
vmovsd xmm2, QWORD PTR [96+rdi] # xmm2 <- param->cutforce
|
||||
vmovsd xmm1, QWORD PTR [32+rdi] # xmm1 <- param->sigma6
|
||||
vmovsd xmm0, QWORD PTR [24+rdi] # xmm0 <- param->epsilon
|
||||
mov r13, QWORD PTR [64+rsi] # r13 <- atom->fx
|
||||
mov r14, QWORD PTR [72+rsi] # r14 <- atom->fy
|
||||
mov rdi, QWORD PTR [80+rsi] # rdi <- atom->fz
|
||||
test r9d, r9d # atom->Nlocal <= 0
|
||||
jle ..atom_loop_exit
|
||||
xor r10d, r10d # r10d <- 0
|
||||
mov ecx, r9d # ecx <- atom->Nlocal
|
||||
xor r8d, r8d # r8d <- 0
|
||||
mov r11d, 1 # r11d <- 1
|
||||
xor eax, eax # eax <- 0
|
||||
shr ecx, 1 # ecx <- atom->Nlocal >> 1
|
||||
je ..zero_last_element # ecx == 0
|
||||
|
||||
# Init forces to zero loop (unroll factor = 2)
|
||||
..init_force_loop:
|
||||
mov QWORD PTR [r8+r13], rax # fx[i] <- 0
|
||||
mov QWORD PTR [r8+r14], rax # fy[i] <- 0
|
||||
mov QWORD PTR [r8+rdi], rax # fz[i] <- 0
|
||||
mov QWORD PTR [8+r8+r13], rax # fx[i] <- 0
|
||||
mov QWORD PTR [8+r8+r14], rax # fy[i] <- 0
|
||||
mov QWORD PTR [8+r8+rdi], rax # fz[i] <- 0
|
||||
add r8, 16 # i++
|
||||
inc r10 # i++
|
||||
cmp r10, rcx # i < Nlocal
|
||||
jb ..init_force_loop
|
||||
|
||||
# Trick to make r11d contain value of last element to be zeroed plus 1
|
||||
# Maybe we can directly put r10+10 here and zero r11d above, then remove the -1 below
|
||||
lea r11d, DWORD PTR [1+r10+r10] # r11d <- i * 2 + 1
|
||||
..zero_last_element:
|
||||
lea ecx, DWORD PTR [-1+r11] # ecx <- i * 2
|
||||
cmp ecx, r9d # i >= Nlocal
|
||||
jae ..before_atom_loop
|
||||
|
||||
# Set last element to zero
|
||||
movsxd r11, r11d # r11 <- i * 2
|
||||
mov QWORD PTR [-8+r13+r11*8], rax # fx[i] <- 0
|
||||
mov QWORD PTR [-8+r14+r11*8], rax # fy[i] <- 0
|
||||
mov QWORD PTR [-8+rdi+r11*8], rax # fz[i] <- 0
|
||||
|
||||
# Initialize registers to be used within atom loop
|
||||
..before_atom_loop:
|
||||
vmulsd xmm15, xmm2, xmm2 # xmm15 <- cutforcesq
|
||||
vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip] # ymm18 <- [8, ...]
|
||||
vmulsd xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip] # xmm0 <- 48 * epsilon
|
||||
vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip] # ymm17 <- [0..7]
|
||||
vmovups zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip] # zmm7 <- [0.5, ...]
|
||||
vbroadcastsd zmm16, xmm15 # zmm16 <- [cutforcesq, ...]
|
||||
vbroadcastsd zmm15, xmm1 # zmm15 <- [param->sigma6, ...]
|
||||
vbroadcastsd zmm14, xmm0 # zmm14 <- [48 * epsilon, ...]
|
||||
movsxd r9, r9d # r9 <- atom->Nlocal
|
||||
xor r10d, r10d # r10d <- 0 (i)
|
||||
mov rcx, QWORD PTR [24+rdx] # rcx <- neighbor->numneigh
|
||||
mov r11, QWORD PTR [8+rdx] # r11 <- neighbor->neighbors
|
||||
movsxd r12, DWORD PTR [16+rdx] # r12 <- neighbor->maxneighs
|
||||
mov rdx, QWORD PTR [16+rsi] # rdx <- atom->x
|
||||
### AOS
|
||||
xor eax, eax
|
||||
### SOA
|
||||
#mov rax, QWORD PTR [24+rsi] # rax <- atom->y
|
||||
#mov rsi, QWORD PTR [32+rsi] # rsi <- atom->z
|
||||
###
|
||||
shl r12, 2 # r12 <- neighbor->maxneighs * 4
|
||||
|
||||
# Register spilling
|
||||
mov QWORD PTR [-32+rsp], r9 # [-32+rsp] <- atom->Nlocal
|
||||
mov QWORD PTR [-24+rsp], rcx # [-24+rsp] <- neighbor->numneigh
|
||||
mov QWORD PTR [-16+rsp], r14 # [-16+rsp] <- atom->fy
|
||||
mov QWORD PTR [-8+rsp], r13 # [-8+rsp] <- atom->fx
|
||||
mov QWORD PTR [-40+rsp], r15 # [-40+rsp] <- r15
|
||||
mov QWORD PTR [-48+rsp], rbx # [-48+rsp] <- rbx
|
||||
#sub rsp, 64
|
||||
#call getTimeStamp # xmm0 <- getTimeStamp()
|
||||
#vmovsd QWORD PTR [-56+rsp], xmm0 # [-56+rsp] <- xmm0 [spill]
|
||||
#add rsp, 64
|
||||
|
||||
..atom_loop_begin:
|
||||
mov rcx, QWORD PTR [-24+rsp] # rcx <- neighbor->numneigh
|
||||
vxorpd xmm25, xmm25, xmm25 # xmm25 <- 0 (fix)
|
||||
vmovapd xmm20, xmm25 # xmm20 <- 0 (fiy)
|
||||
mov r13d, DWORD PTR [rcx+r10*4] # r13d <- neighbor->numneigh[i] (numneighs)
|
||||
vmovapd xmm4, xmm20 # xmm4 <- 0 (fiz)
|
||||
|
||||
### AOS
|
||||
vmovsd xmm8, QWORD PTR[rdx+rax] # xmm8 <- atom->x[i * 3]
|
||||
vmovsd xmm9, QWORD PTR[8+rdx+rax] # xmm9 <- atom->x[i * 3 + 1]
|
||||
vmovsd xmm10, QWORD PTR[16+rdx+rax] # xmm10 <- atom->x[i * 3 + 2]
|
||||
### SOA
|
||||
#vmovsd xmm8, QWORD PTR [rdx+r10*8] # xmm8 <- atom->x[i]
|
||||
#vmovsd xmm9, QWORD PTR [rax+r10*8] # xmm9 <- atom->y[i]
|
||||
#vmovsd xmm10, QWORD PTR [rsi+r10*8] # xmm10 <- atom->z[i]
|
||||
###
|
||||
vbroadcastsd zmm0, xmm8 # zmm0 <- atom_x(i)
|
||||
vbroadcastsd zmm1, xmm9 # zmm1 <- atom_y(i)
|
||||
vbroadcastsd zmm2, xmm10 # zmm2 <- atom_z(i)
|
||||
test r13d, r13d # numneighs <= 0
|
||||
jle ..atom_loop_exit
|
||||
|
||||
vpxord zmm13, zmm13, zmm13 # zmm13 <- 0 (fix)
|
||||
vmovaps zmm12, zmm13 # zmm12 <- 0 (fiy)
|
||||
vmovaps zmm11, zmm12 # zmm11 <- 0 (fiz)
|
||||
mov rcx, r12 # rcx <- neighbor->maxneighs * 4
|
||||
imul rcx, r10 # rcx <- neighbor->maxneighs * 4 * i
|
||||
add rcx, r11 # rcx <- &neighbor->neighbors[neighbor->maxneighs * i]
|
||||
xor r9d, r9d # r9d <- 0 (k)
|
||||
mov r14d, r13d # r14d <- numneighs
|
||||
cmp r14d, 8
|
||||
jl ..compute_forces_remainder
|
||||
|
||||
..compute_forces:
|
||||
vpcmpeqb k1, xmm0, xmm0
|
||||
vpcmpeqb k2, xmm0, xmm0
|
||||
vpcmpeqb k3, xmm0, xmm0
|
||||
vmovdqu ymm3, YMMWORD PTR [rcx+r9*4]
|
||||
vpxord zmm5, zmm5, zmm5
|
||||
vpxord zmm6, zmm6, zmm6
|
||||
|
||||
### AOS
|
||||
vpaddd ymm4, ymm3, ymm3
|
||||
vpaddd ymm3, ymm3, ymm4
|
||||
vpxord zmm4, zmm4, zmm4
|
||||
vgatherdpd zmm4{k1}, [rdx+ymm3*8]
|
||||
vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]
|
||||
vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]
|
||||
### SOA
|
||||
#vpxord zmm4, zmm4, zmm4
|
||||
#vgatherdpd zmm5{k2}, [rax+ymm3*8]
|
||||
#vgatherdpd zmm4{k1}, [rdx+ymm3*8]
|
||||
#vgatherdpd zmm6{k3}, [rsi+ymm3*8]
|
||||
###
|
||||
|
||||
vsubpd zmm29, zmm1, zmm5 # zmm29 <- atom_y(i) - atom_y(j) -- dely
|
||||
vsubpd zmm28, zmm0, zmm4 # zmm28 <- atom_x(i) - atom_x(j) -- delx
|
||||
vsubpd zmm31, zmm2, zmm6 # zmm31 <- atom_z(i) - atom_z(j) -- delz
|
||||
vmulpd zmm20, zmm29, zmm29 # zmm20 <- dely * dely
|
||||
vfmadd231pd zmm20, zmm28, zmm28 # zmm20 <- dely * dely + delx * delx
|
||||
vfmadd231pd zmm20, zmm31, zmm31 # zmm20 <- zmm20 + delz * delz -- rsq
|
||||
|
||||
# Cutoff radius condition
|
||||
vrcp14pd zmm27, zmm20 # zmm27 <- 1.0 / rsq (sr2)
|
||||
vcmppd k5, zmm20, zmm16, 1 # k5 <- rsq < cutforcesq
|
||||
vmulpd zmm22, zmm27, zmm15 # zmm22 <- sr2 * sigma6
|
||||
vmulpd zmm24, zmm27, zmm14 # zmm24 <- 48.0 * epsilon * sr2
|
||||
vmulpd zmm25, zmm27, zmm22 # zmm25 <- sr2 * sigma6 * sr2
|
||||
vmulpd zmm23, zmm27, zmm25 # zmm23 <- sr2 * sigma6 * sr2 * sr2
|
||||
vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma * sr2 * sr2 - 0.5
|
||||
vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
|
||||
vmulpd zmm30, zmm26, zmm27 # zmm30 <- force
|
||||
vfmadd231pd zmm13{k5}, zmm30, zmm28 # fix += force * delx
|
||||
vfmadd231pd zmm12{k5}, zmm30, zmm29 # fiy += force * dely
|
||||
vfmadd231pd zmm11{k5}, zmm30, zmm31 # fiz += force * delz
|
||||
sub r14d, 8
|
||||
add r9, 8
|
||||
cmp r14d, 8
|
||||
jge ..compute_forces
|
||||
|
||||
# Check if there are remaining neighbors to be computed
|
||||
..compute_forces_remainder:
|
||||
test r14d, r14d
|
||||
jle ..sum_up_forces
|
||||
|
||||
vpbroadcastd ymm4, r14d
|
||||
vpcmpgtd k1, ymm4, ymm17
|
||||
kmovw r15d, k1
|
||||
vmovdqu32 ymm3{k1}{z}, YMMWORD PTR [rcx+r9*4]
|
||||
kmovw k2, k1
|
||||
kmovw k3, k1
|
||||
vpxord zmm5, zmm5, zmm5
|
||||
vpxord zmm6, zmm6, zmm6
|
||||
|
||||
### AOS
|
||||
vpaddd ymm4, ymm3, ymm3
|
||||
vpaddd ymm3, ymm3, ymm4
|
||||
vpxord zmm4, zmm4, zmm4
|
||||
vgatherdpd zmm4{k1}, [rdx+ymm3*8]
|
||||
vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]
|
||||
vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]
|
||||
#### SOA
|
||||
#vpxord zmm4, zmm4, zmm4
|
||||
#vgatherdpd zmm5{k2}, [rax+ymm3*8]
|
||||
#vgatherdpd zmm4{k1}, [rdx+ymm3*8]
|
||||
#vgatherdpd zmm6{k3}, [rsi+ymm3*8]
|
||||
###
|
||||
|
||||
vsubpd zmm29, zmm1, zmm5 # zmm29 <- atom_y(i) - atom_y(j) -- dely
|
||||
vsubpd zmm28, zmm0, zmm4 # zmm28 <- atom_x(i) - atom_x(j) -- delx
|
||||
vsubpd zmm31, zmm2, zmm6 # zmm31 <- atom_z(i) - atom_z(j) -- delz
|
||||
vmulpd zmm20, zmm29, zmm29 # zmm20 <- dely * dely
|
||||
vfmadd231pd zmm20, zmm28, zmm28 # zmm20 <- dely * dely + delx * delx
|
||||
vfmadd231pd zmm20, zmm31, zmm31 # zmm20 <- zmm20 + delz * delz -- rsq
|
||||
|
||||
# Cutoff radius condition
|
||||
vrcp14pd zmm27, zmm20 # zmm27 <- 1.0 / rsq (sr2)
|
||||
vcmppd k5, zmm20, zmm16, 1 # k5 <- rsq < cutforcesq
|
||||
kmovw r9d, k5 # r9d <- rsq < cutforcesq
|
||||
and r15d, r9d # r15d <- rsq < cutforcesq && k < numneighs
|
||||
kmovw k3, r15d # k3 <- rsq < cutforcesq && k < numneighs
|
||||
vmulpd zmm22, zmm27, zmm15 # zmm22 <- sr2 * sigma6
|
||||
vmulpd zmm24, zmm27, zmm14 # zmm24 <- 48.0 * epsilon * sr2
|
||||
vmulpd zmm25, zmm27, zmm22 # zmm25 <- sr2 * sigma6 * sr2
|
||||
vmulpd zmm23, zmm27, zmm25 # zmm23 <- sr2 * sigma6 * sr2 * sr2
|
||||
vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma * sr2 * sr2 - 0.5
|
||||
vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
|
||||
vmulpd zmm30, zmm26, zmm27 # zmm30 <- force
|
||||
vfmadd231pd zmm13{k3}, zmm30, zmm28 # fix += force * delx
|
||||
vfmadd231pd zmm12{k3}, zmm30, zmm29 # fiy += force * dely
|
||||
vfmadd231pd zmm11{k3}, zmm30, zmm31 # fiz += force * delz
|
||||
|
||||
# Forces are currently separated in different lanes of zmm registers, hence it is necessary to permutate
|
||||
# and add them (reduction) to obtain the final contribution for the current atom
|
||||
..sum_up_forces:
|
||||
vmovups zmm10, ZMMWORD PTR .L_2il0floatpacket.6[rip]
|
||||
vpermd zmm0, zmm10, zmm11
|
||||
vpermd zmm5, zmm10, zmm12
|
||||
vpermd zmm21, zmm10, zmm13
|
||||
vaddpd zmm11, zmm0, zmm11
|
||||
vaddpd zmm12, zmm5, zmm12
|
||||
vaddpd zmm13, zmm21, zmm13
|
||||
vpermpd zmm1, zmm11, 78
|
||||
vpermpd zmm6, zmm12, 78
|
||||
vpermpd zmm22, zmm13, 78
|
||||
vaddpd zmm2, zmm11, zmm1
|
||||
vaddpd zmm8, zmm12, zmm6
|
||||
vaddpd zmm23, zmm13, zmm22
|
||||
vpermpd zmm3, zmm2, 177
|
||||
vpermpd zmm9, zmm8, 177
|
||||
vpermpd zmm24, zmm23, 177
|
||||
vaddpd zmm4, zmm2, zmm3
|
||||
vaddpd zmm20, zmm8, zmm9
|
||||
vaddpd zmm25, zmm23, zmm24
|
||||
|
||||
..atom_loop_exit:
|
||||
mov rcx, QWORD PTR [-8+rsp] #84.9[spill]
|
||||
mov rbx, QWORD PTR [-16+rsp] #85.9[spill]
|
||||
|
||||
### AOS
|
||||
add rax, 24
|
||||
###
|
||||
|
||||
vaddsd xmm0, xmm25, QWORD PTR [rcx+r10*8] #84.9
|
||||
vmovsd QWORD PTR [rcx+r10*8], xmm0 #84.9
|
||||
vaddsd xmm1, xmm20, QWORD PTR [rbx+r10*8] #85.9
|
||||
vmovsd QWORD PTR [rbx+r10*8], xmm1 #85.9
|
||||
vaddsd xmm2, xmm4, QWORD PTR [rdi+r10*8] #86.9
|
||||
vmovsd QWORD PTR [rdi+r10*8], xmm2 #86.9
|
||||
inc r10 #55.5
|
||||
cmp r10, QWORD PTR [-32+rsp] #55.5[spill]
|
||||
jb ..atom_loop_begin
|
||||
vzeroupper #93.12
|
||||
vxorpd xmm0, xmm0, xmm0 #93.12
|
||||
#call getTimeStamp # xmm0 <- getTimeStamp()
|
||||
#vsubsd xmm0, xmm0, QWORD PTR [-56+rsp] # xmm0 <- E-S
|
||||
pop rbx
|
||||
pop r15
|
||||
pop r14 #93.12
|
||||
pop r13 #93.12
|
||||
pop r12 #93.12
|
||||
pop rbp #93.12
|
||||
ret #93.12
|
||||
|
||||
.type computeForceLJ,@function
|
||||
.size computeForceLJ,.-computeForceLJ
|
||||
|
||||
|
||||
..LNcomputeForce.0:
|
||||
.data
|
||||
# -- End computeForceLJ
|
||||
.section .rodata, "a"
|
||||
.align 64
|
||||
.align 64
|
||||
.L_2il0floatpacket.2:
|
||||
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
|
||||
.type .L_2il0floatpacket.2,@object
|
||||
.size .L_2il0floatpacket.2,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.4:
|
||||
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
|
||||
.type .L_2il0floatpacket.4,@object
|
||||
.size .L_2il0floatpacket.4,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.6:
|
||||
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
|
||||
.type .L_2il0floatpacket.6,@object
|
||||
.size .L_2il0floatpacket.6,64
|
||||
.align 32
|
||||
.L_2il0floatpacket.0:
|
||||
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
|
||||
.type .L_2il0floatpacket.0,@object
|
||||
.size .L_2il0floatpacket.0,32
|
||||
.align 32
|
||||
.L_2il0floatpacket.1:
|
||||
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
|
||||
.type .L_2il0floatpacket.1,@object
|
||||
.size .L_2il0floatpacket.1,32
|
||||
.align 8
|
||||
.L_2il0floatpacket.3:
|
||||
.long 0x00000000,0x40480000
|
||||
.type .L_2il0floatpacket.3,@object
|
||||
.size .L_2il0floatpacket.3,8
|
||||
.align 8
|
||||
.L_2il0floatpacket.5:
|
||||
.long 0x00000000,0x3ff00000
|
||||
.type .L_2il0floatpacket.5,@object
|
||||
.size .L_2il0floatpacket.5,8
|
||||
.data
|
||||
.section .note.GNU-stack, ""
|
||||
# End
|
Reference in New Issue
Block a user