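# MD-Bench/asm/unused/force.s
#
# Listing metadata: 377 lines, 10 KiB.
# NOTE: the hosting page labels this file "ArmAsm", but the code below is
# x86-64 (AVX-512) assembly written for GNU as in Intel syntax.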

.intel_syntax noprefix
.text
.align 16,0x90
.globl computeForce
computeForce:
# parameter 1: rdi Parameter*
# parameter 2: rsi Atom*
# parameter 3: rdx Neighbor*
push r12
push r13
push r14
# r9d <- atom->Nlocal
mov r9d, DWORD PTR [4+rsi]
# xmm2 <- param->cutforce
vmovsd xmm2, QWORD PTR [72+rdi]
# xmm1 <- param->sigma6
vmovsd xmm1, QWORD PTR [8+rdi]
# xmm0 <- param->epsilon
vmovsd xmm0, QWORD PTR [rdi]
# r13 <- atom->fx
mov r13, QWORD PTR [64+rsi]
# r14 <- atom->fy
mov r14, QWORD PTR [72+rsi]
# rdi <- atom->fz
mov rdi, QWORD PTR [80+rsi]
# atom->Nlocal <= 0
test r9d, r9d
jle ..B1.30
..B1.2:
# r10d <- 0
xor r10d, r10d
# ecx <- atom->Nlocal
mov ecx, r9d
# r8d <- 0
xor r8d, r8d
# r11d <- 1
mov r11d, 1
# eax <- 0
xor eax, eax
# ecx <- atom->Nlocal >> 1
shr ecx, 1
je ..B1.6
# Init forces to zero loop
..B1.4:
# fx[i] <- 0
mov QWORD PTR [r8+r13], rax
# i++
inc r10
# fy[i] <- 0
mov QWORD PTR [r8+r14], rax
# fz[i] <- 0
mov QWORD PTR [r8+rdi], rax
# fx[i] <- 0
mov QWORD PTR [8+r8+r13], rax
# fy[i] <- 0
mov QWORD PTR [8+r8+r14], rax
# fz[i] <- 0
mov QWORD PTR [8+r8+rdi], rax
# i++
add r8, 16
# i < Nlocal
cmp r10, rcx
jb ..B1.4
..B1.5:
# r11d <- i * 2 + 1
lea r11d, DWORD PTR [1+r10+r10]
..B1.6:
# r11d <- i * 2
lea ecx, DWORD PTR [-1+r11]
# i < Nlocal
cmp ecx, r9d
jae ..B1.8
..B1.7:
# r11 <- i * 2
movsxd r11, r11d
# fx[i] <- 0
mov QWORD PTR [-8+r13+r11*8], rax
# fy[i] <- 0
mov QWORD PTR [-8+r14+r11*8], rax
# fz[i] <- 0
mov QWORD PTR [-8+rdi+r11*8], rax
..B1.8:
# xmm15 <- cutforcesq
vmulsd xmm15, xmm2, xmm2
# r8d <- 0
xor r8d, r8d
# ymm18 <- 8
vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip]
# xmm0 <- 48 * epsilon
vmulsd xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip]
# ymm17 <- [0..7]
vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip]
# zmm7 <- 0.5
vmovups zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip]
# zmm16 <- cutforcesq
vbroadcastsd zmm16, xmm15
# zmm15 <- param->sigma6
vbroadcastsd zmm15, xmm1
# zmm16 <- 48 * epsilon
vbroadcastsd zmm14, xmm0
# r9 <- atom->Nlocal
movsxd r9, r9d
# r10d <- 0 (i)
xor r10d, r10d
# rcx <- neighbor->numneigh
mov rcx, QWORD PTR [24+rdx]
# r11 <- neighbor->neighbors
mov r11, QWORD PTR [8+rdx]
# r12 <- neighbor->maxneighs
movsxd r12, DWORD PTR [16+rdx]
# rdx <- atom->x
mov rdx, QWORD PTR [16+rsi]
# rax <- atom->y
mov rax, QWORD PTR [24+rsi]
# rsi <- atom->z
mov rsi, QWORD PTR [32+rsi]
# r12 <- neighbor->maxneighs * 4
shl r12, 2
# [-32+rsp] <- atom->Nlocal
mov QWORD PTR [-32+rsp], r9
# [-24+rsp] <- neighbor->numneigh
mov QWORD PTR [-24+rsp], rcx
# [-16+rsp] <- atom->fy
mov QWORD PTR [-16+rsp], r14
# [-8+rsp] <- atom->fx
mov QWORD PTR [-8+rsp], r13
# [-40+rsp] <- r15
mov QWORD PTR [-40+rsp], r15
# [-48+rsp] <- rbx
mov QWORD PTR [-48+rsp], rbx
# zmm19 <- 0
vpxord zmm19, zmm19, zmm19
# Loop over all atoms
..B1.9:
# rcx <- neighbor->numneigh
mov rcx, QWORD PTR [-24+rsp]
# xmm25 <- 0
vxorpd xmm25, xmm25, xmm25
# xmm20 <- 0
vmovapd xmm20, xmm25
# r13d <- neighbor->numneigh[i] (numneighs)
mov r13d, DWORD PTR [rcx+r10*4]
# xmm4 <- 0
vmovapd xmm4, xmm20
# xmm8 <- atom->x[i]
vmovsd xmm8, QWORD PTR [rdx+r10*8]
# xmm9 <- atom->y[i]
vmovsd xmm9, QWORD PTR [rax+r10*8]
# xmm9 <- atom->z[i]
vmovsd xmm10, QWORD PTR [rsi+r10*8]
# numneighs <= 0
test r13d, r13d
jle ..B1.27
..B1.10:
# zmm13 <- 0
vmovaps zmm13, zmm19
# zmm12 <- 0
vmovaps zmm12, zmm13
# zmm11 <- 0
vmovaps zmm11, zmm12
# numneighs < 8
cmp r13d, 8
jl ..B1.32
..B1.11:
# numneighs < 1200
cmp r13d, 1200
jl ..B1.31
..B1.12:
mov rcx, r12
imul rcx, r8
add rcx, r11
mov r9, rcx
and r9, 63
test r9d, 3
je ..B1.14
..B1.13:
xor r9d, r9d
jmp ..B1.16
..B1.14:
test r9d, r9d
je ..B1.16
..B1.15:
neg r9d
add r9d, 64
shr r9d, 2
cmp r13d, r9d
cmovl r9d, r13d
..B1.16:
mov ebx, r13d
sub ebx, r9d
and ebx, 7
neg ebx
add ebx, r13d
cmp r9d, 1
jb ..B1.20
..B1.20:
lea ecx, DWORD PTR [8+r9]
cmp ebx, ecx
jl ..B1.24
..B1.21:
mov rcx, r12
imul rcx, r8
vbroadcastsd zmm0, xmm8
vbroadcastsd zmm1, xmm9
vbroadcastsd zmm2, xmm10
movsxd r14, r9d
add rcx, r11
..B1.22:
vpcmpeqb k2, xmm0, xmm0
add r9d, 8
vpcmpeqb k1, xmm0, xmm0
vpcmpeqb k3, xmm0, xmm0
vmovdqu ymm3, YMMWORD PTR [rcx+r14*4]
add r14, 8
vpxord zmm5, zmm5, zmm5
vpxord zmm4, zmm4, zmm4
vpxord zmm6, zmm6, zmm6
vgatherdpd zmm5{k2}, QWORD PTR [rax+ymm3*8]
vgatherdpd zmm4{k1}, QWORD PTR [rdx+ymm3*8]
vgatherdpd zmm6{k3}, QWORD PTR [rsi+ymm3*8]
vsubpd zmm29, zmm1, zmm5
vsubpd zmm28, zmm0, zmm4
vsubpd zmm31, zmm2, zmm6
vmulpd zmm20, zmm29, zmm29
vfmadd231pd zmm20, zmm28, zmm28
vfmadd231pd zmm20, zmm31, zmm31
# if condition cutoff radius
vrcp14pd zmm27, zmm20 #-> sr2
vcmppd k5, zmm20, zmm16, 1
vmulpd zmm22, zmm27, zmm15 #-> sr2 * sigma6
vmulpd zmm24, zmm27, zmm14 #-> 48.0 * epsilon * sr2
vmulpd zmm25, zmm27, zmm22 #-> sr2 * sigma6 * sr2
vmulpd zmm23, zmm27, zmm25 #-> sr2 * sigma6 * sr2 * sr2
vfmsub213pd zmm27, zmm25, zmm7 #-> sr2 * sigma * sr2 * sr2 - 0.5
vmulpd zmm26, zmm23, zmm24 #-> 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
vmulpd zmm30, zmm26, zmm27 #->
vfmadd231pd zmm13{k5}, zmm30, zmm28
vfmadd231pd zmm12{k5}, zmm30, zmm29
vfmadd231pd zmm11{k5}, zmm30, zmm31
cmp r9d, ebx
jb ..B1.22
#end neighbor loop
..B1.26:
vmovups zmm10, ZMMWORD PTR .L_2il0floatpacket.6[rip]
vpermd zmm0, zmm10, zmm11
vpermd zmm5, zmm10, zmm12
vpermd zmm21, zmm10, zmm13
vaddpd zmm11, zmm0, zmm11
vaddpd zmm12, zmm5, zmm12
vaddpd zmm13, zmm21, zmm13
vpermpd zmm1, zmm11, 78
vpermpd zmm6, zmm12, 78
vpermpd zmm22, zmm13, 78
vaddpd zmm2, zmm11, zmm1
vaddpd zmm8, zmm12, zmm6
vaddpd zmm23, zmm13, zmm22
vpermpd zmm3, zmm2, 177
vpermpd zmm9, zmm8, 177
vpermpd zmm24, zmm23, 177
vaddpd zmm4, zmm2, zmm3
vaddpd zmm20, zmm8, zmm9
vaddpd zmm25, zmm23, zmm24
#exit function
..B1.27:
mov rcx, QWORD PTR [-8+rsp] #84.9[spill]
mov rbx, QWORD PTR [-16+rsp] #85.9[spill]
movsxd r8, r10d #55.32
inc r8 #55.32
vaddsd xmm0, xmm25, QWORD PTR [rcx+r10*8] #84.9
vmovsd QWORD PTR [rcx+r10*8], xmm0 #84.9
vaddsd xmm1, xmm20, QWORD PTR [rbx+r10*8] #85.9
vmovsd QWORD PTR [rbx+r10*8], xmm1 #85.9
vaddsd xmm2, xmm4, QWORD PTR [rdi+r10*8] #86.9
vmovsd QWORD PTR [rdi+r10*8], xmm2 #86.9
inc r10 #55.5
cmp r10, QWORD PTR [-32+rsp] #55.5[spill]
jb ..B1.9
vzeroupper #93.12
vxorpd xmm0, xmm0, xmm0 #93.12
pop r14 #93.12
pop r13 #93.12
pop r12 #93.12
ret #93.12
.type computeForce,@function
.size computeForce,.-computeForce
..LNcomputeForce.0:
        .data
# -- End computeForce

# Read-only constant pool referenced RIP-relative from computeForce.
# Each entry keeps its original label, alignment, size and byte content;
# doubles are written as .double instead of pairs of little-endian .long.
        .section .rodata, "a"

# 8 x double 1.0 (0x3ff0000000000000); not referenced by the code in this file.
        .align  64
.L_2il0floatpacket.2:
        .rept   8
        .double 1.0
        .endr
        .type   .L_2il0floatpacket.2,@object
        .size   .L_2il0floatpacket.2,64

# 8 x double 0.5 (0x3fe0000000000000); the 0.5 subtracted in the force term.
        .align  64
.L_2il0floatpacket.4:
        .rept   8
        .double 0.5
        .endr
        .type   .L_2il0floatpacket.4,@object
        .size   .L_2il0floatpacket.4,64

# 16 x dword: indices 8..15 twice; used with vpermd to copy the upper
# 256 bits of a zmm register into both halves.
        .align  64
.L_2il0floatpacket.6:
        .rept   2
        .long   8, 9, 10, 11, 12, 13, 14, 15
        .endr
        .type   .L_2il0floatpacket.6,@object
        .size   .L_2il0floatpacket.6,64

# 8 x dword 8.
        .align  32
.L_2il0floatpacket.0:
        .rept   8
        .long   8
        .endr
        .type   .L_2il0floatpacket.0,@object
        .size   .L_2il0floatpacket.0,32

# 8 x dword: 0..7.
        .align  32
.L_2il0floatpacket.1:
        .long   0, 1, 2, 3, 4, 5, 6, 7
        .type   .L_2il0floatpacket.1,@object
        .size   .L_2il0floatpacket.1,32

# double 48.0 (0x4048000000000000); multiplied with epsilon.
        .align  8
.L_2il0floatpacket.3:
        .double 48.0
        .type   .L_2il0floatpacket.3,@object
        .size   .L_2il0floatpacket.3,8

# double 1.0 (0x3ff0000000000000); not referenced by the code in this file.
        .align  8
.L_2il0floatpacket.5:
        .double 1.0
        .type   .L_2il0floatpacket.5,@object
        .size   .L_2il0floatpacket.5,8

        .data
# Empty .note.GNU-stack section, as in the original.
        .section .note.GNU-stack, ""
# End