# x86-64 AVX-512 gather kernel (GNU as, Intel syntax; run through cpp for the
# #ifdef conditionals). NOTE: the language tag on this file is wrong — this is
# x86-64 assembly, not ARM.
.intel_syntax noprefix

# SCALAR: eight doubles of 1.0, 64-byte aligned for full-cacheline vector
# loads. Not referenced by the code in this file — presumably used by a
# sibling kernel; kept for link compatibility.
.data
.align 64
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0

# Lane-index vector {0,1,...,7}. gather_md_aos broadcasts the remainder count
# and compares it against this vector to build the tail write/gather mask.
.section .rodata, "a"
.align 64
.ymm_reg_mask.1:
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
.type .ymm_reg_mask.1,@object
.size .ymm_reg_mask.1,32
.align 8
# Arguments (SysV AMD64):
#   rdi -> a              (base of the gathered array)
#   rsi -> neighbors      (32-bit neighbor indices)
#   rdx -> numneighs[i]   (number of indices to gather)
#   rcx -> &t[t_idx]      (output buffer, only written when TEST is defined)
#   r8  -> ntest          (stride between output planes, in elements)
.text
.globl gather_md_aos
.type gather_md_aos, @function
# void gather_md_aos(const double *a, const int *neighbors, long numneighs,
#                    double *t, long ntest)
# Gathers numneighs elements of a, addressed by 32-bit indices in neighbors,
# scaled by 3 (or 4 with PADDING) doubles per particle — AoS x/y/z layout.
# Main loop handles 8 indices per iteration; a masked tail handles the rest.
# Clobbers: rax, r10, ymm3-7, zmm0-2, k1-k3, flags. Preserves rbx, r12-r15.
gather_md_aos:
        push    rbp
        mov     rbp, rsp
        # Only callee-saved registers need preserving; r10/r11 are volatile
        # and no calls are made, so they are not saved.
        push    rbx
        push    r12
        push    r13
        push    r14
        push    r15

        vmovdqu ymm7, YMMWORD PTR .ymm_reg_mask.1[rip]  # ymm7 = lane ids {0..7}
        mov     r15, rdx              # r15 = indices remaining
        xor     rax, rax              # rax = current index position

        # BUGFIX: guard the vector loop. The original fell straight into the
        # 8-wide body, over-reading neighbors[] when numneighs < 8.
        cmp     r15, 8
        jl      .Ltail

        .align 16
1:
        vmovdqu ymm3, YMMWORD PTR [rsi + rax * 4]   # 8 neighbor indices
        vpaddd  ymm4, ymm3, ymm3                    # idx*2
#ifdef PADDING
        vpaddd  ymm3, ymm4, ymm4                    # idx*4 (padded AoS)
#else
        vpaddd  ymm3, ymm3, ymm4                    # idx*3 (x,y,z AoS)
#endif

        # All-ones gather masks (cmpeq of a register with itself is always
        # true, regardless of xmm5's contents). vgatherdpd consumes its mask,
        # so they are rebuilt every iteration.
        vpcmpeqb k1, xmm5, xmm5
#ifndef ONLY_FIRST_DIMENSION
        vpcmpeqb k2, xmm5, xmm5
        vpcmpeqb k3, xmm5, xmm5
#endif

        # Break dependencies on the gather destinations.
        vpxord  zmm0, zmm0, zmm0
#ifndef ONLY_FIRST_DIMENSION
        vpxord  zmm1, zmm1, zmm1
        vpxord  zmm2, zmm2, zmm2
#endif

        vgatherdpd zmm0{k1}, [rdi + ymm3 * 8]       # x components
#ifndef ONLY_FIRST_DIMENSION
        vgatherdpd zmm1{k2}, [8 + rdi + ymm3 * 8]   # y components
        vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8]  # z components
#endif

#ifdef TEST
        # Store gathered planes at t, t + ntest, t + 2*ntest for verification.
        vmovupd [rcx + rax * 8], zmm0
        lea     rbx, [rcx + r8 * 8]
        vmovupd [rbx + rax * 8], zmm1
        lea     r10, [rbx + r8 * 8]
        vmovupd [r10 + rax * 8], zmm2
#endif

        # Intel-syntax mnemonics carry no AT&T size suffix (was addq/subq/cmpq).
        add     rax, 8
        sub     r15, 8
        cmp     r15, 8
        jge     1b

.Ltail:
        cmp     r15, 0
        jle     .end_func

        # Build a mask with the low r15 lanes set: lane_id < remainder.
        vpbroadcastd ymm6, r15d
        vpcmpgtd k1, ymm6, ymm7
        vmovdqu32 ymm3{k1}{z}, YMMWORD PTR [rsi + rax * 4]
        vpaddd  ymm4, ymm3, ymm3
#ifdef PADDING
        vpaddd  ymm3, ymm4, ymm4
#else
        vpaddd  ymm3, ymm3, ymm4
#endif

        # BUGFIX: was "vpxord zmm0, zmm1, zmm2", which filled zmm0 with stale
        # (or uninitialized) data; masked-off gather lanes merge, so those
        # lanes must start at zero.
        vpxord  zmm0, zmm0, zmm0
#ifndef ONLY_FIRST_DIMENSION
        kmovw   k2, k1                # gathers consume their masks; copy k1
        kmovw   k3, k1
        vpxord  zmm1, zmm1, zmm1
        vpxord  zmm2, zmm2, zmm2
#endif

        vgatherdpd zmm0{k1}, [rdi + ymm3 * 8]
#ifndef ONLY_FIRST_DIMENSION
        vgatherdpd zmm1{k2}, [8 + rdi + ymm3 * 8]
        vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8]
#endif

#ifdef TEST
        vmovupd [rcx + rax * 8], zmm0
        lea     rbx, [rcx + r8 * 8]
        vmovupd [rbx + rax * 8], zmm1
        lea     r10, [rbx + r8 * 8]
        vmovupd [r10 + rax * 8], zmm2
#endif

        add     rax, r15

.end_func:
        vzeroupper                    # clear dirty upper state before SSE code
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rbx
        mov     rsp, rbp
        pop     rbp
        ret
.size gather_md_aos, .-gather_md_aos