Cleanup and move gather-bench to util folder
This commit is contained in:
147
util/gather-bench/src/avx512/gather_md_aos.S
Normal file
147
util/gather-bench/src/avx512/gather_md_aos.S
Normal file
@@ -0,0 +1,147 @@
|
||||
.intel_syntax noprefix
|
||||
.data
|
||||
.align 64
|
||||
SCALAR:
|
||||
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
|
||||
|
||||
.section .rodata, "a"
|
||||
.align 64
|
||||
.align 64
|
||||
.ymm_reg_mask.1:
|
||||
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
|
||||
.type .ymm_reg_mask.1,@object
|
||||
.size .ymm_reg_mask.1,32
|
||||
.align 8
|
||||
|
||||
# rdi -> a
|
||||
# rsi -> neighbors
|
||||
# rdx -> numneighs[i]
|
||||
# rcx -> &t[t_idx]
|
||||
# r8 -> ntest
|
||||
.text
|
||||
.globl gather_md_aos
|
||||
.type gather_md_aos, @function
|
||||
gather_md_aos :
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
push rbx
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
|
||||
vmovdqu ymm7, YMMWORD PTR .ymm_reg_mask.1[rip]
|
||||
mov r15, rdx
|
||||
xor rax, rax
|
||||
.align 16
|
||||
1:
|
||||
|
||||
vmovdqu ymm3, YMMWORD PTR [rsi + rax * 4]
|
||||
vpaddd ymm4, ymm3, ymm3
|
||||
#ifdef PADDING
|
||||
vpaddd ymm3, ymm4, ymm4
|
||||
#else
|
||||
vpaddd ymm3, ymm3, ymm4
|
||||
#endif
|
||||
|
||||
# Prefetching instructions
|
||||
#mov ebx, DWORD PTR[rsi + rax*4]
|
||||
#mov r9d, DWORD PTR[4 + rsi + rax*4]
|
||||
#mov r10d, DWORD PTR[8 + rsi + rax*4]
|
||||
#mov r11d, DWORD PTR[12 + rsi + rax*4]
|
||||
#mov r12d, DWORD PTR[16 + rsi + rax*4]
|
||||
#mov r13d, DWORD PTR[20 + rsi + rax*4]
|
||||
#mov r14d, DWORD PTR[24 + rsi + rax*4]
|
||||
#mov r15d, DWORD PTR[28 + rsi + rax*4]
|
||||
#lea ebx, DWORD PTR[rbx]
|
||||
#lea r9d, DWORD PTR[r9]
|
||||
#lea r10d, DWORD PTR[r10]
|
||||
#lea r11d, DWORD PTR[r11]
|
||||
#lea r12d, DWORD PTR[r12]
|
||||
#lea r13d, DWORD PTR[r13]
|
||||
#lea r14d, DWORD PTR[r14]
|
||||
#lea r15d, DWORD PTR[r15]
|
||||
|
||||
vpcmpeqb k1, xmm5, xmm5
|
||||
#ifndef ONLY_FIRST_DIMENSION
|
||||
vpcmpeqb k2, xmm5, xmm5
|
||||
vpcmpeqb k3, xmm5, xmm5
|
||||
#endif
|
||||
|
||||
vpxord zmm0, zmm0, zmm0
|
||||
#ifndef ONLY_FIRST_DIMENSION
|
||||
vpxord zmm1, zmm1, zmm1
|
||||
vpxord zmm2, zmm2, zmm2
|
||||
#endif
|
||||
|
||||
vgatherdpd zmm0{k1}, [ rdi + ymm3 * 8]
|
||||
#ifndef ONLY_FIRST_DIMENSION
|
||||
vgatherdpd zmm1{k2}, [8 + rdi + ymm3 * 8]
|
||||
vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8]
|
||||
#endif
|
||||
|
||||
#ifdef TEST
|
||||
vmovupd [rcx + rax * 8], zmm0
|
||||
lea rbx, [rcx + r8 * 8]
|
||||
vmovupd [rbx + rax * 8], zmm1
|
||||
lea r10, [rbx + r8 * 8]
|
||||
vmovupd [r10 + rax * 8], zmm2
|
||||
#endif
|
||||
|
||||
# TODO: see if this logic can be optimized
|
||||
addq rax, 8
|
||||
subq r15, 8
|
||||
cmpq r15, 8
|
||||
jge 1b
|
||||
|
||||
cmpq r15, 0
|
||||
jle .end_func
|
||||
|
||||
vpbroadcastd ymm6, r15d
|
||||
vpcmpgtd k1, ymm6, ymm7
|
||||
vmovdqu32 ymm3{k1}{z}, YMMWORD PTR [rsi + rax * 4]
|
||||
vpaddd ymm4, ymm3, ymm3
|
||||
#ifdef PADDING
|
||||
vpaddd ymm3, ymm4, ymm4
|
||||
#else
|
||||
vpaddd ymm3, ymm3, ymm4
|
||||
#endif
|
||||
|
||||
vpxord zmm0, zmm1, zmm2
|
||||
#ifndef ONLY_FIRST_DIMENSION
|
||||
kmovw k2, k1
|
||||
kmovw k3, k1
|
||||
vpxord zmm1, zmm1, zmm1
|
||||
vpxord zmm2, zmm2, zmm2
|
||||
#endif
|
||||
|
||||
vgatherdpd zmm0{k1}, [ rdi + ymm3 * 8]
|
||||
#ifndef ONLY_FIRST_DIMENSION
|
||||
vgatherdpd zmm1{k2}, [8 + rdi + ymm3 * 8]
|
||||
vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8]
|
||||
#endif
|
||||
|
||||
#ifdef TEST
|
||||
vmovupd [rcx + rax * 8], zmm0
|
||||
lea rbx, [rcx + r8 * 8]
|
||||
vmovupd [rbx + rax * 8], zmm1
|
||||
lea r10, [rbx + r8 * 8]
|
||||
vmovupd [r10 + rax * 8], zmm2
|
||||
#endif
|
||||
|
||||
addq rax, r15
|
||||
|
||||
.end_func:
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop rbx
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size gather_md_aos, .-gather_md_aos
|
Reference in New Issue
Block a user