Cleanup and move gather-bench to util folder

This commit is contained in:
2023-08-15 15:21:21 +02:00
parent 151f0c0e6f
commit 19209bdcce
98 changed files with 2104 additions and 38712 deletions

View File

@@ -0,0 +1,147 @@
.intel_syntax noprefix
.data
.align 64
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
.section .rodata, "a"
.align 64
.align 64
.ymm_reg_mask.1:
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
.type .ymm_reg_mask.1,@object
.size .ymm_reg_mask.1,32
.align 8
# rdi -> a
# rsi -> neighbors
# rdx -> numneighs[i]
# rcx -> &t[t_idx]
# r8 -> ntest
.text
.globl gather_md_aos
.type gather_md_aos, @function
gather_md_aos :
push rbp
mov rbp, rsp
push rbx
push r10
push r11
push r12
push r13
push r14
push r15
vmovdqu ymm7, YMMWORD PTR .ymm_reg_mask.1[rip]
mov r15, rdx
xor rax, rax
.align 16
1:
vmovdqu ymm3, YMMWORD PTR [rsi + rax * 4]
vpaddd ymm4, ymm3, ymm3
#ifdef PADDING
vpaddd ymm3, ymm4, ymm4
#else
vpaddd ymm3, ymm3, ymm4
#endif
# Prefetching instructions
#mov ebx, DWORD PTR[rsi + rax*4]
#mov r9d, DWORD PTR[4 + rsi + rax*4]
#mov r10d, DWORD PTR[8 + rsi + rax*4]
#mov r11d, DWORD PTR[12 + rsi + rax*4]
#mov r12d, DWORD PTR[16 + rsi + rax*4]
#mov r13d, DWORD PTR[20 + rsi + rax*4]
#mov r14d, DWORD PTR[24 + rsi + rax*4]
#mov r15d, DWORD PTR[28 + rsi + rax*4]
#lea ebx, DWORD PTR[rbx]
#lea r9d, DWORD PTR[r9]
#lea r10d, DWORD PTR[r10]
#lea r11d, DWORD PTR[r11]
#lea r12d, DWORD PTR[r12]
#lea r13d, DWORD PTR[r13]
#lea r14d, DWORD PTR[r14]
#lea r15d, DWORD PTR[r15]
vpcmpeqb k1, xmm5, xmm5
#ifndef ONLY_FIRST_DIMENSION
vpcmpeqb k2, xmm5, xmm5
vpcmpeqb k3, xmm5, xmm5
#endif
vpxord zmm0, zmm0, zmm0
#ifndef ONLY_FIRST_DIMENSION
vpxord zmm1, zmm1, zmm1
vpxord zmm2, zmm2, zmm2
#endif
vgatherdpd zmm0{k1}, [ rdi + ymm3 * 8]
#ifndef ONLY_FIRST_DIMENSION
vgatherdpd zmm1{k2}, [8 + rdi + ymm3 * 8]
vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8]
#endif
#ifdef TEST
vmovupd [rcx + rax * 8], zmm0
lea rbx, [rcx + r8 * 8]
vmovupd [rbx + rax * 8], zmm1
lea r10, [rbx + r8 * 8]
vmovupd [r10 + rax * 8], zmm2
#endif
# TODO: see if this logic can be optimized
addq rax, 8
subq r15, 8
cmpq r15, 8
jge 1b
cmpq r15, 0
jle .end_func
vpbroadcastd ymm6, r15d
vpcmpgtd k1, ymm6, ymm7
vmovdqu32 ymm3{k1}{z}, YMMWORD PTR [rsi + rax * 4]
vpaddd ymm4, ymm3, ymm3
#ifdef PADDING
vpaddd ymm3, ymm4, ymm4
#else
vpaddd ymm3, ymm3, ymm4
#endif
vpxord zmm0, zmm1, zmm2
#ifndef ONLY_FIRST_DIMENSION
kmovw k2, k1
kmovw k3, k1
vpxord zmm1, zmm1, zmm1
vpxord zmm2, zmm2, zmm2
#endif
vgatherdpd zmm0{k1}, [ rdi + ymm3 * 8]
#ifndef ONLY_FIRST_DIMENSION
vgatherdpd zmm1{k2}, [8 + rdi + ymm3 * 8]
vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8]
#endif
#ifdef TEST
vmovupd [rcx + rax * 8], zmm0
lea rbx, [rcx + r8 * 8]
vmovupd [rbx + rax * 8], zmm1
lea r10, [rbx + r8 * 8]
vmovupd [r10 + rax * 8], zmm2
#endif
addq rax, r15
.end_func:
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop rbx
mov rsp, rbp
pop rbp
ret
.size gather_md_aos, .-gather_md_aos