# x86-64 assembly for GNU as (Intel syntax): AVX2 gather kernel.

# GNU as in Intel operand order (dst, src); registers are written
# without the % prefix for the rest of this file.
.intel_syntax noprefix
.data
.align 64
# SCALAR: eight doubles, all 1.0, placed on a 64-byte boundary.
# NOTE(review): no reference to SCALAR is visible in this part of the
# file -- confirm it is still used before removing it.
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
# ----------------------------------------------------------------------
# gather -- AVX2 gather kernel: loads a[idx[i]] for i in [0, N),
#           sixteen elements per loop pass.
#
# ABI:     System V AMD64 (ELF), GNU as with Intel syntax
# In:      rdi -> a    base of the array of doubles that is gathered from
#          rsi -> idx  array of 32-bit indices into a
#          rdx  = N    element count, compared as a signed 64-bit value
#          rcx -> t    destination for the gathered doubles; written
#                      only when the file is built with TEST defined
# Out:     no defined return value (rax holds the loop counter on exit)
# Clobber: rax, flags, ymm0-ymm12
# Stack:   none -- leaf routine, no callee-saved register is touched
#
# Notes:
#  - Elements are processed in chunks of sixteen. When N is not a
#    multiple of sixteen the final pass still reads a whole chunk of
#    idx (and writes a whole chunk of t under TEST), so the caller must
#    size those buffers accordingly.
#  - N <= 0 returns at once without touching memory.
#  - The TEST stores use vmovapd, so t must be 32-byte aligned.
# ----------------------------------------------------------------------
        .text
        .globl  gather
        .type   gather, @function
gather:
        xor     eax, eax                        # rax = i, first element of the current chunk
        test    rdx, rdx
        jle     .Lgather_done                   # N <= 0: nothing to gather
        vpcmpeqd ymm0, ymm0, ymm0               # ymm0 = all ones, template for the gather masks
        .align 16
.Lgather_loop:
        vmovups xmm1, [rsi + rax * 4]           # idx[i    .. i+3]
        vmovups xmm2, [rsi + rax * 4 + 16]      # idx[i+4  .. i+7]
        vmovups xmm3, [rsi + rax * 4 + 32]      # idx[i+8  .. i+11]
        vmovups xmm4, [rsi + rax * 4 + 48]      # idx[i+12 .. i+15]
        vmovdqa ymm5, ymm0                      # vgatherdpd clears its mask register, so every
        vmovdqa ymm6, ymm0                      # pass needs four fresh all-ones masks
        vmovdqa ymm7, ymm0
        vmovdqa ymm8, ymm0
        vxorpd  ymm9, ymm9, ymm9                # zero the gather destinations; presumably to cut
        vxorpd  ymm10, ymm10, ymm10             # the dependency on the previous pass, since the
        vxorpd  ymm11, ymm11, ymm11             # gather merges into its destination register
        vxorpd  ymm12, ymm12, ymm12
        vgatherdpd ymm9, [rdi + xmm1 * 8], ymm5         # a[idx[i    .. i+3]]
        vgatherdpd ymm10, [rdi + xmm2 * 8], ymm6        # a[idx[i+4  .. i+7]]
        vgatherdpd ymm11, [rdi + xmm3 * 8], ymm7        # a[idx[i+8  .. i+11]]
        vgatherdpd ymm12, [rdi + xmm4 * 8], ymm8        # a[idx[i+12 .. i+15]]
#ifdef TEST
        vmovapd [rcx + rax * 8], ymm9           # t[i .. i+15] = gathered values (aligned stores)
        vmovapd [rcx + rax * 8 + 32], ymm10
        vmovapd [rcx + rax * 8 + 64], ymm11
        vmovapd [rcx + rax * 8 + 96], ymm12
#endif
        add     rax, 16                         # advance to the next chunk
        cmp     rax, rdx
        jl      .Lgather_loop                   # signed compare, consistent with the N <= 0 guard
        vzeroupper                              # clear upper ymm halves before returning to
                                                # code that may use legacy SSE encodings
.Lgather_done:
        ret
        .size   gather, .-gather