72 lines
1.1 KiB
ArmAsm
72 lines
1.1 KiB
ArmAsm
.intel_syntax noprefix

# Constant table: eight doubles, all 1.0, starting on a 64-byte boundary.
.data
.balign 64
SCALAR:
.rept 8
.double 1.0
.endr

# ----------------------------------------------------------------------
# gather_aos: gather three consecutive doubles from each indexed record
# of an array-of-structures, four records per loop iteration.
#
# ABI:    System V AMD64 register arguments (GAS, Intel syntax).
#         vgatherdpd requires AVX2.
# In:     rdi = a    base address of the records
#         rsi = idx  32-bit record indices, read 4 at a time (unaligned ok)
#         rdx = N    index count, treated as signed
#         rcx = t    output buffer, only touched when TEST is defined
# Out:    rax = final loop counter: N rounded up to a multiple of 4,
#               or 0 when N <= 0 (NOTE(review): callers presumably
#               ignore it, confirm)
# Clobb:  rax, flags, xmm3, xmm4, ymm0-ymm2, ymm5-ymm8; r8 and r9 as
#         well when TEST is defined
#
# Build-time switches (C preprocessor):
#   PADDING  record stride is 4 doubles instead of 3
#   TEST     store the gathered fields to t as three planes of N
#            doubles each: t[i], t[N + i], t[2N + i]
#
# Whole groups of 4 indices are always processed, so for N not a
# multiple of 4 the last iteration reads idx (and with TEST writes t)
# past element N - 1; buffers must be sized accordingly.
# The scaled index is computed in 32 bits, so 3 * idx (4 * idx with
# PADDING) must fit in a signed 32-bit value.
# ----------------------------------------------------------------------
        .text
        .globl  gather_aos
        .type   gather_aos, @function
gather_aos:
        xor     eax, eax                        # rax = i = 0
        test    rdx, rdx
        jle     .Lgather_aos_done               # N <= 0: touch no memory
        vpcmpeqd ymm8, ymm8, ymm8               # ymm8 = all ones (mask template)
#ifdef TEST
        lea     r8, [rcx + rdx * 8]             # r8 = &t[N], loop invariant
        lea     r9, [r8 + rdx * 8]              # r9 = &t[2N], loop invariant
#endif

        .align  16
.Lgather_aos_loop:
        vmovdqu xmm3, XMMWORD PTR [rsi + rax * 4]       # idx[i .. i+3]
        vpaddd  xmm4, xmm3, xmm3                # 2 * idx
#ifdef PADDING
        vpaddd  xmm3, xmm4, xmm4                # 4 * idx: padded record stride
#else
        vpaddd  xmm3, xmm3, xmm4                # 3 * idx: packed record stride
#endif
        # The gather clears its mask register as elements complete, so
        # every gather needs a fresh all-ones mask on each iteration.
        vmovdqa ymm5, ymm8
        vmovdqa ymm6, ymm8
        vmovdqa ymm7, ymm8
        # The gather merges into its destination; clearing it first keeps
        # the result independent of the previous iteration.
        vxorpd  ymm0, ymm0, ymm0
        vxorpd  ymm1, ymm1, ymm1
        vxorpd  ymm2, ymm2, ymm2
        vgatherdpd ymm0, [rdi + xmm3 * 8], ymm5         # field at byte offset 0
        vgatherdpd ymm1, [8 + rdi + xmm3 * 8], ymm6     # field at byte offset 8
        vgatherdpd ymm2, [16 + rdi + xmm3 * 8], ymm7    # field at byte offset 16

#ifdef TEST
        vmovupd [rcx + rax * 8], ymm0           # t[i .. i+3]
        vmovupd [r8 + rax * 8], ymm1            # t[N + i .. N + i+3]
        vmovupd [r9 + rax * 8], ymm2            # t[2N + i .. 2N + i+3]
#endif

        add     rax, 4                          # four indices consumed
        cmp     rax, rdx
        jl      .Lgather_aos_loop               # signed compare, matches the N <= 0 guard

.Lgather_aos_done:
        vzeroupper                              # leave no dirty upper ymm state for SSE callers
        ret
        .size   gather_aos, .-gather_aos