Cleanup and move gather-bench to util folder
This commit is contained in:
		
							
								
								
									
										52
									
								
								util/gather-bench/.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										52
									
								
								util/gather-bench/.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							@@ -0,0 +1,52 @@
 | 
			
		||||
# Prerequisites
 | 
			
		||||
*.d
 | 
			
		||||
 | 
			
		||||
# Object files
 | 
			
		||||
*.o
 | 
			
		||||
*.ko
 | 
			
		||||
*.obj
 | 
			
		||||
*.elf
 | 
			
		||||
 | 
			
		||||
# Linker output
 | 
			
		||||
*.ilk
 | 
			
		||||
*.map
 | 
			
		||||
*.exp
 | 
			
		||||
 | 
			
		||||
# Precompiled Headers
 | 
			
		||||
*.gch
 | 
			
		||||
*.pch
 | 
			
		||||
 | 
			
		||||
# Libraries
 | 
			
		||||
*.lib
 | 
			
		||||
*.a
 | 
			
		||||
*.la
 | 
			
		||||
*.lo
 | 
			
		||||
 | 
			
		||||
# Shared objects (inc. Windows DLLs)
 | 
			
		||||
*.dll
 | 
			
		||||
*.so
 | 
			
		||||
*.so.*
 | 
			
		||||
*.dylib
 | 
			
		||||
 | 
			
		||||
# Executables
 | 
			
		||||
*.exe
 | 
			
		||||
*.out
 | 
			
		||||
*.app
 | 
			
		||||
*.i*86
 | 
			
		||||
*.x86_64
 | 
			
		||||
*.hex
 | 
			
		||||
 | 
			
		||||
# Debug files
 | 
			
		||||
*.dSYM/
 | 
			
		||||
*.su
 | 
			
		||||
*.idb
 | 
			
		||||
*.pdb
 | 
			
		||||
 | 
			
		||||
# Kernel Module Compile Results
 | 
			
		||||
*.mod*
 | 
			
		||||
*.cmd
 | 
			
		||||
.tmp_versions/
 | 
			
		||||
modules.order
 | 
			
		||||
Module.symvers
 | 
			
		||||
Mkfile.old
 | 
			
		||||
dkms.conf
 | 
			
		||||
							
								
								
									
										21
									
								
								util/gather-bench/LICENSE
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										21
									
								
								util/gather-bench/LICENSE
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,21 @@
 | 
			
		||||
MIT License
 | 
			
		||||
 | 
			
		||||
Copyright (c) 2021 RRZE-HPC
 | 
			
		||||
 | 
			
		||||
Permission is hereby granted, free of charge, to any person obtaining a copy
 | 
			
		||||
of this software and associated documentation files (the "Software"), to deal
 | 
			
		||||
in the Software without restriction, including without limitation the rights
 | 
			
		||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 | 
			
		||||
copies of the Software, and to permit persons to whom the Software is
 | 
			
		||||
furnished to do so, subject to the following conditions:
 | 
			
		||||
 | 
			
		||||
The above copyright notice and this permission notice shall be included in all
 | 
			
		||||
copies or substantial portions of the Software.
 | 
			
		||||
 | 
			
		||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 | 
			
		||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 | 
			
		||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 | 
			
		||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 | 
			
		||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 | 
			
		||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 | 
			
		||||
SOFTWARE.
 | 
			
		||||
							
								
								
									
										126
									
								
								util/gather-bench/Makefile
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										126
									
								
								util/gather-bench/Makefile
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,126 @@
 | 
			
		||||
#CONFIGURE BUILD SYSTEM
 | 
			
		||||
TARGET	   = gather-bench-$(TAG)
 | 
			
		||||
BUILD_DIR  = ./$(TAG)
 | 
			
		||||
SRC_DIR	= ./src
 | 
			
		||||
MAKE_DIR   = ./
 | 
			
		||||
ISA_DIR	= ./src/$(ISA)
 | 
			
		||||
Q		 ?= @
 | 
			
		||||
 | 
			
		||||
#DO NOT EDIT BELOW
 | 
			
		||||
include $(MAKE_DIR)/config.mk
 | 
			
		||||
include $(MAKE_DIR)/include_$(TAG).mk
 | 
			
		||||
include $(MAKE_DIR)/include_LIKWID.mk
 | 
			
		||||
INCLUDES  += -I./src/includes
 | 
			
		||||
 | 
			
		||||
VPATH	 = $(SRC_DIR) ${ISA_DIR}
 | 
			
		||||
ASM	   = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
 | 
			
		||||
ASM	  += $(patsubst $(SRC_DIR)/%.f90, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.f90))
 | 
			
		||||
OBJ	   = $(filter-out $(BUILD_DIR)/main%, $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c)))
 | 
			
		||||
OBJ	  += $(patsubst $(SRC_DIR)/%.cc, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cc))
 | 
			
		||||
OBJ	  += $(patsubst $(SRC_DIR)/%.cpp, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cpp))
 | 
			
		||||
OBJ	  += $(patsubst $(SRC_DIR)/%.f90, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.f90))
 | 
			
		||||
OBJ	  += $(patsubst $(SRC_DIR)/%.F90, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.F90))
 | 
			
		||||
OBJ	  += $(patsubst $(SRC_DIR)/%.s, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.s))
 | 
			
		||||
OBJ	  += $(patsubst $(ISA_DIR)/%.S, $(BUILD_DIR)/%.o,$(wildcard $(ISA_DIR)/*.S))
 | 
			
		||||
CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(INCLUDES) -DISA_$(ISA)
 | 
			
		||||
 | 
			
		||||
ifneq ($(VARIANT),)
 | 
			
		||||
	.DEFAULT_GOAL := ${TARGET}-$(VARIANT)
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
ifeq ($(strip $(DATA_LAYOUT)),AOS)
 | 
			
		||||
    CPPFLAGS += -DAOS
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
ifeq ($(strip $(TEST)),true)
 | 
			
		||||
    CPPFLAGS += -DTEST
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
ifeq ($(strip $(PADDING)),true)
 | 
			
		||||
    CPPFLAGS += -DPADDING
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
ifeq ($(strip $(MEASURE_GATHER_CYCLES)),true)
 | 
			
		||||
    CPPFLAGS += -DMEASURE_GATHER_CYCLES
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
ifeq ($(strip $(ONLY_FIRST_DIMENSION)),true)
 | 
			
		||||
    CPPFLAGS += -DONLY_FIRST_DIMENSION
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
ifeq ($(strip $(MEM_TRACER)),true)
 | 
			
		||||
    CPPFLAGS += -DMEM_TRACER
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
${TARGET}: $(BUILD_DIR) $(OBJ) $(SRC_DIR)/main.c
 | 
			
		||||
	@echo "===>  LINKING  $(TARGET)"
 | 
			
		||||
	$(Q)${LINKER} ${CPPFLAGS} ${LFLAGS} -o $(TARGET) $(SRC_DIR)/main.c $(OBJ) $(LIBS)
 | 
			
		||||
 | 
			
		||||
${TARGET}-%: $(BUILD_DIR) $(OBJ) $(SRC_DIR)/main-%.c
 | 
			
		||||
	@echo "===>  LINKING  $(TARGET)-$* "
 | 
			
		||||
	$(Q)${LINKER} ${CPPFLAGS} ${LFLAGS} -o $(TARGET)-$* $(SRC_DIR)/main-$*.c $(OBJ) $(LIBS)
 | 
			
		||||
 | 
			
		||||
asm:  $(BUILD_DIR) $(ASM)
 | 
			
		||||
 | 
			
		||||
$(BUILD_DIR)/%.o:  %.c
 | 
			
		||||
	@echo "===>  COMPILE  $@"
 | 
			
		||||
	$(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
 | 
			
		||||
	$(Q)$(CC) $(CPPFLAGS) -MT $(@:.d=.o) -MM  $< > $(BUILD_DIR)/$*.d
 | 
			
		||||
 | 
			
		||||
$(BUILD_DIR)/%.s:  %.c
 | 
			
		||||
	@echo "===>  GENERATE ASM  $@"
 | 
			
		||||
	$(Q)$(CC) -S $(CPPFLAGS) $(CFLAGS) $< -o $@
 | 
			
		||||
 | 
			
		||||
$(BUILD_DIR)/%.s:  %.f90
 | 
			
		||||
	@echo "===>  COMPILE  $@"
 | 
			
		||||
	$(Q)$(FC) -S  $(FCFLAGS) $< -o $@
 | 
			
		||||
 | 
			
		||||
$(BUILD_DIR)/%.o:  %.cc
 | 
			
		||||
	@echo "===>  COMPILE  $@"
 | 
			
		||||
	$(Q)$(CXX) -c $(CPPFLAGS) $(CXXFLAGS) $< -o $@
 | 
			
		||||
	$(Q)$(CXX) $(CPPFLAGS) -MT $(@:.d=.o) -MM  $< > $(BUILD_DIR)/$*.d
 | 
			
		||||
 | 
			
		||||
$(BUILD_DIR)/%.o:  %.cpp
 | 
			
		||||
	@echo "===>  COMPILE  $@"
 | 
			
		||||
	$(Q)$(CXX) -c $(CPPFLAGS) $(CXXFLAGS) $< -o $@
 | 
			
		||||
	$(Q)$(CXX) $(CPPFLAGS) -MT $(@:.d=.o) -MM  $< > $(BUILD_DIR)/$*.d
 | 
			
		||||
 | 
			
		||||
$(BUILD_DIR)/%.o:  %.f90
 | 
			
		||||
	@echo "===>  COMPILE  $@"
 | 
			
		||||
	$(Q)$(FC) -c  $(FCFLAGS) $< -o $@
 | 
			
		||||
 | 
			
		||||
$(BUILD_DIR)/%.o:  %.F90
 | 
			
		||||
	@echo "===>  COMPILE  $@"
 | 
			
		||||
	$(Q)$(FC) -c  $(CPPFLAGS)  $(FCFLAGS) $< -o $@
 | 
			
		||||
 | 
			
		||||
$(BUILD_DIR)/%.o:  %.s
 | 
			
		||||
	@echo "===>  ASSEMBLE  $@"
 | 
			
		||||
	$(Q)$(AS)  $(ASFLAGS) $< -o $@
 | 
			
		||||
 | 
			
		||||
$(BUILD_DIR)/%.o:  %.S
 | 
			
		||||
	@echo "===>  ASSEMBLE  $@"
 | 
			
		||||
	$(Q)$(CC) -c $(CPPFLAGS) $< -o $@
 | 
			
		||||
 | 
			
		||||
tags:
 | 
			
		||||
	@echo "===>  GENERATE  TAGS"
 | 
			
		||||
	$(Q)ctags -R
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
$(BUILD_DIR):
 | 
			
		||||
	@mkdir $(BUILD_DIR)
 | 
			
		||||
 | 
			
		||||
ifeq ($(findstring $(MAKECMDGOALS),clean),)
 | 
			
		||||
-include $(OBJ:.o=.d)
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
.PHONY: clean distclean
 | 
			
		||||
 | 
			
		||||
clean:
 | 
			
		||||
	@echo "===>  CLEAN"
 | 
			
		||||
	@rm -rf $(BUILD_DIR)
 | 
			
		||||
	@rm -f tags
 | 
			
		||||
 | 
			
		||||
distclean: clean
 | 
			
		||||
	@echo "===>  DIST CLEAN"
 | 
			
		||||
	@rm -f $(TARGET)
 | 
			
		||||
	@rm -f tags
 | 
			
		||||
							
								
								
									
										2
									
								
								util/gather-bench/README.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								util/gather-bench/README.md
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,2 @@
 | 
			
		||||
# gather-bench
 | 
			
		||||
A X86 gather instruction performance benchmark
 | 
			
		||||
							
								
								
									
										22
									
								
								util/gather-bench/config.mk
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										22
									
								
								util/gather-bench/config.mk
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,22 @@
 | 
			
		||||
# Supported: GCC, CLANG, ICC
 | 
			
		||||
TAG ?= ICC
 | 
			
		||||
# Supported: avx2, avx512
 | 
			
		||||
ISA ?= avx512
 | 
			
		||||
# Use likwid?
 | 
			
		||||
ENABLE_LIKWID ?= false
 | 
			
		||||
 | 
			
		||||
# SP or DP
 | 
			
		||||
DATA_TYPE ?= DP
 | 
			
		||||
# AOS or SOA
 | 
			
		||||
DATA_LAYOUT ?= AOS
 | 
			
		||||
# Padding byte for AoS
 | 
			
		||||
PADDING ?= false
 | 
			
		||||
# Measure cycles for each gather separately
 | 
			
		||||
MEASURE_GATHER_CYCLES ?= false
 | 
			
		||||
# Gather data only for first dimension (one gather per iteration)
 | 
			
		||||
ONLY_FIRST_DIMENSION ?= false
 | 
			
		||||
 | 
			
		||||
# Trace memory addresses for cache simulator
 | 
			
		||||
MEM_TRACER ?= false
 | 
			
		||||
# Test correctness of gather kernels
 | 
			
		||||
TEST ?= false
 | 
			
		||||
							
								
								
									
										9
									
								
								util/gather-bench/include_CLANG.mk
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										9
									
								
								util/gather-bench/include_CLANG.mk
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,9 @@
 | 
			
		||||
CC  = clang
 | 
			
		||||
LINKER = $(CC)
 | 
			
		||||
 | 
			
		||||
OPENMP   =# -fopenmp
 | 
			
		||||
CFLAGS   = -Ofast -std=c11 -march=core-avx2 -mavx -mfma  $(OPENMP)
 | 
			
		||||
LFLAGS   = $(OPENMP) -march=core-avx2 -mavx -mfma
 | 
			
		||||
DEFINES  = -D_GNU_SOURCE
 | 
			
		||||
INCLUDES =
 | 
			
		||||
LIBS     =
 | 
			
		||||
							
								
								
									
										11
									
								
								util/gather-bench/include_GCC.mk
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										11
									
								
								util/gather-bench/include_GCC.mk
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,11 @@
 | 
			
		||||
CC  = gcc
 | 
			
		||||
AS  = as
 | 
			
		||||
LINKER = $(CC)
 | 
			
		||||
 | 
			
		||||
OPENMP   = -fopenmp
 | 
			
		||||
CFLAGS   = -Ofast -std=c11 -mavx2 -mfma $(OPENMP)
 | 
			
		||||
ASFLAGS  =
 | 
			
		||||
LFLAGS   = $(OPENMP) -mavx2 -mfma
 | 
			
		||||
DEFINES  = -D_GNU_SOURCE
 | 
			
		||||
INCLUDES =
 | 
			
		||||
LIBS     =
 | 
			
		||||
							
								
								
									
										9
									
								
								util/gather-bench/include_ICC.mk
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										9
									
								
								util/gather-bench/include_ICC.mk
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,9 @@
 | 
			
		||||
CC  = icc
 | 
			
		||||
LINKER = $(CC)
 | 
			
		||||
 | 
			
		||||
OPENMP   = -qopenmp
 | 
			
		||||
CFLAGS   = -Ofast -xhost -std=c11 $(OPENMP)
 | 
			
		||||
LFLAGS   = $(OPENMP)
 | 
			
		||||
DEFINES  = -D_GNU_SOURCE
 | 
			
		||||
INCLUDES =
 | 
			
		||||
LIBS     =
 | 
			
		||||
							
								
								
									
										10
									
								
								util/gather-bench/include_LIKWID.mk
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										10
									
								
								util/gather-bench/include_LIKWID.mk
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,10 @@
 | 
			
		||||
LIKWID_INC ?= -I/usr/local/include
 | 
			
		||||
LIKWID_DEFINES ?= -DLIKWID_PERFMON
 | 
			
		||||
LIKWID_LIB ?= -L/usr/local/lib
 | 
			
		||||
 | 
			
		||||
ifeq ($(strip $(ENABLE_LIKWID)),true)
 | 
			
		||||
INCLUDES += ${LIKWID_INC}
 | 
			
		||||
DEFINES +=  ${LIKWID_DEFINES}
 | 
			
		||||
LIBS += -llikwid
 | 
			
		||||
LFLAGS += ${LIKWID_LIB}
 | 
			
		||||
endif
 | 
			
		||||
							
								
								
									
										57
									
								
								util/gather-bench/src/allocate.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										57
									
								
								util/gather-bench/src/allocate.c
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,57 @@
 | 
			
		||||
/*
 | 
			
		||||
 * =======================================================================================
 | 
			
		||||
 *
 | 
			
		||||
 *      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
 | 
			
		||||
 *      Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
 | 
			
		||||
 *
 | 
			
		||||
 *      Permission is hereby granted, free of charge, to any person obtaining a copy
 | 
			
		||||
 *      of this software and associated documentation files (the "Software"), to deal
 | 
			
		||||
 *      in the Software without restriction, including without limitation the rights
 | 
			
		||||
 *      to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 | 
			
		||||
 *      copies of the Software, and to permit persons to whom the Software is
 | 
			
		||||
 *      furnished to do so, subject to the following conditions:
 | 
			
		||||
 *
 | 
			
		||||
 *      The above copyright notice and this permission notice shall be included in all
 | 
			
		||||
 *      copies or substantial portions of the Software.
 | 
			
		||||
 *
 | 
			
		||||
 *      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 | 
			
		||||
 *      IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 | 
			
		||||
 *      FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 | 
			
		||||
 *      AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 | 
			
		||||
 *      LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 | 
			
		||||
 *      OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 | 
			
		||||
 *      SOFTWARE.
 | 
			
		||||
 *
 | 
			
		||||
 * =======================================================================================
 | 
			
		||||
 */
 | 
			
		||||
#include <stdlib.h>
 | 
			
		||||
#include <stdio.h>
 | 
			
		||||
#include <errno.h>
 | 
			
		||||
 | 
			
		||||
void* allocate (int alignment, size_t bytesize)
 | 
			
		||||
{
 | 
			
		||||
    int errorCode;
 | 
			
		||||
    void* ptr;
 | 
			
		||||
 | 
			
		||||
    errorCode =  posix_memalign(&ptr, alignment, bytesize);
 | 
			
		||||
 | 
			
		||||
    if (errorCode) {
 | 
			
		||||
        if (errorCode == EINVAL) {
 | 
			
		||||
            fprintf(stderr,
 | 
			
		||||
                    "Error: Alignment parameter is not a power of two\n");
 | 
			
		||||
            exit(EXIT_FAILURE);
 | 
			
		||||
        }
 | 
			
		||||
        if (errorCode == ENOMEM) {
 | 
			
		||||
            fprintf(stderr,
 | 
			
		||||
                    "Error: Insufficient memory to fulfill the request\n");
 | 
			
		||||
            exit(EXIT_FAILURE);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if (ptr == NULL) {
 | 
			
		||||
        fprintf(stderr, "Error: posix_memalign failed!\n");
 | 
			
		||||
        exit(EXIT_FAILURE);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    return ptr;
 | 
			
		||||
}
 | 
			
		||||
							
								
								
									
										63
									
								
								util/gather-bench/src/avx2/gather.S
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										63
									
								
								util/gather-bench/src/avx2/gather.S
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,63 @@
 | 
			
		||||
.intel_syntax noprefix
 | 
			
		||||
.data
 | 
			
		||||
.align 64
 | 
			
		||||
SCALAR:
 | 
			
		||||
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
 | 
			
		||||
 | 
			
		||||
# rdi -> a
 | 
			
		||||
# rsi -> idx
 | 
			
		||||
# rdx -> N
 | 
			
		||||
# rcx -> t
 | 
			
		||||
.text
 | 
			
		||||
.globl gather
 | 
			
		||||
.type gather, @function
 | 
			
		||||
gather :
 | 
			
		||||
push rbp
 | 
			
		||||
mov rbp, rsp
 | 
			
		||||
push rbx
 | 
			
		||||
push r12
 | 
			
		||||
push r13
 | 
			
		||||
push r14
 | 
			
		||||
push r15
 | 
			
		||||
 | 
			
		||||
xor   rax, rax
 | 
			
		||||
vpcmpeqd ymm0, ymm0, ymm0
 | 
			
		||||
.align 16
 | 
			
		||||
1:
 | 
			
		||||
vmovups xmm1, [rsi + rax * 4]
 | 
			
		||||
vmovups xmm2, [rsi + rax * 4 + 16]
 | 
			
		||||
vmovups xmm3, [rsi + rax * 4 + 32]
 | 
			
		||||
vmovups xmm4, [rsi + rax * 4 + 48]
 | 
			
		||||
vmovdqa ymm5, ymm0
 | 
			
		||||
vmovdqa ymm6, ymm0
 | 
			
		||||
vmovdqa ymm7, ymm0
 | 
			
		||||
vmovdqa ymm8, ymm0
 | 
			
		||||
vxorpd ymm9,  ymm9,  ymm9
 | 
			
		||||
vxorpd ymm10, ymm10, ymm10
 | 
			
		||||
vxorpd ymm11, ymm11, ymm11
 | 
			
		||||
vxorpd ymm12, ymm12, ymm12
 | 
			
		||||
vgatherdpd ymm9,  [rdi + xmm1 * 8], ymm5
 | 
			
		||||
vgatherdpd ymm10, [rdi + xmm2 * 8], ymm6
 | 
			
		||||
vgatherdpd ymm11, [rdi + xmm3 * 8], ymm7
 | 
			
		||||
vgatherdpd ymm12, [rdi + xmm4 * 8], ymm8
 | 
			
		||||
 | 
			
		||||
#ifdef TEST
 | 
			
		||||
vmovapd [rcx + rax * 8],      ymm9
 | 
			
		||||
vmovapd [rcx + rax * 8 + 32], ymm10
 | 
			
		||||
vmovapd [rcx + rax * 8 + 64], ymm11
 | 
			
		||||
vmovapd [rcx + rax * 8 + 96], ymm12
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
addq rax, 16
 | 
			
		||||
cmpq rax, rdx
 | 
			
		||||
jl 1b
 | 
			
		||||
 | 
			
		||||
pop r15
 | 
			
		||||
pop r14
 | 
			
		||||
pop r13
 | 
			
		||||
pop r12
 | 
			
		||||
pop rbx
 | 
			
		||||
mov  rsp, rbp
 | 
			
		||||
pop rbp
 | 
			
		||||
ret
 | 
			
		||||
.size gather, .-gather
 | 
			
		||||
							
								
								
									
										71
									
								
								util/gather-bench/src/avx2/gather_aos.S
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										71
									
								
								util/gather-bench/src/avx2/gather_aos.S
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,71 @@
 | 
			
		||||
.intel_syntax noprefix
 | 
			
		||||
.data
 | 
			
		||||
.align 64
 | 
			
		||||
SCALAR:
 | 
			
		||||
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
 | 
			
		||||
 | 
			
		||||
# rdi -> a
 | 
			
		||||
# rsi -> idx
 | 
			
		||||
# rdx -> N
 | 
			
		||||
# rcx -> t
 | 
			
		||||
.text
 | 
			
		||||
.globl gather_aos
 | 
			
		||||
.type gather_aos, @function
 | 
			
		||||
gather_aos :
 | 
			
		||||
push rbp
 | 
			
		||||
mov rbp, rsp
 | 
			
		||||
push rbx
 | 
			
		||||
push r9
 | 
			
		||||
push r10
 | 
			
		||||
push r11
 | 
			
		||||
push r12
 | 
			
		||||
push r13
 | 
			
		||||
push r14
 | 
			
		||||
push r15
 | 
			
		||||
 | 
			
		||||
xor   rax, rax
 | 
			
		||||
vpcmpeqd ymm8, ymm8, ymm8
 | 
			
		||||
.align 16
 | 
			
		||||
1:
 | 
			
		||||
 | 
			
		||||
vmovups xmm3, XMMWORD PTR [rsi + rax * 4]
 | 
			
		||||
vpaddd xmm4, xmm3, xmm3
 | 
			
		||||
#ifdef PADDING
 | 
			
		||||
vpaddd xmm3, xmm4, xmm4
 | 
			
		||||
#else
 | 
			
		||||
vpaddd xmm3, xmm3, xmm4
 | 
			
		||||
#endif
 | 
			
		||||
vmovdqa ymm5, ymm8
 | 
			
		||||
vmovdqa ymm6, ymm8
 | 
			
		||||
vmovdqa ymm7, ymm8
 | 
			
		||||
vxorpd ymm0, ymm0, ymm0
 | 
			
		||||
vxorpd ymm1, ymm1, ymm1
 | 
			
		||||
vxorpd ymm2, ymm2, ymm2
 | 
			
		||||
vgatherdpd ymm0, [     rdi + xmm3 * 8], ymm5
 | 
			
		||||
vgatherdpd ymm1, [8  + rdi + xmm3 * 8], ymm6
 | 
			
		||||
vgatherdpd ymm2, [16 + rdi + xmm3 * 8], ymm7
 | 
			
		||||
 | 
			
		||||
#ifdef TEST
 | 
			
		||||
vmovupd  [rcx + rax * 8], ymm0
 | 
			
		||||
lea rbx, [rcx + rdx * 8]
 | 
			
		||||
vmovupd  [rbx + rax * 8], ymm1
 | 
			
		||||
lea r9,  [rbx + rdx * 8]
 | 
			
		||||
vmovupd  [r9  + rax * 8], ymm2
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
addq rax, 4
 | 
			
		||||
cmpq rax, rdx
 | 
			
		||||
jl 1b
 | 
			
		||||
 | 
			
		||||
pop r15
 | 
			
		||||
pop r14
 | 
			
		||||
pop r13
 | 
			
		||||
pop r12
 | 
			
		||||
pop r11
 | 
			
		||||
pop r10
 | 
			
		||||
pop r9
 | 
			
		||||
pop rbx
 | 
			
		||||
mov  rsp, rbp
 | 
			
		||||
pop rbp
 | 
			
		||||
ret
 | 
			
		||||
.size gather_aos, .-gather_aos
 | 
			
		||||
							
								
								
									
										67
									
								
								util/gather-bench/src/avx2/gather_soa.S
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										67
									
								
								util/gather-bench/src/avx2/gather_soa.S
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,67 @@
 | 
			
		||||
.intel_syntax noprefix
 | 
			
		||||
.data
 | 
			
		||||
.align 64
 | 
			
		||||
SCALAR:
 | 
			
		||||
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
 | 
			
		||||
 | 
			
		||||
# rdi -> a
 | 
			
		||||
# rsi -> idx
 | 
			
		||||
# rdx -> N
 | 
			
		||||
# rcx -> t
 | 
			
		||||
.text
 | 
			
		||||
.globl gather_soa
 | 
			
		||||
.type gather_soa, @function
 | 
			
		||||
gather_soa :
 | 
			
		||||
push rbp
 | 
			
		||||
mov rbp, rsp
 | 
			
		||||
push rbx
 | 
			
		||||
push r9
 | 
			
		||||
push r10
 | 
			
		||||
push r11
 | 
			
		||||
push r12
 | 
			
		||||
push r13
 | 
			
		||||
push r14
 | 
			
		||||
push r15
 | 
			
		||||
 | 
			
		||||
xor rax, rax
 | 
			
		||||
vpcmpeqd ymm8, ymm8, ymm8
 | 
			
		||||
lea r8, [rdi + rdx * 8]
 | 
			
		||||
lea r9, [r8  + rdx * 8]
 | 
			
		||||
.align 16
 | 
			
		||||
1:
 | 
			
		||||
 | 
			
		||||
vmovups xmm3, XMMWORD PTR [rsi + rax * 4]
 | 
			
		||||
vmovdqa ymm5, ymm8
 | 
			
		||||
vmovdqa ymm6, ymm8
 | 
			
		||||
vmovdqa ymm7, ymm8
 | 
			
		||||
vxorpd ymm0, ymm0, ymm0
 | 
			
		||||
vxorpd ymm1, ymm1, ymm1
 | 
			
		||||
vxorpd ymm2, ymm2, ymm2
 | 
			
		||||
vgatherdpd ymm0, [rdi + xmm3 * 8], ymm5
 | 
			
		||||
vgatherdpd ymm1, [r8  + xmm3 * 8], ymm6
 | 
			
		||||
vgatherdpd ymm2, [r9  + xmm3 * 8], ymm7
 | 
			
		||||
 | 
			
		||||
#ifdef TEST
 | 
			
		||||
vmovupd  [rcx + rax * 8], ymm0
 | 
			
		||||
lea rbx, [rcx + rdx * 8]
 | 
			
		||||
vmovupd  [rbx + rax * 8], ymm1
 | 
			
		||||
lea r10, [rbx + rdx * 8]
 | 
			
		||||
vmovupd  [r10 + rax * 8], ymm2
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
addq rax, 4
 | 
			
		||||
cmpq rax, rdx
 | 
			
		||||
jl 1b
 | 
			
		||||
 | 
			
		||||
pop r15
 | 
			
		||||
pop r14
 | 
			
		||||
pop r13
 | 
			
		||||
pop r12
 | 
			
		||||
pop r11
 | 
			
		||||
pop r10
 | 
			
		||||
pop r9
 | 
			
		||||
pop rbx
 | 
			
		||||
mov  rsp, rbp
 | 
			
		||||
pop rbp
 | 
			
		||||
ret
 | 
			
		||||
.size gather_soa, .-gather_soa
 | 
			
		||||
							
								
								
									
										62
									
								
								util/gather-bench/src/avx512/gather.S
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										62
									
								
								util/gather-bench/src/avx512/gather.S
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,62 @@
 | 
			
		||||
.intel_syntax noprefix
 | 
			
		||||
.data
 | 
			
		||||
.align 64
 | 
			
		||||
SCALAR:
 | 
			
		||||
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
 | 
			
		||||
 | 
			
		||||
# rdi -> a
 | 
			
		||||
# rsi -> idx
 | 
			
		||||
# rdx -> N
 | 
			
		||||
# rcx -> t
 | 
			
		||||
.text
 | 
			
		||||
.globl gather
 | 
			
		||||
.type gather, @function
 | 
			
		||||
gather :
 | 
			
		||||
push rbp
 | 
			
		||||
mov rbp, rsp
 | 
			
		||||
push rbx
 | 
			
		||||
push r12
 | 
			
		||||
push r13
 | 
			
		||||
push r14
 | 
			
		||||
push r15
 | 
			
		||||
 | 
			
		||||
xor   rax, rax
 | 
			
		||||
.align 16
 | 
			
		||||
1:
 | 
			
		||||
vpcmpeqb k1, xmm0, xmm0
 | 
			
		||||
vpcmpeqb k2, xmm0, xmm0
 | 
			
		||||
vpcmpeqb k3, xmm0, xmm0
 | 
			
		||||
vpcmpeqb k4, xmm0, xmm0
 | 
			
		||||
vmovdqu ymm0, [rsi + rax * 4]
 | 
			
		||||
vmovdqu ymm1, [rsi + rax * 4 + 32]
 | 
			
		||||
vmovdqu ymm2, [rsi + rax * 4 + 64]
 | 
			
		||||
vmovdqu ymm3, [rsi + rax * 4 + 96]
 | 
			
		||||
vpxord zmm4, zmm4, zmm4
 | 
			
		||||
vpxord zmm5, zmm5, zmm5
 | 
			
		||||
vpxord zmm6, zmm6, zmm6
 | 
			
		||||
vpxord zmm7, zmm7, zmm7
 | 
			
		||||
vgatherdpd zmm4{k1}, [rdi + ymm0 * 8]
 | 
			
		||||
vgatherdpd zmm5{k2}, [rdi + ymm1 * 8]
 | 
			
		||||
vgatherdpd zmm6{k3}, [rdi + ymm2 * 8]
 | 
			
		||||
vgatherdpd zmm7{k4}, [rdi + ymm3 * 8]
 | 
			
		||||
 | 
			
		||||
#ifdef TEST
 | 
			
		||||
vmovapd [rcx + rax * 8],       zmm4
 | 
			
		||||
vmovapd [rcx + rax * 8 + 64],  zmm5
 | 
			
		||||
vmovapd [rcx + rax * 8 + 128], zmm6
 | 
			
		||||
vmovapd [rcx + rax * 8 + 192], zmm7
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
addq rax, 32
 | 
			
		||||
cmpq rax, rdx
 | 
			
		||||
jl 1b
 | 
			
		||||
 | 
			
		||||
pop r15
 | 
			
		||||
pop r14
 | 
			
		||||
pop r13
 | 
			
		||||
pop r12
 | 
			
		||||
pop rbx
 | 
			
		||||
mov  rsp, rbp
 | 
			
		||||
pop rbp
 | 
			
		||||
ret
 | 
			
		||||
.size gather, .-gather
 | 
			
		||||
							
								
								
									
										151
									
								
								util/gather-bench/src/avx512/gather_aos.S
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										151
									
								
								util/gather-bench/src/avx512/gather_aos.S
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,151 @@
 | 
			
		||||
.intel_syntax noprefix
 | 
			
		||||
.data
 | 
			
		||||
.align 64
 | 
			
		||||
SCALAR:
 | 
			
		||||
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
 | 
			
		||||
 | 
			
		||||
# rdi -> a
 | 
			
		||||
# rsi -> idx
 | 
			
		||||
# rdx -> N
 | 
			
		||||
# rcx -> t
 | 
			
		||||
# r8  -> cycles
 | 
			
		||||
.text
 | 
			
		||||
.globl gather_aos
 | 
			
		||||
.type gather_aos, @function
 | 
			
		||||
gather_aos :
 | 
			
		||||
push rbp
 | 
			
		||||
mov rbp, rsp
 | 
			
		||||
push rbx
 | 
			
		||||
push r9
 | 
			
		||||
push r10
 | 
			
		||||
push r11
 | 
			
		||||
push r12
 | 
			
		||||
push r13
 | 
			
		||||
push r14
 | 
			
		||||
push r15
 | 
			
		||||
 | 
			
		||||
xor   rax, rax
 | 
			
		||||
.align 16
 | 
			
		||||
1:
 | 
			
		||||
 | 
			
		||||
vmovdqu ymm3, YMMWORD PTR [rsi + rax * 4]
 | 
			
		||||
vpaddd ymm4, ymm3, ymm3
 | 
			
		||||
#ifdef PADDING
 | 
			
		||||
vpaddd ymm3, ymm4, ymm4
 | 
			
		||||
#else
 | 
			
		||||
vpaddd ymm3, ymm3, ymm4
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
# Prefetching instructions
 | 
			
		||||
#mov ebx, DWORD PTR[rsi + rax*4]
 | 
			
		||||
#mov r9d, DWORD PTR[4 + rsi + rax*4]
 | 
			
		||||
#mov r10d, DWORD PTR[8 + rsi + rax*4]
 | 
			
		||||
#mov r11d, DWORD PTR[12 + rsi + rax*4]
 | 
			
		||||
#mov r12d, DWORD PTR[16 + rsi + rax*4]
 | 
			
		||||
#mov r13d, DWORD PTR[20 + rsi + rax*4]
 | 
			
		||||
#mov r14d, DWORD PTR[24 + rsi + rax*4]
 | 
			
		||||
#mov r15d, DWORD PTR[28 + rsi + rax*4]
 | 
			
		||||
#lea ebx, DWORD PTR[rbx]
 | 
			
		||||
#lea r9d, DWORD PTR[r9]
 | 
			
		||||
#lea r10d, DWORD PTR[r10]
 | 
			
		||||
#lea r11d, DWORD PTR[r11]
 | 
			
		||||
#lea r12d, DWORD PTR[r12]
 | 
			
		||||
#lea r13d, DWORD PTR[r13]
 | 
			
		||||
#lea r14d, DWORD PTR[r14]
 | 
			
		||||
#lea r15d, DWORD PTR[r15]
 | 
			
		||||
 | 
			
		||||
vpcmpeqb k1, xmm5, xmm5
 | 
			
		||||
#ifndef ONLY_FIRST_DIMENSION
 | 
			
		||||
vpcmpeqb k2, xmm5, xmm5
 | 
			
		||||
vpcmpeqb k3, xmm5, xmm5
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
vpxord zmm0, zmm0, zmm0
 | 
			
		||||
#ifndef ONLY_FIRST_DIMENSION
 | 
			
		||||
vpxord zmm1, zmm1, zmm1
 | 
			
		||||
vpxord zmm2, zmm2, zmm2
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef MEASURE_GATHER_CYCLES
 | 
			
		||||
 | 
			
		||||
mov r9, rax
 | 
			
		||||
mov r10, rdx
 | 
			
		||||
xor r11, r11
 | 
			
		||||
add r11, rax
 | 
			
		||||
add r11, rax
 | 
			
		||||
add r11, rax
 | 
			
		||||
#shr r11, 3
 | 
			
		||||
 | 
			
		||||
xor rbx, rbx
 | 
			
		||||
lfence
 | 
			
		||||
rdtsc
 | 
			
		||||
add ebx, eax
 | 
			
		||||
vgatherdpd zmm0{k1}, [rdi + ymm3 * 8]
 | 
			
		||||
lfence
 | 
			
		||||
rdtsc
 | 
			
		||||
sub eax, ebx
 | 
			
		||||
#movdiri [r8 + r11], rax
 | 
			
		||||
movnti [r8 + r11], rax
 | 
			
		||||
 | 
			
		||||
#ifndef ONLY_FIRST_DIMENSION
 | 
			
		||||
xor rbx, rbx
 | 
			
		||||
lfence
 | 
			
		||||
rdtsc
 | 
			
		||||
add ebx, eax
 | 
			
		||||
vgatherdpd zmm1{k2}, [8 + rdi + ymm3 * 8]
 | 
			
		||||
lfence
 | 
			
		||||
rdtsc
 | 
			
		||||
sub eax, ebx
 | 
			
		||||
#movdiri [8 + r8 + r11], rax
 | 
			
		||||
movnti [8 + r8 + r11], rax
 | 
			
		||||
 | 
			
		||||
xor rbx, rbx
 | 
			
		||||
lfence
 | 
			
		||||
rdtsc
 | 
			
		||||
add ebx, eax
 | 
			
		||||
vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8]
 | 
			
		||||
lfence
 | 
			
		||||
rdtsc
 | 
			
		||||
sub eax, ebx
 | 
			
		||||
#movdiri [16 + r8 + r11], rax
 | 
			
		||||
movnti [16 + r8 + r11], rax
 | 
			
		||||
#endif // ONLY_FIRST_DIMENSION
 | 
			
		||||
 | 
			
		||||
mov rax, r9
 | 
			
		||||
mov rdx, r10
 | 
			
		||||
 | 
			
		||||
#else // MEASURE_GATHER_CYCLES
 | 
			
		||||
 | 
			
		||||
vgatherdpd zmm0{k1}, [     rdi + ymm3 * 8]
 | 
			
		||||
 | 
			
		||||
#ifndef ONLY_FIRST_DIMENSION
 | 
			
		||||
vgatherdpd zmm1{k2}, [8 +  rdi + ymm3 * 8]
 | 
			
		||||
vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8]
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#endif // MEASURE_GATHER_CYCLES
 | 
			
		||||
 | 
			
		||||
#ifdef TEST
 | 
			
		||||
vmovupd  [rcx + rax * 8], zmm0
 | 
			
		||||
lea rbx, [rcx + rdx * 8]
 | 
			
		||||
vmovupd  [rbx + rax * 8], zmm1
 | 
			
		||||
lea r9,  [rbx + rdx * 8]
 | 
			
		||||
vmovupd  [r9  + rax * 8], zmm2
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
addq rax, 8
 | 
			
		||||
cmpq rax, rdx
 | 
			
		||||
jl 1b
 | 
			
		||||
 | 
			
		||||
pop r15
 | 
			
		||||
pop r14
 | 
			
		||||
pop r13
 | 
			
		||||
pop r12
 | 
			
		||||
pop r11
 | 
			
		||||
pop r10
 | 
			
		||||
pop r9
 | 
			
		||||
pop rbx
 | 
			
		||||
mov  rsp, rbp
 | 
			
		||||
pop rbp
 | 
			
		||||
ret
 | 
			
		||||
.size gather_aos, .-gather_aos
 | 
			
		||||
							
								
								
									
										147
									
								
								util/gather-bench/src/avx512/gather_md_aos.S
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										147
									
								
								util/gather-bench/src/avx512/gather_md_aos.S
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,147 @@
 | 
			
		||||
.intel_syntax noprefix
 | 
			
		||||
.data
 | 
			
		||||
.align 64
 | 
			
		||||
SCALAR:
 | 
			
		||||
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
 | 
			
		||||
 | 
			
		||||
.section .rodata, "a"
 | 
			
		||||
.align 64
 | 
			
		||||
.align 64
 | 
			
		||||
.ymm_reg_mask.1:
 | 
			
		||||
	.long	0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
 | 
			
		||||
	.type	.ymm_reg_mask.1,@object
 | 
			
		||||
	.size	.ymm_reg_mask.1,32
 | 
			
		||||
	.align 8
 | 
			
		||||
 | 
			
		||||
# rdi -> a
 | 
			
		||||
# rsi -> neighbors
 | 
			
		||||
# rdx -> numneighs[i]
 | 
			
		||||
# rcx -> &t[t_idx]
 | 
			
		||||
# r8  -> ntest
 | 
			
		||||
.text
 | 
			
		||||
.globl gather_md_aos
 | 
			
		||||
.type gather_md_aos, @function
 | 
			
		||||
gather_md_aos :
 | 
			
		||||
push rbp
 | 
			
		||||
mov rbp, rsp
 | 
			
		||||
push rbx
 | 
			
		||||
push r10
 | 
			
		||||
push r11
 | 
			
		||||
push r12
 | 
			
		||||
push r13
 | 
			
		||||
push r14
 | 
			
		||||
push r15
 | 
			
		||||
 | 
			
		||||
vmovdqu ymm7, YMMWORD PTR .ymm_reg_mask.1[rip]
 | 
			
		||||
mov r15, rdx
 | 
			
		||||
xor rax, rax
 | 
			
		||||
.align 16
 | 
			
		||||
1:
 | 
			
		||||
 | 
			
		||||
vmovdqu ymm3, YMMWORD PTR [rsi + rax * 4]
 | 
			
		||||
vpaddd ymm4, ymm3, ymm3
 | 
			
		||||
#ifdef PADDING
 | 
			
		||||
vpaddd ymm3, ymm4, ymm4
 | 
			
		||||
#else
 | 
			
		||||
vpaddd ymm3, ymm3, ymm4
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
# Prefetching instructions
 | 
			
		||||
#mov ebx, DWORD PTR[rsi + rax*4]
 | 
			
		||||
#mov r9d, DWORD PTR[4 + rsi + rax*4]
 | 
			
		||||
#mov r10d, DWORD PTR[8 + rsi + rax*4]
 | 
			
		||||
#mov r11d, DWORD PTR[12 + rsi + rax*4]
 | 
			
		||||
#mov r12d, DWORD PTR[16 + rsi + rax*4]
 | 
			
		||||
#mov r13d, DWORD PTR[20 + rsi + rax*4]
 | 
			
		||||
#mov r14d, DWORD PTR[24 + rsi + rax*4]
 | 
			
		||||
#mov r15d, DWORD PTR[28 + rsi + rax*4]
 | 
			
		||||
#lea ebx, DWORD PTR[rbx]
 | 
			
		||||
#lea r9d, DWORD PTR[r9]
 | 
			
		||||
#lea r10d, DWORD PTR[r10]
 | 
			
		||||
#lea r11d, DWORD PTR[r11]
 | 
			
		||||
#lea r12d, DWORD PTR[r12]
 | 
			
		||||
#lea r13d, DWORD PTR[r13]
 | 
			
		||||
#lea r14d, DWORD PTR[r14]
 | 
			
		||||
#lea r15d, DWORD PTR[r15]
 | 
			
		||||
 | 
			
		||||
vpcmpeqb k1, xmm5, xmm5
 | 
			
		||||
#ifndef ONLY_FIRST_DIMENSION
 | 
			
		||||
vpcmpeqb k2, xmm5, xmm5
 | 
			
		||||
vpcmpeqb k3, xmm5, xmm5
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
vpxord zmm0, zmm0, zmm0
 | 
			
		||||
#ifndef ONLY_FIRST_DIMENSION
 | 
			
		||||
vpxord zmm1, zmm1, zmm1
 | 
			
		||||
vpxord zmm2, zmm2, zmm2
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
vgatherdpd zmm0{k1}, [     rdi + ymm3 * 8]
 | 
			
		||||
#ifndef ONLY_FIRST_DIMENSION
 | 
			
		||||
vgatherdpd zmm1{k2}, [8 +  rdi + ymm3 * 8]
 | 
			
		||||
vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8]
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef TEST
 | 
			
		||||
vmovupd  [rcx + rax * 8], zmm0
 | 
			
		||||
lea rbx, [rcx + r8  * 8]
 | 
			
		||||
vmovupd  [rbx + rax * 8], zmm1
 | 
			
		||||
lea r10, [rbx + r8  * 8]
 | 
			
		||||
vmovupd  [r10 + rax * 8], zmm2
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
# TODO: see if this logic can be optimized
 | 
			
		||||
addq rax, 8
 | 
			
		||||
subq r15, 8
 | 
			
		||||
cmpq r15, 8
 | 
			
		||||
jge 1b
 | 
			
		||||
 | 
			
		||||
cmpq r15, 0
 | 
			
		||||
jle .end_func
 | 
			
		||||
 | 
			
		||||
vpbroadcastd ymm6, r15d
 | 
			
		||||
vpcmpgtd k1, ymm6, ymm7
 | 
			
		||||
vmovdqu32 ymm3{k1}{z}, YMMWORD PTR [rsi + rax * 4]
 | 
			
		||||
vpaddd ymm4, ymm3, ymm3
 | 
			
		||||
#ifdef PADDING
 | 
			
		||||
vpaddd ymm3, ymm4, ymm4
 | 
			
		||||
#else
 | 
			
		||||
vpaddd ymm3, ymm3, ymm4
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
vpxord    zmm0, zmm1, zmm2
 | 
			
		||||
#ifndef ONLY_FIRST_DIMENSION
 | 
			
		||||
kmovw     k2, k1
 | 
			
		||||
kmovw     k3, k1
 | 
			
		||||
vpxord    zmm1, zmm1, zmm1
 | 
			
		||||
vpxord    zmm2, zmm2, zmm2
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
vgatherdpd zmm0{k1}, [     rdi + ymm3 * 8]
 | 
			
		||||
#ifndef ONLY_FIRST_DIMENSION
 | 
			
		||||
vgatherdpd zmm1{k2}, [8 +  rdi + ymm3 * 8]
 | 
			
		||||
vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8]
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef TEST
 | 
			
		||||
vmovupd  [rcx + rax * 8], zmm0
 | 
			
		||||
lea rbx, [rcx + r8  * 8]
 | 
			
		||||
vmovupd  [rbx + rax * 8], zmm1
 | 
			
		||||
lea r10, [rbx + r8  * 8]
 | 
			
		||||
vmovupd  [r10  + rax * 8], zmm2
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
addq rax, r15
 | 
			
		||||
 | 
			
		||||
.end_func:
 | 
			
		||||
pop r15
 | 
			
		||||
pop r14
 | 
			
		||||
pop r13
 | 
			
		||||
pop r12
 | 
			
		||||
pop r11
 | 
			
		||||
pop r10
 | 
			
		||||
pop rbx
 | 
			
		||||
mov  rsp, rbp
 | 
			
		||||
pop rbp
 | 
			
		||||
ret
 | 
			
		||||
.size gather_md_aos, .-gather_md_aos
 | 
			
		||||
							
								
								
									
										67
									
								
								util/gather-bench/src/avx512/gather_soa.S
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										67
									
								
								util/gather-bench/src/avx512/gather_soa.S
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,67 @@
 | 
			
		||||
.intel_syntax noprefix
 | 
			
		||||
.data
 | 
			
		||||
.align 64
 | 
			
		||||
SCALAR:
 | 
			
		||||
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
 | 
			
		||||
 | 
			
		||||
# rdi -> a
 | 
			
		||||
# rsi -> idx
 | 
			
		||||
# rdx -> N
 | 
			
		||||
# rcx -> t
 | 
			
		||||
.text
 | 
			
		||||
.globl gather_soa
 | 
			
		||||
.type gather_soa, @function
 | 
			
		||||
gather_soa :
 | 
			
		||||
push rbp
 | 
			
		||||
mov rbp, rsp
 | 
			
		||||
push rbx
 | 
			
		||||
push r9
 | 
			
		||||
push r10
 | 
			
		||||
push r11
 | 
			
		||||
push r12
 | 
			
		||||
push r13
 | 
			
		||||
push r14
 | 
			
		||||
push r15
 | 
			
		||||
 | 
			
		||||
xor   rax, rax
 | 
			
		||||
vpcmpeqd ymm8, ymm8, ymm8
 | 
			
		||||
lea r8, [rdi + rdx * 8]
 | 
			
		||||
lea r9, [r8  + rdx * 8]
 | 
			
		||||
.align 16
 | 
			
		||||
1:
 | 
			
		||||
 | 
			
		||||
vmovdqu ymm3, YMMWORD PTR [rsi + rax * 4]
 | 
			
		||||
vpcmpeqb k1, xmm5, xmm5
 | 
			
		||||
vpcmpeqb k2, xmm5, xmm5
 | 
			
		||||
vpcmpeqb k3, xmm5, xmm5
 | 
			
		||||
vpxord zmm0, zmm0, zmm0
 | 
			
		||||
vpxord zmm1, zmm1, zmm1
 | 
			
		||||
vpxord zmm2, zmm2, zmm2
 | 
			
		||||
vgatherdpd zmm0{k1}, [rdi + ymm3 * 8]
 | 
			
		||||
vgatherdpd zmm1{k2}, [r8  + ymm3 * 8]
 | 
			
		||||
vgatherdpd zmm2{k3}, [r9  + ymm3 * 8]
 | 
			
		||||
 | 
			
		||||
#ifdef TEST
 | 
			
		||||
vmovupd  [rcx + rax * 8], zmm0
 | 
			
		||||
lea rbx, [rcx + rdx * 8]
 | 
			
		||||
vmovupd  [rbx + rax * 8], zmm1
 | 
			
		||||
lea r10, [rbx + rdx * 8]
 | 
			
		||||
vmovupd  [r10 + rax * 8], zmm2
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
addq rax, 8
 | 
			
		||||
cmpq rax, rdx
 | 
			
		||||
jl 1b
 | 
			
		||||
 | 
			
		||||
pop r15
 | 
			
		||||
pop r14
 | 
			
		||||
pop r13
 | 
			
		||||
pop r12
 | 
			
		||||
pop r11
 | 
			
		||||
pop r10
 | 
			
		||||
pop r9
 | 
			
		||||
pop rbx
 | 
			
		||||
mov  rsp, rbp
 | 
			
		||||
pop rbp
 | 
			
		||||
ret
 | 
			
		||||
.size gather_soa, .-gather_soa
 | 
			
		||||
							
								
								
									
										23
									
								
								util/gather-bench/src/avx512/load_aos.S
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								util/gather-bench/src/avx512/load_aos.S
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,23 @@
 | 
			
		||||
.intel_syntax noprefix
 | 
			
		||||
.data
 | 
			
		||||
.align 64
 | 
			
		||||
SCALAR:
 | 
			
		||||
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
 | 
			
		||||
 | 
			
		||||
# rdi -> &a[i * snbytes]
 | 
			
		||||
 | 
			
		||||
.text
 | 
			
		||||
.globl load_aos
 | 
			
		||||
.type load_aos, @function
 | 
			
		||||
load_aos :
 | 
			
		||||
 | 
			
		||||
vmovsd xmm0, QWORD PTR [rdi]
 | 
			
		||||
vmovsd xmm1, QWORD PTR [8  + rdi]
 | 
			
		||||
vmovsd xmm2, QWORD PTR [16 + rdi]
 | 
			
		||||
 | 
			
		||||
vbroadcastsd zmm3, xmm0
 | 
			
		||||
vbroadcastsd zmm4, xmm1
 | 
			
		||||
vbroadcastsd zmm5, xmm2
 | 
			
		||||
 | 
			
		||||
ret
 | 
			
		||||
.size load_aos, .-load_aos
 | 
			
		||||
							
								
								
									
										32
									
								
								util/gather-bench/src/includes/allocate.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										32
									
								
								util/gather-bench/src/includes/allocate.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,32 @@
 | 
			
		||||
/*
 | 
			
		||||
 * =======================================================================================
 | 
			
		||||
 *
 | 
			
		||||
 *      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
 | 
			
		||||
 *      Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
 | 
			
		||||
 *
 | 
			
		||||
 *      Permission is hereby granted, free of charge, to any person obtaining a copy
 | 
			
		||||
 *      of this software and associated documentation files (the "Software"), to deal
 | 
			
		||||
 *      in the Software without restriction, including without limitation the rights
 | 
			
		||||
 *      to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 | 
			
		||||
 *      copies of the Software, and to permit persons to whom the Software is
 | 
			
		||||
 *      furnished to do so, subject to the following conditions:
 | 
			
		||||
 *
 | 
			
		||||
 *      The above copyright notice and this permission notice shall be included in all
 | 
			
		||||
 *      copies or substantial portions of the Software.
 | 
			
		||||
 *
 | 
			
		||||
 *      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 | 
			
		||||
 *      IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 | 
			
		||||
 *      FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 | 
			
		||||
 *      AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 | 
			
		||||
 *      LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 | 
			
		||||
 *      OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 | 
			
		||||
 *      SOFTWARE.
 | 
			
		||||
 *
 | 
			
		||||
 * =======================================================================================
 | 
			
		||||
 */
 | 
			
		||||
#ifndef __ALLOCATE_H_
 | 
			
		||||
#define __ALLOCATE_H_
 | 
			
		||||
 | 
			
		||||
extern void* allocate (int alignment, size_t bytesize);
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
							
								
								
									
										53
									
								
								util/gather-bench/src/includes/likwid-marker.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										53
									
								
								util/gather-bench/src/includes/likwid-marker.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,53 @@
 | 
			
		||||
/*
 | 
			
		||||
 * =======================================================================================
 | 
			
		||||
 *
 | 
			
		||||
 *      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
 | 
			
		||||
 *      Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
 | 
			
		||||
 *
 | 
			
		||||
 *      Permission is hereby granted, free of charge, to any person obtaining a copy
 | 
			
		||||
 *      of this software and associated documentation files (the "Software"), to deal
 | 
			
		||||
 *      in the Software without restriction, including without limitation the rights
 | 
			
		||||
 *      to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 | 
			
		||||
 *      copies of the Software, and to permit persons to whom the Software is
 | 
			
		||||
 *      furnished to do so, subject to the following conditions:
 | 
			
		||||
 *
 | 
			
		||||
 *      The above copyright notice and this permission notice shall be included in all
 | 
			
		||||
 *      copies or substantial portions of the Software.
 | 
			
		||||
 *
 | 
			
		||||
 *      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 | 
			
		||||
 *      IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 | 
			
		||||
 *      FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 | 
			
		||||
 *      AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 | 
			
		||||
 *      LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 | 
			
		||||
 *      OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 | 
			
		||||
 *      SOFTWARE.
 | 
			
		||||
 *
 | 
			
		||||
 * =======================================================================================
 | 
			
		||||
 */
 | 
			
		||||
#ifndef LIKWID_MARKERS_H
 | 
			
		||||
#define LIKWID_MARKERS_H
 | 
			
		||||
 | 
			
		||||
#ifdef LIKWID_PERFMON
 | 
			
		||||
#include <likwid.h>
 | 
			
		||||
#define LIKWID_MARKER_INIT likwid_markerInit()
 | 
			
		||||
#define LIKWID_MARKER_THREADINIT likwid_markerThreadInit()
 | 
			
		||||
#define LIKWID_MARKER_SWITCH likwid_markerNextGroup()
 | 
			
		||||
#define LIKWID_MARKER_REGISTER(regionTag) likwid_markerRegisterRegion(regionTag)
 | 
			
		||||
#define LIKWID_MARKER_START(regionTag) likwid_markerStartRegion(regionTag)
 | 
			
		||||
#define LIKWID_MARKER_STOP(regionTag) likwid_markerStopRegion(regionTag)
 | 
			
		||||
#define LIKWID_MARKER_CLOSE likwid_markerClose()
 | 
			
		||||
#define LIKWID_MARKER_RESET(regionTag) likwid_markerResetRegion(regionTag)
 | 
			
		||||
#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count) likwid_markerGetRegion(regionTag, nevents, events, time, count)
 | 
			
		||||
#else  /* LIKWID_PERFMON */
 | 
			
		||||
#define LIKWID_MARKER_INIT
 | 
			
		||||
#define LIKWID_MARKER_THREADINIT
 | 
			
		||||
#define LIKWID_MARKER_SWITCH
 | 
			
		||||
#define LIKWID_MARKER_REGISTER(regionTag)
 | 
			
		||||
#define LIKWID_MARKER_START(regionTag)
 | 
			
		||||
#define LIKWID_MARKER_STOP(regionTag)
 | 
			
		||||
#define LIKWID_MARKER_CLOSE
 | 
			
		||||
#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count)
 | 
			
		||||
#define LIKWID_MARKER_RESET(regionTag)
 | 
			
		||||
#endif /* LIKWID_PERFMON */
 | 
			
		||||
 | 
			
		||||
#endif /*LIKWID_MARKERS_H*/
 | 
			
		||||
							
								
								
									
										34
									
								
								util/gather-bench/src/includes/timing.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										34
									
								
								util/gather-bench/src/includes/timing.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,34 @@
 | 
			
		||||
/*
 | 
			
		||||
 * =======================================================================================
 | 
			
		||||
 *
 | 
			
		||||
 *      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
 | 
			
		||||
 *      Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
 | 
			
		||||
 *
 | 
			
		||||
 *      Permission is hereby granted, free of charge, to any person obtaining a copy
 | 
			
		||||
 *      of this software and associated documentation files (the "Software"), to deal
 | 
			
		||||
 *      in the Software without restriction, including without limitation the rights
 | 
			
		||||
 *      to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 | 
			
		||||
 *      copies of the Software, and to permit persons to whom the Software is
 | 
			
		||||
 *      furnished to do so, subject to the following conditions:
 | 
			
		||||
 *
 | 
			
		||||
 *      The above copyright notice and this permission notice shall be included in all
 | 
			
		||||
 *      copies or substantial portions of the Software.
 | 
			
		||||
 *
 | 
			
		||||
 *      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 | 
			
		||||
 *      IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 | 
			
		||||
 *      FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 | 
			
		||||
 *      AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 | 
			
		||||
 *      LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 | 
			
		||||
 *      OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 | 
			
		||||
 *      SOFTWARE.
 | 
			
		||||
 *
 | 
			
		||||
 * =======================================================================================
 | 
			
		||||
 */
 | 
			
		||||
#ifndef __TIMING_H_
 | 
			
		||||
#define __TIMING_H_
 | 
			
		||||
 | 
			
		||||
extern double getTimeStamp();
 | 
			
		||||
extern double getTimeResolution();
 | 
			
		||||
extern double getTimeStamp_();
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
							
								
								
									
										441
									
								
								util/gather-bench/src/main-md-trace.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										441
									
								
								util/gather-bench/src/main-md-trace.c
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,441 @@
 | 
			
		||||
/*
 | 
			
		||||
 * =======================================================================================
 | 
			
		||||
 *
 | 
			
		||||
 *      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
 | 
			
		||||
 *      Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
 | 
			
		||||
 *
 | 
			
		||||
 *      Permission is hereby granted, free of charge, to any person obtaining a copy
 | 
			
		||||
 *      of this software and associated documentation files (the "Software"), to deal
 | 
			
		||||
 *      in the Software without restriction, including without limitation the rights
 | 
			
		||||
 *      to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 | 
			
		||||
 *      copies of the Software, and to permit persons to whom the Software is
 | 
			
		||||
 *      furnished to do so, subject to the following conditions:
 | 
			
		||||
 *
 | 
			
		||||
 *      The above copyright notice and this permission notice shall be included in all
 | 
			
		||||
 *      copies or substantial portions of the Software.
 | 
			
		||||
 *
 | 
			
		||||
 *      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 | 
			
		||||
 *      IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 | 
			
		||||
 *      FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 | 
			
		||||
 *      AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 | 
			
		||||
 *      LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 | 
			
		||||
 *      OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 | 
			
		||||
 *      SOFTWARE.
 | 
			
		||||
 *
 | 
			
		||||
 * =======================================================================================
 | 
			
		||||
 */
 | 
			
		||||
#include <float.h>
 | 
			
		||||
#include <getopt.h>
 | 
			
		||||
#include <limits.h>
 | 
			
		||||
#include <stdio.h>
 | 
			
		||||
#include <stdlib.h>
 | 
			
		||||
#include <string.h>
 | 
			
		||||
#include <unistd.h>
 | 
			
		||||
#include <x86intrin.h>
 | 
			
		||||
//---
 | 
			
		||||
#include <likwid-marker.h>
 | 
			
		||||
//---
 | 
			
		||||
#include <allocate.h>
 | 
			
		||||
#include <timing.h>
 | 
			
		||||
 | 
			
		||||
#if !defined(ISA_avx2) && !defined (ISA_avx512)
 | 
			
		||||
#error "Invalid ISA macro, possible values are: avx2 and avx512"
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#if defined(TEST) && defined(ONLY_FIRST_DIMENSION)
 | 
			
		||||
#error "TEST and ONLY_FIRST_DIMENSION options are mutually exclusive!"
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#define HLINE "----------------------------------------------------------------------------\n"
 | 
			
		||||
 | 
			
		||||
#ifndef MIN
 | 
			
		||||
#define MIN(x,y) ((x)<(y)?(x):(y))
 | 
			
		||||
#endif
 | 
			
		||||
#ifndef MAX
 | 
			
		||||
#define MAX(x,y) ((x)>(y)?(x):(y))
 | 
			
		||||
#endif
 | 
			
		||||
#ifndef ABS
 | 
			
		||||
#define ABS(a) ((a) >= 0 ? (a) : -(a))
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#define ARRAY_ALIGNMENT  64
 | 
			
		||||
 | 
			
		||||
#ifdef ISA_avx512
 | 
			
		||||
#define _VL_  8
 | 
			
		||||
#define ISA_STRING "avx512"
 | 
			
		||||
#else
 | 
			
		||||
#define _VL_  4
 | 
			
		||||
#define ISA_STRING "avx2"
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef AOS
 | 
			
		||||
#define GATHER gather_md_aos
 | 
			
		||||
#define LOAD(a, i, d, n) load_aos(&a[i * d])
 | 
			
		||||
#define LAYOUT_STRING "AoS"
 | 
			
		||||
#else
 | 
			
		||||
#define GATHER gather_md_soa
 | 
			
		||||
#define LOAD(a, i, d, n) load_soa(a, i, n)
 | 
			
		||||
#define LAYOUT_STRING "SoA"
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#if defined(PADDING) && defined(AOS)
 | 
			
		||||
#define PADDING_BYTES 1
 | 
			
		||||
#else
 | 
			
		||||
#define PADDING_BYTES 0
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef MEM_TRACER
 | 
			
		||||
#   define MEM_TRACER_INIT(trace_file)    FILE *mem_tracer_fp = fopen(get_mem_tracer_filename(trace_file), "w");
 | 
			
		||||
#   define MEM_TRACER_END                 fclose(mem_tracer_fp);
 | 
			
		||||
#   define MEM_TRACE(addr, op)            fprintf(mem_tracer_fp, "%c: %p\n", op, (void *)(&(addr)));
 | 
			
		||||
#else
 | 
			
		||||
#   define MEM_TRACER_INIT
 | 
			
		||||
#   define MEM_TRACER_END
 | 
			
		||||
#   define MEM_TRACE(addr, op)
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
int gather_md_aos(double*, int*, int, double*, int);
 | 
			
		||||
int gather_md_soa(double*, int*, int, double*, int);
 | 
			
		||||
void load_aos(double*);
 | 
			
		||||
void load_soa(double*, int, int);
 | 
			
		||||
 | 
			
		||||
const char *get_mem_tracer_filename(const char *trace_file) {
 | 
			
		||||
    static char fname[64];
 | 
			
		||||
    snprintf(fname, sizeof fname, "mem_tracer_%s.txt", trace_file);
 | 
			
		||||
    return fname;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
int log2_uint(unsigned int x) {
 | 
			
		||||
    int ans = 0;
 | 
			
		||||
    while(x >>= 1) { ans++; }
 | 
			
		||||
    return ans;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
int main (int argc, char** argv) {
 | 
			
		||||
    LIKWID_MARKER_INIT;
 | 
			
		||||
    LIKWID_MARKER_REGISTER("gather");
 | 
			
		||||
    char *trace_file = NULL;
 | 
			
		||||
    int cl_size = 64;
 | 
			
		||||
    int ntimesteps = 200;
 | 
			
		||||
    int reneigh_every = 20;
 | 
			
		||||
    int opt = 0;
 | 
			
		||||
    double freq = 2.5;
 | 
			
		||||
    struct option long_opts[] = {
 | 
			
		||||
        {"trace" ,      required_argument,   NULL,   't'},
 | 
			
		||||
        {"freq",        required_argument,   NULL,   'f'},
 | 
			
		||||
        {"line",        required_argument,   NULL,   'l'},
 | 
			
		||||
        {"timesteps",   required_argument,   NULL,   'n'},
 | 
			
		||||
        {"reneigh",     required_argument,   NULL,   'r'},
 | 
			
		||||
        {"help",        required_argument,   NULL,   'h'}
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    while((opt = getopt_long(argc, argv, "t:f:l:n:r:h", long_opts, NULL)) != -1) {
 | 
			
		||||
        switch(opt) {
 | 
			
		||||
            case 't':
 | 
			
		||||
                trace_file = strdup(optarg);
 | 
			
		||||
                break;
 | 
			
		||||
 | 
			
		||||
            case 'f':
 | 
			
		||||
                freq = atof(optarg);
 | 
			
		||||
                break;
 | 
			
		||||
 | 
			
		||||
            case 'l':
 | 
			
		||||
                cl_size = atoi(optarg);
 | 
			
		||||
                break;
 | 
			
		||||
 | 
			
		||||
            case 'n':
 | 
			
		||||
                ntimesteps = atoi(optarg);
 | 
			
		||||
                break;
 | 
			
		||||
 | 
			
		||||
            case 'r':
 | 
			
		||||
                reneigh_every = atoi(optarg);
 | 
			
		||||
                break;
 | 
			
		||||
 | 
			
		||||
            case 'h':
 | 
			
		||||
            case '?':
 | 
			
		||||
            default:
 | 
			
		||||
                printf("Usage: %s [OPTION]...\n", argv[0]);
 | 
			
		||||
                printf("MD variant for gather benchmark.\n\n");
 | 
			
		||||
                printf("Mandatory arguments to long options are also mandatory for short options.\n");
 | 
			
		||||
                printf("\t-t, --trace=STRING        input file with traced indexes from MD-Bench.\n");
 | 
			
		||||
                printf("\t-f, --freq=REAL           CPU frequency in GHz (default 2.5).\n");
 | 
			
		||||
                printf("\t-l, --line=NUMBER         cache line size in bytes (default 64).\n");
 | 
			
		||||
                printf("\t-n, --timesteps=NUMBER    number of timesteps to simulate (default 200).\n");
 | 
			
		||||
                printf("\t-r, --reneigh=NUMBER      reneighboring frequency in timesteps (default 20).\n");
 | 
			
		||||
                printf("\t-h, --help                display this help message.\n");
 | 
			
		||||
                printf("\n\n");
 | 
			
		||||
                return EXIT_FAILURE;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if(trace_file == NULL) {
 | 
			
		||||
        fprintf(stderr, "Trace file not specified!\n");
 | 
			
		||||
        return EXIT_FAILURE;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    FILE *fp;
 | 
			
		||||
    char *line = NULL;
 | 
			
		||||
    int *neighborlists = NULL;
 | 
			
		||||
    int *numneighs = NULL;
 | 
			
		||||
    int atom = -1;
 | 
			
		||||
    int nlocal, nghost, maxneighs;
 | 
			
		||||
    int nall = 0;
 | 
			
		||||
    int N_alloc = 0;
 | 
			
		||||
    size_t ntest = 0;
 | 
			
		||||
    size_t llen;
 | 
			
		||||
    ssize_t read;
 | 
			
		||||
    double *a = NULL;
 | 
			
		||||
    double *f = NULL;
 | 
			
		||||
    double *t = NULL;
 | 
			
		||||
    double time = 0.0;
 | 
			
		||||
    double E, S;
 | 
			
		||||
    const int dims = 3;
 | 
			
		||||
    const int snbytes = dims + PADDING_BYTES; // bytes per element (struct), includes padding
 | 
			
		||||
    long long int niters = 0;
 | 
			
		||||
    long long int ngathered = 0;
 | 
			
		||||
 | 
			
		||||
    printf("ISA,Layout,Dims,Frequency (GHz),Cache Line Size (B),Vector Width (e)\n");
 | 
			
		||||
    printf("%s,%s,%d,%f,%d,%d\n\n", ISA_STRING, LAYOUT_STRING, dims, freq, cl_size, _VL_);
 | 
			
		||||
    freq = freq * 1e9;
 | 
			
		||||
 | 
			
		||||
    #ifdef ONLY_FIRST_DIMENSION
 | 
			
		||||
    const int gathered_dims = 1;
 | 
			
		||||
    #else
 | 
			
		||||
    const int gathered_dims = dims;
 | 
			
		||||
    #endif
 | 
			
		||||
 | 
			
		||||
    for(int ts = -1; ts < ntimesteps; ts++) {
 | 
			
		||||
        if(!((ts + 1) % reneigh_every)) {
 | 
			
		||||
            char ts_trace_file[128];
 | 
			
		||||
            snprintf(ts_trace_file, sizeof ts_trace_file, "%s_%d.out", trace_file, ts + 1);
 | 
			
		||||
            if((fp = fopen(ts_trace_file, "r")) == NULL) {
 | 
			
		||||
                fprintf(stderr, "Error: could not open trace file!\n");
 | 
			
		||||
                return EXIT_FAILURE;
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            while((read = getline(&line, &llen, fp)) != -1) {
 | 
			
		||||
                int i = 2;
 | 
			
		||||
                if(strncmp(line, "N:", 2) == 0) {
 | 
			
		||||
                    while(line[i] == ' ') { i++; }
 | 
			
		||||
                    nlocal = atoi(strtok(&line[i], " "));
 | 
			
		||||
                    nghost = atoi(strtok(NULL, " "));
 | 
			
		||||
                    nall = nlocal + nghost;
 | 
			
		||||
                    maxneighs = atoi(strtok(NULL, " "));
 | 
			
		||||
 | 
			
		||||
                    if(nlocal <= 0 || maxneighs <= 0) {
 | 
			
		||||
                        fprintf(stderr, "Number of local atoms and neighbor lists capacity cannot be less or equal than zero!\n");
 | 
			
		||||
                        return EXIT_FAILURE;
 | 
			
		||||
                    }
 | 
			
		||||
 | 
			
		||||
                    if(neighborlists == NULL) {
 | 
			
		||||
                        neighborlists = (int *) allocate( ARRAY_ALIGNMENT, nlocal * maxneighs * sizeof(int) );
 | 
			
		||||
                        numneighs = (int *) allocate( ARRAY_ALIGNMENT, nlocal * sizeof(int) );
 | 
			
		||||
                    }
 | 
			
		||||
                }
 | 
			
		||||
 | 
			
		||||
                if(strncmp(line, "A:", 2) == 0) {
 | 
			
		||||
                    while(line[i] == ' ') { i++; }
 | 
			
		||||
                    atom = atoi(strtok(&line[i], " "));
 | 
			
		||||
                    numneighs[atom] = 0;
 | 
			
		||||
                }
 | 
			
		||||
 | 
			
		||||
                if(strncmp(line, "I:", 2) == 0) {
 | 
			
		||||
                    while(line[i] == ' ') { i++; }
 | 
			
		||||
                    char *neigh_idx = strtok(&line[i], " ");
 | 
			
		||||
 | 
			
		||||
                    while(neigh_idx != NULL && *neigh_idx != '\n') {
 | 
			
		||||
                        int j = numneighs[atom];
 | 
			
		||||
                        neighborlists[atom * maxneighs + j] = atoi(neigh_idx);
 | 
			
		||||
                        numneighs[atom]++;
 | 
			
		||||
                        ntest++;
 | 
			
		||||
                        neigh_idx = strtok(NULL, " ");
 | 
			
		||||
                    }
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            fclose(fp);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        if(N_alloc == 0) {
 | 
			
		||||
            N_alloc = nall * 2;
 | 
			
		||||
            a = (double*) allocate( ARRAY_ALIGNMENT, N_alloc * snbytes * sizeof(double) );
 | 
			
		||||
            f = (double*) allocate( ARRAY_ALIGNMENT, N_alloc * dims * sizeof(double) );
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        #ifdef TEST
 | 
			
		||||
        if(t != NULL) { free(t); }
 | 
			
		||||
        ntest += 100;
 | 
			
		||||
        t = (double*) allocate( ARRAY_ALIGNMENT, ntest * dims * sizeof(double) );
 | 
			
		||||
        #endif
 | 
			
		||||
 | 
			
		||||
        for(int i = 0; i < N_alloc; ++i) {
 | 
			
		||||
            #ifdef AOS
 | 
			
		||||
            a[i * snbytes + 0] = i * dims + 0;
 | 
			
		||||
            a[i * snbytes + 1] = i * dims + 1;
 | 
			
		||||
            a[i * snbytes + 2] = i * dims + 2;
 | 
			
		||||
            #else
 | 
			
		||||
            a[N * 0 + i] = N * 0 + i;
 | 
			
		||||
            a[N * 1 + i] = N * 1 + i;
 | 
			
		||||
            a[N * 2 + i] = N * 2 + i;
 | 
			
		||||
            #endif
 | 
			
		||||
            f[i * dims + 0] = 0.0;
 | 
			
		||||
            f[i * dims + 1] = 0.0;
 | 
			
		||||
            f[i * dims + 2] = 0.0;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        int t_idx = 0;
 | 
			
		||||
        S = getTimeStamp();
 | 
			
		||||
        LIKWID_MARKER_START("gather");
 | 
			
		||||
        for(int i = 0; i < nlocal; i++) {
 | 
			
		||||
            int *neighbors = &neighborlists[i * maxneighs];
 | 
			
		||||
            // We inline the assembly for AVX512 with AoS layout to evaluate the impact
 | 
			
		||||
            // of calling external assembly procedures in the overall runtime
 | 
			
		||||
            #ifdef ISA_avx512
 | 
			
		||||
            __m256i ymm_reg_mask = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
 | 
			
		||||
            __asm__ __volatile__(   "vmovsd 0(%0), %%xmm3;"
 | 
			
		||||
                                    "vmovsd 8(%0), %%xmm4;"
 | 
			
		||||
                                    "vmovsd 16(%0), %%xmm5;"
 | 
			
		||||
                                    "vbroadcastsd %%xmm3, %%zmm0;"
 | 
			
		||||
                                    "vbroadcastsd %%xmm4, %%zmm1;"
 | 
			
		||||
                                    "vbroadcastsd %%xmm5, %%zmm2;"
 | 
			
		||||
                                    :
 | 
			
		||||
                                    : "r" (&a[i * snbytes])
 | 
			
		||||
                                    : "%xmm3", "%xmm4", "%xmm5", "%zmm0", "%zmm1", "%zmm2"  );
 | 
			
		||||
 | 
			
		||||
            __asm__ __volatile__(   "xor %%rax, %%rax;"
 | 
			
		||||
                                    "movq %%rdx, %%r15;"
 | 
			
		||||
                                    "1: vmovdqu (%1,%%rax,4), %%ymm3;"
 | 
			
		||||
                                    "vpaddd %%ymm3, %%ymm3, %%ymm4;"
 | 
			
		||||
                                    #ifdef PADDING
 | 
			
		||||
                                    "vpaddd %%ymm4, %%ymm4, %%ymm3;"
 | 
			
		||||
                                    #else
 | 
			
		||||
                                    "vpaddd %%ymm3, %%ymm4, %%ymm3;"
 | 
			
		||||
                                    #endif
 | 
			
		||||
                                    "vpcmpeqb %%xmm5, %%xmm5, %%k1;"
 | 
			
		||||
                                    "vpcmpeqb %%xmm5, %%xmm5, %%k2;"
 | 
			
		||||
                                    "vpcmpeqb %%xmm5, %%xmm5, %%k3;"
 | 
			
		||||
                                    "vpxord %%zmm0, %%zmm0, %%zmm0;"
 | 
			
		||||
                                    "vpxord %%zmm1, %%zmm1, %%zmm1;"
 | 
			
		||||
                                    "vpxord %%zmm2, %%zmm2, %%zmm2;"
 | 
			
		||||
                                    "vgatherdpd (%3, %%ymm3, 8), %%zmm0{{%%k1}};"
 | 
			
		||||
                                    "vgatherdpd 8(%3, %%ymm3, 8), %%zmm1{{%%k2}};"
 | 
			
		||||
                                    "vgatherdpd 16(%3, %%ymm3, 8), %%zmm2{{%%k3}};"
 | 
			
		||||
                                    "addq $8, %%rax;"
 | 
			
		||||
                                    "subq $8, %%r15;"
 | 
			
		||||
                                    "cmpq $8, %%r15;"
 | 
			
		||||
                                    "jge 1b;"
 | 
			
		||||
                                    "cmpq $0, %%r15;"
 | 
			
		||||
                                    "jle 2;"
 | 
			
		||||
                                    "vpbroadcastd %%r15d, %%ymm5;"
 | 
			
		||||
                                    "vpcmpgtd %%ymm5, %2, %%k1;"
 | 
			
		||||
                                    "vmovdqu32 (%1,%%rax,4), %%ymm3{{%%k1}}{{z}};"
 | 
			
		||||
                                    "vpaddd %%ymm3, %%ymm3, %%ymm4;"
 | 
			
		||||
                                    #ifdef PADDING
 | 
			
		||||
                                    "vpaddd %%ymm4, %%ymm4, %%ymm3;"
 | 
			
		||||
                                    #else
 | 
			
		||||
                                    "vpaddd %%ymm3, %%ymm4, %%ymm3;"
 | 
			
		||||
                                    #endif
 | 
			
		||||
                                    "vpxord %%zmm0, %%zmm0, %%zmm0;"
 | 
			
		||||
                                    "kmovw %%k1, %%k2;"
 | 
			
		||||
                                    "kmovw %%k1, %%k3;"
 | 
			
		||||
                                    "vpxord %%zmm1, %%zmm1, %%zmm1;"
 | 
			
		||||
                                    "vpxord %%zmm2, %%zmm2, %%zmm2;"
 | 
			
		||||
                                    "vgatherdpd (%3, %%ymm3, 8), %%zmm0{{%%k1}};"
 | 
			
		||||
                                    "vgatherdpd 8(%3, %%ymm3, 8), %%zmm1{{%%k2}};"
 | 
			
		||||
                                    "vgatherdpd 16(%3, %%ymm3, 8), %%zmm2{{%%k3}};"
 | 
			
		||||
                                    "addq %%r15, %%rax;"
 | 
			
		||||
                                    "2:;"
 | 
			
		||||
                                    :
 | 
			
		||||
                                    : "d" (numneighs[i]), "r" (neighbors), "x" (ymm_reg_mask), "r" (a)
 | 
			
		||||
                                    : "%rax", "%r15", "%ymm3", "%ymm4", "%ymm5", "%k1", "%k2", "%k3", "%zmm0", "%zmm1", "%zmm2" );
 | 
			
		||||
            #else
 | 
			
		||||
            LOAD(a, i, snbytes, N_alloc);
 | 
			
		||||
            t_idx += GATHER(a, neighbors, numneighs[i], &t[t_idx], ntest);
 | 
			
		||||
            #endif
 | 
			
		||||
            f[i * dims + 0] += i;
 | 
			
		||||
            f[i * dims + 1] += i;
 | 
			
		||||
            f[i * dims + 2] += i;
 | 
			
		||||
        }
 | 
			
		||||
        LIKWID_MARKER_STOP("gather");
 | 
			
		||||
        E = getTimeStamp();
 | 
			
		||||
        time += E - S;
 | 
			
		||||
 | 
			
		||||
        #ifdef MEM_TRACER
 | 
			
		||||
        MEM_TRACER_INIT(trace_file);
 | 
			
		||||
        for(int i = 0; i < nlocal; i++) {
 | 
			
		||||
            int *neighbors = &neighborlists[i * maxneighs];
 | 
			
		||||
 | 
			
		||||
            for(int d = 0; d < gathered_dims; d++) {
 | 
			
		||||
                #ifdef AOS
 | 
			
		||||
                MEM_TRACE('R', a[i * snbytes + d])
 | 
			
		||||
                #else
 | 
			
		||||
                MEM_TRACE('R', a[d * N + i])
 | 
			
		||||
                #endif
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            for(int j = 0; j < numneighs[i]; j += _VL_) {
 | 
			
		||||
                for(int jj = j; jj < MIN(j + _VL_, numneighs[i]); j++) {
 | 
			
		||||
                    int k = neighbors[jj];
 | 
			
		||||
                    for(int d = 0; d < gathered_dims; d++) {
 | 
			
		||||
                        #ifdef AOS
 | 
			
		||||
                        MEM_TRACE('R', a[k * snbytes + d])
 | 
			
		||||
                        #else
 | 
			
		||||
                        MEM_TRACE('R', a[d * N + k])
 | 
			
		||||
                        #endif
 | 
			
		||||
                    }
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
        MEM_TRACER_END;
 | 
			
		||||
        #endif
 | 
			
		||||
 | 
			
		||||
        #ifdef TEST
 | 
			
		||||
        int test_failed = 0;
 | 
			
		||||
        t_idx = 0;
 | 
			
		||||
        for(int i = 0; i < nlocal; ++i) {
 | 
			
		||||
            int *neighbors = &neighborlists[i * maxneighs];
 | 
			
		||||
            for(int j = 0; j < numneighs[i]; ++j) {
 | 
			
		||||
                int k = neighbors[j];
 | 
			
		||||
                for(int d = 0; d < dims; ++d) {
 | 
			
		||||
                    #ifdef AOS
 | 
			
		||||
                    if(t[d * ntest + t_idx] != k * dims + d) {
 | 
			
		||||
                    #else
 | 
			
		||||
                    if(t[d * ntest + t_idx] != d * N + k) {
 | 
			
		||||
                    #endif
 | 
			
		||||
                        test_failed = 1;
 | 
			
		||||
                        break;
 | 
			
		||||
                    }
 | 
			
		||||
                }
 | 
			
		||||
 | 
			
		||||
                t_idx++;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        if(test_failed) {
 | 
			
		||||
            printf("Test failed!\n");
 | 
			
		||||
            return EXIT_FAILURE;
 | 
			
		||||
        }
 | 
			
		||||
        #endif
 | 
			
		||||
 | 
			
		||||
        for(int i = 0; i < nlocal; i++) {
 | 
			
		||||
            niters += (numneighs[i] / _VL_) + ((numneighs[i] % _VL_ == 0) ? 0 : 1);
 | 
			
		||||
            ngathered += numneighs[i];
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    printf("%14s,%14s,%14s,%14s,%14s,%14s", "tot. time(s)", "time/step(ms)", "time/iter(us)", "cy/it", "cy/gather", "cy/elem");
 | 
			
		||||
    printf("\n");
 | 
			
		||||
    const double time_per_step = time * 1e3 / ((double) ntimesteps);
 | 
			
		||||
    const double time_per_it = time * 1e6 / ((double) niters);
 | 
			
		||||
    const double cy_per_it = time * freq * _VL_ / ((double) niters);
 | 
			
		||||
    const double cy_per_gather = time * freq * _VL_ / ((double) niters * gathered_dims);
 | 
			
		||||
    const double cy_per_elem = time * freq / ((double) ngathered * gathered_dims);
 | 
			
		||||
    printf("%14.6f,%14.6f,%14.6f,%14.6f,%14.6f,%14.6f\n", time, time_per_step, time_per_it, cy_per_it, cy_per_gather, cy_per_elem);
 | 
			
		||||
 | 
			
		||||
    #ifdef TEST
 | 
			
		||||
    printf("Test passed!\n");
 | 
			
		||||
    #endif
 | 
			
		||||
 | 
			
		||||
    LIKWID_MARKER_CLOSE;
 | 
			
		||||
    return EXIT_SUCCESS;
 | 
			
		||||
}
 | 
			
		||||
							
								
								
									
										361
									
								
								util/gather-bench/src/main-md.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										361
									
								
								util/gather-bench/src/main-md.c
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,361 @@
 | 
			
		||||
/*
 | 
			
		||||
 * =======================================================================================
 | 
			
		||||
 *
 | 
			
		||||
 *      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
 | 
			
		||||
 *      Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
 | 
			
		||||
 *
 | 
			
		||||
 *      Permission is hereby granted, free of charge, to any person obtaining a copy
 | 
			
		||||
 *      of this software and associated documentation files (the "Software"), to deal
 | 
			
		||||
 *      in the Software without restriction, including without limitation the rights
 | 
			
		||||
 *      to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 | 
			
		||||
 *      copies of the Software, and to permit persons to whom the Software is
 | 
			
		||||
 *      furnished to do so, subject to the following conditions:
 | 
			
		||||
 *
 | 
			
		||||
 *      The above copyright notice and this permission notice shall be included in all
 | 
			
		||||
 *      copies or substantial portions of the Software.
 | 
			
		||||
 *
 | 
			
		||||
 *      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 | 
			
		||||
 *      IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 | 
			
		||||
 *      FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 | 
			
		||||
 *      AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 | 
			
		||||
 *      LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 | 
			
		||||
 *      OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 | 
			
		||||
 *      SOFTWARE.
 | 
			
		||||
 *
 | 
			
		||||
 * =======================================================================================
 | 
			
		||||
 */
 | 
			
		||||
#include <float.h>
 | 
			
		||||
#include <getopt.h>
 | 
			
		||||
#include <limits.h>
 | 
			
		||||
#include <stdio.h>
 | 
			
		||||
#include <stdlib.h>
 | 
			
		||||
#include <unistd.h>
 | 
			
		||||
//---
 | 
			
		||||
#include <likwid-marker.h>
 | 
			
		||||
//---
 | 
			
		||||
#include <allocate.h>
 | 
			
		||||
#include <timing.h>
 | 
			
		||||
 | 
			
		||||
#if !defined(ISA_avx2) && !defined (ISA_avx512)
 | 
			
		||||
#error "Invalid ISA macro, possible values are: avx2 and avx512"
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#if defined(TEST) && defined(ONLY_FIRST_DIMENSION)
 | 
			
		||||
#error "TEST and ONLY_FIRST_DIMENSION options are mutually exclusive!"
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#define HLINE "----------------------------------------------------------------------------\n"
 | 
			
		||||
 | 
			
		||||
#ifndef MIN
 | 
			
		||||
#define MIN(x,y) ((x)<(y)?(x):(y))
 | 
			
		||||
#endif
 | 
			
		||||
#ifndef MAX
 | 
			
		||||
#define MAX(x,y) ((x)>(y)?(x):(y))
 | 
			
		||||
#endif
 | 
			
		||||
#ifndef ABS
 | 
			
		||||
#define ABS(a) ((a) >= 0 ? (a) : -(a))
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#define ARRAY_ALIGNMENT  64
 | 
			
		||||
#define SIZE  20000
 | 
			
		||||
 | 
			
		||||
#ifdef ISA_avx512
 | 
			
		||||
#define _VL_  8
 | 
			
		||||
#define ISA_STRING "avx512"
 | 
			
		||||
#else
 | 
			
		||||
#define _VL_  4
 | 
			
		||||
#define ISA_STRING "avx2"
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef AOS
 | 
			
		||||
#define GATHER gather_aos
 | 
			
		||||
#define LAYOUT_STRING "AoS"
 | 
			
		||||
#else
 | 
			
		||||
#define GATHER gather_soa
 | 
			
		||||
#define LAYOUT_STRING "SoA"
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#if defined(PADDING) && defined(AOS)
 | 
			
		||||
#define PADDING_BYTES 1
 | 
			
		||||
#else
 | 
			
		||||
#define PADDING_BYTES 0
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef MEM_TRACER
 | 
			
		||||
#   define MEM_TRACER_INIT(stride, size)  FILE *mem_tracer_fp = fopen(get_mem_tracer_filename(stride, size), "w");
 | 
			
		||||
#   define MEM_TRACER_END                 fclose(mem_tracer_fp);
 | 
			
		||||
#   define MEM_TRACE(addr, op)            fprintf(mem_tracer_fp, "%c: %p\n", op, (void *)(&(addr)));
 | 
			
		||||
#else
 | 
			
		||||
#   define MEM_TRACER_INIT
 | 
			
		||||
#   define MEM_TRACER_END
 | 
			
		||||
#   define MEM_TRACE(addr, op)
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
extern void gather_aos(double*, int*, int, double*, long int*);
 | 
			
		||||
extern void gather_soa(double*, int*, int, double*, long int*);
 | 
			
		||||
 | 
			
		||||
const char *get_mem_tracer_filename(int stride, int size) {
 | 
			
		||||
    static char fname[64];
 | 
			
		||||
    snprintf(fname, sizeof fname, "mem_tracer_%d_%d.txt", stride, size);
 | 
			
		||||
    return fname;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
int log2_uint(unsigned int x) {
 | 
			
		||||
    int ans = 0;
 | 
			
		||||
    while(x >>= 1) { ans++; }
 | 
			
		||||
    return ans;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
int main (int argc, char** argv) {
 | 
			
		||||
    LIKWID_MARKER_INIT;
 | 
			
		||||
    LIKWID_MARKER_REGISTER("gather");
 | 
			
		||||
    int stride = 1;
 | 
			
		||||
    int cl_size = 64;
 | 
			
		||||
    int opt = 0;
 | 
			
		||||
    double freq = 2.5;
 | 
			
		||||
    struct option long_opts[] = {
 | 
			
		||||
        {"stride", required_argument,   NULL,   's'},
 | 
			
		||||
        {"freq",   required_argument,   NULL,   'f'},
 | 
			
		||||
        {"line",   required_argument,   NULL,   'l'},
 | 
			
		||||
        {"help",   required_argument,   NULL,   'h'}
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    while((opt = getopt_long(argc, argv, "s:f:l:h", long_opts, NULL)) != -1) {
 | 
			
		||||
        switch(opt) {
 | 
			
		||||
            case 's':
 | 
			
		||||
                stride = atoi(optarg);
 | 
			
		||||
                break;
 | 
			
		||||
 | 
			
		||||
            case 'f':
 | 
			
		||||
                freq = atof(optarg);
 | 
			
		||||
                break;
 | 
			
		||||
 | 
			
		||||
            case 'l':
 | 
			
		||||
                cl_size = atoi(optarg);
 | 
			
		||||
                break;
 | 
			
		||||
 | 
			
		||||
            case 'h':
 | 
			
		||||
            case '?':
 | 
			
		||||
            default:
 | 
			
		||||
                printf("Usage: %s [OPTION]...\n", argv[0]);
 | 
			
		||||
                printf("MD variant for gather benchmark.\n\n");
 | 
			
		||||
                printf("Mandatory arguments to long options are also mandatory for short options.\n");
 | 
			
		||||
                printf("\t-s, --stride=NUMBER   stride between two successive elements (default 1).\n");
 | 
			
		||||
                printf("\t-f, --freq=REAL       CPU frequency in GHz (default 2.5).\n");
 | 
			
		||||
                printf("\t-l, --line=NUMBER     cache line size in bytes (default 64).\n");
 | 
			
		||||
                printf("\t-h, --help            display this help message.\n");
 | 
			
		||||
                printf("\n\n");
 | 
			
		||||
                return EXIT_FAILURE;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    size_t bytesPerWord = sizeof(double);
 | 
			
		||||
    const int dims = 3;
 | 
			
		||||
    const int snbytes = dims + PADDING_BYTES; // bytes per element (struct), includes padding
 | 
			
		||||
    #ifdef AOS
 | 
			
		||||
    size_t cacheLinesPerGather = MIN(MAX(stride * _VL_ * snbytes / (cl_size / sizeof(double)), 1), _VL_);
 | 
			
		||||
    #else
 | 
			
		||||
    size_t cacheLinesPerGather = MIN(MAX(stride * _VL_ / (cl_size / sizeof(double)), 1), _VL_) * dims;
 | 
			
		||||
    #endif
 | 
			
		||||
    size_t N = SIZE;
 | 
			
		||||
    double E, S;
 | 
			
		||||
 | 
			
		||||
    printf("ISA,Layout,Stride,Dims,Frequency (GHz),Cache Line Size (B),Vector Width (e),Cache Lines/Gather\n");
 | 
			
		||||
    printf("%s,%s,%d,%d,%f,%d,%d,%lu\n\n", ISA_STRING, LAYOUT_STRING, stride, dims, freq, cl_size, _VL_, cacheLinesPerGather);
 | 
			
		||||
    printf("%14s,%14s,%14s,", "N", "Size(kB)", "cut CLs");
 | 
			
		||||
 | 
			
		||||
#ifndef MEASURE_GATHER_CYCLES
 | 
			
		||||
    printf("%14s,%14s,%14s,%14s,%14s", "tot. time", "time/LUP(ms)", "cy/it", "cy/gather", "cy/elem");
 | 
			
		||||
#else
 | 
			
		||||
 | 
			
		||||
#ifdef ONLY_FIRST_DIMENSION
 | 
			
		||||
    printf("%27s,%27s,%27s", "min/max/avg cy(x)", "min/max/avg cy(y)", "min/max/avg cy(z)");
 | 
			
		||||
#else
 | 
			
		||||
    printf("%27s", "min/max/avg cy(x)");
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
    printf("\n");
 | 
			
		||||
    freq = freq * 1e9;
 | 
			
		||||
 | 
			
		||||
    for(int N = 512; N < 80000000; N = 1.5 * N) {
 | 
			
		||||
        // Currently this only works when the array size (in elements) is multiple of the vector length (no preamble and prelude)
 | 
			
		||||
        if(N % _VL_ != 0) {
 | 
			
		||||
            N += _VL_ - (N % _VL_);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        MEM_TRACER_INIT(stride, N);
 | 
			
		||||
 | 
			
		||||
        int N_gathers_per_dim = N / _VL_;
 | 
			
		||||
        int N_alloc = N * 2;
 | 
			
		||||
        int N_cycles_alloc = N_gathers_per_dim * 2;
 | 
			
		||||
        int cut_cl = 0;
 | 
			
		||||
        double* a = (double*) allocate( ARRAY_ALIGNMENT, N_alloc * snbytes * sizeof(double) );
 | 
			
		||||
        int* idx = (int*) allocate( ARRAY_ALIGNMENT, N_alloc * sizeof(int) );
 | 
			
		||||
        int rep;
 | 
			
		||||
        double time;
 | 
			
		||||
 | 
			
		||||
#ifdef TEST
 | 
			
		||||
        double* t = (double*) allocate( ARRAY_ALIGNMENT, N_alloc * dims * sizeof(double) );
 | 
			
		||||
#else
 | 
			
		||||
        double* t = (double*) NULL;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef MEASURE_GATHER_CYCLES
 | 
			
		||||
        long int* cycles = (long int*) allocate( ARRAY_ALIGNMENT, N_cycles_alloc * dims * sizeof(long int)) ;
 | 
			
		||||
#else
 | 
			
		||||
        long int* cycles = (long int*) NULL;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
        for(int i = 0; i < N_alloc; ++i) {
 | 
			
		||||
#ifdef AOS
 | 
			
		||||
            a[i * snbytes + 0] = i * dims + 0;
 | 
			
		||||
            a[i * snbytes + 1] = i * dims + 1;
 | 
			
		||||
            a[i * snbytes + 2] = i * dims + 2;
 | 
			
		||||
#else
 | 
			
		||||
            a[N * 0 + i] = N * 0 + i;
 | 
			
		||||
            a[N * 1 + i] = N * 1 + i;
 | 
			
		||||
            a[N * 2 + i] = N * 2 + i;
 | 
			
		||||
#endif
 | 
			
		||||
            idx[i] = (i * stride) % N;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
#ifdef ONLY_FIRST_DIMENSION
 | 
			
		||||
        const int gathered_dims = 1;
 | 
			
		||||
#else
 | 
			
		||||
        const int gathered_dims = dims;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef MEM_TRACER
 | 
			
		||||
        for(int i = 0; i < N; i += _VL_) {
 | 
			
		||||
            for(int j = 0; j < _VL_; j++) {
 | 
			
		||||
                MEM_TRACE(idx[i + j], 'R');
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            for(int d = 0; d < gathered_dims; d++) {
 | 
			
		||||
                for(int j = 0; j < _VL_; j++) {
 | 
			
		||||
#ifdef AOS
 | 
			
		||||
                    MEM_TRACE(a[idx[i + j] * snbytes + d], 'R');
 | 
			
		||||
#else
 | 
			
		||||
                    MEM_TRACE(a[N * d + idx[i + j]], 'R');
 | 
			
		||||
#endif
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef AOS
 | 
			
		||||
        const int cl_shift = log2_uint((unsigned int) cl_size);
 | 
			
		||||
        for(int i = 0; i < N; i++) {
 | 
			
		||||
            const int first_cl = (idx[i] * snbytes * sizeof(double)) >> cl_shift;
 | 
			
		||||
            const int last_cl = ((idx[i] * snbytes + gathered_dims - 1) * sizeof(double)) >> cl_shift;
 | 
			
		||||
            if(first_cl != last_cl) {
 | 
			
		||||
                cut_cl++;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
        S = getTimeStamp();
 | 
			
		||||
        for(int r = 0; r < 100; ++r) {
 | 
			
		||||
            GATHER(a, idx, N, t, cycles);
 | 
			
		||||
        }
 | 
			
		||||
        E = getTimeStamp();
 | 
			
		||||
 | 
			
		||||
#ifdef MEASURE_GATHER_CYCLES
 | 
			
		||||
        for(int i = 0; i < N_cycles_alloc; i++) {
 | 
			
		||||
            cycles[i * 3 + 0] = 0;
 | 
			
		||||
            cycles[i * 3 + 1] = 0;
 | 
			
		||||
            cycles[i * 3 + 2] = 0;
 | 
			
		||||
        }
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
        rep = 100 * (0.5 / (E - S));
 | 
			
		||||
        S = getTimeStamp();
 | 
			
		||||
        LIKWID_MARKER_START("gather");
 | 
			
		||||
        for(int r = 0; r < rep; ++r) {
 | 
			
		||||
            GATHER(a, idx, N, t, cycles);
 | 
			
		||||
        }
 | 
			
		||||
        LIKWID_MARKER_STOP("gather");
 | 
			
		||||
        E = getTimeStamp();
 | 
			
		||||
 | 
			
		||||
        time = E - S;
 | 
			
		||||
 | 
			
		||||
#ifdef TEST
 | 
			
		||||
        int test_failed = 0;
 | 
			
		||||
        for(int i = 0; i < N; ++i) {
 | 
			
		||||
            for(int d = 0; d < dims; ++d) {
 | 
			
		||||
#ifdef AOS
 | 
			
		||||
                if(t[d * N + i] != ((i * stride) % N) * dims + d) {
 | 
			
		||||
#else
 | 
			
		||||
                if(t[d * N + i] != d * N + ((i * stride) % N)) {
 | 
			
		||||
#endif
 | 
			
		||||
                    test_failed = 1;
 | 
			
		||||
                    break;
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        if(test_failed) {
 | 
			
		||||
            printf("Test failed!\n");
 | 
			
		||||
            return EXIT_FAILURE;
 | 
			
		||||
        } else {
 | 
			
		||||
            printf("Test passed!\n");
 | 
			
		||||
        }
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
        const double size = N * (dims * sizeof(double) + sizeof(int)) / 1000.0;
 | 
			
		||||
        printf("%14d,%14.2f,%14d,", N, size, cut_cl);
 | 
			
		||||
 | 
			
		||||
#ifndef MEASURE_GATHER_CYCLES
 | 
			
		||||
        const double time_per_it = time * 1e6 / ((double) N * rep);
 | 
			
		||||
        const double cy_per_it = time * freq * _VL_ / ((double) N * rep);
 | 
			
		||||
        const double cy_per_gather = time * freq * _VL_ / ((double) N * rep * gathered_dims);
 | 
			
		||||
        const double cy_per_elem = time * freq / ((double) N * rep * gathered_dims);
 | 
			
		||||
        printf("%14.10f,%14.10f,%14.6f,%14.6f,%14.6f", time, time_per_it, cy_per_it, cy_per_gather, cy_per_elem);
 | 
			
		||||
#else
 | 
			
		||||
        double cy_min[dims];
 | 
			
		||||
        double cy_max[dims];
 | 
			
		||||
        double cy_avg[dims];
 | 
			
		||||
 | 
			
		||||
        for(int d = 0; d < dims; d++) {
 | 
			
		||||
            cy_min[d] = 100000.0;
 | 
			
		||||
            cy_max[d] = 0.0;
 | 
			
		||||
            cy_avg[d] = 0.0;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        for(int i = 0; i < N_gathers_per_dim; ++i) {
 | 
			
		||||
            for(int d = 0; d < gathered_dims; d++) {
 | 
			
		||||
                const double cy_d = (double)(cycles[i * 3 + d]);
 | 
			
		||||
                cy_min[d] = MIN(cy_min[d], cy_d);
 | 
			
		||||
                cy_max[d] = MAX(cy_max[d], cy_d);
 | 
			
		||||
                cy_avg[d] += cy_d;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        for(int d = 0; d < gathered_dims; d++) {
 | 
			
		||||
            char tmp_str[64];
 | 
			
		||||
            cy_avg[d] /= (double) N_gathers_per_dim;
 | 
			
		||||
            snprintf(tmp_str, sizeof tmp_str, "%4.4f/%4.4f/%4.4f", cy_min[d], cy_max[d], cy_avg[d]);
 | 
			
		||||
            printf("%27s%c", tmp_str, (d < gathered_dims - 1) ? ',' : ' ');
 | 
			
		||||
        }
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
        printf("\n");
 | 
			
		||||
        free(a);
 | 
			
		||||
        free(idx);
 | 
			
		||||
 | 
			
		||||
#ifdef TEST
 | 
			
		||||
        free(t);
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef MEASURE_GATHER_CYCLES
 | 
			
		||||
        free(cycles);
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
        MEM_TRACER_END;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    LIKWID_MARKER_CLOSE;
 | 
			
		||||
    return EXIT_SUCCESS;
 | 
			
		||||
}
 | 
			
		||||
							
								
								
									
										166
									
								
								util/gather-bench/src/main.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										166
									
								
								util/gather-bench/src/main.c
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,166 @@
 | 
			
		||||
/*
 | 
			
		||||
 * =======================================================================================
 | 
			
		||||
 *
 | 
			
		||||
 *      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
 | 
			
		||||
 *      Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
 | 
			
		||||
 *
 | 
			
		||||
 *      Permission is hereby granted, free of charge, to any person obtaining a copy
 | 
			
		||||
 *      of this software and associated documentation files (the "Software"), to deal
 | 
			
		||||
 *      in the Software without restriction, including without limitation the rights
 | 
			
		||||
 *      to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 | 
			
		||||
 *      copies of the Software, and to permit persons to whom the Software is
 | 
			
		||||
 *      furnished to do so, subject to the following conditions:
 | 
			
		||||
 *
 | 
			
		||||
 *      The above copyright notice and this permission notice shall be included in all
 | 
			
		||||
 *      copies or substantial portions of the Software.
 | 
			
		||||
 *
 | 
			
		||||
 *      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 | 
			
		||||
 *      IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 | 
			
		||||
 *      FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 | 
			
		||||
 *      AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 | 
			
		||||
 *      LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 | 
			
		||||
 *      OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 | 
			
		||||
 *      SOFTWARE.
 | 
			
		||||
 *
 | 
			
		||||
 * =======================================================================================
 | 
			
		||||
 */
 | 
			
		||||
#include <stdlib.h>
 | 
			
		||||
#include <stdio.h>
 | 
			
		||||
#include <unistd.h>
 | 
			
		||||
#include <limits.h>
 | 
			
		||||
#include <float.h>
 | 
			
		||||
//---
 | 
			
		||||
#include <likwid-marker.h>
 | 
			
		||||
//---
 | 
			
		||||
#include <timing.h>
 | 
			
		||||
#include <allocate.h>
 | 
			
		||||
 | 
			
		||||
#if !defined(ISA_avx2) && !defined (ISA_avx512)
 | 
			
		||||
#error "Invalid ISA macro, possible values are: avx2 and avx512"
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#define HLINE "----------------------------------------------------------------------------\n"
 | 
			
		||||
 | 
			
		||||
#ifndef MIN
 | 
			
		||||
#define MIN(x,y) ((x)<(y)?(x):(y))
 | 
			
		||||
#endif
 | 
			
		||||
#ifndef MAX
 | 
			
		||||
#define MAX(x,y) ((x)>(y)?(x):(y))
 | 
			
		||||
#endif
 | 
			
		||||
#ifndef ABS
 | 
			
		||||
#define ABS(a) ((a) >= 0 ? (a) : -(a))
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#define ARRAY_ALIGNMENT  64
 | 
			
		||||
#define SIZE  20000
 | 
			
		||||
 | 
			
		||||
#ifdef ISA_avx512
 | 
			
		||||
#define _VL_  8
 | 
			
		||||
#define ISA_STRING "avx512"
 | 
			
		||||
#else
 | 
			
		||||
#define _VL_  4
 | 
			
		||||
#define ISA_STRING "avx2"
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef TEST
 | 
			
		||||
extern void gather(double*, int*, int, double*);
 | 
			
		||||
#else
 | 
			
		||||
extern void gather(double*, int*, int);
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
int main (int argc, char** argv) {
 | 
			
		||||
    LIKWID_MARKER_INIT;
 | 
			
		||||
    LIKWID_MARKER_REGISTER("gather");
 | 
			
		||||
 | 
			
		||||
    if (argc < 3) {
 | 
			
		||||
        printf("Please provide stride and frequency\n");
 | 
			
		||||
        printf("%s <stride> <freq (GHz)> [cache line size (B)]\n", argv[0]);
 | 
			
		||||
        return -1;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    int stride = atoi(argv[1]);
 | 
			
		||||
    double freq = atof(argv[2]);
 | 
			
		||||
    int cl_size = (argc == 3) ? 64 : atoi(argv[3]);
 | 
			
		||||
    size_t bytesPerWord = sizeof(double);
 | 
			
		||||
    size_t cacheLinesPerGather = MIN(MAX(stride * _VL_ / (cl_size / sizeof(double)), 1), _VL_);
 | 
			
		||||
    size_t N = SIZE;
 | 
			
		||||
    double E, S;
 | 
			
		||||
 | 
			
		||||
    printf("ISA,Stride (elems),Frequency (GHz),Cache Line Size (B),Vector Width (elems),Cache Lines/Gather\n");
 | 
			
		||||
    printf("%s,%d,%f,%d,%d,%lu\n\n", ISA_STRING, stride, freq, cl_size, _VL_, cacheLinesPerGather);
 | 
			
		||||
    printf("%14s,%14s,%14s,%14s,%14s,%14s\n", "N", "Size(kB)", "tot. time", "time/LUP(ms)", "cy/gather", "cy/elem");
 | 
			
		||||
 | 
			
		||||
    freq = freq * 1e9;
 | 
			
		||||
    for(int N = 1024; N < 400000; N = 1.5 * N) {
 | 
			
		||||
        int N_alloc = N * 2;
 | 
			
		||||
        double* a = (double*) allocate( ARRAY_ALIGNMENT, N_alloc * sizeof(double) );
 | 
			
		||||
        int* idx = (int*) allocate( ARRAY_ALIGNMENT, N_alloc * sizeof(int) );
 | 
			
		||||
        int rep;
 | 
			
		||||
        double time;
 | 
			
		||||
 | 
			
		||||
#ifdef TEST
 | 
			
		||||
        double* t = (double*) allocate( ARRAY_ALIGNMENT, N_alloc * sizeof(double) );
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
        for(int i = 0; i < N_alloc; ++i) {
 | 
			
		||||
            a[i] = i;
 | 
			
		||||
            idx[i] = (i * stride) % N;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        S = getTimeStamp();
 | 
			
		||||
        for(int r = 0; r < 100; ++r) {
 | 
			
		||||
#ifdef TEST
 | 
			
		||||
            gather(a, idx, N, t);
 | 
			
		||||
#else
 | 
			
		||||
            gather(a, idx, N);
 | 
			
		||||
#endif
 | 
			
		||||
        }
 | 
			
		||||
        E = getTimeStamp();
 | 
			
		||||
 | 
			
		||||
        rep = 100 * (0.5 / (E - S));
 | 
			
		||||
        S = getTimeStamp();
 | 
			
		||||
        LIKWID_MARKER_START("gather");
 | 
			
		||||
        for(int r = 0; r < rep; ++r) {
 | 
			
		||||
#ifdef TEST
 | 
			
		||||
            gather(a, idx, N, t);
 | 
			
		||||
#else
 | 
			
		||||
            gather(a, idx, N);
 | 
			
		||||
#endif
 | 
			
		||||
        }
 | 
			
		||||
        LIKWID_MARKER_STOP("gather");
 | 
			
		||||
        E = getTimeStamp();
 | 
			
		||||
 | 
			
		||||
        time = E - S;
 | 
			
		||||
 | 
			
		||||
#ifdef TEST
 | 
			
		||||
        int test_failed = 0;
 | 
			
		||||
        for(int i = 0; i < N; ++i) {
 | 
			
		||||
            if(t[i] != i * stride % N) {
 | 
			
		||||
                test_failed = 1;
 | 
			
		||||
                break;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        if(test_failed) {
 | 
			
		||||
            printf("Test failed!\n");
 | 
			
		||||
            return EXIT_FAILURE;
 | 
			
		||||
        } else {
 | 
			
		||||
            printf("Test passed!\n");
 | 
			
		||||
        }
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
        const double size = N * (sizeof(double) + sizeof(int)) / 1000.0;
 | 
			
		||||
        const double time_per_it = time * 1e6 / ((double) N * rep);
 | 
			
		||||
        const double cy_per_gather = time * freq * _VL_ / ((double) N * rep);
 | 
			
		||||
        const double cy_per_elem = time * freq / ((double) N * rep);
 | 
			
		||||
        printf("%14d,%14.2f,%14.10f,%14.10f,%14.6f,%14.6f\n", N, size, time, time_per_it, cy_per_gather, cy_per_elem);
 | 
			
		||||
        free(a);
 | 
			
		||||
        free(idx);
 | 
			
		||||
#ifdef TEST
 | 
			
		||||
        free(t);
 | 
			
		||||
#endif
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    LIKWID_MARKER_CLOSE;
 | 
			
		||||
    return EXIT_SUCCESS;
 | 
			
		||||
}
 | 
			
		||||
							
								
								
									
										47
									
								
								util/gather-bench/src/timing.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										47
									
								
								util/gather-bench/src/timing.c
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,47 @@
 | 
			
		||||
/*
 | 
			
		||||
 * =======================================================================================
 | 
			
		||||
 *
 | 
			
		||||
 *      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
 | 
			
		||||
 *      Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
 | 
			
		||||
 *
 | 
			
		||||
 *      Permission is hereby granted, free of charge, to any person obtaining a copy
 | 
			
		||||
 *      of this software and associated documentation files (the "Software"), to deal
 | 
			
		||||
 *      in the Software without restriction, including without limitation the rights
 | 
			
		||||
 *      to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 | 
			
		||||
 *      copies of the Software, and to permit persons to whom the Software is
 | 
			
		||||
 *      furnished to do so, subject to the following conditions:
 | 
			
		||||
 *
 | 
			
		||||
 *      The above copyright notice and this permission notice shall be included in all
 | 
			
		||||
 *      copies or substantial portions of the Software.
 | 
			
		||||
 *
 | 
			
		||||
 *      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 | 
			
		||||
 *      IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 | 
			
		||||
 *      FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 | 
			
		||||
 *      AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 | 
			
		||||
 *      LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 | 
			
		||||
 *      OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 | 
			
		||||
 *      SOFTWARE.
 | 
			
		||||
 *
 | 
			
		||||
 * =======================================================================================
 | 
			
		||||
 */
 | 
			
		||||
#include <stdlib.h>
 | 
			
		||||
#include <time.h>
 | 
			
		||||
 | 
			
		||||
double getTimeStamp()
 | 
			
		||||
{
 | 
			
		||||
    struct timespec ts;
 | 
			
		||||
    clock_gettime(CLOCK_MONOTONIC, &ts);
 | 
			
		||||
    return (double)ts.tv_sec + (double)ts.tv_nsec * 1.e-9;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
double getTimeResolution()
 | 
			
		||||
{
 | 
			
		||||
    struct timespec ts;
 | 
			
		||||
    clock_getres(CLOCK_MONOTONIC, &ts);
 | 
			
		||||
    return (double)ts.tv_sec + (double)ts.tv_nsec * 1.e-9;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
double getTimeStamp_()
 | 
			
		||||
{
 | 
			
		||||
    return getTimeStamp();
 | 
			
		||||
}
 | 
			
		||||
		Reference in New Issue
	
	Block a user