diff --git a/Makefile b/Makefile index bd4f835..a4e369c 100644 --- a/Makefile +++ b/Makefile @@ -1,109 +1,30 @@ #CONFIGURE BUILD SYSTEM -IDENTIFIER = $(OPT_SCHEME)-$(TAG)-$(ISA)-$(DATA_TYPE) -TARGET = MDBench-$(IDENTIFIER) -BUILD_DIR = ./build-$(IDENTIFIER) -SRC_DIR = ./$(OPT_SCHEME) -ASM_DIR = ./asm -COMMON_DIR = ./common -CUDA_DIR = ./$(SRC_DIR)/cuda -MAKE_DIR = ./ +TAG = $(OPT_TAG)-$(TOOLCHAIN)-$(DATA_TYPE) +TARGET = MDBench-$(TAG) +BUILD_DIR = ./build/build-$(TAG) +SRC_ROOT = ./src +SRC_DIR = $(SRC_ROOT)/$(OPT_SCHEME) +COMMON_DIR = $(SRC_ROOT)/common +CUDA_DIR = $(SRC_DIR)/cuda +MAKE_DIR = ./make Q ?= @ #DO NOT EDIT BELOW -include $(MAKE_DIR)/config.mk -include $(MAKE_DIR)/include_$(TAG).mk +include config.mk +include $(MAKE_DIR)/include_$(TOOLCHAIN).mk include $(MAKE_DIR)/include_LIKWID.mk +ifneq ($(strip $(ISA)),NONE) include $(MAKE_DIR)/include_ISA.mk +endif include $(MAKE_DIR)/include_GROMACS.mk -INCLUDES += -I./$(SRC_DIR)/includes -I./$(COMMON_DIR)/includes +INCLUDES += -I./$(SRC_DIR) -I./$(COMMON_DIR) -ifeq ($(strip $(DATA_LAYOUT)),AOS) - DEFINES += -DAOS -endif -ifeq ($(strip $(DATA_TYPE)),SP) - DEFINES += -DPRECISION=1 -else - DEFINES += -DPRECISION=2 -endif - -ifneq ($(ASM_SYNTAX), ATT) - ASFLAGS += -masm=intel -endif - -ifeq ($(strip $(SORT_ATOMS)),true) - DEFINES += -DSORT_ATOMS -endif - -ifeq ($(strip $(EXPLICIT_TYPES)),true) - DEFINES += -DEXPLICIT_TYPES -endif - -ifeq ($(strip $(MEM_TRACER)),true) - DEFINES += -DMEM_TRACER -endif - -ifeq ($(strip $(INDEX_TRACER)),true) - DEFINES += -DINDEX_TRACER -endif - -ifeq ($(strip $(COMPUTE_STATS)),true) - DEFINES += -DCOMPUTE_STATS -endif - -ifeq ($(strip $(XTC_OUTPUT)),true) - DEFINES += -DXTC_OUTPUT -endif - -ifeq ($(strip $(USE_REFERENCE_VERSION)),true) - DEFINES += -DUSE_REFERENCE_VERSION -endif - -ifeq ($(strip $(HALF_NEIGHBOR_LISTS_CHECK_CJ)),true) - DEFINES += -DHALF_NEIGHBOR_LISTS_CHECK_CJ -endif - -ifeq ($(strip $(DEBUG)),true) - DEFINES += -DDEBUG -endif - -ifneq ($(VECTOR_WIDTH),) - DEFINES += -DVECTOR_WIDTH=$(VECTOR_WIDTH) -endif - -ifeq ($(strip $(__SIMD_KERNEL__)),true) - DEFINES += -D__SIMD_KERNEL__ -endif - -ifeq ($(strip $(__SSE__)),true) - DEFINES += -D__ISA_SSE__ -endif - -ifeq ($(strip $(__ISA_AVX__)),true) - DEFINES += -D__ISA_AVX__ -endif - -ifeq ($(strip $(__ISA_AVX_FMA__)),true) - DEFINES += -D__ISA_AVX_FMA__ -endif - -ifeq ($(strip $(__ISA_AVX2__)),true) - DEFINES += -D__ISA_AVX2__ -endif - -ifeq ($(strip $(__ISA_AVX512__)),true) - DEFINES += -D__ISA_AVX512__ -endif - -ifeq ($(strip $(ENABLE_OMP_SIMD)),true) - DEFINES += -DENABLE_OMP_SIMD -endif - -VPATH = $(SRC_DIR) $(ASM_DIR) $(CUDA_DIR) +VPATH = $(SRC_DIR) $(COMMON_DIR) $(CUDA_DIR) ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c)) OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s)) OBJ = $(filter-out $(BUILD_DIR)/main% $(OVERWRITE),$(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c))) OBJ += $(patsubst $(ASM_DIR)/%.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*.s)) -OBJ += $(patsubst $(COMMON_DIR)/%.c, $(BUILD_DIR)/%-common.o,$(wildcard $(COMMON_DIR)/*.c)) +OBJ += $(patsubst $(COMMON_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(COMMON_DIR)/*.c)) ifeq ($(strip $(TAG)),NVCC) OBJ += $(patsubst $(CUDA_DIR)/%.cu, $(BUILD_DIR)/%-cuda.o,$(wildcard $(CUDA_DIR)/*.cu)) endif @@ -129,11 +50,6 @@ $(BUILD_DIR)/%.o: %.c $(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@ $(Q)$(CC) $(CPPFLAGS) -MT $@ -MM $< > $(BUILD_DIR)/$*.d -$(BUILD_DIR)/%-common.o: $(COMMON_DIR)/%.c - $(info ===> COMPILE $@) - $(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@ - $(Q)$(CC) $(CPPFLAGS) -MT $@ -MM $< > $(BUILD_DIR)/$*.d - $(BUILD_DIR)/%-cuda.o: %.cu $(info ===> COMPILE $@) $(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@ @@ -152,18 +68,16 @@ $(BUILD_DIR)/%.o: %.s clean: $(info ===> CLEAN) @rm -rf $(BUILD_DIR) - @rm -rf $(TARGET)* - @rm -f tags cleanall: $(info ===> CLEAN) - @rm -rf build-* + @rm -rf build @rm -rf MDBench-* @rm -f tags distclean: clean $(info ===> DIST CLEAN) - @rm -f $(TARGET)* + @rm -f $(TARGET) @rm -f tags info: @@ -177,6 +91,6 @@ tags: $(Q)ctags -R $(BUILD_DIR): - @mkdir $(BUILD_DIR) + @mkdir -p $(BUILD_DIR) -include $(OBJ:.o=.d) diff --git a/config.mk b/config.mk index a1662cc..3f31b0f 100644 --- a/config.mk +++ b/config.mk @@ -1,7 +1,8 @@ -# Compiler tag (GCC/CLANG/ICC/ICX/ONEAPI/NVCC) -TAG ?= CLANG -# Instruction set (SSE/AVX/AVX_FMA/AVX2/AVX512) -ISA ?= SSE +# Compiler tool chain (GCC/CLANG/ICC/ICX/ONEAPI/NVCC) +TOOLCHAIN ?= CLANG +# Instruction set for instrinsic kernels (NONE/SSE/AVX/AVX_FMA/AVX2/AVX512) +ISA ?= ARM +SIMD ?= NONE # Optimization scheme (verletlist/clusterpair/clusters_per_bin) OPT_SCHEME ?= verletlist # Enable likwid (true or false) @@ -47,3 +48,93 @@ USE_CUDA_HOST_MEMORY ?= false #Feature options OPTIONS = -DALIGNMENT=64 #OPTIONS += More options + +#DO NOT EDIT BELOW +ifeq ($(strip $(DATA_LAYOUT)),AOS) + DEFINES += -DAOS +endif +ifeq ($(strip $(DATA_TYPE)),SP) + DEFINES += -DPRECISION=1 +else + DEFINES += -DPRECISION=2 +endif + +ifneq ($(ASM_SYNTAX), ATT) + ASFLAGS += -masm=intel +endif + +ifeq ($(strip $(SORT_ATOMS)),true) + DEFINES += -DSORT_ATOMS +endif + +ifeq ($(strip $(EXPLICIT_TYPES)),true) + DEFINES += -DEXPLICIT_TYPES +endif + +ifeq ($(strip $(MEM_TRACER)),true) + DEFINES += -DMEM_TRACER +endif + +ifeq ($(strip $(INDEX_TRACER)),true) + DEFINES += -DINDEX_TRACER +endif + +ifeq ($(strip $(COMPUTE_STATS)),true) + DEFINES += -DCOMPUTE_STATS +endif + +ifeq ($(strip $(XTC_OUTPUT)),true) + DEFINES += -DXTC_OUTPUT +endif + +ifeq ($(strip $(USE_REFERENCE_VERSION)),true) + DEFINES += -DUSE_REFERENCE_VERSION +endif + +ifeq ($(strip $(HALF_NEIGHBOR_LISTS_CHECK_CJ)),true) + DEFINES += -DHALF_NEIGHBOR_LISTS_CHECK_CJ +endif + +ifeq ($(strip $(DEBUG)),true) + DEFINES += -DDEBUG +endif + +ifneq ($(VECTOR_WIDTH),) + DEFINES += -DVECTOR_WIDTH=$(VECTOR_WIDTH) +endif + +ifeq ($(strip $(__SIMD_KERNEL__)),true) + DEFINES += -D__SIMD_KERNEL__ +endif + +ifeq ($(strip $(__SSE__)),true) + DEFINES += -D__ISA_SSE__ +endif + +ifeq ($(strip $(__ISA_AVX__)),true) + DEFINES += -D__ISA_AVX__ +endif + +ifeq ($(strip $(__ISA_AVX_FMA__)),true) + DEFINES += -D__ISA_AVX_FMA__ +endif + +ifeq ($(strip $(__ISA_AVX2__)),true) + DEFINES += -D__ISA_AVX2__ +endif + +ifeq ($(strip $(__ISA_AVX512__)),true) + DEFINES += -D__ISA_AVX512__ +endif + +ifeq ($(strip $(ENABLE_OMP_SIMD)),true) + DEFINES += -DENABLE_OMP_SIMD +endif + +ifeq ($(strip $(OPT_SCHEME)),verletlist) + OPT_TAG = VL +endif + +ifneq ($(strip $(SIMD)),NONE) + TOOLCHAIN = $(TOOLCHAIN)-$(ISA)-$(SIMD) +endif diff --git a/figures/features-v3.png b/figures/features-v3.png deleted file mode 100644 index 07fffad..0000000 Binary files a/figures/features-v3.png and /dev/null differ diff --git a/figures/gather_bench.png b/figures/gather_bench.png deleted file mode 100644 index 5d6528d..0000000 Binary files a/figures/gather_bench.png and /dev/null differ diff --git a/figures/gather_bench.svg b/figures/gather_bench.svg deleted file mode 100644 index 4843b7d..0000000 --- a/figures/gather_bench.svg +++ /dev/null @@ -1,523 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - gather-bench - - L1 - - L2 - - L3 - - DRAM - - Single gather - - MD gathers - - Contiguous - - "Random" - - - - - - - - - - - - - - - - - - Application Level - Hardware Level - vgather instructions - - diff --git a/figures/gromacs_mxn_v2.pdf b/figures/gromacs_mxn_v2.pdf deleted file mode 100644 index 474fafd..0000000 Binary files a/figures/gromacs_mxn_v2.pdf and /dev/null differ diff --git a/figures/gromacs_mxn_v2.png b/figures/gromacs_mxn_v2.png deleted file mode 100644 index 1733fd7..0000000 Binary files a/figures/gromacs_mxn_v2.png and /dev/null differ diff --git a/figures/stub_new_v3.pdf b/figures/stub_new_v3.pdf deleted file mode 100644 index 71d8d8b..0000000 Binary files a/figures/stub_new_v3.pdf and /dev/null differ diff --git a/figures/stub_new_v3.png b/figures/stub_new_v3.png deleted file mode 100644 index a275199..0000000 Binary files a/figures/stub_new_v3.png and /dev/null differ diff --git a/figures/verlet_v2.pdf b/figures/verlet_v2.pdf deleted file mode 100644 index af6e7bc..0000000 Binary files a/figures/verlet_v2.pdf and /dev/null differ diff --git a/figures/verlet_v2.png b/figures/verlet_v2.png deleted file mode 100644 index d3a0523..0000000 Binary files a/figures/verlet_v2.png and /dev/null differ diff --git a/include_CLANG.mk b/make/include_CLANG.mk similarity index 50% rename from include_CLANG.mk rename to make/include_CLANG.mk index fb69d21..085097f 100644 --- a/include_CLANG.mk +++ b/make/include_CLANG.mk @@ -1,17 +1,18 @@ -CC = clang +CC = /opt/homebrew/Cellar/llvm/18.1.5/bin/clang LINKER = $(CC) ANSI_CFLAGS = -ansi ANSI_CFLAGS += -std=c99 ANSI_CFLAGS += -pedantic -ANSI_CFLAGS += -Wextra +# ANSI_CFLAGS += -Wextra -CFLAGS = -Ofast -march=native $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g +CFLAGS = -Ofast -march=native $(ANSI_CFLAGS) -Xpreprocessor -fopenmp #-g #CFLAGS = -Ofast -march=core-avx2 $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g #CFLAGS = -O3 -march=cascadelake $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g #CFLAGS = -Ofast $(ANSI_CFLAGS) -g #-Xpreprocessor -fopenmp -g -ASFLAGS = -masm=intel +ASFLAGS = #-masm=intel LFLAGS = DEFINES = -D_GNU_SOURCE -INCLUDES = -LIBS = -lm #-lomp +# MacOSX with Apple Silicon and homebrew +INCLUDES = -I/opt/homebrew/Cellar/libomp/18.1.5/include/ +LIBS = -lm -L/opt/homebrew/Cellar/libomp/18.1.5/lib/ -lomp diff --git a/include_GCC.mk b/make/include_GCC.mk similarity index 100% rename from include_GCC.mk rename to make/include_GCC.mk diff --git a/include_GROMACS.mk b/make/include_GROMACS.mk similarity index 100% rename from include_GROMACS.mk rename to make/include_GROMACS.mk diff --git a/include_ICC.mk b/make/include_ICC.mk similarity index 100% rename from include_ICC.mk rename to make/include_ICC.mk diff --git a/include_ICX.mk b/make/include_ICX.mk similarity index 100% rename from include_ICX.mk rename to make/include_ICX.mk diff --git a/include_ISA.mk b/make/include_ISA.mk similarity index 100% rename from include_ISA.mk rename to make/include_ISA.mk diff --git a/include_LIKWID.mk b/make/include_LIKWID.mk similarity index 100% rename from include_LIKWID.mk rename to make/include_LIKWID.mk diff --git a/include_NVCC.mk b/make/include_NVCC.mk similarity index 100% rename from include_NVCC.mk rename to make/include_NVCC.mk diff --git a/include_ONEAPI.mk b/make/include_ONEAPI.mk similarity index 100% rename from include_ONEAPI.mk rename to make/include_ONEAPI.mk diff --git a/clusterpair/atom.c b/src/clusterpair/atom.c similarity index 100% rename from clusterpair/atom.c rename to src/clusterpair/atom.c diff --git a/clusterpair/includes/atom.h b/src/clusterpair/atom.h similarity index 100% rename from clusterpair/includes/atom.h rename to src/clusterpair/atom.h diff --git a/clusterpair/cuda/force_lj.cu b/src/clusterpair/cuda/force_lj.cu similarity index 100% rename from clusterpair/cuda/force_lj.cu rename to src/clusterpair/cuda/force_lj.cu diff --git a/clusterpair/force_eam.c b/src/clusterpair/force_eam.c similarity index 100% rename from clusterpair/force_eam.c rename to src/clusterpair/force_eam.c diff --git a/clusterpair/force_lj.c b/src/clusterpair/force_lj.c similarity index 100% rename from clusterpair/force_lj.c rename to src/clusterpair/force_lj.c diff --git a/clusterpair/includes/integrate.h b/src/clusterpair/integrate.h similarity index 100% rename from clusterpair/includes/integrate.h rename to src/clusterpair/integrate.h diff --git a/clusterpair/main-stub.c b/src/clusterpair/main-stub.c similarity index 100% rename from clusterpair/main-stub.c rename to src/clusterpair/main-stub.c diff --git a/clusterpair/main.c b/src/clusterpair/main.c similarity index 100% rename from clusterpair/main.c rename to src/clusterpair/main.c diff --git a/clusterpair/neighbor.c b/src/clusterpair/neighbor.c similarity index 100% rename from clusterpair/neighbor.c rename to src/clusterpair/neighbor.c diff --git a/clusterpair/includes/neighbor.h b/src/clusterpair/neighbor.h similarity index 100% rename from clusterpair/includes/neighbor.h rename to src/clusterpair/neighbor.h diff --git a/clusterpair/pbc.c b/src/clusterpair/pbc.c similarity index 100% rename from clusterpair/pbc.c rename to src/clusterpair/pbc.c diff --git a/clusterpair/includes/pbc.h b/src/clusterpair/pbc.h similarity index 100% rename from clusterpair/includes/pbc.h rename to src/clusterpair/pbc.h diff --git a/clusterpair/stats.c b/src/clusterpair/stats.c similarity index 100% rename from clusterpair/stats.c rename to src/clusterpair/stats.c diff --git a/clusterpair/includes/stats.h b/src/clusterpair/stats.h similarity index 100% rename from clusterpair/includes/stats.h rename to src/clusterpair/stats.h diff --git a/clusterpair/tracing.c b/src/clusterpair/tracing.c similarity index 100% rename from clusterpair/tracing.c rename to src/clusterpair/tracing.c diff --git a/clusterpair/includes/tracing.h b/src/clusterpair/tracing.h similarity index 100% rename from clusterpair/includes/tracing.h rename to src/clusterpair/tracing.h diff --git a/clusterpair/vtk.c b/src/clusterpair/vtk.c similarity index 100% rename from clusterpair/vtk.c rename to src/clusterpair/vtk.c diff --git a/clusterpair/includes/vtk.h b/src/clusterpair/vtk.h similarity index 100% rename from clusterpair/includes/vtk.h rename to src/clusterpair/vtk.h diff --git a/clusterpair/xtc.c b/src/clusterpair/xtc.c similarity index 100% rename from clusterpair/xtc.c rename to src/clusterpair/xtc.c diff --git a/clusterpair/includes/xtc.h b/src/clusterpair/xtc.h similarity index 100% rename from clusterpair/includes/xtc.h rename to src/clusterpair/xtc.h diff --git a/common/allocate.c b/src/common/allocate.c similarity index 100% rename from common/allocate.c rename to src/common/allocate.c diff --git a/common/includes/allocate.h b/src/common/allocate.h similarity index 100% rename from common/includes/allocate.h rename to src/common/allocate.h diff --git a/common/device.c b/src/common/device.c similarity index 100% rename from common/device.c rename to src/common/device.c diff --git a/common/includes/device.h b/src/common/device.h similarity index 100% rename from common/includes/device.h rename to src/common/device.h diff --git a/common/includes/eam.h b/src/common/eam.h similarity index 100% rename from common/includes/eam.h rename to src/common/eam.h diff --git a/common/eam_utils.c b/src/common/eam_utils.c similarity index 100% rename from common/eam_utils.c rename to src/common/eam_utils.c diff --git a/common/includes/likwid-marker.h b/src/common/likwid-marker.h similarity index 100% rename from common/includes/likwid-marker.h rename to src/common/likwid-marker.h diff --git a/common/parameter.c b/src/common/parameter.c similarity index 100% rename from common/parameter.c rename to src/common/parameter.c diff --git a/common/includes/parameter.h b/src/common/parameter.h similarity index 100% rename from common/includes/parameter.h rename to src/common/parameter.h diff --git a/common/includes/simd.h b/src/common/simd.h similarity index 100% rename from common/includes/simd.h rename to src/common/simd.h diff --git a/common/includes/simd/avx2_double.h b/src/common/simd/avx2_double.h similarity index 100% rename from common/includes/simd/avx2_double.h rename to src/common/simd/avx2_double.h diff --git a/common/includes/simd/avx2_float.h b/src/common/simd/avx2_float.h similarity index 100% rename from common/includes/simd/avx2_float.h rename to src/common/simd/avx2_float.h diff --git a/common/includes/simd/avx512_double.h b/src/common/simd/avx512_double.h similarity index 100% rename from common/includes/simd/avx512_double.h rename to src/common/simd/avx512_double.h diff --git a/common/includes/simd/avx512_float.h b/src/common/simd/avx512_float.h similarity index 100% rename from common/includes/simd/avx512_float.h rename to src/common/simd/avx512_float.h diff --git a/common/includes/simd/avx_double.h b/src/common/simd/avx_double.h similarity index 100% rename from common/includes/simd/avx_double.h rename to src/common/simd/avx_double.h diff --git a/common/includes/simd/avx_float.h b/src/common/simd/avx_float.h similarity index 100% rename from common/includes/simd/avx_float.h rename to src/common/simd/avx_float.h diff --git a/common/thermo.c b/src/common/thermo.c similarity index 100% rename from common/thermo.c rename to src/common/thermo.c diff --git a/common/includes/thermo.h b/src/common/thermo.h similarity index 100% rename from common/includes/thermo.h rename to src/common/thermo.h diff --git a/common/includes/timers.h b/src/common/timers.h similarity index 100% rename from common/includes/timers.h rename to src/common/timers.h diff --git a/common/timing.c b/src/common/timing.c similarity index 80% rename from common/timing.c rename to src/common/timing.c index ba8cbbd..f48a7c3 100644 --- a/common/timing.c +++ b/src/common/timing.c @@ -4,24 +4,18 @@ * Use of this source code is governed by a LGPL-3.0 * license that can be found in the LICENSE file. */ -#include #include -double getTimeStamp() +double getTimeStamp(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return (double)ts.tv_sec + (double)ts.tv_nsec * 1.e-9; } -double getTimeResolution() +double getTimeResolution(void) { struct timespec ts; clock_getres(CLOCK_MONOTONIC, &ts); return (double)ts.tv_sec + (double)ts.tv_nsec * 1.e-9; } - -double getTimeStamp_() -{ - return getTimeStamp(); -} diff --git a/common/includes/timing.h b/src/common/timing.h similarity index 90% rename from common/includes/timing.h rename to src/common/timing.h index 8ea6540..36a59c4 100644 --- a/common/includes/timing.h +++ b/src/common/timing.h @@ -9,6 +9,5 @@ extern double getTimeStamp(void); extern double getTimeResolution(void); -extern double getTimeStamp_(void); #endif diff --git a/common/util.c b/src/common/util.c similarity index 53% rename from common/util.c rename to src/common/util.c index ee96197..5012853 100644 --- a/common/util.c +++ b/src/common/util.c @@ -5,34 +5,35 @@ * license that can be found in the LICENSE file. */ #include -#include #include #include #include #include /* Park/Miller RNG w/out MASKING, so as to be like f90s version */ -#define IA 16807 -#define IM 2147483647 -#define AM (1.0/IM) -#define IQ 127773 -#define IR 2836 +#define IA 16807 +#define IM 2147483647 +#define AM (1.0 / IM) +#define IQ 127773 +#define IR 2836 #define MASK 123459876 -double myrandom(int* seed) { - int k= (*seed) / IQ; +double myrandom(int* seed) +{ + int k = (*seed) / IQ; double ans; *seed = IA * (*seed - k * IQ) - IR * k; - if(*seed < 0) *seed += IM; + if (*seed < 0) *seed += IM; ans = AM * (*seed); return ans; } -void random_reset(int *seed, int ibase, double *coord) { +void random_reset(int* seed, int ibase, double* coord) +{ int i; - char *str = (char *) &ibase; - int n = sizeof(int); + char* str = (char*)&ibase; + int n = sizeof(int); unsigned int hash = 0; for (i = 0; i < n; i++) { @@ -41,8 +42,8 @@ void random_reset(int *seed, int ibase, double *coord) { hash ^= (hash >> 6); } - str = (char *) coord; - n = 3 * sizeof(double); + str = (char*)coord; + n = 3 * sizeof(double); for (i = 0; i < n; i++) { hash += str[i]; hash += (hash << 10); @@ -61,45 +62,59 @@ void random_reset(int *seed, int ibase, double *coord) { // warm up the RNG - for (i = 0; i < 5; i++) myrandom(seed); - //save = 0; + for (i = 0; i < 5; i++) + myrandom(seed); + // save = 0; } -int str2ff(const char *string) { - if(strncmp(string, "lj", 2) == 0) return FF_LJ; - if(strncmp(string, "eam", 3) == 0) return FF_EAM; - if(strncmp(string, "dem", 3) == 0) return FF_DEM; +int str2ff(const char* string) +{ + if (strncmp(string, "lj", 2) == 0) return FF_LJ; + if (strncmp(string, "eam", 3) == 0) return FF_EAM; + if (strncmp(string, "dem", 3) == 0) return FF_DEM; return -1; } -const char* ff2str(int ff) { - if(ff == FF_LJ) { return "lj"; } - if(ff == FF_EAM) { return "eam"; } - if(ff == FF_DEM) { return "dem"; } +const char* ff2str(int ff) +{ + if (ff == FF_LJ) { + return "lj"; + } + if (ff == FF_EAM) { + return "eam"; + } + if (ff == FF_DEM) { + return "dem"; + } return "invalid"; } -int get_cuda_num_threads() { - const char *num_threads_env = getenv("NUM_THREADS"); +int get_cuda_num_threads(void) +{ + const char* num_threads_env = getenv("NUM_THREADS"); return (num_threads_env == NULL) ? 32 : atoi(num_threads_env); } -void readline(char *line, FILE *fp) { - if(fgets(line, MAXLINE, fp) == NULL) { - if(errno != 0) { +void readline(char* line, FILE* fp) +{ + if (fgets(line, MAXLINE, fp) == NULL) { + if (errno != 0) { perror("readline()"); exit(-1); } } } -void debug_printf(const char *format, ...) { - #ifdef DEBUG +void debug_printf(const char* format, ...) +{ +#ifdef DEBUG va_list arg; int ret; va_start(arg, format); - if((vfprintf(stdout, format, arg)) < 0) { perror("debug_printf()"); } + if ((vfprintf(stdout, format, arg)) < 0) { + perror("debug_printf()"); + } va_end(arg); - #endif +#endif } diff --git a/common/includes/util.h b/src/common/util.h similarity index 100% rename from common/includes/util.h rename to src/common/util.h diff --git a/verletlist/atom.c b/src/verletlist/atom.c similarity index 100% rename from verletlist/atom.c rename to src/verletlist/atom.c diff --git a/src/verletlist/atom.h b/src/verletlist/atom.h new file mode 100644 index 0000000..909dbfb --- /dev/null +++ b/src/verletlist/atom.h @@ -0,0 +1,102 @@ +/* + * Copyright (C) NHR@FAU, University Erlangen-Nuremberg. + * All rights reserved. This file is part of MD-Bench. + * Use of this source code is governed by a LGPL-3.0 + * license that can be found in the LICENSE file. + */ +#include + +#ifndef __ATOM_H_ +#define __ATOM_H_ + +#ifdef CUDA_TARGET +#define KERNEL_NAME "CUDA" +#define computeForceLJFullNeigh computeForceLJFullNeigh_cuda +#define initialIntegrate initialIntegrate_cuda +#define finalIntegrate finalIntegrate_cuda +#define buildNeighbor buildNeighbor_cuda +#define updatePbc updatePbc_cuda +#define updateAtomsPbc updateAtomsPbc_cuda +#else +#ifdef USE_SIMD_KERNEL +#define KERNEL_NAME "SIMD" +#define computeForceLJFullNeigh computeForceLJFullNeigh_simd +#else +#define KERNEL_NAME "PLAIN" +#endif +#define initialIntegrate initialIntegrate_cpu +#define finalIntegrate finalIntegrate_cpu +#define buildNeighbor buildNeighbor_cpu +#define updatePbc updatePbc_cpu +#define updateAtomsPbc updateAtomsPbc_cpu +#endif + +typedef struct { + MD_FLOAT *x, *y, *z; + MD_FLOAT *vx, *vy, *vz; + MD_FLOAT *fx, *fy, *fz; + int* border_map; + int* type; + MD_FLOAT* epsilon; + MD_FLOAT* sigma6; + MD_FLOAT* cutforcesq; + MD_FLOAT* cutneighsq; +} DeviceAtom; + +typedef struct { + int Natoms, Nlocal, Nghost, Nmax; + MD_FLOAT *x, *y, *z; + MD_FLOAT *vx, *vy, *vz; + MD_FLOAT *fx, *fy, *fz; + int* border_map; + int* type; + int ntypes; + MD_FLOAT* epsilon; + MD_FLOAT* sigma6; + MD_FLOAT* cutforcesq; + MD_FLOAT* cutneighsq; + + // DEM + MD_FLOAT* radius; + MD_FLOAT* av; + MD_FLOAT* r; + + // Device data + DeviceAtom d_atom; +} Atom; + +extern void initAtom(Atom*); +extern void createAtom(Atom*, Parameter*); +extern int readAtom(Atom*, Parameter*); +extern int readAtom_pdb(Atom*, Parameter*); +extern int readAtom_gro(Atom*, Parameter*); +extern int readAtom_dmp(Atom*, Parameter*); +extern int readAtom_in(Atom*, Parameter*); +extern void writeAtom(Atom*, Parameter*); +extern void growAtom(Atom*); + +#ifdef AOS +#define POS_DATA_LAYOUT "AoS" +#define atom_x(i) atom->x[(i) * 3 + 0] +#define atom_y(i) atom->x[(i) * 3 + 1] +#define atom_z(i) atom->x[(i) * 3 + 2] +#define atom_vx(i) atom->vx[(i) * 3 + 0] +#define atom_vy(i) atom->vx[(i) * 3 + 1] +#define atom_vz(i) atom->vx[(i) * 3 + 2] +#define atom_fx(i) atom->fx[(i) * 3 + 0] +#define atom_fy(i) atom->fx[(i) * 3 + 1] +#define atom_fz(i) atom->fx[(i) * 3 + 2] +#else +#define POS_DATA_LAYOUT "SoA" +#define atom_x(i) atom->x[i] +#define atom_y(i) atom->y[i] +#define atom_z(i) atom->z[i] +#define atom_vx(i) atom->vx[i] +#define atom_vy(i) atom->vy[i] +#define atom_vz(i) atom->vz[i] +#define atom_fx(i) atom->fx[i] +#define atom_fy(i) atom->fy[i] +#define atom_fz(i) atom->fz[i] +#endif + +#endif diff --git a/verletlist/cuda/force.cu b/src/verletlist/cuda/force.cu similarity index 100% rename from verletlist/cuda/force.cu rename to src/verletlist/cuda/force.cu diff --git a/verletlist/cuda/neighbor.cu b/src/verletlist/cuda/neighbor.cu similarity index 100% rename from verletlist/cuda/neighbor.cu rename to src/verletlist/cuda/neighbor.cu diff --git a/verletlist/cuda/pbc.cu b/src/verletlist/cuda/pbc.cu similarity index 100% rename from verletlist/cuda/pbc.cu rename to src/verletlist/cuda/pbc.cu diff --git a/verletlist/device_spec.c b/src/verletlist/device_spec.c similarity index 100% rename from verletlist/device_spec.c rename to src/verletlist/device_spec.c diff --git a/verletlist/force_dem.c b/src/verletlist/force_dem.c similarity index 100% rename from verletlist/force_dem.c rename to src/verletlist/force_dem.c diff --git a/verletlist/force_eam.c b/src/verletlist/force_eam.c similarity index 100% rename from verletlist/force_eam.c rename to src/verletlist/force_eam.c diff --git a/src/verletlist/force_lj-x86.c b/src/verletlist/force_lj-x86.c new file mode 100644 index 0000000..9a6578c --- /dev/null +++ b/src/verletlist/force_lj-x86.c @@ -0,0 +1,112 @@ +/* + * Copyright (C) NHR@FAU, University Erlangen-Nuremberg. + * All rights reserved. This file is part of MD-Bench. + * Use of this source code is governed by a LGPL-3.0 + * license that can be found in the LICENSE file. + */ +#include +#include +//--- +#include +#include +#include +#include +#include +#include + +#ifdef __SIMD_KERNEL__ +#include +#endif + +double computeForceLJFullNeigh_simd( + Parameter* param, Atom* atom, Neighbor* neighbor, Stats* stats) +{ + int Nlocal = atom->Nlocal; + int* neighs; + MD_FLOAT cutforcesq = param->cutforce * param->cutforce; + MD_FLOAT sigma6 = param->sigma6; + MD_FLOAT epsilon = param->epsilon; + + for (int i = 0; i < Nlocal; i++) { + atom_fx(i) = 0.0; + atom_fy(i) = 0.0; + atom_fz(i) = 0.0; + } + + double S = getTimeStamp(); + +#ifndef __SIMD_KERNEL__ + fprintf(stderr, "Error: SIMD kernel not implemented for specified instruction set!"); + exit(-1); +#else + MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq); + MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6); + MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon); + MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0); + MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5); + +#pragma omp parallel + { + LIKWID_MARKER_START("force"); + +#pragma omp for schedule(runtime) + for (int i = 0; i < Nlocal; i++) { + neighs = &neighbor->neighbors[i * neighbor->maxneighs]; + int numneighs = neighbor->numneigh[i]; + MD_SIMD_INT numneighs_vec = simd_int_broadcast(numneighs); + MD_SIMD_FLOAT xtmp = simd_broadcast(atom_x(i)); + MD_SIMD_FLOAT ytmp = simd_broadcast(atom_y(i)); + MD_SIMD_FLOAT ztmp = simd_broadcast(atom_z(i)); + MD_SIMD_FLOAT fix = simd_zero(); + MD_SIMD_FLOAT fiy = simd_zero(); + MD_SIMD_FLOAT fiz = simd_zero(); + + for (int k = 0; k < numneighs; k += VECTOR_WIDTH) { + // If the last iteration of this loop is separated from the rest, this + // mask can be set only there + MD_SIMD_MASK mask_numneighs = simd_mask_int_cond_lt( + simd_int_add(simd_int_broadcast(k), simd_int_seq()), + numneighs_vec); + MD_SIMD_INT j = simd_int_mask_load(&neighs[k], mask_numneighs); +#ifdef AOS + MD_SIMD_INT j3 = simd_int_add(simd_int_add(j, j), j); // j * 3 + MD_SIMD_FLOAT delx = xtmp - + simd_gather(j3, &(atom->x[0]), sizeof(MD_FLOAT)); + MD_SIMD_FLOAT dely = ytmp - + simd_gather(j3, &(atom->x[1]), sizeof(MD_FLOAT)); + MD_SIMD_FLOAT delz = ztmp - + simd_gather(j3, &(atom->x[2]), sizeof(MD_FLOAT)); +#else + MD_SIMD_FLOAT delx = xtmp - simd_gather(j, atom->x, sizeof(MD_FLOAT)); + MD_SIMD_FLOAT dely = ytmp - simd_gather(j, atom->y, sizeof(MD_FLOAT)); + MD_SIMD_FLOAT delz = ztmp - simd_gather(j, atom->z, sizeof(MD_FLOAT)); +#endif + MD_SIMD_FLOAT rsq = simd_fma(delx, + delx, + simd_fma(dely, dely, simd_mul(delz, delz))); + MD_SIMD_MASK cutoff_mask = simd_mask_and(mask_numneighs, + simd_mask_cond_lt(rsq, cutforcesq_vec)); + MD_SIMD_FLOAT sr2 = simd_reciprocal(rsq); + MD_SIMD_FLOAT sr6 = simd_mul(sr2, + simd_mul(sr2, simd_mul(sr2, sigma6_vec))); + MD_SIMD_FLOAT force = simd_mul(c48_vec, + simd_mul(sr6, + simd_mul(simd_sub(sr6, c05_vec), simd_mul(sr2, eps_vec)))); + + fix = simd_masked_add(fix, simd_mul(delx, force), cutoff_mask); + fiy = simd_masked_add(fiy, simd_mul(dely, force), cutoff_mask); + fiz = simd_masked_add(fiz, simd_mul(delz, force), cutoff_mask); + } + + atom_fx(i) += simd_h_reduce_sum(fix); + atom_fy(i) += simd_h_reduce_sum(fiy); + atom_fz(i) += simd_h_reduce_sum(fiz); + } + + LIKWID_MARKER_STOP("force"); + } +#endif + + double E = getTimeStamp(); + return E - S; +} diff --git a/verletlist/force_lj.c b/src/verletlist/force_lj.c similarity index 61% rename from verletlist/force_lj.c rename to src/verletlist/force_lj.c index a5d9e2e..a90c514 100644 --- a/verletlist/force_lj.c +++ b/src/verletlist/force_lj.c @@ -4,9 +4,6 @@ * Use of this source code is governed by a LGPL-3.0 * license that can be found in the LICENSE file. */ -#include -#include -//--- #include #include #include @@ -14,10 +11,6 @@ #include #include -#ifdef __SIMD_KERNEL__ -#include -#endif - double computeForceLJFullNeigh( Parameter* param, Atom* atom, Neighbor* neighbor, Stats* stats) { @@ -203,96 +196,3 @@ double computeForceLJHalfNeigh( double timeStop = getTimeStamp(); return timeStop - timeStart; } - -double computeForceLJFullNeigh_simd( - Parameter* param, Atom* atom, Neighbor* neighbor, Stats* stats) -{ - int Nlocal = atom->Nlocal; - int* neighs; - MD_FLOAT cutforcesq = param->cutforce * param->cutforce; - MD_FLOAT sigma6 = param->sigma6; - MD_FLOAT epsilon = param->epsilon; - - for (int i = 0; i < Nlocal; i++) { - atom_fx(i) = 0.0; - atom_fy(i) = 0.0; - atom_fz(i) = 0.0; - } - - double S = getTimeStamp(); - -#ifndef __SIMD_KERNEL__ - fprintf(stderr, "Error: SIMD kernel not implemented for specified instruction set!"); - exit(-1); -#else - MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq); - MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6); - MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon); - MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0); - MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5); - -#pragma omp parallel - { - LIKWID_MARKER_START("force"); - -#pragma omp for schedule(runtime) - for (int i = 0; i < Nlocal; i++) { - neighs = &neighbor->neighbors[i * neighbor->maxneighs]; - int numneighs = neighbor->numneigh[i]; - MD_SIMD_INT numneighs_vec = simd_int_broadcast(numneighs); - MD_SIMD_FLOAT xtmp = simd_broadcast(atom_x(i)); - MD_SIMD_FLOAT ytmp = simd_broadcast(atom_y(i)); - MD_SIMD_FLOAT ztmp = simd_broadcast(atom_z(i)); - MD_SIMD_FLOAT fix = simd_zero(); - MD_SIMD_FLOAT fiy = simd_zero(); - MD_SIMD_FLOAT fiz = simd_zero(); - - for (int k = 0; k < numneighs; k += VECTOR_WIDTH) { - // If the last iteration of this loop is separated from the rest, this - // mask can be set only there - MD_SIMD_MASK mask_numneighs = simd_mask_int_cond_lt( - simd_int_add(simd_int_broadcast(k), simd_int_seq()), - numneighs_vec); - MD_SIMD_INT j = simd_int_mask_load(&neighs[k], mask_numneighs); -#ifdef AOS - MD_SIMD_INT j3 = simd_int_add(simd_int_add(j, j), j); // j * 3 - MD_SIMD_FLOAT delx = xtmp - - simd_gather(j3, &(atom->x[0]), sizeof(MD_FLOAT)); - MD_SIMD_FLOAT dely = ytmp - - simd_gather(j3, &(atom->x[1]), sizeof(MD_FLOAT)); - MD_SIMD_FLOAT delz = ztmp - - simd_gather(j3, &(atom->x[2]), sizeof(MD_FLOAT)); -#else - MD_SIMD_FLOAT delx = xtmp - simd_gather(j, atom->x, sizeof(MD_FLOAT)); - MD_SIMD_FLOAT dely = ytmp - simd_gather(j, atom->y, sizeof(MD_FLOAT)); - MD_SIMD_FLOAT delz = ztmp - simd_gather(j, atom->z, sizeof(MD_FLOAT)); -#endif - MD_SIMD_FLOAT rsq = simd_fma(delx, - delx, - simd_fma(dely, dely, simd_mul(delz, delz))); - MD_SIMD_MASK cutoff_mask = simd_mask_and(mask_numneighs, - simd_mask_cond_lt(rsq, cutforcesq_vec)); - MD_SIMD_FLOAT sr2 = simd_reciprocal(rsq); - MD_SIMD_FLOAT sr6 = simd_mul(sr2, - simd_mul(sr2, simd_mul(sr2, sigma6_vec))); - MD_SIMD_FLOAT force = simd_mul(c48_vec, - simd_mul(sr6, - simd_mul(simd_sub(sr6, c05_vec), simd_mul(sr2, eps_vec)))); - - fix = simd_masked_add(fix, simd_mul(delx, force), cutoff_mask); - fiy = simd_masked_add(fiy, simd_mul(dely, force), cutoff_mask); - fiz = simd_masked_add(fiz, simd_mul(delz, force), cutoff_mask); - } - - atom_fx(i) += simd_h_reduce_sum(fix); - atom_fy(i) += simd_h_reduce_sum(fiy); - atom_fz(i) += simd_h_reduce_sum(fiz); - } - - LIKWID_MARKER_STOP("force"); - } -#endif - - double E = getTimeStamp(); - return E - S; -} diff --git a/verletlist/includes/integrate.h b/src/verletlist/integrate.h similarity index 100% rename from verletlist/includes/integrate.h rename to src/verletlist/integrate.h diff --git a/verletlist/main-stub.c b/src/verletlist/main-stub.c similarity index 100% rename from verletlist/main-stub.c rename to src/verletlist/main-stub.c diff --git a/verletlist/main.c b/src/verletlist/main.c similarity index 91% rename from verletlist/main.c rename to src/verletlist/main.c index 970bf03..3564261 100644 --- a/verletlist/main.c +++ b/src/verletlist/main.c @@ -10,8 +10,8 @@ #include #include -// #include #include +#include #include #include @@ -30,8 +30,8 @@ #define HLINE "------------------------------------------------------------------\n" -extern double computeForceLJFullNeigh_plain_c(Parameter*, Atom*, Neighbor*, Stats*); extern double computeForceLJHalfNeigh(Parameter*, Atom*, Neighbor*, Stats*); +extern double computeForceLJFullNeigh(Parameter*, Atom*, Neighbor*, Stats*); extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*); extern double computeForceDemFullNeigh(Parameter*, Atom*, Neighbor*, Stats*); @@ -325,35 +325,38 @@ int main(int argc, char** argv) timer[TOTAL] - timer[FORCE] - timer[NEIGH]); printf(HLINE); - // int nthreads = 0; - // int chunkSize = 0; - // omp_sched_t schedKind; - // char schedType[10]; - // #pragma omp parallel - // #pragma omp master - // { - // omp_get_schedule(&schedKind, &chunkSize); - // - // switch (schedKind) { - // case omp_sched_static: - // strcpy(schedType, "static"); - // break; - // case omp_sched_dynamic: - // strcpy(schedType, "dynamic"); - // break; - // case omp_sched_guided: - // strcpy(schedType, "guided"); - // break; - // case omp_sched_auto: - // strcpy(schedType, "auto"); - // break; - // } - // - // nthreads = omp_get_max_threads(); - // } - // - // printf("Num threads: %d\n", nthreads); - // printf("Schedule: (%s,%d)\n", schedType, chunkSize); + int nthreads = 0; + int chunkSize = 0; + omp_sched_t schedKind; + char schedType[10]; +#pragma omp parallel +#pragma omp master + { + omp_get_schedule(&schedKind, &chunkSize); + + switch (schedKind) { + case omp_sched_static: + strcpy(schedType, "static"); + break; + case omp_sched_dynamic: + strcpy(schedType, "dynamic"); + break; + case omp_sched_guided: + strcpy(schedType, "guided"); + break; + case omp_sched_auto: + strcpy(schedType, "auto"); + break; + case omp_sched_monotonic: + strcpy(schedType, "auto"); + break; + } + + nthreads = omp_get_max_threads(); + } + + printf("Num threads: %d\n", nthreads); + printf("Schedule: (%s,%d)\n", schedType, chunkSize); printf("Performance: %.2f million atom updates per second\n", 1e-6 * (double)atom.Natoms * param.ntimes / timer[TOTAL]); diff --git a/verletlist/neighbor.c b/src/verletlist/neighbor.c similarity index 100% rename from verletlist/neighbor.c rename to src/verletlist/neighbor.c diff --git a/verletlist/includes/neighbor.h b/src/verletlist/neighbor.h similarity index 100% rename from verletlist/includes/neighbor.h rename to src/verletlist/neighbor.h diff --git a/verletlist/pbc.c b/src/verletlist/pbc.c similarity index 100% rename from verletlist/pbc.c rename to src/verletlist/pbc.c diff --git a/verletlist/includes/pbc.h b/src/verletlist/pbc.h similarity index 100% rename from verletlist/includes/pbc.h rename to src/verletlist/pbc.h diff --git a/verletlist/stats.c b/src/verletlist/stats.c similarity index 100% rename from verletlist/stats.c rename to src/verletlist/stats.c diff --git a/verletlist/includes/stats.h b/src/verletlist/stats.h similarity index 100% rename from verletlist/includes/stats.h rename to src/verletlist/stats.h diff --git a/verletlist/tracing.c b/src/verletlist/tracing.c similarity index 100% rename from verletlist/tracing.c rename to src/verletlist/tracing.c diff --git a/verletlist/includes/tracing.h b/src/verletlist/tracing.h similarity index 100% rename from verletlist/includes/tracing.h rename to src/verletlist/tracing.h diff --git a/verletlist/vtk.c b/src/verletlist/vtk.c similarity index 100% rename from verletlist/vtk.c rename to src/verletlist/vtk.c diff --git a/verletlist/includes/vtk.h b/src/verletlist/vtk.h similarity index 100% rename from verletlist/includes/vtk.h rename to src/verletlist/vtk.h diff --git a/util/README.md b/util/README.md deleted file mode 100644 index 957b6ef..0000000 --- a/util/README.md +++ /dev/null @@ -1,37 +0,0 @@ -# Utility tools for MD-Bench - -**mdBench.c:** Single file version for MD-Bench, used mostly for teaching purposes. - -**run_stub.sh:** Bash script to run the MD-Bench stubbed force calculation for different configurations and evaluate the performance. -The configuration parameters are: -- **-a :** specify the number of atoms per unit cell (the number of neighbors per atom is this value minus 1), the default is 8. -- **-n :** timesteps to run the simulation, the default is 200. -- **-nx :** number of unit cells in the x dimension, the default is 4. -- **-ny :** number of unit cells in the y dimension, the default is 4. -- **-nz :** number of unit cells in the z dimension, the default is 2. - -Notice that these parameters can also be specified as lists, which executes the stubbed force calculation several times varying the specific parameter to each element of the list, and hence all combinations of parameters will be executed. For example, the following command: - -```bash -bash run_stub.sh -a "8 16" -nx "4 8" -ny 8 -nz 4 -``` - -Will execute the stubbed force calculation for the following 4 configurations: - -```bash -1> 8 atoms per unit cell on a 4x8x4 grid of unit cells, 200 timesteps -2> 16 atoms per unit cell on a 4x8x4 grid of unit cells, 200 timesteps -3> 8 atoms per unit cell on a 8x8x4 grid of unit cells, 200 timesteps -4> 16 atoms per unit cell on a 8x8x4 grid of unit cells, 200 timesteps -``` - -The following parameters are also available: -- **-f :** CPU frequency in GHz (assure your CPU frequency is fixed by disabling Turbo mode), more performance metrics such as cycles per iteration are displayed if this option is defined. -- **-o :** output file (.txt) for the results, the default is *run_results.txt*. -- **-r :** number of runs for each configuration (only the values for the best run are displayed), the default is 3. - -**plot_run_stub_data.py:** Python script to plot the data generated by the *run_stub.sh* script. Just provide the name of the .txt file as a parameter and this script generates a corresponding PDF with the same file name. - -**plot_gather_data.py:** Python script to plot the data generated by the gather benchmark. Just provide the name of the .txt file containing the gather output as a parameter and this script generates a corresponding PDF with the same file name. Multiple outputs with different strides can be included in the text file by concatenating the outputs. The script handles output from both standard simple array case and MD variant. - -**cache.py:** Python script to run the cache simulator with the data obtained from the memory tracer. Just run it with the tracer output file name as a parameter. The cache specifications can be directly adapted in the script to match those of the target processor of interest. diff --git a/util/cache.py b/util/cache.py deleted file mode 100644 index a02fbda..0000000 --- a/util/cache.py +++ /dev/null @@ -1,33 +0,0 @@ -import sys -from cachesim import CacheSimulator, Cache, MainMemory - -filename = sys.argv[1] -mem = MainMemory() - -#l3 = Cache("L3", 20480, 16, 64, "LRU") # 20MB: 20480 sets, 16-ways with cacheline size of 64 bytes -#l2 = Cache("L2", 256, 4, 64, "LRU", store_to=l3, load_from=l3) # 256KB -#l1 = Cache("L1", 64, 8, 64, "LRU", store_to=l2, load_from=l2) # 32KB - -# Cascade Lake -l3 = Cache("L3", 14336, 16, 64, "LRU", write_allocate=False) -l2 = Cache("L2", 1024, 16, 64, "LRU", store_to=l3, victims_to=l3) -l1 = Cache("L1", 64, 8, 64, "LRU", store_to=l2, load_from=l2) -mem.load_to(l2) -mem.store_from(l3) -cs = CacheSimulator(l1, mem) - -with open(filename, 'r') as fp: - for line in fp.readlines(): - op, addr = line.split(": ") - op = op[0] - addr = int(addr, 16) - - if op == 'W': - cs.store(addr, length=8) - elif op == 'R': - cs.load(addr, length=8) - else: - sys.exit("Invalid operation: {}".format(op)) - -cs.force_write_back() -cs.print_stats() diff --git a/util/cache_sets_histogram.py b/util/cache_sets_histogram.py deleted file mode 100644 index 58596a0..0000000 --- a/util/cache_sets_histogram.py +++ /dev/null @@ -1,39 +0,0 @@ -import sys -from cachesim import CacheSimulator, Cache, MainMemory - -def get_set_id(cache, addr): - return (addr >> cache.cl_bits) % cache.sets - -filename = sys.argv[1] -N = sys.argv[2] -mem = MainMemory() - -# Cascade Lake -l3 = Cache("L3", 14336, 16, 64, "LRU", write_allocate=False) -l2 = Cache("L2", 1024, 16, 64, "LRU", store_to=l3, victims_to=l3) -l1 = Cache("L1", 64, 8, 64, "LRU", store_to=l2, load_from=l2) -mem.load_to(l2) -mem.store_from(l3) -cs = CacheSimulator(l1, mem) - -sets_hist = { - 'l1': {s: 0 for s in range(l1.sets)}, - 'l2': {s: 0 for s in range(l2.sets)}, - 'l3': {s: 0 for s in range(l3.sets)} -} - -with open(filename, 'r') as fp: - for line in fp.readlines(): - op, addr = line.split(": ") - op = op[0] - addr = int(addr, 16) - sets_hist['l1'][get_set_id(l1, addr)] += 1 - sets_hist['l2'][get_set_id(l2, addr)] += 1 - sets_hist['l3'][get_set_id(l3, addr)] += 1 - -for cache_level, data in sets_hist.items(): - if cache_level != 'l3': - print(cache_level, ": ") - for set_id in data: - if data[set_id] > 0: - print(set_id, " -> ", data[set_id]) diff --git a/util/evaluate_latency_and_cfd.sh b/util/evaluate_latency_and_cfd.sh deleted file mode 100644 index e93c50a..0000000 --- a/util/evaluate_latency_and_cfd.sh +++ /dev/null @@ -1,116 +0,0 @@ -#!/bin/bash - -[[ -z "$1" ]] && echo "Use: $0 [-c ] [-f ] [-n ] [-l ] [-s]" && exit -[[ ! -f "$1" ]] && echo "Binary file not found, make sure to use 'make'" && exit -[[ ! -f "$1-stub" ]] && echo "Binary file for stubbed case not found, make sure to use 'make VARIANT=stub'" && exit - -MDBENCH_BIN=$1 -BIN_INFO="${MDBENCH_BIN#*-}" # $OPT_SCHEME-$TAG-$ISA-$PREC -OPT_SCHEME="${BIN_INFO%%-*}" -PREC="${BIN_INFO##*-}" -BIN_INFO="${BIN_INFO#*-}" # $TAG-$ISA-$PREC -BIN_INFO="${BIN_INFO%-*}" # $TAG-$ISA -TAG="${BIN_INFO%%-*}" -ISA="${BIN_INFO##*-}" -CORE="${CORE:-0}" -FREQ="${FREQ:-2.4}" -NRUNS="${NRUNS:-3}" -LOG="${LOG:-latencies_and_cfds.$(hostname).log}" -STUB_ONLY="${STUB_ONLY:-false}" -SKIP_SET_FREQ="${SKIP_SET_FREQ:-false}" - -OPTIND=2 -while getopts "c:f:n:l:s" flag; do - case "${flag}" in - c) CORE=${OPTARG};; - f) FREQ=${OPTARG};; - n) NRUNS=${OPTARG};; - l) LOG=${OPTARG};; - s) STUB_ONLY=true;; - esac -done - -# Other useful variables -MDBENCH_BIN=./MDBench-$OPT_SCHEME-$TAG-$ISA-$PREC -FIXED_PARAMS="--freq $FREQ" -CPU_VENDOR=$(lscpu | grep "Vendor ID" | tr -s ' ' | cut -d ' ' -f3) - -if [ "$CPU_VENDOR" == "GenuineIntel" ]; then - ALL_PREFETCHERS="HW_PREFETCHER,CL_PREFETCHER,DCU_PREFETCHER,IP_PREFETCHER" - DEFAULT_PREFETCHERS=("ALL HW_PREFETCHER CL_PREFETCHER DCU_PREFETCHER IP_PREFETCHER NONE") -else - ALL_PREFETCHERS="" - DEFAULT_PREFETCHERS=("IGNORE") -fi - -if [ -z ${PREFETCHERS+x} ]; then - PREFETCHERS=${DEFAULT_PREFETCHERS} -fi - -if [ "$OPT_SCHEME" == "gromacs" ]; then - STUB1_NAME=stub-33 - STUB1_PARAMS="-na 4 -nn 33" - STUB2_NAME=stub-128 - STUB2_PARAMS="-na 4 -nn 128" -else - STUB1_NAME=stub-76 - STUB1_PARAMS="-nn 76" - STUB2_NAME=stub-1024 - STUB2_PARAMS="-nn 1024" -fi - -function run_benchmark() { - BEST=10000000 - for i in $(seq $NRUNS); do - RES=$(likwid-pin -c $CORE "$* $FIXED_PARAMS" 2>&1 | grep "Cycles/SIMD iteration" | cut -d ' ' -f3) - if (( $(echo "$BEST > $RES" | bc -l ) )); then - BEST=$RES - fi - done -} - -echo "Tag: $TAG" | tee -a $LOG -echo "Optimization scheme: $OPT_SCHEME" | tee -a $LOG -echo "Instruction set: $ISA" | tee -a $LOG -echo "Precision: $PREC" | tee -a $LOG -echo "Binary: $MDBENCH_BIN(-stub)" | tee -a $LOG -echo "Frequency: $FREQ" | tee -a $LOG -echo "Number of runs: $NRUNS" | tee -a $LOG -echo "Run only stubbed cases: $STUB_ONLY" | tee -a $LOG - -if [ "$SKIP_SET_FREQ" == "false" ]; then - echo "Fixing frequencies..." - likwid-setFrequencies -f $FREQ -t 0 -fi - -for p in $PREFETCHERS; do - if [ "$p" != "IGNORE" ]; then - if [ "$p" == "ALL" ]; then - likwid-features -c $CORE -e $ALL_PREFETCHERS - elif [ "$p" == "NONE" ]; then - likwid-features -c $CORE -d $ALL_PREFETCHERS - else - likwid-features -c $CORE -d $ALL_PREFETCHERS - likwid-features -c $CORE -e $p - fi - - echo "Prefetcher settings: $p" - likwid-features -c $CORE -l - fi - - MSG="$p: " - if [ "$STUB_ONLY" == "false" ]; then - run_benchmark $MDBENCH_BIN - MSG+="standard=$BEST, " - run_benchmark $MDBENCH_BIN -i data/copper_melting/input_lj_cu_one_atomtype_20x20x20.dmp - MSG+="melt=$BEST, " - run_benchmark $MDBENCH_BIN -p data/argon_1000/mdbench_params.conf -i data/argon_1000/tprout.gro - MSG+="argon=$BEST, " - fi - - run_benchmark $MDBENCH_BIN-stub $STUB1_PARAMS - MSG+="$STUB1_NAME=$BEST, " - run_benchmark $MDBENCH_BIN-stub $STUB2_PARAMS - MSG+="$STUB2_NAME=$BEST" - echo $MSG | tee -a $LOG -done diff --git a/util/gather-bench/.gitignore b/util/gather-bench/.gitignore deleted file mode 100644 index c6127b3..0000000 --- a/util/gather-bench/.gitignore +++ /dev/null @@ -1,52 +0,0 @@ -# Prerequisites -*.d - -# Object files -*.o -*.ko -*.obj -*.elf - -# Linker output -*.ilk -*.map -*.exp - -# Precompiled Headers -*.gch -*.pch - -# Libraries -*.lib -*.a -*.la -*.lo - -# Shared objects (inc. Windows DLLs) -*.dll -*.so -*.so.* -*.dylib - -# Executables -*.exe -*.out -*.app -*.i*86 -*.x86_64 -*.hex - -# Debug files -*.dSYM/ -*.su -*.idb -*.pdb - -# Kernel Module Compile Results -*.mod* -*.cmd -.tmp_versions/ -modules.order -Module.symvers -Mkfile.old -dkms.conf diff --git a/util/gather-bench/LICENSE b/util/gather-bench/LICENSE deleted file mode 100644 index 7a6aa02..0000000 --- a/util/gather-bench/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) RRZE-HPC - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/util/gather-bench/Makefile b/util/gather-bench/Makefile deleted file mode 100644 index 12259e1..0000000 --- a/util/gather-bench/Makefile +++ /dev/null @@ -1,126 +0,0 @@ -#CONFIGURE BUILD SYSTEM -TARGET = gather-bench-$(TAG) -BUILD_DIR = ./$(TAG) -SRC_DIR = ./src -MAKE_DIR = ./ -ISA_DIR = ./src/$(ISA) -Q ?= @ - -#DO NOT EDIT BELOW -include $(MAKE_DIR)/config.mk -include $(MAKE_DIR)/include_$(TAG).mk -include $(MAKE_DIR)/include_LIKWID.mk -INCLUDES += -I./src/includes - -VPATH = $(SRC_DIR) ${ISA_DIR} -ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c)) -ASM += $(patsubst $(SRC_DIR)/%.f90, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.f90)) -OBJ = $(filter-out $(BUILD_DIR)/main%, $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c))) -OBJ += $(patsubst $(SRC_DIR)/%.cc, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cc)) -OBJ += $(patsubst $(SRC_DIR)/%.cpp, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cpp)) -OBJ += $(patsubst $(SRC_DIR)/%.f90, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.f90)) -OBJ += $(patsubst $(SRC_DIR)/%.F90, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.F90)) -OBJ += $(patsubst $(SRC_DIR)/%.s, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.s)) -OBJ += $(patsubst $(ISA_DIR)/%.S, $(BUILD_DIR)/%.o,$(wildcard $(ISA_DIR)/*.S)) -CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(INCLUDES) -DISA_$(ISA) - -ifneq ($(VARIANT),) - .DEFAULT_GOAL := ${TARGET}-$(VARIANT) -endif - -ifeq ($(strip $(DATA_LAYOUT)),AOS) - CPPFLAGS += -DAOS -endif - -ifeq ($(strip $(TEST)),true) - CPPFLAGS += -DTEST -endif - -ifeq ($(strip $(PADDING)),true) - CPPFLAGS += -DPADDING -endif - -ifeq ($(strip $(MEASURE_GATHER_CYCLES)),true) - CPPFLAGS += -DMEASURE_GATHER_CYCLES -endif - -ifeq ($(strip $(ONLY_FIRST_DIMENSION)),true) - CPPFLAGS += -DONLY_FIRST_DIMENSION -endif - -ifeq ($(strip $(MEM_TRACER)),true) - CPPFLAGS += -DMEM_TRACER -endif - -${TARGET}: $(BUILD_DIR) $(OBJ) $(SRC_DIR)/main.c - @echo "===> LINKING $(TARGET)" - $(Q)${LINKER} ${CPPFLAGS} ${LFLAGS} -o $(TARGET) $(SRC_DIR)/main.c $(OBJ) $(LIBS) - -${TARGET}-%: $(BUILD_DIR) $(OBJ) $(SRC_DIR)/main-%.c - @echo "===> LINKING $(TARGET)-$* " - $(Q)${LINKER} ${CPPFLAGS} ${LFLAGS} -o $(TARGET)-$* $(SRC_DIR)/main-$*.c $(OBJ) $(LIBS) - -asm: $(BUILD_DIR) $(ASM) - -$(BUILD_DIR)/%.o: %.c - @echo "===> COMPILE $@" - $(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@ - $(Q)$(CC) $(CPPFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d - -$(BUILD_DIR)/%.s: %.c - @echo "===> GENERATE ASM $@" - $(Q)$(CC) -S $(CPPFLAGS) $(CFLAGS) $< -o $@ - -$(BUILD_DIR)/%.s: %.f90 - @echo "===> COMPILE $@" - $(Q)$(FC) -S $(FCFLAGS) $< -o $@ - -$(BUILD_DIR)/%.o: %.cc - @echo "===> COMPILE $@" - $(Q)$(CXX) -c $(CPPFLAGS) $(CXXFLAGS) $< -o $@ - $(Q)$(CXX) $(CPPFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d - -$(BUILD_DIR)/%.o: %.cpp - @echo "===> COMPILE $@" - $(Q)$(CXX) -c $(CPPFLAGS) $(CXXFLAGS) $< -o $@ - $(Q)$(CXX) $(CPPFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d - -$(BUILD_DIR)/%.o: %.f90 - @echo "===> COMPILE $@" - $(Q)$(FC) -c $(FCFLAGS) $< -o $@ - -$(BUILD_DIR)/%.o: %.F90 - @echo "===> COMPILE $@" - $(Q)$(FC) -c $(CPPFLAGS) $(FCFLAGS) $< -o $@ - -$(BUILD_DIR)/%.o: %.s - @echo "===> ASSEMBLE $@" - $(Q)$(AS) $(ASFLAGS) $< -o $@ - -$(BUILD_DIR)/%.o: %.S - @echo "===> ASSEMBLE $@" - $(Q)$(CC) -c $(CPPFLAGS) $< -o $@ - -tags: - @echo "===> GENERATE TAGS" - $(Q)ctags -R - - -$(BUILD_DIR): - @mkdir $(BUILD_DIR) - -ifeq ($(findstring $(MAKECMDGOALS),clean),) --include $(OBJ:.o=.d) -endif - -.PHONY: clean distclean - -clean: - @echo "===> CLEAN" - @rm -rf $(BUILD_DIR) - @rm -f tags - -distclean: clean - @echo "===> DIST CLEAN" - @rm -f $(TARGET) - @rm -f tags diff --git a/util/gather-bench/README.md b/util/gather-bench/README.md deleted file mode 100644 index 5f6601b..0000000 --- a/util/gather-bench/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# gather-bench -A X86 gather instruction performance benchmark diff --git a/util/gather-bench/config.mk b/util/gather-bench/config.mk deleted file mode 100644 index e76a104..0000000 --- a/util/gather-bench/config.mk +++ /dev/null @@ -1,22 +0,0 @@ -# Supported: GCC, CLANG, ICC -TAG ?= ICC -# Supported: avx2, avx512 -ISA ?= avx512 -# Use likwid? -ENABLE_LIKWID ?= false - -# SP or DP -DATA_TYPE ?= DP -# AOS or SOA -DATA_LAYOUT ?= AOS -# Padding byte for AoS -PADDING ?= false -# Measure cycles for each gather separately -MEASURE_GATHER_CYCLES ?= false -# Gather data only for first dimension (one gather per iteration) -ONLY_FIRST_DIMENSION ?= false - -# Trace memory addresses for cache simulator -MEM_TRACER ?= false -# Test correctness of gather kernels -TEST ?= false diff --git a/util/gather-bench/include_CLANG.mk b/util/gather-bench/include_CLANG.mk deleted file mode 100644 index fda89f8..0000000 --- a/util/gather-bench/include_CLANG.mk +++ /dev/null @@ -1,9 +0,0 @@ -CC = clang -LINKER = $(CC) - -OPENMP =# -fopenmp -CFLAGS = -Ofast -std=c11 -march=core-avx2 -mavx -mfma $(OPENMP) -LFLAGS = $(OPENMP) -march=core-avx2 -mavx -mfma -DEFINES = -D_GNU_SOURCE -INCLUDES = -LIBS = diff --git a/util/gather-bench/include_GCC.mk b/util/gather-bench/include_GCC.mk deleted file mode 100644 index 16f1072..0000000 --- a/util/gather-bench/include_GCC.mk +++ /dev/null @@ -1,11 +0,0 @@ -CC = gcc -AS = as -LINKER = $(CC) - -OPENMP = -fopenmp -CFLAGS = -Ofast -std=c11 -mavx2 -mfma $(OPENMP) -ASFLAGS = -LFLAGS = $(OPENMP) -mavx2 -mfma -DEFINES = -D_GNU_SOURCE -INCLUDES = -LIBS = diff --git a/util/gather-bench/include_ICC.mk b/util/gather-bench/include_ICC.mk deleted file mode 100644 index 09613ac..0000000 --- a/util/gather-bench/include_ICC.mk +++ /dev/null @@ -1,9 +0,0 @@ -CC = icc -LINKER = $(CC) - -OPENMP = -qopenmp -CFLAGS = -Ofast -xhost -std=c11 $(OPENMP) -LFLAGS = $(OPENMP) -DEFINES = -D_GNU_SOURCE -INCLUDES = -LIBS = diff --git a/util/gather-bench/include_LIKWID.mk b/util/gather-bench/include_LIKWID.mk deleted file mode 100644 index 4ca5456..0000000 --- a/util/gather-bench/include_LIKWID.mk +++ /dev/null @@ -1,10 +0,0 @@ -LIKWID_INC ?= -I/usr/local/include -LIKWID_DEFINES ?= -DLIKWID_PERFMON -LIKWID_LIB ?= -L/usr/local/lib - -ifeq ($(strip $(ENABLE_LIKWID)),true) -INCLUDES += ${LIKWID_INC} -DEFINES += ${LIKWID_DEFINES} -LIBS += -llikwid -LFLAGS += ${LIKWID_LIB} -endif diff --git a/util/gather-bench/src/allocate.c b/util/gather-bench/src/allocate.c deleted file mode 100644 index c5781f9..0000000 --- a/util/gather-bench/src/allocate.c +++ /dev/null @@ -1,57 +0,0 @@ -/* - * ======================================================================================= - * - * Author: Jan Eitzinger (je), jan.eitzinger@fau.de - * Copyright (c) 2020 RRZE, University Erlangen-Nuremberg - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * ======================================================================================= - */ -#include -#include -#include - -void* allocate (int alignment, size_t bytesize) -{ - int errorCode; - void* ptr; - - errorCode = posix_memalign(&ptr, alignment, bytesize); - - if (errorCode) { - if (errorCode == EINVAL) { - fprintf(stderr, - "Error: Alignment parameter is not a power of two\n"); - exit(EXIT_FAILURE); - } - if (errorCode == ENOMEM) { - fprintf(stderr, - "Error: Insufficient memory to fulfill the request\n"); - exit(EXIT_FAILURE); - } - } - - if (ptr == NULL) { - fprintf(stderr, "Error: posix_memalign failed!\n"); - exit(EXIT_FAILURE); - } - - return ptr; -} diff --git a/util/gather-bench/src/avx2/gather.S b/util/gather-bench/src/avx2/gather.S deleted file mode 100644 index ff7c162..0000000 --- a/util/gather-bench/src/avx2/gather.S +++ /dev/null @@ -1,63 +0,0 @@ -.intel_syntax noprefix -.data -.align 64 -SCALAR: -.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 - -# rdi -> a -# rsi -> idx -# rdx -> N -# rcx -> t -.text -.globl gather -.type gather, @function -gather : -push rbp -mov rbp, rsp -push rbx -push r12 -push r13 -push r14 -push r15 - -xor rax, rax -vpcmpeqd ymm0, ymm0, ymm0 -.align 16 -1: -vmovups xmm1, [rsi + rax * 4] -vmovups xmm2, [rsi + rax * 4 + 16] -vmovups xmm3, [rsi + rax * 4 + 32] -vmovups xmm4, [rsi + rax * 4 + 48] -vmovdqa ymm5, ymm0 -vmovdqa ymm6, ymm0 -vmovdqa ymm7, ymm0 -vmovdqa ymm8, ymm0 -vxorpd ymm9, ymm9, ymm9 -vxorpd ymm10, ymm10, ymm10 -vxorpd ymm11, ymm11, ymm11 -vxorpd ymm12, ymm12, ymm12 -vgatherdpd ymm9, [rdi + xmm1 * 8], ymm5 -vgatherdpd ymm10, [rdi + xmm2 * 8], ymm6 -vgatherdpd ymm11, [rdi + xmm3 * 8], ymm7 -vgatherdpd ymm12, [rdi + xmm4 * 8], ymm8 - -#ifdef TEST -vmovapd [rcx + rax * 8], ymm9 -vmovapd [rcx + rax * 8 + 32], ymm10 -vmovapd [rcx + rax * 8 + 64], ymm11 -vmovapd [rcx + rax * 8 + 96], ymm12 -#endif - -addq rax, 16 -cmpq rax, rdx -jl 1b - -pop r15 -pop r14 -pop r13 -pop r12 -pop rbx -mov rsp, rbp -pop rbp -ret -.size gather, .-gather diff --git a/util/gather-bench/src/avx2/gather_aos.S b/util/gather-bench/src/avx2/gather_aos.S deleted file mode 100644 index dce42c7..0000000 --- a/util/gather-bench/src/avx2/gather_aos.S +++ /dev/null @@ -1,71 +0,0 @@ -.intel_syntax noprefix -.data -.align 64 -SCALAR: -.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 - -# rdi -> a -# rsi -> idx -# rdx -> N -# rcx -> t -.text -.globl gather_aos -.type gather_aos, @function -gather_aos : -push rbp -mov rbp, rsp -push rbx -push r9 -push r10 -push r11 -push r12 -push r13 -push r14 -push r15 - -xor rax, rax -vpcmpeqd ymm8, ymm8, ymm8 -.align 16 -1: - -vmovups xmm3, XMMWORD PTR [rsi + rax * 4] -vpaddd xmm4, xmm3, xmm3 -#ifdef PADDING -vpaddd xmm3, xmm4, xmm4 -#else -vpaddd xmm3, xmm3, xmm4 -#endif -vmovdqa ymm5, ymm8 -vmovdqa ymm6, ymm8 -vmovdqa ymm7, ymm8 -vxorpd ymm0, ymm0, ymm0 -vxorpd ymm1, ymm1, ymm1 -vxorpd ymm2, ymm2, ymm2 -vgatherdpd ymm0, [ rdi + xmm3 * 8], ymm5 -vgatherdpd ymm1, [8 + rdi + xmm3 * 8], ymm6 -vgatherdpd ymm2, [16 + rdi + xmm3 * 8], ymm7 - -#ifdef TEST -vmovupd [rcx + rax * 8], ymm0 -lea rbx, [rcx + rdx * 8] -vmovupd [rbx + rax * 8], ymm1 -lea r9, [rbx + rdx * 8] -vmovupd [r9 + rax * 8], ymm2 -#endif - -addq rax, 4 -cmpq rax, rdx -jl 1b - -pop r15 -pop r14 -pop r13 -pop r12 -pop r11 -pop r10 -pop r9 -pop rbx -mov rsp, rbp -pop rbp -ret -.size gather_aos, .-gather_aos diff --git a/util/gather-bench/src/avx2/gather_soa.S b/util/gather-bench/src/avx2/gather_soa.S deleted file mode 100644 index c506156..0000000 --- a/util/gather-bench/src/avx2/gather_soa.S +++ /dev/null @@ -1,67 +0,0 @@ -.intel_syntax noprefix -.data -.align 64 -SCALAR: -.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 - -# rdi -> a -# rsi -> idx -# rdx -> N -# rcx -> t -.text -.globl gather_soa -.type gather_soa, @function -gather_soa : -push rbp -mov rbp, rsp -push rbx -push r9 -push r10 -push r11 -push r12 -push r13 -push r14 -push r15 - -xor rax, rax -vpcmpeqd ymm8, ymm8, ymm8 -lea r8, [rdi + rdx * 8] -lea r9, [r8 + rdx * 8] -.align 16 -1: - -vmovups xmm3, XMMWORD PTR [rsi + rax * 4] -vmovdqa ymm5, ymm8 -vmovdqa ymm6, ymm8 -vmovdqa ymm7, ymm8 -vxorpd ymm0, ymm0, ymm0 -vxorpd ymm1, ymm1, ymm1 -vxorpd ymm2, ymm2, ymm2 -vgatherdpd ymm0, [rdi + xmm3 * 8], ymm5 -vgatherdpd ymm1, [r8 + xmm3 * 8], ymm6 -vgatherdpd ymm2, [r9 + xmm3 * 8], ymm7 - -#ifdef TEST -vmovupd [rcx + rax * 8], ymm0 -lea rbx, [rcx + rdx * 8] -vmovupd [rbx + rax * 8], ymm1 -lea r10, [rbx + rdx * 8] -vmovupd [r10 + rax * 8], ymm2 -#endif - -addq rax, 4 -cmpq rax, rdx -jl 1b - -pop r15 -pop r14 -pop r13 -pop r12 -pop r11 -pop r10 -pop r9 -pop rbx -mov rsp, rbp -pop rbp -ret -.size gather_soa, .-gather_soa diff --git a/util/gather-bench/src/avx512/gather.S b/util/gather-bench/src/avx512/gather.S deleted file mode 100644 index 54e78f4..0000000 --- a/util/gather-bench/src/avx512/gather.S +++ /dev/null @@ -1,62 +0,0 @@ -.intel_syntax noprefix -.data -.align 64 -SCALAR: -.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 - -# rdi -> a -# rsi -> idx -# rdx -> N -# rcx -> t -.text -.globl gather -.type gather, @function -gather : -push rbp -mov rbp, rsp -push rbx -push r12 -push r13 -push r14 -push r15 - -xor rax, rax -.align 16 -1: -vpcmpeqb k1, xmm0, xmm0 -vpcmpeqb k2, xmm0, xmm0 -vpcmpeqb k3, xmm0, xmm0 -vpcmpeqb k4, xmm0, xmm0 -vmovdqu ymm0, [rsi + rax * 4] -vmovdqu ymm1, [rsi + rax * 4 + 32] -vmovdqu ymm2, [rsi + rax * 4 + 64] -vmovdqu ymm3, [rsi + rax * 4 + 96] -vpxord zmm4, zmm4, zmm4 -vpxord zmm5, zmm5, zmm5 -vpxord zmm6, zmm6, zmm6 -vpxord zmm7, zmm7, zmm7 -vgatherdpd zmm4{k1}, [rdi + ymm0 * 8] -vgatherdpd zmm5{k2}, [rdi + ymm1 * 8] -vgatherdpd zmm6{k3}, [rdi + ymm2 * 8] -vgatherdpd zmm7{k4}, [rdi + ymm3 * 8] - -#ifdef TEST -vmovapd [rcx + rax * 8], zmm4 -vmovapd [rcx + rax * 8 + 64], zmm5 -vmovapd [rcx + rax * 8 + 128], zmm6 -vmovapd [rcx + rax * 8 + 192], zmm7 -#endif - -addq rax, 32 -cmpq rax, rdx -jl 1b - -pop r15 -pop r14 -pop r13 -pop r12 -pop rbx -mov rsp, rbp -pop rbp -ret -.size gather, .-gather diff --git a/util/gather-bench/src/avx512/gather_aos.S b/util/gather-bench/src/avx512/gather_aos.S deleted file mode 100644 index 484144e..0000000 --- a/util/gather-bench/src/avx512/gather_aos.S +++ /dev/null @@ -1,151 +0,0 @@ -.intel_syntax noprefix -.data -.align 64 -SCALAR: -.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 - -# rdi -> a -# rsi -> idx -# rdx -> N -# rcx -> t -# r8 -> cycles -.text -.globl gather_aos -.type gather_aos, @function -gather_aos : -push rbp -mov rbp, rsp -push rbx -push r9 -push r10 -push r11 -push r12 -push r13 -push r14 -push r15 - -xor rax, rax -.align 16 -1: - -vmovdqu ymm3, YMMWORD PTR [rsi + rax * 4] -vpaddd ymm4, ymm3, ymm3 -#ifdef PADDING -vpaddd ymm3, ymm4, ymm4 -#else -vpaddd ymm3, ymm3, ymm4 -#endif - -# Prefetching instructions -#mov ebx, DWORD PTR[rsi + rax*4] -#mov r9d, DWORD PTR[4 + rsi + rax*4] -#mov r10d, DWORD PTR[8 + rsi + rax*4] -#mov r11d, DWORD PTR[12 + rsi + rax*4] -#mov r12d, DWORD PTR[16 + rsi + rax*4] -#mov r13d, DWORD PTR[20 + rsi + rax*4] -#mov r14d, DWORD PTR[24 + rsi + rax*4] -#mov r15d, DWORD PTR[28 + rsi + rax*4] -#lea ebx, DWORD PTR[rbx] -#lea r9d, DWORD PTR[r9] -#lea r10d, DWORD PTR[r10] -#lea r11d, DWORD PTR[r11] -#lea r12d, DWORD PTR[r12] -#lea r13d, DWORD PTR[r13] -#lea r14d, DWORD PTR[r14] -#lea r15d, DWORD PTR[r15] - -vpcmpeqb k1, xmm5, xmm5 -#ifndef ONLY_FIRST_DIMENSION -vpcmpeqb k2, xmm5, xmm5 -vpcmpeqb k3, xmm5, xmm5 -#endif - -vpxord zmm0, zmm0, zmm0 -#ifndef ONLY_FIRST_DIMENSION -vpxord zmm1, zmm1, zmm1 -vpxord zmm2, zmm2, zmm2 -#endif - -#ifdef MEASURE_GATHER_CYCLES - -mov r9, rax -mov r10, rdx -xor r11, r11 -add r11, rax -add r11, rax -add r11, rax -#shr r11, 3 - -xor rbx, rbx -lfence -rdtsc -add ebx, eax -vgatherdpd zmm0{k1}, [rdi + ymm3 * 8] -lfence -rdtsc -sub eax, ebx -#movdiri [r8 + r11], rax -movnti [r8 + r11], rax - -#ifndef ONLY_FIRST_DIMENSION -xor rbx, rbx -lfence -rdtsc -add ebx, eax -vgatherdpd zmm1{k2}, [8 + rdi + ymm3 * 8] -lfence -rdtsc -sub eax, ebx -#movdiri [8 + r8 + r11], rax -movnti [8 + r8 + r11], rax - -xor rbx, rbx -lfence -rdtsc -add ebx, eax -vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8] -lfence -rdtsc -sub eax, ebx -#movdiri [16 + r8 + r11], rax -movnti [16 + r8 + r11], rax -#endif // ONLY_FIRST_DIMENSION - -mov rax, r9 -mov rdx, r10 - -#else // MEASURE_GATHER_CYCLES - -vgatherdpd zmm0{k1}, [ rdi + ymm3 * 8] - -#ifndef ONLY_FIRST_DIMENSION -vgatherdpd zmm1{k2}, [8 + rdi + ymm3 * 8] -vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8] -#endif - -#endif // MEASURE_GATHER_CYCLES - -#ifdef TEST -vmovupd [rcx + rax * 8], zmm0 -lea rbx, [rcx + rdx * 8] -vmovupd [rbx + rax * 8], zmm1 -lea r9, [rbx + rdx * 8] -vmovupd [r9 + rax * 8], zmm2 -#endif - -addq rax, 8 -cmpq rax, rdx -jl 1b - -pop r15 -pop r14 -pop r13 -pop r12 -pop r11 -pop r10 -pop r9 -pop rbx -mov rsp, rbp -pop rbp -ret -.size gather_aos, .-gather_aos diff --git a/util/gather-bench/src/avx512/gather_md_aos.S b/util/gather-bench/src/avx512/gather_md_aos.S deleted file mode 100644 index 573c86e..0000000 --- a/util/gather-bench/src/avx512/gather_md_aos.S +++ /dev/null @@ -1,147 +0,0 @@ -.intel_syntax noprefix -.data -.align 64 -SCALAR: -.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 - -.section .rodata, "a" -.align 64 -.align 64 -.ymm_reg_mask.1: - .long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007 - .type .ymm_reg_mask.1,@object - .size .ymm_reg_mask.1,32 - .align 8 - -# rdi -> a -# rsi -> neighbors -# rdx -> numneighs[i] -# rcx -> &t[t_idx] -# r8 -> ntest -.text -.globl gather_md_aos -.type gather_md_aos, @function -gather_md_aos : -push rbp -mov rbp, rsp -push rbx -push r10 -push r11 -push r12 -push r13 -push r14 -push r15 - -vmovdqu ymm7, YMMWORD PTR .ymm_reg_mask.1[rip] -mov r15, rdx -xor rax, rax -.align 16 -1: - -vmovdqu ymm3, YMMWORD PTR [rsi + rax * 4] -vpaddd ymm4, ymm3, ymm3 -#ifdef PADDING -vpaddd ymm3, ymm4, ymm4 -#else -vpaddd ymm3, ymm3, ymm4 -#endif - -# Prefetching instructions -#mov ebx, DWORD PTR[rsi + rax*4] -#mov r9d, DWORD PTR[4 + rsi + rax*4] -#mov r10d, DWORD PTR[8 + rsi + rax*4] -#mov r11d, DWORD PTR[12 + rsi + rax*4] -#mov r12d, DWORD PTR[16 + rsi + rax*4] -#mov r13d, DWORD PTR[20 + rsi + rax*4] -#mov r14d, DWORD PTR[24 + rsi + rax*4] -#mov r15d, DWORD PTR[28 + rsi + rax*4] -#lea ebx, DWORD PTR[rbx] -#lea r9d, DWORD PTR[r9] -#lea r10d, DWORD PTR[r10] -#lea r11d, DWORD PTR[r11] -#lea r12d, DWORD PTR[r12] -#lea r13d, DWORD PTR[r13] -#lea r14d, DWORD PTR[r14] -#lea r15d, DWORD PTR[r15] - -vpcmpeqb k1, xmm5, xmm5 -#ifndef ONLY_FIRST_DIMENSION -vpcmpeqb k2, xmm5, xmm5 -vpcmpeqb k3, xmm5, xmm5 -#endif - -vpxord zmm0, zmm0, zmm0 -#ifndef ONLY_FIRST_DIMENSION -vpxord zmm1, zmm1, zmm1 -vpxord zmm2, zmm2, zmm2 -#endif - -vgatherdpd zmm0{k1}, [ rdi + ymm3 * 8] -#ifndef ONLY_FIRST_DIMENSION -vgatherdpd zmm1{k2}, [8 + rdi + ymm3 * 8] -vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8] -#endif - -#ifdef TEST -vmovupd [rcx + rax * 8], zmm0 -lea rbx, [rcx + r8 * 8] -vmovupd [rbx + rax * 8], zmm1 -lea r10, [rbx + r8 * 8] -vmovupd [r10 + rax * 8], zmm2 -#endif - -# TODO: see if this logic can be optimized -addq rax, 8 -subq r15, 8 -cmpq r15, 8 -jge 1b - -cmpq r15, 0 -jle .end_func - -vpbroadcastd ymm6, r15d -vpcmpgtd k1, ymm6, ymm7 -vmovdqu32 ymm3{k1}{z}, YMMWORD PTR [rsi + rax * 4] -vpaddd ymm4, ymm3, ymm3 -#ifdef PADDING -vpaddd ymm3, ymm4, ymm4 -#else -vpaddd ymm3, ymm3, ymm4 -#endif - -vpxord zmm0, zmm1, zmm2 -#ifndef ONLY_FIRST_DIMENSION -kmovw k2, k1 -kmovw k3, k1 -vpxord zmm1, zmm1, zmm1 -vpxord zmm2, zmm2, zmm2 -#endif - -vgatherdpd zmm0{k1}, [ rdi + ymm3 * 8] -#ifndef ONLY_FIRST_DIMENSION -vgatherdpd zmm1{k2}, [8 + rdi + ymm3 * 8] -vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8] -#endif - -#ifdef TEST -vmovupd [rcx + rax * 8], zmm0 -lea rbx, [rcx + r8 * 8] -vmovupd [rbx + rax * 8], zmm1 -lea r10, [rbx + r8 * 8] -vmovupd [r10 + rax * 8], zmm2 -#endif - -addq rax, r15 - -.end_func: -pop r15 -pop r14 -pop r13 -pop r12 -pop r11 -pop r10 -pop rbx -mov rsp, rbp -pop rbp -ret -.size gather_md_aos, .-gather_md_aos diff --git a/util/gather-bench/src/avx512/gather_soa.S b/util/gather-bench/src/avx512/gather_soa.S deleted file mode 100644 index 25b1eba..0000000 --- a/util/gather-bench/src/avx512/gather_soa.S +++ /dev/null @@ -1,67 +0,0 @@ -.intel_syntax noprefix -.data -.align 64 -SCALAR: -.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 - -# rdi -> a -# rsi -> idx -# rdx -> N -# rcx -> t -.text -.globl gather_soa -.type gather_soa, @function -gather_soa : -push rbp -mov rbp, rsp -push rbx -push r9 -push r10 -push r11 -push r12 -push r13 -push r14 -push r15 - -xor rax, rax -vpcmpeqd ymm8, ymm8, ymm8 -lea r8, [rdi + rdx * 8] -lea r9, [r8 + rdx * 8] -.align 16 -1: - -vmovdqu ymm3, YMMWORD PTR [rsi + rax * 4] -vpcmpeqb k1, xmm5, xmm5 -vpcmpeqb k2, xmm5, xmm5 -vpcmpeqb k3, xmm5, xmm5 -vpxord zmm0, zmm0, zmm0 -vpxord zmm1, zmm1, zmm1 -vpxord zmm2, zmm2, zmm2 -vgatherdpd zmm0{k1}, [rdi + ymm3 * 8] -vgatherdpd zmm1{k2}, [r8 + ymm3 * 8] -vgatherdpd zmm2{k3}, [r9 + ymm3 * 8] - -#ifdef TEST -vmovupd [rcx + rax * 8], zmm0 -lea rbx, [rcx + rdx * 8] -vmovupd [rbx + rax * 8], zmm1 -lea r10, [rbx + rdx * 8] -vmovupd [r10 + rax * 8], zmm2 -#endif - -addq rax, 8 -cmpq rax, rdx -jl 1b - -pop r15 -pop r14 -pop r13 -pop r12 -pop r11 -pop r10 -pop r9 -pop rbx -mov rsp, rbp -pop rbp -ret -.size gather_soa, .-gather_soa diff --git a/util/gather-bench/src/avx512/load_aos.S b/util/gather-bench/src/avx512/load_aos.S deleted file mode 100644 index c5e1dac..0000000 --- a/util/gather-bench/src/avx512/load_aos.S +++ /dev/null @@ -1,23 +0,0 @@ -.intel_syntax noprefix -.data -.align 64 -SCALAR: -.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 - -# rdi -> &a[i * snbytes] - -.text -.globl load_aos -.type load_aos, @function -load_aos : - -vmovsd xmm0, QWORD PTR [rdi] -vmovsd xmm1, QWORD PTR [8 + rdi] -vmovsd xmm2, QWORD PTR [16 + rdi] - -vbroadcastsd zmm3, xmm0 -vbroadcastsd zmm4, xmm1 -vbroadcastsd zmm5, xmm2 - -ret -.size load_aos, .-load_aos diff --git a/util/gather-bench/src/includes/allocate.h b/util/gather-bench/src/includes/allocate.h deleted file mode 100644 index 1732898..0000000 --- a/util/gather-bench/src/includes/allocate.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * ======================================================================================= - * - * Author: Jan Eitzinger (je), jan.eitzinger@fau.de - * Copyright (c) 2020 RRZE, University Erlangen-Nuremberg - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * ======================================================================================= - */ -#ifndef __ALLOCATE_H_ -#define __ALLOCATE_H_ - -extern void* allocate (int alignment, size_t bytesize); - -#endif diff --git a/util/gather-bench/src/includes/likwid-marker.h b/util/gather-bench/src/includes/likwid-marker.h deleted file mode 100644 index a35b495..0000000 --- a/util/gather-bench/src/includes/likwid-marker.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * ======================================================================================= - * - * Author: Jan Eitzinger (je), jan.eitzinger@fau.de - * Copyright (c) 2020 RRZE, University Erlangen-Nuremberg - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * ======================================================================================= - */ -#ifndef LIKWID_MARKERS_H -#define LIKWID_MARKERS_H - -#ifdef LIKWID_PERFMON -#include -#define LIKWID_MARKER_INIT likwid_markerInit() -#define LIKWID_MARKER_THREADINIT likwid_markerThreadInit() -#define LIKWID_MARKER_SWITCH likwid_markerNextGroup() -#define LIKWID_MARKER_REGISTER(regionTag) likwid_markerRegisterRegion(regionTag) -#define LIKWID_MARKER_START(regionTag) likwid_markerStartRegion(regionTag) -#define LIKWID_MARKER_STOP(regionTag) likwid_markerStopRegion(regionTag) -#define LIKWID_MARKER_CLOSE likwid_markerClose() -#define LIKWID_MARKER_RESET(regionTag) likwid_markerResetRegion(regionTag) -#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count) likwid_markerGetRegion(regionTag, nevents, events, time, count) -#else /* LIKWID_PERFMON */ -#define LIKWID_MARKER_INIT -#define LIKWID_MARKER_THREADINIT -#define LIKWID_MARKER_SWITCH -#define LIKWID_MARKER_REGISTER(regionTag) -#define LIKWID_MARKER_START(regionTag) -#define LIKWID_MARKER_STOP(regionTag) -#define LIKWID_MARKER_CLOSE -#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count) -#define LIKWID_MARKER_RESET(regionTag) -#endif /* LIKWID_PERFMON */ - -#endif /*LIKWID_MARKERS_H*/ diff --git a/util/gather-bench/src/includes/timing.h b/util/gather-bench/src/includes/timing.h deleted file mode 100644 index 6d9fb93..0000000 --- a/util/gather-bench/src/includes/timing.h +++ /dev/null @@ -1,34 +0,0 @@ -/* - * ======================================================================================= - * - * Author: Jan Eitzinger (je), jan.eitzinger@fau.de - * Copyright (c) 2020 RRZE, University Erlangen-Nuremberg - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * ======================================================================================= - */ -#ifndef __TIMING_H_ -#define __TIMING_H_ - -extern double getTimeStamp(); -extern double getTimeResolution(); -extern double getTimeStamp_(); - -#endif diff --git a/util/gather-bench/src/main-md-trace.c b/util/gather-bench/src/main-md-trace.c deleted file mode 100644 index 9fade75..0000000 --- a/util/gather-bench/src/main-md-trace.c +++ /dev/null @@ -1,441 +0,0 @@ -/* - * ======================================================================================= - * - * Author: Jan Eitzinger (je), jan.eitzinger@fau.de - * Copyright (c) RRZE, University Erlangen-Nuremberg - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * ======================================================================================= - */ -#include -#include -#include -#include -#include -#include -#include -#include -//--- -#include -//--- -#include -#include - -#if !defined(ISA_avx2) && !defined (ISA_avx512) -#error "Invalid ISA macro, possible values are: avx2 and avx512" -#endif - -#if defined(TEST) && defined(ONLY_FIRST_DIMENSION) -#error "TEST and ONLY_FIRST_DIMENSION options are mutually exclusive!" -#endif - -#define HLINE "----------------------------------------------------------------------------\n" - -#ifndef MIN -#define MIN(x,y) ((x)<(y)?(x):(y)) -#endif -#ifndef MAX -#define MAX(x,y) ((x)>(y)?(x):(y)) -#endif -#ifndef ABS -#define ABS(a) ((a) >= 0 ? (a) : -(a)) -#endif - -#define ARRAY_ALIGNMENT 64 - -#ifdef ISA_avx512 -#define _VL_ 8 -#define ISA_STRING "avx512" -#else -#define _VL_ 4 -#define ISA_STRING "avx2" -#endif - -#ifdef AOS -#define GATHER gather_md_aos -#define LOAD(a, i, d, n) load_aos(&a[i * d]) -#define LAYOUT_STRING "AoS" -#else -#define GATHER gather_md_soa -#define LOAD(a, i, d, n) load_soa(a, i, n) -#define LAYOUT_STRING "SoA" -#endif - -#if defined(PADDING) && defined(AOS) -#define PADDING_BYTES 1 -#else -#define PADDING_BYTES 0 -#endif - -#ifdef MEM_TRACER -# define MEM_TRACER_INIT(trace_file) FILE *mem_tracer_fp = fopen(get_mem_tracer_filename(trace_file), "w"); -# define MEM_TRACER_END fclose(mem_tracer_fp); -# define MEM_TRACE(addr, op) fprintf(mem_tracer_fp, "%c: %p\n", op, (void *)(&(addr))); -#else -# define MEM_TRACER_INIT -# define MEM_TRACER_END -# define MEM_TRACE(addr, op) -#endif - -int gather_md_aos(double*, int*, int, double*, int); -int gather_md_soa(double*, int*, int, double*, int); -void load_aos(double*); -void load_soa(double*, int, int); - -const char *get_mem_tracer_filename(const char *trace_file) { - static char fname[64]; - snprintf(fname, sizeof fname, "mem_tracer_%s.txt", trace_file); - return fname; -} - -int log2_uint(unsigned int x) { - int ans = 0; - while(x >>= 1) { ans++; } - return ans; -} - -int main (int argc, char** argv) { - LIKWID_MARKER_INIT; - LIKWID_MARKER_REGISTER("gather"); - char *trace_file = NULL; - int cl_size = 64; - int ntimesteps = 200; - int reneigh_every = 20; - int opt = 0; - double freq = 2.5; - struct option long_opts[] = { - {"trace" , required_argument, NULL, 't'}, - {"freq", required_argument, NULL, 'f'}, - {"line", required_argument, NULL, 'l'}, - {"timesteps", required_argument, NULL, 'n'}, - {"reneigh", required_argument, NULL, 'r'}, - {"help", required_argument, NULL, 'h'} - }; - - while((opt = getopt_long(argc, argv, "t:f:l:n:r:h", long_opts, NULL)) != -1) { - switch(opt) { - case 't': - trace_file = strdup(optarg); - break; - - case 'f': - freq = atof(optarg); - break; - - case 'l': - cl_size = atoi(optarg); - break; - - case 'n': - ntimesteps = atoi(optarg); - break; - - case 'r': - reneigh_every = atoi(optarg); - break; - - case 'h': - case '?': - default: - printf("Usage: %s [OPTION]...\n", argv[0]); - printf("MD variant for gather benchmark.\n\n"); - printf("Mandatory arguments to long options are also mandatory for short options.\n"); - printf("\t-t, --trace=STRING input file with traced indexes from MD-Bench.\n"); - printf("\t-f, --freq=REAL CPU frequency in GHz (default 2.5).\n"); - printf("\t-l, --line=NUMBER cache line size in bytes (default 64).\n"); - printf("\t-n, --timesteps=NUMBER number of timesteps to simulate (default 200).\n"); - printf("\t-r, --reneigh=NUMBER reneighboring frequency in timesteps (default 20).\n"); - printf("\t-h, --help display this help message.\n"); - printf("\n\n"); - return EXIT_FAILURE; - } - } - - if(trace_file == NULL) { - fprintf(stderr, "Trace file not specified!\n"); - return EXIT_FAILURE; - } - - FILE *fp; - char *line = NULL; - int *neighborlists = NULL; - int *numneighs = NULL; - int atom = -1; - int nlocal, nghost, maxneighs; - int nall = 0; - int N_alloc = 0; - size_t ntest = 0; - size_t llen; - ssize_t read; - double *a = NULL; - double *f = NULL; - double *t = NULL; - double time = 0.0; - double E, S; - const int dims = 3; - const int snbytes = dims + PADDING_BYTES; // bytes per element (struct), includes padding - long long int niters = 0; - long long int ngathered = 0; - - printf("ISA,Layout,Dims,Frequency (GHz),Cache Line Size (B),Vector Width (e)\n"); - printf("%s,%s,%d,%f,%d,%d\n\n", ISA_STRING, LAYOUT_STRING, dims, freq, cl_size, _VL_); - freq = freq * 1e9; - - #ifdef ONLY_FIRST_DIMENSION - const int gathered_dims = 1; - #else - const int gathered_dims = dims; - #endif - - for(int ts = -1; ts < ntimesteps; ts++) { - if(!((ts + 1) % reneigh_every)) { - char ts_trace_file[128]; - snprintf(ts_trace_file, sizeof ts_trace_file, "%s_%d.out", trace_file, ts + 1); - if((fp = fopen(ts_trace_file, "r")) == NULL) { - fprintf(stderr, "Error: could not open trace file!\n"); - return EXIT_FAILURE; - } - - while((read = getline(&line, &llen, fp)) != -1) { - int i = 2; - if(strncmp(line, "N:", 2) == 0) { - while(line[i] == ' ') { i++; } - nlocal = atoi(strtok(&line[i], " ")); - nghost = atoi(strtok(NULL, " ")); - nall = nlocal + nghost; - maxneighs = atoi(strtok(NULL, " ")); - - if(nlocal <= 0 || maxneighs <= 0) { - fprintf(stderr, "Number of local atoms and neighbor lists capacity cannot be less or equal than zero!\n"); - return EXIT_FAILURE; - } - - if(neighborlists == NULL) { - neighborlists = (int *) allocate( ARRAY_ALIGNMENT, nlocal * maxneighs * sizeof(int) ); - numneighs = (int *) allocate( ARRAY_ALIGNMENT, nlocal * sizeof(int) ); - } - } - - if(strncmp(line, "A:", 2) == 0) { - while(line[i] == ' ') { i++; } - atom = atoi(strtok(&line[i], " ")); - numneighs[atom] = 0; - } - - if(strncmp(line, "I:", 2) == 0) { - while(line[i] == ' ') { i++; } - char *neigh_idx = strtok(&line[i], " "); - - while(neigh_idx != NULL && *neigh_idx != '\n') { - int j = numneighs[atom]; - neighborlists[atom * maxneighs + j] = atoi(neigh_idx); - numneighs[atom]++; - ntest++; - neigh_idx = strtok(NULL, " "); - } - } - } - - fclose(fp); - } - - if(N_alloc == 0) { - N_alloc = nall * 2; - a = (double*) allocate( ARRAY_ALIGNMENT, N_alloc * snbytes * sizeof(double) ); - f = (double*) allocate( ARRAY_ALIGNMENT, N_alloc * dims * sizeof(double) ); - } - - #ifdef TEST - if(t != NULL) { free(t); } - ntest += 100; - t = (double*) allocate( ARRAY_ALIGNMENT, ntest * dims * sizeof(double) ); - #endif - - for(int i = 0; i < N_alloc; ++i) { - #ifdef AOS - a[i * snbytes + 0] = i * dims + 0; - a[i * snbytes + 1] = i * dims + 1; - a[i * snbytes + 2] = i * dims + 2; - #else - a[N * 0 + i] = N * 0 + i; - a[N * 1 + i] = N * 1 + i; - a[N * 2 + i] = N * 2 + i; - #endif - f[i * dims + 0] = 0.0; - f[i * dims + 1] = 0.0; - f[i * dims + 2] = 0.0; - } - - int t_idx = 0; - S = getTimeStamp(); - LIKWID_MARKER_START("gather"); - for(int i = 0; i < nlocal; i++) { - int *neighbors = &neighborlists[i * maxneighs]; - // We inline the assembly for AVX512 with AoS layout to evaluate the impact - // of calling external assembly procedures in the overall runtime - #ifdef ISA_avx512 - __m256i ymm_reg_mask = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); - __asm__ __volatile__( "vmovsd 0(%0), %%xmm3;" - "vmovsd 8(%0), %%xmm4;" - "vmovsd 16(%0), %%xmm5;" - "vbroadcastsd %%xmm3, %%zmm0;" - "vbroadcastsd %%xmm4, %%zmm1;" - "vbroadcastsd %%xmm5, %%zmm2;" - : - : "r" (&a[i * snbytes]) - : "%xmm3", "%xmm4", "%xmm5", "%zmm0", "%zmm1", "%zmm2" ); - - __asm__ __volatile__( "xor %%rax, %%rax;" - "movq %%rdx, %%r15;" - "1: vmovdqu (%1,%%rax,4), %%ymm3;" - "vpaddd %%ymm3, %%ymm3, %%ymm4;" - #ifdef PADDING - "vpaddd %%ymm4, %%ymm4, %%ymm3;" - #else - "vpaddd %%ymm3, %%ymm4, %%ymm3;" - #endif - "vpcmpeqb %%xmm5, %%xmm5, %%k1;" - "vpcmpeqb %%xmm5, %%xmm5, %%k2;" - "vpcmpeqb %%xmm5, %%xmm5, %%k3;" - "vpxord %%zmm0, %%zmm0, %%zmm0;" - "vpxord %%zmm1, %%zmm1, %%zmm1;" - "vpxord %%zmm2, %%zmm2, %%zmm2;" - "vgatherdpd (%3, %%ymm3, 8), %%zmm0{{%%k1}};" - "vgatherdpd 8(%3, %%ymm3, 8), %%zmm1{{%%k2}};" - "vgatherdpd 16(%3, %%ymm3, 8), %%zmm2{{%%k3}};" - "addq $8, %%rax;" - "subq $8, %%r15;" - "cmpq $8, %%r15;" - "jge 1b;" - "cmpq $0, %%r15;" - "jle 2;" - "vpbroadcastd %%r15d, %%ymm5;" - "vpcmpgtd %%ymm5, %2, %%k1;" - "vmovdqu32 (%1,%%rax,4), %%ymm3{{%%k1}}{{z}};" - "vpaddd %%ymm3, %%ymm3, %%ymm4;" - #ifdef PADDING - "vpaddd %%ymm4, %%ymm4, %%ymm3;" - #else - "vpaddd %%ymm3, %%ymm4, %%ymm3;" - #endif - "vpxord %%zmm0, %%zmm0, %%zmm0;" - "kmovw %%k1, %%k2;" - "kmovw %%k1, %%k3;" - "vpxord %%zmm1, %%zmm1, %%zmm1;" - "vpxord %%zmm2, %%zmm2, %%zmm2;" - "vgatherdpd (%3, %%ymm3, 8), %%zmm0{{%%k1}};" - "vgatherdpd 8(%3, %%ymm3, 8), %%zmm1{{%%k2}};" - "vgatherdpd 16(%3, %%ymm3, 8), %%zmm2{{%%k3}};" - "addq %%r15, %%rax;" - "2:;" - : - : "d" (numneighs[i]), "r" (neighbors), "x" (ymm_reg_mask), "r" (a) - : "%rax", "%r15", "%ymm3", "%ymm4", "%ymm5", "%k1", "%k2", "%k3", "%zmm0", "%zmm1", "%zmm2" ); - #else - LOAD(a, i, snbytes, N_alloc); - t_idx += GATHER(a, neighbors, numneighs[i], &t[t_idx], ntest); - #endif - f[i * dims + 0] += i; - f[i * dims + 1] += i; - f[i * dims + 2] += i; - } - LIKWID_MARKER_STOP("gather"); - E = getTimeStamp(); - time += E - S; - - #ifdef MEM_TRACER - MEM_TRACER_INIT(trace_file); - for(int i = 0; i < nlocal; i++) { - int *neighbors = &neighborlists[i * maxneighs]; - - for(int d = 0; d < gathered_dims; d++) { - #ifdef AOS - MEM_TRACE('R', a[i * snbytes + d]) - #else - MEM_TRACE('R', a[d * N + i]) - #endif - } - - for(int j = 0; j < numneighs[i]; j += _VL_) { - for(int jj = j; jj < MIN(j + _VL_, numneighs[i]); j++) { - int k = neighbors[jj]; - for(int d = 0; d < gathered_dims; d++) { - #ifdef AOS - MEM_TRACE('R', a[k * snbytes + d]) - #else - MEM_TRACE('R', a[d * N + k]) - #endif - } - } - } - } - MEM_TRACER_END; - #endif - - #ifdef TEST - int test_failed = 0; - t_idx = 0; - for(int i = 0; i < nlocal; ++i) { - int *neighbors = &neighborlists[i * maxneighs]; - for(int j = 0; j < numneighs[i]; ++j) { - int k = neighbors[j]; - for(int d = 0; d < dims; ++d) { - #ifdef AOS - if(t[d * ntest + t_idx] != k * dims + d) { - #else - if(t[d * ntest + t_idx] != d * N + k) { - #endif - test_failed = 1; - break; - } - } - - t_idx++; - } - } - - if(test_failed) { - printf("Test failed!\n"); - return EXIT_FAILURE; - } - #endif - - for(int i = 0; i < nlocal; i++) { - niters += (numneighs[i] / _VL_) + ((numneighs[i] % _VL_ == 0) ? 0 : 1); - ngathered += numneighs[i]; - } - } - - printf("%14s,%14s,%14s,%14s,%14s,%14s", "tot. time(s)", "time/step(ms)", "time/iter(us)", "cy/it", "cy/gather", "cy/elem"); - printf("\n"); - const double time_per_step = time * 1e3 / ((double) ntimesteps); - const double time_per_it = time * 1e6 / ((double) niters); - const double cy_per_it = time * freq * _VL_ / ((double) niters); - const double cy_per_gather = time * freq * _VL_ / ((double) niters * gathered_dims); - const double cy_per_elem = time * freq / ((double) ngathered * gathered_dims); - printf("%14.6f,%14.6f,%14.6f,%14.6f,%14.6f,%14.6f\n", time, time_per_step, time_per_it, cy_per_it, cy_per_gather, cy_per_elem); - - #ifdef TEST - printf("Test passed!\n"); - #endif - - LIKWID_MARKER_CLOSE; - return EXIT_SUCCESS; -} diff --git a/util/gather-bench/src/main-md.c b/util/gather-bench/src/main-md.c deleted file mode 100644 index 2ad8c9a..0000000 --- a/util/gather-bench/src/main-md.c +++ /dev/null @@ -1,361 +0,0 @@ -/* - * ======================================================================================= - * - * Author: Jan Eitzinger (je), jan.eitzinger@fau.de - * Copyright (c) RRZE, University Erlangen-Nuremberg - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * ======================================================================================= - */ -#include -#include -#include -#include -#include -#include -//--- -#include -//--- -#include -#include - -#if !defined(ISA_avx2) && !defined (ISA_avx512) -#error "Invalid ISA macro, possible values are: avx2 and avx512" -#endif - -#if defined(TEST) && defined(ONLY_FIRST_DIMENSION) -#error "TEST and ONLY_FIRST_DIMENSION options are mutually exclusive!" -#endif - -#define HLINE "----------------------------------------------------------------------------\n" - -#ifndef MIN -#define MIN(x,y) ((x)<(y)?(x):(y)) -#endif -#ifndef MAX -#define MAX(x,y) ((x)>(y)?(x):(y)) -#endif -#ifndef ABS -#define ABS(a) ((a) >= 0 ? (a) : -(a)) -#endif - -#define ARRAY_ALIGNMENT 64 -#define SIZE 20000 - -#ifdef ISA_avx512 -#define _VL_ 8 -#define ISA_STRING "avx512" -#else -#define _VL_ 4 -#define ISA_STRING "avx2" -#endif - -#ifdef AOS -#define GATHER gather_aos -#define LAYOUT_STRING "AoS" -#else -#define GATHER gather_soa -#define LAYOUT_STRING "SoA" -#endif - -#if defined(PADDING) && defined(AOS) -#define PADDING_BYTES 1 -#else -#define PADDING_BYTES 0 -#endif - -#ifdef MEM_TRACER -# define MEM_TRACER_INIT(stride, size) FILE *mem_tracer_fp = fopen(get_mem_tracer_filename(stride, size), "w"); -# define MEM_TRACER_END fclose(mem_tracer_fp); -# define MEM_TRACE(addr, op) fprintf(mem_tracer_fp, "%c: %p\n", op, (void *)(&(addr))); -#else -# define MEM_TRACER_INIT -# define MEM_TRACER_END -# define MEM_TRACE(addr, op) -#endif - -extern void gather_aos(double*, int*, int, double*, long int*); -extern void gather_soa(double*, int*, int, double*, long int*); - -const char *get_mem_tracer_filename(int stride, int size) { - static char fname[64]; - snprintf(fname, sizeof fname, "mem_tracer_%d_%d.txt", stride, size); - return fname; -} - -int log2_uint(unsigned int x) { - int ans = 0; - while(x >>= 1) { ans++; } - return ans; -} - -int main (int argc, char** argv) { - LIKWID_MARKER_INIT; - LIKWID_MARKER_REGISTER("gather"); - int stride = 1; - int cl_size = 64; - int opt = 0; - double freq = 2.5; - struct option long_opts[] = { - {"stride", required_argument, NULL, 's'}, - {"freq", required_argument, NULL, 'f'}, - {"line", required_argument, NULL, 'l'}, - {"help", required_argument, NULL, 'h'} - }; - - while((opt = getopt_long(argc, argv, "s:f:l:h", long_opts, NULL)) != -1) { - switch(opt) { - case 's': - stride = atoi(optarg); - break; - - case 'f': - freq = atof(optarg); - break; - - case 'l': - cl_size = atoi(optarg); - break; - - case 'h': - case '?': - default: - printf("Usage: %s [OPTION]...\n", argv[0]); - printf("MD variant for gather benchmark.\n\n"); - printf("Mandatory arguments to long options are also mandatory for short options.\n"); - printf("\t-s, --stride=NUMBER stride between two successive elements (default 1).\n"); - printf("\t-f, --freq=REAL CPU frequency in GHz (default 2.5).\n"); - printf("\t-l, --line=NUMBER cache line size in bytes (default 64).\n"); - printf("\t-h, --help display this help message.\n"); - printf("\n\n"); - return EXIT_FAILURE; - } - } - - size_t bytesPerWord = sizeof(double); - const int dims = 3; - const int snbytes = dims + PADDING_BYTES; // bytes per element (struct), includes padding - #ifdef AOS - size_t cacheLinesPerGather = MIN(MAX(stride * _VL_ * snbytes / (cl_size / sizeof(double)), 1), _VL_); - #else - size_t cacheLinesPerGather = MIN(MAX(stride * _VL_ / (cl_size / sizeof(double)), 1), _VL_) * dims; - #endif - size_t N = SIZE; - double E, S; - - printf("ISA,Layout,Stride,Dims,Frequency (GHz),Cache Line Size (B),Vector Width (e),Cache Lines/Gather\n"); - printf("%s,%s,%d,%d,%f,%d,%d,%lu\n\n", ISA_STRING, LAYOUT_STRING, stride, dims, freq, cl_size, _VL_, cacheLinesPerGather); - printf("%14s,%14s,%14s,", "N", "Size(kB)", "cut CLs"); - -#ifndef MEASURE_GATHER_CYCLES - printf("%14s,%14s,%14s,%14s,%14s", "tot. time", "time/LUP(ms)", "cy/it", "cy/gather", "cy/elem"); -#else - -#ifdef ONLY_FIRST_DIMENSION - printf("%27s,%27s,%27s", "min/max/avg cy(x)", "min/max/avg cy(y)", "min/max/avg cy(z)"); -#else - printf("%27s", "min/max/avg cy(x)"); -#endif - -#endif - - printf("\n"); - freq = freq * 1e9; - - for(int N = 512; N < 80000000; N = 1.5 * N) { - // Currently this only works when the array size (in elements) is multiple of the vector length (no preamble and prelude) - if(N % _VL_ != 0) { - N += _VL_ - (N % _VL_); - } - - MEM_TRACER_INIT(stride, N); - - int N_gathers_per_dim = N / _VL_; - int N_alloc = N * 2; - int N_cycles_alloc = N_gathers_per_dim * 2; - int cut_cl = 0; - double* a = (double*) allocate( ARRAY_ALIGNMENT, N_alloc * snbytes * sizeof(double) ); - int* idx = (int*) allocate( ARRAY_ALIGNMENT, N_alloc * sizeof(int) ); - int rep; - double time; - -#ifdef TEST - double* t = (double*) allocate( ARRAY_ALIGNMENT, N_alloc * dims * sizeof(double) ); -#else - double* t = (double*) NULL; -#endif - -#ifdef MEASURE_GATHER_CYCLES - long int* cycles = (long int*) allocate( ARRAY_ALIGNMENT, N_cycles_alloc * dims * sizeof(long int)) ; -#else - long int* cycles = (long int*) NULL; -#endif - - for(int i = 0; i < N_alloc; ++i) { -#ifdef AOS - a[i * snbytes + 0] = i * dims + 0; - a[i * snbytes + 1] = i * dims + 1; - a[i * snbytes + 2] = i * dims + 2; -#else - a[N * 0 + i] = N * 0 + i; - a[N * 1 + i] = N * 1 + i; - a[N * 2 + i] = N * 2 + i; -#endif - idx[i] = (i * stride) % N; - } - -#ifdef ONLY_FIRST_DIMENSION - const int gathered_dims = 1; -#else - const int gathered_dims = dims; -#endif - -#ifdef MEM_TRACER - for(int i = 0; i < N; i += _VL_) { - for(int j = 0; j < _VL_; j++) { - MEM_TRACE(idx[i + j], 'R'); - } - - for(int d = 0; d < gathered_dims; d++) { - for(int j = 0; j < _VL_; j++) { -#ifdef AOS - MEM_TRACE(a[idx[i + j] * snbytes + d], 'R'); -#else - MEM_TRACE(a[N * d + idx[i + j]], 'R'); -#endif - } - } - } -#endif - -#ifdef AOS - const int cl_shift = log2_uint((unsigned int) cl_size); - for(int i = 0; i < N; i++) { - const int first_cl = (idx[i] * snbytes * sizeof(double)) >> cl_shift; - const int last_cl = ((idx[i] * snbytes + gathered_dims - 1) * sizeof(double)) >> cl_shift; - if(first_cl != last_cl) { - cut_cl++; - } - } -#endif - - S = getTimeStamp(); - for(int r = 0; r < 100; ++r) { - GATHER(a, idx, N, t, cycles); - } - E = getTimeStamp(); - -#ifdef MEASURE_GATHER_CYCLES - for(int i = 0; i < N_cycles_alloc; i++) { - cycles[i * 3 + 0] = 0; - cycles[i * 3 + 1] = 0; - cycles[i * 3 + 2] = 0; - } -#endif - - rep = 100 * (0.5 / (E - S)); - S = getTimeStamp(); - LIKWID_MARKER_START("gather"); - for(int r = 0; r < rep; ++r) { - GATHER(a, idx, N, t, cycles); - } - LIKWID_MARKER_STOP("gather"); - E = getTimeStamp(); - - time = E - S; - -#ifdef TEST - int test_failed = 0; - for(int i = 0; i < N; ++i) { - for(int d = 0; d < dims; ++d) { -#ifdef AOS - if(t[d * N + i] != ((i * stride) % N) * dims + d) { -#else - if(t[d * N + i] != d * N + ((i * stride) % N)) { -#endif - test_failed = 1; - break; - } - } - } - - if(test_failed) { - printf("Test failed!\n"); - return EXIT_FAILURE; - } else { - printf("Test passed!\n"); - } -#endif - - const double size = N * (dims * sizeof(double) + sizeof(int)) / 1000.0; - printf("%14d,%14.2f,%14d,", N, size, cut_cl); - -#ifndef MEASURE_GATHER_CYCLES - const double time_per_it = time * 1e6 / ((double) N * rep); - const double cy_per_it = time * freq * _VL_ / ((double) N * rep); - const double cy_per_gather = time * freq * _VL_ / ((double) N * rep * gathered_dims); - const double cy_per_elem = time * freq / ((double) N * rep * gathered_dims); - printf("%14.10f,%14.10f,%14.6f,%14.6f,%14.6f", time, time_per_it, cy_per_it, cy_per_gather, cy_per_elem); -#else - double cy_min[dims]; - double cy_max[dims]; - double cy_avg[dims]; - - for(int d = 0; d < dims; d++) { - cy_min[d] = 100000.0; - cy_max[d] = 0.0; - cy_avg[d] = 0.0; - } - - for(int i = 0; i < N_gathers_per_dim; ++i) { - for(int d = 0; d < gathered_dims; d++) { - const double cy_d = (double)(cycles[i * 3 + d]); - cy_min[d] = MIN(cy_min[d], cy_d); - cy_max[d] = MAX(cy_max[d], cy_d); - cy_avg[d] += cy_d; - } - } - - for(int d = 0; d < gathered_dims; d++) { - char tmp_str[64]; - cy_avg[d] /= (double) N_gathers_per_dim; - snprintf(tmp_str, sizeof tmp_str, "%4.4f/%4.4f/%4.4f", cy_min[d], cy_max[d], cy_avg[d]); - printf("%27s%c", tmp_str, (d < gathered_dims - 1) ? ',' : ' '); - } -#endif - - printf("\n"); - free(a); - free(idx); - -#ifdef TEST - free(t); -#endif - -#ifdef MEASURE_GATHER_CYCLES - free(cycles); -#endif - - MEM_TRACER_END; - } - - LIKWID_MARKER_CLOSE; - return EXIT_SUCCESS; -} diff --git a/util/gather-bench/src/main.c b/util/gather-bench/src/main.c deleted file mode 100644 index a0b9972..0000000 --- a/util/gather-bench/src/main.c +++ /dev/null @@ -1,166 +0,0 @@ -/* - * ======================================================================================= - * - * Author: Jan Eitzinger (je), jan.eitzinger@fau.de - * Copyright (c) RRZE, University Erlangen-Nuremberg - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * ======================================================================================= - */ -#include -#include -#include -#include -#include -//--- -#include -//--- -#include -#include - -#if !defined(ISA_avx2) && !defined (ISA_avx512) -#error "Invalid ISA macro, possible values are: avx2 and avx512" -#endif - -#define HLINE "----------------------------------------------------------------------------\n" - -#ifndef MIN -#define MIN(x,y) ((x)<(y)?(x):(y)) -#endif -#ifndef MAX -#define MAX(x,y) ((x)>(y)?(x):(y)) -#endif -#ifndef ABS -#define ABS(a) ((a) >= 0 ? (a) : -(a)) -#endif - -#define ARRAY_ALIGNMENT 64 -#define SIZE 20000 - -#ifdef ISA_avx512 -#define _VL_ 8 -#define ISA_STRING "avx512" -#else -#define _VL_ 4 -#define ISA_STRING "avx2" -#endif - -#ifdef TEST -extern void gather(double*, int*, int, double*); -#else -extern void gather(double*, int*, int); -#endif - -int main (int argc, char** argv) { - LIKWID_MARKER_INIT; - LIKWID_MARKER_REGISTER("gather"); - - if (argc < 3) { - printf("Please provide stride and frequency\n"); - printf("%s [cache line size (B)]\n", argv[0]); - return -1; - } - - int stride = atoi(argv[1]); - double freq = atof(argv[2]); - int cl_size = (argc == 3) ? 64 : atoi(argv[3]); - size_t bytesPerWord = sizeof(double); - size_t cacheLinesPerGather = MIN(MAX(stride * _VL_ / (cl_size / sizeof(double)), 1), _VL_); - size_t N = SIZE; - double E, S; - - printf("ISA,Stride (elems),Frequency (GHz),Cache Line Size (B),Vector Width (elems),Cache Lines/Gather\n"); - printf("%s,%d,%f,%d,%d,%lu\n\n", ISA_STRING, stride, freq, cl_size, _VL_, cacheLinesPerGather); - printf("%14s,%14s,%14s,%14s,%14s,%14s\n", "N", "Size(kB)", "tot. time", "time/LUP(ms)", "cy/gather", "cy/elem"); - - freq = freq * 1e9; - for(int N = 1024; N < 400000; N = 1.5 * N) { - int N_alloc = N * 2; - double* a = (double*) allocate( ARRAY_ALIGNMENT, N_alloc * sizeof(double) ); - int* idx = (int*) allocate( ARRAY_ALIGNMENT, N_alloc * sizeof(int) ); - int rep; - double time; - -#ifdef TEST - double* t = (double*) allocate( ARRAY_ALIGNMENT, N_alloc * sizeof(double) ); -#endif - - for(int i = 0; i < N_alloc; ++i) { - a[i] = i; - idx[i] = (i * stride) % N; - } - - S = getTimeStamp(); - for(int r = 0; r < 100; ++r) { -#ifdef TEST - gather(a, idx, N, t); -#else - gather(a, idx, N); -#endif - } - E = getTimeStamp(); - - rep = 100 * (0.5 / (E - S)); - S = getTimeStamp(); - LIKWID_MARKER_START("gather"); - for(int r = 0; r < rep; ++r) { -#ifdef TEST - gather(a, idx, N, t); -#else - gather(a, idx, N); -#endif - } - LIKWID_MARKER_STOP("gather"); - E = getTimeStamp(); - - time = E - S; - -#ifdef TEST - int test_failed = 0; - for(int i = 0; i < N; ++i) { - if(t[i] != i * stride % N) { - test_failed = 1; - break; - } - } - - if(test_failed) { - printf("Test failed!\n"); - return EXIT_FAILURE; - } else { - printf("Test passed!\n"); - } -#endif - - const double size = N * (sizeof(double) + sizeof(int)) / 1000.0; - const double time_per_it = time * 1e6 / ((double) N * rep); - const double cy_per_gather = time * freq * _VL_ / ((double) N * rep); - const double cy_per_elem = time * freq / ((double) N * rep); - printf("%14d,%14.2f,%14.10f,%14.10f,%14.6f,%14.6f\n", N, size, time, time_per_it, cy_per_gather, cy_per_elem); - free(a); - free(idx); -#ifdef TEST - free(t); -#endif - } - - LIKWID_MARKER_CLOSE; - return EXIT_SUCCESS; -} diff --git a/util/gather-bench/src/timing.c b/util/gather-bench/src/timing.c deleted file mode 100644 index aad74cb..0000000 --- a/util/gather-bench/src/timing.c +++ /dev/null @@ -1,47 +0,0 @@ -/* - * ======================================================================================= - * - * Author: Jan Eitzinger (je), jan.eitzinger@fau.de - * Copyright (c) 2020 RRZE, University Erlangen-Nuremberg - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * ======================================================================================= - */ -#include -#include - -double getTimeStamp() -{ - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - return (double)ts.tv_sec + (double)ts.tv_nsec * 1.e-9; -} - -double getTimeResolution() -{ - struct timespec ts; - clock_getres(CLOCK_MONOTONIC, &ts); - return (double)ts.tv_sec + (double)ts.tv_nsec * 1.e-9; -} - -double getTimeStamp_() -{ - return getTimeStamp(); -} diff --git a/util/mdBench.c b/util/mdBench.c deleted file mode 100644 index 3bea419..0000000 --- a/util/mdBench.c +++ /dev/null @@ -1,979 +0,0 @@ -/* - * Copyright (C) NHR@FAU, University Erlangen-Nuremberg. - * All rights reserved. This file is part of MD-Bench. - * Use of this source code is governed by a LGPL-3.0 - * license that can be found in the LICENSE file. - */ -#include -#include -#include -#include -#include -#include -#include - -#define HLINE "----------------------------------------------------------------------------\n" - -#define FACTOR 0.999 -#define SMALL 1.0e-6 -#define DELTA 20000 - -#ifndef MIN -#define MIN(x,y) ((x)<(y)?(x):(y)) -#endif -#ifndef MAX -#define MAX(x,y) ((x)>(y)?(x):(y)) -#endif -#ifndef ABS -#define ABS(a) ((a) >= 0 ? (a) : -(a)) -#endif - -static int Natoms, Nlocal, Nghost, Nmax; -static double Cutneigh; // neighbor cutoff -static double xprd, yprd, zprd; -static double xlo, xhi; -static double ylo, yhi; -static double zlo, zhi; -static double *x, *y, *z; -static double *vx, *vy, *vz; -static double *fx, *fy, *fz; - -static int NmaxGhost; -static int *BorderMap; -static int *PBCx, *PBCy, *PBCz; - -typedef struct { - int* numneigh; - int* neighbors; - int maxneighs; - int nbinx, nbiny, nbinz; - /* double cutneigh; // neighbor cutoff */ - double cutneighsq; // neighbor cutoff squared - int every; - int ncalls; - int max_totalneigh; - int *bincount; - int *bins; - int nmax; - int nstencil; // # of bins in stencil - int* stencil; // stencil list of bin offsets - int mbins; //total number of bins - int atoms_per_bin; // max atoms per bin - int mbinx, mbiny, mbinz; // n bins in x, y, z - int mbinxlo, mbinylo, mbinzlo; - double binsizex, binsizey, binsizez; - double bininvx, bininvy, bininvz; -} Neighbor; - -typedef struct { - double epsilon; - double sigma6; - double temp; - double rho; - double mass; - int ntimes; - int nstat; - double dt; - double dtforce; - double cutforce; - int nx, ny, nz; -} Parameter; - -typedef struct { - int *steparr; - double *tmparr; - double *engarr; - double *prsarr; - double mvv2e; - int dof_boltz; - double t_scale; - double p_scale; - double e_scale; - double t_act; - double p_act; - double e_act; - int mstat; -} Thermo; - -/* Park/Miller RNG w/out MASKING, so as to be like f90s version */ -#define IA 16807 -#define IM 2147483647 -#define AM (1.0/IM) -#define IQ 127773 -#define IR 2836 -#define MASK 123459876 - -double myrandom(int* idum) -{ - int k= (*idum) / IQ; - double ans; - - *idum = IA * (*idum - k * IQ) - IR * k; - if(*idum < 0) *idum += IM; - ans = AM * (*idum); - return ans; -} - -int coord2bin(Neighbor* neighbor, double xin, double yin, double zin) -{ - int ix, iy, iz; - double bininvx = neighbor->bininvx; - double bininvy = neighbor->bininvy; - double bininvz = neighbor->bininvz; - int mbinxlo = neighbor->mbinxlo; - int mbinylo = neighbor->mbinylo; - int mbinzlo = neighbor->mbinzlo; - - if(xin >= xprd) { - ix = (int)((xin - xprd) * bininvx) + neighbor->nbinx - mbinxlo; - } else if(xin >= 0.0) { - ix = (int)(xin * bininvx) - mbinxlo; - } else { - ix = (int)(xin * bininvx) - mbinxlo - 1; - } - - if(yin >= yprd) { - iy = (int)((yin - yprd) * bininvy) + neighbor->nbiny - mbinylo; - } else if(yin >= 0.0) { - iy = (int)(yin * bininvy) - mbinylo; - } else { - iy = (int)(yin * bininvy) - mbinylo - 1; - } - - if(zin >= zprd) { - iz = (int)((zin - zprd) * bininvz) + neighbor->nbinz - mbinzlo; - } else if(zin >= 0.0) { - iz = (int)(zin * bininvz) - mbinzlo; - } else { - iz = (int)(zin * bininvz) - mbinzlo - 1; - } - - return (iz * neighbor->mbiny * neighbor->mbinx + iy * neighbor->mbinx + ix + 1); -} - -void binatoms(Neighbor *neighbor) -{ - int* bincount = neighbor->bincount; - int mbins = neighbor->mbins; - int nall = Nlocal + Nghost; - int resize = 1; - - while(resize > 0) { - resize = 0; - - for(int i = 0; i < mbins; i++) { - bincount[i] = 0; - } - - for(int i = 0; i < nall; i++) { - int ibin = coord2bin(neighbor, x[i], y[i], z[i]); - - if(bincount[ibin] < neighbor->atoms_per_bin) { - int ac = neighbor->bincount[ibin]++; - neighbor->bins[ibin * neighbor->atoms_per_bin + ac] = i; - } else { - resize = 1; - } - } - - if(resize) { - free(neighbor->bins); - neighbor->atoms_per_bin *= 2; - neighbor->bins = (int*) malloc(mbins * neighbor->atoms_per_bin * sizeof(int)); - } - } -} - -double bindist(Neighbor *neighbor, int i, int j, int k) -{ - double delx, dely, delz; - - if(i > 0) { - delx = (i - 1) * neighbor->binsizex; - } else if(i == 0) { - delx = 0.0; - } else { - delx = (i + 1) * neighbor->binsizex; - } - - if(j > 0) { - dely = (j - 1) * neighbor->binsizey; - } else if(j == 0) { - dely = 0.0; - } else { - dely = (j + 1) * neighbor->binsizey; - } - - if(k > 0) { - delz = (k - 1) * neighbor->binsizez; - } else if(k == 0) { - delz = 0.0; - } else { - delz = (k + 1) * neighbor->binsizez; - } - - return (delx * delx + dely * dely + delz * delz); -} - -void buildNeighborlist(Neighbor *neighbor) -{ - neighbor->ncalls++; - int nall = Nlocal + Nghost; - - /* extend atom arrays if necessary */ - if(nall > neighbor->nmax) { - neighbor->nmax = nall; - if(neighbor->numneigh) free(neighbor->numneigh); - if(neighbor->neighbors) free(neighbor->neighbors); - neighbor->numneigh = (int*) malloc(neighbor->nmax * sizeof(int)); - neighbor->neighbors = (int*) malloc(neighbor->nmax * neighbor->maxneighs * sizeof(int*)); - } - - /* bin local & ghost atoms */ - binatoms(neighbor); - int resize = 1; - - /* loop over each atom, storing neighbors */ - while(resize) { - int new_maxneighs = neighbor->maxneighs; - resize = 0; - - for(int i = 0; i < Nlocal; i++) { - int* neighptr = &neighbor->neighbors[i * neighbor->maxneighs]; - int n = 0; - double xtmp = x[i]; - double ytmp = y[i]; - double ztmp = z[i]; - int ibin = coord2bin(neighbor, xtmp, ytmp, ztmp); - - for(int k = 0; k < neighbor->nstencil; k++) { - int jbin = ibin + neighbor->stencil[k]; - int* loc_bin = &neighbor->bins[jbin * neighbor->atoms_per_bin]; - - for(int m = 0; m < neighbor->bincount[jbin]; m++) { - int j = loc_bin[m]; - - if ( j == i ){ - continue; - } - - double delx = xtmp - x[j]; - double dely = ytmp - y[j]; - double delz = ztmp - z[j]; - double rsq = delx * delx + dely * dely + delz * delz; - - if( rsq <= neighbor->cutneighsq ) { - neighptr[n++] = j; - } - } - } - - neighbor->numneigh[i] = n; - - if(n >= neighbor->maxneighs) { - resize = 1; - - if(n >= new_maxneighs) { - new_maxneighs = n; - } - } - } - - if(resize) { - neighbor->maxneighs = new_maxneighs * 1.2; - free(neighbor->neighbors); - neighbor->neighbors = (int*) malloc(Nmax* neighbor->maxneighs * sizeof(int)); - } - } -} - -void init(Neighbor *neighbor, Parameter *param) -{ - x = NULL; y = NULL; z = NULL; - vx = NULL; vy = NULL; vz = NULL; - fx = NULL; fy = NULL; fz = NULL; - - NmaxGhost = 0; - BorderMap = NULL; - PBCx = NULL; PBCy = NULL; PBCz = NULL; - - param->epsilon = 1.0; - param->sigma6 = 1.0; - param->rho = 0.8442; - param->ntimes = 200; - param->dt = 0.005; - param->nx = 32; - param->ny = 32; - param->nz = 64; - param->cutforce = 2.5; - param->temp = 1.44; - param->nstat = 100; - param->mass = 1.0; - param->dtforce = 0.5 * param->dt; - - Cutneigh = param->cutforce + 0.30; - double neighscale = 5.0 / 6.0; - neighbor->nbinx = neighscale * param->nx; - neighbor->nbiny = neighscale * param->ny; - neighbor->nbinz = neighscale * param->nz; - neighbor->every = 20; - neighbor->ncalls = 0; - neighbor->nmax = 0; - neighbor->atoms_per_bin = 8; - neighbor->maxneighs = 100; - /* neighbor->cutneigh = param->cutforce + 0.30; */ - neighbor->numneigh = NULL; - neighbor->neighbors = NULL; - neighbor->stencil = NULL; - neighbor->bins = NULL; - neighbor->bincount = NULL; -} - -void setup(Neighbor *neighbor, Parameter *param) -{ - double lattice = pow((4.0 / param->rho), (1.0 / 3.0)); - double coord; - int mbinxhi, mbinyhi, mbinzhi; - int nextx, nexty, nextz; - - xprd = param->nx * lattice; - yprd = param->ny * lattice; - zprd = param->nz * lattice; - - xlo = 0.0; xhi = xprd; - ylo = 0.0; yhi = yprd; - zlo = 0.0; zhi = zprd; - - neighbor->cutneighsq = Cutneigh * Cutneigh; - neighbor->binsizex = xprd / neighbor->nbinx; - neighbor->binsizey = yprd / neighbor->nbiny; - neighbor->binsizez = zprd / neighbor->nbinz; - - neighbor->bininvx = 1.0 / neighbor->binsizex; - neighbor->bininvy = 1.0 / neighbor->binsizey; - neighbor->bininvz = 1.0 / neighbor->binsizez; - - coord = xlo - Cutneigh - SMALL * xprd; - neighbor->mbinxlo = (int) (coord * neighbor->bininvx); - if (coord < 0.0) { - neighbor->mbinxlo = neighbor->mbinxlo - 1; - } - coord = xhi + Cutneigh + SMALL * xprd; - mbinxhi = (int) (coord * neighbor->bininvx); - - coord = ylo - Cutneigh - SMALL * yprd; - neighbor->mbinylo = (int) (coord * neighbor->bininvy); - if (coord < 0.0) { - neighbor->mbinylo = neighbor->mbinylo - 1; - } - coord = yhi + Cutneigh + SMALL * yprd; - mbinyhi = (int) (coord * neighbor->bininvy); - - coord = zlo - Cutneigh - SMALL * zprd; - neighbor->mbinzlo = (int) (coord * neighbor->bininvz); - if (coord < 0.0) { - neighbor->mbinzlo = neighbor->mbinzlo - 1; - } - coord = zhi + Cutneigh + SMALL * zprd; - mbinzhi = (int) (coord * neighbor->bininvz); - - neighbor->mbinxlo = neighbor->mbinxlo - 1; - mbinxhi = mbinxhi + 1; - neighbor->mbinx = mbinxhi - neighbor->mbinxlo + 1; - - neighbor->mbinylo = neighbor->mbinylo - 1; - mbinyhi = mbinyhi + 1; - neighbor->mbiny = mbinyhi - neighbor->mbinylo + 1; - - neighbor->mbinzlo = neighbor->mbinzlo - 1; - mbinzhi = mbinzhi + 1; - neighbor->mbinz = mbinzhi - neighbor->mbinzlo + 1; - - nextx = (int) (Cutneigh * neighbor->bininvx); - if(nextx * neighbor->binsizex < FACTOR * Cutneigh) nextx++; - - nexty = (int) (Cutneigh * neighbor->bininvy); - if(nexty * neighbor->binsizey < FACTOR * Cutneigh) nexty++; - - nextz = (int) (Cutneigh * neighbor->bininvz); - if(nextz * neighbor->binsizez < FACTOR * Cutneigh) nextz++; - - if (neighbor->stencil) { - free(neighbor->stencil); - } - - neighbor->stencil = (int*) malloc( - (2 * nextz + 1) * (2 * nexty + 1) * (2 * nextx + 1) * sizeof(int)); - - neighbor->nstencil = 0; - int kstart = -nextz; - - for(int k = kstart; k <= nextz; k++) { - for(int j = -nexty; j <= nexty; j++) { - for(int i = -nextx; i <= nextx; i++) { - if(bindist(neighbor, i, j, k) < neighbor->cutneighsq) { - neighbor->stencil[neighbor->nstencil++] = - k * neighbor->mbiny * neighbor->mbinx + j * neighbor->mbinx + i; - } - } - } - } - - neighbor->mbins = neighbor->mbinx * neighbor->mbiny * neighbor->mbinz; - - if (neighbor->bincount) { - free(neighbor->bincount); - } - neighbor->bincount = (int*) malloc(neighbor->mbins * sizeof(int)); - - if (neighbor->bins) { - free(neighbor->bins); - } - neighbor->bins = (int*) malloc(neighbor->mbins * neighbor->atoms_per_bin * sizeof(int)); -} - -double* myrealloc(double *ptr, int n, int nold) { - - double* newarray; - newarray = (double*) malloc(n * sizeof(double)); - - if(nold) { - memcpy(newarray, ptr, nold * sizeof(double)); - } - if(ptr) { - free(ptr); - } - - return newarray; -} - -int* myreallocInt(int *ptr, int n, int nold) { - - int* newarray; - - newarray = (int*) malloc(n * sizeof(int)); - - if(nold) { - memcpy(newarray, ptr, nold * sizeof(int)); - } - if(ptr) { - free(ptr); - } - - return newarray; -} - -void growBoundary() -{ - int nold = NmaxGhost; - NmaxGhost += DELTA; - - BorderMap = myreallocInt(BorderMap, NmaxGhost, nold); - PBCx = myreallocInt(PBCx, NmaxGhost, nold); - PBCy = myreallocInt(PBCy, NmaxGhost, nold); - PBCz = myreallocInt(PBCz, NmaxGhost, nold); - - if(BorderMap == NULL || PBCx == NULL || PBCy == NULL || PBCz == NULL ) { - printf("ERROR: No memory for Boundary\n"); - } -} - -void growarray() -{ - int nold = Nmax; - Nmax += DELTA; - - x = myrealloc(x, Nmax, nold); y = myrealloc(y, Nmax, nold); z = myrealloc(z, Nmax, nold); - vx = myrealloc(vx, Nmax, nold); vy = myrealloc(vy, Nmax, nold); vz = myrealloc(vz, Nmax, nold); - fx = myrealloc(fx, Nmax, nold); fy = myrealloc(fy, Nmax, nold); fz = myrealloc(fz, Nmax, nold); - - if(x == NULL || y == NULL || z == NULL || - vx == NULL || vy == NULL || vz == NULL || - fx == NULL || fy == NULL || fz == NULL ) { - printf("ERROR: No memory for atoms\n"); - } -} - -void updateBorders() -{ - for(int i = 0; i < Nghost; i++) { - x[Nlocal + i] = x[BorderMap[i]] + PBCx[i] * xprd; - y[Nlocal + i] = y[BorderMap[i]] + PBCy[i] * yprd; - z[Nlocal + i] = z[BorderMap[i]] + PBCz[i] * zprd; - } -} - -void updateAtomLocations() -{ - for(int i = 0; i < Nlocal; i++) { - - if(x[i] < 0.0) { - x[i] += xprd; - } else if(x[i] >= xprd) { - x[i] -= xprd; - } - - if(y[i] < 0.0) { - y[i] += yprd; - } else if(y[i] >= yprd) { - y[i] -= yprd; - } - - if(z[i] < 0.0) { - z[i] += zprd; - } else if(z[i] >= zprd) { - z[i] -= zprd; - } - } -} - -void setupBordersNew() -{ - int lastidx = 0; - int nghostprev = 0; - Nghost = 0; - - for (int i = 0; i < Nlocal; i++) { - - if (Nlocal + Nghost + 1 >= Nmax) { - growarray(); - } - - if (x[i] < Cutneigh) { - Nghost++; - x[i+lastidx] = x[i] + xprd; - y[i+lastidx] = y[i]; - z[i+lastidx] = z[i]; - lastidx++; - } else if (x[i] >= xprd - Cutneigh) { - Nghost++; - x[i+lastidx] = x[i] - xprd; - y[i+lastidx] = y[i]; - z[i+lastidx] = z[i]; - lastidx++; - } - } - - nghostprev = Nghost+1; - - for (int i = 0; i < Nlocal + nghostprev ; i++) { - - if (Nlocal + Nghost + 1 >= Nmax) { - growarray(); - } - - if (y[i] < Cutneigh) { - Nghost++; - x[i+lastidx] = x[i]; - y[i+lastidx] = y[i] + yprd; - z[i+lastidx] = z[i]; - lastidx++; - } else if (y[i] >= yprd - Cutneigh) { - Nghost++; - x[i+lastidx] = x[i]; - y[i+lastidx] = y[i] - yprd; - z[i+lastidx] = z[i]; - lastidx++; - } - } - - nghostprev = Nghost+1; - - for (int i = 0; i < Nlocal + nghostprev; i++) { - - if (Nlocal + Nghost + 1 >= Nmax) { - growarray(); - } - - if (z[i] < Cutneigh) { - Nghost++; - x[i+lastidx] = x[i]; - y[i+lastidx] = y[i]; - z[i+lastidx] = z[i] + zprd; - lastidx++; - } else if(z[i] >= zprd - Cutneigh) { - Nghost++; - x[i+lastidx] = x[i]; - y[i+lastidx] = y[i]; - z[i+lastidx] = z[i] - zprd; - lastidx++; - } - } - - Nghost++; -} - -#define ADDGHOST(dx,dy,dz) Nghost++; BorderMap[Nghost] = i; PBCx[Nghost] = dx; PBCy[Nghost] = dy; PBCz[Nghost] = dz; -void setupBorders() -{ - Nghost = -1; - - for(int i = 0; i < Nlocal; i++) { - - if (Nlocal + Nghost + 7 >= Nmax) { - growarray(); - } - if (Nghost + 7 >= NmaxGhost) { - growBoundary(); - } - - /* Setup ghost atoms */ - /* 6 planes */ - if (x[i] < Cutneigh) { ADDGHOST(+1,0,0); } - if (x[i] >= (xprd-Cutneigh)) { ADDGHOST(-1,0,0); } - if (y[i] < Cutneigh) { ADDGHOST(0,+1,0); } - if (y[i] >= (yprd-Cutneigh)) { ADDGHOST(0,-1,0); } - if (z[i] < Cutneigh) { ADDGHOST(0,0,+1); } - if (z[i] >= (zprd-Cutneigh)) { ADDGHOST(0,0,-1); } - /* 8 corners */ - if (x[i] < Cutneigh && y[i] < Cutneigh && z[i] < Cutneigh) { ADDGHOST(+1,+1,+1); } - if (x[i] < Cutneigh && y[i] >= (yprd-Cutneigh) && z[i] < Cutneigh) { ADDGHOST(+1,-1,+1); } - if (x[i] < Cutneigh && y[i] >= Cutneigh && z[i] >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); } - if (x[i] < Cutneigh && y[i] >= (yprd-Cutneigh) && z[i] >= (zprd-Cutneigh)) { ADDGHOST(+1,-1,-1); } - if (x[i] >= (xprd-Cutneigh) && y[i] < Cutneigh && z[i] < Cutneigh) { ADDGHOST(-1,+1,+1); } - if (x[i] >= (xprd-Cutneigh) && y[i] >= (yprd-Cutneigh) && z[i] < Cutneigh) { ADDGHOST(-1,-1,+1); } - if (x[i] >= (xprd-Cutneigh) && y[i] < Cutneigh && z[i] >= (zprd-Cutneigh)) { ADDGHOST(-1,+1,-1); } - if (x[i] >= (xprd-Cutneigh) && y[i] >= (yprd-Cutneigh) && z[i] >= (zprd-Cutneigh)) { ADDGHOST(-1,-1,-1); } - /* 12 edges */ - if (x[i] < Cutneigh && z[i] < Cutneigh) { ADDGHOST(+1,0,+1); } - if (x[i] < Cutneigh && z[i] >= (zprd-Cutneigh)) { ADDGHOST(+1,0,-1); } - if (x[i] >= (xprd-Cutneigh) && z[i] < Cutneigh) { ADDGHOST(-1,0,+1); } - if (x[i] >= (xprd-Cutneigh) && z[i] >= (zprd-Cutneigh)) { ADDGHOST(-1,0,-1); } - if (y[i] < Cutneigh && z[i] < Cutneigh) { ADDGHOST(0,+1,+1); } - if (y[i] < Cutneigh && z[i] >= (zprd-Cutneigh)) { ADDGHOST(0,+1,-1); } - if (y[i] >= (yprd-Cutneigh) && z[i] < Cutneigh) { ADDGHOST(0,-1,+1); } - if (y[i] >= (yprd-Cutneigh) && z[i] >= (zprd-Cutneigh)) { ADDGHOST(0,-1,-1); } - if (y[i] < Cutneigh && x[i] < Cutneigh) { ADDGHOST(+1,+1,0); } - if (y[i] < Cutneigh && x[i] >= (xprd-Cutneigh)) { ADDGHOST(-1,+1,0); } - if (y[i] >= (yprd-Cutneigh) && x[i] < Cutneigh) { ADDGHOST(+1,-1,0); } - if (y[i] >= (yprd-Cutneigh) && x[i] >= (xprd-Cutneigh)) { ADDGHOST(-1,-1,0); } - } - // increase by one to make it the ghost atom count - Nghost++; -} - -void sortAtoms(Neighbor *neighbor) -{ - binatoms(neighbor); - int* binpos = neighbor->bincount; - int* bins = neighbor->bins; - - int mbins = neighbor->mbins; - int atoms_per_bin = neighbor->atoms_per_bin; - - for(int i=1; i0?binpos[mybin-1]:0; - int count = binpos[mybin] - start; - - for(int k=0; knx * param->ny * param->nz; - Nlocal = 0; - double alat = pow((4.0 / param->rho), (1.0 / 3.0)); - int ilo = (int) (xlo / (0.5 * alat) - 1); - int ihi = (int) (xhi / (0.5 * alat) + 1); - int jlo = (int) (ylo / (0.5 * alat) - 1); - int jhi = (int) (yhi / (0.5 * alat) + 1); - int klo = (int) (zlo / (0.5 * alat) - 1); - int khi = (int) (zhi / (0.5 * alat) + 1); - - ilo = MAX(ilo, 0); - ihi = MIN(ihi, 2 * param->nx - 1); - jlo = MAX(jlo, 0); - jhi = MIN(jhi, 2 * param->ny - 1); - klo = MAX(klo, 0); - khi = MIN(khi, 2 * param->nz - 1); - - double xtmp, ytmp, ztmp, vxtmp, vytmp, vztmp; - int i, j, k, m, n; - int sx = 0; int sy = 0; int sz = 0; - int ox = 0; int oy = 0; int oz = 0; - int subboxdim = 8; - - while(oz * subboxdim <= khi) { - - k = oz * subboxdim + sz; - j = oy * subboxdim + sy; - i = ox * subboxdim + sx; - - if(((i + j + k) % 2 == 0) && - (i >= ilo) && (i <= ihi) && - (j >= jlo) && (j <= jhi) && - (k >= klo) && (k <= khi)) { - - xtmp = 0.5 * alat * i; - ytmp = 0.5 * alat * j; - ztmp = 0.5 * alat * k; - - if( xtmp >= xlo && xtmp < xhi && - ytmp >= ylo && ytmp < yhi && - ztmp >= zlo && ztmp < zhi ) { - - n = k * (2 * param->ny) * (2 * param->nx) + - j * (2 * param->nx) + - i + 1; - - for(m = 0; m < 5; m++) { - myrandom(&n); - } - vxtmp = myrandom(&n); - - for(m = 0; m < 5; m++){ - myrandom(&n); - } - vytmp = myrandom(&n); - - for(m = 0; m < 5; m++) { - myrandom(&n); - } - vztmp = myrandom(&n); - - if(Nlocal == Nmax) { - growarray(); - } - - x[Nlocal] = xtmp; y[Nlocal] = ytmp; z[Nlocal] = ztmp; - vx[Nlocal] = vxtmp; vy[Nlocal] = vytmp; vz[Nlocal] = vztmp; - Nlocal++; - } - } - - sx++; - - if(sx == subboxdim) { sx = 0; sy++; } - if(sy == subboxdim) { sy = 0; sz++; } - if(sz == subboxdim) { sz = 0; ox++; } - if(ox * subboxdim > ihi) { ox = 0; oy++; } - if(oy * subboxdim > jhi) { oy = 0; oz++; } - } -} - -void adjustVelocity(Parameter *param, Thermo *thermo) -{ - /* zero center-of-mass motion */ - double vxtot = 0.0; - double vytot = 0.0; - double vztot = 0.0; - - for(int i = 0; i < Nlocal; i++) { - vxtot += vx[i]; - vytot += vy[i]; - vztot += vz[i]; - } - - vxtot = vxtot / Natoms; - vytot = vytot / Natoms; - vztot = vztot / Natoms; - - for(int i = 0; i < Nlocal; i++) { - vx[i] -= vxtot; - vy[i] -= vytot; - vz[i] -= vztot; - } - - thermo->t_act = 0; - double t = 0.0; - - for(int i = 0; i < Nlocal; i++) { - t += (vx[i] * vx[i] + vy[i] * vy[i] + vz[i] * vz[i]) * param->mass; - } - - t *= thermo->t_scale; - double factor = sqrt(param->temp / t); - - for(int i = 0; i < Nlocal; i++) { - vx[i] *= factor; - vy[i] *= factor; - vz[i] *= factor; - } -} - -void thermoSetup(Parameter *param, Thermo *thermo) -{ - int maxstat = param->ntimes / param->nstat + 2; - - thermo->steparr = (int*) malloc(maxstat * sizeof(int)); - thermo->tmparr = (double*) malloc(maxstat * sizeof(double)); - thermo->engarr = (double*) malloc(maxstat * sizeof(double)); - thermo->prsarr = (double*) malloc(maxstat * sizeof(double)); - - thermo->mvv2e = 1.0; - thermo->dof_boltz = (Natoms * 3 - 3); - thermo->t_scale = thermo->mvv2e / thermo->dof_boltz; - thermo->p_scale = 1.0 / 3 / xprd / yprd / zprd; - thermo->e_scale = 0.5; - - printf("step\ttemp\t\tpressure\n"); -} - - -void thermoCompute(int iflag, Parameter *param, Thermo *thermo) -{ - double t = 0.0, p; - - for(int i = 0; i < Nlocal; i++) { - t += (vx[i] * vx[i] + vy[i] * vy[i] + vz[i] * vz[i]) * param->mass; - } - - t = t * thermo->t_scale; - p = (t * thermo->dof_boltz) * thermo->p_scale; - - int istep = iflag; - - if(iflag == -1){ - istep = param->ntimes; - } - if(iflag == 0){ - thermo->mstat = 0; - } - - thermo->steparr[thermo->mstat] = istep; - thermo->tmparr[thermo->mstat] = t; - thermo->prsarr[thermo->mstat] = p; - thermo->mstat++; - fprintf(stdout, "%i\t%e\t%e\n", istep, t, p); -} - -void initialIntegrate(Parameter *param) -{ - for(int i = 0; i < Nlocal; i++) { - vx[i] += param->dtforce * fx[i]; - vy[i] += param->dtforce * fy[i]; - vz[i] += param->dtforce * fz[i]; - x[i] += param->dt * vx[i]; - y[i] += param->dt * vy[i]; - z[i] += param->dt * vz[i]; - } -} - -void finalIntegrate(Parameter *param) -{ - for(int i = 0; i < Nlocal; i++) { - vx[i] += param->dtforce * fx[i]; - vy[i] += param->dtforce * fy[i]; - vz[i] += param->dtforce * fz[i]; - } -} - -void computeForce(Neighbor *neighbor, Parameter *param) -{ - int* neighs; - double cutforcesq = param->cutforce * param->cutforce; - double sigma6 = param->sigma6; - double epsilon = param->epsilon; - - for(int i = 0; i < Nlocal; i++) { - fx[i] = 0.0; - fy[i] = 0.0; - fz[i] = 0.0; - } - - for(int i = 0; i < Nlocal; i++) { - neighs = &neighbor->neighbors[i * neighbor->maxneighs]; - int numneighs = neighbor->numneigh[i]; - double xtmp = x[i]; - double ytmp = y[i]; - double ztmp = z[i]; - - double fix = 0; - double fiy = 0; - double fiz = 0; - - for(int k = 0; k < numneighs; k++) { - int j = neighs[k]; - double delx = xtmp - x[j]; - double dely = ytmp - y[j]; - double delz = ztmp - z[j]; - double rsq = delx * delx + dely * dely + delz * delz; - - if(rsq < cutforcesq) { - double sr2 = 1.0 / rsq; - double sr6 = sr2 * sr2 * sr2 * sigma6; - double force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon; - fix += delx * force; - fiy += dely * force; - fiz += delz * force; - } - } - - fx[i] += fix; - fy[i] += fiy; - fz[i] += fiz; - } -} - -int main (int argc, char** argv) -{ - Neighbor neighbor; - Parameter param; - Thermo thermo; - - init(&neighbor, ¶m); - setup(&neighbor, ¶m); - create_atoms(¶m); - thermoSetup(¶m, &thermo); - adjustVelocity(¶m, &thermo); - setupBorders(); - updateBorders(); - buildNeighborlist(&neighbor); - thermoCompute(0, ¶m, &thermo); - computeForce(&neighbor, ¶m); - - for(int n = 0; n < param.ntimes; n++) { - - initialIntegrate(¶m); - - if((n + 1) % neighbor.every) { - updateBorders(); - } else { - updateAtomLocations(); - setupBorders(); - updateBorders(); - /* sortAtoms(&neighbor); */ - buildNeighborlist(&neighbor); - } - - computeForce(&neighbor, ¶m); - finalIntegrate(¶m); - - if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) { - thermoCompute(n + 1, ¶m, &thermo); - } - } - - thermoCompute(-1, ¶m, &thermo); - - return EXIT_SUCCESS; -} diff --git a/util/plot_gather_data.py b/util/plot_gather_data.py deleted file mode 100644 index c639e91..0000000 --- a/util/plot_gather_data.py +++ /dev/null @@ -1,114 +0,0 @@ -import matplotlib.pyplot as plt -import sys - -filename = sys.argv[1] -plot_output_file = filename.replace(".txt", ".pdf") -raw_output_file = filename.replace(".txt", ".csv") -fig = plt.figure() -ax = plt.axes() -plot_data = {} - -status = 0 # No header found -md_case = False -cut_cls_case = False -stride = None -dims = None -freq = None -cl_size = None -vector_width = None -cache_lines_per_gather = None - -with open(filename, 'r') as fp: - for line in fp.readlines(): - line = line.strip() - - if len(line) <= 0 or "likwid-pin" in line or "INFO:" in line: - continue - - if line.startswith("ISA,"): - status = 1 - md_case = True if "Dims" in line else False - continue - - if line.startswith("N,"): - status = 2 - cut_cls_case = True if "cut CLs" in line else False - continue - - assert status == 1 or status == 2, "Invalid input!" - - if status == 1: - if md_case: - isa, layout, stride, dims, freq, cl_size, vector_width, cache_lines_per_gather = line.split(',') - else: - isa, stride, freq, cl_size, vector_width, cache_lines_per_gather = line.split(',') - - stride = int(stride) - continue - - if md_case: - if cut_cls_case: - N, size, cut_cls, total_time, time_per_it, cy_per_iter, cy_per_gather, cy_per_elem = line.split(',') - else: - print(line) - N, size, total_time, time_per_it, cy_per_iter, cy_per_gather, cy_per_elem = line.split(',') - cut_cls = 0 - else: - N, size, total_time, time_per_it, cy_per_gather, cy_per_elem = line.split(',') - cut_cls = 0 - - size = float(size) - cycles = float(cy_per_iter) if md_case else float(cy_per_gather) - - if stride not in plot_data: - plot_data[stride] = {} - - plot_data[stride][size] = cycles if size not in plot_data[stride] \ - else min(cycles, plot_data[stride][size]) - -all_sizes = set() -all_strides = set() -for stride in plot_data: - sizes = list(plot_data[stride].keys()) - sizes.sort() - cycles = [plot_data[stride][size] for size in sizes] - ax.plot(sizes, cycles, marker='.', label=str(stride)) - - for size in sizes: - all_sizes.add(size) - - all_strides.add(stride) - -all_sizes = list(all_sizes) -all_sizes.sort() -all_strides = list(all_strides) -all_strides.sort() -with open(raw_output_file, 'w') as wp: - wp.write(" size\stride") - - for stride in all_strides: - wp.write(",{0:14}".format(stride)) - - wp.write("\n") - - for size in all_sizes: - wp.write("{:14.6f}".format(size)) - for stride in all_strides: - try: - cycles = plot_data[stride][size] - except: - cycles = '' - - wp.write(",{:14.6f}".format(cycles)) - - wp.write("\n") - -cy_label = "Cycles per iteration" if md_case else "Cycles per gather" -ax.vlines([48, 1000, 48000], 0, 1, transform=ax.get_xaxis_transform(), linestyles='dashed', color=['#444444', '#777777']) -#ax.vlines([32, 1000, 28000], 0, 1, transform=ax.get_xaxis_transform(), linestyles='dashed', color=['#444444', '#777777', '#aaaaaa']) -ax.set(xlabel='Array size (kB)', ylabel=cy_label) -ax.set_xscale('log') -#ax.set_xticks([32, 1000, 28000]) -#ax.set_xlim(0, 200000) -plt.legend(title="Stride") -fig.savefig(plot_output_file, bbox_inches = 'tight', pad_inches = 0) diff --git a/util/plot_run_stub_data.py b/util/plot_run_stub_data.py deleted file mode 100644 index f1f70a5..0000000 --- a/util/plot_run_stub_data.py +++ /dev/null @@ -1,76 +0,0 @@ -import matplotlib.pyplot as plt -import sys - -vector_width = 8 # 8 doubles per zmm vector - -# Filter condition of which data to plot -def plot_filter(atoms_per_unit_cell): - #return atoms_per_unit_cell < 2048 - return True - -filename = sys.argv[1] -plot_output_file = filename.replace(".txt", ".pdf") -raw_output_file = filename.replace(".txt", ".csv") -fig = plt.figure() -ax = plt.axes() -plot_data = {} - -with open(filename, 'r') as fp: - for line in fp.readlines(): - steps, unit_cells, atoms_per_unit_cell, total_atoms, total_vol, atoms_vol, neigh_vol, time, atom_upds_per_sec, cy_per_atom, cy_per_neigh = line.split(',') - atoms_per_unit_cell = int(atoms_per_unit_cell) - vol = float(neigh_vol) - cy_per_atom = float(cy_per_atom) - - if plot_filter(atoms_per_unit_cell): - if atoms_per_unit_cell not in plot_data: - plot_data[atoms_per_unit_cell] = {} - - cy_per_iter = cy_per_atom * vector_width / atoms_per_unit_cell - plot_data[atoms_per_unit_cell][vol] = cy_per_iter if vol not in plot_data[atoms_per_unit_cell] \ - else min(cy_per_iter, plot_data[atoms_per_unit_cell][vol]) - -all_volumes = set() -all_configs = set() -for atoms_per_unit_cell in plot_data: - volumes = list(plot_data[atoms_per_unit_cell].keys()) - volumes.sort() - cycles = [plot_data[atoms_per_unit_cell][vol] for vol in volumes] - ax.plot(volumes, cycles, marker='.', label=str(atoms_per_unit_cell)) - - for vol in volumes: - all_volumes.add(vol) - - all_configs.add(atoms_per_unit_cell) - -all_volumes = list(all_volumes) -all_volumes.sort() -all_configs = list(all_configs) -all_configs.sort() -with open(raw_output_file, 'w') as wp: - wp.write(" volume\config") - - for conf in all_configs: - wp.write(",{0:14}".format(conf)) - - wp.write("\n") - - for vol in all_volumes: - wp.write("{:14.6f}".format(vol)) - for conf in all_configs: - try: - cycles = plot_data[conf][vol] - wp.write(",{:14.6f}".format(cycles)) - except: - wp.write(',' + ' ' * 14) - - - wp.write("\n") - -ax.vlines([32, 1000, 28000], 0, 1, transform=ax.get_xaxis_transform(), linestyles='dashed', color=['#444444', '#777777', '#aaaaaa']) -ax.set(xlabel='Neighbor data volume (kB)', ylabel='Cycles per iteration') -ax.set_xscale('log') -#ax.set_xticks([32, 1000, 28000]) -#ax.set_xlim(0, 200000) -plt.legend(title="atoms/uc") -fig.savefig(plot_output_file, bbox_inches = 'tight', pad_inches = 0) diff --git a/util/preds.py b/util/preds.py deleted file mode 100644 index f5d93db..0000000 --- a/util/preds.py +++ /dev/null @@ -1,28 +0,0 @@ -import sys -import re - -if len(sys.argv) != 6: - print("Usage: python preds.py ") - sys.exit(1) - -iaca_pred = float(sys.argv[1]) -mca_pred = float(sys.argv[2]) -osaca_pred = float(sys.argv[3]) -uica_pred = float(sys.argv[4]) -div_factor = float(sys.argv[5]) -preds = [x / div_factor for x in [iaca_pred, mca_pred, osaca_pred, uica_pred]] - -start = -4.0 -end = 36.0 -npoints = 50 -offset = (end - start) / (npoints - 1) -i = 0 -for pred in preds: - print(f"@target G0.S{i+6}") - print(f"@type xy") - for j in range(npoints): - pos = start + offset * j - print("{:.6f} {}".format(pos, pred)) - - print("&") - i += 1 diff --git a/util/run_stub.sh b/util/run_stub.sh deleted file mode 100644 index 5ceea52..0000000 --- a/util/run_stub.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash - -while getopts "a:f:n:o:r:x:y:z:" flag; do - case "${flag}" in - a) atoms_per_unit_cell=${OPTARG};; - f) frequency=${OPTARG};; - n) timesteps=${OPTARG};; - o) output_file=${OPTARG};; - r) nruns=${OPTARG};; - x) nx=${OPTARG};; - y) ny=${OPTARG};; - z) nz=${OPTARG};; - esac -done - -EXEC="../MDBench-ICC-stub" -ATOMS_PER_UNIT_CELL="${atoms_per_unit_cell:-8}" -FREQUENCY="${frequency:-0.0}" -TIMESTEPS="${timesteps:-200}" -OUTPUT_FILE="${output_file:-run_results.txt}" -NRUNS="${nruns:-3}" -NX="${nx:-4}" -NY="${ny:-4}" -NZ="${nz:-2}" - -for timesteps in ${TIMESTEPS}; do - for atoms_per_unit_cell in ${ATOMS_PER_UNIT_CELL}; do - for nx in ${NX}; do - for ny in ${NY}; do - for nz in ${NZ}; do - best_perf= - best_output="invalid" - for nruns in ${NRUNS}; do - output=$( - ./${EXEC} -f ${FREQUENCY} -n ${timesteps} -na ${atoms_per_unit_cell} -nx ${nx} -ny ${ny} -nz ${nz} -csv | - grep -v steps | - grep -iv resize - ) - perf=$(echo $output | cut -d',' -f8) - if [ -z "$best_perf" ]; then - best_perf="$perf" - best_output="$output" - elif (( $(echo "$perf > 0.0 && $perf < $best_perf" | bc -l) )); then - best_perf="$perf" - best_output="$output" - fi - done - - echo "${best_output}" | tee -a "${OUTPUT_FILE}" - done - done - done - done -done diff --git a/util/string_to_agr.py b/util/string_to_agr.py deleted file mode 100644 index 446b871..0000000 --- a/util/string_to_agr.py +++ /dev/null @@ -1,34 +0,0 @@ -import sys -import re - -if len(sys.argv) != 3: - print("Usage: python string_to_agr.py ") - sys.exit(1) - -input_filename = sys.argv[1] -div_factor = float(sys.argv[2]) -result_list = [] - -with open(input_filename, 'r') as file: - for line in file: - numbers = re.findall(r'\d+\.\d+', line) - divided_numbers = [float(number) / div_factor for number in numbers] - result_list.append(divided_numbers) - -start = -2.5 -bar_offset = 1.0 -group_offset = 8.0 -i = 0 - -for group in result_list: - print(f"@target G0.S{i}") - print(f"@type bar") - - j = 0 - for meas in group: - pos = start + i * bar_offset + j * group_offset - print(f"{pos} {meas}") - j += 1 - - print("&") - i += 1 diff --git a/verletlist/includes/atom.h b/verletlist/includes/atom.h deleted file mode 100644 index a877f98..0000000 --- a/verletlist/includes/atom.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (C) NHR@FAU, University Erlangen-Nuremberg. - * All rights reserved. This file is part of MD-Bench. - * Use of this source code is governed by a LGPL-3.0 - * license that can be found in the LICENSE file. - */ -#include - -#ifndef __ATOM_H_ -#define __ATOM_H_ - -#ifdef CUDA_TARGET -# define KERNEL_NAME "CUDA" -# define computeForceLJFullNeigh computeForceLJFullNeigh_cuda -# define initialIntegrate initialIntegrate_cuda -# define finalIntegrate finalIntegrate_cuda -# define buildNeighbor buildNeighbor_cuda -# define updatePbc updatePbc_cuda -# define updateAtomsPbc updateAtomsPbc_cuda -#else -# ifdef USE_SIMD_KERNEL -# define KERNEL_NAME "SIMD" -# define computeForceLJFullNeigh computeForceLJFullNeigh_simd -# else -# define KERNEL_NAME "plain-C" -# define computeForceLJFullNeigh computeForceLJFullNeigh_plain_c -# endif -# define initialIntegrate initialIntegrate_cpu -# define finalIntegrate finalIntegrate_cpu -# define buildNeighbor buildNeighbor_cpu -# define updatePbc updatePbc_cpu -# define updateAtomsPbc updateAtomsPbc_cpu -#endif - -typedef struct { - MD_FLOAT *x, *y, *z; - MD_FLOAT *vx, *vy, *vz; - MD_FLOAT *fx, *fy, *fz; - int *border_map; - int *type; - MD_FLOAT *epsilon; - MD_FLOAT *sigma6; - MD_FLOAT *cutforcesq; - MD_FLOAT *cutneighsq; -} DeviceAtom; - -typedef struct { - int Natoms, Nlocal, Nghost, Nmax; - MD_FLOAT *x, *y, *z; - MD_FLOAT *vx, *vy, *vz; - MD_FLOAT *fx, *fy, *fz; - int *border_map; - int *type; - int ntypes; - MD_FLOAT *epsilon; - MD_FLOAT *sigma6; - MD_FLOAT *cutforcesq; - MD_FLOAT *cutneighsq; - - // DEM - MD_FLOAT *radius; - MD_FLOAT *av; - MD_FLOAT *r; - - // Device data - DeviceAtom d_atom; -} Atom; - -extern void initAtom(Atom*); -extern void createAtom(Atom*, Parameter*); -extern int readAtom(Atom*, Parameter*); -extern int readAtom_pdb(Atom*, Parameter*); -extern int readAtom_gro(Atom*, Parameter*); -extern int readAtom_dmp(Atom*, Parameter*); -extern int readAtom_in(Atom*, Parameter*); -extern void writeAtom(Atom*, Parameter*); -extern void growAtom(Atom*); - -#ifdef AOS -# define POS_DATA_LAYOUT "AoS" -# define atom_x(i) atom->x[(i) * 3 + 0] -# define atom_y(i) atom->x[(i) * 3 + 1] -# define atom_z(i) atom->x[(i) * 3 + 2] -# define atom_vx(i) atom->vx[(i) * 3 + 0] -# define atom_vy(i) atom->vx[(i) * 3 + 1] -# define atom_vz(i) atom->vx[(i) * 3 + 2] -# define atom_fx(i) atom->fx[(i) * 3 + 0] -# define atom_fy(i) atom->fx[(i) * 3 + 1] -# define atom_fz(i) atom->fx[(i) * 3 + 2] -#else -# define POS_DATA_LAYOUT "SoA" -# define atom_x(i) atom->x[i] -# define atom_y(i) atom->y[i] -# define atom_z(i) atom->z[i] -# define atom_vx(i) atom->vx[i] -# define atom_vy(i) atom->vy[i] -# define atom_vz(i) atom->vz[i] -# define atom_fx(i) atom->fx[i] -# define atom_fy(i) atom->fy[i] -# define atom_fz(i) atom->fz[i] -#endif - -#endif