From af92800c64b2b502bd6bd749b82fb906b3ec5f0b Mon Sep 17 00:00:00 2001 From: Rafael Ravedutti Date: Wed, 2 Mar 2022 23:12:04 +0100 Subject: [PATCH] Add SIMD version with AVX (no AVX2) and XTC output Signed-off-by: Rafael Ravedutti --- Makefile | 9 +++++ config.mk | 10 +++-- gromacs/includes/parameter.h | 5 ++- gromacs/includes/simd.h | 78 +++++++++++++++++++++++++----------- gromacs/includes/xtc.h | 37 +++++++++++++++++ gromacs/main.c | 38 ++++++++++++++++-- gromacs/xtc.c | 71 ++++++++++++++++++++++++++++++++ include_GROMACS.mk | 11 +++++ 8 files changed, 228 insertions(+), 31 deletions(-) create mode 100644 gromacs/includes/xtc.h create mode 100644 gromacs/xtc.c create mode 100644 include_GROMACS.mk diff --git a/Makefile b/Makefile index 7986ebb..e214557 100644 --- a/Makefile +++ b/Makefile @@ -10,6 +10,7 @@ Q ?= @ include $(MAKE_DIR)/config.mk include $(MAKE_DIR)/include_$(TAG).mk include $(MAKE_DIR)/include_LIKWID.mk +include $(MAKE_DIR)/include_GROMACS.mk INCLUDES += -I./$(SRC_DIR)/includes ifeq ($(strip $(DATA_LAYOUT)),AOS) @@ -52,6 +53,10 @@ ifeq ($(strip $(COMPUTE_STATS)),true) DEFINES += -DCOMPUTE_STATS endif +ifeq ($(strip $(XTC_OUTPUT)),true) + DEFINES += -DXTC_OUTPUT +endif + ifeq ($(strip $(USE_REFERENCE_VERSION)),true) DEFINES += -DUSE_REFERENCE_VERSION endif @@ -64,6 +69,10 @@ ifneq ($(VECTOR_WIDTH),) DEFINES += -DVECTOR_WIDTH=$(VECTOR_WIDTH) endif +ifeq ($(strip $(NO_AVX2)),true) + DEFINES += -DNO_AVX2 +endif + VPATH = $(SRC_DIR) $(ASM_DIR) ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c)) OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s)) diff --git a/config.mk b/config.mk index 62206d8..6a27e04 100644 --- a/config.mk +++ b/config.mk @@ -1,9 +1,9 @@ # Compiler tag (GCC/CLANG/ICC) TAG ?= ICC # Optimization scheme (lammps/gromacs/clusters_per_bin) -OPT_SCHEME = gromacs +OPT_SCHEME ?= gromacs # Enable likwid (true or false) -ENABLE_LIKWID ?= false +ENABLE_LIKWID ?= true # SP or DP DATA_TYPE ?= DP # AOS or SOA @@ -24,7 +24,9 @@ MEM_TRACER ?= false # Trace indexes and distances for gather-md (true or false) INDEX_TRACER ?= false # Vector width (elements) for index and distance tracer -VECTOR_WIDTH ?= 8 +VECTOR_WIDTH ?= 4 +# When vector width is 4 but AVX2 is not supported (AVX only), set this to true +NO_AVX2 ?= true # Compute statistics COMPUTE_STATS ?= true @@ -33,6 +35,8 @@ COMPUTE_STATS ?= true CLUSTER_LAYOUT ?= SOA # Use reference version USE_REFERENCE_VERSION ?= false +# Enable XTC output +XTC_OUTPUT ?= false #Feature options OPTIONS = -DALIGNMENT=64 diff --git a/gromacs/includes/parameter.h b/gromacs/includes/parameter.h index 84a0cdb..c5a03a5 100644 --- a/gromacs/includes/parameter.h +++ b/gromacs/includes/parameter.h @@ -33,6 +33,7 @@ typedef struct { int force_field; char* input_file; char* vtk_file; + char *xtc_file; MD_FLOAT epsilon; MD_FLOAT sigma6; MD_FLOAT temp; @@ -41,8 +42,10 @@ typedef struct { int ntypes; int ntimes; int nstat; - int every; + int reneigh_every; int prune_every; + int x_out_every; + int v_out_every; MD_FLOAT dt; MD_FLOAT dtforce; MD_FLOAT cutforce; diff --git a/gromacs/includes/simd.h b/gromacs/includes/simd.h index 9e1f443..ad86e7b 100644 --- a/gromacs/includes/simd.h +++ b/gromacs/includes/simd.h @@ -41,10 +41,10 @@ static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm512_fmadd_pd(a, b, c); } static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm512_rcp14_pd(a); } static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm512_mask_add_pd(a, m, a, b); } -static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); } -static inline MD_SIMD_MASK simd_mask_to_u32(unsigned int a) { return _cvtmask8_u32(a); } static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); } static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ); } +static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); } +static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); } static MD_SIMD_FLOAT simd_load2(MD_FLOAT *c0, MD_FLOAT *c1, int d) { MD_SIMD_FLOAT x; @@ -64,39 +64,55 @@ static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) { MD_SIMD_FLOAT x = _mm512_add_pd(a, _mm512_shuffle_f64x2(a, a, 0xee)); x = _mm512_add_pd(x, _mm512_shuffle_f64x2(x, x, 0x11)); x = _mm512_add_pd(x, _mm512_permute_pd(x, 0x01)); - return *((double *) &x); + return *((MD_FLOAT *) &x); } -#else // AVX2 +#else // AVX or AVX2 #define MD_SIMD_FLOAT __m256d + +#ifdef NO_AVX2 +#define MD_SIMD_MASK __m256d +#else #define MD_SIMD_MASK __mmask8 +#endif static inline MD_SIMD_FLOAT simd_broadcast(double scalar) { return _mm256_set1_pd(scalar); } static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_pd(0.0); } static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_pd(a, b); } static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_pd(a, b); } static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_pd(a, b); } -static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_pd(a, b, c); } -static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_pd(a); } -static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_pd(a, m, a, b); } -static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); } -static inline MD_SIMD_MASK simd_mask_to_u32(unsigned int a) { return _cvtmask8_u32(a); } -static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); } -static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_pd_mask(a, b, _CMP_LT_OQ); } -static MD_SIMD_FLOAT simd_load(MD_FLOAT *c0, int d) { - MD_SIMD_FLOAT x; -#ifdef CLUSTER_AOS - __m128i aos_gather_vindex = _mm128_set_epi32(9, 6, 3, 0); - __m128i vindex = _mm128_add_epi32(aos_gather_vindex, _mm128_set1_epi32(d)); - x = _mm256_i32gather_pd(c0, vindex, sizeof(double)); -#else - x = _mm256_load_pd(&c0[d * CLUSTER_DIM_M]); -#endif - return x; +#ifdef NO_AVX2 +static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(a))); } +static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return simd_add(simd_mul(a, b), c); } +static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return simd_add(a, _mm256_and_pd(b, m)); } +static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_pd(a, b, _CMP_LT_OQ); } +static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _mm256_and_pd(a, b); } +// TODO: Initialize all diagonal cases and just select the proper one (all bits set or diagonal) based on cond0 +static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { + const unsigned long long int all = 0xFFFFFFFFFFFFFFFF; + const unsigned long long int none = 0x0; + return _mm256_castsi256_pd(_mm256_set_epi64x((a & 0x8) ? all : none, (a & 0x4) ? all : none, (a & 0x2) ? all : none, (a & 0x1) ? all : none)); } - +// TODO: Implement this, althrough it is just required for debugging +static inline int simd_mask_to_u32(MD_SIMD_MASK a) { return 0; } +static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) { + __m128d a0, a1; + a = _mm256_add_pd(a, _mm256_permute_pd(a, 0b0101)); + a0 = _mm256_castpd256_pd128(a); + a1 = _mm256_extractf128_pd(a, 0x1); + a0 = _mm_add_sd(a0, a1); + return *((MD_FLOAT *) &a0); +} +#else +static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_pd(a); } +static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_pd(a, b, c); } +static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_pd(a, m, a, b); } +static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_pd_mask(a, b, _CMP_LT_OQ); } +static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); } +static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); } +static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); } static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) { __m128d a0, a1; // test with shuffle & add as an alternative to hadd later @@ -104,7 +120,23 @@ static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) { a0 = _mm256_castpd256_pd128(a); a1 = _mm256_extractf128_pd(a, 0x1); a0 = _mm_add_sd(a0, a1); - return *((double *) &a0); + return *((MD_FLOAT *) &a0); +} +#endif + +static MD_SIMD_FLOAT simd_load(MD_FLOAT *c0, int d) { + MD_SIMD_FLOAT x; +#ifdef CLUSTER_AOS + #ifdef NO_AVX2 + #error "Not possible to use AoS cluster layout without AVX2 support!" + #endif + __m128i aos_gather_vindex = _mm128_set_epi32(9, 6, 3, 0); + __m128i vindex = _mm128_add_epi32(aos_gather_vindex, _mm128_set1_epi32(d)); + x = _mm256_i32gather_pd(c0, vindex, sizeof(double)); +#else + x = _mm256_load_pd(&c0[d * CLUSTER_DIM_M]); +#endif + return x; } #endif diff --git a/gromacs/includes/xtc.h b/gromacs/includes/xtc.h new file mode 100644 index 0000000..96ecfc6 --- /dev/null +++ b/gromacs/includes/xtc.h @@ -0,0 +1,37 @@ +/* + * ======================================================================================= + * + * Author: Jan Eitzinger (je), jan.eitzinger@fau.de + * Copyright (c) 2020 RRZE, University Erlangen-Nuremberg + * + * This file is part of MD-Bench. + * + * MD-Bench is free software: you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A + * PARTICULAR PURPOSE. See the GNU Lesser General Public License for more + * details. + * + * You should have received a copy of the GNU Lesser General Public License along + * with MD-Bench. If not, see . + * ======================================================================================= + */ +#include + +#ifndef __XTC_H_ +#define __XTC_H_ + +#ifdef XTC_OUTPUT +void xtc_init(const char *, Atom*, int); +void xtc_write(Atom*, int, int, int); +void xtc_end(); +#else +#define xtc_init(a,b,c) +#define xtc_write(a,b,c,d) +#define xtc_end() +#endif +#endif diff --git a/gromacs/main.c b/gromacs/main.c index 2c5f013..773ee09 100644 --- a/gromacs/main.c +++ b/gromacs/main.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #define HLINE "----------------------------------------------------------------------------\n" @@ -58,6 +59,7 @@ extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*); void init(Parameter *param) { param->input_file = NULL; param->vtk_file = NULL; + param->xtc_file = NULL; param->force_field = FF_LJ; param->epsilon = 1.0; param->sigma6 = 1.0; @@ -75,8 +77,10 @@ void init(Parameter *param) { param->nstat = 100; param->mass = 1.0; param->dtforce = 0.5 * param->dt; - param->every = 20; + param->reneigh_every = 20; param->prune_every = 1000; + param->x_out_every = 20; + param->v_out_every = 5; param->proc_freq = 2.4; } @@ -241,6 +245,15 @@ int main(int argc, char** argv) { param.vtk_file = strdup(argv[++i]); continue; } + if((strcmp(argv[i], "--xtc") == 0)) { + #ifndef XTC_OUTPUT + fprintf(stderr, "XTC not available, set XTC_OUTPUT option in config.mk file and recompile MD-Bench!"); + exit(-1); + #else + param.xtc_file = strdup(argv[++i]); + #endif + continue; + } if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) { printf("MD Bench: A minimalistic re-implementation of miniMD\n"); printf(HLINE); @@ -253,6 +266,7 @@ int main(int argc, char** argv) { printf("-s / --skin : set skin (verlet buffer)\n"); printf("--freq : processor frequency (GHz)\n"); printf("--vtk : VTK file for visualization\n"); + printf("--xtc : XTC file for visualization\n"); printf(HLINE); exit(EXIT_SUCCESS); } @@ -277,10 +291,14 @@ int main(int argc, char** argv) { write_data_to_vtk_file(param.vtk_file, &atom, 0); } + if(param.xtc_file != NULL) { + xtc_init(param.xtc_file, &atom, 0); + } + for(int n = 0; n < param.ntimes; n++) { initialIntegrate(¶m, &atom); - if((n + 1) % param.every) { + if((n + 1) % param.reneigh_every) { if(!((n + 1) % param.prune_every)) { pruneNeighbor(¶m, &atom, &neighbor); } @@ -306,14 +324,26 @@ int main(int argc, char** argv) { computeThermo(n + 1, ¶m, &atom); } - if(param.vtk_file != NULL) { - write_data_to_vtk_file(param.vtk_file, &atom, n + 1); + int write_pos = !((n + 1) % param.x_out_every); + int write_vel = !((n + 1) % param.v_out_every); + if(write_pos || write_vel) { + if(param.vtk_file != NULL) { + write_data_to_vtk_file(param.vtk_file, &atom, n + 1); + } + + if(param.xtc_file != NULL) { + xtc_write(&atom, n + 1, write_pos, write_vel); + } } } timer[TOTAL] = getTimeStamp() - timer[TOTAL]; computeThermo(-1, ¶m, &atom); + if(param.xtc_file != NULL) { + xtc_end(); + } + printf(HLINE); printf("Force field: %s\n", ff2str(param.force_field)); printf("Data layout for positions: %s\n", POS_DATA_LAYOUT); diff --git a/gromacs/xtc.c b/gromacs/xtc.c new file mode 100644 index 0000000..d381f40 --- /dev/null +++ b/gromacs/xtc.c @@ -0,0 +1,71 @@ +/* + * ======================================================================================= + * + * Author: Jan Eitzinger (je), jan.eitzinger@fau.de + * Copyright (c) 2020 RRZE, University Erlangen-Nuremberg + * + * This file is part of MD-Bench. + * + * MD-Bench is free software: you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A + * PARTICULAR PURPOSE. See the GNU Lesser General Public License for more + * details. + * + * You should have received a copy of the GNU Lesser General Public License along + * with MD-Bench. If not, see . + * ======================================================================================= + */ +#include +//--- +#include +#include +#include + +#ifdef XTC_OUTPUT +#include + +static struct t_fileio *xtc_file = NULL; +static rvec *x_buf = NULL; +static rvec basis[3]; + +void xtc_init(const char *filename, Atom *atom, int timestep) { + basis[0][XX] = 1.0; + basis[0][YY] = 0.0; + basis[0][ZZ] = 0.0; + basis[1][XX] = 0.0; + basis[1][YY] = 1.0; + basis[1][ZZ] = 0.0; + basis[2][XX] = 0.0; + basis[2][YY] = 0.0; + basis[2][ZZ] = 1.0; + + xtc_file = open_xtc(filename, "w"); + x_buf = (rvec *) allocate(ALIGNMENT, sizeof(rvec) * (atom->Nlocal + 1)); + xtc_write(atom, timestep, 1, 1); +} + +void xtc_write(Atom *atom, int timestep, int write_pos, int write_vel) { + int i = 0; + for(int ci = 0; ci < atom->Nclusters_local; ++ci) { + MD_FLOAT *cptr = cluster_pos_ptr(ci); + for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) { + x_buf[i][XX] = cluster_x(cptr, cii); + x_buf[i][YY] = cluster_y(cptr, cii); + x_buf[i][ZZ] = cluster_z(cptr, cii); + i++; + } + } + + write_xtc(xtc_file, atom->Nlocal, timestep, 0.0, (const rvec *) basis, (const rvec *) x_buf, 1000); +} + +void xtc_end() { + free(x_buf); + close_xtc(xtc_file); +} +#endif diff --git a/include_GROMACS.mk b/include_GROMACS.mk new file mode 100644 index 0000000..d91198d --- /dev/null +++ b/include_GROMACS.mk @@ -0,0 +1,11 @@ +GROMACS_PATH=/apps/Gromacs/2018.1-mkl +GROMACS_INC ?= -I${GROMACS_PATH}/include +GROMACS_DEFINES ?= +GROMACS_LIB ?= -L${GROMACS_PATH}/lib64 + +ifeq ($(strip $(XTC_OUTPUT)),true) +INCLUDES += ${GROMACS_INC} +DEFINES += ${GROMACS_DEFINES} +LIBS += -lgromacs +LFLAGS += ${GROMACS_LIB} +endif