diff --git a/Makefile b/Makefile
index 4cff876..c82abdf 100644
--- a/Makefile
+++ b/Makefile
@@ -75,6 +75,10 @@ ifeq ($(strip $(ENABLE_OMP_SIMD)),true)
     DEFINES += -DENABLE_OMP_SIMD
 endif
 
+ifeq ($(strip $(USE_SIMD_KERNEL)),true)
+    DEFINES += -DUSE_SIMD_KERNEL
+endif
+
 VPATH     = $(SRC_DIR) $(ASM_DIR)
 ASM       = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
 OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s))
diff --git a/config.mk b/config.mk
index 3f398fe..8140400 100644
--- a/config.mk
+++ b/config.mk
@@ -14,8 +14,6 @@ DATA_LAYOUT ?= AOS
 ASM_SYNTAX ?= ATT
 # Debug
 DEBUG ?= false
-# Use omp simd pragma for lammps halfneigh
-ENABLE_OMP_SIMD ?= true
 
 # Explicitly store and load atom types (true or false)
 EXPLICIT_TYPES ?= false
@@ -26,6 +24,12 @@ INDEX_TRACER ?= false
 # Compute statistics
 COMPUTE_STATS ?= false
 
+# Configurations for lammps optimization scheme
+# Use omp simd pragma when running with half neighbor-lists
+ENABLE_OMP_SIMD ?= true
+# Use kernel with explicit SIMD intrinsics
+USE_SIMD_KERNEL ?= false
+
 # Configurations for gromacs optimization scheme
 # Use reference version
 USE_REFERENCE_VERSION ?= false
diff --git a/gromacs/includes/simd.h b/gromacs/includes/simd.h
index 32c6020..0b37f61 100644
--- a/gromacs/includes/simd.h
+++ b/gromacs/includes/simd.h
@@ -21,11 +21,20 @@
  * =======================================================================================
  */
 
+#include
 #include
 #include
 #include
 #include
 
+#ifndef CLUSTER_M
+#   define CLUSTER_M 1
+#endif
+
+#ifndef CLUSTER_N
+#   define CLUSTER_N 1
+#endif
+
 #ifdef AVX512
 #   if PRECISION == 2
 #       include "simd/avx512_double.h"
diff --git a/gromacs/includes/simd/avx512_double.h b/gromacs/includes/simd/avx512_double.h
index 7294c54..a482185 100644
--- a/gromacs/includes/simd/avx512_double.h
+++ b/gromacs/includes/simd/avx512_double.h
@@ -26,6 +26,7 @@
 
 #define MD_SIMD_FLOAT   __m512d
 #define MD_SIMD_MASK    __mmask8
+#define MD_SIMD_INT     __m256i
 
 static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm512_set1_pd(scalar); }
 static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_pd(0.0); }
@@ -110,3 +111,14 @@ static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1,
     simd_h_decr(m + CLUSTER_N, a1);
     simd_h_decr(m + CLUSTER_N * 2, a2);
 }
+
+// Functions used in LAMMPS kernel
+static inline MD_SIMD_FLOAT simd_gather(MD_SIMD_INT vidx, const MD_FLOAT *m, int s) { return _mm512_i32gather_pd(vidx, m, s); }
+static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm256_set1_epi32(scalar); }
+static inline MD_SIMD_INT simd_int_zero() { return _mm256_setzero_si256(); }
+static inline MD_SIMD_INT simd_int_seq() { return _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); }
+static inline MD_SIMD_INT simd_int_load(const int *m) { return _mm256_load_epi32(m); }
+static inline MD_SIMD_INT simd_int_add(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm256_add_epi32(a, b); }
+static inline MD_SIMD_INT simd_int_mul(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm256_mul_epi32(a, b); }
+static inline MD_SIMD_INT simd_int_mask_load(const int *m, MD_SIMD_MASK k) { return _mm256_mask_load_epi32(simd_int_zero(), k, m); }
+static inline MD_SIMD_MASK simd_mask_int_cond_lt(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm256_cmp_epi32_mask(a, b, _MM_CMPINT_LT); }
diff --git a/lammps/force_lj.c b/lammps/force_lj.c
index b111c4e..ba9aa8f 100644
--- a/lammps/force_lj.c
+++ b/lammps/force_lj.c
@@ -28,7 +28,10 @@
 #include
 #include
 
-double computeForceLJFullNeigh(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
+// TODO: Joint common files for gromacs and lammps variants
+#include "../gromacs/includes/simd.h"
+
+double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
     int Nlocal = atom->Nlocal;
     int* neighs;
 #ifndef EXPLICIT_TYPES
@@ -184,3 +187,71 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
     double E = getTimeStamp();
     return E-S;
 }
+
+double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
+    int Nlocal = atom->Nlocal;
+    int* neighs;
+    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
+    MD_FLOAT sigma6 = param->sigma6;
+    MD_FLOAT epsilon = param->epsilon;
+    MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
+    MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
+    MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
+    MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
+    MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
+
+    for(int i = 0; i < Nlocal; i++) {
+        atom_fx(i) = 0.0;
+        atom_fy(i) = 0.0;
+        atom_fz(i) = 0.0;
+    }
+
+    double S = getTimeStamp();
+    LIKWID_MARKER_START("force");
+
+    #pragma omp parallel for
+    for(int i = 0; i < Nlocal; i++) {
+        neighs = &neighbor->neighbors[i * neighbor->maxneighs];
+        int numneighs = neighbor->numneigh[i];
+        MD_SIMD_INT numneighs_vec = simd_int_broadcast(numneighs);
+        MD_SIMD_FLOAT xtmp = simd_broadcast(atom_x(i));
+        MD_SIMD_FLOAT ytmp = simd_broadcast(atom_y(i));
+        MD_SIMD_FLOAT ztmp = simd_broadcast(atom_z(i));
+        MD_SIMD_FLOAT fix = simd_zero();
+        MD_SIMD_FLOAT fiy = simd_zero();
+        MD_SIMD_FLOAT fiz = simd_zero();
+
+        for(int k = 0; k < numneighs; k += VECTOR_WIDTH) {
+            // If the last iteration of this loop is separated from the rest, this mask can be set only there
+            MD_SIMD_MASK mask_numneighs = simd_mask_int_cond_lt(simd_int_add(simd_int_broadcast(k), simd_int_seq()), numneighs_vec);
+            MD_SIMD_INT j = simd_int_mask_load(&neighs[k], mask_numneighs);
+            #ifdef AOS
+            MD_SIMD_INT j3 = simd_int_add(simd_int_add(j, j), j); // j * 3
+            MD_SIMD_FLOAT delx = xtmp - simd_gather(j3, &(atom->x[0]), sizeof(MD_FLOAT));
+            MD_SIMD_FLOAT dely = ytmp - simd_gather(j3, &(atom->x[1]), sizeof(MD_FLOAT));
+            MD_SIMD_FLOAT delz = ztmp - simd_gather(j3, &(atom->x[2]), sizeof(MD_FLOAT));
+            #else
+            MD_SIMD_FLOAT delx = xtmp - simd_gather(j, atom->x, sizeof(MD_FLOAT));
+            MD_SIMD_FLOAT dely = ytmp - simd_gather(j, atom->y, sizeof(MD_FLOAT));
+            MD_SIMD_FLOAT delz = ztmp - simd_gather(j, atom->z, sizeof(MD_FLOAT));
+            #endif
+            MD_SIMD_FLOAT rsq = simd_fma(delx, delx, simd_fma(dely, dely, simd_mul(delz, delz)));
+            MD_SIMD_MASK cutoff_mask = simd_mask_and(mask_numneighs, simd_mask_cond_lt(rsq, cutforcesq_vec));
+            MD_SIMD_FLOAT sr2 = simd_reciprocal(rsq);
+            MD_SIMD_FLOAT sr6 = simd_mul(sr2, simd_mul(sr2, simd_mul(sr2, sigma6_vec)));
+            MD_SIMD_FLOAT force = simd_mul(c48_vec, simd_mul(sr6, simd_mul(simd_sub(sr6, c05_vec), simd_mul(sr2, eps_vec))));
+
+            fix = simd_masked_add(fix, simd_mul(delx, force), cutoff_mask);
+            fiy = simd_masked_add(fiy, simd_mul(dely, force), cutoff_mask);
+            fiz = simd_masked_add(fiz, simd_mul(delz, force), cutoff_mask);
+        }
+
+        atom_fx(i) += simd_h_reduce_sum(fix);
+        atom_fy(i) += simd_h_reduce_sum(fiy);
+        atom_fz(i) += simd_h_reduce_sum(fiz);
+    }
+
+    LIKWID_MARKER_STOP("force");
+    double E = getTimeStamp();
+    return E-S;
+}
diff --git a/lammps/main.c b/lammps/main.c
index 3b7b63c..f770fa1 100644
--- a/lammps/main.c
+++ b/lammps/main.c
@@ -45,10 +45,19 @@
 
 #define HLINE "----------------------------------------------------------------------------\n"
 
-extern double computeForceLJFullNeigh(Parameter*, Atom*, Neighbor*, Stats*);
+extern double computeForceLJFullNeigh_plain_c(Parameter*, Atom*, Neighbor*, Stats*);
+extern double computeForceLJFullNeigh_simd(Parameter*, Atom*, Neighbor*, Stats*);
 extern double computeForceLJHalfNeigh(Parameter*, Atom*, Neighbor*, Stats*);
 extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
 
+#ifdef USE_SIMD_KERNEL
+#   define KERNEL_NAME "SIMD"
+#   define computeForceLJFullNeigh computeForceLJFullNeigh_simd
+#else
+#   define KERNEL_NAME "plain-C"
+#   define computeForceLJFullNeigh computeForceLJFullNeigh_plain_c
+#endif
+
 double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
     if(param->force_field == FF_EAM) { initEam(eam, param); }
     double S, E;