From 911ba6333680d198a9692c9346bbf7d5bbafa99a Mon Sep 17 00:00:00 2001 From: Rafael Ravedutti Date: Tue, 16 Aug 2022 18:36:47 +0200 Subject: [PATCH] Adjust ISA options and improve output Signed-off-by: Rafael Ravedutti --- Makefile | 8 +++- config.mk | 3 ++ gromacs/includes/simd/avx_avx2_double.h | 6 +-- include_ISA.mk | 3 +- lammps/force_dem.c | 8 ++-- lammps/force_lj.c | 59 ++++++++++++++----------- lammps/includes/util.h | 21 ++++++--- lammps/main.c | 9 +--- lammps/parameter.c | 6 ++- 9 files changed, 70 insertions(+), 53 deletions(-) diff --git a/Makefile b/Makefile index 668461a..5a8f1cd 100644 --- a/Makefile +++ b/Makefile @@ -64,8 +64,12 @@ ifneq ($(VECTOR_WIDTH),) DEFINES += -DVECTOR_WIDTH=$(VECTOR_WIDTH) endif -ifeq ($(strip $(NO_AVX2)),true) - DEFINES += -DNO_AVX2 +ifeq ($(strip $(MASK_REGISTERS)),true) + DEFINES += -DMASK_REGISTERS +endif + +ifeq ($(strip $(SIMD_KERNEL_AVAILABLE)),true) + DEFINES += -DSIMD_KERNEL_AVAILABLE endif ifeq ($(strip $(AVX512)),true) diff --git a/config.mk b/config.mk index 696f1e5..290c150 100644 --- a/config.mk +++ b/config.mk @@ -2,6 +2,9 @@ TAG ?= ICC # Instruction set (SSE/AVX/AVX2/AVX512) ISA ?= AVX512 +# Use mask registers (AVX512 must be available in the target CPU) +# This is always true when ISA is set to AVX512 +MASK_REGISTERS ?= true # Optimization scheme (lammps/gromacs/clusters_per_bin) OPT_SCHEME ?= lammps # Enable likwid (true or false) diff --git a/gromacs/includes/simd/avx_avx2_double.h b/gromacs/includes/simd/avx_avx2_double.h index c0c52e8..637d1ed 100644 --- a/gromacs/includes/simd/avx_avx2_double.h +++ b/gromacs/includes/simd/avx_avx2_double.h @@ -28,10 +28,10 @@ #define MD_SIMD_FLOAT __m256d #define MD_SIMD_INT __m128i -#ifdef NO_AVX2 -# define MD_SIMD_MASK __m256d -#else +#ifdef MASK_REGISTERS # define MD_SIMD_MASK __mmask8 +#else +# define MD_SIMD_MASK __m256d #endif static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm256_set1_pd(scalar); } diff --git a/include_ISA.mk b/include_ISA.mk index fa9b90b..7008d26 100644 --- a/include_ISA.mk +++ b/include_ISA.mk @@ -2,12 +2,13 @@ ifeq ($(strip $(ISA)), SSE) _VECTOR_WIDTH=2 else ifeq ($(strip $(ISA)), AVX) # Vector width is 4 but AVX2 instruction set is not supported -NO_AVX2=true _VECTOR_WIDTH=4 else ifeq ($(strip $(ISA)), AVX2) +#SIMD_KERNEL_AVAILABLE=true _VECTOR_WIDTH=4 else ifeq ($(strip $(ISA)), AVX512) AVX512=true +SIMD_KERNEL_AVAILABLE=true _VECTOR_WIDTH=8 endif diff --git a/lammps/force_dem.c b/lammps/force_dem.c index 47b5564..fdae21f 100644 --- a/lammps/force_dem.c +++ b/lammps/force_dem.c @@ -21,16 +21,14 @@ * ======================================================================================= */ #include +//--- +#include #include - -#include #include #include -#include #include +#include -// TODO: Joint common files for gromacs and lammps variants -#include "../gromacs/includes/simd.h" double computeForceDemFullNeigh(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { int Nlocal = atom->Nlocal; diff --git a/lammps/force_lj.c b/lammps/force_lj.c index 0c72873..408bb21 100644 --- a/lammps/force_lj.c +++ b/lammps/force_lj.c @@ -20,25 +20,29 @@ * with MD-Bench. If not, see . * ======================================================================================= */ +#include +#include +//--- +#include #include - -#include #include #include -#include #include +#include // TODO: Joint common files for gromacs and lammps variants +#ifdef SIMD_KERNEL_AVAILABLE #include "../gromacs/includes/simd.h" +#endif double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { int Nlocal = atom->Nlocal; int* neighs; -#ifndef EXPLICIT_TYPES + #ifndef EXPLICIT_TYPES MD_FLOAT cutforcesq = param->cutforce * param->cutforce; MD_FLOAT sigma6 = param->sigma6; MD_FLOAT epsilon = param->epsilon; -#endif + #endif for(int i = 0; i < Nlocal; i++) { atom_fx(i) = 0.0; @@ -49,7 +53,7 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n double S = getTimeStamp(); LIKWID_MARKER_START("force"); -#pragma omp parallel for + #pragma omp parallel for for(int i = 0; i < Nlocal; i++) { neighs = &neighbor->neighbors[i * neighbor->maxneighs]; int numneighs = neighbor->numneigh[i]; @@ -60,9 +64,9 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n MD_FLOAT fiy = 0; MD_FLOAT fiz = 0; -#ifdef EXPLICIT_TYPES + #ifdef EXPLICIT_TYPES const int type_i = atom->type[i]; -#endif + #endif for(int k = 0; k < numneighs; k++) { int j = neighs[k]; @@ -71,13 +75,13 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n MD_FLOAT delz = ztmp - atom_z(j); MD_FLOAT rsq = delx * delx + dely * dely + delz * delz; -#ifdef EXPLICIT_TYPES + #ifdef EXPLICIT_TYPES const int type_j = atom->type[j]; const int type_ij = type_i * atom->ntypes + type_j; const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij]; const MD_FLOAT sigma6 = atom->sigma6[type_ij]; const MD_FLOAT epsilon = atom->epsilon[type_ij]; -#endif + #endif if(rsq < cutforcesq) { MD_FLOAT sr2 = 1.0 / rsq; @@ -86,11 +90,11 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n fix += delx * force; fiy += dely * force; fiz += delz * force; -#ifdef USE_REFERENCE_VERSION + #ifdef USE_REFERENCE_VERSION addStat(stats->atoms_within_cutoff, 1); } else { addStat(stats->atoms_outside_cutoff, 1); -#endif + #endif } } @@ -110,11 +114,11 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { int Nlocal = atom->Nlocal; int* neighs; -#ifndef EXPLICIT_TYPES + #ifndef EXPLICIT_TYPES MD_FLOAT cutforcesq = param->cutforce * param->cutforce; MD_FLOAT sigma6 = param->sigma6; MD_FLOAT epsilon = param->epsilon; -#endif + #endif for(int i = 0; i < Nlocal; i++) { atom_fx(i) = 0.0; @@ -135,14 +139,14 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor, MD_FLOAT fiy = 0; MD_FLOAT fiz = 0; -#ifdef EXPLICIT_TYPES + #ifdef EXPLICIT_TYPES const int type_i = atom->type[i]; -#endif + #endif // Pragma required to vectorize the inner loop -#ifdef ENABLE_OMP_SIMD + #ifdef ENABLE_OMP_SIMD #pragma omp simd reduction(+: fix,fiy,fiz) -#endif + #endif for(int k = 0; k < numneighs; k++) { int j = neighs[k]; MD_FLOAT delx = xtmp - atom_x(j); @@ -150,13 +154,13 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor, MD_FLOAT delz = ztmp - atom_z(j); MD_FLOAT rsq = delx * delx + dely * dely + delz * delz; -#ifdef EXPLICIT_TYPES + #ifdef EXPLICIT_TYPES const int type_j = atom->type[j]; const int type_ij = type_i * atom->ntypes + type_j; const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij]; const MD_FLOAT sigma6 = atom->sigma6[type_ij]; const MD_FLOAT epsilon = atom->epsilon[type_ij]; -#endif + #endif if(rsq < cutforcesq) { MD_FLOAT sr2 = 1.0 / rsq; @@ -194,11 +198,6 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig MD_FLOAT cutforcesq = param->cutforce * param->cutforce; MD_FLOAT sigma6 = param->sigma6; MD_FLOAT epsilon = param->epsilon; - MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq); - MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6); - MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon); - MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0); - MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5); for(int i = 0; i < Nlocal; i++) { atom_fx(i) = 0.0; @@ -209,10 +208,16 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig double S = getTimeStamp(); LIKWID_MARKER_START("force"); - #ifdef NO_AVX2 - fprintf(stderr, "Error: SIMD kernel implemented for AVX2 and AVX512 only!"); + #ifndef SIMD_KERNEL_AVAILABLE + fprintf(stderr, "Error: SIMD kernel not implemented for specified instruction set!"); exit(-1); #else + MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq); + MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6); + MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon); + MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0); + MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5); + #pragma omp parallel for for(int i = 0; i < Nlocal; i++) { neighs = &neighbor->neighbors[i * neighbor->maxneighs]; diff --git a/lammps/includes/util.h b/lammps/includes/util.h index 3f188b2..680c78d 100644 --- a/lammps/includes/util.h +++ b/lammps/includes/util.h @@ -24,28 +24,37 @@ #define __UTIL_H_ #ifndef MIN -#define MIN(x,y) ((x)<(y)?(x):(y)) +# define MIN(x,y) ((x)<(y)?(x):(y)) #endif + #ifndef MAX -#define MAX(x,y) ((x)>(y)?(x):(y)) +# define MAX(x,y) ((x)>(y)?(x):(y)) #endif + #ifndef ABS -#define ABS(a) ((a) >= 0 ? (a) : -(a)) +# define ABS(a) ((a) >= 0 ? (a) : -(a)) #endif + #ifdef DEBUG -#define DEBUG_MESSAGE printf +# define DEBUG_MESSAGE printf #else -#define DEBUG_MESSAGE +# define DEBUG_MESSAGE #endif #ifndef MAXLINE -#define MAXLINE 4096 +# define MAXLINE 4096 #endif #define FF_LJ 0 #define FF_EAM 1 #define FF_DEM 2 +#if PRECISION == 1 +# define PRECISION_STRING "single" +#else +# define PRECISION_STRING "double" +#endif + extern double myrandom(int*); extern void random_reset(int *seed, int ibase, double *coord); extern int str2ff(const char *string); diff --git a/lammps/main.c b/lammps/main.c index 465f7a2..89a6d08 100644 --- a/lammps/main.c +++ b/lammps/main.c @@ -226,6 +226,7 @@ int main(int argc, char** argv) { param.cutneigh = param.cutforce + param.skin; setup(¶m, &eam, &atom, &neighbor, &stats); printParameter(¶m); + printf(HLINE); printf("step\ttemp\t\tpressure\n"); computeThermo(0, ¶m, &atom); @@ -272,14 +273,6 @@ int main(int argc, char** argv) { timer[TOTAL] = getTimeStamp() - timer[TOTAL]; computeThermo(-1, ¶m, &atom); - printf(HLINE); - printf("Force field: %s\n", ff2str(param.force_field)); - printf("Data layout for positions: %s\n", POS_DATA_LAYOUT); - #if PRECISION == 1 - printf("Using single precision floating point.\n"); - #else - printf("Using double precision floating point.\n"); - #endif printf(HLINE); printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes); printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n", diff --git a/lammps/parameter.c b/lammps/parameter.c index 5606bb2..e5282d2 100644 --- a/lammps/parameter.c +++ b/lammps/parameter.c @@ -24,6 +24,7 @@ #include #include //--- +#include #include #include @@ -154,6 +155,9 @@ void printParameter(Parameter *param) { } printf("\tForce field: %s\n", ff2str(param->force_field)); + printf("\tKernel: %s\n", KERNEL_NAME); + printf("\tData layout: %s\n", POS_DATA_LAYOUT); + printf("\tFloating-point precision: %s\n", PRECISION_STRING); printf("\tUnit cells (nx, ny, nz): %d, %d, %d\n", param->nx, param->ny, param->nz); printf("\tDomain box sizes (x, y, z): %e, %e, %e\n", param->xprd, param->yprd, param->zprd); printf("\tPeriodic (x, y, z): %d, %d, %d\n", param->pbc_x, param->pbc_y, param->pbc_z); @@ -175,5 +179,5 @@ void printParameter(Parameter *param) { printf("\tCutoff radius: %e\n", param->cutforce); printf("\tSkin: %e\n", param->skin); printf("\tHalf neighbor lists: %d\n", param->half_neigh); - printf("\tProcessor frequency (GHz): %.4f\n\n", param->proc_freq); + printf("\tProcessor frequency (GHz): %.4f\n", param->proc_freq); }