From c7360305c8bffef44672110a46926fe25b9d2d8e Mon Sep 17 00:00:00 2001 From: Rafael Ravedutti Date: Wed, 9 Mar 2022 02:25:39 +0100 Subject: [PATCH 1/7] Add first draft version of GROMACS method separating i-clusters and j-clusters Signed-off-by: Rafael Ravedutti --- Makefile | 1 + config.mk | 6 +- gromacs/force_lj.c | 10 +- gromacs/includes/atom.h | 79 +++++-- gromacs/includes/simd.h | 6 +- gromacs/main-stub.c | 7 +- gromacs/neighbor.c | 460 +++++++++++++++++++++++++--------------- gromacs/pbc.c | 74 ++++--- include_ISA.mk | 15 ++ 9 files changed, 411 insertions(+), 247 deletions(-) create mode 100644 include_ISA.mk diff --git a/Makefile b/Makefile index e214557..78289dc 100644 --- a/Makefile +++ b/Makefile @@ -10,6 +10,7 @@ Q ?= @ include $(MAKE_DIR)/config.mk include $(MAKE_DIR)/include_$(TAG).mk include $(MAKE_DIR)/include_LIKWID.mk +include $(MAKE_DIR)/include_ISA.mk include $(MAKE_DIR)/include_GROMACS.mk INCLUDES += -I./$(SRC_DIR)/includes diff --git a/config.mk b/config.mk index 3e8bb88..de2bf9d 100644 --- a/config.mk +++ b/config.mk @@ -1,5 +1,7 @@ # Compiler tag (GCC/CLANG/ICC) TAG ?= ICC +# Instruction set (SSE/AVX/AVX2/AVX512) +ISA ?= AVX512 # Optimization scheme (lammps/gromacs/clusters_per_bin) OPT_SCHEME ?= gromacs # Enable likwid (true or false) @@ -23,10 +25,6 @@ EXPLICIT_TYPES ?= false MEM_TRACER ?= false # Trace indexes and distances for gather-md (true or false) INDEX_TRACER ?= false -# Vector width (elements) for index and distance tracer -VECTOR_WIDTH ?= 8 -# When vector width is 4 but AVX2 is not supported (AVX only), set this to true -NO_AVX2 ?= false # Compute statistics COMPUTE_STATS ?= true diff --git a/gromacs/force_lj.c b/gromacs/force_lj.c index 07ed0e5..04739a6 100644 --- a/gromacs/force_lj.c +++ b/gromacs/force_lj.c @@ -63,7 +63,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat int cj = neighs[k]; int any = 0; MD_FLOAT *cjptr = cluster_pos_ptr(cj); - for(int cii = 0; cii < CLUSTER_DIM_M; cii++) { + for(int cii = 0; cii < CLUSTER_M; cii++) { MD_FLOAT xtmp = cluster_x(ciptr, cii); MD_FLOAT ytmp = cluster_y(ciptr, cii); MD_FLOAT ztmp = cluster_z(ciptr, cii); @@ -71,7 +71,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_FLOAT fiy = 0; MD_FLOAT fiz = 0; - for(int cjj = 0; cjj < CLUSTER_DIM_M; cjj++) { + for(int cjj = 0; cjj < CLUSTER_M; cjj++) { if(ci != cj || cii != cjj) { MD_FLOAT delx = xtmp - cluster_x(cjptr, cjj); MD_FLOAT dely = ytmp - cluster_y(cjptr, cjj); @@ -106,7 +106,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat addStat(stats->calculated_forces, 1); addStat(stats->num_neighs, numneighs); - addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_DIM_M / CLUSTER_DIM_N)); + addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N)); } LIKWID_MARKER_STOP("force"); @@ -128,7 +128,7 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_SIMD_FLOAT epsilon_vec = simd_broadcast(epsilon); MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0); MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5); - const int unroll_j = CLUSTER_DIM_N / CLUSTER_DIM_M; + const int unroll_j = CLUSTER_N / CLUSTER_M; double S = getTimeStamp(); LIKWID_MARKER_START("force"); @@ -262,7 +262,7 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat addStat(stats->calculated_forces, 1); addStat(stats->num_neighs, numneighs); - addStat(stats->force_iters, (long long int)((double)numneighs * 
CLUSTER_DIM_M / CLUSTER_DIM_N)); + addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N)); } LIKWID_MARKER_STOP("force"); diff --git a/gromacs/includes/atom.h b/gromacs/includes/atom.h index 854d0b0..2373d6b 100644 --- a/gromacs/includes/atom.h +++ b/gromacs/includes/atom.h @@ -27,13 +27,60 @@ #define DELTA 20000 -#define CLUSTER_DIM_M 4 -#define CLUSTER_DIM_N VECTOR_WIDTH +#define CLUSTER_M 4 + +// Nbnxn layouts (as of GROMACS): +// Simd4xN: M=4, N=VECTOR_WIDTH +// Simd2xNN: M=4, N=(VECTOR_WIDTH/2) + +// Simd2XNN (here used for single-precision) +#if VECTOR_WIDTH > CLUSTER_M * 2 +# define CLUSTER_N (VECTOR_WIDTH / 2) +// Simd4xN +#else +# define CLUSTER_N VECTOR_WIDTH +#endif + +#if CLUSTER_M == CLUSTER_N +# define CJ0_FROM_CI(a) (a) +# define CJ1_FROM_CI(a) (a) +# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b)) +# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b)) +#elif CLUSTER_M == CLUSTER_N * 2 // M > N +# define CJ0_FROM_CI(a) ((a) << 1) +# define CJ1_FROM_CI(a) (((a) << 1) | 0x1) +# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_M * (b)) +# define CJ_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_M * (b) + ((a) & 0x1) * (CLUSTER_M >> 1)) +#elif CLUSTER_M == CLUSTER_N / 2 // M < N +# define CJ0_FROM_CI(a) ((a) >> 1) +# define CJ1_FROM_CI(a) ((a) >> 1) +# define CI_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_N * (b) + ((a) & 0x1) * (CLUSTER_N >> 1)) +# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b)) +#else +# error "Invalid cluster configuration!" +#endif + +#if CLUSTER_N != 2 && CLUSTER_N != 4 && CLUSTER_N != 8 +# error "Cluster N dimension can be only 2, 4 and 8" +#endif + +#define CI_SCALAR_BASE_INDEX(a) (CI_BASE_INDEX(a, 1)) +#define CI_VECTOR_BASE_INDEX(a) (CI_BASE_INDEX(a, 3)) +#define CJ_SCALAR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 1)) +#define CJ_VECTOR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 3)) + +#if CLUSTER_M >= CLUSTER_N +# define CL_X_OFFSET (0 * CLUSTER_M) +# define CL_Y_OFFSET (1 * CLUSTER_M) +# define CL_Z_OFFSET (2 * CLUSTER_M) +#else +# define CL_X_OFFSET (0 * CLUSTER_N) +# define CL_Y_OFFSET (1 * CLUSTER_N) +# define CL_Z_OFFSET (2 * CLUSTER_N) +#endif typedef struct { - int bin; int natoms; - int type[CLUSTER_DIM_M]; MD_FLOAT bbminx, bbmaxx; MD_FLOAT bbminy, bbmaxy; MD_FLOAT bbminz, bbmaxz; @@ -44,9 +91,6 @@ typedef struct { int Nclusters, Nclusters_local, Nclusters_ghost, Nclusters_max; MD_FLOAT *x, *y, *z; MD_FLOAT *vx, *vy, *vz; - MD_FLOAT *cl_x; - MD_FLOAT *cl_v; - MD_FLOAT *cl_f; int *border_map; int *type; int ntypes; @@ -54,8 +98,13 @@ typedef struct { MD_FLOAT *sigma6; MD_FLOAT *cutforcesq; MD_FLOAT *cutneighsq; - Cluster *clusters; int *PBCx, *PBCy, *PBCz; + // Data in cluster format + MD_FLOAT *cl_x; + MD_FLOAT *cl_v; + MD_FLOAT *cl_f; + int *cl_type; + Cluster *iclusters, *jclusters; } Atom; extern void initAtom(Atom*); @@ -67,10 +116,6 @@ extern int readAtom_dmp(Atom*, Parameter*); extern void growAtom(Atom*); extern void growClusters(Atom*); -#define cluster_pos_ptr(ci) &(atom->cl_x[(ci) * CLUSTER_DIM_M * 3]) -#define cluster_velocity_ptr(ci) &(atom->cl_v[(ci) * CLUSTER_DIM_M * 3]) -#define cluster_force_ptr(ci) &(atom->cl_f[(ci) * CLUSTER_DIM_M * 3]) - #ifdef AOS #define POS_DATA_LAYOUT "AoS" #define atom_x(i) atom->x[(i) * 3 + 0] @@ -83,14 +128,4 @@ extern void growClusters(Atom*); #define atom_z(i) atom->z[i] #endif -#ifdef CLUSTER_AOS -#define cluster_x(cptr, i) cptr[(i) * 3 + 0] -#define cluster_y(cptr, i) cptr[(i) * 3 + 1] -#define cluster_z(cptr, i) cptr[(i) * 3 + 2] -#else -#define cluster_x(cptr, i) cptr[0 * CLUSTER_DIM_M + (i)] -#define 
cluster_y(cptr, i) cptr[1 * CLUSTER_DIM_M + (i)] -#define cluster_z(cptr, i) cptr[2 * CLUSTER_DIM_M + (i)] -#endif - #endif diff --git a/gromacs/includes/simd.h b/gromacs/includes/simd.h index ad86e7b..9c0a842 100644 --- a/gromacs/includes/simd.h +++ b/gromacs/includes/simd.h @@ -54,8 +54,8 @@ static MD_SIMD_FLOAT simd_load2(MD_FLOAT *c0, MD_FLOAT *c1, int d) { x = _mm512_mask_i32gather_pd(simd_zero(), simd_mask_from_u32(0x0f), vindex, c0, sizeof(double)); x = _mm512_mask_i32gather_pd(x, simd_mask_from_u32(0xf0), vindex, c1, sizeof(double)); #else - x = _mm512_load_pd(&c0[d * CLUSTER_DIM_M]); - x = _mm512_insertf64x4(x, _mm256_load_pd(&c1[d * CLUSTER_DIM_M]), 1); + x = _mm512_load_pd(&c0[d * CLUSTER_M]); + x = _mm512_insertf64x4(x, _mm256_load_pd(&c1[d * CLUSTER_M]), 1); #endif return x; } @@ -134,7 +134,7 @@ static MD_SIMD_FLOAT simd_load(MD_FLOAT *c0, int d) { __m128i vindex = _mm128_add_epi32(aos_gather_vindex, _mm128_set1_epi32(d)); x = _mm256_i32gather_pd(c0, vindex, sizeof(double)); #else - x = _mm256_load_pd(&c0[d * CLUSTER_DIM_M]); + x = _mm256_load_pd(&c0[d * CLUSTER_M]); #endif return x; } diff --git a/gromacs/main-stub.c b/gromacs/main-stub.c index 4da90db..c7596e7 100644 --- a/gromacs/main-stub.c +++ b/gromacs/main-stub.c @@ -217,8 +217,6 @@ int main(int argc, const char *argv[]) { if(!csv) { printf("Number of timesteps: %d\n", param.ntimes); - printf("Number of times to compute the atoms loop: %d\n", ATOMS_LOOP_RUNS); - printf("Number of times to compute the neighbors loop: %d\n", NEIGHBORS_LOOP_RUNS); printf("System size (unit cells): %dx%dx%d\n", param.nx, param.ny, param.nz); printf("Atoms per unit cell: %d\n", atoms_per_unit_cell); printf("Total number of atoms: %d\n", atom->Nlocal); @@ -257,9 +255,8 @@ int main(int argc, const char *argv[]) { E = getTimeStamp(); double T_accum = E-S; double freq_hz = param.proc_freq * 1.e9; - const double repeats = ATOMS_LOOP_RUNS * NEIGHBORS_LOOP_RUNS; - const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes * repeats); - const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes * repeats) * freq_hz; + const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes); + const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes) * freq_hz; const double cycles_per_neigh = cycles_per_atom / (double)(atoms_per_unit_cell - 1); if(!csv) { diff --git a/gromacs/neighbor.c b/gromacs/neighbor.c index 3d86183..1f40ef9 100644 --- a/gromacs/neighbor.c +++ b/gromacs/neighbor.c @@ -39,8 +39,9 @@ static int nbinx, nbiny; static int mbinx, mbiny; // n bins in x, y static int *bincount; static int *bins; -static int *cluster_bincount; -static int *cluster_bins; +static int *bin_nclusters; +static int *bin_clusters; +static int *icluster_bin; static int mbins; //total number of bins static int atoms_per_bin; // max atoms per bin static int clusters_per_bin; // max clusters per bin @@ -63,12 +64,12 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) { cutneigh = param->cutneigh; nmax = 0; atoms_per_bin = 8; - clusters_per_bin = (atoms_per_bin / CLUSTER_DIM_M) + 4; + clusters_per_bin = (atoms_per_bin / CLUSTER_M) + 4; stencil = NULL; bins = NULL; bincount = NULL; - cluster_bins = NULL; - cluster_bincount = NULL; + bin_clusters = NULL; + bin_nclusters = NULL; neighbor->maxneighs = 100; neighbor->numneigh = NULL; neighbor->neighbors = NULL; @@ -91,7 +92,7 @@ void setupNeighbor(Parameter *param, Atom *atom) { MD_FLOAT zlo = 0.0; 
MD_FLOAT zhi = zprd; MD_FLOAT atom_density = ((MD_FLOAT)(atom->Nlocal)) / ((xhi - xlo) * (yhi - ylo) * (zhi - zlo)); - MD_FLOAT atoms_in_cell = MAX(CLUSTER_DIM_M, CLUSTER_DIM_N); + MD_FLOAT atoms_in_cell = MAX(CLUSTER_M, CLUSTER_N); MD_FLOAT targetsizex = cbrt(atoms_in_cell / atom_density); MD_FLOAT targetsizey = cbrt(atoms_in_cell / atom_density); nbinx = MAX(1, (int)ceil((xhi - xlo) / targetsizex)); @@ -103,20 +104,16 @@ void setupNeighbor(Parameter *param, Atom *atom) { cutneighsq = cutneigh * cutneigh; coord = xlo - cutneigh - SMALL * xprd; - mbinxlo = (int) (coord * bininvx); - if (coord < 0.0) { - mbinxlo = mbinxlo - 1; - } + mbinxlo = (int)(coord * bininvx); + if(coord < 0.0) { mbinxlo = mbinxlo - 1; } coord = xhi + cutneigh + SMALL * xprd; - mbinxhi = (int) (coord * bininvx); + mbinxhi = (int)(coord * bininvx); coord = ylo - cutneigh - SMALL * yprd; - mbinylo = (int) (coord * bininvy); - if (coord < 0.0) { - mbinylo = mbinylo - 1; - } + mbinylo = (int)(coord * bininvy); + if(coord < 0.0) { mbinylo = mbinylo - 1; } coord = yhi + cutneigh + SMALL * yprd; - mbinyhi = (int) (coord * bininvy); + mbinyhi = (int)(coord * bininvy); mbinxlo = mbinxlo - 1; mbinxhi = mbinxhi + 1; @@ -126,16 +123,12 @@ void setupNeighbor(Parameter *param, Atom *atom) { mbinyhi = mbinyhi + 1; mbiny = mbinyhi - mbinylo + 1; - nextx = (int) (cutneigh * bininvx); + nextx = (int)(cutneigh * bininvx); + nexty = (int)(cutneigh * bininvy); if(nextx * binsizex < FACTOR * cutneigh) nextx++; - - nexty = (int) (cutneigh * bininvy); if(nexty * binsizey < FACTOR * cutneigh) nexty++; - if (stencil) { - free(stencil); - } - + if (stencil) { free(stencil); } stencil = (int *) malloc((2 * nexty + 1) * (2 * nextx + 1) * sizeof(int)); nstencil = 0; @@ -147,19 +140,15 @@ void setupNeighbor(Parameter *param, Atom *atom) { } } + if(bincount) { free(bincount); } + if(bins) { free(bins); } + if(bin_nclusters) { free(bin_nclusters); } + if(bin_clusters) { free(bin_clusters); } mbins = mbinx * mbiny; - - if (bincount) { free(bincount); } bincount = (int*) malloc(mbins * sizeof(int)); - - if (bins) { free(bins); } bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int)); - - if (cluster_bincount) { free(cluster_bincount); } - cluster_bincount = (int*) malloc(mbins * sizeof(int)); - - if (cluster_bins) { free(cluster_bins); } - cluster_bins = (int*) malloc(mbins * clusters_per_bin * sizeof(int)); + bin_nclusters = (int*) malloc(mbins * sizeof(int)); + bin_clusters = (int*) malloc(mbins * clusters_per_bin * sizeof(int)); /* DEBUG_MESSAGE("lo, hi = (%e, %e, %e), (%e, %e, %e)\n", xlo, ylo, zlo, xhi, yhi, zhi); @@ -171,20 +160,20 @@ void setupNeighbor(Parameter *param, Atom *atom) { } MD_FLOAT getBoundingBoxDistanceSq(Atom *atom, int ci, int cj) { - MD_FLOAT dl = atom->clusters[ci].bbminx - atom->clusters[cj].bbmaxx; - MD_FLOAT dh = atom->clusters[cj].bbminx - atom->clusters[ci].bbmaxx; + MD_FLOAT dl = atom->iclusters[ci].bbminx - atom->jclusters[cj].bbmaxx; + MD_FLOAT dh = atom->jclusters[cj].bbminx - atom->iclusters[ci].bbmaxx; MD_FLOAT dm = MAX(dl, dh); MD_FLOAT dm0 = MAX(dm, 0.0); MD_FLOAT d2 = dm0 * dm0; - dl = atom->clusters[ci].bbminy - atom->clusters[cj].bbmaxy; - dh = atom->clusters[cj].bbminy - atom->clusters[ci].bbmaxy; + dl = atom->iclusters[ci].bbminy - atom->jclusters[cj].bbmaxy; + dh = atom->jclusters[cj].bbminy - atom->iclusters[ci].bbmaxy; dm = MAX(dl, dh); dm0 = MAX(dm, 0.0); d2 += dm0 * dm0; - dl = atom->clusters[ci].bbminz - atom->clusters[cj].bbmaxz; - dh = atom->clusters[cj].bbminz - atom->clusters[ci].bbmaxz; + dl = 
atom->iclusters[ci].bbminz - atom->jclusters[cj].bbmaxz; + dh = atom->jclusters[cj].bbminz - atom->iclusters[ci].bbmaxz; dm = MAX(dl, dh); dm0 = MAX(dm, 0.0); d2 += dm0 * dm0; @@ -192,14 +181,16 @@ MD_FLOAT getBoundingBoxDistanceSq(Atom *atom, int ci, int cj) { } int atomDistanceInRange(Atom *atom, int ci, int cj, MD_FLOAT rsq) { - MD_FLOAT *ciptr = cluster_pos_ptr(ci); - MD_FLOAT *cjptr = cluster_pos_ptr(cj); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; - for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) { - for(int cjj = 0; cjj < atom->clusters[cj].natoms; cjj++) { - MD_FLOAT delx = cluster_x(ciptr, cii) - cluster_x(cjptr, cjj); - MD_FLOAT dely = cluster_y(ciptr, cii) - cluster_y(cjptr, cjj); - MD_FLOAT delz = cluster_z(ciptr, cii) - cluster_z(cjptr, cjj); + for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) { + for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) { + MD_FLOAT delx = ci_x[CL_X_OFFSET + cii] - cj_x[CL_X_OFFSET + cjj]; + MD_FLOAT dely = ci_x[CL_Y_OFFSET + cii] - cj_x[CL_Y_OFFSET + cjj]; + MD_FLOAT delz = ci_x[CL_Z_OFFSET + cii] - cj_x[CL_Z_OFFSET + cjj]; if(delx * delx + dely * dely + delz * delz < rsq) { return 1; } @@ -234,22 +225,22 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { resize = 0; for(int ci = 0; ci < atom->Nclusters_local; ci++) { - int* neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]); + int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]); int n = 0; - int ibin = atom->clusters[ci].bin; - MD_FLOAT ibb_xmin = atom->clusters[ci].bbminx; - MD_FLOAT ibb_xmax = atom->clusters[ci].bbmaxx; - MD_FLOAT ibb_ymin = atom->clusters[ci].bbminy; - MD_FLOAT ibb_ymax = atom->clusters[ci].bbmaxy; - MD_FLOAT ibb_zmin = atom->clusters[ci].bbminz; - MD_FLOAT ibb_zmax = atom->clusters[ci].bbmaxz; + int ibin = icluster_bin[ci]; + MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx; + MD_FLOAT ibb_xmax = atom->iclusters[ci].bbmaxx; + MD_FLOAT ibb_ymin = atom->iclusters[ci].bbminy; + MD_FLOAT ibb_ymax = atom->iclusters[ci].bbmaxy; + MD_FLOAT ibb_zmin = atom->iclusters[ci].bbminz; + MD_FLOAT ibb_zmax = atom->iclusters[ci].bbmaxz; for(int k = 0; k < nstencil; k++) { int jbin = ibin + stencil[k]; - int *loc_bin = &cluster_bins[jbin * clusters_per_bin]; + int *loc_bin = &bin_clusters[jbin * clusters_per_bin]; int cj, m = -1; MD_FLOAT jbb_xmin, jbb_xmax, jbb_ymin, jbb_ymax, jbb_zmin, jbb_zmax; - const int c = cluster_bincount[jbin]; + const int c = bin_nclusters[jbin]; if(c > 0) { MD_FLOAT dl, dh, dm, dm0, d_bb_sq; @@ -257,8 +248,8 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { do { m++; cj = loc_bin[m]; - jbb_zmin = atom->clusters[cj].bbminz; - jbb_zmax = atom->clusters[cj].bbmaxz; + jbb_zmin = atom->jclusters[cj].bbminz; + jbb_zmax = atom->jclusters[cj].bbmaxz; dl = ibb_zmin - jbb_zmax; dh = jbb_zmin - ibb_zmax; dm = MAX(dl, dh); @@ -266,10 +257,10 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { d_bb_sq = dm0 * dm0; } while(m + 1 < c && d_bb_sq > cutneighsq); - jbb_xmin = atom->clusters[cj].bbminx; - jbb_xmax = atom->clusters[cj].bbmaxx; - jbb_ymin = atom->clusters[cj].bbminy; - jbb_ymax = atom->clusters[cj].bbmaxy; + jbb_xmin = atom->jclusters[cj].bbminx; + jbb_xmax = atom->jclusters[cj].bbmaxx; + jbb_ymin = atom->jclusters[cj].bbminy; + jbb_ymax = atom->jclusters[cj].bbmaxy; while(m < c) { dl = ibb_zmin - jbb_zmax; @@ -303,20 +294,21 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) 
{ m++; if(m < c) { cj = loc_bin[m]; - jbb_xmin = atom->clusters[cj].bbminx; - jbb_xmax = atom->clusters[cj].bbmaxx; - jbb_ymin = atom->clusters[cj].bbminy; - jbb_ymax = atom->clusters[cj].bbmaxy; - jbb_zmin = atom->clusters[cj].bbminz; - jbb_zmax = atom->clusters[cj].bbmaxz; + jbb_xmin = atom->jclusters[cj].bbminx; + jbb_xmax = atom->jclusters[cj].bbmaxx; + jbb_ymin = atom->jclusters[cj].bbminy; + jbb_ymax = atom->jclusters[cj].bbmaxy; + jbb_zmin = atom->jclusters[cj].bbminz; + jbb_zmax = atom->jclusters[cj].bbmaxz; } } } } - if(CLUSTER_DIM_N > CLUSTER_DIM_M) { - while(n % (CLUSTER_DIM_N / CLUSTER_DIM_M)) { - neighptr[n++] = nall - 1; // Last cluster is always a dummy cluster + // Fill neighbor list with dummy values to fit vector width + if(CLUSTER_N < VECTOR_WIDTH) { + while(n % (VECTOR_WIDTH / CLUSTER_N)) { + neighptr[n++] = CJ0_FROM_CI(nall - 1); // Last cluster is always a dummy cluster } } @@ -353,7 +345,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { atom->clusters[ci].bbminz, atom->clusters[ci].bbmaxz); - for(int cii = 0; cii < CLUSTER_DIM_M; cii++) { + for(int cii = 0; cii < CLUSTER_M; cii++) { DEBUG_MESSAGE("%f, %f, %f\n", cluster_x(ciptr, cii), cluster_y(ciptr, cii), cluster_z(ciptr, cii)); } @@ -371,7 +363,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { atom->clusters[cj].bbminz, atom->clusters[cj].bbmaxz); - for(int cjj = 0; cjj < CLUSTER_DIM_M; cjj++) { + for(int cjj = 0; cjj < CLUSTER_M; cjj++) { DEBUG_MESSAGE(" %f, %f, %f\n", cluster_x(cjptr, cjj), cluster_y(cjptr, cjj), cluster_z(cjptr, cjj)); } } @@ -393,8 +385,8 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) { int k = 0; // Remove dummy clusters if necessary - if(CLUSTER_DIM_N > CLUSTER_DIM_M) { - while(neighs[numneighs - 1] == nall - 1) { + if(CLUSTER_N < VECTOR_WIDTH) { + while(neighs[numneighs - 1] == CJ0_FROM_CI(nall - 1)) { numneighs--; } } @@ -410,9 +402,9 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) { } // Readd dummy clusters if necessary - if(CLUSTER_DIM_N > CLUSTER_DIM_M) { - while(numneighs % (CLUSTER_DIM_N / CLUSTER_DIM_M)) { - neighs[numneighs++] = nall - 1; // Last cluster is always a dummy cluster + if(CLUSTER_N < VECTOR_WIDTH) { + while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) { + neighs[numneighs++] = CJ0_FROM_CI(nall - 1); // Last cluster is always a dummy cluster } } @@ -558,34 +550,37 @@ void buildClusters(Atom *atom) { for(int bin = 0; bin < mbins; bin++) { int c = bincount[bin]; int ac = 0; - const int nclusters = ((c + CLUSTER_DIM_M - 1) / CLUSTER_DIM_M); + int nclusters = ((c + CLUSTER_M - 1) / CLUSTER_M); + if(CLUSTER_N > CLUSTER_M && nclusters % 2) { nclusters++; } for(int cl = 0; cl < nclusters; cl++) { const int ci = atom->Nclusters_local; - if(ci >= atom->Nclusters_max) { growClusters(atom); } - MD_FLOAT *cptr = cluster_pos_ptr(ci); - MD_FLOAT *cvptr = cluster_velocity_ptr(ci); + int ci_sca_base = CI_SCALAR_BASE_INDEX(ci); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base]; + int *ci_type = &atom->cl_type[ci_sca_base]; MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY; MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY; MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY; - atom->clusters[ci].natoms = 0; - for(int cii = 0; cii < CLUSTER_DIM_M; cii++) { + atom->iclusters[ci].natoms = 0; + for(int cii = 0; cii < CLUSTER_M; cii++) { if(ac < c) { int i = bins[bin * atoms_per_bin + ac]; MD_FLOAT xtmp = atom_x(i); MD_FLOAT ytmp = atom_y(i); MD_FLOAT ztmp = 
atom_z(i);
-                    cluster_x(cptr, cii) = xtmp;
-                    cluster_y(cptr, cii) = ytmp;
-                    cluster_z(cptr, cii) = ztmp;
-                    cluster_x(cvptr, cii) = atom->vx[i];
-                    cluster_y(cvptr, cii) = atom->vy[i];
-                    cluster_z(cvptr, cii) = atom->vz[i];
+                    ci_x[CL_X_OFFSET + cii] = xtmp;
+                    ci_x[CL_Y_OFFSET + cii] = ytmp;
+                    ci_x[CL_Z_OFFSET + cii] = ztmp;
+                    ci_v[CL_X_OFFSET + cii] = atom->vx[i];
+                    ci_v[CL_Y_OFFSET + cii] = atom->vy[i];
+                    ci_v[CL_Z_OFFSET + cii] = atom->vz[i];

                     // TODO: To create the bounding boxes faster, we can use SIMD operations
                     if(bbminx > xtmp) { bbminx = xtmp; }
@@ -595,24 +590,24 @@ void buildClusters(Atom *atom) {
                     if(bbminz > ztmp) { bbminz = ztmp; }
                     if(bbmaxz < ztmp) { bbmaxz = ztmp; }

-                    atom->clusters[ci].type[cii] = atom->type[i];
-                    atom->clusters[ci].natoms++;
+                    ci_type[cii] = atom->type[i];
+                    atom->iclusters[ci].natoms++;
                 } else {
-                    cluster_x(cptr, cii) = INFINITY;
-                    cluster_y(cptr, cii) = INFINITY;
-                    cluster_z(cptr, cii) = INFINITY;
+                    ci_x[CL_X_OFFSET + cii] = INFINITY;
+                    ci_x[CL_Y_OFFSET + cii] = INFINITY;
+                    ci_x[CL_Z_OFFSET + cii] = INFINITY;
                 }

                 ac++;
             }

-            atom->clusters[ci].bin = bin;
-            atom->clusters[ci].bbminx = bbminx;
-            atom->clusters[ci].bbmaxx = bbmaxx;
-            atom->clusters[ci].bbminy = bbminy;
-            atom->clusters[ci].bbmaxy = bbmaxy;
-            atom->clusters[ci].bbminz = bbminz;
-            atom->clusters[ci].bbmaxz = bbmaxz;
+            icluster_bin[ci] = bin;
+            atom->iclusters[ci].bbminx = bbminx;
+            atom->iclusters[ci].bbmaxx = bbmaxx;
+            atom->iclusters[ci].bbminy = bbminy;
+            atom->iclusters[ci].bbmaxy = bbmaxy;
+            atom->iclusters[ci].bbminz = bbminz;
+            atom->iclusters[ci].bbmaxz = bbmaxz;
             atom->Nclusters_local++;
         }
     }
@@ -620,6 +615,93 @@ void buildClusters(Atom *atom) {
     DEBUG_MESSAGE("buildClusters end\n");
 }

+void defineJClusters(Atom *atom) {
+    DEBUG_MESSAGE("defineJClusters start\n");
+
+    for(int ci = 0; ci < atom->Nclusters_local + atom->Nclusters_ghost; ci++) {
+        int cj0 = CJ0_FROM_CI(ci);
+
+        if(CLUSTER_M == CLUSTER_N) {
+            atom->jclusters[cj0].bbminx = atom->iclusters[ci].bbminx;
+            atom->jclusters[cj0].bbmaxx = atom->iclusters[ci].bbmaxx;
+            atom->jclusters[cj0].bbminy = atom->iclusters[ci].bbminy;
+            atom->jclusters[cj0].bbmaxy = atom->iclusters[ci].bbmaxy;
+            atom->jclusters[cj0].bbminz = atom->iclusters[ci].bbminz;
+            atom->jclusters[cj0].bbmaxz = atom->iclusters[ci].bbmaxz;
+            atom->jclusters[cj0].natoms = atom->iclusters[ci].natoms;
+
+        } else if(CLUSTER_M > CLUSTER_N) {
+            int cj1 = CJ1_FROM_CI(ci);
+            int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
+            MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
+            MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
+            MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
+            MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;
+
+            // cj0 covers the first MIN(natoms, CLUSTER_N) atoms of the i-cluster
+            for(int cii = 0; cii < MIN(atom->iclusters[ci].natoms, CLUSTER_N); cii++) {
+                MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii];
+                MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii];
+                MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii];
+
+                // TODO: To create the bounding boxes faster, we can use SIMD operations
+                if(bbminx > xtmp) { bbminx = xtmp; }
+                if(bbmaxx < xtmp) { bbmaxx = xtmp; }
+                if(bbminy > ytmp) { bbminy = ytmp; }
+                if(bbmaxy < ytmp) { bbmaxy = ytmp; }
+                if(bbminz > ztmp) { bbminz = ztmp; }
+                if(bbmaxz < ztmp) { bbmaxz = ztmp; }
+            }

+            atom->jclusters[cj0].bbminx = bbminx;
+            atom->jclusters[cj0].bbmaxx = bbmaxx;
+            atom->jclusters[cj0].bbminy = bbminy;
+            atom->jclusters[cj0].bbmaxy = bbmaxy;
+            atom->jclusters[cj0].bbminz = bbminz;
+            atom->jclusters[cj0].bbmaxz = bbmaxz;
+            atom->jclusters[cj0].natoms = MIN(atom->iclusters[ci].natoms, CLUSTER_N);
+
+            bbminx = INFINITY, bbmaxx = -INFINITY;
+            
bbminy = INFINITY, bbmaxy = -INFINITY;
+            bbminz = INFINITY, bbmaxz = -INFINITY;
+
+            for(int cii = CLUSTER_N; cii < atom->iclusters[ci].natoms; cii++) {
+                MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii];
+                MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii];
+                MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii];
+
+                // TODO: To create the bounding boxes faster, we can use SIMD operations
+                if(bbminx > xtmp) { bbminx = xtmp; }
+                if(bbmaxx < xtmp) { bbmaxx = xtmp; }
+                if(bbminy > ytmp) { bbminy = ytmp; }
+                if(bbmaxy < ytmp) { bbmaxy = ytmp; }
+                if(bbminz > ztmp) { bbminz = ztmp; }
+                if(bbmaxz < ztmp) { bbmaxz = ztmp; }
+            }
+
+            atom->jclusters[cj1].bbminx = bbminx;
+            atom->jclusters[cj1].bbmaxx = bbmaxx;
+            atom->jclusters[cj1].bbminy = bbminy;
+            atom->jclusters[cj1].bbmaxy = bbmaxy;
+            atom->jclusters[cj1].bbminz = bbminz;
+            atom->jclusters[cj1].bbmaxz = bbmaxz;
+            atom->jclusters[cj1].natoms = MAX(0, atom->iclusters[ci].natoms - CLUSTER_N);
+
+        } else {
+            if(ci % 2 == 0) {
+                atom->jclusters[cj0].bbminx = MIN(atom->iclusters[ci].bbminx, atom->iclusters[ci + 1].bbminx);
+                atom->jclusters[cj0].bbmaxx = MAX(atom->iclusters[ci].bbmaxx, atom->iclusters[ci + 1].bbmaxx);
+                atom->jclusters[cj0].bbminy = MIN(atom->iclusters[ci].bbminy, atom->iclusters[ci + 1].bbminy);
+                atom->jclusters[cj0].bbmaxy = MAX(atom->iclusters[ci].bbmaxy, atom->iclusters[ci + 1].bbmaxy);
+                atom->jclusters[cj0].bbminz = MIN(atom->iclusters[ci].bbminz, atom->iclusters[ci + 1].bbminz);
+                atom->jclusters[cj0].bbmaxz = MAX(atom->iclusters[ci].bbmaxz, atom->iclusters[ci + 1].bbmaxz);
+                atom->jclusters[cj0].natoms = atom->iclusters[ci].natoms + atom->iclusters[ci + 1].natoms;
+            }
+        }
+    }
+
+    DEBUG_MESSAGE("defineJClusters end\n");
+}
+
 void binClusters(Atom *atom) {
     DEBUG_MESSAGE("binClusters start\n");
@@ -629,7 +711,7 @@ void binClusters(Atom *atom) {
         MD_FLOAT *cptr = cluster_pos_ptr(ci);
         DEBUG_MESSAGE("Cluster %d:\n", ci);
         DEBUG_MESSAGE("bin=%d, Natoms=%d, bbox={%f,%f},{%f,%f},{%f,%f}\n",
-            atom->clusters[ci].bin,
+            icluster_bin[ci],
             atom->clusters[ci].natoms,
             atom->clusters[ci].bbminx, atom->clusters[ci].bbmaxx,
             atom->clusters[ci].bbminy, atom->clusters[ci].bbmaxy,
             atom->clusters[ci].bbminz, atom->clusters[ci].bbmaxz);

-        for(int cii = 0; cii < CLUSTER_DIM_M; cii++) {
+        for(int cii = 0; cii < CLUSTER_M; cii++) {
             DEBUG_MESSAGE("%f, %f, %f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii));
         }
     }
     */

     const int nlocal = atom->Nclusters_local;
-    int resize = 1;
+    const int ncji = MAX(1, CLUSTER_M / CLUSTER_N);
+    if(ncji > 2) {
+        fprintf(stderr, "binClusters(): ncji cannot be higher than 2!\n");
+    }
+    defineJClusters(atom);
+
+    int resize = 1;
     while(resize > 0) {
         resize = 0;

         for(int bin = 0; bin < mbins; bin++) {
-            cluster_bincount[bin] = 0;
+            bin_nclusters[bin] = 0;
         }

         for(int ci = 0; ci < nlocal && !resize; ci++) {
-            int bin = atom->clusters[ci].bin;
-            int c = cluster_bincount[bin];
-            if(c < clusters_per_bin) {
-                cluster_bins[bin * clusters_per_bin + c] = ci;
-                cluster_bincount[bin]++;
-            } else {
-                resize = 1;
+            // Assure we add this j-cluster only once in the bin
+            if(!(CLUSTER_M < CLUSTER_N && ci % 2)) {
+                int bin = icluster_bin[ci];
+                int c = bin_nclusters[bin];
+                if(c + 1 < clusters_per_bin) {
+                    bin_clusters[bin * clusters_per_bin + c] = CJ0_FROM_CI(ci);
+                    bin_nclusters[bin]++;
+
+                    if(CLUSTER_M > CLUSTER_N) {
+                        int cj1 = CJ1_FROM_CI(ci);
+                        if(atoms->jclusters[cj1].natoms > 0) {
+                            bin_clusters[bin * clusters_per_bin + c + 1] = cj1;
+                            bin_nclusters[bin]++;
+                        }
+                    }
+                } else {
+                    resize = 1;
+                }
             }
         }

         for(int cg = 0; cg < atom->Nclusters_ghost && !resize; 
cg++) { - const int ci = nlocal + cg; - MD_FLOAT *cptr = cluster_pos_ptr(ci); - MD_FLOAT ci_minz = atom->clusters[ci].bbminz; - MD_FLOAT xtmp, ytmp; - int ix = -1, iy = -1; + // Assure we add this j-cluster only once in the bin + if(!(CLUSTER_M < CLUSTER_N && cg % 2)) { + const int ci = nlocal + cg; + int ix = -1, iy = -1; + MD_FLOAT xtmp, ytmp; - xtmp = cluster_x(cptr, 0); - ytmp = cluster_y(cptr, 0); - coord2bin2D(xtmp, ytmp, &ix, &iy); - ix = MAX(MIN(ix, mbinx - 1), 0); - iy = MAX(MIN(iy, mbiny - 1), 0); - for(int cii = 1; cii < atom->clusters[ci].natoms; cii++) { - int nix, niy; - xtmp = cluster_x(cptr, cii); - ytmp = cluster_y(cptr, cii); - coord2bin2D(xtmp, ytmp, &nix, &niy); - nix = MAX(MIN(nix, mbinx - 1), 0); - niy = MAX(MIN(niy, mbiny - 1), 0); + for(int cji = 0; cji < ncji && !resize; cji++) { + int cj = (cji == 0) ? CJ0_FROM_CI(ci) : CJ1_FROM_CI(ci); + const int n = atom->jclusters[cj].natoms; - // Always put the cluster on the bin of its innermost atom so - // the cluster should be closer to local clusters - if(atom->PBCx[cg] > 0 && ix > nix) { ix = nix; } - if(atom->PBCx[cg] < 0 && ix < nix) { ix = nix; } - if(atom->PBCy[cg] > 0 && iy > niy) { iy = niy; } - if(atom->PBCy[cg] < 0 && iy < niy) { iy = niy; } - } + if(n > 0) { + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; + MD_FLOAT cj_minz = atom->jclusters[cj].bbminz; - int bin = iy * mbinx + ix + 1; - int c = cluster_bincount[bin]; - if(c < clusters_per_bin) { - // Insert the current ghost cluster in the bin keeping clusters - // sorted by z coordinate - int inserted = 0; - for(int i = 0; i < c; i++) { - int last_cl = cluster_bins[bin * clusters_per_bin + i]; - if(atom->clusters[last_cl].bbminz > ci_minz) { - cluster_bins[bin * clusters_per_bin + i] = ci; + xtmp = cj_x[CL_X_OFFSET + 0] + ytmp = cj_x[CL_Y_OFFSET + 0] + coord2bin2D(xtmp, ytmp, &ix, &iy); + ix = MAX(MIN(ix, mbinx - 1), 0); + iy = MAX(MIN(iy, mbiny - 1), 0); + for(int cjj = 1; cjj < n; cjj++) { + int nix, niy; + xtmp = cj_x[CL_X_OFFSET + cjj] + ytmp = cj_x[CL_Y_OFFSET + cjj] + coord2bin2D(xtmp, ytmp, &nix, &niy); + nix = MAX(MIN(nix, mbinx - 1), 0); + niy = MAX(MIN(niy, mbiny - 1), 0); - for(int j = i + 1; j <= c; j++) { - int tmp = cluster_bins[bin * clusters_per_bin + j]; - cluster_bins[bin * clusters_per_bin + j] = last_cl; - last_cl = tmp; + // Always put the cluster on the bin of its innermost atom so + // the cluster should be closer to local clusters + if(atom->PBCx[cg] > 0 && ix > nix) { ix = nix; } + if(atom->PBCx[cg] < 0 && ix < nix) { ix = nix; } + if(atom->PBCy[cg] > 0 && iy > niy) { iy = niy; } + if(atom->PBCy[cg] < 0 && iy < niy) { iy = niy; } } - inserted = 1; - break; + int bin = iy * mbinx + ix + 1; + int c = bin_nclusters[bin]; + if(c < clusters_per_bin) { + // Insert the current ghost cluster in the bin keeping clusters + // sorted by z coordinate + int inserted = 0; + for(int i = 0; i < c; i++) { + int last_cl = bin_clusters[bin * clusters_per_bin + i]; + if(atom->jclusters[last_cl].bbminz > cj_minz) { + bin_clusters[bin * clusters_per_bin + i] = cj; + + for(int j = i + 1; j <= c; j++) { + int tmp = bin_clusters[bin * clusters_per_bin + j]; + bin_clusters[bin * clusters_per_bin + j] = last_cl; + last_cl = tmp; + } + + inserted = 1; + break; + } + } + + if(!inserted) { + bin_clusters[bin * clusters_per_bin + c] = cj; + } + + icluster_bin[ci] = bin; + bin_nclusters[bin]++; + } else { + resize = 1; + } } } - - if(!inserted) { - cluster_bins[bin * clusters_per_bin + c] = ci; - } - - 
atom->clusters[ci].bin = bin; - cluster_bincount[bin]++; - } else { - resize = 1; } } if(resize) { - free(cluster_bins); + free(bin_clusters); clusters_per_bin *= 2; - cluster_bins = (int*) malloc(mbins * clusters_per_bin * sizeof(int)); + bin_clusters = (int*) malloc(mbins * clusters_per_bin * sizeof(int)); } } /* - DEBUG_MESSAGE("cluster_bincount\n"); - for(int i = 0; i < mbins; i++) { DEBUG_MESSAGE("%d, ", cluster_bincount[i]); } + DEBUG_MESSAGE("bin_nclusters\n"); + for(int i = 0; i < mbins; i++) { DEBUG_MESSAGE("%d, ", bin_nclusters[i]); } DEBUG_MESSAGE("\n"); */ @@ -747,16 +858,17 @@ void updateSingleAtoms(Atom *atom) { int Natom = 0; for(int ci = 0; ci < atom->Nclusters_local; ci++) { - MD_FLOAT *cptr = cluster_pos_ptr(ci); - MD_FLOAT *cvptr = cluster_velocity_ptr(ci); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base]; for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) { - atom_x(Natom) = cluster_x(cptr, cii); - atom_y(Natom) = cluster_y(cptr, cii); - atom_z(Natom) = cluster_z(cptr, cii); - atom->vx[Natom] = cluster_x(cvptr, cii); - atom->vy[Natom] = cluster_y(cvptr, cii); - atom->vz[Natom] = cluster_z(cvptr, cii); + atom_x(Natom) = ci_x[CL_X_OFFSET + cii]; + atom_y(Natom) = ci_x[CL_Y_OFFSET + cii]; + atom_z(Natom) = ci_x[CL_Z_OFFSET + cii]; + atom->vx[Natom] = ci_v[CL_X_OFFSET + cii]; + atom->vy[Natom] = ci_v[CL_Y_OFFSET + cii]; + atom->vz[Natom] = ci_v[CL_Z_OFFSET + cii]; Natom++; } } diff --git a/gromacs/pbc.c b/gromacs/pbc.c index acb203e..9632e36 100644 --- a/gromacs/pbc.c +++ b/gromacs/pbc.c @@ -53,20 +53,22 @@ void updatePbc(Atom *atom, Parameter *param, int firstUpdate) { for(int cg = 0; cg < atom->Nclusters_ghost; cg++) { const int ci = nlocal + cg; - MD_FLOAT *cptr = cluster_pos_ptr(ci); - MD_FLOAT *bmap_cptr = cluster_pos_ptr(border_map[cg]); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + int bmap_vec_base = CI_VECTOR_BASE_INDEX(border_map[cg]); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + MD_FLOAT *bmap_x = &atom->cl_x[bmap_vec_base]; MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY; MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY; MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY; for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) { - MD_FLOAT xtmp = cluster_x(bmap_cptr, cii) + atom->PBCx[cg] * xprd; - MD_FLOAT ytmp = cluster_y(bmap_cptr, cii) + atom->PBCy[cg] * yprd; - MD_FLOAT ztmp = cluster_z(bmap_cptr, cii) + atom->PBCz[cg] * zprd; + MD_FLOAT xtmp = bmap_x[CL_X_OFFSET + cii] + atom->PBCx[cg] * xprd; + MD_FLOAT ytmp = bmap_x[CL_Y_OFFSET + cii] + atom->PBCy[cg] * yprd; + MD_FLOAT ztmp = bmap_x[CL_Z_OFFSET + cii] + atom->PBCz[cg] * zprd; - cluster_x(cptr, cii) = xtmp; - cluster_y(cptr, cii) = ytmp; - cluster_z(cptr, cii) = ztmp; + ci_x[CL_X_OFFSET + cii] = xtmp; + ci_x[CL_Y_OFFSET + cii] = ytmp; + ci_x[CL_Z_OFFSET + cii] = ztmp; if(firstUpdate) { // TODO: To create the bounding boxes faster, we can use SIMD operations @@ -80,18 +82,18 @@ void updatePbc(Atom *atom, Parameter *param, int firstUpdate) { } if(firstUpdate) { - for(int cii = atom->clusters[ci].natoms; cii < CLUSTER_DIM_M; cii++) { - cluster_x(cptr, cii) = INFINITY; - cluster_y(cptr, cii) = INFINITY; - cluster_z(cptr, cii) = INFINITY; + for(int cii = atom->clusters[ci].natoms; cii < CLUSTER_M; cii++) { + ci_x[CL_X_OFFSET + cii] = INFINITY; + ci_x[CL_Y_OFFSET + cii] = INFINITY; + ci_x[CL_Z_OFFSET + cii] = INFINITY; } - atom->clusters[ci].bbminx = bbminx; - atom->clusters[ci].bbmaxx = bbmaxx; - 
atom->clusters[ci].bbminy = bbminy;
-            atom->clusters[ci].bbmaxy = bbmaxy;
-            atom->clusters[ci].bbminz = bbminz;
-            atom->clusters[ci].bbmaxz = bbmaxz;
+            atom->iclusters[ci].bbminx = bbminx;
+            atom->iclusters[ci].bbmaxx = bbmaxx;
+            atom->iclusters[ci].bbminy = bbminy;
+            atom->iclusters[ci].bbmaxy = bbmaxy;
+            atom->iclusters[ci].bbminz = bbminz;
+            atom->iclusters[ci].bbmaxz = bbmaxz;
         }
     }
 }
@@ -131,14 +133,17 @@ void updateAtomsPbc(Atom *atom, Parameter *param) {
  * that are then enforced in updatePbc */
 #define ADDGHOST(dx,dy,dz); \
     Nghost++; \
     const int g_atom_idx = atom->Nclusters_local + Nghost; \
+    const int natoms_ci = atom->iclusters[ci].natoms; \
     border_map[Nghost] = ci; \
     atom->PBCx[Nghost] = dx; \
     atom->PBCy[Nghost] = dy; \
     atom->PBCz[Nghost] = dz; \
-    atom->clusters[g_atom_idx].natoms = atom->clusters[ci].natoms; \
-    Nghost_atoms += atom->clusters[g_atom_idx].natoms; \
-    for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) { \
-        atom->clusters[g_atom_idx].type[cii] = atom->clusters[ci].type[cii]; \
+    atom->iclusters[g_atom_idx].natoms = natoms_ci; \
+    Nghost_atoms += natoms_ci; \
+    int ci_sca_base = CI_SCALAR_BASE_INDEX(ci); \
+    int cg_sca_base = CI_SCALAR_BASE_INDEX(cg); \
+    for(int cii = 0; cii < natoms_ci; cii++) { \
+        atom->cl_type[cg_sca_base + cii] = atom->cl_type[ci_sca_base + cii]; \
     }

 /* internal subroutines */
@@ -171,12 +176,12 @@ void setupPbc(Atom *atom, Parameter *param) {
             border_map = atom->border_map;
         }

-        MD_FLOAT bbminx = atom->clusters[ci].bbminx;
-        MD_FLOAT bbmaxx = atom->clusters[ci].bbmaxx;
-        MD_FLOAT bbminy = atom->clusters[ci].bbminy;
-        MD_FLOAT bbmaxy = atom->clusters[ci].bbmaxy;
-        MD_FLOAT bbminz = atom->clusters[ci].bbminz;
-        MD_FLOAT bbmaxz = atom->clusters[ci].bbmaxz;
+        MD_FLOAT bbminx = atom->iclusters[ci].bbminx;
+        MD_FLOAT bbmaxx = atom->iclusters[ci].bbmaxx;
+        MD_FLOAT bbminy = atom->iclusters[ci].bbminy;
+        MD_FLOAT bbmaxy = atom->iclusters[ci].bbmaxy;
+        MD_FLOAT bbminz = atom->iclusters[ci].bbminz;
+        MD_FLOAT bbmaxz = atom->iclusters[ci].bbmaxz;

         /* Setup ghost atoms */
         /* 6 planes */
@@ -210,16 +215,17 @@ void setupPbc(Atom *atom, Parameter *param) {
         if (bbmaxy >= (yprd-Cutneigh) && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,-1,0); }
     }

-    if(atom->Nclusters_local + Nghost + 1 >= atom->Nclusters_max) {
+    if(atom->Nclusters_local + Nghost + 2 >= atom->Nclusters_max) {
         growClusters(atom);
     }

     // Add dummy cluster at the end
-    MD_FLOAT *cptr = cluster_pos_ptr(atom->Nclusters_local + Nghost + 1);
-    for(int cii = 0; cii < CLUSTER_DIM_M; cii++) {
-        cluster_x(cptr, cii) = INFINITY;
-        cluster_y(cptr, cii) = INFINITY;
-        cluster_z(cptr, cii) = INFINITY;
+    int ci_vec_base = CI_VECTOR_BASE_INDEX(atom->Nclusters_local + Nghost + 1);
+    MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
+    for(int cii = 0; cii < MAX(CLUSTER_M, CLUSTER_N); cii++) {
+        ci_x[CL_X_OFFSET + cii] = INFINITY;
+        ci_x[CL_Y_OFFSET + cii] = INFINITY;
+        ci_x[CL_Z_OFFSET + cii] = INFINITY;
     }

     // increase by one to make it the ghost atom count
diff --git a/include_ISA.mk b/include_ISA.mk
new file mode 100644
index 0000000..215f626
--- /dev/null
+++ b/include_ISA.mk
@@ -0,0 +1,15 @@
+ifeq ($(strip $(ISA)), SSE)
+VECTOR_WIDTH=2
+else ifeq ($(strip $(ISA)), AVX)
+# Vector width is 4 but AVX2 instruction set is not supported
+NO_AVX2=true
+VECTOR_WIDTH=4
+else ifeq ($(strip $(ISA)), AVX2)
+VECTOR_WIDTH=4
+else ifeq ($(strip $(ISA)), AVX512)
+VECTOR_WIDTH=8
+endif

+ifeq ($(strip $(DATA_TYPE)), SP)
+VECTOR_WIDTH:=$(shell echo $$(($(VECTOR_WIDTH) * 2)))
+endif

From 2b441e691e929d706d60a24de4960237e7e39a49 Mon Sep 17 00:00:00 2001
From: Rafael Ravedutti
Date: Wed, 9 Mar 
2022 17:23:49 +0100 Subject: [PATCH 2/7] Make code compilable Signed-off-by: Rafael Ravedutti --- gromacs/atom.c | 13 +- gromacs/force_lj.c | 258 ++++++++++++++++++++++++++++++++-------- gromacs/includes/simd.h | 30 +---- gromacs/main.c | 34 +++--- gromacs/neighbor.c | 12 +- gromacs/pbc.c | 9 +- gromacs/stats.c | 4 +- gromacs/vtk.c | 40 ++++--- gromacs/xtc.c | 9 +- 9 files changed, 276 insertions(+), 133 deletions(-) diff --git a/gromacs/atom.c b/gromacs/atom.c index 8c11da9..024363f 100644 --- a/gromacs/atom.c +++ b/gromacs/atom.c @@ -51,7 +51,8 @@ void initAtom(Atom *atom) { atom->sigma6 = NULL; atom->cutforcesq = NULL; atom->cutneighsq = NULL; - atom->clusters = NULL; + atom->iclusters = NULL; + atom->jclusters = NULL; } void createAtom(Atom *atom, Parameter *param) { @@ -426,9 +427,11 @@ void growAtom(Atom *atom) { void growClusters(Atom *atom) { int nold = atom->Nclusters_max; + int jfac = MIN(1, CLUSTER_M / CLUSTER_N); // If M>N, we need to allocate more j-clusters atom->Nclusters_max += DELTA; - atom->clusters = (Cluster*) reallocate(atom->clusters, ALIGNMENT, atom->Nclusters_max * sizeof(Cluster), nold * sizeof(Cluster)); - atom->cl_x = (MD_FLOAT*) reallocate(atom->cl_x, ALIGNMENT, atom->Nclusters_max * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT)); - atom->cl_f = (MD_FLOAT*) reallocate(atom->cl_f, ALIGNMENT, atom->Nclusters_max * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT)); - atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT)); + atom->iclusters = (Cluster*) reallocate(atom->iclusters, ALIGNMENT, atom->Nclusters_max * sizeof(Cluster), nold * sizeof(Cluster)); + atom->jclusters = (Cluster*) reallocate(atom->jclusters, ALIGNMENT, atom->Nclusters_max * jfac * sizeof(Cluster), nold * jfac * sizeof(Cluster)); + atom->cl_x = (MD_FLOAT*) reallocate(atom->cl_x, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT)); + atom->cl_f = (MD_FLOAT*) reallocate(atom->cl_f, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT)); + atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT)); } diff --git a/gromacs/force_lj.c b/gromacs/force_lj.c index 04739a6..d749bb1 100644 --- a/gromacs/force_lj.c +++ b/gromacs/force_lj.c @@ -41,11 +41,12 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_FLOAT epsilon = param->epsilon; for(int ci = 0; ci < atom->Nclusters_local; ci++) { - MD_FLOAT *fptr = cluster_force_ptr(ci); - for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) { - cluster_x(fptr, cii) = 0.0; - cluster_y(fptr, cii) = 0.0; - cluster_z(fptr, cii) = 0.0; + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; + for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) { + ci_f[CL_X_OFFSET + cii] = 0.0; + ci_f[CL_Y_OFFSET + cii] = 0.0; + ci_f[CL_Z_OFFSET + cii] = 0.0; } } @@ -54,28 +55,31 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat #pragma omp parallel for for(int ci = 0; ci < atom->Nclusters_local; ci++) { - MD_FLOAT *ciptr = cluster_pos_ptr(ci); - MD_FLOAT *cifptr = cluster_force_ptr(ci); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + MD_FLOAT *ci_f = 
&atom->cl_f[ci_vec_base]; neighs = &neighbor->neighbors[ci * neighbor->maxneighs]; int numneighs = neighbor->numneigh[ci]; for(int k = 0; k < numneighs; k++) { int cj = neighs[k]; + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); int any = 0; - MD_FLOAT *cjptr = cluster_pos_ptr(cj); + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; + for(int cii = 0; cii < CLUSTER_M; cii++) { - MD_FLOAT xtmp = cluster_x(ciptr, cii); - MD_FLOAT ytmp = cluster_y(ciptr, cii); - MD_FLOAT ztmp = cluster_z(ciptr, cii); + MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii]; + MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii]; + MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii]; MD_FLOAT fix = 0; MD_FLOAT fiy = 0; MD_FLOAT fiz = 0; - for(int cjj = 0; cjj < CLUSTER_M; cjj++) { + for(int cjj = 0; cjj < CLUSTER_N; cjj++) { if(ci != cj || cii != cjj) { - MD_FLOAT delx = xtmp - cluster_x(cjptr, cjj); - MD_FLOAT dely = ytmp - cluster_y(cjptr, cjj); - MD_FLOAT delz = ztmp - cluster_z(cjptr, cjj); + MD_FLOAT delx = xtmp - cj_x[CL_X_OFFSET + cjj]; + MD_FLOAT dely = ytmp - cj_x[CL_Y_OFFSET + cjj]; + MD_FLOAT delz = ztmp - cj_x[CL_Z_OFFSET + cjj]; MD_FLOAT rsq = delx * delx + dely * dely + delz * delz; if(rsq < cutforcesq) { MD_FLOAT sr2 = 1.0 / rsq; @@ -98,9 +102,9 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat addStat(stats->clusters_outside_cutoff, 1); } - cluster_x(cifptr, cii) += fix; - cluster_y(cifptr, cii) += fiy; - cluster_z(cifptr, cii) += fiz; + ci_f[CL_X_OFFSET + cii] += fix; + ci_f[CL_Y_OFFSET + cii] += fiy; + ci_f[CL_Z_OFFSET + cii] += fiz; } } @@ -115,8 +119,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat return E-S; } - -double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { +double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { DEBUG_MESSAGE("computeForceLJ_4xn begin\n"); int Nlocal = atom->Nlocal; int* neighs; @@ -128,30 +131,31 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_SIMD_FLOAT epsilon_vec = simd_broadcast(epsilon); MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0); MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5); - const int unroll_j = CLUSTER_N / CLUSTER_M; + const int unroll_j = VECTOR_WIDTH / CLUSTER_N; double S = getTimeStamp(); LIKWID_MARKER_START("force"); #pragma omp parallel for for(int ci = 0; ci < atom->Nclusters_local; ci++) { - MD_FLOAT *ciptr = cluster_pos_ptr(ci); - MD_FLOAT *cifptr = cluster_force_ptr(ci); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; neighs = &neighbor->neighbors[ci * neighbor->maxneighs]; int numneighs = neighbor->numneigh[ci]; - MD_SIMD_FLOAT xi0_tmp = simd_broadcast(cluster_x(ciptr, 0)); - MD_SIMD_FLOAT yi0_tmp = simd_broadcast(cluster_y(ciptr, 0)); - MD_SIMD_FLOAT zi0_tmp = simd_broadcast(cluster_z(ciptr, 0)); - MD_SIMD_FLOAT xi1_tmp = simd_broadcast(cluster_x(ciptr, 1)); - MD_SIMD_FLOAT yi1_tmp = simd_broadcast(cluster_y(ciptr, 1)); - MD_SIMD_FLOAT zi1_tmp = simd_broadcast(cluster_z(ciptr, 1)); - MD_SIMD_FLOAT xi2_tmp = simd_broadcast(cluster_x(ciptr, 2)); - MD_SIMD_FLOAT yi2_tmp = simd_broadcast(cluster_y(ciptr, 2)); - MD_SIMD_FLOAT zi2_tmp = simd_broadcast(cluster_z(ciptr, 2)); - MD_SIMD_FLOAT xi3_tmp = simd_broadcast(cluster_x(ciptr, 3)); - MD_SIMD_FLOAT yi3_tmp = simd_broadcast(cluster_y(ciptr, 3)); - MD_SIMD_FLOAT zi3_tmp = simd_broadcast(cluster_z(ciptr, 3)); + MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]); + 
MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]); + MD_SIMD_FLOAT xi2_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 2]); + MD_SIMD_FLOAT xi3_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 3]); + MD_SIMD_FLOAT yi0_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 0]); + MD_SIMD_FLOAT yi1_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 1]); + MD_SIMD_FLOAT yi2_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 2]); + MD_SIMD_FLOAT yi3_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 3]); + MD_SIMD_FLOAT zi0_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 0]); + MD_SIMD_FLOAT zi1_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 1]); + MD_SIMD_FLOAT zi2_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 2]); + MD_SIMD_FLOAT zi3_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 3]); MD_SIMD_FLOAT fix0 = simd_zero(); MD_SIMD_FLOAT fiy0 = simd_zero(); MD_SIMD_FLOAT fiz0 = simd_zero(); @@ -166,10 +170,15 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_SIMD_FLOAT fiz3 = simd_zero(); for(int k = 0; k < numneighs; k += unroll_j) { - int cj0 = neighs[k + 0]; - unsigned int cond0 = (unsigned int)(ci == cj0); - MD_FLOAT *cj0_ptr = cluster_pos_ptr(cj0); + int cj = neighs[k + 0]; + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; + unsigned int cond0 = (unsigned int)(ci == cj); + unsigned int cond1 = cond0; + MD_SIMD_FLOAT xj_tmp, yj_tmp, zj_tmp; + /* + MD_FLOAT *cj0_ptr = cluster_pos_ptr(cj0); #if VECTOR_WIDTH == 8 int cj1 = neighs[k + 1]; unsigned int cond1 = (unsigned int)(ci == cj1); @@ -182,6 +191,7 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_SIMD_FLOAT yj_tmp = simd_load(cj0_ptr, 1); MD_SIMD_FLOAT zj_tmp = simd_load(cj0_ptr, 2); #endif + */ MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp); MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp); @@ -247,18 +257,166 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3); } - cluster_x(cifptr, 0) = simd_horizontal_sum(fix0); - cluster_y(cifptr, 0) = simd_horizontal_sum(fiy0); - cluster_z(cifptr, 0) = simd_horizontal_sum(fiz0); - cluster_x(cifptr, 1) = simd_horizontal_sum(fix1); - cluster_y(cifptr, 1) = simd_horizontal_sum(fiy1); - cluster_z(cifptr, 1) = simd_horizontal_sum(fiz1); - cluster_x(cifptr, 2) = simd_horizontal_sum(fix2); - cluster_y(cifptr, 2) = simd_horizontal_sum(fiy2); - cluster_z(cifptr, 2) = simd_horizontal_sum(fiz2); - cluster_x(cifptr, 3) = simd_horizontal_sum(fix3); - cluster_y(cifptr, 3) = simd_horizontal_sum(fiy3); - cluster_z(cifptr, 3) = simd_horizontal_sum(fiz3); + ci_f[CL_X_OFFSET + 0] = simd_horizontal_sum(fix0); + ci_f[CL_X_OFFSET + 1] = simd_horizontal_sum(fix1); + ci_f[CL_X_OFFSET + 2] = simd_horizontal_sum(fix2); + ci_f[CL_X_OFFSET + 3] = simd_horizontal_sum(fix3); + ci_f[CL_Y_OFFSET + 0] = simd_horizontal_sum(fiy0); + ci_f[CL_Y_OFFSET + 1] = simd_horizontal_sum(fiy1); + ci_f[CL_Y_OFFSET + 2] = simd_horizontal_sum(fiy2); + ci_f[CL_Y_OFFSET + 3] = simd_horizontal_sum(fiy3); + ci_f[CL_Z_OFFSET + 0] = simd_horizontal_sum(fiz0); + ci_f[CL_Z_OFFSET + 1] = simd_horizontal_sum(fiz1); + ci_f[CL_Z_OFFSET + 2] = simd_horizontal_sum(fiz2); + ci_f[CL_Z_OFFSET + 3] = simd_horizontal_sum(fiz3); + + addStat(stats->calculated_forces, 1); + addStat(stats->num_neighs, numneighs); + addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N)); + } + + LIKWID_MARKER_STOP("force"); + double E = getTimeStamp(); + DEBUG_MESSAGE("computeForceLJ_4xn end\n"); + return E-S; +} + +double 
computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { + DEBUG_MESSAGE("computeForceLJ_4xn begin\n"); + int Nlocal = atom->Nlocal; + int* neighs; + MD_FLOAT cutforcesq = param->cutforce * param->cutforce; + MD_FLOAT sigma6 = param->sigma6; + MD_FLOAT epsilon = param->epsilon; + MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq); + MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6); + MD_SIMD_FLOAT epsilon_vec = simd_broadcast(epsilon); + MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0); + MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5); + const int unroll_j = VECTOR_WIDTH / CLUSTER_N; + + double S = getTimeStamp(); + LIKWID_MARKER_START("force"); + + #pragma omp parallel for + for(int ci = 0; ci < atom->Nclusters_local; ci++) { + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; + neighs = &neighbor->neighbors[ci * neighbor->maxneighs]; + int numneighs = neighbor->numneigh[ci]; + + MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]); + MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]); + MD_SIMD_FLOAT xi2_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 2]); + MD_SIMD_FLOAT xi3_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 3]); + MD_SIMD_FLOAT yi0_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 0]); + MD_SIMD_FLOAT yi1_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 1]); + MD_SIMD_FLOAT yi2_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 2]); + MD_SIMD_FLOAT yi3_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 3]); + MD_SIMD_FLOAT zi0_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 0]); + MD_SIMD_FLOAT zi1_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 1]); + MD_SIMD_FLOAT zi2_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 2]); + MD_SIMD_FLOAT zi3_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 3]); + MD_SIMD_FLOAT fix0 = simd_zero(); + MD_SIMD_FLOAT fiy0 = simd_zero(); + MD_SIMD_FLOAT fiz0 = simd_zero(); + MD_SIMD_FLOAT fix1 = simd_zero(); + MD_SIMD_FLOAT fiy1 = simd_zero(); + MD_SIMD_FLOAT fiz1 = simd_zero(); + MD_SIMD_FLOAT fix2 = simd_zero(); + MD_SIMD_FLOAT fiy2 = simd_zero(); + MD_SIMD_FLOAT fiz2 = simd_zero(); + MD_SIMD_FLOAT fix3 = simd_zero(); + MD_SIMD_FLOAT fiy3 = simd_zero(); + MD_SIMD_FLOAT fiz3 = simd_zero(); + + for(int k = 0; k < numneighs; k += unroll_j) { + int cj = neighs[k + 0]; + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + unsigned int cond0 = (unsigned int)(ci == cj); + unsigned int cond1 = cond0; + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; + MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]); + MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]); + MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]); + + MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp); + MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp); + MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp); + MD_SIMD_FLOAT delx1 = simd_sub(xi1_tmp, xj_tmp); + MD_SIMD_FLOAT dely1 = simd_sub(yi1_tmp, yj_tmp); + MD_SIMD_FLOAT delz1 = simd_sub(zi1_tmp, zj_tmp); + MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp); + MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp); + MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp); + MD_SIMD_FLOAT delx3 = simd_sub(xi3_tmp, xj_tmp); + MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp); + MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp); + + #if VECTOR_WIDTH == 8 + MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1)); + MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1)); + MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 
0x40 * cond1)); + MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1)); + #else + MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0)); + MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0)); + MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0)); + MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0)); + #endif + + MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0))); + MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, simd_mul(delz1, delz1))); + MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2))); + MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, simd_mul(delz3, delz3))); + + MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec)); + MD_SIMD_MASK cutoff_mask1 = simd_mask_and(excl_mask1, simd_mask_cond_lt(rsq1, cutforcesq_vec)); + MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec)); + MD_SIMD_MASK cutoff_mask3 = simd_mask_and(excl_mask3, simd_mask_cond_lt(rsq3, cutforcesq_vec)); + + MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0); + MD_SIMD_FLOAT sr2_1 = simd_reciprocal(rsq1); + MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2); + MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3); + + MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec))); + MD_SIMD_FLOAT sr6_1 = simd_mul(sr2_1, simd_mul(sr2_1, simd_mul(sr2_1, sigma6_vec))); + MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec))); + MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec))); + + MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, epsilon_vec)))); + MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, epsilon_vec)))); + MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, epsilon_vec)))); + MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, epsilon_vec)))); + + fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0); + fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0); + fiz0 = simd_masked_add(fiz0, simd_mul(delz0, force0), cutoff_mask0); + fix1 = simd_masked_add(fix1, simd_mul(delx1, force1), cutoff_mask1); + fiy1 = simd_masked_add(fiy1, simd_mul(dely1, force1), cutoff_mask1); + fiz1 = simd_masked_add(fiz1, simd_mul(delz1, force1), cutoff_mask1); + fix2 = simd_masked_add(fix2, simd_mul(delx2, force2), cutoff_mask2); + fiy2 = simd_masked_add(fiy2, simd_mul(dely2, force2), cutoff_mask2); + fiz2 = simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2); + fix3 = simd_masked_add(fix3, simd_mul(delx3, force3), cutoff_mask3); + fiy3 = simd_masked_add(fiy3, simd_mul(dely3, force3), cutoff_mask3); + fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3); + } + + ci_f[CL_X_OFFSET + 0] = simd_horizontal_sum(fix0); + ci_f[CL_X_OFFSET + 1] = simd_horizontal_sum(fix1); + ci_f[CL_X_OFFSET + 2] = simd_horizontal_sum(fix2); + ci_f[CL_X_OFFSET + 3] = simd_horizontal_sum(fix3); + ci_f[CL_Y_OFFSET + 0] = simd_horizontal_sum(fiy0); + ci_f[CL_Y_OFFSET + 1] = simd_horizontal_sum(fiy1); + ci_f[CL_Y_OFFSET + 2] = simd_horizontal_sum(fiy2); + ci_f[CL_Y_OFFSET + 3] = 
simd_horizontal_sum(fiy3); + ci_f[CL_Z_OFFSET + 0] = simd_horizontal_sum(fiz0); + ci_f[CL_Z_OFFSET + 1] = simd_horizontal_sum(fiz1); + ci_f[CL_Z_OFFSET + 2] = simd_horizontal_sum(fiz2); + ci_f[CL_Z_OFFSET + 3] = simd_horizontal_sum(fiz3); addStat(stats->calculated_forces, 1); addStat(stats->num_neighs, numneighs); diff --git a/gromacs/includes/simd.h b/gromacs/includes/simd.h index 9c0a842..1d6a90b 100644 --- a/gromacs/includes/simd.h +++ b/gromacs/includes/simd.h @@ -45,20 +45,7 @@ static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { retur static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ); } static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); } static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); } - -static MD_SIMD_FLOAT simd_load2(MD_FLOAT *c0, MD_FLOAT *c1, int d) { - MD_SIMD_FLOAT x; -#ifdef CLUSTER_AOS - __m256i aos_gather_vindex = _mm256_set_epi32(9, 6, 3, 0, 9, 6, 3, 0); - __m256i vindex = _mm256_add_epi32(aos_gather_vindex, _mm256_set1_epi32(d)); - x = _mm512_mask_i32gather_pd(simd_zero(), simd_mask_from_u32(0x0f), vindex, c0, sizeof(double)); - x = _mm512_mask_i32gather_pd(x, simd_mask_from_u32(0xf0), vindex, c1, sizeof(double)); -#else - x = _mm512_load_pd(&c0[d * CLUSTER_M]); - x = _mm512_insertf64x4(x, _mm256_load_pd(&c1[d * CLUSTER_M]), 1); -#endif - return x; -} +static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_pd(p); } static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) { MD_SIMD_FLOAT x = _mm512_add_pd(a, _mm512_shuffle_f64x2(a, a, 0xee)); @@ -82,6 +69,7 @@ static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_pd(0.0); } static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_pd(a, b); } static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_pd(a, b); } static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_pd(a, b); } +static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_pd(p); } #ifdef NO_AVX2 static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(a))); } @@ -124,20 +112,6 @@ static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) { } #endif -static MD_SIMD_FLOAT simd_load(MD_FLOAT *c0, int d) { - MD_SIMD_FLOAT x; -#ifdef CLUSTER_AOS - #ifdef NO_AVX2 - #error "Not possible to use AoS cluster layout without AVX2 support!" 
- #endif - __m128i aos_gather_vindex = _mm128_set_epi32(9, 6, 3, 0); - __m128i vindex = _mm128_add_epi32(aos_gather_vindex, _mm128_set1_epi32(d)); - x = _mm256_i32gather_pd(c0, vindex, sizeof(double)); -#else - x = _mm256_load_pd(&c0[d * CLUSTER_M]); -#endif - return x; -} #endif diff --git a/gromacs/main.c b/gromacs/main.c index 0bc8a3c..ade8f65 100644 --- a/gromacs/main.c +++ b/gromacs/main.c @@ -101,17 +101,18 @@ void initialIntegrate(Parameter *param, Atom *atom) { DEBUG_MESSAGE("initialIntegrate start\n"); for(int ci = 0; ci < atom->Nclusters_local; ci++) { - MD_FLOAT *ciptr = cluster_pos_ptr(ci); - MD_FLOAT *civptr = cluster_velocity_ptr(ci); - MD_FLOAT *cifptr = cluster_force_ptr(ci); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base]; + MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; - for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) { - cluster_x(civptr, cii) += param->dtforce * cluster_x(cifptr, cii); - cluster_y(civptr, cii) += param->dtforce * cluster_y(cifptr, cii); - cluster_z(civptr, cii) += param->dtforce * cluster_z(cifptr, cii); - cluster_x(ciptr, cii) += param->dt * cluster_x(civptr, cii); - cluster_y(ciptr, cii) += param->dt * cluster_y(civptr, cii); - cluster_z(ciptr, cii) += param->dt * cluster_z(civptr, cii); + for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) { + ci_v[CL_X_OFFSET + cii] += param->dtforce * ci_f[CL_X_OFFSET + cii]; + ci_v[CL_Y_OFFSET + cii] += param->dtforce * ci_f[CL_Y_OFFSET + cii]; + ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii]; + ci_x[CL_X_OFFSET + cii] += param->dt * ci_v[CL_X_OFFSET + cii]; + ci_x[CL_Y_OFFSET + cii] += param->dt * ci_v[CL_Y_OFFSET + cii]; + ci_x[CL_Z_OFFSET + cii] += param->dt * ci_v[CL_Z_OFFSET + cii]; } } @@ -122,13 +123,14 @@ void finalIntegrate(Parameter *param, Atom *atom) { DEBUG_MESSAGE("finalIntegrate start\n"); for(int ci = 0; ci < atom->Nclusters_local; ci++) { - MD_FLOAT *civptr = cluster_velocity_ptr(ci); - MD_FLOAT *cifptr = cluster_force_ptr(ci); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base]; + MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; - for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) { - cluster_x(civptr, cii) += param->dtforce * cluster_x(cifptr, cii); - cluster_y(civptr, cii) += param->dtforce * cluster_y(cifptr, cii); - cluster_z(civptr, cii) += param->dtforce * cluster_z(cifptr, cii); + for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) { + ci_v[CL_X_OFFSET + cii] += param->dtforce * ci_f[CL_X_OFFSET + cii]; + ci_v[CL_Y_OFFSET + cii] += param->dtforce * ci_f[CL_Y_OFFSET + cii]; + ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii]; } } diff --git a/gromacs/neighbor.c b/gromacs/neighbor.c index 1f40ef9..557a9bb 100644 --- a/gromacs/neighbor.c +++ b/gromacs/neighbor.c @@ -753,7 +753,7 @@ void binClusters(Atom *atom) { if(CLUSTER_M > CLUSTER_N) { int cj1 = CJ1_FROM_CI(ci); - if(atoms->jclusters[cj1].natoms > 0) { + if(atom->jclusters[cj1].natoms > 0) { bin_clusters[bin * clusters_per_bin + c + 1] = cj1; bin_nclusters[bin]++; } @@ -780,15 +780,15 @@ void binClusters(Atom *atom) { MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; MD_FLOAT cj_minz = atom->jclusters[cj].bbminz; - xtmp = cj_x[CL_X_OFFSET + 0] - ytmp = cj_x[CL_Y_OFFSET + 0] + xtmp = cj_x[CL_X_OFFSET + 0]; + ytmp = cj_x[CL_Y_OFFSET + 0]; coord2bin2D(xtmp, ytmp, &ix, &iy); ix = MAX(MIN(ix, mbinx - 1), 0); iy = MAX(MIN(iy, mbiny - 1), 0); for(int cjj = 1; cjj < n; 
cjj++) { int nix, niy; - xtmp = cj_x[CL_X_OFFSET + cjj] - ytmp = cj_x[CL_Y_OFFSET + cjj] + xtmp = cj_x[CL_X_OFFSET + cjj]; + ytmp = cj_x[CL_Y_OFFSET + cjj]; coord2bin2D(xtmp, ytmp, &nix, &niy); nix = MAX(MIN(nix, mbinx - 1), 0); niy = MAX(MIN(niy, mbiny - 1), 0); @@ -862,7 +862,7 @@ void updateSingleAtoms(Atom *atom) { MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base]; - for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) { + for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) { atom_x(Natom) = ci_x[CL_X_OFFSET + cii]; atom_y(Natom) = ci_x[CL_Y_OFFSET + cii]; atom_z(Natom) = ci_x[CL_Z_OFFSET + cii]; diff --git a/gromacs/pbc.c b/gromacs/pbc.c index 9632e36..d1216ff 100644 --- a/gromacs/pbc.c +++ b/gromacs/pbc.c @@ -28,6 +28,7 @@ #include #include #include +#include #define DELTA 20000 @@ -61,7 +62,7 @@ void updatePbc(Atom *atom, Parameter *param, int firstUpdate) { MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY; MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY; - for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) { + for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) { MD_FLOAT xtmp = bmap_x[CL_X_OFFSET + cii] + atom->PBCx[cg] * xprd; MD_FLOAT ytmp = bmap_x[CL_Y_OFFSET + cii] + atom->PBCy[cg] * yprd; MD_FLOAT ztmp = bmap_x[CL_Z_OFFSET + cii] + atom->PBCz[cg] * zprd; @@ -82,7 +83,7 @@ void updatePbc(Atom *atom, Parameter *param, int firstUpdate) { } if(firstUpdate) { - for(int cii = atom->clusters[ci].natoms; cii < CLUSTER_M; cii++) { + for(int cii = atom->iclusters[ci].natoms; cii < CLUSTER_M; cii++) { ci_x[CL_X_OFFSET + cii] = INFINITY; ci_x[CL_Y_OFFSET + cii] = INFINITY; ci_x[CL_Z_OFFSET + cii] = INFINITY; @@ -132,13 +133,13 @@ void updateAtomsPbc(Atom *atom, Parameter *param) { * that are then enforced in updatePbc */ #define ADDGHOST(dx,dy,dz); \ Nghost++; \ - const int g_atom_idx = atom->Nclusters_local + Nghost; \ + const int cg = atom->Nclusters_local + Nghost; \ const int natoms_ci = atom->iclusters[ci].natoms; \ border_map[Nghost] = ci; \ atom->PBCx[Nghost] = dx; \ atom->PBCy[Nghost] = dy; \ atom->PBCz[Nghost] = dz; \ - atom->iclusters[g_atom_idx].natoms = natoms_ci; \ + atom->iclusters[cg].natoms = natoms_ci; \ Nghost_atoms += natoms_ci; \ int ci_sca_base = CI_SCALAR_BASE_INDEX(ci); \ int cg_sca_base = CI_SCALAR_BASE_INDEX(cg); \ diff --git a/gromacs/stats.c b/gromacs/stats.c index 5f62918..c3c9f43 100644 --- a/gromacs/stats.c +++ b/gromacs/stats.c @@ -18,11 +18,11 @@ void initStats(Stats *s) { void displayStatistics(Atom *atom, Parameter *param, Stats *stats, double *timer) { #ifdef COMPUTE_STATS - const int MxN = CLUSTER_DIM_M * CLUSTER_DIM_N; + const int MxN = CLUSTER_M * CLUSTER_N; double avg_atoms_cluster = (double)(atom->Nlocal) / (double)(atom->Nclusters_local); double force_useful_volume = 1e-9 * ( (double)(atom->Nlocal * (param->ntimes + 1)) * (sizeof(MD_FLOAT) * 6 + sizeof(int)) + (double)(stats->num_neighs) * (sizeof(MD_FLOAT) * 3 + sizeof(int)) ); - double avg_neigh_atom = (stats->num_neighs * CLUSTER_DIM_N) / (double)(atom->Nlocal * (param->ntimes + 1)); + double avg_neigh_atom = (stats->num_neighs * CLUSTER_N) / (double)(atom->Nlocal * (param->ntimes + 1)); double avg_neigh_cluster = (double)(stats->num_neighs) / (double)(stats->calculated_forces); double avg_simd = stats->force_iters / (double)(atom->Nlocal * (param->ntimes + 1)); diff --git a/gromacs/vtk.c b/gromacs/vtk.c index 35cc14e..2a29a26 100644 --- a/gromacs/vtk.c +++ b/gromacs/vtk.c @@ -27,9 +27,10 @@ int write_local_atoms_to_vtk_file(const char* 
filename, Atom* atom, int timestep fprintf(fp, "DATASET UNSTRUCTURED_GRID\n"); fprintf(fp, "POINTS %d double\n", atom->Nlocal); for(int ci = 0; ci < atom->Nclusters_local; ++ci) { - MD_FLOAT *cptr = cluster_pos_ptr(ci); - for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) { - fprintf(fp, "%.4f %.4f %.4f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii)); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) { + fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]); } } fprintf(fp, "\n\n"); @@ -70,9 +71,10 @@ int write_ghost_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep fprintf(fp, "DATASET UNSTRUCTURED_GRID\n"); fprintf(fp, "POINTS %d double\n", atom->Nghost); for(int ci = atom->Nclusters_local; ci < atom->Nclusters_local + atom->Nclusters_ghost; ++ci) { - MD_FLOAT *cptr = cluster_pos_ptr(ci); - for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) { - fprintf(fp, "%.4f %.4f %.4f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii)); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) { + fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]); } } fprintf(fp, "\n\n"); @@ -116,18 +118,19 @@ int write_local_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int fprintf(fp, "DATASET POLYDATA\n"); fprintf(fp, "POINTS %d double\n", atom->Nlocal); for(int ci = 0; ci < N; ++ci) { - MD_FLOAT *cptr = cluster_pos_ptr(ci); - for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) { - fprintf(fp, "%.4f %.4f %.4f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii)); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) { + fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]); } - tot_lines += atom->clusters[ci].natoms; + tot_lines += atom->iclusters[ci].natoms; } fprintf(fp, "\n\n"); fprintf(fp, "LINES %d %d\n", N, N + tot_lines); for(int ci = 0; ci < N; ++ci) { - fprintf(fp, "%d ", atom->clusters[ci].natoms); - for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) { + fprintf(fp, "%d ", atom->iclusters[ci].natoms); + for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) { fprintf(fp, "%d ", i++); } @@ -157,18 +160,19 @@ int write_ghost_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int fprintf(fp, "DATASET POLYDATA\n"); fprintf(fp, "POINTS %d double\n", atom->Nghost); for(int ci = atom->Nclusters_local; ci < N; ++ci) { - MD_FLOAT *cptr = cluster_pos_ptr(ci); - for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) { - fprintf(fp, "%.4f %.4f %.4f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii)); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) { + fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]); } - tot_lines += atom->clusters[ci].natoms; + tot_lines += atom->iclusters[ci].natoms; } fprintf(fp, "\n\n"); fprintf(fp, "LINES %d %d\n", atom->Nclusters_ghost, atom->Nclusters_ghost + tot_lines); for(int ci = atom->Nclusters_local; ci < N; ++ci) { - fprintf(fp, "%d ", 
atom->clusters[ci].natoms); - for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) { + fprintf(fp, "%d ", atom->iclusters[ci].natoms); + for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) { fprintf(fp, "%d ", i++); } diff --git a/gromacs/xtc.c b/gromacs/xtc.c index d381f40..08fb0c9 100644 --- a/gromacs/xtc.c +++ b/gromacs/xtc.c @@ -52,11 +52,12 @@ void xtc_init(const char *filename, Atom *atom, int timestep) { void xtc_write(Atom *atom, int timestep, int write_pos, int write_vel) { int i = 0; for(int ci = 0; ci < atom->Nclusters_local; ++ci) { - MD_FLOAT *cptr = cluster_pos_ptr(ci); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) { - x_buf[i][XX] = cluster_x(cptr, cii); - x_buf[i][YY] = cluster_y(cptr, cii); - x_buf[i][ZZ] = cluster_z(cptr, cii); + x_buf[i][XX] = ci_x[CL_X_OFFSET + cii]; + x_buf[i][YY] = ci_x[CL_Y_OFFSET + cii]; + x_buf[i][ZZ] = ci_x[CL_Z_OFFSET + cii]; i++; } } From 22d0f0b958203f1792a6a295d6d6bc1c1a32906c Mon Sep 17 00:00:00 2001 From: Rafael Ravedutti Date: Thu, 10 Mar 2022 01:31:50 +0100 Subject: [PATCH 3/7] Commit version that works for M=N Signed-off-by: Rafael Ravedutti --- gromacs/atom.c | 4 ++++ gromacs/force_lj.c | 30 ++++++++++++++++++++---------- gromacs/includes/atom.h | 14 +++++++++++++- gromacs/main.c | 8 ++------ gromacs/neighbor.c | 15 +++++++-------- gromacs/pbc.c | 5 +++++ 6 files changed, 51 insertions(+), 25 deletions(-) diff --git a/gromacs/atom.c b/gromacs/atom.c index 024363f..b96a91e 100644 --- a/gromacs/atom.c +++ b/gromacs/atom.c @@ -37,6 +37,7 @@ void initAtom(Atom *atom) { atom->cl_x = NULL; atom->cl_v = NULL; atom->cl_f = NULL; + atom->cl_type = NULL; atom->Natoms = 0; atom->Nlocal = 0; atom->Nghost = 0; @@ -53,6 +54,7 @@ void initAtom(Atom *atom) { atom->cutneighsq = NULL; atom->iclusters = NULL; atom->jclusters = NULL; + atom->icluster_bin = NULL; } void createAtom(Atom *atom, Parameter *param) { @@ -431,7 +433,9 @@ void growClusters(Atom *atom) { atom->Nclusters_max += DELTA; atom->iclusters = (Cluster*) reallocate(atom->iclusters, ALIGNMENT, atom->Nclusters_max * sizeof(Cluster), nold * sizeof(Cluster)); atom->jclusters = (Cluster*) reallocate(atom->jclusters, ALIGNMENT, atom->Nclusters_max * jfac * sizeof(Cluster), nold * jfac * sizeof(Cluster)); + atom->icluster_bin = (int*) reallocate(atom->icluster_bin, ALIGNMENT, atom->Nclusters_max * sizeof(int), nold * sizeof(int)); atom->cl_x = (MD_FLOAT*) reallocate(atom->cl_x, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT)); atom->cl_f = (MD_FLOAT*) reallocate(atom->cl_f, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT)); atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT)); + atom->cl_type = (int*) reallocate(atom->cl_type, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * sizeof(int), nold * CLUSTER_M * sizeof(int)); } diff --git a/gromacs/force_lj.c b/gromacs/force_lj.c index d749bb1..03d1926 100644 --- a/gromacs/force_lj.c +++ b/gromacs/force_lj.c @@ -293,13 +293,15 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_SIMD_FLOAT epsilon_vec = simd_broadcast(epsilon); MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0); MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5); - const int unroll_j = VECTOR_WIDTH / CLUSTER_N; - double S = getTimeStamp(); 
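/* Annotation (not part of the original patch): a sketch of the self-exclusion
 * masks used below, assuming CLUSTER_M == CLUSTER_N == 4. In that case the
 * i-cluster ci and the j-cluster cj == CJ0_FROM_CI(ci) cover the same four
 * atoms, so when cj matches (cond0 == 1), row r of the cluster pair must drop
 * its diagonal lane: excl_mask{r} = 0xf - (1 << r) * cond0. For r == 2 this
 * gives 0xf - 0x4 = 0xb (binary 1011), clearing exactly lane 2 from the
 * cutoff mask and hence from the force accumulation. */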
LIKWID_MARKER_START("force"); #pragma omp parallel for for(int ci = 0; ci < atom->Nclusters_local; ci++) { + int ci_cj0 = CJ0_FROM_CI(ci); + #if CLUSTER_M > CLUSTER_N + int ci_cj1 = CJ1_FROM_CI(ci); + #endif int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; @@ -331,11 +333,9 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_SIMD_FLOAT fiy3 = simd_zero(); MD_SIMD_FLOAT fiz3 = simd_zero(); - for(int k = 0; k < numneighs; k += unroll_j) { + for(int k = 0; k < numneighs; k++) { int cj = neighs[k + 0]; int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); - unsigned int cond0 = (unsigned int)(ci == cj); - unsigned int cond1 = cond0; MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]); MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]); @@ -354,16 +354,26 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp); MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp); - #if VECTOR_WIDTH == 8 + #if CLUSTER_M == CLUSTER_N + int cond0 = (unsigned int)(cj == ci_cj0); + MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0)); + MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0)); + MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0)); + MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0)); + #elif CLUSTER_M < CLUSTER_N + int cond0 = (unsigned int)(cj == (ci << 1) + 0); + int cond1 = (unsigned int)(cj == (ci << 1) + 1); MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1)); MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1)); MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1)); MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1)); #else - MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0)); - MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0)); - MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0)); - MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0)); + unsigned int cond0 = (unsigned int)(cj == ci_cj0); + unsigned int cond1 = (unsigned int)(cj == ci_cj1); + MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond0)); + MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond0)); + MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond1)); + MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond1)); #endif MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0))); diff --git a/gromacs/includes/atom.h b/gromacs/includes/atom.h index 2373d6b..0eff9f5 100644 --- a/gromacs/includes/atom.h +++ b/gromacs/includes/atom.h @@ -33,12 +33,23 @@ // Simd4xN: M=4, N=VECTOR_WIDTH // Simd2xNN: M=4, N=(VECTOR_WIDTH/2) -// Simd2XNN (here used for single-precision) +// Simd2xNN (here used for single-precision) #if VECTOR_WIDTH > CLUSTER_M * 2 +# define KERNEL_NAME "Simd2xNN" # define CLUSTER_N (VECTOR_WIDTH / 2) +# define computeForceLJ computeForceLJ_2xnn // Simd4xN #else +# define KERNEL_NAME "Simd4xN" # define CLUSTER_N VECTOR_WIDTH +# define computeForceLJ computeForceLJ_4xn +#endif + +#ifdef 
USE_REFERENCE_VERSION +# undef KERNEL_NAME +# undef computeForceLJ +# define KERNEL_NAME "Reference" +# define computeForceLJ computeForceLJ_ref #endif #if CLUSTER_M == CLUSTER_N @@ -105,6 +116,7 @@ typedef struct { MD_FLOAT *cl_f; int *cl_type; Cluster *iclusters, *jclusters; + int *icluster_bin; } Atom; extern void initAtom(Atom*); diff --git a/gromacs/main.c b/gromacs/main.c index ade8f65..a4eda58 100644 --- a/gromacs/main.c +++ b/gromacs/main.c @@ -43,14 +43,9 @@ extern double computeForceLJ_ref(Parameter*, Atom*, Neighbor*, Stats*); extern double computeForceLJ_4xn(Parameter*, Atom*, Neighbor*, Stats*); +extern double computeForceLJ_2xnn(Parameter*, Atom*, Neighbor*, Stats*); extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*); -#ifdef USE_REFERENCE_VERSION -# define computeForceLJ computeForceLJ_ref -#else -# define computeForceLJ computeForceLJ_4xn -#endif - double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) { if(param->force_field == FF_EAM) { initEam(eam, param); } double S, E; @@ -325,6 +320,7 @@ int main(int argc, char** argv) { } printf(HLINE); + printf("Kernel: %s, MxN: %dx%d, Vector width: %d\n", KERNEL_NAME, CLUSTER_M, CLUSTER_N, VECTOR_WIDTH); printf("Data layout for positions: %s\n", POS_DATA_LAYOUT); #if PRECISION == 1 printf("Using single precision floating point.\n"); diff --git a/gromacs/neighbor.c b/gromacs/neighbor.c index 557a9bb..9634157 100644 --- a/gromacs/neighbor.c +++ b/gromacs/neighbor.c @@ -41,7 +41,6 @@ static int *bincount; static int *bins; static int *bin_nclusters; static int *bin_clusters; -static int *icluster_bin; static int mbins; //total number of bins static int atoms_per_bin; // max atoms per bin static int clusters_per_bin; // max clusters per bin @@ -64,7 +63,7 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) { cutneigh = param->cutneigh; nmax = 0; atoms_per_bin = 8; - clusters_per_bin = (atoms_per_bin / CLUSTER_M) + 4; + clusters_per_bin = (atoms_per_bin / CLUSTER_M) + 10; stencil = NULL; bins = NULL; bincount = NULL; @@ -227,7 +226,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { for(int ci = 0; ci < atom->Nclusters_local; ci++) { int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]); int n = 0; - int ibin = icluster_bin[ci]; + int ibin = atom->icluster_bin[ci]; MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx; MD_FLOAT ibb_xmax = atom->iclusters[ci].bbmaxx; MD_FLOAT ibb_ymin = atom->iclusters[ci].bbminy; @@ -590,7 +589,7 @@ void buildClusters(Atom *atom) { if(bbminz > ztmp) { bbminz = ztmp; } if(bbmaxz < ztmp) { bbmaxz = ztmp; } - atom->cl_type[cii] = atom->type[i]; + ci_type[cii] = atom->type[i]; atom->iclusters[ci].natoms++; } else { ci_x[CL_X_OFFSET + cii] = INFINITY; @@ -601,7 +600,7 @@ void buildClusters(Atom *atom) { ac++; } - icluster_bin[ci] = bin; + atom->icluster_bin[ci] = bin; atom->iclusters[ci].bbminx = bbminx; atom->iclusters[ci].bbmaxx = bbmaxx; atom->iclusters[ci].bbminy = bbminy; @@ -711,7 +710,7 @@ void binClusters(Atom *atom) { MD_FLOAT *cptr = cluster_pos_ptr(ci); DEBUG_MESSAGE("Cluster %d:\n", ci); DEBUG_MESSAGE("bin=%d, Natoms=%d, bbox={%f,%f},{%f,%f},{%f,%f}\n", - icluster_bin[ci], + atom->icluster_bin[ci], atom->clusters[ci].natoms, atom->clusters[ci].bbminx, atom->clusters[ci].bbmaxx, @@ -745,7 +744,7 @@ void binClusters(Atom *atom) { for(int ci = 0; ci < nlocal && !resize; ci++) { // Assure we add this j-cluster only once in the bin if(!(CLUSTER_M < CLUSTER_N && ci % 2)) { - int bin = icluster_bin[ci]; + int bin = 
atom->icluster_bin[ci]; int c = bin_nclusters[bin]; if(c + 1 < clusters_per_bin) { bin_clusters[bin * clusters_per_bin + c] = CJ0_FROM_CI(ci); @@ -827,7 +826,7 @@ void binClusters(Atom *atom) { bin_clusters[bin * clusters_per_bin + c] = cj; } - icluster_bin[ci] = bin; + atom->icluster_bin[ci] = bin; bin_nclusters[bin]++; } else { resize = 1; diff --git a/gromacs/pbc.c b/gromacs/pbc.c index d1216ff..03d669b 100644 --- a/gromacs/pbc.c +++ b/gromacs/pbc.c @@ -46,6 +46,7 @@ void initPbc(Atom* atom) { /* update coordinates of ghost atoms */ /* uses mapping created in setupPbc */ void updatePbc(Atom *atom, Parameter *param, int firstUpdate) { + DEBUG_MESSAGE("updatePbc start\n"); int *border_map = atom->border_map; int nlocal = atom->Nclusters_local; MD_FLOAT xprd = param->xprd; @@ -97,6 +98,8 @@ void updatePbc(Atom *atom, Parameter *param, int firstUpdate) { atom->iclusters[ci].bbmaxz = bbmaxz; } } + + DEBUG_MESSAGE("updatePbc end\n"); } /* relocate atoms that have left domain according @@ -159,6 +162,7 @@ void growPbc(Atom* atom) { } void setupPbc(Atom *atom, Parameter *param) { + DEBUG_MESSAGE("setupPbc start\n"); int *border_map = atom->border_map; MD_FLOAT xprd = param->xprd; MD_FLOAT yprd = param->yprd; @@ -236,4 +240,5 @@ void setupPbc(Atom *atom, Parameter *param) { // Update created ghost clusters positions updatePbc(atom, param, 1); + DEBUG_MESSAGE("setupPbc end\n"); } From d79c3c2a1d313a09b5de95ae6047231987076d33 Mon Sep 17 00:00:00 2001 From: Rafael Ravedutti Date: Thu, 10 Mar 2022 22:33:41 +0100 Subject: [PATCH 4/7] Add first working version with 4x8 config (ref kernel) Signed-off-by: Rafael Ravedutti --- gromacs/atom.c | 4 +- gromacs/force_lj.c | 10 +- gromacs/includes/atom.h | 1 + gromacs/includes/neighbor.h | 1 + gromacs/main.c | 3 + gromacs/neighbor.c | 184 +++++++++++++++++------------------- gromacs/pbc.c | 175 +++++++++++++++++------------------ 7 files changed, 191 insertions(+), 187 deletions(-) diff --git a/gromacs/atom.c b/gromacs/atom.c index b96a91e..88ee0f8 100644 --- a/gromacs/atom.c +++ b/gromacs/atom.c @@ -429,10 +429,10 @@ void growAtom(Atom *atom) { void growClusters(Atom *atom) { int nold = atom->Nclusters_max; - int jfac = MIN(1, CLUSTER_M / CLUSTER_N); // If M>N, we need to allocate more j-clusters + int jterm = MAX(1, CLUSTER_M / CLUSTER_N); // If M>N, we need to allocate more j-clusters atom->Nclusters_max += DELTA; atom->iclusters = (Cluster*) reallocate(atom->iclusters, ALIGNMENT, atom->Nclusters_max * sizeof(Cluster), nold * sizeof(Cluster)); - atom->jclusters = (Cluster*) reallocate(atom->jclusters, ALIGNMENT, atom->Nclusters_max * jfac * sizeof(Cluster), nold * jfac * sizeof(Cluster)); + atom->jclusters = (Cluster*) reallocate(atom->jclusters, ALIGNMENT, atom->Nclusters_max * jterm * sizeof(Cluster), nold * jterm * sizeof(Cluster)); atom->icluster_bin = (int*) reallocate(atom->icluster_bin, ALIGNMENT, atom->Nclusters_max * sizeof(int), nold * sizeof(int)); atom->cl_x = (MD_FLOAT*) reallocate(atom->cl_x, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT)); atom->cl_f = (MD_FLOAT*) reallocate(atom->cl_f, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT)); diff --git a/gromacs/force_lj.c b/gromacs/force_lj.c index 03d1926..99d56a9 100644 --- a/gromacs/force_lj.c +++ b/gromacs/force_lj.c @@ -55,6 +55,8 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat #pragma omp parallel for
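/* Annotation (not part of the original patch): the exclusion tests introduced
 * below rely on the i-cluster to j-cluster mapping from atom.h. In the 4x8
 * configuration this commit targets (CLUSTER_M = 4 < CLUSTER_N = 8),
 * CJ0_FROM_CI(ci) == ci >> 1, so i-clusters 2k and 2k+1 share j-cluster k as
 * its lower and upper half. Atom cii of ci is therefore the same atom as
 * cii + CLUSTER_M * (ci & 0x1) of its j-cluster, which is exactly the pair the
 * CLUSTER_M < CLUSTER_N branch skips. In the CLUSTER_M > CLUSTER_N case one
 * i-cluster spans the two j-clusters ci_cj0 and ci_cj1, hence the two-part
 * test there. */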
for(int ci = 0; ci < atom->Nclusters_local; ci++) { + int ci_cj0 = CJ0_FROM_CI(ci); + int ci_cj1 = CJ1_FROM_CI(ci); int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; @@ -76,7 +78,13 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_FLOAT fiz = 0; for(int cjj = 0; cjj < CLUSTER_N; cjj++) { - if(ci != cj || cii != cjj) { + #if CLUSTER_M == CLUSTER_N + if(ci_cj0 != cj || cii != cjj) { + #elif CLUSTER_M < CLUSTER_N + if(ci_cj0 != cj || cii + CLUSTER_M * (ci & 0x1) != cjj) { + #else + if((ci_cj0 != cj || cii != cjj) && (ci_cj1 != cj || cii != cjj + CLUSTER_N)) { + #endif MD_FLOAT delx = xtmp - cj_x[CL_X_OFFSET + cjj]; MD_FLOAT dely = ytmp - cj_x[CL_Y_OFFSET + cjj]; MD_FLOAT delz = ztmp - cj_x[CL_Z_OFFSET + cjj]; diff --git a/gromacs/includes/atom.h b/gromacs/includes/atom.h index 0eff9f5..6590419 100644 --- a/gromacs/includes/atom.h +++ b/gromacs/includes/atom.h @@ -117,6 +117,7 @@ typedef struct { int *cl_type; Cluster *iclusters, *jclusters; int *icluster_bin; + int dummy_cj; } Atom; extern void initAtom(Atom*); diff --git a/gromacs/includes/neighbor.h b/gromacs/includes/neighbor.h index d1fbb5f..0ba00c0 100644 --- a/gromacs/includes/neighbor.h +++ b/gromacs/includes/neighbor.h @@ -40,6 +40,7 @@ extern void buildNeighbor(Atom*, Neighbor*); extern void pruneNeighbor(Parameter*, Atom*, Neighbor*); extern void sortAtom(Atom*); extern void buildClusters(Atom*); +extern void defineJClusters(Atom*); extern void binClusters(Atom*); extern void updateSingleAtoms(Atom*); #endif diff --git a/gromacs/main.c b/gromacs/main.c index a4eda58..a36af9d 100644 --- a/gromacs/main.c +++ b/gromacs/main.c @@ -70,6 +70,7 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats * setupThermo(param, atom->Natoms); if(param->input_file == NULL) { adjustThermo(param, atom); } buildClusters(atom); + defineJClusters(atom); setupPbc(atom, param); binClusters(atom); buildNeighbor(atom, neighbor); @@ -84,6 +85,7 @@ double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) { updateSingleAtoms(atom); updateAtomsPbc(atom, param); buildClusters(atom); + defineJClusters(atom); setupPbc(atom, param); binClusters(atom); buildNeighbor(atom, neighbor); @@ -313,6 +315,7 @@ int main(int argc, char** argv) { } timer[TOTAL] = getTimeStamp() - timer[TOTAL]; + updateSingleAtoms(&atom); computeThermo(-1, &param, &atom); if(param.xtc_file != NULL) { diff --git a/gromacs/neighbor.c b/gromacs/neighbor.c index 9634157..694abd7 100644 --- a/gromacs/neighbor.c +++ b/gromacs/neighbor.c @@ -201,11 +201,10 @@ int atomDistanceInRange(Atom *atom, int ci, int cj, MD_FLOAT rsq) { void buildNeighbor(Atom *atom, Neighbor *neighbor) { DEBUG_MESSAGE("buildNeighbor start\n"); - int nall = atom->Nclusters_local + atom->Nclusters_ghost; /* extend atom arrays if necessary */ - if(nall > nmax) { - nmax = nall; + if(atom->Nclusters_local > nmax) { + nmax = atom->Nclusters_local; if(neighbor->numneigh) free(neighbor->numneigh); if(neighbor->neighbors) free(neighbor->neighbors); neighbor->numneigh = (int*) malloc(nmax * sizeof(int)); @@ -307,7 +306,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { // Fill neighbor list with dummy values to fit vector width if(CLUSTER_N < VECTOR_WIDTH) { while(n % (VECTOR_WIDTH / CLUSTER_N)) { - neighptr[n++] = CJ0_FROM_CI(nall - 1); // Last cluster is always a dummy cluster + neighptr[n++] = atom->dummy_cj; // Last cluster is always a dummy cluster } } @@ -332,38 +331,40 @@ void buildNeighbor(Atom 
*atom, Neighbor *neighbor) { /* DEBUG_MESSAGE("\ncutneighsq = %f, rbb_sq = %f\n", cutneighsq, rbb_sq); for(int ci = 0; ci < 6; ci++) { - MD_FLOAT *ciptr = cluster_pos_ptr(ci); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; int* neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]); DEBUG_MESSAGE("Cluster %d, bbx = {%f, %f}, bby = {%f, %f}, bbz = {%f, %f}\n", ci, - atom->clusters[ci].bbminx, - atom->clusters[ci].bbmaxx, - atom->clusters[ci].bbminy, - atom->clusters[ci].bbmaxy, - atom->clusters[ci].bbminz, - atom->clusters[ci].bbmaxz); + atom->iclusters[ci].bbminx, + atom->iclusters[ci].bbmaxx, + atom->iclusters[ci].bbminy, + atom->iclusters[ci].bbmaxy, + atom->iclusters[ci].bbminz, + atom->iclusters[ci].bbmaxz); for(int cii = 0; cii < CLUSTER_M; cii++) { - DEBUG_MESSAGE("%f, %f, %f\n", cluster_x(ciptr, cii), cluster_y(ciptr, cii), cluster_z(ciptr, cii)); + DEBUG_MESSAGE("%f, %f, %f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]); } DEBUG_MESSAGE("Neighbors:\n"); for(int k = 0; k < neighbor->numneigh[ci]; k++) { - const int cj = neighptr[k]; - MD_FLOAT *cjptr = cluster_pos_ptr(cj); + int cj = neighptr[k]; + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; DEBUG_MESSAGE(" Cluster %d, bbx = {%f, %f}, bby = {%f, %f}, bbz = {%f, %f}\n", cj, - atom->clusters[cj].bbminx, - atom->clusters[cj].bbmaxx, - atom->clusters[cj].bbminy, - atom->clusters[cj].bbmaxy, - atom->clusters[cj].bbminz, - atom->clusters[cj].bbmaxz); + atom->jclusters[cj].bbminx, + atom->jclusters[cj].bbmaxx, + atom->jclusters[cj].bbminy, + atom->jclusters[cj].bbmaxy, + atom->jclusters[cj].bbminz, + atom->jclusters[cj].bbmaxz); - for(int cjj = 0; cjj < CLUSTER_M; cjj++) { - DEBUG_MESSAGE(" %f, %f, %f\n", cluster_x(cjptr, cjj), cluster_y(cjptr, cjj), cluster_z(cjptr, cjj)); + for(int cjj = 0; cjj < CLUSTER_N; cjj++) { + DEBUG_MESSAGE(" %f, %f, %f\n", cj_x[CL_X_OFFSET + cjj], cj_x[CL_Y_OFFSET + cjj], cj_x[CL_Z_OFFSET + cjj]); } } } @@ -374,7 +375,6 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) { DEBUG_MESSAGE("pruneNeighbor start\n"); - int nall = atom->Nclusters_local + atom->Nclusters_ghost; //MD_FLOAT cutsq = param->cutforce * param->cutforce; MD_FLOAT cutsq = cutneighsq; @@ -385,7 +385,7 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) { // Remove dummy clusters if necessary if(CLUSTER_N < VECTOR_WIDTH) { - while(neighs[numneighs - 1] == CJ0_FROM_CI(nall - 1)) { + while(neighs[numneighs - 1] == atom->dummy_cj) { numneighs--; } } @@ -403,7 +403,7 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) { // Readd dummy clusters if necessary if(CLUSTER_N < VECTOR_WIDTH) { while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) { - neighs[numneighs++] = CJ0_FROM_CI(nall - 1); // Last cluster is always a dummy cluster + neighs[numneighs++] = atom->dummy_cj; // Last cluster is always a dummy cluster } } @@ -617,7 +617,7 @@ void buildClusters(Atom *atom) { void defineJClusters(Atom *atom) { DEBUG_MESSAGE("defineJClusters start\n"); - for(int ci = 0; ci < atom->Nclusters_local + atom->Nclusters_ghost; ci++) { + for(int ci = 0; ci < atom->Nclusters_local; ci++) { int cj0 = CJ0_FROM_CI(ci); if(CLUSTER_M == CLUSTER_N) { @@ -687,13 +687,14 @@ void defineJClusters(Atom *atom) { } else { if(ci % 2 == 0) { - atom->jclusters[cj0].bbminx = MIN(atom->iclusters[ci].bbminx, atom->iclusters[ci + 1].bbminx); - 
atom->jclusters[cj0].bbmaxx = MAX(atom->iclusters[ci].bbmaxx, atom->iclusters[ci + 1].bbmaxx); - atom->jclusters[cj0].bbminy = MIN(atom->iclusters[ci].bbminy, atom->iclusters[ci + 1].bbminy); - atom->jclusters[cj0].bbmaxy = MAX(atom->iclusters[ci].bbmaxy, atom->iclusters[ci + 1].bbmaxy); - atom->jclusters[cj0].bbminz = MIN(atom->iclusters[ci].bbminz, atom->iclusters[ci + 1].bbminz); - atom->jclusters[cj0].bbmaxz = MAX(atom->iclusters[ci].bbmaxz, atom->iclusters[ci + 1].bbmaxz); - atom->jclusters[cj0].natoms = atom->iclusters[ci].natoms + atom->iclusters[ci + 1].natoms; + const int ci1 = ci + 1; + atom->jclusters[cj0].bbminx = MIN(atom->iclusters[ci].bbminx, atom->iclusters[ci1].bbminx); + atom->jclusters[cj0].bbmaxx = MAX(atom->iclusters[ci].bbmaxx, atom->iclusters[ci1].bbmaxx); + atom->jclusters[cj0].bbminy = MIN(atom->iclusters[ci].bbminy, atom->iclusters[ci1].bbminy); + atom->jclusters[cj0].bbmaxy = MAX(atom->iclusters[ci].bbmaxy, atom->iclusters[ci1].bbmaxy); + atom->jclusters[cj0].bbminz = MIN(atom->iclusters[ci].bbminz, atom->iclusters[ci1].bbminz); + atom->jclusters[cj0].bbmaxz = MAX(atom->iclusters[ci].bbmaxz, atom->iclusters[ci1].bbmaxz); + atom->jclusters[cj0].natoms = atom->iclusters[ci].natoms + atom->iclusters[ci1].natoms; } } } @@ -726,12 +727,8 @@ void binClusters(Atom *atom) { */ const int nlocal = atom->Nclusters_local; - const int ncji = MAX(1, CLUSTER_M / CLUSTER_N); - if(ncji > 2) { - fprintf(stderr, "binClusters(): ncji cannot be higher than 2!"); - } - - defineJClusters(atom); + const int jfac = MAX(1, CLUSTER_N / CLUSTER_M); + const int ncj = atom->Nclusters_local / jfac; int resize = 1; while(resize > 0) { @@ -764,74 +761,65 @@ void binClusters(Atom *atom) { } for(int cg = 0; cg < atom->Nclusters_ghost && !resize; cg++) { - // Assure we add this j-cluster only once in the bin - if(!(CLUSTER_M < CLUSTER_N && cg % 2)) { - const int ci = nlocal + cg; - int ix = -1, iy = -1; - MD_FLOAT xtmp, ytmp; + const int cj = ncj + cg; + int ix = -1, iy = -1; + MD_FLOAT xtmp, ytmp; - for(int cji = 0; cji < ncji && !resize; cji++) { - int cj = (cji == 0) ? 
CJ0_FROM_CI(ci) : CJ1_FROM_CI(ci); - const int n = atom->jclusters[cj].natoms; + if(atom->jclusters[cj].natoms > 0) { + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; + MD_FLOAT cj_minz = atom->jclusters[cj].bbminz; - if(n > 0) { - int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); - MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; - MD_FLOAT cj_minz = atom->jclusters[cj].bbminz; + xtmp = cj_x[CL_X_OFFSET + 0]; + ytmp = cj_x[CL_Y_OFFSET + 0]; + coord2bin2D(xtmp, ytmp, &ix, &iy); + ix = MAX(MIN(ix, mbinx - 1), 0); + iy = MAX(MIN(iy, mbiny - 1), 0); + for(int cjj = 1; cjj < atom->jclusters[cj].natoms; cjj++) { + int nix, niy; + xtmp = cj_x[CL_X_OFFSET + cjj]; + ytmp = cj_x[CL_Y_OFFSET + cjj]; + coord2bin2D(xtmp, ytmp, &nix, &niy); + nix = MAX(MIN(nix, mbinx - 1), 0); + niy = MAX(MIN(niy, mbiny - 1), 0); - xtmp = cj_x[CL_X_OFFSET + 0]; - ytmp = cj_x[CL_Y_OFFSET + 0]; - coord2bin2D(xtmp, ytmp, &ix, &iy); - ix = MAX(MIN(ix, mbinx - 1), 0); - iy = MAX(MIN(iy, mbiny - 1), 0); - for(int cjj = 1; cjj < n; cjj++) { - int nix, niy; - xtmp = cj_x[CL_X_OFFSET + cjj]; - ytmp = cj_x[CL_Y_OFFSET + cjj]; - coord2bin2D(xtmp, ytmp, &nix, &niy); - nix = MAX(MIN(nix, mbinx - 1), 0); - niy = MAX(MIN(niy, mbiny - 1), 0); + // Always put the cluster on the bin of its innermost atom so + // the cluster should be closer to local clusters + if(atom->PBCx[cg] > 0 && ix > nix) { ix = nix; } + if(atom->PBCx[cg] < 0 && ix < nix) { ix = nix; } + if(atom->PBCy[cg] > 0 && iy > niy) { iy = niy; } + if(atom->PBCy[cg] < 0 && iy < niy) { iy = niy; } + } - // Always put the cluster on the bin of its innermost atom so - // the cluster should be closer to local clusters - if(atom->PBCx[cg] > 0 && ix > nix) { ix = nix; } - if(atom->PBCx[cg] < 0 && ix < nix) { ix = nix; } - if(atom->PBCy[cg] > 0 && iy > niy) { iy = niy; } - if(atom->PBCy[cg] < 0 && iy < niy) { iy = niy; } - } + int bin = iy * mbinx + ix + 1; + int c = bin_nclusters[bin]; + if(c < clusters_per_bin) { + // Insert the current ghost cluster in the bin keeping clusters + // sorted by z coordinate + int inserted = 0; + for(int i = 0; i < c; i++) { + int last_cl = bin_clusters[bin * clusters_per_bin + i]; + if(atom->jclusters[last_cl].bbminz > cj_minz) { + bin_clusters[bin * clusters_per_bin + i] = cj; - int bin = iy * mbinx + ix + 1; - int c = bin_nclusters[bin]; - if(c < clusters_per_bin) { - // Insert the current ghost cluster in the bin keeping clusters - // sorted by z coordinate - int inserted = 0; - for(int i = 0; i < c; i++) { - int last_cl = bin_clusters[bin * clusters_per_bin + i]; - if(atom->jclusters[last_cl].bbminz > cj_minz) { - bin_clusters[bin * clusters_per_bin + i] = cj; - - for(int j = i + 1; j <= c; j++) { - int tmp = bin_clusters[bin * clusters_per_bin + j]; - bin_clusters[bin * clusters_per_bin + j] = last_cl; - last_cl = tmp; - } - - inserted = 1; - break; - } + for(int j = i + 1; j <= c; j++) { + int tmp = bin_clusters[bin * clusters_per_bin + j]; + bin_clusters[bin * clusters_per_bin + j] = last_cl; + last_cl = tmp; } - if(!inserted) { - bin_clusters[bin * clusters_per_bin + c] = cj; - } - - atom->icluster_bin[ci] = bin; - bin_nclusters[bin]++; - } else { - resize = 1; + inserted = 1; + break; } } + + if(!inserted) { + bin_clusters[bin * clusters_per_bin + c] = cj; + } + + bin_nclusters[bin]++; + } else { + resize = 1; } } } diff --git a/gromacs/pbc.c b/gromacs/pbc.c index 03d669b..301ee28 100644 --- a/gromacs/pbc.c +++ b/gromacs/pbc.c @@ -47,30 +47,30 @@ void initPbc(Atom* atom) { /* uses mapping created in setupPbc 
*/ void updatePbc(Atom *atom, Parameter *param, int firstUpdate) { DEBUG_MESSAGE("updatePbc start\n"); - int *border_map = atom->border_map; - int nlocal = atom->Nclusters_local; + int jfac = MAX(1, CLUSTER_N / CLUSTER_M); + int ncj = atom->Nclusters_local / jfac; MD_FLOAT xprd = param->xprd; MD_FLOAT yprd = param->yprd; MD_FLOAT zprd = param->zprd; for(int cg = 0; cg < atom->Nclusters_ghost; cg++) { - const int ci = nlocal + cg; - int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); - int bmap_vec_base = CI_VECTOR_BASE_INDEX(border_map[cg]); - MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + const int cj = ncj + cg; + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + int bmap_vec_base = CJ_VECTOR_BASE_INDEX(atom->border_map[cg]); + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; MD_FLOAT *bmap_x = &atom->cl_x[bmap_vec_base]; MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY; MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY; MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY; - for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) { - MD_FLOAT xtmp = bmap_x[CL_X_OFFSET + cii] + atom->PBCx[cg] * xprd; - MD_FLOAT ytmp = bmap_x[CL_Y_OFFSET + cii] + atom->PBCy[cg] * yprd; - MD_FLOAT ztmp = bmap_x[CL_Z_OFFSET + cii] + atom->PBCz[cg] * zprd; + for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) { + MD_FLOAT xtmp = bmap_x[CL_X_OFFSET + cjj] + atom->PBCx[cg] * xprd; + MD_FLOAT ytmp = bmap_x[CL_Y_OFFSET + cjj] + atom->PBCy[cg] * yprd; + MD_FLOAT ztmp = bmap_x[CL_Z_OFFSET + cjj] + atom->PBCz[cg] * zprd; - ci_x[CL_X_OFFSET + cii] = xtmp; - ci_x[CL_Y_OFFSET + cii] = ytmp; - ci_x[CL_Z_OFFSET + cii] = ztmp; + cj_x[CL_X_OFFSET + cjj] = xtmp; + cj_x[CL_Y_OFFSET + cjj] = ytmp; + cj_x[CL_Z_OFFSET + cjj] = ztmp; if(firstUpdate) { // TODO: To create the bounding boxes faster, we can use SIMD operations @@ -84,18 +84,18 @@ void updatePbc(Atom *atom, Parameter *param, int firstUpdate) { } if(firstUpdate) { - for(int cii = atom->iclusters[ci].natoms; cii < CLUSTER_M; cii++) { - ci_x[CL_X_OFFSET + cii] = INFINITY; - ci_x[CL_Y_OFFSET + cii] = INFINITY; - ci_x[CL_Z_OFFSET + cii] = INFINITY; + for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) { + cj_x[CL_X_OFFSET + cjj] = INFINITY; + cj_x[CL_Y_OFFSET + cjj] = INFINITY; + cj_x[CL_Z_OFFSET + cjj] = INFINITY; } - atom->iclusters[ci].bbminx = bbminx; - atom->iclusters[ci].bbmaxx = bbmaxx; - atom->iclusters[ci].bbminy = bbminy; - atom->iclusters[ci].bbmaxy = bbmaxy; - atom->iclusters[ci].bbminz = bbminz; - atom->iclusters[ci].bbmaxz = bbmaxz; + atom->jclusters[cj].bbminx = bbminx; + atom->jclusters[cj].bbmaxx = bbmaxx; + atom->jclusters[cj].bbminy = bbminy; + atom->jclusters[cj].bbmaxy = bbmaxy; + atom->jclusters[cj].bbminz = bbminz; + atom->jclusters[cj].bbmaxz = bbmaxz; } } @@ -136,18 +136,18 @@ void updateAtomsPbc(Atom *atom, Parameter *param) { * that are then enforced in updatePbc */ #define ADDGHOST(dx,dy,dz); \ Nghost++; \ - const int cg = atom->Nclusters_local + Nghost; \ - const int natoms_ci = atom->iclusters[ci].natoms; \ - border_map[Nghost] = ci; \ + const int cg = ncj + Nghost; \ + const int cj_natoms = atom->jclusters[cj].natoms; \ + atom->border_map[Nghost] = cj; \ atom->PBCx[Nghost] = dx; \ atom->PBCy[Nghost] = dy; \ atom->PBCz[Nghost] = dz; \ - atom->iclusters[cg].natoms = natoms_ci; \ - Nghost_atoms += natoms_ci; \ - int ci_sca_base = CI_SCALAR_BASE_INDEX(ci); \ - int cg_sca_base = CI_SCALAR_BASE_INDEX(cg); \ - for(int cii = 0; cii < natoms_ci; cii++) { \ - atom->cl_type[cg_sca_base + cii] = atom->cl_type[ci_sca_base + cii]; \ + atom->jclusters[cg].natoms = 
cj_natoms; \ + Nghost_atoms += cj_natoms; \ + int cj_sca_base = CJ_SCALAR_BASE_INDEX(cj); \ + int cg_sca_base = CJ_SCALAR_BASE_INDEX(cg); \ + for(int cjj = 0; cjj < cj_natoms; cjj++) { \ + atom->cl_type[cg_sca_base + cjj] = atom->cl_type[cj_sca_base + cjj]; \ } /* internal subroutines */ @@ -163,77 +163,80 @@ void growPbc(Atom* atom) { void setupPbc(Atom *atom, Parameter *param) { DEBUG_MESSAGE("setupPbc start\n"); - int *border_map = atom->border_map; MD_FLOAT xprd = param->xprd; MD_FLOAT yprd = param->yprd; MD_FLOAT zprd = param->zprd; MD_FLOAT Cutneigh = param->cutneigh; + int jfac = MAX(1, CLUSTER_N / CLUSTER_M); + int ncj = atom->Nclusters_local / jfac; int Nghost = -1; int Nghost_atoms = 0; - for(int ci = 0; ci < atom->Nclusters_local; ci++) { - if (atom->Nclusters_local + Nghost + 7 >= atom->Nclusters_max) { - growClusters(atom); + for(int cj = 0; cj < ncj; cj++) { + if(atom->jclusters[cj].natoms > 0) { + if(atom->Nclusters_local + (Nghost + 7) * jfac >= atom->Nclusters_max) { + growClusters(atom); + } + + if((Nghost + 7) * jfac >= NmaxGhost) { + growPbc(atom); + } + + MD_FLOAT bbminx = atom->jclusters[cj].bbminx; + MD_FLOAT bbmaxx = atom->jclusters[cj].bbmaxx; + MD_FLOAT bbminy = atom->jclusters[cj].bbminy; + MD_FLOAT bbmaxy = atom->jclusters[cj].bbmaxy; + MD_FLOAT bbminz = atom->jclusters[cj].bbminz; + MD_FLOAT bbmaxz = atom->jclusters[cj].bbmaxz; + + /* Setup ghost atoms */ + /* 6 planes */ + if (bbminx < Cutneigh) { ADDGHOST(+1,0,0); } + if (bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,0,0); } + if (bbminy < Cutneigh) { ADDGHOST(0,+1,0); } + if (bbmaxy >= (yprd-Cutneigh)) { ADDGHOST(0,-1,0); } + if (bbminz < Cutneigh) { ADDGHOST(0,0,+1); } + if (bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,0,-1); } + /* 8 corners */ + if (bbminx < Cutneigh && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,+1,+1); } + if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(+1,-1,+1); } + if (bbminx < Cutneigh && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); } + if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,-1,-1); } + if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(-1,+1,+1); } + if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,-1,+1); } + if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,+1,-1); } + if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,-1,-1); } + /* 12 edges */ + if (bbminx < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,0,+1); } + if (bbminx < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,0,-1); } + if (bbmaxx >= (xprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,0,+1); } + if (bbmaxx >= (xprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,0,-1); } + if (bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(0,+1,+1); } + if (bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,+1,-1); } + if (bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(0,-1,+1); } + if (bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,-1,-1); } + if (bbminy < Cutneigh && bbminx < Cutneigh) { ADDGHOST(+1,+1,0); } + if (bbminy < Cutneigh && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,+1,0); } + if (bbmaxy >= (yprd-Cutneigh) && bbminx < Cutneigh) { ADDGHOST(+1,-1,0); } + if (bbmaxy >= (yprd-Cutneigh) && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,-1,0); } } - - if (Nghost + 7 >= NmaxGhost) { - 
growPbc(atom); - border_map = atom->border_map; - } - - MD_FLOAT bbminx = atom->iclusters[ci].bbminx; - MD_FLOAT bbmaxx = atom->iclusters[ci].bbmaxx; - MD_FLOAT bbminy = atom->iclusters[ci].bbminy; - MD_FLOAT bbmaxy = atom->iclusters[ci].bbmaxy; - MD_FLOAT bbminz = atom->iclusters[ci].bbminz; - MD_FLOAT bbmaxz = atom->iclusters[ci].bbmaxz; - - /* Setup ghost atoms */ - /* 6 planes */ - if (bbminx < Cutneigh) { ADDGHOST(+1,0,0); } - if (bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,0,0); } - if (bbminy < Cutneigh) { ADDGHOST(0,+1,0); } - if (bbmaxy >= (yprd-Cutneigh)) { ADDGHOST(0,-1,0); } - if (bbminz < Cutneigh) { ADDGHOST(0,0,+1); } - if (bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,0,-1); } - /* 8 corners */ - if (bbminx < Cutneigh && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,+1,+1); } - if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(+1,-1,+1); } - if (bbminx < Cutneigh && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); } - if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,-1,-1); } - if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(-1,+1,+1); } - if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,-1,+1); } - if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,+1,-1); } - if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,-1,-1); } - /* 12 edges */ - if (bbminx < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,0,+1); } - if (bbminx < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,0,-1); } - if (bbmaxx >= (xprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,0,+1); } - if (bbmaxx >= (xprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,0,-1); } - if (bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(0,+1,+1); } - if (bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,+1,-1); } - if (bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(0,-1,+1); } - if (bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,-1,-1); } - if (bbminy < Cutneigh && bbminx < Cutneigh) { ADDGHOST(+1,+1,0); } - if (bbminy < Cutneigh && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,+1,0); } - if (bbmaxy >= (yprd-Cutneigh) && bbminx < Cutneigh) { ADDGHOST(+1,-1,0); } - if (bbmaxy >= (yprd-Cutneigh) && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,-1,0); } } - if(atom->Nclusters_local + Nghost + 2 >= atom->Nclusters_max) { + if(ncj + (Nghost + 1) * jfac >= atom->Nclusters_max) { growClusters(atom); } // Add dummy cluster at the end - int ci_vec_base = CI_VECTOR_BASE_INDEX(atom->Nclusters_local + Nghost + 1); - MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; - for(int cii = 0; cii < MAX(CLUSTER_M, CLUSTER_N); cii++) { - ci_x[CL_X_OFFSET + cii] = INFINITY; - ci_x[CL_Y_OFFSET + cii] = INFINITY; - ci_x[CL_Z_OFFSET + cii] = INFINITY; + int cj_vec_base = CJ_VECTOR_BASE_INDEX(ncj + Nghost + 1); + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; + for(int cjj = 0; cjj < CLUSTER_N; cjj++) { + cj_x[CL_X_OFFSET + cjj] = INFINITY; + cj_x[CL_Y_OFFSET + cjj] = INFINITY; + cj_x[CL_Z_OFFSET + cjj] = INFINITY; } // increase by one to make it the ghost atom count + atom->dummy_cj = ncj + Nghost + 1; atom->Nghost = Nghost_atoms; atom->Nclusters_ghost = Nghost + 1; atom->Nclusters = atom->Nclusters_local + Nghost + 1; From 8669f2f6d73ed89836f27877e91de1e0f028843c Mon Sep 17 00:00:00 2001 From: Rafael Ravedutti Date: Fri, 11 Mar 2022 01:12:59 
+0100 Subject: [PATCH 5/7] Fix LJ Simd4xN kernel Signed-off-by: Rafael Ravedutti --- gromacs/force_lj.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gromacs/force_lj.c b/gromacs/force_lj.c index 99d56a9..7b6d37f 100644 --- a/gromacs/force_lj.c +++ b/gromacs/force_lj.c @@ -342,7 +342,7 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_SIMD_FLOAT fiz3 = simd_zero(); for(int k = 0; k < numneighs; k++) { - int cj = neighs[k + 0]; + int cj = neighs[k]; int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]); @@ -369,8 +369,8 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0)); MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0)); #elif CLUSTER_M < CLUSTER_N - int cond0 = (unsigned int)(cj == (ci << 1) + 0); - int cond1 = (unsigned int)(cj == (ci << 1) + 1); + int cond0 = (unsigned int)((cj << 1) + 0 == ci); + int cond1 = (unsigned int)((cj << 1) + 1 == ci); MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1)); MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1)); MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1));
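A note on the fix above (annotation, not part of the patch series): with CLUSTER_M < CLUSTER_N, one j-cluster spans two i-clusters, so the self-overlap test has to expand cj into i-space rather than ci into j-space. A minimal sketch of the corrected check, reusing the kernel's variable names:

    /* j-cluster cj contains i-clusters (cj << 1) and (cj << 1) + 1, so ci
       overlaps cj when either of them equals ci. For example, ci = 5 maps to
       cj = 5 >> 1 = 2, and (2 << 1) + 1 == 5 sets cond1, masking the
       upper-half lanes of the pair. The replaced code compared cj against
       (ci << 1) + {0, 1}, which is the CLUSTER_M > CLUSTER_N mapping and
       excludes the wrong pairs here. */
    unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
    unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);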
From d61576699dad256d50c53cb3b3982390d69eb7c8 Mon Sep 17 00:00:00 2001 From: Rafael Ravedutti Date: Tue, 15 Mar 2022 02:40:56 +0100 Subject: [PATCH 6/7] Add first compilable version of Gromacs with SP Signed-off-by: Rafael Ravedutti --- Makefile | 15 +--- config.mk | 8 +- gromacs/force_lj.c | 172 +++++++++++++++------------------- gromacs/includes/simd.h | 145 +++++++++++++++++++++++++++++++-- include_ISA.mk | 13 +-- lammps/main-stub.c | 7 +- 6 files changed, 216 insertions(+), 144 deletions(-) diff --git a/Makefile b/Makefile index 78289dc..9d3cc90 100644 --- a/Makefile +++ b/Makefile @@ -17,9 +17,6 @@ INCLUDES += -I./$(SRC_DIR)/includes ifeq ($(strip $(DATA_LAYOUT)),AOS) DEFINES += -DAOS endif -ifeq ($(strip $(CLUSTER_LAYOUT)),AOS) -DEFINES += -DCLUSTER_AOS endif ifeq ($(strip $(DATA_TYPE)),SP) DEFINES += -DPRECISION=1 else @@ -30,14 +27,6 @@ ifneq ($(ASM_SYNTAX), ATT) ASFLAGS += -masm=intel endif -ifneq ($(ATOMS_LOOP_RUNS),) - DEFINES += -DATOMS_LOOP_RUNS=$(ATOMS_LOOP_RUNS) -endif - -ifneq ($(NEIGHBORS_LOOP_RUNS),) - DEFINES += -DNEIGHBORS_LOOP_RUNS=$(NEIGHBORS_LOOP_RUNS) -endif - ifeq ($(strip $(EXPLICIT_TYPES)),true) DEFINES += -DEXPLICIT_TYPES endif @@ -74,6 +63,10 @@ ifeq ($(strip $(NO_AVX2)),true) DEFINES += -DNO_AVX2 endif +ifeq ($(strip $(AVX512)),true) + DEFINES += -DAVX512 endif + VPATH = $(SRC_DIR) $(ASM_DIR) ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c)) OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s)) diff --git a/config.mk b/config.mk index de2bf9d..9afe7a0 100644 --- a/config.mk +++ b/config.mk @@ -7,7 +7,7 @@ OPT_SCHEME ?= gromacs # Enable likwid (true or false) ENABLE_LIKWID ?= true # SP or DP -DATA_TYPE ?= DP +DATA_TYPE ?= SP # AOS or SOA DATA_LAYOUT ?= AOS # Assembly syntax to generate (ATT/INTEL) @@ -15,10 +15,6 @@ ASM_SYNTAX ?= ATT # Debug DEBUG ?= false -# Number of times to run the atoms loop on stubbed variant -ATOMS_LOOP_RUNS ?= 1 -# Number of times to run the neighbors loop on stubbed variant -NEIGHBORS_LOOP_RUNS ?= 1 # Explicitly store and load atom types (true or false) EXPLICIT_TYPES ?= false # Trace memory addresses for cache simulator (true or false) MEM_TRACER ?= false # Trace indexes and distances for gather-md (true or false) INDEX_TRACER ?= false # Compute statistics COMPUTE_STATS ?= true @@ -29,8 +25,6 @@ INDEX_TRACER ?= false # Configurations for gromacs optimization scheme -# AOS or SOA -CLUSTER_LAYOUT ?= SOA # Use reference version USE_REFERENCE_VERSION ?= false # Enable XTC output diff --git a/gromacs/force_lj.c b/gromacs/force_lj.c index 7b6d37f..41749ad 100644 --- a/gromacs/force_lj.c +++ b/gromacs/force_lj.c @@ -128,7 +128,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat } double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { - DEBUG_MESSAGE("computeForceLJ_4xn begin\n"); + DEBUG_MESSAGE("computeForceLJ_2xnn begin\n"); int Nlocal = atom->Nlocal; int* neighs; MD_FLOAT cutforcesq = param->cutforce * param->cutforce; @@ -136,10 +136,10 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta MD_FLOAT epsilon = param->epsilon; MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq); MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6); - MD_SIMD_FLOAT epsilon_vec = simd_broadcast(epsilon); + MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon); MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0); MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5); - const int unroll_j = VECTOR_WIDTH / CLUSTER_N; + const unsigned int half_mask_bits = VECTOR_WIDTH >> 1; double S = getTimeStamp(); LIKWID_MARKER_START("force"); @@ -152,131 +152,86 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta neighs = &neighbor->neighbors[ci * neighbor->maxneighs]; int numneighs = neighbor->numneigh[ci]; - MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]); - MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]); - MD_SIMD_FLOAT xi2_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 2]); - MD_SIMD_FLOAT xi3_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 3]); - MD_SIMD_FLOAT yi0_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 0]); - MD_SIMD_FLOAT yi1_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 1]); - MD_SIMD_FLOAT yi2_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 2]); - MD_SIMD_FLOAT yi3_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 3]); - MD_SIMD_FLOAT zi0_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 0]); - MD_SIMD_FLOAT zi1_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 1]); - MD_SIMD_FLOAT zi2_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 2]); - MD_SIMD_FLOAT zi3_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 3]); + MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]); + MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]); + MD_SIMD_FLOAT yi0_tmp = simd_load_h_dual(&ci_x[CL_Y_OFFSET + 0]); + MD_SIMD_FLOAT yi2_tmp = simd_load_h_dual(&ci_x[CL_Y_OFFSET + 2]); + MD_SIMD_FLOAT zi0_tmp = simd_load_h_dual(&ci_x[CL_Z_OFFSET + 0]); + MD_SIMD_FLOAT zi2_tmp = simd_load_h_dual(&ci_x[CL_Z_OFFSET + 2]); MD_SIMD_FLOAT fix0 = simd_zero(); MD_SIMD_FLOAT fiy0 = simd_zero(); MD_SIMD_FLOAT fiz0 = simd_zero(); - MD_SIMD_FLOAT fix1 = simd_zero(); - MD_SIMD_FLOAT fiy1 = simd_zero(); - MD_SIMD_FLOAT fiz1 = simd_zero(); MD_SIMD_FLOAT fix2 = simd_zero(); MD_SIMD_FLOAT fiy2 = simd_zero(); MD_SIMD_FLOAT fiz2 = simd_zero(); - MD_SIMD_FLOAT fix3 = simd_zero(); - MD_SIMD_FLOAT fiy3 = simd_zero(); - MD_SIMD_FLOAT fiz3 = simd_zero(); - for(int k = 0; k < numneighs; k += unroll_j) { + for(int k = 0; k < numneighs; k += 2) { int cj = neighs[k]; int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; - unsigned int cond0 = (unsigned int)(ci == cj); - unsigned int cond1 = 
cond0; - MD_SIMD_FLOAT xj_tmp, yj_tmp, zj_tmp; - - /* - MD_FLOAT *cj0_ptr = cluster_pos_ptr(cj0); - #if VECTOR_WIDTH == 8 - int cj1 = neighs[k + 1]; - unsigned int cond1 = (unsigned int)(ci == cj1); - MD_FLOAT *cj1_ptr = cluster_pos_ptr(cj1); - MD_SIMD_FLOAT xj_tmp = simd_load2(cj0_ptr, cj1_ptr, 0); - MD_SIMD_FLOAT yj_tmp = simd_load2(cj0_ptr, cj1_ptr, 1); - MD_SIMD_FLOAT zj_tmp = simd_load2(cj0_ptr, cj1_ptr, 2); - #else - MD_SIMD_FLOAT xj_tmp = simd_load(cj0_ptr, 0); - MD_SIMD_FLOAT yj_tmp = simd_load(cj0_ptr, 1); - MD_SIMD_FLOAT zj_tmp = simd_load(cj0_ptr, 2); - #endif - */ + unsigned int mask0, mask1, mask2, mask3; + MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]); + MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]); + MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]); MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp); MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp); MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp); - MD_SIMD_FLOAT delx1 = simd_sub(xi1_tmp, xj_tmp); - MD_SIMD_FLOAT dely1 = simd_sub(yi1_tmp, yj_tmp); - MD_SIMD_FLOAT delz1 = simd_sub(zi1_tmp, zj_tmp); MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp); MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp); MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp); - MD_SIMD_FLOAT delx3 = simd_sub(xi3_tmp, xj_tmp); - MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp); - MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp); - #if VECTOR_WIDTH == 8 - MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1)); - MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1)); - MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1)); - MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1)); + #if CLUSTER_M == CLUSTER_N + unsigned int cond0 = (unsigned int)(cj == ci_cj0); + mask0 = (unsigned int)(0xf - 0x1 * cond0); + mask1 = (unsigned int)(0xf - 0x2 * cond0); + mask2 = (unsigned int)(0xf - 0x4 * cond0); + mask3 = (unsigned int)(0xf - 0x8 * cond0); + #elif CLUSTER_M < CLUSTER_N + unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci); + unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci); + mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1); + mask1 = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1); + mask2 = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1); + mask3 = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1); #else - MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0)); - MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0)); - MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0)); - MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0)); + unsigned int cond0 = (unsigned int)(cj == ci_cj0); + unsigned int cond1 = (unsigned int)(cj == ci_cj1); + mask0 = (unsigned int)(0x3 - 0x1 * cond0); + mask1 = (unsigned int)(0x3 - 0x2 * cond0); + mask2 = (unsigned int)(0x3 - 0x1 * cond1); + mask3 = (unsigned int)(0x3 - 0x2 * cond1); #endif + MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0); + MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2); + MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0))); - MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, simd_mul(delz1, delz1))); MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, 
dely2, simd_mul(delz2, delz2))); - MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, simd_mul(delz3, delz3))); MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec)); - MD_SIMD_MASK cutoff_mask1 = simd_mask_and(excl_mask1, simd_mask_cond_lt(rsq1, cutforcesq_vec)); MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec)); - MD_SIMD_MASK cutoff_mask3 = simd_mask_and(excl_mask3, simd_mask_cond_lt(rsq3, cutforcesq_vec)); MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0); - MD_SIMD_FLOAT sr2_1 = simd_reciprocal(rsq1); MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2); - MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3); MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec))); - MD_SIMD_FLOAT sr6_1 = simd_mul(sr2_1, simd_mul(sr2_1, simd_mul(sr2_1, sigma6_vec))); MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec))); - MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec))); - MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, epsilon_vec)))); - MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, epsilon_vec)))); - MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, epsilon_vec)))); - MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, epsilon_vec)))); + MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec)))); + MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec)))); fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0); fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0); fiz0 = simd_masked_add(fiz0, simd_mul(delz0, force0), cutoff_mask0); - fix1 = simd_masked_add(fix1, simd_mul(delx1, force1), cutoff_mask1); - fiy1 = simd_masked_add(fiy1, simd_mul(dely1, force1), cutoff_mask1); - fiz1 = simd_masked_add(fiz1, simd_mul(delz1, force1), cutoff_mask1); fix2 = simd_masked_add(fix2, simd_mul(delx2, force2), cutoff_mask2); fiy2 = simd_masked_add(fiy2, simd_mul(dely2, force2), cutoff_mask2); fiz2 = simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2); - fix3 = simd_masked_add(fix3, simd_mul(delx3, force3), cutoff_mask3); - fiy3 = simd_masked_add(fiy3, simd_mul(dely3, force3), cutoff_mask3); - fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3); } - ci_f[CL_X_OFFSET + 0] = simd_horizontal_sum(fix0); - ci_f[CL_X_OFFSET + 1] = simd_horizontal_sum(fix1); - ci_f[CL_X_OFFSET + 2] = simd_horizontal_sum(fix2); - ci_f[CL_X_OFFSET + 3] = simd_horizontal_sum(fix3); - ci_f[CL_Y_OFFSET + 0] = simd_horizontal_sum(fiy0); - ci_f[CL_Y_OFFSET + 1] = simd_horizontal_sum(fiy1); - ci_f[CL_Y_OFFSET + 2] = simd_horizontal_sum(fiy2); - ci_f[CL_Y_OFFSET + 3] = simd_horizontal_sum(fiy3); - ci_f[CL_Z_OFFSET + 0] = simd_horizontal_sum(fiz0); - ci_f[CL_Z_OFFSET + 1] = simd_horizontal_sum(fiz1); - ci_f[CL_Z_OFFSET + 2] = simd_horizontal_sum(fiz2); - ci_f[CL_Z_OFFSET + 3] = simd_horizontal_sum(fiz3); + simd_h_dual_reduce_sum(&ci_f[CL_X_OFFSET], fix0, fix2); + simd_h_dual_reduce_sum(&ci_f[CL_Y_OFFSET], fiy0, fiy2); + simd_h_dual_reduce_sum(&ci_f[CL_Z_OFFSET], fiz0, fiz2); addStat(stats->calculated_forces, 1); addStat(stats->num_neighs, numneighs); @@ -285,7 +240,7 @@ double 
computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta LIKWID_MARKER_STOP("force"); double E = getTimeStamp(); - DEBUG_MESSAGE("computeForceLJ_4xn end\n"); + DEBUG_MESSAGE("computeForceLJ_2xnn end\n"); return E-S; } @@ -298,7 +253,7 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_FLOAT epsilon = param->epsilon; MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq); MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6); - MD_SIMD_FLOAT epsilon_vec = simd_broadcast(epsilon); + MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon); MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0); MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5); double S = getTimeStamp(); @@ -348,7 +303,6 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]); MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]); MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]); - MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp); MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp); MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp); @@ -363,14 +317,14 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp); #if CLUSTER_M == CLUSTER_N - int cond0 = (unsigned int)(cj == ci_cj0); + unsigned int cond0 = (unsigned int)(cj == ci_cj0); MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0)); MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0)); MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0)); MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0)); #elif CLUSTER_M < CLUSTER_N - int cond0 = (unsigned int)((cj << 1) + 0 == ci); - int cond1 = (unsigned int)((cj << 1) + 1 == ci); + unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci); + unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci); MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1)); MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1)); MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1)); @@ -404,10 +358,10 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec))); MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec))); - MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, epsilon_vec)))); - MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, epsilon_vec)))); - MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, epsilon_vec)))); - MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, epsilon_vec)))); + MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec)))); + MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, eps_vec)))); + MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec)))); + MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, eps_vec)))); fix0 = 
simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0); fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0); @@ -423,18 +377,18 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3); } - ci_f[CL_X_OFFSET + 0] = simd_horizontal_sum(fix0); - ci_f[CL_X_OFFSET + 1] = simd_horizontal_sum(fix1); - ci_f[CL_X_OFFSET + 2] = simd_horizontal_sum(fix2); - ci_f[CL_X_OFFSET + 3] = simd_horizontal_sum(fix3); - ci_f[CL_Y_OFFSET + 0] = simd_horizontal_sum(fiy0); - ci_f[CL_Y_OFFSET + 1] = simd_horizontal_sum(fiy1); - ci_f[CL_Y_OFFSET + 2] = simd_horizontal_sum(fiy2); - ci_f[CL_Y_OFFSET + 3] = simd_horizontal_sum(fiy3); - ci_f[CL_Z_OFFSET + 0] = simd_horizontal_sum(fiz0); - ci_f[CL_Z_OFFSET + 1] = simd_horizontal_sum(fiz1); - ci_f[CL_Z_OFFSET + 2] = simd_horizontal_sum(fiz2); - ci_f[CL_Z_OFFSET + 3] = simd_horizontal_sum(fiz3); + ci_f[CL_X_OFFSET + 0] = simd_h_reduce_sum(fix0); + ci_f[CL_X_OFFSET + 1] = simd_h_reduce_sum(fix1); + ci_f[CL_X_OFFSET + 2] = simd_h_reduce_sum(fix2); + ci_f[CL_X_OFFSET + 3] = simd_h_reduce_sum(fix3); + ci_f[CL_Y_OFFSET + 0] = simd_h_reduce_sum(fiy0); + ci_f[CL_Y_OFFSET + 1] = simd_h_reduce_sum(fiy1); + ci_f[CL_Y_OFFSET + 2] = simd_h_reduce_sum(fiy2); + ci_f[CL_Y_OFFSET + 3] = simd_h_reduce_sum(fiy3); + ci_f[CL_Z_OFFSET + 0] = simd_h_reduce_sum(fiz0); + ci_f[CL_Z_OFFSET + 1] = simd_h_reduce_sum(fiz1); + ci_f[CL_Z_OFFSET + 2] = simd_h_reduce_sum(fiz2); + ci_f[CL_Z_OFFSET + 3] = simd_h_reduce_sum(fiz3); addStat(stats->calculated_forces, 1); addStat(stats->num_neighs, numneighs); diff --git a/gromacs/includes/simd.h b/gromacs/includes/simd.h index 1d6a90b..59a044a 100644 --- a/gromacs/includes/simd.h +++ b/gromacs/includes/simd.h @@ -21,6 +21,7 @@ * ======================================================================================= */ +#include #include #include #include @@ -28,7 +29,9 @@ #define SIMD_PRINT_REAL(a) simd_print_real(#a, a); #define SIMD_PRINT_MASK(a) simd_print_mask(#a, a); -#if VECTOR_WIDTH == 8 // AVX512 +#ifdef AVX512 + +#if PRECISION == 2 // Double precision #define MD_SIMD_FLOAT __m512d #define MD_SIMD_MASK __mmask8 @@ -47,15 +50,98 @@ static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_m static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); } static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_pd(p); } -static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) { +static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) { MD_SIMD_FLOAT x = _mm512_add_pd(a, _mm512_shuffle_f64x2(a, a, 0xee)); x = _mm512_add_pd(x, _mm512_shuffle_f64x2(x, x, 0x11)); x = _mm512_add_pd(x, _mm512_permute_pd(x, 0x01)); return *((MD_FLOAT *) &x); } +static inline MD_SIMD_FLOAT simd_load_h_duplicate(const double* m) { + return _mm512_broadcast_f64x4(_mm256_load_pd(m)); +} + +static inline MD_SIMD_FLOAT simd_load_h_dual(const double* m) { + return _mm512_insertf64x4(_mm512_broadcastsd_pd(_mm_load_sd(m)), _mm256_broadcastsd_pd(_mm_load_sd(m + 1)), 1); +} + +static inline double simd_h_dual_reduce_sum(double* m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) { + __m512d t0; + __m256d t2, t3; + + t0 = _mm512_add_pd(v0, _mm512_permutex_pd(v0, 0x4e)); + t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xccul), v1, _mm512_permutex_pd(v1, 0x4e)); + t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1)); + t0 = _mm512_mask_shuffle_f64x2(t0, simd_mask_from_u32(0xaaul), t0, t0, 0xee); + t2 = 
_mm512_castpd512_pd256(t0); + t3 = _mm256_load_pd(m); + t3 = _mm256_add_pd(t3, t2); + _mm256_store_pd(m, t3); + + t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0x4e)); + t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1)); + return _mm_cvtsd_f64(_mm512_castpd512_pd128(t0)); +} + +#else // Single-precision + +#define MD_SIMD_FLOAT __m512 +#define MD_SIMD_MASK __mmask16 + +static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); } +static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_ps(0.0f); } +static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_ps(a, b); } +static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_sub_ps(a, b); } +static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_mul_ps(a, b); } +static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm512_fmadd_ps(a, b, c); } +static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm512_rcp14_ps(a); } +static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm512_mask_add_ps(a, m, a, b); } +static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask16(a, b); } +static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ); } +static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask16(a); } +static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask16_u32(a); } +static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_ps(p); } +static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) { + // This would only be called in a Mx16 configuration, which is not valid in GROMACS + fprintf(stderr, "simd_h_reduce_sum(): Called with AVX512 intrinsics and single-precision which is not valid!\n"); + exit(-1); + return 0.0; +} + +static inline MD_SIMD_FLOAT simd_load_h_duplicate(const float* m) { + return _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_load_pd((const double *)(m)))); +} + +static inline MD_SIMD_FLOAT simd_load_h_dual(const float* m) { + return _mm512_shuffle_f32x4(_mm512_broadcastss_ps(_mm_load_ss(m)), _mm512_broadcastss_ps(_mm_load_ss(m + 1)), 0x44); +} + +static inline MD_FLOAT simd_h_dual_reduce_sum(float* m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) { + __m512 t0, t1; + __m128 t2, t3; + + t0 = _mm512_shuffle_f32x4(v0, v1, 0x88); + t1 = _mm512_shuffle_f32x4(v0, v1, 0xdd); + t0 = _mm512_add_ps(t0, t1); + t0 = _mm512_add_ps(t0, _mm512_permute_ps(t0, 0x4e)); + t0 = _mm512_add_ps(t0, _mm512_permute_ps(t0, 0xb1)); + t0 = _mm512_maskz_compress_ps(simd_mask_from_u32(0x1111ul), t0); + t3 = _mm512_castps512_ps128(t0); + t2 = _mm_load_ps(m); + t2 = _mm_add_ps(t2, t3); + _mm_store_ps(m, t2); + + t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0x4e)); + t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0xb1)); + return _mm_cvtss_f32(t3); +} + +#endif // PRECISION + #else // AVX or AVX2 +#if PRECISION == 2 // Double precision + #define MD_SIMD_FLOAT __m256d #ifdef NO_AVX2 @@ -72,6 +158,7 @@ static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_pd(p); } #ifdef NO_AVX2 + static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(a))); } static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT 
c) { return simd_add(simd_mul(a, b), c); } static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return simd_add(a, _mm256_and_pd(b, m)); } @@ -85,7 +172,7 @@ static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { } // TODO: Implement this, althrough it is just required for debugging static inline int simd_mask_to_u32(MD_SIMD_MASK a) { return 0; } -static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) { +static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) { __m128d a0, a1; a = _mm256_add_pd(a, _mm256_permute_pd(a, 0b0101)); a0 = _mm256_castpd256_pd128(a); @@ -93,7 +180,9 @@ static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) { a0 = _mm_add_sd(a0, a1); return *((MD_FLOAT *) &a0); } -#else + +#else // AVX2 + static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_pd(a); } static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_pd(a, b, c); } static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_pd(a, m, a, b); } @@ -101,7 +190,7 @@ static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); } static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); } static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); } -static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) { +static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) { __m128d a0, a1; // test with shuffle & add as an alternative to hadd later a = _mm256_hadd_pd(a, a); @@ -110,11 +199,53 @@ static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) { a0 = _mm_add_sd(a0, a1); return *((MD_FLOAT *) &a0); } -#endif - #endif +#else // Single-precision + +#define MD_SIMD_FLOAT __m256 + +#ifdef NO_AVX2 +#define MD_SIMD_MASK __m256 +#else +#define MD_SIMD_MASK __mmask8 +#endif + +static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm256_set1_ps(scalar); } +static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_ps(0.0); } +static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_ps(a, b); } +static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_ps(a, b); } +static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_ps(a, b); } +static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_ps(p); } + +#ifdef NO_AVX2 + +#error "AVX intrinsincs with single-precision not implemented!" 
+
+#else // AVX2
+
+static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_ps(a); }
+static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_ps(a, b, c); }
+static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_ps(a, m, a, b); }
+static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_ps_mask(a, b, _CMP_LT_OQ); }
+static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
+static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
+static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
+static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
+    __m128 t0;
+    t0 = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 0x1));
+    t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
+    t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
+    return *((MD_FLOAT *) &t0);
+}
+
+#endif // NO_AVX2
+
+#endif // PRECISION
+
+#endif // AVX or AVX2
+
 static inline void simd_print_real(const char *ref, MD_SIMD_FLOAT a) {
     double x[VECTOR_WIDTH];
     memcpy(x, &a, sizeof(x));
diff --git a/include_ISA.mk b/include_ISA.mk
index 215f626..fa9b90b 100644
--- a/include_ISA.mk
+++ b/include_ISA.mk
@@ -1,15 +1,18 @@
 ifeq ($(strip $(ISA)), SSE)
-VECTOR_WIDTH=2
+_VECTOR_WIDTH=2
 else ifeq ($(strip $(ISA)), AVX)
 # Vector width is 4 but AVX2 instruction set is not supported
 NO_AVX2=true
-VECTOR_WIDTH=4
+_VECTOR_WIDTH=4
 else ifeq ($(strip $(ISA)), AVX2)
-VECTOR_WIDTH=4
+_VECTOR_WIDTH=4
 else ifeq ($(strip $(ISA)), AVX512)
-VECTOR_WIDTH=8
+AVX512=true
+_VECTOR_WIDTH=8
 endif
 
 ifeq ($(strip $(DATA_TYPE)), SP)
-VECTOR_WIDTH=$((VECTOR_WIDTH * 2))
+VECTOR_WIDTH=$(shell echo $$(( $(_VECTOR_WIDTH) * 2 )))
+else
+VECTOR_WIDTH=$(_VECTOR_WIDTH)
 endif
diff --git a/lammps/main-stub.c b/lammps/main-stub.c
index 4da90db..c7596e7 100644
--- a/lammps/main-stub.c
+++ b/lammps/main-stub.c
@@ -217,8 +217,6 @@ int main(int argc, const char *argv[]) {
 
     if(!csv) {
         printf("Number of timesteps: %d\n", param.ntimes);
-        printf("Number of times to compute the atoms loop: %d\n", ATOMS_LOOP_RUNS);
-        printf("Number of times to compute the neighbors loop: %d\n", NEIGHBORS_LOOP_RUNS);
         printf("System size (unit cells): %dx%dx%d\n", param.nx, param.ny, param.nz);
         printf("Atoms per unit cell: %d\n", atoms_per_unit_cell);
         printf("Total number of atoms: %d\n", atom->Nlocal);
@@ -257,9 +255,8 @@ int main(int argc, const char *argv[]) {
     E = getTimeStamp();
     double T_accum = E-S;
     double freq_hz = param.proc_freq * 1.e9;
-    const double repeats = ATOMS_LOOP_RUNS * NEIGHBORS_LOOP_RUNS;
-    const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes * repeats);
-    const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes * repeats) * freq_hz;
+    const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes);
+    const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes) * freq_hz;
     const double cycles_per_neigh = cycles_per_atom / (double)(atoms_per_unit_cell - 1);
 
     if(!csv) {

From d47173d7a2bd486deecab52c444fc0293baff837 Mon Sep 17 00:00:00 2001
From: Rafael Ravedutti
Date: Tue, 15 Mar 2022 19:59:10 +0100
Subject: [PATCH 7/7] Fix Simd2xNN kernel

Signed-off-by: Rafael Ravedutti
---
 gromacs/force_lj.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/gromacs/force_lj.c b/gromacs/force_lj.c
index 41749ad..afdc311 100644
--- a/gromacs/force_lj.c
+++ b/gromacs/force_lj.c
@@ -141,6 +141,16 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta
     MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
     const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
 
+    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
+        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
+        MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
+        for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
+            ci_f[CL_X_OFFSET + cii] = 0.0;
+            ci_f[CL_Y_OFFSET + cii] = 0.0;
+            ci_f[CL_Z_OFFSET + cii] = 0.0;
+        }
+    }
+
     double S = getTimeStamp();
     LIKWID_MARKER_START("force");
 
@@ -165,7 +175,7 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta
         MD_SIMD_FLOAT fiy2 = simd_zero();
         MD_SIMD_FLOAT fiz2 = simd_zero();
 
-        for(int k = 0; k < numneighs; k += 2) {
+        for(int k = 0; k < numneighs; k++) {
             int cj = neighs[k];
             int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
             MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
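
A note on the packed exclusion masks in computeForceLJ_2xnn(): two i-rows share one
SIMD register, so their two per-row diagonal-exclusion masks are combined into a
single mask word as (mask1 << half_mask_bits) | mask0 before simd_mask_from_u32().
The standalone sketch below is an illustration only, not part of the patch series;
it hard-codes VECTOR_WIDTH = 8 and mirrors the CLUSTER_M == CLUSTER_N branch, and
packed_excl_mask() is a hypothetical helper name:

#include <stdio.h>

#define VECTOR_WIDTH 8

/* Packed diagonal-exclusion mask for i-rows (row, row + 1) of i-cluster ci
 * against j-cluster cj, following the mask arithmetic in computeForceLJ_2xnn(). */
static unsigned int packed_excl_mask(int ci, int cj, int row) {
    const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
    unsigned int cond0 = (unsigned int)(cj == ci);         /* same cluster: drop self-pairs */
    unsigned int lo = 0xf - (0x1u << (row + 0)) * cond0;   /* row r against 4 j-atoms */
    unsigned int hi = 0xf - (0x1u << (row + 1)) * cond0;   /* row r+1 against 4 j-atoms */
    return (hi << half_mask_bits) | lo;
}

int main(void) {
    printf("%02x\n", packed_excl_mask(3, 3, 0)); /* de: lanes 0 and 5 masked off */
    printf("%02x\n", packed_excl_mask(3, 3, 2)); /* 7b: lanes 2 and 7 masked off */
    printf("%02x\n", packed_excl_mask(3, 5, 0)); /* ff: different clusters, all lanes active */
    return 0;
}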
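
The two half-vector loads added to simd.h supply the operands for that packed
evaluation. A scalar emulation of their semantics, assuming double precision and
VECTOR_WIDTH = 8 (the output arrays are stand-ins for the __m512d lanes):

#include <stddef.h>

#define VECTOR_WIDTH 8
#define HALF_WIDTH (VECTOR_WIDTH / 2)

/* simd_load_h_duplicate(): one j-cluster's four coordinates, copied into both
 * vector halves so that two i-rows see the same j-atoms. */
void load_h_duplicate(double out[VECTOR_WIDTH], const double *m) {
    for (size_t i = 0; i < VECTOR_WIDTH; i++) {
        out[i] = m[i % HALF_WIDTH];
    }
}

/* simd_load_h_dual(): two consecutive i-row coordinates, each broadcast across
 * one half: lanes 0..3 hold m[0], lanes 4..7 hold m[1]. */
void load_h_dual(double out[VECTOR_WIDTH], const double *m) {
    for (size_t i = 0; i < VECTOR_WIDTH; i++) {
        out[i] = m[i / HALF_WIDTH];
    }
}

With the coordinates laid out this way, one subtraction yields the distance
components of two i-rows against the same j-cluster, which is why the fixed
kernel in PATCH 7/7 advances the neighbor loop by one j-cluster per iteration.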
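
simd_h_dual_reduce_sum() is the matching reduction on the store side: it collapses
each half of the two force accumulators to one scalar per i-row, adds the four row
sums into memory, and returns their total. A scalar equivalent, based on my reading
of the AVX-512 permute sequence and the same VECTOR_WIDTH = 8 assumption:

/* v0 holds rows 0/1 (one per half), v1 holds rows 2/3; m points at the four
 * per-row force slots, e.g. &ci_f[CL_X_OFFSET]. */
double h_dual_reduce_sum(double *m, const double v0[8], const double v1[8]) {
    double sums[4] = { 0.0, 0.0, 0.0, 0.0 };
    for (int i = 0; i < 4; i++) {
        sums[0] += v0[i];     /* row 0: low half of v0 */
        sums[1] += v0[4 + i]; /* row 1: high half of v0 */
        sums[2] += v1[i];     /* row 2: low half of v1 */
        sums[3] += v1[4 + i]; /* row 3: high half of v1 */
    }
    double total = 0.0;
    for (int i = 0; i < 4; i++) {
        m[i] += sums[i];      /* accumulate into the per-row force slots */
        total += sums[i];
    }
    return total;
}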