diff --git a/Makefile b/Makefile index e214557..78289dc 100644 --- a/Makefile +++ b/Makefile @@ -10,6 +10,7 @@ Q ?= @ include $(MAKE_DIR)/config.mk include $(MAKE_DIR)/include_$(TAG).mk include $(MAKE_DIR)/include_LIKWID.mk +include $(MAKE_DIR)/include_ISA.mk include $(MAKE_DIR)/include_GROMACS.mk INCLUDES += -I./$(SRC_DIR)/includes diff --git a/config.mk b/config.mk index 3e8bb88..de2bf9d 100644 --- a/config.mk +++ b/config.mk @@ -1,5 +1,7 @@ # Compiler tag (GCC/CLANG/ICC) TAG ?= ICC +# Instruction set (SSE/AVX/AVX2/AVX512) +ISA ?= AVX512 # Optimization scheme (lammps/gromacs/clusters_per_bin) OPT_SCHEME ?= gromacs # Enable likwid (true or false) @@ -23,10 +25,6 @@ EXPLICIT_TYPES ?= false MEM_TRACER ?= false # Trace indexes and distances for gather-md (true or false) INDEX_TRACER ?= false -# Vector width (elements) for index and distance tracer -VECTOR_WIDTH ?= 8 -# When vector width is 4 but AVX2 is not supported (AVX only), set this to true -NO_AVX2 ?= false # Compute statistics COMPUTE_STATS ?= true diff --git a/gromacs/force_lj.c b/gromacs/force_lj.c index 07ed0e5..04739a6 100644 --- a/gromacs/force_lj.c +++ b/gromacs/force_lj.c @@ -63,7 +63,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat int cj = neighs[k]; int any = 0; MD_FLOAT *cjptr = cluster_pos_ptr(cj); - for(int cii = 0; cii < CLUSTER_DIM_M; cii++) { + for(int cii = 0; cii < CLUSTER_M; cii++) { MD_FLOAT xtmp = cluster_x(ciptr, cii); MD_FLOAT ytmp = cluster_y(ciptr, cii); MD_FLOAT ztmp = cluster_z(ciptr, cii); @@ -71,7 +71,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_FLOAT fiy = 0; MD_FLOAT fiz = 0; - for(int cjj = 0; cjj < CLUSTER_DIM_M; cjj++) { + for(int cjj = 0; cjj < CLUSTER_M; cjj++) { if(ci != cj || cii != cjj) { MD_FLOAT delx = xtmp - cluster_x(cjptr, cjj); MD_FLOAT dely = ytmp - cluster_y(cjptr, cjj); @@ -106,7 +106,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat addStat(stats->calculated_forces, 1); addStat(stats->num_neighs, numneighs); - addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_DIM_M / CLUSTER_DIM_N)); + addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N)); } LIKWID_MARKER_STOP("force"); @@ -128,7 +128,7 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_SIMD_FLOAT epsilon_vec = simd_broadcast(epsilon); MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0); MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5); - const int unroll_j = CLUSTER_DIM_N / CLUSTER_DIM_M; + const int unroll_j = CLUSTER_N / CLUSTER_M; double S = getTimeStamp(); LIKWID_MARKER_START("force"); @@ -262,7 +262,7 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat addStat(stats->calculated_forces, 1); addStat(stats->num_neighs, numneighs); - addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_DIM_M / CLUSTER_DIM_N)); + addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N)); } LIKWID_MARKER_STOP("force"); diff --git a/gromacs/includes/atom.h b/gromacs/includes/atom.h index 854d0b0..2373d6b 100644 --- a/gromacs/includes/atom.h +++ b/gromacs/includes/atom.h @@ -27,13 +27,60 @@ #define DELTA 20000 -#define CLUSTER_DIM_M 4 -#define CLUSTER_DIM_N VECTOR_WIDTH +#define CLUSTER_M 4 + +// Nbnxn layouts (as of GROMACS): +// Simd4xN: M=4, N=VECTOR_WIDTH +// Simd2xNN: M=4, N=(VECTOR_WIDTH/2) + +// Simd2XNN (here used for single-precision) +#if VECTOR_WIDTH > CLUSTER_M * 2 +# define CLUSTER_N (VECTOR_WIDTH / 2) +// Simd4xN +#else +# define CLUSTER_N VECTOR_WIDTH +#endif + +#if CLUSTER_M == CLUSTER_N +# define CJ0_FROM_CI(a) (a) +# define CJ1_FROM_CI(a) (a) +# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b)) +# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b)) +#elif CLUSTER_M == CLUSTER_N * 2 // M > N +# define CJ0_FROM_CI(a) ((a) << 1) +# define CJ1_FROM_CI(a) (((a) << 1) | 0x1) +# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_M * (b)) +# define CJ_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_M * (b) + ((a) & 0x1) * (CLUSTER_M >> 1)) +#elif CLUSTER_M == CLUSTER_N / 2 // M < N +# define CJ0_FROM_CI(a) ((a) >> 1) +# define CJ1_FROM_CI(a) ((a) >> 1) +# define CI_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_N * (b) + ((a) & 0x1) * (CLUSTER_N >> 1)) +# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b)) +#else +# error "Invalid cluster configuration!" +#endif + +#if CLUSTER_N != 2 && CLUSTER_N != 4 && CLUSTER_N != 8 +# error "Cluster N dimension can be only 2, 4 and 8" +#endif + +#define CI_SCALAR_BASE_INDEX(a) (CI_BASE_INDEX(a, 1)) +#define CI_VECTOR_BASE_INDEX(a) (CI_BASE_INDEX(a, 3)) +#define CJ_SCALAR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 1)) +#define CJ_VECTOR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 3)) + +#if CLUSTER_M >= CLUSTER_N +# define CL_X_OFFSET (0 * CLUSTER_M) +# define CL_Y_OFFSET (1 * CLUSTER_M) +# define CL_Z_OFFSET (2 * CLUSTER_M) +#else +# define CL_X_OFFSET (0 * CLUSTER_N) +# define CL_Y_OFFSET (1 * CLUSTER_N) +# define CL_Z_OFFSET (2 * CLUSTER_N) +#endif typedef struct { - int bin; int natoms; - int type[CLUSTER_DIM_M]; MD_FLOAT bbminx, bbmaxx; MD_FLOAT bbminy, bbmaxy; MD_FLOAT bbminz, bbmaxz; @@ -44,9 +91,6 @@ typedef struct { int Nclusters, Nclusters_local, Nclusters_ghost, Nclusters_max; MD_FLOAT *x, *y, *z; MD_FLOAT *vx, *vy, *vz; - MD_FLOAT *cl_x; - MD_FLOAT *cl_v; - MD_FLOAT *cl_f; int *border_map; int *type; int ntypes; @@ -54,8 +98,13 @@ typedef struct { MD_FLOAT *sigma6; MD_FLOAT *cutforcesq; MD_FLOAT *cutneighsq; - Cluster *clusters; int *PBCx, *PBCy, *PBCz; + // Data in cluster format + MD_FLOAT *cl_x; + MD_FLOAT *cl_v; + MD_FLOAT *cl_f; + int *cl_type; + Cluster *iclusters, *jclusters; } Atom; extern void initAtom(Atom*); @@ -67,10 +116,6 @@ extern int readAtom_dmp(Atom*, Parameter*); extern void growAtom(Atom*); extern void growClusters(Atom*); -#define cluster_pos_ptr(ci) &(atom->cl_x[(ci) * CLUSTER_DIM_M * 3]) -#define cluster_velocity_ptr(ci) &(atom->cl_v[(ci) * CLUSTER_DIM_M * 3]) -#define cluster_force_ptr(ci) &(atom->cl_f[(ci) * CLUSTER_DIM_M * 3]) - #ifdef AOS #define POS_DATA_LAYOUT "AoS" #define atom_x(i) atom->x[(i) * 3 + 0] @@ -83,14 +128,4 @@ extern void growClusters(Atom*); #define atom_z(i) atom->z[i] #endif -#ifdef CLUSTER_AOS -#define cluster_x(cptr, i) cptr[(i) * 3 + 0] -#define cluster_y(cptr, i) cptr[(i) * 3 + 1] -#define cluster_z(cptr, i) cptr[(i) * 3 + 2] -#else -#define cluster_x(cptr, i) cptr[0 * CLUSTER_DIM_M + (i)] -#define cluster_y(cptr, i) cptr[1 * CLUSTER_DIM_M + (i)] -#define cluster_z(cptr, i) cptr[2 * CLUSTER_DIM_M + (i)] -#endif - #endif diff --git a/gromacs/includes/simd.h b/gromacs/includes/simd.h index ad86e7b..9c0a842 100644 --- a/gromacs/includes/simd.h +++ b/gromacs/includes/simd.h @@ -54,8 +54,8 @@ static MD_SIMD_FLOAT simd_load2(MD_FLOAT *c0, MD_FLOAT *c1, int d) { x = _mm512_mask_i32gather_pd(simd_zero(), simd_mask_from_u32(0x0f), vindex, c0, sizeof(double)); x = _mm512_mask_i32gather_pd(x, simd_mask_from_u32(0xf0), vindex, c1, sizeof(double)); #else - x = _mm512_load_pd(&c0[d * CLUSTER_DIM_M]); - x = _mm512_insertf64x4(x, _mm256_load_pd(&c1[d * CLUSTER_DIM_M]), 1); + x = _mm512_load_pd(&c0[d * CLUSTER_M]); + x = _mm512_insertf64x4(x, _mm256_load_pd(&c1[d * CLUSTER_M]), 1); #endif return x; } @@ -134,7 +134,7 @@ static MD_SIMD_FLOAT simd_load(MD_FLOAT *c0, int d) { __m128i vindex = _mm128_add_epi32(aos_gather_vindex, _mm128_set1_epi32(d)); x = _mm256_i32gather_pd(c0, vindex, sizeof(double)); #else - x = _mm256_load_pd(&c0[d * CLUSTER_DIM_M]); + x = _mm256_load_pd(&c0[d * CLUSTER_M]); #endif return x; } diff --git a/gromacs/main-stub.c b/gromacs/main-stub.c index 4da90db..c7596e7 100644 --- a/gromacs/main-stub.c +++ b/gromacs/main-stub.c @@ -217,8 +217,6 @@ int main(int argc, const char *argv[]) { if(!csv) { printf("Number of timesteps: %d\n", param.ntimes); - printf("Number of times to compute the atoms loop: %d\n", ATOMS_LOOP_RUNS); - printf("Number of times to compute the neighbors loop: %d\n", NEIGHBORS_LOOP_RUNS); printf("System size (unit cells): %dx%dx%d\n", param.nx, param.ny, param.nz); printf("Atoms per unit cell: %d\n", atoms_per_unit_cell); printf("Total number of atoms: %d\n", atom->Nlocal); @@ -257,9 +255,8 @@ int main(int argc, const char *argv[]) { E = getTimeStamp(); double T_accum = E-S; double freq_hz = param.proc_freq * 1.e9; - const double repeats = ATOMS_LOOP_RUNS * NEIGHBORS_LOOP_RUNS; - const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes * repeats); - const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes * repeats) * freq_hz; + const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes); + const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes) * freq_hz; const double cycles_per_neigh = cycles_per_atom / (double)(atoms_per_unit_cell - 1); if(!csv) { diff --git a/gromacs/neighbor.c b/gromacs/neighbor.c index 3d86183..1f40ef9 100644 --- a/gromacs/neighbor.c +++ b/gromacs/neighbor.c @@ -39,8 +39,9 @@ static int nbinx, nbiny; static int mbinx, mbiny; // n bins in x, y static int *bincount; static int *bins; -static int *cluster_bincount; -static int *cluster_bins; +static int *bin_nclusters; +static int *bin_clusters; +static int *icluster_bin; static int mbins; //total number of bins static int atoms_per_bin; // max atoms per bin static int clusters_per_bin; // max clusters per bin @@ -63,12 +64,12 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) { cutneigh = param->cutneigh; nmax = 0; atoms_per_bin = 8; - clusters_per_bin = (atoms_per_bin / CLUSTER_DIM_M) + 4; + clusters_per_bin = (atoms_per_bin / CLUSTER_M) + 4; stencil = NULL; bins = NULL; bincount = NULL; - cluster_bins = NULL; - cluster_bincount = NULL; + bin_clusters = NULL; + bin_nclusters = NULL; neighbor->maxneighs = 100; neighbor->numneigh = NULL; neighbor->neighbors = NULL; @@ -91,7 +92,7 @@ void setupNeighbor(Parameter *param, Atom *atom) { MD_FLOAT zlo = 0.0; MD_FLOAT zhi = zprd; MD_FLOAT atom_density = ((MD_FLOAT)(atom->Nlocal)) / ((xhi - xlo) * (yhi - ylo) * (zhi - zlo)); - MD_FLOAT atoms_in_cell = MAX(CLUSTER_DIM_M, CLUSTER_DIM_N); + MD_FLOAT atoms_in_cell = MAX(CLUSTER_M, CLUSTER_N); MD_FLOAT targetsizex = cbrt(atoms_in_cell / atom_density); MD_FLOAT targetsizey = cbrt(atoms_in_cell / atom_density); nbinx = MAX(1, (int)ceil((xhi - xlo) / targetsizex)); @@ -103,20 +104,16 @@ void setupNeighbor(Parameter *param, Atom *atom) { cutneighsq = cutneigh * cutneigh; coord = xlo - cutneigh - SMALL * xprd; - mbinxlo = (int) (coord * bininvx); - if (coord < 0.0) { - mbinxlo = mbinxlo - 1; - } + mbinxlo = (int)(coord * bininvx); + if(coord < 0.0) { mbinxlo = mbinxlo - 1; } coord = xhi + cutneigh + SMALL * xprd; - mbinxhi = (int) (coord * bininvx); + mbinxhi = (int)(coord * bininvx); coord = ylo - cutneigh - SMALL * yprd; - mbinylo = (int) (coord * bininvy); - if (coord < 0.0) { - mbinylo = mbinylo - 1; - } + mbinylo = (int)(coord * bininvy); + if(coord < 0.0) { mbinylo = mbinylo - 1; } coord = yhi + cutneigh + SMALL * yprd; - mbinyhi = (int) (coord * bininvy); + mbinyhi = (int)(coord * bininvy); mbinxlo = mbinxlo - 1; mbinxhi = mbinxhi + 1; @@ -126,16 +123,12 @@ void setupNeighbor(Parameter *param, Atom *atom) { mbinyhi = mbinyhi + 1; mbiny = mbinyhi - mbinylo + 1; - nextx = (int) (cutneigh * bininvx); + nextx = (int)(cutneigh * bininvx); + nexty = (int)(cutneigh * bininvy); if(nextx * binsizex < FACTOR * cutneigh) nextx++; - - nexty = (int) (cutneigh * bininvy); if(nexty * binsizey < FACTOR * cutneigh) nexty++; - if (stencil) { - free(stencil); - } - + if (stencil) { free(stencil); } stencil = (int *) malloc((2 * nexty + 1) * (2 * nextx + 1) * sizeof(int)); nstencil = 0; @@ -147,19 +140,15 @@ void setupNeighbor(Parameter *param, Atom *atom) { } } + if(bincount) { free(bincount); } + if(bins) { free(bins); } + if(bin_nclusters) { free(bin_nclusters); } + if(bin_clusters) { free(bin_clusters); } mbins = mbinx * mbiny; - - if (bincount) { free(bincount); } bincount = (int*) malloc(mbins * sizeof(int)); - - if (bins) { free(bins); } bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int)); - - if (cluster_bincount) { free(cluster_bincount); } - cluster_bincount = (int*) malloc(mbins * sizeof(int)); - - if (cluster_bins) { free(cluster_bins); } - cluster_bins = (int*) malloc(mbins * clusters_per_bin * sizeof(int)); + bin_nclusters = (int*) malloc(mbins * sizeof(int)); + bin_clusters = (int*) malloc(mbins * clusters_per_bin * sizeof(int)); /* DEBUG_MESSAGE("lo, hi = (%e, %e, %e), (%e, %e, %e)\n", xlo, ylo, zlo, xhi, yhi, zhi); @@ -171,20 +160,20 @@ void setupNeighbor(Parameter *param, Atom *atom) { } MD_FLOAT getBoundingBoxDistanceSq(Atom *atom, int ci, int cj) { - MD_FLOAT dl = atom->clusters[ci].bbminx - atom->clusters[cj].bbmaxx; - MD_FLOAT dh = atom->clusters[cj].bbminx - atom->clusters[ci].bbmaxx; + MD_FLOAT dl = atom->iclusters[ci].bbminx - atom->jclusters[cj].bbmaxx; + MD_FLOAT dh = atom->jclusters[cj].bbminx - atom->iclusters[ci].bbmaxx; MD_FLOAT dm = MAX(dl, dh); MD_FLOAT dm0 = MAX(dm, 0.0); MD_FLOAT d2 = dm0 * dm0; - dl = atom->clusters[ci].bbminy - atom->clusters[cj].bbmaxy; - dh = atom->clusters[cj].bbminy - atom->clusters[ci].bbmaxy; + dl = atom->iclusters[ci].bbminy - atom->jclusters[cj].bbmaxy; + dh = atom->jclusters[cj].bbminy - atom->iclusters[ci].bbmaxy; dm = MAX(dl, dh); dm0 = MAX(dm, 0.0); d2 += dm0 * dm0; - dl = atom->clusters[ci].bbminz - atom->clusters[cj].bbmaxz; - dh = atom->clusters[cj].bbminz - atom->clusters[ci].bbmaxz; + dl = atom->iclusters[ci].bbminz - atom->jclusters[cj].bbmaxz; + dh = atom->jclusters[cj].bbminz - atom->iclusters[ci].bbmaxz; dm = MAX(dl, dh); dm0 = MAX(dm, 0.0); d2 += dm0 * dm0; @@ -192,14 +181,16 @@ MD_FLOAT getBoundingBoxDistanceSq(Atom *atom, int ci, int cj) { } int atomDistanceInRange(Atom *atom, int ci, int cj, MD_FLOAT rsq) { - MD_FLOAT *ciptr = cluster_pos_ptr(ci); - MD_FLOAT *cjptr = cluster_pos_ptr(cj); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; - for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) { - for(int cjj = 0; cjj < atom->clusters[cj].natoms; cjj++) { - MD_FLOAT delx = cluster_x(ciptr, cii) - cluster_x(cjptr, cjj); - MD_FLOAT dely = cluster_y(ciptr, cii) - cluster_y(cjptr, cjj); - MD_FLOAT delz = cluster_z(ciptr, cii) - cluster_z(cjptr, cjj); + for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) { + for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) { + MD_FLOAT delx = ci_x[CL_X_OFFSET + cii] - cj_x[CL_X_OFFSET + cjj]; + MD_FLOAT dely = ci_x[CL_Y_OFFSET + cii] - cj_x[CL_Y_OFFSET + cjj]; + MD_FLOAT delz = ci_x[CL_Z_OFFSET + cii] - cj_x[CL_Z_OFFSET + cjj]; if(delx * delx + dely * dely + delz * delz < rsq) { return 1; } @@ -234,22 +225,22 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { resize = 0; for(int ci = 0; ci < atom->Nclusters_local; ci++) { - int* neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]); + int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]); int n = 0; - int ibin = atom->clusters[ci].bin; - MD_FLOAT ibb_xmin = atom->clusters[ci].bbminx; - MD_FLOAT ibb_xmax = atom->clusters[ci].bbmaxx; - MD_FLOAT ibb_ymin = atom->clusters[ci].bbminy; - MD_FLOAT ibb_ymax = atom->clusters[ci].bbmaxy; - MD_FLOAT ibb_zmin = atom->clusters[ci].bbminz; - MD_FLOAT ibb_zmax = atom->clusters[ci].bbmaxz; + int ibin = icluster_bin[ci]; + MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx; + MD_FLOAT ibb_xmax = atom->iclusters[ci].bbmaxx; + MD_FLOAT ibb_ymin = atom->iclusters[ci].bbminy; + MD_FLOAT ibb_ymax = atom->iclusters[ci].bbmaxy; + MD_FLOAT ibb_zmin = atom->iclusters[ci].bbminz; + MD_FLOAT ibb_zmax = atom->iclusters[ci].bbmaxz; for(int k = 0; k < nstencil; k++) { int jbin = ibin + stencil[k]; - int *loc_bin = &cluster_bins[jbin * clusters_per_bin]; + int *loc_bin = &bin_clusters[jbin * clusters_per_bin]; int cj, m = -1; MD_FLOAT jbb_xmin, jbb_xmax, jbb_ymin, jbb_ymax, jbb_zmin, jbb_zmax; - const int c = cluster_bincount[jbin]; + const int c = bin_nclusters[jbin]; if(c > 0) { MD_FLOAT dl, dh, dm, dm0, d_bb_sq; @@ -257,8 +248,8 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { do { m++; cj = loc_bin[m]; - jbb_zmin = atom->clusters[cj].bbminz; - jbb_zmax = atom->clusters[cj].bbmaxz; + jbb_zmin = atom->jclusters[cj].bbminz; + jbb_zmax = atom->jclusters[cj].bbmaxz; dl = ibb_zmin - jbb_zmax; dh = jbb_zmin - ibb_zmax; dm = MAX(dl, dh); @@ -266,10 +257,10 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { d_bb_sq = dm0 * dm0; } while(m + 1 < c && d_bb_sq > cutneighsq); - jbb_xmin = atom->clusters[cj].bbminx; - jbb_xmax = atom->clusters[cj].bbmaxx; - jbb_ymin = atom->clusters[cj].bbminy; - jbb_ymax = atom->clusters[cj].bbmaxy; + jbb_xmin = atom->jclusters[cj].bbminx; + jbb_xmax = atom->jclusters[cj].bbmaxx; + jbb_ymin = atom->jclusters[cj].bbminy; + jbb_ymax = atom->jclusters[cj].bbmaxy; while(m < c) { dl = ibb_zmin - jbb_zmax; @@ -303,20 +294,21 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { m++; if(m < c) { cj = loc_bin[m]; - jbb_xmin = atom->clusters[cj].bbminx; - jbb_xmax = atom->clusters[cj].bbmaxx; - jbb_ymin = atom->clusters[cj].bbminy; - jbb_ymax = atom->clusters[cj].bbmaxy; - jbb_zmin = atom->clusters[cj].bbminz; - jbb_zmax = atom->clusters[cj].bbmaxz; + jbb_xmin = atom->jclusters[cj].bbminx; + jbb_xmax = atom->jclusters[cj].bbmaxx; + jbb_ymin = atom->jclusters[cj].bbminy; + jbb_ymax = atom->jclusters[cj].bbmaxy; + jbb_zmin = atom->jclusters[cj].bbminz; + jbb_zmax = atom->jclusters[cj].bbmaxz; } } } } - if(CLUSTER_DIM_N > CLUSTER_DIM_M) { - while(n % (CLUSTER_DIM_N / CLUSTER_DIM_M)) { - neighptr[n++] = nall - 1; // Last cluster is always a dummy cluster + // Fill neighbor list with dummy values to fit vector width + if(CLUSTER_N < VECTOR_WIDTH) { + while(n % (VECTOR_WIDTH / CLUSTER_N)) { + neighptr[n++] = CJ0_FROM_CI(nall - 1); // Last cluster is always a dummy cluster } } @@ -353,7 +345,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { atom->clusters[ci].bbminz, atom->clusters[ci].bbmaxz); - for(int cii = 0; cii < CLUSTER_DIM_M; cii++) { + for(int cii = 0; cii < CLUSTER_M; cii++) { DEBUG_MESSAGE("%f, %f, %f\n", cluster_x(ciptr, cii), cluster_y(ciptr, cii), cluster_z(ciptr, cii)); } @@ -371,7 +363,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { atom->clusters[cj].bbminz, atom->clusters[cj].bbmaxz); - for(int cjj = 0; cjj < CLUSTER_DIM_M; cjj++) { + for(int cjj = 0; cjj < CLUSTER_M; cjj++) { DEBUG_MESSAGE(" %f, %f, %f\n", cluster_x(cjptr, cjj), cluster_y(cjptr, cjj), cluster_z(cjptr, cjj)); } } @@ -393,8 +385,8 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) { int k = 0; // Remove dummy clusters if necessary - if(CLUSTER_DIM_N > CLUSTER_DIM_M) { - while(neighs[numneighs - 1] == nall - 1) { + if(CLUSTER_N < VECTOR_WIDTH) { + while(neighs[numneighs - 1] == CJ0_FROM_CI(nall - 1)) { numneighs--; } } @@ -410,9 +402,9 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) { } // Readd dummy clusters if necessary - if(CLUSTER_DIM_N > CLUSTER_DIM_M) { - while(numneighs % (CLUSTER_DIM_N / CLUSTER_DIM_M)) { - neighs[numneighs++] = nall - 1; // Last cluster is always a dummy cluster + if(CLUSTER_N < VECTOR_WIDTH) { + while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) { + neighs[numneighs++] = CJ0_FROM_CI(nall - 1); // Last cluster is always a dummy cluster } } @@ -558,34 +550,37 @@ void buildClusters(Atom *atom) { for(int bin = 0; bin < mbins; bin++) { int c = bincount[bin]; int ac = 0; - const int nclusters = ((c + CLUSTER_DIM_M - 1) / CLUSTER_DIM_M); + int nclusters = ((c + CLUSTER_M - 1) / CLUSTER_M); + if(CLUSTER_N > CLUSTER_M && nclusters % 2) { nclusters++; } for(int cl = 0; cl < nclusters; cl++) { const int ci = atom->Nclusters_local; - if(ci >= atom->Nclusters_max) { growClusters(atom); } - MD_FLOAT *cptr = cluster_pos_ptr(ci); - MD_FLOAT *cvptr = cluster_velocity_ptr(ci); + int ci_sca_base = CI_SCALAR_BASE_INDEX(ci); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base]; + int *ci_type = &atom->cl_type[ci_sca_base]; MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY; MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY; MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY; - atom->clusters[ci].natoms = 0; - for(int cii = 0; cii < CLUSTER_DIM_M; cii++) { + atom->iclusters[ci].natoms = 0; + for(int cii = 0; cii < CLUSTER_M; cii++) { if(ac < c) { int i = bins[bin * atoms_per_bin + ac]; MD_FLOAT xtmp = atom_x(i); MD_FLOAT ytmp = atom_y(i); MD_FLOAT ztmp = atom_z(i); - cluster_x(cptr, cii) = xtmp; - cluster_y(cptr, cii) = ytmp; - cluster_z(cptr, cii) = ztmp; - cluster_x(cvptr, cii) = atom->vx[i]; - cluster_y(cvptr, cii) = atom->vy[i]; - cluster_z(cvptr, cii) = atom->vz[i]; + ci_x[CL_X_OFFSET + cii] = xtmp; + ci_x[CL_Y_OFFSET + cii] = ytmp; + ci_x[CL_Z_OFFSET + cii] = ztmp; + ci_v[CL_X_OFFSET + cii] = atom->vx[i]; + ci_v[CL_Y_OFFSET + cii] = atom->vy[i]; + ci_v[CL_Z_OFFSET + cii] = atom->vz[i]; // TODO: To create the bounding boxes faster, we can use SIMD operations if(bbminx > xtmp) { bbminx = xtmp; } @@ -595,24 +590,24 @@ void buildClusters(Atom *atom) { if(bbminz > ztmp) { bbminz = ztmp; } if(bbmaxz < ztmp) { bbmaxz = ztmp; } - atom->clusters[ci].type[cii] = atom->type[i]; - atom->clusters[ci].natoms++; + atom->cl_type[cii] = atom->type[i]; + atom->iclusters[ci].natoms++; } else { - cluster_x(cptr, cii) = INFINITY; - cluster_y(cptr, cii) = INFINITY; - cluster_z(cptr, cii) = INFINITY; + ci_x[CL_X_OFFSET + cii] = INFINITY; + ci_x[CL_Y_OFFSET + cii] = INFINITY; + ci_x[CL_Z_OFFSET + cii] = INFINITY; } ac++; } - atom->clusters[ci].bin = bin; - atom->clusters[ci].bbminx = bbminx; - atom->clusters[ci].bbmaxx = bbmaxx; - atom->clusters[ci].bbminy = bbminy; - atom->clusters[ci].bbmaxy = bbmaxy; - atom->clusters[ci].bbminz = bbminz; - atom->clusters[ci].bbmaxz = bbmaxz; + icluster_bin[ci] = bin; + atom->iclusters[ci].bbminx = bbminx; + atom->iclusters[ci].bbmaxx = bbmaxx; + atom->iclusters[ci].bbminy = bbminy; + atom->iclusters[ci].bbmaxy = bbmaxy; + atom->iclusters[ci].bbminz = bbminz; + atom->iclusters[ci].bbmaxz = bbmaxz; atom->Nclusters_local++; } } @@ -620,6 +615,93 @@ void buildClusters(Atom *atom) { DEBUG_MESSAGE("buildClusters end\n"); } +void defineJClusters(Atom *atom) { + DEBUG_MESSAGE("defineJClusters start\n"); + + for(int ci = 0; ci < atom->Nclusters_local + atom->Nclusters_ghost; ci++) { + int cj0 = CJ0_FROM_CI(ci); + + if(CLUSTER_M == CLUSTER_N) { + atom->jclusters[cj0].bbminx = atom->iclusters[ci].bbminx; + atom->jclusters[cj0].bbmaxx = atom->iclusters[ci].bbmaxx; + atom->jclusters[cj0].bbminy = atom->iclusters[ci].bbminy; + atom->jclusters[cj0].bbmaxy = atom->iclusters[ci].bbmaxy; + atom->jclusters[cj0].bbminz = atom->iclusters[ci].bbminz; + atom->jclusters[cj0].bbmaxz = atom->iclusters[ci].bbmaxz; + atom->jclusters[cj0].natoms = atom->iclusters[ci].natoms; + + } else if(CLUSTER_M > CLUSTER_N) { + int cj1 = CJ1_FROM_CI(ci); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY; + MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY; + MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY; + + for(int cii = 0; cii < MAX(atom->iclusters[ci].natoms, CLUSTER_N); cii++) { + MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii]; + MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii]; + MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii]; + + // TODO: To create the bounding boxes faster, we can use SIMD operations + if(bbminx > xtmp) { bbminx = xtmp; } + if(bbmaxx < xtmp) { bbmaxx = xtmp; } + if(bbminy > ytmp) { bbminy = ytmp; } + if(bbmaxy < ytmp) { bbmaxy = ytmp; } + if(bbminz > ztmp) { bbminz = ztmp; } + if(bbmaxz < ztmp) { bbmaxz = ztmp; } + } + + atom->jclusters[cj0].bbminx = bbminx; + atom->jclusters[cj0].bbmaxx = bbmaxx; + atom->jclusters[cj0].bbminy = bbminy; + atom->jclusters[cj0].bbmaxy = bbmaxy; + atom->jclusters[cj0].bbminz = bbminz; + atom->jclusters[cj0].bbmaxz = bbmaxz; + atom->jclusters[cj0].natoms = MAX(atom->iclusters[ci].natoms, CLUSTER_N); + + bbminx = INFINITY, bbmaxx = -INFINITY; + bbminy = INFINITY, bbmaxy = -INFINITY; + bbminz = INFINITY, bbmaxz = -INFINITY; + + for(int cii = CLUSTER_N; cii < atom->iclusters[ci].natoms; cii++) { + MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii]; + MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii]; + MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii]; + + // TODO: To create the bounding boxes faster, we can use SIMD operations + if(bbminx > xtmp) { bbminx = xtmp; } + if(bbmaxx < xtmp) { bbmaxx = xtmp; } + if(bbminy > ytmp) { bbminy = ytmp; } + if(bbmaxy < ytmp) { bbmaxy = ytmp; } + if(bbminz > ztmp) { bbminz = ztmp; } + if(bbmaxz < ztmp) { bbmaxz = ztmp; } + } + + atom->jclusters[cj1].bbminx = bbminx; + atom->jclusters[cj1].bbmaxx = bbmaxx; + atom->jclusters[cj1].bbminy = bbminy; + atom->jclusters[cj1].bbmaxy = bbmaxy; + atom->jclusters[cj1].bbminz = bbminz; + atom->jclusters[cj1].bbmaxz = bbmaxz; + atom->jclusters[cj1].natoms = MIN(0, atom->iclusters[ci].natoms - CLUSTER_N); + + } else { + if(ci % 2 == 0) { + atom->jclusters[cj0].bbminx = MIN(atom->iclusters[ci].bbminx, atom->iclusters[ci + 1].bbminx); + atom->jclusters[cj0].bbmaxx = MAX(atom->iclusters[ci].bbmaxx, atom->iclusters[ci + 1].bbmaxx); + atom->jclusters[cj0].bbminy = MIN(atom->iclusters[ci].bbminy, atom->iclusters[ci + 1].bbminy); + atom->jclusters[cj0].bbmaxy = MAX(atom->iclusters[ci].bbmaxy, atom->iclusters[ci + 1].bbmaxy); + atom->jclusters[cj0].bbminz = MIN(atom->iclusters[ci].bbminz, atom->iclusters[ci + 1].bbminz); + atom->jclusters[cj0].bbmaxz = MAX(atom->iclusters[ci].bbmaxz, atom->iclusters[ci + 1].bbmaxz); + atom->jclusters[cj0].natoms = atom->iclusters[ci].natoms + atom->iclusters[ci + 1].natoms; + } + } + } + + DEBUG_MESSAGE("defineJClusters end\n"); +} + void binClusters(Atom *atom) { DEBUG_MESSAGE("binClusters start\n"); @@ -629,7 +711,7 @@ void binClusters(Atom *atom) { MD_FLOAT *cptr = cluster_pos_ptr(ci); DEBUG_MESSAGE("Cluster %d:\n", ci); DEBUG_MESSAGE("bin=%d, Natoms=%d, bbox={%f,%f},{%f,%f},{%f,%f}\n", - atom->clusters[ci].bin, + icluster_bin[ci], atom->clusters[ci].natoms, atom->clusters[ci].bbminx, atom->clusters[ci].bbmaxx, @@ -638,104 +720,133 @@ void binClusters(Atom *atom) { atom->clusters[ci].bbminz, atom->clusters[ci].bbmaxz); - for(int cii = 0; cii < CLUSTER_DIM_M; cii++) { + for(int cii = 0; cii < CLUSTER_M; cii++) { DEBUG_MESSAGE("%f, %f, %f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii)); } } */ const int nlocal = atom->Nclusters_local; - int resize = 1; + const int ncji = MAX(1, CLUSTER_M / CLUSTER_N); + if(ncji > 2) { + fprintf(stderr, "binClusters(): ncji cannot be higher than 2!"); + } + defineJClusters(atom); + + int resize = 1; while(resize > 0) { resize = 0; for(int bin = 0; bin < mbins; bin++) { - cluster_bincount[bin] = 0; + bin_nclusters[bin] = 0; } for(int ci = 0; ci < nlocal && !resize; ci++) { - int bin = atom->clusters[ci].bin; - int c = cluster_bincount[bin]; - if(c < clusters_per_bin) { - cluster_bins[bin * clusters_per_bin + c] = ci; - cluster_bincount[bin]++; - } else { - resize = 1; + // Assure we add this j-cluster only once in the bin + if(!(CLUSTER_M < CLUSTER_N && ci % 2)) { + int bin = icluster_bin[ci]; + int c = bin_nclusters[bin]; + if(c + 1 < clusters_per_bin) { + bin_clusters[bin * clusters_per_bin + c] = CJ0_FROM_CI(ci); + bin_nclusters[bin]++; + + if(CLUSTER_M > CLUSTER_N) { + int cj1 = CJ1_FROM_CI(ci); + if(atoms->jclusters[cj1].natoms > 0) { + bin_clusters[bin * clusters_per_bin + c + 1] = cj1; + bin_nclusters[bin]++; + } + } + } else { + resize = 1; + } } } for(int cg = 0; cg < atom->Nclusters_ghost && !resize; cg++) { - const int ci = nlocal + cg; - MD_FLOAT *cptr = cluster_pos_ptr(ci); - MD_FLOAT ci_minz = atom->clusters[ci].bbminz; - MD_FLOAT xtmp, ytmp; - int ix = -1, iy = -1; + // Assure we add this j-cluster only once in the bin + if(!(CLUSTER_M < CLUSTER_N && cg % 2)) { + const int ci = nlocal + cg; + int ix = -1, iy = -1; + MD_FLOAT xtmp, ytmp; - xtmp = cluster_x(cptr, 0); - ytmp = cluster_y(cptr, 0); - coord2bin2D(xtmp, ytmp, &ix, &iy); - ix = MAX(MIN(ix, mbinx - 1), 0); - iy = MAX(MIN(iy, mbiny - 1), 0); - for(int cii = 1; cii < atom->clusters[ci].natoms; cii++) { - int nix, niy; - xtmp = cluster_x(cptr, cii); - ytmp = cluster_y(cptr, cii); - coord2bin2D(xtmp, ytmp, &nix, &niy); - nix = MAX(MIN(nix, mbinx - 1), 0); - niy = MAX(MIN(niy, mbiny - 1), 0); + for(int cji = 0; cji < ncji && !resize; cji++) { + int cj = (cji == 0) ? CJ0_FROM_CI(ci) : CJ1_FROM_CI(ci); + const int n = atom->jclusters[cj].natoms; - // Always put the cluster on the bin of its innermost atom so - // the cluster should be closer to local clusters - if(atom->PBCx[cg] > 0 && ix > nix) { ix = nix; } - if(atom->PBCx[cg] < 0 && ix < nix) { ix = nix; } - if(atom->PBCy[cg] > 0 && iy > niy) { iy = niy; } - if(atom->PBCy[cg] < 0 && iy < niy) { iy = niy; } - } + if(n > 0) { + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; + MD_FLOAT cj_minz = atom->jclusters[cj].bbminz; - int bin = iy * mbinx + ix + 1; - int c = cluster_bincount[bin]; - if(c < clusters_per_bin) { - // Insert the current ghost cluster in the bin keeping clusters - // sorted by z coordinate - int inserted = 0; - for(int i = 0; i < c; i++) { - int last_cl = cluster_bins[bin * clusters_per_bin + i]; - if(atom->clusters[last_cl].bbminz > ci_minz) { - cluster_bins[bin * clusters_per_bin + i] = ci; + xtmp = cj_x[CL_X_OFFSET + 0] + ytmp = cj_x[CL_Y_OFFSET + 0] + coord2bin2D(xtmp, ytmp, &ix, &iy); + ix = MAX(MIN(ix, mbinx - 1), 0); + iy = MAX(MIN(iy, mbiny - 1), 0); + for(int cjj = 1; cjj < n; cjj++) { + int nix, niy; + xtmp = cj_x[CL_X_OFFSET + cjj] + ytmp = cj_x[CL_Y_OFFSET + cjj] + coord2bin2D(xtmp, ytmp, &nix, &niy); + nix = MAX(MIN(nix, mbinx - 1), 0); + niy = MAX(MIN(niy, mbiny - 1), 0); - for(int j = i + 1; j <= c; j++) { - int tmp = cluster_bins[bin * clusters_per_bin + j]; - cluster_bins[bin * clusters_per_bin + j] = last_cl; - last_cl = tmp; + // Always put the cluster on the bin of its innermost atom so + // the cluster should be closer to local clusters + if(atom->PBCx[cg] > 0 && ix > nix) { ix = nix; } + if(atom->PBCx[cg] < 0 && ix < nix) { ix = nix; } + if(atom->PBCy[cg] > 0 && iy > niy) { iy = niy; } + if(atom->PBCy[cg] < 0 && iy < niy) { iy = niy; } } - inserted = 1; - break; + int bin = iy * mbinx + ix + 1; + int c = bin_nclusters[bin]; + if(c < clusters_per_bin) { + // Insert the current ghost cluster in the bin keeping clusters + // sorted by z coordinate + int inserted = 0; + for(int i = 0; i < c; i++) { + int last_cl = bin_clusters[bin * clusters_per_bin + i]; + if(atom->jclusters[last_cl].bbminz > cj_minz) { + bin_clusters[bin * clusters_per_bin + i] = cj; + + for(int j = i + 1; j <= c; j++) { + int tmp = bin_clusters[bin * clusters_per_bin + j]; + bin_clusters[bin * clusters_per_bin + j] = last_cl; + last_cl = tmp; + } + + inserted = 1; + break; + } + } + + if(!inserted) { + bin_clusters[bin * clusters_per_bin + c] = cj; + } + + icluster_bin[ci] = bin; + bin_nclusters[bin]++; + } else { + resize = 1; + } } } - - if(!inserted) { - cluster_bins[bin * clusters_per_bin + c] = ci; - } - - atom->clusters[ci].bin = bin; - cluster_bincount[bin]++; - } else { - resize = 1; } } if(resize) { - free(cluster_bins); + free(bin_clusters); clusters_per_bin *= 2; - cluster_bins = (int*) malloc(mbins * clusters_per_bin * sizeof(int)); + bin_clusters = (int*) malloc(mbins * clusters_per_bin * sizeof(int)); } } /* - DEBUG_MESSAGE("cluster_bincount\n"); - for(int i = 0; i < mbins; i++) { DEBUG_MESSAGE("%d, ", cluster_bincount[i]); } + DEBUG_MESSAGE("bin_nclusters\n"); + for(int i = 0; i < mbins; i++) { DEBUG_MESSAGE("%d, ", bin_nclusters[i]); } DEBUG_MESSAGE("\n"); */ @@ -747,16 +858,17 @@ void updateSingleAtoms(Atom *atom) { int Natom = 0; for(int ci = 0; ci < atom->Nclusters_local; ci++) { - MD_FLOAT *cptr = cluster_pos_ptr(ci); - MD_FLOAT *cvptr = cluster_velocity_ptr(ci); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base]; for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) { - atom_x(Natom) = cluster_x(cptr, cii); - atom_y(Natom) = cluster_y(cptr, cii); - atom_z(Natom) = cluster_z(cptr, cii); - atom->vx[Natom] = cluster_x(cvptr, cii); - atom->vy[Natom] = cluster_y(cvptr, cii); - atom->vz[Natom] = cluster_z(cvptr, cii); + atom_x(Natom) = ci_x[CL_X_OFFSET + cii]; + atom_y(Natom) = ci_x[CL_Y_OFFSET + cii]; + atom_z(Natom) = ci_x[CL_Z_OFFSET + cii]; + atom->vx[Natom] = ci_v[CL_X_OFFSET + cii]; + atom->vy[Natom] = ci_v[CL_Y_OFFSET + cii]; + atom->vz[Natom] = ci_v[CL_Z_OFFSET + cii]; Natom++; } } diff --git a/gromacs/pbc.c b/gromacs/pbc.c index acb203e..9632e36 100644 --- a/gromacs/pbc.c +++ b/gromacs/pbc.c @@ -53,20 +53,22 @@ void updatePbc(Atom *atom, Parameter *param, int firstUpdate) { for(int cg = 0; cg < atom->Nclusters_ghost; cg++) { const int ci = nlocal + cg; - MD_FLOAT *cptr = cluster_pos_ptr(ci); - MD_FLOAT *bmap_cptr = cluster_pos_ptr(border_map[cg]); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + int bmap_vec_base = CI_VECTOR_BASE_INDEX(border_map[cg]); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + MD_FLOAT *bmap_x = &atom->cl_x[bmap_vec_base]; MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY; MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY; MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY; for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) { - MD_FLOAT xtmp = cluster_x(bmap_cptr, cii) + atom->PBCx[cg] * xprd; - MD_FLOAT ytmp = cluster_y(bmap_cptr, cii) + atom->PBCy[cg] * yprd; - MD_FLOAT ztmp = cluster_z(bmap_cptr, cii) + atom->PBCz[cg] * zprd; + MD_FLOAT xtmp = bmap_x[CL_X_OFFSET + cii] + atom->PBCx[cg] * xprd; + MD_FLOAT ytmp = bmap_x[CL_Y_OFFSET + cii] + atom->PBCy[cg] * yprd; + MD_FLOAT ztmp = bmap_x[CL_Z_OFFSET + cii] + atom->PBCz[cg] * zprd; - cluster_x(cptr, cii) = xtmp; - cluster_y(cptr, cii) = ytmp; - cluster_z(cptr, cii) = ztmp; + ci_x[CL_X_OFFSET + cii] = xtmp; + ci_x[CL_Y_OFFSET + cii] = ytmp; + ci_x[CL_Z_OFFSET + cii] = ztmp; if(firstUpdate) { // TODO: To create the bounding boxes faster, we can use SIMD operations @@ -80,18 +82,18 @@ void updatePbc(Atom *atom, Parameter *param, int firstUpdate) { } if(firstUpdate) { - for(int cii = atom->clusters[ci].natoms; cii < CLUSTER_DIM_M; cii++) { - cluster_x(cptr, cii) = INFINITY; - cluster_y(cptr, cii) = INFINITY; - cluster_z(cptr, cii) = INFINITY; + for(int cii = atom->clusters[ci].natoms; cii < CLUSTER_M; cii++) { + ci_x[CL_X_OFFSET + cii] = INFINITY; + ci_x[CL_Y_OFFSET + cii] = INFINITY; + ci_x[CL_Z_OFFSET + cii] = INFINITY; } - atom->clusters[ci].bbminx = bbminx; - atom->clusters[ci].bbmaxx = bbmaxx; - atom->clusters[ci].bbminy = bbminy; - atom->clusters[ci].bbmaxy = bbmaxy; - atom->clusters[ci].bbminz = bbminz; - atom->clusters[ci].bbmaxz = bbmaxz; + atom->iclusters[ci].bbminx = bbminx; + atom->iclusters[ci].bbmaxx = bbmaxx; + atom->iclusters[ci].bbminy = bbminy; + atom->iclusters[ci].bbmaxy = bbmaxy; + atom->iclusters[ci].bbminz = bbminz; + atom->iclusters[ci].bbmaxz = bbmaxz; } } } @@ -131,14 +133,17 @@ void updateAtomsPbc(Atom *atom, Parameter *param) { #define ADDGHOST(dx,dy,dz); \ Nghost++; \ const int g_atom_idx = atom->Nclusters_local + Nghost; \ + const int natoms_ci = atom->iclusters[ci].natoms; \ border_map[Nghost] = ci; \ atom->PBCx[Nghost] = dx; \ atom->PBCy[Nghost] = dy; \ atom->PBCz[Nghost] = dz; \ - atom->clusters[g_atom_idx].natoms = atom->clusters[ci].natoms; \ - Nghost_atoms += atom->clusters[g_atom_idx].natoms; \ - for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) { \ - atom->clusters[g_atom_idx].type[cii] = atom->clusters[ci].type[cii]; \ + atom->iclusters[g_atom_idx].natoms = natoms_ci; \ + Nghost_atoms += natoms_ci; \ + int ci_sca_base = CI_SCALAR_BASE_INDEX(ci); \ + int cg_sca_base = CI_SCALAR_BASE_INDEX(cg); \ + for(int cii = 0; cii < natoms_ci; cii++) { \ + atom->cl_type[cg_sca_base + cii] = atom->cl_type[ci_sca_base + cii]; \ } /* internal subroutines */ @@ -171,12 +176,12 @@ void setupPbc(Atom *atom, Parameter *param) { border_map = atom->border_map; } - MD_FLOAT bbminx = atom->clusters[ci].bbminx; - MD_FLOAT bbmaxx = atom->clusters[ci].bbmaxx; - MD_FLOAT bbminy = atom->clusters[ci].bbminy; - MD_FLOAT bbmaxy = atom->clusters[ci].bbmaxy; - MD_FLOAT bbminz = atom->clusters[ci].bbminz; - MD_FLOAT bbmaxz = atom->clusters[ci].bbmaxz; + MD_FLOAT bbminx = atom->iclusters[ci].bbminx; + MD_FLOAT bbmaxx = atom->iclusters[ci].bbmaxx; + MD_FLOAT bbminy = atom->iclusters[ci].bbminy; + MD_FLOAT bbmaxy = atom->iclusters[ci].bbmaxy; + MD_FLOAT bbminz = atom->iclusters[ci].bbminz; + MD_FLOAT bbmaxz = atom->iclusters[ci].bbmaxz; /* Setup ghost atoms */ /* 6 planes */ @@ -210,16 +215,17 @@ void setupPbc(Atom *atom, Parameter *param) { if (bbmaxy >= (yprd-Cutneigh) && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,-1,0); } } - if(atom->Nclusters_local + Nghost + 1 >= atom->Nclusters_max) { + if(atom->Nclusters_local + Nghost + 2 >= atom->Nclusters_max) { growClusters(atom); } // Add dummy cluster at the end - MD_FLOAT *cptr = cluster_pos_ptr(atom->Nclusters_local + Nghost + 1); - for(int cii = 0; cii < CLUSTER_DIM_M; cii++) { - cluster_x(cptr, cii) = INFINITY; - cluster_y(cptr, cii) = INFINITY; - cluster_z(cptr, cii) = INFINITY; + int ci_vec_base = CI_VECTOR_BASE_INDEX(atom->Nclusters_local + Nghost + 1); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + for(int cii = 0; cii < MAX(CLUSTER_M, CLUSTER_N); cii++) { + ci_x[CL_X_OFFSET + cii] = INFINITY; + ci_x[CL_Y_OFFSET + cii] = INFINITY; + ci_x[CL_Z_OFFSET + cii] = INFINITY; } // increase by one to make it the ghost atom count diff --git a/include_ISA.mk b/include_ISA.mk new file mode 100644 index 0000000..215f626 --- /dev/null +++ b/include_ISA.mk @@ -0,0 +1,15 @@ +ifeq ($(strip $(ISA)), SSE) +VECTOR_WIDTH=2 +else ifeq ($(strip $(ISA)), AVX) +# Vector width is 4 but AVX2 instruction set is not supported +NO_AVX2=true +VECTOR_WIDTH=4 +else ifeq ($(strip $(ISA)), AVX2) +VECTOR_WIDTH=4 +else ifeq ($(strip $(ISA)), AVX512) +VECTOR_WIDTH=8 +endif + +ifeq ($(strip $(DATA_TYPE)), SP) +VECTOR_WIDTH=$((VECTOR_WIDTH * 2)) +endif