From 22d0f0b958203f1792a6a295d6d6bc1c1a32906c Mon Sep 17 00:00:00 2001 From: Rafael Ravedutti Date: Thu, 10 Mar 2022 01:31:50 +0100 Subject: [PATCH] Commit version that works for M=N Signed-off-by: Rafael Ravedutti --- gromacs/atom.c | 4 ++++ gromacs/force_lj.c | 30 ++++++++++++++++++++---------- gromacs/includes/atom.h | 14 +++++++++++++- gromacs/main.c | 8 ++------ gromacs/neighbor.c | 15 +++++++-------- gromacs/pbc.c | 5 +++++ 6 files changed, 51 insertions(+), 25 deletions(-) diff --git a/gromacs/atom.c b/gromacs/atom.c index 024363f..b96a91e 100644 --- a/gromacs/atom.c +++ b/gromacs/atom.c @@ -37,6 +37,7 @@ void initAtom(Atom *atom) { atom->cl_x = NULL; atom->cl_v = NULL; atom->cl_f = NULL; + atom->cl_type = NULL; atom->Natoms = 0; atom->Nlocal = 0; atom->Nghost = 0; @@ -53,6 +54,7 @@ void initAtom(Atom *atom) { atom->cutneighsq = NULL; atom->iclusters = NULL; atom->jclusters = NULL; + atom->icluster_bin = NULL; } void createAtom(Atom *atom, Parameter *param) { @@ -431,7 +433,9 @@ void growClusters(Atom *atom) { atom->Nclusters_max += DELTA; atom->iclusters = (Cluster*) reallocate(atom->iclusters, ALIGNMENT, atom->Nclusters_max * sizeof(Cluster), nold * sizeof(Cluster)); atom->jclusters = (Cluster*) reallocate(atom->jclusters, ALIGNMENT, atom->Nclusters_max * jfac * sizeof(Cluster), nold * jfac * sizeof(Cluster)); + atom->icluster_bin = (int*) reallocate(atom->icluster_bin, ALIGNMENT, atom->Nclusters_max * sizeof(int), nold * sizeof(int)); atom->cl_x = (MD_FLOAT*) reallocate(atom->cl_x, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT)); atom->cl_f = (MD_FLOAT*) reallocate(atom->cl_f, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT)); atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT)); + atom->cl_type = (int*) reallocate(atom->cl_type, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * sizeof(int), nold * CLUSTER_M * sizeof(int)); } diff --git a/gromacs/force_lj.c b/gromacs/force_lj.c index d749bb1..03d1926 100644 --- a/gromacs/force_lj.c +++ b/gromacs/force_lj.c @@ -293,13 +293,15 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_SIMD_FLOAT epsilon_vec = simd_broadcast(epsilon); MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0); MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5); - const int unroll_j = VECTOR_WIDTH / CLUSTER_N; - double S = getTimeStamp(); LIKWID_MARKER_START("force"); #pragma omp parallel for for(int ci = 0; ci < atom->Nclusters_local; ci++) { + int ci_cj0 = CJ0_FROM_CI(ci); + #if CLUSTER_M > CLUSTER_N + int ci_cj1 = CJ1_FROM_CI(ci); + #endif int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; @@ -331,11 +333,9 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_SIMD_FLOAT fiy3 = simd_zero(); MD_SIMD_FLOAT fiz3 = simd_zero(); - for(int k = 0; k < numneighs; k += unroll_j) { + for(int k = 0; k < numneighs; k++) { int cj = neighs[k + 0]; int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); - unsigned int cond0 = (unsigned int)(ci == cj); - unsigned int cond1 = cond0; MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]); MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]); @@ -354,16 +354,26 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp); MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp); - #if VECTOR_WIDTH == 8 + #if CLUSTER_M == CLUSTER_N + int cond0 = (unsigned int)(cj == ci_cj0); + MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0)); + MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0)); + MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0)); + MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0)); + #elif CLUSTER_M < CLUSTER_N + int cond0 = (unsigned int)(cj == (ci << 1) + 0); + int cond1 = (unsigned int)(cj == (ci << 1) + 1); MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1)); MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1)); MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1)); MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1)); #else - MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0)); - MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0)); - MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0)); - MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0)); + unsigned int cond0 = (unsigned int)(cj == ci_cj0); + unsigned int cond1 = (unsigned int)(cj == ci_cj1); + MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond0)); + MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond0)); + MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond1)); + MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond1)); #endif MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0))); diff --git a/gromacs/includes/atom.h b/gromacs/includes/atom.h index 2373d6b..0eff9f5 100644 --- a/gromacs/includes/atom.h +++ b/gromacs/includes/atom.h @@ -33,12 +33,23 @@ // Simd4xN: M=4, N=VECTOR_WIDTH // Simd2xNN: M=4, N=(VECTOR_WIDTH/2) -// Simd2XNN (here used for single-precision) +// Simd2xNN (here used for single-precision) #if VECTOR_WIDTH > CLUSTER_M * 2 +# define KERNEL_NAME "Simd2xNN" # define CLUSTER_N (VECTOR_WIDTH / 2) +# define computeForceLJ computeForceLJ_2xnn // Simd4xN #else +# define KERNEL_NAME "Simd4xN" # define CLUSTER_N VECTOR_WIDTH +# define computeForceLJ computeForceLJ_4xn +#endif + +#ifdef USE_REFERENCE_VERSION +# undef KERNEL_NAME +# undef computeForceLJ +# define KERNEL_NAME "Reference" +# define computeForceLJ computeForceLJ_ref #endif #if CLUSTER_M == CLUSTER_N @@ -105,6 +116,7 @@ typedef struct { MD_FLOAT *cl_f; int *cl_type; Cluster *iclusters, *jclusters; + int *icluster_bin; } Atom; extern void initAtom(Atom*); diff --git a/gromacs/main.c b/gromacs/main.c index ade8f65..a4eda58 100644 --- a/gromacs/main.c +++ b/gromacs/main.c @@ -43,14 +43,9 @@ extern double computeForceLJ_ref(Parameter*, Atom*, Neighbor*, Stats*); extern double computeForceLJ_4xn(Parameter*, Atom*, Neighbor*, Stats*); +extern double computeForceLJ_2xnn(Parameter*, Atom*, Neighbor*, Stats*); extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*); -#ifdef USE_REFERENCE_VERSION -# define computeForceLJ computeForceLJ_ref -#else -# define computeForceLJ computeForceLJ_4xn -#endif - double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) { if(param->force_field == FF_EAM) { initEam(eam, param); } double S, E; @@ -325,6 +320,7 @@ int main(int argc, char** argv) { } printf(HLINE); + printf("Kernel: %s, MxN: %dx%d, Vector width: %d\n", KERNEL_NAME, CLUSTER_M, CLUSTER_N, VECTOR_WIDTH); printf("Data layout for positions: %s\n", POS_DATA_LAYOUT); #if PRECISION == 1 printf("Using single precision floating point.\n"); diff --git a/gromacs/neighbor.c b/gromacs/neighbor.c index 557a9bb..9634157 100644 --- a/gromacs/neighbor.c +++ b/gromacs/neighbor.c @@ -41,7 +41,6 @@ static int *bincount; static int *bins; static int *bin_nclusters; static int *bin_clusters; -static int *icluster_bin; static int mbins; //total number of bins static int atoms_per_bin; // max atoms per bin static int clusters_per_bin; // max clusters per bin @@ -64,7 +63,7 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) { cutneigh = param->cutneigh; nmax = 0; atoms_per_bin = 8; - clusters_per_bin = (atoms_per_bin / CLUSTER_M) + 4; + clusters_per_bin = (atoms_per_bin / CLUSTER_M) + 10; stencil = NULL; bins = NULL; bincount = NULL; @@ -227,7 +226,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { for(int ci = 0; ci < atom->Nclusters_local; ci++) { int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]); int n = 0; - int ibin = icluster_bin[ci]; + int ibin = atom->icluster_bin[ci]; MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx; MD_FLOAT ibb_xmax = atom->iclusters[ci].bbmaxx; MD_FLOAT ibb_ymin = atom->iclusters[ci].bbminy; @@ -590,7 +589,7 @@ void buildClusters(Atom *atom) { if(bbminz > ztmp) { bbminz = ztmp; } if(bbmaxz < ztmp) { bbmaxz = ztmp; } - atom->cl_type[cii] = atom->type[i]; + ci_type[cii] = atom->type[i]; atom->iclusters[ci].natoms++; } else { ci_x[CL_X_OFFSET + cii] = INFINITY; @@ -601,7 +600,7 @@ void buildClusters(Atom *atom) { ac++; } - icluster_bin[ci] = bin; + atom->icluster_bin[ci] = bin; atom->iclusters[ci].bbminx = bbminx; atom->iclusters[ci].bbmaxx = bbmaxx; atom->iclusters[ci].bbminy = bbminy; @@ -711,7 +710,7 @@ void binClusters(Atom *atom) { MD_FLOAT *cptr = cluster_pos_ptr(ci); DEBUG_MESSAGE("Cluster %d:\n", ci); DEBUG_MESSAGE("bin=%d, Natoms=%d, bbox={%f,%f},{%f,%f},{%f,%f}\n", - icluster_bin[ci], + atom->icluster_bin[ci], atom->clusters[ci].natoms, atom->clusters[ci].bbminx, atom->clusters[ci].bbmaxx, @@ -745,7 +744,7 @@ void binClusters(Atom *atom) { for(int ci = 0; ci < nlocal && !resize; ci++) { // Assure we add this j-cluster only once in the bin if(!(CLUSTER_M < CLUSTER_N && ci % 2)) { - int bin = icluster_bin[ci]; + int bin = atom->icluster_bin[ci]; int c = bin_nclusters[bin]; if(c + 1 < clusters_per_bin) { bin_clusters[bin * clusters_per_bin + c] = CJ0_FROM_CI(ci); @@ -827,7 +826,7 @@ void binClusters(Atom *atom) { bin_clusters[bin * clusters_per_bin + c] = cj; } - icluster_bin[ci] = bin; + atom->icluster_bin[ci] = bin; bin_nclusters[bin]++; } else { resize = 1; diff --git a/gromacs/pbc.c b/gromacs/pbc.c index d1216ff..03d669b 100644 --- a/gromacs/pbc.c +++ b/gromacs/pbc.c @@ -46,6 +46,7 @@ void initPbc(Atom* atom) { /* update coordinates of ghost atoms */ /* uses mapping created in setupPbc */ void updatePbc(Atom *atom, Parameter *param, int firstUpdate) { + DEBUG_MESSAGE("updatePbc start\n"); int *border_map = atom->border_map; int nlocal = atom->Nclusters_local; MD_FLOAT xprd = param->xprd; @@ -97,6 +98,8 @@ void updatePbc(Atom *atom, Parameter *param, int firstUpdate) { atom->iclusters[ci].bbmaxz = bbmaxz; } } + + DEBUG_MESSAGE("updatePbc end\n"); } /* relocate atoms that have left domain according @@ -159,6 +162,7 @@ void growPbc(Atom* atom) { } void setupPbc(Atom *atom, Parameter *param) { + DEBUG_MESSAGE("setupPbc start\n"); int *border_map = atom->border_map; MD_FLOAT xprd = param->xprd; MD_FLOAT yprd = param->yprd; @@ -236,4 +240,5 @@ void setupPbc(Atom *atom, Parameter *param) { // Update created ghost clusters positions updatePbc(atom, param, 1); + DEBUG_MESSAGE("setupPbc end\n"); }