From 85f1484449b415483e530fa5a2977f264257df25 Mon Sep 17 00:00:00 2001 From: Rafael Ravedutti Date: Tue, 28 Mar 2023 18:04:18 +0200 Subject: [PATCH] Specialize force kernel when there are no masks to be checked Signed-off-by: Rafael Ravedutti --- gromacs/force_lj.c | 55 ++++++++++++++++++++++++++++++++++++- gromacs/includes/neighbor.h | 1 + gromacs/neighbor.c | 23 ++++++++++++++-- 3 files changed, 75 insertions(+), 4 deletions(-) diff --git a/gromacs/force_lj.c b/gromacs/force_lj.c index 4070ae1..4655629 100644 --- a/gromacs/force_lj.c +++ b/gromacs/force_lj.c @@ -220,6 +220,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; neighs = &neighbor->neighbors[ci * neighbor->maxneighs]; int numneighs = neighbor->numneigh[ci]; + int numneighs_masked = neighbor->numneigh_masked[ci]; MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]); MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]); @@ -234,7 +235,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor MD_SIMD_FLOAT fiy2 = simd_zero(); MD_SIMD_FLOAT fiz2 = simd_zero(); - for(int k = 0; k < numneighs; k++) { + for(int k = 0; k < numneighs_masked; k++) { int cj = neighs[k].cj; int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); int imask = neighs[k].imask; @@ -328,6 +329,58 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor #endif } + for(int k = numneighs_masked; k < numneighs; k++) { + int cj = neighs[k].cj; + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; + MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base]; + + MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]); + MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]); + MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]); + MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp); + MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp); + MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp); + MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp); + MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp); + MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp); + MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0))); + MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2))); + + MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec); + MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec); + + MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0); + MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2); + MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec; + MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec; + + MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec; + MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec; + + MD_SIMD_FLOAT tx0 = select_by_mask(delx0 * force0, cutoff_mask0); + MD_SIMD_FLOAT ty0 = select_by_mask(dely0 * force0, cutoff_mask0); + MD_SIMD_FLOAT tz0 = select_by_mask(delz0 * force0, cutoff_mask0); + MD_SIMD_FLOAT tx2 = select_by_mask(delx2 * force2, cutoff_mask2); + MD_SIMD_FLOAT ty2 = select_by_mask(dely2 * force2, cutoff_mask2); + MD_SIMD_FLOAT tz2 = select_by_mask(delz2 * force2, cutoff_mask2); + + fix0 += tx0; + fiy0 += ty0; + fiz0 += tz0; + fix2 += tx2; + fiy2 += ty2; + fiz2 += tz2; + + #ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ + if(cj < CJ1_FROM_CI(atom->Nlocal)) { + simd_h_decr3(cj_f, tx0 + tx2, ty0 + ty2, tz0 + tz2); + } + #else + simd_h_decr3(cj_f, tx0 + tx2, ty0 + ty2, tz0 + tz2); + #endif + } + simd_h_dual_incr_reduced_sum(&ci_f[CL_X_OFFSET], fix0, fix2); simd_h_dual_incr_reduced_sum(&ci_f[CL_Y_OFFSET], fiy0, fiy2); simd_h_dual_incr_reduced_sum(&ci_f[CL_Z_OFFSET], fiz0, fiz2); diff --git a/gromacs/includes/neighbor.h b/gromacs/includes/neighbor.h index ace503e..6d3de64 100644 --- a/gromacs/includes/neighbor.h +++ b/gromacs/includes/neighbor.h @@ -35,6 +35,7 @@ typedef struct { int ncalls; int maxneighs; int* numneigh; + int* numneigh_masked; int half_neigh; NeighborCluster* neighbors; } Neighbor; diff --git a/gromacs/neighbor.c b/gromacs/neighbor.c index 56c3191..123ede0 100644 --- a/gromacs/neighbor.c +++ b/gromacs/neighbor.c @@ -56,6 +56,7 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) { neighbor->half_neigh = param->half_neigh; neighbor->maxneighs = 100; neighbor->numneigh = NULL; + neighbor->numneigh_masked = NULL; neighbor->neighbors = NULL; } @@ -230,6 +231,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { if(neighbor->numneigh) free(neighbor->numneigh); if(neighbor->neighbors) free(neighbor->neighbors); neighbor->numneigh = (int*) malloc(nmax * sizeof(int)); + neighbor->numneigh_masked = (int*) malloc(nmax * sizeof(int)); neighbor->neighbors = (NeighborCluster*) malloc(nmax * neighbor->maxneighs * sizeof(NeighborCluster)); } @@ -247,7 +249,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { for(int ci = 0; ci < atom->Nclusters_local; ci++) { int ci_cj1 = CJ1_FROM_CI(ci); NeighborCluster *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]); - int n = 0; + int n = 0, nmasked = 0; int ibin = atom->icluster_bin[ci]; MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx; MD_FLOAT ibb_xmax = atom->iclusters[ci].bbmaxx; @@ -319,8 +321,17 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { imask = get_imask_simd_4xn(neighbor->half_neigh, ci, cj); #endif - neighptr[n].cj = cj; - neighptr[n].imask = imask; + if(imask == NBNXN_INTERACTION_MASK_ALL) { + neighptr[n].cj = cj; + neighptr[n].imask = imask; + } else { + neighptr[n].cj = neighptr[nmasked].cj; + neighptr[n].imask = neighptr[nmasked].imask; + neighptr[nmasked].cj = cj; + neighptr[nmasked].imask = imask; + nmasked++; + } + n++; } } @@ -350,6 +361,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { } neighbor->numneigh[ci] = n; + neighbor->numneigh_masked[ci] = nmasked; if(n >= neighbor->maxneighs) { resize = 1; @@ -420,6 +432,7 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) { for(int ci = 0; ci < atom->Nclusters_local; ci++) { NeighborCluster *neighs = &neighbor->neighbors[ci * neighbor->maxneighs]; int numneighs = neighbor->numneigh[ci]; + int numneighs_masked = neighbor->numneigh_masked[ci]; int k = 0; // Remove dummy clusters if necessary @@ -435,6 +448,9 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) { k++; } else { numneighs--; + if(k < numneighs_masked) { + numneighs_masked--; + } neighs[k] = neighs[numneighs]; } } @@ -449,6 +465,7 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) { } neighbor->numneigh[ci] = numneighs; + neighbor->numneigh_masked[ci] = numneighs_masked; } DEBUG_MESSAGE("pruneNeighbor end\n");