From 8709bc2a06c1ec2f1294440a738e8cc9f7bb8f2b Mon Sep 17 00:00:00 2001 From: Rafael Ravedutti Date: Tue, 22 Mar 2022 23:47:05 +0100 Subject: [PATCH] Add first version for half neighbor lists in GROMACS variant Signed-off-by: Rafael Ravedutti --- gromacs/force_lj.c | 327 ++++++++++++++++++++++++++- gromacs/includes/neighbor.h | 1 + gromacs/includes/parameter.h | 3 +- gromacs/includes/simd/avx512_float.h | 16 ++ gromacs/main.c | 4 + gromacs/neighbor.c | 49 ++-- gromacs/parameter.c | 3 + 7 files changed, 379 insertions(+), 24 deletions(-) diff --git a/gromacs/force_lj.c b/gromacs/force_lj.c index 841dcaf..6f9071c 100644 --- a/gromacs/force_lj.c +++ b/gromacs/force_lj.c @@ -68,6 +68,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); int any = 0; MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; + MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base]; for(int cii = 0; cii < CLUSTER_M; cii++) { MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii]; @@ -93,6 +94,13 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_FLOAT sr2 = 1.0 / rsq; MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6; MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon; + + if(neighbor->half_neigh) { + cj_f[CL_X_OFFSET + cjj] -= delx * force; + cj_f[CL_Y_OFFSET + cjj] -= dely * force; + cj_f[CL_Z_OFFSET + cjj] -= delz * force; + } + fix += delx * force; fiy += dely * force; fiz += delz * force; @@ -127,7 +135,148 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat return E-S; } -double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { +double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { + DEBUG_MESSAGE("computeForceLJ_2xnn begin\n"); + int Nlocal = atom->Nlocal; + int* neighs; + MD_FLOAT cutforcesq = param->cutforce * param->cutforce; + MD_FLOAT sigma6 = param->sigma6; + MD_FLOAT epsilon = param->epsilon; + MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq); + MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6); + MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon); + MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0); + MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5); + const unsigned int half_mask_bits = VECTOR_WIDTH >> 1; + + for(int ci = 0; ci < atom->Nclusters_local; ci++) { + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; + for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) { + ci_f[CL_X_OFFSET + cii] = 0.0; + ci_f[CL_Y_OFFSET + cii] = 0.0; + ci_f[CL_Z_OFFSET + cii] = 0.0; + } + } + + double S = getTimeStamp(); + LIKWID_MARKER_START("force"); + + #pragma omp parallel for + for(int ci = 0; ci < atom->Nclusters_local; ci++) { + int ci_cj0 = CJ0_FROM_CI(ci); + #if CLUSTER_M > CLUSTER_N + int ci_cj1 = CJ1_FROM_CI(ci); + #endif + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; + neighs = &neighbor->neighbors[ci * neighbor->maxneighs]; + int numneighs = neighbor->numneigh[ci]; + + MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]); + MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]); + MD_SIMD_FLOAT yi0_tmp = simd_load_h_dual(&ci_x[CL_Y_OFFSET + 0]); + MD_SIMD_FLOAT yi2_tmp = simd_load_h_dual(&ci_x[CL_Y_OFFSET + 2]); + MD_SIMD_FLOAT zi0_tmp = simd_load_h_dual(&ci_x[CL_Z_OFFSET + 0]); + MD_SIMD_FLOAT zi2_tmp = simd_load_h_dual(&ci_x[CL_Z_OFFSET + 2]); + MD_SIMD_FLOAT fix0 = simd_zero(); + MD_SIMD_FLOAT fiy0 = simd_zero(); + MD_SIMD_FLOAT fiz0 = simd_zero(); + MD_SIMD_FLOAT fix2 = simd_zero(); + MD_SIMD_FLOAT fiy2 = simd_zero(); + MD_SIMD_FLOAT fiz2 = simd_zero(); + + for(int k = 0; k < numneighs; k++) { + int cj = neighs[k]; + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; + MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base]; + unsigned int mask0, mask1, mask2, mask3; + + MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]); + MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]); + MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]); + MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp); + MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp); + MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp); + MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp); + MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp); + MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp); + + #if CLUSTER_M == CLUSTER_N + unsigned int cond0 = (unsigned int)(cj == ci_cj0); + mask0 = (unsigned int)(0xf - 0x1 * cond0); + mask1 = (unsigned int)(0xf - 0x3 * cond0); + mask2 = (unsigned int)(0xf - 0x7 * cond0); + mask3 = (unsigned int)(0xf - 0xf * cond0); + #elif CLUSTER_M < CLUSTER_N + unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci); + unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci); + mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1); + mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1); + mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1); + mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1); + #else + unsigned int cond0 = (unsigned int)(cj == ci_cj0); + unsigned int cond1 = (unsigned int)(cj == ci_cj1); + mask0 = (unsigned int)(0x3 - 0x1 * cond0); + mask1 = (unsigned int)(0x3 - 0x3 * cond0); + mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1); + mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1); + #endif + + MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0); + MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2); + + MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0))); + MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2))); + + MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec)); + MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec)); + + MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0); + MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2); + + MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec))); + MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec))); + + MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec)))); + MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec)))); + + MD_SIMD_FLOAT tx0 = select_by_mask(simd_mul(delx0, force0), cutoff_mask0); + MD_SIMD_FLOAT ty0 = select_by_mask(simd_mul(dely0, force0), cutoff_mask0); + MD_SIMD_FLOAT tz0 = select_by_mask(simd_mul(delz0, force0), cutoff_mask0); + MD_SIMD_FLOAT tx2 = select_by_mask(simd_mul(delx2, force2), cutoff_mask2); + MD_SIMD_FLOAT ty2 = select_by_mask(simd_mul(dely2, force2), cutoff_mask2); + MD_SIMD_FLOAT tz2 = select_by_mask(simd_mul(delz2, force2), cutoff_mask2); + + fix0 = simd_add(fix0, tx0); + fiy0 = simd_add(fiy0, ty0); + fiz0 = simd_add(fiz0, tz0); + fix2 = simd_add(fix2, tx2); + fiy2 = simd_add(fiy2, ty2); + fiz2 = simd_add(fiz2, tz2); + + simd_h_decr3(cj_f, tx0 + tx2, ty0 + ty2, tz0 + tz2); + } + + simd_h_dual_incr_reduced_sum(&ci_f[CL_X_OFFSET], fix0, fix2); + simd_h_dual_incr_reduced_sum(&ci_f[CL_Y_OFFSET], fiy0, fiy2); + simd_h_dual_incr_reduced_sum(&ci_f[CL_Z_OFFSET], fiz0, fiz2); + + addStat(stats->calculated_forces, 1); + addStat(stats->num_neighs, numneighs); + addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N)); + } + + LIKWID_MARKER_STOP("force"); + double E = getTimeStamp(); + DEBUG_MESSAGE("computeForceLJ_2xnn end\n"); + return E-S; +} + +double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { DEBUG_MESSAGE("computeForceLJ_2xnn begin\n"); int Nlocal = atom->Nlocal; int* neighs; @@ -258,7 +407,173 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta return E-S; } -double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { +double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { + if(neighbor->half_neigh) { + return computeForceLJ_2xnn_half(param, atom, neighbor, stats); + } + + return computeForceLJ_2xnn_full(param, atom, neighbor, stats); +} + +double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { + DEBUG_MESSAGE("computeForceLJ_4xn begin\n"); + int Nlocal = atom->Nlocal; + int* neighs; + MD_FLOAT cutforcesq = param->cutforce * param->cutforce; + MD_FLOAT sigma6 = param->sigma6; + MD_FLOAT epsilon = param->epsilon; + MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq); + MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6); + MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon); + MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0); + MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5); + double S = getTimeStamp(); + LIKWID_MARKER_START("force"); + + for(int ci = 0; ci < atom->Nclusters_local; ci++) { + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; + for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) { + ci_f[CL_X_OFFSET + cii] = 0.0; + ci_f[CL_Y_OFFSET + cii] = 0.0; + ci_f[CL_Z_OFFSET + cii] = 0.0; + } + } + + #pragma omp parallel for + for(int ci = 0; ci < atom->Nclusters_local; ci++) { + int ci_cj0 = CJ0_FROM_CI(ci); + #if CLUSTER_M > CLUSTER_N + int ci_cj1 = CJ1_FROM_CI(ci); + #endif + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; + neighs = &neighbor->neighbors[ci * neighbor->maxneighs]; + int numneighs = neighbor->numneigh[ci]; + + MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]); + MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]); + MD_SIMD_FLOAT xi2_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 2]); + MD_SIMD_FLOAT xi3_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 3]); + MD_SIMD_FLOAT yi0_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 0]); + MD_SIMD_FLOAT yi1_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 1]); + MD_SIMD_FLOAT yi2_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 2]); + MD_SIMD_FLOAT yi3_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 3]); + MD_SIMD_FLOAT zi0_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 0]); + MD_SIMD_FLOAT zi1_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 1]); + MD_SIMD_FLOAT zi2_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 2]); + MD_SIMD_FLOAT zi3_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 3]); + MD_SIMD_FLOAT fix0 = simd_zero(); + MD_SIMD_FLOAT fiy0 = simd_zero(); + MD_SIMD_FLOAT fiz0 = simd_zero(); + MD_SIMD_FLOAT fix1 = simd_zero(); + MD_SIMD_FLOAT fiy1 = simd_zero(); + MD_SIMD_FLOAT fiz1 = simd_zero(); + MD_SIMD_FLOAT fix2 = simd_zero(); + MD_SIMD_FLOAT fiy2 = simd_zero(); + MD_SIMD_FLOAT fiz2 = simd_zero(); + MD_SIMD_FLOAT fix3 = simd_zero(); + MD_SIMD_FLOAT fiy3 = simd_zero(); + MD_SIMD_FLOAT fiz3 = simd_zero(); + + for(int k = 0; k < numneighs; k++) { + int cj = neighs[k]; + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; + MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]); + MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]); + MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]); + MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp); + MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp); + MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp); + MD_SIMD_FLOAT delx1 = simd_sub(xi1_tmp, xj_tmp); + MD_SIMD_FLOAT dely1 = simd_sub(yi1_tmp, yj_tmp); + MD_SIMD_FLOAT delz1 = simd_sub(zi1_tmp, zj_tmp); + MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp); + MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp); + MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp); + MD_SIMD_FLOAT delx3 = simd_sub(xi3_tmp, xj_tmp); + MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp); + MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp); + + #if CLUSTER_M == CLUSTER_N + unsigned int cond0 = (unsigned int)(cj == ci_cj0); + MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0)); + MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x3 * cond0)); + MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x7 * cond0)); + MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0xf * cond0)); + #elif CLUSTER_M < CLUSTER_N + unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci); + unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci); + MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1)); + MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1)); + MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1)); + MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0xf * cond0 - 0xff * cond1)); + #else + unsigned int cond0 = (unsigned int)(cj == ci_cj0); + unsigned int cond1 = (unsigned int)(cj == ci_cj1); + MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond0)); + MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0x3 - 0x3 * cond0)); + MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0x3 - 0x3 * cond0 - 0x1 * cond1)); + MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0x3 - 0x3 * cond0 - 0x3 * cond1)); + #endif + + MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0))); + MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, simd_mul(delz1, delz1))); + MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2))); + MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, simd_mul(delz3, delz3))); + + MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec)); + MD_SIMD_MASK cutoff_mask1 = simd_mask_and(excl_mask1, simd_mask_cond_lt(rsq1, cutforcesq_vec)); + MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec)); + MD_SIMD_MASK cutoff_mask3 = simd_mask_and(excl_mask3, simd_mask_cond_lt(rsq3, cutforcesq_vec)); + + MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0); + MD_SIMD_FLOAT sr2_1 = simd_reciprocal(rsq1); + MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2); + MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3); + + MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec))); + MD_SIMD_FLOAT sr6_1 = simd_mul(sr2_1, simd_mul(sr2_1, simd_mul(sr2_1, sigma6_vec))); + MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec))); + MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec))); + + MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec)))); + MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, eps_vec)))); + MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec)))); + MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, eps_vec)))); + + fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0); + fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0); + fiz0 = simd_masked_add(fiz0, simd_mul(delz0, force0), cutoff_mask0); + fix1 = simd_masked_add(fix1, simd_mul(delx1, force1), cutoff_mask1); + fiy1 = simd_masked_add(fiy1, simd_mul(dely1, force1), cutoff_mask1); + fiz1 = simd_masked_add(fiz1, simd_mul(delz1, force1), cutoff_mask1); + fix2 = simd_masked_add(fix2, simd_mul(delx2, force2), cutoff_mask2); + fiy2 = simd_masked_add(fiy2, simd_mul(dely2, force2), cutoff_mask2); + fiz2 = simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2); + fix3 = simd_masked_add(fix3, simd_mul(delx3, force3), cutoff_mask3); + fiy3 = simd_masked_add(fiy3, simd_mul(dely3, force3), cutoff_mask3); + fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3); + } + + simd_incr_reduced_sum(&ci_f[CL_X_OFFSET], fix0, fix1, fix2, fix3); + simd_incr_reduced_sum(&ci_f[CL_Y_OFFSET], fiy0, fiy1, fiy2, fiy3); + simd_incr_reduced_sum(&ci_f[CL_Z_OFFSET], fiz0, fiz1, fiz2, fiz3); + + addStat(stats->calculated_forces, 1); + addStat(stats->num_neighs, numneighs); + addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N)); + } + + LIKWID_MARKER_STOP("force"); + double E = getTimeStamp(); + DEBUG_MESSAGE("computeForceLJ_4xn end\n"); + return E-S; +} + +double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { DEBUG_MESSAGE("computeForceLJ_4xn begin\n"); int Nlocal = atom->Nlocal; int* neighs; @@ -415,3 +730,11 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat DEBUG_MESSAGE("computeForceLJ_4xn end\n"); return E-S; } + +double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { + if(neighbor->half_neigh) { + return computeForceLJ_4xn_half(param, atom, neighbor, stats); + } + + return computeForceLJ_4xn_full(param, atom, neighbor, stats); +} diff --git a/gromacs/includes/neighbor.h b/gromacs/includes/neighbor.h index 0ba00c0..69c9093 100644 --- a/gromacs/includes/neighbor.h +++ b/gromacs/includes/neighbor.h @@ -31,6 +31,7 @@ typedef struct { int* neighbors; int maxneighs; int* numneigh; + int half_neigh; } Neighbor; extern void initNeighbor(Neighbor*, Parameter*); diff --git a/gromacs/includes/parameter.h b/gromacs/includes/parameter.h index 4b31aca..1177b94 100644 --- a/gromacs/includes/parameter.h +++ b/gromacs/includes/parameter.h @@ -48,12 +48,13 @@ typedef struct { int prune_every; int x_out_every; int v_out_every; + int half_neigh; + int nx, ny, nz; MD_FLOAT dt; MD_FLOAT dtforce; MD_FLOAT cutforce; MD_FLOAT skin; MD_FLOAT cutneigh; - int nx, ny, nz; MD_FLOAT lattice; MD_FLOAT xlo, xhi, ylo, yhi, zlo, zhi; MD_FLOAT xprd, yprd, zprd; diff --git a/gromacs/includes/simd/avx512_float.h b/gromacs/includes/simd/avx512_float.h index 8043bbe..20d54f0 100644 --- a/gromacs/includes/simd/avx512_float.h +++ b/gromacs/includes/simd/avx512_float.h @@ -42,6 +42,7 @@ static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask16(a); } static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask16_u32(a); } static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_ps(p); } +static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm512_mask_mov_ps(_mm512_setzero_ps(), m, a); } static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) { // This would only be called in a Mx16 configuration, which is not valid in GROMACS fprintf(stderr, "simd_h_reduce_sum(): Called with AVX512 intrinsics and single-precision which is not valid!\n"); @@ -83,3 +84,18 @@ static inline MD_FLOAT simd_h_dual_incr_reduced_sum(float* m, MD_SIMD_FLOAT v0, t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0xb1)); return _mm_cvtss_f32(t3); } + +inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) { + __m256 t; + a = _mm512_add_ps(a, _mm512_shuffle_f32x4(a, a, 0xee)); + t = _mm256_load_ps(m); + t = _mm256_sub_ps(t, _mm512_castps512_ps256(a)); + _mm256_store_ps(m, t); +} + +static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) { + simd_h_decr(m, a0); + simd_h_decr(m + CLUSTER_M, a1); + simd_h_decr(m + CLUSTER_M * 2, a2); +} + diff --git a/gromacs/main.c b/gromacs/main.c index a36af9d..47d587f 100644 --- a/gromacs/main.c +++ b/gromacs/main.c @@ -198,6 +198,10 @@ int main(int argc, char** argv) { param.nz = atoi(argv[++i]); continue; } + if((strcmp(argv[i], "-half") == 0)) { + param.half_neigh = atoi(argv[++i]); + continue; + } if((strcmp(argv[i], "-m") == 0) || (strcmp(argv[i], "--mass") == 0)) { param.mass = atof(argv[++i]); continue; diff --git a/gromacs/neighbor.c b/gromacs/neighbor.c index 694abd7..bf14f13 100644 --- a/gromacs/neighbor.c +++ b/gromacs/neighbor.c @@ -69,6 +69,7 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) { bincount = NULL; bin_clusters = NULL; bin_nclusters = NULL; + neighbor->half_neigh = param->half_neigh; neighbor->maxneighs = 100; neighbor->numneigh = NULL; neighbor->neighbors = NULL; @@ -223,6 +224,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { resize = 0; for(int ci = 0; ci < atom->Nclusters_local; ci++) { + int ci_cj1 = CJ1_FROM_CI(ci); int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]); int n = 0; int ibin = atom->icluster_bin[ci]; @@ -246,6 +248,9 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { do { m++; cj = loc_bin[m]; + if(neighbor->half_neigh && ci_cj1 > cj) { + continue; + } jbb_zmin = atom->jclusters[cj].bbminz; jbb_zmax = atom->jclusters[cj].bbmaxz; dl = ibb_zmin - jbb_zmax; @@ -261,31 +266,33 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { jbb_ymax = atom->jclusters[cj].bbmaxy; while(m < c) { - dl = ibb_zmin - jbb_zmax; - dh = jbb_zmin - ibb_zmax; - dm = MAX(dl, dh); - dm0 = MAX(dm, 0.0); - d_bb_sq = dm0 * dm0; + if(!neighbor->half_neigh || ci_cj1 <= cj) { + dl = ibb_zmin - jbb_zmax; + dh = jbb_zmin - ibb_zmax; + dm = MAX(dl, dh); + dm0 = MAX(dm, 0.0); + d_bb_sq = dm0 * dm0; - /*if(d_bb_sq > cutneighsq) { - break; - }*/ + /*if(d_bb_sq > cutneighsq) { + break; + }*/ - dl = ibb_ymin - jbb_ymax; - dh = jbb_ymin - ibb_ymax; - dm = MAX(dl, dh); - dm0 = MAX(dm, 0.0); - d_bb_sq += dm0 * dm0; + dl = ibb_ymin - jbb_ymax; + dh = jbb_ymin - ibb_ymax; + dm = MAX(dl, dh); + dm0 = MAX(dm, 0.0); + d_bb_sq += dm0 * dm0; - dl = ibb_xmin - jbb_xmax; - dh = jbb_xmin - ibb_xmax; - dm = MAX(dl, dh); - dm0 = MAX(dm, 0.0); - d_bb_sq += dm0 * dm0; + dl = ibb_xmin - jbb_xmax; + dh = jbb_xmin - ibb_xmax; + dm = MAX(dl, dh); + dm0 = MAX(dm, 0.0); + d_bb_sq += dm0 * dm0; - if(d_bb_sq < cutneighsq) { - if(d_bb_sq < rbb_sq || atomDistanceInRange(atom, ci, cj, cutneighsq)) { - neighptr[n++] = cj; + if(d_bb_sq < cutneighsq) { + if(d_bb_sq < rbb_sq || atomDistanceInRange(atom, ci, cj, cutneighsq)) { + neighptr[n++] = cj; + } } } diff --git a/gromacs/parameter.c b/gromacs/parameter.c index 0a71e52..288657e 100644 --- a/gromacs/parameter.c +++ b/gromacs/parameter.c @@ -43,6 +43,7 @@ void initParameter(Parameter *param) { param->nx = 32; param->ny = 32; param->nz = 32; + param->half_neigh = 0; param->cutforce = 2.5; param->skin = 0.3; param->cutneigh = param->cutforce + param->skin; @@ -101,6 +102,7 @@ void readParameter(Parameter *param, const char *filename) { PARSE_INT(nx); PARSE_INT(ny); PARSE_INT(nz); + PARSE_INT(half_neigh); PARSE_INT(nstat); PARSE_INT(reneigh_every); PARSE_INT(prune_every); @@ -152,5 +154,6 @@ void printParameter(Parameter *param) { printf("\tDelta time (dt): %e\n", param->dt); printf("\tCutoff radius: %e\n", param->cutforce); printf("\tSkin: %e\n", param->skin); + printf("\tHalf neighbor lists: %d\n", param->half_neigh); printf("\tProcessor frequency (GHz): %.4f\n\n", param->proc_freq); }