Add first version for half neighbor lists in GROMACS variant
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
This commit is contained in:
parent
2a555a7deb
commit
8709bc2a06
@ -68,6 +68,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
|||||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||||
int any = 0;
|
int any = 0;
|
||||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||||
|
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
||||||
|
|
||||||
for(int cii = 0; cii < CLUSTER_M; cii++) {
|
for(int cii = 0; cii < CLUSTER_M; cii++) {
|
||||||
MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii];
|
MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii];
|
||||||
@ -93,6 +94,13 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
|||||||
MD_FLOAT sr2 = 1.0 / rsq;
|
MD_FLOAT sr2 = 1.0 / rsq;
|
||||||
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
|
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
|
||||||
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
|
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
|
||||||
|
|
||||||
|
if(neighbor->half_neigh) {
|
||||||
|
cj_f[CL_X_OFFSET + cjj] -= delx * force;
|
||||||
|
cj_f[CL_Y_OFFSET + cjj] -= dely * force;
|
||||||
|
cj_f[CL_Z_OFFSET + cjj] -= delz * force;
|
||||||
|
}
|
||||||
|
|
||||||
fix += delx * force;
|
fix += delx * force;
|
||||||
fiy += dely * force;
|
fiy += dely * force;
|
||||||
fiz += delz * force;
|
fiz += delz * force;
|
||||||
@ -127,7 +135,148 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
|||||||
return E-S;
|
return E-S;
|
||||||
}
|
}
|
||||||
|
|
||||||
double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||||
|
DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
|
||||||
|
int Nlocal = atom->Nlocal;
|
||||||
|
int* neighs;
|
||||||
|
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||||
|
MD_FLOAT sigma6 = param->sigma6;
|
||||||
|
MD_FLOAT epsilon = param->epsilon;
|
||||||
|
MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
|
||||||
|
MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
|
||||||
|
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
|
||||||
|
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
||||||
|
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
||||||
|
const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
|
||||||
|
|
||||||
|
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||||
|
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||||
|
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||||
|
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
|
||||||
|
ci_f[CL_X_OFFSET + cii] = 0.0;
|
||||||
|
ci_f[CL_Y_OFFSET + cii] = 0.0;
|
||||||
|
ci_f[CL_Z_OFFSET + cii] = 0.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
double S = getTimeStamp();
|
||||||
|
LIKWID_MARKER_START("force");
|
||||||
|
|
||||||
|
#pragma omp parallel for
|
||||||
|
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||||
|
int ci_cj0 = CJ0_FROM_CI(ci);
|
||||||
|
#if CLUSTER_M > CLUSTER_N
|
||||||
|
int ci_cj1 = CJ1_FROM_CI(ci);
|
||||||
|
#endif
|
||||||
|
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||||
|
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
||||||
|
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||||
|
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
||||||
|
int numneighs = neighbor->numneigh[ci];
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]);
|
||||||
|
MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]);
|
||||||
|
MD_SIMD_FLOAT yi0_tmp = simd_load_h_dual(&ci_x[CL_Y_OFFSET + 0]);
|
||||||
|
MD_SIMD_FLOAT yi2_tmp = simd_load_h_dual(&ci_x[CL_Y_OFFSET + 2]);
|
||||||
|
MD_SIMD_FLOAT zi0_tmp = simd_load_h_dual(&ci_x[CL_Z_OFFSET + 0]);
|
||||||
|
MD_SIMD_FLOAT zi2_tmp = simd_load_h_dual(&ci_x[CL_Z_OFFSET + 2]);
|
||||||
|
MD_SIMD_FLOAT fix0 = simd_zero();
|
||||||
|
MD_SIMD_FLOAT fiy0 = simd_zero();
|
||||||
|
MD_SIMD_FLOAT fiz0 = simd_zero();
|
||||||
|
MD_SIMD_FLOAT fix2 = simd_zero();
|
||||||
|
MD_SIMD_FLOAT fiy2 = simd_zero();
|
||||||
|
MD_SIMD_FLOAT fiz2 = simd_zero();
|
||||||
|
|
||||||
|
for(int k = 0; k < numneighs; k++) {
|
||||||
|
int cj = neighs[k];
|
||||||
|
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||||
|
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||||
|
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
||||||
|
unsigned int mask0, mask1, mask2, mask3;
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
|
||||||
|
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
|
||||||
|
MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
|
||||||
|
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
|
||||||
|
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
|
||||||
|
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
|
||||||
|
MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
|
||||||
|
MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
|
||||||
|
MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
|
||||||
|
|
||||||
|
#if CLUSTER_M == CLUSTER_N
|
||||||
|
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||||
|
mask0 = (unsigned int)(0xf - 0x1 * cond0);
|
||||||
|
mask1 = (unsigned int)(0xf - 0x3 * cond0);
|
||||||
|
mask2 = (unsigned int)(0xf - 0x7 * cond0);
|
||||||
|
mask3 = (unsigned int)(0xf - 0xf * cond0);
|
||||||
|
#elif CLUSTER_M < CLUSTER_N
|
||||||
|
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
|
||||||
|
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
|
||||||
|
mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
|
||||||
|
mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
|
||||||
|
mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
|
||||||
|
mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
|
||||||
|
#else
|
||||||
|
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||||
|
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
|
||||||
|
mask0 = (unsigned int)(0x3 - 0x1 * cond0);
|
||||||
|
mask1 = (unsigned int)(0x3 - 0x3 * cond0);
|
||||||
|
mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1);
|
||||||
|
mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0);
|
||||||
|
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2);
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
|
||||||
|
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
|
||||||
|
|
||||||
|
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
|
||||||
|
MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
||||||
|
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
|
||||||
|
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
|
||||||
|
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT tx0 = select_by_mask(simd_mul(delx0, force0), cutoff_mask0);
|
||||||
|
MD_SIMD_FLOAT ty0 = select_by_mask(simd_mul(dely0, force0), cutoff_mask0);
|
||||||
|
MD_SIMD_FLOAT tz0 = select_by_mask(simd_mul(delz0, force0), cutoff_mask0);
|
||||||
|
MD_SIMD_FLOAT tx2 = select_by_mask(simd_mul(delx2, force2), cutoff_mask2);
|
||||||
|
MD_SIMD_FLOAT ty2 = select_by_mask(simd_mul(dely2, force2), cutoff_mask2);
|
||||||
|
MD_SIMD_FLOAT tz2 = select_by_mask(simd_mul(delz2, force2), cutoff_mask2);
|
||||||
|
|
||||||
|
fix0 = simd_add(fix0, tx0);
|
||||||
|
fiy0 = simd_add(fiy0, ty0);
|
||||||
|
fiz0 = simd_add(fiz0, tz0);
|
||||||
|
fix2 = simd_add(fix2, tx2);
|
||||||
|
fiy2 = simd_add(fiy2, ty2);
|
||||||
|
fiz2 = simd_add(fiz2, tz2);
|
||||||
|
|
||||||
|
simd_h_decr3(cj_f, tx0 + tx2, ty0 + ty2, tz0 + tz2);
|
||||||
|
}
|
||||||
|
|
||||||
|
simd_h_dual_incr_reduced_sum(&ci_f[CL_X_OFFSET], fix0, fix2);
|
||||||
|
simd_h_dual_incr_reduced_sum(&ci_f[CL_Y_OFFSET], fiy0, fiy2);
|
||||||
|
simd_h_dual_incr_reduced_sum(&ci_f[CL_Z_OFFSET], fiz0, fiz2);
|
||||||
|
|
||||||
|
addStat(stats->calculated_forces, 1);
|
||||||
|
addStat(stats->num_neighs, numneighs);
|
||||||
|
addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
|
||||||
|
}
|
||||||
|
|
||||||
|
LIKWID_MARKER_STOP("force");
|
||||||
|
double E = getTimeStamp();
|
||||||
|
DEBUG_MESSAGE("computeForceLJ_2xnn end\n");
|
||||||
|
return E-S;
|
||||||
|
}
|
||||||
|
|
||||||
|
double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||||
DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
|
DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
|
||||||
int Nlocal = atom->Nlocal;
|
int Nlocal = atom->Nlocal;
|
||||||
int* neighs;
|
int* neighs;
|
||||||
@ -258,7 +407,173 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta
|
|||||||
return E-S;
|
return E-S;
|
||||||
}
|
}
|
||||||
|
|
||||||
double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||||
|
if(neighbor->half_neigh) {
|
||||||
|
return computeForceLJ_2xnn_half(param, atom, neighbor, stats);
|
||||||
|
}
|
||||||
|
|
||||||
|
return computeForceLJ_2xnn_full(param, atom, neighbor, stats);
|
||||||
|
}
|
||||||
|
|
||||||
|
double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||||
|
DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
|
||||||
|
int Nlocal = atom->Nlocal;
|
||||||
|
int* neighs;
|
||||||
|
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||||
|
MD_FLOAT sigma6 = param->sigma6;
|
||||||
|
MD_FLOAT epsilon = param->epsilon;
|
||||||
|
MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
|
||||||
|
MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
|
||||||
|
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
|
||||||
|
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
||||||
|
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
||||||
|
double S = getTimeStamp();
|
||||||
|
LIKWID_MARKER_START("force");
|
||||||
|
|
||||||
|
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||||
|
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||||
|
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||||
|
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
|
||||||
|
ci_f[CL_X_OFFSET + cii] = 0.0;
|
||||||
|
ci_f[CL_Y_OFFSET + cii] = 0.0;
|
||||||
|
ci_f[CL_Z_OFFSET + cii] = 0.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma omp parallel for
|
||||||
|
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||||
|
int ci_cj0 = CJ0_FROM_CI(ci);
|
||||||
|
#if CLUSTER_M > CLUSTER_N
|
||||||
|
int ci_cj1 = CJ1_FROM_CI(ci);
|
||||||
|
#endif
|
||||||
|
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||||
|
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
||||||
|
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||||
|
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
||||||
|
int numneighs = neighbor->numneigh[ci];
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]);
|
||||||
|
MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]);
|
||||||
|
MD_SIMD_FLOAT xi2_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 2]);
|
||||||
|
MD_SIMD_FLOAT xi3_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 3]);
|
||||||
|
MD_SIMD_FLOAT yi0_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 0]);
|
||||||
|
MD_SIMD_FLOAT yi1_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 1]);
|
||||||
|
MD_SIMD_FLOAT yi2_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 2]);
|
||||||
|
MD_SIMD_FLOAT yi3_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 3]);
|
||||||
|
MD_SIMD_FLOAT zi0_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 0]);
|
||||||
|
MD_SIMD_FLOAT zi1_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 1]);
|
||||||
|
MD_SIMD_FLOAT zi2_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 2]);
|
||||||
|
MD_SIMD_FLOAT zi3_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 3]);
|
||||||
|
MD_SIMD_FLOAT fix0 = simd_zero();
|
||||||
|
MD_SIMD_FLOAT fiy0 = simd_zero();
|
||||||
|
MD_SIMD_FLOAT fiz0 = simd_zero();
|
||||||
|
MD_SIMD_FLOAT fix1 = simd_zero();
|
||||||
|
MD_SIMD_FLOAT fiy1 = simd_zero();
|
||||||
|
MD_SIMD_FLOAT fiz1 = simd_zero();
|
||||||
|
MD_SIMD_FLOAT fix2 = simd_zero();
|
||||||
|
MD_SIMD_FLOAT fiy2 = simd_zero();
|
||||||
|
MD_SIMD_FLOAT fiz2 = simd_zero();
|
||||||
|
MD_SIMD_FLOAT fix3 = simd_zero();
|
||||||
|
MD_SIMD_FLOAT fiy3 = simd_zero();
|
||||||
|
MD_SIMD_FLOAT fiz3 = simd_zero();
|
||||||
|
|
||||||
|
for(int k = 0; k < numneighs; k++) {
|
||||||
|
int cj = neighs[k];
|
||||||
|
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||||
|
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||||
|
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
|
||||||
|
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
|
||||||
|
MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
|
||||||
|
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
|
||||||
|
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
|
||||||
|
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
|
||||||
|
MD_SIMD_FLOAT delx1 = simd_sub(xi1_tmp, xj_tmp);
|
||||||
|
MD_SIMD_FLOAT dely1 = simd_sub(yi1_tmp, yj_tmp);
|
||||||
|
MD_SIMD_FLOAT delz1 = simd_sub(zi1_tmp, zj_tmp);
|
||||||
|
MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
|
||||||
|
MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
|
||||||
|
MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
|
||||||
|
MD_SIMD_FLOAT delx3 = simd_sub(xi3_tmp, xj_tmp);
|
||||||
|
MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp);
|
||||||
|
MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp);
|
||||||
|
|
||||||
|
#if CLUSTER_M == CLUSTER_N
|
||||||
|
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||||
|
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
|
||||||
|
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x3 * cond0));
|
||||||
|
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x7 * cond0));
|
||||||
|
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0xf * cond0));
|
||||||
|
#elif CLUSTER_M < CLUSTER_N
|
||||||
|
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
|
||||||
|
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
|
||||||
|
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1));
|
||||||
|
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1));
|
||||||
|
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1));
|
||||||
|
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0xf * cond0 - 0xff * cond1));
|
||||||
|
#else
|
||||||
|
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||||
|
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
|
||||||
|
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond0));
|
||||||
|
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0x3 - 0x3 * cond0));
|
||||||
|
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0x3 - 0x3 * cond0 - 0x1 * cond1));
|
||||||
|
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0x3 - 0x3 * cond0 - 0x3 * cond1));
|
||||||
|
#endif
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
|
||||||
|
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, simd_mul(delz1, delz1)));
|
||||||
|
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
|
||||||
|
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, simd_mul(delz3, delz3)));
|
||||||
|
|
||||||
|
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
|
||||||
|
MD_SIMD_MASK cutoff_mask1 = simd_mask_and(excl_mask1, simd_mask_cond_lt(rsq1, cutforcesq_vec));
|
||||||
|
MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
|
||||||
|
MD_SIMD_MASK cutoff_mask3 = simd_mask_and(excl_mask3, simd_mask_cond_lt(rsq3, cutforcesq_vec));
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
||||||
|
MD_SIMD_FLOAT sr2_1 = simd_reciprocal(rsq1);
|
||||||
|
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||||
|
MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
|
||||||
|
MD_SIMD_FLOAT sr6_1 = simd_mul(sr2_1, simd_mul(sr2_1, simd_mul(sr2_1, sigma6_vec)));
|
||||||
|
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
|
||||||
|
MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec)));
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
|
||||||
|
MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, eps_vec))));
|
||||||
|
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
|
||||||
|
MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, eps_vec))));
|
||||||
|
|
||||||
|
fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
|
||||||
|
fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
|
||||||
|
fiz0 = simd_masked_add(fiz0, simd_mul(delz0, force0), cutoff_mask0);
|
||||||
|
fix1 = simd_masked_add(fix1, simd_mul(delx1, force1), cutoff_mask1);
|
||||||
|
fiy1 = simd_masked_add(fiy1, simd_mul(dely1, force1), cutoff_mask1);
|
||||||
|
fiz1 = simd_masked_add(fiz1, simd_mul(delz1, force1), cutoff_mask1);
|
||||||
|
fix2 = simd_masked_add(fix2, simd_mul(delx2, force2), cutoff_mask2);
|
||||||
|
fiy2 = simd_masked_add(fiy2, simd_mul(dely2, force2), cutoff_mask2);
|
||||||
|
fiz2 = simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2);
|
||||||
|
fix3 = simd_masked_add(fix3, simd_mul(delx3, force3), cutoff_mask3);
|
||||||
|
fiy3 = simd_masked_add(fiy3, simd_mul(dely3, force3), cutoff_mask3);
|
||||||
|
fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3);
|
||||||
|
}
|
||||||
|
|
||||||
|
simd_incr_reduced_sum(&ci_f[CL_X_OFFSET], fix0, fix1, fix2, fix3);
|
||||||
|
simd_incr_reduced_sum(&ci_f[CL_Y_OFFSET], fiy0, fiy1, fiy2, fiy3);
|
||||||
|
simd_incr_reduced_sum(&ci_f[CL_Z_OFFSET], fiz0, fiz1, fiz2, fiz3);
|
||||||
|
|
||||||
|
addStat(stats->calculated_forces, 1);
|
||||||
|
addStat(stats->num_neighs, numneighs);
|
||||||
|
addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
|
||||||
|
}
|
||||||
|
|
||||||
|
LIKWID_MARKER_STOP("force");
|
||||||
|
double E = getTimeStamp();
|
||||||
|
DEBUG_MESSAGE("computeForceLJ_4xn end\n");
|
||||||
|
return E-S;
|
||||||
|
}
|
||||||
|
|
||||||
|
double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||||
DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
|
DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
|
||||||
int Nlocal = atom->Nlocal;
|
int Nlocal = atom->Nlocal;
|
||||||
int* neighs;
|
int* neighs;
|
||||||
@ -415,3 +730,11 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
|||||||
DEBUG_MESSAGE("computeForceLJ_4xn end\n");
|
DEBUG_MESSAGE("computeForceLJ_4xn end\n");
|
||||||
return E-S;
|
return E-S;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||||
|
if(neighbor->half_neigh) {
|
||||||
|
return computeForceLJ_4xn_half(param, atom, neighbor, stats);
|
||||||
|
}
|
||||||
|
|
||||||
|
return computeForceLJ_4xn_full(param, atom, neighbor, stats);
|
||||||
|
}
|
||||||
|
@ -31,6 +31,7 @@ typedef struct {
|
|||||||
int* neighbors;
|
int* neighbors;
|
||||||
int maxneighs;
|
int maxneighs;
|
||||||
int* numneigh;
|
int* numneigh;
|
||||||
|
int half_neigh;
|
||||||
} Neighbor;
|
} Neighbor;
|
||||||
|
|
||||||
extern void initNeighbor(Neighbor*, Parameter*);
|
extern void initNeighbor(Neighbor*, Parameter*);
|
||||||
|
@ -48,12 +48,13 @@ typedef struct {
|
|||||||
int prune_every;
|
int prune_every;
|
||||||
int x_out_every;
|
int x_out_every;
|
||||||
int v_out_every;
|
int v_out_every;
|
||||||
|
int half_neigh;
|
||||||
|
int nx, ny, nz;
|
||||||
MD_FLOAT dt;
|
MD_FLOAT dt;
|
||||||
MD_FLOAT dtforce;
|
MD_FLOAT dtforce;
|
||||||
MD_FLOAT cutforce;
|
MD_FLOAT cutforce;
|
||||||
MD_FLOAT skin;
|
MD_FLOAT skin;
|
||||||
MD_FLOAT cutneigh;
|
MD_FLOAT cutneigh;
|
||||||
int nx, ny, nz;
|
|
||||||
MD_FLOAT lattice;
|
MD_FLOAT lattice;
|
||||||
MD_FLOAT xlo, xhi, ylo, yhi, zlo, zhi;
|
MD_FLOAT xlo, xhi, ylo, yhi, zlo, zhi;
|
||||||
MD_FLOAT xprd, yprd, zprd;
|
MD_FLOAT xprd, yprd, zprd;
|
||||||
|
@ -42,6 +42,7 @@ static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) {
|
|||||||
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask16(a); }
|
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask16(a); }
|
||||||
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask16_u32(a); }
|
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask16_u32(a); }
|
||||||
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_ps(p); }
|
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_ps(p); }
|
||||||
|
static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm512_mask_mov_ps(_mm512_setzero_ps(), m, a); }
|
||||||
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
|
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
|
||||||
// This would only be called in a Mx16 configuration, which is not valid in GROMACS
|
// This would only be called in a Mx16 configuration, which is not valid in GROMACS
|
||||||
fprintf(stderr, "simd_h_reduce_sum(): Called with AVX512 intrinsics and single-precision which is not valid!\n");
|
fprintf(stderr, "simd_h_reduce_sum(): Called with AVX512 intrinsics and single-precision which is not valid!\n");
|
||||||
@ -83,3 +84,18 @@ static inline MD_FLOAT simd_h_dual_incr_reduced_sum(float* m, MD_SIMD_FLOAT v0,
|
|||||||
t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0xb1));
|
t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0xb1));
|
||||||
return _mm_cvtss_f32(t3);
|
return _mm_cvtss_f32(t3);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
|
||||||
|
__m256 t;
|
||||||
|
a = _mm512_add_ps(a, _mm512_shuffle_f32x4(a, a, 0xee));
|
||||||
|
t = _mm256_load_ps(m);
|
||||||
|
t = _mm256_sub_ps(t, _mm512_castps512_ps256(a));
|
||||||
|
_mm256_store_ps(m, t);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
|
||||||
|
simd_h_decr(m, a0);
|
||||||
|
simd_h_decr(m + CLUSTER_M, a1);
|
||||||
|
simd_h_decr(m + CLUSTER_M * 2, a2);
|
||||||
|
}
|
||||||
|
|
||||||
|
@ -198,6 +198,10 @@ int main(int argc, char** argv) {
|
|||||||
param.nz = atoi(argv[++i]);
|
param.nz = atoi(argv[++i]);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
if((strcmp(argv[i], "-half") == 0)) {
|
||||||
|
param.half_neigh = atoi(argv[++i]);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
if((strcmp(argv[i], "-m") == 0) || (strcmp(argv[i], "--mass") == 0)) {
|
if((strcmp(argv[i], "-m") == 0) || (strcmp(argv[i], "--mass") == 0)) {
|
||||||
param.mass = atof(argv[++i]);
|
param.mass = atof(argv[++i]);
|
||||||
continue;
|
continue;
|
||||||
|
@ -69,6 +69,7 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) {
|
|||||||
bincount = NULL;
|
bincount = NULL;
|
||||||
bin_clusters = NULL;
|
bin_clusters = NULL;
|
||||||
bin_nclusters = NULL;
|
bin_nclusters = NULL;
|
||||||
|
neighbor->half_neigh = param->half_neigh;
|
||||||
neighbor->maxneighs = 100;
|
neighbor->maxneighs = 100;
|
||||||
neighbor->numneigh = NULL;
|
neighbor->numneigh = NULL;
|
||||||
neighbor->neighbors = NULL;
|
neighbor->neighbors = NULL;
|
||||||
@ -223,6 +224,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
|||||||
resize = 0;
|
resize = 0;
|
||||||
|
|
||||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||||
|
int ci_cj1 = CJ1_FROM_CI(ci);
|
||||||
int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
|
int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
|
||||||
int n = 0;
|
int n = 0;
|
||||||
int ibin = atom->icluster_bin[ci];
|
int ibin = atom->icluster_bin[ci];
|
||||||
@ -246,6 +248,9 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
|||||||
do {
|
do {
|
||||||
m++;
|
m++;
|
||||||
cj = loc_bin[m];
|
cj = loc_bin[m];
|
||||||
|
if(neighbor->half_neigh && ci_cj1 > cj) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
jbb_zmin = atom->jclusters[cj].bbminz;
|
jbb_zmin = atom->jclusters[cj].bbminz;
|
||||||
jbb_zmax = atom->jclusters[cj].bbmaxz;
|
jbb_zmax = atom->jclusters[cj].bbmaxz;
|
||||||
dl = ibb_zmin - jbb_zmax;
|
dl = ibb_zmin - jbb_zmax;
|
||||||
@ -261,31 +266,33 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
|||||||
jbb_ymax = atom->jclusters[cj].bbmaxy;
|
jbb_ymax = atom->jclusters[cj].bbmaxy;
|
||||||
|
|
||||||
while(m < c) {
|
while(m < c) {
|
||||||
dl = ibb_zmin - jbb_zmax;
|
if(!neighbor->half_neigh || ci_cj1 <= cj) {
|
||||||
dh = jbb_zmin - ibb_zmax;
|
dl = ibb_zmin - jbb_zmax;
|
||||||
dm = MAX(dl, dh);
|
dh = jbb_zmin - ibb_zmax;
|
||||||
dm0 = MAX(dm, 0.0);
|
dm = MAX(dl, dh);
|
||||||
d_bb_sq = dm0 * dm0;
|
dm0 = MAX(dm, 0.0);
|
||||||
|
d_bb_sq = dm0 * dm0;
|
||||||
|
|
||||||
/*if(d_bb_sq > cutneighsq) {
|
/*if(d_bb_sq > cutneighsq) {
|
||||||
break;
|
break;
|
||||||
}*/
|
}*/
|
||||||
|
|
||||||
dl = ibb_ymin - jbb_ymax;
|
dl = ibb_ymin - jbb_ymax;
|
||||||
dh = jbb_ymin - ibb_ymax;
|
dh = jbb_ymin - ibb_ymax;
|
||||||
dm = MAX(dl, dh);
|
dm = MAX(dl, dh);
|
||||||
dm0 = MAX(dm, 0.0);
|
dm0 = MAX(dm, 0.0);
|
||||||
d_bb_sq += dm0 * dm0;
|
d_bb_sq += dm0 * dm0;
|
||||||
|
|
||||||
dl = ibb_xmin - jbb_xmax;
|
dl = ibb_xmin - jbb_xmax;
|
||||||
dh = jbb_xmin - ibb_xmax;
|
dh = jbb_xmin - ibb_xmax;
|
||||||
dm = MAX(dl, dh);
|
dm = MAX(dl, dh);
|
||||||
dm0 = MAX(dm, 0.0);
|
dm0 = MAX(dm, 0.0);
|
||||||
d_bb_sq += dm0 * dm0;
|
d_bb_sq += dm0 * dm0;
|
||||||
|
|
||||||
if(d_bb_sq < cutneighsq) {
|
if(d_bb_sq < cutneighsq) {
|
||||||
if(d_bb_sq < rbb_sq || atomDistanceInRange(atom, ci, cj, cutneighsq)) {
|
if(d_bb_sq < rbb_sq || atomDistanceInRange(atom, ci, cj, cutneighsq)) {
|
||||||
neighptr[n++] = cj;
|
neighptr[n++] = cj;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -43,6 +43,7 @@ void initParameter(Parameter *param) {
|
|||||||
param->nx = 32;
|
param->nx = 32;
|
||||||
param->ny = 32;
|
param->ny = 32;
|
||||||
param->nz = 32;
|
param->nz = 32;
|
||||||
|
param->half_neigh = 0;
|
||||||
param->cutforce = 2.5;
|
param->cutforce = 2.5;
|
||||||
param->skin = 0.3;
|
param->skin = 0.3;
|
||||||
param->cutneigh = param->cutforce + param->skin;
|
param->cutneigh = param->cutforce + param->skin;
|
||||||
@ -101,6 +102,7 @@ void readParameter(Parameter *param, const char *filename) {
|
|||||||
PARSE_INT(nx);
|
PARSE_INT(nx);
|
||||||
PARSE_INT(ny);
|
PARSE_INT(ny);
|
||||||
PARSE_INT(nz);
|
PARSE_INT(nz);
|
||||||
|
PARSE_INT(half_neigh);
|
||||||
PARSE_INT(nstat);
|
PARSE_INT(nstat);
|
||||||
PARSE_INT(reneigh_every);
|
PARSE_INT(reneigh_every);
|
||||||
PARSE_INT(prune_every);
|
PARSE_INT(prune_every);
|
||||||
@ -152,5 +154,6 @@ void printParameter(Parameter *param) {
|
|||||||
printf("\tDelta time (dt): %e\n", param->dt);
|
printf("\tDelta time (dt): %e\n", param->dt);
|
||||||
printf("\tCutoff radius: %e\n", param->cutforce);
|
printf("\tCutoff radius: %e\n", param->cutforce);
|
||||||
printf("\tSkin: %e\n", param->skin);
|
printf("\tSkin: %e\n", param->skin);
|
||||||
|
printf("\tHalf neighbor lists: %d\n", param->half_neigh);
|
||||||
printf("\tProcessor frequency (GHz): %.4f\n\n", param->proc_freq);
|
printf("\tProcessor frequency (GHz): %.4f\n\n", param->proc_freq);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user