Set interaction masks as gromacs does
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
This commit is contained in:
parent
f5fd3e265a
commit
296a4c4e01
@ -8,9 +8,11 @@
|
|||||||
#define __PARAMETER_H_
|
#define __PARAMETER_H_
|
||||||
|
|
||||||
#if PRECISION == 1
|
#if PRECISION == 1
|
||||||
#define MD_FLOAT float
|
# define MD_FLOAT float
|
||||||
|
# define MD_UINT unsigned int
|
||||||
#else
|
#else
|
||||||
#define MD_FLOAT double
|
# define MD_FLOAT double
|
||||||
|
# define MD_UINT unsigned long long int
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
|
@ -9,10 +9,13 @@
|
|||||||
# include <zmmintrin.h>
|
# include <zmmintrin.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define MD_SIMD_FLOAT __m512d
|
#define MD_SIMD_FLOAT __m512d
|
||||||
#define MD_SIMD_MASK __mmask8
|
#define MD_SIMD_MASK __mmask8
|
||||||
#define MD_SIMD_INT __m256i
|
#define MD_SIMD_INT __m256i
|
||||||
|
#define MD_SIMD_BITMASK MD_SIMD_INT
|
||||||
|
#define MD_SIMD_IBOOL __mmask16
|
||||||
|
|
||||||
|
static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) { return (__mmask8)(a); }
|
||||||
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm512_set1_pd(scalar); }
|
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm512_set1_pd(scalar); }
|
||||||
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_pd(0.0); }
|
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_pd(0.0); }
|
||||||
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_pd(a, b); }
|
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_pd(a, b); }
|
||||||
|
@ -11,10 +11,26 @@
|
|||||||
# include <zmmintrin.h>
|
# include <zmmintrin.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define MD_SIMD_FLOAT __m512
|
#define MD_SIMD_FLOAT __m512
|
||||||
#define MD_SIMD_MASK __mmask16
|
#define MD_SIMD_MASK __mmask16
|
||||||
#define MD_SIMD_INT __m256i
|
#define MD_SIMD_INT __m256i
|
||||||
|
#define MD_SIMD_IBOOL __mmask16
|
||||||
|
#define MD_SIMD_INT32 __m512i
|
||||||
|
#define MD_SIMD_BITMASK MD_SIMD_INT32
|
||||||
|
|
||||||
|
static inline MD_SIMD_BITMASK simd_load_bitmask(const int *m) {
|
||||||
|
return _mm512_load_si512(m);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline MD_SIMD_INT32 simd_int32_broadcast(int a) {
|
||||||
|
return _mm512_set1_epi32(a);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline MD_SIMD_IBOOL simd_test_bits(MD_SIMD_FLOAT a) {
|
||||||
|
return _mm512_test_epi32_mask(_mm512_castps_si512(a), _mm512_castps_si512(a));
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) { return a; }
|
||||||
static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); }
|
static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); }
|
||||||
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_ps(0.0f); }
|
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_ps(0.0f); }
|
||||||
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_ps(a, b); }
|
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_ps(a, b); }
|
||||||
|
@ -50,6 +50,8 @@ void createAtom(Atom *atom, Parameter *param) {
|
|||||||
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||||
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||||
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||||
|
atom->exclusion_filter = allocate(ALIGNMENT, CLUSTER_M * VECTOR_WIDTH * sizeof(MD_UINT));
|
||||||
|
|
||||||
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
||||||
atom->epsilon[i] = param->epsilon;
|
atom->epsilon[i] = param->epsilon;
|
||||||
atom->sigma6[i] = param->sigma6;
|
atom->sigma6[i] = param->sigma6;
|
||||||
@ -57,6 +59,10 @@ void createAtom(Atom *atom, Parameter *param) {
|
|||||||
atom->cutforcesq[i] = param->cutforce * param->cutforce;
|
atom->cutforcesq[i] = param->cutforce * param->cutforce;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for(int i = 0; i < CLUSTER_M * VECTOR_WIDTH; i++) {
|
||||||
|
atom->exclusion_filter[i] = (1U << i);
|
||||||
|
}
|
||||||
|
|
||||||
MD_FLOAT alat = pow((4.0 / param->rho), (1.0 / 3.0));
|
MD_FLOAT alat = pow((4.0 / param->rho), (1.0 / 3.0));
|
||||||
int ilo = (int) (xlo / (0.5 * alat) - 1);
|
int ilo = (int) (xlo / (0.5 * alat) - 1);
|
||||||
int ihi = (int) (xhi / (0.5 * alat) + 1);
|
int ihi = (int) (xhi / (0.5 * alat) + 1);
|
||||||
|
@ -16,10 +16,36 @@
|
|||||||
#include <simd.h>
|
#include <simd.h>
|
||||||
|
|
||||||
|
|
||||||
|
static inline void gmx_load_simd_2xnn_interactions(
|
||||||
|
int excl,
|
||||||
|
MD_SIMD_BITMASK filter0, MD_SIMD_BITMASK filter2,
|
||||||
|
MD_SIMD_MASK *interact0, MD_SIMD_MASK *interact2) {
|
||||||
|
|
||||||
|
//SimdInt32 mask_pr_S(excl);
|
||||||
|
MD_SIMD_INT32 mask_pr_S = simd_int32_broadcast(excl);
|
||||||
|
*interact0 = cvtIB2B(simd_test_bits(mask_pr_S));
|
||||||
|
*interact2 = cvtIB2B(simd_test_bits(mask_pr_S));
|
||||||
|
//*interact0 = cvtIB2B(simd_test_bits(mask_pr_S & filter0));
|
||||||
|
//*interact2 = cvtIB2B(simd_test_bits(mask_pr_S & filter2));
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void gmx_load_simd_4xn_interactions(
|
||||||
|
int excl,
|
||||||
|
MD_SIMD_BITMASK filter0, MD_SIMD_BITMASK filter1, MD_SIMD_BITMASK filter2, MD_SIMD_BITMASK filter3,
|
||||||
|
MD_SIMD_MASK *interact0, MD_SIMD_MASK *interact1, MD_SIMD_MASK *interact2, MD_SIMD_MASK *interact3) {
|
||||||
|
|
||||||
|
//SimdInt32 mask_pr_S(excl);
|
||||||
|
MD_SIMD_INT32 mask_pr_S = simd_int32_broadcast(excl);
|
||||||
|
*interact0 = cvtIB2B(simd_test_bits(mask_pr_S & filter0));
|
||||||
|
*interact1 = cvtIB2B(simd_test_bits(mask_pr_S & filter1));
|
||||||
|
*interact2 = cvtIB2B(simd_test_bits(mask_pr_S & filter2));
|
||||||
|
*interact3 = cvtIB2B(simd_test_bits(mask_pr_S & filter3));
|
||||||
|
}
|
||||||
|
|
||||||
double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||||
DEBUG_MESSAGE("computeForceLJ begin\n");
|
DEBUG_MESSAGE("computeForceLJ begin\n");
|
||||||
int Nlocal = atom->Nlocal;
|
int Nlocal = atom->Nlocal;
|
||||||
int* neighs;
|
NeighborCluster* neighs;
|
||||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||||
MD_FLOAT sigma6 = param->sigma6;
|
MD_FLOAT sigma6 = param->sigma6;
|
||||||
MD_FLOAT epsilon = param->epsilon;
|
MD_FLOAT epsilon = param->epsilon;
|
||||||
@ -51,7 +77,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
|||||||
int numneighs = neighbor->numneigh[ci];
|
int numneighs = neighbor->numneigh[ci];
|
||||||
|
|
||||||
for(int k = 0; k < numneighs; k++) {
|
for(int k = 0; k < numneighs; k++) {
|
||||||
int cj = neighs[k];
|
int cj = neighs[k].cj;
|
||||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||||
int any = 0;
|
int any = 0;
|
||||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||||
@ -132,7 +158,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
|||||||
double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||||
DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
|
DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
|
||||||
int Nlocal = atom->Nlocal;
|
int Nlocal = atom->Nlocal;
|
||||||
int* neighs;
|
NeighborCluster* neighs;
|
||||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||||
MD_FLOAT sigma6 = param->sigma6;
|
MD_FLOAT sigma6 = param->sigma6;
|
||||||
MD_FLOAT epsilon = param->epsilon;
|
MD_FLOAT epsilon = param->epsilon;
|
||||||
@ -159,6 +185,9 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
|||||||
{
|
{
|
||||||
LIKWID_MARKER_START("force");
|
LIKWID_MARKER_START("force");
|
||||||
|
|
||||||
|
MD_SIMD_BITMASK filter0 = simd_load_bitmask((const int *) &atom->exclusion_filter[0 * (VECTOR_WIDTH / UNROLL_J)]);
|
||||||
|
MD_SIMD_BITMASK filter2 = simd_load_bitmask((const int *) &atom->exclusion_filter[2 * (VECTOR_WIDTH / UNROLL_J)]);
|
||||||
|
|
||||||
#pragma omp for
|
#pragma omp for
|
||||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||||
int ci_cj0 = CJ0_FROM_CI(ci);
|
int ci_cj0 = CJ0_FROM_CI(ci);
|
||||||
@ -185,11 +214,13 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
|||||||
MD_SIMD_FLOAT fiz2 = simd_zero();
|
MD_SIMD_FLOAT fiz2 = simd_zero();
|
||||||
|
|
||||||
for(int k = 0; k < numneighs; k++) {
|
for(int k = 0; k < numneighs; k++) {
|
||||||
int cj = neighs[k];
|
int cj = neighs[k].cj;
|
||||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||||
|
int imask = neighs[k].imask;
|
||||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||||
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
||||||
unsigned int mask0, mask1, mask2, mask3;
|
MD_SIMD_MASK interact0;
|
||||||
|
MD_SIMD_MASK interact2;
|
||||||
|
|
||||||
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
|
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
|
||||||
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
|
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
|
||||||
@ -201,36 +232,13 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
|||||||
MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
|
MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
|
||||||
MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
|
MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
|
||||||
|
|
||||||
#if CLUSTER_M == CLUSTER_N
|
gmx_load_simd_2xnn_interactions((int) imask, filter0, filter2, &interact0, &interact2);
|
||||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
|
||||||
mask0 = (unsigned int)(0xf - 0x1 * cond0);
|
|
||||||
mask1 = (unsigned int)(0xf - 0x3 * cond0);
|
|
||||||
mask2 = (unsigned int)(0xf - 0x7 * cond0);
|
|
||||||
mask3 = (unsigned int)(0xf - 0xf * cond0);
|
|
||||||
#elif CLUSTER_M < CLUSTER_N
|
|
||||||
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
|
|
||||||
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
|
|
||||||
mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
|
|
||||||
mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
|
|
||||||
mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
|
|
||||||
mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
|
|
||||||
#else
|
|
||||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
|
||||||
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
|
|
||||||
mask0 = (unsigned int)(0x3 - 0x1 * cond0);
|
|
||||||
mask1 = (unsigned int)(0x3 - 0x3 * cond0);
|
|
||||||
mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1);
|
|
||||||
mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0);
|
|
||||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2);
|
|
||||||
|
|
||||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
|
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
|
||||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
|
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
|
||||||
|
|
||||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
|
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(interact0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
|
||||||
MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
|
MD_SIMD_MASK cutoff_mask2 = simd_mask_and(interact2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
|
||||||
|
|
||||||
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
||||||
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||||
@ -284,7 +292,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
|||||||
double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||||
DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
|
DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
|
||||||
int Nlocal = atom->Nlocal;
|
int Nlocal = atom->Nlocal;
|
||||||
int* neighs;
|
NeighborCluster* neighs;
|
||||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||||
MD_FLOAT sigma6 = param->sigma6;
|
MD_FLOAT sigma6 = param->sigma6;
|
||||||
MD_FLOAT epsilon = param->epsilon;
|
MD_FLOAT epsilon = param->epsilon;
|
||||||
@ -337,8 +345,9 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
|
|||||||
MD_SIMD_FLOAT fiz2 = simd_zero();
|
MD_SIMD_FLOAT fiz2 = simd_zero();
|
||||||
|
|
||||||
for(int k = 0; k < numneighs; k++) {
|
for(int k = 0; k < numneighs; k++) {
|
||||||
int cj = neighs[k];
|
int cj = neighs[k].cj;
|
||||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||||
|
int imask = neighs[k].imask;
|
||||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||||
unsigned int mask0, mask1, mask2, mask3;
|
unsigned int mask0, mask1, mask2, mask3;
|
||||||
|
|
||||||
@ -429,7 +438,7 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta
|
|||||||
double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||||
DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
|
DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
|
||||||
int Nlocal = atom->Nlocal;
|
int Nlocal = atom->Nlocal;
|
||||||
int* neighs;
|
NeighborCluster* neighs;
|
||||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||||
MD_FLOAT sigma6 = param->sigma6;
|
MD_FLOAT sigma6 = param->sigma6;
|
||||||
MD_FLOAT epsilon = param->epsilon;
|
MD_FLOAT epsilon = param->epsilon;
|
||||||
@ -493,8 +502,9 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
|||||||
MD_SIMD_FLOAT fiz3 = simd_zero();
|
MD_SIMD_FLOAT fiz3 = simd_zero();
|
||||||
|
|
||||||
for(int k = 0; k < numneighs; k++) {
|
for(int k = 0; k < numneighs; k++) {
|
||||||
int cj = neighs[k];
|
int cj = neighs[k].cj;
|
||||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||||
|
int imask = neighs[k].imask;
|
||||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||||
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
||||||
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
|
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
|
||||||
@ -619,7 +629,7 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
|||||||
double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||||
DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
|
DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
|
||||||
int Nlocal = atom->Nlocal;
|
int Nlocal = atom->Nlocal;
|
||||||
int* neighs;
|
NeighborCluster* neighs;
|
||||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||||
MD_FLOAT sigma6 = param->sigma6;
|
MD_FLOAT sigma6 = param->sigma6;
|
||||||
MD_FLOAT epsilon = param->epsilon;
|
MD_FLOAT epsilon = param->epsilon;
|
||||||
@ -683,8 +693,9 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
|
|||||||
MD_SIMD_FLOAT fiz3 = simd_zero();
|
MD_SIMD_FLOAT fiz3 = simd_zero();
|
||||||
|
|
||||||
for(int k = 0; k < numneighs; k++) {
|
for(int k = 0; k < numneighs; k++) {
|
||||||
int cj = neighs[k];
|
int cj = neighs[k].cj;
|
||||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||||
|
int imask = neighs[k].imask;
|
||||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||||
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
|
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
|
||||||
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
|
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
|
||||||
|
@ -22,6 +22,7 @@
|
|||||||
# define KERNEL_NAME "CUDA"
|
# define KERNEL_NAME "CUDA"
|
||||||
# define CLUSTER_M 8
|
# define CLUSTER_M 8
|
||||||
# define CLUSTER_N VECTOR_WIDTH
|
# define CLUSTER_N VECTOR_WIDTH
|
||||||
|
# define UNROLL_J 1
|
||||||
# define computeForceLJ computeForceLJ_cuda
|
# define computeForceLJ computeForceLJ_cuda
|
||||||
# define initialIntegrate cudaInitialIntegrate
|
# define initialIntegrate cudaInitialIntegrate
|
||||||
# define finalIntegrate cudaFinalIntegrate
|
# define finalIntegrate cudaFinalIntegrate
|
||||||
@ -32,11 +33,13 @@
|
|||||||
# if VECTOR_WIDTH > CLUSTER_M * 2
|
# if VECTOR_WIDTH > CLUSTER_M * 2
|
||||||
# define KERNEL_NAME "Simd2xNN"
|
# define KERNEL_NAME "Simd2xNN"
|
||||||
# define CLUSTER_N (VECTOR_WIDTH / 2)
|
# define CLUSTER_N (VECTOR_WIDTH / 2)
|
||||||
|
# define UNROLL_J 2
|
||||||
# define computeForceLJ computeForceLJ_2xnn
|
# define computeForceLJ computeForceLJ_2xnn
|
||||||
// Simd4xN
|
// Simd4xN
|
||||||
# else
|
# else
|
||||||
# define KERNEL_NAME "Simd4xN"
|
# define KERNEL_NAME "Simd4xN"
|
||||||
# define CLUSTER_N VECTOR_WIDTH
|
# define CLUSTER_N VECTOR_WIDTH
|
||||||
|
# define UNROLL_J 1
|
||||||
# define computeForceLJ computeForceLJ_4xn
|
# define computeForceLJ computeForceLJ_4xn
|
||||||
# endif
|
# endif
|
||||||
# ifdef USE_REFERENCE_VERSION
|
# ifdef USE_REFERENCE_VERSION
|
||||||
@ -116,6 +119,7 @@ typedef struct {
|
|||||||
Cluster *iclusters, *jclusters;
|
Cluster *iclusters, *jclusters;
|
||||||
int *icluster_bin;
|
int *icluster_bin;
|
||||||
int dummy_cj;
|
int dummy_cj;
|
||||||
|
MD_UINT *exclusion_filter;
|
||||||
} Atom;
|
} Atom;
|
||||||
|
|
||||||
extern void initAtom(Atom*);
|
extern void initAtom(Atom*);
|
||||||
|
@ -9,13 +9,34 @@
|
|||||||
|
|
||||||
#ifndef __NEIGHBOR_H_
|
#ifndef __NEIGHBOR_H_
|
||||||
#define __NEIGHBOR_H_
|
#define __NEIGHBOR_H_
|
||||||
|
// Interaction masks from GROMACS, things to remember (maybe these confused just me):
|
||||||
|
// 1. These are not "exclusion" masks as the name suggests in GROMACS, but rather
|
||||||
|
// interaction masks (1 = interaction, 0 = no interaction)
|
||||||
|
// 2. These are inverted (maybe because that is how you use in AVX2/AVX512 masking),
|
||||||
|
// so read them from right to left (least significant to most significant bit)
|
||||||
|
// All interaction mask is the same for all kernels
|
||||||
|
#define NBNXN_INTERACTION_MASK_ALL 0xffffffffU
|
||||||
|
// 4x4 kernel diagonal mask
|
||||||
|
#define NBNXN_INTERACTION_MASK_DIAG 0x08ceU
|
||||||
|
// 4x2 kernel diagonal masks
|
||||||
|
#define NBNXN_INTERACTION_MASK_DIAG_J2_0 0x0002U
|
||||||
|
#define NBNXN_INTERACTION_MASK_DIAG_J2_1 0x002fU
|
||||||
|
// 4x8 kernel diagonal masks
|
||||||
|
#define NBNXN_INTERACTION_MASK_DIAG_J8_0 0xf0f8fcfeU
|
||||||
|
#define NBNXN_INTERACTION_MASK_DIAG_J8_1 0x0080c0e0U
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
int cj;
|
||||||
|
unsigned int imask;
|
||||||
|
} NeighborCluster;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int every;
|
int every;
|
||||||
int ncalls;
|
int ncalls;
|
||||||
int* neighbors;
|
|
||||||
int maxneighs;
|
int maxneighs;
|
||||||
int* numneigh;
|
int* numneigh;
|
||||||
int half_neigh;
|
int half_neigh;
|
||||||
|
NeighborCluster* neighbors;
|
||||||
} Neighbor;
|
} Neighbor;
|
||||||
|
|
||||||
extern void initNeighbor(Neighbor*, Parameter*);
|
extern void initNeighbor(Neighbor*, Parameter*);
|
||||||
|
@ -184,6 +184,43 @@ int atomDistanceInRange(Atom *atom, int ci, int cj, MD_FLOAT rsq) {
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Returns a diagonal or off-diagonal interaction mask for plain C lists */
|
||||||
|
static unsigned int get_imask(int rdiag, int ci, int cj) {
|
||||||
|
return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Returns a diagonal or off-diagonal interaction mask for cj-size=2 */
|
||||||
|
static unsigned int get_imask_simd_j2(int rdiag, int ci, int cj) {
|
||||||
|
return (rdiag && ci * 2 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_0
|
||||||
|
: (rdiag && ci * 2 + 1 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_1
|
||||||
|
: NBNXN_INTERACTION_MASK_ALL));
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Returns a diagonal or off-diagonal interaction mask for cj-size=4 */
|
||||||
|
static unsigned int get_imask_simd_j4(int rdiag, int ci, int cj) {
|
||||||
|
return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Returns a diagonal or off-diagonal interaction mask for cj-size=8 */
|
||||||
|
static unsigned int get_imask_simd_j8(int rdiag, int ci, int cj) {
|
||||||
|
return (rdiag && ci == cj * 2 ? NBNXN_INTERACTION_MASK_DIAG_J8_0
|
||||||
|
: (rdiag && ci == cj * 2 + 1 ? NBNXN_INTERACTION_MASK_DIAG_J8_1
|
||||||
|
: NBNXN_INTERACTION_MASK_ALL));
|
||||||
|
}
|
||||||
|
|
||||||
|
#if VECTOR_WIDTH == 2
|
||||||
|
# define get_imask_simd_4xn get_imask_simd_j2
|
||||||
|
#elif VECTOR_WIDTH== 4
|
||||||
|
# define get_imask_simd_4xn get_imask_simd_j4
|
||||||
|
#elif VECTOR_WIDTH == 8
|
||||||
|
# define get_imask_simd_4xn get_imask_simd_j8
|
||||||
|
# define get_imask_simd_2xnn get_imask_simd_j4
|
||||||
|
#elif VECTOR_WIDTH == 16
|
||||||
|
# define get_imask_simd_2xnn get_imask_simd_j8
|
||||||
|
#else
|
||||||
|
# error "Invalid cluster configuration"
|
||||||
|
#endif
|
||||||
|
|
||||||
void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||||
DEBUG_MESSAGE("buildNeighbor start\n");
|
DEBUG_MESSAGE("buildNeighbor start\n");
|
||||||
|
|
||||||
@ -193,7 +230,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
|||||||
if(neighbor->numneigh) free(neighbor->numneigh);
|
if(neighbor->numneigh) free(neighbor->numneigh);
|
||||||
if(neighbor->neighbors) free(neighbor->neighbors);
|
if(neighbor->neighbors) free(neighbor->neighbors);
|
||||||
neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
|
neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
|
||||||
neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*));
|
neighbor->neighbors = (NeighborCluster*) malloc(nmax * neighbor->maxneighs * sizeof(NeighborCluster));
|
||||||
}
|
}
|
||||||
|
|
||||||
MD_FLOAT bbx = 0.5 * (binsizex + binsizex);
|
MD_FLOAT bbx = 0.5 * (binsizex + binsizex);
|
||||||
@ -209,7 +246,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
|||||||
|
|
||||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||||
int ci_cj1 = CJ1_FROM_CI(ci);
|
int ci_cj1 = CJ1_FROM_CI(ci);
|
||||||
int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
|
NeighborCluster *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
|
||||||
int n = 0;
|
int n = 0;
|
||||||
int ibin = atom->icluster_bin[ci];
|
int ibin = atom->icluster_bin[ci];
|
||||||
MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx;
|
MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx;
|
||||||
@ -275,7 +312,16 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
|||||||
|
|
||||||
if(d_bb_sq < cutneighsq) {
|
if(d_bb_sq < cutneighsq) {
|
||||||
if(d_bb_sq < rbb_sq || atomDistanceInRange(atom, ci, cj, cutneighsq)) {
|
if(d_bb_sq < rbb_sq || atomDistanceInRange(atom, ci, cj, cutneighsq)) {
|
||||||
neighptr[n++] = cj;
|
unsigned int imask;
|
||||||
|
#if CLUSTER_N == (VECTOR_WIDTH / 2) // 2xnn
|
||||||
|
imask = get_imask_simd_2xnn(neighbor->half_neigh, ci, cj);
|
||||||
|
#else // 4xn
|
||||||
|
imask = get_imask_simd_4xn(neighbor->half_neigh, ci, cj);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
neighptr[n].cj = cj;
|
||||||
|
neighptr[n].imask = imask;
|
||||||
|
n++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -297,7 +343,9 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
|||||||
// Fill neighbor list with dummy values to fit vector width
|
// Fill neighbor list with dummy values to fit vector width
|
||||||
if(CLUSTER_N < VECTOR_WIDTH) {
|
if(CLUSTER_N < VECTOR_WIDTH) {
|
||||||
while(n % (VECTOR_WIDTH / CLUSTER_N)) {
|
while(n % (VECTOR_WIDTH / CLUSTER_N)) {
|
||||||
neighptr[n++] = atom->dummy_cj; // Last cluster is always a dummy cluster
|
neighptr[n].cj = atom->dummy_cj; // Last cluster is always a dummy cluster
|
||||||
|
neighptr[n].imask = 0;
|
||||||
|
n++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -315,7 +363,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
|||||||
fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
|
fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
|
||||||
neighbor->maxneighs = new_maxneighs * 1.2;
|
neighbor->maxneighs = new_maxneighs * 1.2;
|
||||||
free(neighbor->neighbors);
|
free(neighbor->neighbors);
|
||||||
neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
|
neighbor->neighbors = (NeighborCluster*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -370,19 +418,19 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
|||||||
MD_FLOAT cutsq = cutneighsq;
|
MD_FLOAT cutsq = cutneighsq;
|
||||||
|
|
||||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||||
int *neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
NeighborCluster *neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
||||||
int numneighs = neighbor->numneigh[ci];
|
int numneighs = neighbor->numneigh[ci];
|
||||||
int k = 0;
|
int k = 0;
|
||||||
|
|
||||||
// Remove dummy clusters if necessary
|
// Remove dummy clusters if necessary
|
||||||
if(CLUSTER_N < VECTOR_WIDTH) {
|
if(CLUSTER_N < VECTOR_WIDTH) {
|
||||||
while(neighs[numneighs - 1] == atom->dummy_cj) {
|
while(neighs[numneighs - 1].cj == atom->dummy_cj) {
|
||||||
numneighs--;
|
numneighs--;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
while(k < numneighs) {
|
while(k < numneighs) {
|
||||||
int cj = neighs[k];
|
int cj = neighs[k].cj;
|
||||||
if(atomDistanceInRange(atom, ci, cj, cutsq)) {
|
if(atomDistanceInRange(atom, ci, cj, cutsq)) {
|
||||||
k++;
|
k++;
|
||||||
} else {
|
} else {
|
||||||
@ -394,7 +442,9 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
|||||||
// Readd dummy clusters if necessary
|
// Readd dummy clusters if necessary
|
||||||
if(CLUSTER_N < VECTOR_WIDTH) {
|
if(CLUSTER_N < VECTOR_WIDTH) {
|
||||||
while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) {
|
while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) {
|
||||||
neighs[numneighs++] = atom->dummy_cj; // Last cluster is always a dummy cluster
|
neighs[numneighs].cj = atom->dummy_cj; // Last cluster is always a dummy cluster
|
||||||
|
neighs[numneighs].imask = 0;
|
||||||
|
numneighs++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -13,7 +13,7 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
|
|||||||
MEM_TRACER_INIT;
|
MEM_TRACER_INIT;
|
||||||
INDEX_TRACER_INIT;
|
INDEX_TRACER_INIT;
|
||||||
int Nlocal = atom->Nlocal;
|
int Nlocal = atom->Nlocal;
|
||||||
int* neighs;
|
NeighborCluster* neighs;
|
||||||
//MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
|
//MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
|
||||||
|
|
||||||
INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
|
INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
|
||||||
@ -34,7 +34,8 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
|
|||||||
DIST_TRACE(neighs, numneighs);
|
DIST_TRACE(neighs, numneighs);
|
||||||
|
|
||||||
for(int k = 0; k < numneighs; k++) {
|
for(int k = 0; k < numneighs; k++) {
|
||||||
MEM_TRACE(neighs[k], 'R');
|
int j = neighs[k].cj;
|
||||||
|
MEM_TRACE(j, 'R');
|
||||||
MEM_TRACE(atom_x(j), 'R');
|
MEM_TRACE(atom_x(j), 'R');
|
||||||
MEM_TRACE(atom_y(j), 'R');
|
MEM_TRACE(atom_y(j), 'R');
|
||||||
MEM_TRACE(atom_z(j), 'R');
|
MEM_TRACE(atom_z(j), 'R');
|
||||||
|
Loading…
Reference in New Issue
Block a user