Set interaction masks as gromacs does

Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
This commit is contained in:
Rafael Ravedutti 2023-03-23 00:58:25 +01:00
parent f5fd3e265a
commit 296a4c4e01
9 changed files with 171 additions and 57 deletions

View File

@ -9,8 +9,10 @@
#if PRECISION == 1 #if PRECISION == 1
# define MD_FLOAT float # define MD_FLOAT float
# define MD_UINT unsigned int
#else #else
# define MD_FLOAT double # define MD_FLOAT double
# define MD_UINT unsigned long long int
#endif #endif
typedef struct { typedef struct {

View File

@ -12,7 +12,10 @@
#define MD_SIMD_FLOAT __m512d #define MD_SIMD_FLOAT __m512d
#define MD_SIMD_MASK __mmask8 #define MD_SIMD_MASK __mmask8
#define MD_SIMD_INT __m256i #define MD_SIMD_INT __m256i
#define MD_SIMD_BITMASK MD_SIMD_INT
#define MD_SIMD_IBOOL __mmask16
static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) { return (__mmask8)(a); }
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm512_set1_pd(scalar); } static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm512_set1_pd(scalar); }
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_pd(0.0); } static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_pd(0.0); }
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_pd(a, b); } static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_pd(a, b); }

View File

@ -14,7 +14,23 @@
#define MD_SIMD_FLOAT __m512 #define MD_SIMD_FLOAT __m512
#define MD_SIMD_MASK __mmask16 #define MD_SIMD_MASK __mmask16
#define MD_SIMD_INT __m256i #define MD_SIMD_INT __m256i
#define MD_SIMD_IBOOL __mmask16
#define MD_SIMD_INT32 __m512i
#define MD_SIMD_BITMASK MD_SIMD_INT32
static inline MD_SIMD_BITMASK simd_load_bitmask(const int *m) {
return _mm512_load_si512(m);
}
static inline MD_SIMD_INT32 simd_int32_broadcast(int a) {
return _mm512_set1_epi32(a);
}
static inline MD_SIMD_IBOOL simd_test_bits(MD_SIMD_FLOAT a) {
return _mm512_test_epi32_mask(_mm512_castps_si512(a), _mm512_castps_si512(a));
}
static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) { return a; }
static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); } static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); }
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_ps(0.0f); } static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_ps(0.0f); }
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_ps(a, b); } static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_ps(a, b); }

View File

@ -50,6 +50,8 @@ void createAtom(Atom *atom, Parameter *param) {
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)); atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)); atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)); atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->exclusion_filter = allocate(ALIGNMENT, CLUSTER_M * VECTOR_WIDTH * sizeof(MD_UINT));
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) { for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
atom->epsilon[i] = param->epsilon; atom->epsilon[i] = param->epsilon;
atom->sigma6[i] = param->sigma6; atom->sigma6[i] = param->sigma6;
@ -57,6 +59,10 @@ void createAtom(Atom *atom, Parameter *param) {
atom->cutforcesq[i] = param->cutforce * param->cutforce; atom->cutforcesq[i] = param->cutforce * param->cutforce;
} }
for(int i = 0; i < CLUSTER_M * VECTOR_WIDTH; i++) {
atom->exclusion_filter[i] = (1U << i);
}
MD_FLOAT alat = pow((4.0 / param->rho), (1.0 / 3.0)); MD_FLOAT alat = pow((4.0 / param->rho), (1.0 / 3.0));
int ilo = (int) (xlo / (0.5 * alat) - 1); int ilo = (int) (xlo / (0.5 * alat) - 1);
int ihi = (int) (xhi / (0.5 * alat) + 1); int ihi = (int) (xhi / (0.5 * alat) + 1);

View File

@ -16,10 +16,36 @@
#include <simd.h> #include <simd.h>
static inline void gmx_load_simd_2xnn_interactions(
int excl,
MD_SIMD_BITMASK filter0, MD_SIMD_BITMASK filter2,
MD_SIMD_MASK *interact0, MD_SIMD_MASK *interact2) {
//SimdInt32 mask_pr_S(excl);
MD_SIMD_INT32 mask_pr_S = simd_int32_broadcast(excl);
*interact0 = cvtIB2B(simd_test_bits(mask_pr_S));
*interact2 = cvtIB2B(simd_test_bits(mask_pr_S));
//*interact0 = cvtIB2B(simd_test_bits(mask_pr_S & filter0));
//*interact2 = cvtIB2B(simd_test_bits(mask_pr_S & filter2));
}
static inline void gmx_load_simd_4xn_interactions(
int excl,
MD_SIMD_BITMASK filter0, MD_SIMD_BITMASK filter1, MD_SIMD_BITMASK filter2, MD_SIMD_BITMASK filter3,
MD_SIMD_MASK *interact0, MD_SIMD_MASK *interact1, MD_SIMD_MASK *interact2, MD_SIMD_MASK *interact3) {
//SimdInt32 mask_pr_S(excl);
MD_SIMD_INT32 mask_pr_S = simd_int32_broadcast(excl);
*interact0 = cvtIB2B(simd_test_bits(mask_pr_S & filter0));
*interact1 = cvtIB2B(simd_test_bits(mask_pr_S & filter1));
*interact2 = cvtIB2B(simd_test_bits(mask_pr_S & filter2));
*interact3 = cvtIB2B(simd_test_bits(mask_pr_S & filter3));
}
double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJ begin\n"); DEBUG_MESSAGE("computeForceLJ begin\n");
int Nlocal = atom->Nlocal; int Nlocal = atom->Nlocal;
int* neighs; NeighborCluster* neighs;
MD_FLOAT cutforcesq = param->cutforce * param->cutforce; MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6; MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon; MD_FLOAT epsilon = param->epsilon;
@ -51,7 +77,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
int numneighs = neighbor->numneigh[ci]; int numneighs = neighbor->numneigh[ci];
for(int k = 0; k < numneighs; k++) { for(int k = 0; k < numneighs; k++) {
int cj = neighs[k]; int cj = neighs[k].cj;
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int any = 0; int any = 0;
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
@ -132,7 +158,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJ_2xnn begin\n"); DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
int Nlocal = atom->Nlocal; int Nlocal = atom->Nlocal;
int* neighs; NeighborCluster* neighs;
MD_FLOAT cutforcesq = param->cutforce * param->cutforce; MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6; MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon; MD_FLOAT epsilon = param->epsilon;
@ -159,6 +185,9 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
{ {
LIKWID_MARKER_START("force"); LIKWID_MARKER_START("force");
MD_SIMD_BITMASK filter0 = simd_load_bitmask((const int *) &atom->exclusion_filter[0 * (VECTOR_WIDTH / UNROLL_J)]);
MD_SIMD_BITMASK filter2 = simd_load_bitmask((const int *) &atom->exclusion_filter[2 * (VECTOR_WIDTH / UNROLL_J)]);
#pragma omp for #pragma omp for
for(int ci = 0; ci < atom->Nclusters_local; ci++) { for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_cj0 = CJ0_FROM_CI(ci); int ci_cj0 = CJ0_FROM_CI(ci);
@ -185,11 +214,13 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
MD_SIMD_FLOAT fiz2 = simd_zero(); MD_SIMD_FLOAT fiz2 = simd_zero();
for(int k = 0; k < numneighs; k++) { for(int k = 0; k < numneighs; k++) {
int cj = neighs[k]; int cj = neighs[k].cj;
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int imask = neighs[k].imask;
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base]; MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
unsigned int mask0, mask1, mask2, mask3; MD_SIMD_MASK interact0;
MD_SIMD_MASK interact2;
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]); MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]); MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
@ -201,36 +232,13 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp); MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp); MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
#if CLUSTER_M == CLUSTER_N gmx_load_simd_2xnn_interactions((int) imask, filter0, filter2, &interact0, &interact2);
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
mask0 = (unsigned int)(0xf - 0x1 * cond0);
mask1 = (unsigned int)(0xf - 0x3 * cond0);
mask2 = (unsigned int)(0xf - 0x7 * cond0);
mask3 = (unsigned int)(0xf - 0xf * cond0);
#elif CLUSTER_M < CLUSTER_N
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
#else
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
mask0 = (unsigned int)(0x3 - 0x1 * cond0);
mask1 = (unsigned int)(0x3 - 0x3 * cond0);
mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1);
mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1);
#endif
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0);
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2);
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0))); MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2))); MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec)); MD_SIMD_MASK cutoff_mask0 = simd_mask_and(interact0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec)); MD_SIMD_MASK cutoff_mask2 = simd_mask_and(interact2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0); MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2); MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
@ -284,7 +292,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJ_2xnn begin\n"); DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
int Nlocal = atom->Nlocal; int Nlocal = atom->Nlocal;
int* neighs; NeighborCluster* neighs;
MD_FLOAT cutforcesq = param->cutforce * param->cutforce; MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6; MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon; MD_FLOAT epsilon = param->epsilon;
@ -337,8 +345,9 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
MD_SIMD_FLOAT fiz2 = simd_zero(); MD_SIMD_FLOAT fiz2 = simd_zero();
for(int k = 0; k < numneighs; k++) { for(int k = 0; k < numneighs; k++) {
int cj = neighs[k]; int cj = neighs[k].cj;
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int imask = neighs[k].imask;
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
unsigned int mask0, mask1, mask2, mask3; unsigned int mask0, mask1, mask2, mask3;
@ -429,7 +438,7 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta
double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJ_4xn begin\n"); DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
int Nlocal = atom->Nlocal; int Nlocal = atom->Nlocal;
int* neighs; NeighborCluster* neighs;
MD_FLOAT cutforcesq = param->cutforce * param->cutforce; MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6; MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon; MD_FLOAT epsilon = param->epsilon;
@ -493,8 +502,9 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
MD_SIMD_FLOAT fiz3 = simd_zero(); MD_SIMD_FLOAT fiz3 = simd_zero();
for(int k = 0; k < numneighs; k++) { for(int k = 0; k < numneighs; k++) {
int cj = neighs[k]; int cj = neighs[k].cj;
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int imask = neighs[k].imask;
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base]; MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]); MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
@ -619,7 +629,7 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJ_4xn begin\n"); DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
int Nlocal = atom->Nlocal; int Nlocal = atom->Nlocal;
int* neighs; NeighborCluster* neighs;
MD_FLOAT cutforcesq = param->cutforce * param->cutforce; MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6; MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon; MD_FLOAT epsilon = param->epsilon;
@ -683,8 +693,9 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
MD_SIMD_FLOAT fiz3 = simd_zero(); MD_SIMD_FLOAT fiz3 = simd_zero();
for(int k = 0; k < numneighs; k++) { for(int k = 0; k < numneighs; k++) {
int cj = neighs[k]; int cj = neighs[k].cj;
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int imask = neighs[k].imask;
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]); MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]); MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);

View File

@ -22,6 +22,7 @@
# define KERNEL_NAME "CUDA" # define KERNEL_NAME "CUDA"
# define CLUSTER_M 8 # define CLUSTER_M 8
# define CLUSTER_N VECTOR_WIDTH # define CLUSTER_N VECTOR_WIDTH
# define UNROLL_J 1
# define computeForceLJ computeForceLJ_cuda # define computeForceLJ computeForceLJ_cuda
# define initialIntegrate cudaInitialIntegrate # define initialIntegrate cudaInitialIntegrate
# define finalIntegrate cudaFinalIntegrate # define finalIntegrate cudaFinalIntegrate
@ -32,11 +33,13 @@
# if VECTOR_WIDTH > CLUSTER_M * 2 # if VECTOR_WIDTH > CLUSTER_M * 2
# define KERNEL_NAME "Simd2xNN" # define KERNEL_NAME "Simd2xNN"
# define CLUSTER_N (VECTOR_WIDTH / 2) # define CLUSTER_N (VECTOR_WIDTH / 2)
# define UNROLL_J 2
# define computeForceLJ computeForceLJ_2xnn # define computeForceLJ computeForceLJ_2xnn
// Simd4xN // Simd4xN
# else # else
# define KERNEL_NAME "Simd4xN" # define KERNEL_NAME "Simd4xN"
# define CLUSTER_N VECTOR_WIDTH # define CLUSTER_N VECTOR_WIDTH
# define UNROLL_J 1
# define computeForceLJ computeForceLJ_4xn # define computeForceLJ computeForceLJ_4xn
# endif # endif
# ifdef USE_REFERENCE_VERSION # ifdef USE_REFERENCE_VERSION
@ -116,6 +119,7 @@ typedef struct {
Cluster *iclusters, *jclusters; Cluster *iclusters, *jclusters;
int *icluster_bin; int *icluster_bin;
int dummy_cj; int dummy_cj;
MD_UINT *exclusion_filter;
} Atom; } Atom;
extern void initAtom(Atom*); extern void initAtom(Atom*);

View File

@ -9,13 +9,34 @@
#ifndef __NEIGHBOR_H_ #ifndef __NEIGHBOR_H_
#define __NEIGHBOR_H_ #define __NEIGHBOR_H_
// Interaction masks from GROMACS, things to remember (maybe these confused just me):
// 1. These are not "exclusion" masks as the name suggests in GROMACS, but rather
// interaction masks (1 = interaction, 0 = no interaction)
// 2. These are inverted (maybe because that is how you use in AVX2/AVX512 masking),
// so read them from right to left (least significant to most significant bit)
// All interaction mask is the same for all kernels
#define NBNXN_INTERACTION_MASK_ALL 0xffffffffU
// 4x4 kernel diagonal mask
#define NBNXN_INTERACTION_MASK_DIAG 0x08ceU
// 4x2 kernel diagonal masks
#define NBNXN_INTERACTION_MASK_DIAG_J2_0 0x0002U
#define NBNXN_INTERACTION_MASK_DIAG_J2_1 0x002fU
// 4x8 kernel diagonal masks
#define NBNXN_INTERACTION_MASK_DIAG_J8_0 0xf0f8fcfeU
#define NBNXN_INTERACTION_MASK_DIAG_J8_1 0x0080c0e0U
typedef struct {
int cj;
unsigned int imask;
} NeighborCluster;
typedef struct { typedef struct {
int every; int every;
int ncalls; int ncalls;
int* neighbors;
int maxneighs; int maxneighs;
int* numneigh; int* numneigh;
int half_neigh; int half_neigh;
NeighborCluster* neighbors;
} Neighbor; } Neighbor;
extern void initNeighbor(Neighbor*, Parameter*); extern void initNeighbor(Neighbor*, Parameter*);

View File

@ -184,6 +184,43 @@ int atomDistanceInRange(Atom *atom, int ci, int cj, MD_FLOAT rsq) {
return 0; return 0;
} }
/* Returns a diagonal or off-diagonal interaction mask for plain C lists */
static unsigned int get_imask(int rdiag, int ci, int cj) {
return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
}
/* Returns a diagonal or off-diagonal interaction mask for cj-size=2 */
static unsigned int get_imask_simd_j2(int rdiag, int ci, int cj) {
return (rdiag && ci * 2 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_0
: (rdiag && ci * 2 + 1 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_1
: NBNXN_INTERACTION_MASK_ALL));
}
/* Returns a diagonal or off-diagonal interaction mask for cj-size=4 */
static unsigned int get_imask_simd_j4(int rdiag, int ci, int cj) {
return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
}
/* Returns a diagonal or off-diagonal interaction mask for cj-size=8 */
static unsigned int get_imask_simd_j8(int rdiag, int ci, int cj) {
return (rdiag && ci == cj * 2 ? NBNXN_INTERACTION_MASK_DIAG_J8_0
: (rdiag && ci == cj * 2 + 1 ? NBNXN_INTERACTION_MASK_DIAG_J8_1
: NBNXN_INTERACTION_MASK_ALL));
}
#if VECTOR_WIDTH == 2
# define get_imask_simd_4xn get_imask_simd_j2
#elif VECTOR_WIDTH== 4
# define get_imask_simd_4xn get_imask_simd_j4
#elif VECTOR_WIDTH == 8
# define get_imask_simd_4xn get_imask_simd_j8
# define get_imask_simd_2xnn get_imask_simd_j4
#elif VECTOR_WIDTH == 16
# define get_imask_simd_2xnn get_imask_simd_j8
#else
# error "Invalid cluster configuration"
#endif
void buildNeighbor(Atom *atom, Neighbor *neighbor) { void buildNeighbor(Atom *atom, Neighbor *neighbor) {
DEBUG_MESSAGE("buildNeighbor start\n"); DEBUG_MESSAGE("buildNeighbor start\n");
@ -193,7 +230,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
if(neighbor->numneigh) free(neighbor->numneigh); if(neighbor->numneigh) free(neighbor->numneigh);
if(neighbor->neighbors) free(neighbor->neighbors); if(neighbor->neighbors) free(neighbor->neighbors);
neighbor->numneigh = (int*) malloc(nmax * sizeof(int)); neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*)); neighbor->neighbors = (NeighborCluster*) malloc(nmax * neighbor->maxneighs * sizeof(NeighborCluster));
} }
MD_FLOAT bbx = 0.5 * (binsizex + binsizex); MD_FLOAT bbx = 0.5 * (binsizex + binsizex);
@ -209,7 +246,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
for(int ci = 0; ci < atom->Nclusters_local; ci++) { for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_cj1 = CJ1_FROM_CI(ci); int ci_cj1 = CJ1_FROM_CI(ci);
int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]); NeighborCluster *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
int n = 0; int n = 0;
int ibin = atom->icluster_bin[ci]; int ibin = atom->icluster_bin[ci];
MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx; MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx;
@ -275,7 +312,16 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
if(d_bb_sq < cutneighsq) { if(d_bb_sq < cutneighsq) {
if(d_bb_sq < rbb_sq || atomDistanceInRange(atom, ci, cj, cutneighsq)) { if(d_bb_sq < rbb_sq || atomDistanceInRange(atom, ci, cj, cutneighsq)) {
neighptr[n++] = cj; unsigned int imask;
#if CLUSTER_N == (VECTOR_WIDTH / 2) // 2xnn
imask = get_imask_simd_2xnn(neighbor->half_neigh, ci, cj);
#else // 4xn
imask = get_imask_simd_4xn(neighbor->half_neigh, ci, cj);
#endif
neighptr[n].cj = cj;
neighptr[n].imask = imask;
n++;
} }
} }
} }
@ -297,7 +343,9 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
// Fill neighbor list with dummy values to fit vector width // Fill neighbor list with dummy values to fit vector width
if(CLUSTER_N < VECTOR_WIDTH) { if(CLUSTER_N < VECTOR_WIDTH) {
while(n % (VECTOR_WIDTH / CLUSTER_N)) { while(n % (VECTOR_WIDTH / CLUSTER_N)) {
neighptr[n++] = atom->dummy_cj; // Last cluster is always a dummy cluster neighptr[n].cj = atom->dummy_cj; // Last cluster is always a dummy cluster
neighptr[n].imask = 0;
n++;
} }
} }
@ -315,7 +363,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs); fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
neighbor->maxneighs = new_maxneighs * 1.2; neighbor->maxneighs = new_maxneighs * 1.2;
free(neighbor->neighbors); free(neighbor->neighbors);
neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int)); neighbor->neighbors = (NeighborCluster*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
} }
} }
@ -370,19 +418,19 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
MD_FLOAT cutsq = cutneighsq; MD_FLOAT cutsq = cutneighsq;
for(int ci = 0; ci < atom->Nclusters_local; ci++) { for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int *neighs = &neighbor->neighbors[ci * neighbor->maxneighs]; NeighborCluster *neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
int numneighs = neighbor->numneigh[ci]; int numneighs = neighbor->numneigh[ci];
int k = 0; int k = 0;
// Remove dummy clusters if necessary // Remove dummy clusters if necessary
if(CLUSTER_N < VECTOR_WIDTH) { if(CLUSTER_N < VECTOR_WIDTH) {
while(neighs[numneighs - 1] == atom->dummy_cj) { while(neighs[numneighs - 1].cj == atom->dummy_cj) {
numneighs--; numneighs--;
} }
} }
while(k < numneighs) { while(k < numneighs) {
int cj = neighs[k]; int cj = neighs[k].cj;
if(atomDistanceInRange(atom, ci, cj, cutsq)) { if(atomDistanceInRange(atom, ci, cj, cutsq)) {
k++; k++;
} else { } else {
@ -394,7 +442,9 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
// Readd dummy clusters if necessary // Readd dummy clusters if necessary
if(CLUSTER_N < VECTOR_WIDTH) { if(CLUSTER_N < VECTOR_WIDTH) {
while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) { while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) {
neighs[numneighs++] = atom->dummy_cj; // Last cluster is always a dummy cluster neighs[numneighs].cj = atom->dummy_cj; // Last cluster is always a dummy cluster
neighs[numneighs].imask = 0;
numneighs++;
} }
} }

View File

@ -13,7 +13,7 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
MEM_TRACER_INIT; MEM_TRACER_INIT;
INDEX_TRACER_INIT; INDEX_TRACER_INIT;
int Nlocal = atom->Nlocal; int Nlocal = atom->Nlocal;
int* neighs; NeighborCluster* neighs;
//MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz; //MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs); INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
@ -34,7 +34,8 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
DIST_TRACE(neighs, numneighs); DIST_TRACE(neighs, numneighs);
for(int k = 0; k < numneighs; k++) { for(int k = 0; k < numneighs; k++) {
MEM_TRACE(neighs[k], 'R'); int j = neighs[k].cj;
MEM_TRACE(j, 'R');
MEM_TRACE(atom_x(j), 'R'); MEM_TRACE(atom_x(j), 'R');
MEM_TRACE(atom_y(j), 'R'); MEM_TRACE(atom_y(j), 'R');
MEM_TRACE(atom_z(j), 'R'); MEM_TRACE(atom_z(j), 'R');