diff --git a/common/includes/parameter.h b/common/includes/parameter.h index c76dcd5..f4ff676 100644 --- a/common/includes/parameter.h +++ b/common/includes/parameter.h @@ -8,9 +8,11 @@ #define __PARAMETER_H_ #if PRECISION == 1 -#define MD_FLOAT float +# define MD_FLOAT float +# define MD_UINT unsigned int #else -#define MD_FLOAT double +# define MD_FLOAT double +# define MD_UINT unsigned long long int #endif typedef struct { diff --git a/common/includes/simd/avx512_double.h b/common/includes/simd/avx512_double.h index fac5cd2..76c4bc0 100644 --- a/common/includes/simd/avx512_double.h +++ b/common/includes/simd/avx512_double.h @@ -9,10 +9,13 @@ # include #endif -#define MD_SIMD_FLOAT __m512d -#define MD_SIMD_MASK __mmask8 -#define MD_SIMD_INT __m256i +#define MD_SIMD_FLOAT __m512d +#define MD_SIMD_MASK __mmask8 +#define MD_SIMD_INT __m256i +#define MD_SIMD_BITMASK MD_SIMD_INT +#define MD_SIMD_IBOOL __mmask16 +static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) { return (__mmask8)(a); } static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm512_set1_pd(scalar); } static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_pd(0.0); } static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_pd(a, b); } diff --git a/common/includes/simd/avx512_float.h b/common/includes/simd/avx512_float.h index 1fe5803..3d5bea4 100644 --- a/common/includes/simd/avx512_float.h +++ b/common/includes/simd/avx512_float.h @@ -11,10 +11,26 @@ # include #endif -#define MD_SIMD_FLOAT __m512 -#define MD_SIMD_MASK __mmask16 -#define MD_SIMD_INT __m256i +#define MD_SIMD_FLOAT __m512 +#define MD_SIMD_MASK __mmask16 +#define MD_SIMD_INT __m256i +#define MD_SIMD_IBOOL __mmask16 +#define MD_SIMD_INT32 __m512i +#define MD_SIMD_BITMASK MD_SIMD_INT32 +static inline MD_SIMD_BITMASK simd_load_bitmask(const int *m) { + return _mm512_load_si512(m); +} + +static inline MD_SIMD_INT32 simd_int32_broadcast(int a) { + return _mm512_set1_epi32(a); +} + +static inline MD_SIMD_IBOOL simd_test_bits(MD_SIMD_FLOAT a) { + return _mm512_test_epi32_mask(_mm512_castps_si512(a), _mm512_castps_si512(a)); +} + +static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) { return a; } static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); } static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_ps(0.0f); } static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_ps(a, b); } diff --git a/gromacs/atom.c b/gromacs/atom.c index 3f45116..cf57b42 100644 --- a/gromacs/atom.c +++ b/gromacs/atom.c @@ -50,6 +50,8 @@ void createAtom(Atom *atom, Parameter *param) { atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)); atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)); atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)); + atom->exclusion_filter = allocate(ALIGNMENT, CLUSTER_M * VECTOR_WIDTH * sizeof(MD_UINT)); + for(int i = 0; i < atom->ntypes * atom->ntypes; i++) { atom->epsilon[i] = param->epsilon; atom->sigma6[i] = param->sigma6; @@ -57,6 +59,10 @@ void createAtom(Atom *atom, Parameter *param) { atom->cutforcesq[i] = param->cutforce * param->cutforce; } + for(int i = 0; i < CLUSTER_M * VECTOR_WIDTH; i++) { + atom->exclusion_filter[i] = (1U << i); + } + MD_FLOAT alat = pow((4.0 / param->rho), (1.0 / 3.0)); int ilo = (int) (xlo / (0.5 * alat) - 1); int ihi = (int) (xhi / (0.5 * alat) + 1); diff --git a/gromacs/force_lj.c b/gromacs/force_lj.c index 24cf8ae..65fe5a5 100644 --- a/gromacs/force_lj.c +++ b/gromacs/force_lj.c @@ -16,10 +16,36 @@ #include +static inline void gmx_load_simd_2xnn_interactions( + int excl, + MD_SIMD_BITMASK filter0, MD_SIMD_BITMASK filter2, + MD_SIMD_MASK *interact0, MD_SIMD_MASK *interact2) { + + //SimdInt32 mask_pr_S(excl); + MD_SIMD_INT32 mask_pr_S = simd_int32_broadcast(excl); + *interact0 = cvtIB2B(simd_test_bits(mask_pr_S)); + *interact2 = cvtIB2B(simd_test_bits(mask_pr_S)); + //*interact0 = cvtIB2B(simd_test_bits(mask_pr_S & filter0)); + //*interact2 = cvtIB2B(simd_test_bits(mask_pr_S & filter2)); +} + +static inline void gmx_load_simd_4xn_interactions( + int excl, + MD_SIMD_BITMASK filter0, MD_SIMD_BITMASK filter1, MD_SIMD_BITMASK filter2, MD_SIMD_BITMASK filter3, + MD_SIMD_MASK *interact0, MD_SIMD_MASK *interact1, MD_SIMD_MASK *interact2, MD_SIMD_MASK *interact3) { + + //SimdInt32 mask_pr_S(excl); + MD_SIMD_INT32 mask_pr_S = simd_int32_broadcast(excl); + *interact0 = cvtIB2B(simd_test_bits(mask_pr_S & filter0)); + *interact1 = cvtIB2B(simd_test_bits(mask_pr_S & filter1)); + *interact2 = cvtIB2B(simd_test_bits(mask_pr_S & filter2)); + *interact3 = cvtIB2B(simd_test_bits(mask_pr_S & filter3)); +} + double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { DEBUG_MESSAGE("computeForceLJ begin\n"); int Nlocal = atom->Nlocal; - int* neighs; + NeighborCluster* neighs; MD_FLOAT cutforcesq = param->cutforce * param->cutforce; MD_FLOAT sigma6 = param->sigma6; MD_FLOAT epsilon = param->epsilon; @@ -51,7 +77,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat int numneighs = neighbor->numneigh[ci]; for(int k = 0; k < numneighs; k++) { - int cj = neighs[k]; + int cj = neighs[k].cj; int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); int any = 0; MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; @@ -132,7 +158,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { DEBUG_MESSAGE("computeForceLJ_2xnn begin\n"); int Nlocal = atom->Nlocal; - int* neighs; + NeighborCluster* neighs; MD_FLOAT cutforcesq = param->cutforce * param->cutforce; MD_FLOAT sigma6 = param->sigma6; MD_FLOAT epsilon = param->epsilon; @@ -159,6 +185,9 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor { LIKWID_MARKER_START("force"); + MD_SIMD_BITMASK filter0 = simd_load_bitmask((const int *) &atom->exclusion_filter[0 * (VECTOR_WIDTH / UNROLL_J)]); + MD_SIMD_BITMASK filter2 = simd_load_bitmask((const int *) &atom->exclusion_filter[2 * (VECTOR_WIDTH / UNROLL_J)]); + #pragma omp for for(int ci = 0; ci < atom->Nclusters_local; ci++) { int ci_cj0 = CJ0_FROM_CI(ci); @@ -185,11 +214,13 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor MD_SIMD_FLOAT fiz2 = simd_zero(); for(int k = 0; k < numneighs; k++) { - int cj = neighs[k]; + int cj = neighs[k].cj; int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + int imask = neighs[k].imask; MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base]; - unsigned int mask0, mask1, mask2, mask3; + MD_SIMD_MASK interact0; + MD_SIMD_MASK interact2; MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]); MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]); @@ -201,36 +232,13 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp); MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp); - #if CLUSTER_M == CLUSTER_N - unsigned int cond0 = (unsigned int)(cj == ci_cj0); - mask0 = (unsigned int)(0xf - 0x1 * cond0); - mask1 = (unsigned int)(0xf - 0x3 * cond0); - mask2 = (unsigned int)(0xf - 0x7 * cond0); - mask3 = (unsigned int)(0xf - 0xf * cond0); - #elif CLUSTER_M < CLUSTER_N - unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci); - unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci); - mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1); - mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1); - mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1); - mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1); - #else - unsigned int cond0 = (unsigned int)(cj == ci_cj0); - unsigned int cond1 = (unsigned int)(cj == ci_cj1); - mask0 = (unsigned int)(0x3 - 0x1 * cond0); - mask1 = (unsigned int)(0x3 - 0x3 * cond0); - mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1); - mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1); - #endif - - MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0); - MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2); + gmx_load_simd_2xnn_interactions((int) imask, filter0, filter2, &interact0, &interact2); MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0))); MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2))); - MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec)); - MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec)); + MD_SIMD_MASK cutoff_mask0 = simd_mask_and(interact0, simd_mask_cond_lt(rsq0, cutforcesq_vec)); + MD_SIMD_MASK cutoff_mask2 = simd_mask_and(interact2, simd_mask_cond_lt(rsq2, cutforcesq_vec)); MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0); MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2); @@ -284,7 +292,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { DEBUG_MESSAGE("computeForceLJ_2xnn begin\n"); int Nlocal = atom->Nlocal; - int* neighs; + NeighborCluster* neighs; MD_FLOAT cutforcesq = param->cutforce * param->cutforce; MD_FLOAT sigma6 = param->sigma6; MD_FLOAT epsilon = param->epsilon; @@ -337,8 +345,9 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor MD_SIMD_FLOAT fiz2 = simd_zero(); for(int k = 0; k < numneighs; k++) { - int cj = neighs[k]; + int cj = neighs[k].cj; int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + int imask = neighs[k].imask; MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; unsigned int mask0, mask1, mask2, mask3; @@ -429,7 +438,7 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { DEBUG_MESSAGE("computeForceLJ_4xn begin\n"); int Nlocal = atom->Nlocal; - int* neighs; + NeighborCluster* neighs; MD_FLOAT cutforcesq = param->cutforce * param->cutforce; MD_FLOAT sigma6 = param->sigma6; MD_FLOAT epsilon = param->epsilon; @@ -493,8 +502,9 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor, MD_SIMD_FLOAT fiz3 = simd_zero(); for(int k = 0; k < numneighs; k++) { - int cj = neighs[k]; + int cj = neighs[k].cj; int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + int imask = neighs[k].imask; MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base]; MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]); @@ -619,7 +629,7 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor, double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { DEBUG_MESSAGE("computeForceLJ_4xn begin\n"); int Nlocal = atom->Nlocal; - int* neighs; + NeighborCluster* neighs; MD_FLOAT cutforcesq = param->cutforce * param->cutforce; MD_FLOAT sigma6 = param->sigma6; MD_FLOAT epsilon = param->epsilon; @@ -683,8 +693,9 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor, MD_SIMD_FLOAT fiz3 = simd_zero(); for(int k = 0; k < numneighs; k++) { - int cj = neighs[k]; + int cj = neighs[k].cj; int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + int imask = neighs[k].imask; MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]); MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]); diff --git a/gromacs/includes/atom.h b/gromacs/includes/atom.h index 8c4cb98..f1cdf6f 100644 --- a/gromacs/includes/atom.h +++ b/gromacs/includes/atom.h @@ -22,6 +22,7 @@ # define KERNEL_NAME "CUDA" # define CLUSTER_M 8 # define CLUSTER_N VECTOR_WIDTH +# define UNROLL_J 1 # define computeForceLJ computeForceLJ_cuda # define initialIntegrate cudaInitialIntegrate # define finalIntegrate cudaFinalIntegrate @@ -32,11 +33,13 @@ # if VECTOR_WIDTH > CLUSTER_M * 2 # define KERNEL_NAME "Simd2xNN" # define CLUSTER_N (VECTOR_WIDTH / 2) +# define UNROLL_J 2 # define computeForceLJ computeForceLJ_2xnn // Simd4xN # else # define KERNEL_NAME "Simd4xN" # define CLUSTER_N VECTOR_WIDTH +# define UNROLL_J 1 # define computeForceLJ computeForceLJ_4xn # endif # ifdef USE_REFERENCE_VERSION @@ -116,6 +119,7 @@ typedef struct { Cluster *iclusters, *jclusters; int *icluster_bin; int dummy_cj; + MD_UINT *exclusion_filter; } Atom; extern void initAtom(Atom*); diff --git a/gromacs/includes/neighbor.h b/gromacs/includes/neighbor.h index ddaaeed..ace503e 100644 --- a/gromacs/includes/neighbor.h +++ b/gromacs/includes/neighbor.h @@ -9,13 +9,34 @@ #ifndef __NEIGHBOR_H_ #define __NEIGHBOR_H_ +// Interaction masks from GROMACS, things to remember (maybe these confused just me): +// 1. These are not "exclusion" masks as the name suggests in GROMACS, but rather +// interaction masks (1 = interaction, 0 = no interaction) +// 2. These are inverted (maybe because that is how you use in AVX2/AVX512 masking), +// so read them from right to left (least significant to most significant bit) +// All interaction mask is the same for all kernels +#define NBNXN_INTERACTION_MASK_ALL 0xffffffffU +// 4x4 kernel diagonal mask +#define NBNXN_INTERACTION_MASK_DIAG 0x08ceU +// 4x2 kernel diagonal masks +#define NBNXN_INTERACTION_MASK_DIAG_J2_0 0x0002U +#define NBNXN_INTERACTION_MASK_DIAG_J2_1 0x002fU +// 4x8 kernel diagonal masks +#define NBNXN_INTERACTION_MASK_DIAG_J8_0 0xf0f8fcfeU +#define NBNXN_INTERACTION_MASK_DIAG_J8_1 0x0080c0e0U + +typedef struct { + int cj; + unsigned int imask; +} NeighborCluster; + typedef struct { int every; int ncalls; - int* neighbors; int maxneighs; int* numneigh; int half_neigh; + NeighborCluster* neighbors; } Neighbor; extern void initNeighbor(Neighbor*, Parameter*); diff --git a/gromacs/neighbor.c b/gromacs/neighbor.c index fea0d2f..56c3191 100644 --- a/gromacs/neighbor.c +++ b/gromacs/neighbor.c @@ -184,6 +184,43 @@ int atomDistanceInRange(Atom *atom, int ci, int cj, MD_FLOAT rsq) { return 0; } +/* Returns a diagonal or off-diagonal interaction mask for plain C lists */ +static unsigned int get_imask(int rdiag, int ci, int cj) { + return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL); +} + +/* Returns a diagonal or off-diagonal interaction mask for cj-size=2 */ +static unsigned int get_imask_simd_j2(int rdiag, int ci, int cj) { + return (rdiag && ci * 2 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_0 + : (rdiag && ci * 2 + 1 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_1 + : NBNXN_INTERACTION_MASK_ALL)); +} + +/* Returns a diagonal or off-diagonal interaction mask for cj-size=4 */ +static unsigned int get_imask_simd_j4(int rdiag, int ci, int cj) { + return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL); +} + +/* Returns a diagonal or off-diagonal interaction mask for cj-size=8 */ +static unsigned int get_imask_simd_j8(int rdiag, int ci, int cj) { + return (rdiag && ci == cj * 2 ? NBNXN_INTERACTION_MASK_DIAG_J8_0 + : (rdiag && ci == cj * 2 + 1 ? NBNXN_INTERACTION_MASK_DIAG_J8_1 + : NBNXN_INTERACTION_MASK_ALL)); +} + +#if VECTOR_WIDTH == 2 +# define get_imask_simd_4xn get_imask_simd_j2 +#elif VECTOR_WIDTH== 4 +# define get_imask_simd_4xn get_imask_simd_j4 +#elif VECTOR_WIDTH == 8 +# define get_imask_simd_4xn get_imask_simd_j8 +# define get_imask_simd_2xnn get_imask_simd_j4 +#elif VECTOR_WIDTH == 16 +# define get_imask_simd_2xnn get_imask_simd_j8 +#else +# error "Invalid cluster configuration" +#endif + void buildNeighbor(Atom *atom, Neighbor *neighbor) { DEBUG_MESSAGE("buildNeighbor start\n"); @@ -193,7 +230,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { if(neighbor->numneigh) free(neighbor->numneigh); if(neighbor->neighbors) free(neighbor->neighbors); neighbor->numneigh = (int*) malloc(nmax * sizeof(int)); - neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*)); + neighbor->neighbors = (NeighborCluster*) malloc(nmax * neighbor->maxneighs * sizeof(NeighborCluster)); } MD_FLOAT bbx = 0.5 * (binsizex + binsizex); @@ -209,7 +246,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { for(int ci = 0; ci < atom->Nclusters_local; ci++) { int ci_cj1 = CJ1_FROM_CI(ci); - int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]); + NeighborCluster *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]); int n = 0; int ibin = atom->icluster_bin[ci]; MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx; @@ -275,7 +312,16 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { if(d_bb_sq < cutneighsq) { if(d_bb_sq < rbb_sq || atomDistanceInRange(atom, ci, cj, cutneighsq)) { - neighptr[n++] = cj; + unsigned int imask; + #if CLUSTER_N == (VECTOR_WIDTH / 2) // 2xnn + imask = get_imask_simd_2xnn(neighbor->half_neigh, ci, cj); + #else // 4xn + imask = get_imask_simd_4xn(neighbor->half_neigh, ci, cj); + #endif + + neighptr[n].cj = cj; + neighptr[n].imask = imask; + n++; } } } @@ -297,7 +343,9 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { // Fill neighbor list with dummy values to fit vector width if(CLUSTER_N < VECTOR_WIDTH) { while(n % (VECTOR_WIDTH / CLUSTER_N)) { - neighptr[n++] = atom->dummy_cj; // Last cluster is always a dummy cluster + neighptr[n].cj = atom->dummy_cj; // Last cluster is always a dummy cluster + neighptr[n].imask = 0; + n++; } } @@ -315,7 +363,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs); neighbor->maxneighs = new_maxneighs * 1.2; free(neighbor->neighbors); - neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int)); + neighbor->neighbors = (NeighborCluster*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int)); } } @@ -370,19 +418,19 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) { MD_FLOAT cutsq = cutneighsq; for(int ci = 0; ci < atom->Nclusters_local; ci++) { - int *neighs = &neighbor->neighbors[ci * neighbor->maxneighs]; + NeighborCluster *neighs = &neighbor->neighbors[ci * neighbor->maxneighs]; int numneighs = neighbor->numneigh[ci]; int k = 0; // Remove dummy clusters if necessary if(CLUSTER_N < VECTOR_WIDTH) { - while(neighs[numneighs - 1] == atom->dummy_cj) { + while(neighs[numneighs - 1].cj == atom->dummy_cj) { numneighs--; } } while(k < numneighs) { - int cj = neighs[k]; + int cj = neighs[k].cj; if(atomDistanceInRange(atom, ci, cj, cutsq)) { k++; } else { @@ -394,7 +442,9 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) { // Readd dummy clusters if necessary if(CLUSTER_N < VECTOR_WIDTH) { while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) { - neighs[numneighs++] = atom->dummy_cj; // Last cluster is always a dummy cluster + neighs[numneighs].cj = atom->dummy_cj; // Last cluster is always a dummy cluster + neighs[numneighs].imask = 0; + numneighs++; } } diff --git a/gromacs/tracing.c b/gromacs/tracing.c index ef44ad9..c87b73d 100644 --- a/gromacs/tracing.c +++ b/gromacs/tracing.c @@ -13,7 +13,7 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest MEM_TRACER_INIT; INDEX_TRACER_INIT; int Nlocal = atom->Nlocal; - int* neighs; + NeighborCluster* neighs; //MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz; INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs); @@ -34,7 +34,8 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest DIST_TRACE(neighs, numneighs); for(int k = 0; k < numneighs; k++) { - MEM_TRACE(neighs[k], 'R'); + int j = neighs[k].cj; + MEM_TRACE(j, 'R'); MEM_TRACE(atom_x(j), 'R'); MEM_TRACE(atom_y(j), 'R'); MEM_TRACE(atom_z(j), 'R');