Set interaction masks as gromacs does

Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2023-03-23 00:58:25 +01:00
parent f5fd3e265a
commit 296a4c4e01
9 changed files with 171 additions and 57 deletions
--- a/common/includes/parameter.h
+++ b/common/includes/parameter.h
@@ -9,8 +9,10 @@

 #if PRECISION == 1
 #   define MD_FLOAT float
+#   define MD_UINT  unsigned int
 #else
 #   define MD_FLOAT double
+#   define MD_UINT  unsigned long long int
 #endif

 typedef struct {
--- a/common/includes/simd/avx512_double.h
+++ b/common/includes/simd/avx512_double.h
@@ -12,7 +12,10 @@
 #define MD_SIMD_FLOAT       __m512d
 #define MD_SIMD_MASK        __mmask8
 #define MD_SIMD_INT         __m256i
+#define MD_SIMD_BITMASK     MD_SIMD_INT
+#define MD_SIMD_IBOOL       __mmask16

+static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) { return (__mmask8)(a); }
 static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm512_set1_pd(scalar); }
 static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_pd(0.0); }
 static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_pd(a, b); }
--- a/common/includes/simd/avx512_float.h
+++ b/common/includes/simd/avx512_float.h
@@ -14,7 +14,23 @@
 #define MD_SIMD_FLOAT       __m512
 #define MD_SIMD_MASK        __mmask16
 #define MD_SIMD_INT         __m256i
+#define MD_SIMD_IBOOL       __mmask16
+#define MD_SIMD_INT32       __m512i
+#define MD_SIMD_BITMASK     MD_SIMD_INT32

+static inline MD_SIMD_BITMASK simd_load_bitmask(const int *m) {
+    return _mm512_load_si512(m);
+}
+
+static inline MD_SIMD_INT32 simd_int32_broadcast(int a) {
+    return _mm512_set1_epi32(a);
+}
+
+static inline MD_SIMD_IBOOL simd_test_bits(MD_SIMD_FLOAT a) {
+    return _mm512_test_epi32_mask(_mm512_castps_si512(a), _mm512_castps_si512(a));
+}
+
+static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) { return a; }
 static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); }
 static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_ps(0.0f); }
 static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_ps(a, b); }
--- a/gromacs/atom.c
+++ b/gromacs/atom.c
@@ -50,6 +50,8 @@ void createAtom(Atom *atom, Parameter *param) {
    atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
    atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
    atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->exclusion_filter = allocate(ALIGNMENT, CLUSTER_M * VECTOR_WIDTH * sizeof(MD_UINT));
+
    for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
        atom->epsilon[i] = param->epsilon;
        atom->sigma6[i] = param->sigma6;
@@ -57,6 +59,10 @@ void createAtom(Atom *atom, Parameter *param) {
        atom->cutforcesq[i] = param->cutforce * param->cutforce;
    }

+    for(int i = 0; i < CLUSTER_M * VECTOR_WIDTH; i++) {
+        atom->exclusion_filter[i] = (1U << i);
+    }
+
    MD_FLOAT alat = pow((4.0 / param->rho), (1.0 / 3.0));
    int ilo = (int) (xlo / (0.5 * alat) - 1);
    int ihi = (int) (xhi / (0.5 * alat) + 1);
--- a/gromacs/force_lj.c
+++ b/gromacs/force_lj.c
@@ -16,10 +16,36 @@
 #include <simd.h>


+static inline void gmx_load_simd_2xnn_interactions(
+    int excl,
+    MD_SIMD_BITMASK filter0, MD_SIMD_BITMASK filter2,
+    MD_SIMD_MASK *interact0, MD_SIMD_MASK *interact2) {
+
+    //SimdInt32 mask_pr_S(excl);
+    MD_SIMD_INT32 mask_pr_S = simd_int32_broadcast(excl);
+    *interact0 = cvtIB2B(simd_test_bits(mask_pr_S));
+    *interact2 = cvtIB2B(simd_test_bits(mask_pr_S));
+    //*interact0 = cvtIB2B(simd_test_bits(mask_pr_S & filter0));
+    //*interact2 = cvtIB2B(simd_test_bits(mask_pr_S & filter2));
+}
+
+static inline void gmx_load_simd_4xn_interactions(
+    int excl,
+    MD_SIMD_BITMASK filter0, MD_SIMD_BITMASK filter1, MD_SIMD_BITMASK filter2, MD_SIMD_BITMASK filter3,
+    MD_SIMD_MASK *interact0, MD_SIMD_MASK *interact1, MD_SIMD_MASK *interact2, MD_SIMD_MASK *interact3) {
+
+    //SimdInt32 mask_pr_S(excl);
+    MD_SIMD_INT32 mask_pr_S = simd_int32_broadcast(excl);
+    *interact0 = cvtIB2B(simd_test_bits(mask_pr_S & filter0));
+    *interact1 = cvtIB2B(simd_test_bits(mask_pr_S & filter1));
+    *interact2 = cvtIB2B(simd_test_bits(mask_pr_S & filter2));
+    *interact3 = cvtIB2B(simd_test_bits(mask_pr_S & filter3));
+}
+
 double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
    DEBUG_MESSAGE("computeForceLJ begin\n");
    int Nlocal = atom->Nlocal;
-    int* neighs;
+    NeighborCluster* neighs;
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
    MD_FLOAT sigma6 = param->sigma6;
    MD_FLOAT epsilon = param->epsilon;
@@ -51,7 +77,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
        int numneighs = neighbor->numneigh[ci];

        for(int k = 0; k < numneighs; k++) {
-            int cj = neighs[k];
+            int cj = neighs[k].cj;
            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
            int any = 0;
            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
@@ -132,7 +158,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
 double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
    DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
    int Nlocal = atom->Nlocal;
-    int* neighs;
+    NeighborCluster* neighs;
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
    MD_FLOAT sigma6 = param->sigma6;
    MD_FLOAT epsilon = param->epsilon;
@@ -159,6 +185,9 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
    {
    LIKWID_MARKER_START("force");

+    MD_SIMD_BITMASK filter0 = simd_load_bitmask((const int *) &atom->exclusion_filter[0 * (VECTOR_WIDTH / UNROLL_J)]);
+    MD_SIMD_BITMASK filter2 = simd_load_bitmask((const int *) &atom->exclusion_filter[2 * (VECTOR_WIDTH / UNROLL_J)]);
+
    #pragma omp for
    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
        int ci_cj0 = CJ0_FROM_CI(ci);
@@ -185,11 +214,13 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
        MD_SIMD_FLOAT fiz2 = simd_zero();

        for(int k = 0; k < numneighs; k++) {
-            int cj = neighs[k];
+            int cj = neighs[k].cj;
            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
+            int imask = neighs[k].imask;
            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
            MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
-            unsigned int mask0, mask1, mask2, mask3;
+            MD_SIMD_MASK interact0;
+            MD_SIMD_MASK interact2;

            MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
            MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
@@ -201,36 +232,13 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
            MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
            MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);

-            #if CLUSTER_M == CLUSTER_N
-            unsigned int cond0 = (unsigned int)(cj == ci_cj0);
-            mask0 = (unsigned int)(0xf - 0x1 * cond0);
-            mask1 = (unsigned int)(0xf - 0x3 * cond0);
-            mask2 = (unsigned int)(0xf - 0x7 * cond0);
-            mask3 = (unsigned int)(0xf - 0xf * cond0);
-            #elif CLUSTER_M < CLUSTER_N
-            unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
-            unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
-            mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
-            mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
-            mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
-            mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
-            #else
-            unsigned int cond0 = (unsigned int)(cj == ci_cj0);
-            unsigned int cond1 = (unsigned int)(cj == ci_cj1);
-            mask0 = (unsigned int)(0x3 - 0x1 * cond0);
-            mask1 = (unsigned int)(0x3 - 0x3 * cond0);
-            mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1);
-            mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1);
-            #endif
-
-            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0);
-            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2);
+            gmx_load_simd_2xnn_interactions((int) imask, filter0, filter2, &interact0, &interact2);

            MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
            MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));

-            MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
-            MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
+            MD_SIMD_MASK cutoff_mask0 = simd_mask_and(interact0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
+            MD_SIMD_MASK cutoff_mask2 = simd_mask_and(interact2, simd_mask_cond_lt(rsq2, cutforcesq_vec));

            MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
            MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
@@ -284,7 +292,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
 double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
    DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
    int Nlocal = atom->Nlocal;
-    int* neighs;
+    NeighborCluster* neighs;
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
    MD_FLOAT sigma6 = param->sigma6;
    MD_FLOAT epsilon = param->epsilon;
@@ -337,8 +345,9 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
        MD_SIMD_FLOAT fiz2 = simd_zero();

        for(int k = 0; k < numneighs; k++) {
-            int cj = neighs[k];
+            int cj = neighs[k].cj;
            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
+            int imask = neighs[k].imask;
            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
            unsigned int mask0, mask1, mask2, mask3;

@@ -429,7 +438,7 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta
 double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
    DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
    int Nlocal = atom->Nlocal;
-    int* neighs;
+    NeighborCluster* neighs;
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
    MD_FLOAT sigma6 = param->sigma6;
    MD_FLOAT epsilon = param->epsilon;
@@ -493,8 +502,9 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
        MD_SIMD_FLOAT fiz3 = simd_zero();

        for(int k = 0; k < numneighs; k++) {
-            int cj = neighs[k];
+            int cj = neighs[k].cj;
            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
+            int imask = neighs[k].imask;
            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
            MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
            MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
@@ -619,7 +629,7 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
 double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
    DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
    int Nlocal = atom->Nlocal;
-    int* neighs;
+    NeighborCluster* neighs;
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
    MD_FLOAT sigma6 = param->sigma6;
    MD_FLOAT epsilon = param->epsilon;
@@ -683,8 +693,9 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
        MD_SIMD_FLOAT fiz3 = simd_zero();

        for(int k = 0; k < numneighs; k++) {
-            int cj = neighs[k];
+            int cj = neighs[k].cj;
            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
+            int imask = neighs[k].imask;
            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
            MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
            MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
--- a/gromacs/includes/atom.h
+++ b/gromacs/includes/atom.h
@@ -22,6 +22,7 @@
 #   define KERNEL_NAME              "CUDA"
 #   define CLUSTER_M                8
 #   define CLUSTER_N                VECTOR_WIDTH
+#   define UNROLL_J                 1
 #   define computeForceLJ           computeForceLJ_cuda
 #   define initialIntegrate         cudaInitialIntegrate
 #   define finalIntegrate           cudaFinalIntegrate
@@ -32,11 +33,13 @@
 #   if VECTOR_WIDTH > CLUSTER_M * 2
 #       define KERNEL_NAME          "Simd2xNN"
 #       define CLUSTER_N            (VECTOR_WIDTH / 2)
+#       define UNROLL_J             2
 #       define computeForceLJ       computeForceLJ_2xnn
 // Simd4xN
 #   else
 #       define KERNEL_NAME          "Simd4xN"
 #       define CLUSTER_N            VECTOR_WIDTH
+#       define UNROLL_J             1
 #       define computeForceLJ       computeForceLJ_4xn
 #   endif
 #   ifdef USE_REFERENCE_VERSION
@@ -116,6 +119,7 @@ typedef struct {
    Cluster *iclusters, *jclusters;
    int *icluster_bin;
    int dummy_cj;
+    MD_UINT *exclusion_filter;
 } Atom;

 extern void initAtom(Atom*);
--- a/gromacs/includes/neighbor.h
+++ b/gromacs/includes/neighbor.h
@@ -9,13 +9,34 @@

 #ifndef __NEIGHBOR_H_
 #define __NEIGHBOR_H_
+// Interaction masks from GROMACS, things to remember (maybe these confused just me):
+//   1. These are not "exclusion" masks as the name suggests in GROMACS, but rather
+//      interaction masks (1 = interaction, 0 = no interaction)
+//   2. These are inverted (maybe because that is how you use in AVX2/AVX512 masking),
+//      so read them from right to left (least significant to most significant bit)
+// All interaction mask is the same for all kernels
+#define NBNXN_INTERACTION_MASK_ALL 0xffffffffU
+// 4x4 kernel diagonal mask
+#define NBNXN_INTERACTION_MASK_DIAG 0x08ceU
+// 4x2 kernel diagonal masks
+#define NBNXN_INTERACTION_MASK_DIAG_J2_0 0x0002U
+#define NBNXN_INTERACTION_MASK_DIAG_J2_1 0x002fU
+// 4x8 kernel diagonal masks
+#define NBNXN_INTERACTION_MASK_DIAG_J8_0 0xf0f8fcfeU
+#define NBNXN_INTERACTION_MASK_DIAG_J8_1 0x0080c0e0U
+
+typedef struct {
+    int cj;
+    unsigned int imask;
+} NeighborCluster;
+
 typedef struct {
    int every;
    int ncalls;
-    int* neighbors;
    int maxneighs;
    int* numneigh;
    int half_neigh;
+    NeighborCluster* neighbors;
 } Neighbor;

 extern void initNeighbor(Neighbor*, Parameter*);
--- a/gromacs/neighbor.c
+++ b/gromacs/neighbor.c
@@ -184,6 +184,43 @@ int atomDistanceInRange(Atom *atom, int ci, int cj, MD_FLOAT rsq) {
    return 0;
 }

+/* Returns a diagonal or off-diagonal interaction mask for plain C lists */
+static unsigned int get_imask(int rdiag, int ci, int cj) {
+    return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
+}
+
+/* Returns a diagonal or off-diagonal interaction mask for cj-size=2 */
+static unsigned int get_imask_simd_j2(int rdiag, int ci, int cj) {
+    return (rdiag && ci * 2 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_0
+                                  : (rdiag && ci * 2 + 1 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_1
+                                                               : NBNXN_INTERACTION_MASK_ALL));
+}
+
+/* Returns a diagonal or off-diagonal interaction mask for cj-size=4 */
+static unsigned int get_imask_simd_j4(int rdiag, int ci, int cj) {
+    return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
+}
+
+/* Returns a diagonal or off-diagonal interaction mask for cj-size=8 */
+static unsigned int get_imask_simd_j8(int rdiag, int ci, int cj) {
+    return (rdiag && ci == cj * 2 ? NBNXN_INTERACTION_MASK_DIAG_J8_0
+                                  : (rdiag && ci == cj * 2 + 1 ? NBNXN_INTERACTION_MASK_DIAG_J8_1
+                                                               : NBNXN_INTERACTION_MASK_ALL));
+}
+
+#if VECTOR_WIDTH == 2
+#   define get_imask_simd_4xn get_imask_simd_j2
+#elif VECTOR_WIDTH== 4
+#   define get_imask_simd_4xn get_imask_simd_j4
+#elif VECTOR_WIDTH == 8
+#   define get_imask_simd_4xn get_imask_simd_j8
+#   define get_imask_simd_2xnn get_imask_simd_j4
+#elif VECTOR_WIDTH == 16
+#   define get_imask_simd_2xnn get_imask_simd_j8
+#else
+#   error "Invalid cluster configuration"
+#endif
+
 void buildNeighbor(Atom *atom, Neighbor *neighbor) {
    DEBUG_MESSAGE("buildNeighbor start\n");

@@ -193,7 +230,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
        if(neighbor->numneigh) free(neighbor->numneigh);
        if(neighbor->neighbors) free(neighbor->neighbors);
        neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
-        neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*));
+        neighbor->neighbors = (NeighborCluster*) malloc(nmax * neighbor->maxneighs * sizeof(NeighborCluster));
    }

    MD_FLOAT bbx = 0.5 * (binsizex + binsizex);
@@ -209,7 +246,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {

        for(int ci = 0; ci < atom->Nclusters_local; ci++) {
            int ci_cj1 = CJ1_FROM_CI(ci);
-            int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
+            NeighborCluster *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
            int n = 0;
            int ibin = atom->icluster_bin[ci];
            MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx;
@@ -275,7 +312,16 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {

                            if(d_bb_sq < cutneighsq) {
                                if(d_bb_sq < rbb_sq || atomDistanceInRange(atom, ci, cj, cutneighsq)) {
-                                    neighptr[n++] = cj;
+                                    unsigned int imask;
+                                    #if CLUSTER_N == (VECTOR_WIDTH / 2) // 2xnn
+                                    imask = get_imask_simd_2xnn(neighbor->half_neigh, ci, cj);
+                                    #else // 4xn
+                                    imask = get_imask_simd_4xn(neighbor->half_neigh, ci, cj);
+                                    #endif
+
+                                    neighptr[n].cj = cj;
+                                    neighptr[n].imask = imask;
+                                    n++;
                                }
                            }
                        }
@@ -297,7 +343,9 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
            // Fill neighbor list with dummy values to fit vector width
            if(CLUSTER_N < VECTOR_WIDTH) {
                while(n % (VECTOR_WIDTH / CLUSTER_N)) {
-                    neighptr[n++] = atom->dummy_cj; // Last cluster is always a dummy cluster
+                    neighptr[n].cj = atom->dummy_cj; // Last cluster is always a dummy cluster
+                    neighptr[n].imask = 0;
+                    n++;
                }
            }

@@ -315,7 +363,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
            fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
            neighbor->maxneighs = new_maxneighs * 1.2;
            free(neighbor->neighbors);
-            neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
+            neighbor->neighbors = (NeighborCluster*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
        }
    }

@@ -370,19 +418,19 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
    MD_FLOAT cutsq = cutneighsq;

    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
-        int *neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
+        NeighborCluster *neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
        int numneighs = neighbor->numneigh[ci];
        int k = 0;

        // Remove dummy clusters if necessary
        if(CLUSTER_N < VECTOR_WIDTH) {
-            while(neighs[numneighs - 1] == atom->dummy_cj) {
+            while(neighs[numneighs - 1].cj == atom->dummy_cj) {
                numneighs--;
            }
        }

        while(k < numneighs) {
-            int cj = neighs[k];
+            int cj = neighs[k].cj;
            if(atomDistanceInRange(atom, ci, cj, cutsq)) {
                k++;
            } else {
@@ -394,7 +442,9 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
        // Readd dummy clusters if necessary
        if(CLUSTER_N < VECTOR_WIDTH) {
            while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) {
-                neighs[numneighs++] = atom->dummy_cj; // Last cluster is always a dummy cluster
+                neighs[numneighs].cj = atom->dummy_cj; // Last cluster is always a dummy cluster
+                neighs[numneighs].imask = 0;
+                numneighs++;
            }
        }

--- a/gromacs/tracing.c
+++ b/gromacs/tracing.c
@@ -13,7 +13,7 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
    MEM_TRACER_INIT;
    INDEX_TRACER_INIT;
    int Nlocal = atom->Nlocal;
-    int* neighs;
+    NeighborCluster* neighs;
    //MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;

    INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
@@ -34,7 +34,8 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
        DIST_TRACE(neighs, numneighs);

        for(int k = 0; k < numneighs; k++) {
-            MEM_TRACE(neighs[k], 'R');
+            int j = neighs[k].cj;
+            MEM_TRACE(j, 'R');
            MEM_TRACE(atom_x(j), 'R');
            MEM_TRACE(atom_y(j), 'R');
            MEM_TRACE(atom_z(j), 'R');