diff --git a/common/includes/parameter.h b/common/includes/parameter.h
index c76dcd5..f4ff676 100644
--- a/common/includes/parameter.h
+++ b/common/includes/parameter.h
@@ -8,9 +8,11 @@
 #define __PARAMETER_H_
 
 #if PRECISION == 1
-#define MD_FLOAT float
+#   define MD_FLOAT float
+#   define MD_UINT  unsigned int
 #else
-#define MD_FLOAT double
+#   define MD_FLOAT double
+#   define MD_UINT  unsigned long long int
 #endif
 
 typedef struct {
diff --git a/common/includes/simd/avx512_double.h b/common/includes/simd/avx512_double.h
index fac5cd2..76c4bc0 100644
--- a/common/includes/simd/avx512_double.h
+++ b/common/includes/simd/avx512_double.h
@@ -9,10 +9,13 @@
 #   include <zmmintrin.h>
 #endif
 
-#define MD_SIMD_FLOAT   __m512d
-#define MD_SIMD_MASK    __mmask8
-#define MD_SIMD_INT     __m256i
+#define MD_SIMD_FLOAT       __m512d
+#define MD_SIMD_MASK        __mmask8
+#define MD_SIMD_INT         __m256i
+#define MD_SIMD_BITMASK     MD_SIMD_INT
+#define MD_SIMD_IBOOL       __mmask16
 
+static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) { return (__mmask8)(a); }
 static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm512_set1_pd(scalar); }
 static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_pd(0.0); }
 static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_pd(a, b); }
diff --git a/common/includes/simd/avx512_float.h b/common/includes/simd/avx512_float.h
index 1fe5803..3d5bea4 100644
--- a/common/includes/simd/avx512_float.h
+++ b/common/includes/simd/avx512_float.h
@@ -11,10 +11,26 @@
 #   include <zmmintrin.h>
 #endif
 
-#define MD_SIMD_FLOAT   __m512
-#define MD_SIMD_MASK    __mmask16
-#define MD_SIMD_INT     __m256i
+#define MD_SIMD_FLOAT       __m512
+#define MD_SIMD_MASK        __mmask16
+#define MD_SIMD_INT         __m256i
+#define MD_SIMD_IBOOL       __mmask16
+#define MD_SIMD_INT32       __m512i
+#define MD_SIMD_BITMASK     MD_SIMD_INT32
 
+static inline MD_SIMD_BITMASK simd_load_bitmask(const int *m) {
+    return _mm512_load_si512(m);
+}
+
+static inline MD_SIMD_INT32 simd_int32_broadcast(int a) {
+    return _mm512_set1_epi32(a);
+}
+
+static inline MD_SIMD_IBOOL simd_test_bits(MD_SIMD_FLOAT a) {
+    return _mm512_test_epi32_mask(_mm512_castps_si512(a), _mm512_castps_si512(a));
+}
+
+static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) { return a; }
 static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); }
 static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_ps(0.0f); }
 static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_ps(a, b); }
diff --git a/gromacs/atom.c b/gromacs/atom.c
index 3f45116..cf57b42 100644
--- a/gromacs/atom.c
+++ b/gromacs/atom.c
@@ -50,6 +50,8 @@ void createAtom(Atom *atom, Parameter *param) {
     atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
     atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
     atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->exclusion_filter = allocate(ALIGNMENT, CLUSTER_M * VECTOR_WIDTH * sizeof(MD_UINT));
+
     for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
         atom->epsilon[i] = param->epsilon;
         atom->sigma6[i] = param->sigma6;
@@ -57,6 +59,10 @@ void createAtom(Atom *atom, Parameter *param) {
         atom->cutforcesq[i] = param->cutforce * param->cutforce;
     }
 
+    for(int i = 0; i < CLUSTER_M * VECTOR_WIDTH; i++) {
+        atom->exclusion_filter[i] = (1U << i);
+    }
+
     MD_FLOAT alat = pow((4.0 / param->rho), (1.0 / 3.0));
     int ilo = (int) (xlo / (0.5 * alat) - 1);
     int ihi = (int) (xhi / (0.5 * alat) + 1);
diff --git a/gromacs/force_lj.c b/gromacs/force_lj.c
index 24cf8ae..65fe5a5 100644
--- a/gromacs/force_lj.c
+++ b/gromacs/force_lj.c
@@ -16,10 +16,36 @@
 #include <simd.h>
 
 
+static inline void gmx_load_simd_2xnn_interactions(
+    int excl,
+    MD_SIMD_BITMASK filter0, MD_SIMD_BITMASK filter2,
+    MD_SIMD_MASK *interact0, MD_SIMD_MASK *interact2) {
+
+    //SimdInt32 mask_pr_S(excl);
+    MD_SIMD_INT32 mask_pr_S = simd_int32_broadcast(excl);
+    *interact0 = cvtIB2B(simd_test_bits(mask_pr_S));
+    *interact2 = cvtIB2B(simd_test_bits(mask_pr_S));
+    //*interact0 = cvtIB2B(simd_test_bits(mask_pr_S & filter0));
+    //*interact2 = cvtIB2B(simd_test_bits(mask_pr_S & filter2));
+}
+
+static inline void gmx_load_simd_4xn_interactions(
+    int excl,
+    MD_SIMD_BITMASK filter0, MD_SIMD_BITMASK filter1, MD_SIMD_BITMASK filter2, MD_SIMD_BITMASK filter3,
+    MD_SIMD_MASK *interact0, MD_SIMD_MASK *interact1, MD_SIMD_MASK *interact2, MD_SIMD_MASK *interact3) {
+
+    //SimdInt32 mask_pr_S(excl);
+    MD_SIMD_INT32 mask_pr_S = simd_int32_broadcast(excl);
+    *interact0 = cvtIB2B(simd_test_bits(mask_pr_S & filter0));
+    *interact1 = cvtIB2B(simd_test_bits(mask_pr_S & filter1));
+    *interact2 = cvtIB2B(simd_test_bits(mask_pr_S & filter2));
+    *interact3 = cvtIB2B(simd_test_bits(mask_pr_S & filter3));
+}
+
 double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
     DEBUG_MESSAGE("computeForceLJ begin\n");
     int Nlocal = atom->Nlocal;
-    int* neighs;
+    NeighborCluster* neighs;
     MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
     MD_FLOAT sigma6 = param->sigma6;
     MD_FLOAT epsilon = param->epsilon;
@@ -51,7 +77,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
         int numneighs = neighbor->numneigh[ci];
 
         for(int k = 0; k < numneighs; k++) {
-            int cj = neighs[k];
+            int cj = neighs[k].cj;
             int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
             int any = 0;
             MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
@@ -132,7 +158,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
 double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
     DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
     int Nlocal = atom->Nlocal;
-    int* neighs;
+    NeighborCluster* neighs;
     MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
     MD_FLOAT sigma6 = param->sigma6;
     MD_FLOAT epsilon = param->epsilon;
@@ -159,6 +185,9 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
     {
     LIKWID_MARKER_START("force");
 
+    MD_SIMD_BITMASK filter0 = simd_load_bitmask((const int *) &atom->exclusion_filter[0 * (VECTOR_WIDTH / UNROLL_J)]);
+    MD_SIMD_BITMASK filter2 = simd_load_bitmask((const int *) &atom->exclusion_filter[2 * (VECTOR_WIDTH / UNROLL_J)]);
+
     #pragma omp for
     for(int ci = 0; ci < atom->Nclusters_local; ci++) {
         int ci_cj0 = CJ0_FROM_CI(ci);
@@ -185,11 +214,13 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
         MD_SIMD_FLOAT fiz2 = simd_zero();
 
         for(int k = 0; k < numneighs; k++) {
-            int cj = neighs[k];
+            int cj = neighs[k].cj;
             int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
+            int imask = neighs[k].imask;
             MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
             MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
-            unsigned int mask0, mask1, mask2, mask3;
+            MD_SIMD_MASK interact0;
+            MD_SIMD_MASK interact2;
 
             MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
             MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
@@ -201,36 +232,13 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
             MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
             MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
 
-            #if CLUSTER_M == CLUSTER_N
-            unsigned int cond0 = (unsigned int)(cj == ci_cj0);
-            mask0 = (unsigned int)(0xf - 0x1 * cond0);
-            mask1 = (unsigned int)(0xf - 0x3 * cond0);
-            mask2 = (unsigned int)(0xf - 0x7 * cond0);
-            mask3 = (unsigned int)(0xf - 0xf * cond0);
-            #elif CLUSTER_M < CLUSTER_N
-            unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
-            unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
-            mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
-            mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
-            mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
-            mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
-            #else
-            unsigned int cond0 = (unsigned int)(cj == ci_cj0);
-            unsigned int cond1 = (unsigned int)(cj == ci_cj1);
-            mask0 = (unsigned int)(0x3 - 0x1 * cond0);
-            mask1 = (unsigned int)(0x3 - 0x3 * cond0);
-            mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1);
-            mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1);
-            #endif
-
-            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0);
-            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2);
+            gmx_load_simd_2xnn_interactions((int) imask, filter0, filter2, &interact0, &interact2);
 
             MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
             MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
 
-            MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
-            MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
+            MD_SIMD_MASK cutoff_mask0 = simd_mask_and(interact0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
+            MD_SIMD_MASK cutoff_mask2 = simd_mask_and(interact2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
 
             MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
             MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
@@ -284,7 +292,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
 double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
     DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
     int Nlocal = atom->Nlocal;
-    int* neighs;
+    NeighborCluster* neighs;
     MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
     MD_FLOAT sigma6 = param->sigma6;
     MD_FLOAT epsilon = param->epsilon;
@@ -337,8 +345,9 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
         MD_SIMD_FLOAT fiz2 = simd_zero();
 
         for(int k = 0; k < numneighs; k++) {
-            int cj = neighs[k];
+            int cj = neighs[k].cj;
             int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
+            int imask = neighs[k].imask;
             MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
             unsigned int mask0, mask1, mask2, mask3;
 
@@ -429,7 +438,7 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta
 double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
     DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
     int Nlocal = atom->Nlocal;
-    int* neighs;
+    NeighborCluster* neighs;
     MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
     MD_FLOAT sigma6 = param->sigma6;
     MD_FLOAT epsilon = param->epsilon;
@@ -493,8 +502,9 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
         MD_SIMD_FLOAT fiz3 = simd_zero();
 
         for(int k = 0; k < numneighs; k++) {
-            int cj = neighs[k];
+            int cj = neighs[k].cj;
             int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
+            int imask = neighs[k].imask;
             MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
             MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
             MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
@@ -619,7 +629,7 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
 double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
     DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
     int Nlocal = atom->Nlocal;
-    int* neighs;
+    NeighborCluster* neighs;
     MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
     MD_FLOAT sigma6 = param->sigma6;
     MD_FLOAT epsilon = param->epsilon;
@@ -683,8 +693,9 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
         MD_SIMD_FLOAT fiz3 = simd_zero();
 
         for(int k = 0; k < numneighs; k++) {
-            int cj = neighs[k];
+            int cj = neighs[k].cj;
             int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
+            int imask = neighs[k].imask;
             MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
             MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
             MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
diff --git a/gromacs/includes/atom.h b/gromacs/includes/atom.h
index 8c4cb98..f1cdf6f 100644
--- a/gromacs/includes/atom.h
+++ b/gromacs/includes/atom.h
@@ -22,6 +22,7 @@
 #   define KERNEL_NAME              "CUDA"
 #   define CLUSTER_M                8
 #   define CLUSTER_N                VECTOR_WIDTH
+#   define UNROLL_J                 1
 #   define computeForceLJ           computeForceLJ_cuda
 #   define initialIntegrate         cudaInitialIntegrate
 #   define finalIntegrate           cudaFinalIntegrate
@@ -32,11 +33,13 @@
 #   if VECTOR_WIDTH > CLUSTER_M * 2
 #       define KERNEL_NAME          "Simd2xNN"
 #       define CLUSTER_N            (VECTOR_WIDTH / 2)
+#       define UNROLL_J             2
 #       define computeForceLJ       computeForceLJ_2xnn
 // Simd4xN
 #   else
 #       define KERNEL_NAME          "Simd4xN"
 #       define CLUSTER_N            VECTOR_WIDTH
+#       define UNROLL_J             1
 #       define computeForceLJ       computeForceLJ_4xn
 #   endif
 #   ifdef USE_REFERENCE_VERSION
@@ -116,6 +119,7 @@ typedef struct {
     Cluster *iclusters, *jclusters;
     int *icluster_bin;
     int dummy_cj;
+    MD_UINT *exclusion_filter;
 } Atom;
 
 extern void initAtom(Atom*);
diff --git a/gromacs/includes/neighbor.h b/gromacs/includes/neighbor.h
index ddaaeed..ace503e 100644
--- a/gromacs/includes/neighbor.h
+++ b/gromacs/includes/neighbor.h
@@ -9,13 +9,34 @@
 
 #ifndef __NEIGHBOR_H_
 #define __NEIGHBOR_H_
+// Interaction masks from GROMACS, things to remember (maybe these confused just me):
+//   1. These are not "exclusion" masks as the name suggests in GROMACS, but rather
+//      interaction masks (1 = interaction, 0 = no interaction)
+//   2. These are inverted (maybe because that is how you use in AVX2/AVX512 masking),
+//      so read them from right to left (least significant to most significant bit)
+// All interaction mask is the same for all kernels
+#define NBNXN_INTERACTION_MASK_ALL 0xffffffffU
+// 4x4 kernel diagonal mask
+#define NBNXN_INTERACTION_MASK_DIAG 0x08ceU
+// 4x2 kernel diagonal masks
+#define NBNXN_INTERACTION_MASK_DIAG_J2_0 0x0002U
+#define NBNXN_INTERACTION_MASK_DIAG_J2_1 0x002fU
+// 4x8 kernel diagonal masks
+#define NBNXN_INTERACTION_MASK_DIAG_J8_0 0xf0f8fcfeU
+#define NBNXN_INTERACTION_MASK_DIAG_J8_1 0x0080c0e0U
+
+typedef struct {
+    int cj;
+    unsigned int imask;
+} NeighborCluster;
+
 typedef struct {
     int every;
     int ncalls;
-    int* neighbors;
     int maxneighs;
     int* numneigh;
     int half_neigh;
+    NeighborCluster* neighbors;
 } Neighbor;
 
 extern void initNeighbor(Neighbor*, Parameter*);
diff --git a/gromacs/neighbor.c b/gromacs/neighbor.c
index fea0d2f..56c3191 100644
--- a/gromacs/neighbor.c
+++ b/gromacs/neighbor.c
@@ -184,6 +184,43 @@ int atomDistanceInRange(Atom *atom, int ci, int cj, MD_FLOAT rsq) {
     return 0;
 }
 
+/* Returns a diagonal or off-diagonal interaction mask for plain C lists */
+static unsigned int get_imask(int rdiag, int ci, int cj) {
+    return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
+}
+
+/* Returns a diagonal or off-diagonal interaction mask for cj-size=2 */
+static unsigned int get_imask_simd_j2(int rdiag, int ci, int cj) {
+    return (rdiag && ci * 2 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_0
+                                  : (rdiag && ci * 2 + 1 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_1
+                                                               : NBNXN_INTERACTION_MASK_ALL));
+}
+
+/* Returns a diagonal or off-diagonal interaction mask for cj-size=4 */
+static unsigned int get_imask_simd_j4(int rdiag, int ci, int cj) {
+    return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
+}
+
+/* Returns a diagonal or off-diagonal interaction mask for cj-size=8 */
+static unsigned int get_imask_simd_j8(int rdiag, int ci, int cj) {
+    return (rdiag && ci == cj * 2 ? NBNXN_INTERACTION_MASK_DIAG_J8_0
+                                  : (rdiag && ci == cj * 2 + 1 ? NBNXN_INTERACTION_MASK_DIAG_J8_1
+                                                               : NBNXN_INTERACTION_MASK_ALL));
+}
+
+#if VECTOR_WIDTH == 2
+#   define get_imask_simd_4xn get_imask_simd_j2
+#elif VECTOR_WIDTH== 4
+#   define get_imask_simd_4xn get_imask_simd_j4
+#elif VECTOR_WIDTH == 8
+#   define get_imask_simd_4xn get_imask_simd_j8
+#   define get_imask_simd_2xnn get_imask_simd_j4
+#elif VECTOR_WIDTH == 16
+#   define get_imask_simd_2xnn get_imask_simd_j8
+#else
+#   error "Invalid cluster configuration"
+#endif
+
 void buildNeighbor(Atom *atom, Neighbor *neighbor) {
     DEBUG_MESSAGE("buildNeighbor start\n");
 
@@ -193,7 +230,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
         if(neighbor->numneigh) free(neighbor->numneigh);
         if(neighbor->neighbors) free(neighbor->neighbors);
         neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
-        neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*));
+        neighbor->neighbors = (NeighborCluster*) malloc(nmax * neighbor->maxneighs * sizeof(NeighborCluster));
     }
 
     MD_FLOAT bbx = 0.5 * (binsizex + binsizex);
@@ -209,7 +246,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
 
         for(int ci = 0; ci < atom->Nclusters_local; ci++) {
             int ci_cj1 = CJ1_FROM_CI(ci);
-            int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
+            NeighborCluster *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
             int n = 0;
             int ibin = atom->icluster_bin[ci];
             MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx;
@@ -275,7 +312,16 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
 
                             if(d_bb_sq < cutneighsq) {
                                 if(d_bb_sq < rbb_sq || atomDistanceInRange(atom, ci, cj, cutneighsq)) {
-                                    neighptr[n++] = cj;
+                                    unsigned int imask;
+                                    #if CLUSTER_N == (VECTOR_WIDTH / 2) // 2xnn
+                                    imask = get_imask_simd_2xnn(neighbor->half_neigh, ci, cj);
+                                    #else // 4xn
+                                    imask = get_imask_simd_4xn(neighbor->half_neigh, ci, cj);
+                                    #endif
+
+                                    neighptr[n].cj = cj;
+                                    neighptr[n].imask = imask;
+                                    n++;
                                 }
                             }
                         }
@@ -297,7 +343,9 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
             // Fill neighbor list with dummy values to fit vector width
             if(CLUSTER_N < VECTOR_WIDTH) {
                 while(n % (VECTOR_WIDTH / CLUSTER_N)) {
-                    neighptr[n++] = atom->dummy_cj; // Last cluster is always a dummy cluster
+                    neighptr[n].cj = atom->dummy_cj; // Last cluster is always a dummy cluster
+                    neighptr[n].imask = 0;
+                    n++;
                 }
             }
 
@@ -315,7 +363,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
             fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
             neighbor->maxneighs = new_maxneighs * 1.2;
             free(neighbor->neighbors);
-            neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
+            neighbor->neighbors = (NeighborCluster*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
         }
     }
 
@@ -370,19 +418,19 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
     MD_FLOAT cutsq = cutneighsq;
 
     for(int ci = 0; ci < atom->Nclusters_local; ci++) {
-        int *neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
+        NeighborCluster *neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
         int numneighs = neighbor->numneigh[ci];
         int k = 0;
 
         // Remove dummy clusters if necessary
         if(CLUSTER_N < VECTOR_WIDTH) {
-            while(neighs[numneighs - 1] == atom->dummy_cj) {
+            while(neighs[numneighs - 1].cj == atom->dummy_cj) {
                 numneighs--;
             }
         }
 
         while(k < numneighs) {
-            int cj = neighs[k];
+            int cj = neighs[k].cj;
             if(atomDistanceInRange(atom, ci, cj, cutsq)) {
                 k++;
             } else {
@@ -394,7 +442,9 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
         // Readd dummy clusters if necessary
         if(CLUSTER_N < VECTOR_WIDTH) {
             while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) {
-                neighs[numneighs++] = atom->dummy_cj; // Last cluster is always a dummy cluster
+                neighs[numneighs].cj = atom->dummy_cj; // Last cluster is always a dummy cluster
+                neighs[numneighs].imask = 0;
+                numneighs++;
             }
         }
 
diff --git a/gromacs/tracing.c b/gromacs/tracing.c
index ef44ad9..c87b73d 100644
--- a/gromacs/tracing.c
+++ b/gromacs/tracing.c
@@ -13,7 +13,7 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
     MEM_TRACER_INIT;
     INDEX_TRACER_INIT;
     int Nlocal = atom->Nlocal;
-    int* neighs;
+    NeighborCluster* neighs;
     //MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
 
     INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
@@ -34,7 +34,8 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
         DIST_TRACE(neighs, numneighs);
 
         for(int k = 0; k < numneighs; k++) {
-            MEM_TRACE(neighs[k], 'R');
+            int j = neighs[k].cj;
+            MEM_TRACE(j, 'R');
             MEM_TRACE(atom_x(j), 'R');
             MEM_TRACE(atom_y(j), 'R');
             MEM_TRACE(atom_z(j), 'R');