diff --git a/Makefile b/Makefile
index 83a25ad..354d33d 100644
--- a/Makefile
+++ b/Makefile
@@ -152,6 +152,7 @@ $(BUILD_DIR)/%.o:  %.s
 clean:
 	$(info ===>  CLEAN)
 	@rm -rf $(BUILD_DIR)
+	@rm -rf MDBench-$(IDENTIFIER)
 	@rm -f tags
 
 cleanall:
diff --git a/gromacs/atom.c b/gromacs/atom.c
index bfb2ab7..eef4654 100644
--- a/gromacs/atom.c
+++ b/gromacs/atom.c
@@ -37,6 +37,7 @@ void initAtom(Atom *atom) {
     atom->iclusters = NULL;
     atom->jclusters = NULL;
     atom->icluster_bin = NULL;
+    initMasks(atom);
 }
 
 void createAtom(Atom *atom, Parameter *param) {
@@ -50,9 +51,6 @@ void createAtom(Atom *atom, Parameter *param) {
     atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
     atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
     atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
-    atom->exclusion_filter = allocate(ALIGNMENT, CLUSTER_M * VECTOR_WIDTH * sizeof(MD_UINT));
-    atom->diagonal_4xn_j_minus_i = allocate(ALIGNMENT, MAX(CLUSTER_M, VECTOR_WIDTH) * sizeof(MD_UINT));
-    atom->diagonal_2xnn_j_minus_i = allocate(ALIGNMENT, VECTOR_WIDTH * sizeof(MD_UINT));
 
     for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
         atom->epsilon[i] = param->epsilon;
@@ -61,19 +59,6 @@ void createAtom(Atom *atom, Parameter *param) {
         atom->cutforcesq[i] = param->cutforce * param->cutforce;
     }
 
-    for(int j = 0; j < MAX(CLUSTER_M, VECTOR_WIDTH); j++) {   
-        atom->diagonal_4xn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5;
-    }
-
-    for(int j = 0; j < VECTOR_WIDTH / 2; j++) {
-        atom->diagonal_2xnn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5;
-        atom->diagonal_2xnn_j_minus_i[VECTOR_WIDTH / 2 + j] = (MD_FLOAT)(j - 1) - 0.5;
-    }
-
-    for(int i = 0; i < CLUSTER_M * VECTOR_WIDTH; i++) {
-        atom->exclusion_filter[i] = (1U << i);
-    }
-
     MD_FLOAT alat = pow((4.0 / param->rho), (1.0 / 3.0));
     int ilo = (int) (xlo / (0.5 * alat) - 1);
     int ihi = (int) (xhi / (0.5 * alat) + 1);
@@ -409,6 +394,59 @@ int readAtom_dmp(Atom* atom, Parameter* param) {
     return natoms;
 }
 
+/* Allocates atom's exclusion filter and diagonal tables, fills them, and fills atom->masks_2xnn. */
+void initMasks(Atom *atom) {
+    const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
+    unsigned int mask0, mask1, mask2, mask3;
+    // Both diagonal tables are MD_FLOAT arrays, so they are sized with sizeof(MD_FLOAT).
+    atom->exclusion_filter = allocate(ALIGNMENT, CLUSTER_M * VECTOR_WIDTH * sizeof(MD_UINT));
+    atom->diagonal_4xn_j_minus_i = allocate(ALIGNMENT, MAX(CLUSTER_M, VECTOR_WIDTH) * sizeof(MD_FLOAT));
+    atom->diagonal_2xnn_j_minus_i = allocate(ALIGNMENT, VECTOR_WIDTH * sizeof(MD_FLOAT));
+
+    for(int j = 0; j < MAX(CLUSTER_M, VECTOR_WIDTH); j++) {
+        atom->diagonal_4xn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5;
+    }
+    // Lower half of the table holds j - 0.5, upper half holds (j - 1) - 0.5.
+    for(int j = 0; j < VECTOR_WIDTH / 2; j++) {
+        atom->diagonal_2xnn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5;
+        atom->diagonal_2xnn_j_minus_i[VECTOR_WIDTH / 2 + j] = (MD_FLOAT)(j - 1) - 0.5;
+    }
+    // NOTE(review): 1U << i is undefined once i reaches the width of unsigned int; confirm CLUSTER_M * VECTOR_WIDTH stays below it.
+    for(int i = 0; i < CLUSTER_M * VECTOR_WIDTH; i++) {
+        atom->exclusion_filter[i] = (1U << i);
+    }
+    // Each masks_2xnn entry packs two partial masks; the second one is shifted left by half_mask_bits.
+    #if CLUSTER_M == CLUSTER_N
+    for(unsigned int cond0 = 0; cond0 < 2; cond0++) {
+        mask0 = (unsigned int)(0xf - 0x1 * cond0);
+        mask1 = (unsigned int)(0xf - 0x3 * cond0);
+        mask2 = (unsigned int)(0xf - 0x7 * cond0);
+        mask3 = (unsigned int)(0xf - 0xf * cond0);
+        atom->masks_2xnn[cond0 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
+        atom->masks_2xnn[cond0 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
+    }
+    #else
+    for(unsigned int cond0 = 0; cond0 < 2; cond0++) {
+        for(unsigned int cond1 = 0; cond1 < 2; cond1++) {
+            #if CLUSTER_M < CLUSTER_N
+            mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
+            mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
+            mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
+            mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
+            #else
+            mask0 = (unsigned int)(0x3 - 0x1 * cond0);
+            mask1 = (unsigned int)(0x3 - 0x3 * cond0);
+            mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1);
+            mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1);
+            #endif
+
+            atom->masks_2xnn[cond0 * 4 + cond1 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
+            atom->masks_2xnn[cond0 * 4 + cond1 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
+        }
+    }
+    #endif
+}
+
 void growAtom(Atom *atom) {
     int nold = atom->Nmax;
     atom->Nmax += DELTA;
diff --git a/gromacs/force_lj.c b/gromacs/force_lj.c
index 923d4ef..4070ae1 100644
--- a/gromacs/force_lj.c
+++ b/gromacs/force_lj.c
@@ -165,7 +165,6 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
     MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
     MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
     MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
-    const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
 
     for(int ci = 0; ci < atom->Nclusters_local; ci++) {
         int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
@@ -236,7 +235,6 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
         MD_SIMD_FLOAT fiz2 = simd_zero();
 
         for(int k = 0; k < numneighs; k++) {
-            unsigned int mask0, mask1, mask2, mask3;
             int cj = neighs[k].cj;
             int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
             int imask = neighs[k].imask;
@@ -261,30 +259,23 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
 
             #if CLUSTER_M == CLUSTER_N
             unsigned int cond0 = (unsigned int)(cj == ci_cj0);
-            mask0 = (unsigned int)(0xf - 0x1 * cond0);
-            mask1 = (unsigned int)(0xf - 0x3 * cond0);
-            mask2 = (unsigned int)(0xf - 0x7 * cond0);
-            mask3 = (unsigned int)(0xf - 0xf * cond0);
-            #elif CLUSTER_M < CLUSTER_N
+            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_2xnn[cond0 * 2 + 0]);
+            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_2xnn[cond0 * 2 + 1]);
+            #else
+            #if CLUSTER_M < CLUSTER_N
             unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
             unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
-            mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
-            mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
-            mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
-            mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
             #else
             unsigned int cond0 = (unsigned int)(cj == ci_cj0);
             unsigned int cond1 = (unsigned int)(cj == ci_cj1);
-            mask0 = (unsigned int)(0x3 - 0x1 * cond0);
-            mask1 = (unsigned int)(0x3 - 0x3 * cond0);
-            mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1);
-            mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1);
+            #endif
+            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_2xnn[cond0 * 4 + cond1 * 2 + 0]);
+            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_2xnn[cond0 * 4 + cond1 * 2 + 1]);
             #endif
 
-            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0);
-            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2);
             MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec);
             MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec);
+
             cutoff_mask0 = simd_mask_and(cutoff_mask0, excl_mask0);
             cutoff_mask2 = simd_mask_and(cutoff_mask2, excl_mask2);
 
@@ -308,8 +299,8 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
             MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
             MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
 
-            MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0;
-            MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2;
+            MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
+            MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
 
             MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
             MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
diff --git a/gromacs/includes/atom.h b/gromacs/includes/atom.h
index 92717fd..6bd3893 100644
--- a/gromacs/includes/atom.h
+++ b/gromacs/includes/atom.h
@@ -124,9 +124,11 @@ typedef struct {
     MD_UINT *exclusion_filter;
     MD_FLOAT *diagonal_4xn_j_minus_i;
     MD_FLOAT *diagonal_2xnn_j_minus_i;
+    unsigned int masks_2xnn[8];
 } Atom;
 
 extern void initAtom(Atom*);
+extern void initMasks(Atom*);
 extern void createAtom(Atom*, Parameter*);
 extern int readAtom(Atom*, Parameter*);
 extern int readAtom_pdb(Atom*, Parameter*);