diff --git a/Makefile b/Makefile
index 78289dc..9d3cc90 100644
--- a/Makefile
+++ b/Makefile
@@ -17,9 +17,6 @@ INCLUDES += -I./$(SRC_DIR)/includes
 ifeq ($(strip $(DATA_LAYOUT)),AOS)
 DEFINES += -DAOS
 endif
-ifeq ($(strip $(CLUSTER_LAYOUT)),AOS)
-DEFINES += -DCLUSTER_AOS
-endif
 ifeq ($(strip $(DATA_TYPE)),SP)
 DEFINES += -DPRECISION=1
 else
@@ -30,14 +27,6 @@ ifneq ($(ASM_SYNTAX), ATT)
 ASFLAGS += -masm=intel
 endif
 
-ifneq ($(ATOMS_LOOP_RUNS),)
-    DEFINES += -DATOMS_LOOP_RUNS=$(ATOMS_LOOP_RUNS)
-endif
-
-ifneq ($(NEIGHBORS_LOOP_RUNS),)
-    DEFINES += -DNEIGHBORS_LOOP_RUNS=$(NEIGHBORS_LOOP_RUNS)
-endif
-
 ifeq ($(strip $(EXPLICIT_TYPES)),true)
 DEFINES += -DEXPLICIT_TYPES
 endif
@@ -74,6 +63,10 @@ ifeq ($(strip $(NO_AVX2)),true)
 DEFINES += -DNO_AVX2
 endif
 
+ifeq ($(strip $(AVX512)),true)
+    DEFINES += -DAVX512
+endif
+
 VPATH = $(SRC_DIR) $(ASM_DIR)
 ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
 OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s))
diff --git a/config.mk b/config.mk
index de2bf9d..9afe7a0 100644
--- a/config.mk
+++ b/config.mk
@@ -7,7 +7,7 @@ OPT_SCHEME ?= gromacs
 # Enable likwid (true or false)
 ENABLE_LIKWID ?= true
 # SP or DP
-DATA_TYPE ?= DP
+DATA_TYPE ?= SP
 # AOS or SOA
 DATA_LAYOUT ?= AOS
 # Assembly syntax to generate (ATT/INTEL)
@@ -15,10 +15,6 @@ ASM_SYNTAX ?= ATT
 # Debug
 DEBUG ?= false
 
-# Number of times to run the atoms loop on stubbed variant
-ATOMS_LOOP_RUNS ?= 1
-# Number of times to run the neighbors loop on stubbed variant
-NEIGHBORS_LOOP_RUNS ?= 1
 # Explicitly store and load atom types (true or false)
 EXPLICIT_TYPES ?= false
 # Trace memory addresses for cache simulator (true or false)
@@ -29,8 +25,6 @@ INDEX_TRACER ?= false
 COMPUTE_STATS ?= true
 
 # Configurations for gromacs optimization scheme
-# AOS or SOA
-CLUSTER_LAYOUT ?= SOA
 # Use reference version
 USE_REFERENCE_VERSION ?= false
 # Enable XTC output
diff --git a/gromacs/force_lj.c b/gromacs/force_lj.c
index 7b6d37f..41749ad 100644
--- a/gromacs/force_lj.c
+++ b/gromacs/force_lj.c
@@ -128,7 +128,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
 }
 
 double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
-    DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
+    DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
     int Nlocal = atom->Nlocal;
     int* neighs;
     MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
@@ -136,10 +136,10 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta
     MD_FLOAT epsilon = param->epsilon;
     MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
     MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
-    MD_SIMD_FLOAT epsilon_vec = simd_broadcast(epsilon);
+    MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
     MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
     MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
-    const int unroll_j = VECTOR_WIDTH / CLUSTER_N;
+    const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
 
     double S = getTimeStamp();
     LIKWID_MARKER_START("force");
@@ -152,131 +152,86 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta
         neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
         int numneighs = neighbor->numneigh[ci];
 
-        MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]);
-        MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]);
-        MD_SIMD_FLOAT xi2_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 2]);
-        MD_SIMD_FLOAT xi3_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 3]);
-        MD_SIMD_FLOAT yi0_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 0]);
-        MD_SIMD_FLOAT yi1_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 1]);
-        MD_SIMD_FLOAT yi2_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 2]);
-        MD_SIMD_FLOAT yi3_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 3]);
-        MD_SIMD_FLOAT zi0_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 0]);
-        MD_SIMD_FLOAT zi1_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 1]);
-        MD_SIMD_FLOAT zi2_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 2]);
-        MD_SIMD_FLOAT zi3_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 3]);
+        MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]);
+        MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]);
+        MD_SIMD_FLOAT yi0_tmp = simd_load_h_dual(&ci_x[CL_Y_OFFSET + 0]);
+        MD_SIMD_FLOAT yi2_tmp = simd_load_h_dual(&ci_x[CL_Y_OFFSET + 2]);
+        MD_SIMD_FLOAT zi0_tmp = simd_load_h_dual(&ci_x[CL_Z_OFFSET + 0]);
+        MD_SIMD_FLOAT zi2_tmp = simd_load_h_dual(&ci_x[CL_Z_OFFSET + 2]);
 
         MD_SIMD_FLOAT fix0 = simd_zero();
         MD_SIMD_FLOAT fiy0 = simd_zero();
         MD_SIMD_FLOAT fiz0 = simd_zero();
-        MD_SIMD_FLOAT fix1 = simd_zero();
-        MD_SIMD_FLOAT fiy1 = simd_zero();
-        MD_SIMD_FLOAT fiz1 = simd_zero();
         MD_SIMD_FLOAT fix2 = simd_zero();
         MD_SIMD_FLOAT fiy2 = simd_zero();
         MD_SIMD_FLOAT fiz2 = simd_zero();
-        MD_SIMD_FLOAT fix3 = simd_zero();
-        MD_SIMD_FLOAT fiy3 = simd_zero();
-        MD_SIMD_FLOAT fiz3 = simd_zero();
 
-        for(int k = 0; k < numneighs; k += unroll_j) {
-            int cj = neighs[k + 0];
+        for(int k = 0; k < numneighs; k += 2) {
+            int cj = neighs[k];
             int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
             MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
-            unsigned int cond0 = (unsigned int)(ci == cj);
-            unsigned int cond1 = cond0;
-            MD_SIMD_FLOAT xj_tmp, yj_tmp, zj_tmp;
-
-            /*
-            MD_FLOAT *cj0_ptr = cluster_pos_ptr(cj0);
-            #if VECTOR_WIDTH == 8
-            int cj1 = neighs[k + 1];
-            unsigned int cond1 = (unsigned int)(ci == cj1);
-            MD_FLOAT *cj1_ptr = cluster_pos_ptr(cj1);
-            MD_SIMD_FLOAT xj_tmp = simd_load2(cj0_ptr, cj1_ptr, 0);
-            MD_SIMD_FLOAT yj_tmp = simd_load2(cj0_ptr, cj1_ptr, 1);
-            MD_SIMD_FLOAT zj_tmp = simd_load2(cj0_ptr, cj1_ptr, 2);
-            #else
-            MD_SIMD_FLOAT xj_tmp = simd_load(cj0_ptr, 0);
-            MD_SIMD_FLOAT yj_tmp = simd_load(cj0_ptr, 1);
-            MD_SIMD_FLOAT zj_tmp = simd_load(cj0_ptr, 2);
-            #endif
-            */
+            unsigned int mask0, mask1, mask2, mask3;
+            MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
+            MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
+            MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
 
             MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
             MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
             MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
-            MD_SIMD_FLOAT delx1 = simd_sub(xi1_tmp, xj_tmp);
-            MD_SIMD_FLOAT dely1 = simd_sub(yi1_tmp, yj_tmp);
-            MD_SIMD_FLOAT delz1 = simd_sub(zi1_tmp, zj_tmp);
             MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
             MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
             MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
-            MD_SIMD_FLOAT delx3 = simd_sub(xi3_tmp, xj_tmp);
-            MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp);
-            MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp);
 
-            #if VECTOR_WIDTH == 8
-            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1));
-            MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1));
-            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1));
-            MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1));
+            #if CLUSTER_M == CLUSTER_N
+            unsigned int cond0 = (unsigned int)(cj == ci_cj0);
+            mask0 = (unsigned int)(0xf - 0x1 * cond0);
+            mask1 = (unsigned int)(0xf - 0x2 * cond0);
+            mask2 = (unsigned int)(0xf - 0x4 * cond0);
+            mask3 = (unsigned int)(0xf - 0x8 * cond0);
+            #elif CLUSTER_M < CLUSTER_N
+            unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
+            unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
+            mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
+            mask1 = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
+            mask2 = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
+            mask3 = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
             #else
-            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
-            MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0));
-            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0));
-            MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0));
+            unsigned int cond0 = (unsigned int)(cj == ci_cj0);
+            unsigned int cond1 = (unsigned int)(cj == ci_cj1);
+            mask0 = (unsigned int)(0x3 - 0x1 * cond0);
+            mask1 = (unsigned int)(0x3 - 0x2 * cond0);
+            mask2 = (unsigned int)(0x3 - 0x1 * cond1);
+            mask3 = (unsigned int)(0x3 - 0x2 * cond1);
             #endif
 
+            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0);
+            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2);
+
             MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
-            MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, simd_mul(delz1, delz1)));
             MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
-            MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, simd_mul(delz3, delz3)));
 
             MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
-            MD_SIMD_MASK cutoff_mask1 = simd_mask_and(excl_mask1, simd_mask_cond_lt(rsq1, cutforcesq_vec));
             MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
-            MD_SIMD_MASK cutoff_mask3 = simd_mask_and(excl_mask3, simd_mask_cond_lt(rsq3, cutforcesq_vec));
 
             MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
-            MD_SIMD_FLOAT sr2_1 = simd_reciprocal(rsq1);
             MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
-            MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
 
             MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
-            MD_SIMD_FLOAT sr6_1 = simd_mul(sr2_1, simd_mul(sr2_1, simd_mul(sr2_1, sigma6_vec)));
             MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
-            MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec)));
 
-            MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, epsilon_vec))));
-            MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, epsilon_vec))));
-            MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, epsilon_vec))));
-            MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, epsilon_vec))));
+            MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
+            MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
 
             fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
             fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
             fiz0 = simd_masked_add(fiz0, simd_mul(delz0, force0), cutoff_mask0);
-            fix1 = simd_masked_add(fix1, simd_mul(delx1, force1), cutoff_mask1);
-            fiy1 = simd_masked_add(fiy1, simd_mul(dely1, force1), cutoff_mask1);
-            fiz1 = simd_masked_add(fiz1, simd_mul(delz1, force1), cutoff_mask1);
             fix2 = simd_masked_add(fix2, simd_mul(delx2, force2), cutoff_mask2);
             fiy2 = simd_masked_add(fiy2, simd_mul(dely2, force2), cutoff_mask2);
             fiz2 = simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2);
-            fix3 = simd_masked_add(fix3, simd_mul(delx3, force3), cutoff_mask3);
-            fiy3 = simd_masked_add(fiy3, simd_mul(dely3, force3), cutoff_mask3);
-            fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3);
         }
 
-        ci_f[CL_X_OFFSET + 0] = simd_horizontal_sum(fix0);
-        ci_f[CL_X_OFFSET + 1] = simd_horizontal_sum(fix1);
-        ci_f[CL_X_OFFSET + 2] = simd_horizontal_sum(fix2);
-        ci_f[CL_X_OFFSET + 3] = simd_horizontal_sum(fix3);
-        ci_f[CL_Y_OFFSET + 0] = simd_horizontal_sum(fiy0);
-        ci_f[CL_Y_OFFSET + 1] = simd_horizontal_sum(fiy1);
-        ci_f[CL_Y_OFFSET + 2] = simd_horizontal_sum(fiy2);
-        ci_f[CL_Y_OFFSET + 3] = simd_horizontal_sum(fiy3);
-        ci_f[CL_Z_OFFSET + 0] = simd_horizontal_sum(fiz0);
-        ci_f[CL_Z_OFFSET + 1] = simd_horizontal_sum(fiz1);
-        ci_f[CL_Z_OFFSET + 2] = simd_horizontal_sum(fiz2);
-        ci_f[CL_Z_OFFSET + 3] = simd_horizontal_sum(fiz3);
+        simd_h_dual_reduce_sum(&ci_f[CL_X_OFFSET], fix0, fix2);
+        simd_h_dual_reduce_sum(&ci_f[CL_Y_OFFSET], fiy0, fiy2);
+        simd_h_dual_reduce_sum(&ci_f[CL_Z_OFFSET], fiz0, fiz2);
 
         addStat(stats->calculated_forces, 1);
         addStat(stats->num_neighs, numneighs);
@@ -285,7 +240,7 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta
 
     LIKWID_MARKER_STOP("force");
     double E = getTimeStamp();
-    DEBUG_MESSAGE("computeForceLJ_4xn end\n");
+    DEBUG_MESSAGE("computeForceLJ_2xnn end\n");
     return E-S;
 }
 
@@ -298,7 +253,7 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
     MD_FLOAT epsilon = param->epsilon;
     MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
     MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
-    MD_SIMD_FLOAT epsilon_vec = simd_broadcast(epsilon);
+    MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
     MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
     MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
     double S = getTimeStamp();
@@ -348,7 +303,6 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
             MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
             MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
             MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
-
             MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
             MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
             MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
@@ -363,14 +317,14 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
             MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp);
 
             #if CLUSTER_M == CLUSTER_N
-            int cond0 = (unsigned int)(cj == ci_cj0);
+            unsigned int cond0 = (unsigned int)(cj == ci_cj0);
             MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
             MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0));
             MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0));
             MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0));
             #elif CLUSTER_M < CLUSTER_N
-            int cond0 = (unsigned int)((cj << 1) + 0 == ci);
-            int cond1 = (unsigned int)((cj << 1) + 1 == ci);
+            unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
+            unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
             MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1));
             MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1));
             MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1));
@@ -404,10 +358,10 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
             MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
             MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec)));
 
-            MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, epsilon_vec))));
-            MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, epsilon_vec))));
-            MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, epsilon_vec))));
-            MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, epsilon_vec))));
+            MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
+            MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, eps_vec))));
+            MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
+            MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, eps_vec))));
 
             fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
             fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
@@ -423,18 +377,18 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
             fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3);
         }
 
-        ci_f[CL_X_OFFSET + 0] = simd_horizontal_sum(fix0);
-        ci_f[CL_X_OFFSET + 1] = simd_horizontal_sum(fix1);
-        ci_f[CL_X_OFFSET + 2] = simd_horizontal_sum(fix2);
-        ci_f[CL_X_OFFSET + 3] = simd_horizontal_sum(fix3);
-        ci_f[CL_Y_OFFSET + 0] = simd_horizontal_sum(fiy0);
-        ci_f[CL_Y_OFFSET + 1] = simd_horizontal_sum(fiy1);
-        ci_f[CL_Y_OFFSET + 2] = simd_horizontal_sum(fiy2);
-        ci_f[CL_Y_OFFSET + 3] = simd_horizontal_sum(fiy3);
-        ci_f[CL_Z_OFFSET + 0] = simd_horizontal_sum(fiz0);
-        ci_f[CL_Z_OFFSET + 1] = simd_horizontal_sum(fiz1);
-        ci_f[CL_Z_OFFSET + 2] = simd_horizontal_sum(fiz2);
-        ci_f[CL_Z_OFFSET + 3] = simd_horizontal_sum(fiz3);
+        ci_f[CL_X_OFFSET + 0] = simd_h_reduce_sum(fix0);
+        ci_f[CL_X_OFFSET + 1] = simd_h_reduce_sum(fix1);
+        ci_f[CL_X_OFFSET + 2] = simd_h_reduce_sum(fix2);
+        ci_f[CL_X_OFFSET + 3] = simd_h_reduce_sum(fix3);
+        ci_f[CL_Y_OFFSET + 0] = simd_h_reduce_sum(fiy0);
+        ci_f[CL_Y_OFFSET + 1] = simd_h_reduce_sum(fiy1);
+        ci_f[CL_Y_OFFSET + 2] = simd_h_reduce_sum(fiy2);
+        ci_f[CL_Y_OFFSET + 3] = simd_h_reduce_sum(fiy3);
+        ci_f[CL_Z_OFFSET + 0] = simd_h_reduce_sum(fiz0);
+        ci_f[CL_Z_OFFSET + 1] = simd_h_reduce_sum(fiz1);
+        ci_f[CL_Z_OFFSET + 2] = simd_h_reduce_sum(fiz2);
+        ci_f[CL_Z_OFFSET + 3] = simd_h_reduce_sum(fiz3);
 
         addStat(stats->calculated_forces, 1);
         addStat(stats->num_neighs, numneighs);
diff --git a/gromacs/includes/simd.h b/gromacs/includes/simd.h
index 1d6a90b..59a044a 100644
--- a/gromacs/includes/simd.h
+++ b/gromacs/includes/simd.h
@@ -21,6 +21,7 @@ * =======================================================================================
  */
 
+#include
 #include
 #include
 #include
@@ -28,7 +29,9 @@
 #define SIMD_PRINT_REAL(a) simd_print_real(#a, a);
 #define SIMD_PRINT_MASK(a) simd_print_mask(#a, a);
 
-#if VECTOR_WIDTH == 8 // AVX512
+#ifdef AVX512
+
+#if PRECISION == 2 // Double precision
 
 #define MD_SIMD_FLOAT __m512d
 #define MD_SIMD_MASK __mmask8
@@ -47,15 +50,98 @@ static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_m
 static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
 static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_pd(p); }
 
-static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
+static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
     MD_SIMD_FLOAT x = _mm512_add_pd(a, _mm512_shuffle_f64x2(a, a, 0xee));
     x = _mm512_add_pd(x, _mm512_shuffle_f64x2(x, x, 0x11));
     x = _mm512_add_pd(x, _mm512_permute_pd(x, 0x01));
     return *((MD_FLOAT *) &x);
 }
 
+static inline MD_SIMD_FLOAT simd_load_h_duplicate(const double* m) {
+    return _mm512_broadcast_f64x4(_mm256_load_pd(m));
+}
+
+static inline MD_SIMD_FLOAT simd_load_h_dual(const double* m) {
+    return _mm512_insertf64x4(_mm512_broadcastsd_pd(_mm_load_sd(m)), _mm256_broadcastsd_pd(_mm_load_sd(m + 1)), 1);
+}
+
+static inline double simd_h_dual_reduce_sum(double* m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
+    __m512d t0;
+    __m256d t2, t3;
+
+    t0 = _mm512_add_pd(v0, _mm512_permutex_pd(v0, 0x4e));
+    t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xccul), v1, _mm512_permutex_pd(v1, 0x4e));
+    t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
+    t0 = _mm512_mask_shuffle_f64x2(t0, simd_mask_from_u32(0xaaul), t0, t0, 0xee);
+    t2 = _mm512_castpd512_pd256(t0);
+    t3 = _mm256_load_pd(m);
+    t3 = _mm256_add_pd(t3, t2);
+    _mm256_store_pd(m, t3);
+
+    t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0x4e));
+    t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
+    return _mm_cvtsd_f64(_mm512_castpd512_pd128(t0));
+}
+
+#else // Single-precision
+
+#define MD_SIMD_FLOAT __m512
+#define MD_SIMD_MASK __mmask16
+
+static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); }
+static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_ps(0.0f); }
+static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_sub_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_mul_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm512_fmadd_ps(a, b, c); }
+static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm512_rcp14_ps(a); }
+static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm512_mask_add_ps(a, m, a, b); }
+static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask16(a, b); }
+static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ); }
+static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask16(a); }
+static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask16_u32(a); }
+static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_ps(p); }
+static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
+    // This would only be called in a Mx16 configuration, which is not valid in GROMACS
+    fprintf(stderr, "simd_h_reduce_sum(): Called with AVX512 intrinsics and single-precision which is not valid!\n");
+    exit(-1);
+    return 0.0;
+}
+
+static inline MD_SIMD_FLOAT simd_load_h_duplicate(const float* m) {
+    return _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_load_pd((const double *)(m))));
+}
+
+static inline MD_SIMD_FLOAT simd_load_h_dual(const float* m) {
+    return _mm512_shuffle_f32x4(_mm512_broadcastss_ps(_mm_load_ss(m)), _mm512_broadcastss_ps(_mm_load_ss(m + 1)), 0x44);
+}
+
+static inline MD_FLOAT simd_h_dual_reduce_sum(float* m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
+    __m512 t0, t1;
+    __m128 t2, t3;
+
+    t0 = _mm512_shuffle_f32x4(v0, v1, 0x88);
+    t1 = _mm512_shuffle_f32x4(v0, v1, 0xdd);
+    t0 = _mm512_add_ps(t0, t1);
+    t0 = _mm512_add_ps(t0, _mm512_permute_ps(t0, 0x4e));
+    t0 = _mm512_add_ps(t0, _mm512_permute_ps(t0, 0xb1));
+    t0 = _mm512_maskz_compress_ps(simd_mask_from_u32(0x1111ul), t0);
+    t3 = _mm512_castps512_ps128(t0);
+    t2 = _mm_load_ps(m);
+    t2 = _mm_add_ps(t2, t3);
+    _mm_store_ps(m, t2);
+
+    t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0x4e));
+    t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0xb1));
+    return _mm_cvtss_f32(t3);
+}
+
+#endif // PRECISION
+
 #else // AVX or AVX2
 
+#if PRECISION == 2 // Double precision
+
 #define MD_SIMD_FLOAT __m256d
 
 #ifdef NO_AVX2
@@ -72,6 +158,7 @@ static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return
 static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_pd(p); }
 
 #ifdef NO_AVX2
+
 static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(a))); }
 static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return simd_add(simd_mul(a, b), c); }
 static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return simd_add(a, _mm256_and_pd(b, m)); }
@@ -85,7 +172,7 @@ static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) {
 }
 // TODO: Implement this, althrough it is just required for debugging
 static inline int simd_mask_to_u32(MD_SIMD_MASK a) { return 0; }
-static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
+static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
     __m128d a0, a1;
     a = _mm256_add_pd(a, _mm256_permute_pd(a, 0b0101));
     a0 = _mm256_castpd256_pd128(a);
@@ -93,7 +180,9 @@ static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
     a0 = _mm_add_sd(a0, a1);
     return *((MD_FLOAT *) &a0);
 }
-#else
+
+#else // AVX2
+
 static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_pd(a); }
 static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_pd(a, b, c); }
 static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_pd(a, m, a, b); }
@@ -101,7 +190,7 @@ static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) {
 static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
 static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
 static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
-static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
+static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
     __m128d a0, a1;
     // test with shuffle & add as an alternative to hadd later
     a = _mm256_hadd_pd(a, a);
@@ -110,11 +199,53 @@ static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
     a0 = _mm_add_sd(a0, a1);
     return *((MD_FLOAT *) &a0);
 }
-#endif
-
 #endif
 
+#else // Single-precision
+
+#define MD_SIMD_FLOAT __m256
+
+#ifdef NO_AVX2
+#define MD_SIMD_MASK __m256
+#else
+#define MD_SIMD_MASK __mmask8
+#endif
+
+static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm256_set1_ps(scalar); }
+static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_ps(0.0); }
+static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_ps(p); }
+
+#ifdef NO_AVX2
+
+#error "AVX intrinsics with single-precision not implemented!"
+
+#else // AVX2
+
+static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_ps(a); }
+static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_ps(a, b, c); }
+static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_ps(a, m, a, b); }
+static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_ps_mask(a, b, _CMP_LT_OQ); }
+static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
+static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
+static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
+static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
+    __m128 t0;
+    t0 = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 0x1));
+    t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
+    t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
+    return *((MD_FLOAT *) &t0);
+}
+
+#endif // NO_AVX2
+
+#endif // PRECISION
+
+#endif // AVX or AVX2
+
 static inline void simd_print_real(const char *ref, MD_SIMD_FLOAT a) {
     double x[VECTOR_WIDTH];
     memcpy(x, &a, sizeof(x));
diff --git a/include_ISA.mk b/include_ISA.mk
index 215f626..fa9b90b 100644
--- a/include_ISA.mk
+++ b/include_ISA.mk
@@ -1,15 +1,18 @@
 ifeq ($(strip $(ISA)), SSE)
-VECTOR_WIDTH=2
+_VECTOR_WIDTH=2
 else ifeq ($(strip $(ISA)), AVX)
 # Vector width is 4 but AVX2 instruction set is not supported
 NO_AVX2=true
-VECTOR_WIDTH=4
+_VECTOR_WIDTH=4
 else ifeq ($(strip $(ISA)), AVX2)
-VECTOR_WIDTH=4
+_VECTOR_WIDTH=4
 else ifeq ($(strip $(ISA)), AVX512)
-VECTOR_WIDTH=8
+AVX512=true
+_VECTOR_WIDTH=8
 endif
 
 ifeq ($(strip $(DATA_TYPE)), SP)
-VECTOR_WIDTH=$((VECTOR_WIDTH * 2))
+VECTOR_WIDTH=$(shell echo $$(( $(_VECTOR_WIDTH) * 2 )))
+else
+VECTOR_WIDTH=$(_VECTOR_WIDTH)
 endif
diff --git a/lammps/main-stub.c b/lammps/main-stub.c
index 4da90db..c7596e7 100644
--- a/lammps/main-stub.c
+++ b/lammps/main-stub.c
@@ -217,8 +217,6 @@ int main(int argc, const char *argv[]) {
     if(!csv) {
         printf("Number of timesteps: %d\n", param.ntimes);
-        printf("Number of times to compute the atoms loop: %d\n", ATOMS_LOOP_RUNS);
-        printf("Number of times to compute the neighbors loop: %d\n", NEIGHBORS_LOOP_RUNS);
         printf("System size (unit cells): %dx%dx%d\n", param.nx, param.ny, param.nz);
         printf("Atoms per unit cell: %d\n", atoms_per_unit_cell);
         printf("Total number of atoms: %d\n", atom->Nlocal);
@@ -257,9 +255,8 @@ int main(int argc, const char *argv[]) {
     E = getTimeStamp();
     double T_accum = E-S;
     double freq_hz = param.proc_freq * 1.e9;
-    const double repeats = ATOMS_LOOP_RUNS * NEIGHBORS_LOOP_RUNS;
-    const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes * repeats);
-    const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes * repeats) * freq_hz;
+    const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes);
+    const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes) * freq_hz;
     const double cycles_per_neigh = cycles_per_atom / (double)(atoms_per_unit_cell - 1);
 
     if(!csv) {
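
Note on the new half-SIMD helpers: in the reworked computeForceLJ_2xnn() kernel each AVX-512 register is treated as two independent halves, so xi0_tmp/xi2_tmp hold the coordinates of i-atoms 0/1 and 2/3 (one per half), xj_tmp holds the four j-cluster coordinates duplicated into both halves, and the two 4-bit exclusion masks of the paired i-atoms are packed into one register mask via (mask1 << half_mask_bits) | mask0. The sketch below gives plain-C reference semantics for the double-precision AVX-512 helpers added to simd.h (VECTOR_WIDTH 8, 4-element halves). It is an illustration of what the intrinsic sequences compute, not code from the patch; the ref_* names and the self-check in main() are made up for this example.

#include <stdio.h>

#define VECTOR_WIDTH 8   /* doubles per AVX-512 register */
#define HALF         4   /* elements per register half */

/* simd_load_h_duplicate(m): load HALF values and duplicate them into both halves. */
static void ref_load_h_duplicate(const double *m, double out[VECTOR_WIDTH]) {
    for(int i = 0; i < HALF; i++) { out[i] = m[i]; out[i + HALF] = m[i]; }
}

/* simd_load_h_dual(m): broadcast m[0] into the lower half and m[1] into the upper half. */
static void ref_load_h_dual(const double *m, double out[VECTOR_WIDTH]) {
    for(int i = 0; i < HALF; i++) { out[i] = m[0]; out[i + HALF] = m[1]; }
}

/* simd_h_dual_reduce_sum(m, v0, v1): add the four per-half sums (lower/upper half of v0,
 * lower/upper half of v1) into m[0..3] and return the total sum of both vectors. */
static double ref_h_dual_reduce_sum(double *m, const double *v0, const double *v1) {
    double half_sum[4] = {0.0, 0.0, 0.0, 0.0};
    for(int i = 0; i < HALF; i++) {
        half_sum[0] += v0[i];        /* lower half of v0 -> i-atom 0 */
        half_sum[1] += v0[i + HALF]; /* upper half of v0 -> i-atom 1 */
        half_sum[2] += v1[i];        /* lower half of v1 -> i-atom 2 */
        half_sum[3] += v1[i + HALF]; /* upper half of v1 -> i-atom 3 */
    }
    double total = 0.0;
    for(int i = 0; i < 4; i++) { m[i] += half_sum[i]; total += half_sum[i]; }
    return total;
}

int main(void) {
    /* Mimic the force reduction: fix0 carries forces on i-atoms 0/1, fix2 on i-atoms 2/3. */
    double fix0[VECTOR_WIDTH] = {1, 2, 3, 4, 10, 20, 30, 40};
    double fix2[VECTOR_WIDTH] = {5, 6, 7, 8, 50, 60, 70, 80};
    double ci_fx[4] = {0.0, 0.0, 0.0, 0.0};
    double total = ref_h_dual_reduce_sum(ci_fx, fix0, fix2);
    printf("fx per i-atom: %g %g %g %g (total %g)\n", ci_fx[0], ci_fx[1], ci_fx[2], ci_fx[3], total);
    return 0;
}

Compared with computeForceLJ_4xn(), which keeps one i-atom per register and therefore carries four delta/force/accumulator registers per dimension, the 2xnn variant carries two, at the cost of the slightly more involved dual reduction sketched above. The include_ISA.mk change exists because make has no built-in arithmetic: the old VECTOR_WIDTH=$((VECTOR_WIDTH * 2)) line used shell syntax that make never evaluates, so the width was not actually doubled, whereas the new $(shell echo $$(( $(_VECTOR_WIDTH) * 2 ))) defers the arithmetic to a shell and yields the doubled width for single-precision builds.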