Merge pull request #4 from RRZE-HPC/gromacs_sp

Gromacs sp
This commit is contained in:
rafaelravedutti 2022-03-15 20:31:42 +01:00 committed by GitHub
commit 459853dc25
16 changed files with 909 additions and 490 deletions

View File

@@ -10,15 +10,13 @@ Q ?= @
include $(MAKE_DIR)/config.mk
include $(MAKE_DIR)/include_$(TAG).mk
include $(MAKE_DIR)/include_LIKWID.mk
include $(MAKE_DIR)/include_ISA.mk
include $(MAKE_DIR)/include_GROMACS.mk
INCLUDES += -I./$(SRC_DIR)/includes
ifeq ($(strip $(DATA_LAYOUT)),AOS)
DEFINES += -DAOS
endif
ifeq ($(strip $(CLUSTER_LAYOUT)),AOS)
DEFINES += -DCLUSTER_AOS
endif
ifeq ($(strip $(DATA_TYPE)),SP)
DEFINES += -DPRECISION=1
else
@@ -29,14 +27,6 @@ ifneq ($(ASM_SYNTAX), ATT)
ASFLAGS += -masm=intel
endif
ifneq ($(ATOMS_LOOP_RUNS),)
DEFINES += -DATOMS_LOOP_RUNS=$(ATOMS_LOOP_RUNS)
endif
ifneq ($(NEIGHBORS_LOOP_RUNS),)
DEFINES += -DNEIGHBORS_LOOP_RUNS=$(NEIGHBORS_LOOP_RUNS)
endif
ifeq ($(strip $(EXPLICIT_TYPES)),true)
DEFINES += -DEXPLICIT_TYPES
endif
@@ -73,6 +63,10 @@ ifeq ($(strip $(NO_AVX2)),true)
DEFINES += -DNO_AVX2
endif
ifeq ($(strip $(AVX512)),true)
DEFINES += -DAVX512
endif
VPATH = $(SRC_DIR) $(ASM_DIR)
ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s))
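
The single-precision switch travels from the build config to the sources as a preprocessor flag: DATA_TYPE=SP becomes -DPRECISION=1 on the compile line, DATA_TYPE=DP becomes -DPRECISION=2. A minimal sketch of how such a flag is typically consumed on the C side (the header that actually defines MD_FLOAT is not part of this diff, so this is an assumption):

/* Sketch (assumed, not shown in this diff): PRECISION selects the scalar type. */
#if PRECISION == 1
typedef float MD_FLOAT;   /* DATA_TYPE=SP */
#else
typedef double MD_FLOAT;  /* DATA_TYPE=DP */
#endif

This is why the SIMD header further down can branch on PRECISION == 2 instead of re-checking DATA_TYPE.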

View File

@@ -1,11 +1,13 @@
# Compiler tag (GCC/CLANG/ICC)
TAG ?= ICC
# Instruction set (SSE/AVX/AVX2/AVX512)
ISA ?= AVX512
# Optimization scheme (lammps/gromacs/clusters_per_bin)
OPT_SCHEME ?= gromacs
# Enable likwid (true or false)
ENABLE_LIKWID ?= true
# SP or DP
DATA_TYPE ?= DP
DATA_TYPE ?= SP
# AOS or SOA
DATA_LAYOUT ?= AOS
# Assembly syntax to generate (ATT/INTEL)
@@ -13,26 +15,16 @@ ASM_SYNTAX ?= ATT
# Debug
DEBUG ?= false
# Number of times to run the atoms loop on stubbed variant
ATOMS_LOOP_RUNS ?= 1
# Number of times to run the neighbors loop on stubbed variant
NEIGHBORS_LOOP_RUNS ?= 1
# Explicitly store and load atom types (true or false)
EXPLICIT_TYPES ?= false
# Trace memory addresses for cache simulator (true or false)
MEM_TRACER ?= false
# Trace indexes and distances for gather-md (true or false)
INDEX_TRACER ?= false
# Vector width (elements) for index and distance tracer
VECTOR_WIDTH ?= 8
# When vector width is 4 but AVX2 is not supported (AVX only), set this to true
NO_AVX2 ?= false
# Compute statistics
COMPUTE_STATS ?= true
# Configurations for gromacs optimization scheme
# AOS or SOA
CLUSTER_LAYOUT ?= SOA
# Use reference version
USE_REFERENCE_VERSION ?= false
# Enable XTC output

View File

@@ -37,6 +37,7 @@ void initAtom(Atom *atom) {
atom->cl_x = NULL;
atom->cl_v = NULL;
atom->cl_f = NULL;
atom->cl_type = NULL;
atom->Natoms = 0;
atom->Nlocal = 0;
atom->Nghost = 0;
@@ -51,7 +52,9 @@ void initAtom(Atom *atom) {
atom->sigma6 = NULL;
atom->cutforcesq = NULL;
atom->cutneighsq = NULL;
atom->clusters = NULL;
atom->iclusters = NULL;
atom->jclusters = NULL;
atom->icluster_bin = NULL;
}
void createAtom(Atom *atom, Parameter *param) {
@@ -426,9 +429,13 @@ void growAtom(Atom *atom) {
void growClusters(Atom *atom) {
int nold = atom->Nclusters_max;
int jterm = MAX(1, CLUSTER_M / CLUSTER_N); // If M>N, we need to allocate more j-clusters
atom->Nclusters_max += DELTA;
atom->clusters = (Cluster*) reallocate(atom->clusters, ALIGNMENT, atom->Nclusters_max * sizeof(Cluster), nold * sizeof(Cluster));
atom->cl_x = (MD_FLOAT*) reallocate(atom->cl_x, ALIGNMENT, atom->Nclusters_max * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT));
atom->cl_f = (MD_FLOAT*) reallocate(atom->cl_f, ALIGNMENT, atom->Nclusters_max * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT));
atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT));
atom->iclusters = (Cluster*) reallocate(atom->iclusters, ALIGNMENT, atom->Nclusters_max * sizeof(Cluster), nold * sizeof(Cluster));
atom->jclusters = (Cluster*) reallocate(atom->jclusters, ALIGNMENT, atom->Nclusters_max * jterm * sizeof(Cluster), nold * jterm * sizeof(Cluster));
atom->icluster_bin = (int*) reallocate(atom->icluster_bin, ALIGNMENT, atom->Nclusters_max * sizeof(int), nold * sizeof(int));
atom->cl_x = (MD_FLOAT*) reallocate(atom->cl_x, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
atom->cl_f = (MD_FLOAT*) reallocate(atom->cl_f, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
atom->cl_type = (int*) reallocate(atom->cl_type, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * sizeof(int), nold * CLUSTER_M * sizeof(int));
}
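
A quick worked example of the jterm factor above, with CLUSTER_M = 4 as defined in this commit:

/* jterm = MAX(1, CLUSTER_M / CLUSTER_N) sizes the j-cluster array:
 * CLUSTER_N = 2: jterm = MAX(1, 4/2) = 2 (each i-cluster splits into two j-clusters)
 * CLUSTER_N = 4: jterm = MAX(1, 4/4) = 1 (i- and j-clusters coincide)
 * CLUSTER_N = 8: jterm = MAX(1, 4/8) = 1 (two i-clusters share one j-cluster)
 */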

View File

@@ -41,11 +41,12 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
MD_FLOAT epsilon = param->epsilon;
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
MD_FLOAT *fptr = cluster_force_ptr(ci);
for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) {
cluster_x(fptr, cii) = 0.0;
cluster_y(fptr, cii) = 0.0;
cluster_z(fptr, cii) = 0.0;
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
ci_f[CL_X_OFFSET + cii] = 0.0;
ci_f[CL_Y_OFFSET + cii] = 0.0;
ci_f[CL_Z_OFFSET + cii] = 0.0;
}
}
@@ -54,28 +55,39 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
#pragma omp parallel for
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
MD_FLOAT *ciptr = cluster_pos_ptr(ci);
MD_FLOAT *cifptr = cluster_force_ptr(ci);
int ci_cj0 = CJ0_FROM_CI(ci);
int ci_cj1 = CJ1_FROM_CI(ci);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
int numneighs = neighbor->numneigh[ci];
for(int k = 0; k < numneighs; k++) {
int cj = neighs[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int any = 0;
MD_FLOAT *cjptr = cluster_pos_ptr(cj);
for(int cii = 0; cii < CLUSTER_DIM_M; cii++) {
MD_FLOAT xtmp = cluster_x(ciptr, cii);
MD_FLOAT ytmp = cluster_y(ciptr, cii);
MD_FLOAT ztmp = cluster_z(ciptr, cii);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
for(int cii = 0; cii < CLUSTER_M; cii++) {
MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii];
MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii];
MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii];
MD_FLOAT fix = 0;
MD_FLOAT fiy = 0;
MD_FLOAT fiz = 0;
for(int cjj = 0; cjj < CLUSTER_DIM_M; cjj++) {
if(ci != cj || cii != cjj) {
MD_FLOAT delx = xtmp - cluster_x(cjptr, cjj);
MD_FLOAT dely = ytmp - cluster_y(cjptr, cjj);
MD_FLOAT delz = ztmp - cluster_z(cjptr, cjj);
for(int cjj = 0; cjj < CLUSTER_N; cjj++) {
#if CLUSTER_M == CLUSTER_N
if(ci_cj0 != cj || cii != cjj) {
#elif CLUSTER_M < CLUSTER_N
if(ci_cj0 != cj || cii + CLUSTER_M * (ci & 0x1) != cjj) {
#else
if((ci_cj0 != cj || cii != cjj) && (ci_cj1 != cj || cii != cjj + CLUSTER_N)) {
#endif
MD_FLOAT delx = xtmp - cj_x[CL_X_OFFSET + cjj];
MD_FLOAT dely = ytmp - cj_x[CL_Y_OFFSET + cjj];
MD_FLOAT delz = ztmp - cj_x[CL_Z_OFFSET + cjj];
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
if(rsq < cutforcesq) {
MD_FLOAT sr2 = 1.0 / rsq;
@@ -98,15 +110,15 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
addStat(stats->clusters_outside_cutoff, 1);
}
cluster_x(cifptr, cii) += fix;
cluster_y(cifptr, cii) += fiy;
cluster_z(cifptr, cii) += fiz;
ci_f[CL_X_OFFSET + cii] += fix;
ci_f[CL_Y_OFFSET + cii] += fiy;
ci_f[CL_Z_OFFSET + cii] += fiz;
}
}
addStat(stats->calculated_forces, 1);
addStat(stats->num_neighs, numneighs);
addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_DIM_M / CLUSTER_DIM_N));
addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
}
LIKWID_MARKER_STOP("force");
@@ -115,6 +127,132 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
return E-S;
}
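
For reference, the per-pair quantity all three kernels compute (with sr2 = 1/r^2 and sr6 = sigma6 * sr2^3, so that force = 48 * eps * sr6 * (sr6 - 0.5) * sr2) is the Lennard-Jones force magnitude divided by r; multiplying by delx/dely/delz therefore yields the force components directly:

$$F_a = \frac{48\,\varepsilon}{r^2}\left[\left(\frac{\sigma}{r}\right)^{12} - \frac{1}{2}\left(\frac{\sigma}{r}\right)^{6}\right]\Delta r_a, \qquad a \in \{x, y, z\}$$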
double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
int Nlocal = atom->Nlocal;
int* neighs;
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
ci_f[CL_X_OFFSET + cii] = 0.0;
ci_f[CL_Y_OFFSET + cii] = 0.0;
ci_f[CL_Z_OFFSET + cii] = 0.0;
}
}
double S = getTimeStamp();
LIKWID_MARKER_START("force");
#pragma omp parallel for
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
int numneighs = neighbor->numneigh[ci];
MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]);
MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]);
MD_SIMD_FLOAT yi0_tmp = simd_load_h_dual(&ci_x[CL_Y_OFFSET + 0]);
MD_SIMD_FLOAT yi2_tmp = simd_load_h_dual(&ci_x[CL_Y_OFFSET + 2]);
MD_SIMD_FLOAT zi0_tmp = simd_load_h_dual(&ci_x[CL_Z_OFFSET + 0]);
MD_SIMD_FLOAT zi2_tmp = simd_load_h_dual(&ci_x[CL_Z_OFFSET + 2]);
MD_SIMD_FLOAT fix0 = simd_zero();
MD_SIMD_FLOAT fiy0 = simd_zero();
MD_SIMD_FLOAT fiz0 = simd_zero();
MD_SIMD_FLOAT fix2 = simd_zero();
MD_SIMD_FLOAT fiy2 = simd_zero();
MD_SIMD_FLOAT fiz2 = simd_zero();
for(int k = 0; k < numneighs; k++) {
int cj = neighs[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
unsigned int mask0, mask1, mask2, mask3;
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
#if CLUSTER_M == CLUSTER_N
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
mask0 = (unsigned int)(0xf - 0x1 * cond0);
mask1 = (unsigned int)(0xf - 0x2 * cond0);
mask2 = (unsigned int)(0xf - 0x4 * cond0);
mask3 = (unsigned int)(0xf - 0x8 * cond0);
#elif CLUSTER_M < CLUSTER_N
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
mask1 = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
mask2 = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
mask3 = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
#else
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
mask0 = (unsigned int)(0x3 - 0x1 * cond0);
mask1 = (unsigned int)(0x3 - 0x2 * cond0);
mask2 = (unsigned int)(0x3 - 0x1 * cond1);
mask3 = (unsigned int)(0x3 - 0x2 * cond1);
#endif
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0);
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2);
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
fiz0 = simd_masked_add(fiz0, simd_mul(delz0, force0), cutoff_mask0);
fix2 = simd_masked_add(fix2, simd_mul(delx2, force2), cutoff_mask2);
fiy2 = simd_masked_add(fiy2, simd_mul(dely2, force2), cutoff_mask2);
fiz2 = simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2);
}
simd_h_dual_reduce_sum(&ci_f[CL_X_OFFSET], fix0, fix2);
simd_h_dual_reduce_sum(&ci_f[CL_Y_OFFSET], fiy0, fiy2);
simd_h_dual_reduce_sum(&ci_f[CL_Z_OFFSET], fiz0, fiz2);
addStat(stats->calculated_forces, 1);
addStat(stats->num_neighs, numneighs);
addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
}
LIKWID_MARKER_STOP("force");
double E = getTimeStamp();
DEBUG_MESSAGE("computeForceLJ_2xnn end\n");
return E-S;
}
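
A scalar sketch of what the excl_mask construction in computeForceLJ_2xnn encodes, for the CLUSTER_M < CLUSTER_N case that this kernel is compiled for in practice (M = 4, N = 8, single-precision AVX512). Lane j of row i must be cleared exactly when j-cluster cj contains i-cluster ci and lane j is atom i itself; the helper below is hypothetical, not part of this diff:

/* Reproduces mask0..mask3 (one 8-lane row each) for M = 4, N = 8. */
unsigned int excl_mask_row(int i, int ci, int cj) {
    unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci); /* ci sits in cj's low half */
    unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci); /* ci sits in cj's high half */
    /* All 8 lanes enabled, minus the self-interaction lane of row i. */
    return 0xff - (0x01u << i) * cond0 - (0x10u << i) * cond1;
}

Because each 16-lane register holds two rows (atoms i and i+1 against the same 8 j-atoms), the kernel packs two of these 8-bit rows per mask: excl_mask0 = (mask1 << half_mask_bits) | mask0.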
double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
@@ -125,33 +263,36 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
MD_FLOAT epsilon = param->epsilon;
MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
MD_SIMD_FLOAT epsilon_vec = simd_broadcast(epsilon);
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
const int unroll_j = CLUSTER_DIM_N / CLUSTER_DIM_M;
double S = getTimeStamp();
LIKWID_MARKER_START("force");
#pragma omp parallel for
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
MD_FLOAT *ciptr = cluster_pos_ptr(ci);
MD_FLOAT *cifptr = cluster_force_ptr(ci);
int ci_cj0 = CJ0_FROM_CI(ci);
#if CLUSTER_M > CLUSTER_N
int ci_cj1 = CJ1_FROM_CI(ci);
#endif
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
int numneighs = neighbor->numneigh[ci];
MD_SIMD_FLOAT xi0_tmp = simd_broadcast(cluster_x(ciptr, 0));
MD_SIMD_FLOAT yi0_tmp = simd_broadcast(cluster_y(ciptr, 0));
MD_SIMD_FLOAT zi0_tmp = simd_broadcast(cluster_z(ciptr, 0));
MD_SIMD_FLOAT xi1_tmp = simd_broadcast(cluster_x(ciptr, 1));
MD_SIMD_FLOAT yi1_tmp = simd_broadcast(cluster_y(ciptr, 1));
MD_SIMD_FLOAT zi1_tmp = simd_broadcast(cluster_z(ciptr, 1));
MD_SIMD_FLOAT xi2_tmp = simd_broadcast(cluster_x(ciptr, 2));
MD_SIMD_FLOAT yi2_tmp = simd_broadcast(cluster_y(ciptr, 2));
MD_SIMD_FLOAT zi2_tmp = simd_broadcast(cluster_z(ciptr, 2));
MD_SIMD_FLOAT xi3_tmp = simd_broadcast(cluster_x(ciptr, 3));
MD_SIMD_FLOAT yi3_tmp = simd_broadcast(cluster_y(ciptr, 3));
MD_SIMD_FLOAT zi3_tmp = simd_broadcast(cluster_z(ciptr, 3));
MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]);
MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]);
MD_SIMD_FLOAT xi2_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 2]);
MD_SIMD_FLOAT xi3_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 3]);
MD_SIMD_FLOAT yi0_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 0]);
MD_SIMD_FLOAT yi1_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 1]);
MD_SIMD_FLOAT yi2_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 2]);
MD_SIMD_FLOAT yi3_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 3]);
MD_SIMD_FLOAT zi0_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 0]);
MD_SIMD_FLOAT zi1_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 1]);
MD_SIMD_FLOAT zi2_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 2]);
MD_SIMD_FLOAT zi3_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 3]);
MD_SIMD_FLOAT fix0 = simd_zero();
MD_SIMD_FLOAT fiy0 = simd_zero();
MD_SIMD_FLOAT fiz0 = simd_zero();
@@ -165,24 +306,13 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
MD_SIMD_FLOAT fiy3 = simd_zero();
MD_SIMD_FLOAT fiz3 = simd_zero();
for(int k = 0; k < numneighs; k += unroll_j) {
int cj0 = neighs[k + 0];
unsigned int cond0 = (unsigned int)(ci == cj0);
MD_FLOAT *cj0_ptr = cluster_pos_ptr(cj0);
#if VECTOR_WIDTH == 8
int cj1 = neighs[k + 1];
unsigned int cond1 = (unsigned int)(ci == cj1);
MD_FLOAT *cj1_ptr = cluster_pos_ptr(cj1);
MD_SIMD_FLOAT xj_tmp = simd_load2(cj0_ptr, cj1_ptr, 0);
MD_SIMD_FLOAT yj_tmp = simd_load2(cj0_ptr, cj1_ptr, 1);
MD_SIMD_FLOAT zj_tmp = simd_load2(cj0_ptr, cj1_ptr, 2);
#else
MD_SIMD_FLOAT xj_tmp = simd_load(cj0_ptr, 0);
MD_SIMD_FLOAT yj_tmp = simd_load(cj0_ptr, 1);
MD_SIMD_FLOAT zj_tmp = simd_load(cj0_ptr, 2);
#endif
for(int k = 0; k < numneighs; k++) {
int cj = neighs[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
@@ -196,16 +326,26 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp);
MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp);
#if VECTOR_WIDTH == 8
#if CLUSTER_M == CLUSTER_N
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0));
#elif CLUSTER_M < CLUSTER_N
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1));
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1));
#else
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0));
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond0));
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond0));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond1));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond1));
#endif
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
@@ -228,10 +368,10 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec)));
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, epsilon_vec))));
MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, epsilon_vec))));
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, epsilon_vec))));
MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, epsilon_vec))));
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, eps_vec))));
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, eps_vec))));
fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
@@ -247,22 +387,22 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3);
}
cluster_x(cifptr, 0) = simd_horizontal_sum(fix0);
cluster_y(cifptr, 0) = simd_horizontal_sum(fiy0);
cluster_z(cifptr, 0) = simd_horizontal_sum(fiz0);
cluster_x(cifptr, 1) = simd_horizontal_sum(fix1);
cluster_y(cifptr, 1) = simd_horizontal_sum(fiy1);
cluster_z(cifptr, 1) = simd_horizontal_sum(fiz1);
cluster_x(cifptr, 2) = simd_horizontal_sum(fix2);
cluster_y(cifptr, 2) = simd_horizontal_sum(fiy2);
cluster_z(cifptr, 2) = simd_horizontal_sum(fiz2);
cluster_x(cifptr, 3) = simd_horizontal_sum(fix3);
cluster_y(cifptr, 3) = simd_horizontal_sum(fiy3);
cluster_z(cifptr, 3) = simd_horizontal_sum(fiz3);
ci_f[CL_X_OFFSET + 0] = simd_h_reduce_sum(fix0);
ci_f[CL_X_OFFSET + 1] = simd_h_reduce_sum(fix1);
ci_f[CL_X_OFFSET + 2] = simd_h_reduce_sum(fix2);
ci_f[CL_X_OFFSET + 3] = simd_h_reduce_sum(fix3);
ci_f[CL_Y_OFFSET + 0] = simd_h_reduce_sum(fiy0);
ci_f[CL_Y_OFFSET + 1] = simd_h_reduce_sum(fiy1);
ci_f[CL_Y_OFFSET + 2] = simd_h_reduce_sum(fiy2);
ci_f[CL_Y_OFFSET + 3] = simd_h_reduce_sum(fiy3);
ci_f[CL_Z_OFFSET + 0] = simd_h_reduce_sum(fiz0);
ci_f[CL_Z_OFFSET + 1] = simd_h_reduce_sum(fiz1);
ci_f[CL_Z_OFFSET + 2] = simd_h_reduce_sum(fiz2);
ci_f[CL_Z_OFFSET + 3] = simd_h_reduce_sum(fiz3);
addStat(stats->calculated_forces, 1);
addStat(stats->num_neighs, numneighs);
addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_DIM_M / CLUSTER_DIM_N));
addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
}
LIKWID_MARKER_STOP("force");

View File

@@ -27,13 +27,71 @@
#define DELTA 20000
#define CLUSTER_DIM_M 4
#define CLUSTER_DIM_N VECTOR_WIDTH
#define CLUSTER_M 4
// Nbnxn layouts (as of GROMACS):
// Simd4xN: M=4, N=VECTOR_WIDTH
// Simd2xNN: M=4, N=(VECTOR_WIDTH/2)
// Simd2xNN (here used for single-precision)
#if VECTOR_WIDTH > CLUSTER_M * 2
# define KERNEL_NAME "Simd2xNN"
# define CLUSTER_N (VECTOR_WIDTH / 2)
# define computeForceLJ computeForceLJ_2xnn
// Simd4xN
#else
# define KERNEL_NAME "Simd4xN"
# define CLUSTER_N VECTOR_WIDTH
# define computeForceLJ computeForceLJ_4xn
#endif
#ifdef USE_REFERENCE_VERSION
# undef KERNEL_NAME
# undef computeForceLJ
# define KERNEL_NAME "Reference"
# define computeForceLJ computeForceLJ_ref
#endif
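
A worked expansion of the kernel selection above, assuming VECTOR_WIDTH matches the native SIMD lane count for each ISA/precision pair (the include_ISA.mk logic that sets it is not shown in this diff):

/* AVX512, SP: VECTOR_WIDTH = 16 > 4*2 -> Simd2xNN, CLUSTER_N = 8 (two rows per register)
 * AVX512, DP: VECTOR_WIDTH = 8        -> Simd4xN,  CLUSTER_N = 8
 * AVX2,   SP: VECTOR_WIDTH = 8        -> Simd4xN,  CLUSTER_N = 8
 * AVX2,   DP: VECTOR_WIDTH = 4        -> Simd4xN,  CLUSTER_N = 4
 */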
#if CLUSTER_M == CLUSTER_N
# define CJ0_FROM_CI(a) (a)
# define CJ1_FROM_CI(a) (a)
# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
#elif CLUSTER_M == CLUSTER_N * 2 // M > N
# define CJ0_FROM_CI(a) ((a) << 1)
# define CJ1_FROM_CI(a) (((a) << 1) | 0x1)
# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_M * (b))
# define CJ_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_M * (b) + ((a) & 0x1) * (CLUSTER_M >> 1))
#elif CLUSTER_M == CLUSTER_N / 2 // M < N
# define CJ0_FROM_CI(a) ((a) >> 1)
# define CJ1_FROM_CI(a) ((a) >> 1)
# define CI_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_N * (b) + ((a) & 0x1) * (CLUSTER_N >> 1))
# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
#else
# error "Invalid cluster configuration!"
#endif
#if CLUSTER_N != 2 && CLUSTER_N != 4 && CLUSTER_N != 8
# error "Cluster N dimension can be only 2, 4 and 8"
#endif
#define CI_SCALAR_BASE_INDEX(a) (CI_BASE_INDEX(a, 1))
#define CI_VECTOR_BASE_INDEX(a) (CI_BASE_INDEX(a, 3))
#define CJ_SCALAR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 1))
#define CJ_VECTOR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 3))
#if CLUSTER_M >= CLUSTER_N
# define CL_X_OFFSET (0 * CLUSTER_M)
# define CL_Y_OFFSET (1 * CLUSTER_M)
# define CL_Z_OFFSET (2 * CLUSTER_M)
#else
# define CL_X_OFFSET (0 * CLUSTER_N)
# define CL_Y_OFFSET (1 * CLUSTER_N)
# define CL_Z_OFFSET (2 * CLUSTER_N)
#endif
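
A worked example of the index math for the CLUSTER_M < CLUSTER_N shape (M = 4, N = 8, the single-precision AVX512 layout): coordinates are stored per j-cluster slab of CLUSTER_N * 3 values, and the odd i-cluster of a pair starts half a row into the same slab. The helper is a hypothetical rendering of CI_VECTOR_BASE_INDEX for that shape:

/* CI_BASE_INDEX(ci, 3) for M < N: ((ci) >> 1) * CLUSTER_N * 3 + ((ci) & 0x1) * (CLUSTER_N >> 1) */
int ci_vector_base(int ci) { return (ci >> 1) * 8 * 3 + (ci & 0x1) * 4; }
/* ci = 4 -> 48, ci = 5 -> 52; atom cii of cluster ci then has
 * x at base + 0*8 + cii, y at base + 1*8 + cii, z at base + 2*8 + cii
 * (CL_X_OFFSET = 0, CL_Y_OFFSET = 8, CL_Z_OFFSET = 16). */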
typedef struct {
int bin;
int natoms;
int type[CLUSTER_DIM_M];
MD_FLOAT bbminx, bbmaxx;
MD_FLOAT bbminy, bbmaxy;
MD_FLOAT bbminz, bbmaxz;
@@ -44,9 +102,6 @@ typedef struct {
int Nclusters, Nclusters_local, Nclusters_ghost, Nclusters_max;
MD_FLOAT *x, *y, *z;
MD_FLOAT *vx, *vy, *vz;
MD_FLOAT *cl_x;
MD_FLOAT *cl_v;
MD_FLOAT *cl_f;
int *border_map;
int *type;
int ntypes;
@@ -54,8 +109,15 @@ typedef struct {
MD_FLOAT *sigma6;
MD_FLOAT *cutforcesq;
MD_FLOAT *cutneighsq;
Cluster *clusters;
int *PBCx, *PBCy, *PBCz;
// Data in cluster format
MD_FLOAT *cl_x;
MD_FLOAT *cl_v;
MD_FLOAT *cl_f;
int *cl_type;
Cluster *iclusters, *jclusters;
int *icluster_bin;
int dummy_cj;
} Atom;
extern void initAtom(Atom*);
@@ -67,10 +129,6 @@ extern int readAtom_dmp(Atom*, Parameter*);
extern void growAtom(Atom*);
extern void growClusters(Atom*);
#define cluster_pos_ptr(ci) &(atom->cl_x[(ci) * CLUSTER_DIM_M * 3])
#define cluster_velocity_ptr(ci) &(atom->cl_v[(ci) * CLUSTER_DIM_M * 3])
#define cluster_force_ptr(ci) &(atom->cl_f[(ci) * CLUSTER_DIM_M * 3])
#ifdef AOS
#define POS_DATA_LAYOUT "AoS"
#define atom_x(i) atom->x[(i) * 3 + 0]
@@ -83,14 +141,4 @@ extern void growClusters(Atom*);
#define atom_z(i) atom->z[i]
#endif
#ifdef CLUSTER_AOS
#define cluster_x(cptr, i) cptr[(i) * 3 + 0]
#define cluster_y(cptr, i) cptr[(i) * 3 + 1]
#define cluster_z(cptr, i) cptr[(i) * 3 + 2]
#else
#define cluster_x(cptr, i) cptr[0 * CLUSTER_DIM_M + (i)]
#define cluster_y(cptr, i) cptr[1 * CLUSTER_DIM_M + (i)]
#define cluster_z(cptr, i) cptr[2 * CLUSTER_DIM_M + (i)]
#endif
#endif

View File

@@ -40,6 +40,7 @@ extern void buildNeighbor(Atom*, Neighbor*);
extern void pruneNeighbor(Parameter*, Atom*, Neighbor*);
extern void sortAtom(Atom*);
extern void buildClusters(Atom*);
extern void defineJClusters(Atom*);
extern void binClusters(Atom*);
extern void updateSingleAtoms(Atom*);
#endif

View File

@@ -21,6 +21,7 @@
* =======================================================================================
*/
#include <stdlib.h>
#include <string.h>
#include <immintrin.h>
#include <zmmintrin.h>
@@ -28,7 +29,9 @@
#define SIMD_PRINT_REAL(a) simd_print_real(#a, a);
#define SIMD_PRINT_MASK(a) simd_print_mask(#a, a);
#if VECTOR_WIDTH == 8 // AVX512
#ifdef AVX512
#if PRECISION == 2 // Double precision
#define MD_SIMD_FLOAT __m512d
#define MD_SIMD_MASK __mmask8
@@ -45,30 +48,100 @@ static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { retur
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ); }
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_pd(p); }
static MD_SIMD_FLOAT simd_load2(MD_FLOAT *c0, MD_FLOAT *c1, int d) {
MD_SIMD_FLOAT x;
#ifdef CLUSTER_AOS
__m256i aos_gather_vindex = _mm256_set_epi32(9, 6, 3, 0, 9, 6, 3, 0);
__m256i vindex = _mm256_add_epi32(aos_gather_vindex, _mm256_set1_epi32(d));
x = _mm512_mask_i32gather_pd(simd_zero(), simd_mask_from_u32(0x0f), vindex, c0, sizeof(double));
x = _mm512_mask_i32gather_pd(x, simd_mask_from_u32(0xf0), vindex, c1, sizeof(double));
#else
x = _mm512_load_pd(&c0[d * CLUSTER_DIM_M]);
x = _mm512_insertf64x4(x, _mm256_load_pd(&c1[d * CLUSTER_DIM_M]), 1);
#endif
return x;
}
static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
MD_SIMD_FLOAT x = _mm512_add_pd(a, _mm512_shuffle_f64x2(a, a, 0xee));
x = _mm512_add_pd(x, _mm512_shuffle_f64x2(x, x, 0x11));
x = _mm512_add_pd(x, _mm512_permute_pd(x, 0x01));
return *((MD_FLOAT *) &x);
}
static inline MD_SIMD_FLOAT simd_load_h_duplicate(const double* m) {
return _mm512_broadcast_f64x4(_mm256_load_pd(m));
}
static inline MD_SIMD_FLOAT simd_load_h_dual(const double* m) {
return _mm512_insertf64x4(_mm512_broadcastsd_pd(_mm_load_sd(m)), _mm256_broadcastsd_pd(_mm_load_sd(m + 1)), 1);
}
static inline double simd_h_dual_reduce_sum(double* m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
__m512d t0;
__m256d t2, t3;
t0 = _mm512_add_pd(v0, _mm512_permutex_pd(v0, 0x4e));
t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xccul), v1, _mm512_permutex_pd(v1, 0x4e));
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
t0 = _mm512_mask_shuffle_f64x2(t0, simd_mask_from_u32(0xaaul), t0, t0, 0xee);
t2 = _mm512_castpd512_pd256(t0);
t3 = _mm256_load_pd(m);
t3 = _mm256_add_pd(t3, t2);
_mm256_store_pd(m, t3);
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0x4e));
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
return _mm_cvtsd_f64(_mm512_castpd512_pd128(t0));
}
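
The half-register helpers are easiest to read through their scalar contract. A sketch for the double-precision shape (8 lanes = two 4-lane rows), written as plain C for illustration only:

/* simd_load_h_dual(m): lanes 0-3 = m[0], lanes 4-7 = m[1].
 * simd_h_dual_reduce_sum(m, v0, v1): the four row sums are accumulated into
 * m[0..3] and the grand total of all 16 lanes is returned. */
double h_dual_reduce_sum_scalar(double *m, const double v0[8], const double v1[8]) {
    double s[4] = {0.0, 0.0, 0.0, 0.0};
    for (int l = 0; l < 4; l++) {
        s[0] += v0[l]; s[1] += v0[4 + l]; /* rows 0 and 1 */
        s[2] += v1[l]; s[3] += v1[4 + l]; /* rows 2 and 3 */
    }
    double total = 0.0;
    for (int r = 0; r < 4; r++) { m[r] += s[r]; total += s[r]; }
    return total;
}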
#else // Single-precision
#define MD_SIMD_FLOAT __m512
#define MD_SIMD_MASK __mmask16
static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); }
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_ps(0.0f); }
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_ps(a, b); }
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_sub_ps(a, b); }
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_mul_ps(a, b); }
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm512_fmadd_ps(a, b, c); }
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm512_rcp14_ps(a); }
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm512_mask_add_ps(a, m, a, b); }
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask16(a, b); }
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ); }
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask16(a); }
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask16_u32(a); }
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_ps(p); }
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
// This would only be called in an Mx16 configuration, which is not valid in GROMACS
fprintf(stderr, "simd_h_reduce_sum(): Called with AVX512 intrinsics and single-precision which is not valid!\n");
exit(-1);
return 0.0;
}
static inline MD_SIMD_FLOAT simd_load_h_duplicate(const float* m) {
return _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_load_pd((const double *)(m))));
}
static inline MD_SIMD_FLOAT simd_load_h_dual(const float* m) {
return _mm512_shuffle_f32x4(_mm512_broadcastss_ps(_mm_load_ss(m)), _mm512_broadcastss_ps(_mm_load_ss(m + 1)), 0x44);
}
static inline MD_FLOAT simd_h_dual_reduce_sum(float* m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
__m512 t0, t1;
__m128 t2, t3;
t0 = _mm512_shuffle_f32x4(v0, v1, 0x88);
t1 = _mm512_shuffle_f32x4(v0, v1, 0xdd);
t0 = _mm512_add_ps(t0, t1);
t0 = _mm512_add_ps(t0, _mm512_permute_ps(t0, 0x4e));
t0 = _mm512_add_ps(t0, _mm512_permute_ps(t0, 0xb1));
t0 = _mm512_maskz_compress_ps(simd_mask_from_u32(0x1111ul), t0);
t3 = _mm512_castps512_ps128(t0);
t2 = _mm_load_ps(m);
t2 = _mm_add_ps(t2, t3);
_mm_store_ps(m, t2);
t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0x4e));
t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0xb1));
return _mm_cvtss_f32(t3);
}
#endif // PRECISION
#else // AVX or AVX2
#if PRECISION == 2 // Double precision
#define MD_SIMD_FLOAT __m256d
#ifdef NO_AVX2
@@ -82,8 +155,10 @@ static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_pd(0.0); }
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_pd(a, b); }
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_pd(a, b); }
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_pd(a, b); }
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_pd(p); }
#ifdef NO_AVX2
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(a))); }
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return simd_add(simd_mul(a, b), c); }
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return simd_add(a, _mm256_and_pd(b, m)); }
@@ -97,7 +172,7 @@ static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) {
}
// TODO: Implement this, although it is only required for debugging
static inline int simd_mask_to_u32(MD_SIMD_MASK a) { return 0; }
static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
__m128d a0, a1;
a = _mm256_add_pd(a, _mm256_permute_pd(a, 0b0101));
a0 = _mm256_castpd256_pd128(a);
@@ -105,7 +180,9 @@ static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
a0 = _mm_add_sd(a0, a1);
return *((MD_FLOAT *) &a0);
}
#else
#else // AVX2
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_pd(a); }
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_pd(a, b, c); }
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_pd(a, m, a, b); }
@@ -113,7 +190,7 @@ static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) {
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
__m128d a0, a1;
// test with shuffle & add as an alternative to hadd later
a = _mm256_hadd_pd(a, a);
@@ -122,24 +199,52 @@ static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
a0 = _mm_add_sd(a0, a1);
return *((MD_FLOAT *) &a0);
}
#endif
static MD_SIMD_FLOAT simd_load(MD_FLOAT *c0, int d) {
MD_SIMD_FLOAT x;
#ifdef CLUSTER_AOS
#ifdef NO_AVX2
#error "Not possible to use AoS cluster layout without AVX2 support!"
#endif
__m128i aos_gather_vindex = _mm_set_epi32(9, 6, 3, 0);
__m128i vindex = _mm_add_epi32(aos_gather_vindex, _mm_set1_epi32(d));
x = _mm256_i32gather_pd(c0, vindex, sizeof(double));
#else
x = _mm256_load_pd(&c0[d * CLUSTER_DIM_M]);
#endif
return x;
}
#else // Single-precision
#define MD_SIMD_FLOAT __m256
#ifdef NO_AVX2
#define MD_SIMD_MASK __m256
#else
#define MD_SIMD_MASK __mmask8
#endif
static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm256_set1_ps(scalar); }
static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_ps(0.0); }
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_ps(a, b); }
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_ps(a, b); }
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_ps(a, b); }
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_ps(p); }
#ifdef NO_AVX2
#error "AVX intrinsincs with single-precision not implemented!"
#else // AVX2
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_ps(a); }
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_ps(a, b, c); }
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_ps(a, m, a, b); }
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_ps_mask(a, b, _CMP_LT_OQ); }
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
__m128 t0;
t0 = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 0x1));
t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
return *((MD_FLOAT *) &t0);
}
#endif
#endif // NO_AVX2
#endif // PRECISION
#endif // AVX or AVX2
static inline void simd_print_real(const char *ref, MD_SIMD_FLOAT a) {
double x[VECTOR_WIDTH];

View File

@@ -217,8 +217,6 @@ int main(int argc, const char *argv[]) {
if(!csv) {
printf("Number of timesteps: %d\n", param.ntimes);
printf("Number of times to compute the atoms loop: %d\n", ATOMS_LOOP_RUNS);
printf("Number of times to compute the neighbors loop: %d\n", NEIGHBORS_LOOP_RUNS);
printf("System size (unit cells): %dx%dx%d\n", param.nx, param.ny, param.nz);
printf("Atoms per unit cell: %d\n", atoms_per_unit_cell);
printf("Total number of atoms: %d\n", atom->Nlocal);
@@ -257,9 +255,8 @@ int main(int argc, const char *argv[]) {
E = getTimeStamp();
double T_accum = E-S;
double freq_hz = param.proc_freq * 1.e9;
const double repeats = ATOMS_LOOP_RUNS * NEIGHBORS_LOOP_RUNS;
const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes * repeats);
const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes * repeats) * freq_hz;
const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes);
const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes) * freq_hz;
const double cycles_per_neigh = cycles_per_atom / (double)(atoms_per_unit_cell - 1);
if(!csv) {

View File

@@ -43,14 +43,9 @@
extern double computeForceLJ_ref(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceLJ_4xn(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceLJ_2xnn(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
#ifdef USE_REFERENCE_VERSION
# define computeForceLJ computeForceLJ_ref
#else
# define computeForceLJ computeForceLJ_4xn
#endif
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
if(param->force_field == FF_EAM) { initEam(eam, param); }
double S, E;
@@ -75,6 +70,7 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *
setupThermo(param, atom->Natoms);
if(param->input_file == NULL) { adjustThermo(param, atom); }
buildClusters(atom);
defineJClusters(atom);
setupPbc(atom, param);
binClusters(atom);
buildNeighbor(atom, neighbor);
@@ -89,6 +85,7 @@ double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
updateSingleAtoms(atom);
updateAtomsPbc(atom, param);
buildClusters(atom);
defineJClusters(atom);
setupPbc(atom, param);
binClusters(atom);
buildNeighbor(atom, neighbor);
@@ -101,17 +98,18 @@ void initialIntegrate(Parameter *param, Atom *atom) {
DEBUG_MESSAGE("initialIntegrate start\n");
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
MD_FLOAT *ciptr = cluster_pos_ptr(ci);
MD_FLOAT *civptr = cluster_velocity_ptr(ci);
MD_FLOAT *cifptr = cluster_force_ptr(ci);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) {
cluster_x(civptr, cii) += param->dtforce * cluster_x(cifptr, cii);
cluster_y(civptr, cii) += param->dtforce * cluster_y(cifptr, cii);
cluster_z(civptr, cii) += param->dtforce * cluster_z(cifptr, cii);
cluster_x(ciptr, cii) += param->dt * cluster_x(civptr, cii);
cluster_y(ciptr, cii) += param->dt * cluster_y(civptr, cii);
cluster_z(ciptr, cii) += param->dt * cluster_z(civptr, cii);
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
ci_v[CL_X_OFFSET + cii] += param->dtforce * ci_f[CL_X_OFFSET + cii];
ci_v[CL_Y_OFFSET + cii] += param->dtforce * ci_f[CL_Y_OFFSET + cii];
ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii];
ci_x[CL_X_OFFSET + cii] += param->dt * ci_v[CL_X_OFFSET + cii];
ci_x[CL_Y_OFFSET + cii] += param->dt * ci_v[CL_Y_OFFSET + cii];
ci_x[CL_Z_OFFSET + cii] += param->dt * ci_v[CL_Z_OFFSET + cii];
}
}
@@ -122,13 +120,14 @@ void finalIntegrate(Parameter *param, Atom *atom) {
DEBUG_MESSAGE("finalIntegrate start\n");
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
MD_FLOAT *civptr = cluster_velocity_ptr(ci);
MD_FLOAT *cifptr = cluster_force_ptr(ci);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) {
cluster_x(civptr, cii) += param->dtforce * cluster_x(cifptr, cii);
cluster_y(civptr, cii) += param->dtforce * cluster_y(cifptr, cii);
cluster_z(civptr, cii) += param->dtforce * cluster_z(cifptr, cii);
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
ci_v[CL_X_OFFSET + cii] += param->dtforce * ci_f[CL_X_OFFSET + cii];
ci_v[CL_Y_OFFSET + cii] += param->dtforce * ci_f[CL_Y_OFFSET + cii];
ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii];
}
}
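
Taken together, initialIntegrate and finalIntegrate implement one velocity-Verlet step, with param->dtforce playing the role of Δt/(2m) (how dtforce is initialized is not shown in this diff, so this reading is an assumption):

$$v\!\left(t+\tfrac{\Delta t}{2}\right) = v(t) + \frac{\Delta t}{2m}F(t), \quad x(t+\Delta t) = x(t) + \Delta t\, v\!\left(t+\tfrac{\Delta t}{2}\right), \quad v(t+\Delta t) = v\!\left(t+\tfrac{\Delta t}{2}\right) + \frac{\Delta t}{2m}F(t+\Delta t)$$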
@@ -316,6 +315,7 @@ int main(int argc, char** argv) {
}
timer[TOTAL] = getTimeStamp() - timer[TOTAL];
updateSingleAtoms(&atom);
computeThermo(-1, &param, &atom);
if(param.xtc_file != NULL) {
@@ -323,6 +323,7 @@ int main(int argc, char** argv) {
}
printf(HLINE);
printf("Kernel: %s, MxN: %dx%d, Vector width: %d\n", KERNEL_NAME, CLUSTER_M, CLUSTER_N, VECTOR_WIDTH);
printf("Data layout for positions: %s\n", POS_DATA_LAYOUT);
#if PRECISION == 1
printf("Using single precision floating point.\n");

View File

@@ -39,8 +39,8 @@ static int nbinx, nbiny;
static int mbinx, mbiny; // n bins in x, y
static int *bincount;
static int *bins;
static int *cluster_bincount;
static int *cluster_bins;
static int *bin_nclusters;
static int *bin_clusters;
static int mbins; //total number of bins
static int atoms_per_bin; // max atoms per bin
static int clusters_per_bin; // max clusters per bin
@@ -63,12 +63,12 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) {
cutneigh = param->cutneigh;
nmax = 0;
atoms_per_bin = 8;
clusters_per_bin = (atoms_per_bin / CLUSTER_DIM_M) + 4;
clusters_per_bin = (atoms_per_bin / CLUSTER_M) + 10;
stencil = NULL;
bins = NULL;
bincount = NULL;
cluster_bins = NULL;
cluster_bincount = NULL;
bin_clusters = NULL;
bin_nclusters = NULL;
neighbor->maxneighs = 100;
neighbor->numneigh = NULL;
neighbor->neighbors = NULL;
@@ -91,7 +91,7 @@ void setupNeighbor(Parameter *param, Atom *atom) {
MD_FLOAT zlo = 0.0; MD_FLOAT zhi = zprd;
MD_FLOAT atom_density = ((MD_FLOAT)(atom->Nlocal)) / ((xhi - xlo) * (yhi - ylo) * (zhi - zlo));
MD_FLOAT atoms_in_cell = MAX(CLUSTER_DIM_M, CLUSTER_DIM_N);
MD_FLOAT atoms_in_cell = MAX(CLUSTER_M, CLUSTER_N);
MD_FLOAT targetsizex = cbrt(atoms_in_cell / atom_density);
MD_FLOAT targetsizey = cbrt(atoms_in_cell / atom_density);
nbinx = MAX(1, (int)ceil((xhi - xlo) / targetsizex));
@@ -103,20 +103,16 @@ void setupNeighbor(Parameter *param, Atom *atom) {
cutneighsq = cutneigh * cutneigh;
coord = xlo - cutneigh - SMALL * xprd;
mbinxlo = (int) (coord * bininvx);
if (coord < 0.0) {
mbinxlo = mbinxlo - 1;
}
mbinxlo = (int)(coord * bininvx);
if(coord < 0.0) { mbinxlo = mbinxlo - 1; }
coord = xhi + cutneigh + SMALL * xprd;
mbinxhi = (int) (coord * bininvx);
mbinxhi = (int)(coord * bininvx);
coord = ylo - cutneigh - SMALL * yprd;
mbinylo = (int) (coord * bininvy);
if (coord < 0.0) {
mbinylo = mbinylo - 1;
}
mbinylo = (int)(coord * bininvy);
if(coord < 0.0) { mbinylo = mbinylo - 1; }
coord = yhi + cutneigh + SMALL * yprd;
mbinyhi = (int) (coord * bininvy);
mbinyhi = (int)(coord * bininvy);
mbinxlo = mbinxlo - 1;
mbinxhi = mbinxhi + 1;
@@ -126,16 +122,12 @@ void setupNeighbor(Parameter *param, Atom *atom) {
mbinyhi = mbinyhi + 1;
mbiny = mbinyhi - mbinylo + 1;
nextx = (int) (cutneigh * bininvx);
nextx = (int)(cutneigh * bininvx);
nexty = (int)(cutneigh * bininvy);
if(nextx * binsizex < FACTOR * cutneigh) nextx++;
nexty = (int) (cutneigh * bininvy);
if(nexty * binsizey < FACTOR * cutneigh) nexty++;
if (stencil) {
free(stencil);
}
if (stencil) { free(stencil); }
stencil = (int *) malloc((2 * nexty + 1) * (2 * nextx + 1) * sizeof(int));
nstencil = 0;
@@ -147,19 +139,15 @@ void setupNeighbor(Parameter *param, Atom *atom) {
}
}
if(bincount) { free(bincount); }
if(bins) { free(bins); }
if(bin_nclusters) { free(bin_nclusters); }
if(bin_clusters) { free(bin_clusters); }
mbins = mbinx * mbiny;
if (bincount) { free(bincount); }
bincount = (int*) malloc(mbins * sizeof(int));
if (bins) { free(bins); }
bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));
if (cluster_bincount) { free(cluster_bincount); }
cluster_bincount = (int*) malloc(mbins * sizeof(int));
if (cluster_bins) { free(cluster_bins); }
cluster_bins = (int*) malloc(mbins * clusters_per_bin * sizeof(int));
bin_nclusters = (int*) malloc(mbins * sizeof(int));
bin_clusters = (int*) malloc(mbins * clusters_per_bin * sizeof(int));
/*
DEBUG_MESSAGE("lo, hi = (%e, %e, %e), (%e, %e, %e)\n", xlo, ylo, zlo, xhi, yhi, zhi);
@@ -171,20 +159,20 @@ void setupNeighbor(Parameter *param, Atom *atom) {
}
MD_FLOAT getBoundingBoxDistanceSq(Atom *atom, int ci, int cj) {
MD_FLOAT dl = atom->clusters[ci].bbminx - atom->clusters[cj].bbmaxx;
MD_FLOAT dh = atom->clusters[cj].bbminx - atom->clusters[ci].bbmaxx;
MD_FLOAT dl = atom->iclusters[ci].bbminx - atom->jclusters[cj].bbmaxx;
MD_FLOAT dh = atom->jclusters[cj].bbminx - atom->iclusters[ci].bbmaxx;
MD_FLOAT dm = MAX(dl, dh);
MD_FLOAT dm0 = MAX(dm, 0.0);
MD_FLOAT d2 = dm0 * dm0;
dl = atom->clusters[ci].bbminy - atom->clusters[cj].bbmaxy;
dh = atom->clusters[cj].bbminy - atom->clusters[ci].bbmaxy;
dl = atom->iclusters[ci].bbminy - atom->jclusters[cj].bbmaxy;
dh = atom->jclusters[cj].bbminy - atom->iclusters[ci].bbmaxy;
dm = MAX(dl, dh);
dm0 = MAX(dm, 0.0);
d2 += dm0 * dm0;
dl = atom->clusters[ci].bbminz - atom->clusters[cj].bbmaxz;
dh = atom->clusters[cj].bbminz - atom->clusters[ci].bbmaxz;
dl = atom->iclusters[ci].bbminz - atom->jclusters[cj].bbmaxz;
dh = atom->jclusters[cj].bbminz - atom->iclusters[ci].bbmaxz;
dm = MAX(dl, dh);
dm0 = MAX(dm, 0.0);
d2 += dm0 * dm0;
@@ -192,14 +180,16 @@ MD_FLOAT getBoundingBoxDistanceSq(Atom *atom, int ci, int cj) {
}
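
What getBoundingBoxDistanceSq computes is the squared gap between two axis-aligned bounding boxes, clamped to zero per axis (so it is zero wherever the boxes overlap along an axis):

$$d^2(i,j) = \sum_{a \in \{x,y,z\}} \Big(\max\big(0,\ \max(\mathrm{bbmin}^{\,i}_a - \mathrm{bbmax}^{\,j}_a,\ \mathrm{bbmin}^{\,j}_a - \mathrm{bbmax}^{\,i}_a)\big)\Big)^2$$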
int atomDistanceInRange(Atom *atom, int ci, int cj, MD_FLOAT rsq) {
MD_FLOAT *ciptr = cluster_pos_ptr(ci);
MD_FLOAT *cjptr = cluster_pos_ptr(cj);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) {
for(int cjj = 0; cjj < atom->clusters[cj].natoms; cjj++) {
MD_FLOAT delx = cluster_x(ciptr, cii) - cluster_x(cjptr, cjj);
MD_FLOAT dely = cluster_y(ciptr, cii) - cluster_y(cjptr, cjj);
MD_FLOAT delz = cluster_z(ciptr, cii) - cluster_z(cjptr, cjj);
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
MD_FLOAT delx = ci_x[CL_X_OFFSET + cii] - cj_x[CL_X_OFFSET + cjj];
MD_FLOAT dely = ci_x[CL_Y_OFFSET + cii] - cj_x[CL_Y_OFFSET + cjj];
MD_FLOAT delz = ci_x[CL_Z_OFFSET + cii] - cj_x[CL_Z_OFFSET + cjj];
if(delx * delx + dely * dely + delz * delz < rsq) {
return 1;
}
@@ -211,11 +201,10 @@ int atomDistanceInRange(Atom *atom, int ci, int cj, MD_FLOAT rsq) {
void buildNeighbor(Atom *atom, Neighbor *neighbor) {
DEBUG_MESSAGE("buildNeighbor start\n");
int nall = atom->Nclusters_local + atom->Nclusters_ghost;
/* extend atom arrays if necessary */
if(nall > nmax) {
nmax = nall;
if(atom->Nclusters_local > nmax) {
nmax = atom->Nclusters_local;
if(neighbor->numneigh) free(neighbor->numneigh);
if(neighbor->neighbors) free(neighbor->neighbors);
neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
@@ -234,22 +223,22 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
resize = 0;
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int* neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
int n = 0;
int ibin = atom->clusters[ci].bin;
MD_FLOAT ibb_xmin = atom->clusters[ci].bbminx;
MD_FLOAT ibb_xmax = atom->clusters[ci].bbmaxx;
MD_FLOAT ibb_ymin = atom->clusters[ci].bbminy;
MD_FLOAT ibb_ymax = atom->clusters[ci].bbmaxy;
MD_FLOAT ibb_zmin = atom->clusters[ci].bbminz;
MD_FLOAT ibb_zmax = atom->clusters[ci].bbmaxz;
int ibin = atom->icluster_bin[ci];
MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx;
MD_FLOAT ibb_xmax = atom->iclusters[ci].bbmaxx;
MD_FLOAT ibb_ymin = atom->iclusters[ci].bbminy;
MD_FLOAT ibb_ymax = atom->iclusters[ci].bbmaxy;
MD_FLOAT ibb_zmin = atom->iclusters[ci].bbminz;
MD_FLOAT ibb_zmax = atom->iclusters[ci].bbmaxz;
for(int k = 0; k < nstencil; k++) {
int jbin = ibin + stencil[k];
int *loc_bin = &cluster_bins[jbin * clusters_per_bin];
int *loc_bin = &bin_clusters[jbin * clusters_per_bin];
int cj, m = -1;
MD_FLOAT jbb_xmin, jbb_xmax, jbb_ymin, jbb_ymax, jbb_zmin, jbb_zmax;
const int c = cluster_bincount[jbin];
const int c = bin_nclusters[jbin];
if(c > 0) {
MD_FLOAT dl, dh, dm, dm0, d_bb_sq;
@@ -257,8 +246,8 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
do {
m++;
cj = loc_bin[m];
jbb_zmin = atom->clusters[cj].bbminz;
jbb_zmax = atom->clusters[cj].bbmaxz;
jbb_zmin = atom->jclusters[cj].bbminz;
jbb_zmax = atom->jclusters[cj].bbmaxz;
dl = ibb_zmin - jbb_zmax;
dh = jbb_zmin - ibb_zmax;
dm = MAX(dl, dh);
@@ -266,10 +255,10 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
d_bb_sq = dm0 * dm0;
} while(m + 1 < c && d_bb_sq > cutneighsq);
jbb_xmin = atom->clusters[cj].bbminx;
jbb_xmax = atom->clusters[cj].bbmaxx;
jbb_ymin = atom->clusters[cj].bbminy;
jbb_ymax = atom->clusters[cj].bbmaxy;
jbb_xmin = atom->jclusters[cj].bbminx;
jbb_xmax = atom->jclusters[cj].bbmaxx;
jbb_ymin = atom->jclusters[cj].bbminy;
jbb_ymax = atom->jclusters[cj].bbmaxy;
while(m < c) {
dl = ibb_zmin - jbb_zmax;
@@ -303,20 +292,21 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
m++;
if(m < c) {
cj = loc_bin[m];
jbb_xmin = atom->clusters[cj].bbminx;
jbb_xmax = atom->clusters[cj].bbmaxx;
jbb_ymin = atom->clusters[cj].bbminy;
jbb_ymax = atom->clusters[cj].bbmaxy;
jbb_zmin = atom->clusters[cj].bbminz;
jbb_zmax = atom->clusters[cj].bbmaxz;
jbb_xmin = atom->jclusters[cj].bbminx;
jbb_xmax = atom->jclusters[cj].bbmaxx;
jbb_ymin = atom->jclusters[cj].bbminy;
jbb_ymax = atom->jclusters[cj].bbmaxy;
jbb_zmin = atom->jclusters[cj].bbminz;
jbb_zmax = atom->jclusters[cj].bbmaxz;
}
}
}
}
if(CLUSTER_DIM_N > CLUSTER_DIM_M) {
while(n % (CLUSTER_DIM_N / CLUSTER_DIM_M)) {
neighptr[n++] = nall - 1; // Last cluster is always a dummy cluster
// Fill neighbor list with dummy values to fit vector width
if(CLUSTER_N < VECTOR_WIDTH) {
while(n % (VECTOR_WIDTH / CLUSTER_N)) {
neighptr[n++] = atom->dummy_cj; // Last cluster is always a dummy cluster
}
}
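
The padding arithmetic above in numbers, for the single-precision AVX512 configuration (CLUSTER_N = 8, VECTOR_WIDTH = 16): the list length is rounded up to a multiple of VECTOR_WIDTH / CLUSTER_N = 2, so 13 real j-clusters become 14 entries, the last one being atom->dummy_cj. A sketch of the same logic as a standalone helper (hypothetical, for illustration):

/* Mirrors the while() above: pad n up to a multiple of VECTOR_WIDTH / CLUSTER_N. */
int padded_neigh_count(int n, int vector_width, int cluster_n) {
    int step = vector_width / cluster_n; /* 2 in the SP AVX512 configuration */
    while (n % step) { n++; }
    return n;
}

Dummy entries are benign in the kernels: like the INFINITY padding in buildClusters, the dummy cluster's atoms are expected to fail the rsq < cutforcesq test, so they contribute no force.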
@@ -341,38 +331,40 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
/*
DEBUG_MESSAGE("\ncutneighsq = %f, rbb_sq = %f\n", cutneighsq, rbb_sq);
for(int ci = 0; ci < 6; ci++) {
MD_FLOAT *ciptr = cluster_pos_ptr(ci);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
int* neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
DEBUG_MESSAGE("Cluster %d, bbx = {%f, %f}, bby = {%f, %f}, bbz = {%f, %f}\n",
ci,
atom->clusters[ci].bbminx,
atom->clusters[ci].bbmaxx,
atom->clusters[ci].bbminy,
atom->clusters[ci].bbmaxy,
atom->clusters[ci].bbminz,
atom->clusters[ci].bbmaxz);
atom->iclusters[ci].bbminx,
atom->iclusters[ci].bbmaxx,
atom->iclusters[ci].bbminy,
atom->iclusters[ci].bbmaxy,
atom->iclusters[ci].bbminz,
atom->iclusters[ci].bbmaxz);
for(int cii = 0; cii < CLUSTER_DIM_M; cii++) {
DEBUG_MESSAGE("%f, %f, %f\n", cluster_x(ciptr, cii), cluster_y(ciptr, cii), cluster_z(ciptr, cii));
for(int cii = 0; cii < CLUSTER_M; cii++) {
DEBUG_MESSAGE("%f, %f, %f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
}
DEBUG_MESSAGE("Neighbors:\n");
for(int k = 0; k < neighbor->numneigh[ci]; k++) {
const int cj = neighptr[k];
MD_FLOAT *cjptr = cluster_pos_ptr(cj);
int cj = neighptr[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
DEBUG_MESSAGE(" Cluster %d, bbx = {%f, %f}, bby = {%f, %f}, bbz = {%f, %f}\n",
cj,
atom->clusters[cj].bbminx,
atom->clusters[cj].bbmaxx,
atom->clusters[cj].bbminy,
atom->clusters[cj].bbmaxy,
atom->clusters[cj].bbminz,
atom->clusters[cj].bbmaxz);
atom->jclusters[cj].bbminx,
atom->jclusters[cj].bbmaxx,
atom->jclusters[cj].bbminy,
atom->jclusters[cj].bbmaxy,
atom->jclusters[cj].bbminz,
atom->jclusters[cj].bbmaxz);
for(int cjj = 0; cjj < CLUSTER_DIM_M; cjj++) {
DEBUG_MESSAGE(" %f, %f, %f\n", cluster_x(cjptr, cjj), cluster_y(cjptr, cjj), cluster_z(cjptr, cjj));
for(int cjj = 0; cjj < CLUSTER_N; cjj++) {
DEBUG_MESSAGE(" %f, %f, %f\n", cj_x[CL_X_OFFSET + cjj], cj_x[CL_Y_OFFSET + cjj], cj_x[CL_Z_OFFSET + cjj]);
}
}
}
@@ -383,7 +375,6 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
DEBUG_MESSAGE("pruneNeighbor start\n");
int nall = atom->Nclusters_local + atom->Nclusters_ghost;
//MD_FLOAT cutsq = param->cutforce * param->cutforce;
MD_FLOAT cutsq = cutneighsq;
@@ -393,8 +384,8 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
int k = 0;
// Remove dummy clusters if necessary
if(CLUSTER_DIM_N > CLUSTER_DIM_M) {
while(neighs[numneighs - 1] == nall - 1) {
if(CLUSTER_N < VECTOR_WIDTH) {
while(neighs[numneighs - 1] == atom->dummy_cj) {
numneighs--;
}
}
@@ -410,9 +401,9 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
}
// Readd dummy clusters if necessary
if(CLUSTER_DIM_N > CLUSTER_DIM_M) {
while(numneighs % (CLUSTER_DIM_N / CLUSTER_DIM_M)) {
neighs[numneighs++] = nall - 1; // Last cluster is always a dummy cluster
if(CLUSTER_N < VECTOR_WIDTH) {
while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) {
neighs[numneighs++] = atom->dummy_cj; // Last cluster is always a dummy cluster
}
}
@@ -558,34 +549,37 @@ void buildClusters(Atom *atom) {
for(int bin = 0; bin < mbins; bin++) {
int c = bincount[bin];
int ac = 0;
const int nclusters = ((c + CLUSTER_DIM_M - 1) / CLUSTER_DIM_M);
int nclusters = ((c + CLUSTER_M - 1) / CLUSTER_M);
if(CLUSTER_N > CLUSTER_M && nclusters % 2) { nclusters++; }
for(int cl = 0; cl < nclusters; cl++) {
const int ci = atom->Nclusters_local;
if(ci >= atom->Nclusters_max) {
growClusters(atom);
}
MD_FLOAT *cptr = cluster_pos_ptr(ci);
MD_FLOAT *cvptr = cluster_velocity_ptr(ci);
int ci_sca_base = CI_SCALAR_BASE_INDEX(ci);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
int *ci_type = &atom->cl_type[ci_sca_base];
MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;
atom->clusters[ci].natoms = 0;
for(int cii = 0; cii < CLUSTER_DIM_M; cii++) {
atom->iclusters[ci].natoms = 0;
for(int cii = 0; cii < CLUSTER_M; cii++) {
if(ac < c) {
int i = bins[bin * atoms_per_bin + ac];
MD_FLOAT xtmp = atom_x(i);
MD_FLOAT ytmp = atom_y(i);
MD_FLOAT ztmp = atom_z(i);
cluster_x(cptr, cii) = xtmp;
cluster_y(cptr, cii) = ytmp;
cluster_z(cptr, cii) = ztmp;
cluster_x(cvptr, cii) = atom->vx[i];
cluster_y(cvptr, cii) = atom->vy[i];
cluster_z(cvptr, cii) = atom->vz[i];
ci_x[CL_X_OFFSET + cii] = xtmp;
ci_x[CL_Y_OFFSET + cii] = ytmp;
ci_x[CL_Z_OFFSET + cii] = ztmp;
ci_v[CL_X_OFFSET + cii] = atom->vx[i];
ci_v[CL_Y_OFFSET + cii] = atom->vy[i];
ci_v[CL_Z_OFFSET + cii] = atom->vz[i];
// TODO: To create the bounding boxes faster, we can use SIMD operations
if(bbminx > xtmp) { bbminx = xtmp; }
@@ -595,24 +589,24 @@ void buildClusters(Atom *atom) {
if(bbminz > ztmp) { bbminz = ztmp; }
if(bbmaxz < ztmp) { bbmaxz = ztmp; }
atom->clusters[ci].type[cii] = atom->type[i];
atom->clusters[ci].natoms++;
ci_type[cii] = atom->type[i];
atom->iclusters[ci].natoms++;
} else {
cluster_x(cptr, cii) = INFINITY;
cluster_y(cptr, cii) = INFINITY;
cluster_z(cptr, cii) = INFINITY;
ci_x[CL_X_OFFSET + cii] = INFINITY;
ci_x[CL_Y_OFFSET + cii] = INFINITY;
ci_x[CL_Z_OFFSET + cii] = INFINITY;
}
ac++;
}
atom->clusters[ci].bin = bin;
atom->clusters[ci].bbminx = bbminx;
atom->clusters[ci].bbmaxx = bbmaxx;
atom->clusters[ci].bbminy = bbminy;
atom->clusters[ci].bbmaxy = bbmaxy;
atom->clusters[ci].bbminz = bbminz;
atom->clusters[ci].bbmaxz = bbmaxz;
atom->icluster_bin[ci] = bin;
atom->iclusters[ci].bbminx = bbminx;
atom->iclusters[ci].bbmaxx = bbmaxx;
atom->iclusters[ci].bbminy = bbminy;
atom->iclusters[ci].bbmaxy = bbmaxy;
atom->iclusters[ci].bbminz = bbminz;
atom->iclusters[ci].bbmaxz = bbmaxz;
atom->Nclusters_local++;
}
}
@ -620,6 +614,94 @@ void buildClusters(Atom *atom) {
DEBUG_MESSAGE("buildClusters end\n");
}
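/*
 * Sketch of the indexing assumed by the accesses above; the actual macro
 * definitions live in a header outside this hunk, so treat these as an
 * illustration of the layout (x-block, then y-block, then z-block per cluster)
 * rather than the authoritative definitions:
 *
 *   #define CI_SCALAR_BASE_INDEX(ci) ((ci) * CLUSTER_M)      // one slot per atom
 *   #define CI_VECTOR_BASE_INDEX(ci) ((ci) * CLUSTER_M * 3)  // x/y/z per atom
 *   #define CL_X_OFFSET              (0 * CLUSTER_M)         // if CLUSTER_M >= CLUSTER_N
 *   #define CL_Y_OFFSET              (1 * CLUSTER_M)
 *   #define CL_Z_OFFSET              (2 * CLUSTER_M)
 *
 * Atom cii of i-cluster ci is then read as:
 *   atom->cl_x[CI_VECTOR_BASE_INDEX(ci) + CL_X_OFFSET + cii]
 */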
void defineJClusters(Atom *atom) {
DEBUG_MESSAGE("defineJClusters start\n");
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int cj0 = CJ0_FROM_CI(ci);
if(CLUSTER_M == CLUSTER_N) {
atom->jclusters[cj0].bbminx = atom->iclusters[ci].bbminx;
atom->jclusters[cj0].bbmaxx = atom->iclusters[ci].bbmaxx;
atom->jclusters[cj0].bbminy = atom->iclusters[ci].bbminy;
atom->jclusters[cj0].bbmaxy = atom->iclusters[ci].bbmaxy;
atom->jclusters[cj0].bbminz = atom->iclusters[ci].bbminz;
atom->jclusters[cj0].bbmaxz = atom->iclusters[ci].bbmaxz;
atom->jclusters[cj0].natoms = atom->iclusters[ci].natoms;
} else if(CLUSTER_M > CLUSTER_N) {
int cj1 = CJ1_FROM_CI(ci);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;
for(int cii = 0; cii < MIN(atom->iclusters[ci].natoms, CLUSTER_N); cii++) {
MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii];
MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii];
MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii];
// TODO: To create the bounding boxes faster, we can use SIMD operations
if(bbminx > xtmp) { bbminx = xtmp; }
if(bbmaxx < xtmp) { bbmaxx = xtmp; }
if(bbminy > ytmp) { bbminy = ytmp; }
if(bbmaxy < ytmp) { bbmaxy = ytmp; }
if(bbminz > ztmp) { bbminz = ztmp; }
if(bbmaxz < ztmp) { bbmaxz = ztmp; }
}
atom->jclusters[cj0].bbminx = bbminx;
atom->jclusters[cj0].bbmaxx = bbmaxx;
atom->jclusters[cj0].bbminy = bbminy;
atom->jclusters[cj0].bbmaxy = bbmaxy;
atom->jclusters[cj0].bbminz = bbminz;
atom->jclusters[cj0].bbmaxz = bbmaxz;
atom->jclusters[cj0].natoms = MIN(atom->iclusters[ci].natoms, CLUSTER_N);
bbminx = INFINITY, bbmaxx = -INFINITY;
bbminy = INFINITY, bbmaxy = -INFINITY;
bbminz = INFINITY, bbmaxz = -INFINITY;
for(int cii = CLUSTER_N; cii < atom->iclusters[ci].natoms; cii++) {
MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii];
MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii];
MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii];
// TODO: To create the bounding boxes faster, we can use SIMD operations
if(bbminx > xtmp) { bbminx = xtmp; }
if(bbmaxx < xtmp) { bbmaxx = xtmp; }
if(bbminy > ytmp) { bbminy = ytmp; }
if(bbmaxy < ytmp) { bbmaxy = ytmp; }
if(bbminz > ztmp) { bbminz = ztmp; }
if(bbmaxz < ztmp) { bbmaxz = ztmp; }
}
atom->jclusters[cj1].bbminx = bbminx;
atom->jclusters[cj1].bbmaxx = bbmaxx;
atom->jclusters[cj1].bbminy = bbminy;
atom->jclusters[cj1].bbmaxy = bbmaxy;
atom->jclusters[cj1].bbminz = bbminz;
atom->jclusters[cj1].bbmaxz = bbmaxz;
atom->jclusters[cj1].natoms = MAX(0, atom->iclusters[ci].natoms - CLUSTER_N);
} else {
if(ci % 2 == 0) {
const int ci1 = ci + 1;
atom->jclusters[cj0].bbminx = MIN(atom->iclusters[ci].bbminx, atom->iclusters[ci1].bbminx);
atom->jclusters[cj0].bbmaxx = MAX(atom->iclusters[ci].bbmaxx, atom->iclusters[ci1].bbmaxx);
atom->jclusters[cj0].bbminy = MIN(atom->iclusters[ci].bbminy, atom->iclusters[ci1].bbminy);
atom->jclusters[cj0].bbmaxy = MAX(atom->iclusters[ci].bbmaxy, atom->iclusters[ci1].bbmaxy);
atom->jclusters[cj0].bbminz = MIN(atom->iclusters[ci].bbminz, atom->iclusters[ci1].bbminz);
atom->jclusters[cj0].bbmaxz = MAX(atom->iclusters[ci].bbmaxz, atom->iclusters[ci1].bbmaxz);
atom->jclusters[cj0].natoms = atom->iclusters[ci].natoms + atom->iclusters[ci1].natoms;
}
}
}
DEBUG_MESSAGE("defineJClusters end\n");
}
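/*
 * Sketch of the i- to j-cluster mapping assumed by CJ0_FROM_CI/CJ1_FROM_CI
 * (defined elsewhere; shown here only as an illustration of the three cases
 * handled above):
 *
 *   M == N:  CJ0_FROM_CI(ci) == ci            // identity mapping
 *   M == 2N: CJ0_FROM_CI(ci) == ci * 2        // each i-cluster splits in two
 *            CJ1_FROM_CI(ci) == ci * 2 + 1
 *   N == 2M: CJ0_FROM_CI(ci) == ci / 2        // two i-clusters share one j-cluster
 */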
void binClusters(Atom *atom) {
DEBUG_MESSAGE("binClusters start\n");
@ -629,7 +711,7 @@ void binClusters(Atom *atom) {
MD_FLOAT *cptr = cluster_pos_ptr(ci);
DEBUG_MESSAGE("Cluster %d:\n", ci);
DEBUG_MESSAGE("bin=%d, Natoms=%d, bbox={%f,%f},{%f,%f},{%f,%f}\n",
atom->clusters[ci].bin,
atom->icluster_bin[ci],
atom->clusters[ci].natoms,
atom->clusters[ci].bbminx,
atom->clusters[ci].bbmaxx,
@ -638,104 +720,120 @@ void binClusters(Atom *atom) {
atom->clusters[ci].bbminz,
atom->clusters[ci].bbmaxz);
for(int cii = 0; cii < CLUSTER_DIM_M; cii++) {
for(int cii = 0; cii < CLUSTER_M; cii++) {
DEBUG_MESSAGE("%f, %f, %f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii));
}
}
*/
const int nlocal = atom->Nclusters_local;
int resize = 1;
const int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
const int ncj = atom->Nclusters_local / jfac;
int resize = 1;
while(resize > 0) {
resize = 0;
for(int bin = 0; bin < mbins; bin++) {
cluster_bincount[bin] = 0;
bin_nclusters[bin] = 0;
}
for(int ci = 0; ci < nlocal && !resize; ci++) {
int bin = atom->clusters[ci].bin;
int c = cluster_bincount[bin];
if(c < clusters_per_bin) {
cluster_bins[bin * clusters_per_bin + c] = ci;
cluster_bincount[bin]++;
} else {
resize = 1;
// Ensure we add this j-cluster only once to the bin
if(!(CLUSTER_M < CLUSTER_N && ci % 2)) {
int bin = atom->icluster_bin[ci];
int c = bin_nclusters[bin];
if(c + 1 < clusters_per_bin) {
bin_clusters[bin * clusters_per_bin + c] = CJ0_FROM_CI(ci);
bin_nclusters[bin]++;
if(CLUSTER_M > CLUSTER_N) {
int cj1 = CJ1_FROM_CI(ci);
if(atom->jclusters[cj1].natoms > 0) {
bin_clusters[bin * clusters_per_bin + c + 1] = cj1;
bin_nclusters[bin]++;
}
}
} else {
resize = 1;
}
}
}
for(int cg = 0; cg < atom->Nclusters_ghost && !resize; cg++) {
const int ci = nlocal + cg;
MD_FLOAT *cptr = cluster_pos_ptr(ci);
MD_FLOAT ci_minz = atom->clusters[ci].bbminz;
MD_FLOAT xtmp, ytmp;
const int cj = ncj + cg;
int ix = -1, iy = -1;
MD_FLOAT xtmp, ytmp;
xtmp = cluster_x(cptr, 0);
ytmp = cluster_y(cptr, 0);
coord2bin2D(xtmp, ytmp, &ix, &iy);
ix = MAX(MIN(ix, mbinx - 1), 0);
iy = MAX(MIN(iy, mbiny - 1), 0);
for(int cii = 1; cii < atom->clusters[ci].natoms; cii++) {
int nix, niy;
xtmp = cluster_x(cptr, cii);
ytmp = cluster_y(cptr, cii);
coord2bin2D(xtmp, ytmp, &nix, &niy);
nix = MAX(MIN(nix, mbinx - 1), 0);
niy = MAX(MIN(niy, mbiny - 1), 0);
if(atom->jclusters[cj].natoms > 0) {
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_FLOAT cj_minz = atom->jclusters[cj].bbminz;
// Always put the cluster in the bin of its innermost atom so that
// the cluster stays as close as possible to the local clusters
if(atom->PBCx[cg] > 0 && ix > nix) { ix = nix; }
if(atom->PBCx[cg] < 0 && ix < nix) { ix = nix; }
if(atom->PBCy[cg] > 0 && iy > niy) { iy = niy; }
if(atom->PBCy[cg] < 0 && iy < niy) { iy = niy; }
}
xtmp = cj_x[CL_X_OFFSET + 0];
ytmp = cj_x[CL_Y_OFFSET + 0];
coord2bin2D(xtmp, ytmp, &ix, &iy);
ix = MAX(MIN(ix, mbinx - 1), 0);
iy = MAX(MIN(iy, mbiny - 1), 0);
for(int cjj = 1; cjj < atom->jclusters[cj].natoms; cjj++) {
int nix, niy;
xtmp = cj_x[CL_X_OFFSET + cjj];
ytmp = cj_x[CL_Y_OFFSET + cjj];
coord2bin2D(xtmp, ytmp, &nix, &niy);
nix = MAX(MIN(nix, mbinx - 1), 0);
niy = MAX(MIN(niy, mbiny - 1), 0);
int bin = iy * mbinx + ix + 1;
int c = cluster_bincount[bin];
if(c < clusters_per_bin) {
// Insert the current ghost cluster into the bin, keeping clusters
// sorted by z coordinate
int inserted = 0;
for(int i = 0; i < c; i++) {
int last_cl = cluster_bins[bin * clusters_per_bin + i];
if(atom->clusters[last_cl].bbminz > ci_minz) {
cluster_bins[bin * clusters_per_bin + i] = ci;
// Always put the cluster in the bin of its innermost atom so that
// the cluster stays as close as possible to the local clusters
if(atom->PBCx[cg] > 0 && ix > nix) { ix = nix; }
if(atom->PBCx[cg] < 0 && ix < nix) { ix = nix; }
if(atom->PBCy[cg] > 0 && iy > niy) { iy = niy; }
if(atom->PBCy[cg] < 0 && iy < niy) { iy = niy; }
}
for(int j = i + 1; j <= c; j++) {
int tmp = cluster_bins[bin * clusters_per_bin + j];
cluster_bins[bin * clusters_per_bin + j] = last_cl;
last_cl = tmp;
int bin = iy * mbinx + ix + 1;
int c = bin_nclusters[bin];
if(c < clusters_per_bin) {
// Insert the current ghost cluster into the bin, keeping clusters
// sorted by z coordinate
int inserted = 0;
for(int i = 0; i < c; i++) {
int last_cl = bin_clusters[bin * clusters_per_bin + i];
if(atom->jclusters[last_cl].bbminz > cj_minz) {
bin_clusters[bin * clusters_per_bin + i] = cj;
for(int j = i + 1; j <= c; j++) {
int tmp = bin_clusters[bin * clusters_per_bin + j];
bin_clusters[bin * clusters_per_bin + j] = last_cl;
last_cl = tmp;
}
inserted = 1;
break;
}
inserted = 1;
break;
}
}
if(!inserted) {
cluster_bins[bin * clusters_per_bin + c] = ci;
}
if(!inserted) {
bin_clusters[bin * clusters_per_bin + c] = cj;
}
atom->clusters[ci].bin = bin;
cluster_bincount[bin]++;
} else {
resize = 1;
bin_nclusters[bin]++;
} else {
resize = 1;
}
}
}
if(resize) {
free(cluster_bins);
free(bin_clusters);
clusters_per_bin *= 2;
cluster_bins = (int*) malloc(mbins * clusters_per_bin * sizeof(int));
bin_clusters = (int*) malloc(mbins * clusters_per_bin * sizeof(int));
}
}
/*
DEBUG_MESSAGE("cluster_bincount\n");
for(int i = 0; i < mbins; i++) { DEBUG_MESSAGE("%d, ", cluster_bincount[i]); }
DEBUG_MESSAGE("bin_nclusters\n");
for(int i = 0; i < mbins; i++) { DEBUG_MESSAGE("%d, ", bin_nclusters[i]); }
DEBUG_MESSAGE("\n");
*/
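// The binning above follows a grow-and-retry pattern: as soon as any bin would
// exceed clusters_per_bin, the pass aborts, the per-bin capacity is doubled and
// the arrays reallocated, and binning restarts from scratch, so an overflow can
// never leave a partially filled, inconsistent binning behind.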
@ -747,16 +845,17 @@ void updateSingleAtoms(Atom *atom) {
int Natom = 0;
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
MD_FLOAT *cptr = cluster_pos_ptr(ci);
MD_FLOAT *cvptr = cluster_velocity_ptr(ci);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) {
atom_x(Natom) = cluster_x(cptr, cii);
atom_y(Natom) = cluster_y(cptr, cii);
atom_z(Natom) = cluster_z(cptr, cii);
atom->vx[Natom] = cluster_x(cvptr, cii);
atom->vy[Natom] = cluster_y(cvptr, cii);
atom->vz[Natom] = cluster_z(cvptr, cii);
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
atom_x(Natom) = ci_x[CL_X_OFFSET + cii];
atom_y(Natom) = ci_x[CL_Y_OFFSET + cii];
atom_z(Natom) = ci_x[CL_Z_OFFSET + cii];
atom->vx[Natom] = ci_v[CL_X_OFFSET + cii];
atom->vy[Natom] = ci_v[CL_Y_OFFSET + cii];
atom->vz[Natom] = ci_v[CL_Z_OFFSET + cii];
Natom++;
}
}

View File

@ -28,6 +28,7 @@
#include <atom.h>
#include <allocate.h>
#include <neighbor.h>
#include <util.h>
#define DELTA 20000
@ -45,28 +46,31 @@ void initPbc(Atom* atom) {
/* update coordinates of ghost atoms */
/* uses mapping created in setupPbc */
void updatePbc(Atom *atom, Parameter *param, int firstUpdate) {
int *border_map = atom->border_map;
int nlocal = atom->Nclusters_local;
DEBUG_MESSAGE("updatePbc start\n");
int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
int ncj = atom->Nclusters_local / jfac;
MD_FLOAT xprd = param->xprd;
MD_FLOAT yprd = param->yprd;
MD_FLOAT zprd = param->zprd;
for(int cg = 0; cg < atom->Nclusters_ghost; cg++) {
const int ci = nlocal + cg;
MD_FLOAT *cptr = cluster_pos_ptr(ci);
MD_FLOAT *bmap_cptr = cluster_pos_ptr(border_map[cg]);
const int cj = ncj + cg;
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int bmap_vec_base = CJ_VECTOR_BASE_INDEX(atom->border_map[cg]);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_FLOAT *bmap_x = &atom->cl_x[bmap_vec_base];
MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;
for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) {
MD_FLOAT xtmp = cluster_x(bmap_cptr, cii) + atom->PBCx[cg] * xprd;
MD_FLOAT ytmp = cluster_y(bmap_cptr, cii) + atom->PBCy[cg] * yprd;
MD_FLOAT ztmp = cluster_z(bmap_cptr, cii) + atom->PBCz[cg] * zprd;
for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
MD_FLOAT xtmp = bmap_x[CL_X_OFFSET + cjj] + atom->PBCx[cg] * xprd;
MD_FLOAT ytmp = bmap_x[CL_Y_OFFSET + cjj] + atom->PBCy[cg] * yprd;
MD_FLOAT ztmp = bmap_x[CL_Z_OFFSET + cjj] + atom->PBCz[cg] * zprd;
cluster_x(cptr, cii) = xtmp;
cluster_y(cptr, cii) = ytmp;
cluster_z(cptr, cii) = ztmp;
cj_x[CL_X_OFFSET + cjj] = xtmp;
cj_x[CL_Y_OFFSET + cjj] = ytmp;
cj_x[CL_Z_OFFSET + cjj] = ztmp;
if(firstUpdate) {
// TODO: To create the bounding boxes faster, we can use SIMD operations
@ -80,20 +84,22 @@ void updatePbc(Atom *atom, Parameter *param, int firstUpdate) {
}
if(firstUpdate) {
for(int cii = atom->clusters[ci].natoms; cii < CLUSTER_DIM_M; cii++) {
cluster_x(cptr, cii) = INFINITY;
cluster_y(cptr, cii) = INFINITY;
cluster_z(cptr, cii) = INFINITY;
for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) {
cj_x[CL_X_OFFSET + cjj] = INFINITY;
cj_x[CL_Y_OFFSET + cjj] = INFINITY;
cj_x[CL_Z_OFFSET + cjj] = INFINITY;
}
atom->clusters[ci].bbminx = bbminx;
atom->clusters[ci].bbmaxx = bbmaxx;
atom->clusters[ci].bbminy = bbminy;
atom->clusters[ci].bbmaxy = bbmaxy;
atom->clusters[ci].bbminz = bbminz;
atom->clusters[ci].bbmaxz = bbmaxz;
atom->jclusters[cj].bbminx = bbminx;
atom->jclusters[cj].bbmaxx = bbmaxx;
atom->jclusters[cj].bbminy = bbminy;
atom->jclusters[cj].bbmaxy = bbmaxy;
atom->jclusters[cj].bbminz = bbminz;
atom->jclusters[cj].bbmaxz = bbmaxz;
}
}
DEBUG_MESSAGE("updatePbc end\n");
}
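// Example of the shift applied above: a ghost image created with
// atom->PBCx[cg] = +1 replicates its source j-cluster displaced by one box
// length in x, i.e.
//   cj_x[CL_X_OFFSET + cjj] = bmap_x[CL_X_OFFSET + cjj] + 1.0 * xprd;
// so ghost positions track their source atoms on every update without
// rebuilding the border mapping.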
/* relocate atoms that have left domain according
@ -130,15 +136,18 @@ void updateAtomsPbc(Atom *atom, Parameter *param) {
* that are then enforced in updatePbc */
#define ADDGHOST(dx,dy,dz); \
Nghost++; \
const int g_atom_idx = atom->Nclusters_local + Nghost; \
border_map[Nghost] = ci; \
const int cg = ncj + Nghost; \
const int cj_natoms = atom->jclusters[cj].natoms; \
atom->border_map[Nghost] = cj; \
atom->PBCx[Nghost] = dx; \
atom->PBCy[Nghost] = dy; \
atom->PBCz[Nghost] = dz; \
atom->clusters[g_atom_idx].natoms = atom->clusters[ci].natoms; \
Nghost_atoms += atom->clusters[g_atom_idx].natoms; \
for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) { \
atom->clusters[g_atom_idx].type[cii] = atom->clusters[ci].type[cii]; \
atom->jclusters[cg].natoms = cj_natoms; \
Nghost_atoms += cj_natoms; \
int cj_sca_base = CJ_SCALAR_BASE_INDEX(cj); \
int cg_sca_base = CJ_SCALAR_BASE_INDEX(cg); \
for(int cjj = 0; cjj < cj_natoms; cjj++) { \
atom->cl_type[cg_sca_base + cjj] = atom->cl_type[cj_sca_base + cjj]; \
}
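// Note on ADDGHOST: the macro increments Nghost first and uses it as the index
// of the new ghost cluster, records the source j-cluster and its image shift in
// border_map/PBCx/PBCy/PBCz, and copies the per-atom types; it assumes cj, ncj,
// Nghost and Nghost_atoms are in scope at the expansion site.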
/* internal subroutines */
@ -153,80 +162,86 @@ void growPbc(Atom* atom) {
}
void setupPbc(Atom *atom, Parameter *param) {
int *border_map = atom->border_map;
DEBUG_MESSAGE("setupPbc start\n");
MD_FLOAT xprd = param->xprd;
MD_FLOAT yprd = param->yprd;
MD_FLOAT zprd = param->zprd;
MD_FLOAT Cutneigh = param->cutneigh;
int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
int ncj = atom->Nclusters_local / jfac;
int Nghost = -1;
int Nghost_atoms = 0;
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
if (atom->Nclusters_local + Nghost + 7 >= atom->Nclusters_max) {
growClusters(atom);
for(int cj = 0; cj < ncj; cj++) {
if(atom->jclusters[cj].natoms > 0) {
if(atom->Nclusters_local + (Nghost + 7) * jfac >= atom->Nclusters_max) {
growClusters(atom);
}
if((Nghost + 7) * jfac >= NmaxGhost) {
growPbc(atom);
}
MD_FLOAT bbminx = atom->jclusters[cj].bbminx;
MD_FLOAT bbmaxx = atom->jclusters[cj].bbmaxx;
MD_FLOAT bbminy = atom->jclusters[cj].bbminy;
MD_FLOAT bbmaxy = atom->jclusters[cj].bbmaxy;
MD_FLOAT bbminz = atom->jclusters[cj].bbminz;
MD_FLOAT bbmaxz = atom->jclusters[cj].bbmaxz;
/* Setup ghost atoms */
/* 6 planes */
if (bbminx < Cutneigh) { ADDGHOST(+1,0,0); }
if (bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,0,0); }
if (bbminy < Cutneigh) { ADDGHOST(0,+1,0); }
if (bbmaxy >= (yprd-Cutneigh)) { ADDGHOST(0,-1,0); }
if (bbminz < Cutneigh) { ADDGHOST(0,0,+1); }
if (bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,0,-1); }
/* 8 corners */
if (bbminx < Cutneigh && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,+1,+1); }
if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(+1,-1,+1); }
if (bbminx < Cutneigh && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); }
if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,-1,-1); }
if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(-1,+1,+1); }
if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,-1,+1); }
if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,+1,-1); }
if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,-1,-1); }
/* 12 edges */
if (bbminx < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,0,+1); }
if (bbminx < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,0,-1); }
if (bbmaxx >= (xprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,0,+1); }
if (bbmaxx >= (xprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,0,-1); }
if (bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(0,+1,+1); }
if (bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,+1,-1); }
if (bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(0,-1,+1); }
if (bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,-1,-1); }
if (bbminy < Cutneigh && bbminx < Cutneigh) { ADDGHOST(+1,+1,0); }
if (bbminy < Cutneigh && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,+1,0); }
if (bbmaxy >= (yprd-Cutneigh) && bbminx < Cutneigh) { ADDGHOST(+1,-1,0); }
if (bbmaxy >= (yprd-Cutneigh) && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,-1,0); }
}
if (Nghost + 7 >= NmaxGhost) {
growPbc(atom);
border_map = atom->border_map;
}
MD_FLOAT bbminx = atom->clusters[ci].bbminx;
MD_FLOAT bbmaxx = atom->clusters[ci].bbmaxx;
MD_FLOAT bbminy = atom->clusters[ci].bbminy;
MD_FLOAT bbmaxy = atom->clusters[ci].bbmaxy;
MD_FLOAT bbminz = atom->clusters[ci].bbminz;
MD_FLOAT bbmaxz = atom->clusters[ci].bbmaxz;
/* Setup ghost atoms */
/* 6 planes */
if (bbminx < Cutneigh) { ADDGHOST(+1,0,0); }
if (bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,0,0); }
if (bbminy < Cutneigh) { ADDGHOST(0,+1,0); }
if (bbmaxy >= (yprd-Cutneigh)) { ADDGHOST(0,-1,0); }
if (bbminz < Cutneigh) { ADDGHOST(0,0,+1); }
if (bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,0,-1); }
/* 8 corners */
if (bbminx < Cutneigh && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,+1,+1); }
if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(+1,-1,+1); }
if (bbminx < Cutneigh && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); }
if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,-1,-1); }
if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(-1,+1,+1); }
if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,-1,+1); }
if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,+1,-1); }
if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,-1,-1); }
/* 12 edges */
if (bbminx < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,0,+1); }
if (bbminx < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,0,-1); }
if (bbmaxx >= (xprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,0,+1); }
if (bbmaxx >= (xprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,0,-1); }
if (bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(0,+1,+1); }
if (bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,+1,-1); }
if (bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(0,-1,+1); }
if (bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,-1,-1); }
if (bbminy < Cutneigh && bbminx < Cutneigh) { ADDGHOST(+1,+1,0); }
if (bbminy < Cutneigh && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,+1,0); }
if (bbmaxy >= (yprd-Cutneigh) && bbminx < Cutneigh) { ADDGHOST(+1,-1,0); }
if (bbmaxy >= (yprd-Cutneigh) && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,-1,0); }
}
if(atom->Nclusters_local + Nghost + 1 >= atom->Nclusters_max) {
if(ncj + (Nghost + 1) * jfac >= atom->Nclusters_max) {
growClusters(atom);
}
// Add dummy cluster at the end
MD_FLOAT *cptr = cluster_pos_ptr(atom->Nclusters_local + Nghost + 1);
for(int cii = 0; cii < CLUSTER_DIM_M; cii++) {
cluster_x(cptr, cii) = INFINITY;
cluster_y(cptr, cii) = INFINITY;
cluster_z(cptr, cii) = INFINITY;
int cj_vec_base = CJ_VECTOR_BASE_INDEX(ncj + Nghost + 1);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
for(int cjj = 0; cjj < CLUSTER_N; cjj++) {
cj_x[CL_X_OFFSET + cjj] = INFINITY;
cj_x[CL_Y_OFFSET + cjj] = INFINITY;
cj_x[CL_Z_OFFSET + cjj] = INFINITY;
}
// Nghost is a zero-based index, so increase it by one to get the ghost cluster count
atom->dummy_cj = ncj + Nghost + 1;
atom->Nghost = Nghost_atoms;
atom->Nclusters_ghost = Nghost + 1;
atom->Nclusters = atom->Nclusters_local + Nghost + 1;
// Update created ghost clusters positions
updatePbc(atom, param, 1);
DEBUG_MESSAGE("setupPbc end\n");
}
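// The dummy cluster appended above contains only INFINITY coordinates, so any
// neighbor-list entry pointing at atom->dummy_cj fails every cutoff test;
// pruneNeighbor() relies on it to pad neighbor counts to a SIMD-friendly
// multiple without changing the computed forces.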

View File

@ -18,11 +18,11 @@ void initStats(Stats *s) {
void displayStatistics(Atom *atom, Parameter *param, Stats *stats, double *timer) {
#ifdef COMPUTE_STATS
const int MxN = CLUSTER_DIM_M * CLUSTER_DIM_N;
const int MxN = CLUSTER_M * CLUSTER_N;
double avg_atoms_cluster = (double)(atom->Nlocal) / (double)(atom->Nclusters_local);
double force_useful_volume = 1e-9 * ( (double)(atom->Nlocal * (param->ntimes + 1)) * (sizeof(MD_FLOAT) * 6 + sizeof(int)) +
(double)(stats->num_neighs) * (sizeof(MD_FLOAT) * 3 + sizeof(int)) );
double avg_neigh_atom = (stats->num_neighs * CLUSTER_DIM_N) / (double)(atom->Nlocal * (param->ntimes + 1));
double avg_neigh_atom = (stats->num_neighs * CLUSTER_N) / (double)(atom->Nlocal * (param->ntimes + 1));
double avg_neigh_cluster = (double)(stats->num_neighs) / (double)(stats->calculated_forces);
double avg_simd = stats->force_iters / (double)(atom->Nlocal * (param->ntimes + 1));
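// Reading of the metrics above: num_neighs counts i-cluster/j-cluster pairs, so
// scaling by CLUSTER_N converts it into per-atom neighbor candidates, while MxN
// is the number of atom pairs evaluated for each cluster-pair interaction.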

View File

@ -27,9 +27,10 @@ int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep
fprintf(fp, "DATASET UNSTRUCTURED_GRID\n");
fprintf(fp, "POINTS %d double\n", atom->Nlocal);
for(int ci = 0; ci < atom->Nclusters_local; ++ci) {
MD_FLOAT *cptr = cluster_pos_ptr(ci);
for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii));
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
}
}
fprintf(fp, "\n\n");
@ -70,9 +71,10 @@ int write_ghost_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep
fprintf(fp, "DATASET UNSTRUCTURED_GRID\n");
fprintf(fp, "POINTS %d double\n", atom->Nghost);
for(int ci = atom->Nclusters_local; ci < atom->Nclusters_local + atom->Nclusters_ghost; ++ci) {
MD_FLOAT *cptr = cluster_pos_ptr(ci);
for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii));
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
}
}
fprintf(fp, "\n\n");
@ -116,18 +118,19 @@ int write_local_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int
fprintf(fp, "DATASET POLYDATA\n");
fprintf(fp, "POINTS %d double\n", atom->Nlocal);
for(int ci = 0; ci < N; ++ci) {
MD_FLOAT *cptr = cluster_pos_ptr(ci);
for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii));
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
}
tot_lines += atom->clusters[ci].natoms;
tot_lines += atom->iclusters[ci].natoms;
}
fprintf(fp, "\n\n");
fprintf(fp, "LINES %d %d\n", N, N + tot_lines);
for(int ci = 0; ci < N; ++ci) {
fprintf(fp, "%d ", atom->clusters[ci].natoms);
for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) {
fprintf(fp, "%d ", atom->iclusters[ci].natoms);
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
fprintf(fp, "%d ", i++);
}
@ -157,18 +160,19 @@ int write_ghost_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int
fprintf(fp, "DATASET POLYDATA\n");
fprintf(fp, "POINTS %d double\n", atom->Nghost);
for(int ci = atom->Nclusters_local; ci < N; ++ci) {
MD_FLOAT *cptr = cluster_pos_ptr(ci);
for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii));
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
}
tot_lines += atom->clusters[ci].natoms;
tot_lines += atom->iclusters[ci].natoms;
}
fprintf(fp, "\n\n");
fprintf(fp, "LINES %d %d\n", atom->Nclusters_ghost, atom->Nclusters_ghost + tot_lines);
for(int ci = atom->Nclusters_local; ci < N; ++ci) {
fprintf(fp, "%d ", atom->clusters[ci].natoms);
for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) {
fprintf(fp, "%d ", atom->iclusters[ci].natoms);
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
fprintf(fp, "%d ", i++);
}

View File

@ -52,11 +52,12 @@ void xtc_init(const char *filename, Atom *atom, int timestep) {
void xtc_write(Atom *atom, int timestep, int write_pos, int write_vel) {
int i = 0;
for(int ci = 0; ci < atom->Nclusters_local; ++ci) {
MD_FLOAT *cptr = cluster_pos_ptr(ci);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) {
x_buf[i][XX] = cluster_x(cptr, cii);
x_buf[i][YY] = cluster_y(cptr, cii);
x_buf[i][ZZ] = cluster_z(cptr, cii);
x_buf[i][XX] = ci_x[CL_X_OFFSET + cii];
x_buf[i][YY] = ci_x[CL_Y_OFFSET + cii];
x_buf[i][ZZ] = ci_x[CL_Z_OFFSET + cii];
i++;
}
}

18
include_ISA.mk Normal file
View File

@ -0,0 +1,18 @@
ifeq ($(strip $(ISA)), SSE)
_VECTOR_WIDTH=2
else ifeq ($(strip $(ISA)), AVX)
# Vector width is 4 but AVX2 instruction set is not supported
NO_AVX2=true
_VECTOR_WIDTH=4
else ifeq ($(strip $(ISA)), AVX2)
_VECTOR_WIDTH=4
else ifeq ($(strip $(ISA)), AVX512)
AVX512=true
_VECTOR_WIDTH=8
endif
ifeq ($(strip $(DATA_TYPE)), SP)
VECTOR_WIDTH=$(shell echo $$(( $(_VECTOR_WIDTH) * 2 )))
else
VECTOR_WIDTH=$(_VECTOR_WIDTH)
endif
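# Example: ISA=AVX512 selects _VECTOR_WIDTH=8 (doubles per 512-bit register);
# with DATA_TYPE=SP the width is doubled, giving VECTOR_WIDTH=16 single-precision lanes.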

View File

@ -217,8 +217,6 @@ int main(int argc, const char *argv[]) {
if(!csv) {
printf("Number of timesteps: %d\n", param.ntimes);
printf("Number of times to compute the atoms loop: %d\n", ATOMS_LOOP_RUNS);
printf("Number of times to compute the neighbors loop: %d\n", NEIGHBORS_LOOP_RUNS);
printf("System size (unit cells): %dx%dx%d\n", param.nx, param.ny, param.nz);
printf("Atoms per unit cell: %d\n", atoms_per_unit_cell);
printf("Total number of atoms: %d\n", atom->Nlocal);
@ -257,9 +255,8 @@ int main(int argc, const char *argv[]) {
E = getTimeStamp();
double T_accum = E-S;
double freq_hz = param.proc_freq * 1.e9;
const double repeats = ATOMS_LOOP_RUNS * NEIGHBORS_LOOP_RUNS;
const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes * repeats);
const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes * repeats) * freq_hz;
const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes);
const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes) * freq_hz;
const double cycles_per_neigh = cycles_per_atom / (double)(atoms_per_unit_cell - 1);
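// Worked example with illustrative numbers: Nlocal = 100000, ntimes = 1000 and
// T_accum = 10 s at proc_freq = 2.5 (GHz) give
//   atoms_updates_per_sec = 1e5 / 10 * 1e3         = 1e7
//   cycles_per_atom       = 10 / 1e5 / 1e3 * 2.5e9 = 250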
if(!csv) {