From c7360305c8bffef44672110a46926fe25b9d2d8e Mon Sep 17 00:00:00 2001 From: Rafael Ravedutti Date: Wed, 9 Mar 2022 02:25:39 +0100 Subject: [PATCH 1/7] Add first draft version of GROMACS method separating i-clusters and j-clusters Signed-off-by: Rafael Ravedutti --- Makefile | 1 + config.mk | 6 +- gromacs/force_lj.c | 10 +- gromacs/includes/atom.h | 79 +++++-- gromacs/includes/simd.h | 6 +- gromacs/main-stub.c | 7 +- gromacs/neighbor.c | 460 +++++++++++++++++++++++++--------------- gromacs/pbc.c | 74 ++++--- include_ISA.mk | 15 ++ 9 files changed, 411 insertions(+), 247 deletions(-) create mode 100644 include_ISA.mk diff --git a/Makefile b/Makefile index e214557..78289dc 100644 --- a/Makefile +++ b/Makefile @@ -10,6 +10,7 @@ Q ?= @ include $(MAKE_DIR)/config.mk include $(MAKE_DIR)/include_$(TAG).mk include $(MAKE_DIR)/include_LIKWID.mk +include $(MAKE_DIR)/include_ISA.mk include $(MAKE_DIR)/include_GROMACS.mk INCLUDES += -I./$(SRC_DIR)/includes diff --git a/config.mk b/config.mk index 3e8bb88..de2bf9d 100644 --- a/config.mk +++ b/config.mk @@ -1,5 +1,7 @@ # Compiler tag (GCC/CLANG/ICC) TAG ?= ICC +# Instruction set (SSE/AVX/AVX2/AVX512) +ISA ?= AVX512 # Optimization scheme (lammps/gromacs/clusters_per_bin) OPT_SCHEME ?= gromacs # Enable likwid (true or false) @@ -23,10 +25,6 @@ EXPLICIT_TYPES ?= false MEM_TRACER ?= false # Trace indexes and distances for gather-md (true or false) INDEX_TRACER ?= false -# Vector width (elements) for index and distance tracer -VECTOR_WIDTH ?= 8 -# When vector width is 4 but AVX2 is not supported (AVX only), set this to true -NO_AVX2 ?= false # Compute statistics COMPUTE_STATS ?= true diff --git a/gromacs/force_lj.c b/gromacs/force_lj.c index 07ed0e5..04739a6 100644 --- a/gromacs/force_lj.c +++ b/gromacs/force_lj.c @@ -63,7 +63,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat int cj = neighs[k]; int any = 0; MD_FLOAT *cjptr = cluster_pos_ptr(cj); - for(int cii = 0; cii < CLUSTER_DIM_M; cii++) { + for(int cii = 0; cii < CLUSTER_M; cii++) { MD_FLOAT xtmp = cluster_x(ciptr, cii); MD_FLOAT ytmp = cluster_y(ciptr, cii); MD_FLOAT ztmp = cluster_z(ciptr, cii); @@ -71,7 +71,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_FLOAT fiy = 0; MD_FLOAT fiz = 0; - for(int cjj = 0; cjj < CLUSTER_DIM_M; cjj++) { + for(int cjj = 0; cjj < CLUSTER_M; cjj++) { if(ci != cj || cii != cjj) { MD_FLOAT delx = xtmp - cluster_x(cjptr, cjj); MD_FLOAT dely = ytmp - cluster_y(cjptr, cjj); @@ -106,7 +106,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat addStat(stats->calculated_forces, 1); addStat(stats->num_neighs, numneighs); - addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_DIM_M / CLUSTER_DIM_N)); + addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N)); } LIKWID_MARKER_STOP("force"); @@ -128,7 +128,7 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_SIMD_FLOAT epsilon_vec = simd_broadcast(epsilon); MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0); MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5); - const int unroll_j = CLUSTER_DIM_N / CLUSTER_DIM_M; + const int unroll_j = CLUSTER_N / CLUSTER_M; double S = getTimeStamp(); LIKWID_MARKER_START("force"); @@ -262,7 +262,7 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat addStat(stats->calculated_forces, 1); addStat(stats->num_neighs, numneighs); - addStat(stats->force_iters, (long long int)((double)numneighs * 
CLUSTER_DIM_M / CLUSTER_DIM_N)); + addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N)); } LIKWID_MARKER_STOP("force"); diff --git a/gromacs/includes/atom.h b/gromacs/includes/atom.h index 854d0b0..2373d6b 100644 --- a/gromacs/includes/atom.h +++ b/gromacs/includes/atom.h @@ -27,13 +27,60 @@ #define DELTA 20000 -#define CLUSTER_DIM_M 4 -#define CLUSTER_DIM_N VECTOR_WIDTH +#define CLUSTER_M 4 + +// Nbnxn layouts (as of GROMACS): +// Simd4xN: M=4, N=VECTOR_WIDTH +// Simd2xNN: M=4, N=(VECTOR_WIDTH/2) + +// Simd2XNN (here used for single-precision) +#if VECTOR_WIDTH > CLUSTER_M * 2 +# define CLUSTER_N (VECTOR_WIDTH / 2) +// Simd4xN +#else +# define CLUSTER_N VECTOR_WIDTH +#endif + +#if CLUSTER_M == CLUSTER_N +# define CJ0_FROM_CI(a) (a) +# define CJ1_FROM_CI(a) (a) +# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b)) +# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b)) +#elif CLUSTER_M == CLUSTER_N * 2 // M > N +# define CJ0_FROM_CI(a) ((a) << 1) +# define CJ1_FROM_CI(a) (((a) << 1) | 0x1) +# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_M * (b)) +# define CJ_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_M * (b) + ((a) & 0x1) * (CLUSTER_M >> 1)) +#elif CLUSTER_M == CLUSTER_N / 2 // M < N +# define CJ0_FROM_CI(a) ((a) >> 1) +# define CJ1_FROM_CI(a) ((a) >> 1) +# define CI_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_N * (b) + ((a) & 0x1) * (CLUSTER_N >> 1)) +# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b)) +#else +# error "Invalid cluster configuration!" +#endif + +#if CLUSTER_N != 2 && CLUSTER_N != 4 && CLUSTER_N != 8 +# error "Cluster N dimension can be only 2, 4 and 8" +#endif + +#define CI_SCALAR_BASE_INDEX(a) (CI_BASE_INDEX(a, 1)) +#define CI_VECTOR_BASE_INDEX(a) (CI_BASE_INDEX(a, 3)) +#define CJ_SCALAR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 1)) +#define CJ_VECTOR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 3)) + +#if CLUSTER_M >= CLUSTER_N +# define CL_X_OFFSET (0 * CLUSTER_M) +# define CL_Y_OFFSET (1 * CLUSTER_M) +# define CL_Z_OFFSET (2 * CLUSTER_M) +#else +# define CL_X_OFFSET (0 * CLUSTER_N) +# define CL_Y_OFFSET (1 * CLUSTER_N) +# define CL_Z_OFFSET (2 * CLUSTER_N) +#endif typedef struct { - int bin; int natoms; - int type[CLUSTER_DIM_M]; MD_FLOAT bbminx, bbmaxx; MD_FLOAT bbminy, bbmaxy; MD_FLOAT bbminz, bbmaxz; @@ -44,9 +91,6 @@ typedef struct { int Nclusters, Nclusters_local, Nclusters_ghost, Nclusters_max; MD_FLOAT *x, *y, *z; MD_FLOAT *vx, *vy, *vz; - MD_FLOAT *cl_x; - MD_FLOAT *cl_v; - MD_FLOAT *cl_f; int *border_map; int *type; int ntypes; @@ -54,8 +98,13 @@ typedef struct { MD_FLOAT *sigma6; MD_FLOAT *cutforcesq; MD_FLOAT *cutneighsq; - Cluster *clusters; int *PBCx, *PBCy, *PBCz; + // Data in cluster format + MD_FLOAT *cl_x; + MD_FLOAT *cl_v; + MD_FLOAT *cl_f; + int *cl_type; + Cluster *iclusters, *jclusters; } Atom; extern void initAtom(Atom*); @@ -67,10 +116,6 @@ extern int readAtom_dmp(Atom*, Parameter*); extern void growAtom(Atom*); extern void growClusters(Atom*); -#define cluster_pos_ptr(ci) &(atom->cl_x[(ci) * CLUSTER_DIM_M * 3]) -#define cluster_velocity_ptr(ci) &(atom->cl_v[(ci) * CLUSTER_DIM_M * 3]) -#define cluster_force_ptr(ci) &(atom->cl_f[(ci) * CLUSTER_DIM_M * 3]) - #ifdef AOS #define POS_DATA_LAYOUT "AoS" #define atom_x(i) atom->x[(i) * 3 + 0] @@ -83,14 +128,4 @@ extern void growClusters(Atom*); #define atom_z(i) atom->z[i] #endif -#ifdef CLUSTER_AOS -#define cluster_x(cptr, i) cptr[(i) * 3 + 0] -#define cluster_y(cptr, i) cptr[(i) * 3 + 1] -#define cluster_z(cptr, i) cptr[(i) * 3 + 2] -#else -#define cluster_x(cptr, i) cptr[0 * CLUSTER_DIM_M + (i)] -#define 
cluster_y(cptr, i) cptr[1 * CLUSTER_DIM_M + (i)] -#define cluster_z(cptr, i) cptr[2 * CLUSTER_DIM_M + (i)] -#endif - #endif diff --git a/gromacs/includes/simd.h b/gromacs/includes/simd.h index ad86e7b..9c0a842 100644 --- a/gromacs/includes/simd.h +++ b/gromacs/includes/simd.h @@ -54,8 +54,8 @@ static MD_SIMD_FLOAT simd_load2(MD_FLOAT *c0, MD_FLOAT *c1, int d) { x = _mm512_mask_i32gather_pd(simd_zero(), simd_mask_from_u32(0x0f), vindex, c0, sizeof(double)); x = _mm512_mask_i32gather_pd(x, simd_mask_from_u32(0xf0), vindex, c1, sizeof(double)); #else - x = _mm512_load_pd(&c0[d * CLUSTER_DIM_M]); - x = _mm512_insertf64x4(x, _mm256_load_pd(&c1[d * CLUSTER_DIM_M]), 1); + x = _mm512_load_pd(&c0[d * CLUSTER_M]); + x = _mm512_insertf64x4(x, _mm256_load_pd(&c1[d * CLUSTER_M]), 1); #endif return x; } @@ -134,7 +134,7 @@ static MD_SIMD_FLOAT simd_load(MD_FLOAT *c0, int d) { __m128i vindex = _mm128_add_epi32(aos_gather_vindex, _mm128_set1_epi32(d)); x = _mm256_i32gather_pd(c0, vindex, sizeof(double)); #else - x = _mm256_load_pd(&c0[d * CLUSTER_DIM_M]); + x = _mm256_load_pd(&c0[d * CLUSTER_M]); #endif return x; } diff --git a/gromacs/main-stub.c b/gromacs/main-stub.c index 4da90db..c7596e7 100644 --- a/gromacs/main-stub.c +++ b/gromacs/main-stub.c @@ -217,8 +217,6 @@ int main(int argc, const char *argv[]) { if(!csv) { printf("Number of timesteps: %d\n", param.ntimes); - printf("Number of times to compute the atoms loop: %d\n", ATOMS_LOOP_RUNS); - printf("Number of times to compute the neighbors loop: %d\n", NEIGHBORS_LOOP_RUNS); printf("System size (unit cells): %dx%dx%d\n", param.nx, param.ny, param.nz); printf("Atoms per unit cell: %d\n", atoms_per_unit_cell); printf("Total number of atoms: %d\n", atom->Nlocal); @@ -257,9 +255,8 @@ int main(int argc, const char *argv[]) { E = getTimeStamp(); double T_accum = E-S; double freq_hz = param.proc_freq * 1.e9; - const double repeats = ATOMS_LOOP_RUNS * NEIGHBORS_LOOP_RUNS; - const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes * repeats); - const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes * repeats) * freq_hz; + const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes); + const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes) * freq_hz; const double cycles_per_neigh = cycles_per_atom / (double)(atoms_per_unit_cell - 1); if(!csv) { diff --git a/gromacs/neighbor.c b/gromacs/neighbor.c index 3d86183..1f40ef9 100644 --- a/gromacs/neighbor.c +++ b/gromacs/neighbor.c @@ -39,8 +39,9 @@ static int nbinx, nbiny; static int mbinx, mbiny; // n bins in x, y static int *bincount; static int *bins; -static int *cluster_bincount; -static int *cluster_bins; +static int *bin_nclusters; +static int *bin_clusters; +static int *icluster_bin; static int mbins; //total number of bins static int atoms_per_bin; // max atoms per bin static int clusters_per_bin; // max clusters per bin @@ -63,12 +64,12 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) { cutneigh = param->cutneigh; nmax = 0; atoms_per_bin = 8; - clusters_per_bin = (atoms_per_bin / CLUSTER_DIM_M) + 4; + clusters_per_bin = (atoms_per_bin / CLUSTER_M) + 4; stencil = NULL; bins = NULL; bincount = NULL; - cluster_bins = NULL; - cluster_bincount = NULL; + bin_clusters = NULL; + bin_nclusters = NULL; neighbor->maxneighs = 100; neighbor->numneigh = NULL; neighbor->neighbors = NULL; @@ -91,7 +92,7 @@ void setupNeighbor(Parameter *param, Atom *atom) { MD_FLOAT zlo = 0.0; 
MD_FLOAT zhi = zprd; MD_FLOAT atom_density = ((MD_FLOAT)(atom->Nlocal)) / ((xhi - xlo) * (yhi - ylo) * (zhi - zlo)); - MD_FLOAT atoms_in_cell = MAX(CLUSTER_DIM_M, CLUSTER_DIM_N); + MD_FLOAT atoms_in_cell = MAX(CLUSTER_M, CLUSTER_N); MD_FLOAT targetsizex = cbrt(atoms_in_cell / atom_density); MD_FLOAT targetsizey = cbrt(atoms_in_cell / atom_density); nbinx = MAX(1, (int)ceil((xhi - xlo) / targetsizex)); @@ -103,20 +104,16 @@ void setupNeighbor(Parameter *param, Atom *atom) { cutneighsq = cutneigh * cutneigh; coord = xlo - cutneigh - SMALL * xprd; - mbinxlo = (int) (coord * bininvx); - if (coord < 0.0) { - mbinxlo = mbinxlo - 1; - } + mbinxlo = (int)(coord * bininvx); + if(coord < 0.0) { mbinxlo = mbinxlo - 1; } coord = xhi + cutneigh + SMALL * xprd; - mbinxhi = (int) (coord * bininvx); + mbinxhi = (int)(coord * bininvx); coord = ylo - cutneigh - SMALL * yprd; - mbinylo = (int) (coord * bininvy); - if (coord < 0.0) { - mbinylo = mbinylo - 1; - } + mbinylo = (int)(coord * bininvy); + if(coord < 0.0) { mbinylo = mbinylo - 1; } coord = yhi + cutneigh + SMALL * yprd; - mbinyhi = (int) (coord * bininvy); + mbinyhi = (int)(coord * bininvy); mbinxlo = mbinxlo - 1; mbinxhi = mbinxhi + 1; @@ -126,16 +123,12 @@ void setupNeighbor(Parameter *param, Atom *atom) { mbinyhi = mbinyhi + 1; mbiny = mbinyhi - mbinylo + 1; - nextx = (int) (cutneigh * bininvx); + nextx = (int)(cutneigh * bininvx); + nexty = (int)(cutneigh * bininvy); if(nextx * binsizex < FACTOR * cutneigh) nextx++; - - nexty = (int) (cutneigh * bininvy); if(nexty * binsizey < FACTOR * cutneigh) nexty++; - if (stencil) { - free(stencil); - } - + if (stencil) { free(stencil); } stencil = (int *) malloc((2 * nexty + 1) * (2 * nextx + 1) * sizeof(int)); nstencil = 0; @@ -147,19 +140,15 @@ void setupNeighbor(Parameter *param, Atom *atom) { } } + if(bincount) { free(bincount); } + if(bins) { free(bins); } + if(bin_nclusters) { free(bin_nclusters); } + if(bin_clusters) { free(bin_clusters); } mbins = mbinx * mbiny; - - if (bincount) { free(bincount); } bincount = (int*) malloc(mbins * sizeof(int)); - - if (bins) { free(bins); } bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int)); - - if (cluster_bincount) { free(cluster_bincount); } - cluster_bincount = (int*) malloc(mbins * sizeof(int)); - - if (cluster_bins) { free(cluster_bins); } - cluster_bins = (int*) malloc(mbins * clusters_per_bin * sizeof(int)); + bin_nclusters = (int*) malloc(mbins * sizeof(int)); + bin_clusters = (int*) malloc(mbins * clusters_per_bin * sizeof(int)); /* DEBUG_MESSAGE("lo, hi = (%e, %e, %e), (%e, %e, %e)\n", xlo, ylo, zlo, xhi, yhi, zhi); @@ -171,20 +160,20 @@ void setupNeighbor(Parameter *param, Atom *atom) { } MD_FLOAT getBoundingBoxDistanceSq(Atom *atom, int ci, int cj) { - MD_FLOAT dl = atom->clusters[ci].bbminx - atom->clusters[cj].bbmaxx; - MD_FLOAT dh = atom->clusters[cj].bbminx - atom->clusters[ci].bbmaxx; + MD_FLOAT dl = atom->iclusters[ci].bbminx - atom->jclusters[cj].bbmaxx; + MD_FLOAT dh = atom->jclusters[cj].bbminx - atom->iclusters[ci].bbmaxx; MD_FLOAT dm = MAX(dl, dh); MD_FLOAT dm0 = MAX(dm, 0.0); MD_FLOAT d2 = dm0 * dm0; - dl = atom->clusters[ci].bbminy - atom->clusters[cj].bbmaxy; - dh = atom->clusters[cj].bbminy - atom->clusters[ci].bbmaxy; + dl = atom->iclusters[ci].bbminy - atom->jclusters[cj].bbmaxy; + dh = atom->jclusters[cj].bbminy - atom->iclusters[ci].bbmaxy; dm = MAX(dl, dh); dm0 = MAX(dm, 0.0); d2 += dm0 * dm0; - dl = atom->clusters[ci].bbminz - atom->clusters[cj].bbmaxz; - dh = atom->clusters[cj].bbminz - atom->clusters[ci].bbmaxz; + dl = 
atom->iclusters[ci].bbminz - atom->jclusters[cj].bbmaxz; + dh = atom->jclusters[cj].bbminz - atom->iclusters[ci].bbmaxz; dm = MAX(dl, dh); dm0 = MAX(dm, 0.0); d2 += dm0 * dm0; @@ -192,14 +181,16 @@ MD_FLOAT getBoundingBoxDistanceSq(Atom *atom, int ci, int cj) { } int atomDistanceInRange(Atom *atom, int ci, int cj, MD_FLOAT rsq) { - MD_FLOAT *ciptr = cluster_pos_ptr(ci); - MD_FLOAT *cjptr = cluster_pos_ptr(cj); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; - for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) { - for(int cjj = 0; cjj < atom->clusters[cj].natoms; cjj++) { - MD_FLOAT delx = cluster_x(ciptr, cii) - cluster_x(cjptr, cjj); - MD_FLOAT dely = cluster_y(ciptr, cii) - cluster_y(cjptr, cjj); - MD_FLOAT delz = cluster_z(ciptr, cii) - cluster_z(cjptr, cjj); + for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) { + for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) { + MD_FLOAT delx = ci_x[CL_X_OFFSET + cii] - cj_x[CL_X_OFFSET + cjj]; + MD_FLOAT dely = ci_x[CL_Y_OFFSET + cii] - cj_x[CL_Y_OFFSET + cjj]; + MD_FLOAT delz = ci_x[CL_Z_OFFSET + cii] - cj_x[CL_Z_OFFSET + cjj]; if(delx * delx + dely * dely + delz * delz < rsq) { return 1; } @@ -234,22 +225,22 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { resize = 0; for(int ci = 0; ci < atom->Nclusters_local; ci++) { - int* neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]); + int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]); int n = 0; - int ibin = atom->clusters[ci].bin; - MD_FLOAT ibb_xmin = atom->clusters[ci].bbminx; - MD_FLOAT ibb_xmax = atom->clusters[ci].bbmaxx; - MD_FLOAT ibb_ymin = atom->clusters[ci].bbminy; - MD_FLOAT ibb_ymax = atom->clusters[ci].bbmaxy; - MD_FLOAT ibb_zmin = atom->clusters[ci].bbminz; - MD_FLOAT ibb_zmax = atom->clusters[ci].bbmaxz; + int ibin = icluster_bin[ci]; + MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx; + MD_FLOAT ibb_xmax = atom->iclusters[ci].bbmaxx; + MD_FLOAT ibb_ymin = atom->iclusters[ci].bbminy; + MD_FLOAT ibb_ymax = atom->iclusters[ci].bbmaxy; + MD_FLOAT ibb_zmin = atom->iclusters[ci].bbminz; + MD_FLOAT ibb_zmax = atom->iclusters[ci].bbmaxz; for(int k = 0; k < nstencil; k++) { int jbin = ibin + stencil[k]; - int *loc_bin = &cluster_bins[jbin * clusters_per_bin]; + int *loc_bin = &bin_clusters[jbin * clusters_per_bin]; int cj, m = -1; MD_FLOAT jbb_xmin, jbb_xmax, jbb_ymin, jbb_ymax, jbb_zmin, jbb_zmax; - const int c = cluster_bincount[jbin]; + const int c = bin_nclusters[jbin]; if(c > 0) { MD_FLOAT dl, dh, dm, dm0, d_bb_sq; @@ -257,8 +248,8 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { do { m++; cj = loc_bin[m]; - jbb_zmin = atom->clusters[cj].bbminz; - jbb_zmax = atom->clusters[cj].bbmaxz; + jbb_zmin = atom->jclusters[cj].bbminz; + jbb_zmax = atom->jclusters[cj].bbmaxz; dl = ibb_zmin - jbb_zmax; dh = jbb_zmin - ibb_zmax; dm = MAX(dl, dh); @@ -266,10 +257,10 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { d_bb_sq = dm0 * dm0; } while(m + 1 < c && d_bb_sq > cutneighsq); - jbb_xmin = atom->clusters[cj].bbminx; - jbb_xmax = atom->clusters[cj].bbmaxx; - jbb_ymin = atom->clusters[cj].bbminy; - jbb_ymax = atom->clusters[cj].bbmaxy; + jbb_xmin = atom->jclusters[cj].bbminx; + jbb_xmax = atom->jclusters[cj].bbmaxx; + jbb_ymin = atom->jclusters[cj].bbminy; + jbb_ymax = atom->jclusters[cj].bbmaxy; while(m < c) { dl = ibb_zmin - jbb_zmax; @@ -303,20 +294,21 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) 
{ m++; if(m < c) { cj = loc_bin[m]; - jbb_xmin = atom->clusters[cj].bbminx; - jbb_xmax = atom->clusters[cj].bbmaxx; - jbb_ymin = atom->clusters[cj].bbminy; - jbb_ymax = atom->clusters[cj].bbmaxy; - jbb_zmin = atom->clusters[cj].bbminz; - jbb_zmax = atom->clusters[cj].bbmaxz; + jbb_xmin = atom->jclusters[cj].bbminx; + jbb_xmax = atom->jclusters[cj].bbmaxx; + jbb_ymin = atom->jclusters[cj].bbminy; + jbb_ymax = atom->jclusters[cj].bbmaxy; + jbb_zmin = atom->jclusters[cj].bbminz; + jbb_zmax = atom->jclusters[cj].bbmaxz; } } } } - if(CLUSTER_DIM_N > CLUSTER_DIM_M) { - while(n % (CLUSTER_DIM_N / CLUSTER_DIM_M)) { - neighptr[n++] = nall - 1; // Last cluster is always a dummy cluster + // Fill neighbor list with dummy values to fit vector width + if(CLUSTER_N < VECTOR_WIDTH) { + while(n % (VECTOR_WIDTH / CLUSTER_N)) { + neighptr[n++] = CJ0_FROM_CI(nall - 1); // Last cluster is always a dummy cluster } } @@ -353,7 +345,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { atom->clusters[ci].bbminz, atom->clusters[ci].bbmaxz); - for(int cii = 0; cii < CLUSTER_DIM_M; cii++) { + for(int cii = 0; cii < CLUSTER_M; cii++) { DEBUG_MESSAGE("%f, %f, %f\n", cluster_x(ciptr, cii), cluster_y(ciptr, cii), cluster_z(ciptr, cii)); } @@ -371,7 +363,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { atom->clusters[cj].bbminz, atom->clusters[cj].bbmaxz); - for(int cjj = 0; cjj < CLUSTER_DIM_M; cjj++) { + for(int cjj = 0; cjj < CLUSTER_M; cjj++) { DEBUG_MESSAGE(" %f, %f, %f\n", cluster_x(cjptr, cjj), cluster_y(cjptr, cjj), cluster_z(cjptr, cjj)); } } @@ -393,8 +385,8 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) { int k = 0; // Remove dummy clusters if necessary - if(CLUSTER_DIM_N > CLUSTER_DIM_M) { - while(neighs[numneighs - 1] == nall - 1) { + if(CLUSTER_N < VECTOR_WIDTH) { + while(neighs[numneighs - 1] == CJ0_FROM_CI(nall - 1)) { numneighs--; } } @@ -410,9 +402,9 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) { } // Readd dummy clusters if necessary - if(CLUSTER_DIM_N > CLUSTER_DIM_M) { - while(numneighs % (CLUSTER_DIM_N / CLUSTER_DIM_M)) { - neighs[numneighs++] = nall - 1; // Last cluster is always a dummy cluster + if(CLUSTER_N < VECTOR_WIDTH) { + while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) { + neighs[numneighs++] = CJ0_FROM_CI(nall - 1); // Last cluster is always a dummy cluster } } @@ -558,34 +550,37 @@ void buildClusters(Atom *atom) { for(int bin = 0; bin < mbins; bin++) { int c = bincount[bin]; int ac = 0; - const int nclusters = ((c + CLUSTER_DIM_M - 1) / CLUSTER_DIM_M); + int nclusters = ((c + CLUSTER_M - 1) / CLUSTER_M); + if(CLUSTER_N > CLUSTER_M && nclusters % 2) { nclusters++; } for(int cl = 0; cl < nclusters; cl++) { const int ci = atom->Nclusters_local; - if(ci >= atom->Nclusters_max) { growClusters(atom); } - MD_FLOAT *cptr = cluster_pos_ptr(ci); - MD_FLOAT *cvptr = cluster_velocity_ptr(ci); + int ci_sca_base = CI_SCALAR_BASE_INDEX(ci); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base]; + int *ci_type = &atom->cl_type[ci_sca_base]; MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY; MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY; MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY; - atom->clusters[ci].natoms = 0; - for(int cii = 0; cii < CLUSTER_DIM_M; cii++) { + atom->iclusters[ci].natoms = 0; + for(int cii = 0; cii < CLUSTER_M; cii++) { if(ac < c) { int i = bins[bin * atoms_per_bin + ac]; MD_FLOAT xtmp = atom_x(i); MD_FLOAT ytmp = atom_y(i); MD_FLOAT ztmp = 
atom_z(i);
-                    cluster_x(cptr, cii) = xtmp;
-                    cluster_y(cptr, cii) = ytmp;
-                    cluster_z(cptr, cii) = ztmp;
-                    cluster_x(cvptr, cii) = atom->vx[i];
-                    cluster_y(cvptr, cii) = atom->vy[i];
-                    cluster_z(cvptr, cii) = atom->vz[i];
+                    ci_x[CL_X_OFFSET + cii] = xtmp;
+                    ci_x[CL_Y_OFFSET + cii] = ytmp;
+                    ci_x[CL_Z_OFFSET + cii] = ztmp;
+                    ci_v[CL_X_OFFSET + cii] = atom->vx[i];
+                    ci_v[CL_Y_OFFSET + cii] = atom->vy[i];
+                    ci_v[CL_Z_OFFSET + cii] = atom->vz[i];

                     // TODO: To create the bounding boxes faster, we can use SIMD operations
                     if(bbminx > xtmp) { bbminx = xtmp; }
@@ -595,24 +590,24 @@ void buildClusters(Atom *atom) {
                     if(bbminz > ztmp) { bbminz = ztmp; }
                     if(bbmaxz < ztmp) { bbmaxz = ztmp; }

-                    atom->clusters[ci].type[cii] = atom->type[i];
-                    atom->clusters[ci].natoms++;
+                    ci_type[cii] = atom->type[i];
+                    atom->iclusters[ci].natoms++;
                 } else {
-                    cluster_x(cptr, cii) = INFINITY;
-                    cluster_y(cptr, cii) = INFINITY;
-                    cluster_z(cptr, cii) = INFINITY;
+                    ci_x[CL_X_OFFSET + cii] = INFINITY;
+                    ci_x[CL_Y_OFFSET + cii] = INFINITY;
+                    ci_x[CL_Z_OFFSET + cii] = INFINITY;
                 }

                 ac++;
             }

-            atom->clusters[ci].bin = bin;
-            atom->clusters[ci].bbminx = bbminx;
-            atom->clusters[ci].bbmaxx = bbmaxx;
-            atom->clusters[ci].bbminy = bbminy;
-            atom->clusters[ci].bbmaxy = bbmaxy;
-            atom->clusters[ci].bbminz = bbminz;
-            atom->clusters[ci].bbmaxz = bbmaxz;
+            icluster_bin[ci] = bin;
+            atom->iclusters[ci].bbminx = bbminx;
+            atom->iclusters[ci].bbmaxx = bbmaxx;
+            atom->iclusters[ci].bbminy = bbminy;
+            atom->iclusters[ci].bbmaxy = bbmaxy;
+            atom->iclusters[ci].bbminz = bbminz;
+            atom->iclusters[ci].bbmaxz = bbmaxz;
             atom->Nclusters_local++;
         }
     }
@@ -620,6 +615,93 @@ void buildClusters(Atom *atom) {
     DEBUG_MESSAGE("buildClusters end\n");
 }

+void defineJClusters(Atom *atom) {
+    DEBUG_MESSAGE("defineJClusters start\n");
+
+    for(int ci = 0; ci < atom->Nclusters_local + atom->Nclusters_ghost; ci++) {
+        int cj0 = CJ0_FROM_CI(ci);
+
+        if(CLUSTER_M == CLUSTER_N) {
+            atom->jclusters[cj0].bbminx = atom->iclusters[ci].bbminx;
+            atom->jclusters[cj0].bbmaxx = atom->iclusters[ci].bbmaxx;
+            atom->jclusters[cj0].bbminy = atom->iclusters[ci].bbminy;
+            atom->jclusters[cj0].bbmaxy = atom->iclusters[ci].bbmaxy;
+            atom->jclusters[cj0].bbminz = atom->iclusters[ci].bbminz;
+            atom->jclusters[cj0].bbmaxz = atom->iclusters[ci].bbmaxz;
+            atom->jclusters[cj0].natoms = atom->iclusters[ci].natoms;
+
+        } else if(CLUSTER_M > CLUSTER_N) {
+            int cj1 = CJ1_FROM_CI(ci);
+            int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
+            MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
+            MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
+            MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
+            MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;
+
+            // cj0 covers the first MIN(natoms, CLUSTER_N) atoms of the i-cluster
+            for(int cii = 0; cii < MIN(atom->iclusters[ci].natoms, CLUSTER_N); cii++) {
+                MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii];
+                MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii];
+                MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii];
+
+                // TODO: To create the bounding boxes faster, we can use SIMD operations
+                if(bbminx > xtmp) { bbminx = xtmp; }
+                if(bbmaxx < xtmp) { bbmaxx = xtmp; }
+                if(bbminy > ytmp) { bbminy = ytmp; }
+                if(bbmaxy < ytmp) { bbmaxy = ytmp; }
+                if(bbminz > ztmp) { bbminz = ztmp; }
+                if(bbmaxz < ztmp) { bbmaxz = ztmp; }
+            }

+            atom->jclusters[cj0].bbminx = bbminx;
+            atom->jclusters[cj0].bbmaxx = bbmaxx;
+            atom->jclusters[cj0].bbminy = bbminy;
+            atom->jclusters[cj0].bbmaxy = bbmaxy;
+            atom->jclusters[cj0].bbminz = bbminz;
+            atom->jclusters[cj0].bbmaxz = bbmaxz;
+            atom->jclusters[cj0].natoms = MIN(atom->iclusters[ci].natoms, CLUSTER_N);
+
+            bbminx = INFINITY, bbmaxx = -INFINITY;
+            
bbminy = INFINITY, bbmaxy = -INFINITY;
+            bbminz = INFINITY, bbmaxz = -INFINITY;
+
+            for(int cii = CLUSTER_N; cii < atom->iclusters[ci].natoms; cii++) {
+                MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii];
+                MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii];
+                MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii];
+
+                // TODO: To create the bounding boxes faster, we can use SIMD operations
+                if(bbminx > xtmp) { bbminx = xtmp; }
+                if(bbmaxx < xtmp) { bbmaxx = xtmp; }
+                if(bbminy > ytmp) { bbminy = ytmp; }
+                if(bbmaxy < ytmp) { bbmaxy = ytmp; }
+                if(bbminz > ztmp) { bbminz = ztmp; }
+                if(bbmaxz < ztmp) { bbmaxz = ztmp; }
+            }
+
+            atom->jclusters[cj1].bbminx = bbminx;
+            atom->jclusters[cj1].bbmaxx = bbmaxx;
+            atom->jclusters[cj1].bbminy = bbminy;
+            atom->jclusters[cj1].bbmaxy = bbmaxy;
+            atom->jclusters[cj1].bbminz = bbminz;
+            atom->jclusters[cj1].bbmaxz = bbmaxz;
+            atom->jclusters[cj1].natoms = MAX(0, atom->iclusters[ci].natoms - CLUSTER_N);
+
+        } else {
+            if(ci % 2 == 0) {
+                atom->jclusters[cj0].bbminx = MIN(atom->iclusters[ci].bbminx, atom->iclusters[ci + 1].bbminx);
+                atom->jclusters[cj0].bbmaxx = MAX(atom->iclusters[ci].bbmaxx, atom->iclusters[ci + 1].bbmaxx);
+                atom->jclusters[cj0].bbminy = MIN(atom->iclusters[ci].bbminy, atom->iclusters[ci + 1].bbminy);
+                atom->jclusters[cj0].bbmaxy = MAX(atom->iclusters[ci].bbmaxy, atom->iclusters[ci + 1].bbmaxy);
+                atom->jclusters[cj0].bbminz = MIN(atom->iclusters[ci].bbminz, atom->iclusters[ci + 1].bbminz);
+                atom->jclusters[cj0].bbmaxz = MAX(atom->iclusters[ci].bbmaxz, atom->iclusters[ci + 1].bbmaxz);
+                atom->jclusters[cj0].natoms = atom->iclusters[ci].natoms + atom->iclusters[ci + 1].natoms;
+            }
+        }
+    }
+
+    DEBUG_MESSAGE("defineJClusters end\n");
+}
+
 void binClusters(Atom *atom) {
     DEBUG_MESSAGE("binClusters start\n");
@@ -629,7 +711,7 @@ void binClusters(Atom *atom) {
         MD_FLOAT *cptr = cluster_pos_ptr(ci);
         DEBUG_MESSAGE("Cluster %d:\n", ci);
         DEBUG_MESSAGE("bin=%d, Natoms=%d, bbox={%f,%f},{%f,%f},{%f,%f}\n",
-            atom->clusters[ci].bin,
+            icluster_bin[ci],
             atom->clusters[ci].natoms,
             atom->clusters[ci].bbminx, atom->clusters[ci].bbmaxx,
             atom->clusters[ci].bbminy, atom->clusters[ci].bbmaxy,
             atom->clusters[ci].bbminz, atom->clusters[ci].bbmaxz);

-        for(int cii = 0; cii < CLUSTER_DIM_M; cii++) {
+        for(int cii = 0; cii < CLUSTER_M; cii++) {
             DEBUG_MESSAGE("%f, %f, %f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii));
         }
     }
     */

     const int nlocal = atom->Nclusters_local;
-    int resize = 1;
+    const int ncji = MAX(1, CLUSTER_M / CLUSTER_N);
+    if(ncji > 2) {
+        fprintf(stderr, "binClusters(): ncji cannot be higher than 2!\n");
+    }
+    defineJClusters(atom);
+
+    int resize = 1;
     while(resize > 0) {
         resize = 0;

         for(int bin = 0; bin < mbins; bin++) {
-            cluster_bincount[bin] = 0;
+            bin_nclusters[bin] = 0;
         }

         for(int ci = 0; ci < nlocal && !resize; ci++) {
-            int bin = atom->clusters[ci].bin;
-            int c = cluster_bincount[bin];
-            if(c < clusters_per_bin) {
-                cluster_bins[bin * clusters_per_bin + c] = ci;
-                cluster_bincount[bin]++;
-            } else {
-                resize = 1;
+            // Assure we add this j-cluster only once in the bin
+            if(!(CLUSTER_M < CLUSTER_N && ci % 2)) {
+                int bin = icluster_bin[ci];
+                int c = bin_nclusters[bin];
+                if(c + 1 < clusters_per_bin) {
+                    bin_clusters[bin * clusters_per_bin + c] = CJ0_FROM_CI(ci);
+                    bin_nclusters[bin]++;
+
+                    if(CLUSTER_M > CLUSTER_N) {
+                        int cj1 = CJ1_FROM_CI(ci);
+                        if(atoms->jclusters[cj1].natoms > 0) {
+                            bin_clusters[bin * clusters_per_bin + c + 1] = cj1;
+                            bin_nclusters[bin]++;
+                        }
+                    }
+                } else {
+                    resize = 1;
+                }
             }
         }

         for(int cg = 0; cg < atom->Nclusters_ghost && !resize; 
cg++) { - const int ci = nlocal + cg; - MD_FLOAT *cptr = cluster_pos_ptr(ci); - MD_FLOAT ci_minz = atom->clusters[ci].bbminz; - MD_FLOAT xtmp, ytmp; - int ix = -1, iy = -1; + // Assure we add this j-cluster only once in the bin + if(!(CLUSTER_M < CLUSTER_N && cg % 2)) { + const int ci = nlocal + cg; + int ix = -1, iy = -1; + MD_FLOAT xtmp, ytmp; - xtmp = cluster_x(cptr, 0); - ytmp = cluster_y(cptr, 0); - coord2bin2D(xtmp, ytmp, &ix, &iy); - ix = MAX(MIN(ix, mbinx - 1), 0); - iy = MAX(MIN(iy, mbiny - 1), 0); - for(int cii = 1; cii < atom->clusters[ci].natoms; cii++) { - int nix, niy; - xtmp = cluster_x(cptr, cii); - ytmp = cluster_y(cptr, cii); - coord2bin2D(xtmp, ytmp, &nix, &niy); - nix = MAX(MIN(nix, mbinx - 1), 0); - niy = MAX(MIN(niy, mbiny - 1), 0); + for(int cji = 0; cji < ncji && !resize; cji++) { + int cj = (cji == 0) ? CJ0_FROM_CI(ci) : CJ1_FROM_CI(ci); + const int n = atom->jclusters[cj].natoms; - // Always put the cluster on the bin of its innermost atom so - // the cluster should be closer to local clusters - if(atom->PBCx[cg] > 0 && ix > nix) { ix = nix; } - if(atom->PBCx[cg] < 0 && ix < nix) { ix = nix; } - if(atom->PBCy[cg] > 0 && iy > niy) { iy = niy; } - if(atom->PBCy[cg] < 0 && iy < niy) { iy = niy; } - } + if(n > 0) { + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; + MD_FLOAT cj_minz = atom->jclusters[cj].bbminz; - int bin = iy * mbinx + ix + 1; - int c = cluster_bincount[bin]; - if(c < clusters_per_bin) { - // Insert the current ghost cluster in the bin keeping clusters - // sorted by z coordinate - int inserted = 0; - for(int i = 0; i < c; i++) { - int last_cl = cluster_bins[bin * clusters_per_bin + i]; - if(atom->clusters[last_cl].bbminz > ci_minz) { - cluster_bins[bin * clusters_per_bin + i] = ci; + xtmp = cj_x[CL_X_OFFSET + 0] + ytmp = cj_x[CL_Y_OFFSET + 0] + coord2bin2D(xtmp, ytmp, &ix, &iy); + ix = MAX(MIN(ix, mbinx - 1), 0); + iy = MAX(MIN(iy, mbiny - 1), 0); + for(int cjj = 1; cjj < n; cjj++) { + int nix, niy; + xtmp = cj_x[CL_X_OFFSET + cjj] + ytmp = cj_x[CL_Y_OFFSET + cjj] + coord2bin2D(xtmp, ytmp, &nix, &niy); + nix = MAX(MIN(nix, mbinx - 1), 0); + niy = MAX(MIN(niy, mbiny - 1), 0); - for(int j = i + 1; j <= c; j++) { - int tmp = cluster_bins[bin * clusters_per_bin + j]; - cluster_bins[bin * clusters_per_bin + j] = last_cl; - last_cl = tmp; + // Always put the cluster on the bin of its innermost atom so + // the cluster should be closer to local clusters + if(atom->PBCx[cg] > 0 && ix > nix) { ix = nix; } + if(atom->PBCx[cg] < 0 && ix < nix) { ix = nix; } + if(atom->PBCy[cg] > 0 && iy > niy) { iy = niy; } + if(atom->PBCy[cg] < 0 && iy < niy) { iy = niy; } } - inserted = 1; - break; + int bin = iy * mbinx + ix + 1; + int c = bin_nclusters[bin]; + if(c < clusters_per_bin) { + // Insert the current ghost cluster in the bin keeping clusters + // sorted by z coordinate + int inserted = 0; + for(int i = 0; i < c; i++) { + int last_cl = bin_clusters[bin * clusters_per_bin + i]; + if(atom->jclusters[last_cl].bbminz > cj_minz) { + bin_clusters[bin * clusters_per_bin + i] = cj; + + for(int j = i + 1; j <= c; j++) { + int tmp = bin_clusters[bin * clusters_per_bin + j]; + bin_clusters[bin * clusters_per_bin + j] = last_cl; + last_cl = tmp; + } + + inserted = 1; + break; + } + } + + if(!inserted) { + bin_clusters[bin * clusters_per_bin + c] = cj; + } + + icluster_bin[ci] = bin; + bin_nclusters[bin]++; + } else { + resize = 1; + } } } - - if(!inserted) { - cluster_bins[bin * clusters_per_bin + c] = ci; - } - - 
atom->clusters[ci].bin = bin; - cluster_bincount[bin]++; - } else { - resize = 1; } } if(resize) { - free(cluster_bins); + free(bin_clusters); clusters_per_bin *= 2; - cluster_bins = (int*) malloc(mbins * clusters_per_bin * sizeof(int)); + bin_clusters = (int*) malloc(mbins * clusters_per_bin * sizeof(int)); } } /* - DEBUG_MESSAGE("cluster_bincount\n"); - for(int i = 0; i < mbins; i++) { DEBUG_MESSAGE("%d, ", cluster_bincount[i]); } + DEBUG_MESSAGE("bin_nclusters\n"); + for(int i = 0; i < mbins; i++) { DEBUG_MESSAGE("%d, ", bin_nclusters[i]); } DEBUG_MESSAGE("\n"); */ @@ -747,16 +858,17 @@ void updateSingleAtoms(Atom *atom) { int Natom = 0; for(int ci = 0; ci < atom->Nclusters_local; ci++) { - MD_FLOAT *cptr = cluster_pos_ptr(ci); - MD_FLOAT *cvptr = cluster_velocity_ptr(ci); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base]; for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) { - atom_x(Natom) = cluster_x(cptr, cii); - atom_y(Natom) = cluster_y(cptr, cii); - atom_z(Natom) = cluster_z(cptr, cii); - atom->vx[Natom] = cluster_x(cvptr, cii); - atom->vy[Natom] = cluster_y(cvptr, cii); - atom->vz[Natom] = cluster_z(cvptr, cii); + atom_x(Natom) = ci_x[CL_X_OFFSET + cii]; + atom_y(Natom) = ci_x[CL_Y_OFFSET + cii]; + atom_z(Natom) = ci_x[CL_Z_OFFSET + cii]; + atom->vx[Natom] = ci_v[CL_X_OFFSET + cii]; + atom->vy[Natom] = ci_v[CL_Y_OFFSET + cii]; + atom->vz[Natom] = ci_v[CL_Z_OFFSET + cii]; Natom++; } } diff --git a/gromacs/pbc.c b/gromacs/pbc.c index acb203e..9632e36 100644 --- a/gromacs/pbc.c +++ b/gromacs/pbc.c @@ -53,20 +53,22 @@ void updatePbc(Atom *atom, Parameter *param, int firstUpdate) { for(int cg = 0; cg < atom->Nclusters_ghost; cg++) { const int ci = nlocal + cg; - MD_FLOAT *cptr = cluster_pos_ptr(ci); - MD_FLOAT *bmap_cptr = cluster_pos_ptr(border_map[cg]); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + int bmap_vec_base = CI_VECTOR_BASE_INDEX(border_map[cg]); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + MD_FLOAT *bmap_x = &atom->cl_x[bmap_vec_base]; MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY; MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY; MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY; for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) { - MD_FLOAT xtmp = cluster_x(bmap_cptr, cii) + atom->PBCx[cg] * xprd; - MD_FLOAT ytmp = cluster_y(bmap_cptr, cii) + atom->PBCy[cg] * yprd; - MD_FLOAT ztmp = cluster_z(bmap_cptr, cii) + atom->PBCz[cg] * zprd; + MD_FLOAT xtmp = bmap_x[CL_X_OFFSET + cii] + atom->PBCx[cg] * xprd; + MD_FLOAT ytmp = bmap_x[CL_Y_OFFSET + cii] + atom->PBCy[cg] * yprd; + MD_FLOAT ztmp = bmap_x[CL_Z_OFFSET + cii] + atom->PBCz[cg] * zprd; - cluster_x(cptr, cii) = xtmp; - cluster_y(cptr, cii) = ytmp; - cluster_z(cptr, cii) = ztmp; + ci_x[CL_X_OFFSET + cii] = xtmp; + ci_x[CL_Y_OFFSET + cii] = ytmp; + ci_x[CL_Z_OFFSET + cii] = ztmp; if(firstUpdate) { // TODO: To create the bounding boxes faster, we can use SIMD operations @@ -80,18 +82,18 @@ void updatePbc(Atom *atom, Parameter *param, int firstUpdate) { } if(firstUpdate) { - for(int cii = atom->clusters[ci].natoms; cii < CLUSTER_DIM_M; cii++) { - cluster_x(cptr, cii) = INFINITY; - cluster_y(cptr, cii) = INFINITY; - cluster_z(cptr, cii) = INFINITY; + for(int cii = atom->clusters[ci].natoms; cii < CLUSTER_M; cii++) { + ci_x[CL_X_OFFSET + cii] = INFINITY; + ci_x[CL_Y_OFFSET + cii] = INFINITY; + ci_x[CL_Z_OFFSET + cii] = INFINITY; } - atom->clusters[ci].bbminx = bbminx; - atom->clusters[ci].bbmaxx = bbmaxx; - 
atom->clusters[ci].bbminy = bbminy;
-            atom->clusters[ci].bbmaxy = bbmaxy;
-            atom->clusters[ci].bbminz = bbminz;
-            atom->clusters[ci].bbmaxz = bbmaxz;
+            atom->iclusters[ci].bbminx = bbminx;
+            atom->iclusters[ci].bbmaxx = bbmaxx;
+            atom->iclusters[ci].bbminy = bbminy;
+            atom->iclusters[ci].bbmaxy = bbmaxy;
+            atom->iclusters[ci].bbminz = bbminz;
+            atom->iclusters[ci].bbmaxz = bbmaxz;
         }
     }
 }
@@ -131,14 +133,17 @@ void updateAtomsPbc(Atom *atom, Parameter *param) {
  * that are then enforced in updatePbc */
 #define ADDGHOST(dx,dy,dz); \
     Nghost++; \
     const int g_atom_idx = atom->Nclusters_local + Nghost; \
+    const int natoms_ci = atom->iclusters[ci].natoms; \
     border_map[Nghost] = ci; \
     atom->PBCx[Nghost] = dx; \
     atom->PBCy[Nghost] = dy; \
     atom->PBCz[Nghost] = dz; \
-    atom->clusters[g_atom_idx].natoms = atom->clusters[ci].natoms; \
-    Nghost_atoms += atom->clusters[g_atom_idx].natoms; \
-    for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) { \
-        atom->clusters[g_atom_idx].type[cii] = atom->clusters[ci].type[cii]; \
+    atom->iclusters[g_atom_idx].natoms = natoms_ci; \
+    Nghost_atoms += natoms_ci; \
+    int ci_sca_base = CI_SCALAR_BASE_INDEX(ci); \
+    int cg_sca_base = CI_SCALAR_BASE_INDEX(cg); \
+    for(int cii = 0; cii < natoms_ci; cii++) { \
+        atom->cl_type[cg_sca_base + cii] = atom->cl_type[ci_sca_base + cii]; \
     }

 /* internal subroutines */
@@ -171,12 +176,12 @@ void setupPbc(Atom *atom, Parameter *param) {
             border_map = atom->border_map;
         }

-        MD_FLOAT bbminx = atom->clusters[ci].bbminx;
-        MD_FLOAT bbmaxx = atom->clusters[ci].bbmaxx;
-        MD_FLOAT bbminy = atom->clusters[ci].bbminy;
-        MD_FLOAT bbmaxy = atom->clusters[ci].bbmaxy;
-        MD_FLOAT bbminz = atom->clusters[ci].bbminz;
-        MD_FLOAT bbmaxz = atom->clusters[ci].bbmaxz;
+        MD_FLOAT bbminx = atom->iclusters[ci].bbminx;
+        MD_FLOAT bbmaxx = atom->iclusters[ci].bbmaxx;
+        MD_FLOAT bbminy = atom->iclusters[ci].bbminy;
+        MD_FLOAT bbmaxy = atom->iclusters[ci].bbmaxy;
+        MD_FLOAT bbminz = atom->iclusters[ci].bbminz;
+        MD_FLOAT bbmaxz = atom->iclusters[ci].bbmaxz;

         /* Setup ghost atoms */
         /* 6 planes */
@@ -210,16 +215,17 @@ void setupPbc(Atom *atom, Parameter *param) {
         if (bbmaxy >= (yprd-Cutneigh) && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,-1,0); }
     }

-    if(atom->Nclusters_local + Nghost + 1 >= atom->Nclusters_max) {
+    if(atom->Nclusters_local + Nghost + 2 >= atom->Nclusters_max) {
         growClusters(atom);
     }

     // Add dummy cluster at the end
-    MD_FLOAT *cptr = cluster_pos_ptr(atom->Nclusters_local + Nghost + 1);
-    for(int cii = 0; cii < CLUSTER_DIM_M; cii++) {
-        cluster_x(cptr, cii) = INFINITY;
-        cluster_y(cptr, cii) = INFINITY;
-        cluster_z(cptr, cii) = INFINITY;
+    int ci_vec_base = CI_VECTOR_BASE_INDEX(atom->Nclusters_local + Nghost + 1);
+    MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
+    for(int cii = 0; cii < MAX(CLUSTER_M, CLUSTER_N); cii++) {
+        ci_x[CL_X_OFFSET + cii] = INFINITY;
+        ci_x[CL_Y_OFFSET + cii] = INFINITY;
+        ci_x[CL_Z_OFFSET + cii] = INFINITY;
     }

     // increase by one to make it the ghost atom count
diff --git a/include_ISA.mk b/include_ISA.mk
new file mode 100644
index 0000000..215f626
--- /dev/null
+++ b/include_ISA.mk
@@ -0,0 +1,15 @@
+ifeq ($(strip $(ISA)), SSE)
+VECTOR_WIDTH=2
+else ifeq ($(strip $(ISA)), AVX)
+# Vector width is 4 but AVX2 instruction set is not supported
+NO_AVX2=true
+VECTOR_WIDTH=4
+else ifeq ($(strip $(ISA)), AVX2)
+VECTOR_WIDTH=4
+else ifeq ($(strip $(ISA)), AVX512)
+VECTOR_WIDTH=8
+endif

+ifeq ($(strip $(DATA_TYPE)), SP)
+VECTOR_WIDTH:=$(shell echo $$(($(VECTOR_WIDTH) * 2)))
+endif

From 2b441e691e929d706d60a24de4960237e7e39a49 Mon Sep 17 00:00:00 2001
From: Rafael Ravedutti
Date: Wed, 9 Mar 
2022 17:23:49 +0100 Subject: [PATCH 2/7] Make code compilable Signed-off-by: Rafael Ravedutti --- gromacs/atom.c | 13 +- gromacs/force_lj.c | 258 ++++++++++++++++++++++++++++++++-------- gromacs/includes/simd.h | 30 +---- gromacs/main.c | 34 +++--- gromacs/neighbor.c | 12 +- gromacs/pbc.c | 9 +- gromacs/stats.c | 4 +- gromacs/vtk.c | 40 ++++--- gromacs/xtc.c | 9 +- 9 files changed, 276 insertions(+), 133 deletions(-) diff --git a/gromacs/atom.c b/gromacs/atom.c index 8c11da9..024363f 100644 --- a/gromacs/atom.c +++ b/gromacs/atom.c @@ -51,7 +51,8 @@ void initAtom(Atom *atom) { atom->sigma6 = NULL; atom->cutforcesq = NULL; atom->cutneighsq = NULL; - atom->clusters = NULL; + atom->iclusters = NULL; + atom->jclusters = NULL; } void createAtom(Atom *atom, Parameter *param) { @@ -426,9 +427,11 @@ void growAtom(Atom *atom) { void growClusters(Atom *atom) { int nold = atom->Nclusters_max; + int jfac = MIN(1, CLUSTER_M / CLUSTER_N); // If M>N, we need to allocate more j-clusters atom->Nclusters_max += DELTA; - atom->clusters = (Cluster*) reallocate(atom->clusters, ALIGNMENT, atom->Nclusters_max * sizeof(Cluster), nold * sizeof(Cluster)); - atom->cl_x = (MD_FLOAT*) reallocate(atom->cl_x, ALIGNMENT, atom->Nclusters_max * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT)); - atom->cl_f = (MD_FLOAT*) reallocate(atom->cl_f, ALIGNMENT, atom->Nclusters_max * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT)); - atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT)); + atom->iclusters = (Cluster*) reallocate(atom->iclusters, ALIGNMENT, atom->Nclusters_max * sizeof(Cluster), nold * sizeof(Cluster)); + atom->jclusters = (Cluster*) reallocate(atom->jclusters, ALIGNMENT, atom->Nclusters_max * jfac * sizeof(Cluster), nold * jfac * sizeof(Cluster)); + atom->cl_x = (MD_FLOAT*) reallocate(atom->cl_x, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT)); + atom->cl_f = (MD_FLOAT*) reallocate(atom->cl_f, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT)); + atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT)); } diff --git a/gromacs/force_lj.c b/gromacs/force_lj.c index 04739a6..d749bb1 100644 --- a/gromacs/force_lj.c +++ b/gromacs/force_lj.c @@ -41,11 +41,12 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_FLOAT epsilon = param->epsilon; for(int ci = 0; ci < atom->Nclusters_local; ci++) { - MD_FLOAT *fptr = cluster_force_ptr(ci); - for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) { - cluster_x(fptr, cii) = 0.0; - cluster_y(fptr, cii) = 0.0; - cluster_z(fptr, cii) = 0.0; + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; + for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) { + ci_f[CL_X_OFFSET + cii] = 0.0; + ci_f[CL_Y_OFFSET + cii] = 0.0; + ci_f[CL_Z_OFFSET + cii] = 0.0; } } @@ -54,28 +55,31 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat #pragma omp parallel for for(int ci = 0; ci < atom->Nclusters_local; ci++) { - MD_FLOAT *ciptr = cluster_pos_ptr(ci); - MD_FLOAT *cifptr = cluster_force_ptr(ci); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + MD_FLOAT *ci_f = 
&atom->cl_f[ci_vec_base]; neighs = &neighbor->neighbors[ci * neighbor->maxneighs]; int numneighs = neighbor->numneigh[ci]; for(int k = 0; k < numneighs; k++) { int cj = neighs[k]; + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); int any = 0; - MD_FLOAT *cjptr = cluster_pos_ptr(cj); + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; + for(int cii = 0; cii < CLUSTER_M; cii++) { - MD_FLOAT xtmp = cluster_x(ciptr, cii); - MD_FLOAT ytmp = cluster_y(ciptr, cii); - MD_FLOAT ztmp = cluster_z(ciptr, cii); + MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii]; + MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii]; + MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii]; MD_FLOAT fix = 0; MD_FLOAT fiy = 0; MD_FLOAT fiz = 0; - for(int cjj = 0; cjj < CLUSTER_M; cjj++) { + for(int cjj = 0; cjj < CLUSTER_N; cjj++) { if(ci != cj || cii != cjj) { - MD_FLOAT delx = xtmp - cluster_x(cjptr, cjj); - MD_FLOAT dely = ytmp - cluster_y(cjptr, cjj); - MD_FLOAT delz = ztmp - cluster_z(cjptr, cjj); + MD_FLOAT delx = xtmp - cj_x[CL_X_OFFSET + cjj]; + MD_FLOAT dely = ytmp - cj_x[CL_Y_OFFSET + cjj]; + MD_FLOAT delz = ztmp - cj_x[CL_Z_OFFSET + cjj]; MD_FLOAT rsq = delx * delx + dely * dely + delz * delz; if(rsq < cutforcesq) { MD_FLOAT sr2 = 1.0 / rsq; @@ -98,9 +102,9 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat addStat(stats->clusters_outside_cutoff, 1); } - cluster_x(cifptr, cii) += fix; - cluster_y(cifptr, cii) += fiy; - cluster_z(cifptr, cii) += fiz; + ci_f[CL_X_OFFSET + cii] += fix; + ci_f[CL_Y_OFFSET + cii] += fiy; + ci_f[CL_Z_OFFSET + cii] += fiz; } } @@ -115,8 +119,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat return E-S; } - -double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { +double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { DEBUG_MESSAGE("computeForceLJ_4xn begin\n"); int Nlocal = atom->Nlocal; int* neighs; @@ -128,30 +131,31 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_SIMD_FLOAT epsilon_vec = simd_broadcast(epsilon); MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0); MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5); - const int unroll_j = CLUSTER_N / CLUSTER_M; + const int unroll_j = VECTOR_WIDTH / CLUSTER_N; double S = getTimeStamp(); LIKWID_MARKER_START("force"); #pragma omp parallel for for(int ci = 0; ci < atom->Nclusters_local; ci++) { - MD_FLOAT *ciptr = cluster_pos_ptr(ci); - MD_FLOAT *cifptr = cluster_force_ptr(ci); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; neighs = &neighbor->neighbors[ci * neighbor->maxneighs]; int numneighs = neighbor->numneigh[ci]; - MD_SIMD_FLOAT xi0_tmp = simd_broadcast(cluster_x(ciptr, 0)); - MD_SIMD_FLOAT yi0_tmp = simd_broadcast(cluster_y(ciptr, 0)); - MD_SIMD_FLOAT zi0_tmp = simd_broadcast(cluster_z(ciptr, 0)); - MD_SIMD_FLOAT xi1_tmp = simd_broadcast(cluster_x(ciptr, 1)); - MD_SIMD_FLOAT yi1_tmp = simd_broadcast(cluster_y(ciptr, 1)); - MD_SIMD_FLOAT zi1_tmp = simd_broadcast(cluster_z(ciptr, 1)); - MD_SIMD_FLOAT xi2_tmp = simd_broadcast(cluster_x(ciptr, 2)); - MD_SIMD_FLOAT yi2_tmp = simd_broadcast(cluster_y(ciptr, 2)); - MD_SIMD_FLOAT zi2_tmp = simd_broadcast(cluster_z(ciptr, 2)); - MD_SIMD_FLOAT xi3_tmp = simd_broadcast(cluster_x(ciptr, 3)); - MD_SIMD_FLOAT yi3_tmp = simd_broadcast(cluster_y(ciptr, 3)); - MD_SIMD_FLOAT zi3_tmp = simd_broadcast(cluster_z(ciptr, 3)); + MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]); + 
MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]); + MD_SIMD_FLOAT xi2_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 2]); + MD_SIMD_FLOAT xi3_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 3]); + MD_SIMD_FLOAT yi0_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 0]); + MD_SIMD_FLOAT yi1_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 1]); + MD_SIMD_FLOAT yi2_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 2]); + MD_SIMD_FLOAT yi3_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 3]); + MD_SIMD_FLOAT zi0_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 0]); + MD_SIMD_FLOAT zi1_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 1]); + MD_SIMD_FLOAT zi2_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 2]); + MD_SIMD_FLOAT zi3_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 3]); MD_SIMD_FLOAT fix0 = simd_zero(); MD_SIMD_FLOAT fiy0 = simd_zero(); MD_SIMD_FLOAT fiz0 = simd_zero(); @@ -166,10 +170,15 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_SIMD_FLOAT fiz3 = simd_zero(); for(int k = 0; k < numneighs; k += unroll_j) { - int cj0 = neighs[k + 0]; - unsigned int cond0 = (unsigned int)(ci == cj0); - MD_FLOAT *cj0_ptr = cluster_pos_ptr(cj0); + int cj = neighs[k + 0]; + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; + unsigned int cond0 = (unsigned int)(ci == cj); + unsigned int cond1 = cond0; + MD_SIMD_FLOAT xj_tmp, yj_tmp, zj_tmp; + /* + MD_FLOAT *cj0_ptr = cluster_pos_ptr(cj0); #if VECTOR_WIDTH == 8 int cj1 = neighs[k + 1]; unsigned int cond1 = (unsigned int)(ci == cj1); @@ -182,6 +191,7 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_SIMD_FLOAT yj_tmp = simd_load(cj0_ptr, 1); MD_SIMD_FLOAT zj_tmp = simd_load(cj0_ptr, 2); #endif + */ MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp); MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp); @@ -247,18 +257,166 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3); } - cluster_x(cifptr, 0) = simd_horizontal_sum(fix0); - cluster_y(cifptr, 0) = simd_horizontal_sum(fiy0); - cluster_z(cifptr, 0) = simd_horizontal_sum(fiz0); - cluster_x(cifptr, 1) = simd_horizontal_sum(fix1); - cluster_y(cifptr, 1) = simd_horizontal_sum(fiy1); - cluster_z(cifptr, 1) = simd_horizontal_sum(fiz1); - cluster_x(cifptr, 2) = simd_horizontal_sum(fix2); - cluster_y(cifptr, 2) = simd_horizontal_sum(fiy2); - cluster_z(cifptr, 2) = simd_horizontal_sum(fiz2); - cluster_x(cifptr, 3) = simd_horizontal_sum(fix3); - cluster_y(cifptr, 3) = simd_horizontal_sum(fiy3); - cluster_z(cifptr, 3) = simd_horizontal_sum(fiz3); + ci_f[CL_X_OFFSET + 0] = simd_horizontal_sum(fix0); + ci_f[CL_X_OFFSET + 1] = simd_horizontal_sum(fix1); + ci_f[CL_X_OFFSET + 2] = simd_horizontal_sum(fix2); + ci_f[CL_X_OFFSET + 3] = simd_horizontal_sum(fix3); + ci_f[CL_Y_OFFSET + 0] = simd_horizontal_sum(fiy0); + ci_f[CL_Y_OFFSET + 1] = simd_horizontal_sum(fiy1); + ci_f[CL_Y_OFFSET + 2] = simd_horizontal_sum(fiy2); + ci_f[CL_Y_OFFSET + 3] = simd_horizontal_sum(fiy3); + ci_f[CL_Z_OFFSET + 0] = simd_horizontal_sum(fiz0); + ci_f[CL_Z_OFFSET + 1] = simd_horizontal_sum(fiz1); + ci_f[CL_Z_OFFSET + 2] = simd_horizontal_sum(fiz2); + ci_f[CL_Z_OFFSET + 3] = simd_horizontal_sum(fiz3); + + addStat(stats->calculated_forces, 1); + addStat(stats->num_neighs, numneighs); + addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N)); + } + + LIKWID_MARKER_STOP("force"); + double E = getTimeStamp(); + DEBUG_MESSAGE("computeForceLJ_4xn end\n"); + return E-S; +} + +double 
computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { + DEBUG_MESSAGE("computeForceLJ_4xn begin\n"); + int Nlocal = atom->Nlocal; + int* neighs; + MD_FLOAT cutforcesq = param->cutforce * param->cutforce; + MD_FLOAT sigma6 = param->sigma6; + MD_FLOAT epsilon = param->epsilon; + MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq); + MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6); + MD_SIMD_FLOAT epsilon_vec = simd_broadcast(epsilon); + MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0); + MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5); + const int unroll_j = VECTOR_WIDTH / CLUSTER_N; + + double S = getTimeStamp(); + LIKWID_MARKER_START("force"); + + #pragma omp parallel for + for(int ci = 0; ci < atom->Nclusters_local; ci++) { + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; + neighs = &neighbor->neighbors[ci * neighbor->maxneighs]; + int numneighs = neighbor->numneigh[ci]; + + MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]); + MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]); + MD_SIMD_FLOAT xi2_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 2]); + MD_SIMD_FLOAT xi3_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 3]); + MD_SIMD_FLOAT yi0_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 0]); + MD_SIMD_FLOAT yi1_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 1]); + MD_SIMD_FLOAT yi2_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 2]); + MD_SIMD_FLOAT yi3_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 3]); + MD_SIMD_FLOAT zi0_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 0]); + MD_SIMD_FLOAT zi1_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 1]); + MD_SIMD_FLOAT zi2_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 2]); + MD_SIMD_FLOAT zi3_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 3]); + MD_SIMD_FLOAT fix0 = simd_zero(); + MD_SIMD_FLOAT fiy0 = simd_zero(); + MD_SIMD_FLOAT fiz0 = simd_zero(); + MD_SIMD_FLOAT fix1 = simd_zero(); + MD_SIMD_FLOAT fiy1 = simd_zero(); + MD_SIMD_FLOAT fiz1 = simd_zero(); + MD_SIMD_FLOAT fix2 = simd_zero(); + MD_SIMD_FLOAT fiy2 = simd_zero(); + MD_SIMD_FLOAT fiz2 = simd_zero(); + MD_SIMD_FLOAT fix3 = simd_zero(); + MD_SIMD_FLOAT fiy3 = simd_zero(); + MD_SIMD_FLOAT fiz3 = simd_zero(); + + for(int k = 0; k < numneighs; k += unroll_j) { + int cj = neighs[k + 0]; + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + unsigned int cond0 = (unsigned int)(ci == cj); + unsigned int cond1 = cond0; + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; + MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]); + MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]); + MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]); + + MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp); + MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp); + MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp); + MD_SIMD_FLOAT delx1 = simd_sub(xi1_tmp, xj_tmp); + MD_SIMD_FLOAT dely1 = simd_sub(yi1_tmp, yj_tmp); + MD_SIMD_FLOAT delz1 = simd_sub(zi1_tmp, zj_tmp); + MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp); + MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp); + MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp); + MD_SIMD_FLOAT delx3 = simd_sub(xi3_tmp, xj_tmp); + MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp); + MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp); + + #if VECTOR_WIDTH == 8 + MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1)); + MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1)); + MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 
0x40 * cond1)); + MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1)); + #else + MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0)); + MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0)); + MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0)); + MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0)); + #endif + + MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0))); + MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, simd_mul(delz1, delz1))); + MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2))); + MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, simd_mul(delz3, delz3))); + + MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec)); + MD_SIMD_MASK cutoff_mask1 = simd_mask_and(excl_mask1, simd_mask_cond_lt(rsq1, cutforcesq_vec)); + MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec)); + MD_SIMD_MASK cutoff_mask3 = simd_mask_and(excl_mask3, simd_mask_cond_lt(rsq3, cutforcesq_vec)); + + MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0); + MD_SIMD_FLOAT sr2_1 = simd_reciprocal(rsq1); + MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2); + MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3); + + MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec))); + MD_SIMD_FLOAT sr6_1 = simd_mul(sr2_1, simd_mul(sr2_1, simd_mul(sr2_1, sigma6_vec))); + MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec))); + MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec))); + + MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, epsilon_vec)))); + MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, epsilon_vec)))); + MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, epsilon_vec)))); + MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, epsilon_vec)))); + + fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0); + fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0); + fiz0 = simd_masked_add(fiz0, simd_mul(delz0, force0), cutoff_mask0); + fix1 = simd_masked_add(fix1, simd_mul(delx1, force1), cutoff_mask1); + fiy1 = simd_masked_add(fiy1, simd_mul(dely1, force1), cutoff_mask1); + fiz1 = simd_masked_add(fiz1, simd_mul(delz1, force1), cutoff_mask1); + fix2 = simd_masked_add(fix2, simd_mul(delx2, force2), cutoff_mask2); + fiy2 = simd_masked_add(fiy2, simd_mul(dely2, force2), cutoff_mask2); + fiz2 = simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2); + fix3 = simd_masked_add(fix3, simd_mul(delx3, force3), cutoff_mask3); + fiy3 = simd_masked_add(fiy3, simd_mul(dely3, force3), cutoff_mask3); + fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3); + } + + ci_f[CL_X_OFFSET + 0] = simd_horizontal_sum(fix0); + ci_f[CL_X_OFFSET + 1] = simd_horizontal_sum(fix1); + ci_f[CL_X_OFFSET + 2] = simd_horizontal_sum(fix2); + ci_f[CL_X_OFFSET + 3] = simd_horizontal_sum(fix3); + ci_f[CL_Y_OFFSET + 0] = simd_horizontal_sum(fiy0); + ci_f[CL_Y_OFFSET + 1] = simd_horizontal_sum(fiy1); + ci_f[CL_Y_OFFSET + 2] = simd_horizontal_sum(fiy2); + ci_f[CL_Y_OFFSET + 3] = 
simd_horizontal_sum(fiy3); + ci_f[CL_Z_OFFSET + 0] = simd_horizontal_sum(fiz0); + ci_f[CL_Z_OFFSET + 1] = simd_horizontal_sum(fiz1); + ci_f[CL_Z_OFFSET + 2] = simd_horizontal_sum(fiz2); + ci_f[CL_Z_OFFSET + 3] = simd_horizontal_sum(fiz3); addStat(stats->calculated_forces, 1); addStat(stats->num_neighs, numneighs); diff --git a/gromacs/includes/simd.h b/gromacs/includes/simd.h index 9c0a842..1d6a90b 100644 --- a/gromacs/includes/simd.h +++ b/gromacs/includes/simd.h @@ -45,20 +45,7 @@ static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { retur static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ); } static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); } static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); } - -static MD_SIMD_FLOAT simd_load2(MD_FLOAT *c0, MD_FLOAT *c1, int d) { - MD_SIMD_FLOAT x; -#ifdef CLUSTER_AOS - __m256i aos_gather_vindex = _mm256_set_epi32(9, 6, 3, 0, 9, 6, 3, 0); - __m256i vindex = _mm256_add_epi32(aos_gather_vindex, _mm256_set1_epi32(d)); - x = _mm512_mask_i32gather_pd(simd_zero(), simd_mask_from_u32(0x0f), vindex, c0, sizeof(double)); - x = _mm512_mask_i32gather_pd(x, simd_mask_from_u32(0xf0), vindex, c1, sizeof(double)); -#else - x = _mm512_load_pd(&c0[d * CLUSTER_M]); - x = _mm512_insertf64x4(x, _mm256_load_pd(&c1[d * CLUSTER_M]), 1); -#endif - return x; -} +static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_pd(p); } static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) { MD_SIMD_FLOAT x = _mm512_add_pd(a, _mm512_shuffle_f64x2(a, a, 0xee)); @@ -82,6 +69,7 @@ static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_pd(0.0); } static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_pd(a, b); } static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_pd(a, b); } static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_pd(a, b); } +static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_pd(p); } #ifdef NO_AVX2 static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(a))); } @@ -124,20 +112,6 @@ static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) { } #endif -static MD_SIMD_FLOAT simd_load(MD_FLOAT *c0, int d) { - MD_SIMD_FLOAT x; -#ifdef CLUSTER_AOS - #ifdef NO_AVX2 - #error "Not possible to use AoS cluster layout without AVX2 support!" 
- #endif - __m128i aos_gather_vindex = _mm128_set_epi32(9, 6, 3, 0); - __m128i vindex = _mm128_add_epi32(aos_gather_vindex, _mm128_set1_epi32(d)); - x = _mm256_i32gather_pd(c0, vindex, sizeof(double)); -#else - x = _mm256_load_pd(&c0[d * CLUSTER_M]); -#endif - return x; -} #endif diff --git a/gromacs/main.c b/gromacs/main.c index 0bc8a3c..ade8f65 100644 --- a/gromacs/main.c +++ b/gromacs/main.c @@ -101,17 +101,18 @@ void initialIntegrate(Parameter *param, Atom *atom) { DEBUG_MESSAGE("initialIntegrate start\n"); for(int ci = 0; ci < atom->Nclusters_local; ci++) { - MD_FLOAT *ciptr = cluster_pos_ptr(ci); - MD_FLOAT *civptr = cluster_velocity_ptr(ci); - MD_FLOAT *cifptr = cluster_force_ptr(ci); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base]; + MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; - for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) { - cluster_x(civptr, cii) += param->dtforce * cluster_x(cifptr, cii); - cluster_y(civptr, cii) += param->dtforce * cluster_y(cifptr, cii); - cluster_z(civptr, cii) += param->dtforce * cluster_z(cifptr, cii); - cluster_x(ciptr, cii) += param->dt * cluster_x(civptr, cii); - cluster_y(ciptr, cii) += param->dt * cluster_y(civptr, cii); - cluster_z(ciptr, cii) += param->dt * cluster_z(civptr, cii); + for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) { + ci_v[CL_X_OFFSET + cii] += param->dtforce * ci_f[CL_X_OFFSET + cii]; + ci_v[CL_Y_OFFSET + cii] += param->dtforce * ci_f[CL_Y_OFFSET + cii]; + ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii]; + ci_x[CL_X_OFFSET + cii] += param->dt * ci_v[CL_X_OFFSET + cii]; + ci_x[CL_Y_OFFSET + cii] += param->dt * ci_v[CL_Y_OFFSET + cii]; + ci_x[CL_Z_OFFSET + cii] += param->dt * ci_v[CL_Z_OFFSET + cii]; } } @@ -122,13 +123,14 @@ void finalIntegrate(Parameter *param, Atom *atom) { DEBUG_MESSAGE("finalIntegrate start\n"); for(int ci = 0; ci < atom->Nclusters_local; ci++) { - MD_FLOAT *civptr = cluster_velocity_ptr(ci); - MD_FLOAT *cifptr = cluster_force_ptr(ci); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base]; + MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; - for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) { - cluster_x(civptr, cii) += param->dtforce * cluster_x(cifptr, cii); - cluster_y(civptr, cii) += param->dtforce * cluster_y(cifptr, cii); - cluster_z(civptr, cii) += param->dtforce * cluster_z(cifptr, cii); + for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) { + ci_v[CL_X_OFFSET + cii] += param->dtforce * ci_f[CL_X_OFFSET + cii]; + ci_v[CL_Y_OFFSET + cii] += param->dtforce * ci_f[CL_Y_OFFSET + cii]; + ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii]; } } diff --git a/gromacs/neighbor.c b/gromacs/neighbor.c index 1f40ef9..557a9bb 100644 --- a/gromacs/neighbor.c +++ b/gromacs/neighbor.c @@ -753,7 +753,7 @@ void binClusters(Atom *atom) { if(CLUSTER_M > CLUSTER_N) { int cj1 = CJ1_FROM_CI(ci); - if(atoms->jclusters[cj1].natoms > 0) { + if(atom->jclusters[cj1].natoms > 0) { bin_clusters[bin * clusters_per_bin + c + 1] = cj1; bin_nclusters[bin]++; } @@ -780,15 +780,15 @@ void binClusters(Atom *atom) { MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; MD_FLOAT cj_minz = atom->jclusters[cj].bbminz; - xtmp = cj_x[CL_X_OFFSET + 0] - ytmp = cj_x[CL_Y_OFFSET + 0] + xtmp = cj_x[CL_X_OFFSET + 0]; + ytmp = cj_x[CL_Y_OFFSET + 0]; coord2bin2D(xtmp, ytmp, &ix, &iy); ix = MAX(MIN(ix, mbinx - 1), 0); iy = MAX(MIN(iy, mbiny - 1), 0); for(int cjj = 1; cjj < n; 
cjj++) { int nix, niy; - xtmp = cj_x[CL_X_OFFSET + cjj] - ytmp = cj_x[CL_Y_OFFSET + cjj] + xtmp = cj_x[CL_X_OFFSET + cjj]; + ytmp = cj_x[CL_Y_OFFSET + cjj]; coord2bin2D(xtmp, ytmp, &nix, &niy); nix = MAX(MIN(nix, mbinx - 1), 0); niy = MAX(MIN(niy, mbiny - 1), 0); @@ -862,7 +862,7 @@ void updateSingleAtoms(Atom *atom) { MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base]; - for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) { + for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) { atom_x(Natom) = ci_x[CL_X_OFFSET + cii]; atom_y(Natom) = ci_x[CL_Y_OFFSET + cii]; atom_z(Natom) = ci_x[CL_Z_OFFSET + cii]; diff --git a/gromacs/pbc.c b/gromacs/pbc.c index 9632e36..d1216ff 100644 --- a/gromacs/pbc.c +++ b/gromacs/pbc.c @@ -28,6 +28,7 @@ #include #include #include +#include #define DELTA 20000 @@ -61,7 +62,7 @@ void updatePbc(Atom *atom, Parameter *param, int firstUpdate) { MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY; MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY; - for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) { + for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) { MD_FLOAT xtmp = bmap_x[CL_X_OFFSET + cii] + atom->PBCx[cg] * xprd; MD_FLOAT ytmp = bmap_x[CL_Y_OFFSET + cii] + atom->PBCy[cg] * yprd; MD_FLOAT ztmp = bmap_x[CL_Z_OFFSET + cii] + atom->PBCz[cg] * zprd; @@ -82,7 +83,7 @@ void updatePbc(Atom *atom, Parameter *param, int firstUpdate) { } if(firstUpdate) { - for(int cii = atom->clusters[ci].natoms; cii < CLUSTER_M; cii++) { + for(int cii = atom->iclusters[ci].natoms; cii < CLUSTER_M; cii++) { ci_x[CL_X_OFFSET + cii] = INFINITY; ci_x[CL_Y_OFFSET + cii] = INFINITY; ci_x[CL_Z_OFFSET + cii] = INFINITY; @@ -132,13 +133,13 @@ void updateAtomsPbc(Atom *atom, Parameter *param) { * that are then enforced in updatePbc */ #define ADDGHOST(dx,dy,dz); \ Nghost++; \ - const int g_atom_idx = atom->Nclusters_local + Nghost; \ + const int cg = atom->Nclusters_local + Nghost; \ const int natoms_ci = atom->iclusters[ci].natoms; \ border_map[Nghost] = ci; \ atom->PBCx[Nghost] = dx; \ atom->PBCy[Nghost] = dy; \ atom->PBCz[Nghost] = dz; \ - atom->iclusters[g_atom_idx].natoms = natoms_ci; \ + atom->iclusters[cg].natoms = natoms_ci; \ Nghost_atoms += natoms_ci; \ int ci_sca_base = CI_SCALAR_BASE_INDEX(ci); \ int cg_sca_base = CI_SCALAR_BASE_INDEX(cg); \ diff --git a/gromacs/stats.c b/gromacs/stats.c index 5f62918..c3c9f43 100644 --- a/gromacs/stats.c +++ b/gromacs/stats.c @@ -18,11 +18,11 @@ void initStats(Stats *s) { void displayStatistics(Atom *atom, Parameter *param, Stats *stats, double *timer) { #ifdef COMPUTE_STATS - const int MxN = CLUSTER_DIM_M * CLUSTER_DIM_N; + const int MxN = CLUSTER_M * CLUSTER_N; double avg_atoms_cluster = (double)(atom->Nlocal) / (double)(atom->Nclusters_local); double force_useful_volume = 1e-9 * ( (double)(atom->Nlocal * (param->ntimes + 1)) * (sizeof(MD_FLOAT) * 6 + sizeof(int)) + (double)(stats->num_neighs) * (sizeof(MD_FLOAT) * 3 + sizeof(int)) ); - double avg_neigh_atom = (stats->num_neighs * CLUSTER_DIM_N) / (double)(atom->Nlocal * (param->ntimes + 1)); + double avg_neigh_atom = (stats->num_neighs * CLUSTER_N) / (double)(atom->Nlocal * (param->ntimes + 1)); double avg_neigh_cluster = (double)(stats->num_neighs) / (double)(stats->calculated_forces); double avg_simd = stats->force_iters / (double)(atom->Nlocal * (param->ntimes + 1)); diff --git a/gromacs/vtk.c b/gromacs/vtk.c index 35cc14e..2a29a26 100644 --- a/gromacs/vtk.c +++ b/gromacs/vtk.c @@ -27,9 +27,10 @@ int write_local_atoms_to_vtk_file(const char* 
filename, Atom* atom, int timestep fprintf(fp, "DATASET UNSTRUCTURED_GRID\n"); fprintf(fp, "POINTS %d double\n", atom->Nlocal); for(int ci = 0; ci < atom->Nclusters_local; ++ci) { - MD_FLOAT *cptr = cluster_pos_ptr(ci); - for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) { - fprintf(fp, "%.4f %.4f %.4f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii)); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) { + fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]); } } fprintf(fp, "\n\n"); @@ -70,9 +71,10 @@ int write_ghost_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep fprintf(fp, "DATASET UNSTRUCTURED_GRID\n"); fprintf(fp, "POINTS %d double\n", atom->Nghost); for(int ci = atom->Nclusters_local; ci < atom->Nclusters_local + atom->Nclusters_ghost; ++ci) { - MD_FLOAT *cptr = cluster_pos_ptr(ci); - for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) { - fprintf(fp, "%.4f %.4f %.4f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii)); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) { + fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]); } } fprintf(fp, "\n\n"); @@ -116,18 +118,19 @@ int write_local_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int fprintf(fp, "DATASET POLYDATA\n"); fprintf(fp, "POINTS %d double\n", atom->Nlocal); for(int ci = 0; ci < N; ++ci) { - MD_FLOAT *cptr = cluster_pos_ptr(ci); - for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) { - fprintf(fp, "%.4f %.4f %.4f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii)); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) { + fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]); } - tot_lines += atom->clusters[ci].natoms; + tot_lines += atom->iclusters[ci].natoms; } fprintf(fp, "\n\n"); fprintf(fp, "LINES %d %d\n", N, N + tot_lines); for(int ci = 0; ci < N; ++ci) { - fprintf(fp, "%d ", atom->clusters[ci].natoms); - for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) { + fprintf(fp, "%d ", atom->iclusters[ci].natoms); + for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) { fprintf(fp, "%d ", i++); } @@ -157,18 +160,19 @@ int write_ghost_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int fprintf(fp, "DATASET POLYDATA\n"); fprintf(fp, "POINTS %d double\n", atom->Nghost); for(int ci = atom->Nclusters_local; ci < N; ++ci) { - MD_FLOAT *cptr = cluster_pos_ptr(ci); - for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) { - fprintf(fp, "%.4f %.4f %.4f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii)); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) { + fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]); } - tot_lines += atom->clusters[ci].natoms; + tot_lines += atom->iclusters[ci].natoms; } fprintf(fp, "\n\n"); fprintf(fp, "LINES %d %d\n", atom->Nclusters_ghost, atom->Nclusters_ghost + tot_lines); for(int ci = atom->Nclusters_local; ci < N; ++ci) { - fprintf(fp, "%d ", 
atom->clusters[ci].natoms); - for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) { + fprintf(fp, "%d ", atom->iclusters[ci].natoms); + for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) { fprintf(fp, "%d ", i++); } diff --git a/gromacs/xtc.c b/gromacs/xtc.c index d381f40..08fb0c9 100644 --- a/gromacs/xtc.c +++ b/gromacs/xtc.c @@ -52,11 +52,12 @@ void xtc_init(const char *filename, Atom *atom, int timestep) { void xtc_write(Atom *atom, int timestep, int write_pos, int write_vel) { int i = 0; for(int ci = 0; ci < atom->Nclusters_local; ++ci) { - MD_FLOAT *cptr = cluster_pos_ptr(ci); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) { - x_buf[i][XX] = cluster_x(cptr, cii); - x_buf[i][YY] = cluster_y(cptr, cii); - x_buf[i][ZZ] = cluster_z(cptr, cii); + x_buf[i][XX] = ci_x[CL_X_OFFSET + cii]; + x_buf[i][YY] = ci_x[CL_Y_OFFSET + cii]; + x_buf[i][ZZ] = ci_x[CL_Z_OFFSET + cii]; i++; } } From 22d0f0b958203f1792a6a295d6d6bc1c1a32906c Mon Sep 17 00:00:00 2001 From: Rafael Ravedutti Date: Thu, 10 Mar 2022 01:31:50 +0100 Subject: [PATCH 3/7] Commit version that works for M=N Signed-off-by: Rafael Ravedutti --- gromacs/atom.c | 4 ++++ gromacs/force_lj.c | 30 ++++++++++++++++++++---------- gromacs/includes/atom.h | 14 +++++++++++++- gromacs/main.c | 8 ++------ gromacs/neighbor.c | 15 +++++++-------- gromacs/pbc.c | 5 +++++ 6 files changed, 51 insertions(+), 25 deletions(-) diff --git a/gromacs/atom.c b/gromacs/atom.c index 024363f..b96a91e 100644 --- a/gromacs/atom.c +++ b/gromacs/atom.c @@ -37,6 +37,7 @@ void initAtom(Atom *atom) { atom->cl_x = NULL; atom->cl_v = NULL; atom->cl_f = NULL; + atom->cl_type = NULL; atom->Natoms = 0; atom->Nlocal = 0; atom->Nghost = 0; @@ -53,6 +54,7 @@ void initAtom(Atom *atom) { atom->cutneighsq = NULL; atom->iclusters = NULL; atom->jclusters = NULL; + atom->icluster_bin = NULL; } void createAtom(Atom *atom, Parameter *param) { @@ -431,7 +433,9 @@ void growClusters(Atom *atom) { atom->Nclusters_max += DELTA; atom->iclusters = (Cluster*) reallocate(atom->iclusters, ALIGNMENT, atom->Nclusters_max * sizeof(Cluster), nold * sizeof(Cluster)); atom->jclusters = (Cluster*) reallocate(atom->jclusters, ALIGNMENT, atom->Nclusters_max * jfac * sizeof(Cluster), nold * jfac * sizeof(Cluster)); + atom->icluster_bin = (int*) reallocate(atom->icluster_bin, ALIGNMENT, atom->Nclusters_max * sizeof(int), nold * sizeof(int)); atom->cl_x = (MD_FLOAT*) reallocate(atom->cl_x, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT)); atom->cl_f = (MD_FLOAT*) reallocate(atom->cl_f, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT)); atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT)); + atom->cl_type = (int*) reallocate(atom->cl_type, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * sizeof(int), nold * CLUSTER_M * sizeof(int)); } diff --git a/gromacs/force_lj.c b/gromacs/force_lj.c index d749bb1..03d1926 100644 --- a/gromacs/force_lj.c +++ b/gromacs/force_lj.c @@ -293,13 +293,15 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_SIMD_FLOAT epsilon_vec = simd_broadcast(epsilon); MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0); MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5); - const int unroll_j = VECTOR_WIDTH / CLUSTER_N; - double S = getTimeStamp(); 
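/* Annotation (not part of the original patch): a sketch of the self-exclusion
 * masks used below, assuming CLUSTER_M == CLUSTER_N == 4. In that case the
 * i-cluster ci and the j-cluster cj == CJ0_FROM_CI(ci) cover the same four
 * atoms, so when cj matches (cond0 == 1), row r of the cluster pair must drop
 * its diagonal lane: excl_mask{r} = 0xf - (1 << r) * cond0. For r == 2 this
 * gives 0xf - 0x4 = 0xb (binary 1011), clearing exactly lane 2 from the
 * cutoff mask and hence from the force accumulation. */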
LIKWID_MARKER_START("force"); #pragma omp parallel for for(int ci = 0; ci < atom->Nclusters_local; ci++) { + int ci_cj0 = CJ0_FROM_CI(ci); + #if CLUSTER_M > CLUSTER_N + int ci_cj1 = CJ1_FROM_CI(ci); + #endif int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; @@ -331,11 +333,9 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_SIMD_FLOAT fiy3 = simd_zero(); MD_SIMD_FLOAT fiz3 = simd_zero(); - for(int k = 0; k < numneighs; k += unroll_j) { + for(int k = 0; k < numneighs; k++) { int cj = neighs[k + 0]; int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); - unsigned int cond0 = (unsigned int)(ci == cj); - unsigned int cond1 = cond0; MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]); MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]); @@ -354,16 +354,26 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp); MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp); - #if VECTOR_WIDTH == 8 + #if CLUSTER_M == CLUSTER_N + int cond0 = (unsigned int)(cj == ci_cj0); + MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0)); + MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0)); + MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0)); + MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0)); + #elif CLUSTER_M < CLUSTER_N + int cond0 = (unsigned int)(cj == (ci << 1) + 0); + int cond1 = (unsigned int)(cj == (ci << 1) + 1); MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1)); MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1)); MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1)); MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1)); #else - MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0)); - MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0)); - MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0)); - MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0)); + unsigned int cond0 = (unsigned int)(cj == ci_cj0); + unsigned int cond1 = (unsigned int)(cj == ci_cj1); + MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond0)); + MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond0)); + MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond1)); + MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond1)); #endif MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0))); diff --git a/gromacs/includes/atom.h b/gromacs/includes/atom.h index 2373d6b..0eff9f5 100644 --- a/gromacs/includes/atom.h +++ b/gromacs/includes/atom.h @@ -33,12 +33,23 @@ // Simd4xN: M=4, N=VECTOR_WIDTH // Simd2xNN: M=4, N=(VECTOR_WIDTH/2) -// Simd2XNN (here used for single-precision) +// Simd2xNN (here used for single-precision) #if VECTOR_WIDTH > CLUSTER_M * 2 +# define KERNEL_NAME "Simd2xNN" # define CLUSTER_N (VECTOR_WIDTH / 2) +# define computeForceLJ computeForceLJ_2xnn // Simd4xN #else +# define KERNEL_NAME "Simd4xN" # define CLUSTER_N VECTOR_WIDTH +# define computeForceLJ computeForceLJ_4xn +#endif + +#ifdef 
USE_REFERENCE_VERSION +# undef KERNEL_NAME +# undef computeForceLJ +# define KERNEL_NAME "Reference" +# define computeForceLJ computeForceLJ_ref #endif #if CLUSTER_M == CLUSTER_N @@ -105,6 +116,7 @@ typedef struct { MD_FLOAT *cl_f; int *cl_type; Cluster *iclusters, *jclusters; + int *icluster_bin; } Atom; extern void initAtom(Atom*); diff --git a/gromacs/main.c b/gromacs/main.c index ade8f65..a4eda58 100644 --- a/gromacs/main.c +++ b/gromacs/main.c @@ -43,14 +43,9 @@ extern double computeForceLJ_ref(Parameter*, Atom*, Neighbor*, Stats*); extern double computeForceLJ_4xn(Parameter*, Atom*, Neighbor*, Stats*); +extern double computeForceLJ_2xnn(Parameter*, Atom*, Neighbor*, Stats*); extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*); -#ifdef USE_REFERENCE_VERSION -# define computeForceLJ computeForceLJ_ref -#else -# define computeForceLJ computeForceLJ_4xn -#endif - double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) { if(param->force_field == FF_EAM) { initEam(eam, param); } double S, E; @@ -325,6 +320,7 @@ int main(int argc, char** argv) { } printf(HLINE); + printf("Kernel: %s, MxN: %dx%d, Vector width: %d\n", KERNEL_NAME, CLUSTER_M, CLUSTER_N, VECTOR_WIDTH); printf("Data layout for positions: %s\n", POS_DATA_LAYOUT); #if PRECISION == 1 printf("Using single precision floating point.\n"); diff --git a/gromacs/neighbor.c b/gromacs/neighbor.c index 557a9bb..9634157 100644 --- a/gromacs/neighbor.c +++ b/gromacs/neighbor.c @@ -41,7 +41,6 @@ static int *bincount; static int *bins; static int *bin_nclusters; static int *bin_clusters; -static int *icluster_bin; static int mbins; //total number of bins static int atoms_per_bin; // max atoms per bin static int clusters_per_bin; // max clusters per bin @@ -64,7 +63,7 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) { cutneigh = param->cutneigh; nmax = 0; atoms_per_bin = 8; - clusters_per_bin = (atoms_per_bin / CLUSTER_M) + 4; + clusters_per_bin = (atoms_per_bin / CLUSTER_M) + 10; stencil = NULL; bins = NULL; bincount = NULL; @@ -227,7 +226,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { for(int ci = 0; ci < atom->Nclusters_local; ci++) { int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]); int n = 0; - int ibin = icluster_bin[ci]; + int ibin = atom->icluster_bin[ci]; MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx; MD_FLOAT ibb_xmax = atom->iclusters[ci].bbmaxx; MD_FLOAT ibb_ymin = atom->iclusters[ci].bbminy; @@ -590,7 +589,7 @@ void buildClusters(Atom *atom) { if(bbminz > ztmp) { bbminz = ztmp; } if(bbmaxz < ztmp) { bbmaxz = ztmp; } - atom->cl_type[cii] = atom->type[i]; + ci_type[cii] = atom->type[i]; atom->iclusters[ci].natoms++; } else { ci_x[CL_X_OFFSET + cii] = INFINITY; @@ -601,7 +600,7 @@ void buildClusters(Atom *atom) { ac++; } - icluster_bin[ci] = bin; + atom->icluster_bin[ci] = bin; atom->iclusters[ci].bbminx = bbminx; atom->iclusters[ci].bbmaxx = bbmaxx; atom->iclusters[ci].bbminy = bbminy; @@ -711,7 +710,7 @@ void binClusters(Atom *atom) { MD_FLOAT *cptr = cluster_pos_ptr(ci); DEBUG_MESSAGE("Cluster %d:\n", ci); DEBUG_MESSAGE("bin=%d, Natoms=%d, bbox={%f,%f},{%f,%f},{%f,%f}\n", - icluster_bin[ci], + atom->icluster_bin[ci], atom->clusters[ci].natoms, atom->clusters[ci].bbminx, atom->clusters[ci].bbmaxx, @@ -745,7 +744,7 @@ void binClusters(Atom *atom) { for(int ci = 0; ci < nlocal && !resize; ci++) { // Assure we add this j-cluster only once in the bin if(!(CLUSTER_M < CLUSTER_N && ci % 2)) { - int bin = icluster_bin[ci]; + int bin = 
atom->icluster_bin[ci]; int c = bin_nclusters[bin]; if(c + 1 < clusters_per_bin) { bin_clusters[bin * clusters_per_bin + c] = CJ0_FROM_CI(ci); @@ -827,7 +826,7 @@ void binClusters(Atom *atom) { bin_clusters[bin * clusters_per_bin + c] = cj; } - icluster_bin[ci] = bin; + atom->icluster_bin[ci] = bin; bin_nclusters[bin]++; } else { resize = 1; diff --git a/gromacs/pbc.c b/gromacs/pbc.c index d1216ff..03d669b 100644 --- a/gromacs/pbc.c +++ b/gromacs/pbc.c @@ -46,6 +46,7 @@ void initPbc(Atom* atom) { /* update coordinates of ghost atoms */ /* uses mapping created in setupPbc */ void updatePbc(Atom *atom, Parameter *param, int firstUpdate) { + DEBUG_MESSAGE("updatePbc start\n"); int *border_map = atom->border_map; int nlocal = atom->Nclusters_local; MD_FLOAT xprd = param->xprd; @@ -97,6 +98,8 @@ void updatePbc(Atom *atom, Parameter *param, int firstUpdate) { atom->iclusters[ci].bbmaxz = bbmaxz; } } + + DEBUG_MESSAGE("updatePbc end\n"); } /* relocate atoms that have left domain according @@ -159,6 +162,7 @@ void growPbc(Atom* atom) { } void setupPbc(Atom *atom, Parameter *param) { + DEBUG_MESSAGE("setupPbc start\n"); int *border_map = atom->border_map; MD_FLOAT xprd = param->xprd; MD_FLOAT yprd = param->yprd; @@ -236,4 +240,5 @@ void setupPbc(Atom *atom, Parameter *param) { // Update created ghost clusters positions updatePbc(atom, param, 1); + DEBUG_MESSAGE("setupPbc end\n"); } From d79c3c2a1d313a09b5de95ae6047231987076d33 Mon Sep 17 00:00:00 2001 From: Rafael Ravedutti Date: Thu, 10 Mar 2022 22:33:41 +0100 Subject: [PATCH 4/7] Add first working version with 4x8 config (ref kernel) Signed-off-by: Rafael Ravedutti --- gromacs/atom.c | 4 +- gromacs/force_lj.c | 10 +- gromacs/includes/atom.h | 1 + gromacs/includes/neighbor.h | 1 + gromacs/main.c | 3 + gromacs/neighbor.c | 184 +++++++++++++++++------------------- gromacs/pbc.c | 175 +++++++++++++++++------------------ 7 files changed, 191 insertions(+), 187 deletions(-) diff --git a/gromacs/atom.c b/gromacs/atom.c index b96a91e..88ee0f8 100644 --- a/gromacs/atom.c +++ b/gromacs/atom.c @@ -429,10 +429,10 @@ void growAtom(Atom *atom) { void growClusters(Atom *atom) { int nold = atom->Nclusters_max; - int jfac = MIN(1, CLUSTER_M / CLUSTER_N); // If M>N, we need to allocate more j-clusters + int jterm = MAX(1, CLUSTER_M / CLUSTER_N); // If M>N, we need to allocate more j-clusters atom->Nclusters_max += DELTA; atom->iclusters = (Cluster*) reallocate(atom->iclusters, ALIGNMENT, atom->Nclusters_max * sizeof(Cluster), nold * sizeof(Cluster)); - atom->jclusters = (Cluster*) reallocate(atom->jclusters, ALIGNMENT, atom->Nclusters_max * jfac * sizeof(Cluster), nold * jfac * sizeof(Cluster)); + atom->jclusters = (Cluster*) reallocate(atom->jclusters, ALIGNMENT, atom->Nclusters_max * jterm * sizeof(Cluster), nold * jterm * sizeof(Cluster)); atom->icluster_bin = (int*) reallocate(atom->icluster_bin, ALIGNMENT, atom->Nclusters_max * sizeof(int), nold * sizeof(int)); atom->cl_x = (MD_FLOAT*) reallocate(atom->cl_x, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT)); atom->cl_f = (MD_FLOAT*) reallocate(atom->cl_f, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT)); diff --git a/gromacs/force_lj.c b/gromacs/force_lj.c index 03d1926..99d56a9 100644 --- a/gromacs/force_lj.c +++ b/gromacs/force_lj.c @@ -55,6 +55,8 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat #pragma omp parallel for
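/* Annotation (not part of the original patch): the exclusion tests introduced
 * below rely on the i-cluster to j-cluster mapping from atom.h. In the 4x8
 * configuration this commit targets (CLUSTER_M = 4 < CLUSTER_N = 8),
 * CJ0_FROM_CI(ci) == ci >> 1, so i-clusters 2k and 2k+1 share j-cluster k as
 * its lower and upper half. Atom cii of ci is therefore the same atom as
 * cii + CLUSTER_M * (ci & 0x1) of its j-cluster, which is exactly the pair the
 * CLUSTER_M < CLUSTER_N branch skips. In the CLUSTER_M > CLUSTER_N case one
 * i-cluster spans the two j-clusters ci_cj0 and ci_cj1, hence the two-part
 * test there. */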
for(int ci = 0; ci < atom->Nclusters_local; ci++) { + int ci_cj0 = CJ0_FROM_CI(ci); + int ci_cj1 = CJ1_FROM_CI(ci); int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; @@ -76,7 +78,13 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_FLOAT fiz = 0; for(int cjj = 0; cjj < CLUSTER_N; cjj++) { - if(ci != cj || cii != cjj) { + #if CLUSTER_M == CLUSTER_N + if(ci_cj0 != cj || cii != cjj) { + #elif CLUSTER_M < CLUSTER_N + if(ci_cj0 != cj || cii + CLUSTER_M * (ci & 0x1) != cjj) { + #else + if((ci_cj0 != cj || cii != cjj) && (ci_cj1 != cj || cii != cjj + CLUSTER_N)) { + #endif MD_FLOAT delx = xtmp - cj_x[CL_X_OFFSET + cjj]; MD_FLOAT dely = ytmp - cj_x[CL_Y_OFFSET + cjj]; MD_FLOAT delz = ztmp - cj_x[CL_Z_OFFSET + cjj]; diff --git a/gromacs/includes/atom.h b/gromacs/includes/atom.h index 0eff9f5..6590419 100644 --- a/gromacs/includes/atom.h +++ b/gromacs/includes/atom.h @@ -117,6 +117,7 @@ typedef struct { int *cl_type; Cluster *iclusters, *jclusters; int *icluster_bin; + int dummy_cj; } Atom; extern void initAtom(Atom*); diff --git a/gromacs/includes/neighbor.h b/gromacs/includes/neighbor.h index d1fbb5f..0ba00c0 100644 --- a/gromacs/includes/neighbor.h +++ b/gromacs/includes/neighbor.h @@ -40,6 +40,7 @@ extern void buildNeighbor(Atom*, Neighbor*); extern void pruneNeighbor(Parameter*, Atom*, Neighbor*); extern void sortAtom(Atom*); extern void buildClusters(Atom*); +extern void defineJClusters(Atom*); extern void binClusters(Atom*); extern void updateSingleAtoms(Atom*); #endif diff --git a/gromacs/main.c b/gromacs/main.c index a4eda58..a36af9d 100644 --- a/gromacs/main.c +++ b/gromacs/main.c @@ -70,6 +70,7 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats * setupThermo(param, atom->Natoms); if(param->input_file == NULL) { adjustThermo(param, atom); } buildClusters(atom); + defineJClusters(atom); setupPbc(atom, param); binClusters(atom); buildNeighbor(atom, neighbor); @@ -84,6 +85,7 @@ double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) { updateSingleAtoms(atom); updateAtomsPbc(atom, param); buildClusters(atom); + defineJClusters(atom); setupPbc(atom, param); binClusters(atom); buildNeighbor(atom, neighbor); @@ -313,6 +315,7 @@ int main(int argc, char** argv) { } timer[TOTAL] = getTimeStamp() - timer[TOTAL]; + updateSingleAtoms(&atom); computeThermo(-1, &param, &atom); if(param.xtc_file != NULL) { diff --git a/gromacs/neighbor.c b/gromacs/neighbor.c index 9634157..694abd7 100644 --- a/gromacs/neighbor.c +++ b/gromacs/neighbor.c @@ -201,11 +201,10 @@ int atomDistanceInRange(Atom *atom, int ci, int cj, MD_FLOAT rsq) { void buildNeighbor(Atom *atom, Neighbor *neighbor) { DEBUG_MESSAGE("buildNeighbor start\n"); - int nall = atom->Nclusters_local + atom->Nclusters_ghost; /* extend atom arrays if necessary */ - if(nall > nmax) { - nmax = nall; + if(atom->Nclusters_local > nmax) { + nmax = atom->Nclusters_local; if(neighbor->numneigh) free(neighbor->numneigh); if(neighbor->neighbors) free(neighbor->neighbors); neighbor->numneigh = (int*) malloc(nmax * sizeof(int)); @@ -307,7 +306,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { // Fill neighbor list with dummy values to fit vector width if(CLUSTER_N < VECTOR_WIDTH) { while(n % (VECTOR_WIDTH / CLUSTER_N)) { - neighptr[n++] = CJ0_FROM_CI(nall - 1); // Last cluster is always a dummy cluster + neighptr[n++] = atom->dummy_cj; // Last cluster is always a dummy cluster } } @@ -332,38 +331,40 @@ void buildNeighbor(Atom 
*atom, Neighbor *neighbor) { /* DEBUG_MESSAGE("\ncutneighsq = %f, rbb_sq = %f\n", cutneighsq, rbb_sq); for(int ci = 0; ci < 6; ci++) { - MD_FLOAT *ciptr = cluster_pos_ptr(ci); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; int* neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]); DEBUG_MESSAGE("Cluster %d, bbx = {%f, %f}, bby = {%f, %f}, bbz = {%f, %f}\n", ci, - atom->clusters[ci].bbminx, - atom->clusters[ci].bbmaxx, - atom->clusters[ci].bbminy, - atom->clusters[ci].bbmaxy, - atom->clusters[ci].bbminz, - atom->clusters[ci].bbmaxz); + atom->iclusters[ci].bbminx, + atom->iclusters[ci].bbmaxx, + atom->iclusters[ci].bbminy, + atom->iclusters[ci].bbmaxy, + atom->iclusters[ci].bbminz, + atom->iclusters[ci].bbmaxz); for(int cii = 0; cii < CLUSTER_M; cii++) { - DEBUG_MESSAGE("%f, %f, %f\n", cluster_x(ciptr, cii), cluster_y(ciptr, cii), cluster_z(ciptr, cii)); + DEBUG_MESSAGE("%f, %f, %f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]); } DEBUG_MESSAGE("Neighbors:\n"); for(int k = 0; k < neighbor->numneigh[ci]; k++) { - const int cj = neighptr[k]; - MD_FLOAT *cjptr = cluster_pos_ptr(cj); + int cj = neighptr[k]; + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; DEBUG_MESSAGE(" Cluster %d, bbx = {%f, %f}, bby = {%f, %f}, bbz = {%f, %f}\n", cj, - atom->clusters[cj].bbminx, - atom->clusters[cj].bbmaxx, - atom->clusters[cj].bbminy, - atom->clusters[cj].bbmaxy, - atom->clusters[cj].bbminz, - atom->clusters[cj].bbmaxz); + atom->jclusters[cj].bbminx, + atom->jclusters[cj].bbmaxx, + atom->jclusters[cj].bbminy, + atom->jclusters[cj].bbmaxy, + atom->jclusters[cj].bbminz, + atom->jclusters[cj].bbmaxz); - for(int cjj = 0; cjj < CLUSTER_M; cjj++) { - DEBUG_MESSAGE(" %f, %f, %f\n", cluster_x(cjptr, cjj), cluster_y(cjptr, cjj), cluster_z(cjptr, cjj)); + for(int cjj = 0; cjj < CLUSTER_N; cjj++) { + DEBUG_MESSAGE(" %f, %f, %f\n", cj_x[CL_X_OFFSET + cjj], cj_x[CL_Y_OFFSET + cjj], cj_x[CL_Z_OFFSET + cjj]); } } } @@ -374,7 +375,6 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) { DEBUG_MESSAGE("pruneNeighbor start\n"); - int nall = atom->Nclusters_local + atom->Nclusters_ghost; //MD_FLOAT cutsq = param->cutforce * param->cutforce; MD_FLOAT cutsq = cutneighsq; @@ -385,7 +385,7 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) { // Remove dummy clusters if necessary if(CLUSTER_N < VECTOR_WIDTH) { - while(neighs[numneighs - 1] == CJ0_FROM_CI(nall - 1)) { + while(neighs[numneighs - 1] == atom->dummy_cj) { numneighs--; } } @@ -403,7 +403,7 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) { // Readd dummy clusters if necessary if(CLUSTER_N < VECTOR_WIDTH) { while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) { - neighs[numneighs++] = CJ0_FROM_CI(nall - 1); // Last cluster is always a dummy cluster + neighs[numneighs++] = atom->dummy_cj; // Last cluster is always a dummy cluster } } @@ -617,7 +617,7 @@ void buildClusters(Atom *atom) { void defineJClusters(Atom *atom) { DEBUG_MESSAGE("defineJClusters start\n"); - for(int ci = 0; ci < atom->Nclusters_local + atom->Nclusters_ghost; ci++) { + for(int ci = 0; ci < atom->Nclusters_local; ci++) { int cj0 = CJ0_FROM_CI(ci); if(CLUSTER_M == CLUSTER_N) { @@ -687,13 +687,14 @@ void defineJClusters(Atom *atom) { } else { if(ci % 2 == 0) { - atom->jclusters[cj0].bbminx = MIN(atom->iclusters[ci].bbminx, atom->iclusters[ci + 1].bbminx); - 
atom->jclusters[cj0].bbmaxx = MAX(atom->iclusters[ci].bbmaxx, atom->iclusters[ci + 1].bbmaxx); - atom->jclusters[cj0].bbminy = MIN(atom->iclusters[ci].bbminy, atom->iclusters[ci + 1].bbminy); - atom->jclusters[cj0].bbmaxy = MAX(atom->iclusters[ci].bbmaxy, atom->iclusters[ci + 1].bbmaxy); - atom->jclusters[cj0].bbminz = MIN(atom->iclusters[ci].bbminz, atom->iclusters[ci + 1].bbminz); - atom->jclusters[cj0].bbmaxz = MAX(atom->iclusters[ci].bbmaxz, atom->iclusters[ci + 1].bbmaxz); - atom->jclusters[cj0].natoms = atom->iclusters[ci].natoms + atom->iclusters[ci + 1].natoms; + const int ci1 = ci + 1; + atom->jclusters[cj0].bbminx = MIN(atom->iclusters[ci].bbminx, atom->iclusters[ci1].bbminx); + atom->jclusters[cj0].bbmaxx = MAX(atom->iclusters[ci].bbmaxx, atom->iclusters[ci1].bbmaxx); + atom->jclusters[cj0].bbminy = MIN(atom->iclusters[ci].bbminy, atom->iclusters[ci1].bbminy); + atom->jclusters[cj0].bbmaxy = MAX(atom->iclusters[ci].bbmaxy, atom->iclusters[ci1].bbmaxy); + atom->jclusters[cj0].bbminz = MIN(atom->iclusters[ci].bbminz, atom->iclusters[ci1].bbminz); + atom->jclusters[cj0].bbmaxz = MAX(atom->iclusters[ci].bbmaxz, atom->iclusters[ci1].bbmaxz); + atom->jclusters[cj0].natoms = atom->iclusters[ci].natoms + atom->iclusters[ci1].natoms; } } } @@ -726,12 +727,8 @@ void binClusters(Atom *atom) { */ const int nlocal = atom->Nclusters_local; - const int ncji = MAX(1, CLUSTER_M / CLUSTER_N); - if(ncji > 2) { - fprintf(stderr, "binClusters(): ncji cannot be higher than 2!"); - } - - defineJClusters(atom); + const int jfac = MAX(1, CLUSTER_N / CLUSTER_M); + const int ncj = atom->Nclusters_local / jfac; int resize = 1; while(resize > 0) { @@ -764,74 +761,65 @@ void binClusters(Atom *atom) { } for(int cg = 0; cg < atom->Nclusters_ghost && !resize; cg++) { - // Assure we add this j-cluster only once in the bin - if(!(CLUSTER_M < CLUSTER_N && cg % 2)) { - const int ci = nlocal + cg; - int ix = -1, iy = -1; - MD_FLOAT xtmp, ytmp; + const int cj = ncj + cg; + int ix = -1, iy = -1; + MD_FLOAT xtmp, ytmp; - for(int cji = 0; cji < ncji && !resize; cji++) { - int cj = (cji == 0) ? 
CJ0_FROM_CI(ci) : CJ1_FROM_CI(ci); - const int n = atom->jclusters[cj].natoms; + if(atom->jclusters[cj].natoms > 0) { + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; + MD_FLOAT cj_minz = atom->jclusters[cj].bbminz; - if(n > 0) { - int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); - MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; - MD_FLOAT cj_minz = atom->jclusters[cj].bbminz; + xtmp = cj_x[CL_X_OFFSET + 0]; + ytmp = cj_x[CL_Y_OFFSET + 0]; + coord2bin2D(xtmp, ytmp, &ix, &iy); + ix = MAX(MIN(ix, mbinx - 1), 0); + iy = MAX(MIN(iy, mbiny - 1), 0); + for(int cjj = 1; cjj < atom->jclusters[cj].natoms; cjj++) { + int nix, niy; + xtmp = cj_x[CL_X_OFFSET + cjj]; + ytmp = cj_x[CL_Y_OFFSET + cjj]; + coord2bin2D(xtmp, ytmp, &nix, &niy); + nix = MAX(MIN(nix, mbinx - 1), 0); + niy = MAX(MIN(niy, mbiny - 1), 0); - xtmp = cj_x[CL_X_OFFSET + 0]; - ytmp = cj_x[CL_Y_OFFSET + 0]; - coord2bin2D(xtmp, ytmp, &ix, &iy); - ix = MAX(MIN(ix, mbinx - 1), 0); - iy = MAX(MIN(iy, mbiny - 1), 0); - for(int cjj = 1; cjj < n; cjj++) { - int nix, niy; - xtmp = cj_x[CL_X_OFFSET + cjj]; - ytmp = cj_x[CL_Y_OFFSET + cjj]; - coord2bin2D(xtmp, ytmp, &nix, &niy); - nix = MAX(MIN(nix, mbinx - 1), 0); - niy = MAX(MIN(niy, mbiny - 1), 0); + // Always put the cluster on the bin of its innermost atom so + // the cluster should be closer to local clusters + if(atom->PBCx[cg] > 0 && ix > nix) { ix = nix; } + if(atom->PBCx[cg] < 0 && ix < nix) { ix = nix; } + if(atom->PBCy[cg] > 0 && iy > niy) { iy = niy; } + if(atom->PBCy[cg] < 0 && iy < niy) { iy = niy; } + } - // Always put the cluster on the bin of its innermost atom so - // the cluster should be closer to local clusters - if(atom->PBCx[cg] > 0 && ix > nix) { ix = nix; } - if(atom->PBCx[cg] < 0 && ix < nix) { ix = nix; } - if(atom->PBCy[cg] > 0 && iy > niy) { iy = niy; } - if(atom->PBCy[cg] < 0 && iy < niy) { iy = niy; } - } + int bin = iy * mbinx + ix + 1; + int c = bin_nclusters[bin]; + if(c < clusters_per_bin) { + // Insert the current ghost cluster in the bin keeping clusters + // sorted by z coordinate + int inserted = 0; + for(int i = 0; i < c; i++) { + int last_cl = bin_clusters[bin * clusters_per_bin + i]; + if(atom->jclusters[last_cl].bbminz > cj_minz) { + bin_clusters[bin * clusters_per_bin + i] = cj; - int bin = iy * mbinx + ix + 1; - int c = bin_nclusters[bin]; - if(c < clusters_per_bin) { - // Insert the current ghost cluster in the bin keeping clusters - // sorted by z coordinate - int inserted = 0; - for(int i = 0; i < c; i++) { - int last_cl = bin_clusters[bin * clusters_per_bin + i]; - if(atom->jclusters[last_cl].bbminz > cj_minz) { - bin_clusters[bin * clusters_per_bin + i] = cj; - - for(int j = i + 1; j <= c; j++) { - int tmp = bin_clusters[bin * clusters_per_bin + j]; - bin_clusters[bin * clusters_per_bin + j] = last_cl; - last_cl = tmp; - } - - inserted = 1; - break; - } + for(int j = i + 1; j <= c; j++) { + int tmp = bin_clusters[bin * clusters_per_bin + j]; + bin_clusters[bin * clusters_per_bin + j] = last_cl; + last_cl = tmp; } - if(!inserted) { - bin_clusters[bin * clusters_per_bin + c] = cj; - } - - atom->icluster_bin[ci] = bin; - bin_nclusters[bin]++; - } else { - resize = 1; + inserted = 1; + break; } } + + if(!inserted) { + bin_clusters[bin * clusters_per_bin + c] = cj; + } + + bin_nclusters[bin]++; + } else { + resize = 1; } } } diff --git a/gromacs/pbc.c b/gromacs/pbc.c index 03d669b..301ee28 100644 --- a/gromacs/pbc.c +++ b/gromacs/pbc.c @@ -47,30 +47,30 @@ void initPbc(Atom* atom) { /* uses mapping created in setupPbc 
*/ void updatePbc(Atom *atom, Parameter *param, int firstUpdate) { DEBUG_MESSAGE("updatePbc start\n"); - int *border_map = atom->border_map; - int nlocal = atom->Nclusters_local; + int jfac = MAX(1, CLUSTER_N / CLUSTER_M); + int ncj = atom->Nclusters_local / jfac; MD_FLOAT xprd = param->xprd; MD_FLOAT yprd = param->yprd; MD_FLOAT zprd = param->zprd; for(int cg = 0; cg < atom->Nclusters_ghost; cg++) { - const int ci = nlocal + cg; - int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); - int bmap_vec_base = CI_VECTOR_BASE_INDEX(border_map[cg]); - MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + const int cj = ncj + cg; + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + int bmap_vec_base = CJ_VECTOR_BASE_INDEX(atom->border_map[cg]); + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; MD_FLOAT *bmap_x = &atom->cl_x[bmap_vec_base]; MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY; MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY; MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY; - for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) { - MD_FLOAT xtmp = bmap_x[CL_X_OFFSET + cii] + atom->PBCx[cg] * xprd; - MD_FLOAT ytmp = bmap_x[CL_Y_OFFSET + cii] + atom->PBCy[cg] * yprd; - MD_FLOAT ztmp = bmap_x[CL_Z_OFFSET + cii] + atom->PBCz[cg] * zprd; + for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) { + MD_FLOAT xtmp = bmap_x[CL_X_OFFSET + cjj] + atom->PBCx[cg] * xprd; + MD_FLOAT ytmp = bmap_x[CL_Y_OFFSET + cjj] + atom->PBCy[cg] * yprd; + MD_FLOAT ztmp = bmap_x[CL_Z_OFFSET + cjj] + atom->PBCz[cg] * zprd; - ci_x[CL_X_OFFSET + cii] = xtmp; - ci_x[CL_Y_OFFSET + cii] = ytmp; - ci_x[CL_Z_OFFSET + cii] = ztmp; + cj_x[CL_X_OFFSET + cjj] = xtmp; + cj_x[CL_Y_OFFSET + cjj] = ytmp; + cj_x[CL_Z_OFFSET + cjj] = ztmp; if(firstUpdate) { // TODO: To create the bounding boxes faster, we can use SIMD operations @@ -84,18 +84,18 @@ void updatePbc(Atom *atom, Parameter *param, int firstUpdate) { } if(firstUpdate) { - for(int cii = atom->iclusters[ci].natoms; cii < CLUSTER_M; cii++) { - ci_x[CL_X_OFFSET + cii] = INFINITY; - ci_x[CL_Y_OFFSET + cii] = INFINITY; - ci_x[CL_Z_OFFSET + cii] = INFINITY; + for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) { + cj_x[CL_X_OFFSET + cjj] = INFINITY; + cj_x[CL_Y_OFFSET + cjj] = INFINITY; + cj_x[CL_Z_OFFSET + cjj] = INFINITY; } - atom->iclusters[ci].bbminx = bbminx; - atom->iclusters[ci].bbmaxx = bbmaxx; - atom->iclusters[ci].bbminy = bbminy; - atom->iclusters[ci].bbmaxy = bbmaxy; - atom->iclusters[ci].bbminz = bbminz; - atom->iclusters[ci].bbmaxz = bbmaxz; + atom->jclusters[cj].bbminx = bbminx; + atom->jclusters[cj].bbmaxx = bbmaxx; + atom->jclusters[cj].bbminy = bbminy; + atom->jclusters[cj].bbmaxy = bbmaxy; + atom->jclusters[cj].bbminz = bbminz; + atom->jclusters[cj].bbmaxz = bbmaxz; } } @@ -136,18 +136,18 @@ void updateAtomsPbc(Atom *atom, Parameter *param) { * that are then enforced in updatePbc */ #define ADDGHOST(dx,dy,dz); \ Nghost++; \ - const int cg = atom->Nclusters_local + Nghost; \ - const int natoms_ci = atom->iclusters[ci].natoms; \ - border_map[Nghost] = ci; \ + const int cg = ncj + Nghost; \ + const int cj_natoms = atom->jclusters[cj].natoms; \ + atom->border_map[Nghost] = cj; \ atom->PBCx[Nghost] = dx; \ atom->PBCy[Nghost] = dy; \ atom->PBCz[Nghost] = dz; \ - atom->iclusters[cg].natoms = natoms_ci; \ - Nghost_atoms += natoms_ci; \ - int ci_sca_base = CI_SCALAR_BASE_INDEX(ci); \ - int cg_sca_base = CI_SCALAR_BASE_INDEX(cg); \ - for(int cii = 0; cii < natoms_ci; cii++) { \ - atom->cl_type[cg_sca_base + cii] = atom->cl_type[ci_sca_base + cii]; \ + atom->jclusters[cg].natoms = 
cj_natoms; \ + Nghost_atoms += cj_natoms; \ + int cj_sca_base = CJ_SCALAR_BASE_INDEX(cj); \ + int cg_sca_base = CJ_SCALAR_BASE_INDEX(cg); \ + for(int cjj = 0; cjj < cj_natoms; cjj++) { \ + atom->cl_type[cg_sca_base + cjj] = atom->cl_type[cj_sca_base + cjj]; \ } /* internal subroutines */ @@ -163,77 +163,80 @@ void growPbc(Atom* atom) { void setupPbc(Atom *atom, Parameter *param) { DEBUG_MESSAGE("setupPbc start\n"); - int *border_map = atom->border_map; MD_FLOAT xprd = param->xprd; MD_FLOAT yprd = param->yprd; MD_FLOAT zprd = param->zprd; MD_FLOAT Cutneigh = param->cutneigh; + int jfac = MAX(1, CLUSTER_N / CLUSTER_M); + int ncj = atom->Nclusters_local / jfac; int Nghost = -1; int Nghost_atoms = 0; - for(int ci = 0; ci < atom->Nclusters_local; ci++) { - if (atom->Nclusters_local + Nghost + 7 >= atom->Nclusters_max) { - growClusters(atom); + for(int cj = 0; cj < ncj; cj++) { + if(atom->jclusters[cj].natoms > 0) { + if(atom->Nclusters_local + (Nghost + 7) * jfac >= atom->Nclusters_max) { + growClusters(atom); + } + + if((Nghost + 7) * jfac >= NmaxGhost) { + growPbc(atom); + } + + MD_FLOAT bbminx = atom->jclusters[cj].bbminx; + MD_FLOAT bbmaxx = atom->jclusters[cj].bbmaxx; + MD_FLOAT bbminy = atom->jclusters[cj].bbminy; + MD_FLOAT bbmaxy = atom->jclusters[cj].bbmaxy; + MD_FLOAT bbminz = atom->jclusters[cj].bbminz; + MD_FLOAT bbmaxz = atom->jclusters[cj].bbmaxz; + + /* Setup ghost atoms */ + /* 6 planes */ + if (bbminx < Cutneigh) { ADDGHOST(+1,0,0); } + if (bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,0,0); } + if (bbminy < Cutneigh) { ADDGHOST(0,+1,0); } + if (bbmaxy >= (yprd-Cutneigh)) { ADDGHOST(0,-1,0); } + if (bbminz < Cutneigh) { ADDGHOST(0,0,+1); } + if (bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,0,-1); } + /* 8 corners */ + if (bbminx < Cutneigh && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,+1,+1); } + if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(+1,-1,+1); } + if (bbminx < Cutneigh && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); } + if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,-1,-1); } + if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(-1,+1,+1); } + if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,-1,+1); } + if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,+1,-1); } + if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,-1,-1); } + /* 12 edges */ + if (bbminx < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,0,+1); } + if (bbminx < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,0,-1); } + if (bbmaxx >= (xprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,0,+1); } + if (bbmaxx >= (xprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,0,-1); } + if (bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(0,+1,+1); } + if (bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,+1,-1); } + if (bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(0,-1,+1); } + if (bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,-1,-1); } + if (bbminy < Cutneigh && bbminx < Cutneigh) { ADDGHOST(+1,+1,0); } + if (bbminy < Cutneigh && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,+1,0); } + if (bbmaxy >= (yprd-Cutneigh) && bbminx < Cutneigh) { ADDGHOST(+1,-1,0); } + if (bbmaxy >= (yprd-Cutneigh) && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,-1,0); } } - - if (Nghost + 7 >= NmaxGhost) { - 
growPbc(atom); - border_map = atom->border_map; - } - - MD_FLOAT bbminx = atom->iclusters[ci].bbminx; - MD_FLOAT bbmaxx = atom->iclusters[ci].bbmaxx; - MD_FLOAT bbminy = atom->iclusters[ci].bbminy; - MD_FLOAT bbmaxy = atom->iclusters[ci].bbmaxy; - MD_FLOAT bbminz = atom->iclusters[ci].bbminz; - MD_FLOAT bbmaxz = atom->iclusters[ci].bbmaxz; - - /* Setup ghost atoms */ - /* 6 planes */ - if (bbminx < Cutneigh) { ADDGHOST(+1,0,0); } - if (bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,0,0); } - if (bbminy < Cutneigh) { ADDGHOST(0,+1,0); } - if (bbmaxy >= (yprd-Cutneigh)) { ADDGHOST(0,-1,0); } - if (bbminz < Cutneigh) { ADDGHOST(0,0,+1); } - if (bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,0,-1); } - /* 8 corners */ - if (bbminx < Cutneigh && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,+1,+1); } - if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(+1,-1,+1); } - if (bbminx < Cutneigh && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); } - if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,-1,-1); } - if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(-1,+1,+1); } - if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,-1,+1); } - if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,+1,-1); } - if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,-1,-1); } - /* 12 edges */ - if (bbminx < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,0,+1); } - if (bbminx < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,0,-1); } - if (bbmaxx >= (xprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,0,+1); } - if (bbmaxx >= (xprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,0,-1); } - if (bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(0,+1,+1); } - if (bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,+1,-1); } - if (bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(0,-1,+1); } - if (bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,-1,-1); } - if (bbminy < Cutneigh && bbminx < Cutneigh) { ADDGHOST(+1,+1,0); } - if (bbminy < Cutneigh && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,+1,0); } - if (bbmaxy >= (yprd-Cutneigh) && bbminx < Cutneigh) { ADDGHOST(+1,-1,0); } - if (bbmaxy >= (yprd-Cutneigh) && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,-1,0); } } - if(atom->Nclusters_local + Nghost + 2 >= atom->Nclusters_max) { + if(ncj + (Nghost + 1) * jfac >= atom->Nclusters_max) { growClusters(atom); } // Add dummy cluster at the end - int ci_vec_base = CI_VECTOR_BASE_INDEX(atom->Nclusters_local + Nghost + 1); - MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; - for(int cii = 0; cii < MAX(CLUSTER_M, CLUSTER_N); cii++) { - ci_x[CL_X_OFFSET + cii] = INFINITY; - ci_x[CL_Y_OFFSET + cii] = INFINITY; - ci_x[CL_Z_OFFSET + cii] = INFINITY; + int cj_vec_base = CJ_VECTOR_BASE_INDEX(ncj + Nghost + 1); + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; + for(int cjj = 0; cjj < CLUSTER_N; cjj++) { + cj_x[CL_X_OFFSET + cjj] = INFINITY; + cj_x[CL_Y_OFFSET + cjj] = INFINITY; + cj_x[CL_Z_OFFSET + cjj] = INFINITY; } // increase by one to make it the ghost atom count + atom->dummy_cj = ncj + Nghost + 1; atom->Nghost = Nghost_atoms; atom->Nclusters_ghost = Nghost + 1; atom->Nclusters = atom->Nclusters_local + Nghost + 1; From 8669f2f6d73ed89836f27877e91de1e0f028843c Mon Sep 17 00:00:00 2001 From: Rafael Ravedutti Date: Fri, 11 Mar 2022 01:12:59 
+0100 Subject: [PATCH 5/7] Fix LJ Simd4xN kernel Signed-off-by: Rafael Ravedutti --- gromacs/force_lj.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gromacs/force_lj.c b/gromacs/force_lj.c index 99d56a9..7b6d37f 100644 --- a/gromacs/force_lj.c +++ b/gromacs/force_lj.c @@ -342,7 +342,7 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_SIMD_FLOAT fiz3 = simd_zero(); for(int k = 0; k < numneighs; k++) { - int cj = neighs[k + 0]; + int cj = neighs[k]; int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]); @@ -369,8 +369,8 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0)); MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0)); #elif CLUSTER_M < CLUSTER_N - int cond0 = (unsigned int)(cj == (ci << 1) + 0); - int cond1 = (unsigned int)(cj == (ci << 1) + 1); + int cond0 = (unsigned int)((cj << 1) + 0 == ci); + int cond1 = (unsigned int)((cj << 1) + 1 == ci); MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1)); MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1)); MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1));
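A note on the fix above (annotation, not part of the patch series): with CLUSTER_M < CLUSTER_N, one j-cluster spans two i-clusters, so the self-overlap test has to expand cj into i-space rather than ci into j-space. A minimal sketch of the corrected check, reusing the kernel's variable names:

    /* j-cluster cj contains i-clusters (cj << 1) and (cj << 1) + 1, so ci
       overlaps cj when either of them equals ci. For example, ci = 5 maps to
       cj = 5 >> 1 = 2, and (2 << 1) + 1 == 5 sets cond1, masking the
       upper-half lanes of the pair. The replaced code compared cj against
       (ci << 1) + {0, 1}, which is the CLUSTER_M > CLUSTER_N mapping and
       excludes the wrong pairs here. */
    unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
    unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);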
From d61576699dad256d50c53cb3b3982390d69eb7c8 Mon Sep 17 00:00:00 2001 From: Rafael Ravedutti Date: Tue, 15 Mar 2022 02:40:56 +0100 Subject: [PATCH 6/7] Add first compilable version of Gromacs with SP Signed-off-by: Rafael Ravedutti --- Makefile | 15 +--- config.mk | 8 +- gromacs/force_lj.c | 172 +++++++++++++++------------------- gromacs/includes/simd.h | 145 +++++++++++++++++++++++++++++++-- include_ISA.mk | 13 +-- lammps/main-stub.c | 7 +- 6 files changed, 216 insertions(+), 144 deletions(-) diff --git a/Makefile b/Makefile index 78289dc..9d3cc90 100644 --- a/Makefile +++ b/Makefile @@ -17,9 +17,6 @@ INCLUDES += -I./$(SRC_DIR)/includes ifeq ($(strip $(DATA_LAYOUT)),AOS) DEFINES += -DAOS endif -ifeq ($(strip $(CLUSTER_LAYOUT)),AOS) -DEFINES += -DCLUSTER_AOS endif ifeq ($(strip $(DATA_TYPE)),SP) DEFINES += -DPRECISION=1 else @@ -30,14 +27,6 @@ ifneq ($(ASM_SYNTAX), ATT) ASFLAGS += -masm=intel endif -ifneq ($(ATOMS_LOOP_RUNS),) - DEFINES += -DATOMS_LOOP_RUNS=$(ATOMS_LOOP_RUNS) -endif - -ifneq ($(NEIGHBORS_LOOP_RUNS),) - DEFINES += -DNEIGHBORS_LOOP_RUNS=$(NEIGHBORS_LOOP_RUNS) -endif - ifeq ($(strip $(EXPLICIT_TYPES)),true) DEFINES += -DEXPLICIT_TYPES endif @@ -74,6 +63,10 @@ ifeq ($(strip $(NO_AVX2)),true) DEFINES += -DNO_AVX2 endif +ifeq ($(strip $(AVX512)),true) + DEFINES += -DAVX512 endif + VPATH = $(SRC_DIR) $(ASM_DIR) ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c)) OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s)) diff --git a/config.mk b/config.mk index de2bf9d..9afe7a0 100644 --- a/config.mk +++ b/config.mk @@ -7,7 +7,7 @@ OPT_SCHEME ?= gromacs # Enable likwid (true or false) ENABLE_LIKWID ?= true # SP or DP -DATA_TYPE ?= DP +DATA_TYPE ?= SP # AOS or SOA DATA_LAYOUT ?= AOS # Assembly syntax to generate (ATT/INTEL) @@ -15,10 +15,6 @@ ASM_SYNTAX ?= ATT # Debug DEBUG ?= false -# Number of times to run the atoms loop on stubbed variant -ATOMS_LOOP_RUNS ?= 1 -# Number of times to run the neighbors loop on stubbed variant -NEIGHBORS_LOOP_RUNS ?= 1 # Explicitly store and load atom types (true or false) EXPLICIT_TYPES ?= false # Trace memory addresses for cache simulator (true or false) MEM_TRACER ?= false # Trace indexes and distances for gather-md (true or false) INDEX_TRACER ?= false # Compute statistics COMPUTE_STATS ?= true @@ -29,8 +25,6 @@ INDEX_TRACER ?= false # Configurations for gromacs optimization scheme -# AOS or SOA -CLUSTER_LAYOUT ?= SOA # Use reference version USE_REFERENCE_VERSION ?= false # Enable XTC output diff --git a/gromacs/force_lj.c b/gromacs/force_lj.c index 7b6d37f..41749ad 100644 --- a/gromacs/force_lj.c +++ b/gromacs/force_lj.c @@ -128,7 +128,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat } double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { - DEBUG_MESSAGE("computeForceLJ_4xn begin\n"); + DEBUG_MESSAGE("computeForceLJ_2xnn begin\n"); int Nlocal = atom->Nlocal; int* neighs; MD_FLOAT cutforcesq = param->cutforce * param->cutforce; @@ -136,10 +136,10 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta MD_FLOAT epsilon = param->epsilon; MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq); MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6); - MD_SIMD_FLOAT epsilon_vec = simd_broadcast(epsilon); + MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon); MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0); MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5); - const int unroll_j = VECTOR_WIDTH / CLUSTER_N; + const unsigned int half_mask_bits = VECTOR_WIDTH >> 1; double S = getTimeStamp(); LIKWID_MARKER_START("force"); @@ -152,131 +152,86 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta neighs = &neighbor->neighbors[ci * neighbor->maxneighs]; int numneighs = neighbor->numneigh[ci]; - MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]); - MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]); - MD_SIMD_FLOAT xi2_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 2]); - MD_SIMD_FLOAT xi3_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 3]); - MD_SIMD_FLOAT yi0_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 0]); - MD_SIMD_FLOAT yi1_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 1]); - MD_SIMD_FLOAT yi2_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 2]); - MD_SIMD_FLOAT yi3_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 3]); - MD_SIMD_FLOAT zi0_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 0]); - MD_SIMD_FLOAT zi1_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 1]); - MD_SIMD_FLOAT zi2_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 2]); - MD_SIMD_FLOAT zi3_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 3]); + MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]); + MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]); + MD_SIMD_FLOAT yi0_tmp = simd_load_h_dual(&ci_x[CL_Y_OFFSET + 0]); + MD_SIMD_FLOAT yi2_tmp = simd_load_h_dual(&ci_x[CL_Y_OFFSET + 2]); + MD_SIMD_FLOAT zi0_tmp = simd_load_h_dual(&ci_x[CL_Z_OFFSET + 0]); + MD_SIMD_FLOAT zi2_tmp = simd_load_h_dual(&ci_x[CL_Z_OFFSET + 2]); MD_SIMD_FLOAT fix0 = simd_zero(); MD_SIMD_FLOAT fiy0 = simd_zero(); MD_SIMD_FLOAT fiz0 = simd_zero(); - MD_SIMD_FLOAT fix1 = simd_zero(); - MD_SIMD_FLOAT fiy1 = simd_zero(); - MD_SIMD_FLOAT fiz1 = simd_zero(); MD_SIMD_FLOAT fix2 = simd_zero(); MD_SIMD_FLOAT fiy2 = simd_zero(); MD_SIMD_FLOAT fiz2 = simd_zero(); - MD_SIMD_FLOAT fix3 = simd_zero(); - MD_SIMD_FLOAT fiy3 = simd_zero(); - MD_SIMD_FLOAT fiz3 = simd_zero(); - for(int k = 0; k < numneighs; k += unroll_j) { + for(int k = 0; k < numneighs; k += 2) { int cj = neighs[k]; int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; - unsigned int cond0 = (unsigned int)(ci == cj); - unsigned int cond1 = 
cond0; - MD_SIMD_FLOAT xj_tmp, yj_tmp, zj_tmp; - - /* - MD_FLOAT *cj0_ptr = cluster_pos_ptr(cj0); - #if VECTOR_WIDTH == 8 - int cj1 = neighs[k + 1]; - unsigned int cond1 = (unsigned int)(ci == cj1); - MD_FLOAT *cj1_ptr = cluster_pos_ptr(cj1); - MD_SIMD_FLOAT xj_tmp = simd_load2(cj0_ptr, cj1_ptr, 0); - MD_SIMD_FLOAT yj_tmp = simd_load2(cj0_ptr, cj1_ptr, 1); - MD_SIMD_FLOAT zj_tmp = simd_load2(cj0_ptr, cj1_ptr, 2); - #else - MD_SIMD_FLOAT xj_tmp = simd_load(cj0_ptr, 0); - MD_SIMD_FLOAT yj_tmp = simd_load(cj0_ptr, 1); - MD_SIMD_FLOAT zj_tmp = simd_load(cj0_ptr, 2); - #endif - */ + unsigned int mask0, mask1, mask2, mask3; + MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]); + MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]); + MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]); MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp); MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp); MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp); - MD_SIMD_FLOAT delx1 = simd_sub(xi1_tmp, xj_tmp); - MD_SIMD_FLOAT dely1 = simd_sub(yi1_tmp, yj_tmp); - MD_SIMD_FLOAT delz1 = simd_sub(zi1_tmp, zj_tmp); MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp); MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp); MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp); - MD_SIMD_FLOAT delx3 = simd_sub(xi3_tmp, xj_tmp); - MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp); - MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp); - #if VECTOR_WIDTH == 8 - MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1)); - MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1)); - MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1)); - MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1)); + #if CLUSTER_M == CLUSTER_N + unsigned int cond0 = (unsigned int)(cj == ci_cj0); + mask0 = (unsigned int)(0xf - 0x1 * cond0); + mask1 = (unsigned int)(0xf - 0x2 * cond0); + mask2 = (unsigned int)(0xf - 0x4 * cond0); + mask3 = (unsigned int)(0xf - 0x8 * cond0); + #elif CLUSTER_M < CLUSTER_N + unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci); + unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci); + mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1); + mask1 = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1); + mask2 = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1); + mask3 = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1); #else - MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0)); - MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0)); - MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0)); - MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0)); + unsigned int cond0 = (unsigned int)(cj == ci_cj0); + unsigned int cond1 = (unsigned int)(cj == ci_cj1); + mask0 = (unsigned int)(0x3 - 0x1 * cond0); + mask1 = (unsigned int)(0x3 - 0x2 * cond0); + mask2 = (unsigned int)(0x3 - 0x1 * cond1); + mask3 = (unsigned int)(0x3 - 0x2 * cond1); #endif + MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0); + MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2); + MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0))); - MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, simd_mul(delz1, delz1))); MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, 
dely2, simd_mul(delz2, delz2))); - MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, simd_mul(delz3, delz3))); MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec)); - MD_SIMD_MASK cutoff_mask1 = simd_mask_and(excl_mask1, simd_mask_cond_lt(rsq1, cutforcesq_vec)); MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec)); - MD_SIMD_MASK cutoff_mask3 = simd_mask_and(excl_mask3, simd_mask_cond_lt(rsq3, cutforcesq_vec)); MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0); - MD_SIMD_FLOAT sr2_1 = simd_reciprocal(rsq1); MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2); - MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3); MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec))); - MD_SIMD_FLOAT sr6_1 = simd_mul(sr2_1, simd_mul(sr2_1, simd_mul(sr2_1, sigma6_vec))); MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec))); - MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec))); - MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, epsilon_vec)))); - MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, epsilon_vec)))); - MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, epsilon_vec)))); - MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, epsilon_vec)))); + MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec)))); + MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec)))); fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0); fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0); fiz0 = simd_masked_add(fiz0, simd_mul(delz0, force0), cutoff_mask0); - fix1 = simd_masked_add(fix1, simd_mul(delx1, force1), cutoff_mask1); - fiy1 = simd_masked_add(fiy1, simd_mul(dely1, force1), cutoff_mask1); - fiz1 = simd_masked_add(fiz1, simd_mul(delz1, force1), cutoff_mask1); fix2 = simd_masked_add(fix2, simd_mul(delx2, force2), cutoff_mask2); fiy2 = simd_masked_add(fiy2, simd_mul(dely2, force2), cutoff_mask2); fiz2 = simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2); - fix3 = simd_masked_add(fix3, simd_mul(delx3, force3), cutoff_mask3); - fiy3 = simd_masked_add(fiy3, simd_mul(dely3, force3), cutoff_mask3); - fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3); } - ci_f[CL_X_OFFSET + 0] = simd_horizontal_sum(fix0); - ci_f[CL_X_OFFSET + 1] = simd_horizontal_sum(fix1); - ci_f[CL_X_OFFSET + 2] = simd_horizontal_sum(fix2); - ci_f[CL_X_OFFSET + 3] = simd_horizontal_sum(fix3); - ci_f[CL_Y_OFFSET + 0] = simd_horizontal_sum(fiy0); - ci_f[CL_Y_OFFSET + 1] = simd_horizontal_sum(fiy1); - ci_f[CL_Y_OFFSET + 2] = simd_horizontal_sum(fiy2); - ci_f[CL_Y_OFFSET + 3] = simd_horizontal_sum(fiy3); - ci_f[CL_Z_OFFSET + 0] = simd_horizontal_sum(fiz0); - ci_f[CL_Z_OFFSET + 1] = simd_horizontal_sum(fiz1); - ci_f[CL_Z_OFFSET + 2] = simd_horizontal_sum(fiz2); - ci_f[CL_Z_OFFSET + 3] = simd_horizontal_sum(fiz3); + simd_h_dual_reduce_sum(&ci_f[CL_X_OFFSET], fix0, fix2); + simd_h_dual_reduce_sum(&ci_f[CL_Y_OFFSET], fiy0, fiy2); + simd_h_dual_reduce_sum(&ci_f[CL_Z_OFFSET], fiz0, fiz2); addStat(stats->calculated_forces, 1); addStat(stats->num_neighs, numneighs); @@ -285,7 +240,7 @@ double 
computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta LIKWID_MARKER_STOP("force"); double E = getTimeStamp(); - DEBUG_MESSAGE("computeForceLJ_4xn end\n"); + DEBUG_MESSAGE("computeForceLJ_2xnn end\n"); return E-S; } @@ -298,7 +253,7 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_FLOAT epsilon = param->epsilon; MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq); MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6); - MD_SIMD_FLOAT epsilon_vec = simd_broadcast(epsilon); + MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon); MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0); MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5); double S = getTimeStamp(); @@ -348,7 +303,6 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]); MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]); MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]); - MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp); MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp); MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp); @@ -363,14 +317,14 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp); #if CLUSTER_M == CLUSTER_N - int cond0 = (unsigned int)(cj == ci_cj0); + unsigned int cond0 = (unsigned int)(cj == ci_cj0); MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0)); MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0)); MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0)); MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0)); #elif CLUSTER_M < CLUSTER_N - int cond0 = (unsigned int)((cj << 1) + 0 == ci); - int cond1 = (unsigned int)((cj << 1) + 1 == ci); + unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci); + unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci); MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1)); MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1)); MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1)); @@ -404,10 +358,10 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec))); MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec))); - MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, epsilon_vec)))); - MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, epsilon_vec)))); - MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, epsilon_vec)))); - MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, epsilon_vec)))); + MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec)))); + MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, eps_vec)))); + MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec)))); + MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, eps_vec)))); fix0 = 
simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0); fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0); @@ -423,18 +377,18 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3); } - ci_f[CL_X_OFFSET + 0] = simd_horizontal_sum(fix0); - ci_f[CL_X_OFFSET + 1] = simd_horizontal_sum(fix1); - ci_f[CL_X_OFFSET + 2] = simd_horizontal_sum(fix2); - ci_f[CL_X_OFFSET + 3] = simd_horizontal_sum(fix3); - ci_f[CL_Y_OFFSET + 0] = simd_horizontal_sum(fiy0); - ci_f[CL_Y_OFFSET + 1] = simd_horizontal_sum(fiy1); - ci_f[CL_Y_OFFSET + 2] = simd_horizontal_sum(fiy2); - ci_f[CL_Y_OFFSET + 3] = simd_horizontal_sum(fiy3); - ci_f[CL_Z_OFFSET + 0] = simd_horizontal_sum(fiz0); - ci_f[CL_Z_OFFSET + 1] = simd_horizontal_sum(fiz1); - ci_f[CL_Z_OFFSET + 2] = simd_horizontal_sum(fiz2); - ci_f[CL_Z_OFFSET + 3] = simd_horizontal_sum(fiz3); + ci_f[CL_X_OFFSET + 0] = simd_h_reduce_sum(fix0); + ci_f[CL_X_OFFSET + 1] = simd_h_reduce_sum(fix1); + ci_f[CL_X_OFFSET + 2] = simd_h_reduce_sum(fix2); + ci_f[CL_X_OFFSET + 3] = simd_h_reduce_sum(fix3); + ci_f[CL_Y_OFFSET + 0] = simd_h_reduce_sum(fiy0); + ci_f[CL_Y_OFFSET + 1] = simd_h_reduce_sum(fiy1); + ci_f[CL_Y_OFFSET + 2] = simd_h_reduce_sum(fiy2); + ci_f[CL_Y_OFFSET + 3] = simd_h_reduce_sum(fiy3); + ci_f[CL_Z_OFFSET + 0] = simd_h_reduce_sum(fiz0); + ci_f[CL_Z_OFFSET + 1] = simd_h_reduce_sum(fiz1); + ci_f[CL_Z_OFFSET + 2] = simd_h_reduce_sum(fiz2); + ci_f[CL_Z_OFFSET + 3] = simd_h_reduce_sum(fiz3); addStat(stats->calculated_forces, 1); addStat(stats->num_neighs, numneighs); diff --git a/gromacs/includes/simd.h b/gromacs/includes/simd.h index 1d6a90b..59a044a 100644 --- a/gromacs/includes/simd.h +++ b/gromacs/includes/simd.h @@ -21,6 +21,7 @@ * ======================================================================================= */ +#include #include #include #include @@ -28,7 +29,9 @@ #define SIMD_PRINT_REAL(a) simd_print_real(#a, a); #define SIMD_PRINT_MASK(a) simd_print_mask(#a, a); -#if VECTOR_WIDTH == 8 // AVX512 +#ifdef AVX512 + +#if PRECISION == 2 // Double precision #define MD_SIMD_FLOAT __m512d #define MD_SIMD_MASK __mmask8 @@ -47,15 +50,98 @@ static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_m static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); } static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_pd(p); } -static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) { +static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) { MD_SIMD_FLOAT x = _mm512_add_pd(a, _mm512_shuffle_f64x2(a, a, 0xee)); x = _mm512_add_pd(x, _mm512_shuffle_f64x2(x, x, 0x11)); x = _mm512_add_pd(x, _mm512_permute_pd(x, 0x01)); return *((MD_FLOAT *) &x); } +static inline MD_SIMD_FLOAT simd_load_h_duplicate(const double* m) { + return _mm512_broadcast_f64x4(_mm256_load_pd(m)); +} + +static inline MD_SIMD_FLOAT simd_load_h_dual(const double* m) { + return _mm512_insertf64x4(_mm512_broadcastsd_pd(_mm_load_sd(m)), _mm256_broadcastsd_pd(_mm_load_sd(m + 1)), 1); +} + +static inline double simd_h_dual_reduce_sum(double* m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) { + __m512d t0; + __m256d t2, t3; + + t0 = _mm512_add_pd(v0, _mm512_permutex_pd(v0, 0x4e)); + t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xccul), v1, _mm512_permutex_pd(v1, 0x4e)); + t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1)); + t0 = _mm512_mask_shuffle_f64x2(t0, simd_mask_from_u32(0xaaul), t0, t0, 0xee); + t2 = 
_mm512_castpd512_pd256(t0); + t3 = _mm256_load_pd(m); + t3 = _mm256_add_pd(t3, t2); + _mm256_store_pd(m, t3); + + t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0x4e)); + t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1)); + return _mm_cvtsd_f64(_mm512_castpd512_pd128(t0)); +} + +#else // Single-precision + +#define MD_SIMD_FLOAT __m512 +#define MD_SIMD_MASK __mmask16 + +static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); } +static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_ps(0.0f); } +static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_ps(a, b); } +static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_sub_ps(a, b); } +static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_mul_ps(a, b); } +static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm512_fmadd_ps(a, b, c); } +static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm512_rcp14_ps(a); } +static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm512_mask_add_ps(a, m, a, b); } +static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask16(a, b); } +static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ); } +static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask16(a); } +static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask16_u32(a); } +static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_ps(p); } +static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) { + // This would only be called in a Mx16 configuration, which is not valid in GROMACS + fprintf(stderr, "simd_h_reduce_sum(): Called with AVX512 intrinsics and single-precision which is not valid!\n"); + exit(-1); + return 0.0; +} + +static inline MD_SIMD_FLOAT simd_load_h_duplicate(const float* m) { + return _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_load_pd((const double *)(m)))); +} + +static inline MD_SIMD_FLOAT simd_load_h_dual(const float* m) { + return _mm512_shuffle_f32x4(_mm512_broadcastss_ps(_mm_load_ss(m)), _mm512_broadcastss_ps(_mm_load_ss(m + 1)), 0x44); +} + +static inline MD_FLOAT simd_h_dual_reduce_sum(float* m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) { + __m512 t0, t1; + __m128 t2, t3; + + t0 = _mm512_shuffle_f32x4(v0, v1, 0x88); + t1 = _mm512_shuffle_f32x4(v0, v1, 0xdd); + t0 = _mm512_add_ps(t0, t1); + t0 = _mm512_add_ps(t0, _mm512_permute_ps(t0, 0x4e)); + t0 = _mm512_add_ps(t0, _mm512_permute_ps(t0, 0xb1)); + t0 = _mm512_maskz_compress_ps(simd_mask_from_u32(0x1111ul), t0); + t3 = _mm512_castps512_ps128(t0); + t2 = _mm_load_ps(m); + t2 = _mm_add_ps(t2, t3); + _mm_store_ps(m, t2); + + t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0x4e)); + t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0xb1)); + return _mm_cvtss_f32(t3); +} + +#endif // PRECISION + #else // AVX or AVX2 +#if PRECISION == 2 // Double precision + #define MD_SIMD_FLOAT __m256d #ifdef NO_AVX2 @@ -72,6 +158,7 @@ static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_pd(p); } #ifdef NO_AVX2 + static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(a))); } static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT 
c) { return simd_add(simd_mul(a, b), c); } static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return simd_add(a, _mm256_and_pd(b, m)); } @@ -85,7 +172,7 @@ static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { } // TODO: Implement this, althrough it is just required for debugging static inline int simd_mask_to_u32(MD_SIMD_MASK a) { return 0; } -static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) { +static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) { __m128d a0, a1; a = _mm256_add_pd(a, _mm256_permute_pd(a, 0b0101)); a0 = _mm256_castpd256_pd128(a); @@ -93,7 +180,9 @@ static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) { a0 = _mm_add_sd(a0, a1); return *((MD_FLOAT *) &a0); } -#else + +#else // AVX2 + static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_pd(a); } static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_pd(a, b, c); } static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_pd(a, m, a, b); } @@ -101,7 +190,7 @@ static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); } static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); } static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); } -static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) { +static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) { __m128d a0, a1; // test with shuffle & add as an alternative to hadd later a = _mm256_hadd_pd(a, a); @@ -110,11 +199,53 @@ static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) { a0 = _mm_add_sd(a0, a1); return *((MD_FLOAT *) &a0); } -#endif - #endif +#else // Single-precision + +#define MD_SIMD_FLOAT __m256 + +#ifdef NO_AVX2 +#define MD_SIMD_MASK __m256 +#else +#define MD_SIMD_MASK __mmask8 +#endif + +static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm256_set1_ps(scalar); } +static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_ps(0.0); } +static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_ps(a, b); } +static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_ps(a, b); } +static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_ps(a, b); } +static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_ps(p); } + +#ifdef NO_AVX2 + +#error "AVX intrinsincs with single-precision not implemented!" 
+
+#else // AVX2
+
+static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_ps(a); }
+static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_ps(a, b, c); }
+static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_ps(a, m, a, b); }
+static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_ps_mask(a, b, _CMP_LT_OQ); }
+static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
+static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
+static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
+static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
+    __m128 t0;
+    t0 = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 0x1));
+    t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
+    t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
+    return *((MD_FLOAT *) &t0);
+}
+
+#endif // NO_AVX2
+
+#endif // PRECISION
+
+#endif // AVX or AVX2
+
 static inline void simd_print_real(const char *ref, MD_SIMD_FLOAT a) {
     double x[VECTOR_WIDTH];
     memcpy(x, &a, sizeof(x));
diff --git a/include_ISA.mk b/include_ISA.mk
index 215f626..fa9b90b 100644
--- a/include_ISA.mk
+++ b/include_ISA.mk
@@ -1,15 +1,18 @@
 ifeq ($(strip $(ISA)), SSE)
-VECTOR_WIDTH=2
+_VECTOR_WIDTH=2
 else ifeq ($(strip $(ISA)), AVX)
 # Vector width is 4 but AVX2 instruction set is not supported
 NO_AVX2=true
-VECTOR_WIDTH=4
+_VECTOR_WIDTH=4
 else ifeq ($(strip $(ISA)), AVX2)
-VECTOR_WIDTH=4
+_VECTOR_WIDTH=4
 else ifeq ($(strip $(ISA)), AVX512)
-VECTOR_WIDTH=8
+AVX512=true
+_VECTOR_WIDTH=8
 endif
 
 ifeq ($(strip $(DATA_TYPE)), SP)
-VECTOR_WIDTH=$((VECTOR_WIDTH * 2))
+VECTOR_WIDTH=$(shell echo $$(( $(_VECTOR_WIDTH) * 2 )))
+else
+VECTOR_WIDTH=$(_VECTOR_WIDTH)
 endif
diff --git a/lammps/main-stub.c b/lammps/main-stub.c
index 4da90db..c7596e7 100644
--- a/lammps/main-stub.c
+++ b/lammps/main-stub.c
@@ -217,8 +217,6 @@ int main(int argc, const char *argv[]) {
 
     if(!csv) {
         printf("Number of timesteps: %d\n", param.ntimes);
-        printf("Number of times to compute the atoms loop: %d\n", ATOMS_LOOP_RUNS);
-        printf("Number of times to compute the neighbors loop: %d\n", NEIGHBORS_LOOP_RUNS);
         printf("System size (unit cells): %dx%dx%d\n", param.nx, param.ny, param.nz);
         printf("Atoms per unit cell: %d\n", atoms_per_unit_cell);
         printf("Total number of atoms: %d\n", atom->Nlocal);
@@ -257,9 +255,8 @@ int main(int argc, const char *argv[]) {
     E = getTimeStamp();
     double T_accum = E-S;
     double freq_hz = param.proc_freq * 1.e9;
-    const double repeats = ATOMS_LOOP_RUNS * NEIGHBORS_LOOP_RUNS;
-    const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes * repeats);
-    const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes * repeats) * freq_hz;
+    const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes);
+    const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes) * freq_hz;
     const double cycles_per_neigh = cycles_per_atom / (double)(atoms_per_unit_cell - 1);
 
     if(!csv) {

From d47173d7a2bd486deecab52c444fc0293baff837 Mon Sep 17 00:00:00 2001
From: Rafael Ravedutti
Date: Tue, 15 Mar 2022 19:59:10 +0100
Subject: [PATCH 7/7] Fix Simd2xNN kernel

Signed-off-by: Rafael Ravedutti
---
 gromacs/force_lj.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/gromacs/force_lj.c b/gromacs/force_lj.c
index 41749ad..afdc311 100644
--- a/gromacs/force_lj.c
+++ b/gromacs/force_lj.c
@@ -141,6 +141,16 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta
     MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
     const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
 
+    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
+        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
+        MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
+        for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
+            ci_f[CL_X_OFFSET + cii] = 0.0;
+            ci_f[CL_Y_OFFSET + cii] = 0.0;
+            ci_f[CL_Z_OFFSET + cii] = 0.0;
+        }
+    }
+
     double S = getTimeStamp();
     LIKWID_MARKER_START("force");
 
@@ -165,7 +175,7 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta
         MD_SIMD_FLOAT fiy2 = simd_zero();
         MD_SIMD_FLOAT fiz2 = simd_zero();
 
-        for(int k = 0; k < numneighs; k += 2) {
+        for(int k = 0; k < numneighs; k++) {
             int cj = neighs[k];
             int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
             MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
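
A note on the packed exclusion masks in computeForceLJ_2xnn(): two i-rows share one
SIMD register, so their two per-row diagonal-exclusion masks are combined into a
single mask word as (mask1 << half_mask_bits) | mask0 before simd_mask_from_u32().
The standalone sketch below is an illustration only, not part of the patch series;
it hard-codes VECTOR_WIDTH = 8 and mirrors the CLUSTER_M == CLUSTER_N branch, and
packed_excl_mask() is a hypothetical helper name:

#include <stdio.h>

#define VECTOR_WIDTH 8

/* Packed diagonal-exclusion mask for i-rows (row, row + 1) of i-cluster ci
 * against j-cluster cj, following the mask arithmetic in computeForceLJ_2xnn(). */
static unsigned int packed_excl_mask(int ci, int cj, int row) {
    const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
    unsigned int cond0 = (unsigned int)(cj == ci);         /* same cluster: drop self-pairs */
    unsigned int lo = 0xf - (0x1u << (row + 0)) * cond0;   /* row r against 4 j-atoms */
    unsigned int hi = 0xf - (0x1u << (row + 1)) * cond0;   /* row r+1 against 4 j-atoms */
    return (hi << half_mask_bits) | lo;
}

int main(void) {
    printf("%02x\n", packed_excl_mask(3, 3, 0)); /* de: lanes 0 and 5 masked off */
    printf("%02x\n", packed_excl_mask(3, 3, 2)); /* 7b: lanes 2 and 7 masked off */
    printf("%02x\n", packed_excl_mask(3, 5, 0)); /* ff: different clusters, all lanes active */
    return 0;
}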
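
The two half-vector loads added to simd.h supply the operands for that packed
evaluation. A scalar emulation of their semantics, assuming double precision and
VECTOR_WIDTH = 8 (the output arrays are stand-ins for the __m512d lanes):

#include <stddef.h>

#define VECTOR_WIDTH 8
#define HALF_WIDTH (VECTOR_WIDTH / 2)

/* simd_load_h_duplicate(): one j-cluster's four coordinates, copied into both
 * vector halves so that two i-rows see the same j-atoms. */
void load_h_duplicate(double out[VECTOR_WIDTH], const double *m) {
    for (size_t i = 0; i < VECTOR_WIDTH; i++) {
        out[i] = m[i % HALF_WIDTH];
    }
}

/* simd_load_h_dual(): two consecutive i-row coordinates, each broadcast across
 * one half: lanes 0..3 hold m[0], lanes 4..7 hold m[1]. */
void load_h_dual(double out[VECTOR_WIDTH], const double *m) {
    for (size_t i = 0; i < VECTOR_WIDTH; i++) {
        out[i] = m[i / HALF_WIDTH];
    }
}

With the coordinates laid out this way, one subtraction yields the distance
components of two i-rows against the same j-cluster, which is why the fixed
kernel in PATCH 7/7 advances the neighbor loop by one j-cluster per iteration.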
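
simd_h_dual_reduce_sum() is the matching reduction on the store side: it collapses
each half of the two force accumulators to one scalar per i-row, adds the four row
sums into memory, and returns their total. A scalar equivalent, based on my reading
of the AVX-512 permute sequence and the same VECTOR_WIDTH = 8 assumption:

/* v0 holds rows 0/1 (one per half), v1 holds rows 2/3; m points at the four
 * per-row force slots, e.g. &ci_f[CL_X_OFFSET]. */
double h_dual_reduce_sum(double *m, const double v0[8], const double v1[8]) {
    double sums[4] = { 0.0, 0.0, 0.0, 0.0 };
    for (int i = 0; i < 4; i++) {
        sums[0] += v0[i];     /* row 0: low half of v0 */
        sums[1] += v0[4 + i]; /* row 1: high half of v0 */
        sums[2] += v1[i];     /* row 2: low half of v1 */
        sums[3] += v1[4 + i]; /* row 3: high half of v1 */
    }
    double total = 0.0;
    for (int i = 0; i < 4; i++) {
        m[i] += sums[i];      /* accumulate into the per-row force slots */
        total += sums[i];
    }
    return total;
}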