diff --git a/config.mk b/config.mk
index 7e5f72a..3767ee7 100644
--- a/config.mk
+++ b/config.mk
@@ -13,7 +13,7 @@ DATA_LAYOUT ?= AOS
 # Assembly syntax to generate (ATT/INTEL)
 ASM_SYNTAX ?= ATT
 # Debug
-DEBUG ?= false
+DEBUG ?= true
 
 # Explicitly store and load atom types (true or false)
 EXPLICIT_TYPES ?= false
diff --git a/gromacs/atom.c b/gromacs/atom.c
index 047775e..78a1924 100644
--- a/gromacs/atom.c
+++ b/gromacs/atom.c
@@ -452,12 +452,5 @@ void growSuperClusters(Atom *atom) {
     atom->scl_x = (MD_FLOAT*) reallocate(atom->scl_x, ALIGNMENT, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT), nold * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
     atom->scl_f = (MD_FLOAT*) reallocate(atom->scl_f, ALIGNMENT, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT), nold * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
     atom->scl_v = (MD_FLOAT*) reallocate(atom->scl_v, ALIGNMENT, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT), nold * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
-
-
-    /*
-    for (int sci = 0; sci < atom->Nsclusters_max; sci++) {
-        atom->siclusters[sci].iclusters = (int*) reallocate(atom->siclusters[sci].iclusters, ALIGNMENT, SCLUSTER_SIZE * sizeof(int), SCLUSTER_SIZE * sizeof(int));
-    }
-     */
 }
 #endif //USE_SUPER_CLUSTERS
diff --git a/gromacs/cuda/force_lj.cu b/gromacs/cuda/force_lj.cu
index 2b7d449..aa02fdd 100644
--- a/gromacs/cuda/force_lj.cu
+++ b/gromacs/cuda/force_lj.cu
@@ -154,11 +154,11 @@ void copyDataFromCUDADevice(Atom *atom) {
     memcpyFromGPU(atom->cl_f, cuda_cl_f, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
 
 #ifdef USE_SUPER_CLUSTERS
-    alignDataFromSuperclusters(atom);
-
     memcpyFromGPU(atom->scl_x, cuda_scl_x, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
     memcpyFromGPU(atom->scl_v, cuda_scl_v, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
     memcpyFromGPU(atom->scl_f, cuda_scl_f, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
+
+    alignDataFromSuperclusters(atom);
 #endif //USE_SUPER_CLUSTERS
 
     DEBUG_MESSAGE("copyDataFromCUDADevice stop\r\n");
@@ -241,6 +241,39 @@ __global__ void cudaUpdatePbc_warp(MD_FLOAT *cuda_cl_x, int *cuda_border_map,
     }
 }
 
+__global__ void cudaUpdatePbcSup_warp(MD_FLOAT *cuda_cl_x, int *cuda_border_map,
+                                   int *cuda_jclusters_natoms,
+                                   int *cuda_PBCx,
+                                   int *cuda_PBCy,
+                                   int *cuda_PBCz,
+                                   int Nsclusters_local,
+                                   int Nclusters_ghost,
+                                   MD_FLOAT param_xprd,
+                                   MD_FLOAT param_yprd,
+                                   MD_FLOAT param_zprd) {
+    unsigned int cg = blockDim.x * blockIdx.x + threadIdx.x;
+    if (cg >= Nclusters_ghost) return;
+
+    //int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
+    int jfac = SCLUSTER_SIZE / CLUSTER_M;
+    int ncj = Nsclusters_local / jfac;
+    MD_FLOAT xprd = param_xprd;
+    MD_FLOAT yprd = param_yprd;
+    MD_FLOAT zprd = param_zprd;
+
+    const int cj = ncj + cg;
+    int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
+    int bmap_vec_base = CJ_VECTOR_BASE_INDEX(cuda_border_map[cg]);
+    MD_FLOAT *cj_x = &cuda_cl_x[cj_vec_base];
+    MD_FLOAT *bmap_x = &cuda_cl_x[bmap_vec_base];
+
+    for(int cjj = 0; cjj < cuda_jclusters_natoms[cg]; cjj++) {
+        cj_x[CL_X_OFFSET + cjj] = bmap_x[CL_X_OFFSET + cjj] + cuda_PBCx[cg] * xprd;
+        cj_x[CL_Y_OFFSET + cjj] = bmap_x[CL_Y_OFFSET + cjj] + cuda_PBCy[cg] * yprd;
+        cj_x[CL_Z_OFFSET + cjj] = bmap_x[CL_Z_OFFSET + cjj] + cuda_PBCz[cg] * zprd;
+    }
+}
+
 __global__ void computeForceLJ_cuda_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_f,
                                          int Nclusters_local, int Nclusters_max,
                                          int *cuda_numneigh, int *cuda_neighs, int half_neigh, int maxneighs,
@@ -348,11 +381,19 @@ extern "C"
 void cudaUpdatePbc(Atom *atom, Parameter *param) {
     const int threads_num = 512;
     dim3 block_size = dim3(threads_num, 1, 1);;
-    dim3 grid_size = dim3(atom->Nclusters_ghost/(threads_num)+1, 1, 1);;
-    cudaUpdatePbc_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_border_map,
+    dim3 grid_size = dim3(atom->Nclusters_ghost/(threads_num)+1, 1, 1);
+
+#ifdef USE_SUPER_CLUSTERS
+    cudaUpdatePbcSup_warp<<<grid_size, block_size>>>(cuda_scl_x, cuda_border_map,
                                        cuda_jclusters_natoms, cuda_PBCx, cuda_PBCy, cuda_PBCz,
                                        atom->Nclusters_local, atom->Nclusters_ghost,
                                        param->xprd, param->yprd, param->zprd);
+#else
+    cudaUpdatePbc_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_border_map,
+                                                  cuda_jclusters_natoms, cuda_PBCx, cuda_PBCy, cuda_PBCz,
+                                                  atom->Nclusters_local, atom->Nclusters_ghost,
+                                                  param->xprd, param->yprd, param->zprd);
+#endif //USE_SUPER_CLUSTERS
     cuda_assert("cudaUpdatePbc", cudaPeekAtLastError());
     cuda_assert("cudaUpdatePbc", cudaDeviceSynchronize());
 }
diff --git a/gromacs/cuda/force_lj_sup.cu b/gromacs/cuda/force_lj_sup.cu
index 23cf486..7d5d9c6 100644
--- a/gromacs/cuda/force_lj_sup.cu
+++ b/gromacs/cuda/force_lj_sup.cu
@@ -193,9 +193,11 @@ __global__ void computeForceLJSup_cuda_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_
     int numneighs = cuda_numneigh[cuda_iclusters[SCLUSTER_SIZE * sci_pos + ci_pos]];
 
     for(int k = 0; k < numneighs; k++) {
-        int cj = (&cuda_neighs[cuda_iclusters[SCLUSTER_SIZE * sci_pos + ci_pos] * maxneighs])[k];
+        int glob_j = (&cuda_neighs[cuda_iclusters[SCLUSTER_SIZE * sci_pos + ci_pos] * maxneighs])[k];
+        int scj = glob_j / SCLUSTER_SIZE;
         // TODO Make cj accessible from super cluster data alignment (not reachable right now)
-        int cj_vec_base = SCJ_VECTOR_BASE_INDEX(cj);
+        int cj = SCJ_VECTOR_BASE_INDEX(scj) + CLUSTER_M * (glob_j % SCLUSTER_SIZE);
+        int cj_vec_base = cj;
         MD_FLOAT *cj_x = &cuda_cl_x[cj_vec_base];
         MD_FLOAT *cj_f = &cuda_cl_f[cj_vec_base];
 
@@ -206,14 +208,10 @@ __global__ void computeForceLJSup_cuda_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_
         MD_FLOAT fiy = 0;
         MD_FLOAT fiz = 0;
 
-        int cond;
-#if CLUSTER_M == CLUSTER_N
-        cond = half_neigh ? (ci_cj0 != cj || cii_pos < cjj_pos) :
-               (ci_cj0 != cj || cii_pos != cjj_pos);
-#elif CLUSTER_M < CLUSTER_N
-        cond = half_neigh ? (ci_cj0 != cj || cii_pos + CLUSTER_M * (ci_pos & 0x1) < cjj_pos) :
-                            (ci_cj0 != cj || cii_pos + CLUSTER_M * (ci_pos & 0x1) != cjj_pos);
-#endif
+
+        //int cond = ci_cj0 != cj || cii_pos != cjj_pos || scj != sci_pos;
+        int cond = (glob_j != cuda_iclusters[SCLUSTER_SIZE * sci_pos + ci_pos] && cii_pos != cjj_pos);
+
         if(cond) {
             MD_FLOAT delx = xtmp - cj_x[SCL_CL_X_OFFSET(ci_pos) + cjj_pos];
             MD_FLOAT dely = ytmp - cj_x[SCL_CL_Y_OFFSET(ci_pos) + cjj_pos];
diff --git a/gromacs/includes/atom.h b/gromacs/includes/atom.h
index 13f9d73..281548e 100644
--- a/gromacs/includes/atom.h
+++ b/gromacs/includes/atom.h
@@ -149,7 +149,6 @@ typedef struct {
 } Cluster;
 
 typedef struct {
-    //int *iclusters;
     int nclusters;
     MD_FLOAT bbminx, bbmaxx;
     MD_FLOAT bbminy, bbmaxy;
diff --git a/gromacs/includes/pbc.h b/gromacs/includes/pbc.h
index 33ceb53..50607c2 100644
--- a/gromacs/includes/pbc.h
+++ b/gromacs/includes/pbc.h
@@ -16,5 +16,8 @@ extern void setupPbc(Atom*, Parameter*);
 
 #ifdef CUDA_TARGET
 extern void cudaUpdatePbc(Atom*, Parameter*, int);
+#if defined(USE_SUPER_CLUSTERS)
+extern void setupPbcGPU(Atom*, Parameter*);
+#endif //defined(USE_SUPER_CLUSTERS)
 #endif
 #endif
diff --git a/gromacs/includes/utils.h b/gromacs/includes/utils.h
index d2c3130..326c022 100644
--- a/gromacs/includes/utils.h
+++ b/gromacs/includes/utils.h
@@ -6,12 +6,14 @@
 #define MD_BENCH_UTILS_H
 
 #include <atom.h>
+#include <neighbor.h>
 
 #ifdef USE_SUPER_CLUSTERS
 void verifyClusters(Atom *atom);
 void verifyLayout(Atom *atom);
 void checkAlignment(Atom *atom);
 void showSuperclusters(Atom *atom);
+void printNeighs(Atom *atom, Neighbor *neighbor);
 #endif //USE_SUPER_CLUSTERS
 
 #endif //MD_BENCH_UTILS_H
diff --git a/gromacs/main.c b/gromacs/main.c
index 5cd66b1..8a24baa 100644
--- a/gromacs/main.c
+++ b/gromacs/main.c
@@ -77,7 +77,12 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *
     buildClusters(atom);
     #endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
     defineJClusters(atom);
+    #if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
+    //setupPbcGPU(atom, param);
     setupPbc(atom, param);
+    #else
+    setupPbc(atom, param);
+    #endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
     binClusters(atom);
     #if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
     buildNeighborGPU(atom, neighbor);
@@ -101,7 +106,12 @@ double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
     buildClusters(atom);
     #endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
     defineJClusters(atom);
+    #if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
+    //setupPbcGPU(atom, param);
     setupPbc(atom, param);
+    #else
+    setupPbc(atom, param);
+    #endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
     binClusters(atom);
     #if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
     buildNeighborGPU(atom, neighbor);
@@ -234,6 +244,8 @@ int main(int argc, char** argv) {
     printParameter(&param);
     printf(HLINE);
 
+    //verifyNeigh(&atom, &neighbor);
+
     printf("step\ttemp\t\tpressure\n");
     computeThermo(0, &param, &atom);
     #if defined(MEM_TRACER) || defined(INDEX_TRACER)
@@ -276,7 +288,10 @@ int main(int argc, char** argv) {
                 #endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
             }
 
+
+            copyDataFromCUDADevice(&atom);
             updatePbc(&atom, &param, 0);
+            copyDataToCUDADevice(&atom);
         } else {
             #ifdef CUDA_TARGET
             copyDataFromCUDADevice(&atom);
@@ -294,14 +309,34 @@ int main(int argc, char** argv) {
         traceAddresses(&param, &atom, &neighbor, n + 1);
         #endif
 
+
+        /*
+        printf("%d\t%d\r\n", atom.Nsclusters_local, atom.Nclusters_local);
+        copyDataToCUDADevice(&atom);
+        verifyLayout(&atom);
+
+        //printClusterIndices(&atom);
+
+        */
+
         if(param.force_field == FF_EAM) {
             timer[FORCE] += computeForceEam(&eam, &param, &atom, &neighbor, &stats);
         } else {
             timer[FORCE] += computeForceLJ(&param, &atom, &neighbor, &stats);
         }
 
+        /*
+        copyDataFromCUDADevice(&atom);
+        verifyLayout(&atom);
+
+        getchar();
+        */
+
         finalIntegrate(&param, &atom);
 
+
+
+
         if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
             computeThermo(n + 1, &param, &atom);
         }
diff --git a/gromacs/neighbor.c b/gromacs/neighbor.c
index 775c5dd..3124b8f 100644
--- a/gromacs/neighbor.c
+++ b/gromacs/neighbor.c
@@ -395,11 +395,11 @@ void buildNeighborGPU(Atom *atom, Neighbor *neighbor) {
         int new_maxneighs = neighbor->maxneighs;
         resize = 0;
 
-        //for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
-            //for (int scii = 0; scii < atom->siclusters[sci].nclusters; scii++) {
-            for(int ci = 0; ci < atom->Nclusters_local; ci++) {
+        for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
+            for (int scii = 0; scii < atom->siclusters[sci].nclusters; scii++) {
+            //for(int ci = 0; ci < atom->Nclusters_local; ci++) {
                 //const int ci = atom->siclusters[sci].iclusters[scii];
-                //const int ci = atom->icluster_idx[SCLUSTER_SIZE * sci + scii];
+                const int ci = atom->icluster_idx[SCLUSTER_SIZE * sci + scii];
                 int ci_cj1 = CJ1_FROM_CI(ci);
                 int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
                 //int *neighptr = &(neighbor->neighbors[sci * neighbor->maxneighs]);
@@ -504,7 +504,7 @@ void buildNeighborGPU(Atom *atom, Neighbor *neighbor) {
                     }
                 }
             }
-        //}
+        }
 
         if(resize) {
             fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
@@ -898,10 +898,7 @@ void buildClustersGPU(Atom *atom) {
         int n_super_clusters =  n_super_clusters_xy / SCLUSTER_SIZE_Z;
         if (n_super_clusters_xy % SCLUSTER_SIZE_Z) n_super_clusters++;
 
-        //printf("%d\t%d\t%d\t%d\r\n", nclusters, n_super_clusters_xy, n_super_clusters, (SCLUSTER_SIZE_X * SCLUSTER_SIZE_Y * SCLUSTER_SIZE_Z)*n_super_clusters);
-
         int cl_count = 0;
-
         for (int scl = 0; scl < n_super_clusters; scl++) {
             const int sci = atom->Nsclusters_local;
             if(sci >= atom->Nsclusters_max) {
@@ -928,8 +925,6 @@ void buildClustersGPU(Atom *atom) {
 
                 sortAtomsByCoord(atom, YY, bin, atom_scl_z_offset, atom_scl_z_end_idx);
 
-                //printf("%d\t%d\t%d\t%d\t%d\t%d\r\n", nclusters, scl, scl_z, atom_scl_z_offset, atom_scl_z_end_idx, c);
-
                 for (int scl_y = 0; scl_y < SCLUSTER_SIZE_Y; scl_y++) {
 
                     if (cl_count >= nclusters) break;
@@ -941,11 +936,8 @@ void buildClustersGPU(Atom *atom) {
                     const int atom_scl_y_end_idx = MIN(atom_scl_y_offset +
                             SCLUSTER_SIZE_X * CLUSTER_M - 1, c - 1);
 
-                    //printf("%d\t%d\t%d\t%d\t%d\t%d\t%d\r\n", nclusters, scl, scl_z, scl_y, atom_scl_y_offset, atom_scl_y_end_idx, c);
-
                     sortAtomsByCoord(atom, XX, bin, atom_scl_y_offset, atom_scl_y_end_idx);
 
-
                     for (int scl_x = 0; scl_x < SCLUSTER_SIZE_X; scl_x++) {
                         if (cl_count >= nclusters) break;
                         cl_count++;
@@ -1035,9 +1027,6 @@ void buildClustersGPU(Atom *atom) {
             atom->siclusters[sci].bbmaxz = sc_bbmaxz;
             atom->Nsclusters_local++;
         }
-
-        //printf("%d\t%d\r\n", (SCLUSTER_SIZE_X * SCLUSTER_SIZE_Y * SCLUSTER_SIZE_Z)*n_super_clusters, count);
-
     }
 
     DEBUG_MESSAGE("buildClustersGPU end\n");
diff --git a/gromacs/pbc.c b/gromacs/pbc.c
index 412e088..ec4581e 100644
--- a/gromacs/pbc.c
+++ b/gromacs/pbc.c
@@ -229,3 +229,88 @@ void setupPbc(Atom *atom, Parameter *param) {
     cpuUpdatePbc(atom, param, 1);
     DEBUG_MESSAGE("setupPbc end\n");
 }
+
+void setupPbcGPU(Atom *atom, Parameter *param) {
+    DEBUG_MESSAGE("setupPbcGPU start\n");
+    MD_FLOAT xprd = param->xprd;
+    MD_FLOAT yprd = param->yprd;
+    MD_FLOAT zprd = param->zprd;
+    MD_FLOAT Cutneigh = param->cutneigh;
+    int jfac = SCLUSTER_SIZE / CLUSTER_M;
+    int ncj = atom->Nsclusters_local * jfac;
+    int Nghost = -1;
+    int Nghost_atoms = 0;
+
+    for(int cj = 0; cj < ncj; cj++) {
+        if(atom->jclusters[cj].natoms > 0) {
+            if(atom->Nsclusters_local + (Nghost + (jfac - 1) + 7) / jfac >= atom->Nclusters_max) {
+                growClusters(atom);
+            }
+
+            if((Nghost + 7) * CLUSTER_M >= NmaxGhost) {
+                growPbc(atom);
+            }
+
+            MD_FLOAT bbminx = atom->jclusters[cj].bbminx;
+            MD_FLOAT bbmaxx = atom->jclusters[cj].bbmaxx;
+            MD_FLOAT bbminy = atom->jclusters[cj].bbminy;
+            MD_FLOAT bbmaxy = atom->jclusters[cj].bbmaxy;
+            MD_FLOAT bbminz = atom->jclusters[cj].bbminz;
+            MD_FLOAT bbmaxz = atom->jclusters[cj].bbmaxz;
+
+            /* Setup ghost atoms */
+            /* 6 planes */
+            if (bbminx < Cutneigh)         { ADDGHOST(+1,0,0); }
+            if (bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,0,0); }
+            if (bbminy < Cutneigh)         { ADDGHOST(0,+1,0); }
+            if (bbmaxy >= (yprd-Cutneigh)) { ADDGHOST(0,-1,0); }
+            if (bbminz < Cutneigh)         { ADDGHOST(0,0,+1); }
+            if (bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,0,-1); }
+            /* 8 corners */
+            if (bbminx < Cutneigh         && bbminy < Cutneigh         && bbminz < Cutneigh)         { ADDGHOST(+1,+1,+1); }
+            if (bbminx < Cutneigh         && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh)         { ADDGHOST(+1,-1,+1); }
+            if (bbminx < Cutneigh         && bbminy < Cutneigh         && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); }
+            if (bbminx < Cutneigh         && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,-1,-1); }
+            if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh         && bbminz < Cutneigh)         { ADDGHOST(-1,+1,+1); }
+            if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh)         { ADDGHOST(-1,-1,+1); }
+            if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh         && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,+1,-1); }
+            if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,-1,-1); }
+            /* 12 edges */
+            if (bbminx < Cutneigh         && bbminz < Cutneigh)         { ADDGHOST(+1,0,+1); }
+            if (bbminx < Cutneigh         && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,0,-1); }
+            if (bbmaxx >= (xprd-Cutneigh) && bbminz < Cutneigh)         { ADDGHOST(-1,0,+1); }
+            if (bbmaxx >= (xprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,0,-1); }
+            if (bbminy < Cutneigh         && bbminz < Cutneigh)         { ADDGHOST(0,+1,+1); }
+            if (bbminy < Cutneigh         && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,+1,-1); }
+            if (bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh)         { ADDGHOST(0,-1,+1); }
+            if (bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,-1,-1); }
+            if (bbminy < Cutneigh         && bbminx < Cutneigh)         { ADDGHOST(+1,+1,0); }
+            if (bbminy < Cutneigh         && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,+1,0); }
+            if (bbmaxy >= (yprd-Cutneigh) && bbminx < Cutneigh)         { ADDGHOST(+1,-1,0); }
+            if (bbmaxy >= (yprd-Cutneigh) && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,-1,0); }
+        }
+    }
+
+    if(ncj + (Nghost + (jfac - 1) + 1) / jfac >= atom->Nclusters_max) {
+        growClusters(atom);
+    }
+
+    // Add dummy cluster at the end
+    int cj_vec_base = CJ_VECTOR_BASE_INDEX(ncj + Nghost + 1);
+    MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
+    for(int cjj = 0; cjj < CLUSTER_N; cjj++) {
+        cj_x[CL_X_OFFSET + cjj] = INFINITY;
+        cj_x[CL_Y_OFFSET + cjj] = INFINITY;
+        cj_x[CL_Z_OFFSET + cjj] = INFINITY;
+    }
+
+    // increase by one to make it the ghost atom count
+    atom->dummy_cj = ncj + Nghost + 1;
+    atom->Nghost = Nghost_atoms;
+    atom->Nclusters_ghost = Nghost + 1;
+    atom->Nclusters = atom->Nclusters_local + Nghost + 1;
+
+    // Update created ghost clusters positions
+    cudaUpdatePbc(atom, param, 1);
+    DEBUG_MESSAGE("setupPbcGPU end\n");
+}
diff --git a/gromacs/utils.c b/gromacs/utils.c
index 1d9db02..8cfd710 100644
--- a/gromacs/utils.c
+++ b/gromacs/utils.c
@@ -4,7 +4,7 @@
  */
 
 #include <stdio.h>
-
+#include <stdlib.h>
 #include <utils.h>
 
 extern void alignDataToSuperclusters(Atom *atom);
@@ -274,4 +274,59 @@ void showSuperclusters(Atom *atom) {
 
     }
 }
+
+void printNeighs(Atom *atom, Neighbor *neighbor) {
+    for (int i = 0; i < atom->Nclusters_local; ++i) {
+        int neigh_num = neighbor->numneigh[i];
+        for (int j = 0; j < neigh_num; j++) {
+            printf("%d ", neighbor->neighbors[ i * neighbor->maxneighs + j]);
+        }
+        printf("\r\n");
+    }
+}
+
+void printClusterIndices(Atom *atom) {
+    for (int i = 0; i < atom->Nsclusters_local; ++i) {
+        int clusters_num = atom->siclusters[i].nclusters;
+        for (int j = 0; j < clusters_num; j++) {
+            printf("%d ", atom->icluster_idx[j + SCLUSTER_SIZE * i]);
+        }
+        printf("\r\n");
+    }
+}
+
+void verifyNeigh(Atom *atom, Neighbor *neighbor) {
+
+    buildNeighbor(atom, neighbor);
+    int *numneigh = (int*) malloc(atom->Nclusters_local * sizeof(int));
+    int *neighbors = (int*) malloc(atom->Nclusters_local * neighbor->maxneighs * sizeof(int*));
+
+    for (int i = 0; i < atom->Nclusters_local; ++i) {
+        int neigh_num = neighbor->numneigh[i];
+        numneigh[i] = neighbor->numneigh[i];
+        neighbor->numneigh[i] = 0;
+        for (int j = 0; j < neigh_num; j++) {
+            neighbors[i * neighbor->maxneighs + j] = neighbor->neighbors[i * neighbor->maxneighs + j];
+            neighbor->neighbors[i * neighbor->maxneighs + j] = 0;
+        }
+    }
+
+
+    buildNeighborGPU(atom, neighbor);
+
+    unsigned int num_diff = 0;
+    unsigned int neigh_diff = 0;
+
+    for (int i = 0; i < atom->Nclusters_local; ++i) {
+        int neigh_num = neighbor->numneigh[i];
+        if (numneigh[i] != neigh_num) num_diff++;
+        for (int j = 0; j < neigh_num; j++) {
+            if (neighbors[i * neighbor->maxneighs + j] !=
+            neighbor->neighbors[ i * neighbor->maxneighs + j]) neigh_diff++;
+        }
+    }
+
+    printf("%d\t%d\r\n", num_diff, neigh_diff);
+}
+
 #endif //USE_SUPER_CLUSTERS