Final MPI version

2024-04-15 16:53:25 +02:00
parent a6a269703d
commit a13a0f3bae
33 changed files with 3568 additions and 624 deletions
--- a/common/box.c
+++ b/common/box.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdio.h>
+#include <parameter.h>
+#include <util.h>
+#include <box.h>
+#include <mpi.h>
+
+// Computes the part of `mybox` that overlaps `other` for the swap given by
+// (dim, dir) and stores it in `cut` (only cut->lo / cut->hi are written).
+//
+//   dim      axis of the swap
+//   dir      0 or 1, selects on which side `other` is enlarged by cutneigh
+//   xprd     box length used to shift `other` in the periodic test
+//   cutneigh width by which `other` is enlarged along `dim`
+//
+// Returns 0 for a direct (non-periodic) overlap, 1 or -1 for an overlap
+// through the periodic boundary (1 when dir == 0, -1 otherwise) and -100
+// when the boxes do not overlap at all.
+int overlapBox(int dim, int dir, const Box* mybox, const Box* other, Box* cut, MD_FLOAT xprd, MD_FLOAT cutneigh)
+{
+  MD_FLOAT min[3], max[3];
+  int pbc = -100;   // sentinel: no intersection found yet
+
+  // Plain intersection of both boxes on every axis
+  min[_x] = MAX(mybox->lo[_x], other->lo[_x]);
+  min[_y] = MAX(mybox->lo[_y], other->lo[_y]);
+  min[_z] = MAX(mybox->lo[_z], other->lo[_z]);
+  max[_x] = MIN(mybox->hi[_x], other->hi[_x]);
+  max[_y] = MIN(mybox->hi[_y], other->hi[_y]);
+  max[_z] = MIN(mybox->hi[_z], other->hi[_z]);
+
+  // Non-periodic test, skipped when both boxes are the same one
+  if(mybox->id != other->id) {
+    if(dir == 0) {
+      max[dim] = MIN(mybox->hi[dim], other->hi[dim] + cutneigh);
+    } else if(dir == 1) {
+      min[dim] = MAX(mybox->lo[dim], other->lo[dim] - cutneigh);
+    }
+
+    if((min[_x] < max[_x]) && (min[_y] < max[_y]) && (min[_z] < max[_z])) {
+      pbc = 0;
+    }
+  }
+
+  // Periodic test: `other` is shifted by one box length along `dim`
+  if(pbc < 0) {
+    if(dir == 0) {
+      min[dim] = MAX(mybox->lo[dim], other->lo[dim] - xprd);
+      max[dim] = MIN(mybox->hi[dim], other->hi[dim] - xprd + cutneigh);
+    } else {
+      min[dim] = MAX(mybox->lo[dim], other->lo[dim] + xprd - cutneigh);
+      max[dim] = MIN(mybox->hi[dim], other->hi[dim] + xprd);
+    }
+
+    if((min[_x] < max[_x]) && (min[_y] < max[_y]) && (min[_z] < max[_z])) {
+      if(dir == 0) {
+        pbc = 1;
+      } else {
+        pbc = -1;
+      }
+    }
+  }
+
+  // Hand the resulting region back, even when it is empty
+  cut->lo[_x] = min[_x];
+  cut->lo[_y] = min[_y];
+  cut->lo[_z] = min[_z];
+  cut->hi[_x] = max[_x];
+  cut->hi[_y] = max[_y];
+  cut->hi[_z] = max[_z];
+
+  return pbc;
+}
+
+// Tests whether `mybox` overlaps `other` after `other` has been enlarged by
+// cutneigh[] on both sides of every axis. All 27 images of `other` obtained
+// by shifting it -1, 0 or +1 box lengths (param->xprd/yprd/zprd) per axis
+// are tried.
+// Returns 1 as soon as one image overlaps, 0 if none does.
+int overlapFullBox(Parameter* param, MD_FLOAT *cutneigh ,const Box* mybox, const Box* other)
+{
+  MD_FLOAT lower[3], upper[3];
+  const MD_FLOAT lenX = param->xprd;
+  const MD_FLOAT lenY = param->yprd;
+  const MD_FLOAT lenZ = param->zprd;
+
+  for(int sz = -1; sz <= 1; sz++) {
+    for(int sy = -1; sy <= 1; sy++) {
+      for(int sx = -1; sx <= 1; sx++) {
+        lower[_x] = MAX(mybox->lo[_x], other->lo[_x]-cutneigh[_x] + sx*lenX);
+        lower[_y] = MAX(mybox->lo[_y], other->lo[_y]-cutneigh[_y] + sy*lenY);
+        lower[_z] = MAX(mybox->lo[_z], other->lo[_z]-cutneigh[_z] + sz*lenZ);
+        upper[_x] = MIN(mybox->hi[_x], other->hi[_x]+cutneigh[_x] + sx*lenX);
+        upper[_y] = MIN(mybox->hi[_y], other->hi[_y]+cutneigh[_y] + sy*lenY);
+        upper[_z] = MIN(mybox->hi[_z], other->hi[_z]+cutneigh[_z] + sz*lenZ);
+
+        // An image overlaps only if the region is non-empty on all axes
+        if(!(lower[_x] < upper[_x])) continue;
+        if(!(lower[_y] < upper[_y])) continue;
+        if(!(lower[_z] < upper[_z])) continue;
+        return 1;
+      }
+    }
+  }
+
+  return 0;
+}
+
+// Widens `cut` by cutneigh along the axes of the swaps that came before
+// `iswap`: along x for swaps 2..5 and additionally along y for swaps 4 and 5.
+// A face is only moved when the matching face of `me` lies at or beyond the
+// one of `other`. Swaps 0 and 1 leave `cut` untouched.
+void expandBox(int iswap, const Box* me, const Box* other, Box* cut, MD_FLOAT cutneigh)
+{
+  const int growX = (iswap >= 2 && iswap <= 5);
+  const int growY = (iswap == 4 || iswap == 5);
+
+  if(growX) {
+    if(me->lo[_x] <= other->lo[_x]) {
+      cut->lo[_x] -= cutneigh;
+    }
+    if(me->hi[_x] >= other->hi[_x]) {
+      cut->hi[_x] += cutneigh;
+    }
+  }
+
+  if(growY) {
+    if(me->lo[_y] <= other->lo[_y]) {
+      cut->lo[_y] -= cutneigh;
+    }
+    if(me->hi[_y] >= other->hi[_y]) {
+      cut->hi[_y] += cutneigh;
+    }
+  }
+}
+
--- a/common/comm.c
+++ b/common/comm.c
@@ -0,0 +1,556 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <comm.h>   
+#include <allocate.h>
+#include <mpi.h>
+#include <util.h>
+
+// Initial capacity (in entries) of the neighbour tables set up in setupComm
+#define NEIGHMIN  6       
+// Factor applied to the requested size when a buffer or send list is grown
+#define BUFFACTOR 2
+// Initial size of the send/recv buffers (in MD_FLOAT elements) and initial
+// length of every per-neighbour send list
+#define BUFMIN    1000
+// Extra MD_FLOAT elements allocated past maxsend in the send buffer
+#define BUFEXTRA  100
+// Shorthand for the communicator used by every MPI call in this file
+#define world MPI_COMM_WORLD
+
+// MPI datatype matching MD_FLOAT: MPI_FLOAT when MD_FLOAT is 4 bytes wide,
+// MPI_DOUBLE otherwise
+MPI_Datatype type = (sizeof(MD_FLOAT) == 4) ? MPI_FLOAT : MPI_DOUBLE; 
+// File-local helpers defined at the end of this file
+static inline void allocDynamicBuffers(Comm*);
+static inline void freeDynamicBuffers(Comm*);
+static inline void freeBuffers(Comm*);
+
+// Derives the receive side of the communication pattern from the send lists
+// built by neighComm.
+//
+// For every swap the processes we receive from are the ones we send to in
+// the opposite direction of the same dimension, so comm->nrecv is filled
+// from the nsend entries of the inverse swap and recvfrom/recvtill record
+// the range that belongs to each swap.
+//
+// comm->othersend[iswap] is set to 0 only when the swap has exactly one
+// neighbour and that neighbour is this process itself; otherwise it is 1.
+void defineReverseList(Comm* comm){
+  int index = 0;
+
+  //Set the inverse list
+  for(int iswap = 0; iswap<6; iswap++){
+    int dim = comm->swapdim[iswap];
+    int dir = comm->swapdir[iswap];
+    int invswap = comm->swap[dim][(dir+1)%2];
+
+    //The range of this swap starts where the previous one ended
+    comm->recvfrom[iswap] = index;
+
+    for(int ineigh = comm->sendfrom[invswap]; ineigh< comm->sendtill[invswap]; ineigh++)
+      comm->nrecv[index++] = comm->nsend[ineigh];
+
+    comm->recvtill[iswap] = index;
+  }
+
+  //set if myproc is unique in the swap
+  for(int iswap = 0; iswap<6; iswap++){
+    int first = comm->sendfrom[iswap];
+    int sizeswap = comm->sendtill[iswap]-first;
+    //nsend[first] is only read when the swap really has one entry, so an
+    //empty swap never touches memory past the filled part of the list
+    int selfOnly = (sizeswap == 1) && (comm->nsend[first] == comm->myproc);
+    comm->othersend[iswap] = selfOnly ? 0 : 1;
+  }
+}
+
+// Appends process `newneigh` to comm->nexch, the list of processes used by
+// exchangeComm. The list capacity is doubled when it is full.
+void addNeighToExchangeList(Comm* comm, int newneigh){
+
+    const int slot = comm->numneighexch;
+
+    // Make room first if the list is already at capacity
+    if(slot >= comm->maxneighexch){
+      const size_t previousBytes = comm->maxneighexch*sizeof(int);
+      comm->maxneighexch = 2 * comm->maxneighexch;
+      comm->nexch = (int*) reallocate(comm->nexch, ALIGNMENT,  comm->maxneighexch * sizeof(int), previousBytes);
+    }
+
+    comm->nexch[slot] = newneigh;
+    comm->numneighexch = slot + 1;
+}
+
+//Exported functions
+// Rebuilds the neighbour lists of this process from the domain map.
+//
+// grid->map holds six values per process (xlo,ylo,zlo,xhi,yhi,zhi). Two lists
+// are produced:
+//   * comm->nexch: every other process whose box, enlarged by grid->cutneigh
+//     and considering periodic images, overlaps the local box
+//     (see overlapFullBox);
+//   * comm->nsend / comm->boxes / comm->pbc_{x,y,z}: for each of the six
+//     swaps, every process (this one included) whose box overlaps the local
+//     one according to overlapBox, together with the overlapping region and
+//     the periodic shift flag. sendfrom/sendtill delimit the entries of each
+//     swap and comm->numneigh is the total number of entries.
+// The per-neighbour buffers are released and allocated again for the new
+// neighbour count, and the receive lists are derived with defineReverseList.
+void neighComm(Comm *comm, Parameter* param, Grid *grid)
+{
+  int me = comm->myproc; 
+  int numproc = comm ->numproc;
+  int PAD = 6;   //number of map entries per process
+  int ineigh = 0;
+  int sneigh = 0;
+  MD_FLOAT *map = grid->map;
+  MD_FLOAT cutneigh = param->cutneigh;
+  MD_FLOAT prd[3] = {param->xprd, param->yprd, param->zprd};
+  Box mybox, other, cut;
+ 
+  //needed for rebalancing: drop the buffers sized for the old neighbour count
+  freeDynamicBuffers(comm);
+
+  //Local box
+  mybox.id = me;
+  mybox.lo[_x] = map[me*PAD+0];  mybox.hi[_x] = map[me*PAD+3];
+  mybox.lo[_y] = map[me*PAD+1];  mybox.hi[_y] = map[me*PAD+4];
+  mybox.lo[_z] = map[me*PAD+2];  mybox.hi[_z] = map[me*PAD+5];
+
+  //Check all possible neighbours, only for the exchange of atoms
+  comm->numneighexch = 0;
+  for(int proc = 0; proc <numproc; proc++){
+      other.id = proc;
+      other.lo[_x] = map[proc*PAD+0];  other.hi[_x] = map[proc*PAD+3];
+      other.lo[_y] = map[proc*PAD+1];  other.hi[_y] = map[proc*PAD+4];
+      other.lo[_z] = map[proc*PAD+2];  other.hi[_z] = map[proc*PAD+5];
+    
+    //The local process is never part of the exchange list
+    if(proc != me){
+      int intersection = overlapFullBox(param,grid->cutneigh,&mybox,&other);
+      if(intersection) addNeighToExchangeList(comm,proc);
+    }
+  }
+  
+  //MAP is stored as follows: xlo,ylo,zlo,xhi,yhi,zhi
+  for(int iswap = 0; iswap <6; iswap++)
+  {
+    int dir = comm->swapdir[iswap]; 
+    int dim = comm->swapdim[iswap]; 
+
+    for(int proc = 0; proc < numproc; proc++)
+    {      
+      //Check for neighbours along dimensions, for forwardComm, reverseComm and ghostComm
+      other.id = proc;
+      other.lo[_x] = map[proc*PAD+0];  other.hi[_x] = map[proc*PAD+3];
+      other.lo[_y] = map[proc*PAD+1];  other.hi[_y] = map[proc*PAD+4];
+      other.lo[_z] = map[proc*PAD+2];  other.hi[_z] = map[proc*PAD+5]; 
+          
+      //overlapBox returns -100 when the boxes do not intersect and 0, 1 or -1 for an intersection with the respective pbc flag.
+      int pbc = overlapBox(dim,dir,&mybox,&other,&cut,prd[dim],cutneigh);
+      if(pbc == -100) continue;   
+      
+      //Widen the cut along the dimensions of the previous swaps
+      expandBox(iswap, &mybox, &other, &cut, cutneigh);
+ 
+      //Grow all per-neighbour tables together when they are full
+      if(ineigh >= comm->maxneigh) {
+          size_t oldByteSize = comm->maxneigh*sizeof(int);
+          size_t oldBoxSize = comm->maxneigh*sizeof(Box); 
+          comm->maxneigh  = 2*ineigh;  
+          comm->nsend     = (int*) reallocate(comm->nsend, ALIGNMENT,  comm->maxneigh * sizeof(int), oldByteSize);
+          comm->nrecv     = (int*) reallocate(comm->nrecv, ALIGNMENT,  comm->maxneigh * sizeof(int), oldByteSize);
+          comm->pbc_x     = (int*) reallocate(comm->pbc_x, ALIGNMENT,  comm->maxneigh * sizeof(int), oldByteSize);
+          comm->pbc_y     = (int*) reallocate(comm->pbc_y, ALIGNMENT,  comm->maxneigh * sizeof(int), oldByteSize);
+          comm->pbc_z     = (int*) reallocate(comm->pbc_z, ALIGNMENT,  comm->maxneigh * sizeof(int), oldByteSize);
+          comm->boxes     = (Box*) reallocate(comm->boxes, ALIGNMENT,  comm->maxneigh * sizeof(Box), oldBoxSize);
+        }
+
+      //Record the neighbour; the pbc flag only applies to the swap dimension
+      comm->boxes[ineigh] = cut;  
+      comm->nsend[ineigh] = proc;
+      comm->pbc_x[ineigh] = (dim == _x) ? pbc : 0;
+      comm->pbc_y[ineigh] = (dim == _y) ? pbc : 0; 
+      comm->pbc_z[ineigh] = (dim == _z) ? pbc : 0; 
+      ineigh++; 
+    }
+
+    //Entries [sendfrom, sendtill) of the tables belong to this swap
+    comm->sendfrom[iswap] = (iswap == 0) ? 0:comm->sendtill[iswap-1];
+    comm->sendtill[iswap] = ineigh;
+    comm->numneigh = ineigh; 
+  }
+
+  allocDynamicBuffers(comm);
+  defineReverseList(comm);
+}
+    
+// Starts MPI, stores the number of processes and the rank of this process in
+// `comm`, and resets the neighbour counters and every buffer pointer so that
+// the free helpers can be called safely before anything is allocated.
+void initComm(int* argc, char*** argv, Comm* comm)
+{
+  // Bring up MPI and query the size of the world and our rank in it
+  MPI_Init(argc, argv);
+  MPI_Comm_size(MPI_COMM_WORLD, &comm->numproc);
+  MPI_Comm_rank(MPI_COMM_WORLD, &comm->myproc);
+
+  // No neighbours are known yet
+  comm->numneighexch = 0;
+  comm->numneigh     = 0;
+
+  // Neighbour tables
+  comm->nsend = NULL;
+  comm->nrecv = NULL;
+  comm->nexch = NULL;
+  comm->pbc_x = NULL;
+  comm->pbc_y = NULL;
+  comm->pbc_z = NULL;
+  comm->boxes = NULL;
+
+  // Per-neighbour counters, offsets and send lists
+  comm->atom_send     = NULL;
+  comm->atom_recv     = NULL;
+  comm->off_atom_send = NULL;
+  comm->off_atom_recv = NULL;
+  comm->maxsendlist   = NULL;
+  comm->sendlist      = NULL;
+
+  // Message buffers
+  comm->buf_send = NULL;
+  comm->buf_recv = NULL;
+}
+ 
+// Releases every communication buffer, resets the stored capacities to zero
+// and shuts MPI down.
+void endComm(Comm* comm)
+{
+  freeBuffers(comm);
+
+  // Capacities no longer describe any allocation
+  comm->maxrecv      = 0;
+  comm->maxsend      = 0;
+  comm->maxneighexch = 0;
+  comm->maxneigh     = 0;
+
+  MPI_Finalize();
+}
+
+// Prepares `comm` for use: defines the six swaps (two directions for each of
+// x, y and z), clears the per-swap ranges, stores the per-atom message sizes,
+// allocates the message buffers and neighbour tables with their initial
+// capacities and finally builds the neighbour lists with neighComm.
+void setupComm(Comm* comm, Parameter* param, Grid* grid){
+
+  const int axes[3] = {_x, _y, _z};
+
+  // Swaps 2*a and 2*a+1 work along axes[a], in direction 0 and 1 respectively
+  for(int a = 0; a < 3; a++){
+    const int first  = 2*a;
+    const int second = first + 1;
+    comm->swap[axes[a]][0] = first;
+    comm->swap[axes[a]][1] = second;
+    comm->swapdim[first]   = axes[a];
+    comm->swapdim[second]  = axes[a];
+    comm->swapdir[first]   = 0;
+    comm->swapdir[second]  = 1;
+  }
+
+  // No neighbour belongs to any swap yet
+  for(int iswap = 0; iswap < 6; iswap++){
+    comm->recvtill[iswap] = 0;
+    comm->recvfrom[iswap] = 0;
+    comm->sendtill[iswap] = 0;
+    comm->sendfrom[iswap] = 0;
+  }
+
+  // Number of values per atom in each kind of message
+  comm->forwardSize   = FORWARD_SIZE;      //send coordinates x,y,z
+  comm->reverseSize   = REVERSE_SIZE;      //return forces fx, fy, fz
+  comm->ghostSize     = GHOST_SIZE;        //send x,y,z,type;
+  comm->exchangeSize  = EXCHANGE_SIZE;     //send x,y,z,vx,vy,vz,type
+
+  // Send and receive buffers; the send buffer carries BUFEXTRA spare elements
+  comm->maxsend  = BUFMIN;
+  comm->maxrecv  = BUFMIN;
+  comm->buf_send = (MD_FLOAT*) allocate(ALIGNMENT,(comm->maxsend + BUFEXTRA) * sizeof(MD_FLOAT));
+  comm->buf_recv = (MD_FLOAT*) allocate(ALIGNMENT, comm->maxrecv * sizeof(MD_FLOAT));
+
+  // List of processes used for the exchange of atoms
+  comm->maxneighexch = NEIGHMIN;
+  comm->nexch = (int*) allocate(ALIGNMENT,  comm->maxneighexch * sizeof(int));
+
+  // Per-neighbour tables used by the swaps
+  comm->maxneigh = NEIGHMIN;
+  comm->nsend = (int*) allocate(ALIGNMENT,  comm->maxneigh * sizeof(int));
+  comm->nrecv = (int*) allocate(ALIGNMENT,  comm->maxneigh * sizeof(int));
+  comm->pbc_x = (int*) allocate(ALIGNMENT,  comm->maxneigh * sizeof(int));
+  comm->pbc_y = (int*) allocate(ALIGNMENT,  comm->maxneigh * sizeof(int));
+  comm->pbc_z = (int*) allocate(ALIGNMENT,  comm->maxneigh * sizeof(int));
+  comm->boxes = (Box*) allocate(ALIGNMENT,  comm->maxneigh * sizeof(Box));
+
+  neighComm(comm, param, grid);
+}
+
+// Sends the data packed by packForward for the atoms listed in the send lists
+// of swap `iswap` to the neighbours of that swap and unpacks what is received
+// with unpackForward, starting at atom index comm->firstrecv[iswap].
+//
+// The per-neighbour atom counts and offsets (atom_send, atom_recv,
+// off_atom_send, off_atom_recv) are the ones stored by the last ghostComm.
+// When comm->othersend[iswap] is 0 (the only neighbour of the swap is this
+// process itself) no MPI message is exchanged and the packed send buffer is
+// unpacked directly.
+void forwardComm(Comm* comm, Atom* atom, int iswap)
+{
+  int nrqst=0, offset=0, nsend=0, nrecv=0;
+  int pbc[3];
+  int size = comm->forwardSize;
+  int maxrqst = comm->numneigh;
+  MD_FLOAT* buf;
+  //A variable length array must have at least one element
+  MPI_Request requests[maxrqst > 0 ? maxrqst : 1];
+
+  //Pack the atoms of every neighbour of this swap
+  for(int ineigh = comm->sendfrom[iswap]; ineigh < comm->sendtill[iswap]; ineigh++){
+    offset = comm->off_atom_send[ineigh];
+    pbc[_x]=comm->pbc_x[ineigh]; pbc[_y]=comm->pbc_y[ineigh];  pbc[_z]=comm->pbc_z[ineigh];
+    packForward(atom, comm->atom_send[ineigh], comm->sendlist[ineigh], &comm->buf_send[offset*size],pbc);
+  }
+
+  if(comm->othersend[iswap]){
+    //Post all receives before the blocking sends
+    for (int ineigh = comm->recvfrom[iswap]; ineigh< comm->recvtill[iswap]; ineigh++){
+      offset = comm->off_atom_recv[ineigh]*size;
+      nrecv  = comm->atom_recv[ineigh]*size;
+      MPI_Irecv(&comm->buf_recv[offset], nrecv, type, comm->nrecv[ineigh],0,world,&requests[nrqst++]);
+    }
+
+    //Send elements
+    for (int ineigh = comm->sendfrom[iswap]; ineigh< comm->sendtill[iswap]; ineigh++){
+      offset = comm->off_atom_send[ineigh]*size;
+      nsend  = comm->atom_send[ineigh]*size;
+      MPI_Send(&comm->buf_send[offset],nsend,type,comm->nsend[ineigh],0,world);
+    }
+
+    //MPI_Waitall takes an array of statuses, hence MPI_STATUSES_IGNORE
+    MPI_Waitall(nrqst,requests,MPI_STATUSES_IGNORE);
+    buf = comm->buf_recv;
+  } else {
+    buf = comm->buf_send;
+  }
+
+  /* unpack buffer */
+  for (int ineigh = comm->recvfrom[iswap]; ineigh< comm->recvtill[iswap]; ineigh++){
+    offset = comm->off_atom_recv[ineigh];
+    unpackForward(atom, comm->atom_recv[ineigh], comm->firstrecv[iswap] + offset, &buf[offset*size]);
+  }
+}
+
+// Reverse of forwardComm for swap `iswap`: the data packed by packReverse for
+// the atoms received in that swap (starting at comm->firstrecv[iswap]) is
+// sent back to the processes they came from, and the data received is handed
+// to unpackReverse together with the send list of each neighbour.
+//
+// When comm->othersend[iswap] is 0 no MPI message is exchanged and the packed
+// send buffer is unpacked directly.
+void reverseComm(Comm* comm, Atom* atom, int iswap)
+{
+  int nrqst=0, offset=0, nsend=0, nrecv=0 ;
+  int size = comm->reverseSize;
+  int maxrqst = comm->numneigh;
+  MD_FLOAT* buf;
+  //A variable length array must have at least one element
+  MPI_Request requests[maxrqst > 0 ? maxrqst : 1];
+
+  //Pack the data of the atoms received from every neighbour of this swap
+  for(int ineigh = comm->recvfrom[iswap]; ineigh < comm->recvtill[iswap]; ineigh++){
+    offset = comm->off_atom_recv[ineigh];
+    packReverse(atom, comm->atom_recv[ineigh], comm->firstrecv[iswap] + offset, &comm->buf_send[offset*size]);
+  }
+
+  if(comm->othersend[iswap]){
+    //Post all receives before the blocking sends; the roles of the send and
+    //receive lists are swapped with respect to forwardComm
+    for (int ineigh = comm->sendfrom[iswap]; ineigh< comm->sendtill[iswap]; ineigh++){
+      offset = comm->off_atom_send[ineigh]*size;
+      nrecv  = comm->atom_send[ineigh]*size;
+      MPI_Irecv(&comm->buf_recv[offset], nrecv, type, comm->nsend[ineigh],0,world,&requests[nrqst++]);
+    }
+
+    //Send elements
+    for (int ineigh = comm->recvfrom[iswap]; ineigh< comm->recvtill[iswap]; ineigh++){
+      offset = comm->off_atom_recv[ineigh]*size;
+      nsend  = comm->atom_recv[ineigh]*size;
+      MPI_Send(&comm->buf_send[offset],nsend,type,comm->nrecv[ineigh],0,world);
+    }
+
+    //MPI_Waitall takes an array of statuses, hence MPI_STATUSES_IGNORE
+    MPI_Waitall(nrqst,requests,MPI_STATUSES_IGNORE);
+    buf = comm->buf_recv;
+  } else {
+    buf = comm->buf_send;
+  }
+
+  /* unpack buffer */
+  for (int ineigh = comm->sendfrom[iswap]; ineigh< comm->sendtill[iswap]; ineigh++){
+    offset =  comm->off_atom_send[ineigh];
+    unpackReverse(atom, comm->atom_send[ineigh], comm->sendlist[ineigh], &buf[offset*size]);
+  }
+}
+
+// Builds the send lists of swap `iswap` and exchanges ghost atoms.
+//
+// For every neighbour of the swap the first comm->iterAtom atoms are tested
+// with IsinRegionToSend against the region stored in comm->boxes; matching
+// atoms are appended to the neighbour's send list and packed with packGhost.
+// comm->iterAtom is refreshed to LOCAL+GHOST only on even swaps. The number
+// of atoms per neighbour is exchanged first, then the packed data, and every
+// received atom is handed to unpackGhost. The counts and offsets stored here
+// (atom_send, atom_recv, off_atom_send, off_atom_recv, firstrecv) are the
+// ones later used by forwardComm and reverseComm.
+//
+// When comm->othersend[iswap] is 0 no MPI message is exchanged and the packed
+// send buffer is unpacked directly.
+void ghostComm(Comm* comm, Atom* atom,int iswap){
+
+  MD_FLOAT xlo=0, xhi=0, ylo=0, yhi=0, zlo=0, zhi=0;
+  MD_FLOAT* buf;
+  int nrqst=0, nsend=0, nrecv=0, offset=0, ineigh=0, pbc[3];
+  int all_recv=0, all_send=0, currentSend=0;
+  int size = comm->ghostSize;
+  int maxrqrst = comm->numneigh;
+  //A variable length array must have at least one element
+  MPI_Request requests[maxrqrst > 0 ? maxrqrst : 1];
+  //Reset every request slot (index i, not one past the end of the array)
+  for(int i = 0; i<maxrqrst; i++)
+    requests[i]=MPI_REQUEST_NULL;
+  if(iswap%2==0) comm->iterAtom = LOCAL+GHOST;
+
+  //Select and pack the atoms to send to every neighbour of this swap
+  for(int ineigh = comm->sendfrom[iswap]; ineigh< comm->sendtill[iswap]; ineigh++)
+      {
+        Box* tile = &comm->boxes[ineigh];
+
+        //xlo..zhi are presumably read by IsinRegionToSend (macro defined elsewhere)
+        xlo = tile->lo[_x]; ylo = tile->lo[_y]; zlo = tile->lo[_z];
+        xhi = tile->hi[_x]; yhi = tile->hi[_y]; zhi = tile->hi[_z];
+        pbc[_x]=comm->pbc_x[ineigh]; pbc[_y]=comm->pbc_y[ineigh];  pbc[_z]=comm->pbc_z[ineigh];
+        nsend = 0;
+
+        for(int i = 0; i < comm->iterAtom ; i++)
+        {
+          if(IsinRegionToSend(i)){
+                if(nsend >= comm->maxsendlist[ineigh]) growList(comm,ineigh,nsend);
+                if(currentSend + size >= comm->maxsend) growSend(comm,currentSend);
+                comm->sendlist[ineigh][nsend++] = i;
+                currentSend += packGhost(atom, i, &comm->buf_send[currentSend], pbc);
+          }
+        }
+        comm->atom_send[ineigh]     = nsend;          //#atoms send per neigh
+        comm->off_atom_send[ineigh] = all_send;       //offset atom respect to neighbours in a swap
+        all_send += nsend;                            //all atoms send
+      }
+
+  //Exchange how many atoms each neighbour is going to send
+  if(comm->othersend[iswap]){
+    for(nrqst=0, ineigh = comm->recvfrom[iswap]; ineigh< comm->recvtill[iswap]; ineigh++)
+      MPI_Irecv(&comm->atom_recv[ineigh],1,MPI_INT,comm->nrecv[ineigh],0,world,&requests[nrqst++]);
+
+    for(int ineigh = comm->sendfrom[iswap]; ineigh< comm->sendtill[iswap]; ineigh++)
+      MPI_Send(&comm->atom_send[ineigh],1,MPI_INT,comm->nsend[ineigh],0,world);
+
+    //MPI_Waitall takes an array of statuses, hence MPI_STATUSES_IGNORE
+    MPI_Waitall(nrqst,requests,MPI_STATUSES_IGNORE);
+  } else {
+    //Self communication: we receive exactly what we packed
+    comm->atom_recv[comm->recvfrom[iswap]] = nsend;
+  }
+
+  //Define offset to store in the recv_buff
+  for(int ineigh = comm->recvfrom[iswap]; ineigh<comm->recvtill[iswap]; ineigh++){
+    comm->off_atom_recv[ineigh] = all_recv;
+    all_recv += comm->atom_recv[ineigh];
+  }
+
+  if(all_recv*size>=comm->maxrecv) growRecv(comm,all_recv*size);
+
+  //Exchange the packed atoms
+  if(comm->othersend[iswap]){
+    for (nrqst=0, ineigh = comm->recvfrom[iswap]; ineigh< comm->recvtill[iswap]; ineigh++){
+      offset = comm->off_atom_recv[ineigh]*size;
+      nrecv = comm->atom_recv[ineigh]*size;
+      MPI_Irecv(&comm->buf_recv[offset], nrecv, type, comm->nrecv[ineigh],0,world,&requests[nrqst++]);
+    }
+
+    for (int ineigh = comm->sendfrom[iswap]; ineigh< comm->sendtill[iswap]; ineigh++){
+      offset = comm->off_atom_send[ineigh]*size;
+      nsend  = comm->atom_send[ineigh]*size;
+      MPI_Send(&comm->buf_send[offset],nsend,type,comm->nsend[ineigh],0,world);
+    }
+
+    MPI_Waitall(nrqst,requests,MPI_STATUSES_IGNORE);
+    buf = comm->buf_recv;
+  } else {
+    buf = comm->buf_send;
+  }
+
+  //unpack elements; the index of the first atom received in this swap is kept
+  //for forwardComm and reverseComm
+  comm->firstrecv[iswap] = LOCAL+GHOST;
+  for(int i = 0; i < all_recv; i++)
+    unpackGhost(atom, LOCAL+GHOST, &buf[i*size]);
+
+  //Make sure the buffers are large enough for the later forward and reverse
+  //communication of the same atoms
+  int max_size = MAX(comm->forwardSize,comm->reverseSize);
+  int max_buf = max_size * MAX(all_recv, all_send);
+  if(max_buf>=comm->maxrecv) growRecv(comm,max_buf);
+  if(max_buf>=comm->maxsend) growSend(comm,max_buf);
+}
+
+// Moves atoms that left the local box to the process that now owns them.
+//
+// After applying the periodic boundary conditions, every local atom outside
+// atom->mybox is packed with packExchange and removed from the local list.
+// The same packed buffer is sent to every process in comm->nexch; of the
+// atoms received, only those lying inside the local box are stored with
+// unpackExchange. Finally the global atom count is checked and rank 0 prints
+// a warning when it differs from atom->Natoms.
+void exchangeComm(Comm* comm, Atom* atom){
+
+  MD_FLOAT x,y,z;
+  MD_FLOAT *lo = atom->mybox.lo;
+  MD_FLOAT *hi = atom->mybox.hi;
+  int size = comm->exchangeSize;
+  int numneigh = comm->numneighexch;
+  //Variable length arrays must have at least one element, even when there is
+  //no exchange neighbour (e.g. a single process)
+  int nslots = (numneigh > 0) ? numneigh : 1;
+  int offset_recv[nslots];
+  int size_recv[nslots];
+  MPI_Request requests[nslots];
+  int i =0,  nsend = 0, nrecv = 0;
+  int nrqst = 0;
+  int nlocal, offset,m;
+
+  /* enforce PBC */
+  pbc(atom);
+
+  //NOTE(review): this returns before the MPI_Allreduce at the end of the
+  //function; confirm that all processes take the same path here.
+  if(comm->numneigh == 0) return;
+
+  //Pack and remove the atoms that are outside of the local box. The last
+  //local atom is copied into the freed slot, so index i is tested again.
+  nlocal = atom->Nlocal;
+  while(i < nlocal) {
+    if(atom_x(i) < lo[_x] || atom_x(i) >= hi[_x] ||
+       atom_y(i) < lo[_y] || atom_y(i) >= hi[_y] ||
+       atom_z(i) < lo[_z] || atom_z(i) >= hi[_z]) {
+      if(nsend+size >= comm->maxsend) growSend(comm, nsend);
+      nsend += packExchange(atom, i, &comm->buf_send[nsend]);
+      copy(atom, i, nlocal-1);
+      nlocal--;
+    } else i++;
+  }
+  atom->Nlocal = nlocal;
+
+  /* send/recv the number of values to share with neighbouring procs */
+  for(int ineigh = 0; ineigh < numneigh; ineigh++)
+    MPI_Irecv(&size_recv[ineigh],1,MPI_INT,comm->nexch[ineigh],0,world,&requests[nrqst++]);
+
+  for (int ineigh = 0; ineigh < numneigh; ineigh++)
+    MPI_Send(&nsend,1,MPI_INT,comm->nexch[ineigh],0,world);
+  //MPI_Waitall takes an array of statuses, hence MPI_STATUSES_IGNORE
+  MPI_Waitall(nrqst,requests,MPI_STATUSES_IGNORE);
+
+  //Define offset to store in the recv_buff
+  for(int ineigh = 0; ineigh<numneigh; ineigh++){
+    offset_recv[ineigh] = nrecv;
+    nrecv += size_recv[ineigh];
+  }
+
+  if(nrecv >= comm->maxrecv) growRecv(comm,nrecv);
+
+  //Receives elements
+  nrqst=0;
+  for (int ineigh = 0; ineigh< numneigh; ineigh++){
+    offset = offset_recv[ineigh];
+    MPI_Irecv(&comm->buf_recv[offset], size_recv[ineigh], type, comm->nexch[ineigh],0,world,&requests[nrqst++]);
+  }
+  //Send elements: every neighbour gets the complete buffer
+  for (int ineigh = 0; ineigh< numneigh; ineigh++)
+    MPI_Send(comm->buf_send,nsend,type,comm->nexch[ineigh],0,world);
+  MPI_Waitall(nrqst,requests,MPI_STATUSES_IGNORE);
+
+  //Keep only the received atoms that lie inside the local box
+  nlocal = atom->Nlocal;
+  m = 0;
+  while(m < nrecv) {
+    x = comm->buf_recv[m + _x];
+    y = comm->buf_recv[m + _y];
+    z = comm->buf_recv[m + _z];
+
+    if(x >= lo[_x] && x < hi[_x] &&
+       y >= lo[_y] && y < hi[_y] &&
+       z >= lo[_z] && z < hi[_z]){
+      m += unpackExchange(atom, nlocal++, &comm->buf_recv[m]);
+    } else {
+      m += size;
+    }
+  }
+  atom->Nlocal = nlocal;
+
+  //Sanity check: the total number of local atoms must stay constant
+  int all_atoms=0;
+  MPI_Allreduce(&atom->Nlocal, &all_atoms, 1, MPI_INT, MPI_SUM, world);
+  if(atom->Natoms!=all_atoms && comm->myproc ==0){
+    printf("Losing atoms! current atoms:%d expected atoms:%d\n",all_atoms,atom->Natoms);
+  }
+}
+
+//Internal functions
+
+// Replaces the receive buffer by a new one holding BUFFACTOR * n MD_FLOAT
+// elements. The previous contents are discarded, not copied.
+inline void growRecv(Comm* comm, int n)
+{
+  // free(NULL) is a no-op, so no guard is needed
+  free(comm->buf_recv);
+
+  comm->maxrecv  = BUFFACTOR * n;
+  comm->buf_recv = (MD_FLOAT*) allocate(ALIGNMENT, comm->maxrecv * sizeof(MD_FLOAT));
+}
+
+// Resizes the send buffer to BUFFACTOR * n (+ BUFEXTRA) MD_FLOAT elements
+// through reallocate, which is given both the new and the old size in bytes.
+inline void growSend(Comm* comm, int n)
+{
+  const size_t previousBytes = (comm->maxsend+BUFEXTRA)*sizeof(MD_FLOAT);
+
+  comm->maxsend = BUFFACTOR * n;
+
+  const size_t requestedBytes = (comm->maxsend + BUFEXTRA) * sizeof(MD_FLOAT);
+  comm->buf_send = (MD_FLOAT*) reallocate(comm->buf_send, ALIGNMENT, requestedBytes, previousBytes);
+}
+
+// Resizes the send list of neighbour `ineigh` to BUFFACTOR * n entries
+// through reallocate, which is given both the new and the old size in bytes.
+inline void growList(Comm* comm, int ineigh, int n)
+{
+  const size_t previousBytes = comm->maxsendlist[ineigh]*sizeof(int);
+
+  comm->maxsendlist[ineigh] = BUFFACTOR * n;
+
+  const size_t requestedBytes = comm->maxsendlist[ineigh] * sizeof(int);
+  comm->sendlist[ineigh] = (int*) reallocate(comm->sendlist[ineigh],ALIGNMENT, requestedBytes, previousBytes);
+}
+
+// Allocates every buffer whose size depends on the current number of
+// neighbours (comm->numneigh): the per-neighbour atom counters and offsets,
+// and one send list of BUFMIN entries per neighbour.
+static inline void  allocDynamicBuffers(Comm* comm)
+{
+  const int count = comm->numneigh;
+  const size_t intBytes = count * sizeof(int);
+
+  // Counters and offsets, one entry per neighbour
+  comm->atom_send     = (int*) allocate(ALIGNMENT, intBytes);
+  comm->atom_recv     = (int*) allocate(ALIGNMENT, intBytes);
+  comm->off_atom_send = (int*) allocate(ALIGNMENT, intBytes);
+  comm->off_atom_recv = (int*) allocate(ALIGNMENT, intBytes);
+  comm->maxsendlist   = (int*) allocate(ALIGNMENT, intBytes);
+
+  // One send list per neighbour, each starting with BUFMIN entries
+  comm->sendlist = (int**) allocate(ALIGNMENT, count * sizeof(int*));
+  for(int ineigh = 0; ineigh < count; ineigh++) {
+    comm->maxsendlist[ineigh] = BUFMIN;
+    comm->sendlist[ineigh] = (int*) allocate(ALIGNMENT, comm->maxsendlist[ineigh] * sizeof(int));
+  }
+}
+
+// Releases the buffers created by allocDynamicBuffers. comm->numneigh must
+// still hold the neighbour count they were allocated for. The pointers are
+// not reset here.
+static inline void freeDynamicBuffers(Comm* comm)
+{
+  // free(NULL) is a no-op, so the flat buffers need no guard
+  free(comm->atom_send);
+  free(comm->atom_recv);
+  free(comm->off_atom_send);
+  free(comm->off_atom_recv);
+  free(comm->maxsendlist);
+
+  // The list of lists must exist before its entries can be visited
+  if(comm->sendlist != NULL) {
+    const int count = comm->numneigh;
+    for(int ineigh = 0; ineigh < count; ineigh++) {
+      free(comm->sendlist[ineigh]);
+    }
+    free(comm->sendlist);
+  }
+}
+
+// Releases every buffer owned by `comm`: the neighbour tables, the
+// per-neighbour counters, offsets and send lists, and both message buffers.
+// The pointers are not reset here.
+static inline void freeBuffers(Comm* comm)
+{
+  // Neighbour tables (free(NULL) is a no-op, so no guards are needed)
+  free(comm->nrecv);
+  free(comm->nsend);
+  free(comm->nexch);
+  free(comm->pbc_x);
+  free(comm->pbc_y);
+  free(comm->pbc_z);
+  free(comm->boxes);
+
+  // Per-neighbour counters and offsets
+  free(comm->atom_send);
+  free(comm->atom_recv);
+  free(comm->off_atom_send);
+  free(comm->off_atom_recv);
+  free(comm->maxsendlist);
+
+  // Send lists: entries first, then the list of lists itself
+  if(comm->sendlist != NULL) {
+    for(int ineigh = 0; ineigh < comm->numneigh; ineigh++) {
+      free(comm->sendlist[ineigh]);
+    }
+    free(comm->sendlist);
+  }
+
+  // Message buffers
+  free(comm->buf_send);
+  free(comm->buf_recv);
+}
--- a/common/grid.c
+++ b/common/grid.c
@@ -0,0 +1,490 @@
+#include <stdio.h>
+#include <grid.h>
+#include <mpi.h>
+#include <parameter.h>
+#include <allocate.h>
+#include <util.h>
+#include <math.h>
+
+static MPI_Datatype type = (sizeof(MD_FLOAT) == 4) ? MPI_FLOAT : MPI_DOUBLE;
+
+//GROMACS balancing
+// Clamps every entry of x[0..nprocs) from below to `minx`, stores the result
+// in fx and divides fx by the sum of the clamped values, so that the entries
+// of fx add up to one.
+//
+// Returns the sum of the clamped values, i.e. the divisor that was used.
+// (The function is declared to return MD_FLOAT; it previously had no return
+// statement at all.)
+MD_FLOAT f_normalization(MD_FLOAT* x,MD_FLOAT* fx, MD_FLOAT minx, int nprocs) {
+
+  MD_FLOAT sum=0;
+  for(int n = 0; n<nprocs; n++){
+    fx[n] = MAX(minx,x[n]);
+    sum+=fx[n];
+  }
+
+  for(int n = 0; n<nprocs; n++)
+    fx[n] /= sum;
+
+  return sum;
+}
+
+// Damped fixed-point iteration on x0[0..nprocs), updated in place.
+//
+// Each step computes f(x0) with f_normalization (clamp to `minx`, then
+// normalise to a sum of one) and replaces x0 by (1-alpha)*x0 + alpha*f(x0)
+// with alpha = 0.5. The iteration stops when no entry changes by 1e-3 or
+// more in a step, or after 100 steps.
+void fixedPointIteration(MD_FLOAT* x0, int nprocs, MD_FLOAT minx)
+{
+  MD_FLOAT tolerance = 1e-3;
+  MD_FLOAT alpha = 0.5;
+  int maxIterations = 100;
+  MD_FLOAT *fx = (MD_FLOAT*) malloc(nprocs*sizeof(MD_FLOAT));
+
+  for (int i = 0; i < maxIterations; i++) {
+
+    int converged = 1;
+    f_normalization(x0,fx,minx,nprocs);
+
+    //Blend the old iterate with the normalised one
+    for(int n=0; n<nprocs; n++)
+      fx[n]= (1-alpha) * x0[n] + alpha * fx[n];
+
+    for (int n=0; n<nprocs; n++) {
+        if (fabs(fx[n] - x0[n]) >= tolerance) {
+            converged = 0;
+            break;
+        }
+    }
+
+    //The new iterate is stored even in the step that detects convergence
+    for (int n=0; n<nprocs; n++)
+        x0[n] = fx[n];
+
+    if(converged) break;
+  }
+
+  //Release the scratch array on every path (it used to be leaked)
+  free(fx);
+}
+
+// Staggered-grid load balancing.
+//
+// The time elapsed since the previous call (newTime - grid->Timer) is used
+// as the load of this process. Loads are gathered along sub-communicators in
+// z, y and x order; the root of each sub-communicator normalises them, runs
+// fixedPointIteration and derives new cell sizes that are inversely
+// proportional to the load. The new limits are scattered back in x, y, z
+// order and stored in atom->mybox. Finally the complete domain map is
+// gathered into grid->map and grid->cutneigh is enlarged by the largest
+// boundary displacement found on any process.
+void staggeredBalance(Grid* grid, Atom* atom, Parameter* param, double newTime)
+{
+  int me;
+  MPI_Comm_rank(MPI_COMM_WORLD, &me);
+  int *coord = grid->coord;
+  int *nprocs  = grid ->nprocs;
+  //Elapsed time since the last rebalance. It is kept as MD_FLOAT because it
+  //is communicated with the MPI datatype `type`, which describes MD_FLOAT.
+  MD_FLOAT time = (MD_FLOAT) (newTime - grid->Timer);
+  grid->Timer = newTime;
+  //store the old boundaries to compare later for exchange
+  MD_FLOAT lo[3], hi[3];
+  for(int dim = 0; dim< 3; dim++){
+    lo[dim] = atom->mybox.lo[dim];
+    hi[dim] = atom->mybox.hi[dim];
+  }
+
+  //Define parameters
+  MPI_Comm subComm[3];
+  int color[3] = {0,0,0};
+  int id[3] = {0,0,0};
+  MD_FLOAT ** load = (MD_FLOAT**) malloc(3*sizeof(MD_FLOAT*));
+  for(int dim = 0; dim<3; dim++)
+    load[dim] = (MD_FLOAT*) malloc(nprocs[dim]*sizeof(MD_FLOAT));
+
+  int maxprocs = MAX(MAX(nprocs[_x],nprocs[_y]),nprocs[_z]);
+  MD_FLOAT* cellSize = (MD_FLOAT*) malloc(maxprocs*sizeof(MD_FLOAT));
+  MD_FLOAT* limits = (MD_FLOAT*) malloc(2*maxprocs*sizeof(MD_FLOAT)); //limits: (x0, x1), (x1, x2)... Inner values are repeated to perform MPI_Scatter later
+  MD_FLOAT t_sum[3] = {0,0,0};
+  MD_FLOAT recv_buf[2] = {0,0};        //Each proc only receives 2 elements per dimension: lo and hi
+  MD_FLOAT balancedLoad[3] = {0,0,0};  //1/nprocs
+  MD_FLOAT minLoad[3]  = {0,0,0};      //beta*(1/nprocs)
+  MD_FLOAT prd[3] = {param->xprd, param->yprd, param->zprd};
+  MD_FLOAT boundaries[6] ={0,0,0,0,0,0}; // xlo,xhi,ylo,yhi,zlo,zhi
+
+  //Create sub-communicators along each dimension
+  for(int dim = 0; dim<3; dim++){
+     if(dim == _x){
+        color[_x] = (coord[_y] == 0 && coord[_z] ==0) ? 1:MPI_UNDEFINED;
+        id[_x] = me;
+     } else if(dim == _y) {
+        color[_y] = coord[_z] == 0 ? coord[_x]:MPI_UNDEFINED;
+        id[_y] = (coord[_y] == 0 && coord[_z] == 0) ? 0:me;
+     } else {
+        color[_z]= coord[_y]*nprocs[_x]+coord[_x];
+        id[_z] = coord[_z] == 0 ? 0 : me;
+     }
+    MPI_Comm_split(world, color[dim], id[dim], &subComm[dim]);
+  }
+
+  //Set the minimum load and the balanced load
+  for(int dim = 0; dim<3; dim++){
+    balancedLoad[dim] = 1./nprocs[dim];
+    minLoad[dim]  = 0.8*balancedLoad[dim];
+  }
+  //set and communicate the workload in reverse order
+  for(int dim = _z; dim>= _x; dim--)
+  {
+    if(subComm[dim] != MPI_COMM_NULL){
+      MPI_Gather(&time,1,type,load[dim],1,type,0,subComm[dim]);
+
+      if(id[dim] == 0)
+      {
+        for(int n=0; n<nprocs[dim]; n++)
+          t_sum[dim] += load[dim][n];
+
+        for(int n=0; n<nprocs[dim]; n++)
+          load[dim][n] /= t_sum[dim];
+      }
+      //The accumulated time is what gets gathered along the next dimension
+      time =t_sum[dim];
+    }
+    MPI_Barrier(world);
+  }
+
+  //Broadcast the new boundaries along dimensions
+  for(int dim=0; dim<3; dim++){
+
+    if(subComm[dim] != MPI_COMM_NULL){
+
+      MPI_Bcast(boundaries,6,type,0,subComm[dim]);
+      if(id[dim] == 0) {
+        fixedPointIteration(load[dim], nprocs[dim], minLoad[dim]);
+        MD_FLOAT inv_sum=0;
+        for(int n=0; n<nprocs[dim];n++)
+          inv_sum +=(1/load[dim][n]);
+
+        //Cell sizes are inversely proportional to the load and add up to prd
+        for(int n=0; n<nprocs[dim];n++)
+          cellSize[n] = (prd[dim]/load[dim][n])*(1./inv_sum);
+
+        MD_FLOAT sum=0;
+        for(int n=0; n<nprocs[dim]; n++){
+          limits[2*n] = sum;
+          limits[2*n+1] = sum+cellSize[n];
+          sum+= cellSize[n];
+        }
+        //Pin the last limit to the box length to avoid round-off gaps
+        limits[2*nprocs[dim]-1] = prd[dim];
+      }
+      MPI_Scatter(limits,2,type,recv_buf,2,type,0,subComm[dim]);
+      boundaries[2*dim] = recv_buf[0];
+      boundaries[2*dim+1] = recv_buf[1];
+    }
+     MPI_Barrier(world);
+  }
+
+  atom->mybox.lo[_x]=boundaries[0]; atom->mybox.hi[_x]=boundaries[1];
+  atom->mybox.lo[_y]=boundaries[2]; atom->mybox.hi[_y]=boundaries[3];
+  atom->mybox.lo[_z]=boundaries[4]; atom->mybox.hi[_z]=boundaries[5];
+
+  //The map stores xlo,ylo,zlo,xhi,yhi,zhi for every process
+  MD_FLOAT domain[6] = {boundaries[0], boundaries[2], boundaries[4], boundaries[1], boundaries[3], boundaries[5]};
+  MPI_Allgather(domain, 6, type, grid->map, 6, type, world);
+
+  //because cells change dynamically, it is required to increase the neighbouring exchange region
+  for(int dim =_x; dim<=_z; dim++){
+    MD_FLOAT dr,dr_max;
+    dr = MAX(fabs(lo[dim] - atom->mybox.lo[dim]),fabs(hi[dim] - atom->mybox.hi[dim]));
+    MPI_Allreduce(&dr, &dr_max, 1, type, MPI_MAX, world);
+    grid->cutneigh[dim] = param->cutneigh+dr_max;
+  }
+
+  for(int dim=0; dim<3; dim++) {
+    if(subComm[dim] != MPI_COMM_NULL){
+      MPI_Comm_free(&subComm[dim]);
+    }
+    free(load[dim]);
+  }
+  free(load);
+  free(cellSize);   //was never released before
+  free(limits);
+}
+
+//RCB Balancing
+// Bisection position for RCB balancing, weighted by time: returns
+//   sum over processes(time * sum of atom_pos over local atoms)
+//   / sum over processes(time * Nlocal)
+// where both sums are taken over `subComm`.
+MD_FLOAT meanTimeBisect(Atom *atom, MPI_Comm subComm, int dim, double time)
+{
+  MD_FLOAT weightedSum = 0;
+  MD_FLOAT globalSum = 0;
+  MD_FLOAT localWeight = 0;
+  MD_FLOAT globalWeight = 0;
+
+  // Local contribution: positions summed first, then scaled by the time
+  for(int i = 0; i < atom->Nlocal; i++) {
+    weightedSum += atom_pos(i);
+  }
+  weightedSum *= time;
+  localWeight = atom->Nlocal*time;
+
+  MPI_Allreduce(&weightedSum, &globalSum, 1, type, MPI_SUM, subComm);
+  MPI_Allreduce(&localWeight, &globalWeight, 1, type, MPI_SUM, subComm);
+
+  return globalSum/globalWeight;
+}
+
+// Bisection position for RCB balancing: returns the mean of atom_pos over
+// all local atoms of all processes in `subComm`. The `time` argument is not
+// used.
+MD_FLOAT meanBisect(Atom* atom, MPI_Comm subComm, int dim, double time)
+{
+  MD_FLOAT localSum = 0;
+  MD_FLOAT globalSum = 0;
+  int globalCount = 0;
+
+  for(int i = 0; i < atom->Nlocal; i++) {
+    localSum += atom_pos(i);
+  }
+
+  MPI_Allreduce(&localSum, &globalSum, 1, type, MPI_SUM, subComm);
+  MPI_Allreduce(&atom->Nlocal, &globalCount, 1, MPI_INT, MPI_SUM, subComm);
+
+  return globalSum/globalCount;
+}
+
+// Performs one level of recursive bisection along `dim` inside `subComm`.
+//
+// `method` returns the bisection position. Processes in the lower half of the
+// communicator keep the part of their box below it, the others the part
+// above, and the chosen branch is recorded as bit `ilevel` of *color. Atoms
+// that end up outside the new box are packed and sent to the partner process
+// (rank +/- half); with an odd number of processes the last rank sends to
+// rank 0 and receives nothing. Every atom received is appended to the local
+// atom list.
+void nextBisectionLevel(Grid* grid, Atom* atom, RCB_Method method, MPI_Comm subComm, int dim ,int* color, int ilevel, double time)
+{
+  int rank, size;
+  int branch = 0, i = 0, m = 0;
+  int nsend = 0, nrecv = 0, nrecv2 = 0;
+  int values_per_atom = 7;
+  MD_FLOAT bisection, pos;
+  MPI_Request request[2] = {MPI_REQUEST_NULL,MPI_REQUEST_NULL};
+  MPI_Comm_rank(subComm,&rank);
+  MPI_Comm_size(subComm,&size);
+
+  int odd = size%2;
+  int extraProc = odd ? size-1:size;
+  int half = (int) (0.5*size);
+  int partner = (rank<half) ? rank+half:rank-half;
+  if(odd && rank == extraProc) partner = 0;
+  //Apply the bisection
+  bisection = method(atom,subComm,dim,time);
+  //Define the new boundaries
+  if(rank<half){
+    atom->mybox.hi[dim] = bisection;
+    branch = 0;
+  } else {
+    atom->mybox.lo[dim] = bisection;
+    branch = 1;
+  }
+  //Define new color for the further communication
+  *color = (branch << ilevel) | *color;
+  //Grow the send buffer
+  if(atom->Nlocal>=grid->maxsend){
+      if(grid->buf_send) free(grid->buf_send);
+      grid->buf_send = (MD_FLOAT*) malloc(atom->Nlocal*values_per_atom* sizeof(MD_FLOAT));
+      grid->maxsend = atom->Nlocal;
+  }
+  //buffer particles to send; the last local atom is copied into the freed
+  //slot, so index i is tested again
+  while(i < atom->Nlocal) {
+    pos = atom_pos(i);
+    if(pos < atom->mybox.lo[dim] || pos >= atom->mybox.hi[dim]) {
+      nsend += packExchange(atom, i, &grid->buf_send[nsend]);
+      copy(atom, i, atom->Nlocal-1);
+      atom->Nlocal--;
+    } else i++;
+  }
+
+  //Communicate the number of elements to be sent
+  if(rank < extraProc){
+    MPI_Irecv(&nrecv,1,MPI_INT,partner,0,subComm,&request[0]);
+  }
+  if(odd && rank == 0){
+    MPI_Irecv(&nrecv2,1,MPI_INT,extraProc,0,subComm,&request[1]);
+  }
+  MPI_Send(&nsend,1,MPI_INT,partner,0,subComm);
+  //Unused slots hold MPI_REQUEST_NULL; MPI_Waitall takes an array of
+  //statuses, hence MPI_STATUSES_IGNORE
+  MPI_Waitall(2,request,MPI_STATUSES_IGNORE);
+
+  //Grow the recv buffer
+  if(nrecv+nrecv2>=grid->maxrecv){
+      if(grid->buf_recv) free(grid->buf_recv);
+      grid->buf_recv = (MD_FLOAT*) malloc((nrecv+nrecv2)*values_per_atom*sizeof(MD_FLOAT));
+      grid->maxrecv = nrecv+nrecv2;
+  }
+
+  //communicate elements in the buffer
+  request[0] = MPI_REQUEST_NULL;
+  request[1] = MPI_REQUEST_NULL;
+
+  if(rank < extraProc){
+    MPI_Irecv(grid->buf_recv,nrecv,type,partner,0,subComm,&request[0]);
+  }
+  if(odd && rank == 0){
+    MPI_Irecv(&grid->buf_recv[nrecv],nrecv2,type,extraProc,0,subComm,&request[1]);
+  }
+  MPI_Send (grid->buf_send,nsend,type,partner,0,subComm);
+  MPI_Waitall(2,request,MPI_STATUSES_IGNORE);
+
+  //store atoms in atom list
+  while(m < nrecv+nrecv2){
+    m += unpackExchange(atom, atom->Nlocal++, &grid->buf_recv[m]);
+  }
+}
+
+/*
+ * Recursive coordinate bisection (RCB) of the whole simulation domain.
+ *
+ * Every rank starts with the full domain as its box. At each level the ranks
+ * that currently share a box (same `color`) are grouped in a communicator
+ * and the box is cut by nextBisectionLevel(); the cut dimensions cycle
+ * through the `ndim` longest domain dimensions. The loop ends once the tree
+ * has at least as many leaves as there are processes. Finally the box of
+ * every rank is gathered into grid->map.
+ *
+ * grid    : map, buffers, timer and cutneigh are updated
+ * atom    : local atoms; mybox is rebuilt and atoms migrate between ranks
+ * param   : provides the domain lengths and the neighbour cutoff
+ * method  : bisection rule handed to nextBisectionLevel()
+ * ndim    : number of dimensions to cycle through (index = level % ndim,
+ *           so it must be between 1 and 3)
+ * newTime : current time stamp; the time elapsed since grid->Timer is
+ *           forwarded to `method`
+ */
+void rcbBalance(Grid* grid, Atom* atom, Parameter* param, RCB_Method method, int ndim, double newTime)
+{
+  int me, nprocs=0, ilevel=0, nboxes=1;
+  int color = 0, size =0;
+  int index;
+  //Domain lengths are kept in floating point: truncating them to int would
+  //mis-order dimensions whose lengths differ by less than one unit
+  MD_FLOAT prd[3];
+  MPI_Comm subComm;
+  MPI_Comm_size(world, &nprocs);
+  MPI_Comm_rank(world, &me);
+
+  //set the elapsed time since the last dynamic balance
+  double time = newTime - grid->Timer;
+
+  prd[_x] = atom->mybox.xprd = param->xprd;
+  prd[_y] = atom->mybox.yprd = param->yprd;
+  prd[_z] = atom->mybox.zprd = param->zprd;
+
+  //Sort the dimensions by decreasing domain length
+  int largerDim[3] ={_x, _y, _z};
+
+  for(int i = 0; i< 2; i++){
+    for(int j = i+1; j<3; j++)
+    {
+      if(prd[largerDim[j]]>prd[largerDim[i]]){
+        int tmp = largerDim[j];
+        largerDim[j] = largerDim[i];
+        largerDim[i] = tmp;
+      }
+    }
+  }
+  //Initial Partition
+  atom->mybox.lo[_x] = 0; atom->mybox.hi[_x] = atom->mybox.xprd;
+  atom->mybox.lo[_y] = 0; atom->mybox.hi[_y] = atom->mybox.yprd;
+  atom->mybox.lo[_z] = 0; atom->mybox.hi[_z] = atom->mybox.zprd;
+
+  //Recursion tree
+  while(nboxes<nprocs)
+  {
+    index = ilevel%ndim;
+    MPI_Comm_split(world, color, me, &subComm);
+    MPI_Comm_size(subComm,&size);
+    //A box owned by a single rank cannot be cut any further
+    if(size > 1){
+      nextBisectionLevel(grid, atom, method, subComm, largerDim[index], &color, ilevel, time);
+    }
+    MPI_Comm_free(&subComm);
+    //Number of leaves after this level: 2^ilevel, in integer arithmetic
+    ++ilevel;
+    nboxes = 1 << ilevel;
+  }
+  //Set the new timer grid
+  grid->Timer = newTime;
+
+  //Creating the global map: lo x,y,z followed by hi x,y,z for every rank
+  MD_FLOAT domain[6] = {atom->mybox.lo[_x], atom->mybox.lo[_y], atom->mybox.lo[_z], atom->mybox.hi[_x], atom->mybox.hi[_y], atom->mybox.hi[_z]};
+  MPI_Allgather(domain, 6, type, grid->map, 6, type, world);
+
+  //Define the same cutneighbour in all dimensions for the exchange communication
+  for(int dim =_x; dim<=_z; dim++)
+    grid->cutneigh[dim] = param->cutneigh;
+}
+
+//Regular grid
+/*
+ * Regular decomposition: arranges all processes in a periodic 3d cartesian
+ * grid and gives every process an equally sized box of the domain, with the
+ * origin at (0,0,0).
+ *
+ * grid  : nprocs, coord, map and cutneigh are filled
+ * param : provides the domain lengths and the neighbour cutoff
+ * box   : receives the domain lengths and the lo/hi bounds of this process
+ */
+void cartisian3d(Grid* grid, Parameter* param, Box* box)
+{
+  int rank, nranks;
+  MPI_Comm_size(MPI_COMM_WORLD, &nranks);
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+  int ndims = 3;
+  int allowReorder = 0;
+  int periodic[3] = {1, 1, 1};
+  int coords[3] = {0, 0, 0};
+  int dims[3] = {0, 0, 0};
+  MD_FLOAT extent[3];
+  MD_FLOAT domain[6];
+  MPI_Comm cartComm;
+
+  box->xprd = param->xprd;
+  box->yprd = param->yprd;
+  box->zprd = param->zprd;
+
+  // Let MPI choose the process grid and look up my position in it
+  MPI_Dims_create(nranks, ndims, dims);
+  MPI_Cart_create(world, ndims, dims, periodic, allowReorder, &cartComm);
+  MPI_Cart_coords(cartComm, rank, 3, coords);
+
+  // Edge lengths of one box of the regular grid
+  extent[_x] = param->xprd / dims[_x];
+  extent[_y] = param->yprd / dims[_y];
+  extent[_z] = param->zprd / dims[_z];
+
+  for(int d = _x; d <= _z; d++) {
+    grid->nprocs[d] = dims[d];
+    grid->coord[d] = coords[d];
+    box->lo[d] = coords[d] * extent[d];
+    box->hi[d] = (coords[d] + 1) * extent[d];
+    // map layout per process: lo x,y,z followed by hi x,y,z
+    domain[d] = box->lo[d];
+    domain[3 + d] = box->hi[d];
+  }
+
+  MPI_Allgather(domain, 6, type, grid->map, 6, type, world);
+  MPI_Comm_free(&cartComm);
+
+  // Same neighbour cutoff in all dimensions for the exchange communication
+  for(int d = _x; d <= _z; d++) {
+    grid->cutneigh[d] = param->cutneigh;
+  }
+}
+
+//Other Functions from the grid
+/*
+ * Puts the grid in its initial state: allocates the global box map and
+ * resets the RCB exchange buffers and the balance timer.
+ */
+void initGrid(Grid* grid)
+{
+  int commSize;
+  MPI_Comm_size(world, &commSize);
+
+  // The map stores six bounds (lo and hi in x, y, z) per process
+  grid->map_size = 6 * commSize;
+  grid->map = (MD_FLOAT*) allocate(ALIGNMENT, grid->map_size * sizeof(MD_FLOAT));
+
+  // RCB exchange buffers start empty and are grown on demand
+  grid->buf_send = NULL;
+  grid->buf_recv = NULL;
+  grid->maxsend = 0;
+  grid->maxrecv = 0;
+
+  // Reference time stamp of the last balance
+  grid->Timer = 0.;
+}
+
+/*
+ * Initial setup of the domain decomposition: initialises the grid, builds
+ * the regular cartesian partition and drops from the local atom list every
+ * atom that lies outside the box of this process.
+ *
+ * When the atoms come from an input file they are first shifted so that the
+ * domain origin is at (0,0,0). Without dynamic balancing the total number of
+ * atoms is reduced over all processes and a summary line is printed.
+ */
+void setupGrid(Grid* grid, Atom* atom, Parameter* param)
+{
+  int rank;
+  MD_FLOAT lo[3], hi[3];
+
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  initGrid(grid);
+
+  // Move the origin to (0,0,0)
+  if(param->input_file){
+    for(int k = 0; k < atom->Nlocal; k++){
+      atom_x(k) -= param->xlo;
+      atom_y(k) -= param->ylo;
+      atom_z(k) -= param->zlo;
+    }
+  }
+
+  cartisian3d(grid, param, &atom->mybox);
+
+  for(int d = _x; d <= _z; d++) {
+    lo[d] = atom->mybox.lo[d];
+    hi[d] = atom->mybox.hi[d];
+  }
+
+  // Keep only the atoms inside my box. A removed atom is overwritten by the
+  // last one, so the index only advances when the current atom is kept.
+  int cur = 0;
+  while(cur < atom->Nlocal)
+  {
+    int inside = atom_x(cur) >= lo[_x] && atom_x(cur) < hi[_x] &&
+                 atom_y(cur) >= lo[_y] && atom_y(cur) < hi[_y] &&
+                 atom_z(cur) >= lo[_z] && atom_z(cur) < hi[_z];
+    if(!inside) {
+      copy(atom, cur, atom->Nlocal-1);
+      atom->Nlocal--;
+      continue;
+    }
+    cur++;
+  }
+
+  if(param->balance) return;
+
+  MPI_Allreduce(&atom->Nlocal, &atom->Natoms, 1, MPI_INT, MPI_SUM, world);
+  printf("Processor:%i, Local atoms:%i, Total atoms:%i\n",rank, atom->Nlocal,atom->Natoms);
+  MPI_Barrier(world);
+}
+
+/*
+ * Prints the bounds of the box of every process, as stored in grid->map.
+ * Only rank 0 prints; all processes synchronise on a barrier afterwards.
+ */
+void printGrid(Grid* grid)
+{
+  int rank, nranks;
+  MPI_Comm_size(world, &nranks);
+  MPI_Comm_rank(world, &rank);
+
+  if(rank == 0)
+  {
+    printf("GRID:\n");
+    printf("===================================================================================================\n");
+    for(int p = 0; p < nranks; p++) {
+      // Six values per process: lo x,y,z followed by hi x,y,z
+      const MD_FLOAT* bounds = &grid->map[6*p];
+      printf("Box:%i\txlo:%.4f\txhi:%.4f\tylo:%.4f\tyhi:%.4f\tzlo:%.4f\tzhi:%.4f\n", p,bounds[0],bounds[3],bounds[1],bounds[4],bounds[2],bounds[5]);
+    }
+    printf("\n\n");
+  }
+  MPI_Barrier(world);
+}
+
+
+
--- a/common/includes/box.h
+++ b/common/includes/box.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <parameter.h>
+
+#ifndef __BOX_H_
+#define __BOX_H_
+
+/*
+ * Axis-aligned box describing the subdomain owned by one process, together
+ * with the lengths of the whole simulation domain.
+ */
+typedef struct {
+  int id;                        //Box identifier; overlapBox compares the ids to detect "same box"
+  MD_FLOAT xprd, yprd, zprd;     //Length of the whole domain in x, y and z
+  MD_FLOAT lo[3];               //Lowest coordinate of my subdomain, indexed by _x, _y, _z
+  MD_FLOAT hi[3];               //Highest coordinate of my subdomain, indexed by _x, _y, _z
+} Box;
+
+/*
+ * overlapBox(dim, dir, mybox, other, cut, xprd, cutneigh): intersects mybox
+ * with `other` extended by cutneigh along `dim` in direction `dir` and
+ * stores the intersection in `cut`. Returns 0 for a direct overlap, 1 or -1
+ * for an overlap through the periodic boundary (dir 0 or otherwise), and
+ * -100 when the boxes do not overlap.
+ */
+int overlapBox(int, int , const Box*, const Box* , Box* , MD_FLOAT , MD_FLOAT);
+// NOTE(review): overlapFullBox and expandBox are defined outside this chunk;
+// their exact semantics should be confirmed against common/box.c.
+int overlapFullBox(Parameter*, MD_FLOAT*, const Box*, const Box*);
+void expandBox(int , const Box*, const Box* , Box* , MD_FLOAT);
+#endif
--- a/common/includes/comm.h
+++ b/common/includes/comm.h
@@ -0,0 +1,104 @@
+#include <atom.h>
+#include <parameter.h>
+#include <box.h>
+#include <grid.h>
+
+#ifndef COMM_H
+#define COMM_H
+
+/*
+ * Per-variant communication constants.
+ *
+ * *_SIZE     : number of values packed per atom (or per cluster in the
+ *              GROMACS variant) for each kind of communication
+ * LOCAL/GHOST: number of local / ghost entities the communication iterates
+ * IsinRegionToSend(i): true when entity i lies in the region delimited by
+ *              xlo..zhi, which must be variables of the calling scope, as
+ *              must `atom`
+ *
+ * The replacement lists of JFAC, LOCAL and GHOST are parenthesized so they
+ * expand safely inside larger expressions.
+ */
+#ifdef GROMACS
+#define FORWARD_SIZE  (3*CLUSTER_N)   
+#define REVERSE_SIZE  (3*CLUSTER_N)
+#define GHOST_SIZE    (4*CLUSTER_N+10)
+#define EXCHANGE_SIZE 7
+
+#define JFAC (MAX(1, CLUSTER_N / CLUSTER_M))
+#define LOCAL (atom->Nclusters_local / JFAC)
+#define GHOST (atom->Nclusters_ghost)
+
+/* A cluster is selected when its bounding box reaches into the region in
+ * every dimension (either its min or its max bound satisfies each limit). */
+#define IsinRegionToSend(cj)                                                                  \
+           ((atom->jclusters[(cj)].bbminx >= xlo || atom->jclusters[(cj)].bbmaxx >= xlo)  &&  \
+            (atom->jclusters[(cj)].bbminx  < xhi || atom->jclusters[(cj)].bbmaxx  < xhi)  &&  \
+            (atom->jclusters[(cj)].bbminy >= ylo || atom->jclusters[(cj)].bbmaxy >= ylo)  &&  \
+            (atom->jclusters[(cj)].bbminy  < yhi || atom->jclusters[(cj)].bbmaxy  < yhi)  &&  \
+            (atom->jclusters[(cj)].bbminz >= zlo || atom->jclusters[(cj)].bbmaxz >= zlo)  &&  \
+            (atom->jclusters[(cj)].bbminz  < zhi || atom->jclusters[(cj)].bbmaxz  < zhi))  
+
+#else
+
+#define FORWARD_SIZE  3   
+#define REVERSE_SIZE  3
+#define GHOST_SIZE    4
+#define EXCHANGE_SIZE 7
+#define LOCAL (atom->Nlocal)
+#define GHOST (atom->Nghost)
+
+/* An atom is selected when its position lies in [lo, hi) in every dimension. */
+#define IsinRegionToSend(i)                                 \
+           ((atom_x((i)) >= xlo && atom_x((i)) < xhi) &&    \
+            (atom_y((i)) >= ylo && atom_y((i)) < yhi) &&    \
+            (atom_z((i)) >= zlo && atom_z((i)) < zhi)) 
+
+#endif 
+
+/*
+ * State of the point-to-point communication of one process: neighbour
+ * lists per swap, periodic-boundary flags, send lists and the shared
+ * send/receive buffers.
+ */
+typedef struct {
+  int myproc;                       // my proc ID
+  int numproc;                      // # of processors
+	
+  int numneigh;                     // # of all my neighbours along all swaps
+  int maxneigh;										  // buffer size for my neighbours
+	int sendfrom[6];                  // lowest neighbour index to send to in each swap
+  int sendtill[6];                  // highest neighbour index to send to in each swap
+  int recvfrom[6];                  // lowest neighbour index to receive from in each swap
+  int recvtill[6];                  // highest neighbour index to receive from in each swap
+  int* nsend;											  // neighbours I want to send to
+  int* nrecv;                       // neighbours I want to receive from
+
+	int* pbc_x;                       // if pbc in x
+	int* pbc_y;                       // if pbc in y
+	int* pbc_z;                       // if pbc in z
+	
+  int* atom_send, *atom_recv;       // # of atoms to send/recv for each of my neighbours
+	int* off_atom_send;               // atom offset to send, inside of a swap
+  int* off_atom_recv;               // atom offset to recv, inside of a swap
+         
+  int* nexch;                        // procs to exchange with
+  int numneighexch;                  // # of neighbours to exchange with
+  int maxneighexch;                  // max buffer size to store neighbours
+
+	int numswap;                      // # of swaps to perform, it is 6
+  int swapdim[6]; 									// dimension of the swap (_x, _y or _z)
+	int swapdir[6];										// direction of the swap, 0 or 1
+  int swap[3][2];                   // given a dim and dir, knows the swap
+  int othersend[6];                 // determines if a proc interacts with more procs in a given swap
+
+	int firstrecv[6];                 // where to put the 1st received atom in each swap
+  int** sendlist;                   // list of atoms to send in each swap
+  int* maxsendlist;								  // max # of atoms to send in each swap list
+
+	int maxsend;											// max elements in the send buffer
+	int maxrecv;											// max elements in the receive buffer
+  MD_FLOAT* buf_send;               // send buffer for all comm
+	MD_FLOAT* buf_recv;               // receive buffer for all comm
+	 	  
+	int forwardSize;					        // # of parameters per atom in forward comm.
+	int reverseSize;			        		// # of parameters per atom in reverse comm.
+  int exchangeSize;                 // # of parameters per atom in exchange
+	int ghostSize;                    // # of parameters per atom in ghost list
+
+  int  iterAtom;                     // last atom to iterate in each swap
+  Box* boxes; 											 // boundaries to be sent to other procs as ghost
+} Comm;
+
+
+void initComm(int*, char***, Comm*); 						    //Init MPI 
+void endComm(Comm*);													      //End MPI
+void setupComm(Comm*,Parameter*,Grid*);             //Creates a 3d grid or rcb grid
+void neighComm(Comm*,Parameter*,Grid*);             //Find neighbours within cut-off and defines ghost regions
+void forwardComm(Comm*,Atom*,int);							    //Send info in one direction
+void reverseComm(Comm*,Atom*,int);							    //Return info after forward communication
+void exchangeComm(Comm*,Atom*);							        //Exchange info between procs
+void ghostComm(Comm*, Atom*,int);                   //Build the ghost neighbours to send during next forwards
+void growSend(Comm*,int);										        //Grows the size of the buffer sender
+void growRecv(Comm*,int);										        //Grows the size of the buffer receiver
+void growList(Comm*, int, int);                     //Grows the size of the list to send
+#endif
--- a/common/includes/grid.h
+++ b/common/includes/grid.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+
+
+#include <parameter.h>
+#include <box.h>
+#include <atom.h>
+#include <mpi.h>
+
+#ifndef __MAP_H_
+#define __MAP_H_
+
+// Shorthand for the global communicator
+#define world MPI_COMM_WORLD
+// Coordinate of atom i along `dim`; relies on a variable named `dim` being
+// in scope at the point of use
+#define atom_pos(i) ((dim == _x) ? atom_x((i)) : (dim == _y) ? atom_y((i)) : atom_z((i)))
+
+// Dynamic balance methods; 0 (none of these) keeps the regular cartesian grid
+enum {RCB=1, meanTimeRCB, Staggered};
+
+/*
+ * Description of the domain decomposition shared by all balance methods.
+ */
+typedef struct {
+  int balance_every;      // NOTE(review): presumably the rebalance interval; not set in the code shown here
+  int  map_size;          // number of values in map: 6 per process
+  MD_FLOAT* map;          // box of every process: lo x,y,z followed by hi x,y,z
+  //===Param for Staggered balance
+  int nprocs[3];          // number of processes of the cartesian grid in each dimension
+  int coord[3];           // my coordinates in the cartesian grid
+  MD_FLOAT cutneigh[3];   // neighbour cutoff used in each dimension
+  double Timer;           // time stamp of the last balance
+  //===Param for RCB balance 
+  MD_FLOAT* buf_send;     // buffer with the atoms sent during a bisection
+  MD_FLOAT* buf_recv;     // buffer with the atoms received during a bisection
+  int maxsend;            // capacity bookkeeping of buf_send
+  int maxrecv;            // capacity bookkeeping of buf_recv
+} Grid; 
+
+
+typedef MD_FLOAT(*RCB_Method)(Atom*,MPI_Comm,int,double);
+
+void setupGrid(Grid*, Atom*, Parameter*);
+void cartisian3d(Grid*, Parameter*, Box*);
+void rcbBalance(Grid*, Atom*, Parameter* ,RCB_Method, int, double);
+void staggeredBalance(Grid*, Atom*, Parameter*, double); 
+void printGrid(Grid*); 
+//rcb methods
+MD_FLOAT meanBisect(Atom* , MPI_Comm, int, double);
+MD_FLOAT meanTimeBisect(Atom*, MPI_Comm, int, double);
+#endif
+
+
--- a/common/includes/parameter.h
+++ b/common/includes/parameter.h
@@ -53,6 +53,10 @@ typedef struct {
    MD_FLOAT k_dn;
    MD_FLOAT gx, gy, gz;
    MD_FLOAT reflect_x, reflect_y, reflect_z;
+    //MPI implementation
+    int balance;
+    int method;
+    int balance_every;
 } Parameter;

 void initParameter(Parameter*);
--- a/common/includes/shell_methods.h
+++ b/common/includes/shell_methods.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <limits.h>
+#include <math.h>
+#include <comm.h>
+#include <atom.h>
+#include <timing.h>
+#include <parameter.h>
+#include <util.h>
+
+//static void addDummyCluster(Atom*);
+
+/*
+ * Runs the forward communication for every swap required by the selected
+ * shell method and returns the elapsed time.
+ *
+ * halfShell uses swaps 0..4, eightShell swaps 0, 2 and 4, any other method
+ * all six swaps.
+ */
+double forward(Comm* comm, Atom *atom, Parameter* param){
+    int nswaps = 6;
+    int stride = 1;
+    double start = getTimeStamp();
+
+    switch(param->method){
+        case halfShell:
+            nswaps = 5;
+            break;
+        case eightShell:
+            stride = 2;
+            break;
+        default:
+            break;
+    }
+    for(int iswap = 0; iswap < nswaps; iswap += stride){
+        forwardComm(comm, atom, iswap);
+    }
+
+    double end = getTimeStamp();
+    return end-start;
+}
+
+/*
+ * Runs the reverse communication, walking the swaps in descending order,
+ * and returns the elapsed time.
+ *
+ * halfShell uses swaps 4..0, eightShell swaps 4, 2 and 0, halfStencil swaps
+ * 5..0. Any other method (full shell) performs no reverse communication.
+ */
+double reverse(Comm* comm, Atom *atom, Parameter* param){
+    int first = -1;   // a negative start means: nothing to do
+    int stride = 1;
+    double start = getTimeStamp();
+
+    switch(param->method){
+        case halfShell:
+            first = 4;
+            break;
+        case eightShell:
+            first = 4;
+            stride = 2;
+            break;
+        case halfStencil:
+            first = 5;
+            break;
+        default:
+            break;
+    }
+    for(int iswap = first; iswap >= 0; iswap -= stride){
+        reverseComm(comm, atom, iswap);
+    }
+
+    double end = getTimeStamp();
+    return end-start;
+}
+
+/*
+ * Rebuilds the ghost lists: resets the ghost counters and runs the ghost
+ * communication for every swap required by the selected shell method
+ * (halfShell: swaps 0..4, eightShell: swaps 0, 2 and 4, otherwise all six).
+ */
+void ghostNeighbor(Comm* comm, Atom* atom, Parameter* param)
+{
+    int nswaps = 6;
+    int stride = 1;
+
+    #ifdef GROMACS
+    atom->Nclusters_ghost = 0;
+    #endif
+    atom->Nghost = 0;
+
+    switch(param->method){
+        case halfShell:
+            nswaps = 5;
+            break;
+        case eightShell:
+            stride = 2;
+            break;
+        default:
+            break;
+    }
+    for(int iswap = 0; iswap < nswaps; iswap += stride){
+        ghostComm(comm, atom, iswap);
+    }
+}
--- a/common/includes/timers.h
+++ b/common/includes/timers.h
@@ -9,9 +9,15 @@

 typedef enum {
    TOTAL = 0,
-    NEIGH,
    FORCE,
+    NEIGH,
+    FORWARD,
+    REVERSE,
+    UPDATE,
+    BALANCE,
+    SETUP,
+    REST,
    NUMTIMER
-} timertype;
+ } timerComm;

 #endif
--- a/common/includes/util.h
+++ b/common/includes/util.h
@@ -4,6 +4,8 @@
 * Use of this source code is governed by a LGPL-3.0
 * license that can be found in the LICENSE file.
 */
+#include <math.h>
+
 #ifndef __UTIL_H_
 #define __UTIL_H_

@@ -35,6 +37,13 @@
 #   define PRECISION_STRING     "double"
 #endif

+// Floating point comparisons with an absolute tolerance of 1e-9.
+// Both macros evaluate their arguments more than once.
+// BigOrEqual(a,b): a is larger than b, or equal to it within the tolerance
+#define BigOrEqual(a,b) (fabs((a)-(b))<1e-9 || (a)>(b))
+// Equal(a,b): a and b differ by less than the tolerance
+#define Equal(a,b) (fabs((a)-(b))<1e-9)
+
+// Indices of the three space dimensions
+enum {_x=0, _y, _z}; 
+// Communication shell methods (values of Parameter.method)
+enum {fullShell=0, halfShell, eightShell, halfStencil};
+
+
 extern double myrandom(int*);
 extern void random_reset(int *seed, int ibase, double *coord);
 extern int str2ff(const char *string);
--- a/common/parameter.c
+++ b/common/parameter.c
@@ -11,6 +11,7 @@
 #include <atom.h>
 #include <parameter.h>
 #include <util.h>
+#include <mpi.h>

 void initParameter(Parameter *param) {
    param->input_file = NULL;
@@ -54,13 +55,17 @@ void initParameter(Parameter *param) {
    param->reflect_x = 0.0;
    param->reflect_y = 0.0;
    param->reflect_z = 0.0;
+    //MPI
+    param->balance = 0;
+    param->method = 0;
+    param->balance_every =param->reneigh_every; 
 }

 void readParameter(Parameter *param, const char *filename) {
    FILE *fp = fopen(filename, "r");
    char line[MAXLINE];
    int i;
-
+    
    if(!fp) {
        fprintf(stderr, "Could not open parameter file: %s\n", filename);
        exit(-1);
@@ -72,8 +77,8 @@ void readParameter(Parameter *param, const char *filename) {
        for(i = 0; line[i] != '\0' && line[i] != '#'; i++);
        line[i] = '\0';

-        char *tok = strtok(line, " ");
-        char *val = strtok(NULL, " ");
+        char *tok = strtok(line, "\t ");
+        char *val = strtok(NULL, "\t ");

        #define PARSE_PARAM(p,f)   if(strncmp(tok, #p, sizeof(#p) / sizeof(#p[0]) - 1) == 0) { param->p = f(val); }
        #define PARSE_STRING(p)    PARSE_PARAM(p, strdup)
@@ -117,15 +122,20 @@ void readParameter(Parameter *param, const char *filename) {
            PARSE_INT(x_out_every);
            PARSE_INT(v_out_every);
            PARSE_INT(half_neigh);
+            PARSE_INT(method);
+            PARSE_INT(balance);
+            PARSE_INT(balance_every);
        }
    }
-
    // Update dtforce
    param->dtforce = 0.5 * param->dt;

    // Update sigma6 parameter
    MD_FLOAT s2 = param->sigma * param->sigma;
    param->sigma6 = s2 * s2 * s2;
+    
+    //Update balance parameter, 10 could be change
+    param->balance_every *=param->reneigh_every;
    fclose(fp);
 }

@@ -183,4 +193,19 @@ void printParameter(Parameter *param) {
    printf("\tSkin: %e\n", param->skin);
    printf("\tHalf neighbor lists: %d\n", param->half_neigh);
    printf("\tProcessor frequency (GHz): %.4f\n", param->proc_freq);
+
+    // ================ New MPI features =============
+    char str[20]; 
+    strcpy(str, (param->method == 1) ? "Half Shell"  :
+                (param->method == 2) ? "Eight Shell" :
+                (param->method == 3) ? "Half Stencil":                      
+                                       "Full Shell");
+    printf("\tMethod: %s\n", str);
+    strcpy(str, (param->balance == 1) ? "mean RCB"      : 
+                (param->balance == 2) ? "mean Time RCB" :
+                (param->balance == 3) ? "Staggered"     :
+                                        "cartisian");
+    printf("\tPartition: %s\n", str);
+    if(param->balance) 
+        printf("\tRebalancing every (timesteps): %d\n",param->balance_every); 
 }
--- a/common/thermo.c
+++ b/common/thermo.c
@@ -10,6 +10,7 @@

 #include <thermo.h>
 #include <util.h>
+#include <mpi.h>

 static int *steparr;
 static MD_FLOAT *tmparr;
@@ -24,6 +25,7 @@ static MD_FLOAT t_act;
 static MD_FLOAT p_act;
 static MD_FLOAT e_act;
 static int mstat;
+static MPI_Datatype type = (sizeof(MD_FLOAT) == 4) ? MPI_FLOAT : MPI_DOUBLE;

 /* exported subroutines */
 void setupThermo(Parameter *param, int natoms)
@@ -53,57 +55,73 @@ void setupThermo(Parameter *param, int natoms)

 void computeThermo(int iflag, Parameter *param, Atom *atom)
 {
-    MD_FLOAT t = 0.0, p;
+    MD_FLOAT t_sum = 0.0, t = 0.0, p;
+    int me; 
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &me);
+
    for(int i = 0; i < atom->Nlocal; i++) {
        t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass;
    }

-    t = t * t_scale;
-    p = (t * dof_boltz) * p_scale;
-    int istep = iflag;
+    MPI_Reduce(&t, &t_sum, 1, type, MPI_SUM, 0 ,MPI_COMM_WORLD);
+    if(me == 0)
+    {
+        t = t_sum * t_scale;
+        p = (t * dof_boltz) * p_scale;
+        int istep = iflag;

-    if(iflag == -1){
-        istep = param->ntimes;
-    }
-    if(iflag == 0){
-        mstat = 0;
-    }
+        if(iflag == -1){
+            istep = param->ntimes;
+        }
+        if(iflag == 0){
+            mstat = 0;
+        }

-    steparr[mstat] = istep;
-    tmparr[mstat] = t;
-    prsarr[mstat] = p;
-    mstat++;
-    fprintf(stdout, "%i\t%e\t%e\n", istep, t, p);
+        steparr[mstat] = istep;
+        tmparr[mstat] = t;
+        prsarr[mstat] = p;
+        mstat++;
+        fprintf(stdout, "%i\t%e\t%e\n", istep, t, p);
+    }
 }

 void adjustThermo(Parameter *param, Atom *atom)
 {
    /* zero center-of-mass motion */
    MD_FLOAT vxtot = 0.0; MD_FLOAT vytot = 0.0; MD_FLOAT vztot = 0.0;
-
+    MD_FLOAT v_sum[3], vtot[3];  
+    
    for(int i = 0; i < atom->Nlocal; i++) {
        vxtot += atom_vx(i);
        vytot += atom_vy(i);
        vztot += atom_vz(i);
    }
+    
+    vtot[0] = vxtot; vtot[1] = vytot; vtot[2] = vztot;  

-    vxtot = vxtot / atom->Natoms;
-    vytot = vytot / atom->Natoms;
-    vztot = vztot / atom->Natoms;
+    MPI_Allreduce(vtot, v_sum, 3, type, MPI_SUM, MPI_COMM_WORLD);
+    
+    vxtot = v_sum[0] / atom->Natoms;
+    vytot = v_sum[1] / atom->Natoms;
+    vztot = v_sum[2] / atom->Natoms;

    for(int i = 0; i < atom->Nlocal; i++) {
        atom_vx(i) -= vxtot;
        atom_vy(i) -= vytot;
        atom_vz(i) -= vztot;
    }
-
-    t_act = 0;
+   
    MD_FLOAT t = 0.0;
+    MD_FLOAT t_sum = 0.0;

    for(int i = 0; i < atom->Nlocal; i++) {
        t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass;
    }

+    MPI_Allreduce(&t, &t_sum, 1,type, MPI_SUM,MPI_COMM_WORLD);
+
+    t = t_sum; 
    t *= t_scale;
    MD_FLOAT factor = sqrt(param->temp / t);

--- a/common/util.c
+++ b/common/util.c
@@ -10,6 +10,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <util.h>
+#include <math.h>

 /* Park/Miller RNG w/out MASKING, so as to be like f90s version */
 #define IA 16807
@@ -86,6 +87,7 @@ int get_cuda_num_threads() {

 void readline(char *line, FILE *fp) {
    if(fgets(line, MAXLINE, fp) == NULL) {
+        printf("error %i\n",errno);
        if(errno != 0) {
            perror("readline()");
            exit(-1);