diff --git a/Makefile b/Makefile index defffb6..55a20ea 100644 --- a/Makefile +++ b/Makefile @@ -17,6 +17,9 @@ include $(MAKE_DIR)/include_ISA.mk include $(MAKE_DIR)/include_GROMACS.mk INCLUDES += -I./$(SRC_DIR)/includes -I./$(COMMON_DIR)/includes +ifeq ($(strip $(OPT_SCHEME)),gromacs) + DEFINES += -DGROMACS +endif ifeq ($(strip $(DATA_LAYOUT)),AOS) DEFINES += -DAOS endif diff --git a/common/box.c b/common/box.c new file mode 100644 index 0000000..8a0d1c9 --- /dev/null +++ b/common/box.c @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg. + * All rights reserved. This file is part of MD-Bench. + * Use of this source code is governed by a LGPL-3.0 + * license that can be found in the LICENSE file. + */ +#include +#include +#include +#include +#include + +int overlapBox(int dim, int dir, const Box* mybox, const Box* other, Box* cut, MD_FLOAT xprd, MD_FLOAT cutneigh) +{ + int pbc = -100; + MD_FLOAT min[3], max[3]; + int same = (mybox->id == other->id) ? 1 : 0; + + //projections + min[_x] = MAX(mybox->lo[_x], other->lo[_x]); max[_x] = MIN(mybox->hi[_x], other->hi[_x]); + min[_y] = MAX(mybox->lo[_y], other->lo[_y]); max[_y] = MIN(mybox->hi[_y], other->hi[_y]); + min[_z] = MAX(mybox->lo[_z], other->lo[_z]); max[_z] = MIN(mybox->hi[_z], other->hi[_z]); + + //Intersection no periodic case + if(!same){ + if (dir == 0) max[dim] = MIN(mybox->hi[dim], other->hi[dim]+ cutneigh); + if (dir == 1) min[dim] = MAX(mybox->lo[dim], other->lo[dim]- cutneigh); + if ((min[_x]lo[dim] , other->lo[dim]- xprd); + max[dim] = MIN(mybox->hi[dim] , other->hi[dim]- xprd + cutneigh); + + } else { + min[dim] = MAX(mybox->lo[dim], other->lo[dim]+ xprd - cutneigh); + max[dim] = MIN(mybox->hi[dim], other->hi[dim]+ xprd); + + } + if((min[_x]lo[_x] = min[_x]; cut->hi[_x] = max[_x]; + cut->lo[_y] = min[_y]; cut->hi[_y] = max[_y]; + cut->lo[_z] = min[_z]; cut->hi[_z] = max[_z]; + + return pbc; +} + +int overlapFullBox(Parameter* param, MD_FLOAT *cutneigh ,const Box* mybox, const Box* other) +{ + MD_FLOAT min[3], max[3]; + MD_FLOAT xprd = param->xprd; + MD_FLOAT yprd = param->yprd; + MD_FLOAT zprd = param->zprd; + + for(int k = -1; k < 2; k++) + { + for(int j = -1; j < 2; j++) + { + for(int i= -1; i < 2; i++) + { + min[_x] = MAX(mybox->lo[_x], other->lo[_x]-cutneigh[_x] + i*xprd); + min[_y] = MAX(mybox->lo[_y], other->lo[_y]-cutneigh[_y] + j*yprd); + min[_z] = MAX(mybox->lo[_z], other->lo[_z]-cutneigh[_z] + k*zprd); + max[_x] = MIN(mybox->hi[_x], other->hi[_x]+cutneigh[_x] + i*xprd); + max[_y] = MIN(mybox->hi[_y], other->hi[_y]+cutneigh[_y] + j*yprd); + max[_z] = MIN(mybox->hi[_z], other->hi[_z]+cutneigh[_z] + k*zprd); + if ((min[_x]lo[_x] <= other->lo[_x]) cut->lo[_x] -= cutneigh; + if(me->hi[_x] >= other->hi[_x]) cut->hi[_x] += cutneigh; + } + + if(iswap==4 || iswap==5){ + if(me->lo[_x] <= other->lo[_x]) cut->lo[_x] -= cutneigh; + if(me->hi[_x] >= other->hi[_x]) cut->hi[_x] += cutneigh; + if(me->lo[_y] <= other->lo[_y]) cut->lo[_y] -= cutneigh; + if(me->hi[_y] >= other->hi[_y]) cut->hi[_y] += cutneigh; + } +} + diff --git a/common/comm.c b/common/comm.c new file mode 100644 index 0000000..169916c --- /dev/null +++ b/common/comm.c @@ -0,0 +1,556 @@ +#include +#include +#include +#include +#include +#include +#include + +#define NEIGHMIN 6 +#define BUFFACTOR 2 +#define BUFMIN 1000 +#define BUFEXTRA 100 +#define world MPI_COMM_WORLD + +MPI_Datatype type = (sizeof(MD_FLOAT) == 4) ? MPI_FLOAT : MPI_DOUBLE; +static inline void allocDynamicBuffers(Comm*); +static inline void freeDynamicBuffers(Comm*); +static inline void freeBuffers(Comm*); + +void defineReverseList(Comm* comm){ + int dim = 0; + int index = 0; + int me = comm->myproc; + + //Set the inverse list + for(int iswap = 0; iswap<6; iswap++){ + int dim = comm->swapdim[iswap]; + int dir = comm->swapdir[iswap]; + int invswap = comm->swap[dim][(dir+1)%2]; + + for(int ineigh = comm->sendfrom[invswap]; ineigh< comm->sendtill[invswap]; ineigh++) + comm->nrecv[index++] = comm->nsend[ineigh]; + + comm->recvfrom[iswap] = (iswap == 0) ? 0 : comm->recvtill[iswap-1]; + comm->recvtill[iswap] = index; + } + + //set if myproc is unique in the swap + for(int iswap = 0; iswap<6; iswap++){ + int sizeswap = comm->sendtill[iswap]-comm->sendfrom[iswap]; + int index = comm->sendfrom[iswap]; + int myneigh = comm->nsend[index]; + comm->othersend[iswap] = (sizeswap != 1 || comm->myproc != myneigh) ? 1 : 0; + } +} + +void addNeighToExchangeList(Comm* comm, int newneigh){ + + int numneigh = comm->numneighexch; + + if(comm->numneighexch>=comm->maxneighexch){ + size_t oldByteSize = comm->maxneighexch*sizeof(int); + comm->maxneighexch *=2; + comm->nexch = (int*) reallocate(comm->nexch, ALIGNMENT, comm->maxneighexch * sizeof(int), oldByteSize); + } + + // Add the new element to the list + comm->nexch[numneigh] = newneigh; + comm->numneighexch++; +} + +//Exported functions +void neighComm(Comm *comm, Parameter* param, Grid *grid) +{ + int me = comm->myproc; + int numproc = comm ->numproc; + int PAD = 6; //number of elements for processor in the map + int ineigh = 0; + int sneigh = 0; + MD_FLOAT *map = grid->map; + MD_FLOAT cutneigh = param->cutneigh; + MD_FLOAT prd[3] = {param->xprd, param->yprd, param->zprd}; + Box mybox, other, cut; + + //needed for rebalancing + freeDynamicBuffers(comm); + + //Local box + mybox.id = me; + mybox.lo[_x] = map[me*PAD+0]; mybox.hi[_x] = map[me*PAD+3]; + mybox.lo[_y] = map[me*PAD+1]; mybox.hi[_y] = map[me*PAD+4]; + mybox.lo[_z] = map[me*PAD+2]; mybox.hi[_z] = map[me*PAD+5]; + + //Check for all possible neighbours only for exchange atoms + comm->numneighexch = 0; + for(int proc = 0; proc cutneigh,&mybox,&other); + if(intersection) addNeighToExchangeList(comm,proc); + } + } + + //MAP is stored as follows: xlo,ylo,zlo,xhi,yhi,zhi + for(int iswap = 0; iswap <6; iswap++) + { + int dir = comm->swapdir[iswap]; + int dim = comm->swapdim[iswap]; + + for(int proc = 0; proc < numproc; proc++) + { + //Check for neighbours along dimmensions, for forwardComm, backwardComm and ghostComm + other.id = proc; + other.lo[_x] = map[proc*PAD+0]; other.hi[_x] = map[proc*PAD+3]; + other.lo[_y] = map[proc*PAD+1]; other.hi[_y] = map[proc*PAD+4]; + other.lo[_z] = map[proc*PAD+2]; other.hi[_z] = map[proc*PAD+5]; + + //return if two boxes intersect: -100 not intersection, 0, 1 and -1 intersection for each different pbc. + int pbc = overlapBox(dim,dir,&mybox,&other,&cut,prd[dim],cutneigh); + if(pbc == -100) continue; + + expandBox(iswap, &mybox, &other, &cut, cutneigh); + + if(ineigh >= comm->maxneigh) { + size_t oldByteSize = comm->maxneigh*sizeof(int); + size_t oldBoxSize = comm->maxneigh*sizeof(Box); + comm->maxneigh = 2*ineigh; + comm->nsend = (int*) reallocate(comm->nsend, ALIGNMENT, comm->maxneigh * sizeof(int), oldByteSize); + comm->nrecv = (int*) reallocate(comm->nrecv, ALIGNMENT, comm->maxneigh * sizeof(int), oldByteSize); + comm->pbc_x = (int*) reallocate(comm->pbc_x, ALIGNMENT, comm->maxneigh * sizeof(int), oldByteSize); + comm->pbc_y = (int*) reallocate(comm->pbc_y, ALIGNMENT, comm->maxneigh * sizeof(int), oldByteSize); + comm->pbc_z = (int*) reallocate(comm->pbc_z, ALIGNMENT, comm->maxneigh * sizeof(int), oldByteSize); + comm->boxes = (Box*) reallocate(comm->boxes, ALIGNMENT, comm->maxneigh * sizeof(Box), oldBoxSize); + } + + comm->boxes[ineigh] = cut; + comm->nsend[ineigh] = proc; + comm->pbc_x[ineigh] = (dim == _x) ? pbc : 0; + comm->pbc_y[ineigh] = (dim == _y) ? pbc : 0; + comm->pbc_z[ineigh] = (dim == _z) ? pbc : 0; + ineigh++; + } + + comm->sendfrom[iswap] = (iswap == 0) ? 0:comm->sendtill[iswap-1]; + comm->sendtill[iswap] = ineigh; + comm->numneigh = ineigh; + } + + allocDynamicBuffers(comm); + defineReverseList(comm); +} + +void initComm(int* argc, char*** argv, Comm* comm) +{ + //MPI Initialize + MPI_Init(argc, argv); + MPI_Comm_size(MPI_COMM_WORLD, &(comm->numproc)); + MPI_Comm_rank(MPI_COMM_WORLD, &(comm->myproc)); + comm->numneigh = 0; + comm->numneighexch = 0; + comm->nrecv=NULL; + comm->nsend=NULL; + comm->nexch=NULL; + comm->pbc_x=NULL; + comm->pbc_y=NULL; + comm->pbc_z=NULL; + comm->boxes=NULL; + comm->atom_send=NULL; + comm->atom_recv=NULL; + comm->off_atom_send=NULL; + comm->off_atom_recv=NULL; + comm->maxsendlist=NULL; + comm->sendlist=NULL; + comm->buf_send=NULL; + comm->buf_recv=NULL; +} + +void endComm(Comm* comm) +{ + comm->maxneigh = 0; + comm->maxneighexch =0; + comm->maxsend = 0; + comm->maxrecv = 0; + freeBuffers(comm); + MPI_Finalize(); +} + +void setupComm(Comm* comm, Parameter* param, Grid* grid){ + + comm->swap[_x][0] = 0; comm->swap[_x][1] =1; + comm->swap[_y][0] = 2; comm->swap[_y][1] =3; + comm->swap[_z][0] = 4; comm->swap[_z][1] =5; + + comm->swapdim[0] = comm->swapdim[1] = _x; + comm->swapdim[2] = comm->swapdim[3] = _y; + comm->swapdim[4] = comm->swapdim[5] = _z; + + comm->swapdir[0] = comm->swapdir[2] = comm->swapdir[4] = 0; + comm->swapdir[1] = comm->swapdir[3] = comm->swapdir[5] = 1; + + for(int i = 0; i<6; i++){ + comm->sendfrom[i] = 0; + comm->sendtill[i] = 0; + comm->recvfrom[i] = 0; + comm->recvtill[i] = 0; + } + + comm->forwardSize = FORWARD_SIZE; //send coordiantes x,y,z + comm->reverseSize = REVERSE_SIZE; //return forces fx, fy, fz + comm->ghostSize = GHOST_SIZE; //send x,y,z,type; + comm->exchangeSize = EXCHANGE_SIZE; //send x,y,z,vx,vy,vz,type + + //Allocate memory for recv buffer and recv buffer + comm->maxsend = BUFMIN; + comm->maxrecv = BUFMIN; + comm->buf_send = (MD_FLOAT*) allocate(ALIGNMENT,(comm->maxsend + BUFEXTRA) * sizeof(MD_FLOAT)); + comm->buf_recv = (MD_FLOAT*) allocate(ALIGNMENT, comm->maxrecv * sizeof(MD_FLOAT)); + + comm->maxneighexch = NEIGHMIN; + comm->nexch = (int*) allocate(ALIGNMENT, comm->maxneighexch * sizeof(int)); + + comm->maxneigh = NEIGHMIN; + comm->nsend = (int*) allocate(ALIGNMENT, comm->maxneigh * sizeof(int)); + comm->nrecv = (int*) allocate(ALIGNMENT, comm->maxneigh * sizeof(int)); + comm->pbc_x = (int*) allocate(ALIGNMENT, comm->maxneigh * sizeof(int)); + comm->pbc_y = (int*) allocate(ALIGNMENT, comm->maxneigh * sizeof(int)); + comm->pbc_z = (int*) allocate(ALIGNMENT, comm->maxneigh * sizeof(int)); + comm->boxes = (Box*) allocate(ALIGNMENT, comm->maxneigh * sizeof(Box)); + + neighComm(comm, param, grid); +} + +void forwardComm(Comm* comm, Atom* atom, int iswap) +{ + int nrqst=0, offset=0, nsend=0, nrecv=0; + int pbc[3]; + int size = comm->forwardSize; + int maxrqst = comm->numneigh; + MD_FLOAT* buf; + MPI_Request requests[maxrqst]; + + for(int ineigh = comm->sendfrom[iswap]; ineigh < comm->sendtill[iswap]; ineigh++){ + offset = comm->off_atom_send[ineigh]; + pbc[_x]=comm->pbc_x[ineigh]; pbc[_y]=comm->pbc_y[ineigh]; pbc[_z]=comm->pbc_z[ineigh]; + packForward(atom, comm->atom_send[ineigh], comm->sendlist[ineigh], &comm->buf_send[offset*size],pbc); + } + + //Receives elements + if(comm->othersend[iswap]) + for (int ineigh = comm->recvfrom[iswap]; ineigh< comm->recvtill[iswap]; ineigh++){ + offset = comm->off_atom_recv[ineigh]*size; + nrecv = comm->atom_recv[ineigh]*size; + MPI_Irecv(&comm->buf_recv[offset], nrecv, type, comm->nrecv[ineigh],0,world,&requests[nrqst++]); + } + + //Send elements + if(comm->othersend[iswap]) + for (int ineigh = comm->sendfrom[iswap]; ineigh< comm->sendtill[iswap]; ineigh++){ + offset = comm->off_atom_send[ineigh]*size; + nsend = comm->atom_send[ineigh]*size; + MPI_Send(&comm->buf_send[offset],nsend,type,comm->nsend[ineigh],0,world); + } + + if(comm->othersend[iswap]) MPI_Waitall(nrqst,requests,MPI_STATUS_IGNORE); + + if(comm->othersend[iswap]) buf = comm->buf_recv; + else buf = comm->buf_send; + + /* unpack buffer */ + for (int ineigh = comm->recvfrom[iswap]; ineigh< comm->recvtill[iswap]; ineigh++){ + offset = comm->off_atom_recv[ineigh]; + unpackForward(atom, comm->atom_recv[ineigh], comm->firstrecv[iswap] + offset, &buf[offset*size]); + } +} + +void reverseComm(Comm* comm, Atom* atom, int iswap) +{ + int nrqst=0, offset=0, nsend=0, nrecv=0 ; + int size = comm->reverseSize; + int maxrqst = comm->numneigh; + MD_FLOAT* buf; + MPI_Request requests[maxrqst]; + + for(int ineigh = comm->recvfrom[iswap]; ineigh < comm->recvtill[iswap]; ineigh++){ + offset = comm->off_atom_recv[ineigh]; + packReverse(atom, comm->atom_recv[ineigh], comm->firstrecv[iswap] + offset, &comm->buf_send[offset*size]); + } + //Receives elements + if(comm->othersend[iswap]) + for (int ineigh = comm->sendfrom[iswap]; ineigh< comm->sendtill[iswap]; ineigh++){ + offset = comm->off_atom_send[ineigh]*size; + nrecv = comm->atom_send[ineigh]*size; + MPI_Irecv(&comm->buf_recv[offset], nrecv, type, comm->nsend[ineigh],0,world,&requests[nrqst++]); + } + //Send elements + if(comm->othersend[iswap]) + for (int ineigh = comm->recvfrom[iswap]; ineigh< comm->recvtill[iswap]; ineigh++){ + offset = comm->off_atom_recv[ineigh]*size; + nsend = comm->atom_recv[ineigh]*size; + MPI_Send(&comm->buf_send[offset],nsend,type,comm->nrecv[ineigh],0,world); + } + if(comm->othersend[iswap]) MPI_Waitall(nrqst,requests,MPI_STATUS_IGNORE); + if(comm->othersend[iswap]) buf = comm->buf_recv; + else buf = comm->buf_send; + + /* unpack buffer */ + for (int ineigh = comm->sendfrom[iswap]; ineigh< comm->sendtill[iswap]; ineigh++){ + offset = comm->off_atom_send[ineigh]; + unpackReverse(atom, comm->atom_send[ineigh], comm->sendlist[ineigh], &buf[offset*size]); + } +} + +void ghostComm(Comm* comm, Atom* atom,int iswap){ + + MD_FLOAT xlo=0, xhi=0, ylo=0, yhi=0, zlo=0, zhi=0; + MD_FLOAT* buf; + int nrqst=0, nsend=0, nrecv=0, offset=0, ineigh=0, pbc[3]; + int all_recv=0, all_send=0, currentSend=0; + int size = comm->ghostSize; + int maxrqrst = comm->numneigh; + MPI_Request requests[maxrqrst]; + for(int i = 0; iiterAtom = LOCAL+GHOST; + int iter = 0; + for(int ineigh = comm->sendfrom[iswap]; ineigh< comm->sendtill[iswap]; ineigh++) + { + Box* tile = &comm->boxes[ineigh]; + + xlo = tile->lo[_x]; ylo = tile->lo[_y]; zlo = tile->lo[_z]; + xhi = tile->hi[_x]; yhi = tile->hi[_y]; zhi = tile->hi[_z]; + pbc[_x]=comm->pbc_x[ineigh]; pbc[_y]=comm->pbc_y[ineigh]; pbc[_z]=comm->pbc_z[ineigh]; + nsend = 0; + + for(int i = 0; i < comm->iterAtom ; i++) + { + if(IsinRegionToSend(i)){ + if(nsend >= comm->maxsendlist[ineigh]) growList(comm,ineigh,nsend); + if(currentSend + size >= comm->maxsend) growSend(comm,currentSend); + comm->sendlist[ineigh][nsend++] = i; + currentSend += packGhost(atom, i, &comm->buf_send[currentSend], pbc); + } + } + comm->atom_send[ineigh] = nsend; //#atoms send per neigh + comm->off_atom_send[ineigh] = all_send; //offset atom respect to neighbours in a swap + all_send += nsend; //all atoms send + } + //Receives how many elements to be received. + if(comm->othersend[iswap]) + for(nrqst=0, ineigh = comm->recvfrom[iswap]; ineigh< comm->recvtill[iswap]; ineigh++) + MPI_Irecv(&comm->atom_recv[ineigh],1,MPI_INT,comm->nrecv[ineigh],0,world,&requests[nrqst++]); + + if(!comm->othersend[iswap]) comm->atom_recv[comm->recvfrom[iswap]] = nsend; + + //Communicate how many elements to be sent. + if(comm->othersend[iswap]) + for(int ineigh = comm->sendfrom[iswap]; ineigh< comm->sendtill[iswap]; ineigh++) + MPI_Send(&comm->atom_send[ineigh],1,MPI_INT,comm->nsend[ineigh],0,world); + if(comm->othersend[iswap]) MPI_Waitall(nrqst,requests,MPI_STATUS_IGNORE); + + //Define offset to store in the recv_buff + for(int ineigh = comm->recvfrom[iswap]; ineighrecvtill[iswap]; ineigh++){ + comm->off_atom_recv[ineigh] = all_recv; + all_recv += comm->atom_recv[ineigh]; + } + + if(all_recv*size>=comm->maxrecv) growRecv(comm,all_recv*size); + + //Receives elements + if(comm->othersend[iswap]) + for (nrqst=0, ineigh = comm->recvfrom[iswap]; ineigh< comm->recvtill[iswap]; ineigh++){ + offset = comm->off_atom_recv[ineigh]*size; + nrecv = comm->atom_recv[ineigh]*size; + MPI_Irecv(&comm->buf_recv[offset], nrecv, type, comm->nrecv[ineigh],0,world,&requests[nrqst++]); + } + //Send elements + if(comm->othersend[iswap]) + for (int ineigh = comm->sendfrom[iswap]; ineigh< comm->sendtill[iswap]; ineigh++){ + offset = comm->off_atom_send[ineigh]*size; + nsend = comm->atom_send[ineigh]*size; + MPI_Send(&comm->buf_send[offset],nsend,type,comm->nsend[ineigh],0,world); + } + if(comm->othersend[iswap]) MPI_Waitall(nrqst,requests,MPI_STATUS_IGNORE); + + if(comm->othersend[iswap]) buf = comm->buf_recv; + else buf = comm->buf_send; + //unpack elements + comm->firstrecv[iswap] = LOCAL+GHOST; + for(int i = 0; i < all_recv; i++) + unpackGhost(atom, LOCAL+GHOST, &buf[i*size]); + + //Increases the buffer if needed + int max_size = MAX(comm->forwardSize,comm->reverseSize); + int max_buf = max_size * MAX(all_recv, all_send); + if(max_buf>=comm->maxrecv) growRecv(comm,max_buf); + if(max_buf>=comm->maxsend) growSend(comm,max_buf); +} + +void exchangeComm(Comm* comm, Atom* atom){ + + MD_FLOAT x,y,z; + MD_FLOAT *lo = atom->mybox.lo; + MD_FLOAT *hi = atom->mybox.hi; + int size = comm->exchangeSize; + int numneigh = comm->numneighexch; + int offset_recv[numneigh]; + int size_recv[numneigh]; + MPI_Request requests[numneigh]; + int i =0, nsend = 0, nrecv = 0; + int nrqst = 0; + int nlocal, offset,m; + + /* enforce PBC */ + pbc(atom); + + if(comm->numneigh == 0) return; + + nlocal = atom->Nlocal; + while(i < nlocal) { + if(atom_x(i) < lo[_x] || atom_x(i) >= hi[_x] || + atom_y(i) < lo[_y] || atom_y(i) >= hi[_y] || + atom_z(i) < lo[_z] || atom_z(i) >= hi[_z]) { + if(nsend+size >= comm->maxsend) growSend(comm, nsend); + nsend += packExchange(atom, i, &comm->buf_send[nsend]); + copy(atom, i, nlocal-1); + nlocal--; + } else i++; + } + atom->Nlocal = nlocal; + + /* send/recv number of to share atoms with neighbouring procs*/ + for(int ineigh = 0; ineigh < numneigh; ineigh++) + MPI_Irecv(&size_recv[ineigh],1,MPI_INT,comm->nexch[ineigh],0,world,&requests[nrqst++]); + + for (int ineigh = 0; ineigh < numneigh; ineigh++) + MPI_Send(&nsend,1,MPI_INT,comm->nexch[ineigh],0,world); + MPI_Waitall(nrqst,requests,MPI_STATUS_IGNORE); + + //Define offset to store in the recv_buff + for(int ineigh = 0; ineigh= comm->maxrecv) growRecv(comm,nrecv); + + //Receives elements + nrqst=0; + for (int ineigh = 0; ineigh< numneigh; ineigh++){ + offset = offset_recv[ineigh]; + MPI_Irecv(&comm->buf_recv[offset], size_recv[ineigh], type, comm->nexch[ineigh],0,world,&requests[nrqst++]); + } + //Send elements + for (int ineigh = 0; ineigh< numneigh; ineigh++) + MPI_Send(comm->buf_send,nsend,type,comm->nexch[ineigh],0,world); + MPI_Waitall(nrqst,requests,MPI_STATUS_IGNORE); + + nlocal = atom->Nlocal; + m = 0; + while(m < nrecv) { + x = comm->buf_recv[m + _x]; + y = comm->buf_recv[m + _y]; + z = comm->buf_recv[m + _z]; + + if(x >= lo[_x] && x < hi[_x] && + y >= lo[_y] && y < hi[_y] && + z >= lo[_z] && z < hi[_z]){ + m += unpackExchange(atom, nlocal++, &comm->buf_recv[m]); + } else { + m += size; + } + } + atom->Nlocal = nlocal; + + int all_atoms=0; + MPI_Allreduce(&atom->Nlocal, &all_atoms, 1, MPI_INT, MPI_SUM, world); + if(atom->Natoms!=all_atoms && comm->myproc ==0){ + printf("Losing atoms! current atoms:%d expected atoms:%d\n",all_atoms,atom->Natoms); + } +} + +//Internal functions + +inline void growRecv(Comm* comm, int n) +{ + comm -> maxrecv = BUFFACTOR * n; + if(comm->buf_recv) free(comm -> buf_recv); + comm -> buf_recv = (MD_FLOAT*) allocate(ALIGNMENT, comm->maxrecv * sizeof(MD_FLOAT)); +} + +inline void growSend(Comm* comm, int n) +{ + size_t oldByteSize = (comm->maxsend+BUFEXTRA)*sizeof(MD_FLOAT); + comm -> maxsend = BUFFACTOR * n; + comm -> buf_send = (MD_FLOAT*) reallocate(comm->buf_send, ALIGNMENT, (comm->maxsend + BUFEXTRA) * sizeof(MD_FLOAT), oldByteSize); +} + +inline void growList(Comm* comm, int ineigh, int n) +{ + size_t oldByteSize = comm->maxsendlist[ineigh]*sizeof(int); + comm->maxsendlist[ineigh] = BUFFACTOR * n; + comm->sendlist[ineigh] = (int*) reallocate(comm->sendlist[ineigh],ALIGNMENT, comm->maxsendlist[ineigh] * sizeof(int), oldByteSize); +} + +static inline void allocDynamicBuffers(Comm* comm) +{ + //Buffers depending on the # of my neighs + int numneigh = comm->numneigh; + comm->atom_send = (int*) allocate(ALIGNMENT, numneigh * sizeof(int)); + comm->atom_recv = (int*) allocate(ALIGNMENT, numneigh * sizeof(int)); + comm->off_atom_send = (int*) allocate(ALIGNMENT,numneigh * sizeof(int)); + comm->off_atom_recv = (int*) allocate(ALIGNMENT,numneigh * sizeof(int)); + comm->maxsendlist = (int*) allocate(ALIGNMENT,numneigh * sizeof(int)); + + for(int i = 0; i < numneigh; i++) + comm->maxsendlist[i] = BUFMIN; + + comm->sendlist = (int**) allocate(ALIGNMENT, numneigh * sizeof(int*)); + for(int i = 0; i < numneigh; i++) + comm->sendlist[i] = (int*) allocate(ALIGNMENT, comm->maxsendlist[i] * sizeof(int)); +} + +static inline void freeDynamicBuffers(Comm* comm) +{ + int numneigh =comm->numneigh; + + if(comm->atom_send) free(comm->atom_send); + if(comm->atom_recv) free(comm->atom_recv); + if(comm->off_atom_send) free(comm->off_atom_send); + if(comm->off_atom_recv) free(comm->off_atom_recv); + if(comm->maxsendlist) free(comm->maxsendlist); + if(comm->sendlist){ + for(int i = 0; i < numneigh; i++) + if(comm->sendlist[i]) free(comm->sendlist[i]); + } + if(comm->sendlist) free(comm->sendlist); +} + +static inline void freeBuffers(Comm* comm) +{ + if(comm->nrecv) free(comm->nrecv); + if(comm->nsend) free(comm->nsend); + if(comm->nexch) free(comm->nexch); + if(comm->pbc_x) free(comm->pbc_x); + if(comm->pbc_y) free(comm->pbc_y); + if(comm->pbc_z) free(comm->pbc_z); + if(comm->boxes) free(comm->boxes); + if(comm->atom_send) free(comm->atom_send); + if(comm->atom_recv) free(comm->atom_recv); + if(comm->off_atom_send) free(comm->off_atom_send); + if(comm->off_atom_recv) free(comm->off_atom_recv); + if(comm->maxsendlist) free(comm->maxsendlist); + + if(comm->sendlist){ + for(int i = 0; i < comm->numneigh; i++) + if(comm->sendlist[i]) free(comm->sendlist[i]); + } + if(comm->sendlist) free(comm->sendlist); + + if(comm->buf_send) free(comm->buf_send); + if(comm->buf_recv) free(comm->buf_recv); +} diff --git a/common/grid.c b/common/grid.c new file mode 100644 index 0000000..f6672c8 --- /dev/null +++ b/common/grid.c @@ -0,0 +1,490 @@ +#include +#include +#include +#include +#include +#include +#include + +static MPI_Datatype type = (sizeof(MD_FLOAT) == 4) ? MPI_FLOAT : MPI_DOUBLE; + +//Grommacs Balancing +MD_FLOAT f_normalization(MD_FLOAT* x,MD_FLOAT* fx, MD_FLOAT minx, int nprocs) { + + MD_FLOAT sum=0; + for(int n = 0; n= tolerance) { + converged = 0; + break; + } + } + + for (int n=0; ncoord; + int *nprocs = grid ->nprocs; + //Elapsed time since the last rebalance + double time = newTime - grid->Timer; + grid->Timer = newTime; + //store the older dimm to compare later for exchange + MD_FLOAT lo[3], hi[3]; + for(int dim = 0; dim< 3; dim++){ + lo[dim] = atom->mybox.lo[dim]; + hi[dim] = atom->mybox.hi[dim]; + } + + //Define parameters + MPI_Comm subComm[3]; + int color[3] = {0,0,0}; + int id[3] = {0,0,0}; + MD_FLOAT ** load = (MD_FLOAT**) malloc(3*sizeof(MD_FLOAT*)); + for(int dim = 0; dim<3; dim++) + load[dim] = (MD_FLOAT*) malloc(nprocs[dim]*sizeof(MD_FLOAT)); + + int maxprocs = MAX(MAX(nprocs[_x],nprocs[_y]),nprocs[_z]); + MD_FLOAT* cellSize = (MD_FLOAT*) malloc(maxprocs*sizeof(MD_FLOAT)); + MD_FLOAT* limits = (MD_FLOAT*) malloc(2*maxprocs*sizeof(MD_FLOAT)); //limits: (x0, x1), (x1, x2)... Repeat values in between to perfom MPI_Scatter later + MD_FLOAT t_sum[3] = {0,0,0}; + MD_FLOAT recv_buf[2] = {0,0}; //Each proc only receives 2 elments per dimension xlo and xhi + MD_FLOAT balancedLoad[3] = {0,0,0}; //1/nprocs + MD_FLOAT minLoad[3] = {0,0,0}; //beta*(1/nprocs) + MD_FLOAT prd[3] = {param->xprd, param->yprd, param->zprd}; + MD_FLOAT boundaries[6] ={0,0,0,0,0,0}; // xlo,xhi,ylo,yhi,zlo,zhi + + //Create sub-communications along each dimension + for(int dim = 0; dim<3; dim++){ + if(dim == _x){ + color[_x] = (coord[_y] == 0 && coord[_z] ==0) ? 1:MPI_UNDEFINED; + id[_x] = me; + } else if(dim == _y) { + color[_y] = coord[_z] == 0 ? coord[_x]:MPI_UNDEFINED; + id[_y] = (coord[_y] == 0 && coord[_z] == 0) ? 0:me; + } else { + color[_z]= coord[_y]*nprocs[_x]+coord[_x]; + id[_z] = coord[_z] == 0 ? 0 : me; + } + MPI_Comm_split(world, color[dim], id[dim], &subComm[dim]); + } + + //Set the minimum load and the balance load + for(int dim = 0; dim<3; dim++){ + balancedLoad[dim] = 1./nprocs[dim]; + minLoad[dim] = 0.8*balancedLoad[dim]; + } + //set and communicate the workload in reverse order + for(int dim = _z; dim>= _x; dim--) + { + if(subComm[dim] != MPI_COMM_NULL){ + MPI_Gather(&time,1,type,load[dim],1,type,0,subComm[dim]); + + if(id[dim] == 0) + { + for(int n=0; nmybox.lo[_x]=boundaries[0]; atom->mybox.hi[_x]=boundaries[1]; + atom->mybox.lo[_y]=boundaries[2]; atom->mybox.hi[_y]=boundaries[3]; + atom->mybox.lo[_z]=boundaries[4]; atom->mybox.hi[_z]=boundaries[5]; + + MD_FLOAT domain[6] = {boundaries[0], boundaries[2], boundaries[4], boundaries[1], boundaries[3], boundaries[5]}; + MPI_Allgather(domain, 6, type, grid->map, 6, type, world); + + //because cells change dynamically, It is required to increase the neighbouring exchange region + for(int dim =_x; dim<=_z; dim++){ + MD_FLOAT dr,dr_max; + int n = grid->nprocs[dim]; + MD_FLOAT maxdelta = 0.2*prd[dim]; + dr = MAX(fabs(lo[dim] - atom->mybox.lo[dim]),fabs(hi[dim] - atom->mybox.hi[dim])); + MPI_Allreduce(&dr, &dr_max, 1, type, MPI_MAX, world); + grid->cutneigh[dim] = param->cutneigh+dr_max; + } + + for(int dim=0; dim<3; dim++) { + if(subComm[dim] != MPI_COMM_NULL){ + MPI_Comm_free(&subComm[dim]); + } + free(load[dim]); + } + free(load); + free(limits); +} + +//RCB Balancing +MD_FLOAT meanTimeBisect(Atom *atom, MPI_Comm subComm, int dim, double time) +{ + MD_FLOAT mean=0, sum=0, total_sum=0, weightAtoms= 0, total_weight=0; + + for(int i=0; iNlocal; i++){ + sum += atom_pos(i); + } + sum*=time; + weightAtoms = atom->Nlocal*time; + MPI_Allreduce(&sum, &total_sum, 1, type, MPI_SUM, subComm); + MPI_Allreduce(&weightAtoms, &total_weight, 1, type, MPI_SUM, subComm); + + mean = total_sum/total_weight; + return mean; +} + +MD_FLOAT meanBisect(Atom* atom, MPI_Comm subComm, int dim, double time) +{ + int Natoms = 0; + MD_FLOAT sum=0, mean=0, total_sum=0; + + for(int i=0; iNlocal; i++){ + sum += atom_pos(i); + } + MPI_Allreduce(&sum, &total_sum, 1, type, MPI_SUM, subComm); + MPI_Allreduce(&atom->Nlocal, &Natoms, 1, MPI_INT, MPI_SUM, subComm); + mean = total_sum/Natoms; + return mean; +} + +void nextBisectionLevel(Grid* grid, Atom* atom, RCB_Method method, MPI_Comm subComm, int dim ,int* color, int ilevel, double time) +{ + int rank, size; + int branch = 0, i = 0, m = 0; + int nsend = 0, nrecv = 0, nrecv2 = 0; + int values_per_atom = 7; + MD_FLOAT bisection, pos; + MPI_Request request[2] = {MPI_REQUEST_NULL,MPI_REQUEST_NULL}; + MPI_Comm_rank(subComm,&rank); + MPI_Comm_size(subComm,&size); + + int odd = size%2; + int extraProc = odd ? size-1:size; + int half = (int) (0.5*size); + int partner = (rankmybox.hi[dim] = bisection; + branch = 0; + } else { + atom->mybox.lo[dim] = bisection; + branch = 1; + } + //Define new color for the further communicaton + *color = (branch << ilevel) | *color; + //Grow the send buffer + if(atom->Nlocal>=grid->maxsend){ + if(grid->buf_send) free(grid->buf_send); + grid->buf_send = (MD_FLOAT*) malloc(atom->Nlocal*values_per_atom* sizeof(MD_FLOAT)); + grid->maxsend = atom->Nlocal; + } + //buffer particles to send + while(i < atom->Nlocal) { + pos = atom_pos(i); + if(pos < atom->mybox.lo[dim] || pos >= atom->mybox.hi[dim]) { + nsend += packExchange(atom, i, &grid->buf_send[nsend]); + copy(atom, i, atom->Nlocal-1); + atom->Nlocal--; + } else i++; + } + + //Communicate the number of elements to be sent + if(rank < extraProc){ + MPI_Irecv(&nrecv,1,MPI_INT,partner,0,subComm,&request[0]); + } + if(odd && rank == 0){ + MPI_Irecv(&nrecv2,1,MPI_INT,extraProc,0,subComm,&request[1]); + } + MPI_Send(&nsend,1,MPI_INT,partner,0,subComm); + MPI_Waitall(2,request,MPI_STATUS_IGNORE); + + //Grow the recv buffer + if(nrecv+nrecv2>=grid->maxrecv){ + if(grid->buf_recv) free(grid->buf_recv); + grid->buf_recv = (MD_FLOAT*) malloc((nrecv+nrecv2)*values_per_atom*sizeof(MD_FLOAT)); + grid->maxrecv = nrecv+nrecv2; + } + + //communicate elements in the buffer + request[0] = MPI_REQUEST_NULL; + request[1] = MPI_REQUEST_NULL; + + if(rank < extraProc){ + MPI_Irecv(grid->buf_recv,nrecv,type,partner,0,subComm,&request[0]); + } + if(odd && rank == 0){ + MPI_Irecv(&grid->buf_recv[nrecv],nrecv2,type,extraProc,0,subComm,&request[1]); + } + MPI_Send (grid->buf_send,nsend,type,partner,0,subComm); + MPI_Waitall(2,request,MPI_STATUS_IGNORE); + + //store atoms in atom list + while(m < nrecv+nrecv2){ + m += unpackExchange(atom, atom->Nlocal++, &grid->buf_recv[m]); + } +} + +void rcbBalance(Grid* grid, Atom* atom, Parameter* param, RCB_Method method, int ndim, double newTime) +{ + int me, nprocs=0, ilevel=0, nboxes=1; + int color = 0, size =0; + int index, prd[3]; + MPI_Comm subComm; + MPI_Comm_size(world, &nprocs); + MPI_Comm_rank(world, &me); + + //set the elapsed time since the last dynamic balance + double time = newTime - grid->Timer; + + prd[_x] = atom->mybox.xprd = param->xprd; + prd[_y] = atom->mybox.yprd = param->yprd; + prd[_z] = atom->mybox.zprd = param->zprd; + + //Sort by larger dimension + int largerDim[3] ={_x, _y, _z}; + + for(int i = 0; i< 2; i++){ + for(int j = i+1; j<3; j++) + { + if(prd[largerDim[j]]>prd[largerDim[i]]){ + MD_FLOAT tmp = largerDim[j]; + largerDim[j] = largerDim[i]; + largerDim[i] = tmp; + } + } + } + //Initial Partition + atom->mybox.lo[_x] = 0; atom->mybox.hi[_x] = atom->mybox.xprd; + atom->mybox.lo[_y] = 0; atom->mybox.hi[_y] = atom->mybox.yprd; + atom->mybox.lo[_z] = 0; atom->mybox.hi[_z] = atom->mybox.zprd; + + //Recursion tree + while(nboxes 1){ + nextBisectionLevel(grid, atom, method, subComm, largerDim[index], &color, ilevel, time); + } + MPI_Comm_free(&subComm); + nboxes = pow(2,++ilevel); + } + //Set the new timer grid + grid->Timer = newTime; + + //Creating the global map + MD_FLOAT domain[6] = {atom->mybox.lo[_x], atom->mybox.lo[_y], atom->mybox.lo[_z], atom->mybox.hi[_x], atom->mybox.hi[_y], atom->mybox.hi[_z]}; + MPI_Allgather(domain, 6, type, grid->map, 6, type, world); + + //Define the same cutneighbour in all dimensions for the exchange communication + for(int dim =_x; dim<=_z; dim++) + grid->cutneigh[dim] = param->cutneigh; +} + +//Regular grid +void cartisian3d(Grid* grid, Parameter* param, Box* box) +{ + int me, nproc; + MPI_Comm_size(MPI_COMM_WORLD, &nproc); + MPI_Comm_rank(MPI_COMM_WORLD, &me); + + int numdim=3; + int reorder=0; + int periods[3]={1,1,1}; + int mycoord[3]={0,0,0}; + int griddim[3]={0,0,0}; + MD_FLOAT len[3]; + MPI_Comm cartesian; + + box->xprd = param->xprd; + box->yprd = param->yprd; + box->zprd = param->zprd; + + //Creates a cartesian 3d grid + MPI_Dims_create(nproc, numdim, griddim); + MPI_Cart_create(world,numdim,griddim,periods,reorder,&cartesian); + grid->nprocs[_x] = griddim[_x]; + grid->nprocs[_y] = griddim[_y]; + grid->nprocs[_z] = griddim[_z]; + + //Coordinates position in the grid + MPI_Cart_coords(cartesian,me,3,mycoord); + grid->coord[_x] = mycoord[_x]; + grid->coord[_y] = mycoord[_y]; + grid->coord[_z] = mycoord[_z]; + + //boundaries of my local box, with origin in (0,0,0). + len[_x] = param->xprd / griddim[_x]; + len[_y] = param->yprd / griddim[_y]; + len[_z] = param->zprd / griddim[_z]; + + box->lo[_x] = mycoord[_x] * len[_x]; + box->hi[_x] = (mycoord[_x] + 1) * len[_x]; + box->lo[_y] = mycoord[_y] * len[_y]; + box->hi[_y] = (mycoord[_y] + 1) * len[_y]; + box->lo[_z] = mycoord[_z] * len[_z]; + box->hi[_z] = (mycoord[_z] + 1) * len[_z]; + + MD_FLOAT domain[6] = {box->lo[_x], box->lo[_y], box->lo[_z], box->hi[_x], box->hi[_y], box->hi[_z]}; + MPI_Allgather(domain, 6, type, grid->map, 6, type, world); + MPI_Comm_free(&cartesian); + + //Define the same cutneighbour in all dimensions for the exchange communication + for(int dim =_x; dim<=_z; dim++) + grid->cutneigh[dim] = param->cutneigh; +} + +//Other Functions from the grid +void initGrid(Grid* grid) +{ //start with regular grid + int nprocs; + MPI_Comm_size(world, &nprocs); + grid->map_size = 6 * nprocs; + grid->map = (MD_FLOAT*) allocate(ALIGNMENT, grid->map_size * sizeof(MD_FLOAT)); + //========rcb======= + grid->maxsend = 0; + grid->maxrecv = 0; + grid->buf_send = NULL; + grid->buf_recv = NULL; + //====staggered===== + grid->Timer = 0.; +} + +void setupGrid(Grid* grid, Atom* atom, Parameter* param) +{ + int me; + MD_FLOAT xlo, ylo, zlo, xhi, yhi, zhi; + MPI_Comm_rank(MPI_COMM_WORLD, &me); + initGrid(grid); + + //Set the origin at (0,0,0) + if(param->input_file){ + for(int i=0; iNlocal; i++){ + atom_x(i) = atom_x(i) - param->xlo; + atom_y(i) = atom_y(i) - param->ylo; + atom_z(i) = atom_z(i) - param->zlo; + } + } + + cartisian3d(grid, param, &atom->mybox); + + xlo = atom->mybox.lo[_x]; xhi = atom->mybox.hi[_x]; + ylo = atom->mybox.lo[_y]; yhi = atom->mybox.hi[_y]; + zlo = atom->mybox.lo[_z]; zhi = atom->mybox.hi[_z]; + + int i = 0; + while(i < atom->Nlocal) + { + if(atom_x(i) >= xlo && atom_x(i)< xhi && + atom_y(i) >= ylo && atom_y(i)< yhi && + atom_z(i) >= zlo && atom_z(i)< zhi) + { + i++; + } else { + copy(atom, i, atom->Nlocal-1); + atom->Nlocal--; + } + } + + //printGrid(grid); + if(!param->balance){ + MPI_Allreduce(&atom->Nlocal, &atom->Natoms, 1, MPI_INT, MPI_SUM, world); + printf("Processor:%i, Local atoms:%i, Total atoms:%i\n",me, atom->Nlocal,atom->Natoms); + MPI_Barrier(world); + } +} + +void printGrid(Grid* grid) +{ + int me, nprocs; + MPI_Comm_size(world, &nprocs); + MPI_Comm_rank(world, &me); + MD_FLOAT* map = grid->map; + if(me==0) + { + + printf("GRID:\n"); + printf("===================================================================================================\n"); + for(int i=0; i + +#ifndef __BOX_H_ +#define __BOX_H_ + +typedef struct { + int id; + MD_FLOAT xprd, yprd, zprd; //Domain Dimension + MD_FLOAT lo[3]; //smallest coordinate of my subdomain + MD_FLOAT hi[3]; //Highest coordinate of my subdomain +} Box; + +int overlapBox(int, int , const Box*, const Box* , Box* , MD_FLOAT , MD_FLOAT); +int overlapFullBox(Parameter*, MD_FLOAT*, const Box*, const Box*); +void expandBox(int , const Box*, const Box* , Box* , MD_FLOAT); +#endif diff --git a/common/includes/comm.h b/common/includes/comm.h new file mode 100644 index 0000000..eb58630 --- /dev/null +++ b/common/includes/comm.h @@ -0,0 +1,104 @@ +#include +#include +#include +#include + +#ifndef COMM_H +#define COMM_H + +#ifdef GROMACS +#define FORWARD_SIZE (3*CLUSTER_N) +#define REVERSE_SIZE (3*CLUSTER_N) +#define GHOST_SIZE (4*CLUSTER_N+10) +#define EXCHANGE_SIZE 7 + +#define JFAC MAX(1, CLUSTER_N / CLUSTER_M) +#define LOCAL atom->Nclusters_local / JFAC +#define GHOST atom->Nclusters_ghost + +#define IsinRegionToSend(cj) \ + ((atom->jclusters[(cj)].bbminx >= xlo || atom->jclusters[(cj)].bbmaxx >= xlo) && \ + (atom->jclusters[(cj)].bbminx < xhi || atom->jclusters[(cj)].bbmaxx < xhi) && \ + (atom->jclusters[(cj)].bbminy >= ylo || atom->jclusters[(cj)].bbmaxy >= ylo) && \ + (atom->jclusters[(cj)].bbminy < yhi || atom->jclusters[(cj)].bbmaxy < yhi) && \ + (atom->jclusters[(cj)].bbminz >= zlo || atom->jclusters[(cj)].bbmaxz >= zlo) && \ + (atom->jclusters[(cj)].bbminz < zhi || atom->jclusters[(cj)].bbmaxz < zhi)) + +#else + +#define FORWARD_SIZE 3 +#define REVERSE_SIZE 3 +#define GHOST_SIZE 4 +#define EXCHANGE_SIZE 7 +#define LOCAL atom->Nlocal +#define GHOST atom->Nghost + +#define IsinRegionToSend(i) \ + ((atom_x((i)) >= xlo && atom_x((i)) < xhi) && \ + (atom_y((i)) >= ylo && atom_y((i)) < yhi) && \ + (atom_z((i)) >= zlo && atom_z((i)) < zhi)) + +#endif + +typedef struct { + int myproc; // my proc ID + int numproc; // # of processors + + int numneigh; // # of all my neighs along all swaps + int maxneigh; // Buffer size for my neighs + int sendfrom[6]; //return the lowest neigh index to send in each swap + int sendtill[6]; //return the highest neigh index to send in each swao + int recvfrom[6]; //return the lowest neigh index to recv in each swap + int recvtill[6]; //return the highest neigh index to recv in each swap + int* nsend; // neigh whose I want to send + int* nrecv; // neigh whose I want to recv + + int* pbc_x; // if pbc in x + int* pbc_y; // if pbc in y + int* pbc_z; // if pbc in z + + int* atom_send, *atom_recv; // # of atoms to send/recv for each of my neighs + int* off_atom_send; // atom offset to send, inside of a swap + int* off_atom_recv; // atom offset to recv, inside of a swap + + int* nexch; //procs to exchange + int numneighexch; //# of neighbours to exchange + int maxneighexch; //max buff size to store neighbours + + int numswap; // # of swaps to perform, it is 6 + int swapdim[6]; // dimension of the swap (_x, _y or _z) + int swapdir[6]; // direction of the swap 0 or 1 + int swap[3][2]; // given a dim and dir, knows the swap + int othersend[6]; // Determine if a proc interact with more procs in a given swap + + int firstrecv[6]; // where to put 1st recv atom in each swap + int** sendlist; // list of atoms to send in each swap + int* maxsendlist; // max # of atoms send in each list-swap + + int maxsend; // max elements in buff sender + int maxrecv; // max elements in buff receiver + MD_FLOAT* buf_send; // sender buffer for all comm + MD_FLOAT* buf_recv; // receicer buffer for all comm + + int forwardSize; // # of paramaters per atom in forward comm. + int reverseSize; // # of parameters per atom in reverse + int exchangeSize; // # of parameters per atom in exchange + int ghostSize; // # of parameters per atom in ghost list + + int iterAtom; //last atom to iterate in each swap. + Box* boxes; // Boundaries to be sent to other procs as ghost. +} Comm; + + +void initComm(int*, char***, Comm*); //Init MPI +void endComm(Comm*); //End MPI +void setupComm(Comm*,Parameter*,Grid*); //Creates a 3d grid or rcb grid +void neighComm(Comm*,Parameter*,Grid*); //Find neighbours within cut-off and defines ghost regions +void forwardComm(Comm*,Atom*,int); //Send info in one direction +void reverseComm(Comm*,Atom*,int); //Return info after forward communication +void exchangeComm(Comm*,Atom*); //Exchange info between procs +void ghostComm(Comm*, Atom*,int); //Build the ghost neighbours to send during next forwards +void growSend(Comm*,int); //Grows the size of the buffer sender +void growRecv(Comm*,int); //Grows the size of the buffer receiver +void growList(Comm*, int, int); //Grows the size of the list to send +#endif \ No newline at end of file diff --git a/common/includes/grid.h b/common/includes/grid.h new file mode 100644 index 0000000..0cba637 --- /dev/null +++ b/common/includes/grid.h @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg. + * All rights reserved. This file is part of MD-Bench. + * Use of this source code is governed by a LGPL-3.0 + * license that can be found in the LICENSE file. + */ + + +#include +#include +#include +#include + +#ifndef __MAP_H_ +#define __MAP_H_ + +#define world MPI_COMM_WORLD +#define atom_pos(i) ((dim == _x) ? atom_x((i)) : (dim == _y) ? atom_y((i)) : atom_z((i))) + +enum {RCB=1, meanTimeRCB, Staggered}; + +typedef struct { + int balance_every; + int map_size; + MD_FLOAT* map; + //===Param for Staggerd balance + int nprocs[3]; + int coord[3]; + MD_FLOAT cutneigh[3]; + double Timer; + //===Param for RCB balance + MD_FLOAT* buf_send; + MD_FLOAT* buf_recv; + int maxsend; + int maxrecv; +} Grid; + + +typedef MD_FLOAT(*RCB_Method)(Atom*,MPI_Comm,int,double); + +void setupGrid(Grid*, Atom*, Parameter*); +void cartisian3d(Grid*, Parameter*, Box*); +void rcbBalance(Grid*, Atom*, Parameter* ,RCB_Method, int, double); +void staggeredBalance(Grid*, Atom*, Parameter*, double); +void printGrid(Grid*); +//rcb methods +MD_FLOAT meanBisect(Atom* , MPI_Comm, int, double); +MD_FLOAT meanTimeBisect(Atom*, MPI_Comm, int, double); +#endif + + diff --git a/common/includes/parameter.h b/common/includes/parameter.h index 614be9f..7c1ae1e 100644 --- a/common/includes/parameter.h +++ b/common/includes/parameter.h @@ -53,6 +53,10 @@ typedef struct { MD_FLOAT k_dn; MD_FLOAT gx, gy, gz; MD_FLOAT reflect_x, reflect_y, reflect_z; + //MPI implementation + int balance; + int method; + int balance_every; } Parameter; void initParameter(Parameter*); diff --git a/common/includes/shell_methods.h b/common/includes/shell_methods.h new file mode 100644 index 0000000..4d1235b --- /dev/null +++ b/common/includes/shell_methods.h @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg. + * All rights reserved. This file is part of MD-Bench. + * Use of this source code is governed by a LGPL-3.0 + * license that can be found in the LICENSE file. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +//static void addDummyCluster(Atom*); + +double forward(Comm* comm, Atom *atom, Parameter* param){ + double S, E; + S = getTimeStamp(); + if(param->method == halfShell){ + for(int iswap = 0; iswap < 5; iswap++) + forwardComm(comm, atom, iswap); + } else if(param->method == eightShell){ + for(int iswap = 0; iswap < 6; iswap+=2) + forwardComm(comm, atom, iswap); + } else { + for(int iswap = 0; iswap < 6; iswap++) + forwardComm(comm, atom, iswap); + } + E = getTimeStamp(); + return E-S; +} + +double reverse(Comm* comm, Atom *atom, Parameter* param){ + double S, E; + S = getTimeStamp(); + if(param->method == halfShell){ + for(int iswap = 4; iswap >= 0; iswap--) + reverseComm(comm, atom, iswap); + } else if(param->method == eightShell){ + for(int iswap = 4; iswap >= 0; iswap-=2) + reverseComm(comm, atom, iswap); + } else if(param->method == halfStencil){ + for(int iswap = 5; iswap >= 0; iswap--) + reverseComm(comm, atom, iswap); + } else { } //Full Shell Reverse does nothing + E = getTimeStamp(); + return E-S; +} + +void ghostNeighbor(Comm* comm, Atom* atom, Parameter* param) +{ + #ifdef GROMACS + atom->Nclusters_ghost = 0; + #endif + atom->Nghost = 0; + if(param->method == halfShell){ + for(int iswap=0; iswap<5; iswap++) + ghostComm(comm,atom,iswap); + } else if(param->method == eightShell){ + for(int iswap = 0; iswap<6; iswap+=2) + ghostComm(comm, atom,iswap); + } else { + for(int iswap=0; iswap<6; iswap++) + ghostComm(comm,atom,iswap); + } +} diff --git a/common/includes/timers.h b/common/includes/timers.h index 28e17cd..1b2ad6f 100644 --- a/common/includes/timers.h +++ b/common/includes/timers.h @@ -9,9 +9,15 @@ typedef enum { TOTAL = 0, - NEIGH, FORCE, + NEIGH, + FORWARD, + REVERSE, + UPDATE, + BALANCE, + SETUP, + REST, NUMTIMER -} timertype; + } timerComm; #endif diff --git a/common/includes/util.h b/common/includes/util.h index 00de105..52e0900 100644 --- a/common/includes/util.h +++ b/common/includes/util.h @@ -4,6 +4,8 @@ * Use of this source code is governed by a LGPL-3.0 * license that can be found in the LICENSE file. */ +#include + #ifndef __UTIL_H_ #define __UTIL_H_ @@ -35,6 +37,13 @@ # define PRECISION_STRING "double" #endif +#define BigOrEqual(a,b) (fabs((a)-(b))<1e-9 || (a)>(b)) +#define Equal(a,b) (fabs((a)-(b))<1e-9) + +enum {_x=0, _y, _z}; +enum {fullShell=0, halfShell, eightShell, halfStencil}; + + extern double myrandom(int*); extern void random_reset(int *seed, int ibase, double *coord); extern int str2ff(const char *string); diff --git a/common/parameter.c b/common/parameter.c index 8aab646..85379fd 100644 --- a/common/parameter.c +++ b/common/parameter.c @@ -11,6 +11,7 @@ #include #include #include +#include void initParameter(Parameter *param) { param->input_file = NULL; @@ -54,13 +55,17 @@ void initParameter(Parameter *param) { param->reflect_x = 0.0; param->reflect_y = 0.0; param->reflect_z = 0.0; + //MPI + param->balance = 0; + param->method = 0; + param->balance_every =param->reneigh_every; } void readParameter(Parameter *param, const char *filename) { FILE *fp = fopen(filename, "r"); char line[MAXLINE]; int i; - + if(!fp) { fprintf(stderr, "Could not open parameter file: %s\n", filename); exit(-1); @@ -72,8 +77,8 @@ void readParameter(Parameter *param, const char *filename) { for(i = 0; line[i] != '\0' && line[i] != '#'; i++); line[i] = '\0'; - char *tok = strtok(line, " "); - char *val = strtok(NULL, " "); + char *tok = strtok(line, "\t "); + char *val = strtok(NULL, "\t "); #define PARSE_PARAM(p,f) if(strncmp(tok, #p, sizeof(#p) / sizeof(#p[0]) - 1) == 0) { param->p = f(val); } #define PARSE_STRING(p) PARSE_PARAM(p, strdup) @@ -117,15 +122,20 @@ void readParameter(Parameter *param, const char *filename) { PARSE_INT(x_out_every); PARSE_INT(v_out_every); PARSE_INT(half_neigh); + PARSE_INT(method); + PARSE_INT(balance); + PARSE_INT(balance_every); } } - // Update dtforce param->dtforce = 0.5 * param->dt; // Update sigma6 parameter MD_FLOAT s2 = param->sigma * param->sigma; param->sigma6 = s2 * s2 * s2; + + //Update balance parameter, 10 could be change + param->balance_every *=param->reneigh_every; fclose(fp); } @@ -183,4 +193,19 @@ void printParameter(Parameter *param) { printf("\tSkin: %e\n", param->skin); printf("\tHalf neighbor lists: %d\n", param->half_neigh); printf("\tProcessor frequency (GHz): %.4f\n", param->proc_freq); + + // ================ New MPI features ============= + char str[20]; + strcpy(str, (param->method == 1) ? "Half Shell" : + (param->method == 2) ? "Eight Shell" : + (param->method == 3) ? "Half Stencil": + "Full Shell"); + printf("\tMethod: %s\n", str); + strcpy(str, (param->balance == 1) ? "mean RCB" : + (param->balance == 2) ? "mean Time RCB" : + (param->balance == 3) ? "Staggered" : + "cartisian"); + printf("\tPartition: %s\n", str); + if(param->balance) + printf("\tRebalancing every (timesteps): %d\n",param->balance_every); } diff --git a/common/thermo.c b/common/thermo.c index 0683876..18bf910 100644 --- a/common/thermo.c +++ b/common/thermo.c @@ -10,6 +10,7 @@ #include #include +#include static int *steparr; static MD_FLOAT *tmparr; @@ -24,6 +25,7 @@ static MD_FLOAT t_act; static MD_FLOAT p_act; static MD_FLOAT e_act; static int mstat; +static MPI_Datatype type = (sizeof(MD_FLOAT) == 4) ? MPI_FLOAT : MPI_DOUBLE; /* exported subroutines */ void setupThermo(Parameter *param, int natoms) @@ -53,57 +55,73 @@ void setupThermo(Parameter *param, int natoms) void computeThermo(int iflag, Parameter *param, Atom *atom) { - MD_FLOAT t = 0.0, p; + MD_FLOAT t_sum = 0.0, t = 0.0, p; + int me; + + MPI_Comm_rank(MPI_COMM_WORLD, &me); + for(int i = 0; i < atom->Nlocal; i++) { t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass; } - t = t * t_scale; - p = (t * dof_boltz) * p_scale; - int istep = iflag; + MPI_Reduce(&t, &t_sum, 1, type, MPI_SUM, 0 ,MPI_COMM_WORLD); + if(me == 0) + { + t = t_sum * t_scale; + p = (t * dof_boltz) * p_scale; + int istep = iflag; - if(iflag == -1){ - istep = param->ntimes; - } - if(iflag == 0){ - mstat = 0; - } + if(iflag == -1){ + istep = param->ntimes; + } + if(iflag == 0){ + mstat = 0; + } - steparr[mstat] = istep; - tmparr[mstat] = t; - prsarr[mstat] = p; - mstat++; - fprintf(stdout, "%i\t%e\t%e\n", istep, t, p); + steparr[mstat] = istep; + tmparr[mstat] = t; + prsarr[mstat] = p; + mstat++; + fprintf(stdout, "%i\t%e\t%e\n", istep, t, p); + } } void adjustThermo(Parameter *param, Atom *atom) { /* zero center-of-mass motion */ MD_FLOAT vxtot = 0.0; MD_FLOAT vytot = 0.0; MD_FLOAT vztot = 0.0; - + MD_FLOAT v_sum[3], vtot[3]; + for(int i = 0; i < atom->Nlocal; i++) { vxtot += atom_vx(i); vytot += atom_vy(i); vztot += atom_vz(i); } + + vtot[0] = vxtot; vtot[1] = vytot; vtot[2] = vztot; - vxtot = vxtot / atom->Natoms; - vytot = vytot / atom->Natoms; - vztot = vztot / atom->Natoms; + MPI_Allreduce(vtot, v_sum, 3, type, MPI_SUM, MPI_COMM_WORLD); + + vxtot = v_sum[0] / atom->Natoms; + vytot = v_sum[1] / atom->Natoms; + vztot = v_sum[2] / atom->Natoms; for(int i = 0; i < atom->Nlocal; i++) { atom_vx(i) -= vxtot; atom_vy(i) -= vytot; atom_vz(i) -= vztot; } - - t_act = 0; + MD_FLOAT t = 0.0; + MD_FLOAT t_sum = 0.0; for(int i = 0; i < atom->Nlocal; i++) { t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass; } + MPI_Allreduce(&t, &t_sum, 1,type, MPI_SUM,MPI_COMM_WORLD); + + t = t_sum; t *= t_scale; MD_FLOAT factor = sqrt(param->temp / t); diff --git a/common/util.c b/common/util.c index 350c96f..a782010 100644 --- a/common/util.c +++ b/common/util.c @@ -10,6 +10,7 @@ #include #include #include +#include /* Park/Miller RNG w/out MASKING, so as to be like f90s version */ #define IA 16807 @@ -86,6 +87,7 @@ int get_cuda_num_threads() { void readline(char *line, FILE *fp) { if(fgets(line, MAXLINE, fp) == NULL) { + printf("error %i\n",errno); if(errno != 0) { perror("readline()"); exit(-1); diff --git a/config.mk b/config.mk index f8e1ede..74f07fe 100644 --- a/config.mk +++ b/config.mk @@ -1,15 +1,15 @@ # Compiler tag (GCC/CLANG/ICC/ICX/ONEAPI/NVCC) -TAG ?= ICC +TAG ?= MPIICC # Instruction set (SSE/AVX/AVX_FMA/AVX2/AVX512) ISA ?= AVX512 # Optimization scheme (lammps/gromacs/clusters_per_bin) -OPT_SCHEME ?= lammps +OPT_SCHEME ?= gromacs # Enable likwid (true or false) -ENABLE_LIKWID ?= true +ENABLE_LIKWID ?= false # SP or DP DATA_TYPE ?= DP # AOS or SOA -DATA_LAYOUT ?= AOS +DATA_LAYOUT ?= SOA # Assembly syntax to generate (ATT/INTEL) ASM_SYNTAX ?= ATT # Debug @@ -24,13 +24,13 @@ MEM_TRACER ?= false # Trace indexes and distances for gather-md (true or false) INDEX_TRACER ?= false # Compute statistics -COMPUTE_STATS ?= true +COMPUTE_STATS ?= false # Configurations for lammps optimization scheme # Use omp simd pragma when running with half neighbor-lists -ENABLE_OMP_SIMD ?= true +ENABLE_OMP_SIMD ?= false # Use kernel with explicit SIMD intrinsics -USE_SIMD_KERNEL ?= false +USE_SIMD_KERNEL ?= true # Configurations for gromacs optimization scheme # Use reference version @@ -38,7 +38,7 @@ USE_REFERENCE_VERSION ?= false # Enable XTC output XTC_OUTPUT ?= false # Check if cj is local when decreasing reaction force -HALF_NEIGHBOR_LISTS_CHECK_CJ ?= true +HALF_NEIGHBOR_LISTS_CHECK_CJ ?= false # Configurations for CUDA # Use CUDA host memory to optimize transfers diff --git a/gromacs/atom.c b/gromacs/atom.c index b50a3d1..87a17a7 100644 --- a/gromacs/atom.c +++ b/gromacs/atom.c @@ -12,7 +12,8 @@ #include #include #include - +#include + void initAtom(Atom *atom) { atom->x = NULL; atom->y = NULL; atom->z = NULL; atom->vx = NULL; atom->vy = NULL; atom->vz = NULL; @@ -27,6 +28,7 @@ void initAtom(Atom *atom) { atom->Nclusters = 0; atom->Nclusters_local = 0; atom->Nclusters_ghost = 0; + atom->NmaxGhost = 0; //Temporal atom->Nclusters_max = 0; atom->type = NULL; atom->ntypes = 0; @@ -37,10 +39,19 @@ void initAtom(Atom *atom) { atom->iclusters = NULL; atom->jclusters = NULL; atom->icluster_bin = NULL; + atom->PBCx = NULL; + atom->PBCy = NULL; + atom->PBCz = NULL; initMasks(atom); + //MPI + Box *mybox = &(atom->mybox); + mybox->xprd = mybox->yprd = mybox->zprd = 0; + mybox->lo[0] = mybox->lo[1] = mybox->lo[2] = 0; + mybox->hi[0] = mybox->hi[1] = mybox->hi[2] = 0; } void createAtom(Atom *atom, Parameter *param) { + MD_FLOAT xlo = 0.0; MD_FLOAT xhi = param->xprd; MD_FLOAT ylo = 0.0; MD_FLOAT yhi = param->yprd; MD_FLOAT zlo = 0.0; MD_FLOAT zhi = param->zprd; @@ -90,7 +101,7 @@ void createAtom(Atom *atom, Parameter *param) { ytmp = 0.5 * alat * j; ztmp = 0.5 * alat * k; - if(xtmp >= xlo && xtmp < xhi && ytmp >= ylo && ytmp < yhi && ztmp >= zlo && ztmp < zhi ) { + if(xtmp >= xlo && xtmp < xhi && ytmp >= ylo && ytmp < yhi && ztmp >= zlo && ztmp < zhi ) { n = k * (2 * param->ny) * (2 * param->nx) + j * (2 * param->nx) + i + 1; for(m = 0; m < 5; m++) { myrandom(&n); } vxtmp = myrandom(&n); @@ -128,22 +139,26 @@ int type_str2int(const char *type) { } int readAtom(Atom* atom, Parameter* param) { + int me = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &me); int len = strlen(param->input_file); if(strncmp(¶m->input_file[len - 4], ".pdb", 4) == 0) { return readAtom_pdb(atom, param); } if(strncmp(¶m->input_file[len - 4], ".gro", 4) == 0) { return readAtom_gro(atom, param); } if(strncmp(¶m->input_file[len - 4], ".dmp", 4) == 0) { return readAtom_dmp(atom, param); } - fprintf(stderr, "Invalid input file extension: %s\nValid choices are: pdb, gro, dmp\n", param->input_file); + if(me==0) fprintf(stderr, "Invalid input file extension: %s\nValid choices are: pdb, gro, dmp\n", param->input_file); exit(-1); return -1; } int readAtom_pdb(Atom* atom, Parameter* param) { + int me = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &me); FILE *fp = fopen(param->input_file, "r"); char line[MAXLINE]; int read_atoms = 0; if(!fp) { - fprintf(stderr, "Could not open input file: %s\n", param->input_file); + if(me==0) fprintf(stderr, "Could not open input file: %s\n", param->input_file); exit(-1); return -1; } @@ -153,11 +168,11 @@ int readAtom_pdb(Atom* atom, Parameter* param) { char *item = strtok(line, " "); if(strncmp(item, "CRYST1", 6) == 0) { param->xlo = 0.0; - param->xhi = atof(strtok(NULL, " ")); + param->xhi = atof(strtok(NULL, "\t ")); param->ylo = 0.0; - param->yhi = atof(strtok(NULL, " ")); + param->yhi = atof(strtok(NULL, "\t ")); param->zlo = 0.0; - param->zhi = atof(strtok(NULL, " ")); + param->zhi = atof(strtok(NULL, "\t ")); param->xprd = param->xhi - param->xlo; param->yprd = param->yhi - param->ylo; param->zprd = param->zhi - param->zlo; @@ -166,23 +181,23 @@ int readAtom_pdb(Atom* atom, Parameter* param) { char *label; int atom_id, comp_id; MD_FLOAT occupancy, charge; - atom_id = atoi(strtok(NULL, " ")) - 1; + atom_id = atoi(strtok(NULL, "\t ")) - 1; while(atom_id + 1 >= atom->Nmax) { growAtom(atom); } - atom->type[atom_id] = type_str2int(strtok(NULL, " ")); - label = strtok(NULL, " "); - comp_id = atoi(strtok(NULL, " ")); - atom_x(atom_id) = atof(strtok(NULL, " ")); - atom_y(atom_id) = atof(strtok(NULL, " ")); - atom_z(atom_id) = atof(strtok(NULL, " ")); + atom->type[atom_id] = type_str2int(strtok(NULL, "\t ")); + label = strtok(NULL, "\t "); + comp_id = atoi(strtok(NULL, "\t ")); + atom_x(atom_id) = atof(strtok(NULL, "\t ")); + atom_y(atom_id) = atof(strtok(NULL, "\t ")); + atom_z(atom_id) = atof(strtok(NULL, "\t ")); atom->vx[atom_id] = 0.0; atom->vy[atom_id] = 0.0; atom->vz[atom_id] = 0.0; - occupancy = atof(strtok(NULL, " ")); - charge = atof(strtok(NULL, " ")); + occupancy = atof(strtok(NULL, "\t ")); + charge = atof(strtok(NULL, "\t ")); atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes); atom->Natoms++; atom->Nlocal++; @@ -194,14 +209,14 @@ int readAtom_pdb(Atom* atom, Parameter* param) { strncmp(item, "ENDMDL", 6) == 0) { // Do nothing } else { - fprintf(stderr, "Invalid item: %s\n", item); + if(me==0) fprintf(stderr, "Invalid item: %s\n", item); exit(-1); return -1; } } if(!read_atoms) { - fprintf(stderr, "Input error: No atoms read!\n"); + if(me==0) fprintf(stderr, "Input error: No atoms read!\n"); exit(-1); return -1; } @@ -217,12 +232,15 @@ int readAtom_pdb(Atom* atom, Parameter* param) { atom->cutforcesq[i] = param->cutforce * param->cutforce; } - fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file); + if(me==0) fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file); fclose(fp); return read_atoms; } int readAtom_gro(Atom* atom, Parameter* param) { + int me = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &me); + FILE *fp = fopen(param->input_file, "r"); char line[MAXLINE]; char desc[MAXLINE]; @@ -231,7 +249,7 @@ int readAtom_gro(Atom* atom, Parameter* param) { int i = 0; if(!fp) { - fprintf(stderr, "Could not open input file: %s\n", param->input_file); + if(me==0) fprintf(stderr, "Could not open input file: %s\n", param->input_file); exit(-1); return -1; } @@ -241,25 +259,25 @@ int readAtom_gro(Atom* atom, Parameter* param) { desc[i] = '\0'; readline(line, fp); atoms_to_read = atoi(strtok(line, " ")); - fprintf(stdout, "System: %s with %d atoms\n", desc, atoms_to_read); + if(me==0) fprintf(stdout, "System: %s with %d atoms\n", desc, atoms_to_read); while(!feof(fp) && read_atoms < atoms_to_read) { readline(line, fp); - char *label = strtok(line, " "); - int type = type_str2int(strtok(NULL, " ")); - int atom_id = atoi(strtok(NULL, " ")) - 1; + char *label = strtok(line, "\t "); + int type = type_str2int(strtok(NULL, "\t ")); + int atom_id = atoi(strtok(NULL, "\t ")) - 1; atom_id = read_atoms; while(atom_id + 1 >= atom->Nmax) { growAtom(atom); } atom->type[atom_id] = type; - atom_x(atom_id) = atof(strtok(NULL, " ")); - atom_y(atom_id) = atof(strtok(NULL, " ")); - atom_z(atom_id) = atof(strtok(NULL, " ")); - atom->vx[atom_id] = atof(strtok(NULL, " ")); - atom->vy[atom_id] = atof(strtok(NULL, " ")); - atom->vz[atom_id] = atof(strtok(NULL, " ")); + atom_x(atom_id) = atof(strtok(NULL, "\t ")); + atom_y(atom_id) = atof(strtok(NULL, "\t ")); + atom_z(atom_id) = atof(strtok(NULL, "\t ")); + atom->vx[atom_id] = atof(strtok(NULL, "\t ")); + atom->vy[atom_id] = atof(strtok(NULL, "\t ")); + atom->vz[atom_id] = atof(strtok(NULL, "\t ")); atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes); atom->Natoms++; atom->Nlocal++; @@ -269,18 +287,18 @@ int readAtom_gro(Atom* atom, Parameter* param) { if(!feof(fp)) { readline(line, fp); param->xlo = 0.0; - param->xhi = atof(strtok(line, " ")); + param->xhi = atof(strtok(line, "\t ")); param->ylo = 0.0; - param->yhi = atof(strtok(NULL, " ")); + param->yhi = atof(strtok(NULL, "\t ")); param->zlo = 0.0; - param->zhi = atof(strtok(NULL, " ")); + param->zhi = atof(strtok(NULL, "\t ")); param->xprd = param->xhi - param->xlo; param->yprd = param->yhi - param->ylo; param->zprd = param->zhi - param->zlo; } if(read_atoms != atoms_to_read) { - fprintf(stderr, "Input error: Number of atoms read do not match (%d/%d).\n", read_atoms, atoms_to_read); + if(me==0) fprintf(stderr, "Input error: Number of atoms read do not match (%d/%d).\n", read_atoms, atoms_to_read); exit(-1); return -1; } @@ -296,12 +314,14 @@ int readAtom_gro(Atom* atom, Parameter* param) { atom->cutforcesq[i] = param->cutforce * param->cutforce; } - fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file); + if(me==0) fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file); fclose(fp); return read_atoms; } int readAtom_dmp(Atom* atom, Parameter* param) { + int me = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &me); FILE *fp = fopen(param->input_file, "r"); char line[MAXLINE]; int natoms = 0; @@ -310,7 +330,7 @@ int readAtom_dmp(Atom* atom, Parameter* param) { int ts = -1; if(!fp) { - fprintf(stderr, "Could not open input file: %s\n", param->input_file); + if(me==0) fprintf(stderr, "Could not open input file: %s\n", param->input_file); exit(-1); return -1; } @@ -333,47 +353,47 @@ int readAtom_dmp(Atom* atom, Parameter* param) { } } else if(strncmp(item, "BOX BOUNDS pp pp pp", 19) == 0) { readline(line, fp); - param->xlo = atof(strtok(line, " ")); - param->xhi = atof(strtok(NULL, " ")); + param->xlo = atof(strtok(line, "\t ")); + param->xhi = atof(strtok(NULL, "\t ")); param->xprd = param->xhi - param->xlo; readline(line, fp); - param->ylo = atof(strtok(line, " ")); - param->yhi = atof(strtok(NULL, " ")); + param->ylo = atof(strtok(line, "\t ")); + param->yhi = atof(strtok(NULL, "\t ")); param->yprd = param->yhi - param->ylo; readline(line, fp); - param->zlo = atof(strtok(line, " ")); - param->zhi = atof(strtok(NULL, " ")); + param->zlo = atof(strtok(line, "\t ")); + param->zhi = atof(strtok(NULL, "\t ")); param->zprd = param->zhi - param->zlo; } else if(strncmp(item, "ATOMS id type x y z vx vy vz", 28) == 0) { for(int i = 0; i < natoms; i++) { readline(line, fp); - atom_id = atoi(strtok(line, " ")) - 1; - atom->type[atom_id] = atoi(strtok(NULL, " ")); - atom_x(atom_id) = atof(strtok(NULL, " ")); - atom_y(atom_id) = atof(strtok(NULL, " ")); - atom_z(atom_id) = atof(strtok(NULL, " ")); - atom->vx[atom_id] = atof(strtok(NULL, " ")); - atom->vy[atom_id] = atof(strtok(NULL, " ")); - atom->vz[atom_id] = atof(strtok(NULL, " ")); + atom_id = atoi(strtok(line, "\t ")) - 1; + atom->type[atom_id] = atoi(strtok(NULL, "\t ")); + atom_x(atom_id) = atof(strtok(NULL, "\t ")); + atom_y(atom_id) = atof(strtok(NULL, "\t ")); + atom_z(atom_id) = atof(strtok(NULL, "\t ")); + atom->vx[atom_id] = atof(strtok(NULL, "\t ")); + atom->vy[atom_id] = atof(strtok(NULL, "\t ")); + atom->vz[atom_id] = atof(strtok(NULL, "\t ")); atom->ntypes = MAX(atom->type[atom_id], atom->ntypes); read_atoms++; } } else { - fprintf(stderr, "Invalid item: %s\n", item); + if(me==0) fprintf(stderr, "Invalid item: %s\n", item); exit(-1); return -1; } } else { - fprintf(stderr, "Invalid input from file, expected item reference but got:\n%s\n", line); + if(me==0) fprintf(stderr, "Invalid input from file, expected item reference but got:\n%s\n", line); exit(-1); return -1; } } if(ts < 0 || !natoms || !read_atoms) { - fprintf(stderr, "Input error: atom data was not read!\n"); + if(me==0) fprintf(stderr, "Input error: atom data was not read!\n"); exit(-1); return -1; } @@ -389,7 +409,7 @@ int readAtom_dmp(Atom* atom, Parameter* param) { atom->cutforcesq[i] = param->cutforce * param->cutforce; } - fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file); + if(me==0) fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file); fclose(fp); return natoms; } @@ -530,3 +550,249 @@ void growClusters(Atom *atom) { atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT)); atom->cl_type = (int*) reallocate(atom->cl_type, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * sizeof(int), nold * CLUSTER_M * sizeof(int)); } + +/* MPI added*/ +void growPbc(Atom* atom) { + int nold = atom->NmaxGhost; + atom->NmaxGhost += DELTA; + + if (atom->PBCx || atom->PBCy || atom->PBCz){ + atom->PBCx = (int*) reallocate(atom->PBCx, ALIGNMENT, atom->NmaxGhost * sizeof(int), nold * sizeof(int)); + atom->PBCy = (int*) reallocate(atom->PBCy, ALIGNMENT, atom->NmaxGhost * sizeof(int), nold * sizeof(int)); + atom->PBCz = (int*) reallocate(atom->PBCz, ALIGNMENT, atom->NmaxGhost * sizeof(int), nold * sizeof(int)); + } else { + atom->PBCx = (int*) malloc(atom->NmaxGhost * sizeof(int)); + atom->PBCy = (int*) malloc(atom->NmaxGhost * sizeof(int)); + atom->PBCz = (int*) malloc(atom->NmaxGhost * sizeof(int)); + } +} + +void packForward(Atom* atom, int nc, int* list, MD_FLOAT* buf, int* pbc) +{ + for(int i = 0; i < nc; i++) { + int cj = list[i]; + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; + int displ = i*CLUSTER_N; + + for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) { + buf[3*(displ+cjj)+0] = cj_x[CL_X_OFFSET + cjj] + pbc[_x] * atom->mybox.xprd; + buf[3*(displ+cjj)+1] = cj_x[CL_Y_OFFSET + cjj] + pbc[_y] * atom->mybox.yprd; + buf[3*(displ+cjj)+2] = cj_x[CL_Z_OFFSET + cjj] + pbc[_z] * atom->mybox.zprd; + } + + for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) { + buf[3*(displ+cjj)+0] = -1; //x + buf[3*(displ+cjj)+1] = -1; //y + buf[3*(displ+cjj)+2] = -1; //z + } + } +} + +void unpackForward(Atom* atom, int nc, int c0, MD_FLOAT* buf) +{ + for(int i = 0; i < nc; i++) { + int cj = c0+i; + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; + int displ = i*CLUSTER_N; + + for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) { + if(cj_x[CL_X_OFFSET + cjj] count = 23 value/cluster + trackpbc[x] + trackpbc[y] + trackpbc[z] + int m = 0; + if(atom->jclusters[cj].natoms > 0) { + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + int cj_sca_base = CJ_SCALAR_BASE_INDEX(cj); + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; + MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY; + MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY; + MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY; + + buf[m++] = atom->jclusters[cj].natoms; + + for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) { + + MD_FLOAT xtmp = cj_x[CL_X_OFFSET + cjj] + pbc[_x] * atom->mybox.xprd; + MD_FLOAT ytmp = cj_x[CL_Y_OFFSET + cjj] + pbc[_y] * atom->mybox.yprd; + MD_FLOAT ztmp = cj_x[CL_Z_OFFSET + cjj] + pbc[_z] * atom->mybox.zprd; + + buf[m++] = xtmp; + buf[m++] = ytmp; + buf[m++] = ztmp; + buf[m++]= atom->cl_type[cj_sca_base + cjj]; + + if(bbminx > xtmp) { bbminx = xtmp; } + if(bbmaxx < xtmp) { bbmaxx = xtmp; } + if(bbminy > ytmp) { bbminy = ytmp; } + if(bbmaxy < ytmp) { bbmaxy = ytmp; } + if(bbminz > ztmp) { bbminz = ztmp; } + if(bbmaxz < ztmp) { bbmaxz = ztmp; } + } + + for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) { + buf[m++] = -1; //x + buf[m++] = -1; //y + buf[m++] = -1; //z + buf[m++] = -1; //type + } + + buf[m++] = bbminx; + buf[m++] = bbmaxx; + buf[m++] = bbminy; + buf[m++] = bbmaxy; + buf[m++] = bbminz; + buf[m++] = bbmaxz; + //TODO: check atom->ncj + int ghostId = cj-atom->ncj; + //check for ghost particles + buf[m++] = (cj-atom->ncj>=0) ? pbc[_x]+atom->PBCx[ghostId]:pbc[_x]; + buf[m++] = (cj-atom->ncj>=0) ? pbc[_y]+atom->PBCy[ghostId]:pbc[_y]; + buf[m++] = (cj-atom->ncj>=0) ? pbc[_z]+atom->PBCz[ghostId]:pbc[_z]; + } + return m; +} + +int unpackGhost(Atom* atom, int cj, MD_FLOAT* buf) +{ + int m = 0; + int jfac = MAX(1, CLUSTER_N / CLUSTER_M); + if(cj*jfac>=atom->Nclusters_max) growClusters(atom); + if(atom->Nclusters_ghost>=atom->NmaxGhost) growPbc(atom); + + int cj_sca_base = CJ_SCALAR_BASE_INDEX(cj); + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; + + atom->jclusters[cj].natoms = buf[m++]; + for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) { + + cj_x[CL_X_OFFSET + cjj] = buf[m++]; + cj_x[CL_Y_OFFSET + cjj] = buf[m++]; + cj_x[CL_Z_OFFSET + cjj] = buf[m++]; + atom->cl_type[cj_sca_base + cjj] = buf[m++]; + atom->Nghost++; + } + + for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) { + cj_x[CL_X_OFFSET + cjj] = INFINITY; + cj_x[CL_Y_OFFSET + cjj] = INFINITY; + cj_x[CL_Z_OFFSET + cjj] = INFINITY; + atom->cl_type[cj_sca_base + cjj] = -1; + m+=4; + } + + atom->jclusters[cj].bbminx = buf[m++]; + atom->jclusters[cj].bbmaxx = buf[m++]; + atom->jclusters[cj].bbminy = buf[m++]; + atom->jclusters[cj].bbmaxy = buf[m++]; + atom->jclusters[cj].bbminz = buf[m++]; + atom->jclusters[cj].bbmaxz = buf[m++]; + atom->PBCx[atom->Nclusters_ghost] = buf[m++]; + atom->PBCy[atom->Nclusters_ghost] = buf[m++]; + atom->PBCz[atom->Nclusters_ghost] = buf[m++]; + atom->Nclusters_ghost++; + +} + +void packReverse(Atom* atom, int nc, int c0, MD_FLOAT* buf) +{ + for(int i = 0; i < nc; i++) { + int cj = c0+i; + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base]; + int displ = i*CLUSTER_N; + + for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) { + buf[3*(displ+cjj)+0] = cj_f[CL_X_OFFSET + cjj]; + buf[3*(displ+cjj)+1] = cj_f[CL_Y_OFFSET + cjj]; + buf[3*(displ+cjj)+2] = cj_f[CL_Z_OFFSET + cjj]; + } + + for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) { + buf[3*(displ+cjj)+0] = -1; //x + buf[3*(displ+cjj)+1] = -1; //y + buf[3*(displ+cjj)+2] = -1; //z + } + } +} + +void unpackReverse(Atom* atom, int nc, int* list, MD_FLOAT* buf) +{ + for(int i = 0; i < nc; i++) { + int cj = list[i]; + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base]; + int displ = i*CLUSTER_N; + + for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) { + cj_f[CL_X_OFFSET + cjj] += buf[3*(displ+cjj)+0]; + cj_f[CL_Y_OFFSET + cjj] += buf[3*(displ+cjj)+1]; + cj_f[CL_Z_OFFSET + cjj] += buf[3*(displ+cjj)+2]; + } + } +} + +int packExchange(Atom* atom, int i, MD_FLOAT* buf) +{ + int m = 0; + buf[m++] = atom_x(i); + buf[m++] = atom_y(i); + buf[m++] = atom_z(i); + buf[m++] = atom_vx(i); + buf[m++] = atom_vy(i); + buf[m++] = atom_vz(i); + buf[m++] = atom->type[i]; + return m; +} + +int unpackExchange(Atom* atom, int i, MD_FLOAT* buf) +{ + while(i >= atom->Nmax) growAtom(atom); + int m = 0; + atom_x(i) = buf[m++]; + atom_y(i) = buf[m++]; + atom_z(i) = buf[m++]; + atom_vx(i) = buf[m++]; + atom_vy(i) = buf[m++]; + atom_vz(i) = buf[m++]; + atom->type[i] = buf[m++]; + return m; +} + +void pbc(Atom* atom) +{ + for(int i = 0; i < atom->Nlocal; i++) { + + MD_FLOAT xprd = atom->mybox.xprd; + MD_FLOAT yprd = atom->mybox.yprd; + MD_FLOAT zprd = atom->mybox.zprd; + + if(atom_x(i) < 0.0) atom_x(i) += xprd; + if(atom_y(i) < 0.0) atom_y(i) += yprd; + if(atom_z(i) < 0.0) atom_z(i) +=zprd; + if(atom_x(i) >= xprd) atom_x(i) -=xprd; + if(atom_y(i) >= yprd) atom_y(i) -=yprd; + if(atom_z(i) >= zprd) atom_z(i) -=zprd; + } +} + +void copy(Atom* atom, int i, int j) +{ + atom_x(i) = atom_x(j); + atom_y(i) = atom_y(j); + atom_z(i) = atom_z(j); + atom_vx(i) = atom_vx(j); + atom_vy(i) = atom_vy(j); + atom_vz(i) = atom_vz(j); + atom->type[i] = atom->type[j]; +} diff --git a/gromacs/force_lj.c b/gromacs/force_lj.c index 84e384a..50ec1ea 100644 --- a/gromacs/force_lj.c +++ b/gromacs/force_lj.c @@ -14,8 +14,9 @@ #include #include #include +#include - +void computeForceGhostShell(Parameter*, Atom*, Neighbor*); /* static inline void gmx_load_simd_2xnn_interactions( int excl, @@ -49,7 +50,6 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_FLOAT cutforcesq = param->cutforce * param->cutforce; MD_FLOAT sigma6 = param->sigma6; MD_FLOAT epsilon = param->epsilon; - for(int ci = 0; ci < atom->Nclusters_local; ci++) { int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; @@ -60,13 +60,23 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat } } + for(int cg = atom->ncj; cg < atom->ncj+atom->Nclusters_ghost; cg++) { + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cg); + MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base]; + for(int cjj = 0; cjj < atom->jclusters[cg].natoms; cjj++) { + cj_f[CL_X_OFFSET + cjj] = 0.0; + cj_f[CL_Y_OFFSET + cjj] = 0.0; + cj_f[CL_Z_OFFSET + cjj] = 0.0; + } + } + double S = getTimeStamp(); #pragma omp parallel { LIKWID_MARKER_START("force"); - #pragma omp for schedule(runtime) + #pragma omp for for(int ci = 0; ci < atom->Nclusters_local; ci++) { int ci_cj0 = CJ0_FROM_CI(ci); int ci_cj1 = CJ1_FROM_CI(ci); @@ -75,14 +85,14 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; neighs = &neighbor->neighbors[ci * neighbor->maxneighs]; int numneighs = neighbor->numneigh[ci]; - + for(int k = 0; k < numneighs; k++) { int cj = neighs[k]; int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); int any = 0; MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base]; - + for(int cii = 0; cii < CLUSTER_M; cii++) { MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii]; MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii]; @@ -103,6 +113,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat cond = neighbor->half_neigh ? (ci_cj0 != cj || cii < cjj) && (ci_cj1 != cj || cii < cjj + CLUSTER_N) : (ci_cj0 != cj || cii != cjj) && (ci_cj1 != cj || cii != cjj + CLUSTER_N); #endif + if(cond) { MD_FLOAT delx = xtmp - cj_x[CL_X_OFFSET + cjj]; MD_FLOAT dely = ytmp - cj_x[CL_Y_OFFSET + cjj]; @@ -113,12 +124,11 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6; MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon; - if(neighbor->half_neigh) { + if(neighbor->half_neigh || param->method) { cj_f[CL_X_OFFSET + cjj] -= delx * force; cj_f[CL_Y_OFFSET + cjj] -= dely * force; cj_f[CL_Z_OFFSET + cjj] -= delz * force; } - fix += delx * force; fiy += dely * force; fiz += delz * force; @@ -129,13 +139,11 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat } } } - if(any != 0) { addStat(stats->clusters_within_cutoff, 1); } else { addStat(stats->clusters_outside_cutoff, 1); } - ci_f[CL_X_OFFSET + cii] += fix; ci_f[CL_Y_OFFSET + cii] += fiy; ci_f[CL_Z_OFFSET + cii] += fiz; @@ -146,7 +154,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat addStat(stats->num_neighs, numneighs); addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N)); } - + if(param->method == eightShell) computeForceGhostShell(param, atom, neighbor); LIKWID_MARKER_STOP("force"); } @@ -168,7 +176,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0); MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5); - for(int ci = 0; ci < atom->Nclusters_local; ci++) { + for(int ci = 0; ci < atom->Nclusters_local+atom->Nclusters_ghost; ci++) { int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) { @@ -178,6 +186,16 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor } } + for(int cg = atom->ncj; cg < atom->ncj+atom->Nclusters_ghost; cg++) { + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cg); + MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base]; + for(int cjj = 0; cjj < atom->jclusters[cg].natoms; cjj++) { + cj_f[CL_X_OFFSET + cjj] = 0.0; + cj_f[CL_Y_OFFSET + cjj] = 0.0; + cj_f[CL_Z_OFFSET + cjj] = 0.0; + } + } + double S = getTimeStamp(); #pragma omp parallel @@ -213,7 +231,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor #endif */ - #pragma omp for schedule(runtime) + #pragma omp for for(int ci = 0; ci < atom->Nclusters_local; ci++) { int ci_cj0 = CJ0_FROM_CI(ci); #if CLUSTER_M > CLUSTER_N @@ -322,7 +340,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor fiz2 += tz2; #ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ - if(cj < CJ1_FROM_CI(atom->Nlocal)) { + if(cj < CJ1_FROM_CI(atom->Nlocal)|| param->method) { simd_h_decr3(cj_f, tx0 + tx2, ty0 + ty2, tz0 + tz2); } #else @@ -373,7 +391,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor fiz2 += tz2; #ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ - if(cj < CJ1_FROM_CI(atom->Nlocal)) { + if(cj < CJ1_FROM_CI(atom->Nlocal) || param->method) { simd_h_decr3(cj_f, tx0 + tx2, ty0 + ty2, tz0 + tz2); } #else @@ -389,7 +407,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor addStat(stats->num_neighs, numneighs); addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N)); } - + if(param->method == eightShell) computeForceGhostShell(param, atom, neighbor); LIKWID_MARKER_STOP("force"); } @@ -427,7 +445,7 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor { LIKWID_MARKER_START("force"); - #pragma omp for schedule(runtime) + #pragma omp for for(int ci = 0; ci < atom->Nclusters_local; ci++) { int ci_cj0 = CJ0_FROM_CI(ci); #if CLUSTER_M > CLUSTER_N @@ -562,7 +580,6 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta if(neighbor->half_neigh) { return computeForceLJ_2xnn_half(param, atom, neighbor, stats); } - return computeForceLJ_2xnn_full(param, atom, neighbor, stats); } @@ -589,13 +606,23 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor, } } + for(int cg = atom->ncj; cg < atom->ncj+atom->Nclusters_ghost; cg++) { + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cg); + MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base]; + for(int cjj = 0; cjj < atom->jclusters[cg].natoms; cjj++) { + cj_f[CL_X_OFFSET + cjj] = 0.0; + cj_f[CL_Y_OFFSET + cjj] = 0.0; + cj_f[CL_Z_OFFSET + cjj] = 0.0; + } + } + double S = getTimeStamp(); #pragma omp parallel { LIKWID_MARKER_START("force"); - #pragma omp for schedule(runtime) + #pragma omp for for(int ci = 0; ci < atom->Nclusters_local; ci++) { int ci_cj0 = CJ0_FROM_CI(ci); #if CLUSTER_M > CLUSTER_N @@ -726,7 +753,7 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor, fiz3 = simd_add(fiz3, tz3); #ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ - if(cj < CJ1_FROM_CI(atom->Nlocal)) { + if(cj < CJ1_FROM_CI(atom->Nlocal) || param->method) { simd_store(&cj_f[CL_X_OFFSET], simd_load(&cj_f[CL_X_OFFSET]) - (tx0 + tx1 + tx2 + tx3)); simd_store(&cj_f[CL_Y_OFFSET], simd_load(&cj_f[CL_Y_OFFSET]) - (ty0 + ty1 + ty2 + ty3)); simd_store(&cj_f[CL_Z_OFFSET], simd_load(&cj_f[CL_Z_OFFSET]) - (tz0 + tz1 + tz2 + tz3)); @@ -811,7 +838,7 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor, fiz3 = simd_add(fiz3, tz3); #ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ - if(cj < CJ1_FROM_CI(atom->Nlocal)) { + if(cj < CJ1_FROM_CI(atom->Nlocal) || param->method) { simd_store(&cj_f[CL_X_OFFSET], simd_load(&cj_f[CL_X_OFFSET]) - (tx0 + tx1 + tx2 + tx3)); simd_store(&cj_f[CL_Y_OFFSET], simd_load(&cj_f[CL_Y_OFFSET]) - (ty0 + ty1 + ty2 + ty3)); simd_store(&cj_f[CL_Z_OFFSET], simd_load(&cj_f[CL_Z_OFFSET]) - (tz0 + tz1 + tz2 + tz3)); @@ -831,7 +858,7 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor, addStat(stats->num_neighs, numneighs); addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N)); } - + if(param->method == eightShell) computeForceGhostShell(param, atom, neighbor); LIKWID_MARKER_STOP("force"); } @@ -869,7 +896,7 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor, { LIKWID_MARKER_START("force"); - #pragma omp for schedule(runtime) + #pragma omp for for(int ci = 0; ci < atom->Nclusters_local; ci++) { int ci_cj0 = CJ0_FROM_CI(ci); #if CLUSTER_M > CLUSTER_N @@ -1070,3 +1097,120 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat return computeForceLJ_4xn_full(param, atom, neighbor, stats); } + +//Routine for eight shell method +void computeForceGhostShell(Parameter *param, Atom *atom, Neighbor *neighbor) { + DEBUG_MESSAGE("computeForceLJ begin\n"); + + int Nshell = neighbor->Nshell; + int *neighs; + MD_FLOAT cutforcesq = param->cutforce * param->cutforce; + MD_FLOAT sigma6 = param->sigma6; + MD_FLOAT epsilon = param->epsilon; + + for(int ci = 0; ci < Nshell; ci++) { + neighs = &neighbor->neighshell[ci * neighbor->maxneighs]; + int numneighs = neighbor->numNeighShell[ci]; + int cs = neighbor->listshell[ci]; + int cs_vec_base = CJ_VECTOR_BASE_INDEX(cs); + MD_FLOAT *cs_x = &atom->cl_x[cs_vec_base]; + MD_FLOAT *cs_f = &atom->cl_f[cs_vec_base]; + + for(int k = 0; k < numneighs; k++) { + int cj = neighs[k]; + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; + MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base]; + + for(int css = 0; css < CLUSTER_N; css++) { + MD_FLOAT x = cs_x[CL_X_OFFSET + css]; + MD_FLOAT y = cs_x[CL_Y_OFFSET + css]; + MD_FLOAT z = cs_x[CL_Z_OFFSET + css]; + + MD_FLOAT fix = 0; + MD_FLOAT fiy = 0; + MD_FLOAT fiz = 0; + + for(int cjj = 0; cjj < CLUSTER_N; cjj++) { + + MD_FLOAT delx = x - cj_x[CL_X_OFFSET + cjj]; + MD_FLOAT dely = y - cj_x[CL_Y_OFFSET + cjj]; + MD_FLOAT delz = z - cj_x[CL_Z_OFFSET + cjj]; + + MD_FLOAT rsq = delx * delx + dely * dely + delz * delz; + if(rsq < cutforcesq) { + MD_FLOAT sr2 = 1.0 / rsq; + MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6; + MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon; + + cj_f[CL_X_OFFSET + cjj] -= delx * force; + cj_f[CL_Y_OFFSET + cjj] -= dely * force; + cj_f[CL_Z_OFFSET + cjj] -= delz * force; + + fix += delx * force; + fiy += dely * force; + fiz += delz * force; + } + } + + cs_f[CL_X_OFFSET + css] += fix; + cs_f[CL_Y_OFFSET + css] += fiy; + cs_f[CL_Z_OFFSET + css] += fiz; + } + } + // addStat(stats->calculated_forces, 1); + // addStat(stats->num_neighs, numneighs); + // addStat(stats->force_iters, (long long int)((double)numneighs)); + } +} + +/* +void computeForceGhostShell(Parameter *param, Atom *atom, Neighbor *neighbor) { + + int Nshell = neighbor->Nshell; + Pair* neighs; + MD_FLOAT cutforcesq = param->cutforce * param->cutforce; + MD_FLOAT sigma6 = param->sigma6; + MD_FLOAT epsilon = param->epsilon; + + for(int ci = 0; ci < Nshell; ci++) { + neighs = &neighbor->neighshell[ci * neighbor->maxneighs]; + int numneighs = neighbor->numNeighShell[ci]; + int cs = neighbor->listshell[ci].cluster; + int css = neighbor->listshell[ci].atom; + int cs_vec_base = CJ_VECTOR_BASE_INDEX(cs); + MD_FLOAT *cs_x = &atom->cl_x[cs_vec_base]; + MD_FLOAT *cs_f = &atom->cl_f[cs_vec_base]; + MD_FLOAT x = cs_x[CL_X_OFFSET + css]; + MD_FLOAT y = cs_x[CL_Y_OFFSET + css]; + MD_FLOAT z = cs_x[CL_Z_OFFSET + css]; + + for(int k = 0; k < numneighs; k++) { + int cj = neighs[k].cluster; + int cjj = neighs[k].atom; + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; + MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base]; + + MD_FLOAT delx = x - cj_x[CL_X_OFFSET + cjj]; + MD_FLOAT dely = y - cj_x[CL_Y_OFFSET + cjj]; + MD_FLOAT delz = z - cj_x[CL_Z_OFFSET + cjj]; + MD_FLOAT rsq = delx * delx + dely * dely + delz * delz; + + if(rsq < cutforcesq) { + MD_FLOAT sr2 = 1.0 / rsq; + MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6; + MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon; + + cj_f[CL_X_OFFSET + cjj] -= delx * force; + cj_f[CL_Y_OFFSET + cjj] -= dely * force; + cj_f[CL_Z_OFFSET + cjj] -= delz * force; + cs_f[CL_X_OFFSET + css] += delx * force; + cs_f[CL_Y_OFFSET + css] += delx * force; + cs_f[CL_Z_OFFSET + css] += delx * force; + + } + } + } +} +*/ diff --git a/gromacs/includes/atom.h b/gromacs/includes/atom.h index b674cdc..e73a0d7 100644 --- a/gromacs/includes/atom.h +++ b/gromacs/includes/atom.h @@ -5,6 +5,7 @@ * license that can be found in the LICENSE file. */ #include +#include #ifndef __ATOM_H_ #define __ATOM_H_ @@ -102,7 +103,7 @@ typedef struct { typedef struct { int Natoms, Nlocal, Nghost, Nmax; - int Nclusters, Nclusters_local, Nclusters_ghost, Nclusters_max; + int Nclusters, Nclusters_local, Nclusters_ghost, Nclusters_max, NmaxGhost,ncj; MD_FLOAT *x, *y, *z; MD_FLOAT *vx, *vy, *vz; int *border_map; @@ -112,6 +113,7 @@ typedef struct { MD_FLOAT *sigma6; MD_FLOAT *cutforcesq; MD_FLOAT *cutneighsq; + //track the movement of a particle along boundaries int *PBCx, *PBCy, *PBCz; // Data in cluster format MD_FLOAT *cl_x; @@ -128,6 +130,9 @@ typedef struct { unsigned int masks_2xnn_fn[8]; unsigned int masks_4xn_hn[16]; unsigned int masks_4xn_fn[16]; + + //Info Subdomain + Box mybox; } Atom; extern void initAtom(Atom*); @@ -140,6 +145,18 @@ extern int readAtom_dmp(Atom*, Parameter*); extern void growAtom(Atom*); extern void growClusters(Atom*); +int packGhost(Atom*, int, MD_FLOAT* , int*); +int unpackGhost(Atom*, int, MD_FLOAT*); +int packExchange(Atom*, int, MD_FLOAT*); +int unpackExchange(Atom*, int, MD_FLOAT*); +void packForward(Atom*, int, int*, MD_FLOAT*, int*); +void unpackForward(Atom*, int, int, MD_FLOAT*); +void packReverse(Atom* , int , int , MD_FLOAT*); +void unpackReverse(Atom*, int, int*, MD_FLOAT*); +void pbc(Atom*); +void copy(Atom*, int, int); + + #ifdef AOS # define POS_DATA_LAYOUT "AoS" # define atom_x(i) atom->x[(i) * 3 + 0] diff --git a/gromacs/includes/integrate.h b/gromacs/includes/integrate.h index 5236ed8..3274d4f 100644 --- a/gromacs/includes/integrate.h +++ b/gromacs/includes/integrate.h @@ -9,10 +9,13 @@ #include #include #include - +#include +#include +#include +/* void cpuInitialIntegrate(Parameter *param, Atom *atom) { + DEBUG_MESSAGE("cpuInitialIntegrate start\n"); - for(int ci = 0; ci < atom->Nclusters_local; ci++) { int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; @@ -32,9 +35,9 @@ void cpuInitialIntegrate(Parameter *param, Atom *atom) { DEBUG_MESSAGE("cpuInitialIntegrate end\n"); } -void cpuFinalIntegrate(Parameter *param, Atom *atom) { - DEBUG_MESSAGE("cpuFinalIntegrate start\n"); +void cpuFinalIntegrate(Parameter *param, Atom *atom) { + DEBUG_MESSAGE("cpuFinalIntegrate start\n"); for(int ci = 0; ci < atom->Nclusters_local; ci++) { int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base]; @@ -46,6 +49,56 @@ void cpuFinalIntegrate(Parameter *param, Atom *atom) { ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii]; } } + DEBUG_MESSAGE("cpuFinalIntegrate end\n"); +} +*/ + +void cpuInitialIntegrate(Parameter *param, Atom *atom) { + + DEBUG_MESSAGE("cpuInitialIntegrate start\n"); + for(int ci = 0; ci < atom->Nclusters_local; ci+=2) { + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base]; + MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; + + MD_SIMD_FLOAT dtforce = simd_broadcast(param->dtforce); + MD_SIMD_FLOAT dt = simd_broadcast(param->dt); + + MD_SIMD_FLOAT vx_vector = simd_fma(simd_load(&ci_f[CL_X_OFFSET]), dtforce, simd_load(&ci_v[CL_X_OFFSET])); + MD_SIMD_FLOAT vy_vector = simd_fma(simd_load(&ci_f[CL_Y_OFFSET]), dtforce, simd_load(&ci_v[CL_Y_OFFSET])); + MD_SIMD_FLOAT vz_vector = simd_fma(simd_load(&ci_f[CL_Z_OFFSET]), dtforce, simd_load(&ci_v[CL_Z_OFFSET])); + MD_SIMD_FLOAT x_vector = simd_fma(vx_vector, dt, simd_load(&ci_x[CL_X_OFFSET])); + MD_SIMD_FLOAT y_vector = simd_fma(vy_vector, dt, simd_load(&ci_x[CL_Y_OFFSET])); + MD_SIMD_FLOAT z_vector = simd_fma(vz_vector, dt, simd_load(&ci_x[CL_Z_OFFSET])); + + simd_store(&ci_v[CL_X_OFFSET], vx_vector); + simd_store(&ci_v[CL_Y_OFFSET], vy_vector); + simd_store(&ci_v[CL_Z_OFFSET], vz_vector); + simd_store(&ci_x[CL_X_OFFSET], x_vector); + simd_store(&ci_x[CL_Y_OFFSET], y_vector); + simd_store(&ci_x[CL_Z_OFFSET], z_vector); + } + + DEBUG_MESSAGE("cpuInitialIntegrate end\n"); +} + +void cpuFinalIntegrate(Parameter *param, Atom *atom) { + + DEBUG_MESSAGE("cpuFinalIntegrate start\n"); + for(int ci = 0; ci < atom->Nclusters_local; ci+=2) { + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base]; + MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; + + MD_SIMD_FLOAT dtforce = simd_broadcast(param->dtforce); + MD_SIMD_FLOAT vx_vector = simd_fma(simd_load(&ci_f[CL_X_OFFSET]), dtforce, simd_load(&ci_v[CL_X_OFFSET])); + MD_SIMD_FLOAT vy_vector = simd_fma(simd_load(&ci_f[CL_Y_OFFSET]), dtforce, simd_load(&ci_v[CL_Y_OFFSET])); + MD_SIMD_FLOAT vz_vector = simd_fma(simd_load(&ci_f[CL_Z_OFFSET]), dtforce, simd_load(&ci_v[CL_Z_OFFSET])); + simd_store(&ci_v[CL_X_OFFSET], vx_vector); + simd_store(&ci_v[CL_Y_OFFSET], vy_vector); + simd_store(&ci_v[CL_Z_OFFSET], vz_vector); + } DEBUG_MESSAGE("cpuFinalIntegrate end\n"); } @@ -54,3 +107,6 @@ void cpuFinalIntegrate(Parameter *param, Atom *atom) { void cudaInitialIntegrate(Parameter*, Atom*); void cudaFinalIntegrate(Parameter*, Atom*); #endif + + + \ No newline at end of file diff --git a/gromacs/includes/neighbor.h b/gromacs/includes/neighbor.h index 668edc8..2e8c6fd 100644 --- a/gromacs/includes/neighbor.h +++ b/gromacs/includes/neighbor.h @@ -25,6 +25,11 @@ #define NBNXN_INTERACTION_MASK_DIAG_J8_0 0xf0f8fcfeU #define NBNXN_INTERACTION_MASK_DIAG_J8_1 0x0080c0e0U +typedef struct { + int cluster; + int atom; +} Pair; + typedef struct { int every; int ncalls; @@ -34,8 +39,20 @@ typedef struct { int half_neigh; int* neighbors; unsigned int* neighbors_imask; + //MPI + /* + int Nshell; //# of atoms in listShell(Cluster here cover all possible ghost interactions) + int *numNeighShell; //# of neighs for each atom in listShell + Pair *neighshell; //list of neighs for each atom in listShell + Pair *listshell; //Atoms to compute the force + */ + int Nshell; //# of cluster in listShell(Cluster here cover all possible ghost interactions) + int *numNeighShell; //# of neighs for each atom in listShell + int *neighshell; //list of neighs for each atom in listShell + int *listshell; //Atoms to compute the force } Neighbor; + extern void initNeighbor(Neighbor*, Parameter*); extern void setupNeighbor(Parameter*, Atom*); extern void binatoms(Atom*); diff --git a/gromacs/includes/vtk.h b/gromacs/includes/vtk.h index 5311342..9299976 100644 --- a/gromacs/includes/vtk.h +++ b/gromacs/includes/vtk.h @@ -5,6 +5,8 @@ * license that can be found in the LICENSE file. */ #include +#include +#include #ifndef __VTK_H_ #define __VTK_H_ @@ -13,4 +15,5 @@ extern int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int t extern int write_ghost_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep); extern int write_local_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep); extern int write_ghost_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep); +extern void printvtk(const char* filename, Comm* comm, Atom* atom ,Parameter* param, int timestep); #endif diff --git a/gromacs/main.c b/gromacs/main.c index 91030eb..317e90c 100644 --- a/gromacs/main.c +++ b/gromacs/main.c @@ -5,9 +5,7 @@ * license that can be found in the LICENSE file. */ #include -#include #include -#include //-- #include //-- @@ -26,6 +24,10 @@ #include #include #include +#include +#include +#include +#include #define HLINE "----------------------------------------------------------------------------\n" @@ -42,17 +44,55 @@ extern void copyDataFromCUDADevice(Atom *atom); extern void cudaDeviceFree(); #endif -double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) { +double dynamicBalance(Comm* comm, Grid* grid, Atom* atom, Parameter* param, double time) +{ + double S, E; + int dims = 3; //TODO: Adjust to do in 3d and 2d + S = getTimeStamp(); + if(param->balance == RCB) { + rcbBalance(grid, atom, param, meanBisect,dims,0); + neighComm(comm, param, grid); + }else if(param->balance == meanTimeRCB){ + rcbBalance(grid, atom, param, meanTimeBisect,dims,time); + neighComm(comm, param, grid); + }else if(param->balance == Staggered) { + staggeredBalance(grid, atom, param, time); + neighComm(comm, param, grid); + exchangeComm(comm,atom); + }else { } //Do nothing + //printGrid(grid); + E = getTimeStamp(); + + return E-S; +} + +double initialBalance(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats, Comm *comm, Grid *grid) +{ + double E,S,time; + int me; + MPI_Comm_rank(world,&me); + S = getTimeStamp(); + if(param->balance == meanTimeRCB || param->balance == RCB){ + rcbBalance(grid, atom, param, meanBisect,3,0); + neighComm(comm, param, grid); + } + MPI_Allreduce(&atom->Nlocal, &atom->Natoms, 1, MPI_INT, MPI_SUM, world); + printf("Processor:%i, Local atoms:%i, Total atoms:%i\n",me, atom->Nlocal,atom->Natoms); + MPI_Barrier(world); + E = getTimeStamp(); + return E-S; +} + +double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats, Comm *comm, Grid *grid) { if(param->force_field == FF_EAM) { initEam(eam, param); } double S, E; param->lattice = pow((4.0 / param->rho), (1.0 / 3.0)); param->xprd = param->nx * param->lattice; param->yprd = param->ny * param->lattice; param->zprd = param->nz * param->lattice; - S = getTimeStamp(); initAtom(atom); - initPbc(atom); + //initPbc(atom); initStats(stats); initNeighbor(neighbor, param); if(param->input_file == NULL) { @@ -60,13 +100,18 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats * } else { readAtom(atom, param); } - + setupGrid(grid,atom,param); setupNeighbor(param, atom); + setupComm(comm, param, grid); + if(param->balance){ + initialBalance(param, eam, atom, neighbor, stats, comm, grid); + } setupThermo(param, atom->Natoms); if(param->input_file == NULL) { adjustThermo(param, atom); } buildClusters(atom); defineJClusters(atom); - setupPbc(atom, param); + //setupPbc(atom, param); + ghostNeighbor(comm, atom, param); //change binClusters(atom); buildNeighbor(atom, neighbor); initDevice(atom, neighbor); @@ -74,15 +119,15 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats * return E-S; } -double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) { +double reneighbour(Comm* comm, Parameter *param, Atom *atom, Neighbor *neighbor) { double S, E; S = getTimeStamp(); LIKWID_MARKER_START("reneighbour"); - updateSingleAtoms(atom); - updateAtomsPbc(atom, param); + //updateAtomsPbc(atom, param); buildClusters(atom); defineJClusters(atom); - setupPbc(atom, param); + //setupPbc(atom, param); + ghostNeighbor(comm, atom, param); binClusters(atom); buildNeighbor(atom, neighbor); LIKWID_MARKER_STOP("reneighbour"); @@ -90,15 +135,13 @@ double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) { return E-S; } -void printAtomState(Atom *atom) { - printf("Atom counts: Natoms=%d Nlocal=%d Nghost=%d Nmax=%d\n", - atom->Natoms, atom->Nlocal, atom->Nghost, atom->Nmax); - - /* int nall = atom->Nlocal + atom->Nghost; */ - - /* for (int i=0; ix[i], atom->y[i], atom->z[i]); */ - /* } */ +double updateAtoms(Comm* comm, Atom* atom){ + double S,E; + S = getTimeStamp(); + updateSingleAtoms(atom); + exchangeComm(comm, atom); + E = getTimeStamp(); + return E-S; } int main(int argc, char** argv) { @@ -108,7 +151,8 @@ int main(int argc, char** argv) { Neighbor neighbor; Stats stats; Parameter param; - + Comm comm; + Grid grid; LIKWID_MARKER_INIT; #pragma omp parallel { @@ -116,7 +160,7 @@ int main(int argc, char** argv) { //LIKWID_MARKER_REGISTER("reneighbour"); //LIKWID_MARKER_REGISTER("pbc"); } - + initComm(&argc, &argv, &comm); //change initParameter(¶m); for(int i = 0; i < argc; i++) { if((strcmp(argv[i], "-p") == 0) || (strcmp(argv[i], "--param") == 0)) { @@ -158,6 +202,24 @@ int main(int argc, char** argv) { param.half_neigh = atoi(argv[++i]); continue; } + if((strcmp(argv[i], "-method") == 0)) { + param.method = atoi(argv[++i]); + if (param.method>2 || param.method< 0){ + if(comm.myproc == 0) fprintf(stderr, "Method does not exist!\n"); + endComm(&comm); + exit(0); + } + continue; + } + if((strcmp(argv[i], "-bal") == 0)) { + param.balance = atoi(argv[++i]); + if (param.balance>3 || param.balance< 0){ + if(comm.myproc == 0) fprintf(stderr, "Load balance does not exist!\n"); + endComm(&comm); + exit(0); + } + continue; + } if((strcmp(argv[i], "-m") == 0) || (strcmp(argv[i], "--mass") == 0)) { param.mass = atof(argv[++i]); continue; @@ -188,6 +250,7 @@ int main(int argc, char** argv) { continue; } if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) { + //TODO: add the shell and ac print options printf("MD Bench: A minimalistic re-implementation of miniMD\n"); printf(HLINE); printf("-p : file to read parameters from (can be specified more than once)\n"); @@ -205,98 +268,101 @@ int main(int argc, char** argv) { exit(EXIT_SUCCESS); } } + + if(param.balance>0 && param.method == 1){ + if(comm.myproc == 0) fprintf(stderr, "Half Shell is not supported by load balance!\n"); + endComm(&comm); + exit(0); + } param.cutneigh = param.cutforce + param.skin; - setup(¶m, &eam, &atom, &neighbor, &stats); - printParameter(¶m); - printf(HLINE); - - printf("step\ttemp\t\tpressure\n"); + timer[SETUP]=setup(¶m, &eam, &atom, &neighbor, &stats, &comm, &grid); + if(comm.myproc == 0) printParameter(¶m); + if(comm.myproc == 0) printf(HLINE); + if(comm.myproc == 0) printf("step\ttemp\t\tpressure\n"); computeThermo(0, ¶m, &atom); #if defined(MEM_TRACER) || defined(INDEX_TRACER) traceAddresses(¶m, &atom, &neighbor, n + 1); #endif - #ifdef CUDA_TARGET copyDataToCUDADevice(&atom); #endif - if(param.force_field == FF_EAM) { timer[FORCE] = computeForceEam(&eam, ¶m, &atom, &neighbor, &stats); } else { timer[FORCE] = computeForceLJ(¶m, &atom, &neighbor, &stats); } - - timer[NEIGH] = 0.0; + timer[NEIGH] = 0.0; + timer[FORWARD] = 0.0; + timer[UPDATE] = 0.0; + timer[BALANCE] = 0.0; + timer[REVERSE] = reverse(&comm, &atom, ¶m); + MPI_Barrier(world); timer[TOTAL] = getTimeStamp(); - if(param.vtk_file != NULL) { - write_data_to_vtk_file(param.vtk_file, &atom, 0); + //write_data_to_vtk_file(param.vtk_file, &comm ,&atom, 0); + printvtk(param.vtk_file, &comm, &atom, ¶m, 0); } - + //TODO: modify xct if(param.xtc_file != NULL) { xtc_init(param.xtc_file, &atom, 0); } - - for(int n = 0; n < param.ntimes; n++) { + double forceTime=0.0; + double commTime=0.0; + for(int n = 0; n < param.ntimes; n++) { initialIntegrate(¶m, &atom); - if((n + 1) % param.reneigh_every) { - if(!((n + 1) % param.prune_every)) { + timer[FORWARD]+=forward(&comm, &atom, ¶m); + if(!((n + 1) % param.prune_every)){ pruneNeighbor(¶m, &atom, &neighbor); } - - updatePbc(&atom, ¶m, 0); } else { #ifdef CUDA_TARGET copyDataFromCUDADevice(&atom); #endif - - timer[NEIGH] += reneighbour(¶m, &atom, &neighbor); - + timer[UPDATE] +=updateAtoms(&comm,&atom); + if(param.balance && !((n+1)%param.balance_every)) + timer[BALANCE] +=dynamicBalance(&comm, &grid, &atom , ¶m, timer[FORCE]); + timer[NEIGH] += reneighbour(&comm, ¶m, &atom, &neighbor); #ifdef CUDA_TARGET copyDataToCUDADevice(&atom); isReneighboured = 1; #endif } - #if defined(MEM_TRACER) || defined(INDEX_TRACER) traceAddresses(¶m, &atom, &neighbor, n + 1); #endif - if(param.force_field == FF_EAM) { - timer[FORCE] += computeForceEam(&eam, ¶m, &atom, &neighbor, &stats); + timer[FORCE] += computeForceEam(&eam, ¶m, &atom, &neighbor, &stats); } else { timer[FORCE] += computeForceLJ(¶m, &atom, &neighbor, &stats); - } - + } + timer[REVERSE] += reverse(&comm, &atom, ¶m); finalIntegrate(¶m, &atom); - if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) { computeThermo(n + 1, ¶m, &atom); } - int write_pos = !((n + 1) % param.x_out_every); int write_vel = !((n + 1) % param.v_out_every); if(write_pos || write_vel) { if(param.vtk_file != NULL) { - write_data_to_vtk_file(param.vtk_file, &atom, n + 1); + printvtk(param.vtk_file, &comm, &atom, ¶m, n+1); } - + //TODO: xtc file if(param.xtc_file != NULL) { xtc_write(&atom, n + 1, write_pos, write_vel); } - } + } } #ifdef CUDA_TARGET copyDataFromCUDADevice(&atom); #endif - + MPI_Barrier(world); timer[TOTAL] = getTimeStamp() - timer[TOTAL]; - updateSingleAtoms(&atom); + updateAtoms(&comm,&atom); computeThermo(-1, ¶m, &atom); - + //TODO: if(param.xtc_file != NULL) { xtc_end(); } @@ -304,41 +370,35 @@ int main(int argc, char** argv) { #ifdef CUDA_TARGET cudaDeviceFree(); #endif - - printf(HLINE); - printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes); - printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n", - timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]); - printf(HLINE); + double mint[NUMTIMER]; + double maxt[NUMTIMER]; + double sumt[NUMTIMER]; + timer[REST] = timer[TOTAL]-timer[FORCE]-timer[NEIGH]-timer[BALANCE]-timer[FORWARD]-timer[REVERSE]; + MPI_Reduce(timer,mint,NUMTIMER,MPI_DOUBLE,MPI_MIN,0,world); + MPI_Reduce(timer,maxt,NUMTIMER,MPI_DOUBLE,MPI_MAX,0,world); + MPI_Reduce(timer,sumt,NUMTIMER,MPI_DOUBLE,MPI_SUM,0,world); + int Nghost; + MPI_Reduce(&atom.Nghost,&Nghost,1,MPI_INT,MPI_SUM,0,world); - int nthreads = 0; - int chunkSize = 0; - omp_sched_t schedKind; - char schedType[10]; -#pragma omp parallel -#pragma omp master - { - omp_get_schedule(&schedKind, &chunkSize); - - switch (schedKind) - { - case omp_sched_static: strcpy(schedType, "static"); break; - case omp_sched_dynamic: strcpy(schedType, "dynamic"); break; - case omp_sched_guided: strcpy(schedType, "guided"); break; - case omp_sched_auto: strcpy(schedType, "auto"); break; - } - - nthreads = omp_get_max_threads(); - } - - printf("Num threads: %d\n", nthreads); - printf("Schedule: (%s,%d)\n", schedType, chunkSize); - - printf("Performance: %.2f million atom updates per second\n", - 1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]); - #ifdef COMPUTE_STATS + if(comm.myproc == 0){ + int n = comm.numproc; + printf(HLINE); + printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, Nghost, param.ntimes); + printf("TOTAL %.2fs\n\n",timer[TOTAL]); + printf("%4s|%7s|%7s|%7s|%7s|%7s|%7s|%7s|%7s|\n","","FORCE ", "NEIGH ", "BALANCE", "FORWARD", "REVERSE","UPDATE","REST ","SETUP"); + printf("----|-------|-------|-------|-------|-------|-------|-------|-------|\n"); + printf("%4s|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|\n", "AVG", sumt[FORCE]/n,sumt[NEIGH]/n,sumt[BALANCE]/n,sumt[FORWARD]/n,sumt[REVERSE]/n,sumt[UPDATE]/n,sumt[REST]/n,sumt[SETUP]/n); + printf("%4s|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|\n", "MIN", mint[FORCE],mint[NEIGH],mint[BALANCE],mint[FORWARD],mint[REVERSE],mint[UPDATE],mint[REST],mint[SETUP]); + printf("%4s|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|\n", "MAX", maxt[FORCE],maxt[NEIGH],maxt[BALANCE],maxt[FORWARD],maxt[REVERSE],maxt[UPDATE],maxt[REST],maxt[SETUP]); + printf(HLINE); + printf("Performance: %.2f million atom updates per second\n", + 1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]); + +#ifdef COMPUTE_STATS displayStatistics(&atom, ¶m, &stats, timer); - #endif +#endif + } + endComm(&comm); LIKWID_MARKER_CLOSE; return EXIT_SUCCESS; } diff --git a/gromacs/neighbor.c b/gromacs/neighbor.c index ba9fc82..cd371b7 100644 --- a/gromacs/neighbor.c +++ b/gromacs/neighbor.c @@ -7,15 +7,15 @@ #include #include #include - #include #include #include #include +#include #define SMALL 1.0e-6 #define FACTOR 0.999 - +#define eps 1.0e-9 static MD_FLOAT xprd, yprd, zprd; static MD_FLOAT bininvx, bininvy; static int mbinxlo, mbinylo; @@ -34,9 +34,16 @@ static int nmax; static int nstencil; // # of bins in stencil static int* stencil; // stencil list of bin offsets static MD_FLOAT binsizex, binsizey; +int me; //rank +int method; // method +int shellMethod; //If shell method exist static int coord2bin(MD_FLOAT, MD_FLOAT); static MD_FLOAT bindist(int, int); +//static int ghostZone(Atom*, int); +static int halfZoneCluster(Atom*,int); +static int ghostClusterinRange(Atom*, int, int, MD_FLOAT); +static void neighborGhost(Atom*, Neighbor*); /* exported subroutines */ void initNeighbor(Neighbor *neighbor, Parameter *param) { @@ -53,12 +60,25 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) { bincount = NULL; bin_clusters = NULL; bin_nclusters = NULL; - neighbor->half_neigh = param->half_neigh; - neighbor->maxneighs = 100; + neighbor->maxneighs = 200; neighbor->numneigh = NULL; neighbor->numneigh_masked = NULL; neighbor->neighbors = NULL; neighbor->neighbors_imask = NULL; + //MPI + shellMethod = 0; + method = param->method; + if(method == halfShell || method == eightShell){ + param->half_neigh = 1; + shellMethod = 1; + } + me = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &me); + neighbor->half_neigh = param->half_neigh; + neighbor->Nshell = 0; + neighbor->numNeighShell = NULL; + neighbor->neighshell = NULL; + neighbor->listshell = NULL; } void setupNeighbor(Parameter *param, Atom *atom) { @@ -77,7 +97,7 @@ void setupNeighbor(Parameter *param, Atom *atom) { MD_FLOAT ylo = 0.0; MD_FLOAT yhi = yprd; MD_FLOAT zlo = 0.0; MD_FLOAT zhi = zprd; - MD_FLOAT atom_density = ((MD_FLOAT)(atom->Nlocal)) / ((xhi - xlo) * (yhi - ylo) * (zhi - zlo)); + MD_FLOAT atom_density = ((MD_FLOAT)(atom->Natoms)) / ((xhi - xlo) * (yhi - ylo) * (zhi - zlo)); MD_FLOAT atoms_in_cell = MAX(CLUSTER_M, CLUSTER_N); MD_FLOAT targetsizex = cbrt(atoms_in_cell / atom_density); MD_FLOAT targetsizey = cbrt(atoms_in_cell / atom_density); @@ -146,6 +166,7 @@ void setupNeighbor(Parameter *param, Atom *atom) { } MD_FLOAT getBoundingBoxDistanceSq(Atom *atom, int ci, int cj) { + MD_FLOAT dl = atom->iclusters[ci].bbminx - atom->jclusters[cj].bbmaxx; MD_FLOAT dh = atom->jclusters[cj].bbminx - atom->iclusters[ci].bbmaxx; MD_FLOAT dm = MAX(dl, dh); @@ -163,6 +184,7 @@ MD_FLOAT getBoundingBoxDistanceSq(Atom *atom, int ci, int cj) { dm = MAX(dl, dh); dm0 = MAX(dm, 0.0); d2 += dm0 * dm0; + return d2; } @@ -225,7 +247,6 @@ static unsigned int get_imask_simd_j8(int rdiag, int ci, int cj) { void buildNeighbor(Atom *atom, Neighbor *neighbor) { DEBUG_MESSAGE("buildNeighbor start\n"); - /* extend atom arrays if necessary */ if(atom->Nclusters_local > nmax) { nmax = atom->Nclusters_local; @@ -249,7 +270,6 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { while(resize) { int new_maxneighs = neighbor->maxneighs; resize = 0; - for(int ci = 0; ci < atom->Nclusters_local; ci++) { int ci_cj1 = CJ1_FROM_CI(ci); int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]); @@ -262,14 +282,12 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { MD_FLOAT ibb_ymax = atom->iclusters[ci].bbmaxy; MD_FLOAT ibb_zmin = atom->iclusters[ci].bbminz; MD_FLOAT ibb_zmax = atom->iclusters[ci].bbmaxz; - for(int k = 0; k < nstencil; k++) { int jbin = ibin + stencil[k]; int *loc_bin = &bin_clusters[jbin * clusters_per_bin]; int cj, m = -1; MD_FLOAT jbb_xmin, jbb_xmax, jbb_ymin, jbb_ymax, jbb_zmin, jbb_zmax; const int c = bin_nclusters[jbin]; - if(c > 0) { MD_FLOAT dl, dh, dm, dm0, d_bb_sq; @@ -279,6 +297,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { if(neighbor->half_neigh && ci_cj1 > cj) { continue; } + jbb_zmin = atom->jclusters[cj].bbminz; jbb_zmax = atom->jclusters[cj].bbmaxz; dl = ibb_zmin - jbb_zmax; @@ -287,7 +306,6 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { dm0 = MAX(dm, 0.0); d_bb_sq = dm0 * dm0; } while(m + 1 < c && d_bb_sq > cutneighsq); - jbb_xmin = atom->jclusters[cj].bbminx; jbb_xmax = atom->jclusters[cj].bbmaxx; jbb_ymin = atom->jclusters[cj].bbminy; @@ -357,7 +375,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { jbb_zmin = atom->jclusters[cj].bbminz; jbb_zmax = atom->jclusters[cj].bbmaxz; } - } + } } } @@ -383,14 +401,15 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { if(resize) { neighbor->maxneighs = new_maxneighs * 1.2; - fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs); + fprintf(stdout, "RESIZE %d, PROC %d\n", neighbor->maxneighs,me); free(neighbor->neighbors); free(neighbor->neighbors_imask); neighbor->neighbors = (int *) malloc(nmax * neighbor->maxneighs * sizeof(int)); neighbor->neighbors_imask = (unsigned int *) malloc(nmax * neighbor->maxneighs * sizeof(unsigned int)); } } - + if(method == eightShell) neighborGhost(atom, neighbor); + /* DEBUG_MESSAGE("\ncutneighsq = %f, rbb_sq = %f\n", cutneighsq, rbb_sq); for(int ci = 0; ci < 6; ci++) { @@ -511,46 +530,44 @@ int coord2bin(MD_FLOAT xin, MD_FLOAT yin) { int ix, iy; if(xin >= xprd) { - ix = (int)((xin - xprd) * bininvx) + nbinx - mbinxlo; + ix = (int)((xin + eps - xprd) * bininvx) + nbinx - mbinxlo; } else if(xin >= 0.0) { - ix = (int)(xin * bininvx) - mbinxlo; + ix = (int)((xin+eps) * bininvx) - mbinxlo; } else { - ix = (int)(xin * bininvx) - mbinxlo - 1; + ix = (int)((xin+eps) * bininvx) - mbinxlo - 1; } - if(yin >= yprd) { - iy = (int)((yin - yprd) * bininvy) + nbiny - mbinylo; + if(yin >= yprd) { + iy = (int)(((yin+eps) - yprd) * bininvy) + nbiny - mbinylo; } else if(yin >= 0.0) { - iy = (int)(yin * bininvy) - mbinylo; + iy = (int)((yin+eps) * bininvy) - mbinylo; } else { - iy = (int)(yin * bininvy) - mbinylo - 1; + iy = (int)((yin+eps) * bininvy) - mbinylo - 1; } - + return (iy * mbinx + ix + 1); } void coord2bin2D(MD_FLOAT xin, MD_FLOAT yin, int *ix, int *iy) { if(xin >= xprd) { - *ix = (int)((xin - xprd) * bininvx) + nbinx - mbinxlo; + *ix = (int)((xin + eps - xprd) * bininvx) + nbinx - mbinxlo; } else if(xin >= 0.0) { - *ix = (int)(xin * bininvx) - mbinxlo; + *ix = (int)((xin+eps) * bininvx) - mbinxlo; } else { - *ix = (int)(xin * bininvx) - mbinxlo - 1; + *ix = (int)((xin+eps) * bininvx) - mbinxlo - 1; } - - if(yin >= yprd) { - *iy = (int)((yin - yprd) * bininvy) + nbiny - mbinylo; + if(yin >= yprd) { + *iy = (int)((yin + eps - yprd) * bininvy) + nbiny - mbinylo; } else if(yin >= 0.0) { - *iy = (int)(yin * bininvy) - mbinylo; + *iy = (int)((yin+eps) * bininvy) - mbinylo; } else { - *iy = (int)(yin * bininvy) - mbinylo - 1; + *iy = (int)((yin+eps) * bininvy) - mbinylo - 1; } } void binAtoms(Atom *atom) { DEBUG_MESSAGE("binAtoms start\n"); int resize = 1; - while(resize > 0) { resize = 0; @@ -567,7 +584,7 @@ void binAtoms(Atom *atom) { resize = 1; } } - + if(resize) { free(bins); atoms_per_bin *= 2; @@ -615,8 +632,7 @@ void buildClusters(Atom *atom) { /* bin local atoms */ binAtoms(atom); - sortAtomsByZCoord(atom); - + sortAtomsByZCoord(atom); for(int bin = 0; bin < mbins; bin++) { int c = bincount[bin]; int ac = 0; @@ -688,6 +704,9 @@ void buildClusters(Atom *atom) { void defineJClusters(Atom *atom) { DEBUG_MESSAGE("defineJClusters start\n"); + const int jfac = MAX(1, CLUSTER_N / CLUSTER_M); + atom->ncj = atom->Nclusters_local / jfac; + for(int ci = 0; ci < atom->Nclusters_local; ci++) { int cj0 = CJ0_FROM_CI(ci); @@ -830,12 +849,11 @@ void binClusters(Atom *atom) { } } } - for(int cg = 0; cg < atom->Nclusters_ghost && !resize; cg++) { const int cj = ncj + cg; int ix = -1, iy = -1; MD_FLOAT xtmp, ytmp; - + if(shellMethod == halfShell && !halfZoneCluster(atom, cj)) continue; if(atom->jclusters[cj].natoms > 0) { int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; @@ -846,6 +864,7 @@ void binClusters(Atom *atom) { coord2bin2D(xtmp, ytmp, &ix, &iy); ix = MAX(MIN(ix, mbinx - 1), 0); iy = MAX(MIN(iy, mbiny - 1), 0); + for(int cjj = 1; cjj < atom->jclusters[cj].natoms; cjj++) { int nix, niy; xtmp = cj_x[CL_X_OFFSET + cjj]; @@ -853,7 +872,7 @@ void binClusters(Atom *atom) { coord2bin2D(xtmp, ytmp, &nix, &niy); nix = MAX(MIN(nix, mbinx - 1), 0); niy = MAX(MIN(niy, mbiny - 1), 0); - + // Always put the cluster on the bin of its innermost atom so // the cluster should be closer to local clusters if(atom->PBCx[cg] > 0 && ix > nix) { ix = nix; } @@ -861,7 +880,6 @@ void binClusters(Atom *atom) { if(atom->PBCy[cg] > 0 && iy > niy) { iy = niy; } if(atom->PBCy[cg] < 0 && iy < niy) { iy = niy; } } - int bin = iy * mbinx + ix + 1; int c = bin_nclusters[bin]; if(c < clusters_per_bin) { @@ -883,25 +901,21 @@ void binClusters(Atom *atom) { break; } } - if(!inserted) { bin_clusters[bin * clusters_per_bin + c] = cj; } - bin_nclusters[bin]++; } else { resize = 1; } } } - if(resize) { free(bin_clusters); clusters_per_bin *= 2; bin_clusters = (int*) malloc(mbins * clusters_per_bin * sizeof(int)); } } - /* DEBUG_MESSAGE("bin_nclusters\n"); for(int i = 0; i < mbins; i++) { DEBUG_MESSAGE("%d, ", bin_nclusters[i]); } @@ -919,7 +933,6 @@ void updateSingleAtoms(Atom *atom) { int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base]; - for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) { atom_x(Natom) = ci_x[CL_X_OFFSET + cii]; atom_y(Natom) = ci_x[CL_Y_OFFSET + cii]; @@ -928,12 +941,174 @@ void updateSingleAtoms(Atom *atom) { atom->vy[Natom] = ci_v[CL_Y_OFFSET + cii]; atom->vz[Natom] = ci_v[CL_Z_OFFSET + cii]; Natom++; - } + } } - if(Natom != atom->Nlocal) { fprintf(stderr, "updateSingleAtoms(): Number of atoms changed!\n"); } DEBUG_MESSAGE("updateSingleAtoms stop\n"); } + +//MPI Shell Methods + +static int eightZoneCluster(Atom* atom, int cj) +{ + //Mapping: 0->0, 1->1, 2->2, 3->6, 4->3, 5->5, 6->4, 7->7 + int zoneMapping[] = {0, 1, 2, 6, 3, 5, 4, 7}; + int zone = 0; + MD_FLOAT *hi = atom->mybox.hi; + + if (atom->jclusters[cj].bbminx +eps >=hi[_x]){ + zone += 1; + } + if (atom->jclusters[cj].bbminy +eps >=hi[_y]){ + zone += 2; + } + if (atom->jclusters[cj].bbminz +eps >=hi[_z]){ + zone += 4; + } + return zoneMapping[zone]; +} + +static int halfZoneCluster(Atom* atom, int cj) +{ + MD_FLOAT *hi = atom->mybox.hi; + MD_FLOAT *lo = atom->mybox.lo; + + if(atom->jclusters[cj].bbmaxx < lo[_x] && atom->jclusters[cj].bbmaxy < hi[_y] && + atom->jclusters[cj].bbmaxz < hi[_z]){ + return 0; + } else if(atom->jclusters[cj].bbmaxy < lo[_y] && atom->jclusters[cj].bbmaxz < hi[_z]){ + return 0; + } else if(atom->jclusters[cj].bbmaxz < lo[_z]){ + return 0; + } else { + return 1; + } +} + +int BoxGhostDistance(Atom *atom, int ci, int cj) { + + MD_FLOAT dl = atom->jclusters[ci].bbminx - atom->jclusters[cj].bbmaxx; + MD_FLOAT dh = atom->jclusters[cj].bbminx - atom->jclusters[ci].bbmaxx; + MD_FLOAT dm = MAX(dl, dh); + MD_FLOAT dm0 = MAX(dm, 0.0); + MD_FLOAT dx2 = dm0 * dm0; + + dl = atom->jclusters[ci].bbminy - atom->jclusters[cj].bbmaxy; + dh = atom->jclusters[cj].bbminy - atom->jclusters[ci].bbmaxy; + dm = MAX(dl, dh); + dm0 = MAX(dm, 0.0); + MD_FLOAT dy2 = dm0 * dm0; + + dl = atom->jclusters[ci].bbminz - atom->jclusters[cj].bbmaxz; + dh = atom->jclusters[cj].bbminz - atom->jclusters[ci].bbmaxz; + dm = MAX(dl, dh); + dm0 = MAX(dm, 0.0); + MD_FLOAT dz2 = dm0 * dm0; + + return dx2 > cutneighsq ? 0 : dy2 > cutneighsq ? 0 : dz2 > cutneighsq ? 0 : 1; +} + +static int ghostClusterinRange(Atom *atom, int cs, int cg, MD_FLOAT rsq) { + int cs_vec_base = CJ_VECTOR_BASE_INDEX(cs); + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cg); + MD_FLOAT *cs_x = &atom->cl_x[cs_vec_base]; + MD_FLOAT *cg_x = &atom->cl_x[cj_vec_base]; + + for(int cii = 0; cii < atom->jclusters[cs].natoms; cii++) { + for(int cjj = 0; cjj < atom->jclusters[cg].natoms; cjj++) { + MD_FLOAT delx = cs_x[CL_X_OFFSET + cii] - cg_x[CL_X_OFFSET + cjj]; + MD_FLOAT dely = cs_x[CL_Y_OFFSET + cii] - cg_x[CL_Y_OFFSET + cjj]; + MD_FLOAT delz = cs_x[CL_Z_OFFSET + cii] - cg_x[CL_Z_OFFSET + cjj]; + if(delx * delx + dely * dely + delz * delz < rsq) { + return 1; + } + } + } + return 0; +} + +static void neighborGhost(Atom *atom, Neighbor *neighbor) { + int Nshell=0; + int Ncluster_local = atom->Nclusters_local; + int Nclusterghost = atom->Nclusters_ghost; + if(neighbor->listshell) free(neighbor->listshell); + neighbor->listshell = (int*) malloc(Nclusterghost * sizeof(int)); + int* listzone = (int*) malloc(8 * Nclusterghost * sizeof(int)); + int countCluster[8] = {0,0,0,0,0,0,0,0}; + + //Selecting ghost atoms for interaction and putting them into regions + for(int cg = atom->ncj; cg < atom->ncj+Nclusterghost; cg++) { + int czone = eightZoneCluster(atom,cg); + int *list = &listzone[Nclusterghost*czone]; + int n = countCluster[czone]; + list[n] = cg; + countCluster[czone]++; + //It is only necessary to find neighbour particles for 3 regions + //if(czone == 1 || czone == 2 || czone == 3) + //neighbor->listshell[Nshell++] = cg; + } + + for(int zone = 1; zone<=3; zone++){ + int *list = &listzone[Nclusterghost*zone]; + for(int n=0; nlistshell[Nshell++] = list[n]; + } + + neighbor->Nshell = Nshell; + if(neighbor->numNeighShell) free(neighbor->numNeighShell); + if(neighbor->neighshell) free(neighbor->neighshell); + neighbor->neighshell = (int*) malloc(Nshell * neighbor->maxneighs * sizeof(int)); + neighbor->numNeighShell = (int*) malloc(Nshell * sizeof(int)); + + int resize = 1; + + while(resize) + { + resize = 0; + for(int ic = 0; ic < Nshell; ic++) { + int *neighshell = &(neighbor->neighshell[ic*neighbor->maxneighs]); + int n = 0; + int icluster = neighbor->listshell[ic]; + int iczone = eightZoneCluster(atom, icluster); + + for(int jczone=0; jczone<8; jczone++){ + + if(jczone <=iczone) continue; + if(iczone == 1 && (jczone==5||jczone==6||jczone==7)) continue; + if(iczone == 2 && (jczone==4||jczone==6||jczone==7)) continue; + if(iczone == 3 && (jczone==4||jczone==5||jczone==7)) continue; + + int Ncluster = countCluster[jczone]; + int* loc_zone = &listzone[jczone * Nclusterghost]; + + for(int k = 0; k < Ncluster ; k++) { + int jcluster = loc_zone[k]; + + if(BoxGhostDistance(atom, icluster, jcluster)) + { + if(ghostClusterinRange(atom, icluster, jcluster, cutneighsq)) + neighshell[n++] = jcluster; + } + + } + } + neighbor->numNeighShell[ic] = n; + + if(n >= neighbor->maxneighs){ + resize = 1; + neighbor->maxneighs = n * 1.2; + fprintf(stdout, "RESIZE EIGHT SHELL %d, PROC %d\n", neighbor->maxneighs,me); + break; + } + } + + if(resize) { + free(neighbor->neighshell); + neighbor->neighshell = (int*) malloc(Nshell * neighbor->maxneighs * sizeof(int)); + } + } + free(listzone); +} diff --git a/gromacs/vtk.c b/gromacs/vtk.c index b683dc8..cfd2540 100644 --- a/gromacs/vtk.c +++ b/gromacs/vtk.c @@ -9,6 +9,11 @@ #include #include +#include +#include + +static MPI_File _fh; +static inline void flushBuffer(char*); void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep) { write_local_atoms_to_vtk_file(filename, atom, timestep); @@ -188,3 +193,128 @@ int write_ghost_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int fclose(fp); return 0; } + +int vtkOpen(const char* filename, Comm* comm, Atom* atom ,int timestep) +{ + char msg[256]; + char timestep_filename[128]; + snprintf(timestep_filename, sizeof timestep_filename, "%s_%d.vtk", filename, timestep); + MPI_File_open(MPI_COMM_WORLD, timestep_filename, MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &_fh); + if(_fh == MPI_FILE_NULL) { + if(comm->myproc == 0) fprintf(stderr, "Could not open VTK file for writing!\n"); + return -1; + } + + if (comm->myproc==0){ + sprintf(msg, "# vtk DataFile Version 2.0\n"); + sprintf(msg, "%sParticle data\n",msg); + sprintf(msg, "%sASCII\n",msg); + sprintf(msg, "%sDATASET UNSTRUCTURED_GRID\n",msg); + sprintf(msg, "%sPOINTS %d double\n",msg, atom->Natoms); + flushBuffer(msg); + } +} + +int vtkVector(Comm* comm, Atom* atom, Parameter* param) +{ + if (_fh == MPI_FILE_NULL) { + if(comm->myproc==0) printf("vtk not initialize! Call vtkOpen first!\n"); + return -1; + } + + int sizeline= 25; //#initial guess of characters in "%.4f %.4f %.4f\n" + int extrabuff = 100; + int sizebuff = sizeline*atom->Nlocal+extrabuff; + int mysize = 0; + char* msg = (char*) malloc(sizebuff); + sprintf(msg, ""); + for(int i = 0; i < atom->Nlocal; i++){ + if(mysize+extrabuff >= sizebuff){ + sizebuff*= 1.5; + msg = (char*) realloc(msg, sizebuff); + } + //TODO: do not forget to add param->xlo, param->ylo, param->zlo + sprintf(msg, "%s%.4f %.4f %.4f\n",msg, atom_x(i), atom_y(i), atom_z(i)); + mysize = strlen(msg); + } + int gatherSize[comm->numproc]; + + MPI_Allgather(&mysize, 1, MPI_INT, gatherSize, 1, MPI_INT, MPI_COMM_WORLD); + int offset=0; + int globalSize = 0; + + for(int i = 0; i < comm->myproc; i++) + offset+= gatherSize[i]; + + for(int i = 0; i < comm->numproc; i++) + globalSize+= gatherSize[i]; + + MPI_Offset displ; + MPI_Datatype FileType; + int GlobalSize[] = {globalSize}; + int LocalSize[] = {mysize}; + int Start[] = {offset}; + + if(LocalSize[0]>0){ + MPI_Type_create_subarray(1, GlobalSize, LocalSize, Start, MPI_ORDER_C, MPI_CHAR, &FileType); + } else { + MPI_Type_vector(0,0,0,MPI_CHAR,&FileType); + } + MPI_Type_commit(&FileType); + MPI_File_get_size(_fh, &displ); + MPI_File_set_view(_fh, displ, MPI_CHAR, FileType, "native", MPI_INFO_NULL); + MPI_File_write_all (_fh, msg, mysize , MPI_CHAR ,MPI_STATUS_IGNORE); + MPI_Barrier(MPI_COMM_WORLD); + MPI_File_set_view(_fh,0,MPI_CHAR, MPI_CHAR, "native", MPI_INFO_NULL); + + if (comm->myproc==0){ + + sprintf(msg, "\n\n"); + sprintf(msg, "%sCELLS %d %d\n", msg, atom->Natoms, atom->Natoms * 2); + + for(int i = 0; i < atom->Natoms; i++) + sprintf(msg, "%s1 %d\n", msg, i); + flushBuffer(msg); + + sprintf(msg, "\n\n"); + sprintf(msg, "%sCELL_TYPES %d\n",msg, atom->Natoms); + for(int i = 0; i < atom->Natoms; i++) + sprintf(msg, "%s1\n",msg); + flushBuffer(msg); + + sprintf(msg, "\n\n"); + sprintf(msg, "%sPOINT_DATA %d\n",msg,atom->Natoms); + sprintf(msg, "%sSCALARS mass double\n",msg); + sprintf(msg, "%sLOOKUP_TABLE default\n",msg); + for(int i = 0; i < atom->Natoms; i++) + sprintf(msg, "%s1.0\n",msg); + sprintf(msg, "%s\n\n",msg); + flushBuffer(msg); + } +} + +void vtkClose() +{ + MPI_File_close(&_fh); + _fh=MPI_FILE_NULL; +} + +//TODO: print ghost and cluster using MPI +void printvtk(const char* filename, Comm* comm, Atom* atom ,Parameter* param, int timestep) +{ + if(comm->numproc == 1) + { + write_data_to_vtk_file(filename, atom, timestep); + return; + } + + vtkOpen(filename, comm, atom, timestep); + vtkVector(comm, atom, param); + vtkClose(); +} + +static inline void flushBuffer(char* msg){ + MPI_Offset displ; + MPI_File_get_size(_fh, &displ); + MPI_File_write_at(_fh, displ, msg, strlen(msg), MPI_CHAR, MPI_STATUS_IGNORE); +} \ No newline at end of file diff --git a/include_MPIICC.mk b/include_MPIICC.mk new file mode 100644 index 0000000..8643b19 --- /dev/null +++ b/include_MPIICC.mk @@ -0,0 +1,32 @@ +CC = mpiicc +LINKER = $(CC) + +OPENMP = #-qopenmp +PROFILE = #-profile-functions -g -pg + +ifeq ($(ISA),AVX512) +OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE) #-g -debug +endif + +ifeq ($(ISA),AVX2) +OPTS = -Ofast -xCORE-AVX2 $(PROFILE) +#OPTS = -Ofast -xAVX2 $(PROFILE) +#OPTS = -Ofast -march=core-avx2 $(PROFILE) +endif + +ifeq ($(ISA),AVX) +OPTS = -Ofast -xAVX $(PROFILE) +endif + +ifeq ($(ISA),SSE) +OPTS = -Ofast -xSSE4.2 $(PROFILE) +endif + +#OPTS = -Ofast -no-vec $(PROFILE) +#OPTS = -Ofast -xHost $(PROFILE) +CFLAGS = $(PROFILE) -restrict $(OPENMP) $(OPTS) +ASFLAGS = #-masm=intel +LFLAGS = $(PROFILE) $(OPTS) $(OPENMP) +DEFINES = -std=c11 -pedantic-errors -D_GNU_SOURCE -DNO_ZMM_INTRIN +INCLUDES = +LIBS = -lm diff --git a/lammps/atom.c b/lammps/atom.c index 5dd0cdf..1f4f3f6 100644 --- a/lammps/atom.c +++ b/lammps/atom.c @@ -9,10 +9,12 @@ #include #include +#include #include #include #include #include +#include #define DELTA 20000 @@ -21,10 +23,10 @@ #endif #ifndef MAX -#define MAX(a,b) ((a) > (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) #endif -void initAtom(Atom *atom) { +void initAtom(Atom *atom){ atom->x = NULL; atom->y = NULL; atom->z = NULL; atom->vx = NULL; atom->vy = NULL; atom->vz = NULL; atom->fx = NULL; atom->fy = NULL; atom->fz = NULL; @@ -41,6 +43,7 @@ void initAtom(Atom *atom) { atom->radius = NULL; atom->av = NULL; atom->r = NULL; + atom->border_map = NULL; DeviceAtom *d_atom = &(atom->d_atom); d_atom->x = NULL; d_atom->y = NULL; d_atom->z = NULL; @@ -52,12 +55,19 @@ void initAtom(Atom *atom) { d_atom->sigma6 = NULL; d_atom->cutforcesq = NULL; d_atom->cutneighsq = NULL; + //MPI + Box *mybox = &(atom->mybox); + mybox->xprd = mybox->yprd = mybox->zprd = 0; + mybox->lo[_x] = mybox->lo[_y] = mybox->lo[_z] = 0; + mybox->hi[_x] = mybox->hi[_y] = mybox->hi[_z] = 0; } void createAtom(Atom *atom, Parameter *param) { - MD_FLOAT xlo = 0.0; MD_FLOAT xhi = param->xprd; - MD_FLOAT ylo = 0.0; MD_FLOAT yhi = param->yprd; - MD_FLOAT zlo = 0.0; MD_FLOAT zhi = param->zprd; + + MD_FLOAT xlo = 0; MD_FLOAT xhi = param->xprd; + MD_FLOAT ylo = 0; MD_FLOAT yhi = param->yprd; + MD_FLOAT zlo = 0; MD_FLOAT zhi = param->zprd; + atom->Natoms = 4 * param->nx * param->ny * param->nz; atom->Nlocal = 0; atom->ntypes = param->ntypes; @@ -107,15 +117,15 @@ void createAtom(Atom *atom, Parameter *param) { xtmp = 0.5 * alat * i; ytmp = 0.5 * alat * j; ztmp = 0.5 * alat * k; - + if( xtmp >= xlo && xtmp < xhi && ytmp >= ylo && ytmp < yhi && ztmp >= zlo && ztmp < zhi ) { - + n = k * (2 * param->ny) * (2 * param->nx) + j * (2 * param->nx) + i + 1; - + for(m = 0; m < 5; m++) { myrandom(&n); } @@ -131,7 +141,7 @@ void createAtom(Atom *atom, Parameter *param) { } vztmp = myrandom(&n); - if(atom->Nlocal == atom->Nmax) { + while(atom->Nlocal >= atom->Nmax) { growAtom(atom); } @@ -163,38 +173,42 @@ int type_str2int(const char *type) { return -1; } -int readAtom(Atom* atom, Parameter* param) { +int readAtom(Atom *atom, Parameter *param) { + int me = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &me); int len = strlen(param->input_file); if(strncmp(¶m->input_file[len - 4], ".pdb", 4) == 0) { return readAtom_pdb(atom, param); } if(strncmp(¶m->input_file[len - 4], ".gro", 4) == 0) { return readAtom_gro(atom, param); } if(strncmp(¶m->input_file[len - 4], ".dmp", 4) == 0) { return readAtom_dmp(atom, param); } if(strncmp(¶m->input_file[len - 3], ".in", 3) == 0) { return readAtom_in(atom, param); } - fprintf(stderr, "Invalid input file extension: %s\nValid choices are: pdb, gro, dmp, in\n", param->input_file); + if(me==0) fprintf(stderr, "Invalid input file extension: %s\nValid choices are: pdb, gro, dmp, in\n", param->input_file); exit(-1); return -1; } int readAtom_pdb(Atom* atom, Parameter* param) { + int me = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &me); FILE *fp = fopen(param->input_file, "r"); char line[MAXLINE]; int read_atoms = 0; if(!fp) { - fprintf(stderr, "Could not open input file: %s\n", param->input_file); + if(me==0)fprintf(stderr, "Could not open input file: %s\n", param->input_file); exit(-1); return -1; } while(!feof(fp)) { readline(line, fp); - char *item = strtok(line, " "); + char *item = strtok(line, "\t "); if(strncmp(item, "CRYST1", 6) == 0) { param->xlo = 0.0; - param->xhi = atof(strtok(NULL, " ")); + param->xhi = atof(strtok(NULL, "\t ")); param->ylo = 0.0; - param->yhi = atof(strtok(NULL, " ")); + param->yhi = atof(strtok(NULL, "\t ")); param->zlo = 0.0; - param->zhi = atof(strtok(NULL, " ")); + param->zhi = atof(strtok(NULL, "\t ")); param->xprd = param->xhi - param->xlo; param->yprd = param->yhi - param->ylo; param->zprd = param->zhi - param->zlo; @@ -203,23 +217,23 @@ int readAtom_pdb(Atom* atom, Parameter* param) { char *label; int atom_id, comp_id; MD_FLOAT occupancy, charge; - atom_id = atoi(strtok(NULL, " ")) - 1; + atom_id = atoi(strtok(NULL, "\t ")) - 1; while(atom_id + 1 >= atom->Nmax) { growAtom(atom); } - atom->type[atom_id] = type_str2int(strtok(NULL, " ")); - label = strtok(NULL, " "); - comp_id = atoi(strtok(NULL, " ")); - atom_x(atom_id) = atof(strtok(NULL, " ")); - atom_y(atom_id) = atof(strtok(NULL, " ")); - atom_z(atom_id) = atof(strtok(NULL, " ")); + atom->type[atom_id] = type_str2int(strtok(NULL, "\t ")); + label = strtok(NULL, "\t "); + comp_id = atoi(strtok(NULL, "\t ")); + atom_x(atom_id) = atof(strtok(NULL, "\t ")); + atom_y(atom_id) = atof(strtok(NULL, "\t ")); + atom_z(atom_id) = atof(strtok(NULL, "\t ")); atom_vx(atom_id) = 0.0; atom_vy(atom_id) = 0.0; atom_vz(atom_id) = 0.0; - occupancy = atof(strtok(NULL, " ")); - charge = atof(strtok(NULL, " ")); + occupancy = atof(strtok(NULL, "\t ")); + charge = atof(strtok(NULL, "\t ")); atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes); atom->Natoms++; atom->Nlocal++; @@ -231,14 +245,14 @@ int readAtom_pdb(Atom* atom, Parameter* param) { strncmp(item, "ENDMDL", 6) == 0) { // Do nothing } else { - fprintf(stderr, "Invalid item: %s\n", item); + if(me==0)fprintf(stderr, "Invalid item: %s\n", item); exit(-1); return -1; } } if(!read_atoms) { - fprintf(stderr, "Input error: No atoms read!\n"); + if(me==0)fprintf(stderr, "Input error: No atoms read!\n"); exit(-1); return -1; } @@ -254,12 +268,15 @@ int readAtom_pdb(Atom* atom, Parameter* param) { atom->cutforcesq[i] = param->cutforce * param->cutforce; } - fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file); + if(me==0)fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file); fclose(fp); return read_atoms; } int readAtom_gro(Atom* atom, Parameter* param) { + int me = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &me); + FILE *fp = fopen(param->input_file, "r"); char line[MAXLINE]; char desc[MAXLINE]; @@ -268,7 +285,7 @@ int readAtom_gro(Atom* atom, Parameter* param) { int i = 0; if(!fp) { - fprintf(stderr, "Could not open input file: %s\n", param->input_file); + if(me==0)fprintf(stderr, "Could not open input file: %s\n", param->input_file); exit(-1); return -1; } @@ -277,26 +294,26 @@ int readAtom_gro(Atom* atom, Parameter* param) { for(i = 0; desc[i] != '\n'; i++); desc[i] = '\0'; readline(line, fp); - atoms_to_read = atoi(strtok(line, " ")); - fprintf(stdout, "System: %s with %d atoms\n", desc, atoms_to_read); + atoms_to_read = atoi(strtok(line, "\t ")); + if(me==0)fprintf(stdout, "System: %s with %d atoms\n", desc, atoms_to_read); while(!feof(fp) && read_atoms < atoms_to_read) { readline(line, fp); - char *label = strtok(line, " "); - int type = type_str2int(strtok(NULL, " ")); - int atom_id = atoi(strtok(NULL, " ")) - 1; + char *label = strtok(line, "\t "); + int type = type_str2int(strtok(NULL, "\t ")); + int atom_id = atoi(strtok(NULL, "\t ")) - 1; atom_id = read_atoms; while(atom_id + 1 >= atom->Nmax) { growAtom(atom); } atom->type[atom_id] = type; - atom_x(atom_id) = atof(strtok(NULL, " ")); - atom_y(atom_id) = atof(strtok(NULL, " ")); - atom_z(atom_id) = atof(strtok(NULL, " ")); - atom_vx(atom_id) = atof(strtok(NULL, " ")); - atom_vy(atom_id) = atof(strtok(NULL, " ")); - atom_vz(atom_id) = atof(strtok(NULL, " ")); + atom_x(atom_id) = atof(strtok(NULL, "\t ")); + atom_y(atom_id) = atof(strtok(NULL, "\t ")); + atom_z(atom_id) = atof(strtok(NULL, "\t ")); + atom_vx(atom_id) = atof(strtok(NULL, "\t ")); + atom_vy(atom_id) = atof(strtok(NULL, "\t ")); + atom_vz(atom_id) = atof(strtok(NULL, "\t ")); atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes); atom->Natoms++; atom->Nlocal++; @@ -306,18 +323,18 @@ int readAtom_gro(Atom* atom, Parameter* param) { if(!feof(fp)) { readline(line, fp); param->xlo = 0.0; - param->xhi = atof(strtok(line, " ")); + param->xhi = atof(strtok(line, "\t ")); param->ylo = 0.0; - param->yhi = atof(strtok(NULL, " ")); + param->yhi = atof(strtok(NULL, "\t ")); param->zlo = 0.0; - param->zhi = atof(strtok(NULL, " ")); + param->zhi = atof(strtok(NULL, "\t ")); param->xprd = param->xhi - param->xlo; param->yprd = param->yhi - param->ylo; param->zprd = param->zhi - param->zlo; } if(read_atoms != atoms_to_read) { - fprintf(stderr, "Input error: Number of atoms read do not match (%d/%d).\n", read_atoms, atoms_to_read); + if(me==0)fprintf(stderr, "Input error: Number of atoms read do not match (%d/%d).\n", read_atoms, atoms_to_read); exit(-1); return -1; } @@ -333,12 +350,14 @@ int readAtom_gro(Atom* atom, Parameter* param) { atom->cutforcesq[i] = param->cutforce * param->cutforce; } - fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file); + if(me==0)fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file); fclose(fp); return read_atoms; } int readAtom_dmp(Atom* atom, Parameter* param) { + int me = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &me); FILE *fp = fopen(param->input_file, "r"); char line[MAXLINE]; int natoms = 0; @@ -347,7 +366,7 @@ int readAtom_dmp(Atom* atom, Parameter* param) { int ts = -1; if(!fp) { - fprintf(stderr, "Could not open input file: %s\n", param->input_file); + if(me==0)fprintf(stderr, "Could not open input file: %s\n", param->input_file); exit(-1); return -1; } @@ -370,47 +389,47 @@ int readAtom_dmp(Atom* atom, Parameter* param) { } } else if(strncmp(item, "BOX BOUNDS pp pp pp", 19) == 0) { readline(line, fp); - param->xlo = atof(strtok(line, " ")); - param->xhi = atof(strtok(NULL, " ")); + param->xlo = atof(strtok(line, "\t ")); + param->xhi = atof(strtok(NULL, "\t ")); param->xprd = param->xhi - param->xlo; readline(line, fp); - param->ylo = atof(strtok(line, " ")); - param->yhi = atof(strtok(NULL, " ")); + param->ylo = atof(strtok(line, "\t ")); + param->yhi = atof(strtok(NULL, "\t ")); param->yprd = param->yhi - param->ylo; readline(line, fp); - param->zlo = atof(strtok(line, " ")); - param->zhi = atof(strtok(NULL, " ")); + param->zlo = atof(strtok(line, "\t ")); + param->zhi = atof(strtok(NULL, "\t ")); param->zprd = param->zhi - param->zlo; } else if(strncmp(item, "ATOMS id type x y z vx vy vz", 28) == 0) { for(int i = 0; i < natoms; i++) { readline(line, fp); - atom_id = atoi(strtok(line, " ")) - 1; - atom->type[atom_id] = atoi(strtok(NULL, " ")); - atom_x(atom_id) = atof(strtok(NULL, " ")); - atom_y(atom_id) = atof(strtok(NULL, " ")); - atom_z(atom_id) = atof(strtok(NULL, " ")); - atom_vx(atom_id) = atof(strtok(NULL, " ")); - atom_vy(atom_id) = atof(strtok(NULL, " ")); - atom_vz(atom_id) = atof(strtok(NULL, " ")); + atom_id = atoi(strtok(line, "\t ")) - 1; + atom->type[atom_id] = atoi(strtok(NULL, "\t ")); + atom_x(atom_id) = atof(strtok(NULL, "\t ")); + atom_y(atom_id) = atof(strtok(NULL, "\t ")); + atom_z(atom_id) = atof(strtok(NULL, "\t ")); + atom_vx(atom_id) = atof(strtok(NULL, "\t ")); + atom_vy(atom_id) = atof(strtok(NULL, "\t ")); + atom_vz(atom_id) = atof(strtok(NULL, "\t ")); atom->ntypes = MAX(atom->type[atom_id], atom->ntypes); read_atoms++; } } else { - fprintf(stderr, "Invalid item: %s\n", item); + if(me==0)fprintf(stderr, "Invalid item: %s\n", item); exit(-1); return -1; } } else { - fprintf(stderr, "Invalid input from file, expected item reference but got:\n%s\n", line); + if(me==0)fprintf(stderr, "Invalid input from file, expected item reference but got:\n%s\n", line); exit(-1); return -1; } } if(ts < 0 || !natoms || !read_atoms) { - fprintf(stderr, "Input error: atom data was not read!\n"); + if(me==0)fprintf(stderr, "Input error: atom data was not read!\n"); exit(-1); return -1; } @@ -426,30 +445,34 @@ int readAtom_dmp(Atom* atom, Parameter* param) { atom->cutforcesq[i] = param->cutforce * param->cutforce; } - fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file); + if(me==0)fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file); return natoms; } int readAtom_in(Atom* atom, Parameter* param) { + int me = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &me); FILE *fp = fopen(param->input_file, "r"); char line[MAXLINE]; int natoms = 0; int atom_id = 0; if(!fp) { - fprintf(stderr, "Could not open input file: %s\n", param->input_file); + if(me==0) fprintf(stderr, "Could not open input file: %s\n", param->input_file); exit(-1); return -1; } - readline(line, fp); - natoms = atoi(strtok(line, " ")); - param->xlo = atof(strtok(NULL, " ")); - param->xhi = atof(strtok(NULL, " ")); - param->ylo = atof(strtok(NULL, " ")); - param->yhi = atof(strtok(NULL, " ")); - param->zlo = atof(strtok(NULL, " ")); - param->zhi = atof(strtok(NULL, " ")); + natoms = atoi(strtok(line, "\t ")); + param->xlo = atof(strtok(NULL, "\t ")); + param->xhi = atof(strtok(NULL, "\t ")); + param->ylo = atof(strtok(NULL, "\t ")); + param->yhi = atof(strtok(NULL, "\t ")); + param->zlo = atof(strtok(NULL, "\t ")); + param->zhi = atof(strtok(NULL, "\t ")); + param->xprd = param->xhi - param->xlo; + param->yprd = param->yhi - param->ylo; + param->zprd = param->zhi - param->zlo; atom->Natoms = natoms; atom->Nlocal = natoms; atom->ntypes = 1; @@ -462,27 +485,26 @@ int readAtom_in(Atom* atom, Parameter* param) { readline(line, fp); // TODO: store mass per atom - char *s_mass = strtok(line, " "); + char *s_mass = strtok(line, "\t "); if(strncmp(s_mass, "inf", 3) == 0) { // Set atom's mass to INFINITY } else { param->mass = atof(s_mass); } - - atom->radius[atom_id] = atof(strtok(NULL, " ")); - atom_x(atom_id) = atof(strtok(NULL, " ")); - atom_y(atom_id) = atof(strtok(NULL, " ")); - atom_z(atom_id) = atof(strtok(NULL, " ")); - atom_vx(atom_id) = atof(strtok(NULL, " ")); - atom_vy(atom_id) = atof(strtok(NULL, " ")); - atom_vz(atom_id) = atof(strtok(NULL, " ")); + + atom->radius[atom_id] = atof(strtok(NULL, "\t ")); + atom_x(atom_id) = atof(strtok(NULL, "\t ")); + atom_y(atom_id) = atof(strtok(NULL, "\t ")); + atom_z(atom_id) = atof(strtok(NULL, "\t ")); + atom_vx(atom_id) = atof(strtok(NULL, "\t ")); + atom_vy(atom_id) = atof(strtok(NULL, "\t ")); + atom_vz(atom_id) = atof(strtok(NULL, "\t ")); atom->type[atom_id] = 0; atom->ntypes = MAX(atom->type[atom_id], atom->ntypes); atom_id++; } - if(!natoms) { - fprintf(stderr, "Input error: atom data was not read!\n"); + if(me==0)fprintf(stderr, "Input error: atom data was not read!\n"); exit(-1); return -1; } @@ -498,25 +520,10 @@ int readAtom_in(Atom* atom, Parameter* param) { atom->cutforcesq[i] = param->cutforce * param->cutforce; } - fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file); + if(me==0)fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file); return natoms; } -void writeAtom(Atom *atom, Parameter *param) { - FILE *fp = fopen(param->write_atom_file, "w"); - - for(int i = 0; i < atom->Nlocal; i++) { - fprintf(fp, "%d,%f,%f,%f,%f,%f,%f,%f,0\n", - atom->type[i], 1.0, - atom_x(i), atom_y(i), atom_z(i), - atom_vx(i), atom_vy(i), atom_vz(i)); - } - - fclose(fp); - fprintf(stdout, "Wrote input data to %s, grid size: %f, %f, %f\n", - param->write_atom_file, param->xprd, param->yprd, param->zprd); -} - void growAtom(Atom *atom) { DeviceAtom *d_atom = &(atom->d_atom); int nold = atom->Nmax; @@ -545,7 +552,125 @@ void growAtom(Atom *atom) { REALLOC(type, int, atom->Nmax * sizeof(int), nold * sizeof(int)); // DEM - atom->radius = (MD_FLOAT *) reallocate(atom->radius, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT)); + atom->radius = (MD_FLOAT*) reallocate(atom->radius, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT)); atom->av = (MD_FLOAT*) reallocate(atom->av, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3); atom->r = (MD_FLOAT*) reallocate(atom->r, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 4, nold * sizeof(MD_FLOAT) * 4); } + +/* MPI added*/ +void packForward(Atom* atom, int n ,int* list, MD_FLOAT* buf, int* pbc) +{ + int i, j; + for(i = 0; i < n; i++) { + j = list[i]; + buf_x(i) = atom_x(j) + pbc[0] * atom->mybox.xprd; + buf_y(i) = atom_y(j) + pbc[1] * atom->mybox.yprd; + buf_z(i) = atom_z(j) + pbc[2] * atom->mybox.zprd; + } +} + +void unpackForward(Atom* atom, int n, int first, MD_FLOAT* buf) +{ + for(int i = 0; i < n; i++) { + atom_x((first + i)) = buf_x(i); + atom_y((first + i)) = buf_y(i); + atom_z((first + i)) = buf_z(i); + } +} + +int packGhost(Atom* atom, int i, MD_FLOAT* buf, int* pbc) +{ + int m = 0; + buf[m++] = atom_x(i) + pbc[_x] * atom->mybox.xprd; + buf[m++] = atom_y(i) + pbc[_y] * atom->mybox.yprd; + buf[m++] = atom_z(i) + pbc[_z] * atom->mybox.zprd; + buf[m++] = atom->type[i]; + return m; +} + +int unpackGhost(Atom* atom, int i, MD_FLOAT* buf) +{ + while (i>=atom->Nmax) growAtom(atom); + int m = 0; + atom_x(i) = buf[m++]; + atom_y(i) = buf[m++]; + atom_z(i) = buf[m++]; + atom->type[i] = buf[m++]; + atom->Nghost++; + return m; +} + +void packReverse(Atom* atom, int n, int first, MD_FLOAT* buf) +{ + for(int i = 0; i < n; i++) { + buf_x(i) = atom_fx(first + i); + buf_y(i) = atom_fy(first + i); + buf_z(i) = atom_fz(first + i); + } +} + +void unpackReverse(Atom* atom, int n, int* list, MD_FLOAT* buf) +{ + int i, j; + for(i = 0; i < n; i++) { + j = list[i]; + atom_fx(j) += buf_x(i); + atom_fy(j) += buf_y(i); + atom_fz(j) += buf_z(i); + } +} + +int packExchange(Atom* atom, int i, MD_FLOAT* buf) +{ + int m = 0; + buf[m++] = atom_x(i); + buf[m++] = atom_y(i); + buf[m++] = atom_z(i); + buf[m++] = atom_vx(i); + buf[m++] = atom_vy(i); + buf[m++] = atom_vz(i); + buf[m++] = atom->type[i]; + return m; +} + +int unpackExchange(Atom* atom, int i, MD_FLOAT* buf) +{ + while(i >= atom->Nmax) growAtom(atom); + int m = 0; + atom_x(i) = buf[m++]; + atom_y(i) = buf[m++]; + atom_z(i) = buf[m++]; + atom_vx(i) = buf[m++]; + atom_vy(i) = buf[m++]; + atom_vz(i) = buf[m++]; + atom->type[i] = buf[m++]; + return m; +} + +void pbc(Atom* atom) +{ + for(int i = 0; i < atom->Nlocal; i++) { + + MD_FLOAT xprd = atom->mybox.xprd; + MD_FLOAT yprd = atom->mybox.yprd; + MD_FLOAT zprd = atom->mybox.zprd; + + if(atom_x(i) < 0.0) atom_x(i) += xprd; + if(atom_y(i) < 0.0) atom_y(i) += yprd; + if(atom_z(i) < 0.0) atom_z(i)+= zprd; + if(atom_x(i) >= xprd) atom_x(i) -= xprd; + if(atom_y(i) >= yprd) atom_y(i) -= yprd; + if(atom_z(i) >= zprd) atom_z(i) -= zprd; + } +} + +void copy(Atom* atom, int i, int j) +{ + atom_x(i) = atom_x(j); + atom_y(i) = atom_y(j); + atom_z(i) = atom_z(j); + atom_vx(i) = atom_vx(j); + atom_vy(i) = atom_vy(j); + atom_vz(i) = atom_vz(j); + atom->type[i] = atom->type[j]; +} diff --git a/lammps/force_lj.c b/lammps/force_lj.c index 599d14d..4dd2356 100644 --- a/lammps/force_lj.c +++ b/lammps/force_lj.c @@ -13,13 +13,17 @@ #include #include #include - +#include +#include #ifdef __SIMD_KERNEL__ #include #endif +void computeForceGhostShell(Parameter*, Atom*, Neighbor*); + double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { int Nlocal = atom->Nlocal; + int Nghost = atom->Nghost; int* neighs; #ifndef EXPLICIT_TYPES MD_FLOAT cutforcesq = param->cutforce * param->cutforce; @@ -41,21 +45,21 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n { LIKWID_MARKER_START("force"); - #pragma omp for schedule(runtime) + #pragma omp for for(int i = 0; i < Nlocal; i++) { neighs = &neighbor->neighbors[i * neighbor->maxneighs]; int numneighs = neighbor->numneigh[i]; MD_FLOAT xtmp = atom_x(i); MD_FLOAT ytmp = atom_y(i); MD_FLOAT ztmp = atom_z(i); - MD_FLOAT fix = 0; - MD_FLOAT fiy = 0; - MD_FLOAT fiz = 0; - + MD_FLOAT fix = 0.0; + MD_FLOAT fiy = 0.0; + MD_FLOAT fiz = 0.0; + #ifdef EXPLICIT_TYPES const int type_i = atom->type[i]; #endif - + for(int k = 0; k < numneighs; k++) { int j = neighs[k]; MD_FLOAT delx = xtmp - atom_x(j); @@ -70,26 +74,25 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n const MD_FLOAT sigma6 = atom->sigma6[type_ij]; const MD_FLOAT epsilon = atom->epsilon[type_ij]; #endif - if(rsq < cutforcesq) { MD_FLOAT sr2 = num1 / rsq; MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6; - MD_FLOAT force = num48 * sr6 * (sr6 - num05) * sr2 * epsilon; + MD_FLOAT force = num48 * sr6 * (sr6 - num05) * sr2 * epsilon; fix += delx * force; fiy += dely * force; - fiz += delz * force; + fiz += delz * force; + #ifdef USE_REFERENCE_VERSION addStat(stats->atoms_within_cutoff, 1); } else { addStat(stats->atoms_outside_cutoff, 1); #endif } - } - + } atom_fx(i) += fix; atom_fy(i) += fiy; atom_fz(i) += fiz; - + #ifdef USE_REFERENCE_VERSION if(numneighs % VECTOR_WIDTH > 0) { addStat(stats->atoms_outside_cutoff, VECTOR_WIDTH - (numneighs % VECTOR_WIDTH)); @@ -102,13 +105,13 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n LIKWID_MARKER_STOP("force"); } - double E = getTimeStamp(); return E-S; } double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { int Nlocal = atom->Nlocal; + int Nghost = atom->Nghost; int* neighs; #ifndef EXPLICIT_TYPES MD_FLOAT cutforcesq = param->cutforce * param->cutforce; @@ -119,19 +122,18 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor, const MD_FLOAT num48 = 48.0; const MD_FLOAT num05 = 0.5; - for(int i = 0; i < Nlocal; i++) { + for(int i = 0; i < Nlocal+Nghost; i++) { atom_fx(i) = 0.0; atom_fy(i) = 0.0; atom_fz(i) = 0.0; } - double S = getTimeStamp(); #pragma omp parallel { LIKWID_MARKER_START("forceLJ-halfneigh"); - #pragma omp for schedule(runtime) + #pragma omp for for(int i = 0; i < Nlocal; i++) { neighs = &neighbor->neighbors[i * neighbor->maxneighs]; int numneighs = neighbor->numneigh[i]; @@ -172,16 +174,14 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor, fix += delx * force; fiy += dely * force; fiz += delz * force; - - // We do not need to update forces for ghost atoms - if(j < Nlocal) { + // We need to update forces for ghost atoms if shell_method or half stencil is requiered + if((param->half_neigh && jmethod){ atom_fx(j) -= delx * force; atom_fy(j) -= dely * force; atom_fz(j) -= delz * force; } } } - atom_fx(i) += fix; atom_fy(i) += fiy; atom_fz(i) += fiz; @@ -190,6 +190,7 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor, addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH); } + if(param->method == eightShell) computeForceGhostShell(param, atom, neighbor); LIKWID_MARKER_STOP("forceLJ-halfneigh"); } @@ -227,7 +228,7 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig { LIKWID_MARKER_START("force"); - #pragma omp for schedule(runtime) + #pragma omp for for(int i = 0; i < Nlocal; i++) { neighs = &neighbor->neighbors[i * neighbor->maxneighs]; int numneighs = neighbor->numneigh[i]; @@ -276,3 +277,58 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig double E = getTimeStamp(); return E-S; } + +void computeForceGhostShell(Parameter *param, Atom *atom, Neighbor *neighbor) { + int Nshell = neighbor->Nshell; + int* neighs; + #ifndef EXPLICIT_TYPES + MD_FLOAT cutforcesq = param->cutforce * param->cutforce; + MD_FLOAT sigma6 = param->sigma6; + MD_FLOAT epsilon = param->epsilon; + #endif + const MD_FLOAT num1 = 1.0; + const MD_FLOAT num48 = 48.0; + const MD_FLOAT num05 = 0.5; + + for(int i = 0; i < Nshell; i++) { + neighs = &(neighbor->neighshell[i * neighbor->maxneighs]); + int numneigh = neighbor->numNeighShell[i]; + int iatom = neighbor->listshell[i]; + MD_FLOAT xtmp = atom_x(iatom); + MD_FLOAT ytmp = atom_y(iatom); + MD_FLOAT ztmp = atom_z(iatom); + MD_FLOAT fix = 0; + MD_FLOAT fiy = 0; + MD_FLOAT fiz = 0; + + #ifdef EXPLICIT_TYPES + const int type_i = atom->type[i]; + #endif + + for(int k = 0; k < numneigh; k++) { + int jatom = neighs[k]; + MD_FLOAT delx = xtmp - atom_x(jatom); + MD_FLOAT dely = ytmp - atom_y(jatom); + MD_FLOAT delz = ztmp - atom_z(jatom); + MD_FLOAT rsq = delx * delx + dely * dely + delz * delz; + + if(rsq < cutforcesq) { + MD_FLOAT sr2 = num1 / rsq; + MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6; + MD_FLOAT force = num48 * sr6 * (sr6 - num05) * sr2 * epsilon; + fix += delx * force; + fiy += dely * force; + fiz += delz * force; + + atom_fx(jatom) -= delx * force; + atom_fy(jatom) -= dely * force; + atom_fz(jatom) -= delz * force; + } + } + atom_fx(iatom) += fix; + atom_fy(iatom) += fiy; + atom_fz(iatom) += fiz; + + } +} + diff --git a/lammps/includes/atom.h b/lammps/includes/atom.h index 0fe3b51..a5ad991 100644 --- a/lammps/includes/atom.h +++ b/lammps/includes/atom.h @@ -4,8 +4,9 @@ * Use of this source code is governed by a LGPL-3.0 * license that can be found in the LICENSE file. */ -#include +#include +#include #ifndef __ATOM_H_ #define __ATOM_H_ @@ -56,6 +57,8 @@ typedef struct { MD_FLOAT *sigma6; MD_FLOAT *cutforcesq; MD_FLOAT *cutneighsq; + //TODO: insert the id number + //MD_FLOAT *Atom_id; // DEM MD_FLOAT *radius; @@ -64,6 +67,9 @@ typedef struct { // Device data DeviceAtom d_atom; + + //Info Subdomain + Box mybox; } Atom; extern void initAtom(Atom*); @@ -73,9 +79,19 @@ extern int readAtom_pdb(Atom*, Parameter*); extern int readAtom_gro(Atom*, Parameter*); extern int readAtom_dmp(Atom*, Parameter*); extern int readAtom_in(Atom*, Parameter*); -extern void writeAtom(Atom*, Parameter*); extern void growAtom(Atom*); +int packGhost(Atom*, int, MD_FLOAT*, int*); +int unpackGhost(Atom*, int, MD_FLOAT*); +int packExchange(Atom*, int, MD_FLOAT*); +int unpackExchange(Atom*, int, MD_FLOAT*); +void packForward(Atom*, int, int*, MD_FLOAT*, int*); +void unpackForward(Atom*, int, int, MD_FLOAT*); +void packReverse(Atom* , int , int , MD_FLOAT*); +void unpackReverse(Atom*, int, int*, MD_FLOAT*); +void pbc(Atom*); +void copy(Atom*, int, int); + #ifdef AOS # define POS_DATA_LAYOUT "AoS" # define atom_x(i) atom->x[(i) * 3 + 0] @@ -100,4 +116,8 @@ extern void growAtom(Atom*); # define atom_fz(i) atom->fz[i] #endif +# define buf_x(i) buf[3*(i)] +# define buf_y(i) buf[3*(i)+1] +# define buf_z(i) buf[3*(i)+2] + #endif diff --git a/lammps/includes/neighbor.h b/lammps/includes/neighbor.h index 3b94175..dc4c633 100644 --- a/lammps/includes/neighbor.h +++ b/lammps/includes/neighbor.h @@ -20,9 +20,14 @@ typedef struct { int ncalls; int maxneighs; int half_neigh; + int half_stencil; int *neighbors; int *numneigh; - + //MPI + int Nshell; //# of atoms in listShell + int *numNeighShell; //# of neighs for each atom in listShell + int *neighshell; //list of neighs for each atom in listShell + int *listshell; //Atoms to compute the force // Device data DeviceNeighbor d_neighbor; } Neighbor; diff --git a/lammps/includes/vtk.h b/lammps/includes/vtk.h index db5bfa4..161c6bc 100644 --- a/lammps/includes/vtk.h +++ b/lammps/includes/vtk.h @@ -5,8 +5,11 @@ * license that can be found in the LICENSE file. */ #include +#include +#include #ifndef __VTK_H_ #define __VTK_H_ extern int write_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep); +extern void printvtk(const char* filename, Comm* comm, Atom* atom ,Parameter* param, int timestep); #endif diff --git a/lammps/main.c b/lammps/main.c index b291b52..6fd094a 100644 --- a/lammps/main.c +++ b/lammps/main.c @@ -11,10 +11,7 @@ #include #include #include -#include - #include - #include #include #include @@ -24,13 +21,19 @@ #include #include #include -#include #include #include #include #include +#include +#include +#include +#include #define HLINE "----------------------------------------------------------------------------\n" +#ifdef CUDA_TARGET +extern double computeForceLJFullNeigh_cuda(Parameter*, Atom*, Neighbor*); +#endif extern double computeForceLJFullNeigh_plain_c(Parameter*, Atom*, Neighbor*, Stats*); extern double computeForceLJFullNeigh_simd(Parameter*, Atom*, Neighbor*, Stats*); @@ -38,69 +41,6 @@ extern double computeForceLJHalfNeigh(Parameter*, Atom*, Neighbor*, Stats*); extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*); extern double computeForceDemFullNeigh(Parameter*, Atom*, Neighbor*, Stats*); -#ifdef CUDA_TARGET -extern double computeForceLJFullNeigh_cuda(Parameter*, Atom*, Neighbor*); -#endif - -double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) { - if(param->force_field == FF_EAM) { initEam(eam, param); } - double S, E; - param->lattice = pow((4.0 / param->rho), (1.0 / 3.0)); - param->xprd = param->nx * param->lattice; - param->yprd = param->ny * param->lattice; - param->zprd = param->nz * param->lattice; - - S = getTimeStamp(); - initAtom(atom); - initPbc(atom); - initStats(stats); - initNeighbor(neighbor, param); - if(param->input_file == NULL) { - createAtom(atom, param); - } else { - readAtom(atom, param); - } - - setupNeighbor(param); - setupThermo(param, atom->Natoms); - if(param->input_file == NULL) { adjustThermo(param, atom); } - #ifdef SORT_ATOMS - atom->Nghost = 0; - sortAtom(atom); - #endif - setupPbc(atom, param); - initDevice(atom, neighbor); - updatePbc(atom, param, true); - buildNeighbor(atom, neighbor); - E = getTimeStamp(); - return E-S; -} - -double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) { - double S, E; - S = getTimeStamp(); - LIKWID_MARKER_START("reneighbour"); - updateAtomsPbc(atom, param); - #ifdef SORT_ATOMS - atom->Nghost = 0; - sortAtom(atom); - #endif - setupPbc(atom, param); - updatePbc(atom, param, true); - buildNeighbor(atom, neighbor); - LIKWID_MARKER_STOP("reneighbour"); - E = getTimeStamp(); - return E-S; -} - -void printAtomState(Atom *atom) { - printf("Atom counts: Natoms=%d Nlocal=%d Nghost=%d Nmax=%d\n", atom->Natoms, atom->Nlocal, atom->Nghost, atom->Nmax); - // int nall = atom->Nlocal + atom->Nghost; - // for (int i=0; ix[i], atom->y[i], atom->z[i]); - // } -} - double computeForce(Eam *eam, Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { if(param->force_field == FF_EAM) { return computeForceEam(eam, param, atom, neighbor, stats); @@ -113,7 +53,7 @@ double computeForce(Eam *eam, Parameter *param, Atom *atom, Neighbor *neighbor, } } - if(param->half_neigh) { + if(param->half_neigh || param->method) { return computeForceLJHalfNeigh(param, atom, neighbor, stats); } @@ -124,6 +64,102 @@ double computeForce(Eam *eam, Parameter *param, Atom *atom, Neighbor *neighbor, #endif } +double dynamicBalance(Comm* comm, Grid* grid, Atom* atom, Parameter* param, double time){ + double S, E; + int dims = 3; //TODO: Adjust to do in 3d and 2d + S = getTimeStamp(); + if(param->balance == RCB) { + rcbBalance(grid, atom, param, meanBisect,dims,0); + neighComm(comm, param, grid); + }else if(param->balance == meanTimeRCB){ + rcbBalance(grid, atom, param, meanTimeBisect,dims,time); + neighComm(comm, param, grid); + }else if(param->balance == Staggered) { + staggeredBalance(grid, atom, param, time); + neighComm(comm, param, grid); + exchangeComm(comm,atom); + }else { } //Do nothing + //printGrid(grid); + E = getTimeStamp(); + + return E-S; +} + +double initialBalance(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats, Comm *comm, Grid *grid) +{ + double E,S,time; + int me; + MPI_Comm_rank(world,&me); + S = getTimeStamp(); + if(param->balance == meanTimeRCB || param->balance == RCB){ + rcbBalance(grid, atom, param, meanBisect,3,0); + neighComm(comm, param, grid); + } + MPI_Allreduce(&atom->Nlocal, &atom->Natoms, 1, MPI_INT, MPI_SUM, world); + printf("Processor:%i, Local atoms:%i, Total atoms:%i\n",me, atom->Nlocal,atom->Natoms); + MPI_Barrier(world); + E = getTimeStamp(); + return E-S; +} + +double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats, Comm *comm, Grid *grid) { + if(param->force_field == FF_EAM) { initEam(eam, param); } + double S, E; + param->lattice = pow((4.0 / param->rho), (1.0 / 3.0)); + param->xprd = param->nx * param->lattice; + param->yprd = param->ny * param->lattice; + param->zprd = param->nz * param->lattice; + S = getTimeStamp(); + initAtom(atom); + initStats(stats); + initNeighbor(neighbor, param); + if(param->input_file == NULL) { + createAtom(atom, param); + } else { + readAtom(atom, param); + } + setupGrid(grid,atom,param); + setupNeighbor(param); + setupComm(comm, param, grid); + if(param->balance){ + initialBalance(param, eam, atom, neighbor, stats, comm, grid); + } + setupThermo(param, atom->Natoms); + if(param->input_file == NULL) { adjustThermo(param, atom); } + #ifdef SORT_ATOMS + atom->Nghost = 0; + sortAtom(atom); + #endif + initDevice(atom, neighbor); + ghostNeighbor(comm, atom, param); + buildNeighbor(atom, neighbor); + E = getTimeStamp(); + return E-S; +} + +double reneighbour(Comm* comm, Parameter *param, Atom *atom, Neighbor *neighbor) { + double S, E; + S = getTimeStamp(); + LIKWID_MARKER_START("reneighbour"); + #ifdef SORT_ATOMS + atom->Nghost = 0; + sortAtom(atom); + #endif + ghostNeighbor(comm, atom, param); + buildNeighbor(atom, neighbor); + LIKWID_MARKER_STOP("reneighbour"); + E = getTimeStamp(); + return E-S; +} + +double updateAtoms(Comm* comm, Atom* atom){ + double S,E; + S = getTimeStamp(); + exchangeComm(comm, atom); + E = getTimeStamp(); + return E-S; +} + void writeInput(Parameter *param, Atom *atom) { FILE *fpin = fopen("input.in", "w"); fprintf(fpin, "0,%f,0,%f,0,%f\n", param->xprd, param->yprd, param->zprd); @@ -142,18 +178,19 @@ int main(int argc, char** argv) { Neighbor neighbor; Stats stats; Parameter param; - + Comm comm; + Grid grid; LIKWID_MARKER_INIT; #pragma omp parallel { LIKWID_MARKER_REGISTER("force"); //LIKWID_MARKER_REGISTER("reneighbour"); //LIKWID_MARKER_REGISTER("pbc"); - } - + } + initComm(&argc, &argv, &comm); initParameter(¶m); for(int i = 0; i < argc; i++) { - if((strcmp(argv[i], "-p") == 0) || strcmp(argv[i], "--params") == 0) { + if((strcmp(argv[i], "-p") == 0)) { readParameter(¶m, argv[++i]); continue; } @@ -191,6 +228,24 @@ int main(int argc, char** argv) { if((strcmp(argv[i], "-half") == 0)) { param.half_neigh = atoi(argv[++i]); continue; + } + if((strcmp(argv[i], "-method") == 0)) { + param.method = atoi(argv[++i]); + if (param.method>3 || param.method< 0){ + if(comm.myproc == 0) fprintf(stderr, "Method does not exist!\n"); + endComm(&comm); + exit(0); + } + continue; + } + if((strcmp(argv[i], "-bal") == 0)) { + param.balance = atoi(argv[++i]); + if (param.balance>3 || param.balance< 0){ + if(comm.myproc == 0) fprintf(stderr, "Load Balance does not exist!\n"); + endComm(&comm); + exit(0); + } + continue; } if((strcmp(argv[i], "-r") == 0) || (strcmp(argv[i], "--radius") == 0)) { param.cutforce = atof(argv[++i]); @@ -208,71 +263,71 @@ int main(int argc, char** argv) { param.vtk_file = strdup(argv[++i]); continue; } - if((strcmp(argv[i], "-w") == 0)) { - param.write_atom_file = strdup(argv[++i]); - continue; - } if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) { - printf("MD Bench: A minimalistic re-implementation of miniMD\n"); - printf(HLINE); - printf("-p / --params : file to read parameters from (can be specified more than once)\n"); - printf("-f : force field (lj, eam or dem), default lj\n"); - printf("-i : input file with atom positions (dump)\n"); - printf("-e : input file for EAM\n"); - printf("-n / --nsteps : set number of timesteps for simulation\n"); - printf("-nx/-ny/-nz : set linear dimension of systembox in x/y/z direction\n"); - printf("-half : use half (1) or full (0) neighbor lists\n"); - printf("-r / --radius : set cutoff radius\n"); - printf("-s / --skin : set skin (verlet buffer)\n"); - printf("-w : write input atoms to file\n"); - printf("--freq : processor frequency (GHz)\n"); - printf("--vtk : VTK file for visualization\n"); - printf(HLINE); - exit(EXIT_SUCCESS); + if(comm.myproc ==0 ){ + printf("MD Bench: A minimalistic re-implementation of miniMD\n"); + printf(HLINE); + printf("-p : file to read parameters from (can be specified more than once)\n"); + printf("-f : force field (lj, eam or dem), default lj\n"); + printf("-i : input file with atom positions (dump)\n"); + printf("-e : input file for EAM\n"); + printf("-n / --nsteps : set number of timesteps for simulation\n"); + printf("-nx/-ny/-nz : set linear dimension of systembox in x/y/z direction\n"); + printf("-r / --radius : set cutoff radius\n"); + printf("-s / --skin : set skin (verlet buffer)\n"); + printf("--freq : processor frequency (GHz)\n"); + printf("--vtk : VTK file for visualization\n"); + printf(HLINE); + } + exit(EXIT_SUCCESS); } } + + if(param.balance>0 && param.method == 1){ + if(comm.myproc == 0) fprintf(stderr, "Half Shell is not supported by load balance!\n"); + endComm(&comm); + exit(0); + } param.cutneigh = param.cutforce + param.skin; - setup(¶m, &eam, &atom, &neighbor, &stats); - printParameter(¶m); - printf(HLINE); - - printf("step\ttemp\t\tpressure\n"); + timer[SETUP]=setup(¶m, &eam, &atom, &neighbor, &stats, &comm, &grid); + if(comm.myproc == 0)printParameter(¶m); + if(comm.myproc == 0)printf(HLINE); + if(comm.myproc == 0) printf("step\ttemp\t\tpressure\n"); computeThermo(0, ¶m, &atom); #if defined(MEM_TRACER) || defined(INDEX_TRACER) - traceAddresses(¶m, &atom, &neighbor, n + 1); + traceAddresses(¶m, &atom, &neighbor, n + 1);// TODO: trace adress #endif - - if(param.write_atom_file != NULL) { - writeAtom(&atom, ¶m); - } - //writeInput(¶m, &atom); - - timer[FORCE] = computeForce(&eam, ¶m, &atom, &neighbor, &stats); - timer[NEIGH] = 0.0; - timer[TOTAL] = getTimeStamp(); - + timer[FORCE] = computeForce(&eam, ¶m, &atom, &neighbor, &stats); + timer[NEIGH] = 0.0; + timer[FORWARD] = 0.0; + timer[UPDATE] = 0.0; + timer[BALANCE] = 0.0; + timer[REVERSE] = reverse(&comm, &atom, ¶m); + MPI_Barrier(world); + timer[TOTAL] = getTimeStamp(); if(param.vtk_file != NULL) { - write_atoms_to_vtk_file(param.vtk_file, &atom, 0); - } - + printvtk(param.vtk_file, &comm, &atom, ¶m, 0); + } for(int n = 0; n < param.ntimes; n++) { bool reneigh = (n + 1) % param.reneigh_every == 0; initialIntegrate(reneigh, ¶m, &atom); - if((n + 1) % param.reneigh_every) { - updatePbc(&atom, ¶m, false); + if(reneigh) { + timer[UPDATE] +=updateAtoms(&comm,&atom); + if(param.balance && !((n+1)%param.balance_every)) + timer[BALANCE] +=dynamicBalance(&comm, &grid, &atom , ¶m, timer[FORCE]); + timer[NEIGH] += reneighbour(&comm, ¶m, &atom, &neighbor); } else { - timer[NEIGH] += reneighbour(¶m, &atom, &neighbor); - } - + timer[FORWARD] += forward(&comm, &atom, ¶m); + } #if defined(MEM_TRACER) || defined(INDEX_TRACER) traceAddresses(¶m, &atom, &neighbor, n + 1); #endif - timer[FORCE] += computeForce(&eam, ¶m, &atom, &neighbor, &stats); + timer[REVERSE] += reverse(&comm, &atom, ¶m); finalIntegrate(reneigh, ¶m, &atom); - + if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) { #ifdef CUDA_TARGET memcpyFromGPU(atom.x, atom.d_atom.x, atom.Nmax * sizeof(MD_FLOAT) * 3); @@ -281,47 +336,42 @@ int main(int argc, char** argv) { } if(param.vtk_file != NULL) { - write_atoms_to_vtk_file(param.vtk_file, &atom, n + 1); - } + printvtk(param.vtk_file, &comm, &atom ,¶m, n+1); + } } - + MPI_Barrier(world); timer[TOTAL] = getTimeStamp() - timer[TOTAL]; computeThermo(-1, ¶m, &atom); + + double mint[NUMTIMER]; + double maxt[NUMTIMER]; + double sumt[NUMTIMER]; + timer[REST] = timer[TOTAL]-timer[FORCE]-timer[NEIGH]-timer[BALANCE]-timer[FORWARD]-timer[REVERSE]; + MPI_Reduce(timer,mint,NUMTIMER,MPI_DOUBLE,MPI_MIN,0,world); + MPI_Reduce(timer,maxt,NUMTIMER,MPI_DOUBLE,MPI_MAX,0,world); + MPI_Reduce(timer,sumt,NUMTIMER,MPI_DOUBLE,MPI_SUM,0,world); + int Nghost; + MPI_Reduce(&atom.Nghost,&Nghost,1,MPI_INT,MPI_SUM,0,world); - printf(HLINE); - printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes); - printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n", - timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]); - printf(HLINE); - - int nthreads = 0; - int chunkSize = 0; - omp_sched_t schedKind; - char schedType[10]; -#pragma omp parallel -#pragma omp master - { - omp_get_schedule(&schedKind, &chunkSize); - - switch (schedKind) - { - case omp_sched_static: strcpy(schedType, "static"); break; - case omp_sched_dynamic: strcpy(schedType, "dynamic"); break; - case omp_sched_guided: strcpy(schedType, "guided"); break; - case omp_sched_auto: strcpy(schedType, "auto"); break; - } - - nthreads = omp_get_max_threads(); - } - - printf("Num threads: %d\n", nthreads); - printf("Schedule: (%s,%d)\n", schedType, chunkSize); - - printf("Performance: %.2f million atom updates per second\n", + if(comm.myproc == 0){ + int n = comm.numproc; + printf(HLINE); + printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, Nghost, param.ntimes); + printf("TOTAL %.2fs\n\n",timer[TOTAL]); + printf("%4s|%7s|%7s|%7s|%7s|%7s|%7s|%7s|%7s|\n","","FORCE ", "NEIGH ", "BALANCE", "FORWARD", "REVERSE","UPDATE","REST ","SETUP"); + printf("----|-------|-------|-------|-------|-------|-------|-------|-------|\n"); + printf("%4s|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|\n", "AVG", sumt[FORCE]/n,sumt[NEIGH]/n,sumt[BALANCE]/n,sumt[FORWARD]/n,sumt[REVERSE]/n,sumt[UPDATE]/n,sumt[REST]/n,sumt[SETUP]/n); + printf("%4s|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|\n", "MIN", mint[FORCE],mint[NEIGH],mint[BALANCE],mint[FORWARD],mint[REVERSE],mint[UPDATE],mint[REST],mint[SETUP]); + printf("%4s|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|\n", "MAX", maxt[FORCE],maxt[NEIGH],maxt[BALANCE],maxt[FORWARD],maxt[REVERSE],maxt[UPDATE],maxt[REST],maxt[SETUP]); + printf(HLINE); + printf("Performance: %.2f million atom updates per second\n", 1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]); + #ifdef COMPUTE_STATS displayStatistics(&atom, ¶m, &stats, timer); #endif + } + endComm(&comm); LIKWID_MARKER_CLOSE; return EXIT_SUCCESS; } diff --git a/lammps/neighbor.c b/lammps/neighbor.c index c06f678..017b324 100644 --- a/lammps/neighbor.c +++ b/lammps/neighbor.c @@ -11,27 +11,40 @@ #include #include #include +#include +#include +#include #define SMALL 1.0e-6 #define FACTOR 0.999 MD_FLOAT xprd, yprd, zprd; MD_FLOAT bininvx, bininvy, bininvz; -int mbinxlo, mbinylo, mbinzlo; +int pad_x, pad_y, pad_z; int nbinx, nbiny, nbinz; -int mbinx, mbiny, mbinz; // n bins in x, y, z +int mbinx, mbiny, mbinz; // m bins in x, y, z int *bincount; int *bins; -int mbins; //total number of bins -int atoms_per_bin; // max atoms per bin +int mbins; //total number of bins +int atoms_per_bin; // max atoms per bin MD_FLOAT cutneigh; -MD_FLOAT cutneighsq; // neighbor cutoff squared +MD_FLOAT cutneighsq; // neighbor cutoff squared int nmax; -int nstencil; // # of bins in stencil -int* stencil; // stencil list of bin offsets +int nstencil; // # of bins in stencil +int* stencil; // stencil list of bin offsets MD_FLOAT binsizex, binsizey, binsizez; +int me; //rank +int method; // method +int half_stencil; //If half stencil exist +int shellMethod; //If shell method exist + static int coord2bin(MD_FLOAT, MD_FLOAT , MD_FLOAT); static MD_FLOAT bindist(int, int, int); +static int ghostZone(Atom*, int); +static int eightZone(Atom*, int); +static int halfZone(Atom*, int); +static void neighborGhost(Atom*, Neighbor*); +static inline int interaction(Atom* atom, int i, int j); /* exported subroutines */ void initNeighbor(Neighbor *neighbor, Parameter *param) { @@ -51,7 +64,25 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) { neighbor->maxneighs = 100; neighbor->numneigh = NULL; neighbor->neighbors = NULL; + //========== MPI ============= + shellMethod = 0; + half_stencil = 0; + method = param->method; + if(method == halfShell || method == eightShell){ + param->half_neigh = 1; + shellMethod = 1; + } + if(method == halfStencil){ + param->half_neigh = 0; + half_stencil = 1; + } + me = 0; + MPI_Comm_rank(MPI_COMM_WORLD,&me); neighbor->half_neigh = param->half_neigh; + neighbor->Nshell = 0; + neighbor->numNeighShell = NULL; + neighbor->neighshell = NULL; + neighbor->listshell = NULL; } void setupNeighbor(Parameter* param) { @@ -64,7 +95,6 @@ void setupNeighbor(Parameter* param) { yprd = param->yprd; zprd = param->zprd; } - // TODO: update lo and hi for standard case and use them here instead MD_FLOAT xlo = 0.0; MD_FLOAT xhi = xprd; MD_FLOAT ylo = 0.0; MD_FLOAT yhi = yprd; @@ -93,54 +123,48 @@ void setupNeighbor(Parameter* param) { bininvy = 1.0 / binsizey; bininvz = 1.0 / binsizez; } - - coord = xlo - cutneigh - SMALL * xprd; - mbinxlo = (int) (coord * bininvx); - if (coord < 0.0) { mbinxlo = mbinxlo - 1; } - coord = xhi + cutneigh + SMALL * xprd; - mbinxhi = (int) (coord * bininvx); - - coord = ylo - cutneigh - SMALL * yprd; - mbinylo = (int) (coord * bininvy); - if (coord < 0.0) { mbinylo = mbinylo - 1; } - coord = yhi + cutneigh + SMALL * yprd; - mbinyhi = (int) (coord * bininvy); - - coord = zlo - cutneigh - SMALL * zprd; - mbinzlo = (int) (coord * bininvz); - if (coord < 0.0) { mbinzlo = mbinzlo - 1; } - coord = zhi + cutneigh + SMALL * zprd; - mbinzhi = (int) (coord * bininvz); - - mbinxlo = mbinxlo - 1; - mbinxhi = mbinxhi + 1; - mbinx = mbinxhi - mbinxlo + 1; - - mbinylo = mbinylo - 1; - mbinyhi = mbinyhi + 1; - mbiny = mbinyhi - mbinylo + 1; - - mbinzlo = mbinzlo - 1; - mbinzhi = mbinzhi + 1; - mbinz = mbinzhi - mbinzlo + 1; + pad_x = (int)(cutneigh*bininvx); + while(pad_x * binsizex < FACTOR * cutneigh) pad_x++; + pad_y = (int)(cutneigh*bininvy); + while(pad_y * binsizey < FACTOR * cutneigh) pad_y++; + pad_z = (int)(cutneigh*bininvz); + while(pad_z * binsizez < FACTOR * cutneigh) pad_z++; nextx = (int) (cutneigh * bininvx); - if(nextx * binsizex < FACTOR * cutneigh) nextx++; + if(nextx * binsizex < FACTOR * cutneigh){ + nextx++; + pad_x++; + } nexty = (int) (cutneigh * bininvy); - if(nexty * binsizey < FACTOR * cutneigh) nexty++; + if(nexty * binsizey < FACTOR * cutneigh){ + nexty++; + pad_y++; + } nextz = (int) (cutneigh * bininvz); - if(nextz * binsizez < FACTOR * cutneigh) nextz++; + if(nextz * binsizez < FACTOR * cutneigh){ + nextz++; + pad_z++; + } + + mbinx = nbinx+4*pad_x; + mbiny = nbiny+4*pad_y; + mbinz = nbinz+4*pad_z; if (stencil) { free(stencil); } stencil = (int*) malloc((2 * nextz + 1) * (2 * nexty + 1) * (2 * nextx + 1) * sizeof(int)); nstencil = 0; + int kstart = -nextz; - + int jstart = -nexty; + int istart = -nextx; + int ibin = 0; for(int k = kstart; k <= nextz; k++) { - for(int j = -nexty; j <= nexty; j++) { - for(int i = -nextx; i <= nextx; i++) { - if(bindist(i, j, k) < cutneighsq) { - stencil[nstencil++] = k * mbiny * mbinx + j * mbinx + i; + for(int j = jstart; j <= nexty; j++) { + for(int i = istart; i <= nextx; i++) { + if(bindist(i, j, k) < cutneighsq) { + int jbin = k * mbiny * mbinx + j * mbinx + i; + if(ibin>jbin && half_stencil) continue; + stencil[nstencil++] = jbin; } } } @@ -154,8 +178,7 @@ void setupNeighbor(Parameter* param) { } void buildNeighbor_cpu(Atom *atom, Neighbor *neighbor) { - int nall = atom->Nlocal + atom->Nghost; - + int nall = atom->Nlocal + atom->Nghost; /* extend atom arrays if necessary */ if(nall > nmax) { nmax = nall; @@ -164,16 +187,13 @@ void buildNeighbor_cpu(Atom *atom, Neighbor *neighbor) { neighbor->numneigh = (int*) malloc(nmax * sizeof(int)); neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*)); } - /* bin local & ghost atoms */ binatoms(atom); int resize = 1; - /* loop over each atom, storing neighbors */ while(resize) { int new_maxneighs = neighbor->maxneighs; resize = 0; - for(int i = 0; i < atom->Nlocal; i++) { int* neighptr = &(neighbor->neighbors[i * neighbor->maxneighs]); int n = 0; @@ -184,21 +204,22 @@ void buildNeighbor_cpu(Atom *atom, Neighbor *neighbor) { #ifdef EXPLICIT_TYPES int type_i = atom->type[i]; #endif + for(int k = 0; k < nstencil; k++) { int jbin = ibin + stencil[k]; int* loc_bin = &bins[jbin * atoms_per_bin]; - - for(int m = 0; m < bincount[jbin]; m++) { + for(int m = 0; m < bincount[jbin]; m++) { int j = loc_bin[m]; - if((j == i) || (neighbor->half_neigh && (j < i))) { - continue; - } - + + if((j==i) || (neighbor->half_neigh && (jtype[j]; const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j]; @@ -210,8 +231,8 @@ void buildNeighbor_cpu(Atom *atom, Neighbor *neighbor) { } } } - neighbor->numneigh[i] = n; + if(n >= neighbor->maxneighs) { resize = 1; @@ -220,14 +241,15 @@ void buildNeighbor_cpu(Atom *atom, Neighbor *neighbor) { } } } - if(resize) { - printf("RESIZE %d\n", neighbor->maxneighs); + printf("RESIZE %d, PROC %d\n", neighbor->maxneighs,me); neighbor->maxneighs = new_maxneighs * 1.2; free(neighbor->neighbors); neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int)); } } + + if(method == eightShell) neighborGhost(atom, neighbor); } /* internal subroutines */ @@ -257,44 +279,28 @@ MD_FLOAT bindist(int i, int j, int k) { } else { delz = (k + 1) * binsizez; } - return (delx * delx + dely * dely + delz * delz); } int coord2bin(MD_FLOAT xin, MD_FLOAT yin, MD_FLOAT zin) { - int ix, iy, iz; - - if(xin >= xprd) { - ix = (int)((xin - xprd) * bininvx) + nbinx - mbinxlo; - } else if(xin >= 0.0) { - ix = (int)(xin * bininvx) - mbinxlo; - } else { - ix = (int)(xin * bininvx) - mbinxlo - 1; - } - - if(yin >= yprd) { - iy = (int)((yin - yprd) * bininvy) + nbiny - mbinylo; - } else if(yin >= 0.0) { - iy = (int)(yin * bininvy) - mbinylo; - } else { - iy = (int)(yin * bininvy) - mbinylo - 1; - } - - if(zin >= zprd) { - iz = (int)((zin - zprd) * bininvz) + nbinz - mbinzlo; - } else if(zin >= 0.0) { - iz = (int)(zin * bininvz) - mbinzlo; - } else { - iz = (int)(zin * bininvz) - mbinzlo - 1; - } - - return (iz * mbiny * mbinx + iy * mbinx + ix + 1); + int ix, iy, iz; + MD_FLOAT eps = 1e-9; + MD_FLOAT xlo=0.0; MD_FLOAT ylo=0.0; MD_FLOAT zlo=0.0; + xlo = fabs(xlo - pad_x*binsizex)+eps; + ylo = fabs(ylo - pad_y*binsizey)+eps; + zlo = fabs(zlo - pad_z*binsizez)+eps; + ix = (int) ((xin + xlo)*bininvx); + iy = (int) ((yin + ylo)*bininvy); + iz = (int) ((zin + zlo)*bininvz); + + return (iz * mbiny * mbinx + iy * mbinx + ix); + //return (iz * mbiny * mbinx + iy * mbinx + ix + 1); } -void binatoms(Atom *atom) { +void binatoms(Atom *atom) { int nall = atom->Nlocal + atom->Nghost; int resize = 1; - + while(resize > 0) { resize = 0; @@ -304,7 +310,7 @@ void binatoms(Atom *atom) { for(int i = 0; i < nall; i++) { int ibin = coord2bin(atom_x(i), atom_y(i), atom_z(i)); - + if(shellMethod && !ghostZone(atom, i)) continue; if(bincount[ibin] < atoms_per_bin) { int ac = bincount[ibin]++; bins[ibin * atoms_per_bin + ac] = i; @@ -325,11 +331,9 @@ void sortAtom(Atom* atom) { binatoms(atom); int Nmax = atom->Nmax; int* binpos = bincount; - for(int i = 1; i < mbins; i++) { binpos[i] += binpos[i - 1]; } - #ifdef AOS MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3); MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3); @@ -367,7 +371,6 @@ void sortAtom(Atom* atom) { #endif } } - free(atom->x); free(atom->vx); atom->x = new_x; @@ -383,3 +386,158 @@ void sortAtom(Atom* atom) { atom->vz = new_vz; #endif } + +/* internal subroutines +Added with MPI*/ + +static int ghostZone(Atom* atom, int i){ + if(iNlocal) return 1; + else if(method == halfShell) return halfZone(atom,i); + else if(method == eightShell) return eightZone(atom,i); + else return 0; +} + +static int eightZone(Atom* atom, int i) +{ + //Mapping: 0->0, 1->1, 2->2, 3->6, 4->3, 5->5, 6->4, 7->7 + int zoneMapping[] = {0, 1, 2, 6, 3, 5, 4, 7}; + MD_FLOAT *hi = atom->mybox.hi; + int zone = 0; + + if(BigOrEqual(atom_x(i),hi[_x])) { + zone += 1; + } + if(BigOrEqual(atom_y(i),hi[_y])) { + zone += 2; + } + if(BigOrEqual(atom_z(i),hi[_z])) { + zone += 4; + } + return zoneMapping[zone]; +} + +static int halfZone(Atom* atom, int i) +{ + MD_FLOAT *hi = atom->mybox.hi; + MD_FLOAT *lo = atom->mybox.lo; + + if(atom_x(i)Nlocal; + int Nghost = atom->Nghost; + if(neighbor->listshell) free(neighbor->listshell); + neighbor->listshell = (int*) malloc(Nghost * sizeof(int)); + int* listzone = (int*) malloc(8 * Nghost * sizeof(int)); + int countAtoms[8] = {0,0,0,0,0,0,0,0}; + + //Selecting ghost atoms for interaction + for(int i = Nlocal; i < Nlocal+Nghost; i++) { + int izone = ghostZone(atom,i); + int *list = &listzone[Nghost*izone]; + int n = countAtoms[izone]; + list[n] = i; + countAtoms[izone]++; + } + + for(int zone = 1; zone<=3; zone++){ + int *list = &listzone[Nghost*zone]; + for(int n=0; nlistshell[Nshell++] = list[n]; + } + + neighbor->Nshell = Nshell; + if(neighbor->numNeighShell) free(neighbor->numNeighShell); + if(neighbor->neighshell) free(neighbor->neighshell); + neighbor->neighshell = (int*) malloc(Nshell * neighbor->maxneighs * sizeof(int)); + neighbor->numNeighShell = (int*) malloc(Nshell * sizeof(int)); + int resize = 1; + + while(resize) + { + resize = 0; + for(int i = 0; i < Nshell; i++) { + int *neighshell = &(neighbor->neighshell[i*neighbor->maxneighs]); + int n = 0; + int iatom = neighbor->listshell[i]; + int izone = ghostZone(atom, iatom); + MD_FLOAT xtmp = atom_x(iatom); + MD_FLOAT ytmp = atom_y(iatom); + MD_FLOAT ztmp = atom_z(iatom); + int ibin = coord2bin(xtmp, ytmp, ztmp); + + #ifdef EXPLICIT_TYPES + int type_i = atom->type[iatom]; + #endif + + for(int k = 0; k < nstencil; k++) { + int jbin = ibin + stencil[k]; + int* loc_bin = &bins[jbin * atoms_per_bin]; + for(int m = 0; m < bincount[jbin]; m++) { + int jatom = loc_bin[m]; + + int jzone = ghostZone(atom,jatom); + + if(jzone <=izone) continue; + if(izone == 1 && (jzone==5||jzone==6||jzone==7)) continue; + if(izone == 2 && (jzone==4||jzone==6||jzone==7)) continue; + if(izone == 3 && (jzone==4||jzone==5||jzone==7)) continue; + + MD_FLOAT delx = xtmp - atom_x(jatom); + MD_FLOAT dely = ytmp - atom_y(jatom); + MD_FLOAT delz = ztmp - atom_z(jatom); + MD_FLOAT rsq = delx * delx + dely * dely + delz * delz; + + #ifdef EXPLICIT_TYPES + int type_j = atom->type[jatom]; + const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j]; + #else + const MD_FLOAT cutoff = cutneighsq; + #endif + if(rsq <= cutoff) { + neighshell[n++] = jatom; + } + } + } + + neighbor->numNeighShell[i] = n; + if(n >= neighbor->maxneighs){ + resize = 1; + neighbor->maxneighs = n * 1.2; + break; + } + } + + if(resize) { + free(neighbor->neighshell); + neighbor->neighshell = (int*) malloc(Nshell * neighbor->maxneighs * sizeof(int)); + } + } + free(listzone); +} + +static inline int interaction(Atom* atom, int i, int j) { + + if(iNlocal) { + return 1; + } else if( atom_z(j)>atom_z(i) && j>=atom->Nlocal) { + return 1; + } else if(Equal(atom_z(j),atom_z(i)) && atom_y(j)=atom->Nlocal){ + return 1; + } else if(Equal(atom_z(j),atom_z(i)) && Equal(atom_y(j),atom_y(i)) && atom_x(j)=atom->Nlocal){ + return 1; + } else { + return 0; + } +} + \ No newline at end of file diff --git a/lammps/vtk.c b/lammps/vtk.c index c02ba1e..facee43 100644 --- a/lammps/vtk.c +++ b/lammps/vtk.c @@ -6,8 +6,12 @@ */ #include #include +#include +#include +#include -#include +static MPI_File _fh; +static inline void flushBuffer(char*); int write_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) { char timestep_filename[128]; @@ -18,12 +22,12 @@ int write_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) { fprintf(stderr, "Could not open VTK file for writing!\n"); return -1; } - fprintf(fp, "# vtk DataFile Version 2.0\n"); fprintf(fp, "Particle data\n"); fprintf(fp, "ASCII\n"); fprintf(fp, "DATASET UNSTRUCTURED_GRID\n"); fprintf(fp, "POINTS %d double\n", atom->Nlocal); + for(int i = 0; i < atom->Nlocal; ++i) { fprintf(fp, "%.4f %.4f %.4f\n", atom_x(i), atom_y(i), atom_z(i)); } @@ -48,3 +52,168 @@ int write_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) { fclose(fp); return 0; } + +int vtkOpen(const char* filename, Comm* comm, Atom* atom ,int timestep) +{ + char msg[256]; + char timestep_filename[128]; + snprintf(timestep_filename, sizeof timestep_filename, "%s_%d.vtk", filename, timestep); + MPI_File_open(MPI_COMM_WORLD, timestep_filename, MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &_fh); + if(_fh == MPI_FILE_NULL) { + if(comm->myproc == 0) fprintf(stderr, "Could not open VTK file for writing!\n"); + return -1; + } + + if (comm->myproc==0){ + sprintf(msg, "# vtk DataFile Version 2.0\n"); + sprintf(msg, "%sParticle data\n",msg); + sprintf(msg, "%sASCII\n",msg); + sprintf(msg, "%sDATASET UNSTRUCTURED_GRID\n",msg); + sprintf(msg, "%sPOINTS %d double\n",msg, atom->Natoms); + flushBuffer(msg); + } +} + +int vtkVector(Comm* comm, Atom* atom, Parameter* param) +{ + if (_fh == MPI_FILE_NULL) { + if(comm->myproc==0) printf("vtk not initialize! Call vtkOpen first!\n"); + return -1; + } + + int sizeline= 25; //#initial guess of characters in "%.4f %.4f %.4f\n" + int extrabuff = 100; + int sizebuff = sizeline*atom->Nlocal+extrabuff; + int mysize = 0; + char* msg = (char*) malloc(sizebuff); + sprintf(msg, ""); + for(int i = 0; i < atom->Nlocal; i++){ + if(mysize+extrabuff >= sizebuff){ + sizebuff*= 1.5; + msg = (char*) realloc(msg, sizebuff); + } + //TODO: do not forget to add param->xlo, param->ylo, param->zlo + sprintf(msg, "%s%.4f %.4f %.4f\n",msg, atom_x(i), atom_y(i), atom_z(i)); + mysize = strlen(msg); + } + int gatherSize[comm->numproc]; + + MPI_Allgather(&mysize, 1, MPI_INT, gatherSize, 1, MPI_INT, MPI_COMM_WORLD); + int offset=0; + int globalSize = 0; + + for(int i = 0; i < comm->myproc; i++) + offset+= gatherSize[i]; + + for(int i = 0; i < comm->numproc; i++) + globalSize+= gatherSize[i]; + + MPI_Offset displ; + MPI_Datatype FileType; + int GlobalSize[] = {globalSize}; + int LocalSize[] = {mysize}; + int Start[] = {offset}; + + if(LocalSize[0]>0){ + MPI_Type_create_subarray(1, GlobalSize, LocalSize, Start, MPI_ORDER_C, MPI_CHAR, &FileType); + } else { + MPI_Type_vector(0,0,0,MPI_CHAR,&FileType); + } + MPI_Type_commit(&FileType); + MPI_File_get_size(_fh, &displ); + MPI_File_set_view(_fh, displ, MPI_CHAR, FileType, "native", MPI_INFO_NULL); + MPI_File_write_all (_fh, msg, mysize , MPI_CHAR ,MPI_STATUS_IGNORE); + MPI_Barrier(MPI_COMM_WORLD); + MPI_File_set_view(_fh,0,MPI_CHAR, MPI_CHAR, "native", MPI_INFO_NULL); + + if (comm->myproc==0){ + + sprintf(msg, "\n\n"); + sprintf(msg, "%sCELLS %d %d\n", msg, atom->Natoms, atom->Natoms * 2); + + for(int i = 0; i < atom->Natoms; i++) + sprintf(msg, "%s1 %d\n", msg, i); + flushBuffer(msg); + + sprintf(msg, "\n\n"); + sprintf(msg, "%sCELL_TYPES %d\n",msg, atom->Natoms); + for(int i = 0; i < atom->Natoms; i++) + sprintf(msg, "%s1\n",msg); + flushBuffer(msg); + + sprintf(msg, "\n\n"); + sprintf(msg, "%sPOINT_DATA %d\n",msg,atom->Natoms); + sprintf(msg, "%sSCALARS mass double\n",msg); + sprintf(msg, "%sLOOKUP_TABLE default\n",msg); + for(int i = 0; i < atom->Natoms; i++) + sprintf(msg, "%s1.0\n",msg); + sprintf(msg, "%s\n\n",msg); + flushBuffer(msg); + } +} + +void vtkClose() +{ + MPI_File_close(&_fh); + _fh=MPI_FILE_NULL; +} + +int printGhost(const char* filename, Atom* atom, int timestep, int me) { + char timestep_filename[128]; + snprintf(timestep_filename, sizeof timestep_filename, "%s_%d_ghost%i.vtk", filename, timestep,me); + FILE* fp = fopen(timestep_filename, "wb"); + + if(fp == NULL) { + fprintf(stderr, "Could not open VTK file for writing!\n"); + return -1; + } + fprintf(fp, "# vtk DataFile Version 2.0\n"); + fprintf(fp, "Particle data\n"); + fprintf(fp, "ASCII\n"); + fprintf(fp, "DATASET UNSTRUCTURED_GRID\n"); + fprintf(fp, "POINTS %d double\n", atom->Nghost); + + for(int i = atom->Nlocal; i < atom->Nlocal+atom->Nghost; ++i) { + fprintf(fp, "%.4f %.4f %.4f\n", atom_x(i), atom_y(i), atom_z(i)); + } + fprintf(fp, "\n\n"); + fprintf(fp, "CELLS %d %d\n", atom->Nlocal, atom->Nlocal * 2); + for(int i = atom->Nlocal; i < atom->Nlocal+atom->Nghost; ++i) { + fprintf(fp, "1 %d\n", i); + } + fprintf(fp, "\n\n"); + fprintf(fp, "CELL_TYPES %d\n", atom->Nlocal); + for(int i = atom->Nlocal; i < atom->Nlocal+atom->Nghost; ++i) { + fprintf(fp, "1\n"); + } + fprintf(fp, "\n\n"); + fprintf(fp, "POINT_DATA %d\n", atom->Nghost); + fprintf(fp, "SCALARS mass double\n"); + fprintf(fp, "LOOKUP_TABLE default\n"); + for(int i = atom->Nlocal; i < atom->Nlocal+atom->Nghost; i++) { + fprintf(fp, "1.0\n"); + } + fprintf(fp, "\n\n"); + fclose(fp); + return 0; +} + +void printvtk(const char* filename, Comm* comm, Atom* atom ,Parameter* param, int timestep) +{ + if(comm->numproc == 1) + { + write_atoms_to_vtk_file(filename, atom, timestep); + return; + } + + vtkOpen(filename, comm, atom, timestep); + vtkVector(comm, atom, param); + vtkClose(); + //printGhost(filename, atom, timestep, comm->myproc); +} + +static inline void flushBuffer(char* msg){ + MPI_Offset displ; + MPI_File_get_size(_fh, &displ); + MPI_File_write_at(_fh, displ, msg, strlen(msg), MPI_CHAR, MPI_STATUS_IGNORE); +} \ No newline at end of file