From 87d006d41842f39b84890c667952cca9dde5b578 Mon Sep 17 00:00:00 2001 From: Rafael Ravedutti Date: Thu, 11 Aug 2022 16:42:41 +0200 Subject: [PATCH] Fix GPU version Signed-off-by: Rafael Ravedutti --- lammps/allocate.c | 7 ++++--- lammps/cuda/neighbor.cu | 35 ++++++++++++++++------------------- lammps/cuda/pbc.cu | 17 ++++++++--------- lammps/neighbor.c | 30 +++++++++++++++--------------- lammps/pbc.c | 8 +++----- 5 files changed, 46 insertions(+), 51 deletions(-) diff --git a/lammps/allocate.c b/lammps/allocate.c index 8c885c3..d57e69a 100644 --- a/lammps/allocate.c +++ b/lammps/allocate.c @@ -59,14 +59,15 @@ void *reallocate(void* ptr, int alignment, size_t newBytesize, size_t oldBytesiz return newarray; } - -#ifndef CUDA_TARGET +#ifdef CUDA_TARGET void *allocate_gpu(int alignment, size_t bytesize) { return NULL; } void *reallocate_gpu(void *ptr, int alignment, size_t newBytesize, size_t oldBytesize) { return NULL; } #else +#include +#include void *allocate_gpu(int alignment, size_t bytesize) { void *ptr; - checkCUDAError("allocate_gpu", cudaMallocHost((void **) &ptr, bytesize)); + checkCUDAError("allocate", cudaMallocHost((void **) &ptr, bytesize)); return ptr; } diff --git a/lammps/cuda/neighbor.cu b/lammps/cuda/neighbor.cu index e1ade7d..4af6e17 100644 --- a/lammps/cuda/neighbor.cu +++ b/lammps/cuda/neighbor.cu @@ -38,17 +38,17 @@ extern "C" { } -static MD_FLOAT xprd, yprd, zprd; -static MD_FLOAT bininvx, bininvy, bininvz; -static int mbinxlo, mbinylo, mbinzlo; -static int nbinx, nbiny, nbinz; -static int mbinx, mbiny, mbinz; // n bins in x, y, z -static int mbins; //total number of bins -static int atoms_per_bin; // max atoms per bin -static MD_FLOAT cutneighsq; // neighbor cutoff squared -static int nmax; -static int nstencil; // # of bins in stencil -static int* stencil; // stencil list of bin offsets +extern MD_FLOAT xprd, yprd, zprd; +extern MD_FLOAT bininvx, bininvy, bininvz; +extern int mbinxlo, mbinylo, mbinzlo; +extern int nbinx, nbiny, nbinz; +extern int mbinx, mbiny, mbinz; // n bins in x, y, z +extern int mbins; //total number of bins +extern int atoms_per_bin; // max atoms per bin +extern MD_FLOAT cutneighsq; // neighbor cutoff squared +extern int nmax; +extern int nstencil; // # of bins in stencil +extern int* stencil; // stencil list of bin offsets static int* c_stencil = NULL; static int* c_resize_needed = NULL; static int* c_new_maxneighs = NULL; @@ -59,7 +59,6 @@ static Binning c_binning { .atoms_per_bin = 0 }; - __device__ int coord2bin_device(MD_FLOAT xin, MD_FLOAT yin, MD_FLOAT zin, Neighbor_params np) { int ix, iy, iz; @@ -115,7 +114,7 @@ __global__ void sort_bin_contents_kernel(int* bincount, int* bins, int mbins, in } while (!sorted); } -__global__ void binatoms_kernel(Atom a, int* bincount, int* bins, int atoms_per_bin, Neighbor_params np, int *resize_needed){ +__global__ void binatoms_kernel(Atom a, int* bincount, int* bins, int atoms_per_bin, Neighbor_params np, int *resize_needed) { Atom* atom = &a; const int i = blockIdx.x * blockDim.x + threadIdx.x; int nall = atom->Nlocal + atom->Nghost; @@ -127,7 +126,6 @@ __global__ void binatoms_kernel(Atom a, int* bincount, int* bins, int atoms_per_ MD_FLOAT y = atom_y(i); MD_FLOAT z = atom_z(i); int ibin = coord2bin_device(x, y, z, np); - int ac = atomicAdd(&bincount[ibin], 1); if(ac < atoms_per_bin){ @@ -138,7 +136,7 @@ __global__ void binatoms_kernel(Atom a, int* bincount, int* bins, int atoms_per_ } __global__ void compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np, int nstencil, int* stencil, - int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq){ + int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq) { const int i = blockIdx.x * blockDim.x + threadIdx.x; const int Nlocal = a.Nlocal; if( i >= Nlocal ) { @@ -189,7 +187,6 @@ __global__ void compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np, } neighbor->numneigh[i] = n; - if(n > neighbor->maxneighs) { atomicMax(new_maxneighs, n); } @@ -304,8 +301,8 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor * c_new_maxneighs, cutneighsq); - checkCUDAError( "PeekAtLastError ComputeNeighbor", cudaPeekAtLastError() ); - checkCUDAError( "DeviceSync ComputeNeighbor", cudaDeviceSynchronize() ); + checkCUDAError( "PeekAtLastError ComputeNeighbor", cudaPeekAtLastError() ); + checkCUDAError( "DeviceSync ComputeNeighbor", cudaDeviceSynchronize() ); // TODO copy the value of c_new_maxneighs back to host and check if it has been modified int new_maxneighs; @@ -323,7 +320,7 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor * } } - neighbor->maxneighs = c_neighbor->maxneighs; + neighbor->maxneighs = c_neighbor->maxneighs; cudaProfilerStop(); } diff --git a/lammps/cuda/pbc.cu b/lammps/cuda/pbc.cu index 24b680b..3490fae 100644 --- a/lammps/cuda/pbc.cu +++ b/lammps/cuda/pbc.cu @@ -34,16 +34,15 @@ extern "C" { } -static int NmaxGhost; -static int *PBCx, *PBCy, *PBCz; -static int c_NmaxGhost = 0; -static int *c_PBCx = NULL, *c_PBCy = NULL, *c_PBCz = NULL; - +extern int NmaxGhost; +extern int *PBCx, *PBCy, *PBCz; +static int c_NmaxGhost; +static int *c_PBCx, *c_PBCy, *c_PBCz; __global__ void computeAtomsPbcUpdate(Atom a, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){ const int i = blockIdx.x * blockDim.x + threadIdx.x; Atom* atom = &a; - if( i >= atom->Nlocal ){ + if(i >= atom->Nlocal) { return; } @@ -69,9 +68,10 @@ __global__ void computeAtomsPbcUpdate(Atom a, MD_FLOAT xprd, MD_FLOAT yprd, MD_F __global__ void computePbcUpdate(Atom a, int* PBCx, int* PBCy, int* PBCz, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){ const int i = blockIdx.x * blockDim.x + threadIdx.x; const int Nghost = a.Nghost; - if( i >= Nghost ) { + if(i >= Nghost) { return; } + Atom* atom = &a; int *border_map = atom->border_map; int nlocal = atom->Nlocal; @@ -86,7 +86,7 @@ __global__ void computePbcUpdate(Atom a, int* PBCx, int* PBCy, int* PBCz, MD_FLO void updatePbc_cuda(Atom *atom, Atom *c_atom, Parameter *param, bool doReneighbor) { const int num_threads_per_block = get_num_threads(); - if (doReneighbor){ + if (doReneighbor) { c_atom->Natoms = atom->Natoms; c_atom->Nlocal = atom->Nlocal; c_atom->Nghost = atom->Nghost; @@ -146,6 +146,5 @@ void updateAtomsPbc_cuda(Atom* atom, Atom *c_atom, Parameter *param){ checkCUDAError( "PeekAtLastError UpdateAtomsPbc", cudaPeekAtLastError() ); checkCUDAError( "DeviceSync UpdateAtomsPbc", cudaDeviceSynchronize() ); - checkCUDAError( "updateAtomsPbc position memcpy back", cudaMemcpy(atom->x, c_atom->x, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) ); } diff --git a/lammps/neighbor.c b/lammps/neighbor.c index cbdb150..c3e19ca 100644 --- a/lammps/neighbor.c +++ b/lammps/neighbor.c @@ -31,21 +31,21 @@ #define SMALL 1.0e-6 #define FACTOR 0.999 -static MD_FLOAT xprd, yprd, zprd; -static MD_FLOAT bininvx, bininvy, bininvz; -static int mbinxlo, mbinylo, mbinzlo; -static int nbinx, nbiny, nbinz; -static int mbinx, mbiny, mbinz; // n bins in x, y, z -static int *bincount; -static int *bins; -static int mbins; //total number of bins -static int atoms_per_bin; // max atoms per bin -static MD_FLOAT cutneigh; -static MD_FLOAT cutneighsq; // neighbor cutoff squared -static int nmax; -static int nstencil; // # of bins in stencil -static int* stencil; // stencil list of bin offsets -static MD_FLOAT binsizex, binsizey, binsizez; +MD_FLOAT xprd, yprd, zprd; +MD_FLOAT bininvx, bininvy, bininvz; +int mbinxlo, mbinylo, mbinzlo; +int nbinx, nbiny, nbinz; +int mbinx, mbiny, mbinz; // n bins in x, y, z +int *bincount; +int *bins; +int mbins; //total number of bins +int atoms_per_bin; // max atoms per bin +MD_FLOAT cutneigh; +MD_FLOAT cutneighsq; // neighbor cutoff squared +int nmax; +int nstencil; // # of bins in stencil +int* stencil; // stencil list of bin offsets +MD_FLOAT binsizex, binsizey, binsizez; static int coord2bin(MD_FLOAT, MD_FLOAT , MD_FLOAT); static MD_FLOAT bindist(int, int, int); diff --git a/lammps/pbc.c b/lammps/pbc.c index d1e80bb..ab1b867 100644 --- a/lammps/pbc.c +++ b/lammps/pbc.c @@ -30,8 +30,8 @@ #define DELTA 20000 -static int NmaxGhost; -static int *PBCx, *PBCy, *PBCz; +int NmaxGhost; +int *PBCx, *PBCy, *PBCz; static void growPbc(Atom*); @@ -66,7 +66,6 @@ void updateAtomsPbc_cpu(Atom *atom, Atom *c_atom, Parameter *param) { MD_FLOAT zprd = param->zprd; for(int i = 0; i < atom->Nlocal; i++) { - if(atom_x(i) < 0.0) { atom_x(i) += xprd; } else if(atom_x(i) >= xprd) { @@ -177,8 +176,7 @@ void setupPbc(Atom *atom, Parameter *param) { } /* internal subroutines */ -void growPbc(Atom* atom) -{ +void growPbc(Atom* atom) { int nold = NmaxGhost; NmaxGhost += DELTA;