From c49278cb21e3b0cc26f873ef382b80ca578b799c Mon Sep 17 00:00:00 2001
From: Martin Bauernfeind
Date: Sun, 26 Jun 2022 16:25:59 +0200
Subject: [PATCH] First crude attempt at parallelizing neighborhood
 computation (only the part after binning the atoms is parallelized with
 CUDA)

---
 src/force.cu            |  35 +++--
 src/includes/neighbor.h |   1 +
 src/main.c              |  50 +++++++++++++++-----
 src/neighbor.cu         | 102 +++++++++++++++++++++++++++++++++-------
 4 files changed, 130 insertions(+), 58 deletions(-)

diff --git a/src/force.cu b/src/force.cu
index e7ef535..6f27d56 100644
--- a/src/force.cu
+++ b/src/force.cu
@@ -126,25 +126,9 @@ extern "C" {
 
-int get_num_threads() {
-
-    const char *num_threads_env = getenv("NUM_THREADS");
-    int num_threads = 0;
-    if(num_threads_env == nullptr)
-        num_threads = 32;
-    else {
-        num_threads = atoi(num_threads_env);
-    }
-
-    return num_threads;
-}
-
-void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom) {
+void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom, const int num_threads_per_block) {
     const int Nlocal = atom->Nlocal;
-    const int num_threads = get_num_threads();
-
-    const int num_threads_per_block = num_threads; // this should be a multiple of 32 as operations are performed at the level of warps
     const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
 
     kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, *c_atom);
@@ -157,12 +141,9 @@ void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom
     }
 }
 
-void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom) {
+void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom, const int num_threads_per_block) {
     const int Nlocal = atom->Nlocal;
-    const int num_threads = get_num_threads();
-
-    const int num_threads_per_block = num_threads; // this should be a multiple of 32 as operations are performed at the level of warps
     const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
 
     kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, *c_atom);
@@ -182,7 +163,8 @@ double computeForce(
     Atom *atom,
     Neighbor *neighbor,
     Atom *c_atom,
-    Neighbor *c_neighbor
+    Neighbor *c_neighbor,
+    int num_threads_per_block
 ) {
     int Nlocal = atom->Nlocal;
@@ -192,8 +174,6 @@ double computeForce(
     MD_FLOAT epsilon = param->epsilon;
 #endif
 
-    const int num_threads = get_num_threads();
-
     c_atom->Natoms = atom->Natoms;
     c_atom->Nlocal = atom->Nlocal;
     c_atom->Nghost = atom->Nghost;
@@ -219,14 +199,11 @@ double computeForce(
 
     cudaProfilerStart();
 
-    checkCUDAError( "c_atom->x memcpy", cudaMemcpy(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
-    if(reneighbourHappenend) {
-        checkCUDAError( "c_neighbor->numneigh memcpy", cudaMemcpy(c_neighbor->numneigh, neighbor->numneigh, sizeof(int) * Nlocal, cudaMemcpyHostToDevice) );
-        checkCUDAError( "c_neighbor->neighbors memcpy", cudaMemcpy(c_neighbor->neighbors, neighbor->neighbors, sizeof(int) * Nlocal * neighbor->maxneighs, cudaMemcpyHostToDevice) );
+    if(!reneighbourHappenend) {
+        checkCUDAError( "c_atom->x memcpy", cudaMemcpy(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
     }
 
-    const int num_threads_per_block = num_threads; // this should be a multiple of 32 as operations are performed at the level of warps
     const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
 
     double S = getTimeStamp();
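Note on the launch configuration: all three launch sites above compute the grid size as ceil((float)N / (float)threads). A float-free integer ceiling division is a common alternative; a minimal sketch (the helper name ceil_div is illustrative, not part of this patch):

    // Integer ceiling division: rounds N/threads up without a float round-trip.
    static inline int ceil_div(int n, int threads_per_block) {
        return (n + threads_per_block - 1) / threads_per_block;
    }

    // e.g.  const int num_blocks = ceil_div(Nlocal, num_threads_per_block);

For the atom counts used here both forms agree; the integer form simply cannot mis-round once N exceeds the 24-bit float mantissa (about 16.7 million).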
diff --git a/src/includes/neighbor.h b/src/includes/neighbor.h
index 17b7bb4..014ece2 100644
--- a/src/includes/neighbor.h
+++ b/src/includes/neighbor.h
@@ -46,4 +46,5 @@ extern void setupNeighbor();
 extern void binatoms(Atom*);
 extern void buildNeighbor(Atom*, Neighbor*);
 extern void sortAtom(Atom*);
+extern void buildNeighbor_cuda(Atom*, Neighbor*, Atom*, Neighbor*, const int);
 #endif
diff --git a/src/main.c b/src/main.c
index 35537df..30fb374 100644
--- a/src/main.c
+++ b/src/main.c
@@ -45,10 +45,14 @@
 
 #define HLINE "----------------------------------------------------------------------------\n"
 
-extern void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom);
-extern void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom);
+extern void cuda_final_integrate(bool doReneighbour, Parameter *param,
+                                 Atom *atom, Atom *c_atom,
+                                 const int num_threads_per_block);
+extern void cuda_initial_integrate(bool doReneighbour, Parameter *param,
+                                   Atom *atom, Atom *c_atom,
+                                   const int num_threads_per_block);
 
-extern double computeForce(bool, Parameter*, Atom*, Neighbor*, Atom*, Neighbor*);
+extern double computeForce(bool, Parameter*, Atom*, Neighbor*, Atom*, Neighbor*, const int);
 extern double computeForceTracing(Parameter*, Atom*, Neighbor*, Stats*, int, int);
 extern double computeForceEam(Eam* eam, Parameter*, Atom *atom, Neighbor *neighbor, Stats *stats, int first_exec, int timestep);
 
@@ -111,7 +115,8 @@ double setup(
     Neighbor *neighbor,
     Atom *c_atom,
     Neighbor *c_neighbor,
-    Stats *stats)
+    Stats *stats,
+    const int num_threads_per_block)
 {
     if(param->force_field == FF_EAM) { initEam(eam, param); }
     double S, E;
@@ -131,7 +136,7 @@ double setup(
     adjustThermo(param, atom);
     setupPbc(atom, param);
     updatePbc(atom, param);
-    buildNeighbor(atom, neighbor);
+    buildNeighbor_cuda(atom, neighbor, c_atom, c_neighbor, num_threads_per_block);
     E = getTimeStamp();
 
     initCudaAtom(atom, neighbor, c_atom, c_neighbor);
@@ -142,7 +147,10 @@ double setup(
 double reneighbour(
     Parameter *param,
     Atom *atom,
-    Neighbor *neighbor)
+    Neighbor *neighbor,
+    Atom *c_atom,
+    Neighbor *c_neighbor,
+    const int num_threads_per_block)
 {
     double S, E;
 
@@ -152,7 +160,7 @@ double reneighbour(
     setupPbc(atom, param);
     updatePbc(atom, param);
     //sortAtom(atom);
-    buildNeighbor(atom, neighbor);
+    buildNeighbor_cuda(atom, neighbor, c_atom, c_neighbor, num_threads_per_block);
     LIKWID_MARKER_STOP("reneighbour");
     E = getTimeStamp();
@@ -206,6 +214,19 @@ const char* ff2str(int ff)
     return "invalid";
 }
 
+int get_num_threads() {
+
+    const char *num_threads_env = getenv("NUM_THREADS");
+    int num_threads = 0;
+    if(num_threads_env == NULL)
+        num_threads = 32;
+    else {
+        num_threads = atoi(num_threads_env);
+    }
+
+    return num_threads;
+}
+
 int main(int argc, char** argv)
 {
     double timer[NUMTIMER];
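get_num_threads() accepts whatever NUM_THREADS contains (e.g. exporting NUM_THREADS=128 before launching the binary), defaulting to 32, and the call site in main() below only notes in a comment that the value should be a multiple of 32. A hedged sketch of how the parsing could enforce that invariant (the function name is hypothetical, not part of this patch; it assumes the stdlib includes already present in main.c):

    /* Hypothetical hardened variant: clamp NUM_THREADS to a positive
     * multiple of the warp size (32). atoi() returns 0 for unparsable
     * input, which the lower clamp also catches. */
    int get_num_threads_checked() {
        const char *env = getenv("NUM_THREADS");
        int n = (env == NULL) ? 32 : atoi(env);
        if(n < 32) n = 32;
        return n - (n % 32);   /* round down to a warp multiple */
    }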
@@ -286,7 +307,10 @@ int main(int argc, char** argv)
         }
     }
 
-    setup(&param, &eam, &atom, &neighbor, &c_atom, &c_neighbor, &stats);
+    // this should be a multiple of 32 as operations are performed at the level of warps
+    const int num_threads_per_block = get_num_threads();
+
+    setup(&param, &eam, &atom, &neighbor, &c_atom, &c_neighbor, &stats, num_threads_per_block);
     computeThermo(0, &param, &atom);
     if(param.force_field == FF_EAM) {
         computeForceEam(&eam, &param, &atom, &neighbor, &stats, 1, 0);
@@ -294,7 +318,7 @@ int main(int argc, char** argv)
     } else {
 #if defined(MEM_TRACER) || defined(INDEX_TRACER) || defined(COMPUTE_STATS)
         computeForceTracing(&param, &atom, &neighbor, &stats, 1, 0);
 #else
-        computeForce(true, &param, &atom, &neighbor, &c_atom, &c_neighbor);
+        computeForce(true, &param, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block);
 #endif
     }
 
@@ -310,10 +334,10 @@ int main(int argc, char** argv)
 
         const bool doReneighbour = (n + 1) % param.every == 0;
 
-        cuda_initial_integrate(doReneighbour, &param, &atom, &c_atom);
+        cuda_initial_integrate(doReneighbour, &param, &atom, &c_atom, num_threads_per_block);
 
         if(doReneighbour) {
-            timer[NEIGH] += reneighbour(&param, &atom, &neighbor);
+            timer[NEIGH] += reneighbour(&param, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block);
         } else {
             updatePbc(&atom, &param);
         }
@@ -324,11 +348,11 @@ int main(int argc, char** argv)
 #if defined(MEM_TRACER) || defined(INDEX_TRACER) || defined(COMPUTE_STATS)
             timer[FORCE] += computeForceTracing(&param, &atom, &neighbor, &stats, 0, n + 1);
 #else
-            timer[FORCE] += computeForce(doReneighbour, &param, &atom, &neighbor, &c_atom, &c_neighbor);
+            timer[FORCE] += computeForce(doReneighbour, &param, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block);
 #endif
         }
 
-        cuda_final_integrate(doReneighbour, &param, &atom, &c_atom);
+        cuda_final_integrate(doReneighbour, &param, &atom, &c_atom, num_threads_per_block);
 
         if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
             computeThermo(n + 1, &param, &atom);
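Every CUDA runtime call in this patch is wrapped in checkCUDAError(label, err), whose definition lives elsewhere in the tree and is not shown here. A minimal wrapper in that spirit might look like the following sketch (an assumption about its shape, inferred from the call sites, not the project's actual code):

    #include <stdio.h>
    #include <stdlib.h>
    #include <cuda_runtime.h>

    // Sketch only: print the failing call's label and the CUDA error, then abort.
    void checkCUDAError(const char *label, cudaError_t err) {
        if(err != cudaSuccess) {
            fprintf(stderr, "CUDA error at %s: %s\n", label, cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }
    }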
diff --git a/src/neighbor.cu b/src/neighbor.cu
index 89dce5c..feb02b5 100644
--- a/src/neighbor.cu
+++ b/src/neighbor.cu
@@ -67,9 +67,10 @@ __device__ int coord2bin_device(MD_FLOAT xin, MD_FLOAT yin, MD_FLOAT zin,
     return (iz * np.mbiny * np.mbinx + iy * np.mbinx + ix + 1);
 }
 
-__global__ void compute_neighborhood(Atom a, Neighbor neigh, int Nlocal, Neighbor_params np, int nstencil, int* stencil,
+__global__ void compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np, int nstencil, int* stencil,
                                      int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs){
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int Nlocal = a.Nlocal;
     if( i >= Nlocal ) {
         return;
     }
@@ -513,41 +514,110 @@ void sortAtom(Atom* atom) {
 #endif
 }
 
-void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor)
+void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor, const int num_threads_per_block)
 {
     int nall = atom->Nlocal + atom->Nghost;
 
-    /* extend atom arrays if necessary */
+    c_atom->Natoms = atom->Natoms;
+    c_atom->Nlocal = atom->Nlocal;
+    c_atom->Nghost = atom->Nghost;
+    c_atom->Nmax = atom->Nmax;
+    c_atom->ntypes = atom->ntypes;
+
+    c_neighbor->maxneighs = neighbor->maxneighs;
+
+    /* extend c_neighbor arrays if necessary */
     if(nall > nmax) {
         nmax = nall;
-        if(neighbor->numneigh) cudaFreeHost(neighbor->numneigh);
-        if(neighbor->neighbors) cudaFreeHost(neighbor->neighbors);
-        checkCUDAError( "buildNeighbor numneigh", cudaMallocHost((void**)&(neighbor->numneigh), nmax * sizeof(int)) );
-        checkCUDAError( "buildNeighbor neighbors", cudaMallocHost((void**)&(neighbor->neighbors), nmax * neighbor->maxneighs * sizeof(int)) );
-        // neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
-        // neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*));
+        if(c_neighbor->numneigh) cudaFree(c_neighbor->numneigh);
+        if(c_neighbor->neighbors) cudaFree(c_neighbor->neighbors);
+        checkCUDAError( "buildNeighbor c_numneigh malloc", cudaMalloc((void**)&(c_neighbor->numneigh), nmax * sizeof(int)) );
+        checkCUDAError( "buildNeighbor c_neighbors malloc", cudaMalloc((void**)&(c_neighbor->neighbors), nmax * c_neighbor->maxneighs * sizeof(int)) );
     }
 
     /* bin local & ghost atoms */
     binatoms(atom);
 
     int resize = 1;
 
+    cudaProfilerStart();
+
+    checkCUDAError( "c_atom->x memcpy", cudaMemcpy(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
+
+    /* upload stencil */
+    int* c_stencil;
+    // TODO move this to be done once at the start
+    checkCUDAError( "buildNeighbor c_stencil malloc", cudaMalloc((void**)&c_stencil, nstencil * sizeof(int)) );
+    checkCUDAError( "buildNeighbor c_stencil memcpy", cudaMemcpy(c_stencil, stencil, nstencil * sizeof(int), cudaMemcpyHostToDevice) );
+
+    int *c_bincount;
+    checkCUDAError( "buildNeighbor c_bincount malloc", cudaMalloc((void**)&c_bincount, mbins * sizeof(int)) );
+    checkCUDAError( "buildNeighbor c_bincount memcpy", cudaMemcpy(c_bincount, bincount, mbins * sizeof(int), cudaMemcpyHostToDevice) );
+
+    int *c_bins;
+    checkCUDAError( "buildNeighbor c_bins malloc", cudaMalloc((void**)&c_bins, mbins * atoms_per_bin * sizeof(int)) );
+    checkCUDAError( "buildNeighbor c_bins memcpy", cudaMemcpy(c_bins, bins, mbins * atoms_per_bin * sizeof(int), cudaMemcpyHostToDevice) );
+
+    Neighbor_params np{
+        .xprd = xprd,
+        .yprd = yprd,
+        .zprd = zprd,
+        .bininvx = bininvx,
+        .bininvy = bininvy,
+        .bininvz = bininvz,
+        .mbinxlo = mbinxlo,
+        .mbinylo = mbinylo,
+        .mbinzlo = mbinzlo,
+        .nbinx = nbinx,
+        .nbiny = nbiny,
+        .nbinz = nbinz,
+        .mbinx = mbinx,
+        .mbiny = mbiny,
+        .mbinz = mbinz
+    };
+
+    int* c_new_maxneighs;
+    checkCUDAError( "c_new_maxneighs malloc", cudaMalloc((void**)&c_new_maxneighs, sizeof(int)) );
+
     /* loop over each atom, storing neighbors */
     while(resize) {
-        int new_maxneighs = neighbor->maxneighs;
         resize = 0;
 
-        // TODO allocate space for and then copy all necessary components
-        // TODO dont forget to copy the atom positions over
-        // TODO call compute_neigborhood kernel here
+        // seed the device-side maximum with the current capacity
+        // (cudaMemset fills bytes, so the int value has to be copied instead)
+        checkCUDAError( "c_new_maxneighs memcpy", cudaMemcpy(c_new_maxneighs, &(c_neighbor->maxneighs), sizeof(int), cudaMemcpyHostToDevice) );
 
+        const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
+        compute_neighborhood<<<num_blocks, num_threads_per_block>>>(*c_atom, *c_neighbor,
+                                                                    np, nstencil, c_stencil,
+                                                                    c_bins, atoms_per_bin, c_bincount,
+                                                                    c_new_maxneighs);
+
+        // copy the value of c_new_maxneighs back to the host and check whether the kernel raised it
+        int new_maxneighs;
+        checkCUDAError( "c_new_maxneighs memcpy back", cudaMemcpy(&new_maxneighs, c_new_maxneighs, sizeof(int), cudaMemcpyDeviceToHost) );
+        if(new_maxneighs > c_neighbor->maxneighs) {
+            resize = 1;
+        }
 
         if(resize) {
-            printf("RESIZE %d\n", neighbor->maxneighs);
-            neighbor->maxneighs = new_maxneighs * 1.2;
-            free(neighbor->neighbors);
-            neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
+            printf("RESIZE %d\n", c_neighbor->maxneighs);
+            c_neighbor->maxneighs = new_maxneighs * 1.2;
+            cudaFree(c_neighbor->neighbors);
+            checkCUDAError( "c_neighbor->neighbors resize malloc",
+                            cudaMalloc((void**)&(c_neighbor->neighbors),
+                                       c_atom->Nmax * c_neighbor->maxneighs * sizeof(int)) );
         }
-    }
+    }
+
+    neighbor->maxneighs = c_neighbor->maxneighs;
+
+    cudaProfilerStop();
+
+    cudaFree(c_new_maxneighs);
+    cudaFree(c_stencil);
+    cudaFree(c_bincount);
+    cudaFree(c_bins);
 }
\ No newline at end of file
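Two notes on the resize loop in buildNeighbor_cuda:

First, c_new_maxneighs is seeded with cudaMemcpy rather than cudaMemset because cudaMemset(ptr, v, n), like memset(), fills n bytes with (unsigned char)v and therefore cannot store an arbitrary int such as maxneighs.

Second, the kernel body is outside this hunk, but for the read-back check to work, each thread has to publish the neighbor count it actually needed, which with many concurrent threads calls for an atomic update. A device-side sketch of that pattern (an assumption about the kernel, not code from this patch):

    // Sketch: every thread reports its required neighbor count; the host
    // later compares the maximum against the current maxneighs capacity.
    __device__ void report_required_neighbors(int *new_maxneighs, int n) {
        atomicMax(new_maxneighs, n);
    }

If the reported maximum exceeds the current capacity, the host grows maxneighs by 20% (new_maxneighs * 1.2), reallocates the device array, and re-runs the kernel until no overflow is reported.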