First crude attempt at parallelizing neighborhood computation (only the part after binning the atoms is parallelized with cuda)
This commit is contained in:
		
							
								
								
									
										35
									
								
								src/force.cu
									
									
									
									
									
								
							
							
						
						
									
										35
									
								
								src/force.cu
									
									
									
									
									
								
							| @@ -126,25 +126,9 @@ extern "C" { | |||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| int get_num_threads() { | void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom, const int num_threads_per_block) { | ||||||
|  |  | ||||||
|     const char *num_threads_env = getenv("NUM_THREADS"); |  | ||||||
|     int num_threads = 0; |  | ||||||
|     if(num_threads_env == nullptr) |  | ||||||
|         num_threads = 32; |  | ||||||
|     else { |  | ||||||
|         num_threads = atoi(num_threads_env); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     return num_threads; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom) { |  | ||||||
|  |  | ||||||
|     const int Nlocal = atom->Nlocal; |     const int Nlocal = atom->Nlocal; | ||||||
|     const int num_threads = get_num_threads(); |  | ||||||
|  |  | ||||||
|     const int num_threads_per_block = num_threads; // this should be multiple of 32 as operations are performed at the level of warps |  | ||||||
|     const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block); |     const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block); | ||||||
|  |  | ||||||
|     kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, *c_atom); |     kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, *c_atom); | ||||||
| @@ -157,12 +141,9 @@ void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom) { | void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom, const int num_threads_per_block) { | ||||||
|  |  | ||||||
|     const int Nlocal = atom->Nlocal; |     const int Nlocal = atom->Nlocal; | ||||||
|     const int num_threads = get_num_threads(); |  | ||||||
|  |  | ||||||
|     const int num_threads_per_block = num_threads; // this should be multiple of 32 as operations are performed at the level of warps |  | ||||||
|     const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block); |     const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block); | ||||||
|  |  | ||||||
|     kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, *c_atom); |     kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, *c_atom); | ||||||
| @@ -182,7 +163,8 @@ double computeForce( | |||||||
|         Atom *atom, |         Atom *atom, | ||||||
|         Neighbor *neighbor, |         Neighbor *neighbor, | ||||||
|         Atom *c_atom, |         Atom *c_atom, | ||||||
|         Neighbor *c_neighbor |         Neighbor *c_neighbor, | ||||||
|  |         int num_threads_per_block | ||||||
|         ) |         ) | ||||||
| { | { | ||||||
|     int Nlocal = atom->Nlocal; |     int Nlocal = atom->Nlocal; | ||||||
| @@ -192,8 +174,6 @@ double computeForce( | |||||||
|     MD_FLOAT epsilon = param->epsilon; |     MD_FLOAT epsilon = param->epsilon; | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|     const int num_threads = get_num_threads(); |  | ||||||
|  |  | ||||||
|     c_atom->Natoms = atom->Natoms; |     c_atom->Natoms = atom->Natoms; | ||||||
|     c_atom->Nlocal = atom->Nlocal; |     c_atom->Nlocal = atom->Nlocal; | ||||||
|     c_atom->Nghost = atom->Nghost; |     c_atom->Nghost = atom->Nghost; | ||||||
| @@ -219,14 +199,11 @@ double computeForce( | |||||||
|  |  | ||||||
|     cudaProfilerStart(); |     cudaProfilerStart(); | ||||||
|  |  | ||||||
|     checkCUDAError( "c_atom->x memcpy", cudaMemcpy(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) ); |  | ||||||
|  |  | ||||||
|     if(reneighbourHappenend) { |     if(!reneighbourHappenend) { | ||||||
|         checkCUDAError( "c_neighbor->numneigh memcpy", cudaMemcpy(c_neighbor->numneigh, neighbor->numneigh, sizeof(int) * Nlocal, cudaMemcpyHostToDevice) ); |         checkCUDAError( "c_atom.x memcpy", cudaMemcpy(c_atom.x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) ); | ||||||
|         checkCUDAError( "c_neighbor->neighbors memcpy", cudaMemcpy(c_neighbor->neighbors, neighbor->neighbors, sizeof(int) * Nlocal * neighbor->maxneighs, cudaMemcpyHostToDevice) ); |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     const int num_threads_per_block = num_threads; // this should be multiple of 32 as operations are performed at the level of warps |  | ||||||
|     const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block); |     const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block); | ||||||
|  |  | ||||||
|     double S = getTimeStamp(); |     double S = getTimeStamp(); | ||||||
|   | |||||||
| @@ -46,4 +46,5 @@ extern void setupNeighbor(); | |||||||
| extern void binatoms(Atom*); | extern void binatoms(Atom*); | ||||||
| extern void buildNeighbor(Atom*, Neighbor*); | extern void buildNeighbor(Atom*, Neighbor*); | ||||||
| extern void sortAtom(Atom*); | extern void sortAtom(Atom*); | ||||||
|  | extern void buildNeighbor_cuda(Atom*, Neighbor*, Atom*, Neighbor*, const int); | ||||||
| #endif | #endif | ||||||
|   | |||||||
							
								
								
									
										50
									
								
								src/main.c
									
									
									
									
									
								
							
							
						
						
									
										50
									
								
								src/main.c
									
									
									
									
									
								
							| @@ -45,10 +45,14 @@ | |||||||
|  |  | ||||||
| #define HLINE "----------------------------------------------------------------------------\n" | #define HLINE "----------------------------------------------------------------------------\n" | ||||||
|  |  | ||||||
| extern void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom); | extern void cuda_final_integrate(bool doReneighbour, Parameter *param, | ||||||
| extern void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom); |                                  Atom *atom, Atom *c_atom, | ||||||
|  |                                  const int num_threads_per_block); | ||||||
|  | extern void cuda_initial_integrate(bool doReneighbour, Parameter *param, | ||||||
|  |                                    Atom *atom, Atom *c_atom, | ||||||
|  |                                    const int num_threads_per_block); | ||||||
|  |  | ||||||
| extern double computeForce(bool, Parameter*, Atom*, Neighbor*, Atom*, Neighbor*); | extern double computeForce(bool, Parameter*, Atom*, Neighbor*, Atom*, Neighbor*, const int); | ||||||
| extern double computeForceTracing(Parameter*, Atom*, Neighbor*, Stats*, int, int); | extern double computeForceTracing(Parameter*, Atom*, Neighbor*, Stats*, int, int); | ||||||
| extern double computeForceEam(Eam* eam, Parameter*, Atom *atom, Neighbor *neighbor, Stats *stats, int first_exec, int timestep); | extern double computeForceEam(Eam* eam, Parameter*, Atom *atom, Neighbor *neighbor, Stats *stats, int first_exec, int timestep); | ||||||
|  |  | ||||||
| @@ -111,7 +115,8 @@ double setup( | |||||||
|         Neighbor *neighbor, |         Neighbor *neighbor, | ||||||
|         Atom *c_atom, |         Atom *c_atom, | ||||||
|         Neighbor *c_neighbor, |         Neighbor *c_neighbor, | ||||||
|         Stats *stats) |         Stats *stats, | ||||||
|  |         const int num_threads_per_block) | ||||||
| { | { | ||||||
|     if(param->force_field == FF_EAM) { initEam(eam, param); } |     if(param->force_field == FF_EAM) { initEam(eam, param); } | ||||||
|     double S, E; |     double S, E; | ||||||
| @@ -131,7 +136,7 @@ double setup( | |||||||
|     adjustThermo(param, atom); |     adjustThermo(param, atom); | ||||||
|     setupPbc(atom, param); |     setupPbc(atom, param); | ||||||
|     updatePbc(atom, param); |     updatePbc(atom, param); | ||||||
|     buildNeighbor(atom, neighbor); |     buildNeighbor_cuda(atom, neighbor, c_atom, c_neighbor, num_threads_per_block); | ||||||
|     E = getTimeStamp(); |     E = getTimeStamp(); | ||||||
|  |  | ||||||
|     initCudaAtom(atom, neighbor, c_atom, c_neighbor); |     initCudaAtom(atom, neighbor, c_atom, c_neighbor); | ||||||
| @@ -142,7 +147,10 @@ double setup( | |||||||
| double reneighbour( | double reneighbour( | ||||||
|         Parameter *param, |         Parameter *param, | ||||||
|         Atom *atom, |         Atom *atom, | ||||||
|         Neighbor *neighbor) |         Neighbor *neighbor, | ||||||
|  |         Atom *c_atom, | ||||||
|  |         Neighbor *c_neighbor, | ||||||
|  |         const int num_threads_per_block) | ||||||
| { | { | ||||||
|     double S, E; |     double S, E; | ||||||
|  |  | ||||||
| @@ -152,7 +160,7 @@ double reneighbour( | |||||||
|     setupPbc(atom, param); |     setupPbc(atom, param); | ||||||
|     updatePbc(atom, param); |     updatePbc(atom, param); | ||||||
|     //sortAtom(atom); |     //sortAtom(atom); | ||||||
|     buildNeighbor(atom, neighbor); |     buildNeighbor(atom, neighbor, c_atom, c_neighbor, num_threads_per_block); | ||||||
|     LIKWID_MARKER_STOP("reneighbour"); |     LIKWID_MARKER_STOP("reneighbour"); | ||||||
|     E = getTimeStamp(); |     E = getTimeStamp(); | ||||||
|  |  | ||||||
| @@ -206,6 +214,19 @@ const char* ff2str(int ff) | |||||||
|     return "invalid"; |     return "invalid"; | ||||||
| } | } | ||||||
|  |  | ||||||
|  | int get_num_threads() { | ||||||
|  |  | ||||||
|  |     const char *num_threads_env = getenv("NUM_THREADS"); | ||||||
|  |     int num_threads = 0; | ||||||
|  |     if(num_threads_env == nullptr) | ||||||
|  |         num_threads = 32; | ||||||
|  |     else { | ||||||
|  |         num_threads = atoi(num_threads_env); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     return num_threads; | ||||||
|  | } | ||||||
|  |  | ||||||
| int main(int argc, char** argv) | int main(int argc, char** argv) | ||||||
| { | { | ||||||
|     double timer[NUMTIMER]; |     double timer[NUMTIMER]; | ||||||
| @@ -286,7 +307,10 @@ int main(int argc, char** argv) | |||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     setup(¶m, &eam, &atom, &neighbor, &c_atom, &c_neighbor, &stats); |     // this should be multiple of 32 as operations are performed at the level of warps | ||||||
|  |     const int num_threads_per_block = get_num_threads(); | ||||||
|  |  | ||||||
|  |     setup(¶m, &eam, &atom, &neighbor, &c_atom, &c_neighbor, &stats, num_threads_per_block); | ||||||
|     computeThermo(0, ¶m, &atom); |     computeThermo(0, ¶m, &atom); | ||||||
|     if(param.force_field == FF_EAM) { |     if(param.force_field == FF_EAM) { | ||||||
|         computeForceEam(&eam, ¶m, &atom, &neighbor, &stats, 1, 0); |         computeForceEam(&eam, ¶m, &atom, &neighbor, &stats, 1, 0); | ||||||
| @@ -294,7 +318,7 @@ int main(int argc, char** argv) | |||||||
| #if defined(MEM_TRACER) || defined(INDEX_TRACER) || defined(COMPUTE_STATS) | #if defined(MEM_TRACER) || defined(INDEX_TRACER) || defined(COMPUTE_STATS) | ||||||
|         computeForceTracing(¶m, &atom, &neighbor, &stats, 1, 0); |         computeForceTracing(¶m, &atom, &neighbor, &stats, 1, 0); | ||||||
| #else | #else | ||||||
|         computeForce(true, ¶m, &atom, &neighbor, &c_atom, &c_neighbor); |         computeForce(true, ¶m, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block); | ||||||
| #endif | #endif | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -310,10 +334,10 @@ int main(int argc, char** argv) | |||||||
|  |  | ||||||
|         const bool doReneighbour = (n + 1) % param.every == 0; |         const bool doReneighbour = (n + 1) % param.every == 0; | ||||||
|  |  | ||||||
|         cuda_initial_integrate(doReneighbour, ¶m, &atom, &c_atom); |         cuda_initial_integrate(doReneighbour, ¶m, &atom, &c_atom, num_threads_per_block); | ||||||
|  |  | ||||||
|         if(doReneighbour) { |         if(doReneighbour) { | ||||||
|             timer[NEIGH] += reneighbour(¶m, &atom, &neighbor); |             timer[NEIGH] += reneighbour(¶m, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block); | ||||||
|         } else { |         } else { | ||||||
|             updatePbc(&atom, ¶m); |             updatePbc(&atom, ¶m); | ||||||
|         } |         } | ||||||
| @@ -324,11 +348,11 @@ int main(int argc, char** argv) | |||||||
| #if defined(MEM_TRACER) || defined(INDEX_TRACER) || defined(COMPUTE_STATS) | #if defined(MEM_TRACER) || defined(INDEX_TRACER) || defined(COMPUTE_STATS) | ||||||
|             timer[FORCE] += computeForceTracing(¶m, &atom, &neighbor, &stats, 0, n + 1); |             timer[FORCE] += computeForceTracing(¶m, &atom, &neighbor, &stats, 0, n + 1); | ||||||
| #else | #else | ||||||
|             timer[FORCE] += computeForce(doReneighbour, ¶m, &atom, &neighbor, &c_atom, &c_neighbor); |             timer[FORCE] += computeForce(doReneighbour, ¶m, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block); | ||||||
| #endif | #endif | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         cuda_final_integrate(doReneighbour, ¶m, &atom, &c_atom); |         cuda_final_integrate(doReneighbour, ¶m, &atom, &c_atom, num_threads_per_block); | ||||||
|  |  | ||||||
|         if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) { |         if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) { | ||||||
|             computeThermo(n + 1, ¶m, &atom); |             computeThermo(n + 1, ¶m, &atom); | ||||||
|   | |||||||
							
								
								
									
										106
									
								
								src/neighbor.cu
									
									
									
									
									
								
							
							
						
						
									
										106
									
								
								src/neighbor.cu
									
									
									
									
									
								
							| @@ -67,9 +67,10 @@ __device__ int coord2bin_device(MD_FLOAT xin, MD_FLOAT yin, MD_FLOAT zin, | |||||||
|     return (iz * np.mbiny * np.mbinx + iy * np.mbinx + ix + 1); |     return (iz * np.mbiny * np.mbinx + iy * np.mbinx + ix + 1); | ||||||
| } | } | ||||||
|  |  | ||||||
| __global__ void compute_neighborhood(Atom a, Neighbor neigh, int Nlocal, Neighbor_params np, int nstencil, int* stencil, | __global__ void compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np, int nstencil, int* stencil, | ||||||
|                                      int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs){ |                                      int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs){ | ||||||
|     const int i = blockIdx.x * blockDim.x + threadIdx.x; |     const int i = blockIdx.x * blockDim.x + threadIdx.x; | ||||||
|  |     const int Nlocal = a.Nlocal; | ||||||
|     if( i >= Nlocal ) { |     if( i >= Nlocal ) { | ||||||
|         return; |         return; | ||||||
|     } |     } | ||||||
| @@ -513,41 +514,110 @@ void sortAtom(Atom* atom) { | |||||||
| #endif | #endif | ||||||
| } | } | ||||||
|  |  | ||||||
| void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) | void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor, const int num_threads_per_block) | ||||||
| { | { | ||||||
|     int nall = atom->Nlocal + atom->Nghost; |     int nall = atom->Nlocal + atom->Nghost; | ||||||
|  |  | ||||||
|     /* extend atom arrays if necessary */ |     c_atom->Natoms = atom->Natoms; | ||||||
|  |     c_atom->Nlocal = atom->Nlocal; | ||||||
|  |     c_atom->Nghost = atom->Nghost; | ||||||
|  |     c_atom->Nmax = atom->Nmax; | ||||||
|  |     c_atom->ntypes = atom->ntypes; | ||||||
|  |  | ||||||
|  |     c_neighbor->maxneighs = neighbor->maxneighs; | ||||||
|  |  | ||||||
|  |     /* extend c_neighbor arrays if necessary */ | ||||||
|     if(nall > nmax) { |     if(nall > nmax) { | ||||||
|         nmax = nall; |         nmax = nall; | ||||||
|         if(neighbor->numneigh) cudaFreeHost(neighbor->numneigh); |         if(c_neighbor->numneigh) cudaFree(c_neighbor->numneigh); | ||||||
|         if(neighbor->neighbors) cudaFreeHost(neighbor->neighbors); |         if(c_neighbor->neighbors) cudaFree(c_neighbor->neighbors); | ||||||
|         checkCUDAError( "buildNeighbor numneigh", cudaMallocHost((void**)&(neighbor->numneigh), nmax * sizeof(int)) ); |         checkCUDAError( "buildNeighbor c_numneigh malloc", cudaMalloc((void**)&(c_neighbor->numneigh), nmax * sizeof(int)) ); | ||||||
|         checkCUDAError( "buildNeighbor neighbors", cudaMallocHost((void**)&(neighbor->neighbors), nmax * neighbor->maxneighs * sizeof(int)) ); |         checkCUDAError( "buildNeighbor c_neighbors malloc", cudaMalloc((void**)&(c_neighbor->neighbors), nmax * c_neighbor->maxneighs * sizeof(int)) ); | ||||||
|         // neighbor->numneigh = (int*) malloc(nmax * sizeof(int)); |  | ||||||
|         // neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*)); |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /* bin local & ghost atoms */ |     /* bin local & ghost atoms */ | ||||||
|     binatoms(atom); |     binatoms(atom); | ||||||
|     int resize = 1; |     int resize = 1; | ||||||
|  |  | ||||||
|  |     cudaProfilerStart(); | ||||||
|  |  | ||||||
|  |     checkCUDAError( "c_atom->x memcpy", cudaMemcpy(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) ); | ||||||
|  |  | ||||||
|  |     /* upload stencil */ | ||||||
|  |     int* c_stencil; | ||||||
|  |     // TODO move this to be done once at the start | ||||||
|  |     checkCUDAError( "buildNeighbor c_n_stencil malloc", cudaMalloc((void**)&c_stencil, nstencil * sizeof(int)) ); | ||||||
|  |     checkCUDAError( "buildNeighbor c_n_stencil memcpy", cudaMemcpy(c_stencil, stencil, nstencil * sizeof(int), cudaMemcpyHostToDevice )); | ||||||
|  |  | ||||||
|  |     int *c_bincount; | ||||||
|  |     checkCUDAError( "buildNeighbor c_bincount malloc", cudaMalloc((void**)&c_bincount, mbins * sizeof(int)) ); | ||||||
|  |     checkCUDAError( "buildNeighbor c_bincount memcpy", cudaMemcpy(c_bincount, bincount, mbins * sizeof(int), cudaMemcpyHostToDevice) ); | ||||||
|  |  | ||||||
|  |     int *c_bins; | ||||||
|  |     checkCUDAError( "buidlNeighbor c_bins malloc", cudaMalloc((void**)&c_bins, mbins * atoms_per_bin * sizeof(int)) ); | ||||||
|  |     checkCUDAError( "buildNeighbor c_bins memcpy", cudaMemcpy(c_bins, bins, mbins * atoms_per_bin * sizeof(int), cudaMemcpyHostToDevice ) ); | ||||||
|  |  | ||||||
|  |     Neighbor_params np{ | ||||||
|  |         .xprd = xprd, | ||||||
|  |         .yprd = yprd, | ||||||
|  |         .zprd = zprd, | ||||||
|  |         .bininvx = bininvx, | ||||||
|  |         .bininvy = bininvy, | ||||||
|  |         .bininvz = bininvz, | ||||||
|  |         .mbinxlo = mbinxlo, | ||||||
|  |         .mbinylo = mbinylo, | ||||||
|  |         .mbinzlo = mbinzlo, | ||||||
|  |         .nbinx = nbinx, | ||||||
|  |         .nbiny = nbiny, | ||||||
|  |         .nbinz = nbinz, | ||||||
|  |         .mbinx = mbinx, | ||||||
|  |         .mbiny = mbiny, | ||||||
|  |         .mbinz = mbinz | ||||||
|  |     }; | ||||||
|  |  | ||||||
|  |     int* c_new_maxneighs; | ||||||
|  |     checkCUDAError("c_new_maxneighs malloc", cudaMalloc((void**)&c_new_maxneighs, sizeof(int) )); | ||||||
|  |  | ||||||
|     /* loop over each atom, storing neighbors */ |     /* loop over each atom, storing neighbors */ | ||||||
|     while(resize) { |     while(resize) { | ||||||
|         int new_maxneighs = neighbor->maxneighs; |  | ||||||
|         resize = 0; |         resize = 0; | ||||||
|  |  | ||||||
|         // TODO allocate space for and then copy all necessary components |         checkCUDAError("c_new_maxneighs memset", cudaMemset(c_new_maxneighs, c_neighbor->maxneighs, sizeof(int) )); | ||||||
|         // TODO dont forget to copy the atom positions over |  | ||||||
|  |  | ||||||
|         // TODO call compute_neigborhood kernel here |         // TODO call compute_neigborhood kernel here | ||||||
|  |         const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block); | ||||||
|  |         /*compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np, int nstencil, int* stencil, | ||||||
|  |                                      int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs) | ||||||
|  |          * */ | ||||||
|  |         compute_neighborhood<<<num_blocks, num_threads_per_block>>>(*c_Atom, *c_neighbor, | ||||||
|  |                                                                     np, nstencil, c_stencil, | ||||||
|  |                                                                     c_bins, atoms_per_bin, c_bincount, | ||||||
|  |                                                                     c_new_maxneighs); | ||||||
|  |  | ||||||
|  |         // TODO copy the value of c_new_maxneighs back to host and check if it has been modified | ||||||
|  |         int new_maxneighs; | ||||||
|  |         checkCUDAError("c_new_maxneighs memcpy back", cudaMemcpy(&new_maxneighs, c_new_maxneighs, sizeof(int), cudaMemcpyDeviceToHost)); | ||||||
|  |         if (new_maxneighs > c_neighbor->maxneighs){ | ||||||
|  |             resize = 1; | ||||||
|  |         } | ||||||
|  |  | ||||||
|         if(resize) { |         if(resize) { | ||||||
|             printf("RESIZE %d\n", neighbor->maxneighs); |             printf("RESIZE %d\n", c_neighbor->maxneighs); | ||||||
|             neighbor->maxneighs = new_maxneighs * 1.2; |             c_neighbor->maxneighs = new_maxneighs * 1.2; | ||||||
|             free(neighbor->neighbors); |             cudaFree(c_neighbor->neighbors); | ||||||
|             neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int)); |             checkCUDAError("c_neighbor->neighbors resize malloc", | ||||||
|         } |                            cudaMalloc((void**)(&c_neighbor->neighbors), | ||||||
|     } |                                       c_atom->Nmax * c_neighbor->maxneighs * sizeof(int))); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |     } | ||||||
|  |     neighbor->maxneighs = c_neighbor->maxneighs; | ||||||
|  |  | ||||||
|  |     cudaProfilerStop(); | ||||||
|  |  | ||||||
|  |     cudaFree(c_new_maxneighs); | ||||||
|  |     cudaFree(c_n_stencil); | ||||||
|  |     cudaFree(c_bincount); | ||||||
|  |     cudaFree(c_bins); | ||||||
| } | } | ||||||
| } | } | ||||||
		Reference in New Issue
	
	Block a user