The program now does the binning on the GPU via the binatoms_cuda method
This commit is contained in:
		| @@ -53,6 +53,6 @@ extern void setupNeighbor(); | |||||||
| extern void binatoms(Atom*); | extern void binatoms(Atom*); | ||||||
| extern void buildNeighbor(Atom*, Neighbor*); | extern void buildNeighbor(Atom*, Neighbor*); | ||||||
| extern void sortAtom(Atom*); | extern void sortAtom(Atom*); | ||||||
| extern void binatoms_cuda(Atom*, Binning*, int**, Neighbor_params*, const int); | extern void binatoms_cuda(Atom*, Binning*, int*, Neighbor_params*, const int); | ||||||
| extern void buildNeighbor_cuda(Atom*, Neighbor*, Atom*, Neighbor*, const int); | extern void buildNeighbor_cuda(Atom*, Neighbor*, Atom*, Neighbor*, const int); | ||||||
| #endif | #endif | ||||||
|   | |||||||
| @@ -539,7 +539,7 @@ void sortAtom(Atom* atom) { | |||||||
| #endif | #endif | ||||||
| } | } | ||||||
|  |  | ||||||
| void binatoms_cuda(Atom* c_atom, Binning* c_binning, int** c_resize_needed, NeighborParams *np, const int threads_per_block) | void binatoms_cuda(Atom* c_atom, Binning* c_binning, int* c_resize_needed, NeighborParams *np, const int threads_per_block) | ||||||
| { | { | ||||||
|     int nall = c_atom->Nlocal + c_atom->Nghost; |     int nall = c_atom->Nlocal + c_atom->Nghost; | ||||||
|     int resize = 1; |     int resize = 1; | ||||||
| @@ -549,21 +549,18 @@ void binatoms_cuda(Atom* c_atom, Binning* c_binning, int** c_resize_needed, Neig | |||||||
|     if(c_binning->bins == NULL){ |     if(c_binning->bins == NULL){ | ||||||
|         checkCUDAError("binatoms_cuda c_binning->bins malloc", cudaMalloc((void**)(&c_binning->bins), c_binning->mbins * c_binning->atoms_per_bin * sizeof(int)) ); |         checkCUDAError("binatoms_cuda c_binning->bins malloc", cudaMalloc((void**)(&c_binning->bins), c_binning->mbins * c_binning->atoms_per_bin * sizeof(int)) ); | ||||||
|     } |     } | ||||||
|     if(*c_resize_needed == NULL){ |  | ||||||
|         checkCUDAError("binatoms_cuda c_resize_needed malloc", cudaMalloc(&c_resize_needed, sizeof(int)) ); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     const int num_blocks = ceil((float)nall-> / (float)num_threads_per_block); |     const int num_blocks = ceil((float)nall-> / (float)num_threads_per_block); | ||||||
|  |  | ||||||
|     while(resize > 0) { |     while(resize > 0) { | ||||||
|         resize = 0; |         resize = 0; | ||||||
|         checkCUDAError("binatoms_cuda c_binning->bincount memset", cudaMemset(c_binning->bincount, 0, c_binning->mbins * sizeof(int))); |         checkCUDAError("binatoms_cuda c_binning->bincount memset", cudaMemset(c_binning->bincount, 0, c_binning->mbins * sizeof(int))); | ||||||
|         checkCUDAError("binatoms_cuda c_resize_needed memset", cudaMemset(*c_resize_needed, 0, sizeof(int)) ); |         checkCUDAError("binatoms_cuda c_resize_needed memset", cudaMemset(c_resize_needed, 0, sizeof(int)) ); | ||||||
|  |  | ||||||
|         /*binatoms_kernel(Atom a, int* bincount, int* bins, int c_binning->atoms_per_bin, Neighbor_params np, int *resize_needed) */ |         /*binatoms_kernel(Atom a, int* bincount, int* bins, int c_binning->atoms_per_bin, Neighbor_params np, int *resize_needed) */ | ||||||
|         binatoms_kernel<<<num_blocks, threads_per_block>>>(*c_atom, c_binning->bincount, c_binning->bins, c_binning->atoms_per_bin, *np, *c_resize_needed); |         binatoms_kernel<<<num_blocks, threads_per_block>>>(*c_atom, c_binning->bincount, c_binning->bins, c_binning->atoms_per_bin, *np, c_resize_needed); | ||||||
|  |  | ||||||
|         checkCUDAError("binatoms_cuda c_resize_needed memcpy back", cudaMemcpy(&resize, *c_resize_needed, sizeof(int), cudaMemcpyDeviceToHost) ); |         checkCUDAError("binatoms_cuda c_resize_needed memcpy back", cudaMemcpy(&resize, c_resize_needed, sizeof(int), cudaMemcpyDeviceToHost) ); | ||||||
|  |  | ||||||
|         if(resize) { |         if(resize) { | ||||||
|             cudaFree(c_binning->bins); |             cudaFree(c_binning->bins); | ||||||
| @@ -576,39 +573,20 @@ void binatoms_cuda(Atom* c_atom, Binning* c_binning, int** c_resize_needed, Neig | |||||||
| void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor, const int num_threads_per_block) | void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor, const int num_threads_per_block) | ||||||
| { | { | ||||||
|     int nall = atom->Nlocal + atom->Nghost; |     int nall = atom->Nlocal + atom->Nghost; | ||||||
|  |  | ||||||
|     c_neighbor->maxneighs = neighbor->maxneighs; |     c_neighbor->maxneighs = neighbor->maxneighs; | ||||||
|  |  | ||||||
|     /* extend c_neighbor arrays if necessary */ |  | ||||||
|     if(nall > nmax) { |  | ||||||
|         nmax = nall; |  | ||||||
|         if(c_neighbor->numneigh) cudaFree(c_neighbor->numneigh); |  | ||||||
|         if(c_neighbor->neighbors) cudaFree(c_neighbor->neighbors); |  | ||||||
|         checkCUDAError( "buildNeighbor c_numneigh malloc", cudaMalloc((void**)&(c_neighbor->numneigh), nmax * sizeof(int)) ); |  | ||||||
|         checkCUDAError( "buildNeighbor c_neighbors malloc", cudaMalloc((void**)&(c_neighbor->neighbors), nmax * c_neighbor->maxneighs * sizeof(int)) ); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     checkCUDAError( "buildNeighbor c_atom->x memcpy back", cudaMemcpy(atom->x, c_atom->x, sizeof(MD_FLOAT) * 3 * c_atom->Nmax, cudaMemcpyDeviceToHost) ); |  | ||||||
|  |  | ||||||
|     /* bin local & ghost atoms */ |  | ||||||
|     binatoms(atom); |  | ||||||
|     int resize = 1; |  | ||||||
|  |  | ||||||
|     cudaProfilerStart(); |     cudaProfilerStart(); | ||||||
|  |  | ||||||
|     /* upload stencil */ |     /* upload stencil */ | ||||||
|     int* c_stencil; |     int* c_stencil; | ||||||
|     // TODO move this to be done once at the start |     // TODO move this to be done once at the start | ||||||
|     checkCUDAError( "buildNeighbor c_n_stencil malloc", cudaMalloc((void**)&c_stencil, nstencil * sizeof(int)) ); |     checkCUDAError( "buildNeighbor c_n_stencil malloc", cudaMalloc((void**)&c_stencil, nstencil * sizeof(int)) ); | ||||||
|     checkCUDAError( "buildNeighbor c_n_stencil memcpy", cudaMemcpy(c_stencil, stencil, nstencil * sizeof(int), cudaMemcpyHostToDevice )); |     checkCUDAError( "buildNeighbor c_n_stencil memcpy", cudaMemcpy(c_stencil, stencil, nstencil * sizeof(int), cudaMemcpyHostToDevice )); | ||||||
|  |  | ||||||
|     int *c_bincount; |     Binning c_binning; | ||||||
|     checkCUDAError( "buildNeighbor c_bincount malloc", cudaMalloc((void**)&c_bincount, mbins * sizeof(int)) ); |     c_binning->mbins = mbins; | ||||||
|     checkCUDAError( "buildNeighbor c_bincount memcpy", cudaMemcpy(c_bincount, bincount, mbins * sizeof(int), cudaMemcpyHostToDevice) ); |     c_binning->atoms_per_bin = atoms_per_bin; | ||||||
|  |     checkCUDAError( "buildNeighbor c_binning->bincount malloc", cudaMalloc((void**)&(c_binning->bincount), mbins * sizeof(int)) ); | ||||||
|     int *c_bins; |     checkCUDAError( "buidlNeighbor c_binning->bins malloc", cudaMalloc((void**)&(c_binning->bins), c_binning->mbins * c_binning->atoms_per_bin * sizeof(int)) ); | ||||||
|     checkCUDAError( "buidlNeighbor c_bins malloc", cudaMalloc((void**)&c_bins, mbins * atoms_per_bin * sizeof(int)) ); |  | ||||||
|     checkCUDAError( "buildNeighbor c_bins memcpy", cudaMemcpy(c_bins, bins, mbins * atoms_per_bin * sizeof(int), cudaMemcpyHostToDevice ) ); |  | ||||||
|  |  | ||||||
|     Neighbor_params np{ |     Neighbor_params np{ | ||||||
|             .xprd = xprd, |             .xprd = xprd, | ||||||
| @@ -628,9 +606,26 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor * | |||||||
|             .mbinz = mbinz |             .mbinz = mbinz | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|  |     int* c_resize_needed; | ||||||
|  |     checkCUDAError("buildNeighbor c_resize_needed malloc", cudaMalloc((void**)&c_resize_needed, sizeof(int)) ); | ||||||
|  |      | ||||||
|  |     /* bin local & ghost atoms */ | ||||||
|  |     binatoms_cuda(c_atom, &c_binning, c_resize_needed, &np, num_threads_per_block); | ||||||
|  |  | ||||||
|     int* c_new_maxneighs; |     int* c_new_maxneighs; | ||||||
|     checkCUDAError("c_new_maxneighs malloc", cudaMalloc((void**)&c_new_maxneighs, sizeof(int) )); |     checkCUDAError("c_new_maxneighs malloc", cudaMalloc((void**)&c_new_maxneighs, sizeof(int) )); | ||||||
|  |  | ||||||
|  |     int resize = 1; | ||||||
|  |      | ||||||
|  |     /* extend c_neighbor arrays if necessary */ | ||||||
|  |     if(nall > nmax) { | ||||||
|  |         nmax = nall; | ||||||
|  |         if(c_neighbor->numneigh) cudaFree(c_neighbor->numneigh); | ||||||
|  |         if(c_neighbor->neighbors) cudaFree(c_neighbor->neighbors); | ||||||
|  |         checkCUDAError( "buildNeighbor c_numneigh malloc", cudaMalloc((void**)&(c_neighbor->numneigh), nmax * sizeof(int)) ); | ||||||
|  |         checkCUDAError( "buildNeighbor c_neighbors malloc", cudaMalloc((void**)&(c_neighbor->neighbors), nmax * c_neighbor->maxneighs * sizeof(int)) ); | ||||||
|  |     } | ||||||
|  |  | ||||||
|     /* loop over each atom, storing neighbors */ |     /* loop over each atom, storing neighbors */ | ||||||
|     while(resize) { |     while(resize) { | ||||||
|         resize = 0; |         resize = 0; | ||||||
| @@ -644,7 +639,7 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor * | |||||||
|          * */ |          * */ | ||||||
|         compute_neighborhood<<<num_blocks, num_threads_per_block>>>(*c_atom, *c_neighbor, |         compute_neighborhood<<<num_blocks, num_threads_per_block>>>(*c_atom, *c_neighbor, | ||||||
|                                                                     np, nstencil, c_stencil, |                                                                     np, nstencil, c_stencil, | ||||||
|                                                                     c_bins, atoms_per_bin, c_bincount, |                                                                     c_binning->bins, atoms_per_bin, c_binning->bincount, | ||||||
|                                                                     c_new_maxneighs, |                                                                     c_new_maxneighs, | ||||||
| 								                                    cutneighsq); | 								                                    cutneighsq); | ||||||
|  |  | ||||||
| @@ -675,7 +670,7 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor * | |||||||
|  |  | ||||||
|     cudaFree(c_new_maxneighs); |     cudaFree(c_new_maxneighs); | ||||||
|     cudaFree(c_stencil); |     cudaFree(c_stencil); | ||||||
|     cudaFree(c_bincount); |     cudaFree(c_binning->bincount); | ||||||
|     cudaFree(c_bins); |     cudaFree(c_binning->bins); | ||||||
| } | } | ||||||
| } | } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user