Fixed a compiler error (missing semicolon after `while (!sorted)` in sort_bin_contents_kernel) and removed an unnecessary device-to-host memcpy of atom->x in updatePbc_cuda - performance seems to have crossed the 300M updates/second mark on the A100
This commit is contained in:
		@@ -92,7 +92,7 @@ __global__ void sort_bin_contents_kernel(int* bincount, int* bins, int mbins, in
 | 
				
			|||||||
                sorted = 0;
 | 
					                sorted = 0;
 | 
				
			||||||
            }
 | 
					            }
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
    } while (!sorted)
 | 
					    } while (!sorted);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
__global__ void binatoms_kernel(Atom a, int* bincount, int* bins, int atoms_per_bin, Neighbor_params np, int *resize_needed){
 | 
					__global__ void binatoms_kernel(Atom a, int* bincount, int* bins, int atoms_per_bin, Neighbor_params np, int *resize_needed){
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -120,7 +120,6 @@ void updatePbc_cuda(Atom *atom, Parameter *param, Atom *c_atom, bool doReneighbo
 | 
				
			|||||||
        checkCUDAError( "updatePbc c_atom->border_map memcpy", cudaMemcpy(c_atom->border_map, atom->border_map, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
 | 
					        checkCUDAError( "updatePbc c_atom->border_map memcpy", cudaMemcpy(c_atom->border_map, atom->border_map, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    int nlocal = atom->Nlocal;
 | 
					 | 
				
			||||||
    MD_FLOAT xprd = param->xprd;
 | 
					    MD_FLOAT xprd = param->xprd;
 | 
				
			||||||
    MD_FLOAT yprd = param->yprd;
 | 
					    MD_FLOAT yprd = param->yprd;
 | 
				
			||||||
    MD_FLOAT zprd = param->zprd;
 | 
					    MD_FLOAT zprd = param->zprd;
 | 
				
			||||||
@@ -133,9 +132,6 @@ void updatePbc_cuda(Atom *atom, Parameter *param, Atom *c_atom, bool doReneighbo
 | 
				
			|||||||
    computePbcUpdate<<<num_blocks, num_threads_per_block>>>(*c_atom, c_PBCx, c_PBCy, c_PBCz, xprd, yprd, zprd);
 | 
					    computePbcUpdate<<<num_blocks, num_threads_per_block>>>(*c_atom, c_PBCx, c_PBCy, c_PBCz, xprd, yprd, zprd);
 | 
				
			||||||
    checkCUDAError( "PeekAtLastError UpdatePbc", cudaPeekAtLastError() );
 | 
					    checkCUDAError( "PeekAtLastError UpdatePbc", cudaPeekAtLastError() );
 | 
				
			||||||
    checkCUDAError( "DeviceSync UpdatePbc", cudaDeviceSynchronize() );
 | 
					    checkCUDAError( "DeviceSync UpdatePbc", cudaDeviceSynchronize() );
 | 
				
			||||||
    if(doReneighbor){
 | 
					 | 
				
			||||||
    	checkCUDAError( "updatePbc atom->x memcpy back", cudaMemcpy(atom->x, c_atom->x, atom->Nmax * sizeof(MD_FLOAT) * 3, cudaMemcpyDeviceToHost) );
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/* relocate atoms that have left domain according
 | 
					/* relocate atoms that have left domain according
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user