From 065b596074fa1f8f01e3b637f4270e02fbce7e91 Mon Sep 17 00:00:00 2001
From: Rafael Ravedutti
Date: Fri, 12 Aug 2022 04:19:38 +0200
Subject: [PATCH] Initial refactoring of CUDA code

Signed-off-by: Rafael Ravedutti
---
 config.mk                   |  4 +++
 lammps/allocate.c           | 49 +++++++++++++++++++-------
 lammps/atom.c               |  8 -----
 lammps/cuda/atom.cu         | 43 +++++++++--------------
 lammps/cuda/force.cu        | 33 +++++++-----------
 lammps/cuda/neighbor.cu     | 69 +++++++++++++++----------------------
 lammps/cuda/pbc.cu          | 55 ++++++++++++-----------------
 lammps/includes/cuda_atom.h |  7 +++-
 lammps/main.c               |  3 +-
 9 files changed, 127 insertions(+), 144 deletions(-)

diff --git a/config.mk b/config.mk
index a2da5f6..696f1e5 100644
--- a/config.mk
+++ b/config.mk
@@ -38,6 +38,10 @@ XTC_OUTPUT ?= false
 # Check if cj is local when decreasing reaction force
 HALF_NEIGHBOR_LISTS_CHECK_CJ ?= false

+# Configurations for CUDA
+# Use CUDA host memory to optimize transfers
+USE_CUDA_HOST_MEMORY ?= false
+
 #Feature options
 OPTIONS = -DALIGNMENT=64
 #OPTIONS += More options

diff --git a/lammps/allocate.c b/lammps/allocate.c
index c095049..a690deb 100644
--- a/lammps/allocate.c
+++ b/lammps/allocate.c
@@ -49,36 +49,61 @@ void *allocate(int alignment, size_t bytesize) {
     return ptr;
 }

-void *reallocate(void* ptr, int alignment, size_t newBytesize, size_t oldBytesize) {
-    void *newarray = allocate(alignment, newBytesize);
-
+void *reallocate(void* ptr, int alignment, size_t new_bytesize, size_t old_bytesize) {
+    void *newarray = allocate(alignment, new_bytesize);
     if(ptr != NULL) {
-        memcpy(newarray, ptr, oldBytesize);
+        memcpy(newarray, ptr, old_bytesize);
         free(ptr);
     }

     return newarray;
 }
+
 #ifndef CUDA_TARGET
-void *allocate_gpu(int alignment, size_t bytesize) { return NULL; }
-void *reallocate_gpu(void *ptr, int alignment, size_t newBytesize, size_t oldBytesize) { return NULL; }
+void *allocateGPU(size_t bytesize) { return NULL; }
+void *reallocateGPU(void *ptr, size_t new_bytesize) { return NULL; }
+void memcpyToGPU(void *d_ptr, void *h_ptr, size_t bytesize) {}
+void memcpyFromGPU(void *h_ptr, void *d_ptr, size_t bytesize) {}
+void memsetGPU(void *d_ptr, int value, size_t bytesize) {}
 #else
 #include
 #include
-void *allocate_gpu(int alignment, size_t bytesize) {
+void *allocateGPU(size_t bytesize) {
     void *ptr;
-    checkCUDAError("allocate", cudaMallocHost((void **) &ptr, bytesize));
+    #ifdef CUDA_HOST_MEMORY
+    cuda_assert("allocateGPU", cudaMallocHost((void **) &ptr, bytesize));
+    #else
+    cuda_assert("allocateGPU", cudaMalloc((void **) &ptr, bytesize));
+    #endif
     return ptr;
 }

 // Data is not preserved
-void *reallocate_gpu(void *ptr, int alignment, size_t newBytesize, size_t oldBytesize) {
-    void *newarray = allocate_gpu(alignment, newBytesize);
-
+void *reallocateGPU(void *ptr, size_t new_bytesize) {
     if(ptr != NULL) {
+        #ifdef CUDA_HOST_MEMORY
         cudaFreeHost(ptr);
+        #else
+        cudaFree(ptr);
+        #endif
     }

-    return newarray;
+    return allocateGPU(new_bytesize);
+}
+
+void memcpyToGPU(void *d_ptr, void *h_ptr, size_t bytesize) {
+    #ifndef CUDA_HOST_MEMORY
+    cuda_assert("memcpyToGPU", cudaMemcpy(d_ptr, h_ptr, bytesize, cudaMemcpyHostToDevice));
+    #endif
+}
+
+void memcpyFromGPU(void *h_ptr, void *d_ptr, size_t bytesize) {
+    #ifndef CUDA_HOST_MEMORY
+    cuda_assert("memcpyFromGPU", cudaMemcpy(h_ptr, d_ptr, bytesize, cudaMemcpyDeviceToHost));
+    #endif
+}
+
+void memsetGPU(void *d_ptr, int value, size_t bytesize) {
+    cuda_assert("memsetGPU", cudaMemset(d_ptr, value, bytesize));
 }
 #endif
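/*
 * Illustrative sketch, not part of the patch: how the new memory wrappers are
 * meant to compose. The names h_buf, d_buf and n are hypothetical. With
 * CUDA_HOST_MEMORY defined (presumably derived from USE_CUDA_HOST_MEMORY in
 * config.mk by the build system), allocateGPU() hands out pinned host memory via
 * cudaMallocHost() and the two copy wrappers compile to no-ops; otherwise they
 * are plain cudaMemcpy calls. reallocateGPU() never preserves the old contents,
 * so callers must re-upload after growing a buffer.
 */
MD_FLOAT *d_buf = (MD_FLOAT *) allocateGPU(n * sizeof(MD_FLOAT));     // device (or pinned host) buffer
memcpyToGPU(d_buf, h_buf, n * sizeof(MD_FLOAT));                      // upload host data
/* ... kernels operate on d_buf ... */
d_buf = (MD_FLOAT *) reallocateGPU(d_buf, 2 * n * sizeof(MD_FLOAT));  // grow: old contents are dropped
memcpyToGPU(d_buf, h_buf, n * sizeof(MD_FLOAT));                      // so the data has to be uploaded again
memcpyFromGPU(h_buf, d_buf, n * sizeof(MD_FLOAT));                    // download results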
diff --git a/lammps/atom.c b/lammps/atom.c
index 4dd6829..037e232 100644
--- a/lammps/atom.c
+++ b/lammps/atom.c
@@ -71,14 +71,6 @@ void createAtom(Atom *atom, Parameter *param) {
     atom->Natoms = 4 * param->nx * param->ny * param->nz;
     atom->Nlocal = 0;
     atom->ntypes = param->ntypes;
-
-#ifdef CUDA_TARGET
-    checkCUDAError( "atom->epsilon cudaMallocHost", cudaMallocHost((void**)&(atom->epsilon), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
-    checkCUDAError( "atom->sigma6 cudaMallocHost", cudaMallocHost((void**)&(atom->sigma6), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
-    checkCUDAError( "atom->cutforcesq cudaMallocHost", cudaMallocHost((void**)&(atom->cutforcesq), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
-    checkCUDAError( "atom->cutneighsq cudaMallocHost", cudaMallocHost((void**)&(atom->cutneighsq), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
-#endif
-
     atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
     atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
     atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));

diff --git a/lammps/cuda/atom.cu b/lammps/cuda/atom.cu
index a2767e5..3dc2c3a 100644
--- a/lammps/cuda/atom.cu
+++ b/lammps/cuda/atom.cu
@@ -37,38 +37,29 @@ void initCuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor
     c_atom->Nghost = atom->Nghost;
     c_atom->Nmax = atom->Nmax;
     c_atom->ntypes = atom->ntypes;
-    c_atom->border_map = NULL;
-    const int Nlocal = atom->Nlocal;
+    c_atom->x = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->Nmax * 3);
+    c_atom->vx = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->Nmax * 3);
+    c_atom->fx = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->Nmax * 3);
+    c_atom->epsilon = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    c_atom->sigma6 = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    c_atom->cutforcesq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    c_atom->type = (int *) allocateGPU(sizeof(int) * atom->Nmax * 3);
+    c_neighbor->neighbors = (int *) allocateGPU(sizeof(int) * atom->Nmax * neighbor->maxneighs);
+    c_neighbor->numneigh = (int *) allocateGPU(sizeof(int) * atom->Nmax);

-    checkCUDAError( "c_atom->x malloc", cudaMalloc((void**)&(c_atom->x), sizeof(MD_FLOAT) * atom->Nmax * 3) );
-    checkCUDAError( "c_atom->x memcpy", cudaMemcpy(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
-
-    checkCUDAError( "c_atom->fx malloc", cudaMalloc((void**)&(c_atom->fx), sizeof(MD_FLOAT) * Nlocal * 3) );
-
-    checkCUDAError( "c_atom->vx malloc", cudaMalloc((void**)&(c_atom->vx), sizeof(MD_FLOAT) * Nlocal * 3) );
-    checkCUDAError( "c_atom->vx memcpy", cudaMemcpy(c_atom->vx, atom->vx, sizeof(MD_FLOAT) * Nlocal * 3, cudaMemcpyHostToDevice) );
-
-    checkCUDAError( "c_atom->type malloc", cudaMalloc((void**)&(c_atom->type), sizeof(int) * atom->Nmax) );
-    checkCUDAError( "c_atom->epsilon malloc", cudaMalloc((void**)&(c_atom->epsilon), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
-    checkCUDAError( "c_atom->sigma6 malloc", cudaMalloc((void**)&(c_atom->sigma6), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
-    checkCUDAError( "c_atom->cutforcesq malloc", cudaMalloc((void**)&(c_atom->cutforcesq), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
-
-    checkCUDAError( "c_neighbor->neighbors malloc", cudaMalloc((void**)&c_neighbor->neighbors, sizeof(int) * Nlocal * neighbor->maxneighs) );
-    checkCUDAError( "c_neighbor->numneigh malloc", cudaMalloc((void**)&c_neighbor->numneigh, sizeof(int) * Nlocal) );
-
-    checkCUDAError( "c_atom->type memcpy", cudaMemcpy(c_atom->type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) );
-    checkCUDAError( "c_atom->sigma6 memcpy", cudaMemcpy(c_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
-    checkCUDAError( "c_atom->epsilon memcpy", cudaMemcpy(c_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
-
-    checkCUDAError( "c_atom->cutforcesq memcpy", cudaMemcpy(c_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
+    memcpyToGPU(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
+    memcpyToGPU(c_atom->vx, atom->vx, sizeof(MD_FLOAT) * atom->Nmax * 3);
+    memcpyToGPU(c_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    memcpyToGPU(c_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    memcpyToGPU(c_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    memcpyToGPU(c_atom->type, atom->type, sizeof(int) * atom->Nmax);
 }

-void checkCUDAError(const char *msg, cudaError_t err) {
+void cuda_assert(const char *label, cudaError_t err) {
     if (err != cudaSuccess) {
-        //print a human readable error message
-        printf("[CUDA ERROR %s]: %s\r\n", msg, cudaGetErrorString(err));
+        printf("[CUDA Error]: %s: %s\r\n", label, cudaGetErrorString(err));
         exit(-1);
     }
 }
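/*
 * Illustrative sketch, not part of the patch: the launch-checking idiom that
 * cuda_assert() enables throughout the refactored code. cudaPeekAtLastError()
 * catches invalid launch configurations, cudaDeviceSynchronize() surfaces errors
 * raised while the kernel executed. scale_kernel, d_x and n are hypothetical.
 */
__global__ void scale_kernel(MD_FLOAT *x, MD_FLOAT s, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < n) { x[i] *= s; }
}

void scale_on_gpu(MD_FLOAT *d_x, MD_FLOAT s, int n, int num_threads_per_block) {
    const int num_blocks = ceil((float) n / (float) num_threads_per_block);
    scale_kernel<<<num_blocks, num_threads_per_block>>>(d_x, s, n);
    cuda_assert("scale_kernel launch", cudaPeekAtLastError());
    cuda_assert("scale_kernel sync", cudaDeviceSynchronize());
}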
"c_atom->cutforcesq malloc", cudaMalloc((void**)&(c_atom->cutforcesq), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) ); - - checkCUDAError( "c_neighbor->neighbors malloc", cudaMalloc((void**)&c_neighbor->neighbors, sizeof(int) * Nlocal * neighbor->maxneighs) ); - checkCUDAError( "c_neighbor->numneigh malloc", cudaMalloc((void**)&c_neighbor->numneigh, sizeof(int) * Nlocal) ); - - checkCUDAError( "c_atom->type memcpy", cudaMemcpy(c_atom->type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) ); - checkCUDAError( "c_atom->sigma6 memcpy", cudaMemcpy(c_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) ); - checkCUDAError( "c_atom->epsilon memcpy", cudaMemcpy(c_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) ); - - checkCUDAError( "c_atom->cutforcesq memcpy", cudaMemcpy(c_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) ); + memcpyToGPU(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3); + memcpyToGPU(c_atom->vx, atom->vx, sizeof(MD_FLOAT) * atom->Nmax * 3); + memcpyToGPU(c_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes); + memcpyToGPU(c_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes); + memcpyToGPU(c_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes); + memcpyToGPU(c_atom->type, atom->type, sizeof(int) * atom->Nmax); } -void checkCUDAError(const char *msg, cudaError_t err) { +void cuda_assert(const char *label, cudaError_t err) { if (err != cudaSuccess) { - //print a human readable error message - printf("[CUDA ERROR %s]: %s\r\n", msg, cudaGetErrorString(err)); + printf("[CUDA Error]: %s: %s\r\n", label, cudaGetErrorString(err)); exit(-1); } } diff --git a/lammps/cuda/force.cu b/lammps/cuda/force.cu index d142a30..2295082 100644 --- a/lammps/cuda/force.cu +++ b/lammps/cuda/force.cu @@ -33,6 +33,7 @@ extern "C" { +#include #include #include #include @@ -123,33 +124,31 @@ __global__ void kernel_final_integrate(MD_FLOAT dtforce, int Nlocal, Atom a) { extern "C" { -void finalIntegrate_cuda(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom) { +void finalIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom, Atom *c_atom) { const int Nlocal = atom->Nlocal; const int num_threads_per_block = get_num_threads(); const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block); kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, *c_atom); + cuda_assert("kernel_final_integrate", cudaPeekAtLastError()); + cuda_assert("kernel_final_integrate", cudaDeviceSynchronize()); - checkCUDAError( "PeekAtLastError FinalIntegrate", cudaPeekAtLastError() ); - checkCUDAError( "DeviceSync FinalIntegrate", cudaDeviceSynchronize() ); - - if(doReneighbour) { - checkCUDAError( "FinalIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom->vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) ); + if(reneigh) { + memcpyFromGPU(atom->vx, c_atom->vx, sizeof(MD_FLOAT) * atom->Nlocal * 3); } } -void initialIntegrate_cuda(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom) { +void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom, Atom *c_atom) { const int Nlocal = atom->Nlocal; const int num_threads_per_block = get_num_threads(); const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block); kernel_initial_integrate <<< num_blocks, 
diff --git a/lammps/cuda/neighbor.cu b/lammps/cuda/neighbor.cu
index 4af6e17..b183f5b 100644
--- a/lammps/cuda/neighbor.cu
+++ b/lammps/cuda/neighbor.cu
@@ -199,30 +199,25 @@ void binatoms_cuda(Atom *c_atom, Binning *c_binning, int *c_resize_needed, Neigh
     while(resize > 0) {
         resize = 0;
-        checkCUDAError("binatoms_cuda c_binning->bincount memset", cudaMemset(c_binning->bincount, 0, c_binning->mbins * sizeof(int)));
-        checkCUDAError("binatoms_cuda c_resize_needed memset", cudaMemset(c_resize_needed, 0, sizeof(int)) );
+        memsetGPU(c_binning->bincount, 0, c_binning->mbins * sizeof(int));
+        memsetGPU(c_resize_needed, 0, sizeof(int));

-        /*binatoms_kernel(Atom a, int* bincount, int* bins, int c_binning->atoms_per_bin, Neighbor_params np, int *resize_needed) */
         binatoms_kernel<<>>(*c_atom, c_binning->bincount, c_binning->bins, c_binning->atoms_per_bin, *np, c_resize_needed);
+        cuda_assert("binatoms", cudaPeekAtLastError());
+        cuda_assert("binatoms", cudaDeviceSynchronize());

-        checkCUDAError( "PeekAtLastError binatoms kernel", cudaPeekAtLastError() );
-        checkCUDAError( "DeviceSync binatoms kernel", cudaDeviceSynchronize() );
-
-        checkCUDAError("binatoms_cuda c_resize_needed memcpy back", cudaMemcpy(&resize, c_resize_needed, sizeof(int), cudaMemcpyDeviceToHost) );
-
+        memcpyFromGPU(&resize, c_resize_needed, sizeof(int));
         if(resize) {
-            cudaFree(c_binning->bins);
             c_binning->atoms_per_bin *= 2;
-            checkCUDAError("binatoms_cuda c_binning->bins resize malloc", cudaMalloc(&c_binning->bins, c_binning->mbins * c_binning->atoms_per_bin * sizeof(int)) );
+            c_binning->bins = (int *) reallocateGPU(c_binning->bins, c_binning->mbins * c_binning->atoms_per_bin * sizeof(int));
         }
     }

     atoms_per_bin = c_binning->atoms_per_bin;
-    const int sortBlocks = ceil((float)mbins / (float)threads_per_block);
-    /*void sort_bin_contents_kernel(int* bincount, int* bins, int mbins, int atoms_per_bin)*/
+    const int sortBlocks = ceil((float) mbins / (float) threads_per_block);
     sort_bin_contents_kernel<<<sortBlocks, threads_per_block>>>(c_binning->bincount, c_binning->bins, c_binning->mbins, c_binning->atoms_per_bin);
-    checkCUDAError( "PeekAtLastError sort_bin_contents kernel", cudaPeekAtLastError() );
-    checkCUDAError( "DeviceSync sort_bin_contents kernel", cudaDeviceSynchronize() );
+    cuda_assert("sort_bin", cudaPeekAtLastError());
+    cuda_assert("sort_bin", cudaDeviceSynchronize());
 }

 void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
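/*
 * Illustrative sketch, not part of the patch, of the overflow protocol used by
 * binatoms_cuda() above: the kernel raises a device-side flag instead of writing
 * past a full bin, the host reads the flag back, doubles the per-bin capacity
 * (reallocateGPU() may discard the contents because the bins are rebuilt from
 * scratch) and reruns the kernel until everything fits.
 */
int resize = 1;
while(resize > 0) {
    resize = 0;
    memsetGPU(c_resize_needed, 0, sizeof(int));
    /* launch the binning kernel; it sets *c_resize_needed when a bin overflows */
    memcpyFromGPU(&resize, c_resize_needed, sizeof(int));
    if(resize) {
        c_binning->atoms_per_bin *= 2;
        c_binning->bins = (int *) reallocateGPU(c_binning->bins,
                                                c_binning->mbins * c_binning->atoms_per_bin * sizeof(int));
    }
}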
@@ -231,18 +226,18 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *
     c_neighbor->maxneighs = neighbor->maxneighs;

     cudaProfilerStart();
-    /* upload stencil */
+    // TODO move all of this initialization into its own method
-    if(c_stencil == NULL){
-        checkCUDAError( "buildNeighbor c_n_stencil malloc", cudaMalloc((void**)&c_stencil, nstencil * sizeof(int)) );
-        checkCUDAError( "buildNeighbor c_n_stencil memcpy", cudaMemcpy(c_stencil, stencil, nstencil * sizeof(int), cudaMemcpyHostToDevice ));
+    if(c_stencil == NULL) {
+        c_stencil = (int *) allocateGPU(nstencil * sizeof(int));
+        memcpyToGPU(c_stencil, stencil, nstencil * sizeof(int));
     }

-    if(c_binning.mbins == 0){
+    if(c_binning.mbins == 0) {
         c_binning.mbins = mbins;
         c_binning.atoms_per_bin = atoms_per_bin;
-        checkCUDAError( "buildNeighbor c_binning->bincount malloc", cudaMalloc((void**)&(c_binning.bincount), c_binning.mbins * sizeof(int)) );
-        checkCUDAError( "buidlNeighbor c_binning->bins malloc", cudaMalloc((void**)&(c_binning.bins), c_binning.mbins * c_binning.atoms_per_bin * sizeof(int)) );
+        c_binning.bincount = (int *) allocateGPU(c_binning.mbins * sizeof(int));
+        c_binning.bins = (int *) allocateGPU(c_binning.mbins * c_binning.atoms_per_bin * sizeof(int));
     }

     Neighbor_params np {
@@ -263,14 +258,14 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *
         .mbinz = mbinz
     };

-    if(c_resize_needed == NULL){
-        checkCUDAError("buildNeighbor c_resize_needed malloc", cudaMalloc((void**)&c_resize_needed, sizeof(int)) );
+    if(c_resize_needed == NULL) {
+        c_resize_needed = (int *) allocateGPU(sizeof(int));
     }

     /* bin local & ghost atoms */
     binatoms_cuda(c_atom, &c_binning, c_resize_needed, &np, num_threads_per_block);
-    if(c_new_maxneighs == NULL){
-        checkCUDAError("c_new_maxneighs malloc", cudaMalloc((void**)&c_new_maxneighs, sizeof(int) ));
+    if(c_new_maxneighs == NULL) {
+        c_new_maxneighs = (int *) allocateGPU(sizeof(int));
     }

     int resize = 1;
@@ -278,35 +273,26 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *
     /* extend c_neighbor arrays if necessary */
     if(nall > nmax) {
         nmax = nall;
-        if(c_neighbor->numneigh) cudaFree(c_neighbor->numneigh);
-        if(c_neighbor->neighbors) cudaFree(c_neighbor->neighbors);
-        checkCUDAError( "buildNeighbor c_numneigh malloc", cudaMalloc((void**)&(c_neighbor->numneigh), nmax * sizeof(int)) );
-        checkCUDAError( "buildNeighbor c_neighbors malloc", cudaMalloc((void**)&(c_neighbor->neighbors), nmax * c_neighbor->maxneighs * sizeof(int)) );
+        c_neighbor->neighbors = (int *) reallocateGPU(c_neighbor->neighbors, nmax * c_neighbor->maxneighs * sizeof(int));
+        c_neighbor->numneigh = (int *) reallocateGPU(c_neighbor->numneigh, nmax * sizeof(int));
     }

     /* loop over each atom, storing neighbors */
     while(resize) {
         resize = 0;
-
-        checkCUDAError("c_new_maxneighs memset", cudaMemset(c_new_maxneighs, 0, sizeof(int) ));
-
-        // TODO call compute_neigborhood kernel here
+        memsetGPU(c_new_maxneighs, 0, sizeof(int));

         const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
-        /*compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np, int nstencil, int* stencil,
-            int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs)
-         * */
         compute_neighborhood<<<num_blocks, num_threads_per_block>>>(*c_atom, *c_neighbor, np, nstencil, c_stencil, c_binning.bins, c_binning.atoms_per_bin, c_binning.bincount, c_new_maxneighs, cutneighsq);
-        checkCUDAError( "PeekAtLastError ComputeNeighbor", cudaPeekAtLastError() );
-        checkCUDAError( "DeviceSync ComputeNeighbor", cudaDeviceSynchronize() );
+        cuda_assert("compute_neighborhood", cudaPeekAtLastError());
+        cuda_assert("compute_neighborhood", cudaDeviceSynchronize());

-        // TODO copy the value of c_new_maxneighs back to host and check if it has been modified
         int new_maxneighs;
-        checkCUDAError("c_new_maxneighs memcpy back", cudaMemcpy(&new_maxneighs, c_new_maxneighs, sizeof(int), cudaMemcpyDeviceToHost));
+        memcpyFromGPU(&new_maxneighs, c_new_maxneighs, sizeof(int));
         if (new_maxneighs > c_neighbor->maxneighs){
             resize = 1;
         }
@@ -315,8 +301,7 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *
             printf("RESIZE %d\n", c_neighbor->maxneighs);
             c_neighbor->maxneighs = new_maxneighs * 1.2;
             printf("NEW SIZE %d\n", c_neighbor->maxneighs);
-            cudaFree(c_neighbor->neighbors);
-            checkCUDAError("c_neighbor->neighbors resize malloc", cudaMalloc((void**)(&c_neighbor->neighbors), c_atom->Nmax * c_neighbor->maxneighs * sizeof(int)));
+            c_neighbor->neighbors = (int *) reallocateGPU(c_neighbor->neighbors, c_atom->Nmax * c_neighbor->maxneighs * sizeof(int));
         }
     }
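/*
 * Illustrative note, not part of the patch: buildNeighbor_cuda() above applies
 * the same flag-and-retry protocol to the neighbor lists, but grows the per-atom
 * capacity by 20% instead of doubling. reallocateGPU() is sufficient here because
 * the lists are rebuilt on the next pass of the while(resize) loop anyway.
 */
if(new_maxneighs > c_neighbor->maxneighs) {
    c_neighbor->maxneighs = new_maxneighs * 1.2;
    c_neighbor->neighbors = (int *) reallocateGPU(c_neighbor->neighbors,
                                                  c_atom->Nmax * c_neighbor->maxneighs * sizeof(int));
    resize = 1;   /* rerun compute_neighborhood with the larger buffers */
}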
diff --git a/lammps/cuda/pbc.cu b/lammps/cuda/pbc.cu
index 3490fae..2982b5f 100644
--- a/lammps/cuda/pbc.cu
+++ b/lammps/cuda/pbc.cu
@@ -39,7 +39,7 @@ extern int *PBCx, *PBCy, *PBCz;
 static int c_NmaxGhost;
 static int *c_PBCx, *c_PBCy, *c_PBCz;

-__global__ void computeAtomsPbcUpdate(Atom a, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){
+__global__ void computeAtomsPbcUpdate(Atom a, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd) {
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
     Atom* atom = &a;
     if(i >= atom->Nlocal) {
@@ -94,30 +94,25 @@ void updatePbc_cuda(Atom *atom, Atom *c_atom, Parameter *param, bool doReneighbo
         if (atom->Nmax > c_atom->Nmax){ // the number of ghost atoms has increased -> more space is needed
             c_atom->Nmax = atom->Nmax;
-            if(c_atom->x != NULL){ cudaFree(c_atom->x); }
-            if(c_atom->type != NULL){ cudaFree(c_atom->type); }
-            checkCUDAError( "updatePbc c_atom->x malloc", cudaMalloc((void**)&(c_atom->x), sizeof(MD_FLOAT) * atom->Nmax * 3) );
-            checkCUDAError( "updatePbc c_atom->type malloc", cudaMalloc((void**)&(c_atom->type), sizeof(int) * atom->Nmax) );
+            c_atom->x = (MD_FLOAT *) reallocateGPU(c_atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
+            c_atom->type = (int *) reallocateGPU(c_atom->type, sizeof(int) * atom->Nmax);
         }
-        // TODO if the sort is reactivated the atom->vx needs to be copied to GPU as well
-        checkCUDAError( "updatePbc c_atom->x memcpy", cudaMemcpy(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
-        checkCUDAError( "updatePbc c_atom->type memcpy", cudaMemcpy(c_atom->type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) );
-        if(c_NmaxGhost < NmaxGhost){
+        memcpyToGPU(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
+        memcpyToGPU(c_atom->type, atom->type, sizeof(int) * atom->Nmax);
+
+        if(c_NmaxGhost < NmaxGhost) {
             c_NmaxGhost = NmaxGhost;
-            if(c_PBCx != NULL){ cudaFree(c_PBCx); }
-            if(c_PBCy != NULL){ cudaFree(c_PBCy); }
-            if(c_PBCz != NULL){ cudaFree(c_PBCz); }
-            if(c_atom->border_map != NULL){ cudaFree(c_atom->border_map); }
-            checkCUDAError( "updatePbc c_PBCx malloc", cudaMalloc((void**)&c_PBCx, NmaxGhost * sizeof(int)) );
-            checkCUDAError( "updatePbc c_PBCy malloc", cudaMalloc((void**)&c_PBCy, NmaxGhost * sizeof(int)) );
-            checkCUDAError( "updatePbc c_PBCz malloc", cudaMalloc((void**)&c_PBCz, NmaxGhost * sizeof(int)) );
-            checkCUDAError( "updatePbc c_atom->border_map malloc", cudaMalloc((void**)&(c_atom->border_map), NmaxGhost * sizeof(int)) );
+            c_PBCx = (int *) reallocateGPU(c_PBCx, NmaxGhost * sizeof(int));
+            c_PBCy = (int *) reallocateGPU(c_PBCy, NmaxGhost * sizeof(int));
+            c_PBCz = (int *) reallocateGPU(c_PBCz, NmaxGhost * sizeof(int));
+            c_atom->border_map = (int *) reallocateGPU(c_atom->border_map, NmaxGhost * sizeof(int));
         }
-        checkCUDAError( "updatePbc c_PBCx memcpy", cudaMemcpy(c_PBCx, PBCx, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
-        checkCUDAError( "updatePbc c_PBCy memcpy", cudaMemcpy(c_PBCy, PBCy, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
-        checkCUDAError( "updatePbc c_PBCz memcpy", cudaMemcpy(c_PBCz, PBCz, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
-        checkCUDAError( "updatePbc c_atom->border_map memcpy", cudaMemcpy(c_atom->border_map, atom->border_map, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
+
+        memcpyToGPU(c_PBCx, PBCx, NmaxGhost * sizeof(int));
+        memcpyToGPU(c_PBCy, PBCy, NmaxGhost * sizeof(int));
+        memcpyToGPU(c_PBCz, PBCz, NmaxGhost * sizeof(int));
+        memcpyToGPU(c_atom->border_map, atom->border_map, NmaxGhost * sizeof(int));
     }

     MD_FLOAT xprd = param->xprd;
@@ -125,26 +120,20 @@ void updatePbc_cuda(Atom *atom, Atom *c_atom, Parameter *param, bool doReneighbo
     MD_FLOAT zprd = param->zprd;

     const int num_blocks = ceil((float)atom->Nghost / (float)num_threads_per_block);
-
-    /*__global__ void computePbcUpdate(Atom a, int* PBCx, int* PBCy, int* PBCz,
-     *                                 MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd)
-     * */
     computePbcUpdate<<<num_blocks, num_threads_per_block>>>(*c_atom, c_PBCx, c_PBCy, c_PBCz, xprd, yprd, zprd);
-    checkCUDAError( "PeekAtLastError UpdatePbc", cudaPeekAtLastError() );
-    checkCUDAError( "DeviceSync UpdatePbc", cudaDeviceSynchronize() );
+    cuda_assert("computePbcUpdate", cudaPeekAtLastError());
+    cuda_assert("computePbcUpdate", cudaDeviceSynchronize());
 }

-void updateAtomsPbc_cuda(Atom* atom, Atom *c_atom, Parameter *param){
+void updateAtomsPbc_cuda(Atom* atom, Atom *c_atom, Parameter *param) {
     const int num_threads_per_block = get_num_threads();
     MD_FLOAT xprd = param->xprd;
     MD_FLOAT yprd = param->yprd;
     MD_FLOAT zprd = param->zprd;

     const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
-    /*void computeAtomsPbcUpdate(Atom a, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd)*/
     computeAtomsPbcUpdate<<<num_blocks, num_threads_per_block>>>(*c_atom, xprd, yprd, zprd);
-
-    checkCUDAError( "PeekAtLastError UpdateAtomsPbc", cudaPeekAtLastError() );
-    checkCUDAError( "DeviceSync UpdateAtomsPbc", cudaDeviceSynchronize() );
-    checkCUDAError( "updateAtomsPbc position memcpy back", cudaMemcpy(atom->x, c_atom->x, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
+    cuda_assert("computeAtomsPbcUpdate", cudaPeekAtLastError());
+    cuda_assert("computeAtomsPbcUpdate", cudaDeviceSynchronize());
+    memcpyFromGPU(atom->x, c_atom->x, sizeof(MD_FLOAT) * atom->Nlocal * 3);
 }
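/*
 * Illustrative note, not part of the patch: updatePbc_cuda() above uses a
 * grow-only capacity scheme for the per-ghost arrays. A cached capacity
 * (c_NmaxGhost) is compared against the current requirement, the buffers are
 * reallocated only when the ghost count grows, and the host-side data is
 * uploaded whenever this path runs (the upload is a no-op under CUDA_HOST_MEMORY).
 */
if(c_NmaxGhost < NmaxGhost) {
    c_NmaxGhost = NmaxGhost;
    c_PBCx = (int *) reallocateGPU(c_PBCx, NmaxGhost * sizeof(int));
}
memcpyToGPU(c_PBCx, PBCx, NmaxGhost * sizeof(int));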
diff --git a/lammps/includes/cuda_atom.h b/lammps/includes/cuda_atom.h
index 232daec..b164985 100644
--- a/lammps/includes/cuda_atom.h
+++ b/lammps/includes/cuda_atom.h
@@ -6,5 +6,10 @@
 #ifndef __CUDA_ATOM_H_
 #define __CUDA_ATOM_H_
 extern void initCuda(Atom*, Neighbor*, Atom*, Neighbor*);
-extern void checkCUDAError(const char *msg, cudaError_t err);
+extern void cuda_assert(const char *msg, cudaError_t err);
+extern void *allocateGPU(size_t bytesize);
+extern void *reallocateGPU(void *ptr, size_t new_bytesize);
+extern void memcpyToGPU(void *d_ptr, void *h_ptr, size_t bytesize);
+extern void memcpyFromGPU(void *h_ptr, void *d_ptr, size_t bytesize);
+extern void memsetGPU(void *d_ptr, int value, size_t bytesize);
 #endif
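/*
 * Illustrative note, not part of the patch: force.cu (above) wraps its
 * definitions in extern "C" blocks, so the C translation units link against
 * unmangled names. If this header were ever included from C++ code, the usual
 * companion would be a linkage guard; a sketch only, not present in the source:
 */
#ifdef __cplusplus
extern "C" {
#endif
extern void *allocateGPU(size_t bytesize);
/* ... remaining declarations as in cuda_atom.h ... */
#ifdef __cplusplus
}
#endif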
diff --git a/lammps/main.c b/lammps/main.c
index 1791ddf..d936858 100644
--- a/lammps/main.c
+++ b/lammps/main.c
@@ -261,9 +261,8 @@ int main(int argc, char** argv) {
         if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
 #ifdef CUDA_TARGET
-            checkCUDAError("computeThermo atom->x memcpy back", cudaMemcpy(atom.x, c_atom.x, atom.Nmax * sizeof(MD_FLOAT) * 3, cudaMemcpyDeviceToHost));
+            memcpyFromGPU(atom.x, c_atom.x, atom.Nmax * sizeof(MD_FLOAT) * 3);
 #endif
-
             computeThermo(n + 1, &param, &atom);
         }