diff --git a/lammps/atom.c b/lammps/atom.c
index 037e232..ce133b8 100644
--- a/lammps/atom.c
+++ b/lammps/atom.c
@@ -62,6 +62,17 @@ void initAtom(Atom *atom) {
     atom->radius = NULL;
     atom->av = NULL;
     atom->r = NULL;
+
+    DeviceAtom *d_atom = &(atom->d_atom);
+    d_atom->x = NULL; d_atom->y = NULL; d_atom->z = NULL;
+    d_atom->vx = NULL; d_atom->vy = NULL; d_atom->vz = NULL;
+    d_atom->fx = NULL; d_atom->fy = NULL; d_atom->fz = NULL;
+    d_atom->border_map = NULL;
+    d_atom->type = NULL;
+    d_atom->epsilon = NULL;
+    d_atom->sigma6 = NULL;
+    d_atom->cutforcesq = NULL;
+    d_atom->cutneighsq = NULL;
 }
 
 void createAtom(Atom *atom, Parameter *param) {
@@ -513,25 +524,32 @@ int readAtom_in(Atom* atom, Parameter* param) {
 }
 
 void growAtom(Atom *atom) {
+    DeviceAtom *d_atom = &(atom->d_atom);
     int nold = atom->Nmax;
     atom->Nmax += DELTA;
 
+    #undef REALLOC
+    #define REALLOC(p,t,ns,os) \
+    atom->p = (t *) reallocate(atom->p, ALIGNMENT, ns, os); \
+    d_atom->p = (t *) reallocateGPU(d_atom->p, ns);
+
 #ifdef AOS
-    atom->x = (MD_FLOAT*) reallocate(atom->x, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
-    atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
-    atom->fx = (MD_FLOAT*) reallocate(atom->fx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
+    REALLOC(x, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
+    REALLOC(vx, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
+    REALLOC(fx, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
 #else
-    atom->x = (MD_FLOAT*) reallocate(atom->x, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
-    atom->y = (MD_FLOAT*) reallocate(atom->y, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
-    atom->z = (MD_FLOAT*) reallocate(atom->z, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
-    atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
-    atom->vy = (MD_FLOAT*) reallocate(atom->vy, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
-    atom->vz = (MD_FLOAT*) reallocate(atom->vz, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
-    atom->fx = (MD_FLOAT*) reallocate(atom->fx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
-    atom->fy = (MD_FLOAT*) reallocate(atom->fy, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
-    atom->fz = (MD_FLOAT*) reallocate(atom->fz, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+    REALLOC(x, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+    REALLOC(y, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+    REALLOC(z, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+    REALLOC(vx, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+    REALLOC(vy, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+    REALLOC(vz, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+    REALLOC(fx, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+    REALLOC(fy, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+    REALLOC(fz, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
 #endif
-    atom->type = (int *) reallocate(atom->type, ALIGNMENT, atom->Nmax * sizeof(int), nold * sizeof(int));
+    REALLOC(type, int, atom->Nmax * sizeof(int), nold * sizeof(int));
 
+    // DEM
     atom->radius = (MD_FLOAT *) reallocate(atom->radius, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
     atom->av = (MD_FLOAT*) reallocate(atom->av, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
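For review convenience: a single invocation such as `REALLOC(x, MD_FLOAT, ns, os)` expands to the host/device pair below (illustration only; `ns` and `os` stand for the new and old byte counts passed at the call site):

```c
/* Expansion of REALLOC(x, MD_FLOAT, ns, os) -- illustration only */
atom->x = (MD_FLOAT *) reallocate(atom->x, ALIGNMENT, ns, os);  /* aligned host buffer */
d_atom->x = (MD_FLOAT *) reallocateGPU(d_atom->x, ns);          /* device mirror, same capacity */
```

This keeps every host array and its device mirror at the same capacity after each growth step, so the later `Nmax`-sized transfers in initCuda and updatePbc_cuda need no separate capacity checks.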
diff --git a/lammps/cuda/atom.cu b/lammps/cuda/atom.cu
index 3dc2c3a..7a9d4d3 100644
--- a/lammps/cuda/atom.cu
+++ b/lammps/cuda/atom.cu
@@ -31,30 +31,22 @@ extern "C" {
 #include
 #include
 
-void initCuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
-    c_atom->Natoms = atom->Natoms;
-    c_atom->Nlocal = atom->Nlocal;
-    c_atom->Nghost = atom->Nghost;
-    c_atom->Nmax = atom->Nmax;
-    c_atom->ntypes = atom->ntypes;
-    c_atom->border_map = NULL;
+void initCuda(Atom *atom, Neighbor *neighbor) {
+    DeviceAtom *d_atom = &(atom->d_atom);
+    DeviceNeighbor *d_neighbor = &(neighbor->d_neighbor);
 
-    c_atom->x = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->Nmax * 3);
-    c_atom->vx = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->Nmax * 3);
-    c_atom->fx = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->Nmax * 3);
-    c_atom->epsilon = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
-    c_atom->sigma6 = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
-    c_atom->cutforcesq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
-    c_atom->type = (int *) allocateGPU(sizeof(int) * atom->Nmax * 3);
-    c_neighbor->neighbors = (int *) allocateGPU(sizeof(int) * atom->Nmax * neighbor->maxneighs);
-    c_neighbor->numneigh = (int *) allocateGPU(sizeof(int) * atom->Nmax);
+    d_atom->epsilon = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    d_atom->sigma6 = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    d_atom->cutforcesq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    d_neighbor->neighbors = (int *) allocateGPU(sizeof(int) * atom->Nmax * neighbor->maxneighs);
+    d_neighbor->numneigh = (int *) allocateGPU(sizeof(int) * atom->Nmax);
 
-    memcpyToGPU(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
-    memcpyToGPU(c_atom->vx, atom->vx, sizeof(MD_FLOAT) * atom->Nmax * 3);
-    memcpyToGPU(c_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
-    memcpyToGPU(c_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
-    memcpyToGPU(c_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
-    memcpyToGPU(c_atom->type, atom->type, sizeof(int) * atom->Nmax);
+    memcpyToGPU(d_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
+    memcpyToGPU(d_atom->vx, atom->vx, sizeof(MD_FLOAT) * atom->Nmax * 3);
+    memcpyToGPU(d_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    memcpyToGPU(d_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    memcpyToGPU(d_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    memcpyToGPU(d_atom->type, atom->type, sizeof(int) * atom->Nmax);
 }
 
 void cuda_assert(const char *label, cudaError_t err) {
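With growAtom now sizing the per-atom device buffers (`x`, `vx`, `fx`, `type`) through REALLOC, initCuda only allocates the per-type tables and the neighbor arrays before the initial upload. The `allocateGPU`/`memcpyToGPU` helpers themselves are outside this diff; a plausible minimal shape, assuming plain CUDA runtime calls (the real wrappers may use managed memory or extra bookkeeping):

```c
#include <cuda_runtime.h>

/* Hypothetical sketch of the wrappers declared in cuda_atom.h; not this repo's code. */
void *allocateGPU(size_t bytesize) {
    void *ptr = NULL;
    cuda_assert("allocateGPU", cudaMalloc(&ptr, bytesize));   /* device allocation */
    return ptr;
}

void memcpyToGPU(void *d_ptr, void *h_ptr, size_t bytesize) {
    cuda_assert("memcpyToGPU", cudaMemcpy(d_ptr, h_ptr, bytesize, cudaMemcpyHostToDevice));
}
```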
diff --git a/lammps/cuda/force.cu b/lammps/cuda/force.cu
index 2295082..bdd55da 100644
--- a/lammps/cuda/force.cu
+++ b/lammps/cuda/force.cu
@@ -45,14 +45,13 @@ extern "C" {
 }
 
 // cuda kernel
-__global__ void calc_force(Atom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh) {
+__global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh) {
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
     if(i >= Nlocal) {
         return;
     }
 
-    Atom *atom = &a;
-
+    DeviceAtom *atom = &a;
     const int numneighs = neigh_numneigh[i];
 
     MD_FLOAT xtmp = atom_x(i);
@@ -64,7 +63,7 @@ __global__ void calc_force(Atom a, MD_FLOAT cutforcesq, MD_FLOA
     MD_FLOAT fiz = 0;
 
     for(int k = 0; k < numneighs; k++) {
-        int j = neigh_neighbors[atom->Nlocal * k + i];
+        int j = neigh_neighbors[Nlocal * k + i];
         MD_FLOAT delx = xtmp - atom_x(j);
         MD_FLOAT dely = ytmp - atom_y(j);
         MD_FLOAT delz = ztmp - atom_z(j);
@@ -93,13 +92,13 @@ __global__ void calc_force(Atom a, MD_FLOAT cutforcesq, MD_FLOA
     atom_fz(i) = fiz;
 }
 
-__global__ void kernel_initial_integrate(MD_FLOAT dtforce, MD_FLOAT dt, int Nlocal, Atom a) {
+__global__ void kernel_initial_integrate(MD_FLOAT dtforce, MD_FLOAT dt, int Nlocal, DeviceAtom a) {
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
     if( i >= Nlocal ) {
         return;
     }
 
-    Atom *atom = &a;
+    DeviceAtom *atom = &a;
 
     atom_vx(i) += dtforce * atom_fx(i);
     atom_vy(i) += dtforce * atom_fy(i);
@@ -109,13 +108,13 @@ __global__ void kernel_initial_integrate(MD_FLOAT dtforce, MD_FLOAT dt, int Nloc
     atom_z(i) = atom_z(i) + dt * atom_vz(i);
 }
 
-__global__ void kernel_final_integrate(MD_FLOAT dtforce, int Nlocal, Atom a) {
+__global__ void kernel_final_integrate(MD_FLOAT dtforce, int Nlocal, DeviceAtom a) {
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
     if( i >= Nlocal ) {
         return;
     }
 
-    Atom *atom = &a;
+    DeviceAtom *atom = &a;
 
     atom_vx(i) += dtforce * atom_fx(i);
     atom_vy(i) += dtforce * atom_fy(i);
@@ -124,35 +123,35 @@ __global__ void kernel_final_integrate(MD_FLOAT dtforce, int Nlocal, Atom a) {
 
 extern "C" {
 
-void finalIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom, Atom *c_atom) {
+void finalIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
    const int Nlocal = atom->Nlocal;
    const int num_threads_per_block = get_num_threads();
    const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
 
-    kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, *c_atom);
+    kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, atom->d_atom);
     cuda_assert("kernel_final_integrate", cudaPeekAtLastError());
     cuda_assert("kernel_final_integrate", cudaDeviceSynchronize());
 
     if(reneigh) {
-        memcpyFromGPU(atom->vx, c_atom->vx, sizeof(MD_FLOAT) * atom->Nlocal * 3);
+        memcpyFromGPU(atom->vx, atom->d_atom.vx, sizeof(MD_FLOAT) * atom->Nlocal * 3);
     }
 }
 
-void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom, Atom *c_atom) {
+void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
    const int Nlocal = atom->Nlocal;
    const int num_threads_per_block = get_num_threads();
    const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
 
-    kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, *c_atom);
+    kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, atom->d_atom);
     cuda_assert("kernel_initial_integrate", cudaPeekAtLastError());
     cuda_assert("kernel_initial_integrate", cudaDeviceSynchronize());
 
     if(reneigh) {
-        memcpyFromGPU(atom->vx, c_atom->vx, sizeof(MD_FLOAT) * atom->Nlocal * 3);
+        memcpyFromGPU(atom->vx, atom->d_atom.vx, sizeof(MD_FLOAT) * atom->Nlocal * 3);
     }
 }
 
-double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
+double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neighbor) {
     const int num_threads_per_block = get_num_threads();
     int Nlocal = atom->Nlocal;
 #ifndef EXPLICIT_TYPES
@@ -175,14 +174,14 @@ double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neig
 
     // HINT: Run with cuda-memcheck ./MDBench-NVCC in case of error
-    // checkCUDAError( "c_atom->fx memset", cudaMemset(c_atom->fx, 0, sizeof(MD_FLOAT) * Nlocal * 3) );
+    // memsetGPU(atom->d_atom.fx, 0, sizeof(MD_FLOAT) * Nlocal * 3);
 
     cudaProfilerStart();
 
     const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
 
     double S = getTimeStamp();
     LIKWID_MARKER_START("force");
 
-    calc_force <<< num_blocks, num_threads_per_block >>> (*c_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, c_neighbor->neighbors, c_neighbor->numneigh);
+    calc_force <<< num_blocks, num_threads_per_block >>> (atom->d_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, neighbor->d_neighbor.neighbors, neighbor->d_neighbor.numneigh);
     cuda_assert("calc_force", cudaPeekAtLastError());
     cuda_assert("calc_force", cudaDeviceSynchronize());
     cudaProfilerStop();
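Kernels receive the `DeviceAtom` by value: each thread gets its own copy of the small, pointer-only struct in kernel parameter space, and `DeviceAtom *atom = &a;` recreates the local `atom` pointer that the `atom_x(i)`/`atom_fx(i)` accessor macros expand against. Counts such as `Nlocal` now travel as explicit scalars because the device struct intentionally carries no sizes. A stripped-down sketch of the same pattern, with hypothetical names:

```c
typedef struct { float *x; } DevVec;   // pointer-only, like DeviceAtom
#define vec_x(i) atom->x[i]            // accessor macro expands against a local `atom`

__global__ void scale_kernel(DevVec a, int n, float s) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i >= n) { return; }
    DevVec *atom = &a;                 // address of this thread's private copy
    vec_x(i) *= s;                     // the pointer inside targets device memory
}
```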
diff --git a/lammps/cuda/neighbor.cu b/lammps/cuda/neighbor.cu
index b183f5b..ad73d77 100644
--- a/lammps/cuda/neighbor.cu
+++ b/lammps/cuda/neighbor.cu
@@ -114,11 +114,10 @@ __global__ void sort_bin_contents_kernel(int* bincount, int* bins, int mbins, in
     } while (!sorted);
 }
 
-__global__ void binatoms_kernel(Atom a, int* bincount, int* bins, int atoms_per_bin, Neighbor_params np, int *resize_needed) {
-    Atom* atom = &a;
+__global__ void binatoms_kernel(DeviceAtom a, int nall, int* bincount, int* bins, int atoms_per_bin, Neighbor_params np, int *resize_needed) {
+    DeviceAtom* atom = &a;
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
-    int nall = atom->Nlocal + atom->Nghost;
-    if(i >= nall){
+    if(i >= nall) {
         return;
     }
 
@@ -135,16 +134,17 @@ __global__ void binatoms_kernel(Atom a, int* bincount, int* bins, int atoms_per_
     }
 }
 
-__global__ void compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np, int nstencil, int* stencil,
-                                     int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq) {
+__global__ void compute_neighborhood(
+    DeviceAtom a, DeviceNeighbor neigh, Neighbor_params np, int nlocal, int maxneighs, int nstencil, int* stencil,
+    int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq) {
+
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
-    const int Nlocal = a.Nlocal;
-    if( i >= Nlocal ) {
+    if(i >= nlocal) {
         return;
     }
 
-    Atom *atom = &a;
-    Neighbor *neighbor = &neigh;
+    DeviceAtom *atom = &a;
+    DeviceNeighbor *neighbor = &neigh;
 
     int* neighptr = &(neighbor->neighbors[i]);
     int n = 0;
@@ -179,7 +179,7 @@ __global__ void compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np,
 #endif
 
             if( rsq <= cutoff ) {
-                int idx = atom->Nlocal * n;
+                int idx = nlocal * n;
                 neighptr[idx] = j;
                 n += 1;
             }
@@ -187,13 +187,13 @@
     }
 
     neighbor->numneigh[i] = n;
-    if(n > neighbor->maxneighs) {
+    if(n > maxneighs) {
         atomicMax(new_maxneighs, n);
     }
 }
 
-void binatoms_cuda(Atom *c_atom, Binning *c_binning, int *c_resize_needed, Neighbor_params *np, const int threads_per_block) {
-    int nall = c_atom->Nlocal + c_atom->Nghost;
+void binatoms_cuda(Atom *atom, Binning *c_binning, int *c_resize_needed, Neighbor_params *np, const int threads_per_block) {
+    int nall = atom->Nlocal + atom->Nghost;
     int resize = 1;
     const int num_blocks = ceil((float) nall / (float) threads_per_block);
 
@@ -202,7 +202,7 @@
         memsetGPU(c_binning->bincount, 0, c_binning->mbins * sizeof(int));
         memsetGPU(c_resize_needed, 0, sizeof(int));
-        binatoms_kernel<<<num_blocks, threads_per_block>>>(*c_atom, c_binning->bincount, c_binning->bins, c_binning->atoms_per_bin, *np, c_resize_needed);
+        binatoms_kernel<<<num_blocks, threads_per_block>>>(atom->d_atom, nall, c_binning->bincount, c_binning->bins, c_binning->atoms_per_bin, *np, c_resize_needed);
 
         cuda_assert("binatoms", cudaPeekAtLastError());
         cuda_assert("binatoms", cudaDeviceSynchronize());
@@ -220,10 +220,10 @@
     cuda_assert("sort_bin", cudaDeviceSynchronize());
 }
 
-void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
+void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor) {
+    DeviceNeighbor *d_neighbor = &(neighbor->d_neighbor);
     const int num_threads_per_block = get_num_threads();
     int nall = atom->Nlocal + atom->Nghost;
-    c_neighbor->maxneighs = neighbor->maxneighs;
 
     cudaProfilerStart();
 
@@ -263,18 +263,17 @@
     }
 
     /* bin local & ghost atoms */
-    binatoms_cuda(c_atom, &c_binning, c_resize_needed, &np, num_threads_per_block);
+    binatoms_cuda(atom, &c_binning, c_resize_needed, &np, num_threads_per_block);
 
     if(c_new_maxneighs == NULL) {
         c_new_maxneighs = (int *) allocateGPU(sizeof(int));
     }
 
     int resize = 1;
 
-    /* extend c_neighbor arrays if necessary */
     if(nall > nmax) {
         nmax = nall;
-        c_neighbor->neighbors = (int *) reallocateGPU(c_neighbor->neighbors, nmax * c_neighbor->maxneighs * sizeof(int));
-        c_neighbor->numneigh = (int *) reallocateGPU(c_neighbor->numneigh, nmax * sizeof(int));
+        d_neighbor->neighbors = (int *) reallocateGPU(d_neighbor->neighbors, nmax * neighbor->maxneighs * sizeof(int));
+        d_neighbor->numneigh = (int *) reallocateGPU(d_neighbor->numneigh, nmax * sizeof(int));
     }
 
     /* loop over each atom, storing neighbors */
@@ -282,8 +281,8 @@
         resize = 0;
         memsetGPU(c_new_maxneighs, 0, sizeof(int));
         const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
-        compute_neighborhood<<<num_blocks, num_threads_per_block>>>(*c_atom, *c_neighbor,
-                                                                    np, nstencil, c_stencil,
+        compute_neighborhood<<<num_blocks, num_threads_per_block>>>(atom->d_atom, *d_neighbor,
+                                                                    np, atom->Nlocal, neighbor->maxneighs, nstencil, c_stencil,
                                                                     c_binning.bins, c_binning.atoms_per_bin, c_binning.bincount,
                                                                     c_new_maxneighs, cutneighsq);
 
@@ -293,19 +292,18 @@
         int new_maxneighs;
         memcpyFromGPU(&new_maxneighs, c_new_maxneighs, sizeof(int));
 
-        if (new_maxneighs > c_neighbor->maxneighs){
+        if(new_maxneighs > neighbor->maxneighs) {
             resize = 1;
         }
 
         if(resize) {
-            printf("RESIZE %d\n", c_neighbor->maxneighs);
-            c_neighbor->maxneighs = new_maxneighs * 1.2;
-            printf("NEW SIZE %d\n", c_neighbor->maxneighs);
-            c_neighbor->neighbors = (int *) reallocateGPU(c_neighbor->neighbors, c_atom->Nmax * c_neighbor->maxneighs * sizeof(int));
+            printf("RESIZE %d\n", neighbor->maxneighs);
+            neighbor->maxneighs = new_maxneighs * 1.2;
+            printf("NEW SIZE %d\n", neighbor->maxneighs);
+            d_neighbor->neighbors = (int *) reallocateGPU(d_neighbor->neighbors, atom->Nmax * neighbor->maxneighs * sizeof(int));
         }
     }
 
-    neighbor->maxneighs = c_neighbor->maxneighs;
-
     cudaProfilerStop();
 }
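Note the index arithmetic (`neigh_neighbors[Nlocal * k + i]` in calc_force, `nlocal * n` above): the device neighbor list is stored transposed, so at each k-iteration the threads of a warp (consecutive `i`) load consecutive ints. A side-by-side sketch with hypothetical helper names; the row-major host layout is an assumption based on the usual `i * maxneighs + k` indexing:

```c
/* Host layout (assumed): all neighbors of one atom are contiguous. */
int entry_host(const int *neighbors, int maxneighs, int i, int k) {
    return neighbors[i * maxneighs + k];
}

/* Device layout (as used by the kernels above): the k-th entries of all atoms
 * are contiguous, giving coalesced loads across a warp. */
int entry_device(const int *neighbors, int nlocal, int i, int k) {
    return neighbors[k * nlocal + i];
}
```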
diff --git a/lammps/cuda/pbc.cu b/lammps/cuda/pbc.cu
index 2982b5f..90e4c7f 100644
--- a/lammps/cuda/pbc.cu
+++ b/lammps/cuda/pbc.cu
@@ -36,13 +36,13 @@ extern "C" {
 extern int NmaxGhost;
 extern int *PBCx, *PBCy, *PBCz;
 
-static int c_NmaxGhost;
-static int *c_PBCx, *c_PBCy, *c_PBCz;
+static int c_NmaxGhost = 0;
+static int *c_PBCx = NULL, *c_PBCy = NULL, *c_PBCz = NULL;
 
-__global__ void computeAtomsPbcUpdate(Atom a, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd) {
+__global__ void computeAtomsPbcUpdate(DeviceAtom a, int nlocal, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd) {
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
-    Atom* atom = &a;
-    if(i >= atom->Nlocal) {
+    DeviceAtom *atom = &a;
+    if(i >= nlocal) {
         return;
     }
 
@@ -65,17 +65,14 @@ __global__ void computeAtomsPbcUpdate(Atom a, MD_FLOAT xprd, MD_FLOAT yprd, MD_F
     }
 }
 
-__global__ void computePbcUpdate(Atom a, int* PBCx, int* PBCy, int* PBCz, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){
+__global__ void computePbcUpdate(DeviceAtom a, int nlocal, int nghost, int* PBCx, int* PBCy, int* PBCz, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
-    const int Nghost = a.Nghost;
-    if(i >= Nghost) {
+    if(i >= nghost) {
         return;
     }
 
-    Atom* atom = &a;
+    DeviceAtom* atom = &a;
     int *border_map = atom->border_map;
-    int nlocal = atom->Nlocal;
-
     atom_x(nlocal + i) = atom_x(border_map[i]) + PBCx[i] * xprd;
     atom_y(nlocal + i) = atom_y(border_map[i]) + PBCy[i] * yprd;
     atom_z(nlocal + i) = atom_z(border_map[i]) + PBCz[i] * zprd;
@@ -83,36 +80,27 @@
 
 /* update coordinates of ghost atoms */
 /* uses mapping created in setupPbc */
-void updatePbc_cuda(Atom *atom, Atom *c_atom, Parameter *param, bool doReneighbor) {
+void updatePbc_cuda(Atom *atom, Parameter *param, bool reneigh) {
     const int num_threads_per_block = get_num_threads();
 
-    if (doReneighbor) {
-        c_atom->Natoms = atom->Natoms;
-        c_atom->Nlocal = atom->Nlocal;
-        c_atom->Nghost = atom->Nghost;
-        c_atom->ntypes = atom->ntypes;
-
-        if (atom->Nmax > c_atom->Nmax){ // the number of ghost atoms has increased -> more space is needed
-            c_atom->Nmax = atom->Nmax;
-            c_atom->x = (MD_FLOAT *) reallocateGPU(c_atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
-            c_atom->type = (int *) reallocateGPU(c_atom->type, sizeof(int) * atom->Nmax);
-        }
-
-        memcpyToGPU(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
-        memcpyToGPU(c_atom->type, atom->type, sizeof(int) * atom->Nmax);
+    if(reneigh) {
+        memcpyToGPU(atom->d_atom.x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
+        memcpyToGPU(atom->d_atom.type, atom->type, sizeof(int) * atom->Nmax);
 
         if(c_NmaxGhost < NmaxGhost) {
             c_NmaxGhost = NmaxGhost;
             c_PBCx = (int *) reallocateGPU(c_PBCx, NmaxGhost * sizeof(int));
             c_PBCy = (int *) reallocateGPU(c_PBCy, NmaxGhost * sizeof(int));
             c_PBCz = (int *) reallocateGPU(c_PBCz, NmaxGhost * sizeof(int));
-            c_atom->border_map = (int *) reallocateGPU(c_atom->border_map, NmaxGhost * sizeof(int));
+            atom->d_atom.border_map = (int *) reallocateGPU(atom->d_atom.border_map, NmaxGhost * sizeof(int));
         }
 
         memcpyToGPU(c_PBCx, PBCx, NmaxGhost * sizeof(int));
         memcpyToGPU(c_PBCy, PBCy, NmaxGhost * sizeof(int));
         memcpyToGPU(c_PBCz, PBCz, NmaxGhost * sizeof(int));
-        memcpyToGPU(c_atom->border_map, atom->border_map, NmaxGhost * sizeof(int));
+        memcpyToGPU(atom->d_atom.border_map, atom->border_map, NmaxGhost * sizeof(int));
+        cuda_assert("updatePbc.reneigh", cudaPeekAtLastError());
cuda_assert("updatePbc.reneigh", cudaDeviceSynchronize()); } MD_FLOAT xprd = param->xprd; @@ -120,20 +108,20 @@ void updatePbc_cuda(Atom *atom, Atom *c_atom, Parameter *param, bool doReneighbo MD_FLOAT zprd = param->zprd; const int num_blocks = ceil((float)atom->Nghost / (float)num_threads_per_block); - computePbcUpdate<<>>(*c_atom, c_PBCx, c_PBCy, c_PBCz, xprd, yprd, zprd); - cuda_assert("computePbcUpdate", cudaPeekAtLastError()); - cuda_assert("computePbcUpdate", cudaDeviceSynchronize()); + computePbcUpdate<<>>(atom->d_atom, atom->Nlocal, atom->Nghost, c_PBCx, c_PBCy, c_PBCz, xprd, yprd, zprd); + cuda_assert("updatePbc", cudaPeekAtLastError()); + cuda_assert("updatePbc", cudaDeviceSynchronize()); } -void updateAtomsPbc_cuda(Atom* atom, Atom *c_atom, Parameter *param) { +void updateAtomsPbc_cuda(Atom* atom, Parameter *param) { const int num_threads_per_block = get_num_threads(); MD_FLOAT xprd = param->xprd; MD_FLOAT yprd = param->yprd; MD_FLOAT zprd = param->zprd; const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block); - computeAtomsPbcUpdate<<>>(*c_atom, xprd, yprd, zprd); + computeAtomsPbcUpdate<<>>(atom->d_atom, atom->Nlocal, xprd, yprd, zprd); cuda_assert("computeAtomsPbcUpdate", cudaPeekAtLastError()); cuda_assert("computeAtomsPbcUpdate", cudaDeviceSynchronize()); - memcpyFromGPU(atom->x, c_atom->x, sizeof(MD_FLOAT) * atom->Nlocal * 3); + memcpyFromGPU(atom->x, atom->d_atom.x, sizeof(MD_FLOAT) * atom->Nlocal * 3); } diff --git a/lammps/includes/atom.h b/lammps/includes/atom.h index 7687c2c..70c7863 100644 --- a/lammps/includes/atom.h +++ b/lammps/includes/atom.h @@ -48,6 +48,18 @@ # define updateAtomsPbc updateAtomsPbc_cpu #endif +typedef struct { + MD_FLOAT *x, *y, *z; + MD_FLOAT *vx, *vy, *vz; + MD_FLOAT *fx, *fy, *fz; + int *border_map; + int *type; + MD_FLOAT *epsilon; + MD_FLOAT *sigma6; + MD_FLOAT *cutforcesq; + MD_FLOAT *cutneighsq; +} DeviceAtom; + typedef struct { int Natoms, Nlocal, Nghost, Nmax; MD_FLOAT *x, *y, *z; @@ -60,10 +72,14 @@ typedef struct { MD_FLOAT *sigma6; MD_FLOAT *cutforcesq; MD_FLOAT *cutneighsq; + // DEM MD_FLOAT *radius; MD_FLOAT *av; MD_FLOAT *r; + + // Device data + DeviceAtom d_atom; } Atom; extern void initAtom(Atom*); diff --git a/lammps/includes/cuda_atom.h b/lammps/includes/cuda_atom.h index b164985..9f986bb 100644 --- a/lammps/includes/cuda_atom.h +++ b/lammps/includes/cuda_atom.h @@ -5,7 +5,7 @@ #ifndef __CUDA_ATOM_H_ #define __CUDA_ATOM_H_ -extern void initCuda(Atom*, Neighbor*, Atom*, Neighbor*); +extern void initCuda(Atom*, Neighbor*); extern void cuda_assert(const char *msg, cudaError_t err); extern void *allocateGPU(size_t bytesize); extern void *reallocateGPU(void *ptr, size_t new_bytesize); diff --git a/lammps/includes/integrate.h b/lammps/includes/integrate.h index 2802595..42d62cf 100644 --- a/lammps/includes/integrate.h +++ b/lammps/includes/integrate.h @@ -23,7 +23,7 @@ #include #include -void initialIntegrate_cpu(bool reneigh, Parameter *param, Atom *atom, Atom *c_atom) { +void initialIntegrate_cpu(bool reneigh, Parameter *param, Atom *atom) { for(int i = 0; i < atom->Nlocal; i++) { atom_vx(i) += param->dtforce * atom_fx(i); atom_vy(i) += param->dtforce * atom_fy(i); @@ -34,7 +34,7 @@ void initialIntegrate_cpu(bool reneigh, Parameter *param, Atom *atom, Atom *c_at } } -void finalIntegrate_cpu(bool reneigh, Parameter *param, Atom *atom, Atom *c_atom) { +void finalIntegrate_cpu(bool reneigh, Parameter *param, Atom *atom) { for(int i = 0; i < atom->Nlocal; i++) { atom_vx(i) += param->dtforce * atom_fx(i); 
diff --git a/lammps/includes/integrate.h b/lammps/includes/integrate.h
index 2802595..42d62cf 100644
--- a/lammps/includes/integrate.h
+++ b/lammps/includes/integrate.h
@@ -23,7 +23,7 @@
 #include
 #include
 
-void initialIntegrate_cpu(bool reneigh, Parameter *param, Atom *atom, Atom *c_atom) {
+void initialIntegrate_cpu(bool reneigh, Parameter *param, Atom *atom) {
     for(int i = 0; i < atom->Nlocal; i++) {
         atom_vx(i) += param->dtforce * atom_fx(i);
         atom_vy(i) += param->dtforce * atom_fy(i);
@@ -34,7 +34,7 @@
     }
 }
 
-void finalIntegrate_cpu(bool reneigh, Parameter *param, Atom *atom, Atom *c_atom) {
+void finalIntegrate_cpu(bool reneigh, Parameter *param, Atom *atom) {
     for(int i = 0; i < atom->Nlocal; i++) {
         atom_vx(i) += param->dtforce * atom_fx(i);
         atom_vy(i) += param->dtforce * atom_fy(i);
@@ -43,6 +43,6 @@
 }
 
 #ifdef CUDA_TARGET
-void initialIntegrate_cuda(bool, Parameter*, Atom*, Atom*);
-void finalIntegrate_cuda(bool, Parameter*, Atom*, Atom*);
+void initialIntegrate_cuda(bool, Parameter*, Atom*);
+void finalIntegrate_cuda(bool, Parameter*, Atom*);
 #endif
diff --git a/lammps/includes/neighbor.h b/lammps/includes/neighbor.h
index 52de07c..5b1d9ed 100644
--- a/lammps/includes/neighbor.h
+++ b/lammps/includes/neighbor.h
@@ -25,13 +25,22 @@
 #ifndef __NEIGHBOR_H_
 #define __NEIGHBOR_H_
 
+typedef struct {
+    int *neighbors;
+    int *numneigh;
+} DeviceNeighbor;
+
 typedef struct {
     int every;
     int ncalls;
     int maxneighs;
     int half_neigh;
-    int* neighbors;
-    int* numneigh;
+    int *neighbors;
+    int *numneigh;
+
+    // Device data
+    DeviceNeighbor d_neighbor;
 } Neighbor;
 
 typedef struct {
@@ -52,11 +61,11 @@ typedef struct {
 extern void initNeighbor(Neighbor*, Parameter*);
 extern void setupNeighbor(Parameter*);
 extern void binatoms(Atom*);
-extern void buildNeighbor_cpu(Atom*, Neighbor*, Atom*, Neighbor*);
+extern void buildNeighbor_cpu(Atom*, Neighbor*);
 extern void sortAtom(Atom*);
 
 #ifdef CUDA_TARGET
-extern void buildNeighbor_cuda(Atom*, Neighbor*, Atom*, Neighbor*);
+extern void buildNeighbor_cuda(Atom*, Neighbor*);
 #endif
 
 #endif
diff --git a/lammps/includes/pbc.h b/lammps/includes/pbc.h
index df6e46d..c64f4a0 100644
--- a/lammps/includes/pbc.h
+++ b/lammps/includes/pbc.h
@@ -28,13 +28,13 @@
 #ifndef __PBC_H_
 #define __PBC_H_
 
 extern void initPbc();
-extern void updatePbc_cpu(Atom*, Atom*, Parameter*, bool);
-extern void updateAtomsPbc_cpu(Atom*, Atom*, Parameter*);
+extern void updatePbc_cpu(Atom*, Parameter*, bool);
+extern void updateAtomsPbc_cpu(Atom*, Parameter*);
 extern void setupPbc(Atom*, Parameter*);
 
 #ifdef CUDA_TARGET
-extern void updatePbc_cuda(Atom*, Atom*, Parameter*, bool);
-extern void updateAtomsPbc_cuda(Atom*, Atom*, Parameter*);
+extern void updatePbc_cuda(Atom*, Parameter*, bool);
+extern void updateAtomsPbc_cuda(Atom*, Parameter*);
 #endif
 #endif
diff --git a/lammps/main.c b/lammps/main.c
index d936858..7de2312 100644
--- a/lammps/main.c
+++ b/lammps/main.c
@@ -54,10 +54,10 @@ extern double computeForceDemFullNeigh(Parameter*, Atom*, Neighbor*, Stats*);
 
 #ifdef CUDA_TARGET
 #include
-extern double computeForceLJFullNeigh_cuda(Parameter*, Atom*, Neighbor*, Atom*, Neighbor*);
+extern double computeForceLJFullNeigh_cuda(Parameter*, Atom*, Neighbor*);
 #endif
 
-double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor, Stats *stats) {
+double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
     if(param->force_field == FF_EAM) { initEam(eam, param); }
     double S, E;
     param->lattice = pow((4.0 / param->rho), (1.0 / 3.0));
@@ -81,23 +81,23 @@
     if(param->input_file == NULL) { adjustThermo(param, atom); }
     setupPbc(atom, param);
 #ifdef CUDA_TARGET
-    initCuda(atom, neighbor, c_atom, c_neighbor);
+    initCuda(atom, neighbor);
 #endif
-    updatePbc(atom, c_atom, param, true);
-    buildNeighbor(atom, neighbor, c_atom, c_neighbor);
+    updatePbc(atom, param, true);
+    buildNeighbor(atom, neighbor);
     E = getTimeStamp();
     return E-S;
 }
 
-double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
+double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
     double S, E;
     S = getTimeStamp();
     LIKWID_MARKER_START("reneighbour");
-    updateAtomsPbc(atom, c_atom, param);
+    updateAtomsPbc(atom, param);
     setupPbc(atom, param);
-    updatePbc(atom, c_atom, param, true);
+    updatePbc(atom, param, true);
     //sortAtom(atom);
-    buildNeighbor(atom, neighbor, c_atom, c_neighbor);
+    buildNeighbor(atom, neighbor);
     LIKWID_MARKER_STOP("reneighbour");
     E = getTimeStamp();
     return E-S;
@@ -111,7 +111,7 @@ void printAtomState(Atom *atom) {
     // }
 }
 
-double computeForce(Eam *eam, Parameter *param, Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor, Stats *stats) {
+double computeForce(Eam *eam, Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
     if(param->force_field == FF_EAM) {
         return computeForceEam(eam, param, atom, neighbor, stats);
     } else if(param->force_field == FF_DEM) {
@@ -128,7 +128,7 @@
     }
 
 #ifdef CUDA_TARGET
-    return computeForceLJFullNeigh(param, atom, neighbor, c_atom, c_neighbor);
+    return computeForceLJFullNeigh(param, atom, neighbor);
 #else
     return computeForceLJFullNeigh(param, atom, neighbor, stats);
 #endif
@@ -137,8 +137,8 @@
 int main(int argc, char** argv) {
     double timer[NUMTIMER];
     Eam eam;
-    Atom atom, c_atom;
-    Neighbor neighbor, c_neighbor;
+    Atom atom;
+    Neighbor neighbor;
     Stats stats;
     Parameter param;
@@ -226,7 +226,7 @@
     }
 
     param.cutneigh = param.cutforce + param.skin;
-    setup(&param, &eam, &atom, &neighbor, &c_atom, &c_neighbor, &stats);
+    setup(&param, &eam, &atom, &neighbor, &stats);
     printParameter(&param);
     printf("step\ttemp\t\tpressure\n");
@@ -235,7 +235,7 @@
     traceAddresses(&param, &atom, &neighbor, n + 1);
 #endif
 
-    timer[FORCE] = computeForce(&eam, &param, &atom, &neighbor, &c_atom, &c_neighbor, &stats);
+    timer[FORCE] = computeForce(&eam, &param, &atom, &neighbor, &stats);
     timer[NEIGH] = 0.0;
     timer[TOTAL] = getTimeStamp();
@@ -245,23 +245,23 @@
     for(int n = 0; n < param.ntimes; n++) {
         bool reneigh = (n + 1) % param.reneigh_every == 0;
-        initialIntegrate(reneigh, &param, &atom, &c_atom);
+        initialIntegrate(reneigh, &param, &atom);
 
         if((n + 1) % param.reneigh_every) {
-            updatePbc(&atom, &c_atom, &param, false);
+            updatePbc(&atom, &param, false);
         } else {
-            timer[NEIGH] += reneighbour(&param, &atom, &neighbor, &c_atom, &c_neighbor);
+            timer[NEIGH] += reneighbour(&param, &atom, &neighbor);
         }
 
 #if defined(MEM_TRACER) || defined(INDEX_TRACER)
         traceAddresses(&param, &atom, &neighbor, n + 1);
 #endif
 
-        timer[FORCE] += computeForce(&eam, &param, &atom, &neighbor, &c_atom, &c_neighbor, &stats);
-        finalIntegrate(reneigh, &param, &atom, &c_atom);
+        timer[FORCE] += computeForce(&eam, &param, &atom, &neighbor, &stats);
+        finalIntegrate(reneigh, &param, &atom);
 
         if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
 #ifdef CUDA_TARGET
-            memcpyFromGPU(atom.x, c_atom.x, atom.Nmax * sizeof(MD_FLOAT) * 3);
+            memcpyFromGPU(atom.x, atom.d_atom.x, atom.Nmax * sizeof(MD_FLOAT) * 3);
 #endif
             computeThermo(n + 1, &param, &atom);
         }
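main.c keeps calling the un-suffixed names (`updatePbc`, `updateAtomsPbc`, `buildNeighbor`, `initialIntegrate`, `finalIntegrate`); the context at the top of the atom.h hunk shows the CPU side of the compile-time mapping. The CUDA side is assumed to mirror it roughly like this:

```c
/* Assumed dispatch scheme, inferred from the `#define updateAtomsPbc
 * updateAtomsPbc_cpu` context visible in atom.h; not part of this diff. */
#ifdef CUDA_TARGET
#   define updatePbc        updatePbc_cuda
#   define updateAtomsPbc   updateAtomsPbc_cuda
#   define buildNeighbor    buildNeighbor_cuda
#   define initialIntegrate initialIntegrate_cuda
#   define finalIntegrate   finalIntegrate_cuda
#else
#   define updatePbc        updatePbc_cpu
#   define updateAtomsPbc   updateAtomsPbc_cpu
#   define buildNeighbor    buildNeighbor_cpu
#   define initialIntegrate initialIntegrate_cpu
#   define finalIntegrate   finalIntegrate_cpu
#endif
```

Unifying the signatures of the `_cpu` and `_cuda` variants is what lets the same call sites compile for both targets without the extra `c_atom`/`c_neighbor` arguments.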
diff --git a/lammps/neighbor.c b/lammps/neighbor.c
index c3e19ca..97d001e 100644
--- a/lammps/neighbor.c
+++ b/lammps/neighbor.c
@@ -169,7 +169,7 @@ void setupNeighbor(Parameter* param) {
     bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));
 }
 
-void buildNeighbor_cpu(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
+void buildNeighbor_cpu(Atom *atom, Neighbor *neighbor) {
     int nall = atom->Nlocal + atom->Nghost;
 
     /* extend atom arrays if necessary */
@@ -228,7 +228,6 @@
         }
 
         neighbor->numneigh[i] = n;
-
         if(n >= neighbor->maxneighs) {
             resize = 1;
diff --git a/lammps/pbc.c b/lammps/pbc.c
index ab1b867..de89558 100644
--- a/lammps/pbc.c
+++ b/lammps/pbc.c
@@ -44,7 +44,7 @@ void initPbc(Atom* atom) {
 
 /* update coordinates of ghost atoms */
 /* uses mapping created in setupPbc */
-void updatePbc_cpu(Atom *atom, Atom *c_atom, Parameter *param, bool doReneighbor) {
+void updatePbc_cpu(Atom *atom, Parameter *param, bool doReneighbor) {
     int *border_map = atom->border_map;
     int nlocal = atom->Nlocal;
     MD_FLOAT xprd = param->xprd;
@@ -60,7 +60,7 @@
 
 /* relocate atoms that have left domain according
  * to periodic boundary conditions */
-void updateAtomsPbc_cpu(Atom *atom, Atom *c_atom, Parameter *param) {
+void updateAtomsPbc_cpu(Atom *atom, Parameter *param) {
     MD_FLOAT xprd = param->xprd;
     MD_FLOAT yprd = param->yprd;
     MD_FLOAT zprd = param->zprd;