Create separate structs DeviceAtom and DeviceNeighbor with device pointers

Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
Author: Rafael Ravedutti
Date:   2022-08-12 17:28:06 +02:00
parent 065b596074
commit 939197a785
13 changed files with 172 additions and 153 deletions
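
The DeviceAtom and DeviceNeighbor definitions themselves are not visible in this excerpt; they live in headers (cuda_atom.h and neighbor.h) among the 13 changed files. The following sketch is inferred purely from the fields the diff below dereferences — member order and any additional members are assumptions:

typedef struct {
    MD_FLOAT *x, *vx, *fx;                    /* device positions, velocities, forces */
    MD_FLOAT *epsilon, *sigma6, *cutforcesq;  /* per type-pair Lennard-Jones tables */
    int *type;                                /* per-atom type */
    int *border_map;                          /* ghost-to-owner mapping used by the PBC kernels */
} DeviceAtom;

typedef struct {
    int *neighbors;  /* transposed neighbor lists: entry k of atom i at [nlocal * k + i] */
    int *numneigh;   /* neighbor count per atom */
} DeviceNeighbor;

The host Atom and Neighbor structs evidently embed one instance each (atom->d_atom, neighbor->d_neighbor), which is why every *_cuda() entry point below drops its c_atom/c_neighbor parameters.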

View File

@@ -31,30 +31,22 @@ extern "C" {
 #include <cuda_atom.h>
 #include <neighbor.h>
 
-void initCuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
-    c_atom->Natoms = atom->Natoms;
-    c_atom->Nlocal = atom->Nlocal;
-    c_atom->Nghost = atom->Nghost;
-    c_atom->Nmax = atom->Nmax;
-    c_atom->ntypes = atom->ntypes;
-    c_atom->border_map = NULL;
+void initCuda(Atom *atom, Neighbor *neighbor) {
+    DeviceAtom *d_atom = &(atom->d_atom);
+    DeviceNeighbor *d_neighbor = &(neighbor->d_neighbor);
 
-    c_atom->x = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->Nmax * 3);
-    c_atom->vx = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->Nmax * 3);
-    c_atom->fx = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->Nmax * 3);
-    c_atom->epsilon = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
-    c_atom->sigma6 = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
-    c_atom->cutforcesq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
-    c_atom->type = (int *) allocateGPU(sizeof(int) * atom->Nmax * 3);
-    c_neighbor->neighbors = (int *) allocateGPU(sizeof(int) * atom->Nmax * neighbor->maxneighs);
-    c_neighbor->numneigh = (int *) allocateGPU(sizeof(int) * atom->Nmax);
+    d_atom->epsilon = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    d_atom->sigma6 = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    d_atom->cutforcesq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    d_neighbor->neighbors = (int *) allocateGPU(sizeof(int) * atom->Nmax * neighbor->maxneighs);
+    d_neighbor->numneigh = (int *) allocateGPU(sizeof(int) * atom->Nmax);
 
-    memcpyToGPU(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
-    memcpyToGPU(c_atom->vx, atom->vx, sizeof(MD_FLOAT) * atom->Nmax * 3);
-    memcpyToGPU(c_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
-    memcpyToGPU(c_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
-    memcpyToGPU(c_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
-    memcpyToGPU(c_atom->type, atom->type, sizeof(int) * atom->Nmax);
+    memcpyToGPU(d_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
+    memcpyToGPU(d_atom->vx, atom->vx, sizeof(MD_FLOAT) * atom->Nmax * 3);
+    memcpyToGPU(d_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    memcpyToGPU(d_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    memcpyToGPU(d_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    memcpyToGPU(d_atom->type, atom->type, sizeof(int) * atom->Nmax);
 }
 
 void cuda_assert(const char *label, cudaError_t err) {

View File

@@ -45,14 +45,13 @@ extern "C" {
 }
 
 // cuda kernel
-__global__ void calc_force(Atom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh) {
+__global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh) {
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
     if(i >= Nlocal) {
         return;
     }
 
-    Atom *atom = &a;
-
+    DeviceAtom *atom = &a;
     const int numneighs = neigh_numneigh[i];
     MD_FLOAT xtmp = atom_x(i);
@@ -64,7 +63,7 @@ __global__ void calc_force(Atom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOA
     MD_FLOAT fiz = 0;
 
     for(int k = 0; k < numneighs; k++) {
-        int j = neigh_neighbors[atom->Nlocal * k + i];
+        int j = neigh_neighbors[Nlocal * k + i];
         MD_FLOAT delx = xtmp - atom_x(j);
         MD_FLOAT dely = ytmp - atom_y(j);
         MD_FLOAT delz = ztmp - atom_z(j);
@@ -93,13 +92,13 @@ __global__ void calc_force(Atom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOA
     atom_fz(i) = fiz;
 }
 
-__global__ void kernel_initial_integrate(MD_FLOAT dtforce, MD_FLOAT dt, int Nlocal, Atom a) {
+__global__ void kernel_initial_integrate(MD_FLOAT dtforce, MD_FLOAT dt, int Nlocal, DeviceAtom a) {
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
     if( i >= Nlocal ) {
         return;
     }
 
-    Atom *atom = &a;
+    DeviceAtom *atom = &a;
 
     atom_vx(i) += dtforce * atom_fx(i);
     atom_vy(i) += dtforce * atom_fy(i);
@@ -109,13 +108,13 @@ __global__ void kernel_initial_integrate(MD_FLOAT dtforce, MD_FLOAT dt, int Nloc
     atom_z(i) = atom_z(i) + dt * atom_vz(i);
 }
 
-__global__ void kernel_final_integrate(MD_FLOAT dtforce, int Nlocal, Atom a) {
+__global__ void kernel_final_integrate(MD_FLOAT dtforce, int Nlocal, DeviceAtom a) {
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
     if( i >= Nlocal ) {
         return;
     }
 
-    Atom *atom = &a;
+    DeviceAtom *atom = &a;
 
     atom_vx(i) += dtforce * atom_fx(i);
     atom_vy(i) += dtforce * atom_fy(i);
@@ -124,35 +123,35 @@ __global__ void kernel_final_integrate(MD_FLOAT dtforce, int Nlocal, Atom a) {
 extern "C" {
 
-void finalIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom, Atom *c_atom) {
+void finalIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
     const int Nlocal = atom->Nlocal;
     const int num_threads_per_block = get_num_threads();
     const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
 
-    kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, *c_atom);
+    kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, atom->d_atom);
     cuda_assert("kernel_final_integrate", cudaPeekAtLastError());
     cuda_assert("kernel_final_integrate", cudaDeviceSynchronize());
 
     if(reneigh) {
-        memcpyFromGPU(atom->vx, c_atom->vx, sizeof(MD_FLOAT) * atom->Nlocal * 3);
+        memcpyFromGPU(atom->vx, atom->d_atom.vx, sizeof(MD_FLOAT) * atom->Nlocal * 3);
     }
 }
 
-void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom, Atom *c_atom) {
+void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
     const int Nlocal = atom->Nlocal;
     const int num_threads_per_block = get_num_threads();
     const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
 
-    kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, *c_atom);
+    kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, atom->d_atom);
    cuda_assert("kernel_initial_integrate", cudaPeekAtLastError());
     cuda_assert("kernel_initial_integrate", cudaDeviceSynchronize());
 
     if(reneigh) {
-        memcpyFromGPU(atom->vx, c_atom->vx, sizeof(MD_FLOAT) * atom->Nlocal * 3);
+        memcpyFromGPU(atom->vx, atom->d_atom.vx, sizeof(MD_FLOAT) * atom->Nlocal * 3);
     }
 }
 
-double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
+double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neighbor) {
     const int num_threads_per_block = get_num_threads();
     int Nlocal = atom->Nlocal;
 
 #ifndef EXPLICIT_TYPES
@@ -175,14 +174,14 @@ double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neig
     // HINT: Run with cuda-memcheck ./MDBench-NVCC in case of error
-    // checkCUDAError( "c_atom->fx memset", cudaMemset(c_atom->fx, 0, sizeof(MD_FLOAT) * Nlocal * 3) );
+    // memsetGPU(atom->d_atom.fx, 0, sizeof(MD_FLOAT) * Nlocal * 3);
 
     cudaProfilerStart();
     const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
     double S = getTimeStamp();
     LIKWID_MARKER_START("force");
 
-    calc_force <<< num_blocks, num_threads_per_block >>> (*c_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, c_neighbor->neighbors, c_neighbor->numneigh);
+    calc_force <<< num_blocks, num_threads_per_block >>> (atom->d_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, neighbor->d_neighbor.neighbors, neighbor->d_neighbor.numneigh);
     cuda_assert("calc_force", cudaPeekAtLastError());
     cuda_assert("calc_force", cudaDeviceSynchronize());
     cudaProfilerStop();
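
Note the index change inside calc_force: neigh_neighbors[atom->Nlocal * k + i] becomes neigh_neighbors[Nlocal * k + i], because DeviceAtom no longer carries atom counts, so Nlocal must arrive as a kernel argument. The layout itself is unchanged and deliberately transposed: for a fixed neighbor slot k, consecutive threads (consecutive i) read consecutive addresses, so the warp's loads coalesce. A hypothetical host-side helper (not part of the commit) stating the same indexing:

/* entry k of atom i in the transposed (slot-major) neighbor list */
static inline int neighbor_entry(const int *neighbors, int nlocal, int i, int k) {
    return neighbors[nlocal * k + i];
}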

View File

@@ -114,11 +114,10 @@ __global__ void sort_bin_contents_kernel(int* bincount, int* bins, int mbins, in
     } while (!sorted);
 }
 
-__global__ void binatoms_kernel(Atom a, int* bincount, int* bins, int atoms_per_bin, Neighbor_params np, int *resize_needed) {
-    Atom* atom = &a;
+__global__ void binatoms_kernel(DeviceAtom a, int nall, int* bincount, int* bins, int atoms_per_bin, Neighbor_params np, int *resize_needed) {
+    DeviceAtom* atom = &a;
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
-    int nall = atom->Nlocal + atom->Nghost;
-    if(i >= nall){
+    if(i >= nall) {
         return;
     }
@@ -135,16 +134,17 @@ __global__ void binatoms_kernel(Atom a, int* bincount, int* bins, int atoms_per_
     }
 }
 
-__global__ void compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np, int nstencil, int* stencil,
-                                     int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq) {
+__global__ void compute_neighborhood(
+    DeviceAtom a, DeviceNeighbor neigh, Neighbor_params np, int nlocal, int maxneighs, int nstencil, int* stencil,
+    int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq) {
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
-    const int Nlocal = a.Nlocal;
-    if( i >= Nlocal ) {
+    if(i >= nlocal) {
        return;
     }
 
-    Atom *atom = &a;
-    Neighbor *neighbor = &neigh;
+    DeviceAtom *atom = &a;
+    DeviceNeighbor *neighbor = &neigh;
 
     int* neighptr = &(neighbor->neighbors[i]);
     int n = 0;
@@ -179,7 +179,7 @@ __global__ void compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np,
 #endif
 
             if( rsq <= cutoff ) {
-                int idx = atom->Nlocal * n;
+                int idx = nlocal * n;
                 neighptr[idx] = j;
                 n += 1;
             }
@@ -187,13 +187,13 @@ __global__ void compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np,
         }
     }
 
     neighbor->numneigh[i] = n;
-    if(n > neighbor->maxneighs) {
+    if(n > maxneighs) {
         atomicMax(new_maxneighs, n);
     }
 }
 
-void binatoms_cuda(Atom *c_atom, Binning *c_binning, int *c_resize_needed, Neighbor_params *np, const int threads_per_block) {
-    int nall = c_atom->Nlocal + c_atom->Nghost;
+void binatoms_cuda(Atom *atom, Binning *c_binning, int *c_resize_needed, Neighbor_params *np, const int threads_per_block) {
+    int nall = atom->Nlocal + atom->Nghost;
     int resize = 1;
     const int num_blocks = ceil((float) nall / (float) threads_per_block);
@@ -202,7 +202,7 @@ void binatoms_cuda(Atom *c_atom, Binning *c_binning, int *c_resize_needed, Neigh
         memsetGPU(c_binning->bincount, 0, c_binning->mbins * sizeof(int));
         memsetGPU(c_resize_needed, 0, sizeof(int));
 
-        binatoms_kernel<<<num_blocks, threads_per_block>>>(*c_atom, c_binning->bincount, c_binning->bins, c_binning->atoms_per_bin, *np, c_resize_needed);
+        binatoms_kernel<<<num_blocks, threads_per_block>>>(atom->d_atom, atom->Nlocal + atom->Nghost, c_binning->bincount, c_binning->bins, c_binning->atoms_per_bin, *np, c_resize_needed);
 
         cuda_assert("binatoms", cudaPeekAtLastError());
         cuda_assert("binatoms", cudaDeviceSynchronize());
@@ -220,10 +220,10 @@ void binatoms_cuda(Atom *c_atom, Binning *c_binning, int *c_resize_needed, Neigh
     cuda_assert("sort_bin", cudaDeviceSynchronize());
 }
 
-void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
+void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor) {
+    DeviceNeighbor *d_neighbor = &(neighbor->d_neighbor);
     const int num_threads_per_block = get_num_threads();
     int nall = atom->Nlocal + atom->Nghost;
-    c_neighbor->maxneighs = neighbor->maxneighs;
 
     cudaProfilerStart();
@@ -263,18 +263,17 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *
     }
 
     /* bin local & ghost atoms */
-    binatoms_cuda(c_atom, &c_binning, c_resize_needed, &np, num_threads_per_block);
+    binatoms_cuda(atom, &c_binning, c_resize_needed, &np, num_threads_per_block);
 
     if(c_new_maxneighs == NULL) {
         c_new_maxneighs = (int *) allocateGPU(sizeof(int));
     }
 
     int resize = 1;
 
     /* extend c_neighbor arrays if necessary */
     if(nall > nmax) {
         nmax = nall;
-        c_neighbor->neighbors = (int *) reallocateGPU(c_neighbor->neighbors, nmax * c_neighbor->maxneighs * sizeof(int));
-        c_neighbor->numneigh = (int *) reallocateGPU(c_neighbor->numneigh, nmax * sizeof(int));
+        d_neighbor->neighbors = (int *) reallocateGPU(d_neighbor->neighbors, nmax * neighbor->maxneighs * sizeof(int));
+        d_neighbor->numneigh = (int *) reallocateGPU(d_neighbor->numneigh, nmax * sizeof(int));
     }
 
     /* loop over each atom, storing neighbors */
@@ -282,8 +281,8 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *
         resize = 0;
         memsetGPU(c_new_maxneighs, 0, sizeof(int));
         const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
 
-        compute_neighborhood<<<num_blocks, num_threads_per_block>>>(*c_atom, *c_neighbor,
-                                                                    np, nstencil, c_stencil,
+        compute_neighborhood<<<num_blocks, num_threads_per_block>>>(atom->d_atom, *d_neighbor,
+                                                                    np, atom->Nlocal, neighbor->maxneighs, nstencil, c_stencil,
                                                                     c_binning.bins, c_binning.atoms_per_bin, c_binning.bincount,
                                                                     c_new_maxneighs,
                                                                     cutneighsq);
@@ -293,19 +292,18 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *
         int new_maxneighs;
         memcpyFromGPU(&new_maxneighs, c_new_maxneighs, sizeof(int));
-        if (new_maxneighs > c_neighbor->maxneighs){
+        if(new_maxneighs > neighbor->maxneighs){
             resize = 1;
         }
 
         if(resize) {
-            printf("RESIZE %d\n", c_neighbor->maxneighs);
-            c_neighbor->maxneighs = new_maxneighs * 1.2;
-            printf("NEW SIZE %d\n", c_neighbor->maxneighs);
-            c_neighbor->neighbors = (int *) reallocateGPU(c_neighbor->neighbors, c_atom->Nmax * c_neighbor->maxneighs * sizeof(int));
+            printf("RESIZE %d\n", neighbor->maxneighs);
+            neighbor->maxneighs = new_maxneighs * 1.2;
+            printf("NEW SIZE %d\n", neighbor->maxneighs);
+            neighbor->neighbors = (int *) reallocateGPU(neighbor->neighbors, atom->Nmax * neighbor->maxneighs * sizeof(int));
         }
     }
 
-    neighbor->maxneighs = c_neighbor->maxneighs;
-
     cudaProfilerStop();
 }
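
The hunks above scatter buildNeighbor_cuda's capacity-recovery loop across the file; condensed, the pattern is: build with the current maxneighs, let overflowing threads record their true count via atomicMax into a device counter, then grow capacity by 20% and rebuild. A condensed restatement (kernel arguments elided, resize handling merged into one branch):

int resize = 1;
while(resize) {
    resize = 0;
    memsetGPU(c_new_maxneighs, 0, sizeof(int));
    /* threads with n > maxneighs call atomicMax(new_maxneighs, n) */
    compute_neighborhood<<<num_blocks, num_threads_per_block>>>(/* ... */);
    int new_maxneighs;
    memcpyFromGPU(&new_maxneighs, c_new_maxneighs, sizeof(int));
    if(new_maxneighs > neighbor->maxneighs) {
        neighbor->maxneighs = new_maxneighs * 1.2;  /* 20% headroom */
        /* grow the device neighbor list, then loop and rebuild */
        resize = 1;
    }
}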

View File

@@ -36,13 +36,13 @@ extern "C" {
 extern int NmaxGhost;
 extern int *PBCx, *PBCy, *PBCz;
 
-static int c_NmaxGhost;
-static int *c_PBCx, *c_PBCy, *c_PBCz;
+static int c_NmaxGhost = 0;
+static int *c_PBCx = NULL, *c_PBCy = NULL, *c_PBCz = NULL;
 
-__global__ void computeAtomsPbcUpdate(Atom a, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd) {
+__global__ void computeAtomsPbcUpdate(DeviceAtom a, int nlocal, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd) {
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
-    Atom* atom = &a;
-    if(i >= atom->Nlocal) {
+    DeviceAtom *atom = &a;
+    if(i >= nlocal) {
         return;
     }
@@ -65,17 +65,14 @@ __global__ void computeAtomsPbcUpdate(Atom a, MD_FLOAT xprd, MD_FLOAT yprd, MD_F
     }
 }
 
-__global__ void computePbcUpdate(Atom a, int* PBCx, int* PBCy, int* PBCz, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){
+__global__ void computePbcUpdate(DeviceAtom a, int nlocal, int nghost, int* PBCx, int* PBCy, int* PBCz, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
-    const int Nghost = a.Nghost;
-    if(i >= Nghost) {
+    if(i >= nghost) {
         return;
     }
 
-    Atom* atom = &a;
+    DeviceAtom* atom = &a;
     int *border_map = atom->border_map;
-    int nlocal = atom->Nlocal;
 
     atom_x(nlocal + i) = atom_x(border_map[i]) + PBCx[i] * xprd;
     atom_y(nlocal + i) = atom_y(border_map[i]) + PBCy[i] * yprd;
     atom_z(nlocal + i) = atom_z(border_map[i]) + PBCz[i] * zprd;
@@ -83,36 +80,27 @@
 /* update coordinates of ghost atoms */
 /* uses mapping created in setupPbc */
-void updatePbc_cuda(Atom *atom, Atom *c_atom, Parameter *param, bool doReneighbor) {
+void updatePbc_cuda(Atom *atom, Parameter *param, bool reneigh) {
     const int num_threads_per_block = get_num_threads();
 
-    if (doReneighbor) {
-        c_atom->Natoms = atom->Natoms;
-        c_atom->Nlocal = atom->Nlocal;
-        c_atom->Nghost = atom->Nghost;
-        c_atom->ntypes = atom->ntypes;
-
-        if (atom->Nmax > c_atom->Nmax){ // the number of ghost atoms has increased -> more space is needed
-            c_atom->Nmax = atom->Nmax;
-            c_atom->x = (MD_FLOAT *) reallocateGPU(c_atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
-            c_atom->type = (int *) reallocateGPU(c_atom->type, sizeof(int) * atom->Nmax);
-        }
-
-        memcpyToGPU(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
-        memcpyToGPU(c_atom->type, atom->type, sizeof(int) * atom->Nmax);
+    if(reneigh) {
+        memcpyToGPU(atom->d_atom.x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
+        memcpyToGPU(atom->d_atom.type, atom->type, sizeof(int) * atom->Nmax);
 
         if(c_NmaxGhost < NmaxGhost) {
             c_NmaxGhost = NmaxGhost;
             c_PBCx = (int *) reallocateGPU(c_PBCx, NmaxGhost * sizeof(int));
             c_PBCy = (int *) reallocateGPU(c_PBCy, NmaxGhost * sizeof(int));
             c_PBCz = (int *) reallocateGPU(c_PBCz, NmaxGhost * sizeof(int));
-            c_atom->border_map = (int *) reallocateGPU(c_atom->border_map, NmaxGhost * sizeof(int));
+            atom->d_atom.border_map = (int *) reallocateGPU(atom->d_atom.border_map, NmaxGhost * sizeof(int));
         }
 
         memcpyToGPU(c_PBCx, PBCx, NmaxGhost * sizeof(int));
         memcpyToGPU(c_PBCy, PBCy, NmaxGhost * sizeof(int));
         memcpyToGPU(c_PBCz, PBCz, NmaxGhost * sizeof(int));
-        memcpyToGPU(c_atom->border_map, atom->border_map, NmaxGhost * sizeof(int));
+        memcpyToGPU(atom->d_atom.border_map, atom->border_map, NmaxGhost * sizeof(int));
+
+        cuda_assert("updatePbc.reneigh", cudaPeekAtLastError());
+        cuda_assert("updatePbc.reneigh", cudaDeviceSynchronize());
     }
 
     MD_FLOAT xprd = param->xprd;
@@ -120,20 +108,20 @@ void updatePbc_cuda(Atom *atom, Atom *c_atom, Parameter *param, bool doReneighbo
     MD_FLOAT zprd = param->zprd;
 
     const int num_blocks = ceil((float)atom->Nghost / (float)num_threads_per_block);
 
-    computePbcUpdate<<<num_blocks, num_threads_per_block>>>(*c_atom, c_PBCx, c_PBCy, c_PBCz, xprd, yprd, zprd);
-    cuda_assert("computePbcUpdate", cudaPeekAtLastError());
-    cuda_assert("computePbcUpdate", cudaDeviceSynchronize());
+    computePbcUpdate<<<num_blocks, num_threads_per_block>>>(atom->d_atom, atom->Nlocal, atom->Nghost, c_PBCx, c_PBCy, c_PBCz, xprd, yprd, zprd);
+    cuda_assert("updatePbc", cudaPeekAtLastError());
+    cuda_assert("updatePbc", cudaDeviceSynchronize());
 }
 
-void updateAtomsPbc_cuda(Atom* atom, Atom *c_atom, Parameter *param) {
+void updateAtomsPbc_cuda(Atom* atom, Parameter *param) {
     const int num_threads_per_block = get_num_threads();
     MD_FLOAT xprd = param->xprd;
     MD_FLOAT yprd = param->yprd;
     MD_FLOAT zprd = param->zprd;
 
     const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
 
-    computeAtomsPbcUpdate<<<num_blocks, num_threads_per_block>>>(*c_atom, xprd, yprd, zprd);
+    computeAtomsPbcUpdate<<<num_blocks, num_threads_per_block>>>(atom->d_atom, atom->Nlocal, xprd, yprd, zprd);
     cuda_assert("computeAtomsPbcUpdate", cudaPeekAtLastError());
     cuda_assert("computeAtomsPbcUpdate", cudaDeviceSynchronize());
 
-    memcpyFromGPU(atom->x, c_atom->x, sizeof(MD_FLOAT) * atom->Nlocal * 3);
+    memcpyFromGPU(atom->x, atom->d_atom.x, sizeof(MD_FLOAT) * atom->Nlocal * 3);
 }
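
Across all four files the kernels now take DeviceAtom/DeviceNeighbor by value: each struct is a small bundle of raw device pointers, so copying it into kernel parameter space is cheap, and every size (Nlocal, Nghost, nall, maxneighs) travels as an explicit scalar argument, since the device structs no longer store counts. A minimal sketch of the resulting calling pattern, using a hypothetical kernel that is not part of this commit and assuming the AoS layout implied by the Nmax * 3 allocations:

__global__ void scale_velocities(DeviceAtom a, int nlocal, MD_FLOAT factor) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i >= nlocal) { /* counts arrive as scalar arguments */
        return;
    }

    a.vx[i * 3 + 0] *= factor; /* device pointers were copied by value */
    a.vx[i * 3 + 1] *= factor;
    a.vx[i * 3 + 2] *= factor;
}

void scaleVelocities_cuda(Atom *atom, MD_FLOAT factor) {
    const int num_threads_per_block = get_num_threads();
    const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
    scale_velocities<<<num_blocks, num_threads_per_block>>>(atom->d_atom, atom->Nlocal, factor);
    cuda_assert("scale_velocities", cudaPeekAtLastError());
    cuda_assert("scale_velocities", cudaDeviceSynchronize());
}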