Create separate structs DeviceAtom and DeviceNeighbor with device pointers
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
This commit is contained in:
parent
065b596074
commit
939197a785
@ -62,6 +62,17 @@ void initAtom(Atom *atom) {
|
|||||||
atom->radius = NULL;
|
atom->radius = NULL;
|
||||||
atom->av = NULL;
|
atom->av = NULL;
|
||||||
atom->r = NULL;
|
atom->r = NULL;
|
||||||
|
|
||||||
|
DeviceAtom *d_atom = &(atom->d_atom);
|
||||||
|
d_atom->x = NULL; d_atom->y = NULL; d_atom->z = NULL;
|
||||||
|
d_atom->vx = NULL; d_atom->vy = NULL; d_atom->vz = NULL;
|
||||||
|
d_atom->fx = NULL; d_atom->fy = NULL; d_atom->fz = NULL;
|
||||||
|
d_atom->border_map = NULL;
|
||||||
|
d_atom->type = NULL;
|
||||||
|
d_atom->epsilon = NULL;
|
||||||
|
d_atom->sigma6 = NULL;
|
||||||
|
d_atom->cutforcesq = NULL;
|
||||||
|
d_atom->cutneighsq = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
void createAtom(Atom *atom, Parameter *param) {
|
void createAtom(Atom *atom, Parameter *param) {
|
||||||
@ -513,25 +524,32 @@ int readAtom_in(Atom* atom, Parameter* param) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void growAtom(Atom *atom) {
|
void growAtom(Atom *atom) {
|
||||||
|
DeviceAtom *d_atom = &(atom->d_atom);
|
||||||
int nold = atom->Nmax;
|
int nold = atom->Nmax;
|
||||||
atom->Nmax += DELTA;
|
atom->Nmax += DELTA;
|
||||||
|
|
||||||
|
#undef REALLOC
|
||||||
|
#define REALLOC(p,t,ns,os); \
|
||||||
|
atom->p = (t *) reallocate(atom->p, ALIGNMENT, ns, os); \
|
||||||
|
atom->d_atom.p = (t *) reallocateGPU(atom->d_atom.p, ns);
|
||||||
|
|
||||||
#ifdef AOS
|
#ifdef AOS
|
||||||
atom->x = (MD_FLOAT*) reallocate(atom->x, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
|
REALLOC(x, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
|
||||||
atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
|
REALLOC(vx, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
|
||||||
atom->fx = (MD_FLOAT*) reallocate(atom->fx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
|
REALLOC(fx, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
|
||||||
#else
|
#else
|
||||||
atom->x = (MD_FLOAT*) reallocate(atom->x, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
REALLOC(x, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||||
atom->y = (MD_FLOAT*) reallocate(atom->y, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
REALLOC(y, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||||
atom->z = (MD_FLOAT*) reallocate(atom->z, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
REALLOC(z, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||||
atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
REALLOC(vx, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||||
atom->vy = (MD_FLOAT*) reallocate(atom->vy, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
REALLOC(vy, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||||
atom->vz = (MD_FLOAT*) reallocate(atom->vz, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
REALLOC(vz, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||||
atom->fx = (MD_FLOAT*) reallocate(atom->fx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
REALLOC(fx, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||||
atom->fy = (MD_FLOAT*) reallocate(atom->fy, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
REALLOC(fy, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||||
atom->fz = (MD_FLOAT*) reallocate(atom->fz, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
REALLOC(fz, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||||
#endif
|
#endif
|
||||||
atom->type = (int *) reallocate(atom->type, ALIGNMENT, atom->Nmax * sizeof(int), nold * sizeof(int));
|
REALLOC(type, int, atom->Nmax * sizeof(int), nold * sizeof(int));
|
||||||
|
|
||||||
// DEM
|
// DEM
|
||||||
atom->radius = (MD_FLOAT *) reallocate(atom->radius, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
atom->radius = (MD_FLOAT *) reallocate(atom->radius, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||||
atom->av = (MD_FLOAT*) reallocate(atom->av, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
|
atom->av = (MD_FLOAT*) reallocate(atom->av, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
|
||||||
|
@ -31,30 +31,22 @@ extern "C" {
|
|||||||
#include <cuda_atom.h>
|
#include <cuda_atom.h>
|
||||||
#include <neighbor.h>
|
#include <neighbor.h>
|
||||||
|
|
||||||
void initCuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
|
void initCuda(Atom *atom, Neighbor *neighbor) {
|
||||||
c_atom->Natoms = atom->Natoms;
|
DeviceAtom *d_atom = &(atom->d_atom);
|
||||||
c_atom->Nlocal = atom->Nlocal;
|
DeviceNeighbor *d_neighbor = &(neighbor->d_neighbor);
|
||||||
c_atom->Nghost = atom->Nghost;
|
|
||||||
c_atom->Nmax = atom->Nmax;
|
|
||||||
c_atom->ntypes = atom->ntypes;
|
|
||||||
c_atom->border_map = NULL;
|
|
||||||
|
|
||||||
c_atom->x = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->Nmax * 3);
|
d_atom->epsilon = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||||
c_atom->vx = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->Nmax * 3);
|
d_atom->sigma6 = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||||
c_atom->fx = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->Nmax * 3);
|
d_atom->cutforcesq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||||
c_atom->epsilon = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
d_neighbor->neighbors = (int *) allocateGPU(sizeof(int) * atom->Nmax * neighbor->maxneighs);
|
||||||
c_atom->sigma6 = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
d_neighbor->numneigh = (int *) allocateGPU(sizeof(int) * atom->Nmax);
|
||||||
c_atom->cutforcesq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
|
||||||
c_atom->type = (int *) allocateGPU(sizeof(int) * atom->Nmax * 3);
|
|
||||||
c_neighbor->neighbors = (int *) allocateGPU(sizeof(int) * atom->Nmax * neighbor->maxneighs);
|
|
||||||
c_neighbor->numneigh = (int *) allocateGPU(sizeof(int) * atom->Nmax);
|
|
||||||
|
|
||||||
memcpyToGPU(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
|
memcpyToGPU(d_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
|
||||||
memcpyToGPU(c_atom->vx, atom->vx, sizeof(MD_FLOAT) * atom->Nmax * 3);
|
memcpyToGPU(d_atom->vx, atom->vx, sizeof(MD_FLOAT) * atom->Nmax * 3);
|
||||||
memcpyToGPU(c_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
memcpyToGPU(d_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||||
memcpyToGPU(c_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
memcpyToGPU(d_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||||
memcpyToGPU(c_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
memcpyToGPU(d_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||||
memcpyToGPU(c_atom->type, atom->type, sizeof(int) * atom->Nmax);
|
memcpyToGPU(d_atom->type, atom->type, sizeof(int) * atom->Nmax);
|
||||||
}
|
}
|
||||||
|
|
||||||
void cuda_assert(const char *label, cudaError_t err) {
|
void cuda_assert(const char *label, cudaError_t err) {
|
||||||
|
@ -45,14 +45,13 @@ extern "C" {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// cuda kernel
|
// cuda kernel
|
||||||
__global__ void calc_force(Atom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh) {
|
__global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh) {
|
||||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
if(i >= Nlocal) {
|
if(i >= Nlocal) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
Atom *atom = &a;
|
DeviceAtom *atom = &a;
|
||||||
|
|
||||||
const int numneighs = neigh_numneigh[i];
|
const int numneighs = neigh_numneigh[i];
|
||||||
|
|
||||||
MD_FLOAT xtmp = atom_x(i);
|
MD_FLOAT xtmp = atom_x(i);
|
||||||
@ -64,7 +63,7 @@ __global__ void calc_force(Atom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOA
|
|||||||
MD_FLOAT fiz = 0;
|
MD_FLOAT fiz = 0;
|
||||||
|
|
||||||
for(int k = 0; k < numneighs; k++) {
|
for(int k = 0; k < numneighs; k++) {
|
||||||
int j = neigh_neighbors[atom->Nlocal * k + i];
|
int j = neigh_neighbors[Nlocal * k + i];
|
||||||
MD_FLOAT delx = xtmp - atom_x(j);
|
MD_FLOAT delx = xtmp - atom_x(j);
|
||||||
MD_FLOAT dely = ytmp - atom_y(j);
|
MD_FLOAT dely = ytmp - atom_y(j);
|
||||||
MD_FLOAT delz = ztmp - atom_z(j);
|
MD_FLOAT delz = ztmp - atom_z(j);
|
||||||
@ -93,13 +92,13 @@ __global__ void calc_force(Atom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOA
|
|||||||
atom_fz(i) = fiz;
|
atom_fz(i) = fiz;
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ void kernel_initial_integrate(MD_FLOAT dtforce, MD_FLOAT dt, int Nlocal, Atom a) {
|
__global__ void kernel_initial_integrate(MD_FLOAT dtforce, MD_FLOAT dt, int Nlocal, DeviceAtom a) {
|
||||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
if( i >= Nlocal ) {
|
if( i >= Nlocal ) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
Atom *atom = &a;
|
DeviceAtom *atom = &a;
|
||||||
|
|
||||||
atom_vx(i) += dtforce * atom_fx(i);
|
atom_vx(i) += dtforce * atom_fx(i);
|
||||||
atom_vy(i) += dtforce * atom_fy(i);
|
atom_vy(i) += dtforce * atom_fy(i);
|
||||||
@ -109,13 +108,13 @@ __global__ void kernel_initial_integrate(MD_FLOAT dtforce, MD_FLOAT dt, int Nloc
|
|||||||
atom_z(i) = atom_z(i) + dt * atom_vz(i);
|
atom_z(i) = atom_z(i) + dt * atom_vz(i);
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ void kernel_final_integrate(MD_FLOAT dtforce, int Nlocal, Atom a) {
|
__global__ void kernel_final_integrate(MD_FLOAT dtforce, int Nlocal, DeviceAtom a) {
|
||||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
if( i >= Nlocal ) {
|
if( i >= Nlocal ) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
Atom *atom = &a;
|
DeviceAtom *atom = &a;
|
||||||
|
|
||||||
atom_vx(i) += dtforce * atom_fx(i);
|
atom_vx(i) += dtforce * atom_fx(i);
|
||||||
atom_vy(i) += dtforce * atom_fy(i);
|
atom_vy(i) += dtforce * atom_fy(i);
|
||||||
@ -124,35 +123,35 @@ __global__ void kernel_final_integrate(MD_FLOAT dtforce, int Nlocal, Atom a) {
|
|||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
|
||||||
void finalIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom, Atom *c_atom) {
|
void finalIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
|
||||||
const int Nlocal = atom->Nlocal;
|
const int Nlocal = atom->Nlocal;
|
||||||
const int num_threads_per_block = get_num_threads();
|
const int num_threads_per_block = get_num_threads();
|
||||||
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
|
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
|
||||||
|
|
||||||
kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, *c_atom);
|
kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, atom->d_atom);
|
||||||
cuda_assert("kernel_final_integrate", cudaPeekAtLastError());
|
cuda_assert("kernel_final_integrate", cudaPeekAtLastError());
|
||||||
cuda_assert("kernel_final_integrate", cudaDeviceSynchronize());
|
cuda_assert("kernel_final_integrate", cudaDeviceSynchronize());
|
||||||
|
|
||||||
if(reneigh) {
|
if(reneigh) {
|
||||||
memcpyFromGPU(atom->vx, c_atom->vx, sizeof(MD_FLOAT) * atom->Nlocal * 3);
|
memcpyFromGPU(atom->vx, atom->d_atom.vx, sizeof(MD_FLOAT) * atom->Nlocal * 3);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom, Atom *c_atom) {
|
void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
|
||||||
const int Nlocal = atom->Nlocal;
|
const int Nlocal = atom->Nlocal;
|
||||||
const int num_threads_per_block = get_num_threads();
|
const int num_threads_per_block = get_num_threads();
|
||||||
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
|
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
|
||||||
|
|
||||||
kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, *c_atom);
|
kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, atom->d_atom);
|
||||||
cuda_assert("kernel_initial_integrate", cudaPeekAtLastError());
|
cuda_assert("kernel_initial_integrate", cudaPeekAtLastError());
|
||||||
cuda_assert("kernel_initial_integrate", cudaDeviceSynchronize());
|
cuda_assert("kernel_initial_integrate", cudaDeviceSynchronize());
|
||||||
|
|
||||||
if(reneigh) {
|
if(reneigh) {
|
||||||
memcpyFromGPU(atom->vx, c_atom->vx, sizeof(MD_FLOAT) * atom->Nlocal * 3);
|
memcpyFromGPU(atom->vx, atom->d_atom.vx, sizeof(MD_FLOAT) * atom->Nlocal * 3);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
|
double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
||||||
const int num_threads_per_block = get_num_threads();
|
const int num_threads_per_block = get_num_threads();
|
||||||
int Nlocal = atom->Nlocal;
|
int Nlocal = atom->Nlocal;
|
||||||
#ifndef EXPLICIT_TYPES
|
#ifndef EXPLICIT_TYPES
|
||||||
@ -175,14 +174,14 @@ double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neig
|
|||||||
|
|
||||||
|
|
||||||
// HINT: Run with cuda-memcheck ./MDBench-NVCC in case of error
|
// HINT: Run with cuda-memcheck ./MDBench-NVCC in case of error
|
||||||
// checkCUDAError( "c_atom->fx memset", cudaMemset(c_atom->fx, 0, sizeof(MD_FLOAT) * Nlocal * 3) );
|
// memsetGPU(atom->d_atom.fx, 0, sizeof(MD_FLOAT) * Nlocal * 3);
|
||||||
|
|
||||||
cudaProfilerStart();
|
cudaProfilerStart();
|
||||||
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
|
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
|
||||||
double S = getTimeStamp();
|
double S = getTimeStamp();
|
||||||
LIKWID_MARKER_START("force");
|
LIKWID_MARKER_START("force");
|
||||||
|
|
||||||
calc_force <<< num_blocks, num_threads_per_block >>> (*c_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, c_neighbor->neighbors, c_neighbor->numneigh);
|
calc_force <<< num_blocks, num_threads_per_block >>> (atom->d_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, neighbor->d_neighbor.neighbors, neighbor->d_neighbor.numneigh);
|
||||||
cuda_assert("calc_force", cudaPeekAtLastError());
|
cuda_assert("calc_force", cudaPeekAtLastError());
|
||||||
cuda_assert("calc_force", cudaDeviceSynchronize());
|
cuda_assert("calc_force", cudaDeviceSynchronize());
|
||||||
cudaProfilerStop();
|
cudaProfilerStop();
|
||||||
|
@ -114,10 +114,9 @@ __global__ void sort_bin_contents_kernel(int* bincount, int* bins, int mbins, in
|
|||||||
} while (!sorted);
|
} while (!sorted);
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ void binatoms_kernel(Atom a, int* bincount, int* bins, int atoms_per_bin, Neighbor_params np, int *resize_needed) {
|
__global__ void binatoms_kernel(DeviceAtom a, int nall, int* bincount, int* bins, int atoms_per_bin, Neighbor_params np, int *resize_needed) {
|
||||||
Atom* atom = &a;
|
DeviceAtom* atom = &a;
|
||||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
int nall = atom->Nlocal + atom->Nghost;
|
|
||||||
if(i >= nall) {
|
if(i >= nall) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -135,16 +134,17 @@ __global__ void binatoms_kernel(Atom a, int* bincount, int* bins, int atoms_per_
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ void compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np, int nstencil, int* stencil,
|
__global__ void compute_neighborhood(
|
||||||
|
DeviceAtom a, DeviceNeighbor neigh, Neighbor_params np, int nlocal, int maxneighs, int nstencil, int* stencil,
|
||||||
int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq) {
|
int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq) {
|
||||||
|
|
||||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
const int Nlocal = a.Nlocal;
|
if(i >= nlocal) {
|
||||||
if( i >= Nlocal ) {
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
Atom *atom = &a;
|
DeviceAtom *atom = &a;
|
||||||
Neighbor *neighbor = &neigh;
|
DeviceNeighbor *neighbor = &neigh;
|
||||||
|
|
||||||
int* neighptr = &(neighbor->neighbors[i]);
|
int* neighptr = &(neighbor->neighbors[i]);
|
||||||
int n = 0;
|
int n = 0;
|
||||||
@ -179,7 +179,7 @@ __global__ void compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np,
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
if( rsq <= cutoff ) {
|
if( rsq <= cutoff ) {
|
||||||
int idx = atom->Nlocal * n;
|
int idx = nlocal * n;
|
||||||
neighptr[idx] = j;
|
neighptr[idx] = j;
|
||||||
n += 1;
|
n += 1;
|
||||||
}
|
}
|
||||||
@ -187,13 +187,13 @@ __global__ void compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np,
|
|||||||
}
|
}
|
||||||
|
|
||||||
neighbor->numneigh[i] = n;
|
neighbor->numneigh[i] = n;
|
||||||
if(n > neighbor->maxneighs) {
|
if(n > maxneighs) {
|
||||||
atomicMax(new_maxneighs, n);
|
atomicMax(new_maxneighs, n);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void binatoms_cuda(Atom *c_atom, Binning *c_binning, int *c_resize_needed, Neighbor_params *np, const int threads_per_block) {
|
void binatoms_cuda(Atom *atom, Binning *c_binning, int *c_resize_needed, Neighbor_params *np, const int threads_per_block) {
|
||||||
int nall = c_atom->Nlocal + c_atom->Nghost;
|
int nall = atom->Nlocal + atom->Nghost;
|
||||||
int resize = 1;
|
int resize = 1;
|
||||||
const int num_blocks = ceil((float) nall / (float) threads_per_block);
|
const int num_blocks = ceil((float) nall / (float) threads_per_block);
|
||||||
|
|
||||||
@ -202,7 +202,7 @@ void binatoms_cuda(Atom *c_atom, Binning *c_binning, int *c_resize_needed, Neigh
|
|||||||
memsetGPU(c_binning->bincount, 0, c_binning->mbins * sizeof(int));
|
memsetGPU(c_binning->bincount, 0, c_binning->mbins * sizeof(int));
|
||||||
memsetGPU(c_resize_needed, 0, sizeof(int));
|
memsetGPU(c_resize_needed, 0, sizeof(int));
|
||||||
|
|
||||||
binatoms_kernel<<<num_blocks, threads_per_block>>>(*c_atom, c_binning->bincount, c_binning->bins, c_binning->atoms_per_bin, *np, c_resize_needed);
|
binatoms_kernel<<<num_blocks, threads_per_block>>>(atom->d_atom, atom->Nlocal + atom->Nghost, c_binning->bincount, c_binning->bins, c_binning->atoms_per_bin, *np, c_resize_needed);
|
||||||
cuda_assert("binatoms", cudaPeekAtLastError());
|
cuda_assert("binatoms", cudaPeekAtLastError());
|
||||||
cuda_assert("binatoms", cudaDeviceSynchronize());
|
cuda_assert("binatoms", cudaDeviceSynchronize());
|
||||||
|
|
||||||
@ -220,10 +220,10 @@ void binatoms_cuda(Atom *c_atom, Binning *c_binning, int *c_resize_needed, Neigh
|
|||||||
cuda_assert("sort_bin", cudaDeviceSynchronize());
|
cuda_assert("sort_bin", cudaDeviceSynchronize());
|
||||||
}
|
}
|
||||||
|
|
||||||
void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
|
void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor) {
|
||||||
|
DeviceNeighbor *d_neighbor = &(neighbor->d_neighbor);
|
||||||
const int num_threads_per_block = get_num_threads();
|
const int num_threads_per_block = get_num_threads();
|
||||||
int nall = atom->Nlocal + atom->Nghost;
|
int nall = atom->Nlocal + atom->Nghost;
|
||||||
c_neighbor->maxneighs = neighbor->maxneighs;
|
|
||||||
|
|
||||||
cudaProfilerStart();
|
cudaProfilerStart();
|
||||||
|
|
||||||
@ -263,18 +263,17 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* bin local & ghost atoms */
|
/* bin local & ghost atoms */
|
||||||
binatoms_cuda(c_atom, &c_binning, c_resize_needed, &np, num_threads_per_block);
|
binatoms_cuda(atom, &c_binning, c_resize_needed, &np, num_threads_per_block);
|
||||||
if(c_new_maxneighs == NULL) {
|
if(c_new_maxneighs == NULL) {
|
||||||
c_new_maxneighs = (int *) allocateGPU(sizeof(int));
|
c_new_maxneighs = (int *) allocateGPU(sizeof(int));
|
||||||
}
|
}
|
||||||
|
|
||||||
int resize = 1;
|
int resize = 1;
|
||||||
|
|
||||||
/* extend c_neighbor arrays if necessary */
|
|
||||||
if(nall > nmax) {
|
if(nall > nmax) {
|
||||||
nmax = nall;
|
nmax = nall;
|
||||||
c_neighbor->neighbors = (int *) reallocateGPU(c_neighbor->neighbors, nmax * c_neighbor->maxneighs * sizeof(int));
|
d_neighbor->neighbors = (int *) reallocateGPU(d_neighbor->neighbors, nmax * neighbor->maxneighs * sizeof(int));
|
||||||
c_neighbor->numneigh = (int *) reallocateGPU(c_neighbor->numneigh, nmax * sizeof(int));
|
d_neighbor->numneigh = (int *) reallocateGPU(d_neighbor->numneigh, nmax * sizeof(int));
|
||||||
}
|
}
|
||||||
|
|
||||||
/* loop over each atom, storing neighbors */
|
/* loop over each atom, storing neighbors */
|
||||||
@ -282,8 +281,8 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *
|
|||||||
resize = 0;
|
resize = 0;
|
||||||
memsetGPU(c_new_maxneighs, 0, sizeof(int));
|
memsetGPU(c_new_maxneighs, 0, sizeof(int));
|
||||||
const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
|
const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
|
||||||
compute_neighborhood<<<num_blocks, num_threads_per_block>>>(*c_atom, *c_neighbor,
|
compute_neighborhood<<<num_blocks, num_threads_per_block>>>(atom->d_atom, *d_neighbor,
|
||||||
np, nstencil, c_stencil,
|
np, atom->Nlocal, neighbor->maxneighs, nstencil, c_stencil,
|
||||||
c_binning.bins, c_binning.atoms_per_bin, c_binning.bincount,
|
c_binning.bins, c_binning.atoms_per_bin, c_binning.bincount,
|
||||||
c_new_maxneighs,
|
c_new_maxneighs,
|
||||||
cutneighsq);
|
cutneighsq);
|
||||||
@ -293,19 +292,18 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *
|
|||||||
|
|
||||||
int new_maxneighs;
|
int new_maxneighs;
|
||||||
memcpyFromGPU(&new_maxneighs, c_new_maxneighs, sizeof(int));
|
memcpyFromGPU(&new_maxneighs, c_new_maxneighs, sizeof(int));
|
||||||
if (new_maxneighs > c_neighbor->maxneighs){
|
if(new_maxneighs > neighbor->maxneighs){
|
||||||
resize = 1;
|
resize = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(resize) {
|
if(resize) {
|
||||||
printf("RESIZE %d\n", c_neighbor->maxneighs);
|
printf("RESIZE %d\n", neighbor->maxneighs);
|
||||||
c_neighbor->maxneighs = new_maxneighs * 1.2;
|
neighbor->maxneighs = new_maxneighs * 1.2;
|
||||||
printf("NEW SIZE %d\n", c_neighbor->maxneighs);
|
printf("NEW SIZE %d\n", neighbor->maxneighs);
|
||||||
c_neighbor->neighbors = (int *) reallocateGPU(c_neighbor->neighbors, c_atom->Nmax * c_neighbor->maxneighs * sizeof(int));
|
neighbor->neighbors = (int *) reallocateGPU(neighbor->neighbors, atom->Nmax * neighbor->maxneighs * sizeof(int));
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
neighbor->maxneighs = c_neighbor->maxneighs;
|
|
||||||
cudaProfilerStop();
|
cudaProfilerStop();
|
||||||
}
|
}
|
||||||
|
@ -36,13 +36,13 @@ extern "C" {
|
|||||||
|
|
||||||
extern int NmaxGhost;
|
extern int NmaxGhost;
|
||||||
extern int *PBCx, *PBCy, *PBCz;
|
extern int *PBCx, *PBCy, *PBCz;
|
||||||
static int c_NmaxGhost;
|
static int c_NmaxGhost = 0;
|
||||||
static int *c_PBCx, *c_PBCy, *c_PBCz;
|
static int *c_PBCx = NULL, *c_PBCy = NULL, *c_PBCz = NULL;
|
||||||
|
|
||||||
__global__ void computeAtomsPbcUpdate(Atom a, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd) {
|
__global__ void computeAtomsPbcUpdate(DeviceAtom a, int nlocal, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd) {
|
||||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
Atom* atom = &a;
|
DeviceAtom *atom = &a;
|
||||||
if(i >= atom->Nlocal) {
|
if(i >= nlocal) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -65,17 +65,14 @@ __global__ void computeAtomsPbcUpdate(Atom a, MD_FLOAT xprd, MD_FLOAT yprd, MD_F
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ void computePbcUpdate(Atom a, int* PBCx, int* PBCy, int* PBCz, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){
|
__global__ void computePbcUpdate(DeviceAtom a, int nlocal, int nghost, int* PBCx, int* PBCy, int* PBCz, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){
|
||||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
const int Nghost = a.Nghost;
|
if(i >= nghost) {
|
||||||
if(i >= Nghost) {
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
Atom* atom = &a;
|
DeviceAtom* atom = &a;
|
||||||
int *border_map = atom->border_map;
|
int *border_map = atom->border_map;
|
||||||
int nlocal = atom->Nlocal;
|
|
||||||
|
|
||||||
atom_x(nlocal + i) = atom_x(border_map[i]) + PBCx[i] * xprd;
|
atom_x(nlocal + i) = atom_x(border_map[i]) + PBCx[i] * xprd;
|
||||||
atom_y(nlocal + i) = atom_y(border_map[i]) + PBCy[i] * yprd;
|
atom_y(nlocal + i) = atom_y(border_map[i]) + PBCy[i] * yprd;
|
||||||
atom_z(nlocal + i) = atom_z(border_map[i]) + PBCz[i] * zprd;
|
atom_z(nlocal + i) = atom_z(border_map[i]) + PBCz[i] * zprd;
|
||||||
@ -83,36 +80,27 @@ __global__ void computePbcUpdate(Atom a, int* PBCx, int* PBCy, int* PBCz, MD_FLO
|
|||||||
|
|
||||||
/* update coordinates of ghost atoms */
|
/* update coordinates of ghost atoms */
|
||||||
/* uses mapping created in setupPbc */
|
/* uses mapping created in setupPbc */
|
||||||
void updatePbc_cuda(Atom *atom, Atom *c_atom, Parameter *param, bool doReneighbor) {
|
void updatePbc_cuda(Atom *atom, Parameter *param, bool reneigh) {
|
||||||
const int num_threads_per_block = get_num_threads();
|
const int num_threads_per_block = get_num_threads();
|
||||||
|
|
||||||
if (doReneighbor) {
|
if(reneigh) {
|
||||||
c_atom->Natoms = atom->Natoms;
|
memcpyToGPU(atom->d_atom.x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
|
||||||
c_atom->Nlocal = atom->Nlocal;
|
memcpyToGPU(atom->d_atom.type, atom->type, sizeof(int) * atom->Nmax);
|
||||||
c_atom->Nghost = atom->Nghost;
|
|
||||||
c_atom->ntypes = atom->ntypes;
|
|
||||||
|
|
||||||
if (atom->Nmax > c_atom->Nmax){ // the number of ghost atoms has increased -> more space is needed
|
|
||||||
c_atom->Nmax = atom->Nmax;
|
|
||||||
c_atom->x = (MD_FLOAT *) reallocateGPU(c_atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
|
|
||||||
c_atom->type = (int *) reallocateGPU(c_atom->type, sizeof(int) * atom->Nmax);
|
|
||||||
}
|
|
||||||
|
|
||||||
memcpyToGPU(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
|
|
||||||
memcpyToGPU(c_atom->type, atom->type, sizeof(int) * atom->Nmax);
|
|
||||||
|
|
||||||
if(c_NmaxGhost < NmaxGhost) {
|
if(c_NmaxGhost < NmaxGhost) {
|
||||||
c_NmaxGhost = NmaxGhost;
|
c_NmaxGhost = NmaxGhost;
|
||||||
c_PBCx = (int *) reallocateGPU(c_PBCx, NmaxGhost * sizeof(int));
|
c_PBCx = (int *) reallocateGPU(c_PBCx, NmaxGhost * sizeof(int));
|
||||||
c_PBCy = (int *) reallocateGPU(c_PBCy, NmaxGhost * sizeof(int));
|
c_PBCy = (int *) reallocateGPU(c_PBCy, NmaxGhost * sizeof(int));
|
||||||
c_PBCz = (int *) reallocateGPU(c_PBCz, NmaxGhost * sizeof(int));
|
c_PBCz = (int *) reallocateGPU(c_PBCz, NmaxGhost * sizeof(int));
|
||||||
c_atom->border_map = (int *) reallocateGPU(c_atom->border_map, NmaxGhost * sizeof(int));
|
atom->d_atom.border_map = (int *) reallocateGPU(atom->d_atom.border_map, NmaxGhost * sizeof(int));
|
||||||
}
|
}
|
||||||
|
|
||||||
memcpyToGPU(c_PBCx, PBCx, NmaxGhost * sizeof(int));
|
memcpyToGPU(c_PBCx, PBCx, NmaxGhost * sizeof(int));
|
||||||
memcpyToGPU(c_PBCy, PBCy, NmaxGhost * sizeof(int));
|
memcpyToGPU(c_PBCy, PBCy, NmaxGhost * sizeof(int));
|
||||||
memcpyToGPU(c_PBCz, PBCz, NmaxGhost * sizeof(int));
|
memcpyToGPU(c_PBCz, PBCz, NmaxGhost * sizeof(int));
|
||||||
memcpyToGPU(c_atom->border_map, atom->border_map, NmaxGhost * sizeof(int));
|
memcpyToGPU(atom->d_atom.border_map, atom->border_map, NmaxGhost * sizeof(int));
|
||||||
|
cuda_assert("updatePbc.reneigh", cudaPeekAtLastError());
|
||||||
|
cuda_assert("updatePbc.reneigh", cudaDeviceSynchronize());
|
||||||
}
|
}
|
||||||
|
|
||||||
MD_FLOAT xprd = param->xprd;
|
MD_FLOAT xprd = param->xprd;
|
||||||
@ -120,20 +108,20 @@ void updatePbc_cuda(Atom *atom, Atom *c_atom, Parameter *param, bool doReneighbo
|
|||||||
MD_FLOAT zprd = param->zprd;
|
MD_FLOAT zprd = param->zprd;
|
||||||
|
|
||||||
const int num_blocks = ceil((float)atom->Nghost / (float)num_threads_per_block);
|
const int num_blocks = ceil((float)atom->Nghost / (float)num_threads_per_block);
|
||||||
computePbcUpdate<<<num_blocks, num_threads_per_block>>>(*c_atom, c_PBCx, c_PBCy, c_PBCz, xprd, yprd, zprd);
|
computePbcUpdate<<<num_blocks, num_threads_per_block>>>(atom->d_atom, atom->Nlocal, atom->Nghost, c_PBCx, c_PBCy, c_PBCz, xprd, yprd, zprd);
|
||||||
cuda_assert("computePbcUpdate", cudaPeekAtLastError());
|
cuda_assert("updatePbc", cudaPeekAtLastError());
|
||||||
cuda_assert("computePbcUpdate", cudaDeviceSynchronize());
|
cuda_assert("updatePbc", cudaDeviceSynchronize());
|
||||||
}
|
}
|
||||||
|
|
||||||
void updateAtomsPbc_cuda(Atom* atom, Atom *c_atom, Parameter *param) {
|
void updateAtomsPbc_cuda(Atom* atom, Parameter *param) {
|
||||||
const int num_threads_per_block = get_num_threads();
|
const int num_threads_per_block = get_num_threads();
|
||||||
MD_FLOAT xprd = param->xprd;
|
MD_FLOAT xprd = param->xprd;
|
||||||
MD_FLOAT yprd = param->yprd;
|
MD_FLOAT yprd = param->yprd;
|
||||||
MD_FLOAT zprd = param->zprd;
|
MD_FLOAT zprd = param->zprd;
|
||||||
|
|
||||||
const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
|
const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
|
||||||
computeAtomsPbcUpdate<<<num_blocks, num_threads_per_block>>>(*c_atom, xprd, yprd, zprd);
|
computeAtomsPbcUpdate<<<num_blocks, num_threads_per_block>>>(atom->d_atom, atom->Nlocal, xprd, yprd, zprd);
|
||||||
cuda_assert("computeAtomsPbcUpdate", cudaPeekAtLastError());
|
cuda_assert("computeAtomsPbcUpdate", cudaPeekAtLastError());
|
||||||
cuda_assert("computeAtomsPbcUpdate", cudaDeviceSynchronize());
|
cuda_assert("computeAtomsPbcUpdate", cudaDeviceSynchronize());
|
||||||
memcpyFromGPU(atom->x, c_atom->x, sizeof(MD_FLOAT) * atom->Nlocal * 3);
|
memcpyFromGPU(atom->x, atom->d_atom.x, sizeof(MD_FLOAT) * atom->Nlocal * 3);
|
||||||
}
|
}
|
||||||
|
@ -48,6 +48,18 @@
|
|||||||
# define updateAtomsPbc updateAtomsPbc_cpu
|
# define updateAtomsPbc updateAtomsPbc_cpu
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
MD_FLOAT *x, *y, *z;
|
||||||
|
MD_FLOAT *vx, *vy, *vz;
|
||||||
|
MD_FLOAT *fx, *fy, *fz;
|
||||||
|
int *border_map;
|
||||||
|
int *type;
|
||||||
|
MD_FLOAT *epsilon;
|
||||||
|
MD_FLOAT *sigma6;
|
||||||
|
MD_FLOAT *cutforcesq;
|
||||||
|
MD_FLOAT *cutneighsq;
|
||||||
|
} DeviceAtom;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int Natoms, Nlocal, Nghost, Nmax;
|
int Natoms, Nlocal, Nghost, Nmax;
|
||||||
MD_FLOAT *x, *y, *z;
|
MD_FLOAT *x, *y, *z;
|
||||||
@ -60,10 +72,14 @@ typedef struct {
|
|||||||
MD_FLOAT *sigma6;
|
MD_FLOAT *sigma6;
|
||||||
MD_FLOAT *cutforcesq;
|
MD_FLOAT *cutforcesq;
|
||||||
MD_FLOAT *cutneighsq;
|
MD_FLOAT *cutneighsq;
|
||||||
|
|
||||||
// DEM
|
// DEM
|
||||||
MD_FLOAT *radius;
|
MD_FLOAT *radius;
|
||||||
MD_FLOAT *av;
|
MD_FLOAT *av;
|
||||||
MD_FLOAT *r;
|
MD_FLOAT *r;
|
||||||
|
|
||||||
|
// Device data
|
||||||
|
DeviceAtom d_atom;
|
||||||
} Atom;
|
} Atom;
|
||||||
|
|
||||||
extern void initAtom(Atom*);
|
extern void initAtom(Atom*);
|
||||||
|
@ -5,7 +5,7 @@
|
|||||||
|
|
||||||
#ifndef __CUDA_ATOM_H_
|
#ifndef __CUDA_ATOM_H_
|
||||||
#define __CUDA_ATOM_H_
|
#define __CUDA_ATOM_H_
|
||||||
extern void initCuda(Atom*, Neighbor*, Atom*, Neighbor*);
|
extern void initCuda(Atom*, Neighbor*);
|
||||||
extern void cuda_assert(const char *msg, cudaError_t err);
|
extern void cuda_assert(const char *msg, cudaError_t err);
|
||||||
extern void *allocateGPU(size_t bytesize);
|
extern void *allocateGPU(size_t bytesize);
|
||||||
extern void *reallocateGPU(void *ptr, size_t new_bytesize);
|
extern void *reallocateGPU(void *ptr, size_t new_bytesize);
|
||||||
|
@ -23,7 +23,7 @@
|
|||||||
#include <parameter.h>
|
#include <parameter.h>
|
||||||
#include <atom.h>
|
#include <atom.h>
|
||||||
|
|
||||||
void initialIntegrate_cpu(bool reneigh, Parameter *param, Atom *atom, Atom *c_atom) {
|
void initialIntegrate_cpu(bool reneigh, Parameter *param, Atom *atom) {
|
||||||
for(int i = 0; i < atom->Nlocal; i++) {
|
for(int i = 0; i < atom->Nlocal; i++) {
|
||||||
atom_vx(i) += param->dtforce * atom_fx(i);
|
atom_vx(i) += param->dtforce * atom_fx(i);
|
||||||
atom_vy(i) += param->dtforce * atom_fy(i);
|
atom_vy(i) += param->dtforce * atom_fy(i);
|
||||||
@ -34,7 +34,7 @@ void initialIntegrate_cpu(bool reneigh, Parameter *param, Atom *atom, Atom *c_at
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void finalIntegrate_cpu(bool reneigh, Parameter *param, Atom *atom, Atom *c_atom) {
|
void finalIntegrate_cpu(bool reneigh, Parameter *param, Atom *atom) {
|
||||||
for(int i = 0; i < atom->Nlocal; i++) {
|
for(int i = 0; i < atom->Nlocal; i++) {
|
||||||
atom_vx(i) += param->dtforce * atom_fx(i);
|
atom_vx(i) += param->dtforce * atom_fx(i);
|
||||||
atom_vy(i) += param->dtforce * atom_fy(i);
|
atom_vy(i) += param->dtforce * atom_fy(i);
|
||||||
@ -43,6 +43,6 @@ void finalIntegrate_cpu(bool reneigh, Parameter *param, Atom *atom, Atom *c_atom
|
|||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CUDA_TARGET
|
#ifdef CUDA_TARGET
|
||||||
void initialIntegrate_cuda(bool, Parameter*, Atom*, Atom*);
|
void initialIntegrate_cuda(bool, Parameter*, Atom*);
|
||||||
void finalIntegrate_cuda(bool, Parameter*, Atom*, Atom*);
|
void finalIntegrate_cuda(bool, Parameter*, Atom*);
|
||||||
#endif
|
#endif
|
||||||
|
@ -25,6 +25,12 @@
|
|||||||
|
|
||||||
#ifndef __NEIGHBOR_H_
|
#ifndef __NEIGHBOR_H_
|
||||||
#define __NEIGHBOR_H_
|
#define __NEIGHBOR_H_
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
int *neighbors;
|
||||||
|
int *numneigh;
|
||||||
|
} DeviceNeighbor;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int every;
|
int every;
|
||||||
int ncalls;
|
int ncalls;
|
||||||
@ -32,6 +38,9 @@ typedef struct {
|
|||||||
int half_neigh;
|
int half_neigh;
|
||||||
int *neighbors;
|
int *neighbors;
|
||||||
int *numneigh;
|
int *numneigh;
|
||||||
|
|
||||||
|
// Device data
|
||||||
|
DeviceNeighbor d_neighbor;
|
||||||
} Neighbor;
|
} Neighbor;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
@ -52,11 +61,11 @@ typedef struct {
|
|||||||
extern void initNeighbor(Neighbor*, Parameter*);
|
extern void initNeighbor(Neighbor*, Parameter*);
|
||||||
extern void setupNeighbor(Parameter*);
|
extern void setupNeighbor(Parameter*);
|
||||||
extern void binatoms(Atom*);
|
extern void binatoms(Atom*);
|
||||||
extern void buildNeighbor_cpu(Atom*, Neighbor*, Atom*, Neighbor*);
|
extern void buildNeighbor_cpu(Atom*, Neighbor*);
|
||||||
extern void sortAtom(Atom*);
|
extern void sortAtom(Atom*);
|
||||||
|
|
||||||
#ifdef CUDA_TARGET
|
#ifdef CUDA_TARGET
|
||||||
extern void buildNeighbor_cuda(Atom*, Neighbor*, Atom*, Neighbor*);
|
extern void buildNeighbor_cuda(Atom*, Neighbor*);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -28,13 +28,13 @@
|
|||||||
#ifndef __PBC_H_
|
#ifndef __PBC_H_
|
||||||
#define __PBC_H_
|
#define __PBC_H_
|
||||||
extern void initPbc();
|
extern void initPbc();
|
||||||
extern void updatePbc_cpu(Atom*, Atom*, Parameter*, bool);
|
extern void updatePbc_cpu(Atom*, Parameter*, bool);
|
||||||
extern void updateAtomsPbc_cpu(Atom*, Atom*, Parameter*);
|
extern void updateAtomsPbc_cpu(Atom*, Parameter*);
|
||||||
extern void setupPbc(Atom*, Parameter*);
|
extern void setupPbc(Atom*, Parameter*);
|
||||||
|
|
||||||
#ifdef CUDA_TARGET
|
#ifdef CUDA_TARGET
|
||||||
extern void updatePbc_cuda(Atom*, Atom*, Parameter*, bool);
|
extern void updatePbc_cuda(Atom*, Parameter*, bool);
|
||||||
extern void updateAtomsPbc_cuda(Atom*, Atom*, Parameter*);
|
extern void updateAtomsPbc_cuda(Atom*, Parameter*);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -54,10 +54,10 @@ extern double computeForceDemFullNeigh(Parameter*, Atom*, Neighbor*, Stats*);
|
|||||||
|
|
||||||
#ifdef CUDA_TARGET
|
#ifdef CUDA_TARGET
|
||||||
#include <cuda_atom.h>
|
#include <cuda_atom.h>
|
||||||
extern double computeForceLJFullNeigh_cuda(Parameter*, Atom*, Neighbor*, Atom*, Neighbor*);
|
extern double computeForceLJFullNeigh_cuda(Parameter*, Atom*, Neighbor*);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor, Stats *stats) {
|
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||||
if(param->force_field == FF_EAM) { initEam(eam, param); }
|
if(param->force_field == FF_EAM) { initEam(eam, param); }
|
||||||
double S, E;
|
double S, E;
|
||||||
param->lattice = pow((4.0 / param->rho), (1.0 / 3.0));
|
param->lattice = pow((4.0 / param->rho), (1.0 / 3.0));
|
||||||
@ -81,23 +81,23 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Atom *c
|
|||||||
if(param->input_file == NULL) { adjustThermo(param, atom); }
|
if(param->input_file == NULL) { adjustThermo(param, atom); }
|
||||||
setupPbc(atom, param);
|
setupPbc(atom, param);
|
||||||
#ifdef CUDA_TARGET
|
#ifdef CUDA_TARGET
|
||||||
initCuda(atom, neighbor, c_atom, c_neighbor);
|
initCuda(atom, neighbor);
|
||||||
#endif
|
#endif
|
||||||
updatePbc(atom, c_atom, param, true);
|
updatePbc(atom, param, true);
|
||||||
buildNeighbor(atom, neighbor, c_atom, c_neighbor);
|
buildNeighbor(atom, neighbor);
|
||||||
E = getTimeStamp();
|
E = getTimeStamp();
|
||||||
return E-S;
|
return E-S;
|
||||||
}
|
}
|
||||||
|
|
||||||
double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
|
double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
||||||
double S, E;
|
double S, E;
|
||||||
S = getTimeStamp();
|
S = getTimeStamp();
|
||||||
LIKWID_MARKER_START("reneighbour");
|
LIKWID_MARKER_START("reneighbour");
|
||||||
updateAtomsPbc(atom, c_atom, param);
|
updateAtomsPbc(atom, param);
|
||||||
setupPbc(atom, param);
|
setupPbc(atom, param);
|
||||||
updatePbc(atom, c_atom, param, true);
|
updatePbc(atom, param, true);
|
||||||
//sortAtom(atom);
|
//sortAtom(atom);
|
||||||
buildNeighbor(atom, neighbor, c_atom, c_neighbor);
|
buildNeighbor(atom, neighbor);
|
||||||
LIKWID_MARKER_STOP("reneighbour");
|
LIKWID_MARKER_STOP("reneighbour");
|
||||||
E = getTimeStamp();
|
E = getTimeStamp();
|
||||||
return E-S;
|
return E-S;
|
||||||
@ -111,7 +111,7 @@ void printAtomState(Atom *atom) {
|
|||||||
// }
|
// }
|
||||||
}
|
}
|
||||||
|
|
||||||
double computeForce(Eam *eam, Parameter *param, Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor, Stats *stats) {
|
double computeForce(Eam *eam, Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||||
if(param->force_field == FF_EAM) {
|
if(param->force_field == FF_EAM) {
|
||||||
return computeForceEam(eam, param, atom, neighbor, stats);
|
return computeForceEam(eam, param, atom, neighbor, stats);
|
||||||
} else if(param->force_field == FF_DEM) {
|
} else if(param->force_field == FF_DEM) {
|
||||||
@ -128,7 +128,7 @@ double computeForce(Eam *eam, Parameter *param, Atom *atom, Neighbor *neighbor,
|
|||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CUDA_TARGET
|
#ifdef CUDA_TARGET
|
||||||
return computeForceLJFullNeigh(param, atom, neighbor, c_atom, c_neighbor);
|
return computeForceLJFullNeigh(param, atom, neighbor);
|
||||||
#else
|
#else
|
||||||
return computeForceLJFullNeigh(param, atom, neighbor, stats);
|
return computeForceLJFullNeigh(param, atom, neighbor, stats);
|
||||||
#endif
|
#endif
|
||||||
@ -137,8 +137,8 @@ double computeForce(Eam *eam, Parameter *param, Atom *atom, Neighbor *neighbor,
|
|||||||
int main(int argc, char** argv) {
|
int main(int argc, char** argv) {
|
||||||
double timer[NUMTIMER];
|
double timer[NUMTIMER];
|
||||||
Eam eam;
|
Eam eam;
|
||||||
Atom atom, c_atom;
|
Atom atom;
|
||||||
Neighbor neighbor, c_neighbor;
|
Neighbor neighbor;
|
||||||
Stats stats;
|
Stats stats;
|
||||||
Parameter param;
|
Parameter param;
|
||||||
|
|
||||||
@ -226,7 +226,7 @@ int main(int argc, char** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
param.cutneigh = param.cutforce + param.skin;
|
param.cutneigh = param.cutforce + param.skin;
|
||||||
setup(¶m, &eam, &atom, &neighbor, &c_atom, &c_neighbor, &stats);
|
setup(¶m, &eam, &atom, &neighbor, &stats);
|
||||||
printParameter(¶m);
|
printParameter(¶m);
|
||||||
|
|
||||||
printf("step\ttemp\t\tpressure\n");
|
printf("step\ttemp\t\tpressure\n");
|
||||||
@ -235,7 +235,7 @@ int main(int argc, char** argv) {
|
|||||||
traceAddresses(¶m, &atom, &neighbor, n + 1);
|
traceAddresses(¶m, &atom, &neighbor, n + 1);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
timer[FORCE] = computeForce(&eam, ¶m, &atom, &neighbor, &c_atom, &c_neighbor, &stats);
|
timer[FORCE] = computeForce(&eam, ¶m, &atom, &neighbor, &stats);
|
||||||
timer[NEIGH] = 0.0;
|
timer[NEIGH] = 0.0;
|
||||||
timer[TOTAL] = getTimeStamp();
|
timer[TOTAL] = getTimeStamp();
|
||||||
|
|
||||||
@ -245,23 +245,23 @@ int main(int argc, char** argv) {
|
|||||||
|
|
||||||
for(int n = 0; n < param.ntimes; n++) {
|
for(int n = 0; n < param.ntimes; n++) {
|
||||||
bool reneigh = (n + 1) % param.reneigh_every == 0;
|
bool reneigh = (n + 1) % param.reneigh_every == 0;
|
||||||
initialIntegrate(reneigh, ¶m, &atom, &c_atom);
|
initialIntegrate(reneigh, ¶m, &atom);
|
||||||
if((n + 1) % param.reneigh_every) {
|
if((n + 1) % param.reneigh_every) {
|
||||||
updatePbc(&atom, &c_atom, ¶m, false);
|
updatePbc(&atom, ¶m, false);
|
||||||
} else {
|
} else {
|
||||||
timer[NEIGH] += reneighbour(¶m, &atom, &neighbor, &c_atom, &c_neighbor);
|
timer[NEIGH] += reneighbour(¶m, &atom, &neighbor);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
|
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
|
||||||
traceAddresses(¶m, &atom, &neighbor, n + 1);
|
traceAddresses(¶m, &atom, &neighbor, n + 1);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
timer[FORCE] += computeForce(&eam, ¶m, &atom, &neighbor, &c_atom, &c_neighbor, &stats);
|
timer[FORCE] += computeForce(&eam, ¶m, &atom, &neighbor, &stats);
|
||||||
finalIntegrate(reneigh, ¶m, &atom, &c_atom);
|
finalIntegrate(reneigh, ¶m, &atom);
|
||||||
|
|
||||||
if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
|
if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
|
||||||
#ifdef CUDA_TARGET
|
#ifdef CUDA_TARGET
|
||||||
memcpyFromGPU(atom.x, c_atom.x, atom.Nmax * sizeof(MD_FLOAT) * 3);
|
memcpyFromGPU(atom.x, atom.d_atom.x, atom.Nmax * sizeof(MD_FLOAT) * 3);
|
||||||
#endif
|
#endif
|
||||||
computeThermo(n + 1, ¶m, &atom);
|
computeThermo(n + 1, ¶m, &atom);
|
||||||
}
|
}
|
||||||
|
@ -169,7 +169,7 @@ void setupNeighbor(Parameter* param) {
|
|||||||
bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));
|
bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));
|
||||||
}
|
}
|
||||||
|
|
||||||
void buildNeighbor_cpu(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
|
void buildNeighbor_cpu(Atom *atom, Neighbor *neighbor) {
|
||||||
int nall = atom->Nlocal + atom->Nghost;
|
int nall = atom->Nlocal + atom->Nghost;
|
||||||
|
|
||||||
/* extend atom arrays if necessary */
|
/* extend atom arrays if necessary */
|
||||||
@ -228,7 +228,6 @@ void buildNeighbor_cpu(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c
|
|||||||
}
|
}
|
||||||
|
|
||||||
neighbor->numneigh[i] = n;
|
neighbor->numneigh[i] = n;
|
||||||
|
|
||||||
if(n >= neighbor->maxneighs) {
|
if(n >= neighbor->maxneighs) {
|
||||||
resize = 1;
|
resize = 1;
|
||||||
|
|
||||||
|
@ -44,7 +44,7 @@ void initPbc(Atom* atom) {
|
|||||||
|
|
||||||
/* update coordinates of ghost atoms */
|
/* update coordinates of ghost atoms */
|
||||||
/* uses mapping created in setupPbc */
|
/* uses mapping created in setupPbc */
|
||||||
void updatePbc_cpu(Atom *atom, Atom *c_atom, Parameter *param, bool doReneighbor) {
|
void updatePbc_cpu(Atom *atom, Parameter *param, bool doReneighbor) {
|
||||||
int *border_map = atom->border_map;
|
int *border_map = atom->border_map;
|
||||||
int nlocal = atom->Nlocal;
|
int nlocal = atom->Nlocal;
|
||||||
MD_FLOAT xprd = param->xprd;
|
MD_FLOAT xprd = param->xprd;
|
||||||
@ -60,7 +60,7 @@ void updatePbc_cpu(Atom *atom, Atom *c_atom, Parameter *param, bool doReneighbor
|
|||||||
|
|
||||||
/* relocate atoms that have left domain according
|
/* relocate atoms that have left domain according
|
||||||
* to periodic boundary conditions */
|
* to periodic boundary conditions */
|
||||||
void updateAtomsPbc_cpu(Atom *atom, Atom *c_atom, Parameter *param) {
|
void updateAtomsPbc_cpu(Atom *atom, Parameter *param) {
|
||||||
MD_FLOAT xprd = param->xprd;
|
MD_FLOAT xprd = param->xprd;
|
||||||
MD_FLOAT yprd = param->yprd;
|
MD_FLOAT yprd = param->yprd;
|
||||||
MD_FLOAT zprd = param->zprd;
|
MD_FLOAT zprd = param->zprd;
|
||||||
|
Loading…
Reference in New Issue
Block a user