Compare commits
No commits in common. "bc7b523979c36fb135808ef9dff03f341bf5d64a" and "f61f59ba3f643c62e596d3f170dd52b5495deb23" have entirely different histories.
bc7b523979
...
f61f59ba3f
@ -7,10 +7,9 @@ ANSI_CFLAGS += -pedantic
|
|||||||
ANSI_CFLAGS += -Wextra
|
ANSI_CFLAGS += -Wextra
|
||||||
|
|
||||||
# CFLAGS = -O0 -g -std=c99 -fargument-noalias
|
# CFLAGS = -O0 -g -std=c99 -fargument-noalias
|
||||||
#CFLAGS = -O3 -g -arch=sm_61 # -fopenmp
|
CFLAGS = -O3 -g -arch=sm_61 # -fopenmp
|
||||||
CFLAGS = -O3 -g # -fopenmp
|
|
||||||
ASFLAGS = -masm=intel
|
ASFLAGS = -masm=intel
|
||||||
LFLAGS =
|
LFLAGS =
|
||||||
DEFINES = -D_GNU_SOURCE #-DLIKWID_PERFMON
|
DEFINES = -D_GNU_SOURCE -DLIKWID_PERFMON
|
||||||
INCLUDES = $(LIKWID_INC)
|
INCLUDES = $(LIKWID_INC)
|
||||||
LIBS = -lm $(LIKWID_LIB) -lcuda -lcudart #-llikwid
|
LIBS = -lm $(LIKWID_LIB) -llikwid -lcuda -lcudart
|
||||||
|
@ -153,6 +153,7 @@ void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom, At
|
|||||||
|
|
||||||
if(doReneighbour) {
|
if(doReneighbour) {
|
||||||
checkCUDAError( "InitialIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom->vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
|
checkCUDAError( "InitialIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom->vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
|
||||||
|
checkCUDAError( "InitialIntegrate: position memcpy", cudaMemcpy(atom->x, c_atom->x, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -54,5 +54,5 @@ extern void binatoms(Atom*);
|
|||||||
extern void buildNeighbor(Atom*, Neighbor*);
|
extern void buildNeighbor(Atom*, Neighbor*);
|
||||||
extern void sortAtom(Atom*);
|
extern void sortAtom(Atom*);
|
||||||
extern void binatoms_cuda(Atom*, Binning*, int*, Neighbor_params*, const int);
|
extern void binatoms_cuda(Atom*, Binning*, int*, Neighbor_params*, const int);
|
||||||
extern void buildNeighbor_cuda(Atom*, Neighbor*, Atom*, Neighbor*, const int, double*);
|
extern void buildNeighbor_cuda(Atom*, Neighbor*, Atom*, Neighbor*, const int);
|
||||||
#endif
|
#endif
|
@ -29,6 +29,5 @@ extern void initPbc(Atom*);
|
|||||||
extern void updatePbc(Atom*, Parameter*);
|
extern void updatePbc(Atom*, Parameter*);
|
||||||
extern void updatePbc_cuda(Atom*, Parameter*, Atom*, bool, const int);
|
extern void updatePbc_cuda(Atom*, Parameter*, Atom*, bool, const int);
|
||||||
extern void updateAtomsPbc(Atom*, Parameter*);
|
extern void updateAtomsPbc(Atom*, Parameter*);
|
||||||
extern void updateAtomsPbc_cuda(Atom*, Parameter*, Atom*, const int);
|
|
||||||
extern void setupPbc(Atom*, Parameter*);
|
extern void setupPbc(Atom*, Parameter*);
|
||||||
#endif
|
#endif
|
@ -5,11 +5,6 @@ typedef enum {
|
|||||||
TOTAL = 0,
|
TOTAL = 0,
|
||||||
NEIGH,
|
NEIGH,
|
||||||
FORCE,
|
FORCE,
|
||||||
NEIGH_UPDATE_ATOMS_PBC,
|
|
||||||
NEIGH_SETUP_PBC,
|
|
||||||
NEIGH_UPDATE_PBC,
|
|
||||||
NEIGH_BINATOMS,
|
|
||||||
NEIGH_BUILD_LISTS,
|
|
||||||
NUMTIMER
|
NUMTIMER
|
||||||
} timertype;
|
} timertype;
|
||||||
|
|
@ -124,8 +124,7 @@ double setup(
|
|||||||
Atom *c_atom,
|
Atom *c_atom,
|
||||||
Neighbor *c_neighbor,
|
Neighbor *c_neighbor,
|
||||||
Stats *stats,
|
Stats *stats,
|
||||||
const int num_threads_per_block,
|
const int num_threads_per_block)
|
||||||
double* timers)
|
|
||||||
{
|
{
|
||||||
if(param->force_field == FF_EAM) { initEam(eam, param); }
|
if(param->force_field == FF_EAM) { initEam(eam, param); }
|
||||||
double S, E;
|
double S, E;
|
||||||
@ -146,7 +145,7 @@ double setup(
|
|||||||
setupPbc(atom, param);
|
setupPbc(atom, param);
|
||||||
initCudaAtom(atom, neighbor, c_atom, c_neighbor);
|
initCudaAtom(atom, neighbor, c_atom, c_neighbor);
|
||||||
updatePbc_cuda(atom, param, c_atom, true, num_threads_per_block);
|
updatePbc_cuda(atom, param, c_atom, true, num_threads_per_block);
|
||||||
buildNeighbor_cuda(atom, neighbor, c_atom, c_neighbor, num_threads_per_block, timers);
|
buildNeighbor_cuda(atom, neighbor, c_atom, c_neighbor, num_threads_per_block);
|
||||||
E = getTimeStamp();
|
E = getTimeStamp();
|
||||||
|
|
||||||
|
|
||||||
@ -159,32 +158,19 @@ double reneighbour(
|
|||||||
Neighbor *neighbor,
|
Neighbor *neighbor,
|
||||||
Atom *c_atom,
|
Atom *c_atom,
|
||||||
Neighbor *c_neighbor,
|
Neighbor *c_neighbor,
|
||||||
const int num_threads_per_block,
|
const int num_threads_per_block)
|
||||||
double* timers)
|
|
||||||
{
|
{
|
||||||
double S, E, beforeEvent, afterEvent;
|
double S, E;
|
||||||
|
|
||||||
S = getTimeStamp();
|
S = getTimeStamp();
|
||||||
beforeEvent = S;
|
|
||||||
LIKWID_MARKER_START("reneighbour");
|
LIKWID_MARKER_START("reneighbour");
|
||||||
updateAtomsPbc_cuda(atom, param, c_atom, num_threads_per_block);
|
updateAtomsPbc(atom, param);
|
||||||
afterEvent = getTimeStamp();
|
|
||||||
timers[NEIGH_UPDATE_ATOMS_PBC] += afterEvent - beforeEvent;
|
|
||||||
beforeEvent = afterEvent;
|
|
||||||
setupPbc(atom, param);
|
setupPbc(atom, param);
|
||||||
afterEvent = getTimeStamp();
|
|
||||||
timers[NEIGH_SETUP_PBC] += afterEvent - beforeEvent;
|
|
||||||
beforeEvent = afterEvent;
|
|
||||||
updatePbc_cuda(atom, param, c_atom, true, num_threads_per_block);
|
updatePbc_cuda(atom, param, c_atom, true, num_threads_per_block);
|
||||||
afterEvent = getTimeStamp();
|
|
||||||
timers[NEIGH_UPDATE_PBC] += afterEvent - beforeEvent;
|
|
||||||
beforeEvent = afterEvent;
|
|
||||||
//sortAtom(atom);
|
//sortAtom(atom);
|
||||||
buildNeighbor_cuda(atom, neighbor, c_atom, c_neighbor, num_threads_per_block, timers);
|
buildNeighbor_cuda(atom, neighbor, c_atom, c_neighbor, num_threads_per_block);
|
||||||
LIKWID_MARKER_STOP("reneighbour");
|
LIKWID_MARKER_STOP("reneighbour");
|
||||||
E = getTimeStamp();
|
E = getTimeStamp();
|
||||||
afterEvent = E;
|
|
||||||
timers[NEIGH_BUILD_LISTS] += afterEvent - beforeEvent;
|
|
||||||
|
|
||||||
return E-S;
|
return E-S;
|
||||||
}
|
}
|
||||||
@ -332,7 +318,7 @@ int main(int argc, char** argv)
|
|||||||
// this should be multiple of 32 as operations are performed at the level of warps
|
// this should be multiple of 32 as operations are performed at the level of warps
|
||||||
const int num_threads_per_block = get_num_threads();
|
const int num_threads_per_block = get_num_threads();
|
||||||
|
|
||||||
setup(¶m, &eam, &atom, &neighbor, &c_atom, &c_neighbor, &stats, num_threads_per_block, (double*) &timer);
|
setup(¶m, &eam, &atom, &neighbor, &c_atom, &c_neighbor, &stats, num_threads_per_block);
|
||||||
computeThermo(0, ¶m, &atom);
|
computeThermo(0, ¶m, &atom);
|
||||||
if(param.force_field == FF_EAM) {
|
if(param.force_field == FF_EAM) {
|
||||||
computeForceEam(&eam, ¶m, &atom, &neighbor, &stats, 1, 0);
|
computeForceEam(&eam, ¶m, &atom, &neighbor, &stats, 1, 0);
|
||||||
@ -347,11 +333,6 @@ int main(int argc, char** argv)
|
|||||||
timer[FORCE] = 0.0;
|
timer[FORCE] = 0.0;
|
||||||
timer[NEIGH] = 0.0;
|
timer[NEIGH] = 0.0;
|
||||||
timer[TOTAL] = getTimeStamp();
|
timer[TOTAL] = getTimeStamp();
|
||||||
timer[NEIGH_UPDATE_ATOMS_PBC] = 0.0;
|
|
||||||
timer[NEIGH_SETUP_PBC] = 0.0;
|
|
||||||
timer[NEIGH_UPDATE_PBC] = 0.0;
|
|
||||||
timer[NEIGH_BINATOMS] = 0.0;
|
|
||||||
timer[NEIGH_BUILD_LISTS] = 0.0;
|
|
||||||
|
|
||||||
if(param.vtk_file != NULL) {
|
if(param.vtk_file != NULL) {
|
||||||
write_atoms_to_vtk_file(param.vtk_file, &atom, 0);
|
write_atoms_to_vtk_file(param.vtk_file, &atom, 0);
|
||||||
@ -364,12 +345,9 @@ int main(int argc, char** argv)
|
|||||||
cuda_initial_integrate(doReneighbour, ¶m, &atom, &c_atom, num_threads_per_block);
|
cuda_initial_integrate(doReneighbour, ¶m, &atom, &c_atom, num_threads_per_block);
|
||||||
|
|
||||||
if(doReneighbour) {
|
if(doReneighbour) {
|
||||||
timer[NEIGH] += reneighbour(¶m, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block, (double*) &timer);
|
timer[NEIGH] += reneighbour(¶m, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block);
|
||||||
} else {
|
} else {
|
||||||
double before = getTimeStamp();
|
|
||||||
updatePbc_cuda(&atom, ¶m, &c_atom, false, num_threads_per_block);
|
updatePbc_cuda(&atom, ¶m, &c_atom, false, num_threads_per_block);
|
||||||
double after = getTimeStamp();
|
|
||||||
timer[NEIGH_UPDATE_PBC] += after - before;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if(param.force_field == FF_EAM) {
|
if(param.force_field == FF_EAM) {
|
||||||
@ -394,7 +372,6 @@ int main(int argc, char** argv)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
timer[NEIGH_BUILD_LISTS] -= timer[NEIGH_BINATOMS];
|
|
||||||
timer[TOTAL] = getTimeStamp() - timer[TOTAL];
|
timer[TOTAL] = getTimeStamp() - timer[TOTAL];
|
||||||
computeThermo(-1, ¶m, &atom);
|
computeThermo(-1, ¶m, &atom);
|
||||||
|
|
||||||
@ -408,15 +385,11 @@ int main(int argc, char** argv)
|
|||||||
#endif
|
#endif
|
||||||
printf(HLINE);
|
printf(HLINE);
|
||||||
printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
|
printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
|
||||||
printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs NEIGH_TIMERS: UPD_AT: %.2fs SETUP_PBC %.2fs UPDATE_PBC %.2fs BINATOMS %.2fs BUILD_NEIGHBOR %.2fs\n",
|
printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
|
||||||
timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH], timer[NEIGH_UPDATE_ATOMS_PBC], timer[NEIGH_SETUP_PBC], timer[NEIGH_UPDATE_PBC], timer[NEIGH_BINATOMS], timer[NEIGH_BUILD_LISTS]);
|
timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]);
|
||||||
printf(HLINE);
|
printf(HLINE);
|
||||||
printf("Performance: %.2f million atom updates per second\n",
|
printf("Performance: %.2f million atom updates per second\n",
|
||||||
1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
|
1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
|
||||||
double atomUpdatesTotal = (double) atom.Natoms * param.ntimes;
|
|
||||||
printf("Force_perf in millions per sec: %.2f\n", 1e-6 * atomUpdatesTotal / timer[FORCE]);
|
|
||||||
double atomNeighUpdatesTotal = (double) atom.Natoms * param.ntimes / param.every;
|
|
||||||
printf("Neighbor_perf in millions per sec: updateAtomsPbc: %.2f setupPbc: %.2f updatePbc: %.2f binAtoms: %.2f buildNeighbor_wo_binning: %.2f\n", 1e-6 * atomNeighUpdatesTotal / timer[NEIGH_UPDATE_ATOMS_PBC], 1e-6 * atomNeighUpdatesTotal / timer[NEIGH_SETUP_PBC], 1e-6 * atomUpdatesTotal / timer[NEIGH_UPDATE_PBC], 1e-6 * atomNeighUpdatesTotal / timer[NEIGH_BINATOMS], 1e-6 * atomNeighUpdatesTotal / timer[NEIGH_BUILD_LISTS]);
|
|
||||||
#ifdef COMPUTE_STATS
|
#ifdef COMPUTE_STATS
|
||||||
displayStatistics(&atom, ¶m, &stats, timer);
|
displayStatistics(&atom, ¶m, &stats, timer);
|
||||||
#endif
|
#endif
|
@ -33,8 +33,6 @@ extern "C" {
|
|||||||
#include <parameter.h>
|
#include <parameter.h>
|
||||||
#include <allocate.h>
|
#include <allocate.h>
|
||||||
#include <atom.h>
|
#include <atom.h>
|
||||||
#include <timing.h>
|
|
||||||
#include <timers.h>
|
|
||||||
|
|
||||||
#define SMALL 1.0e-6
|
#define SMALL 1.0e-6
|
||||||
#define FACTOR 0.999
|
#define FACTOR 0.999
|
||||||
@ -196,17 +194,6 @@ static int nstencil; // # of bins in stencil
|
|||||||
static int* stencil; // stencil list of bin offsets
|
static int* stencil; // stencil list of bin offsets
|
||||||
static MD_FLOAT binsizex, binsizey, binsizez;
|
static MD_FLOAT binsizex, binsizey, binsizez;
|
||||||
|
|
||||||
static int* c_stencil = NULL;
|
|
||||||
static int* c_resize_needed = NULL;
|
|
||||||
static int* c_new_maxneighs = NULL;
|
|
||||||
static Binning c_binning{
|
|
||||||
.bincount = NULL,
|
|
||||||
.bins = NULL,
|
|
||||||
.mbins = 0,
|
|
||||||
.atoms_per_bin = 0
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
static int coord2bin(MD_FLOAT, MD_FLOAT , MD_FLOAT);
|
static int coord2bin(MD_FLOAT, MD_FLOAT , MD_FLOAT);
|
||||||
static MD_FLOAT bindist(int, int, int);
|
static MD_FLOAT bindist(int, int, int);
|
||||||
|
|
||||||
@ -519,21 +506,21 @@ void sortAtom(Atom* atom) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#ifdef AOS
|
#ifdef AOS
|
||||||
MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
|
double* new_x = (double*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
|
||||||
|
|
||||||
MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
|
double* new_vx = (double*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
|
||||||
#else
|
#else
|
||||||
MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
|
double* new_x = (double*) malloc(Nmax * sizeof(MD_FLOAT));
|
||||||
MD_FLOAT* new_y = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
|
double* new_y = (double*) malloc(Nmax * sizeof(MD_FLOAT));
|
||||||
MD_FLOAT* new_z = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
|
double* new_z = (double*) malloc(Nmax * sizeof(MD_FLOAT));
|
||||||
|
|
||||||
MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
|
double* new_vx = (double*) malloc(Nmax * sizeof(MD_FLOAT));
|
||||||
MD_FLOAT* new_vy = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
|
double* new_vy = (double*) malloc(Nmax * sizeof(MD_FLOAT));
|
||||||
MD_FLOAT* new_vz = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
|
double* new_vz = (double*) malloc(Nmax * sizeof(MD_FLOAT));
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
MD_FLOAT* old_x = atom->x; MD_FLOAT* old_y = atom->y; MD_FLOAT* old_z = atom->z;
|
double* old_x = atom->x; double* old_y = atom->y; double* old_z = atom->z;
|
||||||
MD_FLOAT* old_vx = atom->vx; MD_FLOAT* old_vy = atom->vy; MD_FLOAT* old_vz = atom->vz;
|
double* old_vx = atom->vx; double* old_vy = atom->vy; double* old_vz = atom->vz;
|
||||||
|
|
||||||
for(int mybin = 0; mybin<mbins; mybin++) {
|
for(int mybin = 0; mybin<mbins; mybin++) {
|
||||||
int start = mybin>0?binpos[mybin-1]:0;
|
int start = mybin>0?binpos[mybin-1]:0;
|
||||||
@ -581,6 +568,12 @@ void binatoms_cuda(Atom* c_atom, Binning* c_binning, int* c_resize_needed, Neigh
|
|||||||
{
|
{
|
||||||
int nall = c_atom->Nlocal + c_atom->Nghost;
|
int nall = c_atom->Nlocal + c_atom->Nghost;
|
||||||
int resize = 1;
|
int resize = 1;
|
||||||
|
if(c_binning->bincount == NULL){
|
||||||
|
checkCUDAError("binatoms_cuda c_binning->bincount malloc", cudaMalloc((void**)(&c_binning->bincount), c_binning->mbins * sizeof(int)) );
|
||||||
|
}
|
||||||
|
if(c_binning->bins == NULL){
|
||||||
|
checkCUDAError("binatoms_cuda c_binning->bins malloc", cudaMalloc((void**)(&c_binning->bins), c_binning->mbins * c_binning->atoms_per_bin * sizeof(int)) );
|
||||||
|
}
|
||||||
|
|
||||||
const int num_blocks = ceil((float)nall / (float)threads_per_block);
|
const int num_blocks = ceil((float)nall / (float)threads_per_block);
|
||||||
|
|
||||||
@ -592,10 +585,10 @@ void binatoms_cuda(Atom* c_atom, Binning* c_binning, int* c_resize_needed, Neigh
|
|||||||
/*binatoms_kernel(Atom a, int* bincount, int* bins, int c_binning->atoms_per_bin, Neighbor_params np, int *resize_needed) */
|
/*binatoms_kernel(Atom a, int* bincount, int* bins, int c_binning->atoms_per_bin, Neighbor_params np, int *resize_needed) */
|
||||||
binatoms_kernel<<<num_blocks, threads_per_block>>>(*c_atom, c_binning->bincount, c_binning->bins, c_binning->atoms_per_bin, *np, c_resize_needed);
|
binatoms_kernel<<<num_blocks, threads_per_block>>>(*c_atom, c_binning->bincount, c_binning->bins, c_binning->atoms_per_bin, *np, c_resize_needed);
|
||||||
|
|
||||||
checkCUDAError( "PeekAtLastError binatoms kernel", cudaPeekAtLastError() );
|
checkCUDAError( "PeekAtLastError binatoms kernel", cudaPeekAtLastError() );
|
||||||
checkCUDAError( "DeviceSync binatoms kernel", cudaDeviceSynchronize() );
|
checkCUDAError( "DeviceSync binatoms kernel", cudaDeviceSynchronize() );
|
||||||
|
|
||||||
checkCUDAError("binatoms_cuda c_resize_needed memcpy back", cudaMemcpy(&resize, c_resize_needed, sizeof(int), cudaMemcpyDeviceToHost) );
|
checkCUDAError("binatoms_cuda c_resize_needed memcpy back", cudaMemcpy(&resize, c_resize_needed, sizeof(int), cudaMemcpyDeviceToHost) );
|
||||||
|
|
||||||
if(resize) {
|
if(resize) {
|
||||||
cudaFree(c_binning->bins);
|
cudaFree(c_binning->bins);
|
||||||
@ -611,25 +604,23 @@ void binatoms_cuda(Atom* c_atom, Binning* c_binning, int* c_resize_needed, Neigh
|
|||||||
checkCUDAError( "DeviceSync sort_bin_contents kernel", cudaDeviceSynchronize() );
|
checkCUDAError( "DeviceSync sort_bin_contents kernel", cudaDeviceSynchronize() );
|
||||||
}
|
}
|
||||||
|
|
||||||
void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor, const int num_threads_per_block, double* timers)
|
void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor, const int num_threads_per_block)
|
||||||
{
|
{
|
||||||
int nall = atom->Nlocal + atom->Nghost;
|
int nall = atom->Nlocal + atom->Nghost;
|
||||||
c_neighbor->maxneighs = neighbor->maxneighs;
|
c_neighbor->maxneighs = neighbor->maxneighs;
|
||||||
|
|
||||||
cudaProfilerStart();
|
cudaProfilerStart();
|
||||||
/* upload stencil */
|
/* upload stencil */
|
||||||
// TODO move all of this initialization into its own method
|
int* c_stencil;
|
||||||
if(c_stencil == NULL){
|
// TODO move this to be done once at the start
|
||||||
checkCUDAError( "buildNeighbor c_n_stencil malloc", cudaMalloc((void**)&c_stencil, nstencil * sizeof(int)) );
|
checkCUDAError( "buildNeighbor c_n_stencil malloc", cudaMalloc((void**)&c_stencil, nstencil * sizeof(int)) );
|
||||||
checkCUDAError( "buildNeighbor c_n_stencil memcpy", cudaMemcpy(c_stencil, stencil, nstencil * sizeof(int), cudaMemcpyHostToDevice ));
|
checkCUDAError( "buildNeighbor c_n_stencil memcpy", cudaMemcpy(c_stencil, stencil, nstencil * sizeof(int), cudaMemcpyHostToDevice ));
|
||||||
}
|
|
||||||
|
|
||||||
if(c_binning.mbins == 0){
|
Binning c_binning;
|
||||||
c_binning.mbins = mbins;
|
c_binning.mbins = mbins;
|
||||||
c_binning.atoms_per_bin = atoms_per_bin;
|
c_binning.atoms_per_bin = atoms_per_bin;
|
||||||
checkCUDAError( "buildNeighbor c_binning->bincount malloc", cudaMalloc((void**)&(c_binning.bincount), c_binning.mbins * sizeof(int)) );
|
checkCUDAError( "buildNeighbor c_binning->bincount malloc", cudaMalloc((void**)&(c_binning.bincount), mbins * sizeof(int)) );
|
||||||
checkCUDAError( "buidlNeighbor c_binning->bins malloc", cudaMalloc((void**)&(c_binning.bins), c_binning.mbins * c_binning.atoms_per_bin * sizeof(int)) );
|
checkCUDAError( "buidlNeighbor c_binning->bins malloc", cudaMalloc((void**)&(c_binning.bins), c_binning.mbins * c_binning.atoms_per_bin * sizeof(int)) );
|
||||||
}
|
|
||||||
|
|
||||||
Neighbor_params np{
|
Neighbor_params np{
|
||||||
.xprd = xprd,
|
.xprd = xprd,
|
||||||
@ -649,19 +640,14 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *
|
|||||||
.mbinz = mbinz
|
.mbinz = mbinz
|
||||||
};
|
};
|
||||||
|
|
||||||
if(c_resize_needed == NULL){
|
int* c_resize_needed;
|
||||||
checkCUDAError("buildNeighbor c_resize_needed malloc", cudaMalloc((void**)&c_resize_needed, sizeof(int)) );
|
checkCUDAError("buildNeighbor c_resize_needed malloc", cudaMalloc((void**)&c_resize_needed, sizeof(int)) );
|
||||||
}
|
|
||||||
|
|
||||||
/* bin local & ghost atoms */
|
/* bin local & ghost atoms */
|
||||||
double beforeBinning = getTimeStamp();
|
|
||||||
binatoms_cuda(c_atom, &c_binning, c_resize_needed, &np, num_threads_per_block);
|
binatoms_cuda(c_atom, &c_binning, c_resize_needed, &np, num_threads_per_block);
|
||||||
double afterBinning = getTimeStamp();
|
|
||||||
timers[NEIGH_BINATOMS] += afterBinning - beforeBinning;
|
|
||||||
|
|
||||||
if(c_new_maxneighs == NULL){
|
int* c_new_maxneighs;
|
||||||
checkCUDAError("c_new_maxneighs malloc", cudaMalloc((void**)&c_new_maxneighs, sizeof(int) ));
|
checkCUDAError("c_new_maxneighs malloc", cudaMalloc((void**)&c_new_maxneighs, sizeof(int) ));
|
||||||
}
|
|
||||||
|
|
||||||
int resize = 1;
|
int resize = 1;
|
||||||
|
|
||||||
@ -715,5 +701,10 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *
|
|||||||
neighbor->maxneighs = c_neighbor->maxneighs;
|
neighbor->maxneighs = c_neighbor->maxneighs;
|
||||||
|
|
||||||
cudaProfilerStop();
|
cudaProfilerStop();
|
||||||
|
|
||||||
|
cudaFree(c_new_maxneighs);
|
||||||
|
cudaFree(c_stencil);
|
||||||
|
cudaFree(c_binning.bincount);
|
||||||
|
cudaFree(c_binning.bins);
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -33,32 +33,6 @@ extern "C" {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ void computeAtomsPbcUpdate(Atom a, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){
|
|
||||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
|
||||||
Atom* atom = &a;
|
|
||||||
if( i >= atom->Nlocal ){
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (atom_x(i) < 0.0) {
|
|
||||||
atom_x(i) += xprd;
|
|
||||||
} else if (atom_x(i) >= xprd) {
|
|
||||||
atom_x(i) -= xprd;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (atom_y(i) < 0.0) {
|
|
||||||
atom_y(i) += yprd;
|
|
||||||
} else if (atom_y(i) >= yprd) {
|
|
||||||
atom_y(i) -= yprd;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (atom_z(i) < 0.0) {
|
|
||||||
atom_z(i) += zprd;
|
|
||||||
} else if (atom_z(i) >= zprd) {
|
|
||||||
atom_z(i) -= zprd;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
__global__ void computePbcUpdate(Atom a, int* PBCx, int* PBCy, int* PBCz, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){
|
__global__ void computePbcUpdate(Atom a, int* PBCx, int* PBCy, int* PBCz, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){
|
||||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
const int Nghost = a.Nghost;
|
const int Nghost = a.Nghost;
|
||||||
@ -189,21 +163,6 @@ void updateAtomsPbc(Atom *atom, Parameter *param) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void updateAtomsPbc_cuda(Atom* atom, Parameter* param, Atom* c_atom, const int num_threads_per_block){
|
|
||||||
MD_FLOAT xprd = param->xprd;
|
|
||||||
MD_FLOAT yprd = param->yprd;
|
|
||||||
MD_FLOAT zprd = param->zprd;
|
|
||||||
|
|
||||||
const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
|
|
||||||
/*void computeAtomsPbcUpdate(Atom a, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd)*/
|
|
||||||
computeAtomsPbcUpdate<<<num_blocks, num_threads_per_block>>>(*c_atom, xprd, yprd, zprd);
|
|
||||||
|
|
||||||
checkCUDAError( "PeekAtLastError UpdateAtomsPbc", cudaPeekAtLastError() );
|
|
||||||
checkCUDAError( "DeviceSync UpdateAtomsPbc", cudaDeviceSynchronize() );
|
|
||||||
|
|
||||||
checkCUDAError( "updateAtomsPbc position memcpy back", cudaMemcpy(atom->x, c_atom->x, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
|
|
||||||
}
|
|
||||||
|
|
||||||
/* setup periodic boundary conditions by
|
/* setup periodic boundary conditions by
|
||||||
* defining ghost atoms around domain
|
* defining ghost atoms around domain
|
||||||
* only creates mapping and coordinate corrections
|
* only creates mapping and coordinate corrections
|
Loading…
Reference in New Issue
Block a user