First crude attempt at parallelizing neighborhood computation (only the part after binning the atoms is parallelized with cuda)
This commit is contained in:
parent
757d4329f3
commit
c49278cb21
35
src/force.cu
35
src/force.cu
@ -126,25 +126,9 @@ extern "C" {
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
int get_num_threads() {
|
void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom, const int num_threads_per_block) {
|
||||||
|
|
||||||
const char *num_threads_env = getenv("NUM_THREADS");
|
|
||||||
int num_threads = 0;
|
|
||||||
if(num_threads_env == nullptr)
|
|
||||||
num_threads = 32;
|
|
||||||
else {
|
|
||||||
num_threads = atoi(num_threads_env);
|
|
||||||
}
|
|
||||||
|
|
||||||
return num_threads;
|
|
||||||
}
|
|
||||||
|
|
||||||
void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom) {
|
|
||||||
|
|
||||||
const int Nlocal = atom->Nlocal;
|
const int Nlocal = atom->Nlocal;
|
||||||
const int num_threads = get_num_threads();
|
|
||||||
|
|
||||||
const int num_threads_per_block = num_threads; // this should be multiple of 32 as operations are performed at the level of warps
|
|
||||||
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
|
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
|
||||||
|
|
||||||
kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, *c_atom);
|
kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, *c_atom);
|
||||||
@ -157,12 +141,9 @@ void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom) {
|
void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom, const int num_threads_per_block) {
|
||||||
|
|
||||||
const int Nlocal = atom->Nlocal;
|
const int Nlocal = atom->Nlocal;
|
||||||
const int num_threads = get_num_threads();
|
|
||||||
|
|
||||||
const int num_threads_per_block = num_threads; // this should be multiple of 32 as operations are performed at the level of warps
|
|
||||||
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
|
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
|
||||||
|
|
||||||
kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, *c_atom);
|
kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, *c_atom);
|
||||||
@ -182,7 +163,8 @@ double computeForce(
|
|||||||
Atom *atom,
|
Atom *atom,
|
||||||
Neighbor *neighbor,
|
Neighbor *neighbor,
|
||||||
Atom *c_atom,
|
Atom *c_atom,
|
||||||
Neighbor *c_neighbor
|
Neighbor *c_neighbor,
|
||||||
|
int num_threads_per_block
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
int Nlocal = atom->Nlocal;
|
int Nlocal = atom->Nlocal;
|
||||||
@ -192,8 +174,6 @@ double computeForce(
|
|||||||
MD_FLOAT epsilon = param->epsilon;
|
MD_FLOAT epsilon = param->epsilon;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
const int num_threads = get_num_threads();
|
|
||||||
|
|
||||||
c_atom->Natoms = atom->Natoms;
|
c_atom->Natoms = atom->Natoms;
|
||||||
c_atom->Nlocal = atom->Nlocal;
|
c_atom->Nlocal = atom->Nlocal;
|
||||||
c_atom->Nghost = atom->Nghost;
|
c_atom->Nghost = atom->Nghost;
|
||||||
@ -219,14 +199,11 @@ double computeForce(
|
|||||||
|
|
||||||
cudaProfilerStart();
|
cudaProfilerStart();
|
||||||
|
|
||||||
checkCUDAError( "c_atom->x memcpy", cudaMemcpy(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
|
|
||||||
|
|
||||||
if(reneighbourHappenend) {
|
if(!reneighbourHappenend) {
|
||||||
checkCUDAError( "c_neighbor->numneigh memcpy", cudaMemcpy(c_neighbor->numneigh, neighbor->numneigh, sizeof(int) * Nlocal, cudaMemcpyHostToDevice) );
|
checkCUDAError( "c_atom.x memcpy", cudaMemcpy(c_atom.x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
|
||||||
checkCUDAError( "c_neighbor->neighbors memcpy", cudaMemcpy(c_neighbor->neighbors, neighbor->neighbors, sizeof(int) * Nlocal * neighbor->maxneighs, cudaMemcpyHostToDevice) );
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const int num_threads_per_block = num_threads; // this should be multiple of 32 as operations are performed at the level of warps
|
|
||||||
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
|
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
|
||||||
|
|
||||||
double S = getTimeStamp();
|
double S = getTimeStamp();
|
||||||
|
@ -46,4 +46,5 @@ extern void setupNeighbor();
|
|||||||
extern void binatoms(Atom*);
|
extern void binatoms(Atom*);
|
||||||
extern void buildNeighbor(Atom*, Neighbor*);
|
extern void buildNeighbor(Atom*, Neighbor*);
|
||||||
extern void sortAtom(Atom*);
|
extern void sortAtom(Atom*);
|
||||||
|
extern void buildNeighbor_cuda(Atom*, Neighbor*, Atom*, Neighbor*, const int);
|
||||||
#endif
|
#endif
|
||||||
|
50
src/main.c
50
src/main.c
@ -45,10 +45,14 @@
|
|||||||
|
|
||||||
#define HLINE "----------------------------------------------------------------------------\n"
|
#define HLINE "----------------------------------------------------------------------------\n"
|
||||||
|
|
||||||
extern void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom);
|
extern void cuda_final_integrate(bool doReneighbour, Parameter *param,
|
||||||
extern void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom);
|
Atom *atom, Atom *c_atom,
|
||||||
|
const int num_threads_per_block);
|
||||||
|
extern void cuda_initial_integrate(bool doReneighbour, Parameter *param,
|
||||||
|
Atom *atom, Atom *c_atom,
|
||||||
|
const int num_threads_per_block);
|
||||||
|
|
||||||
extern double computeForce(bool, Parameter*, Atom*, Neighbor*, Atom*, Neighbor*);
|
extern double computeForce(bool, Parameter*, Atom*, Neighbor*, Atom*, Neighbor*, const int);
|
||||||
extern double computeForceTracing(Parameter*, Atom*, Neighbor*, Stats*, int, int);
|
extern double computeForceTracing(Parameter*, Atom*, Neighbor*, Stats*, int, int);
|
||||||
extern double computeForceEam(Eam* eam, Parameter*, Atom *atom, Neighbor *neighbor, Stats *stats, int first_exec, int timestep);
|
extern double computeForceEam(Eam* eam, Parameter*, Atom *atom, Neighbor *neighbor, Stats *stats, int first_exec, int timestep);
|
||||||
|
|
||||||
@ -111,7 +115,8 @@ double setup(
|
|||||||
Neighbor *neighbor,
|
Neighbor *neighbor,
|
||||||
Atom *c_atom,
|
Atom *c_atom,
|
||||||
Neighbor *c_neighbor,
|
Neighbor *c_neighbor,
|
||||||
Stats *stats)
|
Stats *stats,
|
||||||
|
const int num_threads_per_block)
|
||||||
{
|
{
|
||||||
if(param->force_field == FF_EAM) { initEam(eam, param); }
|
if(param->force_field == FF_EAM) { initEam(eam, param); }
|
||||||
double S, E;
|
double S, E;
|
||||||
@ -131,7 +136,7 @@ double setup(
|
|||||||
adjustThermo(param, atom);
|
adjustThermo(param, atom);
|
||||||
setupPbc(atom, param);
|
setupPbc(atom, param);
|
||||||
updatePbc(atom, param);
|
updatePbc(atom, param);
|
||||||
buildNeighbor(atom, neighbor);
|
buildNeighbor_cuda(atom, neighbor, c_atom, c_neighbor, num_threads_per_block);
|
||||||
E = getTimeStamp();
|
E = getTimeStamp();
|
||||||
|
|
||||||
initCudaAtom(atom, neighbor, c_atom, c_neighbor);
|
initCudaAtom(atom, neighbor, c_atom, c_neighbor);
|
||||||
@ -142,7 +147,10 @@ double setup(
|
|||||||
double reneighbour(
|
double reneighbour(
|
||||||
Parameter *param,
|
Parameter *param,
|
||||||
Atom *atom,
|
Atom *atom,
|
||||||
Neighbor *neighbor)
|
Neighbor *neighbor,
|
||||||
|
Atom *c_atom,
|
||||||
|
Neighbor *c_neighbor,
|
||||||
|
const int num_threads_per_block)
|
||||||
{
|
{
|
||||||
double S, E;
|
double S, E;
|
||||||
|
|
||||||
@ -152,7 +160,7 @@ double reneighbour(
|
|||||||
setupPbc(atom, param);
|
setupPbc(atom, param);
|
||||||
updatePbc(atom, param);
|
updatePbc(atom, param);
|
||||||
//sortAtom(atom);
|
//sortAtom(atom);
|
||||||
buildNeighbor(atom, neighbor);
|
buildNeighbor(atom, neighbor, c_atom, c_neighbor, num_threads_per_block);
|
||||||
LIKWID_MARKER_STOP("reneighbour");
|
LIKWID_MARKER_STOP("reneighbour");
|
||||||
E = getTimeStamp();
|
E = getTimeStamp();
|
||||||
|
|
||||||
@ -206,6 +214,19 @@ const char* ff2str(int ff)
|
|||||||
return "invalid";
|
return "invalid";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int get_num_threads() {
|
||||||
|
|
||||||
|
const char *num_threads_env = getenv("NUM_THREADS");
|
||||||
|
int num_threads = 0;
|
||||||
|
if(num_threads_env == nullptr)
|
||||||
|
num_threads = 32;
|
||||||
|
else {
|
||||||
|
num_threads = atoi(num_threads_env);
|
||||||
|
}
|
||||||
|
|
||||||
|
return num_threads;
|
||||||
|
}
|
||||||
|
|
||||||
int main(int argc, char** argv)
|
int main(int argc, char** argv)
|
||||||
{
|
{
|
||||||
double timer[NUMTIMER];
|
double timer[NUMTIMER];
|
||||||
@ -286,7 +307,10 @@ int main(int argc, char** argv)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
setup(¶m, &eam, &atom, &neighbor, &c_atom, &c_neighbor, &stats);
|
// this should be multiple of 32 as operations are performed at the level of warps
|
||||||
|
const int num_threads_per_block = get_num_threads();
|
||||||
|
|
||||||
|
setup(¶m, &eam, &atom, &neighbor, &c_atom, &c_neighbor, &stats, num_threads_per_block);
|
||||||
computeThermo(0, ¶m, &atom);
|
computeThermo(0, ¶m, &atom);
|
||||||
if(param.force_field == FF_EAM) {
|
if(param.force_field == FF_EAM) {
|
||||||
computeForceEam(&eam, ¶m, &atom, &neighbor, &stats, 1, 0);
|
computeForceEam(&eam, ¶m, &atom, &neighbor, &stats, 1, 0);
|
||||||
@ -294,7 +318,7 @@ int main(int argc, char** argv)
|
|||||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER) || defined(COMPUTE_STATS)
|
#if defined(MEM_TRACER) || defined(INDEX_TRACER) || defined(COMPUTE_STATS)
|
||||||
computeForceTracing(¶m, &atom, &neighbor, &stats, 1, 0);
|
computeForceTracing(¶m, &atom, &neighbor, &stats, 1, 0);
|
||||||
#else
|
#else
|
||||||
computeForce(true, ¶m, &atom, &neighbor, &c_atom, &c_neighbor);
|
computeForce(true, ¶m, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -310,10 +334,10 @@ int main(int argc, char** argv)
|
|||||||
|
|
||||||
const bool doReneighbour = (n + 1) % param.every == 0;
|
const bool doReneighbour = (n + 1) % param.every == 0;
|
||||||
|
|
||||||
cuda_initial_integrate(doReneighbour, ¶m, &atom, &c_atom);
|
cuda_initial_integrate(doReneighbour, ¶m, &atom, &c_atom, num_threads_per_block);
|
||||||
|
|
||||||
if(doReneighbour) {
|
if(doReneighbour) {
|
||||||
timer[NEIGH] += reneighbour(¶m, &atom, &neighbor);
|
timer[NEIGH] += reneighbour(¶m, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block);
|
||||||
} else {
|
} else {
|
||||||
updatePbc(&atom, ¶m);
|
updatePbc(&atom, ¶m);
|
||||||
}
|
}
|
||||||
@ -324,11 +348,11 @@ int main(int argc, char** argv)
|
|||||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER) || defined(COMPUTE_STATS)
|
#if defined(MEM_TRACER) || defined(INDEX_TRACER) || defined(COMPUTE_STATS)
|
||||||
timer[FORCE] += computeForceTracing(¶m, &atom, &neighbor, &stats, 0, n + 1);
|
timer[FORCE] += computeForceTracing(¶m, &atom, &neighbor, &stats, 0, n + 1);
|
||||||
#else
|
#else
|
||||||
timer[FORCE] += computeForce(doReneighbour, ¶m, &atom, &neighbor, &c_atom, &c_neighbor);
|
timer[FORCE] += computeForce(doReneighbour, ¶m, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
cuda_final_integrate(doReneighbour, ¶m, &atom, &c_atom);
|
cuda_final_integrate(doReneighbour, ¶m, &atom, &c_atom, num_threads_per_block);
|
||||||
|
|
||||||
if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
|
if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
|
||||||
computeThermo(n + 1, ¶m, &atom);
|
computeThermo(n + 1, ¶m, &atom);
|
||||||
|
102
src/neighbor.cu
102
src/neighbor.cu
@ -67,9 +67,10 @@ __device__ int coord2bin_device(MD_FLOAT xin, MD_FLOAT yin, MD_FLOAT zin,
|
|||||||
return (iz * np.mbiny * np.mbinx + iy * np.mbinx + ix + 1);
|
return (iz * np.mbiny * np.mbinx + iy * np.mbinx + ix + 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ void compute_neighborhood(Atom a, Neighbor neigh, int Nlocal, Neighbor_params np, int nstencil, int* stencil,
|
__global__ void compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np, int nstencil, int* stencil,
|
||||||
int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs){
|
int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs){
|
||||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
const int Nlocal = a.Nlocal;
|
||||||
if( i >= Nlocal ) {
|
if( i >= Nlocal ) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -513,41 +514,110 @@ void sortAtom(Atom* atom) {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor)
|
void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor, const int num_threads_per_block)
|
||||||
{
|
{
|
||||||
int nall = atom->Nlocal + atom->Nghost;
|
int nall = atom->Nlocal + atom->Nghost;
|
||||||
|
|
||||||
/* extend atom arrays if necessary */
|
c_atom->Natoms = atom->Natoms;
|
||||||
|
c_atom->Nlocal = atom->Nlocal;
|
||||||
|
c_atom->Nghost = atom->Nghost;
|
||||||
|
c_atom->Nmax = atom->Nmax;
|
||||||
|
c_atom->ntypes = atom->ntypes;
|
||||||
|
|
||||||
|
c_neighbor->maxneighs = neighbor->maxneighs;
|
||||||
|
|
||||||
|
/* extend c_neighbor arrays if necessary */
|
||||||
if(nall > nmax) {
|
if(nall > nmax) {
|
||||||
nmax = nall;
|
nmax = nall;
|
||||||
if(neighbor->numneigh) cudaFreeHost(neighbor->numneigh);
|
if(c_neighbor->numneigh) cudaFree(c_neighbor->numneigh);
|
||||||
if(neighbor->neighbors) cudaFreeHost(neighbor->neighbors);
|
if(c_neighbor->neighbors) cudaFree(c_neighbor->neighbors);
|
||||||
checkCUDAError( "buildNeighbor numneigh", cudaMallocHost((void**)&(neighbor->numneigh), nmax * sizeof(int)) );
|
checkCUDAError( "buildNeighbor c_numneigh malloc", cudaMalloc((void**)&(c_neighbor->numneigh), nmax * sizeof(int)) );
|
||||||
checkCUDAError( "buildNeighbor neighbors", cudaMallocHost((void**)&(neighbor->neighbors), nmax * neighbor->maxneighs * sizeof(int)) );
|
checkCUDAError( "buildNeighbor c_neighbors malloc", cudaMalloc((void**)&(c_neighbor->neighbors), nmax * c_neighbor->maxneighs * sizeof(int)) );
|
||||||
// neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
|
|
||||||
// neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* bin local & ghost atoms */
|
/* bin local & ghost atoms */
|
||||||
binatoms(atom);
|
binatoms(atom);
|
||||||
int resize = 1;
|
int resize = 1;
|
||||||
|
|
||||||
|
cudaProfilerStart();
|
||||||
|
|
||||||
|
checkCUDAError( "c_atom->x memcpy", cudaMemcpy(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
|
||||||
|
|
||||||
|
/* upload stencil */
|
||||||
|
int* c_stencil;
|
||||||
|
// TODO move this to be done once at the start
|
||||||
|
checkCUDAError( "buildNeighbor c_n_stencil malloc", cudaMalloc((void**)&c_stencil, nstencil * sizeof(int)) );
|
||||||
|
checkCUDAError( "buildNeighbor c_n_stencil memcpy", cudaMemcpy(c_stencil, stencil, nstencil * sizeof(int), cudaMemcpyHostToDevice ));
|
||||||
|
|
||||||
|
int *c_bincount;
|
||||||
|
checkCUDAError( "buildNeighbor c_bincount malloc", cudaMalloc((void**)&c_bincount, mbins * sizeof(int)) );
|
||||||
|
checkCUDAError( "buildNeighbor c_bincount memcpy", cudaMemcpy(c_bincount, bincount, mbins * sizeof(int), cudaMemcpyHostToDevice) );
|
||||||
|
|
||||||
|
int *c_bins;
|
||||||
|
checkCUDAError( "buidlNeighbor c_bins malloc", cudaMalloc((void**)&c_bins, mbins * atoms_per_bin * sizeof(int)) );
|
||||||
|
checkCUDAError( "buildNeighbor c_bins memcpy", cudaMemcpy(c_bins, bins, mbins * atoms_per_bin * sizeof(int), cudaMemcpyHostToDevice ) );
|
||||||
|
|
||||||
|
Neighbor_params np{
|
||||||
|
.xprd = xprd,
|
||||||
|
.yprd = yprd,
|
||||||
|
.zprd = zprd,
|
||||||
|
.bininvx = bininvx,
|
||||||
|
.bininvy = bininvy,
|
||||||
|
.bininvz = bininvz,
|
||||||
|
.mbinxlo = mbinxlo,
|
||||||
|
.mbinylo = mbinylo,
|
||||||
|
.mbinzlo = mbinzlo,
|
||||||
|
.nbinx = nbinx,
|
||||||
|
.nbiny = nbiny,
|
||||||
|
.nbinz = nbinz,
|
||||||
|
.mbinx = mbinx,
|
||||||
|
.mbiny = mbiny,
|
||||||
|
.mbinz = mbinz
|
||||||
|
};
|
||||||
|
|
||||||
|
int* c_new_maxneighs;
|
||||||
|
checkCUDAError("c_new_maxneighs malloc", cudaMalloc((void**)&c_new_maxneighs, sizeof(int) ));
|
||||||
|
|
||||||
/* loop over each atom, storing neighbors */
|
/* loop over each atom, storing neighbors */
|
||||||
while(resize) {
|
while(resize) {
|
||||||
int new_maxneighs = neighbor->maxneighs;
|
|
||||||
resize = 0;
|
resize = 0;
|
||||||
|
|
||||||
// TODO allocate space for and then copy all necessary components
|
checkCUDAError("c_new_maxneighs memset", cudaMemset(c_new_maxneighs, c_neighbor->maxneighs, sizeof(int) ));
|
||||||
// TODO dont forget to copy the atom positions over
|
|
||||||
|
|
||||||
// TODO call compute_neigborhood kernel here
|
// TODO call compute_neigborhood kernel here
|
||||||
|
const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
|
||||||
|
/*compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np, int nstencil, int* stencil,
|
||||||
|
int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs)
|
||||||
|
* */
|
||||||
|
compute_neighborhood<<<num_blocks, num_threads_per_block>>>(*c_Atom, *c_neighbor,
|
||||||
|
np, nstencil, c_stencil,
|
||||||
|
c_bins, atoms_per_bin, c_bincount,
|
||||||
|
c_new_maxneighs);
|
||||||
|
|
||||||
|
// TODO copy the value of c_new_maxneighs back to host and check if it has been modified
|
||||||
|
int new_maxneighs;
|
||||||
|
checkCUDAError("c_new_maxneighs memcpy back", cudaMemcpy(&new_maxneighs, c_new_maxneighs, sizeof(int), cudaMemcpyDeviceToHost));
|
||||||
|
if (new_maxneighs > c_neighbor->maxneighs){
|
||||||
|
resize = 1;
|
||||||
|
}
|
||||||
|
|
||||||
if(resize) {
|
if(resize) {
|
||||||
printf("RESIZE %d\n", neighbor->maxneighs);
|
printf("RESIZE %d\n", c_neighbor->maxneighs);
|
||||||
neighbor->maxneighs = new_maxneighs * 1.2;
|
c_neighbor->maxneighs = new_maxneighs * 1.2;
|
||||||
free(neighbor->neighbors);
|
cudaFree(c_neighbor->neighbors);
|
||||||
neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
|
checkCUDAError("c_neighbor->neighbors resize malloc",
|
||||||
|
cudaMalloc((void**)(&c_neighbor->neighbors),
|
||||||
|
c_atom->Nmax * c_neighbor->maxneighs * sizeof(int)));
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
neighbor->maxneighs = c_neighbor->maxneighs;
|
||||||
|
|
||||||
|
cudaProfilerStop();
|
||||||
|
|
||||||
|
cudaFree(c_new_maxneighs);
|
||||||
|
cudaFree(c_n_stencil);
|
||||||
|
cudaFree(c_bincount);
|
||||||
|
cudaFree(c_bins);
|
||||||
}
|
}
|
||||||
}
|
}
|
Loading…
Reference in New Issue
Block a user