Cleanup. Remove copyright year. Reformat.
This commit is contained in:
180
verletlist/cuda/force.cu
Normal file
180
verletlist/cuda/force.cu
Normal file
@@ -0,0 +1,180 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stddef.h>
|
||||
//---
|
||||
#include <cuda_profiler_api.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <device_launch_parameters.h>
|
||||
//---
|
||||
#include <likwid-marker.h>
|
||||
|
||||
extern "C" {
|
||||
|
||||
#include <allocate.h>
|
||||
#include <atom.h>
|
||||
#include <allocate.h>
|
||||
#include <device.h>
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <timing.h>
|
||||
#include <util.h>
|
||||
|
||||
}
|
||||
|
||||
// cuda kernel
|
||||
/*
 * Lennard-Jones force kernel: one thread accumulates the force acting on one local atom.
 *
 * Launch layout: flat 1D grid, thread index i = blockIdx.x * blockDim.x + threadIdx.x.
 * Threads with i >= Nlocal do nothing.
 *
 * The k-th neighbor of atom i is read from neigh_neighbors[Nlocal * k + i] and the
 * number of neighbors from neigh_numneigh[i]. The accumulated force overwrites
 * atom_fx/fy/fz(i).
 *
 * With EXPLICIT_TYPES defined, cutforcesq/sigma6/epsilon are looked up per type pair
 * (index type_i * ntypes + type_j) and shadow the scalar kernel arguments inside the loop.
 * neigh_maxneighs is part of the signature but is not read here.
 */
__global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh, int ntypes) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < Nlocal) {
        // The atom_* accessor macros expect a local pointer named `atom`.
        DeviceAtom *atom = &a;
        const int nneigh = neigh_numneigh[i];
        // Column of the neighbor table that belongs to atom i; consecutive entries are Nlocal apart.
        const int *my_neighbors = neigh_neighbors + i;

        const MD_FLOAT xi = atom_x(i);
        const MD_FLOAT yi = atom_y(i);
        const MD_FLOAT zi = atom_z(i);

        MD_FLOAT fxi = 0;
        MD_FLOAT fyi = 0;
        MD_FLOAT fzi = 0;

#ifdef EXPLICIT_TYPES
        const int type_i = atom->type[i];
#endif

        for(int k = 0; k < nneigh; k++) {
            const int j = my_neighbors[Nlocal * k];
            const MD_FLOAT dx = xi - atom_x(j);
            const MD_FLOAT dy = yi - atom_y(j);
            const MD_FLOAT dz = zi - atom_z(j);
            const MD_FLOAT rsq = dx * dx + dy * dy + dz * dz;

#ifdef EXPLICIT_TYPES
            const int type_j = atom->type[j];
            const int type_ij = type_i * ntypes + type_j;
            const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
            const MD_FLOAT sigma6 = atom->sigma6[type_ij];
            const MD_FLOAT epsilon = atom->epsilon[type_ij];
#endif

            // Pairs outside the force cutoff contribute nothing.
            if(!(rsq < cutforcesq)) {
                continue;
            }

            const MD_FLOAT sr2 = 1.0 / rsq;
            const MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
            const MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
            fxi += dx * force;
            fyi += dy * force;
            fzi += dz * force;
        }

        atom_fx(i) = fxi;
        atom_fy(i) = fyi;
        atom_fz(i) = fzi;
    }
}
|
||||
|
||||
/*
 * First integration step on the device: one thread per local atom.
 *
 * For atom i the velocity is advanced by dtforce * force, then the position is
 * advanced by dt * (updated velocity). Threads with i >= Nlocal do nothing.
 * Expects a flat 1D launch covering at least Nlocal threads.
 */
__global__ void kernel_initial_integrate(MD_FLOAT dtforce, MD_FLOAT dt, int Nlocal, DeviceAtom a) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < Nlocal) {
        // The atom_* accessor macros expect a local pointer named `atom`.
        DeviceAtom *atom = &a;

        // Each axis is independent: update the velocity, then move with the new velocity.
        atom_vx(i) += dtforce * atom_fx(i);
        atom_x(i) += dt * atom_vx(i);

        atom_vy(i) += dtforce * atom_fy(i);
        atom_y(i) += dt * atom_vy(i);

        atom_vz(i) += dtforce * atom_fz(i);
        atom_z(i) += dt * atom_vz(i);
    }
}
|
||||
|
||||
/*
 * Final integration step on the device: one thread per local atom.
 *
 * Adds dtforce * force to each velocity component of atom i; positions are not
 * touched. Threads with i >= Nlocal do nothing. Expects a flat 1D launch covering
 * at least Nlocal threads.
 */
__global__ void kernel_final_integrate(MD_FLOAT dtforce, int Nlocal, DeviceAtom a) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < Nlocal) {
        // The atom_* accessor macros expect a local pointer named `atom`.
        DeviceAtom *atom = &a;

        atom_vx(i) += dtforce * atom_fx(i);
        atom_vy(i) += dtforce * atom_fy(i);
        atom_vz(i) += dtforce * atom_fz(i);
    }
}
|
||||
|
||||
extern "C" {
|
||||
|
||||
/*
 * Launches kernel_final_integrate for all local atoms and waits for it to finish.
 *
 * reneigh: when true, Nlocal * 3 velocity values are copied from the device
 *          (atom->d_atom.vx) back to the host array atom->vx afterwards.
 * param:   supplies dtforce.
 * atom:    supplies Nlocal and the device-side atom data.
 */
void finalIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
    const int Nlocal = atom->Nlocal;
    const int num_threads_per_block = get_cuda_num_threads();
    // Integer ceiling division. Going through float (24-bit significand) is inexact
    // for counts above 2^24 and could produce too few blocks.
    const int num_blocks = (Nlocal + num_threads_per_block - 1) / num_threads_per_block;

    // A grid with zero blocks is an invalid launch configuration, so skip the
    // launch entirely when there are no local atoms.
    if(num_blocks > 0) {
        kernel_final_integrate<<<num_blocks, num_threads_per_block>>>(param->dtforce, Nlocal, atom->d_atom);
        cuda_assert("kernel_final_integrate", cudaPeekAtLastError());
        cuda_assert("kernel_final_integrate", cudaDeviceSynchronize());
    }

    if(reneigh) {
        memcpyFromGPU(atom->vx, atom->d_atom.vx, sizeof(MD_FLOAT) * atom->Nlocal * 3);
    }
}
|
||||
|
||||
/*
 * Launches kernel_initial_integrate for all local atoms and waits for it to finish.
 *
 * reneigh: when true, Nlocal * 3 velocity values are copied from the device
 *          (atom->d_atom.vx) back to the host array atom->vx afterwards.
 *          Positions are not copied back here.
 * param:   supplies dtforce and dt.
 * atom:    supplies Nlocal and the device-side atom data.
 */
void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
    const int Nlocal = atom->Nlocal;
    const int num_threads_per_block = get_cuda_num_threads();
    // Integer ceiling division. Going through float (24-bit significand) is inexact
    // for counts above 2^24 and could produce too few blocks.
    const int num_blocks = (Nlocal + num_threads_per_block - 1) / num_threads_per_block;

    // A grid with zero blocks is an invalid launch configuration, so skip the
    // launch entirely when there are no local atoms.
    if(num_blocks > 0) {
        kernel_initial_integrate<<<num_blocks, num_threads_per_block>>>(param->dtforce, param->dt, Nlocal, atom->d_atom);
        cuda_assert("kernel_initial_integrate", cudaPeekAtLastError());
        cuda_assert("kernel_initial_integrate", cudaDeviceSynchronize());
    }

    if(reneigh) {
        memcpyFromGPU(atom->vx, atom->d_atom.vx, sizeof(MD_FLOAT) * atom->Nlocal * 3);
    }
}
|
||||
|
||||
/*
 * Launches the calc_force kernel for all local atoms and waits for completion.
 *
 * param:    supplies cutforce (squared here before being passed on), sigma6 and epsilon.
 * atom:     supplies Nlocal, ntypes and the device-side atom data.
 * neighbor: supplies maxneighs and the device-side neighbor list.
 *
 * Returns the difference of two getTimeStamp() readings taken around the kernel
 * launch and synchronization. The region is also bracketed by the CUDA profiler
 * start/stop calls and the LIKWID "force" marker.
 */
double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neighbor) {
    const int num_threads_per_block = get_cuda_num_threads();
    int Nlocal = atom->Nlocal;
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
    MD_FLOAT sigma6 = param->sigma6;
    MD_FLOAT epsilon = param->epsilon;

    /*
    int nDevices;
    cudaGetDeviceCount(&nDevices);
    size_t free, total;
    for(int i = 0; i < nDevices; ++i) {
        cudaMemGetInfo( &free, &total );
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        printf("DEVICE %d/%d NAME: %s\r\n with %ld MB/%ld MB memory used", i + 1, nDevices, prop.name, free / 1024 / 1024, total / 1024 / 1024);
    }
    */

    // HINT: Run with cuda-memcheck ./MDBench-NVCC in case of error
    // memsetGPU(atom->d_atom.fx, 0, sizeof(MD_FLOAT) * Nlocal * 3);

    cudaProfilerStart();
    // Integer ceiling division. Going through float (24-bit significand) is inexact
    // for counts above 2^24 and could produce too few blocks.
    const int num_blocks = (Nlocal + num_threads_per_block - 1) / num_threads_per_block;
    double S = getTimeStamp();
    LIKWID_MARKER_START("force");

    // A grid with zero blocks is an invalid launch configuration, so skip the
    // launch when there are no local atoms. Timing and markers still run.
    if(num_blocks > 0) {
        calc_force<<<num_blocks, num_threads_per_block>>>(atom->d_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, neighbor->d_neighbor.neighbors, neighbor->d_neighbor.numneigh, atom->ntypes);
        cuda_assert("calc_force", cudaPeekAtLastError());
        cuda_assert("calc_force", cudaDeviceSynchronize());
    }
    cudaProfilerStop();

    LIKWID_MARKER_STOP("force");
    double E = getTimeStamp();
    return E - S;
}
|
||||
|
||||
}
|
293
verletlist/cuda/neighbor.cu
Normal file
293
verletlist/cuda/neighbor.cu
Normal file
@@ -0,0 +1,293 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include <cuda_profiler_api.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <device_launch_parameters.h>
|
||||
//---
|
||||
|
||||
extern "C" {
|
||||
|
||||
#include <atom.h>
|
||||
#include <device.h>
|
||||
#include <parameter.h>
|
||||
#include <neighbor.h>
|
||||
#include <util.h>
|
||||
|
||||
}
|
||||
|
||||
// Binning / neighbor-list state declared here and defined elsewhere.
extern MD_FLOAT xprd;
extern MD_FLOAT yprd;
extern MD_FLOAT zprd;
extern MD_FLOAT bininvx;
extern MD_FLOAT bininvy;
extern MD_FLOAT bininvz;
extern int mbinxlo;
extern int mbinylo;
extern int mbinzlo;
extern int nbinx;
extern int nbiny;
extern int nbinz;
// n bins in x, y, z
extern int mbinx;
extern int mbiny;
extern int mbinz;
// total number of bins
extern int mbins;
// max atoms per bin
extern int atoms_per_bin;
// neighbor cutoff squared
extern MD_FLOAT cutneighsq;
extern int nmax;
// # of bins in stencil
extern int nstencil;
// stencil list of bin offsets
extern int* stencil;

// File-local GPU buffers, allocated on first use in buildNeighbor_cuda.
static int* c_stencil = NULL;
static int* c_resize_needed = NULL;
static int* c_new_maxneighs = NULL;

// Device-side binning buffers; mbins == 0 marks them as not yet allocated.
static Binning c_binning {
    .bincount = NULL,
    .bins = NULL,
    .mbins = 0,
    .atoms_per_bin = 0
};
|
||||
|
||||
/*
 * Maps a coordinate triple to a flat bin index using the binning parameters in np.
 *
 * Per axis, three cases are distinguished in this order:
 *   coordinate >= box length : offset relative to the box length, shifted by nbin
 *   coordinate >= 0          : plain scaled coordinate
 *   otherwise                : scaled coordinate minus one
 * and the low-bin offset (mbin*lo) is subtracted in every case.
 *
 * Returns iz * mbiny * mbinx + iy * mbinx + ix + 1.
 */
__device__ int coord2bin_device(MD_FLOAT xin, MD_FLOAT yin, MD_FLOAT zin, Neighbor_params np) {
    const int bx = (xin >= np.xprd) ? (int)((xin - np.xprd) * np.bininvx) + np.nbinx - np.mbinxlo
                 : (xin >= 0.0)     ? (int)(xin * np.bininvx) - np.mbinxlo
                                    : (int)(xin * np.bininvx) - np.mbinxlo - 1;

    const int by = (yin >= np.yprd) ? (int)((yin - np.yprd) * np.bininvy) + np.nbiny - np.mbinylo
                 : (yin >= 0.0)     ? (int)(yin * np.bininvy) - np.mbinylo
                                    : (int)(yin * np.bininvy) - np.mbinylo - 1;

    const int bz = (zin >= np.zprd) ? (int)((zin - np.zprd) * np.bininvz) + np.nbinz - np.mbinzlo
                 : (zin >= 0.0)     ? (int)(zin * np.bininvz) - np.mbinzlo
                                    : (int)(zin * np.bininvz) - np.mbinzlo - 1;

    return (bz * np.mbiny * np.mbinx + by * np.mbinx + bx + 1);
}
|
||||
|
||||
/* sorts the contents of a bin to make it comparable to the CPU version */
|
||||
/* uses bubble sort since atoms per bin should be relatively small and can be done in situ */
|
||||
/*
 * Sorts the atom indices stored in each bin into ascending order, in place.
 *
 * One thread handles one bin: thread `bin` sorts the first bincount[bin] entries
 * of bins[bin * atoms_per_bin ...] with repeated adjacent-swap passes until a pass
 * performs no swap. Threads with bin >= mbins do nothing.
 */
__global__ void sort_bin_contents_kernel(int* bincount, int* bins, int mbins, int atoms_per_bin){
    const int bin = blockIdx.x * blockDim.x + threadIdx.x;
    if(bin < mbins) {
        const int count = bincount[bin];
        int *entries = bins + bin * atoms_per_bin;

        bool swapped = true;
        while(swapped) {
            swapped = false;
            for(int pos = 1; pos < count; pos++) {
                const int left = entries[pos - 1];
                const int right = entries[pos];
                if(left > right) {
                    entries[pos - 1] = right;
                    entries[pos] = left;
                    swapped = true;
                }
            }
        }
    }
}
|
||||
|
||||
/*
 * Assigns every atom (index < nall) to a bin: one thread per atom.
 *
 * The bin is computed from the atom position with coord2bin_device. A slot inside
 * the bin is claimed with atomicAdd on bincount[ibin]. If the claimed slot fits
 * (slot < atoms_per_bin) the atom index is stored there; otherwise the slot number
 * is folded into *resize_needed with atomicMax so the host can see that the bins
 * are too small. bincount keeps counting even for atoms that did not fit.
 */
__global__ void binatoms_kernel(DeviceAtom a, int nall, int* bincount, int* bins, int atoms_per_bin, Neighbor_params np, int *resize_needed) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i >= nall) {
        return;
    }

    // The atom_* accessor macros expect a local pointer named `atom`.
    DeviceAtom* atom = &a;
    const int ibin = coord2bin_device(atom_x(i), atom_y(i), atom_z(i), np);
    const int slot = atomicAdd(&bincount[ibin], 1);

    if(slot >= atoms_per_bin) {
        // Bin overflow: report it instead of writing past the bin's storage.
        atomicMax(resize_needed, slot);
        return;
    }

    bins[ibin * atoms_per_bin + slot] = i;
}
|
||||
|
||||
/*
 * Builds the neighbor list of every local atom: one thread per atom i < nlocal.
 *
 * For atom i, all bins ibin + stencil[k] (k < nstencil) are scanned and every atom
 * j != i with squared distance <= cutoff is recorded. With EXPLICIT_TYPES the cutoff
 * is taken from atom->cutneighsq[type_i * ntypes + type_j], otherwise the scalar
 * cutneighsq argument is used.
 *
 * Storage layout: the n-th neighbor of atom i is written to
 * neigh.neighbors[nlocal * n + i]; the neighbor count goes to neigh.numneigh[i].
 *
 * Overflow handling: at most maxneighs neighbors are stored per atom. Neighbors
 * beyond that are counted but NOT written, so the list buffer is never overrun.
 * If the count exceeds maxneighs it is folded into *new_maxneighs with atomicMax,
 * which lets the host detect the overflow; numneigh[i] then holds the required
 * count rather than the stored count.
 */
__global__ void compute_neighborhood(
    DeviceAtom a, DeviceNeighbor neigh, Neighbor_params np, int nlocal, int maxneighs, int nstencil, int* stencil,
    int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq, int ntypes) {

    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i >= nlocal) {
        return;
    }

    // The atom_* accessor macros expect a local pointer named `atom`.
    DeviceAtom *atom = &a;
    DeviceNeighbor *neighbor = &neigh;

    // Column of the neighbor table that belongs to atom i; entries are nlocal apart.
    int* neighptr = &(neighbor->neighbors[i]);
    int n = 0;
    MD_FLOAT xtmp = atom_x(i);
    MD_FLOAT ytmp = atom_y(i);
    MD_FLOAT ztmp = atom_z(i);
    int ibin = coord2bin_device(xtmp, ytmp, ztmp, np);
#ifdef EXPLICIT_TYPES
    int type_i = atom->type[i];
#endif

    for(int k = 0; k < nstencil; k++) {
        int jbin = ibin + stencil[k];
        int* loc_bin = &bins[jbin * atoms_per_bin];
        // Read the bin population once instead of on every inner-loop iteration.
        const int atoms_in_jbin = bincount[jbin];

        for(int m = 0; m < atoms_in_jbin; m++) {
            int j = loc_bin[m];

            if(j == i) {
                continue;
            }

            MD_FLOAT delx = xtmp - atom_x(j);
            MD_FLOAT dely = ytmp - atom_y(j);
            MD_FLOAT delz = ztmp - atom_z(j);
            MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;

#ifdef EXPLICIT_TYPES
            int type_j = atom->type[j];
            const MD_FLOAT cutoff = atom->cutneighsq[type_i * ntypes + type_j];
#else
            const MD_FLOAT cutoff = cutneighsq;
#endif

            if(rsq <= cutoff) {
                // Only store while there is room; keep counting so the host
                // learns how many slots are actually required.
                if(n < maxneighs) {
                    neighptr[nlocal * n] = j;
                }
                n += 1;
            }
        }
    }

    neighbor->numneigh[i] = n;
    if(n > maxneighs) {
        atomicMax(new_maxneighs, n);
    }
}
|
||||
|
||||
/*
 * Bins all local and ghost atoms on the GPU and sorts the contents of each bin.
 *
 * The binning kernel is repeated until no bin overflows: after each pass the
 * overflow flag is copied back from c_resize_needed, and if it is set the
 * per-bin capacity is doubled and the bins buffer reallocated. Afterwards the
 * global atoms_per_bin is updated to the final capacity and every bin is sorted.
 *
 * atom:              supplies Nlocal, Nghost and the device-side atom data.
 * c_binning:         device binning buffers; atoms_per_bin and bins may be replaced.
 * c_resize_needed:   device int used as the overflow flag.
 * np:                binning parameters forwarded to the kernel by value.
 * threads_per_block: block size used for both kernel launches.
 */
void binatoms_cuda(Atom *atom, Binning *c_binning, int *c_resize_needed, Neighbor_params *np, const int threads_per_block) {
    const int nall = atom->Nlocal + atom->Nghost;
    // Integer ceiling division. Going through float (24-bit significand) is inexact
    // for counts above 2^24 and could produce too few blocks.
    const int num_blocks = (nall + threads_per_block - 1) / threads_per_block;
    int resize = 1;

    while(resize > 0) {
        resize = 0;
        memsetGPU(c_binning->bincount, 0, c_binning->mbins * sizeof(int));
        memsetGPU(c_resize_needed, 0, sizeof(int));

        // A grid with zero blocks is an invalid launch configuration.
        if(num_blocks > 0) {
            binatoms_kernel<<<num_blocks, threads_per_block>>>(atom->d_atom, nall, c_binning->bincount, c_binning->bins, c_binning->atoms_per_bin, *np, c_resize_needed);
            cuda_assert("binatoms", cudaPeekAtLastError());
            cuda_assert("binatoms", cudaDeviceSynchronize());
        }

        memcpyFromGPU(&resize, c_resize_needed, sizeof(int));
        if(resize) {
            c_binning->atoms_per_bin *= 2;
            c_binning->bins = (int *) reallocateGPU(c_binning->bins, c_binning->mbins * c_binning->atoms_per_bin * sizeof(int));
        }
    }

    atoms_per_bin = c_binning->atoms_per_bin;

    // Size the sort grid from the same bin count the kernel uses as its bound.
    const int sortBlocks = (c_binning->mbins + threads_per_block - 1) / threads_per_block;
    if(sortBlocks > 0) {
        sort_bin_contents_kernel<<<sortBlocks, threads_per_block>>>(c_binning->bincount, c_binning->bins, c_binning->mbins, c_binning->atoms_per_bin);
        cuda_assert("sort_bin", cudaPeekAtLastError());
        cuda_assert("sort_bin", cudaDeviceSynchronize());
    }
}
|
||||
|
||||
/*
 * Rebuilds the device-side neighbor list for all local atoms.
 *
 * Steps:
 *   1. Lazily allocate the file-local GPU buffers (stencil copy, binning buffers,
 *      the two single-int flags) on first use.
 *   2. Bin local and ghost atoms with binatoms_cuda.
 *   3. Grow the device neighbor buffers if the atom count exceeds nmax.
 *   4. Run compute_neighborhood; if any atom needs more than maxneighs slots,
 *      enlarge maxneighs (1.2 x the required maximum), grow the device neighbor
 *      list and run the kernel again.
 *
 * atom:     supplies Nlocal, Nghost, Nmax, ntypes and the device-side atom data.
 * neighbor: maxneighs and the buffers in neighbor->d_neighbor are updated.
 */
void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor) {
    DeviceNeighbor *d_neighbor = &(neighbor->d_neighbor);
    const int num_threads_per_block = get_cuda_num_threads();
    int nall = atom->Nlocal + atom->Nghost;

    cudaProfilerStart();

    // TODO move all of this initialization into its own method
    if(c_stencil == NULL) {
        c_stencil = (int *) allocateGPU(nstencil * sizeof(int));
        memcpyToGPU(c_stencil, stencil, nstencil * sizeof(int));
    }

    if(c_binning.mbins == 0) {
        c_binning.mbins = mbins;
        c_binning.atoms_per_bin = atoms_per_bin;
        c_binning.bincount = (int *) allocateGPU(c_binning.mbins * sizeof(int));
        c_binning.bins = (int *) allocateGPU(c_binning.mbins * c_binning.atoms_per_bin * sizeof(int));
    }

    Neighbor_params np {
        .xprd = xprd,
        .yprd = yprd,
        .zprd = zprd,
        .bininvx = bininvx,
        .bininvy = bininvy,
        .bininvz = bininvz,
        .mbinxlo = mbinxlo,
        .mbinylo = mbinylo,
        .mbinzlo = mbinzlo,
        .nbinx = nbinx,
        .nbiny = nbiny,
        .nbinz = nbinz,
        .mbinx = mbinx,
        .mbiny = mbiny,
        .mbinz = mbinz
    };

    if(c_resize_needed == NULL) {
        c_resize_needed = (int *) allocateGPU(sizeof(int));
    }

    /* bin local & ghost atoms */
    binatoms_cuda(atom, &c_binning, c_resize_needed, &np, num_threads_per_block);

    if(c_new_maxneighs == NULL) {
        c_new_maxneighs = (int *) allocateGPU(sizeof(int));
    }

    if(nall > nmax) {
        nmax = nall;
        d_neighbor->neighbors = (int *) reallocateGPU(d_neighbor->neighbors, nmax * neighbor->maxneighs * sizeof(int));
        d_neighbor->numneigh = (int *) reallocateGPU(d_neighbor->numneigh, nmax * sizeof(int));
    }

    // Integer ceiling division. Going through float (24-bit significand) is inexact
    // for counts above 2^24 and could produce too few blocks.
    const int num_blocks = (atom->Nlocal + num_threads_per_block - 1) / num_threads_per_block;
    int resize = 1;

    /* loop over each atom, storing neighbors */
    while(resize) {
        resize = 0;
        memsetGPU(c_new_maxneighs, 0, sizeof(int));

        // A grid with zero blocks is an invalid launch configuration.
        if(num_blocks > 0) {
            compute_neighborhood<<<num_blocks, num_threads_per_block>>>(atom->d_atom, *d_neighbor,
                np, atom->Nlocal, neighbor->maxneighs, nstencil, c_stencil,
                c_binning.bins, c_binning.atoms_per_bin, c_binning.bincount,
                c_new_maxneighs,
                cutneighsq, atom->ntypes);

            cuda_assert("compute_neighborhood", cudaPeekAtLastError());
            cuda_assert("compute_neighborhood", cudaDeviceSynchronize());
        }

        int new_maxneighs = 0;
        memcpyFromGPU(&new_maxneighs, c_new_maxneighs, sizeof(int));
        if(new_maxneighs > neighbor->maxneighs) {
            resize = 1;
        }

        if(resize) {
            printf("RESIZE %d\n", neighbor->maxneighs);
            neighbor->maxneighs = new_maxneighs * 1.2;
            printf("NEW SIZE %d\n", neighbor->maxneighs);
            // Grow the DEVICE neighbor list: it is the buffer the kernel writes to
            // (passed via *d_neighbor) and the one the force kernel reads from.
            d_neighbor->neighbors = (int *) reallocateGPU(d_neighbor->neighbors, atom->Nmax * neighbor->maxneighs * sizeof(int));
        }
    }

    cudaProfilerStop();
}
|
111
verletlist/cuda/pbc.cu
Normal file
111
verletlist/cuda/pbc.cu
Normal file
@@ -0,0 +1,111 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
//---
|
||||
|
||||
extern "C" {
|
||||
|
||||
#include <allocate.h>
|
||||
#include <atom.h>
|
||||
#include <device.h>
|
||||
#include <pbc.h>
|
||||
#include <util.h>
|
||||
|
||||
}
|
||||
|
||||
// Host-side ghost-atom bookkeeping, declared here and defined elsewhere.
extern int NmaxGhost;
extern int *PBCx;
extern int *PBCy;
extern int *PBCz;

// GPU copies of the PBC arrays and the capacity they were last sized for;
// grown on demand in updatePbc_cuda.
static int c_NmaxGhost = 0;
static int *c_PBCx = NULL;
static int *c_PBCy = NULL;
static int *c_PBCz = NULL;
|
||||
|
||||
/*
 * Wraps local atoms back into the periodic box: one thread per atom i < nlocal.
 *
 * Per axis, a negative coordinate is shifted up by the box length and a
 * coordinate >= the box length is shifted down by it. A coordinate already
 * inside [0, box length) is left untouched (no store is issued).
 */
__global__ void computeAtomsPbcUpdate(DeviceAtom a, int nlocal, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i >= nlocal) {
        return;
    }

    // The atom_* accessor macros expect a local pointer named `atom`.
    DeviceAtom *atom = &a;

    const MD_FLOAT x = atom_x(i);
    if(x < 0.0) {
        atom_x(i) = x + xprd;
    } else if(x >= xprd) {
        atom_x(i) = x - xprd;
    }

    const MD_FLOAT y = atom_y(i);
    if(y < 0.0) {
        atom_y(i) = y + yprd;
    } else if(y >= yprd) {
        atom_y(i) = y - yprd;
    }

    const MD_FLOAT z = atom_z(i);
    if(z < 0.0) {
        atom_z(i) = z + zprd;
    } else if(z >= zprd) {
        atom_z(i) = z - zprd;
    }
}
|
||||
|
||||
/*
 * Refreshes ghost-atom positions: one thread per ghost atom i < nghost.
 *
 * Ghost i is stored at atom index nlocal + i. Its position is the position of
 * the atom border_map[i] shifted by PBCx/y/z[i] times the box length of that axis.
 */
__global__ void computePbcUpdate(DeviceAtom a, int nlocal, int nghost, int* PBCx, int* PBCy, int* PBCz, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < nghost) {
        // The atom_* accessor macros expect a local pointer named `atom`.
        DeviceAtom* atom = &a;
        const int src = atom->border_map[i];
        const int ghost = nlocal + i;

        atom_x(ghost) = atom_x(src) + PBCx[i] * xprd;
        atom_y(ghost) = atom_y(src) + PBCy[i] * yprd;
        atom_z(ghost) = atom_z(src) + PBCz[i] * zprd;
    }
}
|
||||
|
||||
/* update coordinates of ghost atoms */
|
||||
/* uses mapping created in setupPbc */
|
||||
/*
 * Updates the positions of all ghost atoms on the GPU.
 *
 * atom:    supplies Nlocal, Nghost, Nmax and the host/device atom data.
 * param:   supplies the box lengths xprd, yprd, zprd.
 * reneigh: when true, the host positions and types (Nmax entries) are first
 *          uploaded, the device PBC/border-map buffers are grown if NmaxGhost
 *          increased, and the host PBCx/PBCy/PBCz and border_map arrays
 *          (NmaxGhost entries each) are uploaded.
 *
 * Afterwards computePbcUpdate is launched with one thread per ghost atom.
 */
void updatePbc_cuda(Atom *atom, Parameter *param, bool reneigh) {
    const int num_threads_per_block = get_cuda_num_threads();

    if(reneigh) {
        memcpyToGPU(atom->d_atom.x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
        memcpyToGPU(atom->d_atom.type, atom->type, sizeof(int) * atom->Nmax);

        if(c_NmaxGhost < NmaxGhost) {
            c_NmaxGhost = NmaxGhost;
            c_PBCx = (int *) reallocateGPU(c_PBCx, NmaxGhost * sizeof(int));
            c_PBCy = (int *) reallocateGPU(c_PBCy, NmaxGhost * sizeof(int));
            c_PBCz = (int *) reallocateGPU(c_PBCz, NmaxGhost * sizeof(int));
            atom->d_atom.border_map = (int *) reallocateGPU(atom->d_atom.border_map, NmaxGhost * sizeof(int));
        }

        memcpyToGPU(c_PBCx, PBCx, NmaxGhost * sizeof(int));
        memcpyToGPU(c_PBCy, PBCy, NmaxGhost * sizeof(int));
        memcpyToGPU(c_PBCz, PBCz, NmaxGhost * sizeof(int));
        memcpyToGPU(atom->d_atom.border_map, atom->border_map, NmaxGhost * sizeof(int));
        cuda_assert("updatePbc.reneigh", cudaPeekAtLastError());
        cuda_assert("updatePbc.reneigh", cudaDeviceSynchronize());
    }

    MD_FLOAT xprd = param->xprd;
    MD_FLOAT yprd = param->yprd;
    MD_FLOAT zprd = param->zprd;

    // Integer ceiling division. Going through float (24-bit significand) is inexact
    // for counts above 2^24 and could produce too few blocks.
    const int num_blocks = (atom->Nghost + num_threads_per_block - 1) / num_threads_per_block;

    // A grid with zero blocks is an invalid launch configuration, so skip the
    // launch when there are no ghost atoms to update.
    if(num_blocks > 0) {
        computePbcUpdate<<<num_blocks, num_threads_per_block>>>(atom->d_atom, atom->Nlocal, atom->Nghost, c_PBCx, c_PBCy, c_PBCz, xprd, yprd, zprd);
        cuda_assert("updatePbc", cudaPeekAtLastError());
        cuda_assert("updatePbc", cudaDeviceSynchronize());
    }
}
|
||||
|
||||
/*
 * Wraps all local atoms back into the periodic box on the GPU and copies the
 * resulting positions (Nlocal * 3 values) back to the host array atom->x.
 *
 * atom:  supplies Nlocal and the host/device atom data.
 * param: supplies the box lengths xprd, yprd, zprd.
 */
void updateAtomsPbc_cuda(Atom* atom, Parameter *param) {
    const int num_threads_per_block = get_cuda_num_threads();
    MD_FLOAT xprd = param->xprd;
    MD_FLOAT yprd = param->yprd;
    MD_FLOAT zprd = param->zprd;

    // Integer ceiling division. Going through float (24-bit significand) is inexact
    // for counts above 2^24 and could produce too few blocks.
    const int num_blocks = (atom->Nlocal + num_threads_per_block - 1) / num_threads_per_block;

    // A grid with zero blocks is an invalid launch configuration, so skip the
    // launch when there are no local atoms.
    if(num_blocks > 0) {
        computeAtomsPbcUpdate<<<num_blocks, num_threads_per_block>>>(atom->d_atom, atom->Nlocal, xprd, yprd, zprd);
        cuda_assert("computeAtomsPbcUpdate", cudaPeekAtLastError());
        cuda_assert("computeAtomsPbcUpdate", cudaDeviceSynchronize());
    }

    memcpyFromGPU(atom->x, atom->d_atom.x, sizeof(MD_FLOAT) * atom->Nlocal * 3);
}
|
Reference in New Issue
Block a user