Porting atom velocity memory layout to AoS, porting velocity integration to CUDA, adding measurements + logbook update
This commit is contained in:
parent 50007216ed
commit dc4d5f1a9c
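Note (not part of the commit): the diff below moves the velocity arrays to the same array-of-structures (AoS) indexing already used for positions and forces. As a rough orientation, a minimal standalone sketch of what that indexing change means; the names v, v_x, v_y, v_z are hypothetical and only mirror the macros introduced further down:

/* Minimal sketch: in the AOS build all three velocity components of atom i
 * live in one buffer at indices i*3+{0,1,2}; the SoA build keeps separate
 * per-component arrays indexed with i directly. Compile with or without -DAOS. */
#include <stdio.h>

#define N 4

#ifdef AOS
static double v[N * 3];                 /* vx0 vy0 vz0 vx1 vy1 vz1 ... */
#define v_x(i) v[(i) * 3 + 0]
#define v_y(i) v[(i) * 3 + 1]
#define v_z(i) v[(i) * 3 + 2]
#else
static double vx[N], vy[N], vz[N];      /* one contiguous array per component */
#define v_x(i) vx[i]
#define v_y(i) vy[i]
#define v_z(i) vz[i]
#endif

int main(void) {
    for(int i = 0; i < N; i++) {
        v_x(i) = 1.0; v_y(i) = 2.0; v_z(i) = 3.0;
    }
    printf("v of atom 2: %g %g %g\n", v_x(2), v_y(2), v_z(2));
    return 0;
}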
.gitignore (vendored, 1 change)

@@ -52,6 +52,7 @@ Mkfile.old
 dkms.conf
 
 # Build directories and executables
+.vscode/
 GCC/
 ICC/
 MDBench-GCC*

@@ -78,7 +78,7 @@ void* reallocate (
     size_t newBytesize,
     size_t oldBytesize)
 {
     void* newarray = allocate(alignment, newBytesize);
 
     if(ptr != NULL) {
         memcpy(newarray, ptr, oldBytesize);
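For context, the reallocate() used throughout this commit follows the usual grow-and-copy pattern: allocate a new aligned buffer, copy the old contents, and hand back the new pointer. A simplified sketch of such a helper, assuming C11 aligned_alloc and a hypothetical name grow_buffer (not the project's exact implementation):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Grow-and-copy reallocation sketch: allocate the new aligned buffer, copy the
 * old contents over, release the old buffer. newBytesize is assumed to be a
 * multiple of alignment, as C11 aligned_alloc requires. */
static void* grow_buffer(void *ptr, size_t alignment, size_t newBytesize, size_t oldBytesize) {
    void *newarray = aligned_alloc(alignment, newBytesize);

    if(ptr != NULL) {
        memcpy(newarray, ptr, oldBytesize);
        free(ptr);
    }

    return newarray;
}

int main(void) {
    double *v = NULL;
    v = (double*) grow_buffer(v, 64, 64 * sizeof(double), 0);                       /* initial allocation */
    v[7] = 1.0;
    v = (double*) grow_buffer(v, 64, 128 * sizeof(double), 64 * sizeof(double));    /* grow, old data preserved */
    printf("%g\n", v[7]);
    free(v);
    return 0;
}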

src/atom.c (12 changes)

@@ -137,9 +137,9 @@ void createAtom(Atom *atom, Parameter *param)
         atom_x(atom->Nlocal) = xtmp;
         atom_y(atom->Nlocal) = ytmp;
         atom_z(atom->Nlocal) = ztmp;
-        atom->vx[atom->Nlocal] = vxtmp;
-        atom->vy[atom->Nlocal] = vytmp;
-        atom->vz[atom->Nlocal] = vztmp;
+        atom_vx(atom->Nlocal) = vxtmp;
+        atom_vy(atom->Nlocal) = vytmp;
+        atom_vz(atom->Nlocal) = vztmp;
         atom->type[atom->Nlocal] = rand() % atom->ntypes;
         atom->Nlocal++;
     }
@@ -164,6 +164,8 @@ void growAtom(Atom *atom)
     atom->x = (MD_FLOAT*) reallocate(atom->x, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
 
     atom->fx = (MD_FLOAT*) reallocate(atom->fx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
 
+    atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
+
 #else
     atom->x = (MD_FLOAT*) reallocate(atom->x, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
     atom->y = (MD_FLOAT*) reallocate(atom->y, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
@@ -172,10 +174,12 @@ void growAtom(Atom *atom)
     atom->fx = (MD_FLOAT*) reallocate(atom->fx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
     atom->fy = (MD_FLOAT*) reallocate(atom->fy, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
     atom->fz = (MD_FLOAT*) reallocate(atom->fz, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
-#endif
 
     atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
     atom->vy = (MD_FLOAT*) reallocate(atom->vy, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
     atom->vz = (MD_FLOAT*) reallocate(atom->vz, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+
+#endif
 
     atom->type = (int *) reallocate(atom->type, ALIGNMENT, atom->Nmax * sizeof(int), nold * sizeof(int));
 }

src/force.cu (142 changes)

@@ -92,13 +92,115 @@ __global__ void calc_force(
     atom_fz(i) = fiz;
 }
 
+__global__ void kernel_initial_integrate(MD_FLOAT dtforce, MD_FLOAT dt, int Nlocal, Atom a) {
+
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if( i >= Nlocal ) {
+        return;
+    }
+
+    Atom *atom = &a;
+
+    atom_vx(i) += dtforce * atom_fx(i);
+    atom_vy(i) += dtforce * atom_fy(i);
+    atom_vz(i) += dtforce * atom_fz(i);
+    atom_x(i) = atom_x(i) + dt * atom_vx(i);
+    atom_y(i) = atom_y(i) + dt * atom_vy(i);
+    atom_z(i) = atom_z(i) + dt * atom_vz(i);
+}
+
+__global__ void kernel_final_integrate(MD_FLOAT dtforce, int Nlocal, Atom a) {
+
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if( i >= Nlocal ) {
+        return;
+    }
+
+    Atom *atom = &a;
+
+    atom_vx(i) += dtforce * atom_fx(i);
+    atom_vy(i) += dtforce * atom_fy(i);
+    atom_vz(i) += dtforce * atom_fz(i);
+}
+
 extern "C" {
 
-bool initialized = false;
 static Atom c_atom;
 int *c_neighs;
 int *c_neigh_numneigh;
+
+int get_num_threads() {
+
+    const char *num_threads_env = getenv("NUM_THREADS");
+    int num_threads = 0;
+    if(num_threads_env == nullptr)
+        num_threads = 32;
+    else {
+        num_threads = atoi(num_threads_env);
+    }
+
+    return num_threads;
+}
+
+void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom) {
+
+    const int Nlocal = atom->Nlocal;
+    const int num_threads = get_num_threads();
+
+    const int num_threads_per_block = num_threads; // this should be multiple of 32 as operations are performed at the level of warps
+    const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
+
+    kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, c_atom);
+
+    checkCUDAError( "PeekAtLastError FinalIntegrate", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync FinalIntegrate", cudaDeviceSynchronize() );
+
+    if(doReneighbour) {
+        checkCUDAError( "FinalIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom.vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
+        checkCUDAError( "FinalIntegrate: position memcpy", cudaMemcpy(atom->x, c_atom.x, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
+    }
+}
+
+void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom) {
+
+    const int Nlocal = atom->Nlocal;
+    const int num_threads = get_num_threads();
+
+    const int num_threads_per_block = num_threads; // this should be multiple of 32 as operations are performed at the level of warps
+    const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
+
+    kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, c_atom);
+
+    checkCUDAError( "PeekAtLastError InitialIntegrate", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync InitialIntegrate", cudaDeviceSynchronize() );
+
+    checkCUDAError( "InitialIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom.vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
+    checkCUDAError( "InitialIntegrate: position memcpy", cudaMemcpy(atom->x, c_atom.x, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
+}
+
+void initCudaAtom(Atom *atom, Neighbor *neighbor) {
+
+    const int Nlocal = atom->Nlocal;
+
+    checkCUDAError( "c_atom.x malloc", cudaMalloc((void**)&(c_atom.x), sizeof(MD_FLOAT) * atom->Nmax * 3) );
+    checkCUDAError( "c_atom.fx malloc", cudaMalloc((void**)&(c_atom.fx), sizeof(MD_FLOAT) * Nlocal * 3) );
+    checkCUDAError( "c_atom.vx malloc", cudaMalloc((void**)&(c_atom.vx), sizeof(MD_FLOAT) * Nlocal * 3) );
+    checkCUDAError( "c_atom.vx memcpy", cudaMemcpy(c_atom.vx, atom->vx, sizeof(MD_FLOAT) * Nlocal * 3, cudaMemcpyHostToDevice) );
+    checkCUDAError( "c_atom.type malloc", cudaMalloc((void**)&(c_atom.type), sizeof(int) * atom->Nmax) );
+    checkCUDAError( "c_atom.epsilon malloc", cudaMalloc((void**)&(c_atom.epsilon), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
+    checkCUDAError( "c_atom.sigma6 malloc", cudaMalloc((void**)&(c_atom.sigma6), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
+    checkCUDAError( "c_atom.cutforcesq malloc", cudaMalloc((void**)&(c_atom.cutforcesq), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
+
+    checkCUDAError( "c_neighs malloc", cudaMalloc((void**)&c_neighs, sizeof(int) * Nlocal * neighbor->maxneighs) );
+    checkCUDAError( "c_neigh_numneigh malloc", cudaMalloc((void**)&c_neigh_numneigh, sizeof(int) * Nlocal) );
+
+    checkCUDAError( "c_atom.type memcpy", cudaMemcpy(c_atom.type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) );
+    checkCUDAError( "c_atom.sigma6 memcpy", cudaMemcpy(c_atom.sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
+    checkCUDAError( "c_atom.epsilon memcpy", cudaMemcpy(c_atom.epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
+
+    checkCUDAError( "c_atom.cutforcesq memcpy", cudaMemcpy(c_atom.cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
+}
+
 double computeForce(
     bool reneighbourHappenend,
     Parameter *param,
@@ -113,15 +215,7 @@ double computeForce(
     MD_FLOAT epsilon = param->epsilon;
 #endif
 
-    cudaProfilerStart();
-
-    const char *num_threads_env = getenv("NUM_THREADS");
-    int num_threads = 0;
-    if(num_threads_env == nullptr)
-        num_threads = 32;
-    else {
-        num_threads = atoi(num_threads_env);
-    }
+    const int num_threads = get_num_threads();
 
     c_atom.Natoms = atom->Natoms;
     c_atom.Nlocal = atom->Nlocal;
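All launch sites now obtain the block size from get_num_threads(), which reads the NUM_THREADS environment variable and falls back to 32 when it is unset. A self-contained sketch of that lookup (hypothetical standalone program, not part of the commit); exporting e.g. NUM_THREADS=128 before running the benchmark changes the threads per block:

#include <stdio.h>
#include <stdlib.h>

/* Sketch of the NUM_THREADS lookup used by get_num_threads(). */
int main(void) {
    const char *num_threads_env = getenv("NUM_THREADS");
    int num_threads = (num_threads_env == NULL) ? 32 : atoi(num_threads_env);
    printf("threads per block: %d\n", num_threads);
    return 0;
}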
@@ -149,25 +243,9 @@ double computeForce(
     // HINT: Run with cuda-memcheck ./MDBench-NVCC in case of error
     // HINT: Only works for data layout = AOS!!!
 
-    if(!initialized) {
-        checkCUDAError( "c_atom.x malloc", cudaMalloc((void**)&(c_atom.x), sizeof(MD_FLOAT) * atom->Nmax * 3) );
-        checkCUDAError( "c_atom.fx malloc", cudaMalloc((void**)&(c_atom.fx), sizeof(MD_FLOAT) * Nlocal * 3) );
-        checkCUDAError( "c_atom.type malloc", cudaMalloc((void**)&(c_atom.type), sizeof(int) * atom->Nmax) );
-        checkCUDAError( "c_atom.epsilon malloc", cudaMalloc((void**)&(c_atom.epsilon), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
-        checkCUDAError( "c_atom.sigma6 malloc", cudaMalloc((void**)&(c_atom.sigma6), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
-        checkCUDAError( "c_atom.cutforcesq malloc", cudaMalloc((void**)&(c_atom.cutforcesq), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
-
-        checkCUDAError( "c_neighs malloc", cudaMalloc((void**)&c_neighs, sizeof(int) * Nlocal * neighbor->maxneighs) );
-        checkCUDAError( "c_neigh_numneigh malloc", cudaMalloc((void**)&c_neigh_numneigh, sizeof(int) * Nlocal) );
-
-        checkCUDAError( "c_atom.type memcpy", cudaMemcpy(c_atom.type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) );
-        checkCUDAError( "c_atom.sigma6 memcpy", cudaMemcpy(c_atom.sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
-        checkCUDAError( "c_atom.epsilon memcpy", cudaMemcpy(c_atom.epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
-
-        checkCUDAError( "c_atom.cutforcesq memcpy", cudaMemcpy(c_atom.cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
-    }
-
-    checkCUDAError( "c_atom.fx memset", cudaMemset(c_atom.fx, 0, sizeof(MD_FLOAT) * Nlocal * 3) );
+    // checkCUDAError( "c_atom.fx memset", cudaMemset(c_atom.fx, 0, sizeof(MD_FLOAT) * Nlocal * 3) );
+
+    cudaProfilerStart();
 
     checkCUDAError( "c_atom.x memcpy", cudaMemcpy(c_atom.x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
 
@@ -184,12 +262,12 @@ double computeForce(
 
     calc_force <<< num_blocks, num_threads_per_block >>> (c_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, c_neighs, c_neigh_numneigh);
 
-    checkCUDAError( "PeekAtLastError", cudaPeekAtLastError() );
-    checkCUDAError( "DeviceSync", cudaDeviceSynchronize() );
+    checkCUDAError( "PeekAtLastError ComputeForce", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync ComputeForce", cudaDeviceSynchronize() );
 
     // copy results in c_atom.fx/fy/fz to atom->fx/fy/fz
 
-    cudaMemcpy(atom->fx, c_atom.fx, sizeof(MD_FLOAT) * Nlocal * 3, cudaMemcpyDeviceToHost);
+    checkCUDAError( "memcpy force back", cudaMemcpy(atom->fx, c_atom.fx, sizeof(MD_FLOAT) * Nlocal * 3, cudaMemcpyDeviceToHost) );
 
     /*
     cudaFree(c_atom.x);
@@ -207,8 +285,6 @@ double computeForce(
     LIKWID_MARKER_STOP("force");
     double E = getTimeStamp();
 
-    initialized = true;
-
     return E-S;
 }
 }
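The launch configuration in the new integrate functions divides the Nlocal atoms into blocks of num_threads_per_block threads and guards the partially filled last block with the i >= Nlocal check. A minimal sketch of that pattern with a hypothetical kernel (MD_FLOAT assumed to be double here; the integer ceiling division is equivalent to the float ceil() used in the commit):

#include <cstdio>
#include <cuda_runtime.h>

typedef double MD_FLOAT;

// Hypothetical kernel: one thread per array element, with an out-of-range guard.
__global__ void scale(MD_FLOAT *v, MD_FLOAT factor, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i >= n) return;                  // last block may be only partially filled
    v[i] *= factor;
}

int main() {
    const int Nlocal = 1000;
    const int threads_per_block = 64;   // should be a multiple of 32 (warp size)
    // integer ceiling division: same result as ceil((float)Nlocal / threads_per_block)
    const int num_blocks = (Nlocal + threads_per_block - 1) / threads_per_block;

    MD_FLOAT *v;
    cudaMalloc((void**)&v, Nlocal * sizeof(MD_FLOAT));
    cudaMemset(v, 0, Nlocal * sizeof(MD_FLOAT));

    scale<<<num_blocks, threads_per_block>>>(v, 1.5, Nlocal);
    cudaDeviceSynchronize();

    printf("launched %d blocks of %d threads for %d atoms\n", num_blocks, threads_per_block, Nlocal);
    cudaFree(v);
    return 0;
}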

@@ -54,6 +54,10 @@ extern void growAtom(Atom*);
 #define atom_fy(i) atom->fx[(i) * 3 + 1]
 #define atom_fz(i) atom->fx[(i) * 3 + 2]
 
+#define atom_vx(i) atom->vx[(i) * 3 + 0]
+#define atom_vy(i) atom->vx[(i) * 3 + 1]
+#define atom_vz(i) atom->vx[(i) * 3 + 2]
+
 #else
 #define POS_DATA_LAYOUT "SoA"
 
@@ -65,6 +69,10 @@ extern void growAtom(Atom*);
 #define atom_fy(i) atom->fy[i]
 #define atom_fz(i) atom->fz[i]
 
+#define atom_vx(i) atom->vx[i]
+#define atom_vy(i) atom->vy[i]
+#define atom_vz(i) atom->vz[i]
+
 #endif
 
 #endif
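Because every velocity access now goes through these macros, the same loop body compiles against either layout. A self-contained illustration reusing the macro definitions above (hypothetical test program, not part of the commit):

/* Compile with or without -DAOS: only the index mapping behind the macros changes. */
#include <stdio.h>
#include <stdlib.h>

typedef double MD_FLOAT;

typedef struct {
    int Nlocal;
    MD_FLOAT *vx, *vy, *vz;   /* the AOS build only uses vx, holding x/y/z triplets */
} Atom;

#ifdef AOS
#define atom_vx(i) atom->vx[(i) * 3 + 0]
#define atom_vy(i) atom->vx[(i) * 3 + 1]
#define atom_vz(i) atom->vx[(i) * 3 + 2]
#else
#define atom_vx(i) atom->vx[i]
#define atom_vy(i) atom->vy[i]
#define atom_vz(i) atom->vz[i]
#endif

int main(void) {
    Atom a = { 0 };
    Atom *atom = &a;
    atom->Nlocal = 2;
#ifdef AOS
    atom->vx = (MD_FLOAT*) calloc(3 * atom->Nlocal, sizeof(MD_FLOAT));
#else
    atom->vx = (MD_FLOAT*) calloc(atom->Nlocal, sizeof(MD_FLOAT));
    atom->vy = (MD_FLOAT*) calloc(atom->Nlocal, sizeof(MD_FLOAT));
    atom->vz = (MD_FLOAT*) calloc(atom->Nlocal, sizeof(MD_FLOAT));
#endif

    atom_vx(1) = 1.0; atom_vy(1) = 2.0; atom_vz(1) = 3.0;

    MD_FLOAT t = 0.0;
    for(int i = 0; i < atom->Nlocal; i++) {
        t += atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i);
    }
    printf("sum v^2 = %g\n", t);   /* prints 14 in either build */
    return 0;
}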

src/main.c (34 changes)

@@ -45,6 +45,11 @@
 
 #define HLINE "----------------------------------------------------------------------------\n"
 
+extern void initCudaAtom(Atom *atom, Neighbor *neighbor);
+
+extern void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom);
+extern void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom);
+
 extern double computeForce(bool, Parameter*, Atom*, Neighbor*);
 extern double computeForceTracing(Parameter*, Atom*, Neighbor*, Stats*, int, int);
 extern double computeForceEam(Eam* eam, Parameter*, Atom *atom, Neighbor *neighbor, Stats *stats, int first_exec, int timestep);
@@ -101,6 +106,8 @@ double setup(
     buildNeighbor(atom, neighbor);
     E = getTimeStamp();
 
+    initCudaAtom(atom, neighbor);
+
     return E-S;
 }
 
@@ -126,26 +133,22 @@ double reneighbour(
 
 void initialIntegrate(Parameter *param, Atom *atom)
 {
-    MD_FLOAT* vx = atom->vx; MD_FLOAT* vy = atom->vy; MD_FLOAT* vz = atom->vz;
-
     for(int i = 0; i < atom->Nlocal; i++) {
-        vx[i] += param->dtforce * atom_fx(i);
-        vy[i] += param->dtforce * atom_fy(i);
-        vz[i] += param->dtforce * atom_fz(i);
-        atom_x(i) = atom_x(i) + param->dt * vx[i];
-        atom_y(i) = atom_y(i) + param->dt * vy[i];
-        atom_z(i) = atom_z(i) + param->dt * vz[i];
+        atom_vx(i) += param->dtforce * atom_fx(i);
+        atom_vy(i) += param->dtforce * atom_fy(i);
+        atom_vz(i) += param->dtforce * atom_fz(i);
+        atom_x(i) = atom_x(i) + param->dt * atom_vx(i);
+        atom_y(i) = atom_y(i) + param->dt * atom_vy(i);
+        atom_z(i) = atom_z(i) + param->dt * atom_vz(i);
     }
 }
 
 void finalIntegrate(Parameter *param, Atom *atom)
 {
-    MD_FLOAT* vx = atom->vx; MD_FLOAT* vy = atom->vy; MD_FLOAT* vz = atom->vz;
-
     for(int i = 0; i < atom->Nlocal; i++) {
-        vx[i] += param->dtforce * atom_fx(i);
-        vy[i] += param->dtforce * atom_fy(i);
-        vz[i] += param->dtforce * atom_fz(i);
+        atom_vx(i) += param->dtforce * atom_fx(i);
+        atom_vy(i) += param->dtforce * atom_fy(i);
+        atom_vz(i) += param->dtforce * atom_fz(i);
     }
 }
 
@@ -274,10 +277,11 @@ int main(int argc, char** argv)
     }
 
     for(int n = 0; n < param.ntimes; n++) {
-        initialIntegrate(&param, &atom);
-
         const bool doReneighbour = (n + 1) % param.every == 0;
 
+        cuda_initial_integrate(doReneighbour, &param, &atom); // initialIntegrate(&param, &atom);
+
         if(doReneighbour) {
             timer[NEIGH] += reneighbour(&param, &atom, &neighbor);
         } else {
@@ -293,7 +297,7 @@ int main(int argc, char** argv)
             timer[FORCE] += computeForce(doReneighbour, &param, &atom, &neighbor);
 #endif
         }
-        finalIntegrate(&param, &atom);
+        cuda_final_integrate(doReneighbour, &param, &atom); // finalIntegrate(&param, &atom);
 
         if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
             computeThermo(n + 1, &param, &atom);

@@ -359,14 +359,18 @@ void sortAtom(Atom* atom) {
 
 #ifdef AOS
     double* new_x = (double*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
+
+    double* new_vx = (double*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
 #else
     double* new_x = (double*) malloc(Nmax * sizeof(MD_FLOAT));
    double* new_y = (double*) malloc(Nmax * sizeof(MD_FLOAT));
     double* new_z = (double*) malloc(Nmax * sizeof(MD_FLOAT));
-#endif
+
     double* new_vx = (double*) malloc(Nmax * sizeof(MD_FLOAT));
     double* new_vy = (double*) malloc(Nmax * sizeof(MD_FLOAT));
     double* new_vz = (double*) malloc(Nmax * sizeof(MD_FLOAT));
+
+#endif
 
     double* old_x = atom->x; double* old_y = atom->y; double* old_z = atom->z;
     double* old_vx = atom->vx; double* old_vy = atom->vy; double* old_vz = atom->vz;
@@ -380,24 +384,34 @@ void sortAtom(Atom* atom) {
             new_x[new_i * 3 + 0] = old_x[old_i * 3 + 0];
             new_x[new_i * 3 + 1] = old_x[old_i * 3 + 1];
             new_x[new_i * 3 + 2] = old_x[old_i * 3 + 2];
+
+            new_vx[new_i * 3 + 0] = old_vx[old_i * 3 + 0];
+            new_vx[new_i * 3 + 1] = old_vy[old_i * 3 + 1];
+            new_vx[new_i * 3 + 2] = old_vz[old_i * 3 + 2];
 #else
             new_x[new_i] = old_x[old_i];
             new_y[new_i] = old_y[old_i];
             new_z[new_i] = old_z[old_i];
-#endif
+
             new_vx[new_i] = old_vx[old_i];
             new_vy[new_i] = old_vy[old_i];
             new_vz[new_i] = old_vz[old_i];
+
+#endif
 
         }
     }
 
     free(atom->x);
     atom->x = new_x;
 
+    free(atom->vx);
+    atom->vx = new_vx;
+
 #ifndef AOS
     free(atom->y);
     free(atom->z);
     atom->y = new_y; atom->z = new_z;
+
+    free(atom->vy); free(atom->vz);
+    atom->vy = new_vy; atom->vz = new_vz;
 #endif
-    free(atom->vx); free(atom->vy); free(atom->vz);
-    atom->vx = new_vx; atom->vy = new_vy; atom->vz = new_vz;
 }

src/thermo.c (26 changes)

@@ -71,12 +71,9 @@ void setupThermo(Parameter *param, int natoms)
 void computeThermo(int iflag, Parameter *param, Atom *atom)
 {
     MD_FLOAT t = 0.0, p;
-    MD_FLOAT* vx = atom->vx;
-    MD_FLOAT* vy = atom->vy;
-    MD_FLOAT* vz = atom->vz;
 
     for(int i = 0; i < atom->Nlocal; i++) {
-        t += (vx[i] * vx[i] + vy[i] * vy[i] + vz[i] * vz[i]) * param->mass;
+        t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass;
     }
 
     t = t * t_scale;
@@ -101,12 +98,11 @@ void adjustThermo(Parameter *param, Atom *atom)
 {
     /* zero center-of-mass motion */
     MD_FLOAT vxtot = 0.0; MD_FLOAT vytot = 0.0; MD_FLOAT vztot = 0.0;
-    MD_FLOAT* vx = atom->vx; MD_FLOAT* vy = atom->vy; MD_FLOAT* vz = atom->vz;
 
     for(int i = 0; i < atom->Nlocal; i++) {
-        vxtot += vx[i];
-        vytot += vy[i];
-        vztot += vz[i];
+        vxtot += atom_vx(i);
+        vytot += atom_vy(i);
+        vztot += atom_vz(i);
     }
 
     vxtot = vxtot / atom->Natoms;
@@ -114,24 +110,24 @@ void adjustThermo(Parameter *param, Atom *atom)
     vztot = vztot / atom->Natoms;
 
     for(int i = 0; i < atom->Nlocal; i++) {
-        vx[i] -= vxtot;
-        vy[i] -= vytot;
-        vz[i] -= vztot;
+        atom_vx(i) -= vxtot;
+        atom_vy(i) -= vytot;
+        atom_vz(i) -= vztot;
     }
 
     t_act = 0;
     MD_FLOAT t = 0.0;
 
     for(int i = 0; i < atom->Nlocal; i++) {
-        t += (vx[i] * vx[i] + vy[i] * vy[i] + vz[i] * vz[i]) * param->mass;
+        t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass;
     }
 
     t *= t_scale;
     MD_FLOAT factor = sqrt(param->temp / t);
 
     for(int i = 0; i < atom->Nlocal; i++) {
-        vx[i] *= factor;
-        vy[i] *= factor;
-        vz[i] *= factor;
+        atom_vx(i) *= factor;
+        atom_vy(i) *= factor;
+        atom_vz(i) *= factor;
     }
 }