Porting atom velocity memory layout to AoS, porting velocity integration to CUDA, adding measurements + logbook update

This commit is contained in:
Maximilian Gaul 2022-01-01 18:18:12 +01:00
parent 50007216ed
commit dc4d5f1a9c
8 changed files with 175 additions and 72 deletions

1
.gitignore vendored
View File

@ -52,6 +52,7 @@ Mkfile.old
dkms.conf
# Build directories and executables
.vscode/
GCC/
ICC/
MDBench-GCC*

View File

@ -78,7 +78,7 @@ void* reallocate (
size_t newBytesize,
size_t oldBytesize)
{
void* newarray = allocate(alignment, newBytesize);
void* newarray = allocate(alignment, newBytesize);
if(ptr != NULL) {
memcpy(newarray, ptr, oldBytesize);

View File

@ -137,9 +137,9 @@ void createAtom(Atom *atom, Parameter *param)
atom_x(atom->Nlocal) = xtmp;
atom_y(atom->Nlocal) = ytmp;
atom_z(atom->Nlocal) = ztmp;
atom->vx[atom->Nlocal] = vxtmp;
atom->vy[atom->Nlocal] = vytmp;
atom->vz[atom->Nlocal] = vztmp;
atom_vx(atom->Nlocal) = vxtmp;
atom_vy(atom->Nlocal) = vytmp;
atom_vz(atom->Nlocal) = vztmp;
atom->type[atom->Nlocal] = rand() % atom->ntypes;
atom->Nlocal++;
}
@ -164,6 +164,8 @@ void growAtom(Atom *atom)
atom->x = (MD_FLOAT*) reallocate(atom->x, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
atom->fx = (MD_FLOAT*) reallocate(atom->fx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
#else
atom->x = (MD_FLOAT*) reallocate(atom->x, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
atom->y = (MD_FLOAT*) reallocate(atom->y, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
@ -172,10 +174,12 @@ void growAtom(Atom *atom)
atom->fx = (MD_FLOAT*) reallocate(atom->fx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
atom->fy = (MD_FLOAT*) reallocate(atom->fy, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
atom->fz = (MD_FLOAT*) reallocate(atom->fz, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
#endif
atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
atom->vy = (MD_FLOAT*) reallocate(atom->vy, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
atom->vz = (MD_FLOAT*) reallocate(atom->vz, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
#endif
atom->type = (int *) reallocate(atom->type, ALIGNMENT, atom->Nmax * sizeof(int), nold * sizeof(int));
}

View File

@ -92,13 +92,115 @@ __global__ void calc_force(
atom_fz(i) = fiz;
}
__global__ void kernel_initial_integrate(MD_FLOAT dtforce, MD_FLOAT dt, int Nlocal, Atom a) {
const int i = blockIdx.x * blockDim.x + threadIdx.x;
if( i >= Nlocal ) {
return;
}
Atom *atom = &a;
atom_vx(i) += dtforce * atom_fx(i);
atom_vy(i) += dtforce * atom_fy(i);
atom_vz(i) += dtforce * atom_fz(i);
atom_x(i) = atom_x(i) + dt * atom_vx(i);
atom_y(i) = atom_y(i) + dt * atom_vy(i);
atom_z(i) = atom_z(i) + dt * atom_vz(i);
}
__global__ void kernel_final_integrate(MD_FLOAT dtforce, int Nlocal, Atom a) {
const int i = blockIdx.x * blockDim.x + threadIdx.x;
if( i >= Nlocal ) {
return;
}
Atom *atom = &a;
atom_vx(i) += dtforce * atom_fx(i);
atom_vy(i) += dtforce * atom_fy(i);
atom_vz(i) += dtforce * atom_fz(i);
}
extern "C" {
bool initialized = false;
static Atom c_atom;
int *c_neighs;
int *c_neigh_numneigh;
int get_num_threads() {
const char *num_threads_env = getenv("NUM_THREADS");
int num_threads = 0;
if(num_threads_env == nullptr)
num_threads = 32;
else {
num_threads = atoi(num_threads_env);
}
return num_threads;
}
void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom) {
const int Nlocal = atom->Nlocal;
const int num_threads = get_num_threads();
const int num_threads_per_block = num_threads; // this should be multiple of 32 as operations are performed at the level of warps
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, c_atom);
checkCUDAError( "PeekAtLastError FinalIntegrate", cudaPeekAtLastError() );
checkCUDAError( "DeviceSync FinalIntegrate", cudaDeviceSynchronize() );
if(doReneighbour) {
checkCUDAError( "FinalIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom.vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
checkCUDAError( "FinalIntegrate: position memcpy", cudaMemcpy(atom->x, c_atom.x, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
}
}
void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom) {
const int Nlocal = atom->Nlocal;
const int num_threads = get_num_threads();
const int num_threads_per_block = num_threads; // this should be multiple of 32 as operations are performed at the level of warps
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, c_atom);
checkCUDAError( "PeekAtLastError InitialIntegrate", cudaPeekAtLastError() );
checkCUDAError( "DeviceSync InitialIntegrate", cudaDeviceSynchronize() );
checkCUDAError( "InitialIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom.vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
checkCUDAError( "InitialIntegrate: position memcpy", cudaMemcpy(atom->x, c_atom.x, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
}
void initCudaAtom(Atom *atom, Neighbor *neighbor) {
const int Nlocal = atom->Nlocal;
checkCUDAError( "c_atom.x malloc", cudaMalloc((void**)&(c_atom.x), sizeof(MD_FLOAT) * atom->Nmax * 3) );
checkCUDAError( "c_atom.fx malloc", cudaMalloc((void**)&(c_atom.fx), sizeof(MD_FLOAT) * Nlocal * 3) );
checkCUDAError( "c_atom.vx malloc", cudaMalloc((void**)&(c_atom.vx), sizeof(MD_FLOAT) * Nlocal * 3) );
checkCUDAError( "c_atom.vx memcpy", cudaMemcpy(c_atom.vx, atom->vx, sizeof(MD_FLOAT) * Nlocal * 3, cudaMemcpyHostToDevice) );
checkCUDAError( "c_atom.type malloc", cudaMalloc((void**)&(c_atom.type), sizeof(int) * atom->Nmax) );
checkCUDAError( "c_atom.epsilon malloc", cudaMalloc((void**)&(c_atom.epsilon), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
checkCUDAError( "c_atom.sigma6 malloc", cudaMalloc((void**)&(c_atom.sigma6), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
checkCUDAError( "c_atom.cutforcesq malloc", cudaMalloc((void**)&(c_atom.cutforcesq), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
checkCUDAError( "c_neighs malloc", cudaMalloc((void**)&c_neighs, sizeof(int) * Nlocal * neighbor->maxneighs) );
checkCUDAError( "c_neigh_numneigh malloc", cudaMalloc((void**)&c_neigh_numneigh, sizeof(int) * Nlocal) );
checkCUDAError( "c_atom.type memcpy", cudaMemcpy(c_atom.type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) );
checkCUDAError( "c_atom.sigma6 memcpy", cudaMemcpy(c_atom.sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
checkCUDAError( "c_atom.epsilon memcpy", cudaMemcpy(c_atom.epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
checkCUDAError( "c_atom.cutforcesq memcpy", cudaMemcpy(c_atom.cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
}
double computeForce(
bool reneighbourHappenend,
Parameter *param,
@ -113,15 +215,7 @@ double computeForce(
MD_FLOAT epsilon = param->epsilon;
#endif
cudaProfilerStart();
const char *num_threads_env = getenv("NUM_THREADS");
int num_threads = 0;
if(num_threads_env == nullptr)
num_threads = 32;
else {
num_threads = atoi(num_threads_env);
}
const int num_threads = get_num_threads();
c_atom.Natoms = atom->Natoms;
c_atom.Nlocal = atom->Nlocal;
@ -149,25 +243,9 @@ double computeForce(
// HINT: Run with cuda-memcheck ./MDBench-NVCC in case of error
// HINT: Only works for data layout = AOS!!!
if(!initialized) {
checkCUDAError( "c_atom.x malloc", cudaMalloc((void**)&(c_atom.x), sizeof(MD_FLOAT) * atom->Nmax * 3) );
checkCUDAError( "c_atom.fx malloc", cudaMalloc((void**)&(c_atom.fx), sizeof(MD_FLOAT) * Nlocal * 3) );
checkCUDAError( "c_atom.type malloc", cudaMalloc((void**)&(c_atom.type), sizeof(int) * atom->Nmax) );
checkCUDAError( "c_atom.epsilon malloc", cudaMalloc((void**)&(c_atom.epsilon), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
checkCUDAError( "c_atom.sigma6 malloc", cudaMalloc((void**)&(c_atom.sigma6), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
checkCUDAError( "c_atom.cutforcesq malloc", cudaMalloc((void**)&(c_atom.cutforcesq), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
// checkCUDAError( "c_atom.fx memset", cudaMemset(c_atom.fx, 0, sizeof(MD_FLOAT) * Nlocal * 3) );
checkCUDAError( "c_neighs malloc", cudaMalloc((void**)&c_neighs, sizeof(int) * Nlocal * neighbor->maxneighs) );
checkCUDAError( "c_neigh_numneigh malloc", cudaMalloc((void**)&c_neigh_numneigh, sizeof(int) * Nlocal) );
checkCUDAError( "c_atom.type memcpy", cudaMemcpy(c_atom.type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) );
checkCUDAError( "c_atom.sigma6 memcpy", cudaMemcpy(c_atom.sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
checkCUDAError( "c_atom.epsilon memcpy", cudaMemcpy(c_atom.epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
checkCUDAError( "c_atom.cutforcesq memcpy", cudaMemcpy(c_atom.cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
}
checkCUDAError( "c_atom.fx memset", cudaMemset(c_atom.fx, 0, sizeof(MD_FLOAT) * Nlocal * 3) );
cudaProfilerStart();
checkCUDAError( "c_atom.x memcpy", cudaMemcpy(c_atom.x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
@ -184,12 +262,12 @@ double computeForce(
calc_force <<< num_blocks, num_threads_per_block >>> (c_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, c_neighs, c_neigh_numneigh);
checkCUDAError( "PeekAtLastError", cudaPeekAtLastError() );
checkCUDAError( "DeviceSync", cudaDeviceSynchronize() );
checkCUDAError( "PeekAtLastError ComputeForce", cudaPeekAtLastError() );
checkCUDAError( "DeviceSync ComputeForce", cudaDeviceSynchronize() );
// copy results in c_atom.fx/fy/fz to atom->fx/fy/fz
cudaMemcpy(atom->fx, c_atom.fx, sizeof(MD_FLOAT) * Nlocal * 3, cudaMemcpyDeviceToHost);
checkCUDAError( "memcpy force back", cudaMemcpy(atom->fx, c_atom.fx, sizeof(MD_FLOAT) * Nlocal * 3, cudaMemcpyDeviceToHost) );
/*
cudaFree(c_atom.x);
@ -207,8 +285,6 @@ double computeForce(
LIKWID_MARKER_STOP("force");
double E = getTimeStamp();
initialized = true;
return E-S;
}
}

View File

@ -54,6 +54,10 @@ extern void growAtom(Atom*);
#define atom_fy(i) atom->fx[(i) * 3 + 1]
#define atom_fz(i) atom->fx[(i) * 3 + 2]
#define atom_vx(i) atom->vx[(i) * 3 + 0]
#define atom_vy(i) atom->vx[(i) * 3 + 1]
#define atom_vz(i) atom->vx[(i) * 3 + 2]
#else
#define POS_DATA_LAYOUT "SoA"
@ -65,6 +69,10 @@ extern void growAtom(Atom*);
#define atom_fy(i) atom->fy[i]
#define atom_fz(i) atom->fz[i]
#define atom_vx(i) atom->vx[i]
#define atom_vy(i) atom->vy[i]
#define atom_vz(i) atom->vz[i]
#endif
#endif

View File

@ -45,6 +45,11 @@
#define HLINE "----------------------------------------------------------------------------\n"
extern void initCudaAtom(Atom *atom, Neighbor *neighbor);
extern void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom);
extern void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom);
extern double computeForce(bool, Parameter*, Atom*, Neighbor*);
extern double computeForceTracing(Parameter*, Atom*, Neighbor*, Stats*, int, int);
extern double computeForceEam(Eam* eam, Parameter*, Atom *atom, Neighbor *neighbor, Stats *stats, int first_exec, int timestep);
@ -101,6 +106,8 @@ double setup(
buildNeighbor(atom, neighbor);
E = getTimeStamp();
initCudaAtom(atom, neighbor);
return E-S;
}
@ -126,26 +133,22 @@ double reneighbour(
void initialIntegrate(Parameter *param, Atom *atom)
{
MD_FLOAT* vx = atom->vx; MD_FLOAT* vy = atom->vy; MD_FLOAT* vz = atom->vz;
for(int i = 0; i < atom->Nlocal; i++) {
vx[i] += param->dtforce * atom_fx(i);
vy[i] += param->dtforce * atom_fy(i);
vz[i] += param->dtforce * atom_fz(i);
atom_x(i) = atom_x(i) + param->dt * vx[i];
atom_y(i) = atom_y(i) + param->dt * vy[i];
atom_z(i) = atom_z(i) + param->dt * vz[i];
atom_vx(i) += param->dtforce * atom_fx(i);
atom_vy(i) += param->dtforce * atom_fy(i);
atom_vz(i) += param->dtforce * atom_fz(i);
atom_x(i) = atom_x(i) + param->dt * atom_vx(i);
atom_y(i) = atom_y(i) + param->dt * atom_vy(i);
atom_z(i) = atom_z(i) + param->dt * atom_vz(i);
}
}
void finalIntegrate(Parameter *param, Atom *atom)
{
MD_FLOAT* vx = atom->vx; MD_FLOAT* vy = atom->vy; MD_FLOAT* vz = atom->vz;
for(int i = 0; i < atom->Nlocal; i++) {
vx[i] += param->dtforce * atom_fx(i);
vy[i] += param->dtforce * atom_fy(i);
vz[i] += param->dtforce * atom_fz(i);
atom_vx(i) += param->dtforce * atom_fx(i);
atom_vy(i) += param->dtforce * atom_fy(i);
atom_vz(i) += param->dtforce * atom_fz(i);
}
}
@ -274,10 +277,11 @@ int main(int argc, char** argv)
}
for(int n = 0; n < param.ntimes; n++) {
initialIntegrate(&param, &atom);
const bool doReneighbour = (n + 1) % param.every == 0;
cuda_initial_integrate(doReneighbour, &param, &atom); // initialIntegrate(&param, &atom);
if(doReneighbour) {
timer[NEIGH] += reneighbour(&param, &atom, &neighbor);
} else {
@ -293,7 +297,7 @@ int main(int argc, char** argv)
timer[FORCE] += computeForce(doReneighbour, &param, &atom, &neighbor);
#endif
}
finalIntegrate(&param, &atom);
cuda_final_integrate(doReneighbour, &param, &atom); // finalIntegrate(&param, &atom);
if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
computeThermo(n + 1, &param, &atom);

View File

@ -359,14 +359,18 @@ void sortAtom(Atom* atom) {
#ifdef AOS
double* new_x = (double*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
double* new_vx = (double*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
#else
double* new_x = (double*) malloc(Nmax * sizeof(MD_FLOAT));
double* new_y = (double*) malloc(Nmax * sizeof(MD_FLOAT));
double* new_z = (double*) malloc(Nmax * sizeof(MD_FLOAT));
#endif
double* new_vx = (double*) malloc(Nmax * sizeof(MD_FLOAT));
double* new_vy = (double*) malloc(Nmax * sizeof(MD_FLOAT));
double* new_vz = (double*) malloc(Nmax * sizeof(MD_FLOAT));
#endif
double* old_x = atom->x; double* old_y = atom->y; double* old_z = atom->z;
double* old_vx = atom->vx; double* old_vy = atom->vy; double* old_vz = atom->vz;
@ -380,24 +384,34 @@ void sortAtom(Atom* atom) {
new_x[new_i * 3 + 0] = old_x[old_i * 3 + 0];
new_x[new_i * 3 + 1] = old_x[old_i * 3 + 1];
new_x[new_i * 3 + 2] = old_x[old_i * 3 + 2];
new_vx[new_i * 3 + 0] = old_vx[old_i * 3 + 0];
new_vx[new_i * 3 + 1] = old_vy[old_i * 3 + 1];
new_vx[new_i * 3 + 2] = old_vz[old_i * 3 + 2];
#else
new_x[new_i] = old_x[old_i];
new_y[new_i] = old_y[old_i];
new_z[new_i] = old_z[old_i];
#endif
new_vx[new_i] = old_vx[old_i];
new_vy[new_i] = old_vy[old_i];
new_vz[new_i] = old_vz[old_i];
#endif
}
}
free(atom->x);
atom->x = new_x;
free(atom->vx);
atom->vx = new_vx;
#ifndef AOS
free(atom->y);
free(atom->z);
atom->y = new_y; atom->z = new_z;
free(atom->vy); free(atom->vz);
atom->vy = new_vy; atom->vz = new_vz;
#endif
free(atom->vx); free(atom->vy); free(atom->vz);
atom->vx = new_vx; atom->vy = new_vy; atom->vz = new_vz;
}

View File

@ -71,12 +71,9 @@ void setupThermo(Parameter *param, int natoms)
void computeThermo(int iflag, Parameter *param, Atom *atom)
{
MD_FLOAT t = 0.0, p;
MD_FLOAT* vx = atom->vx;
MD_FLOAT* vy = atom->vy;
MD_FLOAT* vz = atom->vz;
for(int i = 0; i < atom->Nlocal; i++) {
t += (vx[i] * vx[i] + vy[i] * vy[i] + vz[i] * vz[i]) * param->mass;
t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass;
}
t = t * t_scale;
@ -101,12 +98,11 @@ void adjustThermo(Parameter *param, Atom *atom)
{
/* zero center-of-mass motion */
MD_FLOAT vxtot = 0.0; MD_FLOAT vytot = 0.0; MD_FLOAT vztot = 0.0;
MD_FLOAT* vx = atom->vx; MD_FLOAT* vy = atom->vy; MD_FLOAT* vz = atom->vz;
for(int i = 0; i < atom->Nlocal; i++) {
vxtot += vx[i];
vytot += vy[i];
vztot += vz[i];
vxtot += atom_vx(i);
vytot += atom_vy(i);
vztot += atom_vz(i);
}
vxtot = vxtot / atom->Natoms;
@ -114,24 +110,24 @@ void adjustThermo(Parameter *param, Atom *atom)
vztot = vztot / atom->Natoms;
for(int i = 0; i < atom->Nlocal; i++) {
vx[i] -= vxtot;
vy[i] -= vytot;
vz[i] -= vztot;
atom_vx(i) -= vxtot;
atom_vy(i) -= vytot;
atom_vz(i) -= vztot;
}
t_act = 0;
MD_FLOAT t = 0.0;
for(int i = 0; i < atom->Nlocal; i++) {
t += (vx[i] * vx[i] + vy[i] * vy[i] + vz[i] * vz[i]) * param->mass;
t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass;
}
t *= t_scale;
MD_FLOAT factor = sqrt(param->temp / t);
for(int i = 0; i < atom->Nlocal; i++) {
vx[i] *= factor;
vy[i] *= factor;
vz[i] *= factor;
atom_vx(i) *= factor;
atom_vy(i) *= factor;
atom_vz(i) *= factor;
}
}