Porting atom velocity memory layout to AoS, porting velocity integration to CUDA, adding measurements + logbook update
.gitignore (1 changed line)
@@ -52,6 +52,7 @@ Mkfile.old
 dkms.conf
 
 # Build directories and executables
+.vscode/
 GCC/
 ICC/
 MDBench-GCC*
@@ -78,7 +78,7 @@ void* reallocate (
         size_t newBytesize,
         size_t oldBytesize)
 {
-    void* newarray =  allocate(alignment, newBytesize);
+    void* newarray = allocate(alignment, newBytesize);
 
     if(ptr != NULL) {
         memcpy(newarray, ptr, oldBytesize);
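
For orientation, the helper touched above follows the usual grow-and-copy pattern. A minimal sketch of such a routine, assuming the allocate()/free() pairing suggested by the visible lines (the leading parameters and the rest of the body are not part of this diff):

    /* Sketch only: the first two parameter names are assumptions, and so is
     * freeing the old pointer, which happens in the part of the function not shown. */
    void* reallocate(void *ptr, int alignment, size_t newBytesize, size_t oldBytesize)
    {
        void* newarray = allocate(alignment, newBytesize);   // fresh aligned buffer

        if(ptr != NULL) {
            memcpy(newarray, ptr, oldBytesize);               // carry over the old contents
            free(ptr);                                        // assumed release of the old buffer
        }

        return newarray;
    }
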
							
								
								
									
src/atom.c (12 changed lines)
@@ -137,9 +137,9 @@ void createAtom(Atom *atom, Parameter *param)
                 atom_x(atom->Nlocal) = xtmp;
                 atom_y(atom->Nlocal) = ytmp;
                 atom_z(atom->Nlocal) = ztmp;
-                atom->vx[atom->Nlocal] = vxtmp;
-                atom->vy[atom->Nlocal] = vytmp;
-                atom->vz[atom->Nlocal] = vztmp;
+                atom_vx(atom->Nlocal) = vxtmp;
+                atom_vy(atom->Nlocal) = vytmp;
+                atom_vz(atom->Nlocal) = vztmp;
                 atom->type[atom->Nlocal] = rand() % atom->ntypes;
                 atom->Nlocal++;
             }
@@ -164,6 +164,8 @@ void growAtom(Atom *atom)
     atom->x  = (MD_FLOAT*) reallocate(atom->x,  ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
 
     atom->fx = (MD_FLOAT*) reallocate(atom->fx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
+
+    atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
     #else
     atom->x  = (MD_FLOAT*) reallocate(atom->x,  ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
     atom->y  = (MD_FLOAT*) reallocate(atom->y,  ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
@@ -172,10 +174,12 @@ void growAtom(Atom *atom)
     atom->fx = (MD_FLOAT*) reallocate(atom->fx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
     atom->fy = (MD_FLOAT*) reallocate(atom->fy, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
     atom->fz = (MD_FLOAT*) reallocate(atom->fz, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
-    #endif
 
     atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
     atom->vy = (MD_FLOAT*) reallocate(atom->vy, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
     atom->vz = (MD_FLOAT*) reallocate(atom->vz, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
 
+    #endif
+
     atom->type = (int *) reallocate(atom->type, ALIGNMENT, atom->Nmax * sizeof(int), nold * sizeof(int));
 }
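
In the AOS branch all three velocity components of an atom now share the single atom->vx allocation, so the velocity buffer is grown to Nmax * 3 MD_FLOATs just like the position and force buffers. A small illustration of the resulting index mapping (the index i is arbitrary):

    /* AoS velocity storage: component d (0 = x, 1 = y, 2 = z) of atom i sits at vx[i * 3 + d]. */
    size_t bytes = (size_t) atom->Nmax * 3 * sizeof(MD_FLOAT);   // size handed to reallocate above
    MD_FLOAT vy_i = atom->vx[i * 3 + 1];                         // what atom_vy(i) expands to under AOS
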
							
								
								
									
src/force.cu (142 changed lines)
@@ -92,13 +92,115 @@ __global__ void calc_force(
     atom_fz(i) = fiz;
 }
 
+__global__ void kernel_initial_integrate(MD_FLOAT dtforce, MD_FLOAT dt, int Nlocal, Atom a) {
+
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if( i >= Nlocal ) {
+        return;
+    }
+
+    Atom *atom = &a;
+
+    atom_vx(i) += dtforce * atom_fx(i);
+    atom_vy(i) += dtforce * atom_fy(i);
+    atom_vz(i) += dtforce * atom_fz(i);
+    atom_x(i) = atom_x(i) + dt * atom_vx(i);
+    atom_y(i) = atom_y(i) + dt * atom_vy(i);
+    atom_z(i) = atom_z(i) + dt * atom_vz(i);
+}
+
+__global__ void kernel_final_integrate(MD_FLOAT dtforce, int Nlocal, Atom a) {
+
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if( i >= Nlocal ) {
+        return;
+    }
+
+    Atom *atom = &a;
+
+    atom_vx(i) += dtforce * atom_fx(i);
+    atom_vy(i) += dtforce * atom_fy(i);
+    atom_vz(i) += dtforce * atom_fz(i);
+}
+
 extern "C" {
 
-bool initialized = false;
 static Atom c_atom;
 int *c_neighs;
 int *c_neigh_numneigh;
 
+int get_num_threads() {
+
+    const char *num_threads_env = getenv("NUM_THREADS");
+    int num_threads = 0;
+    if(num_threads_env == nullptr)
+        num_threads = 32;
+    else {
+        num_threads = atoi(num_threads_env);
+    }
+
+    return num_threads;
+}
+
+void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom) {
+
+    const int Nlocal = atom->Nlocal;
+    const int num_threads = get_num_threads();
+
+    const int num_threads_per_block = num_threads; // this should be multiple of 32 as operations are performed at the level of warps
+    const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
+
+    kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, c_atom);
+
+    checkCUDAError( "PeekAtLastError FinalIntegrate", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync FinalIntegrate", cudaDeviceSynchronize() );
+
+    if(doReneighbour) {
+        checkCUDAError( "FinalIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom.vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
+        checkCUDAError( "FinalIntegrate: position memcpy", cudaMemcpy(atom->x, c_atom.x, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
+    }
+}
+
+void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom) {
+
+    const int Nlocal = atom->Nlocal;
+    const int num_threads = get_num_threads();
+
+    const int num_threads_per_block = num_threads; // this should be multiple of 32 as operations are performed at the level of warps
+    const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
+
+    kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, c_atom);
+
+    checkCUDAError( "PeekAtLastError InitialIntegrate", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync InitialIntegrate", cudaDeviceSynchronize() );
+
+    checkCUDAError( "InitialIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom.vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
+    checkCUDAError( "InitialIntegrate: position memcpy", cudaMemcpy(atom->x, c_atom.x, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
+}
+
+void initCudaAtom(Atom *atom, Neighbor *neighbor) {
+
+    const int Nlocal = atom->Nlocal;
+
+    checkCUDAError( "c_atom.x malloc", cudaMalloc((void**)&(c_atom.x), sizeof(MD_FLOAT) * atom->Nmax * 3) );
+    checkCUDAError( "c_atom.fx malloc", cudaMalloc((void**)&(c_atom.fx), sizeof(MD_FLOAT) * Nlocal * 3) );
+    checkCUDAError( "c_atom.vx malloc", cudaMalloc((void**)&(c_atom.vx), sizeof(MD_FLOAT) * Nlocal * 3) );
+    checkCUDAError( "c_atom.vx memcpy", cudaMemcpy(c_atom.vx, atom->vx, sizeof(MD_FLOAT) * Nlocal * 3, cudaMemcpyHostToDevice) );
+    checkCUDAError( "c_atom.type malloc", cudaMalloc((void**)&(c_atom.type), sizeof(int) * atom->Nmax) );
+    checkCUDAError( "c_atom.epsilon malloc", cudaMalloc((void**)&(c_atom.epsilon), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
+    checkCUDAError( "c_atom.sigma6 malloc", cudaMalloc((void**)&(c_atom.sigma6), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
+    checkCUDAError( "c_atom.cutforcesq malloc", cudaMalloc((void**)&(c_atom.cutforcesq), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
+
+    checkCUDAError( "c_neighs malloc", cudaMalloc((void**)&c_neighs, sizeof(int) * Nlocal * neighbor->maxneighs) );
+    checkCUDAError( "c_neigh_numneigh malloc", cudaMalloc((void**)&c_neigh_numneigh, sizeof(int) * Nlocal) );
+
+    checkCUDAError( "c_atom.type memcpy", cudaMemcpy(c_atom.type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) );
+    checkCUDAError( "c_atom.sigma6 memcpy", cudaMemcpy(c_atom.sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
+    checkCUDAError( "c_atom.epsilon memcpy", cudaMemcpy(c_atom.epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
+
+    checkCUDAError( "c_atom.cutforcesq memcpy", cudaMemcpy(c_atom.cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
+}
+
 double computeForce(
         bool reneighbourHappenend,
         Parameter *param,
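
Both host wrappers derive the grid size from the local atom count so that each thread handles exactly one atom. A standalone illustration of the same round-up arithmetic, expressed with integers and made-up numbers:

    const int Nlocal = 100000;               // example atom count
    const int num_threads_per_block = 32;    // default when NUM_THREADS is unset; keep it a multiple of 32 (warp size)
    const int num_blocks = (Nlocal + num_threads_per_block - 1) / num_threads_per_block;  // 3125 blocks
    // kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, c_atom);

The block size can be overridden at run time without recompiling, e.g. NUM_THREADS=128 ./MDBench-NVCC.
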
@@ -113,15 +215,7 @@ double computeForce(
     MD_FLOAT epsilon = param->epsilon;
 #endif
 
-    cudaProfilerStart();
-
-    const char *num_threads_env = getenv("NUM_THREADS");
-    int num_threads = 0;
-    if(num_threads_env == nullptr)
-        num_threads = 32;
-    else {
-        num_threads = atoi(num_threads_env);
-    }
+    const int num_threads = get_num_threads();
 
     c_atom.Natoms = atom->Natoms;
     c_atom.Nlocal = atom->Nlocal;
@@ -149,25 +243,9 @@ double computeForce(
     // HINT: Run with cuda-memcheck ./MDBench-NVCC in case of error
     // HINT: Only works for data layout = AOS!!!
 
-    if(!initialized) {
-        checkCUDAError( "c_atom.x malloc", cudaMalloc((void**)&(c_atom.x), sizeof(MD_FLOAT) * atom->Nmax * 3) );
-        checkCUDAError( "c_atom.fx malloc", cudaMalloc((void**)&(c_atom.fx), sizeof(MD_FLOAT) * Nlocal * 3) );
-        checkCUDAError( "c_atom.type malloc", cudaMalloc((void**)&(c_atom.type), sizeof(int) * atom->Nmax) );
-        checkCUDAError( "c_atom.epsilon malloc", cudaMalloc((void**)&(c_atom.epsilon), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
-        checkCUDAError( "c_atom.sigma6 malloc", cudaMalloc((void**)&(c_atom.sigma6), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
-        checkCUDAError( "c_atom.cutforcesq malloc", cudaMalloc((void**)&(c_atom.cutforcesq), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
-
-        checkCUDAError( "c_neighs malloc", cudaMalloc((void**)&c_neighs, sizeof(int) * Nlocal * neighbor->maxneighs) );
-        checkCUDAError( "c_neigh_numneigh malloc", cudaMalloc((void**)&c_neigh_numneigh, sizeof(int) * Nlocal) );
-
-        checkCUDAError( "c_atom.type memcpy", cudaMemcpy(c_atom.type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) );
-        checkCUDAError( "c_atom.sigma6 memcpy", cudaMemcpy(c_atom.sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
-        checkCUDAError( "c_atom.epsilon memcpy", cudaMemcpy(c_atom.epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
-
-        checkCUDAError( "c_atom.cutforcesq memcpy", cudaMemcpy(c_atom.cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
-    }
-
-    checkCUDAError( "c_atom.fx memset", cudaMemset(c_atom.fx, 0, sizeof(MD_FLOAT) * Nlocal * 3) );
+    // checkCUDAError( "c_atom.fx memset", cudaMemset(c_atom.fx, 0, sizeof(MD_FLOAT) * Nlocal * 3) );
+
+    cudaProfilerStart();
 
     checkCUDAError( "c_atom.x memcpy", cudaMemcpy(c_atom.x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
 
@@ -184,12 +262,12 @@ double computeForce(
 
     calc_force <<< num_blocks, num_threads_per_block >>> (c_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, c_neighs, c_neigh_numneigh);
 
-    checkCUDAError( "PeekAtLastError", cudaPeekAtLastError() );
-    checkCUDAError( "DeviceSync", cudaDeviceSynchronize() );
+    checkCUDAError( "PeekAtLastError ComputeForce", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync ComputeForce", cudaDeviceSynchronize() );
 
     // copy results in c_atom.fx/fy/fz to atom->fx/fy/fz
 
-    cudaMemcpy(atom->fx, c_atom.fx, sizeof(MD_FLOAT) * Nlocal * 3, cudaMemcpyDeviceToHost);
+    checkCUDAError( "memcpy force back", cudaMemcpy(atom->fx, c_atom.fx, sizeof(MD_FLOAT) * Nlocal * 3, cudaMemcpyDeviceToHost) );
 
     /*
     cudaFree(c_atom.x);
@@ -207,8 +285,6 @@ double computeForce(
     LIKWID_MARKER_STOP("force");
     double E = getTimeStamp();
 
-    initialized = true;
-
     return E-S;
 }
 }
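
Every CUDA call in this file is routed through checkCUDAError, whose definition is not part of this diff. A plausible minimal shape of such a helper, shown here only as an assumption about what the project provides:

    /* Assumed helper: aborts with a message when a CUDA call fails. */
    static void checkCUDAError(const char *msg, cudaError_t err)
    {
        if(err != cudaSuccess) {
            fprintf(stderr, "CUDA error (%s): %s\n", msg, cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }
    }
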
@@ -54,6 +54,10 @@ extern void growAtom(Atom*);
 #define atom_fy(i)          atom->fx[(i) * 3 + 1]
 #define atom_fz(i)          atom->fx[(i) * 3 + 2]
 
+#define atom_vx(i)          atom->vx[(i) * 3 + 0]
+#define atom_vy(i)          atom->vx[(i) * 3 + 1]
+#define atom_vz(i)          atom->vx[(i) * 3 + 2]
+
 #else
 #define POS_DATA_LAYOUT     "SoA"
 
@@ -65,6 +69,10 @@ extern void growAtom(Atom*);
 #define atom_fy(i)           atom->fy[i]
 #define atom_fz(i)           atom->fz[i]
 
+#define atom_vx(i)           atom->vx[i]
+#define atom_vy(i)           atom->vy[i]
+#define atom_vz(i)           atom->vz[i]
+
 #endif
 
 #endif
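
Because the accessors hide the indexing, the same statement now compiles for both layouts: under AOS atom_vy(i) touches atom->vx[(i) * 3 + 1] (one interleaved buffer, mirroring the force macros), while under SoA it touches atom->vy[i]. An illustrative use, matching the integration loops further down:

    for(int i = 0; i < atom->Nlocal; i++) {
        atom_vy(i) += param->dtforce * atom_fy(i);   // layout-agnostic thanks to the macros
    }
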
							
								
								
									
src/main.c (34 changed lines)
@@ -45,6 +45,11 @@
 
 #define HLINE "----------------------------------------------------------------------------\n"
 
+extern void initCudaAtom(Atom *atom, Neighbor *neighbor);
+
+extern void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom);
+extern void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom);
+
 extern double computeForce(bool, Parameter*, Atom*, Neighbor*);
 extern double computeForceTracing(Parameter*, Atom*, Neighbor*, Stats*, int, int);
 extern double computeForceEam(Eam* eam, Parameter*, Atom *atom, Neighbor *neighbor, Stats *stats, int first_exec, int timestep);
@@ -101,6 +106,8 @@ double setup(
     buildNeighbor(atom, neighbor);
     E = getTimeStamp();
 
+    initCudaAtom(atom, neighbor);
+
     return E-S;
 }
 
@@ -126,26 +133,22 @@ double reneighbour(
 
 void initialIntegrate(Parameter *param, Atom *atom)
 {
-    MD_FLOAT* vx = atom->vx; MD_FLOAT* vy = atom->vy; MD_FLOAT* vz = atom->vz;
-
     for(int i = 0; i < atom->Nlocal; i++) {
-        vx[i] += param->dtforce * atom_fx(i);
-        vy[i] += param->dtforce * atom_fy(i);
-        vz[i] += param->dtforce * atom_fz(i);
-        atom_x(i) = atom_x(i) + param->dt * vx[i];
-        atom_y(i) = atom_y(i) + param->dt * vy[i];
-        atom_z(i) = atom_z(i) + param->dt * vz[i];
+        atom_vx(i) += param->dtforce * atom_fx(i);
+        atom_vy(i) += param->dtforce * atom_fy(i);
+        atom_vz(i) += param->dtforce * atom_fz(i);
+        atom_x(i) = atom_x(i) + param->dt * atom_vx(i);
+        atom_y(i) = atom_y(i) + param->dt * atom_vy(i);
+        atom_z(i) = atom_z(i) + param->dt * atom_vz(i);
     }
 }
 
 void finalIntegrate(Parameter *param, Atom *atom)
 {
-    MD_FLOAT* vx = atom->vx; MD_FLOAT* vy = atom->vy; MD_FLOAT* vz = atom->vz;
-
     for(int i = 0; i < atom->Nlocal; i++) {
-        vx[i] += param->dtforce * atom_fx(i);
-        vy[i] += param->dtforce * atom_fy(i);
-        vz[i] += param->dtforce * atom_fz(i);
+        atom_vx(i) += param->dtforce * atom_fx(i);
+        atom_vy(i) += param->dtforce * atom_fy(i);
+        atom_vz(i) += param->dtforce * atom_fz(i);
     }
 }
 
@@ -274,10 +277,11 @@ int main(int argc, char** argv)
     }
 
     for(int n = 0; n < param.ntimes; n++) {
-        initialIntegrate(&param, &atom);
-
         const bool doReneighbour = (n + 1) % param.every == 0;
 
+        cuda_initial_integrate(doReneighbour, &param, &atom); // initialIntegrate(&param, &atom);
+
         if(doReneighbour) {
             timer[NEIGH] += reneighbour(&param, &atom, &neighbor);
         } else {
@@ -293,7 +297,7 @@ int main(int argc, char** argv)
             timer[FORCE] += computeForce(doReneighbour, &param, &atom, &neighbor);
 #endif
         }
-        finalIntegrate(&param, &atom);
+        cuda_final_integrate(doReneighbour, &param, &atom); // finalIntegrate(&param, &atom);
 
         if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
             computeThermo(n + 1, &param, &atom);
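
With both integration halves on the device, the per-timestep flow on the CUDA path becomes: initial half-step integration, optional reneighbouring, force computation, final half-step integration. A condensed, illustrative sketch of the loop body; the non-reneighbour branch and the statistics output that main.c performs elsewhere are omitted:

    for(int n = 0; n < param.ntimes; n++) {
        const bool doReneighbour = (n + 1) % param.every == 0;

        cuda_initial_integrate(doReneighbour, &param, &atom);      // velocities and positions updated on the device

        if(doReneighbour) {
            timer[NEIGH] += reneighbour(&param, &atom, &neighbor); // relies on the positions copied back to the host
        }

        timer[FORCE] += computeForce(doReneighbour, &param, &atom, &neighbor);

        cuda_final_integrate(doReneighbour, &param, &atom);        // second velocity half-step
    }
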
@@ -359,14 +359,18 @@ void sortAtom(Atom* atom) {
 
 #ifdef AOS
     double* new_x = (double*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
+
+    double* new_vx = (double*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
 #else
     double* new_x = (double*) malloc(Nmax * sizeof(MD_FLOAT));
     double* new_y = (double*) malloc(Nmax * sizeof(MD_FLOAT));
     double* new_z = (double*) malloc(Nmax * sizeof(MD_FLOAT));
-#endif
     double* new_vx = (double*) malloc(Nmax * sizeof(MD_FLOAT));
     double* new_vy = (double*) malloc(Nmax * sizeof(MD_FLOAT));
     double* new_vz = (double*) malloc(Nmax * sizeof(MD_FLOAT));
+#endif
+
     double* old_x = atom->x; double* old_y = atom->y; double* old_z = atom->z;
     double* old_vx = atom->vx; double* old_vy = atom->vy; double* old_vz = atom->vz;
 
@@ -380,24 +384,34 @@ void sortAtom(Atom* atom) {
             new_x[new_i * 3 + 0] = old_x[old_i * 3 + 0];
             new_x[new_i * 3 + 1] = old_x[old_i * 3 + 1];
             new_x[new_i * 3 + 2] = old_x[old_i * 3 + 2];
+
+            new_vx[new_i * 3 + 0] = old_vx[old_i * 3 + 0];
+            new_vx[new_i * 3 + 1] = old_vy[old_i * 3 + 1];
+            new_vx[new_i * 3 + 2] = old_vz[old_i * 3 + 2];
 #else
             new_x[new_i] = old_x[old_i];
             new_y[new_i] = old_y[old_i];
             new_z[new_i] = old_z[old_i];
-#endif
             new_vx[new_i] = old_vx[old_i];
             new_vy[new_i] = old_vy[old_i];
             new_vz[new_i] = old_vz[old_i];
+#endif
+
         }
     }
 
     free(atom->x);
     atom->x = new_x;
+
+    free(atom->vx);
+    atom->vx = new_vx;
 #ifndef AOS
     free(atom->y);
     free(atom->z);
     atom->y = new_y; atom->z = new_z;
+
+    free(atom->vy); free(atom->vz);
+    atom->vy = new_vy; atom->vz = new_vz;
 #endif
-    free(atom->vx); free(atom->vy); free(atom->vz);
-    atom->vx = new_vx; atom->vy = new_vy; atom->vz = new_vz;
 }
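
In the AOS branch each atom occupies three consecutive slots in both the position and the velocity buffer, so sorting moves data in triplets. An illustration of the idea for a single atom, with made-up indices and reading all three components through the interleaved vx buffer:

    int old_i = 7, new_i = 2;                            // source and destination slots
    for(int d = 0; d < 3; d++) {
        new_x[new_i * 3 + d]  = old_x[old_i * 3 + d];    // x, y, z
        new_vx[new_i * 3 + d] = old_vx[old_i * 3 + d];   // vx, vy, vz
    }
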
							
								
								
									
src/thermo.c (26 changed lines)
@@ -71,12 +71,9 @@ void setupThermo(Parameter *param, int natoms)
 void computeThermo(int iflag, Parameter *param, Atom *atom)
 {
     MD_FLOAT t = 0.0, p;
-    MD_FLOAT* vx = atom->vx;
-    MD_FLOAT* vy = atom->vy;
-    MD_FLOAT* vz = atom->vz;
 
     for(int i = 0; i < atom->Nlocal; i++) {
-        t += (vx[i] * vx[i] + vy[i] * vy[i] + vz[i] * vz[i]) * param->mass;
+        t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass;
     }
 
     t = t * t_scale;
@@ -101,12 +98,11 @@ void adjustThermo(Parameter *param, Atom *atom)
 {
     /* zero center-of-mass motion */
     MD_FLOAT vxtot = 0.0; MD_FLOAT vytot = 0.0; MD_FLOAT vztot = 0.0;
-    MD_FLOAT* vx = atom->vx; MD_FLOAT* vy = atom->vy; MD_FLOAT* vz = atom->vz;
 
     for(int i = 0; i < atom->Nlocal; i++) {
-        vxtot += vx[i];
-        vytot += vy[i];
-        vztot += vz[i];
+        vxtot += atom_vx(i);
+        vytot += atom_vy(i);
+        vztot += atom_vz(i);
     }
 
     vxtot = vxtot / atom->Natoms;
@@ -114,24 +110,24 @@ void adjustThermo(Parameter *param, Atom *atom)
     vztot = vztot / atom->Natoms;
 
     for(int i = 0; i < atom->Nlocal; i++) {
-        vx[i] -= vxtot;
-        vy[i] -= vytot;
-        vz[i] -= vztot;
+        atom_vx(i) -= vxtot;
+        atom_vy(i) -= vytot;
+        atom_vz(i) -= vztot;
     }
 
     t_act = 0;
     MD_FLOAT t = 0.0;
 
     for(int i = 0; i < atom->Nlocal; i++) {
-        t += (vx[i] * vx[i] + vy[i] * vy[i] + vz[i] * vz[i]) * param->mass;
+        t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass;
     }
 
     t *= t_scale;
     MD_FLOAT factor = sqrt(param->temp / t);
 
     for(int i = 0; i < atom->Nlocal; i++) {
-        vx[i] *= factor;
-        vy[i] *= factor;
-        vz[i] *= factor;
+        atom_vx(i) *= factor;
+        atom_vy(i) *= factor;
+        atom_vz(i) *= factor;
     }
 }
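
adjustThermo first removes the centre-of-mass drift and then rescales all velocities so the kinetic temperature matches param->temp. A worked illustration of the rescaling step with made-up numbers:

    MD_FLOAT t = 1.6;                         // measured temperature after drift removal (example value)
    MD_FLOAT factor = sqrt(param->temp / t);  // with param->temp = 0.8 this is sqrt(0.5) ≈ 0.707
    // every atom_vx/vy/vz component is multiplied by factor, halving the kinetic energy
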