Initial refactoring of CUDA code
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
@@ -38,6 +38,10 @@ XTC_OUTPUT ?= false
 # Check if cj is local when decreasing reaction force
 HALF_NEIGHBOR_LISTS_CHECK_CJ ?= false
 
+# Configurations for CUDA
+# Use CUDA host memory to optimize transfers
+USE_CUDA_HOST_MEMORY ?= false
+
 #Feature options
 OPTIONS =  -DALIGNMENT=64
 #OPTIONS +=  More options

@@ -49,36 +49,61 @@ void *allocate(int alignment, size_t bytesize) {
     return ptr;
 }
 
-void *reallocate(void* ptr, int alignment, size_t newBytesize, size_t oldBytesize) {
-    void *newarray = allocate(alignment, newBytesize);
-
+void *reallocate(void* ptr, int alignment, size_t new_bytesize, size_t old_bytesize) {
+    void *newarray = allocate(alignment, new_bytesize);
     if(ptr != NULL) {
-        memcpy(newarray, ptr, oldBytesize);
+        memcpy(newarray, ptr, old_bytesize);
         free(ptr);
     }
 
     return newarray;
 }
 
 #ifndef CUDA_TARGET
-void *allocate_gpu(int alignment, size_t bytesize) { return NULL; }
-void *reallocate_gpu(void *ptr, int alignment, size_t newBytesize, size_t oldBytesize) { return NULL; }
+void *allocateGPU(size_t bytesize) { return NULL; }
+void *reallocateGPU(void *ptr, size_t new_bytesize) { return NULL; }
+void memcpyToGPU(void *d_ptr, void *h_ptr, size_t bytesize) {}
+void memcpyFromGPU(void *h_ptr, void *d_ptr, size_t bytesize) {}
+void memsetGPU(void *d_ptr, int value, size_t bytesize) {}
 #else
 #include <cuda_runtime.h>
 #include <cuda_atom.h>
-void *allocate_gpu(int alignment, size_t bytesize) {
+void *allocateGPU(size_t bytesize) {
     void *ptr;
-    checkCUDAError("allocate", cudaMallocHost((void **) &ptr, bytesize));
+    #ifdef CUDA_HOST_MEMORY
+    cuda_assert("allocateGPU", cudaMallocHost((void **) &ptr, bytesize));
+    #else
+    cuda_assert("allocateGPU", cudaMalloc((void **) &ptr, bytesize));
+    #endif
     return ptr;
 }
 
 // Data is not preserved
-void *reallocate_gpu(void *ptr, int alignment, size_t newBytesize, size_t oldBytesize) {
-    void *newarray = allocate_gpu(alignment, newBytesize);
-
+void *reallocateGPU(void *ptr, size_t new_bytesize) {
     if(ptr != NULL) {
+        #ifdef CUDA_HOST_MEMORY
         cudaFreeHost(ptr);
+        #else
+        cudaFree(ptr);
+        #endif
     }
 
-    return newarray;
+    return allocateGPU(new_bytesize);
+}
+
+void memcpyToGPU(void *d_ptr, void *h_ptr, size_t bytesize) {
+    #ifndef CUDA_HOST_MEMORY
+    cuda_assert("memcpyToGPU", cudaMemcpy(d_ptr, h_ptr, bytesize, cudaMemcpyHostToDevice));
+    #endif
+}
+
+void memcpyFromGPU(void *h_ptr, void *d_ptr, size_t bytesize) {
+    #ifndef CUDA_HOST_MEMORY
+    cuda_assert("memcpyFromGPU", cudaMemcpy(h_ptr, d_ptr, bytesize, cudaMemcpyDeviceToHost));
+    #endif
+}
+
+void memsetGPU(void *d_ptr, int value, size_t bytesize) {
+    cuda_assert("memsetGPU", cudaMemset(d_ptr, value, bytesize));
 }
 #endif

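Note on the hunk above: allocateGPU, reallocateGPU, memcpyToGPU, memcpyFromGPU and memsetGPU form the commit's whole GPU memory interface, with no-op stubs when CUDA_TARGET is not defined and a CUDA_HOST_MEMORY switch that allocates pinned host memory via cudaMallocHost and compiles the explicit copies out. A minimal usage sketch, assuming the declarations from cuda_atom.h; the function and buffer names (roundtripExample, h_buf, n) are illustrative and not part of the commit:

    // Sketch only: round-trip a host buffer through the new wrappers.
    void roundtripExample(MD_FLOAT *h_buf, int n) {
        size_t bytesize = (size_t) n * sizeof(MD_FLOAT);
        MD_FLOAT *d_buf = (MD_FLOAT *) allocateGPU(bytesize);  // cudaMalloc, or cudaMallocHost with CUDA_HOST_MEMORY
        memcpyToGPU(d_buf, h_buf, bytesize);                   // compiled out in the host-memory configuration
        /* ... kernels operating on d_buf ... */
        memcpyFromGPU(h_buf, d_buf, bytesize);                 // copy results back to the host
    }
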
@@ -71,14 +71,6 @@ void createAtom(Atom *atom, Parameter *param) {
     atom->Natoms = 4 * param->nx * param->ny * param->nz;
     atom->Nlocal = 0;
     atom->ntypes = param->ntypes;
-
-#ifdef CUDA_TARGET
-    checkCUDAError( "atom->epsilon cudaMallocHost", cudaMallocHost((void**)&(atom->epsilon), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
-    checkCUDAError( "atom->sigma6 cudaMallocHost", cudaMallocHost((void**)&(atom->sigma6), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
-    checkCUDAError( "atom->cutforcesq cudaMallocHost", cudaMallocHost((void**)&(atom->cutforcesq), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
-    checkCUDAError( "atom->cutneighsq cudaMallocHost", cudaMallocHost((void**)&(atom->cutneighsq), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
-#endif
-
     atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
     atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
     atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));

@@ -37,38 +37,29 @@ void initCuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor
     c_atom->Nghost = atom->Nghost;
     c_atom->Nmax = atom->Nmax;
     c_atom->ntypes = atom->ntypes;
-
     c_atom->border_map = NULL;
 
-    const int Nlocal = atom->Nlocal;
-
-    checkCUDAError( "c_atom->x malloc", cudaMalloc((void**)&(c_atom->x), sizeof(MD_FLOAT) * atom->Nmax * 3) );
-    checkCUDAError( "c_atom->x memcpy", cudaMemcpy(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
-    checkCUDAError( "c_atom->fx malloc", cudaMalloc((void**)&(c_atom->fx), sizeof(MD_FLOAT) * Nlocal * 3) );
-    checkCUDAError( "c_atom->vx malloc", cudaMalloc((void**)&(c_atom->vx), sizeof(MD_FLOAT) * Nlocal * 3) );
-    checkCUDAError( "c_atom->vx memcpy", cudaMemcpy(c_atom->vx, atom->vx, sizeof(MD_FLOAT) * Nlocal * 3, cudaMemcpyHostToDevice) );
-
-    checkCUDAError( "c_atom->type malloc", cudaMalloc((void**)&(c_atom->type), sizeof(int) * atom->Nmax) );
-    checkCUDAError( "c_atom->epsilon malloc", cudaMalloc((void**)&(c_atom->epsilon), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
-    checkCUDAError( "c_atom->sigma6 malloc", cudaMalloc((void**)&(c_atom->sigma6), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
-    checkCUDAError( "c_atom->cutforcesq malloc", cudaMalloc((void**)&(c_atom->cutforcesq), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
-
-    checkCUDAError( "c_neighbor->neighbors malloc", cudaMalloc((void**)&c_neighbor->neighbors, sizeof(int) * Nlocal * neighbor->maxneighs) );
-    checkCUDAError( "c_neighbor->numneigh malloc", cudaMalloc((void**)&c_neighbor->numneigh, sizeof(int) * Nlocal) );
-
-    checkCUDAError( "c_atom->type memcpy", cudaMemcpy(c_atom->type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) );
-    checkCUDAError( "c_atom->sigma6 memcpy", cudaMemcpy(c_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
-    checkCUDAError( "c_atom->epsilon memcpy", cudaMemcpy(c_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
-
-    checkCUDAError( "c_atom->cutforcesq memcpy", cudaMemcpy(c_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
+    c_atom->x               =   (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->Nmax * 3);
+    c_atom->vx              =   (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->Nmax * 3);
+    c_atom->fx              =   (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->Nmax * 3);
+    c_atom->epsilon         =   (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    c_atom->sigma6          =   (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    c_atom->cutforcesq      =   (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    c_atom->type            =   (int *) allocateGPU(sizeof(int) * atom->Nmax * 3);
+    c_neighbor->neighbors   =   (int *) allocateGPU(sizeof(int) * atom->Nmax * neighbor->maxneighs);
+    c_neighbor->numneigh    =   (int *) allocateGPU(sizeof(int) * atom->Nmax);
+
+    memcpyToGPU(c_atom->x,              atom->x,          sizeof(MD_FLOAT) * atom->Nmax * 3);
+    memcpyToGPU(c_atom->vx,             atom->vx,         sizeof(MD_FLOAT) * atom->Nmax * 3);
+    memcpyToGPU(c_atom->sigma6,         atom->sigma6,     sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    memcpyToGPU(c_atom->epsilon,        atom->epsilon,    sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    memcpyToGPU(c_atom->cutforcesq,     atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    memcpyToGPU(c_atom->type,           atom->type,       sizeof(int) * atom->Nmax);
 }
 
-void checkCUDAError(const char *msg, cudaError_t err) {
+void cuda_assert(const char *label, cudaError_t err) {
     if (err != cudaSuccess) {
-        //print a human readable error message
-        printf("[CUDA ERROR %s]: %s\r\n", msg, cudaGetErrorString(err));
+        printf("[CUDA Error]: %s: %s\r\n", label, cudaGetErrorString(err));
         exit(-1);
     }
 }

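The renamed cuda_assert helper above is the single error-checking path for the refactored code: it takes a label and the cudaError_t returned by a runtime call, prints both on failure and exits. Because kernel launches do not return an error code, every launch in this commit is followed by cudaPeekAtLastError (launch-time errors) and cudaDeviceSynchronize (errors raised while the kernel runs). A small sketch of that pattern; dummyKernel and launchChecked are placeholders, not code from the commit:

    // Sketch only: the checking pattern mirrors the launches in this commit.
    __global__ void dummyKernel(int *out) { if(threadIdx.x == 0) { *out = 1; } }

    void launchChecked(int *d_out, int num_blocks, int num_threads_per_block) {
        dummyKernel<<<num_blocks, num_threads_per_block>>>(d_out);
        cuda_assert("dummyKernel launch", cudaPeekAtLastError());    // bad configuration, missing device, ...
        cuda_assert("dummyKernel sync", cudaDeviceSynchronize());    // faults raised during execution
    }
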
@@ -33,6 +33,7 @@
 
 extern "C" {
 
+#include <allocate.h>
 #include <atom.h>
 #include <cuda_atom.h>
 #include <allocate.h>
@@ -123,33 +124,31 @@ __global__ void kernel_final_integrate(MD_FLOAT dtforce, int Nlocal, Atom a) {
 
 extern "C" {
 
-void finalIntegrate_cuda(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom) {
+void finalIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom, Atom *c_atom) {
     const int Nlocal = atom->Nlocal;
     const int num_threads_per_block = get_num_threads();
     const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
 
     kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, *c_atom);
+    cuda_assert("kernel_final_integrate", cudaPeekAtLastError());
+    cuda_assert("kernel_final_integrate", cudaDeviceSynchronize());
 
-    checkCUDAError( "PeekAtLastError FinalIntegrate", cudaPeekAtLastError() );
-    checkCUDAError( "DeviceSync FinalIntegrate", cudaDeviceSynchronize() );
-
-    if(doReneighbour) {
-        checkCUDAError( "FinalIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom->vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
+    if(reneigh) {
+        memcpyFromGPU(atom->vx, c_atom->vx, sizeof(MD_FLOAT) * atom->Nlocal * 3);
     }
 }
 
-void initialIntegrate_cuda(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom) {
+void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom, Atom *c_atom) {
     const int Nlocal = atom->Nlocal;
     const int num_threads_per_block = get_num_threads();
     const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
 
     kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, *c_atom);
+    cuda_assert("kernel_initial_integrate", cudaPeekAtLastError());
+    cuda_assert("kernel_initial_integrate", cudaDeviceSynchronize());
 
-    checkCUDAError( "PeekAtLastError InitialIntegrate", cudaPeekAtLastError() );
-    checkCUDAError( "DeviceSync InitialIntegrate", cudaDeviceSynchronize() );
-
-    if(doReneighbour) {
-        checkCUDAError( "InitialIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom->vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
+    if(reneigh) {
+        memcpyFromGPU(atom->vx, c_atom->vx, sizeof(MD_FLOAT) * atom->Nlocal * 3);
     }
 }
 
@@ -176,26 +175,20 @@ double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neig
 
 
     // HINT: Run with cuda-memcheck ./MDBench-NVCC in case of error
-
     // checkCUDAError( "c_atom->fx memset", cudaMemset(c_atom->fx, 0, sizeof(MD_FLOAT) * Nlocal * 3) );
 
     cudaProfilerStart();
-
     const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
-
     double S = getTimeStamp();
     LIKWID_MARKER_START("force");
 
     calc_force <<< num_blocks, num_threads_per_block >>> (*c_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, c_neighbor->neighbors, c_neighbor->numneigh);
-    checkCUDAError( "PeekAtLastError ComputeForce", cudaPeekAtLastError() );
-    checkCUDAError( "DeviceSync ComputeForce", cudaDeviceSynchronize() );
-
+    cuda_assert("calc_force", cudaPeekAtLastError());
+    cuda_assert("calc_force", cudaDeviceSynchronize());
     cudaProfilerStop();
 
     LIKWID_MARKER_STOP("force");
     double E = getTimeStamp();
-
     return E-S;
 }
 

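A side effect of the cuda_assert calls in computeForceLJFullNeigh_cuda worth spelling out: a kernel launch returns immediately, so the cudaDeviceSynchronize is what makes the E-S interval (and the LIKWID "force" region) cover the kernel's execution rather than just its launch. Schematically, with a placeholder kernel name:

    // Sketch only: placeholderKernel stands in for calc_force.
    double S = getTimeStamp();
    placeholderKernel<<<num_blocks, num_threads_per_block>>>(/* arguments */);
    cuda_assert("placeholderKernel", cudaPeekAtLastError());
    cuda_assert("placeholderKernel", cudaDeviceSynchronize());  // wait for the GPU before reading the clock
    double E = getTimeStamp();                                  // E - S now includes kernel execution time
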
@@ -199,30 +199,25 @@ void binatoms_cuda(Atom *c_atom, Binning *c_binning, int *c_resize_needed, Neigh
 
     while(resize > 0) {
         resize = 0;
-        checkCUDAError("binatoms_cuda c_binning->bincount memset", cudaMemset(c_binning->bincount, 0, c_binning->mbins * sizeof(int)));
-        checkCUDAError("binatoms_cuda c_resize_needed memset", cudaMemset(c_resize_needed, 0, sizeof(int)) );
+        memsetGPU(c_binning->bincount, 0, c_binning->mbins * sizeof(int));
+        memsetGPU(c_resize_needed, 0, sizeof(int));
 
-        /*binatoms_kernel(Atom a, int* bincount, int* bins, int c_binning->atoms_per_bin, Neighbor_params np, int *resize_needed) */
         binatoms_kernel<<<num_blocks, threads_per_block>>>(*c_atom, c_binning->bincount, c_binning->bins, c_binning->atoms_per_bin, *np, c_resize_needed);
+        cuda_assert("binatoms", cudaPeekAtLastError());
+        cuda_assert("binatoms", cudaDeviceSynchronize());
 
-        checkCUDAError( "PeekAtLastError binatoms kernel", cudaPeekAtLastError() );
-        checkCUDAError( "DeviceSync binatoms kernel", cudaDeviceSynchronize() );
-
-        checkCUDAError("binatoms_cuda c_resize_needed memcpy back", cudaMemcpy(&resize, c_resize_needed, sizeof(int), cudaMemcpyDeviceToHost) );
-
+        memcpyFromGPU(&resize, c_resize_needed, sizeof(int));
         if(resize) {
-            cudaFree(c_binning->bins);
             c_binning->atoms_per_bin *= 2;
-            checkCUDAError("binatoms_cuda c_binning->bins resize malloc", cudaMalloc(&c_binning->bins, c_binning->mbins * c_binning->atoms_per_bin * sizeof(int)) );
+            c_binning->bins = (int *) reallocateGPU(c_binning->bins, c_binning->mbins * c_binning->atoms_per_bin * sizeof(int));
         }
     }
 
     atoms_per_bin = c_binning->atoms_per_bin;
-    const int sortBlocks = ceil((float)mbins / (float)threads_per_block);
-    /*void sort_bin_contents_kernel(int* bincount, int* bins, int mbins, int atoms_per_bin)*/
+    const int sortBlocks = ceil((float) mbins / (float) threads_per_block);
     sort_bin_contents_kernel<<<sortBlocks, threads_per_block>>>(c_binning->bincount, c_binning->bins, c_binning->mbins, c_binning->atoms_per_bin);
-    checkCUDAError( "PeekAtLastError sort_bin_contents kernel", cudaPeekAtLastError() );
-    checkCUDAError( "DeviceSync sort_bin_contents kernel", cudaDeviceSynchronize() );
+    cuda_assert("sort_bin", cudaPeekAtLastError());
+    cuda_assert("sort_bin", cudaDeviceSynchronize());
 }
 
 void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
@@ -231,18 +226,18 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *
     c_neighbor->maxneighs = neighbor->maxneighs;
 
     cudaProfilerStart();
-    /* upload stencil */
+
     // TODO move all of this initialization into its own method
-    if(c_stencil == NULL){
-        checkCUDAError( "buildNeighbor c_n_stencil malloc", cudaMalloc((void**)&c_stencil, nstencil * sizeof(int)) );
-        checkCUDAError( "buildNeighbor c_n_stencil memcpy", cudaMemcpy(c_stencil, stencil, nstencil * sizeof(int), cudaMemcpyHostToDevice ));
+    if(c_stencil == NULL) {
+        c_stencil = (int *) allocateGPU(nstencil * sizeof(int));
+        memcpyToGPU(c_stencil, stencil, nstencil * sizeof(int));
     }
 
-    if(c_binning.mbins == 0){
+    if(c_binning.mbins == 0) {
         c_binning.mbins = mbins;
         c_binning.atoms_per_bin = atoms_per_bin;
-        checkCUDAError( "buildNeighbor c_binning->bincount malloc", cudaMalloc((void**)&(c_binning.bincount), c_binning.mbins * sizeof(int)) );
-        checkCUDAError( "buidlNeighbor c_binning->bins malloc", cudaMalloc((void**)&(c_binning.bins), c_binning.mbins * c_binning.atoms_per_bin * sizeof(int)) );
+        c_binning.bincount = (int *) allocateGPU(c_binning.mbins * sizeof(int));
+        c_binning.bins = (int *) allocateGPU(c_binning.mbins * c_binning.atoms_per_bin * sizeof(int));
     }
 
     Neighbor_params np {
@@ -263,14 +258,14 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *
         .mbinz = mbinz
     };
 
-    if(c_resize_needed == NULL){
-        checkCUDAError("buildNeighbor c_resize_needed malloc", cudaMalloc((void**)&c_resize_needed, sizeof(int)) );
+    if(c_resize_needed == NULL) {
+        c_resize_needed = (int *) allocateGPU(sizeof(int));
     }
 
     /* bin local & ghost atoms */
     binatoms_cuda(c_atom, &c_binning, c_resize_needed, &np, num_threads_per_block);
-    if(c_new_maxneighs == NULL){
-        checkCUDAError("c_new_maxneighs malloc", cudaMalloc((void**)&c_new_maxneighs, sizeof(int) ));
+    if(c_new_maxneighs == NULL) {
+        c_new_maxneighs = (int *) allocateGPU(sizeof(int));
     }
 
     int resize = 1;
@@ -278,35 +273,26 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *
     /* extend c_neighbor arrays if necessary */
     if(nall > nmax) {
         nmax = nall;
-        if(c_neighbor->numneigh) cudaFree(c_neighbor->numneigh);
-        if(c_neighbor->neighbors) cudaFree(c_neighbor->neighbors);
-        checkCUDAError( "buildNeighbor c_numneigh malloc", cudaMalloc((void**)&(c_neighbor->numneigh), nmax * sizeof(int)) );
-        checkCUDAError( "buildNeighbor c_neighbors malloc", cudaMalloc((void**)&(c_neighbor->neighbors), nmax * c_neighbor->maxneighs * sizeof(int)) );
+        c_neighbor->neighbors = (int *) reallocateGPU(c_neighbor->neighbors, nmax * c_neighbor->maxneighs * sizeof(int));
+        c_neighbor->numneigh  = (int *) reallocateGPU(c_neighbor->numneigh,  nmax * sizeof(int));
     }
 
     /* loop over each atom, storing neighbors */
     while(resize) {
         resize = 0;
-
-        checkCUDAError("c_new_maxneighs memset", cudaMemset(c_new_maxneighs, 0, sizeof(int) ));
-
-        // TODO call compute_neigborhood kernel here
+        memsetGPU(c_new_maxneighs, 0, sizeof(int));
         const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
-        /*compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np, int nstencil, int* stencil,
-                                     int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs)
-         * */
         compute_neighborhood<<<num_blocks, num_threads_per_block>>>(*c_atom, *c_neighbor,
                                                                     np, nstencil, c_stencil,
                                                                     c_binning.bins, c_binning.atoms_per_bin, c_binning.bincount,
                                                                     c_new_maxneighs,
                                                                     cutneighsq);
 
-        checkCUDAError( "PeekAtLastError ComputeNeighbor", cudaPeekAtLastError() );
-        checkCUDAError( "DeviceSync ComputeNeighbor", cudaDeviceSynchronize() );
+        cuda_assert("compute_neighborhood", cudaPeekAtLastError());
+        cuda_assert("compute_neighborhood", cudaDeviceSynchronize());
 
-        // TODO copy the value of c_new_maxneighs back to host and check if it has been modified
         int new_maxneighs;
-        checkCUDAError("c_new_maxneighs memcpy back", cudaMemcpy(&new_maxneighs, c_new_maxneighs, sizeof(int), cudaMemcpyDeviceToHost));
+        memcpyFromGPU(&new_maxneighs, c_new_maxneighs, sizeof(int));
         if (new_maxneighs > c_neighbor->maxneighs){
             resize = 1;
         }
@@ -315,8 +301,7 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *
             printf("RESIZE %d\n", c_neighbor->maxneighs);
             c_neighbor->maxneighs = new_maxneighs * 1.2;
             printf("NEW SIZE %d\n", c_neighbor->maxneighs);
-            cudaFree(c_neighbor->neighbors);
-            checkCUDAError("c_neighbor->neighbors resize malloc", cudaMalloc((void**)(&c_neighbor->neighbors), c_atom->Nmax * c_neighbor->maxneighs * sizeof(int)));
+            c_neighbor->neighbors = (int *) reallocateGPU(c_neighbor->neighbors, c_atom->Nmax * c_neighbor->maxneighs * sizeof(int));
         }
 
     }

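The buildNeighbor_cuda hunks above keep the original overflow protocol, now expressed through the new wrappers: the kernel records the largest per-atom neighbor count in c_new_maxneighs, the host copies that single int back, and if it exceeds the current capacity the neighbor array is grown by 20% and the kernel is rerun. A condensed sketch of that control flow (not a literal excerpt; arguments and some error checks omitted):

    // Condensed sketch of the resize loop in buildNeighbor_cuda.
    int resize = 1;
    while(resize) {
        resize = 0;
        memsetGPU(c_new_maxneighs, 0, sizeof(int));
        compute_neighborhood<<<num_blocks, num_threads_per_block>>>(/* ... */);
        cuda_assert("compute_neighborhood", cudaDeviceSynchronize());

        int new_maxneighs;
        memcpyFromGPU(&new_maxneighs, c_new_maxneighs, sizeof(int));
        if(new_maxneighs > c_neighbor->maxneighs) {
            resize = 1;
            c_neighbor->maxneighs = new_maxneighs * 1.2;
            c_neighbor->neighbors = (int *) reallocateGPU(c_neighbor->neighbors,
                                            c_atom->Nmax * c_neighbor->maxneighs * sizeof(int));
        }
    }
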
@@ -39,7 +39,7 @@ extern int *PBCx, *PBCy, *PBCz;
 static int c_NmaxGhost;
 static int *c_PBCx, *c_PBCy, *c_PBCz;
 
-__global__ void computeAtomsPbcUpdate(Atom a, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){
+__global__ void computeAtomsPbcUpdate(Atom a, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd) {
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
     Atom* atom = &a;
     if(i >= atom->Nlocal) {
@@ -94,30 +94,25 @@ void updatePbc_cuda(Atom *atom, Atom *c_atom, Parameter *param, bool doReneighbo
 
         if (atom->Nmax > c_atom->Nmax){ // the number of ghost atoms has increased -> more space is needed
             c_atom->Nmax = atom->Nmax;
-            if(c_atom->x != NULL){ cudaFree(c_atom->x); }
-            if(c_atom->type != NULL){ cudaFree(c_atom->type); }
-            checkCUDAError( "updatePbc c_atom->x malloc", cudaMalloc((void**)&(c_atom->x), sizeof(MD_FLOAT) * atom->Nmax * 3) );
-            checkCUDAError( "updatePbc c_atom->type malloc", cudaMalloc((void**)&(c_atom->type), sizeof(int) * atom->Nmax) );
+            c_atom->x = (MD_FLOAT *) reallocateGPU(c_atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
+            c_atom->type = (int *) reallocateGPU(c_atom->type, sizeof(int) * atom->Nmax);
         }
-        // TODO if the sort is reactivated the atom->vx needs to be copied to GPU as well
-        checkCUDAError( "updatePbc c_atom->x memcpy", cudaMemcpy(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
-        checkCUDAError( "updatePbc c_atom->type memcpy", cudaMemcpy(c_atom->type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) );
 
-        if(c_NmaxGhost < NmaxGhost){
+        memcpyToGPU(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
+        memcpyToGPU(c_atom->type, atom->type, sizeof(int) * atom->Nmax);
+
+        if(c_NmaxGhost < NmaxGhost) {
             c_NmaxGhost = NmaxGhost;
-            if(c_PBCx != NULL){ cudaFree(c_PBCx); }
-            if(c_PBCy != NULL){ cudaFree(c_PBCy); }
-            if(c_PBCz != NULL){ cudaFree(c_PBCz); }
-            if(c_atom->border_map != NULL){ cudaFree(c_atom->border_map); }
-            checkCUDAError( "updatePbc c_PBCx malloc", cudaMalloc((void**)&c_PBCx, NmaxGhost * sizeof(int)) );
-            checkCUDAError( "updatePbc c_PBCy malloc", cudaMalloc((void**)&c_PBCy, NmaxGhost * sizeof(int)) );
-            checkCUDAError( "updatePbc c_PBCz malloc", cudaMalloc((void**)&c_PBCz, NmaxGhost * sizeof(int)) );
-            checkCUDAError( "updatePbc c_atom->border_map malloc", cudaMalloc((void**)&(c_atom->border_map), NmaxGhost * sizeof(int)) );
+            c_PBCx = (int *) reallocateGPU(c_PBCx, NmaxGhost * sizeof(int));
+            c_PBCy = (int *) reallocateGPU(c_PBCy, NmaxGhost * sizeof(int));
+            c_PBCz = (int *) reallocateGPU(c_PBCz, NmaxGhost * sizeof(int));
+            c_atom->border_map = (int *) reallocateGPU(c_atom->border_map, NmaxGhost * sizeof(int));
         }
-        checkCUDAError( "updatePbc c_PBCx memcpy", cudaMemcpy(c_PBCx, PBCx, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
-        checkCUDAError( "updatePbc c_PBCy memcpy", cudaMemcpy(c_PBCy, PBCy, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
-        checkCUDAError( "updatePbc c_PBCz memcpy", cudaMemcpy(c_PBCz, PBCz, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
-        checkCUDAError( "updatePbc c_atom->border_map memcpy", cudaMemcpy(c_atom->border_map, atom->border_map, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
+
+        memcpyToGPU(c_PBCx, PBCx, NmaxGhost * sizeof(int));
+        memcpyToGPU(c_PBCy, PBCy, NmaxGhost * sizeof(int));
+        memcpyToGPU(c_PBCz, PBCz, NmaxGhost * sizeof(int));
+        memcpyToGPU(c_atom->border_map, atom->border_map, NmaxGhost * sizeof(int));
     }
 
     MD_FLOAT xprd = param->xprd;
@@ -125,26 +120,20 @@ void updatePbc_cuda(Atom *atom, Atom *c_atom, Parameter *param, bool doReneighbo
     MD_FLOAT zprd = param->zprd;
 
     const int num_blocks = ceil((float)atom->Nghost / (float)num_threads_per_block);
-
-    /*__global__ void computePbcUpdate(Atom a, int* PBCx, int* PBCy, int* PBCz,
-     *                                                          MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd)
-     * */
     computePbcUpdate<<<num_blocks, num_threads_per_block>>>(*c_atom, c_PBCx, c_PBCy, c_PBCz, xprd, yprd, zprd);
-    checkCUDAError( "PeekAtLastError UpdatePbc", cudaPeekAtLastError() );
-    checkCUDAError( "DeviceSync UpdatePbc", cudaDeviceSynchronize() );
+    cuda_assert("computePbcUpdate", cudaPeekAtLastError());
+    cuda_assert("computePbcUpdate", cudaDeviceSynchronize());
 }
 
-void updateAtomsPbc_cuda(Atom* atom, Atom *c_atom, Parameter *param){
+void updateAtomsPbc_cuda(Atom* atom, Atom *c_atom, Parameter *param) {
     const int num_threads_per_block = get_num_threads();
     MD_FLOAT xprd = param->xprd;
     MD_FLOAT yprd = param->yprd;
     MD_FLOAT zprd = param->zprd;
 
     const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
-    /*void computeAtomsPbcUpdate(Atom a, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd)*/
     computeAtomsPbcUpdate<<<num_blocks, num_threads_per_block>>>(*c_atom, xprd, yprd, zprd);
-    checkCUDAError( "PeekAtLastError UpdateAtomsPbc", cudaPeekAtLastError() );
-    checkCUDAError( "DeviceSync UpdateAtomsPbc", cudaDeviceSynchronize() );
-    checkCUDAError( "updateAtomsPbc position memcpy back", cudaMemcpy(atom->x, c_atom->x, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
+    cuda_assert("computeAtomsPbcUpdate", cudaPeekAtLastError());
+    cuda_assert("computeAtomsPbcUpdate", cudaDeviceSynchronize());
+    memcpyFromGPU(atom->x, c_atom->x, sizeof(MD_FLOAT) * atom->Nlocal * 3);
 }

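One thing to keep in mind for the PBC hunks above: reallocateGPU is documented in allocate.c as not preserving data (it frees the old buffer and returns a fresh allocation), which is why updatePbc_cuda still uploads atom->x, atom->type and the PBC index arrays with memcpyToGPU after the resize blocks instead of relying on the previous device contents. The rule in isolation, with illustrative names (h_map, d_map and new_count are not from the commit):

    // Sketch only: after reallocateGPU the previous device contents are gone.
    d_map = (int *) reallocateGPU(d_map, new_count * sizeof(int));
    memcpyToGPU(d_map, h_map, new_count * sizeof(int));  // the host copy must be pushed again
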
@@ -6,5 +6,10 @@
 #ifndef __CUDA_ATOM_H_
 #define __CUDA_ATOM_H_
 extern void initCuda(Atom*, Neighbor*, Atom*, Neighbor*);
-extern void checkCUDAError(const char *msg, cudaError_t err);
+extern void cuda_assert(const char *msg, cudaError_t err);
+extern void *allocateGPU(size_t bytesize);
+extern void *reallocateGPU(void *ptr, size_t new_bytesize);
+extern void memcpyToGPU(void *d_ptr, void *h_ptr, size_t bytesize);
+extern void memcpyFromGPU(void *h_ptr, void *d_ptr, size_t bytesize);
+extern void memsetGPU(void *d_ptr, int value, size_t bytesize);
 #endif

@@ -261,9 +261,8 @@ int main(int argc, char** argv) {
 
         if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
             #ifdef CUDA_TARGET
-            checkCUDAError("computeThermo atom->x memcpy back", cudaMemcpy(atom.x, c_atom.x, atom.Nmax * sizeof(MD_FLOAT) * 3, cudaMemcpyDeviceToHost));
+            memcpyFromGPU(atom.x, c_atom.x, atom.Nmax * sizeof(MD_FLOAT) * 3);
             #endif
-
             computeThermo(n + 1, &param, &atom);
         }
 