From c49278cb21e3b0cc26f873ef382b80ca578b799c Mon Sep 17 00:00:00 2001
From: Martin Bauernfeind
Date: Sun, 26 Jun 2022 16:25:59 +0200
Subject: [PATCH] First crude attempt at parallelizing neighborhood
 computation (only the part after binning the atoms is parallelized with
 CUDA)

---
 src/force.cu            |  35 +++--
 src/includes/neighbor.h |   1 +
 src/main.c              |  50 +++++++++++++++-----
 src/neighbor.cu         | 102 +++++++++++++++++++++++++++++++++-------
 4 files changed, 130 insertions(+), 58 deletions(-)

diff --git a/src/force.cu b/src/force.cu
index e7ef535..6f27d56 100644
--- a/src/force.cu
+++ b/src/force.cu
@@ -126,25 +126,9 @@ extern "C" {
 
-int get_num_threads() {
-
-    const char *num_threads_env = getenv("NUM_THREADS");
-    int num_threads = 0;
-    if(num_threads_env == nullptr)
-        num_threads = 32;
-    else {
-        num_threads = atoi(num_threads_env);
-    }
-
-    return num_threads;
-}
-
-void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom) {
+void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom, const int num_threads_per_block) {
     const int Nlocal = atom->Nlocal;
-    const int num_threads = get_num_threads();
-
-    const int num_threads_per_block = num_threads; // this should be a multiple of 32 as operations are performed at the level of warps
     const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
 
     kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, *c_atom);
@@ -157,12 +141,9 @@ void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom
     }
 }
 
-void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom) {
+void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom, const int num_threads_per_block) {
     const int Nlocal = atom->Nlocal;
-    const int num_threads = get_num_threads();
-
-    const int num_threads_per_block = num_threads; // this should be a multiple of 32 as operations are performed at the level of warps
     const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
 
     kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, *c_atom);
@@ -182,7 +163,8 @@ double computeForce(
     Atom *atom,
     Neighbor *neighbor,
     Atom *c_atom,
-    Neighbor *c_neighbor
+    Neighbor *c_neighbor,
+    int num_threads_per_block
 ) {
     int Nlocal = atom->Nlocal;
@@ -192,8 +174,6 @@ double computeForce(
     MD_FLOAT epsilon = param->epsilon;
 #endif
 
-    const int num_threads = get_num_threads();
-
     c_atom->Natoms = atom->Natoms;
     c_atom->Nlocal = atom->Nlocal;
     c_atom->Nghost = atom->Nghost;
@@ -219,14 +199,11 @@ double computeForce(
 
     cudaProfilerStart();
 
-    checkCUDAError( "c_atom->x memcpy", cudaMemcpy(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
-    if(reneighbourHappenend) {
-        checkCUDAError( "c_neighbor->numneigh memcpy", cudaMemcpy(c_neighbor->numneigh, neighbor->numneigh, sizeof(int) * Nlocal, cudaMemcpyHostToDevice) );
-        checkCUDAError( "c_neighbor->neighbors memcpy", cudaMemcpy(c_neighbor->neighbors, neighbor->neighbors, sizeof(int) * Nlocal * neighbor->maxneighs, cudaMemcpyHostToDevice) );
+    if(!reneighbourHappenend) {
+        checkCUDAError( "c_atom->x memcpy", cudaMemcpy(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
     }
 
-    const int num_threads_per_block = num_threads; // this should be a multiple of 32 as operations are performed at the level of warps
     const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
 
     double S = getTimeStamp();
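Note on the launch configuration: all three launch sites above compute the grid size as ceil((float)N / (float)threads). A float-free integer ceiling division is a common alternative; a minimal sketch (the helper name ceil_div is illustrative, not part of this patch):

    // Integer ceiling division: rounds N/threads up without a float round-trip.
    static inline int ceil_div(int n, int threads_per_block) {
        return (n + threads_per_block - 1) / threads_per_block;
    }

    // e.g.  const int num_blocks = ceil_div(Nlocal, num_threads_per_block);

For the atom counts used here both forms agree; the integer form simply cannot mis-round once N exceeds the 24-bit float mantissa (about 16.7 million).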
diff --git a/src/includes/neighbor.h b/src/includes/neighbor.h
index 17b7bb4..014ece2 100644
--- a/src/includes/neighbor.h
+++ b/src/includes/neighbor.h
@@ -46,4 +46,5 @@ extern void setupNeighbor();
 extern void binatoms(Atom*);
 extern void buildNeighbor(Atom*, Neighbor*);
 extern void sortAtom(Atom*);
+extern void buildNeighbor_cuda(Atom*, Neighbor*, Atom*, Neighbor*, const int);
 #endif
diff --git a/src/main.c b/src/main.c
index 35537df..30fb374 100644
--- a/src/main.c
+++ b/src/main.c
@@ -45,10 +45,14 @@
 
 #define HLINE "----------------------------------------------------------------------------\n"
 
-extern void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom);
-extern void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom);
+extern void cuda_final_integrate(bool doReneighbour, Parameter *param,
+                                 Atom *atom, Atom *c_atom,
+                                 const int num_threads_per_block);
+extern void cuda_initial_integrate(bool doReneighbour, Parameter *param,
+                                   Atom *atom, Atom *c_atom,
+                                   const int num_threads_per_block);
 
-extern double computeForce(bool, Parameter*, Atom*, Neighbor*, Atom*, Neighbor*);
+extern double computeForce(bool, Parameter*, Atom*, Neighbor*, Atom*, Neighbor*, const int);
 extern double computeForceTracing(Parameter*, Atom*, Neighbor*, Stats*, int, int);
 extern double computeForceEam(Eam* eam, Parameter*, Atom *atom, Neighbor *neighbor, Stats *stats, int first_exec, int timestep);
 
@@ -111,7 +115,8 @@ double setup(
     Neighbor *neighbor,
     Atom *c_atom,
     Neighbor *c_neighbor,
-    Stats *stats)
+    Stats *stats,
+    const int num_threads_per_block)
 {
     if(param->force_field == FF_EAM) { initEam(eam, param); }
     double S, E;
@@ -131,7 +136,7 @@ double setup(
     adjustThermo(param, atom);
     setupPbc(atom, param);
     updatePbc(atom, param);
-    buildNeighbor(atom, neighbor);
+    buildNeighbor_cuda(atom, neighbor, c_atom, c_neighbor, num_threads_per_block);
     E = getTimeStamp();
 
     initCudaAtom(atom, neighbor, c_atom, c_neighbor);
@@ -142,7 +147,10 @@ double setup(
 double reneighbour(
     Parameter *param,
     Atom *atom,
-    Neighbor *neighbor)
+    Neighbor *neighbor,
+    Atom *c_atom,
+    Neighbor *c_neighbor,
+    const int num_threads_per_block)
 {
     double S, E;
 
@@ -152,7 +160,7 @@ double reneighbour(
     setupPbc(atom, param);
     updatePbc(atom, param);
     //sortAtom(atom);
-    buildNeighbor(atom, neighbor);
+    buildNeighbor_cuda(atom, neighbor, c_atom, c_neighbor, num_threads_per_block);
     LIKWID_MARKER_STOP("reneighbour");
     E = getTimeStamp();
@@ -206,6 +214,19 @@ const char* ff2str(int ff)
     return "invalid";
 }
 
+int get_num_threads() {
+
+    const char *num_threads_env = getenv("NUM_THREADS");
+    int num_threads = 0;
+    if(num_threads_env == NULL)
+        num_threads = 32;
+    else {
+        num_threads = atoi(num_threads_env);
+    }
+
+    return num_threads;
+}
+
 int main(int argc, char** argv)
 {
     double timer[NUMTIMER];
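get_num_threads() accepts whatever NUM_THREADS contains (e.g. exporting NUM_THREADS=128 before launching the binary), defaulting to 32, and the call site in main() below only notes in a comment that the value should be a multiple of 32. A hedged sketch of how the parsing could enforce that invariant (the function name is hypothetical, not part of this patch; it assumes the stdlib includes already present in main.c):

    /* Hypothetical hardened variant: clamp NUM_THREADS to a positive
     * multiple of the warp size (32). atoi() returns 0 for unparsable
     * input, which the lower clamp also catches. */
    int get_num_threads_checked() {
        const char *env = getenv("NUM_THREADS");
        int n = (env == NULL) ? 32 : atoi(env);
        if(n < 32) n = 32;
        return n - (n % 32);   /* round down to a warp multiple */
    }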
@@ -286,7 +307,10 @@ int main(int argc, char** argv)
         }
     }
 
-    setup(&param, &eam, &atom, &neighbor, &c_atom, &c_neighbor, &stats);
+    // this should be a multiple of 32 as operations are performed at the level of warps
+    const int num_threads_per_block = get_num_threads();
+
+    setup(&param, &eam, &atom, &neighbor, &c_atom, &c_neighbor, &stats, num_threads_per_block);
     computeThermo(0, &param, &atom);
     if(param.force_field == FF_EAM) {
         computeForceEam(&eam, &param, &atom, &neighbor, &stats, 1, 0);
@@ -294,7 +318,7 @@ int main(int argc, char** argv)
     } else {
 #if defined(MEM_TRACER) || defined(INDEX_TRACER) || defined(COMPUTE_STATS)
         computeForceTracing(&param, &atom, &neighbor, &stats, 1, 0);
 #else
-        computeForce(true, &param, &atom, &neighbor, &c_atom, &c_neighbor);
+        computeForce(true, &param, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block);
 #endif
     }
 
@@ -310,10 +334,10 @@ int main(int argc, char** argv)
 
         const bool doReneighbour = (n + 1) % param.every == 0;
 
-        cuda_initial_integrate(doReneighbour, &param, &atom, &c_atom);
+        cuda_initial_integrate(doReneighbour, &param, &atom, &c_atom, num_threads_per_block);
 
         if(doReneighbour) {
-            timer[NEIGH] += reneighbour(&param, &atom, &neighbor);
+            timer[NEIGH] += reneighbour(&param, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block);
         } else {
             updatePbc(&atom, &param);
         }
@@ -324,11 +348,11 @@ int main(int argc, char** argv)
 #if defined(MEM_TRACER) || defined(INDEX_TRACER) || defined(COMPUTE_STATS)
             timer[FORCE] += computeForceTracing(&param, &atom, &neighbor, &stats, 0, n + 1);
 #else
-            timer[FORCE] += computeForce(doReneighbour, &param, &atom, &neighbor, &c_atom, &c_neighbor);
+            timer[FORCE] += computeForce(doReneighbour, &param, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block);
 #endif
         }
 
-        cuda_final_integrate(doReneighbour, &param, &atom, &c_atom);
+        cuda_final_integrate(doReneighbour, &param, &atom, &c_atom, num_threads_per_block);
 
         if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
             computeThermo(n + 1, &param, &atom);
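Every CUDA runtime call in this patch is wrapped in checkCUDAError(label, err), whose definition lives elsewhere in the tree and is not shown here. A minimal wrapper in that spirit might look like the following sketch (an assumption about its shape, inferred from the call sites, not the project's actual code):

    #include <stdio.h>
    #include <stdlib.h>
    #include <cuda_runtime.h>

    // Sketch only: print the failing call's label and the CUDA error, then abort.
    void checkCUDAError(const char *label, cudaError_t err) {
        if(err != cudaSuccess) {
            fprintf(stderr, "CUDA error at %s: %s\n", label, cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }
    }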
diff --git a/src/neighbor.cu b/src/neighbor.cu
index 89dce5c..feb02b5 100644
--- a/src/neighbor.cu
+++ b/src/neighbor.cu
@@ -67,9 +67,10 @@ __device__ int coord2bin_device(MD_FLOAT xin, MD_FLOAT yin, MD_FLOAT zin,
     return (iz * np.mbiny * np.mbinx + iy * np.mbinx + ix + 1);
 }
 
-__global__ void compute_neighborhood(Atom a, Neighbor neigh, int Nlocal, Neighbor_params np, int nstencil, int* stencil,
+__global__ void compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np, int nstencil, int* stencil,
                                      int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs){
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int Nlocal = a.Nlocal;
     if( i >= Nlocal ) {
         return;
     }
@@ -513,41 +514,110 @@ void sortAtom(Atom* atom) {
 #endif
 }
 
-void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor)
+void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor, const int num_threads_per_block)
 {
     int nall = atom->Nlocal + atom->Nghost;
 
-    /* extend atom arrays if necessary */
+    c_atom->Natoms = atom->Natoms;
+    c_atom->Nlocal = atom->Nlocal;
+    c_atom->Nghost = atom->Nghost;
+    c_atom->Nmax = atom->Nmax;
+    c_atom->ntypes = atom->ntypes;
+
+    c_neighbor->maxneighs = neighbor->maxneighs;
+
+    /* extend c_neighbor arrays if necessary */
     if(nall > nmax) {
         nmax = nall;
-        if(neighbor->numneigh) cudaFreeHost(neighbor->numneigh);
-        if(neighbor->neighbors) cudaFreeHost(neighbor->neighbors);
-        checkCUDAError( "buildNeighbor numneigh", cudaMallocHost((void**)&(neighbor->numneigh), nmax * sizeof(int)) );
-        checkCUDAError( "buildNeighbor neighbors", cudaMallocHost((void**)&(neighbor->neighbors), nmax * neighbor->maxneighs * sizeof(int)) );
-        // neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
-        // neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*));
+        if(c_neighbor->numneigh) cudaFree(c_neighbor->numneigh);
+        if(c_neighbor->neighbors) cudaFree(c_neighbor->neighbors);
+        checkCUDAError( "buildNeighbor c_numneigh malloc", cudaMalloc((void**)&(c_neighbor->numneigh), nmax * sizeof(int)) );
+        checkCUDAError( "buildNeighbor c_neighbors malloc", cudaMalloc((void**)&(c_neighbor->neighbors), nmax * c_neighbor->maxneighs * sizeof(int)) );
     }
 
     /* bin local & ghost atoms */
     binatoms(atom);
 
     int resize = 1;
 
+    cudaProfilerStart();
+
+    checkCUDAError( "c_atom->x memcpy", cudaMemcpy(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
+
+    /* upload stencil */
+    int* c_stencil;
+    // TODO move this to be done once at the start
+    checkCUDAError( "buildNeighbor c_stencil malloc", cudaMalloc((void**)&c_stencil, nstencil * sizeof(int)) );
+    checkCUDAError( "buildNeighbor c_stencil memcpy", cudaMemcpy(c_stencil, stencil, nstencil * sizeof(int), cudaMemcpyHostToDevice) );
+
+    int *c_bincount;
+    checkCUDAError( "buildNeighbor c_bincount malloc", cudaMalloc((void**)&c_bincount, mbins * sizeof(int)) );
+    checkCUDAError( "buildNeighbor c_bincount memcpy", cudaMemcpy(c_bincount, bincount, mbins * sizeof(int), cudaMemcpyHostToDevice) );
+
+    int *c_bins;
+    checkCUDAError( "buildNeighbor c_bins malloc", cudaMalloc((void**)&c_bins, mbins * atoms_per_bin * sizeof(int)) );
+    checkCUDAError( "buildNeighbor c_bins memcpy", cudaMemcpy(c_bins, bins, mbins * atoms_per_bin * sizeof(int), cudaMemcpyHostToDevice) );
+
+    Neighbor_params np{
+        .xprd = xprd,
+        .yprd = yprd,
+        .zprd = zprd,
+        .bininvx = bininvx,
+        .bininvy = bininvy,
+        .bininvz = bininvz,
+        .mbinxlo = mbinxlo,
+        .mbinylo = mbinylo,
+        .mbinzlo = mbinzlo,
+        .nbinx = nbinx,
+        .nbiny = nbiny,
+        .nbinz = nbinz,
+        .mbinx = mbinx,
+        .mbiny = mbiny,
+        .mbinz = mbinz
+    };
+
+    int* c_new_maxneighs;
+    checkCUDAError( "c_new_maxneighs malloc", cudaMalloc((void**)&c_new_maxneighs, sizeof(int)) );
+
     /* loop over each atom, storing neighbors */
     while(resize) {
-        int new_maxneighs = neighbor->maxneighs;
         resize = 0;
 
-        // TODO allocate space for and then copy all necessary components
-        // TODO dont forget to copy the atom positions over
-        // TODO call compute_neigborhood kernel here
+        // seed the device-side maximum with the current capacity
+        // (cudaMemset fills bytes, so the int value has to be copied instead)
+        checkCUDAError( "c_new_maxneighs memcpy", cudaMemcpy(c_new_maxneighs, &(c_neighbor->maxneighs), sizeof(int), cudaMemcpyHostToDevice) );
 
+        const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
+        compute_neighborhood<<<num_blocks, num_threads_per_block>>>(*c_atom, *c_neighbor,
+                                                                    np, nstencil, c_stencil,
+                                                                    c_bins, atoms_per_bin, c_bincount,
+                                                                    c_new_maxneighs);
+
+        // copy the value of c_new_maxneighs back to the host and check whether the kernel raised it
+        int new_maxneighs;
+        checkCUDAError( "c_new_maxneighs memcpy back", cudaMemcpy(&new_maxneighs, c_new_maxneighs, sizeof(int), cudaMemcpyDeviceToHost) );
+        if(new_maxneighs > c_neighbor->maxneighs) {
+            resize = 1;
+        }
 
         if(resize) {
-            printf("RESIZE %d\n", neighbor->maxneighs);
-            neighbor->maxneighs = new_maxneighs * 1.2;
-            free(neighbor->neighbors);
-            neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
+            printf("RESIZE %d\n", c_neighbor->maxneighs);
+            c_neighbor->maxneighs = new_maxneighs * 1.2;
+            cudaFree(c_neighbor->neighbors);
+            checkCUDAError( "c_neighbor->neighbors resize malloc",
+                            cudaMalloc((void**)&(c_neighbor->neighbors),
+                                       c_atom->Nmax * c_neighbor->maxneighs * sizeof(int)) );
         }
-    }
+    }
+
+    neighbor->maxneighs = c_neighbor->maxneighs;
+
+    cudaProfilerStop();
+
+    cudaFree(c_new_maxneighs);
+    cudaFree(c_stencil);
+    cudaFree(c_bincount);
+    cudaFree(c_bins);
 }
\ No newline at end of file
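Two notes on the resize loop in buildNeighbor_cuda:

First, c_new_maxneighs is seeded with cudaMemcpy rather than cudaMemset because cudaMemset(ptr, v, n), like memset(), fills n bytes with (unsigned char)v and therefore cannot store an arbitrary int such as maxneighs.

Second, the kernel body is outside this hunk, but for the read-back check to work, each thread has to publish the neighbor count it actually needed, which with many concurrent threads calls for an atomic update. A device-side sketch of that pattern (an assumption about the kernel, not code from this patch):

    // Sketch: every thread reports its required neighbor count; the host
    // later compares the maximum against the current maxneighs capacity.
    __device__ void report_required_neighbors(int *new_maxneighs, int n) {
        atomicMax(new_maxneighs, n);
    }

If the reported maximum exceeds the current capacity, the host grows maxneighs by 20% (new_maxneighs * 1.2), reallocates the device array, and re-runs the kernel until no overflow is reported.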