From 0967e8f6714624030fa7a2d9cd5d9a8d8f51819d Mon Sep 17 00:00:00 2001
From: Martin Bauernfeind
Date: Sun, 10 Jul 2022 18:05:06 +0200
Subject: [PATCH] The program now does the binning on the GPU via the binatoms_cuda method

---
 src/includes/neighbor.h |   2 +-
 src/neighbor.cu         | 103 +++++++++++++++++++---------------------
 2 files changed, 50 insertions(+), 55 deletions(-)

diff --git a/src/includes/neighbor.h b/src/includes/neighbor.h
index 18efcfa..0af49a5 100644
--- a/src/includes/neighbor.h
+++ b/src/includes/neighbor.h
@@ -53,6 +53,6 @@ extern void setupNeighbor();
 extern void binatoms(Atom*);
 extern void buildNeighbor(Atom*, Neighbor*);
 extern void sortAtom(Atom*);
-extern void binatoms_cuda(Atom*, Binning*, int**, Neighbor_params*, const int);
+extern void binatoms_cuda(Atom*, Binning*, int*, Neighbor_params*, const int);
 extern void buildNeighbor_cuda(Atom*, Neighbor*, Atom*, Neighbor*, const int);
 #endif
diff --git a/src/neighbor.cu b/src/neighbor.cu
index b5b1cef..a382486 100644
--- a/src/neighbor.cu
+++ b/src/neighbor.cu
@@ -539,7 +539,7 @@ void sortAtom(Atom* atom) {
 #endif
 }
 
-void binatoms_cuda(Atom* c_atom, Binning* c_binning, int** c_resize_needed, NeighborParams *np, const int threads_per_block)
+void binatoms_cuda(Atom* c_atom, Binning* c_binning, int* c_resize_needed, NeighborParams *np, const int threads_per_block)
 {
     int nall = c_atom->Nlocal + c_atom->Nghost;
     int resize = 1;
@@ -549,21 +549,18 @@ void binatoms_cuda(Atom* c_atom, Binning* c_binning, int** c_resize_needed, Neig
     if(c_binning->bins == NULL){
         checkCUDAError("binatoms_cuda c_binning->bins malloc", cudaMalloc((void**)(&c_binning->bins), c_binning->mbins * c_binning->atoms_per_bin * sizeof(int)) );
     }
-    if(*c_resize_needed == NULL){
-        checkCUDAError("binatoms_cuda c_resize_needed malloc", cudaMalloc(&c_resize_needed, sizeof(int)) );
-    }
     const int num_blocks = ceil((float)nall / (float)num_threads_per_block);
 
     while(resize > 0) {
         resize = 0;
         checkCUDAError("binatoms_cuda c_binning->bincount memset", cudaMemset(c_binning->bincount, 0, c_binning->mbins * sizeof(int)));
-        checkCUDAError("binatoms_cuda c_resize_needed memset", cudaMemset(*c_resize_needed, 0, sizeof(int)) );
+        checkCUDAError("binatoms_cuda c_resize_needed memset", cudaMemset(c_resize_needed, 0, sizeof(int)) );
         /*binatoms_kernel(Atom a, int* bincount, int* bins, int c_binning->atoms_per_bin, Neighbor_params np, int *resize_needed) */
-        binatoms_kernel<<<num_blocks, num_threads_per_block>>>(*c_atom, c_binning->bincount, c_binning->bins, c_binning->atoms_per_bin, *np, *c_resize_needed);
+        binatoms_kernel<<<num_blocks, num_threads_per_block>>>(*c_atom, c_binning->bincount, c_binning->bins, c_binning->atoms_per_bin, *np, c_resize_needed);
 
-        checkCUDAError("binatoms_cuda c_resize_needed memcpy back", cudaMemcpy(&resize, *c_resize_needed, sizeof(int), cudaMemcpyDeviceToHost) );
+        checkCUDAError("binatoms_cuda c_resize_needed memcpy back", cudaMemcpy(&resize, c_resize_needed, sizeof(int), cudaMemcpyDeviceToHost) );
         if(resize) {
             cudaFree(c_binning->bins);
@@ -576,9 +573,50 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor, const int num_threads_per_block)
 {
     int nall = atom->Nlocal + atom->Nghost;
 
-    c_neighbor->maxneighs = neighbor->maxneighs;
+    cudaProfilerStart();
+    /* upload stencil */
+    int* c_stencil;
+    // TODO move this to be done once at the start
+    checkCUDAError( "buildNeighbor c_n_stencil malloc", cudaMalloc((void**)&c_stencil, nstencil * sizeof(int)) );
+    checkCUDAError( "buildNeighbor c_n_stencil memcpy", cudaMemcpy(c_stencil, stencil, nstencil * sizeof(int), cudaMemcpyHostToDevice ));
memcpy", cudaMemcpy(c_stencil, stencil, nstencil * sizeof(int), cudaMemcpyHostToDevice )); + + Binning c_binning; + c_binning->mbins = mbins; + c_binning->atoms_per_bin = atoms_per_bin; + checkCUDAError( "buildNeighbor c_binning->bincount malloc", cudaMalloc((void**)&(c_binning->bincount), mbins * sizeof(int)) ); + checkCUDAError( "buidlNeighbor c_binning->bins malloc", cudaMalloc((void**)&(c_binning->bins), c_binning->mbins * c_binning->atoms_per_bin * sizeof(int)) ); + + Neighbor_params np{ + .xprd = xprd, + .yprd = yprd, + .zprd = zprd, + .bininvx = bininvx, + .bininvy = bininvy, + .bininvz = bininvz, + .mbinxlo = mbinxlo, + .mbinylo = mbinylo, + .mbinzlo = mbinzlo, + .nbinx = nbinx, + .nbiny = nbiny, + .nbinz = nbinz, + .mbinx = mbinx, + .mbiny = mbiny, + .mbinz = mbinz + }; + + int* c_resize_needed; + checkCUDAError("buildNeighbor c_resize_needed malloc", cudaMalloc((void**)&c_resize_needed, sizeof(int)) ); + + /* bin local & ghost atoms */ + binatoms_cuda(c_atom, &c_binning, c_resize_needed, &np, num_threads_per_block); + + int* c_new_maxneighs; + checkCUDAError("c_new_maxneighs malloc", cudaMalloc((void**)&c_new_maxneighs, sizeof(int) )); + + int resize = 1; + /* extend c_neighbor arrays if necessary */ if(nall > nmax) { nmax = nall; @@ -588,49 +626,6 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor * checkCUDAError( "buildNeighbor c_neighbors malloc", cudaMalloc((void**)&(c_neighbor->neighbors), nmax * c_neighbor->maxneighs * sizeof(int)) ); } - checkCUDAError( "buildNeighbor c_atom->x memcpy back", cudaMemcpy(atom->x, c_atom->x, sizeof(MD_FLOAT) * 3 * c_atom->Nmax, cudaMemcpyDeviceToHost) ); - - /* bin local & ghost atoms */ - binatoms(atom); - int resize = 1; - - cudaProfilerStart(); - - /* upload stencil */ - int* c_stencil; - // TODO move this to be done once at the start - checkCUDAError( "buildNeighbor c_n_stencil malloc", cudaMalloc((void**)&c_stencil, nstencil * sizeof(int)) ); - checkCUDAError( "buildNeighbor c_n_stencil memcpy", cudaMemcpy(c_stencil, stencil, nstencil * sizeof(int), cudaMemcpyHostToDevice )); - - int *c_bincount; - checkCUDAError( "buildNeighbor c_bincount malloc", cudaMalloc((void**)&c_bincount, mbins * sizeof(int)) ); - checkCUDAError( "buildNeighbor c_bincount memcpy", cudaMemcpy(c_bincount, bincount, mbins * sizeof(int), cudaMemcpyHostToDevice) ); - - int *c_bins; - checkCUDAError( "buidlNeighbor c_bins malloc", cudaMalloc((void**)&c_bins, mbins * atoms_per_bin * sizeof(int)) ); - checkCUDAError( "buildNeighbor c_bins memcpy", cudaMemcpy(c_bins, bins, mbins * atoms_per_bin * sizeof(int), cudaMemcpyHostToDevice ) ); - - Neighbor_params np{ - .xprd = xprd, - .yprd = yprd, - .zprd = zprd, - .bininvx = bininvx, - .bininvy = bininvy, - .bininvz = bininvz, - .mbinxlo = mbinxlo, - .mbinylo = mbinylo, - .mbinzlo = mbinzlo, - .nbinx = nbinx, - .nbiny = nbiny, - .nbinz = nbinz, - .mbinx = mbinx, - .mbiny = mbiny, - .mbinz = mbinz - }; - - int* c_new_maxneighs; - checkCUDAError("c_new_maxneighs malloc", cudaMalloc((void**)&c_new_maxneighs, sizeof(int) )); - /* loop over each atom, storing neighbors */ while(resize) { resize = 0; @@ -644,7 +639,7 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor * * */ compute_neighborhood<<>>(*c_atom, *c_neighbor, np, nstencil, c_stencil, - c_bins, atoms_per_bin, c_bincount, + c_binning->bins, atoms_per_bin, c_binning->bincount, c_new_maxneighs, cutneighsq); @@ -675,7 +670,7 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor * 
     cudaFree(c_new_maxneighs);
     cudaFree(c_stencil);
-    cudaFree(c_bincount);
-    cudaFree(c_bins);
+    cudaFree(c_binning.bincount);
+    cudaFree(c_binning.bins);
 }
 }
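
Reviewer note (not part of the commit): the hunks above launch a binatoms_kernel device function whose definition lies outside the changed context, so it does not appear in this patch. The sketch below is only an illustration of what a per-atom GPU binning kernel of that shape typically looks like. It follows the parameter list given in the comment inside binatoms_cuda; the coordinate-to-bin arithmetic, the Atom field layout, and the kernel name are assumptions, not the committed MD-Bench implementation.

// Sketch only: assumes the repository's types (Atom, Neighbor_params, MD_FLOAT)
// and a packed 3*i position layout as suggested by the memcpy of atom->x above.
__global__ void binatoms_kernel_sketch(Atom a, int* bincount, int* bins,
                                       int atoms_per_bin, Neighbor_params np,
                                       int* resize_needed) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    const int nall = a.Nlocal + a.Nghost;
    if(i >= nall) return;

    // Map the atom position to a bin index (hypothetical coord2bin; the real
    // code derives this from the same Neighbor_params fields).
    MD_FLOAT xtmp = a.x[i * 3 + 0];
    MD_FLOAT ytmp = a.x[i * 3 + 1];
    MD_FLOAT ztmp = a.x[i * 3 + 2];
    int ibinx = (int)((xtmp + np.xprd) * np.bininvx) - np.mbinxlo;
    int ibiny = (int)((ytmp + np.yprd) * np.bininvy) - np.mbinylo;
    int ibinz = (int)((ztmp + np.zprd) * np.bininvz) - np.mbinzlo;
    int ibin = (ibinz * np.mbiny + ibiny) * np.mbinx + ibinx;

    // Reserve a slot in the bin atomically; on overflow only raise the flag so
    // the host can grow the bins array and rerun the kernel.
    int ac = atomicAdd(&bincount[ibin], 1);
    if(ac < atoms_per_bin) {
        bins[ibin * atoms_per_bin + ac] = i;
    } else {
        *resize_needed = 1;
    }
}

This is why binatoms_cuda wraps the launch in a while(resize > 0) loop: the kernel never writes past atoms_per_bin, it only reports that a bin overflowed, and the host frees and reallocates c_binning->bins before trying again.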