From b65199308df165ccd8745c8cc45448d29b27a880 Mon Sep 17 00:00:00 2001
From: Martin Bauernfeind <martin.m.bauernfeind@fau.de>
Date: Wed, 6 Jul 2022 01:09:11 +0200
Subject: [PATCH] Ported the binatoms method to cuda - not used in the program
 yet

---
 src/neighbor.cu | 56 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/src/neighbor.cu b/src/neighbor.cu
index 54134c4..ce26ad5 100644
--- a/src/neighbor.cu
+++ b/src/neighbor.cu
@@ -70,6 +70,28 @@ __device__ int coord2bin_device(MD_FLOAT xin, MD_FLOAT yin, MD_FLOAT zin,
     return (iz * np.mbiny * np.mbinx + iy * np.mbinx + ix + 1);
 }
 
+__global__ void binatoms_kernel(Atom a, int* bincount, int* bins, int atoms_per_bin, Neighbor_params np, int *resize_needed){
+    Atom* atom = &a;
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int nall = atom->Nlocal + atom->Nghost;
+    if(i >= nall){
+        return;
+    }
+    
+    MD_FLOAT x = atom_x(i);
+    MD_FLOAT y = atom_y(i);
+    MD_FLOAT z = atom_z(i);
+    int ibin = coord2bin_device(x, y, z, np);
+
+    int ac = atomicIncrement(bincount[ibin]);
+            
+    if(ac < atoms_per_bin){
+        bins[ibin * atoms_per_bin + ac] = i;
+    } else {
+        atomicMax(resize_needed, ac);
+    }
+}
+
 __global__ void compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np, int nstencil, int* stencil,
                                      int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq){
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -517,6 +539,40 @@ void sortAtom(Atom* atom) {
 #endif
 }
 
+void binatoms_cuda(Atom *c_atom, int** c_bincount, int** c_bins, int** c_resize_needed, int mbins, NeighborParams *np, const int threads_per_block)
+{
+    int nall = c_atom->Nlocal + c_atom->Nghost;
+    int resize = 1;
+    if(*c_bincount == NULL){
+        checkCUDAError("binatoms_cuda c_bincount malloc", cudaMalloc(c_bincount, mbins * sizeof(int)) );
+    }
+    if(*c_bins == NULL){
+        checkCUDAError("binatoms_cuda c_bins malloc", cudaMalloc(c_bins, mbins * atoms_per_bin * sizeof(int)) );
+    }
+    if(*c_resize_needed == NULL){
+        checkCUDAError("binatoms_cuda c_resize_needed malloc", cudaMalloc(&c_resize_needed, sizeof(int)) );
+    }
+
+    const int num_blocks = ceil((float)nall-> / (float)num_threads_per_block);
+
+    while(resize > 0) {
+        resize = 0;
+        checkCUDAError("binatoms_cuda c_bincount memset", cudaMemset(*c_bincount, 0, mbins * sizeof(int)));
+        checkCUDAError("binatoms_cuda c_resize_needed memset", cudaMemset(*c_resize_needed, 0, sizeof(int)) );
+
+        /*binatoms_kernel(Atom a, int* bincount, int* bins, int atoms_per_bin, Neighbor_params np, int *resize_needed) */
+        binatoms_kernel<<<num_blocks, threads_per_block>>>(*c_atom, *c_bincount, *c_bins, atoms_per_bin, *np, *c_resize_needed);
+
+        checkCUDAError("binatoms_cuda c_resize_needed memcpy back", cudaMemcpy(&resize, *c_resize_needed, sizeof(int), cudaMemcpyDeviceToHost) );
+
+        if(resize) {
+            cudaFree(*c_bins);
+            atoms_per_bin *= 2;
+            checkCUDAError("binatoms_cuda c_bins resize malloc", cudaMalloc(c_bins, mbins * atoms_per_bin * sizeof(int)) );
+        }
+    }
+}
+
 void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor, const int num_threads_per_block)
 {
     int nall = atom->Nlocal + atom->Nghost;