Integrate LAMMPS CUDA versions into master branch

Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-08-09 18:53:53 +02:00
parent eb77e1a3bd
commit c18124b066
19 changed files with 1004 additions and 92 deletions
@@ -0,0 +1,76 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+
+extern "C" {
+
+#include <stdio.h>
+#include <cuda_runtime.h>
+//---
+#include <allocate.h>
+#include <atom.h>
+#include <cuda_atom.h>
+#include <neighbor.h>
+
+void initCuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
+    c_atom->Natoms = atom->Natoms;
+    c_atom->Nlocal = atom->Nlocal;
+    c_atom->Nghost = atom->Nghost;
+    c_atom->Nmax = atom->Nmax;
+    c_atom->ntypes = atom->ntypes;
+
+    c_atom->border_map = NULL;
+
+    const int Nlocal = atom->Nlocal;
+
+    checkCUDAError( "c_atom->x malloc", cudaMalloc((void**)&(c_atom->x), sizeof(MD_FLOAT) * atom->Nmax * 3) );
+    checkCUDAError( "c_atom->x memcpy", cudaMemcpy(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
+
+    checkCUDAError( "c_atom->fx malloc", cudaMalloc((void**)&(c_atom->fx), sizeof(MD_FLOAT) * Nlocal * 3) );
+
+    checkCUDAError( "c_atom->vx malloc", cudaMalloc((void**)&(c_atom->vx), sizeof(MD_FLOAT) * Nlocal * 3) );
+    checkCUDAError( "c_atom->vx memcpy", cudaMemcpy(c_atom->vx, atom->vx, sizeof(MD_FLOAT) * Nlocal * 3, cudaMemcpyHostToDevice) );
+
+    checkCUDAError( "c_atom->type malloc", cudaMalloc((void**)&(c_atom->type), sizeof(int) * atom->Nmax) );
+    checkCUDAError( "c_atom->epsilon malloc", cudaMalloc((void**)&(c_atom->epsilon), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
+    checkCUDAError( "c_atom->sigma6 malloc", cudaMalloc((void**)&(c_atom->sigma6), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
+    checkCUDAError( "c_atom->cutforcesq malloc", cudaMalloc((void**)&(c_atom->cutforcesq), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
+
+    checkCUDAError( "c_neighbor->neighbors malloc", cudaMalloc((void**)&c_neighbor->neighbors, sizeof(int) * Nlocal * neighbor->maxneighs) );
+    checkCUDAError( "c_neighbor->numneigh malloc", cudaMalloc((void**)&c_neighbor->numneigh, sizeof(int) * Nlocal) );
+
+    checkCUDAError( "c_atom->type memcpy", cudaMemcpy(c_atom->type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) );
+    checkCUDAError( "c_atom->sigma6 memcpy", cudaMemcpy(c_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
+    checkCUDAError( "c_atom->epsilon memcpy", cudaMemcpy(c_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
+
+    checkCUDAError( "c_atom->cutforcesq memcpy", cudaMemcpy(c_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
+}
+
+void checkCUDAError(const char *msg, cudaError_t err) {
+    if (err != cudaSuccess) {
+        //print a human readable error message
+        printf("[CUDA ERROR %s]: %s\r\n", msg, cudaGetErrorString(err));
+        exit(-1);
+    }
+}
+
+}
@@ -0,0 +1,202 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+//---
+#include <cuda_profiler_api.h>
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+//---    
+#include <likwid-marker.h>
+
+extern "C" {
+
+#include <atom.h>
+#include <cuda_atom.h>
+#include <allocate.h>
+#include <neighbor.h>
+#include <parameter.h>
+#include <timing.h>
+#include <util.h>
+
+}
+
+// cuda kernel
+__global__ void calc_force(Atom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh) { 
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if(i >= Nlocal) {
+        return;
+    }
+
+    Atom *atom = &a;
+
+    const int numneighs = neigh_numneigh[i];
+
+    MD_FLOAT xtmp = atom_x(i);
+    MD_FLOAT ytmp = atom_y(i);
+    MD_FLOAT ztmp = atom_z(i);
+
+    MD_FLOAT fix = 0;
+    MD_FLOAT fiy = 0;
+    MD_FLOAT fiz = 0;
+
+    for(int k = 0; k < numneighs; k++) {
+        int j = neigh_neighbors[atom->Nlocal * k + i];
+        MD_FLOAT delx = xtmp - atom_x(j);
+        MD_FLOAT dely = ytmp - atom_y(j);
+        MD_FLOAT delz = ztmp - atom_z(j);
+        MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+
+#ifdef EXPLICIT_TYPES
+        const int type_j = atom->type[j];
+        const int type_ij = type_i * atom->ntypes + type_j;
+        const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
+        const MD_FLOAT sigma6 = atom->sigma6[type_ij];
+        const MD_FLOAT epsilon = atom->epsilon[type_ij];
+#endif
+
+        if(rsq < cutforcesq) {
+            MD_FLOAT sr2 = 1.0 / rsq;
+            MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
+            MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
+            fix += delx * force;
+            fiy += dely * force;
+            fiz += delz * force;
+        }
+    }
+
+    atom_fx(i) = fix;
+    atom_fy(i) = fiy;
+    atom_fz(i) = fiz;
+}
+
+__global__ void kernel_initial_integrate(MD_FLOAT dtforce, MD_FLOAT dt, int Nlocal, Atom a) {
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if( i >= Nlocal ) {
+        return;
+    }
+
+    Atom *atom = &a;
+
+    atom_vx(i) += dtforce * atom_fx(i);
+    atom_vy(i) += dtforce * atom_fy(i);
+    atom_vz(i) += dtforce * atom_fz(i);
+    atom_x(i) = atom_x(i) + dt * atom_vx(i);
+    atom_y(i) = atom_y(i) + dt * atom_vy(i);
+    atom_z(i) = atom_z(i) + dt * atom_vz(i);
+}
+
+__global__ void kernel_final_integrate(MD_FLOAT dtforce, int Nlocal, Atom a) {
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if( i >= Nlocal ) {
+        return;
+    }
+
+    Atom *atom = &a;
+
+    atom_vx(i) += dtforce * atom_fx(i);
+    atom_vy(i) += dtforce * atom_fy(i);
+    atom_vz(i) += dtforce * atom_fz(i);
+}
+
+extern "C" {
+
+void finalIntegrate_cuda(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom) {
+    const int Nlocal = atom->Nlocal;
+    const int num_threads_per_block = get_num_threads();
+    const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
+
+    kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, *c_atom);
+
+    checkCUDAError( "PeekAtLastError FinalIntegrate", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync FinalIntegrate", cudaDeviceSynchronize() );
+
+    if(doReneighbour) {
+        checkCUDAError( "FinalIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom->vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
+    }
+}
+
+void initialIntegrate_cuda(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom) {
+    const int Nlocal = atom->Nlocal;
+    const int num_threads_per_block = get_num_threads();
+    const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
+
+    kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, *c_atom);
+
+    checkCUDAError( "PeekAtLastError InitialIntegrate", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync InitialIntegrate", cudaDeviceSynchronize() );
+
+    if(doReneighbour) {
+        checkCUDAError( "InitialIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom->vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
+    }
+}
+
+double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
+    const int num_threads_per_block = get_num_threads();
+    int Nlocal = atom->Nlocal;
+#ifndef EXPLICIT_TYPES
+    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
+    MD_FLOAT sigma6 = param->sigma6;
+    MD_FLOAT epsilon = param->epsilon;
+#endif
+
+    /*
+    int nDevices;
+    cudaGetDeviceCount(&nDevices);
+    size_t free, total;
+    for(int i = 0; i < nDevices; ++i) {
+        cudaMemGetInfo( &free, &total );
+        cudaDeviceProp prop;
+        cudaGetDeviceProperties(&prop, i);
+        printf("DEVICE %d/%d NAME: %s\r\n with %ld MB/%ld MB memory used", i + 1, nDevices, prop.name, free / 1024 / 1024, total / 1024 / 1024);
+    }
+    */
+
+
+    // HINT: Run with cuda-memcheck ./MDBench-NVCC in case of error
+
+    // checkCUDAError( "c_atom->fx memset", cudaMemset(c_atom->fx, 0, sizeof(MD_FLOAT) * Nlocal * 3) );
+
+    cudaProfilerStart();
+
+    const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
+
+    double S = getTimeStamp();
+    LIKWID_MARKER_START("force");
+
+    calc_force <<< num_blocks, num_threads_per_block >>> (*c_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, c_neighbor->neighbors, c_neighbor->numneigh);
+
+    checkCUDAError( "PeekAtLastError ComputeForce", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync ComputeForce", cudaDeviceSynchronize() );
+
+    cudaProfilerStop();
+
+    LIKWID_MARKER_STOP("force");
+    double E = getTimeStamp();
+
+    return E-S;
+}
+
+}
@@ -0,0 +1,329 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <cuda_profiler_api.h>
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+//---
+
+extern "C" {
+
+#include <atom.h>
+#include <cuda_atom.h>
+#include <parameter.h>
+#include <neighbor.h>
+#include <util.h>
+
+}
+
+static MD_FLOAT xprd, yprd, zprd;
+static MD_FLOAT bininvx, bininvy, bininvz;
+static int mbinxlo, mbinylo, mbinzlo;
+static int nbinx, nbiny, nbinz;
+static int mbinx, mbiny, mbinz; // n bins in x, y, z
+static int mbins; //total number of bins
+static int atoms_per_bin;  // max atoms per bin
+static MD_FLOAT cutneighsq;  // neighbor cutoff squared
+static int nmax;
+static int nstencil;      // # of bins in stencil
+static int* stencil;      // stencil list of bin offsets
+static int* c_stencil = NULL;
+static int* c_resize_needed = NULL;
+static int* c_new_maxneighs = NULL;
+static Binning c_binning {
+    .bincount = NULL,
+    .bins = NULL,
+    .mbins = 0,
+    .atoms_per_bin = 0
+};
+
+
+__device__ int coord2bin_device(MD_FLOAT xin, MD_FLOAT yin, MD_FLOAT zin, Neighbor_params np) {
+    int ix, iy, iz;
+
+    if(xin >= np.xprd) {
+        ix = (int)((xin - np.xprd) * np.bininvx) + np.nbinx - np.mbinxlo;
+    } else if(xin >= 0.0) {
+        ix = (int)(xin * np.bininvx) - np.mbinxlo;
+    } else {
+        ix = (int)(xin * np.bininvx) - np.mbinxlo - 1;
+    }
+
+    if(yin >= np.yprd) {
+        iy = (int)((yin - np.yprd) * np.bininvy) + np.nbiny - np.mbinylo;
+    } else if(yin >= 0.0) {
+        iy = (int)(yin * np.bininvy) - np.mbinylo;
+    } else {
+        iy = (int)(yin * np.bininvy) - np.mbinylo - 1;
+    }
+
+    if(zin >= np.zprd) {
+        iz = (int)((zin - np.zprd) * np.bininvz) + np.nbinz - np.mbinzlo;
+    } else if(zin >= 0.0) {
+        iz = (int)(zin * np.bininvz) - np.mbinzlo;
+    } else {
+        iz = (int)(zin * np.bininvz) - np.mbinzlo - 1;
+    }
+
+    return (iz * np.mbiny * np.mbinx + iy * np.mbinx + ix + 1);
+}
+
+/* sorts the contents of a bin to make it comparable to the CPU version */
+/* uses bubble sort since atoms per bin should be relatively small and can be done in situ */
+__global__ void sort_bin_contents_kernel(int* bincount, int* bins, int mbins, int atoms_per_bin){
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if(i >= mbins) {
+        return;
+    }
+
+    int atoms_in_bin = bincount[i];
+    int *bin_ptr = &bins[i * atoms_per_bin];
+    int sorted;
+    do {
+        sorted = 1;
+        int tmp;
+        for(int index = 0; index < atoms_in_bin - 1; index++){
+            if (bin_ptr[index] > bin_ptr[index + 1]){
+                tmp = bin_ptr[index];
+                bin_ptr[index] = bin_ptr[index + 1];
+                bin_ptr[index + 1] = tmp;
+                sorted = 0;
+            }
+        }
+    } while (!sorted);
+}
+
+__global__ void binatoms_kernel(Atom a, int* bincount, int* bins, int atoms_per_bin, Neighbor_params np, int *resize_needed){
+    Atom* atom = &a;
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int nall = atom->Nlocal + atom->Nghost;
+    if(i >= nall){
+        return;
+    }
+    
+    MD_FLOAT x = atom_x(i);
+    MD_FLOAT y = atom_y(i);
+    MD_FLOAT z = atom_z(i);
+    int ibin = coord2bin_device(x, y, z, np);
+
+    int ac = atomicAdd(&bincount[ibin], 1);
+            
+    if(ac < atoms_per_bin){
+        bins[ibin * atoms_per_bin + ac] = i;
+    } else {
+        atomicMax(resize_needed, ac);
+    }
+}
+
+__global__ void compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np, int nstencil, int* stencil,
+                                     int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq){
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int Nlocal = a.Nlocal;
+    if( i >= Nlocal ) {
+        return;
+    }
+    
+    Atom *atom = &a;
+    Neighbor *neighbor = &neigh;
+    
+    int* neighptr = &(neighbor->neighbors[i]);
+    int n = 0;
+    MD_FLOAT xtmp = atom_x(i);
+    MD_FLOAT ytmp = atom_y(i);
+    MD_FLOAT ztmp = atom_z(i);
+    int ibin = coord2bin_device(xtmp, ytmp, ztmp, np);
+#ifdef EXPLICIT_TYPES
+    int type_i = atom->type[i];
+#endif
+    for(int k = 0; k < nstencil; k++) {
+        int jbin = ibin + stencil[k];
+        int* loc_bin = &bins[jbin * atoms_per_bin];
+
+        for(int m = 0; m < bincount[jbin]; m++) {
+            int j = loc_bin[m];
+
+            if ( j == i ){
+                continue;
+            }
+
+            MD_FLOAT delx = xtmp - atom_x(j);
+            MD_FLOAT dely = ytmp - atom_y(j);
+            MD_FLOAT delz = ztmp - atom_z(j);
+            MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+
+#ifdef EXPLICIT_TYPES
+            int type_j = atom->type[j];
+            const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j];
+#else
+            const MD_FLOAT cutoff = cutneighsq;
+#endif
+
+            if( rsq <= cutoff ) {
+                int idx = atom->Nlocal * n;
+                neighptr[idx] = j;
+                n += 1;
+            }
+        }
+    }
+
+    neighbor->numneigh[i] = n;
+
+    if(n > neighbor->maxneighs) {
+        atomicMax(new_maxneighs, n);
+    }
+}
+
+void binatoms_cuda(Atom *c_atom, Binning *c_binning, int *c_resize_needed, Neighbor_params *np, const int threads_per_block) {
+    int nall = c_atom->Nlocal + c_atom->Nghost;
+    int resize = 1;
+    const int num_blocks = ceil((float) nall / (float) threads_per_block);
+
+    while(resize > 0) {
+        resize = 0;
+        checkCUDAError("binatoms_cuda c_binning->bincount memset", cudaMemset(c_binning->bincount, 0, c_binning->mbins * sizeof(int)));
+        checkCUDAError("binatoms_cuda c_resize_needed memset", cudaMemset(c_resize_needed, 0, sizeof(int)) );
+
+        /*binatoms_kernel(Atom a, int* bincount, int* bins, int c_binning->atoms_per_bin, Neighbor_params np, int *resize_needed) */
+        binatoms_kernel<<<num_blocks, threads_per_block>>>(*c_atom, c_binning->bincount, c_binning->bins, c_binning->atoms_per_bin, *np, c_resize_needed);
+
+	    checkCUDAError( "PeekAtLastError binatoms kernel", cudaPeekAtLastError() );
+	    checkCUDAError( "DeviceSync binatoms kernel", cudaDeviceSynchronize() );
+        
+	    checkCUDAError("binatoms_cuda c_resize_needed memcpy back", cudaMemcpy(&resize, c_resize_needed, sizeof(int), cudaMemcpyDeviceToHost) );
+
+        if(resize) {
+            cudaFree(c_binning->bins);
+            c_binning->atoms_per_bin *= 2;
+            checkCUDAError("binatoms_cuda c_binning->bins resize malloc", cudaMalloc(&c_binning->bins, c_binning->mbins * c_binning->atoms_per_bin * sizeof(int)) );
+        }
+    }
+
+    atoms_per_bin = c_binning->atoms_per_bin;
+    const int sortBlocks = ceil((float)mbins / (float)threads_per_block);
+    /*void sort_bin_contents_kernel(int* bincount, int* bins, int mbins, int atoms_per_bin)*/
+    sort_bin_contents_kernel<<<sortBlocks, threads_per_block>>>(c_binning->bincount, c_binning->bins, c_binning->mbins, c_binning->atoms_per_bin);
+    checkCUDAError( "PeekAtLastError sort_bin_contents kernel", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync sort_bin_contents kernel", cudaDeviceSynchronize() );
+}
+
+void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
+    const int num_threads_per_block = get_num_threads();
+    int nall = atom->Nlocal + atom->Nghost;
+    c_neighbor->maxneighs = neighbor->maxneighs;
+
+    cudaProfilerStart();
+    /* upload stencil */
+    // TODO move all of this initialization into its own method
+    if(c_stencil == NULL){
+        checkCUDAError( "buildNeighbor c_n_stencil malloc", cudaMalloc((void**)&c_stencil, nstencil * sizeof(int)) );
+        checkCUDAError( "buildNeighbor c_n_stencil memcpy", cudaMemcpy(c_stencil, stencil, nstencil * sizeof(int), cudaMemcpyHostToDevice ));
+    }
+
+    if(c_binning.mbins == 0){
+        c_binning.mbins = mbins;
+        c_binning.atoms_per_bin = atoms_per_bin;
+        checkCUDAError( "buildNeighbor c_binning->bincount malloc", cudaMalloc((void**)&(c_binning.bincount), c_binning.mbins * sizeof(int)) );
+        checkCUDAError( "buidlNeighbor c_binning->bins malloc", cudaMalloc((void**)&(c_binning.bins), c_binning.mbins * c_binning.atoms_per_bin * sizeof(int)) );
+    }
+
+    Neighbor_params np {
+        .xprd = xprd,
+        .yprd = yprd,
+        .zprd = zprd,
+        .bininvx = bininvx,
+        .bininvy = bininvy,
+        .bininvz = bininvz,
+        .mbinxlo = mbinxlo,
+        .mbinylo = mbinylo,
+        .mbinzlo = mbinzlo,
+        .nbinx = nbinx,
+        .nbiny = nbiny,
+        .nbinz = nbinz,
+        .mbinx = mbinx,
+        .mbiny = mbiny,
+        .mbinz = mbinz
+    };
+
+    if(c_resize_needed == NULL){
+        checkCUDAError("buildNeighbor c_resize_needed malloc", cudaMalloc((void**)&c_resize_needed, sizeof(int)) );
+    }
+
+    /* bin local & ghost atoms */
+    binatoms_cuda(c_atom, &c_binning, c_resize_needed, &np, num_threads_per_block);
+    if(c_new_maxneighs == NULL){
+        checkCUDAError("c_new_maxneighs malloc", cudaMalloc((void**)&c_new_maxneighs, sizeof(int) ));
+    }
+
+    int resize = 1;
+    
+    /* extend c_neighbor arrays if necessary */
+    if(nall > nmax) {
+        nmax = nall;
+        if(c_neighbor->numneigh) cudaFree(c_neighbor->numneigh);
+        if(c_neighbor->neighbors) cudaFree(c_neighbor->neighbors);
+        checkCUDAError( "buildNeighbor c_numneigh malloc", cudaMalloc((void**)&(c_neighbor->numneigh), nmax * sizeof(int)) );
+        checkCUDAError( "buildNeighbor c_neighbors malloc", cudaMalloc((void**)&(c_neighbor->neighbors), nmax * c_neighbor->maxneighs * sizeof(int)) );
+    }
+
+    /* loop over each atom, storing neighbors */
+    while(resize) {
+        resize = 0;
+
+        checkCUDAError("c_new_maxneighs memset", cudaMemset(c_new_maxneighs, 0, sizeof(int) ));
+
+        // TODO call compute_neigborhood kernel here
+        const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
+        /*compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np, int nstencil, int* stencil,
+                                     int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs)
+         * */
+        compute_neighborhood<<<num_blocks, num_threads_per_block>>>(*c_atom, *c_neighbor,
+                                                                    np, nstencil, c_stencil,
+                                                                    c_binning.bins, c_binning.atoms_per_bin, c_binning.bincount,
+                                                                    c_new_maxneighs,
+								                                    cutneighsq);
+
+	checkCUDAError( "PeekAtLastError ComputeNeighbor", cudaPeekAtLastError() );
+	checkCUDAError( "DeviceSync ComputeNeighbor", cudaDeviceSynchronize() );
+
+        // TODO copy the value of c_new_maxneighs back to host and check if it has been modified
+        int new_maxneighs;
+        checkCUDAError("c_new_maxneighs memcpy back", cudaMemcpy(&new_maxneighs, c_new_maxneighs, sizeof(int), cudaMemcpyDeviceToHost));
+        if (new_maxneighs > c_neighbor->maxneighs){
+            resize = 1;
+        }
+
+        if(resize) {
+            printf("RESIZE %d\n", c_neighbor->maxneighs);
+            c_neighbor->maxneighs = new_maxneighs * 1.2;
+            printf("NEW SIZE %d\n", c_neighbor->maxneighs);
+            cudaFree(c_neighbor->neighbors);
+            checkCUDAError("c_neighbor->neighbors resize malloc", cudaMalloc((void**)(&c_neighbor->neighbors), c_atom->Nmax * c_neighbor->maxneighs * sizeof(int)));
+        }
+
+    }
+    neighbor->maxneighs = c_neighbor->maxneighs;
+
+    cudaProfilerStop();
+}
@@ -0,0 +1,151 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#include <stdlib.h>
+#include <stdio.h>
+//---
+
+extern "C" {
+
+#include <allocate.h>
+#include <atom.h>
+#include <cuda_atom.h>
+#include <pbc.h>
+#include <util.h>
+
+}
+
+static int NmaxGhost;
+static int *PBCx, *PBCy, *PBCz;
+static int c_NmaxGhost = 0;
+static int *c_PBCx = NULL, *c_PBCy = NULL, *c_PBCz = NULL;
+
+
+__global__ void computeAtomsPbcUpdate(Atom a, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    Atom* atom = &a;
+    if( i >= atom->Nlocal ){
+        return;
+    }
+
+    if (atom_x(i) < 0.0) {
+        atom_x(i) += xprd;
+    } else if (atom_x(i) >= xprd) {
+        atom_x(i) -= xprd;
+    }
+
+    if (atom_y(i) < 0.0) {
+        atom_y(i) += yprd;
+    } else if (atom_y(i) >= yprd) {
+        atom_y(i) -= yprd;
+    }
+
+    if (atom_z(i) < 0.0) {
+        atom_z(i) += zprd;
+    } else if (atom_z(i) >= zprd) {
+        atom_z(i) -= zprd;
+    }
+}
+
+__global__ void computePbcUpdate(Atom a, int* PBCx, int* PBCy, int* PBCz, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int Nghost = a.Nghost;
+    if( i >= Nghost ) {
+        return;
+    }
+    Atom* atom = &a;
+    int *border_map = atom->border_map;
+    int nlocal = atom->Nlocal;
+
+    atom_x(nlocal + i) = atom_x(border_map[i]) + PBCx[i] * xprd;
+    atom_y(nlocal + i) = atom_y(border_map[i]) + PBCy[i] * yprd;
+    atom_z(nlocal + i) = atom_z(border_map[i]) + PBCz[i] * zprd;
+}
+
+/* update coordinates of ghost atoms */
+/* uses mapping created in setupPbc */
+void updatePbc_cuda(Atom *atom, Atom *c_atom, Parameter *param, bool doReneighbor) {
+    const int num_threads_per_block = get_num_threads();
+
+    if (doReneighbor){
+        c_atom->Natoms = atom->Natoms;
+        c_atom->Nlocal = atom->Nlocal;
+        c_atom->Nghost = atom->Nghost;
+        c_atom->ntypes = atom->ntypes;
+
+        if (atom->Nmax > c_atom->Nmax){ // the number of ghost atoms has increased -> more space is needed
+            c_atom->Nmax = atom->Nmax;
+            if(c_atom->x != NULL){ cudaFree(c_atom->x); }
+            if(c_atom->type != NULL){ cudaFree(c_atom->type); }
+            checkCUDAError( "updatePbc c_atom->x malloc", cudaMalloc((void**)&(c_atom->x), sizeof(MD_FLOAT) * atom->Nmax * 3) );
+            checkCUDAError( "updatePbc c_atom->type malloc", cudaMalloc((void**)&(c_atom->type), sizeof(int) * atom->Nmax) );
+        }
+        // TODO if the sort is reactivated the atom->vx needs to be copied to GPU as well
+        checkCUDAError( "updatePbc c_atom->x memcpy", cudaMemcpy(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
+        checkCUDAError( "updatePbc c_atom->type memcpy", cudaMemcpy(c_atom->type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) );
+
+        if(c_NmaxGhost < NmaxGhost){
+            c_NmaxGhost = NmaxGhost;
+            if(c_PBCx != NULL){ cudaFree(c_PBCx); }
+            if(c_PBCy != NULL){ cudaFree(c_PBCy); }
+            if(c_PBCz != NULL){ cudaFree(c_PBCz); }
+            if(c_atom->border_map != NULL){ cudaFree(c_atom->border_map); }
+            checkCUDAError( "updatePbc c_PBCx malloc", cudaMalloc((void**)&c_PBCx, NmaxGhost * sizeof(int)) );
+            checkCUDAError( "updatePbc c_PBCy malloc", cudaMalloc((void**)&c_PBCy, NmaxGhost * sizeof(int)) );
+            checkCUDAError( "updatePbc c_PBCz malloc", cudaMalloc((void**)&c_PBCz, NmaxGhost * sizeof(int)) );
+            checkCUDAError( "updatePbc c_atom->border_map malloc", cudaMalloc((void**)&(c_atom->border_map), NmaxGhost * sizeof(int)) );
+        }
+        checkCUDAError( "updatePbc c_PBCx memcpy", cudaMemcpy(c_PBCx, PBCx, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
+        checkCUDAError( "updatePbc c_PBCy memcpy", cudaMemcpy(c_PBCy, PBCy, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
+        checkCUDAError( "updatePbc c_PBCz memcpy", cudaMemcpy(c_PBCz, PBCz, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
+        checkCUDAError( "updatePbc c_atom->border_map memcpy", cudaMemcpy(c_atom->border_map, atom->border_map, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
+    }
+
+    MD_FLOAT xprd = param->xprd;
+    MD_FLOAT yprd = param->yprd;
+    MD_FLOAT zprd = param->zprd;
+
+    const int num_blocks = ceil((float)atom->Nghost / (float)num_threads_per_block);
+
+    /*__global__ void computePbcUpdate(Atom a, int* PBCx, int* PBCy, int* PBCz,
+     *                                                          MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd)
+     * */
+    computePbcUpdate<<<num_blocks, num_threads_per_block>>>(*c_atom, c_PBCx, c_PBCy, c_PBCz, xprd, yprd, zprd);
+    checkCUDAError( "PeekAtLastError UpdatePbc", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync UpdatePbc", cudaDeviceSynchronize() );
+}
+
+void updateAtomsPbc_cuda(Atom* atom, Atom *c_atom, Parameter *param){
+    const int num_threads_per_block = get_num_threads();
+    MD_FLOAT xprd = param->xprd;
+    MD_FLOAT yprd = param->yprd;
+    MD_FLOAT zprd = param->zprd;
+
+    const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
+    /*void computeAtomsPbcUpdate(Atom a, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd)*/
+    computeAtomsPbcUpdate<<<num_blocks, num_threads_per_block>>>(*c_atom, xprd, yprd, zprd);
+
+    checkCUDAError( "PeekAtLastError UpdateAtomsPbc", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync UpdateAtomsPbc", cudaDeviceSynchronize() );
+
+    checkCUDAError( "updateAtomsPbc position memcpy back", cudaMemcpy(atom->x, c_atom->x, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
+}