Move src directory to lammps

Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
Remove likwid and architecture-specific compilation flags
2022-08-04 17:25:31 +02:00 · 2022-08-04 17:09:17 +02:00 · 2022-07-22 13:55:27 +02:00 · 2022-07-20 23:04:22 +02:00 · 2022-07-20 22:43:57 +02:00 · 2022-07-19 20:38:11 +02:00
39 changed files with 1506 additions and 740 deletions
@@ -27,6 +27,7 @@
 *.so
 *.so.*
 *.dylib
+.DS_Store

 # Executables
 *.exe
@@ -52,6 +53,7 @@ Mkfile.old
 dkms.conf

 # Build directories and executables
+.vscode/
 GCC/
 ICC/
 MDBench-GCC*
@@ -58,6 +58,9 @@ ASM       = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.
 OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s))
 OBJ       = $(filter-out $(BUILD_DIR)/main% $(OVERWRITE),$(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c)))
 OBJ      += $(patsubst $(ASM_DIR)/%.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*.s))
+OBJ		 += $(patsubst $(SRC_DIR)/%.cu, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cu))
+
+
 CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(OPTIONS) $(INCLUDES)

 # $(warning $(OBJ))
@@ -88,6 +91,11 @@ $(BUILD_DIR)/%.o:  %.s
 	$(info ===>  ASSEMBLE  $@)
 	$(Q)$(AS) $< -o $@

+$(BUILD_DIR)/%.o:  %.cu
+	$(info ===>  COMPILE  $@)
+	$(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
+	$(Q)$(CC) $(CPPFLAGS) -MT $@ -MM  $< > $(BUILD_DIR)/$*.d
+
 .PHONY: clean distclean tags info asm

 clean:
@@ -1,5 +1,5 @@
-# Compiler tag (GCC/CLANG/ICC)
-TAG ?= CLANG
+# Compiler tag (GCC/CLANG/ICC/NVCC)
+TAG ?= NVCC
 # Enable likwid (true or false)
 ENABLE_LIKWID ?= false
 # SP or DP
@@ -22,7 +22,7 @@ INDEX_TRACER ?= false
 # Vector width (elements) for index and distance tracer
 VECTOR_WIDTH ?= 8
 # Compute statistics
-COMPUTE_STATS ?= true
+COMPUTE_STATS ?= false

 #Feature options
 OPTIONS =  -DALIGNMENT=64
@@ -0,0 +1,10 @@
+END=32
+for ((i=1;i<=END;i++)); do
+	output=$(eval "likwid-mpirun -np 1 -t $i -m -g FLOPS_DP -omp gnu ./MDBench-GCC -n 50")
+	echo -n "$i,"
+	echo "$output" > "FLOPS_DP/thread_$i.txt"
+done
+
+## likwid perf measurements on testfront1:
+# srun --nodes=1 --exclusive --nodelist=rome1 --time=00:30:00 --export=NONE -c 64 -C hwperf --pty /bin/bash -l
+# likwid-mpirun -np 1 -t 32 -m -g MEM -omp gnu -d ./MDBench-GCC
@@ -0,0 +1,6 @@
+#!/bin/bash
+for i in $(seq 1 32); do
+	echo "$i"
+	export "OMP_NUM_THREADS=$i"
+	./MDBench-GCC -n 50 | grep "Performance"
+done
@@ -0,0 +1,5 @@
+END=32
+for ((i=16;i<=END;i++)); do
+	export NUM_THREADS=$i
+	$(eval "ncu --set full -o /home/hpc/rzku/ptfs410h/MD-Bench/log/MG/presentation_2/Resources/GPU/Metrics/threads_$i ./MDBench-NVCC -n 50")
+done
@@ -0,0 +1,6 @@
+END=64
+for ((i=1;i<=END;i*=2)); do
+	output=$(eval "NUM_THREADS=$i ./MDBench-NVCC -n 2000")
+	echo -n "$i,"
+	echo "$output" | grep 'atom updates per second' | sed 's/[^0-9.,]//g' | awk '{print $1"e6"}'
+done
@@ -0,0 +1,16 @@
+CC  = nvcc
+LINKER = $(CC)
+
+ANSI_CFLAGS  = -ansi
+ANSI_CFLAGS += -std=c99
+ANSI_CFLAGS += -pedantic
+ANSI_CFLAGS += -Wextra
+
+# CFLAGS   = -O0 -g  -std=c99 -fargument-noalias
+#CFLAGS   = -O3 -g -arch=sm_61 # -fopenmp
+CFLAGS   = -O3 -g # -fopenmp
+ASFLAGS  =  -masm=intel
+LFLAGS   =
+DEFINES  = -D_GNU_SOURCE #-DLIKWID_PERFMON
+INCLUDES = $(LIKWID_INC)
+LIBS     = -lm $(LIKWID_LIB) -lcuda -lcudart #-llikwid
@@ -25,11 +25,29 @@
 #include <string.h>
 #include <errno.h>

+#include <cuda_runtime.h>
+
+void checkCUDAError(const char *msg, cudaError_t err)
+{
+    if (err != cudaSuccess)
+    {
+        //print a human readable error message
+        printf("[CUDA ERROR %s]: %s\r\n", msg, cudaGetErrorString(err));
+        exit(-1);
+    }
+}
+
+
 void* allocate (int alignment, size_t bytesize)
 {
    int errorCode;
    void* ptr;

+    checkCUDAError( "allocate", cudaMallocHost((void**)&ptr, bytesize) );
+
+    return ptr;
+
+    /*
    errorCode =  posix_memalign(&ptr, alignment, bytesize);

    if (errorCode) {
@@ -51,6 +69,7 @@ void* allocate (int alignment, size_t bytesize)
    }

    return ptr;
+    */
 }

 void* reallocate (
@@ -63,7 +82,7 @@ void* reallocate (

    if(ptr != NULL) {
        memcpy(newarray, ptr, oldBytesize);
-        free(ptr);
+        cudaFreeHost(ptr);
    }

    return newarray;
@@ -30,6 +30,9 @@
 #include <allocate.h>
 #include <util.h>

+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+
 #define DELTA 20000

 void initAtom(Atom *atom)
@@ -57,10 +60,10 @@ void createAtom(Atom *atom, Parameter *param)
    atom->Natoms = 4 * param->nx * param->ny * param->nz;
    atom->Nlocal = 0;
    atom->ntypes = param->ntypes;
-    atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
-    atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
-    atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
-    atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    checkCUDAError( "atom->epsilon cudaMallocHost", cudaMallocHost((void**)&(atom->epsilon), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    checkCUDAError( "atom->sigma6 cudaMallocHost", cudaMallocHost((void**)&(atom->sigma6), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    checkCUDAError( "atom->cutforcesq cudaMallocHost", cudaMallocHost((void**)&(atom->cutforcesq), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    checkCUDAError( "atom->cutneighsq cudaMallocHost", cudaMallocHost((void**)&(atom->cutneighsq), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
    for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
        atom->epsilon[i] = param->epsilon;
        atom->sigma6[i] = param->sigma6;
@@ -134,9 +137,9 @@ void createAtom(Atom *atom, Parameter *param)
                atom_x(atom->Nlocal) = xtmp;
                atom_y(atom->Nlocal) = ytmp;
                atom_z(atom->Nlocal) = ztmp;
-                atom->vx[atom->Nlocal] = vxtmp;
-                atom->vy[atom->Nlocal] = vytmp;
-                atom->vz[atom->Nlocal] = vztmp;
+                atom_vx(atom->Nlocal) = vxtmp;
+                atom_vy(atom->Nlocal) = vytmp;
+                atom_vz(atom->Nlocal) = vztmp;
                atom->type[atom->Nlocal] = rand() % atom->ntypes;
                atom->Nlocal++;
            }
@@ -159,16 +162,24 @@ void growAtom(Atom *atom)

    #ifdef AOS
    atom->x  = (MD_FLOAT*) reallocate(atom->x,  ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
+
+    atom->fx = (MD_FLOAT*) reallocate(atom->fx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
+
+    atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
    #else
    atom->x  = (MD_FLOAT*) reallocate(atom->x,  ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
    atom->y  = (MD_FLOAT*) reallocate(atom->y,  ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
    atom->z  = (MD_FLOAT*) reallocate(atom->z,  ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
-    #endif
-    atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
-    atom->vy = (MD_FLOAT*) reallocate(atom->vy, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
-    atom->vz = (MD_FLOAT*) reallocate(atom->vz, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+
    atom->fx = (MD_FLOAT*) reallocate(atom->fx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
    atom->fy = (MD_FLOAT*) reallocate(atom->fy, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
    atom->fz = (MD_FLOAT*) reallocate(atom->fz, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+
+    atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+    atom->vy = (MD_FLOAT*) reallocate(atom->vy, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+    atom->vz = (MD_FLOAT*) reallocate(atom->vz, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+
+    #endif
+
    atom->type = (int *) reallocate(atom->type, ALIGNMENT, atom->Nmax * sizeof(int), nold * sizeof(int));
 }
@@ -0,0 +1,212 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <cuda_profiler_api.h>
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+
+extern "C" {
+    #include <likwid-marker.h>
+
+    #include <timing.h>
+    #include <neighbor.h>
+    #include <parameter.h>
+    #include <atom.h>
+    #include <allocate.h>
+}
+
+// cuda kernel
+__global__ void calc_force(
+    Atom a,
+    MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon,
+    int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh) {
+
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if( i >= Nlocal ) {
+        return;
+    }
+
+    Atom *atom = &a;
+
+    const int numneighs = neigh_numneigh[i];
+
+    MD_FLOAT xtmp = atom_x(i);
+    MD_FLOAT ytmp = atom_y(i);
+    MD_FLOAT ztmp = atom_z(i);
+
+    MD_FLOAT fix = 0;
+    MD_FLOAT fiy = 0;
+    MD_FLOAT fiz = 0;
+
+    for(int k = 0; k < numneighs; k++) {
+        int j = neigh_neighbors[atom->Nlocal * k + i];
+        MD_FLOAT delx = xtmp - atom_x(j);
+        MD_FLOAT dely = ytmp - atom_y(j);
+        MD_FLOAT delz = ztmp - atom_z(j);
+        MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+
+#ifdef EXPLICIT_TYPES
+        const int type_j = atom->type[j];
+        const int type_ij = type_i * atom->ntypes + type_j;
+        const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
+        const MD_FLOAT sigma6 = atom->sigma6[type_ij];
+        const MD_FLOAT epsilon = atom->epsilon[type_ij];
+#endif
+
+        if(rsq < cutforcesq) {
+            MD_FLOAT sr2 = 1.0 / rsq;
+            MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
+            MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
+            fix += delx * force;
+            fiy += dely * force;
+            fiz += delz * force;
+        }
+    }
+
+    atom_fx(i) = fix;
+    atom_fy(i) = fiy;
+    atom_fz(i) = fiz;
+}
+
+__global__ void kernel_initial_integrate(MD_FLOAT dtforce, MD_FLOAT dt, int Nlocal, Atom a) {
+
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if( i >= Nlocal ) {
+        return;
+    }
+
+    Atom *atom = &a;
+
+    atom_vx(i) += dtforce * atom_fx(i);
+    atom_vy(i) += dtforce * atom_fy(i);
+    atom_vz(i) += dtforce * atom_fz(i);
+    atom_x(i) = atom_x(i) + dt * atom_vx(i);
+    atom_y(i) = atom_y(i) + dt * atom_vy(i);
+    atom_z(i) = atom_z(i) + dt * atom_vz(i);
+}
+
+__global__ void kernel_final_integrate(MD_FLOAT dtforce, int Nlocal, Atom a) {
+
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if( i >= Nlocal ) {
+        return;
+    }
+
+    Atom *atom = &a;
+
+    atom_vx(i) += dtforce * atom_fx(i);
+    atom_vy(i) += dtforce * atom_fy(i);
+    atom_vz(i) += dtforce * atom_fz(i);
+}
+
+extern "C" {
+
+
+
+void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom, const int num_threads_per_block) {
+
+    const int Nlocal = atom->Nlocal;
+    const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
+
+    kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, *c_atom);
+
+    checkCUDAError( "PeekAtLastError FinalIntegrate", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync FinalIntegrate", cudaDeviceSynchronize() );
+
+    if(doReneighbour) {
+        checkCUDAError( "FinalIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom->vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
+    }
+}
+
+void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom, const int num_threads_per_block) {
+
+    const int Nlocal = atom->Nlocal;
+    const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
+
+    kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, *c_atom);
+
+    checkCUDAError( "PeekAtLastError InitialIntegrate", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync InitialIntegrate", cudaDeviceSynchronize() );
+
+    if(doReneighbour) {
+        checkCUDAError( "InitialIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom->vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
+    }
+}
+
+double computeForce(
+        bool reneighbourHappenend,
+        Parameter *param,
+        Atom *atom,
+        Neighbor *neighbor,
+        Atom *c_atom,
+        Neighbor *c_neighbor,
+        int num_threads_per_block
+        )
+{
+    int Nlocal = atom->Nlocal;
+#ifndef EXPLICIT_TYPES
+    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
+    MD_FLOAT sigma6 = param->sigma6;
+    MD_FLOAT epsilon = param->epsilon;
+#endif
+
+    /*
+    int nDevices;
+    cudaGetDeviceCount(&nDevices);
+    size_t free, total;
+    for(int i = 0; i < nDevices; ++i) {
+        cudaMemGetInfo( &free, &total );
+        cudaDeviceProp prop;
+        cudaGetDeviceProperties(&prop, i);
+        printf("DEVICE %d/%d NAME: %s\r\n with %ld MB/%ld MB memory used", i + 1, nDevices, prop.name, free / 1024 / 1024, total / 1024 / 1024);
+    }
+    */
+
+
+    // HINT: Run with cuda-memcheck ./MDBench-NVCC in case of error
+
+    // checkCUDAError( "c_atom->fx memset", cudaMemset(c_atom->fx, 0, sizeof(MD_FLOAT) * Nlocal * 3) );
+
+    cudaProfilerStart();
+
+    const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
+
+    double S = getTimeStamp();
+    LIKWID_MARKER_START("force");
+
+    calc_force <<< num_blocks, num_threads_per_block >>> (*c_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, c_neighbor->neighbors, c_neighbor->numneigh);
+
+    checkCUDAError( "PeekAtLastError ComputeForce", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync ComputeForce", cudaDeviceSynchronize() );
+
+    cudaProfilerStop();
+
+    LIKWID_MARKER_STOP("force");
+    double E = getTimeStamp();
+
+    return E-S;
+}
+}
@@ -22,8 +22,12 @@
 */
 #include <stdlib.h>

+#include <cuda_runtime.h>
+
 #ifndef __ALLOCATE_H_
 #define __ALLOCATE_H_
 extern void* allocate (int alignment, size_t bytesize);
 extern void* reallocate (void* ptr, int alignment, size_t newBytesize, size_t oldBytesize);
+
+extern void checkCUDAError(const char *msg, cudaError_t err);
 #endif
@@ -45,14 +45,34 @@ extern void growAtom(Atom*);

 #ifdef AOS
 #define POS_DATA_LAYOUT     "AoS"
+
 #define atom_x(i)           atom->x[(i) * 3 + 0]
 #define atom_y(i)           atom->x[(i) * 3 + 1]
 #define atom_z(i)           atom->x[(i) * 3 + 2]
+
+#define atom_fx(i)          atom->fx[(i) * 3 + 0]
+#define atom_fy(i)          atom->fx[(i) * 3 + 1]
+#define atom_fz(i)          atom->fx[(i) * 3 + 2]
+
+#define atom_vx(i)          atom->vx[(i) * 3 + 0]
+#define atom_vy(i)          atom->vx[(i) * 3 + 1]
+#define atom_vz(i)          atom->vx[(i) * 3 + 2]
+
 #else
 #define POS_DATA_LAYOUT     "SoA"
+
 #define atom_x(i)           atom->x[i]
 #define atom_y(i)           atom->y[i]
 #define atom_z(i)           atom->z[i]
+
+#define atom_fx(i)           atom->fx[i]
+#define atom_fy(i)           atom->fy[i]
+#define atom_fz(i)           atom->fz[i]
+
+#define atom_vx(i)           atom->vx[i]
+#define atom_vy(i)           atom->vy[i]
+#define atom_vz(i)           atom->vz[i]
+
 #endif

 #endif
@@ -33,9 +33,26 @@ typedef struct {
    int* numneigh;
 } Neighbor;

+typedef struct {
+    MD_FLOAT xprd; MD_FLOAT yprd; MD_FLOAT zprd;
+    MD_FLOAT bininvx; MD_FLOAT bininvy; MD_FLOAT bininvz;
+    int mbinxlo; int mbinylo; int mbinzlo;
+    int nbinx; int nbiny; int nbinz;
+    int mbinx; int mbiny; int mbinz;
+} Neighbor_params;
+
+typedef struct {
+    int* bincount;
+    int* bins;
+    int mbins;
+    int atoms_per_bin;
+} Binning;
+
 extern void initNeighbor(Neighbor*, Parameter*);
 extern void setupNeighbor();
 extern void binatoms(Atom*);
 extern void buildNeighbor(Atom*, Neighbor*);
 extern void sortAtom(Atom*);
+extern void binatoms_cuda(Atom*, Binning*, int*, Neighbor_params*, const int);
+extern void buildNeighbor_cuda(Atom*, Neighbor*, Atom*, Neighbor*, const int, double*);
 #endif
@@ -25,8 +25,10 @@

 #ifndef __PBC_H_
 #define __PBC_H_
-extern void initPbc();
+extern void initPbc(Atom*);
 extern void updatePbc(Atom*, Parameter*);
+extern void updatePbc_cuda(Atom*, Parameter*, Atom*, bool, const int);
 extern void updateAtomsPbc(Atom*, Parameter*);
+extern void updateAtomsPbc_cuda(Atom*, Parameter*, Atom*, const int);
 extern void setupPbc(Atom*, Parameter*);
 #endif
@@ -5,6 +5,11 @@ typedef enum {
    TOTAL = 0,
    NEIGH,
    FORCE,
+    NEIGH_UPDATE_ATOMS_PBC,
+    NEIGH_SETUP_PBC,
+    NEIGH_UPDATE_PBC,
+    NEIGH_BINATOMS,
+    NEIGH_BUILD_LISTS,
    NUMTIMER
 } timertype;

@@ -23,6 +23,7 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
+#include <stdbool.h>
 #include <unistd.h>
 #include <limits.h>
 #include <math.h>
@@ -44,7 +45,14 @@

 #define HLINE "----------------------------------------------------------------------------\n"

-extern double computeForce(Parameter*, Atom*, Neighbor*);
+extern void cuda_final_integrate(bool doReneighbour, Parameter *param,
+                                 Atom *atom, Atom *c_atom,
+                                 const int num_threads_per_block);
+extern void cuda_initial_integrate(bool doReneighbour, Parameter *param,
+                                   Atom *atom, Atom *c_atom,
+                                   const int num_threads_per_block);
+
+extern double computeForce(bool, Parameter*, Atom*, Neighbor*, Atom*, Neighbor*, const int);
 extern double computeForceTracing(Parameter*, Atom*, Neighbor*, Stats*, int, int);
 extern double computeForceEam(Eam* eam, Parameter*, Atom *atom, Neighbor *neighbor, Stats *stats, int first_exec, int timestep);

@@ -72,12 +80,52 @@ void init(Parameter *param)
    param->proc_freq = 2.4;
 }

+void initCudaAtom(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
+
+    c_atom->Natoms = atom->Natoms;
+    c_atom->Nlocal = atom->Nlocal;
+    c_atom->Nghost = atom->Nghost;
+    c_atom->Nmax = atom->Nmax;
+    c_atom->ntypes = atom->ntypes;
+
+    c_atom->border_map = NULL;
+
+    const int Nlocal = atom->Nlocal;
+
+    checkCUDAError( "c_atom->x malloc", cudaMalloc((void**)&(c_atom->x), sizeof(MD_FLOAT) * atom->Nmax * 3) );
+    checkCUDAError( "c_atom->x memcpy", cudaMemcpy(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
+
+    checkCUDAError( "c_atom->fx malloc", cudaMalloc((void**)&(c_atom->fx), sizeof(MD_FLOAT) * Nlocal * 3) );
+
+    checkCUDAError( "c_atom->vx malloc", cudaMalloc((void**)&(c_atom->vx), sizeof(MD_FLOAT) * Nlocal * 3) );
+    checkCUDAError( "c_atom->vx memcpy", cudaMemcpy(c_atom->vx, atom->vx, sizeof(MD_FLOAT) * Nlocal * 3, cudaMemcpyHostToDevice) );
+
+    checkCUDAError( "c_atom->type malloc", cudaMalloc((void**)&(c_atom->type), sizeof(int) * atom->Nmax) );
+    checkCUDAError( "c_atom->epsilon malloc", cudaMalloc((void**)&(c_atom->epsilon), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
+    checkCUDAError( "c_atom->sigma6 malloc", cudaMalloc((void**)&(c_atom->sigma6), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
+    checkCUDAError( "c_atom->cutforcesq malloc", cudaMalloc((void**)&(c_atom->cutforcesq), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
+
+    checkCUDAError( "c_neighbor->neighbors malloc", cudaMalloc((void**)&c_neighbor->neighbors, sizeof(int) * Nlocal * neighbor->maxneighs) );
+    checkCUDAError( "c_neighbor->numneigh malloc", cudaMalloc((void**)&c_neighbor->numneigh, sizeof(int) * Nlocal) );
+
+    checkCUDAError( "c_atom->type memcpy", cudaMemcpy(c_atom->type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) );
+    checkCUDAError( "c_atom->sigma6 memcpy", cudaMemcpy(c_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
+    checkCUDAError( "c_atom->epsilon memcpy", cudaMemcpy(c_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
+
+    checkCUDAError( "c_atom->cutforcesq memcpy", cudaMemcpy(c_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
+}
+
+
 double setup(
        Parameter *param,
        Eam *eam,
        Atom *atom,
        Neighbor *neighbor,
-        Stats *stats)
+        Atom *c_atom,
+        Neighbor *c_neighbor,
+        Stats *stats,
+        const int num_threads_per_block,
+	double* timers)
 {
    if(param->force_field == FF_EAM) { initEam(eam, param); }
    double S, E;
@@ -96,57 +144,69 @@ double setup(
    setupThermo(param, atom->Natoms);
    adjustThermo(param, atom);
    setupPbc(atom, param);
-    updatePbc(atom, param);
-    buildNeighbor(atom, neighbor);
+    initCudaAtom(atom, neighbor, c_atom, c_neighbor);
+    updatePbc_cuda(atom, param, c_atom, true, num_threads_per_block);
+    buildNeighbor_cuda(atom, neighbor, c_atom, c_neighbor, num_threads_per_block, timers);
    E = getTimeStamp();

+
    return E-S;
 }

 double reneighbour(
        Parameter *param,
        Atom *atom,
-        Neighbor *neighbor)
+        Neighbor *neighbor,
+        Atom *c_atom,
+        Neighbor *c_neighbor,
+        const int num_threads_per_block,
+	double* timers)
 {
-    double S, E;
+    double S, E, beforeEvent, afterEvent;

    S = getTimeStamp();
+    beforeEvent = S;
    LIKWID_MARKER_START("reneighbour");
-    updateAtomsPbc(atom, param);
+    updateAtomsPbc_cuda(atom, param, c_atom, num_threads_per_block);
+    afterEvent = getTimeStamp();
+    timers[NEIGH_UPDATE_ATOMS_PBC] += afterEvent - beforeEvent;
+    beforeEvent = afterEvent;
    setupPbc(atom, param);
-    updatePbc(atom, param);
+    afterEvent = getTimeStamp();
+    timers[NEIGH_SETUP_PBC] += afterEvent - beforeEvent;
+    beforeEvent = afterEvent;
+    updatePbc_cuda(atom, param, c_atom, true, num_threads_per_block);
+    afterEvent = getTimeStamp();
+    timers[NEIGH_UPDATE_PBC] += afterEvent - beforeEvent;
+    beforeEvent = afterEvent;
    //sortAtom(atom);
-    buildNeighbor(atom, neighbor);
+    buildNeighbor_cuda(atom, neighbor, c_atom, c_neighbor, num_threads_per_block, timers);
    LIKWID_MARKER_STOP("reneighbour");
    E = getTimeStamp();
+    afterEvent = E;
+    timers[NEIGH_BUILD_LISTS] += afterEvent - beforeEvent;

    return E-S;
 }

 void initialIntegrate(Parameter *param, Atom *atom)
 {
-    MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
-    MD_FLOAT* vx = atom->vx; MD_FLOAT* vy = atom->vy; MD_FLOAT* vz = atom->vz;
-
    for(int i = 0; i < atom->Nlocal; i++) {
-        vx[i] += param->dtforce * fx[i];
-        vy[i] += param->dtforce * fy[i];
-        vz[i] += param->dtforce * fz[i];
-        atom_x(i) = atom_x(i) + param->dt * vx[i];
-        atom_y(i) = atom_y(i) + param->dt * vy[i];
-        atom_z(i) = atom_z(i) + param->dt * vz[i];
+        atom_vx(i) += param->dtforce * atom_fx(i);
+        atom_vy(i) += param->dtforce * atom_fy(i);
+        atom_vz(i) += param->dtforce * atom_fz(i);
+        atom_x(i) = atom_x(i) + param->dt * atom_vx(i);
+        atom_y(i) = atom_y(i) + param->dt * atom_vy(i);
+        atom_z(i) = atom_z(i) + param->dt * atom_vz(i);
    }
 }

 void finalIntegrate(Parameter *param, Atom *atom)
 {
-    MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
-    MD_FLOAT* vx = atom->vx; MD_FLOAT* vy = atom->vy; MD_FLOAT* vz = atom->vz;
-
    for(int i = 0; i < atom->Nlocal; i++) {
-        vx[i] += param->dtforce * fx[i];
-        vy[i] += param->dtforce * fy[i];
-        vz[i] += param->dtforce * fz[i];
+        atom_vx(i) += param->dtforce * atom_fx(i);
+        atom_vy(i) += param->dtforce * atom_fy(i);
+        atom_vz(i) += param->dtforce * atom_fz(i);
    }
 }

@@ -176,6 +236,19 @@ const char* ff2str(int ff)
    return "invalid";
 }

+int get_num_threads() {
+
+    const char *num_threads_env = getenv("NUM_THREADS");
+    int num_threads = 0;
+    if(num_threads_env == 0)
+        num_threads = 32;
+    else {
+        num_threads = atoi(num_threads_env);
+    }
+
+    return num_threads;
+}
+
 int main(int argc, char** argv)
 {
    double timer[NUMTIMER];
@@ -184,6 +257,8 @@ int main(int argc, char** argv)
    Neighbor neighbor;
    Stats stats;
    Parameter param;
+    Atom c_atom;
+    Neighbor c_neighbor;

    LIKWID_MARKER_INIT;
 #pragma omp parallel
@@ -254,7 +329,10 @@ int main(int argc, char** argv)
        }
    }

-    setup(&param, &eam, &atom, &neighbor, &stats);
+    // this should be multiple of 32 as operations are performed at the level of warps
+    const int num_threads_per_block = get_num_threads();
+
+    setup(&param, &eam, &atom, &neighbor, &c_atom, &c_neighbor, &stats, num_threads_per_block, (double*) &timer);
    computeThermo(0, &param, &atom);
    if(param.force_field == FF_EAM) {
        computeForceEam(&eam, &param, &atom, &neighbor, &stats, 1, 0);
@@ -262,25 +340,36 @@ int main(int argc, char** argv)
 #if defined(MEM_TRACER) || defined(INDEX_TRACER) || defined(COMPUTE_STATS)
        computeForceTracing(&param, &atom, &neighbor, &stats, 1, 0);
 #else
-        computeForce(&param, &atom, &neighbor);
+        computeForce(true, &param, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block);
 #endif
    }

    timer[FORCE] = 0.0;
    timer[NEIGH] = 0.0;
    timer[TOTAL] = getTimeStamp();
+    timer[NEIGH_UPDATE_ATOMS_PBC] = 0.0;
+    timer[NEIGH_SETUP_PBC] = 0.0;
+    timer[NEIGH_UPDATE_PBC] = 0.0;
+    timer[NEIGH_BINATOMS] = 0.0;
+    timer[NEIGH_BUILD_LISTS] = 0.0;

    if(param.vtk_file != NULL) {
        write_atoms_to_vtk_file(param.vtk_file, &atom, 0);
    }

    for(int n = 0; n < param.ntimes; n++) {
-        initialIntegrate(&param, &atom);

-        if((n + 1) % param.every) {
-            updatePbc(&atom, &param);
+        const bool doReneighbour = (n + 1) % param.every == 0;
+
+        cuda_initial_integrate(doReneighbour, &param, &atom, &c_atom, num_threads_per_block);
+
+        if(doReneighbour) {
+            timer[NEIGH] += reneighbour(&param, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block, (double*) &timer);
        } else {
-            timer[NEIGH] += reneighbour(&param, &atom, &neighbor);
+	    double before = getTimeStamp();
+            updatePbc_cuda(&atom, &param, &c_atom, false, num_threads_per_block);
+	    double after = getTimeStamp();
+	    timer[NEIGH_UPDATE_PBC] += after - before;
        }

        if(param.force_field == FF_EAM) {
@@ -289,12 +378,14 @@ int main(int argc, char** argv)
 #if defined(MEM_TRACER) || defined(INDEX_TRACER) || defined(COMPUTE_STATS)
            timer[FORCE] += computeForceTracing(&param, &atom, &neighbor, &stats, 0, n + 1);
 #else
-            timer[FORCE] += computeForce(&param, &atom, &neighbor);
+            timer[FORCE] += computeForce(doReneighbour, &param, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block);
 #endif
        }
-        finalIntegrate(&param, &atom);
+
+        cuda_final_integrate(doReneighbour, &param, &atom, &c_atom, num_threads_per_block);

        if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
+	    checkCUDAError("computeThermo atom->x memcpy back", cudaMemcpy(atom.x, c_atom.x, atom.Nmax * sizeof(MD_FLOAT) * 3, cudaMemcpyDeviceToHost) );
            computeThermo(n + 1, &param, &atom);
        }

@@ -303,6 +394,7 @@ int main(int argc, char** argv)
        }
    }

+    timer[NEIGH_BUILD_LISTS] -= timer[NEIGH_BINATOMS];
    timer[TOTAL] = getTimeStamp() - timer[TOTAL];
    computeThermo(-1, &param, &atom);

@@ -316,11 +408,15 @@ int main(int argc, char** argv)
 #endif
    printf(HLINE);
    printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
-    printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
-            timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]);
+    printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs   NEIGH_TIMERS: UPD_AT: %.2fs SETUP_PBC %.2fs UPDATE_PBC %.2fs BINATOMS %.2fs BUILD_NEIGHBOR %.2fs\n",
+            timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH], timer[NEIGH_UPDATE_ATOMS_PBC], timer[NEIGH_SETUP_PBC], timer[NEIGH_UPDATE_PBC], timer[NEIGH_BINATOMS], timer[NEIGH_BUILD_LISTS]);
    printf(HLINE);
    printf("Performance: %.2f million atom updates per second\n",
            1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
+    double atomUpdatesTotal = (double) atom.Natoms * param.ntimes;
+    printf("Force_perf in millions per sec: %.2f\n", 1e-6 * atomUpdatesTotal / timer[FORCE]);
+    double atomNeighUpdatesTotal = (double) atom.Natoms * param.ntimes / param.every;
+    printf("Neighbor_perf in millions per sec: updateAtomsPbc: %.2f setupPbc: %.2f updatePbc: %.2f binAtoms: %.2f buildNeighbor_wo_binning: %.2f\n", 1e-6 * atomNeighUpdatesTotal / timer[NEIGH_UPDATE_ATOMS_PBC], 1e-6 * atomNeighUpdatesTotal / timer[NEIGH_SETUP_PBC], 1e-6 * atomUpdatesTotal / timer[NEIGH_UPDATE_PBC], 1e-6 * atomNeighUpdatesTotal / timer[NEIGH_BINATOMS], 1e-6 * atomNeighUpdatesTotal / timer[NEIGH_BUILD_LISTS]);
 #ifdef COMPUTE_STATS
    displayStatistics(&atom, &param, &stats, timer);
 #endif
@@ -0,0 +1,719 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <cuda_profiler_api.h>
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+
+extern "C" {
+
+#include <neighbor.h>
+#include <parameter.h>
+#include <allocate.h>
+#include <atom.h>
+#include <timing.h>
+#include <timers.h>
+
+#define SMALL 1.0e-6
+#define FACTOR 0.999
+}
+
+__device__ int coord2bin_device(MD_FLOAT xin, MD_FLOAT yin, MD_FLOAT zin, 
+                                Neighbor_params np)
+{
+    int ix, iy, iz;
+
+    if(xin >= np.xprd) {
+        ix = (int)((xin - np.xprd) * np.bininvx) + np.nbinx - np.mbinxlo;
+    } else if(xin >= 0.0) {
+        ix = (int)(xin * np.bininvx) - np.mbinxlo;
+    } else {
+        ix = (int)(xin * np.bininvx) - np.mbinxlo - 1;
+    }
+
+    if(yin >= np.yprd) {
+        iy = (int)((yin - np.yprd) * np.bininvy) + np.nbiny - np.mbinylo;
+    } else if(yin >= 0.0) {
+        iy = (int)(yin * np.bininvy) - np.mbinylo;
+    } else {
+        iy = (int)(yin * np.bininvy) - np.mbinylo - 1;
+    }
+
+    if(zin >= np.zprd) {
+        iz = (int)((zin - np.zprd) * np.bininvz) + np.nbinz - np.mbinzlo;
+    } else if(zin >= 0.0) {
+        iz = (int)(zin * np.bininvz) - np.mbinzlo;
+    } else {
+        iz = (int)(zin * np.bininvz) - np.mbinzlo - 1;
+    }
+
+    return (iz * np.mbiny * np.mbinx + iy * np.mbinx + ix + 1);
+}
+
+/* sorts the contents of a bin to make it comparable to the CPU version */
+/* uses bubble sort since atoms per bin should be relatively small and can be done in situ */
+__global__ void sort_bin_contents_kernel(int* bincount, int* bins, int mbins, int atoms_per_bin){
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= mbins){
+        return;
+    }
+
+    int atoms_in_bin = bincount[i];
+    int* bin_ptr = &bins[i * atoms_per_bin];
+    int sorted;
+    do {
+        sorted = 1;
+        int tmp;
+        for(int index = 0; index < atoms_in_bin - 1; index++){
+            if (bin_ptr[index] > bin_ptr[index + 1]){
+                tmp = bin_ptr[index];
+                bin_ptr[index] = bin_ptr[index + 1];
+                bin_ptr[index + 1] = tmp;
+                sorted = 0;
+            }
+        }
+    } while (!sorted);
+}
+
+__global__ void binatoms_kernel(Atom a, int* bincount, int* bins, int atoms_per_bin, Neighbor_params np, int *resize_needed){
+    Atom* atom = &a;
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int nall = atom->Nlocal + atom->Nghost;
+    if(i >= nall){
+        return;
+    }
+    
+    MD_FLOAT x = atom_x(i);
+    MD_FLOAT y = atom_y(i);
+    MD_FLOAT z = atom_z(i);
+    int ibin = coord2bin_device(x, y, z, np);
+
+    int ac = atomicAdd(&bincount[ibin], 1);
+            
+    if(ac < atoms_per_bin){
+        bins[ibin * atoms_per_bin + ac] = i;
+    } else {
+        atomicMax(resize_needed, ac);
+    }
+}
+
+__global__ void compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np, int nstencil, int* stencil,
+                                     int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq){
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int Nlocal = a.Nlocal;
+    if( i >= Nlocal ) {
+        return;
+    }
+    
+    Atom *atom = &a;
+    Neighbor *neighbor = &neigh;
+    
+    int* neighptr = &(neighbor->neighbors[i]);
+    int n = 0;
+    MD_FLOAT xtmp = atom_x(i);
+    MD_FLOAT ytmp = atom_y(i);
+    MD_FLOAT ztmp = atom_z(i);
+    int ibin = coord2bin_device(xtmp, ytmp, ztmp, np);
+#ifdef EXPLICIT_TYPES
+    int type_i = atom->type[i];
+#endif
+    for(int k = 0; k < nstencil; k++) {
+        int jbin = ibin + stencil[k];
+        int* loc_bin = &bins[jbin * atoms_per_bin];
+
+        for(int m = 0; m < bincount[jbin]; m++) {
+            int j = loc_bin[m];
+
+            if ( j == i ){
+                continue;
+            }
+
+            MD_FLOAT delx = xtmp - atom_x(j);
+            MD_FLOAT dely = ytmp - atom_y(j);
+            MD_FLOAT delz = ztmp - atom_z(j);
+            MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+
+#ifdef EXPLICIT_TYPES
+            int type_j = atom->type[j];
+                    const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j];
+#else
+            const MD_FLOAT cutoff = cutneighsq;
+#endif
+
+            if( rsq <= cutoff ) {
+                int idx = atom->Nlocal * n;
+                neighptr[idx] = j;
+                n += 1;
+            }
+        }
+    }
+
+    neighbor->numneigh[i] = n;
+
+    if(n > neighbor->maxneighs) {
+        atomicMax(new_maxneighs, n);
+    }
+}
+
+extern "C" {
+    
+    
+static MD_FLOAT xprd, yprd, zprd;
+static MD_FLOAT bininvx, bininvy, bininvz;
+static int mbinxlo, mbinylo, mbinzlo;
+static int nbinx, nbiny, nbinz;
+static int mbinx, mbiny, mbinz; // n bins in x, y, z
+static int *bincount;
+static int *bins;
+static int mbins; //total number of bins
+static int atoms_per_bin;  // max atoms per bin
+static MD_FLOAT cutneigh;
+static MD_FLOAT cutneighsq;  // neighbor cutoff squared
+static int nmax;
+static int nstencil;      // # of bins in stencil
+static int* stencil;      // stencil list of bin offsets
+static MD_FLOAT binsizex, binsizey, binsizez;
+
+static int* c_stencil = NULL;
+static int* c_resize_needed = NULL;
+static int* c_new_maxneighs = NULL;
+static Binning c_binning{
+        .bincount = NULL,
+        .bins = NULL,
+        .mbins = 0,
+        .atoms_per_bin = 0
+};
+
+
+static int coord2bin(MD_FLOAT, MD_FLOAT , MD_FLOAT);
+static MD_FLOAT bindist(int, int, int);
+
+/* exported subroutines */
+void initNeighbor(Neighbor *neighbor, Parameter *param)
+{
+    MD_FLOAT neighscale = 5.0 / 6.0;
+    xprd = param->nx * param->lattice;
+    yprd = param->ny * param->lattice;
+    zprd = param->nz * param->lattice;
+    cutneigh = param->cutneigh;
+    nbinx = neighscale * param->nx;
+    nbiny = neighscale * param->ny;
+    nbinz = neighscale * param->nz;
+    nmax = 0;
+    atoms_per_bin = 8;
+    stencil = NULL;
+    bins = NULL;
+    bincount = NULL;
+    neighbor->maxneighs = 100;
+    neighbor->numneigh = NULL;
+    neighbor->neighbors = NULL;
+}
+
+void setupNeighbor()
+{
+    MD_FLOAT coord;
+    int mbinxhi, mbinyhi, mbinzhi;
+    int nextx, nexty, nextz;
+    MD_FLOAT xlo = 0.0; MD_FLOAT xhi = xprd;
+    MD_FLOAT ylo = 0.0; MD_FLOAT yhi = yprd;
+    MD_FLOAT zlo = 0.0; MD_FLOAT zhi = zprd;
+
+    cutneighsq = cutneigh * cutneigh;
+    binsizex = xprd / nbinx;
+    binsizey = yprd / nbiny;
+    binsizez = zprd / nbinz;
+    bininvx = 1.0 / binsizex;
+    bininvy = 1.0 / binsizey;
+    bininvz = 1.0 / binsizez;
+
+    coord = xlo - cutneigh - SMALL * xprd;
+    mbinxlo = (int) (coord * bininvx);
+    if (coord < 0.0) {
+        mbinxlo = mbinxlo - 1;
+    }
+    coord = xhi + cutneigh + SMALL * xprd;
+    mbinxhi = (int) (coord * bininvx);
+
+    coord = ylo - cutneigh - SMALL * yprd;
+    mbinylo = (int) (coord * bininvy);
+    if (coord < 0.0) {
+        mbinylo = mbinylo - 1;
+    }
+    coord = yhi + cutneigh + SMALL * yprd;
+    mbinyhi = (int) (coord * bininvy);
+
+    coord = zlo - cutneigh - SMALL * zprd;
+    mbinzlo = (int) (coord * bininvz);
+    if (coord < 0.0) {
+        mbinzlo = mbinzlo - 1;
+    }
+    coord = zhi + cutneigh + SMALL * zprd;
+    mbinzhi = (int) (coord * bininvz);
+
+    mbinxlo = mbinxlo - 1;
+    mbinxhi = mbinxhi + 1;
+    mbinx = mbinxhi - mbinxlo + 1;
+
+    mbinylo = mbinylo - 1;
+    mbinyhi = mbinyhi + 1;
+    mbiny = mbinyhi - mbinylo + 1;
+
+    mbinzlo = mbinzlo - 1;
+    mbinzhi = mbinzhi + 1;
+    mbinz = mbinzhi - mbinzlo + 1;
+
+    nextx = (int) (cutneigh * bininvx);
+    if(nextx * binsizex < FACTOR * cutneigh) nextx++;
+
+    nexty = (int) (cutneigh * bininvy);
+    if(nexty * binsizey < FACTOR * cutneigh) nexty++;
+
+    nextz = (int) (cutneigh * bininvz);
+    if(nextz * binsizez < FACTOR * cutneigh) nextz++;
+
+    if (stencil) {
+        free(stencil);
+    }
+
+    stencil = (int*) malloc(
+            (2 * nextz + 1) * (2 * nexty + 1) * (2 * nextx + 1) * sizeof(int));
+
+    nstencil = 0;
+    int kstart = -nextz;
+
+    for(int k = kstart; k <= nextz; k++) {
+        for(int j = -nexty; j <= nexty; j++) {
+            for(int i = -nextx; i <= nextx; i++) {
+                if(bindist(i, j, k) < cutneighsq) {
+                    stencil[nstencil++] =
+                        k * mbiny * mbinx + j * mbinx + i;
+                }
+            }
+        }
+    }
+
+    mbins = mbinx * mbiny * mbinz;
+
+    if (bincount) {
+        free(bincount);
+    }
+    bincount = (int*) malloc(mbins * sizeof(int));
+
+    if (bins) {
+        free(bins);
+    }
+    bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));
+}
+
+void buildNeighbor(Atom *atom, Neighbor *neighbor)
+{
+    int nall = atom->Nlocal + atom->Nghost;
+
+    /* extend atom arrays if necessary */
+    if(nall > nmax) {
+        nmax = nall;
+        if(neighbor->numneigh) cudaFreeHost(neighbor->numneigh);
+        if(neighbor->neighbors) cudaFreeHost(neighbor->neighbors);
+        checkCUDAError( "buildNeighbor numneigh", cudaMallocHost((void**)&(neighbor->numneigh), nmax * sizeof(int)) );
+        checkCUDAError( "buildNeighbor neighbors", cudaMallocHost((void**)&(neighbor->neighbors), nmax * neighbor->maxneighs * sizeof(int)) );
+        // neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
+        // neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*));
+    }
+
+    /* bin local & ghost atoms */
+    binatoms(atom);
+    int resize = 1;
+
+    /* loop over each atom, storing neighbors */
+    while(resize) {
+        int new_maxneighs = neighbor->maxneighs;
+        resize = 0;
+
+        for(int i = 0; i < atom->Nlocal; i++) {
+            int* neighptr = &(neighbor->neighbors[i]);
+            int n = 0;
+            MD_FLOAT xtmp = atom_x(i);
+            MD_FLOAT ytmp = atom_y(i);
+            MD_FLOAT ztmp = atom_z(i);
+            int ibin = coord2bin(xtmp, ytmp, ztmp);
+            #ifdef EXPLICIT_TYPES
+            int type_i = atom->type[i];
+            #endif
+            for(int k = 0; k < nstencil; k++) {
+                int jbin = ibin + stencil[k];
+                int* loc_bin = &bins[jbin * atoms_per_bin];
+
+                for(int m = 0; m < bincount[jbin]; m++) {
+                    int j = loc_bin[m];
+
+                    if ( j == i ){
+                        continue;
+                    }
+
+                    MD_FLOAT delx = xtmp - atom_x(j);
+                    MD_FLOAT dely = ytmp - atom_y(j);
+                    MD_FLOAT delz = ztmp - atom_z(j);
+                    MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+
+                    #ifdef EXPLICIT_TYPES
+                    int type_j = atom->type[j];
+                    const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j];
+                    #else
+                    const MD_FLOAT cutoff = cutneighsq;
+                    #endif
+
+                    if( rsq <= cutoff ) {
+                        int idx = atom->Nlocal * n;
+                        neighptr[idx] = j;
+                        n += 1;
+                    }
+                }
+            }
+
+            neighbor->numneigh[i] = n;
+
+            if(n >= neighbor->maxneighs) {
+                resize = 1;
+
+                if(n >= new_maxneighs) {
+                    new_maxneighs = n;
+                }
+            }
+        }
+
+        if(resize) {
+            printf("RESIZE %d\n", neighbor->maxneighs);
+            neighbor->maxneighs = new_maxneighs * 1.2;
+            free(neighbor->neighbors);
+            neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
+        }
+    }
+}
+
+/* internal subroutines */
+MD_FLOAT bindist(int i, int j, int k)
+{
+    MD_FLOAT delx, dely, delz;
+
+    if(i > 0) {
+        delx = (i - 1) * binsizex;
+    } else if(i == 0) {
+        delx = 0.0;
+    } else {
+        delx = (i + 1) * binsizex;
+    }
+
+    if(j > 0) {
+        dely = (j - 1) * binsizey;
+    } else if(j == 0) {
+        dely = 0.0;
+    } else {
+        dely = (j + 1) * binsizey;
+    }
+
+    if(k > 0) {
+        delz = (k - 1) * binsizez;
+    } else if(k == 0) {
+        delz = 0.0;
+    } else {
+        delz = (k + 1) * binsizez;
+    }
+
+    return (delx * delx + dely * dely + delz * delz);
+}
+
+int coord2bin(MD_FLOAT xin, MD_FLOAT yin, MD_FLOAT zin)
+{
+    int ix, iy, iz;
+
+    if(xin >= xprd) {
+        ix = (int)((xin - xprd) * bininvx) + nbinx - mbinxlo;
+    } else if(xin >= 0.0) {
+        ix = (int)(xin * bininvx) - mbinxlo;
+    } else {
+        ix = (int)(xin * bininvx) - mbinxlo - 1;
+    }
+
+    if(yin >= yprd) {
+        iy = (int)((yin - yprd) * bininvy) + nbiny - mbinylo;
+    } else if(yin >= 0.0) {
+        iy = (int)(yin * bininvy) - mbinylo;
+    } else {
+        iy = (int)(yin * bininvy) - mbinylo - 1;
+    }
+
+    if(zin >= zprd) {
+        iz = (int)((zin - zprd) * bininvz) + nbinz - mbinzlo;
+    } else if(zin >= 0.0) {
+        iz = (int)(zin * bininvz) - mbinzlo;
+    } else {
+        iz = (int)(zin * bininvz) - mbinzlo - 1;
+    }
+
+    return (iz * mbiny * mbinx + iy * mbinx + ix + 1);
+}
+
+void binatoms(Atom *atom)
+{
+    int nall = atom->Nlocal + atom->Nghost;
+    int resize = 1;
+
+    while(resize > 0) {
+        resize = 0;
+
+        for(int i = 0; i < mbins; i++) {
+            bincount[i] = 0;
+        }
+
+        for(int i = 0; i < nall; i++) {
+            MD_FLOAT x = atom_x(i);
+            MD_FLOAT y = atom_y(i);
+            MD_FLOAT z = atom_z(i);
+            int ibin = coord2bin(x, y, z);
+
+            if(bincount[ibin] < atoms_per_bin) {
+                int ac = bincount[ibin]++;
+                bins[ibin * atoms_per_bin + ac] = i;
+            } else {
+                resize = 1;
+            }
+        }
+
+        if(resize) {
+            free(bins);
+            atoms_per_bin *= 2;
+            bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));
+        }
+    }
+}
+
+void sortAtom(Atom* atom) {
+    binatoms(atom);
+    int Nmax = atom->Nmax;
+    int* binpos = bincount;
+
+    for(int i=1; i<mbins; i++) {
+        binpos[i] += binpos[i-1];
+    }
+
+#ifdef AOS
+    MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
+
+    MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
+#else
+    MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
+    MD_FLOAT* new_y = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
+    MD_FLOAT* new_z = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
+
+    MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
+    MD_FLOAT* new_vy = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
+    MD_FLOAT* new_vz = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
+#endif
+
+    MD_FLOAT* old_x = atom->x; MD_FLOAT* old_y = atom->y; MD_FLOAT* old_z = atom->z;
+    MD_FLOAT* old_vx = atom->vx; MD_FLOAT* old_vy = atom->vy; MD_FLOAT* old_vz = atom->vz;
+
+    for(int mybin = 0; mybin<mbins; mybin++) {
+        int start = mybin>0?binpos[mybin-1]:0;
+        int count = binpos[mybin] - start;
+        for(int k=0; k<count; k++) {
+            int new_i = start + k;
+            int old_i = bins[mybin * atoms_per_bin + k];
+#ifdef AOS
+            new_x[new_i * 3 + 0] = old_x[old_i * 3 + 0];
+            new_x[new_i * 3 + 1] = old_x[old_i * 3 + 1];
+            new_x[new_i * 3 + 2] = old_x[old_i * 3 + 2];
+
+            new_vx[new_i * 3 + 0] = old_vx[old_i * 3 + 0];
+            new_vx[new_i * 3 + 1] = old_vy[old_i * 3 + 1];
+            new_vx[new_i * 3 + 2] = old_vz[old_i * 3 + 2];
+#else
+            new_x[new_i] = old_x[old_i];
+            new_y[new_i] = old_y[old_i];
+            new_z[new_i] = old_z[old_i];
+
+            new_vx[new_i] = old_vx[old_i];
+            new_vy[new_i] = old_vy[old_i];
+            new_vz[new_i] = old_vz[old_i];
+#endif
+
+        }
+    }
+
+    free(atom->x);
+    atom->x = new_x;
+
+    free(atom->vx);
+    atom->vx = new_vx;
+#ifndef AOS
+    free(atom->y);
+    free(atom->z);
+    atom->y = new_y; atom->z = new_z;
+
+    free(atom->vy); free(atom->vz);
+    atom->vy = new_vy; atom->vz = new_vz;
+#endif
+}
+
+void binatoms_cuda(Atom* c_atom, Binning* c_binning, int* c_resize_needed, Neighbor_params *np, const int threads_per_block)
+{
+    int nall = c_atom->Nlocal + c_atom->Nghost;
+    int resize = 1;
+
+    const int num_blocks = ceil((float)nall / (float)threads_per_block);
+
+    while(resize > 0) {
+        resize = 0;
+        checkCUDAError("binatoms_cuda c_binning->bincount memset", cudaMemset(c_binning->bincount, 0, c_binning->mbins * sizeof(int)));
+        checkCUDAError("binatoms_cuda c_resize_needed memset", cudaMemset(c_resize_needed, 0, sizeof(int)) );
+
+        /*binatoms_kernel(Atom a, int* bincount, int* bins, int c_binning->atoms_per_bin, Neighbor_params np, int *resize_needed) */
+        binatoms_kernel<<<num_blocks, threads_per_block>>>(*c_atom, c_binning->bincount, c_binning->bins, c_binning->atoms_per_bin, *np, c_resize_needed);
+
+	    checkCUDAError( "PeekAtLastError binatoms kernel", cudaPeekAtLastError() );
+	    checkCUDAError( "DeviceSync binatoms kernel", cudaDeviceSynchronize() );
+        
+	    checkCUDAError("binatoms_cuda c_resize_needed memcpy back", cudaMemcpy(&resize, c_resize_needed, sizeof(int), cudaMemcpyDeviceToHost) );
+
+        if(resize) {
+            cudaFree(c_binning->bins);
+            c_binning->atoms_per_bin *= 2;
+            checkCUDAError("binatoms_cuda c_binning->bins resize malloc", cudaMalloc(&c_binning->bins, c_binning->mbins * c_binning->atoms_per_bin * sizeof(int)) );
+        }
+    }
+    atoms_per_bin = c_binning->atoms_per_bin;
+    const int sortBlocks = ceil((float)mbins / (float)threads_per_block);
+    /*void sort_bin_contents_kernel(int* bincount, int* bins, int mbins, int atoms_per_bin)*/
+    sort_bin_contents_kernel<<<sortBlocks, threads_per_block>>>(c_binning->bincount, c_binning->bins, c_binning->mbins, c_binning->atoms_per_bin);
+    checkCUDAError( "PeekAtLastError sort_bin_contents kernel", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync sort_bin_contents kernel", cudaDeviceSynchronize() );
+}
+
+void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor, const int num_threads_per_block, double* timers)
+{
+    int nall = atom->Nlocal + atom->Nghost;
+    c_neighbor->maxneighs = neighbor->maxneighs;
+
+    cudaProfilerStart();
+    /* upload stencil */
+    // TODO move all of this initialization into its own method
+    if(c_stencil == NULL){
+        checkCUDAError( "buildNeighbor c_n_stencil malloc", cudaMalloc((void**)&c_stencil, nstencil * sizeof(int)) );
+        checkCUDAError( "buildNeighbor c_n_stencil memcpy", cudaMemcpy(c_stencil, stencil, nstencil * sizeof(int), cudaMemcpyHostToDevice ));
+    }
+
+    if(c_binning.mbins == 0){
+        c_binning.mbins = mbins;
+        c_binning.atoms_per_bin = atoms_per_bin;
+        checkCUDAError( "buildNeighbor c_binning->bincount malloc", cudaMalloc((void**)&(c_binning.bincount), c_binning.mbins * sizeof(int)) );
+        checkCUDAError( "buidlNeighbor c_binning->bins malloc", cudaMalloc((void**)&(c_binning.bins), c_binning.mbins * c_binning.atoms_per_bin * sizeof(int)) );
+    }
+
+    Neighbor_params np{
+            .xprd = xprd,
+            .yprd = yprd,
+            .zprd = zprd,
+            .bininvx = bininvx,
+            .bininvy = bininvy,
+            .bininvz = bininvz,
+            .mbinxlo = mbinxlo,
+            .mbinylo = mbinylo,
+            .mbinzlo = mbinzlo,
+            .nbinx = nbinx,
+            .nbiny = nbiny,
+            .nbinz = nbinz,
+            .mbinx = mbinx,
+            .mbiny = mbiny,
+            .mbinz = mbinz
+    };
+
+    if(c_resize_needed == NULL){
+        checkCUDAError("buildNeighbor c_resize_needed malloc", cudaMalloc((void**)&c_resize_needed, sizeof(int)) );
+    }
+
+    /* bin local & ghost atoms */
+    double beforeBinning = getTimeStamp();
+    binatoms_cuda(c_atom, &c_binning, c_resize_needed, &np, num_threads_per_block);
+    double afterBinning = getTimeStamp();
+    timers[NEIGH_BINATOMS] += afterBinning - beforeBinning;
+
+    if(c_new_maxneighs == NULL){
+        checkCUDAError("c_new_maxneighs malloc", cudaMalloc((void**)&c_new_maxneighs, sizeof(int) ));
+    }
+
+    int resize = 1;
+    
+    /* extend c_neighbor arrays if necessary */
+    if(nall > nmax) {
+        nmax = nall;
+        if(c_neighbor->numneigh) cudaFree(c_neighbor->numneigh);
+        if(c_neighbor->neighbors) cudaFree(c_neighbor->neighbors);
+        checkCUDAError( "buildNeighbor c_numneigh malloc", cudaMalloc((void**)&(c_neighbor->numneigh), nmax * sizeof(int)) );
+        checkCUDAError( "buildNeighbor c_neighbors malloc", cudaMalloc((void**)&(c_neighbor->neighbors), nmax * c_neighbor->maxneighs * sizeof(int)) );
+    }
+
+    /* loop over each atom, storing neighbors */
+    while(resize) {
+        resize = 0;
+
+        checkCUDAError("c_new_maxneighs memset", cudaMemset(c_new_maxneighs, 0, sizeof(int) ));
+
+        // TODO call compute_neigborhood kernel here
+        const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
+        /*compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np, int nstencil, int* stencil,
+                                     int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs)
+         * */
+        compute_neighborhood<<<num_blocks, num_threads_per_block>>>(*c_atom, *c_neighbor,
+                                                                    np, nstencil, c_stencil,
+                                                                    c_binning.bins, c_binning.atoms_per_bin, c_binning.bincount,
+                                                                    c_new_maxneighs,
+								                                    cutneighsq);
+
+	checkCUDAError( "PeekAtLastError ComputeNeighbor", cudaPeekAtLastError() );
+	checkCUDAError( "DeviceSync ComputeNeighbor", cudaDeviceSynchronize() );
+
+        // TODO copy the value of c_new_maxneighs back to host and check if it has been modified
+        int new_maxneighs;
+        checkCUDAError("c_new_maxneighs memcpy back", cudaMemcpy(&new_maxneighs, c_new_maxneighs, sizeof(int), cudaMemcpyDeviceToHost));
+        if (new_maxneighs > c_neighbor->maxneighs){
+            resize = 1;
+        }
+
+        if(resize) {
+            printf("RESIZE %d\n", c_neighbor->maxneighs);
+            c_neighbor->maxneighs = new_maxneighs * 1.2;
+            printf("NEW SIZE %d\n", c_neighbor->maxneighs);
+            cudaFree(c_neighbor->neighbors);
+            checkCUDAError("c_neighbor->neighbors resize malloc",
+                           cudaMalloc((void**)(&c_neighbor->neighbors),
+                                      c_atom->Nmax * c_neighbor->maxneighs * sizeof(int)));
+        }
+
+    }
+    neighbor->maxneighs = c_neighbor->maxneighs;
+
+    cudaProfilerStop();
+}
+}
@@ -0,0 +1,286 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#include <stdlib.h>
+#include <stdio.h>
+
+extern "C" {
+
+#include <pbc.h>
+#include <atom.h>
+#include <allocate.h>
+
+#define DELTA 20000
+
+}
+
+__global__ void computeAtomsPbcUpdate(Atom a, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    Atom* atom = &a;
+    if( i >= atom->Nlocal ){
+        return;
+    }
+
+    if (atom_x(i) < 0.0) {
+        atom_x(i) += xprd;
+    } else if (atom_x(i) >= xprd) {
+        atom_x(i) -= xprd;
+    }
+
+    if (atom_y(i) < 0.0) {
+        atom_y(i) += yprd;
+    } else if (atom_y(i) >= yprd) {
+        atom_y(i) -= yprd;
+    }
+
+    if (atom_z(i) < 0.0) {
+        atom_z(i) += zprd;
+    } else if (atom_z(i) >= zprd) {
+        atom_z(i) -= zprd;
+    }
+}
+
+__global__ void computePbcUpdate(Atom a, int* PBCx, int* PBCy, int* PBCz, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int Nghost = a.Nghost;
+    if( i >= Nghost ) {
+        return;
+    }
+    Atom* atom = &a;
+    int *border_map = atom->border_map;
+    int nlocal = atom->Nlocal;
+
+    atom_x(nlocal + i) = atom_x(border_map[i]) + PBCx[i] * xprd;
+    atom_y(nlocal + i) = atom_y(border_map[i]) + PBCy[i] * yprd;
+    atom_z(nlocal + i) = atom_z(border_map[i]) + PBCz[i] * zprd;
+}
+
+extern "C"{
+
+static int NmaxGhost;
+static int *PBCx, *PBCy, *PBCz;
+
+static int c_NmaxGhost = 0;
+static int *c_PBCx = NULL, *c_PBCy = NULL, *c_PBCz = NULL;
+
+static void growPbc(Atom *);
+
+/* exported subroutines */
+void initPbc(Atom *atom) {
+    NmaxGhost = 0;
+    atom->border_map = NULL;
+    PBCx = NULL;
+    PBCy = NULL;
+    PBCz = NULL;
+}
+
+/* update coordinates of ghost atoms */
+/* uses mapping created in setupPbc */
+void updatePbc(Atom *atom, Parameter *param) {
+    int *border_map = atom->border_map;
+    int nlocal = atom->Nlocal;
+    MD_FLOAT xprd = param->xprd;
+    MD_FLOAT yprd = param->yprd;
+    MD_FLOAT zprd = param->zprd;
+
+    for (int i = 0; i < atom->Nghost; i++) {
+        atom_x(nlocal + i) = atom_x(border_map[i]) + PBCx[i] * xprd;
+        atom_y(nlocal + i) = atom_y(border_map[i]) + PBCy[i] * yprd;
+        atom_z(nlocal + i) = atom_z(border_map[i]) + PBCz[i] * zprd;
+    }
+}
+
+/* update coordinates of ghost atoms */
+/* uses mapping created in setupPbc */
+void updatePbc_cuda(Atom *atom, Parameter *param, Atom *c_atom, bool doReneighbor, const int num_threads_per_block) {
+    if (doReneighbor){
+        c_atom->Natoms = atom->Natoms;
+        c_atom->Nlocal = atom->Nlocal;
+        c_atom->Nghost = atom->Nghost;
+        c_atom->ntypes = atom->ntypes;
+
+        if (atom->Nmax > c_atom->Nmax){ // the number of ghost atoms has increased -> more space is needed
+            c_atom->Nmax = atom->Nmax;
+            if(c_atom->x != NULL){ cudaFree(c_atom->x); }
+            if(c_atom->type != NULL){ cudaFree(c_atom->type); }
+            checkCUDAError( "updatePbc c_atom->x malloc", cudaMalloc((void**)&(c_atom->x), sizeof(MD_FLOAT) * atom->Nmax * 3) );
+            checkCUDAError( "updatePbc c_atom->type malloc", cudaMalloc((void**)&(c_atom->type), sizeof(int) * atom->Nmax) );
+        }
+        // TODO if the sort is reactivated the atom->vx needs to be copied to GPU as well
+        checkCUDAError( "updatePbc c_atom->x memcpy", cudaMemcpy(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
+        checkCUDAError( "updatePbc c_atom->type memcpy", cudaMemcpy(c_atom->type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) );
+
+        if(c_NmaxGhost < NmaxGhost){
+            c_NmaxGhost = NmaxGhost;
+            if(c_PBCx != NULL){ cudaFree(c_PBCx); }
+            if(c_PBCy != NULL){ cudaFree(c_PBCy); }
+            if(c_PBCz != NULL){ cudaFree(c_PBCz); }
+            if(c_atom->border_map != NULL){ cudaFree(c_atom->border_map); }
+            checkCUDAError( "updatePbc c_PBCx malloc", cudaMalloc((void**)&c_PBCx, NmaxGhost * sizeof(int)) );
+            checkCUDAError( "updatePbc c_PBCy malloc", cudaMalloc((void**)&c_PBCy, NmaxGhost * sizeof(int)) );
+            checkCUDAError( "updatePbc c_PBCz malloc", cudaMalloc((void**)&c_PBCz, NmaxGhost * sizeof(int)) );
+            checkCUDAError( "updatePbc c_atom->border_map malloc", cudaMalloc((void**)&(c_atom->border_map), NmaxGhost * sizeof(int)) );
+        }
+        checkCUDAError( "updatePbc c_PBCx memcpy", cudaMemcpy(c_PBCx, PBCx, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
+        checkCUDAError( "updatePbc c_PBCy memcpy", cudaMemcpy(c_PBCy, PBCy, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
+        checkCUDAError( "updatePbc c_PBCz memcpy", cudaMemcpy(c_PBCz, PBCz, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
+        checkCUDAError( "updatePbc c_atom->border_map memcpy", cudaMemcpy(c_atom->border_map, atom->border_map, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
+    }
+
+    MD_FLOAT xprd = param->xprd;
+    MD_FLOAT yprd = param->yprd;
+    MD_FLOAT zprd = param->zprd;
+
+    const int num_blocks = ceil((float)atom->Nghost / (float)num_threads_per_block);
+
+    /*__global__ void computePbcUpdate(Atom a, int* PBCx, int* PBCy, int* PBCz,
+     *                                                          MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd)
+     * */
+    computePbcUpdate<<<num_blocks, num_threads_per_block>>>(*c_atom, c_PBCx, c_PBCy, c_PBCz, xprd, yprd, zprd);
+    checkCUDAError( "PeekAtLastError UpdatePbc", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync UpdatePbc", cudaDeviceSynchronize() );
+}
+
+/* relocate atoms that have left domain according
+ * to periodic boundary conditions */
+void updateAtomsPbc(Atom *atom, Parameter *param) {
+    MD_FLOAT xprd = param->xprd;
+    MD_FLOAT yprd = param->yprd;
+    MD_FLOAT zprd = param->zprd;
+
+    for (int i = 0; i < atom->Nlocal; i++) {
+
+        if (atom_x(i) < 0.0) {
+            atom_x(i) += xprd;
+        } else if (atom_x(i) >= xprd) {
+            atom_x(i) -= xprd;
+        }
+
+        if (atom_y(i) < 0.0) {
+            atom_y(i) += yprd;
+        } else if (atom_y(i) >= yprd) {
+            atom_y(i) -= yprd;
+        }
+
+        if (atom_z(i) < 0.0) {
+            atom_z(i) += zprd;
+        } else if (atom_z(i) >= zprd) {
+            atom_z(i) -= zprd;
+        }
+    }
+}
+
+void updateAtomsPbc_cuda(Atom* atom, Parameter* param, Atom* c_atom, const int num_threads_per_block){
+    MD_FLOAT xprd = param->xprd;
+    MD_FLOAT yprd = param->yprd;
+    MD_FLOAT zprd = param->zprd;
+
+    const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
+    /*void computeAtomsPbcUpdate(Atom a, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd)*/
+    computeAtomsPbcUpdate<<<num_blocks, num_threads_per_block>>>(*c_atom, xprd, yprd, zprd);
+
+    checkCUDAError( "PeekAtLastError UpdateAtomsPbc", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync UpdateAtomsPbc", cudaDeviceSynchronize() );
+
+    checkCUDAError( "updateAtomsPbc position memcpy back", cudaMemcpy(atom->x, c_atom->x, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
+}
+
+/* setup periodic boundary conditions by
+ * defining ghost atoms around domain
+ * only creates mapping and coordinate corrections
+ * that are then enforced in updatePbc */
+#define ADDGHOST(dx, dy, dz)                              \
+    Nghost++;                                           \
+    border_map[Nghost] = i;                             \
+    PBCx[Nghost] = dx;                                  \
+    PBCy[Nghost] = dy;                                  \
+    PBCz[Nghost] = dz;                                  \
+    atom->type[atom->Nlocal + Nghost] = atom->type[i]
+
+void setupPbc(Atom *atom, Parameter *param) {
+    int *border_map = atom->border_map;
+    MD_FLOAT xprd = param->xprd;
+    MD_FLOAT yprd = param->yprd;
+    MD_FLOAT zprd = param->zprd;
+    MD_FLOAT Cutneigh = param->cutneigh;
+    int Nghost = -1;
+
+    for (int i = 0; i < atom->Nlocal; i++) {
+
+        if (atom->Nlocal + Nghost + 7 >= atom->Nmax) {
+            growAtom(atom);
+        }
+        if (Nghost + 7 >= NmaxGhost) {
+            growPbc(atom);
+            border_map = atom->border_map;
+        }
+
+        MD_FLOAT x = atom_x(i);
+        MD_FLOAT y = atom_y(i);
+        MD_FLOAT z = atom_z(i);
+
+        /* Setup ghost atoms */
+        /* 6 planes */
+        if (x < Cutneigh) { ADDGHOST(+1, 0, 0); }
+        if (x >= (xprd - Cutneigh)) { ADDGHOST(-1, 0, 0); }
+        if (y < Cutneigh) { ADDGHOST(0, +1, 0); }
+        if (y >= (yprd - Cutneigh)) { ADDGHOST(0, -1, 0); }
+        if (z < Cutneigh) { ADDGHOST(0, 0, +1); }
+        if (z >= (zprd - Cutneigh)) { ADDGHOST(0, 0, -1); }
+        /* 8 corners */
+        if (x < Cutneigh && y < Cutneigh && z < Cutneigh) { ADDGHOST(+1, +1, +1); }
+        if (x < Cutneigh && y >= (yprd - Cutneigh) && z < Cutneigh) { ADDGHOST(+1, -1, +1); }
+        if (x < Cutneigh && y >= Cutneigh && z >= (zprd - Cutneigh)) { ADDGHOST(+1, +1, -1); }
+        if (x < Cutneigh && y >= (yprd - Cutneigh) && z >= (zprd - Cutneigh)) { ADDGHOST(+1, -1, -1); }
+        if (x >= (xprd - Cutneigh) && y < Cutneigh && z < Cutneigh) { ADDGHOST(-1, +1, +1); }
+        if (x >= (xprd - Cutneigh) && y >= (yprd - Cutneigh) && z < Cutneigh) { ADDGHOST(-1, -1, +1); }
+        if (x >= (xprd - Cutneigh) && y < Cutneigh && z >= (zprd - Cutneigh)) { ADDGHOST(-1, +1, -1); }
+        if (x >= (xprd - Cutneigh) && y >= (yprd - Cutneigh) && z >= (zprd - Cutneigh)) { ADDGHOST(-1, -1, -1); }
+        /* 12 edges */
+        if (x < Cutneigh && z < Cutneigh) { ADDGHOST(+1, 0, +1); }
+        if (x < Cutneigh && z >= (zprd - Cutneigh)) { ADDGHOST(+1, 0, -1); }
+        if (x >= (xprd - Cutneigh) && z < Cutneigh) { ADDGHOST(-1, 0, +1); }
+        if (x >= (xprd - Cutneigh) && z >= (zprd - Cutneigh)) { ADDGHOST(-1, 0, -1); }
+        if (y < Cutneigh && z < Cutneigh) { ADDGHOST(0, +1, +1); }
+        if (y < Cutneigh && z >= (zprd - Cutneigh)) { ADDGHOST(0, +1, -1); }
+        if (y >= (yprd - Cutneigh) && z < Cutneigh) { ADDGHOST(0, -1, +1); }
+        if (y >= (yprd - Cutneigh) && z >= (zprd - Cutneigh)) { ADDGHOST(0, -1, -1); }
+        if (y < Cutneigh && x < Cutneigh) { ADDGHOST(+1, +1, 0); }
+        if (y < Cutneigh && x >= (xprd - Cutneigh)) { ADDGHOST(-1, +1, 0); }
+        if (y >= (yprd - Cutneigh) && x < Cutneigh) { ADDGHOST(+1, -1, 0); }
+        if (y >= (yprd - Cutneigh) && x >= (xprd - Cutneigh)) { ADDGHOST(-1, -1, 0); }
+    }
+    // increase by one to make it the ghost atom count
+    atom->Nghost = Nghost + 1;
+}
+
+/* internal subroutines */
+void growPbc(Atom *atom) {
+    int nold = NmaxGhost;
+    NmaxGhost += DELTA;
+
+    atom->border_map = (int *) reallocate(atom->border_map, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
+    PBCx = (int *) reallocate(PBCx, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
+    PBCy = (int *) reallocate(PBCy, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
+    PBCz = (int *) reallocate(PBCz, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
+}
+}
@@ -71,12 +71,9 @@ void setupThermo(Parameter *param, int natoms)
 void computeThermo(int iflag, Parameter *param, Atom *atom)
 {
    MD_FLOAT t = 0.0, p;
-    MD_FLOAT* vx = atom->vx;
-    MD_FLOAT* vy = atom->vy;
-    MD_FLOAT* vz = atom->vz;

    for(int i = 0; i < atom->Nlocal; i++) {
-        t += (vx[i] * vx[i] + vy[i] * vy[i] + vz[i] * vz[i]) * param->mass;
+        t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass;
    }

    t = t * t_scale;
@@ -101,12 +98,11 @@ void adjustThermo(Parameter *param, Atom *atom)
 {
    /* zero center-of-mass motion */
    MD_FLOAT vxtot = 0.0; MD_FLOAT vytot = 0.0; MD_FLOAT vztot = 0.0;
-    MD_FLOAT* vx = atom->vx; MD_FLOAT* vy = atom->vy; MD_FLOAT* vz = atom->vz;

    for(int i = 0; i < atom->Nlocal; i++) {
-        vxtot += vx[i];
-        vytot += vy[i];
-        vztot += vz[i];
+        vxtot += atom_vx(i);
+        vytot += atom_vy(i);
+        vztot += atom_vz(i);
    }

    vxtot = vxtot / atom->Natoms;
@@ -114,24 +110,24 @@ void adjustThermo(Parameter *param, Atom *atom)
    vztot = vztot / atom->Natoms;

    for(int i = 0; i < atom->Nlocal; i++) {
-        vx[i] -= vxtot;
-        vy[i] -= vytot;
-        vz[i] -= vztot;
+        atom_vx(i) -= vxtot;
+        atom_vy(i) -= vytot;
+        atom_vz(i) -= vztot;
    }

    t_act = 0;
    MD_FLOAT t = 0.0;

    for(int i = 0; i < atom->Nlocal; i++) {
-        t += (vx[i] * vx[i] + vy[i] * vy[i] + vz[i] * vz[i]) * param->mass;
+        t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass;
    }

    t *= t_scale;
    MD_FLOAT factor = sqrt(param->temp / t);

    for(int i = 0; i < atom->Nlocal; i++) {
-        vx[i] *= factor;
-        vy[i] *= factor;
-        vz[i] *= factor;
+        atom_vx(i) *= factor;
+        atom_vy(i) *= factor;
+        atom_vz(i) *= factor;
    }
 }
@@ -1,106 +0,0 @@
-/*
- * =======================================================================================
- *
- *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
- *   Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
- *
- *   This file is part of MD-Bench.
- *
- *   MD-Bench is free software: you can redistribute it and/or modify it
- *   under the terms of the GNU Lesser General Public License as published
- *   by the Free Software Foundation, either version 3 of the License, or
- *   (at your option) any later version.
- *
- *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
- *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
- *   details.
- *
- *   You should have received a copy of the GNU Lesser General Public License along
- *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
- * =======================================================================================
- */
-#include <likwid-marker.h>
-
-#include <timing.h>
-#include <neighbor.h>
-#include <parameter.h>
-#include <atom.h>
-
-double computeForce(
-        Parameter *param,
-        Atom *atom,
-        Neighbor *neighbor
-        )
-{
-    int Nlocal = atom->Nlocal;
-    int* neighs;
-    MD_FLOAT* fx = atom->fx;
-    MD_FLOAT* fy = atom->fy;
-    MD_FLOAT* fz = atom->fz;
-#ifndef EXPLICIT_TYPES
-    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
-    MD_FLOAT sigma6 = param->sigma6;
-    MD_FLOAT epsilon = param->epsilon;
-#endif
-
-    for(int i = 0; i < Nlocal; i++) {
-        fx[i] = 0.0;
-        fy[i] = 0.0;
-        fz[i] = 0.0;
-    }
-
-    double S = getTimeStamp();
-    LIKWID_MARKER_START("force");
-
-#pragma omp parallel for
-    for(int i = 0; i < Nlocal; i++) {
-        neighs = &neighbor->neighbors[i * neighbor->maxneighs];
-        int numneighs = neighbor->numneigh[i];
-        MD_FLOAT xtmp = atom_x(i);
-        MD_FLOAT ytmp = atom_y(i);
-        MD_FLOAT ztmp = atom_z(i);
-        MD_FLOAT fix = 0;
-        MD_FLOAT fiy = 0;
-        MD_FLOAT fiz = 0;
-
-#ifdef EXPLICIT_TYPES
-        const int type_i = atom->type[i];
-#endif
-
-
-        for(int k = 0; k < numneighs; k++) {
-            int j = neighs[k];
-            MD_FLOAT delx = xtmp - atom_x(j);
-            MD_FLOAT dely = ytmp - atom_y(j);
-            MD_FLOAT delz = ztmp - atom_z(j);
-            MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
-
-#ifdef EXPLICIT_TYPES
-            const int type_j = atom->type[j];
-            const int type_ij = type_i * atom->ntypes + type_j;
-            const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
-            const MD_FLOAT sigma6 = atom->sigma6[type_ij];
-            const MD_FLOAT epsilon = atom->epsilon[type_ij];
-#endif
-
-            if(rsq < cutforcesq) {
-                MD_FLOAT sr2 = 1.0 / rsq;
-                MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
-                MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
-                fix += delx * force;
-                fiy += dely * force;
-                fiz += delz * force;
-            }
-        }
-
-        fx[i] += fix;
-        fy[i] += fiy;
-        fz[i] += fiz;
-    }
-
-    LIKWID_MARKER_STOP("force");
-    double E = getTimeStamp();
-
-    return E-S;
-}
@@ -1,396 +0,0 @@
-/*
- * =======================================================================================
- *
- *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
- *   Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
- *
- *   This file is part of MD-Bench.
- *
- *   MD-Bench is free software: you can redistribute it and/or modify it
- *   under the terms of the GNU Lesser General Public License as published
- *   by the Free Software Foundation, either version 3 of the License, or
- *   (at your option) any later version.
- *
- *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
- *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
- *   details.
- *
- *   You should have received a copy of the GNU Lesser General Public License along
- *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
- * =======================================================================================
- */
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-
-#include <neighbor.h>
-#include <parameter.h>
-#include <atom.h>
-
-#define SMALL 1.0e-6
-#define FACTOR 0.999
-
-static MD_FLOAT xprd, yprd, zprd;
-static MD_FLOAT bininvx, bininvy, bininvz;
-static int mbinxlo, mbinylo, mbinzlo;
-static int nbinx, nbiny, nbinz;
-static int mbinx, mbiny, mbinz; // n bins in x, y, z
-static int *bincount;
-static int *bins;
-static int mbins; //total number of bins
-static int atoms_per_bin;  // max atoms per bin
-static MD_FLOAT cutneigh;
-static MD_FLOAT cutneighsq;  // neighbor cutoff squared
-static int nmax;
-static int nstencil;      // # of bins in stencil
-static int* stencil;      // stencil list of bin offsets
-static MD_FLOAT binsizex, binsizey, binsizez;
-
-static int coord2bin(MD_FLOAT, MD_FLOAT , MD_FLOAT);
-static MD_FLOAT bindist(int, int, int);
-
-/* exported subroutines */
-void initNeighbor(Neighbor *neighbor, Parameter *param)
-{
-    MD_FLOAT neighscale = 5.0 / 6.0;
-    xprd = param->nx * param->lattice;
-    yprd = param->ny * param->lattice;
-    zprd = param->nz * param->lattice;
-    cutneigh = param->cutneigh;
-    nbinx = neighscale * param->nx;
-    nbiny = neighscale * param->ny;
-    nbinz = neighscale * param->nz;
-    nmax = 0;
-    atoms_per_bin = 8;
-    stencil = NULL;
-    bins = NULL;
-    bincount = NULL;
-    neighbor->maxneighs = 100;
-    neighbor->numneigh = NULL;
-    neighbor->neighbors = NULL;
-}
-
-void setupNeighbor()
-{
-    MD_FLOAT coord;
-    int mbinxhi, mbinyhi, mbinzhi;
-    int nextx, nexty, nextz;
-    MD_FLOAT xlo = 0.0; MD_FLOAT xhi = xprd;
-    MD_FLOAT ylo = 0.0; MD_FLOAT yhi = yprd;
-    MD_FLOAT zlo = 0.0; MD_FLOAT zhi = zprd;
-
-    cutneighsq = cutneigh * cutneigh;
-    binsizex = xprd / nbinx;
-    binsizey = yprd / nbiny;
-    binsizez = zprd / nbinz;
-    bininvx = 1.0 / binsizex;
-    bininvy = 1.0 / binsizey;
-    bininvz = 1.0 / binsizez;
-
-    coord = xlo - cutneigh - SMALL * xprd;
-    mbinxlo = (int) (coord * bininvx);
-    if (coord < 0.0) {
-        mbinxlo = mbinxlo - 1;
-    }
-    coord = xhi + cutneigh + SMALL * xprd;
-    mbinxhi = (int) (coord * bininvx);
-
-    coord = ylo - cutneigh - SMALL * yprd;
-    mbinylo = (int) (coord * bininvy);
-    if (coord < 0.0) {
-        mbinylo = mbinylo - 1;
-    }
-    coord = yhi + cutneigh + SMALL * yprd;
-    mbinyhi = (int) (coord * bininvy);
-
-    coord = zlo - cutneigh - SMALL * zprd;
-    mbinzlo = (int) (coord * bininvz);
-    if (coord < 0.0) {
-        mbinzlo = mbinzlo - 1;
-    }
-    coord = zhi + cutneigh + SMALL * zprd;
-    mbinzhi = (int) (coord * bininvz);
-
-    mbinxlo = mbinxlo - 1;
-    mbinxhi = mbinxhi + 1;
-    mbinx = mbinxhi - mbinxlo + 1;
-
-    mbinylo = mbinylo - 1;
-    mbinyhi = mbinyhi + 1;
-    mbiny = mbinyhi - mbinylo + 1;
-
-    mbinzlo = mbinzlo - 1;
-    mbinzhi = mbinzhi + 1;
-    mbinz = mbinzhi - mbinzlo + 1;
-
-    nextx = (int) (cutneigh * bininvx);
-    if(nextx * binsizex < FACTOR * cutneigh) nextx++;
-
-    nexty = (int) (cutneigh * bininvy);
-    if(nexty * binsizey < FACTOR * cutneigh) nexty++;
-
-    nextz = (int) (cutneigh * bininvz);
-    if(nextz * binsizez < FACTOR * cutneigh) nextz++;
-
-    if (stencil) {
-        free(stencil);
-    }
-
-    stencil = (int*) malloc(
-            (2 * nextz + 1) * (2 * nexty + 1) * (2 * nextx + 1) * sizeof(int));
-
-    nstencil = 0;
-    int kstart = -nextz;
-
-    for(int k = kstart; k <= nextz; k++) {
-        for(int j = -nexty; j <= nexty; j++) {
-            for(int i = -nextx; i <= nextx; i++) {
-                if(bindist(i, j, k) < cutneighsq) {
-                    stencil[nstencil++] =
-                        k * mbiny * mbinx + j * mbinx + i;
-                }
-            }
-        }
-    }
-
-    mbins = mbinx * mbiny * mbinz;
-
-    if (bincount) {
-        free(bincount);
-    }
-    bincount = (int*) malloc(mbins * sizeof(int));
-
-    if (bins) {
-        free(bins);
-    }
-    bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));
-}
-
-void buildNeighbor(Atom *atom, Neighbor *neighbor)
-{
-    int nall = atom->Nlocal + atom->Nghost;
-
-    /* extend atom arrays if necessary */
-    if(nall > nmax) {
-        nmax = nall;
-        if(neighbor->numneigh) free(neighbor->numneigh);
-        if(neighbor->neighbors) free(neighbor->neighbors);
-        neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
-        neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*));
-    }
-
-    /* bin local & ghost atoms */
-    binatoms(atom);
-    int resize = 1;
-
-    /* loop over each atom, storing neighbors */
-    while(resize) {
-        int new_maxneighs = neighbor->maxneighs;
-        resize = 0;
-
-        for(int i = 0; i < atom->Nlocal; i++) {
-            int* neighptr = &(neighbor->neighbors[i * neighbor->maxneighs]);
-            int n = 0;
-            MD_FLOAT xtmp = atom_x(i);
-            MD_FLOAT ytmp = atom_y(i);
-            MD_FLOAT ztmp = atom_z(i);
-            int ibin = coord2bin(xtmp, ytmp, ztmp);
-            #ifdef EXPLICIT_TYPES
-            int type_i = atom->type[i];
-            #endif
-            for(int k = 0; k < nstencil; k++) {
-                int jbin = ibin + stencil[k];
-                int* loc_bin = &bins[jbin * atoms_per_bin];
-
-                for(int m = 0; m < bincount[jbin]; m++) {
-                    int j = loc_bin[m];
-
-                    if ( j == i ){
-                        continue;
-                    }
-
-                    MD_FLOAT delx = xtmp - atom_x(j);
-                    MD_FLOAT dely = ytmp - atom_y(j);
-                    MD_FLOAT delz = ztmp - atom_z(j);
-                    MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
-
-                    #ifdef EXPLICIT_TYPES
-                    int type_j = atom->type[j];
-                    const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j];
-                    #else
-                    const MD_FLOAT cutoff = cutneighsq;
-                    #endif
-                    if( rsq <= cutoff ) {
-                        neighptr[n++] = j;
-                    }
-                }
-            }
-
-            neighbor->numneigh[i] = n;
-
-            if(n >= neighbor->maxneighs) {
-                resize = 1;
-
-                if(n >= new_maxneighs) {
-                    new_maxneighs = n;
-                }
-            }
-        }
-
-        if(resize) {
-            printf("RESIZE %d\n", neighbor->maxneighs);
-            neighbor->maxneighs = new_maxneighs * 1.2;
-            free(neighbor->neighbors);
-            neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
-        }
-    }
-}
-
-/* internal subroutines */
-MD_FLOAT bindist(int i, int j, int k)
-{
-    MD_FLOAT delx, dely, delz;
-
-    if(i > 0) {
-        delx = (i - 1) * binsizex;
-    } else if(i == 0) {
-        delx = 0.0;
-    } else {
-        delx = (i + 1) * binsizex;
-    }
-
-    if(j > 0) {
-        dely = (j - 1) * binsizey;
-    } else if(j == 0) {
-        dely = 0.0;
-    } else {
-        dely = (j + 1) * binsizey;
-    }
-
-    if(k > 0) {
-        delz = (k - 1) * binsizez;
-    } else if(k == 0) {
-        delz = 0.0;
-    } else {
-        delz = (k + 1) * binsizez;
-    }
-
-    return (delx * delx + dely * dely + delz * delz);
-}
-
-int coord2bin(MD_FLOAT xin, MD_FLOAT yin, MD_FLOAT zin)
-{
-    int ix, iy, iz;
-
-    if(xin >= xprd) {
-        ix = (int)((xin - xprd) * bininvx) + nbinx - mbinxlo;
-    } else if(xin >= 0.0) {
-        ix = (int)(xin * bininvx) - mbinxlo;
-    } else {
-        ix = (int)(xin * bininvx) - mbinxlo - 1;
-    }
-
-    if(yin >= yprd) {
-        iy = (int)((yin - yprd) * bininvy) + nbiny - mbinylo;
-    } else if(yin >= 0.0) {
-        iy = (int)(yin * bininvy) - mbinylo;
-    } else {
-        iy = (int)(yin * bininvy) - mbinylo - 1;
-    }
-
-    if(zin >= zprd) {
-        iz = (int)((zin - zprd) * bininvz) + nbinz - mbinzlo;
-    } else if(zin >= 0.0) {
-        iz = (int)(zin * bininvz) - mbinzlo;
-    } else {
-        iz = (int)(zin * bininvz) - mbinzlo - 1;
-    }
-
-    return (iz * mbiny * mbinx + iy * mbinx + ix + 1);
-}
-
-void binatoms(Atom *atom)
-{
-    int nall = atom->Nlocal + atom->Nghost;
-    int resize = 1;
-
-    while(resize > 0) {
-        resize = 0;
-
-        for(int i = 0; i < mbins; i++) {
-            bincount[i] = 0;
-        }
-
-        for(int i = 0; i < nall; i++) {
-            int ibin = coord2bin(atom_x(i), atom_y(i), atom_z(i));
-
-            if(bincount[ibin] < atoms_per_bin) {
-                int ac = bincount[ibin]++;
-                bins[ibin * atoms_per_bin + ac] = i;
-            } else {
-                resize = 1;
-            }
-        }
-
-        if(resize) {
-            free(bins);
-            atoms_per_bin *= 2;
-            bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));
-        }
-    }
-}
-
-void sortAtom(Atom* atom) {
-    binatoms(atom);
-    int Nmax = atom->Nmax;
-    int* binpos = bincount;
-
-    for(int i=1; i<mbins; i++) {
-        binpos[i] += binpos[i-1];
-    }
-
-#ifdef AOS
-    double* new_x = (double*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
-#else
-    double* new_x = (double*) malloc(Nmax * sizeof(MD_FLOAT));
-    double* new_y = (double*) malloc(Nmax * sizeof(MD_FLOAT));
-    double* new_z = (double*) malloc(Nmax * sizeof(MD_FLOAT));
-#endif
-    double* new_vx = (double*) malloc(Nmax * sizeof(MD_FLOAT));
-    double* new_vy = (double*) malloc(Nmax * sizeof(MD_FLOAT));
-    double* new_vz = (double*) malloc(Nmax * sizeof(MD_FLOAT));
-    double* old_x = atom->x; double* old_y = atom->y; double* old_z = atom->z;
-    double* old_vx = atom->vx; double* old_vy = atom->vy; double* old_vz = atom->vz;
-
-    for(int mybin = 0; mybin<mbins; mybin++) {
-        int start = mybin>0?binpos[mybin-1]:0;
-        int count = binpos[mybin] - start;
-        for(int k=0; k<count; k++) {
-            int new_i = start + k;
-            int old_i = bins[mybin * atoms_per_bin + k];
-#ifdef AOS
-            new_x[new_i * 3 + 0] = old_x[old_i * 3 + 0];
-            new_x[new_i * 3 + 1] = old_x[old_i * 3 + 1];
-            new_x[new_i * 3 + 2] = old_x[old_i * 3 + 2];
-#else
-            new_x[new_i] = old_x[old_i];
-            new_y[new_i] = old_y[old_i];
-            new_z[new_i] = old_z[old_i];
-#endif
-            new_vx[new_i] = old_vx[old_i];
-            new_vy[new_i] = old_vy[old_i];
-            new_vz[new_i] = old_vz[old_i];
-        }
-    }
-
-    free(atom->x);
-    atom->x = new_x;
-#ifndef AOS
-    free(atom->y);
-    free(atom->z);
-    atom->y = new_y; atom->z = new_z;
-#endif
-    free(atom->vx); free(atom->vy); free(atom->vz);
-    atom->vx = new_vx; atom->vy = new_vy; atom->vz = new_vz;
-}
@@ -1,172 +0,0 @@
-/*
- * =======================================================================================
- *
- *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
- *   Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
- *
- *   This file is part of MD-Bench.
- *
- *   MD-Bench is free software: you can redistribute it and/or modify it
- *   under the terms of the GNU Lesser General Public License as published
- *   by the Free Software Foundation, either version 3 of the License, or
- *   (at your option) any later version.
- *
- *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
- *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
- *   details.
- *
- *   You should have received a copy of the GNU Lesser General Public License along
- *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
- * =======================================================================================
- */
-#include <stdlib.h>
-#include <stdio.h>
-
-#include <pbc.h>
-#include <atom.h>
-#include <allocate.h>
-
-#define DELTA 20000
-
-static int NmaxGhost;
-static int *PBCx, *PBCy, *PBCz;
-
-static void growPbc(Atom*);
-
-/* exported subroutines */
-void initPbc(Atom* atom)
-{
-    NmaxGhost = 0;
-    atom->border_map = NULL;
-    PBCx = NULL; PBCy = NULL; PBCz = NULL;
-}
-
-/* update coordinates of ghost atoms */
-/* uses mapping created in setupPbc */
-void updatePbc(Atom *atom, Parameter *param)
-{
-    int *border_map = atom->border_map;
-    int nlocal = atom->Nlocal;
-    MD_FLOAT xprd = param->xprd;
-    MD_FLOAT yprd = param->yprd;
-    MD_FLOAT zprd = param->zprd;
-
-    for(int i = 0; i < atom->Nghost; i++) {
-        atom_x(nlocal + i) = atom_x(border_map[i]) + PBCx[i] * xprd;
-        atom_y(nlocal + i) = atom_y(border_map[i]) + PBCy[i] * yprd;
-        atom_z(nlocal + i) = atom_z(border_map[i]) + PBCz[i] * zprd;
-    }
-}
-
-/* relocate atoms that have left domain according
- * to periodic boundary conditions */
-void updateAtomsPbc(Atom *atom, Parameter *param)
-{
-    MD_FLOAT xprd = param->xprd;
-    MD_FLOAT yprd = param->yprd;
-    MD_FLOAT zprd = param->zprd;
-
-    for(int i = 0; i < atom->Nlocal; i++) {
-
-        if(atom_x(i) < 0.0) {
-            atom_x(i) += xprd;
-        } else if(atom_x(i) >= xprd) {
-            atom_x(i) -= xprd;
-        }
-
-        if(atom_y(i) < 0.0) {
-            atom_y(i) += yprd;
-        } else if(atom_y(i) >= yprd) {
-            atom_y(i) -= yprd;
-        }
-
-        if(atom_z(i) < 0.0) {
-            atom_z(i) += zprd;
-        } else if(atom_z(i) >= zprd) {
-            atom_z(i) -= zprd;
-        }
-    }
-}
-
-/* setup periodic boundary conditions by
- * defining ghost atoms around domain
- * only creates mapping and coordinate corrections
- * that are then enforced in updatePbc */
-#define ADDGHOST(dx,dy,dz)                              \
-    Nghost++;                                           \
-    border_map[Nghost] = i;                             \
-    PBCx[Nghost] = dx;                                  \
-    PBCy[Nghost] = dy;                                  \
-    PBCz[Nghost] = dz;                                  \
-    atom->type[atom->Nlocal + Nghost] = atom->type[i]
-
-void setupPbc(Atom *atom, Parameter *param)
-{
-    int *border_map = atom->border_map;
-    MD_FLOAT xprd = param->xprd;
-    MD_FLOAT yprd = param->yprd;
-    MD_FLOAT zprd = param->zprd;
-    MD_FLOAT Cutneigh = param->cutneigh;
-    int Nghost = -1;
-
-    for(int i = 0; i < atom->Nlocal; i++) {
-
-        if (atom->Nlocal + Nghost + 7 >= atom->Nmax) {
-            growAtom(atom);
-        }
-        if (Nghost + 7 >= NmaxGhost) {
-            growPbc(atom);
-            border_map = atom->border_map;
-        }
-
-        MD_FLOAT x = atom_x(i);
-        MD_FLOAT y = atom_y(i);
-        MD_FLOAT z = atom_z(i);
-
-        /* Setup ghost atoms */
-        /* 6 planes */
-        if (x < Cutneigh)         { ADDGHOST(+1,0,0); }
-        if (x >= (xprd-Cutneigh)) { ADDGHOST(-1,0,0); }
-        if (y < Cutneigh)         { ADDGHOST(0,+1,0); }
-        if (y >= (yprd-Cutneigh)) { ADDGHOST(0,-1,0); }
-        if (z < Cutneigh)         { ADDGHOST(0,0,+1); }
-        if (z >= (zprd-Cutneigh)) { ADDGHOST(0,0,-1); }
-        /* 8 corners */
-        if (x < Cutneigh         && y < Cutneigh         && z < Cutneigh)         { ADDGHOST(+1,+1,+1); }
-        if (x < Cutneigh         && y >= (yprd-Cutneigh) && z < Cutneigh)         { ADDGHOST(+1,-1,+1); }
-        if (x < Cutneigh         && y >= Cutneigh        && z >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); }
-        if (x < Cutneigh         && y >= (yprd-Cutneigh) && z >= (zprd-Cutneigh)) { ADDGHOST(+1,-1,-1); }
-        if (x >= (xprd-Cutneigh) && y < Cutneigh         && z < Cutneigh)         { ADDGHOST(-1,+1,+1); }
-        if (x >= (xprd-Cutneigh) && y >= (yprd-Cutneigh) && z < Cutneigh)         { ADDGHOST(-1,-1,+1); }
-        if (x >= (xprd-Cutneigh) && y < Cutneigh         && z >= (zprd-Cutneigh)) { ADDGHOST(-1,+1,-1); }
-        if (x >= (xprd-Cutneigh) && y >= (yprd-Cutneigh) && z >= (zprd-Cutneigh)) { ADDGHOST(-1,-1,-1); }
-        /* 12 edges */
-        if (x < Cutneigh         && z < Cutneigh)         { ADDGHOST(+1,0,+1); }
-        if (x < Cutneigh         && z >= (zprd-Cutneigh)) { ADDGHOST(+1,0,-1); }
-        if (x >= (xprd-Cutneigh) && z < Cutneigh)         { ADDGHOST(-1,0,+1); }
-        if (x >= (xprd-Cutneigh) && z >= (zprd-Cutneigh)) { ADDGHOST(-1,0,-1); }
-        if (y < Cutneigh         && z < Cutneigh)         { ADDGHOST(0,+1,+1); }
-        if (y < Cutneigh         && z >= (zprd-Cutneigh)) { ADDGHOST(0,+1,-1); }
-        if (y >= (yprd-Cutneigh) && z < Cutneigh)         { ADDGHOST(0,-1,+1); }
-        if (y >= (yprd-Cutneigh) && z >= (zprd-Cutneigh)) { ADDGHOST(0,-1,-1); }
-        if (y < Cutneigh         && x < Cutneigh)         { ADDGHOST(+1,+1,0); }
-        if (y < Cutneigh         && x >= (xprd-Cutneigh)) { ADDGHOST(-1,+1,0); }
-        if (y >= (yprd-Cutneigh) && x < Cutneigh)         { ADDGHOST(+1,-1,0); }
-        if (y >= (yprd-Cutneigh) && x >= (xprd-Cutneigh)) { ADDGHOST(-1,-1,0); }
-    }
-    // increase by one to make it the ghost atom count
-    atom->Nghost = Nghost + 1;
-}
-
-/* internal subroutines */
-void growPbc(Atom* atom)
-{
-    int nold = NmaxGhost;
-    NmaxGhost += DELTA;
-
-    atom->border_map = (int*) reallocate(atom->border_map, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
-    PBCx = (int*) reallocate(PBCx, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
-    PBCy = (int*) reallocate(PBCy, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
-    PBCz = (int*) reallocate(PBCz, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
-}
Author	SHA1	Message	Date
Rafael Ravedutti	bc7b523979	Move src directory to lammps Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2022-08-04 17:25:31 +02:00
Rafael Ravedutti	eeba125a52	Remove likwid and architecture-specific compilation flags Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2022-08-04 17:09:17 +02:00
Martin Bauernfeind	b32254b03f	Changed data types in currently unused sort method to also work with single precision floating numbers	2022-07-22 13:55:27 +02:00
Martin Bauernfeind	4dac820784	Added newline in output to improve formatting	2022-07-20 23:04:22 +02:00
Martin Bauernfeind	fe56c50efd	Added one more output line to output the force kernel throughput	2022-07-20 22:43:57 +02:00
Martin Bauernfeind	7a61cbbabf	Instrumented the reneighbor function in order to obtain runtimes of its compontents	2022-07-19 20:38:11 +02:00
Martin Bauernfeind	176de0525b	Instrumented the reneighbor function with timers (via getTimestamp()) to measure the runtime of its different components/methods	2022-07-17 18:34:17 +02:00
Martin Bauernfeind	7bad7e84b6	Fixed compiler errors	2022-07-13 14:52:37 +02:00
Martin Bauernfeind	fb304f240b	Small changes in buildNeighbor to initialize the bincount list and other arrays only once	2022-07-13 14:42:34 +02:00
Martin Bauernfeind	5a6d1851ed	Ported updateAtomsPbc to cuda and changed the code to use the cuda version from now on	2022-07-13 14:07:19 +02:00
Martin Bauernfeind	f61f59ba3f	Fixed a compiler error and removed an unnecessary memcpy (from device to host) - performance seems to have crossed the 300M updates/second mark for the A100	2022-07-11 00:55:42 +02:00
Martin Bauernfeind	d1c2249b55	Added code to sort the contents of all bins to make it comparable to the CPU version	2022-07-11 00:24:48 +02:00
Martin Bauernfeind	c9db6e45fa	Fixed compiler errors	2022-07-10 21:13:37 +02:00
Martin Bauernfeind	0967e8f671	The program now does the binning on the GPU via the binatoms_cuda method	2022-07-10 18:05:06 +02:00
Martin Bauernfeind	fa409c016c	Added a struct to contain binning information such as the pointer to bincount and bins - not used yet	2022-07-08 13:52:45 +02:00
Martin Bauernfeind	b65199308d	Ported the binatoms method to cuda - not used in the program yet	2022-07-06 01:09:11 +02:00
Martin Bauernfeind	71798f5ec5	🐛 Fixed aforementioned correctness issue by deleting a superflous cudaMemcpy in computeForce() that was overwriting correct data with incorrect data	2022-07-05 00:54:11 +02:00
Martin Bauernfeind	4f0403d3ea	Fixed an correctness issue by conservatively copying over data from and to the GPU	2022-07-05 00:33:12 +02:00
Martin Bauernfeind	fa86e44f90	Fixed wrong number of threadblock being launched	2022-07-04 19:36:09 +02:00
Martin Bauernfeind	7e8fd96fa4	Fixed some compiler errors - the simulation seems to be off regarding how many ghost atoms are used -> some bugfixing might be needed	2022-07-03 21:14:33 +02:00
Martin Bauernfeind	463de5b1ed	Ported the updatePbc method to cuda	2022-07-03 19:53:33 +02:00
Martin Bauernfeind	4a32a62a98	🐛 Fixed some bugs - ✨ neighborhood computation now seems to be quite fast	2022-06-26 20:19:59 +02:00
Martin Bauernfeind	16e8b76012	Added debug output to find memory leak	2022-06-26 19:43:10 +02:00
Martin Bauernfeind	60ed524dd8	Fixed various compiler errors - now there's probably a memory leak remaining	2022-06-26 18:37:09 +02:00
Martin Bauernfeind	45f83c7607	Fixed some struct declaration mistakes	2022-06-26 17:52:09 +02:00
Martin Bauernfeind	c49278cb21	First crude attempt at parallelizing neighborhood computation (only the part after binning the atoms is parallelized with cuda)	2022-06-26 16:25:59 +02:00
Martin Bauernfeind	757d4329f3	Added a rough sketch for the next steps of porting neighborhood computation to cuda	2022-06-23 23:58:15 +02:00
Martin Bauernfeind	67f9c769ef	Fixing errors - hopefully it works this time	2022-06-23 22:25:55 +02:00
Martin Bauernfeind	b5b4d23c0c	🐛 further refactoring fixing	2022-06-23 19:46:29 +02:00
Martin Bauernfeind	fea1e41daa	🐛 further refactoring step fixing	2022-06-23 19:43:36 +02:00
Martin Bauernfeind	f1998b7acc	🐛 further refactor step fixing	2022-06-23 19:39:36 +02:00
Martin Bauernfeind	2fe3cd80a0	🐛 further refactor step fixing	2022-06-23 19:36:59 +02:00
Martin Bauernfeind	f4313f64e5	♻️ further refactoring step fixing	2022-06-23 19:34:16 +02:00
Martin Bauernfeind	7f068a6959	♻️ Fixing refactoring step	2022-06-23 19:32:09 +02:00
Martin Bauernfeind	62cfc22856	♻️ Refactoring: pulled definition of the GPU atom and neighbor representation from force.cu and put it into main	2022-06-23 18:54:56 +02:00
Maximilian Gaul	b024adaf5b	Re-measure for 2000 time steps	2022-02-05 14:13:36 +01:00
Maximilian Gaul	696e6da01d	Implement Neighbour list AoS memory layout + performance measurement	2022-01-31 20:27:59 +01:00
Maximilian Gaul	b2a6574426	Remove unnecessary atom force backcopy in computeForce	2022-01-24 18:09:27 +01:00
Maximilian Gaul	c4080e866e	Make integrate kernels aware of neighbour list update	2022-01-24 18:04:50 +01:00
Maximilian Gaul	7b592b5fc7	Moved presentation resources to second presentation	2022-01-05 12:48:37 +01:00
Maximilian Gaul	4690542db5	Added CPU metrics {Cache, FLOPS, L2, L3}, restructured resource folders	2022-01-05 12:31:47 +01:00
Maximilian Gaul	8c131a7699	Reminder for likwid perf measurements	2022-01-04 13:51:53 +01:00
Maximilian Gaul	dc4d5f1a9c	Porting atom velocity memory layout to AoS, porting velocity integration to CUDA, adding measurements + logbook update	2022-01-01 18:18:12 +01:00
Maximilian Gaul	50007216ed	Implemented atom force AoS memory layout, added performance measurements + logbook Update	2022-01-01 16:09:21 +01:00
Maximilian Gaul	72e4599acc	Copy neighbour lists only when reneighbouring happens, added measurements + logbook update	2022-01-01 12:56:42 +01:00
Maximilian Gaul	8fa03733e9	Copy parameters & cutforces threshold only once at the start + measurements	2021-12-28 16:48:26 +01:00
Maximilian Gaul	bf1ae3d013	Removed debug prints, only zero atom forces and not copy them, added measurements	2021-12-28 16:32:54 +01:00
Maximilian Gaul	8009b54113	Trying to debug segfault if cudaMemcpy is limited to neighbour list update	2021-12-25 15:36:08 +01:00
Maximilian Gaul	0ea0587442	Only malloc once at the beginning plus measurement csv	2021-12-25 13:52:33 +01:00
Maximilian Gaul	134e3f4b78	Also pinnend neighbor-struct memory, added additional performance measurements, added nvprof result to logbook	2021-12-18 15:58:56 +01:00
Maximilian Gaul	c2bfa3ca3f	Add scripts for perf measurement, made atom-memory allocation pinnend using 'cudaMallocHost', added measurements for atom pinnend memory	2021-12-18 13:02:04 +01:00
Maximilian Gaul	2a099da5b7	Started cuda profiling, added first result to logbook	2021-12-03 08:13:43 +01:00
Maximilian Gaul	7691b23d67	Measure memory transfer of CPU to GPU, add explanation how to distribute calculation among multiple GPUs	2021-12-01 17:16:32 +01:00
Maximilian Gaul	da90466f98	Added first performance measurements with threads per block from 1 to 32	2021-11-25 08:09:20 +01:00
Maximilian Gaul	8f723c1299	Added command line description of MD-Bench, added memory transfer rate from CPU to GPU to force.cu	2021-11-23 15:55:23 +01:00
Maximilian Gaul	0586ef150a	Fix num of threads instead of num of blocks, add logbook template	2021-11-15 19:39:09 +01:00
Maximilian Gaul	2e5d973f7d	Rough rewrite to execute outer loop of force calculation in parallel, not inner loop	2021-11-14 10:02:23 +01:00
Maximilian Gaul	e2fd1a0476	Fixed bug, results are now equal to master branch (but still slow)	2021-11-11 21:00:30 +01:00
Maximilian Gaul	4105c844c6	Runs fine (but slow), results seem to be slightly off from original	2021-11-11 20:47:06 +01:00
Maximilian Gaul	1f5c9c4b23	Fixed segfault error, added more cudaErrorChecks, added cudaFree to avoid memory leak	2021-11-11 20:29:14 +01:00
Maximilian Gaul	29e115464b	Fixed cudaMemcpy for AOS data layout, added debug outputs, added cudaErrorChecks	2021-11-11 20:14:30 +01:00
Maximilian Gaul	1a54314c8b	First run but segfault at the moment after a few seconds	2021-11-11 15:23:46 +01:00
Maximilian Gaul	280f595b7f	Fixed linker error by putting includes and cuda function in extern 'C'	2021-11-11 14:49:29 +01:00
Maximilian Gaul	3428974730	getTimeStamp() couldn't get linked	2021-11-11 08:03:56 +01:00
Maximilian Gaul	b54842f764	Added Makefile instructions for .cu files	2021-11-11 07:27:12 +01:00
Maximilian Gaul	9730164e6f	Rename force.c to force.cu because of cuda build errors	2021-11-10 16:20:04 +01:00
Maximilian Gaul	0f5fdd3708	Sum results after cuda function executed	2021-11-10 16:02:05 +01:00
Maximilian Gaul	3f7fb7f22a	cudaMemcpy of Atom and other properties, first draft implementation of CUDA kernel	2021-11-09 16:40:25 +01:00
Maximilian Gaul	bfa6c581c3	Copy necessary values for force calculation into cuda memory	2021-11-09 08:37:37 +01:00
Maximilian Gaul	fd886e77eb	Added make config for NVCC	2021-11-08 20:32:12 +01:00