Move src directory to lammps

Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-08-04 17:25:31 +02:00
parent eeba125a52
commit bc7b523979
28 changed files with 0 additions and 0 deletions
--- a/lammps/allocate.c
+++ b/lammps/allocate.c
@@ -0,0 +1,89 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+
+#include <cuda_runtime.h>
+
+void checkCUDAError(const char *msg, cudaError_t err)
+{
+    if (err != cudaSuccess)
+    {
+        //print a human readable error message
+        printf("[CUDA ERROR %s]: %s\r\n", msg, cudaGetErrorString(err));
+        exit(-1);
+    }
+}
+
+
+void* allocate (int alignment, size_t bytesize)
+{
+    int errorCode;
+    void* ptr;
+
+    checkCUDAError( "allocate", cudaMallocHost((void**)&ptr, bytesize) );
+
+    return ptr;
+
+    /*
+    errorCode =  posix_memalign(&ptr, alignment, bytesize);
+
+    if (errorCode) {
+        if (errorCode == EINVAL) {
+            fprintf(stderr,
+                    "Error: Alignment parameter is not a power of two\n");
+            exit(EXIT_FAILURE);
+        }
+        if (errorCode == ENOMEM) {
+            fprintf(stderr,
+                    "Error: Insufficient memory to fulfill the request\n");
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    if (ptr == NULL) {
+        fprintf(stderr, "Error: posix_memalign failed!\n");
+        exit(EXIT_FAILURE);
+    }
+
+    return ptr;
+    */
+}
+
+void* reallocate (
+        void* ptr,
+        int alignment,
+        size_t newBytesize,
+        size_t oldBytesize)
+{
+    void* newarray = allocate(alignment, newBytesize);
+
+    if(ptr != NULL) {
+        memcpy(newarray, ptr, oldBytesize);
+        cudaFreeHost(ptr);
+    }
+
+    return newarray;
+}
--- a/lammps/atom.c
+++ b/lammps/atom.c
@@ -0,0 +1,185 @@
+/*
+ * =======================================================================================
+ *
+ *   Authors:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *              Rafael Ravedutti (rr), rafaelravedutti@gmail.com
+ *
+ *   Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include <atom.h>
+#include <allocate.h>
+#include <util.h>
+
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+
+#define DELTA 20000
+
+void initAtom(Atom *atom)
+{
+    atom->x  = NULL; atom->y  = NULL; atom->z  = NULL;
+    atom->vx = NULL; atom->vy = NULL; atom->vz = NULL;
+    atom->fx = NULL; atom->fy = NULL; atom->fz = NULL;
+    atom->Natoms = 0;
+    atom->Nlocal = 0;
+    atom->Nghost = 0;
+    atom->Nmax   = 0;
+    atom->type = NULL;
+    atom->ntypes = 0;
+    atom->epsilon = NULL;
+    atom->sigma6 = NULL;
+    atom->cutforcesq = NULL;
+    atom->cutneighsq = NULL;
+}
+
+void createAtom(Atom *atom, Parameter *param)
+{
+    MD_FLOAT xlo = 0.0; MD_FLOAT xhi = param->xprd;
+    MD_FLOAT ylo = 0.0; MD_FLOAT yhi = param->yprd;
+    MD_FLOAT zlo = 0.0; MD_FLOAT zhi = param->zprd;
+    atom->Natoms = 4 * param->nx * param->ny * param->nz;
+    atom->Nlocal = 0;
+    atom->ntypes = param->ntypes;
+    checkCUDAError( "atom->epsilon cudaMallocHost", cudaMallocHost((void**)&(atom->epsilon), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    checkCUDAError( "atom->sigma6 cudaMallocHost", cudaMallocHost((void**)&(atom->sigma6), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    checkCUDAError( "atom->cutforcesq cudaMallocHost", cudaMallocHost((void**)&(atom->cutforcesq), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    checkCUDAError( "atom->cutneighsq cudaMallocHost", cudaMallocHost((void**)&(atom->cutneighsq), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
+        atom->epsilon[i] = param->epsilon;
+        atom->sigma6[i] = param->sigma6;
+        atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
+        atom->cutforcesq[i] = param->cutforce * param->cutforce;
+    }
+
+    MD_FLOAT alat = pow((4.0 / param->rho), (1.0 / 3.0));
+    int ilo = (int) (xlo / (0.5 * alat) - 1);
+    int ihi = (int) (xhi / (0.5 * alat) + 1);
+    int jlo = (int) (ylo / (0.5 * alat) - 1);
+    int jhi = (int) (yhi / (0.5 * alat) + 1);
+    int klo = (int) (zlo / (0.5 * alat) - 1);
+    int khi = (int) (zhi / (0.5 * alat) + 1);
+
+    ilo = MAX(ilo, 0);
+    ihi = MIN(ihi, 2 * param->nx - 1);
+    jlo = MAX(jlo, 0);
+    jhi = MIN(jhi, 2 * param->ny - 1);
+    klo = MAX(klo, 0);
+    khi = MIN(khi, 2 * param->nz - 1);
+
+    MD_FLOAT xtmp, ytmp, ztmp, vxtmp, vytmp, vztmp;
+    int i, j, k, m, n;
+    int sx = 0; int sy = 0; int sz = 0;
+    int ox = 0; int oy = 0; int oz = 0;
+    int subboxdim = 8;
+
+    while(oz * subboxdim <= khi) {
+
+        k = oz * subboxdim + sz;
+        j = oy * subboxdim + sy;
+        i = ox * subboxdim + sx;
+
+        if(((i + j + k) % 2 == 0) &&
+                (i >= ilo) && (i <= ihi) &&
+                (j >= jlo) && (j <= jhi) &&
+                (k >= klo) && (k <= khi)) {
+
+            xtmp = 0.5 * alat * i;
+            ytmp = 0.5 * alat * j;
+            ztmp = 0.5 * alat * k;
+
+            if( xtmp >= xlo && xtmp < xhi &&
+                    ytmp >= ylo && ytmp < yhi &&
+                    ztmp >= zlo && ztmp < zhi ) {
+
+                n = k * (2 * param->ny) * (2 * param->nx) +
+                    j * (2 * param->nx) +
+                    i + 1;
+
+                for(m = 0; m < 5; m++) {
+                    myrandom(&n);
+                }
+                vxtmp = myrandom(&n);
+
+                for(m = 0; m < 5; m++){
+                    myrandom(&n);
+                }
+                vytmp = myrandom(&n);
+
+                for(m = 0; m < 5; m++) {
+                    myrandom(&n);
+                }
+                vztmp = myrandom(&n);
+
+                if(atom->Nlocal == atom->Nmax) {
+                    growAtom(atom);
+                }
+
+                atom_x(atom->Nlocal) = xtmp;
+                atom_y(atom->Nlocal) = ytmp;
+                atom_z(atom->Nlocal) = ztmp;
+                atom_vx(atom->Nlocal) = vxtmp;
+                atom_vy(atom->Nlocal) = vytmp;
+                atom_vz(atom->Nlocal) = vztmp;
+                atom->type[atom->Nlocal] = rand() % atom->ntypes;
+                atom->Nlocal++;
+            }
+        }
+
+        sx++;
+
+        if(sx == subboxdim) { sx = 0; sy++; }
+        if(sy == subboxdim) { sy = 0; sz++; }
+        if(sz == subboxdim) { sz = 0; ox++; }
+        if(ox * subboxdim > ihi) { ox = 0; oy++; }
+        if(oy * subboxdim > jhi) { oy = 0; oz++; }
+    }
+}
+
+void growAtom(Atom *atom)
+{
+    int nold = atom->Nmax;
+    atom->Nmax += DELTA;
+
+    #ifdef AOS
+    atom->x  = (MD_FLOAT*) reallocate(atom->x,  ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
+
+    atom->fx = (MD_FLOAT*) reallocate(atom->fx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
+
+    atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
+    #else
+    atom->x  = (MD_FLOAT*) reallocate(atom->x,  ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+    atom->y  = (MD_FLOAT*) reallocate(atom->y,  ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+    atom->z  = (MD_FLOAT*) reallocate(atom->z,  ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+
+    atom->fx = (MD_FLOAT*) reallocate(atom->fx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+    atom->fy = (MD_FLOAT*) reallocate(atom->fy, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+    atom->fz = (MD_FLOAT*) reallocate(atom->fz, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+
+    atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+    atom->vy = (MD_FLOAT*) reallocate(atom->vy, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+    atom->vz = (MD_FLOAT*) reallocate(atom->vz, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+
+    #endif
+
+    atom->type = (int *) reallocate(atom->type, ALIGNMENT, atom->Nmax * sizeof(int), nold * sizeof(int));
+}
--- a/lammps/eam_utils.c
+++ b/lammps/eam_utils.c
@@ -0,0 +1,285 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <allocate.h>
+#include <atom.h>
+#include <eam.h>
+#include <parameter.h>
+#include <util.h>
+
+#ifndef MAXLINE
+#define MAXLINE 4096
+#endif
+
+void initEam(Eam* eam, Parameter* param) {
+    int ntypes = param->ntypes;
+    eam->nmax = 0;
+    eam->fp = NULL;
+    coeff(eam, param);
+    init_style(eam, param);
+}
+
+void coeff(Eam* eam, Parameter* param) {
+    read_file(&eam->file, param->input_file);
+    param->mass = eam->file.mass;
+    param->cutforce = eam->file.cut;
+    param->cutneigh = param->cutforce + 1.0;
+    param->temp = 600.0;
+    param->dt = 0.001;
+    param->rho = 0.07041125;
+    param->dtforce = 0.5 * param->dt / param->mass;
+}
+
+void init_style(Eam* eam, Parameter* param) {
+    // convert read-in file(s) to arrays and spline them
+    file2array(eam);
+    array2spline(eam, param);
+}
+
+void read_file(Funcfl* file, const char* filename) {
+    FILE* fptr;
+    char line[MAXLINE];
+
+    fptr = fopen(filename, "r");
+    if(fptr == NULL) {
+        printf("Can't open EAM Potential file: %s\n", filename);
+        exit(0);
+    }
+
+    int tmp;
+    fgets(line, MAXLINE, fptr);
+    fgets(line, MAXLINE, fptr);
+    sscanf(line, "%d %lg", &tmp, &(file->mass));
+    fgets(line, MAXLINE, fptr);
+    sscanf(line, "%d %lg %d %lg %lg", &file->nrho, &file->drho, &file->nr, &file->dr, &file->cut);
+
+    //printf("Read: %lf %i %lf %i %lf %lf\n",file->mass,file->nrho,file->drho,file->nr,file->dr,file->cut);
+    file->frho = (MD_FLOAT *) allocate(ALIGNMENT, (file->nrho + 1) * sizeof(MD_FLOAT));
+    file->rhor = (MD_FLOAT *) allocate(ALIGNMENT, (file->nr + 1) * sizeof(MD_FLOAT));
+    file->zr = (MD_FLOAT *) allocate(ALIGNMENT, (file->nr + 1) * sizeof(MD_FLOAT));
+    grab(fptr, file->nrho, file->frho);
+    grab(fptr, file->nr, file->zr);
+    grab(fptr, file->nr, file->rhor);
+    for(int i = file->nrho; i > 0; i--) file->frho[i] = file->frho[i - 1];
+    for(int i = file->nr; i > 0; i--) file->rhor[i] = file->rhor[i - 1];
+    for(int i = file->nr; i > 0; i--) file->zr[i] = file->zr[i - 1];
+    fclose(fptr);
+}
+
+void file2array(Eam* eam) {
+    int i, j, k, m, n;
+    double sixth = 1.0 / 6.0;
+
+    // determine max function params from all active funcfl files
+    // active means some element is pointing at it via map
+    int active;
+    double rmax, rhomax;
+    eam->dr = eam->drho = rmax = rhomax = 0.0;
+    active = 0;
+    Funcfl* file = &eam->file;
+    eam->dr = MAX(eam->dr, file->dr);
+    eam->drho = MAX(eam->drho, file->drho);
+    rmax = MAX(rmax, (file->nr - 1) * file->dr);
+    rhomax = MAX(rhomax, (file->nrho - 1) * file->drho);
+
+    // set nr,nrho from cutoff and spacings
+    // 0.5 is for round-off in divide
+    eam->nr = (int)(rmax / eam->dr + 0.5);
+    eam->nrho = (int)(rhomax / eam->drho + 0.5);
+
+    // ------------------------------------------------------------------
+    // setup frho arrays
+    // ------------------------------------------------------------------
+
+    // allocate frho arrays
+    // nfrho = # of funcfl files + 1 for zero array
+    eam->frho = (MD_FLOAT *) allocate(ALIGNMENT, (eam->nrho + 1) * sizeof(MD_FLOAT));
+
+    // interpolate each file's frho to a single grid and cutoff
+    double r, p, cof1, cof2, cof3, cof4;
+    n = 0;
+
+    for(m = 1; m <= eam->nrho; m++) {
+        r = (m - 1) * eam->drho;
+        p = r / file->drho + 1.0;
+        k = (int)(p);
+        k = MIN(k, file->nrho - 2);
+        k = MAX(k, 2);
+        p -= k;
+        p = MIN(p, 2.0);
+        cof1 = -sixth * p * (p - 1.0) * (p - 2.0);
+        cof2 = 0.5 * (p * p - 1.0) * (p - 2.0);
+        cof3 = -0.5 * p * (p + 1.0) * (p - 2.0);
+        cof4 = sixth * p * (p * p - 1.0);
+        eam->frho[m] = cof1 * file->frho[k - 1] + cof2 * file->frho[k] +
+                       cof3 * file->frho[k + 1] + cof4 * file->frho[k + 2];
+    }
+
+
+    // ------------------------------------------------------------------
+    // setup rhor arrays
+    // ------------------------------------------------------------------
+
+    // allocate rhor arrays
+    // nrhor = # of funcfl files
+    eam->rhor = (MD_FLOAT *) allocate(ALIGNMENT, (eam->nr + 1) * sizeof(MD_FLOAT));
+
+    // interpolate each file's rhor to a single grid and cutoff
+    for(m = 1; m <= eam->nr; m++) {
+        r = (m - 1) * eam->dr;
+        p = r / file->dr + 1.0;
+        k = (int)(p);
+        k = MIN(k, file->nr - 2);
+        k = MAX(k, 2);
+        p -= k;
+        p = MIN(p, 2.0);
+        cof1 = -sixth * p * (p - 1.0) * (p - 2.0);
+        cof2 = 0.5 * (p * p - 1.0) * (p - 2.0);
+        cof3 = -0.5 * p * (p + 1.0) * (p - 2.0);
+        cof4 = sixth * p * (p * p - 1.0);
+        eam->rhor[m] = cof1 * file->rhor[k - 1] + cof2 * file->rhor[k] +
+                       cof3 * file->rhor[k + 1] + cof4 * file->rhor[k + 2];
+        //if(m==119)printf("BuildRho: %e %e %e %e %e %e\n",rhor[m],cof1,cof2,cof3,cof4,file->rhor[k]);
+    }
+
+    // type2rhor[i][j] = which rhor array (0 to nrhor-1) each type pair maps to
+    // for funcfl files, I,J mapping only depends on I
+    // OK if map = -1 (non-EAM atom in pair hybrid) b/c type2rhor not used
+
+    // ------------------------------------------------------------------
+    // setup z2r arrays
+    // ------------------------------------------------------------------
+
+    // allocate z2r arrays
+    // nz2r = N*(N+1)/2 where N = # of funcfl files
+    eam->z2r = (MD_FLOAT *) allocate(ALIGNMENT, (eam->nr + 1) * sizeof(MD_FLOAT));
+
+    // create a z2r array for each file against other files, only for I >= J
+    // interpolate zri and zrj to a single grid and cutoff
+    double zri, zrj;
+    Funcfl* ifile = &eam->file;
+    Funcfl* jfile = &eam->file;
+
+    for(m = 1; m <= eam->nr; m++) {
+        r = (m - 1) * eam->dr;
+        p = r / ifile->dr + 1.0;
+        k = (int)(p);
+        k = MIN(k, ifile->nr - 2);
+        k = MAX(k, 2);
+        p -= k;
+        p = MIN(p, 2.0);
+        cof1 = -sixth * p * (p - 1.0) * (p - 2.0);
+        cof2 = 0.5 * (p * p - 1.0) * (p - 2.0);
+        cof3 = -0.5 * p * (p + 1.0) * (p - 2.0);
+        cof4 = sixth * p * (p * p - 1.0);
+        zri = cof1 * ifile->zr[k - 1] + cof2 * ifile->zr[k] +
+        cof3 * ifile->zr[k + 1] + cof4 * ifile->zr[k + 2];
+
+        p = r / jfile->dr + 1.0;
+        k = (int)(p);
+        k = MIN(k, jfile->nr - 2);
+        k = MAX(k, 2);
+        p -= k;
+        p = MIN(p, 2.0);
+        cof1 = -sixth * p * (p - 1.0) * (p - 2.0);
+        cof2 = 0.5 * (p * p - 1.0) * (p - 2.0);
+        cof3 = -0.5 * p * (p + 1.0) * (p - 2.0);
+        cof4 = sixth * p * (p * p - 1.0);
+        zrj = cof1 * jfile->zr[k - 1] + cof2 * jfile->zr[k] +
+        cof3 * jfile->zr[k + 1] + cof4 * jfile->zr[k + 2];
+
+        eam->z2r[m] = 27.2 * 0.529 * zri * zrj;
+    }
+}
+
+void array2spline(Eam* eam, Parameter* param) {
+    eam->rdr = 1.0 / eam->dr;
+    eam->rdrho = 1.0 / eam->drho;
+    eam->nrho_tot = (eam->nrho + 1) * 7 + 64;
+    eam->nr_tot = (eam->nr + 1) * 7 + 64;
+    eam->nrho_tot -= eam->nrho_tot%64;
+    eam->nr_tot -= eam->nr_tot%64;
+
+    int ntypes = param->ntypes;
+    eam->frho_spline = (MD_FLOAT *) allocate(ALIGNMENT, ntypes * ntypes * eam->nrho_tot * sizeof(MD_FLOAT));
+    eam->rhor_spline = (MD_FLOAT *) allocate(ALIGNMENT, ntypes * ntypes * eam->nr_tot * sizeof(MD_FLOAT));
+    eam->z2r_spline = (MD_FLOAT *) allocate(ALIGNMENT, ntypes * ntypes * eam->nr_tot * sizeof(MD_FLOAT));
+    interpolate(eam->nrho, eam->drho, eam->frho, eam->frho_spline);
+    interpolate(eam->nr, eam->dr, eam->rhor, eam->rhor_spline);
+    interpolate(eam->nr, eam->dr, eam->z2r, eam->z2r_spline);
+
+    // replicate data for multiple types;
+    for(int tt = 0; tt < ntypes * ntypes; tt++) {
+        for(int k = 0; k < eam->nrho_tot; k++)
+            eam->frho_spline[tt*eam->nrho_tot + k] = eam->frho_spline[k];
+        for(int k = 0; k < eam->nr_tot; k++)
+            eam->rhor_spline[tt*eam->nr_tot + k] = eam->rhor_spline[k];
+        for(int k = 0; k < eam->nr_tot; k++)
+            eam->z2r_spline[tt*eam->nr_tot + k] = eam->z2r_spline[k];
+    }
+}
+
+void interpolate(int n, MD_FLOAT delta, MD_FLOAT* f, MD_FLOAT* spline) {
+    for(int m = 1; m <= n; m++) spline[m * 7 + 6] = f[m];
+
+    spline[1 * 7 + 5] = spline[2 * 7 + 6] - spline[1 * 7 + 6];
+    spline[2 * 7 + 5] = 0.5 * (spline[3 * 7 + 6] - spline[1 * 7 + 6]);
+    spline[(n - 1) * 7 + 5] = 0.5 * (spline[n * 7 + 6] - spline[(n - 2) * 7 + 6]);
+    spline[n * 7 + 5] = spline[n * 7 + 6] - spline[(n - 1) * 7 + 6];
+
+    for(int m = 3; m <= n - 2; m++)
+        spline[m * 7 + 5] = ((spline[(m - 2) * 7 + 6] - spline[(m + 2) * 7 + 6]) +
+                            8.0 * (spline[(m + 1) * 7 + 6] - spline[(m - 1) * 7 + 6])) / 12.0;
+
+    for(int m = 1; m <= n - 1; m++) {
+        spline[m * 7 + 4] = 3.0 * (spline[(m + 1) * 7 + 6] - spline[m * 7 + 6]) -
+                            2.0 * spline[m * 7 + 5] - spline[(m + 1) * 7 + 5];
+        spline[m * 7 + 3] = spline[m * 7 + 5] + spline[(m + 1) * 7 + 5] -
+                            2.0 * (spline[(m + 1) * 7 + 6] - spline[m * 7 + 6]);
+    }
+
+    spline[n * 7 + 4] = 0.0;
+    spline[n * 7 + 3] = 0.0;
+
+    for(int m = 1; m <= n; m++) {
+        spline[m * 7 + 2] = spline[m * 7 + 5] / delta;
+        spline[m * 7 + 1] = 2.0 * spline[m * 7 + 4] / delta;
+        spline[m * 7 + 0] = 3.0 * spline[m * 7 + 3] / delta;
+    }
+}
+
+void grab(FILE* fptr, int n, MD_FLOAT* list) {
+    char* ptr;
+    char line[MAXLINE];
+    int i = 0;
+
+    while(i < n) {
+        fgets(line, MAXLINE, fptr);
+        ptr = strtok(line, " \t\n\r\f");
+        list[i++] = atof(ptr);
+        while(ptr = strtok(NULL, " \t\n\r\f")) list[i++] = atof(ptr);
+    }
+}
--- a/lammps/force-tracing.c
+++ b/lammps/force-tracing.c
@@ -0,0 +1,232 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#include <likwid-marker.h>
+
+#include <timing.h>
+#include <neighbor.h>
+#include <parameter.h>
+#include <atom.h>
+#include <stats.h>
+
+#if defined(MEM_TRACER) || defined(INDEX_TRACER)
+#include <stdio.h>
+#include <stdlib.h>
+#endif
+
+#ifndef VECTOR_WIDTH
+#   define VECTOR_WIDTH                 8
+#endif
+
+#ifndef TRACER_CONDITION
+#   define TRACER_CONDITION                 (!(timestep % param->every))
+#endif
+
+#ifdef MEM_TRACER
+#   define MEM_TRACER_INIT                  FILE *mem_tracer_fp; \
+                                            if(TRACER_CONDITION) { \
+                                                char mem_tracer_fn[128]; \
+                                                snprintf(mem_tracer_fn, sizeof mem_tracer_fn, "mem_tracer_%d.out", timestep); \
+                                                mem_tracer_fp = fopen(mem_tracer_fn, "w");
+                                            }
+
+#   define MEM_TRACER_END                   if(TRACER_CONDITION) { fclose(mem_tracer_fp); }
+#   define MEM_TRACE(addr, op)              if(TRACER_CONDITION) { fprintf(mem_tracer_fp, "%c: %p\n", op, (void *)(&(addr))); }
+#else
+#   define MEM_TRACER_INIT
+#   define MEM_TRACER_END
+#   define MEM_TRACE(addr, op)
+#endif
+
+#ifdef INDEX_TRACER
+#   define INDEX_TRACER_INIT                FILE *index_tracer_fp; \
+                                            if(TRACER_CONDITION) { \
+                                                char index_tracer_fn[128]; \
+                                                snprintf(index_tracer_fn, sizeof index_tracer_fn, "index_tracer_%d.out", timestep); \
+                                                index_tracer_fp = fopen(index_tracer_fn, "w"); \
+                                            }
+
+#   define INDEX_TRACER_END                 if(TRACER_CONDITION) { fclose(index_tracer_fp); }
+#   define INDEX_TRACE_NATOMS(nl, ng, mn)   if(TRACER_CONDITION) { fprintf(index_tracer_fp, "N: %d %d %d\n", nl, ng, mn); }
+#   define INDEX_TRACE_ATOM(a)              if(TRACER_CONDITION) { fprintf(index_tracer_fp, "A: %d\n", a); }
+#   define INDEX_TRACE(l, e)                if(TRACER_CONDITION) { \
+                                                for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
+                                                    int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
+                                                    fprintf(index_tracer_fp, "I: "); \
+                                                    for(int __j = 0; __j < __e; ++__j) { \
+                                                        fprintf(index_tracer_fp, "%d ", l[__i + __j]); \
+                                                    } \
+                                                    fprintf(index_tracer_fp, "\n"); \
+                                                } \
+                                            }
+
+#   define DIST_TRACE_SORT(l, e)            if(TRACER_CONDITION) { \
+                                                for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
+                                                    int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
+                                                    if(__e > 1) { \
+                                                        for(int __j = __i; __j < __i + __e - 1; ++__j) { \
+                                                            for(int __k = __i; __k < __i + __e - (__j - __i) - 1; ++__k) { \
+                                                                if(l[__k] > l[__k + 1]) { \
+                                                                    int __t = l[__k]; \
+                                                                    l[__k] = l[__k + 1]; \
+                                                                    l[__k + 1] = __t; \
+                                                                } \
+                                                            } \
+                                                        } \
+                                                    } \
+                                                } \
+                                            }
+
+#   define DIST_TRACE(l, e)                 if(TRACER_CONDITION) { \
+                                                for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
+                                                    int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
+                                                    if(__e > 1) { \
+                                                        fprintf(index_tracer_fp, "D: "); \
+                                                        for(int __j = 0; __j < __e - 1; ++__j) { \
+                                                            int __dist = abs(l[__i + __j + 1] - l[__i + __j]); \
+                                                            fprintf(index_tracer_fp, "%d ", __dist); \
+                                                        } \
+                                                        fprintf(index_tracer_fp, "\n"); \
+                                                    } \
+                                                } \
+                                            }
+#else
+#   define INDEX_TRACER_INIT
+#   define INDEX_TRACER_END
+#   define INDEX_TRACE_NATOMS(nl, ng, mn)
+#   define INDEX_TRACE_ATOM(a)
+#   define INDEX_TRACE(l, e)
+#   define DIST_TRACE_SORT(l, e)
+#   define DIST_TRACE(l, e)
+#endif
+
+double computeForceTracing(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats, int first_exec, int timestep) {
+    MEM_TRACER_INIT;
+    INDEX_TRACER_INIT;
+    int Nlocal = atom->Nlocal;
+    int* neighs;
+    MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
+    #ifndef EXPLICIT_TYPES
+    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
+    MD_FLOAT sigma6 = param->sigma6;
+    MD_FLOAT epsilon = param->epsilon;
+    #endif
+
+    for(int i = 0; i < Nlocal; i++) {
+        fx[i] = 0.0;
+        fy[i] = 0.0;
+        fz[i] = 0.0;
+    }
+
+    INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
+    double S = getTimeStamp();
+    LIKWID_MARKER_START("force");
+
+    for(int na = 0; na < (first_exec ? 1 : ATOMS_LOOP_RUNS); na++) {
+        #pragma omp parallel for
+        for(int i = 0; i < Nlocal; i++) {
+            neighs = &neighbor->neighbors[i * neighbor->maxneighs];
+            int numneighs = neighbor->numneigh[i];
+            MD_FLOAT xtmp = atom_x(i);
+            MD_FLOAT ytmp = atom_y(i);
+            MD_FLOAT ztmp = atom_z(i);
+            MD_FLOAT fix = 0;
+            MD_FLOAT fiy = 0;
+            MD_FLOAT fiz = 0;
+
+            MEM_TRACE(atom_x(i), 'R');
+            MEM_TRACE(atom_y(i), 'R');
+            MEM_TRACE(atom_z(i), 'R');
+            INDEX_TRACE_ATOM(i);
+
+            #ifdef EXPLICIT_TYPES
+            const int type_i = atom->type[i];
+            MEM_TRACE(atom->type(i), 'R');
+            #endif
+
+            #if defined(VARIANT) && VARIANT == stub && defined(NEIGHBORS_LOOP_RUNS) && NEIGHBORS_LOOP_RUNS > 1
+            #define REPEAT_NEIGHBORS_LOOP
+            int nmax = first_exec ? 1 : NEIGHBORS_LOOP_RUNS;
+            for(int nn = 0; nn < (first_exec ? 1 : NEIGHBORS_LOOP_RUNS); nn++) {
+            #endif
+
+                //DIST_TRACE_SORT(neighs, numneighs);
+                INDEX_TRACE(neighs, numneighs);
+                //DIST_TRACE(neighs, numneighs);
+
+                for(int k = 0; k < numneighs; k++) {
+                    int j = neighs[k];
+                    MD_FLOAT delx = xtmp - atom_x(j);
+                    MD_FLOAT dely = ytmp - atom_y(j);
+                    MD_FLOAT delz = ztmp - atom_z(j);
+                    MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+
+                    MEM_TRACE(neighs[k], 'R');
+                    MEM_TRACE(atom_x(j), 'R');
+                    MEM_TRACE(atom_y(j), 'R');
+                    MEM_TRACE(atom_z(j), 'R');
+
+                    #ifdef EXPLICIT_TYPES
+                    const int type_j = atom->type[j];
+                    const int type_ij = type_i * atom->ntypes + type_j;
+                    const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
+                    const MD_FLOAT sigma6 = atom->sigma6[type_ij];
+                    const MD_FLOAT epsilon = atom->epsilon[type_ij];
+                    MEM_TRACE(atom->type(j), 'R');
+                    #endif
+
+                    if(rsq < cutforcesq) {
+                        MD_FLOAT sr2 = 1.0 / rsq;
+                        MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
+                        MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
+                        fix += delx * force;
+                        fiy += dely * force;
+                        fiz += delz * force;
+                    }
+                }
+
+            #ifdef REPEAT_NEIGHBORS_LOOP
+            }
+            #endif
+
+            fx[i] += fix;
+            fy[i] += fiy;
+            fz[i] += fiz;
+
+            addStat(stats->total_force_neighs, numneighs);
+            addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
+            MEM_TRACE(fx[i], 'R');
+            MEM_TRACE(fx[i], 'W');
+            MEM_TRACE(fy[i], 'R');
+            MEM_TRACE(fy[i], 'W');
+            MEM_TRACE(fz[i], 'R');
+            MEM_TRACE(fz[i], 'W');
+        }
+    }
+
+    LIKWID_MARKER_STOP("force");
+    double E = getTimeStamp();
+
+    INDEX_TRACER_END;
+    MEM_TRACER_END;
+    return E-S;
+}
--- a/lammps/force.cu
+++ b/lammps/force.cu
@@ -0,0 +1,212 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <cuda_profiler_api.h>
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+
+extern "C" {
+    #include <likwid-marker.h>
+
+    #include <timing.h>
+    #include <neighbor.h>
+    #include <parameter.h>
+    #include <atom.h>
+    #include <allocate.h>
+}
+
+// cuda kernel
+__global__ void calc_force(
+    Atom a,
+    MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon,
+    int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh) {
+
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if( i >= Nlocal ) {
+        return;
+    }
+
+    Atom *atom = &a;
+
+    const int numneighs = neigh_numneigh[i];
+
+    MD_FLOAT xtmp = atom_x(i);
+    MD_FLOAT ytmp = atom_y(i);
+    MD_FLOAT ztmp = atom_z(i);
+
+    MD_FLOAT fix = 0;
+    MD_FLOAT fiy = 0;
+    MD_FLOAT fiz = 0;
+
+    for(int k = 0; k < numneighs; k++) {
+        int j = neigh_neighbors[atom->Nlocal * k + i];
+        MD_FLOAT delx = xtmp - atom_x(j);
+        MD_FLOAT dely = ytmp - atom_y(j);
+        MD_FLOAT delz = ztmp - atom_z(j);
+        MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+
+#ifdef EXPLICIT_TYPES
+        const int type_j = atom->type[j];
+        const int type_ij = type_i * atom->ntypes + type_j;
+        const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
+        const MD_FLOAT sigma6 = atom->sigma6[type_ij];
+        const MD_FLOAT epsilon = atom->epsilon[type_ij];
+#endif
+
+        if(rsq < cutforcesq) {
+            MD_FLOAT sr2 = 1.0 / rsq;
+            MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
+            MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
+            fix += delx * force;
+            fiy += dely * force;
+            fiz += delz * force;
+        }
+    }
+
+    atom_fx(i) = fix;
+    atom_fy(i) = fiy;
+    atom_fz(i) = fiz;
+}
+
+__global__ void kernel_initial_integrate(MD_FLOAT dtforce, MD_FLOAT dt, int Nlocal, Atom a) {
+
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if( i >= Nlocal ) {
+        return;
+    }
+
+    Atom *atom = &a;
+
+    atom_vx(i) += dtforce * atom_fx(i);
+    atom_vy(i) += dtforce * atom_fy(i);
+    atom_vz(i) += dtforce * atom_fz(i);
+    atom_x(i) = atom_x(i) + dt * atom_vx(i);
+    atom_y(i) = atom_y(i) + dt * atom_vy(i);
+    atom_z(i) = atom_z(i) + dt * atom_vz(i);
+}
+
+__global__ void kernel_final_integrate(MD_FLOAT dtforce, int Nlocal, Atom a) {
+
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if( i >= Nlocal ) {
+        return;
+    }
+
+    Atom *atom = &a;
+
+    atom_vx(i) += dtforce * atom_fx(i);
+    atom_vy(i) += dtforce * atom_fy(i);
+    atom_vz(i) += dtforce * atom_fz(i);
+}
+
+extern "C" {
+
+
+
+void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom, const int num_threads_per_block) {
+
+    const int Nlocal = atom->Nlocal;
+    const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
+
+    kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, *c_atom);
+
+    checkCUDAError( "PeekAtLastError FinalIntegrate", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync FinalIntegrate", cudaDeviceSynchronize() );
+
+    if(doReneighbour) {
+        checkCUDAError( "FinalIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom->vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
+    }
+}
+
+void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom, const int num_threads_per_block) {
+
+    const int Nlocal = atom->Nlocal;
+    const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
+
+    kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, *c_atom);
+
+    checkCUDAError( "PeekAtLastError InitialIntegrate", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync InitialIntegrate", cudaDeviceSynchronize() );
+
+    if(doReneighbour) {
+        checkCUDAError( "InitialIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom->vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
+    }
+}
+
+double computeForce(
+        bool reneighbourHappenend,
+        Parameter *param,
+        Atom *atom,
+        Neighbor *neighbor,
+        Atom *c_atom,
+        Neighbor *c_neighbor,
+        int num_threads_per_block
+        )
+{
+    int Nlocal = atom->Nlocal;
+#ifndef EXPLICIT_TYPES
+    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
+    MD_FLOAT sigma6 = param->sigma6;
+    MD_FLOAT epsilon = param->epsilon;
+#endif
+
+    /*
+    int nDevices;
+    cudaGetDeviceCount(&nDevices);
+    size_t free, total;
+    for(int i = 0; i < nDevices; ++i) {
+        cudaMemGetInfo( &free, &total );
+        cudaDeviceProp prop;
+        cudaGetDeviceProperties(&prop, i);
+        printf("DEVICE %d/%d NAME: %s\r\n with %ld MB/%ld MB memory used", i + 1, nDevices, prop.name, free / 1024 / 1024, total / 1024 / 1024);
+    }
+    */
+
+
+    // HINT: Run with cuda-memcheck ./MDBench-NVCC in case of error
+
+    // checkCUDAError( "c_atom->fx memset", cudaMemset(c_atom->fx, 0, sizeof(MD_FLOAT) * Nlocal * 3) );
+
+    cudaProfilerStart();
+
+    const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
+
+    double S = getTimeStamp();
+    LIKWID_MARKER_START("force");
+
+    calc_force <<< num_blocks, num_threads_per_block >>> (*c_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, c_neighbor->neighbors, c_neighbor->numneigh);
+
+    checkCUDAError( "PeekAtLastError ComputeForce", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync ComputeForce", cudaDeviceSynchronize() );
+
+    cudaProfilerStop();
+
+    LIKWID_MARKER_STOP("force");
+    double E = getTimeStamp();
+
+    return E-S;
+}
+}
--- a/lammps/force_eam.c
+++ b/lammps/force_eam.c
@@ -0,0 +1,213 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#include <likwid-marker.h>
+#include <math.h>
+
+#include <allocate.h>
+#include <timing.h>
+#include <neighbor.h>
+#include <parameter.h>
+#include <atom.h>
+#include <stats.h>
+#include <eam.h>
+#include <util.h>
+
+double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbor, Stats *stats, int first_exec, int timestep) {
+    if(eam->nmax < atom->Nmax) {
+        eam->nmax = atom->Nmax;
+        if(eam->fp != NULL) { free(eam->fp); }
+        eam->fp = (MD_FLOAT *) allocate(ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT));
+    }
+
+    int Nlocal = atom->Nlocal;
+    int* neighs;
+    MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz; int ntypes = atom->ntypes; MD_FLOAT* fp = eam->fp;
+    MD_FLOAT* rhor_spline = eam->rhor_spline; MD_FLOAT* frho_spline = eam->frho_spline; MD_FLOAT* z2r_spline = eam->z2r_spline;
+    int rdr = eam->rdr; int nr = eam->nr; int nr_tot = eam->nr_tot; int rdrho = eam->rdrho;
+    int nrho = eam->nrho; int nrho_tot = eam->nrho_tot;
+    double S = getTimeStamp();
+
+    LIKWID_MARKER_START("force_eam_fp");
+    #pragma omp parallel for
+    for(int i = 0; i < Nlocal; i++) {
+        neighs = &neighbor->neighbors[i * neighbor->maxneighs];
+        int numneighs = neighbor->numneigh[i];
+        MD_FLOAT xtmp = atom_x(i);
+        MD_FLOAT ytmp = atom_y(i);
+        MD_FLOAT ztmp = atom_z(i);
+        MD_FLOAT rhoi = 0;
+#ifdef EXPLICIT_TYPES
+        const int type_i = atom->type[i];
+#endif
+        #pragma ivdep
+        for(int k = 0; k < numneighs; k++) {
+            int j = neighs[k];
+            MD_FLOAT delx = xtmp - atom_x(j);
+            MD_FLOAT dely = ytmp - atom_y(j);
+            MD_FLOAT delz = ztmp - atom_z(j);
+            MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+#ifdef EXPLICIT_TYPES
+            const int type_j = atom->type[j];
+            const int type_ij = type_i * ntypes + type_j;
+            const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
+#else
+            const MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
+#endif
+            if(rsq < cutforcesq) {
+                MD_FLOAT p = sqrt(rsq) * rdr + 1.0;
+                int m = (int)(p);
+                m = m < nr - 1 ? m : nr - 1;
+                p -= m;
+                p = p < 1.0 ? p : 1.0;
+#ifdef EXPLICIT_TYPES
+                rhoi += ((rhor_spline[type_ij * nr_tot + m * 7 + 3] * p +
+                          rhor_spline[type_ij * nr_tot + m * 7 + 4]) * p +
+                          rhor_spline[type_ij * nr_tot + m * 7 + 5]) * p +
+                          rhor_spline[type_ij * nr_tot + m * 7 + 6];
+#else
+                rhoi += ((rhor_spline[m * 7 + 3] * p +
+                          rhor_spline[m * 7 + 4]) * p +
+                          rhor_spline[m * 7 + 5]) * p +
+                          rhor_spline[m * 7 + 6];
+#endif
+            }
+        }
+
+#ifdef EXPLICIT_TYPES
+        const int type_ii = type_i * type_i;
+#endif
+        MD_FLOAT p = 1.0 * rhoi * rdrho + 1.0;
+        int m = (int)(p);
+        m = MAX(1, MIN(m, nrho - 1));
+        p -= m;
+        p = MIN(p, 1.0);
+#ifdef EXPLICIT_TYPES
+        fp[i] = (frho_spline[type_ii * nrho_tot + m * 7 + 0] * p +
+                 frho_spline[type_ii * nrho_tot + m * 7 + 1]) * p +
+                 frho_spline[type_ii * nrho_tot + m * 7 + 2];
+#else
+        fp[i] = (frho_spline[m * 7 + 0] * p + frho_spline[m * 7 + 1]) * p + frho_spline[m * 7 + 2];
+#endif
+    }
+
+    LIKWID_MARKER_STOP("force_eam_fp");
+
+    // We still need to update fp for PBC atoms
+    for(int i = 0; i < atom->Nghost; i++) {
+        fp[Nlocal + i] = fp[atom->border_map[i]];
+    }
+
+    LIKWID_MARKER_START("force_eam");
+    for(int i = 0; i < Nlocal; i++) {
+        neighs = &neighbor->neighbors[i * neighbor->maxneighs];
+        int numneighs = neighbor->numneigh[i];
+        MD_FLOAT xtmp = atom_x(i);
+        MD_FLOAT ytmp = atom_y(i);
+        MD_FLOAT ztmp = atom_z(i);
+        MD_FLOAT fix = 0;
+        MD_FLOAT fiy = 0;
+        MD_FLOAT fiz = 0;
+#ifdef EXPLICIT_TYPES
+        const int type_i = atom->type[i];
+#endif
+
+        #pragma ivdep
+        for(int k = 0; k < numneighs; k++) {
+            int j = neighs[k];
+            MD_FLOAT delx = xtmp - atom_x(j);
+            MD_FLOAT dely = ytmp - atom_y(j);
+            MD_FLOAT delz = ztmp - atom_z(j);
+            MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+#ifdef EXPLICIT_TYPES
+            const int type_j = atom->type[j];
+            const int type_ij = type_i * ntypes + type_j;
+            const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
+#else
+            const MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
+#endif
+
+            if(rsq < cutforcesq) {
+                MD_FLOAT r = sqrt(rsq);
+                MD_FLOAT p = r * rdr + 1.0;
+                int m = (int)(p);
+                m = m < nr - 1 ? m : nr - 1;
+                p -= m;
+                p = p < 1.0 ? p : 1.0;
+
+
+                // rhoip = derivative of (density at atom j due to atom i)
+                // rhojp = derivative of (density at atom i due to atom j)
+                // phi = pair potential energy
+                // phip = phi'
+                // z2 = phi * r
+                // z2p = (phi * r)' = (phi' r) + phi
+                // psip needs both fp[i] and fp[j] terms since r_ij appears in two
+                //   terms of embed eng: Fi(sum rho_ij) and Fj(sum rho_ji)
+                //   hence embed' = Fi(sum rho_ij) rhojp + Fj(sum rho_ji) rhoip
+
+#ifdef EXPLICIT_TYPES
+                MD_FLOAT rhoip = (rhor_spline[type_ij * nr_tot + m * 7 + 0] * p +
+                                  rhor_spline[type_ij * nr_tot + m * 7 + 1]) * p +
+                                  rhor_spline[type_ij * nr_tot + m * 7 + 2];
+
+                MD_FLOAT z2p = (z2r_spline[type_ij * nr_tot + m * 7 + 0] * p +
+                                z2r_spline[type_ij * nr_tot + m * 7 + 1]) * p +
+                                z2r_spline[type_ij * nr_tot + m * 7 + 2];
+
+                MD_FLOAT z2 = ((z2r_spline[type_ij * nr_tot + m * 7 + 3] * p +
+                                z2r_spline[type_ij * nr_tot + m * 7 + 4]) * p +
+                                z2r_spline[type_ij * nr_tot + m * 7 + 5]) * p +
+                                z2r_spline[type_ij * nr_tot + m * 7 + 6];
+#else
+                MD_FLOAT rhoip = (rhor_spline[m * 7 + 0] * p + rhor_spline[m * 7 + 1]) * p + rhor_spline[m * 7 + 2];
+                MD_FLOAT z2p = (z2r_spline[m * 7 + 0] * p + z2r_spline[m * 7 + 1]) * p + z2r_spline[m * 7 + 2];
+                MD_FLOAT z2 = ((z2r_spline[m * 7 + 3] * p +
+                                z2r_spline[m * 7 + 4]) * p +
+                                z2r_spline[m * 7 + 5]) * p +
+                                z2r_spline[m * 7 + 6];
+#endif
+
+                MD_FLOAT recip = 1.0 / r;
+                MD_FLOAT phi = z2 * recip;
+                MD_FLOAT phip = z2p * recip - phi * recip;
+                MD_FLOAT psip = fp[i] * rhoip + fp[j] * rhoip + phip;
+                MD_FLOAT fpair = -psip * recip;
+
+                fix += delx * fpair;
+                fiy += dely * fpair;
+                fiz += delz * fpair;
+                //fpair *= 0.5;
+            }
+        }
+
+        fx[i] = fix;
+        fy[i] = fiy;
+        fz[i] = fiz;
+        addStat(stats->total_force_neighs, numneighs);
+        addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
+    }
+
+    LIKWID_MARKER_STOP("force_eam");
+    double E = getTimeStamp();
+    return E-S;
+}
--- a/lammps/includes/allocate.h
+++ b/lammps/includes/allocate.h
@@ -0,0 +1,33 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#include <stdlib.h>
+
+#include <cuda_runtime.h>
+
+#ifndef __ALLOCATE_H_
+#define __ALLOCATE_H_
+extern void* allocate (int alignment, size_t bytesize);
+extern void* reallocate (void* ptr, int alignment, size_t newBytesize, size_t oldBytesize);
+
+extern void checkCUDAError(const char *msg, cudaError_t err);
+#endif
--- a/lammps/includes/atom.h
+++ b/lammps/includes/atom.h
@@ -0,0 +1,78 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#include <parameter.h>
+
+#ifndef __ATOM_H_
+#define __ATOM_H_
+
+typedef struct {
+    int Natoms, Nlocal, Nghost, Nmax;
+    MD_FLOAT *x, *y, *z;
+    MD_FLOAT *vx, *vy, *vz;
+    MD_FLOAT *fx, *fy, *fz;
+    int *border_map;
+    int *type;
+    int ntypes;
+    MD_FLOAT *epsilon;
+    MD_FLOAT *sigma6;
+    MD_FLOAT *cutforcesq;
+    MD_FLOAT *cutneighsq;
+} Atom;
+
+extern void initAtom(Atom*);
+extern void createAtom(Atom*, Parameter*);
+extern void growAtom(Atom*);
+
+#ifdef AOS
+#define POS_DATA_LAYOUT     "AoS"
+
+#define atom_x(i)           atom->x[(i) * 3 + 0]
+#define atom_y(i)           atom->x[(i) * 3 + 1]
+#define atom_z(i)           atom->x[(i) * 3 + 2]
+
+#define atom_fx(i)          atom->fx[(i) * 3 + 0]
+#define atom_fy(i)          atom->fx[(i) * 3 + 1]
+#define atom_fz(i)          atom->fx[(i) * 3 + 2]
+
+#define atom_vx(i)          atom->vx[(i) * 3 + 0]
+#define atom_vy(i)          atom->vx[(i) * 3 + 1]
+#define atom_vz(i)          atom->vx[(i) * 3 + 2]
+
+#else
+#define POS_DATA_LAYOUT     "SoA"
+
+#define atom_x(i)           atom->x[i]
+#define atom_y(i)           atom->y[i]
+#define atom_z(i)           atom->z[i]
+
+#define atom_fx(i)           atom->fx[i]
+#define atom_fy(i)           atom->fy[i]
+#define atom_fz(i)           atom->fz[i]
+
+#define atom_vx(i)           atom->vx[i]
+#define atom_vy(i)           atom->vy[i]
+#define atom_vz(i)           atom->vz[i]
+
+#endif
+
+#endif
--- a/lammps/includes/eam.h
+++ b/lammps/includes/eam.h
@@ -0,0 +1,55 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#include <stdio.h>
+
+#include <atom.h>
+#include <parameter.h>
+
+#ifndef __EAM_H_
+#define __EAM_H_
+typedef struct {
+    int nrho, nr;
+    MD_FLOAT drho, dr, cut, mass;
+    MD_FLOAT *frho, *rhor, *zr;
+} Funcfl;
+
+typedef struct {
+    MD_FLOAT* fp;
+    int nmax;
+    int nrho, nr;
+    int nrho_tot, nr_tot;
+    MD_FLOAT dr, rdr, drho, rdrho;
+    MD_FLOAT *frho, *rhor, *z2r;
+    MD_FLOAT *rhor_spline, *frho_spline, *z2r_spline;
+    Funcfl file;
+} Eam;
+
+void initEam(Eam* eam, Parameter* param);
+void coeff(Eam* eam, Parameter* param);
+void init_style(Eam* eam, Parameter *param);
+void read_file(Funcfl* file, const char* filename);
+void file2array(Eam* eam);
+void array2spline(Eam* eam, Parameter* param);
+void interpolate(int n, MD_FLOAT delta, MD_FLOAT* f, MD_FLOAT* spline);
+void grab(FILE* fptr, int n, MD_FLOAT* list);
+#endif
--- a/lammps/includes/likwid-marker.h
+++ b/lammps/includes/likwid-marker.h
@@ -0,0 +1,170 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  likwid-marker.h
+ *
+ *      Description:  Header File of likwid Marker API
+ *
+ *      Version:   <VERSION>
+ *      Released:  <DATE>
+ *
+ *      Authors:  Thomas Gruber (tg), thomas.roehl@googlemail.com
+ *
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+#ifndef LIKWID_MARKER_H
+#define LIKWID_MARKER_H
+
+
+/** \addtogroup MarkerAPI Marker API module
+*  @{
+*/
+/*!
+\def LIKWID_MARKER_INIT
+Shortcut for likwid_markerInit() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_THREADINIT
+Shortcut for likwid_markerThreadInit() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_REGISTER(regionTag)
+Shortcut for likwid_markerRegisterRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_START(regionTag)
+Shortcut for likwid_markerStartRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_STOP(regionTag)
+Shortcut for likwid_markerStopRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_GET(regionTag, nevents, events, time, count)
+Shortcut for likwid_markerGetResults() for \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_SWITCH
+Shortcut for likwid_markerNextGroup() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_RESET(regionTag)
+Shortcut for likwid_markerResetRegion() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_CLOSE
+Shortcut for likwid_markerClose() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/** @}*/
+
+#ifdef LIKWID_PERFMON
+#include <likwid.h>
+#define LIKWID_MARKER_INIT likwid_markerInit()
+#define LIKWID_MARKER_THREADINIT likwid_markerThreadInit()
+#define LIKWID_MARKER_SWITCH likwid_markerNextGroup()
+#define LIKWID_MARKER_REGISTER(regionTag) likwid_markerRegisterRegion(regionTag)
+#define LIKWID_MARKER_START(regionTag) likwid_markerStartRegion(regionTag)
+#define LIKWID_MARKER_STOP(regionTag) likwid_markerStopRegion(regionTag)
+#define LIKWID_MARKER_CLOSE likwid_markerClose()
+#define LIKWID_MARKER_RESET(regionTag) likwid_markerResetRegion(regionTag)
+#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count) likwid_markerGetRegion(regionTag, nevents, events, time, count)
+#else  /* LIKWID_PERFMON */
+#define LIKWID_MARKER_INIT
+#define LIKWID_MARKER_THREADINIT
+#define LIKWID_MARKER_SWITCH
+#define LIKWID_MARKER_REGISTER(regionTag)
+#define LIKWID_MARKER_START(regionTag)
+#define LIKWID_MARKER_STOP(regionTag)
+#define LIKWID_MARKER_CLOSE
+#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count)
+#define LIKWID_MARKER_RESET(regionTag)
+#endif /* LIKWID_PERFMON */
+
+
+/** \addtogroup NvMarkerAPI NvMarker API module (MarkerAPI for Nvidia GPUs)
+*  @{
+*/
+/*!
+\def LIKWID_NVMARKER_INIT
+Shortcut for likwid_gpuMarkerInit() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_NVMARKER_THREADINIT
+Shortcut for likwid_gpuMarkerThreadInit() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_NVMARKER_REGISTER(regionTag)
+Shortcut for likwid_gpuMarkerRegisterRegion() with \a regionTag if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_NVMARKER_START(regionTag)
+Shortcut for likwid_gpuMarkerStartRegion() with \a regionTag if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_NVMARKER_STOP(regionTag)
+Shortcut for likwid_gpuMarkerStopRegion() with \a regionTag if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_NVMARKER_GET(regionTag, ngpus, nevents, events, time, count)
+Shortcut for likwid_gpuMarkerGetRegion() for \a regionTag if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_NVMARKER_SWITCH
+Shortcut for likwid_gpuMarkerNextGroup() if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_NVMARKER_RESET(regionTag)
+Shortcut for likwid_gpuMarkerResetRegion() if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_NVMARKER_CLOSE
+Shortcut for likwid_gpuMarkerClose() if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
+*/
+/** @}*/
+
+#ifdef LIKWID_NVMON
+#ifndef LIKWID_WITH_NVMON
+#define LIKWID_WITH_NVMON
+#endif
+#include <likwid.h>
+#define LIKWID_NVMARKER_INIT likwid_gpuMarkerInit()
+#define LIKWID_NVMARKER_THREADINIT likwid_gpuMarkerThreadInit()
+#define LIKWID_NVMARKER_SWITCH likwid_gpuMarkerNextGroup()
+#define LIKWID_NVMARKER_REGISTER(regionTag) likwid_gpuMarkerRegisterRegion(regionTag)
+#define LIKWID_NVMARKER_START(regionTag) likwid_gpuMarkerStartRegion(regionTag)
+#define LIKWID_NVMARKER_STOP(regionTag) likwid_gpuMarkerStopRegion(regionTag)
+#define LIKWID_NVMARKER_CLOSE likwid_gpuMarkerClose()
+#define LIKWID_NVMARKER_RESET(regionTag) likwid_gpuMarkerResetRegion(regionTag)
+#define LIKWID_NVMARKER_GET(regionTag, ngpus, nevents, events, time, count) \
+    likwid_gpuMarkerGetRegion(regionTag, ngpus, nevents, events, time, count)
+#else /* LIKWID_NVMON */
+#define LIKWID_NVMARKER_INIT
+#define LIKWID_NVMARKER_THREADINIT
+#define LIKWID_NVMARKER_SWITCH
+#define LIKWID_NVMARKER_REGISTER(regionTag)
+#define LIKWID_NVMARKER_START(regionTag)
+#define LIKWID_NVMARKER_STOP(regionTag)
+#define LIKWID_NVMARKER_CLOSE
+#define LIKWID_NVMARKER_GET(regionTag, nevents, events, time, count)
+#define LIKWID_NVMARKER_RESET(regionTag)
+#endif /* LIKWID_NVMON */
+
+
+
+#endif /* LIKWID_MARKER_H */
--- a/lammps/includes/neighbor.h
+++ b/lammps/includes/neighbor.h
@@ -0,0 +1,58 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#include <atom.h>
+#include <parameter.h>
+
+#ifndef __NEIGHBOR_H_
+#define __NEIGHBOR_H_
+typedef struct {
+    int every;
+    int ncalls;
+    int* neighbors;
+    int maxneighs;
+    int* numneigh;
+} Neighbor;
+
+typedef struct {
+    MD_FLOAT xprd; MD_FLOAT yprd; MD_FLOAT zprd;
+    MD_FLOAT bininvx; MD_FLOAT bininvy; MD_FLOAT bininvz;
+    int mbinxlo; int mbinylo; int mbinzlo;
+    int nbinx; int nbiny; int nbinz;
+    int mbinx; int mbiny; int mbinz;
+} Neighbor_params;
+
+typedef struct {
+    int* bincount;
+    int* bins;
+    int mbins;
+    int atoms_per_bin;
+} Binning;
+
+extern void initNeighbor(Neighbor*, Parameter*);
+extern void setupNeighbor();
+extern void binatoms(Atom*);
+extern void buildNeighbor(Atom*, Neighbor*);
+extern void sortAtom(Atom*);
+extern void binatoms_cuda(Atom*, Binning*, int*, Neighbor_params*, const int);
+extern void buildNeighbor_cuda(Atom*, Neighbor*, Atom*, Neighbor*, const int, double*);
+#endif
--- a/lammps/includes/parameter.h
+++ b/lammps/includes/parameter.h
@@ -0,0 +1,57 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#ifndef __PARAMETER_H_
+#define __PARAMETER_H_
+
+#define FF_LJ   0
+#define FF_EAM  1
+
+#if PRECISION == 1
+#define MD_FLOAT float
+#else
+#define MD_FLOAT double
+#endif
+
+typedef struct {
+    int force_field;
+    char* input_file;
+    char* vtk_file;
+    MD_FLOAT epsilon;
+    MD_FLOAT sigma6;
+    MD_FLOAT temp;
+    MD_FLOAT rho;
+    MD_FLOAT mass;
+    int ntypes;
+    int ntimes;
+    int nstat;
+    int every;
+    MD_FLOAT dt;
+    MD_FLOAT dtforce;
+    MD_FLOAT cutforce;
+    MD_FLOAT cutneigh;
+    int nx, ny, nz;
+    MD_FLOAT lattice;
+    MD_FLOAT xprd, yprd, zprd;
+    double proc_freq;
+} Parameter;
+#endif
--- a/lammps/includes/pbc.h
+++ b/lammps/includes/pbc.h
@@ -0,0 +1,34 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#include <atom.h>
+#include <parameter.h>
+
+#ifndef __PBC_H_
+#define __PBC_H_
+extern void initPbc(Atom*);
+extern void updatePbc(Atom*, Parameter*);
+extern void updatePbc_cuda(Atom*, Parameter*, Atom*, bool, const int);
+extern void updateAtomsPbc(Atom*, Parameter*);
+extern void updateAtomsPbc_cuda(Atom*, Parameter*, Atom*, const int);
+extern void setupPbc(Atom*, Parameter*);
+#endif
--- a/lammps/includes/stats.h
+++ b/lammps/includes/stats.h
@@ -0,0 +1,46 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#include <atom.h>
+#include <parameter.h>
+
+#ifndef __STATS_H_
+#define __STATS_H_
+typedef struct {
+    long long int total_force_neighs;
+    long long int total_force_iters;
+} Stats;
+
+void initStats(Stats *s);
+void displayStatistics(Atom *atom, Parameter *param, Stats *stats, double *timer);
+
+#ifdef COMPUTE_STATS
+#   define addStat(stat, value)     stat += value;
+#   define beginStatTimer()         double Si = getTimeStamp();
+#   define endStatTimer(stat)       stat += getTimeStamp() - Si;
+#else
+#   define addStat(stat, value)
+#   define beginStatTimer()
+#   define endStatTimer(stat)
+#endif
+
+#endif
--- a/lammps/includes/thermo.h
+++ b/lammps/includes/thermo.h
@@ -0,0 +1,31 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#include <parameter.h>
+#include <atom.h>
+
+#ifndef __THERMO_H_
+#define __THERMO_H_
+extern void setupThermo(Parameter*, int);
+extern void computeThermo(int, Parameter*, Atom*);
+extern void adjustThermo(Parameter*, Atom*);
+#endif
--- a/lammps/includes/timers.h
+++ b/lammps/includes/timers.h
@@ -0,0 +1,16 @@
+#ifndef __TIMERS_H_
+#define __TIMERS_H_
+
+typedef enum {
+    TOTAL = 0,
+    NEIGH,
+    FORCE,
+    NEIGH_UPDATE_ATOMS_PBC,
+    NEIGH_SETUP_PBC,
+    NEIGH_UPDATE_PBC,
+    NEIGH_BINATOMS,
+    NEIGH_BUILD_LISTS,
+    NUMTIMER
+} timertype;
+
+#endif
--- a/lammps/includes/timing.h
+++ b/lammps/includes/timing.h
@@ -0,0 +1,30 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#ifndef __TIMING_H_
+#define __TIMING_H_
+
+extern double getTimeStamp();
+extern double getTimeResolution();
+extern double getTimeStamp_();
+
+#endif
--- a/lammps/includes/util.h
+++ b/lammps/includes/util.h
@@ -0,0 +1,37 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#ifndef __UTIL_H_
+#define __UTIL_H_
+
+#ifndef MIN
+#define MIN(x,y) ((x)<(y)?(x):(y))
+#endif
+#ifndef MAX
+#define MAX(x,y) ((x)>(y)?(x):(y))
+#endif
+#ifndef ABS
+#define ABS(a) ((a) >= 0 ? (a) : -(a))
+#endif
+
+extern double myrandom(int*);
+#endif
--- a/lammps/includes/vtk.h
+++ b/lammps/includes/vtk.h
@@ -0,0 +1,28 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#include <atom.h>
+
+#ifndef __VTK_H_
+#define __VTK_H_
+extern int write_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
+#endif
--- a/lammps/main-stub.c
+++ b/lammps/main-stub.c
@@ -0,0 +1,253 @@
+#include <stdio.h>
+#include <string.h>
+//---
+#include <likwid-marker.h>
+//---
+#include <timing.h>
+#include <allocate.h>
+#include <neighbor.h>
+#include <parameter.h>
+#include <atom.h>
+#include <stats.h>
+#include <thermo.h>
+#include <pbc.h>
+#include <timers.h>
+
+#define HLINE "----------------------------------------------------------------------------\n"
+
+#define LATTICE_DISTANCE    10.0
+#define NEIGH_DISTANCE      1.0
+
+extern double computeForce(Parameter*, Atom*, Neighbor*, Stats*, int, int);
+
+void init(Parameter *param) {
+    param->epsilon = 1.0;
+    param->sigma6 = 1.0;
+    param->rho = 0.8442;
+    param->ntypes = 4;
+    param->ntimes = 200;
+    param->nx = 4;
+    param->ny = 4;
+    param->nz = 2;
+    param->lattice = LATTICE_DISTANCE;
+    param->cutforce = 5.0;
+    param->cutneigh = param->cutforce;
+    param->mass = 1.0;
+    // Unused
+    param->dt = 0.005;
+    param->dtforce = 0.5 * param->dt;
+    param->nstat = 100;
+    param->temp = 1.44;
+    param->every = 20;
+    param->proc_freq = 0.0;
+}
+
+// Show debug messages
+//#define DEBUG(msg)  printf(msg)
+// Do not show debug messages
+#define DEBUG(msg)
+
+#define ADD_ATOM(x, y, z, vx, vy, vz)   atom_x(atom->Nlocal) = base_x + x * NEIGH_DISTANCE; \
+                                        atom_y(atom->Nlocal) = base_y + y * NEIGH_DISTANCE; \
+                                        atom_z(atom->Nlocal) = base_z + z * NEIGH_DISTANCE; \
+                                        atom->vx[atom->Nlocal] = vy;                        \
+                                        atom->vy[atom->Nlocal] = vy;                        \
+                                        atom->vz[atom->Nlocal] = vz;                        \
+                                        atom->Nlocal++
+
+int main(int argc, const char *argv[]) {
+    Atom atom_data;
+    Atom *atom = (Atom *)(&atom_data);
+    Neighbor neighbor;
+    Stats stats;
+    Parameter param;
+    int atoms_per_unit_cell = 8;
+    int csv = 0;
+
+    LIKWID_MARKER_INIT;
+    LIKWID_MARKER_REGISTER("force");
+    DEBUG("Initializing parameters...\n");
+    init(&param);
+
+    for(int i = 0; i < argc; i++)
+    {
+        if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0))
+        {
+            param.ntimes = atoi(argv[++i]);
+            continue;
+        }
+        if((strcmp(argv[i], "-nx") == 0))
+        {
+            param.nx = atoi(argv[++i]);
+            continue;
+        }
+        if((strcmp(argv[i], "-ny") == 0))
+        {
+            param.ny = atoi(argv[++i]);
+            continue;
+        }
+        if((strcmp(argv[i], "-nz") == 0))
+        {
+            param.nz = atoi(argv[++i]);
+            continue;
+        }
+        if((strcmp(argv[i], "-na") == 0))
+        {
+            atoms_per_unit_cell = atoi(argv[++i]);
+            continue;
+        }
+        if((strcmp(argv[i], "-f") == 0))
+        {
+            param.proc_freq = atof(argv[++i]);
+            continue;
+        }
+        if((strcmp(argv[i], "-csv") == 0))
+        {
+            csv = 1;
+            continue;
+        }
+        if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0))
+        {
+            printf("MD Bench: A minimalistic re-implementation of miniMD\n");
+            printf(HLINE);
+            printf("-n / --nsteps <int>:  set number of timesteps for simulation\n");
+            printf("-nx/-ny/-nz <int>:    set linear dimension of systembox in x/y/z direction\n");
+            printf("-na <int>:            set number of atoms per unit cell\n");
+            printf("-f <real>:            set CPU frequency (GHz) and display average cycles per atom and neighbors\n");
+            printf("-csv:                 set output as CSV style\n");
+            printf(HLINE);
+            exit(EXIT_SUCCESS);
+        }
+    }
+
+    param.xprd = param.nx * LATTICE_DISTANCE;
+    param.yprd = param.ny * LATTICE_DISTANCE;
+    param.zprd = param.nz * LATTICE_DISTANCE;
+
+    DEBUG("Initializing atoms...\n");
+    initAtom(atom);
+    initStats(&stats);
+
+    atom->ntypes = param.ntypes;
+    atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
+        atom->epsilon[i] = param.epsilon;
+        atom->sigma6[i] = param.sigma6;
+        atom->cutneighsq[i] = param.cutneigh * param.cutneigh;
+        atom->cutforcesq[i] = param.cutforce * param.cutforce;
+    }
+
+    DEBUG("Creating atoms...\n");
+    for(int i = 0; i < param.nx; ++i) {
+        for(int j = 0; j < param.ny; ++j) {
+            for(int k = 0; k < param.nz; ++k) {
+                int added_atoms = 0;
+                int fac_x = 1;
+                int fac_y = 1;
+                int fac_z = 1;
+                int fmod = 0;
+                MD_FLOAT base_x = i * LATTICE_DISTANCE;
+                MD_FLOAT base_y = j * LATTICE_DISTANCE;
+                MD_FLOAT base_z = k * LATTICE_DISTANCE;
+                MD_FLOAT vx = 0.0;
+                MD_FLOAT vy = 0.0;
+                MD_FLOAT vz = 0.0;
+
+                while(atom->Nlocal > atom->Nmax - atoms_per_unit_cell) {
+                    growAtom(atom);
+                }
+
+                while(fac_x * fac_y * fac_z < atoms_per_unit_cell) {
+                    if(fmod == 0) { fac_x *= 2; }
+                    if(fmod == 1) { fac_y *= 2; }
+                    if(fmod == 2) { fac_z *= 2; }
+                    fmod = (fmod + 1) % 3;
+                }
+
+                MD_FLOAT offset_x = (fac_x > 1) ? 1.0 / (fac_x - 1) : (int)fac_x;
+                MD_FLOAT offset_y = (fac_y > 1) ? 1.0 / (fac_y - 1) : (int)fac_y;
+                MD_FLOAT offset_z = (fac_z > 1) ? 1.0 / (fac_z - 1) : (int)fac_z;
+                for(int ii = 0; ii < fac_x; ++ii) {
+                    for(int jj = 0; jj < fac_y; ++jj) {
+                        for(int kk = 0; kk < fac_z; ++kk) {
+                            if(added_atoms < atoms_per_unit_cell) {
+                                atom->type[atom->Nlocal] = rand() % atom->ntypes;
+                                ADD_ATOM(ii * offset_x, jj * offset_y, kk * offset_z, vx, vy, vz);
+                                added_atoms++;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    const double estim_atom_volume = (double)(atom->Nlocal * 3 * sizeof(MD_FLOAT));
+    const double estim_neighbors_volume = (double)(atom->Nlocal * (atoms_per_unit_cell - 1 + 2) * sizeof(int));
+    const double estim_volume = (double)(atom->Nlocal * 6 * sizeof(MD_FLOAT) + estim_neighbors_volume);
+
+    if(!csv) {
+        printf("Number of timesteps: %d\n", param.ntimes);
+        printf("Number of times to compute the atoms loop: %d\n", ATOMS_LOOP_RUNS);
+        printf("Number of times to compute the neighbors loop: %d\n", NEIGHBORS_LOOP_RUNS);
+        printf("System size (unit cells): %dx%dx%d\n", param.nx, param.ny, param.nz);
+        printf("Atoms per unit cell: %d\n", atoms_per_unit_cell);
+        printf("Total number of atoms: %d\n", atom->Nlocal);
+        printf("Estimated total data volume (kB): %.4f\n", estim_volume / 1000.0);
+        printf("Estimated atom data volume (kB): %.4f\n", estim_atom_volume / 1000.0);
+        printf("Estimated neighborlist data volume (kB): %.4f\n", estim_neighbors_volume / 1000.0);
+    }
+
+    DEBUG("Initializing neighbor lists...\n");
+    initNeighbor(&neighbor, &param);
+    DEBUG("Setting up neighbor lists...\n");
+    setupNeighbor();
+    DEBUG("Building neighbor lists...\n");
+    buildNeighbor(atom, &neighbor);
+    DEBUG("Computing forces...\n");
+    computeForce(&param, atom, &neighbor, &stats, 1, 1);
+
+    double S, E;
+    S = getTimeStamp();
+    for(int i = 0; i < param.ntimes; i++) {
+        computeForce(&param, atom, &neighbor, &stats, 0, i + 1);
+    }
+    E = getTimeStamp();
+    double T_accum = E-S;
+    double freq_hz = param.proc_freq * 1.e9;
+    const double repeats = ATOMS_LOOP_RUNS * NEIGHBORS_LOOP_RUNS;
+    const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes * repeats);
+    const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes * repeats) * freq_hz;
+    const double cycles_per_neigh = cycles_per_atom / (double)(atoms_per_unit_cell - 1);
+
+    if(!csv) {
+        printf("Total time: %.4f, Mega atom updates/s: %.4f\n", T_accum, atoms_updates_per_sec / 1.e6);
+        if(param.proc_freq > 0.0) {
+            printf("Cycles per atom: %.4f, Cycles per neighbor: %.4f\n", cycles_per_atom, cycles_per_neigh);
+        }
+    } else {
+        printf("steps,unit cells,atoms/unit cell,total atoms,total vol.(kB),atoms vol.(kB),neigh vol.(kB),time(s),atom upds/s(M)");
+        if(param.proc_freq > 0.0) {
+            printf(",cy/atom,cy/neigh");
+        }
+        printf("\n");
+
+        printf("%d,%dx%dx%d,%d,%d,%.4f,%.4f,%.4f,%.4f,%.4f",
+            param.ntimes, param.nx, param.ny, param.nz, atoms_per_unit_cell, atom->Nlocal,
+            estim_volume / 1.e3, estim_atom_volume / 1.e3, estim_neighbors_volume / 1.e3, T_accum, atoms_updates_per_sec / 1.e6);
+
+        if(param.proc_freq > 0.0) {
+            printf(",%.4f,%.4f", cycles_per_atom, cycles_per_neigh);
+        }
+        printf("\n");
+    }
+
+    double timer[NUMTIMER];
+    timer[FORCE] = T_accum;
+    displayStatistics(atom, &param, &stats, timer);
+    LIKWID_MARKER_CLOSE;
+    return EXIT_SUCCESS;
+}
--- a/lammps/main.c
+++ b/lammps/main.c
@@ -0,0 +1,425 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <limits.h>
+#include <math.h>
+#include <float.h>
+
+#include <likwid-marker.h>
+
+#include <timing.h>
+#include <allocate.h>
+#include <neighbor.h>
+#include <parameter.h>
+#include <atom.h>
+#include <stats.h>
+#include <thermo.h>
+#include <pbc.h>
+#include <timers.h>
+#include <eam.h>
+#include <vtk.h>
+
+#define HLINE "----------------------------------------------------------------------------\n"
+
+extern void cuda_final_integrate(bool doReneighbour, Parameter *param,
+                                 Atom *atom, Atom *c_atom,
+                                 const int num_threads_per_block);
+extern void cuda_initial_integrate(bool doReneighbour, Parameter *param,
+                                   Atom *atom, Atom *c_atom,
+                                   const int num_threads_per_block);
+
+extern double computeForce(bool, Parameter*, Atom*, Neighbor*, Atom*, Neighbor*, const int);
+extern double computeForceTracing(Parameter*, Atom*, Neighbor*, Stats*, int, int);
+extern double computeForceEam(Eam* eam, Parameter*, Atom *atom, Neighbor *neighbor, Stats *stats, int first_exec, int timestep);
+
+void init(Parameter *param)
+{
+    param->input_file = NULL;
+    param->vtk_file = NULL;
+    param->force_field = FF_LJ;
+    param->epsilon = 1.0;
+    param->sigma6 = 1.0;
+    param->rho = 0.8442;
+    param->ntypes = 4;
+    param->ntimes = 200;
+    param->dt = 0.005;
+    param->nx = 32;
+    param->ny = 32;
+    param->nz = 32;
+    param->cutforce = 2.5;
+    param->cutneigh = param->cutforce + 0.30;
+    param->temp = 1.44;
+    param->nstat = 100;
+    param->mass = 1.0;
+    param->dtforce = 0.5 * param->dt;
+    param->every = 20;
+    param->proc_freq = 2.4;
+}
+
+void initCudaAtom(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
+
+    c_atom->Natoms = atom->Natoms;
+    c_atom->Nlocal = atom->Nlocal;
+    c_atom->Nghost = atom->Nghost;
+    c_atom->Nmax = atom->Nmax;
+    c_atom->ntypes = atom->ntypes;
+
+    c_atom->border_map = NULL;
+
+    const int Nlocal = atom->Nlocal;
+
+    checkCUDAError( "c_atom->x malloc", cudaMalloc((void**)&(c_atom->x), sizeof(MD_FLOAT) * atom->Nmax * 3) );
+    checkCUDAError( "c_atom->x memcpy", cudaMemcpy(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
+
+    checkCUDAError( "c_atom->fx malloc", cudaMalloc((void**)&(c_atom->fx), sizeof(MD_FLOAT) * Nlocal * 3) );
+
+    checkCUDAError( "c_atom->vx malloc", cudaMalloc((void**)&(c_atom->vx), sizeof(MD_FLOAT) * Nlocal * 3) );
+    checkCUDAError( "c_atom->vx memcpy", cudaMemcpy(c_atom->vx, atom->vx, sizeof(MD_FLOAT) * Nlocal * 3, cudaMemcpyHostToDevice) );
+
+    checkCUDAError( "c_atom->type malloc", cudaMalloc((void**)&(c_atom->type), sizeof(int) * atom->Nmax) );
+    checkCUDAError( "c_atom->epsilon malloc", cudaMalloc((void**)&(c_atom->epsilon), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
+    checkCUDAError( "c_atom->sigma6 malloc", cudaMalloc((void**)&(c_atom->sigma6), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
+    checkCUDAError( "c_atom->cutforcesq malloc", cudaMalloc((void**)&(c_atom->cutforcesq), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
+
+    checkCUDAError( "c_neighbor->neighbors malloc", cudaMalloc((void**)&c_neighbor->neighbors, sizeof(int) * Nlocal * neighbor->maxneighs) );
+    checkCUDAError( "c_neighbor->numneigh malloc", cudaMalloc((void**)&c_neighbor->numneigh, sizeof(int) * Nlocal) );
+
+    checkCUDAError( "c_atom->type memcpy", cudaMemcpy(c_atom->type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) );
+    checkCUDAError( "c_atom->sigma6 memcpy", cudaMemcpy(c_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
+    checkCUDAError( "c_atom->epsilon memcpy", cudaMemcpy(c_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
+
+    checkCUDAError( "c_atom->cutforcesq memcpy", cudaMemcpy(c_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
+}
+
+
+double setup(
+        Parameter *param,
+        Eam *eam,
+        Atom *atom,
+        Neighbor *neighbor,
+        Atom *c_atom,
+        Neighbor *c_neighbor,
+        Stats *stats,
+        const int num_threads_per_block,
+	double* timers)
+{
+    if(param->force_field == FF_EAM) { initEam(eam, param); }
+    double S, E;
+    param->lattice = pow((4.0 / param->rho), (1.0 / 3.0));
+    param->xprd = param->nx * param->lattice;
+    param->yprd = param->ny * param->lattice;
+    param->zprd = param->nz * param->lattice;
+
+    S = getTimeStamp();
+    initAtom(atom);
+    initNeighbor(neighbor, param);
+    initPbc(atom);
+    initStats(stats);
+    setupNeighbor();
+    createAtom(atom, param);
+    setupThermo(param, atom->Natoms);
+    adjustThermo(param, atom);
+    setupPbc(atom, param);
+    initCudaAtom(atom, neighbor, c_atom, c_neighbor);
+    updatePbc_cuda(atom, param, c_atom, true, num_threads_per_block);
+    buildNeighbor_cuda(atom, neighbor, c_atom, c_neighbor, num_threads_per_block, timers);
+    E = getTimeStamp();
+
+
+    return E-S;
+}
+
+double reneighbour(
+        Parameter *param,
+        Atom *atom,
+        Neighbor *neighbor,
+        Atom *c_atom,
+        Neighbor *c_neighbor,
+        const int num_threads_per_block,
+	double* timers)
+{
+    double S, E, beforeEvent, afterEvent;
+
+    S = getTimeStamp();
+    beforeEvent = S;
+    LIKWID_MARKER_START("reneighbour");
+    updateAtomsPbc_cuda(atom, param, c_atom, num_threads_per_block);
+    afterEvent = getTimeStamp();
+    timers[NEIGH_UPDATE_ATOMS_PBC] += afterEvent - beforeEvent;
+    beforeEvent = afterEvent;
+    setupPbc(atom, param);
+    afterEvent = getTimeStamp();
+    timers[NEIGH_SETUP_PBC] += afterEvent - beforeEvent;
+    beforeEvent = afterEvent;
+    updatePbc_cuda(atom, param, c_atom, true, num_threads_per_block);
+    afterEvent = getTimeStamp();
+    timers[NEIGH_UPDATE_PBC] += afterEvent - beforeEvent;
+    beforeEvent = afterEvent;
+    //sortAtom(atom);
+    buildNeighbor_cuda(atom, neighbor, c_atom, c_neighbor, num_threads_per_block, timers);
+    LIKWID_MARKER_STOP("reneighbour");
+    E = getTimeStamp();
+    afterEvent = E;
+    timers[NEIGH_BUILD_LISTS] += afterEvent - beforeEvent;
+
+    return E-S;
+}
+
+void initialIntegrate(Parameter *param, Atom *atom)
+{
+    for(int i = 0; i < atom->Nlocal; i++) {
+        atom_vx(i) += param->dtforce * atom_fx(i);
+        atom_vy(i) += param->dtforce * atom_fy(i);
+        atom_vz(i) += param->dtforce * atom_fz(i);
+        atom_x(i) = atom_x(i) + param->dt * atom_vx(i);
+        atom_y(i) = atom_y(i) + param->dt * atom_vy(i);
+        atom_z(i) = atom_z(i) + param->dt * atom_vz(i);
+    }
+}
+
+void finalIntegrate(Parameter *param, Atom *atom)
+{
+    for(int i = 0; i < atom->Nlocal; i++) {
+        atom_vx(i) += param->dtforce * atom_fx(i);
+        atom_vy(i) += param->dtforce * atom_fy(i);
+        atom_vz(i) += param->dtforce * atom_fz(i);
+    }
+}
+
+void printAtomState(Atom *atom)
+{
+    printf("Atom counts: Natoms=%d Nlocal=%d Nghost=%d Nmax=%d\n",
+            atom->Natoms, atom->Nlocal, atom->Nghost, atom->Nmax);
+
+    /*     int nall = atom->Nlocal + atom->Nghost; */
+
+    /*     for (int i=0; i<nall; i++) { */
+    /*         printf("%d  %f %f %f\n", i, atom->x[i], atom->y[i], atom->z[i]); */
+    /*     } */
+}
+
+int str2ff(const char *string)
+{
+    if(strncmp(string, "lj", 2) == 0) return FF_LJ;
+    if(strncmp(string, "eam", 3) == 0) return FF_EAM;
+    return -1;
+}
+
+const char* ff2str(int ff)
+{
+    if(ff == FF_LJ) { return "lj"; }
+    if(ff == FF_EAM) { return "eam"; }
+    return "invalid";
+}
+
+int get_num_threads() {
+
+    const char *num_threads_env = getenv("NUM_THREADS");
+    int num_threads = 0;
+    if(num_threads_env == 0)
+        num_threads = 32;
+    else {
+        num_threads = atoi(num_threads_env);
+    }
+
+    return num_threads;
+}
+
+int main(int argc, char** argv)
+{
+    double timer[NUMTIMER];
+    Eam eam;
+    Atom atom;
+    Neighbor neighbor;
+    Stats stats;
+    Parameter param;
+    Atom c_atom;
+    Neighbor c_neighbor;
+
+    LIKWID_MARKER_INIT;
+#pragma omp parallel
+    {
+        LIKWID_MARKER_REGISTER("force");
+        //LIKWID_MARKER_REGISTER("reneighbour");
+        //LIKWID_MARKER_REGISTER("pbc");
+    }
+    init(&param);
+
+    for(int i = 0; i < argc; i++)
+    {
+        if((strcmp(argv[i], "-f") == 0))
+        {
+            if((param.force_field = str2ff(argv[++i])) < 0) {
+                fprintf(stderr, "Invalid force field!\n");
+                exit(-1);
+            }
+            continue;
+        }
+        if((strcmp(argv[i], "-i") == 0))
+        {
+            param.input_file = strdup(argv[++i]);
+            continue;
+        }
+        if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0))
+        {
+            param.ntimes = atoi(argv[++i]);
+            continue;
+        }
+        if((strcmp(argv[i], "-nx") == 0))
+        {
+            param.nx = atoi(argv[++i]);
+            continue;
+        }
+        if((strcmp(argv[i], "-ny") == 0))
+        {
+            param.ny = atoi(argv[++i]);
+            continue;
+        }
+        if((strcmp(argv[i], "-nz") == 0))
+        {
+            param.nz = atoi(argv[++i]);
+            continue;
+        }
+        if((strcmp(argv[i], "--freq") == 0))
+        {
+            param.proc_freq = atof(argv[++i]);
+            continue;
+        }
+        if((strcmp(argv[i], "--vtk") == 0))
+        {
+            param.vtk_file = strdup(argv[++i]);
+            continue;
+        }
+        if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0))
+        {
+            printf("MD Bench: A minimalistic re-implementation of miniMD\n");
+            printf(HLINE);
+            printf("-f <string>:          force field (lj or eam), default lj\n");
+            printf("-i <string>:          input file for EAM\n");
+            printf("-n / --nsteps <int>:  set number of timesteps for simulation\n");
+            printf("-nx/-ny/-nz <int>:    set linear dimension of systembox in x/y/z direction\n");
+            printf("--freq <real>:        processor frequency (GHz)\n");
+            printf("--vtk <string>:       VTK file for visualization\n");
+            printf(HLINE);
+            exit(EXIT_SUCCESS);
+        }
+    }
+
+    // this should be multiple of 32 as operations are performed at the level of warps
+    const int num_threads_per_block = get_num_threads();
+
+    setup(&param, &eam, &atom, &neighbor, &c_atom, &c_neighbor, &stats, num_threads_per_block, (double*) &timer);
+    computeThermo(0, &param, &atom);
+    if(param.force_field == FF_EAM) {
+        computeForceEam(&eam, &param, &atom, &neighbor, &stats, 1, 0);
+    } else {
+#if defined(MEM_TRACER) || defined(INDEX_TRACER) || defined(COMPUTE_STATS)
+        computeForceTracing(&param, &atom, &neighbor, &stats, 1, 0);
+#else
+        computeForce(true, &param, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block);
+#endif
+    }
+
+    timer[FORCE] = 0.0;
+    timer[NEIGH] = 0.0;
+    timer[TOTAL] = getTimeStamp();
+    timer[NEIGH_UPDATE_ATOMS_PBC] = 0.0;
+    timer[NEIGH_SETUP_PBC] = 0.0;
+    timer[NEIGH_UPDATE_PBC] = 0.0;
+    timer[NEIGH_BINATOMS] = 0.0;
+    timer[NEIGH_BUILD_LISTS] = 0.0;
+
+    if(param.vtk_file != NULL) {
+        write_atoms_to_vtk_file(param.vtk_file, &atom, 0);
+    }
+
+    for(int n = 0; n < param.ntimes; n++) {
+
+        const bool doReneighbour = (n + 1) % param.every == 0;
+
+        cuda_initial_integrate(doReneighbour, &param, &atom, &c_atom, num_threads_per_block);
+
+        if(doReneighbour) {
+            timer[NEIGH] += reneighbour(&param, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block, (double*) &timer);
+        } else {
+	    double before = getTimeStamp();
+            updatePbc_cuda(&atom, &param, &c_atom, false, num_threads_per_block);
+	    double after = getTimeStamp();
+	    timer[NEIGH_UPDATE_PBC] += after - before;
+        }
+
+        if(param.force_field == FF_EAM) {
+            timer[FORCE] += computeForceEam(&eam, &param, &atom, &neighbor, &stats, 0, n + 1);
+        } else {
+#if defined(MEM_TRACER) || defined(INDEX_TRACER) || defined(COMPUTE_STATS)
+            timer[FORCE] += computeForceTracing(&param, &atom, &neighbor, &stats, 0, n + 1);
+#else
+            timer[FORCE] += computeForce(doReneighbour, &param, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block);
+#endif
+        }
+
+        cuda_final_integrate(doReneighbour, &param, &atom, &c_atom, num_threads_per_block);
+
+        if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
+	    checkCUDAError("computeThermo atom->x memcpy back", cudaMemcpy(atom.x, c_atom.x, atom.Nmax * sizeof(MD_FLOAT) * 3, cudaMemcpyDeviceToHost) );
+            computeThermo(n + 1, &param, &atom);
+        }
+
+        if(param.vtk_file != NULL) {
+            write_atoms_to_vtk_file(param.vtk_file, &atom, n + 1);
+        }
+    }
+
+    timer[NEIGH_BUILD_LISTS] -= timer[NEIGH_BINATOMS];
+    timer[TOTAL] = getTimeStamp() - timer[TOTAL];
+    computeThermo(-1, &param, &atom);
+
+    printf(HLINE);
+    printf("Force field: %s\n", ff2str(param.force_field));
+    printf("Data layout for positions: %s\n", POS_DATA_LAYOUT);
+#if PRECISION == 1
+    printf("Using single precision floating point.\n");
+#else
+    printf("Using double precision floating point.\n");
+#endif
+    printf(HLINE);
+    printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
+    printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs   NEIGH_TIMERS: UPD_AT: %.2fs SETUP_PBC %.2fs UPDATE_PBC %.2fs BINATOMS %.2fs BUILD_NEIGHBOR %.2fs\n",
+            timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH], timer[NEIGH_UPDATE_ATOMS_PBC], timer[NEIGH_SETUP_PBC], timer[NEIGH_UPDATE_PBC], timer[NEIGH_BINATOMS], timer[NEIGH_BUILD_LISTS]);
+    printf(HLINE);
+    printf("Performance: %.2f million atom updates per second\n",
+            1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
+    double atomUpdatesTotal = (double) atom.Natoms * param.ntimes;
+    printf("Force_perf in millions per sec: %.2f\n", 1e-6 * atomUpdatesTotal / timer[FORCE]);
+    double atomNeighUpdatesTotal = (double) atom.Natoms * param.ntimes / param.every;
+    printf("Neighbor_perf in millions per sec: updateAtomsPbc: %.2f setupPbc: %.2f updatePbc: %.2f binAtoms: %.2f buildNeighbor_wo_binning: %.2f\n", 1e-6 * atomNeighUpdatesTotal / timer[NEIGH_UPDATE_ATOMS_PBC], 1e-6 * atomNeighUpdatesTotal / timer[NEIGH_SETUP_PBC], 1e-6 * atomUpdatesTotal / timer[NEIGH_UPDATE_PBC], 1e-6 * atomNeighUpdatesTotal / timer[NEIGH_BINATOMS], 1e-6 * atomNeighUpdatesTotal / timer[NEIGH_BUILD_LISTS]);
+#ifdef COMPUTE_STATS
+    displayStatistics(&atom, &param, &stats, timer);
+#endif
+    LIKWID_MARKER_CLOSE;
+    return EXIT_SUCCESS;
+}
--- a/lammps/neighbor.cu
+++ b/lammps/neighbor.cu
@@ -0,0 +1,719 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <cuda_profiler_api.h>
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+
+extern "C" {
+
+#include <neighbor.h>
+#include <parameter.h>
+#include <allocate.h>
+#include <atom.h>
+#include <timing.h>
+#include <timers.h>
+
+#define SMALL 1.0e-6
+#define FACTOR 0.999
+}
+
+__device__ int coord2bin_device(MD_FLOAT xin, MD_FLOAT yin, MD_FLOAT zin, 
+                                Neighbor_params np)
+{
+    int ix, iy, iz;
+
+    if(xin >= np.xprd) {
+        ix = (int)((xin - np.xprd) * np.bininvx) + np.nbinx - np.mbinxlo;
+    } else if(xin >= 0.0) {
+        ix = (int)(xin * np.bininvx) - np.mbinxlo;
+    } else {
+        ix = (int)(xin * np.bininvx) - np.mbinxlo - 1;
+    }
+
+    if(yin >= np.yprd) {
+        iy = (int)((yin - np.yprd) * np.bininvy) + np.nbiny - np.mbinylo;
+    } else if(yin >= 0.0) {
+        iy = (int)(yin * np.bininvy) - np.mbinylo;
+    } else {
+        iy = (int)(yin * np.bininvy) - np.mbinylo - 1;
+    }
+
+    if(zin >= np.zprd) {
+        iz = (int)((zin - np.zprd) * np.bininvz) + np.nbinz - np.mbinzlo;
+    } else if(zin >= 0.0) {
+        iz = (int)(zin * np.bininvz) - np.mbinzlo;
+    } else {
+        iz = (int)(zin * np.bininvz) - np.mbinzlo - 1;
+    }
+
+    return (iz * np.mbiny * np.mbinx + iy * np.mbinx + ix + 1);
+}
+
+/* sorts the contents of a bin to make it comparable to the CPU version */
+/* uses bubble sort since atoms per bin should be relatively small and can be done in situ */
+__global__ void sort_bin_contents_kernel(int* bincount, int* bins, int mbins, int atoms_per_bin){
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= mbins){
+        return;
+    }
+
+    int atoms_in_bin = bincount[i];
+    int* bin_ptr = &bins[i * atoms_per_bin];
+    int sorted;
+    do {
+        sorted = 1;
+        int tmp;
+        for(int index = 0; index < atoms_in_bin - 1; index++){
+            if (bin_ptr[index] > bin_ptr[index + 1]){
+                tmp = bin_ptr[index];
+                bin_ptr[index] = bin_ptr[index + 1];
+                bin_ptr[index + 1] = tmp;
+                sorted = 0;
+            }
+        }
+    } while (!sorted);
+}
+
+__global__ void binatoms_kernel(Atom a, int* bincount, int* bins, int atoms_per_bin, Neighbor_params np, int *resize_needed){
+    Atom* atom = &a;
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int nall = atom->Nlocal + atom->Nghost;
+    if(i >= nall){
+        return;
+    }
+    
+    MD_FLOAT x = atom_x(i);
+    MD_FLOAT y = atom_y(i);
+    MD_FLOAT z = atom_z(i);
+    int ibin = coord2bin_device(x, y, z, np);
+
+    int ac = atomicAdd(&bincount[ibin], 1);
+            
+    if(ac < atoms_per_bin){
+        bins[ibin * atoms_per_bin + ac] = i;
+    } else {
+        atomicMax(resize_needed, ac);
+    }
+}
+
+__global__ void compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np, int nstencil, int* stencil,
+                                     int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq){
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int Nlocal = a.Nlocal;
+    if( i >= Nlocal ) {
+        return;
+    }
+    
+    Atom *atom = &a;
+    Neighbor *neighbor = &neigh;
+    
+    int* neighptr = &(neighbor->neighbors[i]);
+    int n = 0;
+    MD_FLOAT xtmp = atom_x(i);
+    MD_FLOAT ytmp = atom_y(i);
+    MD_FLOAT ztmp = atom_z(i);
+    int ibin = coord2bin_device(xtmp, ytmp, ztmp, np);
+#ifdef EXPLICIT_TYPES
+    int type_i = atom->type[i];
+#endif
+    for(int k = 0; k < nstencil; k++) {
+        int jbin = ibin + stencil[k];
+        int* loc_bin = &bins[jbin * atoms_per_bin];
+
+        for(int m = 0; m < bincount[jbin]; m++) {
+            int j = loc_bin[m];
+
+            if ( j == i ){
+                continue;
+            }
+
+            MD_FLOAT delx = xtmp - atom_x(j);
+            MD_FLOAT dely = ytmp - atom_y(j);
+            MD_FLOAT delz = ztmp - atom_z(j);
+            MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+
+#ifdef EXPLICIT_TYPES
+            int type_j = atom->type[j];
+                    const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j];
+#else
+            const MD_FLOAT cutoff = cutneighsq;
+#endif
+
+            if( rsq <= cutoff ) {
+                int idx = atom->Nlocal * n;
+                neighptr[idx] = j;
+                n += 1;
+            }
+        }
+    }
+
+    neighbor->numneigh[i] = n;
+
+    if(n > neighbor->maxneighs) {
+        atomicMax(new_maxneighs, n);
+    }
+}
+
+extern "C" {
+    
+    
+static MD_FLOAT xprd, yprd, zprd;
+static MD_FLOAT bininvx, bininvy, bininvz;
+static int mbinxlo, mbinylo, mbinzlo;
+static int nbinx, nbiny, nbinz;
+static int mbinx, mbiny, mbinz; // n bins in x, y, z
+static int *bincount;
+static int *bins;
+static int mbins; //total number of bins
+static int atoms_per_bin;  // max atoms per bin
+static MD_FLOAT cutneigh;
+static MD_FLOAT cutneighsq;  // neighbor cutoff squared
+static int nmax;
+static int nstencil;      // # of bins in stencil
+static int* stencil;      // stencil list of bin offsets
+static MD_FLOAT binsizex, binsizey, binsizez;
+
+static int* c_stencil = NULL;
+static int* c_resize_needed = NULL;
+static int* c_new_maxneighs = NULL;
+static Binning c_binning{
+        .bincount = NULL,
+        .bins = NULL,
+        .mbins = 0,
+        .atoms_per_bin = 0
+};
+
+
+static int coord2bin(MD_FLOAT, MD_FLOAT , MD_FLOAT);
+static MD_FLOAT bindist(int, int, int);
+
+/* exported subroutines */
+void initNeighbor(Neighbor *neighbor, Parameter *param)
+{
+    MD_FLOAT neighscale = 5.0 / 6.0;
+    xprd = param->nx * param->lattice;
+    yprd = param->ny * param->lattice;
+    zprd = param->nz * param->lattice;
+    cutneigh = param->cutneigh;
+    nbinx = neighscale * param->nx;
+    nbiny = neighscale * param->ny;
+    nbinz = neighscale * param->nz;
+    nmax = 0;
+    atoms_per_bin = 8;
+    stencil = NULL;
+    bins = NULL;
+    bincount = NULL;
+    neighbor->maxneighs = 100;
+    neighbor->numneigh = NULL;
+    neighbor->neighbors = NULL;
+}
+
+void setupNeighbor()
+{
+    MD_FLOAT coord;
+    int mbinxhi, mbinyhi, mbinzhi;
+    int nextx, nexty, nextz;
+    MD_FLOAT xlo = 0.0; MD_FLOAT xhi = xprd;
+    MD_FLOAT ylo = 0.0; MD_FLOAT yhi = yprd;
+    MD_FLOAT zlo = 0.0; MD_FLOAT zhi = zprd;
+
+    cutneighsq = cutneigh * cutneigh;
+    binsizex = xprd / nbinx;
+    binsizey = yprd / nbiny;
+    binsizez = zprd / nbinz;
+    bininvx = 1.0 / binsizex;
+    bininvy = 1.0 / binsizey;
+    bininvz = 1.0 / binsizez;
+
+    coord = xlo - cutneigh - SMALL * xprd;
+    mbinxlo = (int) (coord * bininvx);
+    if (coord < 0.0) {
+        mbinxlo = mbinxlo - 1;
+    }
+    coord = xhi + cutneigh + SMALL * xprd;
+    mbinxhi = (int) (coord * bininvx);
+
+    coord = ylo - cutneigh - SMALL * yprd;
+    mbinylo = (int) (coord * bininvy);
+    if (coord < 0.0) {
+        mbinylo = mbinylo - 1;
+    }
+    coord = yhi + cutneigh + SMALL * yprd;
+    mbinyhi = (int) (coord * bininvy);
+
+    coord = zlo - cutneigh - SMALL * zprd;
+    mbinzlo = (int) (coord * bininvz);
+    if (coord < 0.0) {
+        mbinzlo = mbinzlo - 1;
+    }
+    coord = zhi + cutneigh + SMALL * zprd;
+    mbinzhi = (int) (coord * bininvz);
+
+    mbinxlo = mbinxlo - 1;
+    mbinxhi = mbinxhi + 1;
+    mbinx = mbinxhi - mbinxlo + 1;
+
+    mbinylo = mbinylo - 1;
+    mbinyhi = mbinyhi + 1;
+    mbiny = mbinyhi - mbinylo + 1;
+
+    mbinzlo = mbinzlo - 1;
+    mbinzhi = mbinzhi + 1;
+    mbinz = mbinzhi - mbinzlo + 1;
+
+    nextx = (int) (cutneigh * bininvx);
+    if(nextx * binsizex < FACTOR * cutneigh) nextx++;
+
+    nexty = (int) (cutneigh * bininvy);
+    if(nexty * binsizey < FACTOR * cutneigh) nexty++;
+
+    nextz = (int) (cutneigh * bininvz);
+    if(nextz * binsizez < FACTOR * cutneigh) nextz++;
+
+    if (stencil) {
+        free(stencil);
+    }
+
+    stencil = (int*) malloc(
+            (2 * nextz + 1) * (2 * nexty + 1) * (2 * nextx + 1) * sizeof(int));
+
+    nstencil = 0;
+    int kstart = -nextz;
+
+    for(int k = kstart; k <= nextz; k++) {
+        for(int j = -nexty; j <= nexty; j++) {
+            for(int i = -nextx; i <= nextx; i++) {
+                if(bindist(i, j, k) < cutneighsq) {
+                    stencil[nstencil++] =
+                        k * mbiny * mbinx + j * mbinx + i;
+                }
+            }
+        }
+    }
+
+    mbins = mbinx * mbiny * mbinz;
+
+    if (bincount) {
+        free(bincount);
+    }
+    bincount = (int*) malloc(mbins * sizeof(int));
+
+    if (bins) {
+        free(bins);
+    }
+    bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));
+}
+
+void buildNeighbor(Atom *atom, Neighbor *neighbor)
+{
+    int nall = atom->Nlocal + atom->Nghost;
+
+    /* extend atom arrays if necessary */
+    if(nall > nmax) {
+        nmax = nall;
+        if(neighbor->numneigh) cudaFreeHost(neighbor->numneigh);
+        if(neighbor->neighbors) cudaFreeHost(neighbor->neighbors);
+        checkCUDAError( "buildNeighbor numneigh", cudaMallocHost((void**)&(neighbor->numneigh), nmax * sizeof(int)) );
+        checkCUDAError( "buildNeighbor neighbors", cudaMallocHost((void**)&(neighbor->neighbors), nmax * neighbor->maxneighs * sizeof(int)) );
+        // neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
+        // neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*));
+    }
+
+    /* bin local & ghost atoms */
+    binatoms(atom);
+    int resize = 1;
+
+    /* loop over each atom, storing neighbors */
+    while(resize) {
+        int new_maxneighs = neighbor->maxneighs;
+        resize = 0;
+
+        for(int i = 0; i < atom->Nlocal; i++) {
+            int* neighptr = &(neighbor->neighbors[i]);
+            int n = 0;
+            MD_FLOAT xtmp = atom_x(i);
+            MD_FLOAT ytmp = atom_y(i);
+            MD_FLOAT ztmp = atom_z(i);
+            int ibin = coord2bin(xtmp, ytmp, ztmp);
+            #ifdef EXPLICIT_TYPES
+            int type_i = atom->type[i];
+            #endif
+            for(int k = 0; k < nstencil; k++) {
+                int jbin = ibin + stencil[k];
+                int* loc_bin = &bins[jbin * atoms_per_bin];
+
+                for(int m = 0; m < bincount[jbin]; m++) {
+                    int j = loc_bin[m];
+
+                    if ( j == i ){
+                        continue;
+                    }
+
+                    MD_FLOAT delx = xtmp - atom_x(j);
+                    MD_FLOAT dely = ytmp - atom_y(j);
+                    MD_FLOAT delz = ztmp - atom_z(j);
+                    MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+
+                    #ifdef EXPLICIT_TYPES
+                    int type_j = atom->type[j];
+                    const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j];
+                    #else
+                    const MD_FLOAT cutoff = cutneighsq;
+                    #endif
+
+                    if( rsq <= cutoff ) {
+                        int idx = atom->Nlocal * n;
+                        neighptr[idx] = j;
+                        n += 1;
+                    }
+                }
+            }
+
+            neighbor->numneigh[i] = n;
+
+            if(n >= neighbor->maxneighs) {
+                resize = 1;
+
+                if(n >= new_maxneighs) {
+                    new_maxneighs = n;
+                }
+            }
+        }
+
+        if(resize) {
+            printf("RESIZE %d\n", neighbor->maxneighs);
+            neighbor->maxneighs = new_maxneighs * 1.2;
+            free(neighbor->neighbors);
+            neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
+        }
+    }
+}
+
+/* internal subroutines */
+MD_FLOAT bindist(int i, int j, int k)
+{
+    MD_FLOAT delx, dely, delz;
+
+    if(i > 0) {
+        delx = (i - 1) * binsizex;
+    } else if(i == 0) {
+        delx = 0.0;
+    } else {
+        delx = (i + 1) * binsizex;
+    }
+
+    if(j > 0) {
+        dely = (j - 1) * binsizey;
+    } else if(j == 0) {
+        dely = 0.0;
+    } else {
+        dely = (j + 1) * binsizey;
+    }
+
+    if(k > 0) {
+        delz = (k - 1) * binsizez;
+    } else if(k == 0) {
+        delz = 0.0;
+    } else {
+        delz = (k + 1) * binsizez;
+    }
+
+    return (delx * delx + dely * dely + delz * delz);
+}
+
+int coord2bin(MD_FLOAT xin, MD_FLOAT yin, MD_FLOAT zin)
+{
+    int ix, iy, iz;
+
+    if(xin >= xprd) {
+        ix = (int)((xin - xprd) * bininvx) + nbinx - mbinxlo;
+    } else if(xin >= 0.0) {
+        ix = (int)(xin * bininvx) - mbinxlo;
+    } else {
+        ix = (int)(xin * bininvx) - mbinxlo - 1;
+    }
+
+    if(yin >= yprd) {
+        iy = (int)((yin - yprd) * bininvy) + nbiny - mbinylo;
+    } else if(yin >= 0.0) {
+        iy = (int)(yin * bininvy) - mbinylo;
+    } else {
+        iy = (int)(yin * bininvy) - mbinylo - 1;
+    }
+
+    if(zin >= zprd) {
+        iz = (int)((zin - zprd) * bininvz) + nbinz - mbinzlo;
+    } else if(zin >= 0.0) {
+        iz = (int)(zin * bininvz) - mbinzlo;
+    } else {
+        iz = (int)(zin * bininvz) - mbinzlo - 1;
+    }
+
+    return (iz * mbiny * mbinx + iy * mbinx + ix + 1);
+}
+
+void binatoms(Atom *atom)
+{
+    int nall = atom->Nlocal + atom->Nghost;
+    int resize = 1;
+
+    while(resize > 0) {
+        resize = 0;
+
+        for(int i = 0; i < mbins; i++) {
+            bincount[i] = 0;
+        }
+
+        for(int i = 0; i < nall; i++) {
+            MD_FLOAT x = atom_x(i);
+            MD_FLOAT y = atom_y(i);
+            MD_FLOAT z = atom_z(i);
+            int ibin = coord2bin(x, y, z);
+
+            if(bincount[ibin] < atoms_per_bin) {
+                int ac = bincount[ibin]++;
+                bins[ibin * atoms_per_bin + ac] = i;
+            } else {
+                resize = 1;
+            }
+        }
+
+        if(resize) {
+            free(bins);
+            atoms_per_bin *= 2;
+            bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));
+        }
+    }
+}
+
+void sortAtom(Atom* atom) {
+    binatoms(atom);
+    int Nmax = atom->Nmax;
+    int* binpos = bincount;
+
+    for(int i=1; i<mbins; i++) {
+        binpos[i] += binpos[i-1];
+    }
+
+#ifdef AOS
+    MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
+
+    MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
+#else
+    MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
+    MD_FLOAT* new_y = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
+    MD_FLOAT* new_z = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
+
+    MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
+    MD_FLOAT* new_vy = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
+    MD_FLOAT* new_vz = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
+#endif
+
+    MD_FLOAT* old_x = atom->x; MD_FLOAT* old_y = atom->y; MD_FLOAT* old_z = atom->z;
+    MD_FLOAT* old_vx = atom->vx; MD_FLOAT* old_vy = atom->vy; MD_FLOAT* old_vz = atom->vz;
+
+    for(int mybin = 0; mybin<mbins; mybin++) {
+        int start = mybin>0?binpos[mybin-1]:0;
+        int count = binpos[mybin] - start;
+        for(int k=0; k<count; k++) {
+            int new_i = start + k;
+            int old_i = bins[mybin * atoms_per_bin + k];
+#ifdef AOS
+            new_x[new_i * 3 + 0] = old_x[old_i * 3 + 0];
+            new_x[new_i * 3 + 1] = old_x[old_i * 3 + 1];
+            new_x[new_i * 3 + 2] = old_x[old_i * 3 + 2];
+
+            new_vx[new_i * 3 + 0] = old_vx[old_i * 3 + 0];
+            new_vx[new_i * 3 + 1] = old_vy[old_i * 3 + 1];
+            new_vx[new_i * 3 + 2] = old_vz[old_i * 3 + 2];
+#else
+            new_x[new_i] = old_x[old_i];
+            new_y[new_i] = old_y[old_i];
+            new_z[new_i] = old_z[old_i];
+
+            new_vx[new_i] = old_vx[old_i];
+            new_vy[new_i] = old_vy[old_i];
+            new_vz[new_i] = old_vz[old_i];
+#endif
+
+        }
+    }
+
+    free(atom->x);
+    atom->x = new_x;
+
+    free(atom->vx);
+    atom->vx = new_vx;
+#ifndef AOS
+    free(atom->y);
+    free(atom->z);
+    atom->y = new_y; atom->z = new_z;
+
+    free(atom->vy); free(atom->vz);
+    atom->vy = new_vy; atom->vz = new_vz;
+#endif
+}
+
+void binatoms_cuda(Atom* c_atom, Binning* c_binning, int* c_resize_needed, Neighbor_params *np, const int threads_per_block)
+{
+    int nall = c_atom->Nlocal + c_atom->Nghost;
+    int resize = 1;
+
+    const int num_blocks = ceil((float)nall / (float)threads_per_block);
+
+    while(resize > 0) {
+        resize = 0;
+        checkCUDAError("binatoms_cuda c_binning->bincount memset", cudaMemset(c_binning->bincount, 0, c_binning->mbins * sizeof(int)));
+        checkCUDAError("binatoms_cuda c_resize_needed memset", cudaMemset(c_resize_needed, 0, sizeof(int)) );
+
+        /*binatoms_kernel(Atom a, int* bincount, int* bins, int c_binning->atoms_per_bin, Neighbor_params np, int *resize_needed) */
+        binatoms_kernel<<<num_blocks, threads_per_block>>>(*c_atom, c_binning->bincount, c_binning->bins, c_binning->atoms_per_bin, *np, c_resize_needed);
+
+	    checkCUDAError( "PeekAtLastError binatoms kernel", cudaPeekAtLastError() );
+	    checkCUDAError( "DeviceSync binatoms kernel", cudaDeviceSynchronize() );
+        
+	    checkCUDAError("binatoms_cuda c_resize_needed memcpy back", cudaMemcpy(&resize, c_resize_needed, sizeof(int), cudaMemcpyDeviceToHost) );
+
+        if(resize) {
+            cudaFree(c_binning->bins);
+            c_binning->atoms_per_bin *= 2;
+            checkCUDAError("binatoms_cuda c_binning->bins resize malloc", cudaMalloc(&c_binning->bins, c_binning->mbins * c_binning->atoms_per_bin * sizeof(int)) );
+        }
+    }
+    atoms_per_bin = c_binning->atoms_per_bin;
+    const int sortBlocks = ceil((float)mbins / (float)threads_per_block);
+    /*void sort_bin_contents_kernel(int* bincount, int* bins, int mbins, int atoms_per_bin)*/
+    sort_bin_contents_kernel<<<sortBlocks, threads_per_block>>>(c_binning->bincount, c_binning->bins, c_binning->mbins, c_binning->atoms_per_bin);
+    checkCUDAError( "PeekAtLastError sort_bin_contents kernel", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync sort_bin_contents kernel", cudaDeviceSynchronize() );
+}
+
+void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor, const int num_threads_per_block, double* timers)
+{
+    int nall = atom->Nlocal + atom->Nghost;
+    c_neighbor->maxneighs = neighbor->maxneighs;
+
+    cudaProfilerStart();
+    /* upload stencil */
+    // TODO move all of this initialization into its own method
+    if(c_stencil == NULL){
+        checkCUDAError( "buildNeighbor c_n_stencil malloc", cudaMalloc((void**)&c_stencil, nstencil * sizeof(int)) );
+        checkCUDAError( "buildNeighbor c_n_stencil memcpy", cudaMemcpy(c_stencil, stencil, nstencil * sizeof(int), cudaMemcpyHostToDevice ));
+    }
+
+    if(c_binning.mbins == 0){
+        c_binning.mbins = mbins;
+        c_binning.atoms_per_bin = atoms_per_bin;
+        checkCUDAError( "buildNeighbor c_binning->bincount malloc", cudaMalloc((void**)&(c_binning.bincount), c_binning.mbins * sizeof(int)) );
+        checkCUDAError( "buidlNeighbor c_binning->bins malloc", cudaMalloc((void**)&(c_binning.bins), c_binning.mbins * c_binning.atoms_per_bin * sizeof(int)) );
+    }
+
+    Neighbor_params np{
+            .xprd = xprd,
+            .yprd = yprd,
+            .zprd = zprd,
+            .bininvx = bininvx,
+            .bininvy = bininvy,
+            .bininvz = bininvz,
+            .mbinxlo = mbinxlo,
+            .mbinylo = mbinylo,
+            .mbinzlo = mbinzlo,
+            .nbinx = nbinx,
+            .nbiny = nbiny,
+            .nbinz = nbinz,
+            .mbinx = mbinx,
+            .mbiny = mbiny,
+            .mbinz = mbinz
+    };
+
+    if(c_resize_needed == NULL){
+        checkCUDAError("buildNeighbor c_resize_needed malloc", cudaMalloc((void**)&c_resize_needed, sizeof(int)) );
+    }
+
+    /* bin local & ghost atoms */
+    double beforeBinning = getTimeStamp();
+    binatoms_cuda(c_atom, &c_binning, c_resize_needed, &np, num_threads_per_block);
+    double afterBinning = getTimeStamp();
+    timers[NEIGH_BINATOMS] += afterBinning - beforeBinning;
+
+    if(c_new_maxneighs == NULL){
+        checkCUDAError("c_new_maxneighs malloc", cudaMalloc((void**)&c_new_maxneighs, sizeof(int) ));
+    }
+
+    int resize = 1;
+    
+    /* extend c_neighbor arrays if necessary */
+    if(nall > nmax) {
+        nmax = nall;
+        if(c_neighbor->numneigh) cudaFree(c_neighbor->numneigh);
+        if(c_neighbor->neighbors) cudaFree(c_neighbor->neighbors);
+        checkCUDAError( "buildNeighbor c_numneigh malloc", cudaMalloc((void**)&(c_neighbor->numneigh), nmax * sizeof(int)) );
+        checkCUDAError( "buildNeighbor c_neighbors malloc", cudaMalloc((void**)&(c_neighbor->neighbors), nmax * c_neighbor->maxneighs * sizeof(int)) );
+    }
+
+    /* loop over each atom, storing neighbors */
+    while(resize) {
+        resize = 0;
+
+        checkCUDAError("c_new_maxneighs memset", cudaMemset(c_new_maxneighs, 0, sizeof(int) ));
+
+        // TODO call compute_neigborhood kernel here
+        const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
+        /*compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np, int nstencil, int* stencil,
+                                     int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs)
+         * */
+        compute_neighborhood<<<num_blocks, num_threads_per_block>>>(*c_atom, *c_neighbor,
+                                                                    np, nstencil, c_stencil,
+                                                                    c_binning.bins, c_binning.atoms_per_bin, c_binning.bincount,
+                                                                    c_new_maxneighs,
+								                                    cutneighsq);
+
+	checkCUDAError( "PeekAtLastError ComputeNeighbor", cudaPeekAtLastError() );
+	checkCUDAError( "DeviceSync ComputeNeighbor", cudaDeviceSynchronize() );
+
+        // TODO copy the value of c_new_maxneighs back to host and check if it has been modified
+        int new_maxneighs;
+        checkCUDAError("c_new_maxneighs memcpy back", cudaMemcpy(&new_maxneighs, c_new_maxneighs, sizeof(int), cudaMemcpyDeviceToHost));
+        if (new_maxneighs > c_neighbor->maxneighs){
+            resize = 1;
+        }
+
+        if(resize) {
+            printf("RESIZE %d\n", c_neighbor->maxneighs);
+            c_neighbor->maxneighs = new_maxneighs * 1.2;
+            printf("NEW SIZE %d\n", c_neighbor->maxneighs);
+            cudaFree(c_neighbor->neighbors);
+            checkCUDAError("c_neighbor->neighbors resize malloc",
+                           cudaMalloc((void**)(&c_neighbor->neighbors),
+                                      c_atom->Nmax * c_neighbor->maxneighs * sizeof(int)));
+        }
+
+    }
+    neighbor->maxneighs = c_neighbor->maxneighs;
+
+    cudaProfilerStop();
+}
+}
--- a/lammps/pbc.cu
+++ b/lammps/pbc.cu
@@ -0,0 +1,286 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#include <stdlib.h>
+#include <stdio.h>
+
+extern "C" {
+
+#include <pbc.h>
+#include <atom.h>
+#include <allocate.h>
+
+#define DELTA 20000
+
+}
+
+__global__ void computeAtomsPbcUpdate(Atom a, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    Atom* atom = &a;
+    if( i >= atom->Nlocal ){
+        return;
+    }
+
+    if (atom_x(i) < 0.0) {
+        atom_x(i) += xprd;
+    } else if (atom_x(i) >= xprd) {
+        atom_x(i) -= xprd;
+    }
+
+    if (atom_y(i) < 0.0) {
+        atom_y(i) += yprd;
+    } else if (atom_y(i) >= yprd) {
+        atom_y(i) -= yprd;
+    }
+
+    if (atom_z(i) < 0.0) {
+        atom_z(i) += zprd;
+    } else if (atom_z(i) >= zprd) {
+        atom_z(i) -= zprd;
+    }
+}
+
+__global__ void computePbcUpdate(Atom a, int* PBCx, int* PBCy, int* PBCz, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int Nghost = a.Nghost;
+    if( i >= Nghost ) {
+        return;
+    }
+    Atom* atom = &a;
+    int *border_map = atom->border_map;
+    int nlocal = atom->Nlocal;
+
+    atom_x(nlocal + i) = atom_x(border_map[i]) + PBCx[i] * xprd;
+    atom_y(nlocal + i) = atom_y(border_map[i]) + PBCy[i] * yprd;
+    atom_z(nlocal + i) = atom_z(border_map[i]) + PBCz[i] * zprd;
+}
+
+extern "C"{
+
+static int NmaxGhost;
+static int *PBCx, *PBCy, *PBCz;
+
+static int c_NmaxGhost = 0;
+static int *c_PBCx = NULL, *c_PBCy = NULL, *c_PBCz = NULL;
+
+static void growPbc(Atom *);
+
+/* exported subroutines */
+void initPbc(Atom *atom) {
+    NmaxGhost = 0;
+    atom->border_map = NULL;
+    PBCx = NULL;
+    PBCy = NULL;
+    PBCz = NULL;
+}
+
+/* update coordinates of ghost atoms */
+/* uses mapping created in setupPbc */
+void updatePbc(Atom *atom, Parameter *param) {
+    int *border_map = atom->border_map;
+    int nlocal = atom->Nlocal;
+    MD_FLOAT xprd = param->xprd;
+    MD_FLOAT yprd = param->yprd;
+    MD_FLOAT zprd = param->zprd;
+
+    for (int i = 0; i < atom->Nghost; i++) {
+        atom_x(nlocal + i) = atom_x(border_map[i]) + PBCx[i] * xprd;
+        atom_y(nlocal + i) = atom_y(border_map[i]) + PBCy[i] * yprd;
+        atom_z(nlocal + i) = atom_z(border_map[i]) + PBCz[i] * zprd;
+    }
+}
+
+/* update coordinates of ghost atoms */
+/* uses mapping created in setupPbc */
+void updatePbc_cuda(Atom *atom, Parameter *param, Atom *c_atom, bool doReneighbor, const int num_threads_per_block) {
+    if (doReneighbor){
+        c_atom->Natoms = atom->Natoms;
+        c_atom->Nlocal = atom->Nlocal;
+        c_atom->Nghost = atom->Nghost;
+        c_atom->ntypes = atom->ntypes;
+
+        if (atom->Nmax > c_atom->Nmax){ // the number of ghost atoms has increased -> more space is needed
+            c_atom->Nmax = atom->Nmax;
+            if(c_atom->x != NULL){ cudaFree(c_atom->x); }
+            if(c_atom->type != NULL){ cudaFree(c_atom->type); }
+            checkCUDAError( "updatePbc c_atom->x malloc", cudaMalloc((void**)&(c_atom->x), sizeof(MD_FLOAT) * atom->Nmax * 3) );
+            checkCUDAError( "updatePbc c_atom->type malloc", cudaMalloc((void**)&(c_atom->type), sizeof(int) * atom->Nmax) );
+        }
+        // TODO if the sort is reactivated the atom->vx needs to be copied to GPU as well
+        checkCUDAError( "updatePbc c_atom->x memcpy", cudaMemcpy(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
+        checkCUDAError( "updatePbc c_atom->type memcpy", cudaMemcpy(c_atom->type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) );
+
+        if(c_NmaxGhost < NmaxGhost){
+            c_NmaxGhost = NmaxGhost;
+            if(c_PBCx != NULL){ cudaFree(c_PBCx); }
+            if(c_PBCy != NULL){ cudaFree(c_PBCy); }
+            if(c_PBCz != NULL){ cudaFree(c_PBCz); }
+            if(c_atom->border_map != NULL){ cudaFree(c_atom->border_map); }
+            checkCUDAError( "updatePbc c_PBCx malloc", cudaMalloc((void**)&c_PBCx, NmaxGhost * sizeof(int)) );
+            checkCUDAError( "updatePbc c_PBCy malloc", cudaMalloc((void**)&c_PBCy, NmaxGhost * sizeof(int)) );
+            checkCUDAError( "updatePbc c_PBCz malloc", cudaMalloc((void**)&c_PBCz, NmaxGhost * sizeof(int)) );
+            checkCUDAError( "updatePbc c_atom->border_map malloc", cudaMalloc((void**)&(c_atom->border_map), NmaxGhost * sizeof(int)) );
+        }
+        checkCUDAError( "updatePbc c_PBCx memcpy", cudaMemcpy(c_PBCx, PBCx, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
+        checkCUDAError( "updatePbc c_PBCy memcpy", cudaMemcpy(c_PBCy, PBCy, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
+        checkCUDAError( "updatePbc c_PBCz memcpy", cudaMemcpy(c_PBCz, PBCz, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
+        checkCUDAError( "updatePbc c_atom->border_map memcpy", cudaMemcpy(c_atom->border_map, atom->border_map, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
+    }
+
+    MD_FLOAT xprd = param->xprd;
+    MD_FLOAT yprd = param->yprd;
+    MD_FLOAT zprd = param->zprd;
+
+    const int num_blocks = ceil((float)atom->Nghost / (float)num_threads_per_block);
+
+    /*__global__ void computePbcUpdate(Atom a, int* PBCx, int* PBCy, int* PBCz,
+     *                                                          MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd)
+     * */
+    computePbcUpdate<<<num_blocks, num_threads_per_block>>>(*c_atom, c_PBCx, c_PBCy, c_PBCz, xprd, yprd, zprd);
+    checkCUDAError( "PeekAtLastError UpdatePbc", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync UpdatePbc", cudaDeviceSynchronize() );
+}
+
+/* relocate atoms that have left domain according
+ * to periodic boundary conditions */
+void updateAtomsPbc(Atom *atom, Parameter *param) {
+    MD_FLOAT xprd = param->xprd;
+    MD_FLOAT yprd = param->yprd;
+    MD_FLOAT zprd = param->zprd;
+
+    for (int i = 0; i < atom->Nlocal; i++) {
+
+        if (atom_x(i) < 0.0) {
+            atom_x(i) += xprd;
+        } else if (atom_x(i) >= xprd) {
+            atom_x(i) -= xprd;
+        }
+
+        if (atom_y(i) < 0.0) {
+            atom_y(i) += yprd;
+        } else if (atom_y(i) >= yprd) {
+            atom_y(i) -= yprd;
+        }
+
+        if (atom_z(i) < 0.0) {
+            atom_z(i) += zprd;
+        } else if (atom_z(i) >= zprd) {
+            atom_z(i) -= zprd;
+        }
+    }
+}
+
+void updateAtomsPbc_cuda(Atom* atom, Parameter* param, Atom* c_atom, const int num_threads_per_block){
+    MD_FLOAT xprd = param->xprd;
+    MD_FLOAT yprd = param->yprd;
+    MD_FLOAT zprd = param->zprd;
+
+    const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
+    /*void computeAtomsPbcUpdate(Atom a, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd)*/
+    computeAtomsPbcUpdate<<<num_blocks, num_threads_per_block>>>(*c_atom, xprd, yprd, zprd);
+
+    checkCUDAError( "PeekAtLastError UpdateAtomsPbc", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync UpdateAtomsPbc", cudaDeviceSynchronize() );
+
+    checkCUDAError( "updateAtomsPbc position memcpy back", cudaMemcpy(atom->x, c_atom->x, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
+}
+
+/* setup periodic boundary conditions by
+ * defining ghost atoms around domain
+ * only creates mapping and coordinate corrections
+ * that are then enforced in updatePbc */
+#define ADDGHOST(dx, dy, dz)                              \
+    Nghost++;                                           \
+    border_map[Nghost] = i;                             \
+    PBCx[Nghost] = dx;                                  \
+    PBCy[Nghost] = dy;                                  \
+    PBCz[Nghost] = dz;                                  \
+    atom->type[atom->Nlocal + Nghost] = atom->type[i]
+
+void setupPbc(Atom *atom, Parameter *param) {
+    int *border_map = atom->border_map;
+    MD_FLOAT xprd = param->xprd;
+    MD_FLOAT yprd = param->yprd;
+    MD_FLOAT zprd = param->zprd;
+    MD_FLOAT Cutneigh = param->cutneigh;
+    int Nghost = -1;
+
+    for (int i = 0; i < atom->Nlocal; i++) {
+
+        if (atom->Nlocal + Nghost + 7 >= atom->Nmax) {
+            growAtom(atom);
+        }
+        if (Nghost + 7 >= NmaxGhost) {
+            growPbc(atom);
+            border_map = atom->border_map;
+        }
+
+        MD_FLOAT x = atom_x(i);
+        MD_FLOAT y = atom_y(i);
+        MD_FLOAT z = atom_z(i);
+
+        /* Setup ghost atoms */
+        /* 6 planes */
+        if (x < Cutneigh) { ADDGHOST(+1, 0, 0); }
+        if (x >= (xprd - Cutneigh)) { ADDGHOST(-1, 0, 0); }
+        if (y < Cutneigh) { ADDGHOST(0, +1, 0); }
+        if (y >= (yprd - Cutneigh)) { ADDGHOST(0, -1, 0); }
+        if (z < Cutneigh) { ADDGHOST(0, 0, +1); }
+        if (z >= (zprd - Cutneigh)) { ADDGHOST(0, 0, -1); }
+        /* 8 corners */
+        if (x < Cutneigh && y < Cutneigh && z < Cutneigh) { ADDGHOST(+1, +1, +1); }
+        if (x < Cutneigh && y >= (yprd - Cutneigh) && z < Cutneigh) { ADDGHOST(+1, -1, +1); }
+        if (x < Cutneigh && y >= Cutneigh && z >= (zprd - Cutneigh)) { ADDGHOST(+1, +1, -1); }
+        if (x < Cutneigh && y >= (yprd - Cutneigh) && z >= (zprd - Cutneigh)) { ADDGHOST(+1, -1, -1); }
+        if (x >= (xprd - Cutneigh) && y < Cutneigh && z < Cutneigh) { ADDGHOST(-1, +1, +1); }
+        if (x >= (xprd - Cutneigh) && y >= (yprd - Cutneigh) && z < Cutneigh) { ADDGHOST(-1, -1, +1); }
+        if (x >= (xprd - Cutneigh) && y < Cutneigh && z >= (zprd - Cutneigh)) { ADDGHOST(-1, +1, -1); }
+        if (x >= (xprd - Cutneigh) && y >= (yprd - Cutneigh) && z >= (zprd - Cutneigh)) { ADDGHOST(-1, -1, -1); }
+        /* 12 edges */
+        if (x < Cutneigh && z < Cutneigh) { ADDGHOST(+1, 0, +1); }
+        if (x < Cutneigh && z >= (zprd - Cutneigh)) { ADDGHOST(+1, 0, -1); }
+        if (x >= (xprd - Cutneigh) && z < Cutneigh) { ADDGHOST(-1, 0, +1); }
+        if (x >= (xprd - Cutneigh) && z >= (zprd - Cutneigh)) { ADDGHOST(-1, 0, -1); }
+        if (y < Cutneigh && z < Cutneigh) { ADDGHOST(0, +1, +1); }
+        if (y < Cutneigh && z >= (zprd - Cutneigh)) { ADDGHOST(0, +1, -1); }
+        if (y >= (yprd - Cutneigh) && z < Cutneigh) { ADDGHOST(0, -1, +1); }
+        if (y >= (yprd - Cutneigh) && z >= (zprd - Cutneigh)) { ADDGHOST(0, -1, -1); }
+        if (y < Cutneigh && x < Cutneigh) { ADDGHOST(+1, +1, 0); }
+        if (y < Cutneigh && x >= (xprd - Cutneigh)) { ADDGHOST(-1, +1, 0); }
+        if (y >= (yprd - Cutneigh) && x < Cutneigh) { ADDGHOST(+1, -1, 0); }
+        if (y >= (yprd - Cutneigh) && x >= (xprd - Cutneigh)) { ADDGHOST(-1, -1, 0); }
+    }
+    // increase by one to make it the ghost atom count
+    atom->Nghost = Nghost + 1;
+}
+
+/* internal subroutines */
+void growPbc(Atom *atom) {
+    int nold = NmaxGhost;
+    NmaxGhost += DELTA;
+
+    atom->border_map = (int *) reallocate(atom->border_map, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
+    PBCx = (int *) reallocate(PBCx, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
+    PBCy = (int *) reallocate(PBCy, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
+    PBCz = (int *) reallocate(PBCz, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
+}
+}
--- a/lammps/stats.c
+++ b/lammps/stats.c
@@ -0,0 +1,31 @@
+#include <stdio.h>
+
+#include <atom.h>
+#include <parameter.h>
+#include <stats.h>
+#include <timers.h>
+
+void initStats(Stats *s) {
+    s->total_force_neighs = 0;
+    s->total_force_iters = 0;
+}
+
+void displayStatistics(Atom *atom, Parameter *param, Stats *stats, double *timer) {
+#ifdef COMPUTE_STATS
+    double force_useful_volume = 1e-9 * ( (double)(atom->Nlocal * (param->ntimes + 1)) * (sizeof(MD_FLOAT) * 6 + sizeof(int)) +
+                                          (double)(stats->total_force_neighs) * (sizeof(MD_FLOAT) * 3 + sizeof(int)) );
+    double avg_neigh = stats->total_force_neighs / (double)(atom->Nlocal * (param->ntimes + 1));
+    double avg_simd = stats->total_force_iters / (double)(atom->Nlocal * (param->ntimes + 1));
+#ifdef EXPLICIT_TYPES
+    force_useful_volume += 1e-9 * (double)((atom->Nlocal * (param->ntimes + 1)) + stats->total_force_neighs) * sizeof(int);
+#endif
+    printf("Statistics:\n");
+    printf("\tVector width: %d, Processor frequency: %.4f GHz\n", VECTOR_WIDTH, param->proc_freq);
+    printf("\tAverage neighbors per atom: %.4f\n", avg_neigh);
+    printf("\tAverage SIMD iterations per atom: %.4f\n", avg_simd);
+    printf("\tTotal number of computed pair interactions: %lld\n", stats->total_force_neighs);
+    printf("\tTotal number of SIMD iterations: %lld\n", stats->total_force_iters);
+    printf("\tUseful read data volume for force computation: %.2fGB\n", force_useful_volume);
+    printf("\tCycles/SIMD iteration: %.4f\n", timer[FORCE] * param->proc_freq * 1e9 / stats->total_force_iters);
+#endif
+}
--- a/lammps/thermo.c
+++ b/lammps/thermo.c
@@ -0,0 +1,133 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include <thermo.h>
+
+static int *steparr;
+static MD_FLOAT *tmparr;
+static MD_FLOAT *engarr;
+static MD_FLOAT *prsarr;
+static MD_FLOAT mvv2e;
+static MD_FLOAT dof_boltz;
+static MD_FLOAT t_scale;
+static MD_FLOAT p_scale;
+static MD_FLOAT e_scale;
+static MD_FLOAT t_act;
+static MD_FLOAT p_act;
+static MD_FLOAT e_act;
+static int mstat;
+
+/* exported subroutines */
+void setupThermo(Parameter *param, int natoms)
+{
+    int maxstat = param->ntimes / param->nstat + 2;
+
+    steparr = (int*) malloc(maxstat * sizeof(int));
+    tmparr = (MD_FLOAT*) malloc(maxstat * sizeof(MD_FLOAT));
+    engarr = (MD_FLOAT*) malloc(maxstat * sizeof(MD_FLOAT));
+    prsarr = (MD_FLOAT*) malloc(maxstat * sizeof(MD_FLOAT));
+
+    if(param->force_field == FF_LJ) {
+        mvv2e = 1.0;
+        dof_boltz = (natoms * 3 - 3);
+        t_scale = mvv2e / dof_boltz;
+        p_scale = 1.0 / 3 / param->xprd / param->yprd / param->zprd;
+        e_scale = 0.5;
+    } else {
+        mvv2e = 1.036427e-04;
+        dof_boltz = (natoms * 3 - 3) * 8.617343e-05;
+        t_scale = mvv2e / dof_boltz;
+        p_scale = 1.602176e+06 / 3 / param->xprd / param->yprd / param->zprd;
+        e_scale = 524287.985533;//16.0;
+        param->dtforce /= mvv2e;
+    }
+
+    printf("step\ttemp\t\tpressure\n");
+}
+
+void computeThermo(int iflag, Parameter *param, Atom *atom)
+{
+    MD_FLOAT t = 0.0, p;
+
+    for(int i = 0; i < atom->Nlocal; i++) {
+        t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass;
+    }
+
+    t = t * t_scale;
+    p = (t * dof_boltz) * p_scale;
+    int istep = iflag;
+
+    if(iflag == -1){
+        istep = param->ntimes;
+    }
+    if(iflag == 0){
+        mstat = 0;
+    }
+
+    steparr[mstat] = istep;
+    tmparr[mstat] = t;
+    prsarr[mstat] = p;
+    mstat++;
+    fprintf(stdout, "%i\t%e\t%e\n", istep, t, p);
+}
+
+void adjustThermo(Parameter *param, Atom *atom)
+{
+    /* zero center-of-mass motion */
+    MD_FLOAT vxtot = 0.0; MD_FLOAT vytot = 0.0; MD_FLOAT vztot = 0.0;
+
+    for(int i = 0; i < atom->Nlocal; i++) {
+        vxtot += atom_vx(i);
+        vytot += atom_vy(i);
+        vztot += atom_vz(i);
+    }
+
+    vxtot = vxtot / atom->Natoms;
+    vytot = vytot / atom->Natoms;
+    vztot = vztot / atom->Natoms;
+
+    for(int i = 0; i < atom->Nlocal; i++) {
+        atom_vx(i) -= vxtot;
+        atom_vy(i) -= vytot;
+        atom_vz(i) -= vztot;
+    }
+
+    t_act = 0;
+    MD_FLOAT t = 0.0;
+
+    for(int i = 0; i < atom->Nlocal; i++) {
+        t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass;
+    }
+
+    t *= t_scale;
+    MD_FLOAT factor = sqrt(param->temp / t);
+
+    for(int i = 0; i < atom->Nlocal; i++) {
+        atom_vx(i) *= factor;
+        atom_vy(i) *= factor;
+        atom_vz(i) *= factor;
+    }
+}
--- a/lammps/timing.c
+++ b/lammps/timing.c
@@ -0,0 +1,43 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#include <stdlib.h>
+#include <time.h>
+
+double getTimeStamp()
+{
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return (double)ts.tv_sec + (double)ts.tv_nsec * 1.e-9;
+}
+
+double getTimeResolution()
+{
+    struct timespec ts;
+    clock_getres(CLOCK_MONOTONIC, &ts);
+    return (double)ts.tv_sec + (double)ts.tv_nsec * 1.e-9;
+}
+
+double getTimeStamp_()
+{
+    return getTimeStamp();
+}
--- a/lammps/util.c
+++ b/lammps/util.c
@@ -0,0 +1,42 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#include <util.h>
+
+/* Park/Miller RNG w/out MASKING, so as to be like f90s version */
+#define IA 16807
+#define IM 2147483647
+#define AM (1.0/IM)
+#define IQ 127773
+#define IR 2836
+#define MASK 123459876
+
+double myrandom(int* idum)
+{
+    int k= (*idum) / IQ;
+    double ans;
+
+    *idum = IA * (*idum - k * IQ) - IR * k;
+    if(*idum < 0) *idum += IM;
+    ans = AM * (*idum);
+    return ans;
+}
--- a/lammps/vtk.c
+++ b/lammps/vtk.c
@@ -0,0 +1,44 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <atom.h>
+
+int write_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
+    char timestep_filename[128];
+    snprintf(timestep_filename, sizeof timestep_filename, "%s_%d.vtk", filename, timestep);
+    FILE* fp = fopen(timestep_filename, "wb");
+
+    if(fp == NULL) {
+        fprintf(stderr, "Could not open VTK file for writing!\n");
+        return -1;
+    }
+
+    fprintf(fp, "# vtk DataFile Version 2.0\n");
+    fprintf(fp, "Particle data\n");
+    fprintf(fp, "ASCII\n");
+    fprintf(fp, "DATASET UNSTRUCTURED_GRID\n");
+    fprintf(fp, "POINTS %d double\n", atom->Nlocal);
+    for(int i = 0; i < atom->Nlocal; ++i) {
+        fprintf(fp, "%.4f %.4f %.4f\n", atom_x(i), atom_y(i), atom_z(i));
+    }
+    fprintf(fp, "\n\n");
+    fprintf(fp, "CELLS %d %d\n", atom->Nlocal, atom->Nlocal * 2);
+    for(int i = 0; i < atom->Nlocal; ++i) {
+        fprintf(fp, "1 %d\n", i);
+    }
+    fprintf(fp, "\n\n");
+    fprintf(fp, "CELL_TYPES %d\n", atom->Nlocal);
+    for(int i = 0; i < atom->Nlocal; ++i) {
+        fprintf(fp, "1\n");
+    }
+    fprintf(fp, "\n\n");
+    fprintf(fp, "POINT_DATA %d\n", atom->Nlocal);
+    fprintf(fp, "SCALARS mass double\n");
+    fprintf(fp, "LOOKUP_TABLE default\n");
+    for(int i = 0; i < atom->Nlocal; i++) {
+        fprintf(fp, "1.0\n");
+    }
+    fprintf(fp, "\n\n");
+    fclose(fp);
+    return 0;
+}