Refactor code

2024-05-15 14:20:40 +02:00
parent 9712d7e2c8
commit 8d0a8b5f9c
122 changed files with 418 additions and 4527 deletions
--- a/src/clusterpair/atom.c
+++ b/src/clusterpair/atom.c
@@ -0,0 +1,532 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+#include <atom.h>
+#include <allocate.h>
+#include <util.h>
+
+/*
+ * Put an Atom container into its empty state: the array pointers assigned
+ * below become NULL and the counters become zero. The mask tables are then
+ * built by initMasks().
+ */
+void initAtom(Atom *atom) {
+    // Per-atom positions, velocities and types
+    atom->x = atom->y = atom->z = NULL;
+    atom->vx = atom->vy = atom->vz = NULL;
+    atom->type = NULL;
+
+    // ntypes x ntypes parameter tables
+    atom->ntypes = 0;
+    atom->epsilon = atom->sigma6 = NULL;
+    atom->cutforcesq = atom->cutneighsq = NULL;
+
+    // Data in cluster format
+    atom->cl_x = atom->cl_v = atom->cl_f = NULL;
+    atom->cl_type = NULL;
+    atom->iclusters = atom->jclusters = NULL;
+    atom->icluster_bin = NULL;
+
+    // Atom and cluster counters
+    atom->Natoms = atom->Nlocal = atom->Nghost = atom->Nmax = 0;
+    atom->Nclusters = atom->Nclusters_local = 0;
+    atom->Nclusters_ghost = atom->Nclusters_max = 0;
+
+    initMasks(atom);
+}
+
+/*
+ * Generate the atoms of a lattice filling the box described by `param` and
+ * store them in `atom`, together with the ntypes x ntypes parameter tables.
+ *
+ * Lattice points sit on a grid with spacing 0.5 * alat, where
+ * alat = (4 / rho)^(1/3); only points whose index sum i + j + k is even are
+ * occupied (presumably an FCC lattice -- TODO confirm). Each velocity
+ * component is taken from myrandom(), called with a state value derived from
+ * the atom's lattice indices, and each type is drawn with rand() % ntypes.
+ * Storage is enlarged through growAtom() whenever Nlocal reaches Nmax.
+ */
+void createAtom(Atom *atom, Parameter *param) {
+    MD_FLOAT xlo = 0.0; MD_FLOAT xhi = param->xprd;
+    MD_FLOAT ylo = 0.0; MD_FLOAT yhi = param->yprd;
+    MD_FLOAT zlo = 0.0; MD_FLOAT zhi = param->zprd;
+    atom->Natoms = 4 * param->nx * param->ny * param->nz;
+    atom->Nlocal = 0;
+    atom->ntypes = param->ntypes;
+    atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+
+    // Every type pair starts out with the same global parameters
+    for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
+        atom->epsilon[i] = param->epsilon;
+        atom->sigma6[i] = param->sigma6;
+        atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
+        atom->cutforcesq[i] = param->cutforce * param->cutforce;
+    }
+
+    // Lattice index range covering the box (one extra point on each side) ...
+    MD_FLOAT alat = pow((4.0 / param->rho), (1.0 / 3.0));
+    int ilo = (int) (xlo / (0.5 * alat) - 1);
+    int ihi = (int) (xhi / (0.5 * alat) + 1);
+    int jlo = (int) (ylo / (0.5 * alat) - 1);
+    int jhi = (int) (yhi / (0.5 * alat) + 1);
+    int klo = (int) (zlo / (0.5 * alat) - 1);
+    int khi = (int) (zhi / (0.5 * alat) + 1);
+
+    // ... clamped to the (2 * nx) x (2 * ny) x (2 * nz) grid of lattice points
+    ilo = MAX(ilo, 0);
+    ihi = MIN(ihi, 2 * param->nx - 1);
+    jlo = MAX(jlo, 0);
+    jhi = MIN(jhi, 2 * param->ny - 1);
+    klo = MAX(klo, 0);
+    khi = MIN(khi, 2 * param->nz - 1);
+
+    // Lattice points are visited sub-box by sub-box: (sx, sy, sz) is the
+    // position inside the current sub-box of subboxdim^3 points and
+    // (ox, oy, oz) is the index of the sub-box itself
+    MD_FLOAT xtmp, ytmp, ztmp, vxtmp, vytmp, vztmp;
+    int i, j, k, m, n;
+    int sx = 0; int sy = 0; int sz = 0;
+    int ox = 0; int oy = 0; int oz = 0;
+    int subboxdim = 8;
+
+    while(oz * subboxdim <= khi) {
+        k = oz * subboxdim + sz;
+        j = oy * subboxdim + sy;
+        i = ox * subboxdim + sx;
+
+        // Only points with an even index sum inside the clamped range carry an atom
+        if(((i + j + k) % 2 == 0) && (i >= ilo) && (i <= ihi) && (j >= jlo) && (j <= jhi) && (k >= klo) && (k <= khi)) {
+            xtmp = 0.5 * alat * i;
+            ytmp = 0.5 * alat * j;
+            ztmp = 0.5 * alat * k;
+
+            if(xtmp >= xlo && xtmp < xhi && ytmp >= ylo && ytmp < yhi && ztmp >= zlo && ztmp < zhi ) {
+                // n is unique per lattice point; five myrandom() results are
+                // discarded before each velocity component is drawn
+                n = k * (2 * param->ny) * (2 * param->nx) + j * (2 * param->nx) + i + 1;
+                for(m = 0; m < 5; m++) { myrandom(&n); }
+                vxtmp = myrandom(&n);
+                for(m = 0; m < 5; m++){ myrandom(&n); }
+                vytmp = myrandom(&n);
+                for(m = 0; m < 5; m++) { myrandom(&n); }
+                vztmp = myrandom(&n);
+
+                if(atom->Nlocal == atom->Nmax) { growAtom(atom); }
+                atom_x(atom->Nlocal) = xtmp;
+                atom_y(atom->Nlocal) = ytmp;
+                atom_z(atom->Nlocal) = ztmp;
+                atom->vx[atom->Nlocal] = vxtmp;
+                atom->vy[atom->Nlocal] = vytmp;
+                atom->vz[atom->Nlocal] = vztmp;
+                atom->type[atom->Nlocal] = rand() % atom->ntypes;
+                atom->Nlocal++;
+            }
+        }
+
+        // Advance: x, y, z inside the sub-box first, then the sub-box index
+        // along x, y and finally z (which terminates the loop)
+        sx++;
+        if(sx == subboxdim) { sx = 0; sy++; }
+        if(sy == subboxdim) { sy = 0; sz++; }
+        if(sz == subboxdim) { sz = 0; ox++; }
+        if(ox * subboxdim > ihi) { ox = 0; oy++; }
+        if(oy * subboxdim > jhi) { oy = 0; oz++; }
+    }
+}
+
+/*
+ * Translate an atom type name into its integer id. Names beginning with
+ * "Ar" map to 0; any other name is reported on stderr and the program
+ * terminates with exit(-1).
+ */
+int type_str2int(const char *type) {
+    const int is_argon = (strncmp(type, "Ar", 2) == 0);
+
+    if(!is_argon) {
+        fprintf(stderr, "Invalid atom type: %s\n", type);
+        exit(-1);
+    }
+
+    return 0; // Argon
+}
+
+/*
+ * Select the reader matching the last four characters of param->input_file
+ * (".pdb", ".gro" or ".dmp") and return that reader's result.
+ * Any other name -- including one shorter than four characters -- is
+ * reported on stderr and the program terminates with exit(-1).
+ */
+int readAtom(Atom* atom, Parameter* param) {
+    const char *path = param->input_file;
+    size_t len = strlen(path);
+    // A name shorter than an extension cannot match; never form a pointer
+    // in front of the start of the string
+    const char *ext = (len >= 4) ? &path[len - 4] : "";
+
+    if(strncmp(ext, ".pdb", 4) == 0) { return readAtom_pdb(atom, param); }
+    if(strncmp(ext, ".gro", 4) == 0) { return readAtom_gro(atom, param); }
+    if(strncmp(ext, ".dmp", 4) == 0) { return readAtom_dmp(atom, param); }
+    fprintf(stderr, "Invalid input file extension: %s\nValid choices are: pdb, gro, dmp\n", path);
+    exit(-1);
+    return -1;
+}
+
+// Return the next space-separated field of the line being tokenized (strtok
+// semantics). A missing field is a fatal input error naming `what`.
+static char *pdbNextField(char *str, const char *what) {
+    char *tok = strtok(str, " ");
+    if(tok == NULL) {
+        fprintf(stderr, "Input error: missing %s in PDB record\n", what);
+        exit(-1);
+    }
+    return tok;
+}
+
+/*
+ * Read atoms from the PDB file named by param->input_file.
+ *
+ * CRYST1 records set the box (lower bounds 0, upper bounds from the record)
+ * and ATOM records supply id, type name and position; velocities are set to
+ * zero. HEADER, REMARK, MODEL, TER and ENDMDL records are ignored and any
+ * other record is a fatal error. Afterwards the ntypes x ntypes parameter
+ * tables are allocated and filled from `param`.
+ *
+ * Returns the number of ATOM records read. All error paths print to stderr
+ * and terminate with exit(-1).
+ */
+int readAtom_pdb(Atom* atom, Parameter* param) {
+    FILE *fp = fopen(param->input_file, "r");
+    char line[MAXLINE];
+    int read_atoms = 0;
+
+    if(!fp) {
+        fprintf(stderr, "Could not open input file: %s\n", param->input_file);
+        exit(-1);
+        return -1;
+    }
+
+    while(!feof(fp)) {
+        // Start from an empty buffer: feof() only turns true after a read has
+        // failed, and that failed read must not leave the previous record
+        // behind to be parsed a second time
+        line[0] = '\0';
+        readline(line, fp);
+        char *item = strtok(line, " ");
+        if(item == NULL) { continue; } // nothing was read
+
+        if(strncmp(item, "CRYST1", 6) == 0) {
+            param->xlo = 0.0;
+            param->xhi = atof(pdbNextField(NULL, "box x length"));
+            param->ylo = 0.0;
+            param->yhi = atof(pdbNextField(NULL, "box y length"));
+            param->zlo = 0.0;
+            param->zhi = atof(pdbNextField(NULL, "box z length"));
+            param->xprd = param->xhi - param->xlo;
+            param->yprd = param->yhi - param->ylo;
+            param->zprd = param->zhi - param->zlo;
+            // alpha, beta, gamma, sGroup, z
+        } else if(strncmp(item, "ATOM", 4) == 0) {
+            // Ids in the file are 1-based; they index the arrays directly
+            int atom_id = atoi(pdbNextField(NULL, "atom id")) - 1;
+            if(atom_id < 0) {
+                fprintf(stderr, "Input error: invalid atom id in PDB record\n");
+                exit(-1);
+            }
+
+            while(atom_id + 1 >= atom->Nmax) {
+                growAtom(atom);
+            }
+
+            atom->type[atom_id] = type_str2int(pdbNextField(NULL, "atom type"));
+            pdbNextField(NULL, "label");        // present in the record, not stored
+            pdbNextField(NULL, "component id"); // present in the record, not stored
+            atom_x(atom_id) = atof(pdbNextField(NULL, "x coordinate"));
+            atom_y(atom_id) = atof(pdbNextField(NULL, "y coordinate"));
+            atom_z(atom_id) = atof(pdbNextField(NULL, "z coordinate"));
+            atom->vx[atom_id] = 0.0;
+            atom->vy[atom_id] = 0.0;
+            atom->vz[atom_id] = 0.0;
+            // Occupancy and charge follow in the record but are not used
+            atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
+            atom->Natoms++;
+            atom->Nlocal++;
+            read_atoms++;
+        } else if(strncmp(item, "HEADER", 6) == 0 ||
+                  strncmp(item, "REMARK", 6) == 0 ||
+                  strncmp(item, "MODEL", 5) == 0 ||
+                  strncmp(item, "TER", 3) == 0 ||
+                  strncmp(item, "ENDMDL", 6) == 0) {
+            // Do nothing
+        } else {
+            fprintf(stderr, "Invalid item: %s\n", item);
+            exit(-1);
+            return -1;
+        }
+    }
+
+    if(!read_atoms) {
+        fprintf(stderr, "Input error: No atoms read!\n");
+        exit(-1);
+        return -1;
+    }
+
+    atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
+        atom->epsilon[i] = param->epsilon;
+        atom->sigma6[i] = param->sigma6;
+        atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
+        atom->cutforcesq[i] = param->cutforce * param->cutforce;
+    }
+
+    fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
+    fclose(fp);
+    return read_atoms;
+}
+
+// Return the next space-separated field of the line being tokenized (strtok
+// semantics). A missing field is a fatal input error naming `what`.
+static char *groNextField(char *str, const char *what) {
+    char *tok = strtok(str, " ");
+    if(tok == NULL) {
+        fprintf(stderr, "Input error: missing %s in GRO record\n", what);
+        exit(-1);
+    }
+    return tok;
+}
+
+/*
+ * Read atoms from the GRO file named by param->input_file.
+ *
+ * The first line is a description, the second the atom count. Each atom
+ * line supplies label, type name, id, position and velocity; atoms are
+ * stored in file order, the id column is skipped. A line following the
+ * atoms, if present, supplies the box upper bounds (lower bounds are 0).
+ * Afterwards the ntypes x ntypes parameter tables are allocated and filled
+ * from `param`.
+ *
+ * Returns the number of atoms read. All error paths print to stderr and
+ * terminate with exit(-1).
+ */
+int readAtom_gro(Atom* atom, Parameter* param) {
+    FILE *fp = fopen(param->input_file, "r");
+    char line[MAXLINE];
+    char desc[MAXLINE];
+    int read_atoms = 0;
+    int atoms_to_read = 0;
+
+    if(!fp) {
+        fprintf(stderr, "Could not open input file: %s\n", param->input_file);
+        exit(-1);
+        return -1;
+    }
+
+    // Buffers are emptied before each read so that a read which delivers
+    // nothing can be recognised instead of parsing leftover contents
+    desc[0] = '\0';
+    readline(desc, fp);
+    desc[strcspn(desc, "\n")] = '\0'; // cut at the newline, bounded by the terminator
+    line[0] = '\0';
+    readline(line, fp);
+    atoms_to_read = atoi(groNextField(line, "atom count"));
+    fprintf(stdout, "System: %s with %d atoms\n", desc, atoms_to_read);
+
+    while(!feof(fp) && read_atoms < atoms_to_read) {
+        line[0] = '\0';
+        readline(line, fp);
+        if(line[0] == '\0') { break; } // file ended early; reported by the count check below
+
+        groNextField(line, "label"); // present in the record, not stored
+        int type = type_str2int(groNextField(NULL, "atom type"));
+        groNextField(NULL, "atom id"); // skipped: atoms are stored in file order
+        int atom_id = read_atoms;
+        while(atom_id + 1 >= atom->Nmax) {
+            growAtom(atom);
+        }
+
+        atom->type[atom_id] = type;
+        atom_x(atom_id) = atof(groNextField(NULL, "x coordinate"));
+        atom_y(atom_id) = atof(groNextField(NULL, "y coordinate"));
+        atom_z(atom_id) = atof(groNextField(NULL, "z coordinate"));
+        atom->vx[atom_id] = atof(groNextField(NULL, "x velocity"));
+        atom->vy[atom_id] = atof(groNextField(NULL, "y velocity"));
+        atom->vz[atom_id] = atof(groNextField(NULL, "z velocity"));
+        atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
+        atom->Natoms++;
+        atom->Nlocal++;
+        read_atoms++;
+    }
+
+    if(!feof(fp)) {
+        line[0] = '\0';
+        readline(line, fp);
+        if(line[0] != '\0') {
+            param->xlo = 0.0;
+            param->xhi = atof(groNextField(line, "box x length"));
+            param->ylo = 0.0;
+            param->yhi = atof(groNextField(NULL, "box y length"));
+            param->zlo = 0.0;
+            param->zhi = atof(groNextField(NULL, "box z length"));
+            param->xprd = param->xhi - param->xlo;
+            param->yprd = param->yhi - param->ylo;
+            param->zprd = param->zhi - param->zlo;
+        }
+    }
+
+    if(read_atoms != atoms_to_read) {
+        fprintf(stderr, "Input error: Number of atoms read do not match (%d/%d).\n", read_atoms, atoms_to_read);
+        exit(-1);
+        return -1;
+    }
+
+    atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
+        atom->epsilon[i] = param->epsilon;
+        atom->sigma6[i] = param->sigma6;
+        atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
+        atom->cutforcesq[i] = param->cutforce * param->cutforce;
+    }
+
+    fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
+    fclose(fp);
+    return read_atoms;
+}
+
+// Return the next space-separated field of the line being tokenized (strtok
+// semantics). A missing field is a fatal input error naming `what`.
+static char *dmpNextField(char *str, const char *what) {
+    char *tok = strtok(str, " ");
+    if(tok == NULL) {
+        fprintf(stderr, "Input error: missing %s in dump file\n", what);
+        exit(-1);
+    }
+    return tok;
+}
+
+/*
+ * Read atoms from the dump file named by param->input_file.
+ *
+ * The file is a sequence of "ITEM: <name>" headers followed by data lines.
+ * TIMESTEP, NUMBER OF ATOMS, BOX BOUNDS pp pp pp and
+ * ATOMS id type x y z vx vy vz are understood; any other item or a line
+ * that is not an item header is a fatal error. Scanning stops once atom
+ * rows have been read or a timestep value of 1 or more has been seen.
+ * Afterwards the ntypes x ntypes parameter tables are allocated and filled
+ * from `param`.
+ *
+ * Returns the atom count announced by the file. All error paths print to
+ * stderr and terminate with exit(-1).
+ */
+int readAtom_dmp(Atom* atom, Parameter* param) {
+    FILE *fp = fopen(param->input_file, "r");
+    char line[MAXLINE];
+    int natoms = 0;
+    int read_atoms = 0;
+    int ts = -1;
+
+    if(!fp) {
+        fprintf(stderr, "Could not open input file: %s\n", param->input_file);
+        exit(-1);
+        return -1;
+    }
+
+    while(!feof(fp) && ts < 1 && !read_atoms) {
+        // Emptied first so that a read delivering nothing is recognised
+        line[0] = '\0';
+        readline(line, fp);
+        if(line[0] == '\0') { break; } // file ended; reported by the check below
+
+        if(strncmp(line, "ITEM: ", 6) == 0) {
+            char *item = &line[6];
+
+            if(strncmp(item, "TIMESTEP", 8) == 0) {
+                readline(line, fp);
+                ts = atoi(line);
+            } else if(strncmp(item, "NUMBER OF ATOMS", 15) == 0) {
+                readline(line, fp);
+                natoms = atoi(line);
+                atom->Natoms = natoms;
+                atom->Nlocal = natoms;
+                while(atom->Nlocal >= atom->Nmax) {
+                    growAtom(atom);
+                }
+            } else if(strncmp(item, "BOX BOUNDS pp pp pp", 19) == 0) {
+                readline(line, fp);
+                param->xlo = atof(dmpNextField(line, "box x lower bound"));
+                param->xhi = atof(dmpNextField(NULL, "box x upper bound"));
+                param->xprd = param->xhi - param->xlo;
+
+                readline(line, fp);
+                param->ylo = atof(dmpNextField(line, "box y lower bound"));
+                param->yhi = atof(dmpNextField(NULL, "box y upper bound"));
+                param->yprd = param->yhi - param->ylo;
+
+                readline(line, fp);
+                param->zlo = atof(dmpNextField(line, "box z lower bound"));
+                param->zhi = atof(dmpNextField(NULL, "box z upper bound"));
+                param->zprd = param->zhi - param->zlo;
+            } else if(strncmp(item, "ATOMS id type x y z vx vy vz", 28) == 0) {
+                for(int i = 0; i < natoms; i++) {
+                    line[0] = '\0';
+                    readline(line, fp);
+                    // Ids in the file are 1-based and index the arrays
+                    // directly, so they must fall inside the announced count
+                    int atom_id = atoi(dmpNextField(line, "atom id")) - 1;
+                    if(atom_id < 0 || atom_id >= natoms) {
+                        fprintf(stderr, "Input error: atom id out of range in dump file\n");
+                        exit(-1);
+                    }
+
+                    atom->type[atom_id] = atoi(dmpNextField(NULL, "atom type"));
+                    atom_x(atom_id) = atof(dmpNextField(NULL, "x coordinate"));
+                    atom_y(atom_id) = atof(dmpNextField(NULL, "y coordinate"));
+                    atom_z(atom_id) = atof(dmpNextField(NULL, "z coordinate"));
+                    atom->vx[atom_id] = atof(dmpNextField(NULL, "x velocity"));
+                    atom->vy[atom_id] = atof(dmpNextField(NULL, "y velocity"));
+                    atom->vz[atom_id] = atof(dmpNextField(NULL, "z velocity"));
+                    // The type value is used as an index into the
+                    // ntypes x ntypes tables, so ntypes must exceed the
+                    // largest type (same rule as the pdb and gro readers)
+                    atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
+                    read_atoms++;
+                }
+            } else {
+                fprintf(stderr, "Invalid item: %s\n", item);
+                exit(-1);
+                return -1;
+            }
+        } else {
+            fprintf(stderr, "Invalid input from file, expected item reference but got:\n%s\n", line);
+            exit(-1);
+            return -1;
+        }
+    }
+
+    if(ts < 0 || !natoms || !read_atoms) {
+        fprintf(stderr, "Input error: atom data was not read!\n");
+        exit(-1);
+        return -1;
+    }
+
+    atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
+        atom->epsilon[i] = param->epsilon;
+        atom->sigma6[i] = param->sigma6;
+        atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
+        atom->cutforcesq[i] = param->cutforce * param->cutforce;
+    }
+
+    fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
+    fclose(fp);
+    return natoms;
+}
+
+/*
+ * Allocate and fill the mask tables stored in `atom`:
+ *   - exclusion_filter: one single-bit value per (CLUSTER_M x VECTOR_WIDTH) slot,
+ *   - diagonal_4xn_j_minus_i / diagonal_2xnn_j_minus_i: "j minus i" offsets
+ *     shifted by -0.5,
+ *   - masks_2xnn_{hn,fn} and masks_4xn_{hn,fn}: bit masks indexed by the
+ *     condition flags cond0 (and cond1 when CLUSTER_M != CLUSTER_N).
+ * Which mask values are generated is decided at compile time by the
+ * relation between CLUSTER_M and CLUSTER_N.
+ */
+void initMasks(Atom *atom) {
+    const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
+    unsigned int mask0, mask1, mask2, mask3;
+
+    atom->exclusion_filter = allocate(ALIGNMENT, CLUSTER_M * VECTOR_WIDTH * sizeof(MD_UINT));
+    // The diagonal tables are declared as MD_FLOAT arrays and are filled with
+    // MD_FLOAT values below, so they are sized by that element type
+    atom->diagonal_4xn_j_minus_i = allocate(ALIGNMENT, MAX(CLUSTER_M, VECTOR_WIDTH) * sizeof(MD_FLOAT));
+    atom->diagonal_2xnn_j_minus_i = allocate(ALIGNMENT, VECTOR_WIDTH * sizeof(MD_FLOAT));
+
+    for(int j = 0; j < MAX(CLUSTER_M, VECTOR_WIDTH); j++) {
+        atom->diagonal_4xn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5;
+    }
+
+    // Lower half uses offset j, upper half offset j - 1
+    for(int j = 0; j < VECTOR_WIDTH / 2; j++) {
+        atom->diagonal_2xnn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5;
+        atom->diagonal_2xnn_j_minus_i[VECTOR_WIDTH / 2 + j] = (MD_FLOAT)(j - 1) - 0.5;
+    }
+
+    // NOTE(review): 1U << i is undefined once i reaches the bit width of
+    // unsigned int -- confirm CLUSTER_M * VECTOR_WIDTH never exceeds it
+    for(int i = 0; i < CLUSTER_M * VECTOR_WIDTH; i++) {
+        atom->exclusion_filter[i] = (1U << i);
+    }
+
+    #if CLUSTER_M == CLUSTER_N
+    for(unsigned int cond0 = 0; cond0 < 2; cond0++) {
+        mask0 = (unsigned int)(0xf - 0x1 * cond0);
+        mask1 = (unsigned int)(0xf - 0x3 * cond0);
+        mask2 = (unsigned int)(0xf - 0x7 * cond0);
+        mask3 = (unsigned int)(0xf - 0xf * cond0);
+        atom->masks_2xnn_hn[cond0 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
+        atom->masks_2xnn_hn[cond0 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
+
+        mask0 = (unsigned int)(0xf - 0x1 * cond0);
+        mask1 = (unsigned int)(0xf - 0x2 * cond0);
+        mask2 = (unsigned int)(0xf - 0x4 * cond0);
+        mask3 = (unsigned int)(0xf - 0x8 * cond0);
+        atom->masks_2xnn_fn[cond0 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
+        atom->masks_2xnn_fn[cond0 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
+
+        atom->masks_4xn_hn[cond0 * 4 + 0] = (unsigned int)(0xf - 0x1 * cond0);
+        atom->masks_4xn_hn[cond0 * 4 + 1] = (unsigned int)(0xf - 0x3 * cond0);
+        atom->masks_4xn_hn[cond0 * 4 + 2] = (unsigned int)(0xf - 0x7 * cond0);
+        atom->masks_4xn_hn[cond0 * 4 + 3] = (unsigned int)(0xf - 0xf * cond0);
+
+        atom->masks_4xn_fn[cond0 * 4 + 0] = (unsigned int)(0xf - 0x1 * cond0);
+        atom->masks_4xn_fn[cond0 * 4 + 1] = (unsigned int)(0xf - 0x2 * cond0);
+        atom->masks_4xn_fn[cond0 * 4 + 2] = (unsigned int)(0xf - 0x4 * cond0);
+        atom->masks_4xn_fn[cond0 * 4 + 3] = (unsigned int)(0xf - 0x8 * cond0);
+    }
+    #else
+    for(unsigned int cond0 = 0; cond0 < 2; cond0++) {
+        for(unsigned int cond1 = 0; cond1 < 2; cond1++) {
+            #if CLUSTER_M < CLUSTER_N
+            mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
+            mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
+            mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
+            mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
+            #else
+            mask0 = (unsigned int)(0x3 - 0x1 * cond0);
+            mask1 = (unsigned int)(0x3 - 0x3 * cond0);
+            mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1);
+            mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1);
+            #endif
+
+            atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
+            atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
+
+            #if CLUSTER_M < CLUSTER_N
+            mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
+            mask1 = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
+            mask2 = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
+            mask3 = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
+            #else
+            mask0 = (unsigned int)(0x3 - 0x1 * cond0);
+            mask1 = (unsigned int)(0x3 - 0x2 * cond0);
+            mask2 = (unsigned int)(0x3 - 0x1 * cond1);
+            mask3 = (unsigned int)(0x3 - 0x2 * cond1);
+            #endif
+
+            atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
+            atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
+
+            #if CLUSTER_M < CLUSTER_N
+            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
+            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
+            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
+            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
+
+            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
+            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
+            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
+            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
+            #else
+            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x1 * cond0);
+            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0x3 - 0x3 * cond0);
+            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0x3 - 0x3 * cond0 - 0x1 * cond1);
+            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0x3 - 0x3 * cond0 - 0x3 * cond1);
+
+            // One slot per mask, mirroring mask0..mask3 of masks_2xnn_fn above
+            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x1 * cond0);
+            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0x3 - 0x2 * cond0);
+            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0x3 - 0x1 * cond1);
+            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0x3 - 0x2 * cond1);
+            #endif
+        }
+    }
+    #endif
+}
+
+/*
+ * Enlarge the per-atom arrays by DELTA entries. Each array is resized with
+ * reallocate(), which is given both the new and the previous size in bytes.
+ * With AOS defined only atom->x is resized for positions, holding three
+ * values per atom.
+ */
+void growAtom(Atom *atom) {
+    const int old_cap = atom->Nmax;
+    atom->Nmax += DELTA;
+
+    const size_t new_float_bytes = atom->Nmax * sizeof(MD_FLOAT);
+    const size_t old_float_bytes = old_cap * sizeof(MD_FLOAT);
+    const size_t new_int_bytes = atom->Nmax * sizeof(int);
+    const size_t old_int_bytes = old_cap * sizeof(int);
+
+    // Positions
+    #ifdef AOS
+    atom->x  = (MD_FLOAT*) reallocate(atom->x,  ALIGNMENT, new_float_bytes * 3, old_float_bytes * 3);
+    #else
+    atom->x  = (MD_FLOAT*) reallocate(atom->x,  ALIGNMENT, new_float_bytes, old_float_bytes);
+    atom->y  = (MD_FLOAT*) reallocate(atom->y,  ALIGNMENT, new_float_bytes, old_float_bytes);
+    atom->z  = (MD_FLOAT*) reallocate(atom->z,  ALIGNMENT, new_float_bytes, old_float_bytes);
+    #endif
+
+    // Velocities
+    atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, new_float_bytes, old_float_bytes);
+    atom->vy = (MD_FLOAT*) reallocate(atom->vy, ALIGNMENT, new_float_bytes, old_float_bytes);
+    atom->vz = (MD_FLOAT*) reallocate(atom->vz, ALIGNMENT, new_float_bytes, old_float_bytes);
+
+    // Types
+    atom->type = (int *) reallocate(atom->type, ALIGNMENT, new_int_bytes, old_int_bytes);
+}
+
+/*
+ * Enlarge the cluster arrays by DELTA clusters. Each array is resized with
+ * reallocate(), which is given both the new and the previous size in bytes.
+ * The j-cluster array is scaled by CLUSTER_M / CLUSTER_N when M > N.
+ */
+void growClusters(Atom *atom) {
+    const int old_cap = atom->Nclusters_max;
+    const int j_per_i = MAX(1, CLUSTER_M / CLUSTER_N); // If M>N, we need to allocate more j-clusters
+    atom->Nclusters_max += DELTA;
+    const int new_cap = atom->Nclusters_max;
+
+    // Cluster descriptors and bins
+    atom->iclusters = (Cluster*) reallocate(
+        atom->iclusters, ALIGNMENT, new_cap * sizeof(Cluster), old_cap * sizeof(Cluster));
+    atom->jclusters = (Cluster*) reallocate(
+        atom->jclusters, ALIGNMENT, new_cap * j_per_i * sizeof(Cluster), old_cap * j_per_i * sizeof(Cluster));
+    atom->icluster_bin = (int*) reallocate(
+        atom->icluster_bin, ALIGNMENT, new_cap * sizeof(int), old_cap * sizeof(int));
+
+    // Three MD_FLOAT values for each of the CLUSTER_M slots of a cluster
+    const size_t new_vec_bytes = new_cap * CLUSTER_M * 3 * sizeof(MD_FLOAT);
+    const size_t old_vec_bytes = old_cap * CLUSTER_M * 3 * sizeof(MD_FLOAT);
+    atom->cl_x = (MD_FLOAT*) reallocate(atom->cl_x, ALIGNMENT, new_vec_bytes, old_vec_bytes);
+    atom->cl_f = (MD_FLOAT*) reallocate(atom->cl_f, ALIGNMENT, new_vec_bytes, old_vec_bytes);
+    atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, new_vec_bytes, old_vec_bytes);
+
+    // One type per slot
+    atom->cl_type = (int*) reallocate(
+        atom->cl_type, ALIGNMENT, new_cap * CLUSTER_M * sizeof(int), old_cap * CLUSTER_M * sizeof(int));
+}
--- a/src/clusterpair/atom.h
+++ b/src/clusterpair/atom.h
@@ -0,0 +1,171 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <parameter.h>
+
+#ifndef __ATOM_H_
+#define __ATOM_H_
+
+// Number of entries by which growAtom() and growClusters() enlarge their arrays
+#define DELTA 20000
+
+// Nbnxn layouts (as of GROMACS):
+// Simd4xN: M=4, N=VECTOR_WIDTH
+// Simd2xNN: M=4, N=(VECTOR_WIDTH/2)
+// Cuda: M=8, N=VECTOR_WIDTH
+
+// Compile-time kernel selection: fixes the cluster dimensions CLUSTER_M and
+// CLUSTER_N, the kernel name string and the functions that the generic names
+// computeForceLJ, initialIntegrate, finalIntegrate and updatePbc expand to.
+#ifdef CUDA_TARGET
+// CUDA build: VECTOR_WIDTH is overridden to 8 and M = 8, N = VECTOR_WIDTH
+#   undef VECTOR_WIDTH
+#   define VECTOR_WIDTH             8
+#   define KERNEL_NAME              "CUDA"
+#   define CLUSTER_M                8
+#   define CLUSTER_N                VECTOR_WIDTH
+#   define UNROLL_J                 1
+#   define computeForceLJ           computeForceLJ_cuda
+#   define initialIntegrate         cudaInitialIntegrate
+#   define finalIntegrate           cudaFinalIntegrate
+#   define updatePbc                cudaUpdatePbc
+#else
+// CPU build: M = 4; the layout is chosen from VECTOR_WIDTH
+#   define CLUSTER_M                4
+// Simd2xNN (here used for single-precision)
+#   if VECTOR_WIDTH > CLUSTER_M * 2
+#       define KERNEL_NAME          "Simd2xNN"
+#       define CLUSTER_N            (VECTOR_WIDTH / 2)
+#       define UNROLL_I             4
+#       define UNROLL_J             2
+#       define computeForceLJ       computeForceLJ_2xnn
+// Simd4xN
+#   else
+#       define KERNEL_NAME          "Simd4xN"
+#       define CLUSTER_N            VECTOR_WIDTH
+#       define UNROLL_I             4
+#       define UNROLL_J             1
+#       define computeForceLJ       computeForceLJ_4xn
+#   endif
+// The reference version replaces only the kernel name and force function;
+// cluster dimensions and unroll factors stay as selected above
+#   ifdef USE_REFERENCE_VERSION
+#       undef KERNEL_NAME
+#       undef computeForceLJ
+#       define KERNEL_NAME          "Reference"
+#       define computeForceLJ       computeForceLJ_ref
+#   endif
+#   define initialIntegrate         cpuInitialIntegrate
+#   define finalIntegrate           cpuFinalIntegrate
+#   define updatePbc                cpuUpdatePbc
+#endif
+
+// Index helpers relating i-clusters (CLUSTER_M slots) and j-clusters
+// (CLUSTER_N slots). Only M == N, M == 2N and M == N/2 are supported.
+//   CJ0_FROM_CI(a) / CJ1_FROM_CI(a): j-cluster indices derived from i-cluster index a
+//   CI_BASE_INDEX(a,b) / CJ_BASE_INDEX(a,b): array offset of cluster a when
+//   every slot holds b values (b is 1 or 3 in the wrappers further down)
+#if CLUSTER_M == CLUSTER_N
+#   define CJ0_FROM_CI(a)           (a)
+#   define CJ1_FROM_CI(a)           (a)
+#   define CI_BASE_INDEX(a,b)       ((a) * CLUSTER_N * (b))
+#   define CJ_BASE_INDEX(a,b)       ((a) * CLUSTER_N * (b))
+#elif CLUSTER_M == CLUSTER_N * 2 // M > N
+// i-cluster a maps to j-clusters 2a and 2a + 1; an odd j-cluster starts
+// CLUSTER_M / 2 entries into the block of i-cluster a / 2
+#   define CJ0_FROM_CI(a)           ((a) << 1)
+#   define CJ1_FROM_CI(a)           (((a) << 1) | 0x1)
+#   define CI_BASE_INDEX(a,b)       ((a) * CLUSTER_M * (b))
+#   define CJ_BASE_INDEX(a,b)       (((a) >> 1) * CLUSTER_M * (b) + ((a) & 0x1) * (CLUSTER_M >> 1))
+#elif CLUSTER_M == CLUSTER_N / 2 // M < N
+// i-clusters 2a and 2a + 1 both map to j-cluster a; an odd i-cluster starts
+// CLUSTER_N / 2 entries into the block of j-cluster a / 2
+#   define CJ0_FROM_CI(a)           ((a) >> 1)
+#   define CJ1_FROM_CI(a)           ((a) >> 1)
+#   define CI_BASE_INDEX(a,b)       (((a) >> 1) * CLUSTER_N * (b) + ((a) & 0x1) * (CLUSTER_N >> 1))
+#   define CJ_BASE_INDEX(a,b)       ((a) * CLUSTER_N * (b))
+#else
+#   error "Invalid cluster configuration!"
+#endif
+
+#if CLUSTER_N != 2 && CLUSTER_N != 4 && CLUSTER_N != 8
+#   error "Cluster N dimension can be only 2, 4 and 8"
+#endif
+
+// Base offsets for arrays with one value per slot (scalar) and three values
+// per slot (vector)
+#define CI_SCALAR_BASE_INDEX(a)     (CI_BASE_INDEX(a, 1))
+#define CI_VECTOR_BASE_INDEX(a)     (CI_BASE_INDEX(a, 3))
+#define CJ_SCALAR_BASE_INDEX(a)     (CJ_BASE_INDEX(a, 1))
+#define CJ_VECTOR_BASE_INDEX(a)     (CJ_BASE_INDEX(a, 3))
+
+// Offsets of the x, y and z rows relative to a cluster's vector base index;
+// the row length is the larger of CLUSTER_M and CLUSTER_N.
+// NOTE(review): presumably applied to cl_x, cl_v and cl_f -- confirm against the kernels
+#if CLUSTER_M >= CLUSTER_N
+#   define CL_X_OFFSET              (0 * CLUSTER_M)
+#   define CL_Y_OFFSET              (1 * CLUSTER_M)
+#   define CL_Z_OFFSET              (2 * CLUSTER_M)
+#else
+#   define CL_X_OFFSET              (0 * CLUSTER_N)
+#   define CL_Y_OFFSET              (1 * CLUSTER_N)
+#   define CL_Z_OFFSET              (2 * CLUSTER_N)
+#endif
+
+// Descriptor of one cluster.
+// NOTE(review): field meanings below are inferred from their names -- confirm
+typedef struct {
+    int natoms;              // presumably the number of atoms held by the cluster
+    MD_FLOAT bbminx, bbmaxx; // presumably the bounding box extent along x
+    MD_FLOAT bbminy, bbmaxy; // presumably the bounding box extent along y
+    MD_FLOAT bbminz, bbmaxz; // presumably the bounding box extent along z
+} Cluster;
+
+// Container for all atom and cluster data of the simulation.
+typedef struct {
+    // Atom counters; Nmax is the capacity of the per-atom arrays (see growAtom)
+    int Natoms, Nlocal, Nghost, Nmax;
+    // Cluster counters; Nclusters_max is the capacity of the cluster arrays (see growClusters)
+    int Nclusters, Nclusters_local, Nclusters_ghost, Nclusters_max;
+    // Positions; with AOS defined only x is used and holds three values per atom
+    MD_FLOAT *x, *y, *z;
+    // Velocities, one array per component
+    MD_FLOAT *vx, *vy, *vz;
+    int *border_map; // NOTE(review): not set in atom.c; purpose to be confirmed
+    int *type;       // per-atom type id
+    int ntypes;
+    // Parameter tables with ntypes * ntypes entries each
+    MD_FLOAT *epsilon;
+    MD_FLOAT *sigma6;
+    MD_FLOAT *cutforcesq;
+    MD_FLOAT *cutneighsq;
+    int *PBCx, *PBCy, *PBCz; // NOTE(review): not set in atom.c; purpose to be confirmed
+    // Data in cluster format
+    // cl_x, cl_v, cl_f: Nclusters_max * CLUSTER_M * 3 values; cl_type: Nclusters_max * CLUSTER_M values
+    MD_FLOAT *cl_x;
+    MD_FLOAT *cl_v;
+    MD_FLOAT *cl_f;
+    int *cl_type;
+    Cluster *iclusters, *jclusters;
+    int *icluster_bin;
+    int dummy_cj;
+    // Mask tables allocated and filled by initMasks()
+    MD_UINT *exclusion_filter;
+    MD_FLOAT *diagonal_4xn_j_minus_i;
+    MD_FLOAT *diagonal_2xnn_j_minus_i;
+    unsigned int masks_2xnn_hn[8];
+    unsigned int masks_2xnn_fn[8];
+    unsigned int masks_4xn_hn[16];
+    unsigned int masks_4xn_fn[16];
+} Atom;
+
+// Reset the Atom's array pointers and counters, then call initMasks()
+extern void initAtom(Atom*);
+// Allocate and fill the mask tables of the Atom
+extern void initMasks(Atom*);
+// Generate a lattice of atoms from the parameters
+extern void createAtom(Atom*, Parameter*);
+// Read atoms from param->input_file, dispatching on its extension (.pdb, .gro, .dmp);
+// the readers return the number of atoms and exit on input errors
+extern int readAtom(Atom*, Parameter*);
+extern int readAtom_pdb(Atom*, Parameter*);
+extern int readAtom_gro(Atom*, Parameter*);
+extern int readAtom_dmp(Atom*, Parameter*);
+// Enlarge the per-atom arrays by DELTA entries
+extern void growAtom(Atom*);
+// Enlarge the cluster arrays by DELTA clusters
+extern void growClusters(Atom*);
+
+// Position accessors. They expand to expressions on a variable named `atom`,
+// which must be in scope at the point of use. With AOS defined the three
+// components of atom i are adjacent in atom->x; otherwise each component
+// has its own array.
+#ifdef AOS
+#   define POS_DATA_LAYOUT     "AoS"
+#   define atom_x(i)           atom->x[(i) * 3 + 0]
+#   define atom_y(i)           atom->x[(i) * 3 + 1]
+#   define atom_z(i)           atom->x[(i) * 3 + 2]
+/*
+#   define atom_vx(i)          atom->vx[(i) * 3 + 0]
+#   define atom_vy(i)          atom->vx[(i) * 3 + 1]
+#   define atom_vz(i)          atom->vx[(i) * 3 + 2]
+#   define atom_fx(i)          atom->fx[(i) * 3 + 0]
+#   define atom_fy(i)          atom->fx[(i) * 3 + 1]
+#   define atom_fz(i)          atom->fx[(i) * 3 + 2]
+*/
+#else
+#   define POS_DATA_LAYOUT     "SoA"
+#   define atom_x(i)           atom->x[i]
+#   define atom_y(i)           atom->y[i]
+#   define atom_z(i)           atom->z[i]
+#endif
+
+// TODO: allow to switch velocities and forces to AoS
+// NOTE(review): atom_fx/atom_fy/atom_fz refer to members fx, fy, fz that the
+// Atom struct in this header does not declare; using them will not compile
+#   define atom_vx(i)          atom->vx[i]
+#   define atom_vy(i)          atom->vy[i]
+#   define atom_vz(i)          atom->vz[i]
+#   define atom_fx(i)          atom->fx[i]
+#   define atom_fy(i)          atom->fy[i]
+#   define atom_fz(i)          atom->fz[i]
+
+#endif
--- a/src/clusterpair/cuda/force_lj.cu
+++ b/src/clusterpair/cuda/force_lj.cu
@@ -0,0 +1,317 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+extern "C" {
+
+#include <stdio.h>
+//---
+#include <cuda.h>
+#include <driver_types.h>
+//---
+#include <likwid-marker.h>
+//---
+#include <atom.h>
+#include <device.h>
+#include <neighbor.h>
+#include <parameter.h>
+#include <stats.h>
+#include <timing.h>
+#include <util.h>
+
+}
+
+// Buffers shared by the CUDA cluster-pair routines in this file. They are
+// defined here with C linkage and without `static`, so they are visible to
+// other translation units.
+extern "C" {
+    MD_FLOAT *cuda_cl_x;                    // device mirror of atom->cl_x
+    MD_FLOAT *cuda_cl_v;                    // device mirror of atom->cl_v
+    MD_FLOAT *cuda_cl_f;                    // device mirror of atom->cl_f
+    int *cuda_neighbors;                    // device neighbor lists, row stride neighbor->maxneighs
+    int *cuda_numneigh;                     // device neighbor count per i-cluster
+    int *cuda_natoms;                       // device atom count per local i-cluster
+    int *natoms;                            // host staging array for cuda_natoms
+    int *ngatoms;                           // host staging array for cuda_jclusters_natoms
+    int *cuda_border_map;                   // device copy of atom->border_map
+    int *cuda_jclusters_natoms;             // device atom count per ghost j-cluster
+    // Not referenced by the code visible in this file section.
+    MD_FLOAT *cuda_bbminx, *cuda_bbmaxx;
+    MD_FLOAT *cuda_bbminy, *cuda_bbmaxy;
+    MD_FLOAT *cuda_bbminz, *cuda_bbmaxz;
+    int *cuda_PBCx, *cuda_PBCy, *cuda_PBCz; // device copies of atom->PBCx/PBCy/PBCz
+    int isReneighboured;                    // non-zero: neighbor lists must be re-uploaded before the next force call
+}
+
+/*
+ * Resets and selects CUDA device 0, then allocates every device buffer plus
+ * the two host staging arrays used by this file. All sizes are taken from
+ * atom->Nclusters_max (and neighbor->maxneighs for the neighbor lists) as
+ * they are at the time of the call. Also flags the neighbor lists as needing
+ * an upload.
+ */
+extern "C"
+void initDevice(Atom *atom, Neighbor *neighbor) {
+    cuda_assert("cudaDeviceSetup", cudaDeviceReset());
+    cuda_assert("cudaDeviceSetup", cudaSetDevice(0));
+
+    // Byte counts shared by several buffers.
+    const size_t vec_bytes = atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT);
+    const size_t int_bytes = atom->Nclusters_max * sizeof(int);
+    const size_t neigh_bytes = atom->Nclusters_max * neighbor->maxneighs * sizeof(int);
+
+    // Per-cluster vector data (positions, velocities, forces).
+    cuda_cl_x = (MD_FLOAT *) allocateGPU(vec_bytes);
+    cuda_cl_v = (MD_FLOAT *) allocateGPU(vec_bytes);
+    cuda_cl_f = (MD_FLOAT *) allocateGPU(vec_bytes);
+
+    // Per-cluster integer data.
+    cuda_natoms = (int *) allocateGPU(int_bytes);
+    cuda_jclusters_natoms = (int *) allocateGPU(int_bytes);
+    cuda_border_map = (int *) allocateGPU(int_bytes);
+    cuda_PBCx = (int *) allocateGPU(int_bytes);
+    cuda_PBCy = (int *) allocateGPU(int_bytes);
+    cuda_PBCz = (int *) allocateGPU(int_bytes);
+    cuda_numneigh = (int *) allocateGPU(int_bytes);
+    cuda_neighbors = (int *) allocateGPU(neigh_bytes);
+
+    // Host-side staging arrays for the atom counts.
+    natoms = (int *) malloc(int_bytes);
+    ngatoms = (int *) malloc(int_bytes);
+
+    isReneighboured = 1;
+}
+
+/*
+ * Uploads the cluster positions, velocities and forces, the atom count of
+ * every local i-cluster, and the per-ghost-cluster data (atom counts, border
+ * map, periodic image flags) to the device buffers.
+ */
+extern "C"
+void copyDataToCUDADevice(Atom *atom) {
+    const size_t vec_bytes = atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT);
+    const int nlocal = atom->Nclusters_local;
+    const int nghost = atom->Nclusters_ghost;
+
+    memcpyToGPU(cuda_cl_x, atom->cl_x, vec_bytes);
+    memcpyToGPU(cuda_cl_v, atom->cl_v, vec_bytes);
+    memcpyToGPU(cuda_cl_f, atom->cl_f, vec_bytes);
+
+    // Stage the i-cluster atom counts on the host, then upload them in one go.
+    for(int i = 0; i < nlocal; i++) {
+        natoms[i] = atom->iclusters[i].natoms;
+    }
+    memcpyToGPU(cuda_natoms, natoms, nlocal * sizeof(int));
+
+    // Ghost j-clusters are stored right after the local j-clusters.
+    const int first_ghost_cj = nlocal / MAX(1, CLUSTER_N / CLUSTER_M);
+    for(int g = 0; g < nghost; g++) {
+        ngatoms[g] = atom->jclusters[first_ghost_cj + g].natoms;
+    }
+
+    const size_t ghost_bytes = nghost * sizeof(int);
+    memcpyToGPU(cuda_jclusters_natoms, ngatoms, ghost_bytes);
+    memcpyToGPU(cuda_border_map, atom->border_map, ghost_bytes);
+    memcpyToGPU(cuda_PBCx, atom->PBCx, ghost_bytes);
+    memcpyToGPU(cuda_PBCy, atom->PBCy, ghost_bytes);
+    memcpyToGPU(cuda_PBCz, atom->PBCz, ghost_bytes);
+}
+
+/* Downloads cluster positions, velocities and forces from the device into atom. */
+extern "C"
+void copyDataFromCUDADevice(Atom *atom) {
+    const size_t vec_bytes = atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT);
+
+    memcpyFromGPU(atom->cl_x, cuda_cl_x, vec_bytes);
+    memcpyFromGPU(atom->cl_v, cuda_cl_v, vec_bytes);
+    memcpyFromGPU(atom->cl_f, cuda_cl_f, vec_bytes);
+}
+
+/* Releases the device buffers and host staging arrays allocated by initDevice. */
+extern "C"
+void cudaDeviceFree() {
+    // Same release order as the buffers were historically freed in.
+    void *device_buffers[] = {
+        cuda_cl_x,
+        cuda_cl_v,
+        cuda_cl_f,
+        cuda_numneigh,
+        cuda_neighbors,
+        cuda_natoms,
+        cuda_border_map,
+        cuda_jclusters_natoms,
+        cuda_PBCx,
+        cuda_PBCy,
+        cuda_PBCz
+    };
+    const size_t nbuffers = sizeof(device_buffers) / sizeof(device_buffers[0]);
+
+    for(size_t b = 0; b < nbuffers; b++) {
+        cuda_assert("cudaDeviceFree", cudaFree(device_buffers[b]));
+    }
+
+    free(natoms);
+    free(ngatoms);
+}
+
+/*
+ * First integration half-step on the device: one thread per local i-cluster
+ * advances the velocities by dtforce * f and then the positions by dt * v
+ * for each of the cluster's cuda_natoms[ci] atoms.
+ */
+__global__ void cudaInitialIntegrate_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
+                                         int *cuda_natoms,
+                                         int Nclusters_local, MD_FLOAT dtforce, MD_FLOAT dt) {
+
+    const unsigned int cluster = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads beyond the last cluster (grid is rounded up) have nothing to do.
+    if (cluster < Nclusters_local) {
+        const int base = CI_VECTOR_BASE_INDEX(cluster);
+        MD_FLOAT *pos = cuda_cl_x + base;
+        MD_FLOAT *vel = cuda_cl_v + base;
+        MD_FLOAT *frc = cuda_cl_f + base;
+        const int count = cuda_natoms[cluster];
+
+        for (int a = 0; a < count; a++) {
+            vel[CL_X_OFFSET + a] += dtforce * frc[CL_X_OFFSET + a];
+            vel[CL_Y_OFFSET + a] += dtforce * frc[CL_Y_OFFSET + a];
+            vel[CL_Z_OFFSET + a] += dtforce * frc[CL_Z_OFFSET + a];
+            pos[CL_X_OFFSET + a] += dt * vel[CL_X_OFFSET + a];
+            pos[CL_Y_OFFSET + a] += dt * vel[CL_Y_OFFSET + a];
+            pos[CL_Z_OFFSET + a] += dt * vel[CL_Z_OFFSET + a];
+        }
+    }
+}
+
+/*
+ * Refreshes ghost cluster positions on the device: one thread per ghost
+ * cluster copies the positions of the cluster named by cuda_border_map and
+ * shifts them by the periodic image flags times the box lengths.
+ */
+__global__ void cudaUpdatePbc_warp(MD_FLOAT *cuda_cl_x, int *cuda_border_map,
+                                   int *cuda_jclusters_natoms,
+                                   int *cuda_PBCx,
+                                   int *cuda_PBCy,
+                                   int *cuda_PBCz,
+                                   int Nclusters_local,
+                                   int Nclusters_ghost,
+                                   MD_FLOAT param_xprd,
+                                   MD_FLOAT param_yprd,
+                                   MD_FLOAT param_zprd) {
+    const unsigned int ghost = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads beyond the last ghost cluster (grid is rounded up) do nothing.
+    if (ghost < Nclusters_ghost) {
+        // Ghost j-clusters are stored right after the local j-clusters.
+        const int first_ghost_cj = Nclusters_local / MAX(1, CLUSTER_N / CLUSTER_M);
+        const int cj = first_ghost_cj + ghost;
+
+        MD_FLOAT *dst = cuda_cl_x + CJ_VECTOR_BASE_INDEX(cj);
+        MD_FLOAT *src = cuda_cl_x + CJ_VECTOR_BASE_INDEX(cuda_border_map[ghost]);
+        const int count = cuda_jclusters_natoms[ghost];
+
+        for(int a = 0; a < count; a++) {
+            dst[CL_X_OFFSET + a] = src[CL_X_OFFSET + a] + cuda_PBCx[ghost] * param_xprd;
+            dst[CL_Y_OFFSET + a] = src[CL_Y_OFFSET + a] + cuda_PBCy[ghost] * param_yprd;
+            dst[CL_Z_OFFSET + a] = src[CL_Z_OFFSET + a] + cuda_PBCz[ghost] * param_zprd;
+        }
+    }
+}
+
+/*
+ * Lennard-Jones force kernel for the cluster-pair layout.
+ *
+ * Each thread handles one (i-cluster, i-atom, j-atom slot) triple taken from
+ * the x, y and z thread indices, and walks the neighbor list of its i-cluster.
+ * Several threads contribute to the same force entry (all j slots of one
+ * i-atom, and with half_neigh also the j-side reaction force), which is why
+ * every accumulation goes through atomicAdd.
+ *
+ * cuda_neighs is indexed with a row stride of maxneighs; cuda_numneigh holds
+ * the number of valid entries per row. Nclusters_max is currently unused.
+ */
+__global__ void computeForceLJ_cuda_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_f,
+                                         int Nclusters_local, int Nclusters_max,
+                                         int *cuda_numneigh, int *cuda_neighs, int half_neigh, int maxneighs,
+                                         MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon) {
+
+    unsigned int ci_pos = blockDim.x * blockIdx.x + threadIdx.x;
+    unsigned int cii_pos = blockDim.y * blockIdx.y + threadIdx.y;
+    unsigned int cjj_pos = blockDim.z * blockIdx.z + threadIdx.z;
+    if ((ci_pos >= Nclusters_local) || (cii_pos >= CLUSTER_M) || (cjj_pos >= CLUSTER_N)) return;
+
+    int ci_cj0 = CJ0_FROM_CI(ci_pos);
+    int ci_vec_base = CI_VECTOR_BASE_INDEX(ci_pos);
+    MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base];
+    MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];
+
+    // The i-atom position does not change while walking the neighbor list
+    // (this kernel never writes cuda_cl_x), so load it once.
+    const MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii_pos];
+    const MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii_pos];
+    const MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii_pos];
+
+    const int numneighs = cuda_numneigh[ci_pos];
+    const int *neighs = &cuda_neighs[ci_pos * maxneighs];
+
+    for(int k = 0; k < numneighs; k++) {
+        int cj = neighs[k];
+        int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
+        MD_FLOAT *cj_x = &cuda_cl_x[cj_vec_base];
+        MD_FLOAT *cj_f = &cuda_cl_f[cj_vec_base];
+
+        // Skip the self pair (and, with half lists, the mirrored pair) when
+        // the j-cluster is the one that contains this i-cluster's atoms.
+        int cond;
+#if CLUSTER_M == CLUSTER_N
+        cond = half_neigh ? (ci_cj0 != cj || cii_pos < cjj_pos) :
+                            (ci_cj0 != cj || cii_pos != cjj_pos);
+#elif CLUSTER_M < CLUSTER_N
+        cond = half_neigh ? (ci_cj0 != cj || cii_pos + CLUSTER_M * (ci_pos & 0x1) < cjj_pos) :
+                            (ci_cj0 != cj || cii_pos + CLUSTER_M * (ci_pos & 0x1) != cjj_pos);
+#else
+        // Previously `cond` was silently left uninitialised in this configuration.
+#   error "computeForceLJ_cuda_warp: CLUSTER_M > CLUSTER_N is not handled by the CUDA kernel"
+#endif
+        if(cond) {
+            MD_FLOAT delx = xtmp - cj_x[CL_X_OFFSET + cjj_pos];
+            MD_FLOAT dely = ytmp - cj_x[CL_Y_OFFSET + cjj_pos];
+            MD_FLOAT delz = ztmp - cj_x[CL_Z_OFFSET + cjj_pos];
+            MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+            if(rsq < cutforcesq) {
+                MD_FLOAT sr2 = 1.0 / rsq;
+                MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
+                MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
+
+                if(half_neigh) {
+                    // Reaction force on the j-atom; the pair is visited only once.
+                    atomicAdd(&cj_f[CL_X_OFFSET + cjj_pos], -delx * force);
+                    atomicAdd(&cj_f[CL_Y_OFFSET + cjj_pos], -dely * force);
+                    atomicAdd(&cj_f[CL_Z_OFFSET + cjj_pos], -delz * force);
+                }
+
+                atomicAdd(&ci_f[CL_X_OFFSET + cii_pos], delx * force);
+                atomicAdd(&ci_f[CL_Y_OFFSET + cii_pos], dely * force);
+                atomicAdd(&ci_f[CL_Z_OFFSET + cii_pos], delz * force);
+            }
+        }
+    }
+}
+
+/*
+ * Second integration half-step on the device: one thread per local i-cluster
+ * advances the velocities of its cuda_natoms[ci] atoms by dtforce * f.
+ */
+__global__ void cudaFinalIntegrate_warp(MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
+                                          int *cuda_natoms,
+                                          int Nclusters_local, MD_FLOAT dtforce) {
+
+    const unsigned int cluster = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // Threads beyond the last cluster (grid is rounded up) have nothing to do.
+    if (cluster < Nclusters_local) {
+        const int base = CI_VECTOR_BASE_INDEX(cluster);
+        MD_FLOAT *vel = cuda_cl_v + base;
+        MD_FLOAT *frc = cuda_cl_f + base;
+        const int count = cuda_natoms[cluster];
+
+        for (int a = 0; a < count; a++) {
+            vel[CL_X_OFFSET + a] += dtforce * frc[CL_X_OFFSET + a];
+            vel[CL_Y_OFFSET + a] += dtforce * frc[CL_Y_OFFSET + a];
+            vel[CL_Z_OFFSET + a] += dtforce * frc[CL_Z_OFFSET + a];
+        }
+    }
+}
+
+/*
+ * Launches the first integration half-step on the device with one thread per
+ * local i-cluster and waits for the kernel to finish.
+ */
+extern "C"
+void cudaInitialIntegrate(Parameter *param, Atom *atom) {
+    const int threads_per_block = 16;
+    const int nlocal = atom->Nclusters_local;
+
+    // The grid is rounded up; the kernel discards the surplus threads.
+    const dim3 block_dim(threads_per_block, 1, 1);
+    const dim3 grid_dim(nlocal / threads_per_block + 1, 1, 1);
+
+    cudaInitialIntegrate_warp<<<grid_dim, block_dim>>>(cuda_cl_x, cuda_cl_v, cuda_cl_f,
+                                                       cuda_natoms, nlocal,
+                                                       param->dtforce, param->dt);
+    cuda_assert("cudaInitialIntegrate", cudaPeekAtLastError());
+    cuda_assert("cudaInitialIntegrate", cudaDeviceSynchronize());
+}
+
+/*
+ * Updates the coordinates of ghost atoms on the device (uses the mapping
+ * created in setupPbc) with one thread per ghost cluster, then waits for the
+ * kernel to finish.
+ */
+extern "C"
+void cudaUpdatePbc(Atom *atom, Parameter *param) {
+    const int threads_per_block = 512;
+    const int nghost = atom->Nclusters_ghost;
+
+    // The grid is rounded up; the kernel discards the surplus threads.
+    const dim3 block_dim(threads_per_block, 1, 1);
+    const dim3 grid_dim(nghost / threads_per_block + 1, 1, 1);
+
+    cudaUpdatePbc_warp<<<grid_dim, block_dim>>>(cuda_cl_x, cuda_border_map,
+                                                cuda_jclusters_natoms, cuda_PBCx, cuda_PBCy, cuda_PBCz,
+                                                atom->Nclusters_local, nghost,
+                                                param->xprd, param->yprd, param->zprd);
+    cuda_assert("cudaUpdatePbc", cudaPeekAtLastError());
+    cuda_assert("cudaUpdatePbc", cudaDeviceSynchronize());
+}
+
+/*
+ * Host driver for the Lennard-Jones force kernel.
+ *
+ * Clears the device force buffer, uploads the neighbor lists if they were
+ * flagged as rebuilt (isReneighboured), launches the kernel with one block
+ * per local i-cluster (CLUSTER_M x CLUSTER_N threads each) and waits for it.
+ *
+ * Returns the wall-clock time spent in the kernel launch and synchronisation;
+ * the memset and neighbor upload are not included. `stats` is unused.
+ *
+ * NOTE(review): cuda_neighbors is sized in initDevice from the maxneighs value
+ * at that time; confirm neighbor->maxneighs cannot grow afterwards.
+ */
+extern "C"
+double computeForceLJ_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
+    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
+    MD_FLOAT sigma6 = param->sigma6;
+    MD_FLOAT epsilon = param->epsilon;
+
+    memsetGPU(cuda_cl_f, 0, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
+    if (isReneighboured) {
+        // The counts are contiguous, so upload them with a single transfer
+        // instead of one tiny host-to-device copy per cluster.
+        memcpyToGPU(cuda_numneigh, neighbor->numneigh, atom->Nclusters_local * sizeof(int));
+
+        // Only the valid prefix of each neighbor row is uploaded.
+        for(int ci = 0; ci < atom->Nclusters_local; ci++) {
+            memcpyToGPU(&cuda_neighbors[ci * neighbor->maxneighs], &neighbor->neighbors[ci * neighbor->maxneighs], neighbor->numneigh[ci] * sizeof(int));
+        }
+
+        isReneighboured = 0;
+    }
+
+    const int threads_num = 1;
+    dim3 block_size = dim3(threads_num, CLUSTER_M, CLUSTER_N);
+    // Rounded up; the kernel discards blocks beyond the last cluster.
+    dim3 grid_size = dim3(atom->Nclusters_local/threads_num+1, 1, 1);
+    double S = getTimeStamp();
+    LIKWID_MARKER_START("force");
+    computeForceLJ_cuda_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_cl_f,
+                                                        atom->Nclusters_local, atom->Nclusters_max,
+                                                        cuda_numneigh, cuda_neighbors,
+                                                        neighbor->half_neigh, neighbor->maxneighs, cutforcesq,
+                                                        sigma6, epsilon);
+    cuda_assert("computeForceLJ_cuda", cudaPeekAtLastError());
+    cuda_assert("computeForceLJ_cuda", cudaDeviceSynchronize());
+    LIKWID_MARKER_STOP("force");
+    double E = getTimeStamp();
+    return E-S;
+}
+
+/*
+ * Launches the second integration half-step on the device with one thread
+ * per local i-cluster and waits for the kernel to finish.
+ */
+extern "C"
+void cudaFinalIntegrate(Parameter *param, Atom *atom) {
+    const int threads_num = 16;
+    dim3 block_size = dim3(threads_num, 1, 1);
+    // Rounded up; the kernel discards threads beyond the last cluster.
+    dim3 grid_size = dim3(atom->Nclusters_local/(threads_num)+1, 1, 1);
+    // The kernel's last argument is the velocity scaling factor dtforce (as in
+    // cpuFinalIntegrate and cudaInitialIntegrate), not the time step dt.
+    cudaFinalIntegrate_warp<<<grid_size, block_size>>>(cuda_cl_v, cuda_cl_f, cuda_natoms, atom->Nclusters_local, param->dtforce);
+    cuda_assert("cudaFinalIntegrate", cudaPeekAtLastError());
+    cuda_assert("cudaFinalIntegrate", cudaDeviceSynchronize());
+}
--- a/src/clusterpair/force_eam.c
+++ b/src/clusterpair/force_eam.c
@@ -0,0 +1,201 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <likwid-marker.h>
+#include <math.h>
+
+#include <allocate.h>
+#include <timing.h>
+#include <neighbor.h>
+#include <parameter.h>
+#include <atom.h>
+#include <stats.h>
+#include <eam.h>
+#include <util.h>
+
+/*
+ * EAM force entry point for the cluster-pair variant.
+ *
+ * The atom-based implementation below is entirely commented out, so the
+ * function currently computes no forces: it only opens and closes the two
+ * LIKWID regions and returns the elapsed wall-clock time between the two
+ * timestamps. All parameters are unused while the body stays disabled.
+ *
+ * The marker calls are kept live and paired (start/stop of the same region)
+ * so the LIKWID regions stay balanced and remain correct if the commented
+ * sections are re-enabled.
+ */
+double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbor, Stats *stats) {
+    /*
+    if(eam->nmax < atom->Nmax) {
+        eam->nmax = atom->Nmax;
+        if(eam->fp != NULL) { free(eam->fp); }
+        eam->fp = (MD_FLOAT *) allocate(ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT));
+    }
+
+    int Nlocal = atom->Nlocal;
+    int* neighs;
+    MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz; int ntypes = atom->ntypes; MD_FLOAT* fp = eam->fp;
+    MD_FLOAT* rhor_spline = eam->rhor_spline; MD_FLOAT* frho_spline = eam->frho_spline; MD_FLOAT* z2r_spline = eam->z2r_spline;
+    MD_FLOAT rdr = eam->rdr; int nr = eam->nr; int nr_tot = eam->nr_tot; MD_FLOAT rdrho = eam->rdrho;
+    int nrho = eam->nrho; int nrho_tot = eam->nrho_tot;
+    */
+    double S = getTimeStamp();
+
+    LIKWID_MARKER_START("force_eam_fp");
+    /*
+    #pragma omp parallel for
+    for(int i = 0; i < Nlocal; i++) {
+        neighs = &neighbor->neighbors[i * neighbor->maxneighs];
+        int numneighs = neighbor->numneigh[i];
+        MD_FLOAT xtmp = atom_x(i);
+        MD_FLOAT ytmp = atom_y(i);
+        MD_FLOAT ztmp = atom_z(i);
+        MD_FLOAT rhoi = 0;
+#ifdef EXPLICIT_TYPES
+        const int type_i = atom->type[i];
+#endif
+        #pragma ivdep
+        for(int k = 0; k < numneighs; k++) {
+            int j = neighs[k];
+            MD_FLOAT delx = xtmp - atom_x(j);
+            MD_FLOAT dely = ytmp - atom_y(j);
+            MD_FLOAT delz = ztmp - atom_z(j);
+            MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+#ifdef EXPLICIT_TYPES
+            const int type_j = atom->type[j];
+            const int type_ij = type_i * ntypes + type_j;
+            const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
+#else
+            const MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
+#endif
+            if(rsq < cutforcesq) {
+                MD_FLOAT p = sqrt(rsq) * rdr + 1.0;
+                int m = (int)(p);
+                m = m < nr - 1 ? m : nr - 1;
+                p -= m;
+                p = p < 1.0 ? p : 1.0;
+#ifdef EXPLICIT_TYPES
+                rhoi += ((rhor_spline[type_ij * nr_tot + m * 7 + 3] * p +
+                          rhor_spline[type_ij * nr_tot + m * 7 + 4]) * p +
+                          rhor_spline[type_ij * nr_tot + m * 7 + 5]) * p +
+                          rhor_spline[type_ij * nr_tot + m * 7 + 6];
+#else
+                rhoi += ((rhor_spline[m * 7 + 3] * p +
+                          rhor_spline[m * 7 + 4]) * p +
+                          rhor_spline[m * 7 + 5]) * p +
+                          rhor_spline[m * 7 + 6];
+#endif
+            }
+        }
+
+#ifdef EXPLICIT_TYPES
+        const int type_ii = type_i * type_i;
+#endif
+        MD_FLOAT p = 1.0 * rhoi * rdrho + 1.0;
+        int m = (int)(p);
+        m = MAX(1, MIN(m, nrho - 1));
+        p -= m;
+        p = MIN(p, 1.0);
+#ifdef EXPLICIT_TYPES
+        fp[i] = (frho_spline[type_ii * nrho_tot + m * 7 + 0] * p +
+                 frho_spline[type_ii * nrho_tot + m * 7 + 1]) * p +
+                 frho_spline[type_ii * nrho_tot + m * 7 + 2];
+#else
+        fp[i] = (frho_spline[m * 7 + 0] * p + frho_spline[m * 7 + 1]) * p + frho_spline[m * 7 + 2];
+#endif
+    }
+    */
+    LIKWID_MARKER_STOP("force_eam_fp");
+
+    /*
+    // We still need to update fp for PBC atoms
+    for(int i = 0; i < atom->Nghost; i++) {
+        fp[Nlocal + i] = fp[atom->border_map[i]];
+    }
+    */
+
+    LIKWID_MARKER_START("force_eam");
+    /*
+    for(int i = 0; i < Nlocal; i++) {
+        neighs = &neighbor->neighbors[i * neighbor->maxneighs];
+        int numneighs = neighbor->numneigh[i];
+        MD_FLOAT xtmp = atom_x(i);
+        MD_FLOAT ytmp = atom_y(i);
+        MD_FLOAT ztmp = atom_z(i);
+        MD_FLOAT fix = 0;
+        MD_FLOAT fiy = 0;
+        MD_FLOAT fiz = 0;
+#ifdef EXPLICIT_TYPES
+        const int type_i = atom->type[i];
+#endif
+
+        #pragma ivdep
+        for(int k = 0; k < numneighs; k++) {
+            int j = neighs[k];
+            MD_FLOAT delx = xtmp - atom_x(j);
+            MD_FLOAT dely = ytmp - atom_y(j);
+            MD_FLOAT delz = ztmp - atom_z(j);
+            MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+#ifdef EXPLICIT_TYPES
+            const int type_j = atom->type[j];
+            const int type_ij = type_i * ntypes + type_j;
+            const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
+#else
+            const MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
+#endif
+
+            if(rsq < cutforcesq) {
+                MD_FLOAT r = sqrt(rsq);
+                MD_FLOAT p = r * rdr + 1.0;
+                int m = (int)(p);
+                m = m < nr - 1 ? m : nr - 1;
+                p -= m;
+                p = p < 1.0 ? p : 1.0;
+
+
+                // rhoip = derivative of (density at atom j due to atom i)
+                // rhojp = derivative of (density at atom i due to atom j)
+                // phi = pair potential energy
+                // phip = phi'
+                // z2 = phi * r
+                // z2p = (phi * r)' = (phi' r) + phi
+                // psip needs both fp[i] and fp[j] terms since r_ij appears in two
+                //   terms of embed eng: Fi(sum rho_ij) and Fj(sum rho_ji)
+                //   hence embed' = Fi(sum rho_ij) rhojp + Fj(sum rho_ji) rhoip
+
+#ifdef EXPLICIT_TYPES
+                MD_FLOAT rhoip = (rhor_spline[type_ij * nr_tot + m * 7 + 0] * p +
+                                  rhor_spline[type_ij * nr_tot + m * 7 + 1]) * p +
+                                  rhor_spline[type_ij * nr_tot + m * 7 + 2];
+
+                MD_FLOAT z2p = (z2r_spline[type_ij * nr_tot + m * 7 + 0] * p +
+                                z2r_spline[type_ij * nr_tot + m * 7 + 1]) * p +
+                                z2r_spline[type_ij * nr_tot + m * 7 + 2];
+
+                MD_FLOAT z2 = ((z2r_spline[type_ij * nr_tot + m * 7 + 3] * p +
+                                z2r_spline[type_ij * nr_tot + m * 7 + 4]) * p +
+                                z2r_spline[type_ij * nr_tot + m * 7 + 5]) * p +
+                                z2r_spline[type_ij * nr_tot + m * 7 + 6];
+#else
+                MD_FLOAT rhoip = (rhor_spline[m * 7 + 0] * p + rhor_spline[m * 7 + 1]) * p + rhor_spline[m * 7 + 2];
+                MD_FLOAT z2p = (z2r_spline[m * 7 + 0] * p + z2r_spline[m * 7 + 1]) * p + z2r_spline[m * 7 + 2];
+                MD_FLOAT z2 = ((z2r_spline[m * 7 + 3] * p +
+                                z2r_spline[m * 7 + 4]) * p +
+                                z2r_spline[m * 7 + 5]) * p +
+                                z2r_spline[m * 7 + 6];
+#endif
+
+                MD_FLOAT recip = 1.0 / r;
+                MD_FLOAT phi = z2 * recip;
+                MD_FLOAT phip = z2p * recip - phi * recip;
+                MD_FLOAT psip = fp[i] * rhoip + fp[j] * rhoip + phip;
+                MD_FLOAT fpair = -psip * recip;
+
+                fix += delx * fpair;
+                fiy += dely * fpair;
+                fiz += delz * fpair;
+                //fpair *= 0.5;
+            }
+        }
+
+        fx[i] = fix;
+        fy[i] = fiy;
+        fz[i] = fiz;
+        addStat(stats->total_force_neighs, numneighs);
+        addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
+    }
+
+    */
+    LIKWID_MARKER_STOP("force_eam");
+    double E = getTimeStamp();
+    return E-S;
+}
--- a/src/clusterpair/force_lj.c
+++ b/src/clusterpair/force_lj.c
--- a/src/clusterpair/integrate.h
+++ b/src/clusterpair/integrate.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdbool.h>
+//---
+#include <atom.h>
+#include <parameter.h>
+#include <util.h>
+
+/*
+ * First integration half-step on the CPU: for every atom of every local
+ * i-cluster, advance the velocity by dtforce * f and then the position by
+ * dt * v.
+ */
+void cpuInitialIntegrate(Parameter *param, Atom *atom) {
+    DEBUG_MESSAGE("cpuInitialIntegrate start\n");
+
+    for(int cluster = 0; cluster < atom->Nclusters_local; cluster++) {
+        const int base = CI_VECTOR_BASE_INDEX(cluster);
+        MD_FLOAT *pos = atom->cl_x + base;
+        MD_FLOAT *vel = atom->cl_v + base;
+        MD_FLOAT *frc = atom->cl_f + base;
+        const int count = atom->iclusters[cluster].natoms;
+
+        for(int a = 0; a < count; a++) {
+            vel[CL_X_OFFSET + a] += param->dtforce * frc[CL_X_OFFSET + a];
+            vel[CL_Y_OFFSET + a] += param->dtforce * frc[CL_Y_OFFSET + a];
+            vel[CL_Z_OFFSET + a] += param->dtforce * frc[CL_Z_OFFSET + a];
+            pos[CL_X_OFFSET + a] += param->dt * vel[CL_X_OFFSET + a];
+            pos[CL_Y_OFFSET + a] += param->dt * vel[CL_Y_OFFSET + a];
+            pos[CL_Z_OFFSET + a] += param->dt * vel[CL_Z_OFFSET + a];
+        }
+    }
+
+    DEBUG_MESSAGE("cpuInitialIntegrate end\n");
+}
+
+/*
+ * Second integration half-step on the CPU: for every atom of every local
+ * i-cluster, advance the velocity by dtforce * f.
+ */
+void cpuFinalIntegrate(Parameter *param, Atom *atom) {
+    DEBUG_MESSAGE("cpuFinalIntegrate start\n");
+
+    for(int cluster = 0; cluster < atom->Nclusters_local; cluster++) {
+        const int base = CI_VECTOR_BASE_INDEX(cluster);
+        MD_FLOAT *vel = atom->cl_v + base;
+        MD_FLOAT *frc = atom->cl_f + base;
+        const int count = atom->iclusters[cluster].natoms;
+
+        for(int a = 0; a < count; a++) {
+            vel[CL_X_OFFSET + a] += param->dtforce * frc[CL_X_OFFSET + a];
+            vel[CL_Y_OFFSET + a] += param->dtforce * frc[CL_Y_OFFSET + a];
+            vel[CL_Z_OFFSET + a] += param->dtforce * frc[CL_Z_OFFSET + a];
+        }
+    }
+
+    DEBUG_MESSAGE("cpuFinalIntegrate end\n");
+}
+
+#ifdef CUDA_TARGET
+/* Device counterparts of the CPU integrators; only declared for CUDA builds. */
+void cudaInitialIntegrate(Parameter *param, Atom *atom);
+void cudaFinalIntegrate(Parameter *param, Atom *atom);
+#endif
--- a/src/clusterpair/main-stub.c
+++ b/src/clusterpair/main-stub.c
@@ -0,0 +1,344 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+//---
+#include <likwid-marker.h>
+//---
+#include <timing.h>
+#include <allocate.h>
+#include <neighbor.h>
+#include <parameter.h>
+#include <atom.h>
+#include <stats.h>
+#include <thermo.h>
+#include <eam.h>
+#include <pbc.h>
+#include <timers.h>
+#include <util.h>
+
+// Horizontal rule printed around the help text.
+#define HLINE "----------------------------------------------------------------------------\n"
+
+// Force kernels implemented in other translation units; each returns a double
+// (the CPU/CUDA LJ variants visible elsewhere return elapsed time).
+extern double computeForceLJ_ref(Parameter*, Atom*, Neighbor*, Stats*);
+extern double computeForceLJ_4xn(Parameter*, Atom*, Neighbor*, Stats*);
+extern double computeForceLJ_2xnn(Parameter*, Atom*, Neighbor*, Stats*);
+extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
+
+// Patterns used by createNeighbors to fill the synthetic neighbor lists
+#define P_SEQ   0   // consecutive j-clusters starting at the i-cluster's own, wrapping at the j-cluster count
+#define P_FIX   1   // the same j-clusters 0..nneighs-1 for every i-cluster
+#define P_RAND  2   // nneighs distinct j-clusters drawn with rand()
+
+/* Fills *param with the default settings of the stub driver. */
+void init(Parameter *param) {
+    // Input files and force field selection
+    param->input_file = NULL;
+    param->eam_file = NULL;
+    param->force_field = FF_LJ;
+
+    // Potential parameters
+    param->epsilon = 1.0;
+    param->sigma6 = 1.0;
+    param->mass = 1.0;
+    param->ntypes = 4;
+
+    // System description
+    param->rho = 0.8442;
+    param->lattice = 1.0;
+    param->nx = 1;
+    param->ny = 1;
+    param->nz = 1;
+
+    // Cutoffs: the neighbor cutoff defaults to the force cutoff
+    param->cutforce = 1000000.0;
+    param->cutneigh = param->cutforce;
+    param->half_neigh = 0;
+
+    // Run control
+    param->ntimes = 200;
+    param->proc_freq = 2.4;
+
+    // Unused
+    param->dt = 0.005;
+    param->dtforce = 0.5 * param->dt;
+    param->nstat = 100;
+    param->temp = 1.44;
+    param->reneigh_every = 20;
+}
+
+/*
+ * Builds a synthetic neighbor list for every local i-cluster.
+ *
+ * atom     - supplies Nclusters_local (lists to fill) and Nclusters_max
+ *            (rows to allocate).
+ * neighbor - receives freshly malloc'ed numneigh, numneigh_masked, neighbors
+ *            and neighbors_imask arrays; maxneighs is set to nneighs * nreps,
+ *            which is the row stride of the two list arrays.
+ * pattern  - P_SEQ, P_FIX or P_RAND; selects how the first nneighs entries of
+ *            each row are chosen.
+ * nneighs  - number of distinct j-cluster neighbors per i-cluster.
+ * nreps    - how many times those nneighs entries are repeated in the row.
+ * masked   - if 1, all entries are counted in numneigh_masked; otherwise 0.
+ *
+ * Terminates the program when P_RAND is requested with fewer than
+ * nneighs + 1 j-clusters, because distinct random picks would be impossible
+ * or degenerate.
+ */
+void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps, int masked) {
+    const int maxneighs = nneighs * nreps;
+    const int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
+    const int ncj = atom->Nclusters_local / jfac;
+    const unsigned int imask = NBNXN_INTERACTION_MASK_ALL;
+
+    // The arrays below are allocated with a row length of maxneighs, so that
+    // must also be the stride every consumer uses. Publishing it here keeps
+    // the allocation and the indexing from going out of sync.
+    neighbor->maxneighs = maxneighs;
+    neighbor->numneigh = (int*) malloc(atom->Nclusters_max * sizeof(int));
+    neighbor->numneigh_masked = (int*) malloc(atom->Nclusters_max * sizeof(int));
+    neighbor->neighbors = (int*) malloc(atom->Nclusters_max * maxneighs * sizeof(int));
+    neighbor->neighbors_imask = (unsigned int*) malloc(atom->Nclusters_max * maxneighs * sizeof(unsigned int));
+
+    if(pattern == P_RAND && ncj <= nneighs) {
+        fprintf(stderr, "Error: P_RAND: Number of j-clusters should be higher than number of j-cluster neighbors per i-cluster!\n");
+        exit(-1);
+    }
+
+    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
+        int *neighptr = &(neighbor->neighbors[ci * maxneighs]);
+        unsigned int *neighptr_imask = &(neighbor->neighbors_imask[ci * maxneighs]);
+        // P_SEQ walks the j-clusters starting at the i-cluster's own one and
+        // wraps at ncj; P_FIX cycles through 0..nneighs-1.
+        int j = (pattern == P_SEQ) ? CJ0_FROM_CI(ci) : 0;
+        int m = (pattern == P_SEQ) ? ncj : nneighs;
+
+        for(int k = 0; k < nneighs; k++) {
+            if(pattern == P_RAND) {
+                // Redraw until the candidate differs from all earlier picks.
+                int found = 0;
+                do {
+                    int cj = rand() % ncj;
+                    neighptr[k] = cj;
+                    neighptr_imask[k] = imask;
+                    found = 0;
+                    for(int l = 0; l < k; l++) {
+                        if(neighptr[l] == cj) {
+                            found = 1;
+                        }
+                    }
+                } while(found == 1);
+            } else {
+                neighptr[k] = j;
+                neighptr_imask[k] = imask;
+                j = (j + 1) % m;
+            }
+        }
+
+        // Replicate the first nneighs entries to fill the remaining repetitions.
+        for(int r = 1; r < nreps; r++) {
+            for(int k = 0; k < nneighs; k++) {
+                neighptr[r * nneighs + k] = neighptr[k];
+                neighptr_imask[r * nneighs + k] = neighptr_imask[k];
+            }
+        }
+
+        neighbor->numneigh[ci] = nneighs * nreps;
+        neighbor->numneigh_masked[ci] = (masked == 1) ? (nneighs * nreps) : 0;
+    }
+}
+
+int main(int argc, const char *argv[]) {
+    Eam eam;
+    Atom atom_data;
+    Atom *atom = (Atom *)(&atom_data);
+    Neighbor neighbor;
+    Stats stats;
+    Parameter param;
+    char *pattern_str = NULL;
+    int pattern = P_SEQ;
+    int niclusters = 256;               // Number of local i-clusters
+    int iclusters_natoms = CLUSTER_M;   // Number of valid atoms within i-clusters
+    int nneighs = 9;                    // Number of j-cluster neighbors per i-cluster
+    int masked = 0;                     // Use masked loop 
+    int nreps = 1;
+    int csv = 0;
+
+    LIKWID_MARKER_INIT;
+    LIKWID_MARKER_REGISTER("force");
+    DEBUG_MESSAGE("Initializing parameters...\n");
+    init(&param);
+
+    for(int i = 0; i < argc; i++) {
+        if((strcmp(argv[i], "-f") == 0)) {
+            if((param.force_field = str2ff(argv[++i])) < 0) {
+                fprintf(stderr, "Invalid force field!\n");
+                exit(-1);
+            }
+            continue;
+        }
+        if((strcmp(argv[i], "-p") == 0)) {
+            pattern_str = strdup(argv[++i]);
+            if(strncmp(pattern_str, "seq", 3) == 0) { pattern = P_SEQ; }
+            else if(strncmp(pattern_str, "fix", 3) == 0) { pattern = P_FIX; }
+            else if(strncmp(pattern_str, "rand", 3) == 0) { pattern = P_RAND; }
+            else {
+                fprintf(stderr, "Invalid pattern!\n");
+                exit(-1);
+            }
+            continue;
+        }
+        if((strcmp(argv[i], "-e") == 0)) {
+            param.eam_file = strdup(argv[++i]);
+            continue;
+        }
+        if((strcmp(argv[i], "-m") == 0)) {
+            masked = 1;
+            continue;
+        }
+        if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0)) {
+            param.ntimes = atoi(argv[++i]);
+            continue;
+        }
+        if((strcmp(argv[i], "-ni") == 0)) {
+            niclusters = atoi(argv[++i]);
+            continue;
+        }
+        if((strcmp(argv[i], "-na") == 0)) {
+            iclusters_natoms = atoi(argv[++i]);
+            continue;
+        }
+        if((strcmp(argv[i], "-nn") == 0)) {
+            nneighs = atoi(argv[++i]);
+            continue;
+        }
+        if((strcmp(argv[i], "-nr") == 0)) {
+            nreps = atoi(argv[++i]);
+            continue;
+        }
+        if((strcmp(argv[i], "--freq") == 0)) {
+            param.proc_freq = atof(argv[++i]);
+            continue;
+        }
+        if((strcmp(argv[i], "--csv") == 0)) {
+            csv = 1;
+            continue;
+        }
+        if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
+            printf("MD Bench: A minimalistic re-implementation of miniMD\n");
+            printf(HLINE);
+            printf("-f <string>:          force field (lj or eam), default lj\n");
+            printf("-p <string>:          pattern for data accesses (seq, fix or rand)\n");
+            printf("-n / --nsteps <int>:  number of timesteps for simulation\n");
+            printf("-ni <int>:            number of i-clusters (default 256)\n");
+            printf("-na <int>:            number of atoms per i-cluster (default %d)\n", CLUSTER_M);
+            printf("-nn <int>:            number of j-cluster neighbors per i-cluster (default 9)\n");
+            printf("-nr <int>:            number of times neighbor lists should be replicated (default 1)\n");
+            printf("--freq <real>:        set CPU frequency (GHz) and display average cycles per atom and neighbors\n");
+            printf("--csv:                set output as CSV style\n");
+            printf(HLINE);
+            exit(EXIT_SUCCESS);
+        }
+    }
+
+    if(pattern_str == NULL) {
+        pattern_str = strdup("seq\0");
+    }
+
+    if(param.force_field == FF_EAM) {
+        DEBUG_MESSAGE("Initializing EAM parameters...\n");
+        initEam(&eam, &param);
+    }
+
+    DEBUG_MESSAGE("Initializing atoms...\n");
+    initAtom(atom);
+    initStats(&stats);
+
+    atom->ntypes = param.ntypes;
+    atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
+        atom->epsilon[i] = param.epsilon;
+        atom->sigma6[i] = param.sigma6;
+        atom->cutneighsq[i] = param.cutneigh * param.cutneigh;
+        atom->cutforcesq[i] = param.cutforce * param.cutforce;
+    }
+
+    DEBUG_MESSAGE("Creating atoms...\n");
+    while(atom->Nmax < niclusters * iclusters_natoms) {
+        growAtom(atom);
+    }
+
+    while(atom->Nclusters_max < niclusters) {
+        growClusters(atom);
+    }
+
+    for(int ci = 0; ci < niclusters; ++ci) {
+        int ci_sca_base = CI_SCALAR_BASE_INDEX(ci);
+        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
+        MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
+        MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
+        int *ci_type = &atom->cl_type[ci_sca_base];
+
+        for(int cii = 0; cii < iclusters_natoms; ++cii) {
+            ci_x[CL_X_OFFSET + cii] = (MD_FLOAT)(ci * iclusters_natoms + cii) * 0.00001;
+            ci_x[CL_Y_OFFSET + cii] = (MD_FLOAT)(ci * iclusters_natoms + cii) * 0.00001;
+            ci_x[CL_Z_OFFSET + cii] = (MD_FLOAT)(ci * iclusters_natoms + cii) * 0.00001;
+            ci_v[CL_X_OFFSET + cii] = 0.0;
+            ci_v[CL_Y_OFFSET + cii] = 0.0;
+            ci_v[CL_Z_OFFSET + cii] = 0.0;
+            ci_type[cii] = rand() % atom->ntypes;
+            atom->Nlocal++;
+        }
+
+        for(int cii = iclusters_natoms; cii < CLUSTER_M; cii++) {
+            ci_x[CL_X_OFFSET + cii] = INFINITY;
+            ci_x[CL_Y_OFFSET + cii] = INFINITY;
+            ci_x[CL_Z_OFFSET + cii] = INFINITY;
+        }
+
+        atom->iclusters[ci].natoms = iclusters_natoms;
+        atom->Nclusters_local++;
+    }
+
+    const double estim_atom_volume = (double)(atom->Nlocal * 3 * sizeof(MD_FLOAT));
+    const double estim_neighbors_volume = (double)(atom->Nlocal * (nneighs + 2) * sizeof(int));
+    const double estim_volume = (double)(atom->Nlocal * 6 * sizeof(MD_FLOAT) + estim_neighbors_volume);
+
+    if(!csv) {
+        printf("Kernel: %s, MxN: %dx%d, Vector width: %d\n", KERNEL_NAME, CLUSTER_M, CLUSTER_N, VECTOR_WIDTH);
+        printf("Floating-point precision: %s\n", PRECISION_STRING);
+        printf("Pattern: %s\n", pattern_str);
+        printf("Number of timesteps: %d\n", param.ntimes);
+        printf("Number of i-clusters: %d\n", niclusters);
+        printf("Number of atoms per i-cluster: %d\n", iclusters_natoms);
+        printf("Number of j-cluster neighbors per i-cluster: %d\n", nneighs);
+        printf("Number of times to replicate neighbor lists: %d\n", nreps);
+        printf("Estimated total data volume (kB): %.4f\n", estim_volume / 1000.0);
+        printf("Estimated atom data volume (kB): %.4f\n", estim_atom_volume / 1000.0);
+        printf("Estimated neighborlist data volume (kB): %.4f\n", estim_neighbors_volume / 1000.0);
+    }
+
+    DEBUG_MESSAGE("Defining j-clusters...\n");
+    defineJClusters(atom);
+    DEBUG_MESSAGE("Initializing neighbor lists...\n");
+    initNeighbor(&neighbor, &param);
+    DEBUG_MESSAGE("Creating neighbor lists...\n");
+    createNeighbors(atom, &neighbor, pattern, nneighs, nreps, masked);
+    DEBUG_MESSAGE("Computing forces...\n");
+
+    double T_accum = 0.0;
+    for(int i = 0; i < param.ntimes; i++) {
+        #if defined(MEM_TRACER) || defined(INDEX_TRACER)
+        traceAddresses(&param, atom, &neighbor, i + 1);
+        #endif
+
+        if(param.force_field == FF_EAM) {
+            T_accum += computeForceEam(&eam, &param, atom, &neighbor, &stats);
+        } else {
+            T_accum += computeForceLJ(&param, atom, &neighbor, &stats);
+        }
+    }
+
+    double freq_hz = param.proc_freq * 1.e9;
+    const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes);
+    const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes) * freq_hz;
+    const double cycles_per_neigh = cycles_per_atom / (double)(nneighs);
+
+    if(!csv) {
+        printf("Total time: %.4f, Mega atom updates/s: %.4f\n", T_accum, atoms_updates_per_sec / 1.e6);
+        if(param.proc_freq > 0.0) {
+            printf("Cycles per atom: %.4f, Cycles per neighbor: %.4f\n", cycles_per_atom, cycles_per_neigh);
+        }
+    } else {
+        printf("steps,pattern,niclusters,iclusters_natoms,nneighs,nreps,total vol.(kB),atoms vol.(kB),neigh vol.(kB),time(s),atom upds/s(M)");
+        if(param.proc_freq > 0.0) {
+            printf(",cy/atom,cy/neigh");
+        }
+        printf("\n");
+
+        printf("%d,%s,%d,%d,%d,%d,%.4f,%.4f,%.4f,%.4f,%.4f",
+            param.ntimes, pattern_str, niclusters, iclusters_natoms, nneighs, nreps,
+            estim_volume / 1.e3, estim_atom_volume / 1.e3, estim_neighbors_volume / 1.e3, T_accum, atoms_updates_per_sec / 1.e6);
+
+        if(param.proc_freq > 0.0) {
+            printf(",%.4f,%.4f", cycles_per_atom, cycles_per_neigh);
+        }
+        printf("\n");
+    }
+
+    double timer[NUMTIMER];
+    timer[FORCE] = T_accum;
+    displayStatistics(atom, &param, &stats, timer);
+    LIKWID_MARKER_CLOSE;
+    return EXIT_SUCCESS;
+}
--- a/src/clusterpair/main.c
+++ b/src/clusterpair/main.c
@@ -0,0 +1,344 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <omp.h>
+//--
+#include <likwid-marker.h>
+//--
+#include <atom.h>
+#include <allocate.h>
+#include <device.h>
+#include <eam.h>
+#include <integrate.h>
+#include <neighbor.h>
+#include <parameter.h>
+#include <pbc.h>
+#include <stats.h>
+#include <thermo.h>
+#include <timers.h>
+#include <timing.h>
+#include <util.h>
+#include <vtk.h>
+#include <xtc.h>
+
+#define HLINE "----------------------------------------------------------------------------\n"
+
+/*
+ * Force kernels implemented in other translation units. main() adds the
+ * returned double to timer[FORCE], so it is presumably the elapsed kernel
+ * time -- TODO confirm against the kernel sources.
+ * NOTE(review): main() calls computeForceLJ(), which is not declared in this
+ * file; presumably a project header maps it to one of the variants below.
+ */
+extern double computeForceLJ_ref(Parameter*, Atom*, Neighbor*, Stats*);
+extern double computeForceLJ_4xn(Parameter*, Atom*, Neighbor*, Stats*);
+extern double computeForceLJ_2xnn(Parameter*, Atom*, Neighbor*, Stats*);
+extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
+
+/* Symbols that are only declared when building with CUDA_TARGET defined. */
+#ifdef CUDA_TARGET
+/* main() sets this flag to 1 after each reneighbouring step. */
+extern int isReneighboured;
+extern double computeForceLJ_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats);
+/* main() calls these around the time-step loop and around reneighbouring. */
+extern void copyDataToCUDADevice(Atom *atom);
+extern void copyDataFromCUDADevice(Atom *atom);
+/* Called once by main() after the time-step loop has finished. */
+extern void cudaDeviceFree();
+#endif
+
+/*
+ * Prepares the whole simulation state before the first force computation.
+ *
+ * Initializes EAM data when the EAM force field is selected, derives the
+ * lattice constant and box lengths from param->rho and param->nx/ny/nz, then
+ * initializes atoms, PBC, statistics and neighbor data, creates (or reads)
+ * the atoms and builds clusters, bins and neighbor lists.
+ *
+ * Returns the difference of the two time stamps taken around the
+ * initialization work; EAM initialization and the box computation happen
+ * before the first time stamp and are therefore not included.
+ */
+double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
+    double t_begin, t_end;
+
+    if(param->force_field == FF_EAM) {
+        initEam(eam, param);
+    }
+
+    /* Lattice constant from the density, then the box edge lengths. */
+    param->lattice = pow((4.0 / param->rho), (1.0 / 3.0));
+    param->xprd = param->nx * param->lattice;
+    param->yprd = param->ny * param->lattice;
+    param->zprd = param->nz * param->lattice;
+
+    t_begin = getTimeStamp();
+
+    initAtom(atom);
+    initPbc(atom);
+    initStats(stats);
+    initNeighbor(neighbor, param);
+
+    /* Atoms come from the input file when one was given, else are generated. */
+    if(param->input_file != NULL) {
+        readAtom(atom, param);
+    } else {
+        createAtom(atom, param);
+    }
+
+    setupNeighbor(param, atom);
+    setupThermo(param, atom->Natoms);
+
+    /* Thermo adjustment is applied to generated systems only. */
+    if(param->input_file == NULL) {
+        adjustThermo(param, atom);
+    }
+
+    buildClusters(atom);
+    defineJClusters(atom);
+    setupPbc(atom, param);
+    binClusters(atom);
+    buildNeighbor(atom, neighbor);
+    initDevice(atom, neighbor);
+
+    t_end = getTimeStamp();
+    return t_end - t_begin;
+}
+
+/*
+ * Performs a full reneighbouring step: updates the single-atom data and the
+ * PBC atoms, rebuilds the clusters, redoes the PBC setup, bins the clusters
+ * and rebuilds the neighbor lists. The work is enclosed in the "reneighbour"
+ * LIKWID marker region.
+ *
+ * Returns the difference of the time stamps taken before and after the step.
+ */
+double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
+    const double t_begin = getTimeStamp();
+
+    LIKWID_MARKER_START("reneighbour");
+
+    updateSingleAtoms(atom);
+    updateAtomsPbc(atom, param);
+    buildClusters(atom);
+    defineJClusters(atom);
+    setupPbc(atom, param);
+    binClusters(atom);
+    buildNeighbor(atom, neighbor);
+
+    LIKWID_MARKER_STOP("reneighbour");
+
+    const double t_end = getTimeStamp();
+    return t_end - t_begin;
+}
+
+void printAtomState(Atom *atom) {
+    printf("Atom counts: Natoms=%d Nlocal=%d Nghost=%d Nmax=%d\n",
+            atom->Natoms, atom->Nlocal, atom->Nghost, atom->Nmax);
+
+    /*     int nall = atom->Nlocal + atom->Nghost; */
+
+    /*     for (int i=0; i<nall; i++) { */
+    /*         printf("%d  %f %f %f\n", i, atom->x[i], atom->y[i], atom->z[i]); */
+    /*     } */
+}
+
+/*
+ * Returns the value that follows the option at argv[*i] and advances *i past
+ * it. Terminates the program with an error message when the option is the
+ * last command-line argument, so callers never receive argv[argc] (NULL).
+ */
+static char *nextArgValue(int argc, char **argv, int *i) {
+    if(*i + 1 >= argc) {
+        fprintf(stderr, "Missing value for option %s!\n", argv[*i]);
+        exit(-1);
+    }
+
+    return argv[++(*i)];
+}
+
+/*
+ * Program entry point.
+ *
+ * Parses the command line into a Parameter set, builds the system via
+ * setup(), computes the initial forces and then runs param.ntimes time
+ * steps. Each step integrates, either updates PBC (optionally pruning the
+ * neighbor lists) or reneighbours, recomputes the forces and optionally
+ * writes thermo/VTK/XTC output. Finally prints timings, OpenMP settings and
+ * the achieved atom-update rate.
+ *
+ * Returns EXIT_SUCCESS; invalid options terminate the process via exit(-1).
+ */
+int main(int argc, char** argv) {
+    double timer[NUMTIMER];
+    Eam eam;
+    Atom atom;
+    Neighbor neighbor;
+    Stats stats;
+    Parameter param;
+
+    LIKWID_MARKER_INIT;
+#pragma omp parallel
+    {
+        LIKWID_MARKER_REGISTER("force");
+        //LIKWID_MARKER_REGISTER("reneighbour");
+        //LIKWID_MARKER_REGISTER("pbc");
+    }
+
+    initParameter(&param);
+
+    /* argv[0] is the program name, so option parsing starts at index 1. */
+    for(int i = 1; i < argc; i++) {
+        if((strcmp(argv[i], "-p") == 0) || (strcmp(argv[i], "--param") == 0)) {
+            readParameter(&param, nextArgValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "-f") == 0)) {
+            if((param.force_field = str2ff(nextArgValue(argc, argv, &i))) < 0) {
+                fprintf(stderr, "Invalid force field!\n");
+                exit(-1);
+            }
+            continue;
+        }
+        if((strcmp(argv[i], "-i") == 0)) {
+            param.input_file = strdup(nextArgValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "-e") == 0)) {
+            param.eam_file = strdup(nextArgValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0)) {
+            param.ntimes = atoi(nextArgValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "-nx") == 0)) {
+            param.nx = atoi(nextArgValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "-ny") == 0)) {
+            param.ny = atoi(nextArgValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "-nz") == 0)) {
+            param.nz = atoi(nextArgValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "-half") == 0)) {
+            param.half_neigh = atoi(nextArgValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "-m") == 0) || (strcmp(argv[i], "--mass") == 0)) {
+            param.mass = atof(nextArgValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "-r") == 0) || (strcmp(argv[i], "--radius") == 0)) {
+            param.cutforce = atof(nextArgValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "-s") == 0) || (strcmp(argv[i], "--skin") == 0)) {
+            param.skin = atof(nextArgValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "--freq") == 0)) {
+            param.proc_freq = atof(nextArgValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "--vtk") == 0)) {
+            param.vtk_file = strdup(nextArgValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "--xtc") == 0)) {
+            #ifndef XTC_OUTPUT
+            fprintf(stderr, "XTC not available, set XTC_OUTPUT option in config.mk file and recompile MD-Bench!");
+            exit(-1);
+            #else
+            param.xtc_file = strdup(nextArgValue(argc, argv, &i));
+            #endif
+            continue;
+        }
+        if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
+            printf("MD Bench: A minimalistic re-implementation of miniMD\n");
+            printf(HLINE);
+            printf("-p <string>:          file to read parameters from (can be specified more than once)\n");
+            printf("-f <string>:          force field (lj or eam), default lj\n");
+            printf("-i <string>:          input file with atom positions (dump)\n");
+            printf("-e <string>:          input file for EAM\n");
+            printf("-n / --nsteps <int>:  set number of timesteps for simulation\n");
+            printf("-nx/-ny/-nz <int>:    set linear dimension of systembox in x/y/z direction\n");
+            printf("-half <int>:          use half neighbor lists (0 or 1)\n");
+            printf("-m / --mass <real>:   set atom mass\n");
+            printf("-r / --radius <real>: set cutoff radius\n");
+            printf("-s / --skin <real>:   set skin (verlet buffer)\n");
+            printf("--freq <real>:        processor frequency (GHz)\n");
+            printf("--vtk <string>:       VTK file for visualization\n");
+            printf("--xtc <string>:       XTC file for visualization\n");
+            printf(HLINE);
+            exit(EXIT_SUCCESS);
+        }
+    }
+
+    /* The neighbor cutoff is the force cutoff extended by the skin. */
+    param.cutneigh = param.cutforce + param.skin;
+    setup(&param, &eam, &atom, &neighbor, &stats);
+    printParameter(&param);
+    printf(HLINE);
+
+    printf("step\ttemp\t\tpressure\n");
+    computeThermo(0, &param, &atom);
+    #if defined(MEM_TRACER) || defined(INDEX_TRACER)
+    /* Initial force evaluation belongs to step 0 (no loop counter exists yet). */
+    traceAddresses(&param, &atom, &neighbor, 0);
+    #endif
+
+    #ifdef CUDA_TARGET
+    copyDataToCUDADevice(&atom);
+    #endif
+
+    /* Initial force computation; its time seeds the FORCE timer. */
+    if(param.force_field == FF_EAM) {
+        timer[FORCE] = computeForceEam(&eam, &param, &atom, &neighbor, &stats);
+    } else {
+        timer[FORCE] = computeForceLJ(&param, &atom, &neighbor, &stats);
+    }
+
+    timer[NEIGH] = 0.0;
+    timer[TOTAL] = getTimeStamp();
+
+    if(param.vtk_file != NULL) {
+        write_data_to_vtk_file(param.vtk_file, &atom, 0);
+    }
+
+    if(param.xtc_file != NULL) {
+        xtc_init(param.xtc_file, &atom, 0);
+    }
+
+    for(int n = 0; n < param.ntimes; n++) {
+        initialIntegrate(&param, &atom);
+
+        if((n + 1) % param.reneigh_every) {
+            /* Regular step: optionally prune the lists, then update PBC only. */
+            if(!((n + 1) % param.prune_every)) {
+                pruneNeighbor(&param, &atom, &neighbor);
+            }
+
+            updatePbc(&atom, &param, 0);
+        } else {
+            /* Every reneigh_every-th step: rebuild clusters and neighbor lists. */
+            #ifdef CUDA_TARGET
+            copyDataFromCUDADevice(&atom);
+            #endif
+
+            timer[NEIGH] += reneighbour(&param, &atom, &neighbor);
+
+            #ifdef CUDA_TARGET
+            copyDataToCUDADevice(&atom);
+            isReneighboured = 1;
+            #endif
+        }
+
+        #if defined(MEM_TRACER) || defined(INDEX_TRACER)
+        traceAddresses(&param, &atom, &neighbor, n + 1);
+        #endif
+
+        if(param.force_field == FF_EAM) {
+            timer[FORCE] += computeForceEam(&eam, &param, &atom, &neighbor, &stats);
+        } else {
+            timer[FORCE] += computeForceLJ(&param, &atom, &neighbor, &stats);
+        }
+
+        finalIntegrate(&param, &atom);
+
+        /* Thermo output every nstat steps; the last step is reported after the loop. */
+        if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
+            computeThermo(n + 1, &param, &atom);
+        }
+
+        int write_pos = !((n + 1) % param.x_out_every);
+        int write_vel = !((n + 1) % param.v_out_every);
+        if(write_pos || write_vel) {
+            if(param.vtk_file != NULL) {
+                write_data_to_vtk_file(param.vtk_file, &atom, n + 1);
+            }
+
+            if(param.xtc_file != NULL) {
+                xtc_write(&atom, n + 1, write_pos, write_vel);
+            }
+        }
+    }
+
+    #ifdef CUDA_TARGET
+    copyDataFromCUDADevice(&atom);
+    #endif
+
+    timer[TOTAL] = getTimeStamp() - timer[TOTAL];
+    updateSingleAtoms(&atom);
+    computeThermo(-1, &param, &atom);
+
+    if(param.xtc_file != NULL) {
+        xtc_end();
+    }
+
+    #ifdef CUDA_TARGET
+    cudaDeviceFree();
+    #endif
+
+    printf(HLINE);
+    printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
+    printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
+            timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]);
+    printf(HLINE);
+
+    int nthreads = 0;
+    int chunkSize = 0;
+    omp_sched_t schedKind;
+    /* Pre-filled so that a schedule kind not handled below still prints a valid string. */
+    char schedType[10] = "unknown";
+#pragma omp parallel
+#pragma omp master
+    {
+        omp_get_schedule(&schedKind, &chunkSize);
+
+        switch (schedKind)
+        {
+            case omp_sched_static:  strcpy(schedType, "static"); break;
+            case omp_sched_dynamic: strcpy(schedType, "dynamic"); break;
+            case omp_sched_guided:  strcpy(schedType, "guided"); break;
+            case omp_sched_auto:    strcpy(schedType, "auto"); break;
+            default:                break; /* keep "unknown" */
+        }
+
+        nthreads = omp_get_max_threads();
+    }
+
+    printf("Num threads: %d\n", nthreads);
+    printf("Schedule: (%s,%d)\n", schedType, chunkSize);
+
+    printf("Performance: %.2f million atom updates per second\n",
+            1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
+    #ifdef COMPUTE_STATS
+    displayStatistics(&atom, &param, &stats, timer);
+    #endif
+    LIKWID_MARKER_CLOSE;
+    return EXIT_SUCCESS;
+}
--- a/src/clusterpair/neighbor.c
+++ b/src/clusterpair/neighbor.c
@@ -0,0 +1,939 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include <neighbor.h>
+#include <parameter.h>
+#include <atom.h>
+#include <util.h>
+
+/* Relative padding (times the box length) used when computing the outermost bin indices. */
+#define SMALL 1.0e-6
+/* Tolerance factor for deciding whether the stencil needs one more bin per direction. */
+#define FACTOR 0.999
+
+/* Box lengths; set in initNeighbor() and overridden in setupNeighbor() when an input file is used. */
+static MD_FLOAT xprd, yprd, zprd;
+/* Reciprocal bin sizes (1.0 / binsizex, 1.0 / binsizey). */
+static MD_FLOAT bininvx, bininvy;
+/* Lowest bin index in x and y, including the padding bins. */
+static int mbinxlo, mbinylo;
+/* Number of bins covering the box itself in x and y. */
+static int nbinx, nbiny;
+static int mbinx, mbiny; // n bins in x, y (including padding bins)
+/* Allocated in setupNeighbor() with mbins and mbins * atoms_per_bin entries;
+ * presumably per-bin atom counts and atom indices -- their use is outside this view. */
+static int *bincount;
+static int *bins;
+/* Number of clusters stored in each bin (read by buildNeighbor()). */
+static int *bin_nclusters;
+/* Cluster indices per bin, clusters_per_bin slots for every bin. */
+static int *bin_clusters;
+static int mbins; //total number of bins (mbinx * mbiny)
+static int atoms_per_bin;  // max atoms per bin
+static int clusters_per_bin;  // max clusters per bin
+/* Neighbor cutoff, copied from param->cutneigh in initNeighbor(). */
+static MD_FLOAT cutneigh;
+static MD_FLOAT cutneighsq;  // neighbor cutoff squared
+/* Number of local clusters the neighbor arrays are currently sized for. */
+static int nmax;
+static int nstencil;      // # of bins in stencil
+static int* stencil;      // stencil list of bin offsets
+/* Bin edge lengths in x and y. */
+static MD_FLOAT binsizex, binsizey;
+
+/* Helpers defined further down in this file. */
+static int coord2bin(MD_FLOAT, MD_FLOAT);
+static MD_FLOAT bindist(int, int);
+
+/* exported subroutines */
+/*
+ * Resets the file-scope binning state and the given Neighbor structure to
+ * their initial values.
+ *
+ * neighbor: structure to initialize; all list pointers are set to NULL and
+ *           maxneighs starts at 100.
+ * param:    provides nx/ny/nz and lattice (box lengths), cutneigh and
+ *           half_neigh.
+ *
+ * No memory is allocated here and no previously allocated memory is freed;
+ * the buffers are allocated by setupNeighbor() and buildNeighbor().
+ */
+void initNeighbor(Neighbor *neighbor, Parameter *param) {
+    xprd = param->nx * param->lattice;
+    yprd = param->ny * param->lattice;
+    zprd = param->nz * param->lattice;
+    cutneigh = param->cutneigh;
+    nmax = 0;
+    atoms_per_bin = 8;
+    clusters_per_bin = (atoms_per_bin / CLUSTER_M) + 10;
+    stencil = NULL;
+    bins = NULL;
+    bincount = NULL;
+    bin_clusters = NULL;
+    bin_nclusters = NULL;
+    neighbor->half_neigh = param->half_neigh;
+    neighbor->maxneighs = 100;
+    neighbor->numneigh = NULL;
+    neighbor->numneigh_masked = NULL;
+    neighbor->neighbors = NULL;
+    neighbor->neighbors_imask = NULL;
+}
+
+/*
+ * Derives the x/y binning grid and the bin stencil from the box size, the
+ * local atom density and the neighbor cutoff, and (re)allocates the per-bin
+ * arrays.
+ *
+ * param: when param->input_file is set, the box lengths are taken from
+ *        param->xprd/yprd/zprd instead of the values stored by
+ *        initNeighbor().
+ * atom:  atom->Nlocal is used to compute the atom density.
+ *
+ * Terminates the program with an error message if an allocation fails.
+ */
+void setupNeighbor(Parameter *param, Atom *atom) {
+    MD_FLOAT coord;
+    int mbinxhi, mbinyhi;
+    int nextx, nexty;
+
+    if(param->input_file != NULL) {
+        xprd = param->xprd;
+        yprd = param->yprd;
+        zprd = param->zprd;
+    }
+
+    // TODO: update lo and hi for standard case and use them here instead
+    MD_FLOAT xlo = 0.0; MD_FLOAT xhi = xprd;
+    MD_FLOAT ylo = 0.0; MD_FLOAT yhi = yprd;
+    MD_FLOAT zlo = 0.0; MD_FLOAT zhi = zprd;
+
+    /* Pick the bin size so that a bin column holds about one cluster worth of atoms. */
+    MD_FLOAT atom_density = ((MD_FLOAT)(atom->Nlocal)) / ((xhi - xlo) * (yhi - ylo) * (zhi - zlo));
+    MD_FLOAT atoms_in_cell = MAX(CLUSTER_M, CLUSTER_N);
+    MD_FLOAT targetsizex = cbrt(atoms_in_cell / atom_density);
+    MD_FLOAT targetsizey = cbrt(atoms_in_cell / atom_density);
+    nbinx = MAX(1, (int)ceil((xhi - xlo) / targetsizex));
+    nbiny = MAX(1, (int)ceil((yhi - ylo) / targetsizey));
+    binsizex = (xhi - xlo) / nbinx;
+    binsizey = (yhi - ylo) / nbiny;
+    bininvx = 1.0 / binsizex;
+    bininvy = 1.0 / binsizey;
+    cutneighsq = cutneigh * cutneigh;
+
+    /* Bin index range in x covering the box extended by the cutoff on both sides. */
+    coord = xlo - cutneigh - SMALL * xprd;
+    mbinxlo = (int)(coord * bininvx);
+    /* (int) truncates towards zero, so negative coordinates need one bin less. */
+    if(coord < 0.0) { mbinxlo = mbinxlo - 1; }
+    coord = xhi + cutneigh + SMALL * xprd;
+    mbinxhi = (int)(coord * bininvx);
+
+    /* Same for y. */
+    coord = ylo - cutneigh - SMALL * yprd;
+    mbinylo = (int)(coord * bininvy);
+    if(coord < 0.0) { mbinylo = mbinylo - 1; }
+    coord = yhi + cutneigh + SMALL * yprd;
+    mbinyhi = (int)(coord * bininvy);
+
+    /* One extra bin on each side of both ranges. */
+    mbinxlo = mbinxlo - 1;
+    mbinxhi = mbinxhi + 1;
+    mbinx = mbinxhi - mbinxlo + 1;
+
+    mbinylo = mbinylo - 1;
+    mbinyhi = mbinyhi + 1;
+    mbiny = mbinyhi - mbinylo + 1;
+
+    /* Number of bins the cutoff spans in each direction (rounded up with tolerance). */
+    nextx = (int)(cutneigh * bininvx);
+    nexty = (int)(cutneigh * bininvy);
+    if(nextx * binsizex < FACTOR * cutneigh) nextx++;
+    if(nexty * binsizey < FACTOR * cutneigh) nexty++;
+
+    free(stencil);
+    stencil = (int *) malloc((size_t)(2 * nexty + 1) * (size_t)(2 * nextx + 1) * sizeof(int));
+    if(stencil == NULL) {
+        fprintf(stderr, "setupNeighbor(): failed to allocate stencil!\n");
+        exit(-1);
+    }
+    nstencil = 0;
+
+    /* Keep the offsets of all bins whose distance is below the cutoff. */
+    for(int j = -nexty; j <= nexty; j++) {
+        for(int i = -nextx; i <= nextx; i++) {
+            if(bindist(i, j) < cutneighsq) {
+                stencil[nstencil++] = j * mbinx + i;
+            }
+        }
+    }
+
+    free(bincount);
+    free(bins);
+    free(bin_nclusters);
+    free(bin_clusters);
+    mbins = mbinx * mbiny;
+    bincount = (int*) malloc((size_t) mbins * sizeof(int));
+    bins = (int*) malloc((size_t) mbins * (size_t) atoms_per_bin * sizeof(int));
+    bin_nclusters = (int*) malloc((size_t) mbins * sizeof(int));
+    bin_clusters = (int*) malloc((size_t) mbins * (size_t) clusters_per_bin * sizeof(int));
+    if(bincount == NULL || bins == NULL || bin_nclusters == NULL || bin_clusters == NULL) {
+        fprintf(stderr, "setupNeighbor(): failed to allocate bin arrays!\n");
+        exit(-1);
+    }
+
+    /*
+    DEBUG_MESSAGE("lo, hi = (%e, %e, %e), (%e, %e, %e)\n", xlo, ylo, zlo, xhi, yhi, zhi);
+    DEBUG_MESSAGE("binsize = %e, %e\n", binsizex, binsizey);
+    DEBUG_MESSAGE("mbin lo, hi = (%d, %d), (%d, %d)\n", mbinxlo, mbinylo, mbinxhi, mbinyhi);
+    DEBUG_MESSAGE("mbins = %d (%d x %d)\n", mbins, mbinx, mbiny);
+    DEBUG_MESSAGE("nextx = %d, nexty = %d\n", nextx, nexty);
+    */
+}
+
+/*
+ * Squared distance between the bounding box of i-cluster `ci` and the
+ * bounding box of j-cluster `cj`.
+ *
+ * For each axis the gap between the two intervals is computed (zero when
+ * they overlap); the squared gaps are summed over x, y and z.
+ */
+MD_FLOAT getBoundingBoxDistanceSq(Atom *atom, int ci, int cj) {
+    MD_FLOAT gap_lo, gap_hi, gap, gap_clamped;
+    MD_FLOAT dist_sq;
+
+    /* x axis */
+    gap_lo = atom->iclusters[ci].bbminx - atom->jclusters[cj].bbmaxx;
+    gap_hi = atom->jclusters[cj].bbminx - atom->iclusters[ci].bbmaxx;
+    gap = MAX(gap_lo, gap_hi);
+    gap_clamped = MAX(gap, 0.0);
+    dist_sq = gap_clamped * gap_clamped;
+
+    /* y axis */
+    gap_lo = atom->iclusters[ci].bbminy - atom->jclusters[cj].bbmaxy;
+    gap_hi = atom->jclusters[cj].bbminy - atom->iclusters[ci].bbmaxy;
+    gap = MAX(gap_lo, gap_hi);
+    gap_clamped = MAX(gap, 0.0);
+    dist_sq += gap_clamped * gap_clamped;
+
+    /* z axis */
+    gap_lo = atom->iclusters[ci].bbminz - atom->jclusters[cj].bbmaxz;
+    gap_hi = atom->jclusters[cj].bbminz - atom->iclusters[ci].bbmaxz;
+    gap = MAX(gap_lo, gap_hi);
+    gap_clamped = MAX(gap, 0.0);
+    dist_sq += gap_clamped * gap_clamped;
+
+    return dist_sq;
+}
+
+/*
+ * Checks whether any atom of i-cluster `ci` lies closer than sqrt(rsq) to
+ * any atom of j-cluster `cj`.
+ *
+ * Returns 1 as soon as one pair with squared distance below `rsq` is found,
+ * 0 if no such pair exists.
+ */
+int atomDistanceInRange(Atom *atom, int ci, int cj, MD_FLOAT rsq) {
+    MD_FLOAT *xi = &atom->cl_x[CI_VECTOR_BASE_INDEX(ci)];
+    MD_FLOAT *xj = &atom->cl_x[CJ_VECTOR_BASE_INDEX(cj)];
+    int ia, ja;
+
+    for(ia = 0; ia < atom->iclusters[ci].natoms; ia++) {
+        for(ja = 0; ja < atom->jclusters[cj].natoms; ja++) {
+            MD_FLOAT delx = xi[CL_X_OFFSET + ia] - xj[CL_X_OFFSET + ja];
+            MD_FLOAT dely = xi[CL_Y_OFFSET + ia] - xj[CL_Y_OFFSET + ja];
+            MD_FLOAT delz = xi[CL_Z_OFFSET + ia] - xj[CL_Z_OFFSET + ja];
+
+            if(delx * delx + dely * dely + delz * delz < rsq) {
+                return 1;
+            }
+        }
+    }
+
+    /* No atom pair is within range. */
+    return 0;
+}
+
+/*
+ * Interaction mask for plain C lists: the diagonal mask when `rdiag` is set
+ * and both cluster indices are equal, the all-interactions mask otherwise.
+ */
+static unsigned int get_imask(int rdiag, int ci, int cj) {
+    if(rdiag && ci == cj) {
+        return NBNXN_INTERACTION_MASK_DIAG;
+    }
+
+    return NBNXN_INTERACTION_MASK_ALL;
+}
+
+/*
+ * Interaction mask for cj-size=2. With `rdiag` set, j-cluster 2*ci gets the
+ * first diagonal mask and j-cluster 2*ci+1 the second one; every other
+ * combination gets the all-interactions mask.
+ */
+static unsigned int get_imask_simd_j2(int rdiag, int ci, int cj) {
+    if(rdiag && ci * 2 == cj) {
+        return NBNXN_INTERACTION_MASK_DIAG_J2_0;
+    }
+
+    if(rdiag && ci * 2 + 1 == cj) {
+        return NBNXN_INTERACTION_MASK_DIAG_J2_1;
+    }
+
+    return NBNXN_INTERACTION_MASK_ALL;
+}
+
+/*
+ * Interaction mask for cj-size=4: the diagonal mask when `rdiag` is set and
+ * both cluster indices are equal, the all-interactions mask otherwise.
+ */
+static unsigned int get_imask_simd_j4(int rdiag, int ci, int cj) {
+    if(rdiag && ci == cj) {
+        return NBNXN_INTERACTION_MASK_DIAG;
+    }
+
+    return NBNXN_INTERACTION_MASK_ALL;
+}
+
+/*
+ * Interaction mask for cj-size=8. With `rdiag` set, i-cluster 2*cj gets the
+ * first diagonal mask and i-cluster 2*cj+1 the second one; every other
+ * combination gets the all-interactions mask.
+ */
+static unsigned int get_imask_simd_j8(int rdiag, int ci, int cj) {
+    if(rdiag && ci == cj * 2) {
+        return NBNXN_INTERACTION_MASK_DIAG_J8_0;
+    }
+
+    if(rdiag && ci == cj * 2 + 1) {
+        return NBNXN_INTERACTION_MASK_DIAG_J8_1;
+    }
+
+    return NBNXN_INTERACTION_MASK_ALL;
+}
+
+/*
+ * Map the kernel-specific mask helpers (4xn / 2xnn) onto the cj-size variant
+ * that matches VECTOR_WIDTH. buildNeighbor() uses get_imask_simd_2xnn when
+ * CLUSTER_N == VECTOR_WIDTH / 2 and get_imask_simd_4xn otherwise.
+ * Note: widths 2 and 4 define only the 4xn alias, width 16 only the 2xnn
+ * alias; any other width is rejected at compile time.
+ */
+#if VECTOR_WIDTH == 2
+#   define get_imask_simd_4xn get_imask_simd_j2
+#elif VECTOR_WIDTH== 4
+#   define get_imask_simd_4xn get_imask_simd_j4
+#elif VECTOR_WIDTH == 8
+#   define get_imask_simd_4xn get_imask_simd_j8
+#   define get_imask_simd_2xnn get_imask_simd_j4
+#elif VECTOR_WIDTH == 16
+#   define get_imask_simd_2xnn get_imask_simd_j8
+#else
+#   error "Invalid cluster configuration"
+#endif
+
+void buildNeighbor(Atom *atom, Neighbor *neighbor) {
+    DEBUG_MESSAGE("buildNeighbor start\n");
+
+    /* extend atom arrays if necessary */
+    if(atom->Nclusters_local > nmax) {
+        nmax = atom->Nclusters_local;
+        if(neighbor->numneigh) free(neighbor->numneigh);
+        if(neighbor->numneigh_masked) free(neighbor->numneigh_masked);
+        if(neighbor->neighbors) free(neighbor->neighbors);
+        if(neighbor->neighbors_imask) free(neighbor->neighbors_imask);
+        neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
+        neighbor->numneigh_masked = (int*) malloc(nmax * sizeof(int));
+        neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int));
+        neighbor->neighbors_imask = (unsigned int*) malloc(nmax * neighbor->maxneighs * sizeof(unsigned int));
+    }
+
+    MD_FLOAT bbx = 0.5 * (binsizex + binsizex);
+    MD_FLOAT bby = 0.5 * (binsizey + binsizey);
+    MD_FLOAT rbb_sq = MAX(0.0, cutneigh - 0.5 * sqrt(bbx * bbx + bby * bby));
+    rbb_sq = rbb_sq * rbb_sq;
+    int resize = 1;
+
+    /* loop over each atom, storing neighbors */
+    while(resize) {
+        int new_maxneighs = neighbor->maxneighs;
+        resize = 0;
+
+        for(int ci = 0; ci < atom->Nclusters_local; ci++) {
+            int ci_cj1 = CJ1_FROM_CI(ci);
+            int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
+            unsigned int *neighptr_imask = &(neighbor->neighbors_imask[ci * neighbor->maxneighs]);
+            int n = 0, nmasked = 0;
+            int ibin = atom->icluster_bin[ci];
+            MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx;
+            MD_FLOAT ibb_xmax = atom->iclusters[ci].bbmaxx;
+            MD_FLOAT ibb_ymin = atom->iclusters[ci].bbminy;
+            MD_FLOAT ibb_ymax = atom->iclusters[ci].bbmaxy;
+            MD_FLOAT ibb_zmin = atom->iclusters[ci].bbminz;
+            MD_FLOAT ibb_zmax = atom->iclusters[ci].bbmaxz;
+
+            for(int k = 0; k < nstencil; k++) {
+                int jbin = ibin + stencil[k];
+                int *loc_bin = &bin_clusters[jbin * clusters_per_bin];
+                int cj, m = -1;
+                MD_FLOAT jbb_xmin, jbb_xmax, jbb_ymin, jbb_ymax, jbb_zmin, jbb_zmax;
+                const int c = bin_nclusters[jbin];
+
+                if(c > 0) {
+                    MD_FLOAT dl, dh, dm, dm0, d_bb_sq;
+
+                    do {
+                        m++;
+                        cj = loc_bin[m];
+                        if(neighbor->half_neigh && ci_cj1 > cj) {
+                            continue;
+                        }
+                        jbb_zmin = atom->jclusters[cj].bbminz;
+                        jbb_zmax = atom->jclusters[cj].bbmaxz;
+                        dl = ibb_zmin - jbb_zmax;
+                        dh = jbb_zmin - ibb_zmax;
+                        dm = MAX(dl, dh);
+                        dm0 = MAX(dm, 0.0);
+                        d_bb_sq = dm0 * dm0;
+                    } while(m + 1 < c && d_bb_sq > cutneighsq);
+
+                    jbb_xmin = atom->jclusters[cj].bbminx;
+                    jbb_xmax = atom->jclusters[cj].bbmaxx;
+                    jbb_ymin = atom->jclusters[cj].bbminy;
+                    jbb_ymax = atom->jclusters[cj].bbmaxy;
+
+                    while(m < c) {
+                        if(!neighbor->half_neigh || ci_cj1 <= cj) {
+                            dl = ibb_zmin - jbb_zmax;
+                            dh = jbb_zmin - ibb_zmax;
+                            dm = MAX(dl, dh);
+                            dm0 = MAX(dm, 0.0);
+                            d_bb_sq = dm0 * dm0;
+
+                            /*if(d_bb_sq > cutneighsq) {
+                                break;
+                            }*/
+
+                            dl = ibb_ymin - jbb_ymax;
+                            dh = jbb_ymin - ibb_ymax;
+                            dm = MAX(dl, dh);
+                            dm0 = MAX(dm, 0.0);
+                            d_bb_sq += dm0 * dm0;
+
+                            dl = ibb_xmin - jbb_xmax;
+                            dh = jbb_xmin - ibb_xmax;
+                            dm = MAX(dl, dh);
+                            dm0 = MAX(dm, 0.0);
+                            d_bb_sq += dm0 * dm0;
+
+                            if(d_bb_sq < cutneighsq) {
+                                if(d_bb_sq < rbb_sq || atomDistanceInRange(atom, ci, cj, cutneighsq)) {
+                                    // We use true (1) for rdiag because we only care if there are masks
+                                    // at all, and when this is set to false (0) the self-exclusions are
+                                    // not accounted for, which  makes the optimized version to not work!
+                                    unsigned int imask;
+                                    #if CLUSTER_N == (VECTOR_WIDTH / 2) // 2xnn
+                                    imask = get_imask_simd_2xnn(1, ci, cj);
+                                    #else // 4xn
+                                    imask = get_imask_simd_4xn(1, ci, cj);
+                                    #endif
+
+                                    if(n < neighbor->maxneighs) {
+                                        if(imask == NBNXN_INTERACTION_MASK_ALL) {
+                                            neighptr[n] = cj;
+                                            neighptr_imask[n] = imask;
+                                        } else {
+                                            neighptr[n] = neighptr[nmasked];
+                                            neighptr_imask[n] = neighptr_imask[nmasked];
+                                            neighptr[nmasked] = cj;
+                                            neighptr_imask[nmasked] = imask;
+                                            nmasked++;
+                                        }
+                                    }
+
+                                    n++;
+                                }
+                            }
+                        }
+
+                        m++;
+                        if(m < c) {
+                            cj = loc_bin[m];
+                            jbb_xmin = atom->jclusters[cj].bbminx;
+                            jbb_xmax = atom->jclusters[cj].bbmaxx;
+                            jbb_ymin = atom->jclusters[cj].bbminy;
+                            jbb_ymax = atom->jclusters[cj].bbmaxy;
+                            jbb_zmin = atom->jclusters[cj].bbminz;
+                            jbb_zmax = atom->jclusters[cj].bbmaxz;
+                        }
+                    }
+                }
+            }
+
+            // Fill neighbor list with dummy values to fit vector width
+            if(CLUSTER_N < VECTOR_WIDTH) {
+                while(n % (VECTOR_WIDTH / CLUSTER_N)) {
+                    neighptr[n] = atom->dummy_cj; // Last cluster is always a dummy cluster
+                    neighptr_imask[n] = 0;
+                    n++;
+                }
+            }
+
+            neighbor->numneigh[ci] = n;
+            neighbor->numneigh_masked[ci] = nmasked;
+            if(n >= neighbor->maxneighs) {
+                resize = 1;
+
+                if(n >= new_maxneighs) {
+                    new_maxneighs = n;
+                }
+            }
+        }
+
+        if(resize) {
+            neighbor->maxneighs = new_maxneighs * 1.2;
+            fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
+            free(neighbor->neighbors);
+            free(neighbor->neighbors_imask);
+            neighbor->neighbors = (int *) malloc(nmax * neighbor->maxneighs * sizeof(int));
+            neighbor->neighbors_imask = (unsigned int *) malloc(nmax * neighbor->maxneighs * sizeof(unsigned int));
+        }
+    }
+
+    /*
+    DEBUG_MESSAGE("\ncutneighsq = %f, rbb_sq = %f\n", cutneighsq, rbb_sq);
+    for(int ci = 0; ci < 6; ci++) {
+        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
+        MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
+        int* neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
+
+        DEBUG_MESSAGE("Cluster %d, bbx = {%f, %f}, bby = {%f, %f}, bbz = {%f, %f}\n",
+            ci,
+            atom->iclusters[ci].bbminx,
+            atom->iclusters[ci].bbmaxx,
+            atom->iclusters[ci].bbminy,
+            atom->iclusters[ci].bbmaxy,
+            atom->iclusters[ci].bbminz,
+            atom->iclusters[ci].bbmaxz);
+
+        for(int cii = 0; cii < CLUSTER_M; cii++) {
+            DEBUG_MESSAGE("%f, %f, %f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
+        }
+
+        DEBUG_MESSAGE("Neighbors:\n");
+        for(int k = 0; k < neighbor->numneigh[ci]; k++) {
+            int cj = neighptr[k];
+            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
+            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
+
+            DEBUG_MESSAGE("    Cluster %d, bbx = {%f, %f}, bby = {%f, %f}, bbz = {%f, %f}\n",
+                cj,
+                atom->jclusters[cj].bbminx,
+                atom->jclusters[cj].bbmaxx,
+                atom->jclusters[cj].bbminy,
+                atom->jclusters[cj].bbmaxy,
+                atom->jclusters[cj].bbminz,
+                atom->jclusters[cj].bbmaxz);
+
+            for(int cjj = 0; cjj < CLUSTER_N; cjj++) {
+                DEBUG_MESSAGE("    %f, %f, %f\n", cj_x[CL_X_OFFSET + cjj], cj_x[CL_Y_OFFSET + cjj], cj_x[CL_Z_OFFSET + cjj]);
+            }
+        }
+    }
+    */
+
+    DEBUG_MESSAGE("buildNeighbor end\n");
+}
+
+/*
+ * Prunes the neighbor list of every local i-cluster in place: a j-cluster is
+ * kept only if atomDistanceInRange() reports it within the squared cutoff.
+ *
+ * buildNeighbor() stores the neighbors that need an interaction mask in the
+ * first numneigh_masked entries of each list. That partition and the
+ * per-neighbor masks in neighbors_imask are kept consistent with the cluster
+ * indices while entries are removed.
+ *
+ * param    - currently unused (the file-level cutneighsq is the cutoff)
+ * atom     - cluster data queried through atomDistanceInRange()
+ * neighbor - neighbor lists, counters and masks that are updated
+ */
+void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
+    DEBUG_MESSAGE("pruneNeighbor start\n");
+    // Pruning uses the neighbor cutoff; the force cutoff alternative would be
+    // param->cutforce * param->cutforce
+    MD_FLOAT cutsq = cutneighsq;
+
+    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
+        int *neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
+        unsigned int *neighs_imask = &neighbor->neighbors_imask[ci * neighbor->maxneighs];
+        int numneighs = neighbor->numneigh[ci];
+        int numneighs_masked = neighbor->numneigh_masked[ci];
+        int k = 0;
+
+        // Remove dummy clusters if necessary; stop at an empty list so that
+        // neighs[-1] is never read
+        if(CLUSTER_N < VECTOR_WIDTH) {
+            while(numneighs > 0 && neighs[numneighs - 1] == atom->dummy_cj) {
+                numneighs--;
+            }
+        }
+
+        while(k < numneighs) {
+            int cj = neighs[k];
+            if(atomDistanceInRange(atom, ci, cj, cutsq)) {
+                k++;
+            } else {
+                const int last = numneighs - 1;
+
+                if(k < numneighs_masked) {
+                    // Removing a masked neighbor: close the gap with the last
+                    // masked entry, then fill that slot with the last entry
+                    // of the list so the masked entries stay at the front
+                    const int last_masked = numneighs_masked - 1;
+                    neighs[k] = neighs[last_masked];
+                    neighs_imask[k] = neighs_imask[last_masked];
+                    neighs[last_masked] = neighs[last];
+                    neighs_imask[last_masked] = neighs_imask[last];
+                    numneighs_masked--;
+                } else {
+                    // Removing an unmasked neighbor: move the last entry
+                    // (index and mask together) into the gap
+                    neighs[k] = neighs[last];
+                    neighs_imask[k] = neighs_imask[last];
+                }
+
+                // k is not advanced: the entry moved into slot k is untested
+                numneighs = last;
+            }
+        }
+
+        // Readd dummy clusters if necessary
+        if(CLUSTER_N < VECTOR_WIDTH) {
+            while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) {
+                neighs[numneighs] = atom->dummy_cj; // Last cluster is always a dummy cluster
+                neighs_imask[numneighs] = 0;
+                numneighs++;
+            }
+        }
+
+        neighbor->numneigh[ci] = numneighs;
+        neighbor->numneigh_masked[ci] = numneighs_masked;
+    }
+
+    DEBUG_MESSAGE("pruneNeighbor end\n");
+}
+
+/* internal subroutines */
+/*
+ * Squared distance in the xy-plane that corresponds to a bin offset (i, j).
+ * Per axis the offset is reduced by one bin towards zero before it is scaled
+ * with the bin size, and an offset of zero contributes no distance.
+ */
+MD_FLOAT bindist(int i, int j) {
+    MD_FLOAT gap_x = 0.0;
+    MD_FLOAT gap_y = 0.0;
+
+    if(i > 0) {
+        gap_x = (i - 1) * binsizex;
+    } else if(i < 0) {
+        gap_x = (i + 1) * binsizex;
+    }
+
+    if(j > 0) {
+        gap_y = (j - 1) * binsizey;
+    } else if(j < 0) {
+        gap_y = (j + 1) * binsizey;
+    }
+
+    return (gap_x * gap_x + gap_y * gap_y);
+}
+
+/*
+ * Maps an (x, y) position to a flat bin index (row * mbinx + column + 1).
+ * Per axis three cases are distinguished: at or beyond the box length,
+ * inside the box, and everything else (which is shifted down by one bin).
+ */
+int coord2bin(MD_FLOAT xin, MD_FLOAT yin) {
+    const int col = (xin >= xprd) ? (int)((xin - xprd) * bininvx) + nbinx - mbinxlo
+                  : (xin >= 0.0)  ? (int)(xin * bininvx) - mbinxlo
+                                  : (int)(xin * bininvx) - mbinxlo - 1;
+
+    const int row = (yin >= yprd) ? (int)((yin - yprd) * bininvy) + nbiny - mbinylo
+                  : (yin >= 0.0)  ? (int)(yin * bininvy) - mbinylo
+                                  : (int)(yin * bininvy) - mbinylo - 1;
+
+    return (row * mbinx + col + 1);
+}
+
+/*
+ * Same binning rule as coord2bin(), but the column and row of the bin are
+ * returned separately through ix and iy instead of as a flat index.
+ */
+void coord2bin2D(MD_FLOAT xin, MD_FLOAT yin, int *ix, int *iy) {
+    int col;
+    int row;
+
+    if(xin >= xprd) {
+        col = (int)((xin - xprd) * bininvx) + nbinx - mbinxlo;
+    } else {
+        // Positions that are not >= 0 end up one bin lower
+        col = (int)(xin * bininvx) - mbinxlo;
+        if(!(xin >= 0.0)) { col -= 1; }
+    }
+
+    if(yin >= yprd) {
+        row = (int)((yin - yprd) * bininvy) + nbiny - mbinylo;
+    } else {
+        row = (int)(yin * bininvy) - mbinylo;
+        if(!(yin >= 0.0)) { row -= 1; }
+    }
+
+    *ix = col;
+    *iy = row;
+}
+
+/*
+ * Distributes the indices of all local atoms over the xy-bins.
+ * If any bin runs out of space, the per-bin capacity is doubled, the storage
+ * is reallocated and the whole binning pass is repeated.
+ */
+void binAtoms(Atom *atom) {
+    DEBUG_MESSAGE("binAtoms start\n");
+
+    for(;;) {
+        int overflow = 0;
+
+        for(int b = 0; b < mbins; b++) {
+            bincount[b] = 0;
+        }
+
+        for(int i = 0; i < atom->Nlocal; i++) {
+            const int ibin = coord2bin(atom_x(i), atom_y(i));
+            if(bincount[ibin] >= atoms_per_bin) {
+                // Bin is full, remember that another pass is required
+                overflow = 1;
+                continue;
+            }
+
+            const int slot = bincount[ibin];
+            bincount[ibin] = slot + 1;
+            bins[ibin * atoms_per_bin + slot] = i;
+        }
+
+        if(!overflow) {
+            break;
+        }
+
+        free(bins);
+        atoms_per_bin *= 2;
+        bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));
+    }
+
+    DEBUG_MESSAGE("binAtoms end\n");
+}
+
+// TODO: Use pigeonhole sorting
+/*
+ * Orders the atom indices stored in every bin by ascending z coordinate
+ * using a selection sort (each position is swapped with the smallest
+ * remaining entry).
+ */
+void sortAtomsByZCoord(Atom *atom) {
+    DEBUG_MESSAGE("sortAtomsByZCoord start\n");
+    for(int bin = 0; bin < mbins; bin++) {
+        const int count = bincount[bin];
+        int *slots = &bins[bin * atoms_per_bin];
+
+        for(int pos = 0; pos < count; pos++) {
+            const int current = slots[pos];
+            int best_pos = pos;
+            MD_FLOAT best_z = atom_z(current);
+
+            // Find the entry with the smallest z among the unsorted rest
+            for(int probe = pos + 1; probe < count; probe++) {
+                const int cand = slots[probe];
+                const MD_FLOAT cand_z = atom_z(cand);
+                if(cand_z < best_z) {
+                    best_pos = probe;
+                    best_z = cand_z;
+                }
+            }
+
+            const int smallest = slots[best_pos];
+            slots[pos] = smallest;
+            slots[best_pos] = current;
+        }
+    }
+
+    DEBUG_MESSAGE("sortAtomsByZCoord end\n");
+}
+
+/*
+ * Builds the local i-clusters from the binned, z-sorted atoms.
+ * Every bin is cut into groups of CLUSTER_M consecutive atoms; positions,
+ * velocities and types are copied into the cluster arrays, unused slots get
+ * INFINITY coordinates, and a bounding box is stored per cluster.
+ * When CLUSTER_N > CLUSTER_M the number of clusters per bin is rounded up to
+ * an even value.
+ */
+void buildClusters(Atom *atom) {
+    DEBUG_MESSAGE("buildClusters start\n");
+    atom->Nclusters_local = 0;
+
+    /* bin local atoms */
+    binAtoms(atom);
+    sortAtomsByZCoord(atom);
+
+    for(int bin = 0; bin < mbins; bin++) {
+        const int natoms_bin = bincount[bin];
+        const int *bin_atoms = &bins[bin * atoms_per_bin];
+        int nclusters = ((natoms_bin + CLUSTER_M - 1) / CLUSTER_M);
+
+        if(CLUSTER_N > CLUSTER_M && nclusters % 2) {
+            nclusters++;
+        }
+
+        for(int cl = 0; cl < nclusters; cl++) {
+            const int ci = atom->Nclusters_local;
+            if(ci >= atom->Nclusters_max) {
+                growClusters(atom);
+            }
+
+            // Pointers are taken after a potential reallocation
+            MD_FLOAT *pos = &atom->cl_x[CI_VECTOR_BASE_INDEX(ci)];
+            MD_FLOAT *vel = &atom->cl_v[CI_VECTOR_BASE_INDEX(ci)];
+            int *types = &atom->cl_type[CI_SCALAR_BASE_INDEX(ci)];
+            MD_FLOAT lo_x = INFINITY, lo_y = INFINITY, lo_z = INFINITY;
+            MD_FLOAT hi_x = -INFINITY, hi_y = -INFINITY, hi_z = -INFINITY;
+            const int first = cl * CLUSTER_M;
+            int filled = 0;
+
+            for(int slot = 0; slot < CLUSTER_M; slot++) {
+                const int ac = first + slot;
+
+                if(ac >= natoms_bin) {
+                    // No atom left in this bin, pad the slot
+                    pos[CL_X_OFFSET + slot] = INFINITY;
+                    pos[CL_Y_OFFSET + slot] = INFINITY;
+                    pos[CL_Z_OFFSET + slot] = INFINITY;
+                    continue;
+                }
+
+                const int i = bin_atoms[ac];
+                const MD_FLOAT px = atom_x(i);
+                const MD_FLOAT py = atom_y(i);
+                const MD_FLOAT pz = atom_z(i);
+
+                pos[CL_X_OFFSET + slot] = px;
+                pos[CL_Y_OFFSET + slot] = py;
+                pos[CL_Z_OFFSET + slot] = pz;
+                vel[CL_X_OFFSET + slot] = atom->vx[i];
+                vel[CL_Y_OFFSET + slot] = atom->vy[i];
+                vel[CL_Z_OFFSET + slot] = atom->vz[i];
+                types[slot] = atom->type[i];
+
+                // TODO: To create the bounding boxes faster, we can use SIMD operations
+                if(px < lo_x) { lo_x = px; }
+                if(px > hi_x) { hi_x = px; }
+                if(py < lo_y) { lo_y = py; }
+                if(py > hi_y) { hi_y = py; }
+                if(pz < lo_z) { lo_z = pz; }
+                if(pz > hi_z) { hi_z = pz; }
+
+                filled++;
+            }
+
+            atom->iclusters[ci].natoms = filled;
+            atom->iclusters[ci].bbminx = lo_x;
+            atom->iclusters[ci].bbmaxx = hi_x;
+            atom->iclusters[ci].bbminy = lo_y;
+            atom->iclusters[ci].bbmaxy = hi_y;
+            atom->iclusters[ci].bbminz = lo_z;
+            atom->iclusters[ci].bbmaxz = hi_z;
+            atom->icluster_bin[ci] = bin;
+            atom->Nclusters_local++;
+        }
+    }
+
+    DEBUG_MESSAGE("buildClusters end\n");
+}
+
+/*
+ * Derives the j-cluster descriptors (bounding box and atom count) of all
+ * local clusters from the i-clusters:
+ *   - CLUSTER_M == CLUSTER_N: the j-cluster is a copy of the i-cluster.
+ *   - CLUSTER_M >  CLUSTER_N: the i-cluster is split in two j-clusters; the
+ *     first one covers atoms [0, CLUSTER_N), the second one the remainder.
+ *   - CLUSTER_M <  CLUSTER_N: every even i-cluster and its successor are
+ *     merged into one j-cluster.
+ */
+void defineJClusters(Atom *atom) {
+    DEBUG_MESSAGE("defineJClusters start\n");
+
+    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
+        int cj0 = CJ0_FROM_CI(ci);
+
+        if(CLUSTER_M == CLUSTER_N) {
+            atom->jclusters[cj0].bbminx = atom->iclusters[ci].bbminx;
+            atom->jclusters[cj0].bbmaxx = atom->iclusters[ci].bbmaxx;
+            atom->jclusters[cj0].bbminy = atom->iclusters[ci].bbminy;
+            atom->jclusters[cj0].bbmaxy = atom->iclusters[ci].bbmaxy;
+            atom->jclusters[cj0].bbminz = atom->iclusters[ci].bbminz;
+            atom->jclusters[cj0].bbmaxz = atom->iclusters[ci].bbmaxz;
+            atom->jclusters[cj0].natoms = atom->iclusters[ci].natoms;
+
+        } else if(CLUSTER_M > CLUSTER_N) {
+            int cj1 = CJ1_FROM_CI(ci);
+            int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
+            MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
+            const int ci_natoms = atom->iclusters[ci].natoms;
+            // The first j-cluster holds at most CLUSTER_N atoms, the second
+            // one whatever is left (never a negative count)
+            const int cj0_natoms = MIN(ci_natoms, CLUSTER_N);
+            const int cj1_natoms = MAX(0, ci_natoms - CLUSTER_N);
+            MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
+            MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
+            MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;
+
+            // Only real atoms enter the bounding box; padded slots hold
+            // INFINITY coordinates and must not be visited
+            for(int cii = 0; cii < cj0_natoms; cii++) {
+                MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii];
+                MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii];
+                MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii];
+
+                // TODO: To create the bounding boxes faster, we can use SIMD operations
+                if(bbminx > xtmp) { bbminx = xtmp; }
+                if(bbmaxx < xtmp) { bbmaxx = xtmp; }
+                if(bbminy > ytmp) { bbminy = ytmp; }
+                if(bbmaxy < ytmp) { bbmaxy = ytmp; }
+                if(bbminz > ztmp) { bbminz = ztmp; }
+                if(bbmaxz < ztmp) { bbmaxz = ztmp; }
+            }
+
+            atom->jclusters[cj0].bbminx = bbminx;
+            atom->jclusters[cj0].bbmaxx = bbmaxx;
+            atom->jclusters[cj0].bbminy = bbminy;
+            atom->jclusters[cj0].bbmaxy = bbmaxy;
+            atom->jclusters[cj0].bbminz = bbminz;
+            atom->jclusters[cj0].bbmaxz = bbmaxz;
+            atom->jclusters[cj0].natoms = cj0_natoms;
+
+            bbminx = INFINITY, bbmaxx = -INFINITY;
+            bbminy = INFINITY, bbmaxy = -INFINITY;
+            bbminz = INFINITY, bbmaxz = -INFINITY;
+
+            for(int cii = CLUSTER_N; cii < ci_natoms; cii++) {
+                MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii];
+                MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii];
+                MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii];
+
+                // TODO: To create the bounding boxes faster, we can use SIMD operations
+                if(bbminx > xtmp) { bbminx = xtmp; }
+                if(bbmaxx < xtmp) { bbmaxx = xtmp; }
+                if(bbminy > ytmp) { bbminy = ytmp; }
+                if(bbmaxy < ytmp) { bbmaxy = ytmp; }
+                if(bbminz > ztmp) { bbminz = ztmp; }
+                if(bbmaxz < ztmp) { bbmaxz = ztmp; }
+            }
+
+            atom->jclusters[cj1].bbminx = bbminx;
+            atom->jclusters[cj1].bbmaxx = bbmaxx;
+            atom->jclusters[cj1].bbminy = bbminy;
+            atom->jclusters[cj1].bbmaxy = bbmaxy;
+            atom->jclusters[cj1].bbminz = bbminz;
+            atom->jclusters[cj1].bbmaxz = bbmaxz;
+            atom->jclusters[cj1].natoms = cj1_natoms;
+
+        } else {
+            // Two consecutive i-clusters form one j-cluster; it is defined
+            // when the even one of the pair is visited
+            if(ci % 2 == 0) {
+                const int ci1 = ci + 1;
+                atom->jclusters[cj0].bbminx = MIN(atom->iclusters[ci].bbminx, atom->iclusters[ci1].bbminx);
+                atom->jclusters[cj0].bbmaxx = MAX(atom->iclusters[ci].bbmaxx, atom->iclusters[ci1].bbmaxx);
+                atom->jclusters[cj0].bbminy = MIN(atom->iclusters[ci].bbminy, atom->iclusters[ci1].bbminy);
+                atom->jclusters[cj0].bbmaxy = MAX(atom->iclusters[ci].bbmaxy, atom->iclusters[ci1].bbmaxy);
+                atom->jclusters[cj0].bbminz = MIN(atom->iclusters[ci].bbminz, atom->iclusters[ci1].bbminz);
+                atom->jclusters[cj0].bbmaxz = MAX(atom->iclusters[ci].bbmaxz, atom->iclusters[ci1].bbmaxz);
+                atom->jclusters[cj0].natoms = atom->iclusters[ci].natoms + atom->iclusters[ci1].natoms;
+            }
+        }
+    }
+
+    DEBUG_MESSAGE("defineJClusters end\n");
+}
+
+/*
+ * Fills bin_clusters/bin_nclusters with the j-clusters that belong to each
+ * xy-bin: first the local j-clusters (in the bin recorded in icluster_bin),
+ * then the ghost j-clusters (bin derived from their atom positions).
+ * Whenever a bin runs out of space, clusters_per_bin is doubled, the storage
+ * is reallocated and the whole pass starts over.
+ */
+void binClusters(Atom *atom) {
+    DEBUG_MESSAGE("binClusters start\n");
+
+    /*
+    DEBUG_MESSAGE("Nghost = %d\n", atom->Nclusters_ghost);
+    for(int ci = atom->Nclusters_local; ci < atom->Nclusters_local + 4; ci++) {
+        MD_FLOAT *cptr = cluster_pos_ptr(ci);
+        DEBUG_MESSAGE("Cluster %d:\n", ci);
+        DEBUG_MESSAGE("bin=%d, Natoms=%d, bbox={%f,%f},{%f,%f},{%f,%f}\n",
+            atom->icluster_bin[ci],
+            atom->clusters[ci].natoms,
+            atom->clusters[ci].bbminx,
+            atom->clusters[ci].bbmaxx,
+            atom->clusters[ci].bbminy,
+            atom->clusters[ci].bbmaxy,
+            atom->clusters[ci].bbminz,
+            atom->clusters[ci].bbmaxz);
+
+        for(int cii = 0; cii < CLUSTER_M; cii++) {
+            DEBUG_MESSAGE("%f, %f, %f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii));
+        }
+    }
+    */
+
+    const int nlocal = atom->Nclusters_local;
+    // Number of i-clusters that make up one j-cluster (at least 1)
+    const int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
+    // Ghost j-clusters are indexed starting at ncj (see cj = ncj + cg below)
+    const int ncj = atom->Nclusters_local / jfac;
+
+    int resize = 1;
+    while(resize > 0) {
+        resize = 0;
+
+        for(int bin = 0; bin < mbins; bin++) {
+            bin_nclusters[bin] = 0;
+        }
+
+        // Local clusters: appended to their bin in increasing ci order
+        for(int ci = 0; ci < nlocal && !resize; ci++) {
+            // Assure we add this j-cluster only once in the bin
+            if(!(CLUSTER_M < CLUSTER_N && ci % 2)) {
+                int bin = atom->icluster_bin[ci];
+                int c = bin_nclusters[bin];
+                // Room for two entries is required because a second
+                // j-cluster may be added when CLUSTER_M > CLUSTER_N
+                if(c + 1 < clusters_per_bin) {
+                    bin_clusters[bin * clusters_per_bin + c] = CJ0_FROM_CI(ci);
+                    bin_nclusters[bin]++;
+
+                    if(CLUSTER_M > CLUSTER_N) {
+                        int cj1 = CJ1_FROM_CI(ci);
+                        if(atom->jclusters[cj1].natoms > 0) {
+                            bin_clusters[bin * clusters_per_bin + c + 1] = cj1;
+                            bin_nclusters[bin]++;
+                        }
+                    }
+                } else {
+                    resize = 1;
+                }
+            }
+        }
+
+        // Ghost clusters: skipped entirely if a resize is already pending
+        for(int cg = 0; cg < atom->Nclusters_ghost && !resize; cg++) {
+            const int cj = ncj + cg;
+            int ix = -1, iy = -1;
+            MD_FLOAT xtmp, ytmp;
+
+            if(atom->jclusters[cj].natoms > 0) {
+                int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
+                MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
+                MD_FLOAT cj_minz = atom->jclusters[cj].bbminz;
+
+                // Start with the bin of the first atom, clamped to the grid
+                xtmp = cj_x[CL_X_OFFSET + 0];
+                ytmp = cj_x[CL_Y_OFFSET + 0];
+                coord2bin2D(xtmp, ytmp, &ix, &iy);
+                ix = MAX(MIN(ix, mbinx - 1), 0);
+                iy = MAX(MIN(iy, mbiny - 1), 0);
+                for(int cjj = 1; cjj < atom->jclusters[cj].natoms; cjj++) {
+                    int nix, niy;
+                    xtmp = cj_x[CL_X_OFFSET + cjj];
+                    ytmp = cj_x[CL_Y_OFFSET + cjj];
+                    coord2bin2D(xtmp, ytmp, &nix, &niy);
+                    nix = MAX(MIN(nix, mbinx - 1), 0);
+                    niy = MAX(MIN(niy, mbiny - 1), 0);
+
+                    // Always put the cluster on the bin of its innermost atom so
+                    // the cluster should be closer to local clusters
+                    // (positive shift: keep the smallest bin index,
+                    //  negative shift: keep the largest bin index)
+                    if(atom->PBCx[cg] > 0 && ix > nix) { ix = nix; }
+                    if(atom->PBCx[cg] < 0 && ix < nix) { ix = nix; }
+                    if(atom->PBCy[cg] > 0 && iy > niy) { iy = niy; }
+                    if(atom->PBCy[cg] < 0 && iy < niy) { iy = niy; }
+                }
+
+                // Same flat index layout as returned by coord2bin()
+                int bin = iy * mbinx + ix + 1;
+                int c = bin_nclusters[bin];
+                if(c < clusters_per_bin) {
+                    // Insert the current ghost cluster in the bin keeping clusters
+                    // sorted by z coordinate
+                    int inserted = 0;
+                    for(int i = 0; i < c; i++) {
+                        int last_cl = bin_clusters[bin * clusters_per_bin + i];
+                        // First entry whose lower z bound is above the new
+                        // cluster: take its slot and shift the rest up by one
+                        if(atom->jclusters[last_cl].bbminz > cj_minz) {
+                            bin_clusters[bin * clusters_per_bin + i] = cj;
+
+                            for(int j = i + 1; j <= c; j++) {
+                                int tmp = bin_clusters[bin * clusters_per_bin + j];
+                                bin_clusters[bin * clusters_per_bin + j] = last_cl;
+                                last_cl = tmp;
+                            }
+
+                            inserted = 1;
+                            break;
+                        }
+                    }
+
+                    // No entry with a larger lower z bound: append at the end
+                    if(!inserted) {
+                        bin_clusters[bin * clusters_per_bin + c] = cj;
+                    }
+
+                    bin_nclusters[bin]++;
+                } else {
+                    resize = 1;
+                }
+            }
+        }
+
+        // The old content is dropped; the next loop iteration rebuilds it
+        if(resize) {
+            free(bin_clusters);
+            clusters_per_bin *= 2;
+            bin_clusters = (int*) malloc(mbins * clusters_per_bin * sizeof(int));
+        }
+    }
+
+    /*
+    DEBUG_MESSAGE("bin_nclusters\n");
+    for(int i = 0; i < mbins; i++) { DEBUG_MESSAGE("%d, ", bin_nclusters[i]); }
+    DEBUG_MESSAGE("\n");
+    */
+
+    DEBUG_MESSAGE("binClusters stop\n");
+}
+
+/*
+ * Copies positions and velocities of the local clusters back into the
+ * per-atom arrays, in cluster order. A message is printed to stderr if the
+ * number of atoms written differs from atom->Nlocal.
+ */
+void updateSingleAtoms(Atom *atom) {
+    DEBUG_MESSAGE("updateSingleAtoms start\n");
+    int written = 0;
+
+    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
+        const int base = CI_VECTOR_BASE_INDEX(ci);
+        const MD_FLOAT *pos = &atom->cl_x[base];
+        const MD_FLOAT *vel = &atom->cl_v[base];
+        const int natoms = atom->iclusters[ci].natoms;
+
+        for(int slot = 0; slot < natoms; slot++, written++) {
+            const int dst = written;
+            atom_x(dst) = pos[CL_X_OFFSET + slot];
+            atom_y(dst) = pos[CL_Y_OFFSET + slot];
+            atom_z(dst) = pos[CL_Z_OFFSET + slot];
+            atom->vx[dst] = vel[CL_X_OFFSET + slot];
+            atom->vy[dst] = vel[CL_Y_OFFSET + slot];
+            atom->vz[dst] = vel[CL_Z_OFFSET + slot];
+        }
+    }
+
+    if(written != atom->Nlocal) {
+        fprintf(stderr, "updateSingleAtoms(): Number of atoms changed!\n");
+    }
+
+    DEBUG_MESSAGE("updateSingleAtoms stop\n");
+}
--- a/src/clusterpair/neighbor.h
+++ b/src/clusterpair/neighbor.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <atom.h>
+#include <parameter.h>
+
+#ifndef __NEIGHBOR_H_
+#define __NEIGHBOR_H_
+// Interaction masks taken from GROMACS. Two things that are easy to get wrong:
+//   1. Although GROMACS names them "exclusion" masks, they are interaction
+//      masks: a set bit (1) means interaction, a cleared bit (0) means none.
+//   2. The bit order is inverted (possibly because that is how AVX2/AVX512
+//      masking consumes them), so read them from right to left, i.e. from the
+//      least significant to the most significant bit.
+// The "all interactions" mask is the same for all kernels
+#define NBNXN_INTERACTION_MASK_ALL 0xffffffffU
+// 4x4 kernel diagonal mask
+#define NBNXN_INTERACTION_MASK_DIAG 0x08ceU
+// 4x2 kernel diagonal masks
+#define NBNXN_INTERACTION_MASK_DIAG_J2_0 0x0002U
+#define NBNXN_INTERACTION_MASK_DIAG_J2_1 0x002fU
+// 4x8 kernel diagonal masks
+#define NBNXN_INTERACTION_MASK_DIAG_J8_0 0xf0f8fcfeU
+#define NBNXN_INTERACTION_MASK_DIAG_J8_1 0x0080c0e0U
+
+// Cluster pair neighbor list data
+typedef struct {
+    int every;                      // presumably the rebuild interval in timesteps - TODO confirm
+    int ncalls;                     // presumably a counter of neighbor list builds - TODO confirm
+    int maxneighs;                  // capacity of one list; row stride of neighbors/neighbors_imask
+    int* numneigh;                  // per i-cluster: number of list entries (including dummy padding)
+    int* numneigh_masked;           // per i-cluster: number of leading entries with a non-trivial mask
+    int half_neigh;                 // presumably selects half neighbor lists - TODO confirm
+    int* neighbors;                 // j-cluster indices, maxneighs entries per i-cluster
+    unsigned int* neighbors_imask;  // interaction mask of every entry in neighbors
+} Neighbor;
+
+// Neighbor list setup, construction and pruning
+extern void initNeighbor(Neighbor*, Parameter*);
+extern void setupNeighbor(Parameter*, Atom*);
+// NOTE(review): the cluster pair neighbor.c defines binAtoms(), no definition
+// spelled binatoms() was seen - confirm this declaration is still needed
+extern void binatoms(Atom*);
+extern void buildNeighbor(Atom*, Neighbor*);
+extern void pruneNeighbor(Parameter*, Atom*, Neighbor*);
+// NOTE(review): no definition of sortAtom() was seen next to the functions
+// below - confirm it exists for this variant
+extern void sortAtom(Atom*);
+// Cluster construction and conversion between clusters and single atoms
+extern void buildClusters(Atom*);
+extern void defineJClusters(Atom*);
+extern void binClusters(Atom*);
+extern void updateSingleAtoms(Atom*);
+#endif
--- a/src/clusterpair/pbc.c
+++ b/src/clusterpair/pbc.c
@@ -0,0 +1,231 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include <pbc.h>
+#include <atom.h>
+#include <allocate.h>
+#include <neighbor.h>
+#include <util.h>
+
+// Number of entries by which the ghost cluster arrays grow in growPbc()
+#define DELTA 20000
+
+// Current capacity (in entries) of border_map and PBCx/PBCy/PBCz
+static int NmaxGhost;
+
+// Enlarges the ghost cluster arrays by DELTA entries
+static void growPbc(Atom*);
+
+/* exported subroutines */
+/*
+ * Resets the periodic boundary bookkeeping: no ghost capacity is reserved
+ * and the border map and shift arrays are unallocated.
+ */
+void initPbc(Atom* atom) {
+    atom->PBCx = NULL;
+    atom->PBCy = NULL;
+    atom->PBCz = NULL;
+    atom->border_map = NULL;
+    NmaxGhost = 0;
+}
+
+/* update coordinates of ghost atoms */
+/* uses mapping created in setupPbc */
+/*
+ * Recomputes the coordinates of every ghost j-cluster from the cluster it is
+ * mapped to (border_map), shifted by PBCx/PBCy/PBCz box lengths.
+ * If firstUpdate is set, unused slots of the ghost cluster are additionally
+ * filled with INFINITY and the bounding box of the ghost cluster is stored.
+ */
+void cpuUpdatePbc(Atom *atom, Parameter *param, int firstUpdate) {
+    DEBUG_MESSAGE("updatePbc start\n");
+    const int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
+    const int first_ghost = atom->Nclusters_local / jfac;
+    const MD_FLOAT xprd = param->xprd;
+    const MD_FLOAT yprd = param->yprd;
+    const MD_FLOAT zprd = param->zprd;
+
+    for(int cg = 0; cg < atom->Nclusters_ghost; cg++) {
+        const int cj = first_ghost + cg;
+        const int natoms = atom->jclusters[cj].natoms;
+        MD_FLOAT *ghost_x = &atom->cl_x[CJ_VECTOR_BASE_INDEX(cj)];
+        MD_FLOAT *source_x = &atom->cl_x[CJ_VECTOR_BASE_INDEX(atom->border_map[cg])];
+        MD_FLOAT lo_x = INFINITY, lo_y = INFINITY, lo_z = INFINITY;
+        MD_FLOAT hi_x = -INFINITY, hi_y = -INFINITY, hi_z = -INFINITY;
+
+        for(int a = 0; a < natoms; a++) {
+            const MD_FLOAT px = source_x[CL_X_OFFSET + a] + atom->PBCx[cg] * xprd;
+            const MD_FLOAT py = source_x[CL_Y_OFFSET + a] + atom->PBCy[cg] * yprd;
+            const MD_FLOAT pz = source_x[CL_Z_OFFSET + a] + atom->PBCz[cg] * zprd;
+
+            ghost_x[CL_X_OFFSET + a] = px;
+            ghost_x[CL_Y_OFFSET + a] = py;
+            ghost_x[CL_Z_OFFSET + a] = pz;
+
+            if(!firstUpdate) {
+                continue;
+            }
+
+            // TODO: To create the bounding boxes faster, we can use SIMD operations
+            if(px < lo_x) { lo_x = px; }
+            if(px > hi_x) { hi_x = px; }
+            if(py < lo_y) { lo_y = py; }
+            if(py > hi_y) { hi_y = py; }
+            if(pz < lo_z) { lo_z = pz; }
+            if(pz > hi_z) { hi_z = pz; }
+        }
+
+        // Padding and bounding boxes are only set up on the first update
+        if(!firstUpdate) {
+            continue;
+        }
+
+        for(int a = natoms; a < CLUSTER_N; a++) {
+            ghost_x[CL_X_OFFSET + a] = INFINITY;
+            ghost_x[CL_Y_OFFSET + a] = INFINITY;
+            ghost_x[CL_Z_OFFSET + a] = INFINITY;
+        }
+
+        atom->jclusters[cj].bbminx = lo_x;
+        atom->jclusters[cj].bbmaxx = hi_x;
+        atom->jclusters[cj].bbminy = lo_y;
+        atom->jclusters[cj].bbmaxy = hi_y;
+        atom->jclusters[cj].bbminz = lo_z;
+        atom->jclusters[cj].bbmaxz = hi_z;
+    }
+
+    DEBUG_MESSAGE("updatePbc end\n");
+}
+
+/* relocate atoms that have left domain according
+ * to periodic boundary conditions */
+/*
+ * Wraps every local atom back into the box: a coordinate below zero is
+ * shifted up by one box length, a coordinate at or beyond the box length is
+ * shifted down by one box length.
+ */
+void updateAtomsPbc(Atom *atom, Parameter *param) {
+    const MD_FLOAT len_x = param->xprd;
+    const MD_FLOAT len_y = param->yprd;
+    const MD_FLOAT len_z = param->zprd;
+    const int nlocal = atom->Nlocal;
+
+    for(int i = 0; i < nlocal; i++) {
+        const MD_FLOAT px = atom_x(i);
+        const MD_FLOAT py = atom_y(i);
+        const MD_FLOAT pz = atom_z(i);
+
+        if(px < 0.0) {
+            atom_x(i) = px + len_x;
+        } else if(px >= len_x) {
+            atom_x(i) = px - len_x;
+        }
+
+        if(py < 0.0) {
+            atom_y(i) = py + len_y;
+        } else if(py >= len_y) {
+            atom_y(i) = py - len_y;
+        }
+
+        if(pz < 0.0) {
+            atom_z(i) = pz + len_z;
+        } else if(pz >= len_z) {
+            atom_z(i) = pz - len_z;
+        }
+    }
+}
+
+/* setup periodic boundary conditions by
+ * defining ghost atoms around domain
+ * only creates mapping and coordinate corrections
+ * that are then enforced in updatePbc */
+/*
+ * Registers one ghost image of j-cluster cj, shifted by (dx,dy,dz) box
+ * lengths: the source cluster and the shift are recorded for ghost entry
+ * Nghost, the atom count is copied to ghost j-cluster ncj + Nghost, and the
+ * atom types are copied over.
+ * Expects atom, cj, ncj, Nghost and Nghost_atoms in the enclosing scope and
+ * updates Nghost and Nghost_atoms. The do/while(0) wrapper makes it a single
+ * statement with its own scope for the temporaries.
+ */
+#define ADDGHOST(dx,dy,dz)                                                          \
+    do {                                                                            \
+        Nghost++;                                                                   \
+        const int cg = ncj + Nghost;                                                \
+        const int cj_natoms = atom->jclusters[cj].natoms;                           \
+        atom->border_map[Nghost] = cj;                                              \
+        atom->PBCx[Nghost] = (dx);                                                  \
+        atom->PBCy[Nghost] = (dy);                                                  \
+        atom->PBCz[Nghost] = (dz);                                                  \
+        atom->jclusters[cg].natoms = cj_natoms;                                     \
+        Nghost_atoms += cj_natoms;                                                  \
+        int cj_sca_base = CJ_SCALAR_BASE_INDEX(cj);                                 \
+        int cg_sca_base = CJ_SCALAR_BASE_INDEX(cg);                                 \
+        for(int cjj = 0; cjj < cj_natoms; cjj++) {                                  \
+            atom->cl_type[cg_sca_base + cjj] = atom->cl_type[cj_sca_base + cjj];    \
+        }                                                                           \
+    } while(0)
+
+/* internal subroutines */
+/*
+ * Raises the ghost capacity by DELTA entries and reallocates the border map
+ * and the three shift arrays accordingly.
+ */
+void growPbc(Atom* atom) {
+    const size_t old_bytes = NmaxGhost * sizeof(int);
+    NmaxGhost += DELTA;
+    const size_t new_bytes = NmaxGhost * sizeof(int);
+
+    atom->border_map = (int*) reallocate(atom->border_map, ALIGNMENT, new_bytes, old_bytes);
+    atom->PBCx = (int*) reallocate(atom->PBCx, ALIGNMENT, new_bytes, old_bytes);
+    atom->PBCy = (int*) reallocate(atom->PBCy, ALIGNMENT, new_bytes, old_bytes);
+    atom->PBCz = (int*) reallocate(atom->PBCz, ALIGNMENT, new_bytes, old_bytes);
+}
+
+/*
+ * Creates the ghost j-clusters for periodic boundary conditions.
+ * For every non-empty local j-cluster whose bounding box lies within
+ * param->cutneigh of a box face, a ghost image is registered through
+ * ADDGHOST for each matching face, corner and edge. Afterwards a dummy
+ * cluster with INFINITY coordinates is appended, the ghost counters are
+ * stored in atom, and cpuUpdatePbc() computes the ghost positions.
+ */
+void setupPbc(Atom *atom, Parameter *param) {
+    DEBUG_MESSAGE("setupPbc start\n");
+    const MD_FLOAT xprd = param->xprd;
+    const MD_FLOAT yprd = param->yprd;
+    const MD_FLOAT zprd = param->zprd;
+    const MD_FLOAT Cutneigh = param->cutneigh;
+    const int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
+    const int ncj = atom->Nclusters_local / jfac;
+    // Index of the last ghost added so far, -1 while there is none
+    int Nghost = -1;
+    int Nghost_atoms = 0;
+
+    for(int cj = 0; cj < ncj; cj++) {
+        if(atom->jclusters[cj].natoms <= 0) {
+            continue;
+        }
+
+        // Make room before any ghost of this cluster is registered
+        if(atom->Nclusters_local + (Nghost + 7) * jfac >= atom->Nclusters_max) {
+            growClusters(atom);
+        }
+
+        if((Nghost + 7) * jfac >= NmaxGhost) {
+            growPbc(atom);
+        }
+
+        // Which faces of the box is the bounding box close to?
+        const int lo_x = atom->jclusters[cj].bbminx < Cutneigh;
+        const int hi_x = atom->jclusters[cj].bbmaxx >= (xprd - Cutneigh);
+        const int lo_y = atom->jclusters[cj].bbminy < Cutneigh;
+        const int hi_y = atom->jclusters[cj].bbmaxy >= (yprd - Cutneigh);
+        const int lo_z = atom->jclusters[cj].bbminz < Cutneigh;
+        const int hi_z = atom->jclusters[cj].bbmaxz >= (zprd - Cutneigh);
+
+        /* Setup ghost atoms */
+        /* 6 planes */
+        if(lo_x) { ADDGHOST(+1,0,0); }
+        if(hi_x) { ADDGHOST(-1,0,0); }
+        if(lo_y) { ADDGHOST(0,+1,0); }
+        if(hi_y) { ADDGHOST(0,-1,0); }
+        if(lo_z) { ADDGHOST(0,0,+1); }
+        if(hi_z) { ADDGHOST(0,0,-1); }
+        /* 8 corners */
+        if(lo_x && lo_y && lo_z) { ADDGHOST(+1,+1,+1); }
+        if(lo_x && hi_y && lo_z) { ADDGHOST(+1,-1,+1); }
+        if(lo_x && lo_y && hi_z) { ADDGHOST(+1,+1,-1); }
+        if(lo_x && hi_y && hi_z) { ADDGHOST(+1,-1,-1); }
+        if(hi_x && lo_y && lo_z) { ADDGHOST(-1,+1,+1); }
+        if(hi_x && hi_y && lo_z) { ADDGHOST(-1,-1,+1); }
+        if(hi_x && lo_y && hi_z) { ADDGHOST(-1,+1,-1); }
+        if(hi_x && hi_y && hi_z) { ADDGHOST(-1,-1,-1); }
+        /* 12 edges */
+        if(lo_x && lo_z) { ADDGHOST(+1,0,+1); }
+        if(lo_x && hi_z) { ADDGHOST(+1,0,-1); }
+        if(hi_x && lo_z) { ADDGHOST(-1,0,+1); }
+        if(hi_x && hi_z) { ADDGHOST(-1,0,-1); }
+        if(lo_y && lo_z) { ADDGHOST(0,+1,+1); }
+        if(lo_y && hi_z) { ADDGHOST(0,+1,-1); }
+        if(hi_y && lo_z) { ADDGHOST(0,-1,+1); }
+        if(hi_y && hi_z) { ADDGHOST(0,-1,-1); }
+        if(lo_y && lo_x) { ADDGHOST(+1,+1,0); }
+        if(lo_y && hi_x) { ADDGHOST(-1,+1,0); }
+        if(hi_y && lo_x) { ADDGHOST(+1,-1,0); }
+        if(hi_y && hi_x) { ADDGHOST(-1,-1,0); }
+    }
+
+    if(ncj + (Nghost + 1) * jfac >= atom->Nclusters_max) {
+        growClusters(atom);
+    }
+
+    // Add dummy cluster at the end
+    const int nghost_clusters = Nghost + 1;
+    const int dummy = ncj + nghost_clusters;
+    MD_FLOAT *dummy_x = &atom->cl_x[CJ_VECTOR_BASE_INDEX(dummy)];
+    for(int slot = 0; slot < CLUSTER_N; slot++) {
+        dummy_x[CL_X_OFFSET + slot] = INFINITY;
+        dummy_x[CL_Y_OFFSET + slot] = INFINITY;
+        dummy_x[CL_Z_OFFSET + slot] = INFINITY;
+    }
+
+    // Nghost is the index of the last ghost, so the count is one larger
+    atom->dummy_cj = dummy;
+    atom->Nghost = Nghost_atoms;
+    atom->Nclusters_ghost = nghost_clusters;
+    atom->Nclusters = atom->Nclusters_local + nghost_clusters;
+
+    // Update created ghost clusters positions
+    cpuUpdatePbc(atom, param, 1);
+    DEBUG_MESSAGE("setupPbc end\n");
+}
--- a/src/clusterpair/pbc.h
+++ b/src/clusterpair/pbc.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <atom.h>
+#include <parameter.h>
+
+#ifndef __PBC_H_
+#define __PBC_H_
+// Periodic-boundary-condition (PBC) interface of the cluster-pair variant.
+
+// NOTE(review): declared with an empty parameter list, i.e. without a
+// prototype. The definition is not part of this header, so the real argument
+// list has to be checked against pbc.c.
+extern void initPbc();
+// CPU implementation of the PBC position update. setupPbc calls it with the
+// flag set to 1 to refresh the ghost clusters it has just created; the meaning
+// of other flag values is not visible from this header (TODO confirm).
+extern void cpuUpdatePbc(Atom*, Parameter*, int);
+// NOTE(review): presumably re-wraps the local atoms into the simulation box;
+// confirm against the definition in pbc.c.
+extern void updateAtomsPbc(Atom*, Parameter*);
+// Creates the ghost clusters, appends a dummy cluster whose coordinates are
+// INFINITY, updates the ghost counters in Atom and finally calls
+// cpuUpdatePbc(atom, param, 1).
+extern void setupPbc(Atom*, Parameter*);
+
+#ifdef CUDA_TARGET
+// Declared only for CUDA builds; same signature as cpuUpdatePbc.
+extern void cudaUpdatePbc(Atom*, Parameter*, int);
+#endif
+#endif
--- a/src/clusterpair/stats.c
+++ b/src/clusterpair/stats.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdio.h>
+
+#include <atom.h>
+#include <parameter.h>
+#include <stats.h>
+#include <timers.h>
+
+// Resets every counter of the given Stats record to zero.
+void initStats(Stats *s) {
+    // Work counters reported by displayStatistics.
+    s->calculated_forces = s->num_neighs = s->force_iters = 0;
+
+    // Within/outside-cutoff counters (printed for USE_REFERENCE_VERSION builds).
+    s->atoms_within_cutoff = s->atoms_outside_cutoff = 0;
+    s->clusters_within_cutoff = s->clusters_outside_cutoff = 0;
+}
+
+// Prints a summary of the collected force-kernel statistics to stdout.
+//   atom  - supplies Nlocal and Nclusters_local
+//   param - supplies ntimes and proc_freq
+//   stats - counters to report
+//   timer - timing array; only timer[FORCE] is read
+// The body is compiled only when COMPUTE_STATS is defined; otherwise the
+// function does nothing.
+void displayStatistics(Atom *atom, Parameter *param, Stats *stats, double *timer) {
+#ifdef COMPUTE_STATS
+
+    const int MxN = CLUSTER_M * CLUSTER_N;
+    // Number of (atom, force evaluation) pairs. Converted to double *before*
+    // multiplying: the integer product Nlocal * (ntimes + 1) can overflow for
+    // large systems or long runs, which is undefined behaviour.
+    const double atom_steps = (double)(atom->Nlocal) * ((double)(param->ntimes) + 1.0);
+    double avg_atoms_cluster = (double)(atom->Nlocal) / (double)(atom->Nclusters_local);
+    double force_useful_volume = 1e-9 * ( atom_steps * (sizeof(MD_FLOAT) * 6 + sizeof(int)) +
+                                          (double)(stats->num_neighs) * (sizeof(MD_FLOAT) * 3 + sizeof(int)) );
+    double avg_neigh_atom = (stats->num_neighs * CLUSTER_N) / atom_steps;
+    double avg_neigh_cluster = (double)(stats->num_neighs) / (double)(stats->calculated_forces);
+    double avg_simd = stats->force_iters / atom_steps;
+
+    #ifdef EXPLICIT_TYPES
+    // One extra int (the type) is read per atom and per neighbor entry.
+    force_useful_volume += 1e-9 * (atom_steps + (double)(stats->num_neighs)) * sizeof(int);
+    #endif
+
+    printf("Statistics:\n");
+    printf("\tVector width: %d, Processor frequency: %.4f GHz\n", VECTOR_WIDTH, param->proc_freq);
+    printf("\tAverage atoms per cluster: %.4f\n", avg_atoms_cluster);
+    printf("\tAverage neighbors per atom: %.4f\n", avg_neigh_atom);
+    printf("\tAverage neighbors per cluster: %.4f\n", avg_neigh_cluster);
+    printf("\tAverage SIMD iterations per atom: %.4f\n", avg_simd);
+    printf("\tTotal number of computed pair interactions: %lld\n", stats->num_neighs * MxN);
+    printf("\tTotal number of SIMD iterations: %lld\n", stats->force_iters);
+    printf("\tUseful read data volume for force computation: %.2fGB\n", force_useful_volume);
+    // proc_freq is given in GHz, hence the factor 1e9 to obtain cycles.
+    printf("\tCycles/SIMD iteration: %.4f\n", timer[FORCE] * param->proc_freq * 1e9 / stats->force_iters);
+
+    #ifdef USE_REFERENCE_VERSION
+    const double atoms_eff = (double)stats->atoms_within_cutoff / (double)(stats->atoms_within_cutoff + stats->atoms_outside_cutoff) * 100.0;
+    printf("\tAtoms within/outside cutoff radius: %lld/%lld (%.2f%%)\n", stats->atoms_within_cutoff, stats->atoms_outside_cutoff, atoms_eff);
+    const double clusters_eff = (double)stats->clusters_within_cutoff / (double)(stats->clusters_within_cutoff + stats->clusters_outside_cutoff) * 100.0;
+    printf("\tClusters within/outside cutoff radius: %lld/%lld (%.2f%%)\n", stats->clusters_within_cutoff, stats->clusters_outside_cutoff, clusters_eff);
+    #endif
+
+#endif
+}
--- a/src/clusterpair/stats.h
+++ b/src/clusterpair/stats.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <atom.h>
+#include <parameter.h>
+
+#ifndef __STATS_H_
+#define __STATS_H_
+// Counters reported by displayStatistics(); all are reset by initStats().
+typedef struct {
+    // Denominator of the "average neighbors per cluster" figure.
+    long long int calculated_forces;
+    // Neighbor-list entries processed; multiplied by CLUSTER_M * CLUSTER_N to
+    // report the number of computed pair interactions.
+    long long int num_neighs;
+    // Reported as "Total number of SIMD iterations".
+    long long int force_iters;
+    // The four counters below are printed only for USE_REFERENCE_VERSION builds.
+    long long int atoms_within_cutoff;
+    long long int atoms_outside_cutoff;
+    long long int clusters_within_cutoff;
+    long long int clusters_outside_cutoff;
+} Stats;
+
+// Zeroes every counter in *s.
+void initStats(Stats *s);
+// Prints the statistics summary to stdout; a no-op unless COMPUTE_STATS is
+// defined. Only timer[FORCE] is read from the timer array.
+void displayStatistics(Atom *atom, Parameter *param, Stats *stats, double *timer);
+
+// Statistics helpers. With COMPUTE_STATS they expand to real statements,
+// otherwise to nothing, so call sites need no #ifdef of their own.
+// Caveats when enabled:
+//  - each expansion already ends in ';' and is not wrapped in do { } while(0),
+//    so avoid using them as the sole body of an unbraced if/else;
+//  - beginStatTimer() declares a local variable named Si that endStatTimer()
+//    reads, so both must appear in the same scope, at most once per scope;
+//  - getTimeStamp() is not declared by this header and must be visible at the
+//    expansion site.
+#ifdef COMPUTE_STATS
+#   define addStat(stat, value)     stat += value;
+#   define beginStatTimer()         double Si = getTimeStamp();
+#   define endStatTimer(stat)       stat += getTimeStamp() - Si;
+#else
+#   define addStat(stat, value)
+#   define beginStatTimer()
+#   define endStatTimer(stat)
+#endif
+
+#endif
--- a/src/clusterpair/tracing.c
+++ b/src/clusterpair/tracing.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <neighbor.h>
+#include <parameter.h>
+#include <atom.h>
+#include <tracing.h>
+
+// Emits memory-address and neighbor-index traces for one time step.
+// For every local atom the tracer macros from tracing.h record the reads of
+// its position (and type with EXPLICIT_TYPES), the neighbor indices, and the
+// reads performed for each neighbor.
+//   param    - used by TRACER_CONDITION (param->every in its default form)
+//   atom     - supplies Nlocal, Nghost and the per-atom data being traced
+//   neighbor - supplies neighbors, numneigh and maxneighs
+//   timestep - used by TRACER_CONDITION and in the trace file names
+// All tracer macros expand to nothing unless MEM_TRACER / INDEX_TRACER are
+// defined, in which case the function has no observable effect.
+void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timestep) {
+    MEM_TRACER_INIT;
+    INDEX_TRACER_INIT;
+    int Nlocal = atom->Nlocal;
+
+    INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
+    for(int i = 0; i < Nlocal; i++) {
+        int *neighs = &neighbor->neighbors[i * neighbor->maxneighs];
+        int numneighs = neighbor->numneigh[i];
+        MEM_TRACE(atom_x(i), 'R');
+        MEM_TRACE(atom_y(i), 'R');
+        MEM_TRACE(atom_z(i), 'R');
+        INDEX_TRACE_ATOM(i);
+
+        #ifdef EXPLICIT_TYPES
+        MEM_TRACE(atom->type[i], 'R');
+        #endif
+
+        // With INDEX_TRACER enabled, DIST_TRACE_SORT sorts each
+        // VECTOR_WIDTH-sized chunk of the neighbor list in place before the
+        // indices and their distances are written out.
+        DIST_TRACE_SORT(neighs, numneighs);
+        INDEX_TRACE(neighs, numneighs);
+        DIST_TRACE(neighs, numneighs);
+
+        for(int k = 0; k < numneighs; k++) {
+            int j = neighs[k];
+            // MEM_TRACE logs the address of its argument, so pass the list
+            // slot itself: passing the local copy j would log a stack address
+            // instead of the neighbor-list entry that is read.
+            MEM_TRACE(neighs[k], 'R');
+            MEM_TRACE(atom_x(j), 'R');
+            MEM_TRACE(atom_y(j), 'R');
+            MEM_TRACE(atom_z(j), 'R');
+
+            #ifdef EXPLICIT_TYPES
+            MEM_TRACE(atom->type[j], 'R');
+            #endif
+        }
+    }
+
+    INDEX_TRACER_END;
+    MEM_TRACER_END;
+}
--- a/src/clusterpair/tracing.h
+++ b/src/clusterpair/tracing.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <neighbor.h>
+#include <parameter.h>
+#include <atom.h>
+
+#if defined(MEM_TRACER) || defined(INDEX_TRACER)
+#include <stdio.h>
+#include <stdlib.h>
+#endif
+
+// Chunk size used by the index/distance tracers when no vector width is
+// supplied by the build.
+#ifndef VECTOR_WIDTH
+#   define VECTOR_WIDTH                 8
+#endif
+
+// Predicate deciding whether the current step is traced. The default form
+// expects variables named `timestep` and `param` to be in scope at every
+// expansion site and traces each step that is a multiple of param->every.
+#ifndef TRACER_CONDITION
+#   define TRACER_CONDITION                 (!(timestep % param->every))
+#endif
+
+// Memory-address tracer. With MEM_TRACER defined:
+//   MEM_TRACER_INIT      declares mem_tracer_fp and, on traced steps, opens
+//                        "mem_tracer_<timestep>.out" for writing
+//   MEM_TRACE(addr, op)  writes "<op>: <address of addr>" to that file
+//   MEM_TRACER_END       closes the file on traced steps
+// Without MEM_TRACER all three expand to nothing. The macros rely on
+// TRACER_CONDITION and on a variable named `timestep` at the expansion site.
+#ifdef MEM_TRACER
+#   define MEM_TRACER_INIT                  FILE *mem_tracer_fp; \
+                                            if(TRACER_CONDITION) { \
+                                                char mem_tracer_fn[128]; \
+                                                snprintf(mem_tracer_fn, sizeof mem_tracer_fn, "mem_tracer_%d.out", timestep); \
+                                                mem_tracer_fp = fopen(mem_tracer_fn, "w"); \
+                                            }
+
+#   define MEM_TRACER_END                   if(TRACER_CONDITION) { fclose(mem_tracer_fp); }
+#   define MEM_TRACE(addr, op)              if(TRACER_CONDITION) { fprintf(mem_tracer_fp, "%c: %p\n", op, (void *)(&(addr))); }
+#else
+#   define MEM_TRACER_INIT
+#   define MEM_TRACER_END
+#   define MEM_TRACE(addr, op)
+#endif
+
+// Neighbor-index tracer. With INDEX_TRACER defined the macros below write to
+// "index_tracer_<timestep>.out" on steps for which TRACER_CONDITION holds;
+// without it they all expand to nothing. They rely on a variable named
+// `timestep` at the expansion site and use identifiers prefixed with "__" as
+// loop temporaries.
+#ifdef INDEX_TRACER
+// Declares index_tracer_fp and opens the per-step output file.
+#   define INDEX_TRACER_INIT                FILE *index_tracer_fp; \
+                                            if(TRACER_CONDITION) { \
+                                                char index_tracer_fn[128]; \
+                                                snprintf(index_tracer_fn, sizeof index_tracer_fn, "index_tracer_%d.out", timestep); \
+                                                index_tracer_fp = fopen(index_tracer_fn, "w"); \
+                                            }
+
+// Closes the per-step output file.
+#   define INDEX_TRACER_END                 if(TRACER_CONDITION) { fclose(index_tracer_fp); }
+// Writes the "N:" record with the three counts passed by the caller.
+#   define INDEX_TRACE_NATOMS(nl, ng, mn)   if(TRACER_CONDITION) { fprintf(index_tracer_fp, "N: %d %d %d\n", nl, ng, mn); }
+// Writes the "A:" record that starts the data of atom a.
+#   define INDEX_TRACE_ATOM(a)              if(TRACER_CONDITION) { fprintf(index_tracer_fp, "A: %d\n", a); }
+// Writes the first e entries of list l as "I:" records, one record per chunk
+// of at most VECTOR_WIDTH entries.
+#   define INDEX_TRACE(l, e)                if(TRACER_CONDITION) { \
+                                                for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
+                                                    int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
+                                                    fprintf(index_tracer_fp, "I: "); \
+                                                    for(int __j = 0; __j < __e; ++__j) { \
+                                                        fprintf(index_tracer_fp, "%d ", l[__i + __j]); \
+                                                    } \
+                                                    fprintf(index_tracer_fp, "\n"); \
+                                                } \
+                                            }
+
+// Bubble-sorts every chunk of at most VECTOR_WIDTH entries of l into ascending
+// order. The sort is done IN PLACE, i.e. it modifies the caller's list.
+#   define DIST_TRACE_SORT(l, e)            if(TRACER_CONDITION) { \
+                                                for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
+                                                    int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
+                                                    if(__e > 1) { \
+                                                        for(int __j = __i; __j < __i + __e - 1; ++__j) { \
+                                                            for(int __k = __i; __k < __i + __e - (__j - __i) - 1; ++__k) { \
+                                                                if(l[__k] > l[__k + 1]) { \
+                                                                    int __t = l[__k]; \
+                                                                    l[__k] = l[__k + 1]; \
+                                                                    l[__k + 1] = __t; \
+                                                                } \
+                                                            } \
+                                                        } \
+                                                    } \
+                                                } \
+                                            }
+
+// For every chunk with more than one entry, writes a "D:" record holding the
+// absolute differences between consecutive entries of that chunk.
+#   define DIST_TRACE(l, e)                 if(TRACER_CONDITION) { \
+                                                for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
+                                                    int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
+                                                    if(__e > 1) { \
+                                                        fprintf(index_tracer_fp, "D: "); \
+                                                        for(int __j = 0; __j < __e - 1; ++__j) { \
+                                                            int __dist = abs(l[__i + __j + 1] - l[__i + __j]); \
+                                                            fprintf(index_tracer_fp, "%d ", __dist); \
+                                                        } \
+                                                        fprintf(index_tracer_fp, "\n"); \
+                                                    } \
+                                                } \
+                                            }
+#else
+#   define INDEX_TRACER_INIT
+#   define INDEX_TRACER_END
+#   define INDEX_TRACE_NATOMS(nl, ng, mn)
+#   define INDEX_TRACE_ATOM(a)
+#   define INDEX_TRACE(l, e)
+#   define DIST_TRACE_SORT(l, e)
+#   define DIST_TRACE(l, e)
+#endif
+
+// Writes the memory/index traces for one time step; has no observable effect
+// when neither MEM_TRACER nor INDEX_TRACER is defined.
+extern void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timestep);
--- a/src/clusterpair/vtk.c
+++ b/src/clusterpair/vtk.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <atom.h>
+#include <vtk.h>
+
+// Dumps one time step as four VTK files sharing the prefix `filename`:
+// local atoms, ghost atoms, local cluster edges and ghost cluster edges.
+// The status codes of the individual writers are intentionally discarded.
+void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep) {
+    // Atom positions.
+    (void) write_local_atoms_to_vtk_file(filename, atom, timestep);
+    (void) write_ghost_atoms_to_vtk_file(filename, atom, timestep);
+
+    // Cluster outlines.
+    (void) write_local_cluster_edges_to_vtk_file(filename, atom, timestep);
+    (void) write_ghost_cluster_edges_to_vtk_file(filename, atom, timestep);
+}
+
+// Writes the local atoms of one time step to "<filename>_local_<timestep>.vtk"
+// as an ASCII unstructured grid with one single-point cell per atom and a
+// constant scalar "mass" of 1.0.
+// Returns 0 on success and -1 if the file cannot be opened.
+int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
+    char path[128];
+    snprintf(path, sizeof path, "%s_local_%d.vtk", filename, timestep);
+
+    FILE* out = fopen(path, "wb");
+    if(out == NULL) {
+        fprintf(stderr, "Could not open VTK file for writing!\n");
+        return -1;
+    }
+
+    const int npoints = atom->Nlocal;
+
+    fprintf(out, "# vtk DataFile Version 2.0\n");
+    fprintf(out, "Particle data\n");
+    fprintf(out, "ASCII\n");
+    fprintf(out, "DATASET UNSTRUCTURED_GRID\n");
+
+    // Coordinates, cluster by cluster.
+    fprintf(out, "POINTS %d double\n", npoints);
+    for(int c = 0; c < atom->Nclusters_local; ++c) {
+        MD_FLOAT *pos = &atom->cl_x[CI_VECTOR_BASE_INDEX(c)];
+        const int natoms = atom->iclusters[c].natoms;
+        for(int a = 0; a < natoms; ++a) {
+            fprintf(out, "%.4f %.4f %.4f\n", pos[CL_X_OFFSET + a], pos[CL_Y_OFFSET + a], pos[CL_Z_OFFSET + a]);
+        }
+    }
+    fprintf(out, "\n\n");
+
+    // One cell per point: "1 <point index>".
+    fprintf(out, "CELLS %d %d\n", npoints, npoints * 2);
+    for(int p = 0; p < npoints; ++p) {
+        fprintf(out, "1 %d\n", p);
+    }
+    fprintf(out, "\n\n");
+
+    fprintf(out, "CELL_TYPES %d\n", npoints);
+    for(int p = 0; p < npoints; ++p) {
+        fprintf(out, "1\n");
+    }
+    fprintf(out, "\n\n");
+
+    fprintf(out, "POINT_DATA %d\n", npoints);
+    fprintf(out, "SCALARS mass double\n");
+    fprintf(out, "LOOKUP_TABLE default\n");
+    for(int p = 0; p < npoints; ++p) {
+        fprintf(out, "1.0\n");
+    }
+    fprintf(out, "\n\n");
+
+    fclose(out);
+    return 0;
+}
+
+// Writes the ghost atoms of one time step to "<filename>_ghost_<timestep>.vtk"
+// as an ASCII unstructured grid with one single-point cell per atom and a
+// constant scalar "mass" of 1.0. Coordinates are taken from the clusters with
+// indices [Nclusters_local, Nclusters_local + Nclusters_ghost).
+// Returns 0 on success and -1 if the file cannot be opened.
+int write_ghost_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
+    char path[128];
+    snprintf(path, sizeof path, "%s_ghost_%d.vtk", filename, timestep);
+
+    FILE* out = fopen(path, "wb");
+    if(out == NULL) {
+        fprintf(stderr, "Could not open VTK file for writing!\n");
+        return -1;
+    }
+
+    const int npoints = atom->Nghost;
+    const int first_ghost = atom->Nclusters_local;
+    const int end_ghost = atom->Nclusters_local + atom->Nclusters_ghost;
+
+    fprintf(out, "# vtk DataFile Version 2.0\n");
+    fprintf(out, "Particle data\n");
+    fprintf(out, "ASCII\n");
+    fprintf(out, "DATASET UNSTRUCTURED_GRID\n");
+
+    // Coordinates, cluster by cluster.
+    fprintf(out, "POINTS %d double\n", npoints);
+    for(int c = first_ghost; c < end_ghost; ++c) {
+        MD_FLOAT *pos = &atom->cl_x[CI_VECTOR_BASE_INDEX(c)];
+        const int natoms = atom->iclusters[c].natoms;
+        for(int a = 0; a < natoms; ++a) {
+            fprintf(out, "%.4f %.4f %.4f\n", pos[CL_X_OFFSET + a], pos[CL_Y_OFFSET + a], pos[CL_Z_OFFSET + a]);
+        }
+    }
+    fprintf(out, "\n\n");
+
+    // One cell per point: "1 <point index>".
+    fprintf(out, "CELLS %d %d\n", npoints, npoints * 2);
+    for(int p = 0; p < npoints; ++p) {
+        fprintf(out, "1 %d\n", p);
+    }
+    fprintf(out, "\n\n");
+
+    fprintf(out, "CELL_TYPES %d\n", npoints);
+    for(int p = 0; p < npoints; ++p) {
+        fprintf(out, "1\n");
+    }
+    fprintf(out, "\n\n");
+
+    fprintf(out, "POINT_DATA %d\n", npoints);
+    fprintf(out, "SCALARS mass double\n");
+    fprintf(out, "LOOKUP_TABLE default\n");
+    for(int p = 0; p < npoints; ++p) {
+        fprintf(out, "1.0\n");
+    }
+    fprintf(out, "\n\n");
+
+    fclose(out);
+    return 0;
+}
+
+// Writes the local clusters of one time step to
+// "<filename>_local_edges_<timestep>.vtk" as ASCII polydata: the atom
+// positions followed by one LINES entry per cluster that lists the cluster's
+// points in the order they were written.
+// Returns 0 on success and -1 if the file cannot be opened.
+int write_local_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep) {
+    char path[128];
+    snprintf(path, sizeof path, "%s_local_edges_%d.vtk", filename, timestep);
+
+    FILE* out = fopen(path, "wb");
+    if(out == NULL) {
+        fprintf(stderr, "Could not open VTK file for writing!\n");
+        return -1;
+    }
+
+    const int nclusters = atom->Nclusters_local;
+    int points_written = 0;
+
+    fprintf(out, "# vtk DataFile Version 2.0\n");
+    fprintf(out, "Particle data\n");
+    fprintf(out, "ASCII\n");
+    fprintf(out, "DATASET POLYDATA\n");
+
+    fprintf(out, "POINTS %d double\n", atom->Nlocal);
+    for(int c = 0; c < nclusters; ++c) {
+        MD_FLOAT *pos = &atom->cl_x[CI_VECTOR_BASE_INDEX(c)];
+        const int natoms = atom->iclusters[c].natoms;
+        for(int a = 0; a < natoms; ++a) {
+            fprintf(out, "%.4f %.4f %.4f\n", pos[CL_X_OFFSET + a], pos[CL_Y_OFFSET + a], pos[CL_Z_OFFSET + a]);
+        }
+
+        points_written += natoms;
+    }
+    fprintf(out, "\n\n");
+
+    // Each LINES entry is "<natoms> <id> <id> ...", so the total size is one
+    // count per cluster plus one id per written point.
+    fprintf(out, "LINES %d %d\n", nclusters, nclusters + points_written);
+    int point_id = 0;
+    for(int c = 0; c < nclusters; ++c) {
+        const int natoms = atom->iclusters[c].natoms;
+        fprintf(out, "%d ", natoms);
+        for(int a = 0; a < natoms; ++a) {
+            fprintf(out, "%d ", point_id++);
+        }
+
+        fprintf(out, "\n");
+    }
+    fprintf(out, "\n\n");
+
+    fclose(out);
+    return 0;
+}
+
+// Writes the ghost clusters of one time step to
+// "<filename>_ghost_edges_<timestep>.vtk" as ASCII polydata: the atom
+// positions followed by one LINES entry per cluster that lists the cluster's
+// points in the order they were written. The clusters visited are those with
+// indices [Nclusters_local, Nclusters_local + Nclusters_ghost).
+// Returns 0 on success and -1 if the file cannot be opened.
+int write_ghost_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep) {
+    char path[128];
+    snprintf(path, sizeof path, "%s_ghost_edges_%d.vtk", filename, timestep);
+
+    FILE* out = fopen(path, "wb");
+    if(out == NULL) {
+        fprintf(stderr, "Could not open VTK file for writing!\n");
+        return -1;
+    }
+
+    const int first_ghost = atom->Nclusters_local;
+    const int end_ghost = atom->Nclusters_local + atom->Nclusters_ghost;
+    int points_written = 0;
+
+    fprintf(out, "# vtk DataFile Version 2.0\n");
+    fprintf(out, "Particle data\n");
+    fprintf(out, "ASCII\n");
+    fprintf(out, "DATASET POLYDATA\n");
+
+    fprintf(out, "POINTS %d double\n", atom->Nghost);
+    for(int c = first_ghost; c < end_ghost; ++c) {
+        MD_FLOAT *pos = &atom->cl_x[CI_VECTOR_BASE_INDEX(c)];
+        const int natoms = atom->iclusters[c].natoms;
+        for(int a = 0; a < natoms; ++a) {
+            fprintf(out, "%.4f %.4f %.4f\n", pos[CL_X_OFFSET + a], pos[CL_Y_OFFSET + a], pos[CL_Z_OFFSET + a]);
+        }
+
+        points_written += natoms;
+    }
+    fprintf(out, "\n\n");
+
+    // Each LINES entry is "<natoms> <id> <id> ...", so the total size is one
+    // count per cluster plus one id per written point.
+    fprintf(out, "LINES %d %d\n", atom->Nclusters_ghost, atom->Nclusters_ghost + points_written);
+    int point_id = 0;
+    for(int c = first_ghost; c < end_ghost; ++c) {
+        const int natoms = atom->iclusters[c].natoms;
+        fprintf(out, "%d ", natoms);
+        for(int a = 0; a < natoms; ++a) {
+            fprintf(out, "%d ", point_id++);
+        }
+
+        fprintf(out, "\n");
+    }
+    fprintf(out, "\n\n");
+
+    fclose(out);
+    return 0;
+}
--- a/src/clusterpair/vtk.h
+++ b/src/clusterpair/vtk.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <atom.h>
+
+#ifndef __VTK_H_
+#define __VTK_H_
+// VTK output of the cluster-pair variant. `filename` is a prefix; every writer
+// appends its own suffix plus "_<timestep>.vtk".
+
+// Calls the four writers below and ignores their return values.
+extern void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep);
+// The individual writers return 0 on success and -1 if the output file cannot
+// be opened.
+extern int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
+extern int write_ghost_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
+extern int write_local_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep);
+extern int write_ghost_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep);
+#endif
--- a/src/clusterpair/xtc.c
+++ b/src/clusterpair/xtc.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdlib.h>
+//---
+#include <atom.h>
+#include <allocate.h>
+#include <xtc.h>
+
+#ifdef XTC_OUTPUT
+#include <gromacs/fileio/xtcio.h>
+
+// Module state shared by xtc_init / xtc_write / xtc_end.
+// Handle returned by open_xtc().
+static struct t_fileio *xtc_file = NULL;
+// Staging buffer for positions, allocated in xtc_init() with Nlocal + 1 entries.
+static rvec *x_buf = NULL;
+// Box matrix handed to write_xtc(); xtc_init() sets it to the identity.
+static rvec basis[3];
+
+// Opens `filename` for XTC output, sets the box matrix to the identity,
+// allocates the position staging buffer (Nlocal + 1 entries) and writes the
+// first frame for `timestep`.
+void xtc_init(const char *filename, Atom *atom, int timestep) {
+    const int axis[3] = { XX, YY, ZZ };
+
+    // Identity matrix: 1.0 on the diagonal, 0.0 elsewhere.
+    for(int row = 0; row < 3; ++row) {
+        for(int col = 0; col < 3; ++col) {
+            basis[row][axis[col]] = (row == col) ? 1.0 : 0.0;
+        }
+    }
+
+    xtc_file = open_xtc(filename, "w");
+
+    const size_t nentries = atom->Nlocal + 1;
+    x_buf = (rvec *) allocate(ALIGNMENT, sizeof(rvec) * nentries);
+
+    xtc_write(atom, timestep, 1, 1);
+}
+
+// Appends one frame to the open XTC file.
+// The positions of all local atoms are gathered from the clustered layout
+// (cl_x) into x_buf in cluster order and then handed to write_xtc().
+//   atom      - supplies Nlocal, Nclusters_local, cl_x and iclusters
+//   timestep  - step number stored with the frame
+//   write_pos, write_vel - accepted for interface compatibility, not used
+// Must be called after xtc_init(), which allocates x_buf and opens the file.
+void xtc_write(Atom *atom, int timestep, int write_pos, int write_vel) {
+    int i = 0;
+    for(int ci = 0; ci < atom->Nclusters_local; ++ci) {
+        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
+        MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
+        // Per-cluster atom counts are kept in iclusters, the array that
+        // initAtom() sets up and the VTK writers read.
+        for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
+            x_buf[i][XX] = ci_x[CL_X_OFFSET + cii];
+            x_buf[i][YY] = ci_x[CL_Y_OFFSET + cii];
+            x_buf[i][ZZ] = ci_x[CL_Z_OFFSET + cii];
+            i++;
+        }
+    }
+
+    // NOTE(review): 0.0 and 1000 are presumably the frame time and the XTC
+    // precision; confirm against the GROMACS write_xtc signature.
+    write_xtc(xtc_file, atom->Nlocal, timestep, 0.0, (const rvec *) basis, (const rvec *) x_buf, 1000);
+}
+
+// Releases the staging buffer and closes the XTC file opened by xtc_init().
+// Both module-level handles are reset afterwards so that they do not dangle.
+void xtc_end() {
+    free(x_buf);
+    x_buf = NULL;
+
+    // Guarded so that a repeated call never passes an already closed handle.
+    if(xtc_file != NULL) {
+        close_xtc(xtc_file);
+        xtc_file = NULL;
+    }
+}
+#endif
--- a/src/clusterpair/xtc.h
+++ b/src/clusterpair/xtc.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <atom.h>
+
+#ifndef __XTC_H_
+#define __XTC_H_
+
+// XTC trajectory output. Real functions exist only when XTC_OUTPUT is
+// defined; otherwise the names are macros that expand to nothing, so call
+// sites need no #ifdef (their arguments are then not evaluated).
+#ifdef XTC_OUTPUT
+// (filename, atom, timestep): opens the file and writes the first frame.
+void xtc_init(const char *, Atom*, int);
+// (atom, timestep, write_pos, write_vel): appends one frame.
+void xtc_write(Atom*, int, int, int);
+// Frees the staging buffer and closes the file.
+void xtc_end();
+#else
+#define xtc_init(a,b,c)
+#define xtc_write(a,b,c,d)
+#define xtc_end()
+#endif
+#endif
--- a/src/common/allocate.c
+++ b/src/common/allocate.c
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <util.h>
+
+// Allocates `bytesize` bytes aligned to `alignment` via posix_memalign().
+//   alignment - requested alignment in bytes; posix_memalign() requires a
+//               power of two that is a multiple of sizeof(void *)
+//   bytesize  - number of bytes to allocate
+// Returns a non-NULL pointer that must be released with free().
+// Never returns on failure: an error is printed to stderr and the process
+// exits with EXIT_FAILURE.
+void *allocate(int alignment, size_t bytesize) {
+    // Start from NULL so the final check never reads an indeterminate value
+    // when posix_memalign() fails without storing a result.
+    void *ptr = NULL;
+    int errorCode = posix_memalign(&ptr, alignment, bytesize);
+
+    if(errorCode == EINVAL) {
+        fprintf(stderr, "Error: Alignment parameter is not a power of two\n");
+        exit(EXIT_FAILURE);
+    }
+
+    if(errorCode == ENOMEM) {
+        fprintf(stderr, "Error: Insufficient memory to fulfill the request\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // Any other non-zero code, or a NULL result, is treated as fatal as well.
+    if(errorCode != 0 || ptr == NULL) {
+        fprintf(stderr, "Error: posix_memalign failed!\n");
+        exit(EXIT_FAILURE);
+    }
+
+    return ptr;
+}
+
+// Replaces an aligned allocation with a new one of `new_bytesize` bytes.
+//   ptr          - block obtained from allocate(), or NULL
+//   alignment    - alignment of the new block (see allocate())
+//   new_bytesize - size of the new block
+//   old_bytesize - number of valid bytes in the old block
+// The old contents are copied into the new block and the old block is freed;
+// if `ptr` is NULL the new block is returned uninitialised. Failure to
+// allocate terminates the process inside allocate().
+void *reallocate(void* ptr, int alignment, size_t new_bytesize, size_t old_bytesize) {
+    void *newarray = allocate(alignment, new_bytesize);
+    if(ptr != NULL) {
+        // Never copy more than the new block can hold (shrinking case).
+        size_t copy_bytes = (old_bytesize < new_bytesize) ? old_bytesize : new_bytesize;
+        memcpy(newarray, ptr, copy_bytes);
+        free(ptr);
+    }
+
+    return newarray;
+}
--- a/src/common/allocate.h
+++ b/src/common/allocate.h
@@ -0,0 +1,13 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdlib.h>
+
+#ifndef __ALLOCATE_H_
+#define __ALLOCATE_H_
+// Aligned allocation via posix_memalign(); prints an error and exits the
+// process on failure. Release the result with free().
+extern void* allocate (int alignment, size_t bytesize);
+// Allocates a new aligned block, copies oldBytesize bytes from ptr into it
+// (when ptr is not NULL), frees ptr and returns the new block.
+extern void* reallocate (void* ptr, int alignment, size_t newBytesize, size_t oldBytesize);
+#endif
--- a/src/common/device.c
+++ b/src/common/device.c
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+//---
+#include <device.h>
+
+#ifdef CUDA_TARGET
+#include <cuda_runtime.h>
+
+// Aborts the program when a CUDA call failed.
+//   label - text identifying the failing call site
+//   err   - status returned by the CUDA runtime
+// On failure the label and the CUDA error string are printed to stdout and
+// the process exits with status -1; on cudaSuccess nothing happens.
+void cuda_assert(const char *label, cudaError_t err) {
+    if (err == cudaSuccess) {
+        return;
+    }
+
+    printf("[CUDA Error]: %s: %s\r\n", label, cudaGetErrorString(err));
+    exit(-1);
+}
+
+// Allocates `bytesize` bytes with cudaMallocHost() when CUDA_HOST_MEMORY is
+// defined and with cudaMalloc() otherwise. A failed allocation terminates the
+// program through cuda_assert().
+void *allocateGPU(size_t bytesize) {
+    void *buffer;
+    cudaError_t status;
+
+    #ifdef CUDA_HOST_MEMORY
+    status = cudaMallocHost((void **) &buffer, bytesize);
+    #else
+    status = cudaMalloc((void **) &buffer, bytesize);
+    #endif
+
+    cuda_assert("allocateGPU", status);
+    return buffer;
+}
+
+// Replaces a buffer obtained from allocateGPU() with a fresh one of
+// `new_bytesize` bytes. The old contents are NOT preserved: the old buffer
+// (if any) is released first, and the status of that release is not checked.
+void *reallocateGPU(void *ptr, size_t new_bytesize) {
+    if(ptr == NULL) {
+        return allocateGPU(new_bytesize);
+    }
+
+    #ifdef CUDA_HOST_MEMORY
+    cudaFreeHost(ptr);
+    #else
+    cudaFree(ptr);
+    #endif
+
+    return allocateGPU(new_bytesize);
+}
+
+// Copies `bytesize` bytes from h_ptr to d_ptr with cudaMemcpyHostToDevice and
+// aborts via cuda_assert() on failure. Compiles to a no-op when
+// CUDA_HOST_MEMORY is defined.
+void memcpyToGPU(void *d_ptr, void *h_ptr, size_t bytesize) {
+    #ifndef CUDA_HOST_MEMORY
+    cudaError_t status = cudaMemcpy(d_ptr, h_ptr, bytesize, cudaMemcpyHostToDevice);
+    cuda_assert("memcpyToGPU", status);
+    #endif
+}
+
+// Copies `bytesize` bytes from d_ptr to h_ptr with cudaMemcpyDeviceToHost and
+// aborts via cuda_assert() on failure. Compiles to a no-op when
+// CUDA_HOST_MEMORY is defined.
+void memcpyFromGPU(void *h_ptr, void *d_ptr, size_t bytesize) {
+    #ifndef CUDA_HOST_MEMORY
+    cudaError_t status = cudaMemcpy(h_ptr, d_ptr, bytesize, cudaMemcpyDeviceToHost);
+    cuda_assert("memcpyFromGPU", status);
+    #endif
+}
+
+// Fills `bytesize` bytes at d_ptr with `value` using cudaMemset() and aborts
+// via cuda_assert() on failure.
+void memsetGPU(void *d_ptr, int value, size_t bytesize) {
+    cudaError_t status = cudaMemset(d_ptr, value, bytesize);
+    cuda_assert("memsetGPU", status);
+}
+
+#else
+// Builds without CUDA_TARGET: the device API is reduced to stubs so that
+// callers link unchanged. The allocation stubs return NULL and every other
+// stub does nothing.
+void initDevice(Atom *atom, Neighbor *neighbor) {
+}
+
+void *allocateGPU(size_t bytesize) {
+    return NULL;
+}
+
+void *reallocateGPU(void *ptr, size_t new_bytesize) {
+    return NULL;
+}
+
+void memcpyToGPU(void *d_ptr, void *h_ptr, size_t bytesize) {
+}
+
+void memcpyFromGPU(void *h_ptr, void *d_ptr, size_t bytesize) {
+}
+
+void memsetGPU(void *d_ptr, int value, size_t bytesize) {
+}
+#endif
--- a/src/common/device.h
+++ b/src/common/device.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stddef.h>
+//---
+#include <atom.h>
+#include <neighbor.h>
+
+#ifndef __DEVICE_H_
+#define __DEVICE_H_
+
+// Device (GPU) memory helpers. device.c provides CUDA implementations when
+// CUDA_TARGET is defined and stubs otherwise (the allocation stubs return
+// NULL, the others do nothing).
+#ifdef CUDA_TARGET
+#include <cuda_runtime.h>
+// Prints msg plus the CUDA error string and exits if err != cudaSuccess.
+extern void cuda_assert(const char *msg, cudaError_t err);
+#endif
+
+// NOTE(review): device.c only defines the non-CUDA stub of initDevice; the
+// CUDA implementation is expected to live elsewhere - confirm.
+extern void initDevice(Atom*, Neighbor*);
+// Allocates bytesize bytes (cudaMallocHost with CUDA_HOST_MEMORY, cudaMalloc
+// otherwise).
+extern void *allocateGPU(size_t bytesize);
+// Frees ptr (if not NULL) and allocates a new buffer; contents are not kept.
+extern void *reallocateGPU(void *ptr, size_t new_bytesize);
+// Host-to-device and device-to-host copies; both compile to no-ops when
+// CUDA_HOST_MEMORY is defined.
+extern void memcpyToGPU(void *d_ptr, void *h_ptr, size_t bytesize);
+extern void memcpyFromGPU(void *h_ptr, void *d_ptr, size_t bytesize);
+// Fills bytesize bytes at d_ptr with value.
+extern void memsetGPU(void *d_ptr, int value, size_t bytesize);
+#endif
--- a/src/common/eam.h
+++ b/src/common/eam.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdio.h>
+
+#include <atom.h>
+#include <parameter.h>
+
+#ifndef __EAM_H_
+#define __EAM_H_
+// Raw contents of an EAM potential file as filled in by read_eam_file().
+typedef struct {
+    // Table sizes read from the file header; the arrays below are allocated
+    // with nrho + 1 (frho) and nr + 1 (rhor, zr) entries.
+    int nrho, nr;
+    // Header values read from the file; coeff() copies `mass` to param->mass
+    // and `cut` to param->cutforce.
+    MD_FLOAT drho, dr, cut, mass;
+    // Tabulated functions from the file.
+    MD_FLOAT *frho, *rhor, *zr;
+} Funcfl;
+
+// State of the EAM potential.
+// NOTE(review): only fp, nmax and file are touched by code visible next to
+// this header (initEam/coeff); the remaining fields are presumably filled by
+// file2array()/array2spline() - confirm against their definitions.
+typedef struct {
+    // Set to NULL by initEam().
+    MD_FLOAT* fp;
+    // Set to 0 by initEam().
+    int nmax;
+    int nrho, nr;
+    int nrho_tot, nr_tot;
+    MD_FLOAT dr, rdr, drho, rdrho;
+    MD_FLOAT *frho, *rhor, *z2r;
+    MD_FLOAT *rhor_spline, *frho_spline, *z2r_spline;
+    // Raw potential file contents, read by coeff() via read_eam_file().
+    Funcfl file;
+} Eam;
+
+// Resets eam->nmax / eam->fp, then runs coeff() and init_style().
+void initEam(Eam* eam, Parameter* param);
+// Reads param->eam_file into eam->file and overwrites the simulation
+// parameters mass, cutforce, cutneigh, temp, dt, rho and dtforce.
+void coeff(Eam* eam, Parameter* param);
+// Converts the file data to arrays (file2array) and splines them (array2spline).
+void init_style(Eam* eam, Parameter *param);
+// Opens and parses an EAM potential file; exits the program if the file
+// cannot be opened.
+void read_eam_file(Funcfl* file, const char* filename);
+// NOTE(review): the four helpers below are defined outside the visible part
+// of eam_utils.c; see their definitions for the exact contracts.
+void file2array(Eam* eam);
+void array2spline(Eam* eam, Parameter* param);
+void interpolate(int n, MD_FLOAT delta, MD_FLOAT* f, MD_FLOAT* spline);
+void grab(FILE* fptr, int n, MD_FLOAT* list);
+#endif
--- a/src/common/eam_utils.c
+++ b/src/common/eam_utils.c
@@ -0,0 +1,269 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <allocate.h>
+#include <atom.h>
+#include <eam.h>
+#include <parameter.h>
+#include <util.h>
+
+#ifndef MAXLINE
+#define MAXLINE 4096
+#endif
+
+/*
+ * Initialize the EAM state.
+ *
+ * Resets eam->nmax and eam->fp, reads the potential file named by
+ * param->eam_file (coeff) and builds the interpolation/spline tables
+ * (init_style). Note that coeff() overwrites several fields of param.
+ */
+void initEam(Eam* eam, Parameter* param) {
+    eam->nmax = 0;
+    eam->fp = NULL;
+    coeff(eam, param);
+    init_style(eam, param);
+}
+
+/*
+ * Load the potential file and derive the simulation parameters from it.
+ * Mass and force cutoff come from the file; the remaining values are the
+ * fixed settings used for EAM runs.
+ */
+void coeff(Eam* eam, Parameter* param) {
+    Funcfl* potential = &eam->file;
+
+    read_eam_file(potential, param->eam_file);
+
+    // values taken from the potential file
+    param->mass = potential->mass;
+    param->cutforce = potential->cut;
+    param->cutneigh = param->cutforce + 1.0;
+
+    // fixed settings
+    param->dt = 0.001;
+    param->temp = 600.0;
+    param->rho = 0.07041125;
+
+    // depends on dt and mass set above
+    param->dtforce = 0.5 * param->dt / param->mass;
+}
+
+/*
+ * Build the derived EAM tables from the data read by coeff().
+ * file2array() must run first: array2spline() reads the grids and arrays it sets up.
+ */
+void init_style(Eam* eam, Parameter* param) {
+    // convert read-in file(s) to arrays and spline them
+    file2array(eam);
+    array2spline(eam, param);
+}
+
+/*
+ * Read a single-element EAM potential file into `file`.
+ *
+ * Layout as consumed here: line 1 is skipped, line 2 holds an integer and the
+ * mass, line 3 holds nrho, drho, nr, dr and the cutoff, followed by nrho
+ * values for frho and nr values each for zr and rhor.
+ * The tables are allocated with one extra entry and shifted so that the first
+ * value is stored at index 1.
+ *
+ * Terminates the program with a non-zero status if the file cannot be opened
+ * or the header cannot be parsed.
+ */
+void read_eam_file(Funcfl* file, const char* filename) {
+    FILE* fptr;
+    char line[MAXLINE];
+
+    if(filename == NULL) {
+        fprintf(stderr, "No EAM Potential file specified\n");
+        exit(-1);
+    }
+
+    fptr = fopen(filename, "r");
+    if(fptr == NULL) {
+        fprintf(stderr, "Can't open EAM Potential file: %s\n", filename);
+        exit(-1);
+    }
+
+    // "%lg" requires a double*, but MD_FLOAT may be float: parse into double
+    // temporaries and convert on assignment.
+    int tmp;
+    double mass, drho, dr, cut;
+
+    readline(line, fptr);
+    readline(line, fptr);
+    if(sscanf(line, "%d %lg", &tmp, &mass) != 2) {
+        fprintf(stderr, "Invalid header in EAM Potential file: %s\n", filename);
+        exit(-1);
+    }
+
+    readline(line, fptr);
+    if(sscanf(line, "%d %lg %d %lg %lg", &file->nrho, &drho, &file->nr, &dr, &cut) != 5) {
+        fprintf(stderr, "Invalid header in EAM Potential file: %s\n", filename);
+        exit(-1);
+    }
+
+    file->mass = mass;
+    file->drho = drho;
+    file->dr = dr;
+    file->cut = cut;
+
+    file->frho = (MD_FLOAT *) allocate(ALIGNMENT, (file->nrho + 1) * sizeof(MD_FLOAT));
+    file->rhor = (MD_FLOAT *) allocate(ALIGNMENT, (file->nr + 1) * sizeof(MD_FLOAT));
+    file->zr = (MD_FLOAT *) allocate(ALIGNMENT, (file->nr + 1) * sizeof(MD_FLOAT));
+    grab(fptr, file->nrho, file->frho);
+    grab(fptr, file->nr, file->zr);
+    grab(fptr, file->nr, file->rhor);
+
+    // shift the tables by one entry: values are addressed starting at index 1
+    for(int i = file->nrho; i > 0; i--) file->frho[i] = file->frho[i - 1];
+    for(int i = file->nr; i > 0; i--) file->rhor[i] = file->rhor[i - 1];
+    for(int i = file->nr; i > 0; i--) file->zr[i] = file->zr[i - 1];
+    fclose(fptr);
+}
+
+/*
+ * Evaluate a tabulated function at position r with the four-point
+ * interpolation formula used for all EAM tables.
+ * f is addressed from index 1 and holds n points with spacing delta.
+ */
+static double interpolateFuncfl(const MD_FLOAT* f, int n, MD_FLOAT delta, double r) {
+    const double sixth = 1.0 / 6.0;
+    double p = r / delta + 1.0;
+    int k = (int)(p);
+
+    // clamp k so that f[k - 1] .. f[k + 2] stay inside the table
+    k = MIN(k, n - 2);
+    k = MAX(k, 2);
+    p -= k;
+    p = MIN(p, 2.0);
+
+    double cof1 = -sixth * p * (p - 1.0) * (p - 2.0);
+    double cof2 = 0.5 * (p * p - 1.0) * (p - 2.0);
+    double cof3 = -0.5 * p * (p + 1.0) * (p - 2.0);
+    double cof4 = sixth * p * (p * p - 1.0);
+
+    return cof1 * f[k - 1] + cof2 * f[k] +
+           cof3 * f[k + 1] + cof4 * f[k + 2];
+}
+
+/*
+ * Convert the tables read from the potential file into the arrays
+ * eam->frho, eam->rhor and eam->z2r (each addressed from index 1).
+ *
+ * Also sets the grid spacings (eam->dr, eam->drho) and the grid sizes
+ * (eam->nr, eam->nrho). Only a single potential file is supported.
+ */
+void file2array(Eam* eam) {
+    Funcfl* file = &eam->file;
+    double rmax, rhomax;
+    double r;
+    int m;
+
+    // determine max function params from the potential file
+    eam->dr = eam->drho = rmax = rhomax = 0.0;
+    eam->dr = MAX(eam->dr, file->dr);
+    eam->drho = MAX(eam->drho, file->drho);
+    rmax = MAX(rmax, (file->nr - 1) * file->dr);
+    rhomax = MAX(rhomax, (file->nrho - 1) * file->drho);
+
+    // set nr,nrho from cutoff and spacings
+    // 0.5 is for round-off in divide
+    eam->nr = (int)(rmax / eam->dr + 0.5);
+    eam->nrho = (int)(rhomax / eam->drho + 0.5);
+
+    // ------------------------------------------------------------------
+    // setup frho array: interpolate the file's frho to the common grid
+    // ------------------------------------------------------------------
+    eam->frho = (MD_FLOAT *) allocate(ALIGNMENT, (eam->nrho + 1) * sizeof(MD_FLOAT));
+
+    for(m = 1; m <= eam->nrho; m++) {
+        r = (m - 1) * eam->drho;
+        eam->frho[m] = interpolateFuncfl(file->frho, file->nrho, file->drho, r);
+    }
+
+    // ------------------------------------------------------------------
+    // setup rhor array: interpolate the file's rhor to the common grid
+    // ------------------------------------------------------------------
+    eam->rhor = (MD_FLOAT *) allocate(ALIGNMENT, (eam->nr + 1) * sizeof(MD_FLOAT));
+
+    for(m = 1; m <= eam->nr; m++) {
+        r = (m - 1) * eam->dr;
+        eam->rhor[m] = interpolateFuncfl(file->rhor, file->nr, file->dr, r);
+    }
+
+    // ------------------------------------------------------------------
+    // setup z2r array
+    // ------------------------------------------------------------------
+    eam->z2r = (MD_FLOAT *) allocate(ALIGNMENT, (eam->nr + 1) * sizeof(MD_FLOAT));
+
+    // z2r is the product of the interpolated zr of file i and file j;
+    // with a single potential file both factors are the same value
+    for(m = 1; m <= eam->nr; m++) {
+        r = (m - 1) * eam->dr;
+        double zr = interpolateFuncfl(file->zr, file->nr, file->dr, r);
+        eam->z2r[m] = 27.2 * 0.529 * zr * zr;
+    }
+}
+
+/*
+ * Allocate the spline tables and fill them from eam->frho, eam->rhor and
+ * eam->z2r. The table of the first type pair is computed by interpolate()
+ * and then copied to the slots of all other type pairs.
+ */
+void array2spline(Eam* eam, Parameter* param) {
+    const int ntypes = param->ntypes;
+    const int npairs = ntypes * ntypes;
+
+    eam->rdr = 1.0 / eam->dr;
+    eam->rdrho = 1.0 / eam->drho;
+
+    // 7 coefficients per grid point plus padding, rounded down to a multiple of 64
+    int nrho_padded = (eam->nrho + 1) * 7 + 64;
+    int nr_padded = (eam->nr + 1) * 7 + 64;
+    eam->nrho_tot = nrho_padded - nrho_padded % 64;
+    eam->nr_tot = nr_padded - nr_padded % 64;
+
+    eam->frho_spline = (MD_FLOAT *) allocate(ALIGNMENT, npairs * eam->nrho_tot * sizeof(MD_FLOAT));
+    eam->rhor_spline = (MD_FLOAT *) allocate(ALIGNMENT, npairs * eam->nr_tot * sizeof(MD_FLOAT));
+    eam->z2r_spline = (MD_FLOAT *) allocate(ALIGNMENT, npairs * eam->nr_tot * sizeof(MD_FLOAT));
+
+    interpolate(eam->nrho, eam->drho, eam->frho, eam->frho_spline);
+    interpolate(eam->nr, eam->dr, eam->rhor, eam->rhor_spline);
+    interpolate(eam->nr, eam->dr, eam->z2r, eam->z2r_spline);
+
+    // replicate the first table for the remaining type pairs
+    // (pair 0 already holds the data, so copying starts at pair 1)
+    for(int pair = 1; pair < npairs; pair++) {
+        MD_FLOAT* frho_dst = &eam->frho_spline[pair * eam->nrho_tot];
+        MD_FLOAT* rhor_dst = &eam->rhor_spline[pair * eam->nr_tot];
+        MD_FLOAT* z2r_dst = &eam->z2r_spline[pair * eam->nr_tot];
+
+        for(int idx = 0; idx < eam->nrho_tot; idx++) {
+            frho_dst[idx] = eam->frho_spline[idx];
+        }
+
+        for(int idx = 0; idx < eam->nr_tot; idx++) {
+            rhor_dst[idx] = eam->rhor_spline[idx];
+        }
+
+        for(int idx = 0; idx < eam->nr_tot; idx++) {
+            z2r_dst[idx] = eam->z2r_spline[idx];
+        }
+    }
+}
+
+/*
+ * Compute spline coefficients for the tabulated values f[1..n] with grid
+ * spacing delta. Each grid point m owns the 7 entries spline[m*7 + 0..6]:
+ *   [6]        the function value f[m]
+ *   [5]        a finite-difference slope estimate at point m
+ *   [4], [3]   the higher-order coefficients derived from values and slopes
+ *   [2..0]     entries [5], 2*[4] and 3*[3] divided by delta
+ * Entries of point 0 (spline[0..6]) are not written.
+ * NOTE(review): the indexing below appears to assume n >= 4 - confirm with callers.
+ */
+void interpolate(int n, MD_FLOAT delta, MD_FLOAT* f, MD_FLOAT* spline) {
+    for(int m = 1; m <= n; m++) spline[m * 7 + 6] = f[m];
+
+    // slopes at the two first and two last points use one-sided / central differences
+    spline[1 * 7 + 5] = spline[2 * 7 + 6] - spline[1 * 7 + 6];
+    spline[2 * 7 + 5] = 0.5 * (spline[3 * 7 + 6] - spline[1 * 7 + 6]);
+    spline[(n - 1) * 7 + 5] = 0.5 * (spline[n * 7 + 6] - spline[(n - 2) * 7 + 6]);
+    spline[n * 7 + 5] = spline[n * 7 + 6] - spline[(n - 1) * 7 + 6];
+
+    // interior slopes use a five-point difference formula
+    for(int m = 3; m <= n - 2; m++)
+        spline[m * 7 + 5] = ((spline[(m - 2) * 7 + 6] - spline[(m + 2) * 7 + 6]) +
+                            8.0 * (spline[(m + 1) * 7 + 6] - spline[(m - 1) * 7 + 6])) / 12.0;
+
+    // higher-order coefficients from the values and slopes of points m and m + 1
+    for(int m = 1; m <= n - 1; m++) {
+        spline[m * 7 + 4] = 3.0 * (spline[(m + 1) * 7 + 6] - spline[m * 7 + 6]) -
+                            2.0 * spline[m * 7 + 5] - spline[(m + 1) * 7 + 5];
+        spline[m * 7 + 3] = spline[m * 7 + 5] + spline[(m + 1) * 7 + 5] -
+                            2.0 * (spline[(m + 1) * 7 + 6] - spline[m * 7 + 6]);
+    }
+
+    // the last point has no successor
+    spline[n * 7 + 4] = 0.0;
+    spline[n * 7 + 3] = 0.0;
+
+    // scaled coefficients stored in entries 2, 1 and 0
+    for(int m = 1; m <= n; m++) {
+        spline[m * 7 + 2] = spline[m * 7 + 5] / delta;
+        spline[m * 7 + 1] = 2.0 * spline[m * 7 + 4] / delta;
+        spline[m * 7 + 0] = 3.0 * spline[m * 7 + 3] / delta;
+    }
+}
+
+/*
+ * Read n whitespace-separated floating-point values from fptr into
+ * list[0..n-1]. Values may be spread over any number of lines.
+ *
+ * Blank lines are skipped, tokens beyond the n-th value on a line are ignored
+ * (they are never written to list), and the program terminates with an error
+ * if the input ends before n values were read.
+ */
+void grab(FILE* fptr, int n, MD_FLOAT* list) {
+    static const char* delims = " \t\n\r\f";
+    char* ptr;
+    char line[MAXLINE];
+    int i = 0;
+
+    while(i < n) {
+        // reset the buffer so a failed read is not mistaken for fresh data
+        line[0] = '\0';
+        readline(line, fptr);
+        ptr = strtok(line, delims);
+
+        if(ptr == NULL) {
+            if(feof(fptr) || ferror(fptr)) {
+                fprintf(stderr, "Unexpected end of EAM Potential file: read %d of %d values\n", i, n);
+                exit(-1);
+            }
+
+            // blank line
+            continue;
+        }
+
+        // never write past the n requested values
+        while(ptr != NULL && i < n) {
+            list[i++] = atof(ptr);
+            ptr = strtok(NULL, delims);
+        }
+    }
+}
--- a/src/common/likwid-marker.h
+++ b/src/common/likwid-marker.h
@@ -0,0 +1,146 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#ifndef LIKWID_MARKER_H
+#define LIKWID_MARKER_H
+
+
+/** \addtogroup MarkerAPI Marker API module
+*  @{
+*/
+/*!
+\def LIKWID_MARKER_INIT
+Shortcut for likwid_markerInit() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_THREADINIT
+Shortcut for likwid_markerThreadInit() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_REGISTER(regionTag)
+Shortcut for likwid_markerRegisterRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_START(regionTag)
+Shortcut for likwid_markerStartRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_STOP(regionTag)
+Shortcut for likwid_markerStopRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_GET(regionTag, nevents, events, time, count)
+Shortcut for likwid_markerGetRegion() for \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_SWITCH
+Shortcut for likwid_markerNextGroup() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_RESET(regionTag)
+Shortcut for likwid_markerResetRegion() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_CLOSE
+Shortcut for likwid_markerClose() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/** @}*/
+
+/* CPU marker macros: forward to the LIKWID marker API when LIKWID_PERFMON is
+ * defined; otherwise every macro expands to nothing. */
+#ifdef LIKWID_PERFMON
+#include <likwid.h>
+#define LIKWID_MARKER_INIT likwid_markerInit()
+#define LIKWID_MARKER_THREADINIT likwid_markerThreadInit()
+#define LIKWID_MARKER_SWITCH likwid_markerNextGroup()
+#define LIKWID_MARKER_REGISTER(regionTag) likwid_markerRegisterRegion(regionTag)
+#define LIKWID_MARKER_START(regionTag) likwid_markerStartRegion(regionTag)
+#define LIKWID_MARKER_STOP(regionTag) likwid_markerStopRegion(regionTag)
+#define LIKWID_MARKER_CLOSE likwid_markerClose()
+#define LIKWID_MARKER_RESET(regionTag) likwid_markerResetRegion(regionTag)
+#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count) likwid_markerGetRegion(regionTag, nevents, events, time, count)
+#else  /* LIKWID_PERFMON */
+#define LIKWID_MARKER_INIT
+#define LIKWID_MARKER_THREADINIT
+#define LIKWID_MARKER_SWITCH
+#define LIKWID_MARKER_REGISTER(regionTag)
+#define LIKWID_MARKER_START(regionTag)
+#define LIKWID_MARKER_STOP(regionTag)
+#define LIKWID_MARKER_CLOSE
+#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count)
+#define LIKWID_MARKER_RESET(regionTag)
+#endif /* LIKWID_PERFMON */
+
+
+/** \addtogroup NvMarkerAPI NvMarker API module (MarkerAPI for Nvidia GPUs)
+*  @{
+*/
+/*!
+\def LIKWID_NVMARKER_INIT
+Shortcut for likwid_gpuMarkerInit() if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_NVMARKER_THREADINIT
+Shortcut for likwid_gpuMarkerThreadInit() if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_NVMARKER_REGISTER(regionTag)
+Shortcut for likwid_gpuMarkerRegisterRegion() with \a regionTag if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_NVMARKER_START(regionTag)
+Shortcut for likwid_gpuMarkerStartRegion() with \a regionTag if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_NVMARKER_STOP(regionTag)
+Shortcut for likwid_gpuMarkerStopRegion() with \a regionTag if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_NVMARKER_GET(regionTag, ngpus, nevents, events, time, count)
+Shortcut for likwid_gpuMarkerGetRegion() for \a regionTag if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_NVMARKER_SWITCH
+Shortcut for likwid_gpuMarkerNextGroup() if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_NVMARKER_RESET(regionTag)
+Shortcut for likwid_gpuMarkerResetRegion() if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_NVMARKER_CLOSE
+Shortcut for likwid_gpuMarkerClose() if compiled with -DLIKWID_NVMON. Otherwise no operation is performed
+*/
+/** @}*/
+
+/* GPU marker macros: forward to the LIKWID GPU marker API when LIKWID_NVMON is
+ * defined; otherwise every macro expands to nothing. Both variants take the
+ * same parameter lists so that call sites compile in either configuration. */
+#ifdef LIKWID_NVMON
+#ifndef LIKWID_WITH_NVMON
+#define LIKWID_WITH_NVMON
+#endif
+#include <likwid.h>
+#define LIKWID_NVMARKER_INIT likwid_gpuMarkerInit()
+#define LIKWID_NVMARKER_THREADINIT likwid_gpuMarkerThreadInit()
+#define LIKWID_NVMARKER_SWITCH likwid_gpuMarkerNextGroup()
+#define LIKWID_NVMARKER_REGISTER(regionTag) likwid_gpuMarkerRegisterRegion(regionTag)
+#define LIKWID_NVMARKER_START(regionTag) likwid_gpuMarkerStartRegion(regionTag)
+#define LIKWID_NVMARKER_STOP(regionTag) likwid_gpuMarkerStopRegion(regionTag)
+#define LIKWID_NVMARKER_CLOSE likwid_gpuMarkerClose()
+#define LIKWID_NVMARKER_RESET(regionTag) likwid_gpuMarkerResetRegion(regionTag)
+#define LIKWID_NVMARKER_GET(regionTag, ngpus, nevents, events, time, count) \
+    likwid_gpuMarkerGetRegion(regionTag, ngpus, nevents, events, time, count)
+#else /* LIKWID_NVMON */
+#define LIKWID_NVMARKER_INIT
+#define LIKWID_NVMARKER_THREADINIT
+#define LIKWID_NVMARKER_SWITCH
+#define LIKWID_NVMARKER_REGISTER(regionTag)
+#define LIKWID_NVMARKER_START(regionTag)
+#define LIKWID_NVMARKER_STOP(regionTag)
+#define LIKWID_NVMARKER_CLOSE
+#define LIKWID_NVMARKER_GET(regionTag, ngpus, nevents, events, time, count)
+#define LIKWID_NVMARKER_RESET(regionTag)
+#endif /* LIKWID_NVMON */
+
+
+
+#endif /* LIKWID_MARKER_H */
--- a/src/common/parameter.c
+++ b/src/common/parameter.c
@@ -0,0 +1,186 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+//---
+#include <atom.h>
+#include <parameter.h>
+#include <util.h>
+
+/*
+ * Fill param with the built-in defaults (Lennard-Jones setup).
+ *
+ * All file names are set to NULL. Fields that are not listed here (lattice,
+ * box bounds and box sizes) are left untouched by this function.
+ */
+void initParameter(Parameter *param) {
+    // file names: NULL means "not set"
+    param->param_file = NULL;
+    param->input_file = NULL;
+    param->vtk_file = NULL;
+    param->xtc_file = NULL;
+    param->eam_file = NULL;
+    param->write_atom_file = NULL;
+    param->force_field = FF_LJ;
+    param->epsilon = 1.0;
+    param->sigma = 1.0;
+    param->sigma6 = 1.0;
+    param->rho = 0.8442;
+    param->ntypes = 4;
+    param->ntimes = 200;
+    param->dt = 0.005;
+    param->nx = 32;
+    param->ny = 32;
+    param->nz = 32;
+    param->pbc_x = 1;
+    param->pbc_y = 1;
+    param->pbc_z = 1;
+    param->cutforce = 2.5;
+    param->skin = 0.3;
+    // neighbor cutoff is derived from force cutoff and skin
+    param->cutneigh = param->cutforce + param->skin;
+    param->temp = 1.44;
+    param->nstat = 100;
+    param->mass = 1.0;
+    // derived from dt set above
+    param->dtforce = 0.5 * param->dt;
+    param->reneigh_every = 20;
+    param->prune_every = 1000;
+    param->x_out_every = 20;
+    param->v_out_every = 5;
+    param->half_neigh = 0;
+    param->proc_freq = 2.4;
+    // DEM
+    param->k_s = 1.0;
+    param->k_dn = 1.0;
+    param->gx = 0.0;
+    param->gy = 0.0;
+    param->gz = 0.0;
+    param->reflect_x = 0.0;
+    param->reflect_y = 0.0;
+    param->reflect_z = 0.0;
+}
+
+/*
+ * Read simulation parameters from a text file of "key value" lines.
+ *
+ * Everything after a '#' on a line is ignored. Lines without both a key and a
+ * value are skipped, unknown keys are ignored. After parsing, dtforce and
+ * sigma6 are recomputed from dt and sigma.
+ * Terminates the program with exit(-1) if the file cannot be opened.
+ */
+void readParameter(Parameter *param, const char *filename) {
+    // Key/value separators. Tabs and line terminators are included so that
+    // string values never carry a trailing newline, whether or not readline()
+    // keeps it in the buffer.
+    static const char *delims = " \t\r\n";
+    FILE *fp = fopen(filename, "r");
+    char line[MAXLINE];
+    int i;
+
+    if(!fp) {
+        fprintf(stderr, "Could not open parameter file: %s\n", filename);
+        exit(-1);
+    }
+
+    // A key matches when the token starts with the field name
+    #define PARSE_PARAM(p,f)   if(strncmp(tok, #p, sizeof(#p) / sizeof(#p[0]) - 1) == 0) { param->p = f(val); }
+    #define PARSE_STRING(p)    PARSE_PARAM(p, strdup)
+    #define PARSE_INT(p)       PARSE_PARAM(p, atoi)
+    #define PARSE_REAL(p)      PARSE_PARAM(p, atof)
+
+    while(!feof(fp)) {
+        // reset the buffer so a failed read at end-of-file yields an empty line
+        line[0] = '\0';
+        readline(line, fp);
+
+        // cut the line at the first '#'
+        for(i = 0; line[i] != '\0' && line[i] != '#'; i++);
+        line[i] = '\0';
+
+        char *tok = strtok(line, delims);
+        char *val = strtok(NULL, delims);
+
+        if(tok != NULL && val != NULL) {
+            PARSE_PARAM(force_field, str2ff);
+            PARSE_STRING(input_file);
+            PARSE_STRING(eam_file);
+            PARSE_STRING(vtk_file);
+            PARSE_STRING(xtc_file);
+            PARSE_REAL(epsilon);
+            PARSE_REAL(sigma);
+            PARSE_REAL(k_s);
+            PARSE_REAL(k_dn);
+            PARSE_REAL(reflect_x);
+            PARSE_REAL(reflect_y);
+            PARSE_REAL(reflect_z);
+            PARSE_REAL(gx);
+            PARSE_REAL(gy);
+            PARSE_REAL(gz);
+            PARSE_REAL(rho);
+            PARSE_REAL(dt);
+            PARSE_REAL(cutforce);
+            PARSE_REAL(skin);
+            PARSE_REAL(temp);
+            PARSE_REAL(mass);
+            PARSE_REAL(proc_freq);
+            PARSE_INT(ntypes);
+            PARSE_INT(ntimes);
+            PARSE_INT(nx);
+            PARSE_INT(ny);
+            PARSE_INT(nz);
+            PARSE_INT(pbc_x);
+            PARSE_INT(pbc_y);
+            PARSE_INT(pbc_z);
+            PARSE_INT(nstat);
+            PARSE_INT(reneigh_every);
+            PARSE_INT(prune_every);
+            PARSE_INT(x_out_every);
+            PARSE_INT(v_out_every);
+            PARSE_INT(half_neigh);
+        }
+    }
+
+    // the helper macros are local to this function
+    #undef PARSE_REAL
+    #undef PARSE_INT
+    #undef PARSE_STRING
+    #undef PARSE_PARAM
+
+    // Update dtforce
+    param->dtforce = 0.5 * param->dt;
+
+    // Update sigma6 parameter
+    MD_FLOAT s2 = param->sigma * param->sigma;
+    param->sigma6 = s2 * s2 * s2;
+    fclose(fp);
+}
+
+/*
+ * Print the current parameter set and the compile-time configuration
+ * (kernel name, data layout, precision) to stdout.
+ * File names are only printed when they are set (non-NULL).
+ */
+void printParameter(Parameter *param) {
+    printf("Parameters:\n");
+    if(param->input_file != NULL) {
+        printf("\tInput file: %s\n", param->input_file);
+    }
+
+    if(param->vtk_file != NULL) {
+        printf("\tVTK file: %s\n", param->vtk_file);
+    }
+
+    if(param->xtc_file != NULL) {
+        printf("\tXTC file: %s\n", param->xtc_file);
+    }
+
+    if(param->eam_file != NULL) {
+        printf("\tEAM file: %s\n", param->eam_file);
+    }
+
+    printf("\tForce field: %s\n", ff2str(param->force_field));
+    // cluster dimensions are only reported when CLUSTER_M is defined at compile time
+    #ifdef CLUSTER_M
+    printf("\tKernel: %s, MxN: %dx%d, Vector width: %d\n", KERNEL_NAME, CLUSTER_M, CLUSTER_N, VECTOR_WIDTH);
+    #else
+    printf("\tKernel: %s\n", KERNEL_NAME);
+    #endif
+    printf("\tData layout: %s\n", POS_DATA_LAYOUT);
+    printf("\tFloating-point precision: %s\n", PRECISION_STRING);
+    printf("\tUnit cells (nx, ny, nz): %d, %d, %d\n", param->nx, param->ny, param->nz);
+    // NOTE(review): xprd/yprd/zprd and lattice are not set by initParameter()/readParameter();
+    // presumably they are computed elsewhere before this function is called - confirm.
+    printf("\tDomain box sizes (x, y, z): %e, %e, %e\n", param->xprd, param->yprd, param->zprd);
+    printf("\tPeriodic (x, y, z): %d, %d, %d\n", param->pbc_x, param->pbc_y, param->pbc_z);
+    printf("\tLattice size: %e\n", param->lattice);
+    printf("\tEpsilon: %e\n", param->epsilon);
+    printf("\tSigma: %e\n", param->sigma);
+    printf("\tSpring constant: %e\n", param->k_s);
+    printf("\tDamping constant: %e\n", param->k_dn);
+    printf("\tTemperature: %e\n", param->temp);
+    printf("\tRHO: %e\n", param->rho);
+    printf("\tMass: %e\n", param->mass);
+    printf("\tNumber of types: %d\n", param->ntypes);
+    printf("\tNumber of timesteps: %d\n", param->ntimes);
+    printf("\tReport stats every (timesteps): %d\n", param->nstat);
+    printf("\tReneighbor every (timesteps): %d\n", param->reneigh_every);
+    // atom sorting is a compile-time option
+    #ifdef SORT_ATOMS
+    printf("\tSort atoms when reneighboring: yes\n");
+    #else
+    printf("\tSort atoms when reneighboring: no\n");
+    #endif
+    printf("\tPrune every (timesteps): %d\n", param->prune_every);
+    printf("\tOutput positions every (timesteps): %d\n", param->x_out_every);
+    printf("\tOutput velocities every (timesteps): %d\n", param->v_out_every);
+    printf("\tDelta time (dt): %e\n", param->dt);
+    printf("\tCutoff radius: %e\n", param->cutforce);
+    printf("\tSkin: %e\n", param->skin);
+    printf("\tHalf neighbor lists: %d\n", param->half_neigh);
+    printf("\tProcessor frequency (GHz): %.4f\n", param->proc_freq);
+}
--- a/src/common/parameter.h
+++ b/src/common/parameter.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#ifndef __PARAMETER_H_
+#define __PARAMETER_H_
+
+// Floating-point and matching unsigned integer type used throughout the code:
+// PRECISION == 1 selects single precision, any other value double precision.
+#if PRECISION == 1
+#   define MD_FLOAT float
+#   define MD_UINT  unsigned int
+#else
+#   define MD_FLOAT double
+#   define MD_UINT  unsigned long long int
+#endif
+
+// Simulation parameters. Defaults are set by initParameter() and may be
+// overridden from a parameter file by readParameter().
+typedef struct {
+    // force field selector (default FF_LJ), parsed with str2ff()
+    int force_field;
+    // file names; NULL means "not set" (param_file is not touched by initParameter())
+    char* param_file;
+    char* input_file;
+    char* vtk_file;
+    char* xtc_file;
+    char* write_atom_file;
+    // epsilon and sigma parameters; sigma6 is sigma^6 (recomputed by readParameter())
+    MD_FLOAT epsilon;
+    MD_FLOAT sigma;
+    MD_FLOAT sigma6;
+    // temperature, "RHO" and mass as reported by printParameter()
+    MD_FLOAT temp;
+    MD_FLOAT rho;
+    MD_FLOAT mass;
+    // number of atom types and number of timesteps
+    int ntypes;
+    int ntimes;
+    // intervals in timesteps: stats report, reneighboring, pruning, position/velocity output
+    int nstat;
+    int reneigh_every;
+    int prune_every;
+    int x_out_every;
+    int v_out_every;
+    // printed as "Half neighbor lists" (default 0)
+    int half_neigh;
+    // timestep size; dtforce is derived from dt (0.5 * dt, divided by mass in the EAM setup)
+    MD_FLOAT dt;
+    MD_FLOAT dtforce;
+    // skin, force cutoff radius and neighbor cutoff (initialized as cutforce + skin)
+    MD_FLOAT skin;
+    MD_FLOAT cutforce;
+    MD_FLOAT cutneigh;
+    // number of unit cells per dimension
+    int nx, ny, nz;
+    // periodic boundary flags per dimension (default 1)
+    int pbc_x, pbc_y, pbc_z;
+    // printed as "Lattice size"; not set in parameter.c
+    MD_FLOAT lattice;
+    // box bounds; not set in parameter.c
+    MD_FLOAT xlo, xhi, ylo, yhi, zlo, zhi;
+    // printed as "Domain box sizes"; not set in parameter.c
+    MD_FLOAT xprd, yprd, zprd;
+    // processor frequency in GHz (default 2.4)
+    double proc_freq;
+    // EAM potential file name, read by read_eam_file()
+    char* eam_file;
+    // DEM
+    // printed as "Spring constant" / "Damping constant"
+    MD_FLOAT k_s;
+    MD_FLOAT k_dn;
+    // default 0.0; NOTE(review): presumably gravity components and reflective-boundary settings - confirm
+    MD_FLOAT gx, gy, gz;
+    MD_FLOAT reflect_x, reflect_y, reflect_z;
+} Parameter;
+
+// Fill the structure with the built-in defaults.
+void initParameter(Parameter*);
+// Override parameters from the "key value" file with the given name.
+void readParameter(Parameter*, const char*);
+// Print the parameters and compile-time configuration to stdout.
+void printParameter(Parameter*);
+
+#endif
--- a/src/common/simd.h
+++ b/src/common/simd.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#ifndef __SIMD_H__
+#define __SIMD_H__
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <immintrin.h>
+
+#ifndef NO_ZMM_INTRIN
+#   include <zmmintrin.h>
+#endif
+
+#ifndef CLUSTER_M
+#   define CLUSTER_M 1
+#endif
+
+#ifndef CLUSTER_N
+#   define CLUSTER_N 1
+#endif
+
+#if defined(__ISA_AVX512__)
+#   if PRECISION == 2
+#       include "simd/avx512_double.h"
+#   else
+#       include "simd/avx512_float.h"
+#   endif
+#endif
+
+#if defined(__ISA_AVX2__)
+#   if PRECISION == 2
+#       include "simd/avx2_double.h"
+#   else
+#       include "simd/avx2_float.h"
+#   endif
+#endif
+
+#if defined(__ISA_AVX__)
+#   if PRECISION == 2
+#       include "simd/avx_double.h"
+#   else
+#       include "simd/avx_float.h"
+#   endif
+#endif
+
+// Debug helpers: print a SIMD value/mask together with the expression text.
+// Note that both expansions already end with a semicolon.
+#define SIMD_PRINT_REAL(a)  simd_print_real(#a, a);
+#define SIMD_PRINT_MASK(a)  simd_print_mask(#a, a);
+
+/*
+ * Print all lanes of a SIMD floating-point value to stdout, prefixed by ref.
+ *
+ * The lanes are copied into an MD_FLOAT buffer so that they are interpreted
+ * with the element type of the vector in both single and double precision.
+ * NOTE(review): assumes VECTOR_WIDTH is the number of MD_FLOAT lanes of
+ * MD_SIMD_FLOAT - confirm.
+ */
+static inline void simd_print_real(const char *ref, MD_SIMD_FLOAT a) {
+    MD_FLOAT x[VECTOR_WIDTH];
+    size_t nbytes = (sizeof(x) < sizeof(a)) ? sizeof(x) : sizeof(a);
+
+    // never read beyond the vector and never print uninitialized lanes
+    memset(x, 0, sizeof(x));
+    memcpy(x, &a, nbytes);
+
+    fprintf(stdout, "%s: ", ref);
+    for(int i = 0; i < VECTOR_WIDTH; i++) {
+        fprintf(stdout, "%f ", (double) x[i]);
+    }
+
+    fprintf(stdout, "\n");
+}
+
+/*
+ * Print a SIMD mask as a hexadecimal bit pattern to stdout, prefixed by ref.
+ * The cast keeps the argument type in line with "%x" for every ISA header,
+ * independent of the return type of simd_mask_to_u32().
+ */
+static inline void simd_print_mask(const char *ref, MD_SIMD_MASK a) {
+    fprintf(stdout, "%s: %x\n", ref, (unsigned int) simd_mask_to_u32(a));
+}
+
+#endif // __SIMD_H__
--- a/src/common/simd/avx2_double.h
+++ b/src/common/simd/avx2_double.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdlib.h>
+#include <string.h>
+#include <immintrin.h>
+
+#define MD_SIMD_FLOAT   __m256d
+#define MD_SIMD_INT     __m128i
+#define MD_SIMD_MASK    __m256d
+
+// Basic arithmetic and memory operations on 4 x double vectors.
+// simd_load/simd_store use the aligned load/store intrinsics.
+static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm256_set1_pd(scalar); }
+static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_pd(0.0); }
+static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_pd(a, b); }
+static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_pd(a, b); }
+static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_pd(a, b); }
+static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_pd(p); }
+static inline void simd_store(MD_FLOAT *p, MD_SIMD_FLOAT a) { _mm256_store_pd(p, a); }
+// Not available for AVX2 with double precision: prints an error to stderr and
+// terminates the program with exit(-1). The return statement is never reached.
+static inline MD_SIMD_FLOAT simd_load_h_duplicate(const MD_FLOAT *m) {
+    MD_SIMD_FLOAT ret;
+    fprintf(stderr, "simd_load_h_duplicate(): Not implemented for AVX2 with double precision!");
+    exit(-1);
+    return ret;
+}
+
+// Not available for AVX2 with double precision: prints an error to stderr and
+// terminates the program with exit(-1). The return statement is never reached.
+static inline MD_SIMD_FLOAT simd_load_h_dual(const MD_FLOAT *m) {
+    MD_SIMD_FLOAT ret;
+    fprintf(stderr, "simd_load_h_dual(): Not implemented for AVX2 with double precision!");
+    exit(-1);
+    return ret;
+}
+
+// Not available for AVX2 with double precision: prints an error to stderr and
+// terminates the program with exit(-1). The return statement is never reached.
+static inline MD_FLOAT simd_h_dual_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
+    fprintf(stderr, "simd_h_dual_incr_reduced_sum(): Not implemented for AVX2 with double precision!");
+    exit(-1);
+    return 0.0;
+}
+
+/*
+ * Reduce four vectors and accumulate the results in memory.
+ *
+ * Computes the horizontal sums s0..s3 of v0..v3, adds them to m[0..3]
+ * (m is accessed with aligned load/store) and returns s0 + s1 + s2 + s3.
+ */
+static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
+    __m256d t0, t1, t2;
+    __m128d a0, a1;
+
+    // pairwise sums within each 128-bit half of (v0, v1) and (v2, v3)
+    t0 = _mm256_hadd_pd(v0, v1);
+    t1 = _mm256_hadd_pd(v2, v3);
+    // t2 = (upper half of t0, lower half of t1)
+    t2 = _mm256_permute2f128_pd(t0, t1, 0x21);
+    t0 = _mm256_add_pd(t0, t2);
+    t1 = _mm256_add_pd(t1, t2);
+    // lanes 0-1 from t0, lanes 2-3 from t1: t0 = (s0, s1, s2, s3)
+    t0 = _mm256_blend_pd(t0, t1, 0xC);
+    t1 = _mm256_add_pd(t0, _mm256_load_pd(m));
+    _mm256_store_pd(m, t1);
+
+    // swap neighbours and add: lanes become (s0+s1, s0+s1, s2+s3, s2+s3)
+    t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0x5));
+    a0 = _mm256_castpd256_pd128(t0);
+    a1 = _mm256_extractf128_pd(t0, 0x1);
+    a0 = _mm_add_sd(a0, a1);
+    // extract the lowest lane without casting the vector's address
+    return _mm_cvtsd_f64(a0);
+}
+
+// Masks are stored as __m256d here, so masking is a bitwise AND with the value.
+static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm256_and_pd(a, m); }
+// Reciprocal via the single-precision approximation (convert to float, rcp, convert back).
+static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(a))); }
+//static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_pd(a); }
+// Fused multiply-add: a * b + c
+static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_pd(a, b, c); }
+// a + (b AND m): lanes of b are added only where the mask is set
+static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return simd_add(a, _mm256_and_pd(b, m)); }
+// Lane-wise a < b (ordered, non-signaling compare)
+static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_pd(a, b, _CMP_LT_OQ); }
+// Lane-wise integer a < b as a double-precision mask.
+// The 32-bit compare result (all-ones or zero) is sign-extended to 64 bit and
+// reinterpreted, so that a true lane is all-ones. A numeric int->double
+// conversion would yield -1.0, which is not a valid mask for _mm256_and_pd.
+static inline MD_SIMD_MASK simd_mask_int_cond_lt(MD_SIMD_INT a, MD_SIMD_INT b) {
+    return _mm256_castsi256_pd(_mm256_cvtepi32_epi64(_mm_cmplt_epi32(a, b)));
+}
+// Bitwise AND of two masks
+static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _mm256_and_pd(a, b); }
+// TODO: Initialize all diagonal cases and just select the proper one (all bits set or diagonal) based on cond0
+// Build a mask from the lowest 4 bits of a: lane i is all-ones when bit i is
+// set and all-zeros otherwise.
+static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) {
+    const unsigned long long int ones = 0xFFFFFFFFFFFFFFFF;
+    const unsigned long long int zeros = 0x0;
+    const unsigned long long int lane0 = (a & 0x1) ? ones : zeros;
+    const unsigned long long int lane1 = (a & 0x2) ? ones : zeros;
+    const unsigned long long int lane2 = (a & 0x4) ? ones : zeros;
+    const unsigned long long int lane3 = (a & 0x8) ? ones : zeros;
+
+    // _mm256_set_epi64x takes the highest lane first
+    return _mm256_castsi256_pd(_mm256_set_epi64x(lane3, lane2, lane1, lane0));
+}
+// Convert a mask to an integer: bit i is set when the sign bit of lane i is
+// set (inverse of simd_mask_from_u32 for its all-ones/all-zeros lanes).
+// Only required for debugging output.
+static inline int simd_mask_to_u32(MD_SIMD_MASK a) { return _mm256_movemask_pd(a); }
+// Horizontal sum: returns a[0] + a[1] + a[2] + a[3].
+static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
+    __m128d a0, a1;
+    // test with shuffle & add as an alternative to hadd later
+    // pairwise sums within each 128-bit half
+    a = _mm256_hadd_pd(a, a);
+    a0 = _mm256_castpd256_pd128(a);
+    a1 = _mm256_extractf128_pd(a, 0x1);
+    a0 = _mm_add_sd(a0, a1);
+    // extract the lowest lane without casting the vector's address
+    return _mm_cvtsd_f64(a0);
+}
+
+// Not available for AVX2 with double precision: prints an error to stderr and
+// terminates the program with exit(-1).
+static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
+    fprintf(stderr, "simd_h_decr3(): Not implemented for AVX2 with double precision!");
+    exit(-1);
+}
+
+// Functions used in LAMMPS kernel
+// Gather doubles from base m at the 32-bit indices vidx scaled by s.
+// Note that the expansion already ends with a semicolon.
+#define simd_gather(vidx, m, s)     _mm256_i32gather_pd(m, vidx, s);
+// Integer vectors hold 4 x 32-bit values (one per double lane)
+static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm_set1_epi32(scalar); }
+static inline MD_SIMD_INT simd_int_zero() { return _mm_setzero_si128(); }
+// Lane i holds the value i
+static inline MD_SIMD_INT simd_int_seq() { return _mm_set_epi32(3, 2, 1, 0); }
+// Aligned load of 4 integers
+static inline MD_SIMD_INT simd_int_load(const int *m) { return _mm_load_si128((__m128i const *) m); }
+static inline MD_SIMD_INT simd_int_add(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm_add_epi32(a, b); }
+// Lane-wise 32-bit multiplication (low 32 bits of each product).
+// _mm_mul_epi32 would only multiply lanes 0 and 2 into 64-bit results.
+static inline MD_SIMD_INT simd_int_mul(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm_mullo_epi32(a, b); }
+// Aligned load of 4 integers with lanes cleared where the mask k is not set.
+// The 64-bit mask lanes are narrowed to 32 bit by picking the low dword of
+// each lane; a numeric double->int conversion of an all-ones lane (a NaN)
+// would not produce an all-ones integer mask.
+static inline MD_SIMD_INT simd_int_mask_load(const int *m, MD_SIMD_MASK k) {
+    const __m256i low_dwords = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
+    __m256i packed = _mm256_permutevar8x32_epi32(_mm256_castpd_si256(k), low_dwords);
+    __m128i k32 = _mm256_castsi256_si128(packed);
+    return _mm_and_si128(simd_int_load(m), k32);
+}
--- a/src/common/simd/avx2_float.h
+++ b/src/common/simd/avx2_float.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <immintrin.h>
+#include <zmmintrin.h>
+
+#define MD_SIMD_FLOAT   __m256
+#define MD_SIMD_MASK    __mmask8
+
+static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm256_set1_ps(scalar); }
+static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_ps(0.0); }
+static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_ps(p); }
+static inline void simd_store(MD_FLOAT *p, MD_SIMD_FLOAT a) { _mm256_store_ps(p, a); }
+static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm256_mask_mov_ps(_mm256_setzero_ps(), m, a); }
+static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_ps(a); }
+static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_ps(a, b, c); }
+static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_ps(a, m, a, b); }
+static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_ps_mask(a, b, _CMP_LT_OQ); }
+static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
+static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
+static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
+static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
+    __m128 t0;
+    t0 = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 0x1));
+    t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
+    t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
+    return *((MD_FLOAT *) &t0);
+}
+
+static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
+    __m128 t0, t2;
+    v0 = _mm256_hadd_ps(v0, v1);
+    v2 = _mm256_hadd_ps(v2, v3);
+    v0 = _mm256_hadd_ps(v0, v2);
+    t0 = _mm_add_ps(_mm256_castps256_ps128(v0), _mm256_extractf128_ps(v0, 0x1));
+    t2 = _mm_add_ps(t0, _mm_load_ps(m));
+    _mm_store_ps(m, t2);
+
+    t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
+    t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
+    return *((MD_FLOAT *) &t0);
+}
+
+static inline MD_SIMD_FLOAT simd_load_h_duplicate(const MD_FLOAT *m) {
+    return _mm256_broadcast_ps((const __m128 *)(m));
+}
+
+static inline MD_SIMD_FLOAT simd_load_h_dual(const MD_FLOAT *m) {
+    __m128 t0, t1;
+    t0 = _mm_broadcast_ss(m);
+    t1 = _mm_broadcast_ss(m + 1);
+    return _mm256_insertf128_ps(_mm256_castps128_ps256(t0), t1, 0x1);
+}
+
+static inline MD_FLOAT simd_h_dual_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
+    __m128 t0, t1;
+    v0 = _mm256_hadd_ps(v0, v1);
+    t0 = _mm256_extractf128_ps(v0, 0x1);
+    t0 = _mm_hadd_ps(_mm256_castps256_ps128(v0), t0);
+    t0 = _mm_permute_ps(t0, _MM_SHUFFLE(3, 1, 2, 0));
+    t1 = _mm_add_ps(t0, _mm_load_ps(m));
+    _mm_store_ps(m, t1);
+
+    t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
+    t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
+    return *((MD_FLOAT *) &t0);
+}
+
+static inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) { // static: a plain C `inline` definition in a header provides no linkable symbol
+    __m128 asum = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 0x1)); // lower 128-bit half + upper 128-bit half
+    _mm_store_ps(m, _mm_sub_ps(_mm_load_ps(m), asum)); // m[0..3] -= asum
+}
+
+static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
+    simd_h_decr(m, a0);
+    simd_h_decr(m + CLUSTER_N, a1);
+    simd_h_decr(m + CLUSTER_N * 2, a2);
+}
--- a/src/common/simd/avx512_double.h
+++ b/src/common/simd/avx512_double.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <immintrin.h>
+#ifndef NO_ZMM_INTRIN
+#   include <zmmintrin.h>
+#endif
+
+#define MD_SIMD_FLOAT       __m512d
+#define MD_SIMD_MASK        __mmask8
+#define MD_SIMD_INT         __m256i
+#define MD_SIMD_BITMASK     MD_SIMD_INT
+#define MD_SIMD_IBOOL       __mmask16
+
+static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) { return (__mmask8)(a); }
+static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm512_set1_pd(scalar); }
+static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_pd(0.0); }
+static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_pd(a, b); }
+static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_sub_pd(a, b); }
+static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_mul_pd(a, b); }
+static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm512_fmadd_pd(a, b, c); }
+static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm512_rcp14_pd(a); }
+static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm512_mask_add_pd(a, m, a, b); }
+static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
+static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ); }
+static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
+static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
+static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_pd(p); }
+static inline void simd_store(MD_FLOAT *p, MD_SIMD_FLOAT a) { _mm512_store_pd(p, a); }
+static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm512_mask_mov_pd(_mm512_setzero_pd(), m, a); }
+static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
+    MD_SIMD_FLOAT x = _mm512_add_pd(a, _mm512_shuffle_f64x2(a, a, 0xee));
+    x = _mm512_add_pd(x, _mm512_shuffle_f64x2(x, x, 0x11));
+    x = _mm512_add_pd(x, _mm512_permute_pd(x, 0x01));
+    return *((MD_FLOAT *) &x);
+}
+
+static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
+    __m512d t0, t2;
+    __m256d t3, t4;
+
+    t0 = _mm512_add_pd(v0, _mm512_permute_pd(v0, 0x55));
+    t2 = _mm512_add_pd(v2, _mm512_permute_pd(v2, 0x55));
+    t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xaa), v1, _mm512_permute_pd(v1, 0x55));
+    t2 = _mm512_mask_add_pd(t2, simd_mask_from_u32(0xaa), v3, _mm512_permute_pd(v3, 0x55));
+    t0 = _mm512_add_pd(t0, _mm512_shuffle_f64x2(t0, t0, 0x4e));
+    t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xF0), t2, _mm512_shuffle_f64x2(t2, t2, 0x4e));
+    t0 = _mm512_add_pd(t0, _mm512_shuffle_f64x2(t0, t0, 0xb1));
+    t0 = _mm512_mask_shuffle_f64x2(t0, simd_mask_from_u32(0x0C), t0, t0, 0xee);
+    t3 = _mm512_castpd512_pd256(t0);
+    t4 = _mm256_load_pd(m);
+    t4 = _mm256_add_pd(t4, t3);
+    _mm256_store_pd(m, t4);
+
+    t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0x4e));
+    t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
+    return _mm_cvtsd_f64(_mm512_castpd512_pd128(t0));
+}
+
+static inline MD_SIMD_FLOAT simd_load_h_duplicate(const MD_FLOAT *m) {
+    return _mm512_broadcast_f64x4(_mm256_load_pd(m));
+}
+
+static inline MD_SIMD_FLOAT simd_load_h_dual(const MD_FLOAT *m) {
+    return _mm512_insertf64x4(_mm512_broadcastsd_pd(_mm_load_sd(m)), _mm256_broadcastsd_pd(_mm_load_sd(m + 1)), 1);
+}
+
+static inline MD_FLOAT simd_h_dual_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
+    __m512d t0;
+    __m256d t2, t3;
+
+    t0 = _mm512_add_pd(v0, _mm512_permutex_pd(v0, 0x4e));
+    t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xccul), v1, _mm512_permutex_pd(v1, 0x4e));
+    t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
+    t0 = _mm512_mask_shuffle_f64x2(t0, simd_mask_from_u32(0xaaul), t0, t0, 0xee);
+    t2 = _mm512_castpd512_pd256(t0);
+    t3 = _mm256_load_pd(m);
+    t3 = _mm256_add_pd(t3, t2);
+    _mm256_store_pd(m, t3);
+
+    t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0x4e));
+    t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
+    return _mm_cvtsd_f64(_mm512_castpd512_pd128(t0));
+}
+
+static inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
+    __m256d t;
+    a = _mm512_add_pd(a, _mm512_shuffle_f64x2(a, a, 0xee));
+    t = _mm256_load_pd(m);
+    t = _mm256_sub_pd(t, _mm512_castpd512_pd256(a));
+    _mm256_store_pd(m, t);
+}
+
+static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
+    simd_h_decr(m, a0);
+    simd_h_decr(m + CLUSTER_N, a1);
+    simd_h_decr(m + CLUSTER_N * 2, a2);
+}
+
+// Functions used in LAMMPS kernel
+//static inline MD_SIMD_FLOAT simd_gather(MD_SIMD_INT vidx, const MD_FLOAT *m, int s) { return _mm512_i32gather_pd(vidx, m, s); }
+#define simd_gather(vidx,m,s) (_mm512_i32gather_pd(vidx, m, s))
+static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm256_set1_epi32(scalar); }
+static inline MD_SIMD_INT simd_int_zero() { return _mm256_setzero_si256(); }
+static inline MD_SIMD_INT simd_int_seq() { return _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); }
+static inline MD_SIMD_INT simd_int_load(const int *m) { return _mm256_load_si256((const MD_SIMD_INT *) m); }
+//static inline MD_SIMD_INT simd_int_load(const int *m) { return _mm256_load_epi32(m); }
+static inline MD_SIMD_INT simd_int_add(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm256_add_epi32(a, b); }
+static inline MD_SIMD_INT simd_int_mul(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm256_mullo_epi32(a, b); } // element-wise product, low 32 bits kept in every lane
+static inline MD_SIMD_INT simd_int_mask_load(const int *m, MD_SIMD_MASK k) { return _mm256_mask_load_epi32(simd_int_zero(), k, m); }
+static inline MD_SIMD_MASK simd_mask_int_cond_lt(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm256_cmp_epi32_mask(a, b, _MM_CMPINT_LT); }
--- a/src/common/simd/avx512_float.h
+++ b/src/common/simd/avx512_float.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdlib.h>
+#include <string.h>
+#include <immintrin.h>
+#ifndef NO_ZMM_INTRIN
+#   include <zmmintrin.h>
+#endif
+
+#define MD_SIMD_FLOAT       __m512
+#define MD_SIMD_MASK        __mmask16
+#define MD_SIMD_INT         __m256i
+#define MD_SIMD_IBOOL       __mmask16
+#define MD_SIMD_INT32       __m512i
+#define MD_SIMD_BITMASK     MD_SIMD_INT32
+
+static inline MD_SIMD_BITMASK simd_load_bitmask(const int *m) {
+    return _mm512_load_si512(m);
+}
+
+static inline MD_SIMD_INT32 simd_int32_broadcast(int a) {
+    return _mm512_set1_epi32(a);
+}
+
+static inline MD_SIMD_IBOOL simd_test_bits(MD_SIMD_FLOAT a) {
+    return _mm512_test_epi32_mask(_mm512_castps_si512(a), _mm512_castps_si512(a));
+}
+
+static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) { return a; }
+static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); }
+static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_ps(0.0f); }
+static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_sub_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_mul_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm512_fmadd_ps(a, b, c); }
+static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm512_rcp14_ps(a); }
+static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm512_mask_add_ps(a, m, a, b); }
+static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask16(a, b); }
+static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ); }
+static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask16(a); }
+static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask16_u32(a); }
+static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_ps(p); }
+static inline void simd_store(MD_FLOAT *p, MD_SIMD_FLOAT a) { _mm512_store_ps(p, a); }
+static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm512_mask_mov_ps(_mm512_setzero_ps(), m, a); }
+static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
+    // This would only be called in a Mx16 configuration, which is not valid in GROMACS
+    fprintf(stderr, "simd_h_reduce_sum(): Called with AVX512 intrinsics and single-precision which is not valid!\n");
+    exit(-1);
+    return 0.0;
+}
+
+static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
+    // This would only be called in a Mx16 configuration, which is not valid in GROMACS
+    fprintf(stderr, "simd_incr_reduced_sum(): Called with AVX512 intrinsics and single-precision which is not valid!\n");
+    exit(-1); // always terminates the program, the arguments are never used
+    return 0.0; // unreachable
+}
+
+static inline MD_SIMD_FLOAT simd_load_h_duplicate(const float* m) {
+    return _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_load_pd((const double *)(m))));
+}
+
+static inline MD_SIMD_FLOAT simd_load_h_dual(const float* m) {
+    return _mm512_shuffle_f32x4(_mm512_broadcastss_ps(_mm_load_ss(m)), _mm512_broadcastss_ps(_mm_load_ss(m + 1)), 0x44);
+}
+
+static inline MD_FLOAT simd_h_dual_incr_reduced_sum(float* m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
+    __m512 t0, t1;
+    __m128 t2, t3;
+
+    t0 = _mm512_shuffle_f32x4(v0, v1, 0x88);
+    t1 = _mm512_shuffle_f32x4(v0, v1, 0xdd);
+    t0 = _mm512_add_ps(t0, t1);
+    t0 = _mm512_add_ps(t0, _mm512_permute_ps(t0, 0x4e));
+    t0 = _mm512_add_ps(t0, _mm512_permute_ps(t0, 0xb1));
+    t0 = _mm512_maskz_compress_ps(simd_mask_from_u32(0x1111ul), t0);
+    t3 = _mm512_castps512_ps128(t0);
+    t2 = _mm_load_ps(m);
+    t2 = _mm_add_ps(t2, t3);
+    _mm_store_ps(m, t2);
+
+    t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0x4e));
+    t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0xb1));
+    return _mm_cvtss_f32(t3);
+}
+
+static inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
+    __m256 t;
+    a = _mm512_add_ps(a, _mm512_shuffle_f32x4(a, a, 0xee));
+    t = _mm256_load_ps(m);
+    t = _mm256_sub_ps(t, _mm512_castps512_ps256(a));
+    _mm256_store_ps(m, t);
+}
+
+static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
+    simd_h_decr(m, a0);
+    simd_h_decr(m + CLUSTER_N, a1);
+    simd_h_decr(m + CLUSTER_N * 2, a2);
+}
--- a/src/common/simd/avx_double.h
+++ b/src/common/simd/avx_double.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdlib.h>
+#include <string.h>
+#include <immintrin.h>
+
+#define MD_SIMD_FLOAT   __m256d
+#define MD_SIMD_INT     __m128i
+#define MD_SIMD_MASK    __m256d
+
+static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm256_set1_pd(scalar); }
+static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_pd(0.0); }
+static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_pd(a, b); }
+static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_pd(a, b); }
+static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_pd(a, b); }
+static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_pd(p); }
+static inline void simd_store(MD_FLOAT *p, MD_SIMD_FLOAT a) { _mm256_store_pd(p, a); }
+static inline MD_SIMD_FLOAT simd_load_h_duplicate(const MD_FLOAT *m) {
+    MD_SIMD_FLOAT ret;
+    fprintf(stderr, "simd_load_h_duplicate(): Not implemented for AVX with double precision!");
+    exit(-1);
+    return ret;
+}
+
+static inline MD_SIMD_FLOAT simd_load_h_dual(const MD_FLOAT *m) {
+    MD_SIMD_FLOAT ret;
+    fprintf(stderr, "simd_load_h_dual(): Not implemented for AVX with double precision!");
+    exit(-1);
+    return ret;
+}
+
+static inline MD_FLOAT simd_h_dual_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
+    fprintf(stderr, "simd_h_dual_incr_reduced_sum(): Not implemented for AVX with double precision!\n");
+    exit(-1); // stub: always terminates the program, the arguments are never used
+    return 0.0; // unreachable
+}
+
+static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
+    __m256d t0, t1, t2;
+    __m128d a0, a1;
+
+    t0 = _mm256_hadd_pd(v0, v1);
+    t1 = _mm256_hadd_pd(v2, v3);
+    t2 = _mm256_permute2f128_pd(t0, t1, 0x21);
+    t0 = _mm256_add_pd(t0, t2);
+    t1 = _mm256_add_pd(t1, t2);
+    t0 = _mm256_blend_pd(t0, t1, 0b1100);
+    t1 = _mm256_add_pd(t0, _mm256_load_pd(m));
+    _mm256_store_pd(m, t1);
+
+    t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101));
+    a0 = _mm256_castpd256_pd128(t0);
+    a1 = _mm256_extractf128_pd(t0, 0x1);
+    a0 = _mm_add_sd(a0, a1);
+    return *((MD_FLOAT *) &a0);
+}
+
+static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm256_and_pd(a, m); }
+static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(a))); }
+#ifdef __ISA_AVX_FMA__
+static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_pd(a, b, c); }
+#else
+static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return simd_add(simd_mul(a, b), c); }
+#endif
+static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return simd_add(a, _mm256_and_pd(b, m)); }
+static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_pd(a, b, _CMP_LT_OQ); }
+static inline MD_SIMD_MASK simd_mask_int_cond_lt(MD_SIMD_INT a, MD_SIMD_INT b) { __m128i lt = _mm_cmplt_epi32(a, b); return _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_castsi128_pd(_mm_unpacklo_epi32(lt, lt))), _mm_castsi128_pd(_mm_unpackhi_epi32(lt, lt)), 0x1); } // bitwise widening of each 32-bit compare result to a 64-bit lane mask (a numeric int->double conversion would yield -1.0, not all ones)
+static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _mm256_and_pd(a, b); }
+// TODO: Initialize all diagonal cases and just select the proper one (all bits set or diagonal) based on cond0
+static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) {
+    const unsigned long long int all = 0xFFFFFFFFFFFFFFFF;
+    const unsigned long long int none = 0x0;
+    return _mm256_castsi256_pd(_mm256_set_epi64x((a & 0x8) ? all : none, (a & 0x4) ? all : none, (a & 0x2) ? all : none, (a & 0x1) ? all : none));
+}
+// Packs the sign bit of each 64-bit mask lane into bits 0..3 (lane 0 -> bit 0), the inverse of simd_mask_from_u32()
+static inline int simd_mask_to_u32(MD_SIMD_MASK a) { return _mm256_movemask_pd(a); }
+static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
+    __m128d a0, a1;
+    a = _mm256_add_pd(a, _mm256_permute_pd(a, 0b0101));
+    a0 = _mm256_castpd256_pd128(a);
+    a1 = _mm256_extractf128_pd(a, 0x1);
+    a0 = _mm_add_sd(a0, a1);
+    return *((MD_FLOAT *) &a0);
+}
+
+static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
+    fprintf(stderr, "simd_h_decr3(): Not implemented for AVX with double precision!\n");
+    exit(-1); // stub: always terminates the program, the arguments are never used
+}
+
+// Functions used in LAMMPS kernel
+static inline MD_SIMD_FLOAT simd_gather(MD_SIMD_INT vidx, const MD_FLOAT *m, int s) { return _mm256_i32gather_pd(m, vidx, s); }
+static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm_set1_epi32(scalar); }
+static inline MD_SIMD_INT simd_int_zero() { return _mm_setzero_si128(); }
+static inline MD_SIMD_INT simd_int_seq() { return _mm_set_epi32(3, 2, 1, 0); }
+static inline MD_SIMD_INT simd_int_load(const int *m) { return _mm_load_si128((__m128i const *) m); }
+static inline MD_SIMD_INT simd_int_add(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm_add_epi32(a, b); }
+static inline MD_SIMD_INT simd_int_mul(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm_mullo_epi32(a, b); } // element-wise product, low 32 bits kept in every lane
+static inline MD_SIMD_INT simd_int_mask_load(const int *m, MD_SIMD_MASK k) { __m256 kf = _mm256_castpd_ps(k); return _mm_and_si128(simd_int_load(m), _mm_castps_si128(_mm_shuffle_ps(_mm256_castps256_ps128(kf), _mm256_extractf128_ps(kf, 0x1), _MM_SHUFFLE(2, 0, 2, 0)))); } // bitwise narrowing of each 64-bit mask lane to 32 bits (a numeric double->int conversion would not keep an all-ones mask)
--- a/src/common/simd/avx_float.h
+++ b/src/common/simd/avx_float.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <immintrin.h>
+#include <zmmintrin.h>
+
+#define MD_SIMD_FLOAT   __m256
+#define MD_SIMD_MASK    __mmask8
+
+static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm256_set1_ps(scalar); }
+static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_ps(0.0); }
+static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_ps(p); }
+static inline void simd_store(MD_FLOAT *p, MD_SIMD_FLOAT a) { _mm256_store_ps(p, a); }
+static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm256_mask_mov_ps(_mm256_setzero_ps(), m, a); }
+static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_ps(a); }
+static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_ps(a, b, c); }
+static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_ps(a, m, a, b); }
+static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_ps_mask(a, b, _CMP_LT_OQ); }
+static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
+static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
+static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
+static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
+    __m128 t0;
+    t0 = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 0x1));
+    t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
+    t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
+    return *((MD_FLOAT *) &t0);
+}
+
+static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
+    __m128 t0, t2;
+    v0 = _mm256_hadd_ps(v0, v1);
+    v2 = _mm256_hadd_ps(v2, v3);
+    v0 = _mm256_hadd_ps(v0, v2);
+    t0 = _mm_add_ps(_mm256_castps256_ps128(v0), _mm256_extractf128_ps(v0, 0x1));
+    t2 = _mm_add_ps(t0, _mm_load_ps(m));
+    _mm_store_ps(m, t2);
+
+    t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
+    t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
+    return *((MD_FLOAT *) &t0);
+}
+
+static inline MD_SIMD_FLOAT simd_load_h_duplicate(const MD_FLOAT *m) {
+    return _mm256_broadcast_ps((const __m128 *)(m));
+}
+
+static inline MD_SIMD_FLOAT simd_load_h_dual(const MD_FLOAT *m) {
+    __m128 t0, t1;
+    t0 = _mm_broadcast_ss(m);
+    t1 = _mm_broadcast_ss(m + 1);
+    return _mm256_insertf128_ps(_mm256_castps128_ps256(t0), t1, 0x1);
+}
+
+static inline MD_FLOAT simd_h_dual_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
+    __m128 t0, t1;
+    v0 = _mm256_hadd_ps(v0, v1);
+    t0 = _mm256_extractf128_ps(v0, 0x1);
+    t0 = _mm_hadd_ps(_mm256_castps256_ps128(v0), t0);
+    t0 = _mm_permute_ps(t0, _MM_SHUFFLE(3, 1, 2, 0));
+    t1 = _mm_add_ps(t0, _mm_load_ps(m));
+    _mm_store_ps(m, t1);
+
+    t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
+    t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
+    return *((MD_FLOAT *) &t0);
+}
+
+static inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) { // static: a plain C `inline` definition in a header provides no linkable symbol
+    __m128 asum = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 0x1)); // lower 128-bit half + upper 128-bit half
+    _mm_store_ps(m, _mm_sub_ps(_mm_load_ps(m), asum)); // m[0..3] -= asum
+}
+
+static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
+    simd_h_decr(m, a0);
+    simd_h_decr(m + CLUSTER_N, a1);
+    simd_h_decr(m + CLUSTER_N * 2, a2);
+}
--- a/src/common/thermo.c
+++ b/src/common/thermo.c
@@ -0,0 +1,115 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include <thermo.h>
+#include <util.h>
+
+static int *steparr;
+static MD_FLOAT *tmparr;
+static MD_FLOAT *engarr;
+static MD_FLOAT *prsarr;
+static MD_FLOAT mvv2e;
+static MD_FLOAT dof_boltz;
+static MD_FLOAT t_scale;
+static MD_FLOAT p_scale;
+static MD_FLOAT e_scale;
+static MD_FLOAT t_act;
+static MD_FLOAT p_act;
+static MD_FLOAT e_act;
+static int mstat;
+
+/* exported subroutines */
+/* Allocate the statistics history buffers (exits if that fails) and set the force-field specific scale factors. */
+void setupThermo(Parameter *param, int natoms)
+{
+    int maxstat = param->ntimes / param->nstat + 2;
+    steparr = (int*) malloc(maxstat * sizeof(int));
+    tmparr = (MD_FLOAT*) malloc(maxstat * sizeof(MD_FLOAT));
+    engarr = (MD_FLOAT*) malloc(maxstat * sizeof(MD_FLOAT));
+    prsarr = (MD_FLOAT*) malloc(maxstat * sizeof(MD_FLOAT));
+    if(steparr == NULL || tmparr == NULL || engarr == NULL || prsarr == NULL) { perror("setupThermo()"); exit(-1); }
+    if(param->force_field == FF_LJ) {
+        mvv2e = 1.0;
+        dof_boltz = (natoms * 3 - 3);
+        t_scale = mvv2e / dof_boltz;
+        p_scale = 1.0 / 3 / param->xprd / param->yprd / param->zprd;
+        e_scale = 0.5;
+    } else if(param->force_field == FF_EAM) {
+        mvv2e = 1.036427e-04;
+        dof_boltz = (natoms * 3 - 3) * 8.617343e-05;
+        t_scale = mvv2e / dof_boltz;
+        p_scale = 1.602176e+06 / 3 / param->xprd / param->yprd / param->zprd;
+        e_scale = 524287.985533;//16.0;
+        param->dtforce /= mvv2e;
+    }
+}
+
+void computeThermo(int iflag, Parameter *param, Atom *atom)
+{
+    MD_FLOAT t = 0.0, p;
+    for(int i = 0; i < atom->Nlocal; i++) {
+        t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass;
+    }
+
+    t = t * t_scale;
+    p = (t * dof_boltz) * p_scale;
+    int istep = iflag;
+
+    if(iflag == -1){
+        istep = param->ntimes;
+    }
+    if(iflag == 0){
+        mstat = 0;
+    }
+
+    steparr[mstat] = istep;
+    tmparr[mstat] = t;
+    prsarr[mstat] = p;
+    mstat++;
+    fprintf(stdout, "%i\t%e\t%e\n", istep, t, p);
+}
+
+/* Subtract the mean velocity from all local atoms, then scale their velocities by sqrt(param->temp / t). */
+void adjustThermo(Parameter *param, Atom *atom)
+{
+    /* zero center-of-mass motion */
+    MD_FLOAT vxtot = 0.0; MD_FLOAT vytot = 0.0; MD_FLOAT vztot = 0.0;
+    for(int i = 0; i < atom->Nlocal; i++) {
+        vxtot += atom_vx(i);
+        vytot += atom_vy(i);
+        vztot += atom_vz(i);
+    }
+
+    vxtot = vxtot / atom->Natoms;
+    vytot = vytot / atom->Natoms;
+    vztot = vztot / atom->Natoms;
+
+    for(int i = 0; i < atom->Nlocal; i++) {
+        atom_vx(i) -= vxtot;
+        atom_vy(i) -= vytot;
+        atom_vz(i) -= vztot;
+    }
+
+    t_act = 0;
+    MD_FLOAT t = 0.0;
+    for(int i = 0; i < atom->Nlocal; i++) {
+        t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass;
+    }
+    t *= t_scale;
+    /* With t == 0 the factor below would be infinite and 0 * inf would turn every
+     * velocity into NaN, so the velocities are left untouched in that case. */
+    if(t <= 0.0) { return; }
+    MD_FLOAT factor = sqrt(param->temp / t);
+    for(int i = 0; i < atom->Nlocal; i++) {
+        atom_vx(i) *= factor;
+        atom_vy(i) *= factor;
+        atom_vz(i) *= factor;
+    }
+}
--- a/src/common/thermo.h
+++ b/src/common/thermo.h
@@ -0,0 +1,15 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <parameter.h>
+#include <atom.h>
+
+#ifndef __THERMO_H_
+#define __THERMO_H_
+extern void setupThermo(Parameter*, int);
+extern void computeThermo(int, Parameter*, Atom*);
+extern void adjustThermo(Parameter*, Atom*);
+#endif
--- a/src/common/timers.h
+++ b/src/common/timers.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#ifndef __TIMERS_H_
+#define __TIMERS_H_
+
+typedef enum {
+    TOTAL = 0,
+    NEIGH,
+    FORCE,
+    NUMTIMER
+} timertype;
+
+#endif
--- a/src/common/timing.c
+++ b/src/common/timing.c
@@ -0,0 +1,21 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <time.h>
+
+double getTimeStamp(void)
+{
+    struct timespec now; /* receives the current CLOCK_MONOTONIC reading */
+    clock_gettime(CLOCK_MONOTONIC, &now);
+    return (double)now.tv_sec + 1.e-9 * (double)now.tv_nsec; /* seconds plus nanoseconds, as seconds */
+}
+
+double getTimeResolution(void)
+{
+    struct timespec res; /* receives the resolution reported for CLOCK_MONOTONIC */
+    clock_getres(CLOCK_MONOTONIC, &res);
+    return (double)res.tv_sec + 1.e-9 * (double)res.tv_nsec; /* seconds plus nanoseconds, as seconds */
+}
--- a/src/common/timing.h
+++ b/src/common/timing.h
@@ -0,0 +1,13 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#ifndef __TIMING_H_
+#define __TIMING_H_
+
+extern double getTimeStamp(void);
+extern double getTimeResolution(void);
+
+#endif
--- a/src/common/util.c
+++ b/src/common/util.c
@@ -0,0 +1,120 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <util.h>
+
+/* Park/Miller RNG w/out MASKING, so as to be like f90s version */
+#define IA   16807
+#define IM   2147483647
+#define AM   (1.0 / IM)
+#define IQ   127773
+#define IR   2836
+#define MASK 123459876
+
+double myrandom(int* seed)
+{
+    /* Advance the generator state stored in *seed by one step and return the
+     * new state scaled by AM. */
+    const int q = (*seed) / IQ;
+    int next    = IA * (*seed - q * IQ) - IR * q;
+    if (next < 0) next += IM;
+    *seed = next;
+    return AM * next;
+}
+
+void random_reset(int* seed, int ibase, double* coord)
+{
+    // Hash the bytes of ibase and of the 3 doubles at coord into a new non-zero *seed, then advance the RNG 5 times.
+    int i;
+    char* str         = (char*)&ibase;
+    int n             = sizeof(int);
+    unsigned int hash = 0;
+    for (i = 0; i < n; i++) {
+        hash += str[i];
+        hash += (hash << 10);
+        hash ^= (hash >> 6);
+    }
+
+    str = (char*)coord;
+    n   = 3 * sizeof(double);
+    for (i = 0; i < n; i++) {
+        hash += str[i];
+        hash += (hash << 10);
+        hash ^= (hash >> 6);
+    }
+
+    hash += (hash << 3);
+    hash ^= (hash >> 11);
+    hash += (hash << 15);
+
+    // keep the low 27 bits of the hash as new seed: the mask 0x7ffffff has 27 bits set
+    // NOTE(review): a 31-bit mask would be 0x7fffffff; changing it alters every generated sequence
+    // do not allow seed = 0, since will cause hang in gaussian()
+    *seed = hash & 0x7ffffff;
+    if (!(*seed)) *seed = 1;
+
+    // warm up the RNG
+
+    for (i = 0; i < 5; i++)
+        myrandom(seed);
+    // save = 0;
+}
+
+int str2ff(const char* string)
+{
+    if (strncmp(string, "lj", 2) == 0) return FF_LJ;
+    if (strncmp(string, "eam", 3) == 0) return FF_EAM;
+    if (strncmp(string, "dem", 3) == 0) return FF_DEM;
+    return -1;
+}
+
+const char* ff2str(int ff)
+{
+    /* Map a force-field id to its lower-case name; any other value yields "invalid". */
+    switch (ff) {
+    case FF_LJ:
+        return "lj";
+    case FF_EAM:
+        return "eam";
+    case FF_DEM:
+        return "dem";
+    default: return "invalid";
+    }
+}
+
+int get_cuda_num_threads(void)
+{   /* Value of NUM_THREADS when it parses to a positive integer, otherwise the default of 32. */
+    const char* num_threads_env = getenv("NUM_THREADS");
+    return (num_threads_env != NULL && atoi(num_threads_env) > 0) ? atoi(num_threads_env) : 32;
+}
+
+void readline(char* line, FILE* fp)
+{
+    /* Read one line (at most MAXLINE - 1 characters) from fp into line. fgets() returns NULL on
+     * EOF as well as on error, so ask the stream itself: errno may be stale from an earlier call. */
+    if (fgets(line, MAXLINE, fp) == NULL && ferror(fp)) {
+        perror("readline()");
+        exit(-1);
+    }
+}
+
+#include <stdarg.h> /* va_list, va_start and va_end are used by debug_printf() below */
+/* Print a printf-style message to stdout; the body is compiled only when DEBUG is defined. */
+void debug_printf(const char* format, ...)
+{
+#ifdef DEBUG
+    va_list arg;
+    va_start(arg, format);
+    if (vfprintf(stdout, format, arg) < 0) {
+        perror("debug_printf()");
+    }
+    va_end(arg);
+#endif
+}
--- a/src/common/util.h
+++ b/src/common/util.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#ifndef __UTIL_H_
+#define __UTIL_H_
+
+#include <stdio.h>
+#ifndef MIN
+#define MIN(x, y) ((x) < (y) ? (x) : (y)) // smaller of two values; evaluates one argument twice
+#endif
+
+#ifndef MAX
+#define MAX(x, y) ((x) > (y) ? (x) : (y)) // larger of two values; evaluates one argument twice
+#endif
+
+#ifndef ABS
+#define ABS(a) ((a) >= 0 ? (a) : -(a)) // absolute value; evaluates its argument twice
+#endif
+
+#define DEBUG_MESSAGE debug_printf // alias for the debug output function declared below
+
+#ifndef MAXLINE
+#define MAXLINE 4096 // buffer size handed to fgets() by readline()
+#endif
+
+#define FF_LJ 0 // force-field codes, named "lj", "eam" and "dem" by str2ff()/ff2str()
+#define FF_EAM 1
+#define FF_DEM 2
+
+#if PRECISION == 1 // an undefined PRECISION evaluates to 0 here and therefore selects "double"
+#define PRECISION_STRING "single"
+#else
+#define PRECISION_STRING "double"
+#endif
+
+extern double myrandom(int *); // advances the seed passed by pointer and returns the next random value
+extern void random_reset(int *seed, int ibase, double *coord); // reseeds from a hash of ibase and three doubles at coord
+extern int str2ff(const char *string); // force-field name -> FF_* code, -1 if unknown
+extern const char *ff2str(int ff); // FF_* code -> name, "invalid" if unknown
+extern void readline(char *line, FILE *fp); // reads one line of at most MAXLINE - 1 characters into line
+extern void debug_printf(const char *format, ...); // printf-style output, active only when DEBUG is defined
+extern int get_cuda_num_threads(void); // threads per CUDA block, from the NUM_THREADS environment variable (default 32)
+
+#endif
--- a/src/verletlist/atom.c
+++ b/src/verletlist/atom.c
@@ -0,0 +1,551 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include <atom.h>
+#include <allocate.h>
+#include <device.h>
+#include <util.h>
+
+#define DELTA 20000 // number of atom slots added to Nmax by each growAtom() call
+
+#ifndef MAXLINE
+#define MAXLINE 4096 // fallback line-buffer size, used only if no earlier header defined it
+#endif
+
+#ifndef MAX
+#define MAX(a,b)    ((a) > (b) ? (a) : (b)) // fallback definition; evaluates one argument twice
+#endif
+
+void initAtom(Atom *atom) {
+    // Puts an Atom into its empty state: all counters zero and every host and
+    // device array pointer NULL, so that growAtom() can allocate from scratch.
+    atom->x  = NULL; atom->y  = NULL; atom->z  = NULL;
+    atom->vx = NULL; atom->vy = NULL; atom->vz = NULL;
+    atom->fx = NULL; atom->fy = NULL; atom->fz = NULL;
+    atom->Natoms = 0;
+    atom->Nlocal = 0;
+    atom->Nghost = 0;
+    atom->Nmax   = 0;
+    // Previously left uninitialised although the device-side mirror below was cleared.
+    atom->border_map = NULL;
+    atom->type = NULL;
+    atom->ntypes = 0;
+    atom->epsilon = NULL;
+    atom->sigma6 = NULL;
+    atom->cutforcesq = NULL;
+    atom->cutneighsq = NULL;
+    atom->radius = NULL;
+    atom->av = NULL;
+    atom->r = NULL;
+
+    // Device-side mirror of the arrays above.
+    DeviceAtom *d_atom = &(atom->d_atom);
+    d_atom->x  = NULL; d_atom->y  = NULL; d_atom->z  = NULL;
+    d_atom->vx = NULL; d_atom->vy = NULL; d_atom->vz = NULL;
+    d_atom->fx = NULL; d_atom->fy = NULL; d_atom->fz = NULL;
+    d_atom->border_map = NULL;
+    d_atom->type = NULL;
+    d_atom->epsilon = NULL;
+    d_atom->sigma6 = NULL;
+    d_atom->cutforcesq = NULL;
+    d_atom->cutneighsq = NULL;
+}
+
+void createAtom(Atom *atom, Parameter *param) { // Creates atoms on lattice sites inside the box [0,xprd) x [0,yprd) x [0,zprd) with pseudo-random velocities and fills the per-type-pair parameter tables.
+    MD_FLOAT xlo = 0.0; MD_FLOAT xhi = param->xprd;
+    MD_FLOAT ylo = 0.0; MD_FLOAT yhi = param->yprd;
+    MD_FLOAT zlo = 0.0; MD_FLOAT zhi = param->zprd;
+    atom->Natoms = 4 * param->nx * param->ny * param->nz; // four atoms per lattice cell
+    atom->Nlocal = 0; // incremented below once per atom actually created
+    atom->ntypes = param->ntypes;
+    atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)); // ntypes x ntypes tables, one entry per type pair
+    atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    for(int i = 0; i < atom->ntypes * atom->ntypes; i++) { // every type pair receives the same global parameters
+        atom->epsilon[i] = param->epsilon;
+        atom->sigma6[i] = param->sigma6;
+        atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
+        atom->cutforcesq[i] = param->cutforce * param->cutforce;
+    }
+
+    MD_FLOAT alat = pow((4.0 / param->rho), (1.0 / 3.0)); // cell edge length for which four atoms per cell give density rho
+    int ilo = (int) (xlo / (0.5 * alat) - 1); // index bounds in units of half a cell edge, padded by one on each side
+    int ihi = (int) (xhi / (0.5 * alat) + 1);
+    int jlo = (int) (ylo / (0.5 * alat) - 1);
+    int jhi = (int) (yhi / (0.5 * alat) + 1);
+    int klo = (int) (zlo / (0.5 * alat) - 1);
+    int khi = (int) (zhi / (0.5 * alat) + 1);
+
+    ilo = MAX(ilo, 0); // clamp the bounds to the 2*nx x 2*ny x 2*nz grid of half-cell indices
+    ihi = MIN(ihi, 2 * param->nx - 1);
+    jlo = MAX(jlo, 0);
+    jhi = MIN(jhi, 2 * param->ny - 1);
+    klo = MAX(klo, 0);
+    khi = MIN(khi, 2 * param->nz - 1);
+
+    MD_FLOAT xtmp, ytmp, ztmp, vxtmp, vytmp, vztmp;
+    int i, j, k, m, n;
+    int sx = 0; int sy = 0; int sz = 0; // site offset inside the current sub-box
+    int ox = 0; int oy = 0; int oz = 0; // index of the current sub-box
+    int subboxdim = 8; // sites are visited in sub-boxes of 8 x 8 x 8 half-cell indices
+
+    while(oz * subboxdim <= khi) { // stop once the sub-box origin has moved past the last z index
+
+        k = oz * subboxdim + sz;
+        j = oy * subboxdim + sy;
+        i = ox * subboxdim + sx;
+
+        if(((i + j + k) % 2 == 0) && // only sites whose index sum is even carry an atom
+                (i >= ilo) && (i <= ihi) &&
+                (j >= jlo) && (j <= jhi) &&
+                (k >= klo) && (k <= khi)) {
+
+            xtmp = 0.5 * alat * i;
+            ytmp = 0.5 * alat * j;
+            ztmp = 0.5 * alat * k;
+
+            if( xtmp >= xlo && xtmp < xhi && // keep the site only if it lies inside the half-open box
+                    ytmp >= ylo && ytmp < yhi &&
+                    ztmp >= zlo && ztmp < zhi ) {
+
+                n = k * (2 * param->ny) * (2 * param->nx) + // RNG seed derived from the site's linear grid index
+                    j * (2 * param->nx) +
+                    i + 1;
+
+                for(m = 0; m < 5; m++) { // five draws are discarded before each velocity component
+                    myrandom(&n);
+                }
+                vxtmp = myrandom(&n);
+
+                for(m = 0; m < 5; m++){
+                    myrandom(&n);
+                }
+                vytmp = myrandom(&n);
+
+                for(m = 0; m < 5; m++) {
+                    myrandom(&n);
+                }
+                vztmp = myrandom(&n);
+
+                if(atom->Nlocal == atom->Nmax) { // storage is full: enlarge the arrays before writing
+                    growAtom(atom);
+                }
+
+                atom_x(atom->Nlocal) = xtmp;
+                atom_y(atom->Nlocal) = ytmp;
+                atom_z(atom->Nlocal) = ztmp;
+                atom_vx(atom->Nlocal) = vxtmp;
+                atom_vy(atom->Nlocal) = vytmp;
+                atom_vz(atom->Nlocal) = vztmp;
+                atom->type[atom->Nlocal] = rand() % atom->ntypes; // type drawn from the C library rand()
+                atom->Nlocal++;
+            }
+        }
+
+        sx++; // advance: x fastest inside a sub-box, then y, then z; afterwards move to the next sub-box along x, y, z
+
+        if(sx == subboxdim) { sx = 0; sy++; }
+        if(sy == subboxdim) { sy = 0; sz++; }
+        if(sz == subboxdim) { sz = 0; ox++; }
+        if(ox * subboxdim > ihi) { ox = 0; oy++; }
+        if(oy * subboxdim > jhi) { oy = 0; oz++; }
+    }
+}
+
+int type_str2int(const char *type) {
+    // Maps an element symbol to its integer atom type. Only symbols starting with
+    // "Ar" are known (type 0); any other input, including a NULL token from a
+    // truncated input line, prints an error and terminates the program.
+    if(type != NULL && strncmp(type, "Ar", 2) == 0) { return 0; } // Argon
+    fprintf(stderr, "Invalid atom type: %s\n", type != NULL ? type : "(null)");
+    exit(-1);
+    return -1;
+}
+
+static int hasFileSuffix(const char *name, const char *suffix) {
+    // Non-zero when name ends with suffix; names shorter than the suffix never match.
+    size_t nlen = strlen(name);
+    size_t slen = strlen(suffix);
+    return nlen >= slen && strcmp(name + (nlen - slen), suffix) == 0;
+}
+
+int readAtom(Atom* atom, Parameter* param) {
+    // Dispatches to the reader matching the extension of param->input_file and
+    // returns that reader's result. An unsupported extension prints the list of
+    // valid choices and terminates the program. The suffix test is length-checked,
+    // so file names shorter than the extension are no longer indexed out of bounds.
+    const char *file = param->input_file;
+    if(hasFileSuffix(file, ".pdb")) { return readAtom_pdb(atom, param); }
+    if(hasFileSuffix(file, ".gro")) { return readAtom_gro(atom, param); }
+    if(hasFileSuffix(file, ".dmp")) { return readAtom_dmp(atom, param); }
+    if(hasFileSuffix(file, ".in"))  { return readAtom_in(atom, param); }
+    fprintf(stderr, "Invalid input file extension: %s\nValid choices are: pdb, gro, dmp, in\n", param->input_file);
+    exit(-1);
+    return -1;
+}
+
+static char *pdbNextField(const char *what) {
+    // Returns the next space-separated token of the line currently being split by
+    // strtok(); terminates with a message instead of passing NULL on when it is missing.
+    char *tok = strtok(NULL, " ");
+    if(tok == NULL) {
+        fprintf(stderr, "Input error: missing %s field in PDB record\n", what);
+        exit(-1);
+    }
+    return tok;
+}
+
+int readAtom_pdb(Atom* atom, Parameter* param) {
+    // Reads a space-separated PDB-style file. CRYST1 records set the box extents in
+    // param, ATOM records fill type and position (velocities are zeroed), and
+    // HEADER/REMARK/MODEL/TER/ENDMDL records are ignored. Any other record, or a
+    // file without atoms, terminates the program. Returns the number of atoms read.
+    FILE *fp = fopen(param->input_file, "r");
+    char line[MAXLINE];
+    int read_atoms = 0;
+
+    if(!fp) {
+        fprintf(stderr, "Could not open input file: %s\n", param->input_file);
+        exit(-1);
+        return -1;
+    }
+
+    // Loop on the read itself: testing feof() before reading processed the last
+    // line a second time, because a failed read leaves the buffer unchanged.
+    while(fgets(line, MAXLINE, fp) != NULL) {
+        char *item = strtok(line, " ");
+        if(item == NULL || item[0] == '\n') { continue; } // blank line
+
+        if(strncmp(item, "CRYST1", 6) == 0) {
+            param->xlo = 0.0;
+            param->xhi = atof(pdbNextField("box x"));
+            param->ylo = 0.0;
+            param->yhi = atof(pdbNextField("box y"));
+            param->zlo = 0.0;
+            param->zhi = atof(pdbNextField("box z"));
+            param->xprd = param->xhi - param->xlo;
+            param->yprd = param->yhi - param->ylo;
+            param->zprd = param->zhi - param->zlo;
+            // alpha, beta, gamma, sGroup, z
+        } else if(strncmp(item, "ATOM", 4) == 0) {
+            int atom_id = atoi(pdbNextField("atom id")) - 1; // ids in the file are 1-based
+
+            if(atom_id < 0) {
+                fprintf(stderr, "Input error: invalid atom id %d\n", atom_id + 1);
+                exit(-1);
+                return -1;
+            }
+
+            while(atom_id + 1 >= atom->Nmax) {
+                growAtom(atom);
+            }
+
+            atom->type[atom_id] = type_str2int(pdbNextField("atom type"));
+            (void) pdbNextField("label");        // parsed but not stored
+            (void) pdbNextField("component id"); // parsed but not stored
+            atom_x(atom_id) = atof(pdbNextField("x"));
+            atom_y(atom_id) = atof(pdbNextField("y"));
+            atom_z(atom_id) = atof(pdbNextField("z"));
+            atom_vx(atom_id) = 0.0;
+            atom_vy(atom_id) = 0.0;
+            atom_vz(atom_id) = 0.0;
+            // The remaining fields (occupancy, charge) are not used and therefore not parsed.
+            atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
+            atom->Natoms++;
+            atom->Nlocal++;
+            read_atoms++;
+        } else if(strncmp(item, "HEADER", 6) == 0 ||
+                  strncmp(item, "REMARK", 6) == 0 ||
+                  strncmp(item, "MODEL", 5) == 0 ||
+                  strncmp(item, "TER", 3) == 0 ||
+                  strncmp(item, "ENDMDL", 6) == 0) {
+            // Do nothing
+        } else {
+            fprintf(stderr, "Invalid item: %s\n", item);
+            exit(-1);
+            return -1;
+        }
+    }
+
+    if(ferror(fp)) {
+        perror("readAtom_pdb()");
+        exit(-1);
+        return -1;
+    }
+
+    if(!read_atoms) {
+        fprintf(stderr, "Input error: No atoms read!\n");
+        exit(-1);
+        return -1;
+    }
+
+    // Per-type-pair tables; every pair gets the same global parameters.
+    atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
+        atom->epsilon[i] = param->epsilon;
+        atom->sigma6[i] = param->sigma6;
+        atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
+        atom->cutforcesq[i] = param->cutforce * param->cutforce;
+    }
+
+    fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
+    fclose(fp);
+    return read_atoms;
+}
+
+static char *groField(char *str, const char *what) {
+    // strtok() wrapper: pass the line for the first field and NULL for the following
+    // ones. Terminates with a message when the requested field is missing.
+    char *tok = strtok(str, " ");
+    if(tok == NULL) {
+        fprintf(stderr, "Input error: missing %s field in GRO file\n", what);
+        exit(-1);
+    }
+    return tok;
+}
+
+int readAtom_gro(Atom* atom, Parameter* param) {
+    // Reads a space-separated GRO-style file: a description line, the atom count,
+    // one line per atom (label, type, id, position, velocity) and finally the box
+    // extents. Atoms are stored in file order; the id column is ignored. A count
+    // mismatch terminates the program. Returns the number of atoms read.
+    FILE *fp = fopen(param->input_file, "r");
+    char line[MAXLINE] = ""; // initialised so that an empty file is parsed as empty text
+    char desc[MAXLINE] = "";
+    int read_atoms = 0;
+    int atoms_to_read = 0;
+
+    if(!fp) {
+        fprintf(stderr, "Could not open input file: %s\n", param->input_file);
+        exit(-1);
+        return -1;
+    }
+
+    readline(desc, fp);
+    desc[strcspn(desc, "\n")] = '\0'; // bounded newline strip; also safe when no newline is present
+    readline(line, fp);
+    atoms_to_read = atoi(line);
+    fprintf(stdout, "System: %s with %d atoms\n", desc, atoms_to_read);
+
+    // Loop on the read itself so that a truncated file reaches the count check
+    // below instead of re-parsing a stale buffer.
+    while(read_atoms < atoms_to_read && fgets(line, MAXLINE, fp) != NULL) {
+        int atom_id = read_atoms;
+        int type;
+
+        (void) groField(line, "label");   // parsed but not stored
+        type = type_str2int(groField(NULL, "atom type"));
+        (void) groField(NULL, "atom id"); // parsed but not stored
+
+        while(atom_id + 1 >= atom->Nmax) {
+            growAtom(atom);
+        }
+
+        atom->type[atom_id] = type;
+        atom_x(atom_id) = atof(groField(NULL, "x"));
+        atom_y(atom_id) = atof(groField(NULL, "y"));
+        atom_z(atom_id) = atof(groField(NULL, "z"));
+        atom_vx(atom_id) = atof(groField(NULL, "vx"));
+        atom_vy(atom_id) = atof(groField(NULL, "vy"));
+        atom_vz(atom_id) = atof(groField(NULL, "vz"));
+        atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
+        atom->Natoms++;
+        atom->Nlocal++;
+        read_atoms++;
+    }
+
+    if(ferror(fp)) {
+        perror("readAtom_gro()");
+        exit(-1);
+        return -1;
+    }
+
+    // The line after the atoms holds the box extents.
+    if(read_atoms == atoms_to_read && fgets(line, MAXLINE, fp) != NULL) {
+        param->xlo = 0.0;
+        param->xhi = atof(groField(line, "box x"));
+        param->ylo = 0.0;
+        param->yhi = atof(groField(NULL, "box y"));
+        param->zlo = 0.0;
+        param->zhi = atof(groField(NULL, "box z"));
+        param->xprd = param->xhi - param->xlo;
+        param->yprd = param->yhi - param->ylo;
+        param->zprd = param->zhi - param->zlo;
+    }
+
+    if(read_atoms != atoms_to_read) {
+        fprintf(stderr, "Input error: Number of atoms read do not match (%d/%d).\n", read_atoms, atoms_to_read);
+        exit(-1);
+        return -1;
+    }
+
+    // Per-type-pair tables; every pair gets the same global parameters.
+    atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
+        atom->epsilon[i] = param->epsilon;
+        atom->sigma6[i] = param->sigma6;
+        atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
+        atom->cutforcesq[i] = param->cutforce * param->cutforce;
+    }
+
+    fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
+    fclose(fp);
+    return read_atoms;
+}
+
+int readAtom_dmp(Atom* atom, Parameter* param) {
+    // Reads the first snapshot of an "ITEM: ..." structured dump file: timestep,
+    // atom count, periodic box bounds and one "id type x y z vx vy vz" line per
+    // atom. Atoms are stored at index id - 1. Unknown items or missing data
+    // terminate the program. Returns the atom count announced by the file.
+    FILE *fp = fopen(param->input_file, "r");
+    char line[MAXLINE];
+    int natoms = 0;
+    int read_atoms = 0;
+    int atom_id = -1;
+    int ts = -1;
+
+    if(!fp) {
+        fprintf(stderr, "Could not open input file: %s\n", param->input_file);
+        exit(-1);
+        return -1;
+    }
+
+    // Stops after the first ATOMS section, or when a timestep of 1 or more is seen.
+    while(!feof(fp) && ts < 1 && !read_atoms) {
+        readline(line, fp);
+        if(strncmp(line, "ITEM: ", 6) == 0) {
+            char *item = &line[6];
+
+            if(strncmp(item, "TIMESTEP", 8) == 0) {
+                readline(line, fp);
+                ts = atoi(line);
+            } else if(strncmp(item, "NUMBER OF ATOMS", 15) == 0) {
+                readline(line, fp);
+                natoms = atoi(line);
+                atom->Natoms = natoms;
+                atom->Nlocal = natoms;
+                while(atom->Nlocal >= atom->Nmax) {
+                    growAtom(atom);
+                }
+            } else if(strncmp(item, "BOX BOUNDS pp pp pp", 19) == 0) {
+                readline(line, fp);
+                param->xlo = atof(strtok(line, " "));
+                param->xhi = atof(strtok(NULL, " "));
+                param->xprd = param->xhi - param->xlo;
+
+                readline(line, fp);
+                param->ylo = atof(strtok(line, " "));
+                param->yhi = atof(strtok(NULL, " "));
+                param->yprd = param->yhi - param->ylo;
+
+                readline(line, fp);
+                param->zlo = atof(strtok(line, " "));
+                param->zhi = atof(strtok(NULL, " "));
+                param->zprd = param->zhi - param->zlo;
+            } else if(strncmp(item, "ATOMS id type x y z vx vy vz", 28) == 0) {
+                for(int i = 0; i < natoms; i++) {
+                    readline(line, fp);
+                    atom_id = atoi(strtok(line, " ")) - 1;
+                    // The id comes from the file and is used as an array index:
+                    // reject values outside the storage allocated above.
+                    if(atom_id < 0 || atom_id >= atom->Nmax) {
+                        fprintf(stderr, "Input error: atom id %d out of range\n", atom_id + 1);
+                        exit(-1);
+                        return -1;
+                    }
+                    atom->type[atom_id] = atoi(strtok(NULL, " "));
+                    atom_x(atom_id) = atof(strtok(NULL, " "));
+                    atom_y(atom_id) = atof(strtok(NULL, " "));
+                    atom_z(atom_id) = atof(strtok(NULL, " "));
+                    atom_vx(atom_id) = atof(strtok(NULL, " "));
+                    atom_vy(atom_id) = atof(strtok(NULL, " "));
+                    atom_vz(atom_id) = atof(strtok(NULL, " "));
+                    // NOTE(review): the other readers use type + 1 here; confirm whether
+                    // the types in this format are 1-based.
+                    atom->ntypes = MAX(atom->type[atom_id], atom->ntypes);
+                    read_atoms++;
+                }
+            } else {
+                fprintf(stderr, "Invalid item: %s\n", item);
+                exit(-1);
+                return -1;
+            }
+        } else {
+            fprintf(stderr, "Invalid input from file, expected item reference but got:\n%s\n", line);
+            exit(-1);
+            return -1;
+        }
+    }
+
+    if(ts < 0 || !natoms || !read_atoms) {
+        fprintf(stderr, "Input error: atom data was not read!\n");
+        exit(-1);
+        return -1;
+    }
+
+    // Per-type-pair tables; every pair gets the same global parameters.
+    atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
+        atom->epsilon[i] = param->epsilon;
+        atom->sigma6[i] = param->sigma6;
+        atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
+        atom->cutforcesq[i] = param->cutforce * param->cutforce;
+    }
+
+    fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
+    fclose(fp); // the stream was previously leaked
+    return natoms;
+}
+
+int readAtom_in(Atom* atom, Parameter* param) {
+    // Reads a ".in" file: the first line holds the atom count and the box bounds
+    // (xlo xhi ylo yhi zlo zhi); each following line holds mass, radius, position
+    // and velocity of one atom. All atoms get type 0. Returns the atom count.
+    // NOTE(review): unlike the other readers, xprd/yprd/zprd are not set here -
+    // confirm that callers derive them elsewhere.
+    FILE *fp = fopen(param->input_file, "r");
+    char line[MAXLINE];
+    int natoms = 0;
+    int atom_id = 0;
+
+    if(!fp) {
+        fprintf(stderr, "Could not open input file: %s\n", param->input_file);
+        exit(-1);
+        return -1;
+    }
+
+    readline(line, fp);
+    natoms = atoi(strtok(line, " "));
+    param->xlo = atof(strtok(NULL, " "));
+    param->xhi = atof(strtok(NULL, " "));
+    param->ylo = atof(strtok(NULL, " "));
+    param->yhi = atof(strtok(NULL, " "));
+    param->zlo = atof(strtok(NULL, " "));
+    param->zhi = atof(strtok(NULL, " "));
+    atom->Natoms = natoms;
+    atom->Nlocal = natoms;
+    atom->ntypes = 1;
+
+    while(atom->Nlocal >= atom->Nmax) {
+        growAtom(atom);
+    }
+
+    for(int i = 0; i < natoms; i++) {
+        readline(line, fp);
+
+        // TODO: store mass per atom
+        char *s_mass = strtok(line, " ");
+        if(strncmp(s_mass, "inf", 3) == 0) {
+            // Set atom's mass to INFINITY
+        } else {
+            param->mass = atof(s_mass);
+        }
+
+        atom->radius[atom_id] = atof(strtok(NULL, " "));
+        atom_x(atom_id) = atof(strtok(NULL, " "));
+        atom_y(atom_id) = atof(strtok(NULL, " "));
+        atom_z(atom_id) = atof(strtok(NULL, " "));
+        atom_vx(atom_id) = atof(strtok(NULL, " "));
+        atom_vy(atom_id) = atof(strtok(NULL, " "));
+        atom_vz(atom_id) = atof(strtok(NULL, " "));
+        atom->type[atom_id] = 0;
+        atom->ntypes = MAX(atom->type[atom_id], atom->ntypes);
+        atom_id++;
+    }
+
+    // A negative count is as unusable as zero, so both are rejected.
+    if(natoms <= 0) {
+        fprintf(stderr, "Input error: atom data was not read!\n");
+        exit(-1);
+        return -1;
+    }
+
+    // Per-type-pair tables; every pair gets the same global parameters.
+    atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
+        atom->epsilon[i] = param->epsilon;
+        atom->sigma6[i] = param->sigma6;
+        atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
+        atom->cutforcesq[i] = param->cutforce * param->cutforce;
+    }
+
+    fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
+    fclose(fp); // the stream was previously leaked
+    return natoms;
+}
+
+void writeAtom(Atom *atom, Parameter *param) {
+    // Writes one comma-separated line per local atom (type, the constant 1.0,
+    // position, velocity, trailing 0) to param->write_atom_file and reports the
+    // box extents on stdout. Terminates the program if the file cannot be opened.
+    FILE *fp = fopen(param->write_atom_file, "w");
+
+    if(!fp) {
+        fprintf(stderr, "Could not open output file: %s\n", param->write_atom_file);
+        exit(-1);
+    }
+
+    for(int i = 0; i < atom->Nlocal; i++) {
+        fprintf(fp, "%d,%f,%f,%f,%f,%f,%f,%f,0\n",
+            atom->type[i], 1.0,
+            atom_x(i), atom_y(i), atom_z(i),
+            atom_vx(i), atom_vy(i), atom_vz(i));
+    }
+
+    fclose(fp);
+    fprintf(stdout, "Wrote input data to %s, grid size: %f, %f, %f\n",
+        param->write_atom_file, param->xprd, param->yprd, param->zprd);
+}
+
+void growAtom(Atom *atom) {
+    // Enlarges the per-atom storage by DELTA slots. Position, velocity, force and
+    // type arrays are reallocated on the host and on the device; the DEM arrays
+    // (radius, av, r) only on the host.
+    int nold = atom->Nmax;
+    atom->Nmax += DELTA;
+
+    // REALLOC(field, element type, new size in bytes, old size in bytes):
+    // resizes atom->field and its device mirror atom->d_atom.field.
+    #undef REALLOC
+    #define REALLOC(p,t,ns,os) \
+        do { \
+            atom->p = (t *) reallocate(atom->p, ALIGNMENT, ns, os); \
+            atom->d_atom.p = (t *) reallocateGPU(atom->d_atom.p, ns); \
+        } while(0)
+
+    #ifdef AOS
+    // AoS layout: x, vx and fx each hold three values per atom.
+    REALLOC(x,  MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
+    REALLOC(vx, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
+    REALLOC(fx, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
+    #else
+    REALLOC(x,  MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+    REALLOC(y,  MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+    REALLOC(z,  MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+    REALLOC(vx, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+    REALLOC(vy, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+    REALLOC(vz, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+    REALLOC(fx, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+    REALLOC(fy, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+    REALLOC(fz, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+    #endif
+    REALLOC(type, int, atom->Nmax * sizeof(int), nold * sizeof(int));
+
+    // DEM
+    atom->radius = (MD_FLOAT *) reallocate(atom->radius, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+    atom->av = (MD_FLOAT*) reallocate(atom->av, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
+    atom->r  = (MD_FLOAT*) reallocate(atom->r,  ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 4, nold * sizeof(MD_FLOAT) * 4);
+}
--- a/src/verletlist/atom.h
+++ b/src/verletlist/atom.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <parameter.h>
+
+#ifndef __ATOM_H_
+#define __ATOM_H_
+
+#ifdef CUDA_TARGET // CUDA build: the generic routine names below resolve to the *_cuda implementations
+#define KERNEL_NAME             "CUDA"
+#define computeForceLJFullNeigh computeForceLJFullNeigh_cuda
+#define initialIntegrate        initialIntegrate_cuda
+#define finalIntegrate          finalIntegrate_cuda
+#define buildNeighbor           buildNeighbor_cuda
+#define updatePbc               updatePbc_cuda
+#define updateAtomsPbc          updateAtomsPbc_cuda
+#else
+#ifdef USE_SIMD_KERNEL // CPU build with the SIMD force kernel
+#define KERNEL_NAME             "SIMD"
+#define computeForceLJFullNeigh computeForceLJFullNeigh_simd
+#else
+#define KERNEL_NAME "PLAIN" // computeForceLJFullNeigh is not remapped in this branch
+#endif
+#define initialIntegrate initialIntegrate_cpu // both CPU variants share the *_cpu integration, neighbor and PBC routines
+#define finalIntegrate   finalIntegrate_cpu
+#define buildNeighbor    buildNeighbor_cpu
+#define updatePbc        updatePbc_cpu
+#define updateAtomsPbc   updateAtomsPbc_cpu
+#endif
+
+typedef struct { // Device-side mirror of the per-atom and per-type arrays of Atom; growAtom() fills the per-atom pointers via reallocateGPU()
+    MD_FLOAT *x, *y, *z; // positions
+    MD_FLOAT *vx, *vy, *vz; // velocities
+    MD_FLOAT *fx, *fy, *fz; // forces
+    int* border_map; // NOTE(review): purpose not visible in this file - confirm against the PBC code
+    int* type; // atom type index per atom
+    MD_FLOAT* epsilon; // per-type-pair parameters, indexed type_i * ntypes + type_j by the force kernel
+    MD_FLOAT* sigma6;
+    MD_FLOAT* cutforcesq;
+    MD_FLOAT* cutneighsq;
+} DeviceAtom;
+
+typedef struct { // Host-side atom container plus its device mirror
+    int Natoms, Nlocal, Nghost, Nmax; // Nmax is the allocated capacity, grown in steps of DELTA by growAtom()
+    MD_FLOAT *x, *y, *z; // positions; in AOS builds x holds three values per atom and the accessors do not use y/z
+    MD_FLOAT *vx, *vy, *vz; // velocities, same layout rule as the positions
+    MD_FLOAT *fx, *fy, *fz; // forces, same layout rule as the positions
+    int* border_map; // NOTE(review): purpose not visible in this file - confirm against the PBC code
+    int* type; // atom type index per atom
+    int ntypes; // number of atom types; the four tables below hold ntypes * ntypes entries
+    MD_FLOAT* epsilon;
+    MD_FLOAT* sigma6;
+    MD_FLOAT* cutforcesq; // squared force cutoff per type pair
+    MD_FLOAT* cutneighsq; // squared neighbor cutoff per type pair
+
+    // DEM
+    MD_FLOAT* radius; // one value per atom
+    MD_FLOAT* av; // three values per atom (allocation size in growAtom())
+    MD_FLOAT* r; // four values per atom (allocation size in growAtom())
+
+    // Device data
+    DeviceAtom d_atom;
+} Atom;
+
+extern void initAtom(Atom*); // resets counters to zero and array pointers to NULL
+extern void createAtom(Atom*, Parameter*); // generates atoms on a lattice inside the box given by the parameters
+extern int readAtom(Atom*, Parameter*); // dispatches on the input file extension (pdb, gro, dmp, in)
+extern int readAtom_pdb(Atom*, Parameter*); // each reader returns the number of atoms read
+extern int readAtom_gro(Atom*, Parameter*);
+extern int readAtom_dmp(Atom*, Parameter*);
+extern int readAtom_in(Atom*, Parameter*);
+extern void writeAtom(Atom*, Parameter*); // writes the local atoms as comma-separated lines
+extern void growAtom(Atom*); // enlarges the per-atom arrays
+
+#ifdef AOS // accessors below require a pointer named `atom` in scope at the point of use
+#define POS_DATA_LAYOUT "AoS"
+#define atom_x(i)       atom->x[(i) * 3 + 0] // AoS: the three components of atom i are adjacent in x, vx and fx
+#define atom_y(i)       atom->x[(i) * 3 + 1]
+#define atom_z(i)       atom->x[(i) * 3 + 2]
+#define atom_vx(i)      atom->vx[(i) * 3 + 0]
+#define atom_vy(i)      atom->vx[(i) * 3 + 1]
+#define atom_vz(i)      atom->vx[(i) * 3 + 2]
+#define atom_fx(i)      atom->fx[(i) * 3 + 0]
+#define atom_fy(i)      atom->fx[(i) * 3 + 1]
+#define atom_fz(i)      atom->fx[(i) * 3 + 2]
+#else
+#define POS_DATA_LAYOUT "SoA"
+#define atom_x(i)       atom->x[i] // SoA: one separate array per component
+#define atom_y(i)       atom->y[i]
+#define atom_z(i)       atom->z[i]
+#define atom_vx(i)      atom->vx[i]
+#define atom_vy(i)      atom->vy[i]
+#define atom_vz(i)      atom->vz[i]
+#define atom_fx(i)      atom->fx[i]
+#define atom_fy(i)      atom->fy[i]
+#define atom_fz(i)      atom->fz[i]
+#endif
+
+#endif
--- a/src/verletlist/cuda/force.cu
+++ b/src/verletlist/cuda/force.cu
@@ -0,0 +1,180 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+//---
+#include <cuda_profiler_api.h>
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+//---
+#include <likwid-marker.h>
+
+extern "C" {
+
+#include <allocate.h>
+#include <atom.h>
+#include <allocate.h>
+#include <device.h>
+#include <neighbor.h>
+#include <parameter.h>
+#include <timing.h>
+#include <util.h>
+
+}
+
+// cuda kernel
+__global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh, int ntypes) { // One thread per local atom: sums the pair force over the atom's neighbor list and overwrites its force entry. neigh_maxneighs is not used in the body.
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if(i >= Nlocal) { // surplus threads of the last block have no atom
+        return;
+    }
+
+    DeviceAtom *atom = &a; // the atom_* accessor macros expect a pointer named `atom`
+    const int numneighs = neigh_numneigh[i];
+
+    MD_FLOAT xtmp = atom_x(i);
+    MD_FLOAT ytmp = atom_y(i);
+    MD_FLOAT ztmp = atom_z(i);
+
+    MD_FLOAT fix = 0; // force accumulators for atom i
+    MD_FLOAT fiy = 0;
+    MD_FLOAT fiz = 0;
+
+#ifdef EXPLICIT_TYPES
+    const int type_i = atom->type[i];
+#endif
+
+    for(int k = 0; k < numneighs; k++) {
+        int j = neigh_neighbors[Nlocal * k + i]; // k-th neighbor of atom i; entries for a given k are stored Nlocal apart
+        MD_FLOAT delx = xtmp - atom_x(j);
+        MD_FLOAT dely = ytmp - atom_y(j);
+        MD_FLOAT delz = ztmp - atom_z(j);
+        MD_FLOAT rsq = delx * delx + dely * dely + delz * delz; // squared distance
+
+#ifdef EXPLICIT_TYPES
+        const int type_j = atom->type[j];
+        const int type_ij = type_i * ntypes + type_j;
+        const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij]; // per-pair values shadow the kernel parameters of the same name
+        const MD_FLOAT sigma6 = atom->sigma6[type_ij];
+        const MD_FLOAT epsilon = atom->epsilon[type_ij];
+#endif
+
+        if(rsq < cutforcesq) { // only pairs inside the force cutoff contribute
+            MD_FLOAT sr2 = 1.0 / rsq;
+            MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
+            MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
+            fix += delx * force;
+            fiy += dely * force;
+            fiz += delz * force;
+        }
+    }
+
+    atom_fx(i) = fix; // overwrite, not accumulate: the force array needs no prior zeroing
+    atom_fy(i) = fiy;
+    atom_fz(i) = fiz;
+}
+
+__global__ void kernel_initial_integrate(MD_FLOAT dtforce, MD_FLOAT dt, int Nlocal, DeviceAtom a) {
+    // One thread per local atom: first advance the velocity by dtforce * force,
+    // then move the position by dt times the updated velocity.
+    DeviceAtom *atom = &a; // the accessor macros expect a pointer named `atom`
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if(idx < Nlocal) {
+        atom_vx(idx) += dtforce * atom_fx(idx);
+        atom_vy(idx) += dtforce * atom_fy(idx);
+        atom_vz(idx) += dtforce * atom_fz(idx);
+        atom_x(idx) += dt * atom_vx(idx);
+        atom_y(idx) += dt * atom_vy(idx);
+        atom_z(idx) += dt * atom_vz(idx);
+    }
+}
+
+__global__ void kernel_final_integrate(MD_FLOAT dtforce, int Nlocal, DeviceAtom a) {
+    // One thread per local atom: advance the velocity by dtforce * force.
+    DeviceAtom *atom = &a; // the accessor macros expect a pointer named `atom`
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if(idx < Nlocal) {
+        atom_vx(idx) += dtforce * atom_fx(idx);
+        atom_vy(idx) += dtforce * atom_fy(idx);
+        atom_vz(idx) += dtforce * atom_fz(idx);
+    }
+}
+
+extern "C" {
+
+void finalIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
+    // Launches kernel_final_integrate over all local atoms and waits for it to
+    // finish. When reneigh is set, the velocities are copied back to the host.
+    const int Nlocal = atom->Nlocal;
+    const int num_threads_per_block = get_cuda_num_threads();
+    // Ceiling division in double precision: float cannot represent every atom
+    // count above 2^24, which could yield one block too few.
+    const int num_blocks = (int) ceil((double)Nlocal / (double)num_threads_per_block);
+
+    kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, atom->d_atom);
+    cuda_assert("kernel_final_integrate", cudaPeekAtLastError());
+    cuda_assert("kernel_final_integrate", cudaDeviceSynchronize());
+
+    if(reneigh) {
+        // NOTE(review): copies 3 * Nlocal values starting at vx, which matches the
+        // AoS layout; with the SoA accessors vy/vz are separate arrays - confirm.
+        memcpyFromGPU(atom->vx, atom->d_atom.vx, sizeof(MD_FLOAT) * atom->Nlocal * 3);
+    }
+}
+
+void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
+    // Launches kernel_initial_integrate over all local atoms and waits for it to
+    // finish. When reneigh is set, the velocities are copied back to the host.
+    const int Nlocal = atom->Nlocal;
+    const int num_threads_per_block = get_cuda_num_threads();
+    // Ceiling division in double precision: float cannot represent every atom
+    // count above 2^24, which could yield one block too few.
+    const int num_blocks = (int) ceil((double)Nlocal / (double)num_threads_per_block);
+
+    kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, atom->d_atom);
+    cuda_assert("kernel_initial_integrate", cudaPeekAtLastError());
+    cuda_assert("kernel_initial_integrate", cudaDeviceSynchronize());
+
+    if(reneigh) {
+        // NOTE(review): copies 3 * Nlocal values starting at vx, which matches the
+        // AoS layout; with the SoA accessors vy/vz are separate arrays - confirm.
+        memcpyFromGPU(atom->vx, atom->d_atom.vx, sizeof(MD_FLOAT) * atom->Nlocal * 3);
+    }
+}
+
+double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neighbor) {
+    // Launches the calc_force kernel over all local atoms using the device-side
+    // neighbor lists, waits for completion and returns the elapsed time as the
+    // difference of two getTimeStamp() values.
+    const int num_threads_per_block = get_cuda_num_threads();
+    int Nlocal = atom->Nlocal;
+    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
+    MD_FLOAT sigma6 = param->sigma6;
+    MD_FLOAT epsilon = param->epsilon;
+
+    // HINT: Run with cuda-memcheck ./MDBench-NVCC in case of error
+    // The kernel overwrites every force entry, so the force array is not cleared here.
+
+    cudaProfilerStart();
+    // Ceiling division in double precision: float cannot represent every atom
+    // count above 2^24, which could yield one block too few.
+    const int num_blocks = (int) ceil((double)Nlocal / (double)num_threads_per_block);
+    double S = getTimeStamp();
+    LIKWID_MARKER_START("force");
+
+    calc_force <<< num_blocks, num_threads_per_block >>> (atom->d_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, neighbor->d_neighbor.neighbors, neighbor->d_neighbor.numneigh, atom->ntypes);
+    cuda_assert("calc_force", cudaPeekAtLastError());
+    cuda_assert("calc_force", cudaDeviceSynchronize());
+    cudaProfilerStop();
+
+    LIKWID_MARKER_STOP("force");
+    double E = getTimeStamp();
+    return E-S;
+}
+
+}
--- a/src/verletlist/cuda/neighbor.cu
+++ b/src/verletlist/cuda/neighbor.cu
@@ -0,0 +1,293 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <cuda_profiler_api.h>
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+//---
+
+extern "C" {
+
+#include <atom.h>
+#include <device.h>
+#include <parameter.h>
+#include <neighbor.h>
+#include <util.h>
+
+}
+
+extern MD_FLOAT xprd, yprd, zprd;  // upper box bounds tested in coord2bin_device; defined outside this file
+extern MD_FLOAT bininvx, bininvy, bininvz;  // factors that scale a coordinate to a bin index
+extern int mbinxlo, mbinylo, mbinzlo;  // offsets subtracted from the raw per-axis bin index
+extern int nbinx, nbiny, nbinz;  // added to the bin index for coordinates at or beyond the upper bound
+extern int mbinx, mbiny, mbinz; // n bins in x, y, z
+extern int mbins; // total number of bins
+extern int atoms_per_bin;  // max atoms per bin; updated by binatoms_cuda after a resize
+extern MD_FLOAT cutneighsq;  // neighbor cutoff squared
+extern int nmax;  // atom capacity the device neighbor arrays were last sized for
+extern int nstencil;      // # of bins in stencil
+extern int* stencil;      // stencil list of bin offsets (host memory)
+static int* c_stencil = NULL;  // GPU copy of stencil, allocated on the first buildNeighbor_cuda call
+static int* c_resize_needed = NULL;  // single GPU int raised by binatoms_kernel when a bin overflows
+static int* c_new_maxneighs = NULL;  // single GPU int holding the largest neighbor count above maxneighs
+static Binning c_binning {  // GPU-side binning buffers; mbins == 0 means not yet allocated
+    .bincount = NULL,
+    .bins = NULL,
+    .mbins = 0,
+    .atoms_per_bin = 0
+};
+
+/* Maps a single coordinate to its bin index along one axis. */
+static __device__ int axis2bin_device(MD_FLOAT pos, MD_FLOAT prd, MD_FLOAT bininv, int nbin, int mbinlo) {
+    if(pos >= prd) {
+        // at or beyond the upper bound: measure from the bound and continue after the nbin regular bins
+        return (int)((pos - prd) * bininv) + nbin - mbinlo;
+    }
+
+    if(pos >= 0.0) {
+        return (int)(pos * bininv) - mbinlo;
+    }
+
+    // negative coordinates: the cast truncates towards zero, so step one bin further down
+    return (int)(pos * bininv) - mbinlo - 1;
+}
+
+/*
+ * Returns the linear bin index of the position (xin, yin, zin).
+ * The per-axis indices are combined with x running fastest, then y, then z,
+ * and the result is shifted by one.
+ */
+__device__ int coord2bin_device(MD_FLOAT xin, MD_FLOAT yin, MD_FLOAT zin, Neighbor_params np) {
+    const int ix = axis2bin_device(xin, np.xprd, np.bininvx, np.nbinx, np.mbinxlo);
+    const int iy = axis2bin_device(yin, np.yprd, np.bininvy, np.nbiny, np.mbinylo);
+    const int iz = axis2bin_device(zin, np.zprd, np.bininvz, np.nbinz, np.mbinzlo);
+
+    return (iz * np.mbiny * np.mbinx + iy * np.mbinx + ix + 1);
+}
+
+/*
+ * Sorts the atom indices stored in every bin into ascending order so that the
+ * bin contents are comparable to the CPU version. Each thread handles one bin
+ * and bubble-sorts it in place; this is adequate because the number of atoms
+ * per bin should be relatively small.
+ */
+__global__ void sort_bin_contents_kernel(int* bincount, int* bins, int mbins, int atoms_per_bin){
+    const int bin = blockIdx.x * blockDim.x + threadIdx.x;
+    if(bin >= mbins) {
+        return;
+    }
+
+    const int count = bincount[bin];
+    int *entries = bins + bin * atoms_per_bin;
+    bool swapped = true;
+
+    // keep sweeping until a full pass exchanges nothing
+    while(swapped) {
+        swapped = false;
+        for(int pos = 0; pos < count - 1; pos++) {
+            const int lhs = entries[pos];
+            const int rhs = entries[pos + 1];
+            if(lhs > rhs) {
+                entries[pos] = rhs;
+                entries[pos + 1] = lhs;
+                swapped = true;
+            }
+        }
+    }
+}
+
+/*
+ * Places every atom (one thread per atom, nall atoms in total) into its bin.
+ * A slot inside the bin is reserved with an atomic increment of bincount. If
+ * the bin is already full the atom is not stored; instead the overflowing
+ * slot number is folded into *resize_needed with atomicMax so the host can
+ * detect the overflow.
+ */
+__global__ void binatoms_kernel(DeviceAtom a, int nall, int* bincount, int* bins, int atoms_per_bin, Neighbor_params np, int *resize_needed) {
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if(idx >= nall) {
+        return;
+    }
+
+    DeviceAtom* atom = &a; // presumably referenced by the atom_x/y/z accessors
+    const int ibin = coord2bin_device(atom_x(idx), atom_y(idx), atom_z(idx), np);
+    const int slot = atomicAdd(&bincount[ibin], 1);
+
+    if(slot >= atoms_per_bin) {
+        atomicMax(resize_needed, slot);
+        return;
+    }
+
+    bins[ibin * atoms_per_bin + slot] = idx;
+}
+
+/*
+ * Builds the neighbor list of one local atom per thread.
+ *
+ * For atom i, every bin reached through the stencil offsets is scanned and
+ * each atom j != i whose squared distance is within the cutoff is recorded.
+ * Neighbor k of atom i is stored at neighbors[i + nlocal * k]. The total
+ * number of neighbors found is written to numneigh[i]; if it exceeds
+ * maxneighs, it is folded into *new_maxneighs with atomicMax so the host can
+ * enlarge the list and launch the kernel again.
+ *
+ * With EXPLICIT_TYPES the cutoff is looked up per type pair in
+ * atom->cutneighsq, otherwise the cutneighsq argument is used.
+ */
+__global__ void compute_neighborhood(
+    DeviceAtom a, DeviceNeighbor neigh, Neighbor_params np, int nlocal, int maxneighs, int nstencil, int* stencil,
+    int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq, int ntypes) {
+
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if(i >= nlocal) {
+        return;
+    }
+
+    DeviceAtom *atom = &a;
+    DeviceNeighbor *neighbor = &neigh;
+
+    int* neighptr = &(neighbor->neighbors[i]);
+    int n = 0;
+    MD_FLOAT xtmp = atom_x(i);
+    MD_FLOAT ytmp = atom_y(i);
+    MD_FLOAT ztmp = atom_z(i);
+    int ibin = coord2bin_device(xtmp, ytmp, ztmp, np);
+#ifdef EXPLICIT_TYPES
+    int type_i = atom->type[i];
+#endif
+    for(int k = 0; k < nstencil; k++) {
+        int jbin = ibin + stencil[k];
+        int* loc_bin = &bins[jbin * atoms_per_bin];
+
+        for(int m = 0; m < bincount[jbin]; m++) {
+            int j = loc_bin[m];
+
+            if ( j == i ){
+                continue;
+            }
+
+            MD_FLOAT delx = xtmp - atom_x(j);
+            MD_FLOAT dely = ytmp - atom_y(j);
+            MD_FLOAT delz = ztmp - atom_z(j);
+            MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+
+#ifdef EXPLICIT_TYPES
+            int type_j = atom->type[j];
+            const MD_FLOAT cutoff = atom->cutneighsq[type_i * ntypes + type_j];
+#else
+            const MD_FLOAT cutoff = cutneighsq;
+#endif
+
+            if( rsq <= cutoff ) {
+                // Only store while there is room: the list holds maxneighs entries
+                // per atom, so writing further would run past the allocation.
+                // Counting continues so the host learns the required size.
+                if(n < maxneighs) {
+                    neighptr[nlocal * n] = j;
+                }
+                n += 1;
+            }
+        }
+    }
+
+    neighbor->numneigh[i] = n;
+    if(n > maxneighs) {
+        atomicMax(new_maxneighs, n);
+    }
+}
+
+/*
+ * Bins all local and ghost atoms on the GPU and then sorts each bin.
+ *
+ * The binning kernel is repeated until no bin overflows: after every failed
+ * attempt atoms_per_bin is doubled and the bins array is reallocated. On
+ * success the global atoms_per_bin is updated to the capacity actually used.
+ *
+ * atom:              supplies Nlocal, Nghost and the device atom data
+ * c_binning:         GPU binning buffers; atoms_per_bin and bins may be changed
+ * c_resize_needed:   GPU int used by the kernel to report an overflow
+ * np:                binning parameters passed by value to the kernel
+ * threads_per_block: block size for both kernel launches
+ */
+void binatoms_cuda(Atom *atom, Binning *c_binning, int *c_resize_needed, Neighbor_params *np, const int threads_per_block) {
+    const int nall = atom->Nlocal + atom->Nghost;
+    // Integer ceiling division; float rounding could lose a block for counts above 2^24
+    const int num_blocks = (nall + threads_per_block - 1) / threads_per_block;
+    int resize = 1;
+
+    while(resize > 0) {
+        resize = 0;
+        memsetGPU(c_binning->bincount, 0, c_binning->mbins * sizeof(int));
+        memsetGPU(c_resize_needed, 0, sizeof(int));
+
+        binatoms_kernel<<<num_blocks, threads_per_block>>>(atom->d_atom, nall, c_binning->bincount, c_binning->bins, c_binning->atoms_per_bin, *np, c_resize_needed);
+        cuda_assert("binatoms", cudaPeekAtLastError());
+        cuda_assert("binatoms", cudaDeviceSynchronize());
+
+        memcpyFromGPU(&resize, c_resize_needed, sizeof(int));
+        if(resize) {
+            // old contents are not needed: the next pass bins every atom again
+            c_binning->atoms_per_bin *= 2;
+            c_binning->bins = (int *) reallocateGPU(c_binning->bins, c_binning->mbins * c_binning->atoms_per_bin * sizeof(int));
+        }
+    }
+
+    atoms_per_bin = c_binning->atoms_per_bin;
+    // Size the grid from the same bin count the kernel uses as its bound
+    const int sortBlocks = (c_binning->mbins + threads_per_block - 1) / threads_per_block;
+    sort_bin_contents_kernel<<<sortBlocks, threads_per_block>>>(c_binning->bincount, c_binning->bins, c_binning->mbins, c_binning->atoms_per_bin);
+    cuda_assert("sort_bin", cudaPeekAtLastError());
+    cuda_assert("sort_bin", cudaDeviceSynchronize());
+}
+
+/*
+ * Builds the full neighbor list on the GPU.
+ *
+ * Steps:
+ *   1. lazily allocate the GPU stencil copy, the binning buffers and the two
+ *      single-int feedback cells on first use,
+ *   2. bin local and ghost atoms (binatoms_cuda),
+ *   3. grow the device neighbor arrays if the atom count exceeds nmax,
+ *   4. run compute_neighborhood, enlarging maxneighs and the device neighbor
+ *      list and repeating until every atom's neighbors fit.
+ *
+ * atom:     supplies Nlocal, Nghost, Nmax, ntypes and the device atom data
+ * neighbor: maxneighs and the device arrays in d_neighbor may be updated
+ */
+void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor) {
+    DeviceNeighbor *d_neighbor = &(neighbor->d_neighbor);
+    const int num_threads_per_block = get_cuda_num_threads();
+    int nall = atom->Nlocal + atom->Nghost;
+
+    cudaProfilerStart();
+
+    // TODO move all of this initialization into its own method
+    if(c_stencil == NULL) {
+        c_stencil = (int *) allocateGPU(nstencil * sizeof(int));
+        memcpyToGPU(c_stencil, stencil, nstencil * sizeof(int));
+    }
+
+    if(c_binning.mbins == 0) {
+        c_binning.mbins = mbins;
+        c_binning.atoms_per_bin = atoms_per_bin;
+        c_binning.bincount = (int *) allocateGPU(c_binning.mbins * sizeof(int));
+        c_binning.bins = (int *) allocateGPU(c_binning.mbins * c_binning.atoms_per_bin * sizeof(int));
+    }
+
+    Neighbor_params np {
+        .xprd = xprd,
+        .yprd = yprd,
+        .zprd = zprd,
+        .bininvx = bininvx,
+        .bininvy = bininvy,
+        .bininvz = bininvz,
+        .mbinxlo = mbinxlo,
+        .mbinylo = mbinylo,
+        .mbinzlo = mbinzlo,
+        .nbinx = nbinx,
+        .nbiny = nbiny,
+        .nbinz = nbinz,
+        .mbinx = mbinx,
+        .mbiny = mbiny,
+        .mbinz = mbinz
+    };
+
+    if(c_resize_needed == NULL) {
+        c_resize_needed = (int *) allocateGPU(sizeof(int));
+    }
+
+    /* bin local & ghost atoms */
+    binatoms_cuda(atom, &c_binning, c_resize_needed, &np, num_threads_per_block);
+    if(c_new_maxneighs == NULL) {
+        c_new_maxneighs = (int *) allocateGPU(sizeof(int));
+    }
+
+    int resize = 1;
+
+    if(nall > nmax) {
+        nmax = nall;
+        d_neighbor->neighbors = (int *) reallocateGPU(d_neighbor->neighbors, nmax * neighbor->maxneighs * sizeof(int));
+        d_neighbor->numneigh  = (int *) reallocateGPU(d_neighbor->numneigh,  nmax * sizeof(int));
+    }
+
+    // Integer ceiling division; float rounding could lose a block for counts above 2^24
+    const int num_blocks = (atom->Nlocal + num_threads_per_block - 1) / num_threads_per_block;
+
+    /* loop over each atom, storing neighbors */
+    while(resize) {
+        resize = 0;
+        memsetGPU(c_new_maxneighs, 0, sizeof(int));
+        // *d_neighbor is passed by value on every launch, so a pointer replaced
+        // by the resize below is picked up by the next iteration
+        compute_neighborhood<<<num_blocks, num_threads_per_block>>>(atom->d_atom, *d_neighbor,
+                                                                    np, atom->Nlocal, neighbor->maxneighs, nstencil, c_stencil,
+                                                                    c_binning.bins, c_binning.atoms_per_bin, c_binning.bincount,
+                                                                    c_new_maxneighs,
+                                                                    cutneighsq, atom->ntypes);
+
+        cuda_assert("compute_neighborhood", cudaPeekAtLastError());
+        cuda_assert("compute_neighborhood", cudaDeviceSynchronize());
+
+        int new_maxneighs;
+        memcpyFromGPU(&new_maxneighs, c_new_maxneighs, sizeof(int));
+        if(new_maxneighs > neighbor->maxneighs){
+            resize = 1;
+        }
+
+        if(resize) {
+            printf("RESIZE %d\n", neighbor->maxneighs);
+            neighbor->maxneighs = new_maxneighs * 1.2;
+            printf("NEW SIZE %d\n", neighbor->maxneighs);
+            // Grow the DEVICE list that the kernel writes to. The host array
+            // neighbor->neighbors must not be handed to the GPU allocator.
+            d_neighbor->neighbors = (int *) reallocateGPU(d_neighbor->neighbors, atom->Nmax * neighbor->maxneighs * sizeof(int));
+        }
+
+    }
+
+    cudaProfilerStop();
+}
--- a/src/verletlist/cuda/pbc.cu
+++ b/src/verletlist/cuda/pbc.cu
@@ -0,0 +1,111 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+//---
+
+extern "C" {
+
+#include <allocate.h>
+#include <atom.h>
+#include <device.h>
+#include <pbc.h>
+#include <util.h>
+
+}
+
+extern int NmaxGhost;  // element count used for the host PBC arrays when uploading; defined outside this file
+extern int *PBCx, *PBCy, *PBCz;  // host per-ghost integer multipliers of the box lengths
+static int c_NmaxGhost = 0;  // NmaxGhost value the GPU buffers below were last sized for
+static int *c_PBCx = NULL, *c_PBCy = NULL, *c_PBCz = NULL;  // GPU copies of PBCx/PBCy/PBCz
+
+/*
+ * Wraps every local atom (one thread per atom) back into the periodic box.
+ * Per axis, a coordinate below zero is shifted up by the box length and a
+ * coordinate at or above the box length is shifted down by it; at most one
+ * shift is applied.
+ */
+__global__ void computeAtomsPbcUpdate(DeviceAtom a, int nlocal, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd) {
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if(i >= nlocal) {
+        return;
+    }
+
+    DeviceAtom *atom = &a;
+    const MD_FLOAT x = atom_x(i);
+    const MD_FLOAT y = atom_y(i);
+    const MD_FLOAT z = atom_z(i);
+
+    if(x < 0.0) {
+        atom_x(i) = x + xprd;
+    } else if(x >= xprd) {
+        atom_x(i) = x - xprd;
+    }
+
+    if(y < 0.0) {
+        atom_y(i) = y + yprd;
+    } else if(y >= yprd) {
+        atom_y(i) = y - yprd;
+    }
+
+    if(z < 0.0) {
+        atom_z(i) = z + zprd;
+    } else if(z >= zprd) {
+        atom_z(i) = z - zprd;
+    }
+}
+
+/*
+ * Refreshes the positions of the ghost atoms, one thread per ghost.
+ * Ghost i is stored at index nlocal + i and is set to the position of atom
+ * border_map[i] displaced by PBCx/PBCy/PBCz[i] box lengths along each axis.
+ */
+__global__ void computePbcUpdate(DeviceAtom a, int nlocal, int nghost, int* PBCx, int* PBCy, int* PBCz, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if(i >= nghost) {
+        return;
+    }
+
+    DeviceAtom* atom = &a;
+    const int ghost = nlocal + i;
+    const int source = atom->border_map[i];
+
+    atom_x(ghost) = atom_x(source) + PBCx[i] * xprd;
+    atom_y(ghost) = atom_y(source) + PBCy[i] * yprd;
+    atom_z(ghost) = atom_z(source) + PBCz[i] * zprd;
+}
+
+/*
+ * Updates the coordinates of the ghost atoms on the GPU, using the mapping
+ * created in setupPbc.
+ *
+ * atom:    supplies Nlocal, Nghost, Nmax, the host arrays and the device data
+ * param:   supplies the box lengths xprd, yprd and zprd
+ * reneigh: when true, the host positions, types, PBC multipliers and border
+ *          map are first uploaded to the GPU (growing the GPU buffers if
+ *          NmaxGhost has increased)
+ */
+void updatePbc_cuda(Atom *atom, Parameter *param, bool reneigh) {
+    const int num_threads_per_block = get_cuda_num_threads();
+
+    if(reneigh) {
+        memcpyToGPU(atom->d_atom.x,     atom->x,    sizeof(MD_FLOAT) * atom->Nmax * 3);
+        memcpyToGPU(atom->d_atom.type,  atom->type, sizeof(int) * atom->Nmax);
+
+        if(c_NmaxGhost < NmaxGhost) {
+            c_NmaxGhost = NmaxGhost;
+            c_PBCx = (int *) reallocateGPU(c_PBCx, NmaxGhost * sizeof(int));
+            c_PBCy = (int *) reallocateGPU(c_PBCy, NmaxGhost * sizeof(int));
+            c_PBCz = (int *) reallocateGPU(c_PBCz, NmaxGhost * sizeof(int));
+            atom->d_atom.border_map = (int *) reallocateGPU(atom->d_atom.border_map, NmaxGhost * sizeof(int));
+        }
+
+        memcpyToGPU(c_PBCx, PBCx, NmaxGhost * sizeof(int));
+        memcpyToGPU(c_PBCy, PBCy, NmaxGhost * sizeof(int));
+        memcpyToGPU(c_PBCz, PBCz, NmaxGhost * sizeof(int));
+        memcpyToGPU(atom->d_atom.border_map, atom->border_map, NmaxGhost * sizeof(int));
+        cuda_assert("updatePbc.reneigh", cudaPeekAtLastError());
+        cuda_assert("updatePbc.reneigh", cudaDeviceSynchronize());
+    }
+
+    // Nothing to update without ghost atoms; a grid of zero blocks would be
+    // rejected as an invalid launch configuration.
+    if(atom->Nghost <= 0) {
+        return;
+    }
+
+    MD_FLOAT xprd = param->xprd;
+    MD_FLOAT yprd = param->yprd;
+    MD_FLOAT zprd = param->zprd;
+
+    // Integer ceiling division; float rounding could lose a block for counts above 2^24
+    const int num_blocks = (atom->Nghost + num_threads_per_block - 1) / num_threads_per_block;
+    computePbcUpdate<<<num_blocks, num_threads_per_block>>>(atom->d_atom, atom->Nlocal, atom->Nghost, c_PBCx, c_PBCy, c_PBCz, xprd, yprd, zprd);
+    cuda_assert("updatePbc", cudaPeekAtLastError());
+    cuda_assert("updatePbc", cudaDeviceSynchronize());
+}
+
+/*
+ * Wraps all local atoms back into the periodic box on the GPU and copies the
+ * resulting local positions back to the host array atom->x.
+ *
+ * atom:  supplies Nlocal, the device atom data and the host position array
+ * param: supplies the box lengths xprd, yprd and zprd
+ */
+void updateAtomsPbc_cuda(Atom* atom, Parameter *param) {
+    const int num_threads_per_block = get_cuda_num_threads();
+    MD_FLOAT xprd = param->xprd;
+    MD_FLOAT yprd = param->yprd;
+    MD_FLOAT zprd = param->zprd;
+
+    // Integer ceiling division; float rounding could lose a block for counts above 2^24
+    const int num_blocks = (atom->Nlocal + num_threads_per_block - 1) / num_threads_per_block;
+    computeAtomsPbcUpdate<<<num_blocks, num_threads_per_block>>>(atom->d_atom, atom->Nlocal, xprd, yprd, zprd);
+    cuda_assert("computeAtomsPbcUpdate", cudaPeekAtLastError());
+    cuda_assert("computeAtomsPbcUpdate", cudaDeviceSynchronize());
+    memcpyFromGPU(atom->x, atom->d_atom.x, sizeof(MD_FLOAT) * atom->Nlocal * 3);
+}
--- a/src/verletlist/device_spec.c
+++ b/src/verletlist/device_spec.c
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <device.h>
+
+#ifdef CUDA_TARGET
+
+/*
+ * Prepares the GPU-side data: allocates the per-type-pair parameter tables
+ * and the neighbor arrays, then uploads positions, velocities, the parameter
+ * tables and the atom types from the host.
+ *
+ * NOTE(review): d_atom->x, d_atom->vx and d_atom->type are copied to but not
+ * allocated in this function; presumably they are allocated before this call
+ * -- confirm against the caller.
+ */
+void initDevice(Atom *atom, Neighbor *neighbor) {
+    DeviceAtom *d_atom = &(atom->d_atom);
+    DeviceNeighbor *d_neighbor = &(neighbor->d_neighbor);
+    const int ntypes = atom->ntypes;
+    const int nmax = atom->Nmax;
+
+    /* per-type-pair parameter tables (ntypes x ntypes entries each) */
+    d_atom->epsilon = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * ntypes * ntypes);
+    d_atom->sigma6 = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * ntypes * ntypes);
+    d_atom->cutneighsq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * ntypes * ntypes);
+    d_atom->cutforcesq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * ntypes * ntypes);
+
+    /* neighbor list storage */
+    d_neighbor->neighbors = (int *) allocateGPU(sizeof(int) * nmax * neighbor->maxneighs);
+    d_neighbor->numneigh = (int *) allocateGPU(sizeof(int) * nmax);
+
+    /* host -> device uploads */
+    memcpyToGPU(d_atom->x, atom->x, sizeof(MD_FLOAT) * nmax * 3);
+    memcpyToGPU(d_atom->vx, atom->vx, sizeof(MD_FLOAT) * nmax * 3);
+    memcpyToGPU(d_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * ntypes * ntypes);
+    memcpyToGPU(d_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * ntypes * ntypes);
+    memcpyToGPU(d_atom->cutneighsq, atom->cutneighsq, sizeof(MD_FLOAT) * ntypes * ntypes);
+    memcpyToGPU(d_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * ntypes * ntypes);
+    memcpyToGPU(d_atom->type, atom->type, sizeof(int) * nmax);
+}
+
+#endif
--- a/src/verletlist/force_dem.c
+++ b/src/verletlist/force_dem.c
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <math.h>
+//---
+#include <atom.h>
+#include <likwid-marker.h>
+#include <neighbor.h>
+#include <parameter.h>
+#include <stats.h>
+#include <timing.h>
+
+
+/*
+ * Computes contact forces between overlapping particles using the full
+ * neighbor list.
+ *
+ * Two particles interact when their squared distance is below the force
+ * cutoff and their overlap p = irad + jrad - r is positive. The force is a
+ * spring term k_s * p along the normalized distance vector plus a damping
+ * term of magnitude k_dn along the normalized relative velocity.
+ *
+ * param:    supplies k_s, k_dn and cutforce
+ * atom:     positions, velocities and radii are read; forces are written
+ * neighbor: supplies the neighbor lists and maxneighs
+ * stats:    neighbor and iteration counters are accumulated here
+ *
+ * Returns the difference of the two getTimeStamp() readings taken around the
+ * force loop.
+ *
+ * NOTE(review): forces are cleared for local atoms only, yet contributions
+ * are also added to neighbor j, and with a full neighbor list each pair is
+ * presumably visited from both sides -- confirm the intended accounting and
+ * the sign of the damping term applied to j.
+ */
+double computeForceDemFullNeigh(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
+    int Nlocal = atom->Nlocal;
+    MD_FLOAT k_s = param->k_s;
+    MD_FLOAT k_dn = param->k_dn;
+    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
+
+    for(int i = 0; i < Nlocal; i++) {
+        atom_fx(i) = 0.0;
+        atom_fy(i) = 0.0;
+        atom_fz(i) = 0.0;
+    }
+
+    double S = getTimeStamp();
+    LIKWID_MARKER_START("force");
+
+    for(int i = 0; i < Nlocal; i++) {
+        const int *neighs = &neighbor->neighbors[i * neighbor->maxneighs];
+        int numneighs = neighbor->numneigh[i];
+        MD_FLOAT irad = atom->radius[i];
+        MD_FLOAT xtmp = atom_x(i);
+        MD_FLOAT ytmp = atom_y(i);
+        MD_FLOAT ztmp = atom_z(i);
+
+        for(int k = 0; k < numneighs; k++) {
+            int j = neighs[k];
+            MD_FLOAT jrad = atom->radius[j];
+            MD_FLOAT xj = atom_x(j);
+            MD_FLOAT yj = atom_y(j);
+            MD_FLOAT zj = atom_z(j);
+            MD_FLOAT delx = xtmp - xj;
+            MD_FLOAT dely = ytmp - yj;
+            MD_FLOAT delz = ztmp - zj;
+            MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+
+            if(rsq < cutforcesq) {
+                MD_FLOAT r = sqrt(rsq);
+                MD_FLOAT p = irad + jrad - r;
+
+                if(p > 0) {
+                    MD_FLOAT delvx = atom_vx(i) - atom_vx(j);
+                    MD_FLOAT delvy = atom_vy(i) - atom_vy(j);
+                    MD_FLOAT delvz = atom_vz(i) - atom_vz(j);
+                    MD_FLOAT vr = sqrt(delvx * delvx + delvy * delvy + delvz * delvz);
+
+                    // normal distance
+                    MD_FLOAT nx = delx / r;
+                    MD_FLOAT ny = dely / r;
+                    MD_FLOAT nz = delz / r;
+
+                    // normal contact velocity
+                    // The relative speed can be exactly zero (e.g. both particles
+                    // at rest). The direction is then undefined, so apply no
+                    // damping instead of dividing by zero and producing NaN forces.
+                    MD_FLOAT nvx = 0.0;
+                    MD_FLOAT nvy = 0.0;
+                    MD_FLOAT nvz = 0.0;
+                    if(vr > 0) {
+                        nvx = delvx / vr;
+                        nvy = delvy / vr;
+                        nvz = delvz / vr;
+                    }
+
+                    // forces
+                    atom_fx(i) += k_s * p * nx - k_dn * nvx;
+                    atom_fy(i) += k_s * p * ny - k_dn * nvy;
+                    atom_fz(i) += k_s * p * nz - k_dn * nvz;
+                    atom_fx(j) += -k_s * p * nx - k_dn * nvx;
+                    atom_fy(j) += -k_s * p * ny - k_dn * nvy;
+                    atom_fz(j) += -k_s * p * nz - k_dn * nvz;
+
+                    // TODO: tangential force and torque are not implemented yet;
+                    // the sketch below is kept for reference.
+                    // contact position
+                    //MD_FLOAT cterm = jrad / (irad + jrad);
+                    //MD_FLOAT cx = xj + cterm * delx;
+                    //MD_FLOAT cy = yj + cterm * dely;
+                    //MD_FLOAT cz = zj + cterm * delz;
+
+                    // delta contact and particle position
+                    //MD_FLOAT delcx = cx - xtmp;
+                    //MD_FLOAT delcy = cy - ytmp;
+                    //MD_FLOAT delcz = cz - ztmp;
+
+                    // contact velocity
+                    //MD_FLOAT cvx = (atom_vx(i) + atom_avx(i) * delcx) - (atom_vx(j) + atom_avx(j) * (cx - xj));
+                    //MD_FLOAT cvy = (atom_vy(i) + atom_avy(i) * delcy) - (atom_vy(j) + atom_avy(j) * (cy - yj));
+                    //MD_FLOAT cvz = (atom_vz(i) + atom_avz(i) * delcz) - (atom_vz(j) + atom_avz(j) * (cz - zj));
+
+                    // tangential force
+                    //fix += MIN(kdt * vtsq, kf * fnx) * tx;
+                    //fiy += MIN(kdt * vtsq, kf * fny) * ty;
+                    //fiz += MIN(kdt * vtsq, kf * fnz) * tz;
+                    // torque
+                    //MD_FLOAT taux = delcx * ftx;
+                    //MD_FLOAT tauy = delcy * fty;
+                    //MD_FLOAT tauz = delcz * ftz;
+                }
+#ifdef USE_REFERENCE_VERSION
+                addStat(stats->atoms_within_cutoff, 1);
+            } else {
+                addStat(stats->atoms_outside_cutoff, 1);
+#endif
+            }
+        }
+
+        addStat(stats->total_force_neighs, numneighs);
+        addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
+    }
+
+    LIKWID_MARKER_STOP("force");
+    double E = getTimeStamp();
+    return E-S;
+}
--- a/src/verletlist/force_eam.c
+++ b/src/verletlist/force_eam.c
@@ -0,0 +1,209 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <likwid-marker.h>
+#include <math.h>
+
+#include <allocate.h>
+#include <timing.h>
+#include <neighbor.h>
+#include <parameter.h>
+#include <atom.h>
+#include <stats.h>
+#include <eam.h>
+#include <util.h>
+
+/*
+ * Computes embedded-atom-method (EAM) forces for all local atoms in two
+ * passes over the full neighbor list:
+ *   1. accumulate the density rhoi of every local atom from the rhor spline
+ *      and evaluate the frho spline at that density into fp[i],
+ *   2. copy fp to the ghost atoms through border_map, then evaluate the pair
+ *      force from the rhor and z2r splines and store it in the force arrays.
+ *
+ * eam:      spline tables and the fp scratch buffer (regrown when atom->Nmax
+ *           exceeds eam->nmax)
+ * param:    supplies cutforce when EXPLICIT_TYPES is not defined
+ * atom:     positions and types are read; forces are overwritten
+ * neighbor: supplies the neighbor lists and maxneighs
+ * stats:    neighbor and iteration counters are accumulated here
+ *
+ * Returns the difference of the two getTimeStamp() readings taken at the
+ * start and the end of the computation.
+ */
+double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbor, Stats *stats) {
+    if(eam->nmax < atom->Nmax) {
+        eam->nmax = atom->Nmax;
+        if(eam->fp != NULL) { free(eam->fp); }
+        eam->fp = (MD_FLOAT *) allocate(ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT));
+    }
+
+    int Nlocal = atom->Nlocal;
+    int ntypes = atom->ntypes; MD_FLOAT* fp = eam->fp;
+    MD_FLOAT* rhor_spline = eam->rhor_spline; MD_FLOAT* frho_spline = eam->frho_spline; MD_FLOAT* z2r_spline = eam->z2r_spline;
+    MD_FLOAT rdr = eam->rdr; int nr = eam->nr; int nr_tot = eam->nr_tot; MD_FLOAT rdrho = eam->rdrho;
+    int nrho = eam->nrho; int nrho_tot = eam->nrho_tot;
+    double S = getTimeStamp();
+
+
+    #pragma omp parallel
+    {
+    LIKWID_MARKER_START("force_eam_fp");
+
+    #pragma omp for
+    for(int i = 0; i < Nlocal; i++) {
+        // Declared inside the loop so every thread has its own pointer; a
+        // variable declared outside the parallel region would be shared and
+        // raced on by all threads.
+        const int *neighs = &neighbor->neighbors[i * neighbor->maxneighs];
+        int numneighs = neighbor->numneigh[i];
+        MD_FLOAT xtmp = atom_x(i);
+        MD_FLOAT ytmp = atom_y(i);
+        MD_FLOAT ztmp = atom_z(i);
+        MD_FLOAT rhoi = 0;
+#ifdef EXPLICIT_TYPES
+        const int type_i = atom->type[i];
+#endif
+        #pragma ivdep
+        for(int k = 0; k < numneighs; k++) {
+            int j = neighs[k];
+            MD_FLOAT delx = xtmp - atom_x(j);
+            MD_FLOAT dely = ytmp - atom_y(j);
+            MD_FLOAT delz = ztmp - atom_z(j);
+            MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+#ifdef EXPLICIT_TYPES
+            const int type_j = atom->type[j];
+            const int type_ij = type_i * ntypes + type_j;
+            const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
+#else
+            const MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
+#endif
+            if(rsq < cutforcesq) {
+                // spline segment m (clamped to nr - 1) and offset p within it (clamped to 1.0)
+                MD_FLOAT p = sqrt(rsq) * rdr + 1.0;
+                int m = (int)(p);
+                m = m < nr - 1 ? m : nr - 1;
+                p -= m;
+                p = p < 1.0 ? p : 1.0;
+#ifdef EXPLICIT_TYPES
+                rhoi += ((rhor_spline[type_ij * nr_tot + m * 7 + 3] * p +
+                          rhor_spline[type_ij * nr_tot + m * 7 + 4]) * p +
+                          rhor_spline[type_ij * nr_tot + m * 7 + 5]) * p +
+                          rhor_spline[type_ij * nr_tot + m * 7 + 6];
+#else
+                rhoi += ((rhor_spline[m * 7 + 3] * p +
+                          rhor_spline[m * 7 + 4]) * p +
+                          rhor_spline[m * 7 + 5]) * p +
+                          rhor_spline[m * 7 + 6];
+#endif
+            }
+        }
+
+#ifdef EXPLICIT_TYPES
+        // NOTE(review): the pair tables above are indexed with
+        // type_i * ntypes + type_j; confirm that type_i * type_i is the
+        // intended index into frho_spline.
+        const int type_ii = type_i * type_i;
+#endif
+        MD_FLOAT p = 1.0 * rhoi * rdrho + 1.0;
+        int m = (int)(p);
+        m = MAX(1, MIN(m, nrho - 1));
+        p -= m;
+        p = MIN(p, 1.0);
+#ifdef EXPLICIT_TYPES
+        fp[i] = (frho_spline[type_ii * nrho_tot + m * 7 + 0] * p +
+                 frho_spline[type_ii * nrho_tot + m * 7 + 1]) * p +
+                 frho_spline[type_ii * nrho_tot + m * 7 + 2];
+#else
+        fp[i] = (frho_spline[m * 7 + 0] * p + frho_spline[m * 7 + 1]) * p + frho_spline[m * 7 + 2];
+#endif
+    }
+
+    LIKWID_MARKER_STOP("force_eam_fp");
+    }
+
+    // We still need to update fp for PBC atoms
+    for(int i = 0; i < atom->Nghost; i++) {
+        fp[Nlocal + i] = fp[atom->border_map[i]];
+    }
+
+
+    #pragma omp parallel
+    {
+    LIKWID_MARKER_START("force_eam");
+
+    #pragma omp for
+    for(int i = 0; i < Nlocal; i++) {
+        // thread-private for the same reason as in the first pass
+        const int *neighs = &neighbor->neighbors[i * neighbor->maxneighs];
+        int numneighs = neighbor->numneigh[i];
+        MD_FLOAT xtmp = atom_x(i);
+        MD_FLOAT ytmp = atom_y(i);
+        MD_FLOAT ztmp = atom_z(i);
+        MD_FLOAT fix = 0;
+        MD_FLOAT fiy = 0;
+        MD_FLOAT fiz = 0;
+#ifdef EXPLICIT_TYPES
+        const int type_i = atom->type[i];
+#endif
+
+        #pragma ivdep
+        for(int k = 0; k < numneighs; k++) {
+            int j = neighs[k];
+            MD_FLOAT delx = xtmp - atom_x(j);
+            MD_FLOAT dely = ytmp - atom_y(j);
+            MD_FLOAT delz = ztmp - atom_z(j);
+            MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+#ifdef EXPLICIT_TYPES
+            const int type_j = atom->type[j];
+            const int type_ij = type_i * ntypes + type_j;
+            const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
+#else
+            const MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
+#endif
+
+            if(rsq < cutforcesq) {
+                MD_FLOAT r = sqrt(rsq);
+                MD_FLOAT p = r * rdr + 1.0;
+                int m = (int)(p);
+                m = m < nr - 1 ? m : nr - 1;
+                p -= m;
+                p = p < 1.0 ? p : 1.0;
+
+
+                // rhoip = derivative of (density at atom j due to atom i)
+                // rhojp = derivative of (density at atom i due to atom j)
+                // phi = pair potential energy
+                // phip = phi'
+                // z2 = phi * r
+                // z2p = (phi * r)' = (phi' r) + phi
+                // psip needs both fp[i] and fp[j] terms since r_ij appears in two
+                //   terms of embed eng: Fi(sum rho_ij) and Fj(sum rho_ji)
+                //   hence embed' = Fi(sum rho_ij) rhojp + Fj(sum rho_ji) rhoip
+
+#ifdef EXPLICIT_TYPES
+                MD_FLOAT rhoip = (rhor_spline[type_ij * nr_tot + m * 7 + 0] * p +
+                                  rhor_spline[type_ij * nr_tot + m * 7 + 1]) * p +
+                                  rhor_spline[type_ij * nr_tot + m * 7 + 2];
+
+                MD_FLOAT z2p = (z2r_spline[type_ij * nr_tot + m * 7 + 0] * p +
+                                z2r_spline[type_ij * nr_tot + m * 7 + 1]) * p +
+                                z2r_spline[type_ij * nr_tot + m * 7 + 2];
+
+                MD_FLOAT z2 = ((z2r_spline[type_ij * nr_tot + m * 7 + 3] * p +
+                                z2r_spline[type_ij * nr_tot + m * 7 + 4]) * p +
+                                z2r_spline[type_ij * nr_tot + m * 7 + 5]) * p +
+                                z2r_spline[type_ij * nr_tot + m * 7 + 6];
+#else
+                MD_FLOAT rhoip = (rhor_spline[m * 7 + 0] * p + rhor_spline[m * 7 + 1]) * p + rhor_spline[m * 7 + 2];
+                MD_FLOAT z2p = (z2r_spline[m * 7 + 0] * p + z2r_spline[m * 7 + 1]) * p + z2r_spline[m * 7 + 2];
+                MD_FLOAT z2 = ((z2r_spline[m * 7 + 3] * p +
+                                z2r_spline[m * 7 + 4]) * p +
+                                z2r_spline[m * 7 + 5]) * p +
+                                z2r_spline[m * 7 + 6];
+#endif
+
+                MD_FLOAT recip = 1.0 / r;
+                MD_FLOAT phi = z2 * recip;
+                MD_FLOAT phip = z2p * recip - phi * recip;
+                MD_FLOAT psip = fp[i] * rhoip + fp[j] * rhoip + phip;
+                MD_FLOAT fpair = -psip * recip;
+
+                fix += delx * fpair;
+                fiy += dely * fpair;
+                fiz += delz * fpair;
+                //fpair *= 0.5;
+            }
+        }
+
+        atom_fx(i) = fix;
+        atom_fy(i) = fiy;
+        atom_fz(i) = fiz;
+        addStat(stats->total_force_neighs, numneighs);
+        addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
+    }
+
+    LIKWID_MARKER_STOP("force_eam");
+    }
+
+    double E = getTimeStamp();
+    return E-S;
+}
--- a/src/verletlist/force_lj-x86.c
+++ b/src/verletlist/force_lj-x86.c
@@ -0,0 +1,112 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+//---
+#include <atom.h>
+#include <likwid-marker.h>
+#include <neighbor.h>
+#include <parameter.h>
+#include <stats.h>
+#include <timing.h>
+
+#ifdef __SIMD_KERNEL__
+#include <simd.h>
+#endif
+
+/*
+ * Computes Lennard-Jones forces for all local atoms with explicit SIMD
+ * operations over the full neighbor list.
+ *
+ * Neighbors are processed VECTOR_WIDTH at a time. Lanes beyond the end of the
+ * neighbor list and lanes outside the force cutoff are masked out before the
+ * force contributions are accumulated.
+ *
+ * If __SIMD_KERNEL__ is not defined, an error is printed and the program exits.
+ *
+ * param:    supplies cutforce, sigma6 and epsilon
+ * atom:     positions are read; forces are cleared and then accumulated
+ * neighbor: supplies the neighbor lists and maxneighs
+ * stats:    not used by this kernel
+ *
+ * Returns the difference of the two getTimeStamp() readings taken around the
+ * force loop.
+ */
+double computeForceLJFullNeigh_simd(
+    Parameter* param, Atom* atom, Neighbor* neighbor, Stats* stats)
+{
+    int Nlocal = atom->Nlocal;
+    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
+    MD_FLOAT sigma6     = param->sigma6;
+    MD_FLOAT epsilon    = param->epsilon;
+
+    for (int i = 0; i < Nlocal; i++) {
+        atom_fx(i) = 0.0;
+        atom_fy(i) = 0.0;
+        atom_fz(i) = 0.0;
+    }
+
+    double S = getTimeStamp();
+
+#ifndef __SIMD_KERNEL__
+    fprintf(stderr, "Error: SIMD kernel not implemented for specified instruction set!");
+    exit(-1);
+#else
+    MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
+    MD_SIMD_FLOAT sigma6_vec     = simd_broadcast(sigma6);
+    MD_SIMD_FLOAT eps_vec        = simd_broadcast(epsilon);
+    MD_SIMD_FLOAT c48_vec        = simd_broadcast(48.0);
+    MD_SIMD_FLOAT c05_vec        = simd_broadcast(0.5);
+
+#pragma omp parallel
+    {
+        LIKWID_MARKER_START("force");
+
+#pragma omp for schedule(runtime)
+        for (int i = 0; i < Nlocal; i++) {
+            // Declared inside the loop so every thread has its own pointer; a
+            // variable declared outside the parallel region would be shared
+            // and raced on by all threads.
+            int* neighs               = &neighbor->neighbors[i * neighbor->maxneighs];
+            int numneighs             = neighbor->numneigh[i];
+            MD_SIMD_INT numneighs_vec = simd_int_broadcast(numneighs);
+            MD_SIMD_FLOAT xtmp        = simd_broadcast(atom_x(i));
+            MD_SIMD_FLOAT ytmp        = simd_broadcast(atom_y(i));
+            MD_SIMD_FLOAT ztmp        = simd_broadcast(atom_z(i));
+            MD_SIMD_FLOAT fix         = simd_zero();
+            MD_SIMD_FLOAT fiy         = simd_zero();
+            MD_SIMD_FLOAT fiz         = simd_zero();
+
+            for (int k = 0; k < numneighs; k += VECTOR_WIDTH) {
+                // If the last iteration of this loop is separated from the rest, this
+                // mask can be set only there
+                MD_SIMD_MASK mask_numneighs = simd_mask_int_cond_lt(
+                    simd_int_add(simd_int_broadcast(k), simd_int_seq()),
+                    numneighs_vec);
+                MD_SIMD_INT j = simd_int_mask_load(&neighs[k], mask_numneighs);
+#ifdef AOS
+                MD_SIMD_INT j3     = simd_int_add(simd_int_add(j, j), j); // j * 3
+                MD_SIMD_FLOAT delx = xtmp -
+                                     simd_gather(j3, &(atom->x[0]), sizeof(MD_FLOAT));
+                MD_SIMD_FLOAT dely = ytmp -
+                                     simd_gather(j3, &(atom->x[1]), sizeof(MD_FLOAT));
+                MD_SIMD_FLOAT delz = ztmp -
+                                     simd_gather(j3, &(atom->x[2]), sizeof(MD_FLOAT));
+#else
+                MD_SIMD_FLOAT delx = xtmp - simd_gather(j, atom->x, sizeof(MD_FLOAT));
+                MD_SIMD_FLOAT dely = ytmp - simd_gather(j, atom->y, sizeof(MD_FLOAT));
+                MD_SIMD_FLOAT delz = ztmp - simd_gather(j, atom->z, sizeof(MD_FLOAT));
+#endif
+                MD_SIMD_FLOAT rsq        = simd_fma(delx,
+                    delx,
+                    simd_fma(dely, dely, simd_mul(delz, delz)));
+                MD_SIMD_MASK cutoff_mask = simd_mask_and(mask_numneighs,
+                    simd_mask_cond_lt(rsq, cutforcesq_vec));
+                MD_SIMD_FLOAT sr2        = simd_reciprocal(rsq);
+                MD_SIMD_FLOAT sr6        = simd_mul(sr2,
+                    simd_mul(sr2, simd_mul(sr2, sigma6_vec)));
+                MD_SIMD_FLOAT force      = simd_mul(c48_vec,
+                    simd_mul(sr6,
+                        simd_mul(simd_sub(sr6, c05_vec), simd_mul(sr2, eps_vec))));
+
+                fix = simd_masked_add(fix, simd_mul(delx, force), cutoff_mask);
+                fiy = simd_masked_add(fiy, simd_mul(dely, force), cutoff_mask);
+                fiz = simd_masked_add(fiz, simd_mul(delz, force), cutoff_mask);
+            }
+
+            atom_fx(i) += simd_h_reduce_sum(fix);
+            atom_fy(i) += simd_h_reduce_sum(fiy);
+            atom_fz(i) += simd_h_reduce_sum(fiz);
+        }
+
+        LIKWID_MARKER_STOP("force");
+    }
+#endif
+
+    double E = getTimeStamp();
+    return E - S;
+}
--- a/src/verletlist/force_lj.c
+++ b/src/verletlist/force_lj.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <atom.h>
+#include <likwid-marker.h>
+#include <neighbor.h>
+#include <parameter.h>
+#include <stats.h>
+#include <timing.h>
+
+/*
+ * Computes Lennard-Jones forces for all local atoms from full neighbor lists:
+ * every interaction found in the list of atom i is added to atom i only.
+ *
+ * param    - supplies cutforce, sigma6 and epsilon when EXPLICIT_TYPES is not
+ *            defined (otherwise the per-type tables stored in atom are used)
+ * atom     - positions are read; the forces of the first Nlocal atoms are
+ *            reset and then overwritten with the result
+ * neighbor - neighbor lists (row stride maxneighs) and per-atom counts
+ * stats    - counters updated through addStat
+ *
+ * Returns the difference of two getTimeStamp() calls taken around the
+ * parallel force loop.
+ */
+double computeForceLJFullNeigh(
+    Parameter* param, Atom* atom, Neighbor* neighbor, Stats* stats)
+{
+    int nLocal = atom->Nlocal;
+#ifndef EXPLICIT_TYPES
+    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
+    MD_FLOAT sigma6     = param->sigma6;
+    MD_FLOAT epsilon    = param->epsilon;
+#endif
+    const MD_FLOAT num1  = 1.0;
+    const MD_FLOAT num48 = 48.0;
+    const MD_FLOAT num05 = 0.5;
+
+    for (int i = 0; i < nLocal; i++) {
+        atom_fx(i) = 0.0;
+        atom_fy(i) = 0.0;
+        atom_fz(i) = 0.0;
+    }
+    double timeStart = getTimeStamp();
+
+#pragma omp parallel
+    {
+        LIKWID_MARKER_START("force");
+
+#pragma omp for schedule(runtime)
+        for (int i = 0; i < nLocal; i++) {
+            // Declared inside the loop so that every thread owns its pointer;
+            // a function-scope variable would be shared in the parallel region.
+            const int* neighs = &neighbor->neighbors[i * neighbor->maxneighs];
+            int numneighs     = neighbor->numneigh[i];
+            MD_FLOAT xtmp     = atom_x(i);
+            MD_FLOAT ytmp     = atom_y(i);
+            MD_FLOAT ztmp     = atom_z(i);
+            MD_FLOAT fix      = 0;
+            MD_FLOAT fiy      = 0;
+            MD_FLOAT fiz      = 0;
+
+#ifdef EXPLICIT_TYPES
+            const int type_i = atom->type[i];
+#endif
+
+            for (int k = 0; k < numneighs; k++) {
+                int j         = neighs[k];
+                MD_FLOAT delx = xtmp - atom_x(j);
+                MD_FLOAT dely = ytmp - atom_y(j);
+                MD_FLOAT delz = ztmp - atom_z(j);
+                MD_FLOAT rsq  = delx * delx + dely * dely + delz * delz;
+
+#ifdef EXPLICIT_TYPES
+                // Per-pair parameters looked up in the ntypes x ntypes tables
+                const int type_j          = atom->type[j];
+                const int type_ij         = type_i * atom->ntypes + type_j;
+                const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
+                const MD_FLOAT sigma6     = atom->sigma6[type_ij];
+                const MD_FLOAT epsilon    = atom->epsilon[type_ij];
+#endif
+
+                if (rsq < cutforcesq) {
+                    MD_FLOAT sr2   = num1 / rsq;
+                    MD_FLOAT sr6   = sr2 * sr2 * sr2 * sigma6;
+                    MD_FLOAT force = num48 * sr6 * (sr6 - num05) * sr2 * epsilon;
+                    fix += delx * force;
+                    fiy += dely * force;
+                    fiz += delz * force;
+#ifdef USE_REFERENCE_VERSION
+                    addStat(stats->atoms_within_cutoff, 1);
+                } else {
+                    addStat(stats->atoms_outside_cutoff, 1);
+#endif
+                }
+            }
+
+            atom_fx(i) += fix;
+            atom_fy(i) += fiy;
+            atom_fz(i) += fiz;
+
+#ifdef USE_REFERENCE_VERSION
+            // Count the padding lanes of the last, partially filled vector
+            // iteration as atoms outside the cutoff.
+            if (numneighs % VECTOR_WIDTH > 0) {
+                addStat(stats->atoms_outside_cutoff,
+                    VECTOR_WIDTH - (numneighs % VECTOR_WIDTH));
+            }
+#endif
+
+            addStat(stats->total_force_neighs, numneighs);
+            addStat(stats->total_force_iters,
+                (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
+        }
+
+        LIKWID_MARKER_STOP("force");
+    }
+
+    double timeStop = getTimeStamp();
+    return timeStop - timeStart;
+}
+
+/*
+ * Computes Lennard-Jones forces from half neighbor lists: every interaction
+ * found in the list of atom i is added to atom i and subtracted from atom j,
+ * as long as j is a local atom (j < Nlocal).
+ *
+ * param    - supplies cutforce, sigma6 and epsilon when EXPLICIT_TYPES is not
+ *            defined (otherwise the per-type tables stored in atom are used)
+ * atom     - positions are read; the forces of the first Nlocal atoms are
+ *            reset and then accumulated
+ * neighbor - neighbor lists (row stride maxneighs) and per-atom counts
+ * stats    - counters updated through addStat
+ *
+ * Returns the difference of two getTimeStamp() calls taken around the
+ * parallel force loop.
+ */
+double computeForceLJHalfNeigh(
+    Parameter* param, Atom* atom, Neighbor* neighbor, Stats* stats)
+{
+    int nlocal = atom->Nlocal;
+#ifndef EXPLICIT_TYPES
+    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
+    MD_FLOAT sigma6     = param->sigma6;
+    MD_FLOAT epsilon    = param->epsilon;
+#endif
+    const MD_FLOAT num1  = 1.0;
+    const MD_FLOAT num48 = 48.0;
+    const MD_FLOAT num05 = 0.5;
+
+    for (int i = 0; i < nlocal; i++) {
+        atom_fx(i) = 0.0;
+        atom_fy(i) = 0.0;
+        atom_fz(i) = 0.0;
+    }
+
+    double timeStart = getTimeStamp();
+
+#pragma omp parallel
+    {
+        LIKWID_MARKER_START("forceLJ-halfneigh");
+
+#pragma omp for schedule(runtime)
+        for (int i = 0; i < nlocal; i++) {
+            // Declared inside the loop so that every thread owns its pointer;
+            // a function-scope variable would be shared in the parallel region.
+            const int* neighs = &neighbor->neighbors[i * neighbor->maxneighs];
+            int numneighs     = neighbor->numneigh[i];
+            MD_FLOAT xtmp     = atom_x(i);
+            MD_FLOAT ytmp     = atom_y(i);
+            MD_FLOAT ztmp     = atom_z(i);
+            MD_FLOAT fix      = 0;
+            MD_FLOAT fiy      = 0;
+            MD_FLOAT fiz      = 0;
+
+#ifdef EXPLICIT_TYPES
+            const int type_i = atom->type[i];
+#endif
+
+// Pragma required to vectorize the inner loop
+#ifdef ENABLE_OMP_SIMD
+#pragma omp simd reduction(+ : fix, fiy, fiz)
+#endif
+            for (int k = 0; k < numneighs; k++) {
+                int j         = neighs[k];
+                MD_FLOAT delx = xtmp - atom_x(j);
+                MD_FLOAT dely = ytmp - atom_y(j);
+                MD_FLOAT delz = ztmp - atom_z(j);
+                MD_FLOAT rsq  = delx * delx + dely * dely + delz * delz;
+
+#ifdef EXPLICIT_TYPES
+                // Per-pair parameters looked up in the ntypes x ntypes tables
+                const int type_j          = atom->type[j];
+                const int type_ij         = type_i * atom->ntypes + type_j;
+                const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
+                const MD_FLOAT sigma6     = atom->sigma6[type_ij];
+                const MD_FLOAT epsilon    = atom->epsilon[type_ij];
+#endif
+
+                if (rsq < cutforcesq) {
+                    MD_FLOAT sr2   = num1 / rsq;
+                    MD_FLOAT sr6   = sr2 * sr2 * sr2 * sigma6;
+                    MD_FLOAT force = num48 * sr6 * (sr6 - num05) * sr2 * epsilon;
+                    fix += delx * force;
+                    fiy += dely * force;
+                    fiz += delz * force;
+
+                    // We do not need to update forces for ghost atoms
+                    // NOTE(review): these writes to atom j are not synchronized
+                    // with other threads that accumulate into the same atom
+                    // (including the atom_f(i) += ... below of the thread that
+                    // owns j). Confirm whether this kernel is only meant to run
+                    // with one thread or whether atomics are required.
+                    if (j < nlocal) {
+                        atom_fx(j) -= delx * force;
+                        atom_fy(j) -= dely * force;
+                        atom_fz(j) -= delz * force;
+                    }
+                }
+            }
+
+            atom_fx(i) += fix;
+            atom_fy(i) += fiy;
+            atom_fz(i) += fiz;
+
+            addStat(stats->total_force_neighs, numneighs);
+            addStat(stats->total_force_iters,
+                (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
+        }
+
+        LIKWID_MARKER_STOP("forceLJ-halfneigh");
+    }
+
+    double timeStop = getTimeStamp();
+    return timeStop - timeStart;
+}
--- a/src/verletlist/integrate.h
+++ b/src/verletlist/integrate.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdbool.h>
+//---
+#include <parameter.h>
+#include <atom.h>
+
+/*
+ * First part of the integration step on the CPU. For every local atom the
+ * velocity is advanced by dtforce * force, then the position is advanced by
+ * dt * velocity using the velocity that was just updated.
+ * The reneigh flag is not used by this implementation.
+ */
+void initialIntegrate_cpu(bool reneigh, Parameter *param, Atom *atom) {
+    const int nlocal = atom->Nlocal;
+
+    for(int idx = 0; idx < nlocal; ++idx) {
+        // x component: velocity first, position afterwards
+        atom_vx(idx) += param->dtforce * atom_fx(idx);
+        atom_x(idx) += param->dt * atom_vx(idx);
+
+        // y component
+        atom_vy(idx) += param->dtforce * atom_fy(idx);
+        atom_y(idx) += param->dt * atom_vy(idx);
+
+        // z component
+        atom_vz(idx) += param->dtforce * atom_fz(idx);
+        atom_z(idx) += param->dt * atom_vz(idx);
+    }
+}
+
+/*
+ * Second part of the integration step on the CPU: advances the velocity of
+ * every local atom by dtforce * force. Positions are left untouched.
+ * The reneigh flag is not used by this implementation.
+ */
+void finalIntegrate_cpu(bool reneigh, Parameter *param, Atom *atom) {
+    const int nlocal = atom->Nlocal;
+
+    for(int idx = 0; idx < nlocal; ++idx) {
+        atom_vx(idx) += param->dtforce * atom_fx(idx);
+        atom_vy(idx) += param->dtforce * atom_fy(idx);
+        atom_vz(idx) += param->dtforce * atom_fz(idx);
+    }
+}
+
+// CUDA counterparts of the integration routines above. They are only declared
+// when CUDA_TARGET is defined; their definitions are not part of this header.
+#ifdef CUDA_TARGET
+void initialIntegrate_cuda(bool, Parameter*, Atom*);
+void finalIntegrate_cuda(bool, Parameter*, Atom*);
+#endif
--- a/src/verletlist/main-stub.c
+++ b/src/verletlist/main-stub.c
@@ -0,0 +1,299 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdio.h>
+#include <string.h>
+//---
+#include <likwid-marker.h>
+//---
+#include <timing.h>
+#include <allocate.h>
+#include <neighbor.h>
+#include <parameter.h>
+#include <atom.h>
+#include <stats.h>
+#include <thermo.h>
+#include <eam.h>
+#include <pbc.h>
+#include <timers.h>
+#include <util.h>
+
+#define HLINE "----------------------------------------------------------------------------\n"
+
+extern double computeForceLJFullNeigh_plain_c(Parameter*, Atom*, Neighbor*, Stats*);
+extern double computeForceLJFullNeigh_simd(Parameter*, Atom*, Neighbor*, Stats*);
+extern double computeForceLJHalfNeigh(Parameter*, Atom*, Neighbor*, Stats*);
+extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
+
+// Patterns
+#define P_SEQ   0
+#define P_FIX   1
+#define P_RAND  2
+
+/*
+ * Fills param with the default settings of the stub benchmark.
+ * Command line options parsed in main may override some of them afterwards.
+ */
+void init(Parameter *param) {
+    // Input files (none by default)
+    param->input_file = NULL;
+    param->eam_file = NULL;
+
+    // Force field and its parameters
+    param->force_field = FF_LJ;
+    param->epsilon = 1.0;
+    param->sigma6 = 1.0;
+    param->mass = 1.0;
+    param->rho = 0.8442;
+    param->ntypes = 4;
+
+    // Cutoffs: the neighbor cutoff is set to the (very large) force cutoff
+    param->cutforce = 1000000.0;
+    param->cutneigh = param->cutforce;
+    param->half_neigh = 0;
+
+    // Problem size and number of kernel invocations
+    param->ntimes = 200;
+    param->nx = 1;
+    param->ny = 1;
+    param->nz = 1;
+    param->lattice = 1.0;
+
+    // Settings below were marked as unused by the original author
+    param->dt = 0.005;
+    param->dtforce = 0.5 * param->dt;
+    param->nstat = 100;
+    param->temp = 1.44;
+    param->reneigh_every = 20;
+    param->proc_freq = 2.4;
+}
+
+/*
+ * Builds synthetic neighbor lists for the stub benchmark.
+ *
+ * atom     - atom->Nlocal lists are created, storage is sized with atom->Nmax
+ * neighbor - receives the freshly allocated numneigh/neighbors arrays and the
+ *            row stride (maxneighs) they were laid out with
+ * pattern  - P_SEQ:  consecutive atom indices starting after i, wrapping at Nlocal
+ *            P_FIX:  indices 0, 1, ..., wrapping at nneighs
+ *            P_RAND: nneighs distinct random indices different from i
+ * nneighs  - number of generated neighbors per atom
+ * nreps    - how many times the generated list is stored back to back
+ *
+ * Terminates the program if the random pattern cannot be satisfied or if the
+ * allocation fails.
+ */
+void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps) {
+    const int maxneighs = nneighs * nreps;
+
+    if(pattern == P_RAND && atom->Nlocal <= nneighs) {
+        fprintf(stderr, "Error: When using random pattern, number of atoms should be higher than number of neighbors per atom!\n");
+        exit(-1);
+    }
+
+    // The force kernels address the lists with neighbor->maxneighs as row
+    // stride, so it must match the stride used for allocating and filling.
+    neighbor->maxneighs = maxneighs;
+    neighbor->numneigh = (int*) malloc((size_t) atom->Nmax * sizeof(int));
+    neighbor->neighbors = (int*) malloc((size_t) atom->Nmax * maxneighs * sizeof(int));
+
+    // malloc may legitimately return NULL for empty requests, so only treat
+    // it as an error when the lists are actually going to be written.
+    if(atom->Nlocal > 0 &&
+       (neighbor->numneigh == NULL || (maxneighs > 0 && neighbor->neighbors == NULL))) {
+        fprintf(stderr, "Error: Could not allocate memory for neighbor lists!\n");
+        exit(-1);
+    }
+
+    for(int i = 0; i < atom->Nlocal; i++) {
+        int *neighptr = &(neighbor->neighbors[i * maxneighs]);
+        // Sequential lists start with the successor of i; wrap right away so
+        // that the last atom does not reference the non-existing index Nlocal.
+        int j = (pattern == P_SEQ) ? ((i + 1) % atom->Nlocal) : 0;
+        int m = (pattern == P_SEQ) ? atom->Nlocal : nneighs;
+
+        for(int k = 0; k < nneighs; k++) {
+            if(pattern == P_RAND) {
+                // Draw until the index is neither i itself nor already listed
+                int found = 0;
+                do {
+                    j = rand() % atom->Nlocal;
+                    neighptr[k] = j;
+                    found = (int)(i == j);
+                    for(int l = 0; l < k; l++) {
+                        if(neighptr[l] == j) {
+                            found = 1;
+                        }
+                    }
+                } while(found == 1);
+            } else {
+                neighptr[k] = j;
+                j = (j + 1) % m;
+            }
+        }
+
+        // Append nreps - 1 copies of the generated list
+        for(int r = 1; r < nreps; r++) {
+            for(int k = 0; k < nneighs; k++) {
+                neighptr[r * nneighs + k] = neighptr[k];
+            }
+        }
+
+        neighbor->numneigh[i] = nneighs * nreps;
+    }
+}
+
+/*
+ * Entry point of the stub benchmark. It parses the command line, creates
+ * natoms atoms with synthetic neighbor lists, runs the selected force kernel
+ * param.ntimes times and prints the accumulated kernel time together with
+ * derived rates, either as text or as CSV.
+ */
+int main(int argc, const char *argv[]) {
+    Eam eam;
+    Atom atom_data;
+    Atom *atom = (Atom *)(&atom_data);
+    Neighbor neighbor;
+    Stats stats;
+    Parameter param;
+    char *pattern_str = NULL;
+    int pattern = P_SEQ;
+    int natoms = 256;
+    int nneighs = 76;
+    int nreps = 1;
+    int csv = 0;
+
+    LIKWID_MARKER_INIT;
+    LIKWID_MARKER_REGISTER("force");
+    DEBUG_MESSAGE("Initializing parameters...\n");
+    init(&param);
+
+    for(int i = 0; i < argc; i++) {
+        if((strcmp(argv[i], "-f") == 0)) {
+            if((param.force_field = str2ff(argv[++i])) < 0) {
+                fprintf(stderr, "Invalid force field!\n");
+                exit(-1);
+            }
+            continue;
+        }
+        if((strcmp(argv[i], "-p") == 0)) {
+            pattern_str = strdup(argv[++i]);
+            if(strncmp(pattern_str, "seq", 3) == 0) { pattern = P_SEQ; }
+            else if(strncmp(pattern_str, "fix", 3) == 0) { pattern = P_FIX; }
+            else if(strncmp(pattern_str, "rand", 3) == 0) { pattern = P_RAND; }
+            else {
+                fprintf(stderr, "Invalid pattern!\n");
+                exit(-1);
+            }
+            continue;
+        }
+        if((strcmp(argv[i], "-e") == 0)) {
+            param.eam_file = strdup(argv[++i]);
+            continue;
+        }
+        if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0)) {
+            param.ntimes = atoi(argv[++i]);
+            continue;
+        }
+        if((strcmp(argv[i], "-na") == 0)) {
+            natoms = atoi(argv[++i]);
+            continue;
+        }
+        if((strcmp(argv[i], "-nn") == 0)) {
+            nneighs = atoi(argv[++i]);
+            continue;
+        }
+        if((strcmp(argv[i], "-nr") == 0)) {
+            nreps = atoi(argv[++i]);
+            continue;
+        }
+        if((strcmp(argv[i], "--freq") == 0)) {
+            param.proc_freq = atof(argv[++i]);
+            continue;
+        }
+        if((strcmp(argv[i], "--csv") == 0)) {
+            csv = 1;
+            continue;
+        }
+        if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
+            printf("MD Bench: A minimalistic re-implementation of miniMD\n");
+            printf(HLINE);
+            printf("-f <string>:          force field (lj or eam), default lj\n");
+            printf("-p <string>:          pattern for data accesses (seq, fix or rand)\n");
+            printf("-n / --nsteps <int>:  number of timesteps for simulation\n");
+            printf("-na <int>:            number of atoms (default 256)\n");
+            printf("-nn <int>:            number of neighbors per atom (default 76)\n");
+            printf("-nr <int>:            number of times neighbor lists should be replicated (default 1)\n");
+            printf("--freq <real>:        set CPU frequency (GHz) and display average cycles per atom and neighbors\n");
+            printf("--csv:                set output as CSV style\n");
+            printf(HLINE);
+            exit(EXIT_SUCCESS);
+        }
+    }
+
+    if(pattern_str == NULL) {
+        pattern_str = strdup("seq\0");
+    }
+
+    if(param.force_field == FF_EAM) {
+        DEBUG_MESSAGE("Initializing EAM parameters...\n");
+        initEam(&eam, &param);
+    }
+
+    DEBUG_MESSAGE("Initializing atoms...\n");
+    initAtom(atom);
+    initStats(&stats);
+
+    // Per-type-pair parameter tables, all filled with the global defaults
+    atom->ntypes = param.ntypes;
+    atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
+        atom->epsilon[i] = param.epsilon;
+        atom->sigma6[i] = param.sigma6;
+        atom->cutneighsq[i] = param.cutneigh * param.cutneigh;
+        atom->cutforcesq[i] = param.cutforce * param.cutforce;
+    }
+
+    DEBUG_MESSAGE("Creating atoms...\n");
+    for(int i = 0; i < natoms; ++i) {
+        while(atom->Nlocal > atom->Nmax - natoms) {
+            growAtom(atom);
+        }
+
+        // Atoms are placed on a diagonal line with zero initial velocity
+        atom->type[atom->Nlocal] = rand() % atom->ntypes;
+        atom_x(atom->Nlocal) = (MD_FLOAT)(i) * 0.00001;
+        atom_y(atom->Nlocal) = (MD_FLOAT)(i) * 0.00001;
+        atom_z(atom->Nlocal) = (MD_FLOAT)(i) * 0.00001;
+        atom_vx(atom->Nlocal) = 0.0;
+        atom_vy(atom->Nlocal) = 0.0;
+        atom_vz(atom->Nlocal) = 0.0;
+        atom->Nlocal++;
+    }
+
+    const double estim_atom_volume = (double)(atom->Nlocal * 3 * sizeof(MD_FLOAT));
+    const double estim_neighbors_volume = (double)(atom->Nlocal * (nneighs + 2) * sizeof(int));
+    const double estim_volume = (double)(atom->Nlocal * 6 * sizeof(MD_FLOAT) + estim_neighbors_volume);
+
+    if(!csv) {
+        printf("Pattern: %s\n", pattern_str);
+        printf("Number of timesteps: %d\n", param.ntimes);
+        printf("Number of atoms: %d\n", natoms);
+        printf("Number of neighbors per atom: %d\n", nneighs);
+        printf("Number of times to replicate neighbor lists: %d\n", nreps);
+        printf("Estimated total data volume (kB): %.4f\n", estim_volume / 1000.0);
+        printf("Estimated atom data volume (kB): %.4f\n", estim_atom_volume / 1000.0);
+        printf("Estimated neighborlist data volume (kB): %.4f\n", estim_neighbors_volume / 1000.0);
+    }
+
+    DEBUG_MESSAGE("Initializing neighbor lists...\n");
+    initNeighbor(&neighbor, &param);
+    DEBUG_MESSAGE("Creating neighbor lists...\n");
+    createNeighbors(atom, &neighbor, pattern, nneighs, nreps);
+    DEBUG_MESSAGE("Computing forces...\n");
+
+    double T_accum = 0.0;
+    for(int i = 0; i < param.ntimes; i++) {
+#if defined(MEM_TRACER) || defined(INDEX_TRACER)
+        traceAddresses(&param, atom, &neighbor, i + 1);
+#endif
+
+        if(param.force_field == FF_EAM) {
+            // The EAM kernel time has to be accumulated as well, otherwise
+            // all rates below are computed from a zero runtime.
+            T_accum += computeForceEam(&eam, &param, atom, &neighbor, &stats);
+        } else {
+            if(param.half_neigh) {
+                T_accum += computeForceLJHalfNeigh(&param, atom, &neighbor, &stats);
+            } else {
+                T_accum += computeForceLJFullNeigh(&param, atom, &neighbor, &stats);
+            }
+        }
+    }
+
+    double freq_hz = param.proc_freq * 1.e9;
+    const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes);
+    const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes) * freq_hz;
+    const double cycles_per_neigh = cycles_per_atom / (double)(nneighs);
+
+    if(!csv) {
+        printf("Total time: %.4f, Mega atom updates/s: %.4f\n", T_accum, atoms_updates_per_sec / 1.e6);
+        if(param.proc_freq > 0.0) {
+            printf("Cycles per atom: %.4f, Cycles per neighbor: %.4f\n", cycles_per_atom, cycles_per_neigh);
+        }
+    } else {
+        printf("steps,pattern,natoms,nneighs,nreps,total vol.(kB),atoms vol.(kB),neigh vol.(kB),time(s),atom upds/s(M)");
+        if(param.proc_freq > 0.0) {
+            printf(",cy/atom,cy/neigh");
+        }
+        printf("\n");
+
+        printf("%d,%s,%d,%d,%d,%.4f,%.4f,%.4f,%.4f,%.4f",
+            param.ntimes, pattern_str, natoms, nneighs, nreps,
+            estim_volume / 1.e3, estim_atom_volume / 1.e3, estim_neighbors_volume / 1.e3, T_accum, atoms_updates_per_sec / 1.e6);
+
+        if(param.proc_freq > 0.0) {
+            printf(",%.4f,%.4f", cycles_per_atom, cycles_per_neigh);
+        }
+        printf("\n");
+    }
+
+    // Only the force timer is measured here; clear the remaining entries so
+    // that displayStatistics never reads indeterminate values.
+    double timer[NUMTIMER];
+    for(int t = 0; t < NUMTIMER; t++) {
+        timer[t] = 0.0;
+    }
+    timer[FORCE] = T_accum;
+    displayStatistics(atom, &param, &stats, timer);
+    LIKWID_MARKER_CLOSE;
+    return EXIT_SUCCESS;
+}
--- a/src/verletlist/main.c
+++ b/src/verletlist/main.c
@@ -0,0 +1,368 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <likwid-marker.h>
+#include <omp.h>
+
+#include <allocate.h>
+#include <atom.h>
+#include <device.h>
+#include <eam.h>
+#include <integrate.h>
+#include <neighbor.h>
+#include <parameter.h>
+#include <pbc.h>
+#include <stats.h>
+#include <thermo.h>
+#include <timers.h>
+#include <timing.h>
+#include <util.h>
+#include <vtk.h>
+
+#define HLINE "------------------------------------------------------------------\n"
+
+extern double computeForceLJHalfNeigh(Parameter*, Atom*, Neighbor*, Stats*);
+extern double computeForceLJFullNeigh(Parameter*, Atom*, Neighbor*, Stats*);
+extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
+extern double computeForceDemFullNeigh(Parameter*, Atom*, Neighbor*, Stats*);
+
+#ifdef CUDA_TARGET
+extern double computeForceLJFullNeigh_cuda(Parameter*, Atom*, Neighbor*);
+#endif
+
+/*
+ * Prepares a simulation run: derives the box dimensions from the density,
+ * initializes all modules, creates or reads the atoms and builds the first
+ * neighbor lists.
+ *
+ * Returns the time between the two getTimeStamp() calls that enclose
+ * everything from initAtom up to and including buildNeighbor (the optional
+ * initEam call and the box size computation are not part of it).
+ */
+double setup(Parameter* param, Eam* eam, Atom* atom, Neighbor* neighbor, Stats* stats)
+{
+    if (param->force_field == FF_EAM) {
+        initEam(eam, param);
+    }
+
+    // Lattice constant from the density, box extents from the cell counts
+    param->lattice = pow((4.0 / param->rho), (1.0 / 3.0));
+    param->xprd    = param->nx * param->lattice;
+    param->yprd    = param->ny * param->lattice;
+    param->zprd    = param->nz * param->lattice;
+
+    const double begin = getTimeStamp();
+
+    initAtom(atom);
+    initPbc(atom);
+    initStats(stats);
+    initNeighbor(neighbor, param);
+
+    // Atoms come from the input file if one was given, else they are generated
+    if (param->input_file != NULL) {
+        readAtom(atom, param);
+    } else {
+        createAtom(atom, param);
+    }
+
+    setupNeighbor(param);
+    setupThermo(param, atom->Natoms);
+
+    // adjustThermo is only applied to generated atoms
+    if (param->input_file == NULL) {
+        adjustThermo(param, atom);
+    }
+
+#ifdef SORT_ATOMS
+    atom->Nghost = 0;
+    sortAtom(atom);
+#endif
+
+    setupPbc(atom, param);
+    initDevice(atom, neighbor);
+    updatePbc(atom, param, true);
+    buildNeighbor(atom, neighbor);
+
+    const double end = getTimeStamp();
+    return end - begin;
+}
+
+/*
+ * Rebuilds the neighbor lists: applies updateAtomsPbc to the atoms,
+ * optionally sorts them (SORT_ATOMS), sets up and updates the periodic
+ * boundary data again and calls buildNeighbor.
+ *
+ * Returns the time between the getTimeStamp() calls at entry and exit.
+ */
+double reneighbour(Parameter* param, Atom* atom, Neighbor* neighbor)
+{
+    const double begin = getTimeStamp();
+
+    LIKWID_MARKER_START("reneighbour");
+    updateAtomsPbc(atom, param);
+#ifdef SORT_ATOMS
+    atom->Nghost = 0;
+    sortAtom(atom);
+#endif
+    setupPbc(atom, param);
+    updatePbc(atom, param, true);
+    buildNeighbor(atom, neighbor);
+    LIKWID_MARKER_STOP("reneighbour");
+
+    const double end = getTimeStamp();
+    return end - begin;
+}
+
+/*
+ * Debug helper that prints the atom counters (Natoms, Nlocal, Nghost and
+ * Nmax) on a single line to stdout.
+ */
+void printAtomState(Atom* atom)
+{
+    const int total    = atom->Natoms;
+    const int local    = atom->Nlocal;
+    const int ghost    = atom->Nghost;
+    const int capacity = atom->Nmax;
+
+    printf("Atom counts: Natoms=%d Nlocal=%d Nghost=%d Nmax=%d\n",
+        total, local, ghost, capacity);
+}
+
+/*
+ * Dispatches to the force kernel selected by param->force_field and
+ * param->half_neigh and returns the value reported by that kernel.
+ *
+ * DEM has no half neighbor-list kernel: in that case an error is printed
+ * and 0.0 is returned without computing forces.
+ */
+double computeForce(
+    Eam* eam, Parameter* param, Atom* atom, Neighbor* neighbor, Stats* stats)
+{
+    if (param->force_field == FF_EAM) {
+        return computeForceEam(eam, param, atom, neighbor, stats);
+    } else if (param->force_field == FF_DEM) {
+        if (param->half_neigh) {
+            fprintf(stderr, "Error: DEM cannot use half neighbor-lists!\n");
+            return 0.0;
+        } else {
+            return computeForceDemFullNeigh(param, atom, neighbor, stats);
+        }
+    }
+
+    if (param->half_neigh) {
+        return computeForceLJHalfNeigh(param, atom, neighbor, stats);
+    }
+
+#ifdef CUDA_TARGET
+    // The CUDA kernel is declared without a Stats argument; call it by its
+    // own name, since the host prototype of computeForceLJFullNeigh expects
+    // four arguments.
+    return computeForceLJFullNeigh_cuda(param, atom, neighbor);
+#else
+    return computeForceLJFullNeigh(param, atom, neighbor, stats);
+#endif
+}
+
+/*
+ * Writes the box extents and the position/velocity of every local atom to
+ * the file "input.in" in the current working directory. The first line holds
+ * the box as "0,xprd,0,yprd,0,zprd", followed by one line per atom as
+ * "1,x,y,z,vx,vy,vz".
+ *
+ * If the file cannot be opened, an error is printed to stderr and nothing is
+ * written.
+ */
+void writeInput(Parameter* param, Atom* atom)
+{
+    FILE* fpin = fopen("input.in", "w");
+    if (fpin == NULL) {
+        // Passing a NULL stream to fprintf/fclose is undefined behavior
+        fprintf(stderr, "Error: Could not open input.in for writing!\n");
+        return;
+    }
+
+    fprintf(fpin, "0,%f,0,%f,0,%f\n", param->xprd, param->yprd, param->zprd);
+
+    for (int i = 0; i < atom->Nlocal; i++) {
+        fprintf(fpin,
+            "1,%f,%f,%f,%f,%f,%f\n",
+            atom_x(i),
+            atom_y(i),
+            atom_z(i),
+            atom_vx(i),
+            atom_vy(i),
+            atom_vz(i));
+    }
+
+    fclose(fpin);
+}
+
+/*
+ * Entry point of the verlet-list MD benchmark. It parses the command line,
+ * sets up the system, runs param.ntimes integration steps (rebuilding the
+ * neighbor lists every param.reneigh_every steps) and prints timing,
+ * OpenMP and performance information.
+ */
+int main(int argc, char** argv)
+{
+    double timer[NUMTIMER];
+    Eam eam;
+    Atom atom;
+    Neighbor neighbor;
+    Stats stats;
+    Parameter param;
+
+    // Not every timer slot is measured below; start all of them at zero so
+    // that no indeterminate value can reach the statistics output.
+    for (int t = 0; t < NUMTIMER; t++) {
+        timer[t] = 0.0;
+    }
+
+    LIKWID_MARKER_INIT;
+#pragma omp parallel
+    {
+        LIKWID_MARKER_REGISTER("force");
+        // LIKWID_MARKER_REGISTER("reneighbour");
+        // LIKWID_MARKER_REGISTER("pbc");
+    }
+
+    initParameter(&param);
+    for (int i = 0; i < argc; i++) {
+        if ((strcmp(argv[i], "-p") == 0) || strcmp(argv[i], "--params") == 0) {
+            readParameter(&param, argv[++i]);
+            continue;
+        }
+        if ((strcmp(argv[i], "-f") == 0)) {
+            if ((param.force_field = str2ff(argv[++i])) < 0) {
+                fprintf(stderr, "Invalid force field!\n");
+                exit(-1);
+            }
+            continue;
+        }
+        if ((strcmp(argv[i], "-i") == 0)) {
+            param.input_file = strdup(argv[++i]);
+            continue;
+        }
+        if ((strcmp(argv[i], "-e") == 0)) {
+            param.eam_file = strdup(argv[++i]);
+            continue;
+        }
+        if ((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0)) {
+            param.ntimes = atoi(argv[++i]);
+            continue;
+        }
+        if ((strcmp(argv[i], "-nx") == 0)) {
+            param.nx = atoi(argv[++i]);
+            continue;
+        }
+        if ((strcmp(argv[i], "-ny") == 0)) {
+            param.ny = atoi(argv[++i]);
+            continue;
+        }
+        if ((strcmp(argv[i], "-nz") == 0)) {
+            param.nz = atoi(argv[++i]);
+            continue;
+        }
+        if ((strcmp(argv[i], "-half") == 0)) {
+            param.half_neigh = atoi(argv[++i]);
+            continue;
+        }
+        if ((strcmp(argv[i], "-r") == 0) || (strcmp(argv[i], "--radius") == 0)) {
+            param.cutforce = atof(argv[++i]);
+            continue;
+        }
+        if ((strcmp(argv[i], "-s") == 0) || (strcmp(argv[i], "--skin") == 0)) {
+            param.skin = atof(argv[++i]);
+            continue;
+        }
+        if ((strcmp(argv[i], "--freq") == 0)) {
+            param.proc_freq = atof(argv[++i]);
+            continue;
+        }
+        if ((strcmp(argv[i], "--vtk") == 0)) {
+            param.vtk_file = strdup(argv[++i]);
+            continue;
+        }
+        if ((strcmp(argv[i], "-w") == 0)) {
+            param.write_atom_file = strdup(argv[++i]);
+            continue;
+        }
+        if ((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
+            printf("MD Bench: A minimalistic re-implementation of miniMD\n");
+            printf(HLINE);
+            printf("-p / --params <string>:     file to read parameters from (can be "
+                   "specified more than once)\n");
+            printf("-f <string>:                force field (lj, eam or dem), "
+                   "default lj\n");
+            printf("-i <string>:                input file with atom positions "
+                   "(dump)\n");
+            printf("-e <string>:                input file for EAM\n");
+            printf("-n / --nsteps <int>:        set number of timesteps for "
+                   "simulation\n");
+            printf("-nx/-ny/-nz <int>:          set linear dimension of systembox in "
+                   "x/y/z direction\n");
+            printf("-half <int>:                use half (1) or full (0) neighbor "
+                   "lists\n");
+            printf("-r / --radius <real>:       set cutoff radius\n");
+            printf("-s / --skin <real>:         set skin (verlet buffer)\n");
+            printf("-w <file>:                  write input atoms to file\n");
+            printf("--freq <real>:              processor frequency (GHz)\n");
+            printf("--vtk <string>:             VTK file for visualization\n");
+            printf(HLINE);
+            exit(EXIT_SUCCESS);
+        }
+    }
+
+    // The neighbor cutoff is the force cutoff extended by the skin
+    param.cutneigh = param.cutforce + param.skin;
+    setup(&param, &eam, &atom, &neighbor, &stats);
+    printParameter(&param);
+    printf(HLINE);
+
+    printf("step\ttemp\t\tpressure\n");
+    computeThermo(0, &param, &atom);
+#if defined(MEM_TRACER) || defined(INDEX_TRACER)
+    // No timestep has been executed yet; the loop below traces steps
+    // 1..ntimes, so the initial state is traced as step 0.
+    traceAddresses(&param, &atom, &neighbor, 0);
+#endif
+
+    if (param.write_atom_file != NULL) {
+        writeAtom(&atom, &param);
+    }
+
+    // writeInput(&param, &atom);
+
+    timer[FORCE] = computeForce(&eam, &param, &atom, &neighbor, &stats);
+    timer[NEIGH] = 0.0;
+    timer[TOTAL] = getTimeStamp();
+
+    if (param.vtk_file != NULL) {
+        write_atoms_to_vtk_file(param.vtk_file, &atom, 0);
+    }
+
+    for (int n = 0; n < param.ntimes; n++) {
+        bool reneigh = (n + 1) % param.reneigh_every == 0;
+        initialIntegrate(reneigh, &param, &atom);
+        // Regular steps only refresh the periodic images; every
+        // reneigh_every-th step rebuilds the neighbor lists instead.
+        if ((n + 1) % param.reneigh_every) {
+            updatePbc(&atom, &param, false);
+        } else {
+            timer[NEIGH] += reneighbour(&param, &atom, &neighbor);
+        }
+
+#if defined(MEM_TRACER) || defined(INDEX_TRACER)
+        traceAddresses(&param, &atom, &neighbor, n + 1);
+#endif
+
+        timer[FORCE] += computeForce(&eam, &param, &atom, &neighbor, &stats);
+        finalIntegrate(reneigh, &param, &atom);
+
+        if (!((n + 1) % param.nstat) && (n + 1) < param.ntimes) {
+#ifdef CUDA_TARGET
+            memcpyFromGPU(atom.x, atom.d_atom.x, atom.Nmax * sizeof(MD_FLOAT) * 3);
+#endif
+            computeThermo(n + 1, &param, &atom);
+        }
+
+        if (param.vtk_file != NULL) {
+            write_atoms_to_vtk_file(param.vtk_file, &atom, n + 1);
+        }
+    }
+
+    timer[TOTAL] = getTimeStamp() - timer[TOTAL];
+    computeThermo(-1, &param, &atom);
+
+    printf(HLINE);
+    printf("System: %d atoms %d ghost atoms, Steps: %d\n",
+        atom.Natoms,
+        atom.Nghost,
+        param.ntimes);
+    printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
+        timer[TOTAL],
+        timer[FORCE],
+        timer[NEIGH],
+        timer[TOTAL] - timer[FORCE] - timer[NEIGH]);
+    printf(HLINE);
+
+    int nthreads  = 0;
+    int chunkSize = 0;
+    omp_sched_t schedKind;
+    char schedType[10];
+#pragma omp parallel
+#pragma omp master
+    {
+        omp_get_schedule(&schedKind, &chunkSize);
+
+        switch (schedKind) {
+        case omp_sched_static:
+            strcpy(schedType, "static");
+            break;
+        case omp_sched_dynamic:
+            strcpy(schedType, "dynamic");
+            break;
+        case omp_sched_guided:
+            strcpy(schedType, "guided");
+            break;
+        case omp_sched_auto:
+            strcpy(schedType, "auto");
+            break;
+        case omp_sched_monotonic:
+            strcpy(schedType, "auto");
+            break;
+        default:
+            // Any other value (e.g. a kind combined with a modifier) would
+            // otherwise leave schedType uninitialized for the printf below.
+            strcpy(schedType, "unknown");
+            break;
+        }
+
+        nthreads = omp_get_max_threads();
+    }
+
+    printf("Num threads: %d\n", nthreads);
+    printf("Schedule: (%s,%d)\n", schedType, chunkSize);
+
+    printf("Performance: %.2f million atom updates per second\n",
+        1e-6 * (double)atom.Natoms * param.ntimes / timer[TOTAL]);
+#ifdef COMPUTE_STATS
+    displayStatistics(&atom, &param, &stats, timer);
+#endif
+    LIKWID_MARKER_CLOSE;
+    return EXIT_SUCCESS;
+}
--- a/src/verletlist/neighbor.c
+++ b/src/verletlist/neighbor.c
@@ -0,0 +1,385 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include <neighbor.h>
+#include <parameter.h>
+#include <atom.h>
+
+/* Fraction of the box length added as padding when setupNeighbor() computes the bin range. */
+#define SMALL 1.0e-6
+/* Tolerance used by setupNeighbor() when deciding whether the stencil extent needs one more bin. */
+#define FACTOR 0.999
+
+/* Box length per dimension (set in initNeighbor(), overridden in setupNeighbor() for input files). */
+MD_FLOAT xprd, yprd, zprd;
+/* Per-dimension factor that converts a coordinate into a bin index. */
+MD_FLOAT bininvx, bininvy, bininvz;
+/* Lowest bin index per dimension; subtracted in coord2bin() to obtain a zero-based index. */
+int mbinxlo, mbinylo, mbinzlo;
+/* Number of bins per dimension covering the box itself. */
+int nbinx, nbiny, nbinz;
+int mbinx, mbiny, mbinz; // total bins in x, y, z (mbin?hi - mbin?lo + 1)
+/* Number of atoms currently stored in each bin (mbins entries). */
+int *bincount;
+/* Atom indices per bin, atoms_per_bin slots for each of the mbins bins. */
+int *bins;
+int mbins; // total number of bins (mbinx * mbiny * mbinz)
+int atoms_per_bin;  // max atoms per bin, doubled by binatoms() on overflow
+MD_FLOAT cutneigh;
+MD_FLOAT cutneighsq;  // neighbor cutoff squared
+/* Atom count the neighbor arrays were last sized for in buildNeighbor_cpu(). */
+int nmax;
+int nstencil;      // # of bins in stencil
+int* stencil;      // stencil list of bin offsets
+/* Bin edge length per dimension. */
+MD_FLOAT binsizex, binsizey, binsizez;
+/* Helpers defined further down in this file. */
+static int coord2bin(MD_FLOAT, MD_FLOAT , MD_FLOAT);
+static MD_FLOAT bindist(int, int, int);
+
+/* exported subroutines */
+/*
+ * Resets the module-level binning state and the caller's Neighbor struct.
+ * Box lengths and bin counts are derived from the lattice description in
+ * `param`; all buffers start out as NULL.
+ */
+void initNeighbor(Neighbor *neighbor, Parameter *param) {
+    /* ratio between the number of bins and the number of lattice cells */
+    MD_FLOAT binsPerCell = 5.0 / 6.0;
+
+    /* binning buffers: nothing allocated yet */
+    bincount = NULL;
+    bins = NULL;
+    stencil = NULL;
+    atoms_per_bin = 8;
+    nmax = 0;
+
+    /* box geometry and cutoff */
+    cutneigh = param->cutneigh;
+    xprd = param->nx * param->lattice;
+    yprd = param->ny * param->lattice;
+    zprd = param->nz * param->lattice;
+
+    /* bins per dimension (truncated towards zero on conversion to int) */
+    nbinx = binsPerCell * param->nx;
+    nbiny = binsPerCell * param->ny;
+    nbinz = binsPerCell * param->nz;
+
+    /* neighbor list bookkeeping */
+    neighbor->half_neigh = param->half_neigh;
+    neighbor->maxneighs = 100;
+    neighbor->neighbors = NULL;
+    neighbor->numneigh = NULL;
+}
+
+/*
+ * Computes the bin geometry, the padded bin index range, the stencil of bin
+ * offsets and (re)allocates the bincount/bins buffers.
+ *
+ * param: simulation parameters; when param->input_file is set, the box
+ *        lengths and extents are taken from param instead of the lattice
+ *        values stored by initNeighbor().
+ */
+void setupNeighbor(Parameter* param) {
+    MD_FLOAT coord;
+    int mbinxhi, mbinyhi, mbinzhi;
+    int nextx, nexty, nextz;
+
+    // box lengths come from the input file when one is given
+    if(param->input_file != NULL) {
+        xprd = param->xprd;
+        yprd = param->yprd;
+        zprd = param->zprd;
+    }
+
+    // TODO: update lo and hi for standard case and use them here instead
+    MD_FLOAT xlo = 0.0; MD_FLOAT xhi = xprd;
+    MD_FLOAT ylo = 0.0; MD_FLOAT yhi = yprd;
+    MD_FLOAT zlo = 0.0; MD_FLOAT zhi = zprd;
+
+    cutneighsq = cutneigh * cutneigh;
+
+    if(param->input_file != NULL) {
+        // input file case: bins of half the cutoff, at least one bin per dimension
+        // NOTE(review): bininv is derived from nbin / extent while binsize stays
+        // cutneigh / 2, so binsize * bininv is not necessarily 1 -- confirm intended.
+        binsizex = cutneigh * 0.5;
+        binsizey = cutneigh * 0.5;
+        binsizez = cutneigh * 0.5;
+        nbinx = (int)((param->xhi - param->xlo) / binsizex);
+        nbiny = (int)((param->yhi - param->ylo) / binsizey);
+        nbinz = (int)((param->zhi - param->zlo) / binsizez);
+        if(nbinx == 0) { nbinx = 1; }
+        if(nbiny == 0) { nbiny = 1; }
+        if(nbinz == 0) { nbinz = 1; }
+        bininvx = nbinx / (param->xhi - param->xlo);
+        bininvy = nbiny / (param->yhi - param->ylo);
+        bininvz = nbinz / (param->zhi - param->zlo);
+    } else {
+        // lattice case: bin counts were fixed in initNeighbor()
+        binsizex = xprd / nbinx;
+        binsizey = yprd / nbiny;
+        binsizez = zprd / nbinz;
+        bininvx = 1.0 / binsizex;
+        bininvy = 1.0 / binsizey;
+        bininvz = 1.0 / binsizez;
+    }
+
+    // bin index range covering [lo - cutneigh, hi + cutneigh] plus a small
+    // relative padding; the (int) cast truncates towards zero, hence the
+    // extra decrement for negative coordinates
+    coord = xlo - cutneigh - SMALL * xprd;
+    mbinxlo = (int) (coord * bininvx);
+    if (coord < 0.0) { mbinxlo = mbinxlo - 1; }
+    coord = xhi + cutneigh + SMALL * xprd;
+    mbinxhi = (int) (coord * bininvx);
+
+    coord = ylo - cutneigh - SMALL * yprd;
+    mbinylo = (int) (coord * bininvy);
+    if (coord < 0.0) { mbinylo = mbinylo - 1; }
+    coord = yhi + cutneigh + SMALL * yprd;
+    mbinyhi = (int) (coord * bininvy);
+
+    coord = zlo - cutneigh - SMALL * zprd;
+    mbinzlo = (int) (coord * bininvz);
+    if (coord < 0.0) { mbinzlo = mbinzlo - 1; }
+    coord = zhi + cutneigh + SMALL * zprd;
+    mbinzhi = (int) (coord * bininvz);
+
+    // widen the range by one more bin on each side and derive the bin counts
+    mbinxlo = mbinxlo - 1;
+    mbinxhi = mbinxhi + 1;
+    mbinx = mbinxhi - mbinxlo + 1;
+
+    mbinylo = mbinylo - 1;
+    mbinyhi = mbinyhi + 1;
+    mbiny = mbinyhi - mbinylo + 1;
+
+    mbinzlo = mbinzlo - 1;
+    mbinzhi = mbinzhi + 1;
+    mbinz = mbinzhi - mbinzlo + 1;
+
+    // stencil half-width per dimension: number of bins needed to span cutneigh
+    nextx = (int) (cutneigh * bininvx);
+    if(nextx * binsizex < FACTOR * cutneigh) nextx++;
+    nexty = (int) (cutneigh * bininvy);
+    if(nexty * binsizey < FACTOR * cutneigh) nexty++;
+    nextz = (int) (cutneigh * bininvz);
+    if(nextz * binsizez < FACTOR * cutneigh) nextz++;
+
+    // stencil sized for the full box of offsets; only offsets whose bins lie
+    // closer than the cutoff are stored
+    // NOTE(review): the malloc results in this function are not checked
+    if (stencil) { free(stencil); }
+    stencil = (int*) malloc((2 * nextz + 1) * (2 * nexty + 1) * (2 * nextx + 1) * sizeof(int));
+    nstencil = 0;
+    int kstart = -nextz;
+
+    for(int k = kstart; k <= nextz; k++) {
+        for(int j = -nexty; j <= nexty; j++) {
+            for(int i = -nextx; i <= nextx; i++) {
+                if(bindist(i, j, k) < cutneighsq) {
+                    // flattened offset, same layout as the index returned by coord2bin()
+                    stencil[nstencil++] = k * mbiny * mbinx + j * mbinx + i;
+                }
+            }
+        }
+    }
+
+    // per-bin storage: one counter and atoms_per_bin slots per bin
+    mbins = mbinx * mbiny * mbinz;
+    if (bincount) { free(bincount); }
+    bincount = (int*) malloc(mbins * sizeof(int));
+    if (bins) { free(bins); }
+    bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));
+}
+
+/*
+ * Builds the neighbor list of every local atom.
+ *
+ * Atoms (local and ghost) are binned first; for each local atom the bins
+ * listed in the stencil are scanned and every atom within the neighbor
+ * cutoff is recorded. Neighbors of atom i are stored in
+ * neighbor->neighbors[i * maxneighs ...] and counted in neighbor->numneigh[i].
+ * If any atom has at least maxneighs neighbors, the list is enlarged and the
+ * whole pass is repeated.
+ *
+ * atom:     atom data; Nlocal, Nghost, Nmax and the coordinates are read.
+ * neighbor: neighbor list to fill; its arrays may be reallocated and
+ *           maxneighs may grow.
+ *
+ * Terminates the program if the neighbor arrays cannot be allocated.
+ */
+void buildNeighbor_cpu(Atom *atom, Neighbor *neighbor) {
+    int nall = atom->Nlocal + atom->Nghost;
+
+    /* extend neighbor arrays if necessary */
+    if(nall > nmax) {
+        nmax = nall;
+        free(neighbor->numneigh);
+        free(neighbor->neighbors);
+        /* sizes are computed in size_t so large systems cannot overflow int;
+         * the element type of both arrays is int */
+        neighbor->numneigh = (int*) malloc((size_t)nmax * sizeof(int));
+        neighbor->neighbors = (int*) malloc((size_t)nmax * neighbor->maxneighs * sizeof(int));
+        if(neighbor->numneigh == NULL || neighbor->neighbors == NULL) {
+            fprintf(stderr, "buildNeighbor_cpu: could not allocate neighbor lists!\n");
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    /* bin local & ghost atoms */
+    binatoms(atom);
+    int resize = 1;
+
+    /* loop over each atom, storing neighbors */
+    while(resize) {
+        int new_maxneighs = neighbor->maxneighs;
+        resize = 0;
+
+        for(int i = 0; i < atom->Nlocal; i++) {
+            int* neighptr = &(neighbor->neighbors[(size_t)i * neighbor->maxneighs]);
+            int n = 0;
+            MD_FLOAT xtmp = atom_x(i);
+            MD_FLOAT ytmp = atom_y(i);
+            MD_FLOAT ztmp = atom_z(i);
+            int ibin = coord2bin(xtmp, ytmp, ztmp);
+            #ifdef EXPLICIT_TYPES
+            int type_i = atom->type[i];
+            #endif
+            for(int k = 0; k < nstencil; k++) {
+                int jbin = ibin + stencil[k];
+                int* loc_bin = &bins[jbin * atoms_per_bin];
+
+                for(int m = 0; m < bincount[jbin]; m++) {
+                    int j = loc_bin[m];
+                    /* skip self; with half lists only pairs with j > i are kept */
+                    if((j == i) || (neighbor->half_neigh && (j < i))) {
+                        continue;
+                    }
+
+                    MD_FLOAT delx = xtmp - atom_x(j);
+                    MD_FLOAT dely = ytmp - atom_y(j);
+                    MD_FLOAT delz = ztmp - atom_z(j);
+                    MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+
+                    #ifdef EXPLICIT_TYPES
+                    int type_j = atom->type[j];
+                    const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j];
+                    #else
+                    const MD_FLOAT cutoff = cutneighsq;
+                    #endif
+                    if(rsq <= cutoff) {
+                        /* keep counting past the capacity so the resize below
+                         * learns the required size, but never write outside
+                         * this atom's row */
+                        if(n < neighbor->maxneighs) {
+                            neighptr[n] = j;
+                        }
+                        n++;
+                    }
+                }
+            }
+
+            neighbor->numneigh[i] = n;
+            if(n >= neighbor->maxneighs) {
+                resize = 1;
+
+                if(n >= new_maxneighs) {
+                    new_maxneighs = n;
+                }
+            }
+        }
+
+        if(resize) {
+            printf("RESIZE %d\n", neighbor->maxneighs);
+            /* grow with 20% headroom over the largest count seen */
+            neighbor->maxneighs = new_maxneighs * 1.2;
+            free(neighbor->neighbors);
+            neighbor->neighbors = (int*) malloc((size_t)atom->Nmax * neighbor->maxneighs * sizeof(int));
+            if(neighbor->neighbors == NULL) {
+                fprintf(stderr, "buildNeighbor_cpu: could not allocate neighbor lists!\n");
+                exit(EXIT_FAILURE);
+            }
+        }
+    }
+}
+
+/* internal subroutines */
+/*
+ * Squared distance between the closest faces of two bins that are (i, j, k)
+ * bins apart. Along each axis the gap is (|offset| - 1) bin widths with the
+ * sign of the offset, and zero for an offset of zero.
+ */
+MD_FLOAT bindist(int i, int j, int k) {
+    const MD_FLOAT gapx = (i > 0) ? (i - 1) * binsizex
+                        : (i < 0) ? (i + 1) * binsizex
+                                  : 0.0;
+    const MD_FLOAT gapy = (j > 0) ? (j - 1) * binsizey
+                        : (j < 0) ? (j + 1) * binsizey
+                                  : 0.0;
+    const MD_FLOAT gapz = (k > 0) ? (k - 1) * binsizez
+                        : (k < 0) ? (k + 1) * binsizez
+                                  : 0.0;
+
+    return (gapx * gapx + gapy * gapy + gapz * gapz);
+}
+
+/*
+ * Bin index of one coordinate along a single axis, relative to the lowest
+ * bin index `mbinlo`. Coordinates at or beyond the box length `prd` are
+ * counted from bin `nbin`; negative coordinates get an extra decrement
+ * because the (int) cast truncates towards zero.
+ */
+static int axisBin(MD_FLOAT pos, MD_FLOAT prd, MD_FLOAT bininv, int nbin, int mbinlo) {
+    if(pos >= prd) {
+        return (int)((pos - prd) * bininv) + nbin - mbinlo;
+    }
+
+    if(pos >= 0.0) {
+        return (int)(pos * bininv) - mbinlo;
+    }
+
+    return (int)(pos * bininv) - mbinlo - 1;
+}
+
+/*
+ * Maps a position to its flattened bin index (x fastest, then y, then z).
+ * The result carries an additional offset of one.
+ */
+int coord2bin(MD_FLOAT xin, MD_FLOAT yin, MD_FLOAT zin) {
+    const int bx = axisBin(xin, xprd, bininvx, nbinx, mbinxlo);
+    const int by = axisBin(yin, yprd, bininvy, nbiny, mbinylo);
+    const int bz = axisBin(zin, zprd, bininvz, nbinz, mbinzlo);
+
+    return (bz * mbiny * mbinx + by * mbinx + bx + 1);
+}
+
+/*
+ * Sorts all local and ghost atoms into bins. If any bin runs out of slots,
+ * the per-bin capacity is doubled, the bin storage is reallocated and the
+ * whole binning pass is repeated.
+ */
+void binatoms(Atom *atom) {
+    const int natoms = atom->Nlocal + atom->Nghost;
+    int overflow;
+
+    do {
+        overflow = 0;
+
+        /* start from empty bins */
+        for(int b = 0; b < mbins; b++) {
+            bincount[b] = 0;
+        }
+
+        for(int a = 0; a < natoms; a++) {
+            const int b = coord2bin(atom_x(a), atom_y(a), atom_z(a));
+
+            if(bincount[b] >= atoms_per_bin) {
+                /* bin is full: remember it, the atom is picked up in the next pass */
+                overflow = 1;
+                continue;
+            }
+
+            bins[b * atoms_per_bin + bincount[b]] = a;
+            bincount[b]++;
+        }
+
+        if(overflow) {
+            free(bins);
+            atoms_per_bin *= 2;
+            bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));
+        }
+    } while(overflow > 0);
+}
+
+/*
+ * Reorders positions and velocities so that atoms of the same bin are stored
+ * contiguously, in ascending bin order.
+ *
+ * The atoms are binned, fresh arrays of atom->Nmax entries are allocated,
+ * the data is copied bin by bin, and the old arrays are freed and replaced.
+ * With AOS defined, positions and velocities each live in one array with
+ * three values per atom; otherwise there is one array per component.
+ */
+void sortAtom(Atom* atom) {
+    binatoms(atom);
+    int Nmax = atom->Nmax;
+    // binpos aliases bincount: the prefix sum below overwrites the per-bin
+    // counts with the end position of each bin
+    int* binpos = bincount;
+
+    for(int i = 1; i < mbins; i++) {
+        binpos[i] += binpos[i - 1];
+    }
+
+    // NOTE(review): the malloc results are not checked
+    #ifdef AOS
+    MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
+    MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
+    #else
+    MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
+    MD_FLOAT* new_y = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
+    MD_FLOAT* new_z = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
+    MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
+    MD_FLOAT* new_vy = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
+    MD_FLOAT* new_vz = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
+    #endif
+    // with AOS only old_x and old_vx are read below
+    MD_FLOAT* old_x = atom->x; MD_FLOAT* old_y = atom->y; MD_FLOAT* old_z = atom->z;
+    MD_FLOAT* old_vx = atom->vx; MD_FLOAT* old_vy = atom->vy; MD_FLOAT* old_vz = atom->vz;
+
+    for(int mybin = 0; mybin < mbins; mybin++) {
+        // atoms of this bin go to new indices [start, start + count)
+        int start = mybin > 0 ? binpos[mybin - 1] : 0;
+        int count = binpos[mybin] - start;
+        for(int k = 0; k < count; k++) {
+            int new_i = start + k;
+            int old_i = bins[mybin * atoms_per_bin + k];
+            #ifdef AOS
+            new_x[new_i * 3 + 0] = old_x[old_i * 3 + 0];
+            new_x[new_i * 3 + 1] = old_x[old_i * 3 + 1];
+            new_x[new_i * 3 + 2] = old_x[old_i * 3 + 2];
+            new_vx[new_i * 3 + 0] = old_vx[old_i * 3 + 0];
+            new_vx[new_i * 3 + 1] = old_vx[old_i * 3 + 1];
+            new_vx[new_i * 3 + 2] = old_vx[old_i * 3 + 2];
+            #else
+            new_x[new_i] = old_x[old_i];
+            new_y[new_i] = old_y[old_i];
+            new_z[new_i] = old_z[old_i];
+            new_vx[new_i] = old_vx[old_i];
+            new_vy[new_i] = old_vy[old_i];
+            new_vz[new_i] = old_vz[old_i];
+            #endif
+        }
+    }
+
+    // swap in the sorted arrays and release the old ones
+    // NOTE(review): the old arrays are released with free() and the new ones
+    // come from plain malloc(); confirm this matches how atom->x etc. are
+    // allocated elsewhere
+    free(atom->x);
+    free(atom->vx);
+    atom->x = new_x;
+    atom->vx = new_vx;
+    #ifndef AOS
+    free(atom->y);
+    free(atom->z);
+    free(atom->vy);
+    free(atom->vz);
+    atom->y = new_y;
+    atom->z = new_z;
+    atom->vy = new_vy;
+    atom->vz = new_vz;
+    #endif
+}
--- a/src/verletlist/neighbor.h
+++ b/src/verletlist/neighbor.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <atom.h>
+#include <parameter.h>
+
+#ifndef __NEIGHBOR_H_
+#define __NEIGHBOR_H_
+
+/* Neighbor list pointers kept for the device side (see d_neighbor in Neighbor). */
+typedef struct {
+    int *neighbors;
+    int *numneigh;
+} DeviceNeighbor;
+
+/* Host-side neighbor list and its settings. */
+typedef struct {
+    int every;      // presumably the reneighboring interval -- TODO confirm
+    int ncalls;     // presumably a call counter -- TODO confirm
+    int maxneighs;  // slots reserved per atom in `neighbors`
+    int half_neigh; // non-zero: only neighbors with a higher index are stored
+    int *neighbors; // neighbors of atom i start at neighbors[i * maxneighs]
+    int *numneigh;  // number of neighbors stored for each atom
+
+    // Device data
+    DeviceNeighbor d_neighbor;
+} Neighbor;
+
+/* Bundle of binning parameters; the field names match the file-scope
+ * variables of the same names in neighbor.c. */
+typedef struct {
+    MD_FLOAT xprd; MD_FLOAT yprd; MD_FLOAT zprd;
+    MD_FLOAT bininvx; MD_FLOAT bininvy; MD_FLOAT bininvz;
+    int mbinxlo; int mbinylo; int mbinzlo;
+    int nbinx; int nbiny; int nbinz;
+    int mbinx; int mbiny; int mbinz;
+} Neighbor_params;
+
+/* Bundle of bin storage; the field names match the file-scope variables of
+ * the same names in neighbor.c. */
+typedef struct {
+    int* bincount;
+    int* bins;
+    int mbins;
+    int atoms_per_bin;
+} Binning;
+
+/* Reset the binning state and the Neighbor struct. */
+extern void initNeighbor(Neighbor*, Parameter*);
+/* Compute bin geometry and stencil, allocate the bin storage. */
+extern void setupNeighbor(Parameter*);
+/* Assign all local and ghost atoms to bins. */
+extern void binatoms(Atom*);
+/* Build the neighbor list of all local atoms on the CPU. */
+extern void buildNeighbor_cpu(Atom*, Neighbor*);
+/* Reorder atom positions and velocities by bin. */
+extern void sortAtom(Atom*);
+
+#ifdef CUDA_TARGET
+/* CUDA counterpart of buildNeighbor_cpu (only declared for CUDA builds). */
+extern void buildNeighbor_cuda(Atom*, Neighbor*);
+#endif
+
+#endif
--- a/src/verletlist/pbc.c
+++ b/src/verletlist/pbc.c
@@ -0,0 +1,234 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+//---
+#include <allocate.h>
+#include <atom.h>
+#include <pbc.h>
+
+/* Number of entries by which growPbc() enlarges the ghost arrays. */
+#define DELTA 20000
+
+/* Current capacity (in entries) of border_map and the PBC arrays. */
+int nmaxGhost;
+/* Per ghost atom: multiple of the box length added to the source atom's
+ * coordinate in each dimension (see updatePbc_cpu). */
+int *PBCx, *PBCy, *PBCz;
+
+/* Enlarges border_map and the PBC arrays by DELTA entries. */
+static void growPbc(Atom*);
+
+/* exported subroutines */
+/* Puts the ghost-atom bookkeeping into its empty state: no capacity, no
+ * border map and no shift arrays. */
+void initPbc(Atom* atom)
+{
+    PBCx = PBCy = PBCz = NULL;
+    atom->border_map   = NULL;
+    nmaxGhost          = 0;
+}
+
+/* Refresh the coordinates of all ghost atoms from their source atoms.
+ * Ghost g is stored at index Nlocal + g; its position is the position of
+ * atom border_map[g] shifted by PBCx/PBCy/PBCz[g] box lengths, using the
+ * mapping created in setupPbc. The doReneighbor flag is not used here. */
+void updatePbc_cpu(Atom* atom, Parameter* param, bool doReneighbor)
+{
+    const MD_FLOAT boxX = param->xprd;
+    const MD_FLOAT boxY = param->yprd;
+    const MD_FLOAT boxZ = param->zprd;
+    const int* source   = atom->border_map;
+    const int firstGhost = atom->Nlocal;
+
+    for (int g = 0; g < atom->Nghost; g++) {
+        const int dst = firstGhost + g;
+        const int src = source[g];
+
+        atom_x(dst) = atom_x(src) + PBCx[g] * boxX;
+        atom_y(dst) = atom_y(src) + PBCy[g] * boxY;
+        atom_z(dst) = atom_z(src) + PBCz[g] * boxZ;
+    }
+}
+
+/* Wrap local atoms that have left the box back into it: a negative
+ * coordinate is shifted up by one box length, a coordinate at or beyond the
+ * box length is shifted down by one box length. */
+void updateAtomsPbc_cpu(Atom* atom, Parameter* param)
+{
+    const MD_FLOAT boxX = param->xprd;
+    const MD_FLOAT boxY = param->yprd;
+    const MD_FLOAT boxZ = param->zprd;
+
+    for (int i = 0; i < atom->Nlocal; i++) {
+        const MD_FLOAT px = atom_x(i);
+        const MD_FLOAT py = atom_y(i);
+        const MD_FLOAT pz = atom_z(i);
+
+        if (px < 0.0) {
+            atom_x(i) = px + boxX;
+        } else if (px >= boxX) {
+            atom_x(i) = px - boxX;
+        }
+
+        if (py < 0.0) {
+            atom_y(i) = py + boxY;
+        } else if (py >= boxY) {
+            atom_y(i) = py - boxY;
+        }
+
+        if (pz < 0.0) {
+            atom_z(i) = pz + boxZ;
+        } else if (pz >= boxZ) {
+            atom_z(i) = pz - boxZ;
+        }
+    }
+}
+
+/* setup periodic boundary conditions by
+ * defining ghost atoms around domain
+ * only creates mapping and coordinate corrections
+ * that are then enforced in updatePbc */
+#define ADDGHOST(dx, dy, dz)                                                             \
+    Nghost++;                                                                            \
+    border_map[Nghost]                = i;                                               \
+    PBCx[Nghost]                      = dx;                                              \
+    PBCy[Nghost]                      = dy;                                              \
+    PBCz[Nghost]                      = dz;                                              \
+    atom->type[atom->Nlocal + Nghost] = atom->type[i]
+
+/*
+ * Creates the ghost atoms for the periodic dimensions.
+ *
+ * For every local atom that lies within cutneigh of a box face, one ghost
+ * entry per periodic image (faces, corners, edges) is appended via ADDGHOST.
+ * Only the mapping (border_map), the shift multipliers (PBCx/y/z) and the
+ * ghost types are written here; ghost coordinates are filled by updatePbc.
+ * On return atom->Nghost holds the number of ghosts created.
+ */
+void setupPbc(Atom* atom, Parameter* param)
+{
+    int* border_map   = atom->border_map;
+    MD_FLOAT xprd     = param->xprd;
+    MD_FLOAT yprd     = param->yprd;
+    MD_FLOAT zprd     = param->zprd;
+    MD_FLOAT cutneigh = param->cutneigh;
+    // index of the last ghost written; ADDGHOST increments before storing
+    int Nghost        = -1;
+
+    for (int i = 0; i < atom->Nlocal; i++) {
+        // make room for the ghosts of this atom before any ADDGHOST runs
+        // NOTE(review): the checks reserve 7 slots per atom; if a box length
+        // is below 2 * cutneigh both the low and high test of a dimension can
+        // hold and more ghosts may be added -- confirm this cannot happen
+        if (atom->Nlocal + Nghost + 7 >= atom->Nmax) {
+            growAtom(atom);
+        }
+
+        if (Nghost + 7 >= nmaxGhost) {
+            growPbc(atom);
+            // growPbc replaces atom->border_map, refresh the local copy
+            border_map = atom->border_map;
+        }
+
+        MD_FLOAT x = atom_x(i);
+        MD_FLOAT y = atom_y(i);
+        MD_FLOAT z = atom_z(i);
+
+        /* Setup ghost atoms */
+        /* 6 planes */
+        if (param->pbc_x != 0) {
+            if (x < cutneigh) {
+                ADDGHOST(+1, 0, 0);
+            }
+            if (x >= (xprd - cutneigh)) {
+                ADDGHOST(-1, 0, 0);
+            }
+        }
+
+        if (param->pbc_y != 0) {
+            if (y < cutneigh) {
+                ADDGHOST(0, +1, 0);
+            }
+            if (y >= (yprd - cutneigh)) {
+                ADDGHOST(0, -1, 0);
+            }
+        }
+
+        if (param->pbc_z != 0) {
+            if (z < cutneigh) {
+                ADDGHOST(0, 0, +1);
+            }
+            if (z >= (zprd - cutneigh)) {
+                ADDGHOST(0, 0, -1);
+            }
+        }
+
+        /* 8 corners */
+        if (param->pbc_x != 0 && param->pbc_y != 0 && param->pbc_z != 0) {
+            if (x < cutneigh && y < cutneigh && z < cutneigh) {
+                ADDGHOST(+1, +1, +1);
+            }
+            if (x < cutneigh && y >= (yprd - cutneigh) && z < cutneigh) {
+                ADDGHOST(+1, -1, +1);
+            }
+            if (x < cutneigh && y < cutneigh && z >= (zprd - cutneigh)) {
+                ADDGHOST(+1, +1, -1);
+            }
+            if (x < cutneigh && y >= (yprd - cutneigh) && z >= (zprd - cutneigh)) {
+                ADDGHOST(+1, -1, -1);
+            }
+            if (x >= (xprd - cutneigh) && y < cutneigh && z < cutneigh) {
+                ADDGHOST(-1, +1, +1);
+            }
+            if (x >= (xprd - cutneigh) && y >= (yprd - cutneigh) && z < cutneigh) {
+                ADDGHOST(-1, -1, +1);
+            }
+            if (x >= (xprd - cutneigh) && y < cutneigh && z >= (zprd - cutneigh)) {
+                ADDGHOST(-1, +1, -1);
+            }
+            if (x >= (xprd - cutneigh) && y >= (yprd - cutneigh) &&
+                z >= (zprd - cutneigh)) {
+                ADDGHOST(-1, -1, -1);
+            }
+        }
+
+        /* 12 edges */
+        if (param->pbc_x != 0 && param->pbc_z != 0) {
+            if (x < cutneigh && z < cutneigh) {
+                ADDGHOST(+1, 0, +1);
+            }
+            if (x < cutneigh && z >= (zprd - cutneigh)) {
+                ADDGHOST(+1, 0, -1);
+            }
+            if (x >= (xprd - cutneigh) && z < cutneigh) {
+                ADDGHOST(-1, 0, +1);
+            }
+            if (x >= (xprd - cutneigh) && z >= (zprd - cutneigh)) {
+                ADDGHOST(-1, 0, -1);
+            }
+        }
+
+        if (param->pbc_y != 0 && param->pbc_z != 0) {
+            if (y < cutneigh && z < cutneigh) {
+                ADDGHOST(0, +1, +1);
+            }
+            if (y < cutneigh && z >= (zprd - cutneigh)) {
+                ADDGHOST(0, +1, -1);
+            }
+            if (y >= (yprd - cutneigh) && z < cutneigh) {
+                ADDGHOST(0, -1, +1);
+            }
+            if (y >= (yprd - cutneigh) && z >= (zprd - cutneigh)) {
+                ADDGHOST(0, -1, -1);
+            }
+        }
+
+        if (param->pbc_x != 0 && param->pbc_y != 0) {
+            if (y < cutneigh && x < cutneigh) {
+                ADDGHOST(+1, +1, 0);
+            }
+            if (y < cutneigh && x >= (xprd - cutneigh)) {
+                ADDGHOST(-1, +1, 0);
+            }
+            if (y >= (yprd - cutneigh) && x < cutneigh) {
+                ADDGHOST(+1, -1, 0);
+            }
+            if (y >= (yprd - cutneigh) && x >= (xprd - cutneigh)) {
+                ADDGHOST(-1, -1, 0);
+            }
+        }
+    }
+    // increase by one to make it the ghost atom count
+    atom->Nghost = Nghost + 1;
+}
+
+/* internal subroutines */
+/* Enlarges the ghost bookkeeping arrays (border_map, PBCx, PBCy, PBCz) by
+ * DELTA entries each; the previous size is handed to reallocate() together
+ * with the new one. */
+void growPbc(Atom* atom)
+{
+    const int prevCapacity = nmaxGhost;
+    nmaxGhost              = prevCapacity + DELTA;
+
+    const size_t oldBytes = prevCapacity * sizeof(int);
+    const size_t newBytes = nmaxGhost * sizeof(int);
+
+    atom->border_map = (int*)reallocate(atom->border_map, ALIGNMENT, newBytes, oldBytes);
+    PBCx             = (int*)reallocate(PBCx, ALIGNMENT, newBytes, oldBytes);
+    PBCy             = (int*)reallocate(PBCy, ALIGNMENT, newBytes, oldBytes);
+    PBCz             = (int*)reallocate(PBCz, ALIGNMENT, newBytes, oldBytes);
+}
--- a/src/verletlist/pbc.h
+++ b/src/verletlist/pbc.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdbool.h>
+//---
+#include <atom.h>
+#include <parameter.h>
+
+#ifndef __PBC_H_
+#define __PBC_H_
+/* Reset the ghost-atom bookkeeping. */
+extern void initPbc(Atom*);
+/* Recompute ghost atom coordinates from their source atoms. */
+extern void updatePbc_cpu(Atom*, Parameter*, bool);
+/* Wrap local atoms that left the box back into it. */
+extern void updateAtomsPbc_cpu(Atom*, Parameter*);
+/* Create the ghost atom mapping for the periodic dimensions. */
+extern void setupPbc(Atom*, Parameter*);
+
+#ifdef CUDA_TARGET
+/* CUDA counterparts of the two update routines (only declared for CUDA builds). */
+extern void updatePbc_cuda(Atom*, Parameter*, bool);
+extern void updateAtomsPbc_cuda(Atom*, Parameter*);
+#endif
+
+#endif
--- a/src/verletlist/stats.c
+++ b/src/verletlist/stats.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdio.h>
+
+#include <atom.h>
+#include <parameter.h>
+#include <stats.h>
+#include <timers.h>
+
+/* Clears every counter of the statistics record. */
+void initStats(Stats *s) {
+    s->atoms_outside_cutoff = 0;
+    s->atoms_within_cutoff = 0;
+    s->total_force_iters = 0;
+    s->total_force_neighs = 0;
+}
+
+/*
+ * Prints the statistics collected during the force computation.
+ * Does nothing unless COMPUTE_STATS is defined.
+ *
+ * atom:  atom data; only Nlocal is read.
+ * param: simulation parameters; ntimes and proc_freq are read.
+ * stats: counters accumulated during the run.
+ * timer: timer array; the FORCE entry is read.
+ */
+void displayStatistics(Atom *atom, Parameter *param, Stats *stats, double *timer) {
+#ifdef COMPUTE_STATS
+
+    /* Nlocal * (ntimes + 1), evaluated in double: the product of the two
+     * ints overflows for large systems or long runs */
+    const double atom_iters = (double)atom->Nlocal * ((double)param->ntimes + 1.0);
+
+    double force_useful_volume = 1e-9 * ( atom_iters * (sizeof(MD_FLOAT) * 6 + sizeof(int)) +
+                                          (double)(stats->total_force_neighs) * (sizeof(MD_FLOAT) * 3 + sizeof(int)) );
+    double avg_neigh = stats->total_force_neighs / atom_iters;
+    double avg_simd = stats->total_force_iters / atom_iters;
+
+    #ifdef EXPLICIT_TYPES
+    /* one extra int (the type) per atom visit and per neighbor */
+    force_useful_volume += 1e-9 * (atom_iters + (double)stats->total_force_neighs) * sizeof(int);
+    #endif
+
+    printf("Statistics:\n");
+    printf("\tVector width: %d, Processor frequency: %.4f GHz\n", VECTOR_WIDTH, param->proc_freq);
+    printf("\tAverage neighbors per atom: %.4f\n", avg_neigh);
+    printf("\tAverage SIMD iterations per atom: %.4f\n", avg_simd);
+    printf("\tTotal number of computed pair interactions: %lld\n", stats->total_force_neighs);
+    printf("\tTotal number of SIMD iterations: %lld\n", stats->total_force_iters);
+    printf("\tUseful read data volume for force computation: %.2fGB\n", force_useful_volume);
+    printf("\tCycles/SIMD iteration: %.4f\n", timer[FORCE] * param->proc_freq * 1e9 / stats->total_force_iters);
+
+    #ifdef USE_REFERENCE_VERSION
+    /* share of visited neighbors that were inside the cutoff */
+    const double eff_pct = (double)stats->atoms_within_cutoff / (double)(stats->atoms_within_cutoff + stats->atoms_outside_cutoff) * 100.0;
+    printf("\tAtoms within/outside cutoff radius: %lld/%lld (%.2f%%)\n", stats->atoms_within_cutoff, stats->atoms_outside_cutoff, eff_pct);
+    #endif
+
+#endif
+}
--- a/src/verletlist/stats.h
+++ b/src/verletlist/stats.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <atom.h>
+#include <parameter.h>
+
+#ifndef __STATS_H_
+#define __STATS_H_
+/* Counters reported by displayStatistics(). */
+typedef struct {
+    long long int total_force_neighs;   // reported as computed pair interactions
+    long long int total_force_iters;    // reported as SIMD iterations
+    long long int atoms_within_cutoff;  // reported as atoms within the cutoff radius
+    long long int atoms_outside_cutoff; // reported as atoms outside the cutoff radius
+} Stats;
+
+/* Zero all counters. */
+void initStats(Stats *s);
+/* Print the collected statistics (no output unless COMPUTE_STATS is defined). */
+void displayStatistics(Atom *atom, Parameter *param, Stats *stats, double *timer);
+
+/* Statistics helpers; they expand to nothing unless COMPUTE_STATS is defined.
+ * Each expansion already ends in a semicolon. beginStatTimer() declares a
+ * local `Si` that endStatTimer() reads, so both must be used in one scope. */
+#ifdef COMPUTE_STATS
+#   define addStat(stat, value)     stat += value;
+#   define beginStatTimer()         double Si = getTimeStamp();
+#   define endStatTimer(stat)       stat += getTimeStamp() - Si;
+#else
+#   define addStat(stat, value)
+#   define beginStatTimer()
+#   define endStatTimer(stat)
+#endif
+
+#endif
--- a/src/verletlist/tracing.c
+++ b/src/verletlist/tracing.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <neighbor.h>
+#include <parameter.h>
+#include <atom.h>
+#include <tracing.h>
+
+/*
+ * Emits memory-address and index traces for one pass over the neighbor
+ * lists, in the order in which a force kernel would touch the data: the
+ * position (and type) of each local atom, each neighbor index followed by the
+ * neighbor's position (and type), and finally a read and a write of the
+ * atom's force components.
+ *
+ * All output goes through the MEM_TRACE / INDEX_TRACE macro families, which
+ * expand to nothing unless MEM_TRACER / INDEX_TRACER are defined.
+ *
+ * param, timestep: referenced by the tracer macros through TRACER_CONDITION.
+ * atom, neighbor:  data whose addresses and indices are traced.
+ */
+void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timestep) {
+    MEM_TRACER_INIT;
+    INDEX_TRACER_INIT;
+    int Nlocal = atom->Nlocal;
+    int* neighs;
+
+    INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
+    for(int i = 0; i < Nlocal; i++) {
+        neighs = &neighbor->neighbors[i * neighbor->maxneighs];
+        int numneighs = neighbor->numneigh[i];
+        MEM_TRACE(atom_x(i), 'R');
+        MEM_TRACE(atom_y(i), 'R');
+        MEM_TRACE(atom_z(i), 'R');
+        INDEX_TRACE_ATOM(i);
+
+        #ifdef EXPLICIT_TYPES
+        MEM_TRACE(atom->type[i], 'R');
+        #endif
+
+        DIST_TRACE_SORT(neighs, numneighs);
+        INDEX_TRACE(neighs, numneighs);
+        DIST_TRACE(neighs, numneighs);
+
+        for(int k = 0; k < numneighs; k++) {
+            /* index of the k-th neighbor; it was used below without ever
+             * being declared, which broke the build with MEM_TRACER defined */
+            int j = neighs[k];
+            (void)j; /* unused when the tracer macros expand to nothing */
+
+            MEM_TRACE(neighs[k], 'R');
+            MEM_TRACE(atom_x(j), 'R');
+            MEM_TRACE(atom_y(j), 'R');
+            MEM_TRACE(atom_z(j), 'R');
+
+            #ifdef EXPLICIT_TYPES
+            MEM_TRACE(atom->type[j], 'R');
+            #endif
+        }
+
+        /* force components are read and then written back */
+        MEM_TRACE(atom_fx(i), 'R');
+        MEM_TRACE(atom_fx(i), 'W');
+        MEM_TRACE(atom_fy(i), 'R');
+        MEM_TRACE(atom_fy(i), 'W');
+        MEM_TRACE(atom_fz(i), 'R');
+        MEM_TRACE(atom_fz(i), 'W');
+    }
+
+    INDEX_TRACER_END;
+    MEM_TRACER_END;
+}
--- a/src/verletlist/tracing.h
+++ b/src/verletlist/tracing.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <neighbor.h>
+#include <parameter.h>
+#include <atom.h>
+
+#if defined(MEM_TRACER) || defined(INDEX_TRACER)
+#include <stdio.h>
+#include <stdlib.h>
+#endif
+
+/* Group size used by the index/distance tracers when VECTOR_WIDTH is not
+ * defined elsewhere. */
+#ifndef VECTOR_WIDTH
+#   define VECTOR_WIDTH                 8
+#endif
+
+/* Condition under which tracing is active; by default every timestep that is
+ * a multiple of param->every. Expects `timestep` and `param` to be in scope
+ * at the point where a tracer macro is used. */
+#ifndef TRACER_CONDITION
+#   define TRACER_CONDITION                 (!(timestep % param->every))
+#endif
+
+/* Memory address tracer. With MEM_TRACER defined:
+ *   MEM_TRACER_INIT     declares mem_tracer_fp and, when TRACER_CONDITION
+ *                       holds, opens "mem_tracer_<timestep>.out" for writing;
+ *   MEM_TRACE(addr, op) writes "<op>: <address of addr>" to that file;
+ *   MEM_TRACER_END      closes the file.
+ * Without MEM_TRACER all three expand to nothing.
+ * Every line of MEM_TRACER_INIT except the last must end in a backslash so
+ * that the closing brace of the if-block stays inside the macro. */
+#ifdef MEM_TRACER
+#   define MEM_TRACER_INIT                  FILE *mem_tracer_fp; \
+                                            if(TRACER_CONDITION) { \
+                                                char mem_tracer_fn[128]; \
+                                                snprintf(mem_tracer_fn, sizeof mem_tracer_fn, "mem_tracer_%d.out", timestep); \
+                                                mem_tracer_fp = fopen(mem_tracer_fn, "w"); \
+                                            }
+
+#   define MEM_TRACER_END                   if(TRACER_CONDITION) { fclose(mem_tracer_fp); }
+#   define MEM_TRACE(addr, op)              if(TRACER_CONDITION) { fprintf(mem_tracer_fp, "%c: %p\n", op, (void *)(&(addr))); }
+#else
+#   define MEM_TRACER_INIT
+#   define MEM_TRACER_END
+#   define MEM_TRACE(addr, op)
+#endif
+
+/* Index tracer. With INDEX_TRACER defined the macros below write to
+ * "index_tracer_<timestep>.out" whenever TRACER_CONDITION holds; without it
+ * they all expand to nothing. */
+#ifdef INDEX_TRACER
+/* Declares index_tracer_fp and opens the output file. */
+#   define INDEX_TRACER_INIT                FILE *index_tracer_fp; \
+                                            if(TRACER_CONDITION) { \
+                                                char index_tracer_fn[128]; \
+                                                snprintf(index_tracer_fn, sizeof index_tracer_fn, "index_tracer_%d.out", timestep); \
+                                                index_tracer_fp = fopen(index_tracer_fn, "w"); \
+                                            }
+
+/* Closes the file; the next two write an "N:" line with three counts and an
+ * "A:" line with one atom index. INDEX_TRACE prints the first e entries of
+ * list l as "I:" lines, VECTOR_WIDTH entries per line. */
+#   define INDEX_TRACER_END                 if(TRACER_CONDITION) { fclose(index_tracer_fp); }
+#   define INDEX_TRACE_NATOMS(nl, ng, mn)   if(TRACER_CONDITION) { fprintf(index_tracer_fp, "N: %d %d %d\n", nl, ng, mn); }
+#   define INDEX_TRACE_ATOM(a)              if(TRACER_CONDITION) { fprintf(index_tracer_fp, "A: %d\n", a); }
+#   define INDEX_TRACE(l, e)                if(TRACER_CONDITION) { \
+                                                for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
+                                                    int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
+                                                    fprintf(index_tracer_fp, "I: "); \
+                                                    for(int __j = 0; __j < __e; ++__j) { \
+                                                        fprintf(index_tracer_fp, "%d ", l[__i + __j]); \
+                                                    } \
+                                                    fprintf(index_tracer_fp, "\n"); \
+                                                } \
+                                            }
+
+/* Sorts list l in place, ascending, separately within each group of
+ * VECTOR_WIDTH entries (bubble sort). Note that this modifies the list. */
+#   define DIST_TRACE_SORT(l, e)            if(TRACER_CONDITION) { \
+                                                for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
+                                                    int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
+                                                    if(__e > 1) { \
+                                                        for(int __j = __i; __j < __i + __e - 1; ++__j) { \
+                                                            for(int __k = __i; __k < __i + __e - (__j - __i) - 1; ++__k) { \
+                                                                if(l[__k] > l[__k + 1]) { \
+                                                                    int __t = l[__k]; \
+                                                                    l[__k] = l[__k + 1]; \
+                                                                    l[__k + 1] = __t; \
+                                                                } \
+                                                            } \
+                                                        } \
+                                                    } \
+                                                } \
+                                            }
+
+/* For each group of VECTOR_WIDTH entries with more than one element, prints a
+ * "D:" line with the absolute differences between consecutive entries. */
+#   define DIST_TRACE(l, e)                 if(TRACER_CONDITION) { \
+                                                for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
+                                                    int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
+                                                    if(__e > 1) { \
+                                                        fprintf(index_tracer_fp, "D: "); \
+                                                        for(int __j = 0; __j < __e - 1; ++__j) { \
+                                                            int __dist = abs(l[__i + __j + 1] - l[__i + __j]); \
+                                                            fprintf(index_tracer_fp, "%d ", __dist); \
+                                                        } \
+                                                        fprintf(index_tracer_fp, "\n"); \
+                                                    } \
+                                                } \
+                                            }
+#else
+#   define INDEX_TRACER_INIT
+#   define INDEX_TRACER_END
+#   define INDEX_TRACE_NATOMS(nl, ng, mn)
+#   define INDEX_TRACE_ATOM(a)
+#   define INDEX_TRACE(l, e)
+#   define DIST_TRACE_SORT(l, e)
+#   define DIST_TRACE(l, e)
+#endif
+
+/* Emits the memory and index traces for one pass over the neighbor lists. */
+extern void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timestep);
--- a/src/verletlist/vtk.c
+++ b/src/verletlist/vtk.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <atom.h>
+
+/* Dump the atom->Nlocal local atoms as vertex cells of an ASCII legacy-VTK
+ * unstructured grid into "<filename>_<timestep>.vtk" (mass 1.0 per point).
+ * Returns 0 on success, -1 if the name is truncated or any I/O call fails. */
+int write_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
+    char timestep_filename[128];
+    int len = snprintf(timestep_filename, sizeof timestep_filename, "%s_%d.vtk", filename, timestep);
+    FILE* fp = (len < 0 || (size_t) len >= sizeof timestep_filename) ? NULL : fopen(timestep_filename, "wb");
+
+    if(fp == NULL) { /* name did not fit the buffer, or fopen itself failed */
+        fprintf(stderr, "Could not open VTK file for writing!\n");
+        return -1;
+    }
+
+    fprintf(fp, "# vtk DataFile Version 2.0\nParticle data\nASCII\nDATASET UNSTRUCTURED_GRID\n");
+    fprintf(fp, "POINTS %d double\n", atom->Nlocal);
+    for(int i = 0; i < atom->Nlocal; ++i) {
+        fprintf(fp, "%.4f %.4f %.4f\n", atom_x(i), atom_y(i), atom_z(i));
+    }
+    fprintf(fp, "\n\n");
+    fprintf(fp, "CELLS %d %d\n", atom->Nlocal, atom->Nlocal * 2); /* per cell: the count 1 plus one point index */
+    for(int i = 0; i < atom->Nlocal; ++i) {
+        fprintf(fp, "1 %d\n", i);
+    }
+    fprintf(fp, "\n\n");
+    fprintf(fp, "CELL_TYPES %d\n", atom->Nlocal);
+    for(int i = 0; i < atom->Nlocal; ++i) {
+        fprintf(fp, "1\n"); /* legacy VTK cell type 1 is VTK_VERTEX */
+    }
+    fprintf(fp, "\n\n");
+    fprintf(fp, "POINT_DATA %d\n", atom->Nlocal);
+    fprintf(fp, "SCALARS mass double\n");
+    fprintf(fp, "LOOKUP_TABLE default\n");
+    for(int i = 0; i < atom->Nlocal; i++) {
+        fprintf(fp, "1.0\n");
+    }
+    fprintf(fp, "\n\n");
+    int failed = ferror(fp); /* sticky error flag: covers every fprintf above */
+    return (fclose(fp) != 0 || failed) ? -1 : 0; /* fclose flushes, so it can fail as well */
+}
--- a/src/verletlist/vtk.h
+++ b/src/verletlist/vtk.h
@@ -0,0 +1,12 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <atom.h>
+
+#ifndef __VTK_H_
+#define __VTK_H_
+extern int write_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep); /* writes "<filename>_<timestep>.vtk"; 0 on success, -1 on failure */
+#endif /* __VTK_H_ */