Cleanup. Remove copyright year. Reformat.

2024-05-13 12:33:08 +02:00
parent a6a269703d
commit 9712d7e2c8
77 changed files with 959 additions and 648 deletions
--- a/clusterpair/atom.c
+++ b/clusterpair/atom.c
@@ -0,0 +1,532 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+#include <atom.h>
+#include <allocate.h>
+#include <util.h>
+
+// Resets an Atom to its empty state: every array pointer becomes NULL and
+// every counter becomes zero. initMasks() is called last to set up the mask
+// tables stored in the same structure.
+void initAtom(Atom *atom) {
+    // Per-atom positions, velocities and types
+    atom->x    = NULL;
+    atom->y    = NULL;
+    atom->z    = NULL;
+    atom->vx   = NULL;
+    atom->vy   = NULL;
+    atom->vz   = NULL;
+    atom->type = NULL;
+
+    // Per-cluster arrays
+    atom->cl_x         = NULL;
+    atom->cl_v         = NULL;
+    atom->cl_f         = NULL;
+    atom->cl_type      = NULL;
+    atom->iclusters    = NULL;
+    atom->jclusters    = NULL;
+    atom->icluster_bin = NULL;
+
+    // Per-type-pair parameter tables
+    atom->epsilon    = NULL;
+    atom->sigma6     = NULL;
+    atom->cutforcesq = NULL;
+    atom->cutneighsq = NULL;
+
+    // Atom counters
+    atom->Natoms = 0;
+    atom->Nlocal = 0;
+    atom->Nghost = 0;
+    atom->Nmax   = 0;
+    atom->ntypes = 0;
+
+    // Cluster counters
+    atom->Nclusters       = 0;
+    atom->Nclusters_local = 0;
+    atom->Nclusters_ghost = 0;
+    atom->Nclusters_max   = 0;
+
+    initMasks(atom);
+}
+
+// Builds the initial atom configuration from the run parameters.
+//
+// Allocates the four ntypes x ntypes parameter tables and fills every entry
+// with the single value taken from `param`. Then walks lattice sites inside
+// the box [0, xprd) x [0, yprd) x [0, zprd) and appends one atom (position,
+// velocity, type) per accepted site, growing the atom arrays when they are
+// full. On return atom->Nlocal holds the number of atoms actually created;
+// atom->Natoms is set up front to 4 * nx * ny * nz.
+void createAtom(Atom *atom, Parameter *param) {
+    MD_FLOAT xlo = 0.0; MD_FLOAT xhi = param->xprd;
+    MD_FLOAT ylo = 0.0; MD_FLOAT yhi = param->yprd;
+    MD_FLOAT zlo = 0.0; MD_FLOAT zhi = param->zprd;
+    atom->Natoms = 4 * param->nx * param->ny * param->nz;
+    atom->Nlocal = 0;
+    atom->ntypes = param->ntypes;
+    atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+
+    // All type pairs share the same parameters; cutoffs are stored squared.
+    for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
+        atom->epsilon[i] = param->epsilon;
+        atom->sigma6[i] = param->sigma6;
+        atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
+        atom->cutforcesq[i] = param->cutforce * param->cutforce;
+    }
+
+    // alat = (4 / rho)^(1/3); presumably the lattice constant for 4 atoms per
+    // unit cell. Site indices below are in units of half of that spacing.
+    MD_FLOAT alat = pow((4.0 / param->rho), (1.0 / 3.0));
+    int ilo = (int) (xlo / (0.5 * alat) - 1);
+    int ihi = (int) (xhi / (0.5 * alat) + 1);
+    int jlo = (int) (ylo / (0.5 * alat) - 1);
+    int jhi = (int) (yhi / (0.5 * alat) + 1);
+    int klo = (int) (zlo / (0.5 * alat) - 1);
+    int khi = (int) (zhi / (0.5 * alat) + 1);
+
+    // Clamp the index ranges to [0, 2 * n - 1] in each dimension.
+    ilo = MAX(ilo, 0);
+    ihi = MIN(ihi, 2 * param->nx - 1);
+    jlo = MAX(jlo, 0);
+    jhi = MIN(jhi, 2 * param->ny - 1);
+    klo = MAX(klo, 0);
+    khi = MIN(khi, 2 * param->nz - 1);
+
+    MD_FLOAT xtmp, ytmp, ztmp, vxtmp, vytmp, vztmp;
+    int i, j, k, m, n;
+    // s{x,y,z}: site offset inside the current sub-box;
+    // o{x,y,z}: index of the current sub-box. Sites are visited one sub-box
+    // (edge length subboxdim) at a time rather than in plain i/j/k order.
+    int sx = 0; int sy = 0; int sz = 0;
+    int ox = 0; int oy = 0; int oz = 0;
+    int subboxdim = 8;
+
+    while(oz * subboxdim <= khi) {
+        k = oz * subboxdim + sz;
+        j = oy * subboxdim + sy;
+        i = ox * subboxdim + sx;
+
+        // Only sites with an even index sum inside the clamped ranges are used.
+        if(((i + j + k) % 2 == 0) && (i >= ilo) && (i <= ihi) && (j >= jlo) && (j <= jhi) && (k >= klo) && (k <= khi)) {
+            xtmp = 0.5 * alat * i;
+            ytmp = 0.5 * alat * j;
+            ztmp = 0.5 * alat * k;
+
+            if(xtmp >= xlo && xtmp < xhi && ytmp >= ylo && ytmp < yhi && ztmp >= zlo && ztmp < zhi ) {
+                // Seed derived from the site indices, so the velocity drawn for
+                // a site does not depend on the order in which sites are visited.
+                n = k * (2 * param->ny) * (2 * param->nx) + j * (2 * param->nx) + i + 1;
+                // Five draws are discarded before each velocity component is taken.
+                for(m = 0; m < 5; m++) { myrandom(&n); }
+                vxtmp = myrandom(&n);
+                for(m = 0; m < 5; m++){ myrandom(&n); }
+                vytmp = myrandom(&n);
+                for(m = 0; m < 5; m++) { myrandom(&n); }
+                vztmp = myrandom(&n);
+
+                if(atom->Nlocal == atom->Nmax) { growAtom(atom); }
+                atom_x(atom->Nlocal) = xtmp;
+                atom_y(atom->Nlocal) = ytmp;
+                atom_z(atom->Nlocal) = ztmp;
+                atom->vx[atom->Nlocal] = vxtmp;
+                atom->vy[atom->Nlocal] = vytmp;
+                atom->vz[atom->Nlocal] = vztmp;
+                // Type comes from the C library rand(), not from myrandom().
+                atom->type[atom->Nlocal] = rand() % atom->ntypes;
+                atom->Nlocal++;
+            }
+        }
+
+        // Advance to the next site: x fastest inside the sub-box, then y, then
+        // z; once the sub-box is exhausted move to the next sub-box in x, y, z.
+        sx++;
+        if(sx == subboxdim) { sx = 0; sy++; }
+        if(sy == subboxdim) { sy = 0; sz++; }
+        if(sz == subboxdim) { sz = 0; ox++; }
+        if(ox * subboxdim > ihi) { ox = 0; oy++; }
+        if(oy * subboxdim > jhi) { oy = 0; oz++; }
+    }
+}
+
+// Maps an element name to its integer type index. Only names that start with
+// "Ar" are recognised and map to 0; any other name is reported on stderr and
+// terminates the program with exit(-1).
+int type_str2int(const char *type) {
+    const int is_argon = (strncmp(type, "Ar", 2) == 0);
+
+    if(!is_argon) {
+        fprintf(stderr, "Invalid atom type: %s\n", type);
+        exit(-1);
+    }
+
+    return 0; // Argon
+}
+
+// Dispatches to the reader matching the extension of param->input_file
+// (".pdb", ".gro" or ".dmp") and returns that reader's result. Any other
+// file name, including one shorter than four characters, is reported on
+// stderr and terminates the program with exit(-1).
+int readAtom(Atom* atom, Parameter* param) {
+    const char *path = param->input_file;
+    size_t len = strlen(path);
+
+    // Only look at the last four characters when there are four to look at;
+    // otherwise &path[len - 4] would point before the start of the string.
+    if(len >= 4) {
+        const char *ext = &path[len - 4];
+        if(strncmp(ext, ".pdb", 4) == 0) { return readAtom_pdb(atom, param); }
+        if(strncmp(ext, ".gro", 4) == 0) { return readAtom_gro(atom, param); }
+        if(strncmp(ext, ".dmp", 4) == 0) { return readAtom_dmp(atom, param); }
+    }
+
+    fprintf(stderr, "Invalid input file extension: %s\nValid choices are: pdb, gro, dmp\n", param->input_file);
+    exit(-1);
+    return -1;
+}
+
+// Reads atoms from the PDB-style text file named by param->input_file.
+//
+// A CRYST1 record sets the box extents in `param` (lower bounds fixed at 0).
+// Each ATOM record supplies a 1-based id, a type name, and x/y/z coordinates;
+// velocities are set to zero. HEADER, REMARK, MODEL, TER and ENDMDL records
+// are ignored; any other record aborts the program. After parsing, the
+// ntypes x ntypes parameter tables are allocated and filled from `param`.
+// Returns the number of ATOM records read; exits with -1 if the file cannot
+// be opened or contains no atoms.
+//
+// NOTE(review): none of the strtok() results are checked for NULL, so a blank
+// or truncated line would pass NULL to strncmp()/atoi()/atof() - confirm the
+// input is always well-formed or add checks.
+// NOTE(review): the while(!feof(fp)) loop relies on what readline() leaves in
+// `line` once the end of the file is reached; readline() is not visible here.
+int readAtom_pdb(Atom* atom, Parameter* param) {
+    FILE *fp = fopen(param->input_file, "r");
+    char line[MAXLINE];
+    int read_atoms = 0;
+
+    if(!fp) {
+        fprintf(stderr, "Could not open input file: %s\n", param->input_file);
+        exit(-1);
+        return -1;
+    }
+
+    while(!feof(fp)) {
+        readline(line, fp);
+        // First space-separated token is the record name.
+        char *item = strtok(line, " ");
+        if(strncmp(item, "CRYST1", 6) == 0) {
+            param->xlo = 0.0;
+            param->xhi = atof(strtok(NULL, " "));
+            param->ylo = 0.0;
+            param->yhi = atof(strtok(NULL, " "));
+            param->zlo = 0.0;
+            param->zhi = atof(strtok(NULL, " "));
+            param->xprd = param->xhi - param->xlo;
+            param->yprd = param->yhi - param->ylo;
+            param->zprd = param->zhi - param->zlo;
+            // alpha, beta, gamma, sGroup, z
+        } else if(strncmp(item, "ATOM", 4) == 0) {
+            // label, comp_id, occupancy and charge are parsed only to advance
+            // the tokenizer; their values are not stored anywhere.
+            char *label;
+            int atom_id, comp_id;
+            MD_FLOAT occupancy, charge;
+            // File ids are 1-based; array slots are 0-based.
+            atom_id = atoi(strtok(NULL, " ")) - 1;
+
+            // Grow until slot atom_id fits.
+            while(atom_id + 1 >= atom->Nmax) {
+                growAtom(atom);
+            }
+
+            atom->type[atom_id] = type_str2int(strtok(NULL, " "));
+            label = strtok(NULL, " ");
+            comp_id = atoi(strtok(NULL, " "));
+            atom_x(atom_id) = atof(strtok(NULL, " "));
+            atom_y(atom_id) = atof(strtok(NULL, " "));
+            atom_z(atom_id) = atof(strtok(NULL, " "));
+            atom->vx[atom_id] = 0.0;
+            atom->vy[atom_id] = 0.0;
+            atom->vz[atom_id] = 0.0;
+            occupancy = atof(strtok(NULL, " "));
+            charge = atof(strtok(NULL, " "));
+            // ntypes must exceed the largest type index seen.
+            atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
+            atom->Natoms++;
+            atom->Nlocal++;
+            read_atoms++;
+        } else if(strncmp(item, "HEADER", 6) == 0 ||
+                  strncmp(item, "REMARK", 6) == 0 ||
+                  strncmp(item, "MODEL", 5) == 0 ||
+                  strncmp(item, "TER", 3) == 0 ||
+                  strncmp(item, "ENDMDL", 6) == 0) {
+            // Do nothing
+        } else {
+            fprintf(stderr, "Invalid item: %s\n", item);
+            exit(-1);
+            return -1;
+        }
+    }
+
+    if(!read_atoms) {
+        fprintf(stderr, "Input error: No atoms read!\n");
+        exit(-1);
+        return -1;
+    }
+
+    // Every type pair gets the same parameters; cutoffs are stored squared.
+    atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
+        atom->epsilon[i] = param->epsilon;
+        atom->sigma6[i] = param->sigma6;
+        atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
+        atom->cutforcesq[i] = param->cutforce * param->cutforce;
+    }
+
+    fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
+    fclose(fp);
+    return read_atoms;
+}
+
+// Reads atoms from the GRO-style text file named by param->input_file.
+//
+// Layout consumed: a description line, a line with the atom count, one line
+// per atom (label, type name, id, x y z, vx vy vz), and finally a line with
+// the three box lengths. Atoms are stored in file order. After parsing, the
+// ntypes x ntypes parameter tables are allocated and filled from `param`.
+// Returns the number of atoms read; exits with -1 if the file cannot be
+// opened or the number of atom lines does not match the announced count.
+//
+// NOTE(review): strtok() results are not checked for NULL, so a short line
+// would pass NULL to atoi()/atof()/type_str2int().
+int readAtom_gro(Atom* atom, Parameter* param) {
+    FILE *fp = fopen(param->input_file, "r");
+    char line[MAXLINE];
+    char desc[MAXLINE];
+    int read_atoms = 0;
+    int atoms_to_read = 0;
+    int i = 0;
+
+    if(!fp) {
+        fprintf(stderr, "Could not open input file: %s\n", param->input_file);
+        exit(-1);
+        return -1;
+    }
+
+    // Description line: cut it at the first newline.
+    // NOTE(review): this scan has no bound and assumes desc contains '\n'.
+    readline(desc, fp);
+    for(i = 0; desc[i] != '\n'; i++);
+    desc[i] = '\0';
+    readline(line, fp);
+    atoms_to_read = atoi(strtok(line, " "));
+    fprintf(stdout, "System: %s with %d atoms\n", desc, atoms_to_read);
+
+    while(!feof(fp) && read_atoms < atoms_to_read) {
+        readline(line, fp);
+        char *label = strtok(line, " ");
+        int type = type_str2int(strtok(NULL, " "));
+        // The id column is consumed to advance the tokenizer, but its value is
+        // immediately replaced by the running count.
+        int atom_id = atoi(strtok(NULL, " ")) - 1;
+        atom_id = read_atoms;
+        // Grow until slot atom_id fits.
+        while(atom_id + 1 >= atom->Nmax) {
+            growAtom(atom);
+        }
+
+        atom->type[atom_id] = type;
+        atom_x(atom_id) = atof(strtok(NULL, " "));
+        atom_y(atom_id) = atof(strtok(NULL, " "));
+        atom_z(atom_id) = atof(strtok(NULL, " "));
+        atom->vx[atom_id] = atof(strtok(NULL, " "));
+        atom->vy[atom_id] = atof(strtok(NULL, " "));
+        atom->vz[atom_id] = atof(strtok(NULL, " "));
+        // ntypes must exceed the largest type index seen.
+        atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
+        atom->Natoms++;
+        atom->Nlocal++;
+        read_atoms++;
+    }
+
+    // Line after the atoms: box lengths (lower bounds fixed at 0).
+    if(!feof(fp)) {
+        readline(line, fp);
+        param->xlo = 0.0;
+        param->xhi = atof(strtok(line, " "));
+        param->ylo = 0.0;
+        param->yhi = atof(strtok(NULL, " "));
+        param->zlo = 0.0;
+        param->zhi = atof(strtok(NULL, " "));
+        param->xprd = param->xhi - param->xlo;
+        param->yprd = param->yhi - param->ylo;
+        param->zprd = param->zhi - param->zlo;
+    }
+
+    if(read_atoms != atoms_to_read) {
+        fprintf(stderr, "Input error: Number of atoms read do not match (%d/%d).\n", read_atoms, atoms_to_read);
+        exit(-1);
+        return -1;
+    }
+
+    // Every type pair gets the same parameters; cutoffs are stored squared.
+    atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
+        atom->epsilon[i] = param->epsilon;
+        atom->sigma6[i] = param->sigma6;
+        atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
+        atom->cutforcesq[i] = param->cutforce * param->cutforce;
+    }
+
+    fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
+    fclose(fp);
+    return read_atoms;
+}
+
+// Reads atoms from the text dump named by param->input_file. The file is
+// expected to consist of "ITEM: ..." headers, each followed by its data (the
+// layout looks like a LAMMPS text dump - confirm against the producer):
+//   ITEM: TIMESTEP            -> one line with the time step
+//   ITEM: NUMBER OF ATOMS     -> one line with the atom count
+//   ITEM: BOX BOUNDS pp pp pp -> three lines "lo hi" for x, y, z
+//   ITEM: ATOMS id type x y z vx vy vz -> one line per atom
+// Atoms are stored at slot (id - 1). Parsing stops after the first ATOMS
+// section. After parsing, the ntypes x ntypes parameter tables are allocated
+// and filled from `param`.
+//
+// Returns the atom count announced by the file. Exits with -1 if the file
+// cannot be opened, contains an unknown item, holds an atom id outside
+// 1..natoms, or does not provide time step, atom count and atom data.
+int readAtom_dmp(Atom* atom, Parameter* param) {
+    FILE *fp = fopen(param->input_file, "r");
+    char line[MAXLINE];
+    int natoms = 0;
+    int read_atoms = 0;
+    int atom_id = -1;
+    int ts = -1;
+
+    if(!fp) {
+        fprintf(stderr, "Could not open input file: %s\n", param->input_file);
+        exit(-1);
+        return -1;
+    }
+
+    while(!feof(fp) && ts < 1 && !read_atoms) {
+        readline(line, fp);
+        if(strncmp(line, "ITEM: ", 6) == 0) {
+            char *item = &line[6];
+
+            if(strncmp(item, "TIMESTEP", 8) == 0) {
+                readline(line, fp);
+                ts = atoi(line);
+            } else if(strncmp(item, "NUMBER OF ATOMS", 15) == 0) {
+                readline(line, fp);
+                natoms = atoi(line);
+                atom->Natoms = natoms;
+                atom->Nlocal = natoms;
+                // Reserve room for all announced atoms before any are read.
+                while(atom->Nlocal >= atom->Nmax) {
+                    growAtom(atom);
+                }
+            } else if(strncmp(item, "BOX BOUNDS pp pp pp", 19) == 0) {
+                readline(line, fp);
+                param->xlo = atof(strtok(line, " "));
+                param->xhi = atof(strtok(NULL, " "));
+                param->xprd = param->xhi - param->xlo;
+
+                readline(line, fp);
+                param->ylo = atof(strtok(line, " "));
+                param->yhi = atof(strtok(NULL, " "));
+                param->yprd = param->yhi - param->ylo;
+
+                readline(line, fp);
+                param->zlo = atof(strtok(line, " "));
+                param->zhi = atof(strtok(NULL, " "));
+                param->zprd = param->zhi - param->zlo;
+            } else if(strncmp(item, "ATOMS id type x y z vx vy vz", 28) == 0) {
+                for(int i = 0; i < natoms; i++) {
+                    readline(line, fp);
+                    // File ids are 1-based; array slots are 0-based.
+                    atom_id = atoi(strtok(line, " ")) - 1;
+
+                    // The id comes straight from the file and is used as an
+                    // array index, so reject anything outside the natoms slots
+                    // reserved above instead of writing out of bounds.
+                    if(atom_id < 0 || atom_id >= natoms) {
+                        fprintf(stderr, "Input error: atom id %d is out of range (1-%d).\n", atom_id + 1, natoms);
+                        exit(-1);
+                        return -1;
+                    }
+
+                    atom->type[atom_id] = atoi(strtok(NULL, " "));
+                    atom_x(atom_id) = atof(strtok(NULL, " "));
+                    atom_y(atom_id) = atof(strtok(NULL, " "));
+                    atom_z(atom_id) = atof(strtok(NULL, " "));
+                    atom->vx[atom_id] = atof(strtok(NULL, " "));
+                    atom->vy[atom_id] = atof(strtok(NULL, " "));
+                    atom->vz[atom_id] = atof(strtok(NULL, " "));
+                    // ntypes must exceed the largest stored type index, as in
+                    // the pdb and gro readers, so that the index fits into the
+                    // ntypes x ntypes tables allocated below.
+                    atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
+                    read_atoms++;
+                }
+            } else {
+                fprintf(stderr, "Invalid item: %s\n", item);
+                exit(-1);
+                return -1;
+            }
+        } else {
+            fprintf(stderr, "Invalid input from file, expected item reference but got:\n%s\n", line);
+            exit(-1);
+            return -1;
+        }
+    }
+
+    if(ts < 0 || !natoms || !read_atoms) {
+        fprintf(stderr, "Input error: atom data was not read!\n");
+        exit(-1);
+        return -1;
+    }
+
+    // Every type pair gets the same parameters; cutoffs are stored squared.
+    atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
+        atom->epsilon[i] = param->epsilon;
+        atom->sigma6[i] = param->sigma6;
+        atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
+        atom->cutforcesq[i] = param->cutforce * param->cutforce;
+    }
+
+    fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
+    fclose(fp);
+    return natoms;
+}
+
+// Allocates and fills the lookup tables stored in `atom` for the cluster-pair
+// kernels: exclusion_filter, diagonal_4xn_j_minus_i, diagonal_2xnn_j_minus_i
+// and the four mask tables masks_{2xnn,4xn}_{hn,fn}.
+//
+// The mask tables are indexed by one condition flag (cond0) when
+// CLUSTER_M == CLUSTER_N and by two flags (cond0, cond1) otherwise. With all
+// flags at 0 every entry is the all-ones pattern of that variant (0xf, 0xff
+// or 0x3); each flag set to 1 subtracts a fixed bit pattern. For the 2xnn
+// tables two such masks are packed into one entry, the second one shifted
+// left by half the vector width.
+// (hn / fn presumably stand for half / full neighbor lists - confirm.)
+void initMasks(Atom *atom) {
+    const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
+    unsigned int mask0, mask1, mask2, mask3;
+
+    atom->exclusion_filter = allocate(ALIGNMENT, CLUSTER_M * VECTOR_WIDTH * sizeof(MD_UINT));
+    atom->diagonal_4xn_j_minus_i = allocate(ALIGNMENT, MAX(CLUSTER_M, VECTOR_WIDTH) * sizeof(MD_UINT));
+    atom->diagonal_2xnn_j_minus_i = allocate(ALIGNMENT, VECTOR_WIDTH * sizeof(MD_UINT));
+
+    for(int j = 0; j < MAX(CLUSTER_M, VECTOR_WIDTH); j++) {
+        atom->diagonal_4xn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5;
+    }
+
+    // The upper half repeats the lower half shifted down by one.
+    for(int j = 0; j < VECTOR_WIDTH / 2; j++) {
+        atom->diagonal_2xnn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5;
+        atom->diagonal_2xnn_j_minus_i[VECTOR_WIDTH / 2 + j] = (MD_FLOAT)(j - 1) - 0.5;
+    }
+
+    // One single-bit value per entry.
+    // NOTE(review): 1U << i is only defined while i is below the width of
+    // unsigned int - confirm CLUSTER_M * VECTOR_WIDTH never exceeds it.
+    for(int i = 0; i < CLUSTER_M * VECTOR_WIDTH; i++) {
+        atom->exclusion_filter[i] = (1U << i);
+    }
+
+    #if CLUSTER_M == CLUSTER_N
+    for(unsigned int cond0 = 0; cond0 < 2; cond0++) {
+        mask0 = (unsigned int)(0xf - 0x1 * cond0);
+        mask1 = (unsigned int)(0xf - 0x3 * cond0);
+        mask2 = (unsigned int)(0xf - 0x7 * cond0);
+        mask3 = (unsigned int)(0xf - 0xf * cond0);
+        atom->masks_2xnn_hn[cond0 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
+        atom->masks_2xnn_hn[cond0 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
+
+        mask0 = (unsigned int)(0xf - 0x1 * cond0);
+        mask1 = (unsigned int)(0xf - 0x2 * cond0);
+        mask2 = (unsigned int)(0xf - 0x4 * cond0);
+        mask3 = (unsigned int)(0xf - 0x8 * cond0);
+        atom->masks_2xnn_fn[cond0 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
+        atom->masks_2xnn_fn[cond0 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
+
+        atom->masks_4xn_hn[cond0 * 4 + 0] = (unsigned int)(0xf - 0x1 * cond0);
+        atom->masks_4xn_hn[cond0 * 4 + 1] = (unsigned int)(0xf - 0x3 * cond0);
+        atom->masks_4xn_hn[cond0 * 4 + 2] = (unsigned int)(0xf - 0x7 * cond0);
+        atom->masks_4xn_hn[cond0 * 4 + 3] = (unsigned int)(0xf - 0xf * cond0);
+
+        atom->masks_4xn_fn[cond0 * 4 + 0] = (unsigned int)(0xf - 0x1 * cond0);
+        atom->masks_4xn_fn[cond0 * 4 + 1] = (unsigned int)(0xf - 0x2 * cond0);
+        atom->masks_4xn_fn[cond0 * 4 + 2] = (unsigned int)(0xf - 0x4 * cond0);
+        atom->masks_4xn_fn[cond0 * 4 + 3] = (unsigned int)(0xf - 0x8 * cond0);
+    }
+    #else
+    for(unsigned int cond0 = 0; cond0 < 2; cond0++) {
+        for(unsigned int cond1 = 0; cond1 < 2; cond1++) {
+            #if CLUSTER_M < CLUSTER_N
+            mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
+            mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
+            mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
+            mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
+            #else
+            mask0 = (unsigned int)(0x3 - 0x1 * cond0);
+            mask1 = (unsigned int)(0x3 - 0x3 * cond0);
+            mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1);
+            mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1);
+            #endif
+
+            atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
+            atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
+
+            #if CLUSTER_M < CLUSTER_N
+            mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
+            mask1 = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
+            mask2 = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
+            mask3 = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
+            #else
+            mask0 = (unsigned int)(0x3 - 0x1 * cond0);
+            mask1 = (unsigned int)(0x3 - 0x2 * cond0);
+            mask2 = (unsigned int)(0x3 - 0x1 * cond1);
+            mask3 = (unsigned int)(0x3 - 0x2 * cond1);
+            #endif
+
+            atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
+            atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
+
+            #if CLUSTER_M < CLUSTER_N
+            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
+            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
+            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
+            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
+
+            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
+            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
+            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
+            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
+            #else
+            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x1 * cond0);
+            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0x3 - 0x3 * cond0);
+            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0x3 - 0x3 * cond0 - 0x1 * cond1);
+            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0x3 - 0x3 * cond0 - 0x3 * cond1);
+
+            // Four distinct slots, using the same four values as the 2xnn_fn
+            // masks above; writing all of them to slot 0 would leave slots
+            // 1-3 unset and keep only the last value.
+            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x1 * cond0);
+            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0x3 - 0x2 * cond0);
+            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0x3 - 0x1 * cond1);
+            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0x3 - 0x2 * cond1);
+            #endif
+        }
+    }
+    #endif
+}
+
+// Raises atom->Nmax by DELTA and resizes every per-atom array to the new
+// capacity through reallocate(), passing both the new and the previous size
+// in bytes. With AOS defined, x alone holds three values per atom; otherwise
+// x, y and z are separate arrays.
+void growAtom(Atom *atom) {
+    const int prev_max = atom->Nmax;
+    atom->Nmax += DELTA;
+
+    const size_t old_float_bytes = prev_max * sizeof(MD_FLOAT);
+    const size_t new_float_bytes = atom->Nmax * sizeof(MD_FLOAT);
+    const size_t old_int_bytes = prev_max * sizeof(int);
+    const size_t new_int_bytes = atom->Nmax * sizeof(int);
+
+    // Positions
+    #ifdef AOS
+    atom->x = (MD_FLOAT*) reallocate(atom->x, ALIGNMENT, new_float_bytes * 3, old_float_bytes * 3);
+    #else
+    atom->x = (MD_FLOAT*) reallocate(atom->x, ALIGNMENT, new_float_bytes, old_float_bytes);
+    atom->y = (MD_FLOAT*) reallocate(atom->y, ALIGNMENT, new_float_bytes, old_float_bytes);
+    atom->z = (MD_FLOAT*) reallocate(atom->z, ALIGNMENT, new_float_bytes, old_float_bytes);
+    #endif
+
+    // Velocities
+    atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, new_float_bytes, old_float_bytes);
+    atom->vy = (MD_FLOAT*) reallocate(atom->vy, ALIGNMENT, new_float_bytes, old_float_bytes);
+    atom->vz = (MD_FLOAT*) reallocate(atom->vz, ALIGNMENT, new_float_bytes, old_float_bytes);
+
+    // Types
+    atom->type = (int *) reallocate(atom->type, ALIGNMENT, new_int_bytes, old_int_bytes);
+}
+
+// Raises atom->Nclusters_max by DELTA and resizes every per-cluster array to
+// the new capacity through reallocate(), passing both the new and the
+// previous size in bytes.
+void growClusters(Atom *atom) {
+    const int prev_max = atom->Nclusters_max;
+    // If M>N, we need to allocate more j-clusters
+    const int jterm = MAX(1, CLUSTER_M / CLUSTER_N);
+    atom->Nclusters_max += DELTA;
+
+    // Cluster descriptors and bin indices
+    atom->iclusters = (Cluster*) reallocate(
+        atom->iclusters, ALIGNMENT,
+        atom->Nclusters_max * sizeof(Cluster),
+        prev_max * sizeof(Cluster));
+    atom->jclusters = (Cluster*) reallocate(
+        atom->jclusters, ALIGNMENT,
+        atom->Nclusters_max * jterm * sizeof(Cluster),
+        prev_max * jterm * sizeof(Cluster));
+    atom->icluster_bin = (int*) reallocate(
+        atom->icluster_bin, ALIGNMENT,
+        atom->Nclusters_max * sizeof(int),
+        prev_max * sizeof(int));
+
+    // Position, force and velocity storage: CLUSTER_M * 3 values per cluster
+    const size_t old_vec_bytes = prev_max * CLUSTER_M * 3 * sizeof(MD_FLOAT);
+    const size_t new_vec_bytes = atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT);
+    atom->cl_x = (MD_FLOAT*) reallocate(atom->cl_x, ALIGNMENT, new_vec_bytes, old_vec_bytes);
+    atom->cl_f = (MD_FLOAT*) reallocate(atom->cl_f, ALIGNMENT, new_vec_bytes, old_vec_bytes);
+    atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, new_vec_bytes, old_vec_bytes);
+
+    // Types: CLUSTER_M values per cluster
+    atom->cl_type = (int*) reallocate(
+        atom->cl_type, ALIGNMENT,
+        atom->Nclusters_max * CLUSTER_M * sizeof(int),
+        prev_max * CLUSTER_M * sizeof(int));
+}
--- a/clusterpair/cuda/force_lj.cu
+++ b/clusterpair/cuda/force_lj.cu
@@ -0,0 +1,317 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+extern "C" {
+
+#include <stdio.h>
+//---
+#include <cuda.h>
+#include <driver_types.h>
+//---
+#include <likwid-marker.h>
+//---
+#include <atom.h>
+#include <device.h>
+#include <neighbor.h>
+#include <parameter.h>
+#include <stats.h>
+#include <timing.h>
+#include <util.h>
+
+}
+
+// File-scope state used by the host-side helpers in this file, declared with
+// C linkage.
+extern "C" {
+    // Cluster positions, velocities and forces; obtained from allocateGPU()
+    // in initDevice().
+    MD_FLOAT *cuda_cl_x;
+    MD_FLOAT *cuda_cl_v;
+    MD_FLOAT *cuda_cl_f;
+    // Neighbor list entries and per-cluster neighbor counts; obtained from
+    // allocateGPU() in initDevice().
+    int *cuda_neighbors;
+    int *cuda_numneigh;
+    // Per-cluster atom counts; filled from `natoms` in copyDataToCUDADevice().
+    int *cuda_natoms;
+    // Host staging arrays (malloc'ed in initDevice()): atom counts of the
+    // local i-clusters and of the ghost j-clusters.
+    int *natoms;
+    int *ngatoms;
+    // Per-ghost-cluster data copied in copyDataToCUDADevice(): source cluster
+    // index and atom count.
+    int *cuda_border_map;
+    int *cuda_jclusters_natoms;
+    // Not referenced in the part of the file visible here; presumably
+    // bounding-box extents - confirm.
+    MD_FLOAT *cuda_bbminx, *cuda_bbmaxx;
+    MD_FLOAT *cuda_bbminy, *cuda_bbmaxy;
+    MD_FLOAT *cuda_bbminz, *cuda_bbmaxz;
+    // Per-ghost-cluster multipliers applied to the box lengths in
+    // cudaUpdatePbc_warp().
+    int *cuda_PBCx, *cuda_PBCy, *cuda_PBCz;
+    // Set to 1 in initDevice().
+    int isReneighboured;
+}
+
+// Resets and selects device 0, then allocates all file-scope buffers sized
+// from atom->Nclusters_max (and neighbor->maxneighs for the neighbor list):
+// the cuda_* buffers through allocateGPU() and the two host staging arrays
+// through malloc(). Finally sets isReneighboured to 1.
+extern "C"
+void initDevice(Atom *atom, Neighbor *neighbor) {
+    cuda_assert("cudaDeviceSetup", cudaDeviceReset());
+    cuda_assert("cudaDeviceSetup", cudaSetDevice(0));
+
+    // CLUSTER_M * 3 values per cluster
+    const size_t cl_vec_bytes = atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT);
+    // One int per cluster
+    const size_t cl_int_bytes = atom->Nclusters_max * sizeof(int);
+    // maxneighs ints per cluster
+    const size_t neigh_bytes = atom->Nclusters_max * neighbor->maxneighs * sizeof(int);
+
+    cuda_cl_x = (MD_FLOAT *) allocateGPU(cl_vec_bytes);
+    cuda_cl_v = (MD_FLOAT *) allocateGPU(cl_vec_bytes);
+    cuda_cl_f = (MD_FLOAT *) allocateGPU(cl_vec_bytes);
+
+    cuda_natoms = (int *) allocateGPU(cl_int_bytes);
+    cuda_jclusters_natoms = (int *) allocateGPU(cl_int_bytes);
+    cuda_border_map = (int *) allocateGPU(cl_int_bytes);
+    cuda_PBCx = (int *) allocateGPU(cl_int_bytes);
+    cuda_PBCy = (int *) allocateGPU(cl_int_bytes);
+    cuda_PBCz = (int *) allocateGPU(cl_int_bytes);
+    cuda_numneigh = (int *) allocateGPU(cl_int_bytes);
+    cuda_neighbors = (int *) allocateGPU(neigh_bytes);
+
+    natoms = (int *) malloc(cl_int_bytes);
+    ngatoms = (int *) malloc(cl_int_bytes);
+
+    isReneighboured = 1;
+}
+
+// Copies the cluster data held in `atom` into the cuda_* buffers with
+// memcpyToGPU(): positions, velocities and forces for Nclusters_max
+// clusters, the atom counts of the local i-clusters, and, for the ghost
+// clusters, atom counts, border map and the three PBC arrays.
+extern "C"
+void copyDataToCUDADevice(Atom *atom) {
+    const size_t cl_vec_bytes = atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT);
+    memcpyToGPU(cuda_cl_x, atom->cl_x, cl_vec_bytes);
+    memcpyToGPU(cuda_cl_v, atom->cl_v, cl_vec_bytes);
+    memcpyToGPU(cuda_cl_f, atom->cl_f, cl_vec_bytes);
+
+    // Stage the atom count of every local i-cluster, then upload it.
+    const int nlocal = atom->Nclusters_local;
+    for(int c = 0; c < nlocal; c++) {
+        natoms[c] = atom->iclusters[c].natoms;
+    }
+    memcpyToGPU(cuda_natoms, natoms, nlocal * sizeof(int));
+
+    // Ghost j-clusters are stored after the local j-clusters; the number of
+    // local j-clusters is Nclusters_local / jfac.
+    const int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
+    const int first_ghost_cj = nlocal / jfac;
+    const int nghost = atom->Nclusters_ghost;
+    for(int g = 0; g < nghost; g++) {
+        ngatoms[g] = atom->jclusters[first_ghost_cj + g].natoms;
+    }
+
+    const size_t ghost_int_bytes = nghost * sizeof(int);
+    memcpyToGPU(cuda_jclusters_natoms, ngatoms, ghost_int_bytes);
+    memcpyToGPU(cuda_border_map, atom->border_map, ghost_int_bytes);
+    memcpyToGPU(cuda_PBCx, atom->PBCx, ghost_int_bytes);
+    memcpyToGPU(cuda_PBCy, atom->PBCy, ghost_int_bytes);
+    memcpyToGPU(cuda_PBCz, atom->PBCz, ghost_int_bytes);
+}
+
+// Copies cluster positions, velocities and forces for Nclusters_max clusters
+// from the cuda_* buffers back into `atom` with memcpyFromGPU().
+extern "C"
+void copyDataFromCUDADevice(Atom *atom) {
+    const size_t cl_vec_bytes = atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT);
+
+    memcpyFromGPU(atom->cl_x, cuda_cl_x, cl_vec_bytes);
+    memcpyFromGPU(atom->cl_v, cuda_cl_v, cl_vec_bytes);
+    memcpyFromGPU(atom->cl_f, cuda_cl_f, cl_vec_bytes);
+}
+
+// Releases the buffers set up by initDevice(): each cuda_* buffer through
+// cudaFree() (checked with cuda_assert) and the two host staging arrays
+// through free().
+extern "C"
+void cudaDeviceFree() {
+    // Listed in the order in which they are released.
+    void *device_buffers[] = {
+        cuda_cl_x,
+        cuda_cl_v,
+        cuda_cl_f,
+        cuda_numneigh,
+        cuda_neighbors,
+        cuda_natoms,
+        cuda_border_map,
+        cuda_jclusters_natoms,
+        cuda_PBCx,
+        cuda_PBCy,
+        cuda_PBCz
+    };
+    const int nbuffers = (int) (sizeof(device_buffers) / sizeof(device_buffers[0]));
+
+    for(int b = 0; b < nbuffers; b++) {
+        cuda_assert("cudaDeviceFree", cudaFree(device_buffers[b]));
+    }
+
+    free(natoms);
+    free(ngatoms);
+}
+
+// First integration step. Each thread handles the i-cluster selected by its
+// global x index and, for every atom in that cluster, first advances the
+// velocity by dtforce * force and then the position by dt * velocity.
+// Threads whose index is not below Nclusters_local do nothing.
+__global__ void cudaInitialIntegrate_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
+                                         int *cuda_natoms,
+                                         int Nclusters_local, MD_FLOAT dtforce, MD_FLOAT dt) {
+
+    unsigned int cluster = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (cluster < Nclusters_local) {
+        int base = CI_VECTOR_BASE_INDEX(cluster);
+        MD_FLOAT *pos = cuda_cl_x + base;
+        MD_FLOAT *vel = cuda_cl_v + base;
+        MD_FLOAT *frc = cuda_cl_f + base;
+
+        for (int a = 0; a < cuda_natoms[cluster]; a++) {
+            // Velocity update for all three components ...
+            vel[CL_X_OFFSET + a] += dtforce * frc[CL_X_OFFSET + a];
+            vel[CL_Y_OFFSET + a] += dtforce * frc[CL_Y_OFFSET + a];
+            vel[CL_Z_OFFSET + a] += dtforce * frc[CL_Z_OFFSET + a];
+            // ... followed by the position update with the new velocity.
+            pos[CL_X_OFFSET + a] += dt * vel[CL_X_OFFSET + a];
+            pos[CL_Y_OFFSET + a] += dt * vel[CL_Y_OFFSET + a];
+            pos[CL_Z_OFFSET + a] += dt * vel[CL_Z_OFFSET + a];
+        }
+    }
+}
+
+// Refreshes ghost cluster positions. Each thread handles the ghost cluster
+// selected by its global x index: for every atom of that ghost cluster the
+// position is set to the position of the matching atom in the cluster named
+// by cuda_border_map, shifted by cuda_PBC{x,y,z} times the box length in
+// that dimension. Threads whose index is not below Nclusters_ghost do nothing.
+__global__ void cudaUpdatePbc_warp(MD_FLOAT *cuda_cl_x, int *cuda_border_map,
+                                   int *cuda_jclusters_natoms,
+                                   int *cuda_PBCx,
+                                   int *cuda_PBCy,
+                                   int *cuda_PBCz,
+                                   int Nclusters_local,
+                                   int Nclusters_ghost,
+                                   MD_FLOAT param_xprd,
+                                   MD_FLOAT param_yprd,
+                                   MD_FLOAT param_zprd) {
+    unsigned int ghost = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (ghost < Nclusters_ghost) {
+        // Ghost j-clusters follow the Nclusters_local / jfac local j-clusters.
+        int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
+        const int cj = Nclusters_local / jfac + ghost;
+
+        int dst_base = CJ_VECTOR_BASE_INDEX(cj);
+        int src_base = CJ_VECTOR_BASE_INDEX(cuda_border_map[ghost]);
+        MD_FLOAT *dst = cuda_cl_x + dst_base;
+        MD_FLOAT *src = cuda_cl_x + src_base;
+
+        for(int a = 0; a < cuda_jclusters_natoms[ghost]; a++) {
+            dst[CL_X_OFFSET + a] = src[CL_X_OFFSET + a] + cuda_PBCx[ghost] * param_xprd;
+            dst[CL_Y_OFFSET + a] = src[CL_Y_OFFSET + a] + cuda_PBCy[ghost] * param_yprd;
+            dst[CL_Z_OFFSET + a] = src[CL_Z_OFFSET + a] + cuda_PBCz[ghost] * param_zprd;
+        }
+    }
+}
+
+// Lennard-Jones force kernel over cluster pairs.
+//
+// Thread mapping: the global x index selects the i-cluster, the global y
+// index the atom slot inside the i-cluster, and the global z index the atom
+// slot inside each neighboring j-cluster. Each thread walks the neighbor
+// list of its i-cluster and, for every j-cluster, evaluates the single atom
+// pair (cii_pos, cjj_pos). Forces are accumulated into cuda_cl_f with
+// atomicAdd(); when half_neigh is non-zero the opposite force is also added
+// to the j atom.
+//
+// cutforcesq, sigma6 and epsilon are single scalars applied to every pair.
+// Nclusters_max is not used in the body.
+__global__ void computeForceLJ_cuda_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_f,
+                                         int Nclusters_local, int Nclusters_max,
+                                         int *cuda_numneigh, int *cuda_neighs, int half_neigh, int maxneighs,
+                                         MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon) {
+
+    unsigned int ci_pos = blockDim.x * blockIdx.x + threadIdx.x;
+    unsigned int cii_pos = blockDim.y * blockIdx.y + threadIdx.y;
+    unsigned int cjj_pos = blockDim.z * blockIdx.z + threadIdx.z;
+    // Threads outside the cluster / atom-slot ranges have nothing to do.
+    if ((ci_pos >= Nclusters_local) || (cii_pos >= CLUSTER_M) || (cjj_pos >= CLUSTER_N)) return;
+
+    // j-cluster index that corresponds to this i-cluster; used below to detect
+    // pairs inside the same cluster.
+    int ci_cj0 = CJ0_FROM_CI(ci_pos);
+    int ci_vec_base = CI_VECTOR_BASE_INDEX(ci_pos);
+    MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base];
+    MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];
+    int numneighs = cuda_numneigh[ci_pos];
+    for(int k = 0; k < numneighs; k++) {
+        // Neighbor lists are stored with a fixed stride of maxneighs per cluster.
+        int cj = (&cuda_neighs[ci_pos * maxneighs])[k];
+        int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
+        MD_FLOAT *cj_x = &cuda_cl_x[cj_vec_base];
+        MD_FLOAT *cj_f = &cuda_cl_f[cj_vec_base];
+
+        MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii_pos];
+        MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii_pos];
+        MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii_pos];
+        // Reset for every neighbor, so each atomicAdd below adds the
+        // contribution of this pair only.
+        MD_FLOAT fix = 0;
+        MD_FLOAT fiy = 0;
+        MD_FLOAT fiz = 0;
+
+        // Pairs in different clusters are always computed. Inside the same
+        // cluster, half lists keep only pairs with i slot < j slot and full
+        // lists skip only the atom paired with itself.
+        // NOTE(review): no branch assigns cond when CLUSTER_M > CLUSTER_N, so
+        // it would be read uninitialized in that configuration - confirm that
+        // configuration is never built for CUDA.
+        int cond;
+#if CLUSTER_M == CLUSTER_N
+        cond = half_neigh ? (ci_cj0 != cj || cii_pos < cjj_pos) :
+                            (ci_cj0 != cj || cii_pos != cjj_pos);
+#elif CLUSTER_M < CLUSTER_N
+        cond = half_neigh ? (ci_cj0 != cj || cii_pos + CLUSTER_M * (ci_pos & 0x1) < cjj_pos) :
+                            (ci_cj0 != cj || cii_pos + CLUSTER_M * (ci_pos & 0x1) != cjj_pos);
+#endif
+        if(cond) {
+            MD_FLOAT delx = xtmp - cj_x[CL_X_OFFSET + cjj_pos];
+            MD_FLOAT dely = ytmp - cj_x[CL_Y_OFFSET + cjj_pos];
+            MD_FLOAT delz = ztmp - cj_x[CL_Z_OFFSET + cjj_pos];
+            MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+            if(rsq < cutforcesq) {
+                // force = 48 * sr6 * (sr6 - 0.5) * sr2 * epsilon, with
+                // sr2 = 1 / r^2 and sr6 = sigma6 / r^6.
+                MD_FLOAT sr2 = 1.0 / rsq;
+                MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
+                MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
+
+                // Half lists visit each pair once, so the j atom receives the
+                // opposite force here.
+                if(half_neigh) {
+                    atomicAdd(&cj_f[CL_X_OFFSET + cjj_pos], -delx * force);
+                    atomicAdd(&cj_f[CL_Y_OFFSET + cjj_pos], -dely * force);
+                    atomicAdd(&cj_f[CL_Z_OFFSET + cjj_pos], -delz * force);
+                }
+
+                fix += delx * force;
+                fiy += dely * force;
+                fiz += delz * force;
+
+                // Threads that differ only in cjj_pos update the same i atom,
+                // hence the atomic accumulation.
+                atomicAdd(&ci_f[CL_X_OFFSET + cii_pos], fix);
+                atomicAdd(&ci_f[CL_Y_OFFSET + cii_pos], fiy);
+                atomicAdd(&ci_f[CL_Z_OFFSET + cii_pos], fiz);
+            }
+        }
+    }
+}
+
+/*
+ * Final-integration kernel: one thread per local cluster adds
+ * dtforce * force to the velocity of every atom stored in its cluster.
+ *
+ * cuda_cl_v        cluster-format velocities (updated in place)
+ * cuda_cl_f        cluster-format forces (read only)
+ * cuda_natoms      number of atoms per cluster, indexed by cluster
+ * Nclusters_local  number of clusters to process
+ * dtforce          factor applied to the force in the velocity update
+ */
+__global__ void cudaFinalIntegrate_warp(MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
+                                          int *cuda_natoms,
+                                          int Nclusters_local, MD_FLOAT dtforce) {
+
+    const unsigned int ci_pos = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // The grid may be larger than the number of clusters.
+    if (ci_pos < Nclusters_local) {
+        const int base = CI_VECTOR_BASE_INDEX(ci_pos);
+        MD_FLOAT *vel = cuda_cl_v + base;
+        const MD_FLOAT *frc = cuda_cl_f + base;
+        const int natoms = cuda_natoms[ci_pos];
+
+        for (int a = 0; a < natoms; ++a) {
+            vel[CL_X_OFFSET + a] += dtforce * frc[CL_X_OFFSET + a];
+            vel[CL_Y_OFFSET + a] += dtforce * frc[CL_Y_OFFSET + a];
+            vel[CL_Z_OFFSET + a] += dtforce * frc[CL_Z_OFFSET + a];
+        }
+    }
+}
+
+/*
+ * Host wrapper for the initial-integration kernel. Launches
+ * cudaInitialIntegrate_warp with 16 threads per block and enough blocks to
+ * cover all local clusters, then waits for the device and reports launch or
+ * execution errors through cuda_assert.
+ */
+extern "C"
+void cudaInitialIntegrate(Parameter *param, Atom *atom) {
+    const int threads_per_block = 16;
+    const int num_blocks = atom->Nclusters_local / threads_per_block + 1;
+    const dim3 block_size(threads_per_block, 1, 1);
+    const dim3 grid_size(num_blocks, 1, 1);
+
+    cudaInitialIntegrate_warp<<<grid_size, block_size>>>(
+        cuda_cl_x, cuda_cl_v, cuda_cl_f, cuda_natoms,
+        atom->Nclusters_local, param->dtforce, param->dt);
+
+    cuda_assert("cudaInitialIntegrate", cudaPeekAtLastError());
+    cuda_assert("cudaInitialIntegrate", cudaDeviceSynchronize());
+}
+
+/*
+ * Updates the coordinates of ghost clusters on the device, using the border
+ * mapping created in setupPbc.
+ *
+ * Launches cudaUpdatePbc_warp with 512 threads per block and enough blocks to
+ * give every ghost cluster its own thread, then waits for the device and
+ * reports launch or execution errors through cuda_assert.
+ *
+ * The third argument is not used. It is accepted so that the definition
+ * agrees with the three-argument prototype in pbc.h and with cpuUpdatePbc,
+ * which the updatePbc macro can resolve to instead of this function.
+ */
+extern "C"
+void cudaUpdatePbc(Atom *atom, Parameter *param, int /* unused */) {
+    const int threads_num = 512;
+    dim3 block_size = dim3(threads_num, 1, 1);
+    dim3 grid_size = dim3(atom->Nclusters_ghost/(threads_num)+1, 1, 1);
+    cudaUpdatePbc_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_border_map,
+                                       cuda_jclusters_natoms, cuda_PBCx, cuda_PBCy, cuda_PBCz,
+                                       atom->Nclusters_local, atom->Nclusters_ghost,
+                                       param->xprd, param->yprd, param->zprd);
+    cuda_assert("cudaUpdatePbc", cudaPeekAtLastError());
+    cuda_assert("cudaUpdatePbc", cudaDeviceSynchronize());
+}
+
+/*
+ * Host wrapper for the Lennard-Jones force kernel.
+ *
+ * Clears the device force array, re-uploads the neighbor lists if
+ * isReneighboured is set (and clears that flag), launches
+ * computeForceLJ_cuda_warp with one block of CLUSTER_M x CLUSTER_N threads
+ * per local cluster and waits for completion.
+ *
+ * Returns the wall-clock time (difference of two getTimeStamp() values)
+ * spent in the kernel launch and the following synchronisation. The stats
+ * argument is not used.
+ */
+extern "C"
+double computeForceLJ_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
+    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
+    MD_FLOAT sigma6 = param->sigma6;
+    MD_FLOAT epsilon = param->epsilon;
+
+    memsetGPU(cuda_cl_f, 0, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
+    if (isReneighboured) {
+        // The neighbor counts are contiguous on both sides, so a single
+        // transfer replaces one tiny copy per cluster.
+        if (atom->Nclusters_local > 0) {
+            memcpyToGPU(cuda_numneigh, neighbor->numneigh, atom->Nclusters_local * sizeof(int));
+        }
+
+        // Only the first numneigh[ci] entries of each row are copied, exactly
+        // as before; the rows themselves are maxneighs ints apart.
+        for(int ci = 0; ci < atom->Nclusters_local; ci++) {
+            memcpyToGPU(&cuda_neighbors[ci * neighbor->maxneighs], &neighbor->neighbors[ci * neighbor->maxneighs], neighbor->numneigh[ci] * sizeof(int));
+        }
+
+        isReneighboured = 0;
+    }
+
+    const int threads_num = 1;
+    dim3 block_size = dim3(threads_num, CLUSTER_M, CLUSTER_N);
+    dim3 grid_size = dim3(atom->Nclusters_local/threads_num+1, 1, 1);
+    double S = getTimeStamp();
+    LIKWID_MARKER_START("force");
+    computeForceLJ_cuda_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_cl_f,
+                                                        atom->Nclusters_local, atom->Nclusters_max,
+                                                        cuda_numneigh, cuda_neighbors,
+                                                        neighbor->half_neigh, neighbor->maxneighs, cutforcesq,
+                                                        sigma6, epsilon);
+    cuda_assert("computeForceLJ_cuda", cudaPeekAtLastError());
+    cuda_assert("computeForceLJ_cuda", cudaDeviceSynchronize());
+    LIKWID_MARKER_STOP("force");
+    double E = getTimeStamp();
+    return E-S;
+}
+
+/*
+ * Host wrapper for the final-integration kernel. Launches
+ * cudaFinalIntegrate_warp with 16 threads per block and enough blocks to
+ * cover all local clusters, then waits for the device and reports launch or
+ * execution errors through cuda_assert.
+ *
+ * The kernel's last parameter is the force factor of the velocity update, so
+ * param->dtforce is passed, matching cpuFinalIntegrate and the velocity
+ * update performed by cudaInitialIntegrate.
+ */
+extern "C"
+void cudaFinalIntegrate(Parameter *param, Atom *atom) {
+    const int threads_num = 16;
+    dim3 block_size = dim3(threads_num, 1, 1);
+    dim3 grid_size = dim3(atom->Nclusters_local/(threads_num)+1, 1, 1);
+    cudaFinalIntegrate_warp<<<grid_size, block_size>>>(cuda_cl_v, cuda_cl_f, cuda_natoms, atom->Nclusters_local, param->dtforce);
+    cuda_assert("cudaFinalIntegrate", cudaPeekAtLastError());
+    cuda_assert("cudaFinalIntegrate", cudaDeviceSynchronize());
+}
--- a/clusterpair/force_eam.c
+++ b/clusterpair/force_eam.c
@@ -0,0 +1,201 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <likwid-marker.h>
+#include <math.h>
+
+#include <allocate.h>
+#include <timing.h>
+#include <neighbor.h>
+#include <parameter.h>
+#include <atom.h>
+#include <stats.h>
+#include <eam.h>
+#include <util.h>
+
+/*
+ * Placeholder for the EAM force computation in the cluster-pair layout.
+ *
+ * The per-atom implementation is kept below, disabled, as a reference. At the
+ * moment the function computes no forces, reads none of its parameters and
+ * only returns the time elapsed between two getTimeStamp() calls.
+ *
+ * The LIKWID markers live inside the disabled code together with the loops
+ * they bracket, so that while the body is disabled no region is started
+ * without being stopped or stopped without having been started.
+ */
+double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbor, Stats *stats) {
+    /*
+    if(eam->nmax < atom->Nmax) {
+        eam->nmax = atom->Nmax;
+        if(eam->fp != NULL) { free(eam->fp); }
+        eam->fp = (MD_FLOAT *) allocate(ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT));
+    }
+
+    int Nlocal = atom->Nlocal;
+    int* neighs;
+    MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz; int ntypes = atom->ntypes; MD_FLOAT* fp = eam->fp;
+    MD_FLOAT* rhor_spline = eam->rhor_spline; MD_FLOAT* frho_spline = eam->frho_spline; MD_FLOAT* z2r_spline = eam->z2r_spline;
+    MD_FLOAT rdr = eam->rdr; int nr = eam->nr; int nr_tot = eam->nr_tot; MD_FLOAT rdrho = eam->rdrho;
+    int nrho = eam->nrho; int nrho_tot = eam->nrho_tot;
+    */
+    double S = getTimeStamp();
+
+    /*
+    LIKWID_MARKER_START("force_eam_fp");
+    #pragma omp parallel for
+    for(int i = 0; i < Nlocal; i++) {
+        neighs = &neighbor->neighbors[i * neighbor->maxneighs];
+        int numneighs = neighbor->numneigh[i];
+        MD_FLOAT xtmp = atom_x(i);
+        MD_FLOAT ytmp = atom_y(i);
+        MD_FLOAT ztmp = atom_z(i);
+        MD_FLOAT rhoi = 0;
+#ifdef EXPLICIT_TYPES
+        const int type_i = atom->type[i];
+#endif
+        #pragma ivdep
+        for(int k = 0; k < numneighs; k++) {
+            int j = neighs[k];
+            MD_FLOAT delx = xtmp - atom_x(j);
+            MD_FLOAT dely = ytmp - atom_y(j);
+            MD_FLOAT delz = ztmp - atom_z(j);
+            MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+#ifdef EXPLICIT_TYPES
+            const int type_j = atom->type[j];
+            const int type_ij = type_i * ntypes + type_j;
+            const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
+#else
+            const MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
+#endif
+            if(rsq < cutforcesq) {
+                MD_FLOAT p = sqrt(rsq) * rdr + 1.0;
+                int m = (int)(p);
+                m = m < nr - 1 ? m : nr - 1;
+                p -= m;
+                p = p < 1.0 ? p : 1.0;
+#ifdef EXPLICIT_TYPES
+                rhoi += ((rhor_spline[type_ij * nr_tot + m * 7 + 3] * p +
+                          rhor_spline[type_ij * nr_tot + m * 7 + 4]) * p +
+                          rhor_spline[type_ij * nr_tot + m * 7 + 5]) * p +
+                          rhor_spline[type_ij * nr_tot + m * 7 + 6];
+#else
+                rhoi += ((rhor_spline[m * 7 + 3] * p +
+                          rhor_spline[m * 7 + 4]) * p +
+                          rhor_spline[m * 7 + 5]) * p +
+                          rhor_spline[m * 7 + 6];
+#endif
+            }
+        }
+
+#ifdef EXPLICIT_TYPES
+        const int type_ii = type_i * type_i;
+#endif
+        MD_FLOAT p = 1.0 * rhoi * rdrho + 1.0;
+        int m = (int)(p);
+        m = MAX(1, MIN(m, nrho - 1));
+        p -= m;
+        p = MIN(p, 1.0);
+#ifdef EXPLICIT_TYPES
+        fp[i] = (frho_spline[type_ii * nrho_tot + m * 7 + 0] * p +
+                 frho_spline[type_ii * nrho_tot + m * 7 + 1]) * p +
+                 frho_spline[type_ii * nrho_tot + m * 7 + 2];
+#else
+        fp[i] = (frho_spline[m * 7 + 0] * p + frho_spline[m * 7 + 1]) * p + frho_spline[m * 7 + 2];
+#endif
+    }
+
+    LIKWID_MARKER_STOP("force_eam_fp");
+
+    // We still need to update fp for PBC atoms
+    for(int i = 0; i < atom->Nghost; i++) {
+        fp[Nlocal + i] = fp[atom->border_map[i]];
+    }
+
+    LIKWID_MARKER_START("force_eam");
+    for(int i = 0; i < Nlocal; i++) {
+        neighs = &neighbor->neighbors[i * neighbor->maxneighs];
+        int numneighs = neighbor->numneigh[i];
+        MD_FLOAT xtmp = atom_x(i);
+        MD_FLOAT ytmp = atom_y(i);
+        MD_FLOAT ztmp = atom_z(i);
+        MD_FLOAT fix = 0;
+        MD_FLOAT fiy = 0;
+        MD_FLOAT fiz = 0;
+#ifdef EXPLICIT_TYPES
+        const int type_i = atom->type[i];
+#endif
+
+        #pragma ivdep
+        for(int k = 0; k < numneighs; k++) {
+            int j = neighs[k];
+            MD_FLOAT delx = xtmp - atom_x(j);
+            MD_FLOAT dely = ytmp - atom_y(j);
+            MD_FLOAT delz = ztmp - atom_z(j);
+            MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+#ifdef EXPLICIT_TYPES
+            const int type_j = atom->type[j];
+            const int type_ij = type_i * ntypes + type_j;
+            const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
+#else
+            const MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
+#endif
+
+            if(rsq < cutforcesq) {
+                MD_FLOAT r = sqrt(rsq);
+                MD_FLOAT p = r * rdr + 1.0;
+                int m = (int)(p);
+                m = m < nr - 1 ? m : nr - 1;
+                p -= m;
+                p = p < 1.0 ? p : 1.0;
+
+
+                // rhoip = derivative of (density at atom j due to atom i)
+                // rhojp = derivative of (density at atom i due to atom j)
+                // phi = pair potential energy
+                // phip = phi'
+                // z2 = phi * r
+                // z2p = (phi * r)' = (phi' r) + phi
+                // psip needs both fp[i] and fp[j] terms since r_ij appears in two
+                //   terms of embed eng: Fi(sum rho_ij) and Fj(sum rho_ji)
+                //   hence embed' = Fi(sum rho_ij) rhojp + Fj(sum rho_ji) rhoip
+
+#ifdef EXPLICIT_TYPES
+                MD_FLOAT rhoip = (rhor_spline[type_ij * nr_tot + m * 7 + 0] * p +
+                                  rhor_spline[type_ij * nr_tot + m * 7 + 1]) * p +
+                                  rhor_spline[type_ij * nr_tot + m * 7 + 2];
+
+                MD_FLOAT z2p = (z2r_spline[type_ij * nr_tot + m * 7 + 0] * p +
+                                z2r_spline[type_ij * nr_tot + m * 7 + 1]) * p +
+                                z2r_spline[type_ij * nr_tot + m * 7 + 2];
+
+                MD_FLOAT z2 = ((z2r_spline[type_ij * nr_tot + m * 7 + 3] * p +
+                                z2r_spline[type_ij * nr_tot + m * 7 + 4]) * p +
+                                z2r_spline[type_ij * nr_tot + m * 7 + 5]) * p +
+                                z2r_spline[type_ij * nr_tot + m * 7 + 6];
+#else
+                MD_FLOAT rhoip = (rhor_spline[m * 7 + 0] * p + rhor_spline[m * 7 + 1]) * p + rhor_spline[m * 7 + 2];
+                MD_FLOAT z2p = (z2r_spline[m * 7 + 0] * p + z2r_spline[m * 7 + 1]) * p + z2r_spline[m * 7 + 2];
+                MD_FLOAT z2 = ((z2r_spline[m * 7 + 3] * p +
+                                z2r_spline[m * 7 + 4]) * p +
+                                z2r_spline[m * 7 + 5]) * p +
+                                z2r_spline[m * 7 + 6];
+#endif
+
+                MD_FLOAT recip = 1.0 / r;
+                MD_FLOAT phi = z2 * recip;
+                MD_FLOAT phip = z2p * recip - phi * recip;
+                MD_FLOAT psip = fp[i] * rhoip + fp[j] * rhoip + phip;
+                MD_FLOAT fpair = -psip * recip;
+
+                fix += delx * fpair;
+                fiy += dely * fpair;
+                fiz += delz * fpair;
+                //fpair *= 0.5;
+            }
+        }
+
+        fx[i] = fix;
+        fy[i] = fiy;
+        fz[i] = fiz;
+        addStat(stats->total_force_neighs, numneighs);
+        addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
+    }
+
+    LIKWID_MARKER_STOP("force_eam");
+    */
+    double E = getTimeStamp();
+    return E-S;
+}
--- a/clusterpair/force_lj.c
+++ b/clusterpair/force_lj.c
--- a/clusterpair/includes/atom.h
+++ b/clusterpair/includes/atom.h
@@ -0,0 +1,171 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <parameter.h>
+
+#ifndef __ATOM_H_
+#define __ATOM_H_
+
+#define DELTA 20000 // NOTE(review): not used in this header; presumably an array growth step - confirm at the use sites
+
+// Nbnxn layouts (as of GROMACS):
+// Simd4xN: M=4, N=VECTOR_WIDTH
+// Simd2xNN: M=4, N=(VECTOR_WIDTH/2)
+// Cuda: M=8, N=VECTOR_WIDTH
+
+#ifdef CUDA_TARGET // CUDA build: VECTOR_WIDTH forced to 8, 8x8 clusters, CUDA force/integrate/PBC entry points
+#   undef VECTOR_WIDTH
+#   define VECTOR_WIDTH             8
+#   define KERNEL_NAME              "CUDA"
+#   define CLUSTER_M                8
+#   define CLUSTER_N                VECTOR_WIDTH
+#   define UNROLL_J                 1
+#   define computeForceLJ           computeForceLJ_cuda
+#   define initialIntegrate         cudaInitialIntegrate
+#   define finalIntegrate           cudaFinalIntegrate
+#   define updatePbc                cudaUpdatePbc
+#else // CPU build: CLUSTER_M is 4, CLUSTER_N and the force kernel follow VECTOR_WIDTH
+#   define CLUSTER_M                4
+// Simd2xNN (here used for single-precision)
+#   if VECTOR_WIDTH > CLUSTER_M * 2
+#       define KERNEL_NAME          "Simd2xNN"
+#       define CLUSTER_N            (VECTOR_WIDTH / 2)
+#       define UNROLL_I             4
+#       define UNROLL_J             2
+#       define computeForceLJ       computeForceLJ_2xnn
+// Simd4xN
+#   else
+#       define KERNEL_NAME          "Simd4xN"
+#       define CLUSTER_N            VECTOR_WIDTH
+#       define UNROLL_I             4
+#       define UNROLL_J             1
+#       define computeForceLJ       computeForceLJ_4xn
+#   endif
+#   ifdef USE_REFERENCE_VERSION // replaces the SIMD kernel chosen above with the reference kernel
+#       undef KERNEL_NAME
+#       undef computeForceLJ
+#       define KERNEL_NAME          "Reference"
+#       define computeForceLJ       computeForceLJ_ref
+#   endif
+#   define initialIntegrate         cpuInitialIntegrate
+#   define finalIntegrate           cpuFinalIntegrate
+#   define updatePbc                cpuUpdatePbc
+#endif
+
+#if CLUSTER_M == CLUSTER_N // CJ*_FROM_CI map an i-cluster index to j-cluster indices; C*_BASE_INDEX(a,b) is the array offset of cluster a with b values per atom
+#   define CJ0_FROM_CI(a)           (a)
+#   define CJ1_FROM_CI(a)           (a)
+#   define CI_BASE_INDEX(a,b)       ((a) * CLUSTER_N * (b))
+#   define CJ_BASE_INDEX(a,b)       ((a) * CLUSTER_N * (b))
+#elif CLUSTER_M == CLUSTER_N * 2 // M > N
+#   define CJ0_FROM_CI(a)           ((a) << 1)
+#   define CJ1_FROM_CI(a)           (((a) << 1) | 0x1)
+#   define CI_BASE_INDEX(a,b)       ((a) * CLUSTER_M * (b))
+#   define CJ_BASE_INDEX(a,b)       (((a) >> 1) * CLUSTER_M * (b) + ((a) & 0x1) * (CLUSTER_M >> 1))
+#elif CLUSTER_M == CLUSTER_N / 2 // M < N
+#   define CJ0_FROM_CI(a)           ((a) >> 1)
+#   define CJ1_FROM_CI(a)           ((a) >> 1)
+#   define CI_BASE_INDEX(a,b)       (((a) >> 1) * CLUSTER_N * (b) + ((a) & 0x1) * (CLUSTER_N >> 1))
+#   define CJ_BASE_INDEX(a,b)       ((a) * CLUSTER_N * (b))
+#else
+#   error "Invalid cluster configuration!"
+#endif
+
+#if CLUSTER_N != 2 && CLUSTER_N != 4 && CLUSTER_N != 8
+#   error "Cluster N dimension can be only 2, 4 and 8"
+#endif
+
+#define CI_SCALAR_BASE_INDEX(a)     (CI_BASE_INDEX(a, 1)) // i-cluster offset, one value per atom
+#define CI_VECTOR_BASE_INDEX(a)     (CI_BASE_INDEX(a, 3)) // i-cluster offset, three values per atom
+#define CJ_SCALAR_BASE_INDEX(a)     (CJ_BASE_INDEX(a, 1)) // j-cluster offset, one value per atom
+#define CJ_VECTOR_BASE_INDEX(a)     (CJ_BASE_INDEX(a, 3)) // j-cluster offset, three values per atom
+
+#if CLUSTER_M >= CLUSTER_N // x, y and z of a cluster start at consecutive multiples of the larger cluster dimension
+#   define CL_X_OFFSET              (0 * CLUSTER_M)
+#   define CL_Y_OFFSET              (1 * CLUSTER_M)
+#   define CL_Z_OFFSET              (2 * CLUSTER_M)
+#else
+#   define CL_X_OFFSET              (0 * CLUSTER_N)
+#   define CL_Y_OFFSET              (1 * CLUSTER_N)
+#   define CL_Z_OFFSET              (2 * CLUSTER_N)
+#endif
+
+// Per-cluster record.
+typedef struct {
+    // Number of atoms stored in the cluster; bounds the per-atom loops.
+    int natoms;
+    // Minimum/maximum pair per axis (presumably the cluster's bounding box).
+    MD_FLOAT bbminx;
+    MD_FLOAT bbmaxx;
+    MD_FLOAT bbminy;
+    MD_FLOAT bbmaxy;
+    MD_FLOAT bbminz;
+    MD_FLOAT bbmaxz;
+} Cluster;
+
+// Simulation state: per-atom arrays, per-type parameter tables and the same
+// data regrouped into clusters.
+typedef struct {
+    // Atom counters (createAtom sets Natoms to 4 * nx * ny * nz).
+    int Natoms;
+    int Nlocal;
+    int Nghost;
+    int Nmax;
+    // Cluster counters; Nclusters_local bounds the integration loops.
+    int Nclusters;
+    int Nclusters_local;
+    int Nclusters_ghost;
+    int Nclusters_max;
+    // Per-atom positions, read through the atom_x/atom_y/atom_z macros.
+    MD_FLOAT *x;
+    MD_FLOAT *y;
+    MD_FLOAT *z;
+    // Per-atom velocities, read through the atom_vx/atom_vy/atom_vz macros.
+    MD_FLOAT *vx;
+    MD_FLOAT *vy;
+    MD_FLOAT *vz;
+    int *border_map;
+    int *type;
+    int ntypes;
+    // Tables allocated in createAtom with ntypes * ntypes entries each.
+    MD_FLOAT *epsilon;
+    MD_FLOAT *sigma6;
+    MD_FLOAT *cutforcesq;
+    MD_FLOAT *cutneighsq;
+    int *PBCx;
+    int *PBCy;
+    int *PBCz;
+    // Data in cluster format, addressed with the C*_VECTOR_BASE_INDEX and
+    // CL_*_OFFSET macros.
+    MD_FLOAT *cl_x;
+    MD_FLOAT *cl_v;
+    MD_FLOAT *cl_f;
+    int *cl_type;
+    Cluster *iclusters;
+    Cluster *jclusters;
+    int *icluster_bin;
+    int dummy_cj;
+    MD_UINT *exclusion_filter;
+    MD_FLOAT *diagonal_4xn_j_minus_i;
+    MD_FLOAT *diagonal_2xnn_j_minus_i;
+    // Mask tables for the 2xNN and 4xN kernels; the _hn/_fn suffixes
+    // presumably stand for half and full neighbor lists.
+    unsigned int masks_2xnn_hn[8];
+    unsigned int masks_2xnn_fn[8];
+    unsigned int masks_4xn_hn[16];
+    unsigned int masks_4xn_fn[16];
+} Atom;
+
+extern void initAtom(Atom*); // sets the pointers to NULL and the counters to 0, then calls initMasks
+extern void initMasks(Atom*);
+extern void createAtom(Atom*, Parameter*); // sets Natoms to 4 * nx * ny * nz and allocates the ntypes * ntypes parameter tables
+extern int readAtom(Atom*, Parameter*); // NOTE(review): meaning of the int result of the readers is not visible here
+extern int readAtom_pdb(Atom*, Parameter*); // presumably one reader per input format (pdb, gro, dmp) - confirm
+extern int readAtom_gro(Atom*, Parameter*);
+extern int readAtom_dmp(Atom*, Parameter*);
+extern void growAtom(Atom*);
+extern void growClusters(Atom*);
+
+#ifdef AOS // positions interleaved as x, y, z triples in atom->x; all accessors need a variable named atom in scope
+#   define POS_DATA_LAYOUT     "AoS"
+#   define atom_x(i)           atom->x[(i) * 3 + 0]
+#   define atom_y(i)           atom->x[(i) * 3 + 1]
+#   define atom_z(i)           atom->x[(i) * 3 + 2]
+/*
+#   define atom_vx(i)          atom->vx[(i) * 3 + 0]
+#   define atom_vy(i)          atom->vx[(i) * 3 + 1]
+#   define atom_vz(i)          atom->vx[(i) * 3 + 2]
+#   define atom_fx(i)          atom->fx[(i) * 3 + 0]
+#   define atom_fy(i)          atom->fx[(i) * 3 + 1]
+#   define atom_fz(i)          atom->fx[(i) * 3 + 2]
+*/
+#else // positions in the separate arrays atom->x, atom->y and atom->z
+#   define POS_DATA_LAYOUT     "SoA"
+#   define atom_x(i)           atom->x[i]
+#   define atom_y(i)           atom->y[i]
+#   define atom_z(i)           atom->z[i]
+#endif
+
+// TODO: allow to switch velocities and forces to AoS
+#   define atom_vx(i)          atom->vx[i]
+#   define atom_vy(i)          atom->vy[i]
+#   define atom_vz(i)          atom->vz[i]
+#   define atom_fx(i)          atom->fx[i] // NOTE(review): fx, fy and fz are not members of the Atom struct declared in this header
+#   define atom_fy(i)          atom->fy[i]
+#   define atom_fz(i)          atom->fz[i]
+
+#endif
--- a/clusterpair/includes/integrate.h
+++ b/clusterpair/includes/integrate.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdbool.h>
+//---
+#include <atom.h>
+#include <parameter.h>
+#include <util.h>
+
+/*
+ * First integration stage on the CPU. For every atom of every local cluster
+ * the velocity is advanced by dtforce * force and the position is then
+ * advanced by dt * (updated velocity).
+ */
+void cpuInitialIntegrate(Parameter *param, Atom *atom) {
+    DEBUG_MESSAGE("cpuInitialIntegrate start\n");
+
+    // Start of the x, y and z runs inside one cluster.
+    const int axis_offset[3] = { CL_X_OFFSET, CL_Y_OFFSET, CL_Z_OFFSET };
+
+    for(int ci = 0; ci < atom->Nclusters_local; ++ci) {
+        const int base = CI_VECTOR_BASE_INDEX(ci);
+        MD_FLOAT *pos = atom->cl_x + base;
+        MD_FLOAT *vel = atom->cl_v + base;
+        MD_FLOAT *frc = atom->cl_f + base;
+
+        for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
+            // All three velocity components are updated first ...
+            for(int d = 0; d < 3; ++d) {
+                const int idx = axis_offset[d] + cii;
+                vel[idx] += param->dtforce * frc[idx];
+            }
+            // ... and the positions then use the new velocities.
+            for(int d = 0; d < 3; ++d) {
+                const int idx = axis_offset[d] + cii;
+                pos[idx] += param->dt * vel[idx];
+            }
+        }
+    }
+
+    DEBUG_MESSAGE("cpuInitialIntegrate end\n");
+}
+
+/*
+ * Final integration stage on the CPU: the velocity of every atom of every
+ * local cluster is advanced by dtforce * force. Positions are not touched.
+ */
+void cpuFinalIntegrate(Parameter *param, Atom *atom) {
+    DEBUG_MESSAGE("cpuFinalIntegrate start\n");
+
+    // Start of the x, y and z runs inside one cluster.
+    const int axis_offset[3] = { CL_X_OFFSET, CL_Y_OFFSET, CL_Z_OFFSET };
+
+    for(int ci = 0; ci < atom->Nclusters_local; ++ci) {
+        const int base = CI_VECTOR_BASE_INDEX(ci);
+        MD_FLOAT *vel = atom->cl_v + base;
+        MD_FLOAT *frc = atom->cl_f + base;
+
+        for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
+            for(int d = 0; d < 3; ++d) {
+                const int idx = axis_offset[d] + cii;
+                vel[idx] += param->dtforce * frc[idx];
+            }
+        }
+    }
+
+    DEBUG_MESSAGE("cpuFinalIntegrate end\n");
+}
+
+#ifdef CUDA_TARGET // CUDA counterparts; atom.h maps initialIntegrate/finalIntegrate to them in CUDA builds
+void cudaInitialIntegrate(Parameter*, Atom*);
+void cudaFinalIntegrate(Parameter*, Atom*);
+#endif
--- a/clusterpair/includes/neighbor.h
+++ b/clusterpair/includes/neighbor.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <atom.h>
+#include <parameter.h>
+
+#ifndef __NEIGHBOR_H_
+#define __NEIGHBOR_H_
+// Interaction masks taken from GROMACS. Two things to keep in mind:
+//   1. Although GROMACS calls them "exclusion" masks, these are
+//      interaction masks (1 = interaction, 0 = no interaction).
+//   2. The bit order is inverted (possibly because of how AVX2/AVX512 masking is used),
+//      so read them from right to left (least significant to most significant bit).
+// The "all interactions" mask is the same for all kernels.
+#define NBNXN_INTERACTION_MASK_ALL 0xffffffffU
+// 4x4 kernel diagonal mask
+#define NBNXN_INTERACTION_MASK_DIAG 0x08ceU
+// 4x2 kernel diagonal masks
+#define NBNXN_INTERACTION_MASK_DIAG_J2_0 0x0002U
+#define NBNXN_INTERACTION_MASK_DIAG_J2_1 0x002fU
+// 4x8 kernel diagonal masks
+#define NBNXN_INTERACTION_MASK_DIAG_J8_0 0xf0f8fcfeU
+#define NBNXN_INTERACTION_MASK_DIAG_J8_1 0x0080c0e0U
+
+// Neighbor-list storage.
+typedef struct {
+    int every;
+    int ncalls;
+    // Row length of `neighbors`: the list of cluster ci starts at
+    // neighbors[ci * maxneighs].
+    int maxneighs;
+    // Number of valid entries in each cluster's row.
+    int *numneigh;
+    int *numneigh_masked;
+    // Non-zero selects the half-list variant of the force kernels.
+    int half_neigh;
+    // Neighbor cluster indices, one row of maxneighs entries per cluster.
+    int *neighbors;
+    unsigned int *neighbors_imask;
+} Neighbor;
+
+// Neighbor-list and cluster construction routines; the definitions are in
+// other translation units.
+extern void initNeighbor(Neighbor *neighbor, Parameter *param);
+extern void setupNeighbor(Parameter *param, Atom *atom);
+extern void binatoms(Atom *atom);
+extern void buildNeighbor(Atom *atom, Neighbor *neighbor);
+extern void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor);
+extern void sortAtom(Atom *atom);
+extern void buildClusters(Atom *atom);
+extern void defineJClusters(Atom *atom);
+extern void binClusters(Atom *atom);
+extern void updateSingleAtoms(Atom *atom);
+#endif
--- a/clusterpair/includes/pbc.h
+++ b/clusterpair/includes/pbc.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <atom.h>
+#include <parameter.h>
+
+#ifndef __PBC_H_
+#define __PBC_H_
+extern void initPbc(); // NOTE(review): empty parentheses leave the parameter list unspecified in C before C23
+extern void cpuUpdatePbc(Atom*, Parameter*, int);
+extern void updateAtomsPbc(Atom*, Parameter*);
+extern void setupPbc(Atom*, Parameter*);
+
+#ifdef CUDA_TARGET // CUDA variant; the updatePbc macro in atom.h selects between the cpu and cuda functions
+extern void cudaUpdatePbc(Atom*, Parameter*, int);
+#endif
+#endif
--- a/clusterpair/includes/stats.h
+++ b/clusterpair/includes/stats.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <atom.h>
+#include <parameter.h>
+
+#ifndef __STATS_H_
+#define __STATS_H_
+// Counters updated through the addStat macro when COMPUTE_STATS is defined.
+typedef struct {
+    long long calculated_forces;
+    long long num_neighs;
+    long long force_iters;
+    long long atoms_within_cutoff;
+    long long atoms_outside_cutoff;
+    long long clusters_within_cutoff;
+    long long clusters_outside_cutoff;
+} Stats;
+
+void initStats(Stats *s);
+void displayStatistics(Atom *atom, Parameter *param, Stats *stats, double *timer);
+
+#ifdef COMPUTE_STATS // statistics enabled: the three macros expand to real statements
+#   define addStat(stat, value)     stat += value; // the expansion already ends in ';'
+#   define beginStatTimer()         double Si = getTimeStamp(); // declares Si in the scope where the macro is used
+#   define endStatTimer(stat)       stat += getTimeStamp() - Si; // needs the Si declared by beginStatTimer() in scope
+#else // statistics disabled: all three macros expand to nothing
+#   define addStat(stat, value)
+#   define beginStatTimer()
+#   define endStatTimer(stat)
+#endif
+
+#endif
--- a/clusterpair/includes/tracing.h
+++ b/clusterpair/includes/tracing.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <neighbor.h>
+#include <parameter.h>
+#include <atom.h>
+
+#if defined(MEM_TRACER) || defined(INDEX_TRACER)
+#include <stdio.h>
+#include <stdlib.h>
+#endif
+
+#ifndef VECTOR_WIDTH // fallback used when VECTOR_WIDTH has not been defined before this point
+#   define VECTOR_WIDTH                 8
+#endif
+
+#ifndef TRACER_CONDITION // default: true on timesteps that are multiples of param->every; can be overridden before including
+#   define TRACER_CONDITION                 (!(timestep % param->every))
+#endif
+
+// Memory-access tracer, active only when MEM_TRACER is defined.
+//   MEM_TRACER_INIT      declares mem_tracer_fp and, when TRACER_CONDITION
+//                        holds, opens "mem_tracer_<timestep>.out" for writing
+//   MEM_TRACER_END       closes the file when TRACER_CONDITION holds
+//   MEM_TRACE(addr, op)  writes "<op>: <address of addr>" to the file
+// The macros need `timestep` (and whatever TRACER_CONDITION references) in
+// scope. Every line of MEM_TRACER_INIT except the last ends in a backslash so
+// that the closing brace is part of the macro.
+#ifdef MEM_TRACER
+#   define MEM_TRACER_INIT                  FILE *mem_tracer_fp; \
+                                            if(TRACER_CONDITION) { \
+                                                char mem_tracer_fn[128]; \
+                                                snprintf(mem_tracer_fn, sizeof mem_tracer_fn, "mem_tracer_%d.out", timestep); \
+                                                mem_tracer_fp = fopen(mem_tracer_fn, "w"); \
+                                            }
+
+#   define MEM_TRACER_END                   if(TRACER_CONDITION) { fclose(mem_tracer_fp); }
+#   define MEM_TRACE(addr, op)              if(TRACER_CONDITION) { fprintf(mem_tracer_fp, "%c: %p\n", op, (void *)(&(addr))); }
+#else
+#   define MEM_TRACER_INIT
+#   define MEM_TRACER_END
+#   define MEM_TRACE(addr, op)
+#endif
+
+#ifdef INDEX_TRACER /* Index tracer, active only when INDEX_TRACER is defined.
+ * Every macro does its work only when TRACER_CONDITION holds.
+ *   INDEX_TRACER_INIT       declares index_tracer_fp and opens
+ *                           "index_tracer_<timestep>.out" for writing
+ *   INDEX_TRACER_END        closes that file
+ *   INDEX_TRACE_NATOMS      writes one "N:" line with three integers
+ *   INDEX_TRACE_ATOM        writes one "A:" line with one integer
+ *   INDEX_TRACE(l, e)       writes the first e entries of l as "I:" lines of
+ *                           at most VECTOR_WIDTH values
+ *   DIST_TRACE_SORT(l, e)   sorts each chunk of at most VECTOR_WIDTH entries
+ *                           of l in place, in ascending order
+ *   DIST_TRACE(l, e)        for each chunk with more than one entry writes a
+ *                           "D:" line with the absolute differences between
+ *                           consecutive entries
+ */
+#   define INDEX_TRACER_INIT                FILE *index_tracer_fp; \
+                                            if(TRACER_CONDITION) { \
+                                                char index_tracer_fn[128]; \
+                                                snprintf(index_tracer_fn, sizeof index_tracer_fn, "index_tracer_%d.out", timestep); \
+                                                index_tracer_fp = fopen(index_tracer_fn, "w"); \
+                                            }
+
+#   define INDEX_TRACER_END                 if(TRACER_CONDITION) { fclose(index_tracer_fp); }
+#   define INDEX_TRACE_NATOMS(nl, ng, mn)   if(TRACER_CONDITION) { fprintf(index_tracer_fp, "N: %d %d %d\n", nl, ng, mn); }
+#   define INDEX_TRACE_ATOM(a)              if(TRACER_CONDITION) { fprintf(index_tracer_fp, "A: %d\n", a); }
+#   define INDEX_TRACE(l, e)                if(TRACER_CONDITION) { \
+                                                for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
+                                                    int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
+                                                    fprintf(index_tracer_fp, "I: "); \
+                                                    for(int __j = 0; __j < __e; ++__j) { \
+                                                        fprintf(index_tracer_fp, "%d ", l[__i + __j]); \
+                                                    } \
+                                                    fprintf(index_tracer_fp, "\n"); \
+                                                } \
+                                            }
+
+#   define DIST_TRACE_SORT(l, e)            if(TRACER_CONDITION) { \
+                                                for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
+                                                    int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
+                                                    if(__e > 1) { \
+                                                        for(int __j = __i; __j < __i + __e - 1; ++__j) { \
+                                                            for(int __k = __i; __k < __i + __e - (__j - __i) - 1; ++__k) { \
+                                                                if(l[__k] > l[__k + 1]) { \
+                                                                    int __t = l[__k]; \
+                                                                    l[__k] = l[__k + 1]; \
+                                                                    l[__k + 1] = __t; \
+                                                                } \
+                                                            } \
+                                                        } \
+                                                    } \
+                                                } \
+                                            }
+
+#   define DIST_TRACE(l, e)                 if(TRACER_CONDITION) { \
+                                                for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
+                                                    int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
+                                                    if(__e > 1) { \
+                                                        fprintf(index_tracer_fp, "D: "); \
+                                                        for(int __j = 0; __j < __e - 1; ++__j) { \
+                                                            int __dist = abs(l[__i + __j + 1] - l[__i + __j]); \
+                                                            fprintf(index_tracer_fp, "%d ", __dist); \
+                                                        } \
+                                                        fprintf(index_tracer_fp, "\n"); \
+                                                    } \
+                                                } \
+                                            }
+#else // tracer disabled: every macro expands to nothing
+#   define INDEX_TRACER_INIT
+#   define INDEX_TRACER_END
+#   define INDEX_TRACE_NATOMS(nl, ng, mn)
+#   define INDEX_TRACE_ATOM(a)
+#   define INDEX_TRACE(l, e)
+#   define DIST_TRACE_SORT(l, e)
+#   define DIST_TRACE(l, e)
+#endif
+
+extern void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timestep); // defined in another translation unit
--- a/clusterpair/includes/vtk.h
+++ b/clusterpair/includes/vtk.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <atom.h>
+
+#ifndef __VTK_H_
+#define __VTK_H_
+// VTK output routines, defined elsewhere. Each one receives an output file
+// name, the atom data and a timestep.
+// NOTE(review): `filename` is presumably a base name that is extended per
+// timestep — confirm in the implementation.
+extern void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep);
+// The writers below return an int; presumably a status code — confirm against
+// the implementation before relying on it.
+extern int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
+extern int write_ghost_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
+extern int write_local_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep);
+extern int write_ghost_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep);
+#endif
--- a/clusterpair/includes/xtc.h
+++ b/clusterpair/includes/xtc.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <atom.h>
+
+#ifndef __XTC_H_
+#define __XTC_H_
+
+/*
+ * XTC trajectory output interface.
+ *
+ * With XTC_OUTPUT defined the functions below are real (defined elsewhere).
+ * Without it they are replaced by macros that expand to nothing, so call
+ * sites need no conditional compilation of their own.
+ */
+#ifdef XTC_OUTPUT
+// Parameter names mirror the call sites in main.c:
+//   xtc_init(param.xtc_file, &atom, 0)
+//   xtc_write(&atom, n + 1, write_pos, write_vel)
+void xtc_init(const char *filename, Atom *atom, int timestep);
+void xtc_write(Atom *atom, int timestep, int write_pos, int write_vel);
+// `(void)` makes this a real prototype; an empty list leaves the parameters
+// unspecified in C before C23.
+void xtc_end(void);
+#else
+#define xtc_init(a,b,c)
+#define xtc_write(a,b,c,d)
+#define xtc_end()
+#endif
+#endif
--- a/clusterpair/main-stub.c
+++ b/clusterpair/main-stub.c
@@ -0,0 +1,344 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+//---
+#include <likwid-marker.h>
+//---
+#include <timing.h>
+#include <allocate.h>
+#include <neighbor.h>
+#include <parameter.h>
+#include <atom.h>
+#include <stats.h>
+#include <thermo.h>
+#include <eam.h>
+#include <pbc.h>
+#include <timers.h>
+#include <util.h>
+
+// Horizontal rule printed around the help text.
+#define HLINE "----------------------------------------------------------------------------\n"
+
+// Force kernels implemented in other translation units. Each takes the
+// simulation state and returns a double that the driver accumulates as time.
+// NOTE(review): main() calls computeForceLJ, which is presumably mapped onto
+// one of the LJ variants below by a project header — confirm.
+extern double computeForceLJ_ref(Parameter*, Atom*, Neighbor*, Stats*);
+extern double computeForceLJ_4xn(Parameter*, Atom*, Neighbor*, Stats*);
+extern double computeForceLJ_2xnn(Parameter*, Atom*, Neighbor*, Stats*);
+extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
+
+// Patterns
+// Neighbor-list access patterns selectable with -p (seq, fix, rand).
+#define P_SEQ   0
+#define P_FIX   1
+#define P_RAND  2
+
+/*
+ * Load the stub driver's default settings into `param`.
+ * Command-line options parsed in main() may override them afterwards.
+ */
+void init(Parameter *param) {
+    // Input sources: none by default.
+    param->input_file = NULL;
+    param->eam_file = NULL;
+
+    // Force field and potential parameters.
+    param->force_field = FF_LJ;
+    param->epsilon = 1.0;
+    param->sigma6 = 1.0;
+    param->mass = 1.0;
+    param->ntypes = 4;
+
+    // Cutoffs: the neighbor cutoff is simply the force cutoff.
+    param->cutforce = 1000000.0;
+    param->cutneigh = param->cutforce;
+    param->half_neigh = 0;
+
+    // System geometry.
+    param->rho = 0.8442;
+    param->lattice = 1.0;
+    param->nx = 1;
+    param->ny = 1;
+    param->nz = 1;
+
+    // Benchmark length and reporting frequency.
+    param->ntimes = 200;
+    param->proc_freq = 2.4;
+
+    // Integration and thermo settings; not used by the stub driver.
+    param->dt = 0.005;
+    param->dtforce = 0.5 * param->dt;
+    param->nstat = 100;
+    param->temp = 1.44;
+    param->reneigh_every = 20;
+}
+
+/*
+ * Build synthetic neighbor lists for every local i-cluster.
+ *
+ * atom:     provides Nclusters_local / Nclusters_max.
+ * neighbor: receives freshly allocated numneigh, numneigh_masked, neighbors
+ *           and neighbors_imask arrays; maxneighs is set to the row length.
+ * pattern:  P_SEQ  - consecutive j-clusters starting at CJ0_FROM_CI(ci),
+ *                    wrapping modulo the number of j-clusters;
+ *           P_RAND - distinct random j-clusters;
+ *           other  - the fixed sequence 0 .. nneighs-1 for every i-cluster.
+ * nneighs:  number of j-cluster neighbors per i-cluster.
+ * nreps:    how many times each list is replicated back to back.
+ * masked:   when 1, all entries are also counted in numneigh_masked.
+ *
+ * Exits the program on allocation failure or when P_RAND cannot pick
+ * nneighs distinct j-clusters.
+ */
+void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps, int masked) {
+    const int maxneighs = nneighs * nreps;
+    const int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
+    const int ncj = atom->Nclusters_local / jfac;
+    const unsigned int imask = NBNXN_INTERACTION_MASK_ALL;
+
+    // The rows below are allocated with `maxneighs` entries each. Store that
+    // stride in the list so that every reader indexing with
+    // neighbor->maxneighs addresses the same rows; using the previous value
+    // of the field would overrun the allocation or make rows overlap.
+    neighbor->maxneighs = maxneighs;
+    neighbor->numneigh = (int*) malloc(atom->Nclusters_max * sizeof(int));
+    neighbor->numneigh_masked = (int*) malloc(atom->Nclusters_max * sizeof(int));
+    neighbor->neighbors = (int*) malloc(atom->Nclusters_max * maxneighs * sizeof(int));
+    neighbor->neighbors_imask = (unsigned int*) malloc(atom->Nclusters_max * maxneighs * sizeof(unsigned int));
+
+    if(neighbor->numneigh == NULL || neighbor->numneigh_masked == NULL ||
+       neighbor->neighbors == NULL || neighbor->neighbors_imask == NULL) {
+        fprintf(stderr, "Error: could not allocate memory for neighbor lists!\n");
+        exit(-1);
+    }
+
+    if(pattern == P_RAND && ncj <= nneighs) {
+        fprintf(stderr, "Error: P_RAND: Number of j-clusters should be higher than number of j-cluster neighbors per i-cluster!\n");
+        exit(-1);
+    }
+
+    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
+        int *neighptr = &(neighbor->neighbors[ci * maxneighs]);
+        unsigned int *neighptr_imask = &(neighbor->neighbors_imask[ci * maxneighs]);
+        int j = (pattern == P_SEQ) ? CJ0_FROM_CI(ci) : 0;
+        int m = (pattern == P_SEQ) ? ncj : nneighs;
+
+        for(int k = 0; k < nneighs; k++) {
+            if(pattern == P_RAND) {
+                // Redraw until the candidate differs from all earlier entries.
+                int found = 0;
+                do {
+                    int cj = rand() % ncj;
+                    neighptr[k] = cj;
+                    neighptr_imask[k] = imask;
+                    found = 0;
+                    for(int l = 0; l < k; l++) {
+                        if(neighptr[l] == cj) {
+                            found = 1;
+                        }
+                    }
+                } while(found == 1);
+            } else {
+                neighptr[k] = j;
+                neighptr_imask[k] = imask;
+                j = (j + 1) % m;
+            }
+        }
+
+        // Append nreps - 1 copies of the first nneighs entries.
+        for(int r = 1; r < nreps; r++) {
+            for(int k = 0; k < nneighs; k++) {
+                neighptr[r * nneighs + k] = neighptr[k];
+                neighptr_imask[r * nneighs + k] = neighptr_imask[k];
+            }
+        }
+
+        neighbor->numneigh[ci] = nneighs * nreps;
+        neighbor->numneigh_masked[ci] = (masked == 1) ? (nneighs * nreps) : 0;
+    }
+}
+
+/* Returns the value that follows option argv[*i] and advances *i past it; exits if it is missing. */
+static const char *optionValue(int argc, const char *argv[], int *i) {
+    if(*i + 1 >= argc) {
+        fprintf(stderr, "Missing value for option %s!\n", argv[*i]);
+        exit(-1);
+    }
+
+    return argv[++(*i)];
+}
+
+/*
+ * Stub benchmark driver.
+ *
+ * Builds `niclusters` synthetic i-clusters, creates artificial neighbor lists
+ * with the selected access pattern and calls the force kernel param.ntimes
+ * times. The accumulated kernel return value is reported as time, either in
+ * human-readable form or as CSV (--csv).
+ *
+ * Returns EXIT_SUCCESS; invalid options terminate the program via exit().
+ */
+int main(int argc, const char *argv[]) {
+    Eam eam;
+    Atom atom_data;
+    Atom *atom = (Atom *)(&atom_data);
+    Neighbor neighbor;
+    Stats stats;
+    Parameter param;
+    char *pattern_str = NULL;
+    int pattern = P_SEQ;
+    int niclusters = 256;               // Number of local i-clusters
+    int iclusters_natoms = CLUSTER_M;   // Number of valid atoms within i-clusters
+    int nneighs = 9;                    // Number of j-cluster neighbors per i-cluster
+    int masked = 0;                     // Use masked loop
+    int nreps = 1;
+    int csv = 0;
+
+    LIKWID_MARKER_INIT;
+    LIKWID_MARKER_REGISTER("force");
+    DEBUG_MESSAGE("Initializing parameters...\n");
+    init(&param);
+
+    // argv[0] is the program name, options start at index 1.
+    for(int i = 1; i < argc; i++) {
+        if((strcmp(argv[i], "-f") == 0)) {
+            if((param.force_field = str2ff(optionValue(argc, argv, &i))) < 0) {
+                fprintf(stderr, "Invalid force field!\n");
+                exit(-1);
+            }
+            continue;
+        }
+        if((strcmp(argv[i], "-p") == 0)) {
+            pattern_str = strdup(optionValue(argc, argv, &i));
+            if(strncmp(pattern_str, "seq", 3) == 0) { pattern = P_SEQ; }
+            else if(strncmp(pattern_str, "fix", 3) == 0) { pattern = P_FIX; }
+            else if(strncmp(pattern_str, "rand", 3) == 0) { pattern = P_RAND; }
+            else {
+                fprintf(stderr, "Invalid pattern!\n");
+                exit(-1);
+            }
+            continue;
+        }
+        if((strcmp(argv[i], "-e") == 0)) {
+            param.eam_file = strdup(optionValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "-m") == 0)) {
+            masked = 1;
+            continue;
+        }
+        if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0)) {
+            param.ntimes = atoi(optionValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "-ni") == 0)) {
+            niclusters = atoi(optionValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "-na") == 0)) {
+            iclusters_natoms = atoi(optionValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "-nn") == 0)) {
+            nneighs = atoi(optionValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "-nr") == 0)) {
+            nreps = atoi(optionValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "--freq") == 0)) {
+            param.proc_freq = atof(optionValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "--csv") == 0)) {
+            csv = 1;
+            continue;
+        }
+        if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
+            printf("MD Bench: A minimalistic re-implementation of miniMD\n");
+            printf(HLINE);
+            printf("-f <string>:          force field (lj or eam), default lj\n");
+            printf("-p <string>:          pattern for data accesses (seq, fix or rand)\n");
+            printf("-e <string>:          input file for EAM\n");
+            printf("-m:                   use masked loop\n");
+            printf("-n / --nsteps <int>:  number of timesteps for simulation\n");
+            printf("-ni <int>:            number of i-clusters (default 256)\n");
+            printf("-na <int>:            number of atoms per i-cluster (default %d)\n", CLUSTER_M);
+            printf("-nn <int>:            number of j-cluster neighbors per i-cluster (default 9)\n");
+            printf("-nr <int>:            number of times neighbor lists should be replicated (default 1)\n");
+            printf("--freq <real>:        set CPU frequency (GHz) and display average cycles per atom and neighbors\n");
+            printf("--csv:                set output as CSV style\n");
+            printf(HLINE);
+            exit(EXIT_SUCCESS);
+        }
+    }
+
+    // The cluster fill loops below index lanes [0, CLUSTER_M); anything outside
+    // that range would write outside the cluster's storage.
+    if(iclusters_natoms < 1 || iclusters_natoms > CLUSTER_M) {
+        fprintf(stderr, "Invalid number of atoms per i-cluster, must be between 1 and %d!\n", CLUSTER_M);
+        exit(-1);
+    }
+
+    if(pattern_str == NULL) {
+        pattern_str = strdup("seq");
+    }
+
+    if(param.force_field == FF_EAM) {
+        DEBUG_MESSAGE("Initializing EAM parameters...\n");
+        initEam(&eam, &param);
+    }
+
+    DEBUG_MESSAGE("Initializing atoms...\n");
+    initAtom(atom);
+    initStats(&stats);
+
+    // Per type-pair parameter tables, all filled with the same values.
+    atom->ntypes = param.ntypes;
+    atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+    for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
+        atom->epsilon[i] = param.epsilon;
+        atom->sigma6[i] = param.sigma6;
+        atom->cutneighsq[i] = param.cutneigh * param.cutneigh;
+        atom->cutforcesq[i] = param.cutforce * param.cutforce;
+    }
+
+    DEBUG_MESSAGE("Creating atoms...\n");
+    while(atom->Nmax < niclusters * iclusters_natoms) {
+        growAtom(atom);
+    }
+
+    while(atom->Nclusters_max < niclusters) {
+        growClusters(atom);
+    }
+
+    for(int ci = 0; ci < niclusters; ++ci) {
+        int ci_sca_base = CI_SCALAR_BASE_INDEX(ci);
+        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
+        MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
+        MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
+        int *ci_type = &atom->cl_type[ci_sca_base];
+
+        // Valid atoms: tiny, strictly increasing coordinates and zero velocity.
+        for(int cii = 0; cii < iclusters_natoms; ++cii) {
+            ci_x[CL_X_OFFSET + cii] = (MD_FLOAT)(ci * iclusters_natoms + cii) * 0.00001;
+            ci_x[CL_Y_OFFSET + cii] = (MD_FLOAT)(ci * iclusters_natoms + cii) * 0.00001;
+            ci_x[CL_Z_OFFSET + cii] = (MD_FLOAT)(ci * iclusters_natoms + cii) * 0.00001;
+            ci_v[CL_X_OFFSET + cii] = 0.0;
+            ci_v[CL_Y_OFFSET + cii] = 0.0;
+            ci_v[CL_Z_OFFSET + cii] = 0.0;
+            ci_type[cii] = rand() % atom->ntypes;
+            atom->Nlocal++;
+        }
+
+        // Remaining lanes of the cluster are padded with infinite coordinates.
+        for(int cii = iclusters_natoms; cii < CLUSTER_M; cii++) {
+            ci_x[CL_X_OFFSET + cii] = INFINITY;
+            ci_x[CL_Y_OFFSET + cii] = INFINITY;
+            ci_x[CL_Z_OFFSET + cii] = INFINITY;
+        }
+
+        atom->iclusters[ci].natoms = iclusters_natoms;
+        atom->Nclusters_local++;
+    }
+
+    const double estim_atom_volume = (double)(atom->Nlocal * 3 * sizeof(MD_FLOAT));
+    const double estim_neighbors_volume = (double)(atom->Nlocal * (nneighs + 2) * sizeof(int));
+    const double estim_volume = (double)(atom->Nlocal * 6 * sizeof(MD_FLOAT) + estim_neighbors_volume);
+
+    if(!csv) {
+        printf("Kernel: %s, MxN: %dx%d, Vector width: %d\n", KERNEL_NAME, CLUSTER_M, CLUSTER_N, VECTOR_WIDTH);
+        printf("Floating-point precision: %s\n", PRECISION_STRING);
+        printf("Pattern: %s\n", pattern_str);
+        printf("Number of timesteps: %d\n", param.ntimes);
+        printf("Number of i-clusters: %d\n", niclusters);
+        printf("Number of atoms per i-cluster: %d\n", iclusters_natoms);
+        printf("Number of j-cluster neighbors per i-cluster: %d\n", nneighs);
+        printf("Number of times to replicate neighbor lists: %d\n", nreps);
+        printf("Estimated total data volume (kB): %.4f\n", estim_volume / 1000.0);
+        printf("Estimated atom data volume (kB): %.4f\n", estim_atom_volume / 1000.0);
+        printf("Estimated neighborlist data volume (kB): %.4f\n", estim_neighbors_volume / 1000.0);
+    }
+
+    DEBUG_MESSAGE("Defining j-clusters...\n");
+    defineJClusters(atom);
+    DEBUG_MESSAGE("Initializing neighbor lists...\n");
+    initNeighbor(&neighbor, &param);
+    DEBUG_MESSAGE("Creating neighbor lists...\n");
+    createNeighbors(atom, &neighbor, pattern, nneighs, nreps, masked);
+    DEBUG_MESSAGE("Computing forces...\n");
+
+    double T_accum = 0.0;
+    for(int i = 0; i < param.ntimes; i++) {
+        #if defined(MEM_TRACER) || defined(INDEX_TRACER)
+        traceAddresses(&param, atom, &neighbor, i + 1);
+        #endif
+
+        if(param.force_field == FF_EAM) {
+            T_accum += computeForceEam(&eam, &param, atom, &neighbor, &stats);
+        } else {
+            T_accum += computeForceLJ(&param, atom, &neighbor, &stats);
+        }
+    }
+
+    double freq_hz = param.proc_freq * 1.e9;
+    const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes);
+    const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes) * freq_hz;
+    const double cycles_per_neigh = cycles_per_atom / (double)(nneighs);
+
+    if(!csv) {
+        printf("Total time: %.4f, Mega atom updates/s: %.4f\n", T_accum, atoms_updates_per_sec / 1.e6);
+        if(param.proc_freq > 0.0) {
+            printf("Cycles per atom: %.4f, Cycles per neighbor: %.4f\n", cycles_per_atom, cycles_per_neigh);
+        }
+    } else {
+        printf("steps,pattern,niclusters,iclusters_natoms,nneighs,nreps,total vol.(kB),atoms vol.(kB),neigh vol.(kB),time(s),atom upds/s(M)");
+        if(param.proc_freq > 0.0) {
+            printf(",cy/atom,cy/neigh");
+        }
+        printf("\n");
+
+        printf("%d,%s,%d,%d,%d,%d,%.4f,%.4f,%.4f,%.4f,%.4f",
+            param.ntimes, pattern_str, niclusters, iclusters_natoms, nneighs, nreps,
+            estim_volume / 1.e3, estim_atom_volume / 1.e3, estim_neighbors_volume / 1.e3, T_accum, atoms_updates_per_sec / 1.e6);
+
+        if(param.proc_freq > 0.0) {
+            printf(",%.4f,%.4f", cycles_per_atom, cycles_per_neigh);
+        }
+        printf("\n");
+    }
+
+    // Only the force timer is measured here; zero the remaining slots so that
+    // displayStatistics never reads indeterminate values.
+    double timer[NUMTIMER];
+    for(int t = 0; t < NUMTIMER; t++) {
+        timer[t] = 0.0;
+    }
+    timer[FORCE] = T_accum;
+    displayStatistics(atom, &param, &stats, timer);
+    LIKWID_MARKER_CLOSE;
+    return EXIT_SUCCESS;
+}
--- a/clusterpair/main.c
+++ b/clusterpair/main.c
@@ -0,0 +1,344 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <omp.h>
+//--
+#include <likwid-marker.h>
+//--
+#include <atom.h>
+#include <allocate.h>
+#include <device.h>
+#include <eam.h>
+#include <integrate.h>
+#include <neighbor.h>
+#include <parameter.h>
+#include <pbc.h>
+#include <stats.h>
+#include <thermo.h>
+#include <timers.h>
+#include <timing.h>
+#include <util.h>
+#include <vtk.h>
+#include <xtc.h>
+
+// Horizontal rule printed around the help text and the result summary.
+#define HLINE "----------------------------------------------------------------------------\n"
+
+// Force kernels implemented in other translation units; main() adds their
+// return values to timer[FORCE].
+// NOTE(review): main() calls computeForceLJ, which is presumably mapped onto
+// one of the LJ variants below by a project header — confirm.
+extern double computeForceLJ_ref(Parameter*, Atom*, Neighbor*, Stats*);
+extern double computeForceLJ_4xn(Parameter*, Atom*, Neighbor*, Stats*);
+extern double computeForceLJ_2xnn(Parameter*, Atom*, Neighbor*, Stats*);
+extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
+
+// CUDA-only entry points; main() sets isReneighboured to 1 after every
+// reneighbouring step and copies atom data to/from the device around it.
+#ifdef CUDA_TARGET
+extern int isReneighboured;
+extern double computeForceLJ_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats);
+extern void copyDataToCUDADevice(Atom *atom);
+extern void copyDataFromCUDADevice(Atom *atom);
+extern void cudaDeviceFree();
+#endif
+
+/*
+ * One-time initialisation of the whole simulation state.
+ *
+ * Derives the lattice constant and box extents from the parameters, creates
+ * or reads the atoms, then builds clusters, periodic images, bins and the
+ * initial neighbor lists, and finally initialises the device.
+ *
+ * Returns the difference of the two getTimeStamp() readings taken around the
+ * initialisation (EAM setup and the box computation are not included).
+ */
+double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
+    if(param->force_field == FF_EAM) {
+        initEam(eam, param);
+    }
+
+    // Lattice constant from the density, then the box edge lengths.
+    param->lattice = pow((4.0 / param->rho), (1.0 / 3.0));
+    param->xprd = param->nx * param->lattice;
+    param->yprd = param->ny * param->lattice;
+    param->zprd = param->nz * param->lattice;
+
+    const double t_begin = getTimeStamp();
+
+    initAtom(atom);
+    initPbc(atom);
+    initStats(stats);
+    initNeighbor(neighbor, param);
+
+    // Atoms are generated unless an input file was given.
+    if(param->input_file != NULL) {
+        readAtom(atom, param);
+    } else {
+        createAtom(atom, param);
+    }
+
+    setupNeighbor(param, atom);
+    setupThermo(param, atom->Natoms);
+    if(param->input_file == NULL) {
+        adjustThermo(param, atom);
+    }
+
+    buildClusters(atom);
+    defineJClusters(atom);
+    setupPbc(atom, param);
+    binClusters(atom);
+    buildNeighbor(atom, neighbor);
+    initDevice(atom, neighbor);
+
+    const double t_end = getTimeStamp();
+    return t_end - t_begin;
+}
+
+/*
+ * Rebuild clusters, periodic images, bins and neighbor lists from the current
+ * atom positions. The work is enclosed in the "reneighbour" LIKWID region.
+ *
+ * Returns the difference of the getTimeStamp() readings taken around it.
+ */
+double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
+    const double t_begin = getTimeStamp();
+
+    LIKWID_MARKER_START("reneighbour");
+    updateSingleAtoms(atom);
+    updateAtomsPbc(atom, param);
+    buildClusters(atom);
+    defineJClusters(atom);
+    setupPbc(atom, param);
+    binClusters(atom);
+    buildNeighbor(atom, neighbor);
+    LIKWID_MARKER_STOP("reneighbour");
+
+    const double t_end = getTimeStamp();
+    return t_end - t_begin;
+}
+
+/* Debug helper: print the atom counters (Natoms, Nlocal, Nghost, Nmax) on one line. */
+void printAtomState(Atom *atom) {
+    const int natoms = atom->Natoms;
+    const int nlocal = atom->Nlocal;
+    const int nghost = atom->Nghost;
+    const int nmax = atom->Nmax;
+
+    printf("Atom counts: Natoms=%d Nlocal=%d Nghost=%d Nmax=%d\n", natoms, nlocal, nghost, nmax);
+}
+
+/* Returns the value that follows option argv[*i] and advances *i past it; exits if it is missing. */
+static char *optionValue(int argc, char **argv, int *i) {
+    if(*i + 1 >= argc) {
+        fprintf(stderr, "Missing value for option %s!\n", argv[*i]);
+        exit(-1);
+    }
+
+    return argv[++(*i)];
+}
+
+/*
+ * MD-Bench driver.
+ *
+ * Parses the command line, sets up the system, computes the initial forces
+ * and then runs param.ntimes timesteps. Each step integrates, either
+ * reneighbours (every param.reneigh_every steps) or updates the periodic
+ * images, computes forces and optionally writes thermo/VTK/XTC output.
+ * Timing and OpenMP schedule information are printed at the end.
+ *
+ * Returns EXIT_SUCCESS; invalid options terminate the program via exit().
+ */
+int main(int argc, char** argv) {
+    double timer[NUMTIMER];
+    Eam eam;
+    Atom atom;
+    Neighbor neighbor;
+    Stats stats;
+    Parameter param;
+
+    LIKWID_MARKER_INIT;
+#pragma omp parallel
+    {
+        LIKWID_MARKER_REGISTER("force");
+        //LIKWID_MARKER_REGISTER("reneighbour");
+        //LIKWID_MARKER_REGISTER("pbc");
+    }
+
+    initParameter(&param);
+    // argv[0] is the program name, options start at index 1.
+    for(int i = 1; i < argc; i++) {
+        if((strcmp(argv[i], "-p") == 0) || (strcmp(argv[i], "--param") == 0)) {
+            readParameter(&param, optionValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "-f") == 0)) {
+            if((param.force_field = str2ff(optionValue(argc, argv, &i))) < 0) {
+                fprintf(stderr, "Invalid force field!\n");
+                exit(-1);
+            }
+            continue;
+        }
+        if((strcmp(argv[i], "-i") == 0)) {
+            param.input_file = strdup(optionValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "-e") == 0)) {
+            param.eam_file = strdup(optionValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0)) {
+            param.ntimes = atoi(optionValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "-nx") == 0)) {
+            param.nx = atoi(optionValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "-ny") == 0)) {
+            param.ny = atoi(optionValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "-nz") == 0)) {
+            param.nz = atoi(optionValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "-half") == 0)) {
+            param.half_neigh = atoi(optionValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "-m") == 0) || (strcmp(argv[i], "--mass") == 0)) {
+            param.mass = atof(optionValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "-r") == 0) || (strcmp(argv[i], "--radius") == 0)) {
+            param.cutforce = atof(optionValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "-s") == 0) || (strcmp(argv[i], "--skin") == 0)) {
+            param.skin = atof(optionValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "--freq") == 0)) {
+            param.proc_freq = atof(optionValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "--vtk") == 0)) {
+            param.vtk_file = strdup(optionValue(argc, argv, &i));
+            continue;
+        }
+        if((strcmp(argv[i], "--xtc") == 0)) {
+            #ifndef XTC_OUTPUT
+            fprintf(stderr, "XTC not available, set XTC_OUTPUT option in config.mk file and recompile MD-Bench!\n");
+            exit(-1);
+            #else
+            param.xtc_file = strdup(optionValue(argc, argv, &i));
+            #endif
+            continue;
+        }
+        if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
+            printf("MD Bench: A minimalistic re-implementation of miniMD\n");
+            printf(HLINE);
+            printf("-p <string>:          file to read parameters from (can be specified more than once)\n");
+            printf("-f <string>:          force field (lj or eam), default lj\n");
+            printf("-i <string>:          input file with atom positions (dump)\n");
+            printf("-e <string>:          input file for EAM\n");
+            printf("-n / --nsteps <int>:  set number of timesteps for simulation\n");
+            printf("-nx/-ny/-nz <int>:    set linear dimension of systembox in x/y/z direction\n");
+            printf("-half <int>:          set half neighbor list flag\n");
+            printf("-m / --mass <real>:   set mass\n");
+            printf("-r / --radius <real>: set cutoff radius\n");
+            printf("-s / --skin <real>:   set skin (verlet buffer)\n");
+            printf("--freq <real>:        processor frequency (GHz)\n");
+            printf("--vtk <string>:       VTK file for visualization\n");
+            printf("--xtc <string>:       XTC file for visualization\n");
+            printf(HLINE);
+            exit(EXIT_SUCCESS);
+        }
+    }
+
+    param.cutneigh = param.cutforce + param.skin;
+    setup(&param, &eam, &atom, &neighbor, &stats);
+    printParameter(&param);
+    printf(HLINE);
+
+    printf("step\ttemp\t\tpressure\n");
+    computeThermo(0, &param, &atom);
+    #if defined(MEM_TRACER) || defined(INDEX_TRACER)
+    // Initial force evaluation happens before the timestep loop, so it is
+    // traced as step 0 (matching the thermo and VTK output for this state).
+    traceAddresses(&param, &atom, &neighbor, 0);
+    #endif
+
+    #ifdef CUDA_TARGET
+    copyDataToCUDADevice(&atom);
+    #endif
+
+    if(param.force_field == FF_EAM) {
+        timer[FORCE] = computeForceEam(&eam, &param, &atom, &neighbor, &stats);
+    } else {
+        timer[FORCE] = computeForceLJ(&param, &atom, &neighbor, &stats);
+    }
+
+    timer[NEIGH] = 0.0;
+    timer[TOTAL] = getTimeStamp();
+
+    if(param.vtk_file != NULL) {
+        write_data_to_vtk_file(param.vtk_file, &atom, 0);
+    }
+
+    if(param.xtc_file != NULL) {
+        xtc_init(param.xtc_file, &atom, 0);
+    }
+
+    for(int n = 0; n < param.ntimes; n++) {
+        initialIntegrate(&param, &atom);
+
+        // Non-zero remainder: regular step. Zero remainder: full reneighbouring.
+        if((n + 1) % param.reneigh_every) {
+            if(!((n + 1) % param.prune_every)) {
+                pruneNeighbor(&param, &atom, &neighbor);
+            }
+
+            updatePbc(&atom, &param, 0);
+        } else {
+            #ifdef CUDA_TARGET
+            copyDataFromCUDADevice(&atom);
+            #endif
+
+            timer[NEIGH] += reneighbour(&param, &atom, &neighbor);
+
+            #ifdef CUDA_TARGET
+            copyDataToCUDADevice(&atom);
+            isReneighboured = 1;
+            #endif
+        }
+
+        #if defined(MEM_TRACER) || defined(INDEX_TRACER)
+        traceAddresses(&param, &atom, &neighbor, n + 1);
+        #endif
+
+        if(param.force_field == FF_EAM) {
+            timer[FORCE] += computeForceEam(&eam, &param, &atom, &neighbor, &stats);
+        } else {
+            timer[FORCE] += computeForceLJ(&param, &atom, &neighbor, &stats);
+        }
+
+        finalIntegrate(&param, &atom);
+
+        // Intermediate thermo output; the final step is reported after the loop.
+        if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
+            computeThermo(n + 1, &param, &atom);
+        }
+
+        int write_pos = !((n + 1) % param.x_out_every);
+        int write_vel = !((n + 1) % param.v_out_every);
+        if(write_pos || write_vel) {
+            if(param.vtk_file != NULL) {
+                write_data_to_vtk_file(param.vtk_file, &atom, n + 1);
+            }
+
+            if(param.xtc_file != NULL) {
+                xtc_write(&atom, n + 1, write_pos, write_vel);
+            }
+        }
+    }
+
+    #ifdef CUDA_TARGET
+    copyDataFromCUDADevice(&atom);
+    #endif
+
+    timer[TOTAL] = getTimeStamp() - timer[TOTAL];
+    updateSingleAtoms(&atom);
+    computeThermo(-1, &param, &atom);
+
+    if(param.xtc_file != NULL) {
+        xtc_end();
+    }
+
+    #ifdef CUDA_TARGET
+    cudaDeviceFree();
+    #endif
+
+    printf(HLINE);
+    printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
+    printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
+            timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]);
+    printf(HLINE);
+
+    int nthreads = 0;
+    int chunkSize = 0;
+    omp_sched_t schedKind;
+    // Pre-filled so the report below never prints an indeterminate buffer when
+    // the runtime returns a schedule kind that is not listed in the switch.
+    char schedType[10] = "unknown";
+#pragma omp parallel
+#pragma omp master
+    {
+        omp_get_schedule(&schedKind, &chunkSize);
+
+        switch(schedKind) {
+            case omp_sched_static:  strcpy(schedType, "static"); break;
+            case omp_sched_dynamic: strcpy(schedType, "dynamic"); break;
+            case omp_sched_guided:  strcpy(schedType, "guided"); break;
+            case omp_sched_auto:    strcpy(schedType, "auto"); break;
+            default:                break;
+        }
+
+        nthreads = omp_get_max_threads();
+    }
+
+    printf("Num threads: %d\n", nthreads);
+    printf("Schedule: (%s,%d)\n", schedType, chunkSize);
+
+    printf("Performance: %.2f million atom updates per second\n",
+            1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
+    #ifdef COMPUTE_STATS
+    displayStatistics(&atom, &param, &stats, timer);
+    #endif
+    LIKWID_MARKER_CLOSE;
+    return EXIT_SUCCESS;
+}
--- a/clusterpair/neighbor.c
+++ b/clusterpair/neighbor.c
@@ -0,0 +1,939 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include <neighbor.h>
+#include <parameter.h>
+#include <atom.h>
+#include <util.h>
+
+// SMALL: relative padding (times the box length) added beyond the cutoff when
+// the extended bin range is computed in setupNeighbor().
+// FACTOR: tolerance applied to the cutoff when deciding whether the stencil
+// needs one more bin in a direction.
+#define SMALL 1.0e-6
+#define FACTOR 0.999
+
+// File-scope binning state; set up by initNeighbor() and setupNeighbor().
+static MD_FLOAT xprd, yprd, zprd;   // box edge lengths
+static MD_FLOAT bininvx, bininvy;   // 1 / binsizex, 1 / binsizey
+static int mbinxlo, mbinylo;        // lowest bin index of the extended grid in x, y
+static int nbinx, nbiny;            // bins covering the box itself in x, y
+static int mbinx, mbiny; // n bins in x, y
+static int *bincount;               // array with mbins entries
+static int *bins;                   // array with mbins * atoms_per_bin entries
+static int *bin_nclusters;          // array with mbins entries
+static int *bin_clusters;           // array with mbins * clusters_per_bin entries
+static int mbins; //total number of bins
+static int atoms_per_bin;  // max atoms per bin
+static int clusters_per_bin;  // max clusters per bin
+static MD_FLOAT cutneigh;           // neighbor cutoff
+static MD_FLOAT cutneighsq;  // neighbor cutoff squared
+static int nmax;
+static int nstencil;      // # of bins in stencil
+static int* stencil;      // stencil list of bin offsets
+static MD_FLOAT binsizex, binsizey; // bin edge lengths in x, y
+
+// Helpers defined later in this file.
+static int coord2bin(MD_FLOAT, MD_FLOAT);
+static MD_FLOAT bindist(int, int);
+
+/* exported subroutines */
+/*
+ * Reset the file-scope binning state and the neighbor-list structure.
+ *
+ * neighbor: list structure to reset; its arrays are set to NULL (nothing is
+ *           allocated here), maxneighs starts at 100 and half_neigh is copied
+ *           from the parameters.
+ * param:    supplies the lattice dimensions, the neighbor cutoff and the
+ *           half_neigh flag.
+ */
+void initNeighbor(Neighbor *neighbor, Parameter *param) {
+    // Box extents derived from the lattice; setupNeighbor() replaces them when
+    // an input file is used.
+    xprd = param->nx * param->lattice;
+    yprd = param->ny * param->lattice;
+    zprd = param->nz * param->lattice;
+    cutneigh = param->cutneigh;
+    nmax = 0;
+
+    // Initial per-bin capacities.
+    atoms_per_bin = 8;
+    clusters_per_bin = (atoms_per_bin / CLUSTER_M) + 10;
+
+    // Binning buffers are allocated in setupNeighbor().
+    stencil = NULL;
+    bins = NULL;
+    bincount = NULL;
+    bin_clusters = NULL;
+    bin_nclusters = NULL;
+
+    neighbor->half_neigh = param->half_neigh;
+    neighbor->maxneighs = 100;
+    neighbor->numneigh = NULL;
+    neighbor->numneigh_masked = NULL;
+    neighbor->neighbors = NULL;
+    neighbor->neighbors_imask = NULL;
+}
+
+/*
+ * Derive the 2D (x/y) bin grid and the bin stencil from the box size, the
+ * local atom density and the neighbor cutoff, and (re)allocate the binning
+ * buffers.
+ *
+ * param: supplies the box extents when an input file is used.
+ * atom:  supplies Nlocal for the density estimate.
+ */
+void setupNeighbor(Parameter *param, Atom *atom) {
+    MD_FLOAT coord;
+    int mbinxhi, mbinyhi;
+    int nextx, nexty;
+
+    if(param->input_file != NULL) {
+        xprd = param->xprd;
+        yprd = param->yprd;
+        zprd = param->zprd;
+    }
+
+    // TODO: update lo and hi for standard case and use them here instead
+    MD_FLOAT xlo = 0.0; MD_FLOAT xhi = xprd;
+    MD_FLOAT ylo = 0.0; MD_FLOAT yhi = yprd;
+    MD_FLOAT zlo = 0.0; MD_FLOAT zhi = zprd;
+
+    // Choose the bin edge so that, at the average density, a cube of that edge
+    // holds MAX(CLUSTER_M, CLUSTER_N) atoms; then fit whole bins into the box.
+    MD_FLOAT atom_density = ((MD_FLOAT)(atom->Nlocal)) / ((xhi - xlo) * (yhi - ylo) * (zhi - zlo));
+    MD_FLOAT atoms_in_cell = MAX(CLUSTER_M, CLUSTER_N);
+    MD_FLOAT targetsizex = cbrt(atoms_in_cell / atom_density);
+    MD_FLOAT targetsizey = cbrt(atoms_in_cell / atom_density);
+    nbinx = MAX(1, (int)ceil((xhi - xlo) / targetsizex));
+    nbiny = MAX(1, (int)ceil((yhi - ylo) / targetsizey));
+    binsizex = (xhi - xlo) / nbinx;
+    binsizey = (yhi - ylo) / nbiny;
+    bininvx = 1.0 / binsizex;
+    bininvy = 1.0 / binsizey;
+    cutneighsq = cutneigh * cutneigh;
+
+    // Extended bin range in x: box plus cutoff plus a small relative padding.
+    // The cast truncates toward zero, hence the extra decrement for negative
+    // coordinates.
+    coord = xlo - cutneigh - SMALL * xprd;
+    mbinxlo = (int)(coord * bininvx);
+    if(coord < 0.0) { mbinxlo = mbinxlo - 1; }
+    coord = xhi + cutneigh + SMALL * xprd;
+    mbinxhi = (int)(coord * bininvx);
+
+    // Same for y.
+    coord = ylo - cutneigh - SMALL * yprd;
+    mbinylo = (int)(coord * bininvy);
+    if(coord < 0.0) { mbinylo = mbinylo - 1; }
+    coord = yhi + cutneigh + SMALL * yprd;
+    mbinyhi = (int)(coord * bininvy);
+
+    // One additional bin on each side.
+    mbinxlo = mbinxlo - 1;
+    mbinxhi = mbinxhi + 1;
+    mbinx = mbinxhi - mbinxlo + 1;
+
+    mbinylo = mbinylo - 1;
+    mbinyhi = mbinyhi + 1;
+    mbiny = mbinyhi - mbinylo + 1;
+
+    // Number of bins the cutoff spans in each direction.
+    nextx = (int)(cutneigh * bininvx);
+    nexty = (int)(cutneigh * bininvy);
+    if(nextx * binsizex < FACTOR * cutneigh) nextx++;
+    if(nexty * binsizey < FACTOR * cutneigh) nexty++;
+
+    // free(NULL) is a no-op, so no guard is needed on the first call.
+    free(stencil);
+    stencil = (int *) malloc((2 * nexty + 1) * (2 * nextx + 1) * sizeof(int));
+    nstencil = 0;
+
+    // Keep the linear offsets of all bins whose bindist() is below the squared cutoff.
+    for(int j = -nexty; j <= nexty; j++) {
+        for(int i = -nextx; i <= nextx; i++) {
+            if(bindist(i, j) < cutneighsq) {
+                stencil[nstencil++] = j * mbinx + i;
+            }
+        }
+    }
+
+    free(bincount);
+    free(bins);
+    free(bin_nclusters);
+    free(bin_clusters);
+    mbins = mbinx * mbiny;
+    bincount = (int*) malloc(mbins * sizeof(int));
+    bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));
+    bin_nclusters = (int*) malloc(mbins * sizeof(int));
+    bin_clusters = (int*) malloc(mbins * clusters_per_bin * sizeof(int));
+
+    /*
+    DEBUG_MESSAGE("lo, hi = (%e, %e, %e), (%e, %e, %e)\n", xlo, ylo, zlo, xhi, yhi, zhi);
+    DEBUG_MESSAGE("binsize = %e, %e\n", binsizex, binsizey);
+    DEBUG_MESSAGE("mbin lo, hi = (%d, %d), (%d, %d)\n", mbinxlo, mbinylo, mbinxhi, mbinyhi);
+    DEBUG_MESSAGE("mbins = %d (%d x %d)\n", mbins, mbinx, mbiny);
+    DEBUG_MESSAGE("nextx = %d, nexty = %d\n", nextx, nexty);
+    */
+}
+
+/*
+ * Squared distance between the bounding box of i-cluster `ci` and the
+ * bounding box of j-cluster `cj`. For each axis the separation between the
+ * two intervals is clamped at zero (overlap), squared and accumulated.
+ */
+MD_FLOAT getBoundingBoxDistanceSq(Atom *atom, int ci, int cj) {
+    MD_FLOAT below, above, gap, clamped;
+    MD_FLOAT dist_sq;
+
+    // x axis
+    below = atom->iclusters[ci].bbminx - atom->jclusters[cj].bbmaxx;
+    above = atom->jclusters[cj].bbminx - atom->iclusters[ci].bbmaxx;
+    gap = MAX(below, above);
+    clamped = MAX(gap, 0.0);
+    dist_sq = clamped * clamped;
+
+    // y axis
+    below = atom->iclusters[ci].bbminy - atom->jclusters[cj].bbmaxy;
+    above = atom->jclusters[cj].bbminy - atom->iclusters[ci].bbmaxy;
+    gap = MAX(below, above);
+    clamped = MAX(gap, 0.0);
+    dist_sq += clamped * clamped;
+
+    // z axis
+    below = atom->iclusters[ci].bbminz - atom->jclusters[cj].bbmaxz;
+    above = atom->jclusters[cj].bbminz - atom->iclusters[ci].bbmaxz;
+    gap = MAX(below, above);
+    clamped = MAX(gap, 0.0);
+    dist_sq += clamped * clamped;
+
+    return dist_sq;
+}
+
+/*
+ * Returns 1 if at least one atom of i-cluster ci and one atom of j-cluster cj
+ * are closer than sqrt(rsq) (squared distance strictly below rsq), else 0.
+ */
+int atomDistanceInRange(Atom *atom, int ci, int cj, MD_FLOAT rsq) {
+    MD_FLOAT *pos_i = &atom->cl_x[CI_VECTOR_BASE_INDEX(ci)];
+    MD_FLOAT *pos_j = &atom->cl_x[CJ_VECTOR_BASE_INDEX(cj)];
+    int a, b;
+
+    for(a = 0; a < atom->iclusters[ci].natoms; a++) {
+        for(b = 0; b < atom->jclusters[cj].natoms; b++) {
+            MD_FLOAT dx = pos_i[CL_X_OFFSET + a] - pos_j[CL_X_OFFSET + b];
+            MD_FLOAT dy = pos_i[CL_Y_OFFSET + a] - pos_j[CL_Y_OFFSET + b];
+            MD_FLOAT dz = pos_i[CL_Z_OFFSET + a] - pos_j[CL_Z_OFFSET + b];
+
+            /* the first pair inside the cutoff decides the result */
+            if(dx * dx + dy * dy + dz * dz < rsq) {
+                return 1;
+            }
+        }
+    }
+
+    return 0;
+}
+
+/* Interaction mask for plain C lists: the diagonal mask when rdiag is set
+ * and ci == cj, the full mask for every other pair. */
+static unsigned int get_imask(int rdiag, int ci, int cj) {
+    if(rdiag && ci == cj) {
+        return NBNXN_INTERACTION_MASK_DIAG;
+    }
+
+    return NBNXN_INTERACTION_MASK_ALL;
+}
+
+/* Interaction mask for cj-size=2: with rdiag set, cj == 2*ci selects the
+ * J2_0 diagonal mask and cj == 2*ci+1 the J2_1 diagonal mask; every other
+ * pair gets the full mask. */
+static unsigned int get_imask_simd_j2(int rdiag, int ci, int cj) {
+    if(rdiag) {
+        if(ci * 2 == cj) {
+            return NBNXN_INTERACTION_MASK_DIAG_J2_0;
+        }
+
+        if(ci * 2 + 1 == cj) {
+            return NBNXN_INTERACTION_MASK_DIAG_J2_1;
+        }
+    }
+
+    return NBNXN_INTERACTION_MASK_ALL;
+}
+
+/* Interaction mask for cj-size=4: the diagonal mask when rdiag is set and
+ * ci == cj, the full mask for every other pair. */
+static unsigned int get_imask_simd_j4(int rdiag, int ci, int cj) {
+    if(rdiag && ci == cj) {
+        return NBNXN_INTERACTION_MASK_DIAG;
+    }
+
+    return NBNXN_INTERACTION_MASK_ALL;
+}
+
+/* Interaction mask for cj-size=8: with rdiag set, ci == 2*cj selects the
+ * J8_0 diagonal mask and ci == 2*cj+1 the J8_1 diagonal mask; every other
+ * pair gets the full mask. */
+static unsigned int get_imask_simd_j8(int rdiag, int ci, int cj) {
+    if(rdiag) {
+        if(ci == cj * 2) {
+            return NBNXN_INTERACTION_MASK_DIAG_J8_0;
+        }
+
+        if(ci == cj * 2 + 1) {
+            return NBNXN_INTERACTION_MASK_DIAG_J8_1;
+        }
+    }
+
+    return NBNXN_INTERACTION_MASK_ALL;
+}
+
+/*
+ * Select the mask helper behind get_imask_simd_4xn / get_imask_simd_2xnn
+ * according to VECTOR_WIDTH:
+ *   width  2: 4xn  -> j2
+ *   width  4: 4xn  -> j4
+ *   width  8: 4xn  -> j8, 2xnn -> j4
+ *   width 16: 2xnn -> j8
+ * Only the aliases listed for a width are defined; any other width is a
+ * compile-time error.
+ */
+#if VECTOR_WIDTH == 2
+#   define get_imask_simd_4xn get_imask_simd_j2
+#elif VECTOR_WIDTH== 4
+#   define get_imask_simd_4xn get_imask_simd_j4
+#elif VECTOR_WIDTH == 8
+#   define get_imask_simd_4xn get_imask_simd_j8
+#   define get_imask_simd_2xnn get_imask_simd_j4
+#elif VECTOR_WIDTH == 16
+#   define get_imask_simd_2xnn get_imask_simd_j8
+#else
+#   error "Invalid cluster configuration"
+#endif
+
+/*
+ * Builds the cluster pair neighbor lists of all local i-clusters.
+ *
+ * For every local i-cluster the j-clusters stored in the stencil bins around
+ * the i-cluster's bin are tested: a j-cluster becomes a neighbor when the
+ * squared bounding-box distance is below cutneighsq and either the boxes are
+ * closer than rbb_sq or at least one atom pair is within the cutoff.
+ * Neighbors with a non-trivial interaction mask are kept at the front of the
+ * list and counted in numneigh_masked. Each list is padded with the dummy
+ * j-cluster to a multiple of VECTOR_WIDTH / CLUSTER_N.
+ *
+ * When any list needs more than neighbor->maxneighs entries, the storage is
+ * enlarged and the whole build is repeated.
+ */
+void buildNeighbor(Atom *atom, Neighbor *neighbor) {
+    DEBUG_MESSAGE("buildNeighbor start\n");
+
+    /* extend neighbor arrays if there are more local clusters than before */
+    if(atom->Nclusters_local > nmax) {
+        nmax = atom->Nclusters_local;
+        free(neighbor->numneigh);
+        free(neighbor->numneigh_masked);
+        free(neighbor->neighbors);
+        free(neighbor->neighbors_imask);
+        neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
+        neighbor->numneigh_masked = (int*) malloc(nmax * sizeof(int));
+        neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int));
+        neighbor->neighbors_imask = (unsigned int*) malloc(nmax * neighbor->maxneighs * sizeof(unsigned int));
+    }
+
+    /* bounding boxes closer than sqrt(rbb_sq) are accepted without the per-atom check */
+    MD_FLOAT bbx = 0.5 * (binsizex + binsizex);
+    MD_FLOAT bby = 0.5 * (binsizey + binsizey);
+    MD_FLOAT rbb_sq = MAX(0.0, cutneigh - 0.5 * sqrt(bbx * bbx + bby * bby));
+    rbb_sq = rbb_sq * rbb_sq;
+    int resize = 1;
+
+    /* loop over each cluster, storing neighbors; repeat until all lists fit */
+    while(resize) {
+        int new_maxneighs = neighbor->maxneighs;
+        resize = 0;
+
+        for(int ci = 0; ci < atom->Nclusters_local; ci++) {
+            int ci_cj1 = CJ1_FROM_CI(ci);
+            int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
+            unsigned int *neighptr_imask = &(neighbor->neighbors_imask[ci * neighbor->maxneighs]);
+            int n = 0, nmasked = 0;
+            int ibin = atom->icluster_bin[ci];
+            MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx;
+            MD_FLOAT ibb_xmax = atom->iclusters[ci].bbmaxx;
+            MD_FLOAT ibb_ymin = atom->iclusters[ci].bbminy;
+            MD_FLOAT ibb_ymax = atom->iclusters[ci].bbmaxy;
+            MD_FLOAT ibb_zmin = atom->iclusters[ci].bbminz;
+            MD_FLOAT ibb_zmax = atom->iclusters[ci].bbmaxz;
+
+            for(int k = 0; k < nstencil; k++) {
+                int jbin = ibin + stencil[k];
+                int *loc_bin = &bin_clusters[jbin * clusters_per_bin];
+                int cj, m = -1;
+                MD_FLOAT jbb_xmin, jbb_xmax, jbb_ymin, jbb_ymax, jbb_zmin, jbb_zmax;
+                const int c = bin_nclusters[jbin];
+
+                if(c > 0) {
+                    MD_FLOAT dl, dh, dm, dm0;
+                    // Start "out of range": a `continue` below jumps straight to the
+                    // loop condition, which must not read an unassigned value.
+                    MD_FLOAT d_bb_sq = INFINITY;
+
+                    /* skip leading j-clusters that are out of range along z */
+                    do {
+                        m++;
+                        cj = loc_bin[m];
+                        if(neighbor->half_neigh && ci_cj1 > cj) {
+                            continue;
+                        }
+                        jbb_zmin = atom->jclusters[cj].bbminz;
+                        jbb_zmax = atom->jclusters[cj].bbmaxz;
+                        dl = ibb_zmin - jbb_zmax;
+                        dh = jbb_zmin - ibb_zmax;
+                        dm = MAX(dl, dh);
+                        dm0 = MAX(dm, 0.0);
+                        d_bb_sq = dm0 * dm0;
+                    } while(m + 1 < c && d_bb_sq > cutneighsq);
+
+                    jbb_xmin = atom->jclusters[cj].bbminx;
+                    jbb_xmax = atom->jclusters[cj].bbmaxx;
+                    jbb_ymin = atom->jclusters[cj].bbminy;
+                    jbb_ymax = atom->jclusters[cj].bbmaxy;
+
+                    while(m < c) {
+                        if(!neighbor->half_neigh || ci_cj1 <= cj) {
+                            dl = ibb_zmin - jbb_zmax;
+                            dh = jbb_zmin - ibb_zmax;
+                            dm = MAX(dl, dh);
+                            dm0 = MAX(dm, 0.0);
+                            d_bb_sq = dm0 * dm0;
+
+                            /*if(d_bb_sq > cutneighsq) {
+                                break;
+                            }*/
+
+                            dl = ibb_ymin - jbb_ymax;
+                            dh = jbb_ymin - ibb_ymax;
+                            dm = MAX(dl, dh);
+                            dm0 = MAX(dm, 0.0);
+                            d_bb_sq += dm0 * dm0;
+
+                            dl = ibb_xmin - jbb_xmax;
+                            dh = jbb_xmin - ibb_xmax;
+                            dm = MAX(dl, dh);
+                            dm0 = MAX(dm, 0.0);
+                            d_bb_sq += dm0 * dm0;
+
+                            if(d_bb_sq < cutneighsq) {
+                                if(d_bb_sq < rbb_sq || atomDistanceInRange(atom, ci, cj, cutneighsq)) {
+                                    // We pass true (1) for rdiag because we only care whether a
+                                    // mask is needed at all; with false (0) the self-exclusions
+                                    // are not accounted for, which breaks the optimized version.
+                                    unsigned int imask;
+                                    #if CLUSTER_N == (VECTOR_WIDTH / 2) // 2xnn
+                                    imask = get_imask_simd_2xnn(1, ci, cj);
+                                    #else // 4xn
+                                    imask = get_imask_simd_4xn(1, ci, cj);
+                                    #endif
+
+                                    // Entries beyond maxneighs are only counted; the list is
+                                    // rebuilt after the resize below.
+                                    if(n < neighbor->maxneighs) {
+                                        if(imask == NBNXN_INTERACTION_MASK_ALL) {
+                                            neighptr[n] = cj;
+                                            neighptr_imask[n] = imask;
+                                        } else {
+                                            // Keep masked entries at the front: move the first
+                                            // unmasked entry to the end and take its slot.
+                                            neighptr[n] = neighptr[nmasked];
+                                            neighptr_imask[n] = neighptr_imask[nmasked];
+                                            neighptr[nmasked] = cj;
+                                            neighptr_imask[nmasked] = imask;
+                                            nmasked++;
+                                        }
+                                    }
+
+                                    n++;
+                                }
+                            }
+                        }
+
+                        m++;
+                        if(m < c) {
+                            cj = loc_bin[m];
+                            jbb_xmin = atom->jclusters[cj].bbminx;
+                            jbb_xmax = atom->jclusters[cj].bbmaxx;
+                            jbb_ymin = atom->jclusters[cj].bbminy;
+                            jbb_ymax = atom->jclusters[cj].bbmaxy;
+                            jbb_zmin = atom->jclusters[cj].bbminz;
+                            jbb_zmax = atom->jclusters[cj].bbmaxz;
+                        }
+                    }
+                }
+            }
+
+            // Fill neighbor list with dummy values to fit vector width
+            if(CLUSTER_N < VECTOR_WIDTH) {
+                while(n % (VECTOR_WIDTH / CLUSTER_N)) {
+                    // Never write past this cluster's slice; an overfull list
+                    // triggers a resize and a rebuild anyway.
+                    if(n < neighbor->maxneighs) {
+                        neighptr[n] = atom->dummy_cj; // Last cluster is always a dummy cluster
+                        neighptr_imask[n] = 0;
+                    }
+                    n++;
+                }
+            }
+
+            neighbor->numneigh[ci] = n;
+            neighbor->numneigh_masked[ci] = nmasked;
+            if(n >= neighbor->maxneighs) {
+                resize = 1;
+
+                if(n >= new_maxneighs) {
+                    new_maxneighs = n;
+                }
+            }
+        }
+
+        if(resize) {
+            neighbor->maxneighs = new_maxneighs * 1.2;
+            fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
+            free(neighbor->neighbors);
+            free(neighbor->neighbors_imask);
+            neighbor->neighbors = (int *) malloc(nmax * neighbor->maxneighs * sizeof(int));
+            neighbor->neighbors_imask = (unsigned int *) malloc(nmax * neighbor->maxneighs * sizeof(unsigned int));
+        }
+    }
+
+    /*
+    DEBUG_MESSAGE("\ncutneighsq = %f, rbb_sq = %f\n", cutneighsq, rbb_sq);
+    for(int ci = 0; ci < 6; ci++) {
+        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
+        MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
+        int* neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
+
+        DEBUG_MESSAGE("Cluster %d, bbx = {%f, %f}, bby = {%f, %f}, bbz = {%f, %f}\n",
+            ci,
+            atom->iclusters[ci].bbminx,
+            atom->iclusters[ci].bbmaxx,
+            atom->iclusters[ci].bbminy,
+            atom->iclusters[ci].bbmaxy,
+            atom->iclusters[ci].bbminz,
+            atom->iclusters[ci].bbmaxz);
+
+        for(int cii = 0; cii < CLUSTER_M; cii++) {
+            DEBUG_MESSAGE("%f, %f, %f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
+        }
+
+        DEBUG_MESSAGE("Neighbors:\n");
+        for(int k = 0; k < neighbor->numneigh[ci]; k++) {
+            int cj = neighptr[k];
+            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
+            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
+
+            DEBUG_MESSAGE("    Cluster %d, bbx = {%f, %f}, bby = {%f, %f}, bbz = {%f, %f}\n",
+                cj,
+                atom->jclusters[cj].bbminx,
+                atom->jclusters[cj].bbmaxx,
+                atom->jclusters[cj].bbminy,
+                atom->jclusters[cj].bbmaxy,
+                atom->jclusters[cj].bbminz,
+                atom->jclusters[cj].bbmaxz);
+
+            for(int cjj = 0; cjj < CLUSTER_N; cjj++) {
+                DEBUG_MESSAGE("    %f, %f, %f\n", cj_x[CL_X_OFFSET + cjj], cj_x[CL_Y_OFFSET + cjj], cj_x[CL_Z_OFFSET + cjj]);
+            }
+        }
+    }
+    */
+
+    DEBUG_MESSAGE("buildNeighbor end\n");
+}
+
+/*
+ * Prunes the neighbor list of every local i-cluster: a j-cluster is kept only
+ * if at least one atom pair lies within the cutoff. The cutoff currently used
+ * is the neighbor cutoff (cutneighsq); the force-cutoff variant based on
+ * param->cutforce is disabled, so param is unused.
+ *
+ * Dummy padding entries are stripped before pruning and re-added afterwards
+ * so every list stays a multiple of VECTOR_WIDTH / CLUSTER_N. Neighbor ids
+ * and their interaction masks are always moved together, and masked entries
+ * remain contiguous at the front of the list.
+ */
+void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
+    DEBUG_MESSAGE("pruneNeighbor start\n");
+    //MD_FLOAT cutsq = param->cutforce * param->cutforce;
+    MD_FLOAT cutsq = cutneighsq;
+
+    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
+        int *neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
+        unsigned int *neighs_imask = &neighbor->neighbors_imask[ci * neighbor->maxneighs];
+        int numneighs = neighbor->numneigh[ci];
+        int numneighs_masked = neighbor->numneigh_masked[ci];
+        int k = 0;
+
+        // Remove dummy clusters if necessary; stop at an empty list so that
+        // neighs[-1] is never read
+        if(CLUSTER_N < VECTOR_WIDTH) {
+            while(numneighs > 0 && neighs[numneighs - 1] == atom->dummy_cj) {
+                numneighs--;
+            }
+        }
+
+        while(k < numneighs) {
+            int cj = neighs[k];
+            if(atomDistanceInRange(atom, ci, cj, cutsq)) {
+                k++;
+            } else {
+                numneighs--;
+                if(k < numneighs_masked) {
+                    // Removing a masked entry: close the gap with the last masked
+                    // entry, then fill that slot with the last entry of the list
+                    numneighs_masked--;
+                    neighs[k] = neighs[numneighs_masked];
+                    neighs_imask[k] = neighs_imask[numneighs_masked];
+                    neighs[numneighs_masked] = neighs[numneighs];
+                    neighs_imask[numneighs_masked] = neighs_imask[numneighs];
+                } else {
+                    // Removing an unmasked entry: replace it with the last entry
+                    neighs[k] = neighs[numneighs];
+                    neighs_imask[k] = neighs_imask[numneighs];
+                }
+            }
+        }
+
+        // Re-add dummy clusters if necessary
+        if(CLUSTER_N < VECTOR_WIDTH) {
+            while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) {
+                neighs[numneighs] = atom->dummy_cj; // Last cluster is always a dummy cluster
+                neighs_imask[numneighs] = 0;
+                numneighs++;
+            }
+        }
+
+        neighbor->numneigh[ci] = numneighs;
+        neighbor->numneigh_masked[ci] = numneighs_masked;
+    }
+
+    DEBUG_MESSAGE("pruneNeighbor end\n");
+}
+
+/* internal subroutines */
+/*
+ * Squared distance in the x/y plane associated with the bin offset (i, j).
+ * Along each axis an offset of 0 contributes nothing; otherwise the offset is
+ * reduced by one bin towards zero before being scaled by the bin size.
+ */
+MD_FLOAT bindist(int i, int j) {
+    const MD_FLOAT delx = (i > 0) ? (i - 1) * binsizex
+                        : (i == 0) ? 0.0
+                                   : (i + 1) * binsizex;
+    const MD_FLOAT dely = (j > 0) ? (j - 1) * binsizey
+                        : (j == 0) ? 0.0
+                                   : (j + 1) * binsizey;
+
+    return (delx * delx + dely * dely);
+}
+
+/*
+ * Maps an (x, y) position to a linear bin index. Per axis there are three
+ * cases: at or beyond the box length, inside [0, box length), and negative
+ * (which is shifted down by one extra bin). The result is
+ * iy * mbinx + ix + 1.
+ */
+int coord2bin(MD_FLOAT xin, MD_FLOAT yin) {
+    const int ix = (xin >= xprd) ? (int)((xin - xprd) * bininvx) + nbinx - mbinxlo
+                 : (xin >= 0.0)  ? (int)(xin * bininvx) - mbinxlo
+                                 : (int)(xin * bininvx) - mbinxlo - 1;
+    const int iy = (yin >= yprd) ? (int)((yin - yprd) * bininvy) + nbiny - mbinylo
+                 : (yin >= 0.0)  ? (int)(yin * bininvy) - mbinylo
+                                 : (int)(yin * bininvy) - mbinylo - 1;
+
+    return (iy * mbinx + ix + 1);
+}
+
+/*
+ * Maps an (x, y) position to its 2D bin coordinates, written to *ix and *iy.
+ * Uses the same three per-axis cases as coord2bin(): at or beyond the box
+ * length, inside [0, box length), and negative.
+ */
+void coord2bin2D(MD_FLOAT xin, MD_FLOAT yin, int *ix, int *iy) {
+    const int x_beyond = (xin >= xprd);
+    const int x_inside = !x_beyond && (xin >= 0.0);
+    const int y_beyond = (yin >= yprd);
+    const int y_inside = !y_beyond && (yin >= 0.0);
+
+    if(x_beyond) {
+        *ix = (int)((xin - xprd) * bininvx) + nbinx - mbinxlo;
+    } else if(x_inside) {
+        *ix = (int)(xin * bininvx) - mbinxlo;
+    } else {
+        *ix = (int)(xin * bininvx) - mbinxlo - 1;
+    }
+
+    if(y_beyond) {
+        *iy = (int)((yin - yprd) * bininvy) + nbiny - mbinylo;
+    } else if(y_inside) {
+        *iy = (int)(yin * bininvy) - mbinylo;
+    } else {
+        *iy = (int)(yin * bininvy) - mbinylo - 1;
+    }
+}
+
+/*
+ * Sorts the local atoms into bins according to their x/y position.
+ * bincount[b] receives the number of atoms stored for bin b and bins holds
+ * their indices. If any bin overflows atoms_per_bin, the capacity is doubled
+ * and the whole binning pass is repeated.
+ */
+void binAtoms(Atom *atom) {
+    DEBUG_MESSAGE("binAtoms start\n");
+    int overflow;
+
+    do {
+        overflow = 0;
+
+        for(int b = 0; b < mbins; b++) {
+            bincount[b] = 0;
+        }
+
+        for(int i = 0; i < atom->Nlocal; i++) {
+            const int ibin = coord2bin(atom_x(i), atom_y(i));
+
+            if(bincount[ibin] >= atoms_per_bin) {
+                /* bin is full: remember it, but keep scanning the other atoms */
+                overflow = 1;
+                continue;
+            }
+
+            bins[ibin * atoms_per_bin + bincount[ibin]] = i;
+            bincount[ibin]++;
+        }
+
+        if(overflow) {
+            free(bins);
+            atoms_per_bin *= 2;
+            bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));
+        }
+    } while(overflow);
+
+    DEBUG_MESSAGE("binAtoms end\n");
+}
+
+// TODO: Use pigeonhole sorting
+/*
+ * Sorts the atom indices of every bin by ascending z coordinate using a
+ * selection sort: each position is swapped with the smallest remaining entry.
+ */
+void sortAtomsByZCoord(Atom *atom) {
+    DEBUG_MESSAGE("sortAtomsByZCoord start\n");
+    for(int bin = 0; bin < mbins; bin++) {
+        const int count = bincount[bin];
+        int *slots = &bins[bin * atoms_per_bin];
+
+        for(int pos = 0; pos < count; pos++) {
+            const int head = slots[pos];
+            int best_pos = pos;
+            MD_FLOAT best_z = atom_z(head);
+
+            /* find the entry with the smallest z among the unsorted rest */
+            for(int scan = pos + 1; scan < count; scan++) {
+                const int cand = slots[scan];
+                const MD_FLOAT cand_z = atom_z(cand);
+                if(cand_z < best_z) {
+                    best_pos = scan;
+                    best_z = cand_z;
+                }
+            }
+
+            /* swap it into place */
+            slots[pos] = slots[best_pos];
+            slots[best_pos] = head;
+        }
+    }
+
+    DEBUG_MESSAGE("sortAtomsByZCoord end\n");
+}
+
+/*
+ * Groups the local atoms into i-clusters of CLUSTER_M atoms each.
+ *
+ * Atoms are binned and sorted by z first; every bin is then cut into
+ * consecutive clusters. For each cluster the positions, velocities and types
+ * of its atoms are copied into the cluster arrays, unused slots get INFINITY
+ * coordinates, and the bounding box, atom count and bin are recorded. When
+ * CLUSTER_N > CLUSTER_M the number of clusters per bin is rounded up to an
+ * even value.
+ */
+void buildClusters(Atom *atom) {
+    DEBUG_MESSAGE("buildClusters start\n");
+    atom->Nclusters_local = 0;
+
+    /* bin local atoms */
+    binAtoms(atom);
+    sortAtomsByZCoord(atom);
+
+    for(int bin = 0; bin < mbins; bin++) {
+        const int count = bincount[bin];
+        int next = 0; /* position of the next atom to take from this bin */
+        int nclusters = ((count + CLUSTER_M - 1) / CLUSTER_M);
+
+        if(CLUSTER_N > CLUSTER_M && nclusters % 2) {
+            nclusters++;
+        }
+
+        for(int cl = 0; cl < nclusters; cl++) {
+            const int ci = atom->Nclusters_local;
+            if(ci >= atom->Nclusters_max) {
+                growClusters(atom);
+            }
+
+            /* take the array pointers only after a possible growClusters() call */
+            MD_FLOAT *pos = &atom->cl_x[CI_VECTOR_BASE_INDEX(ci)];
+            MD_FLOAT *vel = &atom->cl_v[CI_VECTOR_BASE_INDEX(ci)];
+            int *types = &atom->cl_type[CI_SCALAR_BASE_INDEX(ci)];
+            MD_FLOAT xlo = INFINITY, xhi = -INFINITY;
+            MD_FLOAT ylo = INFINITY, yhi = -INFINITY;
+            MD_FLOAT zlo = INFINITY, zhi = -INFINITY;
+            int filled = 0;
+
+            for(int slot = 0; slot < CLUSTER_M; slot++, next++) {
+                if(next >= count) {
+                    /* no atom left for this slot: pad with far-away coordinates */
+                    pos[CL_X_OFFSET + slot] = INFINITY;
+                    pos[CL_Y_OFFSET + slot] = INFINITY;
+                    pos[CL_Z_OFFSET + slot] = INFINITY;
+                    continue;
+                }
+
+                const int i = bins[bin * atoms_per_bin + next];
+                const MD_FLOAT px = atom_x(i);
+                const MD_FLOAT py = atom_y(i);
+                const MD_FLOAT pz = atom_z(i);
+
+                pos[CL_X_OFFSET + slot] = px;
+                pos[CL_Y_OFFSET + slot] = py;
+                pos[CL_Z_OFFSET + slot] = pz;
+                vel[CL_X_OFFSET + slot] = atom->vx[i];
+                vel[CL_Y_OFFSET + slot] = atom->vy[i];
+                vel[CL_Z_OFFSET + slot] = atom->vz[i];
+
+                // TODO: To create the bounding boxes faster, we can use SIMD operations
+                if(px < xlo) { xlo = px; }
+                if(px > xhi) { xhi = px; }
+                if(py < ylo) { ylo = py; }
+                if(py > yhi) { yhi = py; }
+                if(pz < zlo) { zlo = pz; }
+                if(pz > zhi) { zhi = pz; }
+
+                types[slot] = atom->type[i];
+                filled++;
+            }
+
+            atom->iclusters[ci].natoms = filled;
+            atom->iclusters[ci].bbminx = xlo;
+            atom->iclusters[ci].bbmaxx = xhi;
+            atom->iclusters[ci].bbminy = ylo;
+            atom->iclusters[ci].bbmaxy = yhi;
+            atom->iclusters[ci].bbminz = zlo;
+            atom->iclusters[ci].bbmaxz = zhi;
+            atom->icluster_bin[ci] = bin;
+            atom->Nclusters_local++;
+        }
+    }
+
+    DEBUG_MESSAGE("buildClusters end\n");
+}
+
+/*
+ * Derives the j-clusters (size CLUSTER_N) from the local i-clusters
+ * (size CLUSTER_M), filling in their bounding boxes and atom counts.
+ *
+ *  - CLUSTER_M == CLUSTER_N: the j-cluster is a copy of the i-cluster data.
+ *  - CLUSTER_M >  CLUSTER_N: the i-cluster is split in two j-clusters; the
+ *    first covers atoms [0, CLUSTER_N), the second the remaining atoms
+ *    [CLUSTER_N, natoms). Only atoms that actually exist are counted and
+ *    enter the bounding boxes.
+ *  - CLUSTER_M <  CLUSTER_N: every even i-cluster is merged with its
+ *    successor into one j-cluster.
+ */
+void defineJClusters(Atom *atom) {
+    DEBUG_MESSAGE("defineJClusters start\n");
+
+    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
+        int cj0 = CJ0_FROM_CI(ci);
+
+        if(CLUSTER_M == CLUSTER_N) {
+            atom->jclusters[cj0].bbminx = atom->iclusters[ci].bbminx;
+            atom->jclusters[cj0].bbmaxx = atom->iclusters[ci].bbmaxx;
+            atom->jclusters[cj0].bbminy = atom->iclusters[ci].bbminy;
+            atom->jclusters[cj0].bbmaxy = atom->iclusters[ci].bbmaxy;
+            atom->jclusters[cj0].bbminz = atom->iclusters[ci].bbminz;
+            atom->jclusters[cj0].bbmaxz = atom->iclusters[ci].bbmaxz;
+            atom->jclusters[cj0].natoms = atom->iclusters[ci].natoms;
+
+        } else if(CLUSTER_M > CLUSTER_N) {
+            int cj1 = CJ1_FROM_CI(ci);
+            int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
+            MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
+            // Atoms of the i-cluster that fall into the first / second j-cluster
+            const int natoms_cj0 = MIN(atom->iclusters[ci].natoms, CLUSTER_N);
+            const int natoms_cj1 = MAX(0, atom->iclusters[ci].natoms - CLUSTER_N);
+            MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
+            MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
+            MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;
+
+            // First j-cluster: existing atoms in [0, CLUSTER_N)
+            for(int cii = 0; cii < natoms_cj0; cii++) {
+                MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii];
+                MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii];
+                MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii];
+
+                // TODO: To create the bounding boxes faster, we can use SIMD operations
+                if(bbminx > xtmp) { bbminx = xtmp; }
+                if(bbmaxx < xtmp) { bbmaxx = xtmp; }
+                if(bbminy > ytmp) { bbminy = ytmp; }
+                if(bbmaxy < ytmp) { bbmaxy = ytmp; }
+                if(bbminz > ztmp) { bbminz = ztmp; }
+                if(bbmaxz < ztmp) { bbmaxz = ztmp; }
+            }
+
+            atom->jclusters[cj0].bbminx = bbminx;
+            atom->jclusters[cj0].bbmaxx = bbmaxx;
+            atom->jclusters[cj0].bbminy = bbminy;
+            atom->jclusters[cj0].bbmaxy = bbmaxy;
+            atom->jclusters[cj0].bbminz = bbminz;
+            atom->jclusters[cj0].bbmaxz = bbmaxz;
+            atom->jclusters[cj0].natoms = natoms_cj0;
+
+            bbminx = INFINITY, bbmaxx = -INFINITY;
+            bbminy = INFINITY, bbmaxy = -INFINITY;
+            bbminz = INFINITY, bbmaxz = -INFINITY;
+
+            // Second j-cluster: existing atoms in [CLUSTER_N, natoms)
+            for(int cii = CLUSTER_N; cii < atom->iclusters[ci].natoms; cii++) {
+                MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii];
+                MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii];
+                MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii];
+
+                // TODO: To create the bounding boxes faster, we can use SIMD operations
+                if(bbminx > xtmp) { bbminx = xtmp; }
+                if(bbmaxx < xtmp) { bbmaxx = xtmp; }
+                if(bbminy > ytmp) { bbminy = ytmp; }
+                if(bbmaxy < ytmp) { bbmaxy = ytmp; }
+                if(bbminz > ztmp) { bbminz = ztmp; }
+                if(bbmaxz < ztmp) { bbmaxz = ztmp; }
+            }
+
+            atom->jclusters[cj1].bbminx = bbminx;
+            atom->jclusters[cj1].bbmaxx = bbmaxx;
+            atom->jclusters[cj1].bbminy = bbminy;
+            atom->jclusters[cj1].bbmaxy = bbmaxy;
+            atom->jclusters[cj1].bbminz = bbminz;
+            atom->jclusters[cj1].bbmaxz = bbmaxz;
+            atom->jclusters[cj1].natoms = natoms_cj1;
+
+        } else {
+            // Two consecutive i-clusters share one j-cluster; it is filled in
+            // once, when the even i-cluster of the pair is visited
+            if(ci % 2 == 0) {
+                const int ci1 = ci + 1;
+                atom->jclusters[cj0].bbminx = MIN(atom->iclusters[ci].bbminx, atom->iclusters[ci1].bbminx);
+                atom->jclusters[cj0].bbmaxx = MAX(atom->iclusters[ci].bbmaxx, atom->iclusters[ci1].bbmaxx);
+                atom->jclusters[cj0].bbminy = MIN(atom->iclusters[ci].bbminy, atom->iclusters[ci1].bbminy);
+                atom->jclusters[cj0].bbmaxy = MAX(atom->iclusters[ci].bbmaxy, atom->iclusters[ci1].bbmaxy);
+                atom->jclusters[cj0].bbminz = MIN(atom->iclusters[ci].bbminz, atom->iclusters[ci1].bbminz);
+                atom->jclusters[cj0].bbmaxz = MAX(atom->iclusters[ci].bbmaxz, atom->iclusters[ci1].bbmaxz);
+                atom->jclusters[cj0].natoms = atom->iclusters[ci].natoms + atom->iclusters[ci1].natoms;
+            }
+        }
+    }
+
+    DEBUG_MESSAGE("defineJClusters end\n");
+}
+
+/*
+ * Fills bin_clusters / bin_nclusters with the j-clusters belonging to each bin.
+ *
+ * Local j-clusters are stored in the bin of the i-cluster they derive from.
+ * Ghost j-clusters are binned by the x/y position of their atoms and inserted
+ * so that the clusters of a bin stay ordered by their minimum z coordinate.
+ * If a bin runs out of space, clusters_per_bin is doubled and the whole
+ * binning pass starts over.
+ */
+void binClusters(Atom *atom) {
+    DEBUG_MESSAGE("binClusters start\n");
+
+    /*
+    DEBUG_MESSAGE("Nghost = %d\n", atom->Nclusters_ghost);
+    for(int ci = atom->Nclusters_local; ci < atom->Nclusters_local + 4; ci++) {
+        MD_FLOAT *cptr = cluster_pos_ptr(ci);
+        DEBUG_MESSAGE("Cluster %d:\n", ci);
+        DEBUG_MESSAGE("bin=%d, Natoms=%d, bbox={%f,%f},{%f,%f},{%f,%f}\n",
+            atom->icluster_bin[ci],
+            atom->clusters[ci].natoms,
+            atom->clusters[ci].bbminx,
+            atom->clusters[ci].bbmaxx,
+            atom->clusters[ci].bbminy,
+            atom->clusters[ci].bbmaxy,
+            atom->clusters[ci].bbminz,
+            atom->clusters[ci].bbmaxz);
+
+        for(int cii = 0; cii < CLUSTER_M; cii++) {
+            DEBUG_MESSAGE("%f, %f, %f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii));
+        }
+    }
+    */
+
+    const int nlocal = atom->Nclusters_local;
+    // Number of i-clusters per j-cluster (at least 1)
+    const int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
+    // Ghost j-clusters are indexed starting at ncj (see cj = ncj + cg below)
+    const int ncj = atom->Nclusters_local / jfac;
+
+    int resize = 1;
+    while(resize > 0) {
+        resize = 0;
+
+        for(int bin = 0; bin < mbins; bin++) {
+            bin_nclusters[bin] = 0;
+        }
+
+        // Local clusters: stop at the first full bin, the pass is redone after resizing
+        for(int ci = 0; ci < nlocal && !resize; ci++) {
+            // Assure we add this j-cluster only once in the bin
+            if(!(CLUSTER_M < CLUSTER_N && ci % 2)) {
+                int bin = atom->icluster_bin[ci];
+                int c = bin_nclusters[bin];
+                // Require room for two entries, since a second j-cluster may be added below
+                if(c + 1 < clusters_per_bin) {
+                    bin_clusters[bin * clusters_per_bin + c] = CJ0_FROM_CI(ci);
+                    bin_nclusters[bin]++;
+
+                    if(CLUSTER_M > CLUSTER_N) {
+                        int cj1 = CJ1_FROM_CI(ci);
+                        if(atom->jclusters[cj1].natoms > 0) {
+                            bin_clusters[bin * clusters_per_bin + c + 1] = cj1;
+                            bin_nclusters[bin]++;
+                        }
+                    }
+                } else {
+                    resize = 1;
+                }
+            }
+        }
+
+        // Ghost clusters
+        for(int cg = 0; cg < atom->Nclusters_ghost && !resize; cg++) {
+            const int cj = ncj + cg;
+            int ix = -1, iy = -1;
+            MD_FLOAT xtmp, ytmp;
+
+            if(atom->jclusters[cj].natoms > 0) {
+                int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
+                MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
+                MD_FLOAT cj_minz = atom->jclusters[cj].bbminz;
+
+                // Start with the bin of the first atom, clamped to the bin grid
+                xtmp = cj_x[CL_X_OFFSET + 0];
+                ytmp = cj_x[CL_Y_OFFSET + 0];
+                coord2bin2D(xtmp, ytmp, &ix, &iy);
+                ix = MAX(MIN(ix, mbinx - 1), 0);
+                iy = MAX(MIN(iy, mbiny - 1), 0);
+                for(int cjj = 1; cjj < atom->jclusters[cj].natoms; cjj++) {
+                    int nix, niy;
+                    xtmp = cj_x[CL_X_OFFSET + cjj];
+                    ytmp = cj_x[CL_Y_OFFSET + cjj];
+                    coord2bin2D(xtmp, ytmp, &nix, &niy);
+                    nix = MAX(MIN(nix, mbinx - 1), 0);
+                    niy = MAX(MIN(niy, mbiny - 1), 0);
+
+                    // Always put the cluster on the bin of its innermost atom so
+                    // the cluster should be closer to local clusters
+                    if(atom->PBCx[cg] > 0 && ix > nix) { ix = nix; }
+                    if(atom->PBCx[cg] < 0 && ix < nix) { ix = nix; }
+                    if(atom->PBCy[cg] > 0 && iy > niy) { iy = niy; }
+                    if(atom->PBCy[cg] < 0 && iy < niy) { iy = niy; }
+                }
+
+                // Same linear index formula (including the + 1) as coord2bin().
+                // NOTE(review): with ix == mbinx - 1 and iy == mbiny - 1 this yields
+                // bin == mbinx * mbiny, one past the last valid bin index -- confirm
+                // that such a ghost position cannot occur.
+                int bin = iy * mbinx + ix + 1;
+                int c = bin_nclusters[bin];
+                if(c < clusters_per_bin) {
+                    // Insert the current ghost cluster in the bin keeping clusters
+                    // sorted by z coordinate
+                    int inserted = 0;
+                    for(int i = 0; i < c; i++) {
+                        int last_cl = bin_clusters[bin * clusters_per_bin + i];
+                        if(atom->jclusters[last_cl].bbminz > cj_minz) {
+                            bin_clusters[bin * clusters_per_bin + i] = cj;
+
+                            // Shift the remaining entries one slot towards the end
+                            for(int j = i + 1; j <= c; j++) {
+                                int tmp = bin_clusters[bin * clusters_per_bin + j];
+                                bin_clusters[bin * clusters_per_bin + j] = last_cl;
+                                last_cl = tmp;
+                            }
+
+                            inserted = 1;
+                            break;
+                        }
+                    }
+
+                    // No entry with a larger minimum z found: append at the end
+                    if(!inserted) {
+                        bin_clusters[bin * clusters_per_bin + c] = cj;
+                    }
+
+                    bin_nclusters[bin]++;
+                } else {
+                    resize = 1;
+                }
+            }
+        }
+
+        // The old contents are discarded; the next loop iteration bins everything again
+        if(resize) {
+            free(bin_clusters);
+            clusters_per_bin *= 2;
+            bin_clusters = (int*) malloc(mbins * clusters_per_bin * sizeof(int));
+        }
+    }
+
+    /*
+    DEBUG_MESSAGE("bin_nclusters\n");
+    for(int i = 0; i < mbins; i++) { DEBUG_MESSAGE("%d, ", bin_nclusters[i]); }
+    DEBUG_MESSAGE("\n");
+    */
+
+    DEBUG_MESSAGE("binClusters stop\n");
+}
+
+/*
+ * Copies positions and velocities of all local clusters back into the
+ * per-atom arrays, numbering the atoms consecutively in cluster order.
+ * Prints a warning to stderr if the number of atoms written differs from
+ * atom->Nlocal.
+ */
+void updateSingleAtoms(Atom *atom) {
+    DEBUG_MESSAGE("updateSingleAtoms start\n");
+    int written = 0;
+
+    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
+        MD_FLOAT *pos = &atom->cl_x[CI_VECTOR_BASE_INDEX(ci)];
+        MD_FLOAT *vel = &atom->cl_v[CI_VECTOR_BASE_INDEX(ci)];
+
+        for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++, written++) {
+            /* positions */
+            atom_x(written) = pos[CL_X_OFFSET + cii];
+            atom_y(written) = pos[CL_Y_OFFSET + cii];
+            atom_z(written) = pos[CL_Z_OFFSET + cii];
+            /* velocities */
+            atom->vx[written] = vel[CL_X_OFFSET + cii];
+            atom->vy[written] = vel[CL_Y_OFFSET + cii];
+            atom->vz[written] = vel[CL_Z_OFFSET + cii];
+        }
+    }
+
+    if(written != atom->Nlocal) {
+        fprintf(stderr, "updateSingleAtoms(): Number of atoms changed!\n");
+    }
+
+    DEBUG_MESSAGE("updateSingleAtoms stop\n");
+}
--- a/clusterpair/pbc.c
+++ b/clusterpair/pbc.c
@@ -0,0 +1,231 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#include <pbc.h>
+#include <atom.h>
+#include <allocate.h>
+#include <neighbor.h>
+#include <util.h>
+
+/* Number of entries added to the ghost mapping arrays by each growPbc() call */
+#define DELTA 20000
+
+/* Current capacity (in entries) of border_map, PBCx, PBCy and PBCz */
+static int NmaxGhost;
+
+/* Enlarges the ghost mapping arrays; defined further below */
+static void growPbc(Atom*);
+
+/* exported subroutines */
+/*
+ * Reset the PBC bookkeeping: no ghost mapping storage is allocated yet,
+ * so all mapping pointers are NULL and the recorded capacity is zero.
+ */
+void initPbc(Atom* atom) {
+    atom->PBCx = NULL;
+    atom->PBCy = NULL;
+    atom->PBCz = NULL;
+    atom->border_map = NULL;
+    NmaxGhost = 0;
+}
+
+/* update coordinates of ghost atoms */
+/* uses mapping created in setupPbc */
+/*
+ * Refresh the coordinates of every ghost cluster from the cluster it mirrors
+ * (atom->border_map), shifted by PBCx/PBCy/PBCz multiples of the box lengths.
+ * When firstUpdate is non-zero, the unused slots of each ghost cluster are
+ * additionally filled with INFINITY and the cluster's bounding box is stored
+ * in atom->jclusters.
+ */
+void cpuUpdatePbc(Atom *atom, Parameter *param, int firstUpdate) {
+    DEBUG_MESSAGE("updatePbc start\n");
+    const int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
+    const int first_ghost = atom->Nclusters_local / jfac;
+    const MD_FLOAT xprd = param->xprd;
+    const MD_FLOAT yprd = param->yprd;
+    const MD_FLOAT zprd = param->zprd;
+
+    for(int g = 0; g < atom->Nclusters_ghost; g++) {
+        const int ghost = first_ghost + g;
+        int dst_base = CJ_VECTOR_BASE_INDEX(ghost);
+        int src_base = CJ_VECTOR_BASE_INDEX(atom->border_map[g]);
+        MD_FLOAT *dst = &atom->cl_x[dst_base];
+        MD_FLOAT *src = &atom->cl_x[src_base];
+        MD_FLOAT lox = INFINITY, hix = -INFINITY;
+        MD_FLOAT loy = INFINITY, hiy = -INFINITY;
+        MD_FLOAT loz = INFINITY, hiz = -INFINITY;
+
+        for(int k = 0; k < atom->jclusters[ghost].natoms; k++) {
+            /* image position = source position + shift in box lengths */
+            MD_FLOAT px = src[CL_X_OFFSET + k] + atom->PBCx[g] * xprd;
+            MD_FLOAT py = src[CL_Y_OFFSET + k] + atom->PBCy[g] * yprd;
+            MD_FLOAT pz = src[CL_Z_OFFSET + k] + atom->PBCz[g] * zprd;
+
+            dst[CL_X_OFFSET + k] = px;
+            dst[CL_Y_OFFSET + k] = py;
+            dst[CL_Z_OFFSET + k] = pz;
+
+            if(!firstUpdate) {
+                continue;
+            }
+
+            // TODO: To create the bounding boxes faster, we can use SIMD operations
+            lox = (lox > px) ? px : lox;
+            hix = (hix < px) ? px : hix;
+            loy = (loy > py) ? py : loy;
+            hiy = (hiy < py) ? py : hiy;
+            loz = (loz > pz) ? pz : loz;
+            hiz = (hiz < pz) ? pz : hiz;
+        }
+
+        if(!firstUpdate) {
+            continue;
+        }
+
+        /* pad the unused slots of the cluster */
+        for(int k = atom->jclusters[ghost].natoms; k < CLUSTER_N; k++) {
+            dst[CL_X_OFFSET + k] = INFINITY;
+            dst[CL_Y_OFFSET + k] = INFINITY;
+            dst[CL_Z_OFFSET + k] = INFINITY;
+        }
+
+        atom->jclusters[ghost].bbminx = lox;
+        atom->jclusters[ghost].bbmaxx = hix;
+        atom->jclusters[ghost].bbminy = loy;
+        atom->jclusters[ghost].bbmaxy = hiy;
+        atom->jclusters[ghost].bbminz = loz;
+        atom->jclusters[ghost].bbmaxz = hiz;
+    }
+
+    DEBUG_MESSAGE("updatePbc end\n");
+}
+
+/* relocate atoms that have left domain according
+ * to periodic boundary conditions */
+/*
+ * Wrap local atoms back into the box: a coordinate below 0 is shifted up by
+ * one box length, a coordinate at or above the box length is shifted down by
+ * one box length. Each coordinate is shifted at most once per call.
+ */
+void updateAtomsPbc(Atom *atom, Parameter *param) {
+    const MD_FLOAT xprd = param->xprd;
+    const MD_FLOAT yprd = param->yprd;
+    const MD_FLOAT zprd = param->zprd;
+
+    for(int n = 0; n < atom->Nlocal; n++) {
+        MD_FLOAT *px = &atom_x(n);
+        MD_FLOAT *py = &atom_y(n);
+        MD_FLOAT *pz = &atom_z(n);
+
+        if(*px < 0.0) {
+            *px += xprd;
+        } else if(*px >= xprd) {
+            *px -= xprd;
+        }
+
+        if(*py < 0.0) {
+            *py += yprd;
+        } else if(*py >= yprd) {
+            *py -= yprd;
+        }
+
+        if(*pz < 0.0) {
+            *pz += zprd;
+        } else if(*pz >= zprd) {
+            *pz -= zprd;
+        }
+    }
+}
+
+/* setup periodic boundary conditions by
+ * defining ghost atoms around domain
+ * only creates mapping and coordinate corrections
+ * that are then enforced in updatePbc */
+/*
+ * ADDGHOST(dx,dy,dz): register one ghost cluster that mirrors cluster `cj`,
+ * shifted by (dx,dy,dz) box lengths. It advances Nghost, records the source
+ * cluster and the shift in border_map / PBCx / PBCy / PBCz, copies the atom
+ * count and the per-atom types to the ghost cluster and adds the atom count
+ * to Nghost_atoms.
+ *
+ * The macro uses these names from the expansion site: atom, cj, ncj, Nghost
+ * and Nghost_atoms. It declares local variables, so every use must sit in
+ * its own { } block.
+ *
+ * No `;` follows the parameter list: it would become part of the expansion.
+ */
+#define ADDGHOST(dx,dy,dz)                                                      \
+    Nghost++;                                                                   \
+    const int cg = ncj + Nghost;                                                \
+    const int cj_natoms = atom->jclusters[cj].natoms;                           \
+    atom->border_map[Nghost] = cj;                                              \
+    atom->PBCx[Nghost] = (dx);                                                  \
+    atom->PBCy[Nghost] = (dy);                                                  \
+    atom->PBCz[Nghost] = (dz);                                                  \
+    atom->jclusters[cg].natoms = cj_natoms;                                     \
+    Nghost_atoms += cj_natoms;                                                  \
+    int cj_sca_base = CJ_SCALAR_BASE_INDEX(cj);                                 \
+    int cg_sca_base = CJ_SCALAR_BASE_INDEX(cg);                                 \
+    for(int cjj = 0; cjj < cj_natoms; cjj++) {                                  \
+        atom->cl_type[cg_sca_base + cjj] = atom->cl_type[cj_sca_base + cjj];    \
+    }
+
+/* internal subroutines */
+/*
+ * Enlarge the ghost mapping arrays (border_map, PBCx, PBCy, PBCz) by DELTA
+ * entries each and update the recorded capacity NmaxGhost accordingly.
+ */
+static void growPbc(Atom* atom) {
+    const size_t old_bytes = NmaxGhost * sizeof(int);
+    NmaxGhost += DELTA;
+    const size_t new_bytes = NmaxGhost * sizeof(int);
+
+    atom->border_map = (int*) reallocate(atom->border_map, ALIGNMENT, new_bytes, old_bytes);
+    atom->PBCx = (int*) reallocate(atom->PBCx, ALIGNMENT, new_bytes, old_bytes);
+    atom->PBCy = (int*) reallocate(atom->PBCy, ALIGNMENT, new_bytes, old_bytes);
+    atom->PBCz = (int*) reallocate(atom->PBCz, ALIGNMENT, new_bytes, old_bytes);
+}
+
+/*
+ * Create the ghost clusters for periodic boundary conditions.
+ *
+ * For every non-empty local cluster whose bounding box lies within
+ * param->cutneigh of a box face, one ghost cluster is registered per matching
+ * face, corner and edge image through ADDGHOST. After the loop a dummy
+ * cluster with all coordinates set to INFINITY is placed behind the last
+ * ghost, the ghost counters in `atom` are updated and cpuUpdatePbc() is
+ * called with firstUpdate = 1 to fill in the ghost positions.
+ */
+void setupPbc(Atom *atom, Parameter *param) {
+    DEBUG_MESSAGE("setupPbc start\n");
+    MD_FLOAT xprd = param->xprd;
+    MD_FLOAT yprd = param->yprd;
+    MD_FLOAT zprd = param->zprd;
+    MD_FLOAT Cutneigh = param->cutneigh;
+    int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
+    int ncj = atom->Nclusters_local / jfac;
+    /* Index of the last ghost added so far; -1 means none yet. ADDGHOST
+     * increments it before use. */
+    int Nghost = -1;
+    int Nghost_atoms = 0;
+
+    for(int cj = 0; cj < ncj; cj++) {
+        if(atom->jclusters[cj].natoms > 0) {
+            /* Make sure there is room before adding ghosts for this cluster.
+             * NOTE(review): room for 7 more ghosts is checked, but up to 26
+             * ADDGHOST expansions can fire for a single cluster whose bounding
+             * box is close to opposite faces - confirm 7 is sufficient for the
+             * supported box sizes. */
+            if(atom->Nclusters_local + (Nghost + 7) * jfac >= atom->Nclusters_max) {
+                growClusters(atom);
+            }
+
+            if((Nghost + 7) * jfac >= NmaxGhost) {
+                growPbc(atom);
+            }
+
+            MD_FLOAT bbminx = atom->jclusters[cj].bbminx;
+            MD_FLOAT bbmaxx = atom->jclusters[cj].bbmaxx;
+            MD_FLOAT bbminy = atom->jclusters[cj].bbminy;
+            MD_FLOAT bbmaxy = atom->jclusters[cj].bbmaxy;
+            MD_FLOAT bbminz = atom->jclusters[cj].bbminz;
+            MD_FLOAT bbmaxz = atom->jclusters[cj].bbmaxz;
+
+            /* Setup ghost atoms */
+            /* Each ADDGHOST sits in its own braces because the macro declares
+             * local variables. */
+            /* 6 planes */
+            if (bbminx < Cutneigh)         { ADDGHOST(+1,0,0); }
+            if (bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,0,0); }
+            if (bbminy < Cutneigh)         { ADDGHOST(0,+1,0); }
+            if (bbmaxy >= (yprd-Cutneigh)) { ADDGHOST(0,-1,0); }
+            if (bbminz < Cutneigh)         { ADDGHOST(0,0,+1); }
+            if (bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,0,-1); }
+            /* 8 corners */
+            if (bbminx < Cutneigh         && bbminy < Cutneigh         && bbminz < Cutneigh)         { ADDGHOST(+1,+1,+1); }
+            if (bbminx < Cutneigh         && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh)         { ADDGHOST(+1,-1,+1); }
+            if (bbminx < Cutneigh         && bbminy < Cutneigh         && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); }
+            if (bbminx < Cutneigh         && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,-1,-1); }
+            if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh         && bbminz < Cutneigh)         { ADDGHOST(-1,+1,+1); }
+            if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh)         { ADDGHOST(-1,-1,+1); }
+            if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh         && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,+1,-1); }
+            if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,-1,-1); }
+            /* 12 edges */
+            if (bbminx < Cutneigh         && bbminz < Cutneigh)         { ADDGHOST(+1,0,+1); }
+            if (bbminx < Cutneigh         && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,0,-1); }
+            if (bbmaxx >= (xprd-Cutneigh) && bbminz < Cutneigh)         { ADDGHOST(-1,0,+1); }
+            if (bbmaxx >= (xprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,0,-1); }
+            if (bbminy < Cutneigh         && bbminz < Cutneigh)         { ADDGHOST(0,+1,+1); }
+            if (bbminy < Cutneigh         && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,+1,-1); }
+            if (bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh)         { ADDGHOST(0,-1,+1); }
+            if (bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,-1,-1); }
+            if (bbminy < Cutneigh         && bbminx < Cutneigh)         { ADDGHOST(+1,+1,0); }
+            if (bbminy < Cutneigh         && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,+1,0); }
+            if (bbmaxy >= (yprd-Cutneigh) && bbminx < Cutneigh)         { ADDGHOST(+1,-1,0); }
+            if (bbmaxy >= (yprd-Cutneigh) && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,-1,0); }
+        }
+    }
+
+    /* Room for the dummy cluster written below */
+    if(ncj + (Nghost + 1) * jfac >= atom->Nclusters_max) {
+        growClusters(atom);
+    }
+
+    // Add dummy cluster at the end
+    int cj_vec_base = CJ_VECTOR_BASE_INDEX(ncj + Nghost + 1);
+    MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
+    for(int cjj = 0; cjj < CLUSTER_N; cjj++) {
+        cj_x[CL_X_OFFSET + cjj] = INFINITY;
+        cj_x[CL_Y_OFFSET + cjj] = INFINITY;
+        cj_x[CL_Z_OFFSET + cjj] = INFINITY;
+    }
+
+    // Nghost is the index of the last ghost cluster, so Nghost + 1 is the
+    // number of ghost clusters
+    atom->dummy_cj = ncj + Nghost + 1;
+    atom->Nghost = Nghost_atoms;
+    atom->Nclusters_ghost = Nghost + 1;
+    atom->Nclusters = atom->Nclusters_local + Nghost + 1;
+
+    // Update created ghost clusters positions
+    cpuUpdatePbc(atom, param, 1);
+    DEBUG_MESSAGE("setupPbc end\n");
+}
--- a/clusterpair/stats.c
+++ b/clusterpair/stats.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdio.h>
+
+#include <atom.h>
+#include <parameter.h>
+#include <stats.h>
+#include <timers.h>
+
+/* Reset every statistics counter in `s` to zero. */
+void initStats(Stats *s) {
+    /* force kernel counters */
+    s->force_iters = 0;
+    s->num_neighs = 0;
+    s->calculated_forces = 0;
+
+    /* cutoff efficiency counters */
+    s->atoms_outside_cutoff = 0;
+    s->atoms_within_cutoff = 0;
+    s->clusters_outside_cutoff = 0;
+    s->clusters_within_cutoff = 0;
+}
+
+/*
+ * Print the collected run statistics to stdout.
+ * The function body is only compiled when COMPUTE_STATS is defined;
+ * otherwise the call does nothing.
+ *
+ * atom   - supplies Nlocal and Nclusters_local
+ * param  - supplies ntimes and proc_freq
+ * stats  - counters gathered during the run
+ * timer  - timer array; only timer[FORCE] is read
+ */
+void displayStatistics(Atom *atom, Parameter *param, Stats *stats, double *timer) {
+#ifdef COMPUTE_STATS
+
+    const int MxN = CLUSTER_M * CLUSTER_N;
+    /* Nlocal * (ntimes + 1) is formed in double: the int product overflows
+     * for large systems or long runs. */
+    const double atom_steps = (double)(atom->Nlocal) * (double)(param->ntimes + 1);
+    double avg_atoms_cluster = (double)(atom->Nlocal) / (double)(atom->Nclusters_local);
+    double force_useful_volume = 1e-9 * ( atom_steps * (sizeof(MD_FLOAT) * 6 + sizeof(int)) +
+                                          (double)(stats->num_neighs) * (sizeof(MD_FLOAT) * 3 + sizeof(int)) );
+    double avg_neigh_atom = (stats->num_neighs * CLUSTER_N) / atom_steps;
+    double avg_neigh_cluster = (double)(stats->num_neighs) / (double)(stats->calculated_forces);
+    double avg_simd = stats->force_iters / atom_steps;
+
+    #ifdef EXPLICIT_TYPES
+    force_useful_volume += 1e-9 * (atom_steps + (double)(stats->num_neighs)) * sizeof(int);
+    #endif
+
+    printf("Statistics:\n");
+    printf("\tVector width: %d, Processor frequency: %.4f GHz\n", VECTOR_WIDTH, param->proc_freq);
+    printf("\tAverage atoms per cluster: %.4f\n", avg_atoms_cluster);
+    printf("\tAverage neighbors per atom: %.4f\n", avg_neigh_atom);
+    printf("\tAverage neighbors per cluster: %.4f\n", avg_neigh_cluster);
+    printf("\tAverage SIMD iterations per atom: %.4f\n", avg_simd);
+    /* %lld requires long long arguments; the casts make that explicit */
+    printf("\tTotal number of computed pair interactions: %lld\n", (long long)(stats->num_neighs) * MxN);
+    printf("\tTotal number of SIMD iterations: %lld\n", (long long)(stats->force_iters));
+    printf("\tUseful read data volume for force computation: %.2fGB\n", force_useful_volume);
+    printf("\tCycles/SIMD iteration: %.4f\n", timer[FORCE] * param->proc_freq * 1e9 / stats->force_iters);
+
+    #ifdef USE_REFERENCE_VERSION
+    const double atoms_eff = (double)stats->atoms_within_cutoff / (double)(stats->atoms_within_cutoff + stats->atoms_outside_cutoff) * 100.0;
+    printf("\tAtoms within/outside cutoff radius: %lld/%lld (%.2f%%)\n", (long long)(stats->atoms_within_cutoff), (long long)(stats->atoms_outside_cutoff), atoms_eff);
+    const double clusters_eff = (double)stats->clusters_within_cutoff / (double)(stats->clusters_within_cutoff + stats->clusters_outside_cutoff) * 100.0;
+    printf("\tClusters within/outside cutoff radius: %lld/%lld (%.2f%%)\n", (long long)(stats->clusters_within_cutoff), (long long)(stats->clusters_outside_cutoff), clusters_eff);
+    #endif
+
+#endif
+}
--- a/clusterpair/tracing.c
+++ b/clusterpair/tracing.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <neighbor.h>
+#include <parameter.h>
+#include <atom.h>
+#include <tracing.h>
+
+/*
+ * Emit memory and index traces for one pass over the neighbor lists.
+ * For every local atom the position reads are traced, followed by the
+ * neighbor index and position reads of each of its neighbors (plus the type
+ * reads when EXPLICIT_TYPES is defined).
+ *
+ * `param` and `timestep` are not referenced directly in this body; the
+ * tracer macros may use them - TODO confirm against tracing.h.
+ */
+void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timestep) {
+    MEM_TRACER_INIT;
+    INDEX_TRACER_INIT;
+    int Nlocal = atom->Nlocal;
+    int *neighs;
+    /* NOTE(review): not used anywhere in this function */
+    unsigned int *neighs_imask;
+    //MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
+
+    INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
+    for(int i = 0; i < Nlocal; i++) {
+        /* neighbor list of atom i: maxneighs slots per atom, numneigh[i] used */
+        neighs = &neighbor->neighbors[i * neighbor->maxneighs];
+        int numneighs = neighbor->numneigh[i];
+        MEM_TRACE(atom_x(i), 'R');
+        MEM_TRACE(atom_y(i), 'R');
+        MEM_TRACE(atom_z(i), 'R');
+        INDEX_TRACE_ATOM(i);
+
+        #ifdef EXPLICIT_TYPES
+        MEM_TRACE(atom->type[i], 'R');
+        #endif
+
+        DIST_TRACE_SORT(neighs, numneighs);
+        INDEX_TRACE(neighs, numneighs);
+        DIST_TRACE(neighs, numneighs);
+
+        /* reads performed for every neighbor j of atom i */
+        for(int k = 0; k < numneighs; k++) {
+            int j = neighs[k];
+            MEM_TRACE(j, 'R');
+            MEM_TRACE(atom_x(j), 'R');
+            MEM_TRACE(atom_y(j), 'R');
+            MEM_TRACE(atom_z(j), 'R');
+
+            #ifdef EXPLICIT_TYPES
+            MEM_TRACE(atom->type[j], 'R');
+            #endif
+        }
+
+        /* force accesses are currently not traced */
+        /*
+        MEM_TRACE(fx[i], 'R');
+        MEM_TRACE(fx[i], 'W');
+        MEM_TRACE(fy[i], 'R');
+        MEM_TRACE(fy[i], 'W');
+        MEM_TRACE(fz[i], 'R');
+        MEM_TRACE(fz[i], 'W');
+        */
+    }
+
+    INDEX_TRACER_END;
+    MEM_TRACER_END;
+}
--- a/clusterpair/vtk.c
+++ b/clusterpair/vtk.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <atom.h>
+#include <vtk.h>
+
+/*
+ * Write all four VTK files for one timestep: local atoms, ghost atoms,
+ * local cluster edges and ghost cluster edges. The status codes returned
+ * by the individual writers are discarded.
+ */
+void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep) {
+    (void) write_local_atoms_to_vtk_file(filename, atom, timestep);
+    (void) write_ghost_atoms_to_vtk_file(filename, atom, timestep);
+    (void) write_local_cluster_edges_to_vtk_file(filename, atom, timestep);
+    (void) write_ghost_cluster_edges_to_vtk_file(filename, atom, timestep);
+}
+
+/*
+ * Write the local atoms as points of an ASCII legacy VTK unstructured grid
+ * to "<filename>_local_<timestep>.vtk". Every atom becomes one single-point
+ * cell with a "mass" scalar of 1.0.
+ *
+ * Returns 0 on success and -1 if the file could not be opened.
+ */
+int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
+    char path[128];
+    snprintf(path, sizeof path, "%s_local_%d.vtk", filename, timestep);
+
+    FILE* out = fopen(path, "wb");
+    if(out == NULL) {
+        fprintf(stderr, "Could not open VTK file for writing!\n");
+        return -1;
+    }
+
+    /* file header */
+    fputs("# vtk DataFile Version 2.0\n"
+          "Particle data\n"
+          "ASCII\n"
+          "DATASET UNSTRUCTURED_GRID\n", out);
+
+    /* point coordinates, cluster by cluster */
+    fprintf(out, "POINTS %d double\n", atom->Nlocal);
+    for(int cl = 0; cl < atom->Nclusters_local; ++cl) {
+        int base = CI_VECTOR_BASE_INDEX(cl);
+        MD_FLOAT *pos = &atom->cl_x[base];
+        for(int k = 0; k < atom->iclusters[cl].natoms; ++k) {
+            fprintf(out, "%.4f %.4f %.4f\n", pos[CL_X_OFFSET + k], pos[CL_Y_OFFSET + k], pos[CL_Z_OFFSET + k]);
+        }
+    }
+    fputs("\n\n", out);
+
+    /* one cell per point */
+    fprintf(out, "CELLS %d %d\n", atom->Nlocal, atom->Nlocal * 2);
+    for(int p = 0; p < atom->Nlocal; ++p) {
+        fprintf(out, "1 %d\n", p);
+    }
+    fputs("\n\n", out);
+
+    fprintf(out, "CELL_TYPES %d\n", atom->Nlocal);
+    for(int p = 0; p < atom->Nlocal; ++p) {
+        fputs("1\n", out);
+    }
+    fputs("\n\n", out);
+
+    /* constant per-point scalar */
+    fprintf(out, "POINT_DATA %d\n", atom->Nlocal);
+    fputs("SCALARS mass double\n"
+          "LOOKUP_TABLE default\n", out);
+    for(int p = 0; p < atom->Nlocal; p++) {
+        fputs("1.0\n", out);
+    }
+    fputs("\n\n", out);
+
+    fclose(out);
+    return 0;
+}
+
+/*
+ * Write the ghost atoms as points of an ASCII legacy VTK unstructured grid
+ * to "<filename>_ghost_<timestep>.vtk". Every atom becomes one single-point
+ * cell with a "mass" scalar of 1.0.
+ *
+ * NOTE(review): the ghost clusters are addressed through the i-cluster
+ * index macro and atom->iclusters, starting at Nclusters_local - confirm
+ * this matches where the ghost clusters are stored.
+ *
+ * Returns 0 on success and -1 if the file could not be opened.
+ */
+int write_ghost_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
+    char path[128];
+    snprintf(path, sizeof path, "%s_ghost_%d.vtk", filename, timestep);
+
+    FILE* out = fopen(path, "wb");
+    if(out == NULL) {
+        fprintf(stderr, "Could not open VTK file for writing!\n");
+        return -1;
+    }
+
+    /* file header */
+    fputs("# vtk DataFile Version 2.0\n"
+          "Particle data\n"
+          "ASCII\n"
+          "DATASET UNSTRUCTURED_GRID\n", out);
+
+    /* point coordinates, cluster by cluster */
+    fprintf(out, "POINTS %d double\n", atom->Nghost);
+    for(int cl = atom->Nclusters_local; cl < atom->Nclusters_local + atom->Nclusters_ghost; ++cl) {
+        int base = CI_VECTOR_BASE_INDEX(cl);
+        MD_FLOAT *pos = &atom->cl_x[base];
+        for(int k = 0; k < atom->iclusters[cl].natoms; ++k) {
+            fprintf(out, "%.4f %.4f %.4f\n", pos[CL_X_OFFSET + k], pos[CL_Y_OFFSET + k], pos[CL_Z_OFFSET + k]);
+        }
+    }
+    fputs("\n\n", out);
+
+    /* one cell per point */
+    fprintf(out, "CELLS %d %d\n", atom->Nghost, atom->Nghost * 2);
+    for(int p = 0; p < atom->Nghost; ++p) {
+        fprintf(out, "1 %d\n", p);
+    }
+    fputs("\n\n", out);
+
+    fprintf(out, "CELL_TYPES %d\n", atom->Nghost);
+    for(int p = 0; p < atom->Nghost; ++p) {
+        fputs("1\n", out);
+    }
+    fputs("\n\n", out);
+
+    /* constant per-point scalar */
+    fprintf(out, "POINT_DATA %d\n", atom->Nghost);
+    fputs("SCALARS mass double\n"
+          "LOOKUP_TABLE default\n", out);
+    for(int p = 0; p < atom->Nghost; p++) {
+        fputs("1.0\n", out);
+    }
+    fputs("\n\n", out);
+
+    fclose(out);
+    return 0;
+}
+
+/*
+ * Write the local clusters as polylines of an ASCII legacy VTK polydata set
+ * to "<filename>_local_edges_<timestep>.vtk". The points are the atoms in
+ * cluster order; each cluster contributes one line through its atoms.
+ *
+ * Returns 0 on success and -1 if the file could not be opened.
+ */
+int write_local_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep) {
+    char path[128];
+    snprintf(path, sizeof path, "%s_local_edges_%d.vtk", filename, timestep);
+
+    FILE* out = fopen(path, "wb");
+    if(out == NULL) {
+        fprintf(stderr, "Could not open VTK file for writing!\n");
+        return -1;
+    }
+
+    const int nclusters = atom->Nclusters_local;
+    int npoints = 0;     /* sum of natoms over all written clusters */
+    int next_point = 0;  /* running point index for the LINES section */
+
+    /* file header */
+    fputs("# vtk DataFile Version 2.0\n"
+          "Particle data\n"
+          "ASCII\n"
+          "DATASET POLYDATA\n", out);
+
+    fprintf(out, "POINTS %d double\n", atom->Nlocal);
+    for(int cl = 0; cl < nclusters; ++cl) {
+        int base = CI_VECTOR_BASE_INDEX(cl);
+        MD_FLOAT *pos = &atom->cl_x[base];
+        for(int k = 0; k < atom->iclusters[cl].natoms; ++k) {
+            fprintf(out, "%.4f %.4f %.4f\n", pos[CL_X_OFFSET + k], pos[CL_Y_OFFSET + k], pos[CL_Z_OFFSET + k]);
+        }
+
+        npoints += atom->iclusters[cl].natoms;
+    }
+    fputs("\n\n", out);
+
+    /* one line per cluster: "<count> <idx> <idx> ..." */
+    fprintf(out, "LINES %d %d\n", nclusters, nclusters + npoints);
+    for(int cl = 0; cl < nclusters; ++cl) {
+        fprintf(out, "%d ", atom->iclusters[cl].natoms);
+        for(int k = 0; k < atom->iclusters[cl].natoms; ++k) {
+            fprintf(out, "%d ", next_point);
+            next_point++;
+        }
+
+        fputs("\n", out);
+    }
+    fputs("\n\n", out);
+
+    fclose(out);
+    return 0;
+}
+
+/*
+ * Write the ghost clusters as polylines of an ASCII legacy VTK polydata set
+ * to "<filename>_ghost_edges_<timestep>.vtk". The points are the ghost
+ * atoms in cluster order; each cluster contributes one line through its
+ * atoms.
+ *
+ * NOTE(review): the ghost clusters are addressed through the i-cluster
+ * index macro and atom->iclusters, starting at Nclusters_local - confirm
+ * this matches where the ghost clusters are stored.
+ *
+ * Returns 0 on success and -1 if the file could not be opened.
+ */
+int write_ghost_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep) {
+    char path[128];
+    snprintf(path, sizeof path, "%s_ghost_edges_%d.vtk", filename, timestep);
+
+    FILE* out = fopen(path, "wb");
+    if(out == NULL) {
+        fprintf(stderr, "Could not open VTK file for writing!\n");
+        return -1;
+    }
+
+    const int first = atom->Nclusters_local;
+    const int end = atom->Nclusters_local + atom->Nclusters_ghost;
+    int npoints = 0;     /* sum of natoms over all written clusters */
+    int next_point = 0;  /* running point index for the LINES section */
+
+    /* file header */
+    fputs("# vtk DataFile Version 2.0\n"
+          "Particle data\n"
+          "ASCII\n"
+          "DATASET POLYDATA\n", out);
+
+    fprintf(out, "POINTS %d double\n", atom->Nghost);
+    for(int cl = first; cl < end; ++cl) {
+        int base = CI_VECTOR_BASE_INDEX(cl);
+        MD_FLOAT *pos = &atom->cl_x[base];
+        for(int k = 0; k < atom->iclusters[cl].natoms; ++k) {
+            fprintf(out, "%.4f %.4f %.4f\n", pos[CL_X_OFFSET + k], pos[CL_Y_OFFSET + k], pos[CL_Z_OFFSET + k]);
+        }
+
+        npoints += atom->iclusters[cl].natoms;
+    }
+    fputs("\n\n", out);
+
+    /* one line per cluster: "<count> <idx> <idx> ..." */
+    fprintf(out, "LINES %d %d\n", atom->Nclusters_ghost, atom->Nclusters_ghost + npoints);
+    for(int cl = first; cl < end; ++cl) {
+        fprintf(out, "%d ", atom->iclusters[cl].natoms);
+        for(int k = 0; k < atom->iclusters[cl].natoms; ++k) {
+            fprintf(out, "%d ", next_point);
+            next_point++;
+        }
+
+        fputs("\n", out);
+    }
+    fputs("\n\n", out);
+
+    fclose(out);
+    return 0;
+}
--- a/clusterpair/xtc.c
+++ b/clusterpair/xtc.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdlib.h>
+//---
+#include <atom.h>
+#include <allocate.h>
+#include <xtc.h>
+
+#ifdef XTC_OUTPUT
+#include <gromacs/fileio/xtcio.h>
+
+static struct t_fileio *xtc_file = NULL;
+static rvec *x_buf = NULL;
+static rvec basis[3];
+
+/*
+ * Open the XTC output file, set the basis to the identity matrix, allocate
+ * the coordinate buffer (Nlocal + 1 entries) and write the first frame.
+ */
+void xtc_init(const char *filename, Atom *atom, int timestep) {
+    /* identity basis, one row per line */
+    basis[0][XX] = 1.0; basis[0][YY] = 0.0; basis[0][ZZ] = 0.0;
+    basis[1][XX] = 0.0; basis[1][YY] = 1.0; basis[1][ZZ] = 0.0;
+    basis[2][XX] = 0.0; basis[2][YY] = 0.0; basis[2][ZZ] = 1.0;
+
+    xtc_file = open_xtc(filename, "w");
+
+    const size_t buf_bytes = sizeof(rvec) * (atom->Nlocal + 1);
+    x_buf = (rvec *) allocate(ALIGNMENT, buf_bytes);
+
+    xtc_write(atom, timestep, 1, 1);
+}
+
+/*
+ * Append one frame with the positions of all local atoms to the XTC file.
+ * The positions are gathered from the cluster array into x_buf in cluster
+ * order and then handed to write_xtc().
+ *
+ * write_pos and write_vel are accepted for interface compatibility but are
+ * not used: only positions are written.
+ */
+void xtc_write(Atom *atom, int timestep, int write_pos, int write_vel) {
+    (void) write_pos;
+    (void) write_vel;
+
+    int i = 0;
+    for(int ci = 0; ci < atom->Nclusters_local; ++ci) {
+        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
+        MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
+        /* atom counts live in atom->iclusters; Atom has no `clusters` member
+         * set up by initAtom, and the other writers use iclusters as well */
+        for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
+            x_buf[i][XX] = ci_x[CL_X_OFFSET + cii];
+            x_buf[i][YY] = ci_x[CL_Y_OFFSET + cii];
+            x_buf[i][ZZ] = ci_x[CL_Z_OFFSET + cii];
+            i++;
+        }
+    }
+
+    write_xtc(xtc_file, atom->Nlocal, timestep, 0.0, (const rvec *) basis, (const rvec *) x_buf, 1000);
+}
+
+/* Close the XTC output file and release the coordinate buffer. */
+void xtc_end() {
+    close_xtc(xtc_file);
+    free(x_buf);
+}
+#endif