diff --git a/lammps/device.c b/common/device.c
similarity index 60%
rename from lammps/device.c
rename to common/device.c
index a90f8cb..d1d51ef 100644
--- a/lammps/device.c
+++ b/common/device.c
@@ -58,24 +58,6 @@ void memsetGPU(void *d_ptr, int value, size_t bytesize) {
     cuda_assert("memsetGPU", cudaMemset(d_ptr, value, bytesize));
 }
 
-void initDevice(Atom *atom, Neighbor *neighbor) {
-    DeviceAtom *d_atom = &(atom->d_atom);
-    DeviceNeighbor *d_neighbor = &(neighbor->d_neighbor);
-
-    d_atom->epsilon         =   (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
-    d_atom->sigma6          =   (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
-    d_atom->cutforcesq      =   (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
-    d_neighbor->neighbors   =   (int *) allocateGPU(sizeof(int) * atom->Nmax * neighbor->maxneighs);
-    d_neighbor->numneigh    =   (int *) allocateGPU(sizeof(int) * atom->Nmax);
-
-    memcpyToGPU(d_atom->x,              atom->x,          sizeof(MD_FLOAT) * atom->Nmax * 3);
-    memcpyToGPU(d_atom->vx,             atom->vx,         sizeof(MD_FLOAT) * atom->Nmax * 3);
-    memcpyToGPU(d_atom->sigma6,         atom->sigma6,     sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
-    memcpyToGPU(d_atom->epsilon,        atom->epsilon,    sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
-    memcpyToGPU(d_atom->cutforcesq,     atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
-    memcpyToGPU(d_atom->type,           atom->type,       sizeof(int) * atom->Nmax);
-}
-
 #else
 void initDevice(Atom *atom, Neighbor *neighbor) {}
 void *allocateGPU(size_t bytesize) { return NULL; }
diff --git a/lammps/includes/device.h b/common/includes/device.h
similarity index 96%
rename from lammps/includes/device.h
rename to common/includes/device.h
index 3c6d0c4..f28cc62 100644
--- a/lammps/includes/device.h
+++ b/common/includes/device.h
@@ -4,6 +4,8 @@
  * Use of this source code is governed by a LGPL-3.0
  * license that can be found in the LICENSE file.
  */
+#include <stddef.h>
+//---
 #include <atom.h>
 #include <neighbor.h>
 
diff --git a/gromacs/cuda/force_lj.cu b/gromacs/cuda/force_lj.cu
new file mode 100644
index 0000000..96d3e26
--- /dev/null
+++ b/gromacs/cuda/force_lj.cu
@@ -0,0 +1,317 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+extern "C" {
+
+#include <stdio.h>
+//---
+#include <cuda.h>
+#include <driver_types.h>
+//---
+#include <likwid-marker.h>
+//---
+#include <atom.h>
+#include <device.h>
+#include <neighbor.h>
+#include <parameter.h>
+#include <stats.h>
+#include <timing.h>
+#include <util.h>
+
+}
+
+extern "C" {
+    MD_FLOAT *cuda_cl_x;
+    MD_FLOAT *cuda_cl_v;
+    MD_FLOAT *cuda_cl_f;
+    int *cuda_neighbors;
+    int *cuda_numneigh;
+    int *cuda_natoms;
+    int *natoms;
+    int *ngatoms;
+    int *cuda_border_map;
+    int *cuda_jclusters_natoms;
+    MD_FLOAT *cuda_bbminx, *cuda_bbmaxx;
+    MD_FLOAT *cuda_bbminy, *cuda_bbmaxy;
+    MD_FLOAT *cuda_bbminz, *cuda_bbmaxz;
+    int *cuda_PBCx, *cuda_PBCy, *cuda_PBCz;
+    int isReneighboured;
+}
+
+extern "C"
+void initDevice(Atom *atom, Neighbor *neighbor) {
+    cuda_assert("cudaDeviceSetup", cudaDeviceReset());
+    cuda_assert("cudaDeviceSetup", cudaSetDevice(0));
+    cuda_cl_x               =   (MD_FLOAT *) allocateGPU(atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
+    cuda_cl_v               =   (MD_FLOAT *) allocateGPU(atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
+    cuda_cl_f               =   (MD_FLOAT *) allocateGPU(atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
+    cuda_natoms             =   (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
+    cuda_jclusters_natoms   =   (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
+    cuda_border_map         =   (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
+    cuda_PBCx               =   (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
+    cuda_PBCy               =   (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
+    cuda_PBCz               =   (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
+    cuda_numneigh           =   (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
+    cuda_neighbors          =   (int *) allocateGPU(atom->Nclusters_max * neighbor->maxneighs * sizeof(int));
+    natoms = (int *) malloc(atom->Nclusters_max);
+    ngatoms = (int *) malloc(atom->Nclusters_max);
+    isReneighboured = 1;
+}
+
+extern "C"
+void copyDataToCUDADevice(Atom *atom) {
+    memcpyToGPU(cuda_cl_x, atom->cl_x, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
+    memcpyToGPU(cuda_cl_v, atom->cl_v, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
+    memcpyToGPU(cuda_cl_f, atom->cl_f, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
+
+    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
+        natoms[ci] = atom->iclusters[ci].natoms;
+    }
+
+    memcpyToGPU(cuda_natoms, natoms, atom->Nclusters_local * sizeof(int));
+
+    int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
+    int ncj = atom->Nclusters_local / jfac;
+    for(int cg = 0; cg < atom->Nclusters_ghost; cg++) {
+        const int cj = ncj + cg;
+        ngatoms[cg] = atom->jclusters[cj].natoms;
+    }
+
+    memcpyToGPU(cuda_jclusters_natoms, ngatoms, atom->Nclusters_ghost * sizeof(int));
+    memcpyToGPU(cuda_border_map, atom->border_map, atom->Nclusters_ghost * sizeof(int));
+    memcpyToGPU(cuda_PBCx, atom->PBCx, atom->Nclusters_ghost * sizeof(int));
+    memcpyToGPU(cuda_PBCy, atom->PBCy, atom->Nclusters_ghost * sizeof(int));
+    memcpyToGPU(cuda_PBCz, atom->PBCz, atom->Nclusters_ghost * sizeof(int));
+}
+
+extern "C"
+void copyDataFromCUDADevice(Atom *atom) {
+    memcpyFromGPU(atom->cl_x, cuda_cl_x, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
+    memcpyFromGPU(atom->cl_v, cuda_cl_v, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
+    memcpyFromGPU(atom->cl_f, cuda_cl_f, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
+}
+
+extern "C"
+void cudaDeviceFree() {
+    cuda_assert("cudaDeviceFree", cudaFree(cuda_cl_x));
+    cuda_assert("cudaDeviceFree", cudaFree(cuda_cl_v));
+    cuda_assert("cudaDeviceFree", cudaFree(cuda_cl_f));
+    cuda_assert("cudaDeviceFree", cudaFree(cuda_numneigh));
+    cuda_assert("cudaDeviceFree", cudaFree(cuda_neighbors));
+    cuda_assert("cudaDeviceFree", cudaFree(cuda_natoms));
+    cuda_assert("cudaDeviceFree", cudaFree(cuda_border_map));
+    cuda_assert("cudaDeviceFree", cudaFree(cuda_jclusters_natoms));
+    cuda_assert("cudaDeviceFree", cudaFree(cuda_PBCx));
+    cuda_assert("cudaDeviceFree", cudaFree(cuda_PBCy));
+    cuda_assert("cudaDeviceFree", cudaFree(cuda_PBCz));
+    free(natoms);
+    free(ngatoms);
+}
+
+__global__ void cudaInitialIntegrate_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
+                                         int *cuda_natoms,
+                                         int Nclusters_local, MD_FLOAT dtforce, MD_FLOAT dt) {
+
+    unsigned int ci_pos = blockDim.x * blockIdx.x + threadIdx.x;
+    if (ci_pos >= Nclusters_local) return;
+
+    int ci_vec_base = CI_VECTOR_BASE_INDEX(ci_pos);
+    MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base];
+    MD_FLOAT *ci_v = &cuda_cl_v[ci_vec_base];
+    MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];
+
+    for (int cii = 0; cii < cuda_natoms[ci_pos]; cii++) {
+        ci_v[CL_X_OFFSET + cii] += dtforce * ci_f[CL_X_OFFSET + cii];
+        ci_v[CL_Y_OFFSET + cii] += dtforce * ci_f[CL_Y_OFFSET + cii];
+        ci_v[CL_Z_OFFSET + cii] += dtforce * ci_f[CL_Z_OFFSET + cii];
+        ci_x[CL_X_OFFSET + cii] += dt * ci_v[CL_X_OFFSET + cii];
+        ci_x[CL_Y_OFFSET + cii] += dt * ci_v[CL_Y_OFFSET + cii];
+        ci_x[CL_Z_OFFSET + cii] += dt * ci_v[CL_Z_OFFSET + cii];
+    }
+}
+
+__global__ void cudaUpdatePbc_warp(MD_FLOAT *cuda_cl_x, int *cuda_border_map,
+                                   int *cuda_jclusters_natoms,
+                                   int *cuda_PBCx,
+                                   int *cuda_PBCy,
+                                   int *cuda_PBCz,
+                                   int Nclusters_local,
+                                   int Nclusters_ghost,
+                                   MD_FLOAT param_xprd,
+                                   MD_FLOAT param_yprd,
+                                   MD_FLOAT param_zprd) {
+    unsigned int cg = blockDim.x * blockIdx.x + threadIdx.x;
+    if (cg >= Nclusters_ghost) return;
+
+    int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
+    int ncj = Nclusters_local / jfac;
+    MD_FLOAT xprd = param_xprd;
+    MD_FLOAT yprd = param_yprd;
+    MD_FLOAT zprd = param_zprd;
+
+    const int cj = ncj + cg;
+    int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
+    int bmap_vec_base = CJ_VECTOR_BASE_INDEX(cuda_border_map[cg]);
+    MD_FLOAT *cj_x = &cuda_cl_x[cj_vec_base];
+    MD_FLOAT *bmap_x = &cuda_cl_x[bmap_vec_base];
+
+    for(int cjj = 0; cjj < cuda_jclusters_natoms[cg]; cjj++) {
+        cj_x[CL_X_OFFSET + cjj] = bmap_x[CL_X_OFFSET + cjj] + cuda_PBCx[cg] * xprd;
+        cj_x[CL_Y_OFFSET + cjj] = bmap_x[CL_Y_OFFSET + cjj] + cuda_PBCy[cg] * yprd;
+        cj_x[CL_Z_OFFSET + cjj] = bmap_x[CL_Z_OFFSET + cjj] + cuda_PBCz[cg] * zprd;
+    }
+}
+
+__global__ void computeForceLJ_cuda_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_f,
+                                         int Nclusters_local, int Nclusters_max,
+                                         int *cuda_numneigh, int *cuda_neighs, int half_neigh, int maxneighs,
+                                         MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon) {
+
+    unsigned int ci_pos = blockDim.x * blockIdx.x + threadIdx.x;
+    unsigned int cii_pos = blockDim.y * blockIdx.y + threadIdx.y;
+    unsigned int cjj_pos = blockDim.z * blockIdx.z + threadIdx.z;
+    if ((ci_pos >= Nclusters_local) || (cii_pos >= CLUSTER_M) || (cjj_pos >= CLUSTER_N)) return;
+
+    int ci_cj0 = CJ0_FROM_CI(ci_pos);
+    int ci_vec_base = CI_VECTOR_BASE_INDEX(ci_pos);
+    MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base];
+    MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];
+    int numneighs = cuda_numneigh[ci_pos];
+    for(int k = 0; k < numneighs; k++) {
+        int cj = (&cuda_neighs[ci_pos * maxneighs])[k];
+        int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
+        MD_FLOAT *cj_x = &cuda_cl_x[cj_vec_base];
+        MD_FLOAT *cj_f = &cuda_cl_f[cj_vec_base];
+
+        MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii_pos];
+        MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii_pos];
+        MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii_pos];
+        MD_FLOAT fix = 0;
+        MD_FLOAT fiy = 0;
+        MD_FLOAT fiz = 0;
+
+        int cond;
+#if CLUSTER_M == CLUSTER_N
+        cond = half_neigh ? (ci_cj0 != cj || cii_pos < cjj_pos) :
+               (ci_cj0 != cj || cii_pos != cjj_pos);
+#elif CLUSTER_M < CLUSTER_N
+        cond = half_neigh ? (ci_cj0 != cj || cii_pos + CLUSTER_M * (ci_pos & 0x1) < cjj_pos) :
+                                            (ci_cj0 != cj || cii_pos + CLUSTER_M * (ci_pos & 0x1) != cjj_pos);
+#endif
+        if(cond) {
+            MD_FLOAT delx = xtmp - cj_x[CL_X_OFFSET + cjj_pos];
+            MD_FLOAT dely = ytmp - cj_x[CL_Y_OFFSET + cjj_pos];
+            MD_FLOAT delz = ztmp - cj_x[CL_Z_OFFSET + cjj_pos];
+            MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+            if(rsq < cutforcesq) {
+                MD_FLOAT sr2 = 1.0 / rsq;
+                MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
+                MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
+
+                if(half_neigh) {
+                    atomicAdd(&cj_f[CL_X_OFFSET + cjj_pos], -delx * force);
+                    atomicAdd(&cj_f[CL_Y_OFFSET + cjj_pos], -dely * force);
+                    atomicAdd(&cj_f[CL_Z_OFFSET + cjj_pos], -delz * force);
+                }
+
+                fix += delx * force;
+                fiy += dely * force;
+                fiz += delz * force;
+
+                atomicAdd(&ci_f[CL_X_OFFSET + cii_pos], fix);
+                atomicAdd(&ci_f[CL_Y_OFFSET + cii_pos], fiy);
+                atomicAdd(&ci_f[CL_Z_OFFSET + cii_pos], fiz);
+            }
+        }
+    }
+}
+
+__global__ void cudaFinalIntegrate_warp(MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
+                                          int *cuda_natoms,
+                                          int Nclusters_local, MD_FLOAT dtforce) {
+
+    unsigned int ci_pos = blockDim.x * blockIdx.x + threadIdx.x;
+    if (ci_pos >= Nclusters_local) return;
+
+    int ci_vec_base = CI_VECTOR_BASE_INDEX(ci_pos);
+    MD_FLOAT *ci_v = &cuda_cl_v[ci_vec_base];
+    MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];
+
+    for (int cii = 0; cii < cuda_natoms[ci_pos]; cii++) {
+        ci_v[CL_X_OFFSET + cii] += dtforce * ci_f[CL_X_OFFSET + cii];
+        ci_v[CL_Y_OFFSET + cii] += dtforce * ci_f[CL_Y_OFFSET + cii];
+        ci_v[CL_Z_OFFSET + cii] += dtforce * ci_f[CL_Z_OFFSET + cii];
+    }
+}
+
+extern "C"
+void cudaInitialIntegrate(Parameter *param, Atom *atom) {
+    const int threads_num = 16;
+    dim3 block_size = dim3(threads_num, 1, 1);
+    dim3 grid_size = dim3(atom->Nclusters_local/(threads_num)+1, 1, 1);
+    cudaInitialIntegrate_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_cl_v, cuda_cl_f,
+                                                         cuda_natoms, atom->Nclusters_local, param->dtforce, param->dt);
+    cuda_assert("cudaInitialIntegrate", cudaPeekAtLastError());
+    cuda_assert("cudaInitialIntegrate", cudaDeviceSynchronize());
+}
+
+/* update coordinates of ghost atoms */
+/* uses mapping created in setupPbc */
+extern "C"
+void cudaUpdatePbc(Atom *atom, Parameter *param) {
+    const int threads_num = 512;
+    dim3 block_size = dim3(threads_num, 1, 1);;
+    dim3 grid_size = dim3(atom->Nclusters_ghost/(threads_num)+1, 1, 1);;
+    cudaUpdatePbc_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_border_map,
+                                       cuda_jclusters_natoms, cuda_PBCx, cuda_PBCy, cuda_PBCz,
+                                       atom->Nclusters_local, atom->Nclusters_ghost,
+                                       param->xprd, param->yprd, param->zprd);
+    cuda_assert("cudaUpdatePbc", cudaPeekAtLastError());
+    cuda_assert("cudaUpdatePbc", cudaDeviceSynchronize());
+}
+
+extern "C"
+double computeForceLJ_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
+    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
+    MD_FLOAT sigma6 = param->sigma6;
+    MD_FLOAT epsilon = param->epsilon;
+
+    memsetGPU(cuda_cl_f, 0, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
+    if (isReneighboured) {
+        for(int ci = 0; ci < atom->Nclusters_local; ci++) {
+            memcpyToGPU(&cuda_numneigh[ci], &neighbor->numneigh[ci], sizeof(int));
+            memcpyToGPU(&cuda_neighbors[ci * neighbor->maxneighs], &neighbor->neighbors[ci * neighbor->maxneighs], neighbor->numneigh[ci] * sizeof(int));
+        }
+
+        isReneighboured = 0;
+    }
+
+    const int threads_num = 1;
+    dim3 block_size = dim3(threads_num, CLUSTER_M, CLUSTER_N);
+    dim3 grid_size = dim3(atom->Nclusters_local/threads_num+1, 1, 1);
+    double S = getTimeStamp();
+    LIKWID_MARKER_START("force");
+    computeForceLJ_cuda_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_cl_f,
+                                                        atom->Nclusters_local, atom->Nclusters_max,
+                                                        cuda_numneigh, cuda_neighbors,
+                                                        neighbor->half_neigh, neighbor->maxneighs, cutforcesq,
+                                                        sigma6, epsilon);
+    cuda_assert("computeForceLJ_cuda", cudaPeekAtLastError());
+    cuda_assert("computeForceLJ_cuda", cudaDeviceSynchronize());
+    LIKWID_MARKER_STOP("force");
+    double E = getTimeStamp();
+    return E-S;
+}
+
+extern "C"
+void cudaFinalIntegrate(Parameter *param, Atom *atom) {
+    const int threads_num = 16;
+    dim3 block_size = dim3(threads_num, 1, 1);
+    dim3 grid_size = dim3(atom->Nclusters_local/(threads_num)+1, 1, 1);
+    cudaFinalIntegrate_warp<<<grid_size, block_size>>>(cuda_cl_v, cuda_cl_f, cuda_natoms, atom->Nclusters_local, param->dt);
+    cuda_assert("cudaFinalIntegrate", cudaPeekAtLastError());
+    cuda_assert("cudaFinalIntegrate", cudaDeviceSynchronize());
+}
diff --git a/gromacs/includes/atom.h b/gromacs/includes/atom.h
index 6716db2..8c4cb98 100644
--- a/gromacs/includes/atom.h
+++ b/gromacs/includes/atom.h
@@ -11,29 +11,43 @@
 
 #define DELTA 20000
 
-#define CLUSTER_M 4
-
 // Nbnxn layouts (as of GROMACS):
 // Simd4xN: M=4, N=VECTOR_WIDTH
 // Simd2xNN: M=4, N=(VECTOR_WIDTH/2)
+// Cuda: M=8, N=VECTOR_WIDTH
 
-// Simd2xNN (here used for single-precision)
-#if VECTOR_WIDTH > CLUSTER_M * 2
-#   define KERNEL_NAME              "Simd2xNN"
-#   define CLUSTER_N                (VECTOR_WIDTH / 2)
-#   define computeForceLJ           computeForceLJ_2xnn
-// Simd4xN
-#else
-#   define KERNEL_NAME              "Simd4xN"
+#ifdef CUDA_TARGET
+#   undef VECTOR_WIDTH
+#   define VECTOR_WIDTH             8
+#   define KERNEL_NAME              "CUDA"
+#   define CLUSTER_M                8
 #   define CLUSTER_N                VECTOR_WIDTH
-#   define computeForceLJ           computeForceLJ_4xn
-#endif
-
-#ifdef USE_REFERENCE_VERSION
-#   undef KERNEL_NAME
-#   undef computeForceLJ
-#   define KERNEL_NAME              "Reference"
-#   define computeForceLJ           computeForceLJ_ref
+#   define computeForceLJ           computeForceLJ_cuda
+#   define initialIntegrate         cudaInitialIntegrate
+#   define finalIntegrate           cudaFinalIntegrate
+#   define updatePbc                cudaUpdatePbc
+#else
+#   define CLUSTER_M                4
+// Simd2xNN (here used for single-precision)
+#   if VECTOR_WIDTH > CLUSTER_M * 2
+#       define KERNEL_NAME          "Simd2xNN"
+#       define CLUSTER_N            (VECTOR_WIDTH / 2)
+#       define computeForceLJ       computeForceLJ_2xnn
+// Simd4xN
+#   else
+#       define KERNEL_NAME          "Simd4xN"
+#       define CLUSTER_N            VECTOR_WIDTH
+#       define computeForceLJ       computeForceLJ_4xn
+#   endif
+#   ifdef USE_REFERENCE_VERSION
+#       undef KERNEL_NAME
+#       undef computeForceLJ
+#       define KERNEL_NAME          "Reference"
+#       define computeForceLJ       computeForceLJ_ref
+#   endif
+#   define initialIntegrate         cpuInitialIntegrate
+#   define finalIntegrate           cpuFinalIntegrate
+#   define updatePbc                cpuUpdatePbc
 #endif
 
 #if CLUSTER_M == CLUSTER_N
diff --git a/gromacs/includes/integrate.h b/gromacs/includes/integrate.h
new file mode 100644
index 0000000..5236ed8
--- /dev/null
+++ b/gromacs/includes/integrate.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdbool.h>
+//---
+#include <atom.h>
+#include <parameter.h>
+#include <util.h>
+
+void cpuInitialIntegrate(Parameter *param, Atom *atom) {
+    DEBUG_MESSAGE("cpuInitialIntegrate start\n");
+
+    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
+        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
+        MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
+        MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
+        MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
+
+        for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
+            ci_v[CL_X_OFFSET + cii] += param->dtforce * ci_f[CL_X_OFFSET + cii];
+            ci_v[CL_Y_OFFSET + cii] += param->dtforce * ci_f[CL_Y_OFFSET + cii];
+            ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii];
+            ci_x[CL_X_OFFSET + cii] += param->dt * ci_v[CL_X_OFFSET + cii];
+            ci_x[CL_Y_OFFSET + cii] += param->dt * ci_v[CL_Y_OFFSET + cii];
+            ci_x[CL_Z_OFFSET + cii] += param->dt * ci_v[CL_Z_OFFSET + cii];
+        }
+    }
+
+    DEBUG_MESSAGE("cpuInitialIntegrate end\n");
+}
+
+void cpuFinalIntegrate(Parameter *param, Atom *atom) {
+    DEBUG_MESSAGE("cpuFinalIntegrate start\n");
+
+    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
+        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
+        MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
+        MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
+
+        for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
+            ci_v[CL_X_OFFSET + cii] += param->dtforce * ci_f[CL_X_OFFSET + cii];
+            ci_v[CL_Y_OFFSET + cii] += param->dtforce * ci_f[CL_Y_OFFSET + cii];
+            ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii];
+        }
+    }
+
+    DEBUG_MESSAGE("cpuFinalIntegrate end\n");
+}
+
+#ifdef CUDA_TARGET
+void cudaInitialIntegrate(Parameter*, Atom*);
+void cudaFinalIntegrate(Parameter*, Atom*);
+#endif
diff --git a/gromacs/includes/pbc.h b/gromacs/includes/pbc.h
index 4a04180..33ceb53 100644
--- a/gromacs/includes/pbc.h
+++ b/gromacs/includes/pbc.h
@@ -10,7 +10,11 @@
 #ifndef __PBC_H_
 #define __PBC_H_
 extern void initPbc();
-extern void updatePbc(Atom*, Parameter*, int);
+extern void cpuUpdatePbc(Atom*, Parameter*, int);
 extern void updateAtomsPbc(Atom*, Parameter*);
 extern void setupPbc(Atom*, Parameter*);
+
+#ifdef CUDA_TARGET
+extern void cudaUpdatePbc(Atom*, Parameter*, int);
+#endif
 #endif
diff --git a/gromacs/main.c b/gromacs/main.c
index b482992..e724348 100644
--- a/gromacs/main.c
+++ b/gromacs/main.c
@@ -9,19 +9,21 @@
 //--
 #include <likwid-marker.h>
 //--
-#include <timing.h>
+#include <atom.h>
 #include <allocate.h>
+#include <device.h>
+#include <eam.h>
+#include <integrate.h>
 #include <neighbor.h>
 #include <parameter.h>
-#include <atom.h>
+#include <pbc.h>
 #include <stats.h>
 #include <thermo.h>
-#include <pbc.h>
 #include <timers.h>
-#include <eam.h>
+#include <timing.h>
+#include <util.h>
 #include <vtk.h>
 #include <xtc.h>
-#include <util.h>
 
 #define HLINE "----------------------------------------------------------------------------\n"
 
@@ -30,6 +32,14 @@ extern double computeForceLJ_4xn(Parameter*, Atom*, Neighbor*, Stats*);
 extern double computeForceLJ_2xnn(Parameter*, Atom*, Neighbor*, Stats*);
 extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
 
+#ifdef CUDA_TARGET
+extern int isReneighboured;
+extern double computeForceLJ_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats);
+extern void copyDataToCUDADevice(Atom *atom);
+extern void copyDataFromCUDADevice(Atom *atom);
+extern void cudaDeviceFree();
+#endif
+
 double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
     if(param->force_field == FF_EAM) { initEam(eam, param); }
     double S, E;
@@ -43,7 +53,6 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *
     initPbc(atom);
     initStats(stats);
     initNeighbor(neighbor, param);
-
     if(param->input_file == NULL) {
         createAtom(atom, param);
     } else {
@@ -56,6 +65,7 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *
     buildClusters(atom);
     defineJClusters(atom);
     setupPbc(atom, param);
+    initDevice(atom, neighbor);
     binClusters(atom);
     buildNeighbor(atom, neighbor);
     E = getTimeStamp();
@@ -78,46 +88,6 @@ double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
     return E-S;
 }
 
-void initialIntegrate(Parameter *param, Atom *atom) {
-    DEBUG_MESSAGE("initialIntegrate start\n");
-
-    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
-        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
-        MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
-        MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
-        MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
-
-        for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
-            ci_v[CL_X_OFFSET + cii] += param->dtforce * ci_f[CL_X_OFFSET + cii];
-            ci_v[CL_Y_OFFSET + cii] += param->dtforce * ci_f[CL_Y_OFFSET + cii];
-            ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii];
-            ci_x[CL_X_OFFSET + cii] += param->dt * ci_v[CL_X_OFFSET + cii];
-            ci_x[CL_Y_OFFSET + cii] += param->dt * ci_v[CL_Y_OFFSET + cii];
-            ci_x[CL_Z_OFFSET + cii] += param->dt * ci_v[CL_Z_OFFSET + cii];
-        }
-    }
-
-    DEBUG_MESSAGE("initialIntegrate end\n");
-}
-
-void finalIntegrate(Parameter *param, Atom *atom) {
-    DEBUG_MESSAGE("finalIntegrate start\n");
-
-    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
-        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
-        MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
-        MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
-
-        for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
-            ci_v[CL_X_OFFSET + cii] += param->dtforce * ci_f[CL_X_OFFSET + cii];
-            ci_v[CL_Y_OFFSET + cii] += param->dtforce * ci_f[CL_Y_OFFSET + cii];
-            ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii];
-        }
-    }
-
-    DEBUG_MESSAGE("finalIntegrate end\n");
-}
-
 void printAtomState(Atom *atom) {
     printf("Atom counts: Natoms=%d Nlocal=%d Nghost=%d Nmax=%d\n",
             atom->Natoms, atom->Nlocal, atom->Nghost, atom->Nmax);
@@ -244,6 +214,11 @@ int main(int argc, char** argv) {
     #if defined(MEM_TRACER) || defined(INDEX_TRACER)
     traceAddresses(&param, &atom, &neighbor, n + 1);
     #endif
+
+    #ifdef CUDA_TARGET
+    copyDataToCUDADevice(&atom);
+    #endif
+
     if(param.force_field == FF_EAM) {
         timer[FORCE] = computeForceEam(&eam, &param, &atom, &neighbor, &stats);
     } else {
@@ -271,7 +246,16 @@ int main(int argc, char** argv) {
 
             updatePbc(&atom, &param, 0);
         } else {
+            #ifdef CUDA_TARGET
+            copyDataFromCUDADevice(&atom);
+            #endif
+
             timer[NEIGH] += reneighbour(&param, &atom, &neighbor);
+
+            #ifdef CUDA_TARGET
+            copyDataToCUDADevice(&atom);
+            isReneighboured = 1;
+            #endif
         }
 
         #if defined(MEM_TRACER) || defined(INDEX_TRACER)
@@ -303,6 +287,10 @@ int main(int argc, char** argv) {
         }
     }
 
+    #ifdef CUDA_TARGET
+    copyDataFromCUDADevice(&atom);
+    #endif
+
     timer[TOTAL] = getTimeStamp() - timer[TOTAL];
     updateSingleAtoms(&atom);
     computeThermo(-1, &param, &atom);
@@ -311,6 +299,10 @@ int main(int argc, char** argv) {
         xtc_end();
     }
 
+    #ifdef CUDA_TARGET
+    cudaDeviceFree();
+    #endif
+
     printf(HLINE);
     printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
     printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
diff --git a/gromacs/pbc.c b/gromacs/pbc.c
index 8a484c5..d5320fa 100644
--- a/gromacs/pbc.c
+++ b/gromacs/pbc.c
@@ -29,7 +29,7 @@ void initPbc(Atom* atom) {
 
 /* update coordinates of ghost atoms */
 /* uses mapping created in setupPbc */
-void updatePbc(Atom *atom, Parameter *param, int firstUpdate) {
+void cpuUpdatePbc(Atom *atom, Parameter *param, int firstUpdate) {
     DEBUG_MESSAGE("updatePbc start\n");
     int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
     int ncj = atom->Nclusters_local / jfac;
diff --git a/lammps/device_spec.c b/lammps/device_spec.c
new file mode 100644
index 0000000..b6fec89
--- /dev/null
+++ b/lammps/device_spec.c
@@ -0,0 +1,29 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <device.h>
+
+#ifdef CUDA_TARGET
+
+void initDevice(Atom *atom, Neighbor *neighbor) {
+    DeviceAtom *d_atom = &(atom->d_atom);
+    DeviceNeighbor *d_neighbor = &(neighbor->d_neighbor);
+
+    d_atom->epsilon         =   (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    d_atom->sigma6          =   (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    d_atom->cutforcesq      =   (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    d_neighbor->neighbors   =   (int *) allocateGPU(sizeof(int) * atom->Nmax * neighbor->maxneighs);
+    d_neighbor->numneigh    =   (int *) allocateGPU(sizeof(int) * atom->Nmax);
+
+    memcpyToGPU(d_atom->x,              atom->x,          sizeof(MD_FLOAT) * atom->Nmax * 3);
+    memcpyToGPU(d_atom->vx,             atom->vx,         sizeof(MD_FLOAT) * atom->Nmax * 3);
+    memcpyToGPU(d_atom->sigma6,         atom->sigma6,     sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    memcpyToGPU(d_atom->epsilon,        atom->epsilon,    sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    memcpyToGPU(d_atom->cutforcesq,     atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    memcpyToGPU(d_atom->type,           atom->type,       sizeof(int) * atom->Nmax);
+}
+
+#endif