diff --git a/common/includes/parameter.h b/common/includes/parameter.h
index f4ff676..614be9f 100644
--- a/common/includes/parameter.h
+++ b/common/includes/parameter.h
@@ -21,6 +21,7 @@ typedef struct {
     char* input_file;
     char* vtk_file;
     char* xtc_file;
+    char* write_atom_file;
     MD_FLOAT epsilon;
     MD_FLOAT sigma;
     MD_FLOAT sigma6;
diff --git a/common/includes/timing.h b/common/includes/timing.h
index 484659f..7792085 100644
--- a/common/includes/timing.h
+++ b/common/includes/timing.h
@@ -7,8 +7,8 @@
 #ifndef __TIMING_H_
 #define __TIMING_H_
 
-extern double getTimeStamp();
-extern double getTimeResolution();
-extern double getTimeStamp_();
+extern double getTimeStamp(void);
+extern double getTimeResolution(void);
+extern double getTimeStamp_(void);
 
 #endif
diff --git a/common/parameter.c b/common/parameter.c
index 1d6d593..8aab646 100644
--- a/common/parameter.c
+++ b/common/parameter.c
@@ -17,6 +17,7 @@ void initParameter(Parameter *param) {
     param->vtk_file = NULL;
     param->xtc_file = NULL;
     param->eam_file = NULL;
+    param->write_atom_file = NULL;
     param->force_field = FF_LJ;
     param->epsilon = 1.0;
     param->sigma = 1.0;
diff --git a/lammps/atom.c b/lammps/atom.c
index a96eeb7..5dd0cdf 100644
--- a/lammps/atom.c
+++ b/lammps/atom.c
@@ -502,6 +502,21 @@ int readAtom_in(Atom* atom, Parameter* param) {
     return natoms;
 }
 
+void writeAtom(Atom *atom, Parameter *param) {
+    FILE *fp = fopen(param->write_atom_file, "w");
+
+    for(int i = 0; i < atom->Nlocal; i++) {
+        fprintf(fp, "%d,%f,%f,%f,%f,%f,%f,%f,0\n",
+            atom->type[i], 1.0,
+            atom_x(i), atom_y(i), atom_z(i),
+            atom_vx(i), atom_vy(i), atom_vz(i));
+    }
+
+    fclose(fp);
+    fprintf(stdout, "Wrote input data to %s, grid size: %f, %f, %f\n",
+        param->write_atom_file, param->xprd, param->yprd, param->zprd);
+}
+
 void growAtom(Atom *atom) {
     DeviceAtom *d_atom = &(atom->d_atom);
     int nold = atom->Nmax;
diff --git a/lammps/cuda/force.cu b/lammps/cuda/force.cu
index a6ec4d8..f96c826 100644
--- a/lammps/cuda/force.cu
+++ b/lammps/cuda/force.cu
@@ -29,7 +29,7 @@ extern "C" {
 }
 
 // cuda kernel
-__global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh) {
+__global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh, int ntypes) {
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
     if(i >= Nlocal) {
         return;
@@ -46,6 +46,10 @@ __global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, M
     MD_FLOAT fiy = 0;
     MD_FLOAT fiz = 0;
 
+#ifdef EXPLICIT_TYPES
+    const int type_i = atom->type[i];
+#endif
+
     for(int k = 0; k < numneighs; k++) {
         int j = neigh_neighbors[Nlocal * k + i];
         MD_FLOAT delx = xtmp - atom_x(j);
@@ -55,7 +59,7 @@ __global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, M
 
 #ifdef EXPLICIT_TYPES
         const int type_j = atom->type[j];
-        const int type_ij = type_i * atom->ntypes + type_j;
+        const int type_ij = type_i * ntypes + type_j;
         const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
         const MD_FLOAT sigma6 = atom->sigma6[type_ij];
         const MD_FLOAT epsilon = atom->epsilon[type_ij];
@@ -138,11 +142,9 @@ void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
 double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neighbor) {
     const int num_threads_per_block = get_cuda_num_threads();
     int Nlocal = atom->Nlocal;
-#ifndef EXPLICIT_TYPES
     MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
     MD_FLOAT sigma6 = param->sigma6;
     MD_FLOAT epsilon = param->epsilon;
-#endif
 
     /*
     int nDevices;
@@ -165,7 +167,7 @@ double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neig
     double S = getTimeStamp();
     LIKWID_MARKER_START("force");
 
-    calc_force <<< num_blocks, num_threads_per_block >>> (atom->d_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, neighbor->d_neighbor.neighbors, neighbor->d_neighbor.numneigh);
+    calc_force <<< num_blocks, num_threads_per_block >>> (atom->d_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, neighbor->d_neighbor.neighbors, neighbor->d_neighbor.numneigh, atom->ntypes);
     cuda_assert("calc_force", cudaPeekAtLastError());
     cuda_assert("calc_force", cudaDeviceSynchronize());
     cudaProfilerStop();
diff --git a/lammps/cuda/neighbor.cu b/lammps/cuda/neighbor.cu
index b83b8ae..1e40e56 100644
--- a/lammps/cuda/neighbor.cu
+++ b/lammps/cuda/neighbor.cu
@@ -120,7 +120,7 @@ __global__ void binatoms_kernel(DeviceAtom a, int nall, int* bincount, int* bins
 
 __global__ void compute_neighborhood(
     DeviceAtom a, DeviceNeighbor neigh, Neighbor_params np, int nlocal, int maxneighs, int nstencil, int* stencil,
-    int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq) {
+    int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq, int ntypes) {
 
     const int i = blockIdx.x * blockDim.x + threadIdx.x;
     if(i >= nlocal) {
@@ -157,7 +157,7 @@ __global__ void compute_neighborhood(
 
 #ifdef EXPLICIT_TYPES
             int type_j = atom->type[j];
-            const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j];
+            const MD_FLOAT cutoff = atom->cutneighsq[type_i * ntypes + type_j];
 #else
             const MD_FLOAT cutoff = cutneighsq;
 #endif
@@ -269,7 +269,7 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor) {
                                                                     np, atom->Nlocal, neighbor->maxneighs, nstencil, c_stencil,
                                                                     c_binning.bins, c_binning.atoms_per_bin, c_binning.bincount,
                                                                     c_new_maxneighs,
-								                                    cutneighsq);
+								                                    cutneighsq, atom->ntypes);
 
         cuda_assert("compute_neighborhood", cudaPeekAtLastError());
         cuda_assert("compute_neighborhood", cudaDeviceSynchronize());
diff --git a/lammps/device_spec.c b/lammps/device_spec.c
index b6fec89..42d6002 100644
--- a/lammps/device_spec.c
+++ b/lammps/device_spec.c
@@ -14,6 +14,7 @@ void initDevice(Atom *atom, Neighbor *neighbor) {
 
     d_atom->epsilon         =   (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
     d_atom->sigma6          =   (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    d_atom->cutneighsq      =   (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
     d_atom->cutforcesq      =   (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
     d_neighbor->neighbors   =   (int *) allocateGPU(sizeof(int) * atom->Nmax * neighbor->maxneighs);
     d_neighbor->numneigh    =   (int *) allocateGPU(sizeof(int) * atom->Nmax);
@@ -22,6 +23,7 @@ void initDevice(Atom *atom, Neighbor *neighbor) {
     memcpyToGPU(d_atom->vx,             atom->vx,         sizeof(MD_FLOAT) * atom->Nmax * 3);
     memcpyToGPU(d_atom->sigma6,         atom->sigma6,     sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
     memcpyToGPU(d_atom->epsilon,        atom->epsilon,    sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    memcpyToGPU(d_atom->cutneighsq,     atom->cutneighsq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
     memcpyToGPU(d_atom->cutforcesq,     atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
     memcpyToGPU(d_atom->type,           atom->type,       sizeof(int) * atom->Nmax);
 }
diff --git a/lammps/includes/atom.h b/lammps/includes/atom.h
index ea00284..0fe3b51 100644
--- a/lammps/includes/atom.h
+++ b/lammps/includes/atom.h
@@ -73,6 +73,7 @@ extern int readAtom_pdb(Atom*, Parameter*);
 extern int readAtom_gro(Atom*, Parameter*);
 extern int readAtom_dmp(Atom*, Parameter*);
 extern int readAtom_in(Atom*, Parameter*);
+extern void writeAtom(Atom*, Parameter*);
 extern void growAtom(Atom*);
 
 #ifdef AOS
diff --git a/lammps/main.c b/lammps/main.c
index ec43857..c623827 100644
--- a/lammps/main.c
+++ b/lammps/main.c
@@ -207,6 +207,10 @@ int main(int argc, char** argv) {
             param.vtk_file = strdup(argv[++i]);
             continue;
         }
+        if((strcmp(argv[i], "-w") == 0)) {
+            param.write_atom_file = strdup(argv[++i]);
+            continue;
+        }
         if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
             printf("MD Bench: A minimalistic re-implementation of miniMD\n");
             printf(HLINE);
@@ -219,6 +223,7 @@ int main(int argc, char** argv) {
             printf("-half <int>:                use half (1) or full (0) neighbor lists\n");
             printf("-r / --radius <real>:       set cutoff radius\n");
             printf("-s / --skin <real>:         set skin (verlet buffer)\n");
+            printf("-w <file>:                  write input atoms to file\n");
             printf("--freq <real>:              processor frequency (GHz)\n");
             printf("--vtk <string>:             VTK file for visualization\n");
             printf(HLINE);
@@ -237,6 +242,10 @@ int main(int argc, char** argv) {
     traceAddresses(&param, &atom, &neighbor, n + 1);
     #endif
 
+    if(param.write_atom_file != NULL) {
+        writeAtom(&atom, &param);
+    }
+
     //writeInput(&param, &atom);
 
     timer[FORCE] = computeForce(&eam, &param, &atom, &neighbor, &stats);