diff --git a/gromacs/cuda/force_lj.cu b/gromacs/cuda/force_lj.cu
index 96d3e26..75d6929 100644
--- a/gromacs/cuda/force_lj.cu
+++ b/gromacs/cuda/force_lj.cu
@@ -196,10 +196,10 @@ __global__ void computeForceLJ_cuda_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_
         int cond;
 #if CLUSTER_M == CLUSTER_N
         cond = half_neigh ? (ci_cj0 != cj || cii_pos < cjj_pos) :
-               (ci_cj0 != cj || cii_pos != cjj_pos);
+                            (ci_cj0 != cj || cii_pos != cjj_pos);
 #elif CLUSTER_M < CLUSTER_N
         cond = half_neigh ? (ci_cj0 != cj || cii_pos + CLUSTER_M * (ci_pos & 0x1) < cjj_pos) :
-                                            (ci_cj0 != cj || cii_pos + CLUSTER_M * (ci_pos & 0x1) != cjj_pos);
+                            (ci_cj0 != cj || cii_pos + CLUSTER_M * (ci_pos & 0x1) != cjj_pos);
 #endif
         if(cond) {
             MD_FLOAT delx = xtmp - cj_x[CL_X_OFFSET + cjj_pos];
diff --git a/include_NVCC.mk b/include_NVCC.mk
index 3201e64..8a7fed7 100644
--- a/include_NVCC.mk
+++ b/include_NVCC.mk
@@ -6,10 +6,15 @@ ANSI_CFLAGS += -std=c99
 ANSI_CFLAGS += -pedantic
 ANSI_CFLAGS += -Wextra
 
-CFLAGS   = -O3 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
+#
+# A100 + Native
+CFLAGS   = -O3 -arch=sm_80 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
+# A40 + Native
+#CFLAGS   = -O3 -arch=sm_86 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
+# Cascade Lake
 #CFLAGS   = -O3 -march=cascadelake  -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
-#CFLAGS   = -O3 -g # -fopenmp
-#CFLAGS   = -O0 -g  -std=c99 -fargument-noalias
+# For GROMACS kernels, we need at least sm_60 (compute capability 6.0) due to atomicAdd with doubles
+# TODO: Check if this is required for full neighbor-lists and just compile kernel for that case if not
 #CFLAGS   = -O3 -g -arch=sm_61 # -fopenmp
 ASFLAGS  =  -masm=intel
 LFLAGS   =