Update neighbor.c

Final MPI version
Merge pull request #7 from RRZE-HPC/mucosim23
2024-04-15 18:12:27 +02:00 · 2024-04-15 16:53:25 +02:00 · 2024-01-17 15:14:08 +01:00 · 2024-01-13 15:09:03 +01:00 · 2024-01-11 17:16:17 +01:00 · 2024-01-11 17:09:18 +01:00
87 changed files with 6814 additions and 2738 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -51,14 +51,17 @@ Module.symvers
 Mkfile.old
 dkms.conf

+# Logs
+*.log
+
+# TODO list
+todo.txt
+
 # Build directories and executables
-GCC/
-ICC/
-ICX/
-CLANG/
-NVCC/
-MDBench-GCC*
-MDBench-ICC*
-MDBench-ICX*
-MDBench-CLANG*
-MDBench-NVCC*
+#GCC-*/
+#ICC-*/
+#ICX-*/
+#CLANG-*/
+#NVCC-*/
+build-*/
+MDBench-*
--- a/19
+++ b/19
@@ -1,6 +1,7 @@
 #CONFIGURE BUILD SYSTEM
-TARGET	   = MDBench-$(TAG)-$(OPT_SCHEME)
-BUILD_DIR  = ./$(TAG)-$(OPT_SCHEME)
+IDENTIFIER = $(OPT_SCHEME)-$(TAG)-$(ISA)-$(DATA_TYPE)
+TARGET	   = MDBench-$(IDENTIFIER)
+BUILD_DIR  = ./build-$(IDENTIFIER)
 SRC_DIR    = ./$(OPT_SCHEME)
 ASM_DIR    = ./asm
 COMMON_DIR = ./common
@@ -16,6 +17,9 @@ include $(MAKE_DIR)/include_ISA.mk
 include $(MAKE_DIR)/include_GROMACS.mk
 INCLUDES  += -I./$(SRC_DIR)/includes -I./$(COMMON_DIR)/includes

+ifeq ($(strip $(OPT_SCHEME)),gromacs)
+    DEFINES +=  -DGROMACS
+endif
 ifeq ($(strip $(DATA_LAYOUT)),AOS)
    DEFINES +=  -DAOS
 endif
@@ -29,6 +33,10 @@ ifneq ($(ASM_SYNTAX), ATT)
    ASFLAGS += -masm=intel
 endif

+ifeq ($(strip $(SORT_ATOMS)),true)
+    DEFINES += -DSORT_ATOMS
+endif
+
 ifeq ($(strip $(EXPLICIT_TYPES)),true)
    DEFINES += -DEXPLICIT_TYPES
 endif
@@ -151,6 +159,13 @@ $(BUILD_DIR)/%.o:  %.s
 clean:
 	$(info ===>  CLEAN)
 	@rm -rf $(BUILD_DIR)
+	@rm -rf $(TARGET)*
+	@rm -f tags
+
+cleanall:  ## Remove the build directories and binaries of every configuration (build-*, MDBench-*)
+	$(info ===>  CLEAN ALL)
+	@rm -rf build-*
+	@rm -rf MDBench-*
 	@rm -f tags

 distclean: clean
--- a/asm/.gitkeep
+++ b/asm/.gitkeep
--- a/asm/unused/force-mem-only-with-likwid.s
+++ b/asm/unused/force-mem-only-with-likwid.s
@@ -1,626 +0,0 @@
-# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.0.5.281 Build 20190815";
-# mark_description "-I/mnt/opt/likwid-5.2-dev/include -I./src/includes -S -D_GNU_SOURCE -DLIKWID_PERFMON -DAOS -DPRECISION=2 -DN";
-# mark_description "EIGHBORS_LOOP_RUNS=1 -DVECTOR_WIDTH=8 -DALIGNMENT=64 -restrict -Ofast -xCORE-AVX512 -qopt-zmm-usage=high -o ";
-# mark_description "ICC/force.s";
-	.file "force.c"
-	.text
-..TXTST0:
-.L_2__routine_start_computeForce_0:
-# -- Begin  computeForce
-	.text
-# mark_begin;
-       .align    16,0x90
-	.globl computeForce
-# --- computeForce(Parameter *, Atom *, Neighbor *, int, int, int)
-computeForce:
-# parameter 1: %rdi
-# parameter 2: %rsi
-# parameter 3: %rdx
-# parameter 4: %ecx
-# parameter 5: %r8d
-# parameter 6: %r9d
-..B1.1:                         # Preds ..B1.0
-                                # Execution count [1.00e+00]
-	.cfi_startproc
-..___tag_value_computeForce.1:
-..L2:
-                                                          #121.112
-        pushq     %rbp                                          #121.112
-	.cfi_def_cfa_offset 16
-        movq      %rsp, %rbp                                    #121.112
-	.cfi_def_cfa 6, 16
-	.cfi_offset 6, -16
-        andq      $-64, %rsp                                    #121.112
-        pushq     %r12                                          #121.112
-        pushq     %r13                                          #121.112
-        pushq     %r14                                          #121.112
-        pushq     %r15                                          #121.112
-        pushq     %rbx                                          #121.112
-        subq      $88, %rsp                                     #121.112
-        xorl      %eax, %eax                                    #124.16
-	.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
-	.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22
-        movq      %rdx, %r15                                    #121.112
-        movq      %rsi, %r12                                    #121.112
-        movq      %rdi, %rbx                                    #121.112
-..___tag_value_computeForce.11:
-#       getTimeStamp()
-        call      getTimeStamp                                  #124.16
-..___tag_value_computeForce.12:
-                                # LOE rbx r12 r15 xmm0
-..B1.51:                        # Preds ..B1.1
-                                # Execution count [1.00e+00]
-        vmovsd    %xmm0, 24(%rsp)                               #124.16[spill]
-                                # LOE rbx r12 r15
-..B1.2:                         # Preds ..B1.51
-                                # Execution count [1.00e+00]
-        movl      4(%r12), %r13d                                #125.18
-        movq      64(%r12), %r9                                 #127.20
-        movq      72(%r12), %r14                                #127.45
-        movq      80(%r12), %r8                                 #127.70
-        vmovsd    72(%rbx), %xmm2                               #129.27
-        vmovsd    8(%rbx), %xmm1                                #130.23
-        vmovsd    (%rbx), %xmm0                                 #131.24
-        testl     %r13d, %r13d                                  #134.24
-        jle       ..B1.43       # Prob 50%                      #134.24
-                                # LOE r8 r9 r12 r14 r15 r13d xmm0 xmm1 xmm2
-..B1.3:                         # Preds ..B1.2
-                                # Execution count [1.00e+00]
-        xorl      %ebx, %ebx                                    #134.5
-        movl      %r13d, %edx                                   #134.5
-        xorl      %ecx, %ecx                                    #134.5
-        movl      $1, %esi                                      #134.5
-        xorl      %eax, %eax                                    #135.17
-        shrl      $1, %edx                                      #134.5
-        je        ..B1.7        # Prob 9%                       #134.5
-                                # LOE rax rdx rcx rbx r8 r9 r12 r14 r15 esi r13d xmm0 xmm1 xmm2
-..B1.5:                         # Preds ..B1.3 ..B1.5
-                                # Execution count [2.50e+00]
-        movq      %rax, (%rcx,%r9)                              #135.9
-        incq      %rbx                                          #134.5
-        movq      %rax, (%rcx,%r14)                             #136.9
-        movq      %rax, (%rcx,%r8)                              #137.9
-        movq      %rax, 8(%rcx,%r9)                             #135.9
-        movq      %rax, 8(%rcx,%r14)                            #136.9
-        movq      %rax, 8(%rcx,%r8)                             #137.9
-        addq      $16, %rcx                                     #134.5
-        cmpq      %rdx, %rbx                                    #134.5
-        jb        ..B1.5        # Prob 63%                      #134.5
-                                # LOE rax rdx rcx rbx r8 r9 r12 r14 r15 r13d xmm0 xmm1 xmm2
-..B1.6:                         # Preds ..B1.5
-                                # Execution count [9.00e-01]
-        lea       1(%rbx,%rbx), %esi                            #135.9
-                                # LOE rax r8 r9 r12 r14 r15 esi r13d xmm0 xmm1 xmm2
-..B1.7:                         # Preds ..B1.3 ..B1.6
-                                # Execution count [1.00e+00]
-        lea       -1(%rsi), %edx                                #134.5
-        cmpl      %r13d, %edx                                   #134.5
-        jae       ..B1.9        # Prob 9%                       #134.5
-                                # LOE rax r8 r9 r12 r14 r15 esi r13d xmm0 xmm1 xmm2
-..B1.8:                         # Preds ..B1.7
-                                # Execution count [9.00e-01]
-        movslq    %esi, %rsi                                    #134.5
-        movq      %rax, -8(%r9,%rsi,8)                          #135.9
-        movq      %rax, -8(%r14,%rsi,8)                         #136.9
-        movq      %rax, -8(%r8,%rsi,8)                          #137.9
-                                # LOE r8 r9 r12 r14 r15 r13d xmm0 xmm1 xmm2
-..B1.9:                         # Preds ..B1.7 ..B1.8
-                                # Execution count [5.00e-01]
-        movl      $.L_2__STRING.0, %edi                         #141.5
-        movq      %r8, 32(%rsp)                                 #141.5[spill]
-        movq      %r9, 80(%rsp)                                 #141.5[spill]
-        vmovsd    %xmm2, (%rsp)                                 #141.5[spill]
-        vmovsd    %xmm1, 8(%rsp)                                #141.5[spill]
-        vmovsd    %xmm0, 16(%rsp)                               #141.5[spill]
-..___tag_value_computeForce.18:
-#       likwid_markerStartRegion(const char *)
-        call      likwid_markerStartRegion                      #141.5
-..___tag_value_computeForce.19:
-                                # LOE r12 r14 r15 r13d
-..B1.10:                        # Preds ..B1.9
-                                # Execution count [9.00e-01]
-        vmovsd    16(%rsp), %xmm0                               #[spill]
-        xorl      %esi, %esi                                    #143.15
-        vmovsd    (%rsp), %xmm2                                 #[spill]
-        xorl      %eax, %eax                                    #143.5
-        vmulsd    %xmm2, %xmm2, %xmm13                          #129.45
-        xorl      %edi, %edi                                    #143.5
-        vmovdqu32 .L_2il0floatpacket.0(%rip), %ymm16            #173.13
-        vmulsd    .L_2il0floatpacket.3(%rip), %xmm0, %xmm0      #197.45
-        vmovdqu   .L_2il0floatpacket.1(%rip), %ymm15            #173.13
-        vmovups   .L_2il0floatpacket.4(%rip), %zmm5             #197.58
-        vmovsd    8(%rsp), %xmm1                                #[spill]
-        vbroadcastsd %xmm13, %zmm14                             #129.25
-        vbroadcastsd %xmm1, %zmm13                              #130.21
-        vbroadcastsd %xmm0, %zmm9                               #197.45
-        movslq    %r13d, %r13                                   #143.5
-        movq      24(%r15), %r10                                #145.25
-        movslq    16(%r15), %rdx                                #144.43
-        movq      8(%r15), %rcx                                 #144.19
-        movq      32(%rsp), %r8                                 #[spill]
-        movq      16(%r12), %rbx                                #146.25
-        shlq      $2, %rdx                                      #126.5
-        movq      %r13, 64(%rsp)                                #143.5[spill]
-        movq      %r10, 72(%rsp)                                #143.5[spill]
-                                # LOE rax rdx rcx rbx rsi rdi r8 r14 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
-..B1.11:                        # Preds ..B1.41 ..B1.10
-                                # Execution count [5.00e+00]
-        movq      72(%rsp), %r9                                 #145.25[spill]
-        vxorpd    %xmm24, %xmm24, %xmm24                        #149.22
-        vmovapd   %xmm24, %xmm18                                #150.22
-        movl      (%r9,%rax,4), %r10d                           #145.25
-        vmovapd   %xmm18, %xmm4                                 #151.22
-        vmovsd    (%rdi,%rbx), %xmm10                           #146.25
-        vmovsd    8(%rdi,%rbx), %xmm6                           #147.25
-        vmovsd    16(%rdi,%rbx), %xmm12                         #148.25
-        testl     %r10d, %r10d                                  #173.32
-        jle       ..B1.41       # Prob 50%                      #173.32
-                                # LOE rax rdx rcx rbx rsi rdi r8 r14 r10d xmm4 xmm6 xmm10 xmm12 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
-..B1.12:                        # Preds ..B1.11
-                                # Execution count [4.50e+00]
-        vpxord    %zmm8, %zmm8, %zmm8                           #149.22
-        vmovaps   %zmm8, %zmm7                                  #150.22
-        vmovaps   %zmm7, %zmm11                                 #151.22
-        cmpl      $8, %r10d                                     #173.13
-        jl        ..B1.48       # Prob 10%                      #173.13
-                                # LOE rax rdx rcx rbx rsi rdi r8 r14 r10d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.13:                        # Preds ..B1.12
-                                # Execution count [4.50e+00]
-        cmpl      $1200, %r10d                                  #173.13
-        jl        ..B1.47       # Prob 10%                      #173.13
-                                # LOE rax rdx rcx rbx rsi rdi r8 r14 r10d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.14:                        # Preds ..B1.13
-                                # Execution count [4.50e+00]
-        movq      %rdx, %r15                                    #144.43
-        imulq     %rsi, %r15                                    #144.43
-        addq      %rcx, %r15                                    #126.5
-        movq      %r15, %r11                                    #173.13
-        andq      $63, %r11                                     #173.13
-        testl     $3, %r11d                                     #173.13
-        je        ..B1.16       # Prob 50%                      #173.13
-                                # LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.15:                        # Preds ..B1.14
-                                # Execution count [2.25e+00]
-        xorl      %r11d, %r11d                                  #173.13
-        jmp       ..B1.18       # Prob 100%                     #173.13
-                                # LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.16:                        # Preds ..B1.14
-                                # Execution count [2.25e+00]
-        testl     %r11d, %r11d                                  #173.13
-        je        ..B1.18       # Prob 50%                      #173.13
-                                # LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.17:                        # Preds ..B1.16
-                                # Execution count [2.50e+01]
-        negl      %r11d                                         #173.13
-        addl      $64, %r11d                                    #173.13
-        shrl      $2, %r11d                                     #173.13
-        cmpl      %r11d, %r10d                                  #173.13
-        cmovl     %r10d, %r11d                                  #173.13
-                                # LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.18:                        # Preds ..B1.15 ..B1.17 ..B1.16
-                                # Execution count [5.00e+00]
-        movl      %r10d, %r13d                                  #173.13
-        subl      %r11d, %r13d                                  #173.13
-        andl      $7, %r13d                                     #173.13
-        negl      %r13d                                         #173.13
-        addl      %r10d, %r13d                                  #173.13
-        cmpl      $1, %r11d                                     #173.13
-        jb        ..B1.26       # Prob 50%                      #173.13
-                                # LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.19:                        # Preds ..B1.18
-                                # Execution count [4.50e+00]
-        vmovdqa   %ymm15, %ymm4                                 #173.13
-        xorl      %r12d, %r12d                                  #173.13
-        vpbroadcastd %r11d, %ymm3                               #173.13
-        vbroadcastsd %xmm10, %zmm2                              #146.23
-        vbroadcastsd %xmm6, %zmm1                               #147.23
-        vbroadcastsd %xmm12, %zmm0                              #148.23
-        movslq    %r11d, %r9                                    #173.13
-        movq      %r8, 32(%rsp)                                 #173.13[spill]
-        movq      %r14, (%rsp)                                  #173.13[spill]
-                                # LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.20:                        # Preds ..B1.24 ..B1.19
-                                # Execution count [2.50e+01]
-        vpcmpgtd  %ymm4, %ymm3, %k3                             #173.13
-        vmovdqu32 (%r15,%r12,4), %ymm17{%k3}{z}                 #174.25
-        kmovw     %k3, %r14d                                    #173.13
-        vpaddd    %ymm17, %ymm17, %ymm18                        #175.40
-        vpaddd    %ymm18, %ymm17, %ymm17                        #175.40
-                                # LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 ymm17 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
-..B1.23:                        # Preds ..B1.20
-                                # Execution count [1.25e+01]
-        kmovw     %k3, %k1                                      #175.40
-        kmovw     %k3, %k2                                      #175.40
-        vpxord    %zmm18, %zmm18, %zmm18                        #175.40
-        vpxord    %zmm19, %zmm19, %zmm19                        #175.40
-        vpxord    %zmm20, %zmm20, %zmm20                        #175.40
-        vgatherdpd 16(%rbx,%ymm17,8), %zmm18{%k1}               #175.40
-        vgatherdpd 8(%rbx,%ymm17,8), %zmm19{%k2}                #175.40
-        vgatherdpd (%rbx,%ymm17,8), %zmm20{%k3}                 #175.40
-                                # LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm18 zmm19 zmm20
-..B1.24:                        # Preds ..B1.23
-                                # Execution count [2.50e+01]
-        addq      $8, %r12                                      #173.13
-        #vpaddd    %ymm16, %ymm4, %ymm4                          #173.13
-        #vsubpd    %zmm18, %zmm0, %zmm29                         #177.40
-        #vsubpd    %zmm19, %zmm1, %zmm27                         #176.40
-        #vsubpd    %zmm20, %zmm2, %zmm26                         #175.40
-        #vmulpd    %zmm27, %zmm27, %zmm25                        #178.53
-        #vfmadd231pd %zmm26, %zmm26, %zmm25                      #178.53
-        #vfmadd231pd %zmm29, %zmm29, %zmm25                      #178.67
-        #vrcp14pd  %zmm25, %zmm24                                #195.42
-        #vcmppd    $1, %zmm14, %zmm25, %k2                       #194.26
-        #vfpclasspd $30, %zmm24, %k0                             #195.42
-        #kmovw     %k2, %r8d                                     #194.26
-        #knotw     %k0, %k1                                      #195.42
-        #vmovaps   %zmm25, %zmm17                                #195.42
-        #andl      %r8d, %r14d                                   #194.26
-        #vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm24, %zmm17 #195.42
-        #kmovw     %r14d, %k3                                    #198.21
-        #vmulpd    %zmm17, %zmm17, %zmm18                        #195.42
-        #vfmadd213pd %zmm24, %zmm17, %zmm24{%k1}                 #195.42
-        #vfmadd213pd %zmm24, %zmm18, %zmm24{%k1}                 #195.42
-        #vmulpd    %zmm13, %zmm24, %zmm19                        #196.42
-        #vmulpd    %zmm9, %zmm24, %zmm21                         #197.58
-        #vmulpd    %zmm19, %zmm24, %zmm22                        #196.48
-        #vmulpd    %zmm22, %zmm24, %zmm20                        #196.54
-        #vfmsub213pd %zmm5, %zmm22, %zmm24                       #197.58
-        #vmulpd    %zmm21, %zmm20, %zmm23                        #197.65
-        #vmulpd    %zmm24, %zmm23, %zmm28                        #197.71
-        #vfmadd231pd %zmm26, %zmm28, %zmm8{%k3}                  #198.21
-        #vfmadd231pd %zmm27, %zmm28, %zmm7{%k3}                  #199.21
-        #vfmadd231pd %zmm29, %zmm28, %zmm11{%k3}                 #200.21
-        cmpq      %r9, %r12                                     #173.13
-        jb        ..B1.20       # Prob 82%                      #173.13
-                                # LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.25:                        # Preds ..B1.24
-                                # Execution count [4.50e+00]
-        movq      32(%rsp), %r8                                 #[spill]
-        movq      (%rsp), %r14                                  #[spill]
-        cmpl      %r11d, %r10d                                  #173.13
-        je        ..B1.40       # Prob 10%                      #173.13
-                                # LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.26:                        # Preds ..B1.25 ..B1.18 ..B1.47
-                                # Execution count [2.50e+01]
-        lea       8(%r11), %r9d                                 #173.13
-        cmpl      %r9d, %r13d                                   #173.13
-        jl        ..B1.34       # Prob 50%                      #173.13
-                                # LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.27:                        # Preds ..B1.26
-                                # Execution count [4.50e+00]
-        movq      %rdx, %r12                                    #144.43
-        imulq     %rsi, %r12                                    #144.43
-        vbroadcastsd %xmm10, %zmm1                              #146.23
-        vbroadcastsd %xmm6, %zmm0                               #147.23
-        vbroadcastsd %xmm12, %zmm2                              #148.23
-        movslq    %r11d, %r9                                    #173.13
-        addq      %rcx, %r12                                    #126.5
-        movq      %rdi, 8(%rsp)                                 #126.5[spill]
-        movq      %rdx, 16(%rsp)                                #126.5[spill]
-        movq      %rcx, 40(%rsp)                                #126.5[spill]
-        movq      %rax, 48(%rsp)                                #126.5[spill]
-        movq      %rsi, 56(%rsp)                                #126.5[spill]
-        movq      %r8, 32(%rsp)                                 #126.5[spill]
-        movq      %r14, (%rsp)                                  #126.5[spill]
-                                # LOE rbx r9 r12 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.28:                        # Preds ..B1.32 ..B1.27
-                                # Execution count [2.50e+01]
-        vmovdqu   (%r12,%r9,4), %ymm3                           #174.25
-        vpaddd    %ymm3, %ymm3, %ymm4                           #175.40
-        vpaddd    %ymm4, %ymm3, %ymm3                           #175.40
-        movl      (%r12,%r9,4), %r14d                           #174.25
-        movl      4(%r12,%r9,4), %r8d                           #174.25
-        movl      8(%r12,%r9,4), %edi                           #174.25
-        movl      12(%r12,%r9,4), %esi                          #174.25
-        lea       (%r14,%r14,2), %r14d                          #175.40
-        movl      16(%r12,%r9,4), %ecx                          #174.25
-        lea       (%r8,%r8,2), %r8d                             #175.40
-        movl      20(%r12,%r9,4), %edx                          #174.25
-        lea       (%rdi,%rdi,2), %edi                           #175.40
-        movl      24(%r12,%r9,4), %eax                          #174.25
-        lea       (%rsi,%rsi,2), %esi                           #175.40
-        movl      28(%r12,%r9,4), %r15d                         #174.25
-        lea       (%rcx,%rcx,2), %ecx                           #175.40
-        lea       (%rdx,%rdx,2), %edx                           #175.40
-        lea       (%rax,%rax,2), %eax                           #175.40
-        lea       (%r15,%r15,2), %r15d                          #175.40
-                                # LOE rbx r9 r12 eax edx ecx esi edi r8d r10d r11d r13d r14d r15d xmm6 xmm10 xmm12 ymm3 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.31:                        # Preds ..B1.28
-                                # Execution count [1.25e+01]
-        vpcmpeqb  %xmm0, %xmm0, %k1                             #175.40
-        vpcmpeqb  %xmm0, %xmm0, %k2                             #175.40
-        vpcmpeqb  %xmm0, %xmm0, %k3                             #175.40
-        vpxord    %zmm4, %zmm4, %zmm4                           #175.40
-        vpxord    %zmm17, %zmm17, %zmm17                        #175.40
-        vpxord    %zmm18, %zmm18, %zmm18                        #175.40
-        vgatherdpd 16(%rbx,%ymm3,8), %zmm4{%k1}                 #175.40
-        vgatherdpd 8(%rbx,%ymm3,8), %zmm17{%k2}                 #175.40
-        vgatherdpd (%rbx,%ymm3,8), %zmm18{%k3}                  #175.40
-                                # LOE rbx r9 r12 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm17 zmm18
-..B1.32:                        # Preds ..B1.31
-                                # Execution count [2.50e+01]
-        addl      $8, %r11d                                     #173.13
-        addq      $8, %r9                                       #173.13
-        #vsubpd    %zmm4, %zmm2, %zmm26                          #177.40
-        #vsubpd    %zmm17, %zmm0, %zmm24                         #176.40
-        #vsubpd    %zmm18, %zmm1, %zmm23                         #175.40
-        #vmulpd    %zmm24, %zmm24, %zmm3                         #178.53
-        #vfmadd231pd %zmm23, %zmm23, %zmm3                       #178.53
-        #vfmadd231pd %zmm26, %zmm26, %zmm3                       #178.67
-        #vrcp14pd  %zmm3, %zmm22                                 #195.42
-        #vcmppd    $1, %zmm14, %zmm3, %k2                        #194.26
-        #vfpclasspd $30, %zmm22, %k0                             #195.42
-        #vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm22, %zmm3 #195.42
-        #knotw     %k0, %k1                                      #195.42
-        #vmulpd    %zmm3, %zmm3, %zmm4                           #195.42
-        #vfmadd213pd %zmm22, %zmm3, %zmm22{%k1}                  #195.42
-        #vfmadd213pd %zmm22, %zmm4, %zmm22{%k1}                  #195.42
-        #vmulpd    %zmm13, %zmm22, %zmm17                        #196.42
-        #vmulpd    %zmm9, %zmm22, %zmm19                         #197.58
-        #vmulpd    %zmm17, %zmm22, %zmm20                        #196.48
-        #vmulpd    %zmm20, %zmm22, %zmm18                        #196.54
-        #vfmsub213pd %zmm5, %zmm20, %zmm22                       #197.58
-        #vmulpd    %zmm19, %zmm18, %zmm21                        #197.65
-        #vmulpd    %zmm22, %zmm21, %zmm25                        #197.71
-        #vfmadd231pd %zmm23, %zmm25, %zmm8{%k2}                  #198.21
-        #vfmadd231pd %zmm24, %zmm25, %zmm7{%k2}                  #199.21
-        #vfmadd231pd %zmm26, %zmm25, %zmm11{%k2}                 #200.21
-        cmpl      %r13d, %r11d                                  #173.13
-        jb        ..B1.28       # Prob 82%                      #173.13
-                                # LOE rbx r9 r12 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.33:                        # Preds ..B1.32
-                                # Execution count [4.50e+00]
-        movq      8(%rsp), %rdi                                 #[spill]
-        movq      16(%rsp), %rdx                                #[spill]
-        movq      40(%rsp), %rcx                                #[spill]
-        movq      48(%rsp), %rax                                #[spill]
-        movq      56(%rsp), %rsi                                #[spill]
-        movq      32(%rsp), %r8                                 #[spill]
-        movq      (%rsp), %r14                                  #[spill]
-                                # LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.34:                        # Preds ..B1.33 ..B1.26 ..B1.48
-                                # Execution count [5.00e+00]
-        lea       1(%r13), %r9d                                 #173.13
-        cmpl      %r10d, %r9d                                   #173.13
-        ja        ..B1.40       # Prob 50%                      #173.13
-                                # LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.35:                        # Preds ..B1.34
-                                # Execution count [2.50e+01]
-        imulq     %rdx, %rsi                                    #144.43
-        vbroadcastsd %xmm10, %zmm4                              #146.23
-        subl      %r13d, %r10d                                  #173.13
-        addq      %rcx, %rsi                                    #126.5
-        vpbroadcastd %r10d, %ymm0                               #173.13
-        vpcmpgtd  %ymm15, %ymm0, %k3                            #173.13
-        movslq    %r13d, %r13                                   #173.13
-        kmovw     %k3, %r9d                                     #173.13
-        vmovdqu32 (%rsi,%r13,4), %ymm1{%k3}{z}                  #174.25
-        vpaddd    %ymm1, %ymm1, %ymm2                           #175.40
-        vpaddd    %ymm2, %ymm1, %ymm0                           #175.40
-                                # LOE rax rdx rcx rbx rdi r8 r14 r9d xmm6 xmm12 ymm0 ymm15 ymm16 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
-..B1.38:                        # Preds ..B1.35
-                                # Execution count [1.25e+01]
-        kmovw     %k3, %k1                                      #175.40
-        kmovw     %k3, %k2                                      #175.40
-        vpxord    %zmm1, %zmm1, %zmm1                           #175.40
-        vpxord    %zmm2, %zmm2, %zmm2                           #175.40
-        vpxord    %zmm3, %zmm3, %zmm3                           #175.40
-        vgatherdpd 16(%rbx,%ymm0,8), %zmm1{%k1}                 #175.40
-        vgatherdpd 8(%rbx,%ymm0,8), %zmm2{%k2}                  #175.40
-        vgatherdpd (%rbx,%ymm0,8), %zmm3{%k3}                   #175.40
-                                # LOE rax rdx rcx rbx rdi r8 r14 r9d xmm6 xmm12 ymm15 ymm16 zmm1 zmm2 zmm3 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.39:                        # Preds ..B1.38
-                                # Execution count [2.50e+01]
-        #vbroadcastsd %xmm6, %zmm6                               #147.23
-        #vbroadcastsd %xmm12, %zmm12                             #148.23
-        #vsubpd    %zmm1, %zmm12, %zmm23                         #177.40
-        #vsubpd    %zmm2, %zmm6, %zmm21                          #176.40
-        #vsubpd    %zmm3, %zmm4, %zmm20                          #175.40
-        #vmulpd    %zmm21, %zmm21, %zmm19                        #178.53
-        #vfmadd231pd %zmm20, %zmm20, %zmm19                      #178.53
-        #vfmadd231pd %zmm23, %zmm23, %zmm19                      #178.67
-        #vrcp14pd  %zmm19, %zmm18                                #195.42
-        #vcmppd    $1, %zmm14, %zmm19, %k2                       #194.26
-        #vfpclasspd $30, %zmm18, %k0                             #195.42
-        #kmovw     %k2, %esi                                     #194.26
-        #knotw     %k0, %k1                                      #195.42
-        #vmovaps   %zmm19, %zmm0                                 #195.42
-        #andl      %esi, %r9d                                    #194.26
-        #vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm18, %zmm0 #195.42
-        #kmovw     %r9d, %k3                                     #198.21
-        #vmulpd    %zmm0, %zmm0, %zmm1                           #195.42
-        #vfmadd213pd %zmm18, %zmm0, %zmm18{%k1}                  #195.42
-        #vfmadd213pd %zmm18, %zmm1, %zmm18{%k1}                  #195.42
-        #vmulpd    %zmm13, %zmm18, %zmm2                         #196.42
-        #vmulpd    %zmm9, %zmm18, %zmm4                          #197.58
-        #vmulpd    %zmm2, %zmm18, %zmm10                         #196.48
-        #vmulpd    %zmm10, %zmm18, %zmm3                         #196.54
-        #vfmsub213pd %zmm5, %zmm10, %zmm18                       #197.58
-        #vmulpd    %zmm4, %zmm3, %zmm17                          #197.65
-        #vmulpd    %zmm18, %zmm17, %zmm22                        #197.71
-        #vfmadd231pd %zmm20, %zmm22, %zmm8{%k3}                  #198.21
-        #vfmadd231pd %zmm21, %zmm22, %zmm7{%k3}                  #199.21
-        #vfmadd231pd %zmm23, %zmm22, %zmm11{%k3}                 #200.21
-                                # LOE rax rdx rcx rbx rdi r8 r14 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.40:                        # Preds ..B1.25 ..B1.39 ..B1.34
-                                # Execution count [4.50e+00]
-        vmovups   .L_2il0floatpacket.10(%rip), %zmm19           #151.22
-        vpermd    %zmm11, %zmm19, %zmm0                         #151.22
-        vpermd    %zmm7, %zmm19, %zmm6                          #150.22
-        vpermd    %zmm8, %zmm19, %zmm20                         #149.22
-        vaddpd    %zmm11, %zmm0, %zmm11                         #151.22
-        vaddpd    %zmm7, %zmm6, %zmm7                           #150.22
-        vaddpd    %zmm8, %zmm20, %zmm8                          #149.22
-        vpermpd   $78, %zmm11, %zmm1                            #151.22
-        vpermpd   $78, %zmm7, %zmm10                            #150.22
-        vpermpd   $78, %zmm8, %zmm21                            #149.22
-        vaddpd    %zmm1, %zmm11, %zmm2                          #151.22
-        vaddpd    %zmm10, %zmm7, %zmm12                         #150.22
-        vaddpd    %zmm21, %zmm8, %zmm22                         #149.22
-        vpermpd   $177, %zmm2, %zmm3                            #151.22
-        vpermpd   $177, %zmm12, %zmm17                          #150.22
-        vpermpd   $177, %zmm22, %zmm23                          #149.22
-        vaddpd    %zmm3, %zmm2, %zmm4                           #151.22
-        vaddpd    %zmm17, %zmm12, %zmm18                        #150.22
-        vaddpd    %zmm23, %zmm22, %zmm24                        #149.22
-                                # LOE rax rdx rcx rbx rdi r8 r14 xmm4 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
-..B1.41:                        # Preds ..B1.40 ..B1.11
-                                # Execution count [5.00e+00]
-        movq      80(%rsp), %rsi                                #208.9[spill]
-        addq      $24, %rdi                                     #143.5
-        vaddsd    (%rsi,%rax,8), %xmm24, %xmm0                  #208.9
-        vmovsd    %xmm0, (%rsi,%rax,8)                          #208.9
-        movslq    %eax, %rsi                                    #143.32
-        vaddsd    (%r14,%rax,8), %xmm18, %xmm1                  #209.9
-        vmovsd    %xmm1, (%r14,%rax,8)                          #209.9
-        incq      %rsi                                          #143.32
-        vaddsd    (%r8,%rax,8), %xmm4, %xmm2                    #210.9
-        vmovsd    %xmm2, (%r8,%rax,8)                           #210.9
-        incq      %rax                                          #143.5
-        cmpq      64(%rsp), %rax                                #143.5[spill]
-        jb        ..B1.11       # Prob 82%                      #143.5
-        jmp       ..B1.44       # Prob 100%                     #143.5
-                                # LOE rax rdx rcx rbx rsi rdi r8 r14 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
-..B1.43:                        # Preds ..B1.2
-                                # Execution count [5.00e-01]
-        movl      $.L_2__STRING.0, %edi                         #141.5
-..___tag_value_computeForce.48:
-#       likwid_markerStartRegion(const char *)
-        call      likwid_markerStartRegion                      #141.5
-..___tag_value_computeForce.49:
-                                # LOE
-..B1.44:                        # Preds ..B1.41 ..B1.43
-                                # Execution count [1.00e+00]
-        movl      $.L_2__STRING.0, %edi                         #219.5
-        vzeroupper                                              #219.5
-..___tag_value_computeForce.50:
-#       likwid_markerStopRegion(const char *)
-        call      likwid_markerStopRegion                       #219.5
-..___tag_value_computeForce.51:
-                                # LOE
-..B1.45:                        # Preds ..B1.44
-                                # Execution count [1.00e+00]
-        xorl      %eax, %eax                                    #221.16
-..___tag_value_computeForce.52:
-#       getTimeStamp()
-        call      getTimeStamp                                  #221.16
-..___tag_value_computeForce.53:
-                                # LOE xmm0
-..B1.46:                        # Preds ..B1.45
-                                # Execution count [1.00e+00]
-        vsubsd    24(%rsp), %xmm0, %xmm0                        #224.14[spill]
-        addq      $88, %rsp                                     #224.14
-	.cfi_restore 3
-        popq      %rbx                                          #224.14
-	.cfi_restore 15
-        popq      %r15                                          #224.14
-	.cfi_restore 14
-        popq      %r14                                          #224.14
-	.cfi_restore 13
-        popq      %r13                                          #224.14
-	.cfi_restore 12
-        popq      %r12                                          #224.14
-        movq      %rbp, %rsp                                    #224.14
-        popq      %rbp                                          #224.14
-	.cfi_def_cfa 7, 8
-	.cfi_restore 6
-        ret                                                     #224.14
-	.cfi_def_cfa 6, 16
-	.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22
-	.cfi_offset 6, -16
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
-	.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22
-                                # LOE
-..B1.47:                        # Preds ..B1.13
-                                # Execution count [4.50e-01]: Infreq
-        movl      %r10d, %r13d                                  #173.13
-        xorl      %r11d, %r11d                                  #173.13
-        andl      $-8, %r13d                                    #173.13
-        jmp       ..B1.26       # Prob 100%                     #173.13
-                                # LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.48:                        # Preds ..B1.12
-                                # Execution count [4.50e-01]: Infreq
-        xorl      %r13d, %r13d                                  #173.13
-        jmp       ..B1.34       # Prob 100%                     #173.13
-        .align    16,0x90
-                                # LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-	.cfi_endproc
-# mark_end;
-	.type	computeForce,@function
-	.size	computeForce,.-computeForce
-..LNcomputeForce.0:
-	.data
-# -- End  computeForce
-	.section .rodata, "a"
-	.align 64
-	.align 64
-.L_2il0floatpacket.2:
-	.long	0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
-	.type	.L_2il0floatpacket.2,@object
-	.size	.L_2il0floatpacket.2,64
-	.align 64
-.L_2il0floatpacket.4:
-	.long	0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
-	.type	.L_2il0floatpacket.4,@object
-	.size	.L_2il0floatpacket.4,64
-	.align 64
-.L_2il0floatpacket.5:
-	.long	0x02010101,0x04040202,0x08080804,0x20101010,0x40402020,0x80808040,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000
-	.type	.L_2il0floatpacket.5,@object
-	.size	.L_2il0floatpacket.5,64
-	.align 64
-.L_2il0floatpacket.6:
-	.long	0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000,0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000
-	.type	.L_2il0floatpacket.6,@object
-	.size	.L_2il0floatpacket.6,64
-	.align 64
-.L_2il0floatpacket.7:
-	.long	0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000,0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000
-	.type	.L_2il0floatpacket.7,@object
-	.size	.L_2il0floatpacket.7,64
-	.align 64
-.L_2il0floatpacket.8:
-	.long	0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000,0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000
-	.type	.L_2il0floatpacket.8,@object
-	.size	.L_2il0floatpacket.8,64
-	.align 64
-.L_2il0floatpacket.10:
-	.long	0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
-	.type	.L_2il0floatpacket.10,@object
-	.size	.L_2il0floatpacket.10,64
-	.align 32
-.L_2il0floatpacket.0:
-	.long	0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
-	.type	.L_2il0floatpacket.0,@object
-	.size	.L_2il0floatpacket.0,32
-	.align 32
-.L_2il0floatpacket.1:
-	.long	0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
-	.type	.L_2il0floatpacket.1,@object
-	.size	.L_2il0floatpacket.1,32
-	.align 8
-.L_2il0floatpacket.3:
-	.long	0x00000000,0x40480000
-	.type	.L_2il0floatpacket.3,@object
-	.size	.L_2il0floatpacket.3,8
-	.align 8
-.L_2il0floatpacket.9:
-	.long	0x00000000,0x3ff00000
-	.type	.L_2il0floatpacket.9,@object
-	.size	.L_2il0floatpacket.9,8
-	.section .rodata.str1.4, "aMS",@progbits,1
-	.align 4
-	.align 4
-.L_2__STRING.0:
-	.long	1668444006
-	.word	101
-	.type	.L_2__STRING.0,@object
-	.size	.L_2__STRING.0,6
-	.data
-	.section .note.GNU-stack, ""
-# End
--- a/asm/unused/force-mem-only.s
+++ b/asm/unused/force-mem-only.s
@@ -1,585 +0,0 @@
-# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.0.5.281 Build 20190815";
-# mark_description "-I./src/includes -S -D_GNU_SOURCE -DAOS -DPRECISION=2 -DNEIGHBORS_LOOP_RUNS=1 -DVECTOR_WIDTH=8 -DALIGNMENT=6";
-# mark_description "4 -restrict -Ofast -xCORE-AVX512 -qopt-zmm-usage=high -o ICC/force.s";
-	.file "force.c"
-	.text
-..TXTST0:
-.L_2__routine_start_computeForce_0:
-# -- Begin  computeForce
-	.text
-# mark_begin;
-       .align    16,0x90
-	.globl computeForce
-# --- computeForce(Parameter *, Atom *, Neighbor *, int)
-computeForce:
-# parameter 1: %rdi
-# parameter 2: %rsi
-# parameter 3: %rdx
-# parameter 4: %ecx
-..B1.1:                         # Preds ..B1.0
-                                # Execution count [1.00e+00]
-	.cfi_startproc
-..___tag_value_computeForce.1:
-..L2:
-                                                          #103.87
-        pushq     %rbp                                          #103.87
-	.cfi_def_cfa_offset 16
-        movq      %rsp, %rbp                                    #103.87
-	.cfi_def_cfa 6, 16
-	.cfi_offset 6, -16
-        andq      $-64, %rsp                                    #103.87
-        pushq     %r12                                          #103.87
-        pushq     %r13                                          #103.87
-        pushq     %r14                                          #103.87
-        subq      $104, %rsp                                    #103.87
-        xorl      %eax, %eax                                    #106.16
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
-        movq      %rdx, %r14                                    #103.87
-        movq      %rsi, %r13                                    #103.87
-        movq      %rdi, %r12                                    #103.87
-..___tag_value_computeForce.9:
-#       getTimeStamp()
-        call      getTimeStamp                                  #106.16
-..___tag_value_computeForce.10:
-                                # LOE rbx r12 r13 r14 r15 xmm0
-..B1.48:                        # Preds ..B1.1
-                                # Execution count [1.00e+00]
-        vmovsd    %xmm0, 16(%rsp)                               #106.16[spill]
-                                # LOE rbx r12 r13 r14 r15
-..B1.2:                         # Preds ..B1.48
-                                # Execution count [1.00e+00]
-        movl      4(%r13), %ecx                                 #107.18
-        movq      64(%r13), %r11                                #109.20
-        movq      72(%r13), %r10                                #109.45
-        movq      80(%r13), %r9                                 #109.70
-        vmovsd    72(%r12), %xmm2                               #111.27
-        vmovsd    8(%r12), %xmm1                                #112.23
-        vmovsd    (%r12), %xmm0                                 #113.24
-        testl     %ecx, %ecx                                    #116.24
-        jle       ..B1.42       # Prob 50%                      #116.24
-                                # LOE rbx r9 r10 r11 r13 r14 r15 ecx xmm0 xmm1 xmm2
-..B1.3:                         # Preds ..B1.2
-                                # Execution count [1.00e+00]
-        xorl      %edi, %edi                                    #116.5
-        movl      %ecx, %edx                                    #116.5
-        xorl      %esi, %esi                                    #116.5
-        movl      $1, %r8d                                      #116.5
-        xorl      %eax, %eax                                    #117.17
-        shrl      $1, %edx                                      #116.5
-        je        ..B1.7        # Prob 9%                       #116.5
-                                # LOE rax rdx rbx rsi rdi r9 r10 r11 r13 r14 r15 ecx r8d xmm0 xmm1 xmm2
-..B1.5:                         # Preds ..B1.3 ..B1.5
-                                # Execution count [2.50e+00]
-        movq      %rax, (%rsi,%r11)                             #117.9
-        incq      %rdi                                          #116.5
-        movq      %rax, (%rsi,%r10)                             #118.9
-        movq      %rax, (%rsi,%r9)                              #119.9
-        movq      %rax, 8(%rsi,%r11)                            #117.9
-        movq      %rax, 8(%rsi,%r10)                            #118.9
-        movq      %rax, 8(%rsi,%r9)                             #119.9
-        addq      $16, %rsi                                     #116.5
-        cmpq      %rdx, %rdi                                    #116.5
-        jb        ..B1.5        # Prob 63%                      #116.5
-                                # LOE rax rdx rbx rsi rdi r9 r10 r11 r13 r14 r15 ecx xmm0 xmm1 xmm2
-..B1.6:                         # Preds ..B1.5
-                                # Execution count [9.00e-01]
-        lea       1(%rdi,%rdi), %r8d                            #117.9
-                                # LOE rax rbx r9 r10 r11 r13 r14 r15 ecx r8d xmm0 xmm1 xmm2
-..B1.7:                         # Preds ..B1.3 ..B1.6
-                                # Execution count [1.00e+00]
-        lea       -1(%r8), %edx                                 #116.5
-        cmpl      %ecx, %edx                                    #116.5
-        jae       ..B1.9        # Prob 9%                       #116.5
-                                # LOE rax rbx r9 r10 r11 r13 r14 r15 ecx r8d xmm0 xmm1 xmm2
-..B1.8:                         # Preds ..B1.7
-                                # Execution count [9.00e-01]
-        movslq    %r8d, %r8                                     #116.5
-        movq      %rax, -8(%r11,%r8,8)                          #117.9
-        movq      %rax, -8(%r10,%r8,8)                          #118.9
-        movq      %rax, -8(%r9,%r8,8)                           #119.9
-                                # LOE rbx r9 r10 r11 r13 r14 r15 ecx xmm0 xmm1 xmm2
-..B1.9:                         # Preds ..B1.7 ..B1.8
-                                # Execution count [9.00e-01]
-        vmulsd    %xmm2, %xmm2, %xmm13                          #111.45
-        xorl      %edi, %edi                                    #124.15
-        vmovdqu32 .L_2il0floatpacket.0(%rip), %ymm16            #153.13
-        vmulsd    .L_2il0floatpacket.3(%rip), %xmm0, %xmm0      #177.45
-        vmovdqu   .L_2il0floatpacket.1(%rip), %ymm15            #153.13
-        vmovups   .L_2il0floatpacket.4(%rip), %zmm5             #177.58
-        vbroadcastsd %xmm13, %zmm14                             #111.25
-        vbroadcastsd %xmm1, %zmm13                              #112.21
-        vbroadcastsd %xmm0, %zmm9                               #177.45
-        movq      16(%r13), %rdx                                #127.25
-        xorl      %r8d, %r8d                                    #124.5
-        movslq    %ecx, %r12                                    #124.5
-        xorl      %eax, %eax                                    #124.5
-        movq      24(%r14), %r13                                #126.25
-        movslq    16(%r14), %rcx                                #125.43
-        movq      8(%r14), %rsi                                 #125.19
-        shlq      $2, %rcx                                      #108.5
-        movq      %r12, 80(%rsp)                                #124.5[spill]
-        movq      %r13, 88(%rsp)                                #124.5[spill]
-        movq      %r11, 96(%rsp)                                #124.5[spill]
-        movq      %r15, 8(%rsp)                                 #124.5[spill]
-        movq      %rbx, (%rsp)                                  #124.5[spill]
-	.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
-	.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
-                                # LOE rax rdx rcx rsi rdi r8 r9 r10 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
-..B1.10:                        # Preds ..B1.40 ..B1.9
-                                # Execution count [5.00e+00]
-        movq      88(%rsp), %rbx                                #126.25[spill]
-        vxorpd    %xmm24, %xmm24, %xmm24                        #130.22
-        vmovapd   %xmm24, %xmm18                                #131.22
-        movl      (%rbx,%r8,4), %r11d                           #126.25
-        vmovapd   %xmm18, %xmm4                                 #132.22
-        vmovsd    (%rax,%rdx), %xmm10                           #127.25
-        vmovsd    8(%rax,%rdx), %xmm6                           #128.25
-        vmovsd    16(%rax,%rdx), %xmm12                         #129.25
-        testl     %r11d, %r11d                                  #153.32
-        jle       ..B1.40       # Prob 50%                      #153.32
-                                # LOE rax rdx rcx rsi rdi r8 r9 r10 r11d xmm4 xmm6 xmm10 xmm12 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
-..B1.11:                        # Preds ..B1.10
-                                # Execution count [4.50e+00]
-        vpxord    %zmm8, %zmm8, %zmm8                           #130.22
-        vmovaps   %zmm8, %zmm7                                  #131.22
-        vmovaps   %zmm7, %zmm11                                 #132.22
-        cmpl      $8, %r11d                                     #153.13
-        jl        ..B1.45       # Prob 10%                      #153.13
-                                # LOE rax rdx rcx rsi rdi r8 r9 r10 r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.12:                        # Preds ..B1.11
-                                # Execution count [4.50e+00]
-        cmpl      $1200, %r11d                                  #153.13
-        jl        ..B1.44       # Prob 10%                      #153.13
-                                # LOE rax rdx rcx rsi rdi r8 r9 r10 r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.13:                        # Preds ..B1.12
-                                # Execution count [4.50e+00]
-        movq      %rcx, %r15                                    #125.43
-        imulq     %rdi, %r15                                    #125.43
-        addq      %rsi, %r15                                    #108.5
-        movq      %r15, %r12                                    #153.13
-        andq      $63, %r12                                     #153.13
-        testl     $3, %r12d                                     #153.13
-        je        ..B1.15       # Prob 50%                      #153.13
-                                # LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.14:                        # Preds ..B1.13
-                                # Execution count [2.25e+00]
-        xorl      %r12d, %r12d                                  #153.13
-        jmp       ..B1.17       # Prob 100%                     #153.13
-                                # LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.15:                        # Preds ..B1.13
-                                # Execution count [2.25e+00]
-        testl     %r12d, %r12d                                  #153.13
-        je        ..B1.17       # Prob 50%                      #153.13
-                                # LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.16:                        # Preds ..B1.15
-                                # Execution count [2.50e+01]
-        negl      %r12d                                         #153.13
-        addl      $64, %r12d                                    #153.13
-        shrl      $2, %r12d                                     #153.13
-        cmpl      %r12d, %r11d                                  #153.13
-        cmovl     %r11d, %r12d                                  #153.13
-                                # LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.17:                        # Preds ..B1.14 ..B1.16 ..B1.15
-                                # Execution count [5.00e+00]
-        movl      %r11d, %r14d                                  #153.13
-        subl      %r12d, %r14d                                  #153.13
-        andl      $7, %r14d                                     #153.13
-        negl      %r14d                                         #153.13
-        addl      %r11d, %r14d                                  #153.13
-        cmpl      $1, %r12d                                     #153.13
-        jb        ..B1.25       # Prob 50%                      #153.13
-                                # LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.18:                        # Preds ..B1.17
-                                # Execution count [4.50e+00]
-        vmovdqa   %ymm15, %ymm4                                 #153.13
-        xorl      %r13d, %r13d                                  #153.13
-        vpbroadcastd %r12d, %ymm3                               #153.13
-        vbroadcastsd %xmm10, %zmm2                              #127.23
-        vbroadcastsd %xmm6, %zmm1                               #128.23
-        vbroadcastsd %xmm12, %zmm0                              #129.23
-        movslq    %r12d, %rbx                                   #153.13
-        movq      %r9, 24(%rsp)                                 #153.13[spill]
-        movq      %r10, 32(%rsp)                                #153.13[spill]
-                                # LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.19:                        # Preds ..B1.23 ..B1.18
-                                # Execution count [2.50e+01]
-        vpcmpgtd  %ymm4, %ymm3, %k3                             #153.13
-        vmovdqu32 (%r15,%r13,4), %ymm17{%k3}{z}                 #154.25
-        kmovw     %k3, %r10d                                    #153.13
-        vpaddd    %ymm17, %ymm17, %ymm18                        #155.40
-        vpaddd    %ymm18, %ymm17, %ymm17                        #155.40
-                                # LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r10d r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 ymm17 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
-..B1.22:                        # Preds ..B1.19
-                                # Execution count [1.25e+01]
-        kmovw     %k3, %k1                                      #155.40
-        kmovw     %k3, %k2                                      #155.40
-        vpxord    %zmm18, %zmm18, %zmm18                        #155.40
-        vpxord    %zmm19, %zmm19, %zmm19                        #155.40
-        vpxord    %zmm20, %zmm20, %zmm20                        #155.40
-        vgatherdpd 16(%rdx,%ymm17,8), %zmm18{%k1}               #155.40
-        vgatherdpd 8(%rdx,%ymm17,8), %zmm19{%k2}                #155.40
-        vgatherdpd (%rdx,%ymm17,8), %zmm20{%k3}                 #155.40
-                                # LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r10d r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm18 zmm19 zmm20
-..B1.23:                        # Preds ..B1.22
-                                # Execution count [2.50e+01]
-        addq      $8, %r13                                      #153.13
-        #vpaddd    %ymm16, %ymm4, %ymm4                          #153.13
-        #vsubpd    %zmm18, %zmm0, %zmm29                         #157.40
-        #vsubpd    %zmm19, %zmm1, %zmm27                         #156.40
-        #vsubpd    %zmm20, %zmm2, %zmm26                         #155.40
-        #vmulpd    %zmm27, %zmm27, %zmm25                        #158.53
-        #vfmadd231pd %zmm26, %zmm26, %zmm25                      #158.53
-        #vfmadd231pd %zmm29, %zmm29, %zmm25                      #158.67
-        #vrcp14pd  %zmm25, %zmm24                                #175.42
-        #vcmppd    $1, %zmm14, %zmm25, %k2                       #174.26
-        #vfpclasspd $30, %zmm24, %k0                             #175.42
-        #kmovw     %k2, %r9d                                     #174.26
-        #knotw     %k0, %k1                                      #175.42
-        #vmovaps   %zmm25, %zmm17                                #175.42
-        #andl      %r9d, %r10d                                   #174.26
-        #vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm24, %zmm17 #175.42
-        #kmovw     %r10d, %k3                                    #178.21
-        #vmulpd    %zmm17, %zmm17, %zmm18                        #175.42
-        #vfmadd213pd %zmm24, %zmm17, %zmm24{%k1}                 #175.42
-        #vfmadd213pd %zmm24, %zmm18, %zmm24{%k1}                 #175.42
-        #vmulpd    %zmm13, %zmm24, %zmm19                        #176.42
-        #vmulpd    %zmm9, %zmm24, %zmm21                         #177.58
-        #vmulpd    %zmm19, %zmm24, %zmm22                        #176.48
-        #vmulpd    %zmm22, %zmm24, %zmm20                        #176.54
-        #vfmsub213pd %zmm5, %zmm22, %zmm24                       #177.58
-        #vmulpd    %zmm21, %zmm20, %zmm23                        #177.65
-        #vmulpd    %zmm24, %zmm23, %zmm28                        #177.71
-        #vfmadd231pd %zmm26, %zmm28, %zmm8{%k3}                  #178.21
-        #vfmadd231pd %zmm27, %zmm28, %zmm7{%k3}                  #179.21
-        #vfmadd231pd %zmm29, %zmm28, %zmm11{%k3}                 #180.21
-        cmpq      %rbx, %r13                                    #153.13
-        jb        ..B1.19       # Prob 82%                      #153.13
-                                # LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.24:                        # Preds ..B1.23
-                                # Execution count [4.50e+00]
-        movq      24(%rsp), %r9                                 #[spill]
-        movq      32(%rsp), %r10                                #[spill]
-        cmpl      %r12d, %r11d                                  #153.13
-        je        ..B1.39       # Prob 10%                      #153.13
-                                # LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.25:                        # Preds ..B1.24 ..B1.17 ..B1.44
-                                # Execution count [2.50e+01]
-        lea       8(%r12), %ebx                                 #153.13
-        cmpl      %ebx, %r14d                                   #153.13
-        jl        ..B1.33       # Prob 50%                      #153.13
-                                # LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.26:                        # Preds ..B1.25
-                                # Execution count [4.50e+00]
-        movq      %rcx, %r13                                    #125.43
-        imulq     %rdi, %r13                                    #125.43
-        vbroadcastsd %xmm10, %zmm1                              #127.23
-        vbroadcastsd %xmm6, %zmm0                               #128.23
-        vbroadcastsd %xmm12, %zmm2                              #129.23
-        movslq    %r12d, %rbx                                   #153.13
-        addq      %rsi, %r13                                    #108.5
-        movq      %rax, 40(%rsp)                                #108.5[spill]
-        movq      %rcx, 48(%rsp)                                #108.5[spill]
-        movq      %rsi, 56(%rsp)                                #108.5[spill]
-        movq      %r8, 64(%rsp)                                 #108.5[spill]
-        movq      %rdi, 72(%rsp)                                #108.5[spill]
-        movq      %r9, 24(%rsp)                                 #108.5[spill]
-        movq      %r10, 32(%rsp)                                #108.5[spill]
-                                # LOE rdx rbx r13 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.27:                        # Preds ..B1.31 ..B1.26
-                                # Execution count [2.50e+01]
-        vmovdqu   (%r13,%rbx,4), %ymm3                          #154.25
-        vpaddd    %ymm3, %ymm3, %ymm4                           #155.40
-        vpaddd    %ymm4, %ymm3, %ymm3                           #155.40
-        movl      (%r13,%rbx,4), %r10d                          #154.25
-        movl      4(%r13,%rbx,4), %r9d                          #154.25
-        movl      8(%r13,%rbx,4), %r8d                          #154.25
-        movl      12(%r13,%rbx,4), %edi                         #154.25
-        lea       (%r10,%r10,2), %r10d                          #155.40
-        movl      16(%r13,%rbx,4), %esi                         #154.25
-        lea       (%r9,%r9,2), %r9d                             #155.40
-        movl      20(%r13,%rbx,4), %ecx                         #154.25
-        lea       (%r8,%r8,2), %r8d                             #155.40
-        movl      24(%r13,%rbx,4), %eax                         #154.25
-        lea       (%rdi,%rdi,2), %edi                           #155.40
-        movl      28(%r13,%rbx,4), %r15d                        #154.25
-        lea       (%rsi,%rsi,2), %esi                           #155.40
-        lea       (%rcx,%rcx,2), %ecx                           #155.40
-        lea       (%rax,%rax,2), %eax                           #155.40
-        lea       (%r15,%r15,2), %r15d                          #155.40
-                                # LOE rdx rbx r13 eax ecx esi edi r8d r9d r10d r11d r12d r14d r15d xmm6 xmm10 xmm12 ymm3 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.30:                        # Preds ..B1.27
-                                # Execution count [1.25e+01]
-        vpcmpeqb  %xmm0, %xmm0, %k1                             #155.40
-        vpcmpeqb  %xmm0, %xmm0, %k2                             #155.40
-        vpcmpeqb  %xmm0, %xmm0, %k3                             #155.40
-        vpxord    %zmm4, %zmm4, %zmm4                           #155.40
-        vpxord    %zmm17, %zmm17, %zmm17                        #155.40
-        vpxord    %zmm18, %zmm18, %zmm18                        #155.40
-        vgatherdpd 16(%rdx,%ymm3,8), %zmm4{%k1}                 #155.40
-        vgatherdpd 8(%rdx,%ymm3,8), %zmm17{%k2}                 #155.40
-        vgatherdpd (%rdx,%ymm3,8), %zmm18{%k3}                  #155.40
-                                # LOE rdx rbx r13 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm17 zmm18
-..B1.31:                        # Preds ..B1.30
-                                # Execution count [2.50e+01]
-        addl      $8, %r12d                                     #153.13
-        addq      $8, %rbx                                      #153.13
-        #vsubpd    %zmm4, %zmm2, %zmm26                          #157.40
-        #vsubpd    %zmm17, %zmm0, %zmm24                         #156.40
-        #vsubpd    %zmm18, %zmm1, %zmm23                         #155.40
-        #vmulpd    %zmm24, %zmm24, %zmm3                         #158.53
-        #vfmadd231pd %zmm23, %zmm23, %zmm3                       #158.53
-        #vfmadd231pd %zmm26, %zmm26, %zmm3                       #158.67
-        #vrcp14pd  %zmm3, %zmm22                                 #175.42
-        #vcmppd    $1, %zmm14, %zmm3, %k2                        #174.26
-        #vfpclasspd $30, %zmm22, %k0                             #175.42
-        #vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm22, %zmm3 #175.42
-        #knotw     %k0, %k1                                      #175.42
-        #vmulpd    %zmm3, %zmm3, %zmm4                           #175.42
-        #vfmadd213pd %zmm22, %zmm3, %zmm22{%k1}                  #175.42
-        #vfmadd213pd %zmm22, %zmm4, %zmm22{%k1}                  #175.42
-        #vmulpd    %zmm13, %zmm22, %zmm17                        #176.42
-        #vmulpd    %zmm9, %zmm22, %zmm19                         #177.58
-        #vmulpd    %zmm17, %zmm22, %zmm20                        #176.48
-        #vmulpd    %zmm20, %zmm22, %zmm18                        #176.54
-        #vfmsub213pd %zmm5, %zmm20, %zmm22                       #177.58
-        #vmulpd    %zmm19, %zmm18, %zmm21                        #177.65
-        #vmulpd    %zmm22, %zmm21, %zmm25                        #177.71
-        #vfmadd231pd %zmm23, %zmm25, %zmm8{%k2}                  #178.21
-        #vfmadd231pd %zmm24, %zmm25, %zmm7{%k2}                  #179.21
-        #vfmadd231pd %zmm26, %zmm25, %zmm11{%k2}                 #180.21
-        cmpl      %r14d, %r12d                                  #153.13
-        jb        ..B1.27       # Prob 82%                      #153.13
-                                # LOE rdx rbx r13 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.32:                        # Preds ..B1.31
-                                # Execution count [4.50e+00]
-        movq      40(%rsp), %rax                                #[spill]
-        movq      48(%rsp), %rcx                                #[spill]
-        movq      56(%rsp), %rsi                                #[spill]
-        movq      64(%rsp), %r8                                 #[spill]
-        movq      72(%rsp), %rdi                                #[spill]
-        movq      24(%rsp), %r9                                 #[spill]
-        movq      32(%rsp), %r10                                #[spill]
-                                # LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.33:                        # Preds ..B1.32 ..B1.25 ..B1.45
-                                # Execution count [5.00e+00]
-        lea       1(%r14), %ebx                                 #153.13
-        cmpl      %r11d, %ebx                                   #153.13
-        ja        ..B1.39       # Prob 50%                      #153.13
-                                # LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.34:                        # Preds ..B1.33
-                                # Execution count [2.50e+01]
-        imulq     %rcx, %rdi                                    #125.43
-        vbroadcastsd %xmm10, %zmm4                              #127.23
-        subl      %r14d, %r11d                                  #153.13
-        addq      %rsi, %rdi                                    #108.5
-        vpbroadcastd %r11d, %ymm0                               #153.13
-        vpcmpgtd  %ymm15, %ymm0, %k3                            #153.13
-        movslq    %r14d, %r14                                   #153.13
-        vmovdqu32 (%rdi,%r14,4), %ymm1{%k3}{z}                  #154.25
-        kmovw     %k3, %edi                                     #153.13
-        vpaddd    %ymm1, %ymm1, %ymm2                           #155.40
-        vpaddd    %ymm2, %ymm1, %ymm0                           #155.40
-                                # LOE rax rdx rcx rsi r8 r9 r10 edi xmm6 xmm12 ymm0 ymm15 ymm16 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
-..B1.37:                        # Preds ..B1.34
-                                # Execution count [1.25e+01]
-        kmovw     %k3, %k1                                      #155.40
-        kmovw     %k3, %k2                                      #155.40
-        vpxord    %zmm1, %zmm1, %zmm1                           #155.40
-        vpxord    %zmm2, %zmm2, %zmm2                           #155.40
-        vpxord    %zmm3, %zmm3, %zmm3                           #155.40
-        vgatherdpd 16(%rdx,%ymm0,8), %zmm1{%k1}                 #155.40
-        vgatherdpd 8(%rdx,%ymm0,8), %zmm2{%k2}                  #155.40
-        vgatherdpd (%rdx,%ymm0,8), %zmm3{%k3}                   #155.40
-                                # LOE rax rdx rcx rsi r8 r9 r10 edi xmm6 xmm12 ymm15 ymm16 zmm1 zmm2 zmm3 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.38:                        # Preds ..B1.37
-                                # Execution count [2.50e+01]
-        #vbroadcastsd %xmm6, %zmm6                               #128.23
-        #vbroadcastsd %xmm12, %zmm12                             #129.23
-        #vsubpd    %zmm1, %zmm12, %zmm23                         #157.40
-        #vsubpd    %zmm2, %zmm6, %zmm21                          #156.40
-        #vsubpd    %zmm3, %zmm4, %zmm20                          #155.40
-        #vmulpd    %zmm21, %zmm21, %zmm19                        #158.53
-        #vfmadd231pd %zmm20, %zmm20, %zmm19                      #158.53
-        #vfmadd231pd %zmm23, %zmm23, %zmm19                      #158.67
-        #vrcp14pd  %zmm19, %zmm18                                #175.42
-        #vcmppd    $1, %zmm14, %zmm19, %k2                       #174.26
-        #vfpclasspd $30, %zmm18, %k0                             #175.42
-        #kmovw     %k2, %ebx                                     #174.26
-        #knotw     %k0, %k1                                      #175.42
-        #vmovaps   %zmm19, %zmm0                                 #175.42
-        #andl      %ebx, %edi                                    #174.26
-        #vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm18, %zmm0 #175.42
-        #kmovw     %edi, %k3                                     #178.21
-        #vmulpd    %zmm0, %zmm0, %zmm1                           #175.42
-        #vfmadd213pd %zmm18, %zmm0, %zmm18{%k1}                  #175.42
-        #vfmadd213pd %zmm18, %zmm1, %zmm18{%k1}                  #175.42
-        #vmulpd    %zmm13, %zmm18, %zmm2                         #176.42
-        #vmulpd    %zmm9, %zmm18, %zmm4                          #177.58
-        #vmulpd    %zmm2, %zmm18, %zmm10                         #176.48
-        #vmulpd    %zmm10, %zmm18, %zmm3                         #176.54
-        #vfmsub213pd %zmm5, %zmm10, %zmm18                       #177.58
-        #vmulpd    %zmm4, %zmm3, %zmm17                          #177.65
-        #vmulpd    %zmm18, %zmm17, %zmm22                        #177.71
-        #vfmadd231pd %zmm20, %zmm22, %zmm8{%k3}                  #178.21
-        #vfmadd231pd %zmm21, %zmm22, %zmm7{%k3}                  #179.21
-        #vfmadd231pd %zmm23, %zmm22, %zmm11{%k3}                 #180.21
-                                # LOE rax rdx rcx rsi r8 r9 r10 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.39:                        # Preds ..B1.24 ..B1.38 ..B1.33
-                                # Execution count [4.50e+00]
-        vmovups   .L_2il0floatpacket.10(%rip), %zmm19           #132.22
-        vpermd    %zmm11, %zmm19, %zmm0                         #132.22
-        vpermd    %zmm7, %zmm19, %zmm6                          #131.22
-        vpermd    %zmm8, %zmm19, %zmm20                         #130.22
-        vaddpd    %zmm11, %zmm0, %zmm11                         #132.22
-        vaddpd    %zmm7, %zmm6, %zmm7                           #131.22
-        vaddpd    %zmm8, %zmm20, %zmm8                          #130.22
-        vpermpd   $78, %zmm11, %zmm1                            #132.22
-        vpermpd   $78, %zmm7, %zmm10                            #131.22
-        vpermpd   $78, %zmm8, %zmm21                            #130.22
-        vaddpd    %zmm1, %zmm11, %zmm2                          #132.22
-        vaddpd    %zmm10, %zmm7, %zmm12                         #131.22
-        vaddpd    %zmm21, %zmm8, %zmm22                         #130.22
-        vpermpd   $177, %zmm2, %zmm3                            #132.22
-        vpermpd   $177, %zmm12, %zmm17                          #131.22
-        vpermpd   $177, %zmm22, %zmm23                          #130.22
-        vaddpd    %zmm3, %zmm2, %zmm4                           #132.22
-        vaddpd    %zmm17, %zmm12, %zmm18                        #131.22
-        vaddpd    %zmm23, %zmm22, %zmm24                        #130.22
-                                # LOE rax rdx rcx rsi r8 r9 r10 xmm4 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
-..B1.40:                        # Preds ..B1.39 ..B1.10
-                                # Execution count [5.00e+00]
-        movq      96(%rsp), %rbx                                #188.9[spill]
-        addq      $24, %rax                                     #124.5
-        movslq    %r8d, %rdi                                    #124.32
-        incq      %rdi                                          #124.32
-        #vaddsd    (%rbx,%r8,8), %xmm24, %xmm0                   #188.9
-        #vmovsd    %xmm0, (%rbx,%r8,8)                           #188.9
-        #vaddsd    (%r10,%r8,8), %xmm18, %xmm1                   #189.9
-        #vmovsd    %xmm1, (%r10,%r8,8)                           #189.9
-        #vaddsd    (%r9,%r8,8), %xmm4, %xmm2                     #190.9
-        #vmovsd    %xmm2, (%r9,%r8,8)                            #190.9
-        incq      %r8                                           #124.5
-        cmpq      80(%rsp), %r8                                 #124.5[spill]
-        jb        ..B1.10       # Prob 82%                      #124.5
-                                # LOE rax rdx rcx rsi rdi r8 r9 r10 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
-..B1.41:                        # Preds ..B1.40
-                                # Execution count [9.00e-01]
-        movq      8(%rsp), %r15                                 #[spill]
-	.cfi_restore 15
-        movq      (%rsp), %rbx                                  #[spill]
-	.cfi_restore 3
-                                # LOE rbx r15
-..B1.42:                        # Preds ..B1.2 ..B1.41
-                                # Execution count [1.00e+00]
-        xorl      %eax, %eax                                    #201.16
-        vzeroupper                                              #201.16
-..___tag_value_computeForce.43:
-#       getTimeStamp()
-        call      getTimeStamp                                  #201.16
-..___tag_value_computeForce.44:
-                                # LOE rbx r15 xmm0
-..B1.43:                        # Preds ..B1.42
-                                # Execution count [1.00e+00]
-        vsubsd    16(%rsp), %xmm0, %xmm0                        #204.14[spill]
-        addq      $104, %rsp                                    #204.14
-	.cfi_restore 14
-        popq      %r14                                          #204.14
-	.cfi_restore 13
-        popq      %r13                                          #204.14
-	.cfi_restore 12
-        popq      %r12                                          #204.14
-        movq      %rbp, %rsp                                    #204.14
-        popq      %rbp                                          #204.14
-	.cfi_def_cfa 7, 8
-	.cfi_restore 6
-        ret                                                     #204.14
-	.cfi_def_cfa 6, 16
-	.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
-	.cfi_offset 6, -16
-	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
-	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
-	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
-	.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
-                                # LOE
-..B1.44:                        # Preds ..B1.12
-                                # Execution count [4.50e-01]: Infreq
-        movl      %r11d, %r14d                                  #153.13
-        xorl      %r12d, %r12d                                  #153.13
-        andl      $-8, %r14d                                    #153.13
-        jmp       ..B1.25       # Prob 100%                     #153.13
-                                # LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-..B1.45:                        # Preds ..B1.11
-                                # Execution count [4.50e-01]: Infreq
-        xorl      %r14d, %r14d                                  #153.13
-        jmp       ..B1.33       # Prob 100%                     #153.13
-        .align    16,0x90
-                                # LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
-	.cfi_endproc
-# mark_end;
-	.type	computeForce,@function
-	.size	computeForce,.-computeForce
-..LNcomputeForce.0:
-	.data
-# -- End  computeForce
-	.section .rodata, "a"
-	.align 64
-	.align 64
-.L_2il0floatpacket.2:
-	.long	0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
-	.type	.L_2il0floatpacket.2,@object
-	.size	.L_2il0floatpacket.2,64
-	.align 64
-.L_2il0floatpacket.4:
-	.long	0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
-	.type	.L_2il0floatpacket.4,@object
-	.size	.L_2il0floatpacket.4,64
-	.align 64
-.L_2il0floatpacket.5:
-	.long	0x02010101,0x04040202,0x08080804,0x20101010,0x40402020,0x80808040,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000
-	.type	.L_2il0floatpacket.5,@object
-	.size	.L_2il0floatpacket.5,64
-	.align 64
-.L_2il0floatpacket.6:
-	.long	0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000,0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000
-	.type	.L_2il0floatpacket.6,@object
-	.size	.L_2il0floatpacket.6,64
-	.align 64
-.L_2il0floatpacket.7:
-	.long	0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000,0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000
-	.type	.L_2il0floatpacket.7,@object
-	.size	.L_2il0floatpacket.7,64
-	.align 64
-.L_2il0floatpacket.8:
-	.long	0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000,0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000
-	.type	.L_2il0floatpacket.8,@object
-	.size	.L_2il0floatpacket.8,64
-	.align 64
-.L_2il0floatpacket.10:
-	.long	0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
-	.type	.L_2il0floatpacket.10,@object
-	.size	.L_2il0floatpacket.10,64
-	.align 32
-.L_2il0floatpacket.0:
-	.long	0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
-	.type	.L_2il0floatpacket.0,@object
-	.size	.L_2il0floatpacket.0,32
-	.align 32
-.L_2il0floatpacket.1:
-	.long	0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
-	.type	.L_2il0floatpacket.1,@object
-	.size	.L_2il0floatpacket.1,32
-	.align 8
-.L_2il0floatpacket.3:
-	.long	0x00000000,0x40480000
-	.type	.L_2il0floatpacket.3,@object
-	.size	.L_2il0floatpacket.3,8
-	.align 8
-.L_2il0floatpacket.9:
-	.long	0x00000000,0x3ff00000
-	.type	.L_2il0floatpacket.9,@object
-	.size	.L_2il0floatpacket.9,8
-	.data
-	.section .note.GNU-stack, ""
-# End
--- a/asm/unused/force.s
+++ b/asm/unused/force.s
@@ -1,324 +0,0 @@
-.intel_syntax noprefix
-
-.text
-.align    16,0x90
-.globl computeForce
-computeForce:
-# parameter 1: rdi Parameter*
-# parameter 2: rsi Atom*
-# parameter 3: rdx Neighbor*
-        push      rbp
-        push      r12
-        push      r13
-        push      r14
-        push      r15
-        push      rbx
-        #call      getTimeStamp                                      # xmm0 <- getTimeStamp()
-        #vmovsd    QWORD PTR [-56+rsp], xmm0                         # [-56+rsp] <- xmm0 [spill]
-        mov       r9d, DWORD PTR [4+rsi]                            # r9d <- atom->Nlocal
-        vmovsd    xmm2, QWORD PTR [96+rdi]                          # xmm2 <- param->cutforce
-        vmovsd    xmm1, QWORD PTR [32+rdi]                          # xmm1 <- param->sigma6
-        vmovsd    xmm0, QWORD PTR [24+rdi]                          # xmm0 <- param->epsilon
-        mov       r13, QWORD PTR [64+rsi]                           # r13 <- atom->fx
-        mov       r14, QWORD PTR [72+rsi]                           # r14 <- atom->fy
-        mov       rdi, QWORD PTR [80+rsi]                           # rdi <- atom->fz
-        test      r9d, r9d                                          # atom->Nlocal <= 0
-        jle       ..atom_loop_exit
-        xor       r10d, r10d                                        # r10d <- 0
-        mov       ecx, r9d                                          # ecx <- atom->Nlocal
-        xor       r8d, r8d                                          # r8d <- 0
-        mov       r11d, 1                                           # r11d <- 1
-        xor       eax, eax                                          # eax <- 0
-        shr       ecx, 1                                            # ecx <- atom->Nlocal >> 1
-        je        ..zero_last_element                               # ecx == 0
-
-# Init forces to zero loop (unroll factor = 2)
-..init_force_loop:
-        mov       QWORD PTR [r8+r13], rax                           # fx[i] <- 0
-        mov       QWORD PTR [r8+r14], rax                           # fy[i] <- 0
-        mov       QWORD PTR [r8+rdi], rax                           # fz[i] <- 0
-        mov       QWORD PTR [8+r8+r13], rax                         # fx[i] <- 0
-        mov       QWORD PTR [8+r8+r14], rax                         # fy[i] <- 0
-        mov       QWORD PTR [8+r8+rdi], rax                         # fz[i] <- 0
-        add       r8, 16                                            # i++
-        inc       r10                                               # i++
-        cmp       r10, rcx                                          # i < Nlocal
-        jb        ..init_force_loop
-
-# Trick to make r11d contain value of last element to be zeroed plus 1
-# Maybe we can directly put r10+10 here and zero r11d above, then remove the -1 below
-        lea       r11d, DWORD PTR [1+r10+r10]                       # r11d <- i * 2 + 1
-..zero_last_element:
-        lea       ecx, DWORD PTR [-1+r11]                           # ecx <- i * 2
-        cmp       ecx, r9d                                          # i >= Nlocal
-        jae       ..before_atom_loop
-
-        # Set last element to zero
-        movsxd    r11, r11d                                         # r11 <- i * 2
-        mov       QWORD PTR [-8+r13+r11*8], rax                     # fx[i] <- 0
-        mov       QWORD PTR [-8+r14+r11*8], rax                     # fy[i] <- 0
-        mov       QWORD PTR [-8+rdi+r11*8], rax                     # fz[i] <- 0
-
-# Initialize registers to be used within atom loop
-..before_atom_loop:
-        vmulsd    xmm15, xmm2, xmm2                                 # xmm15 <- cutforcesq
-        vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip]      # ymm18 <- [8, ...]
-        vmulsd    xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip]   # xmm0 <- 48 *  epsilon
-        vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip]      # ymm17 <- [0..7]
-        vmovups   zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip]       # zmm7 <- [0.5, ...]
-        vbroadcastsd zmm16, xmm15                                   # zmm16 <- [cutforcesq, ...]
-        vbroadcastsd zmm15, xmm1                                    # zmm15 <- [param->sigma6, ...]
-        vbroadcastsd zmm14, xmm0                                    # zmm14 <- [48 * epsilon, ...]
-        movsxd    r9, r9d                                           # r9 <- atom->Nlocal
-        xor       r10d, r10d                                        # r10d <- 0 (i)
-        mov       rcx, QWORD PTR [24+rdx]                           # rcx <- neighbor->numneigh
-        mov       r11, QWORD PTR [8+rdx]                            # r11 <- neighbor->neighbors
-        movsxd    r12, DWORD PTR [16+rdx]                           # r12 <- neighbor->maxneighs
-        mov       rdx, QWORD PTR [16+rsi]                           # rdx <- atom->x
-        ### AOS
-        xor       eax, eax
-        ### SOA
-        #mov       rax, QWORD PTR [24+rsi]                          # rax <- atom->y
-        #mov       rsi, QWORD PTR [32+rsi]                          # rsi <- atom->z
-        ###
-        shl       r12, 2                                            # r12 <- neighbor->maxneighs * 4
-
-        # Register spilling
-        mov       QWORD PTR [-32+rsp], r9                           # [-32+rsp] <- atom->Nlocal
-        mov       QWORD PTR [-24+rsp], rcx                          # [-24+rsp] <- neighbor->numneigh
-        mov       QWORD PTR [-16+rsp], r14                          # [-16+rsp] <- atom->fy
-        mov       QWORD PTR [-8+rsp], r13                           # [-8+rsp] <- atom->fx
-        mov       QWORD PTR [-40+rsp], r15                          # [-40+rsp] <- r15
-        mov       QWORD PTR [-48+rsp], rbx                          # [-48+rsp] <- rbx
-
-..atom_loop_begin:
-        mov       rcx, QWORD PTR [-24+rsp]                          # rcx <- neighbor->numneigh
-        vxorpd    xmm25, xmm25, xmm25                               # xmm25 <- 0 (fix)
-        vmovapd   xmm20, xmm25                                      # xmm20 <- 0 (fiy)
-        mov       r13d, DWORD PTR [rcx+r10*4]                       # r13d <- neighbor->numneigh[i] (numneighs)
-        vmovapd   xmm4, xmm20                                       # xmm4 <- 0 (fiz)
-
-        ### AOS
-        vmovsd    xmm8, QWORD PTR[rdx+rax]                          # xmm8 <- atom->x[i * 3]
-        vmovsd    xmm9, QWORD PTR[8+rdx+rax]                        # xmm9 <- atom->x[i * 3 + 1]
-        vmovsd    xmm10, QWORD PTR[16+rdx+rax]                      # xmm10 <- atom->x[i * 3 + 2]
-        ### SOA
-        #vmovsd    xmm8, QWORD PTR [rdx+r10*8]                      # xmm8 <- atom->x[i]
-        #vmovsd    xmm9, QWORD PTR [rax+r10*8]                      # xmm9 <- atom->y[i]
-        #vmovsd    xmm10, QWORD PTR [rsi+r10*8]                     # xmm10 <- atom->z[i]
-        ###
-        vbroadcastsd zmm0, xmm8                                     # zmm0 <- atom_x(i)
-        vbroadcastsd zmm1, xmm9                                     # zmm1 <- atom_y(i)
-        vbroadcastsd zmm2, xmm10                                    # zmm2 <- atom_z(i)
-        test      r13d, r13d                                        # numneighs <= 0
-        jle       ..atom_loop_exit
-
-        vpxord    zmm13, zmm13, zmm13                               # zmm13 <- 0 (fix)
-        vmovaps   zmm12, zmm13                                      # zmm12 <- 0 (fiy)
-        vmovaps   zmm11, zmm12                                      # zmm11 <- 0 (fiz)
-        mov       rcx, r12                                          # rcx <- neighbor->maxneighs * 4
-        imul      rcx, r10                                          # rcx <- neighbor->maxneighs * 4 * i
-        add       rcx, r11                                          # rcx <- &neighbor->neighbors[neighbor->maxneighs * i]
-        xor       r9d, r9d                                          # r9d <- 0 (k)
-        mov       r14d, r13d                                        # r14d <- numneighs
-        cmp       r14d, 8
-        jl        ..compute_forces_remainder
-
-..compute_forces:
-        vpcmpeqb  k1, xmm0, xmm0
-        vpcmpeqb  k2, xmm0, xmm0
-        vpcmpeqb  k3, xmm0, xmm0
-        vmovdqu   ymm3, YMMWORD PTR [rcx+r9*4]
-        vpxord    zmm5, zmm5, zmm5
-        vpxord    zmm6, zmm6, zmm6
-
-        ### AOS
-        vpaddd     ymm4, ymm3, ymm3
-        vpaddd     ymm3, ymm3, ymm4
-        vpxord     zmm4, zmm4, zmm4
-        vgatherdpd zmm4{k1}, [rdx+ymm3*8]
-        vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]
-        vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]
-        ### SOA
-        #vpxord     zmm4, zmm4, zmm4
-        #vgatherdpd zmm5{k2}, [rax+ymm3*8]
-        #vgatherdpd zmm4{k1}, [rdx+ymm3*8]
-        #vgatherdpd zmm6{k3}, [rsi+ymm3*8]
-        ###
-
-        vsubpd    zmm29, zmm1, zmm5                                 # zmm29 <- atom_y(i) - atom_y(j) -- dely
-        vsubpd    zmm28, zmm0, zmm4                                 # zmm28 <- atom_x(i) - atom_x(j) -- delx
-        vsubpd    zmm31, zmm2, zmm6                                 # zmm31 <- atom_z(i) - atom_z(j) -- delz
-        vmulpd    zmm20, zmm29, zmm29                               # zmm20 <- dely * dely
-        vfmadd231pd zmm20, zmm28, zmm28                             # zmm20 <- dely * dely + delx * delx
-        vfmadd231pd zmm20, zmm31, zmm31                             # zmm20 <- zmm20 + delz * delz --  rsq
-
-        # Cutoff radius condition
-        vrcp14pd  zmm27, zmm20                                      # zmm27 <- 1.0 / rsq (sr2)
-        vcmppd    k5, zmm20, zmm16, 1                               # k5 <- rsq < cutforcesq
-        vmulpd    zmm22, zmm27, zmm15                               # zmm22 <-  sr2 * sigma6
-        vmulpd    zmm24, zmm27, zmm14                               # zmm24 <- 48.0 * epsilon * sr2
-        vmulpd    zmm25, zmm27, zmm22                               # zmm25 <- sr2 * sigma6 * sr2
-        vmulpd    zmm23, zmm27, zmm25                               # zmm23 <- sr2 * sigma6 * sr2 * sr2
-        vfmsub213pd zmm27, zmm25, zmm7                              # zmm27 <- sr2 * sigma * sr2 * sr2 - 0.5
-        vmulpd    zmm26, zmm23, zmm24                               # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
-        vmulpd    zmm30, zmm26, zmm27                               # zmm30 <- force
-        vfmadd231pd zmm13{k5}, zmm30, zmm28                         # fix += force * delx
-        vfmadd231pd zmm12{k5}, zmm30, zmm29                         # fiy += force * dely
-        vfmadd231pd zmm11{k5}, zmm30, zmm31                         # fiz += force * delz
-        sub       r14d, 8
-        add       r9, 8
-        cmp       r14d, 8
-        jge       ..compute_forces
-
-# Check if there are remaining neighbors to be computed
-..compute_forces_remainder:
-        test      r14d, r14d
-        jle       ..sum_up_forces
-
-        vpbroadcastd ymm4, r14d
-        vpcmpgtd  k1, ymm4, ymm17
-        kmovw     r15d, k1
-        vmovdqu32 ymm3{k1}{z}, YMMWORD PTR [rcx+r9*4]
-        kmovw     k2, k1
-        kmovw     k3, k1
-        vpxord    zmm5, zmm5, zmm5
-        vpxord    zmm6, zmm6, zmm6
-
-        ### AOS
-        vpaddd     ymm4, ymm3, ymm3
-        vpaddd     ymm3, ymm3, ymm4
-        vpxord     zmm4, zmm4, zmm4
-        vgatherdpd zmm4{k1}, [rdx+ymm3*8]
-        vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]
-        vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]
-        #### SOA
-        #vpxord     zmm4, zmm4, zmm4
-        #vgatherdpd zmm5{k2}, [rax+ymm3*8]
-        #vgatherdpd zmm4{k1}, [rdx+ymm3*8]
-        #vgatherdpd zmm6{k3}, [rsi+ymm3*8]
-        ###
-
-        vsubpd    zmm29, zmm1, zmm5                                 # zmm29 <- atom_y(i) - atom_y(j) -- dely
-        vsubpd    zmm28, zmm0, zmm4                                 # zmm28 <- atom_x(i) - atom_x(j) -- delx
-        vsubpd    zmm31, zmm2, zmm6                                 # zmm31 <- atom_z(i) - atom_z(j) -- delz
-        vmulpd    zmm20, zmm29, zmm29                               # zmm20 <- dely * dely
-        vfmadd231pd zmm20, zmm28, zmm28                             # zmm20 <- dely * dely + delx * delx
-        vfmadd231pd zmm20, zmm31, zmm31                             # zmm20 <- zmm20 + delz * delz --  rsq
-
-        # Cutoff radius condition
-        vrcp14pd  zmm27, zmm20                                      # zmm27 <- 1.0 / rsq (sr2)
-        vcmppd    k5, zmm20, zmm16, 1                               # k5 <- rsq < cutforcesq
-        kmovw     r9d, k5                                           # r9d <- rsq < cutforcesq
-        and       r15d, r9d                                         # r15d <- rsq < cutforcesq && k < numneighs
-        kmovw     k3, r15d                                          # k3 <- rsq < cutforcesq && k < numneighs
-        vmulpd    zmm22, zmm27, zmm15                               # zmm22 <-  sr2 * sigma6
-        vmulpd    zmm24, zmm27, zmm14                               # zmm24 <- 48.0 * epsilon * sr2
-        vmulpd    zmm25, zmm27, zmm22                               # zmm25 <- sr2 * sigma6 * sr2
-        vmulpd    zmm23, zmm27, zmm25                               # zmm23 <- sr2 * sigma6 * sr2 * sr2
-        vfmsub213pd zmm27, zmm25, zmm7                              # zmm27 <- sr2 * sigma * sr2 * sr2 - 0.5
-        vmulpd    zmm26, zmm23, zmm24                               # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
-        vmulpd    zmm30, zmm26, zmm27                               # zmm30 <- force
-        vfmadd231pd zmm13{k3}, zmm30, zmm28                         # fix += force * delx
-        vfmadd231pd zmm12{k3}, zmm30, zmm29                         # fiy += force * dely
-        vfmadd231pd zmm11{k3}, zmm30, zmm31                         # fiz += force * delz
-
-# Forces are currently separated in different lanes of zmm registers, hence it is necessary to permutate
-# and add them (reduction) to obtain the final contribution for the current atom
-..sum_up_forces:
-        vmovups   zmm10, ZMMWORD PTR .L_2il0floatpacket.6[rip]
-        vpermd    zmm0, zmm10, zmm11
-        vpermd    zmm5, zmm10, zmm12
-        vpermd    zmm21, zmm10, zmm13
-        vaddpd    zmm11, zmm0, zmm11
-        vaddpd    zmm12, zmm5, zmm12
-        vaddpd    zmm13, zmm21, zmm13
-        vpermpd   zmm1, zmm11, 78
-        vpermpd   zmm6, zmm12, 78
-        vpermpd   zmm22, zmm13, 78
-        vaddpd    zmm2, zmm11, zmm1
-        vaddpd    zmm8, zmm12, zmm6
-        vaddpd    zmm23, zmm13, zmm22
-        vpermpd   zmm3, zmm2, 177
-        vpermpd   zmm9, zmm8, 177
-        vpermpd   zmm24, zmm23, 177
-        vaddpd    zmm4, zmm2, zmm3
-        vaddpd    zmm20, zmm8, zmm9
-        vaddpd    zmm25, zmm23, zmm24
-
-..atom_loop_exit:
-        mov       rcx, QWORD PTR [-8+rsp]                       #84.9[spill]
-        mov       rbx, QWORD PTR [-16+rsp]                      #85.9[spill]
-
-        ### AOS
-        add       rax, 24
-        ###
-
-        vaddsd    xmm0, xmm25, QWORD PTR [rcx+r10*8]            #84.9
-        vmovsd    QWORD PTR [rcx+r10*8], xmm0                   #84.9
-        vaddsd    xmm1, xmm20, QWORD PTR [rbx+r10*8]            #85.9
-        vmovsd    QWORD PTR [rbx+r10*8], xmm1                   #85.9
-        vaddsd    xmm2, xmm4, QWORD PTR [rdi+r10*8]             #86.9
-        vmovsd    QWORD PTR [rdi+r10*8], xmm2                   #86.9
-        inc       r10                                           #55.5
-        cmp       r10, QWORD PTR [-32+rsp]                      #55.5[spill]
-        jb        ..atom_loop_begin
-        vzeroupper                                              #93.12
-        vxorpd    xmm0, xmm0, xmm0                              #93.12
-        #call      getTimeStamp                                  # xmm0 <- getTimeStamp()
-        #vsubsd    xmm0, xmm0, QWORD PTR [-56+rsp]               # xmm0 <- E-S
-        pop       rbx
-        pop       r15
-        pop       r14                                           #93.12
-        pop       r13                                           #93.12
-        pop       r12                                           #93.12
-        pop       rbp                                           #93.12
-        ret                                                     #93.12
-
-.type	computeForce,@function
-.size	computeForce,.-computeForce
-
-
-..LNcomputeForce.0:
-	.data
-# -- End  computeForce
-	.section .rodata, "a"
-	.align 64
-	.align 64
-.L_2il0floatpacket.2:
-	.long	0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
-	.type	.L_2il0floatpacket.2,@object
-	.size	.L_2il0floatpacket.2,64
-	.align 64
-.L_2il0floatpacket.4:
-	.long	0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
-	.type	.L_2il0floatpacket.4,@object
-	.size	.L_2il0floatpacket.4,64
-	.align 64
-.L_2il0floatpacket.6:
-	.long	0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
-	.type	.L_2il0floatpacket.6,@object
-	.size	.L_2il0floatpacket.6,64
-	.align 32
-.L_2il0floatpacket.0:
-	.long	0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
-	.type	.L_2il0floatpacket.0,@object
-	.size	.L_2il0floatpacket.0,32
-	.align 32
-.L_2il0floatpacket.1:
-	.long	0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
-	.type	.L_2il0floatpacket.1,@object
-	.size	.L_2il0floatpacket.1,32
-	.align 8
-.L_2il0floatpacket.3:
-	.long	0x00000000,0x40480000
-	.type	.L_2il0floatpacket.3,@object
-	.size	.L_2il0floatpacket.3,8
-	.align 8
-.L_2il0floatpacket.5:
-	.long	0x00000000,0x3ff00000
-	.type	.L_2il0floatpacket.5,@object
-	.size	.L_2il0floatpacket.5,8
-	.data
-	.section .note.GNU-stack, ""
-# End
--- a/asm/unused/force_lj.s
+++ b/asm/unused/force_lj.s
@@ -1,326 +0,0 @@
-.intel_syntax noprefix
-
-.text
-.align    16,0x90
-.globl computeForceLJ
-computeForceLJ:
-# parameter 1: rdi Parameter*
-# parameter 2: rsi Atom*
-# parameter 3: rdx Neighbor*
-        push      rbp
-        push      r12
-        push      r13
-        push      r14
-        push      r15
-        push      rbx
-        mov       r9d, DWORD PTR [4+rsi]                            # r9d <- atom->Nlocal
-        vmovsd    xmm2, QWORD PTR [96+rdi]                          # xmm2 <- param->cutforce
-        vmovsd    xmm1, QWORD PTR [32+rdi]                          # xmm1 <- param->sigma6
-        vmovsd    xmm0, QWORD PTR [24+rdi]                          # xmm0 <- param->epsilon
-        mov       r13, QWORD PTR [64+rsi]                           # r13 <- atom->fx
-        mov       r14, QWORD PTR [72+rsi]                           # r14 <- atom->fy
-        mov       rdi, QWORD PTR [80+rsi]                           # rdi <- atom->fz
-        test      r9d, r9d                                          # atom->Nlocal <= 0
-        jle       ..atom_loop_exit
-        xor       r10d, r10d                                        # r10d <- 0
-        mov       ecx, r9d                                          # ecx <- atom->Nlocal
-        xor       r8d, r8d                                          # r8d <- 0
-        mov       r11d, 1                                           # r11d <- 1
-        xor       eax, eax                                          # eax <- 0
-        shr       ecx, 1                                            # ecx <- atom->Nlocal >> 1
-        je        ..zero_last_element                               # ecx == 0
-
-# Init forces to zero loop (unroll factor = 2)
-..init_force_loop:
-        mov       QWORD PTR [r8+r13], rax                           # fx[i] <- 0
-        mov       QWORD PTR [r8+r14], rax                           # fy[i] <- 0
-        mov       QWORD PTR [r8+rdi], rax                           # fz[i] <- 0
-        mov       QWORD PTR [8+r8+r13], rax                         # fx[i] <- 0
-        mov       QWORD PTR [8+r8+r14], rax                         # fy[i] <- 0
-        mov       QWORD PTR [8+r8+rdi], rax                         # fz[i] <- 0
-        add       r8, 16                                            # i++
-        inc       r10                                               # i++
-        cmp       r10, rcx                                          # i < Nlocal
-        jb        ..init_force_loop
-
-# Trick to make r11d contain value of last element to be zeroed plus 1
-# Maybe we can directly put r10+10 here and zero r11d above, then remove the -1 below
-        lea       r11d, DWORD PTR [1+r10+r10]                       # r11d <- i * 2 + 1
-..zero_last_element:
-        lea       ecx, DWORD PTR [-1+r11]                           # ecx <- i * 2
-        cmp       ecx, r9d                                          # i >= Nlocal
-        jae       ..before_atom_loop
-
-        # Set last element to zero
-        movsxd    r11, r11d                                         # r11 <- i * 2
-        mov       QWORD PTR [-8+r13+r11*8], rax                     # fx[i] <- 0
-        mov       QWORD PTR [-8+r14+r11*8], rax                     # fy[i] <- 0
-        mov       QWORD PTR [-8+rdi+r11*8], rax                     # fz[i] <- 0
-
-# Initialize registers to be used within atom loop
-..before_atom_loop:
-        vmulsd    xmm15, xmm2, xmm2                                 # xmm15 <- cutforcesq
-        vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip]      # ymm18 <- [8, ...]
-        vmulsd    xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip]   # xmm0 <- 48 *  epsilon
-        vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip]      # ymm17 <- [0..7]
-        vmovups   zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip]       # zmm7 <- [0.5, ...]
-        vbroadcastsd zmm16, xmm15                                   # zmm16 <- [cutforcesq, ...]
-        vbroadcastsd zmm15, xmm1                                    # zmm15 <- [param->sigma6, ...]
-        vbroadcastsd zmm14, xmm0                                    # zmm14 <- [48 * epsilon, ...]
-        movsxd    r9, r9d                                           # r9 <- atom->Nlocal
-        xor       r10d, r10d                                        # r10d <- 0 (i)
-        mov       rcx, QWORD PTR [24+rdx]                           # rcx <- neighbor->numneigh
-        mov       r11, QWORD PTR [8+rdx]                            # r11 <- neighbor->neighbors
-        movsxd    r12, DWORD PTR [16+rdx]                           # r12 <- neighbor->maxneighs
-        mov       rdx, QWORD PTR [16+rsi]                           # rdx <- atom->x
-        ### AOS
-        xor       eax, eax
-        ### SOA
-        #mov       rax, QWORD PTR [24+rsi]                          # rax <- atom->y
-        #mov       rsi, QWORD PTR [32+rsi]                          # rsi <- atom->z
-        ###
-        shl       r12, 2                                            # r12 <- neighbor->maxneighs * 4
-
-        # Register spilling
-        mov       QWORD PTR [-32+rsp], r9                           # [-32+rsp] <- atom->Nlocal
-        mov       QWORD PTR [-24+rsp], rcx                          # [-24+rsp] <- neighbor->numneigh
-        mov       QWORD PTR [-16+rsp], r14                          # [-16+rsp] <- atom->fy
-        mov       QWORD PTR [-8+rsp], r13                           # [-8+rsp] <- atom->fx
-        mov       QWORD PTR [-40+rsp], r15                          # [-40+rsp] <- r15
-        mov       QWORD PTR [-48+rsp], rbx                          # [-48+rsp] <- rbx
-        #sub       rsp, 64
-        #call      getTimeStamp                                      # xmm0 <- getTimeStamp()
-        #vmovsd    QWORD PTR [-56+rsp], xmm0                         # [-56+rsp] <- xmm0 [spill]
-        #add       rsp, 64
-
-..atom_loop_begin:
-        mov       rcx, QWORD PTR [-24+rsp]                          # rcx <- neighbor->numneigh
-        vxorpd    xmm25, xmm25, xmm25                               # xmm25 <- 0 (fix)
-        vmovapd   xmm20, xmm25                                      # xmm20 <- 0 (fiy)
-        mov       r13d, DWORD PTR [rcx+r10*4]                       # r13d <- neighbor->numneigh[i] (numneighs)
-        vmovapd   xmm4, xmm20                                       # xmm4 <- 0 (fiz)
-
-        ### AOS
-        vmovsd    xmm8, QWORD PTR[rdx+rax]                          # xmm8 <- atom->x[i * 3]
-        vmovsd    xmm9, QWORD PTR[8+rdx+rax]                        # xmm9 <- atom->x[i * 3 + 1]
-        vmovsd    xmm10, QWORD PTR[16+rdx+rax]                      # xmm10 <- atom->x[i * 3 + 2]
-        ### SOA
-        #vmovsd    xmm8, QWORD PTR [rdx+r10*8]                      # xmm8 <- atom->x[i]
-        #vmovsd    xmm9, QWORD PTR [rax+r10*8]                      # xmm9 <- atom->y[i]
-        #vmovsd    xmm10, QWORD PTR [rsi+r10*8]                     # xmm10 <- atom->z[i]
-        ###
-        vbroadcastsd zmm0, xmm8                                     # zmm0 <- atom_x(i)
-        vbroadcastsd zmm1, xmm9                                     # zmm1 <- atom_y(i)
-        vbroadcastsd zmm2, xmm10                                    # zmm2 <- atom_z(i)
-        test      r13d, r13d                                        # numneighs <= 0
-        jle       ..atom_loop_exit
-
-        vpxord    zmm13, zmm13, zmm13                               # zmm13 <- 0 (fix)
-        vmovaps   zmm12, zmm13                                      # zmm12 <- 0 (fiy)
-        vmovaps   zmm11, zmm12                                      # zmm11 <- 0 (fiz)
-        mov       rcx, r12                                          # rcx <- neighbor->maxneighs * 4
-        imul      rcx, r10                                          # rcx <- neighbor->maxneighs * 4 * i
-        add       rcx, r11                                          # rcx <- &neighbor->neighbors[neighbor->maxneighs * i]
-        xor       r9d, r9d                                          # r9d <- 0 (k)
-        mov       r14d, r13d                                        # r14d <- numneighs
-        cmp       r14d, 8
-        jl        ..compute_forces_remainder
-
-..compute_forces:
-        vpcmpeqb  k1, xmm0, xmm0
-        vpcmpeqb  k2, xmm0, xmm0
-        vpcmpeqb  k3, xmm0, xmm0
-        vmovdqu   ymm3, YMMWORD PTR [rcx+r9*4]
-        vpxord    zmm5, zmm5, zmm5
-        vpxord    zmm6, zmm6, zmm6
-
-        ### AOS
-        vpaddd     ymm4, ymm3, ymm3
-        vpaddd     ymm3, ymm3, ymm4
-        vpxord     zmm4, zmm4, zmm4
-        vgatherdpd zmm4{k1}, [rdx+ymm3*8]
-        vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]
-        vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]
-        ### SOA
-        #vpxord     zmm4, zmm4, zmm4
-        #vgatherdpd zmm5{k2}, [rax+ymm3*8]
-        #vgatherdpd zmm4{k1}, [rdx+ymm3*8]
-        #vgatherdpd zmm6{k3}, [rsi+ymm3*8]
-        ###
-
-        vsubpd    zmm29, zmm1, zmm5                                 # zmm29 <- atom_y(i) - atom_y(j) -- dely
-        vsubpd    zmm28, zmm0, zmm4                                 # zmm28 <- atom_x(i) - atom_x(j) -- delx
-        vsubpd    zmm31, zmm2, zmm6                                 # zmm31 <- atom_z(i) - atom_z(j) -- delz
-        vmulpd    zmm20, zmm29, zmm29                               # zmm20 <- dely * dely
-        vfmadd231pd zmm20, zmm28, zmm28                             # zmm20 <- dely * dely + delx * delx
-        vfmadd231pd zmm20, zmm31, zmm31                             # zmm20 <- zmm20 + delz * delz --  rsq
-
-        # Cutoff radius condition
-        vrcp14pd  zmm27, zmm20                                      # zmm27 <- 1.0 / rsq (sr2)
-        vcmppd    k5, zmm20, zmm16, 1                               # k5 <- rsq < cutforcesq
-        vmulpd    zmm22, zmm27, zmm15                               # zmm22 <-  sr2 * sigma6
-        vmulpd    zmm24, zmm27, zmm14                               # zmm24 <- 48.0 * epsilon * sr2
-        vmulpd    zmm25, zmm27, zmm22                               # zmm25 <- sr2 * sigma6 * sr2
-        vmulpd    zmm23, zmm27, zmm25                               # zmm23 <- sr2 * sigma6 * sr2 * sr2
-        vfmsub213pd zmm27, zmm25, zmm7                              # zmm27 <- sr2 * sigma * sr2 * sr2 - 0.5
-        vmulpd    zmm26, zmm23, zmm24                               # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
-        vmulpd    zmm30, zmm26, zmm27                               # zmm30 <- force
-        vfmadd231pd zmm13{k5}, zmm30, zmm28                         # fix += force * delx
-        vfmadd231pd zmm12{k5}, zmm30, zmm29                         # fiy += force * dely
-        vfmadd231pd zmm11{k5}, zmm30, zmm31                         # fiz += force * delz
-        sub       r14d, 8
-        add       r9, 8
-        cmp       r14d, 8
-        jge       ..compute_forces
-
-# Check if there are remaining neighbors to be computed
-..compute_forces_remainder:
-        test      r14d, r14d
-        jle       ..sum_up_forces
-
-        vpbroadcastd ymm4, r14d
-        vpcmpgtd  k1, ymm4, ymm17
-        kmovw     r15d, k1
-        vmovdqu32 ymm3{k1}{z}, YMMWORD PTR [rcx+r9*4]
-        kmovw     k2, k1
-        kmovw     k3, k1
-        vpxord    zmm5, zmm5, zmm5
-        vpxord    zmm6, zmm6, zmm6
-
-        ### AOS
-        vpaddd     ymm4, ymm3, ymm3
-        vpaddd     ymm3, ymm3, ymm4
-        vpxord     zmm4, zmm4, zmm4
-        vgatherdpd zmm4{k1}, [rdx+ymm3*8]
-        vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]
-        vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]
-        #### SOA
-        #vpxord     zmm4, zmm4, zmm4
-        #vgatherdpd zmm5{k2}, [rax+ymm3*8]
-        #vgatherdpd zmm4{k1}, [rdx+ymm3*8]
-        #vgatherdpd zmm6{k3}, [rsi+ymm3*8]
-        ###
-
-        vsubpd    zmm29, zmm1, zmm5                                 # zmm29 <- atom_y(i) - atom_y(j) -- dely
-        vsubpd    zmm28, zmm0, zmm4                                 # zmm28 <- atom_x(i) - atom_x(j) -- delx
-        vsubpd    zmm31, zmm2, zmm6                                 # zmm31 <- atom_z(i) - atom_z(j) -- delz
-        vmulpd    zmm20, zmm29, zmm29                               # zmm20 <- dely * dely
-        vfmadd231pd zmm20, zmm28, zmm28                             # zmm20 <- dely * dely + delx * delx
-        vfmadd231pd zmm20, zmm31, zmm31                             # zmm20 <- zmm20 + delz * delz --  rsq
-
-        # Cutoff radius condition
-        vrcp14pd  zmm27, zmm20                                      # zmm27 <- 1.0 / rsq (sr2)
-        vcmppd    k5, zmm20, zmm16, 1                               # k5 <- rsq < cutforcesq
-        kmovw     r9d, k5                                           # r9d <- rsq < cutforcesq
-        and       r15d, r9d                                         # r15d <- rsq < cutforcesq && k < numneighs
-        kmovw     k3, r15d                                          # k3 <- rsq < cutforcesq && k < numneighs
-        vmulpd    zmm22, zmm27, zmm15                               # zmm22 <-  sr2 * sigma6
-        vmulpd    zmm24, zmm27, zmm14                               # zmm24 <- 48.0 * epsilon * sr2
-        vmulpd    zmm25, zmm27, zmm22                               # zmm25 <- sr2 * sigma6 * sr2
-        vmulpd    zmm23, zmm27, zmm25                               # zmm23 <- sr2 * sigma6 * sr2 * sr2
-        vfmsub213pd zmm27, zmm25, zmm7                              # zmm27 <- sr2 * sigma * sr2 * sr2 - 0.5
-        vmulpd    zmm26, zmm23, zmm24                               # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
-        vmulpd    zmm30, zmm26, zmm27                               # zmm30 <- force
-        vfmadd231pd zmm13{k3}, zmm30, zmm28                         # fix += force * delx
-        vfmadd231pd zmm12{k3}, zmm30, zmm29                         # fiy += force * dely
-        vfmadd231pd zmm11{k3}, zmm30, zmm31                         # fiz += force * delz
-
-# Forces are currently separated in different lanes of zmm registers, hence it is necessary to permutate
-# and add them (reduction) to obtain the final contribution for the current atom
-..sum_up_forces:
-        vmovups   zmm10, ZMMWORD PTR .L_2il0floatpacket.6[rip]
-        vpermd    zmm0, zmm10, zmm11
-        vpermd    zmm5, zmm10, zmm12
-        vpermd    zmm21, zmm10, zmm13
-        vaddpd    zmm11, zmm0, zmm11
-        vaddpd    zmm12, zmm5, zmm12
-        vaddpd    zmm13, zmm21, zmm13
-        vpermpd   zmm1, zmm11, 78
-        vpermpd   zmm6, zmm12, 78
-        vpermpd   zmm22, zmm13, 78
-        vaddpd    zmm2, zmm11, zmm1
-        vaddpd    zmm8, zmm12, zmm6
-        vaddpd    zmm23, zmm13, zmm22
-        vpermpd   zmm3, zmm2, 177
-        vpermpd   zmm9, zmm8, 177
-        vpermpd   zmm24, zmm23, 177
-        vaddpd    zmm4, zmm2, zmm3
-        vaddpd    zmm20, zmm8, zmm9
-        vaddpd    zmm25, zmm23, zmm24
-
-..atom_loop_exit:
-        mov       rcx, QWORD PTR [-8+rsp]                       #84.9[spill]
-        mov       rbx, QWORD PTR [-16+rsp]                      #85.9[spill]
-
-        ### AOS
-        add       rax, 24
-        ###
-
-        vaddsd    xmm0, xmm25, QWORD PTR [rcx+r10*8]            #84.9
-        vmovsd    QWORD PTR [rcx+r10*8], xmm0                   #84.9
-        vaddsd    xmm1, xmm20, QWORD PTR [rbx+r10*8]            #85.9
-        vmovsd    QWORD PTR [rbx+r10*8], xmm1                   #85.9
-        vaddsd    xmm2, xmm4, QWORD PTR [rdi+r10*8]             #86.9
-        vmovsd    QWORD PTR [rdi+r10*8], xmm2                   #86.9
-        inc       r10                                           #55.5
-        cmp       r10, QWORD PTR [-32+rsp]                      #55.5[spill]
-        jb        ..atom_loop_begin
-        vzeroupper                                              #93.12
-        vxorpd    xmm0, xmm0, xmm0                              #93.12
-        #call      getTimeStamp                                  # xmm0 <- getTimeStamp()
-        #vsubsd    xmm0, xmm0, QWORD PTR [-56+rsp]               # xmm0 <- E-S
-        pop       rbx
-        pop       r15
-        pop       r14                                           #93.12
-        pop       r13                                           #93.12
-        pop       r12                                           #93.12
-        pop       rbp                                           #93.12
-        ret                                                     #93.12
-
-.type	computeForceLJ,@function
-.size	computeForceLJ,.-computeForceLJ
-
-
-..LNcomputeForce.0:
-	.data
-# -- End  computeForceLJ
-	.section .rodata, "a"
-	.align 64
-	.align 64
-.L_2il0floatpacket.2:
-	.long	0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
-	.type	.L_2il0floatpacket.2,@object
-	.size	.L_2il0floatpacket.2,64
-	.align 64
-.L_2il0floatpacket.4:
-	.long	0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
-	.type	.L_2il0floatpacket.4,@object
-	.size	.L_2il0floatpacket.4,64
-	.align 64
-.L_2il0floatpacket.6:
-	.long	0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
-	.type	.L_2il0floatpacket.6,@object
-	.size	.L_2il0floatpacket.6,64
-	.align 32
-.L_2il0floatpacket.0:
-	.long	0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
-	.type	.L_2il0floatpacket.0,@object
-	.size	.L_2il0floatpacket.0,32
-	.align 32
-.L_2il0floatpacket.1:
-	.long	0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
-	.type	.L_2il0floatpacket.1,@object
-	.size	.L_2il0floatpacket.1,32
-	.align 8
-.L_2il0floatpacket.3:
-	.long	0x00000000,0x40480000
-	.type	.L_2il0floatpacket.3,@object
-	.size	.L_2il0floatpacket.3,8
-	.align 8
-.L_2il0floatpacket.5:
-	.long	0x00000000,0x3ff00000
-	.type	.L_2il0floatpacket.5,@object
-	.size	.L_2il0floatpacket.5,8
-	.data
-	.section .note.GNU-stack, ""
-# End
--- a/common/box.c
+++ b/common/box.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdio.h>
+#include <parameter.h>
+#include <util.h>
+#include <box.h>
+#include <mpi.h>
+
+int overlapBox(int dim, int dir, const Box* mybox, const Box* other, Box* cut, MD_FLOAT xprd, MD_FLOAT cutneigh)
+{
+  // Clips mybox against other (padded by cutneigh on the face selected by dir) and stores the region in cut.
+  // Returns 0 for a direct overlap, 1 (dir == 0) or -1 (any other dir) when only the copy of other
+  // displaced by xprd along dim overlaps, and -100 when neither test finds an overlap.
+  const int distinct = (mybox->id != other->id);
+  int shift = -100;
+  MD_FLOAT lower[3], upper[3];
+  // Unpadded intersection on all three axes
+  lower[_x] = MAX(mybox->lo[_x], other->lo[_x]);
+  lower[_y] = MAX(mybox->lo[_y], other->lo[_y]);
+  lower[_z] = MAX(mybox->lo[_z], other->lo[_z]);
+  upper[_x] = MIN(mybox->hi[_x], other->hi[_x]);
+  upper[_y] = MIN(mybox->hi[_y], other->hi[_y]);
+  upper[_z] = MIN(mybox->hi[_z], other->hi[_z]);
+
+  // Direct neighbour: only tested when the two boxes have different ids
+  if (distinct) {
+    if (dir == 0) upper[dim] = MIN(mybox->hi[dim], other->hi[dim] + cutneigh);
+    if (dir == 1) lower[dim] = MAX(mybox->lo[dim], other->lo[dim] - cutneigh);
+    if (lower[_x] < upper[_x] && lower[_y] < upper[_y] && lower[_z] < upper[_z]) shift = 0;
+  }
+
+  // Periodic neighbour: repeat the test along dim with other displaced by xprd
+  if (shift < 0) {
+    if (dir == 0) {
+      lower[dim] = MAX(mybox->lo[dim], other->lo[dim] - xprd);
+      upper[dim] = MIN(mybox->hi[dim], other->hi[dim] - xprd + cutneigh);
+    } else {
+      lower[dim] = MAX(mybox->lo[dim], other->lo[dim] + xprd - cutneigh);
+      upper[dim] = MIN(mybox->hi[dim], other->hi[dim] + xprd);
+    }
+    if (lower[_x] < upper[_x] && lower[_y] < upper[_y] && lower[_z] < upper[_z])
+      shift = (dir == 0) ? 1 : -1;
+  }
+  // Export the bounds that were tested last, whether or not they overlap
+  cut->lo[_x] = lower[_x]; cut->hi[_x] = upper[_x];
+  cut->lo[_y] = lower[_y]; cut->hi[_y] = upper[_y];
+  cut->lo[_z] = lower[_z]; cut->hi[_z] = upper[_z];
+  return shift;
+}
+
+int overlapFullBox(Parameter* param, MD_FLOAT* cutneigh, const Box* mybox, const Box* other)
+{
+  // Returns 1 if mybox intersects other grown by cutneigh[axis] on both sides of each axis, testing
+  // other at every combination of -1, 0 and +1 period shifts per axis (27 images); returns 0 otherwise.
+  const MD_FLOAT xprd = param->xprd;
+  const MD_FLOAT yprd = param->yprd;
+  const MD_FLOAT zprd = param->zprd;
+
+  for (int image = 0; image < 27; image++) {
+    // Decode the image number into per-axis shifts: x changes fastest, z slowest
+    const int i = image % 3 - 1;
+    const int j = (image / 3) % 3 - 1;
+    const int k = image / 9 - 1;
+    // An empty extent on any axis rules this image out
+    const MD_FLOAT xlo = MAX(mybox->lo[_x], other->lo[_x]-cutneigh[_x] + i*xprd);
+    const MD_FLOAT xhi = MIN(mybox->hi[_x], other->hi[_x]+cutneigh[_x] + i*xprd);
+    if (!(xlo < xhi)) continue;
+    const MD_FLOAT ylo = MAX(mybox->lo[_y], other->lo[_y]-cutneigh[_y] + j*yprd);
+    const MD_FLOAT yhi = MIN(mybox->hi[_y], other->hi[_y]+cutneigh[_y] + j*yprd);
+    if (!(ylo < yhi)) continue;
+    const MD_FLOAT zlo = MAX(mybox->lo[_z], other->lo[_z]-cutneigh[_z] + k*zprd);
+    const MD_FLOAT zhi = MIN(mybox->hi[_z], other->hi[_z]+cutneigh[_z] + k*zprd);
+    if (zlo < zhi) return 1;
+  }
+
+  return 0;
+}
+
+void expandBox(int iswap, const Box* me, const Box* other, Box* cut, MD_FLOAT cutneigh)
+{
+  // Widens cut by cutneigh on every face where me extends at least as far as other:
+  // the x faces for iswap 2 to 5, and additionally the y faces for iswap 4 and 5.
+  const int growX = (iswap >= 2 && iswap <= 5);
+  const int growY = (iswap == 4 || iswap == 5);
+
+  // x faces
+  if (growX && me->lo[_x] <= other->lo[_x]) cut->lo[_x] -= cutneigh;
+  if (growX && me->hi[_x] >= other->hi[_x]) cut->hi[_x] += cutneigh;
+  // y faces
+  if (growY && me->lo[_y] <= other->lo[_y]) cut->lo[_y] -= cutneigh;
+  if (growY && me->hi[_y] >= other->hi[_y]) cut->hi[_y] += cutneigh;
+}
+
--- a/common/comm.c
+++ b/common/comm.c
@@ -0,0 +1,556 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <comm.h>   
+#include <allocate.h>
+#include <mpi.h>
+#include <util.h>
+
+//Initial capacity of the neighbour arrays (nsend, nrecv, pbc_*, boxes, nexch), see setupComm
+#define NEIGHMIN  6       
+//Growth factor applied by growRecv, growSend and growList
+#define BUFFACTOR 2
+//Initial size (in elements) of the send/recv buffers and of every send list
+#define BUFMIN    1000
+//Extra elements allocated on top of maxsend for the send buffer
+#define BUFEXTRA  100
+//Shorthand for the global communicator
+#define world MPI_COMM_WORLD
+
+//MPI datatype matching MD_FLOAT: MPI_FLOAT when MD_FLOAT is 4 bytes wide, MPI_DOUBLE otherwise
+MPI_Datatype type = (sizeof(MD_FLOAT) == 4) ? MPI_FLOAT : MPI_DOUBLE; 
+//File-local helpers, defined at the end of this file
+static inline void allocDynamicBuffers(Comm*);
+static inline void freeDynamicBuffers(Comm*);
+static inline void freeBuffers(Comm*);
+
+/*
+ * Builds the receive lists from the send lists and flags, per swap, whether
+ * real MPI traffic is required.
+ *
+ * For every swap the receive list is a copy of the send list of the opposite
+ * direction in the same dimension; recvfrom/recvtill delimit that range in
+ * comm->nrecv. comm->othersend[iswap] is set to 0 only when the swap has
+ * exactly one neighbour and that neighbour is this process, 1 otherwise.
+ */
+void defineReverseList(Comm* comm){
+  int index = 0;
+
+  //Set the inverse list
+  for(int iswap = 0; iswap<6; iswap++){
+    int dim = comm->swapdim[iswap];
+    int dir = comm->swapdir[iswap];
+    int invswap = comm->swap[dim][(dir+1)%2];   //same dimension, opposite direction
+
+    for(int ineigh = comm->sendfrom[invswap]; ineigh< comm->sendtill[invswap]; ineigh++)
+      comm->nrecv[index++] = comm->nsend[ineigh];
+
+    comm->recvfrom[iswap] = (iswap == 0) ? 0 : comm->recvtill[iswap-1];
+    comm->recvtill[iswap] = index;
+  }
+
+  //set if myproc is unique in the swap
+  for(int iswap = 0; iswap<6; iswap++){
+    int first = comm->sendfrom[iswap];
+    int sizeswap = comm->sendtill[iswap] - first;
+    //nsend[first] is only read when the swap holds exactly one entry, so an
+    //empty swap never touches memory past the filled part of the list
+    int selfonly = (sizeswap == 1) && (comm->nsend[first] == comm->myproc);
+    comm->othersend[iswap] = selfonly ? 0 : 1;
+  }
+}
+
+/*
+ * Appends process id `newneigh` to comm->nexch. When the list is full its
+ * capacity (comm->maxneighexch) is doubled through reallocate() first.
+ */
+void addNeighToExchangeList(Comm* comm, int newneigh){
+
+    const int slot = comm->numneighexch;
+
+    //Grow the storage before writing past its end
+    if(slot >= comm->maxneighexch){
+      const size_t previousBytes = comm->maxneighexch*sizeof(int);
+      comm->maxneighexch *= 2;
+      comm->nexch = (int*) reallocate(comm->nexch, ALIGNMENT, comm->maxneighexch * sizeof(int), previousBytes);
+    }
+
+    comm->nexch[slot] = newneigh;
+    comm->numneighexch = slot + 1;
+}
+
+//Exported functions
+/*
+ * Rebuilds both neighbour lists of this process from the global box map.
+ *
+ *  - Exchange list (comm->nexch): every other process whose box, widened by
+ *    grid->cutneigh, overlaps the local box according to overlapFullBox().
+ *  - Swap lists (comm->nsend, comm->boxes, comm->pbc_*): for each of the six
+ *    swaps, every process (this one included) for which overlapBox() reports
+ *    an intersection; sendfrom/sendtill delimit the range of each swap.
+ *
+ * The per-neighbour dynamic buffers are released first and re-allocated for
+ * the new neighbour count at the end, then the receive lists are derived via
+ * defineReverseList().
+ */
+void neighComm(Comm *comm, Parameter* param, Grid *grid)
+{
+  int me = comm->myproc; 
+  int numproc = comm ->numproc;
+  int PAD = 6;   //number of elements for processor in the map
+  int ineigh = 0;   //running count of neighbours over all swaps
+  int sneigh = 0;   //not used in this function
+  MD_FLOAT *map = grid->map;
+  MD_FLOAT cutneigh = param->cutneigh;
+  MD_FLOAT prd[3] = {param->xprd, param->yprd, param->zprd};
+  Box mybox, other, cut;
+ 
+  //needed for rebalancing
+  //(uses the previous comm->numneigh, so it must run before the lists are rebuilt)
+  freeDynamicBuffers(comm);
+
+  //Local box
+  mybox.id = me;
+  mybox.lo[_x] = map[me*PAD+0];  mybox.hi[_x] = map[me*PAD+3];
+  mybox.lo[_y] = map[me*PAD+1];  mybox.hi[_y] = map[me*PAD+4];
+  mybox.lo[_z] = map[me*PAD+2];  mybox.hi[_z] = map[me*PAD+5];
+
+  //Check for all possible neighbours only for exchange atoms
+  comm->numneighexch = 0;
+  for(int proc = 0; proc <numproc; proc++){
+      other.id = proc;
+      other.lo[_x] = map[proc*PAD+0];  other.hi[_x] = map[proc*PAD+3];
+      other.lo[_y] = map[proc*PAD+1];  other.hi[_y] = map[proc*PAD+4];
+      other.lo[_z] = map[proc*PAD+2];  other.hi[_z] = map[proc*PAD+5];
+    
+    //the local process is never its own exchange partner
+    if(proc != me){
+      int intersection = overlapFullBox(param,grid->cutneigh,&mybox,&other);
+      if(intersection) addNeighToExchangeList(comm,proc);
+    }
+  }
+  
+  //MAP is stored as follows: xlo,ylo,zlo,xhi,yhi,zhi
+  for(int iswap = 0; iswap <6; iswap++)
+  {
+    int dir = comm->swapdir[iswap]; 
+    int dim = comm->swapdim[iswap]; 
+
+    for(int proc = 0; proc < numproc; proc++)
+    {      
+      //Check for neighbours along dimensions, for forwardComm, backwardComm and ghostComm
+      other.id = proc;
+      other.lo[_x] = map[proc*PAD+0];  other.hi[_x] = map[proc*PAD+3];
+      other.lo[_y] = map[proc*PAD+1];  other.hi[_y] = map[proc*PAD+4];
+      other.lo[_z] = map[proc*PAD+2];  other.hi[_z] = map[proc*PAD+5]; 
+          
+      //return if two boxes intersect: -100 not intersection, 0, 1 and -1 intersection for each different pbc.  
+      int pbc = overlapBox(dim,dir,&mybox,&other,&cut,prd[dim],cutneigh);
+      if(pbc == -100) continue;   
+      
+      //widen the cut region in the dimensions handled by earlier swaps
+      expandBox(iswap, &mybox, &other, &cut, cutneigh);
+ 
+      //grow all per-neighbour arrays together once the current capacity is reached
+      if(ineigh >= comm->maxneigh) {
+          size_t oldByteSize = comm->maxneigh*sizeof(int);
+          size_t oldBoxSize = comm->maxneigh*sizeof(Box); 
+          comm->maxneigh  = 2*ineigh;  
+          comm->nsend     = (int*) reallocate(comm->nsend, ALIGNMENT,  comm->maxneigh * sizeof(int), oldByteSize);
+          comm->nrecv     = (int*) reallocate(comm->nrecv, ALIGNMENT,  comm->maxneigh * sizeof(int), oldByteSize);
+          comm->pbc_x     = (int*) reallocate(comm->pbc_x, ALIGNMENT,  comm->maxneigh * sizeof(int), oldByteSize);
+          comm->pbc_y     = (int*) reallocate(comm->pbc_y, ALIGNMENT,  comm->maxneigh * sizeof(int), oldByteSize);
+          comm->pbc_z     = (int*) reallocate(comm->pbc_z, ALIGNMENT,  comm->maxneigh * sizeof(int), oldByteSize);
+          comm->boxes     = (Box*) reallocate(comm->boxes, ALIGNMENT,  comm->maxneigh * sizeof(Box), oldBoxSize);
+        }
+
+      //record the neighbour; the pbc flag is stored only for the dimension of this swap
+      comm->boxes[ineigh] = cut;  
+      comm->nsend[ineigh] = proc;
+      comm->pbc_x[ineigh] = (dim == _x) ? pbc : 0;
+      comm->pbc_y[ineigh] = (dim == _y) ? pbc : 0; 
+      comm->pbc_z[ineigh] = (dim == _z) ? pbc : 0; 
+      ineigh++; 
+    }
+
+    //neighbours of swap iswap occupy the index range [sendfrom, sendtill)
+    comm->sendfrom[iswap] = (iswap == 0) ? 0:comm->sendtill[iswap-1];
+    comm->sendtill[iswap] = ineigh;
+    comm->numneigh = ineigh; 
+  }
+
+  allocDynamicBuffers(comm);
+  defineReverseList(comm);
+}
+    
+/*
+ * Starts MPI and puts `comm` into an empty state: rank and process count are
+ * queried from the global communicator, both neighbour counters are zeroed
+ * and every buffer pointer is set to NULL.
+ */
+void initComm(int* argc, char*** argv, Comm* comm)
+{
+  MPI_Init(argc, argv);
+  MPI_Comm_rank(world, &comm->myproc);
+  MPI_Comm_size(world, &comm->numproc);
+
+  //No neighbours are known yet
+  comm->numneighexch = 0;
+  comm->numneigh = 0;
+
+  //Neighbour lists and periodic-image flags
+  comm->nsend = comm->nrecv = comm->nexch = NULL;
+  comm->pbc_x = comm->pbc_y = comm->pbc_z = NULL;
+  comm->boxes = NULL;
+
+  //Per-neighbour counters, offsets and send lists
+  comm->atom_send = comm->atom_recv = NULL;
+  comm->off_atom_send = comm->off_atom_recv = NULL;
+  comm->maxsendlist = NULL;
+  comm->sendlist = NULL;
+
+  //Message buffers
+  comm->buf_send = NULL;
+  comm->buf_recv = NULL;
+}
+ 
+/*
+ * Releases every buffer owned by `comm`, resets the stored capacities to
+ * zero and shuts MPI down.
+ */
+void endComm(Comm* comm)
+{
+  freeBuffers(comm);
+
+  comm->maxrecv = 0;
+  comm->maxsend = 0;
+  comm->maxneighexch = 0;
+  comm->maxneigh = 0;
+
+  MPI_Finalize();
+}
+
+/*
+ * Initialises the static communication tables of `comm`, allocates the
+ * message buffers and neighbour arrays with their initial capacities and
+ * builds the neighbour lists through neighComm().
+ *
+ * Swap numbering: swaps 0/1 belong to x, 2/3 to y and 4/5 to z; the even swap
+ * of each pair has direction 0, the odd one direction 1.
+ */
+void setupComm(Comm* comm, Parameter* param, Grid* grid){
+
+  const int dims[3] = {_x, _y, _z};
+
+  for(int d = 0; d < 3; d++){
+    const int first  = 2*d;
+    const int second = 2*d + 1;
+
+    comm->swap[dims[d]][0] = first;
+    comm->swap[dims[d]][1] = second;
+    comm->swapdim[first]  = dims[d];
+    comm->swapdim[second] = dims[d];
+    comm->swapdir[first]  = 0;
+    comm->swapdir[second] = 1;
+  }
+
+  //All swap ranges start out empty
+  for(int iswap = 0; iswap < 6; iswap++){
+    comm->sendfrom[iswap] = comm->sendtill[iswap] = 0;
+    comm->recvfrom[iswap] = comm->recvtill[iswap] = 0;
+  }
+
+  //Number of values packed per atom for each kind of message
+  comm->forwardSize   = FORWARD_SIZE;      //send coordinates x,y,z
+  comm->reverseSize   = REVERSE_SIZE;      //return forces fx, fy, fz
+  comm->ghostSize     = GHOST_SIZE;        //send x,y,z,type
+  comm->exchangeSize  = EXCHANGE_SIZE;     //send x,y,z,vx,vy,vz,type
+
+  //Send and recv buffers; the send buffer carries BUFEXTRA spare elements
+  comm->maxsend = BUFMIN;
+  comm->maxrecv = BUFMIN;
+  comm->buf_send = (MD_FLOAT*) allocate(ALIGNMENT, (comm->maxsend + BUFEXTRA) * sizeof(MD_FLOAT));
+  comm->buf_recv = (MD_FLOAT*) allocate(ALIGNMENT, comm->maxrecv * sizeof(MD_FLOAT));
+
+  //Exchange neighbour list
+  comm->maxneighexch = NEIGHMIN;
+  comm->nexch = (int*) allocate(ALIGNMENT, comm->maxneighexch * sizeof(int));
+
+  //Swap neighbour lists
+  comm->maxneigh = NEIGHMIN;
+  comm->nsend = (int*) allocate(ALIGNMENT, comm->maxneigh * sizeof(int));
+  comm->nrecv = (int*) allocate(ALIGNMENT, comm->maxneigh * sizeof(int));
+  comm->pbc_x = (int*) allocate(ALIGNMENT, comm->maxneigh * sizeof(int));
+  comm->pbc_y = (int*) allocate(ALIGNMENT, comm->maxneigh * sizeof(int));
+  comm->pbc_z = (int*) allocate(ALIGNMENT, comm->maxneigh * sizeof(int));
+  comm->boxes = (Box*) allocate(ALIGNMENT, comm->maxneigh * sizeof(Box));
+
+  neighComm(comm, param, grid);
+}
+
+/*
+ * Forward communication for swap `iswap`: packs the atoms listed in the send
+ * lists of this swap, transfers them to the neighbours and unpacks what is
+ * received starting at comm->firstrecv[iswap].
+ *
+ * When comm->othersend[iswap] is 0 no MPI calls are made and the data is
+ * unpacked straight from the send buffer.
+ */
+void forwardComm(Comm* comm, Atom* atom, int iswap)
+{ 
+  int nrqst=0, offset=0, nsend=0, nrecv=0; 
+  int pbc[3];
+  int size = comm->forwardSize; 
+  //A variable length array must have at least one element
+  int maxrqst = (comm->numneigh > 0) ? comm->numneigh : 1;
+  MD_FLOAT* buf;
+  MPI_Request requests[maxrqst];
+  
+  for(int ineigh = comm->sendfrom[iswap]; ineigh < comm->sendtill[iswap]; ineigh++){
+    offset = comm->off_atom_send[ineigh];
+    pbc[_x]=comm->pbc_x[ineigh]; pbc[_y]=comm->pbc_y[ineigh];  pbc[_z]=comm->pbc_z[ineigh];
+    packForward(atom, comm->atom_send[ineigh], comm->sendlist[ineigh], &comm->buf_send[offset*size],pbc);
+  }
+   
+  if(comm->othersend[iswap]){
+    //Post all receives first so the blocking sends below can complete
+    for (int ineigh = comm->recvfrom[iswap]; ineigh< comm->recvtill[iswap]; ineigh++){      
+      offset = comm->off_atom_recv[ineigh]*size;
+      nrecv  = comm->atom_recv[ineigh]*size;
+      MPI_Irecv(&comm->buf_recv[offset], nrecv, type, comm->nrecv[ineigh],0,world,&requests[nrqst++]);
+    }
+   
+    //Send elements 
+    for (int ineigh = comm->sendfrom[iswap]; ineigh< comm->sendtill[iswap]; ineigh++){  
+      offset = comm->off_atom_send[ineigh]*size;
+      nsend  = comm->atom_send[ineigh]*size;
+      MPI_Send(&comm->buf_send[offset],nsend,type,comm->nsend[ineigh],0,world);      
+    } 
+
+    //MPI_Waitall takes an array of statuses, hence MPI_STATUSES_IGNORE
+    MPI_Waitall(nrqst,requests,MPI_STATUSES_IGNORE);
+  }
+  
+  buf = comm->othersend[iswap] ? comm->buf_recv : comm->buf_send;
+  
+  /* unpack buffer */   
+  for (int ineigh = comm->recvfrom[iswap]; ineigh< comm->recvtill[iswap]; ineigh++){
+    offset = comm->off_atom_recv[ineigh];
+    unpackForward(atom, comm->atom_recv[ineigh], comm->firstrecv[iswap] + offset, &buf[offset*size]);
+  }
+}
+
+/*
+ * Reverse communication for swap `iswap`: the roles of the send and receive
+ * lists are swapped with respect to forwardComm(). Data belonging to the
+ * atoms received in this swap is packed, sent back to the processes they
+ * came from and unpacked onto the atoms of the send lists.
+ *
+ * When comm->othersend[iswap] is 0 no MPI calls are made and the data is
+ * unpacked straight from the send buffer.
+ */
+void reverseComm(Comm* comm, Atom* atom, int iswap)
+{ 
+  int nrqst=0, offset=0, nsend=0, nrecv=0 ;
+  int size = comm->reverseSize; 
+  //A variable length array must have at least one element
+  int maxrqst = (comm->numneigh > 0) ? comm->numneigh : 1;
+  MD_FLOAT* buf;
+  MPI_Request requests[maxrqst];
+  
+  for(int ineigh = comm->recvfrom[iswap]; ineigh < comm->recvtill[iswap]; ineigh++){
+    offset = comm->off_atom_recv[ineigh]; 
+    packReverse(atom, comm->atom_recv[ineigh], comm->firstrecv[iswap] + offset, &comm->buf_send[offset*size]);
+  }
+
+  if(comm->othersend[iswap]){
+    //Post all receives first so the blocking sends below can complete
+    for (int ineigh = comm->sendfrom[iswap]; ineigh< comm->sendtill[iswap]; ineigh++){      
+      offset = comm->off_atom_send[ineigh]*size;
+      nrecv  = comm->atom_send[ineigh]*size; 
+      MPI_Irecv(&comm->buf_recv[offset], nrecv, type, comm->nsend[ineigh],0,world,&requests[nrqst++]);
+    }
+
+    //Send elements  
+    for (int ineigh = comm->recvfrom[iswap]; ineigh< comm->recvtill[iswap]; ineigh++){  
+      offset = comm->off_atom_recv[ineigh]*size;
+      nsend  = comm->atom_recv[ineigh]*size;  
+      MPI_Send(&comm->buf_send[offset],nsend,type,comm->nrecv[ineigh],0,world);        
+    } 
+
+    //MPI_Waitall takes an array of statuses, hence MPI_STATUSES_IGNORE
+    MPI_Waitall(nrqst,requests,MPI_STATUSES_IGNORE);
+  }
+
+  buf = comm->othersend[iswap] ? comm->buf_recv : comm->buf_send;
+
+  /* unpack buffer */   
+  for (int ineigh = comm->sendfrom[iswap]; ineigh< comm->sendtill[iswap]; ineigh++){
+    offset =  comm->off_atom_send[ineigh]; 
+    unpackReverse(atom, comm->atom_send[ineigh], comm->sendlist[ineigh], &buf[offset*size]);
+  } 
+}
+
+/*
+ * Ghost-atom setup for swap `iswap`.
+ *
+ * For every neighbour of the swap, the atoms selected by IsinRegionToSend()
+ * against that neighbour's cut box are recorded in the send list and packed.
+ * The per-neighbour atom counts are exchanged first, then the packed atoms,
+ * and the received atoms are appended through unpackGhost().
+ * comm->firstrecv[iswap] is set to LOCAL+GHOST as evaluated just before
+ * unpacking. Finally the send/recv buffers are grown so that later forward
+ * and reverse communication of the same atoms fits.
+ *
+ * When comm->othersend[iswap] is 0 no MPI calls are made and the atoms are
+ * unpacked straight from the send buffer.
+ */
+void ghostComm(Comm* comm, Atom* atom,int iswap){
+  
+  //xlo..zhi and pbc keep these names: NOTE(review) IsinRegionToSend is defined
+  //elsewhere and presumably reads them - confirm before renaming
+  MD_FLOAT xlo=0, xhi=0, ylo=0, yhi=0, zlo=0, zhi=0; 
+  MD_FLOAT* buf;
+  int nrqst=0, nsend=0, nrecv=0, offset=0, ineigh=0, pbc[3];
+  int all_recv=0, all_send=0, currentSend=0; 
+  int size = comm->ghostSize; 
+  //A variable length array must have at least one element
+  int maxrqrst = (comm->numneigh > 0) ? comm->numneigh : 1;
+  MPI_Request requests[maxrqrst];
+  //Initialise every slot (indexing with maxrqrst would write past the array)
+  for(int i = 0; i<maxrqrst; i++) 
+    requests[i]=MPI_REQUEST_NULL;    
+  //The candidate range is refreshed only on even swaps, so both swaps of a
+  //dimension scan the same atoms
+  if(iswap%2==0) comm->iterAtom = LOCAL+GHOST;
+
+  for(int ineigh = comm->sendfrom[iswap]; ineigh< comm->sendtill[iswap]; ineigh++)
+      {          
+        Box* tile = &comm->boxes[ineigh];
+        
+        xlo = tile->lo[_x]; ylo = tile->lo[_y]; zlo = tile->lo[_z]; 
+        xhi = tile->hi[_x]; yhi = tile->hi[_y]; zhi = tile->hi[_z];   
+        pbc[_x]=comm->pbc_x[ineigh]; pbc[_y]=comm->pbc_y[ineigh];  pbc[_z]=comm->pbc_z[ineigh];
+        nsend = 0; 
+    
+        for(int i = 0; i < comm->iterAtom ; i++) 
+        { 
+          if(IsinRegionToSend(i)){
+                if(nsend >= comm->maxsendlist[ineigh]) growList(comm,ineigh,nsend);
+                if(currentSend + size >= comm->maxsend) growSend(comm,currentSend); 
+                comm->sendlist[ineigh][nsend++] = i;
+                currentSend += packGhost(atom, i, &comm->buf_send[currentSend], pbc);  
+          }   
+        }
+        comm->atom_send[ineigh]     = nsend;          //#atoms send per neigh   
+        comm->off_atom_send[ineigh] = all_send;       //offset atom respect to neighbours in a swap
+        all_send += nsend;                            //all atoms send
+      } 
+
+  //Receives how many elements to be received.
+  if(comm->othersend[iswap])
+    for(nrqst=0, ineigh = comm->recvfrom[iswap]; ineigh< comm->recvtill[iswap]; ineigh++)
+      MPI_Irecv(&comm->atom_recv[ineigh],1,MPI_INT,comm->nrecv[ineigh],0,world,&requests[nrqst++]);
+  
+  //Self-communication: the single neighbour is this process itself
+  if(!comm->othersend[iswap]) comm->atom_recv[comm->recvfrom[iswap]] = nsend; 
+
+  //Communicate how many elements to be sent.
+  if(comm->othersend[iswap])
+    for(int ineigh = comm->sendfrom[iswap]; ineigh< comm->sendtill[iswap]; ineigh++)
+      MPI_Send(&comm->atom_send[ineigh],1,MPI_INT,comm->nsend[ineigh],0,world);    
+  //MPI_Waitall takes an array of statuses, hence MPI_STATUSES_IGNORE
+  if(comm->othersend[iswap]) MPI_Waitall(nrqst,requests,MPI_STATUSES_IGNORE);
+
+  //Define offset to store in the recv_buff    
+  for(int ineigh = comm->recvfrom[iswap]; ineigh<comm->recvtill[iswap]; ineigh++){ 
+    comm->off_atom_recv[ineigh] = all_recv;
+    all_recv += comm->atom_recv[ineigh];
+  }
+
+  if(all_recv*size>=comm->maxrecv) growRecv(comm,all_recv*size);
+
+  //Receives elements 
+  if(comm->othersend[iswap])
+    for (nrqst=0, ineigh = comm->recvfrom[iswap]; ineigh< comm->recvtill[iswap]; ineigh++){
+      offset = comm->off_atom_recv[ineigh]*size;  
+      nrecv = comm->atom_recv[ineigh]*size;
+      MPI_Irecv(&comm->buf_recv[offset], nrecv, type, comm->nrecv[ineigh],0,world,&requests[nrqst++]);
+    } 
+  //Send elements
+  if(comm->othersend[iswap])
+    for (int ineigh = comm->sendfrom[iswap]; ineigh< comm->sendtill[iswap]; ineigh++){
+      offset = comm->off_atom_send[ineigh]*size;
+      nsend  = comm->atom_send[ineigh]*size;  
+      MPI_Send(&comm->buf_send[offset],nsend,type,comm->nsend[ineigh],0,world); 
+    }
+  if(comm->othersend[iswap]) MPI_Waitall(nrqst,requests,MPI_STATUSES_IGNORE);
+  
+  if(comm->othersend[iswap]) buf = comm->buf_recv;
+  else buf = comm->buf_send; 
+  //unpack elements
+  comm->firstrecv[iswap] = LOCAL+GHOST; 
+  for(int i = 0; i < all_recv; i++)
+    unpackGhost(atom, LOCAL+GHOST, &buf[i*size]); 
+  
+  //Increases the buffer if needed
+  int max_size = MAX(comm->forwardSize,comm->reverseSize);
+  int max_buf = max_size * MAX(all_recv, all_send); 
+  if(max_buf>=comm->maxrecv) growRecv(comm,max_buf);
+  if(max_buf>=comm->maxsend) growSend(comm,max_buf);
+}
+
+/*
+ * Migrates atoms that left the local box to the neighbouring processes.
+ *
+ * After pbc() has been applied, every local atom outside
+ * [lo, hi) of atom->mybox is packed into the send buffer and removed from
+ * the local list. The complete send buffer is sent to every exchange
+ * neighbour; from the received data each process keeps only the atoms whose
+ * coordinates lie inside its own box. Finally the global atom count is
+ * checked and rank 0 prints a warning if atoms went missing.
+ *
+ * Returns right after pbc() when comm->numneigh is 0.
+ */
+void exchangeComm(Comm* comm, Atom* atom){
+
+  MD_FLOAT x,y,z;
+  MD_FLOAT *lo = atom->mybox.lo; 
+  MD_FLOAT *hi = atom->mybox.hi;
+  int size = comm->exchangeSize;
+  int numneigh = comm->numneighexch;
+  //A variable length array must have at least one element, even when this
+  //process has no exchange neighbours
+  int nslots = (numneigh > 0) ? numneigh : 1;
+  int offset_recv[nslots];
+  int size_recv[nslots];
+  MPI_Request requests[nslots];
+  int i =0,  nsend = 0, nrecv = 0;
+  int nrqst = 0;
+  int nlocal, offset,m;
+
+  /* enforce PBC */
+  pbc(atom);
+  
+  if(comm->numneigh == 0) return;
+
+  //Pack atoms that left the box; the last local atom is copied into the hole
+  nlocal = atom->Nlocal;
+  while(i < nlocal) {
+    if(atom_x(i) < lo[_x] || atom_x(i) >= hi[_x] ||
+       atom_y(i) < lo[_y] || atom_y(i) >= hi[_y] ||
+       atom_z(i) < lo[_z] || atom_z(i) >= hi[_z]) {
+      if(nsend+size >= comm->maxsend) growSend(comm, nsend);
+      nsend += packExchange(atom, i, &comm->buf_send[nsend]);
+      copy(atom, i, nlocal-1);
+      nlocal--;
+    } else i++;
+  }
+  atom->Nlocal = nlocal;
+
+  /* send/recv number of to share atoms with neighbouring procs*/
+  for(int ineigh = 0; ineigh < numneigh; ineigh++) 
+    MPI_Irecv(&size_recv[ineigh],1,MPI_INT,comm->nexch[ineigh],0,world,&requests[nrqst++]);
+
+  for (int ineigh = 0; ineigh < numneigh; ineigh++) 
+    MPI_Send(&nsend,1,MPI_INT,comm->nexch[ineigh],0,world); 
+  //MPI_Waitall takes an array of statuses, hence MPI_STATUSES_IGNORE
+  MPI_Waitall(nrqst,requests,MPI_STATUSES_IGNORE);
+
+  //Define offset to store in the recv_buff
+  for(int ineigh = 0; ineigh<numneigh; ineigh++){ 
+    offset_recv[ineigh] = nrecv; 
+    nrecv += size_recv[ineigh];
+  }
+
+  if(nrecv >= comm->maxrecv) growRecv(comm,nrecv); 
+
+  //Receives elements 
+  nrqst=0;
+  for (int ineigh = 0; ineigh< numneigh; ineigh++){
+    offset = offset_recv[ineigh];
+    MPI_Irecv(&comm->buf_recv[offset], size_recv[ineigh], type, comm->nexch[ineigh],0,world,&requests[nrqst++]);
+  }
+  //Send elements: every neighbour gets the whole send buffer
+  for (int ineigh = 0; ineigh< numneigh; ineigh++)
+    MPI_Send(comm->buf_send,nsend,type,comm->nexch[ineigh],0,world); 
+  MPI_Waitall(nrqst,requests,MPI_STATUSES_IGNORE);  
+
+  //Keep only the received atoms that fall inside the local box
+  nlocal = atom->Nlocal;
+  m = 0;
+  while(m < nrecv) {
+    x = comm->buf_recv[m + _x]; 
+    y = comm->buf_recv[m + _y];
+    z = comm->buf_recv[m + _z];
+
+    if(x >= lo[_x] && x < hi[_x] &&
+       y >= lo[_y] && y < hi[_y] &&
+       z >= lo[_z] && z < hi[_z]){
+      m += unpackExchange(atom, nlocal++, &comm->buf_recv[m]);
+    } else {
+      m += size;
+    }
+  } 
+  atom->Nlocal = nlocal;
+    
+  //Sanity check on the global number of atoms
+  int all_atoms=0;
+  MPI_Allreduce(&atom->Nlocal, &all_atoms, 1, MPI_INT, MPI_SUM, world);
+  if(atom->Natoms!=all_atoms && comm->myproc ==0){
+    printf("Losing atoms! current atoms:%d expected atoms:%d\n",all_atoms,atom->Natoms);
+  }
+}
+
+//Internal functions
+
+/*
+ * Replaces the receive buffer by a fresh one holding BUFFACTOR * n elements.
+ * The previous contents are discarded, not copied.
+ */
+inline void growRecv(Comm* comm, int n)
+{
+  MD_FLOAT* stale = comm->buf_recv;
+
+  comm->maxrecv = BUFFACTOR * n;
+  if(stale != NULL) free(stale);
+  comm->buf_recv = (MD_FLOAT*) allocate(ALIGNMENT, comm->maxrecv * sizeof(MD_FLOAT));
+}
+
+/*
+ * Resizes the send buffer through reallocate() to BUFFACTOR * n elements
+ * plus BUFEXTRA spare elements; the old size passed along includes the same
+ * BUFEXTRA slack.
+ */
+inline void growSend(Comm* comm, int n)
+{
+  const size_t previousBytes = (comm->maxsend+BUFEXTRA)*sizeof(MD_FLOAT);
+
+  comm->maxsend = BUFFACTOR * n;
+
+  const size_t requestedBytes = (comm->maxsend + BUFEXTRA) * sizeof(MD_FLOAT);
+  comm->buf_send = (MD_FLOAT*) reallocate(comm->buf_send, ALIGNMENT, requestedBytes, previousBytes);
+}
+
+/*
+ * Resizes the send list of neighbour `ineigh` through reallocate() to
+ * BUFFACTOR * n entries and records the new capacity in maxsendlist.
+ */
+inline void growList(Comm* comm, int ineigh, int n)
+{
+  const size_t previousBytes = comm->maxsendlist[ineigh]*sizeof(int);
+
+  comm->maxsendlist[ineigh] = BUFFACTOR * n;
+
+  const size_t requestedBytes = comm->maxsendlist[ineigh] * sizeof(int);
+  comm->sendlist[ineigh] = (int*) reallocate(comm->sendlist[ineigh], ALIGNMENT, requestedBytes, previousBytes);
+}
+
+/*
+ * Allocates every array whose length is the current number of neighbours
+ * (comm->numneigh): atom counts, offsets, send-list capacities and the send
+ * lists themselves. Each send list starts with BUFMIN entries.
+ */
+static inline void  allocDynamicBuffers(Comm* comm)
+{
+  const int numneigh = comm->numneigh;
+  const size_t intBytes = numneigh * sizeof(int);
+
+  comm->atom_send     = (int*) allocate(ALIGNMENT, intBytes);
+  comm->atom_recv     = (int*) allocate(ALIGNMENT, intBytes);
+  comm->off_atom_send = (int*) allocate(ALIGNMENT, intBytes);
+  comm->off_atom_recv = (int*) allocate(ALIGNMENT, intBytes);
+  comm->maxsendlist   = (int*) allocate(ALIGNMENT, intBytes);
+  comm->sendlist      = (int**) allocate(ALIGNMENT, numneigh * sizeof(int*));
+
+  //One send list per neighbour, all with the same initial capacity
+  for(int ineigh = 0; ineigh < numneigh; ineigh++){
+    comm->maxsendlist[ineigh] = BUFMIN;
+    comm->sendlist[ineigh] = (int*) allocate(ALIGNMENT, comm->maxsendlist[ineigh] * sizeof(int));
+  }
+}
+
+/*
+ * Releases the arrays created by allocDynamicBuffers(). comm->numneigh must
+ * still hold the neighbour count the send lists were allocated for. The
+ * pointers themselves are not reset.
+ */
+static inline void freeDynamicBuffers(Comm* comm)
+{
+  //free() accepts NULL, so the pointers need no individual checks
+  free(comm->atom_send);
+  free(comm->atom_recv);
+  free(comm->off_atom_send);
+  free(comm->off_atom_recv);
+  free(comm->maxsendlist);
+
+  if(comm->sendlist != NULL){
+    const int numneigh = comm->numneigh;
+    for(int ineigh = 0; ineigh < numneigh; ineigh++)
+      free(comm->sendlist[ineigh]);
+    free(comm->sendlist);
+  }
+}
+
+/*
+ * Releases every buffer referenced by `comm`: neighbour lists, per-neighbour
+ * arrays, send lists and both message buffers. The pointers themselves are
+ * not reset.
+ */
+static inline void freeBuffers(Comm* comm)
+{
+  //free() accepts NULL, so the pointers need no individual checks
+
+  //Neighbour lists
+  free(comm->nrecv);
+  free(comm->nsend);
+  free(comm->nexch);
+  free(comm->pbc_x);
+  free(comm->pbc_y);
+  free(comm->pbc_z);
+  free(comm->boxes);
+
+  //Per-neighbour arrays
+  free(comm->atom_send);
+  free(comm->atom_recv);
+  free(comm->off_atom_send);
+  free(comm->off_atom_recv);
+  free(comm->maxsendlist);
+
+  //Send lists, one per neighbour
+  if(comm->sendlist != NULL){
+    for(int ineigh = 0; ineigh < comm->numneigh; ineigh++)
+      free(comm->sendlist[ineigh]);
+    free(comm->sendlist);
+  }
+
+  //Message buffers
+  free(comm->buf_send);
+  free(comm->buf_recv);
+}
--- a/common/grid.c
+++ b/common/grid.c
@@ -0,0 +1,490 @@
+#include <stdio.h>
+#include <grid.h>
+#include <mpi.h>
+#include <parameter.h>
+#include <allocate.h>
+#include <util.h>
+#include <math.h>
+
+//MPI datatype matching MD_FLOAT: MPI_FLOAT when MD_FLOAT is 4 bytes wide, MPI_DOUBLE otherwise
+static MPI_Datatype type = (sizeof(MD_FLOAT) == 4) ? MPI_FLOAT : MPI_DOUBLE;
+
+//Grommacs Balancing
+/*
+ * Clamps every entry of x from below at minx and rescales the clamped values
+ * so that they add up to 1.
+ *
+ *   x      input values (nprocs entries), left unchanged
+ *   fx     output: MAX(minx, x[n]) divided by the sum of all clamped values
+ *   minx   lower bound applied to each entry before normalising
+ *   nprocs number of entries in x and fx
+ *
+ * Returns the sum of the clamped values, i.e. the divisor that was applied.
+ * (The function is declared to return MD_FLOAT, so it must return a value.)
+ */
+MD_FLOAT f_normalization(MD_FLOAT* x,MD_FLOAT* fx, MD_FLOAT minx, int nprocs) {
+
+  MD_FLOAT sum=0;
+  for(int n = 0; n<nprocs; n++){
+    fx[n] = MAX(minx,x[n]);
+    sum+=fx[n];
+  }
+
+  for(int n = 0; n<nprocs; n++)
+    fx[n] /= sum;    
+
+  return sum;
+}
+
+/*
+ * Damped fixed-point iteration on x0 (nprocs entries, updated in place).
+ *
+ * Each step computes f_normalization(x0) and moves x0 towards it with
+ * weight alpha = 0.5. The iteration stops when no entry changes by
+ * 1e-3 or more in a step, or after 100 steps.
+ *
+ *   x0     in/out vector
+ *   nprocs number of entries in x0
+ *   minx   lower bound forwarded to f_normalization()
+ */
+void fixedPointIteration(MD_FLOAT* x0, int nprocs, MD_FLOAT minx)
+{ 
+  MD_FLOAT tolerance = 1e-3;
+  MD_FLOAT alpha = 0.5;
+  MD_FLOAT *fx = (MD_FLOAT*) malloc(nprocs*sizeof(MD_FLOAT));
+  int maxIterations = 100; 
+    
+  for (int i = 0; i < maxIterations; i++) {
+
+    int converged = 1; 
+    f_normalization(x0,fx,minx,nprocs);
+
+    //Damping: blend the old iterate with the normalised one
+    for(int n=0; n<nprocs; n++)
+      fx[n]= (1-alpha) * x0[n] + alpha * fx[n];
+    
+    for (int n=0; n<nprocs; n++) {
+        if (fabs(fx[n] - x0[n]) >= tolerance) {
+            converged = 0;
+            break;
+        }      
+    }
+    
+    for (int n=0; n<nprocs; n++) 
+        x0[n] = fx[n];
+
+    if(converged) break;
+  }
+
+  //The scratch vector is released on every exit path
+  free(fx);
+}
+
+/*
+ * Staggered-grid load balancing.
+ *
+ * The time elapsed since the previous call (newTime - grid->Timer) is used
+ * as the load of this process. Loads are gathered along z, then y, then x
+ * through per-dimension sub-communicators; each root normalises the loads of
+ * its line. The new cell sizes are then computed per dimension (x first) from
+ * the balanced loads and scattered as (lo, hi) pairs. Finally the local box,
+ * the global map (grid->map) and the per-dimension exchange cutoff
+ * (grid->cutneigh) are updated.
+ *
+ *   grid    process grid; Timer, map and cutneigh are updated
+ *   atom    atom->mybox receives the new local boundaries
+ *   param   supplies the box lengths and the neighbour cutoff
+ *   newTime current time stamp
+ */
+void staggeredBalance(Grid* grid, Atom* atom, Parameter* param, double newTime)
+{ 
+  int me;
+  MPI_Comm_rank(MPI_COMM_WORLD, &me);
+  int *coord = grid->coord;
+  int *nprocs  = grid ->nprocs;
+  //Elapsed time since the last rebalance. Stored as MD_FLOAT because it is
+  //communicated with the MPI datatype `type`, which describes MD_FLOAT
+  MD_FLOAT time = (MD_FLOAT) (newTime - grid->Timer);
+  grid->Timer = newTime;
+  //store the older dimensions to compare later for exchange
+  MD_FLOAT lo[3], hi[3];
+  for(int dim = 0; dim< 3; dim++){ 
+    lo[dim] = atom->mybox.lo[dim];
+    hi[dim] = atom->mybox.hi[dim]; 
+  }
+  
+  //Define parameters
+  MPI_Comm subComm[3]; 
+  int color[3] = {0,0,0};
+  int id[3] = {0,0,0};
+  MD_FLOAT ** load = (MD_FLOAT**) malloc(3*sizeof(MD_FLOAT*));
+  for(int dim = 0; dim<3; dim++) 
+    load[dim] = (MD_FLOAT*) malloc(nprocs[dim]*sizeof(MD_FLOAT));
+ 
+  int maxprocs = MAX(MAX(nprocs[_x],nprocs[_y]),nprocs[_z]);
+  MD_FLOAT* cellSize = (MD_FLOAT*) malloc(maxprocs*sizeof(MD_FLOAT)); 
+  MD_FLOAT* limits = (MD_FLOAT*) malloc(2*maxprocs*sizeof(MD_FLOAT)); //limits: (x0, x1), (x1, x2)... Repeat values in between to perform MPI_Scatter later 
+  MD_FLOAT t_sum[3] = {0,0,0}; 
+  MD_FLOAT recv_buf[2] = {0,0};        //Each proc only receives 2 elements per dimension xlo and xhi
+  MD_FLOAT balancedLoad[3] = {0,0,0};  //1/nprocs
+  MD_FLOAT minLoad[3]  = {0,0,0};      //beta*(1/nprocs) 
+  MD_FLOAT prd[3] = {param->xprd, param->yprd, param->zprd};
+  MD_FLOAT boundaries[6] ={0,0,0,0,0,0}; // xlo,xhi,ylo,yhi,zlo,zhi
+
+  //Create sub-communications along each dimension
+  for(int dim = 0; dim<3; dim++){
+     if(dim == _x){
+        color[_x] = (coord[_y] == 0 && coord[_z] ==0) ? 1:MPI_UNDEFINED;
+        id[_x] = me;
+     } else if(dim == _y) {
+        color[_y] = coord[_z] == 0 ? coord[_x]:MPI_UNDEFINED; 
+        id[_y] = (coord[_y] == 0 && coord[_z] == 0) ? 0:me;
+     } else {
+        color[_z]= coord[_y]*nprocs[_x]+coord[_x]; 
+        id[_z] = coord[_z] == 0 ? 0 : me; 
+     }
+    MPI_Comm_split(world, color[dim], id[dim], &subComm[dim]);
+  } 
+
+  //Set the minimum load and the balance load
+  for(int dim = 0; dim<3; dim++){
+    balancedLoad[dim] = 1./nprocs[dim]; 
+    minLoad[dim]  = 0.8*balancedLoad[dim]; 
+  }
+  //set and communicate the workload in reverse order
+  for(int dim = _z; dim>= _x; dim--)
+  {
+    if(subComm[dim] != MPI_COMM_NULL){
+      MPI_Gather(&time,1,type,load[dim],1,type,0,subComm[dim]);
+
+      if(id[dim] == 0)
+      {
+        for(int n=0; n<nprocs[dim]; n++) 
+          t_sum[dim] += load[dim][n];
+
+        for(int n=0; n<nprocs[dim]; n++)
+          load[dim][n] /= t_sum[dim];
+      }
+      //the accumulated load of this line is what gets gathered in the next dimension
+      time =t_sum[dim];
+    }
+    MPI_Barrier(world);
+  }
+
+  //Broadcast the new boundaries along dimensions
+  for(int dim=0; dim<3; dim++){
+    
+    if(subComm[dim] != MPI_COMM_NULL){
+
+      MPI_Bcast(boundaries,6,type,0,subComm[dim]);
+      if(id[dim] == 0) {
+        fixedPointIteration(load[dim], nprocs[dim], minLoad[dim]); 
+        MD_FLOAT inv_sum=0;
+        for(int n=0; n<nprocs[dim];n++)
+          inv_sum +=(1/load[dim][n]);
+        
+        //cell size proportional to the inverse of the load, summing up to prd[dim]
+        for(int n=0; n<nprocs[dim];n++)
+          cellSize[n] = (prd[dim]/load[dim][n])*(1./inv_sum); 
+   
+        MD_FLOAT sum=0;
+        for(int n=0; n<nprocs[dim]; n++){
+          limits[2*n] = sum; 
+          limits[2*n+1] = sum+cellSize[n];
+          sum+= cellSize[n]; 
+        }
+        //pin the last upper limit to the exact box length
+        limits[2*nprocs[dim]-1] = prd[dim];
+      } 
+      MPI_Scatter(limits,2,type,recv_buf,2,type,0,subComm[dim]); 
+      boundaries[2*dim] = recv_buf[0];
+      boundaries[2*dim+1] = recv_buf[1];
+    }
+     MPI_Barrier(world);
+  }  
+
+  atom->mybox.lo[_x]=boundaries[0]; atom->mybox.hi[_x]=boundaries[1];
+  atom->mybox.lo[_y]=boundaries[2]; atom->mybox.hi[_y]=boundaries[3];
+  atom->mybox.lo[_z]=boundaries[4]; atom->mybox.hi[_z]=boundaries[5];
+ 
+  MD_FLOAT domain[6] = {boundaries[0], boundaries[2], boundaries[4], boundaries[1], boundaries[3], boundaries[5]};
+  MPI_Allgather(domain, 6, type, grid->map, 6, type, world);
+  
+  //because cells change dynamically, It is required to increase the neighbouring exchange region 
+  for(int dim =_x; dim<=_z; dim++){
+    MD_FLOAT dr,dr_max; 
+    dr = MAX(fabs(lo[dim] - atom->mybox.lo[dim]),fabs(hi[dim] - atom->mybox.hi[dim]));
+    MPI_Allreduce(&dr, &dr_max, 1, type, MPI_MAX, world);
+    grid->cutneigh[dim] = param->cutneigh+dr_max; 
+  }
+
+  for(int dim=0; dim<3; dim++) {
+    if(subComm[dim] != MPI_COMM_NULL){
+      MPI_Comm_free(&subComm[dim]);
+    }
+    free(load[dim]);
+  }
+  free(load); 
+  free(cellSize);
+  free(limits);
+}
+
+//RCB Balancing
+/*
+ * Time-weighted mean of atom_pos() over all atoms of the processes in
+ * subComm: every process weights the sum of its positions and its atom
+ * count by `time`; the result is the reduced weighted sum divided by the
+ * reduced weight.
+ */
+MD_FLOAT meanTimeBisect(Atom *atom, MPI_Comm subComm, int dim, double time)
+{
+  MD_FLOAT localMoment = 0, globalMoment = 0;
+  MD_FLOAT localWeight = 0, globalWeight = 0;
+  int i = 0;
+
+  while(i < atom->Nlocal){
+    localMoment += atom_pos(i);
+    i++;
+  }
+  localMoment *= time;
+  localWeight = atom->Nlocal*time;
+
+  MPI_Allreduce(&localMoment, &globalMoment, 1, type, MPI_SUM, subComm);
+  MPI_Allreduce(&localWeight, &globalWeight, 1, type, MPI_SUM, subComm);
+
+  return globalMoment/globalWeight;
+}
+
+/*
+ * Plain mean of atom_pos() over all atoms of the processes in subComm:
+ * reduced sum of positions divided by the reduced atom count.
+ * `time` is not used by this function.
+ */
+MD_FLOAT meanBisect(Atom* atom, MPI_Comm subComm, int dim, double time)
+{
+  MD_FLOAT localSum = 0, globalSum = 0;
+  int globalCount = 0;
+  int i = 0;
+
+  while(i < atom->Nlocal){
+    localSum += atom_pos(i);
+    i++;
+  }
+
+  MPI_Allreduce(&localSum, &globalSum, 1, type, MPI_SUM, subComm);
+  MPI_Allreduce(&atom->Nlocal, &globalCount, 1, MPI_INT, MPI_SUM, subComm);
+
+  return globalSum/globalCount;
+}
+
+/*
+ * Performs one level of the recursive bisection inside subComm.
+ *
+ * The cut position along `dim` is obtained from `method`. Ranks below
+ * size/2 keep the lower part of the box (mybox.hi = cut), the others the
+ * upper part (mybox.lo = cut). Atoms that now lie outside the local box are
+ * packed and sent to the partner rank (rank +/- size/2); with an odd number
+ * of ranks the last rank sends to rank 0, which therefore receives twice.
+ * The branch taken is recorded in bit `ilevel` of *color.
+ *
+ *   grid    provides the send/recv buffers, grown here when too small
+ *   atom    local atoms and local box, both updated
+ *   method  function returning the cut position
+ *   subComm communicator of the processes sharing the box to be split
+ *   dim     dimension to cut
+ *   color   in/out: colour used to split the communicator on the next level
+ *   ilevel  current recursion level
+ *   time    elapsed time forwarded to `method`
+ */
+void nextBisectionLevel(Grid* grid, Atom* atom, RCB_Method method, MPI_Comm subComm, int dim ,int* color, int ilevel, double time)
+{ 
+  int rank, size;
+  int branch = 0, i = 0, m = 0;
+  int nsend = 0, nrecv = 0, nrecv2 = 0;
+  int values_per_atom = 7; 
+  MD_FLOAT bisection, pos;
+  MPI_Request request[2] = {MPI_REQUEST_NULL,MPI_REQUEST_NULL};
+  MPI_Comm_rank(subComm,&rank);
+  MPI_Comm_size(subComm,&size);
+   
+  int odd = size%2;
+  int extraProc = odd ? size-1:size;
+  int half = (int) (0.5*size);
+  int partner = (rank<half) ? rank+half:rank-half;
+  if(odd && rank == extraProc) partner = 0;
+  //Apply the bisection 
+  bisection = method(atom,subComm,dim,time);
+  //Define the new boundaries
+  if(rank<half){
+    atom->mybox.hi[dim] = bisection;
+    branch = 0;
+  } else {
+    atom->mybox.lo[dim] = bisection;
+    branch = 1;
+  }
+  //Define new color for the further communication
+  *color = (branch << ilevel) | *color;
+  //Grow the send buffer
+  if(atom->Nlocal>=grid->maxsend){
+      if(grid->buf_send) free(grid->buf_send); 
+      grid->buf_send = (MD_FLOAT*) malloc(atom->Nlocal*values_per_atom* sizeof(MD_FLOAT));
+      grid->maxsend = atom->Nlocal;
+  }
+  //buffer particles to send
+  while(i < atom->Nlocal) {
+    pos = atom_pos(i);
+    if(pos < atom->mybox.lo[dim] || pos >= atom->mybox.hi[dim]) {
+      nsend += packExchange(atom, i, &grid->buf_send[nsend]);
+      copy(atom, i, atom->Nlocal-1);
+      atom->Nlocal--;
+    } else i++;
+  }
+
+  //Communicate the number of elements to be sent
+  if(rank < extraProc){
+    MPI_Irecv(&nrecv,1,MPI_INT,partner,0,subComm,&request[0]);
+  }
+  if(odd && rank == 0){ 
+    MPI_Irecv(&nrecv2,1,MPI_INT,extraProc,0,subComm,&request[1]);
+  }
+  MPI_Send(&nsend,1,MPI_INT,partner,0,subComm);
+  //MPI_Waitall takes an array of statuses, hence MPI_STATUSES_IGNORE;
+  //slots left at MPI_REQUEST_NULL are skipped by MPI
+  MPI_Waitall(2,request,MPI_STATUSES_IGNORE);
+
+  //Grow the recv buffer 
+  if(nrecv+nrecv2>=grid->maxrecv){
+      if(grid->buf_recv) free(grid->buf_recv); 
+      grid->buf_recv = (MD_FLOAT*) malloc((nrecv+nrecv2)*values_per_atom*sizeof(MD_FLOAT));
+      grid->maxrecv = nrecv+nrecv2;
+  } 
+
+  //communicate elements in the buffer
+  request[0] = MPI_REQUEST_NULL; 
+  request[1] = MPI_REQUEST_NULL;
+
+  if(rank < extraProc){
+    MPI_Irecv(grid->buf_recv,nrecv,type,partner,0,subComm,&request[0]);
+  }
+  if(odd && rank == 0){ 
+    MPI_Irecv(&grid->buf_recv[nrecv],nrecv2,type,extraProc,0,subComm,&request[1]);
+  }
+  MPI_Send (grid->buf_send,nsend,type,partner,0,subComm); 
+  MPI_Waitall(2,request,MPI_STATUSES_IGNORE);
+
+  //store atoms in atom list
+  while(m < nrecv+nrecv2){ 
+    m += unpackExchange(atom, atom->Nlocal++, &grid->buf_recv[m]);
+  }
+}
+
+/*
+ * Recursive coordinate bisection of the whole simulation box.
+ *
+ * Every process starts with the full box and the box is halved level by
+ * level until there are at least as many boxes as processes. The cut
+ * dimension cycles through the first `ndim` dimensions ordered by
+ * decreasing box length. Afterwards grid->Timer, the global map (grid->map)
+ * and grid->cutneigh are updated.
+ *
+ *   grid    process grid, updated as described above
+ *   atom    local atoms and local box (atom->mybox), both updated
+ *   param   supplies the box lengths and the neighbour cutoff
+ *   method  function returning the cut position for one bisection
+ *   ndim    number of dimensions to cycle through
+ *   newTime current time stamp
+ */
+void rcbBalance(Grid* grid, Atom* atom, Parameter* param, RCB_Method method, int ndim, double newTime)
+{
+  int me, nprocs=0, ilevel=0, nboxes=1;
+  int color = 0, size =0;
+  int index;
+  //Box lengths stay floating point so that lengths differing by less than
+  //one unit are still ordered correctly
+  MD_FLOAT prd[3];
+  MPI_Comm subComm;
+  MPI_Comm_size(world, &nprocs);
+  MPI_Comm_rank(world, &me);
+  
+  //set the elapsed time since the last dynamic balance
+  double time = newTime - grid->Timer;
+  
+  prd[_x] = atom->mybox.xprd = param->xprd; 
+  prd[_y] = atom->mybox.yprd = param->yprd; 
+  prd[_z] = atom->mybox.zprd = param->zprd;
+
+  //Sort by larger dimension 
+  int largerDim[3] ={_x, _y, _z};
+
+  for(int i = 0; i< 2; i++){
+    for(int j = i+1; j<3; j++)
+    {
+      if(prd[largerDim[j]]>prd[largerDim[i]]){
+        int tmp = largerDim[j];
+        largerDim[j] = largerDim[i];
+        largerDim[i] = tmp;
+      }  
+    }
+  }
+  //Initial Partition
+  atom->mybox.lo[_x] = 0; atom->mybox.hi[_x] = atom->mybox.xprd;
+  atom->mybox.lo[_y] = 0; atom->mybox.hi[_y] = atom->mybox.yprd;
+  atom->mybox.lo[_z] = 0; atom->mybox.hi[_z] = atom->mybox.zprd;
+  
+  //Recursion tree 
+  while(nboxes<nprocs)
+  {  
+    index = ilevel%ndim; 
+    //processes sharing the same colour currently share the same box
+    MPI_Comm_split(world, color, me, &subComm);
+    MPI_Comm_size(subComm,&size);
+    if(size > 1){
+      nextBisectionLevel(grid, atom, method, subComm, largerDim[index], &color, ilevel, time);
+    }
+    MPI_Comm_free(&subComm);
+    //number of boxes after this level: 2^ilevel, computed in integers
+    ++ilevel;
+    nboxes = 1 << ilevel;
+  }
+  //Set the new timer grid
+  grid->Timer = newTime;
+
+  //Creating the global map
+  MD_FLOAT domain[6] = {atom->mybox.lo[_x], atom->mybox.lo[_y], atom->mybox.lo[_z], atom->mybox.hi[_x], atom->mybox.hi[_y], atom->mybox.hi[_z]};
+  MPI_Allgather(domain, 6, type, grid->map, 6, type, world);  
+  
+  //Define the same cutneighbour in all dimensions for the exchange communication
+  for(int dim =_x; dim<=_z; dim++)
+    grid->cutneigh[dim] = param->cutneigh;
+}
+
+//Regular grid
+/*
+ * Regular decomposition: arrange all ranks on a periodic 3d cartesian
+ * process grid and give every rank an equally sized box of the domain.
+ *
+ * grid  : receives nprocs[], coord[], cutneigh[] and the gathered global map.
+ * param : supplies the domain extents (xprd, yprd, zprd) and cutneigh.
+ * box   : receives the domain extents and the lo/hi corners of this rank's
+ *         box, with the domain origin at (0,0,0).
+ */
+void cartisian3d(Grid* grid, Parameter* param, Box* box)
+{
+  int rank, nranks;
+  int dims[3]     = {0, 0, 0};
+  int coords[3]   = {0, 0, 0};
+  int periodic[3] = {1, 1, 1};
+  MD_FLOAT domain[6];
+  MPI_Comm cartComm;
+
+  MPI_Comm_size(MPI_COMM_WORLD, &nranks);
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+  box->xprd = param->xprd;
+  box->yprd = param->yprd;
+  box->zprd = param->zprd;
+
+  //Let MPI choose the process grid, then look up my position in it
+  MPI_Dims_create(nranks, 3, dims);
+  MPI_Cart_create(world, 3, dims, periodic, 0, &cartComm);
+  MPI_Cart_coords(cartComm, rank, 3, coords);
+
+  //Edge length of one box along each dimension
+  const MD_FLOAT len[3] = {
+    param->xprd / dims[_x],
+    param->yprd / dims[_y],
+    param->zprd / dims[_z]
+  };
+
+  for(int d = _x; d <= _z; d++){
+    grid->nprocs[d] = dims[d];
+    grid->coord[d]  = coords[d];
+
+    box->lo[d] = coords[d] * len[d];
+    box->hi[d] = (coords[d] + 1) * len[d];
+
+    //Map entry layout: lo x,y,z followed by hi x,y,z
+    domain[d]     = box->lo[d];
+    domain[d + 3] = box->hi[d];
+
+    //Same neighbour cutoff in every dimension for the exchange communication
+    grid->cutneigh[d] = param->cutneigh;
+  }
+
+  MPI_Allgather(domain, 6, type, grid->map, 6, type, world);
+  MPI_Comm_free(&cartComm);
+}
+
+//Other Functions from the grid
+/*
+ * Put a Grid into its initial state: allocate the global box map (six
+ * MD_FLOAT values per rank) and clear the balancing buffers and the timer.
+ */
+void initGrid(Grid* grid)
+{
+  int commSize;
+  MPI_Comm_size(world, &commSize);
+
+  //Global map: 6 entries for each rank in world
+  grid->map_size = 6 * commSize;
+  grid->map      = (MD_FLOAT*) allocate(ALIGNMENT, grid->map_size * sizeof(MD_FLOAT));
+
+  //RCB buffers start empty
+  grid->buf_send = NULL;
+  grid->buf_recv = NULL;
+  grid->maxsend  = 0;
+  grid->maxrecv  = 0;
+
+  //Staggered balancing: reference time of the last balance
+  grid->Timer = 0.;
+}
+
+/*
+ * Create the initial regular grid and reduce the local atom list to the
+ * atoms lying inside this rank's box.
+ *
+ * grid  : initialised here and filled by cartisian3d().
+ * atom  : positions may be shifted, atom->mybox is set, Nlocal shrinks;
+ *         Natoms is recomputed only when param->balance is 0.
+ * param : input_file, xlo/ylo/zlo and balance are read here.
+ */
+void setupGrid(Grid* grid, Atom* atom, Parameter* param)
+{
+  int rank;
+  MD_FLOAT lo[3], hi[3];
+
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  initGrid(grid);
+
+  //Atoms read from an input file are shifted so the domain starts at (0,0,0)
+  if(param->input_file){
+    for(int a = 0; a < atom->Nlocal; a++){
+      atom_x(a) = atom_x(a) - param->xlo;
+      atom_y(a) = atom_y(a) - param->ylo;
+      atom_z(a) = atom_z(a) - param->zlo;
+    }
+  }
+
+  cartisian3d(grid, param, &atom->mybox);
+
+  for(int d = _x; d <= _z; d++){
+    lo[d] = atom->mybox.lo[d];
+    hi[d] = atom->mybox.hi[d];
+  }
+
+  //Keep only atoms inside [lo,hi). A rejected slot is refilled through
+  //copy() from the last local atom (presumably overwriting slot a) and is
+  //tested again, so the index only advances for atoms that are kept.
+  for(int a = 0; a < atom->Nlocal; ){
+    int owned = atom_x(a) >= lo[_x] && atom_x(a) < hi[_x] &&
+                atom_y(a) >= lo[_y] && atom_y(a) < hi[_y] &&
+                atom_z(a) >= lo[_z] && atom_z(a) < hi[_z];
+    if(owned){
+      a++;
+    } else {
+      copy(atom, a, atom->Nlocal-1);
+      atom->Nlocal--;
+    }
+  }
+
+  //With balancing enabled nothing is counted or reported here
+  if(param->balance) return;
+
+  MPI_Allreduce(&atom->Nlocal, &atom->Natoms, 1, MPI_INT, MPI_SUM, world);
+  printf("Processor:%i, Local atoms:%i, Total atoms:%i\n",rank, atom->Nlocal,atom->Natoms);
+  MPI_Barrier(world);
+}
+
+/*
+ * Print the box of every rank as stored in grid->map. Only rank 0 prints;
+ * all ranks meet at a barrier afterwards.
+ */
+void printGrid(Grid* grid)
+{
+  int rank, nranks;
+  MPI_Comm_size(world, &nranks);
+  MPI_Comm_rank(world, &rank);
+
+  if(rank == 0){
+    //Each rank owns 6 consecutive entries: lo x,y,z then hi x,y,z
+    const MD_FLOAT* box = grid->map;
+
+    printf("GRID:\n");
+    printf("===================================================================================================\n");
+    for(int p = 0; p < nranks; p++, box += 6)
+      printf("Box:%i\txlo:%.4f\txhi:%.4f\tylo:%.4f\tyhi:%.4f\tzlo:%.4f\tzhi:%.4f\n", p,box[0],box[3],box[1],box[4],box[2],box[5]);
+    printf("\n\n");
+  }
+  MPI_Barrier(world);
+}
+
+
+
--- a/common/includes/box.h
+++ b/common/includes/box.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <parameter.h>
+
+#ifndef __BOX_H_
+#define __BOX_H_
+
+// Axis-aligned box: the extents of the whole domain plus the corners of
+// one subdomain. lo[] and hi[] are indexed by dimension (x, y, z).
+typedef struct {
+  int id;                        // box identifier (NOTE(review): meaning not visible in this header - confirm)
+  MD_FLOAT xprd, yprd, zprd;     // extent of the domain in x, y and z
+  MD_FLOAT lo[3];               // lowest coordinate of my subdomain, per dimension
+  MD_FLOAT hi[3];               // highest coordinate of my subdomain, per dimension
+} Box;
+
+int overlapBox(int, int , const Box*, const Box* , Box* , MD_FLOAT , MD_FLOAT);
+int overlapFullBox(Parameter*, MD_FLOAT*, const Box*, const Box*);
+void expandBox(int , const Box*, const Box* , Box* , MD_FLOAT);
+#endif
--- a/common/includes/comm.h
+++ b/common/includes/comm.h
@@ -0,0 +1,104 @@
+#include <atom.h>
+#include <parameter.h>
+#include <box.h>
+#include <grid.h>
+
+#ifndef COMM_H
+#define COMM_H
+
+// Per-scheme communication constants and helpers.
+//   *_SIZE           : number of values packed per item for each kind of message.
+//   LOCAL / GHOST    : number of local / ghost items; both expect a variable
+//                      named `atom` (Atom*) in the scope where they are used.
+//   IsinRegionToSend : true when the item lies in the region bounded by the
+//                      variables xlo, xhi, ylo, yhi, zlo, zhi, which must be
+//                      in scope at the point of use (together with `atom`).
+#ifdef GROMACS
+#define FORWARD_SIZE  (3*CLUSTER_N)
+#define REVERSE_SIZE  (3*CLUSTER_N)
+#define GHOST_SIZE    (4*CLUSTER_N+10)
+#define EXCHANGE_SIZE 7
+
+// Expression macros are fully parenthesized so they keep their value when
+// used inside a larger expression (e.g. `n * LOCAL` or `i % JFAC`).
+#define JFAC  (MAX(1, CLUSTER_N / CLUSTER_M))
+#define LOCAL (atom->Nclusters_local / JFAC)
+#define GHOST atom->Nclusters_ghost
+
+// A cluster qualifies when its bounding box overlaps the region in every dimension
+#define IsinRegionToSend(cj)                                                                  \
+           ((atom->jclusters[(cj)].bbminx >= xlo || atom->jclusters[(cj)].bbmaxx >= xlo)  &&  \
+            (atom->jclusters[(cj)].bbminx  < xhi || atom->jclusters[(cj)].bbmaxx  < xhi)  &&  \
+            (atom->jclusters[(cj)].bbminy >= ylo || atom->jclusters[(cj)].bbmaxy >= ylo)  &&  \
+            (atom->jclusters[(cj)].bbminy  < yhi || atom->jclusters[(cj)].bbmaxy  < yhi)  &&  \
+            (atom->jclusters[(cj)].bbminz >= zlo || atom->jclusters[(cj)].bbmaxz >= zlo)  &&  \
+            (atom->jclusters[(cj)].bbminz  < zhi || atom->jclusters[(cj)].bbmaxz  < zhi))
+
+#else
+
+#define FORWARD_SIZE  3
+#define REVERSE_SIZE  3
+#define GHOST_SIZE    4
+#define EXCHANGE_SIZE 7
+#define LOCAL atom->Nlocal
+#define GHOST atom->Nghost
+
+// An atom qualifies when its position lies in [lo,hi) in every dimension
+#define IsinRegionToSend(i)                                 \
+           ((atom_x((i)) >= xlo && atom_x((i)) < xhi) &&    \
+            (atom_y((i)) >= ylo && atom_y((i)) < yhi) &&    \
+            (atom_z((i)) >= zlo && atom_z((i)) < zhi))
+
+#endif
+
+// Communication state of one MPI rank: neighbour lists, swap schedule and
+// message buffers used by the forward, reverse, exchange and ghost steps.
+typedef struct {
+  int myproc;                       // my proc ID
+  int numproc;                      // # of processors
+	
+  int numneigh;                     // # of all my neighbours along all swaps
+  int maxneigh;                     // buffer size for my neighbours
+	int sendfrom[6];                  // lowest neighbour index to send to in each swap
+  int sendtill[6];                  // highest neighbour index to send to in each swap
+  int recvfrom[6];                  // lowest neighbour index to receive from in each swap
+  int recvtill[6];                  // highest neighbour index to receive from in each swap
+  int* nsend;                       // neighbours I want to send to
+  int* nrecv;                       // neighbours I want to receive from
+
+	int* pbc_x;                       // periodic boundary flag in x
+	int* pbc_y;                       // periodic boundary flag in y
+	int* pbc_z;                       // periodic boundary flag in z
+	
+  int* atom_send, *atom_recv;       // # of atoms to send/recv for each of my neighbours
+	int* off_atom_send;               // atom offset to send, inside of a swap
+  int* off_atom_recv;               // atom offset to recv, inside of a swap
+         
+  int* nexch;                        // procs to exchange with
+  int numneighexch;                  // # of neighbours to exchange with
+  int maxneighexch;                  // max buffer size to store those neighbours
+
+	int numswap;                      // # of swaps to perform, it is 6
+  int swapdim[6];                   // dimension of the swap (_x, _y or _z)
+	int swapdir[6];                   // direction of the swap, 0 or 1
+  int swap[3][2];                   // swap index for a given dimension and direction
+  int othersend[6];                 // whether a proc interacts with more procs in a given swap
+
+	int firstrecv[6];                 // where to put the first received atom in each swap
+  int** sendlist;                   // list of atoms to send in each swap
+  int* maxsendlist;                 // max # of atoms in the send list of each swap
+
+	int maxsend;                      // max elements in the send buffer
+	int maxrecv;                      // max elements in the receive buffer
+  MD_FLOAT* buf_send;               // send buffer for all communication
+	MD_FLOAT* buf_recv;               // receive buffer for all communication
+	 	  
+	int forwardSize;                  // # of parameters per atom in forward communication
+	int reverseSize;                  // # of parameters per atom in reverse communication
+  int exchangeSize;                 // # of parameters per atom in exchange
+	int ghostSize;                    // # of parameters per atom in the ghost list
+
+  int  iterAtom;                     // last atom to iterate over in each swap
+  Box* boxes;                        // boundaries to be sent to other procs as ghost regions
+} Comm;
+
+
+void initComm(int*, char***, Comm*); 						    //Init MPI 
+void endComm(Comm*);													      //End MPI
+void setupComm(Comm*,Parameter*,Grid*);             //Creates a 3d grid or rcb grid
+void neighComm(Comm*,Parameter*,Grid*);             //Find neighbours within cut-off and defines ghost regions
+void forwardComm(Comm*,Atom*,int);							    //Send info in one direction
+void reverseComm(Comm*,Atom*,int);							    //Return info after forward communication
+void exchangeComm(Comm*,Atom*);							        //Exchange info between procs
+void ghostComm(Comm*, Atom*,int);                   //Build the ghost neighbours to send during next forwards
+void growSend(Comm*,int);										        //Grows the size of the buffer sender
+void growRecv(Comm*,int);										        //Grows the size of the buffer receiver
+void growList(Comm*, int, int);                     //Grows the size of the list to send
+#endif
--- a/common/includes/grid.h
+++ b/common/includes/grid.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+
+
+#include <parameter.h>
+#include <box.h>
+#include <atom.h>
+#include <mpi.h>
+
+#ifndef __MAP_H_
+#define __MAP_H_
+
+#define world MPI_COMM_WORLD
+#define atom_pos(i) ((dim == _x) ? atom_x((i)) : (dim == _y) ? atom_y((i)) : atom_z((i)))
+
+enum {RCB=1, meanTimeRCB, Staggered};
+
+// Description of the domain decomposition shared by all balancing schemes.
+typedef struct {
+  int balance_every;      // rebalancing interval (NOTE(review): not set in the code visible here - confirm where it is filled)
+  int  map_size;          // number of entries in map: 6 per rank
+  MD_FLOAT* map;          // boxes of all ranks, per rank: lo x,y,z followed by hi x,y,z
+  //===Param for Staggered balance
+  int nprocs[3];          // ranks per dimension of the cartesian process grid
+  int coord[3];           // my coordinates in the cartesian process grid
+  MD_FLOAT cutneigh[3];   // neighbour cutoff per dimension used for the exchange communication
+  double Timer;           // time stamp of the last balance
+  //===Param for RCB balance 
+  MD_FLOAT* buf_send;     // send buffer, NULL until allocated
+  MD_FLOAT* buf_recv;     // receive buffer, NULL until allocated
+  int maxsend;            // capacity of buf_send, starts at 0
+  int maxrecv;            // capacity of buf_recv, starts at 0
+} Grid; 
+
+
+typedef MD_FLOAT(*RCB_Method)(Atom*,MPI_Comm,int,double);
+
+void setupGrid(Grid*, Atom*, Parameter*);
+void cartisian3d(Grid*, Parameter*, Box*);
+void rcbBalance(Grid*, Atom*, Parameter* ,RCB_Method, int, double);
+void staggeredBalance(Grid*, Atom*, Parameter*, double); 
+void printGrid(Grid*); 
+//rcb methods
+MD_FLOAT meanBisect(Atom* , MPI_Comm, int, double);
+MD_FLOAT meanTimeBisect(Atom*, MPI_Comm, int, double);
+#endif
+
+
--- a/common/includes/parameter.h
+++ b/common/includes/parameter.h
@@ -8,9 +8,11 @@
 #define __PARAMETER_H_

 #if PRECISION == 1
-#define MD_FLOAT float
+#   define MD_FLOAT float
+#   define MD_UINT  unsigned int
 #else
-#define MD_FLOAT double
+#   define MD_FLOAT double
+#   define MD_UINT  unsigned long long int
 #endif

 typedef struct {
@@ -19,6 +21,7 @@ typedef struct {
    char* input_file;
    char* vtk_file;
    char* xtc_file;
+    char* write_atom_file;
    MD_FLOAT epsilon;
    MD_FLOAT sigma;
    MD_FLOAT sigma6;
@@ -50,6 +53,10 @@ typedef struct {
    MD_FLOAT k_dn;
    MD_FLOAT gx, gy, gz;
    MD_FLOAT reflect_x, reflect_y, reflect_z;
+    //MPI implementation
+    int balance;
+    int method;
+    int balance_every;
 } Parameter;

 void initParameter(Parameter*);
--- a/common/includes/shell_methods.h
+++ b/common/includes/shell_methods.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <limits.h>
+#include <math.h>
+#include <comm.h>
+#include <atom.h>
+#include <timing.h>
+#include <parameter.h>
+#include <util.h>
+
+//static void addDummyCluster(Atom*);
+
+/*
+ * Run the forward communication swaps required by param->method and return
+ * the elapsed wall time, measured with getTimeStamp().
+ *   halfShell  : swaps 0..4
+ *   eightShell : swaps 0, 2, 4
+ *   otherwise  : swaps 0..5
+ * Declared static inline because the definition lives in a header: every
+ * translation unit including it gets a private copy instead of a duplicate
+ * external symbol.
+ */
+static inline double forward(Comm* comm, Atom *atom, Parameter* param){
+    double S, E;
+    S = getTimeStamp();
+    if(param->method == halfShell){
+        for(int iswap = 0; iswap < 5; iswap++)
+            forwardComm(comm, atom, iswap);
+    } else if(param->method == eightShell){
+        for(int iswap = 0; iswap < 6; iswap+=2)
+            forwardComm(comm, atom, iswap);
+    } else {
+        for(int iswap = 0; iswap < 6; iswap++)
+            forwardComm(comm, atom, iswap);
+    }
+    E = getTimeStamp();
+    return E-S;
+}
+
+/*
+ * Run the reverse communication swaps for param->method, walking the swaps
+ * in descending order, and return the elapsed wall time.
+ *   halfShell   : swaps 4..0
+ *   eightShell  : swaps 4, 2, 0
+ *   halfStencil : swaps 5..0
+ *   otherwise (full shell) : nothing is communicated
+ * Declared static inline because the definition lives in a header: every
+ * translation unit including it gets a private copy instead of a duplicate
+ * external symbol.
+ */
+static inline double reverse(Comm* comm, Atom *atom, Parameter* param){
+    double S, E;
+    S = getTimeStamp();
+    if(param->method == halfShell){
+        for(int iswap = 4; iswap >= 0; iswap--)
+            reverseComm(comm, atom, iswap);
+    } else if(param->method == eightShell){
+        for(int iswap = 4; iswap >= 0; iswap-=2)
+            reverseComm(comm, atom, iswap);
+    } else if(param->method == halfStencil){
+        for(int iswap = 5; iswap >= 0; iswap--)
+            reverseComm(comm, atom, iswap);
+    }  else { }  //Full Shell Reverse does nothing
+    E = getTimeStamp();
+    return E-S;
+}
+
+/*
+ * Reset the ghost counters and rebuild the ghost lists by calling
+ * ghostComm() for every swap required by param->method.
+ *   halfShell  : swaps 0..4
+ *   eightShell : swaps 0, 2, 4
+ *   otherwise  : swaps 0..5
+ * Declared static inline because the definition lives in a header: every
+ * translation unit including it gets a private copy instead of a duplicate
+ * external symbol.
+ */
+static inline void ghostNeighbor(Comm* comm, Atom* atom, Parameter* param)
+{
+    #ifdef GROMACS
+    atom->Nclusters_ghost = 0;
+    #endif
+    atom->Nghost = 0;
+    if(param->method == halfShell){
+        for(int iswap=0; iswap<5; iswap++)
+            ghostComm(comm,atom,iswap);
+    } else if(param->method == eightShell){
+        for(int iswap = 0; iswap<6; iswap+=2)
+            ghostComm(comm, atom,iswap);
+    } else {
+        for(int iswap=0; iswap<6; iswap++)
+            ghostComm(comm,atom,iswap);
+    }
+}
--- a/common/includes/simd/avx2_double.h
+++ b/common/includes/simd/avx2_double.h
@@ -48,11 +48,13 @@ static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_S
    t2 = _mm256_permute2f128_pd(t0, t1, 0x21);
    t0 = _mm256_add_pd(t0, t2);
    t1 = _mm256_add_pd(t1, t2);
-    t0 = _mm256_blend_pd(t0, t1, 0b1100);
+    t0 = _mm256_blend_pd(t0, t1, 0xC);
+    //t0 = _mm256_blend_pd(t0, t1, 0b1100);
    t1 = _mm256_add_pd(t0, _mm256_load_pd(m));
    _mm256_store_pd(m, t1);

-    t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101));
+    t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0x5));
+    //t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101));
    a0 = _mm256_castpd256_pd128(t0);
    a1 = _mm256_extractf128_pd(t0, 0x1);
    a0 = _mm_add_sd(a0, a1);
@@ -91,7 +93,7 @@ static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1,
 }

 // Functions used in LAMMPS kernel
-static inline MD_SIMD_FLOAT simd_gather(MD_SIMD_INT vidx, const MD_FLOAT *m, int s) { return _mm256_i32gather_pd(m, vidx, s); }
+// Macro rather than a function; no trailing ';' so it can be used inside larger expressions.
+#define simd_gather(vidx, m, s)     _mm256_i32gather_pd((m), (vidx), (s))
 static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm_set1_epi32(scalar); }
 static inline MD_SIMD_INT simd_int_zero() { return _mm_setzero_si128(); }
 static inline MD_SIMD_INT simd_int_seq() { return _mm_set_epi32(3, 2, 1, 0); }
--- a/common/includes/simd/avx512_double.h
+++ b/common/includes/simd/avx512_double.h
@@ -9,10 +9,13 @@
 #   include <zmmintrin.h>
 #endif

-#define MD_SIMD_FLOAT   __m512d
-#define MD_SIMD_MASK    __mmask8
-#define MD_SIMD_INT     __m256i
+#define MD_SIMD_FLOAT       __m512d
+#define MD_SIMD_MASK        __mmask8
+#define MD_SIMD_INT         __m256i
+#define MD_SIMD_BITMASK     MD_SIMD_INT
+#define MD_SIMD_IBOOL       __mmask16

+// Convert an integer-comparison mask (__mmask16) to the 8-bit mask type used for double lanes by casting it to __mmask8.
+static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) { return (__mmask8)(a); }
 static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm512_set1_pd(scalar); }
 static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_pd(0.0); }
 static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_pd(a, b); }
--- a/common/includes/simd/avx512_float.h
+++ b/common/includes/simd/avx512_float.h
@@ -7,11 +7,30 @@
 #include <stdlib.h>
 #include <string.h>
 #include <immintrin.h>
-#include <zmmintrin.h>
+#ifndef NO_ZMM_INTRIN
+#   include <zmmintrin.h>
+#endif

 #define MD_SIMD_FLOAT       __m512
 #define MD_SIMD_MASK        __mmask16
+#define MD_SIMD_INT         __m256i
+#define MD_SIMD_IBOOL       __mmask16
+#define MD_SIMD_INT32       __m512i
+#define MD_SIMD_BITMASK     MD_SIMD_INT32

+// Load a 512-bit integer vector from m with _mm512_load_si512
+// (NOTE(review): this is the aligned load intrinsic - confirm callers pass 64-byte aligned data).
+static inline MD_SIMD_BITMASK simd_load_bitmask(const int *m) {
+    return _mm512_load_si512(m);
+}
+
+// Broadcast the 32-bit integer a to all lanes of a 512-bit integer vector.
+static inline MD_SIMD_INT32 simd_int32_broadcast(int a) {
+    return _mm512_set1_epi32(a);
+}
+
+// Reinterpret a as 32-bit integers and test each lane against itself:
+// the returned mask marks the lanes passed to _mm512_test_epi32_mask that are non-zero.
+static inline MD_SIMD_IBOOL simd_test_bits(MD_SIMD_FLOAT a) {
+    return _mm512_test_epi32_mask(_mm512_castps_si512(a), _mm512_castps_si512(a));
+}
+
+// Integer mask to float mask: both are __mmask16 in this header, so the value is returned unchanged.
+static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) { return a; }
 static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); }
 static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_ps(0.0f); }
 static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_ps(a, b); }
@@ -69,7 +88,7 @@ static inline MD_FLOAT simd_h_dual_incr_reduced_sum(float* m, MD_SIMD_FLOAT v0,
    return _mm_cvtss_f32(t3);
 }

-inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
+static inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
    __m256 t;
    a = _mm512_add_ps(a, _mm512_shuffle_f32x4(a, a, 0xee));
    t = _mm256_load_ps(m);
--- a/common/includes/timers.h
+++ b/common/includes/timers.h
@@ -9,9 +9,15 @@

 typedef enum {
    TOTAL = 0,
-    NEIGH,
    FORCE,
+    NEIGH,
+    FORWARD,
+    REVERSE,
+    UPDATE,
+    BALANCE,
+    SETUP,
+    REST,
    NUMTIMER
-} timertype;
+ } timerComm;

 #endif
--- a/common/includes/timing.h
+++ b/common/includes/timing.h
@@ -7,8 +7,8 @@
 #ifndef __TIMING_H_
 #define __TIMING_H_

-extern double getTimeStamp();
-extern double getTimeResolution();
-extern double getTimeStamp_();
+extern double getTimeStamp(void);
+extern double getTimeResolution(void);
+extern double getTimeStamp_(void);

 #endif
--- a/common/includes/util.h
+++ b/common/includes/util.h
@@ -4,6 +4,8 @@
 * Use of this source code is governed by a LGPL-3.0
 * license that can be found in the LICENSE file.
 */
+#include <math.h>
+
 #ifndef __UTIL_H_
 #define __UTIL_H_

@@ -35,12 +37,19 @@
 #   define PRECISION_STRING     "double"
 #endif

+#define BigOrEqual(a,b) (fabs((a)-(b))<1e-9 || (a)>(b))
+#define Equal(a,b) (fabs((a)-(b))<1e-9)
+
+enum {_x=0, _y, _z}; 
+enum {fullShell=0, halfShell, eightShell, halfStencil};
+
+
 extern double myrandom(int*);
 extern void random_reset(int *seed, int ibase, double *coord);
 extern int str2ff(const char *string);
 extern const char* ff2str(int ff);
-extern int get_num_threads();
 extern void readline(char *line, FILE *fp);
 extern void debug_printf(const char *format, ...);
+extern int get_cuda_num_threads();

 #endif
--- a/common/parameter.c
+++ b/common/parameter.c
@@ -11,12 +11,14 @@
 #include <atom.h>
 #include <parameter.h>
 #include <util.h>
+#include <mpi.h>

 void initParameter(Parameter *param) {
    param->input_file = NULL;
    param->vtk_file = NULL;
    param->xtc_file = NULL;
    param->eam_file = NULL;
+    param->write_atom_file = NULL;
    param->force_field = FF_LJ;
    param->epsilon = 1.0;
    param->sigma = 1.0;
@@ -53,13 +55,17 @@ void initParameter(Parameter *param) {
    param->reflect_x = 0.0;
    param->reflect_y = 0.0;
    param->reflect_z = 0.0;
+    //MPI
+    param->balance = 0;
+    param->method = 0;
+    param->balance_every =param->reneigh_every; 
 }

 void readParameter(Parameter *param, const char *filename) {
    FILE *fp = fopen(filename, "r");
    char line[MAXLINE];
    int i;
-
+    
    if(!fp) {
        fprintf(stderr, "Could not open parameter file: %s\n", filename);
        exit(-1);
@@ -71,8 +77,8 @@ void readParameter(Parameter *param, const char *filename) {
        for(i = 0; line[i] != '\0' && line[i] != '#'; i++);
        line[i] = '\0';

-        char *tok = strtok(line, " ");
-        char *val = strtok(NULL, " ");
+        char *tok = strtok(line, "\t ");
+        char *val = strtok(NULL, "\t ");

        #define PARSE_PARAM(p,f)   if(strncmp(tok, #p, sizeof(#p) / sizeof(#p[0]) - 1) == 0) { param->p = f(val); }
        #define PARSE_STRING(p)    PARSE_PARAM(p, strdup)
@@ -116,34 +122,39 @@ void readParameter(Parameter *param, const char *filename) {
            PARSE_INT(x_out_every);
            PARSE_INT(v_out_every);
            PARSE_INT(half_neigh);
+            PARSE_INT(method);
+            PARSE_INT(balance);
+            PARSE_INT(balance_every);
        }
    }
-
    // Update dtforce
    param->dtforce = 0.5 * param->dt;

    // Update sigma6 parameter
    MD_FLOAT s2 = param->sigma * param->sigma;
    param->sigma6 = s2 * s2 * s2;
+    
+    //Update balance parameter: scale balance_every by reneigh_every (presumably converting a count of reneighbor steps into timesteps)
+    param->balance_every *=param->reneigh_every;
    fclose(fp);
 }

 void printParameter(Parameter *param) {
    printf("Parameters:\n");
    if(param->input_file != NULL) {
-        printf("Input file: %s\n", param->input_file);
+        printf("\tInput file: %s\n", param->input_file);
    }

    if(param->vtk_file != NULL) {
-        printf("VTK file: %s\n", param->vtk_file);
+        printf("\tVTK file: %s\n", param->vtk_file);
    }

    if(param->xtc_file != NULL) {
-        printf("XTC file: %s\n", param->xtc_file);
+        printf("\tXTC file: %s\n", param->xtc_file);
    }

    if(param->eam_file != NULL) {
-        printf("EAM file: %s\n", param->eam_file);
+        printf("\tEAM file: %s\n", param->eam_file);
    }

    printf("\tForce field: %s\n", ff2str(param->force_field));
@@ -169,6 +180,11 @@ void printParameter(Parameter *param) {
    printf("\tNumber of timesteps: %d\n", param->ntimes);
    printf("\tReport stats every (timesteps): %d\n", param->nstat);
    printf("\tReneighbor every (timesteps): %d\n", param->reneigh_every);
+    #ifdef SORT_ATOMS
+    printf("\tSort atoms when reneighboring: yes\n");
+    #else
+    printf("\tSort atoms when reneighboring: no\n");
+    #endif
    printf("\tPrune every (timesteps): %d\n", param->prune_every);
    printf("\tOutput positions every (timesteps): %d\n", param->x_out_every);
    printf("\tOutput velocities every (timesteps): %d\n", param->v_out_every);
@@ -177,4 +193,19 @@ void printParameter(Parameter *param) {
    printf("\tSkin: %e\n", param->skin);
    printf("\tHalf neighbor lists: %d\n", param->half_neigh);
    printf("\tProcessor frequency (GHz): %.4f\n", param->proc_freq);
+
+    // ================ New MPI features =============
+    char str[20]; 
+    strcpy(str, (param->method == 1) ? "Half Shell"  :
+                (param->method == 2) ? "Eight Shell" :
+                (param->method == 3) ? "Half Stencil":                      
+                                       "Full Shell");
+    printf("\tMethod: %s\n", str);
+    strcpy(str, (param->balance == 1) ? "mean RCB"      : 
+                (param->balance == 2) ? "mean Time RCB" :
+                (param->balance == 3) ? "Staggered"     :
+                                        "cartisian");
+    printf("\tPartition: %s\n", str);
+    if(param->balance) 
+        printf("\tRebalancing every (timesteps): %d\n",param->balance_every); 
 }
--- a/common/thermo.c
+++ b/common/thermo.c
@@ -10,6 +10,7 @@

 #include <thermo.h>
 #include <util.h>
+#include <mpi.h>

 static int *steparr;
 static MD_FLOAT *tmparr;
@@ -24,6 +25,7 @@ static MD_FLOAT t_act;
 static MD_FLOAT p_act;
 static MD_FLOAT e_act;
 static int mstat;
+static MPI_Datatype type = (sizeof(MD_FLOAT) == 4) ? MPI_FLOAT : MPI_DOUBLE;

 /* exported subroutines */
 void setupThermo(Parameter *param, int natoms)
@@ -53,57 +55,73 @@ void setupThermo(Parameter *param, int natoms)

 void computeThermo(int iflag, Parameter *param, Atom *atom)
 {
-    MD_FLOAT t = 0.0, p;
+    MD_FLOAT t_sum = 0.0, t = 0.0, p;
+    int me; 
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &me);
+
    for(int i = 0; i < atom->Nlocal; i++) {
        t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass;
    }

-    t = t * t_scale;
-    p = (t * dof_boltz) * p_scale;
-    int istep = iflag;
+    MPI_Reduce(&t, &t_sum, 1, type, MPI_SUM, 0 ,MPI_COMM_WORLD);
+    if(me == 0)
+    {
+        t = t_sum * t_scale;
+        p = (t * dof_boltz) * p_scale;
+        int istep = iflag;

-    if(iflag == -1){
-        istep = param->ntimes;
-    }
-    if(iflag == 0){
-        mstat = 0;
-    }
+        if(iflag == -1){
+            istep = param->ntimes;
+        }
+        if(iflag == 0){
+            mstat = 0;
+        }

-    steparr[mstat] = istep;
-    tmparr[mstat] = t;
-    prsarr[mstat] = p;
-    mstat++;
-    fprintf(stdout, "%i\t%e\t%e\n", istep, t, p);
+        steparr[mstat] = istep;
+        tmparr[mstat] = t;
+        prsarr[mstat] = p;
+        mstat++;
+        fprintf(stdout, "%i\t%e\t%e\n", istep, t, p);
+    }
 }

 void adjustThermo(Parameter *param, Atom *atom)
 {
    /* zero center-of-mass motion */
    MD_FLOAT vxtot = 0.0; MD_FLOAT vytot = 0.0; MD_FLOAT vztot = 0.0;
-
+    MD_FLOAT v_sum[3], vtot[3];  
+    
    for(int i = 0; i < atom->Nlocal; i++) {
        vxtot += atom_vx(i);
        vytot += atom_vy(i);
        vztot += atom_vz(i);
    }
+    
+    vtot[0] = vxtot; vtot[1] = vytot; vtot[2] = vztot;  

-    vxtot = vxtot / atom->Natoms;
-    vytot = vytot / atom->Natoms;
-    vztot = vztot / atom->Natoms;
+    MPI_Allreduce(vtot, v_sum, 3, type, MPI_SUM, MPI_COMM_WORLD);
+    
+    vxtot = v_sum[0] / atom->Natoms;
+    vytot = v_sum[1] / atom->Natoms;
+    vztot = v_sum[2] / atom->Natoms;

    for(int i = 0; i < atom->Nlocal; i++) {
        atom_vx(i) -= vxtot;
        atom_vy(i) -= vytot;
        atom_vz(i) -= vztot;
    }
-
-    t_act = 0;
+   
    MD_FLOAT t = 0.0;
+    MD_FLOAT t_sum = 0.0;

    for(int i = 0; i < atom->Nlocal; i++) {
        t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass;
    }

+    MPI_Allreduce(&t, &t_sum, 1,type, MPI_SUM,MPI_COMM_WORLD);
+
+    t = t_sum; 
    t *= t_scale;
    MD_FLOAT factor = sqrt(param->temp / t);

--- a/common/util.c
+++ b/common/util.c
@@ -10,6 +10,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <util.h>
+#include <math.h>

 /* Park/Miller RNG w/out MASKING, so as to be like f90s version */
 #define IA 16807
@@ -79,13 +80,14 @@ const char* ff2str(int ff) {
    return "invalid";
 }

-int get_num_threads() {
+int get_cuda_num_threads() {
    const char *num_threads_env = getenv("NUM_THREADS");
    return (num_threads_env == NULL) ? 32 : atoi(num_threads_env);
 }

 void readline(char *line, FILE *fp) {
    if(fgets(line, MAXLINE, fp) == NULL) {
+        printf("error %i\n",errno);
        if(errno != 0) {
            perror("readline()");
            exit(-1);
--- a/config.mk
+++ b/config.mk
@@ -1,20 +1,22 @@
 # Compiler tag (GCC/CLANG/ICC/ICX/ONEAPI/NVCC)
-TAG ?= ICC
+TAG ?= MPIICC
 # Instruction set (SSE/AVX/AVX_FMA/AVX2/AVX512)
 ISA ?= AVX512
 # Optimization scheme (lammps/gromacs/clusters_per_bin)
 OPT_SCHEME ?= gromacs
 # Enable likwid (true or false)
-ENABLE_LIKWID ?= true
+ENABLE_LIKWID ?= false
 # SP or DP
 DATA_TYPE ?= DP
 # AOS or SOA
-DATA_LAYOUT ?= AOS
+DATA_LAYOUT ?= SOA
 # Assembly syntax to generate (ATT/INTEL)
 ASM_SYNTAX ?= ATT
 # Debug
 DEBUG ?= false

+# Sort atoms when reneighboring (true or false)
+SORT_ATOMS ?= true
 # Explicitly store and load atom types (true or false)
 EXPLICIT_TYPES ?= false
 # Trace memory addresses for cache simulator (true or false)
@@ -22,13 +24,13 @@ MEM_TRACER ?= false
 # Trace indexes and distances for gather-md (true or false)
 INDEX_TRACER ?= false
 # Compute statistics
-COMPUTE_STATS ?= true
+COMPUTE_STATS ?= false

 # Configurations for lammps optimization scheme
 # Use omp simd pragma when running with half neighbor-lists
-ENABLE_OMP_SIMD ?= true
+ENABLE_OMP_SIMD ?= false
 # Use kernel with explicit SIMD intrinsics
-USE_SIMD_KERNEL ?= false
+USE_SIMD_KERNEL ?= true

 # Configurations for gromacs optimization scheme
 # Use reference version
--- a/data/argon_1000/mdbench_params.conf
+++ b/data/argon_1000/mdbench_params.conf
@@ -6,7 +6,7 @@ dt 0.001
 temp 80
 x_out_freq 500
 v_out_freq 5
-cutforce 0.9
-skin 0.0
+cutforce 1.8
+skin 0.1
 reneigh_every 100
 nstat 125000
--- a/1
+++ b/1
--- a/gromacs/atom.c
+++ b/gromacs/atom.c
@@ -12,7 +12,8 @@
 #include <atom.h>
 #include <allocate.h>
 #include <util.h>
-
+#include <mpi.h>
+ 
 void initAtom(Atom *atom) {
    atom->x  = NULL; atom->y  = NULL; atom->z  = NULL;
    atom->vx = NULL; atom->vy = NULL; atom->vz = NULL;
@@ -27,6 +28,7 @@ void initAtom(Atom *atom) {
    atom->Nclusters = 0;
    atom->Nclusters_local = 0;
    atom->Nclusters_ghost = 0;
+    atom->NmaxGhost = 0;        //Temporal
    atom->Nclusters_max = 0;
    atom->type = NULL;
    atom->ntypes = 0;
@@ -37,9 +39,19 @@ void initAtom(Atom *atom) {
    atom->iclusters = NULL;
    atom->jclusters = NULL;
    atom->icluster_bin = NULL;
+    atom->PBCx = NULL; 
+    atom->PBCy = NULL; 
+    atom->PBCz = NULL;
+    initMasks(atom);
+    //MPI
+    Box *mybox = &(atom->mybox);                  
+    mybox->xprd = mybox->yprd = mybox->zprd = 0;          
+    mybox->lo[0]  = mybox->lo[1]  = mybox->lo[2] = 0;             
+    mybox->hi[0]  = mybox->hi[1]  = mybox->hi[2] = 0;   
 }

 void createAtom(Atom *atom, Parameter *param) {
+    
    MD_FLOAT xlo = 0.0; MD_FLOAT xhi = param->xprd;
    MD_FLOAT ylo = 0.0; MD_FLOAT yhi = param->yprd;
    MD_FLOAT zlo = 0.0; MD_FLOAT zhi = param->zprd;
@@ -50,6 +62,7 @@ void createAtom(Atom *atom, Parameter *param) {
    atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
    atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
    atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+
    for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
        atom->epsilon[i] = param->epsilon;
        atom->sigma6[i] = param->sigma6;
@@ -88,7 +101,7 @@ void createAtom(Atom *atom, Parameter *param) {
            ytmp = 0.5 * alat * j;
            ztmp = 0.5 * alat * k;

-            if(xtmp >= xlo && xtmp < xhi && ytmp >= ylo && ytmp < yhi && ztmp >= zlo && ztmp < zhi ) {
+            if(xtmp >= xlo && xtmp < xhi && ytmp >= ylo && ytmp < yhi && ztmp >= zlo && ztmp < zhi ) {        
                n = k * (2 * param->ny) * (2 * param->nx) + j * (2 * param->nx) + i + 1;
                for(m = 0; m < 5; m++) { myrandom(&n); }
                vxtmp = myrandom(&n);
@@ -126,22 +139,26 @@ int type_str2int(const char *type) {
 }

 int readAtom(Atom* atom, Parameter* param) {
+    int me = 0;
+    MPI_Comm_rank(MPI_COMM_WORLD, &me);
    int len = strlen(param->input_file);
    if(strncmp(&param->input_file[len - 4], ".pdb", 4) == 0) { return readAtom_pdb(atom, param); }
    if(strncmp(&param->input_file[len - 4], ".gro", 4) == 0) { return readAtom_gro(atom, param); }
    if(strncmp(&param->input_file[len - 4], ".dmp", 4) == 0) { return readAtom_dmp(atom, param); }
-    fprintf(stderr, "Invalid input file extension: %s\nValid choices are: pdb, gro, dmp\n", param->input_file);
+    if(me==0) fprintf(stderr, "Invalid input file extension: %s\nValid choices are: pdb, gro, dmp\n", param->input_file);
    exit(-1);
    return -1;
 }

 int readAtom_pdb(Atom* atom, Parameter* param) {
+    int me = 0;
+    MPI_Comm_rank(MPI_COMM_WORLD, &me);
    FILE *fp = fopen(param->input_file, "r");
    char line[MAXLINE];
    int read_atoms = 0;

    if(!fp) {
-        fprintf(stderr, "Could not open input file: %s\n", param->input_file);
+        if(me==0) fprintf(stderr, "Could not open input file: %s\n", param->input_file);
        exit(-1);
        return -1;
    }
@@ -151,11 +168,11 @@ int readAtom_pdb(Atom* atom, Parameter* param) {
        char *item = strtok(line, " ");
        if(strncmp(item, "CRYST1", 6) == 0) {
            param->xlo = 0.0;
-            param->xhi = atof(strtok(NULL, " "));
+            param->xhi = atof(strtok(NULL, "\t "));
            param->ylo = 0.0;
-            param->yhi = atof(strtok(NULL, " "));
+            param->yhi = atof(strtok(NULL, "\t "));
            param->zlo = 0.0;
-            param->zhi = atof(strtok(NULL, " "));
+            param->zhi = atof(strtok(NULL, "\t "));
            param->xprd = param->xhi - param->xlo;
            param->yprd = param->yhi - param->ylo;
            param->zprd = param->zhi - param->zlo;
@@ -164,23 +181,23 @@ int readAtom_pdb(Atom* atom, Parameter* param) {
            char *label;
            int atom_id, comp_id;
            MD_FLOAT occupancy, charge;
-            atom_id = atoi(strtok(NULL, " ")) - 1;
+            atom_id = atoi(strtok(NULL, "\t ")) - 1;

            while(atom_id + 1 >= atom->Nmax) {
                growAtom(atom);
            }

-            atom->type[atom_id] = type_str2int(strtok(NULL, " "));
-            label = strtok(NULL, " ");
-            comp_id = atoi(strtok(NULL, " "));
-            atom_x(atom_id) = atof(strtok(NULL, " "));
-            atom_y(atom_id) = atof(strtok(NULL, " "));
-            atom_z(atom_id) = atof(strtok(NULL, " "));
+            atom->type[atom_id] = type_str2int(strtok(NULL, "\t "));
+            label = strtok(NULL, "\t ");
+            comp_id = atoi(strtok(NULL, "\t "));
+            atom_x(atom_id) = atof(strtok(NULL, "\t "));
+            atom_y(atom_id) = atof(strtok(NULL, "\t "));
+            atom_z(atom_id) = atof(strtok(NULL, "\t "));
            atom->vx[atom_id] = 0.0;
            atom->vy[atom_id] = 0.0;
            atom->vz[atom_id] = 0.0;
-            occupancy = atof(strtok(NULL, " "));
-            charge = atof(strtok(NULL, " "));
+            occupancy = atof(strtok(NULL, "\t "));
+            charge = atof(strtok(NULL, "\t "));
            atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
            atom->Natoms++;
            atom->Nlocal++;
@@ -192,14 +209,14 @@ int readAtom_pdb(Atom* atom, Parameter* param) {
                  strncmp(item, "ENDMDL", 6) == 0) {
            // Do nothing
        } else {
-            fprintf(stderr, "Invalid item: %s\n", item);
+            if(me==0) fprintf(stderr, "Invalid item: %s\n", item);
            exit(-1);
            return -1;
        }
    }

    if(!read_atoms) {
-        fprintf(stderr, "Input error: No atoms read!\n");
+        if(me==0) fprintf(stderr, "Input error: No atoms read!\n");
        exit(-1);
        return -1;
    }
@@ -215,12 +232,15 @@ int readAtom_pdb(Atom* atom, Parameter* param) {
        atom->cutforcesq[i] = param->cutforce * param->cutforce;
    }

-    fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
+    if(me==0) fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
    fclose(fp);
    return read_atoms;
 }

 int readAtom_gro(Atom* atom, Parameter* param) {
+    int me = 0;
+    MPI_Comm_rank(MPI_COMM_WORLD, &me);
+
    FILE *fp = fopen(param->input_file, "r");
    char line[MAXLINE];
    char desc[MAXLINE];
@@ -229,7 +249,7 @@ int readAtom_gro(Atom* atom, Parameter* param) {
    int i = 0;

    if(!fp) {
-        fprintf(stderr, "Could not open input file: %s\n", param->input_file);
+        if(me==0) fprintf(stderr, "Could not open input file: %s\n", param->input_file);
        exit(-1);
        return -1;
    }
@@ -239,25 +259,25 @@ int readAtom_gro(Atom* atom, Parameter* param) {
    desc[i] = '\0';
    readline(line, fp);
    atoms_to_read = atoi(strtok(line, " "));
-    fprintf(stdout, "System: %s with %d atoms\n", desc, atoms_to_read);
+    if(me==0) fprintf(stdout, "System: %s with %d atoms\n", desc, atoms_to_read);

    while(!feof(fp) && read_atoms < atoms_to_read) {
        readline(line, fp);
-        char *label = strtok(line, " ");
-        int type = type_str2int(strtok(NULL, " "));
-        int atom_id = atoi(strtok(NULL, " ")) - 1;
+        char *label = strtok(line, "\t ");
+        int type = type_str2int(strtok(NULL, "\t "));
+        int atom_id = atoi(strtok(NULL, "\t ")) - 1;
        atom_id = read_atoms;
        while(atom_id + 1 >= atom->Nmax) {
            growAtom(atom);
        }

        atom->type[atom_id] = type;
-        atom_x(atom_id) = atof(strtok(NULL, " "));
-        atom_y(atom_id) = atof(strtok(NULL, " "));
-        atom_z(atom_id) = atof(strtok(NULL, " "));
-        atom->vx[atom_id] = atof(strtok(NULL, " "));
-        atom->vy[atom_id] = atof(strtok(NULL, " "));
-        atom->vz[atom_id] = atof(strtok(NULL, " "));
+        atom_x(atom_id) = atof(strtok(NULL, "\t "));
+        atom_y(atom_id) = atof(strtok(NULL, "\t "));
+        atom_z(atom_id) = atof(strtok(NULL, "\t "));
+        atom->vx[atom_id] = atof(strtok(NULL, "\t "));
+        atom->vy[atom_id] = atof(strtok(NULL, "\t "));
+        atom->vz[atom_id] = atof(strtok(NULL, "\t "));
        atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
        atom->Natoms++;
        atom->Nlocal++;
@@ -267,18 +287,18 @@ int readAtom_gro(Atom* atom, Parameter* param) {
    if(!feof(fp)) {
        readline(line, fp);
        param->xlo = 0.0;
-        param->xhi = atof(strtok(line, " "));
+        param->xhi = atof(strtok(line, "\t "));
        param->ylo = 0.0;
-        param->yhi = atof(strtok(NULL, " "));
+        param->yhi = atof(strtok(NULL, "\t "));
        param->zlo = 0.0;
-        param->zhi = atof(strtok(NULL, " "));
+        param->zhi = atof(strtok(NULL, "\t "));
        param->xprd = param->xhi - param->xlo;
        param->yprd = param->yhi - param->ylo;
        param->zprd = param->zhi - param->zlo;
    }

    if(read_atoms != atoms_to_read) {
-        fprintf(stderr, "Input error: Number of atoms read do not match (%d/%d).\n", read_atoms, atoms_to_read);
+        if(me==0) fprintf(stderr, "Input error: Number of atoms read do not match (%d/%d).\n", read_atoms, atoms_to_read);
        exit(-1);
        return -1;
    }
@@ -294,12 +314,14 @@ int readAtom_gro(Atom* atom, Parameter* param) {
        atom->cutforcesq[i] = param->cutforce * param->cutforce;
    }

-    fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
+    if(me==0) fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
    fclose(fp);
    return read_atoms;
 }

 int readAtom_dmp(Atom* atom, Parameter* param) {
+    int me = 0;
+    MPI_Comm_rank(MPI_COMM_WORLD, &me);
    FILE *fp = fopen(param->input_file, "r");
    char line[MAXLINE];
    int natoms = 0;
@@ -308,7 +330,7 @@ int readAtom_dmp(Atom* atom, Parameter* param) {
    int ts = -1;

    if(!fp) {
-        fprintf(stderr, "Could not open input file: %s\n", param->input_file);
+        if(me==0) fprintf(stderr, "Could not open input file: %s\n", param->input_file);
        exit(-1);
        return -1;
    }
@@ -331,47 +353,47 @@ int readAtom_dmp(Atom* atom, Parameter* param) {
                }
            } else if(strncmp(item, "BOX BOUNDS pp pp pp", 19) == 0) {
                readline(line, fp);
-                param->xlo = atof(strtok(line, " "));
-                param->xhi = atof(strtok(NULL, " "));
+                param->xlo = atof(strtok(line, "\t "));
+                param->xhi = atof(strtok(NULL, "\t "));
                param->xprd = param->xhi - param->xlo;

                readline(line, fp);
-                param->ylo = atof(strtok(line, " "));
-                param->yhi = atof(strtok(NULL, " "));
+                param->ylo = atof(strtok(line, "\t "));
+                param->yhi = atof(strtok(NULL, "\t "));
                param->yprd = param->yhi - param->ylo;

                readline(line, fp);
-                param->zlo = atof(strtok(line, " "));
-                param->zhi = atof(strtok(NULL, " "));
+                param->zlo = atof(strtok(line, "\t "));
+                param->zhi = atof(strtok(NULL, "\t "));
                param->zprd = param->zhi - param->zlo;
            } else if(strncmp(item, "ATOMS id type x y z vx vy vz", 28) == 0) {
                for(int i = 0; i < natoms; i++) {
                    readline(line, fp);
-                    atom_id = atoi(strtok(line, " ")) - 1;
-                    atom->type[atom_id] = atoi(strtok(NULL, " "));
-                    atom_x(atom_id) = atof(strtok(NULL, " "));
-                    atom_y(atom_id) = atof(strtok(NULL, " "));
-                    atom_z(atom_id) = atof(strtok(NULL, " "));
-                    atom->vx[atom_id] = atof(strtok(NULL, " "));
-                    atom->vy[atom_id] = atof(strtok(NULL, " "));
-                    atom->vz[atom_id] = atof(strtok(NULL, " "));
+                    atom_id = atoi(strtok(line, "\t ")) - 1;
+                    atom->type[atom_id] = atoi(strtok(NULL, "\t "));
+                    atom_x(atom_id) = atof(strtok(NULL, "\t "));
+                    atom_y(atom_id) = atof(strtok(NULL, "\t "));
+                    atom_z(atom_id) = atof(strtok(NULL, "\t "));
+                    atom->vx[atom_id] = atof(strtok(NULL, "\t "));
+                    atom->vy[atom_id] = atof(strtok(NULL, "\t "));
+                    atom->vz[atom_id] = atof(strtok(NULL, "\t "));
                    atom->ntypes = MAX(atom->type[atom_id], atom->ntypes);
                    read_atoms++;
                }
            } else {
-                fprintf(stderr, "Invalid item: %s\n", item);
+                if(me==0) fprintf(stderr, "Invalid item: %s\n", item);
                exit(-1);
                return -1;
            }
        } else {
-            fprintf(stderr, "Invalid input from file, expected item reference but got:\n%s\n", line);
+            if(me==0) fprintf(stderr, "Invalid input from file, expected item reference but got:\n%s\n", line);
            exit(-1);
            return -1;
        }
    }

    if(ts < 0 || !natoms || !read_atoms) {
-        fprintf(stderr, "Input error: atom data was not read!\n");
+        if(me==0) fprintf(stderr, "Input error: atom data was not read!\n");
        exit(-1);
        return -1;
    }
@@ -387,11 +409,118 @@ int readAtom_dmp(Atom* atom, Parameter* param) {
        atom->cutforcesq[i] = param->cutforce * param->cutforce;
    }

-    fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
+    if(me==0) fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
    fclose(fp);
    return natoms;
 }

+void initMasks(Atom *atom) { // Allocate the exclusion/diagonal tables of atom and fill its 2xNN/4xN mask tables.
+    const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
+    unsigned int mask0, mask1, mask2, mask3;
+    // The two diagonal tables are declared MD_FLOAT* in Atom, so size them with sizeof(MD_FLOAT).
+    atom->exclusion_filter = allocate(ALIGNMENT, CLUSTER_M * VECTOR_WIDTH * sizeof(MD_UINT));
+    atom->diagonal_4xn_j_minus_i = allocate(ALIGNMENT, MAX(CLUSTER_M, VECTOR_WIDTH) * sizeof(MD_FLOAT));
+    atom->diagonal_2xnn_j_minus_i = allocate(ALIGNMENT, VECTOR_WIDTH * sizeof(MD_FLOAT));
+    // The masks_* tables are fixed-size arrays inside Atom and need no allocation.
+    // diagonal_4xn_j_minus_i[j] = j - 0.5
+    for(int j = 0; j < MAX(CLUSTER_M, VECTOR_WIDTH); j++) {
+        atom->diagonal_4xn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5;
+    }
+    // diagonal_2xnn_j_minus_i: j - 0.5 in the lower half, (j - 1) - 0.5 in the upper half
+    for(int j = 0; j < VECTOR_WIDTH / 2; j++) {
+        atom->diagonal_2xnn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5;
+        atom->diagonal_2xnn_j_minus_i[VECTOR_WIDTH / 2 + j] = (MD_FLOAT)(j - 1) - 0.5;
+    }
+    // NOTE(review): 1U << i is undefined once i reaches the width of unsigned int; confirm CLUSTER_M * VECTOR_WIDTH stays within it.
+    for(int i = 0; i < CLUSTER_M * VECTOR_WIDTH; i++) {
+        atom->exclusion_filter[i] = (1U << i);
+    }
+    // Mask tables: one variant per cluster shape (M == N, M < N, M > N).
+    #if CLUSTER_M == CLUSTER_N
+    for(unsigned int cond0 = 0; cond0 < 2; cond0++) {
+        mask0 = (unsigned int)(0xf - 0x1 * cond0);
+        mask1 = (unsigned int)(0xf - 0x3 * cond0);
+        mask2 = (unsigned int)(0xf - 0x7 * cond0);
+        mask3 = (unsigned int)(0xf - 0xf * cond0);
+        atom->masks_2xnn_hn[cond0 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
+        atom->masks_2xnn_hn[cond0 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
+
+        mask0 = (unsigned int)(0xf - 0x1 * cond0);
+        mask1 = (unsigned int)(0xf - 0x2 * cond0);
+        mask2 = (unsigned int)(0xf - 0x4 * cond0);
+        mask3 = (unsigned int)(0xf - 0x8 * cond0);
+        atom->masks_2xnn_fn[cond0 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
+        atom->masks_2xnn_fn[cond0 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
+
+        atom->masks_4xn_hn[cond0 * 4 + 0] = (unsigned int)(0xf - 0x1 * cond0);
+        atom->masks_4xn_hn[cond0 * 4 + 1] = (unsigned int)(0xf - 0x3 * cond0);
+        atom->masks_4xn_hn[cond0 * 4 + 2] = (unsigned int)(0xf - 0x7 * cond0);
+        atom->masks_4xn_hn[cond0 * 4 + 3] = (unsigned int)(0xf - 0xf * cond0);
+
+        atom->masks_4xn_fn[cond0 * 4 + 0] = (unsigned int)(0xf - 0x1 * cond0);
+        atom->masks_4xn_fn[cond0 * 4 + 1] = (unsigned int)(0xf - 0x2 * cond0);
+        atom->masks_4xn_fn[cond0 * 4 + 2] = (unsigned int)(0xf - 0x4 * cond0);
+        atom->masks_4xn_fn[cond0 * 4 + 3] = (unsigned int)(0xf - 0x8 * cond0);
+    }
+    #else
+    for(unsigned int cond0 = 0; cond0 < 2; cond0++) {
+        for(unsigned int cond1 = 0; cond1 < 2; cond1++) {
+            #if CLUSTER_M < CLUSTER_N
+            mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
+            mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
+            mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
+            mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
+            #else
+            mask0 = (unsigned int)(0x3 - 0x1 * cond0);
+            mask1 = (unsigned int)(0x3 - 0x3 * cond0);
+            mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1);
+            mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1);
+            #endif
+
+            atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
+            atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
+
+            #if CLUSTER_M < CLUSTER_N
+            mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
+            mask1 = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
+            mask2 = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
+            mask3 = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
+            #else
+            mask0 = (unsigned int)(0x3 - 0x1 * cond0);
+            mask1 = (unsigned int)(0x3 - 0x2 * cond0);
+            mask2 = (unsigned int)(0x3 - 0x1 * cond1);
+            mask3 = (unsigned int)(0x3 - 0x2 * cond1);
+            #endif
+
+            atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
+            atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
+
+            #if CLUSTER_M < CLUSTER_N
+            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
+            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
+            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
+            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
+
+            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
+            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
+            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
+            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
+            #else
+            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x1 * cond0);
+            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0x3 - 0x3 * cond0);
+            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0x3 - 0x3 * cond0 - 0x1 * cond1);
+            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0x3 - 0x3 * cond0 - 0x3 * cond1);
+            // Entries +0..+3 mirror mask0..mask3 of the masks_2xnn_fn case above; each needs its own slot.
+            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x1 * cond0);
+            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0x3 - 0x2 * cond0);
+            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0x3 - 0x1 * cond1);
+            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0x3 - 0x2 * cond1);
+            #endif
+        }
+    }
+    #endif
+}
+
 void growAtom(Atom *atom) {
    int nold = atom->Nmax;
    atom->Nmax += DELTA;
@@ -421,3 +550,249 @@ void growClusters(Atom *atom) {
    atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
    atom->cl_type = (int*) reallocate(atom->cl_type, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * sizeof(int), nold * CLUSTER_M * sizeof(int));
 }
+
+/* MPI: grow the PBCx/PBCy/PBCz arrays of atom by DELTA entries (capacity tracked in NmaxGhost). */
+void growPbc(Atom* atom) {
+    int nold = atom->NmaxGhost;   // capacity before growing, passed to reallocate as the old size
+    atom->NmaxGhost += DELTA;
+
+    if (atom->PBCx || atom->PBCy || atom->PBCz){   // at least one array already exists: resize all three
+        atom->PBCx = (int*) reallocate(atom->PBCx, ALIGNMENT, atom->NmaxGhost * sizeof(int), nold * sizeof(int));
+        atom->PBCy = (int*) reallocate(atom->PBCy, ALIGNMENT, atom->NmaxGhost * sizeof(int), nold * sizeof(int));
+        atom->PBCz = (int*) reallocate(atom->PBCz, ALIGNMENT, atom->NmaxGhost * sizeof(int), nold * sizeof(int));
+    } else {   // first call: all three pointers are still NULL
+        // NOTE(review): plain malloc here vs. reallocate(..., ALIGNMENT, ...) above; confirm both can be released the same way. Result is not NULL-checked.
+        atom->PBCx = (int*) malloc(atom->NmaxGhost * sizeof(int));
+        atom->PBCy = (int*) malloc(atom->NmaxGhost * sizeof(int));
+        atom->PBCz = (int*) malloc(atom->NmaxGhost * sizeof(int));
+    } 
+}
+
+void packForward(Atom* atom, int nc, int* list, MD_FLOAT* buf, int* pbc)
+{   // Copy the positions of the nc j-clusters named in list into buf, shifted by pbc[] multiples of the box lengths.
+    for(int i = 0; i < nc; i++) {
+        int cj = list[i];
+        int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
+        MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
+        int displ = i*CLUSTER_N;   // cluster i occupies CLUSTER_N slots of 3 values each in buf
+        
+        for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
+            buf[3*(displ+cjj)+0] = cj_x[CL_X_OFFSET + cjj] + pbc[_x] * atom->mybox.xprd;
+            buf[3*(displ+cjj)+1] = cj_x[CL_Y_OFFSET + cjj] + pbc[_y] * atom->mybox.yprd;
+            buf[3*(displ+cjj)+2] = cj_x[CL_Z_OFFSET + cjj] + pbc[_z] * atom->mybox.zprd; 
+        }
+        // Slots beyond natoms carry the placeholder value -1.
+        for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) {
+            buf[3*(displ+cjj)+0] = -1; //x 
+            buf[3*(displ+cjj)+1] = -1; //y
+            buf[3*(displ+cjj)+2] = -1; //z
+        }
+    }
+}
+
+void unpackForward(Atom* atom, int nc, int c0, MD_FLOAT* buf)
+{   // Copy positions from buf into the nc consecutive j-clusters starting at index c0.
+    for(int i = 0; i < nc; i++) {
+        int cj = c0+i;
+        int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); 
+        MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
+        int displ = i*CLUSTER_N;   // cluster i occupies CLUSTER_N slots of 3 values each in buf
+        // Only coordinates that are currently finite (< INFINITY) are overwritten.
+        for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
+            if(cj_x[CL_X_OFFSET + cjj]<INFINITY) cj_x[CL_X_OFFSET + cjj] = buf[3*(displ+cjj)+0];
+            if(cj_x[CL_Y_OFFSET + cjj]<INFINITY) cj_x[CL_Y_OFFSET + cjj] = buf[3*(displ+cjj)+1]; 
+            if(cj_x[CL_Z_OFFSET + cjj]<INFINITY) cj_x[CL_Z_OFFSET + cjj] = buf[3*(displ+cjj)+2];
+        }
+    }
+}
+
+int packGhost(Atom* atom, int cj, MD_FLOAT* buf, int* pbc)
+{   // Serialize j-cluster cj into buf with positions shifted by pbc[] box lengths; returns the number of values written (0 if the cluster is empty).
+    // Layout per cluster: natoms, (x, y, z, type) for each of the CLUSTER_N slots, bbminx, bbmaxx, bbminy, bbmaxy, bbminz, bbmaxz, PBC x, PBC y, PBC z
+    // count = 4*CLUSTER_N + 7 values plus the 3 PBC counters; e.g. CLUSTER_N = 4 gives 23 + 3 values per cluster
+    int m = 0;
+    if(atom->jclusters[cj].natoms > 0) {    
+        int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
+        int cj_sca_base = CJ_SCALAR_BASE_INDEX(cj); 
+        MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
+        MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
+        MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
+        MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;
+        
+        buf[m++] = atom->jclusters[cj].natoms;
+
+        for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
+        
+            MD_FLOAT xtmp = cj_x[CL_X_OFFSET + cjj] + pbc[_x] * atom->mybox.xprd;
+            MD_FLOAT ytmp = cj_x[CL_Y_OFFSET + cjj] + pbc[_y] * atom->mybox.yprd;
+            MD_FLOAT ztmp = cj_x[CL_Z_OFFSET + cjj] + pbc[_z] * atom->mybox.zprd;
+            
+            buf[m++] = xtmp;
+            buf[m++] = ytmp;
+            buf[m++] = ztmp;
+            buf[m++]= atom->cl_type[cj_sca_base + cjj];
+            // Track the bounding box of the shifted positions.
+            if(bbminx > xtmp) { bbminx = xtmp; }
+            if(bbmaxx < xtmp) { bbmaxx = xtmp; }
+            if(bbminy > ytmp) { bbminy = ytmp; }
+            if(bbmaxy < ytmp) { bbmaxy = ytmp; }
+            if(bbminz > ztmp) { bbminz = ztmp; }
+            if(bbmaxz < ztmp) { bbmaxz = ztmp; }
+        }
+        // Pad the unused slots with -1 so every non-empty cluster has the same packed size.
+        for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) {
+            buf[m++] = -1; //x 
+            buf[m++] = -1; //y
+            buf[m++] = -1; //z
+            buf[m++] = -1; //type
+        }
+        
+        buf[m++] = bbminx;
+        buf[m++] = bbmaxx;
+        buf[m++] = bbminy;
+        buf[m++] = bbmaxy;
+        buf[m++] = bbminz;
+        buf[m++] = bbmaxz;
+        //TODO: check atom->ncj
+        int ghostId = cj-atom->ncj;
+        // Clusters at index >= ncj add the PBC counters already stored for them; others send pbc[] unchanged.
+        buf[m++] = (cj-atom->ncj>=0) ? pbc[_x]+atom->PBCx[ghostId]:pbc[_x];
+        buf[m++] = (cj-atom->ncj>=0) ? pbc[_y]+atom->PBCy[ghostId]:pbc[_y];
+        buf[m++] = (cj-atom->ncj>=0) ? pbc[_z]+atom->PBCz[ghostId]:pbc[_z];
+    }  
+    return m;
+}
+    
+int unpackGhost(Atom* atom, int cj, MD_FLOAT* buf)
+{   // Decode one packed ghost cluster from buf into j-cluster slot cj; returns the number of buf values consumed.
+    int m = 0;
+    int jfac =  MAX(1, CLUSTER_N / CLUSTER_M);
+    if(cj*jfac>=atom->Nclusters_max) growClusters(atom);
+    if(atom->Nclusters_ghost>=atom->NmaxGhost) growPbc(atom); 
+
+    int cj_sca_base = CJ_SCALAR_BASE_INDEX(cj); 
+    int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
+    MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; 
+
+    atom->jclusters[cj].natoms = buf[m++];
+    for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
+        // x, y, z and type of one real atom; each one also counts as a ghost atom
+        cj_x[CL_X_OFFSET + cjj] = buf[m++];
+        cj_x[CL_Y_OFFSET + cjj] = buf[m++];
+        cj_x[CL_Z_OFFSET + cjj] = buf[m++];
+        atom->cl_type[cj_sca_base + cjj] = buf[m++];
+        atom->Nghost++;
+    }
+    // Unused slots: mark as INFINITY / type -1 and skip their 4 placeholder values in buf.
+    for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) {
+        cj_x[CL_X_OFFSET + cjj] = INFINITY;
+        cj_x[CL_Y_OFFSET + cjj] = INFINITY;
+        cj_x[CL_Z_OFFSET + cjj] = INFINITY;
+        atom->cl_type[cj_sca_base + cjj] = -1;
+        m+=4;
+    }
+    // Bounding box, then the PBC counters stored at the next free ghost index.
+    atom->jclusters[cj].bbminx = buf[m++];
+    atom->jclusters[cj].bbmaxx = buf[m++];
+    atom->jclusters[cj].bbminy = buf[m++];
+    atom->jclusters[cj].bbmaxy = buf[m++];
+    atom->jclusters[cj].bbminz = buf[m++];
+    atom->jclusters[cj].bbmaxz = buf[m++];
+    atom->PBCx[atom->Nclusters_ghost] = buf[m++];
+    atom->PBCy[atom->Nclusters_ghost] = buf[m++];
+    atom->PBCz[atom->Nclusters_ghost] = buf[m++]; 
+    atom->Nclusters_ghost++;
+    return m;   // the function is declared int; previously control fell off the end without a value
+}
+   
+void packReverse(Atom* atom, int nc, int c0, MD_FLOAT* buf)
+{   // Copy the cl_f entries of the nc consecutive j-clusters starting at index c0 into buf.
+    for(int i = 0; i < nc; i++) {
+        int cj = c0+i;
+        int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); 
+        MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
+        int displ = i*CLUSTER_N;   // cluster i occupies CLUSTER_N slots of 3 values each in buf
+
+        for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
+            buf[3*(displ+cjj)+0] = cj_f[CL_X_OFFSET + cjj];
+            buf[3*(displ+cjj)+1] = cj_f[CL_Y_OFFSET + cjj]; 
+            buf[3*(displ+cjj)+2] = cj_f[CL_Z_OFFSET + cjj]; 
+        }
+        // Slots beyond natoms carry the placeholder value -1.
+        for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) {
+            buf[3*(displ+cjj)+0] = -1; //x 
+            buf[3*(displ+cjj)+1] = -1; //y
+            buf[3*(displ+cjj)+2] = -1; //z
+        }
+    }
+}
+
+void unpackReverse(Atom* atom, int nc, int* list, MD_FLOAT* buf)
+{   // Accumulate (+=) the values in buf onto the cl_f entries of the nc j-clusters named in list.
+    for(int i = 0; i < nc; i++) {
+        int cj = list[i];
+        int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
+        MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
+        int displ = i*CLUSTER_N;   // cluster i occupies CLUSTER_N slots of 3 values each in buf
+        // Only the first natoms slots are read; the remaining slots of buf are ignored.
+        for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
+            cj_f[CL_X_OFFSET + cjj] += buf[3*(displ+cjj)+0];
+            cj_f[CL_Y_OFFSET + cjj] += buf[3*(displ+cjj)+1]; 
+            cj_f[CL_Z_OFFSET + cjj] += buf[3*(displ+cjj)+2];
+        }
+    }
+}
+
+int packExchange(Atom* atom, int i, MD_FLOAT* buf)
+{
+  // Serialize atom i as 7 consecutive values (position, velocity, type); returns that count.
+  buf[0] = atom_x(i);
+  buf[1] = atom_y(i);
+  buf[2] = atom_z(i);
+  buf[3] = atom_vx(i);
+  buf[4] = atom_vy(i);
+  buf[5] = atom_vz(i);
+  buf[6] = atom->type[i];
+  return 7;
+}
+
+int unpackExchange(Atom* atom, int i, MD_FLOAT* buf)
+{
+  while(atom->Nmax <= i) { growAtom(atom); }
+  // Read the 7 values of one packed atom into slot i; returns the number of values consumed.
+  atom_x(i) = buf[0];
+  atom_y(i) = buf[1];
+  atom_z(i) = buf[2];
+  atom_vx(i) = buf[3];
+  atom_vy(i) = buf[4];
+  atom_vz(i) = buf[5];
+  atom->type[i] = buf[6];
+  return 7;
+}
+
+void pbc(Atom* atom)
+{
+  // Wrap the coordinates of all local atoms using the box lengths stored in atom->mybox.
+  const MD_FLOAT lx = atom->mybox.xprd;
+  const MD_FLOAT ly = atom->mybox.yprd;
+  const MD_FLOAT lz = atom->mybox.zprd;
+  for(int i = 0; i < atom->Nlocal; ++i) {
+    // Per axis: lower-bound check first, then a separate upper-bound check (sequential, not else-if).
+    if(atom_x(i) < 0.0) { atom_x(i) += lx; }
+    if(atom_x(i) >= lx) { atom_x(i) -= lx; }
+    if(atom_y(i) < 0.0) { atom_y(i) += ly; }
+    if(atom_y(i) >= ly) { atom_y(i) -= ly; }
+    if(atom_z(i) < 0.0) { atom_z(i) += lz; }
+    if(atom_z(i) >= lz) { atom_z(i) -= lz; }
+  }
+}
+
+void copy(Atom* atom, int i, int j)
+{
+  // Duplicate atom j into slot i (type, position and velocity).
+  // The assignments are independent of each other, so they are grouped
+  // per axis here.
+  atom->type[i] = atom->type[j];
+  atom_x(i) = atom_x(j); atom_vx(i) = atom_vx(j);
+  atom_y(i) = atom_y(j); atom_vy(i) = atom_vy(j);
+  atom_z(i) = atom_z(j); atom_vz(i) = atom_vz(j);
+}
--- a/gromacs/force_lj.c
+++ b/gromacs/force_lj.c
--- a/gromacs/includes/atom.h
+++ b/gromacs/includes/atom.h
@@ -5,6 +5,7 @@
 * license that can be found in the LICENSE file.
 */
 #include <parameter.h>
+#include <box.h>

 #ifndef __ATOM_H_
 #define __ATOM_H_
@@ -22,6 +23,7 @@
 #   define KERNEL_NAME              "CUDA"
 #   define CLUSTER_M                8
 #   define CLUSTER_N                VECTOR_WIDTH
+#   define UNROLL_J                 1
 #   define computeForceLJ           computeForceLJ_cuda
 #   define initialIntegrate         cudaInitialIntegrate
 #   define finalIntegrate           cudaFinalIntegrate
@@ -32,11 +34,15 @@
 #   if VECTOR_WIDTH > CLUSTER_M * 2
 #       define KERNEL_NAME          "Simd2xNN"
 #       define CLUSTER_N            (VECTOR_WIDTH / 2)
+#       define UNROLL_I             4
+#       define UNROLL_J             2
 #       define computeForceLJ       computeForceLJ_2xnn
 // Simd4xN
 #   else
 #       define KERNEL_NAME          "Simd4xN"
 #       define CLUSTER_N            VECTOR_WIDTH
+#       define UNROLL_I             4
+#       define UNROLL_J             1
 #       define computeForceLJ       computeForceLJ_4xn
 #   endif
 #   ifdef USE_REFERENCE_VERSION
@@ -97,7 +103,7 @@ typedef struct {

 typedef struct {
    int Natoms, Nlocal, Nghost, Nmax;
-    int Nclusters, Nclusters_local, Nclusters_ghost, Nclusters_max;
+    int Nclusters, Nclusters_local, Nclusters_ghost, Nclusters_max, NmaxGhost,ncj;
    MD_FLOAT *x, *y, *z;
    MD_FLOAT *vx, *vy, *vz;
    int *border_map;
@@ -107,6 +113,7 @@ typedef struct {
    MD_FLOAT *sigma6;
    MD_FLOAT *cutforcesq;
    MD_FLOAT *cutneighsq;
+    //track the movement of a particle along boundaries
    int *PBCx, *PBCy, *PBCz;
    // Data in cluster format
    MD_FLOAT *cl_x;
@@ -116,9 +123,20 @@ typedef struct {
    Cluster *iclusters, *jclusters;
    int *icluster_bin;
    int dummy_cj;
+    MD_UINT *exclusion_filter;
+    MD_FLOAT *diagonal_4xn_j_minus_i;
+    MD_FLOAT *diagonal_2xnn_j_minus_i;
+    unsigned int masks_2xnn_hn[8];
+    unsigned int masks_2xnn_fn[8];
+    unsigned int masks_4xn_hn[16];
+    unsigned int masks_4xn_fn[16];
+
+    //Info Subdomain
+    Box mybox;   
 } Atom;

 extern void initAtom(Atom*);
+extern void initMasks(Atom*);
 extern void createAtom(Atom*, Parameter*);
 extern int readAtom(Atom*, Parameter*);
 extern int readAtom_pdb(Atom*, Parameter*);
@@ -127,6 +145,18 @@ extern int readAtom_dmp(Atom*, Parameter*);
 extern void growAtom(Atom*);
 extern void growClusters(Atom*);

+int  packGhost(Atom*, int, MD_FLOAT* , int*);   // serialize one j-cluster (with PBC shift) into a buffer; returns values written
+int  unpackGhost(Atom*, int, MD_FLOAT*);   // read one packed ghost cluster from a buffer into the given j-cluster slot
+int  packExchange(Atom*, int, MD_FLOAT*);   // write position, velocity and type of one atom to a buffer; returns values written
+int  unpackExchange(Atom*, int, MD_FLOAT*);   // read position, velocity and type of one atom from a buffer; returns values read
+void packForward(Atom*, int, int*, MD_FLOAT*, int*);   // positions of the listed clusters -> buffer (with PBC shift)
+void unpackForward(Atom*, int, int, MD_FLOAT*);   // buffer -> positions of consecutive clusters starting at an index
+void packReverse(Atom* , int , int , MD_FLOAT*);   // cl_f of consecutive clusters starting at an index -> buffer
+void unpackReverse(Atom*, int, int*, MD_FLOAT*);   // buffer values are added onto cl_f of the listed clusters
+void pbc(Atom*);   // wrap local atom coordinates using the box lengths in mybox
+void copy(Atom*, int, int);   // copy position, velocity and type of atom j (third arg) into slot i (second arg)
+
+
 #ifdef AOS
 #   define POS_DATA_LAYOUT     "AoS"
 #   define atom_x(i)           atom->x[(i) * 3 + 0]
--- a/gromacs/includes/integrate.h
+++ b/gromacs/includes/integrate.h
@@ -9,10 +9,13 @@
 #include <atom.h>
 #include <parameter.h>
 #include <util.h>
-
+#include <timers.h>
+#include <timing.h>
+#include <simd.h>
+/*
 void cpuInitialIntegrate(Parameter *param, Atom *atom) {
+  
    DEBUG_MESSAGE("cpuInitialIntegrate start\n");
-
    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
        MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
@@ -32,9 +35,9 @@ void cpuInitialIntegrate(Parameter *param, Atom *atom) {
    DEBUG_MESSAGE("cpuInitialIntegrate end\n");
 }

-void cpuFinalIntegrate(Parameter *param, Atom *atom) {
-    DEBUG_MESSAGE("cpuFinalIntegrate start\n");
+void  cpuFinalIntegrate(Parameter *param, Atom *atom) {

+    DEBUG_MESSAGE("cpuFinalIntegrate start\n");
    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
        MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
@@ -46,6 +49,56 @@ void cpuFinalIntegrate(Parameter *param, Atom *atom) {
            ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii];
        }
    }
+    DEBUG_MESSAGE("cpuFinalIntegrate end\n");
+}
+*/
+
+// SIMD variant of the first integration half-step: updates velocities, then positions.
+void cpuInitialIntegrate(Parameter *param, Atom *atom) {
+    DEBUG_MESSAGE("cpuInitialIntegrate start\n");
+    // NOTE(review): ci advances by 2 per iteration -- confirm this matches the cluster/vector layout.
+    for(int ci = 0; ci < atom->Nclusters_local; ci += 2) {
+        const int base = CI_VECTOR_BASE_INDEX(ci);
+        MD_FLOAT *pos = &atom->cl_x[base];
+        MD_FLOAT *vel = &atom->cl_v[base];
+        MD_FLOAT *frc = &atom->cl_f[base];
+        MD_SIMD_FLOAT dtforce_vec = simd_broadcast(param->dtforce);
+        MD_SIMD_FLOAT dt_vec = simd_broadcast(param->dt);
+        // New velocity per component: simd_fma(f, dtforce, v) (presumably f * dtforce + v).
+        MD_SIMD_FLOAT vel_x = simd_fma(simd_load(&frc[CL_X_OFFSET]), dtforce_vec, simd_load(&vel[CL_X_OFFSET]));
+        MD_SIMD_FLOAT vel_y = simd_fma(simd_load(&frc[CL_Y_OFFSET]), dtforce_vec, simd_load(&vel[CL_Y_OFFSET]));
+        MD_SIMD_FLOAT vel_z = simd_fma(simd_load(&frc[CL_Z_OFFSET]), dtforce_vec, simd_load(&vel[CL_Z_OFFSET]));
+        // New position per component, built from the freshly computed velocity: simd_fma(v, dt, x).
+        MD_SIMD_FLOAT pos_x = simd_fma(vel_x, dt_vec, simd_load(&pos[CL_X_OFFSET]));
+        MD_SIMD_FLOAT pos_y = simd_fma(vel_y, dt_vec, simd_load(&pos[CL_Y_OFFSET]));
+        MD_SIMD_FLOAT pos_z = simd_fma(vel_z, dt_vec, simd_load(&pos[CL_Z_OFFSET]));
+
+        simd_store(&vel[CL_X_OFFSET], vel_x);
+        simd_store(&vel[CL_Y_OFFSET], vel_y);
+        simd_store(&vel[CL_Z_OFFSET], vel_z);
+        simd_store(&pos[CL_X_OFFSET], pos_x);
+        simd_store(&pos[CL_Y_OFFSET], pos_y);
+        simd_store(&pos[CL_Z_OFFSET], pos_z);
+    }
+    DEBUG_MESSAGE("cpuInitialIntegrate end\n");
+}
+
+// SIMD variant of the second integration half-step: only the velocities are updated.
+void cpuFinalIntegrate(Parameter *param, Atom *atom) {
+    DEBUG_MESSAGE("cpuFinalIntegrate start\n");
+    for(int ci = 0; ci < atom->Nclusters_local; ci += 2) {
+        const int base = CI_VECTOR_BASE_INDEX(ci);
+        MD_FLOAT *vel = &atom->cl_v[base];
+        MD_FLOAT *frc = &atom->cl_f[base];
+        MD_SIMD_FLOAT dtforce_vec = simd_broadcast(param->dtforce);
+        // New velocity per component: simd_fma(f, dtforce, v) (presumably f * dtforce + v).
+        MD_SIMD_FLOAT vel_x = simd_fma(simd_load(&frc[CL_X_OFFSET]), dtforce_vec, simd_load(&vel[CL_X_OFFSET]));
+        MD_SIMD_FLOAT vel_y = simd_fma(simd_load(&frc[CL_Y_OFFSET]), dtforce_vec, simd_load(&vel[CL_Y_OFFSET]));
+        MD_SIMD_FLOAT vel_z = simd_fma(simd_load(&frc[CL_Z_OFFSET]), dtforce_vec, simd_load(&vel[CL_Z_OFFSET]));
+        simd_store(&vel[CL_X_OFFSET], vel_x);
+        simd_store(&vel[CL_Y_OFFSET], vel_y);
+        simd_store(&vel[CL_Z_OFFSET], vel_z);
+    }

    DEBUG_MESSAGE("cpuFinalIntegrate end\n");
 }
@@ -54,3 +107,6 @@ void cpuFinalIntegrate(Parameter *param, Atom *atom) {
 void cudaInitialIntegrate(Parameter*, Atom*);
 void cudaFinalIntegrate(Parameter*, Atom*);
 #endif
+
+
+   
--- a/gromacs/includes/neighbor.h
+++ b/gromacs/includes/neighbor.h
@@ -9,15 +9,50 @@

 #ifndef __NEIGHBOR_H_
 #define __NEIGHBOR_H_
+// Interaction masks from GROMACS. Things to remember (maybe these confused only me):
+//   1. These are not "exclusion" masks as the GROMACS name suggests, but rather
+//      interaction masks (1 = interaction, 0 = no interaction)
+//   2. These are inverted (maybe because that is how they are used in AVX2/AVX512 masking),
+//      so read them from right to left (least significant to most significant bit)
+// The all-interactions mask is the same for all kernels
+#define NBNXN_INTERACTION_MASK_ALL 0xffffffffU
+// 4x4 kernel diagonal mask
+#define NBNXN_INTERACTION_MASK_DIAG 0x08ceU
+// 4x2 kernel diagonal masks
+#define NBNXN_INTERACTION_MASK_DIAG_J2_0 0x0002U
+#define NBNXN_INTERACTION_MASK_DIAG_J2_1 0x002fU
+// 4x8 kernel diagonal masks
+#define NBNXN_INTERACTION_MASK_DIAG_J8_0 0xf0f8fcfeU
+#define NBNXN_INTERACTION_MASK_DIAG_J8_1 0x0080c0e0U
+
+typedef struct {
+    int cluster;
+    int atom;
+} Pair; // NOTE(review): in this header only the commented-out shell fields of Neighbor use Pair -- confirm it is still needed
+
 typedef struct {
    int every;
    int ncalls;
-    int* neighbors;
    int maxneighs;
    int* numneigh;
+    int* numneigh_masked;           // per i-cluster count; presumably the entries whose imask is not MASK_ALL -- confirm in buildNeighbor
    int half_neigh;
+    int* neighbors;
+    unsigned int* neighbors_imask;  // one interaction mask per entry of neighbors (same indexing)
+    // MPI shell-method data
+    /*
+    int Nshell;         //# of atoms in listShell(Cluster here cover all possible ghost interactions)
+    int *numNeighShell; //# of neighs for each atom in listShell
+    int *neighshell;    //list of neighs for each atom in listShell
+    int *listshell;     //Atoms to compute the force
+    */
+    int Nshell;         //# of clusters in listshell (the clusters here cover all possible ghost interactions)
+    int *numNeighShell; //# of neighs for each atom in listShell
+    int *neighshell;    //list of neighs for each atom in listShell
+    int *listshell;     //Atoms to compute the force
 } Neighbor;

+
 extern void initNeighbor(Neighbor*, Parameter*);
 extern void setupNeighbor(Parameter*, Atom*);
 extern void binatoms(Atom*);
--- a/gromacs/includes/vtk.h
+++ b/gromacs/includes/vtk.h
@@ -5,6 +5,8 @@
 * license that can be found in the LICENSE file.
 */
 #include <atom.h>
+#include <comm.h>
+#include <parameter.h>

 #ifndef __VTK_H_
 #define __VTK_H_
@@ -13,4 +15,5 @@ extern int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int t
 extern int write_ghost_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
 extern int write_local_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep);
 extern int write_ghost_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep);
+extern void printvtk(const char* filename, Comm* comm, Atom* atom ,Parameter* param, int timestep);
 #endif
--- a/gromacs/main-stub.c
+++ b/gromacs/main-stub.c
@@ -60,18 +60,15 @@ void init(Parameter *param) {
    param->eam_file = NULL;
 }

-// Show debug messages
-#define DEBUG(msg)  printf(msg)
-// Do not show debug messages
-//#define DEBUG(msg)
-
-
-void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps) {
+void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps, int masked) {
    const int maxneighs = nneighs * nreps;
    const int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
    const int ncj = atom->Nclusters_local / jfac;
+    const unsigned int imask = NBNXN_INTERACTION_MASK_ALL;
    neighbor->numneigh = (int*) malloc(atom->Nclusters_max * sizeof(int));
+    neighbor->numneigh_masked = (int*) malloc(atom->Nclusters_max * sizeof(int));
    neighbor->neighbors = (int*) malloc(atom->Nclusters_max * maxneighs * sizeof(int));
+    neighbor->neighbors_imask = (unsigned int*) malloc(atom->Nclusters_max * maxneighs * sizeof(unsigned int));

    if(pattern == P_RAND && ncj <= nneighs) {
        fprintf(stderr, "Error: P_RAND: Number of j-clusters should be higher than number of j-cluster neighbors per i-cluster!\n");
@@ -80,6 +77,7 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i

    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
        int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
+        unsigned int *neighptr_imask = &(neighbor->neighbors_imask[ci * neighbor->maxneighs]);
        int j = (pattern == P_SEQ) ? CJ0_FROM_CI(ci) : 0;
        int m = (pattern == P_SEQ) ? ncj : nneighs;
        int k = 0;
@@ -90,6 +88,7 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
                do {
                    int cj = rand() % ncj;
                    neighptr[k] = cj;
+                    neighptr_imask[k] = imask;
                    found = 0;
                    for(int l = 0; l < k; l++) {
                        if(neighptr[l] == cj) {
@@ -99,6 +98,7 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
                } while(found == 1);
            } else {
                neighptr[k] = j;
+                neighptr_imask[k] = imask;
                j = (j + 1) % m;
            }
        }
@@ -106,10 +106,12 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
        for(int r = 1; r < nreps; r++) {
            for(int k = 0; k < nneighs; k++) {
                neighptr[r * nneighs + k] = neighptr[k];
+                neighptr_imask[r * nneighs + k] = neighptr_imask[k];
            }
        }

        neighbor->numneigh[ci] = nneighs * nreps;
+        neighbor->numneigh_masked[ci] = (masked == 1) ? (nneighs * nreps) : 0;
    }
 }

@@ -125,12 +127,13 @@ int main(int argc, const char *argv[]) {
    int niclusters = 256;               // Number of local i-clusters
    int iclusters_natoms = CLUSTER_M;   // Number of valid atoms within i-clusters
    int nneighs = 9;                    // Number of j-cluster neighbors per i-cluster
+    int masked = 0;                     // Use masked loop 
    int nreps = 1;
    int csv = 0;

    LIKWID_MARKER_INIT;
    LIKWID_MARKER_REGISTER("force");
-    DEBUG("Initializing parameters...\n");
+    DEBUG_MESSAGE("Initializing parameters...\n");
    init(&param);

    for(int i = 0; i < argc; i++) {
@@ -156,6 +159,10 @@ int main(int argc, const char *argv[]) {
            param.eam_file = strdup(argv[++i]);
            continue;
        }
+        if((strcmp(argv[i], "-m") == 0)) {
+            masked = 1;
+            continue;
+        }
        if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0)) {
            param.ntimes = atoi(argv[++i]);
            continue;
@@ -206,11 +213,11 @@ int main(int argc, const char *argv[]) {
    }

    if(param.force_field == FF_EAM) {
-        DEBUG("Initializing EAM parameters...\n");
+        DEBUG_MESSAGE("Initializing EAM parameters...\n");
        initEam(&eam, &param);
    }

-    DEBUG("Initializing atoms...\n");
+    DEBUG_MESSAGE("Initializing atoms...\n");
    initAtom(atom);
    initStats(&stats);

@@ -226,7 +233,7 @@ int main(int argc, const char *argv[]) {
        atom->cutforcesq[i] = param.cutforce * param.cutforce;
    }

-    DEBUG("Creating atoms...\n");
+    DEBUG_MESSAGE("Creating atoms...\n");
    while(atom->Nmax < niclusters * iclusters_natoms) {
        growAtom(atom);
    }
@@ -281,13 +288,13 @@ int main(int argc, const char *argv[]) {
        printf("Estimated neighborlist data volume (kB): %.4f\n", estim_neighbors_volume / 1000.0);
    }

-    DEBUG("Defining j-clusters...\n");
+    DEBUG_MESSAGE("Defining j-clusters...\n");
    defineJClusters(atom);
-    DEBUG("Initializing neighbor lists...\n");
+    DEBUG_MESSAGE("Initializing neighbor lists...\n");
    initNeighbor(&neighbor, &param);
-    DEBUG("Creating neighbor lists...\n");
-    createNeighbors(atom, &neighbor, pattern, nneighs, nreps);
-    DEBUG("Computing forces...\n");
+    DEBUG_MESSAGE("Creating neighbor lists...\n");
+    createNeighbors(atom, &neighbor, pattern, nneighs, nreps, masked);
+    DEBUG_MESSAGE("Computing forces...\n");

    double T_accum = 0.0;
    for(int i = 0; i < param.ntimes; i++) {
--- a/gromacs/main.c
+++ b/gromacs/main.c
@@ -24,6 +24,10 @@
 #include <util.h>
 #include <vtk.h>
 #include <xtc.h>
+#include <comm.h>
+#include <grid.h>
+#include <shell_methods.h>
+#include <mpi.h>

 #define HLINE "----------------------------------------------------------------------------\n"

@@ -40,17 +44,55 @@ extern void copyDataFromCUDADevice(Atom *atom);
 extern void cudaDeviceFree();
 #endif

-double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
+// Re-partitions the domain according to param->balance and returns the elapsed time.
+double dynamicBalance(Comm* comm, Grid* grid, Atom* atom, Parameter* param, double time) {
+    const int dims = 3;   //TODO: Adjust to do in 3d and 2d
+    const double start = getTimeStamp();
+    if(param->balance == RCB) {
+        rcbBalance(grid, atom, param, meanBisect, dims, 0);
+        neighComm(comm, param, grid);
+    } else if(param->balance == meanTimeRCB) {
+        rcbBalance(grid, atom, param, meanTimeBisect, dims, time);
+        neighComm(comm, param, grid);
+    } else if(param->balance == Staggered) {
+        staggeredBalance(grid, atom, param, time);
+        neighComm(comm, param, grid);
+        exchangeComm(comm, atom);
+    }
+    // Any other balance value: nothing to do.
+    //printGrid(grid);
+    const double stop = getTimeStamp();
+
+    return stop - start;
+}
+
+// Runs the initial RCB partition (RCB / meanTimeRCB only), sums Nlocal into Natoms, returns elapsed time.
+double initialBalance(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats, Comm *comm, Grid *grid) {
+    double E,S;   // the former unused local `time` (which shadowed libc time()) is gone
+    int me;       // rank of this process in `world`, used only for the report below
+    MPI_Comm_rank(world,&me);
+    S = getTimeStamp();
+    if(param->balance == meanTimeRCB || param->balance == RCB){
+        rcbBalance(grid, atom, param, meanBisect,3,0);
+        neighComm(comm, param, grid);
+    }
+    MPI_Allreduce(&atom->Nlocal, &atom->Natoms, 1, MPI_INT, MPI_SUM, world);   // every rank gets the global sum
+    printf("Processor:%i, Local atoms:%i, Total atoms:%i\n",me, atom->Nlocal,atom->Natoms);
+    MPI_Barrier(world);   // the barrier is part of the measured interval
+    E = getTimeStamp();
+    return E-S;
+}
+
+double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats, Comm *comm, Grid *grid) {
    if(param->force_field == FF_EAM) { initEam(eam, param); }
    double S, E;
    param->lattice = pow((4.0 / param->rho), (1.0 / 3.0));
    param->xprd = param->nx * param->lattice;
    param->yprd = param->ny * param->lattice;
    param->zprd = param->nz * param->lattice;
-
    S = getTimeStamp();
    initAtom(atom);
-    initPbc(atom);
+    //initPbc(atom);
    initStats(stats);
    initNeighbor(neighbor, param);
    if(param->input_file == NULL) {
@@ -58,13 +100,18 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *
    } else {
        readAtom(atom, param);
    }
-
+    setupGrid(grid,atom,param);
    setupNeighbor(param, atom);
+    setupComm(comm, param, grid);
+    if(param->balance){  
+        initialBalance(param, eam, atom, neighbor, stats, comm, grid);
+    }
    setupThermo(param, atom->Natoms);
    if(param->input_file == NULL) { adjustThermo(param, atom); }
    buildClusters(atom);
    defineJClusters(atom);
-    setupPbc(atom, param);
+    //setupPbc(atom, param);
+    ghostNeighbor(comm, atom, param); //change
    binClusters(atom);
    buildNeighbor(atom, neighbor);
    initDevice(atom, neighbor);
@@ -72,15 +119,15 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *
    return E-S;
 }

-double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
+double reneighbour(Comm* comm, Parameter *param, Atom *atom, Neighbor *neighbor) {
    double S, E;
    S = getTimeStamp();
    LIKWID_MARKER_START("reneighbour");
-    updateSingleAtoms(atom);
-    updateAtomsPbc(atom, param);
+    //updateAtomsPbc(atom, param);
    buildClusters(atom);
    defineJClusters(atom);
-    setupPbc(atom, param);
+    //setupPbc(atom, param);
+    ghostNeighbor(comm, atom, param);
    binClusters(atom);
    buildNeighbor(atom, neighbor);
    LIKWID_MARKER_STOP("reneighbour");
@@ -88,15 +135,13 @@ double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
    return E-S;
 }

-void printAtomState(Atom *atom) {
-    printf("Atom counts: Natoms=%d Nlocal=%d Nghost=%d Nmax=%d\n",
-            atom->Natoms, atom->Nlocal, atom->Nghost, atom->Nmax);
-
-    /*     int nall = atom->Nlocal + atom->Nghost; */
-
-    /*     for (int i=0; i<nall; i++) { */
-    /*         printf("%d  %f %f %f\n", i, atom->x[i], atom->y[i], atom->z[i]); */
-    /*     } */
+// Calls updateSingleAtoms and then exchangeComm; returns the time both calls took together.
+double updateAtoms(Comm* comm, Atom* atom){
+    const double start = getTimeStamp();
+    updateSingleAtoms(atom);
+    exchangeComm(comm, atom);
+    const double stop = getTimeStamp();
+    return stop - start;
 }

 int main(int argc, char** argv) {
@@ -106,7 +151,8 @@ int main(int argc, char** argv) {
    Neighbor neighbor;
    Stats stats;
    Parameter param;
-
+    Comm comm; 
+    Grid grid;
    LIKWID_MARKER_INIT;
 #pragma omp parallel
    {
@@ -114,10 +160,10 @@ int main(int argc, char** argv) {
        //LIKWID_MARKER_REGISTER("reneighbour");
        //LIKWID_MARKER_REGISTER("pbc");
    }
-
+    initComm(&argc, &argv, &comm); //change
    initParameter(&param);
    for(int i = 0; i < argc; i++) {
-        if((strcmp(argv[i], "-p") == 0)) {
+        if((strcmp(argv[i], "-p") == 0) || (strcmp(argv[i], "--param") == 0)) {
            readParameter(&param, argv[++i]);
            continue;
        }
@@ -156,6 +202,24 @@ int main(int argc, char** argv) {
            param.half_neigh = atoi(argv[++i]);
            continue;
        }
+        if((strcmp(argv[i], "-method") == 0)) {   // -method <int>: accepted values are 0..2
+            param.method = atoi(argv[++i]);
+            if (param.method>2 || param.method< 0){
+                if(comm.myproc == 0) fprintf(stderr, "Method does not exist!\n");
+                endComm(&comm);
+                exit(EXIT_FAILURE);   // invalid option value: report failure, not success
+            }
+            continue;
+        }
+        if((strcmp(argv[i], "-bal") == 0)) {   // -bal <int>: accepted values are 0..3
+            param.balance = atoi(argv[++i]);
+            if (param.balance>3 || param.balance< 0){
+                if(comm.myproc == 0) fprintf(stderr, "Load balance does not exist!\n");
+                endComm(&comm);
+                exit(EXIT_FAILURE);   // invalid option value: report failure, not success
+            }
+            continue;
+        }
        if((strcmp(argv[i], "-m") == 0) || (strcmp(argv[i], "--mass") == 0)) {
            param.mass = atof(argv[++i]);
            continue;
@@ -186,6 +250,7 @@ int main(int argc, char** argv) {
            continue;
        }
        if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
+            //TODO: add the shell and ac print options
            printf("MD Bench: A minimalistic re-implementation of miniMD\n");
            printf(HLINE);
            printf("-p <string>:          file to read parameters from (can be specified more than once)\n");
@@ -203,98 +268,101 @@ int main(int argc, char** argv) {
            exit(EXIT_SUCCESS);
        }
    }
+    
+    if(param.balance>0 && param.method == 1){   // load balancing cannot be combined with method 1
+        if(comm.myproc == 0) fprintf(stderr, "Half Shell is not supported by load balance!\n");
+        endComm(&comm);
+        exit(EXIT_FAILURE);   // unsupported combination: report failure, not success
+    }

    param.cutneigh = param.cutforce + param.skin;
-    setup(&param, &eam, &atom, &neighbor, &stats);
-    printParameter(&param);
-    printf(HLINE);
-
-    printf("step\ttemp\t\tpressure\n");
+    timer[SETUP]=setup(&param, &eam, &atom, &neighbor, &stats, &comm, &grid);
+    if(comm.myproc == 0) printParameter(&param);
+    if(comm.myproc == 0) printf(HLINE);
+    if(comm.myproc == 0) printf("step\ttemp\t\tpressure\n");
    computeThermo(0, &param, &atom);
    #if defined(MEM_TRACER) || defined(INDEX_TRACER)
    traceAddresses(&param, &atom, &neighbor, n + 1);
    #endif
-
    #ifdef CUDA_TARGET
    copyDataToCUDADevice(&atom);
    #endif
-
    if(param.force_field == FF_EAM) {
        timer[FORCE] = computeForceEam(&eam, &param, &atom, &neighbor, &stats);
    } else {
        timer[FORCE] = computeForceLJ(&param, &atom, &neighbor, &stats);
    }
-
-    timer[NEIGH] = 0.0;
+    timer[NEIGH]    = 0.0;
+    timer[FORWARD]  = 0.0;
+    timer[UPDATE]   = 0.0;
+    timer[BALANCE]  = 0.0;
+    timer[REVERSE]  = reverse(&comm, &atom, &param);
+    MPI_Barrier(world);
    timer[TOTAL] = getTimeStamp();
-
    if(param.vtk_file != NULL) {
-        write_data_to_vtk_file(param.vtk_file, &atom, 0);
+        //write_data_to_vtk_file(param.vtk_file, &comm ,&atom, 0);
+        printvtk(param.vtk_file, &comm, &atom, &param, 0); 
    }
-
+    //TODO: modify xtc
    if(param.xtc_file != NULL) {
        xtc_init(param.xtc_file, &atom, 0);
    }
-
-    for(int n = 0; n < param.ntimes; n++) {
+    double forceTime=0.0;
+    double commTime=0.0;
+    for(int n = 0; n < param.ntimes; n++) {  
        initialIntegrate(&param, &atom);
-
        if((n + 1) % param.reneigh_every) {
-            if(!((n + 1) % param.prune_every)) {
+            timer[FORWARD]+=forward(&comm, &atom, &param);
+            if(!((n + 1) % param.prune_every)){
                pruneNeighbor(&param, &atom, &neighbor);
            }
-
-            updatePbc(&atom, &param, 0);
        } else {
            #ifdef CUDA_TARGET
            copyDataFromCUDADevice(&atom);
            #endif
-
-            timer[NEIGH] += reneighbour(&param, &atom, &neighbor);
-
+            timer[UPDATE] +=updateAtoms(&comm,&atom);
+            if(param.balance && !((n+1)%param.balance_every))
+                timer[BALANCE] +=dynamicBalance(&comm, &grid, &atom , &param, timer[FORCE]);   
+            timer[NEIGH]  += reneighbour(&comm, &param, &atom, &neighbor);
            #ifdef CUDA_TARGET
            copyDataToCUDADevice(&atom);
            isReneighboured = 1;
            #endif
        }
-
        #if defined(MEM_TRACER) || defined(INDEX_TRACER)
        traceAddresses(&param, &atom, &neighbor, n + 1);
        #endif
-
        if(param.force_field == FF_EAM) {
-            timer[FORCE] += computeForceEam(&eam, &param, &atom, &neighbor, &stats);
+            timer[FORCE] += computeForceEam(&eam, &param, &atom, &neighbor, &stats); 
        } else {
            timer[FORCE] += computeForceLJ(&param, &atom, &neighbor, &stats);
-        }
-
+        } 
+        timer[REVERSE] += reverse(&comm, &atom, &param);
        finalIntegrate(&param, &atom);
-
        if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
            computeThermo(n + 1, &param, &atom);
        }
-
        int write_pos = !((n + 1) % param.x_out_every);
        int write_vel = !((n + 1) % param.v_out_every);
        if(write_pos || write_vel) {
            if(param.vtk_file != NULL) {
-                write_data_to_vtk_file(param.vtk_file, &atom, n + 1);
+                printvtk(param.vtk_file, &comm, &atom, &param, n+1);
            }
-
+            //TODO: xtc file
            if(param.xtc_file != NULL) {
                xtc_write(&atom, n + 1, write_pos, write_vel);
            }
-        }
+        } 
    }

    #ifdef CUDA_TARGET
    copyDataFromCUDADevice(&atom);
    #endif
-
+    MPI_Barrier(world);
    timer[TOTAL] = getTimeStamp() - timer[TOTAL];
-    updateSingleAtoms(&atom);
+    updateAtoms(&comm,&atom);
    computeThermo(-1, &param, &atom);
-
+    //TODO:
    if(param.xtc_file != NULL) {
        xtc_end();
    }
@@ -302,17 +370,35 @@ int main(int argc, char** argv) {
    #ifdef CUDA_TARGET
    cudaDeviceFree();
    #endif
-
-    printf(HLINE);
-    printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
-    printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
-            timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]);
-    printf(HLINE);
-    printf("Performance: %.2f million atom updates per second\n",
-            1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
-    #ifdef COMPUTE_STATS
+    // Per-timer min/max/sum across all ranks; MPI_Reduce delivers the results to rank 0 only.
+    double mint[NUMTIMER], maxt[NUMTIMER], sumt[NUMTIMER];
+    // REST is what is left of TOTAL after every separately reported in-loop phase, UPDATE included.
+    timer[REST] = timer[TOTAL]-timer[FORCE]-timer[NEIGH]-timer[BALANCE]-timer[FORWARD]-timer[REVERSE]-timer[UPDATE];
+    MPI_Reduce(timer,mint,NUMTIMER,MPI_DOUBLE,MPI_MIN,0,world);
+    MPI_Reduce(timer,maxt,NUMTIMER,MPI_DOUBLE,MPI_MAX,0,world);
+    MPI_Reduce(timer,sumt,NUMTIMER,MPI_DOUBLE,MPI_SUM,0,world);
+    int Nghost;   // sum of atom.Nghost over all ranks; only meaningful on rank 0
+    MPI_Reduce(&atom.Nghost,&Nghost,1,MPI_INT,MPI_SUM,0,world);
+    
+    if(comm.myproc == 0){
+        int n = comm.numproc;
+        printf(HLINE);
+        printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, Nghost, param.ntimes);
+        printf("TOTAL %.2fs\n\n",timer[TOTAL]);
+        printf("%4s|%7s|%7s|%7s|%7s|%7s|%7s|%7s|%7s|\n","","FORCE ", "NEIGH ", "BALANCE", "FORWARD", "REVERSE","UPDATE","REST ","SETUP");
+        printf("----|-------|-------|-------|-------|-------|-------|-------|-------|\n");
+        printf("%4s|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|\n", "AVG", sumt[FORCE]/n,sumt[NEIGH]/n,sumt[BALANCE]/n,sumt[FORWARD]/n,sumt[REVERSE]/n,sumt[UPDATE]/n,sumt[REST]/n,sumt[SETUP]/n);
+        printf("%4s|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|\n", "MIN", mint[FORCE],mint[NEIGH],mint[BALANCE],mint[FORWARD],mint[REVERSE],mint[UPDATE],mint[REST],mint[SETUP]);
+        printf("%4s|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|\n", "MAX", maxt[FORCE],maxt[NEIGH],maxt[BALANCE],maxt[FORWARD],maxt[REVERSE],maxt[UPDATE],maxt[REST],maxt[SETUP]);
+        printf(HLINE);
+        printf("Performance: %.2f million atom updates per second\n",
+                1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
+        
+#ifdef COMPUTE_STATS
    displayStatistics(&atom, &param, &stats, timer);
-    #endif
+#endif
+    }
+    endComm(&comm);
    LIKWID_MARKER_CLOSE;
    return EXIT_SUCCESS;
 }
--- a/gromacs/neighbor.c
+++ b/gromacs/neighbor.c
@@ -7,15 +7,15 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
-
 #include <neighbor.h>
 #include <parameter.h>
 #include <atom.h>
 #include <util.h>
+#include <mpi.h>

 #define SMALL 1.0e-6
 #define FACTOR 0.999
-
+#define eps 1.0e-9
 static MD_FLOAT xprd, yprd, zprd;
 static MD_FLOAT bininvx, bininvy;
 static int mbinxlo, mbinylo;
@@ -34,9 +34,16 @@ static int nmax;
 static int nstencil;      // # of bins in stencil
 static int* stencil;      // stencil list of bin offsets
 static MD_FLOAT binsizex, binsizey;
+int me;             //rank
+int method;         // method 
+int shellMethod;    //If shell method exist  

 static int coord2bin(MD_FLOAT, MD_FLOAT);
 static MD_FLOAT bindist(int, int);
+//static int ghostZone(Atom*, int);
+static int halfZoneCluster(Atom*,int);
+static int ghostClusterinRange(Atom*, int, int, MD_FLOAT);
+static void neighborGhost(Atom*, Neighbor*);

 /* exported subroutines */
 void initNeighbor(Neighbor *neighbor, Parameter *param) {
@@ -53,10 +60,25 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) {
    bincount = NULL;
    bin_clusters = NULL;
    bin_nclusters = NULL;
-    neighbor->half_neigh = param->half_neigh;
-    neighbor->maxneighs = 100;
+    neighbor->maxneighs = 200;
    neighbor->numneigh = NULL;
+    neighbor->numneigh_masked = NULL;
    neighbor->neighbors = NULL;
+    neighbor->neighbors_imask = NULL;
+    //MPI
+    shellMethod = 0;
+    method = param->method;
+    if(method == halfShell || method == eightShell){ 
+        param->half_neigh = 1;
+        shellMethod = 1;
+    }
+    me = 0;
+    MPI_Comm_rank(MPI_COMM_WORLD, &me);
+    neighbor->half_neigh = param->half_neigh;
+    neighbor->Nshell = 0;  
+    neighbor->numNeighShell = NULL;
+    neighbor->neighshell = NULL;
+    neighbor->listshell = NULL;
 }

 void setupNeighbor(Parameter *param, Atom *atom) {
@@ -75,7 +97,7 @@ void setupNeighbor(Parameter *param, Atom *atom) {
    MD_FLOAT ylo = 0.0; MD_FLOAT yhi = yprd;
    MD_FLOAT zlo = 0.0; MD_FLOAT zhi = zprd;

-    MD_FLOAT atom_density = ((MD_FLOAT)(atom->Nlocal)) / ((xhi - xlo) * (yhi - ylo) * (zhi - zlo));
+    MD_FLOAT atom_density = ((MD_FLOAT)(atom->Natoms)) / ((xhi - xlo) * (yhi - ylo) * (zhi - zlo));
    MD_FLOAT atoms_in_cell = MAX(CLUSTER_M, CLUSTER_N);
    MD_FLOAT targetsizex = cbrt(atoms_in_cell / atom_density);
    MD_FLOAT targetsizey = cbrt(atoms_in_cell / atom_density);
@@ -144,6 +166,7 @@ void setupNeighbor(Parameter *param, Atom *atom) {
 }

 MD_FLOAT getBoundingBoxDistanceSq(Atom *atom, int ci, int cj) {
+    
    MD_FLOAT dl = atom->iclusters[ci].bbminx - atom->jclusters[cj].bbmaxx;
    MD_FLOAT dh = atom->jclusters[cj].bbminx - atom->iclusters[ci].bbmaxx;
    MD_FLOAT dm = MAX(dl, dh);
@@ -161,6 +184,7 @@ MD_FLOAT getBoundingBoxDistanceSq(Atom *atom, int ci, int cj) {
    dm = MAX(dl, dh);
    dm0 = MAX(dm, 0.0);
    d2 += dm0 * dm0;
+    
    return d2;
 }

@@ -184,16 +208,56 @@ int atomDistanceInRange(Atom *atom, int ci, int cj, MD_FLOAT rsq) {
    return 0;
 }

+/* Plain C lists: diagonal mask when rdiag is set and ci equals cj, full mask otherwise */
+static unsigned int get_imask(int rdiag, int ci, int cj) {
+    return (!rdiag || ci != cj) ? NBNXN_INTERACTION_MASK_ALL : NBNXN_INTERACTION_MASK_DIAG;
+}
+
+/* cj-size=2: cj == 2*ci selects DIAG_J2_0, cj == 2*ci+1 selects DIAG_J2_1 (both only when rdiag is set) */
+static unsigned int get_imask_simd_j2(int rdiag, int ci, int cj) {
+    if(rdiag && cj == ci * 2)     { return NBNXN_INTERACTION_MASK_DIAG_J2_0; }
+    if(rdiag && cj == ci * 2 + 1) { return NBNXN_INTERACTION_MASK_DIAG_J2_1; }
+    return NBNXN_INTERACTION_MASK_ALL;
+}
+
+/* cj-size=4: diagonal mask when rdiag is set and ci equals cj, full mask otherwise */
+static unsigned int get_imask_simd_j4(int rdiag, int ci, int cj) {
+    return (!rdiag || ci != cj) ? NBNXN_INTERACTION_MASK_ALL : NBNXN_INTERACTION_MASK_DIAG;
+}
+
+/* cj-size=8: ci == 2*cj selects DIAG_J8_0, ci == 2*cj+1 selects DIAG_J8_1 (both only when rdiag is set) */
+static unsigned int get_imask_simd_j8(int rdiag, int ci, int cj) {
+    if(rdiag && ci == cj * 2)     { return NBNXN_INTERACTION_MASK_DIAG_J8_0; }
+    if(rdiag && ci == cj * 2 + 1) { return NBNXN_INTERACTION_MASK_DIAG_J8_1; }
+    return NBNXN_INTERACTION_MASK_ALL;
+}
+
+#if VECTOR_WIDTH == 2
+#   define get_imask_simd_4xn get_imask_simd_j2
+#elif VECTOR_WIDTH== 4
+#   define get_imask_simd_4xn get_imask_simd_j4
+#elif VECTOR_WIDTH == 8
+#   define get_imask_simd_4xn get_imask_simd_j8
+#   define get_imask_simd_2xnn get_imask_simd_j4
+#elif VECTOR_WIDTH == 16
+#   define get_imask_simd_2xnn get_imask_simd_j8
+#else
+#   error "Invalid cluster configuration"
+#endif
+
 void buildNeighbor(Atom *atom, Neighbor *neighbor) {
    DEBUG_MESSAGE("buildNeighbor start\n");
-
    /* extend atom arrays if necessary */
    if(atom->Nclusters_local > nmax) {
        nmax = atom->Nclusters_local;
        if(neighbor->numneigh) free(neighbor->numneigh);
+        if(neighbor->numneigh_masked) free(neighbor->numneigh_masked);
        if(neighbor->neighbors) free(neighbor->neighbors);
+        if(neighbor->neighbors_imask) free(neighbor->neighbors_imask);
        neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
-        neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*));
+        neighbor->numneigh_masked = (int*) malloc(nmax * sizeof(int));
+        neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int));
+        neighbor->neighbors_imask = (unsigned int*) malloc(nmax * neighbor->maxneighs * sizeof(unsigned int));
    }

    MD_FLOAT bbx = 0.5 * (binsizex + binsizex);
@@ -206,11 +270,11 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
    while(resize) {
        int new_maxneighs = neighbor->maxneighs;
        resize = 0;
-
        for(int ci = 0; ci < atom->Nclusters_local; ci++) {
            int ci_cj1 = CJ1_FROM_CI(ci);
            int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
-            int n = 0;
+            unsigned int *neighptr_imask = &(neighbor->neighbors_imask[ci * neighbor->maxneighs]);
+            int n = 0, nmasked = 0;
            int ibin = atom->icluster_bin[ci];
            MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx;
            MD_FLOAT ibb_xmax = atom->iclusters[ci].bbmaxx;
@@ -218,14 +282,12 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
            MD_FLOAT ibb_ymax = atom->iclusters[ci].bbmaxy;
            MD_FLOAT ibb_zmin = atom->iclusters[ci].bbminz;
            MD_FLOAT ibb_zmax = atom->iclusters[ci].bbmaxz;
-
            for(int k = 0; k < nstencil; k++) {
                int jbin = ibin + stencil[k];
                int *loc_bin = &bin_clusters[jbin * clusters_per_bin];
                int cj, m = -1;
                MD_FLOAT jbb_xmin, jbb_xmax, jbb_ymin, jbb_ymax, jbb_zmin, jbb_zmax;
                const int c = bin_nclusters[jbin];
-
                if(c > 0) {
                    MD_FLOAT dl, dh, dm, dm0, d_bb_sq;

@@ -235,6 +297,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
                        if(neighbor->half_neigh && ci_cj1 > cj) {
                            continue;
                        }
+                        
                        jbb_zmin = atom->jclusters[cj].bbminz;
                        jbb_zmax = atom->jclusters[cj].bbmaxz;
                        dl = ibb_zmin - jbb_zmax;
@@ -243,7 +306,6 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
                        dm0 = MAX(dm, 0.0);
                        d_bb_sq = dm0 * dm0;
                    } while(m + 1 < c && d_bb_sq > cutneighsq);
-
                    jbb_xmin = atom->jclusters[cj].bbminx;
                    jbb_xmax = atom->jclusters[cj].bbmaxx;
                    jbb_ymin = atom->jclusters[cj].bbminy;
@@ -275,7 +337,30 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {

                            if(d_bb_sq < cutneighsq) {
                                if(d_bb_sq < rbb_sq || atomDistanceInRange(atom, ci, cj, cutneighsq)) {
-                                    neighptr[n++] = cj;
+                                    // We pass true (1) for rdiag because we only care whether there are
+                                    // masks at all; when it is set to false (0) the self-exclusions are
+                                    // not accounted for, which breaks the optimized version.
+                                    unsigned int imask;
+                                    #if CLUSTER_N == (VECTOR_WIDTH / 2) // 2xnn
+                                    imask = get_imask_simd_2xnn(1, ci, cj);
+                                    #else // 4xn
+                                    imask = get_imask_simd_4xn(1, ci, cj);
+                                    #endif
+
+                                    if(n < neighbor->maxneighs) {
+                                        if(imask == NBNXN_INTERACTION_MASK_ALL) {
+                                            neighptr[n] = cj;
+                                            neighptr_imask[n] = imask;
+                                        } else {
+                                            neighptr[n] = neighptr[nmasked];
+                                            neighptr_imask[n] = neighptr_imask[nmasked];
+                                            neighptr[nmasked] = cj;
+                                            neighptr_imask[nmasked] = imask;
+                                            nmasked++;
+                                        }
+                                    }
+
+                                    n++;
                                }
                            }
                        }
@@ -290,18 +375,21 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
                            jbb_zmin = atom->jclusters[cj].bbminz;
                            jbb_zmax = atom->jclusters[cj].bbmaxz;
                        }
-                    }
+                    } 
                }
            }

            // Fill neighbor list with dummy values to fit vector width
            if(CLUSTER_N < VECTOR_WIDTH) {
                while(n % (VECTOR_WIDTH / CLUSTER_N)) {
-                    neighptr[n++] = atom->dummy_cj; // Last cluster is always a dummy cluster
+                    neighptr[n] = atom->dummy_cj; // Last cluster is always a dummy cluster
+                    neighptr_imask[n] = 0;
+                    n++;
                }
            }

            neighbor->numneigh[ci] = n;
+            neighbor->numneigh_masked[ci] = nmasked;
            if(n >= neighbor->maxneighs) {
                resize = 1;

@@ -312,13 +400,16 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
        }

        if(resize) {
-            fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
            neighbor->maxneighs = new_maxneighs * 1.2;
+            fprintf(stdout, "RESIZE %d, PROC %d\n", neighbor->maxneighs,me);
            free(neighbor->neighbors);
-            neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
+            free(neighbor->neighbors_imask);
+            neighbor->neighbors = (int *) malloc(nmax * neighbor->maxneighs * sizeof(int));
+            neighbor->neighbors_imask = (unsigned int *) malloc(nmax * neighbor->maxneighs * sizeof(unsigned int));
        }
    }
-
+    if(method == eightShell) neighborGhost(atom, neighbor);
+    
    /*
    DEBUG_MESSAGE("\ncutneighsq = %f, rbb_sq = %f\n", cutneighsq, rbb_sq);
    for(int ci = 0; ci < 6; ci++) {
@@ -371,7 +462,9 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {

    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
        int *neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
+        unsigned int *neighs_imask = &neighbor->neighbors_imask[ci * neighbor->maxneighs];
        int numneighs = neighbor->numneigh[ci];
+        int numneighs_masked = neighbor->numneigh_masked[ci];
        int k = 0;

        // Remove dummy clusters if necessary
@@ -387,6 +480,9 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
                k++;
            } else {
                numneighs--;
+                if(k < numneighs_masked) {
+                    numneighs_masked--;
+                }
                neighs[k] = neighs[numneighs];
            }
        }
@@ -394,11 +490,14 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
        // Readd dummy clusters if necessary
        if(CLUSTER_N < VECTOR_WIDTH) {
            while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) {
-                neighs[numneighs++] = atom->dummy_cj; // Last cluster is always a dummy cluster
+                neighs[numneighs] = atom->dummy_cj; // Last cluster is always a dummy cluster
+                neighs_imask[numneighs] = 0;
+                numneighs++;
            }
        }

        neighbor->numneigh[ci] = numneighs;
+        neighbor->numneigh_masked[ci] = numneighs_masked;
    }

    DEBUG_MESSAGE("pruneNeighbor end\n");
@@ -431,46 +530,44 @@ int coord2bin(MD_FLOAT xin, MD_FLOAT yin) {
    int ix, iy;

    if(xin >= xprd) {
-        ix = (int)((xin - xprd) * bininvx) + nbinx - mbinxlo;
+        ix = (int)((xin + eps - xprd) * bininvx) + nbinx - mbinxlo;
    } else if(xin >= 0.0) {
-        ix = (int)(xin * bininvx) - mbinxlo;
+        ix = (int)((xin+eps) * bininvx) - mbinxlo;
    } else {
-        ix = (int)(xin * bininvx) - mbinxlo - 1;
+        ix = (int)((xin+eps) * bininvx) - mbinxlo - 1;
    }

-    if(yin >= yprd) {
-        iy = (int)((yin - yprd) * bininvy) + nbiny - mbinylo;
+    if(yin >= yprd) {   
+        iy = (int)(((yin+eps) - yprd) * bininvy) + nbiny - mbinylo;
    } else if(yin >= 0.0) {
-        iy = (int)(yin * bininvy) - mbinylo;
+        iy = (int)((yin+eps) * bininvy) - mbinylo;
    } else {
-        iy = (int)(yin * bininvy) - mbinylo - 1;
+        iy = (int)((yin+eps) * bininvy) - mbinylo - 1;
    }
-
+ 
    return (iy * mbinx + ix + 1);
 }

 void coord2bin2D(MD_FLOAT xin, MD_FLOAT yin, int *ix, int *iy) {
    if(xin >= xprd) {
-        *ix = (int)((xin - xprd) * bininvx) + nbinx - mbinxlo;
+        *ix = (int)((xin + eps - xprd) * bininvx) + nbinx - mbinxlo;
    } else if(xin >= 0.0) {
-        *ix = (int)(xin * bininvx) - mbinxlo;
+        *ix = (int)((xin+eps) * bininvx) - mbinxlo;
    } else {
-        *ix = (int)(xin * bininvx) - mbinxlo - 1;
+        *ix = (int)((xin+eps) * bininvx) - mbinxlo - 1;
    }
-
-    if(yin >= yprd) {
-        *iy = (int)((yin - yprd) * bininvy) + nbiny - mbinylo;
+    if(yin >= yprd) {  
+        *iy = (int)((yin + eps - yprd) * bininvy) + nbiny - mbinylo;
    } else if(yin >= 0.0) {
-        *iy = (int)(yin * bininvy) - mbinylo;
+        *iy = (int)((yin+eps) * bininvy) - mbinylo;
    } else {
-        *iy = (int)(yin * bininvy) - mbinylo - 1;
+        *iy = (int)((yin+eps) * bininvy) - mbinylo - 1;
    }
 }

 void binAtoms(Atom *atom) {
    DEBUG_MESSAGE("binAtoms start\n");
    int resize = 1;
-
    while(resize > 0) {
        resize = 0;

@@ -487,7 +584,7 @@ void binAtoms(Atom *atom) {
                resize = 1;
            }
        }
-
+        
        if(resize) {
            free(bins);
            atoms_per_bin *= 2;
@@ -535,8 +632,7 @@ void buildClusters(Atom *atom) {

    /* bin local atoms */
    binAtoms(atom);
-    sortAtomsByZCoord(atom);
-
+    sortAtomsByZCoord(atom);   
    for(int bin = 0; bin < mbins; bin++) {
        int c = bincount[bin];
        int ac = 0;
@@ -608,6 +704,9 @@ void buildClusters(Atom *atom) {
 void defineJClusters(Atom *atom) {
    DEBUG_MESSAGE("defineJClusters start\n");

+    const int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
+    atom->ncj = atom->Nclusters_local / jfac;   
+
    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
        int cj0 = CJ0_FROM_CI(ci);

@@ -750,12 +849,11 @@ void binClusters(Atom *atom) {
                }
            }
        }
-
        for(int cg = 0; cg < atom->Nclusters_ghost && !resize; cg++) {
            const int cj = ncj + cg;
            int ix = -1, iy = -1;
            MD_FLOAT xtmp, ytmp;
-
+            if(shellMethod == halfShell && !halfZoneCluster(atom, cj)) continue;
            if(atom->jclusters[cj].natoms > 0) {
                int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
                MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
@@ -766,6 +864,7 @@ void binClusters(Atom *atom) {
                coord2bin2D(xtmp, ytmp, &ix, &iy);
                ix = MAX(MIN(ix, mbinx - 1), 0);
                iy = MAX(MIN(iy, mbiny - 1), 0);
+            
                for(int cjj = 1; cjj < atom->jclusters[cj].natoms; cjj++) {
                    int nix, niy;
                    xtmp = cj_x[CL_X_OFFSET + cjj];
@@ -773,7 +872,7 @@ void binClusters(Atom *atom) {
                    coord2bin2D(xtmp, ytmp, &nix, &niy);
                    nix = MAX(MIN(nix, mbinx - 1), 0);
                    niy = MAX(MIN(niy, mbiny - 1), 0);
-
+            
                    // Always put the cluster on the bin of its innermost atom so
                    // the cluster should be closer to local clusters
                    if(atom->PBCx[cg] > 0 && ix > nix) { ix = nix; }
@@ -781,7 +880,6 @@ void binClusters(Atom *atom) {
                    if(atom->PBCy[cg] > 0 && iy > niy) { iy = niy; }
                    if(atom->PBCy[cg] < 0 && iy < niy) { iy = niy; }
                }
-
                int bin = iy * mbinx + ix + 1;
                int c = bin_nclusters[bin];
                if(c < clusters_per_bin) {
@@ -803,25 +901,21 @@ void binClusters(Atom *atom) {
                            break;
                        }
                    }
-
                    if(!inserted) {
                        bin_clusters[bin * clusters_per_bin + c] = cj;
                    }
-
                    bin_nclusters[bin]++;
                } else {
                    resize = 1;
                }
            }
        }
-
        if(resize) {
            free(bin_clusters);
            clusters_per_bin *= 2;
            bin_clusters = (int*) malloc(mbins * clusters_per_bin * sizeof(int));
        }
    }
-
    /*
    DEBUG_MESSAGE("bin_nclusters\n");
    for(int i = 0; i < mbins; i++) { DEBUG_MESSAGE("%d, ", bin_nclusters[i]); }
@@ -839,7 +933,6 @@ void updateSingleAtoms(Atom *atom) {
        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
        MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
        MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
-
        for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
            atom_x(Natom) = ci_x[CL_X_OFFSET + cii];
            atom_y(Natom) = ci_x[CL_Y_OFFSET + cii];
@@ -848,12 +941,174 @@ void updateSingleAtoms(Atom *atom) {
            atom->vy[Natom] = ci_v[CL_Y_OFFSET + cii];
            atom->vz[Natom] = ci_v[CL_Z_OFFSET + cii];
            Natom++;
-        }
+        }       
    }
-
    if(Natom != atom->Nlocal) {
        fprintf(stderr, "updateSingleAtoms(): Number of atoms changed!\n");
    }

    DEBUG_MESSAGE("updateSingleAtoms stop\n");
 }
+
+//MPI Shell Methods
+
+/*
+ * Classify j-cluster cj by which upper faces of the local box the minimum
+ * corner of its bounding box reaches. A 3-bit code is assembled (x -> 1,
+ * y -> 2, z -> 4; a bit is set when bbmin + eps >= mybox.hi on that axis)
+ * and translated through a fixed table:
+ *   0->0, 1->1, 2->2, 3->6, 4->3, 5->5, 6->4, 7->7
+ * Returns the translated zone index (0..7).
+ */
+static int eightZoneCluster(Atom* atom, int cj)
+{
+    static const int remap[8] = {0, 1, 2, 6, 3, 5, 4, 7};
+    MD_FLOAT *boxHi = atom->mybox.hi;
+    int code = 0;
+
+    if(atom->jclusters[cj].bbminx + eps >= boxHi[_x]) { code |= 1; }
+    if(atom->jclusters[cj].bbminy + eps >= boxHi[_y]) { code |= 2; }
+    if(atom->jclusters[cj].bbminz + eps >= boxHi[_z]) { code |= 4; }
+
+    return remap[code];
+}
+
+/*
+ * Half-shell filter for j-cluster cj (clusters for which this returns 0 are
+ * skipped when ghost clusters are binned with the half-shell method).
+ * Returns 0 when the maximum corner of the cluster's bounding box lies
+ *   - below lo[x] while also below hi[y] and hi[z], or
+ *   - below lo[y] while also below hi[z], or
+ *   - below lo[z];
+ * returns 1 in every other case.
+ */
+static int halfZoneCluster(Atom* atom, int cj)
+{
+    MD_FLOAT *lo = atom->mybox.lo;
+    MD_FLOAT *hi = atom->mybox.hi;
+
+    const int underX  = atom->jclusters[cj].bbmaxx < lo[_x];
+    const int underY  = atom->jclusters[cj].bbmaxy < lo[_y];
+    const int underZ  = atom->jclusters[cj].bbmaxz < lo[_z];
+    const int insideY = atom->jclusters[cj].bbmaxy < hi[_y];
+    const int insideZ = atom->jclusters[cj].bbmaxz < hi[_z];
+
+    if(underX && insideY && insideZ) { return 0; }
+    if(underY && insideZ) { return 0; }
+    if(underZ) { return 0; }
+    return 1;
+}
+
+/*
+ * Coarse bounding-box test between j-clusters ci and cj.
+ * For each axis the gap between the two boxes is computed (clamped to 0 when
+ * the boxes overlap on that axis). Returns 0 if the squared gap of any single
+ * axis exceeds cutneighsq, 1 otherwise. The axes are tested independently;
+ * the squared gaps are not summed.
+ */
+int BoxGhostDistance(Atom *atom, int ci, int cj) {
+    MD_FLOAT lowGap, highGap, gap, gapSq;
+
+    // x axis
+    lowGap  = atom->jclusters[ci].bbminx - atom->jclusters[cj].bbmaxx;
+    highGap = atom->jclusters[cj].bbminx - atom->jclusters[ci].bbmaxx;
+    gap     = MAX(lowGap, highGap);
+    gap     = MAX(gap, 0.0);
+    gapSq   = gap * gap;
+    if(gapSq > cutneighsq) { return 0; }
+
+    // y axis
+    lowGap  = atom->jclusters[ci].bbminy - atom->jclusters[cj].bbmaxy;
+    highGap = atom->jclusters[cj].bbminy - atom->jclusters[ci].bbmaxy;
+    gap     = MAX(lowGap, highGap);
+    gap     = MAX(gap, 0.0);
+    gapSq   = gap * gap;
+    if(gapSq > cutneighsq) { return 0; }
+
+    // z axis
+    lowGap  = atom->jclusters[ci].bbminz - atom->jclusters[cj].bbmaxz;
+    highGap = atom->jclusters[cj].bbminz - atom->jclusters[ci].bbmaxz;
+    gap     = MAX(lowGap, highGap);
+    gap     = MAX(gap, 0.0);
+    gapSq   = gap * gap;
+    if(gapSq > cutneighsq) { return 0; }
+
+    return 1;
+}
+
+/*
+ * Exact pair test between j-clusters cs and cg: returns 1 as soon as any atom
+ * of cs and any atom of cg are closer than sqrt(rsq) (squared distance < rsq),
+ * 0 if no such pair exists.
+ */
+static int ghostClusterinRange(Atom *atom, int cs, int cg, MD_FLOAT rsq) {
+    MD_FLOAT *xs = &atom->cl_x[CJ_VECTOR_BASE_INDEX(cs)];
+    MD_FLOAT *xg = &atom->cl_x[CJ_VECTOR_BASE_INDEX(cg)];
+    int found = 0;
+
+    for(int a = 0; !found && a < atom->jclusters[cs].natoms; a++) {
+        for(int b = 0; b < atom->jclusters[cg].natoms; b++) {
+            MD_FLOAT delx = xs[CL_X_OFFSET + a] - xg[CL_X_OFFSET + b];
+            MD_FLOAT dely = xs[CL_Y_OFFSET + a] - xg[CL_Y_OFFSET + b];
+            MD_FLOAT delz = xs[CL_Z_OFFSET + a] - xg[CL_Z_OFFSET + b];
+            if(delx * delx + dely * dely + delz * delz < rsq) {
+                found = 1;
+                break;
+            }
+        }
+    }
+
+    return found;
+}
+
+/*
+ * Build the ghost/ghost ("shell") neighbor lists. Called once the local
+ * neighbor lists are complete when the eight-shell method is selected.
+ *
+ *  1. Every ghost j-cluster (indices ncj .. ncj + Nclusters_ghost - 1) is put
+ *     into the bucket of the zone returned by eightZoneCluster().
+ *  2. Clusters of zones 1..3 form the shell list (neighbor->listshell).
+ *  3. For each shell cluster, clusters of higher-numbered zones (minus the
+ *     excluded zone pairs below) are collected if they pass the bounding-box
+ *     test and the exact atom-distance test against cutneighsq.
+ *
+ * Writes neighbor->listshell, neighbor->Nshell, neighbor->neighshell (rows of
+ * neighbor->maxneighs entries) and neighbor->numNeighShell.
+ *
+ * NOTE(review): a resize in here changes neighbor->maxneighs, which is also
+ * the row stride of neighbor->neighbors / neighbors_imask that were filled
+ * before this function runs. Those arrays are not re-laid-out here; confirm
+ * this cannot corrupt the local lists, or give the shell list its own capacity.
+ */
+static void neighborGhost(Atom *atom, Neighbor *neighbor) {
+    int Nshell = 0;
+    int Nclusterghost = atom->Nclusters_ghost;
+    int countCluster[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+
+    free(neighbor->listshell); // free(NULL) is a no-op
+    neighbor->listshell = (int*) malloc(Nclusterghost * sizeof(int));
+    int* listzone = (int*) malloc(8 * Nclusterghost * sizeof(int));
+
+    // Sort the ghost clusters into one bucket per zone
+    for(int cg = atom->ncj; cg < atom->ncj + Nclusterghost; cg++) {
+        int czone = eightZoneCluster(atom, cg);
+        listzone[Nclusterghost * czone + countCluster[czone]] = cg;
+        countCluster[czone]++;
+    }
+
+    // Only clusters of zones 1, 2 and 3 need a neighbor search of their own
+    for(int zone = 1; zone <= 3; zone++) {
+        int *list = &listzone[Nclusterghost * zone];
+        for(int n = 0; n < countCluster[zone]; n++) {
+            neighbor->listshell[Nshell++] = list[n];
+        }
+    }
+
+    neighbor->Nshell = Nshell;
+    free(neighbor->numNeighShell);
+    free(neighbor->neighshell);
+    neighbor->neighshell = (int*) malloc(Nshell * neighbor->maxneighs * sizeof(int));
+    neighbor->numNeighShell = (int*) malloc(Nshell * sizeof(int));
+
+    int resize = 1;
+
+    while(resize) {
+        resize = 0;
+
+        for(int ic = 0; ic < Nshell; ic++) {
+            int *neighshell = &(neighbor->neighshell[ic * neighbor->maxneighs]);
+            int n = 0;
+            int icluster = neighbor->listshell[ic];
+            int iczone = eightZoneCluster(atom, icluster);
+
+            for(int jczone = 0; jczone < 8; jczone++) {
+                if(jczone <= iczone) continue;
+                if(iczone == 1 && (jczone == 5 || jczone == 6 || jczone == 7)) continue;
+                if(iczone == 2 && (jczone == 4 || jczone == 6 || jczone == 7)) continue;
+                if(iczone == 3 && (jczone == 4 || jczone == 5 || jczone == 7)) continue;
+
+                int Ncluster = countCluster[jczone];
+                int* loc_zone = &listzone[jczone * Nclusterghost];
+
+                for(int k = 0; k < Ncluster; k++) {
+                    int jcluster = loc_zone[k];
+
+                    if(BoxGhostDistance(atom, icluster, jcluster) &&
+                       ghostClusterinRange(atom, icluster, jcluster, cutneighsq)) {
+                        // Count every hit, but store it only while the row has room;
+                        // an overfull row is rebuilt after the resize below.
+                        if(n < neighbor->maxneighs) {
+                            neighshell[n] = jcluster;
+                        }
+                        n++;
+                    }
+                }
+            }
+
+            neighbor->numNeighShell[ic] = n;
+
+            if(n >= neighbor->maxneighs) {
+                int grown = n * 1.2;
+                // n * 1.2 truncates back to n for small n; always make progress
+                if(grown <= n) { grown = n + 1; }
+                resize = 1;
+                neighbor->maxneighs = grown;
+                fprintf(stdout, "RESIZE EIGHT SHELL %d, PROC %d\n", neighbor->maxneighs,me);
+                break;
+            }
+        }
+
+        if(resize) {
+            free(neighbor->neighshell);
+            neighbor->neighshell = (int*) malloc(Nshell * neighbor->maxneighs * sizeof(int));
+        }
+    }
+
+    free(listzone);
+}
--- a/gromacs/tracing.c
+++ b/gromacs/tracing.c
@@ -13,7 +13,8 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
    MEM_TRACER_INIT;
    INDEX_TRACER_INIT;
    int Nlocal = atom->Nlocal;
-    int* neighs;
+    int *neighs;
+    unsigned int *neighs_imask;
    //MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;

    INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
@@ -34,7 +35,8 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
        DIST_TRACE(neighs, numneighs);

        for(int k = 0; k < numneighs; k++) {
-            MEM_TRACE(neighs[k], 'R');
+            int j = neighs[k];
+            MEM_TRACE(j, 'R');
            MEM_TRACE(atom_x(j), 'R');
            MEM_TRACE(atom_y(j), 'R');
            MEM_TRACE(atom_z(j), 'R');
--- a/gromacs/vtk.c
+++ b/gromacs/vtk.c
@@ -9,6 +9,11 @@

 #include <atom.h>
 #include <vtk.h>
+#include <mpi.h>
+#include <string.h>
+
+static MPI_File _fh; 
+static inline void flushBuffer(char*); 

 void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep) {
    write_local_atoms_to_vtk_file(filename, atom, timestep);
@@ -188,3 +193,128 @@ int write_ghost_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int
    fclose(fp);
    return 0;
 }
+
+/*
+ * Collectively open "<filename>_<timestep>.vtk" for writing through MPI-IO,
+ * empty it, and let rank 0 write the VTK header up to the POINTS line.
+ * The handle is kept in the file-local _fh for vtkVector()/vtkClose().
+ * Returns 0 on success, -1 if the file could not be opened.
+ */
+int vtkOpen(const char* filename, Comm* comm, Atom* atom ,int timestep)
+{
+    char msg[256];
+    char timestep_filename[128];
+    snprintf(timestep_filename, sizeof timestep_filename, "%s_%d.vtk", filename, timestep);
+
+    int err = MPI_File_open(MPI_COMM_WORLD, timestep_filename, MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &_fh);
+    if(err != MPI_SUCCESS || _fh == MPI_FILE_NULL) {
+        _fh = MPI_FILE_NULL; // keep the "not open" guard in vtkVector() reliable
+        if(comm->myproc == 0) fprintf(stderr, "Could not open VTK file for writing!\n");
+        return -1;
+    }
+
+    // All later writes append at the current file size, so leftovers from an
+    // existing file of the same name must be dropped first.
+    MPI_File_set_size(_fh, 0);
+
+    if(comm->myproc == 0) {
+        // One bounded call instead of sprintf(msg, "%s...", msg): passing the
+        // destination as a source argument is undefined behaviour.
+        snprintf(msg, sizeof msg,
+                 "# vtk DataFile Version 2.0\n"
+                 "Particle data\n"
+                 "ASCII\n"
+                 "DATASET UNSTRUCTURED_GRID\n"
+                 "POINTS %d double\n", atom->Natoms);
+        flushBuffer(msg);
+    }
+
+    // Make the header visible to every rank before anyone queries the file size
+    MPI_File_sync(_fh);
+    MPI_Barrier(MPI_COMM_WORLD);
+    return 0;
+}
+
+/* (Re)allocate a text buffer of the given size; abort the MPI job on failure. */
+static char* vtkGrowBuffer(char* buf, size_t size)
+{
+    char* grown = (char*) realloc(buf, size); // realloc(NULL, n) behaves like malloc(n)
+    if(grown == NULL) {
+        fprintf(stderr, "vtkVector(): out of memory\n");
+        MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+    return grown;
+}
+
+/*
+ * Append the positions of all local atoms to the file opened by vtkOpen()
+ * (collective: each rank writes its own text slice after the current end of
+ * file, ordered by rank), then let rank 0 append the CELLS, CELL_TYPES and
+ * POINT_DATA sections for atom->Natoms points.
+ * param is currently unused (see TODO below).
+ * Returns 0 on success, -1 when no file is open.
+ */
+int vtkVector(Comm* comm, Atom* atom, Parameter* param)
+{
+    if (_fh == MPI_FILE_NULL) {
+        if(comm->myproc==0) printf("vtk not initialize! Call vtkOpen first!\n");
+        return -1;
+    }
+
+    // Initial guess: about 25 characters per "%.4f %.4f %.4f\n" line plus slack
+    int sizebuff = 25 * atom->Nlocal + 100;
+    int mysize = 0;
+    char* msg = vtkGrowBuffer(NULL, sizebuff);
+    msg[0] = '\0';
+
+    for(int i = 0; i < atom->Nlocal; i++) {
+        //TODO: do not forget to add param->xlo, param->ylo, param->zlo
+        int room = sizebuff - mysize;
+        int len = snprintf(msg + mysize, room, "%.4f %.4f %.4f\n", atom_x(i), atom_y(i), atom_z(i));
+        if(len >= room) {
+            // The line did not fit: enlarge the buffer and print it again
+            sizebuff = (mysize + len + 1) * 1.5;
+            msg = vtkGrowBuffer(msg, sizebuff);
+            len = snprintf(msg + mysize, sizebuff - mysize, "%.4f %.4f %.4f\n", atom_x(i), atom_y(i), atom_z(i));
+        }
+        mysize += len;
+    }
+
+    int gatherSize[comm->numproc];
+    MPI_Allgather(&mysize, 1, MPI_INT, gatherSize, 1, MPI_INT, MPI_COMM_WORLD);
+
+    int offset = 0;     // bytes written by lower ranks
+    int globalSize = 0; // bytes written by all ranks
+    for(int i = 0; i < comm->myproc; i++)
+        offset += gatherSize[i];
+    for(int i = 0; i < comm->numproc; i++)
+        globalSize += gatherSize[i];
+
+    MPI_Offset displ;
+    MPI_Datatype FileType;
+    int GlobalSize[] = {globalSize};
+    int LocalSize[]  = {mysize};
+    int Start[] = {offset};
+
+    if(LocalSize[0]>0){
+        MPI_Type_create_subarray(1, GlobalSize, LocalSize, Start, MPI_ORDER_C, MPI_CHAR, &FileType);
+    } else {
+        // Ranks without atoms still take part in the collective calls with an empty type
+        MPI_Type_vector(0,0,0,MPI_CHAR,&FileType);
+    }
+    MPI_Type_commit(&FileType);
+    MPI_File_get_size(_fh, &displ);
+    MPI_File_set_view(_fh, displ, MPI_CHAR, FileType, "native", MPI_INFO_NULL);
+    MPI_File_write_all(_fh, msg, mysize, MPI_CHAR, MPI_STATUS_IGNORE);
+    // Sync before the barrier so rank 0 sees every rank's data in the file size
+    MPI_File_sync(_fh);
+    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_File_set_view(_fh, 0, MPI_CHAR, MPI_CHAR, "native", MPI_INFO_NULL);
+    MPI_Type_free(&FileType);
+    free(msg);
+
+    if (comm->myproc==0){
+        // The trailer has one line per *global* atom, so it needs its own
+        // buffer: the longest line, "1 <int>\n", is at most 14 characters.
+        size_t cap = (size_t) atom->Natoms * 16 + 256;
+        char* tail = vtkGrowBuffer(NULL, cap);
+        size_t pos;
+
+        pos = snprintf(tail, cap, "\n\nCELLS %d %d\n", atom->Natoms, atom->Natoms * 2);
+        for(int i = 0; i < atom->Natoms; i++)
+            pos += snprintf(tail + pos, cap - pos, "1 %d\n", i);
+        flushBuffer(tail);
+
+        pos = snprintf(tail, cap, "\n\nCELL_TYPES %d\n", atom->Natoms);
+        for(int i = 0; i < atom->Natoms; i++)
+            pos += snprintf(tail + pos, cap - pos, "1\n");
+        flushBuffer(tail);
+
+        pos = snprintf(tail, cap, "\n\nPOINT_DATA %d\nSCALARS mass double\nLOOKUP_TABLE default\n", atom->Natoms);
+        for(int i = 0; i < atom->Natoms; i++)
+            pos += snprintf(tail + pos, cap - pos, "1.0\n");
+        snprintf(tail + pos, cap - pos, "\n\n");
+        flushBuffer(tail);
+
+        free(tail);
+    }
+
+    return 0;
+}
+
+/*
+ * Close the VTK file held in _fh, if one is open, and reset the handle.
+ * Safe to call when vtkOpen() failed: printvtk() calls this unconditionally,
+ * and MPI_File_close() must not be given MPI_FILE_NULL.
+ */
+void vtkClose()
+{
+    if(_fh != MPI_FILE_NULL) {
+        MPI_File_close(&_fh);
+    }
+    _fh=MPI_FILE_NULL;
+}
+
+//TODO: print ghost and cluster using MPI
+/*
+ * Write the atoms of one timestep to a VTK file.
+ * With a single process the serial writer is used; otherwise the file is
+ * produced through the MPI-IO path (vtkOpen -> vtkVector -> vtkClose).
+ */
+void printvtk(const char* filename, Comm* comm, Atom* atom ,Parameter* param, int timestep)
+{
+    if(comm->numproc != 1) {
+        vtkOpen(filename, comm, atom, timestep);
+        vtkVector(comm, atom, param);
+        vtkClose();
+    } else {
+        write_data_to_vtk_file(filename, atom, timestep);
+    }
+}
+
+/*
+ * Write the NUL-terminated string msg at the current end of the file held in
+ * _fh (the offset is the size reported by MPI_File_get_size).
+ */
+static inline void flushBuffer(char* msg){
+    MPI_Offset endOfFile;
+
+    MPI_File_get_size(_fh, &endOfFile);
+    MPI_File_write_at(_fh, endOfFile, msg, strlen(msg), MPI_CHAR, MPI_STATUS_IGNORE);
+}
--- a/include_CLANG.mk
+++ b/include_CLANG.mk
@@ -7,6 +7,7 @@ ANSI_CFLAGS += -pedantic
 ANSI_CFLAGS += -Wextra

 CFLAGS   = -Ofast -march=native $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
+#CFLAGS   = -Ofast -march=core-avx2 $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
 #CFLAGS   = -O3 -march=cascadelake $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
 #CFLAGS   = -Ofast $(ANSI_CFLAGS) -g #-Xpreprocessor -fopenmp -g
 ASFLAGS  = -masm=intel
--- a/include_GCC.mk
+++ b/include_GCC.mk
@@ -6,13 +6,29 @@ ANSI_CFLAGS += -std=c99
 ANSI_CFLAGS += -pedantic
 ANSI_CFLAGS += -Wextra

-#CFLAGS   = -O0 -g  -std=c99 -fargument-noalias
+ifeq ($(ISA),AVX512)
+CFLAGS   = -Ofast -mavx512f -mavx512vl -mavx512bw -mavx512dq -mavx512cd -ffast-math -funroll-loops # -fopenmp
 #CFLAGS   = -O3 -march=cascadelake  -ffast-math -funroll-loops # -fopenmp
-CFLAGS   = -Ofast -march=native -ffast-math -funroll-loops # -fopenmp
+endif
+
+ifeq ($(ISA),AVX2)
 #CFLAGS   = -Ofast -march=native -mavx2  -ffast-math -funroll-loops # -fopenmp
-#CFLAGS   = -Ofast -march=native  -ffast-math -funroll-loops # -fopenmp
-#CFLAGS   = -O3 -march=native  -ffast-math -funroll-loops # -fopenmp
 #CFLAGS   = -O3 -march=znver1  -ffast-math -funroll-loops # -fopenmp
+#CFLAGS   = -Ofast -mavx2 -ffast-math -funroll-loops # -fopenmp
+CFLAGS   = -Ofast -mavx2 -mfma -ffast-math -funroll-loops # -fopenmp
+endif
+
+ifeq ($(ISA),AVX)
+CFLAGS   = -Ofast -mavx -ffast-math -funroll-loops # -fopenmp
+endif
+
+ifeq ($(ISA),SSE)
+CFLAGS   = -Ofast -msse4.2 -ffast-math -funroll-loops # -fopenmp
+endif
+
+#CFLAGS   = -O0 -g -std=c99 -fargument-noalias
+#CFLAGS   = -Ofast -march=native -ffast-math -funroll-loops # -fopenmp
+#CFLAGS   = -O3 -march=native  -ffast-math -funroll-loops # -fopenmp
 ASFLAGS  =  #-masm=intel
 LFLAGS   =
 DEFINES  = -D_GNU_SOURCE -DNO_ZMM_INTRIN
--- a/include_ICC.mk
+++ b/include_ICC.mk
@@ -1,13 +1,27 @@
 CC  = icc
 LINKER = $(CC)

-OPENMP  = #-qopenmp
+OPENMP  = -qopenmp
 PROFILE  = #-profile-functions -g  -pg
+
+ifeq ($(ISA),AVX512)
 OPTS      = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
-#OPTS     = -Ofast -xCORE-AVX2  $(PROFILE)
-#OPTS     = -Ofast -xAVX  $(PROFILE)
+endif
+
+ifeq ($(ISA),AVX2)
+OPTS     = -Ofast -xCORE-AVX2  $(PROFILE)
 #OPTS     = -Ofast -xAVX2  $(PROFILE)
-#OPTS     = -Ofast -xSSE4.2 $(PROFILE)
+#OPTS     = -Ofast -march=core-avx2 $(PROFILE)
+endif
+
+ifeq ($(ISA),AVX)
+OPTS     = -Ofast -xAVX  $(PROFILE)
+endif
+
+ifeq ($(ISA),SSE)
+OPTS     = -Ofast -xSSE4.2 $(PROFILE)
+endif
+
 #OPTS     = -Ofast -no-vec $(PROFILE)
 #OPTS     = -Ofast -xHost $(PROFILE)
 CFLAGS   = $(PROFILE) -restrict $(OPENMP) $(OPTS)
--- a/include_ICX.mk
+++ b/include_ICX.mk
@@ -3,13 +3,28 @@ LINKER = $(CC)

 OPENMP  = #-qopenmp
 PROFILE  = #-profile-functions -g  -pg
-#OPTS      = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
-#OPTS     = -Ofast -xCORE-AVX2  $(PROFILE)
-#OPTS     = -Ofast -xAVX  $(PROFILE)
-#OPTS     = -Ofast -xAVX2  $(PROFILE)
-#OPTS     = -Ofast -xSSE4.2 $(PROFILE)
+
+ifeq ($(ISA),AVX512)
+OPTS      = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
+#OPTS      = -Ofast -march=cascadelake -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
+endif
+
+ifeq ($(ISA),AVX2)
+OPTS     = -Ofast -xCORE-AVX2  $(PROFILE)
+#OPTS     = -Ofast -xHost  $(PROFILE)
+#OPTS     = -Ofast -march=core-avx2 $(PROFILE)
+endif
+
+ifeq ($(ISA),AVX)
+OPTS     = -Ofast -xAVX  $(PROFILE)
+endif
+
+ifeq ($(ISA),SSE)
+OPTS     = -Ofast -xSSE4.2 $(PROFILE)
+endif
+
 #OPTS     = -Ofast -no-vec $(PROFILE)
-OPTS     = -Ofast -xHost $(PROFILE)
+#OPTS     = -Ofast -xHost $(PROFILE)
 CFLAGS   = $(PROFILE) $(OPENMP) $(OPTS)
 ASFLAGS  = #-masm=intel
 LFLAGS   = $(PROFILE) $(OPTS) $(OPENMP)
--- a/include_ISA.mk
+++ b/include_ISA.mk
@@ -9,13 +9,15 @@ else ifeq ($(strip $(ISA)), AVX_FMA)
    __ISA_AVX_FMA__=true
    __SIMD_WIDTH_DBL__=4
 else ifeq ($(strip $(ISA)), AVX2)
-    __ISA_AVX2__=true
    #__SIMD_KERNEL__=true
+    __ISA_AVX2__=true
    __SIMD_WIDTH_DBL__=4
 else ifeq ($(strip $(ISA)), AVX512)
    __ISA_AVX512__=true
-    __SIMD_KERNEL__=true
    __SIMD_WIDTH_DBL__=8
+    ifeq ($(strip $(DATA_TYPE)), DP)
+        __SIMD_KERNEL__=true
+    endif
 endif

 # SIMD width is specified in double-precision, hence it may
--- a/include_MPIICC.mk
+++ b/include_MPIICC.mk
@@ -0,0 +1,32 @@
+# Toolchain settings for the Intel MPI compiler wrapper (mpiicc).
+# OPTS is chosen from ISA; the value is stripped before comparison, matching
+# the way include_ISA.mk tests it.
+CC  = mpiicc
+LINKER = $(CC)
+
+OPENMP  = #-qopenmp
+PROFILE  = #-profile-functions -g  -pg
+
+ifeq ($(strip $(ISA)),AVX512)
+OPTS     = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
+else ifeq ($(strip $(ISA)),AVX2)
+OPTS     = -Ofast -xCORE-AVX2 $(PROFILE)
+else ifeq ($(strip $(ISA)),AVX)
+OPTS     = -Ofast -xAVX $(PROFILE)
+else ifeq ($(strip $(ISA)),SSE)
+OPTS     = -Ofast -xSSE4.2 $(PROFILE)
+else
+# No ISA-specific flags are listed for this value: say so and keep optimizing
+# rather than silently building with an empty OPTS.
+$(warning include_MPIICC.mk: no vector flags for ISA='$(ISA)', using plain -Ofast)
+OPTS     = -Ofast $(PROFILE)
+endif
+
+CFLAGS   = $(PROFILE) -restrict $(OPENMP) $(OPTS)
+ASFLAGS  = #-masm=intel
+LFLAGS   = $(PROFILE) $(OPTS) $(OPENMP)
+DEFINES  = -std=c11 -pedantic-errors -D_GNU_SOURCE -DNO_ZMM_INTRIN
+INCLUDES =
+LIBS     = -lm
--- a/lammps/atom.c
+++ b/lammps/atom.c
@@ -9,10 +9,12 @@
 #include <string.h>
 #include <math.h>

+#include <parameter.h>
 #include <atom.h>
 #include <allocate.h>
 #include <device.h>
 #include <util.h>
+#include <mpi.h>

 #define DELTA 20000

@@ -21,10 +23,10 @@
 #endif

 #ifndef MAX
-#define MAX(a,b)    ((a) > (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
 #endif

-void initAtom(Atom *atom) {
+void initAtom(Atom *atom){
    atom->x  = NULL; atom->y  = NULL; atom->z  = NULL;
    atom->vx = NULL; atom->vy = NULL; atom->vz = NULL;
    atom->fx = NULL; atom->fy = NULL; atom->fz = NULL;
@@ -41,6 +43,7 @@ void initAtom(Atom *atom) {
    atom->radius = NULL;
    atom->av = NULL;
    atom->r = NULL;
+    atom->border_map = NULL;

    DeviceAtom *d_atom = &(atom->d_atom);
    d_atom->x  = NULL; d_atom->y  = NULL; d_atom->z  = NULL;
@@ -52,12 +55,19 @@ void initAtom(Atom *atom) {
    d_atom->sigma6 = NULL;
    d_atom->cutforcesq = NULL;
    d_atom->cutneighsq = NULL;
+    //MPI
+    Box *mybox = &(atom->mybox);                  
+    mybox->xprd = mybox->yprd = mybox->zprd = 0;          
+    mybox->lo[_x]  = mybox->lo[_y]  = mybox->lo[_z] = 0;             
+    mybox->hi[_x]  = mybox->hi[_y]  = mybox->hi[_z] = 0;   
 }

 void createAtom(Atom *atom, Parameter *param) {
-    MD_FLOAT xlo = 0.0; MD_FLOAT xhi = param->xprd;
-    MD_FLOAT ylo = 0.0; MD_FLOAT yhi = param->yprd;
-    MD_FLOAT zlo = 0.0; MD_FLOAT zhi = param->zprd;
+    
+    MD_FLOAT xlo = 0; MD_FLOAT xhi = param->xprd;
+    MD_FLOAT ylo = 0; MD_FLOAT yhi = param->yprd;
+    MD_FLOAT zlo = 0; MD_FLOAT zhi = param->zprd;
+    
    atom->Natoms = 4 * param->nx * param->ny * param->nz;
    atom->Nlocal = 0;
    atom->ntypes = param->ntypes;
@@ -107,15 +117,15 @@ void createAtom(Atom *atom, Parameter *param) {
            xtmp = 0.5 * alat * i;
            ytmp = 0.5 * alat * j;
            ztmp = 0.5 * alat * k;
-
+    
            if( xtmp >= xlo && xtmp < xhi &&
                    ytmp >= ylo && ytmp < yhi &&
                    ztmp >= zlo && ztmp < zhi ) {
-
+                
                n = k * (2 * param->ny) * (2 * param->nx) +
                    j * (2 * param->nx) +
                    i + 1;
-
+                
                for(m = 0; m < 5; m++) {
                    myrandom(&n);
                }
@@ -131,7 +141,7 @@ void createAtom(Atom *atom, Parameter *param) {
                }
                vztmp = myrandom(&n);

-                if(atom->Nlocal == atom->Nmax) {
+                while(atom->Nlocal >= atom->Nmax) {
                    growAtom(atom);
                }

@@ -163,38 +173,42 @@ int type_str2int(const char *type) {
    return -1;
 }

-int readAtom(Atom* atom, Parameter* param) {
+// Read the atom configuration from param->input_file.
+// The reader is selected by the file name's extension (.pdb, .gro, .dmp or
+// .in) and its return value is passed through unchanged. For any other name,
+// rank 0 prints an error message and exit(-1) is called.
+int readAtom(Atom *atom, Parameter *param) {
+    int me = 0;
+    MPI_Comm_rank(MPI_COMM_WORLD, &me);
     int len = strlen(param->input_file);
-    if(strncmp(&param->input_file[len - 4], ".pdb", 4) == 0) { return readAtom_pdb(atom, param); }
-    if(strncmp(&param->input_file[len - 4], ".gro", 4) == 0) { return readAtom_gro(atom, param); }
-    if(strncmp(&param->input_file[len - 4], ".dmp", 4) == 0) { return readAtom_dmp(atom, param); }
-    if(strncmp(&param->input_file[len - 3], ".in",  3) == 0) { return readAtom_in(atom, param); }
-    fprintf(stderr, "Invalid input file extension: %s\nValid choices are: pdb, gro, dmp, in\n", param->input_file);
+    // Only inspect the suffix when the name is long enough to contain it, so
+    // that len - 4 / len - 3 can never index in front of the string.
+    if(len >= 4 && strncmp(&param->input_file[len - 4], ".pdb", 4) == 0) { return readAtom_pdb(atom, param); }
+    if(len >= 4 && strncmp(&param->input_file[len - 4], ".gro", 4) == 0) { return readAtom_gro(atom, param); }
+    if(len >= 4 && strncmp(&param->input_file[len - 4], ".dmp", 4) == 0) { return readAtom_dmp(atom, param); }
+    if(len >= 3 && strncmp(&param->input_file[len - 3], ".in",  3) == 0) { return readAtom_in(atom, param); }
+    if(me==0) fprintf(stderr, "Invalid input file extension: %s\nValid choices are: pdb, gro, dmp, in\n", param->input_file);
     exit(-1);
     return -1;
 }

 int readAtom_pdb(Atom* atom, Parameter* param) {
+    int me = 0;
+    MPI_Comm_rank(MPI_COMM_WORLD, &me);
    FILE *fp = fopen(param->input_file, "r");
    char line[MAXLINE];
    int read_atoms = 0;

    if(!fp) {
-        fprintf(stderr, "Could not open input file: %s\n", param->input_file);
+        if(me==0)fprintf(stderr, "Could not open input file: %s\n", param->input_file);
        exit(-1);
        return -1;
    }

    while(!feof(fp)) {
        readline(line, fp);
-        char *item = strtok(line, " ");
+        char *item = strtok(line, "\t ");
        if(strncmp(item, "CRYST1", 6) == 0) {
            param->xlo = 0.0;
-            param->xhi = atof(strtok(NULL, " "));
+            param->xhi = atof(strtok(NULL, "\t "));
            param->ylo = 0.0;
-            param->yhi = atof(strtok(NULL, " "));
+            param->yhi = atof(strtok(NULL, "\t "));
            param->zlo = 0.0;
-            param->zhi = atof(strtok(NULL, " "));
+            param->zhi = atof(strtok(NULL, "\t "));
            param->xprd = param->xhi - param->xlo;
            param->yprd = param->yhi - param->ylo;
            param->zprd = param->zhi - param->zlo;
@@ -203,23 +217,23 @@ int readAtom_pdb(Atom* atom, Parameter* param) {
            char *label;
            int atom_id, comp_id;
            MD_FLOAT occupancy, charge;
-            atom_id = atoi(strtok(NULL, " ")) - 1;
+            atom_id = atoi(strtok(NULL, "\t ")) - 1;

            while(atom_id + 1 >= atom->Nmax) {
                growAtom(atom);
            }

-            atom->type[atom_id] = type_str2int(strtok(NULL, " "));
-            label = strtok(NULL, " ");
-            comp_id = atoi(strtok(NULL, " "));
-            atom_x(atom_id) = atof(strtok(NULL, " "));
-            atom_y(atom_id) = atof(strtok(NULL, " "));
-            atom_z(atom_id) = atof(strtok(NULL, " "));
+            atom->type[atom_id] = type_str2int(strtok(NULL, "\t "));
+            label = strtok(NULL, "\t ");
+            comp_id = atoi(strtok(NULL, "\t "));
+            atom_x(atom_id) = atof(strtok(NULL, "\t "));
+            atom_y(atom_id) = atof(strtok(NULL, "\t "));
+            atom_z(atom_id) = atof(strtok(NULL, "\t "));
            atom_vx(atom_id) = 0.0;
            atom_vy(atom_id) = 0.0;
            atom_vz(atom_id) = 0.0;
-            occupancy = atof(strtok(NULL, " "));
-            charge = atof(strtok(NULL, " "));
+            occupancy = atof(strtok(NULL, "\t "));
+            charge = atof(strtok(NULL, "\t "));
            atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
            atom->Natoms++;
            atom->Nlocal++;
@@ -231,14 +245,14 @@ int readAtom_pdb(Atom* atom, Parameter* param) {
                  strncmp(item, "ENDMDL", 6) == 0) {
            // Do nothing
        } else {
-            fprintf(stderr, "Invalid item: %s\n", item);
+            if(me==0)fprintf(stderr, "Invalid item: %s\n", item);
            exit(-1);
            return -1;
        }
    }

    if(!read_atoms) {
-        fprintf(stderr, "Input error: No atoms read!\n");
+        if(me==0)fprintf(stderr, "Input error: No atoms read!\n");
        exit(-1);
        return -1;
    }
@@ -254,12 +268,15 @@ int readAtom_pdb(Atom* atom, Parameter* param) {
        atom->cutforcesq[i] = param->cutforce * param->cutforce;
    }

-    fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
+    if(me==0)fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
    fclose(fp);
    return read_atoms;
 }

 int readAtom_gro(Atom* atom, Parameter* param) {
+    int me = 0;
+    MPI_Comm_rank(MPI_COMM_WORLD, &me);
+
    FILE *fp = fopen(param->input_file, "r");
    char line[MAXLINE];
    char desc[MAXLINE];
@@ -268,7 +285,7 @@ int readAtom_gro(Atom* atom, Parameter* param) {
    int i = 0;

    if(!fp) {
-        fprintf(stderr, "Could not open input file: %s\n", param->input_file);
+        if(me==0)fprintf(stderr, "Could not open input file: %s\n", param->input_file);
        exit(-1);
        return -1;
    }
@@ -277,26 +294,26 @@ int readAtom_gro(Atom* atom, Parameter* param) {
    for(i = 0; desc[i] != '\n'; i++);
    desc[i] = '\0';
    readline(line, fp);
-    atoms_to_read = atoi(strtok(line, " "));
-    fprintf(stdout, "System: %s with %d atoms\n", desc, atoms_to_read);
+    atoms_to_read = atoi(strtok(line, "\t "));
+    if(me==0)fprintf(stdout, "System: %s with %d atoms\n", desc, atoms_to_read);

    while(!feof(fp) && read_atoms < atoms_to_read) {
        readline(line, fp);
-        char *label = strtok(line, " ");
-        int type = type_str2int(strtok(NULL, " "));
-        int atom_id = atoi(strtok(NULL, " ")) - 1;
+        char *label = strtok(line, "\t ");
+        int type = type_str2int(strtok(NULL, "\t "));
+        int atom_id = atoi(strtok(NULL, "\t ")) - 1;
        atom_id = read_atoms;
        while(atom_id + 1 >= atom->Nmax) {
            growAtom(atom);
        }

        atom->type[atom_id] = type;
-        atom_x(atom_id) = atof(strtok(NULL, " "));
-        atom_y(atom_id) = atof(strtok(NULL, " "));
-        atom_z(atom_id) = atof(strtok(NULL, " "));
-        atom_vx(atom_id) = atof(strtok(NULL, " "));
-        atom_vy(atom_id) = atof(strtok(NULL, " "));
-        atom_vz(atom_id) = atof(strtok(NULL, " "));
+        atom_x(atom_id) = atof(strtok(NULL, "\t "));
+        atom_y(atom_id) = atof(strtok(NULL, "\t "));
+        atom_z(atom_id) = atof(strtok(NULL, "\t "));
+        atom_vx(atom_id) = atof(strtok(NULL, "\t "));
+        atom_vy(atom_id) = atof(strtok(NULL, "\t "));
+        atom_vz(atom_id) = atof(strtok(NULL, "\t "));
        atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
        atom->Natoms++;
        atom->Nlocal++;
@@ -306,18 +323,18 @@ int readAtom_gro(Atom* atom, Parameter* param) {
    if(!feof(fp)) {
        readline(line, fp);
        param->xlo = 0.0;
-        param->xhi = atof(strtok(line, " "));
+        param->xhi = atof(strtok(line, "\t "));
        param->ylo = 0.0;
-        param->yhi = atof(strtok(NULL, " "));
+        param->yhi = atof(strtok(NULL, "\t "));
        param->zlo = 0.0;
-        param->zhi = atof(strtok(NULL, " "));
+        param->zhi = atof(strtok(NULL, "\t "));
        param->xprd = param->xhi - param->xlo;
        param->yprd = param->yhi - param->ylo;
        param->zprd = param->zhi - param->zlo;
    }

    if(read_atoms != atoms_to_read) {
-        fprintf(stderr, "Input error: Number of atoms read do not match (%d/%d).\n", read_atoms, atoms_to_read);
+        if(me==0)fprintf(stderr, "Input error: Number of atoms read do not match (%d/%d).\n", read_atoms, atoms_to_read);
        exit(-1);
        return -1;
    }
@@ -333,12 +350,14 @@ int readAtom_gro(Atom* atom, Parameter* param) {
        atom->cutforcesq[i] = param->cutforce * param->cutforce;
    }

-    fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
+    if(me==0)fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
    fclose(fp);
    return read_atoms;
 }

 int readAtom_dmp(Atom* atom, Parameter* param) {
+    int me = 0;
+    MPI_Comm_rank(MPI_COMM_WORLD, &me);
    FILE *fp = fopen(param->input_file, "r");
    char line[MAXLINE];
    int natoms = 0;
@@ -347,7 +366,7 @@ int readAtom_dmp(Atom* atom, Parameter* param) {
    int ts = -1;

    if(!fp) {
-        fprintf(stderr, "Could not open input file: %s\n", param->input_file);
+        if(me==0)fprintf(stderr, "Could not open input file: %s\n", param->input_file);
        exit(-1);
        return -1;
    }
@@ -370,47 +389,47 @@ int readAtom_dmp(Atom* atom, Parameter* param) {
                }
            } else if(strncmp(item, "BOX BOUNDS pp pp pp", 19) == 0) {
                readline(line, fp);
-                param->xlo = atof(strtok(line, " "));
-                param->xhi = atof(strtok(NULL, " "));
+                param->xlo = atof(strtok(line, "\t "));
+                param->xhi = atof(strtok(NULL, "\t "));
                param->xprd = param->xhi - param->xlo;

                readline(line, fp);
-                param->ylo = atof(strtok(line, " "));
-                param->yhi = atof(strtok(NULL, " "));
+                param->ylo = atof(strtok(line, "\t "));
+                param->yhi = atof(strtok(NULL, "\t "));
                param->yprd = param->yhi - param->ylo;

                readline(line, fp);
-                param->zlo = atof(strtok(line, " "));
-                param->zhi = atof(strtok(NULL, " "));
+                param->zlo = atof(strtok(line, "\t "));
+                param->zhi = atof(strtok(NULL, "\t "));
                param->zprd = param->zhi - param->zlo;
            } else if(strncmp(item, "ATOMS id type x y z vx vy vz", 28) == 0) {
                for(int i = 0; i < natoms; i++) {
                    readline(line, fp);
-                    atom_id = atoi(strtok(line, " ")) - 1;
-                    atom->type[atom_id] = atoi(strtok(NULL, " "));
-                    atom_x(atom_id) = atof(strtok(NULL, " "));
-                    atom_y(atom_id) = atof(strtok(NULL, " "));
-                    atom_z(atom_id) = atof(strtok(NULL, " "));
-                    atom_vx(atom_id) = atof(strtok(NULL, " "));
-                    atom_vy(atom_id) = atof(strtok(NULL, " "));
-                    atom_vz(atom_id) = atof(strtok(NULL, " "));
+                    atom_id = atoi(strtok(line, "\t ")) - 1;
+                    atom->type[atom_id] = atoi(strtok(NULL, "\t "));
+                    atom_x(atom_id) = atof(strtok(NULL, "\t "));
+                    atom_y(atom_id) = atof(strtok(NULL, "\t "));
+                    atom_z(atom_id) = atof(strtok(NULL, "\t "));
+                    atom_vx(atom_id) = atof(strtok(NULL, "\t "));
+                    atom_vy(atom_id) = atof(strtok(NULL, "\t "));
+                    atom_vz(atom_id) = atof(strtok(NULL, "\t "));
                    atom->ntypes = MAX(atom->type[atom_id], atom->ntypes);
                    read_atoms++;
                }
            } else {
-                fprintf(stderr, "Invalid item: %s\n", item);
+                if(me==0)fprintf(stderr, "Invalid item: %s\n", item);
                exit(-1);
                return -1;
            }
        } else {
-            fprintf(stderr, "Invalid input from file, expected item reference but got:\n%s\n", line);
+            if(me==0)fprintf(stderr, "Invalid input from file, expected item reference but got:\n%s\n", line);
            exit(-1);
            return -1;
        }
    }

    if(ts < 0 || !natoms || !read_atoms) {
-        fprintf(stderr, "Input error: atom data was not read!\n");
+        if(me==0)fprintf(stderr, "Input error: atom data was not read!\n");
        exit(-1);
        return -1;
    }
@@ -426,30 +445,34 @@ int readAtom_dmp(Atom* atom, Parameter* param) {
        atom->cutforcesq[i] = param->cutforce * param->cutforce;
    }

-    fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
+    if(me==0)fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
    return natoms;
 }

 int readAtom_in(Atom* atom, Parameter* param) {
+    int me = 0;
+    MPI_Comm_rank(MPI_COMM_WORLD, &me);
    FILE *fp = fopen(param->input_file, "r");
    char line[MAXLINE];
    int natoms = 0;
    int atom_id = 0;

    if(!fp) {
-        fprintf(stderr, "Could not open input file: %s\n", param->input_file);
+        if(me==0) fprintf(stderr, "Could not open input file: %s\n", param->input_file);
        exit(-1);
        return -1;
    }
-
    readline(line, fp);
-    natoms = atoi(strtok(line, " "));
-    param->xlo = atof(strtok(NULL, " "));
-    param->xhi = atof(strtok(NULL, " "));
-    param->ylo = atof(strtok(NULL, " "));
-    param->yhi = atof(strtok(NULL, " "));
-    param->zlo = atof(strtok(NULL, " "));
-    param->zhi = atof(strtok(NULL, " "));
+    natoms = atoi(strtok(line, "\t "));
+    param->xlo = atof(strtok(NULL, "\t "));
+    param->xhi = atof(strtok(NULL, "\t "));
+    param->ylo = atof(strtok(NULL, "\t "));
+    param->yhi = atof(strtok(NULL, "\t "));
+    param->zlo = atof(strtok(NULL, "\t "));
+    param->zhi = atof(strtok(NULL, "\t "));
+    param->xprd = param->xhi - param->xlo; 
+    param->yprd = param->yhi - param->ylo;
+    param->zprd = param->zhi - param->zlo;
    atom->Natoms = natoms;
    atom->Nlocal = natoms;
    atom->ntypes = 1;
@@ -462,27 +485,26 @@ int readAtom_in(Atom* atom, Parameter* param) {
        readline(line, fp);

        // TODO: store mass per atom
-        char *s_mass = strtok(line, " ");
+        char *s_mass = strtok(line, "\t ");
        if(strncmp(s_mass, "inf", 3) == 0) {
            // Set atom's mass to INFINITY
        } else {
            param->mass = atof(s_mass);
        }
-
-        atom->radius[atom_id] = atof(strtok(NULL, " "));
-        atom_x(atom_id) = atof(strtok(NULL, " "));
-        atom_y(atom_id) = atof(strtok(NULL, " "));
-        atom_z(atom_id) = atof(strtok(NULL, " "));
-        atom_vx(atom_id) = atof(strtok(NULL, " "));
-        atom_vy(atom_id) = atof(strtok(NULL, " "));
-        atom_vz(atom_id) = atof(strtok(NULL, " "));
+    
+        atom->radius[atom_id] = atof(strtok(NULL, "\t "));
+        atom_x(atom_id) = atof(strtok(NULL, "\t "));
+        atom_y(atom_id) = atof(strtok(NULL, "\t "));
+        atom_z(atom_id) = atof(strtok(NULL, "\t "));
+        atom_vx(atom_id) = atof(strtok(NULL, "\t "));
+        atom_vy(atom_id) = atof(strtok(NULL, "\t "));
+        atom_vz(atom_id) = atof(strtok(NULL, "\t "));
        atom->type[atom_id] = 0;
        atom->ntypes = MAX(atom->type[atom_id], atom->ntypes);
        atom_id++;
    }
-
    if(!natoms) {
-        fprintf(stderr, "Input error: atom data was not read!\n");
+        if(me==0)fprintf(stderr, "Input error: atom data was not read!\n");
        exit(-1);
        return -1;
    }
@@ -498,7 +520,7 @@ int readAtom_in(Atom* atom, Parameter* param) {
        atom->cutforcesq[i] = param->cutforce * param->cutforce;
    }

-    fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
+    if(me==0)fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
    return natoms;
 }

@@ -530,7 +552,125 @@ void growAtom(Atom *atom) {
    REALLOC(type, int, atom->Nmax * sizeof(int), nold * sizeof(int));

    // DEM
-    atom->radius = (MD_FLOAT *) reallocate(atom->radius, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
+    atom->radius = (MD_FLOAT*) reallocate(atom->radius, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
    atom->av = (MD_FLOAT*) reallocate(atom->av, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
    atom->r  = (MD_FLOAT*) reallocate(atom->r,  ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 4, nold * sizeof(MD_FLOAT) * 4);
 }
+
+/* MPI added*/
+// Pack the positions of `n` atoms into `buf`.
+// Buffer slot k (buf_x/buf_y/buf_z) receives the position of atom list[k],
+// offset along each axis by pbc[axis] times the matching atom->mybox extent
+// (xprd, yprd, zprd).
+void packForward(Atom* atom, int n ,int* list, MD_FLOAT* buf, int* pbc)
+{
+    for(int k = 0; k < n; k++) {
+        const int src = list[k];
+        buf_x(k) = atom_x(src) + pbc[0] * atom->mybox.xprd;
+        buf_y(k) = atom_y(src) + pbc[1] * atom->mybox.yprd;
+        buf_z(k) = atom_z(src) + pbc[2] * atom->mybox.zprd;
+    }
+}
+
+// Unpack `n` positions from `buf` into the atom arrays.
+// Buffer slot k (buf_x/buf_y/buf_z) is written to atom index first + k.
+void unpackForward(Atom* atom, int n, int first, MD_FLOAT* buf)
+{
+    int dst = first;
+    for(int k = 0; k < n; k++, dst++) {
+        atom_x(dst) = buf_x(k);
+        atom_y(dst) = buf_y(k);
+        atom_z(dst) = buf_z(k);
+    }
+}
+
+// Pack atom `i` into `buf` as four consecutive values: x, y and z (each
+// offset by pbc[axis] times the matching atom->mybox extent), followed by the
+// atom type stored as an MD_FLOAT.
+// Returns the number of values written (4).
+int packGhost(Atom* atom, int i, MD_FLOAT* buf, int* pbc)
+{
+    buf[0] = atom_x(i) + pbc[_x] * atom->mybox.xprd;
+    buf[1] = atom_y(i) + pbc[_y] * atom->mybox.yprd;
+    buf[2] = atom_z(i) + pbc[_z] * atom->mybox.zprd;
+    buf[3] = atom->type[i];
+    return 4;
+}
+
+// Unpack one atom from `buf` into slot `i` and count it as a ghost.
+// The atom arrays are grown until index `i` fits, then x, y, z and the type
+// (converted back from MD_FLOAT) are read and atom->Nghost is incremented.
+// Returns the number of buffer values consumed (4).
+int unpackGhost(Atom* atom, int i, MD_FLOAT* buf)
+{
+    while(atom->Nmax <= i) {
+        growAtom(atom);
+    }
+    atom_x(i) = buf[0];
+    atom_y(i) = buf[1];
+    atom_z(i) = buf[2];
+    atom->type[i] = buf[3];
+    atom->Nghost++;
+    return 4;
+}
+
+// Pack the forces of `n` consecutive atoms, starting at index `first`, into
+// `buf`. Buffer slot k (buf_x/buf_y/buf_z) receives the force of atom
+// first + k.
+void packReverse(Atom* atom, int n, int first, MD_FLOAT* buf)
+{
+    for(int k = 0; k < n; k++) {
+        // atom_fx/fy/fz are macros defined elsewhere; pass a plain identifier
+        // so the result does not depend on how they parenthesize their
+        // argument (unpackForward guards the same expression with extra
+        // parentheses).
+        const int src = first + k;
+        buf_x(k) = atom_fx(src);
+        buf_y(k) = atom_fy(src);
+        buf_z(k) = atom_fz(src);
+    }
+}
+
+// Accumulate the forces stored in `buf` onto the atoms named in `list`.
+// The force in buffer slot k (buf_x/buf_y/buf_z) is added to atom list[k].
+void unpackReverse(Atom* atom, int n, int* list, MD_FLOAT* buf)
+{
+    for(int k = 0; k < n; k++) {
+        const int dst = list[k];
+        atom_fx(dst) += buf_x(k);
+        atom_fy(dst) += buf_y(k);
+        atom_fz(dst) += buf_z(k);
+    }
+}
+
+// Pack atom `i` into `buf` as seven consecutive values: position (x, y, z),
+// velocity (vx, vy, vz) and the atom type stored as an MD_FLOAT.
+// Returns the number of values written (7).
+int packExchange(Atom* atom, int i, MD_FLOAT* buf)
+{
+    buf[0] = atom_x(i);
+    buf[1] = atom_y(i);
+    buf[2] = atom_z(i);
+    buf[3] = atom_vx(i);
+    buf[4] = atom_vy(i);
+    buf[5] = atom_vz(i);
+    buf[6] = atom->type[i];
+    return 7;
+}
+
+// Unpack one atom from `buf` into slot `i`.
+// The atom arrays are grown until index `i` fits, then position, velocity and
+// type (converted back from MD_FLOAT) are read in the order written by
+// packExchange.
+// Returns the number of buffer values consumed (7).
+int unpackExchange(Atom* atom, int i, MD_FLOAT* buf)
+{
+    while(atom->Nmax <= i) {
+        growAtom(atom);
+    }
+    atom_x(i) = buf[0];
+    atom_y(i) = buf[1];
+    atom_z(i) = buf[2];
+    atom_vx(i) = buf[3];
+    atom_vy(i) = buf[4];
+    atom_vz(i) = buf[5];
+    atom->type[i] = buf[6];
+    return 7;
+}
+
+// Wrap the coordinates of the local atoms (indices 0 .. atom->Nlocal - 1)
+// using the atom->mybox extents: a negative coordinate is shifted up by one
+// box length, a coordinate at or above the box length is shifted down by one.
+// Each test applies a single shift, so coordinates more than one box length
+// outside are not fully wrapped.
+void pbc(Atom* atom)
+{
+    // The box extents are not modified in the loop; read them once.
+    const MD_FLOAT len_x = atom->mybox.xprd;
+    const MD_FLOAT len_y = atom->mybox.yprd;
+    const MD_FLOAT len_z = atom->mybox.zprd;
+    for(int k = 0; k < atom->Nlocal; k++) {
+        if(atom_x(k) < 0.0) { atom_x(k) += len_x; }
+        if(atom_x(k) >= len_x) { atom_x(k) -= len_x; }
+        if(atom_y(k) < 0.0) { atom_y(k) += len_y; }
+        if(atom_y(k) >= len_y) { atom_y(k) -= len_y; }
+        if(atom_z(k) < 0.0) { atom_z(k) += len_z; }
+        if(atom_z(k) >= len_z) { atom_z(k) -= len_z; }
+    }
+}
+
+// Overwrite atom `i` with atom `j`: type, position and velocity are copied.
+void copy(Atom* atom, int i, int j)
+{
+    const int dst = i;
+    const int src = j;
+    atom->type[dst] = atom->type[src];
+    // Position
+    atom_x(dst) = atom_x(src);
+    atom_y(dst) = atom_y(src);
+    atom_z(dst) = atom_z(src);
+    // Velocity
+    atom_vx(dst) = atom_vx(src);
+    atom_vy(dst) = atom_vy(src);
+    atom_vz(dst) = atom_vz(src);
+}
--- a/lammps/cuda/force.cu
+++ b/lammps/cuda/force.cu
@@ -29,7 +29,7 @@ extern "C" {
 }

 // cuda kernel
-__global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh) {
+__global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh, int ntypes) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i >= Nlocal) {
        return;
@@ -46,6 +46,10 @@ __global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, M
    MD_FLOAT fiy = 0;
    MD_FLOAT fiz = 0;

+#ifdef EXPLICIT_TYPES
+    const int type_i = atom->type[i];
+#endif
+
    for(int k = 0; k < numneighs; k++) {
        int j = neigh_neighbors[Nlocal * k + i];
        MD_FLOAT delx = xtmp - atom_x(j);
@@ -55,7 +59,7 @@ __global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, M

 #ifdef EXPLICIT_TYPES
        const int type_j = atom->type[j];
-        const int type_ij = type_i * atom->ntypes + type_j;
+        const int type_ij = type_i * ntypes + type_j;
        const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
        const MD_FLOAT sigma6 = atom->sigma6[type_ij];
        const MD_FLOAT epsilon = atom->epsilon[type_ij];
@@ -109,7 +113,7 @@ extern "C" {

 void finalIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
    const int Nlocal = atom->Nlocal;
-    const int num_threads_per_block = get_num_threads();
+    const int num_threads_per_block = get_cuda_num_threads();
    const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);

    kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, atom->d_atom);
@@ -123,7 +127,7 @@ void finalIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {

 void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
    const int Nlocal = atom->Nlocal;
-    const int num_threads_per_block = get_num_threads();
+    const int num_threads_per_block = get_cuda_num_threads();
    const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);

    kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, atom->d_atom);
@@ -136,13 +140,11 @@ void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
 }

 double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neighbor) {
-    const int num_threads_per_block = get_num_threads();
+    const int num_threads_per_block = get_cuda_num_threads();
    int Nlocal = atom->Nlocal;
-#ifndef EXPLICIT_TYPES
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
    MD_FLOAT sigma6 = param->sigma6;
    MD_FLOAT epsilon = param->epsilon;
-#endif

    /*
    int nDevices;
@@ -165,7 +167,7 @@ double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neig
    double S = getTimeStamp();
    LIKWID_MARKER_START("force");

-    calc_force <<< num_blocks, num_threads_per_block >>> (atom->d_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, neighbor->d_neighbor.neighbors, neighbor->d_neighbor.numneigh);
+    calc_force <<< num_blocks, num_threads_per_block >>> (atom->d_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, neighbor->d_neighbor.neighbors, neighbor->d_neighbor.numneigh, atom->ntypes);
    cuda_assert("calc_force", cudaPeekAtLastError());
    cuda_assert("calc_force", cudaDeviceSynchronize());
    cudaProfilerStop();
--- a/lammps/cuda/neighbor.cu
+++ b/lammps/cuda/neighbor.cu
@@ -120,7 +120,7 @@ __global__ void binatoms_kernel(DeviceAtom a, int nall, int* bincount, int* bins

 __global__ void compute_neighborhood(
    DeviceAtom a, DeviceNeighbor neigh, Neighbor_params np, int nlocal, int maxneighs, int nstencil, int* stencil,
-    int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq) {
+    int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq, int ntypes) {

    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i >= nlocal) {
@@ -157,7 +157,7 @@ __global__ void compute_neighborhood(

 #ifdef EXPLICIT_TYPES
            int type_j = atom->type[j];
-            const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j];
+            const MD_FLOAT cutoff = atom->cutneighsq[type_i * ntypes + type_j];
 #else
            const MD_FLOAT cutoff = cutneighsq;
 #endif
@@ -206,7 +206,7 @@ void binatoms_cuda(Atom *atom, Binning *c_binning, int *c_resize_needed, Neighbo

 void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor) {
    DeviceNeighbor *d_neighbor = &(neighbor->d_neighbor);
-    const int num_threads_per_block = get_num_threads();
+    const int num_threads_per_block = get_cuda_num_threads();
    int nall = atom->Nlocal + atom->Nghost;

    cudaProfilerStart();
@@ -269,7 +269,7 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor) {
                                                                    np, atom->Nlocal, neighbor->maxneighs, nstencil, c_stencil,
                                                                    c_binning.bins, c_binning.atoms_per_bin, c_binning.bincount,
                                                                    c_new_maxneighs,
-								                                    cutneighsq);
+								                                    cutneighsq, atom->ntypes);

        cuda_assert("compute_neighborhood", cudaPeekAtLastError());
        cuda_assert("compute_neighborhood", cudaDeviceSynchronize());
--- a/lammps/cuda/pbc.cu
+++ b/lammps/cuda/pbc.cu
@@ -65,7 +65,7 @@ __global__ void computePbcUpdate(DeviceAtom a, int nlocal, int nghost, int* PBCx
 /* update coordinates of ghost atoms */
 /* uses mapping created in setupPbc */
 void updatePbc_cuda(Atom *atom, Parameter *param, bool reneigh) {
-    const int num_threads_per_block = get_num_threads();
+    const int num_threads_per_block = get_cuda_num_threads();

    if(reneigh) {
        memcpyToGPU(atom->d_atom.x,     atom->x,    sizeof(MD_FLOAT) * atom->Nmax * 3);
@@ -98,7 +98,7 @@ void updatePbc_cuda(Atom *atom, Parameter *param, bool reneigh) {
 }

 void updateAtomsPbc_cuda(Atom* atom, Parameter *param) {
-    const int num_threads_per_block = get_num_threads();
+    const int num_threads_per_block = get_cuda_num_threads();
    MD_FLOAT xprd = param->xprd;
    MD_FLOAT yprd = param->yprd;
    MD_FLOAT zprd = param->zprd;
--- a/lammps/device_spec.c
+++ b/lammps/device_spec.c
@@ -14,6 +14,7 @@ void initDevice(Atom *atom, Neighbor *neighbor) {

    d_atom->epsilon         =   (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
    d_atom->sigma6          =   (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    d_atom->cutneighsq      =   (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
    d_atom->cutforcesq      =   (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
    d_neighbor->neighbors   =   (int *) allocateGPU(sizeof(int) * atom->Nmax * neighbor->maxneighs);
    d_neighbor->numneigh    =   (int *) allocateGPU(sizeof(int) * atom->Nmax);
@@ -22,6 +23,7 @@ void initDevice(Atom *atom, Neighbor *neighbor) {
    memcpyToGPU(d_atom->vx,             atom->vx,         sizeof(MD_FLOAT) * atom->Nmax * 3);
    memcpyToGPU(d_atom->sigma6,         atom->sigma6,     sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
    memcpyToGPU(d_atom->epsilon,        atom->epsilon,    sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
+    memcpyToGPU(d_atom->cutneighsq,     atom->cutneighsq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
    memcpyToGPU(d_atom->cutforcesq,     atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
    memcpyToGPU(d_atom->type,           atom->type,       sizeof(int) * atom->Nmax);
 }
--- a/lammps/force_eam.c
+++ b/lammps/force_eam.c
@@ -31,8 +31,12 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
    int nrho = eam->nrho; int nrho_tot = eam->nrho_tot;
    double S = getTimeStamp();

+
+    #pragma omp parallel
+    {
    LIKWID_MARKER_START("force_eam_fp");
-    #pragma omp parallel for
+
+    #pragma omp for
    for(int i = 0; i < Nlocal; i++) {
        neighs = &neighbor->neighbors[i * neighbor->maxneighs];
        int numneighs = neighbor->numneigh[i];
@@ -95,13 +99,19 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
    }

    LIKWID_MARKER_STOP("force_eam_fp");
+    }

    // We still need to update fp for PBC atoms
    for(int i = 0; i < atom->Nghost; i++) {
        fp[Nlocal + i] = fp[atom->border_map[i]];
    }

+
+    #pragma omp parallel
+    {
    LIKWID_MARKER_START("force_eam");
+
+    #pragma omp for
    for(int i = 0; i < Nlocal; i++) {
        neighs = &neighbor->neighbors[i * neighbor->maxneighs];
        int numneighs = neighbor->numneigh[i];
@@ -192,6 +202,8 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
    }

    LIKWID_MARKER_STOP("force_eam");
+    }
+
    double E = getTimeStamp();
    return E-S;
 }
--- a/lammps/force_lj.c
+++ b/lammps/force_lj.c
@@ -13,44 +13,53 @@
 #include <parameter.h>
 #include <stats.h>
 #include <timing.h>
-
+#include <mpi.h>
+#include <util.h>
 #ifdef __SIMD_KERNEL__
 #include <simd.h>
 #endif

+void computeForceGhostShell(Parameter*, Atom*, Neighbor*);
+
 double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
    int Nlocal = atom->Nlocal;
+    int Nghost = atom->Nghost;
    int* neighs;
    #ifndef EXPLICIT_TYPES
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
    MD_FLOAT sigma6 = param->sigma6;
    MD_FLOAT epsilon = param->epsilon;
    #endif
+    const MD_FLOAT num1 = 1.0;
+    const MD_FLOAT num48 = 48.0;
+    const MD_FLOAT num05 = 0.5;

    for(int i = 0; i < Nlocal; i++) {
        atom_fx(i) = 0.0;
        atom_fy(i) = 0.0;
        atom_fz(i) = 0.0;
    }
-
    double S = getTimeStamp();
+
+    #pragma omp parallel
+    {
    LIKWID_MARKER_START("force");

-    #pragma omp parallel for
+    #pragma omp for
    for(int i = 0; i < Nlocal; i++) {
        neighs = &neighbor->neighbors[i * neighbor->maxneighs];
        int numneighs = neighbor->numneigh[i];
        MD_FLOAT xtmp = atom_x(i);
        MD_FLOAT ytmp = atom_y(i);
        MD_FLOAT ztmp = atom_z(i);
-        MD_FLOAT fix = 0;
-        MD_FLOAT fiy = 0;
-        MD_FLOAT fiz = 0;
-
+        MD_FLOAT fix = 0.0;
+        MD_FLOAT fiy = 0.0;
+        MD_FLOAT fiz = 0.0;
+        
        #ifdef EXPLICIT_TYPES
        const int type_i = atom->type[i];
        #endif
-
+        
        for(int k = 0; k < numneighs; k++) {
            int j = neighs[k];
            MD_FLOAT delx = xtmp - atom_x(j);
@@ -65,53 +74,66 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
            const MD_FLOAT sigma6 = atom->sigma6[type_ij];
            const MD_FLOAT epsilon = atom->epsilon[type_ij];
            #endif
-
            if(rsq < cutforcesq) {
-                MD_FLOAT sr2 = 1.0 / rsq;
+                MD_FLOAT sr2 = num1 / rsq;
                MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
-                MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
+                MD_FLOAT force = num48 * sr6 * (sr6 - num05) * sr2 * epsilon;  
                fix += delx * force;
                fiy += dely * force;
-                fiz += delz * force;
+                fiz += delz * force; 
+        
            #ifdef USE_REFERENCE_VERSION
                addStat(stats->atoms_within_cutoff, 1);
            } else {
                addStat(stats->atoms_outside_cutoff, 1);
            #endif
            }
-        }
-
+        }              
        atom_fx(i) += fix;
        atom_fy(i) += fiy;
        atom_fz(i) += fiz;
+        
+        #ifdef USE_REFERENCE_VERSION
+        if(numneighs % VECTOR_WIDTH > 0) {
+            addStat(stats->atoms_outside_cutoff, VECTOR_WIDTH - (numneighs % VECTOR_WIDTH));
+        }
+        #endif

        addStat(stats->total_force_neighs, numneighs);
        addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
    }

    LIKWID_MARKER_STOP("force");
+    }
    double E = getTimeStamp();
    return E-S;
 }

 double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
    int Nlocal = atom->Nlocal;
+    int Nghost = atom->Nghost;
    int* neighs;
    #ifndef EXPLICIT_TYPES
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
    MD_FLOAT sigma6 = param->sigma6;
    MD_FLOAT epsilon = param->epsilon;
    #endif
+    const MD_FLOAT num1 = 1.0;
+    const MD_FLOAT num48 = 48.0;
+    const MD_FLOAT num05 = 0.5;

-    for(int i = 0; i < Nlocal; i++) {
+    for(int i = 0; i < Nlocal+Nghost; i++) {
        atom_fx(i) = 0.0;
        atom_fy(i) = 0.0;
        atom_fz(i) = 0.0;
    }
-
    double S = getTimeStamp();
+
+    #pragma omp parallel
+    {
    LIKWID_MARKER_START("forceLJ-halfneigh");

+    #pragma omp for
    for(int i = 0; i < Nlocal; i++) {
        neighs = &neighbor->neighbors[i * neighbor->maxneighs];
        int numneighs = neighbor->numneigh[i];
@@ -146,22 +168,20 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
            #endif

            if(rsq < cutforcesq) {
-                MD_FLOAT sr2 = 1.0 / rsq;
+                MD_FLOAT sr2 = num1 / rsq;
                MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
-                MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
+                MD_FLOAT force = num48 * sr6 * (sr6 - num05) * sr2 * epsilon;
                fix += delx * force;
                fiy += dely * force;
                fiz += delz * force;
-
-                // We do not need to update forces for ghost atoms
-                if(j < Nlocal) {
+                // We need to update forces for ghost atoms if a shell method or half stencil is required
+                if((param->half_neigh && j<Nlocal) || param->method){
                    atom_fx(j) -= delx * force;
                    atom_fy(j) -= dely * force;
                    atom_fz(j) -= delz * force;
                }
            }
        }
-
        atom_fx(i) += fix;
        atom_fy(i) += fiy;
        atom_fz(i) += fiz;
@@ -170,7 +190,10 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
        addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
    }

+    if(param->method == eightShell) computeForceGhostShell(param, atom, neighbor); 
    LIKWID_MARKER_STOP("forceLJ-halfneigh");
+    }
+
    double E = getTimeStamp();
    return E-S;
 }
@@ -189,7 +212,6 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig
    }

    double S = getTimeStamp();
-    LIKWID_MARKER_START("force");

    #ifndef __SIMD_KERNEL__
    fprintf(stderr, "Error: SIMD kernel not implemented for specified instruction set!");
@@ -201,7 +223,12 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig
    MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
    MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);

-    #pragma omp parallel for
+
+    #pragma omp parallel
+    {
+    LIKWID_MARKER_START("force");
+
+    #pragma omp for
    for(int i = 0; i < Nlocal; i++) {
        neighs = &neighbor->neighbors[i * neighbor->maxneighs];
        int numneighs = neighbor->numneigh[i];
@@ -242,9 +269,66 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig
        atom_fy(i) += simd_h_reduce_sum(fiy);
        atom_fz(i) += simd_h_reduce_sum(fiz);
    }
-    #endif

    LIKWID_MARKER_STOP("force");
+    }
+    #endif
+
    double E = getTimeStamp();
    return E-S;
 }
+
+/* Adds Lennard-Jones pair forces for the atoms in neighbor->listshell, updating both atoms of every pair within the cutoff. */
+void computeForceGhostShell(Parameter *param, Atom *atom, Neighbor *neighbor) {
+    int Nshell = neighbor->Nshell;
+    #ifndef EXPLICIT_TYPES
+    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
+    MD_FLOAT sigma6 = param->sigma6;
+    MD_FLOAT epsilon = param->epsilon;
+    #endif
+    const MD_FLOAT num1 = 1.0;
+    const MD_FLOAT num48 = 48.0;
+    const MD_FLOAT num05 = 0.5;
+    // computeForceLJHalfNeigh calls this inside its omp parallel region: one thread only, so forces are added once.
+    #pragma omp single
+    for(int i = 0; i < Nshell; i++) {
+        const int* neighs = &(neighbor->neighshell[i * neighbor->maxneighs]);
+        int numneigh = neighbor->numNeighShell[i];
+        int iatom = neighbor->listshell[i];
+        MD_FLOAT xtmp = atom_x(iatom);
+        MD_FLOAT ytmp = atom_y(iatom);
+        MD_FLOAT ztmp = atom_z(iatom);
+        MD_FLOAT fix = 0, fiy = 0, fiz = 0;
+        #ifdef EXPLICIT_TYPES
+        const int type_i = atom->type[iatom];   // i is the shell-list position; the type array is per atom
+        #endif
+        for(int k = 0; k < numneigh; k++) {
+            int jatom = neighs[k];
+            MD_FLOAT delx = xtmp - atom_x(jatom);
+            MD_FLOAT dely = ytmp - atom_y(jatom);
+            MD_FLOAT delz = ztmp - atom_z(jatom);
+            MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+            #ifdef EXPLICIT_TYPES
+            const int type_ij = type_i * atom->ntypes + atom->type[jatom];
+            const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
+            const MD_FLOAT sigma6 = atom->sigma6[type_ij];
+            const MD_FLOAT epsilon = atom->epsilon[type_ij];   // NOTE(review): assumes Atom has a per-type-pair epsilon array - confirm
+            #endif
+            if(rsq < cutforcesq) {
+                MD_FLOAT sr2 = num1 / rsq;
+                MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
+                MD_FLOAT force = num48 * sr6 * (sr6 - num05) * sr2 * epsilon;
+                fix += delx * force;
+                fiy += dely * force;
+                fiz += delz * force;
+                atom_fx(jatom) -= delx * force;   // opposite contribution goes to the partner atom
+                atom_fy(jatom) -= dely * force;
+                atom_fz(jatom) -= delz * force;
+            }
+        }
+        atom_fx(iatom) += fix;
+        atom_fy(iatom) += fiy;
+        atom_fz(iatom) += fiz;
+    }
+}
+
--- a/lammps/includes/atom.h
+++ b/lammps/includes/atom.h
@@ -4,8 +4,9 @@
 * Use of this source code is governed by a LGPL-3.0
 * license that can be found in the LICENSE file.
 */
-#include <parameter.h>

+#include <box.h>
+#include <parameter.h>
 #ifndef __ATOM_H_
 #define __ATOM_H_

@@ -56,6 +57,8 @@ typedef struct {
    MD_FLOAT *sigma6;
    MD_FLOAT *cutforcesq;
    MD_FLOAT *cutneighsq;
+    //TODO: insert the id number
+    //MD_FLOAT *Atom_id;

    // DEM
    MD_FLOAT *radius;
@@ -64,6 +67,9 @@ typedef struct {

    // Device data
    DeviceAtom d_atom;
+     
+    //Info Subdomain
+    Box mybox;            
 } Atom;

 extern void initAtom(Atom*);
@@ -75,6 +81,17 @@ extern int readAtom_dmp(Atom*, Parameter*);
 extern int readAtom_in(Atom*, Parameter*);
 extern void growAtom(Atom*);

+int  packGhost(Atom*, int, MD_FLOAT*, int*);
+int  unpackGhost(Atom*, int, MD_FLOAT*);
+int  packExchange(Atom*, int, MD_FLOAT*);
+int  unpackExchange(Atom*, int, MD_FLOAT*);
+void packForward(Atom*, int, int*, MD_FLOAT*, int*); 
+void unpackForward(Atom*, int, int, MD_FLOAT*);
+void packReverse(Atom* , int , int , MD_FLOAT*);
+void unpackReverse(Atom*, int, int*, MD_FLOAT*);
+void pbc(Atom*);
+void copy(Atom*, int, int);
+
 #ifdef AOS
 #   define POS_DATA_LAYOUT     "AoS"
 #   define atom_x(i)           atom->x[(i) * 3 + 0]
@@ -99,4 +116,8 @@ extern void growAtom(Atom*);
 #   define atom_fz(i)          atom->fz[i]
 #endif

+#   define buf_x(i)            buf[3*(i)] 
+#   define buf_y(i)            buf[3*(i)+1]
+#   define buf_z(i)            buf[3*(i)+2]
+
 #endif
--- a/lammps/includes/neighbor.h
+++ b/lammps/includes/neighbor.h
@@ -20,9 +20,14 @@ typedef struct {
    int ncalls;
    int maxneighs;
    int half_neigh;
+    int half_stencil;
    int *neighbors;
    int *numneigh;
-
+    //MPI
+    int Nshell;         //# of atoms in listShell
+    int *numNeighShell; //# of neighs for each atom in listShell
+    int *neighshell;    //list of neighs for each atom in listShell
+    int *listshell;     //Atoms to compute the force
    // Device data
    DeviceNeighbor d_neighbor;
 } Neighbor;
--- a/lammps/includes/vtk.h
+++ b/lammps/includes/vtk.h
@@ -5,8 +5,11 @@
 * license that can be found in the LICENSE file.
 */
 #include <atom.h>
+#include <comm.h>
+#include <parameter.h>

 #ifndef __VTK_H_
 #define __VTK_H_
 extern int write_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
+extern void printvtk(const char* filename, Comm* comm, Atom* atom ,Parameter* param, int timestep);
 #endif
--- a/lammps/main-stub.c
+++ b/lammps/main-stub.c
@@ -59,12 +59,6 @@ void init(Parameter *param) {
    param->eam_file = NULL;
 }

-// Show debug messages
-#define DEBUG(msg)  printf(msg)
-// Do not show debug messages
-//#define DEBUG(msg)
-
-
 void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps) {
    const int maxneighs = nneighs * nreps;
    neighbor->numneigh = (int*) malloc(atom->Nmax * sizeof(int));
@@ -125,7 +119,7 @@ int main(int argc, const char *argv[]) {

    LIKWID_MARKER_INIT;
    LIKWID_MARKER_REGISTER("force");
-    DEBUG("Initializing parameters...\n");
+    DEBUG_MESSAGE("Initializing parameters...\n");
    init(&param);

    for(int i = 0; i < argc; i++) {
@@ -196,11 +190,11 @@ int main(int argc, const char *argv[]) {
    }

    if(param.force_field == FF_EAM) {
-        DEBUG("Initializing EAM parameters...\n");
+        DEBUG_MESSAGE("Initializing EAM parameters...\n");
        initEam(&eam, &param);
    }

-    DEBUG("Initializing atoms...\n");
+    DEBUG_MESSAGE("Initializing atoms...\n");
    initAtom(atom);
    initStats(&stats);

@@ -216,7 +210,7 @@ int main(int argc, const char *argv[]) {
        atom->cutforcesq[i] = param.cutforce * param.cutforce;
    }

-    DEBUG("Creating atoms...\n");
+    DEBUG_MESSAGE("Creating atoms...\n");
    for(int i = 0; i < natoms; ++i) {
        while(atom->Nlocal > atom->Nmax - natoms) {
            growAtom(atom);
@@ -247,11 +241,11 @@ int main(int argc, const char *argv[]) {
        printf("Estimated neighborlist data volume (kB): %.4f\n", estim_neighbors_volume / 1000.0);
    }

-    DEBUG("Initializing neighbor lists...\n");
+    DEBUG_MESSAGE("Initializing neighbor lists...\n");
    initNeighbor(&neighbor, &param);
-    DEBUG("Creating neighbor lists...\n");
+    DEBUG_MESSAGE("Creating neighbor lists...\n");
    createNeighbors(atom, &neighbor, pattern, nneighs, nreps);
-    DEBUG("Computing forces...\n");
+    DEBUG_MESSAGE("Computing forces...\n");

    double T_accum = 0.0;
    for(int i = 0; i < param.ntimes; i++) {
--- a/lammps/main.c
+++ b/lammps/main.c
@@ -11,9 +11,7 @@
 #include <limits.h>
 #include <math.h>
 #include <float.h>
-
 #include <likwid-marker.h>
-
 #include <allocate.h>
 #include <atom.h>
 #include <device.h>
@@ -23,13 +21,19 @@
 #include <timing.h>
 #include <neighbor.h>
 #include <parameter.h>
-#include <pbc.h>
 #include <stats.h>
 #include <timers.h>
 #include <util.h>
 #include <vtk.h>
+#include <comm.h>
+#include <grid.h>
+#include <shell_methods.h>
+#include <mpi.h>

 #define HLINE "----------------------------------------------------------------------------\n"
+#ifdef CUDA_TARGET
+extern double computeForceLJFullNeigh_cuda(Parameter*, Atom*, Neighbor*);
+#endif

 extern double computeForceLJFullNeigh_plain_c(Parameter*, Atom*, Neighbor*, Stats*);
 extern double computeForceLJFullNeigh_simd(Parameter*, Atom*, Neighbor*, Stats*);
@@ -37,62 +41,6 @@ extern double computeForceLJHalfNeigh(Parameter*, Atom*, Neighbor*, Stats*);
 extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
 extern double computeForceDemFullNeigh(Parameter*, Atom*, Neighbor*, Stats*);

-#ifdef CUDA_TARGET
-extern double computeForceLJFullNeigh_cuda(Parameter*, Atom*, Neighbor*);
-#endif
-
-double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
-    if(param->force_field == FF_EAM) { initEam(eam, param); }
-    double S, E;
-    param->lattice = pow((4.0 / param->rho), (1.0 / 3.0));
-    param->xprd = param->nx * param->lattice;
-    param->yprd = param->ny * param->lattice;
-    param->zprd = param->nz * param->lattice;
-
-    S = getTimeStamp();
-    initAtom(atom);
-    initPbc(atom);
-    initStats(stats);
-    initNeighbor(neighbor, param);
-    if(param->input_file == NULL) {
-        createAtom(atom, param);
-    } else {
-        readAtom(atom, param);
-    }
-
-    setupNeighbor(param);
-    setupThermo(param, atom->Natoms);
-    if(param->input_file == NULL) { adjustThermo(param, atom); }
-    setupPbc(atom, param);
-    initDevice(atom, neighbor);
-    updatePbc(atom, param, true);
-    buildNeighbor(atom, neighbor);
-    E = getTimeStamp();
-    return E-S;
-}
-
-double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
-    double S, E;
-    S = getTimeStamp();
-    LIKWID_MARKER_START("reneighbour");
-    updateAtomsPbc(atom, param);
-    setupPbc(atom, param);
-    updatePbc(atom, param, true);
-    //sortAtom(atom);
-    buildNeighbor(atom, neighbor);
-    LIKWID_MARKER_STOP("reneighbour");
-    E = getTimeStamp();
-    return E-S;
-}
-
-void printAtomState(Atom *atom) {
-    printf("Atom counts: Natoms=%d Nlocal=%d Nghost=%d Nmax=%d\n", atom->Natoms, atom->Nlocal, atom->Nghost, atom->Nmax);
-    // int nall = atom->Nlocal + atom->Nghost;
-    // for (int i=0; i<nall; i++) {
-    //     printf("%d  %f %f %f\n", i, atom->x[i], atom->y[i], atom->z[i]);
-    // }
-}
-
 double computeForce(Eam *eam, Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
    if(param->force_field == FF_EAM) {
        return computeForceEam(eam, param, atom, neighbor, stats);
@@ -105,7 +53,7 @@ double computeForce(Eam *eam, Parameter *param, Atom *atom, Neighbor *neighbor,
        }
    }

-    if(param->half_neigh) {
+    if(param->half_neigh || param->method) {
        return computeForceLJHalfNeigh(param, atom, neighbor, stats);
    }

@@ -116,6 +64,102 @@ double computeForce(Eam *eam, Parameter *param, Atom *atom, Neighbor *neighbor,
    #endif
 }

+/* Runs the balancing routine selected by param->balance, then neighComm; returns the getTimeStamp() interval spent here. */
+double dynamicBalance(Comm* comm, Grid* grid, Atom* atom, Parameter* param, double time){
+    const int dims = 3;   //TODO: Adjust to do in 3d and 2d
+    const double tStart = getTimeStamp();
+    const int scheme = param->balance;
+
+    if(scheme == Staggered) {
+        staggeredBalance(grid, atom, param, time);
+        neighComm(comm, param, grid);
+        exchangeComm(comm, atom);   // only this scheme is followed by an exchange here
+    } else if(scheme == meanTimeRCB) {
+        rcbBalance(grid, atom, param, meanTimeBisect, dims, time);
+        neighComm(comm, param, grid);
+    } else if(scheme == RCB) {
+        rcbBalance(grid, atom, param, meanBisect, dims, 0);
+        neighComm(comm, param, grid);
+    }   // any other value: nothing to do
+    //printGrid(grid);
+    return getTimeStamp() - tStart;
+}
+
+/* Start-up balancing: rcbBalance with meanBisect for both RCB schemes, then sums Nlocal over all ranks into Natoms; returns the getTimeStamp() interval. */
+double initialBalance(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats, Comm *comm, Grid *grid) {
+    int rank;
+    MPI_Comm_rank(world, &rank);
+    const double tStart = getTimeStamp();
+    const int useRcb = (param->balance == RCB) || (param->balance == meanTimeRCB);
+    if(useRcb) {
+        rcbBalance(grid, atom, param, meanBisect, 3, 0);
+        neighComm(comm, param, grid);
+    }
+    // Every rank contributes its local count and receives the global total.
+    MPI_Allreduce(&atom->Nlocal, &atom->Natoms, 1, MPI_INT, MPI_SUM, world);
+    printf("Processor:%i, Local atoms:%i, Total atoms:%i\n", rank, atom->Nlocal, atom->Natoms);
+    MPI_Barrier(world);
+    return getTimeStamp() - tStart;
+}
+
+/* Initialises atoms, grid, communication and neighbour lists; returns the getTimeStamp() interval measured from just before initAtom. */
+double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats, Comm *comm, Grid *grid) {
+    if(param->force_field == FF_EAM) {
+        initEam(eam, param);
+    }
+    // Lattice spacing follows from rho; each box length is the cell count times that spacing.
+    param->lattice = pow((4.0 / param->rho), (1.0 / 3.0));
+    param->xprd = param->nx * param->lattice;
+    param->yprd = param->ny * param->lattice;
+    param->zprd = param->nz * param->lattice;
+    const double tStart = getTimeStamp();
+    initAtom(atom);
+    initStats(stats);
+    initNeighbor(neighbor, param);
+    if(param->input_file != NULL) {
+        readAtom(atom, param);
+    } else {
+        createAtom(atom, param);
+    }
+    setupGrid(grid, atom, param);
+    setupNeighbor(param);
+    setupComm(comm, param, grid);
+    if(param->balance) { initialBalance(param, eam, atom, neighbor, stats, comm, grid); }
+    setupThermo(param, atom->Natoms);
+    if(!param->input_file) { adjustThermo(param, atom); }
+    #ifdef SORT_ATOMS
+    atom->Nghost = 0;
+    sortAtom(atom);
+    #endif
+    initDevice(atom, neighbor);
+    ghostNeighbor(comm, atom, param);
+    buildNeighbor(atom, neighbor);
+    return getTimeStamp() - tStart;
+}
+
+/* Calls ghostNeighbor and buildNeighbor (after an optional sort) inside the "reneighbour" LIKWID region; returns the getTimeStamp() interval. */
+double reneighbour(Comm* comm, Parameter *param, Atom *atom, Neighbor *neighbor) {
+    const double tStart = getTimeStamp();
+    LIKWID_MARKER_START("reneighbour");
+    #ifdef SORT_ATOMS
+    atom->Nghost = 0;   // ghost count is reset before sorting
+    sortAtom(atom);
+    #endif
+    ghostNeighbor(comm, atom, param);
+    buildNeighbor(atom, neighbor);
+    LIKWID_MARKER_STOP("reneighbour");
+    const double tStop = getTimeStamp();
+    return tStop - tStart;
+}
+
+/* Runs exchangeComm(comm, atom) and returns the getTimeStamp() interval it took. */
+double updateAtoms(Comm* comm, Atom* atom){
+    const double tStart = getTimeStamp();
+    exchangeComm(comm, atom);
+    const double tStop = getTimeStamp();
+    return tStop - tStart;
+}
+
 void writeInput(Parameter *param, Atom *atom) {
    FILE *fpin = fopen("input.in", "w");
    fprintf(fpin, "0,%f,0,%f,0,%f\n", param->xprd, param->yprd, param->zprd);
@@ -134,15 +178,16 @@ int main(int argc, char** argv) {
    Neighbor neighbor;
    Stats stats;
    Parameter param;
-
+    Comm comm; 
+    Grid grid;
    LIKWID_MARKER_INIT;
 #pragma omp parallel
    {
        LIKWID_MARKER_REGISTER("force");
        //LIKWID_MARKER_REGISTER("reneighbour");
        //LIKWID_MARKER_REGISTER("pbc");
-    }
-
+    } 
+    initComm(&argc, &argv, &comm);
    initParameter(&param);
    for(int i = 0; i < argc; i++) {
        if((strcmp(argv[i], "-p") == 0)) {
@@ -183,6 +228,24 @@ int main(int argc, char** argv) {
        if((strcmp(argv[i], "-half") == 0)) {
            param.half_neigh = atoi(argv[++i]);
            continue;
+        } 
+        if((strcmp(argv[i], "-method") == 0)) {
+            param.method = atoi(argv[++i]);
+            if (param.method>3 || param.method< 0){
+                if(comm.myproc == 0) fprintf(stderr, "Method does not exist!\n");
+                endComm(&comm);   
+                exit(0);
+            }
+            continue;
+        } 
+        if((strcmp(argv[i], "-bal") == 0)) {
+            param.balance = atoi(argv[++i]);
+            if (param.balance>3 || param.balance< 0){
+                if(comm.myproc == 0) fprintf(stderr, "Load Balance does not exist!\n");
+                endComm(&comm);   
+                exit(0);
+            }
+            continue;
        }
        if((strcmp(argv[i], "-r") == 0) || (strcmp(argv[i], "--radius") == 0)) {
            param.cutforce = atof(argv[++i]);
@@ -201,60 +264,70 @@ int main(int argc, char** argv) {
            continue;
        }
        if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
-            printf("MD Bench: A minimalistic re-implementation of miniMD\n");
-            printf(HLINE);
-            printf("-p <string>:          file to read parameters from (can be specified more than once)\n");
-            printf("-f <string>:          force field (lj, eam or dem), default lj\n");
-            printf("-i <string>:          input file with atom positions (dump)\n");
-            printf("-e <string>:          input file for EAM\n");
-            printf("-n / --nsteps <int>:  set number of timesteps for simulation\n");
-            printf("-nx/-ny/-nz <int>:    set linear dimension of systembox in x/y/z direction\n");
-            printf("-r / --radius <real>: set cutoff radius\n");
-            printf("-s / --skin <real>:   set skin (verlet buffer)\n");
-            printf("--freq <real>:        processor frequency (GHz)\n");
-            printf("--vtk <string>:       VTK file for visualization\n");
-            printf(HLINE);
-            exit(EXIT_SUCCESS);
+            if(comm.myproc ==0 ){
+                printf("MD Bench: A minimalistic re-implementation of miniMD\n");
+                printf(HLINE);
+                printf("-p <string>:          file to read parameters from (can be specified more than once)\n");
+                printf("-f <string>:          force field (lj, eam or dem), default lj\n");
+                printf("-i <string>:          input file with atom positions (dump)\n");
+                printf("-e <string>:          input file for EAM\n");
+                printf("-n / --nsteps <int>:  set number of timesteps for simulation\n");
+                printf("-nx/-ny/-nz <int>:    set linear dimension of systembox in x/y/z direction\n");
+                printf("-r / --radius <real>: set cutoff radius\n");
+                printf("-s / --skin <real>:   set skin (verlet buffer)\n");
+                printf("--freq <real>:        processor frequency (GHz)\n");
+                printf("--vtk <string>:       VTK file for visualization\n");
+                printf(HLINE);
+            }
+                exit(EXIT_SUCCESS);
        }
    }
+    
+    if(param.balance>0 && param.method == 1){
+        if(comm.myproc == 0) fprintf(stderr, "Half Shell is not supported by load balance!\n");
+        endComm(&comm);   
+        exit(0);
+    }

    param.cutneigh = param.cutforce + param.skin;
-    setup(&param, &eam, &atom, &neighbor, &stats);
-    printParameter(&param);
-    printf(HLINE);
-
-    printf("step\ttemp\t\tpressure\n");
+    timer[SETUP]=setup(&param, &eam, &atom, &neighbor, &stats, &comm, &grid);
+    if(comm.myproc == 0)printParameter(&param);
+    if(comm.myproc == 0)printf(HLINE);
+    if(comm.myproc == 0) printf("step\ttemp\t\tpressure\n"); 
    computeThermo(0, &param, &atom);
    #if defined(MEM_TRACER) || defined(INDEX_TRACER)
-    traceAddresses(&param, &atom, &neighbor, n + 1);
+    traceAddresses(&param, &atom, &neighbor, n + 1);// TODO: trace adress
    #endif
-
    //writeInput(&param, &atom);
-
-    timer[FORCE] = computeForce(&eam, &param, &atom, &neighbor, &stats);
-    timer[NEIGH] = 0.0;
-    timer[TOTAL] = getTimeStamp();
-
+    timer[FORCE]    = computeForce(&eam, &param, &atom, &neighbor, &stats); 
+    timer[NEIGH]    = 0.0;
+    timer[FORWARD]  = 0.0;
+    timer[UPDATE]   = 0.0;
+    timer[BALANCE]  = 0.0;  
+    timer[REVERSE]  = reverse(&comm, &atom, &param);
+    MPI_Barrier(world);
+    timer[TOTAL]    = getTimeStamp();
    if(param.vtk_file != NULL) {
-        write_atoms_to_vtk_file(param.vtk_file, &atom, 0);
-    }
-
+        printvtk(param.vtk_file, &comm, &atom, &param, 0);
+    } 
    for(int n = 0; n < param.ntimes; n++) {
        bool reneigh = (n + 1) % param.reneigh_every == 0;
        initialIntegrate(reneigh, &param, &atom);
-        if((n + 1) % param.reneigh_every) {
-            updatePbc(&atom, &param, false);
+        if(reneigh) { 
+            timer[UPDATE] +=updateAtoms(&comm,&atom);         
+            if(param.balance && !((n+1)%param.balance_every))
+                timer[BALANCE] +=dynamicBalance(&comm, &grid, &atom , &param, timer[FORCE]);
+            timer[NEIGH] += reneighbour(&comm, &param, &atom, &neighbor);
        } else {
-            timer[NEIGH] += reneighbour(&param, &atom, &neighbor);
-        }
-
+            timer[FORWARD] += forward(&comm, &atom, &param);
+        } 
        #if defined(MEM_TRACER) || defined(INDEX_TRACER)
        traceAddresses(&param, &atom, &neighbor, n + 1);
        #endif
-
        timer[FORCE] += computeForce(&eam, &param, &atom, &neighbor, &stats);
+        timer[REVERSE] += reverse(&comm, &atom, &param);
        finalIntegrate(reneigh, &param, &atom);
-
+        
        if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
            #ifdef CUDA_TARGET
            memcpyFromGPU(atom.x, atom.d_atom.x, atom.Nmax * sizeof(MD_FLOAT) * 3);
@@ -263,23 +336,42 @@ int main(int argc, char** argv) {
        }

        if(param.vtk_file != NULL) {
-            write_atoms_to_vtk_file(param.vtk_file, &atom, n + 1);
-        }
+            printvtk(param.vtk_file, &comm, &atom ,&param, n+1);
+        } 
    }
-
+    MPI_Barrier(world);
    timer[TOTAL] = getTimeStamp() - timer[TOTAL];
    computeThermo(-1, &param, &atom);
+        
+    double mint[NUMTIMER];
+    double maxt[NUMTIMER];
+    double sumt[NUMTIMER];
+    timer[REST] = timer[TOTAL]-timer[FORCE]-timer[NEIGH]-timer[BALANCE]-timer[FORWARD]-timer[REVERSE];
+    MPI_Reduce(timer,mint,NUMTIMER,MPI_DOUBLE,MPI_MIN,0,world);
+    MPI_Reduce(timer,maxt,NUMTIMER,MPI_DOUBLE,MPI_MAX,0,world);
+    MPI_Reduce(timer,sumt,NUMTIMER,MPI_DOUBLE,MPI_SUM,0,world);
+    int Nghost;
+    MPI_Reduce(&atom.Nghost,&Nghost,1,MPI_INT,MPI_SUM,0,world);

-    printf(HLINE);
-    printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
-    printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
-            timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]);
-    printf(HLINE);
-    printf("Performance: %.2f million atom updates per second\n",
+    if(comm.myproc == 0){
+        int n = comm.numproc;
+        printf(HLINE);
+        printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, Nghost, param.ntimes);
+        printf("TOTAL %.2fs\n\n",timer[TOTAL]);
+        printf("%4s|%7s|%7s|%7s|%7s|%7s|%7s|%7s|%7s|\n","","FORCE ", "NEIGH ", "BALANCE", "FORWARD", "REVERSE","UPDATE","REST ","SETUP");
+        printf("----|-------|-------|-------|-------|-------|-------|-------|-------|\n");
+        printf("%4s|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|\n", "AVG", sumt[FORCE]/n,sumt[NEIGH]/n,sumt[BALANCE]/n,sumt[FORWARD]/n,sumt[REVERSE]/n,sumt[UPDATE]/n,sumt[REST]/n,sumt[SETUP]/n);
+        printf("%4s|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|\n", "MIN", mint[FORCE],mint[NEIGH],mint[BALANCE],mint[FORWARD],mint[REVERSE],mint[UPDATE],mint[REST],mint[SETUP]);
+        printf("%4s|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|\n", "MAX", maxt[FORCE],maxt[NEIGH],maxt[BALANCE],maxt[FORWARD],maxt[REVERSE],maxt[UPDATE],maxt[REST],maxt[SETUP]);
+        printf(HLINE);
+        printf("Performance: %.2f million atom updates per second\n",
            1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
+            
 #ifdef COMPUTE_STATS
    displayStatistics(&atom, &param, &stats, timer);
 #endif
+    } 
+    endComm(&comm);
    LIKWID_MARKER_CLOSE;
    return EXIT_SUCCESS;
 }
--- a/lammps/neighbor.c
+++ b/lammps/neighbor.c
@@ -11,27 +11,39 @@
 #include <neighbor.h>
 #include <parameter.h>
 #include <atom.h>
+#include <util.h>
+#include <mpi.h>

 #define SMALL 1.0e-6
 #define FACTOR 0.999

 MD_FLOAT xprd, yprd, zprd;
 MD_FLOAT bininvx, bininvy, bininvz;
-int mbinxlo, mbinylo, mbinzlo;
+int pad_x, pad_y, pad_z;
 int nbinx, nbiny, nbinz;
-int mbinx, mbiny, mbinz; // n bins in x, y, z
+int mbinx, mbiny, mbinz; // m bins in x, y, z
 int *bincount;
 int *bins;
-int mbins; //total number of bins
-int atoms_per_bin;  // max atoms per bin
+int mbins;              //total number of bins
+int atoms_per_bin;      // max atoms per bin
 MD_FLOAT cutneigh;
-MD_FLOAT cutneighsq;  // neighbor cutoff squared
+MD_FLOAT cutneighsq;    // neighbor cutoff squared
 int nmax;
-int nstencil;      // # of bins in stencil
-int* stencil;      // stencil list of bin offsets
+int nstencil;           // # of bins in stencil
+int* stencil;           // stencil list of bin offsets
 MD_FLOAT binsizex, binsizey, binsizez;
+int me;                 //rank
+int method;             // method
+int half_stencil;       //If half stencil exist 
+int shellMethod;        //If shell method exist   
+
 static int coord2bin(MD_FLOAT, MD_FLOAT , MD_FLOAT);
 static MD_FLOAT bindist(int, int, int);
+static int ghostZone(Atom*, int);
+static int eightZone(Atom*, int);
+static int halfZone(Atom*, int);
+static void neighborGhost(Atom*, Neighbor*);
+static inline int interaction(Atom* atom, int i, int j);

 /* exported subroutines */
 void initNeighbor(Neighbor *neighbor, Parameter *param) {
@@ -51,7 +63,25 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) {
    neighbor->maxneighs = 100;
    neighbor->numneigh = NULL;
    neighbor->neighbors = NULL;
+    //========== MPI =============
+    shellMethod = 0;
+    half_stencil = 0;
+    method = param->method;
+    if(method == halfShell || method == eightShell){ 
+        param->half_neigh = 1;
+        shellMethod = 1;
+    }
+    if(method == halfStencil){
+        param->half_neigh = 0;
+        half_stencil = 1;
+    }
+    me = 0;
+    MPI_Comm_rank(MPI_COMM_WORLD,&me);
    neighbor->half_neigh = param->half_neigh;
+    neighbor->Nshell = 0;  
+    neighbor->numNeighShell = NULL;
+    neighbor->neighshell = NULL;
+    neighbor->listshell = NULL;
 }

 void setupNeighbor(Parameter* param) {
@@ -64,7 +94,6 @@ void setupNeighbor(Parameter* param) {
        yprd = param->yprd;
        zprd = param->zprd;
    }
-
    // TODO: update lo and hi for standard case and use them here instead
    MD_FLOAT xlo = 0.0; MD_FLOAT xhi = xprd;
    MD_FLOAT ylo = 0.0; MD_FLOAT yhi = yprd;
@@ -93,54 +122,48 @@ void setupNeighbor(Parameter* param) {
        bininvy = 1.0 / binsizey;
        bininvz = 1.0 / binsizez;
    }
-
-    coord = xlo - cutneigh - SMALL * xprd;
-    mbinxlo = (int) (coord * bininvx);
-    if (coord < 0.0) { mbinxlo = mbinxlo - 1; }
-    coord = xhi + cutneigh + SMALL * xprd;
-    mbinxhi = (int) (coord * bininvx);
-
-    coord = ylo - cutneigh - SMALL * yprd;
-    mbinylo = (int) (coord * bininvy);
-    if (coord < 0.0) { mbinylo = mbinylo - 1; }
-    coord = yhi + cutneigh + SMALL * yprd;
-    mbinyhi = (int) (coord * bininvy);
-
-    coord = zlo - cutneigh - SMALL * zprd;
-    mbinzlo = (int) (coord * bininvz);
-    if (coord < 0.0) { mbinzlo = mbinzlo - 1; }
-    coord = zhi + cutneigh + SMALL * zprd;
-    mbinzhi = (int) (coord * bininvz);
-
-    mbinxlo = mbinxlo - 1;
-    mbinxhi = mbinxhi + 1;
-    mbinx = mbinxhi - mbinxlo + 1;
-
-    mbinylo = mbinylo - 1;
-    mbinyhi = mbinyhi + 1;
-    mbiny = mbinyhi - mbinylo + 1;
-
-    mbinzlo = mbinzlo - 1;
-    mbinzhi = mbinzhi + 1;
-    mbinz = mbinzhi - mbinzlo + 1;
+    pad_x = (int)(cutneigh*bininvx);
+    while(pad_x * binsizex < FACTOR * cutneigh) pad_x++;
+    pad_y = (int)(cutneigh*bininvy);
+    while(pad_y * binsizey < FACTOR * cutneigh) pad_y++;
+    pad_z = (int)(cutneigh*bininvz);
+    while(pad_z * binsizez < FACTOR * cutneigh) pad_z++;

    nextx = (int) (cutneigh * bininvx);
-    if(nextx * binsizex < FACTOR * cutneigh) nextx++;
+    if(nextx * binsizex < FACTOR * cutneigh){
+        nextx++;
+        pad_x++;
+    } 
    nexty = (int) (cutneigh * bininvy);
-    if(nexty * binsizey < FACTOR * cutneigh) nexty++;
+    if(nexty * binsizey < FACTOR * cutneigh){
+        nexty++;
+        pad_y++;
+    } 
    nextz = (int) (cutneigh * bininvz);
-    if(nextz * binsizez < FACTOR * cutneigh) nextz++;
+    if(nextz * binsizez < FACTOR * cutneigh){
+        nextz++;
+        pad_z++;
+    } 
+
+    mbinx = nbinx+4*pad_x;
+    mbiny = nbiny+4*pad_y;
+    mbinz = nbinz+4*pad_z;

    if (stencil) { free(stencil); }
    stencil = (int*) malloc((2 * nextz + 1) * (2 * nexty + 1) * (2 * nextx + 1) * sizeof(int));
    nstencil = 0;
+ 
    int kstart = -nextz;
-
+    int jstart = -nexty; 
+    int istart = -nextx;
+    int ibin = 0;
    for(int k = kstart; k <= nextz; k++) {
-        for(int j = -nexty; j <= nexty; j++) {
-            for(int i = -nextx; i <= nextx; i++) {
-                if(bindist(i, j, k) < cutneighsq) {
-                    stencil[nstencil++] = k * mbiny * mbinx + j * mbinx + i;
+        for(int j = jstart; j <= nexty; j++) {
+            for(int i = istart; i <= nextx; i++) {
+                if(bindist(i, j, k) < cutneighsq) {     
+                    int jbin = k * mbiny * mbinx + j * mbinx + i;
+                    if(ibin>jbin && half_stencil) continue;                  
+                    stencil[nstencil++] = jbin;
                }
            }
        }
@@ -154,8 +177,7 @@ void setupNeighbor(Parameter* param) {
 }

 void buildNeighbor_cpu(Atom *atom, Neighbor *neighbor) {
-    int nall = atom->Nlocal + atom->Nghost;
-
+    int nall = atom->Nlocal + atom->Nghost;  
    /* extend atom arrays if necessary */
    if(nall > nmax) {
        nmax = nall;
@@ -164,16 +186,13 @@ void buildNeighbor_cpu(Atom *atom, Neighbor *neighbor) {
        neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
        neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*));
    }
-
    /* bin local & ghost atoms */
    binatoms(atom);
    int resize = 1;
-
    /* loop over each atom, storing neighbors */
    while(resize) {
        int new_maxneighs = neighbor->maxneighs;
        resize = 0;
-
        for(int i = 0; i < atom->Nlocal; i++) {
            int* neighptr = &(neighbor->neighbors[i * neighbor->maxneighs]);
            int n = 0;
@@ -184,21 +203,22 @@ void buildNeighbor_cpu(Atom *atom, Neighbor *neighbor) {
            #ifdef EXPLICIT_TYPES
            int type_i = atom->type[i];
            #endif
+
            for(int k = 0; k < nstencil; k++) {
                int jbin = ibin + stencil[k];
                int* loc_bin = &bins[jbin * atoms_per_bin];
-
-                for(int m = 0; m < bincount[jbin]; m++) {
+                for(int m = 0; m < bincount[jbin]; m++) {    
                    int j = loc_bin[m];
-                    if((j == i) || (neighbor->half_neigh && (j < i))) {
-                        continue;
-                    }
-
+                    
+                    if((j==i) || (neighbor->half_neigh && (j<i))) 
+                        continue;              
+                    if(half_stencil && ibin==jbin  && !interaction(atom,i,j))
+                        continue;          
+                
                    MD_FLOAT delx = xtmp - atom_x(j);
-                    MD_FLOAT dely = ytmp - atom_y(j);
+                    MD_FLOAT dely = ytmp - atom_y(j);  
                    MD_FLOAT delz = ztmp - atom_z(j);
                    MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
-
                    #ifdef EXPLICIT_TYPES
                    int type_j = atom->type[j];
                    const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j];
@@ -210,8 +230,8 @@ void buildNeighbor_cpu(Atom *atom, Neighbor *neighbor) {
                    }
                }
            }
-
            neighbor->numneigh[i] = n;
+            
            if(n >= neighbor->maxneighs) {
                resize = 1;

@@ -220,14 +240,15 @@ void buildNeighbor_cpu(Atom *atom, Neighbor *neighbor) {
                }
            }
        }
-
        if(resize) {
-            printf("RESIZE %d\n", neighbor->maxneighs);
+            printf("RESIZE %d, PROC %d\n", neighbor->maxneighs,me);
            neighbor->maxneighs = new_maxneighs * 1.2;
            free(neighbor->neighbors);
            neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
        }
    }
+
+    if(method == eightShell) neighborGhost(atom, neighbor);
 }

 /* internal subroutines */
@@ -257,44 +278,28 @@ MD_FLOAT bindist(int i, int j, int k) {
    } else {
        delz = (k + 1) * binsizez;
    }
-
    return (delx * delx + dely * dely + delz * delz);
 }

 int coord2bin(MD_FLOAT xin, MD_FLOAT yin, MD_FLOAT zin) {
-    int ix, iy, iz;
-
-    if(xin >= xprd) {
-        ix = (int)((xin - xprd) * bininvx) + nbinx - mbinxlo;
-    } else if(xin >= 0.0) {
-        ix = (int)(xin * bininvx) - mbinxlo;
-    } else {
-        ix = (int)(xin * bininvx) - mbinxlo - 1;
-    }
-
-    if(yin >= yprd) {
-        iy = (int)((yin - yprd) * bininvy) + nbiny - mbinylo;
-    } else if(yin >= 0.0) {
-        iy = (int)(yin * bininvy) - mbinylo;
-    } else {
-        iy = (int)(yin * bininvy) - mbinylo - 1;
-    }
-
-    if(zin >= zprd) {
-        iz = (int)((zin - zprd) * bininvz) + nbinz - mbinzlo;
-    } else if(zin >= 0.0) {
-        iz = (int)(zin * bininvz) - mbinzlo;
-    } else {
-        iz = (int)(zin * bininvz) - mbinzlo - 1;
-    }
-
-    return (iz * mbiny * mbinx + iy * mbinx + ix + 1);
+   int ix, iy, iz;
+   MD_FLOAT eps = 1e-9; 
+   MD_FLOAT xlo=0.0; MD_FLOAT ylo=0.0; MD_FLOAT zlo=0.0;
+   xlo = fabs(xlo - pad_x*binsizex)+eps;
+   ylo = fabs(ylo - pad_y*binsizey)+eps;
+   zlo = fabs(zlo - pad_z*binsizez)+eps;
+   ix = (int) ((xin + xlo)*bininvx);
+   iy = (int) ((yin + ylo)*bininvy);
+   iz = (int) ((zin + zlo)*bininvz);
+    
+    return (iz * mbiny * mbinx + iy * mbinx + ix);
+    //return (iz * mbiny * mbinx + iy * mbinx + ix + 1);
 }

-void binatoms(Atom *atom) {
+void binatoms(Atom *atom) {    
    int nall = atom->Nlocal + atom->Nghost;
    int resize = 1;
-
+    
    while(resize > 0) {
        resize = 0;

@@ -304,7 +309,7 @@ void binatoms(Atom *atom) {

        for(int i = 0; i < nall; i++) {
            int ibin = coord2bin(atom_x(i), atom_y(i), atom_z(i));
-
+            if(shellMethod && !ghostZone(atom, i)) continue; 
            if(bincount[ibin] < atoms_per_bin) {
                int ac = bincount[ibin]++;
                bins[ibin * atoms_per_bin + ac] = i;
@@ -325,54 +330,51 @@ void sortAtom(Atom* atom) {
    binatoms(atom);
    int Nmax = atom->Nmax;
    int* binpos = bincount;
-
-    for(int i=1; i<mbins; i++) {
-        binpos[i] += binpos[i-1];
+    for(int i = 1; i < mbins; i++) {
+        binpos[i] += binpos[i - 1];
    }
-
-#ifdef AOS
+    #ifdef AOS
    MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
    MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
-#else
+    #else
    MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
    MD_FLOAT* new_y = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
    MD_FLOAT* new_z = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
    MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
    MD_FLOAT* new_vy = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
    MD_FLOAT* new_vz = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
-#endif
+    #endif
    MD_FLOAT* old_x = atom->x; MD_FLOAT* old_y = atom->y; MD_FLOAT* old_z = atom->z;
    MD_FLOAT* old_vx = atom->vx; MD_FLOAT* old_vy = atom->vy; MD_FLOAT* old_vz = atom->vz;

-    for(int mybin = 0; mybin<mbins; mybin++) {
-        int start = mybin>0?binpos[mybin-1]:0;
+    for(int mybin = 0; mybin < mbins; mybin++) {
+        int start = mybin > 0 ? binpos[mybin - 1] : 0;
        int count = binpos[mybin] - start;
-        for(int k=0; k<count; k++) {
+        for(int k = 0; k < count; k++) {
            int new_i = start + k;
            int old_i = bins[mybin * atoms_per_bin + k];
-#ifdef AOS
+            #ifdef AOS
            new_x[new_i * 3 + 0] = old_x[old_i * 3 + 0];
            new_x[new_i * 3 + 1] = old_x[old_i * 3 + 1];
            new_x[new_i * 3 + 2] = old_x[old_i * 3 + 2];
            new_vx[new_i * 3 + 0] = old_vx[old_i * 3 + 0];
            new_vx[new_i * 3 + 1] = old_vx[old_i * 3 + 1];
            new_vx[new_i * 3 + 2] = old_vx[old_i * 3 + 2];
-#else
+            #else
            new_x[new_i] = old_x[old_i];
            new_y[new_i] = old_y[old_i];
            new_z[new_i] = old_z[old_i];
            new_vx[new_i] = old_vx[old_i];
            new_vy[new_i] = old_vy[old_i];
            new_vz[new_i] = old_vz[old_i];
-#endif
+            #endif
        }
    }
-
    free(atom->x);
    free(atom->vx);
    atom->x = new_x;
    atom->vx = new_vx;
-#ifndef AOS
+    #ifndef AOS
    free(atom->y);
    free(atom->z);
    free(atom->vy);
@@ -381,5 +383,160 @@ void sortAtom(Atom* atom) {
    atom->z = new_z;
    atom->vy = new_vy;
    atom->vz = new_vz;
-#endif
+    #endif
 }
+
+/* internal subroutines 
+Added with MPI*/
+
+/*
+ * Classifies atom i for the shell-based neighbor schemes.
+ * Local atoms (i < atom->Nlocal) always yield 1. For ghost atoms the result
+ * is delegated to halfZone() when method == halfShell and to eightZone() when
+ * method == eightShell; any other method yields 0.
+ */
+static int ghostZone(Atom* atom, int i){
+    if(i < atom->Nlocal) { return 1; }
+
+    if(method == halfShell) { return halfZone(atom, i); }
+    if(method == eightShell) { return eightZone(atom, i); }
+
+    return 0;
+}
+
+/*
+ * Returns the eight-shell zone id of atom i.
+ * A 3-bit code is assembled from whether the x, y and z coordinates are
+ * BigOrEqual to the upper bound of the local box (weights 1, 2 and 4), and the
+ * code is then translated through a fixed lookup table.
+ */
+static int eightZone(Atom* atom, int i)
+{
+    /* code -> zone: 0->0, 1->1, 2->2, 3->6, 4->3, 5->5, 6->4, 7->7 */
+    int zoneMapping[] = {0, 1, 2, 6, 3, 5, 4, 7};
+    MD_FLOAT *upper = atom->mybox.hi;
+    int code = 0;
+
+    if(BigOrEqual(atom_x(i),upper[_x])) { code += 1; }
+    if(BigOrEqual(atom_y(i),upper[_y])) { code += 2; }
+    if(BigOrEqual(atom_z(i),upper[_z])) { code += 4; }
+
+    return zoneMapping[code];
+}
+
+/*
+ * Half-shell filter for atom i: returns 0 when the atom lies in one of the
+ * three excluded regions tested below (relative to the lower/upper bounds of
+ * the local box) and 1 otherwise.
+ */
+static int halfZone(Atom* atom, int i)
+{
+    MD_FLOAT *hi = atom->mybox.hi;
+    MD_FLOAT *lo = atom->mybox.lo;
+
+    /* below the box in x, and not above it in y or z */
+    if(atom_x(i)<lo[_x] && atom_y(i)<hi[_y] && atom_z(i)<hi[_z]) { return 0; }
+
+    /* below the box in y, and not above it in z */
+    if(atom_y(i)<lo[_y] && atom_z(i)<hi[_z]) { return 0; }
+
+    /* below the box in z */
+    if(atom_z(i)<lo[_z]) { return 0; }
+
+    return 1;
+}
+
+/*
+ * Builds the neighbor lists of ghost atoms (invoked by buildNeighbor_cpu when
+ * method == eightShell).
+ *
+ * Steps:
+ *  1. Every ghost atom is sorted into one of eight per-zone lists according to
+ *     ghostZone().
+ *  2. The atoms of zones 1..3 become the "shell" atoms (neighbor->listshell,
+ *     neighbor->Nshell).
+ *  3. For each shell atom the stencil bins are scanned and every atom of a
+ *     permitted higher zone within the neighbor cutoff is recorded in
+ *     neighbor->neighshell / neighbor->numNeighShell.
+ *
+ * If a list does not fit into neighbor->maxneighs entries, maxneighs is
+ * enlarged and the whole shell list is rebuilt.
+ *
+ * NOTE(review): neighbor->maxneighs is also the row stride of
+ * neighbor->neighbors, which was filled before this function runs. Growing
+ * maxneighs here changes that stride without rebuilding neighbor->neighbors;
+ * confirm the callers tolerate this.
+ */
+static void neighborGhost(Atom *atom, Neighbor *neighbor) {
+    int Nshell=0;
+    int Nlocal = atom->Nlocal;
+    int Nghost = atom->Nghost;
+    if(neighbor->listshell) free(neighbor->listshell);
+    neighbor->listshell = (int*) malloc(Nghost * sizeof(int));
+    int* listzone  = (int*) malloc(8 * Nghost * sizeof(int));
+    int countAtoms[8] = {0,0,0,0,0,0,0,0};
+
+    //Selecting ghost atoms for interaction
+    for(int i = Nlocal; i < Nlocal+Nghost; i++) {
+        int izone = ghostZone(atom,i);
+        int *list = &listzone[Nghost*izone];
+        int n  = countAtoms[izone];
+        list[n] = i;
+        countAtoms[izone]++;
+    }
+
+    for(int zone = 1; zone<=3; zone++){
+        int *list = &listzone[Nghost*zone];
+        for(int n=0; n<countAtoms[zone]; n++)
+            neighbor->listshell[Nshell++] = list[n];
+    }
+
+    neighbor->Nshell = Nshell;
+    if(neighbor->numNeighShell) free(neighbor->numNeighShell);
+    if(neighbor->neighshell) free(neighbor->neighshell);
+    neighbor->neighshell = (int*) malloc(Nshell * neighbor->maxneighs * sizeof(int));
+    neighbor->numNeighShell = (int*) malloc(Nshell * sizeof(int));
+    int resize = 1;
+
+    while(resize)
+    {
+        resize = 0;
+        for(int i = 0; i < Nshell; i++) {
+            int *neighshell = &(neighbor->neighshell[i*neighbor->maxneighs]);
+            int n = 0;
+            int iatom = neighbor->listshell[i];
+            int izone = ghostZone(atom, iatom);
+            MD_FLOAT xtmp = atom_x(iatom);
+            MD_FLOAT ytmp = atom_y(iatom);
+            MD_FLOAT ztmp = atom_z(iatom);
+            int ibin = coord2bin(xtmp, ytmp, ztmp);
+
+            #ifdef EXPLICIT_TYPES
+            int type_i = atom->type[iatom];
+            #endif
+
+            for(int k = 0; k < nstencil; k++) {
+                int jbin = ibin + stencil[k];
+                int* loc_bin = &bins[jbin * atoms_per_bin];
+                for(int m = 0; m < bincount[jbin]; m++) {
+                    int jatom = loc_bin[m];
+
+                    int jzone = ghostZone(atom,jatom);
+
+                    /* only pair with strictly higher zones, minus the
+                       combinations excluded for each shell zone */
+                    if(jzone <=izone) continue;
+                    if(izone == 1 && (jzone==5||jzone==6||jzone==7)) continue;
+                    if(izone == 2 && (jzone==4||jzone==6||jzone==7)) continue;
+                    if(izone == 3 && (jzone==4||jzone==5||jzone==7)) continue;
+
+                    MD_FLOAT delx = xtmp - atom_x(jatom);
+                    MD_FLOAT dely = ytmp - atom_y(jatom);
+                    MD_FLOAT delz = ztmp - atom_z(jatom);
+                    MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+
+                    #ifdef EXPLICIT_TYPES
+                    int type_j = atom->type[jatom];
+                    const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j];
+                    #else
+                    const MD_FLOAT cutoff = cutneighsq;
+                    #endif
+                    if(rsq <= cutoff) {
+                        /* Store only while the row has room: writing past
+                           maxneighs would run off the end of the allocation
+                           for the last shell atom. n keeps counting so the
+                           resize below sees the real demand. */
+                        if(n < neighbor->maxneighs) {
+                            neighshell[n] = jatom;
+                        }
+                        n++;
+                    }
+                }
+            }
+
+            neighbor->numNeighShell[i] = n;
+            if(n >= neighbor->maxneighs){
+                resize = 1;
+                neighbor->maxneighs = n * 1.2;
+                /* n * 1.2 truncates back to n for n < 5; force real growth so
+                   the rebuild loop is guaranteed to terminate */
+                if(neighbor->maxneighs <= n) {
+                    neighbor->maxneighs = n + 1;
+                }
+                break;
+            }
+        }
+
+        if(resize) {
+            free(neighbor->neighshell);
+            neighbor->neighshell = (int*) malloc(Nshell * neighbor->maxneighs * sizeof(int));
+        }
+    }
+    free(listzone);
+}
+
+/*
+ * Tie-breaker used for pairs that share a bin when the half stencil is active:
+ * returns 1 if the pair (i, j) should be kept in i's list, 0 otherwise.
+ *  - j local (j < Nlocal): keep only when i < j.
+ *  - j ghost (j >= Nlocal): keep when j has the larger z; on equal z when j
+ *    has the smaller y; on equal z and y when j has the smaller x.
+ */
+static inline int interaction(Atom* atom, int i, int j) {
+    const int jIsGhost = (j >= atom->Nlocal);
+
+    if(!jIsGhost) {
+        return (i < j) ? 1 : 0;
+    }
+
+    if(atom_z(j)>atom_z(i)) { return 1; }
+
+    if(Equal(atom_z(j),atom_z(i))) {
+        if(atom_y(j)<atom_y(i)) { return 1; }
+        if(Equal(atom_y(j),atom_y(i)) && atom_x(j)<atom_x(i)) { return 1; }
+    }
+
+    return 0;
+}
+ 
--- a/lammps/pbc.c
+++ b/lammps/pbc.c
@@ -125,7 +125,7 @@ void setupPbc(Atom *atom, Parameter *param) {
        if(param->pbc_x != 0 && param->pbc_y != 0 && param->pbc_z != 0) {
            if (x < Cutneigh         && y < Cutneigh         && z < Cutneigh)         { ADDGHOST(+1,+1,+1); }
            if (x < Cutneigh         && y >= (yprd-Cutneigh) && z < Cutneigh)         { ADDGHOST(+1,-1,+1); }
-            if (x < Cutneigh         && y >= Cutneigh        && z >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); }
+            if (x < Cutneigh         && y < Cutneigh        && z >= (zprd-Cutneigh))  { ADDGHOST(+1,+1,-1); }
            if (x < Cutneigh         && y >= (yprd-Cutneigh) && z >= (zprd-Cutneigh)) { ADDGHOST(+1,-1,-1); }
            if (x >= (xprd-Cutneigh) && y < Cutneigh         && z < Cutneigh)         { ADDGHOST(-1,+1,+1); }
            if (x >= (xprd-Cutneigh) && y >= (yprd-Cutneigh) && z < Cutneigh)         { ADDGHOST(-1,-1,+1); }
--- a/lammps/vtk.c
+++ b/lammps/vtk.c
@@ -6,8 +6,12 @@
 */
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
+#include <vtk.h>
+#include <mpi.h>

-#include <atom.h>
+static MPI_File _fh; 
+static inline void flushBuffer(char*); 

 int write_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
    char timestep_filename[128];
@@ -18,12 +22,12 @@ int write_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
        fprintf(stderr, "Could not open VTK file for writing!\n");
        return -1;
    }
-
    fprintf(fp, "# vtk DataFile Version 2.0\n");
    fprintf(fp, "Particle data\n");
    fprintf(fp, "ASCII\n");
    fprintf(fp, "DATASET UNSTRUCTURED_GRID\n");
    fprintf(fp, "POINTS %d double\n", atom->Nlocal);
+
    for(int i = 0; i < atom->Nlocal; ++i) {
        fprintf(fp, "%.4f %.4f %.4f\n", atom_x(i), atom_y(i), atom_z(i));
    }
@@ -48,3 +52,168 @@ int write_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
    fclose(fp);
    return 0;
 }
+
+/*
+ * Opens "<filename>_<timestep>.vtk" on MPI_COMM_WORLD through MPI-IO, keeps
+ * the handle in the file-scope _fh, and lets rank 0 write the VTK header
+ * (including the POINTS line for atom->Natoms points).
+ *
+ * Returns 0 on success and -1 if the file could not be opened.
+ *
+ * NOTE(review): the file is opened with MPI_MODE_CREATE but is not truncated,
+ * and flushBuffer() appends at the current file size, so a file left over
+ * from an earlier run would be appended to. Confirm whether that is intended.
+ */
+int vtkOpen(const char* filename, Comm* comm, Atom* atom ,int timestep)
+{
+    char msg[256];
+    char timestep_filename[128];
+    snprintf(timestep_filename, sizeof timestep_filename, "%s_%d.vtk", filename, timestep);
+    int err = MPI_File_open(MPI_COMM_WORLD, timestep_filename, MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &_fh);
+    if(err != MPI_SUCCESS || _fh == MPI_FILE_NULL) {
+        /* leave the handle in a well-defined state for vtkVector()/vtkClose() */
+        _fh = MPI_FILE_NULL;
+        if(comm->myproc == 0) fprintf(stderr, "Could not open VTK file for writing!\n");
+        return -1;
+    }
+
+    if (comm->myproc==0){
+        /* One bounded snprintf instead of sprintf(msg, "%s...", msg): passing
+           the destination as a source argument is undefined behaviour. */
+        snprintf(msg, sizeof msg,
+                 "# vtk DataFile Version 2.0\n"
+                 "Particle data\n"
+                 "ASCII\n"
+                 "DATASET UNSTRUCTURED_GRID\n"
+                 "POINTS %d double\n", atom->Natoms);
+        flushBuffer(msg);
+    }
+    return 0;
+}
+
+/*
+ * Writes the body of the VTK file opened by vtkOpen().
+ *
+ * Every rank formats the coordinates of its atom->Nlocal atoms into a text
+ * buffer. The per-rank byte counts are exchanged with MPI_Allgather, so each
+ * rank knows its offset in the global coordinate section, and the buffers are
+ * written collectively at the current end of the file. Rank 0 then appends
+ * the CELLS, CELL_TYPES and POINT_DATA sections for atom->Natoms atoms.
+ *
+ * Returns 0 on success and -1 if no file is open. Out-of-memory aborts the
+ * MPI job, because returning early on a single rank would leave the others
+ * blocked in the collective calls below.
+ *
+ * NOTE(review): the start of the coordinate section is taken from
+ * MPI_File_get_size() on every rank, right after rank 0 wrote the header in
+ * vtkOpen() with no synchronisation in between. Confirm all ranks are
+ * guaranteed to observe the same size here.
+ */
+int vtkVector(Comm* comm, Atom* atom, Parameter* param)
+{
+    if (_fh == MPI_FILE_NULL) {
+        if(comm->myproc==0) printf("vtk not initialize! Call vtkOpen first!\n");
+        return -1;
+    }
+
+    int sizeline= 25;   //#initial guess of characters in "%.4f %.4f %.4f\n"
+    int extrabuff = 100;
+    int sizebuff = sizeline*atom->Nlocal+extrabuff;
+    int mysize = 0;
+    char* msg = (char*) malloc(sizebuff);
+    if(msg == NULL) {
+        fprintf(stderr, "vtkVector: out of memory\n");
+        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+    }
+    msg[0] = '\0';
+
+    for(int i = 0; i < atom->Nlocal; i++){
+        //TODO: do not forget to add param->xlo, param->ylo, param->zlo
+        /* Append at the tracked end of the buffer; sprintf(msg, "%s...", msg)
+           is undefined behaviour and re-scans the buffer for every atom. */
+        int room = sizebuff - mysize;
+        int len = snprintf(msg + mysize, room, "%.4f %.4f %.4f\n", atom_x(i), atom_y(i), atom_z(i));
+        while(len >= room) {
+            /* line did not fit: grow by 1.5x plus the line itself and retry */
+            sizebuff = sizebuff + sizebuff / 2 + len + 1;
+            char* grown = (char*) realloc(msg, sizebuff);
+            if(grown == NULL) {
+                fprintf(stderr, "vtkVector: out of memory\n");
+                MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+            }
+            msg = grown;
+            room = sizebuff - mysize;
+            len = snprintf(msg + mysize, room, "%.4f %.4f %.4f\n", atom_x(i), atom_y(i), atom_z(i));
+        }
+        mysize += len;
+    }
+    int gatherSize[comm->numproc];
+
+    MPI_Allgather(&mysize, 1, MPI_INT, gatherSize, 1, MPI_INT, MPI_COMM_WORLD);
+    int offset=0;
+    int globalSize = 0;
+
+    /* bytes written by all lower ranks = start of this rank's slice */
+    for(int i = 0; i < comm->myproc; i++)
+        offset+= gatherSize[i];
+
+    for(int i = 0; i < comm->numproc; i++)
+        globalSize+= gatherSize[i];
+
+    MPI_Offset displ;
+    MPI_Datatype FileType;
+    int GlobalSize[] = {globalSize};
+    int LocalSize[]  = {mysize};
+    int Start[] = {offset};
+
+    if(LocalSize[0]>0){
+        MPI_Type_create_subarray(1, GlobalSize, LocalSize, Start, MPI_ORDER_C, MPI_CHAR, &FileType);
+    } else {
+        /* ranks without atoms still take part in the collective write */
+        MPI_Type_vector(0,0,0,MPI_CHAR,&FileType);
+    }
+    MPI_Type_commit(&FileType);
+    MPI_File_get_size(_fh, &displ);
+    MPI_File_set_view(_fh, displ, MPI_CHAR, FileType, "native", MPI_INFO_NULL);
+    MPI_File_write_all (_fh, msg, mysize , MPI_CHAR ,MPI_STATUS_IGNORE);
+    MPI_Barrier(MPI_COMM_WORLD);
+    /* restore the default byte view so flushBuffer() offsets are absolute */
+    MPI_File_set_view(_fh,0,MPI_CHAR, MPI_CHAR, "native", MPI_INFO_NULL);
+    MPI_Type_free(&FileType);
+    free(msg);
+
+    if (comm->myproc==0){
+        /* The trailing sections hold one short line per *global* atom, so they
+           get their own buffer: the coordinate buffer above is sized for
+           Nlocal lines only. 16 bytes cover the longest line ("1 <int>\n"),
+           256 bytes cover the section headers. */
+        size_t cap = 16 * (size_t) atom->Natoms + 256;
+        char* tail = (char*) malloc(cap);
+        if(tail == NULL) {
+            fprintf(stderr, "vtkVector: out of memory\n");
+            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+        }
+        size_t pos;
+
+        pos = snprintf(tail, cap, "\n\nCELLS %d %d\n", atom->Natoms, atom->Natoms * 2);
+        for(int i = 0; i < atom->Natoms; i++)
+            pos += snprintf(tail + pos, cap - pos, "1 %d\n", i);
+        flushBuffer(tail);
+
+        pos = snprintf(tail, cap, "\n\nCELL_TYPES %d\n", atom->Natoms);
+        for(int i = 0; i < atom->Natoms; i++)
+            pos += snprintf(tail + pos, cap - pos, "1\n");
+        flushBuffer(tail);
+
+        pos = snprintf(tail, cap, "\n\nPOINT_DATA %d\nSCALARS mass double\nLOOKUP_TABLE default\n", atom->Natoms);
+        for(int i = 0; i < atom->Natoms; i++)
+            pos += snprintf(tail + pos, cap - pos, "1.0\n");
+        snprintf(tail + pos, cap - pos, "\n\n");
+        flushBuffer(tail);
+
+        free(tail);
+    }
+    return 0;
+}
+
+/*
+ * Closes the VTK file opened by vtkOpen() and resets the file-scope handle.
+ * Does nothing when no file is open, so it is safe to call after a failed
+ * vtkOpen() (printvtk() calls it unconditionally).
+ */
+void vtkClose()
+{
+    if(_fh == MPI_FILE_NULL) { return; }
+
+    MPI_File_close(&_fh);
+    _fh=MPI_FILE_NULL;
+}
+
+/*
+ * Debug helper: writes the ghost atoms of rank `me` to
+ * "<filename>_<timestep>_ghost<me>.vtk" as an ASCII VTK unstructured grid
+ * with one vertex cell per ghost atom.
+ *
+ * Returns 0 on success and -1 if the file could not be opened.
+ */
+int printGhost(const char* filename, Atom* atom, int timestep, int me) {
+    char timestep_filename[128];
+    snprintf(timestep_filename, sizeof timestep_filename, "%s_%d_ghost%i.vtk", filename, timestep,me);
+    FILE* fp = fopen(timestep_filename, "wb");
+
+    if(fp == NULL) {
+        fprintf(stderr, "Could not open VTK file for writing!\n");
+        return -1;
+    }
+    fprintf(fp, "# vtk DataFile Version 2.0\n");
+    fprintf(fp, "Particle data\n");
+    fprintf(fp, "ASCII\n");
+    fprintf(fp, "DATASET UNSTRUCTURED_GRID\n");
+    fprintf(fp, "POINTS %d double\n", atom->Nghost);
+
+    /* ghost atoms are stored after the local ones: [Nlocal, Nlocal+Nghost) */
+    for(int i = atom->Nlocal; i < atom->Nlocal+atom->Nghost; ++i) {
+        fprintf(fp, "%.4f %.4f %.4f\n", atom_x(i), atom_y(i), atom_z(i));
+    }
+    fprintf(fp, "\n\n");
+    /* One cell per ghost point: the counts must match the Nghost entries
+       written below, and the cell entries index the POINTS list above, which
+       starts at 0 rather than at Nlocal. */
+    fprintf(fp, "CELLS %d %d\n", atom->Nghost, atom->Nghost * 2);
+    for(int i = 0; i < atom->Nghost; ++i) {
+        fprintf(fp, "1 %d\n", i);
+    }
+    fprintf(fp, "\n\n");
+    fprintf(fp, "CELL_TYPES %d\n", atom->Nghost);
+    for(int i = 0; i < atom->Nghost; ++i) {
+        fprintf(fp, "1\n");
+    }
+    fprintf(fp, "\n\n");
+    fprintf(fp, "POINT_DATA %d\n", atom->Nghost);
+    fprintf(fp, "SCALARS mass double\n");
+    fprintf(fp, "LOOKUP_TABLE default\n");
+    for(int i = 0; i < atom->Nghost; i++) {
+        fprintf(fp, "1.0\n");
+    }
+    fprintf(fp, "\n\n");
+    fclose(fp);
+    return 0;
+}
+
+/*
+ * Entry point for VTK output. With a single process the serial writer
+ * write_atoms_to_vtk_file() is used; otherwise the file is produced with the
+ * MPI-IO sequence vtkOpen() -> vtkVector() -> vtkClose().
+ */
+void printvtk(const char* filename, Comm* comm, Atom* atom ,Parameter* param, int timestep)
+{
+    const int serial = (comm->numproc == 1);
+
+    if(serial) {
+        write_atoms_to_vtk_file(filename, atom, timestep);
+    } else {
+        vtkOpen(filename, comm, atom, timestep);
+        vtkVector(comm, atom, param);
+        vtkClose();
+        //printGhost(filename, atom, timestep, comm->myproc);
+    }
+}
+
+/*
+ * Appends the NUL-terminated string msg to the open VTK file: the current
+ * file size is queried and the bytes of msg are written at that offset.
+ */
+static inline void flushBuffer(char* msg){
+    MPI_Offset endOfFile;
+    int nbytes = (int) strlen(msg);
+
+    MPI_File_get_size(_fh, &endOfFile);
+    MPI_File_write_at(_fh, endOfFile, msg, nbytes, MPI_CHAR, MPI_STATUS_IGNORE);
+}
--- a/util/evaluate_latency_and_cfd.sh
+++ b/util/evaluate_latency_and_cfd.sh
@@ -1,46 +1,116 @@
 #!/bin/bash

-TAG=ICX
-OPT_SCHEME=gromacs
-MDBENCH_BIN=./MDBench-$TAG-$OPT_SCHEME
-FREQ=2.4
-NRUNS=3
-FIXED_PARAMS=--freq $FREQ
+[[ -z "$1" ]] && echo "Use: $0 <binary> [-c <core>] [-f <freq>] [-n <nruns>] [-l <log>] [-s]" && exit
+[[ ! -f "$1" ]] && echo "Binary file not found, make sure to use 'make'" && exit
+[[ ! -f "$1-stub" ]] && echo "Binary file for stubbed case not found, make sure to use 'make VARIANT=stub'" && exit

-if [ "$OPT_SCHEME" = "gromacs" ]; then
-    STUB1_NAME=Stub-33
-    STUB1_PARAMS=-na 4 -nn 33
-    STUB2_NAME=Stub-128
-    STUB2_PARAMS=-na 4 -nn 128
+MDBENCH_BIN=$1
+BIN_INFO="${MDBENCH_BIN#*-}" # $OPT_SCHEME-$TAG-$ISA-$PREC
+OPT_SCHEME="${BIN_INFO%%-*}"
+PREC="${BIN_INFO##*-}"
+BIN_INFO="${BIN_INFO#*-}" # $TAG-$ISA-$PREC
+BIN_INFO="${BIN_INFO%-*}" # $TAG-$ISA
+TAG="${BIN_INFO%%-*}"
+ISA="${BIN_INFO##*-}"
+CORE="${CORE:-0}"
+FREQ="${FREQ:-2.4}"
+NRUNS="${NRUNS:-3}"
+LOG="${LOG:-latencies_and_cfds.$(hostname).log}"
+STUB_ONLY="${STUB_ONLY:-false}"
+SKIP_SET_FREQ="${SKIP_SET_FREQ:-false}"
+
+OPTIND=2
+while getopts "c:f:n:l:s" flag; do
+    case "${flag}" in
+        c) CORE=${OPTARG};;
+        f) FREQ=${OPTARG};;
+        n) NRUNS=${OPTARG};;
+        l) LOG=${OPTARG};;
+        s) STUB_ONLY=true;;
+    esac
+done
+
+# Other useful variables
+MDBENCH_BIN=./MDBench-$OPT_SCHEME-$TAG-$ISA-$PREC
+FIXED_PARAMS="--freq $FREQ"
+CPU_VENDOR=$(lscpu | grep "Vendor ID" | tr -s ' ' | cut -d ' ' -f3)
+
+if [ "$CPU_VENDOR" == "GenuineIntel" ]; then
+    ALL_PREFETCHERS="HW_PREFETCHER,CL_PREFETCHER,DCU_PREFETCHER,IP_PREFETCHER"
+    DEFAULT_PREFETCHERS=("ALL HW_PREFETCHER CL_PREFETCHER DCU_PREFETCHER IP_PREFETCHER NONE")
 else
-    STUB1_NAME=Stub-76
-    STUB1_PARAMS=-nn 76
-    STUB2_NAME=Stub-1024
-    STUB2_PARAMS=-nn 1024
+    ALL_PREFETCHERS=""
+    DEFAULT_PREFETCHERS=("IGNORE")
+fi
+
+if [ -z ${PREFETCHERS+x} ]; then
+    PREFETCHERS=${DEFAULT_PREFETCHERS}
+fi
+
+if [ "$OPT_SCHEME" == "gromacs" ]; then
+    STUB1_NAME=stub-33
+    STUB1_PARAMS="-na 4 -nn 33"
+    STUB2_NAME=stub-128
+    STUB2_PARAMS="-na 4 -nn 128"
+else
+    STUB1_NAME=stub-76
+    STUB1_PARAMS="-nn 76"
+    STUB2_NAME=stub-1024
+    STUB2_PARAMS="-nn 1024"
 fi

 function run_benchmark() {
+    BEST=10000000
    for i in $(seq $NRUNS); do
-        likwid-pin -c 0 "$* $FIXED_PARAMS" 2>&1 | grep "Cycles/SIMD iteration" | cut -d ' ' -f3
+        RES=$(likwid-pin -c $CORE "$* $FIXED_PARAMS" 2>&1 | grep "Cycles/SIMD iteration" | cut -d ' ' -f3)
+        if (( $(echo "$BEST > $RES" | bc -l ) )); then
+            BEST=$RES
+        fi
    done
 }

-echo "Tag: $TAG"
-echo "Optimization scheme: $OPT_SCHEME"
-echo "Binary: $MDBENCH_BIN(-stub)"
-echo "Frequency: $FREQ"
-echo "Number of runs: $NRUNS"
+echo "Tag: $TAG" | tee -a $LOG
+echo "Optimization scheme: $OPT_SCHEME" | tee -a $LOG
+echo "Instruction set: $ISA" | tee -a $LOG
+echo "Precision: $PREC" | tee -a $LOG
+echo "Binary: $MDBENCH_BIN(-stub)" | tee -a $LOG
+echo "Frequency: $FREQ" | tee -a $LOG
+echo "Number of runs: $NRUNS" | tee -a $LOG
+echo "Run only stubbed cases: $STUB_ONLY" | tee -a $LOG

-echo "Fixing frequencies..."
-likwid-setFrequencies -f $FREQ -t 0
+if [ "$SKIP_SET_FREQ" == "false" ]; then
+    echo "Fixing frequencies..."
+    likwid-setFrequencies -f $FREQ -t 0
+fi

-echo "Standard"
-run_benchmark $MDBENCH_BIN
-echo "Melt"
-run_benchmark $MDBENCH_BIN -i data/copper_melting/input_lj_cu_one_atomtype_20x20x20.dmp
-echo "Argon"
-run_benchmark $MDBENCH_BIN -p data/argon_1000/mdbench_params.conf -i data/argon_1000/tprout.gro
-echo "$STUB1_NAME"
-run_benchmark $MDBENCH_BIN-stub $STUB1_PARAMS
-echo "$STUB2_NAME"
-run_benchmark $MDBENCH_BIN-stub $STUB2_PARAMS
+for p in $PREFETCHERS; do
+    if [ "$p" != "IGNORE" ]; then
+        if [ "$p" == "ALL" ]; then
+            likwid-features -c $CORE -e $ALL_PREFETCHERS
+        elif [ "$p" == "NONE" ]; then
+            likwid-features -c $CORE -d $ALL_PREFETCHERS
+        else
+            likwid-features -c $CORE -d $ALL_PREFETCHERS
+            likwid-features -c $CORE -e $p
+        fi
+
+        echo "Prefetcher settings: $p"
+        likwid-features -c $CORE -l
+    fi
+
+    MSG="$p: "
+    if [ "$STUB_ONLY" == "false" ]; then
+        run_benchmark $MDBENCH_BIN
+        MSG+="standard=$BEST, "
+        run_benchmark $MDBENCH_BIN -i data/copper_melting/input_lj_cu_one_atomtype_20x20x20.dmp
+        MSG+="melt=$BEST, "
+        run_benchmark $MDBENCH_BIN -p data/argon_1000/mdbench_params.conf -i data/argon_1000/tprout.gro
+        MSG+="argon=$BEST, "
+    fi
+
+    run_benchmark $MDBENCH_BIN-stub $STUB1_PARAMS
+    MSG+="$STUB1_NAME=$BEST, "
+    run_benchmark $MDBENCH_BIN-stub $STUB2_PARAMS
+    MSG+="$STUB2_NAME=$BEST"
+    echo $MSG | tee -a $LOG
+done
--- a/util/gather-bench/.gitignore
+++ b/util/gather-bench/.gitignore
@@ -0,0 +1,52 @@
+# Prerequisites
+*.d
+
+# Object files
+*.o
+*.ko
+*.obj
+*.elf
+
+# Linker output
+*.ilk
+*.map
+*.exp
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Libraries
+*.lib
+*.a
+*.la
+*.lo
+
+# Shared objects (inc. Windows DLLs)
+*.dll
+*.so
+*.so.*
+*.dylib
+
+# Executables
+*.exe
+*.out
+*.app
+*.i*86
+*.x86_64
+*.hex
+
+# Debug files
+*.dSYM/
+*.su
+*.idb
+*.pdb
+
+# Kernel Module Compile Results
+*.mod*
+*.cmd
+.tmp_versions/
+modules.order
+Module.symvers
+Mkfile.old
+dkms.conf
--- a/util/gather-bench/LICENSE
+++ b/util/gather-bench/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 RRZE-HPC
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/util/gather-bench/Makefile
+++ b/util/gather-bench/Makefile
@@ -0,0 +1,126 @@
+#CONFIGURE BUILD SYSTEM
+TARGET	   = gather-bench-$(TAG)
+BUILD_DIR  = ./$(TAG)
+SRC_DIR	= ./src
+MAKE_DIR   = ./
+ISA_DIR	= ./src/$(ISA)
+Q		 ?= @
+
+#DO NOT EDIT BELOW
+include $(MAKE_DIR)/config.mk
+include $(MAKE_DIR)/include_$(TAG).mk
+include $(MAKE_DIR)/include_LIKWID.mk
+INCLUDES  += -I./src/includes
+
+VPATH	 = $(SRC_DIR) ${ISA_DIR}
+ASM	   = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
+ASM	  += $(patsubst $(SRC_DIR)/%.f90, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.f90))
+OBJ	   = $(filter-out $(BUILD_DIR)/main%, $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c)))
+OBJ	  += $(patsubst $(SRC_DIR)/%.cc, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cc))
+OBJ	  += $(patsubst $(SRC_DIR)/%.cpp, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cpp))
+OBJ	  += $(patsubst $(SRC_DIR)/%.f90, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.f90))
+OBJ	  += $(patsubst $(SRC_DIR)/%.F90, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.F90))
+OBJ	  += $(patsubst $(SRC_DIR)/%.s, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.s))
+OBJ	  += $(patsubst $(ISA_DIR)/%.S, $(BUILD_DIR)/%.o,$(wildcard $(ISA_DIR)/*.S))
+CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(INCLUDES) -DISA_$(ISA)
+
+# When a VARIANT is requested, build $(TARGET)-$(VARIANT) by default.
+# Indented with spaces like the other conditionals: a tab-indented line is
+# only parsed as an assignment while no rule precedes it.
+ifneq ($(VARIANT),)
+    .DEFAULT_GOAL := ${TARGET}-$(VARIANT)
+endif
+
+ifeq ($(strip $(DATA_LAYOUT)),AOS)
+    CPPFLAGS += -DAOS
+endif
+
+ifeq ($(strip $(TEST)),true)
+    CPPFLAGS += -DTEST
+endif
+
+ifeq ($(strip $(PADDING)),true)
+    CPPFLAGS += -DPADDING
+endif
+
+ifeq ($(strip $(MEASURE_GATHER_CYCLES)),true)
+    CPPFLAGS += -DMEASURE_GATHER_CYCLES
+endif
+
+ifeq ($(strip $(ONLY_FIRST_DIMENSION)),true)
+    CPPFLAGS += -DONLY_FIRST_DIMENSION
+endif
+
+ifeq ($(strip $(MEM_TRACER)),true)
+    CPPFLAGS += -DMEM_TRACER
+endif
+
+${TARGET}: $(BUILD_DIR) $(OBJ) $(SRC_DIR)/main.c
+	@echo "===>  LINKING  $(TARGET)"
+	$(Q)${LINKER} ${CPPFLAGS} ${LFLAGS} -o $(TARGET) $(SRC_DIR)/main.c $(OBJ) $(LIBS)
+
+${TARGET}-%: $(BUILD_DIR) $(OBJ) $(SRC_DIR)/main-%.c
+	@echo "===>  LINKING  $(TARGET)-$* "
+	$(Q)${LINKER} ${CPPFLAGS} ${LFLAGS} -o $(TARGET)-$* $(SRC_DIR)/main-$*.c $(OBJ) $(LIBS)
+
+asm:  $(BUILD_DIR) $(ASM)
+
+$(BUILD_DIR)/%.o:  %.c
+	@echo "===>  COMPILE  $@"
+	$(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
+	$(Q)$(CC) $(CPPFLAGS) -MT $(@:.d=.o) -MM  $< > $(BUILD_DIR)/$*.d
+
+$(BUILD_DIR)/%.s:  %.c
+	@echo "===>  GENERATE ASM  $@"
+	$(Q)$(CC) -S $(CPPFLAGS) $(CFLAGS) $< -o $@
+
+$(BUILD_DIR)/%.s:  %.f90
+	@echo "===>  COMPILE  $@"
+	$(Q)$(FC) -S  $(FCFLAGS) $< -o $@
+
+$(BUILD_DIR)/%.o:  %.cc
+	@echo "===>  COMPILE  $@"
+	$(Q)$(CXX) -c $(CPPFLAGS) $(CXXFLAGS) $< -o $@
+	$(Q)$(CXX) $(CPPFLAGS) -MT $(@:.d=.o) -MM  $< > $(BUILD_DIR)/$*.d
+
+$(BUILD_DIR)/%.o:  %.cpp
+	@echo "===>  COMPILE  $@"
+	$(Q)$(CXX) -c $(CPPFLAGS) $(CXXFLAGS) $< -o $@
+	$(Q)$(CXX) $(CPPFLAGS) -MT $(@:.d=.o) -MM  $< > $(BUILD_DIR)/$*.d
+
+$(BUILD_DIR)/%.o:  %.f90
+	@echo "===>  COMPILE  $@"
+	$(Q)$(FC) -c  $(FCFLAGS) $< -o $@
+
+$(BUILD_DIR)/%.o:  %.F90
+	@echo "===>  COMPILE  $@"
+	$(Q)$(FC) -c  $(CPPFLAGS)  $(FCFLAGS) $< -o $@
+
+$(BUILD_DIR)/%.o:  %.s
+	@echo "===>  ASSEMBLE  $@"
+	$(Q)$(AS)  $(ASFLAGS) $< -o $@
+
+$(BUILD_DIR)/%.o:  %.S
+	@echo "===>  ASSEMBLE  $@"
+	$(Q)$(CC) -c $(CPPFLAGS) $< -o $@
+
+tags:
+	@echo "===>  GENERATE  TAGS"
+	$(Q)ctags -R
+
+
+# Create the build directory. -p keeps the recipe from failing when the
+# directory already exists (e.g. created concurrently or by hand).
+$(BUILD_DIR):
+	@mkdir -p $@
+
+ifeq ($(findstring $(MAKECMDGOALS),clean),)
+-include $(OBJ:.o=.d)
+endif
+
+# asm, clean and distclean are commands, not files.
+.PHONY: asm clean distclean
+
+# Remove the build directory and the ctags index.
+clean:
+	@echo "===>  CLEAN"
+	@rm -rf $(BUILD_DIR)
+	@rm -f tags
+
+# Additionally remove the linked binaries, including the $(TARGET)-<variant>
+# ones produced by the ${TARGET}-% rule. The tags file is already removed by
+# the clean prerequisite.
+distclean: clean
+	@echo "===>  DIST CLEAN"
+	@rm -f $(TARGET) $(TARGET)-*
--- a/util/gather-bench/README.md
+++ b/util/gather-bench/README.md
@@ -0,0 +1,2 @@
+# gather-bench
+An x86 gather-instruction performance benchmark
--- a/util/gather-bench/config.mk
+++ b/util/gather-bench/config.mk
@@ -0,0 +1,22 @@
+# Supported: GCC, CLANG, ICC
+TAG ?= ICC
+# Supported: avx2, avx512
+ISA ?= avx512
+# Use likwid?
+ENABLE_LIKWID ?= false
+
+# SP or DP
+DATA_TYPE ?= DP
+# AOS or SOA
+DATA_LAYOUT ?= AOS
+# Padding byte for AoS
+PADDING ?= false
+# Measure cycles for each gather separately
+MEASURE_GATHER_CYCLES ?= false
+# Gather data only for first dimension (one gather per iteration)
+ONLY_FIRST_DIMENSION ?= false
+
+# Trace memory addresses for cache simulator
+MEM_TRACER ?= false
+# Test correctness of gather kernels
+TEST ?= false
--- a/util/gather-bench/include_CLANG.mk
+++ b/util/gather-bench/include_CLANG.mk
@@ -0,0 +1,9 @@
+CC  = clang
+LINKER = $(CC)
+
+OPENMP   =# -fopenmp
+CFLAGS   = -Ofast -std=c11 -march=core-avx2 -mavx -mfma  $(OPENMP)
+LFLAGS   = $(OPENMP) -march=core-avx2 -mavx -mfma
+DEFINES  = -D_GNU_SOURCE
+INCLUDES =
+LIBS     =
--- a/util/gather-bench/include_GCC.mk
+++ b/util/gather-bench/include_GCC.mk
@@ -0,0 +1,11 @@
+CC  = gcc
+AS  = as
+LINKER = $(CC)
+
+OPENMP   = -fopenmp
+CFLAGS   = -Ofast -std=c11 -mavx2 -mfma $(OPENMP)
+ASFLAGS  =
+LFLAGS   = $(OPENMP) -mavx2 -mfma
+DEFINES  = -D_GNU_SOURCE
+INCLUDES =
+LIBS     =
--- a/util/gather-bench/include_ICC.mk
+++ b/util/gather-bench/include_ICC.mk
@@ -0,0 +1,9 @@
+CC  = icc
+LINKER = $(CC)
+
+OPENMP   = -qopenmp
+CFLAGS   = -Ofast -xhost -std=c11 $(OPENMP)
+LFLAGS   = $(OPENMP)
+DEFINES  = -D_GNU_SOURCE
+INCLUDES =
+LIBS     =
--- a/util/gather-bench/include_LIKWID.mk
+++ b/util/gather-bench/include_LIKWID.mk
@@ -0,0 +1,10 @@
+LIKWID_INC ?= -I/usr/local/include
+LIKWID_DEFINES ?= -DLIKWID_PERFMON
+LIKWID_LIB ?= -L/usr/local/lib
+
+ifeq ($(strip $(ENABLE_LIKWID)),true)
+INCLUDES += ${LIKWID_INC}
+DEFINES +=  ${LIKWID_DEFINES}
+LIBS += -llikwid
+LFLAGS += ${LIKWID_LIB}
+endif
--- a/util/gather-bench/src/allocate.c
+++ b/util/gather-bench/src/allocate.c
@@ -0,0 +1,57 @@
+/*
+ * =======================================================================================
+ *
+ *      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *      Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
+ *
+ *      Permission is hereby granted, free of charge, to any person obtaining a copy
+ *      of this software and associated documentation files (the "Software"), to deal
+ *      in the Software without restriction, including without limitation the rights
+ *      to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *      copies of the Software, and to permit persons to whom the Software is
+ *      furnished to do so, subject to the following conditions:
+ *
+ *      The above copyright notice and this permission notice shall be included in all
+ *      copies or substantial portions of the Software.
+ *
+ *      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *      IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *      FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *      AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *      LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *      OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *      SOFTWARE.
+ *
+ * =======================================================================================
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+
+void* allocate (int alignment, size_t bytesize)
+{
+    int errorCode;
+    void* ptr;
+
+    errorCode =  posix_memalign(&ptr, alignment, bytesize);
+
+    if (errorCode) {
+        if (errorCode == EINVAL) {
+            fprintf(stderr,
+                    "Error: Alignment parameter is not a power of two\n");
+            exit(EXIT_FAILURE);
+        }
+        if (errorCode == ENOMEM) {
+            fprintf(stderr,
+                    "Error: Insufficient memory to fulfill the request\n");
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    if (ptr == NULL) {
+        fprintf(stderr, "Error: posix_memalign failed!\n");
+        exit(EXIT_FAILURE);
+    }
+
+    return ptr;
+}
--- a/util/gather-bench/src/avx2/gather.S
+++ b/util/gather-bench/src/avx2/gather.S
@@ -0,0 +1,63 @@
+.intel_syntax noprefix
+.data
+.align 64
+SCALAR:                                 # not referenced by the code in this file
+.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
+
+# rdi -> a   (base address of the doubles being gathered)
+# rsi -> idx (32-bit indices into a; 16 are consumed per loop iteration)
+# rdx -> N   (loop bound for the index counter kept in rax)
+# rcx -> t   (destination of the gathered values; written only when TEST is defined, with aligned 32-byte stores)
+.text
+.globl gather
+.type gather, @function
+gather :
+push rbp
+mov rbp, rsp
+push rbx                                # rbx and r12-r15 are saved and restored although the loop does not modify them
+push r12
+push r13
+push r14
+push r15
+
+xor   rax, rax                          # rax = index counter, starts at 0
+vpcmpeqd ymm0, ymm0, ymm0               # ymm0 = all ones, template for the gather masks
+.align 16
+1:                                      # loop body runs at least once; the bound is only checked at the bottom
+vmovups xmm1, [rsi + rax * 4]           # four groups of 4 indices each
+vmovups xmm2, [rsi + rax * 4 + 16]
+vmovups xmm3, [rsi + rax * 4 + 32]
+vmovups xmm4, [rsi + rax * 4 + 48]
+vmovdqa ymm5, ymm0                      # fresh all-ones mask per gather (AVX2 gathers clear their mask register)
+vmovdqa ymm6, ymm0
+vmovdqa ymm7, ymm0
+vmovdqa ymm8, ymm0
+vxorpd ymm9,  ymm9,  ymm9               # zero the four destination registers
+vxorpd ymm10, ymm10, ymm10
+vxorpd ymm11, ymm11, ymm11
+vxorpd ymm12, ymm12, ymm12
+vgatherdpd ymm9,  [rdi + xmm1 * 8], ymm5  # ymm9[k] = a[idx[rax + k]], k = 0..3
+vgatherdpd ymm10, [rdi + xmm2 * 8], ymm6
+vgatherdpd ymm11, [rdi + xmm3 * 8], ymm7
+vgatherdpd ymm12, [rdi + xmm4 * 8], ymm8
+
+#ifdef TEST
+vmovapd [rcx + rax * 8],      ymm9      # t[rax .. rax+15] receive the 16 gathered doubles
+vmovapd [rcx + rax * 8 + 32], ymm10
+vmovapd [rcx + rax * 8 + 64], ymm11
+vmovapd [rcx + rax * 8 + 96], ymm12
+#endif
+
+addq rax, 16                            # 16 elements handled per iteration
+cmpq rax, rdx
+jl 1b                                   # signed compare of the counter against N
+
+pop r15
+pop r14
+pop r13
+pop r12
+pop rbx
+mov  rsp, rbp
+pop rbp
+ret                                     # rax still holds the final counter value here
+.size gather, .-gather
--- a/util/gather-bench/src/avx2/gather_aos.S
+++ b/util/gather-bench/src/avx2/gather_aos.S
@@ -0,0 +1,71 @@
+.intel_syntax noprefix
+.data
+.align 64
+SCALAR:                                 # not referenced by the code in this file
+.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
+
+# rdi -> a   (base of the array-of-structures data; 3 doubles per element, 4 when PADDING is defined)
+# rsi -> idx (32-bit element indices; 4 are consumed per loop iteration)
+# rdx -> N   (loop bound; under TEST also the distance in doubles between the three output planes of t)
+# rcx -> t   (written only when TEST is defined)
+.text
+.globl gather_aos
+.type gather_aos, @function
+gather_aos :
+push rbp
+mov rbp, rsp
+push rbx                                # rbx and r9 are overwritten only in the TEST stores below
+push r9
+push r10
+push r11
+push r12
+push r13
+push r14
+push r15
+
+xor   rax, rax                          # rax = index counter, starts at 0
+vpcmpeqd ymm8, ymm8, ymm8               # ymm8 = all ones, template for the gather masks
+.align 16
+1:                                      # loop body runs at least once; the bound is only checked at the bottom
+
+vmovups xmm3, XMMWORD PTR [rsi + rax * 4]   # 4 element indices
+vpaddd xmm4, xmm3, xmm3                 # xmm4 = 2 * idx
+#ifdef PADDING
+vpaddd xmm3, xmm4, xmm4                 # xmm3 = 4 * idx (padded element)
+#else
+vpaddd xmm3, xmm3, xmm4                 # xmm3 = 3 * idx
+#endif
+vmovdqa ymm5, ymm8                      # fresh all-ones mask per gather (AVX2 gathers clear their mask register)
+vmovdqa ymm6, ymm8
+vmovdqa ymm7, ymm8
+vxorpd ymm0, ymm0, ymm0                 # zero the three destination registers
+vxorpd ymm1, ymm1, ymm1
+vxorpd ymm2, ymm2, ymm2
+vgatherdpd ymm0, [     rdi + xmm3 * 8], ymm5   # first double of each selected element
+vgatherdpd ymm1, [8  + rdi + xmm3 * 8], ymm6   # second double
+vgatherdpd ymm2, [16 + rdi + xmm3 * 8], ymm7   # third double
+
+#ifdef TEST
+vmovupd  [rcx + rax * 8], ymm0          # plane 0: t[rax ..]
+lea rbx, [rcx + rdx * 8]                # rbx = &t[N]
+vmovupd  [rbx + rax * 8], ymm1          # plane 1: t[N + rax ..]
+lea r9,  [rbx + rdx * 8]                # r9 = &t[2 * N]
+vmovupd  [r9  + rax * 8], ymm2          # plane 2: t[2 * N + rax ..]
+#endif
+
+addq rax, 4                             # 4 elements handled per iteration
+cmpq rax, rdx
+jl 1b                                   # signed compare of the counter against N
+
+pop r15
+pop r14
+pop r13
+pop r12
+pop r11
+pop r10
+pop r9
+pop rbx
+mov  rsp, rbp
+pop rbp
+ret                                     # rax still holds the final counter value here
+.size gather_aos, .-gather_aos
--- a/util/gather-bench/src/avx2/gather_soa.S
+++ b/util/gather-bench/src/avx2/gather_soa.S
@@ -0,0 +1,67 @@
+.intel_syntax noprefix
+.data
+.align 64
+SCALAR:                                 # not referenced by the code in this file
+.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
+
+# rdi -> a   (base of the first component array; the second and third start N and 2 * N doubles later)
+# rsi -> idx (32-bit indices; 4 are consumed per loop iteration)
+# rdx -> N   (loop bound and distance in doubles between component arrays, for a and, under TEST, for t)
+# rcx -> t   (written only when TEST is defined)
+.text
+.globl gather_soa
+.type gather_soa, @function
+gather_soa :
+push rbp
+mov rbp, rsp
+push rbx                                # rbx and r10 are overwritten only in the TEST stores below
+push r9
+push r10
+push r11
+push r12
+push r13
+push r14
+push r15
+
+xor rax, rax                            # rax = index counter, starts at 0
+vpcmpeqd ymm8, ymm8, ymm8               # ymm8 = all ones, template for the gather masks
+lea r8, [rdi + rdx * 8]                 # r8 = &a[N], base of the second component
+lea r9, [r8  + rdx * 8]                 # r9 = &a[2 * N], base of the third component
+.align 16
+1:                                      # loop body runs at least once; the bound is only checked at the bottom
+
+vmovups xmm3, XMMWORD PTR [rsi + rax * 4]   # 4 indices, shared by all three gathers
+vmovdqa ymm5, ymm8                      # fresh all-ones mask per gather (AVX2 gathers clear their mask register)
+vmovdqa ymm6, ymm8
+vmovdqa ymm7, ymm8
+vxorpd ymm0, ymm0, ymm0                 # zero the three destination registers
+vxorpd ymm1, ymm1, ymm1
+vxorpd ymm2, ymm2, ymm2
+vgatherdpd ymm0, [rdi + xmm3 * 8], ymm5 # first component
+vgatherdpd ymm1, [r8  + xmm3 * 8], ymm6 # second component
+vgatherdpd ymm2, [r9  + xmm3 * 8], ymm7 # third component
+
+#ifdef TEST
+vmovupd  [rcx + rax * 8], ymm0          # plane 0: t[rax ..]
+lea rbx, [rcx + rdx * 8]                # rbx = &t[N]
+vmovupd  [rbx + rax * 8], ymm1          # plane 1: t[N + rax ..]
+lea r10, [rbx + rdx * 8]                # r10 = &t[2 * N]
+vmovupd  [r10 + rax * 8], ymm2          # plane 2: t[2 * N + rax ..]
+#endif
+
+addq rax, 4                             # 4 elements handled per iteration
+cmpq rax, rdx
+jl 1b                                   # signed compare of the counter against N
+
+pop r15
+pop r14
+pop r13
+pop r12
+pop r11
+pop r10
+pop r9
+pop rbx
+mov  rsp, rbp
+pop rbp
+ret                                     # rax still holds the final counter value here
+.size gather_soa, .-gather_soa
--- a/util/gather-bench/src/avx512/gather.S
+++ b/util/gather-bench/src/avx512/gather.S
@@ -0,0 +1,62 @@
+.intel_syntax noprefix
+.data
+.align 64
+SCALAR:                                 # not referenced by the code in this file
+.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
+
+# rdi -> a   (base address of the doubles being gathered)
+# rsi -> idx (32-bit indices into a; 32 are consumed per loop iteration)
+# rdx -> N   (loop bound for the index counter kept in rax)
+# rcx -> t   (destination of the gathered values; written only when TEST is defined, with aligned 64-byte stores)
+.text
+.globl gather
+.type gather, @function
+gather :
+push rbp
+mov rbp, rsp
+push rbx                                # rbx and r12-r15 are saved and restored although the loop does not modify them
+push r12
+push r13
+push r14
+push r15
+
+xor   rax, rax                          # rax = index counter, starts at 0
+.align 16
+1:                                      # loop body runs at least once; the bound is only checked at the bottom
+vpcmpeqb k1, xmm0, xmm0                 # a register compared with itself: every mask bit set, whatever xmm0 holds
+vpcmpeqb k2, xmm0, xmm0                 # masks are rebuilt every iteration (AVX-512 gathers clear their mask register)
+vpcmpeqb k3, xmm0, xmm0
+vpcmpeqb k4, xmm0, xmm0
+vmovdqu ymm0, [rsi + rax * 4]           # four groups of 8 indices each
+vmovdqu ymm1, [rsi + rax * 4 + 32]
+vmovdqu ymm2, [rsi + rax * 4 + 64]
+vmovdqu ymm3, [rsi + rax * 4 + 96]
+vpxord zmm4, zmm4, zmm4                 # zero the four destination registers
+vpxord zmm5, zmm5, zmm5
+vpxord zmm6, zmm6, zmm6
+vpxord zmm7, zmm7, zmm7
+vgatherdpd zmm4{k1}, [rdi + ymm0 * 8]   # zmm4[k] = a[idx[rax + k]], k = 0..7
+vgatherdpd zmm5{k2}, [rdi + ymm1 * 8]
+vgatherdpd zmm6{k3}, [rdi + ymm2 * 8]
+vgatherdpd zmm7{k4}, [rdi + ymm3 * 8]
+
+#ifdef TEST
+vmovapd [rcx + rax * 8],       zmm4     # t[rax .. rax+31] receive the 32 gathered doubles
+vmovapd [rcx + rax * 8 + 64],  zmm5
+vmovapd [rcx + rax * 8 + 128], zmm6
+vmovapd [rcx + rax * 8 + 192], zmm7
+#endif
+
+addq rax, 32                            # 32 elements handled per iteration
+cmpq rax, rdx
+jl 1b                                   # signed compare of the counter against N
+
+pop r15
+pop r14
+pop r13
+pop r12
+pop rbx
+mov  rsp, rbp
+pop rbp
+ret                                     # rax still holds the final counter value here
+.size gather, .-gather
--- a/util/gather-bench/src/avx512/gather_aos.S
+++ b/util/gather-bench/src/avx512/gather_aos.S
@@ -0,0 +1,151 @@
+.intel_syntax noprefix
+.data
+.align 64
+SCALAR:                                 # not referenced by the code in this file
+.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
+
+# rdi -> a      (base of the array-of-structures data; 3 doubles per element, 4 when PADDING is defined)
+# rsi -> idx    (32-bit element indices; 8 are consumed per loop iteration)
+# rdx -> N      (loop bound; under TEST also the distance in doubles between the three output planes of t)
+# rcx -> t      (written only when TEST is defined)
+# r8  -> cycles (written only when MEASURE_GATHER_CYCLES is defined: three 8-byte slots per loop iteration)
+.text
+.globl gather_aos
+.type gather_aos, @function
+gather_aos :
+push rbp
+mov rbp, rsp
+push rbx
+push r9
+push r10
+push r11
+push r12
+push r13
+push r14
+push r15
+
+xor   rax, rax                          # rax = index counter, starts at 0
+.align 16
+1:                                      # loop body runs at least once; the bound is only checked at the bottom
+
+vmovdqu ymm3, YMMWORD PTR [rsi + rax * 4]   # 8 element indices
+vpaddd ymm4, ymm3, ymm3                 # ymm4 = 2 * idx
+#ifdef PADDING
+vpaddd ymm3, ymm4, ymm4                 # ymm3 = 4 * idx (padded element)
+#else
+vpaddd ymm3, ymm3, ymm4                 # ymm3 = 3 * idx
+#endif
+
+# Prefetching instructions (all commented out)
+#mov ebx, DWORD PTR[rsi + rax*4]
+#mov r9d, DWORD PTR[4 + rsi + rax*4]
+#mov r10d, DWORD PTR[8 + rsi + rax*4]
+#mov r11d, DWORD PTR[12 + rsi + rax*4]
+#mov r12d, DWORD PTR[16 + rsi + rax*4]
+#mov r13d, DWORD PTR[20 + rsi + rax*4]
+#mov r14d, DWORD PTR[24 + rsi + rax*4]
+#mov r15d, DWORD PTR[28 + rsi + rax*4]
+#lea ebx, DWORD PTR[rbx]
+#lea r9d, DWORD PTR[r9]
+#lea r10d, DWORD PTR[r10]
+#lea r11d, DWORD PTR[r11]
+#lea r12d, DWORD PTR[r12]
+#lea r13d, DWORD PTR[r13]
+#lea r14d, DWORD PTR[r14]
+#lea r15d, DWORD PTR[r15]
+
+vpcmpeqb k1, xmm5, xmm5                 # a register compared with itself: every mask bit set, whatever xmm5 holds
+#ifndef ONLY_FIRST_DIMENSION
+vpcmpeqb k2, xmm5, xmm5
+vpcmpeqb k3, xmm5, xmm5
+#endif
+
+vpxord zmm0, zmm0, zmm0                 # zero the destination register(s)
+#ifndef ONLY_FIRST_DIMENSION
+vpxord zmm1, zmm1, zmm1
+vpxord zmm2, zmm2, zmm2
+#endif
+
+#ifdef MEASURE_GATHER_CYCLES
+
+mov r9, rax                             # rdtsc overwrites eax and edx, so park the counter ...
+mov r10, rdx                            # ... and N in r9/r10 until the measurements are done
+xor r11, r11
+add r11, rax
+add r11, rax
+add r11, rax                            # r11 = 3 * rax = byte offset of this iteration in cycles (24 bytes per 8 elements)
+#shr r11, 3
+
+xor rbx, rbx
+lfence                                  # fence before reading the start timestamp
+rdtsc
+add ebx, eax                            # ebx = low 32 bits of the start timestamp
+vgatherdpd zmm0{k1}, [rdi + ymm3 * 8]
+lfence                                  # fence after the gather, before reading the end timestamp
+rdtsc
+sub eax, ebx                            # eax = elapsed ticks (32-bit difference; upper half of rax is zeroed)
+#movdiri [r8 + r11], rax
+movnti [r8 + r11], rax                  # non-temporal store of the tick count for the first gather
+
+#ifndef ONLY_FIRST_DIMENSION
+xor rbx, rbx
+lfence
+rdtsc
+add ebx, eax
+vgatherdpd zmm1{k2}, [8 + rdi + ymm3 * 8]
+lfence
+rdtsc
+sub eax, ebx
+#movdiri [8 + r8 + r11], rax
+movnti [8 + r8 + r11], rax              # tick count for the second gather
+
+xor rbx, rbx
+lfence
+rdtsc
+add ebx, eax
+vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8]
+lfence
+rdtsc
+sub eax, ebx
+#movdiri [16 + r8 + r11], rax
+movnti [16 + r8 + r11], rax             # tick count for the third gather
+#endif // ONLY_FIRST_DIMENSION
+
+mov rax, r9                             # restore the counter and N
+mov rdx, r10
+
+#else // MEASURE_GATHER_CYCLES
+
+vgatherdpd zmm0{k1}, [     rdi + ymm3 * 8]   # first double of each selected element
+
+#ifndef ONLY_FIRST_DIMENSION
+vgatherdpd zmm1{k2}, [8 +  rdi + ymm3 * 8]   # second double
+vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8]   # third double
+#endif
+
+#endif // MEASURE_GATHER_CYCLES
+
+#ifdef TEST
+vmovupd  [rcx + rax * 8], zmm0          # plane 0: t[rax ..]
+lea rbx, [rcx + rdx * 8]                # rbx = &t[N]
+vmovupd  [rbx + rax * 8], zmm1          # plane 1: t[N + rax ..]
+lea r9,  [rbx + rdx * 8]                # r9 = &t[2 * N]
+vmovupd  [r9  + rax * 8], zmm2          # plane 2: t[2 * N + rax ..]
+#endif
+
+addq rax, 8                             # 8 elements handled per iteration
+cmpq rax, rdx
+jl 1b                                   # signed compare of the counter against N
+
+pop r15
+pop r14
+pop r13
+pop r12
+pop r11
+pop r10
+pop r9
+pop rbx
+mov  rsp, rbp
+pop rbp
+ret                                     # rax still holds the final counter value here
+.size gather_aos, .-gather_aos
--- a/util/gather-bench/src/avx512/gather_md_aos.S
+++ b/util/gather-bench/src/avx512/gather_md_aos.S
@@ -0,0 +1,147 @@
+.intel_syntax noprefix
+.data
+.align 64
+SCALAR:                                 # not referenced by the code in this file
+.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
+
+.section .rodata, "a"
+.align 64
+# Lane numbers 0..7; compared against the remainder count to build the mask of the tail block
+.ymm_reg_mask.1:
+	.long	0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
+	.type	.ymm_reg_mask.1,@object
+	.size	.ymm_reg_mask.1,32
+	.align 8
+
+# rdi -> a            (base of the array-of-structures data; 3 doubles per element, 4 when PADDING is defined)
+# rsi -> neighbors    (32-bit neighbor indices of one atom)
+# rdx -> numneighs[i] (C int: number of valid entries in neighbors)
+# rcx -> &t[t_idx]    (written only when TEST is defined)
+# r8  -> ntest        (C int: distance in doubles between the three output planes of t under TEST)
+.text
+.globl gather_md_aos
+.type gather_md_aos, @function
+gather_md_aos :
+push rbp
+mov rbp, rsp
+push rbx
+push r10
+push r11
+push r12
+push r13
+push r14
+push r15
+
+vmovdqu ymm7, YMMWORD PTR .ymm_reg_mask.1[rip]   # ymm7 = lane numbers 0..7
+movsxd r15, edx                         # r15 = remaining neighbors; the int argument is sign-extended because the upper half of rdx is unspecified
+movsxd r8, r8d                          # same for ntest, which is used as a 64-bit stride in the TEST stores
+xor rax, rax                            # rax = number of neighbors processed so far (also the return value)
+.align 16
+1:                                      # NOTE(review): this block runs once even for lists shorter than 8, then gathers 8 indices and returns 8
+vmovdqu ymm3, YMMWORD PTR [rsi + rax * 4]   # 8 neighbor indices
+vpaddd ymm4, ymm3, ymm3                 # ymm4 = 2 * idx
+#ifdef PADDING
+vpaddd ymm3, ymm4, ymm4                 # ymm3 = 4 * idx (padded element)
+#else
+vpaddd ymm3, ymm3, ymm4                 # ymm3 = 3 * idx
+#endif
+
+# Prefetching instructions (all commented out)
+#mov ebx, DWORD PTR[rsi + rax*4]
+#mov r9d, DWORD PTR[4 + rsi + rax*4]
+#mov r10d, DWORD PTR[8 + rsi + rax*4]
+#mov r11d, DWORD PTR[12 + rsi + rax*4]
+#mov r12d, DWORD PTR[16 + rsi + rax*4]
+#mov r13d, DWORD PTR[20 + rsi + rax*4]
+#mov r14d, DWORD PTR[24 + rsi + rax*4]
+#mov r15d, DWORD PTR[28 + rsi + rax*4]
+#lea ebx, DWORD PTR[rbx]
+#lea r9d, DWORD PTR[r9]
+#lea r10d, DWORD PTR[r10]
+#lea r11d, DWORD PTR[r11]
+#lea r12d, DWORD PTR[r12]
+#lea r13d, DWORD PTR[r13]
+#lea r14d, DWORD PTR[r14]
+#lea r15d, DWORD PTR[r15]
+
+vpcmpeqb k1, xmm5, xmm5                 # a register compared with itself: every mask bit set, whatever xmm5 holds
+#ifndef ONLY_FIRST_DIMENSION
+vpcmpeqb k2, xmm5, xmm5
+vpcmpeqb k3, xmm5, xmm5
+#endif
+
+vpxord zmm0, zmm0, zmm0                 # zero the destination register(s)
+#ifndef ONLY_FIRST_DIMENSION
+vpxord zmm1, zmm1, zmm1
+vpxord zmm2, zmm2, zmm2
+#endif
+
+vgatherdpd zmm0{k1}, [     rdi + ymm3 * 8]   # first double of each selected element
+#ifndef ONLY_FIRST_DIMENSION
+vgatherdpd zmm1{k2}, [8 +  rdi + ymm3 * 8]   # second double
+vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8]   # third double
+#endif
+
+#ifdef TEST
+vmovupd  [rcx + rax * 8], zmm0          # plane 0
+lea rbx, [rcx + r8  * 8]                # rbx = plane 1 base (ntest doubles further)
+vmovupd  [rbx + rax * 8], zmm1
+lea r10, [rbx + r8  * 8]                # r10 = plane 2 base
+vmovupd  [r10 + rax * 8], zmm2
+#endif
+
+# TODO: see if this logic can be optimized
+addq rax, 8
+subq r15, 8
+cmpq r15, 8
+jge 1b                                  # another full block of 8 is available
+
+cmpq r15, 0
+jle .end_func                           # nothing left (or the list was shorter than 8)
+
+vpbroadcastd ymm6, r15d                 # remainder count in every lane
+vpcmpgtd k1, ymm6, ymm7                 # k1 = lanes whose number is below the remainder count
+vmovdqu32 ymm3{k1}{z}, YMMWORD PTR [rsi + rax * 4]   # load only the valid indices, zero the other lanes
+vpaddd ymm4, ymm3, ymm3
+#ifdef PADDING
+vpaddd ymm3, ymm4, ymm4
+#else
+vpaddd ymm3, ymm3, ymm4
+#endif
+
+vpxord    zmm0, zmm0, zmm0              # zero zmm0 (was zmm1 xor zmm2, which left stale gather data in masked-off lanes)
+#ifndef ONLY_FIRST_DIMENSION
+kmovw     k2, k1                        # each gather consumes its own copy of the tail mask
+kmovw     k3, k1
+vpxord    zmm1, zmm1, zmm1
+vpxord    zmm2, zmm2, zmm2
+#endif
+
+vgatherdpd zmm0{k1}, [     rdi + ymm3 * 8]
+#ifndef ONLY_FIRST_DIMENSION
+vgatherdpd zmm1{k2}, [8 +  rdi + ymm3 * 8]
+vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8]
+#endif
+
+#ifdef TEST
+vmovupd  [rcx + rax * 8], zmm0          # full 64-byte stores: lanes past the remainder are written as zeros
+lea rbx, [rcx + r8  * 8]
+vmovupd  [rbx + rax * 8], zmm1
+lea r10, [rbx + r8  * 8]
+vmovupd  [r10  + rax * 8], zmm2
+#endif
+
+addq rax, r15                           # count the tail elements in the return value
+
+.end_func:
+pop r15
+pop r14
+pop r13
+pop r12
+pop r11
+pop r10
+pop rbx
+mov  rsp, rbp
+pop rbp
+ret                                     # rax = number of neighbors processed
+.size gather_md_aos, .-gather_md_aos
--- a/util/gather-bench/src/avx512/gather_soa.S
+++ b/util/gather-bench/src/avx512/gather_soa.S
@@ -0,0 +1,67 @@
+.intel_syntax noprefix
+.data
+.align 64
+SCALAR:                                 # not referenced by the code in this file
+.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
+
+# rdi -> a   (base of the first component array; the second and third start N and 2 * N doubles later)
+# rsi -> idx (32-bit indices; 8 are consumed per loop iteration)
+# rdx -> N   (loop bound and distance in doubles between component arrays, for a and, under TEST, for t)
+# rcx -> t   (written only when TEST is defined)
+.text
+.globl gather_soa
+.type gather_soa, @function
+gather_soa :
+push rbp
+mov rbp, rsp
+push rbx                                # rbx and r10 are overwritten only in the TEST stores below
+push r9
+push r10
+push r11
+push r12
+push r13
+push r14
+push r15
+
+xor   rax, rax                          # rax = index counter, starts at 0
+vpcmpeqd ymm8, ymm8, ymm8               # ymm8 is not read again in this file; the gathers below use k1-k3 as masks
+lea r8, [rdi + rdx * 8]                 # r8 = &a[N], base of the second component
+lea r9, [r8  + rdx * 8]                 # r9 = &a[2 * N], base of the third component
+.align 16
+1:                                      # loop body runs at least once; the bound is only checked at the bottom
+
+vmovdqu ymm3, YMMWORD PTR [rsi + rax * 4]   # 8 indices, shared by all three gathers
+vpcmpeqb k1, xmm5, xmm5                 # a register compared with itself: every mask bit set, whatever xmm5 holds
+vpcmpeqb k2, xmm5, xmm5
+vpcmpeqb k3, xmm5, xmm5
+vpxord zmm0, zmm0, zmm0                 # zero the three destination registers
+vpxord zmm1, zmm1, zmm1
+vpxord zmm2, zmm2, zmm2
+vgatherdpd zmm0{k1}, [rdi + ymm3 * 8]   # first component
+vgatherdpd zmm1{k2}, [r8  + ymm3 * 8]   # second component
+vgatherdpd zmm2{k3}, [r9  + ymm3 * 8]   # third component
+
+#ifdef TEST
+vmovupd  [rcx + rax * 8], zmm0          # plane 0: t[rax ..]
+lea rbx, [rcx + rdx * 8]                # rbx = &t[N]
+vmovupd  [rbx + rax * 8], zmm1          # plane 1: t[N + rax ..]
+lea r10, [rbx + rdx * 8]                # r10 = &t[2 * N]
+vmovupd  [r10 + rax * 8], zmm2          # plane 2: t[2 * N + rax ..]
+#endif
+
+addq rax, 8                             # 8 elements handled per iteration
+cmpq rax, rdx
+jl 1b                                   # signed compare of the counter against N
+
+pop r15
+pop r14
+pop r13
+pop r12
+pop r11
+pop r10
+pop r9
+pop rbx
+mov  rsp, rbp
+pop rbp
+ret                                     # rax still holds the final counter value here
+.size gather_soa, .-gather_soa
--- a/util/gather-bench/src/avx512/load_aos.S
+++ b/util/gather-bench/src/avx512/load_aos.S
@@ -0,0 +1,23 @@
+.intel_syntax noprefix
+.data
+.align 64
+SCALAR:                                 # not referenced by the code in this file
+.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
+
+# rdi -> &a[i * snbytes]  (address of three consecutive doubles; results stay in zmm3-zmm5, nothing is stored)
+
+.text
+.globl load_aos
+.type load_aos, @function
+load_aos :
+
+vmovsd xmm0, QWORD PTR [rdi]            # load the three doubles of one element
+vmovsd xmm1, QWORD PTR [8  + rdi]
+vmovsd xmm2, QWORD PTR [16 + rdi]
+
+vbroadcastsd zmm3, xmm0                 # replicate each double across all 8 lanes
+vbroadcastsd zmm4, xmm1
+vbroadcastsd zmm5, xmm2
+
+ret
+.size load_aos, .-load_aos
--- a/util/gather-bench/src/includes/allocate.h
+++ b/util/gather-bench/src/includes/allocate.h
@@ -0,0 +1,32 @@
+/*
+ * =======================================================================================
+ *
+ *      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *      Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
+ *
+ *      Permission is hereby granted, free of charge, to any person obtaining a copy
+ *      of this software and associated documentation files (the "Software"), to deal
+ *      in the Software without restriction, including without limitation the rights
+ *      to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *      copies of the Software, and to permit persons to whom the Software is
+ *      furnished to do so, subject to the following conditions:
+ *
+ *      The above copyright notice and this permission notice shall be included in all
+ *      copies or substantial portions of the Software.
+ *
+ *      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *      IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *      FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *      AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *      LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *      OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *      SOFTWARE.
+ *
+ * =======================================================================================
+ */
+#ifndef __ALLOCATE_H_
+#define __ALLOCATE_H_
+#include <stddef.h> /* size_t, so this header can be included on its own */
+/* Returns bytesize bytes aligned to alignment (via posix_memalign); exits the process on failure. */
+extern void* allocate (int alignment, size_t bytesize);
+#endif
--- a/util/gather-bench/src/includes/likwid-marker.h
+++ b/util/gather-bench/src/includes/likwid-marker.h
@@ -0,0 +1,53 @@
+/*
+ * =======================================================================================
+ *
+ *      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *      Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
+ *
+ *      Permission is hereby granted, free of charge, to any person obtaining a copy
+ *      of this software and associated documentation files (the "Software"), to deal
+ *      in the Software without restriction, including without limitation the rights
+ *      to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *      copies of the Software, and to permit persons to whom the Software is
+ *      furnished to do so, subject to the following conditions:
+ *
+ *      The above copyright notice and this permission notice shall be included in all
+ *      copies or substantial portions of the Software.
+ *
+ *      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *      IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *      FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *      AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *      LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *      OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *      SOFTWARE.
+ *
+ * =======================================================================================
+ */
+#ifndef LIKWID_MARKERS_H
+#define LIKWID_MARKERS_H
+/* Marker macros: forward to the likwid_marker* functions when LIKWID_PERFMON is defined, otherwise expand to nothing. */
+#ifdef LIKWID_PERFMON
+#include <likwid.h>
+#define LIKWID_MARKER_INIT likwid_markerInit()
+#define LIKWID_MARKER_THREADINIT likwid_markerThreadInit()
+#define LIKWID_MARKER_SWITCH likwid_markerNextGroup()
+#define LIKWID_MARKER_REGISTER(regionTag) likwid_markerRegisterRegion(regionTag)
+#define LIKWID_MARKER_START(regionTag) likwid_markerStartRegion(regionTag)
+#define LIKWID_MARKER_STOP(regionTag) likwid_markerStopRegion(regionTag)
+#define LIKWID_MARKER_CLOSE likwid_markerClose()
+#define LIKWID_MARKER_RESET(regionTag) likwid_markerResetRegion(regionTag)
+#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count) likwid_markerGetRegion(regionTag, nevents, events, time, count)
+#else  /* LIKWID_PERFMON not defined: every marker macro is empty, so call sites need no #ifdef */
+#define LIKWID_MARKER_INIT
+#define LIKWID_MARKER_THREADINIT
+#define LIKWID_MARKER_SWITCH
+#define LIKWID_MARKER_REGISTER(regionTag)
+#define LIKWID_MARKER_START(regionTag)
+#define LIKWID_MARKER_STOP(regionTag)
+#define LIKWID_MARKER_CLOSE
+#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count)
+#define LIKWID_MARKER_RESET(regionTag)
+#endif /* LIKWID_PERFMON */
+
+#endif /*LIKWID_MARKERS_H*/
--- a/util/gather-bench/src/includes/timing.h
+++ b/util/gather-bench/src/includes/timing.h
@@ -0,0 +1,34 @@
+/*
+ * =======================================================================================
+ *
+ *      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *      Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
+ *
+ *      Permission is hereby granted, free of charge, to any person obtaining a copy
+ *      of this software and associated documentation files (the "Software"), to deal
+ *      in the Software without restriction, including without limitation the rights
+ *      to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *      copies of the Software, and to permit persons to whom the Software is
+ *      furnished to do so, subject to the following conditions:
+ *
+ *      The above copyright notice and this permission notice shall be included in all
+ *      copies or substantial portions of the Software.
+ *
+ *      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *      IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *      FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *      AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *      LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *      OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *      SOFTWARE.
+ *
+ * =======================================================================================
+ */
+#ifndef __TIMING_H_
+#define __TIMING_H_
+/* Timing helpers defined outside this header; (void) makes these real prototypes so stray arguments are rejected. */
+extern double getTimeStamp(void);      /* presumably a time stamp in seconds - TODO confirm against the definition */
+extern double getTimeResolution(void);
+extern double getTimeStamp_(void);
+
+#endif
--- a/util/gather-bench/src/main-md-trace.c
+++ b/util/gather-bench/src/main-md-trace.c
@@ -0,0 +1,441 @@
+/*
+ * =======================================================================================
+ *
+ *      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *      Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
+ *
+ *      Permission is hereby granted, free of charge, to any person obtaining a copy
+ *      of this software and associated documentation files (the "Software"), to deal
+ *      in the Software without restriction, including without limitation the rights
+ *      to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *      copies of the Software, and to permit persons to whom the Software is
+ *      furnished to do so, subject to the following conditions:
+ *
+ *      The above copyright notice and this permission notice shall be included in all
+ *      copies or substantial portions of the Software.
+ *
+ *      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *      IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *      FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *      AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *      LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *      OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *      SOFTWARE.
+ *
+ * =======================================================================================
+ */
+#include <float.h>
+#include <getopt.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <x86intrin.h>
+//---
+#include <likwid-marker.h>
+//---
+#include <allocate.h>
+#include <timing.h>
+
+#if !defined(ISA_avx2) && !defined (ISA_avx512) // the build must define one of the ISA_* macros
+#error "Invalid ISA macro, possible values are: avx2 and avx512"
+#endif
+
+#if defined(TEST) && defined(ONLY_FIRST_DIMENSION)
+#error "TEST and ONLY_FIRST_DIMENSION options are mutually exclusive!"
+#endif
+
+#define HLINE "----------------------------------------------------------------------------\n"
+// MIN, MAX and ABS evaluate their arguments more than once: do not pass expressions with side effects
+#ifndef MIN
+#define MIN(x,y) ((x)<(y)?(x):(y))
+#endif
+#ifndef MAX
+#define MAX(x,y) ((x)>(y)?(x):(y))
+#endif
+#ifndef ABS
+#define ABS(a) ((a) >= 0 ? (a) : -(a))
+#endif
+
+#define ARRAY_ALIGNMENT  64 // alignment passed to allocate() for every array
+
+#ifdef ISA_avx512
+#define _VL_  8 // reported in the output header as "Vector Width (e)"
+#define ISA_STRING "avx512"
+#else
+#define _VL_  4 // reported in the output header as "Vector Width (e)"
+#define ISA_STRING "avx2"
+#endif
+// GATHER and LOAD select the kernel pair for the chosen data layout
+#ifdef AOS
+#define GATHER gather_md_aos
+#define LOAD(a, i, d, n) load_aos(&a[i * d]) // n is unused for AoS
+#define LAYOUT_STRING "AoS"
+#else
+#define GATHER gather_md_soa
+#define LOAD(a, i, d, n) load_soa(a, i, n) // d is unused for SoA
+#define LAYOUT_STRING "SoA"
+#endif
+// Despite its name, PADDING_BYTES counts extra double elements per atom: it is added to dims to form the element stride
+#if defined(PADDING) && defined(AOS)
+#define PADDING_BYTES 1
+#else
+#define PADDING_BYTES 0
+#endif
+// Memory tracing helpers; the enabled variants already end in a semicolon and share the local variable mem_tracer_fp
+#ifdef MEM_TRACER
+#   define MEM_TRACER_INIT(trace_file)    FILE *mem_tracer_fp = fopen(get_mem_tracer_filename(trace_file), "w");
+#   define MEM_TRACER_END                 fclose(mem_tracer_fp);
+#   define MEM_TRACE(addr, op)            fprintf(mem_tracer_fp, "%c: %p\n", op, (void *)(&(addr)));
+#else
+#   define MEM_TRACER_INIT // NOTE(review): object-like here but function-like above, so MEM_TRACER_INIT(x) leaves "(x)" behind
+#   define MEM_TRACER_END
+#   define MEM_TRACE(addr, op)
+#endif
+
+int gather_md_aos(double*, int*, int, double*, int);  // (a, neighbors, numneighs[i], &t[t_idx], ntest), see src/avx512/gather_md_aos.S
+int gather_md_soa(double*, int*, int, double*, int);  // chosen by GATHER when AOS is not defined; presumably the same argument order - TODO confirm
+void load_aos(double*);                               // called through LOAD as load_aos(&a[i * d])
+void load_soa(double*, int, int);                     // called through LOAD as load_soa(a, i, n)
+
+const char *get_mem_tracer_filename(const char *trace_file) { // builds "mem_tracer_<trace_file>.txt"
+    static char out_name[64]; // one shared buffer: overwritten by every call, longer names are truncated to fit
+    (void) snprintf(out_name, sizeof(out_name), "mem_tracer_%s.txt", trace_file);
+    return &out_name[0];
+}
+
+int log2_uint(unsigned int x) { // floor(log2(x)) for x >= 1; yields 0 when x is 0
+    int shifts = 0;
+    for(; x > 1u; x >>= 1) { ++shifts; }
+    return shifts;
+}
+
+int main (int argc, char** argv) {
+    LIKWID_MARKER_INIT;
+    LIKWID_MARKER_REGISTER("gather");
+    char *trace_file = NULL;
+    int cl_size = 64;
+    int ntimesteps = 200;
+    int reneigh_every = 20;
+    int opt = 0;
+    double freq = 2.5;
+    struct option long_opts[] = {
+        {"trace" ,      required_argument,   NULL,   't'},
+        {"freq",        required_argument,   NULL,   'f'},
+        {"line",        required_argument,   NULL,   'l'},
+        {"timesteps",   required_argument,   NULL,   'n'},
+        {"reneigh",     required_argument,   NULL,   'r'},
+        {"help",        required_argument,   NULL,   'h'}
+    };
+
+    while((opt = getopt_long(argc, argv, "t:f:l:n:r:h", long_opts, NULL)) != -1) {
+        switch(opt) {
+            case 't':
+                trace_file = strdup(optarg);
+                break;
+
+            case 'f':
+                freq = atof(optarg);
+                break;
+
+            case 'l':
+                cl_size = atoi(optarg);
+                break;
+
+            case 'n':
+                ntimesteps = atoi(optarg);
+                break;
+
+            case 'r':
+                reneigh_every = atoi(optarg);
+                break;
+
+            case 'h':
+            case '?':
+            default:
+                printf("Usage: %s [OPTION]...\n", argv[0]);
+                printf("MD variant for gather benchmark.\n\n");
+                printf("Mandatory arguments to long options are also mandatory for short options.\n");
+                printf("\t-t, --trace=STRING        input file with traced indexes from MD-Bench.\n");
+                printf("\t-f, --freq=REAL           CPU frequency in GHz (default 2.5).\n");
+                printf("\t-l, --line=NUMBER         cache line size in bytes (default 64).\n");
+                printf("\t-n, --timesteps=NUMBER    number of timesteps to simulate (default 200).\n");
+                printf("\t-r, --reneigh=NUMBER      reneighboring frequency in timesteps (default 20).\n");
+                printf("\t-h, --help                display this help message.\n");
+                printf("\n\n");
+                return EXIT_FAILURE;
+        }
+    }
+
+    if(trace_file == NULL) {
+        fprintf(stderr, "Trace file not specified!\n");
+        return EXIT_FAILURE;
+    }
+
+    FILE *fp;
+    char *line = NULL;
+    int *neighborlists = NULL;
+    int *numneighs = NULL;
+    int atom = -1;
+    int nlocal, nghost, maxneighs;
+    int nall = 0;
+    int N_alloc = 0;
+    size_t ntest = 0;
+    size_t llen;
+    ssize_t read;
+    double *a = NULL;
+    double *f = NULL;
+    double *t = NULL;
+    double time = 0.0;
+    double E, S;
+    const int dims = 3;
+    const int snbytes = dims + PADDING_BYTES; // bytes per element (struct), includes padding
+    long long int niters = 0;
+    long long int ngathered = 0;
+
+    printf("ISA,Layout,Dims,Frequency (GHz),Cache Line Size (B),Vector Width (e)\n");
+    printf("%s,%s,%d,%f,%d,%d\n\n", ISA_STRING, LAYOUT_STRING, dims, freq, cl_size, _VL_);
+    freq = freq * 1e9;
+
+    #ifdef ONLY_FIRST_DIMENSION
+    const int gathered_dims = 1;
+    #else
+    const int gathered_dims = dims;
+    #endif
+
+    for(int ts = -1; ts < ntimesteps; ts++) {
+        if(!((ts + 1) % reneigh_every)) {
+            char ts_trace_file[128];
+            snprintf(ts_trace_file, sizeof ts_trace_file, "%s_%d.out", trace_file, ts + 1);
+            if((fp = fopen(ts_trace_file, "r")) == NULL) {
+                fprintf(stderr, "Error: could not open trace file!\n");
+                return EXIT_FAILURE;
+            }
+
+            while((read = getline(&line, &llen, fp)) != -1) {
+                int i = 2;
+                if(strncmp(line, "N:", 2) == 0) {
+                    while(line[i] == ' ') { i++; }
+                    nlocal = atoi(strtok(&line[i], " "));
+                    nghost = atoi(strtok(NULL, " "));
+                    nall = nlocal + nghost;
+                    maxneighs = atoi(strtok(NULL, " "));
+
+                    if(nlocal <= 0 || maxneighs <= 0) {
+                        fprintf(stderr, "Number of local atoms and neighbor lists capacity cannot be less or equal than zero!\n");
+                        return EXIT_FAILURE;
+                    }
+
+                    if(neighborlists == NULL) {
+                        neighborlists = (int *) allocate( ARRAY_ALIGNMENT, nlocal * maxneighs * sizeof(int) );
+                        numneighs = (int *) allocate( ARRAY_ALIGNMENT, nlocal * sizeof(int) );
+                    }
+                }
+
+                if(strncmp(line, "A:", 2) == 0) {
+                    while(line[i] == ' ') { i++; }
+                    atom = atoi(strtok(&line[i], " "));
+                    numneighs[atom] = 0;
+                }
+
+                if(strncmp(line, "I:", 2) == 0) {
+                    while(line[i] == ' ') { i++; }
+                    char *neigh_idx = strtok(&line[i], " ");
+
+                    while(neigh_idx != NULL && *neigh_idx != '\n') {
+                        int j = numneighs[atom];
+                        neighborlists[atom * maxneighs + j] = atoi(neigh_idx);
+                        numneighs[atom]++;
+                        ntest++;
+                        neigh_idx = strtok(NULL, " ");
+                    }
+                }
+            }
+
+            fclose(fp);
+        }
+
+        if(N_alloc == 0) {
+            N_alloc = nall * 2;
+            a = (double*) allocate( ARRAY_ALIGNMENT, N_alloc * snbytes * sizeof(double) );
+            f = (double*) allocate( ARRAY_ALIGNMENT, N_alloc * dims * sizeof(double) );
+        }
+
+        #ifdef TEST
+        if(t != NULL) { free(t); }
+        ntest += 100;
+        t = (double*) allocate( ARRAY_ALIGNMENT, ntest * dims * sizeof(double) );
+        #endif
+
+        for(int i = 0; i < N_alloc; ++i) {
+            #ifdef AOS
+            a[i * snbytes + 0] = i * dims + 0;
+            a[i * snbytes + 1] = i * dims + 1;
+            a[i * snbytes + 2] = i * dims + 2;
+            #else
+            a[N * 0 + i] = N * 0 + i;
+            a[N * 1 + i] = N * 1 + i;
+            a[N * 2 + i] = N * 2 + i;
+            #endif
+            f[i * dims + 0] = 0.0;
+            f[i * dims + 1] = 0.0;
+            f[i * dims + 2] = 0.0;
+        }
+
+        int t_idx = 0;
+        S = getTimeStamp();
+        LIKWID_MARKER_START("gather");
+        for(int i = 0; i < nlocal; i++) {
+            int *neighbors = &neighborlists[i * maxneighs];
+            // We inline the assembly for AVX512 with AoS layout to evaluate the impact
+            // of calling external assembly procedures in the overall runtime
+            #ifdef ISA_avx512
+            __m256i ymm_reg_mask = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+            __asm__ __volatile__(   "vmovsd 0(%0), %%xmm3;"
+                                    "vmovsd 8(%0), %%xmm4;"
+                                    "vmovsd 16(%0), %%xmm5;"
+                                    "vbroadcastsd %%xmm3, %%zmm0;"
+                                    "vbroadcastsd %%xmm4, %%zmm1;"
+                                    "vbroadcastsd %%xmm5, %%zmm2;"
+                                    :
+                                    : "r" (&a[i * snbytes])
+                                    : "%xmm3", "%xmm4", "%xmm5", "%zmm0", "%zmm1", "%zmm2"  );
+
+            __asm__ __volatile__(   "xor %%rax, %%rax;"
+                                    "movq %%rdx, %%r15;"
+                                    "1: vmovdqu (%1,%%rax,4), %%ymm3;"
+                                    "vpaddd %%ymm3, %%ymm3, %%ymm4;"
+                                    #ifdef PADDING
+                                    "vpaddd %%ymm4, %%ymm4, %%ymm3;"
+                                    #else
+                                    "vpaddd %%ymm3, %%ymm4, %%ymm3;"
+                                    #endif
+                                    "vpcmpeqb %%xmm5, %%xmm5, %%k1;"
+                                    "vpcmpeqb %%xmm5, %%xmm5, %%k2;"
+                                    "vpcmpeqb %%xmm5, %%xmm5, %%k3;"
+                                    "vpxord %%zmm0, %%zmm0, %%zmm0;"
+                                    "vpxord %%zmm1, %%zmm1, %%zmm1;"
+                                    "vpxord %%zmm2, %%zmm2, %%zmm2;"
+                                    "vgatherdpd (%3, %%ymm3, 8), %%zmm0{{%%k1}};"
+                                    "vgatherdpd 8(%3, %%ymm3, 8), %%zmm1{{%%k2}};"
+                                    "vgatherdpd 16(%3, %%ymm3, 8), %%zmm2{{%%k3}};"
+                                    "addq $8, %%rax;"
+                                    "subq $8, %%r15;"
+                                    "cmpq $8, %%r15;"
+                                    "jge 1b;"
+                                    "cmpq $0, %%r15;"
+                                    "jle 2;"
+                                    "vpbroadcastd %%r15d, %%ymm5;"
+                                    "vpcmpgtd %%ymm5, %2, %%k1;"
+                                    "vmovdqu32 (%1,%%rax,4), %%ymm3{{%%k1}}{{z}};"
+                                    "vpaddd %%ymm3, %%ymm3, %%ymm4;"
+                                    #ifdef PADDING
+                                    "vpaddd %%ymm4, %%ymm4, %%ymm3;"
+                                    #else
+                                    "vpaddd %%ymm3, %%ymm4, %%ymm3;"
+                                    #endif
+                                    "vpxord %%zmm0, %%zmm0, %%zmm0;"
+                                    "kmovw %%k1, %%k2;"
+                                    "kmovw %%k1, %%k3;"
+                                    "vpxord %%zmm1, %%zmm1, %%zmm1;"
+                                    "vpxord %%zmm2, %%zmm2, %%zmm2;"
+                                    "vgatherdpd (%3, %%ymm3, 8), %%zmm0{{%%k1}};"
+                                    "vgatherdpd 8(%3, %%ymm3, 8), %%zmm1{{%%k2}};"
+                                    "vgatherdpd 16(%3, %%ymm3, 8), %%zmm2{{%%k3}};"
+                                    "addq %%r15, %%rax;"
+                                    "2:;"
+                                    :
+                                    : "d" (numneighs[i]), "r" (neighbors), "x" (ymm_reg_mask), "r" (a)
+                                    : "%rax", "%r15", "%ymm3", "%ymm4", "%ymm5", "%k1", "%k2", "%k3", "%zmm0", "%zmm1", "%zmm2" );
+            #else
+            LOAD(a, i, snbytes, N_alloc);
+            t_idx += GATHER(a, neighbors, numneighs[i], &t[t_idx], ntest);
+            #endif
+            f[i * dims + 0] += i;
+            f[i * dims + 1] += i;
+            f[i * dims + 2] += i;
+        }
+        LIKWID_MARKER_STOP("gather");
+        E = getTimeStamp();
+        time += E - S;
+
+        #ifdef MEM_TRACER
+        MEM_TRACER_INIT(trace_file);
+        for(int i = 0; i < nlocal; i++) {
+            int *neighbors = &neighborlists[i * maxneighs];
+
+            for(int d = 0; d < gathered_dims; d++) {
+                #ifdef AOS
+                MEM_TRACE('R', a[i * snbytes + d])
+                #else
+                MEM_TRACE('R', a[d * N + i])
+                #endif
+            }
+
+            for(int j = 0; j < numneighs[i]; j += _VL_) {
+                for(int jj = j; jj < MIN(j + _VL_, numneighs[i]); j++) {
+                    int k = neighbors[jj];
+                    for(int d = 0; d < gathered_dims; d++) {
+                        #ifdef AOS
+                        MEM_TRACE('R', a[k * snbytes + d])
+                        #else
+                        MEM_TRACE('R', a[d * N + k])
+                        #endif
+                    }
+                }
+            }
+        }
+        MEM_TRACER_END;
+        #endif
+
+        #ifdef TEST
+        int test_failed = 0;
+        t_idx = 0;
+        for(int i = 0; i < nlocal; ++i) {
+            int *neighbors = &neighborlists[i * maxneighs];
+            for(int j = 0; j < numneighs[i]; ++j) {
+                int k = neighbors[j];
+                for(int d = 0; d < dims; ++d) {
+                    #ifdef AOS
+                    if(t[d * ntest + t_idx] != k * dims + d) {
+                    #else
+                    if(t[d * ntest + t_idx] != d * N + k) {
+                    #endif
+                        test_failed = 1;
+                        break;
+                    }
+                }
+
+                t_idx++;
+            }
+        }
+
+        if(test_failed) {
+            printf("Test failed!\n");
+            return EXIT_FAILURE;
+        }
+        #endif
+
+        for(int i = 0; i < nlocal; i++) {
+            niters += (numneighs[i] / _VL_) + ((numneighs[i] % _VL_ == 0) ? 0 : 1);
+            ngathered += numneighs[i];
+        }
+    }
+
+    printf("%14s,%14s,%14s,%14s,%14s,%14s", "tot. time(s)", "time/step(ms)", "time/iter(us)", "cy/it", "cy/gather", "cy/elem");
+    printf("\n");
+    const double time_per_step = time * 1e3 / ((double) ntimesteps);
+    const double time_per_it = time * 1e6 / ((double) niters);
+    const double cy_per_it = time * freq * _VL_ / ((double) niters);
+    const double cy_per_gather = time * freq * _VL_ / ((double) niters * gathered_dims);
+    const double cy_per_elem = time * freq / ((double) ngathered * gathered_dims);
+    printf("%14.6f,%14.6f,%14.6f,%14.6f,%14.6f,%14.6f\n", time, time_per_step, time_per_it, cy_per_it, cy_per_gather, cy_per_elem);
+
+    #ifdef TEST
+    printf("Test passed!\n");
+    #endif
+
+    LIKWID_MARKER_CLOSE;
+    return EXIT_SUCCESS;
+}
--- a/util/gather-bench/src/main-md.c
+++ b/util/gather-bench/src/main-md.c
@@ -0,0 +1,361 @@
+/*
+ * =======================================================================================
+ *
+ *      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *      Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
+ *
+ *      Permission is hereby granted, free of charge, to any person obtaining a copy
+ *      of this software and associated documentation files (the "Software"), to deal
+ *      in the Software without restriction, including without limitation the rights
+ *      to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *      copies of the Software, and to permit persons to whom the Software is
+ *      furnished to do so, subject to the following conditions:
+ *
+ *      The above copyright notice and this permission notice shall be included in all
+ *      copies or substantial portions of the Software.
+ *
+ *      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *      IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *      FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *      AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *      LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *      OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *      SOFTWARE.
+ *
+ * =======================================================================================
+ */
+#include <float.h>
+#include <getopt.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+//---
+#include <likwid-marker.h>
+//---
+#include <allocate.h>
+#include <timing.h>
+
+#if !defined(ISA_avx2) && !defined (ISA_avx512)
+#error "Invalid ISA macro, possible values are: avx2 and avx512"
+#endif
+
+#if defined(TEST) && defined(ONLY_FIRST_DIMENSION)
+#error "TEST and ONLY_FIRST_DIMENSION options are mutually exclusive!"
+#endif
+
+#define HLINE "----------------------------------------------------------------------------\n"
+
+#ifndef MIN
+#define MIN(x,y) ((x)<(y)?(x):(y))
+#endif
+#ifndef MAX
+#define MAX(x,y) ((x)>(y)?(x):(y))
+#endif
+#ifndef ABS
+#define ABS(a) ((a) >= 0 ? (a) : -(a))
+#endif
+
+#define ARRAY_ALIGNMENT  64
+#define SIZE  20000
+
+#ifdef ISA_avx512
+#define _VL_  8
+#define ISA_STRING "avx512"
+#else
+#define _VL_  4
+#define ISA_STRING "avx2"
+#endif
+
+#ifdef AOS
+#define GATHER gather_aos
+#define LAYOUT_STRING "AoS"
+#else
+#define GATHER gather_soa
+#define LAYOUT_STRING "SoA"
+#endif
+
+// Extra per-element padding for the AoS layout. Despite the name, the value is
+// counted in doubles, not bytes: it is added to `dims` to form `snbytes`, which
+// is then multiplied by sizeof(double) when allocating and used as the element
+// stride when indexing `a`.
+#if defined(PADDING) && defined(AOS)
+#define PADDING_BYTES 1
+#else
+#define PADDING_BYTES 0
+#endif
+
+// Memory-access tracing helpers.
+// With MEM_TRACER defined, MEM_TRACER_INIT opens the trace file (declaring the
+// local `mem_tracer_fp`), MEM_TRACE appends one "<op>: <address>" line and
+// MEM_TRACER_END closes the file. Without it, all three expand to nothing.
+// The disabled MEM_TRACER_INIT must be function-like as well, otherwise
+// `MEM_TRACER_INIT(stride, N);` would leave a stray `(stride, N);` expression.
+#ifdef MEM_TRACER
+#   define MEM_TRACER_INIT(stride, size)  FILE *mem_tracer_fp = fopen(get_mem_tracer_filename(stride, size), "w");
+#   define MEM_TRACER_END                 fclose(mem_tracer_fp);
+#   define MEM_TRACE(addr, op)            fprintf(mem_tracer_fp, "%c: %p\n", op, (void *)(&(addr)));
+#else
+#   define MEM_TRACER_INIT(stride, size)
+#   define MEM_TRACER_END
+#   define MEM_TRACE(addr, op)
+#endif
+
+// Gather kernels implemented outside this file; GATHER selects one by layout.
+// They are called below as GATHER(a, idx, N, t, cycles): data array, index
+// array, element count, test output buffer (NULL unless TEST) and per-gather
+// cycle buffer (NULL unless MEASURE_GATHER_CYCLES).
+// NOTE(review): how the kernels fill `t` and `cycles` is not visible here — confirm against the assembly.
+extern void gather_aos(double*, int*, int, double*, long int*);
+extern void gather_soa(double*, int*, int, double*, long int*);
+
+// Builds the trace file name "mem_tracer_<stride>_<size>.txt".
+// The result lives in a function-local static buffer, so every call overwrites
+// the string returned by the previous one.
+const char *get_mem_tracer_filename(int stride, int size) {
+    static char name_buf[64];
+    (void) snprintf(name_buf, sizeof(name_buf), "mem_tracer_%d_%d.txt", stride, size);
+    return name_buf;
+}
+
+// Integer base-2 logarithm, rounded down: the number of right shifts needed
+// until x becomes zero, minus one. Returns 0 for both x == 0 and x == 1.
+int log2_uint(unsigned int x) {
+    int exponent = 0;
+    for(x >>= 1; x != 0; x >>= 1) {
+        exponent++;
+    }
+    return exponent;
+}
+
+// Strided gather benchmark driver (MD variant).
+//
+// Parses --stride, --freq (GHz) and --line (cache line size in bytes), then for
+// a growing series of array sizes N: allocates and initializes the data and
+// index arrays, runs 100 warm-up gathers to calibrate the repetition count,
+// times the measured gathers inside the "gather" LIKWID region and prints one
+// CSV row per size. With TEST the gathered values are verified; with
+// MEASURE_GATHER_CYCLES per-gather cycle statistics are printed instead of the
+// timing-derived metrics.
+//
+// Returns EXIT_SUCCESS, or EXIT_FAILURE on bad arguments, on --help, or when
+// the TEST verification fails.
+int main (int argc, char** argv) {
+    LIKWID_MARKER_INIT;
+    LIKWID_MARKER_REGISTER("gather");
+    int stride = 1;
+    int cl_size = 64;
+    int opt = 0;
+    double freq = 2.5;
+    // getopt_long() requires the table to end with an all-zero entry, and
+    // --help takes no argument (matching the plain "h" in the short options)
+    struct option long_opts[] = {
+        {"stride", required_argument,   NULL,   's'},
+        {"freq",   required_argument,   NULL,   'f'},
+        {"line",   required_argument,   NULL,   'l'},
+        {"help",   no_argument,         NULL,   'h'},
+        {NULL,     0,                   NULL,   0}
+    };
+
+    while((opt = getopt_long(argc, argv, "s:f:l:h", long_opts, NULL)) != -1) {
+        switch(opt) {
+            case 's':
+                stride = atoi(optarg);
+                break;
+
+            case 'f':
+                freq = atof(optarg);
+                break;
+
+            case 'l':
+                cl_size = atoi(optarg);
+                break;
+
+            case 'h':
+            case '?':
+            default:
+                printf("Usage: %s [OPTION]...\n", argv[0]);
+                printf("MD variant for gather benchmark.\n\n");
+                printf("Mandatory arguments to long options are also mandatory for short options.\n");
+                printf("\t-s, --stride=NUMBER   stride between two successive elements (default 1).\n");
+                printf("\t-f, --freq=REAL       CPU frequency in GHz (default 2.5).\n");
+                printf("\t-l, --line=NUMBER     cache line size in bytes (default 64).\n");
+                printf("\t-h, --help            display this help message.\n");
+                printf("\n\n");
+                return EXIT_FAILURE;
+        }
+    }
+
+    // A negative stride would produce negative gather indices, and a cache line
+    // smaller than one double makes (cl_size / sizeof(double)) zero below
+    if(stride < 0 || cl_size < (int) sizeof(double)) {
+        fprintf(stderr, "Invalid arguments: stride must be >= 0 and cache line size >= %zu bytes\n", sizeof(double));
+        return EXIT_FAILURE;
+    }
+
+    const int dims = 3;
+    const int snbytes = dims + PADDING_BYTES; // doubles per element (struct), includes padding
+    #ifdef AOS
+    size_t cacheLinesPerGather = MIN(MAX(stride * _VL_ * snbytes / (cl_size / sizeof(double)), 1), _VL_);
+    #else
+    size_t cacheLinesPerGather = MIN(MAX(stride * _VL_ / (cl_size / sizeof(double)), 1), _VL_) * dims;
+    #endif
+    double E, S;
+
+    printf("ISA,Layout,Stride,Dims,Frequency (GHz),Cache Line Size (B),Vector Width (e),Cache Lines/Gather\n");
+    printf("%s,%s,%d,%d,%f,%d,%d,%zu\n\n", ISA_STRING, LAYOUT_STRING, stride, dims, freq, cl_size, _VL_, cacheLinesPerGather);
+    printf("%14s,%14s,%14s,", "N", "Size(kB)", "cut CLs");
+
+#ifndef MEASURE_GATHER_CYCLES
+    // NOTE(review): the second column is labelled "time/LUP(ms)" but the value
+    // printed below is scaled by 1e6 (microseconds) — confirm the intended unit
+    printf("%14s,%14s,%14s,%14s,%14s", "tot. time", "time/LUP(ms)", "cy/it", "cy/gather", "cy/elem");
+#else
+
+    // One column per gathered dimension: only x when ONLY_FIRST_DIMENSION is
+    // set, x/y/z otherwise (must match gathered_dims used for the rows below)
+#ifndef ONLY_FIRST_DIMENSION
+    printf("%27s,%27s,%27s", "min/max/avg cy(x)", "min/max/avg cy(y)", "min/max/avg cy(z)");
+#else
+    printf("%27s", "min/max/avg cy(x)");
+#endif
+
+#endif
+
+    printf("\n");
+    freq = freq * 1e9;
+
+    for(int N = 512; N < 80000000; N = 1.5 * N) {
+        // Currently this only works when the array size (in elements) is multiple of the vector length (no preamble and prelude)
+        if(N % _VL_ != 0) {
+            N += _VL_ - (N % _VL_);
+        }
+
+        MEM_TRACER_INIT(stride, N);
+
+        int N_gathers_per_dim = N / _VL_;
+        int N_alloc = N * 2;
+        int N_cycles_alloc = N_gathers_per_dim * 2;
+        int cut_cl = 0;
+        double* a = (double*) allocate( ARRAY_ALIGNMENT, N_alloc * snbytes * sizeof(double) );
+        int* idx = (int*) allocate( ARRAY_ALIGNMENT, N_alloc * sizeof(int) );
+        int rep;
+        double time;
+
+#ifdef TEST
+        double* t = (double*) allocate( ARRAY_ALIGNMENT, N_alloc * dims * sizeof(double) );
+#else
+        double* t = (double*) NULL;
+#endif
+
+#ifdef MEASURE_GATHER_CYCLES
+        long int* cycles = (long int*) allocate( ARRAY_ALIGNMENT, N_cycles_alloc * dims * sizeof(long int)) ;
+#else
+        long int* cycles = (long int*) NULL;
+#endif
+
+        for(int i = 0; i < N_alloc; ++i) {
+#ifdef AOS
+            a[i * snbytes + 0] = i * dims + 0;
+            a[i * snbytes + 1] = i * dims + 1;
+            a[i * snbytes + 2] = i * dims + 2;
+#else
+            a[N * 0 + i] = N * 0 + i;
+            a[N * 1 + i] = N * 1 + i;
+            a[N * 2 + i] = N * 2 + i;
+#endif
+            // i * stride exceeds INT_MAX for large N, so do the product in 64 bits
+            idx[i] = (int)(((long long) i * stride) % N);
+        }
+
+#ifdef ONLY_FIRST_DIMENSION
+        const int gathered_dims = 1;
+#else
+        const int gathered_dims = dims;
+#endif
+
+#ifdef MEM_TRACER
+        for(int i = 0; i < N; i += _VL_) {
+            for(int j = 0; j < _VL_; j++) {
+                MEM_TRACE(idx[i + j], 'R');
+            }
+
+            for(int d = 0; d < gathered_dims; d++) {
+                for(int j = 0; j < _VL_; j++) {
+#ifdef AOS
+                    MEM_TRACE(a[idx[i + j] * snbytes + d], 'R');
+#else
+                    MEM_TRACE(a[N * d + idx[i + j]], 'R');
+#endif
+                }
+            }
+        }
+#endif
+
+#ifdef AOS
+        // Count elements whose gathered dimensions straddle a cache line boundary
+        const int cl_shift = log2_uint((unsigned int) cl_size);
+        for(int i = 0; i < N; i++) {
+            const int first_cl = (idx[i] * snbytes * sizeof(double)) >> cl_shift;
+            const int last_cl = ((idx[i] * snbytes + gathered_dims - 1) * sizeof(double)) >> cl_shift;
+            if(first_cl != last_cl) {
+                cut_cl++;
+            }
+        }
+#endif
+
+        // Warm-up: 100 gathers, also used to calibrate the repetition count
+        S = getTimeStamp();
+        for(int r = 0; r < 100; ++r) {
+            GATHER(a, idx, N, t, cycles);
+        }
+        E = getTimeStamp();
+
+#ifdef MEASURE_GATHER_CYCLES
+        for(int i = 0; i < N_cycles_alloc; i++) {
+            cycles[i * 3 + 0] = 0;
+            cycles[i * 3 + 1] = 0;
+            cycles[i * 3 + 2] = 0;
+        }
+#endif
+
+        // Target roughly 0.5 s of measured runtime. Guard against a zero
+        // warm-up time (division by zero) and against rep == 0, which would
+        // make every per-iteration metric below divide by zero.
+        const double warmup_time = E - S;
+        rep = (warmup_time > 0.0) ? (int)(100 * (0.5 / warmup_time)) : 1;
+        if(rep < 1) {
+            rep = 1;
+        }
+
+        S = getTimeStamp();
+        LIKWID_MARKER_START("gather");
+        for(int r = 0; r < rep; ++r) {
+            GATHER(a, idx, N, t, cycles);
+        }
+        LIKWID_MARKER_STOP("gather");
+        E = getTimeStamp();
+
+        time = E - S;
+
+#ifdef TEST
+        int test_failed = 0;
+        for(int i = 0; i < N; ++i) {
+            // Same 64-bit index computation as used to fill idx[]
+            const int src = (int)(((long long) i * stride) % N);
+            for(int d = 0; d < dims; ++d) {
+#ifdef AOS
+                if(t[d * N + i] != src * dims + d) {
+#else
+                if(t[d * N + i] != d * N + src) {
+#endif
+                    test_failed = 1;
+                    break;
+                }
+            }
+        }
+
+        if(test_failed) {
+            printf("Test failed!\n");
+            return EXIT_FAILURE;
+        } else {
+            printf("Test passed!\n");
+        }
+#endif
+
+        const double size = N * (dims * sizeof(double) + sizeof(int)) / 1000.0;
+        printf("%14d,%14.2f,%14d,", N, size, cut_cl);
+
+#ifndef MEASURE_GATHER_CYCLES
+        const double time_per_it = time * 1e6 / ((double) N * rep);
+        const double cy_per_it = time * freq * _VL_ / ((double) N * rep);
+        const double cy_per_gather = time * freq * _VL_ / ((double) N * rep * gathered_dims);
+        const double cy_per_elem = time * freq / ((double) N * rep * gathered_dims);
+        printf("%14.10f,%14.10f,%14.6f,%14.6f,%14.6f", time, time_per_it, cy_per_it, cy_per_gather, cy_per_elem);
+#else
+        double cy_min[dims];
+        double cy_max[dims];
+        double cy_avg[dims];
+
+        for(int d = 0; d < dims; d++) {
+            cy_min[d] = 100000.0;
+            cy_max[d] = 0.0;
+            cy_avg[d] = 0.0;
+        }
+
+        for(int i = 0; i < N_gathers_per_dim; ++i) {
+            for(int d = 0; d < gathered_dims; d++) {
+                const double cy_d = (double)(cycles[i * 3 + d]);
+                cy_min[d] = MIN(cy_min[d], cy_d);
+                cy_max[d] = MAX(cy_max[d], cy_d);
+                cy_avg[d] += cy_d;
+            }
+        }
+
+        for(int d = 0; d < gathered_dims; d++) {
+            char tmp_str[64];
+            cy_avg[d] /= (double) N_gathers_per_dim;
+            snprintf(tmp_str, sizeof tmp_str, "%4.4f/%4.4f/%4.4f", cy_min[d], cy_max[d], cy_avg[d]);
+            printf("%27s%c", tmp_str, (d < gathered_dims - 1) ? ',' : ' ');
+        }
+#endif
+
+        printf("\n");
+        free(a);
+        free(idx);
+
+#ifdef TEST
+        free(t);
+#endif
+
+#ifdef MEASURE_GATHER_CYCLES
+        free(cycles);
+#endif
+
+        MEM_TRACER_END;
+    }
+
+    LIKWID_MARKER_CLOSE;
+    return EXIT_SUCCESS;
+}
--- a/util/gather-bench/src/main.c
+++ b/util/gather-bench/src/main.c
@@ -0,0 +1,166 @@
+/*
+ * =======================================================================================
+ *
+ *      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *      Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
+ *
+ *      Permission is hereby granted, free of charge, to any person obtaining a copy
+ *      of this software and associated documentation files (the "Software"), to deal
+ *      in the Software without restriction, including without limitation the rights
+ *      to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *      copies of the Software, and to permit persons to whom the Software is
+ *      furnished to do so, subject to the following conditions:
+ *
+ *      The above copyright notice and this permission notice shall be included in all
+ *      copies or substantial portions of the Software.
+ *
+ *      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *      IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *      FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *      AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *      LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *      OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *      SOFTWARE.
+ *
+ * =======================================================================================
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <limits.h>
+#include <float.h>
+//---
+#include <likwid-marker.h>
+//---
+#include <timing.h>
+#include <allocate.h>
+
+#if !defined(ISA_avx2) && !defined (ISA_avx512)
+#error "Invalid ISA macro, possible values are: avx2 and avx512"
+#endif
+
+#define HLINE "----------------------------------------------------------------------------\n"
+
+#ifndef MIN
+#define MIN(x,y) ((x)<(y)?(x):(y))
+#endif
+#ifndef MAX
+#define MAX(x,y) ((x)>(y)?(x):(y))
+#endif
+#ifndef ABS
+#define ABS(a) ((a) >= 0 ? (a) : -(a))
+#endif
+
+#define ARRAY_ALIGNMENT  64
+#define SIZE  20000
+
+#ifdef ISA_avx512
+#define _VL_  8
+#define ISA_STRING "avx512"
+#else
+#define _VL_  4
+#define ISA_STRING "avx2"
+#endif
+
+// Gather kernel implemented outside this file. It is called below as
+// gather(a, idx, N[, t]): data array, index array, element count and, in TEST
+// builds only, an output buffer that is checked after the measurement.
+// NOTE(review): the exact layout written to `t` is not visible here — confirm against the kernel.
+#ifdef TEST
+extern void gather(double*, int*, int, double*);
+#else
+extern void gather(double*, int*, int);
+#endif
+
+// Strided gather benchmark driver.
+//
+// Usage: <stride> <freq (GHz)> [cache line size (B), default 64].
+// For a growing series of array sizes N it fills the data and index arrays,
+// runs 100 warm-up gathers to calibrate the repetition count, times the
+// measured gathers inside the "gather" LIKWID region and prints one CSV row.
+// In TEST builds the gathered values are verified after each measurement.
+//
+// Returns EXIT_SUCCESS, -1 on missing or invalid arguments, or EXIT_FAILURE
+// when the TEST verification fails.
+int main (int argc, char** argv) {
+    LIKWID_MARKER_INIT;
+    LIKWID_MARKER_REGISTER("gather");
+
+    if (argc < 3) {
+        printf("Please provide stride and frequency\n");
+        printf("%s <stride> <freq (GHz)> [cache line size (B)]\n", argv[0]);
+        return -1;
+    }
+
+    int stride = atoi(argv[1]);
+    double freq = atof(argv[2]);
+    int cl_size = (argc == 3) ? 64 : atoi(argv[3]);
+
+    // A negative stride would produce negative gather indices, and a cache line
+    // smaller than one double makes (cl_size / sizeof(double)) zero below
+    if(stride < 0 || cl_size < (int) sizeof(double)) {
+        printf("Invalid arguments: stride must be >= 0 and cache line size >= %zu bytes\n", sizeof(double));
+        return -1;
+    }
+
+    size_t cacheLinesPerGather = MIN(MAX(stride * _VL_ / (cl_size / sizeof(double)), 1), _VL_);
+    double E, S;
+
+    printf("ISA,Stride (elems),Frequency (GHz),Cache Line Size (B),Vector Width (elems),Cache Lines/Gather\n");
+    printf("%s,%d,%f,%d,%d,%zu\n\n", ISA_STRING, stride, freq, cl_size, _VL_, cacheLinesPerGather);
+    // NOTE(review): "time/LUP(ms)" is printed from a value scaled by 1e6
+    // (microseconds) below — confirm the intended unit
+    printf("%14s,%14s,%14s,%14s,%14s,%14s\n", "N", "Size(kB)", "tot. time", "time/LUP(ms)", "cy/gather", "cy/elem");
+
+    freq = freq * 1e9;
+    for(int N = 1024; N < 400000; N = 1.5 * N) {
+        int N_alloc = N * 2;
+        double* a = (double*) allocate( ARRAY_ALIGNMENT, N_alloc * sizeof(double) );
+        int* idx = (int*) allocate( ARRAY_ALIGNMENT, N_alloc * sizeof(int) );
+        int rep;
+        double time;
+
+#ifdef TEST
+        double* t = (double*) allocate( ARRAY_ALIGNMENT, N_alloc * sizeof(double) );
+#endif
+
+        for(int i = 0; i < N_alloc; ++i) {
+            a[i] = i;
+            // i * stride can exceed INT_MAX for large strides, so use 64 bits
+            idx[i] = (int)(((long long) i * stride) % N);
+        }
+
+        // Warm-up: 100 gathers, also used to calibrate the repetition count
+        S = getTimeStamp();
+        for(int r = 0; r < 100; ++r) {
+#ifdef TEST
+            gather(a, idx, N, t);
+#else
+            gather(a, idx, N);
+#endif
+        }
+        E = getTimeStamp();
+
+        // Target roughly 0.5 s of measured runtime. Guard against a zero
+        // warm-up time (division by zero) and against rep == 0, which would
+        // make every per-iteration metric below divide by zero.
+        const double warmup_time = E - S;
+        rep = (warmup_time > 0.0) ? (int)(100 * (0.5 / warmup_time)) : 1;
+        if(rep < 1) {
+            rep = 1;
+        }
+
+        S = getTimeStamp();
+        LIKWID_MARKER_START("gather");
+        for(int r = 0; r < rep; ++r) {
+#ifdef TEST
+            gather(a, idx, N, t);
+#else
+            gather(a, idx, N);
+#endif
+        }
+        LIKWID_MARKER_STOP("gather");
+        E = getTimeStamp();
+
+        time = E - S;
+
+#ifdef TEST
+        int test_failed = 0;
+        for(int i = 0; i < N; ++i) {
+            // Same 64-bit index computation as used to fill idx[]
+            if(t[i] != (double)(((long long) i * stride) % N)) {
+                test_failed = 1;
+                break;
+            }
+        }
+
+        if(test_failed) {
+            printf("Test failed!\n");
+            return EXIT_FAILURE;
+        } else {
+            printf("Test passed!\n");
+        }
+#endif
+
+        const double size = N * (sizeof(double) + sizeof(int)) / 1000.0;
+        const double time_per_it = time * 1e6 / ((double) N * rep);
+        const double cy_per_gather = time * freq * _VL_ / ((double) N * rep);
+        const double cy_per_elem = time * freq / ((double) N * rep);
+        printf("%14d,%14.2f,%14.10f,%14.10f,%14.6f,%14.6f\n", N, size, time, time_per_it, cy_per_gather, cy_per_elem);
+        free(a);
+        free(idx);
+#ifdef TEST
+        free(t);
+#endif
+    }
+
+    LIKWID_MARKER_CLOSE;
+    return EXIT_SUCCESS;
+}
--- a/util/gather-bench/src/timing.c
+++ b/util/gather-bench/src/timing.c
@@ -0,0 +1,47 @@
+/*
+ * =======================================================================================
+ *
+ *      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *      Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
+ *
+ *      Permission is hereby granted, free of charge, to any person obtaining a copy
+ *      of this software and associated documentation files (the "Software"), to deal
+ *      in the Software without restriction, including without limitation the rights
+ *      to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *      copies of the Software, and to permit persons to whom the Software is
+ *      furnished to do so, subject to the following conditions:
+ *
+ *      The above copyright notice and this permission notice shall be included in all
+ *      copies or substantial portions of the Software.
+ *
+ *      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *      IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *      FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *      AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *      LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *      OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *      SOFTWARE.
+ *
+ * =======================================================================================
+ */
+#include <stdlib.h>
+#include <time.h>
+
+// Reads CLOCK_MONOTONIC and returns the reading in seconds as a double
+// (whole seconds plus the nanosecond part scaled by 1e-9).
+double getTimeStamp()
+{
+    struct timespec now;
+    clock_gettime(CLOCK_MONOTONIC, &now);
+
+    const double whole_seconds = (double)now.tv_sec;
+    const double nanoseconds = (double)now.tv_nsec;
+    return whole_seconds + nanoseconds * 1.e-9;
+}
+
+// Queries the resolution of CLOCK_MONOTONIC via clock_getres() and returns it
+// in seconds as a double.
+double getTimeResolution()
+{
+    struct timespec res;
+    clock_getres(CLOCK_MONOTONIC, &res);
+
+    const double whole_seconds = (double)res.tv_sec;
+    const double nanoseconds = (double)res.tv_nsec;
+    return whole_seconds + nanoseconds * 1.e-9;
+}
+
+// Alias of getTimeStamp() under a trailing-underscore name; it simply forwards
+// the call and returns the same value.
+double getTimeStamp_()
+{
+    const double now = getTimeStamp();
+    return now;
+}
--- a/util/preds.py
+++ b/util/preds.py
@@ -0,0 +1,28 @@
+import sys
+import re
+
+if len(sys.argv) != 6:
+    print("Usage: python preds.py <iaca> <mca> <osaca> <uica> <div_factor>")
+    sys.exit(1)
+
+iaca_pred = float(sys.argv[1])
+mca_pred = float(sys.argv[2])
+osaca_pred = float(sys.argv[3])
+uica_pred = float(sys.argv[4])
+div_factor = float(sys.argv[5])
+preds = [x / div_factor for x in [iaca_pred, mca_pred, osaca_pred, uica_pred]]
+
+start = -4.0
+end = 36.0
+npoints = 50
+offset = (end - start) / (npoints - 1)
+i = 0
+for pred in preds:
+    print(f"@target G0.S{i+6}")
+    print(f"@type xy")
+    for j in range(npoints):
+        pos = start + offset * j
+        print("{:.6f} {}".format(pos, pred))
+
+    print("&")
+    i += 1
--- a/util/string_to_agr.py
+++ b/util/string_to_agr.py
@@ -0,0 +1,34 @@
+import sys
+import re
+
+# Convert a text file of measurements into "bar" data sets: every input line
+# becomes one set ("@target G0.S<line index>"), and every decimal number found
+# on that line becomes one bar, divided by <div_factor>.
+# NOTE(review): output looks like Grace (.agr) data-set syntax -- confirm.
+#
+# Diagnostics are written to stderr so that a redirected stdout only ever
+# contains plot data; every error path exits with status 1.
+if len(sys.argv) != 3:
+    print("Usage: python string_to_agr.py <input_filename> <div_factor>", file=sys.stderr)
+    sys.exit(1)
+
+input_filename = sys.argv[1]
+
+try:
+    div_factor = float(sys.argv[2])
+except ValueError as err:
+    print(f"string_to_agr.py: <div_factor> must be numeric ({err})", file=sys.stderr)
+    sys.exit(1)
+
+# A zero divisor would otherwise abort with a ZeroDivisionError traceback.
+if div_factor == 0.0:
+    print("string_to_agr.py: <div_factor> must be non-zero", file=sys.stderr)
+    sys.exit(1)
+
+result_list = []
+
+with open(input_filename, 'r') as file:
+    for line in file:
+        # Only tokens of the form <digits>.<digits> are picked up; plain
+        # integers, signs and exponents are not matched by this pattern.
+        numbers = re.findall(r'\d+\.\d+', line)
+        result_list.append([float(number) / div_factor for number in numbers])
+
+# x position of a bar: set i is shifted by bar_offset within a group, and the
+# j-th value of every set falls into the j-th group, group_offset apart.
+start = -2.5
+bar_offset = 1.0
+group_offset = 8.0
+
+for i, group in enumerate(result_list):
+    print(f"@target G0.S{i}")
+    print("@type bar")
+
+    for j, meas in enumerate(group):
+        pos = start + i * bar_offset + j * group_offset
+        print(f"{pos} {meas}")
+
+    # "&" terminates the current data set.
+    print("&")
Author	SHA1	Message	Date
JairoBuitrago	0094c3c4e1	Update neighbor.c	2024-04-15 18:12:27 +02:00
JairoBuitrago	a13a0f3bae	Final MPI version	2024-04-15 16:53:25 +02:00
rafaelravedutti	a6a269703d	Merge pull request #7 from RRZE-HPC/mucosim23 Mucosim23	2024-01-17 15:14:08 +01:00
TejeshPala	7ee250161a	omp_get_max_threads instead of omp_get_num_threads for gcc compiler adaption Signed-off-by: TejeshPala <tejesh.pala@fau.de>	2024-01-13 15:09:03 +01:00
TejeshPala	c73efea786	include openmp in ICC Signed-off-by: TejeshPala <tejesh.pala@fau.de>	2024-01-11 17:16:17 +01:00
TejeshPala	4cfa664533	schedule options for force kernels and to print in main fn Signed-off-by: TejeshPala <tejesh.pala@fau.de>	2024-01-11 17:09:18 +01:00
Rafael Ravedutti	1837403326	Merge branch 'master' of github.com:RRZE-HPC/MD-Bench	2023-12-13 10:52:55 +01:00
Rafael Ravedutti	02629612a9	Fix explicit types for CUDA and provide option to write initial state of system Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-12-13 10:52:47 +01:00
TEJESH PALA	ce00aa0042	Merge pull request #6 from RRZE-HPC/mucosim23 omp print threads	2023-11-21 17:11:18 +01:00
TejeshPala	c4e5e87265	omp print threads	2023-11-21 15:31:27 +01:00
Rafael Ravedutti	da3b1dd53f	Add extended parameter option --param Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-11-21 15:27:11 +01:00
Rafael Ravedutti	2f13291817	Change function get_num_threads to get_cuda_num_threads Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-11-21 14:40:19 +01:00
Rafael Ravedutti	a460fffa19	Fix PBC case Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-10-10 12:53:43 +02:00
Jan Eitzinger	19209bdcce	Cleanup and move gather-bench to util folder	2023-08-15 15:21:21 +02:00
Rafael Ravedutti	151f0c0e6f	Add extendend param option Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-05-29 02:27:32 +02:00
Rafael Ravedutti	72f486f9bf	Merge branch 'master' of github.com:RRZE-HPC/MD-Bench	2023-04-09 03:44:53 +02:00
Rafael Ravedutti	8253b31ee0	Include masked out interactions from remainder in atoms_outside_cutoff Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-04-09 03:44:21 +02:00
Rafael Ravedutti	e206c3566d	Merge branch 'master' of github.com:RRZE-HPC/MD-Bench	2023-04-09 01:23:45 +02:00
Rafael Ravedutti	7ff1673399	Update config.mk with SORT_ATOMS Signed-off-by: Rafael Ravedutti <rafael.r.ravedutti@fau.de>	2023-04-09 01:23:39 +02:00
Rafael Ravedutti	b6982d56f5	Fix atom sorting Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-04-09 01:19:12 +02:00
Rafael Ravedutti	1ad981a059	Add static analysis for gromacs-avx2-dp on Zen3 Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-04-09 00:07:04 +02:00
Rafael Ravedutti	c438fc6832	Fix GROMACS AVX2 code Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-04-07 21:54:07 +02:00
Rafael Ravedutti	17e239ed6d	Add uiCA reference to its analyses Signed-off-by: Rafael Ravedutti <rafael.r.ravedutti@fau.de>	2023-04-05 23:58:52 +02:00
Rafael Ravedutti	d151b9b3e4	Update scripts with division factor Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-04-05 23:56:35 +02:00
Rafael Ravedutti	98257b746c	Add scripts to properly generate agr data Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-04-05 23:19:48 +02:00
Rafael Ravedutti	a101f8588a	Add analyses with llvm-mca Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-04-05 22:11:55 +02:00
Rafael Ravedutti	c14a6b2186	Add outputs for uiCA Signed-off-by: Rafael Ravedutti <rafael.r.ravedutti@fau.de>	2023-04-05 19:51:09 +02:00
Rafael Ravedutti	300776f512	Add outputs for new analyses Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-04-05 19:48:04 +02:00
Rafael Ravedutti	4e5fe27c0f	Add object files for new static analyses Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-04-05 19:46:05 +02:00
Rafael Ravedutti	989bec2c7d	Add first analyses with GROMACS changes Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-04-05 02:44:50 +02:00
Rafael Ravedutti	2971ddcc63	Separate log by hostname and allow to set prefetchers to be used Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-04-04 21:56:03 +02:00
Rafael Ravedutti	5341938b60	Increase cutoff for Argon case Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-04-03 15:06:32 +02:00
Rafael Ravedutti	039de0be99	Fix stubbed versions and debug messages Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-03-30 03:49:57 +02:00
Rafael Ravedutti	43259eb3cf	Adjust neighbor lists layout to keep neighbor ids contiguous in memory Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-03-30 01:57:26 +02:00
Rafael Ravedutti	3eb7170a65	Adapt stubbed version for new neighbor lists in GROMACS Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-03-29 21:54:33 +02:00
Rafael Ravedutti	59145644e3	Last changes to 2xnn kernels Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-03-28 23:34:07 +02:00
Rafael Ravedutti	4a460b2c88	Adjust input files indent in output Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-03-28 23:04:11 +02:00
Rafael Ravedutti	b15aa2f461	Optimize 4xn kernels Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-03-28 23:00:21 +02:00
Rafael Ravedutti	5c000444a4	Pre-compute masks for 4xn kernels Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-03-28 22:30:30 +02:00
Rafael Ravedutti	04ade6bcec	Pre-compute masks for 2xnn kernel with full neighbor-lists Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-03-28 19:33:26 +02:00
Rafael Ravedutti	85f1484449	Specialize force kernel when there are no masks to be checked Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-03-28 18:04:18 +02:00
Rafael Ravedutti	965fda3879	Pre-compute masks in the same way as in the master branch Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-03-28 17:32:42 +02:00
Rafael Ravedutti	a86d214c73	Add working version with old masking Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-03-28 02:19:46 +02:00
Rafael Ravedutti	d138f975f6	Add diagonal checks Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-03-23 02:17:27 +01:00
Rafael Ravedutti	296a4c4e01	Set interaction masks as gromacs does Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-03-23 00:58:25 +01:00
Rafael Ravedutti	f5fd3e265a	Merge branch 'master' of github.com:RRZE-HPC/MD-Bench	2023-03-22 13:50:51 +01:00
Rafael Ravedutti	1fbf9dbdac	Update skin for argon case Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-03-22 13:50:42 +01:00
JanLJL	89e1b9a9b6	Merge branch 'master' of github.com:RRZE-HPC/MD-Bench	2023-02-14 14:00:29 +01:00
JanLJL	4e99f7a623	fixed wrong markers and added OSACA output for ICX	2023-02-14 13:52:59 +01:00
Rafael Ravedutti	4607202752	fix markers for gromacs-icx-avx512-sp Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-02-13 16:55:39 +01:00
JanLJL	301274c9b6	Merge branch 'master' of github.com:RRZE-HPC/MD-Bench	2023-02-13 14:15:17 +01:00
JanLJL	95d63334fa	added asm files and analysis output	2023-02-13 14:15:08 +01:00
JanLJL	d0277765c3	changed double constants to MD_FLOAT constants	2023-02-13 14:13:53 +01:00
Rafael Ravedutti	5814a86125	Small fixes Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-02-12 01:44:48 +01:00
JanLJL	98583cdade	Merge branch 'master' of github.com:RRZE-HPC/MD-Bench	2023-02-09 17:33:42 +01:00
JanLJL	cb5598bc91	added static analysis and likwid files	2023-02-09 17:33:22 +01:00
JanLJL	3b076cdb49	changed double consts to MD_FLOAT	2023-02-09 17:33:07 +01:00
Rafael Ravedutti	122a23e2b8	Fix compilation error when not using explicit SIMD version Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-02-07 23:53:32 +01:00
Rafael Ravedutti	32e004944f	Fix flags parsing for script Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-01-23 22:54:44 +01:00
Rafael Ravedutti	6126d74aa9	Change latency and CFG script to use binary as input Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-01-23 22:50:37 +01:00
Rafael Ravedutti	016f07dcaa	Merge branch 'master' of github.com:RRZE-HPC/MD-Bench	2023-01-23 22:13:42 +01:00
Rafael Ravedutti	90f30d26a3	Add lammps-avx512 kernels without correction instructions Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-01-23 22:13:33 +01:00
rafaelravedutti	01cc05a5d6	Merge pull request #5 from scarboot/master Move likwid marker calls into OpenMP parallel region	2023-01-22 17:20:02 +01:00
Yannick Paschke	c61cf9a0ac	Move likwid marker calls into OpenMP parallel region	2023-01-22 15:33:05 +01:00
Rafael Ravedutti	d545ca65d4	Add -xHost option for AVX2 Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-01-18 16:39:19 +01:00
Rafael Ravedutti	5833f00894	Change ICX flags based on ISA Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-01-16 23:13:40 +01:00
Rafael Ravedutti	8aad7e87a0	Use ISA for GCC flags and change binary and build directory names Signed-off-by: Rafael Ravedutti <rafael.r.ravedutti@fau.de>	2023-01-16 23:05:21 +01:00
Rafael Ravedutti	ffad9d40f3	Use ICC compiler options based on ISA Signed-off-by: Rafael Ravedutti <rafael.r.ravedutti@fau.de>	2023-01-16 22:45:24 +01:00
Rafael Ravedutti	99da76d59c	Add flags with -march=core-avx2 for Milan Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-01-11 15:30:26 +01:00
Rafael Ravedutti	cfe888c132	Add analysis files from gromacs-avx512-dp with ICX compiler Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-01-03 16:14:28 +01:00
Rafael Ravedutti	c7b136f629	Fix build directories in .gitignore Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-01-03 16:13:26 +01:00
Rafael Ravedutti	07f2f74561	Adjust force_iters stats for 4xN kernel Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-01-02 23:57:51 +01:00
Rafael Ravedutti	fd368609e8	Remove binaries on cleanall Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-01-02 19:15:42 +01:00
Rafael Ravedutti	db5f8cf1c6	Update .gitignore Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-01-02 19:12:25 +01:00
Rafael Ravedutti	f467d10ed3	Add cleanall command for all TAG and OPT_SCHEME variants Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-01-02 19:11:05 +01:00
Rafael Ravedutti	fe86c948a8	Adjust time and likwid measurements on 4xN kernels Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-01-02 14:19:59 +01:00
Rafael Ravedutti	ae1cfa2800	Include static_analysis directory Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2023-01-02 14:14:16 +01:00
Rafael Ravedutti	e5c233e072	Update script Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2022-12-21 18:04:18 +01:00
Rafael Ravedutti	8d5e10f635	Fix compilation for gromacs-avx512-sp Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2022-12-21 16:19:00 +01:00
Rafael Ravedutti	56ff0d19af	Run script with different prefetcher settings and provide better output Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2022-12-20 18:51:54 +01:00