4 Commits

Author SHA1 Message Date
Rafael Ravedutti
924914e4f0 First changes in the supercluster code
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2023-05-25 01:10:37 +02:00
Andropov Arsenii
055a009dbd Neighbor list preparation 2023-05-23 16:25:00 +02:00
Andropov Arsenii
182c065fe2 Neighbor list preparation 2023-05-09 00:44:37 +02:00
Andropov Arsenii
ee3f6de050 Building of super clusters complete, force computation kernel WIP 2023-04-11 02:55:30 +02:00
80 changed files with 3851 additions and 3363 deletions

23
.gitignore vendored
View File

@@ -51,17 +51,14 @@ Module.symvers
Mkfile.old
dkms.conf
# Logs
*.log
# TODO list
todo.txt
# Build directories and executables
#GCC-*/ GCC/
#ICC-*/ ICC/
#ICX-*/ ICX/
#CLANG-*/ CLANG/
#NVCC-*/ NVCC/
build-*/ MDBench-GCC*
MDBench-* MDBench-ICC*
MDBench-ICX*
MDBench-CLANG*
MDBench-NVCC*

View File

@@ -1,7 +1,6 @@
#CONFIGURE BUILD SYSTEM
IDENTIFIER = $(OPT_SCHEME)-$(TAG)-$(ISA)-$(DATA_TYPE) TARGET = MDBench-$(TAG)-$(OPT_SCHEME)
TARGET = MDBench-$(IDENTIFIER) BUILD_DIR = ./$(TAG)-$(OPT_SCHEME)
BUILD_DIR = ./build-$(IDENTIFIER)
SRC_DIR = ./$(OPT_SCHEME)
ASM_DIR = ./asm
COMMON_DIR = ./common
@@ -30,10 +29,6 @@ ifneq ($(ASM_SYNTAX), ATT)
ASFLAGS += -masm=intel
endif
ifeq ($(strip $(SORT_ATOMS)),true)
DEFINES += -DSORT_ATOMS
endif
ifeq ($(strip $(EXPLICIT_TYPES)),true)
DEFINES += -DEXPLICIT_TYPES
endif
@@ -102,6 +97,10 @@ ifeq ($(strip $(USE_SIMD_KERNEL)),true)
DEFINES += -DUSE_SIMD_KERNEL DEFINES += -DUSE_SIMD_KERNEL
endif endif
ifeq ($(strip $(USE_SUPER_CLUSTERS)),true)
DEFINES += -DUSE_SUPER_CLUSTERS
endif
VPATH = $(SRC_DIR) $(ASM_DIR) $(CUDA_DIR)
ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s))
@@ -156,13 +155,6 @@ $(BUILD_DIR)/%.o: %.s
clean: clean:
$(info ===> CLEAN) $(info ===> CLEAN)
@rm -rf $(BUILD_DIR) @rm -rf $(BUILD_DIR)
@rm -rf $(TARGET)*
@rm -f tags
cleanall:
$(info ===> CLEAN)
@rm -rf build-*
@rm -rf MDBench-*
@rm -f tags
distclean: clean

0
asm/.gitkeep Normal file
View File

View File

@@ -0,0 +1,626 @@
# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.0.5.281 Build 20190815";
# mark_description "-I/mnt/opt/likwid-5.2-dev/include -I./src/includes -S -D_GNU_SOURCE -DLIKWID_PERFMON -DAOS -DPRECISION=2 -DN";
# mark_description "EIGHBORS_LOOP_RUNS=1 -DVECTOR_WIDTH=8 -DALIGNMENT=64 -restrict -Ofast -xCORE-AVX512 -qopt-zmm-usage=high -o ";
# mark_description "ICC/force.s";
.file "force.c"
.text
..TXTST0:
.L_2__routine_start_computeForce_0:
# -- Begin computeForce
.text
# mark_begin;
.align 16,0x90
.globl computeForce
# --- computeForce(Parameter *, Atom *, Neighbor *, int, int, int)
# ----------------------------------------------------------------------------
# computeForce(Parameter *, Atom *, Neighbor *, int, int, int)
# ICC 19.0.5-generated AVX-512 force kernel (see mark_description above).
# SysV AMD64 ABI: rdi = Parameter*, rsi = Atom*, rdx = Neighbor*; returns the
# getTimeStamp() delta (elapsed seconds) in xmm0.
# NOTE(review): machine-generated code — the arithmetic body of the SIMD
# kernel is currently commented out (see ..B1.24 / ..B1.32 / ..B1.39 below),
# consistent with the "force computation kernel WIP" commit. As-is, the
# function zeroes the force arrays, walks all neighbor lists performing
# masked gathers, and accumulates only zeros.
computeForce:
# parameter 1: %rdi
# parameter 2: %rsi
# parameter 3: %rdx
# parameter 4: %ecx
# parameter 5: %r8d
# parameter 6: %r9d
..B1.1: # Preds ..B1.0
# Execution count [1.00e+00]
.cfi_startproc
..___tag_value_computeForce.1:
..L2:
#121.112
# Prologue: frame pointer, 64-byte stack alignment for ZMM spills, save all
# callee-saved GPRs, 88 bytes of spill space.
pushq %rbp #121.112
.cfi_def_cfa_offset 16
movq %rsp, %rbp #121.112
.cfi_def_cfa 6, 16
.cfi_offset 6, -16
andq $-64, %rsp #121.112
pushq %r12 #121.112
pushq %r13 #121.112
pushq %r14 #121.112
pushq %r15 #121.112
pushq %rbx #121.112
subq $88, %rsp #121.112
xorl %eax, %eax #124.16
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22
# Stash the pointer arguments in callee-saved regs across the call:
# r15 = Neighbor*, r12 = Atom*, rbx = Parameter*.
movq %rdx, %r15 #121.112
movq %rsi, %r12 #121.112
movq %rdi, %rbx #121.112
..___tag_value_computeForce.11:
# getTimeStamp()
call getTimeStamp #124.16
..___tag_value_computeForce.12:
# LOE rbx r12 r15 xmm0
..B1.51: # Preds ..B1.1
# Execution count [1.00e+00]
vmovsd %xmm0, 24(%rsp) #124.16[spill]   # spill start timestamp
# LOE rbx r12 r15
..B1.2: # Preds ..B1.51
# Execution count [1.00e+00]
# Load per-run invariants: r13d = count at Atom+4 (presumably Nlocal —
# confirm Atom layout); r9/r14/r8 = the three pointers at Atom+64/72/80
# (presumably fx/fy/fz arrays); xmm2/xmm1/xmm0 = doubles at Parameter+72,
# +8, +0 (presumably cutforce, sigma6, epsilon — confirm).
movl 4(%r12), %r13d #125.18
movq 64(%r12), %r9 #127.20
movq 72(%r12), %r14 #127.45
movq 80(%r12), %r8 #127.70
vmovsd 72(%rbx), %xmm2 #129.27
vmovsd 8(%rbx), %xmm1 #130.23
vmovsd (%rbx), %xmm0 #131.24
testl %r13d, %r13d #134.24
jle ..B1.43 # Prob 50% #134.24          # no atoms: still open/close marker region
# LOE r8 r9 r12 r14 r15 r13d xmm0 xmm1 xmm2
..B1.3: # Preds ..B1.2
# Execution count [1.00e+00]
# Zeroing loop setup: clear the three force arrays, two 8-byte elements per
# iteration (edx = r13d/2 pair count); odd remainder handled in ..B1.8.
xorl %ebx, %ebx #134.5
movl %r13d, %edx #134.5
xorl %ecx, %ecx #134.5
movl $1, %esi #134.5
xorl %eax, %eax #135.17
shrl $1, %edx #134.5
je ..B1.7 # Prob 9% #134.5
# LOE rax rdx rcx rbx r8 r9 r12 r14 r15 esi r13d xmm0 xmm1 xmm2
..B1.5: # Preds ..B1.3 ..B1.5
# Execution count [2.50e+00]
movq %rax, (%rcx,%r9) #135.9
incq %rbx #134.5
movq %rax, (%rcx,%r14) #136.9
movq %rax, (%rcx,%r8) #137.9
movq %rax, 8(%rcx,%r9) #135.9
movq %rax, 8(%rcx,%r14) #136.9
movq %rax, 8(%rcx,%r8) #137.9
addq $16, %rcx #134.5
cmpq %rdx, %rbx #134.5
jb ..B1.5 # Prob 63% #134.5
# LOE rax rdx rcx rbx r8 r9 r12 r14 r15 r13d xmm0 xmm1 xmm2
..B1.6: # Preds ..B1.5
# Execution count [9.00e-01]
lea 1(%rbx,%rbx), %esi #135.9           # esi = 2*pairs + 1 = next index
# LOE rax r8 r9 r12 r14 r15 esi r13d xmm0 xmm1 xmm2
..B1.7: # Preds ..B1.3 ..B1.6
# Execution count [1.00e+00]
lea -1(%rsi), %edx #134.5
cmpl %r13d, %edx #134.5
jae ..B1.9 # Prob 9% #134.5
# LOE rax r8 r9 r12 r14 r15 esi r13d xmm0 xmm1 xmm2
..B1.8: # Preds ..B1.7
# Execution count [9.00e-01]
# Zero the final (odd) element of each force array.
movslq %esi, %rsi #134.5
movq %rax, -8(%r9,%rsi,8) #135.9
movq %rax, -8(%r14,%rsi,8) #136.9
movq %rax, -8(%r8,%rsi,8) #137.9
# LOE r8 r9 r12 r14 r15 r13d xmm0 xmm1 xmm2
..B1.9: # Preds ..B1.7 ..B1.8
# Execution count [5.00e-01]
# Open the LIKWID "force" marker region; live xmm/GPR values are spilled
# around the call since all are caller-saved.
movl $.L_2__STRING.0, %edi #141.5
movq %r8, 32(%rsp) #141.5[spill]
movq %r9, 80(%rsp) #141.5[spill]
vmovsd %xmm2, (%rsp) #141.5[spill]
vmovsd %xmm1, 8(%rsp) #141.5[spill]
vmovsd %xmm0, 16(%rsp) #141.5[spill]
..___tag_value_computeForce.18:
# likwid_markerStartRegion(const char *)
call likwid_markerStartRegion #141.5
..___tag_value_computeForce.19:
# LOE r12 r14 r15 r13d
..B1.10: # Preds ..B1.9
# Execution count [9.00e-01]
# Hoist loop invariants into ZMM broadcasts:
#   zmm14 = cutoff^2 (xmm2*xmm2), zmm13 = Parameter+8 value,
#   zmm9  = 48.0 * Parameter+0 value (.L_2il0floatpacket.3 is 48.0),
#   zmm5  = eight 0.5 doubles, ymm16 = {8,..}, ymm15 = iota {0..7}.
# Also load neighbor-list bookkeeping: r10 = numneigh array (Neighbor+24),
# rdx = maxneighs*4 (Neighbor+16, scaled to bytes), rcx = neighbors base
# (Neighbor+8), rbx = positions base (Atom+16).
vmovsd 16(%rsp), %xmm0 #[spill]
xorl %esi, %esi #143.15
vmovsd (%rsp), %xmm2 #[spill]
xorl %eax, %eax #143.5
vmulsd %xmm2, %xmm2, %xmm13 #129.45
xorl %edi, %edi #143.5
vmovdqu32 .L_2il0floatpacket.0(%rip), %ymm16 #173.13
vmulsd .L_2il0floatpacket.3(%rip), %xmm0, %xmm0 #197.45
vmovdqu .L_2il0floatpacket.1(%rip), %ymm15 #173.13
vmovups .L_2il0floatpacket.4(%rip), %zmm5 #197.58
vmovsd 8(%rsp), %xmm1 #[spill]
vbroadcastsd %xmm13, %zmm14 #129.25
vbroadcastsd %xmm1, %zmm13 #130.21
vbroadcastsd %xmm0, %zmm9 #197.45
movslq %r13d, %r13 #143.5
movq 24(%r15), %r10 #145.25
movslq 16(%r15), %rdx #144.43
movq 8(%r15), %rcx #144.19
movq 32(%rsp), %r8 #[spill]
movq 16(%r12), %rbx #146.25
shlq $2, %rdx #126.5
movq %r13, 64(%rsp) #143.5[spill]
movq %r10, 72(%rsp) #143.5[spill]
# LOE rax rdx rcx rbx rsi rdi r8 r14 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
..B1.11: # Preds ..B1.41 ..B1.10
# Execution count [5.00e+00]
# Outer per-atom loop (rax = atom index i, rdi = i*24 byte offset into the
# AoS position array). Loads numneigh[i] and position x/y/z, zeroes the
# scalar accumulators.
movq 72(%rsp), %r9 #145.25[spill]
vxorpd %xmm24, %xmm24, %xmm24 #149.22
vmovapd %xmm24, %xmm18 #150.22
movl (%r9,%rax,4), %r10d #145.25
vmovapd %xmm18, %xmm4 #151.22
vmovsd (%rdi,%rbx), %xmm10 #146.25
vmovsd 8(%rdi,%rbx), %xmm6 #147.25
vmovsd 16(%rdi,%rbx), %xmm12 #148.25
testl %r10d, %r10d #173.32
jle ..B1.41 # Prob 50% #173.32
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d xmm4 xmm6 xmm10 xmm12 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
..B1.12: # Preds ..B1.11
# Execution count [4.50e+00]
# Zero the three ZMM force accumulators (fx=zmm8, fy=zmm7, fz=zmm11).
vpxord %zmm8, %zmm8, %zmm8 #149.22
vmovaps %zmm8, %zmm7 #150.22
vmovaps %zmm7, %zmm11 #151.22
cmpl $8, %r10d #173.13
jl ..B1.48 # Prob 10% #173.13          # < 8 neighbors: masked tail only
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.13: # Preds ..B1.12
# Execution count [4.50e+00]
cmpl $1200, %r10d #173.13
jl ..B1.47 # Prob 10% #173.13          # short lists skip the alignment peel
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.14: # Preds ..B1.13
# Execution count [4.50e+00]
# Compute r15 = &neighbors[i*maxneighs] and its 64-byte misalignment to
# decide how many iterations to peel (r11d).
movq %rdx, %r15 #144.43
imulq %rsi, %r15 #144.43
addq %rcx, %r15 #126.5
movq %r15, %r11 #173.13
andq $63, %r11 #173.13
testl $3, %r11d #173.13
je ..B1.16 # Prob 50% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.15: # Preds ..B1.14
# Execution count [2.25e+00]
xorl %r11d, %r11d #173.13              # not 4-byte aligned: no peel possible
jmp ..B1.18 # Prob 100% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.16: # Preds ..B1.14
# Execution count [2.25e+00]
testl %r11d, %r11d #173.13
je ..B1.18 # Prob 50% #173.13          # already 64-byte aligned: no peel
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.17: # Preds ..B1.16
# Execution count [2.50e+01]
# r11d = min(numneigh, (64 - misalign)/4) = peel iteration count.
negl %r11d #173.13
addl $64, %r11d #173.13
shrl $2, %r11d #173.13
cmpl %r11d, %r10d #173.13
cmovl %r10d, %r11d #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.18: # Preds ..B1.15 ..B1.17 ..B1.16
# Execution count [5.00e+00]
# r13d = peel + largest multiple of 8 that fits = end of the unmasked
# main loop; remainder goes to the masked tail (..B1.35).
movl %r10d, %r13d #173.13
subl %r11d, %r13d #173.13
andl $7, %r13d #173.13
negl %r13d #173.13
addl %r10d, %r13d #173.13
cmpl $1, %r11d #173.13
jb ..B1.26 # Prob 50% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.19: # Preds ..B1.18
# Execution count [4.50e+00]
# Peel-loop setup: ymm4 = lane indices, ymm3 = broadcast peel count,
# zmm2/zmm1/zmm0 = broadcast atom-i x/y/z.
vmovdqa %ymm15, %ymm4 #173.13
xorl %r12d, %r12d #173.13
vpbroadcastd %r11d, %ymm3 #173.13
vbroadcastsd %xmm10, %zmm2 #146.23
vbroadcastsd %xmm6, %zmm1 #147.23
vbroadcastsd %xmm12, %zmm0 #148.23
movslq %r11d, %r9 #173.13
movq %r8, 32(%rsp) #173.13[spill]
movq %r14, (%rsp) #173.13[spill]
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.20: # Preds ..B1.24 ..B1.19
# Execution count [2.50e+01]
# Masked load of up to 8 neighbor indices; ymm17 = j*3 (AoS stride).
# NOTE(review): the `vpaddd %ymm16, %ymm4, %ymm4` that advances the lane
# indices is commented out below, so k3 is recomputed from a stale ymm4 on
# every iteration — presumably acceptable only while the kernel is WIP;
# confirm before re-enabling the arithmetic.
vpcmpgtd %ymm4, %ymm3, %k3 #173.13
vmovdqu32 (%r15,%r12,4), %ymm17{%k3}{z} #174.25
kmovw %k3, %r14d #173.13
vpaddd %ymm17, %ymm17, %ymm18 #175.40
vpaddd %ymm18, %ymm17, %ymm17 #175.40
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 ymm17 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
..B1.23: # Preds ..B1.20
# Execution count [1.25e+01]
# Masked gathers of neighbor z/y/x (gathers zero their mask, hence the
# two kmovw copies and the vpxord destination zeroing).
kmovw %k3, %k1 #175.40
kmovw %k3, %k2 #175.40
vpxord %zmm18, %zmm18, %zmm18 #175.40
vpxord %zmm19, %zmm19, %zmm19 #175.40
vpxord %zmm20, %zmm20, %zmm20 #175.40
vgatherdpd 16(%rbx,%ymm17,8), %zmm18{%k1} #175.40
vgatherdpd 8(%rbx,%ymm17,8), %zmm19{%k2} #175.40
vgatherdpd (%rbx,%ymm17,8), %zmm20{%k3} #175.40
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm18 zmm19 zmm20
..B1.24: # Preds ..B1.23
# Execution count [2.50e+01]
# NOTE(review): the entire LJ interaction — dx/dy/dz, r^2, cutoff compare,
# vrcp14pd + Newton-Raphson refinement, and the fmadd accumulation into
# zmm8/zmm7/zmm11 — is commented out here, so the accumulators stay zero.
addq $8, %r12 #173.13
#vpaddd %ymm16, %ymm4, %ymm4 #173.13
#vsubpd %zmm18, %zmm0, %zmm29 #177.40
#vsubpd %zmm19, %zmm1, %zmm27 #176.40
#vsubpd %zmm20, %zmm2, %zmm26 #175.40
#vmulpd %zmm27, %zmm27, %zmm25 #178.53
#vfmadd231pd %zmm26, %zmm26, %zmm25 #178.53
#vfmadd231pd %zmm29, %zmm29, %zmm25 #178.67
#vrcp14pd %zmm25, %zmm24 #195.42
#vcmppd $1, %zmm14, %zmm25, %k2 #194.26
#vfpclasspd $30, %zmm24, %k0 #195.42
#kmovw %k2, %r8d #194.26
#knotw %k0, %k1 #195.42
#vmovaps %zmm25, %zmm17 #195.42
#andl %r8d, %r14d #194.26
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm24, %zmm17 #195.42
#kmovw %r14d, %k3 #198.21
#vmulpd %zmm17, %zmm17, %zmm18 #195.42
#vfmadd213pd %zmm24, %zmm17, %zmm24{%k1} #195.42
#vfmadd213pd %zmm24, %zmm18, %zmm24{%k1} #195.42
#vmulpd %zmm13, %zmm24, %zmm19 #196.42
#vmulpd %zmm9, %zmm24, %zmm21 #197.58
#vmulpd %zmm19, %zmm24, %zmm22 #196.48
#vmulpd %zmm22, %zmm24, %zmm20 #196.54
#vfmsub213pd %zmm5, %zmm22, %zmm24 #197.58
#vmulpd %zmm21, %zmm20, %zmm23 #197.65
#vmulpd %zmm24, %zmm23, %zmm28 #197.71
#vfmadd231pd %zmm26, %zmm28, %zmm8{%k3} #198.21
#vfmadd231pd %zmm27, %zmm28, %zmm7{%k3} #199.21
#vfmadd231pd %zmm29, %zmm28, %zmm11{%k3} #200.21
cmpq %r9, %r12 #173.13
jb ..B1.20 # Prob 82% #173.13
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.25: # Preds ..B1.24
# Execution count [4.50e+00]
movq 32(%rsp), %r8 #[spill]
movq (%rsp), %r14 #[spill]
cmpl %r11d, %r10d #173.13
je ..B1.40 # Prob 10% #173.13          # peel consumed all neighbors
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.26: # Preds ..B1.25 ..B1.18 ..B1.47
# Execution count [2.50e+01]
lea 8(%r11), %r9d #173.13
cmpl %r9d, %r13d #173.13
jl ..B1.34 # Prob 50% #173.13          # fewer than 8 left: go to tail
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.27: # Preds ..B1.26
# Execution count [4.50e+00]
# Main unmasked 8-wide loop setup; many outer-loop registers are spilled
# because this variant also materializes scalar copies of the indices.
movq %rdx, %r12 #144.43
imulq %rsi, %r12 #144.43
vbroadcastsd %xmm10, %zmm1 #146.23
vbroadcastsd %xmm6, %zmm0 #147.23
vbroadcastsd %xmm12, %zmm2 #148.23
movslq %r11d, %r9 #173.13
addq %rcx, %r12 #126.5
movq %rdi, 8(%rsp) #126.5[spill]
movq %rdx, 16(%rsp) #126.5[spill]
movq %rcx, 40(%rsp) #126.5[spill]
movq %rax, 48(%rsp) #126.5[spill]
movq %rsi, 56(%rsp) #126.5[spill]
movq %r8, 32(%rsp) #126.5[spill]
movq %r14, (%rsp) #126.5[spill]
# LOE rbx r9 r12 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.28: # Preds ..B1.32 ..B1.27
# Execution count [2.50e+01]
# Load 8 neighbor indices both as a vector (ymm3, scaled to j*3) and as
# scalars (also scaled via lea j+j*2) — the scalar copies feed only the
# commented-out scalar remainder of the kernel.
vmovdqu (%r12,%r9,4), %ymm3 #174.25
vpaddd %ymm3, %ymm3, %ymm4 #175.40
vpaddd %ymm4, %ymm3, %ymm3 #175.40
movl (%r12,%r9,4), %r14d #174.25
movl 4(%r12,%r9,4), %r8d #174.25
movl 8(%r12,%r9,4), %edi #174.25
movl 12(%r12,%r9,4), %esi #174.25
lea (%r14,%r14,2), %r14d #175.40
movl 16(%r12,%r9,4), %ecx #174.25
lea (%r8,%r8,2), %r8d #175.40
movl 20(%r12,%r9,4), %edx #174.25
lea (%rdi,%rdi,2), %edi #175.40
movl 24(%r12,%r9,4), %eax #174.25
lea (%rsi,%rsi,2), %esi #175.40
movl 28(%r12,%r9,4), %r15d #174.25
lea (%rcx,%rcx,2), %ecx #175.40
lea (%rdx,%rdx,2), %edx #175.40
lea (%rax,%rax,2), %eax #175.40
lea (%r15,%r15,2), %r15d #175.40
# LOE rbx r9 r12 eax edx ecx esi edi r8d r10d r11d r13d r14d r15d xmm6 xmm10 xmm12 ymm3 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.31: # Preds ..B1.28
# Execution count [1.25e+01]
# Full-width gathers (all-ones masks via vpcmpeqb) of neighbor z/y/x.
vpcmpeqb %xmm0, %xmm0, %k1 #175.40
vpcmpeqb %xmm0, %xmm0, %k2 #175.40
vpcmpeqb %xmm0, %xmm0, %k3 #175.40
vpxord %zmm4, %zmm4, %zmm4 #175.40
vpxord %zmm17, %zmm17, %zmm17 #175.40
vpxord %zmm18, %zmm18, %zmm18 #175.40
vgatherdpd 16(%rbx,%ymm3,8), %zmm4{%k1} #175.40
vgatherdpd 8(%rbx,%ymm3,8), %zmm17{%k2} #175.40
vgatherdpd (%rbx,%ymm3,8), %zmm18{%k3} #175.40
# LOE rbx r9 r12 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm17 zmm18
..B1.32: # Preds ..B1.31
# Execution count [2.50e+01]
# NOTE(review): main-loop LJ arithmetic commented out — same WIP state as
# the peel loop above.
addl $8, %r11d #173.13
addq $8, %r9 #173.13
#vsubpd %zmm4, %zmm2, %zmm26 #177.40
#vsubpd %zmm17, %zmm0, %zmm24 #176.40
#vsubpd %zmm18, %zmm1, %zmm23 #175.40
#vmulpd %zmm24, %zmm24, %zmm3 #178.53
#vfmadd231pd %zmm23, %zmm23, %zmm3 #178.53
#vfmadd231pd %zmm26, %zmm26, %zmm3 #178.67
#vrcp14pd %zmm3, %zmm22 #195.42
#vcmppd $1, %zmm14, %zmm3, %k2 #194.26
#vfpclasspd $30, %zmm22, %k0 #195.42
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm22, %zmm3 #195.42
#knotw %k0, %k1 #195.42
#vmulpd %zmm3, %zmm3, %zmm4 #195.42
#vfmadd213pd %zmm22, %zmm3, %zmm22{%k1} #195.42
#vfmadd213pd %zmm22, %zmm4, %zmm22{%k1} #195.42
#vmulpd %zmm13, %zmm22, %zmm17 #196.42
#vmulpd %zmm9, %zmm22, %zmm19 #197.58
#vmulpd %zmm17, %zmm22, %zmm20 #196.48
#vmulpd %zmm20, %zmm22, %zmm18 #196.54
#vfmsub213pd %zmm5, %zmm20, %zmm22 #197.58
#vmulpd %zmm19, %zmm18, %zmm21 #197.65
#vmulpd %zmm22, %zmm21, %zmm25 #197.71
#vfmadd231pd %zmm23, %zmm25, %zmm8{%k2} #198.21
#vfmadd231pd %zmm24, %zmm25, %zmm7{%k2} #199.21
#vfmadd231pd %zmm26, %zmm25, %zmm11{%k2} #200.21
cmpl %r13d, %r11d #173.13
jb ..B1.28 # Prob 82% #173.13
# LOE rbx r9 r12 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.33: # Preds ..B1.32
# Execution count [4.50e+00]
# Restore the outer-loop registers spilled in ..B1.27.
movq 8(%rsp), %rdi #[spill]
movq 16(%rsp), %rdx #[spill]
movq 40(%rsp), %rcx #[spill]
movq 48(%rsp), %rax #[spill]
movq 56(%rsp), %rsi #[spill]
movq 32(%rsp), %r8 #[spill]
movq (%rsp), %r14 #[spill]
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.34: # Preds ..B1.33 ..B1.26 ..B1.48
# Execution count [5.00e+00]
lea 1(%r13), %r9d #173.13
cmpl %r10d, %r9d #173.13
ja ..B1.40 # Prob 50% #173.13          # no remainder: go straight to reduction
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.35: # Preds ..B1.34
# Execution count [2.50e+01]
# Masked tail: r10d = remaining neighbor count, k3 = lane mask, masked
# load of the last <8 indices, scaled to j*3.
imulq %rdx, %rsi #144.43
vbroadcastsd %xmm10, %zmm4 #146.23
subl %r13d, %r10d #173.13
addq %rcx, %rsi #126.5
vpbroadcastd %r10d, %ymm0 #173.13
vpcmpgtd %ymm15, %ymm0, %k3 #173.13
movslq %r13d, %r13 #173.13
kmovw %k3, %r9d #173.13
vmovdqu32 (%rsi,%r13,4), %ymm1{%k3}{z} #174.25
vpaddd %ymm1, %ymm1, %ymm2 #175.40
vpaddd %ymm2, %ymm1, %ymm0 #175.40
# LOE rax rdx rcx rbx rdi r8 r14 r9d xmm6 xmm12 ymm0 ymm15 ymm16 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
..B1.38: # Preds ..B1.35
# Execution count [1.25e+01]
# Masked gathers for the tail lanes.
kmovw %k3, %k1 #175.40
kmovw %k3, %k2 #175.40
vpxord %zmm1, %zmm1, %zmm1 #175.40
vpxord %zmm2, %zmm2, %zmm2 #175.40
vpxord %zmm3, %zmm3, %zmm3 #175.40
vgatherdpd 16(%rbx,%ymm0,8), %zmm1{%k1} #175.40
vgatherdpd 8(%rbx,%ymm0,8), %zmm2{%k2} #175.40
vgatherdpd (%rbx,%ymm0,8), %zmm3{%k3} #175.40
# LOE rax rdx rcx rbx rdi r8 r14 r9d xmm6 xmm12 ymm15 ymm16 zmm1 zmm2 zmm3 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.39: # Preds ..B1.38
# Execution count [2.50e+01]
# NOTE(review): tail-loop LJ arithmetic also commented out (WIP).
#vbroadcastsd %xmm6, %zmm6 #147.23
#vbroadcastsd %xmm12, %zmm12 #148.23
#vsubpd %zmm1, %zmm12, %zmm23 #177.40
#vsubpd %zmm2, %zmm6, %zmm21 #176.40
#vsubpd %zmm3, %zmm4, %zmm20 #175.40
#vmulpd %zmm21, %zmm21, %zmm19 #178.53
#vfmadd231pd %zmm20, %zmm20, %zmm19 #178.53
#vfmadd231pd %zmm23, %zmm23, %zmm19 #178.67
#vrcp14pd %zmm19, %zmm18 #195.42
#vcmppd $1, %zmm14, %zmm19, %k2 #194.26
#vfpclasspd $30, %zmm18, %k0 #195.42
#kmovw %k2, %esi #194.26
#knotw %k0, %k1 #195.42
#vmovaps %zmm19, %zmm0 #195.42
#andl %esi, %r9d #194.26
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm18, %zmm0 #195.42
#kmovw %r9d, %k3 #198.21
#vmulpd %zmm0, %zmm0, %zmm1 #195.42
#vfmadd213pd %zmm18, %zmm0, %zmm18{%k1} #195.42
#vfmadd213pd %zmm18, %zmm1, %zmm18{%k1} #195.42
#vmulpd %zmm13, %zmm18, %zmm2 #196.42
#vmulpd %zmm9, %zmm18, %zmm4 #197.58
#vmulpd %zmm2, %zmm18, %zmm10 #196.48
#vmulpd %zmm10, %zmm18, %zmm3 #196.54
#vfmsub213pd %zmm5, %zmm10, %zmm18 #197.58
#vmulpd %zmm4, %zmm3, %zmm17 #197.65
#vmulpd %zmm18, %zmm17, %zmm22 #197.71
#vfmadd231pd %zmm20, %zmm22, %zmm8{%k3} #198.21
#vfmadd231pd %zmm21, %zmm22, %zmm7{%k3} #199.21
#vfmadd231pd %zmm23, %zmm22, %zmm11{%k3} #200.21
# LOE rax rdx rcx rbx rdi r8 r14 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.40: # Preds ..B1.25 ..B1.39 ..B1.34
# Execution count [4.50e+00]
# Horizontal reduction: fold each 8-lane accumulator (zmm8/zmm7/zmm11) to a
# scalar via vpermd (swap 256-bit halves) + vpermpd $78 (swap 128-bit
# halves) + vpermpd $177 (swap 64-bit pairs), adding after each permute.
# Results land in xmm24 (fx), xmm18 (fy), xmm4 (fz).
vmovups .L_2il0floatpacket.10(%rip), %zmm19 #151.22
vpermd %zmm11, %zmm19, %zmm0 #151.22
vpermd %zmm7, %zmm19, %zmm6 #150.22
vpermd %zmm8, %zmm19, %zmm20 #149.22
vaddpd %zmm11, %zmm0, %zmm11 #151.22
vaddpd %zmm7, %zmm6, %zmm7 #150.22
vaddpd %zmm8, %zmm20, %zmm8 #149.22
vpermpd $78, %zmm11, %zmm1 #151.22
vpermpd $78, %zmm7, %zmm10 #150.22
vpermpd $78, %zmm8, %zmm21 #149.22
vaddpd %zmm1, %zmm11, %zmm2 #151.22
vaddpd %zmm10, %zmm7, %zmm12 #150.22
vaddpd %zmm21, %zmm8, %zmm22 #149.22
vpermpd $177, %zmm2, %zmm3 #151.22
vpermpd $177, %zmm12, %zmm17 #150.22
vpermpd $177, %zmm22, %zmm23 #149.22
vaddpd %zmm3, %zmm2, %zmm4 #151.22
vaddpd %zmm17, %zmm12, %zmm18 #150.22
vaddpd %zmm23, %zmm22, %zmm24 #149.22
# LOE rax rdx rcx rbx rdi r8 r14 xmm4 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
..B1.41: # Preds ..B1.40 ..B1.11
# Execution count [5.00e+00]
# Accumulate the three scalar sums into fx[i]/fy[i]/fz[i] and advance the
# outer loop (rdi += 24 = next AoS position, rax = next atom index).
movq 80(%rsp), %rsi #208.9[spill]
addq $24, %rdi #143.5
vaddsd (%rsi,%rax,8), %xmm24, %xmm0 #208.9
vmovsd %xmm0, (%rsi,%rax,8) #208.9
movslq %eax, %rsi #143.32
vaddsd (%r14,%rax,8), %xmm18, %xmm1 #209.9
vmovsd %xmm1, (%r14,%rax,8) #209.9
incq %rsi #143.32
vaddsd (%r8,%rax,8), %xmm4, %xmm2 #210.9
vmovsd %xmm2, (%r8,%rax,8) #210.9
incq %rax #143.5
cmpq 64(%rsp), %rax #143.5[spill]
jb ..B1.11 # Prob 82% #143.5
jmp ..B1.44 # Prob 100% #143.5
# LOE rax rdx rcx rbx rsi rdi r8 r14 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
..B1.43: # Preds ..B1.2
# Execution count [5.00e-01]
# Zero-atom path: still open the marker region so the Stop below pairs up.
movl $.L_2__STRING.0, %edi #141.5
..___tag_value_computeForce.48:
# likwid_markerStartRegion(const char *)
call likwid_markerStartRegion #141.5
..___tag_value_computeForce.49:
# LOE
..B1.44: # Preds ..B1.41 ..B1.43
# Execution count [1.00e+00]
# Close the marker region; vzeroupper before calling back into SSE/C code.
movl $.L_2__STRING.0, %edi #219.5
vzeroupper #219.5
..___tag_value_computeForce.50:
# likwid_markerStopRegion(const char *)
call likwid_markerStopRegion #219.5
..___tag_value_computeForce.51:
# LOE
..B1.45: # Preds ..B1.44
# Execution count [1.00e+00]
xorl %eax, %eax #221.16
..___tag_value_computeForce.52:
# getTimeStamp()
call getTimeStamp #221.16
..___tag_value_computeForce.53:
# LOE xmm0
..B1.46: # Preds ..B1.45
# Execution count [1.00e+00]
# Epilogue: xmm0 = end - start timestamp, restore callee-saved regs, return.
vsubsd 24(%rsp), %xmm0, %xmm0 #224.14[spill]
addq $88, %rsp #224.14
.cfi_restore 3
popq %rbx #224.14
.cfi_restore 15
popq %r15 #224.14
.cfi_restore 14
popq %r14 #224.14
.cfi_restore 13
popq %r13 #224.14
.cfi_restore 12
popq %r12 #224.14
movq %rbp, %rsp #224.14
popq %rbp #224.14
.cfi_def_cfa 7, 8
.cfi_restore 6
ret #224.14
.cfi_def_cfa 6, 16
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22
.cfi_offset 6, -16
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22
# LOE
..B1.47: # Preds ..B1.13
# Execution count [4.50e-01]: Infreq
# Short-list path (8 <= numneigh < 1200): skip the alignment peel,
# r13d = numneigh rounded down to a multiple of 8.
movl %r10d, %r13d #173.13
xorl %r11d, %r11d #173.13
andl $-8, %r13d #173.13
jmp ..B1.26 # Prob 100% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.48: # Preds ..B1.12
# Execution count [4.50e-01]: Infreq
# Fewer than 8 neighbors: everything goes through the masked tail.
xorl %r13d, %r13d #173.13
jmp ..B1.34 # Prob 100% #173.13
.align 16,0x90
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
.cfi_endproc
# mark_end;
.type computeForce,@function
.size computeForce,.-computeForce
..LNcomputeForce.0:
.data
# -- End computeForce
# Read-only constant pool for computeForce.  Several entries (.2, .5, .6,
# .7, .8, .9) are referenced only from the commented-out kernel body above
# — presumably kept for when the WIP arithmetic is re-enabled; confirm.
.section .rodata, "a"
.align 64
.align 64
.L_2il0floatpacket.2:
# Eight packed doubles: 1.0 (0x3FF0000000000000) x 8.
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
.type .L_2il0floatpacket.2,@object
.size .L_2il0floatpacket.2,64
.align 64
.L_2il0floatpacket.4:
# Eight packed doubles: 0.5 (0x3FE0000000000000) x 8; loaded into zmm5.
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
.type .L_2il0floatpacket.4,@object
.size .L_2il0floatpacket.4,64
.align 64
.L_2il0floatpacket.5:
# Byte-pattern table, not referenced by the visible code — presumably a
# mask-expansion LUT emitted by the vectorizer; confirm before removing.
.long 0x02010101,0x04040202,0x08080804,0x20101010,0x40402020,0x80808040,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000
.type .L_2il0floatpacket.5,@object
.size .L_2il0floatpacket.5,64
.align 64
.L_2il0floatpacket.6:
# 64-bit lane-index pattern {0,4,8,12,1,5,9,13} — permute/transpose table,
# not referenced by the visible code.
.long 0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000,0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000
.type .L_2il0floatpacket.6,@object
.size .L_2il0floatpacket.6,64
.align 64
.L_2il0floatpacket.7:
# 64-bit lane-index pattern {1,5,9,13,0,4,8,12} — companion permute table.
.long 0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000,0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000
.type .L_2il0floatpacket.7,@object
.size .L_2il0floatpacket.7,64
.align 64
.L_2il0floatpacket.8:
# 64-bit lane-index pattern {2,6,10,14} repeated — companion permute table.
.long 0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000,0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000
.type .L_2il0floatpacket.8,@object
.size .L_2il0floatpacket.8,64
.align 64
.L_2il0floatpacket.10:
# Dword indices {8..15, 8..15}: vpermd control that swaps the 256-bit
# halves of a ZMM — first step of the horizontal reduction in ..B1.40.
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
.type .L_2il0floatpacket.10,@object
.size .L_2il0floatpacket.10,64
.align 32
.L_2il0floatpacket.0:
# Eight dwords of 8: per-iteration increment for the SIMD lane counter
# (ymm16) in the masked peel loop.
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
.type .L_2il0floatpacket.0,@object
.size .L_2il0floatpacket.0,32
.align 32
.L_2il0floatpacket.1:
# Iota vector {0,1,2,3,4,5,6,7}: initial SIMD lane indices (ymm15).
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
.type .L_2il0floatpacket.1,@object
.size .L_2il0floatpacket.1,32
.align 8
.L_2il0floatpacket.3:
# Double 48.0 (0x4048000000000000): LJ force prefactor multiplied into
# the broadcast parameter in ..B1.10.
.long 0x00000000,0x40480000
.type .L_2il0floatpacket.3,@object
.size .L_2il0floatpacket.3,8
.align 8
.L_2il0floatpacket.9:
# Double 1.0: Newton-Raphson constant for the vrcp14pd refinement
# (referenced only in the commented-out kernel).
.long 0x00000000,0x3ff00000
.type .L_2il0floatpacket.9,@object
.size .L_2il0floatpacket.9,8
.section .rodata.str1.4, "aMS",@progbits,1
.align 4
.align 4
.L_2__STRING.0:
# NUL-terminated string "force" (0x636F7266 = "forc", 0x0065 = "e\0"):
# LIKWID marker region name.
.long 1668444006
.word 101
.type .L_2__STRING.0,@object
.size .L_2__STRING.0,6
.data
.section .note.GNU-stack, ""
# End

585
asm/unused/force-mem-only.s Normal file
View File

@@ -0,0 +1,585 @@
# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.0.5.281 Build 20190815";
# mark_description "-I./src/includes -S -D_GNU_SOURCE -DAOS -DPRECISION=2 -DNEIGHBORS_LOOP_RUNS=1 -DVECTOR_WIDTH=8 -DALIGNMENT=6";
# mark_description "4 -restrict -Ofast -xCORE-AVX512 -qopt-zmm-usage=high -o ICC/force.s";
.file "force.c"
.text
..TXTST0:
.L_2__routine_start_computeForce_0:
# -- Begin computeForce
.text
# mark_begin;
.align 16,0x90
.globl computeForce
# --- computeForce(Parameter *, Atom *, Neighbor *, int)
computeForce:
# parameter 1: %rdi
# parameter 2: %rsi
# parameter 3: %rdx
# parameter 4: %ecx
..B1.1: # Preds ..B1.0
# Execution count [1.00e+00]
.cfi_startproc
..___tag_value_computeForce.1:
..L2:
#103.87
pushq %rbp #103.87
.cfi_def_cfa_offset 16
movq %rsp, %rbp #103.87
.cfi_def_cfa 6, 16
.cfi_offset 6, -16
andq $-64, %rsp #103.87
pushq %r12 #103.87
pushq %r13 #103.87
pushq %r14 #103.87
subq $104, %rsp #103.87
xorl %eax, %eax #106.16
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
movq %rdx, %r14 #103.87
movq %rsi, %r13 #103.87
movq %rdi, %r12 #103.87
..___tag_value_computeForce.9:
# getTimeStamp()
call getTimeStamp #106.16
..___tag_value_computeForce.10:
# LOE rbx r12 r13 r14 r15 xmm0
..B1.48: # Preds ..B1.1
# Execution count [1.00e+00]
vmovsd %xmm0, 16(%rsp) #106.16[spill]
# LOE rbx r12 r13 r14 r15
..B1.2: # Preds ..B1.48
# Execution count [1.00e+00]
movl 4(%r13), %ecx #107.18
movq 64(%r13), %r11 #109.20
movq 72(%r13), %r10 #109.45
movq 80(%r13), %r9 #109.70
vmovsd 72(%r12), %xmm2 #111.27
vmovsd 8(%r12), %xmm1 #112.23
vmovsd (%r12), %xmm0 #113.24
testl %ecx, %ecx #116.24
jle ..B1.42 # Prob 50% #116.24
# LOE rbx r9 r10 r11 r13 r14 r15 ecx xmm0 xmm1 xmm2
..B1.3: # Preds ..B1.2
# Execution count [1.00e+00]
xorl %edi, %edi #116.5
movl %ecx, %edx #116.5
xorl %esi, %esi #116.5
movl $1, %r8d #116.5
xorl %eax, %eax #117.17
shrl $1, %edx #116.5
je ..B1.7 # Prob 9% #116.5
# LOE rax rdx rbx rsi rdi r9 r10 r11 r13 r14 r15 ecx r8d xmm0 xmm1 xmm2
..B1.5: # Preds ..B1.3 ..B1.5
# Execution count [2.50e+00]
movq %rax, (%rsi,%r11) #117.9
incq %rdi #116.5
movq %rax, (%rsi,%r10) #118.9
movq %rax, (%rsi,%r9) #119.9
movq %rax, 8(%rsi,%r11) #117.9
movq %rax, 8(%rsi,%r10) #118.9
movq %rax, 8(%rsi,%r9) #119.9
addq $16, %rsi #116.5
cmpq %rdx, %rdi #116.5
jb ..B1.5 # Prob 63% #116.5
# LOE rax rdx rbx rsi rdi r9 r10 r11 r13 r14 r15 ecx xmm0 xmm1 xmm2
..B1.6: # Preds ..B1.5
# Execution count [9.00e-01]
lea 1(%rdi,%rdi), %r8d #117.9
# LOE rax rbx r9 r10 r11 r13 r14 r15 ecx r8d xmm0 xmm1 xmm2
..B1.7: # Preds ..B1.3 ..B1.6
# Execution count [1.00e+00]
lea -1(%r8), %edx #116.5
cmpl %ecx, %edx #116.5
jae ..B1.9 # Prob 9% #116.5
# LOE rax rbx r9 r10 r11 r13 r14 r15 ecx r8d xmm0 xmm1 xmm2
..B1.8: # Preds ..B1.7
# Execution count [9.00e-01]
movslq %r8d, %r8 #116.5
movq %rax, -8(%r11,%r8,8) #117.9
movq %rax, -8(%r10,%r8,8) #118.9
movq %rax, -8(%r9,%r8,8) #119.9
# LOE rbx r9 r10 r11 r13 r14 r15 ecx xmm0 xmm1 xmm2
..B1.9: # Preds ..B1.7 ..B1.8
# Execution count [9.00e-01]
vmulsd %xmm2, %xmm2, %xmm13 #111.45
xorl %edi, %edi #124.15
vmovdqu32 .L_2il0floatpacket.0(%rip), %ymm16 #153.13
vmulsd .L_2il0floatpacket.3(%rip), %xmm0, %xmm0 #177.45
vmovdqu .L_2il0floatpacket.1(%rip), %ymm15 #153.13
vmovups .L_2il0floatpacket.4(%rip), %zmm5 #177.58
vbroadcastsd %xmm13, %zmm14 #111.25
vbroadcastsd %xmm1, %zmm13 #112.21
vbroadcastsd %xmm0, %zmm9 #177.45
movq 16(%r13), %rdx #127.25
xorl %r8d, %r8d #124.5
movslq %ecx, %r12 #124.5
xorl %eax, %eax #124.5
movq 24(%r14), %r13 #126.25
movslq 16(%r14), %rcx #125.43
movq 8(%r14), %rsi #125.19
shlq $2, %rcx #108.5
movq %r12, 80(%rsp) #124.5[spill]
movq %r13, 88(%rsp) #124.5[spill]
movq %r11, 96(%rsp) #124.5[spill]
movq %r15, 8(%rsp) #124.5[spill]
movq %rbx, (%rsp) #124.5[spill]
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
# LOE rax rdx rcx rsi rdi r8 r9 r10 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
..B1.10: # Preds ..B1.40 ..B1.9
# Execution count [5.00e+00]
movq 88(%rsp), %rbx #126.25[spill]
vxorpd %xmm24, %xmm24, %xmm24 #130.22
vmovapd %xmm24, %xmm18 #131.22
movl (%rbx,%r8,4), %r11d #126.25
vmovapd %xmm18, %xmm4 #132.22
vmovsd (%rax,%rdx), %xmm10 #127.25
vmovsd 8(%rax,%rdx), %xmm6 #128.25
vmovsd 16(%rax,%rdx), %xmm12 #129.25
testl %r11d, %r11d #153.32
jle ..B1.40 # Prob 50% #153.32
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d xmm4 xmm6 xmm10 xmm12 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
..B1.11: # Preds ..B1.10
# Execution count [4.50e+00]
vpxord %zmm8, %zmm8, %zmm8 #130.22
vmovaps %zmm8, %zmm7 #131.22
vmovaps %zmm7, %zmm11 #132.22
cmpl $8, %r11d #153.13
jl ..B1.45 # Prob 10% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.12: # Preds ..B1.11
# Execution count [4.50e+00]
cmpl $1200, %r11d #153.13
jl ..B1.44 # Prob 10% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.13: # Preds ..B1.12
# Execution count [4.50e+00]
movq %rcx, %r15 #125.43
imulq %rdi, %r15 #125.43
addq %rsi, %r15 #108.5
movq %r15, %r12 #153.13
andq $63, %r12 #153.13
testl $3, %r12d #153.13
je ..B1.15 # Prob 50% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.14: # Preds ..B1.13
# Execution count [2.25e+00]
xorl %r12d, %r12d #153.13
jmp ..B1.17 # Prob 100% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.15: # Preds ..B1.13
# Execution count [2.25e+00]
testl %r12d, %r12d #153.13
je ..B1.17 # Prob 50% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.16: # Preds ..B1.15
# Execution count [2.50e+01]
negl %r12d #153.13
addl $64, %r12d #153.13
shrl $2, %r12d #153.13
cmpl %r12d, %r11d #153.13
cmovl %r11d, %r12d #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.17: # Preds ..B1.14 ..B1.16 ..B1.15
# Execution count [5.00e+00]
movl %r11d, %r14d #153.13
subl %r12d, %r14d #153.13
andl $7, %r14d #153.13
negl %r14d #153.13
addl %r11d, %r14d #153.13
cmpl $1, %r12d #153.13
jb ..B1.25 # Prob 50% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.18: # Preds ..B1.17
# Execution count [4.50e+00]
vmovdqa %ymm15, %ymm4 #153.13
xorl %r13d, %r13d #153.13
vpbroadcastd %r12d, %ymm3 #153.13
vbroadcastsd %xmm10, %zmm2 #127.23
vbroadcastsd %xmm6, %zmm1 #128.23
vbroadcastsd %xmm12, %zmm0 #129.23
movslq %r12d, %rbx #153.13
movq %r9, 24(%rsp) #153.13[spill]
movq %r10, 32(%rsp) #153.13[spill]
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.19: # Preds ..B1.23 ..B1.18
# Execution count [2.50e+01]
vpcmpgtd %ymm4, %ymm3, %k3 #153.13
vmovdqu32 (%r15,%r13,4), %ymm17{%k3}{z} #154.25
kmovw %k3, %r10d #153.13
vpaddd %ymm17, %ymm17, %ymm18 #155.40
vpaddd %ymm18, %ymm17, %ymm17 #155.40
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r10d r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 ymm17 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
..B1.22: # Preds ..B1.19
# Execution count [1.25e+01]
kmovw %k3, %k1 #155.40
kmovw %k3, %k2 #155.40
vpxord %zmm18, %zmm18, %zmm18 #155.40
vpxord %zmm19, %zmm19, %zmm19 #155.40
vpxord %zmm20, %zmm20, %zmm20 #155.40
vgatherdpd 16(%rdx,%ymm17,8), %zmm18{%k1} #155.40
vgatherdpd 8(%rdx,%ymm17,8), %zmm19{%k2} #155.40
vgatherdpd (%rdx,%ymm17,8), %zmm20{%k3} #155.40
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r10d r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm18 zmm19 zmm20
..B1.23: # Preds ..B1.22
# Execution count [2.50e+01]
addq $8, %r13 #153.13
#vpaddd %ymm16, %ymm4, %ymm4 #153.13
#vsubpd %zmm18, %zmm0, %zmm29 #157.40
#vsubpd %zmm19, %zmm1, %zmm27 #156.40
#vsubpd %zmm20, %zmm2, %zmm26 #155.40
#vmulpd %zmm27, %zmm27, %zmm25 #158.53
#vfmadd231pd %zmm26, %zmm26, %zmm25 #158.53
#vfmadd231pd %zmm29, %zmm29, %zmm25 #158.67
#vrcp14pd %zmm25, %zmm24 #175.42
#vcmppd $1, %zmm14, %zmm25, %k2 #174.26
#vfpclasspd $30, %zmm24, %k0 #175.42
#kmovw %k2, %r9d #174.26
#knotw %k0, %k1 #175.42
#vmovaps %zmm25, %zmm17 #175.42
#andl %r9d, %r10d #174.26
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm24, %zmm17 #175.42
#kmovw %r10d, %k3 #178.21
#vmulpd %zmm17, %zmm17, %zmm18 #175.42
#vfmadd213pd %zmm24, %zmm17, %zmm24{%k1} #175.42
#vfmadd213pd %zmm24, %zmm18, %zmm24{%k1} #175.42
#vmulpd %zmm13, %zmm24, %zmm19 #176.42
#vmulpd %zmm9, %zmm24, %zmm21 #177.58
#vmulpd %zmm19, %zmm24, %zmm22 #176.48
#vmulpd %zmm22, %zmm24, %zmm20 #176.54
#vfmsub213pd %zmm5, %zmm22, %zmm24 #177.58
#vmulpd %zmm21, %zmm20, %zmm23 #177.65
#vmulpd %zmm24, %zmm23, %zmm28 #177.71
#vfmadd231pd %zmm26, %zmm28, %zmm8{%k3} #178.21
#vfmadd231pd %zmm27, %zmm28, %zmm7{%k3} #179.21
#vfmadd231pd %zmm29, %zmm28, %zmm11{%k3} #180.21
cmpq %rbx, %r13 #153.13
jb ..B1.19 # Prob 82% #153.13
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.24: # Preds ..B1.23
# Execution count [4.50e+00]
movq 24(%rsp), %r9 #[spill]
movq 32(%rsp), %r10 #[spill]
cmpl %r12d, %r11d #153.13
je ..B1.39 # Prob 10% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.25: # Preds ..B1.24 ..B1.17 ..B1.44
# Execution count [2.50e+01]
lea 8(%r12), %ebx #153.13
cmpl %ebx, %r14d #153.13
jl ..B1.33 # Prob 50% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.26: # Preds ..B1.25
# Execution count [4.50e+00]
movq %rcx, %r13 #125.43
imulq %rdi, %r13 #125.43
vbroadcastsd %xmm10, %zmm1 #127.23
vbroadcastsd %xmm6, %zmm0 #128.23
vbroadcastsd %xmm12, %zmm2 #129.23
movslq %r12d, %rbx #153.13
addq %rsi, %r13 #108.5
movq %rax, 40(%rsp) #108.5[spill]
movq %rcx, 48(%rsp) #108.5[spill]
movq %rsi, 56(%rsp) #108.5[spill]
movq %r8, 64(%rsp) #108.5[spill]
movq %rdi, 72(%rsp) #108.5[spill]
movq %r9, 24(%rsp) #108.5[spill]
movq %r10, 32(%rsp) #108.5[spill]
# LOE rdx rbx r13 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.27: # Preds ..B1.31 ..B1.26
# Execution count [2.50e+01]
vmovdqu (%r13,%rbx,4), %ymm3 #154.25
vpaddd %ymm3, %ymm3, %ymm4 #155.40
vpaddd %ymm4, %ymm3, %ymm3 #155.40
movl (%r13,%rbx,4), %r10d #154.25
movl 4(%r13,%rbx,4), %r9d #154.25
movl 8(%r13,%rbx,4), %r8d #154.25
movl 12(%r13,%rbx,4), %edi #154.25
lea (%r10,%r10,2), %r10d #155.40
movl 16(%r13,%rbx,4), %esi #154.25
lea (%r9,%r9,2), %r9d #155.40
movl 20(%r13,%rbx,4), %ecx #154.25
lea (%r8,%r8,2), %r8d #155.40
movl 24(%r13,%rbx,4), %eax #154.25
lea (%rdi,%rdi,2), %edi #155.40
movl 28(%r13,%rbx,4), %r15d #154.25
lea (%rsi,%rsi,2), %esi #155.40
lea (%rcx,%rcx,2), %ecx #155.40
lea (%rax,%rax,2), %eax #155.40
lea (%r15,%r15,2), %r15d #155.40
# LOE rdx rbx r13 eax ecx esi edi r8d r9d r10d r11d r12d r14d r15d xmm6 xmm10 xmm12 ymm3 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.30: # Preds ..B1.27
# Execution count [1.25e+01]
vpcmpeqb %xmm0, %xmm0, %k1 #155.40
vpcmpeqb %xmm0, %xmm0, %k2 #155.40
vpcmpeqb %xmm0, %xmm0, %k3 #155.40
vpxord %zmm4, %zmm4, %zmm4 #155.40
vpxord %zmm17, %zmm17, %zmm17 #155.40
vpxord %zmm18, %zmm18, %zmm18 #155.40
vgatherdpd 16(%rdx,%ymm3,8), %zmm4{%k1} #155.40
vgatherdpd 8(%rdx,%ymm3,8), %zmm17{%k2} #155.40
vgatherdpd (%rdx,%ymm3,8), %zmm18{%k3} #155.40
# LOE rdx rbx r13 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm17 zmm18
..B1.31: # Preds ..B1.30
# Execution count [2.50e+01]
addl $8, %r12d #153.13
addq $8, %rbx #153.13
#vsubpd %zmm4, %zmm2, %zmm26 #157.40
#vsubpd %zmm17, %zmm0, %zmm24 #156.40
#vsubpd %zmm18, %zmm1, %zmm23 #155.40
#vmulpd %zmm24, %zmm24, %zmm3 #158.53
#vfmadd231pd %zmm23, %zmm23, %zmm3 #158.53
#vfmadd231pd %zmm26, %zmm26, %zmm3 #158.67
#vrcp14pd %zmm3, %zmm22 #175.42
#vcmppd $1, %zmm14, %zmm3, %k2 #174.26
#vfpclasspd $30, %zmm22, %k0 #175.42
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm22, %zmm3 #175.42
#knotw %k0, %k1 #175.42
#vmulpd %zmm3, %zmm3, %zmm4 #175.42
#vfmadd213pd %zmm22, %zmm3, %zmm22{%k1} #175.42
#vfmadd213pd %zmm22, %zmm4, %zmm22{%k1} #175.42
#vmulpd %zmm13, %zmm22, %zmm17 #176.42
#vmulpd %zmm9, %zmm22, %zmm19 #177.58
#vmulpd %zmm17, %zmm22, %zmm20 #176.48
#vmulpd %zmm20, %zmm22, %zmm18 #176.54
#vfmsub213pd %zmm5, %zmm20, %zmm22 #177.58
#vmulpd %zmm19, %zmm18, %zmm21 #177.65
#vmulpd %zmm22, %zmm21, %zmm25 #177.71
#vfmadd231pd %zmm23, %zmm25, %zmm8{%k2} #178.21
#vfmadd231pd %zmm24, %zmm25, %zmm7{%k2} #179.21
#vfmadd231pd %zmm26, %zmm25, %zmm11{%k2} #180.21
cmpl %r14d, %r12d #153.13
jb ..B1.27 # Prob 82% #153.13
# LOE rdx rbx r13 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.32: # Preds ..B1.31
# Execution count [4.50e+00]
movq 40(%rsp), %rax #[spill]
movq 48(%rsp), %rcx #[spill]
movq 56(%rsp), %rsi #[spill]
movq 64(%rsp), %r8 #[spill]
movq 72(%rsp), %rdi #[spill]
movq 24(%rsp), %r9 #[spill]
movq 32(%rsp), %r10 #[spill]
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.33: # Preds ..B1.32 ..B1.25 ..B1.45
# Execution count [5.00e+00]
lea 1(%r14), %ebx #153.13
cmpl %r11d, %ebx #153.13
ja ..B1.39 # Prob 50% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.34: # Preds ..B1.33
# Execution count [2.50e+01]
imulq %rcx, %rdi #125.43
vbroadcastsd %xmm10, %zmm4 #127.23
subl %r14d, %r11d #153.13
addq %rsi, %rdi #108.5
vpbroadcastd %r11d, %ymm0 #153.13
vpcmpgtd %ymm15, %ymm0, %k3 #153.13
movslq %r14d, %r14 #153.13
vmovdqu32 (%rdi,%r14,4), %ymm1{%k3}{z} #154.25
kmovw %k3, %edi #153.13
vpaddd %ymm1, %ymm1, %ymm2 #155.40
vpaddd %ymm2, %ymm1, %ymm0 #155.40
# LOE rax rdx rcx rsi r8 r9 r10 edi xmm6 xmm12 ymm0 ymm15 ymm16 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
..B1.37: # Preds ..B1.34
# Execution count [1.25e+01]
kmovw %k3, %k1 #155.40
kmovw %k3, %k2 #155.40
vpxord %zmm1, %zmm1, %zmm1 #155.40
vpxord %zmm2, %zmm2, %zmm2 #155.40
vpxord %zmm3, %zmm3, %zmm3 #155.40
vgatherdpd 16(%rdx,%ymm0,8), %zmm1{%k1} #155.40
vgatherdpd 8(%rdx,%ymm0,8), %zmm2{%k2} #155.40
vgatherdpd (%rdx,%ymm0,8), %zmm3{%k3} #155.40
# LOE rax rdx rcx rsi r8 r9 r10 edi xmm6 xmm12 ymm15 ymm16 zmm1 zmm2 zmm3 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.38: # Preds ..B1.37
# Execution count [2.50e+01]
#vbroadcastsd %xmm6, %zmm6 #128.23
#vbroadcastsd %xmm12, %zmm12 #129.23
#vsubpd %zmm1, %zmm12, %zmm23 #157.40
#vsubpd %zmm2, %zmm6, %zmm21 #156.40
#vsubpd %zmm3, %zmm4, %zmm20 #155.40
#vmulpd %zmm21, %zmm21, %zmm19 #158.53
#vfmadd231pd %zmm20, %zmm20, %zmm19 #158.53
#vfmadd231pd %zmm23, %zmm23, %zmm19 #158.67
#vrcp14pd %zmm19, %zmm18 #175.42
#vcmppd $1, %zmm14, %zmm19, %k2 #174.26
#vfpclasspd $30, %zmm18, %k0 #175.42
#kmovw %k2, %ebx #174.26
#knotw %k0, %k1 #175.42
#vmovaps %zmm19, %zmm0 #175.42
#andl %ebx, %edi #174.26
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm18, %zmm0 #175.42
#kmovw %edi, %k3 #178.21
#vmulpd %zmm0, %zmm0, %zmm1 #175.42
#vfmadd213pd %zmm18, %zmm0, %zmm18{%k1} #175.42
#vfmadd213pd %zmm18, %zmm1, %zmm18{%k1} #175.42
#vmulpd %zmm13, %zmm18, %zmm2 #176.42
#vmulpd %zmm9, %zmm18, %zmm4 #177.58
#vmulpd %zmm2, %zmm18, %zmm10 #176.48
#vmulpd %zmm10, %zmm18, %zmm3 #176.54
#vfmsub213pd %zmm5, %zmm10, %zmm18 #177.58
#vmulpd %zmm4, %zmm3, %zmm17 #177.65
#vmulpd %zmm18, %zmm17, %zmm22 #177.71
#vfmadd231pd %zmm20, %zmm22, %zmm8{%k3} #178.21
#vfmadd231pd %zmm21, %zmm22, %zmm7{%k3} #179.21
#vfmadd231pd %zmm23, %zmm22, %zmm11{%k3} #180.21
# LOE rax rdx rcx rsi r8 r9 r10 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.39: # Preds ..B1.24 ..B1.38 ..B1.33
# Execution count [4.50e+00]
vmovups .L_2il0floatpacket.10(%rip), %zmm19 #132.22
vpermd %zmm11, %zmm19, %zmm0 #132.22
vpermd %zmm7, %zmm19, %zmm6 #131.22
vpermd %zmm8, %zmm19, %zmm20 #130.22
vaddpd %zmm11, %zmm0, %zmm11 #132.22
vaddpd %zmm7, %zmm6, %zmm7 #131.22
vaddpd %zmm8, %zmm20, %zmm8 #130.22
vpermpd $78, %zmm11, %zmm1 #132.22
vpermpd $78, %zmm7, %zmm10 #131.22
vpermpd $78, %zmm8, %zmm21 #130.22
vaddpd %zmm1, %zmm11, %zmm2 #132.22
vaddpd %zmm10, %zmm7, %zmm12 #131.22
vaddpd %zmm21, %zmm8, %zmm22 #130.22
vpermpd $177, %zmm2, %zmm3 #132.22
vpermpd $177, %zmm12, %zmm17 #131.22
vpermpd $177, %zmm22, %zmm23 #130.22
vaddpd %zmm3, %zmm2, %zmm4 #132.22
vaddpd %zmm17, %zmm12, %zmm18 #131.22
vaddpd %zmm23, %zmm22, %zmm24 #130.22
# LOE rax rdx rcx rsi r8 r9 r10 xmm4 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
..B1.40: # Preds ..B1.39 ..B1.10
# Execution count [5.00e+00]
movq 96(%rsp), %rbx #188.9[spill]
addq $24, %rax #124.5
movslq %r8d, %rdi #124.32
incq %rdi #124.32
#vaddsd (%rbx,%r8,8), %xmm24, %xmm0 #188.9
#vmovsd %xmm0, (%rbx,%r8,8) #188.9
#vaddsd (%r10,%r8,8), %xmm18, %xmm1 #189.9
#vmovsd %xmm1, (%r10,%r8,8) #189.9
#vaddsd (%r9,%r8,8), %xmm4, %xmm2 #190.9
#vmovsd %xmm2, (%r9,%r8,8) #190.9
incq %r8 #124.5
cmpq 80(%rsp), %r8 #124.5[spill]
jb ..B1.10 # Prob 82% #124.5
# LOE rax rdx rcx rsi rdi r8 r9 r10 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
..B1.41: # Preds ..B1.40
# Execution count [9.00e-01]
movq 8(%rsp), %r15 #[spill]
.cfi_restore 15
movq (%rsp), %rbx #[spill]
.cfi_restore 3
# LOE rbx r15
..B1.42: # Preds ..B1.2 ..B1.41
# Execution count [1.00e+00]
xorl %eax, %eax #201.16
vzeroupper #201.16
..___tag_value_computeForce.43:
# getTimeStamp()
call getTimeStamp #201.16
..___tag_value_computeForce.44:
# LOE rbx r15 xmm0
..B1.43: # Preds ..B1.42
# Execution count [1.00e+00]
vsubsd 16(%rsp), %xmm0, %xmm0 #204.14[spill]
addq $104, %rsp #204.14
.cfi_restore 14
popq %r14 #204.14
.cfi_restore 13
popq %r13 #204.14
.cfi_restore 12
popq %r12 #204.14
movq %rbp, %rsp #204.14
popq %rbp #204.14
.cfi_def_cfa 7, 8
.cfi_restore 6
ret #204.14
.cfi_def_cfa 6, 16
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
.cfi_offset 6, -16
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
# LOE
..B1.44: # Preds ..B1.12
# Execution count [4.50e-01]: Infreq
movl %r11d, %r14d #153.13
xorl %r12d, %r12d #153.13
andl $-8, %r14d #153.13
jmp ..B1.25 # Prob 100% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.45: # Preds ..B1.11
# Execution count [4.50e-01]: Infreq
xorl %r14d, %r14d #153.13
jmp ..B1.33 # Prob 100% #153.13
.align 16,0x90
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
.cfi_endproc
# mark_end;
.type computeForce,@function
.size computeForce,.-computeForce
..LNcomputeForce.0:
.data
# -- End computeForce
.section .rodata, "a"
.align 64
.align 64
# 8 x IEEE-754 double 1.0 (each dword pair is 0x3ff0000000000000)
.L_2il0floatpacket.2:
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
.type .L_2il0floatpacket.2,@object
.size .L_2il0floatpacket.2,64
.align 64
# 8 x IEEE-754 double 0.5 (0x3fe0000000000000)
.L_2il0floatpacket.4:
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
.type .L_2il0floatpacket.4,@object
.size .L_2il0floatpacket.4,64
.align 64
# Byte-granular bit-mask/shuffle table (6 non-zero dwords, rest zero);
# not referenced in this chunk of the file -- TODO confirm its use
.L_2il0floatpacket.5:
.long 0x02010101,0x04040202,0x08080804,0x20101010,0x40402020,0x80808040,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000
.type .L_2il0floatpacket.5,@object
.size .L_2il0floatpacket.5,64
.align 64
# Qword lane indices {0,4,8,12, 1,5,9,13} -- presumably a permute table
# to de-interleave AOS x/y/z gathers; use not visible in this chunk
.L_2il0floatpacket.6:
.long 0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000,0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000
.type .L_2il0floatpacket.6,@object
.size .L_2il0floatpacket.6,64
.align 64
# Qword lane indices {1,5,9,13, 0,4,8,12} (companion to packet.6)
.L_2il0floatpacket.7:
.long 0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000,0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000
.type .L_2il0floatpacket.7,@object
.size .L_2il0floatpacket.7,64
.align 64
# Qword lane indices {2,6,10,14} repeated in both 256-bit halves
.L_2il0floatpacket.8:
.long 0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000,0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000
.type .L_2il0floatpacket.8,@object
.size .L_2il0floatpacket.8,64
.align 64
# Dword indices {8..15, 8..15}: vpermd table that mirrors the upper
# 256-bit half into the lower one (first step of the force reduction)
.L_2il0floatpacket.10:
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
.type .L_2il0floatpacket.10,@object
.size .L_2il0floatpacket.10,64
.align 32
# 8 x dword 8 -- per-iteration lane-index increment (vector width)
.L_2il0floatpacket.0:
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
.type .L_2il0floatpacket.0,@object
.size .L_2il0floatpacket.0,32
.align 32
# Dword lane ids 0..7 (iota) -- compared against the remaining-neighbor
# count to build the remainder write mask
.L_2il0floatpacket.1:
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
.type .L_2il0floatpacket.1,@object
.size .L_2il0floatpacket.1,32
.align 8
# Double 48.0 (0x4048000000000000) -- LJ force prefactor for epsilon
.L_2il0floatpacket.3:
.long 0x00000000,0x40480000
.type .L_2il0floatpacket.3,@object
.size .L_2il0floatpacket.3,8
.align 8
# Double 1.0 -- broadcast ({1to8}) in the Newton-Raphson refinement of
# vrcp14pd (see the commented vfnmadd213pd above)
.L_2il0floatpacket.9:
.long 0x00000000,0x3ff00000
.type .L_2il0floatpacket.9,@object
.size .L_2il0floatpacket.9,8
.data
.section .note.GNU-stack, ""
# End

324
asm/unused/force.s Normal file
View File

@@ -0,0 +1,324 @@
.intel_syntax noprefix
.text
.align 16,0x90
#-----------------------------------------------------------------------
# double computeForce(Parameter *param, Atom *atom, Neighbor *neighbor)
# ABI:   SysV AMD64 -- rdi = param, rsi = atom, rdx = neighbor
# Out:   xmm0 = 0.0 (the getTimeStamp() timing pair is commented out;
#        when re-enabled it returns the elapsed time E-S)
# Does:  zeroes fx/fy/fz, then for every local atom accumulates the
#        Lennard-Jones force over its neighbor list, 8 neighbors per
#        AVX-512 iteration plus a masked remainder.
# Clobb: rax, rcx, rdx, rsi, rdi, r8-r11, zmm0-zmm31, k1-k5, flags.
#        Callee-saved rbp/r12-r15/rbx are pushed/popped here.
# Stack: spill slots live in the red zone below rsp -- this is only
#        safe while the function stays a leaf (getTimeStamp commented).
# Fix:   Nlocal <= 0 used to jump to ..atom_loop_exit, which reads the
#        (uninitialized) red-zone spills and stores through a garbage
#        r10 index; it now exits via ..function_exit.
#-----------------------------------------------------------------------
.globl computeForce
computeForce:
# parameter 1: rdi Parameter*
# parameter 2: rsi Atom*
# parameter 3: rdx Neighbor*
push rbp
push r12
push r13
push r14
push r15
push rbx
#call getTimeStamp # xmm0 <- getTimeStamp()
#vmovsd QWORD PTR [-56+rsp], xmm0 # [-56+rsp] <- xmm0 [spill]
mov r9d, DWORD PTR [4+rsi] # r9d <- atom->Nlocal
vmovsd xmm2, QWORD PTR [96+rdi] # xmm2 <- param->cutforce
vmovsd xmm1, QWORD PTR [32+rdi] # xmm1 <- param->sigma6
vmovsd xmm0, QWORD PTR [24+rdi] # xmm0 <- param->epsilon
mov r13, QWORD PTR [64+rsi] # r13 <- atom->fx
mov r14, QWORD PTR [72+rsi] # r14 <- atom->fy
mov rdi, QWORD PTR [80+rsi] # rdi <- atom->fz
test r9d, r9d # atom->Nlocal <= 0
jle ..function_exit # no atoms: return 0.0 without touching memory
xor r10d, r10d # r10d <- 0
mov ecx, r9d # ecx <- atom->Nlocal
xor r8d, r8d # r8d <- 0
mov r11d, 1 # r11d <- 1
xor eax, eax # eax <- 0
shr ecx, 1 # ecx <- atom->Nlocal >> 1
je ..zero_last_element # ecx == 0
# Init forces to zero loop (unroll factor = 2)
..init_force_loop:
mov QWORD PTR [r8+r13], rax # fx[i] <- 0
mov QWORD PTR [r8+r14], rax # fy[i] <- 0
mov QWORD PTR [r8+rdi], rax # fz[i] <- 0
mov QWORD PTR [8+r8+r13], rax # fx[i] <- 0
mov QWORD PTR [8+r8+r14], rax # fy[i] <- 0
mov QWORD PTR [8+r8+rdi], rax # fz[i] <- 0
add r8, 16 # i++
inc r10 # i++
cmp r10, rcx # i < Nlocal
jb ..init_force_loop
# Trick to make r11d contain value of last element to be zeroed plus 1
# Maybe we can directly put r10+10 here and zero r11d above, then remove the -1 below
lea r11d, DWORD PTR [1+r10+r10] # r11d <- i * 2 + 1
..zero_last_element:
lea ecx, DWORD PTR [-1+r11] # ecx <- i * 2
cmp ecx, r9d # i >= Nlocal
jae ..before_atom_loop
# Set last element to zero (odd Nlocal leftover from the unroll-by-2)
movsxd r11, r11d # r11 <- i * 2
mov QWORD PTR [-8+r13+r11*8], rax # fx[i] <- 0
mov QWORD PTR [-8+r14+r11*8], rax # fy[i] <- 0
mov QWORD PTR [-8+rdi+r11*8], rax # fz[i] <- 0
# Initialize registers to be used within atom loop
..before_atom_loop:
vmulsd xmm15, xmm2, xmm2 # xmm15 <- cutforcesq
vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip] # ymm18 <- [8, ...] (loaded but unused below)
vmulsd xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip] # xmm0 <- 48 * epsilon
vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip] # ymm17 <- [0..7]
vmovups zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip] # zmm7 <- [0.5, ...]
vbroadcastsd zmm16, xmm15 # zmm16 <- [cutforcesq, ...]
vbroadcastsd zmm15, xmm1 # zmm15 <- [param->sigma6, ...]
vbroadcastsd zmm14, xmm0 # zmm14 <- [48 * epsilon, ...]
movsxd r9, r9d # r9 <- atom->Nlocal
xor r10d, r10d # r10d <- 0 (i)
mov rcx, QWORD PTR [24+rdx] # rcx <- neighbor->numneigh
mov r11, QWORD PTR [8+rdx] # r11 <- neighbor->neighbors
movsxd r12, DWORD PTR [16+rdx] # r12 <- neighbor->maxneighs
mov rdx, QWORD PTR [16+rsi] # rdx <- atom->x
### AOS
xor eax, eax
### SOA
#mov rax, QWORD PTR [24+rsi] # rax <- atom->y
#mov rsi, QWORD PTR [32+rsi] # rsi <- atom->z
###
shl r12, 2 # r12 <- neighbor->maxneighs * 4
# Register spilling into the red zone (leaf function only)
mov QWORD PTR [-32+rsp], r9 # [-32+rsp] <- atom->Nlocal
mov QWORD PTR [-24+rsp], rcx # [-24+rsp] <- neighbor->numneigh
mov QWORD PTR [-16+rsp], r14 # [-16+rsp] <- atom->fy
mov QWORD PTR [-8+rsp], r13 # [-8+rsp] <- atom->fx
mov QWORD PTR [-40+rsp], r15 # [-40+rsp] <- r15
mov QWORD PTR [-48+rsp], rbx # [-48+rsp] <- rbx
..atom_loop_begin:
mov rcx, QWORD PTR [-24+rsp] # rcx <- neighbor->numneigh
vxorpd xmm25, xmm25, xmm25 # xmm25 <- 0 (fix)
vmovapd xmm20, xmm25 # xmm20 <- 0 (fiy)
mov r13d, DWORD PTR [rcx+r10*4] # r13d <- neighbor->numneigh[i] (numneighs)
vmovapd xmm4, xmm20 # xmm4 <- 0 (fiz)
### AOS
vmovsd xmm8, QWORD PTR[rdx+rax] # xmm8 <- atom->x[i * 3]
vmovsd xmm9, QWORD PTR[8+rdx+rax] # xmm9 <- atom->x[i * 3 + 1]
vmovsd xmm10, QWORD PTR[16+rdx+rax] # xmm10 <- atom->x[i * 3 + 2]
### SOA
#vmovsd xmm8, QWORD PTR [rdx+r10*8] # xmm8 <- atom->x[i]
#vmovsd xmm9, QWORD PTR [rax+r10*8] # xmm9 <- atom->y[i]
#vmovsd xmm10, QWORD PTR [rsi+r10*8] # xmm10 <- atom->z[i]
###
vbroadcastsd zmm0, xmm8 # zmm0 <- atom_x(i)
vbroadcastsd zmm1, xmm9 # zmm1 <- atom_y(i)
vbroadcastsd zmm2, xmm10 # zmm2 <- atom_z(i)
test r13d, r13d # numneighs <= 0
jle ..atom_loop_exit # zero contribution: xmm25/xmm20/xmm4 are already 0
vpxord zmm13, zmm13, zmm13 # zmm13 <- 0 (fix)
vmovaps zmm12, zmm13 # zmm12 <- 0 (fiy)
vmovaps zmm11, zmm12 # zmm11 <- 0 (fiz)
mov rcx, r12 # rcx <- neighbor->maxneighs * 4
imul rcx, r10 # rcx <- neighbor->maxneighs * 4 * i
add rcx, r11 # rcx <- &neighbor->neighbors[neighbor->maxneighs * i]
xor r9d, r9d # r9d <- 0 (k)
mov r14d, r13d # r14d <- numneighs
cmp r14d, 8
jl ..compute_forces_remainder
..compute_forces:
vpcmpeqb k1, xmm0, xmm0 # k1/k2/k3 <- all-ones gather masks
vpcmpeqb k2, xmm0, xmm0
vpcmpeqb k3, xmm0, xmm0
vmovdqu ymm3, YMMWORD PTR [rcx+r9*4] # ymm3 <- 8 neighbor indices
vpxord zmm5, zmm5, zmm5
vpxord zmm6, zmm6, zmm6
### AOS
vpaddd ymm4, ymm3, ymm3 # index * 3 for AOS x/y/z triples
vpaddd ymm3, ymm3, ymm4
vpxord zmm4, zmm4, zmm4
vgatherdpd zmm4{k1}, [rdx+ymm3*8] # zmm4 <- atom_x(j)
vgatherdpd zmm5{k2}, [8+rdx+ymm3*8] # zmm5 <- atom_y(j)
vgatherdpd zmm6{k3}, [16+rdx+ymm3*8] # zmm6 <- atom_z(j)
### SOA
#vpxord zmm4, zmm4, zmm4
#vgatherdpd zmm5{k2}, [rax+ymm3*8]
#vgatherdpd zmm4{k1}, [rdx+ymm3*8]
#vgatherdpd zmm6{k3}, [rsi+ymm3*8]
###
vsubpd zmm29, zmm1, zmm5 # zmm29 <- atom_y(i) - atom_y(j) -- dely
vsubpd zmm28, zmm0, zmm4 # zmm28 <- atom_x(i) - atom_x(j) -- delx
vsubpd zmm31, zmm2, zmm6 # zmm31 <- atom_z(i) - atom_z(j) -- delz
vmulpd zmm20, zmm29, zmm29 # zmm20 <- dely * dely
vfmadd231pd zmm20, zmm28, zmm28 # zmm20 <- dely * dely + delx * delx
vfmadd231pd zmm20, zmm31, zmm31 # zmm20 <- zmm20 + delz * delz -- rsq
# Cutoff radius condition
vrcp14pd zmm27, zmm20 # zmm27 <- 1.0 / rsq (sr2)
vcmppd k5, zmm20, zmm16, 1 # k5 <- rsq < cutforcesq
vmulpd zmm22, zmm27, zmm15 # zmm22 <- sr2 * sigma6
vmulpd zmm24, zmm27, zmm14 # zmm24 <- 48.0 * epsilon * sr2
vmulpd zmm25, zmm27, zmm22 # zmm25 <- sr2 * sigma6 * sr2
vmulpd zmm23, zmm27, zmm25 # zmm23 <- sr2 * sigma6 * sr2 * sr2
vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma6 * sr2 * sr2 - 0.5
vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
vmulpd zmm30, zmm26, zmm27 # zmm30 <- force
vfmadd231pd zmm13{k5}, zmm30, zmm28 # fix += force * delx
vfmadd231pd zmm12{k5}, zmm30, zmm29 # fiy += force * dely
vfmadd231pd zmm11{k5}, zmm30, zmm31 # fiz += force * delz
sub r14d, 8
add r9, 8
cmp r14d, 8
jge ..compute_forces
# Check if there are remaining neighbors to be computed
..compute_forces_remainder:
test r14d, r14d
jle ..sum_up_forces
vpbroadcastd ymm4, r14d # ymm4 <- [remaining, ...]
vpcmpgtd k1, ymm4, ymm17 # k1 <- lane < remaining
kmovw r15d, k1
vmovdqu32 ymm3{k1}{z}, YMMWORD PTR [rcx+r9*4]
kmovw k2, k1
kmovw k3, k1
vpxord zmm5, zmm5, zmm5
vpxord zmm6, zmm6, zmm6
### AOS
vpaddd ymm4, ymm3, ymm3
vpaddd ymm3, ymm3, ymm4
vpxord zmm4, zmm4, zmm4
vgatherdpd zmm4{k1}, [rdx+ymm3*8]
vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]
vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]
#### SOA
#vpxord zmm4, zmm4, zmm4
#vgatherdpd zmm5{k2}, [rax+ymm3*8]
#vgatherdpd zmm4{k1}, [rdx+ymm3*8]
#vgatherdpd zmm6{k3}, [rsi+ymm3*8]
###
vsubpd zmm29, zmm1, zmm5 # zmm29 <- atom_y(i) - atom_y(j) -- dely
vsubpd zmm28, zmm0, zmm4 # zmm28 <- atom_x(i) - atom_x(j) -- delx
vsubpd zmm31, zmm2, zmm6 # zmm31 <- atom_z(i) - atom_z(j) -- delz
vmulpd zmm20, zmm29, zmm29 # zmm20 <- dely * dely
vfmadd231pd zmm20, zmm28, zmm28 # zmm20 <- dely * dely + delx * delx
vfmadd231pd zmm20, zmm31, zmm31 # zmm20 <- zmm20 + delz * delz -- rsq
# Cutoff radius condition
vrcp14pd zmm27, zmm20 # zmm27 <- 1.0 / rsq (sr2)
vcmppd k5, zmm20, zmm16, 1 # k5 <- rsq < cutforcesq
kmovw r9d, k5 # r9d <- rsq < cutforcesq (clobbers k -- ok, last iteration)
and r15d, r9d # r15d <- rsq < cutforcesq && k < numneighs
kmovw k3, r15d # k3 <- rsq < cutforcesq && k < numneighs
vmulpd zmm22, zmm27, zmm15 # zmm22 <- sr2 * sigma6
vmulpd zmm24, zmm27, zmm14 # zmm24 <- 48.0 * epsilon * sr2
vmulpd zmm25, zmm27, zmm22 # zmm25 <- sr2 * sigma6 * sr2
vmulpd zmm23, zmm27, zmm25 # zmm23 <- sr2 * sigma6 * sr2 * sr2
vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma6 * sr2 * sr2 - 0.5
vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
vmulpd zmm30, zmm26, zmm27 # zmm30 <- force
vfmadd231pd zmm13{k3}, zmm30, zmm28 # fix += force * delx
vfmadd231pd zmm12{k3}, zmm30, zmm29 # fiy += force * dely
vfmadd231pd zmm11{k3}, zmm30, zmm31 # fiz += force * delz
# Forces are currently separated in different lanes of zmm registers, hence it is necessary to permutate
# and add them (reduction) to obtain the final contribution for the current atom
..sum_up_forces:
vmovups zmm10, ZMMWORD PTR .L_2il0floatpacket.6[rip]
vpermd zmm0, zmm10, zmm11 # swap 256-bit halves of fiz and add
vpermd zmm5, zmm10, zmm12
vpermd zmm21, zmm10, zmm13
vaddpd zmm11, zmm0, zmm11
vaddpd zmm12, zmm5, zmm12
vaddpd zmm13, zmm21, zmm13
vpermpd zmm1, zmm11, 78 # swap 128-bit pairs and add
vpermpd zmm6, zmm12, 78
vpermpd zmm22, zmm13, 78
vaddpd zmm2, zmm11, zmm1
vaddpd zmm8, zmm12, zmm6
vaddpd zmm23, zmm13, zmm22
vpermpd zmm3, zmm2, 177 # swap adjacent lanes and add -> scalar sums
vpermpd zmm9, zmm8, 177
vpermpd zmm24, zmm23, 177
vaddpd zmm4, zmm2, zmm3 # xmm4 <- fiz sum
vaddpd zmm20, zmm8, zmm9 # xmm20 <- fiy sum
vaddpd zmm25, zmm23, zmm24 # xmm25 <- fix sum
# Per-atom epilogue: accumulate the reduced sums into fx/fy/fz[i].
# Reached with xmm25/xmm20/xmm4 = 0 when numneighs <= 0.
..atom_loop_exit:
mov rcx, QWORD PTR [-8+rsp] # rcx <- atom->fx [spill]
mov rbx, QWORD PTR [-16+rsp] # rbx <- atom->fy [spill]
### AOS
add rax, 24
###
vaddsd xmm0, xmm25, QWORD PTR [rcx+r10*8] # fx[i] += fix
vmovsd QWORD PTR [rcx+r10*8], xmm0
vaddsd xmm1, xmm20, QWORD PTR [rbx+r10*8] # fy[i] += fiy
vmovsd QWORD PTR [rbx+r10*8], xmm1
vaddsd xmm2, xmm4, QWORD PTR [rdi+r10*8] # fz[i] += fiz
vmovsd QWORD PTR [rdi+r10*8], xmm2
inc r10 # i++
cmp r10, QWORD PTR [-32+rsp] # i < Nlocal [spill]
jb ..atom_loop_begin
..function_exit:
vzeroupper # required before returning to SSE/C code
vxorpd xmm0, xmm0, xmm0 # return 0.0
#call getTimeStamp # xmm0 <- getTimeStamp()
#vsubsd xmm0, xmm0, QWORD PTR [-56+rsp] # xmm0 <- E-S
pop rbx
pop r15
pop r14
pop r13
pop r12
pop rbp
ret
.type computeForce,@function
.size computeForce,.-computeForce
..LNcomputeForce.0:
.data
# -- End computeForce
.section .rodata, "a"
.align 64
.align 64
# 8 x IEEE-754 double 1.0 (0x3ff0000000000000); not referenced in the
# routine above -- TODO confirm it is still needed
.L_2il0floatpacket.2:
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
.type .L_2il0floatpacket.2,@object
.size .L_2il0floatpacket.2,64
.align 64
# 8 x IEEE-754 double 0.5 (0x3fe0000000000000) -- loaded into zmm7 and
# used by vfmsub213pd in the LJ force kernel
.L_2il0floatpacket.4:
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
.type .L_2il0floatpacket.4,@object
.size .L_2il0floatpacket.4,64
.align 64
# Dword indices {8..15, 8..15}: vpermd table (zmm10) that mirrors the
# upper 256-bit half into the lower one for the force reduction
.L_2il0floatpacket.6:
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
.type .L_2il0floatpacket.6,@object
.size .L_2il0floatpacket.6,64
.align 32
# 8 x dword 8 -- loaded into ymm18 (lane-index increment)
.L_2il0floatpacket.0:
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
.type .L_2il0floatpacket.0,@object
.size .L_2il0floatpacket.0,32
.align 32
# Dword lane ids 0..7 (iota, ymm17) -- compared against the remaining
# neighbor count to build the remainder write mask
.L_2il0floatpacket.1:
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
.type .L_2il0floatpacket.1,@object
.size .L_2il0floatpacket.1,32
.align 8
# Double 48.0 (0x4048000000000000) -- multiplied into param->epsilon
.L_2il0floatpacket.3:
.long 0x00000000,0x40480000
.type .L_2il0floatpacket.3,@object
.size .L_2il0floatpacket.3,8
.align 8
# Double 1.0 -- not referenced in the routine above; presumably kept
# for a Newton-Raphson refinement of vrcp14pd -- TODO confirm
.L_2il0floatpacket.5:
.long 0x00000000,0x3ff00000
.type .L_2il0floatpacket.5,@object
.size .L_2il0floatpacket.5,8
.data
.section .note.GNU-stack, ""
# End

326
asm/unused/force_lj.s Normal file
View File

@@ -0,0 +1,326 @@
.intel_syntax noprefix
.text
.align 16,0x90
.globl computeForceLJ
computeForceLJ:
# parameter 1: rdi Parameter*
# parameter 2: rsi Atom*
# parameter 3: rdx Neighbor*
push rbp
push r12
push r13
push r14
push r15
push rbx
mov r9d, DWORD PTR [4+rsi] # r9d <- atom->Nlocal
vmovsd xmm2, QWORD PTR [96+rdi] # xmm2 <- param->cutforce
vmovsd xmm1, QWORD PTR [32+rdi] # xmm1 <- param->sigma6
vmovsd xmm0, QWORD PTR [24+rdi] # xmm0 <- param->epsilon
mov r13, QWORD PTR [64+rsi] # r13 <- atom->fx
mov r14, QWORD PTR [72+rsi] # r14 <- atom->fy
mov rdi, QWORD PTR [80+rsi] # rdi <- atom->fz
test r9d, r9d # atom->Nlocal <= 0
jle ..atom_loop_exit
xor r10d, r10d # r10d <- 0
mov ecx, r9d # ecx <- atom->Nlocal
xor r8d, r8d # r8d <- 0
mov r11d, 1 # r11d <- 1
xor eax, eax # eax <- 0
shr ecx, 1 # ecx <- atom->Nlocal >> 1
je ..zero_last_element # ecx == 0
# Init forces to zero loop (unroll factor = 2)
..init_force_loop:
mov QWORD PTR [r8+r13], rax # fx[i] <- 0
mov QWORD PTR [r8+r14], rax # fy[i] <- 0
mov QWORD PTR [r8+rdi], rax # fz[i] <- 0
mov QWORD PTR [8+r8+r13], rax # fx[i] <- 0
mov QWORD PTR [8+r8+r14], rax # fy[i] <- 0
mov QWORD PTR [8+r8+rdi], rax # fz[i] <- 0
add r8, 16 # i++
inc r10 # i++
cmp r10, rcx # i < Nlocal
jb ..init_force_loop
# Trick to make r11d contain value of last element to be zeroed plus 1
# Maybe we can directly put r10+10 here and zero r11d above, then remove the -1 below
lea r11d, DWORD PTR [1+r10+r10] # r11d <- i * 2 + 1
..zero_last_element:
lea ecx, DWORD PTR [-1+r11] # ecx <- i * 2
cmp ecx, r9d # i >= Nlocal
jae ..before_atom_loop
# Set last element to zero
movsxd r11, r11d # r11 <- i * 2
mov QWORD PTR [-8+r13+r11*8], rax # fx[i] <- 0
mov QWORD PTR [-8+r14+r11*8], rax # fy[i] <- 0
mov QWORD PTR [-8+rdi+r11*8], rax # fz[i] <- 0
# Initialize registers to be used within atom loop
..before_atom_loop:
vmulsd xmm15, xmm2, xmm2 # xmm15 <- cutforcesq
vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip] # ymm18 <- [8, ...]
vmulsd xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip] # xmm0 <- 48 * epsilon
vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip] # ymm17 <- [0..7]
vmovups zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip] # zmm7 <- [0.5, ...]
vbroadcastsd zmm16, xmm15 # zmm16 <- [cutforcesq, ...]
vbroadcastsd zmm15, xmm1 # zmm15 <- [param->sigma6, ...]
vbroadcastsd zmm14, xmm0 # zmm14 <- [48 * epsilon, ...]
movsxd r9, r9d # r9 <- atom->Nlocal
xor r10d, r10d # r10d <- 0 (i)
mov rcx, QWORD PTR [24+rdx] # rcx <- neighbor->numneigh
mov r11, QWORD PTR [8+rdx] # r11 <- neighbor->neighbors
movsxd r12, DWORD PTR [16+rdx] # r12 <- neighbor->maxneighs
mov rdx, QWORD PTR [16+rsi] # rdx <- atom->x
### AOS
xor eax, eax
### SOA
#mov rax, QWORD PTR [24+rsi] # rax <- atom->y
#mov rsi, QWORD PTR [32+rsi] # rsi <- atom->z
###
shl r12, 2 # r12 <- neighbor->maxneighs * 4
# Register spilling
mov QWORD PTR [-32+rsp], r9 # [-32+rsp] <- atom->Nlocal
mov QWORD PTR [-24+rsp], rcx # [-24+rsp] <- neighbor->numneigh
mov QWORD PTR [-16+rsp], r14 # [-16+rsp] <- atom->fy
mov QWORD PTR [-8+rsp], r13 # [-8+rsp] <- atom->fx
mov QWORD PTR [-40+rsp], r15 # [-40+rsp] <- r15
mov QWORD PTR [-48+rsp], rbx # [-48+rsp] <- rbx
#sub rsp, 64
#call getTimeStamp # xmm0 <- getTimeStamp()
#vmovsd QWORD PTR [-56+rsp], xmm0 # [-56+rsp] <- xmm0 [spill]
#add rsp, 64
..atom_loop_begin:
mov rcx, QWORD PTR [-24+rsp] # rcx <- neighbor->numneigh
vxorpd xmm25, xmm25, xmm25 # xmm25 <- 0 (fix)
vmovapd xmm20, xmm25 # xmm20 <- 0 (fiy)
mov r13d, DWORD PTR [rcx+r10*4] # r13d <- neighbor->numneigh[i] (numneighs)
vmovapd xmm4, xmm20 # xmm4 <- 0 (fiz)
### AOS
vmovsd xmm8, QWORD PTR[rdx+rax] # xmm8 <- atom->x[i * 3]
vmovsd xmm9, QWORD PTR[8+rdx+rax] # xmm9 <- atom->x[i * 3 + 1]
vmovsd xmm10, QWORD PTR[16+rdx+rax] # xmm10 <- atom->x[i * 3 + 2]
### SOA
#vmovsd xmm8, QWORD PTR [rdx+r10*8] # xmm8 <- atom->x[i]
#vmovsd xmm9, QWORD PTR [rax+r10*8] # xmm9 <- atom->y[i]
#vmovsd xmm10, QWORD PTR [rsi+r10*8] # xmm10 <- atom->z[i]
###
vbroadcastsd zmm0, xmm8 # zmm0 <- atom_x(i)
vbroadcastsd zmm1, xmm9 # zmm1 <- atom_y(i)
vbroadcastsd zmm2, xmm10 # zmm2 <- atom_z(i)
test r13d, r13d # numneighs <= 0
jle ..atom_loop_exit
vpxord zmm13, zmm13, zmm13 # zmm13 <- 0 (fix)
vmovaps zmm12, zmm13 # zmm12 <- 0 (fiy)
vmovaps zmm11, zmm12 # zmm11 <- 0 (fiz)
mov rcx, r12 # rcx <- neighbor->maxneighs * 4
imul rcx, r10 # rcx <- neighbor->maxneighs * 4 * i
add rcx, r11 # rcx <- &neighbor->neighbors[neighbor->maxneighs * i]
xor r9d, r9d # r9d <- 0 (k)
mov r14d, r13d # r14d <- numneighs
cmp r14d, 8
jl ..compute_forces_remainder
..compute_forces:
vpcmpeqb k1, xmm0, xmm0
vpcmpeqb k2, xmm0, xmm0
vpcmpeqb k3, xmm0, xmm0
vmovdqu ymm3, YMMWORD PTR [rcx+r9*4]
vpxord zmm5, zmm5, zmm5
vpxord zmm6, zmm6, zmm6
### AOS
vpaddd ymm4, ymm3, ymm3
vpaddd ymm3, ymm3, ymm4
vpxord zmm4, zmm4, zmm4
vgatherdpd zmm4{k1}, [rdx+ymm3*8]
vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]
vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]
### SOA
#vpxord zmm4, zmm4, zmm4
#vgatherdpd zmm5{k2}, [rax+ymm3*8]
#vgatherdpd zmm4{k1}, [rdx+ymm3*8]
#vgatherdpd zmm6{k3}, [rsi+ymm3*8]
###
vsubpd zmm29, zmm1, zmm5 # zmm29 <- atom_y(i) - atom_y(j) -- dely
vsubpd zmm28, zmm0, zmm4 # zmm28 <- atom_x(i) - atom_x(j) -- delx
vsubpd zmm31, zmm2, zmm6 # zmm31 <- atom_z(i) - atom_z(j) -- delz
vmulpd zmm20, zmm29, zmm29 # zmm20 <- dely * dely
vfmadd231pd zmm20, zmm28, zmm28 # zmm20 <- dely * dely + delx * delx
vfmadd231pd zmm20, zmm31, zmm31 # zmm20 <- zmm20 + delz * delz -- rsq
# Cutoff radius condition
vrcp14pd zmm27, zmm20 # zmm27 <- 1.0 / rsq (sr2)
vcmppd k5, zmm20, zmm16, 1 # k5 <- rsq < cutforcesq
vmulpd zmm22, zmm27, zmm15 # zmm22 <- sr2 * sigma6
vmulpd zmm24, zmm27, zmm14 # zmm24 <- 48.0 * epsilon * sr2
vmulpd zmm25, zmm27, zmm22 # zmm25 <- sr2 * sigma6 * sr2
vmulpd zmm23, zmm27, zmm25 # zmm23 <- sr2 * sigma6 * sr2 * sr2
vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma * sr2 * sr2 - 0.5
vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
vmulpd zmm30, zmm26, zmm27 # zmm30 <- force
vfmadd231pd zmm13{k5}, zmm30, zmm28 # fix += force * delx
vfmadd231pd zmm12{k5}, zmm30, zmm29 # fiy += force * dely
vfmadd231pd zmm11{k5}, zmm30, zmm31 # fiz += force * delz
sub r14d, 8
add r9, 8
cmp r14d, 8
jge ..compute_forces
# Check if there are remaining neighbors to be computed
..compute_forces_remainder:
test r14d, r14d
jle ..sum_up_forces
vpbroadcastd ymm4, r14d
vpcmpgtd k1, ymm4, ymm17
kmovw r15d, k1
vmovdqu32 ymm3{k1}{z}, YMMWORD PTR [rcx+r9*4]
kmovw k2, k1
kmovw k3, k1
vpxord zmm5, zmm5, zmm5
vpxord zmm6, zmm6, zmm6
### AOS
vpaddd ymm4, ymm3, ymm3
vpaddd ymm3, ymm3, ymm4
vpxord zmm4, zmm4, zmm4
vgatherdpd zmm4{k1}, [rdx+ymm3*8]
vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]
vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]
#### SOA
#vpxord zmm4, zmm4, zmm4
#vgatherdpd zmm5{k2}, [rax+ymm3*8]
#vgatherdpd zmm4{k1}, [rdx+ymm3*8]
#vgatherdpd zmm6{k3}, [rsi+ymm3*8]
###
vsubpd zmm29, zmm1, zmm5 # zmm29 <- atom_y(i) - atom_y(j) -- dely
vsubpd zmm28, zmm0, zmm4 # zmm28 <- atom_x(i) - atom_x(j) -- delx
vsubpd zmm31, zmm2, zmm6 # zmm31 <- atom_z(i) - atom_z(j) -- delz
vmulpd zmm20, zmm29, zmm29 # zmm20 <- dely * dely
vfmadd231pd zmm20, zmm28, zmm28 # zmm20 <- dely * dely + delx * delx
vfmadd231pd zmm20, zmm31, zmm31 # zmm20 <- zmm20 + delz * delz -- rsq
# Cutoff radius condition
vrcp14pd zmm27, zmm20 # zmm27 <- 1.0 / rsq (sr2)
vcmppd k5, zmm20, zmm16, 1 # k5 <- rsq < cutforcesq
kmovw r9d, k5 # r9d <- rsq < cutforcesq
and r15d, r9d # r15d <- rsq < cutforcesq && k < numneighs
kmovw k3, r15d # k3 <- rsq < cutforcesq && k < numneighs
vmulpd zmm22, zmm27, zmm15 # zmm22 <- sr2 * sigma6
vmulpd zmm24, zmm27, zmm14 # zmm24 <- 48.0 * epsilon * sr2
vmulpd zmm25, zmm27, zmm22 # zmm25 <- sr2 * sigma6 * sr2
vmulpd zmm23, zmm27, zmm25 # zmm23 <- sr2 * sigma6 * sr2 * sr2
vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma * sr2 * sr2 - 0.5
vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
vmulpd zmm30, zmm26, zmm27 # zmm30 <- force
vfmadd231pd zmm13{k3}, zmm30, zmm28 # fix += force * delx
vfmadd231pd zmm12{k3}, zmm30, zmm29 # fiy += force * dely
vfmadd231pd zmm11{k3}, zmm30, zmm31 # fiz += force * delz
# Forces are currently separated in different lanes of zmm registers, hence it is necessary to permutate
# and add them (reduction) to obtain the final contribution for the current atom
..sum_up_forces:
vmovups zmm10, ZMMWORD PTR .L_2il0floatpacket.6[rip]
vpermd zmm0, zmm10, zmm11
vpermd zmm5, zmm10, zmm12
vpermd zmm21, zmm10, zmm13
vaddpd zmm11, zmm0, zmm11
vaddpd zmm12, zmm5, zmm12
vaddpd zmm13, zmm21, zmm13
vpermpd zmm1, zmm11, 78
vpermpd zmm6, zmm12, 78
vpermpd zmm22, zmm13, 78
vaddpd zmm2, zmm11, zmm1
vaddpd zmm8, zmm12, zmm6
vaddpd zmm23, zmm13, zmm22
vpermpd zmm3, zmm2, 177
vpermpd zmm9, zmm8, 177
vpermpd zmm24, zmm23, 177
vaddpd zmm4, zmm2, zmm3
vaddpd zmm20, zmm8, zmm9
vaddpd zmm25, zmm23, zmm24
..atom_loop_exit:
mov rcx, QWORD PTR [-8+rsp] #84.9[spill]
mov rbx, QWORD PTR [-16+rsp] #85.9[spill]
### AOS
add rax, 24
###
vaddsd xmm0, xmm25, QWORD PTR [rcx+r10*8] #84.9
vmovsd QWORD PTR [rcx+r10*8], xmm0 #84.9
vaddsd xmm1, xmm20, QWORD PTR [rbx+r10*8] #85.9
vmovsd QWORD PTR [rbx+r10*8], xmm1 #85.9
vaddsd xmm2, xmm4, QWORD PTR [rdi+r10*8] #86.9
vmovsd QWORD PTR [rdi+r10*8], xmm2 #86.9
inc r10 #55.5
cmp r10, QWORD PTR [-32+rsp] #55.5[spill]
jb ..atom_loop_begin
vzeroupper #93.12
vxorpd xmm0, xmm0, xmm0 #93.12
#call getTimeStamp # xmm0 <- getTimeStamp()
#vsubsd xmm0, xmm0, QWORD PTR [-56+rsp] # xmm0 <- E-S
pop rbx
pop r15
pop r14 #93.12
pop r13 #93.12
pop r12 #93.12
pop rbp #93.12
ret #93.12
.type computeForceLJ,@function
.size computeForceLJ,.-computeForceLJ
..LNcomputeForce.0:
.data
# -- End computeForceLJ
.section .rodata, "a"
.align 64
.align 64
# 8 packed doubles, each 0x3ff0000000000000 = 1.0 (broadcast vector of 1.0 for a zmm register)
.L_2il0floatpacket.2:
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
.type .L_2il0floatpacket.2,@object
.size .L_2il0floatpacket.2,64
.align 64
# 8 packed doubles, each 0x3fe0000000000000 = 0.5 (the 0.5 subtrahend in the LJ term,
# see vfmsub213pd ... zmm7 in the kernel above; zmm7 is presumably loaded from here)
.L_2il0floatpacket.4:
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
.type .L_2il0floatpacket.4,@object
.size .L_2il0floatpacket.4,64
.align 64
# 16 dword permutation indices {8..15, 8..15}: used with vpermd in the force
# reduction (..sum_up_forces) to bring the upper 256-bit half of a zmm down so
# the two halves can be added lane-wise
.L_2il0floatpacket.6:
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
.type .L_2il0floatpacket.6,@object
.size .L_2il0floatpacket.6,64
.align 32
# 8 packed dwords of 8: per-iteration increment for the vectorized neighbor
# index counter (the main loop consumes 8 neighbors at a time) -- TODO confirm,
# the load site is outside this chunk
.L_2il0floatpacket.0:
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
.type .L_2il0floatpacket.0,@object
.size .L_2il0floatpacket.0,32
.align 32
# lane index sequence {0..7}: compared against the remaining-neighbor count
# (vpcmpgtd ymm4, ymm17) to build the tail mask in ..compute_forces_remainder;
# ymm17 is presumably loaded from here -- load site outside this chunk
.L_2il0floatpacket.1:
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
.type .L_2il0floatpacket.1,@object
.size .L_2il0floatpacket.1,32
.align 8
# scalar double 0x4048000000000000 = 48.0 (the 48*epsilon factor of the LJ force)
.L_2il0floatpacket.3:
.long 0x00000000,0x40480000
.type .L_2il0floatpacket.3,@object
.size .L_2il0floatpacket.3,8
.align 8
# scalar double 0x3ff0000000000000 = 1.0
.L_2il0floatpacket.5:
.long 0x00000000,0x3ff00000
.type .L_2il0floatpacket.5,@object
.size .L_2il0floatpacket.5,8
.data
# empty GNU-stack note: marks the object as not requiring an executable stack
.section .note.GNU-stack, ""
# End
# End

View File

@@ -9,10 +9,8 @@
#if PRECISION == 1 #if PRECISION == 1
#define MD_FLOAT float #define MD_FLOAT float
# define MD_UINT unsigned int
#else #else
#define MD_FLOAT double #define MD_FLOAT double
# define MD_UINT unsigned long long int
#endif #endif
typedef struct { typedef struct {
@@ -21,7 +19,6 @@ typedef struct {
char* input_file; char* input_file;
char* vtk_file; char* vtk_file;
char* xtc_file; char* xtc_file;
char* write_atom_file;
MD_FLOAT epsilon; MD_FLOAT epsilon;
MD_FLOAT sigma; MD_FLOAT sigma;
MD_FLOAT sigma6; MD_FLOAT sigma6;

View File

@@ -48,13 +48,11 @@ static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_S
t2 = _mm256_permute2f128_pd(t0, t1, 0x21); t2 = _mm256_permute2f128_pd(t0, t1, 0x21);
t0 = _mm256_add_pd(t0, t2); t0 = _mm256_add_pd(t0, t2);
t1 = _mm256_add_pd(t1, t2); t1 = _mm256_add_pd(t1, t2);
t0 = _mm256_blend_pd(t0, t1, 0xC); t0 = _mm256_blend_pd(t0, t1, 0b1100);
//t0 = _mm256_blend_pd(t0, t1, 0b1100);
t1 = _mm256_add_pd(t0, _mm256_load_pd(m)); t1 = _mm256_add_pd(t0, _mm256_load_pd(m));
_mm256_store_pd(m, t1); _mm256_store_pd(m, t1);
t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0x5)); t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101));
//t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101));
a0 = _mm256_castpd256_pd128(t0); a0 = _mm256_castpd256_pd128(t0);
a1 = _mm256_extractf128_pd(t0, 0x1); a1 = _mm256_extractf128_pd(t0, 0x1);
a0 = _mm_add_sd(a0, a1); a0 = _mm_add_sd(a0, a1);
@@ -93,7 +91,7 @@ static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1,
} }
// Functions used in LAMMPS kernel // Functions used in LAMMPS kernel
#define simd_gather(vidx, m, s) _mm256_i32gather_pd(m, vidx, s); static inline MD_SIMD_FLOAT simd_gather(MD_SIMD_INT vidx, const MD_FLOAT *m, int s) { return _mm256_i32gather_pd(m, vidx, s); }
static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm_set1_epi32(scalar); } static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm_set1_epi32(scalar); }
static inline MD_SIMD_INT simd_int_zero() { return _mm_setzero_si128(); } static inline MD_SIMD_INT simd_int_zero() { return _mm_setzero_si128(); }
static inline MD_SIMD_INT simd_int_seq() { return _mm_set_epi32(3, 2, 1, 0); } static inline MD_SIMD_INT simd_int_seq() { return _mm_set_epi32(3, 2, 1, 0); }

View File

@@ -12,10 +12,7 @@
#define MD_SIMD_FLOAT __m512d #define MD_SIMD_FLOAT __m512d
#define MD_SIMD_MASK __mmask8 #define MD_SIMD_MASK __mmask8
#define MD_SIMD_INT __m256i #define MD_SIMD_INT __m256i
#define MD_SIMD_BITMASK MD_SIMD_INT
#define MD_SIMD_IBOOL __mmask16
static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) { return (__mmask8)(a); }
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm512_set1_pd(scalar); } static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm512_set1_pd(scalar); }
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_pd(0.0); } static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_pd(0.0); }
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_pd(a, b); } static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_pd(a, b); }

View File

@@ -7,30 +7,11 @@
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <immintrin.h> #include <immintrin.h>
#ifndef NO_ZMM_INTRIN
#include <zmmintrin.h> #include <zmmintrin.h>
#endif
#define MD_SIMD_FLOAT __m512 #define MD_SIMD_FLOAT __m512
#define MD_SIMD_MASK __mmask16 #define MD_SIMD_MASK __mmask16
#define MD_SIMD_INT __m256i
#define MD_SIMD_IBOOL __mmask16
#define MD_SIMD_INT32 __m512i
#define MD_SIMD_BITMASK MD_SIMD_INT32
static inline MD_SIMD_BITMASK simd_load_bitmask(const int *m) {
return _mm512_load_si512(m);
}
static inline MD_SIMD_INT32 simd_int32_broadcast(int a) {
return _mm512_set1_epi32(a);
}
static inline MD_SIMD_IBOOL simd_test_bits(MD_SIMD_FLOAT a) {
return _mm512_test_epi32_mask(_mm512_castps_si512(a), _mm512_castps_si512(a));
}
static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) { return a; }
static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); } static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); }
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_ps(0.0f); } static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_ps(0.0f); }
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_ps(a, b); } static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_ps(a, b); }
@@ -88,7 +69,7 @@ static inline MD_FLOAT simd_h_dual_incr_reduced_sum(float* m, MD_SIMD_FLOAT v0,
return _mm_cvtss_f32(t3); return _mm_cvtss_f32(t3);
} }
static inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) { inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
__m256 t; __m256 t;
a = _mm512_add_ps(a, _mm512_shuffle_f32x4(a, a, 0xee)); a = _mm512_add_ps(a, _mm512_shuffle_f32x4(a, a, 0xee));
t = _mm256_load_ps(m); t = _mm256_load_ps(m);

View File

@@ -7,8 +7,8 @@
#ifndef __TIMING_H_ #ifndef __TIMING_H_
#define __TIMING_H_ #define __TIMING_H_
extern double getTimeStamp(void); extern double getTimeStamp();
extern double getTimeResolution(void); extern double getTimeResolution();
extern double getTimeStamp_(void); extern double getTimeStamp_();
#endif #endif

View File

@@ -39,8 +39,8 @@ extern double myrandom(int*);
extern void random_reset(int *seed, int ibase, double *coord); extern void random_reset(int *seed, int ibase, double *coord);
extern int str2ff(const char *string); extern int str2ff(const char *string);
extern const char* ff2str(int ff); extern const char* ff2str(int ff);
extern int get_num_threads();
extern void readline(char *line, FILE *fp); extern void readline(char *line, FILE *fp);
extern void debug_printf(const char *format, ...); extern void debug_printf(const char *format, ...);
extern int get_cuda_num_threads();
#endif #endif

View File

@@ -17,7 +17,6 @@ void initParameter(Parameter *param) {
param->vtk_file = NULL; param->vtk_file = NULL;
param->xtc_file = NULL; param->xtc_file = NULL;
param->eam_file = NULL; param->eam_file = NULL;
param->write_atom_file = NULL;
param->force_field = FF_LJ; param->force_field = FF_LJ;
param->epsilon = 1.0; param->epsilon = 1.0;
param->sigma = 1.0; param->sigma = 1.0;
@@ -132,19 +131,19 @@ void readParameter(Parameter *param, const char *filename) {
void printParameter(Parameter *param) { void printParameter(Parameter *param) {
printf("Parameters:\n"); printf("Parameters:\n");
if(param->input_file != NULL) { if(param->input_file != NULL) {
printf("\tInput file: %s\n", param->input_file); printf("Input file: %s\n", param->input_file);
} }
if(param->vtk_file != NULL) { if(param->vtk_file != NULL) {
printf("\tVTK file: %s\n", param->vtk_file); printf("VTK file: %s\n", param->vtk_file);
} }
if(param->xtc_file != NULL) { if(param->xtc_file != NULL) {
printf("\tXTC file: %s\n", param->xtc_file); printf("XTC file: %s\n", param->xtc_file);
} }
if(param->eam_file != NULL) { if(param->eam_file != NULL) {
printf("\tEAM file: %s\n", param->eam_file); printf("EAM file: %s\n", param->eam_file);
} }
printf("\tForce field: %s\n", ff2str(param->force_field)); printf("\tForce field: %s\n", ff2str(param->force_field));
@@ -170,11 +169,6 @@ void printParameter(Parameter *param) {
printf("\tNumber of timesteps: %d\n", param->ntimes); printf("\tNumber of timesteps: %d\n", param->ntimes);
printf("\tReport stats every (timesteps): %d\n", param->nstat); printf("\tReport stats every (timesteps): %d\n", param->nstat);
printf("\tReneighbor every (timesteps): %d\n", param->reneigh_every); printf("\tReneighbor every (timesteps): %d\n", param->reneigh_every);
#ifdef SORT_ATOMS
printf("\tSort atoms when reneighboring: yes\n");
#else
printf("\tSort atoms when reneighboring: no\n");
#endif
printf("\tPrune every (timesteps): %d\n", param->prune_every); printf("\tPrune every (timesteps): %d\n", param->prune_every);
printf("\tOutput positions every (timesteps): %d\n", param->x_out_every); printf("\tOutput positions every (timesteps): %d\n", param->x_out_every);
printf("\tOutput velocities every (timesteps): %d\n", param->v_out_every); printf("\tOutput velocities every (timesteps): %d\n", param->v_out_every);

View File

@@ -79,7 +79,7 @@ const char* ff2str(int ff) {
return "invalid"; return "invalid";
} }
int get_cuda_num_threads() { int get_num_threads() {
const char *num_threads_env = getenv("NUM_THREADS"); const char *num_threads_env = getenv("NUM_THREADS");
return (num_threads_env == NULL) ? 32 : atoi(num_threads_env); return (num_threads_env == NULL) ? 32 : atoi(num_threads_env);
} }

View File

@@ -1,9 +1,9 @@
# Compiler tag (GCC/CLANG/ICC/ICX/ONEAPI/NVCC) # Compiler tag (GCC/CLANG/ICC/ICX/ONEAPI/NVCC)
TAG ?= ICC TAG ?= NVCC
# Instruction set (SSE/AVX/AVX_FMA/AVX2/AVX512) # Instruction set (SSE/AVX/AVX_FMA/AVX2/AVX512)
ISA ?= AVX512 ISA ?= AVX512
# Optimization scheme (lammps/gromacs/clusters_per_bin) # Optimization scheme (lammps/gromacs/clusters_per_bin)
OPT_SCHEME ?= lammps OPT_SCHEME ?= gromacs
# Enable likwid (true or false) # Enable likwid (true or false)
ENABLE_LIKWID ?= true ENABLE_LIKWID ?= true
# SP or DP # SP or DP
@@ -13,10 +13,8 @@ DATA_LAYOUT ?= AOS
# Assembly syntax to generate (ATT/INTEL) # Assembly syntax to generate (ATT/INTEL)
ASM_SYNTAX ?= ATT ASM_SYNTAX ?= ATT
# Debug # Debug
DEBUG ?= false DEBUG ?= true
# Sort atoms when reneighboring (true or false)
SORT_ATOMS ?= true
# Explicitly store and load atom types (true or false) # Explicitly store and load atom types (true or false)
EXPLICIT_TYPES ?= false EXPLICIT_TYPES ?= false
# Trace memory addresses for cache simulator (true or false) # Trace memory addresses for cache simulator (true or false)
@@ -38,11 +36,12 @@ USE_REFERENCE_VERSION ?= false
# Enable XTC output # Enable XTC output
XTC_OUTPUT ?= false XTC_OUTPUT ?= false
# Check if cj is local when decreasing reaction force # Check if cj is local when decreasing reaction force
HALF_NEIGHBOR_LISTS_CHECK_CJ ?= true HALF_NEIGHBOR_LISTS_CHECK_CJ ?= false
# Configurations for CUDA # Configurations for CUDA
# Use CUDA host memory to optimize transfers # Use CUDA host memory to optimize transfers
USE_CUDA_HOST_MEMORY ?= false USE_CUDA_HOST_MEMORY ?= false
USE_SUPER_CLUSTERS ?= true
#Feature options #Feature options
OPTIONS = -DALIGNMENT=64 OPTIONS = -DALIGNMENT=64

View File

@@ -6,7 +6,7 @@ dt 0.001
temp 80 temp 80
x_out_freq 500 x_out_freq 500
v_out_freq 5 v_out_freq 5
cutforce 1.8 cutforce 0.9
skin 0.1 skin 0.0
reneigh_every 100 reneigh_every 100
nstat 125000 nstat 125000

1
gather-bench Submodule

Submodule gather-bench added at 2f654cb043

View File

@@ -37,7 +37,24 @@ void initAtom(Atom *atom) {
atom->iclusters = NULL; atom->iclusters = NULL;
atom->jclusters = NULL; atom->jclusters = NULL;
atom->icluster_bin = NULL; atom->icluster_bin = NULL;
initMasks(atom);
#ifdef USE_SUPER_CLUSTERS
atom->scl_x = NULL;
atom->scl_v = NULL;
atom->scl_f = NULL;
atom->Nsclusters = 0;
atom->Nsclusters_local = 0;
atom->Nsclusters_ghost = 0;
atom->Nsclusters_max = 0;
atom->scl_type = NULL;
atom->siclusters = NULL;
atom->icluster_idx = NULL;
atom->sicluster_bin = NULL;
#endif //USE_SUPER_CLUSTERS
} }
void createAtom(Atom *atom, Parameter *param) { void createAtom(Atom *atom, Parameter *param) {
@@ -51,7 +68,6 @@ void createAtom(Atom *atom, Parameter *param) {
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)); atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)); atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)); atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) { for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
atom->epsilon[i] = param->epsilon; atom->epsilon[i] = param->epsilon;
atom->sigma6[i] = param->sigma6; atom->sigma6[i] = param->sigma6;
@@ -394,113 +410,6 @@ int readAtom_dmp(Atom* atom, Parameter* param) {
return natoms; return natoms;
} }
void initMasks(Atom *atom) {
const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
unsigned int mask0, mask1, mask2, mask3;
atom->exclusion_filter = allocate(ALIGNMENT, CLUSTER_M * VECTOR_WIDTH * sizeof(MD_UINT));
atom->diagonal_4xn_j_minus_i = allocate(ALIGNMENT, MAX(CLUSTER_M, VECTOR_WIDTH) * sizeof(MD_UINT));
atom->diagonal_2xnn_j_minus_i = allocate(ALIGNMENT, VECTOR_WIDTH * sizeof(MD_UINT));
//atom->masks_2xnn = allocate(ALIGNMENT, 8 * sizeof(unsigned int));
for(int j = 0; j < MAX(CLUSTER_M, VECTOR_WIDTH); j++) {
atom->diagonal_4xn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5;
}
for(int j = 0; j < VECTOR_WIDTH / 2; j++) {
atom->diagonal_2xnn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5;
atom->diagonal_2xnn_j_minus_i[VECTOR_WIDTH / 2 + j] = (MD_FLOAT)(j - 1) - 0.5;
}
for(int i = 0; i < CLUSTER_M * VECTOR_WIDTH; i++) {
atom->exclusion_filter[i] = (1U << i);
}
#if CLUSTER_M == CLUSTER_N
for(unsigned int cond0 = 0; cond0 < 2; cond0++) {
mask0 = (unsigned int)(0xf - 0x1 * cond0);
mask1 = (unsigned int)(0xf - 0x3 * cond0);
mask2 = (unsigned int)(0xf - 0x7 * cond0);
mask3 = (unsigned int)(0xf - 0xf * cond0);
atom->masks_2xnn_hn[cond0 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
atom->masks_2xnn_hn[cond0 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
mask0 = (unsigned int)(0xf - 0x1 * cond0);
mask1 = (unsigned int)(0xf - 0x2 * cond0);
mask2 = (unsigned int)(0xf - 0x4 * cond0);
mask3 = (unsigned int)(0xf - 0x8 * cond0);
atom->masks_2xnn_fn[cond0 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
atom->masks_2xnn_fn[cond0 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
atom->masks_4xn_hn[cond0 * 4 + 0] = (unsigned int)(0xf - 0x1 * cond0);
atom->masks_4xn_hn[cond0 * 4 + 1] = (unsigned int)(0xf - 0x3 * cond0);
atom->masks_4xn_hn[cond0 * 4 + 2] = (unsigned int)(0xf - 0x7 * cond0);
atom->masks_4xn_hn[cond0 * 4 + 3] = (unsigned int)(0xf - 0xf * cond0);
atom->masks_4xn_fn[cond0 * 4 + 0] = (unsigned int)(0xf - 0x1 * cond0);
atom->masks_4xn_fn[cond0 * 4 + 1] = (unsigned int)(0xf - 0x2 * cond0);
atom->masks_4xn_fn[cond0 * 4 + 2] = (unsigned int)(0xf - 0x4 * cond0);
atom->masks_4xn_fn[cond0 * 4 + 3] = (unsigned int)(0xf - 0x8 * cond0);
}
#else
for(unsigned int cond0 = 0; cond0 < 2; cond0++) {
for(unsigned int cond1 = 0; cond1 < 2; cond1++) {
#if CLUSTER_M < CLUSTER_N
mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
#else
mask0 = (unsigned int)(0x3 - 0x1 * cond0);
mask1 = (unsigned int)(0x3 - 0x3 * cond0);
mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1);
mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1);
#endif
atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
#if CLUSTER_M < CLUSTER_N
mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
mask1 = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
mask2 = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
mask3 = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
#else
mask0 = (unsigned int)(0x3 - 0x1 * cond0);
mask1 = (unsigned int)(0x3 - 0x2 * cond0);
mask2 = (unsigned int)(0x3 - 0x1 * cond1);
mask3 = (unsigned int)(0x3 - 0x2 * cond1);
#endif
atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
#if CLUSTER_M < CLUSTER_N
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
#else
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x1 * cond0);
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0x3 - 0x3 * cond0);
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0x3 - 0x3 * cond0 - 0x1 * cond1);
atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0x3 - 0x3 * cond0 - 0x3 * cond1);
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x1 * cond0);
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x2 * cond0);
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x1 * cond1);
atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x2 * cond1);
#endif
}
}
#endif
}
void growAtom(Atom *atom) { void growAtom(Atom *atom) {
int nold = atom->Nmax; int nold = atom->Nmax;
atom->Nmax += DELTA; atom->Nmax += DELTA;
@@ -530,3 +439,18 @@ void growClusters(Atom *atom) {
atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT)); atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
atom->cl_type = (int*) reallocate(atom->cl_type, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * sizeof(int), nold * CLUSTER_M * sizeof(int)); atom->cl_type = (int*) reallocate(atom->cl_type, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * sizeof(int), nold * CLUSTER_M * sizeof(int));
} }
#ifdef USE_SUPER_CLUSTERS
void growSuperClusters(Atom *atom) {
int nold = atom->Nsclusters_max;
atom->Nsclusters_max += DELTA;
atom->siclusters = (SuperCluster*) reallocate(atom->siclusters, ALIGNMENT, atom->Nsclusters_max * sizeof(SuperCluster), nold * sizeof(SuperCluster));
atom->icluster_idx = (int*) reallocate(atom->icluster_idx, ALIGNMENT, atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int), nold * SCLUSTER_SIZE * sizeof(int));
atom->sicluster_bin = (int*) reallocate(atom->sicluster_bin, ALIGNMENT, atom->Nsclusters_max * sizeof(int), nold * sizeof(int));
//atom->scl_type = (int*) reallocate(atom->scl_type, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * SCLUSTER_SIZE * sizeof(int), nold * CLUSTER_M * SCLUSTER_SIZE * sizeof(int));
atom->scl_x = (MD_FLOAT*) reallocate(atom->scl_x, ALIGNMENT, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT), nold * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
atom->scl_f = (MD_FLOAT*) reallocate(atom->scl_f, ALIGNMENT, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT), nold * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
atom->scl_v = (MD_FLOAT*) reallocate(atom->scl_v, ALIGNMENT, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT), nold * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
}
#endif //USE_SUPER_CLUSTERS

View File

@@ -39,8 +39,29 @@ extern "C" {
MD_FLOAT *cuda_bbminz, *cuda_bbmaxz; MD_FLOAT *cuda_bbminz, *cuda_bbmaxz;
int *cuda_PBCx, *cuda_PBCy, *cuda_PBCz; int *cuda_PBCx, *cuda_PBCy, *cuda_PBCz;
int isReneighboured; int isReneighboured;
int *cuda_iclusters;
int *cuda_nclusters;
int cuda_max_scl;
MD_FLOAT *cuda_scl_x;
MD_FLOAT *cuda_scl_v;
MD_FLOAT *cuda_scl_f;
extern void alignDataToSuperclusters(Atom *atom);
extern void alignDataFromSuperclusters(Atom *atom);
extern double computeForceLJSup_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats);
} }
extern __global__ void cudaInitialIntegrateSup_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
int *cuda_nclusters,
int *cuda_natoms,
int Nsclusters_local, MD_FLOAT dtforce, MD_FLOAT dt);
extern __global__ void cudaFinalIntegrateSup_warp(MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
int *cuda_nclusters, int *cuda_natoms,
int Nsclusters_local, MD_FLOAT dtforce);
extern "C" extern "C"
void initDevice(Atom *atom, Neighbor *neighbor) { void initDevice(Atom *atom, Neighbor *neighbor) {
cuda_assert("cudaDeviceSetup", cudaDeviceReset()); cuda_assert("cudaDeviceSetup", cudaDeviceReset());
@@ -59,10 +80,23 @@ void initDevice(Atom *atom, Neighbor *neighbor) {
natoms = (int *) malloc(atom->Nclusters_max * sizeof(int)); natoms = (int *) malloc(atom->Nclusters_max * sizeof(int));
ngatoms = (int *) malloc(atom->Nclusters_max * sizeof(int)); ngatoms = (int *) malloc(atom->Nclusters_max * sizeof(int));
isReneighboured = 1; isReneighboured = 1;
#ifdef USE_SUPER_CLUSTERS
cuda_max_scl = atom->Nsclusters_max;
cuda_iclusters = (int *) allocateGPU(atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int));
cuda_nclusters = (int *) allocateGPU(atom->Nsclusters_max * sizeof(int));
cuda_scl_x = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
cuda_scl_v = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
cuda_scl_f = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
#endif //USE_SUPER_CLUSTERS
} }
extern "C" extern "C"
void copyDataToCUDADevice(Atom *atom) { void copyDataToCUDADevice(Atom *atom) {
DEBUG_MESSAGE("copyDataToCUDADevice start\r\n");
memcpyToGPU(cuda_cl_x, atom->cl_x, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT)); memcpyToGPU(cuda_cl_x, atom->cl_x, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyToGPU(cuda_cl_v, atom->cl_v, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT)); memcpyToGPU(cuda_cl_v, atom->cl_v, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyToGPU(cuda_cl_f, atom->cl_f, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT)); memcpyToGPU(cuda_cl_f, atom->cl_f, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
@@ -85,13 +119,49 @@ void copyDataToCUDADevice(Atom *atom) {
memcpyToGPU(cuda_PBCx, atom->PBCx, atom->Nclusters_ghost * sizeof(int)); memcpyToGPU(cuda_PBCx, atom->PBCx, atom->Nclusters_ghost * sizeof(int));
memcpyToGPU(cuda_PBCy, atom->PBCy, atom->Nclusters_ghost * sizeof(int)); memcpyToGPU(cuda_PBCy, atom->PBCy, atom->Nclusters_ghost * sizeof(int));
memcpyToGPU(cuda_PBCz, atom->PBCz, atom->Nclusters_ghost * sizeof(int)); memcpyToGPU(cuda_PBCz, atom->PBCz, atom->Nclusters_ghost * sizeof(int));
#ifdef USE_SUPER_CLUSTERS
//alignDataToSuperclusters(atom);
if (cuda_max_scl < atom->Nsclusters_max) {
cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_x));
cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_v));
cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_f));
cuda_max_scl = atom->Nsclusters_max;
cuda_iclusters = (int *) allocateGPU(atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int));
cuda_nclusters = (int *) allocateGPU(atom->Nsclusters_max * sizeof(int));
cuda_scl_x = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
cuda_scl_v = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
cuda_scl_f = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
}
memcpyToGPU(cuda_scl_x, atom->scl_x, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyToGPU(cuda_scl_v, atom->scl_v, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyToGPU(cuda_scl_f, atom->scl_f, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
#endif //USE_SUPER_CLUSTERS
DEBUG_MESSAGE("copyDataToCUDADevice stop\r\n");
} }
extern "C" extern "C"
void copyDataFromCUDADevice(Atom *atom) { void copyDataFromCUDADevice(Atom *atom) {
DEBUG_MESSAGE("copyDataFromCUDADevice start\r\n");
memcpyFromGPU(atom->cl_x, cuda_cl_x, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT)); memcpyFromGPU(atom->cl_x, cuda_cl_x, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyFromGPU(atom->cl_v, cuda_cl_v, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT)); memcpyFromGPU(atom->cl_v, cuda_cl_v, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyFromGPU(atom->cl_f, cuda_cl_f, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT)); memcpyFromGPU(atom->cl_f, cuda_cl_f, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
#ifdef USE_SUPER_CLUSTERS
memcpyFromGPU(atom->scl_x, cuda_scl_x, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyFromGPU(atom->scl_v, cuda_scl_v, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyFromGPU(atom->scl_f, cuda_scl_f, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
//alignDataFromSuperclusters(atom);
#endif //USE_SUPER_CLUSTERS
DEBUG_MESSAGE("copyDataFromCUDADevice stop\r\n");
} }
extern "C" extern "C"
@@ -109,6 +179,12 @@ void cudaDeviceFree() {
cuda_assert("cudaDeviceFree", cudaFree(cuda_PBCz)); cuda_assert("cudaDeviceFree", cudaFree(cuda_PBCz));
free(natoms); free(natoms);
free(ngatoms); free(ngatoms);
#ifdef USE_SUPER_CLUSTERS
cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_x));
cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_v));
cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_f));
#endif //USE_SUPER_CLUSTERS
} }
__global__ void cudaInitialIntegrate_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f, __global__ void cudaInitialIntegrate_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
@@ -165,6 +241,39 @@ __global__ void cudaUpdatePbc_warp(MD_FLOAT *cuda_cl_x, int *cuda_border_map,
} }
} }
__global__ void cudaUpdatePbcSup_warp(MD_FLOAT *cuda_cl_x, int *cuda_border_map,
int *cuda_jclusters_natoms,
int *cuda_PBCx,
int *cuda_PBCy,
int *cuda_PBCz,
int Nsclusters_local,
int Nclusters_ghost,
MD_FLOAT param_xprd,
MD_FLOAT param_yprd,
MD_FLOAT param_zprd) {
unsigned int cg = blockDim.x * blockIdx.x + threadIdx.x;
if (cg >= Nclusters_ghost) return;
//int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
int jfac = SCLUSTER_SIZE / CLUSTER_M;
int ncj = Nsclusters_local / jfac;
MD_FLOAT xprd = param_xprd;
MD_FLOAT yprd = param_yprd;
MD_FLOAT zprd = param_zprd;
const int cj = ncj + cg;
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int bmap_vec_base = CJ_VECTOR_BASE_INDEX(cuda_border_map[cg]);
MD_FLOAT *cj_x = &cuda_cl_x[cj_vec_base];
MD_FLOAT *bmap_x = &cuda_cl_x[bmap_vec_base];
for(int cjj = 0; cjj < cuda_jclusters_natoms[cg]; cjj++) {
cj_x[CL_X_OFFSET + cjj] = bmap_x[CL_X_OFFSET + cjj] + cuda_PBCx[cg] * xprd;
cj_x[CL_Y_OFFSET + cjj] = bmap_x[CL_Y_OFFSET + cjj] + cuda_PBCy[cg] * yprd;
cj_x[CL_Z_OFFSET + cjj] = bmap_x[CL_Z_OFFSET + cjj] + cuda_PBCz[cg] * zprd;
}
}
__global__ void computeForceLJ_cuda_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_f, __global__ void computeForceLJ_cuda_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_f,
int Nclusters_local, int Nclusters_max, int Nclusters_local, int Nclusters_max,
int *cuda_numneigh, int *cuda_neighs, int half_neigh, int maxneighs, int *cuda_numneigh, int *cuda_neighs, int half_neigh, int maxneighs,
@@ -251,9 +360,17 @@ extern "C"
void cudaInitialIntegrate(Parameter *param, Atom *atom) { void cudaInitialIntegrate(Parameter *param, Atom *atom) {
const int threads_num = 16; const int threads_num = 16;
dim3 block_size = dim3(threads_num, 1, 1); dim3 block_size = dim3(threads_num, 1, 1);
#ifdef USE_SUPER_CLUSTERS
dim3 grid_size = dim3(atom->Nsclusters_local/(threads_num)+1, 1, 1);
cudaInitialIntegrateSup_warp<<<grid_size, block_size>>>(cuda_scl_x, cuda_scl_v, cuda_scl_f,
cuda_nclusters,
cuda_natoms, atom->Nsclusters_local, param->dtforce, param->dt);
#else
dim3 grid_size = dim3(atom->Nclusters_local/(threads_num)+1, 1, 1); dim3 grid_size = dim3(atom->Nclusters_local/(threads_num)+1, 1, 1);
cudaInitialIntegrate_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_cl_v, cuda_cl_f, cudaInitialIntegrate_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_cl_v, cuda_cl_f,
cuda_natoms, atom->Nclusters_local, param->dtforce, param->dt); cuda_natoms, atom->Nclusters_local, param->dtforce, param->dt);
#endif //USE_SUPER_CLUSTERS
cuda_assert("cudaInitialIntegrate", cudaPeekAtLastError()); cuda_assert("cudaInitialIntegrate", cudaPeekAtLastError());
cuda_assert("cudaInitialIntegrate", cudaDeviceSynchronize()); cuda_assert("cudaInitialIntegrate", cudaDeviceSynchronize());
} }
@@ -264,11 +381,19 @@ extern "C"
void cudaUpdatePbc(Atom *atom, Parameter *param) { void cudaUpdatePbc(Atom *atom, Parameter *param) {
const int threads_num = 512; const int threads_num = 512;
dim3 block_size = dim3(threads_num, 1, 1);; dim3 block_size = dim3(threads_num, 1, 1);;
dim3 grid_size = dim3(atom->Nclusters_ghost/(threads_num)+1, 1, 1);; dim3 grid_size = dim3(atom->Nclusters_ghost/(threads_num)+1, 1, 1);
#ifdef USE_SUPER_CLUSTERS
cudaUpdatePbcSup_warp<<<grid_size, block_size>>>(cuda_scl_x, cuda_border_map,
cuda_jclusters_natoms, cuda_PBCx, cuda_PBCy, cuda_PBCz,
atom->Nclusters_local, atom->Nclusters_ghost,
param->xprd, param->yprd, param->zprd);
#else
cudaUpdatePbc_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_border_map, cudaUpdatePbc_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_border_map,
cuda_jclusters_natoms, cuda_PBCx, cuda_PBCy, cuda_PBCz, cuda_jclusters_natoms, cuda_PBCx, cuda_PBCy, cuda_PBCz,
atom->Nclusters_local, atom->Nclusters_ghost, atom->Nclusters_local, atom->Nclusters_ghost,
param->xprd, param->yprd, param->zprd); param->xprd, param->yprd, param->zprd);
#endif //USE_SUPER_CLUSTERS
cuda_assert("cudaUpdatePbc", cudaPeekAtLastError()); cuda_assert("cudaUpdatePbc", cudaPeekAtLastError());
cuda_assert("cudaUpdatePbc", cudaDeviceSynchronize()); cuda_assert("cudaUpdatePbc", cudaDeviceSynchronize());
} }
@@ -310,8 +435,17 @@ extern "C"
void cudaFinalIntegrate(Parameter *param, Atom *atom) { void cudaFinalIntegrate(Parameter *param, Atom *atom) {
const int threads_num = 16; const int threads_num = 16;
dim3 block_size = dim3(threads_num, 1, 1); dim3 block_size = dim3(threads_num, 1, 1);
#ifdef USE_SUPER_CLUSTERS
dim3 grid_size = dim3(atom->Nsclusters_local/(threads_num)+1, 1, 1);
cudaFinalIntegrateSup_warp<<<grid_size, block_size>>>(cuda_scl_v, cuda_scl_f,
cuda_nclusters, cuda_natoms,
atom->Nsclusters_local, param->dt);
#else
dim3 grid_size = dim3(atom->Nclusters_local/(threads_num)+1, 1, 1); dim3 grid_size = dim3(atom->Nclusters_local/(threads_num)+1, 1, 1);
cudaFinalIntegrate_warp<<<grid_size, block_size>>>(cuda_cl_v, cuda_cl_f, cuda_natoms, atom->Nclusters_local, param->dt); cudaFinalIntegrate_warp<<<grid_size, block_size>>>(cuda_cl_v, cuda_cl_f, cuda_natoms,
atom->Nclusters_local, param->dt);
#endif //USE_SUPER_CLUSTERS
cuda_assert("cudaFinalIntegrate", cudaPeekAtLastError()); cuda_assert("cudaFinalIntegrate", cudaPeekAtLastError());
cuda_assert("cudaFinalIntegrate", cudaDeviceSynchronize()); cuda_assert("cudaFinalIntegrate", cudaDeviceSynchronize());
} }

View File

@@ -0,0 +1,288 @@
extern "C" {
#include <stdio.h>
//---
#include <cuda.h>
#include <driver_types.h>
//---
#include <likwid-marker.h>
//---
#include <atom.h>
#include <device.h>
#include <neighbor.h>
#include <parameter.h>
#include <stats.h>
#include <timing.h>
#include <util.h>
}
extern "C" {
extern MD_FLOAT *cuda_cl_x;
extern MD_FLOAT *cuda_cl_v;
extern MD_FLOAT *cuda_cl_f;
extern int *cuda_neighbors;
extern int *cuda_numneigh;
extern int *cuda_natoms;
extern int *natoms;
extern int *ngatoms;
extern int *cuda_border_map;
extern int *cuda_jclusters_natoms;
extern MD_FLOAT *cuda_bbminx, *cuda_bbmaxx;
extern MD_FLOAT *cuda_bbminy, *cuda_bbmaxy;
extern MD_FLOAT *cuda_bbminz, *cuda_bbmaxz;
extern int *cuda_PBCx, *cuda_PBCy, *cuda_PBCz;
extern int isReneighboured;
extern int *cuda_iclusters;
extern int *cuda_nclusters;
extern MD_FLOAT *cuda_scl_x;
extern MD_FLOAT *cuda_scl_v;
extern MD_FLOAT *cuda_scl_f;
}
#ifdef USE_SUPER_CLUSTERS
extern "C"
// Pack per-cluster SoA arrays (cl_x/cl_v/cl_f) into the contiguous
// super-cluster layout (scl_x/scl_v/scl_f) on the host.
// Each super cluster occupies SCLUSTER_SIZE * 3 * CLUSTER_M floats; within it
// the X parts of all member clusters come first, then Y, then Z (stride
// SCLUSTER_SIZE * CLUSTER_M between the coordinate planes).
// NOTE(review): member clusters are resolved through atom->icluster_idx
// (flattened [sci][ci]); the commented-out variant used siclusters[].iclusters.
void alignDataToSuperclusters(Atom *atom) {
    for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
        // Base offset of this super cluster inside the scl_* arrays.
        const unsigned int scl_offset = sci * SCLUSTER_SIZE * 3 * CLUSTER_M;

        for (int ci = 0, scci = scl_offset; ci < atom->siclusters[sci].nclusters; ci++, scci += CLUSTER_M) {
            MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
            MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
            MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];

            /*
            MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
            MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
            MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
            */

            // Copy the X, Y and Z planes (CLUSTER_M atoms each) of this member
            // cluster into its slot in the super-cluster layout.
            memcpy(&atom->scl_x[scci], &ci_x[0], CLUSTER_M * sizeof(MD_FLOAT));
            memcpy(&atom->scl_x[scci + SCLUSTER_SIZE * CLUSTER_M], &ci_x[0 + CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
            memcpy(&atom->scl_x[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], &ci_x[0 + 2 * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));

            memcpy(&atom->scl_v[scci], &ci_v[0], CLUSTER_M * sizeof(MD_FLOAT));
            memcpy(&atom->scl_v[scci + SCLUSTER_SIZE * CLUSTER_M], &ci_v[0 + CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
            memcpy(&atom->scl_v[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], &ci_v[0 + 2 * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));

            memcpy(&atom->scl_f[scci], &ci_f[0], CLUSTER_M * sizeof(MD_FLOAT));
            memcpy(&atom->scl_f[scci + SCLUSTER_SIZE * CLUSTER_M], &ci_f[0 + CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
            memcpy(&atom->scl_f[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], &ci_f[0 + 2 * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
        }
    }
}
extern "C"
// Inverse of alignDataToSuperclusters: scatter the contiguous super-cluster
// arrays (scl_x/scl_v/scl_f) back into the per-cluster SoA arrays
// (cl_x/cl_v/cl_f). Same layout assumptions: X/Y/Z planes of CLUSTER_M atoms,
// plane stride SCLUSTER_SIZE * CLUSTER_M within a super cluster.
void alignDataFromSuperclusters(Atom *atom) {
    for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
        // Base offset of this super cluster inside the scl_* arrays.
        const unsigned int scl_offset = sci * SCLUSTER_SIZE * 3 * CLUSTER_M;

        for (int ci = 0, scci = scl_offset; ci < atom->siclusters[sci].nclusters; ci++, scci += CLUSTER_M) {
            MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
            MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
            MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];

            /*
            MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
            MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
            MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
            */

            // Copy each coordinate plane back to the member cluster's storage.
            memcpy(&ci_x[0], &atom->scl_x[scci], CLUSTER_M * sizeof(MD_FLOAT));
            memcpy(&ci_x[0 + CLUSTER_M], &atom->scl_x[scci + SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
            memcpy(&ci_x[0 + 2 * CLUSTER_M], &atom->scl_x[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));

            memcpy(&ci_v[0], &atom->scl_v[scci], CLUSTER_M * sizeof(MD_FLOAT));
            memcpy(&ci_v[0 + CLUSTER_M], &atom->scl_v[scci + SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
            memcpy(&ci_v[0 + 2 * CLUSTER_M], &atom->scl_v[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));

            memcpy(&ci_f[0], &atom->scl_f[scci], CLUSTER_M * sizeof(MD_FLOAT));
            memcpy(&ci_f[0 + CLUSTER_M], &atom->scl_f[scci + SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
            memcpy(&ci_f[0 + 2 * CLUSTER_M], &atom->scl_f[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
        }
    }
}
// CUDA kernel: first half-step of the integrator for super clusters.
// One thread integrates all SCLUSTER_M atom slots of one super cluster:
// v += dtforce * f, then x += dt * v.
// NOTE(review): cuda_nclusters/cuda_natoms are currently unused (the per-atom
// bounds checks are commented out), so padding slots are integrated too —
// presumably they hold zeros; confirm.
__global__ void cudaInitialIntegrateSup_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
    int *cuda_nclusters,
    int *cuda_natoms,
    int Nsclusters_local, MD_FLOAT dtforce, MD_FLOAT dt) {

    // sci_pos = super-cluster index handled by this thread
    unsigned int sci_pos = blockDim.x * blockIdx.x + threadIdx.x;
    //unsigned int cii_pos = blockDim.y * blockIdx.y + threadIdx.y;
    if (sci_pos >= Nsclusters_local) return;

    //unsigned int ci_pos = cii_pos / CLUSTER_M;
    //unsigned int scii_pos = cii_pos % CLUSTER_M;

    //if (ci_pos >= cuda_nclusters[sci_pos]) return;
    //if (scii_pos >= cuda_natoms[ci_pos]) return;

    // Base index of this super cluster in the scl_* arrays.
    int ci_vec_base = SCI_VECTOR_BASE_INDEX(sci_pos);
    MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base];
    MD_FLOAT *ci_v = &cuda_cl_v[ci_vec_base];
    MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];

    for (int scii_pos = 0; scii_pos < SCLUSTER_M; scii_pos++) {
        // Kick: update velocity from the force.
        ci_v[SCL_X_OFFSET + scii_pos] += dtforce * ci_f[SCL_X_OFFSET + scii_pos];
        ci_v[SCL_Y_OFFSET + scii_pos] += dtforce * ci_f[SCL_Y_OFFSET + scii_pos];
        ci_v[SCL_Z_OFFSET + scii_pos] += dtforce * ci_f[SCL_Z_OFFSET + scii_pos];
        // Drift: update position from the new velocity.
        ci_x[SCL_X_OFFSET + scii_pos] += dt * ci_v[SCL_X_OFFSET + scii_pos];
        ci_x[SCL_Y_OFFSET + scii_pos] += dt * ci_v[SCL_Y_OFFSET + scii_pos];
        ci_x[SCL_Z_OFFSET + scii_pos] += dt * ci_v[SCL_Z_OFFSET + scii_pos];
    }
}
// CUDA kernel: second half-step of the integrator for super clusters.
// One thread updates velocities of all SCLUSTER_M atom slots of one super
// cluster: v += dtforce * f.
// NOTE(review): the host launch site passes param->dt into the dtforce slot —
// verify the intended scaling factor.
// NOTE(review): cuda_nclusters/cuda_natoms are unused (bounds checks commented
// out), so padding slots are updated too — presumably zero; confirm.
__global__ void cudaFinalIntegrateSup_warp(MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
    int *cuda_nclusters, int *cuda_natoms,
    int Nsclusters_local, MD_FLOAT dtforce) {

    // sci_pos = super-cluster index handled by this thread
    unsigned int sci_pos = blockDim.x * blockIdx.x + threadIdx.x;
    //unsigned int cii_pos = blockDim.y * blockIdx.y + threadIdx.y;
    if (sci_pos >= Nsclusters_local) return;

    //unsigned int ci_pos = cii_pos / CLUSTER_M;
    //unsigned int scii_pos = cii_pos % CLUSTER_M;

    //if (ci_pos >= cuda_nclusters[sci_pos]) return;
    //if (scii_pos >= cuda_natoms[ci_pos]) return;

    // Base index of this super cluster in the scl_* arrays.
    int ci_vec_base = SCI_VECTOR_BASE_INDEX(sci_pos);
    MD_FLOAT *ci_v = &cuda_cl_v[ci_vec_base];
    MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];

    for (int scii_pos = 0; scii_pos < SCLUSTER_M; scii_pos++) {
        ci_v[SCL_X_OFFSET + scii_pos] += dtforce * ci_f[SCL_X_OFFSET + scii_pos];
        ci_v[SCL_Y_OFFSET + scii_pos] += dtforce * ci_f[SCL_Y_OFFSET + scii_pos];
        ci_v[SCL_Z_OFFSET + scii_pos] += dtforce * ci_f[SCL_Z_OFFSET + scii_pos];
    }
}
// CUDA kernel: Lennard-Jones force computation on the super-cluster layout.
// Thread mapping: x = super cluster (sci_pos), y = atom slot within the super
// cluster (scii_pos, decomposed into member cluster ci_pos and lane cii_pos),
// z = j-cluster lane (cjj_pos). Forces are accumulated with atomicAdd because
// multiple threads may write the same atom slot.
// NOTE(review): the code itself marks the j-cluster addressing as WIP ("not
// reachable right now"); see also the offset remarks below.
__global__ void computeForceLJSup_cuda_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_f,
    int *cuda_nclusters, int *cuda_iclusters,
    int Nsclusters_local,
    int *cuda_numneigh, int *cuda_neighs, int half_neigh, int maxneighs,
    MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon) {

    unsigned int sci_pos = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int scii_pos = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int cjj_pos = blockDim.z * blockIdx.z + threadIdx.z;
    if ((sci_pos >= Nsclusters_local) || (scii_pos >= SCLUSTER_M) || (cjj_pos >= CLUSTER_N)) return;

    // Split the super-cluster atom slot into member cluster + lane.
    unsigned int ci_pos = scii_pos / CLUSTER_M;
    unsigned int cii_pos = scii_pos % CLUSTER_M;
    if (ci_pos >= cuda_nclusters[sci_pos]) return;  // skip padding clusters

    int ci_cj0 = CJ0_FROM_CI(ci_pos);
    int ci_vec_base = SCI_VECTOR_BASE_INDEX(sci_pos);
    MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base];
    MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];

    //int numneighs = cuda_numneigh[ci_pos];
    // Neighbor data is indexed by the member cluster's global i-cluster id.
    int numneighs = cuda_numneigh[cuda_iclusters[SCLUSTER_SIZE * sci_pos + ci_pos]];
    for(int k = 0; k < numneighs; k++) {
        int glob_j = (&cuda_neighs[cuda_iclusters[SCLUSTER_SIZE * sci_pos + ci_pos] * maxneighs])[k];
        // Map the global j-cluster id to its slot inside its super cluster.
        int scj = glob_j / SCLUSTER_SIZE;

        // TODO Make cj accessible from super cluster data alignment (not reachable right now)
        int cj = SCJ_VECTOR_BASE_INDEX(scj) + CLUSTER_M * (glob_j % SCLUSTER_SIZE);

        int cj_vec_base = cj;
        MD_FLOAT *cj_x = &cuda_cl_x[cj_vec_base];
        MD_FLOAT *cj_f = &cuda_cl_f[cj_vec_base];

        MD_FLOAT xtmp = ci_x[SCL_CL_X_OFFSET(ci_pos) + cii_pos];
        MD_FLOAT ytmp = ci_x[SCL_CL_Y_OFFSET(ci_pos) + cii_pos];
        MD_FLOAT ztmp = ci_x[SCL_CL_Z_OFFSET(ci_pos) + cii_pos];

        MD_FLOAT fix = 0;
        MD_FLOAT fiy = 0;
        MD_FLOAT fiz = 0;

        //int cond = ci_cj0 != cj || cii_pos != cjj_pos || scj != sci_pos;
        // Exclude the self-pair (same cluster AND same lane).
        int cond = (glob_j != cuda_iclusters[SCLUSTER_SIZE * sci_pos + ci_pos] && cii_pos != cjj_pos);
        if(cond) {
            // NOTE(review): cj_x is indexed with SCL_CL_*_OFFSET(ci_pos) — the
            // i-cluster's slot, not the j-cluster's — likely tied to the TODO
            // above; confirm once the cj alignment is in place.
            MD_FLOAT delx = xtmp - cj_x[SCL_CL_X_OFFSET(ci_pos) + cjj_pos];
            MD_FLOAT dely = ytmp - cj_x[SCL_CL_Y_OFFSET(ci_pos) + cjj_pos];
            MD_FLOAT delz = ztmp - cj_x[SCL_CL_Z_OFFSET(ci_pos) + cjj_pos];
            MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
            if(rsq < cutforcesq) {
                // Standard 12-6 LJ force magnitude divided by r.
                MD_FLOAT sr2 = 1.0 / rsq;
                MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
                MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;

                if(half_neigh) {
                    // Half neighbor lists: apply Newton's third law to cj.
                    atomicAdd(&cj_f[SCL_CL_X_OFFSET(ci_pos) + cjj_pos], -delx * force);
                    atomicAdd(&cj_f[SCL_CL_Y_OFFSET(ci_pos) + cjj_pos], -dely * force);
                    atomicAdd(&cj_f[SCL_CL_Z_OFFSET(ci_pos) + cjj_pos], -delz * force);
                }

                // NOTE(review): fix/fiy/fiz are reset every k iteration, so the
                // atomicAdd below happens once per interacting neighbor rather
                // than once per thread — correct, but not an accumulator.
                fix += delx * force;
                fiy += dely * force;
                fiz += delz * force;

                atomicAdd(&ci_f[SCL_CL_X_OFFSET(ci_pos) + cii_pos], fix);
                atomicAdd(&ci_f[SCL_CL_Y_OFFSET(ci_pos) + cii_pos], fiy);
                atomicAdd(&ci_f[SCL_CL_Z_OFFSET(ci_pos) + cii_pos], fiz);
            }
        }
    }
}
extern "C"
// Host driver for the super-cluster LJ force kernel.
// Zeroes the device force buffer, uploads neighbor/cluster metadata after a
// reneighboring step, then launches computeForceLJSup_cuda_warp with one
// thread per (super cluster, atom slot, j-lane) triple.
// Returns the wall-clock time spent (including the device synchronize).
double computeForceLJSup_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
    DEBUG_MESSAGE("computeForceLJSup_cuda start\r\n");

    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
    MD_FLOAT sigma6 = param->sigma6;
    MD_FLOAT epsilon = param->epsilon;

    // BUGFIX: the kernel accumulates into the super-cluster force buffer
    // cuda_scl_f, but this previously cleared cuda_cl_f (the plain-cluster
    // buffer) with the plain-cluster extent, leaving stale forces from the
    // previous step in cuda_scl_f. Clear the buffer the kernel actually uses,
    // sized per the super-cluster layout (SCLUSTER_SIZE * 3 * CLUSTER_M floats
    // per super cluster, as used by alignDataToSuperclusters).
    memsetGPU(cuda_scl_f, 0, atom->Nsclusters_max * SCLUSTER_SIZE * CLUSTER_M * 3 * sizeof(MD_FLOAT));

    if (isReneighboured) {
        // Upload per-i-cluster neighbor counts and lists.
        for(int ci = 0; ci < atom->Nclusters_local; ci++) {
            memcpyToGPU(&cuda_numneigh[ci], &neighbor->numneigh[ci], sizeof(int));
            memcpyToGPU(&cuda_neighbors[ci * neighbor->maxneighs], &neighbor->neighbors[ci * neighbor->maxneighs], neighbor->numneigh[ci] * sizeof(int));
        }

        // Upload per-super-cluster member counts and the member-index table.
        for(int sci = 0; sci < atom->Nsclusters_local; sci++) {
            memcpyToGPU(&cuda_nclusters[sci], &atom->siclusters[sci].nclusters, sizeof(int));
            //memcpyToGPU(&cuda_iclusters[sci * SCLUSTER_SIZE], &atom->siclusters[sci].iclusters, sizeof(int) * atom->siclusters[sci].nclusters);
        }

        memcpyToGPU(cuda_iclusters, atom->icluster_idx, atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int));

        isReneighboured = 0;
    }

    // Launch geometry: x covers super clusters, y the SCLUSTER_M atom slots,
    // z the CLUSTER_N j-cluster lanes.
    const int threads_num = 1;
    dim3 block_size = dim3(threads_num, SCLUSTER_M, CLUSTER_N);
    dim3 grid_size = dim3(atom->Nsclusters_local/threads_num+1, 1, 1);
    double S = getTimeStamp();
    LIKWID_MARKER_START("force");

    computeForceLJSup_cuda_warp<<<grid_size, block_size>>>(cuda_scl_x, cuda_scl_f,
        cuda_nclusters, cuda_iclusters,
        atom->Nsclusters_local,
        cuda_numneigh, cuda_neighbors,
        neighbor->half_neigh, neighbor->maxneighs, cutforcesq,
        sigma6, epsilon);

    cuda_assert("computeForceLJ_cuda", cudaPeekAtLastError());
    cuda_assert("computeForceLJ_cuda", cudaDeviceSynchronize());

    LIKWID_MARKER_STOP("force");
    double E = getTimeStamp();
    DEBUG_MESSAGE("computeForceLJSup_cuda stop\r\n");
    return E-S;
}
#endif //USE_SUPER_CLUSTERS

View File

@@ -16,32 +16,6 @@
#include <simd.h> #include <simd.h>
/*
static inline void gmx_load_simd_2xnn_interactions(
int excl,
MD_SIMD_BITMASK filter0, MD_SIMD_BITMASK filter2,
MD_SIMD_MASK *interact0, MD_SIMD_MASK *interact2) {
//SimdInt32 mask_pr_S(excl);
MD_SIMD_INT32 mask_pr_S = simd_int32_broadcast(excl);
*interact0 = cvtIB2B(simd_test_bits(mask_pr_S & filter0));
*interact2 = cvtIB2B(simd_test_bits(mask_pr_S & filter2));
}
static inline void gmx_load_simd_4xn_interactions(
int excl,
MD_SIMD_BITMASK filter0, MD_SIMD_BITMASK filter1, MD_SIMD_BITMASK filter2, MD_SIMD_BITMASK filter3,
MD_SIMD_MASK *interact0, MD_SIMD_MASK *interact1, MD_SIMD_MASK *interact2, MD_SIMD_MASK *interact3) {
//SimdInt32 mask_pr_S(excl);
MD_SIMD_INT32 mask_pr_S = simd_int32_broadcast(excl);
*interact0 = cvtIB2B(simd_test_bits(mask_pr_S & filter0));
*interact1 = cvtIB2B(simd_test_bits(mask_pr_S & filter1));
*interact2 = cvtIB2B(simd_test_bits(mask_pr_S & filter2));
*interact3 = cvtIB2B(simd_test_bits(mask_pr_S & filter3));
}
*/
double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJ begin\n"); DEBUG_MESSAGE("computeForceLJ begin\n");
int Nlocal = atom->Nlocal; int Nlocal = atom->Nlocal;
@@ -61,12 +35,9 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
} }
double S = getTimeStamp(); double S = getTimeStamp();
#pragma omp parallel
{
LIKWID_MARKER_START("force"); LIKWID_MARKER_START("force");
#pragma omp for schedule(runtime) #pragma omp parallel for
for(int ci = 0; ci < atom->Nclusters_local; ci++) { for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_cj0 = CJ0_FROM_CI(ci); int ci_cj0 = CJ0_FROM_CI(ci);
int ci_cj1 = CJ1_FROM_CI(ci); int ci_cj1 = CJ1_FROM_CI(ci);
@@ -148,8 +119,6 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
} }
LIKWID_MARKER_STOP("force"); LIKWID_MARKER_STOP("force");
}
double E = getTimeStamp(); double E = getTimeStamp();
DEBUG_MESSAGE("computeForceLJ end\n"); DEBUG_MESSAGE("computeForceLJ end\n");
return E-S; return E-S;
@@ -167,6 +136,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon); MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0); MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5); MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
for(int ci = 0; ci < atom->Nclusters_local; ci++) { for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
@@ -179,41 +149,9 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
} }
double S = getTimeStamp(); double S = getTimeStamp();
#pragma omp parallel
{
LIKWID_MARKER_START("force"); LIKWID_MARKER_START("force");
/* #pragma omp parallel for
MD_SIMD_BITMASK filter0 = simd_load_bitmask((const int *) &atom->exclusion_filter[0 * (VECTOR_WIDTH / UNROLL_J)]);
MD_SIMD_BITMASK filter2 = simd_load_bitmask((const int *) &atom->exclusion_filter[2 * (VECTOR_WIDTH / UNROLL_J)]);
MD_SIMD_FLOAT diagonal_jmi_S = simd_load(atom->diagonal_2xnn_j_minus_i);
MD_SIMD_FLOAT zero_S = simd_broadcast(0.0);
MD_SIMD_FLOAT one_S = simd_broadcast(1.0);
#if CLUSTER_M <= CLUSTER_N
MD_SIMD_MASK diagonal_mask0, diagonal_mask2;
diagonal_mask0 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
diagonal_jmi_S = diagonal_jmi_S - one_S;
diagonal_jmi_S = diagonal_jmi_S - one_S;
diagonal_mask2 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
#else
MD_SIMD_MASK diagonal_mask00, diagonal_mask02, diagonal_mask10, diagonal_mask12;
diagonal_mask00 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
diagonal_jmi_S = diagonal_jmi_S - one_S;
diagonal_jmi_S = diagonal_jmi_S - one_S;
diagonal_mask02 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
diagonal_jmi_S = diagonal_jmi_S - one_S;
diagonal_jmi_S = diagonal_jmi_S - one_S;
diagonal_mask10 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
diagonal_jmi_S = diagonal_jmi_S - one_S;
diagonal_jmi_S = diagonal_jmi_S - one_S;
diagonal_mask12 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
#endif
*/
#pragma omp for schedule(runtime)
for(int ci = 0; ci < atom->Nclusters_local; ci++) { for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_cj0 = CJ0_FROM_CI(ci); int ci_cj0 = CJ0_FROM_CI(ci);
#if CLUSTER_M > CLUSTER_N #if CLUSTER_M > CLUSTER_N
@@ -224,7 +162,6 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
neighs = &neighbor->neighbors[ci * neighbor->maxneighs]; neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
int numneighs = neighbor->numneigh[ci]; int numneighs = neighbor->numneigh[ci];
int numneighs_masked = neighbor->numneigh_masked[ci];
MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]); MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]);
MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]); MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]);
@@ -239,138 +176,76 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
MD_SIMD_FLOAT fiy2 = simd_zero(); MD_SIMD_FLOAT fiy2 = simd_zero();
MD_SIMD_FLOAT fiz2 = simd_zero(); MD_SIMD_FLOAT fiz2 = simd_zero();
for(int k = 0; k < numneighs_masked; k++) { for(int k = 0; k < numneighs; k++) {
int cj = neighs[k]; int cj = neighs[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
//int imask = neighs_imask[k];
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base]; MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
//MD_SIMD_MASK interact0; unsigned int mask0, mask1, mask2, mask3;
//MD_SIMD_MASK interact2;
//gmx_load_simd_2xnn_interactions((int)imask, filter0, filter2, &interact0, &interact2);
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]); MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]); MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]); MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp; MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp; MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp; MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp; MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp; MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp; MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
#if CLUSTER_M == CLUSTER_N #if CLUSTER_M == CLUSTER_N
unsigned int cond0 = (unsigned int)(cj == ci_cj0); unsigned int cond0 = (unsigned int)(cj == ci_cj0);
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_2xnn_hn[cond0 * 2 + 0]); mask0 = (unsigned int)(0xf - 0x1 * cond0);
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_2xnn_hn[cond0 * 2 + 1]); mask1 = (unsigned int)(0xf - 0x3 * cond0);
#else mask2 = (unsigned int)(0xf - 0x7 * cond0);
#if CLUSTER_M < CLUSTER_N mask3 = (unsigned int)(0xf - 0xf * cond0);
#elif CLUSTER_M < CLUSTER_N
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci); unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci); unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
#else #else
unsigned int cond0 = (unsigned int)(cj == ci_cj0); unsigned int cond0 = (unsigned int)(cj == ci_cj0);
unsigned int cond1 = (unsigned int)(cj == ci_cj1); unsigned int cond1 = (unsigned int)(cj == ci_cj1);
#endif mask0 = (unsigned int)(0x3 - 0x1 * cond0);
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 0]); mask1 = (unsigned int)(0x3 - 0x3 * cond0);
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 1]); mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1);
mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1);
#endif #endif
MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec); MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0);
MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec); MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2);
cutoff_mask0 = simd_mask_and(cutoff_mask0, excl_mask0);
cutoff_mask2 = simd_mask_and(cutoff_mask2, excl_mask2);
/* MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
#if CLUSTER_M <= CLUSTER_N MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
if(ci == ci_cj0) {
cutoff_mask0 = simd_mask_and(cutoff_mask0, diagonal_mask0); MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
cutoff_mask2 = simd_mask_and(cutoff_mask2, diagonal_mask2); MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
}
#else
if(ci == ci_cj0) {
cutoff_mask0 = cutoff_mask0 && diagonal_mask00;
cutoff_mask2 = cutoff_mask2 && diagonal_mask02;
} else if(ci == ci_cj1) {
cutoff_mask0 = cutoff_mask0 && diagonal_mask10;
cutoff_mask2 = cutoff_mask2 && diagonal_mask12;
}
#endif
*/
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0); MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2); MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
MD_SIMD_FLOAT tx0 = select_by_mask(delx0 * force0, cutoff_mask0); MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
MD_SIMD_FLOAT ty0 = select_by_mask(dely0 * force0, cutoff_mask0); MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
MD_SIMD_FLOAT tz0 = select_by_mask(delz0 * force0, cutoff_mask0);
MD_SIMD_FLOAT tx2 = select_by_mask(delx2 * force2, cutoff_mask2);
MD_SIMD_FLOAT ty2 = select_by_mask(dely2 * force2, cutoff_mask2);
MD_SIMD_FLOAT tz2 = select_by_mask(delz2 * force2, cutoff_mask2);
fix0 += tx0; MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
fiy0 += ty0; MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
fiz0 += tz0;
fix2 += tx2;
fiy2 += ty2;
fiz2 += tz2;
#ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ MD_SIMD_FLOAT tx0 = select_by_mask(simd_mul(delx0, force0), cutoff_mask0);
if(cj < CJ1_FROM_CI(atom->Nlocal)) { MD_SIMD_FLOAT ty0 = select_by_mask(simd_mul(dely0, force0), cutoff_mask0);
simd_h_decr3(cj_f, tx0 + tx2, ty0 + ty2, tz0 + tz2); MD_SIMD_FLOAT tz0 = select_by_mask(simd_mul(delz0, force0), cutoff_mask0);
} MD_SIMD_FLOAT tx2 = select_by_mask(simd_mul(delx2, force2), cutoff_mask2);
#else MD_SIMD_FLOAT ty2 = select_by_mask(simd_mul(dely2, force2), cutoff_mask2);
simd_h_decr3(cj_f, tx0 + tx2, ty0 + ty2, tz0 + tz2); MD_SIMD_FLOAT tz2 = select_by_mask(simd_mul(delz2, force2), cutoff_mask2);
#endif
}
for(int k = numneighs_masked; k < numneighs; k++) { fix0 = simd_add(fix0, tx0);
int cj = neighs[k]; fiy0 = simd_add(fiy0, ty0);
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); fiz0 = simd_add(fiz0, tz0);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; fix2 = simd_add(fix2, tx2);
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base]; fiy2 = simd_add(fiy2, ty2);
fiz2 = simd_add(fiz2, tz2);
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec);
MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec);
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
MD_SIMD_FLOAT tx0 = select_by_mask(delx0 * force0, cutoff_mask0);
MD_SIMD_FLOAT ty0 = select_by_mask(dely0 * force0, cutoff_mask0);
MD_SIMD_FLOAT tz0 = select_by_mask(delz0 * force0, cutoff_mask0);
MD_SIMD_FLOAT tx2 = select_by_mask(delx2 * force2, cutoff_mask2);
MD_SIMD_FLOAT ty2 = select_by_mask(dely2 * force2, cutoff_mask2);
MD_SIMD_FLOAT tz2 = select_by_mask(delz2 * force2, cutoff_mask2);
fix0 += tx0;
fiy0 += ty0;
fiz0 += tz0;
fix2 += tx2;
fiy2 += ty2;
fiz2 += tz2;
#ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ #ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ
if(cj < CJ1_FROM_CI(atom->Nlocal)) { if(cj < CJ1_FROM_CI(atom->Nlocal)) {
@@ -391,8 +266,6 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
} }
LIKWID_MARKER_STOP("force"); LIKWID_MARKER_STOP("force");
}
double E = getTimeStamp(); double E = getTimeStamp();
DEBUG_MESSAGE("computeForceLJ_2xnn end\n"); DEBUG_MESSAGE("computeForceLJ_2xnn end\n");
return E-S; return E-S;
@@ -410,6 +283,7 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon); MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0); MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5); MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
for(int ci = 0; ci < atom->Nclusters_local; ci++) { for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
@@ -422,12 +296,9 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
} }
double S = getTimeStamp(); double S = getTimeStamp();
#pragma omp parallel
{
LIKWID_MARKER_START("force"); LIKWID_MARKER_START("force");
#pragma omp for schedule(runtime) #pragma omp parallel for
for(int ci = 0; ci < atom->Nclusters_local; ci++) { for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_cj0 = CJ0_FROM_CI(ci); int ci_cj0 = CJ0_FROM_CI(ci);
#if CLUSTER_M > CLUSTER_N #if CLUSTER_M > CLUSTER_N
@@ -438,7 +309,6 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
neighs = &neighbor->neighbors[ci * neighbor->maxneighs]; neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
int numneighs = neighbor->numneigh[ci]; int numneighs = neighbor->numneigh[ci];
int numneighs_masked = neighbor->numneigh_masked[ci];
MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]); MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]);
MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]); MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]);
@@ -453,7 +323,7 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
MD_SIMD_FLOAT fiy2 = simd_zero(); MD_SIMD_FLOAT fiy2 = simd_zero();
MD_SIMD_FLOAT fiz2 = simd_zero(); MD_SIMD_FLOAT fiz2 = simd_zero();
for(int k = 0; k < numneighs_masked; k++) { for(int k = 0; k < numneighs; k++) {
int cj = neighs[k]; int cj = neighs[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
@@ -462,75 +332,52 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]); MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]); MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]); MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp; MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp; MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp; MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp; MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp; MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp; MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
#if CLUSTER_M == CLUSTER_N #if CLUSTER_M == CLUSTER_N
unsigned int cond0 = (unsigned int)(cj == ci_cj0); unsigned int cond0 = (unsigned int)(cj == ci_cj0);
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_2xnn_fn[cond0 * 2 + 0]); mask0 = (unsigned int)(0xf - 0x1 * cond0);
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_2xnn_fn[cond0 * 2 + 1]); mask1 = (unsigned int)(0xf - 0x2 * cond0);
#else mask2 = (unsigned int)(0xf - 0x4 * cond0);
#if CLUSTER_M < CLUSTER_N mask3 = (unsigned int)(0xf - 0x8 * cond0);
#elif CLUSTER_M < CLUSTER_N
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci); unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci); unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
mask1 = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
mask2 = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
mask3 = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
#else #else
unsigned int cond0 = (unsigned int)(cj == ci_cj0); unsigned int cond0 = (unsigned int)(cj == ci_cj0);
unsigned int cond1 = (unsigned int)(cj == ci_cj1); unsigned int cond1 = (unsigned int)(cj == ci_cj1);
mask0 = (unsigned int)(0x3 - 0x1 * cond0);
mask1 = (unsigned int)(0x3 - 0x2 * cond0);
mask2 = (unsigned int)(0x3 - 0x1 * cond1);
mask3 = (unsigned int)(0x3 - 0x2 * cond1);
#endif #endif
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 0]);
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 1]); MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0);
#endif MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2);
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec)); MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec)); MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0); MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2); MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0); MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0); MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
fiz0 = simd_masked_add(fiz0, simd_mul(delz0, force0), cutoff_mask0);
fix2 = simd_masked_add(fix2, simd_mul(delx2, force2), cutoff_mask2);
fiy2 = simd_masked_add(fiy2, simd_mul(dely2, force2), cutoff_mask2);
fiz2 = simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2);
}
for(int k = numneighs_masked; k < numneighs; k++) { MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
int cj = neighs[k]; MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec);
MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec);
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0); fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0); fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
@@ -551,8 +398,6 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
} }
LIKWID_MARKER_STOP("force"); LIKWID_MARKER_STOP("force");
}
double E = getTimeStamp(); double E = getTimeStamp();
DEBUG_MESSAGE("computeForceLJ_2xnn end\n"); DEBUG_MESSAGE("computeForceLJ_2xnn end\n");
return E-S; return E-S;
@@ -578,6 +423,8 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon); MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0); MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5); MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
double S = getTimeStamp();
LIKWID_MARKER_START("force");
for(int ci = 0; ci < atom->Nclusters_local; ci++) { for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
@@ -589,13 +436,7 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
} }
} }
double S = getTimeStamp(); #pragma omp parallel for
#pragma omp parallel
{
LIKWID_MARKER_START("force");
#pragma omp for schedule(runtime)
for(int ci = 0; ci < atom->Nclusters_local; ci++) { for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_cj0 = CJ0_FROM_CI(ci); int ci_cj0 = CJ0_FROM_CI(ci);
#if CLUSTER_M > CLUSTER_N #if CLUSTER_M > CLUSTER_N
@@ -606,7 +447,6 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
neighs = &neighbor->neighbors[ci * neighbor->maxneighs]; neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
int numneighs = neighbor->numneigh[ci]; int numneighs = neighbor->numneigh[ci];
int numneighs_masked = neighbor->numneigh_masked[ci];
MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]); MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]);
MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]); MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]);
@@ -633,7 +473,7 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
MD_SIMD_FLOAT fiy3 = simd_zero(); MD_SIMD_FLOAT fiy3 = simd_zero();
MD_SIMD_FLOAT fiz3 = simd_zero(); MD_SIMD_FLOAT fiz3 = simd_zero();
for(int k = 0; k < numneighs_masked; k++) { for(int k = 0; k < numneighs; k++) {
int cj = neighs[k]; int cj = neighs[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
@@ -641,43 +481,45 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]); MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]); MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]); MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp; MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp; MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp; MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
MD_SIMD_FLOAT delx1 = xi1_tmp - xj_tmp; MD_SIMD_FLOAT delx1 = simd_sub(xi1_tmp, xj_tmp);
MD_SIMD_FLOAT dely1 = yi1_tmp - yj_tmp; MD_SIMD_FLOAT dely1 = simd_sub(yi1_tmp, yj_tmp);
MD_SIMD_FLOAT delz1 = zi1_tmp - zj_tmp; MD_SIMD_FLOAT delz1 = simd_sub(zi1_tmp, zj_tmp);
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp; MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp; MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp; MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
MD_SIMD_FLOAT delx3 = xi3_tmp - xj_tmp; MD_SIMD_FLOAT delx3 = simd_sub(xi3_tmp, xj_tmp);
MD_SIMD_FLOAT dely3 = yi3_tmp - yj_tmp; MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp);
MD_SIMD_FLOAT delz3 = zi3_tmp - zj_tmp; MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp);
#if CLUSTER_M == CLUSTER_N #if CLUSTER_M == CLUSTER_N
unsigned int cond0 = (unsigned int)(cj == ci_cj0); unsigned int cond0 = (unsigned int)(cj == ci_cj0);
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 4 + 0]); MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 4 + 1]); MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x3 * cond0));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 4 + 2]); MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x7 * cond0));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 4 + 3]); MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0xf * cond0));
#else #elif CLUSTER_M < CLUSTER_N
#if CLUSTER_M < CLUSTER_N
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci); unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci); unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1));
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0xf * cond0 - 0xff * cond1));
#else #else
unsigned int cond0 = (unsigned int)(cj == ci_cj0); unsigned int cond0 = (unsigned int)(cj == ci_cj0);
unsigned int cond1 = (unsigned int)(cj == ci_cj1); unsigned int cond1 = (unsigned int)(cj == ci_cj1);
#endif MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond0));
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 0]); MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0x3 - 0x3 * cond0));
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 1]); MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0x3 - 0x3 * cond0 - 0x1 * cond1));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 2]); MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0x3 - 0x3 * cond0 - 0x3 * cond1));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 3]);
#endif #endif
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0)); MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, delz1 * delz1)); MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, simd_mul(delz1, delz1)));
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2)); MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, delz3 * delz3)); MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, simd_mul(delz3, delz3)));
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec)); MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
MD_SIMD_MASK cutoff_mask1 = simd_mask_and(excl_mask1, simd_mask_cond_lt(rsq1, cutforcesq_vec)); MD_SIMD_MASK cutoff_mask1 = simd_mask_and(excl_mask1, simd_mask_cond_lt(rsq1, cutforcesq_vec));
@@ -689,113 +531,28 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2); MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3); MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec; MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
MD_SIMD_FLOAT sr6_1 = sr2_1 * sr2_1 * sr2_1 * sigma6_vec; MD_SIMD_FLOAT sr6_1 = simd_mul(sr2_1, simd_mul(sr2_1, simd_mul(sr2_1, sigma6_vec)));
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec; MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
MD_SIMD_FLOAT sr6_3 = sr2_3 * sr2_3 * sr2_3 * sigma6_vec; MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec)));
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec; MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
MD_SIMD_FLOAT force1 = c48_vec * sr6_1 * (sr6_1 - c05_vec) * sr2_1 * eps_vec; MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, eps_vec))));
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec; MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
MD_SIMD_FLOAT force3 = c48_vec * sr6_3 * (sr6_3 - c05_vec) * sr2_3 * eps_vec; MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, eps_vec))));
MD_SIMD_FLOAT tx0 = select_by_mask(delx0 * force0, cutoff_mask0); MD_SIMD_FLOAT tx0 = select_by_mask(simd_mul(delx0, force0), cutoff_mask0);
MD_SIMD_FLOAT ty0 = select_by_mask(dely0 * force0, cutoff_mask0); MD_SIMD_FLOAT ty0 = select_by_mask(simd_mul(dely0, force0), cutoff_mask0);
MD_SIMD_FLOAT tz0 = select_by_mask(delz0 * force0, cutoff_mask0); MD_SIMD_FLOAT tz0 = select_by_mask(simd_mul(delz0, force0), cutoff_mask0);
MD_SIMD_FLOAT tx1 = select_by_mask(delx1 * force1, cutoff_mask1); MD_SIMD_FLOAT tx1 = select_by_mask(simd_mul(delx1, force1), cutoff_mask1);
MD_SIMD_FLOAT ty1 = select_by_mask(dely1 * force1, cutoff_mask1); MD_SIMD_FLOAT ty1 = select_by_mask(simd_mul(dely1, force1), cutoff_mask1);
MD_SIMD_FLOAT tz1 = select_by_mask(delz1 * force1, cutoff_mask1); MD_SIMD_FLOAT tz1 = select_by_mask(simd_mul(delz1, force1), cutoff_mask1);
MD_SIMD_FLOAT tx2 = select_by_mask(delx2 * force2, cutoff_mask2); MD_SIMD_FLOAT tx2 = select_by_mask(simd_mul(delx2, force2), cutoff_mask2);
MD_SIMD_FLOAT ty2 = select_by_mask(dely2 * force2, cutoff_mask2); MD_SIMD_FLOAT ty2 = select_by_mask(simd_mul(dely2, force2), cutoff_mask2);
MD_SIMD_FLOAT tz2 = select_by_mask(delz2 * force2, cutoff_mask2); MD_SIMD_FLOAT tz2 = select_by_mask(simd_mul(delz2, force2), cutoff_mask2);
MD_SIMD_FLOAT tx3 = select_by_mask(delx3 * force3, cutoff_mask3); MD_SIMD_FLOAT tx3 = select_by_mask(simd_mul(delx3, force3), cutoff_mask3);
MD_SIMD_FLOAT ty3 = select_by_mask(dely3 * force3, cutoff_mask3); MD_SIMD_FLOAT ty3 = select_by_mask(simd_mul(dely3, force3), cutoff_mask3);
MD_SIMD_FLOAT tz3 = select_by_mask(delz3 * force3, cutoff_mask3); MD_SIMD_FLOAT tz3 = select_by_mask(simd_mul(delz3, force3), cutoff_mask3);
fix0 = simd_add(fix0, tx0);
fiy0 = simd_add(fiy0, ty0);
fiz0 = simd_add(fiz0, tz0);
fix1 = simd_add(fix1, tx1);
fiy1 = simd_add(fiy1, ty1);
fiz1 = simd_add(fiz1, tz1);
fix2 = simd_add(fix2, tx2);
fiy2 = simd_add(fiy2, ty2);
fiz2 = simd_add(fiz2, tz2);
fix3 = simd_add(fix3, tx3);
fiy3 = simd_add(fiy3, ty3);
fiz3 = simd_add(fiz3, tz3);
#ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ
if(cj < CJ1_FROM_CI(atom->Nlocal)) {
simd_store(&cj_f[CL_X_OFFSET], simd_load(&cj_f[CL_X_OFFSET]) - (tx0 + tx1 + tx2 + tx3));
simd_store(&cj_f[CL_Y_OFFSET], simd_load(&cj_f[CL_Y_OFFSET]) - (ty0 + ty1 + ty2 + ty3));
simd_store(&cj_f[CL_Z_OFFSET], simd_load(&cj_f[CL_Z_OFFSET]) - (tz0 + tz1 + tz2 + tz3));
}
#else
simd_store(&cj_f[CL_X_OFFSET], simd_load(&cj_f[CL_X_OFFSET]) - (tx0 + tx1 + tx2 + tx3));
simd_store(&cj_f[CL_Y_OFFSET], simd_load(&cj_f[CL_Y_OFFSET]) - (ty0 + ty1 + ty2 + ty3));
simd_store(&cj_f[CL_Z_OFFSET], simd_load(&cj_f[CL_Z_OFFSET]) - (tz0 + tz1 + tz2 + tz3));
#endif
}
for(int k = numneighs_masked; k < numneighs; k++) {
int cj = neighs[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
MD_SIMD_FLOAT delx1 = xi1_tmp - xj_tmp;
MD_SIMD_FLOAT dely1 = yi1_tmp - yj_tmp;
MD_SIMD_FLOAT delz1 = zi1_tmp - zj_tmp;
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
MD_SIMD_FLOAT delx3 = xi3_tmp - xj_tmp;
MD_SIMD_FLOAT dely3 = yi3_tmp - yj_tmp;
MD_SIMD_FLOAT delz3 = zi3_tmp - zj_tmp;
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, delz1 * delz1));
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, delz3 * delz3));
MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec);
MD_SIMD_MASK cutoff_mask1 = simd_mask_cond_lt(rsq1, cutforcesq_vec);
MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec);
MD_SIMD_MASK cutoff_mask3 = simd_mask_cond_lt(rsq3, cutforcesq_vec);
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
MD_SIMD_FLOAT sr2_1 = simd_reciprocal(rsq1);
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
MD_SIMD_FLOAT sr6_1 = sr2_1 * sr2_1 * sr2_1 * sigma6_vec;
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
MD_SIMD_FLOAT sr6_3 = sr2_3 * sr2_3 * sr2_3 * sigma6_vec;
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
MD_SIMD_FLOAT force1 = c48_vec * sr6_1 * (sr6_1 - c05_vec) * sr2_1 * eps_vec;
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
MD_SIMD_FLOAT force3 = c48_vec * sr6_3 * (sr6_3 - c05_vec) * sr2_3 * eps_vec;
MD_SIMD_FLOAT tx0 = select_by_mask(delx0 * force0, cutoff_mask0);
MD_SIMD_FLOAT ty0 = select_by_mask(dely0 * force0, cutoff_mask0);
MD_SIMD_FLOAT tz0 = select_by_mask(delz0 * force0, cutoff_mask0);
MD_SIMD_FLOAT tx1 = select_by_mask(delx1 * force1, cutoff_mask1);
MD_SIMD_FLOAT ty1 = select_by_mask(dely1 * force1, cutoff_mask1);
MD_SIMD_FLOAT tz1 = select_by_mask(delz1 * force1, cutoff_mask1);
MD_SIMD_FLOAT tx2 = select_by_mask(delx2 * force2, cutoff_mask2);
MD_SIMD_FLOAT ty2 = select_by_mask(dely2 * force2, cutoff_mask2);
MD_SIMD_FLOAT tz2 = select_by_mask(delz2 * force2, cutoff_mask2);
MD_SIMD_FLOAT tx3 = select_by_mask(delx3 * force3, cutoff_mask3);
MD_SIMD_FLOAT ty3 = select_by_mask(dely3 * force3, cutoff_mask3);
MD_SIMD_FLOAT tz3 = select_by_mask(delz3 * force3, cutoff_mask3);
fix0 = simd_add(fix0, tx0); fix0 = simd_add(fix0, tx0);
fiy0 = simd_add(fiy0, ty0); fiy0 = simd_add(fiy0, ty0);
@@ -833,8 +590,6 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
} }
LIKWID_MARKER_STOP("force"); LIKWID_MARKER_STOP("force");
}
double E = getTimeStamp(); double E = getTimeStamp();
DEBUG_MESSAGE("computeForceLJ_4xn end\n"); DEBUG_MESSAGE("computeForceLJ_4xn end\n");
return E-S; return E-S;
@@ -852,6 +607,8 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon); MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0); MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5); MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
double S = getTimeStamp();
LIKWID_MARKER_START("force");
for(int ci = 0; ci < atom->Nclusters_local; ci++) { for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
@@ -863,13 +620,7 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
} }
} }
double S = getTimeStamp(); #pragma omp parallel for
#pragma omp parallel
{
LIKWID_MARKER_START("force");
#pragma omp for schedule(runtime)
for(int ci = 0; ci < atom->Nclusters_local; ci++) { for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_cj0 = CJ0_FROM_CI(ci); int ci_cj0 = CJ0_FROM_CI(ci);
#if CLUSTER_M > CLUSTER_N #if CLUSTER_M > CLUSTER_N
@@ -880,7 +631,6 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
neighs = &neighbor->neighbors[ci * neighbor->maxneighs]; neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
int numneighs = neighbor->numneigh[ci]; int numneighs = neighbor->numneigh[ci];
int numneighs_masked = neighbor->numneigh_masked[ci];
MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]); MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]);
MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]); MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]);
@@ -907,50 +657,52 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
MD_SIMD_FLOAT fiy3 = simd_zero(); MD_SIMD_FLOAT fiy3 = simd_zero();
MD_SIMD_FLOAT fiz3 = simd_zero(); MD_SIMD_FLOAT fiz3 = simd_zero();
for(int k = 0; k < numneighs_masked; k++) { for(int k = 0; k < numneighs; k++) {
int cj = neighs[k]; int cj = neighs[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]); MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]); MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]); MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp; MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp; MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp; MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
MD_SIMD_FLOAT delx1 = xi1_tmp - xj_tmp; MD_SIMD_FLOAT delx1 = simd_sub(xi1_tmp, xj_tmp);
MD_SIMD_FLOAT dely1 = yi1_tmp - yj_tmp; MD_SIMD_FLOAT dely1 = simd_sub(yi1_tmp, yj_tmp);
MD_SIMD_FLOAT delz1 = zi1_tmp - zj_tmp; MD_SIMD_FLOAT delz1 = simd_sub(zi1_tmp, zj_tmp);
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp; MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp; MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp; MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
MD_SIMD_FLOAT delx3 = xi3_tmp - xj_tmp; MD_SIMD_FLOAT delx3 = simd_sub(xi3_tmp, xj_tmp);
MD_SIMD_FLOAT dely3 = yi3_tmp - yj_tmp; MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp);
MD_SIMD_FLOAT delz3 = zi3_tmp - zj_tmp; MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp);
#if CLUSTER_M == CLUSTER_N #if CLUSTER_M == CLUSTER_N
unsigned int cond0 = (unsigned int)(cj == ci_cj0); unsigned int cond0 = (unsigned int)(cj == ci_cj0);
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 4 + 0]); MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 4 + 1]); MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 4 + 2]); MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 4 + 3]); MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0));
#else #elif CLUSTER_M < CLUSTER_N
#if CLUSTER_M < CLUSTER_N
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci); unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci); unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1));
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1));
#else #else
unsigned int cond0 = (unsigned int)(cj == ci_cj0); unsigned int cond0 = (unsigned int)(cj == ci_cj0);
unsigned int cond1 = (unsigned int)(cj == ci_cj1); unsigned int cond1 = (unsigned int)(cj == ci_cj1);
#endif MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond0));
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0]); MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond0));
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 1]); MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond1));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 2]); MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond1));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 3]);
#endif #endif
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0)); MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, delz1 * delz1)); MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, simd_mul(delz1, delz1)));
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2)); MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, delz3 * delz3)); MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, simd_mul(delz3, delz3)));
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec)); MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
MD_SIMD_MASK cutoff_mask1 = simd_mask_and(excl_mask1, simd_mask_cond_lt(rsq1, cutforcesq_vec)); MD_SIMD_MASK cutoff_mask1 = simd_mask_and(excl_mask1, simd_mask_cond_lt(rsq1, cutforcesq_vec));
@@ -962,87 +714,28 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2); MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3); MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec; MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
MD_SIMD_FLOAT sr6_1 = sr2_1 * sr2_1 * sr2_1 * sigma6_vec; MD_SIMD_FLOAT sr6_1 = simd_mul(sr2_1, simd_mul(sr2_1, simd_mul(sr2_1, sigma6_vec)));
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec; MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
MD_SIMD_FLOAT sr6_3 = sr2_3 * sr2_3 * sr2_3 * sigma6_vec; MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec)));
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec; MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
MD_SIMD_FLOAT force1 = c48_vec * sr6_1 * (sr6_1 - c05_vec) * sr2_1 * eps_vec; MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, eps_vec))));
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec; MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
MD_SIMD_FLOAT force3 = c48_vec * sr6_3 * (sr6_3 - c05_vec) * sr2_3 * eps_vec; MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, eps_vec))));
fix0 = simd_masked_add(fix0, delx0 * force0, cutoff_mask0); fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
fiy0 = simd_masked_add(fiy0, dely0 * force0, cutoff_mask0); fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
fiz0 = simd_masked_add(fiz0, delz0 * force0, cutoff_mask0); fiz0 = simd_masked_add(fiz0, simd_mul(delz0, force0), cutoff_mask0);
fix1 = simd_masked_add(fix1, delx1 * force1, cutoff_mask1); fix1 = simd_masked_add(fix1, simd_mul(delx1, force1), cutoff_mask1);
fiy1 = simd_masked_add(fiy1, dely1 * force1, cutoff_mask1); fiy1 = simd_masked_add(fiy1, simd_mul(dely1, force1), cutoff_mask1);
fiz1 = simd_masked_add(fiz1, delz1 * force1, cutoff_mask1); fiz1 = simd_masked_add(fiz1, simd_mul(delz1, force1), cutoff_mask1);
fix2 = simd_masked_add(fix2, delx2 * force2, cutoff_mask2); fix2 = simd_masked_add(fix2, simd_mul(delx2, force2), cutoff_mask2);
fiy2 = simd_masked_add(fiy2, dely2 * force2, cutoff_mask2); fiy2 = simd_masked_add(fiy2, simd_mul(dely2, force2), cutoff_mask2);
fiz2 = simd_masked_add(fiz2, delz2 * force2, cutoff_mask2); fiz2 = simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2);
fix3 = simd_masked_add(fix3, delx3 * force3, cutoff_mask3); fix3 = simd_masked_add(fix3, simd_mul(delx3, force3), cutoff_mask3);
fiy3 = simd_masked_add(fiy3, dely3 * force3, cutoff_mask3); fiy3 = simd_masked_add(fiy3, simd_mul(dely3, force3), cutoff_mask3);
fiz3 = simd_masked_add(fiz3, delz3 * force3, cutoff_mask3); fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3);
}
for(int k = numneighs_masked; k < numneighs; k++) {
int cj = neighs[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
MD_SIMD_FLOAT delx1 = xi1_tmp - xj_tmp;
MD_SIMD_FLOAT dely1 = yi1_tmp - yj_tmp;
MD_SIMD_FLOAT delz1 = zi1_tmp - zj_tmp;
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
MD_SIMD_FLOAT delx3 = xi3_tmp - xj_tmp;
MD_SIMD_FLOAT dely3 = yi3_tmp - yj_tmp;
MD_SIMD_FLOAT delz3 = zi3_tmp - zj_tmp;
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, delz1 * delz1));
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, delz3 * delz3));
MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec);
MD_SIMD_MASK cutoff_mask1 = simd_mask_cond_lt(rsq1, cutforcesq_vec);
MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec);
MD_SIMD_MASK cutoff_mask3 = simd_mask_cond_lt(rsq3, cutforcesq_vec);
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
MD_SIMD_FLOAT sr2_1 = simd_reciprocal(rsq1);
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
MD_SIMD_FLOAT sr6_1 = sr2_1 * sr2_1 * sr2_1 * sigma6_vec;
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
MD_SIMD_FLOAT sr6_3 = sr2_3 * sr2_3 * sr2_3 * sigma6_vec;
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
MD_SIMD_FLOAT force1 = c48_vec * sr6_1 * (sr6_1 - c05_vec) * sr2_1 * eps_vec;
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
MD_SIMD_FLOAT force3 = c48_vec * sr6_3 * (sr6_3 - c05_vec) * sr2_3 * eps_vec;
fix0 = simd_masked_add(fix0, delx0 * force0, cutoff_mask0);
fiy0 = simd_masked_add(fiy0, dely0 * force0, cutoff_mask0);
fiz0 = simd_masked_add(fiz0, delz0 * force0, cutoff_mask0);
fix1 = simd_masked_add(fix1, delx1 * force1, cutoff_mask1);
fiy1 = simd_masked_add(fiy1, dely1 * force1, cutoff_mask1);
fiz1 = simd_masked_add(fiz1, delz1 * force1, cutoff_mask1);
fix2 = simd_masked_add(fix2, delx2 * force2, cutoff_mask2);
fiy2 = simd_masked_add(fiy2, dely2 * force2, cutoff_mask2);
fiz2 = simd_masked_add(fiz2, delz2 * force2, cutoff_mask2);
fix3 = simd_masked_add(fix3, delx3 * force3, cutoff_mask3);
fiy3 = simd_masked_add(fiy3, dely3 * force3, cutoff_mask3);
fiz3 = simd_masked_add(fiz3, delz3 * force3, cutoff_mask3);
} }
simd_incr_reduced_sum(&ci_f[CL_X_OFFSET], fix0, fix1, fix2, fix3); simd_incr_reduced_sum(&ci_f[CL_X_OFFSET], fix0, fix1, fix2, fix3);
@@ -1051,13 +744,10 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
addStat(stats->calculated_forces, 1); addStat(stats->calculated_forces, 1);
addStat(stats->num_neighs, numneighs); addStat(stats->num_neighs, numneighs);
addStat(stats->force_iters, (long long int)((double)numneighs)); addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
//addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
} }
LIKWID_MARKER_STOP("force"); LIKWID_MARKER_STOP("force");
}
double E = getTimeStamp(); double E = getTimeStamp();
DEBUG_MESSAGE("computeForceLJ_4xn end\n"); DEBUG_MESSAGE("computeForceLJ_4xn end\n");
return E-S; return E-S;

View File

@@ -22,8 +22,25 @@
# define KERNEL_NAME "CUDA" # define KERNEL_NAME "CUDA"
# define CLUSTER_M 8 # define CLUSTER_M 8
# define CLUSTER_N VECTOR_WIDTH # define CLUSTER_N VECTOR_WIDTH
# define UNROLL_J 1
#ifdef USE_SUPER_CLUSTERS
# define XX 0
# define YY 1
# define ZZ 2
# define SCLUSTER_SIZE_X 2
# define SCLUSTER_SIZE_Y 2
# define SCLUSTER_SIZE_Z 2
# define SCLUSTER_SIZE (SCLUSTER_SIZE_X * SCLUSTER_SIZE_Y * SCLUSTER_SIZE_Z)
# define DIM_COORD(dim,coord) ((dim == XX) ? atom_x(coord) : ((dim == YY) ? atom_y(coord) : atom_z(coord)))
# define MIN(a,b) ({int _a = (a), _b = (b); _a < _b ? _a : _b; })
# define SCLUSTER_M CLUSTER_M * SCLUSTER_SIZE
# define computeForceLJ computeForceLJSup_cuda
#else
# define computeForceLJ computeForceLJ_cuda # define computeForceLJ computeForceLJ_cuda
#endif //USE_SUPER_CLUSTERS
# define initialIntegrate cudaInitialIntegrate # define initialIntegrate cudaInitialIntegrate
# define finalIntegrate cudaFinalIntegrate # define finalIntegrate cudaFinalIntegrate
# define updatePbc cudaUpdatePbc # define updatePbc cudaUpdatePbc
@@ -33,15 +50,11 @@
# if VECTOR_WIDTH > CLUSTER_M * 2 # if VECTOR_WIDTH > CLUSTER_M * 2
# define KERNEL_NAME "Simd2xNN" # define KERNEL_NAME "Simd2xNN"
# define CLUSTER_N (VECTOR_WIDTH / 2) # define CLUSTER_N (VECTOR_WIDTH / 2)
# define UNROLL_I 4
# define UNROLL_J 2
# define computeForceLJ computeForceLJ_2xnn # define computeForceLJ computeForceLJ_2xnn
// Simd4xN // Simd4xN
# else # else
# define KERNEL_NAME "Simd4xN" # define KERNEL_NAME "Simd4xN"
# define CLUSTER_N VECTOR_WIDTH # define CLUSTER_N VECTOR_WIDTH
# define UNROLL_I 4
# define UNROLL_J 1
# define computeForceLJ computeForceLJ_4xn # define computeForceLJ computeForceLJ_4xn
# endif # endif
# ifdef USE_REFERENCE_VERSION # ifdef USE_REFERENCE_VERSION
@@ -60,16 +73,29 @@
# define CJ1_FROM_CI(a) (a) # define CJ1_FROM_CI(a) (a)
# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b)) # define CI_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b)) # define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
#ifdef USE_SUPER_CLUSTERS
# define CJ1_FROM_SCI(a) (a)
# define SCI_BASE_INDEX(a,b) ((a) * CLUSTER_N * SCLUSTER_SIZE * (b))
# define SCJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * SCLUSTER_SIZE * (b))
#endif //USE_SUPER_CLUSTERS
#elif CLUSTER_M == CLUSTER_N * 2 // M > N #elif CLUSTER_M == CLUSTER_N * 2 // M > N
# define CJ0_FROM_CI(a) ((a) << 1) # define CJ0_FROM_CI(a) ((a) << 1)
# define CJ1_FROM_CI(a) (((a) << 1) | 0x1) # define CJ1_FROM_CI(a) (((a) << 1) | 0x1)
# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_M * (b)) # define CI_BASE_INDEX(a,b) ((a) * CLUSTER_M * (b))
# define CJ_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_M * (b) + ((a) & 0x1) * (CLUSTER_M >> 1)) # define CJ_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_M * (b) + ((a) & 0x1) * (CLUSTER_M >> 1))
#ifdef USE_SUPER_CLUSTERS
# define SCI_BASE_INDEX(a,b) ((a) * CLUSTER_M * SCLUSTER_SIZE * (b))
# define SCJ_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_M * SCLUSTER_SIZE * (b) + ((a) & 0x1) * (SCLUSTER_SIZE * CLUSTER_M >> 1))
#endif //USE_SUPER_CLUSTERS
#elif CLUSTER_M == CLUSTER_N / 2 // M < N #elif CLUSTER_M == CLUSTER_N / 2 // M < N
# define CJ0_FROM_CI(a) ((a) >> 1) # define CJ0_FROM_CI(a) ((a) >> 1)
# define CJ1_FROM_CI(a) ((a) >> 1) # define CJ1_FROM_CI(a) ((a) >> 1)
# define CI_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_N * (b) + ((a) & 0x1) * (CLUSTER_N >> 1)) # define CI_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_N * (b) + ((a) & 0x1) * (CLUSTER_N >> 1))
# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b)) # define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
#ifdef USE_SUPER_CLUSTERS
# define SCI_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_N * SCLUSTER_SIZE * (b) + ((a) & 0x1) * (CLUSTER_N * SCLUSTER_SIZE >> 1))
# define SCJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * SCLUSTER_SIZE * (b))
#endif //USE_SUPER_CLUSTERS
#else #else
# error "Invalid cluster configuration!" # error "Invalid cluster configuration!"
#endif #endif
@@ -83,14 +109,37 @@
#define CJ_SCALAR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 1)) #define CJ_SCALAR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 1))
#define CJ_VECTOR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 3)) #define CJ_VECTOR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 3))
#ifdef USE_SUPER_CLUSTERS
#define SCI_SCALAR_BASE_INDEX(a) (SCI_BASE_INDEX(a, 1))
#define SCI_VECTOR_BASE_INDEX(a) (SCI_BASE_INDEX(a, 3))
#define SCJ_SCALAR_BASE_INDEX(a) (SCJ_BASE_INDEX(a, 1))
#define SCJ_VECTOR_BASE_INDEX(a) (SCJ_BASE_INDEX(a, 3))
#endif //USE_SUPER_CLUSTERS
#if CLUSTER_M >= CLUSTER_N #if CLUSTER_M >= CLUSTER_N
# define CL_X_OFFSET (0 * CLUSTER_M) # define CL_X_OFFSET (0 * CLUSTER_M)
# define CL_Y_OFFSET (1 * CLUSTER_M) # define CL_Y_OFFSET (1 * CLUSTER_M)
# define CL_Z_OFFSET (2 * CLUSTER_M) # define CL_Z_OFFSET (2 * CLUSTER_M)
#ifdef USE_SUPER_CLUSTERS
# define SCL_CL_X_OFFSET(ci) (ci * CLUSTER_M + 0 * SCLUSTER_M)
# define SCL_CL_Y_OFFSET(ci) (ci * CLUSTER_M + 1 * SCLUSTER_M)
# define SCL_CL_Z_OFFSET(ci) (ci * CLUSTER_M + 2 * SCLUSTER_M)
# define SCL_X_OFFSET (0 * SCLUSTER_M)
# define SCL_Y_OFFSET (1 * SCLUSTER_M)
# define SCL_Z_OFFSET (2 * SCLUSTER_M)
#endif //USE_SUPER_CLUSTERS
#else #else
# define CL_X_OFFSET (0 * CLUSTER_N) # define CL_X_OFFSET (0 * CLUSTER_N)
# define CL_Y_OFFSET (1 * CLUSTER_N) # define CL_Y_OFFSET (1 * CLUSTER_N)
# define CL_Z_OFFSET (2 * CLUSTER_N) # define CL_Z_OFFSET (2 * CLUSTER_N)
#ifdef USE_SUPER_CLUSTERS
# define SCL_X_OFFSET (0 * SCLUSTER_SIZE * CLUSTER_N)
# define SCL_Y_OFFSET (1 * SCLUSTER_SIZE * CLUSTER_N)
# define SCL_Z_OFFSET (2 * SCLUSTER_SIZE * CLUSTER_N)
#endif //USE_SUPER_CLUSTERS
#endif #endif
typedef struct { typedef struct {
@@ -100,6 +149,13 @@ typedef struct {
MD_FLOAT bbminz, bbmaxz; MD_FLOAT bbminz, bbmaxz;
} Cluster; } Cluster;
typedef struct {
int nclusters;
MD_FLOAT bbminx, bbmaxx;
MD_FLOAT bbminy, bbmaxy;
MD_FLOAT bbminz, bbmaxz;
} SuperCluster;
typedef struct { typedef struct {
int Natoms, Nlocal, Nghost, Nmax; int Natoms, Nlocal, Nghost, Nmax;
int Nclusters, Nclusters_local, Nclusters_ghost, Nclusters_max; int Nclusters, Nclusters_local, Nclusters_ghost, Nclusters_max;
@@ -121,17 +177,20 @@ typedef struct {
Cluster *iclusters, *jclusters; Cluster *iclusters, *jclusters;
int *icluster_bin; int *icluster_bin;
int dummy_cj; int dummy_cj;
MD_UINT *exclusion_filter;
MD_FLOAT *diagonal_4xn_j_minus_i; #ifdef USE_SUPER_CLUSTERS
MD_FLOAT *diagonal_2xnn_j_minus_i; int Nsclusters, Nsclusters_local, Nsclusters_ghost, Nsclusters_max;
unsigned int masks_2xnn_hn[8]; MD_FLOAT *scl_x;
unsigned int masks_2xnn_fn[8]; MD_FLOAT *scl_v;
unsigned int masks_4xn_hn[16]; MD_FLOAT *scl_f;
unsigned int masks_4xn_fn[16]; int *scl_type;
int *icluster_idx;
SuperCluster *siclusters;
int *sicluster_bin;
#endif //USE_SUPER_CLUSTERS
} Atom; } Atom;
extern void initAtom(Atom*); extern void initAtom(Atom*);
extern void initMasks(Atom*);
extern void createAtom(Atom*, Parameter*); extern void createAtom(Atom*, Parameter*);
extern int readAtom(Atom*, Parameter*); extern int readAtom(Atom*, Parameter*);
extern int readAtom_pdb(Atom*, Parameter*); extern int readAtom_pdb(Atom*, Parameter*);
@@ -139,6 +198,7 @@ extern int readAtom_gro(Atom*, Parameter*);
extern int readAtom_dmp(Atom*, Parameter*); extern int readAtom_dmp(Atom*, Parameter*);
extern void growAtom(Atom*); extern void growAtom(Atom*);
extern void growClusters(Atom*); extern void growClusters(Atom*);
extern void growSuperClusters(Atom*);
#ifdef AOS #ifdef AOS
# define POS_DATA_LAYOUT "AoS" # define POS_DATA_LAYOUT "AoS"

View File

@@ -9,31 +9,13 @@
#ifndef __NEIGHBOR_H_ #ifndef __NEIGHBOR_H_
#define __NEIGHBOR_H_ #define __NEIGHBOR_H_
// Interaction masks from GROMACS, things to remember (maybe these confused just me):
// 1. These are not "exclusion" masks as the name suggests in GROMACS, but rather
// interaction masks (1 = interaction, 0 = no interaction)
// 2. These are inverted (maybe because that is how you use in AVX2/AVX512 masking),
// so read them from right to left (least significant to most significant bit)
// All interaction mask is the same for all kernels
#define NBNXN_INTERACTION_MASK_ALL 0xffffffffU
// 4x4 kernel diagonal mask
#define NBNXN_INTERACTION_MASK_DIAG 0x08ceU
// 4x2 kernel diagonal masks
#define NBNXN_INTERACTION_MASK_DIAG_J2_0 0x0002U
#define NBNXN_INTERACTION_MASK_DIAG_J2_1 0x002fU
// 4x8 kernel diagonal masks
#define NBNXN_INTERACTION_MASK_DIAG_J8_0 0xf0f8fcfeU
#define NBNXN_INTERACTION_MASK_DIAG_J8_1 0x0080c0e0U
typedef struct { typedef struct {
int every; int every;
int ncalls; int ncalls;
int* neighbors;
int maxneighs; int maxneighs;
int* numneigh; int* numneigh;
int* numneigh_masked;
int half_neigh; int half_neigh;
int* neighbors;
unsigned int* neighbors_imask;
} Neighbor; } Neighbor;
extern void initNeighbor(Neighbor*, Parameter*); extern void initNeighbor(Neighbor*, Parameter*);
@@ -43,6 +25,7 @@ extern void buildNeighbor(Atom*, Neighbor*);
extern void pruneNeighbor(Parameter*, Atom*, Neighbor*); extern void pruneNeighbor(Parameter*, Atom*, Neighbor*);
extern void sortAtom(Atom*); extern void sortAtom(Atom*);
extern void buildClusters(Atom*); extern void buildClusters(Atom*);
extern void buildClustersGPU(Atom*);
extern void defineJClusters(Atom*); extern void defineJClusters(Atom*);
extern void binClusters(Atom*); extern void binClusters(Atom*);
extern void updateSingleAtoms(Atom*); extern void updateSingleAtoms(Atom*);

View File

@@ -16,5 +16,8 @@ extern void setupPbc(Atom*, Parameter*);
#ifdef CUDA_TARGET #ifdef CUDA_TARGET
extern void cudaUpdatePbc(Atom*, Parameter*, int); extern void cudaUpdatePbc(Atom*, Parameter*, int);
#if defined(USE_SUPER_CLUSTERS)
extern void setupPbcGPU(Atom*, Parameter*);
#endif //defined(USE_SUPER_CLUSTERS)
#endif #endif
#endif #endif

19
gromacs/includes/utils.h Normal file
View File

@@ -0,0 +1,19 @@
/*
* Temporal functions for debugging, remove before proceeding with pull request
*/
#ifndef MD_BENCH_UTILS_H
#define MD_BENCH_UTILS_H
#include <atom.h>
#include <neighbor.h>
#ifdef USE_SUPER_CLUSTERS
void verifyClusters(Atom *atom);
void verifyLayout(Atom *atom);
void checkAlignment(Atom *atom);
void showSuperclusters(Atom *atom);
void printNeighs(Atom *atom, Neighbor *neighbor);
#endif //USE_SUPER_CLUSTERS
#endif //MD_BENCH_UTILS_H

View File

@@ -9,6 +9,7 @@
#ifndef __VTK_H_ #ifndef __VTK_H_
#define __VTK_H_ #define __VTK_H_
extern void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep); extern void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep);
extern int write_super_clusters_to_vtk_file(const char* filename, Atom* atom, int timestep);
extern int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep); extern int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
extern int write_ghost_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep); extern int write_ghost_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
extern int write_local_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep); extern int write_local_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep);

View File

@@ -60,15 +60,18 @@ void init(Parameter *param) {
param->eam_file = NULL; param->eam_file = NULL;
} }
void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps, int masked) { // Show debug messages
#define DEBUG(msg) printf(msg)
// Do not show debug messages
//#define DEBUG(msg)
void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps) {
const int maxneighs = nneighs * nreps; const int maxneighs = nneighs * nreps;
const int jfac = MAX(1, CLUSTER_N / CLUSTER_M); const int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
const int ncj = atom->Nclusters_local / jfac; const int ncj = atom->Nclusters_local / jfac;
const unsigned int imask = NBNXN_INTERACTION_MASK_ALL;
neighbor->numneigh = (int*) malloc(atom->Nclusters_max * sizeof(int)); neighbor->numneigh = (int*) malloc(atom->Nclusters_max * sizeof(int));
neighbor->numneigh_masked = (int*) malloc(atom->Nclusters_max * sizeof(int));
neighbor->neighbors = (int*) malloc(atom->Nclusters_max * maxneighs * sizeof(int)); neighbor->neighbors = (int*) malloc(atom->Nclusters_max * maxneighs * sizeof(int));
neighbor->neighbors_imask = (unsigned int*) malloc(atom->Nclusters_max * maxneighs * sizeof(unsigned int));
if(pattern == P_RAND && ncj <= nneighs) { if(pattern == P_RAND && ncj <= nneighs) {
fprintf(stderr, "Error: P_RAND: Number of j-clusters should be higher than number of j-cluster neighbors per i-cluster!\n"); fprintf(stderr, "Error: P_RAND: Number of j-clusters should be higher than number of j-cluster neighbors per i-cluster!\n");
@@ -77,7 +80,6 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
for(int ci = 0; ci < atom->Nclusters_local; ci++) { for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]); int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
unsigned int *neighptr_imask = &(neighbor->neighbors_imask[ci * neighbor->maxneighs]);
int j = (pattern == P_SEQ) ? CJ0_FROM_CI(ci) : 0; int j = (pattern == P_SEQ) ? CJ0_FROM_CI(ci) : 0;
int m = (pattern == P_SEQ) ? ncj : nneighs; int m = (pattern == P_SEQ) ? ncj : nneighs;
int k = 0; int k = 0;
@@ -88,7 +90,6 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
do { do {
int cj = rand() % ncj; int cj = rand() % ncj;
neighptr[k] = cj; neighptr[k] = cj;
neighptr_imask[k] = imask;
found = 0; found = 0;
for(int l = 0; l < k; l++) { for(int l = 0; l < k; l++) {
if(neighptr[l] == cj) { if(neighptr[l] == cj) {
@@ -98,7 +99,6 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
} while(found == 1); } while(found == 1);
} else { } else {
neighptr[k] = j; neighptr[k] = j;
neighptr_imask[k] = imask;
j = (j + 1) % m; j = (j + 1) % m;
} }
} }
@@ -106,12 +106,10 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
for(int r = 1; r < nreps; r++) { for(int r = 1; r < nreps; r++) {
for(int k = 0; k < nneighs; k++) { for(int k = 0; k < nneighs; k++) {
neighptr[r * nneighs + k] = neighptr[k]; neighptr[r * nneighs + k] = neighptr[k];
neighptr_imask[r * nneighs + k] = neighptr_imask[k];
} }
} }
neighbor->numneigh[ci] = nneighs * nreps; neighbor->numneigh[ci] = nneighs * nreps;
neighbor->numneigh_masked[ci] = (masked == 1) ? (nneighs * nreps) : 0;
} }
} }
@@ -127,13 +125,12 @@ int main(int argc, const char *argv[]) {
int niclusters = 256; // Number of local i-clusters int niclusters = 256; // Number of local i-clusters
int iclusters_natoms = CLUSTER_M; // Number of valid atoms within i-clusters int iclusters_natoms = CLUSTER_M; // Number of valid atoms within i-clusters
int nneighs = 9; // Number of j-cluster neighbors per i-cluster int nneighs = 9; // Number of j-cluster neighbors per i-cluster
int masked = 0; // Use masked loop
int nreps = 1; int nreps = 1;
int csv = 0; int csv = 0;
LIKWID_MARKER_INIT; LIKWID_MARKER_INIT;
LIKWID_MARKER_REGISTER("force"); LIKWID_MARKER_REGISTER("force");
DEBUG_MESSAGE("Initializing parameters...\n"); DEBUG("Initializing parameters...\n");
init(&param); init(&param);
for(int i = 0; i < argc; i++) { for(int i = 0; i < argc; i++) {
@@ -159,10 +156,6 @@ int main(int argc, const char *argv[]) {
param.eam_file = strdup(argv[++i]); param.eam_file = strdup(argv[++i]);
continue; continue;
} }
if((strcmp(argv[i], "-m") == 0)) {
masked = 1;
continue;
}
if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0)) { if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0)) {
param.ntimes = atoi(argv[++i]); param.ntimes = atoi(argv[++i]);
continue; continue;
@@ -213,11 +206,11 @@ int main(int argc, const char *argv[]) {
} }
if(param.force_field == FF_EAM) { if(param.force_field == FF_EAM) {
DEBUG_MESSAGE("Initializing EAM parameters...\n"); DEBUG("Initializing EAM parameters...\n");
initEam(&eam, &param); initEam(&eam, &param);
} }
DEBUG_MESSAGE("Initializing atoms...\n"); DEBUG("Initializing atoms...\n");
initAtom(atom); initAtom(atom);
initStats(&stats); initStats(&stats);
@@ -233,7 +226,7 @@ int main(int argc, const char *argv[]) {
atom->cutforcesq[i] = param.cutforce * param.cutforce; atom->cutforcesq[i] = param.cutforce * param.cutforce;
} }
DEBUG_MESSAGE("Creating atoms...\n"); DEBUG("Creating atoms...\n");
while(atom->Nmax < niclusters * iclusters_natoms) { while(atom->Nmax < niclusters * iclusters_natoms) {
growAtom(atom); growAtom(atom);
} }
@@ -288,13 +281,13 @@ int main(int argc, const char *argv[]) {
printf("Estimated neighborlist data volume (kB): %.4f\n", estim_neighbors_volume / 1000.0); printf("Estimated neighborlist data volume (kB): %.4f\n", estim_neighbors_volume / 1000.0);
} }
DEBUG_MESSAGE("Defining j-clusters...\n"); DEBUG("Defining j-clusters...\n");
defineJClusters(atom); defineJClusters(atom);
DEBUG_MESSAGE("Initializing neighbor lists...\n"); DEBUG("Initializing neighbor lists...\n");
initNeighbor(&neighbor, &param); initNeighbor(&neighbor, &param);
DEBUG_MESSAGE("Creating neighbor lists...\n"); DEBUG("Creating neighbor lists...\n");
createNeighbors(atom, &neighbor, pattern, nneighs, nreps, masked); createNeighbors(atom, &neighbor, pattern, nneighs, nreps);
DEBUG_MESSAGE("Computing forces...\n"); DEBUG("Computing forces...\n");
double T_accum = 0.0; double T_accum = 0.0;
for(int i = 0; i < param.ntimes; i++) { for(int i = 0; i < param.ntimes; i++) {

View File

@@ -5,9 +5,7 @@
* license that can be found in the LICENSE file. * license that can be found in the LICENSE file.
*/ */
#include <stdio.h> #include <stdio.h>
#include <string.h>
#include <math.h> #include <math.h>
#include <omp.h>
//-- //--
#include <likwid-marker.h> #include <likwid-marker.h>
//-- //--
@@ -40,7 +38,16 @@ extern double computeForceLJ_cuda(Parameter *param, Atom *atom, Neighbor *neighb
extern void copyDataToCUDADevice(Atom *atom); extern void copyDataToCUDADevice(Atom *atom);
extern void copyDataFromCUDADevice(Atom *atom); extern void copyDataFromCUDADevice(Atom *atom);
extern void cudaDeviceFree(); extern void cudaDeviceFree();
#endif
#ifdef USE_SUPER_CLUSTERS
#include <utils.h>
extern void buildNeighborGPU(Atom *atom, Neighbor *neighbor);
extern void pruneNeighborGPU(Parameter *param, Atom *atom, Neighbor *neighbor);
extern void alignDataToSuperclusters(Atom *atom);
extern void alignDataFromSuperclusters(Atom *atom);
extern double computeForceLJSup_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats);
#endif //USE_SUPER_CLUSTERS
#endif //CUDA_TARGET
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) { double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
if(param->force_field == FF_EAM) { initEam(eam, param); } if(param->force_field == FF_EAM) { initEam(eam, param); }
@@ -64,11 +71,24 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *
setupNeighbor(param, atom); setupNeighbor(param, atom);
setupThermo(param, atom->Natoms); setupThermo(param, atom->Natoms);
if(param->input_file == NULL) { adjustThermo(param, atom); } if(param->input_file == NULL) { adjustThermo(param, atom); }
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
buildClustersGPU(atom);
#else
buildClusters(atom); buildClusters(atom);
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
defineJClusters(atom); defineJClusters(atom);
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
setupPbcGPU(atom, param);
//setupPbc(atom, param);
#else
setupPbc(atom, param); setupPbc(atom, param);
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
binClusters(atom); binClusters(atom);
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
buildNeighborGPU(atom, neighbor);
#else
buildNeighbor(atom, neighbor); buildNeighbor(atom, neighbor);
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
initDevice(atom, neighbor); initDevice(atom, neighbor);
E = getTimeStamp(); E = getTimeStamp();
return E-S; return E-S;
@@ -80,11 +100,24 @@ double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
LIKWID_MARKER_START("reneighbour"); LIKWID_MARKER_START("reneighbour");
updateSingleAtoms(atom); updateSingleAtoms(atom);
updateAtomsPbc(atom, param); updateAtomsPbc(atom, param);
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
buildClustersGPU(atom);
#else
buildClusters(atom); buildClusters(atom);
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
defineJClusters(atom); defineJClusters(atom);
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
//setupPbcGPU(atom, param);
setupPbc(atom, param); setupPbc(atom, param);
#else
setupPbc(atom, param);
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
binClusters(atom); binClusters(atom);
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
buildNeighborGPU(atom, neighbor);
#else
buildNeighbor(atom, neighbor); buildNeighbor(atom, neighbor);
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
LIKWID_MARKER_STOP("reneighbour"); LIKWID_MARKER_STOP("reneighbour");
E = getTimeStamp(); E = getTimeStamp();
return E-S; return E-S;
@@ -119,7 +152,7 @@ int main(int argc, char** argv) {
initParameter(&param); initParameter(&param);
for(int i = 0; i < argc; i++) { for(int i = 0; i < argc; i++) {
if((strcmp(argv[i], "-p") == 0) || (strcmp(argv[i], "--param") == 0)) { if((strcmp(argv[i], "-p") == 0)) {
readParameter(&param, argv[++i]); readParameter(&param, argv[++i]);
continue; continue;
} }
@@ -211,6 +244,8 @@ int main(int argc, char** argv) {
printParameter(&param); printParameter(&param);
printf(HLINE); printf(HLINE);
//verifyNeigh(&atom, &neighbor);
printf("step\ttemp\t\tpressure\n"); printf("step\ttemp\t\tpressure\n");
computeThermo(0, &param, &atom); computeThermo(0, &param, &atom);
#if defined(MEM_TRACER) || defined(INDEX_TRACER) #if defined(MEM_TRACER) || defined(INDEX_TRACER)
@@ -239,14 +274,23 @@ int main(int argc, char** argv) {
} }
for(int n = 0; n < param.ntimes; n++) { for(int n = 0; n < param.ntimes; n++) {
//printf("Step:\t%d\r\n", n);
initialIntegrate(&param, &atom); initialIntegrate(&param, &atom);
if((n + 1) % param.reneigh_every) { if((n + 1) % param.reneigh_every) {
if(!((n + 1) % param.prune_every)) { if(!((n + 1) % param.prune_every)) {
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
pruneNeighborGPU(&param, &atom, &neighbor);
#else
pruneNeighbor(&param, &atom, &neighbor); pruneNeighbor(&param, &atom, &neighbor);
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
} }
copyDataFromCUDADevice(&atom);
updatePbc(&atom, &param, 0); updatePbc(&atom, &param, 0);
copyDataToCUDADevice(&atom);
} else { } else {
#ifdef CUDA_TARGET #ifdef CUDA_TARGET
copyDataFromCUDADevice(&atom); copyDataFromCUDADevice(&atom);
@@ -264,12 +308,29 @@ int main(int argc, char** argv) {
traceAddresses(&param, &atom, &neighbor, n + 1); traceAddresses(&param, &atom, &neighbor, n + 1);
#endif #endif
/*
printf("%d\t%d\r\n", atom.Nsclusters_local, atom.Nclusters_local);
copyDataToCUDADevice(&atom);
verifyLayout(&atom);
//printClusterIndices(&atom);
*/
if(param.force_field == FF_EAM) { if(param.force_field == FF_EAM) {
timer[FORCE] += computeForceEam(&eam, &param, &atom, &neighbor, &stats); timer[FORCE] += computeForceEam(&eam, &param, &atom, &neighbor, &stats);
} else { } else {
timer[FORCE] += computeForceLJ(&param, &atom, &neighbor, &stats); timer[FORCE] += computeForceLJ(&param, &atom, &neighbor, &stats);
} }
/*
copyDataFromCUDADevice(&atom);
verifyLayout(&atom);
getchar();
*/
finalIntegrate(&param, &atom); finalIntegrate(&param, &atom);
if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) { if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
@@ -310,30 +371,6 @@ int main(int argc, char** argv) {
printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n", printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]); timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]);
printf(HLINE); printf(HLINE);
int nthreads = 0;
int chunkSize = 0;
omp_sched_t schedKind;
char schedType[10];
#pragma omp parallel
#pragma omp master
{
omp_get_schedule(&schedKind, &chunkSize);
switch (schedKind)
{
case omp_sched_static: strcpy(schedType, "static"); break;
case omp_sched_dynamic: strcpy(schedType, "dynamic"); break;
case omp_sched_guided: strcpy(schedType, "guided"); break;
case omp_sched_auto: strcpy(schedType, "auto"); break;
}
nthreads = omp_get_max_threads();
}
printf("Num threads: %d\n", nthreads);
printf("Schedule: (%s,%d)\n", schedType, chunkSize);
printf("Performance: %.2f million atom updates per second\n", printf("Performance: %.2f million atom updates per second\n",
1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]); 1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
#ifdef COMPUTE_STATS #ifdef COMPUTE_STATS

View File

@@ -56,9 +56,7 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) {
neighbor->half_neigh = param->half_neigh; neighbor->half_neigh = param->half_neigh;
neighbor->maxneighs = 100; neighbor->maxneighs = 100;
neighbor->numneigh = NULL; neighbor->numneigh = NULL;
neighbor->numneigh_masked = NULL;
neighbor->neighbors = NULL; neighbor->neighbors = NULL;
neighbor->neighbors_imask = NULL;
} }
void setupNeighbor(Parameter *param, Atom *atom) { void setupNeighbor(Parameter *param, Atom *atom) {
@@ -79,8 +77,13 @@ void setupNeighbor(Parameter *param, Atom *atom) {
MD_FLOAT atom_density = ((MD_FLOAT)(atom->Nlocal)) / ((xhi - xlo) * (yhi - ylo) * (zhi - zlo)); MD_FLOAT atom_density = ((MD_FLOAT)(atom->Nlocal)) / ((xhi - xlo) * (yhi - ylo) * (zhi - zlo));
MD_FLOAT atoms_in_cell = MAX(CLUSTER_M, CLUSTER_N); MD_FLOAT atoms_in_cell = MAX(CLUSTER_M, CLUSTER_N);
#ifdef USE_SUPER_CLUSTERS
MD_FLOAT targetsizex = cbrt(atoms_in_cell / atom_density) * (MD_FLOAT)SCLUSTER_SIZE_X;
MD_FLOAT targetsizey = cbrt(atoms_in_cell / atom_density) * (MD_FLOAT)SCLUSTER_SIZE_Y;
#else
MD_FLOAT targetsizex = cbrt(atoms_in_cell / atom_density); MD_FLOAT targetsizex = cbrt(atoms_in_cell / atom_density);
MD_FLOAT targetsizey = cbrt(atoms_in_cell / atom_density); MD_FLOAT targetsizey = cbrt(atoms_in_cell / atom_density);
#endif
nbinx = MAX(1, (int)ceil((xhi - xlo) / targetsizex)); nbinx = MAX(1, (int)ceil((xhi - xlo) / targetsizex));
nbiny = MAX(1, (int)ceil((yhi - ylo) / targetsizey)); nbiny = MAX(1, (int)ceil((yhi - ylo) / targetsizey));
binsizex = (xhi - xlo) / nbinx; binsizex = (xhi - xlo) / nbinx;
@@ -186,43 +189,29 @@ int atomDistanceInRange(Atom *atom, int ci, int cj, MD_FLOAT rsq) {
return 0; return 0;
} }
/* Returns a diagonal or off-diagonal interaction mask for plain C lists */ int atomDistanceInRangeGPU(Atom *atom, int sci, int cj, MD_FLOAT rsq) {
static unsigned int get_imask(int rdiag, int ci, int cj) { for (int ci = 0; ci < atom->siclusters[sci].nclusters; ci++) {
return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL); const int icluster_idx = atom->icluster_idx[SCLUSTER_SIZE * sci + ci];
int ci_vec_base = CI_VECTOR_BASE_INDEX(icluster_idx);
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
for(int cii = 0; cii < atom->iclusters[icluster_idx].natoms; cii++) {
for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
MD_FLOAT delx = ci_x[CL_X_OFFSET + cii] - cj_x[CL_X_OFFSET + cjj];
MD_FLOAT dely = ci_x[CL_Y_OFFSET + cii] - cj_x[CL_Y_OFFSET + cjj];
MD_FLOAT delz = ci_x[CL_Z_OFFSET + cii] - cj_x[CL_Z_OFFSET + cjj];
if(delx * delx + dely * dely + delz * delz < rsq) {
return 1;
}
}
}
} }
/* Returns a diagonal or off-diagonal interaction mask for cj-size=2 */ return 0;
static unsigned int get_imask_simd_j2(int rdiag, int ci, int cj) {
return (rdiag && ci * 2 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_0
: (rdiag && ci * 2 + 1 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_1
: NBNXN_INTERACTION_MASK_ALL));
} }
/* Returns a diagonal or off-diagonal interaction mask for cj-size=4 */
static unsigned int get_imask_simd_j4(int rdiag, int ci, int cj) {
return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
}
/* Returns a diagonal or off-diagonal interaction mask for cj-size=8 */
static unsigned int get_imask_simd_j8(int rdiag, int ci, int cj) {
return (rdiag && ci == cj * 2 ? NBNXN_INTERACTION_MASK_DIAG_J8_0
: (rdiag && ci == cj * 2 + 1 ? NBNXN_INTERACTION_MASK_DIAG_J8_1
: NBNXN_INTERACTION_MASK_ALL));
}
#if VECTOR_WIDTH == 2
# define get_imask_simd_4xn get_imask_simd_j2
#elif VECTOR_WIDTH== 4
# define get_imask_simd_4xn get_imask_simd_j4
#elif VECTOR_WIDTH == 8
# define get_imask_simd_4xn get_imask_simd_j8
# define get_imask_simd_2xnn get_imask_simd_j4
#elif VECTOR_WIDTH == 16
# define get_imask_simd_2xnn get_imask_simd_j8
#else
# error "Invalid cluster configuration"
#endif
void buildNeighbor(Atom *atom, Neighbor *neighbor) { void buildNeighbor(Atom *atom, Neighbor *neighbor) {
DEBUG_MESSAGE("buildNeighbor start\n"); DEBUG_MESSAGE("buildNeighbor start\n");
@@ -230,13 +219,9 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
if(atom->Nclusters_local > nmax) { if(atom->Nclusters_local > nmax) {
nmax = atom->Nclusters_local; nmax = atom->Nclusters_local;
if(neighbor->numneigh) free(neighbor->numneigh); if(neighbor->numneigh) free(neighbor->numneigh);
if(neighbor->numneigh_masked) free(neighbor->numneigh_masked);
if(neighbor->neighbors) free(neighbor->neighbors); if(neighbor->neighbors) free(neighbor->neighbors);
if(neighbor->neighbors_imask) free(neighbor->neighbors_imask);
neighbor->numneigh = (int*) malloc(nmax * sizeof(int)); neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
neighbor->numneigh_masked = (int*) malloc(nmax * sizeof(int)); neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*));
neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int));
neighbor->neighbors_imask = (unsigned int*) malloc(nmax * neighbor->maxneighs * sizeof(unsigned int));
} }
MD_FLOAT bbx = 0.5 * (binsizex + binsizex); MD_FLOAT bbx = 0.5 * (binsizex + binsizex);
@@ -253,8 +238,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
for(int ci = 0; ci < atom->Nclusters_local; ci++) { for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_cj1 = CJ1_FROM_CI(ci); int ci_cj1 = CJ1_FROM_CI(ci);
int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]); int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
unsigned int *neighptr_imask = &(neighbor->neighbors_imask[ci * neighbor->maxneighs]); int n = 0;
int n = 0, nmasked = 0;
int ibin = atom->icluster_bin[ci]; int ibin = atom->icluster_bin[ci];
MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx; MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx;
MD_FLOAT ibb_xmax = atom->iclusters[ci].bbmaxx; MD_FLOAT ibb_xmax = atom->iclusters[ci].bbmaxx;
@@ -319,30 +303,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
if(d_bb_sq < cutneighsq) { if(d_bb_sq < cutneighsq) {
if(d_bb_sq < rbb_sq || atomDistanceInRange(atom, ci, cj, cutneighsq)) { if(d_bb_sq < rbb_sq || atomDistanceInRange(atom, ci, cj, cutneighsq)) {
// We use true (1) for rdiag because we only care if there are masks neighptr[n++] = cj;
// at all, and when this is set to false (0) the self-exclusions are
// not accounted for, which makes the optimized version to not work!
unsigned int imask;
#if CLUSTER_N == (VECTOR_WIDTH / 2) // 2xnn
imask = get_imask_simd_2xnn(1, ci, cj);
#else // 4xn
imask = get_imask_simd_4xn(1, ci, cj);
#endif
if(n < neighbor->maxneighs) {
if(imask == NBNXN_INTERACTION_MASK_ALL) {
neighptr[n] = cj;
neighptr_imask[n] = imask;
} else {
neighptr[n] = neighptr[nmasked];
neighptr_imask[n] = neighptr_imask[nmasked];
neighptr[nmasked] = cj;
neighptr_imask[nmasked] = imask;
nmasked++;
}
}
n++;
} }
} }
} }
@@ -364,14 +325,11 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
// Fill neighbor list with dummy values to fit vector width // Fill neighbor list with dummy values to fit vector width
if(CLUSTER_N < VECTOR_WIDTH) { if(CLUSTER_N < VECTOR_WIDTH) {
while(n % (VECTOR_WIDTH / CLUSTER_N)) { while(n % (VECTOR_WIDTH / CLUSTER_N)) {
neighptr[n] = atom->dummy_cj; // Last cluster is always a dummy cluster neighptr[n++] = atom->dummy_cj; // Last cluster is always a dummy cluster
neighptr_imask[n] = 0;
n++;
} }
} }
neighbor->numneigh[ci] = n; neighbor->numneigh[ci] = n;
neighbor->numneigh_masked[ci] = nmasked;
if(n >= neighbor->maxneighs) { if(n >= neighbor->maxneighs) {
resize = 1; resize = 1;
@@ -382,12 +340,10 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
} }
if(resize) { if(resize) {
neighbor->maxneighs = new_maxneighs * 1.2;
fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs); fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
neighbor->maxneighs = new_maxneighs * 1.2;
free(neighbor->neighbors); free(neighbor->neighbors);
free(neighbor->neighbors_imask); neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
neighbor->neighbors = (int *) malloc(nmax * neighbor->maxneighs * sizeof(int));
neighbor->neighbors_imask = (unsigned int *) malloc(nmax * neighbor->maxneighs * sizeof(unsigned int));
} }
} }
@@ -436,6 +392,189 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
DEBUG_MESSAGE("buildNeighbor end\n"); DEBUG_MESSAGE("buildNeighbor end\n");
} }
#ifdef USE_SUPER_CLUSTERS
// TODO For future parallelization on GPU
/* Build per-super-cluster neighbor lists (GPU / super-cluster layout).
 * For each local super-cluster sci, scan the bin stencil around its bin and
 * append every j-cluster whose bounding box lies within the neighbor cutoff
 * of the super-cluster bounding box. Lists live in
 * neighbor->neighbors[sci * maxneighs]; maxneighs grows (and all lists are
 * rebuilt) whenever any list overflows. */
void buildNeighborGPU(Atom *atom, Neighbor *neighbor) {
    DEBUG_MESSAGE("buildNeighborGPU start\n");

    /* extend atom arrays if necessary */
    if(atom->Nsclusters_local > nmax) {
        nmax = atom->Nsclusters_local;
        if(neighbor->numneigh) free(neighbor->numneigh);
        if(neighbor->neighbors) free(neighbor->neighbors);
        neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
        // NOTE(review): entries are int, so sizeof(int*) over-allocates (harmless
        // on 64-bit but presumably unintended) — confirm and switch to sizeof(int)
        neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*));
    }

    // rbb_sq: squared "inner" distance below which a bounding-box hit is
    // accepted without the exact per-atom distance check
    MD_FLOAT bbx = 0.5 * (binsizex + binsizex);
    MD_FLOAT bby = 0.5 * (binsizey + binsizey);
    MD_FLOAT rbb_sq = MAX(0.0, cutneigh - 0.5 * sqrt(bbx * bbx + bby * bby));
    rbb_sq = rbb_sq * rbb_sq;
    int resize = 1;

    /* loop over each atom, storing neighbors */
    while(resize) {
        int new_maxneighs = neighbor->maxneighs;
        resize = 0;

        for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
            int ci_cj1 = CJ1_FROM_SCI(sci);    // first j-cluster of this super-cluster (half-neigh ordering test)
            int *neighptr = &(neighbor->neighbors[sci * neighbor->maxneighs]);
            int n = 0;                         // neighbors found for sci so far
            int ibin = atom->sicluster_bin[sci];
            // Super-cluster bounding box
            MD_FLOAT ibb_xmin = atom->siclusters[sci].bbminx;
            MD_FLOAT ibb_xmax = atom->siclusters[sci].bbmaxx;
            MD_FLOAT ibb_ymin = atom->siclusters[sci].bbminy;
            MD_FLOAT ibb_ymax = atom->siclusters[sci].bbmaxy;
            MD_FLOAT ibb_zmin = atom->siclusters[sci].bbminz;
            MD_FLOAT ibb_zmax = atom->siclusters[sci].bbmaxz;

            for(int k = 0; k < nstencil; k++) {
                int jbin = ibin + stencil[k];
                int *loc_bin = &bin_clusters[jbin * clusters_per_bin];
                int cj, m = -1;
                MD_FLOAT jbb_xmin, jbb_xmax, jbb_ymin, jbb_ymax, jbb_zmin, jbb_zmax;
                const int c = bin_nclusters[jbin];

                if(c > 0) {
                    MD_FLOAT dl, dh, dm, dm0, d_bb_sq;

                    // Skip leading j-clusters that are too far away in z.
                    do {
                        m++;
                        cj = loc_bin[m];
                        // NOTE(review): this 'continue' jumps to the loop condition,
                        // where d_bb_sq may still be uninitialized on the first
                        // iteration with half_neigh — confirm against the CPU variant
                        if(neighbor->half_neigh && ci_cj1 > cj) {
                            continue;
                        }
                        jbb_zmin = atom->jclusters[cj].bbminz;
                        jbb_zmax = atom->jclusters[cj].bbmaxz;
                        dl = ibb_zmin - jbb_zmax;
                        dh = jbb_zmin - ibb_zmax;
                        dm = MAX(dl, dh);
                        dm0 = MAX(dm, 0.0);
                        d_bb_sq = dm0 * dm0;
                    } while(m + 1 < c && d_bb_sq > cutneighsq);

                    jbb_xmin = atom->jclusters[cj].bbminx;
                    jbb_xmax = atom->jclusters[cj].bbmaxx;
                    jbb_ymin = atom->jclusters[cj].bbminy;
                    jbb_ymax = atom->jclusters[cj].bbmaxy;

                    // Walk the remaining j-clusters of this bin.
                    while(m < c) {
                        if(!neighbor->half_neigh || ci_cj1 <= cj) {
                            // Accumulate squared box-box distance, axis by axis.
                            dl = ibb_zmin - jbb_zmax;
                            dh = jbb_zmin - ibb_zmax;
                            dm = MAX(dl, dh);
                            dm0 = MAX(dm, 0.0);
                            d_bb_sq = dm0 * dm0;

                            /*if(d_bb_sq > cutneighsq) {
                                break;
                            }*/

                            dl = ibb_ymin - jbb_ymax;
                            dh = jbb_ymin - ibb_ymax;
                            dm = MAX(dl, dh);
                            dm0 = MAX(dm, 0.0);
                            d_bb_sq += dm0 * dm0;

                            dl = ibb_xmin - jbb_xmax;
                            dh = jbb_xmin - ibb_xmax;
                            dm = MAX(dl, dh);
                            dm0 = MAX(dm, 0.0);
                            d_bb_sq += dm0 * dm0;

                            if(d_bb_sq < cutneighsq) {
                                // Close hit: accept directly; borderline hit: check atoms.
                                if(d_bb_sq < rbb_sq || atomDistanceInRangeGPU(atom, sci, cj, cutneighsq)) {
                                    neighptr[n++] = cj;
                                }
                            }
                        }

                        m++;
                        if(m < c) {
                            cj = loc_bin[m];
                            jbb_xmin = atom->jclusters[cj].bbminx;
                            jbb_xmax = atom->jclusters[cj].bbmaxx;
                            jbb_ymin = atom->jclusters[cj].bbminy;
                            jbb_ymax = atom->jclusters[cj].bbmaxy;
                            jbb_zmin = atom->jclusters[cj].bbminz;
                            jbb_zmax = atom->jclusters[cj].bbmaxz;
                        }
                    }
                }
            }

            // Fill neighbor list with dummy values to fit vector width
            if(CLUSTER_N < VECTOR_WIDTH) {
                while(n % (VECTOR_WIDTH / CLUSTER_N)) {
                    neighptr[n++] = atom->dummy_cj; // Last cluster is always a dummy cluster
                }
            }

            neighbor->numneigh[sci] = n;
            if(n >= neighbor->maxneighs) {
                resize = 1;
                if(n >= new_maxneighs) {
                    new_maxneighs = n;
                }
            }
        }

        if(resize) {
            // NOTE(review): prints the pre-resize capacity; also the realloc is
            // sized by atom->Nmax (atom count) rather than nmax (super-cluster
            // count) — confirm which is intended
            fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
            neighbor->maxneighs = new_maxneighs * 1.2;
            free(neighbor->neighbors);
            neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
        }
    }

    /* (debugging aid kept from the original)
    DEBUG_MESSAGE("\ncutneighsq = %f, rbb_sq = %f\n", cutneighsq, rbb_sq);
    for(int ci = 0; ci < 6; ci++) {
        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
        MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
        int* neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);

        DEBUG_MESSAGE("Cluster %d, bbx = {%f, %f}, bby = {%f, %f}, bbz = {%f, %f}\n",
            ci,
            atom->iclusters[ci].bbminx,
            atom->iclusters[ci].bbmaxx,
            atom->iclusters[ci].bbminy,
            atom->iclusters[ci].bbmaxy,
            atom->iclusters[ci].bbminz,
            atom->iclusters[ci].bbmaxz);

        for(int cii = 0; cii < CLUSTER_M; cii++) {
            DEBUG_MESSAGE("%f, %f, %f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
        }

        DEBUG_MESSAGE("Neighbors:\n");
        for(int k = 0; k < neighbor->numneigh[ci]; k++) {
            int cj = neighptr[k];
            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];

            DEBUG_MESSAGE("    Cluster %d, bbx = {%f, %f}, bby = {%f, %f}, bbz = {%f, %f}\n",
                cj,
                atom->jclusters[cj].bbminx,
                atom->jclusters[cj].bbmaxx,
                atom->jclusters[cj].bbminy,
                atom->jclusters[cj].bbmaxy,
                atom->jclusters[cj].bbminz,
                atom->jclusters[cj].bbmaxz);

            for(int cjj = 0; cjj < CLUSTER_N; cjj++) {
                DEBUG_MESSAGE("    %f, %f, %f\n", cj_x[CL_X_OFFSET + cjj], cj_x[CL_Y_OFFSET + cjj], cj_x[CL_Z_OFFSET + cjj]);
            }
        }
    }
    */

    DEBUG_MESSAGE("buildNeighborGPU end\n");
}
#endif //USE_SUPER_CLUSTERS
void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) { void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
DEBUG_MESSAGE("pruneNeighbor start\n"); DEBUG_MESSAGE("pruneNeighbor start\n");
//MD_FLOAT cutsq = param->cutforce * param->cutforce; //MD_FLOAT cutsq = param->cutforce * param->cutforce;
@@ -443,9 +582,7 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
for(int ci = 0; ci < atom->Nclusters_local; ci++) { for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int *neighs = &neighbor->neighbors[ci * neighbor->maxneighs]; int *neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
unsigned int *neighs_imask = &neighbor->neighbors_imask[ci * neighbor->maxneighs];
int numneighs = neighbor->numneigh[ci]; int numneighs = neighbor->numneigh[ci];
int numneighs_masked = neighbor->numneigh_masked[ci];
int k = 0; int k = 0;
// Remove dummy clusters if necessary // Remove dummy clusters if necessary
@@ -461,9 +598,6 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
k++; k++;
} else { } else {
numneighs--; numneighs--;
if(k < numneighs_masked) {
numneighs_masked--;
}
neighs[k] = neighs[numneighs]; neighs[k] = neighs[numneighs];
} }
} }
@@ -471,19 +605,63 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
// Readd dummy clusters if necessary // Readd dummy clusters if necessary
if(CLUSTER_N < VECTOR_WIDTH) { if(CLUSTER_N < VECTOR_WIDTH) {
while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) { while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) {
neighs[numneighs] = atom->dummy_cj; // Last cluster is always a dummy cluster neighs[numneighs++] = atom->dummy_cj; // Last cluster is always a dummy cluster
neighs_imask[numneighs] = 0;
numneighs++;
} }
} }
neighbor->numneigh[ci] = numneighs; neighbor->numneigh[ci] = numneighs;
neighbor->numneigh_masked[ci] = numneighs_masked;
} }
DEBUG_MESSAGE("pruneNeighbor end\n"); DEBUG_MESSAGE("pruneNeighbor end\n");
} }
#ifdef USE_SUPER_CLUSTERS
/* Prune the neighbor list of every local super-cluster: drop j-clusters that
 * have drifted beyond the neighbor cutoff since the last rebuild.
 * A j-cluster is kept if it is still within cutsq of ANY i-cluster of the
 * super-cluster, mirroring the acceptance test used by buildNeighborGPU
 * (atomDistanceInRangeGPU). Dummy padding clusters are stripped before
 * pruning and re-appended afterwards to keep the list length a multiple of
 * the vector width.
 *
 * Fixes vs. previous version: the inner per-icluster loop initialized
 * 'ci' from itself (reading an uninitialized variable, undefined behavior)
 * and pruned the shared list once per i-cluster, which kept only j-clusters
 * in range of ALL i-clusters (intersection) instead of ANY (union). */
void pruneNeighborGPU(Parameter *param, Atom *atom, Neighbor *neighbor) {
    DEBUG_MESSAGE("pruneNeighbor start\n");
    //MD_FLOAT cutsq = param->cutforce * param->cutforce;
    MD_FLOAT cutsq = cutneighsq;

    for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
        int *neighs = &neighbor->neighbors[sci * neighbor->maxneighs];
        int numneighs = neighbor->numneigh[sci];
        int k = 0;

        // Remove dummy clusters if necessary
        if(CLUSTER_N < VECTOR_WIDTH) {
            while(neighs[numneighs - 1] == atom->dummy_cj) {
                numneighs--;
            }
        }

        // Compact the list in place: keep cj while any i-cluster of sci
        // still sees it; otherwise overwrite it with the current tail.
        while(k < numneighs) {
            int cj = neighs[k];
            if(atomDistanceInRangeGPU(atom, sci, cj, cutsq)) {
                k++;
            } else {
                numneighs--;
                neighs[k] = neighs[numneighs];
            }
        }

        // Readd dummy clusters if necessary
        if(CLUSTER_N < VECTOR_WIDTH) {
            while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) {
                neighs[numneighs++] = atom->dummy_cj; // Last cluster is always a dummy cluster
            }
        }

        neighbor->numneigh[sci] = numneighs;
    }

    DEBUG_MESSAGE("pruneNeighbor end\n");
}
#endif //USE_SUPER_CLUSTERS
/* internal subroutines */ /* internal subroutines */
MD_FLOAT bindist(int i, int j) { MD_FLOAT bindist(int i, int j) {
MD_FLOAT delx, dely, delz; MD_FLOAT delx, dely, delz;
@@ -609,6 +787,36 @@ void sortAtomsByZCoord(Atom *atom) {
DEBUG_MESSAGE("sortAtomsByZCoord end\n"); DEBUG_MESSAGE("sortAtomsByZCoord end\n");
} }
#ifdef USE_SUPER_CLUSTERS
// TODO: Use pigeonhole sorting
/* Sort the atom indices of one bin (inclusive range [start_index, end_index]
 * within bins[bin * atoms_per_bin]) in ascending order of the coordinate
 * selected by 'dim' (via DIM_COORD). In-place selection sort: O(n^2), fine
 * for the small per-bin ranges this is called with. */
void sortAtomsByCoord(Atom *atom, int dim, int bin, int start_index, int end_index) {
    //DEBUG_MESSAGE("sortAtomsByCoord start\n");
    int *indices = &bins[bin * atoms_per_bin];

    for(int pos = start_index; pos <= end_index; pos++) {
        int best_pos = pos;
        int best_atom = indices[pos];
        MD_FLOAT best_coord = DIM_COORD(dim, best_atom);

        // Find the smallest remaining coordinate in [pos+1, end_index].
        for(int probe = pos + 1; probe <= end_index; probe++) {
            int candidate = indices[probe];
            MD_FLOAT candidate_coord = DIM_COORD(dim, candidate);
            if(candidate_coord < best_coord) {
                best_pos = probe;
                best_atom = candidate;
                best_coord = candidate_coord;
            }
        }

        // Swap the minimum into the current slot.
        indices[best_pos] = indices[pos];
        indices[pos] = best_atom;
    }

    //DEBUG_MESSAGE("sortAtomsByCoord end\n");
}
#endif //USE_SUPER_CLUSTERS
void buildClusters(Atom *atom) { void buildClusters(Atom *atom) {
DEBUG_MESSAGE("buildClusters start\n"); DEBUG_MESSAGE("buildClusters start\n");
atom->Nclusters_local = 0; atom->Nclusters_local = 0;
@@ -685,6 +893,153 @@ void buildClusters(Atom *atom) {
DEBUG_MESSAGE("buildClusters end\n"); DEBUG_MESSAGE("buildClusters end\n");
} }
#ifdef USE_SUPER_CLUSTERS
/* Build i-clusters grouped into super-clusters (GPU layout).
 * Atoms of each bin are sorted along Z, then each z-slab along Y and each
 * y-row along X, so that consecutive runs of CLUSTER_M atoms form spatially
 * compact clusters. SCLUSTER_SIZE_X*Y*Z such clusters form one super-cluster;
 * positions/velocities are written both to the per-cluster arrays
 * (cl_x/cl_v) and the packed super-cluster arrays (scl_x/scl_v).
 * Bounding boxes are maintained per cluster and per super-cluster.
 *
 * Fixes vs. previous version (wrong whenever SCLUSTER_SIZE_X != _Y != _Z):
 *  - the y-row atom offset used stride SCLUSTER_SIZE_Y * CLUSTER_M although a
 *    row holds SCLUSTER_SIZE_X clusters (the end-index computation one line
 *    below already used SCLUSTER_SIZE_X);
 *  - cluster_sup_idx linearized as z*(Z*Y) + y*X + x although a z-layer holds
 *    Y*X clusters; now row-major z*(Y*X) + y*X + x. */
void buildClustersGPU(Atom *atom) {
    DEBUG_MESSAGE("buildClustersGPU start\n");
    atom->Nclusters_local = 0;
    atom->Nsclusters_local = 0;

    /* bin local atoms */
    binAtoms(atom);

    for(int bin = 0; bin < mbins; bin++) {
        int c = bincount[bin];
        sortAtomsByCoord(atom, ZZ, bin, 0, c - 1);

        int ac = 0;   // next unassigned atom slot within this bin
        int nclusters = ((c + CLUSTER_M - 1) / CLUSTER_M);
        if(CLUSTER_N > CLUSTER_M && nclusters % 2) { nclusters++; }

        const int supercluster_size = SCLUSTER_SIZE_X * SCLUSTER_SIZE_Y * SCLUSTER_SIZE_Z;
        int nsclusters = ((nclusters + supercluster_size - 1) / supercluster_size);

        for(int scl = 0; scl < nsclusters; scl++) {
            const int sci = atom->Nsclusters_local;
            if(sci >= atom->Nsclusters_max) {
                growSuperClusters(atom);
            }

            // First atom index (within this bin) belonging to this super-cluster
            int scl_offset = scl * SCLUSTER_SIZE * CLUSTER_M;
            MD_FLOAT sc_bbminx = INFINITY, sc_bbmaxx = -INFINITY;
            MD_FLOAT sc_bbminy = INFINITY, sc_bbmaxy = -INFINITY;
            MD_FLOAT sc_bbminz = INFINITY, sc_bbmaxz = -INFINITY;
            atom->siclusters[sci].nclusters = 0;

            for(int scl_z = 0; scl_z < SCLUSTER_SIZE_Z; scl_z++) {
                // One z-slab holds SCLUSTER_SIZE_Y * SCLUSTER_SIZE_X clusters
                const int atom_scl_z_offset = scl_offset + scl_z * SCLUSTER_SIZE_Y * SCLUSTER_SIZE_X * CLUSTER_M;
                const int atom_scl_z_end_idx = MIN(atom_scl_z_offset + SCLUSTER_SIZE_Y * SCLUSTER_SIZE_X * CLUSTER_M - 1, c - 1);

                sortAtomsByCoord(atom, YY, bin, atom_scl_z_offset, atom_scl_z_end_idx);

                for(int scl_y = 0; scl_y < SCLUSTER_SIZE_Y; scl_y++) {
                    // One y-row holds SCLUSTER_SIZE_X clusters; stride fixed to
                    // SCLUSTER_SIZE_X * CLUSTER_M to match the row length used
                    // for atom_scl_y_end_idx below
                    const int atom_scl_y_offset = scl_offset + scl_z * SCLUSTER_SIZE_Y * SCLUSTER_SIZE_X * CLUSTER_M + scl_y * SCLUSTER_SIZE_X * CLUSTER_M;
                    const int atom_scl_y_end_idx = MIN(atom_scl_y_offset + SCLUSTER_SIZE_X * CLUSTER_M - 1, c - 1);

                    sortAtomsByCoord(atom, XX, bin, atom_scl_y_offset, atom_scl_y_end_idx);

                    for(int scl_x = 0; scl_x < SCLUSTER_SIZE_X; scl_x++) {
                        // Row-major slot of this cluster inside the super-cluster
                        const int cluster_sup_idx = scl_z * SCLUSTER_SIZE_Y * SCLUSTER_SIZE_X + scl_y * SCLUSTER_SIZE_X + scl_x;

                        const int ci = atom->Nclusters_local;
                        if(ci >= atom->Nclusters_max) {
                            growClusters(atom);
                        }

                        int ci_sca_base = CI_SCALAR_BASE_INDEX(ci);
                        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
                        MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
                        MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];

                        int sci_sca_base = SCI_SCALAR_BASE_INDEX(sci);
                        int sci_vec_base = SCI_VECTOR_BASE_INDEX(sci);
                        MD_FLOAT *sci_x = &atom->scl_x[sci_vec_base];
                        MD_FLOAT *sci_v = &atom->scl_v[sci_vec_base];

                        int *ci_type = &atom->cl_type[ci_sca_base];
                        MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
                        MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
                        MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;

                        atom->iclusters[ci].natoms = 0;
                        for(int cii = 0; cii < CLUSTER_M; cii++) {
                            if(ac < c) {
                                // Copy atom into both the cluster and super-cluster layouts
                                int i = bins[bin * atoms_per_bin + ac];
                                MD_FLOAT xtmp = atom_x(i);
                                MD_FLOAT ytmp = atom_y(i);
                                MD_FLOAT ztmp = atom_z(i);

                                ci_x[CL_X_OFFSET + cii] = xtmp;
                                ci_x[CL_Y_OFFSET + cii] = ytmp;
                                ci_x[CL_Z_OFFSET + cii] = ztmp;
                                ci_v[CL_X_OFFSET + cii] = atom->vx[i];
                                ci_v[CL_Y_OFFSET + cii] = atom->vy[i];
                                ci_v[CL_Z_OFFSET + cii] = atom->vz[i];

                                sci_x[SCL_CL_X_OFFSET(atom->siclusters[sci].nclusters) + cii] = xtmp;
                                sci_x[SCL_CL_Y_OFFSET(atom->siclusters[sci].nclusters) + cii] = ytmp;
                                sci_x[SCL_CL_Z_OFFSET(atom->siclusters[sci].nclusters) + cii] = ztmp;
                                sci_v[SCL_CL_X_OFFSET(atom->siclusters[sci].nclusters) + cii] = atom->vx[i];
                                sci_v[SCL_CL_Y_OFFSET(atom->siclusters[sci].nclusters) + cii] = atom->vy[i];
                                sci_v[SCL_CL_Z_OFFSET(atom->siclusters[sci].nclusters) + cii] = atom->vz[i];

                                // TODO: To create the bounding boxes faster, we can use SIMD operations
                                if(bbminx > xtmp) { bbminx = xtmp; }
                                if(bbmaxx < xtmp) { bbmaxx = xtmp; }
                                if(bbminy > ytmp) { bbminy = ytmp; }
                                if(bbmaxy < ytmp) { bbmaxy = ytmp; }
                                if(bbminz > ztmp) { bbminz = ztmp; }
                                if(bbmaxz < ztmp) { bbmaxz = ztmp; }

                                ci_type[cii] = atom->type[i];
                                atom->iclusters[ci].natoms++;
                            } else {
                                // Pad partially-filled clusters with INFINITY sentinels
                                ci_x[CL_X_OFFSET + cii] = INFINITY;
                                ci_x[CL_Y_OFFSET + cii] = INFINITY;
                                ci_x[CL_Z_OFFSET + cii] = INFINITY;

                                sci_x[SCL_CL_X_OFFSET(atom->siclusters[sci].nclusters) + cii] = INFINITY;
                                sci_x[SCL_CL_Y_OFFSET(atom->siclusters[sci].nclusters) + cii] = INFINITY;
                                sci_x[SCL_CL_Z_OFFSET(atom->siclusters[sci].nclusters) + cii] = INFINITY;
                            }

                            ac++;
                        }

                        atom->icluster_bin[ci] = bin;
                        atom->iclusters[ci].bbminx = bbminx;
                        atom->iclusters[ci].bbmaxx = bbmaxx;
                        atom->iclusters[ci].bbminy = bbminy;
                        atom->iclusters[ci].bbmaxy = bbmaxy;
                        atom->iclusters[ci].bbminz = bbminz;
                        atom->iclusters[ci].bbmaxz = bbmaxz;
                        atom->Nclusters_local++;

                        // TODO: To create the bounding boxes faster, we can use SIMD operations
                        if(sc_bbminx > bbminx) { sc_bbminx = bbminx; }
                        if(sc_bbmaxx < bbmaxx) { sc_bbmaxx = bbmaxx; }
                        if(sc_bbminy > bbminy) { sc_bbminy = bbminy; }
                        if(sc_bbmaxy < bbmaxy) { sc_bbmaxy = bbmaxy; }
                        if(sc_bbminz > bbminz) { sc_bbminz = bbminz; }
                        if(sc_bbmaxz < bbmaxz) { sc_bbmaxz = bbmaxz; }

                        atom->siclusters[sci].nclusters++;
                        atom->icluster_idx[SCLUSTER_SIZE * sci + cluster_sup_idx] = ci;
                        //atom->siclusters[sci].iclusters[cluster_sup_idx] = ci;
                    }
                }
            }

            atom->sicluster_bin[sci] = bin;
            atom->siclusters[sci].bbminx = sc_bbminx;
            atom->siclusters[sci].bbmaxx = sc_bbmaxx;
            atom->siclusters[sci].bbminy = sc_bbminy;
            atom->siclusters[sci].bbmaxy = sc_bbmaxy;
            atom->siclusters[sci].bbminz = sc_bbminz;
            atom->siclusters[sci].bbmaxz = sc_bbmaxz;
            atom->Nsclusters_local++;
        }
    }

    DEBUG_MESSAGE("buildClustersGPU end\n");
}
#endif //USE_SUPER_CLUSTERS
void defineJClusters(Atom *atom) { void defineJClusters(Atom *atom) {
DEBUG_MESSAGE("defineJClusters start\n"); DEBUG_MESSAGE("defineJClusters start\n");

View File

@@ -86,6 +86,98 @@ void cpuUpdatePbc(Atom *atom, Parameter *param, int firstUpdate) {
DEBUG_MESSAGE("updatePbc end\n"); DEBUG_MESSAGE("updatePbc end\n");
} }
/* update coordinates of ghost atoms */
/* uses mapping created in setupPbc */
/* update coordinates of ghost atoms (super-cluster / GPU layout) */
/* uses mapping created in setupPbc: ghost j-cluster cg mirrors the local
 * cluster atom->border_map[cg], shifted by PBCx/y/z box lengths. Positions
 * are written to both the per-cluster array (cl_x) and the packed
 * super-cluster array (scl_x). On the first update, bounding boxes of the
 * new ghost clusters are computed and padding slots are set to INFINITY. */
void gpuUpdatePbc(Atom *atom, Parameter *param, int firstUpdate) {
    DEBUG_MESSAGE("gpuUpdatePbc start\n");
    int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
    int ncj = atom->Nclusters_local / jfac;   // first ghost j-cluster index
    MD_FLOAT xprd = param->xprd;
    MD_FLOAT yprd = param->yprd;
    MD_FLOAT zprd = param->zprd;

    for(int cg = 0; cg < atom->Nclusters_ghost; cg++) {
        const int cj = ncj + cg;
        int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
        int scj_vec_base = SCJ_VECTOR_BASE_INDEX(cj);
        int bmap_vec_base = CJ_VECTOR_BASE_INDEX(atom->border_map[cg]);
        int sbmap_vec_base = SCJ_VECTOR_BASE_INDEX(atom->border_map[cg]);
        MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
        MD_FLOAT *bmap_x = &atom->cl_x[bmap_vec_base];
        MD_FLOAT *scj_x = &atom->scl_x[scj_vec_base];
        MD_FLOAT *sbmap_x = &atom->scl_x[sbmap_vec_base];
        MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
        MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
        MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;
        // NOTE(review): these super-cluster-layout bounds are accumulated
        // below but never stored anywhere — presumably WIP; confirm
        MD_FLOAT sbbminx = INFINITY, sbbmaxx = -INFINITY;
        MD_FLOAT sbbminy = INFINITY, sbbmaxy = -INFINITY;
        MD_FLOAT sbbminz = INFINITY, sbbmaxz = -INFINITY;

        for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
            // Shift source-cluster coordinates by the periodic image offsets
            MD_FLOAT xtmp = bmap_x[CL_X_OFFSET + cjj] + atom->PBCx[cg] * xprd;
            MD_FLOAT ytmp = bmap_x[CL_Y_OFFSET + cjj] + atom->PBCy[cg] * yprd;
            MD_FLOAT ztmp = bmap_x[CL_Z_OFFSET + cjj] + atom->PBCz[cg] * zprd;
            // NOTE(review): sbmap_x is indexed with CL_*_OFFSET rather than the
            // SCL_* offsets used when writing — confirm the intended layout
            MD_FLOAT sxtmp = sbmap_x[CL_X_OFFSET + cjj] + atom->PBCx[cg] * xprd;
            MD_FLOAT sytmp = sbmap_x[CL_Y_OFFSET + cjj] + atom->PBCy[cg] * yprd;
            MD_FLOAT sztmp = sbmap_x[CL_Z_OFFSET + cjj] + atom->PBCz[cg] * zprd;

            cj_x[CL_X_OFFSET + cjj] = xtmp;
            cj_x[CL_Y_OFFSET + cjj] = ytmp;
            cj_x[CL_Z_OFFSET + cjj] = ztmp;

            scj_x[SCL_X_OFFSET + cjj] = sxtmp;
            scj_x[SCL_Y_OFFSET + cjj] = sytmp;
            scj_x[SCL_Z_OFFSET + cjj] = sztmp;

            if(firstUpdate) {
                // TODO: To create the bounding boxes faster, we can use SIMD operations
                if(bbminx > xtmp) { bbminx = xtmp; }
                if(bbmaxx < xtmp) { bbmaxx = xtmp; }
                if(bbminy > ytmp) { bbminy = ytmp; }
                if(bbmaxy < ytmp) { bbmaxy = ytmp; }
                if(bbminz > ztmp) { bbminz = ztmp; }
                if(bbmaxz < ztmp) { bbmaxz = ztmp; }

                if(sbbminx > sxtmp) { sbbminx = sxtmp; }
                if(sbbmaxx < sxtmp) { sbbmaxx = sxtmp; }
                if(sbbminy > sytmp) { sbbminy = sytmp; }
                if(sbbmaxy < sytmp) { sbbmaxy = sytmp; }
                if(sbbminz > sztmp) { sbbminz = sztmp; }
                if(sbbmaxz < sztmp) { sbbmaxz = sztmp; }
            }
        }

        if(firstUpdate) {
            // Pad the remaining cluster slots with INFINITY sentinels
            for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) {
                cj_x[CL_X_OFFSET + cjj] = INFINITY;
                cj_x[CL_Y_OFFSET + cjj] = INFINITY;
                cj_x[CL_Z_OFFSET + cjj] = INFINITY;

                scj_x[SCL_X_OFFSET + cjj] = INFINITY;
                scj_x[SCL_Y_OFFSET + cjj] = INFINITY;
                scj_x[SCL_Z_OFFSET + cjj] = INFINITY;
            }

            atom->jclusters[cj].bbminx = bbminx;
            atom->jclusters[cj].bbmaxx = bbmaxx;
            atom->jclusters[cj].bbminy = bbminy;
            atom->jclusters[cj].bbmaxy = bbmaxy;
            atom->jclusters[cj].bbminz = bbminz;
            atom->jclusters[cj].bbmaxz = bbmaxz;
        }
    }

    DEBUG_MESSAGE("gpuUpdatePbc end\n");
}
/* relocate atoms that have left domain according /* relocate atoms that have left domain according
* to periodic boundary conditions */ * to periodic boundary conditions */
void updateAtomsPbc(Atom *atom, Parameter *param) { void updateAtomsPbc(Atom *atom, Parameter *param) {
@@ -229,3 +321,91 @@ void setupPbc(Atom *atom, Parameter *param) {
cpuUpdatePbc(atom, param, 1); cpuUpdatePbc(atom, param, 1);
DEBUG_MESSAGE("setupPbc end\n"); DEBUG_MESSAGE("setupPbc end\n");
} }
/* Create ghost clusters for periodic boundary conditions (super-cluster /
 * GPU layout). For every non-empty local j-cluster whose bounding box lies
 * within cutneigh of a periodic face, edge or corner, the ADDGHOST macro
 * records a ghost copy (border_map + PBC shift); a trailing dummy cluster
 * filled with INFINITY terminates the list. Positions of the new ghosts are
 * then materialized via gpuUpdatePbc(..., 1). */
void setupPbcGPU(Atom *atom, Parameter *param) {
    DEBUG_MESSAGE("setupPbcGPU start\n");
    MD_FLOAT xprd = param->xprd;
    MD_FLOAT yprd = param->yprd;
    MD_FLOAT zprd = param->zprd;
    MD_FLOAT Cutneigh = param->cutneigh;
    //int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
    // NOTE(review): j-cluster count derived from super-clusters here, unlike
    // the CPU path which divides Nclusters_local by jfac — confirm intended
    int jfac = SCLUSTER_M / CLUSTER_M;
    int ncj = atom->Nsclusters_local * jfac;
    int Nghost = -1;          // index of last ghost cluster added (so count = Nghost + 1)
    int Nghost_atoms = 0;     // total atoms in ghost clusters (updated by ADDGHOST)

    for(int cj = 0; cj < ncj; cj++) {
        if(atom->jclusters[cj].natoms > 0) {
            // Grow storage with headroom: one cj can spawn several ghosts
            // (the +7 slack presumably covers one round of ADDGHOSTs — confirm)
            if(atom->Nsclusters_local + (Nghost + (jfac - 1) + 7) / jfac >= atom->Nclusters_max) {
                growClusters(atom);
                //growSuperClusters(atom);
            }

            if((Nghost + 7) * CLUSTER_M >= NmaxGhost) {
                growPbc(atom);
            }

            MD_FLOAT bbminx = atom->jclusters[cj].bbminx;
            MD_FLOAT bbmaxx = atom->jclusters[cj].bbmaxx;
            MD_FLOAT bbminy = atom->jclusters[cj].bbminy;
            MD_FLOAT bbmaxy = atom->jclusters[cj].bbmaxy;
            MD_FLOAT bbminz = atom->jclusters[cj].bbminz;
            MD_FLOAT bbmaxz = atom->jclusters[cj].bbmaxz;

            /* Setup ghost atoms */
            /* 6 planes */
            if (bbminx < Cutneigh)         { ADDGHOST(+1,0,0); }
            if (bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,0,0); }
            if (bbminy < Cutneigh)         { ADDGHOST(0,+1,0); }
            if (bbmaxy >= (yprd-Cutneigh)) { ADDGHOST(0,-1,0); }
            if (bbminz < Cutneigh)         { ADDGHOST(0,0,+1); }
            if (bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,0,-1); }
            /* 8 corners */
            if (bbminx < Cutneigh && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,+1,+1); }
            if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(+1,-1,+1); }
            if (bbminx < Cutneigh && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); }
            if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,-1,-1); }
            if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(-1,+1,+1); }
            if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,-1,+1); }
            if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,+1,-1); }
            if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,-1,-1); }
            /* 12 edges */
            if (bbminx < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,0,+1); }
            if (bbminx < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,0,-1); }
            if (bbmaxx >= (xprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,0,+1); }
            if (bbmaxx >= (xprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,0,-1); }
            if (bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(0,+1,+1); }
            if (bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,+1,-1); }
            if (bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(0,-1,+1); }
            if (bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,-1,-1); }
            if (bbminy < Cutneigh && bbminx < Cutneigh) { ADDGHOST(+1,+1,0); }
            if (bbminy < Cutneigh && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,+1,0); }
            if (bbmaxy >= (yprd-Cutneigh) && bbminx < Cutneigh) { ADDGHOST(+1,-1,0); }
            if (bbmaxy >= (yprd-Cutneigh) && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,-1,0); }
        }
    }

    // Make sure there is room for the trailing dummy cluster
    if(ncj + (Nghost + (jfac - 1) + 1) / jfac >= atom->Nclusters_max) {
        growClusters(atom);
        //growSuperClusters(atom);
    }

    // Add dummy cluster at the end
    int cj_vec_base = CJ_VECTOR_BASE_INDEX(ncj + Nghost + 1);
    MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
    for(int cjj = 0; cjj < CLUSTER_N; cjj++) {
        cj_x[CL_X_OFFSET + cjj] = INFINITY;
        cj_x[CL_Y_OFFSET + cjj] = INFINITY;
        cj_x[CL_Z_OFFSET + cjj] = INFINITY;
    }

    // increase by one to make it the ghost atom count
    atom->dummy_cj = ncj + Nghost + 1;
    atom->Nghost = Nghost_atoms;
    atom->Nclusters_ghost = Nghost + 1;
    atom->Nclusters = atom->Nclusters_local + Nghost + 1;

    // Update created ghost clusters positions
    gpuUpdatePbc(atom, param, 1);
    DEBUG_MESSAGE("setupPbcGPU end\n");
}

View File

@@ -14,7 +14,6 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
INDEX_TRACER_INIT; INDEX_TRACER_INIT;
int Nlocal = atom->Nlocal; int Nlocal = atom->Nlocal;
int* neighs; int* neighs;
unsigned int *neighs_imask;
//MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz; //MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs); INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
@@ -35,8 +34,7 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
DIST_TRACE(neighs, numneighs); DIST_TRACE(neighs, numneighs);
for(int k = 0; k < numneighs; k++) { for(int k = 0; k < numneighs; k++) {
int j = neighs[k]; MEM_TRACE(neighs[k], 'R');
MEM_TRACE(j, 'R');
MEM_TRACE(atom_x(j), 'R'); MEM_TRACE(atom_x(j), 'R');
MEM_TRACE(atom_y(j), 'R'); MEM_TRACE(atom_y(j), 'R');
MEM_TRACE(atom_z(j), 'R'); MEM_TRACE(atom_z(j), 'R');

332
gromacs/utils.c Normal file
View File

@@ -0,0 +1,332 @@
/*
* Temporal functions for debugging, remove before proceeding with pull request
*/
#include <stdio.h>
#include <stdlib.h>
#include <utils.h>
extern void alignDataToSuperclusters(Atom *atom);
extern void alignDataFromSuperclusters(Atom *atom);
#ifdef USE_SUPER_CLUSTERS
/*
void verifyClusters(Atom *atom) {
unsigned int count = 0;
for (int i = 0; i < atom->Nsclusters_local; i++) {
for (int j = 0; j < atom->siclusters[i].nclusters; j++) {
for(int cii = 0; cii < CLUSTER_M; cii++, count++);
}
}
MD_FLOAT *x = malloc(count * sizeof(MD_FLOAT));
MD_FLOAT *y = malloc(count * sizeof(MD_FLOAT));
MD_FLOAT *z = malloc(count * sizeof(MD_FLOAT));
count = 0;
unsigned int diffs = 0;
printf("######### %d #########\r\n", atom->Nsclusters_local);
for (int i = 0; i < atom->Nsclusters_local; i++) {
printf("######### %d\t #########\r\n", atom->siclusters[i].nclusters);
for (int j = 0; j < atom->siclusters[i].nclusters; j++) {
//printf("%d\t", atom.siclusters[i].iclusters[j]);
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->siclusters[i].iclusters[j])];
if (atom->iclusters[atom->siclusters[i].iclusters[j]].bbminx < atom->siclusters[i].bbminx ||
atom->iclusters[atom->siclusters[i].iclusters[j]].bbmaxx > atom->siclusters[i].bbmaxx ||
atom->iclusters[atom->siclusters[i].iclusters[j]].bbminy < atom->siclusters[i].bbminy ||
atom->iclusters[atom->siclusters[i].iclusters[j]].bbmaxy > atom->siclusters[i].bbmaxy ||
atom->iclusters[atom->siclusters[i].iclusters[j]].bbminz < atom->siclusters[i].bbminz ||
atom->iclusters[atom->siclusters[i].iclusters[j]].bbmaxz > atom->siclusters[i].bbmaxz) diffs++;
for(int cii = 0; cii < CLUSTER_M; cii++, count++) {
x[count] = ci_x[CL_X_OFFSET + cii];
y[count] = ci_x[CL_Y_OFFSET + cii];
z[count] = ci_x[CL_Z_OFFSET + cii];
//printf("x: %f\ty: %f\tz: %f\r\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
}
}
printf("######### \t #########\r\n");
}
printf("######### Diffs: %d\t #########\r\n", diffs);
printf("\r\n");
count = 0;
diffs = 0;
for (int i = 0; i < atom->Nclusters_local; i++) {
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(i)];
for(int cii = 0; cii < CLUSTER_M; cii++, count++) {
if (ci_x[CL_X_OFFSET + cii] != x[count] ||
ci_x[CL_Y_OFFSET + cii] != y[count] ||
ci_x[CL_Z_OFFSET + cii] != z[count]) diffs++;
}
}
printf("######### Diffs: %d\t #########\r\n", diffs);
}
*/
/* Debug helper: dump a small sample of super-cluster and regular-cluster
 * data to stdout so the two memory layouts can be compared by eye.
 * Most of the body consists of older experiments that were disabled by
 * commenting them out.
 * NOTE(review): the active loops read atom->scl_f / atom->cl_f (force
 * arrays) although the local pointers are named *_x — presumably this was
 * intentional for checking forces; confirm against the caller. */
void verifyLayout(Atom *atom) {
printf("verifyLayout start\r\n");
/*
unsigned int count = 0;
for (int i = 0; i < atom->Nsclusters_local; i++) {
for (int j = 0; j < atom->siclusters[i].nclusters; j++, count++);
}
MD_FLOAT *scl_x = malloc(atom->Nsclusters_local * SCLUSTER_SIZE * 3 * CLUSTER_M * sizeof(MD_FLOAT));
for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
const unsigned int scl_offset = sci * SCLUSTER_SIZE * 3 * CLUSTER_M;
for (int ci = 0, scci = scl_offset; ci < atom->siclusters[sci].nclusters; ci++, scci += CLUSTER_M) {
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
const unsigned int atom_offset = scci;
/*
for(int cii = 0, scii = atom_offset; cii < CLUSTER_M; cii++, scii += 3) {
scl_x[CL_X_OFFSET + scii] = ci_x[CL_X_OFFSET + cii];
scl_x[CL_Y_OFFSET + scii] = ci_x[CL_Y_OFFSET + cii];
scl_x[CL_Z_OFFSET + scii] = ci_x[CL_Z_OFFSET + cii];
//printf("x: %f\ty: %f\tz: %f\r\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
}
memcpy(&scl_x[atom_offset], &ci_x[0], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&scl_x[atom_offset + SCLUSTER_SIZE * CLUSTER_M], &ci_x[0 + CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&scl_x[atom_offset + 2 * SCLUSTER_SIZE * CLUSTER_M], &ci_x[0 + 2 * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
}
}
*/
//alignDataToSuperclusters(atom);
// Active part 1: print two sample super-clusters (indices 4 and 5).  For
// every atom slot, print the stored cluster index, the cluster-in-super
// index, and the three per-axis values read from atom->scl_f.
//for (int sci = 0; sci < 2; sci++) {
for (int sci = 4; sci < 6; sci++) {
const unsigned int scl_offset = sci * SCLUSTER_SIZE;
MD_FLOAT *sci_x = &atom->scl_f[SCI_VECTOR_BASE_INDEX(sci)];
for (int cii = 0; cii < SCLUSTER_M; ++cii) {
const unsigned int cl_idx = cii / CLUSTER_M; // which cluster inside the super-cluster
const unsigned int ciii = cii % CLUSTER_M; // atom lane inside that cluster
/*
printf("%d\t%f\t%f\t%f\r\n", cl_idx, sci_x[cii],
sci_x[cii + SCLUSTER_SIZE * CLUSTER_M], sci_x[cii + 2 * SCLUSTER_SIZE * CLUSTER_M]);
*/
printf("%d\t%d\t%f\t%f\t%f\r\n", atom->icluster_idx[SCLUSTER_SIZE * sci + cl_idx], cl_idx, sci_x[SCL_CL_X_OFFSET(cl_idx) + ciii],
sci_x[SCL_CL_Y_OFFSET(cl_idx) + ciii], sci_x[SCL_CL_Z_OFFSET(cl_idx) + ciii]);
}
// NOTE(review): C block comments do not nest; the disabled regions below
// are therefore a *sequence* of separate comments that only happens to
// end up balanced.  Edit with care.
/*
//for (int cii = 0; cii < SCLUSTER_M; ++cii) {
for (int cii = 0; cii < SCLUSTER_M; ++cii) {
const unsigned int cl_idx = cii / CLUSTER_M;
const unsigned int ciii = cii % CLUSTER_M;
/*
printf("%d\t%f\t%f\t%f\r\n", cl_idx, sci_x[SCL_X_OFFSET(cl_idx) + cii],
sci_x[SCL_Y_OFFSET(cl_idx) + cii], sci_x[SCL_Z_OFFSET(cl_idx) + cii]);
*/
/*
printf("%d\t%f\t%f\t%f\r\n", cl_idx, sci_x[SCL_X_OFFSET(cl_idx) + ciii],
sci_x[SCL_Y_OFFSET(cl_idx) + ciii], sci_x[SCL_Z_OFFSET(cl_idx) + ciii]);
}
*/
/*
for (int scii = scl_offset; scii < scl_offset + SCLUSTER_SIZE; scii++) {
for (int cii = 0; cii < CLUSTER_M; ++cii) {
printf("%f\t%f\t%f\r\n", sci_x[SCL_X_OFFSET(scii) + cii],
sci_x[SCL_Y_OFFSET(scii) + cii], sci_x[SCL_Z_OFFSET(scii) + cii]);
}
/*
const unsigned int cl_offset = scii * 3 * CLUSTER_M;
//MD_FLOAT *sci_x = &scl_x[CI_VECTOR_BASE_INDEX(scii)];
for (int cii = cl_offset; cii < cl_offset + CLUSTER_M; ++cii) {
printf("%f\t%f\t%f\r\n", sci_x[CL_X_OFFSET + cii],
sci_x[CL_Y_OFFSET + cii], sci_x[CL_Z_OFFSET + cii]);
}
*/
/*
for (int cii = cl_offset; cii < cl_offset + CLUSTER_M; ++cii) {
printf("%f\t%f\t%f\r\n", scl_x[CL_X_OFFSET + cii],
scl_x[CL_Y_OFFSET + cii], scl_x[CL_Z_OFFSET + cii]);
}
*/
//}
printf("##########\t##########\r\n");
}
printf("\r\n");
// Active part 2: print two sample regular clusters (35 and 36) from the
// cl_f array for comparison with the super-cluster dump above.
//for (int ci = 0; ci < 16; ci++) {
for (int ci = 35; ci < 37; ci++) {
printf("$$$$$$$$$$\t%d\t%d\t$$$$$$$$$$\r\n", ci, atom->icluster_bin[ci]);
MD_FLOAT *ci_x = &atom->cl_f[CI_VECTOR_BASE_INDEX(ci)];
//for(int cii = 0; cii < CLUSTER_M; cii++, count++) {
for(int cii = 0; cii < CLUSTER_M; cii++) {
printf("%f\t%f\t%f\r\n", ci_x[CL_X_OFFSET + cii],
ci_x[CL_Y_OFFSET + cii],
ci_x[CL_Z_OFFSET + cii]);
}
printf("##########\t##########\r\n");
}
printf("verifyLayout end\r\n");
/*
for (int i = 0; i < atom->Nclusters_local; i++) {
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(i)];
for(int cii = 0; cii < CLUSTER_M; cii++, count++) {
if (ci_x[CL_X_OFFSET + cii] != x[count] ||
ci_x[CL_Y_OFFSET + cii] != y[count] ||
ci_x[CL_Z_OFFSET + cii] != z[count]) diffs++;
}
}
*/
}
/* Debug helper: realign the cluster data into the super-cluster buffers and
 * dump two sample super-clusters (4, 5) plus two sample regular clusters
 * (35, 36) so both position layouts can be compared side by side on stdout. */
void checkAlignment(Atom *atom) {
    alignDataToSuperclusters(atom);

    /* Super-cluster view: positions read from atom->scl_x. */
    for (int s = 4; s < 6; s++) {
        MD_FLOAT *base = &atom->scl_x[SCI_VECTOR_BASE_INDEX(s)];

        for (int slot = 0; slot < SCLUSTER_M; ++slot) {
            const unsigned int cl = slot / CLUSTER_M;   /* cluster within the super-cluster */
            const unsigned int lane = slot % CLUSTER_M; /* atom lane within that cluster */

            printf("%d\t%f\t%f\t%f\r\n", cl,
                   base[SCL_CL_X_OFFSET(cl) + lane],
                   base[SCL_CL_Y_OFFSET(cl) + lane],
                   base[SCL_CL_Z_OFFSET(cl) + lane]);
        }
    }

    /* Regular-cluster view: positions read from atom->cl_x. */
    for (int c = 35; c < 37; c++) {
        printf("$$$$$$$$$$\t%d\t%d\t$$$$$$$$$$\r\n", c, atom->icluster_bin[c]);
        MD_FLOAT *cx = &atom->cl_x[CI_VECTOR_BASE_INDEX(c)];

        for (int lane = 0; lane < CLUSTER_M; ++lane) {
            printf("%f\t%f\t%f\r\n",
                   cx[CL_X_OFFSET + lane],
                   cx[CL_Y_OFFSET + lane],
                   cx[CL_Z_OFFSET + lane]);
        }

        printf("##########\t##########\r\n");
    }
}
/* Debug helper: print the position data of two sample super-clusters (4, 5)
 * from atom->scl_x — one line per atom slot with its cluster index and the
 * three coordinate values. */
void showSuperclusters(Atom *atom) {
    for (int s = 4; s < 6; s++) {
        MD_FLOAT *base = &atom->scl_x[SCI_VECTOR_BASE_INDEX(s)];

        for (int slot = 0; slot < SCLUSTER_M; ++slot) {
            const unsigned int cl = slot / CLUSTER_M;   /* cluster within the super-cluster */
            const unsigned int lane = slot % CLUSTER_M; /* atom lane within that cluster */

            printf("%d\t%f\t%f\t%f\r\n", cl,
                   base[SCL_CL_X_OFFSET(cl) + lane],
                   base[SCL_CL_Y_OFFSET(cl) + lane],
                   base[SCL_CL_Z_OFFSET(cl) + lane]);
        }
    }
}
/* Debug helper: print every local cluster's neighbor list, one cluster per
 * line, entries separated by spaces. */
void printNeighs(Atom *atom, Neighbor *neighbor) {
    for (int ci = 0; ci < atom->Nclusters_local; ci++) {
        const int *list = &neighbor->neighbors[ci * neighbor->maxneighs];
        const int count = neighbor->numneigh[ci];

        for (int k = 0; k < count; ++k) {
            printf("%d ", list[k]);
        }

        printf("\r\n");
    }
}
/* Debug helper: print the cluster indices assigned to each local
 * super-cluster, one super-cluster per line, entries separated by spaces. */
void printClusterIndices(Atom *atom) {
    for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
        const int members = atom->siclusters[sci].nclusters;

        for (int k = 0; k < members; ++k) {
            printf("%d ", atom->icluster_idx[k + SCLUSTER_SIZE * sci]);
        }

        printf("\r\n");
    }
}
/* Debug helper: cross-check the reference and GPU neighbor-list builders.
 * Runs buildNeighbor(), snapshots the resulting counts and lists, clears
 * them, runs buildNeighborGPU(), and prints the number of clusters whose
 * neighbor count differs followed by the number of differing list entries
 * ("0\t0" means the two builders agree).
 * Fixes over the previous version: the neighbors buffer was allocated with
 * sizeof(int*) instead of sizeof(int) (wrong element size, worked only by
 * accident on 64-bit hosts), both buffers were leaked, and allocation
 * failures were not checked. */
void verifyNeigh(Atom *atom, Neighbor *neighbor) {
    buildNeighbor(atom, neighbor);

    int *numneigh = (int*) malloc(atom->Nclusters_local * sizeof(int));
    int *neighbors = (int*) malloc((size_t) atom->Nclusters_local * neighbor->maxneighs * sizeof(int));
    if (numneigh == NULL || neighbors == NULL) {
        fprintf(stderr, "verifyNeigh: allocation failed!\n");
        free(numneigh);
        free(neighbors);
        return;
    }

    // Snapshot the reference lists, then zero them so the GPU build cannot
    // trivially match by reusing stale data.
    for (int i = 0; i < atom->Nclusters_local; ++i) {
        int neigh_num = neighbor->numneigh[i];
        numneigh[i] = neigh_num;
        neighbor->numneigh[i] = 0;
        for (int j = 0; j < neigh_num; j++) {
            neighbors[i * neighbor->maxneighs + j] = neighbor->neighbors[i * neighbor->maxneighs + j];
            neighbor->neighbors[i * neighbor->maxneighs + j] = 0;
        }
    }

    buildNeighborGPU(atom, neighbor);

    // Compare the GPU result against the snapshot.
    unsigned int num_diff = 0;   // clusters with a different neighbor count
    unsigned int neigh_diff = 0; // individual list entries that differ
    for (int i = 0; i < atom->Nclusters_local; ++i) {
        int neigh_num = neighbor->numneigh[i];
        if (numneigh[i] != neigh_num) num_diff++;
        for (int j = 0; j < neigh_num; j++) {
            if (neighbors[i * neighbor->maxneighs + j] !=
                neighbor->neighbors[i * neighbor->maxneighs + j]) neigh_diff++;
        }
    }
    printf("%d\t%d\r\n", num_diff, neigh_diff);

    free(numneigh);
    free(neighbors);
}
#endif //USE_SUPER_CLUSTERS

View File

@@ -15,8 +15,61 @@ void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep) {
write_ghost_atoms_to_vtk_file(filename, atom, timestep); write_ghost_atoms_to_vtk_file(filename, atom, timestep);
write_local_cluster_edges_to_vtk_file(filename, atom, timestep); write_local_cluster_edges_to_vtk_file(filename, atom, timestep);
write_ghost_cluster_edges_to_vtk_file(filename, atom, timestep); write_ghost_cluster_edges_to_vtk_file(filename, atom, timestep);
#ifdef USE_SUPER_CLUSTERS
write_super_clusters_to_vtk_file(filename, atom, timestep);
#endif //#ifdef USE_SUPER_CLUSTERS
} }
#ifdef USE_SUPER_CLUSTERS
int write_super_clusters_to_vtk_file(const char* filename, Atom* atom, int timestep) {
char timestep_filename[128];
snprintf(timestep_filename, sizeof timestep_filename, "%s_sup_%d.vtk", filename, timestep);
FILE* fp = fopen(timestep_filename, "wb");
if(fp == NULL) {
fprintf(stderr, "Could not open VTK file for writing!\n");
return -1;
}
fprintf(fp, "# vtk DataFile Version 2.0\n");
fprintf(fp, "Particle data\n");
fprintf(fp, "ASCII\n");
fprintf(fp, "DATASET UNSTRUCTURED_GRID\n");
fprintf(fp, "POINTS %d double\n", atom->Nsclusters_local * SCLUSTER_M);
for(int ci = 0; ci < atom->Nsclusters_local; ++ci) {
int factor = (rand() % 1000) + 1;
//double factor = ci * 10;
int ci_vec_base = SCI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->scl_x[ci_vec_base];
for(int cii = 0; cii < SCLUSTER_M; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", ci_x[SCL_X_OFFSET + cii] * factor, ci_x[SCL_Y_OFFSET + cii] * factor, ci_x[SCL_Z_OFFSET + cii] * factor);
}
}
fprintf(fp, "\n\n");
fprintf(fp, "CELLS %d %d\n", atom->Nlocal, atom->Nlocal * 2);
for(int i = 0; i < atom->Nlocal; ++i) {
fprintf(fp, "1 %d\n", i);
}
fprintf(fp, "\n\n");
fprintf(fp, "CELL_TYPES %d\n", atom->Nlocal);
for(int i = 0; i < atom->Nlocal; ++i) {
fprintf(fp, "1\n");
}
fprintf(fp, "\n\n");
fprintf(fp, "POINT_DATA %d\n", atom->Nlocal);
fprintf(fp, "SCALARS mass double\n");
fprintf(fp, "LOOKUP_TABLE default\n");
for(int i = 0; i < atom->Nlocal; i++) {
fprintf(fp, "1.0\n");
}
fprintf(fp, "\n\n");
fclose(fp);
return 0;
}
#endif //USE_SUPER_CLUSTERS
int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) { int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
char timestep_filename[128]; char timestep_filename[128];
snprintf(timestep_filename, sizeof timestep_filename, "%s_local_%d.vtk", filename, timestep); snprintf(timestep_filename, sizeof timestep_filename, "%s_local_%d.vtk", filename, timestep);

View File

@@ -7,7 +7,6 @@ ANSI_CFLAGS += -pedantic
ANSI_CFLAGS += -Wextra ANSI_CFLAGS += -Wextra
CFLAGS = -Ofast -march=native $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g CFLAGS = -Ofast -march=native $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
#CFLAGS = -Ofast -march=core-avx2 $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
#CFLAGS = -O3 -march=cascadelake $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g #CFLAGS = -O3 -march=cascadelake $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
#CFLAGS = -Ofast $(ANSI_CFLAGS) -g #-Xpreprocessor -fopenmp -g #CFLAGS = -Ofast $(ANSI_CFLAGS) -g #-Xpreprocessor -fopenmp -g
ASFLAGS = -masm=intel ASFLAGS = -masm=intel

View File

@@ -6,29 +6,13 @@ ANSI_CFLAGS += -std=c99
ANSI_CFLAGS += -pedantic ANSI_CFLAGS += -pedantic
ANSI_CFLAGS += -Wextra ANSI_CFLAGS += -Wextra
ifeq ($(ISA),AVX512)
CFLAGS = -Ofast -mavx512f -mavx512vl -mavx512bw -mavx512dq -mavx512cd -ffast-math -funroll-loops # -fopenmp
#CFLAGS = -O3 -march=cascadelake -ffast-math -funroll-loops # -fopenmp
endif
ifeq ($(ISA),AVX2)
#CFLAGS = -Ofast -march=native -mavx2 -ffast-math -funroll-loops # -fopenmp
#CFLAGS = -O3 -march=znver1 -ffast-math -funroll-loops # -fopenmp
#CFLAGS = -Ofast -mavx2 -ffast-math -funroll-loops # -fopenmp
CFLAGS = -Ofast -mavx2 -mfma -ffast-math -funroll-loops # -fopenmp
endif
ifeq ($(ISA),AVX)
CFLAGS = -Ofast -mavx -ffast-math -funroll-loops # -fopenmp
endif
ifeq ($(ISA),SSE)
CFLAGS = -Ofast -msse4.2 -ffast-math -funroll-loops # -fopenmp
endif
#CFLAGS = -O0 -g -std=c99 -fargument-noalias #CFLAGS = -O0 -g -std=c99 -fargument-noalias
#CFLAGS = -O3 -march=cascadelake -ffast-math -funroll-loops # -fopenmp
CFLAGS = -Ofast -march=native -ffast-math -funroll-loops # -fopenmp
#CFLAGS = -Ofast -march=native -mavx2 -ffast-math -funroll-loops # -fopenmp
#CFLAGS = -Ofast -march=native -ffast-math -funroll-loops # -fopenmp #CFLAGS = -Ofast -march=native -ffast-math -funroll-loops # -fopenmp
#CFLAGS = -O3 -march=native -ffast-math -funroll-loops # -fopenmp #CFLAGS = -O3 -march=native -ffast-math -funroll-loops # -fopenmp
#CFLAGS = -O3 -march=znver1 -ffast-math -funroll-loops # -fopenmp
ASFLAGS = #-masm=intel ASFLAGS = #-masm=intel
LFLAGS = LFLAGS =
DEFINES = -D_GNU_SOURCE -DNO_ZMM_INTRIN DEFINES = -D_GNU_SOURCE -DNO_ZMM_INTRIN

View File

@@ -1,27 +1,13 @@
CC = icc CC = icc
LINKER = $(CC) LINKER = $(CC)
OPENMP = -qopenmp OPENMP = #-qopenmp
PROFILE = #-profile-functions -g -pg PROFILE = #-profile-functions -g -pg
ifeq ($(ISA),AVX512)
OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE) OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
endif #OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
#OPTS = -Ofast -xAVX $(PROFILE)
ifeq ($(ISA),AVX2)
OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
#OPTS = -Ofast -xAVX2 $(PROFILE) #OPTS = -Ofast -xAVX2 $(PROFILE)
#OPTS = -Ofast -march=core-avx2 $(PROFILE) #OPTS = -Ofast -xSSE4.2 $(PROFILE)
endif
ifeq ($(ISA),AVX)
OPTS = -Ofast -xAVX $(PROFILE)
endif
ifeq ($(ISA),SSE)
OPTS = -Ofast -xSSE4.2 $(PROFILE)
endif
#OPTS = -Ofast -no-vec $(PROFILE) #OPTS = -Ofast -no-vec $(PROFILE)
#OPTS = -Ofast -xHost $(PROFILE) #OPTS = -Ofast -xHost $(PROFILE)
CFLAGS = $(PROFILE) -restrict $(OPENMP) $(OPTS) CFLAGS = $(PROFILE) -restrict $(OPENMP) $(OPTS)

View File

@@ -3,28 +3,13 @@ LINKER = $(CC)
OPENMP = #-qopenmp OPENMP = #-qopenmp
PROFILE = #-profile-functions -g -pg PROFILE = #-profile-functions -g -pg
#OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
ifeq ($(ISA),AVX512) #OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE) #OPTS = -Ofast -xAVX $(PROFILE)
#OPTS = -Ofast -march=cascadelake -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE) #OPTS = -Ofast -xAVX2 $(PROFILE)
endif #OPTS = -Ofast -xSSE4.2 $(PROFILE)
ifeq ($(ISA),AVX2)
OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
#OPTS = -Ofast -xHost $(PROFILE)
#OPTS = -Ofast -march=core-avx2 $(PROFILE)
endif
ifeq ($(ISA),AVX)
OPTS = -Ofast -xAVX $(PROFILE)
endif
ifeq ($(ISA),SSE)
OPTS = -Ofast -xSSE4.2 $(PROFILE)
endif
#OPTS = -Ofast -no-vec $(PROFILE) #OPTS = -Ofast -no-vec $(PROFILE)
#OPTS = -Ofast -xHost $(PROFILE) OPTS = -Ofast -xHost $(PROFILE)
CFLAGS = $(PROFILE) $(OPENMP) $(OPTS) CFLAGS = $(PROFILE) $(OPENMP) $(OPTS)
ASFLAGS = #-masm=intel ASFLAGS = #-masm=intel
LFLAGS = $(PROFILE) $(OPTS) $(OPENMP) LFLAGS = $(PROFILE) $(OPTS) $(OPENMP)

View File

@@ -9,15 +9,13 @@ else ifeq ($(strip $(ISA)), AVX_FMA)
__ISA_AVX_FMA__=true __ISA_AVX_FMA__=true
__SIMD_WIDTH_DBL__=4 __SIMD_WIDTH_DBL__=4
else ifeq ($(strip $(ISA)), AVX2) else ifeq ($(strip $(ISA)), AVX2)
#__SIMD_KERNEL__=true
__ISA_AVX2__=true __ISA_AVX2__=true
#__SIMD_KERNEL__=true
__SIMD_WIDTH_DBL__=4 __SIMD_WIDTH_DBL__=4
else ifeq ($(strip $(ISA)), AVX512) else ifeq ($(strip $(ISA)), AVX512)
__ISA_AVX512__=true __ISA_AVX512__=true
__SIMD_WIDTH_DBL__=8
ifeq ($(strip $(DATA_TYPE)), DP)
__SIMD_KERNEL__=true __SIMD_KERNEL__=true
endif __SIMD_WIDTH_DBL__=8
endif endif
# SIMD width is specified in double-precision, hence it may # SIMD width is specified in double-precision, hence it may

View File

@@ -8,7 +8,8 @@ ANSI_CFLAGS += -Wextra
# #
# A100 + Native # A100 + Native
CFLAGS = -O3 -arch=sm_80 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp #CFLAGS = -O3 -arch=sm_80 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
CFLAGS = -O3 -arch=compute_61 -code=sm_61,sm_80,sm_86 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
# A40 + Native # A40 + Native
#CFLAGS = -O3 -arch=sm_86 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp #CFLAGS = -O3 -arch=sm_86 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
# Cascade Lake # Cascade Lake

View File

@@ -502,21 +502,6 @@ int readAtom_in(Atom* atom, Parameter* param) {
return natoms; return natoms;
} }
void writeAtom(Atom *atom, Parameter *param) {
FILE *fp = fopen(param->write_atom_file, "w");
for(int i = 0; i < atom->Nlocal; i++) {
fprintf(fp, "%d,%f,%f,%f,%f,%f,%f,%f,0\n",
atom->type[i], 1.0,
atom_x(i), atom_y(i), atom_z(i),
atom_vx(i), atom_vy(i), atom_vz(i));
}
fclose(fp);
fprintf(stdout, "Wrote input data to %s, grid size: %f, %f, %f\n",
param->write_atom_file, param->xprd, param->yprd, param->zprd);
}
void growAtom(Atom *atom) { void growAtom(Atom *atom) {
DeviceAtom *d_atom = &(atom->d_atom); DeviceAtom *d_atom = &(atom->d_atom);
int nold = atom->Nmax; int nold = atom->Nmax;

View File

@@ -29,7 +29,7 @@ extern "C" {
} }
// cuda kernel // cuda kernel
__global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh, int ntypes) { __global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh) {
const int i = blockIdx.x * blockDim.x + threadIdx.x; const int i = blockIdx.x * blockDim.x + threadIdx.x;
if(i >= Nlocal) { if(i >= Nlocal) {
return; return;
@@ -46,10 +46,6 @@ __global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, M
MD_FLOAT fiy = 0; MD_FLOAT fiy = 0;
MD_FLOAT fiz = 0; MD_FLOAT fiz = 0;
#ifdef EXPLICIT_TYPES
const int type_i = atom->type[i];
#endif
for(int k = 0; k < numneighs; k++) { for(int k = 0; k < numneighs; k++) {
int j = neigh_neighbors[Nlocal * k + i]; int j = neigh_neighbors[Nlocal * k + i];
MD_FLOAT delx = xtmp - atom_x(j); MD_FLOAT delx = xtmp - atom_x(j);
@@ -59,7 +55,7 @@ __global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, M
#ifdef EXPLICIT_TYPES #ifdef EXPLICIT_TYPES
const int type_j = atom->type[j]; const int type_j = atom->type[j];
const int type_ij = type_i * ntypes + type_j; const int type_ij = type_i * atom->ntypes + type_j;
const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij]; const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
const MD_FLOAT sigma6 = atom->sigma6[type_ij]; const MD_FLOAT sigma6 = atom->sigma6[type_ij];
const MD_FLOAT epsilon = atom->epsilon[type_ij]; const MD_FLOAT epsilon = atom->epsilon[type_ij];
@@ -113,7 +109,7 @@ extern "C" {
void finalIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) { void finalIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
const int Nlocal = atom->Nlocal; const int Nlocal = atom->Nlocal;
const int num_threads_per_block = get_cuda_num_threads(); const int num_threads_per_block = get_num_threads();
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block); const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, atom->d_atom); kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, atom->d_atom);
@@ -127,7 +123,7 @@ void finalIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) { void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
const int Nlocal = atom->Nlocal; const int Nlocal = atom->Nlocal;
const int num_threads_per_block = get_cuda_num_threads(); const int num_threads_per_block = get_num_threads();
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block); const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, atom->d_atom); kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, atom->d_atom);
@@ -140,11 +136,13 @@ void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
} }
double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neighbor) { double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neighbor) {
const int num_threads_per_block = get_cuda_num_threads(); const int num_threads_per_block = get_num_threads();
int Nlocal = atom->Nlocal; int Nlocal = atom->Nlocal;
#ifndef EXPLICIT_TYPES
MD_FLOAT cutforcesq = param->cutforce * param->cutforce; MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6; MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon; MD_FLOAT epsilon = param->epsilon;
#endif
/* /*
int nDevices; int nDevices;
@@ -167,7 +165,7 @@ double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neig
double S = getTimeStamp(); double S = getTimeStamp();
LIKWID_MARKER_START("force"); LIKWID_MARKER_START("force");
calc_force <<< num_blocks, num_threads_per_block >>> (atom->d_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, neighbor->d_neighbor.neighbors, neighbor->d_neighbor.numneigh, atom->ntypes); calc_force <<< num_blocks, num_threads_per_block >>> (atom->d_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, neighbor->d_neighbor.neighbors, neighbor->d_neighbor.numneigh);
cuda_assert("calc_force", cudaPeekAtLastError()); cuda_assert("calc_force", cudaPeekAtLastError());
cuda_assert("calc_force", cudaDeviceSynchronize()); cuda_assert("calc_force", cudaDeviceSynchronize());
cudaProfilerStop(); cudaProfilerStop();

View File

@@ -120,7 +120,7 @@ __global__ void binatoms_kernel(DeviceAtom a, int nall, int* bincount, int* bins
__global__ void compute_neighborhood( __global__ void compute_neighborhood(
DeviceAtom a, DeviceNeighbor neigh, Neighbor_params np, int nlocal, int maxneighs, int nstencil, int* stencil, DeviceAtom a, DeviceNeighbor neigh, Neighbor_params np, int nlocal, int maxneighs, int nstencil, int* stencil,
int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq, int ntypes) { int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq) {
const int i = blockIdx.x * blockDim.x + threadIdx.x; const int i = blockIdx.x * blockDim.x + threadIdx.x;
if(i >= nlocal) { if(i >= nlocal) {
@@ -157,7 +157,7 @@ __global__ void compute_neighborhood(
#ifdef EXPLICIT_TYPES #ifdef EXPLICIT_TYPES
int type_j = atom->type[j]; int type_j = atom->type[j];
const MD_FLOAT cutoff = atom->cutneighsq[type_i * ntypes + type_j]; const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j];
#else #else
const MD_FLOAT cutoff = cutneighsq; const MD_FLOAT cutoff = cutneighsq;
#endif #endif
@@ -206,7 +206,7 @@ void binatoms_cuda(Atom *atom, Binning *c_binning, int *c_resize_needed, Neighbo
void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor) { void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor) {
DeviceNeighbor *d_neighbor = &(neighbor->d_neighbor); DeviceNeighbor *d_neighbor = &(neighbor->d_neighbor);
const int num_threads_per_block = get_cuda_num_threads(); const int num_threads_per_block = get_num_threads();
int nall = atom->Nlocal + atom->Nghost; int nall = atom->Nlocal + atom->Nghost;
cudaProfilerStart(); cudaProfilerStart();
@@ -269,7 +269,7 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor) {
np, atom->Nlocal, neighbor->maxneighs, nstencil, c_stencil, np, atom->Nlocal, neighbor->maxneighs, nstencil, c_stencil,
c_binning.bins, c_binning.atoms_per_bin, c_binning.bincount, c_binning.bins, c_binning.atoms_per_bin, c_binning.bincount,
c_new_maxneighs, c_new_maxneighs,
cutneighsq, atom->ntypes); cutneighsq);
cuda_assert("compute_neighborhood", cudaPeekAtLastError()); cuda_assert("compute_neighborhood", cudaPeekAtLastError());
cuda_assert("compute_neighborhood", cudaDeviceSynchronize()); cuda_assert("compute_neighborhood", cudaDeviceSynchronize());

View File

@@ -65,7 +65,7 @@ __global__ void computePbcUpdate(DeviceAtom a, int nlocal, int nghost, int* PBCx
/* update coordinates of ghost atoms */ /* update coordinates of ghost atoms */
/* uses mapping created in setupPbc */ /* uses mapping created in setupPbc */
void updatePbc_cuda(Atom *atom, Parameter *param, bool reneigh) { void updatePbc_cuda(Atom *atom, Parameter *param, bool reneigh) {
const int num_threads_per_block = get_cuda_num_threads(); const int num_threads_per_block = get_num_threads();
if(reneigh) { if(reneigh) {
memcpyToGPU(atom->d_atom.x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3); memcpyToGPU(atom->d_atom.x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
@@ -98,7 +98,7 @@ void updatePbc_cuda(Atom *atom, Parameter *param, bool reneigh) {
} }
void updateAtomsPbc_cuda(Atom* atom, Parameter *param) { void updateAtomsPbc_cuda(Atom* atom, Parameter *param) {
const int num_threads_per_block = get_cuda_num_threads(); const int num_threads_per_block = get_num_threads();
MD_FLOAT xprd = param->xprd; MD_FLOAT xprd = param->xprd;
MD_FLOAT yprd = param->yprd; MD_FLOAT yprd = param->yprd;
MD_FLOAT zprd = param->zprd; MD_FLOAT zprd = param->zprd;

View File

@@ -14,7 +14,6 @@ void initDevice(Atom *atom, Neighbor *neighbor) {
d_atom->epsilon = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes); d_atom->epsilon = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
d_atom->sigma6 = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes); d_atom->sigma6 = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
d_atom->cutneighsq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
d_atom->cutforcesq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes); d_atom->cutforcesq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
d_neighbor->neighbors = (int *) allocateGPU(sizeof(int) * atom->Nmax * neighbor->maxneighs); d_neighbor->neighbors = (int *) allocateGPU(sizeof(int) * atom->Nmax * neighbor->maxneighs);
d_neighbor->numneigh = (int *) allocateGPU(sizeof(int) * atom->Nmax); d_neighbor->numneigh = (int *) allocateGPU(sizeof(int) * atom->Nmax);
@@ -23,7 +22,6 @@ void initDevice(Atom *atom, Neighbor *neighbor) {
memcpyToGPU(d_atom->vx, atom->vx, sizeof(MD_FLOAT) * atom->Nmax * 3); memcpyToGPU(d_atom->vx, atom->vx, sizeof(MD_FLOAT) * atom->Nmax * 3);
memcpyToGPU(d_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes); memcpyToGPU(d_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
memcpyToGPU(d_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes); memcpyToGPU(d_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
memcpyToGPU(d_atom->cutneighsq, atom->cutneighsq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
memcpyToGPU(d_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes); memcpyToGPU(d_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
memcpyToGPU(d_atom->type, atom->type, sizeof(int) * atom->Nmax); memcpyToGPU(d_atom->type, atom->type, sizeof(int) * atom->Nmax);
} }

View File

@@ -31,12 +31,8 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
int nrho = eam->nrho; int nrho_tot = eam->nrho_tot; int nrho = eam->nrho; int nrho_tot = eam->nrho_tot;
double S = getTimeStamp(); double S = getTimeStamp();
#pragma omp parallel
{
LIKWID_MARKER_START("force_eam_fp"); LIKWID_MARKER_START("force_eam_fp");
#pragma omp parallel for
#pragma omp for
for(int i = 0; i < Nlocal; i++) { for(int i = 0; i < Nlocal; i++) {
neighs = &neighbor->neighbors[i * neighbor->maxneighs]; neighs = &neighbor->neighbors[i * neighbor->maxneighs];
int numneighs = neighbor->numneigh[i]; int numneighs = neighbor->numneigh[i];
@@ -99,19 +95,13 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
} }
LIKWID_MARKER_STOP("force_eam_fp"); LIKWID_MARKER_STOP("force_eam_fp");
}
// We still need to update fp for PBC atoms // We still need to update fp for PBC atoms
for(int i = 0; i < atom->Nghost; i++) { for(int i = 0; i < atom->Nghost; i++) {
fp[Nlocal + i] = fp[atom->border_map[i]]; fp[Nlocal + i] = fp[atom->border_map[i]];
} }
#pragma omp parallel
{
LIKWID_MARKER_START("force_eam"); LIKWID_MARKER_START("force_eam");
#pragma omp for
for(int i = 0; i < Nlocal; i++) { for(int i = 0; i < Nlocal; i++) {
neighs = &neighbor->neighbors[i * neighbor->maxneighs]; neighs = &neighbor->neighbors[i * neighbor->maxneighs];
int numneighs = neighbor->numneigh[i]; int numneighs = neighbor->numneigh[i];
@@ -202,8 +192,6 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
} }
LIKWID_MARKER_STOP("force_eam"); LIKWID_MARKER_STOP("force_eam");
}
double E = getTimeStamp(); double E = getTimeStamp();
return E-S; return E-S;
} }

View File

@@ -26,22 +26,17 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
MD_FLOAT sigma6 = param->sigma6; MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon; MD_FLOAT epsilon = param->epsilon;
#endif #endif
const MD_FLOAT num1 = 1.0;
const MD_FLOAT num48 = 48.0;
const MD_FLOAT num05 = 0.5;
for(int i = 0; i < Nlocal; i++) { for(int i = 0; i < Nlocal; i++) {
atom_fx(i) = 0.0; atom_fx(i) = 0.0;
atom_fy(i) = 0.0; atom_fy(i) = 0.0;
atom_fz(i) = 0.0; atom_fz(i) = 0.0;
} }
double S = getTimeStamp();
#pragma omp parallel double S = getTimeStamp();
{
LIKWID_MARKER_START("force"); LIKWID_MARKER_START("force");
#pragma omp for schedule(runtime) #pragma omp parallel for
for(int i = 0; i < Nlocal; i++) { for(int i = 0; i < Nlocal; i++) {
neighs = &neighbor->neighbors[i * neighbor->maxneighs]; neighs = &neighbor->neighbors[i * neighbor->maxneighs];
int numneighs = neighbor->numneigh[i]; int numneighs = neighbor->numneigh[i];
@@ -72,9 +67,9 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
#endif #endif
if(rsq < cutforcesq) { if(rsq < cutforcesq) {
MD_FLOAT sr2 = num1 / rsq; MD_FLOAT sr2 = 1.0 / rsq;
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6; MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
MD_FLOAT force = num48 * sr6 * (sr6 - num05) * sr2 * epsilon; MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
fix += delx * force; fix += delx * force;
fiy += dely * force; fiy += dely * force;
fiz += delz * force; fiz += delz * force;
@@ -90,19 +85,11 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
atom_fy(i) += fiy; atom_fy(i) += fiy;
atom_fz(i) += fiz; atom_fz(i) += fiz;
#ifdef USE_REFERENCE_VERSION
if(numneighs % VECTOR_WIDTH > 0) {
addStat(stats->atoms_outside_cutoff, VECTOR_WIDTH - (numneighs % VECTOR_WIDTH));
}
#endif
addStat(stats->total_force_neighs, numneighs); addStat(stats->total_force_neighs, numneighs);
addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH); addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
} }
LIKWID_MARKER_STOP("force"); LIKWID_MARKER_STOP("force");
}
double E = getTimeStamp(); double E = getTimeStamp();
return E-S; return E-S;
} }
@@ -115,9 +102,6 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
MD_FLOAT sigma6 = param->sigma6; MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon; MD_FLOAT epsilon = param->epsilon;
#endif #endif
const MD_FLOAT num1 = 1.0;
const MD_FLOAT num48 = 48.0;
const MD_FLOAT num05 = 0.5;
for(int i = 0; i < Nlocal; i++) { for(int i = 0; i < Nlocal; i++) {
atom_fx(i) = 0.0; atom_fx(i) = 0.0;
@@ -126,12 +110,8 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
} }
double S = getTimeStamp(); double S = getTimeStamp();
#pragma omp parallel
{
LIKWID_MARKER_START("forceLJ-halfneigh"); LIKWID_MARKER_START("forceLJ-halfneigh");
#pragma omp for schedule(runtime)
for(int i = 0; i < Nlocal; i++) { for(int i = 0; i < Nlocal; i++) {
neighs = &neighbor->neighbors[i * neighbor->maxneighs]; neighs = &neighbor->neighbors[i * neighbor->maxneighs];
int numneighs = neighbor->numneigh[i]; int numneighs = neighbor->numneigh[i];
@@ -166,9 +146,9 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
#endif #endif
if(rsq < cutforcesq) { if(rsq < cutforcesq) {
MD_FLOAT sr2 = num1 / rsq; MD_FLOAT sr2 = 1.0 / rsq;
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6; MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
MD_FLOAT force = num48 * sr6 * (sr6 - num05) * sr2 * epsilon; MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
fix += delx * force; fix += delx * force;
fiy += dely * force; fiy += dely * force;
fiz += delz * force; fiz += delz * force;
@@ -191,8 +171,6 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
} }
LIKWID_MARKER_STOP("forceLJ-halfneigh"); LIKWID_MARKER_STOP("forceLJ-halfneigh");
}
double E = getTimeStamp(); double E = getTimeStamp();
return E-S; return E-S;
} }
@@ -211,6 +189,7 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig
} }
double S = getTimeStamp(); double S = getTimeStamp();
LIKWID_MARKER_START("force");
#ifndef __SIMD_KERNEL__ #ifndef __SIMD_KERNEL__
fprintf(stderr, "Error: SIMD kernel not implemented for specified instruction set!"); fprintf(stderr, "Error: SIMD kernel not implemented for specified instruction set!");
@@ -222,12 +201,7 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0); MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5); MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
#pragma omp parallel for
#pragma omp parallel
{
LIKWID_MARKER_START("force");
#pragma omp for schedule(runtime)
for(int i = 0; i < Nlocal; i++) { for(int i = 0; i < Nlocal; i++) {
neighs = &neighbor->neighbors[i * neighbor->maxneighs]; neighs = &neighbor->neighbors[i * neighbor->maxneighs];
int numneighs = neighbor->numneigh[i]; int numneighs = neighbor->numneigh[i];
@@ -268,11 +242,9 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig
atom_fy(i) += simd_h_reduce_sum(fiy); atom_fy(i) += simd_h_reduce_sum(fiy);
atom_fz(i) += simd_h_reduce_sum(fiz); atom_fz(i) += simd_h_reduce_sum(fiz);
} }
LIKWID_MARKER_STOP("force");
}
#endif #endif
LIKWID_MARKER_STOP("force");
double E = getTimeStamp(); double E = getTimeStamp();
return E-S; return E-S;
} }

View File

@@ -73,7 +73,6 @@ extern int readAtom_pdb(Atom*, Parameter*);
extern int readAtom_gro(Atom*, Parameter*); extern int readAtom_gro(Atom*, Parameter*);
extern int readAtom_dmp(Atom*, Parameter*); extern int readAtom_dmp(Atom*, Parameter*);
extern int readAtom_in(Atom*, Parameter*); extern int readAtom_in(Atom*, Parameter*);
extern void writeAtom(Atom*, Parameter*);
extern void growAtom(Atom*); extern void growAtom(Atom*);
#ifdef AOS #ifdef AOS

View File

@@ -59,6 +59,12 @@ void init(Parameter *param) {
param->eam_file = NULL; param->eam_file = NULL;
} }
// Show debug messages
#define DEBUG(msg) printf(msg)
// Do not show debug messages
//#define DEBUG(msg)
void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps) { void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps) {
const int maxneighs = nneighs * nreps; const int maxneighs = nneighs * nreps;
neighbor->numneigh = (int*) malloc(atom->Nmax * sizeof(int)); neighbor->numneigh = (int*) malloc(atom->Nmax * sizeof(int));
@@ -119,7 +125,7 @@ int main(int argc, const char *argv[]) {
LIKWID_MARKER_INIT; LIKWID_MARKER_INIT;
LIKWID_MARKER_REGISTER("force"); LIKWID_MARKER_REGISTER("force");
DEBUG_MESSAGE("Initializing parameters...\n"); DEBUG("Initializing parameters...\n");
init(&param); init(&param);
for(int i = 0; i < argc; i++) { for(int i = 0; i < argc; i++) {
@@ -190,11 +196,11 @@ int main(int argc, const char *argv[]) {
} }
if(param.force_field == FF_EAM) { if(param.force_field == FF_EAM) {
DEBUG_MESSAGE("Initializing EAM parameters...\n"); DEBUG("Initializing EAM parameters...\n");
initEam(&eam, &param); initEam(&eam, &param);
} }
DEBUG_MESSAGE("Initializing atoms...\n"); DEBUG("Initializing atoms...\n");
initAtom(atom); initAtom(atom);
initStats(&stats); initStats(&stats);
@@ -210,7 +216,7 @@ int main(int argc, const char *argv[]) {
atom->cutforcesq[i] = param.cutforce * param.cutforce; atom->cutforcesq[i] = param.cutforce * param.cutforce;
} }
DEBUG_MESSAGE("Creating atoms...\n"); DEBUG("Creating atoms...\n");
for(int i = 0; i < natoms; ++i) { for(int i = 0; i < natoms; ++i) {
while(atom->Nlocal > atom->Nmax - natoms) { while(atom->Nlocal > atom->Nmax - natoms) {
growAtom(atom); growAtom(atom);
@@ -241,11 +247,11 @@ int main(int argc, const char *argv[]) {
printf("Estimated neighborlist data volume (kB): %.4f\n", estim_neighbors_volume / 1000.0); printf("Estimated neighborlist data volume (kB): %.4f\n", estim_neighbors_volume / 1000.0);
} }
DEBUG_MESSAGE("Initializing neighbor lists...\n"); DEBUG("Initializing neighbor lists...\n");
initNeighbor(&neighbor, &param); initNeighbor(&neighbor, &param);
DEBUG_MESSAGE("Creating neighbor lists...\n"); DEBUG("Creating neighbor lists...\n");
createNeighbors(atom, &neighbor, pattern, nneighs, nreps); createNeighbors(atom, &neighbor, pattern, nneighs, nreps);
DEBUG_MESSAGE("Computing forces...\n"); DEBUG("Computing forces...\n");
double T_accum = 0.0; double T_accum = 0.0;
for(int i = 0; i < param.ntimes; i++) { for(int i = 0; i < param.ntimes; i++) {

View File

@@ -11,7 +11,6 @@
#include <limits.h> #include <limits.h>
#include <math.h> #include <math.h>
#include <float.h> #include <float.h>
#include <omp.h>
#include <likwid-marker.h> #include <likwid-marker.h>
@@ -64,10 +63,6 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *
setupNeighbor(param); setupNeighbor(param);
setupThermo(param, atom->Natoms); setupThermo(param, atom->Natoms);
if(param->input_file == NULL) { adjustThermo(param, atom); } if(param->input_file == NULL) { adjustThermo(param, atom); }
#ifdef SORT_ATOMS
atom->Nghost = 0;
sortAtom(atom);
#endif
setupPbc(atom, param); setupPbc(atom, param);
initDevice(atom, neighbor); initDevice(atom, neighbor);
updatePbc(atom, param, true); updatePbc(atom, param, true);
@@ -81,12 +76,9 @@ double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
S = getTimeStamp(); S = getTimeStamp();
LIKWID_MARKER_START("reneighbour"); LIKWID_MARKER_START("reneighbour");
updateAtomsPbc(atom, param); updateAtomsPbc(atom, param);
#ifdef SORT_ATOMS
atom->Nghost = 0;
sortAtom(atom);
#endif
setupPbc(atom, param); setupPbc(atom, param);
updatePbc(atom, param, true); updatePbc(atom, param, true);
//sortAtom(atom);
buildNeighbor(atom, neighbor); buildNeighbor(atom, neighbor);
LIKWID_MARKER_STOP("reneighbour"); LIKWID_MARKER_STOP("reneighbour");
E = getTimeStamp(); E = getTimeStamp();
@@ -153,7 +145,7 @@ int main(int argc, char** argv) {
initParameter(&param); initParameter(&param);
for(int i = 0; i < argc; i++) { for(int i = 0; i < argc; i++) {
if((strcmp(argv[i], "-p") == 0) || strcmp(argv[i], "--params") == 0) { if((strcmp(argv[i], "-p") == 0)) {
readParameter(&param, argv[++i]); readParameter(&param, argv[++i]);
continue; continue;
} }
@@ -208,23 +200,17 @@ int main(int argc, char** argv) {
param.vtk_file = strdup(argv[++i]); param.vtk_file = strdup(argv[++i]);
continue; continue;
} }
if((strcmp(argv[i], "-w") == 0)) {
param.write_atom_file = strdup(argv[++i]);
continue;
}
if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) { if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
printf("MD Bench: A minimalistic re-implementation of miniMD\n"); printf("MD Bench: A minimalistic re-implementation of miniMD\n");
printf(HLINE); printf(HLINE);
printf("-p / --params <string>: file to read parameters from (can be specified more than once)\n"); printf("-p <string>: file to read parameters from (can be specified more than once)\n");
printf("-f <string>: force field (lj, eam or dem), default lj\n"); printf("-f <string>: force field (lj, eam or dem), default lj\n");
printf("-i <string>: input file with atom positions (dump)\n"); printf("-i <string>: input file with atom positions (dump)\n");
printf("-e <string>: input file for EAM\n"); printf("-e <string>: input file for EAM\n");
printf("-n / --nsteps <int>: set number of timesteps for simulation\n"); printf("-n / --nsteps <int>: set number of timesteps for simulation\n");
printf("-nx/-ny/-nz <int>: set linear dimension of systembox in x/y/z direction\n"); printf("-nx/-ny/-nz <int>: set linear dimension of systembox in x/y/z direction\n");
printf("-half <int>: use half (1) or full (0) neighbor lists\n");
printf("-r / --radius <real>: set cutoff radius\n"); printf("-r / --radius <real>: set cutoff radius\n");
printf("-s / --skin <real>: set skin (verlet buffer)\n"); printf("-s / --skin <real>: set skin (verlet buffer)\n");
printf("-w <file>: write input atoms to file\n");
printf("--freq <real>: processor frequency (GHz)\n"); printf("--freq <real>: processor frequency (GHz)\n");
printf("--vtk <string>: VTK file for visualization\n"); printf("--vtk <string>: VTK file for visualization\n");
printf(HLINE); printf(HLINE);
@@ -243,10 +229,6 @@ int main(int argc, char** argv) {
traceAddresses(&param, &atom, &neighbor, n + 1); traceAddresses(&param, &atom, &neighbor, n + 1);
#endif #endif
if(param.write_atom_file != NULL) {
writeAtom(&atom, &param);
}
//writeInput(&param, &atom); //writeInput(&param, &atom);
timer[FORCE] = computeForce(&eam, &param, &atom, &neighbor, &stats); timer[FORCE] = computeForce(&eam, &param, &atom, &neighbor, &stats);
@@ -293,30 +275,6 @@ int main(int argc, char** argv) {
printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n", printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]); timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]);
printf(HLINE); printf(HLINE);
int nthreads = 0;
int chunkSize = 0;
omp_sched_t schedKind;
char schedType[10];
#pragma omp parallel
#pragma omp master
{
omp_get_schedule(&schedKind, &chunkSize);
switch (schedKind)
{
case omp_sched_static: strcpy(schedType, "static"); break;
case omp_sched_dynamic: strcpy(schedType, "dynamic"); break;
case omp_sched_guided: strcpy(schedType, "guided"); break;
case omp_sched_auto: strcpy(schedType, "auto"); break;
}
nthreads = omp_get_max_threads();
}
printf("Num threads: %d\n", nthreads);
printf("Schedule: (%s,%d)\n", schedType, chunkSize);
printf("Performance: %.2f million atom updates per second\n", printf("Performance: %.2f million atom updates per second\n",
1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]); 1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
#ifdef COMPUTE_STATS #ifdef COMPUTE_STATS

View File

@@ -125,7 +125,7 @@ void setupPbc(Atom *atom, Parameter *param) {
if(param->pbc_x != 0 && param->pbc_y != 0 && param->pbc_z != 0) { if(param->pbc_x != 0 && param->pbc_y != 0 && param->pbc_z != 0) {
if (x < Cutneigh && y < Cutneigh && z < Cutneigh) { ADDGHOST(+1,+1,+1); } if (x < Cutneigh && y < Cutneigh && z < Cutneigh) { ADDGHOST(+1,+1,+1); }
if (x < Cutneigh && y >= (yprd-Cutneigh) && z < Cutneigh) { ADDGHOST(+1,-1,+1); } if (x < Cutneigh && y >= (yprd-Cutneigh) && z < Cutneigh) { ADDGHOST(+1,-1,+1); }
if (x < Cutneigh && y < Cutneigh && z >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); } if (x < Cutneigh && y >= Cutneigh && z >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); }
if (x < Cutneigh && y >= (yprd-Cutneigh) && z >= (zprd-Cutneigh)) { ADDGHOST(+1,-1,-1); } if (x < Cutneigh && y >= (yprd-Cutneigh) && z >= (zprd-Cutneigh)) { ADDGHOST(+1,-1,-1); }
if (x >= (xprd-Cutneigh) && y < Cutneigh && z < Cutneigh) { ADDGHOST(-1,+1,+1); } if (x >= (xprd-Cutneigh) && y < Cutneigh && z < Cutneigh) { ADDGHOST(-1,+1,+1); }
if (x >= (xprd-Cutneigh) && y >= (yprd-Cutneigh) && z < Cutneigh) { ADDGHOST(-1,-1,+1); } if (x >= (xprd-Cutneigh) && y >= (yprd-Cutneigh) && z < Cutneigh) { ADDGHOST(-1,-1,+1); }

View File

@@ -1,116 +1,46 @@
#!/bin/bash #!/bin/bash
[[ -z "$1" ]] && echo "Use: $0 <binary> [-c <core>] [-f <freq>] [-n <nruns>] [-l <log>] [-s]" && exit TAG=ICX
[[ ! -f "$1" ]] && echo "Binary file not found, make sure to use 'make'" && exit OPT_SCHEME=gromacs
[[ ! -f "$1-stub" ]] && echo "Binary file for stubbed case not found, make sure to use 'make VARIANT=stub'" && exit MDBENCH_BIN=./MDBench-$TAG-$OPT_SCHEME
FREQ=2.4
NRUNS=3
FIXED_PARAMS=--freq $FREQ
MDBENCH_BIN=$1 if [ "$OPT_SCHEME" = "gromacs" ]; then
BIN_INFO="${MDBENCH_BIN#*-}" # $OPT_SCHEME-$TAG-$ISA-$PREC STUB1_NAME=Stub-33
OPT_SCHEME="${BIN_INFO%%-*}" STUB1_PARAMS=-na 4 -nn 33
PREC="${BIN_INFO##*-}" STUB2_NAME=Stub-128
BIN_INFO="${BIN_INFO#*-}" # $TAG-$ISA-$PREC STUB2_PARAMS=-na 4 -nn 128
BIN_INFO="${BIN_INFO%-*}" # $TAG-$ISA
TAG="${BIN_INFO%%-*}"
ISA="${BIN_INFO##*-}"
CORE="${CORE:-0}"
FREQ="${FREQ:-2.4}"
NRUNS="${NRUNS:-3}"
LOG="${LOG:-latencies_and_cfds.$(hostname).log}"
STUB_ONLY="${STUB_ONLY:-false}"
SKIP_SET_FREQ="${SKIP_SET_FREQ:-false}"
OPTIND=2
while getopts "c:f:n:l:s" flag; do
case "${flag}" in
c) CORE=${OPTARG};;
f) FREQ=${OPTARG};;
n) NRUNS=${OPTARG};;
l) LOG=${OPTARG};;
s) STUB_ONLY=true;;
esac
done
# Other useful variables
MDBENCH_BIN=./MDBench-$OPT_SCHEME-$TAG-$ISA-$PREC
FIXED_PARAMS="--freq $FREQ"
CPU_VENDOR=$(lscpu | grep "Vendor ID" | tr -s ' ' | cut -d ' ' -f3)
if [ "$CPU_VENDOR" == "GenuineIntel" ]; then
ALL_PREFETCHERS="HW_PREFETCHER,CL_PREFETCHER,DCU_PREFETCHER,IP_PREFETCHER"
DEFAULT_PREFETCHERS=("ALL HW_PREFETCHER CL_PREFETCHER DCU_PREFETCHER IP_PREFETCHER NONE")
else else
ALL_PREFETCHERS="" STUB1_NAME=Stub-76
DEFAULT_PREFETCHERS=("IGNORE") STUB1_PARAMS=-nn 76
fi STUB2_NAME=Stub-1024
STUB2_PARAMS=-nn 1024
if [ -z ${PREFETCHERS+x} ]; then
PREFETCHERS=${DEFAULT_PREFETCHERS}
fi
if [ "$OPT_SCHEME" == "gromacs" ]; then
STUB1_NAME=stub-33
STUB1_PARAMS="-na 4 -nn 33"
STUB2_NAME=stub-128
STUB2_PARAMS="-na 4 -nn 128"
else
STUB1_NAME=stub-76
STUB1_PARAMS="-nn 76"
STUB2_NAME=stub-1024
STUB2_PARAMS="-nn 1024"
fi fi
function run_benchmark() { function run_benchmark() {
BEST=10000000
for i in $(seq $NRUNS); do for i in $(seq $NRUNS); do
RES=$(likwid-pin -c $CORE "$* $FIXED_PARAMS" 2>&1 | grep "Cycles/SIMD iteration" | cut -d ' ' -f3) likwid-pin -c 0 "$* $FIXED_PARAMS" 2>&1 | grep "Cycles/SIMD iteration" | cut -d ' ' -f3
if (( $(echo "$BEST > $RES" | bc -l ) )); then
BEST=$RES
fi
done done
} }
echo "Tag: $TAG" | tee -a $LOG echo "Tag: $TAG"
echo "Optimization scheme: $OPT_SCHEME" | tee -a $LOG echo "Optimization scheme: $OPT_SCHEME"
echo "Instruction set: $ISA" | tee -a $LOG echo "Binary: $MDBENCH_BIN(-stub)"
echo "Precision: $PREC" | tee -a $LOG echo "Frequency: $FREQ"
echo "Binary: $MDBENCH_BIN(-stub)" | tee -a $LOG echo "Number of runs: $NRUNS"
echo "Frequency: $FREQ" | tee -a $LOG
echo "Number of runs: $NRUNS" | tee -a $LOG
echo "Run only stubbed cases: $STUB_ONLY" | tee -a $LOG
if [ "$SKIP_SET_FREQ" == "false" ]; then
echo "Fixing frequencies..." echo "Fixing frequencies..."
likwid-setFrequencies -f $FREQ -t 0 likwid-setFrequencies -f $FREQ -t 0
fi
for p in $PREFETCHERS; do echo "Standard"
if [ "$p" != "IGNORE" ]; then
if [ "$p" == "ALL" ]; then
likwid-features -c $CORE -e $ALL_PREFETCHERS
elif [ "$p" == "NONE" ]; then
likwid-features -c $CORE -d $ALL_PREFETCHERS
else
likwid-features -c $CORE -d $ALL_PREFETCHERS
likwid-features -c $CORE -e $p
fi
echo "Prefetcher settings: $p"
likwid-features -c $CORE -l
fi
MSG="$p: "
if [ "$STUB_ONLY" == "false" ]; then
run_benchmark $MDBENCH_BIN run_benchmark $MDBENCH_BIN
MSG+="standard=$BEST, " echo "Melt"
run_benchmark $MDBENCH_BIN -i data/copper_melting/input_lj_cu_one_atomtype_20x20x20.dmp run_benchmark $MDBENCH_BIN -i data/copper_melting/input_lj_cu_one_atomtype_20x20x20.dmp
MSG+="melt=$BEST, " echo "Argon"
run_benchmark $MDBENCH_BIN -p data/argon_1000/mdbench_params.conf -i data/argon_1000/tprout.gro run_benchmark $MDBENCH_BIN -p data/argon_1000/mdbench_params.conf -i data/argon_1000/tprout.gro
MSG+="argon=$BEST, " echo "$STUB1_NAME"
fi
run_benchmark $MDBENCH_BIN-stub $STUB1_PARAMS run_benchmark $MDBENCH_BIN-stub $STUB1_PARAMS
MSG+="$STUB1_NAME=$BEST, " echo "$STUB2_NAME"
run_benchmark $MDBENCH_BIN-stub $STUB2_PARAMS run_benchmark $MDBENCH_BIN-stub $STUB2_PARAMS
MSG+="$STUB2_NAME=$BEST"
echo $MSG | tee -a $LOG
done

View File

@@ -1,52 +0,0 @@
# Prerequisites
*.d
# Object files
*.o
*.ko
*.obj
*.elf
# Linker output
*.ilk
*.map
*.exp
# Precompiled Headers
*.gch
*.pch
# Libraries
*.lib
*.a
*.la
*.lo
# Shared objects (inc. Windows DLLs)
*.dll
*.so
*.so.*
*.dylib
# Executables
*.exe
*.out
*.app
*.i*86
*.x86_64
*.hex
# Debug files
*.dSYM/
*.su
*.idb
*.pdb
# Kernel Module Compile Results
*.mod*
*.cmd
.tmp_versions/
modules.order
Module.symvers
Mkfile.old
dkms.conf

View File

@@ -1,21 +0,0 @@
MIT License
Copyright (c) 2021 RRZE-HPC
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -1,126 +0,0 @@
#CONFIGURE BUILD SYSTEM
TARGET = gather-bench-$(TAG)
BUILD_DIR = ./$(TAG)
SRC_DIR = ./src
MAKE_DIR = ./
ISA_DIR = ./src/$(ISA)
Q ?= @
#DO NOT EDIT BELOW
include $(MAKE_DIR)/config.mk
include $(MAKE_DIR)/include_$(TAG).mk
include $(MAKE_DIR)/include_LIKWID.mk
INCLUDES += -I./src/includes
VPATH = $(SRC_DIR) ${ISA_DIR}
ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
ASM += $(patsubst $(SRC_DIR)/%.f90, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.f90))
OBJ = $(filter-out $(BUILD_DIR)/main%, $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c)))
OBJ += $(patsubst $(SRC_DIR)/%.cc, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cc))
OBJ += $(patsubst $(SRC_DIR)/%.cpp, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cpp))
OBJ += $(patsubst $(SRC_DIR)/%.f90, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.f90))
OBJ += $(patsubst $(SRC_DIR)/%.F90, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.F90))
OBJ += $(patsubst $(SRC_DIR)/%.s, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.s))
OBJ += $(patsubst $(ISA_DIR)/%.S, $(BUILD_DIR)/%.o,$(wildcard $(ISA_DIR)/*.S))
CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(INCLUDES) -DISA_$(ISA)
ifneq ($(VARIANT),)
.DEFAULT_GOAL := ${TARGET}-$(VARIANT)
endif
ifeq ($(strip $(DATA_LAYOUT)),AOS)
CPPFLAGS += -DAOS
endif
ifeq ($(strip $(TEST)),true)
CPPFLAGS += -DTEST
endif
ifeq ($(strip $(PADDING)),true)
CPPFLAGS += -DPADDING
endif
ifeq ($(strip $(MEASURE_GATHER_CYCLES)),true)
CPPFLAGS += -DMEASURE_GATHER_CYCLES
endif
ifeq ($(strip $(ONLY_FIRST_DIMENSION)),true)
CPPFLAGS += -DONLY_FIRST_DIMENSION
endif
ifeq ($(strip $(MEM_TRACER)),true)
CPPFLAGS += -DMEM_TRACER
endif
${TARGET}: $(BUILD_DIR) $(OBJ) $(SRC_DIR)/main.c
@echo "===> LINKING $(TARGET)"
$(Q)${LINKER} ${CPPFLAGS} ${LFLAGS} -o $(TARGET) $(SRC_DIR)/main.c $(OBJ) $(LIBS)
${TARGET}-%: $(BUILD_DIR) $(OBJ) $(SRC_DIR)/main-%.c
@echo "===> LINKING $(TARGET)-$* "
$(Q)${LINKER} ${CPPFLAGS} ${LFLAGS} -o $(TARGET)-$* $(SRC_DIR)/main-$*.c $(OBJ) $(LIBS)
asm: $(BUILD_DIR) $(ASM)
$(BUILD_DIR)/%.o: %.c
@echo "===> COMPILE $@"
$(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
$(Q)$(CC) $(CPPFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d
$(BUILD_DIR)/%.s: %.c
@echo "===> GENERATE ASM $@"
$(Q)$(CC) -S $(CPPFLAGS) $(CFLAGS) $< -o $@
$(BUILD_DIR)/%.s: %.f90
@echo "===> COMPILE $@"
$(Q)$(FC) -S $(FCFLAGS) $< -o $@
$(BUILD_DIR)/%.o: %.cc
@echo "===> COMPILE $@"
$(Q)$(CXX) -c $(CPPFLAGS) $(CXXFLAGS) $< -o $@
$(Q)$(CXX) $(CPPFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d
$(BUILD_DIR)/%.o: %.cpp
@echo "===> COMPILE $@"
$(Q)$(CXX) -c $(CPPFLAGS) $(CXXFLAGS) $< -o $@
$(Q)$(CXX) $(CPPFLAGS) -MT $(@:.d=.o) -MM $< > $(BUILD_DIR)/$*.d
$(BUILD_DIR)/%.o: %.f90
@echo "===> COMPILE $@"
$(Q)$(FC) -c $(FCFLAGS) $< -o $@
$(BUILD_DIR)/%.o: %.F90
@echo "===> COMPILE $@"
$(Q)$(FC) -c $(CPPFLAGS) $(FCFLAGS) $< -o $@
$(BUILD_DIR)/%.o: %.s
@echo "===> ASSEMBLE $@"
$(Q)$(AS) $(ASFLAGS) $< -o $@
$(BUILD_DIR)/%.o: %.S
@echo "===> ASSEMBLE $@"
$(Q)$(CC) -c $(CPPFLAGS) $< -o $@
tags:
@echo "===> GENERATE TAGS"
$(Q)ctags -R
$(BUILD_DIR):
@mkdir $(BUILD_DIR)
ifeq ($(findstring $(MAKECMDGOALS),clean),)
-include $(OBJ:.o=.d)
endif
.PHONY: clean distclean
clean:
@echo "===> CLEAN"
@rm -rf $(BUILD_DIR)
@rm -f tags
distclean: clean
@echo "===> DIST CLEAN"
@rm -f $(TARGET)
@rm -f tags

View File

@@ -1,2 +0,0 @@
# gather-bench
A X86 gather instruction performance benchmark

View File

@@ -1,22 +0,0 @@
# Supported: GCC, CLANG, ICC
TAG ?= ICC
# Supported: avx2, avx512
ISA ?= avx512
# Use likwid?
ENABLE_LIKWID ?= false
# SP or DP
DATA_TYPE ?= DP
# AOS or SOA
DATA_LAYOUT ?= AOS
# Padding byte for AoS
PADDING ?= false
# Measure cycles for each gather separately
MEASURE_GATHER_CYCLES ?= false
# Gather data only for first dimension (one gather per iteration)
ONLY_FIRST_DIMENSION ?= false
# Trace memory addresses for cache simulator
MEM_TRACER ?= false
# Test correctness of gather kernels
TEST ?= false

View File

@@ -1,9 +0,0 @@
CC = clang
LINKER = $(CC)
OPENMP =# -fopenmp
CFLAGS = -Ofast -std=c11 -march=core-avx2 -mavx -mfma $(OPENMP)
LFLAGS = $(OPENMP) -march=core-avx2 -mavx -mfma
DEFINES = -D_GNU_SOURCE
INCLUDES =
LIBS =

View File

@@ -1,11 +0,0 @@
CC = gcc
AS = as
LINKER = $(CC)
OPENMP = -fopenmp
CFLAGS = -Ofast -std=c11 -mavx2 -mfma $(OPENMP)
ASFLAGS =
LFLAGS = $(OPENMP) -mavx2 -mfma
DEFINES = -D_GNU_SOURCE
INCLUDES =
LIBS =

View File

@@ -1,9 +0,0 @@
CC = icc
LINKER = $(CC)
OPENMP = -qopenmp
CFLAGS = -Ofast -xhost -std=c11 $(OPENMP)
LFLAGS = $(OPENMP)
DEFINES = -D_GNU_SOURCE
INCLUDES =
LIBS =

View File

@@ -1,10 +0,0 @@
LIKWID_INC ?= -I/usr/local/include
LIKWID_DEFINES ?= -DLIKWID_PERFMON
LIKWID_LIB ?= -L/usr/local/lib
ifeq ($(strip $(ENABLE_LIKWID)),true)
INCLUDES += ${LIKWID_INC}
DEFINES += ${LIKWID_DEFINES}
LIBS += -llikwid
LFLAGS += ${LIKWID_LIB}
endif

View File

@@ -1,57 +0,0 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* =======================================================================================
*/
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
/*
 * Allocate 'bytesize' bytes aligned to 'alignment' bytes.
 * Per posix_memalign, 'alignment' must be a power of two and a multiple
 * of sizeof(void*). Any failure is fatal: an error message is printed
 * and the process exits, so the returned pointer is always valid.
 *
 * Fix: the original only handled EINVAL and ENOMEM explicitly; any other
 * nonzero error code fell through to the "if (ptr == NULL)" check and
 * read 'ptr' while it was still uninitialized (posix_memalign leaves the
 * output pointer untouched on failure), which is undefined behavior and
 * could return garbage. Now ptr starts as NULL and every nonzero error
 * code terminates the program.
 */
void* allocate (int alignment, size_t bytesize)
{
    int errorCode;
    void* ptr = NULL;   /* stays NULL unless posix_memalign succeeds */

    errorCode = posix_memalign(&ptr, alignment, bytesize);

    if (errorCode) {
        if (errorCode == EINVAL) {
            fprintf(stderr,
                    "Error: Alignment parameter is not a power of two\n");
        } else if (errorCode == ENOMEM) {
            fprintf(stderr,
                    "Error: Insufficient memory to fulfill the request\n");
        } else {
            /* Unknown error code: equally fatal. */
            fprintf(stderr, "Error: posix_memalign failed!\n");
        }
        exit(EXIT_FAILURE);
    }

    if (ptr == NULL) {
        fprintf(stderr, "Error: posix_memalign failed!\n");
        exit(EXIT_FAILURE);
    }

    return ptr;
}

View File

@@ -1,63 +0,0 @@
.intel_syntax noprefix
.data
.align 64
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
# rdi -> a
# rsi -> idx
# rdx -> N
# rcx -> t
.text
.globl gather
.type gather, @function
# AVX2 gather microbenchmark kernel (GAS, .intel_syntax noprefix, SysV).
# Gathers a[idx[k]] for N 32-bit indices, 16 per iteration via four
# 4-lane vgatherdpd. Gathered values are only stored (into t) when
# compiled with -DTEST; otherwise the gathers themselves are the work.
# In:  rdi = a (double*), rsi = idx (int32*), rdx = N, rcx = t (double*)
# NOTE(review): the loop advances by 16 with no remainder handling, so
# the caller is assumed to pass/pad N to a multiple of 16 -- confirm.
# NOTE(review): the TEST stores use aligned vmovapd, so t must be
# 32-byte aligned -- confirm in the allocating harness.
gather :
push rbp
mov rbp, rsp
# Callee-saved GPRs are preserved although the body only clobbers rax
# and vector registers; kept for a uniform frame across kernels.
push rbx
push r12
push r13
push r14
push r15
xor rax, rax                       # rax = running element offset
vpcmpeqd ymm0, ymm0, ymm0          # all-ones: gather mask template
.align 16
1:
# Load 4 groups of 4 dword indices for this iteration.
vmovups xmm1, [rsi + rax * 4]
vmovups xmm2, [rsi + rax * 4 + 16]
vmovups xmm3, [rsi + rax * 4 + 32]
vmovups xmm4, [rsi + rax * 4 + 48]
# AVX2 vgatherdpd consumes (zeroes) its mask operand, so each gather
# needs a fresh copy of the all-ones template.
vmovdqa ymm5, ymm0
vmovdqa ymm6, ymm0
vmovdqa ymm7, ymm0
vmovdqa ymm8, ymm0
# Zero destinations to break dependencies on the previous iteration.
vxorpd ymm9, ymm9, ymm9
vxorpd ymm10, ymm10, ymm10
vxorpd ymm11, ymm11, ymm11
vxorpd ymm12, ymm12, ymm12
# Gather 4 doubles each; scale 8 = sizeof(double).
vgatherdpd ymm9, [rdi + xmm1 * 8], ymm5
vgatherdpd ymm10, [rdi + xmm2 * 8], ymm6
vgatherdpd ymm11, [rdi + xmm3 * 8], ymm7
vgatherdpd ymm12, [rdi + xmm4 * 8], ymm8
#ifdef TEST
# Store gathered values so the C harness can verify them.
vmovapd [rcx + rax * 8], ymm9
vmovapd [rcx + rax * 8 + 32], ymm10
vmovapd [rcx + rax * 8 + 64], ymm11
vmovapd [rcx + rax * 8 + 96], ymm12
#endif
addq rax, 16
cmpq rax, rdx
jl 1b
pop r15
pop r14
pop r13
pop r12
pop rbx
mov rsp, rbp
pop rbp
ret
.size gather, .-gather

View File

@@ -1,71 +0,0 @@
.intel_syntax noprefix
.data
.align 64
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
# rdi -> a
# rsi -> idx
# rdx -> N
# rcx -> t
.text
.globl gather_aos
.type gather_aos, @function
# AVX2 array-of-structures gather kernel (GAS, .intel_syntax noprefix).
# For each 32-bit index, gathers 3 consecutive doubles (offsets 0/8/16
# from a[idx*stride]) -- presumably x/y/z of an AoS particle record.
# Element stride is 3 doubles, or 4 with -DPADDING (see index scaling).
# In:  rdi = a, rsi = idx (int32*), rdx = N, rcx = t (TEST output)
# Processes 4 indices per iteration; N is assumed a multiple of 4.
gather_aos :
push rbp
mov rbp, rsp
push rbx
# r9-r11 are caller-saved under SysV; saving them here is not required
# by the ABI but is harmless.
push r9
push r10
push r11
push r12
push r13
push r14
push r15
xor rax, rax                       # rax = running index offset
vpcmpeqd ymm8, ymm8, ymm8          # all-ones gather mask template
.align 16
1:
vmovups xmm3, XMMWORD PTR [rsi + rax * 4]
# Scale indices to the AoS element stride:
# xmm4 = 2*idx; then xmm3 = 4*idx (PADDING) or 3*idx (packed).
vpaddd xmm4, xmm3, xmm3
#ifdef PADDING
vpaddd xmm3, xmm4, xmm4
#else
vpaddd xmm3, xmm3, xmm4
#endif
# Fresh mask copies: AVX2 vgatherdpd zeroes its mask operand.
vmovdqa ymm5, ymm8
vmovdqa ymm6, ymm8
vmovdqa ymm7, ymm8
vxorpd ymm0, ymm0, ymm0
vxorpd ymm1, ymm1, ymm1
vxorpd ymm2, ymm2, ymm2
# One gather per component: base displacement 0/8/16 bytes.
vgatherdpd ymm0, [ rdi + xmm3 * 8], ymm5
vgatherdpd ymm1, [8 + rdi + xmm3 * 8], ymm6
vgatherdpd ymm2, [16 + rdi + xmm3 * 8], ymm7
#ifdef TEST
# Verification layout: three planes of N doubles each in t.
vmovupd [rcx + rax * 8], ymm0
lea rbx, [rcx + rdx * 8]
vmovupd [rbx + rax * 8], ymm1
lea r9, [rbx + rdx * 8]
vmovupd [r9 + rax * 8], ymm2
#endif
addq rax, 4
cmpq rax, rdx
jl 1b
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rbx
mov rsp, rbp
pop rbp
ret
.size gather_aos, .-gather_aos

View File

@@ -1,67 +0,0 @@
.intel_syntax noprefix
.data
.align 64
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
# rdi -> a
# rsi -> idx
# rdx -> N
# rcx -> t
.text
.globl gather_soa
.type gather_soa, @function
# AVX2 structure-of-arrays gather kernel (GAS, .intel_syntax noprefix).
# Gathers the same index from three separate planes of N doubles:
# a[idx], (a+N)[idx], (a+2N)[idx] -- SoA component arrays.
# In:  rdi = a, rsi = idx (int32*), rdx = N, rcx = t (TEST output)
# Processes 4 indices per iteration; N is assumed a multiple of 4.
gather_soa :
push rbp
mov rbp, rsp
push rbx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
xor rax, rax                       # rax = running index offset
vpcmpeqd ymm8, ymm8, ymm8          # all-ones gather mask template
# Precompute the base of the 2nd and 3rd component planes.
lea r8, [rdi + rdx * 8]            # r8 = a + N doubles
lea r9, [r8 + rdx * 8]             # r9 = a + 2N doubles
.align 16
1:
vmovups xmm3, XMMWORD PTR [rsi + rax * 4]
# Fresh mask copies: AVX2 vgatherdpd zeroes its mask operand.
vmovdqa ymm5, ymm8
vmovdqa ymm6, ymm8
vmovdqa ymm7, ymm8
vxorpd ymm0, ymm0, ymm0
vxorpd ymm1, ymm1, ymm1
vxorpd ymm2, ymm2, ymm2
# Same indices, different plane base per component.
vgatherdpd ymm0, [rdi + xmm3 * 8], ymm5
vgatherdpd ymm1, [r8 + xmm3 * 8], ymm6
vgatherdpd ymm2, [r9 + xmm3 * 8], ymm7
#ifdef TEST
# Verification layout mirrors the source: three planes of N doubles.
vmovupd [rcx + rax * 8], ymm0
lea rbx, [rcx + rdx * 8]
vmovupd [rbx + rax * 8], ymm1
lea r10, [rbx + rdx * 8]
vmovupd [r10 + rax * 8], ymm2
#endif
addq rax, 4
cmpq rax, rdx
jl 1b
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rbx
mov rsp, rbp
pop rbp
ret
.size gather_soa, .-gather_soa

View File

@@ -1,62 +0,0 @@
.intel_syntax noprefix
.data
.align 64
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
# rdi -> a
# rsi -> idx
# rdx -> N
# rcx -> t
.text
.globl gather
.type gather, @function
# AVX-512 gather microbenchmark kernel (GAS, .intel_syntax noprefix).
# Gathers a[idx[k]] for N 32-bit indices, 32 per iteration via four
# 8-lane masked vgatherdpd. Stores to t only with -DTEST.
# In:  rdi = a (double*), rsi = idx (int32*), rdx = N, rcx = t
# NOTE(review): steps by 32 with no remainder handling -- N is assumed
# a multiple of 32. TEST stores are aligned (vmovapd), so t must be
# 64-byte aligned -- confirm in the harness.
gather :
push rbp
mov rbp, rsp
push rbx
push r12
push r13
push r14
push r15
xor rax, rax                       # rax = running element offset
.align 16
1:
# reg==reg byte-compare always matches: sets all mask bits in k1-k4.
# Gathers clear their mask, so the masks are rebuilt every iteration.
vpcmpeqb k1, xmm0, xmm0
vpcmpeqb k2, xmm0, xmm0
vpcmpeqb k3, xmm0, xmm0
vpcmpeqb k4, xmm0, xmm0
# Load 4 groups of 8 dword indices.
vmovdqu ymm0, [rsi + rax * 4]
vmovdqu ymm1, [rsi + rax * 4 + 32]
vmovdqu ymm2, [rsi + rax * 4 + 64]
vmovdqu ymm3, [rsi + rax * 4 + 96]
# Zero destinations: masked gathers merge into the old contents.
vpxord zmm4, zmm4, zmm4
vpxord zmm5, zmm5, zmm5
vpxord zmm6, zmm6, zmm6
vpxord zmm7, zmm7, zmm7
vgatherdpd zmm4{k1}, [rdi + ymm0 * 8]
vgatherdpd zmm5{k2}, [rdi + ymm1 * 8]
vgatherdpd zmm6{k3}, [rdi + ymm2 * 8]
vgatherdpd zmm7{k4}, [rdi + ymm3 * 8]
#ifdef TEST
vmovapd [rcx + rax * 8], zmm4
vmovapd [rcx + rax * 8 + 64], zmm5
vmovapd [rcx + rax * 8 + 128], zmm6
vmovapd [rcx + rax * 8 + 192], zmm7
#endif
addq rax, 32
cmpq rax, rdx
jl 1b
pop r15
pop r14
pop r13
pop r12
pop rbx
mov rsp, rbp
pop rbp
ret
.size gather, .-gather

View File

@@ -1,151 +0,0 @@
.intel_syntax noprefix
.data
.align 64
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
# rdi -> a
# rsi -> idx
# rdx -> N
# rcx -> t
# r8 -> cycles
.text
.globl gather_aos
.type gather_aos, @function
# AVX-512 AoS gather kernel with optional per-gather cycle measurement.
# (GAS, .intel_syntax noprefix; 8 indices per iteration.)
# In:  rdi = a, rsi = idx (int32*), rdx = N, rcx = t (TEST output),
#      r8  = cycles (per-gather timing buffer, MEASURE_GATHER_CYCLES)
# Gathers 3 consecutive doubles per index (offsets 0/8/16); element
# stride 3 doubles, or 4 with -DPADDING. With -DONLY_FIRST_DIMENSION
# only the first component is gathered.
gather_aos :
push rbp
mov rbp, rsp
push rbx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
xor rax, rax                       # rax = running index offset
.align 16
1:
vmovdqu ymm3, YMMWORD PTR [rsi + rax * 4]
# Scale indices: ymm4 = 2*idx, then ymm3 = 4*idx (PADDING) or 3*idx.
vpaddd ymm4, ymm3, ymm3
#ifdef PADDING
vpaddd ymm3, ymm4, ymm4
#else
vpaddd ymm3, ymm3, ymm4
#endif
# Prefetching instructions
#mov ebx, DWORD PTR[rsi + rax*4]
#mov r9d, DWORD PTR[4 + rsi + rax*4]
#mov r10d, DWORD PTR[8 + rsi + rax*4]
#mov r11d, DWORD PTR[12 + rsi + rax*4]
#mov r12d, DWORD PTR[16 + rsi + rax*4]
#mov r13d, DWORD PTR[20 + rsi + rax*4]
#mov r14d, DWORD PTR[24 + rsi + rax*4]
#mov r15d, DWORD PTR[28 + rsi + rax*4]
#lea ebx, DWORD PTR[rbx]
#lea r9d, DWORD PTR[r9]
#lea r10d, DWORD PTR[r10]
#lea r11d, DWORD PTR[r11]
#lea r12d, DWORD PTR[r12]
#lea r13d, DWORD PTR[r13]
#lea r14d, DWORD PTR[r14]
#lea r15d, DWORD PTR[r15]
# Build all-ones masks (reg==reg compare); gathers consume them.
vpcmpeqb k1, xmm5, xmm5
#ifndef ONLY_FIRST_DIMENSION
vpcmpeqb k2, xmm5, xmm5
vpcmpeqb k3, xmm5, xmm5
#endif
# Zero destinations: masked gathers merge into prior contents.
vpxord zmm0, zmm0, zmm0
#ifndef ONLY_FIRST_DIMENSION
vpxord zmm1, zmm1, zmm1
vpxord zmm2, zmm2, zmm2
#endif
#ifdef MEASURE_GATHER_CYCLES
# rdtsc clobbers edx:eax, so preserve the loop counter and N here.
mov r9, rax
mov r10, rdx
# r11 = 3*rax bytes: three 8-byte timing slots per 8-lane iteration
# (rax advances by 8, so r11 advances by 24 bytes = 3 qwords).
xor r11, r11
add r11, rax
add r11, rax
add r11, rax
#shr r11, 3
# Timing pattern per gather: lfence serializes, rdtsc before/after,
# low-32-bit delta stored non-temporally (movnti) so the timing
# writes do not disturb the caches being measured.
xor rbx, rbx
lfence
rdtsc
add ebx, eax
vgatherdpd zmm0{k1}, [rdi + ymm3 * 8]
lfence
rdtsc
sub eax, ebx
#movdiri [r8 + r11], rax
movnti [r8 + r11], rax
#ifndef ONLY_FIRST_DIMENSION
xor rbx, rbx
lfence
rdtsc
add ebx, eax
vgatherdpd zmm1{k2}, [8 + rdi + ymm3 * 8]
lfence
rdtsc
sub eax, ebx
#movdiri [8 + r8 + r11], rax
movnti [8 + r8 + r11], rax
xor rbx, rbx
lfence
rdtsc
add ebx, eax
vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8]
lfence
rdtsc
sub eax, ebx
#movdiri [16 + r8 + r11], rax
movnti [16 + r8 + r11], rax
#endif // ONLY_FIRST_DIMENSION
# Restore loop counter and N clobbered by rdtsc.
mov rax, r9
mov rdx, r10
#else // MEASURE_GATHER_CYCLES
vgatherdpd zmm0{k1}, [ rdi + ymm3 * 8]
#ifndef ONLY_FIRST_DIMENSION
vgatherdpd zmm1{k2}, [8 + rdi + ymm3 * 8]
vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8]
#endif
#endif // MEASURE_GATHER_CYCLES
#ifdef TEST
# Verification layout: three planes of N doubles in t.
vmovupd [rcx + rax * 8], zmm0
lea rbx, [rcx + rdx * 8]
vmovupd [rbx + rax * 8], zmm1
lea r9, [rbx + rdx * 8]
vmovupd [r9 + rax * 8], zmm2
#endif
addq rax, 8
cmpq rax, rdx
jl 1b
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rbx
mov rsp, rbp
pop rbp
ret
.size gather_aos, .-gather_aos

View File

@@ -1,147 +0,0 @@
.intel_syntax noprefix
.data
.align 64
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
.section .rodata, "a"
.align 64
.align 64
.ymm_reg_mask.1:
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
.type .ymm_reg_mask.1,@object
.size .ymm_reg_mask.1,32
.align 8
# rdi -> a
# rsi -> neighbors
# rdx -> numneighs[i]
# rcx -> &t[t_idx]
# r8 -> ntest
.text
.globl gather_md_aos
.type gather_md_aos, @function
# MD-style AoS gather over one neighbor list (AVX-512, GAS Intel syntax).
# In:  rdi = a            (AoS data: 3 doubles per element, 4 w/ PADDING)
#      rsi = neighbors    (32-bit indices)
#      rdx = numneighs[i] (number of indices)
#      rcx = &t[t_idx]    (verification buffer, -DTEST only)
#      r8  = ntest        (plane stride of t in the TEST stores)
# Main loop gathers 8 indices per pass; a masked tail handles the
# remaining 1..7. NOTE(review): the first pass always gathers a full
# 8 lanes, so lists shorter than 8 read past the neighbor list --
# confirm callers pad numneighs/storage accordingly.
#
# Fix: the tail previously zeroed zmm0 with "vpxord zmm0, zmm1, zmm2",
# i.e. zmm1^zmm2 (stale data from the last full iteration) instead of
# zero. Since the masked gather below uses merging-masking ({k1} without
# {z}), the inactive lanes kept that garbage, which the TEST stores then
# wrote out. Now zeroed as reg^reg like every other destination.
gather_md_aos :
push rbp
mov rbp, rsp
push rbx
push r10
push r11
push r12
push r13
push r14
push r15
vmovdqu ymm7, YMMWORD PTR .ymm_reg_mask.1[rip]   # lane ids 0..7 for the tail mask
mov r15, rdx                       # r15 = indices remaining
xor rax, rax                       # rax = current offset into neighbors/t
.align 16
1:
vmovdqu ymm3, YMMWORD PTR [rsi + rax * 4]
# Scale indices to AoS stride: ymm4 = 2*idx, ymm3 = 4*idx (PADDING) or 3*idx.
vpaddd ymm4, ymm3, ymm3
#ifdef PADDING
vpaddd ymm3, ymm4, ymm4
#else
vpaddd ymm3, ymm3, ymm4
#endif
# Prefetching instructions
#mov ebx, DWORD PTR[rsi + rax*4]
#mov r9d, DWORD PTR[4 + rsi + rax*4]
#mov r10d, DWORD PTR[8 + rsi + rax*4]
#mov r11d, DWORD PTR[12 + rsi + rax*4]
#mov r12d, DWORD PTR[16 + rsi + rax*4]
#mov r13d, DWORD PTR[20 + rsi + rax*4]
#mov r14d, DWORD PTR[24 + rsi + rax*4]
#mov r15d, DWORD PTR[28 + rsi + rax*4]
#lea ebx, DWORD PTR[rbx]
#lea r9d, DWORD PTR[r9]
#lea r10d, DWORD PTR[r10]
#lea r11d, DWORD PTR[r11]
#lea r12d, DWORD PTR[r12]
#lea r13d, DWORD PTR[r13]
#lea r14d, DWORD PTR[r14]
#lea r15d, DWORD PTR[r15]
# All-ones masks (reg==reg compare); gathers consume their masks.
vpcmpeqb k1, xmm5, xmm5
#ifndef ONLY_FIRST_DIMENSION
vpcmpeqb k2, xmm5, xmm5
vpcmpeqb k3, xmm5, xmm5
#endif
# Zero destinations: masked gathers merge into prior contents.
vpxord zmm0, zmm0, zmm0
#ifndef ONLY_FIRST_DIMENSION
vpxord zmm1, zmm1, zmm1
vpxord zmm2, zmm2, zmm2
#endif
vgatherdpd zmm0{k1}, [ rdi + ymm3 * 8]
#ifndef ONLY_FIRST_DIMENSION
vgatherdpd zmm1{k2}, [8 + rdi + ymm3 * 8]
vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8]
#endif
#ifdef TEST
# Verification layout: three planes of ntest doubles in t.
vmovupd [rcx + rax * 8], zmm0
lea rbx, [rcx + r8 * 8]
vmovupd [rbx + rax * 8], zmm1
lea r10, [rbx + r8 * 8]
vmovupd [r10 + rax * 8], zmm2
#endif
# TODO: see if this logic can be optimized
addq rax, 8
subq r15, 8
cmpq r15, 8
jge 1b
cmpq r15, 0
jle .end_func
# Masked tail for the remaining 1..7 indices:
# k1 = (lane id < r15), computed against the 0..7 constant in ymm7.
vpbroadcastd ymm6, r15d
vpcmpgtd k1, ymm6, ymm7
vmovdqu32 ymm3{k1}{z}, YMMWORD PTR [rsi + rax * 4]
vpaddd ymm4, ymm3, ymm3
#ifdef PADDING
vpaddd ymm3, ymm4, ymm4
#else
vpaddd ymm3, ymm3, ymm4
#endif
vpxord zmm0, zmm0, zmm0            # FIX: was "vpxord zmm0, zmm1, zmm2" (stale lanes)
#ifndef ONLY_FIRST_DIMENSION
kmovw k2, k1
kmovw k3, k1
vpxord zmm1, zmm1, zmm1
vpxord zmm2, zmm2, zmm2
#endif
vgatherdpd zmm0{k1}, [ rdi + ymm3 * 8]
#ifndef ONLY_FIRST_DIMENSION
vgatherdpd zmm1{k2}, [8 + rdi + ymm3 * 8]
vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8]
#endif
#ifdef TEST
vmovupd [rcx + rax * 8], zmm0
lea rbx, [rcx + r8 * 8]
vmovupd [rbx + rax * 8], zmm1
lea r10, [rbx + r8 * 8]
vmovupd [r10 + rax * 8], zmm2
#endif
addq rax, r15
.end_func:
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop rbx
mov rsp, rbp
pop rbp
ret
.size gather_md_aos, .-gather_md_aos

View File

@@ -1,67 +0,0 @@
.intel_syntax noprefix
.data
.align 64
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
# void gather_soa(double *a, int *idx, int N, double *t, long *cycles)
# ABI:  SysV AMD64 (leaf function, no calls)
# rdi -> a     SoA positions: x at a[0..N), y at a[N..2N), z at a[2N..3N)
# rsi -> idx   int gather indices, N entries
# edx -> N     element count; loop has no remainder handling, so N is
#              assumed to be a multiple of 8 (caller pads -- confirm)
# rcx -> t     test output buffer, written only when built with -DTEST
# r8/cycles is unused here and r8 is reused as the y-plane base.
# Gathered values are left in zmm0/zmm1/zmm2 (benchmark convention).
.text
.globl gather_soa
.type gather_soa, @function
gather_soa :
push rbp
mov rbp, rsp
push rbx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# N is a 32-bit int argument: per the SysV ABI the upper 32 bits of rdx
# are undefined on entry, but rdx is used below in 64-bit address
# arithmetic and a 64-bit compare. Sign-extend it once up front.
movsxd rdx, edx
xor rax, rax                          # rax = current element offset
vpcmpeqd ymm8, ymm8, ymm8             # NOTE(review): ymm8 is never read afterwards
lea r8, [rdi + rdx * 8]               # r8 = &a[N]   (y plane)
lea r9, [r8 + rdx * 8]                # r9 = &a[2N]  (z plane)
.align 16
1:
vmovdqu ymm3, YMMWORD PTR [rsi + rax * 4]   # load 8 indices
vpcmpeqb k1, xmm5, xmm5               # all-ones gather masks
vpcmpeqb k2, xmm5, xmm5               # (xmm5 content is irrelevant: reg vs itself)
vpcmpeqb k3, xmm5, xmm5
vpxord zmm0, zmm0, zmm0               # zero destinations: breaks false deps and
vpxord zmm1, zmm1, zmm1               # gives defined values in masked-off lanes
vpxord zmm2, zmm2, zmm2
vgatherdpd zmm0{k1}, [rdi + ymm3 * 8] # x[idx]
vgatherdpd zmm1{k2}, [r8 + ymm3 * 8]  # y[idx]
vgatherdpd zmm2{k3}, [r9 + ymm3 * 8]  # z[idx]
#ifdef TEST
vmovupd [rcx + rax * 8], zmm0         # store gathered planes into t for checking
lea rbx, [rcx + rdx * 8]
vmovupd [rbx + rax * 8], zmm1
lea r10, [rbx + rdx * 8]
vmovupd [r10 + rax * 8], zmm2
#endif
addq rax, 8
cmpq rax, rdx
jl 1b
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rbx
mov rsp, rbp
pop rbp
ret
.size gather_soa, .-gather_soa

View File

@@ -1,23 +0,0 @@
.intel_syntax noprefix
.data
.align 64
# NOTE(review): SCALAR appears unused by load_aos; presumably scaffolding
# shared across the kernel files -- confirm before removing.
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
# void load_aos(double *a_i)  --  SysV AMD64, leaf
# rdi -> &a[i * snbytes]  (x, y, z of one AoS element)
# Broadcasts the element's three coordinates across full ZMM registers:
#   zmm3 = {x,...}, zmm4 = {y,...}, zmm5 = {z,...}
# The results are handed to the following gather kernel through these
# registers (benchmark convention; not safe across ordinary ABI calls).
.text
.globl load_aos
.type load_aos, @function
load_aos :
vmovsd xmm0, QWORD PTR [rdi]          # x
vmovsd xmm1, QWORD PTR [8 + rdi]      # y
vmovsd xmm2, QWORD PTR [16 + rdi]     # z
vbroadcastsd zmm3, xmm0
vbroadcastsd zmm4, xmm1
vbroadcastsd zmm5, xmm2
ret
.size load_aos, .-load_aos

View File

@@ -1,32 +0,0 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* =======================================================================================
*/
#ifndef __ALLOCATE_H_
#define __ALLOCATE_H_

#include <stddef.h> /* size_t: header was not self-contained without it */

/* Allocate bytesize bytes aligned to the given alignment.
 * Implementation provided elsewhere; presumably aborts or returns NULL
 * on failure -- confirm against allocate.c. */
extern void* allocate (int alignment, size_t bytesize);

#endif

View File

@@ -1,53 +0,0 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* =======================================================================================
*/
#ifndef LIKWID_MARKERS_H
#define LIKWID_MARKERS_H

/* Thin wrappers around the LIKWID marker API: when compiled with
 * -DLIKWID_PERFMON they forward to the real likwid_marker* calls,
 * otherwise they expand to nothing so instrumented code builds and
 * runs without LIKWID installed. */
#ifdef LIKWID_PERFMON
#include <likwid.h>
#define LIKWID_MARKER_INIT likwid_markerInit()
#define LIKWID_MARKER_THREADINIT likwid_markerThreadInit()
#define LIKWID_MARKER_SWITCH likwid_markerNextGroup()
#define LIKWID_MARKER_REGISTER(regionTag) likwid_markerRegisterRegion(regionTag)
#define LIKWID_MARKER_START(regionTag) likwid_markerStartRegion(regionTag)
#define LIKWID_MARKER_STOP(regionTag) likwid_markerStopRegion(regionTag)
#define LIKWID_MARKER_CLOSE likwid_markerClose()
#define LIKWID_MARKER_RESET(regionTag) likwid_markerResetRegion(regionTag)
#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count) likwid_markerGetRegion(regionTag, nevents, events, time, count)
#else /* LIKWID_PERFMON */
/* No-op stubs: keep the same macro set as the instrumented branch. */
#define LIKWID_MARKER_INIT
#define LIKWID_MARKER_THREADINIT
#define LIKWID_MARKER_SWITCH
#define LIKWID_MARKER_REGISTER(regionTag)
#define LIKWID_MARKER_START(regionTag)
#define LIKWID_MARKER_STOP(regionTag)
#define LIKWID_MARKER_CLOSE
#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count)
#define LIKWID_MARKER_RESET(regionTag)
#endif /* LIKWID_PERFMON */
#endif /*LIKWID_MARKERS_H*/

View File

@@ -1,34 +0,0 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* =======================================================================================
*/
#ifndef __TIMING_H_
#define __TIMING_H_

/* Wall-clock timestamp in seconds (implementation provided elsewhere). */
extern double getTimeStamp();
/* Smallest measurable time difference of the timer, in seconds. */
extern double getTimeResolution();
/* NOTE(review): trailing-underscore name suggests a Fortran-callable
 * alias of getTimeStamp -- confirm against timing.c. */
extern double getTimeStamp_();

#endif

View File

@@ -1,441 +0,0 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* =======================================================================================
*/
#include <float.h>
#include <getopt.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <x86intrin.h>
//---
#include <likwid-marker.h>
//---
#include <allocate.h>
#include <timing.h>
#if !defined(ISA_avx2) && !defined (ISA_avx512)
#error "Invalid ISA macro, possible values are: avx2 and avx512"
#endif
#if defined(TEST) && defined(ONLY_FIRST_DIMENSION)
#error "TEST and ONLY_FIRST_DIMENSION options are mutually exclusive!"
#endif
#define HLINE "----------------------------------------------------------------------------\n"
#ifndef MIN
#define MIN(x,y) ((x)<(y)?(x):(y))
#endif
#ifndef MAX
#define MAX(x,y) ((x)>(y)?(x):(y))
#endif
#ifndef ABS
#define ABS(a) ((a) >= 0 ? (a) : -(a))
#endif
#define ARRAY_ALIGNMENT 64
#ifdef ISA_avx512
#define _VL_ 8
#define ISA_STRING "avx512"
#else
#define _VL_ 4
#define ISA_STRING "avx2"
#endif
#ifdef AOS
#define GATHER gather_md_aos
#define LOAD(a, i, d, n) load_aos(&a[i * d])
#define LAYOUT_STRING "AoS"
#else
#define GATHER gather_md_soa
#define LOAD(a, i, d, n) load_soa(a, i, n)
#define LAYOUT_STRING "SoA"
#endif
#if defined(PADDING) && defined(AOS)
#define PADDING_BYTES 1
#else
#define PADDING_BYTES 0
#endif
#ifdef MEM_TRACER
# define MEM_TRACER_INIT(trace_file) FILE *mem_tracer_fp = fopen(get_mem_tracer_filename(trace_file), "w");
# define MEM_TRACER_END fclose(mem_tracer_fp);
# define MEM_TRACE(addr, op) fprintf(mem_tracer_fp, "%c: %p\n", op, (void *)(&(addr)));
#else
# define MEM_TRACER_INIT
# define MEM_TRACER_END
# define MEM_TRACE(addr, op)
#endif
int gather_md_aos(double*, int*, int, double*, int);
int gather_md_soa(double*, int*, int, double*, int);
void load_aos(double*);
void load_soa(double*, int, int);
/* Compose the memory-tracer output file name for a given trace file.
 * Returns a pointer to a static buffer: not thread-safe, and the
 * contents are overwritten on every call. */
const char *get_mem_tracer_filename(const char *trace_file) {
    static char buffer[64];
    snprintf(buffer, sizeof(buffer), "mem_tracer_%s.txt", trace_file);
    return buffer;
}
/* Integer floor(log2(x)); both x == 0 and x == 1 yield 0. */
int log2_uint(unsigned int x) {
    int bits = 0;
    for(; x > 1; x >>= 1) {
        bits++;
    }
    return bits;
}
/*
 * MD variant of the gather benchmark.
 *
 * Every `reneigh_every` timesteps a trace file "<trace>_<ts>.out" written
 * by MD-Bench is parsed to rebuild the neighbor lists; then, for each
 * local atom, the positions of all its neighbors are gathered (through
 * the external GATHER kernel, or through the inlined AVX-512 assembly
 * below for ISA_avx512) and the whole loop is timed.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE on bad arguments, unreadable
 * trace files, or (with -DTEST) verification failure.
 */
int main (int argc, char** argv) {
    LIKWID_MARKER_INIT;
    LIKWID_MARKER_REGISTER("gather");
    char *trace_file = NULL;
    int cl_size = 64;               /* cache line size in bytes */
    int ntimesteps = 200;
    int reneigh_every = 20;         /* reneighboring frequency in timesteps */
    int opt = 0;
    double freq = 2.5;              /* CPU frequency in GHz (Hz after scaling below) */
    struct option long_opts[] = {
        {"trace" , required_argument, NULL, 't'},
        {"freq", required_argument, NULL, 'f'},
        {"line", required_argument, NULL, 'l'},
        {"timesteps", required_argument, NULL, 'n'},
        {"reneigh", required_argument, NULL, 'r'},
        {"help", required_argument, NULL, 'h'}
    };

    while((opt = getopt_long(argc, argv, "t:f:l:n:r:h", long_opts, NULL)) != -1) {
        switch(opt) {
        case 't':
            trace_file = strdup(optarg);
            break;
        case 'f':
            freq = atof(optarg);
            break;
        case 'l':
            cl_size = atoi(optarg);
            break;
        case 'n':
            ntimesteps = atoi(optarg);
            break;
        case 'r':
            reneigh_every = atoi(optarg);
            break;
        case 'h':
        case '?':
        default:
            printf("Usage: %s [OPTION]...\n", argv[0]);
            printf("MD variant for gather benchmark.\n\n");
            printf("Mandatory arguments to long options are also mandatory for short options.\n");
            printf("\t-t, --trace=STRING input file with traced indexes from MD-Bench.\n");
            printf("\t-f, --freq=REAL CPU frequency in GHz (default 2.5).\n");
            printf("\t-l, --line=NUMBER cache line size in bytes (default 64).\n");
            printf("\t-n, --timesteps=NUMBER number of timesteps to simulate (default 200).\n");
            printf("\t-r, --reneigh=NUMBER reneighboring frequency in timesteps (default 20).\n");
            printf("\t-h, --help display this help message.\n");
            printf("\n\n");
            return EXIT_FAILURE;
        }
    }

    if(trace_file == NULL) {
        fprintf(stderr, "Trace file not specified!\n");
        return EXIT_FAILURE;
    }

    FILE *fp;
    char *line = NULL;
    int *neighborlists = NULL;      /* nlocal x maxneighs neighbor indices */
    int *numneighs = NULL;          /* number of neighbors per local atom */
    int atom = -1;                  /* atom being parsed (assumes "A:" precedes "I:" lines) */
    int nlocal, nghost, maxneighs;
    int nall = 0;
    int N_alloc = 0;
    size_t ntest = 0;
    size_t llen = 0;                /* must be initialized: getline() reads it */
    ssize_t read;
    double *a = NULL;               /* positions (AoS or SoA, see LAYOUT_STRING) */
    double *f = NULL;               /* forces, always AoS */
    double *t = NULL;               /* gathered values, only used with -DTEST */
    double time = 0.0;
    double E, S;
    const int dims = 3;
    const int snbytes = dims + PADDING_BYTES; // doubles per AoS element, includes padding
    long long int niters = 0;
    long long int ngathered = 0;
    printf("ISA,Layout,Dims,Frequency (GHz),Cache Line Size (B),Vector Width (e)\n");
    printf("%s,%s,%d,%f,%d,%d\n\n", ISA_STRING, LAYOUT_STRING, dims, freq, cl_size, _VL_);
    freq = freq * 1e9;              /* GHz -> Hz */

#ifdef ONLY_FIRST_DIMENSION
    const int gathered_dims = 1;
#else
    const int gathered_dims = dims;
#endif

    for(int ts = -1; ts < ntimesteps; ts++) {
        /* Reneighboring step: (re)read the neighbor lists from the trace. */
        if(!((ts + 1) % reneigh_every)) {
            char ts_trace_file[128];
            snprintf(ts_trace_file, sizeof ts_trace_file, "%s_%d.out", trace_file, ts + 1);
            if((fp = fopen(ts_trace_file, "r")) == NULL) {
                fprintf(stderr, "Error: could not open trace file!\n");
                return EXIT_FAILURE;
            }

            while((read = getline(&line, &llen, fp)) != -1) {
                int i = 2;
                /* "N: <nlocal> <nghost> <maxneighs>" -- problem sizes */
                if(strncmp(line, "N:", 2) == 0) {
                    while(line[i] == ' ') { i++; }
                    nlocal = atoi(strtok(&line[i], " "));
                    nghost = atoi(strtok(NULL, " "));
                    nall = nlocal + nghost;
                    maxneighs = atoi(strtok(NULL, " "));
                    if(nlocal <= 0 || maxneighs <= 0) {
                        fprintf(stderr, "Number of local atoms and neighbor lists capacity cannot be less or equal than zero!\n");
                        return EXIT_FAILURE;
                    }

                    /* Allocated once; assumes nlocal/maxneighs never grow
                     * in later traces -- TODO confirm against MD-Bench. */
                    if(neighborlists == NULL) {
                        neighborlists = (int *) allocate( ARRAY_ALIGNMENT, nlocal * maxneighs * sizeof(int) );
                        numneighs = (int *) allocate( ARRAY_ALIGNMENT, nlocal * sizeof(int) );
                    }
                }

                /* "A: <atom>" -- start of an atom's neighbor list */
                if(strncmp(line, "A:", 2) == 0) {
                    while(line[i] == ' ') { i++; }
                    atom = atoi(strtok(&line[i], " "));
                    numneighs[atom] = 0;
                }

                /* "I: <idx> <idx> ..." -- neighbor indices for current atom */
                if(strncmp(line, "I:", 2) == 0) {
                    while(line[i] == ' ') { i++; }
                    char *neigh_idx = strtok(&line[i], " ");
                    while(neigh_idx != NULL && *neigh_idx != '\n') {
                        int j = numneighs[atom];
                        neighborlists[atom * maxneighs + j] = atoi(neigh_idx);
                        numneighs[atom]++;
                        ntest++;
                        neigh_idx = strtok(NULL, " ");
                    }
                }
            }

            fclose(fp);
        }

        if(N_alloc == 0) {
            N_alloc = nall * 2;     /* head room for later (larger) traces */
            a = (double*) allocate( ARRAY_ALIGNMENT, N_alloc * snbytes * sizeof(double) );
            f = (double*) allocate( ARRAY_ALIGNMENT, N_alloc * dims * sizeof(double) );
        }

#ifdef TEST
        if(t != NULL) { free(t); }
        ntest += 100;               /* slack in case the next trace adds neighbors */
        t = (double*) allocate( ARRAY_ALIGNMENT, ntest * dims * sizeof(double) );
#endif

        /* (Re)initialize positions to a known pattern so TEST can verify
         * the gathered values, and clear the forces. */
        for(int i = 0; i < N_alloc; ++i) {
#ifdef AOS
            a[i * snbytes + 0] = i * dims + 0;
            a[i * snbytes + 1] = i * dims + 1;
            a[i * snbytes + 2] = i * dims + 2;
#else
            /* SoA planes are N_alloc elements apart (was the undefined `N`) */
            a[N_alloc * 0 + i] = N_alloc * 0 + i;
            a[N_alloc * 1 + i] = N_alloc * 1 + i;
            a[N_alloc * 2 + i] = N_alloc * 2 + i;
#endif
            f[i * dims + 0] = 0.0;
            f[i * dims + 1] = 0.0;
            f[i * dims + 2] = 0.0;
        }

        int t_idx = 0;
        S = getTimeStamp();
        LIKWID_MARKER_START("gather");

        for(int i = 0; i < nlocal; i++) {
            int *neighbors = &neighborlists[i * maxneighs];
            // We inline the assembly for AVX512 with AoS layout to evaluate the impact
            // of calling external assembly procedures in the overall runtime
#ifdef ISA_avx512
            __m256i ymm_reg_mask = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
            /* Broadcast atom i's coordinates (mimics load_aos); the gather
             * below immediately re-zeroes zmm0-2, mirroring the external
             * kernels' register usage. */
            __asm__ __volatile__( "vmovsd 0(%0), %%xmm3;"
                                  "vmovsd 8(%0), %%xmm4;"
                                  "vmovsd 16(%0), %%xmm5;"
                                  "vbroadcastsd %%xmm3, %%zmm0;"
                                  "vbroadcastsd %%xmm4, %%zmm1;"
                                  "vbroadcastsd %%xmm5, %%zmm2;"
                                  :
                                  : "r" (&a[i * snbytes])
                                  : "%xmm3", "%xmm4", "%xmm5", "%zmm0", "%zmm1", "%zmm2" );
            /* Gather x/y/z of all neighbors, 8 at a time; ymm3 holds the
             * element indices scaled to doubles (x3, or x4 with PADDING).
             * NOTE(review): the first pass always gathers a full batch of
             * 8 indices -- assumes numneighs[i] >= 8 or padded lists, as
             * in the standalone kernels; confirm. */
            __asm__ __volatile__( "xor %%rax, %%rax;"
                                  "movq %%rdx, %%r15;"
                                  "1: vmovdqu (%1,%%rax,4), %%ymm3;"
                                  "vpaddd %%ymm3, %%ymm3, %%ymm4;"
#ifdef PADDING
                                  "vpaddd %%ymm4, %%ymm4, %%ymm3;"
#else
                                  "vpaddd %%ymm3, %%ymm4, %%ymm3;"
#endif
                                  "vpcmpeqb %%xmm5, %%xmm5, %%k1;"
                                  "vpcmpeqb %%xmm5, %%xmm5, %%k2;"
                                  "vpcmpeqb %%xmm5, %%xmm5, %%k3;"
                                  "vpxord %%zmm0, %%zmm0, %%zmm0;"
                                  "vpxord %%zmm1, %%zmm1, %%zmm1;"
                                  "vpxord %%zmm2, %%zmm2, %%zmm2;"
                                  /* mask syntax fixed: {{...}} is not valid GAS */
                                  "vgatherdpd (%3, %%ymm3, 8), %%zmm0{%%k1};"
                                  "vgatherdpd 8(%3, %%ymm3, 8), %%zmm1{%%k2};"
                                  "vgatherdpd 16(%3, %%ymm3, 8), %%zmm2{%%k3};"
                                  "addq $8, %%rax;"
                                  "subq $8, %%r15;"
                                  "cmpq $8, %%r15;"
                                  "jge 1b;"
                                  "cmpq $0, %%r15;"
                                  "jle 2f;"           /* was "jle 2": forward local ref needs `f` */
                                  /* remainder: active lanes = (lane < neighbors left) */
                                  "vpbroadcastd %%r15d, %%ymm5;"
                                  /* operands were inverted (computed lane > remaining) */
                                  "vpcmpgtd %2, %%ymm5, %%k1;"
                                  "vmovdqu32 (%1,%%rax,4), %%ymm3{%%k1}{z};"
                                  "vpaddd %%ymm3, %%ymm3, %%ymm4;"
#ifdef PADDING
                                  "vpaddd %%ymm4, %%ymm4, %%ymm3;"
#else
                                  "vpaddd %%ymm3, %%ymm4, %%ymm3;"
#endif
                                  "vpxord %%zmm0, %%zmm0, %%zmm0;"
                                  "kmovw %%k1, %%k2;"
                                  "kmovw %%k1, %%k3;"
                                  "vpxord %%zmm1, %%zmm1, %%zmm1;"
                                  "vpxord %%zmm2, %%zmm2, %%zmm2;"
                                  "vgatherdpd (%3, %%ymm3, 8), %%zmm0{%%k1};"
                                  "vgatherdpd 8(%3, %%ymm3, 8), %%zmm1{%%k2};"
                                  "vgatherdpd 16(%3, %%ymm3, 8), %%zmm2{%%k3};"
                                  "addq %%r15, %%rax;"
                                  "2:;"
                                  :
                                  : "d" (numneighs[i]), "r" (neighbors), "x" (ymm_reg_mask), "r" (a)
                                  : "%rax", "%r15", "%ymm3", "%ymm4", "%ymm5", "%k1", "%k2", "%k3", "%zmm0", "%zmm1", "%zmm2" );
#else
            LOAD(a, i, snbytes, N_alloc);
            t_idx += GATHER(a, neighbors, numneighs[i], &t[t_idx], ntest);
#endif
            f[i * dims + 0] += i;
            f[i * dims + 1] += i;
            f[i * dims + 2] += i;
        }

        LIKWID_MARKER_STOP("gather");
        E = getTimeStamp();
        time += E - S;

#ifdef MEM_TRACER
        MEM_TRACER_INIT(trace_file);
        /* Replay the access pattern of the timed loop for the tracer.
         * MEM_TRACE takes (addr, op) -- the arguments were swapped here. */
        for(int i = 0; i < nlocal; i++) {
            int *neighbors = &neighborlists[i * maxneighs];
            for(int d = 0; d < gathered_dims; d++) {
#ifdef AOS
                MEM_TRACE(a[i * snbytes + d], 'R')
#else
                MEM_TRACE(a[d * N_alloc + i], 'R')
#endif
            }
            for(int j = 0; j < numneighs[i]; j += _VL_) {
                /* inner loop incremented j instead of jj (infinite loop) */
                for(int jj = j; jj < MIN(j + _VL_, numneighs[i]); jj++) {
                    int k = neighbors[jj];
                    for(int d = 0; d < gathered_dims; d++) {
#ifdef AOS
                        MEM_TRACE(a[k * snbytes + d], 'R')
#else
                        MEM_TRACE(a[d * N_alloc + k], 'R')
#endif
                    }
                }
            }
        }
        MEM_TRACER_END;
#endif

#ifdef TEST
        /* Verify every gathered value against the init pattern above. */
        int test_failed = 0;
        t_idx = 0;
        for(int i = 0; i < nlocal; ++i) {
            int *neighbors = &neighborlists[i * maxneighs];
            for(int j = 0; j < numneighs[i]; ++j) {
                int k = neighbors[j];
                for(int d = 0; d < dims; ++d) {
#ifdef AOS
                    if(t[d * ntest + t_idx] != k * dims + d) {
#else
                    if(t[d * ntest + t_idx] != d * N_alloc + k) {
#endif
                        test_failed = 1;
                        break;
                    }
                }
                t_idx++;
            }
        }
        if(test_failed) {
            printf("Test failed!\n");
            return EXIT_FAILURE;
        }
#endif

        /* Per-timestep statistics: vector iterations and gathered elements. */
        for(int i = 0; i < nlocal; i++) {
            niters += (numneighs[i] / _VL_) + ((numneighs[i] % _VL_ == 0) ? 0 : 1);
            ngathered += numneighs[i];
        }
    }

    printf("%14s,%14s,%14s,%14s,%14s,%14s", "tot. time(s)", "time/step(ms)", "time/iter(us)", "cy/it", "cy/gather", "cy/elem");
    printf("\n");
    const double time_per_step = time * 1e3 / ((double) ntimesteps);
    const double time_per_it = time * 1e6 / ((double) niters);
    const double cy_per_it = time * freq * _VL_ / ((double) niters);
    const double cy_per_gather = time * freq * _VL_ / ((double) niters * gathered_dims);
    const double cy_per_elem = time * freq / ((double) ngathered * gathered_dims);
    printf("%14.6f,%14.6f,%14.6f,%14.6f,%14.6f,%14.6f\n", time, time_per_step, time_per_it, cy_per_it, cy_per_gather, cy_per_elem);
#ifdef TEST
    printf("Test passed!\n");
#endif
    LIKWID_MARKER_CLOSE;
    return EXIT_SUCCESS;
}

View File

@@ -1,361 +0,0 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* =======================================================================================
*/
#include <float.h>
#include <getopt.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
//---
#include <likwid-marker.h>
//---
#include <allocate.h>
#include <timing.h>
#if !defined(ISA_avx2) && !defined (ISA_avx512)
#error "Invalid ISA macro, possible values are: avx2 and avx512"
#endif
#if defined(TEST) && defined(ONLY_FIRST_DIMENSION)
#error "TEST and ONLY_FIRST_DIMENSION options are mutually exclusive!"
#endif
#define HLINE "----------------------------------------------------------------------------\n"
#ifndef MIN
#define MIN(x,y) ((x)<(y)?(x):(y))
#endif
#ifndef MAX
#define MAX(x,y) ((x)>(y)?(x):(y))
#endif
#ifndef ABS
#define ABS(a) ((a) >= 0 ? (a) : -(a))
#endif
#define ARRAY_ALIGNMENT 64
#define SIZE 20000
#ifdef ISA_avx512
#define _VL_ 8
#define ISA_STRING "avx512"
#else
#define _VL_ 4
#define ISA_STRING "avx2"
#endif
#ifdef AOS
#define GATHER gather_aos
#define LAYOUT_STRING "AoS"
#else
#define GATHER gather_soa
#define LAYOUT_STRING "SoA"
#endif
#if defined(PADDING) && defined(AOS)
#define PADDING_BYTES 1
#else
#define PADDING_BYTES 0
#endif
#ifdef MEM_TRACER
# define MEM_TRACER_INIT(stride, size) FILE *mem_tracer_fp = fopen(get_mem_tracer_filename(stride, size), "w");
# define MEM_TRACER_END fclose(mem_tracer_fp);
# define MEM_TRACE(addr, op) fprintf(mem_tracer_fp, "%c: %p\n", op, (void *)(&(addr)));
#else
# define MEM_TRACER_INIT
# define MEM_TRACER_END
# define MEM_TRACE(addr, op)
#endif
extern void gather_aos(double*, int*, int, double*, long int*);
extern void gather_soa(double*, int*, int, double*, long int*);
/* Compose the memory-tracer output file name for a stride/size pair.
 * The result lives in a static buffer that is overwritten on every
 * call (not thread-safe). */
const char *get_mem_tracer_filename(int stride, int size) {
    static char name[64];
    snprintf(name, sizeof(name), "mem_tracer_%d_%d.txt", stride, size);
    return name;
}
/* Integer floor(log2(x)); both x == 0 and x == 1 yield 0. */
int log2_uint(unsigned int x) {
    int bits = 0;
    while(x > 1) {
        x >>= 1;
        bits++;
    }
    return bits;
}
/*
 * Strided-index variant of the gather benchmark.
 *
 * For a range of working-set sizes N, builds an index array idx[i] =
 * (i * stride) % N and times repeated GATHER kernel invocations over it,
 * reporting cycles per iteration/gather/element (or, with
 * -DMEASURE_GATHER_CYCLES, per-gather cycle statistics).
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE on (with -DTEST) verification
 * failure.
 */
int main (int argc, char** argv) {
    LIKWID_MARKER_INIT;
    LIKWID_MARKER_REGISTER("gather");
    int stride = 1;                 /* distance between successive gathered elements */
    int cl_size = 64;               /* cache line size in bytes */
    int opt = 0;
    double freq = 2.5;              /* CPU frequency in GHz (Hz after scaling below) */
    struct option long_opts[] = {
        {"stride", required_argument, NULL, 's'},
        {"freq", required_argument, NULL, 'f'},
        {"line", required_argument, NULL, 'l'},
        {"help", required_argument, NULL, 'h'}
    };

    while((opt = getopt_long(argc, argv, "s:f:l:h", long_opts, NULL)) != -1) {
        switch(opt) {
        case 's':
            stride = atoi(optarg);
            break;
        case 'f':
            freq = atof(optarg);
            break;
        case 'l':
            cl_size = atoi(optarg);
            break;
        case 'h':
        case '?':
        default:
            printf("Usage: %s [OPTION]...\n", argv[0]);
            printf("MD variant for gather benchmark.\n\n");
            printf("Mandatory arguments to long options are also mandatory for short options.\n");
            printf("\t-s, --stride=NUMBER stride between two successive elements (default 1).\n");
            printf("\t-f, --freq=REAL CPU frequency in GHz (default 2.5).\n");
            printf("\t-l, --line=NUMBER cache line size in bytes (default 64).\n");
            printf("\t-h, --help display this help message.\n");
            printf("\n\n");
            return EXIT_FAILURE;
        }
    }

    /* (removed unused locals: bytesPerWord and an outer `N` that was
     * immediately shadowed by the size-sweep loop variable) */
    const int dims = 3;
    const int snbytes = dims + PADDING_BYTES; // doubles per AoS element, includes padding
#ifdef AOS
    size_t cacheLinesPerGather = MIN(MAX(stride * _VL_ * snbytes / (cl_size / sizeof(double)), 1), _VL_);
#else
    size_t cacheLinesPerGather = MIN(MAX(stride * _VL_ / (cl_size / sizeof(double)), 1), _VL_) * dims;
#endif
    double E, S;
    printf("ISA,Layout,Stride,Dims,Frequency (GHz),Cache Line Size (B),Vector Width (e),Cache Lines/Gather\n");
    printf("%s,%s,%d,%d,%f,%d,%d,%lu\n\n", ISA_STRING, LAYOUT_STRING, stride, dims, freq, cl_size, _VL_, cacheLinesPerGather);
    printf("%14s,%14s,%14s,", "N", "Size(kB)", "cut CLs");
#ifndef MEASURE_GATHER_CYCLES
    printf("%14s,%14s,%14s,%14s,%14s", "tot. time", "time/LUP(ms)", "cy/it", "cy/gather", "cy/elem");
#else
    /* Header columns must match the data rows, which print one column per
     * gathered dimension: the two branches below were inverted. */
#ifdef ONLY_FIRST_DIMENSION
    printf("%27s", "min/max/avg cy(x)");
#else
    printf("%27s,%27s,%27s", "min/max/avg cy(x)", "min/max/avg cy(y)", "min/max/avg cy(z)");
#endif
#endif
    printf("\n");
    freq = freq * 1e9;              /* GHz -> Hz */

    for(int N = 512; N < 80000000; N = 1.5 * N) {
        // Currently this only works when the array size (in elements) is multiple of the vector length (no preamble and prelude)
        if(N % _VL_ != 0) {
            N += _VL_ - (N % _VL_);
        }

        MEM_TRACER_INIT(stride, N);
        int N_gathers_per_dim = N / _VL_;
        int N_alloc = N * 2;        /* head room beyond the gathered range */
        int N_cycles_alloc = N_gathers_per_dim * 2;
        int cut_cl = 0;             /* gathers whose element straddles two cache lines (AoS) */
        double* a = (double*) allocate( ARRAY_ALIGNMENT, N_alloc * snbytes * sizeof(double) );
        int* idx = (int*) allocate( ARRAY_ALIGNMENT, N_alloc * sizeof(int) );
        int rep;
        double time;
#ifdef TEST
        double* t = (double*) allocate( ARRAY_ALIGNMENT, N_alloc * dims * sizeof(double) );
#else
        double* t = (double*) NULL;
#endif
#ifdef MEASURE_GATHER_CYCLES
        long int* cycles = (long int*) allocate( ARRAY_ALIGNMENT, N_cycles_alloc * dims * sizeof(long int)) ;
#else
        long int* cycles = (long int*) NULL;
#endif

        /* Positions follow a known pattern so TEST can verify gathers. */
        for(int i = 0; i < N_alloc; ++i) {
#ifdef AOS
            a[i * snbytes + 0] = i * dims + 0;
            a[i * snbytes + 1] = i * dims + 1;
            a[i * snbytes + 2] = i * dims + 2;
#else
            a[N * 0 + i] = N * 0 + i;
            a[N * 1 + i] = N * 1 + i;
            a[N * 2 + i] = N * 2 + i;
#endif
            idx[i] = (i * stride) % N;
        }

#ifdef ONLY_FIRST_DIMENSION
        const int gathered_dims = 1;
#else
        const int gathered_dims = dims;
#endif

#ifdef MEM_TRACER
        /* Replay the kernel's access pattern for the tracer. */
        for(int i = 0; i < N; i += _VL_) {
            for(int j = 0; j < _VL_; j++) {
                MEM_TRACE(idx[i + j], 'R');
            }
            for(int d = 0; d < gathered_dims; d++) {
                for(int j = 0; j < _VL_; j++) {
#ifdef AOS
                    MEM_TRACE(a[idx[i + j] * snbytes + d], 'R');
#else
                    MEM_TRACE(a[N * d + idx[i + j]], 'R');
#endif
                }
            }
        }
#endif

#ifdef AOS
        /* Count elements whose gathered span crosses a cache-line boundary. */
        const int cl_shift = log2_uint((unsigned int) cl_size);
        for(int i = 0; i < N; i++) {
            const int first_cl = (idx[i] * snbytes * sizeof(double)) >> cl_shift;
            const int last_cl = ((idx[i] * snbytes + gathered_dims - 1) * sizeof(double)) >> cl_shift;
            if(first_cl != last_cl) {
                cut_cl++;
            }
        }
#endif

        /* Warm-up, also used to calibrate the repetition count. */
        S = getTimeStamp();
        for(int r = 0; r < 100; ++r) {
            GATHER(a, idx, N, t, cycles);
        }
        E = getTimeStamp();

#ifdef MEASURE_GATHER_CYCLES
        /* Discard cycle samples recorded during warm-up. */
        for(int i = 0; i < N_cycles_alloc; i++) {
            cycles[i * 3 + 0] = 0;
            cycles[i * 3 + 1] = 0;
            cycles[i * 3 + 2] = 0;
        }
#endif

        rep = 100 * (0.5 / (E - S));    /* aim for ~0.5 s of timed work */
        S = getTimeStamp();
        LIKWID_MARKER_START("gather");
        for(int r = 0; r < rep; ++r) {
            GATHER(a, idx, N, t, cycles);
        }
        LIKWID_MARKER_STOP("gather");
        E = getTimeStamp();
        time = E - S;

#ifdef TEST
        int test_failed = 0;
        for(int i = 0; i < N; ++i) {
            for(int d = 0; d < dims; ++d) {
#ifdef AOS
                if(t[d * N + i] != ((i * stride) % N) * dims + d) {
#else
                if(t[d * N + i] != d * N + ((i * stride) % N)) {
#endif
                    test_failed = 1;
                    break;
                }
            }
        }
        if(test_failed) {
            printf("Test failed!\n");
            return EXIT_FAILURE;
        } else {
            printf("Test passed!\n");
        }
#endif

        const double size = N * (dims * sizeof(double) + sizeof(int)) / 1000.0;
        printf("%14d,%14.2f,%14d,", N, size, cut_cl);
#ifndef MEASURE_GATHER_CYCLES
        const double time_per_it = time * 1e6 / ((double) N * rep);
        const double cy_per_it = time * freq * _VL_ / ((double) N * rep);
        const double cy_per_gather = time * freq * _VL_ / ((double) N * rep * gathered_dims);
        const double cy_per_elem = time * freq / ((double) N * rep * gathered_dims);
        printf("%14.10f,%14.10f,%14.6f,%14.6f,%14.6f", time, time_per_it, cy_per_it, cy_per_gather, cy_per_elem);
#else
        /* min/max/avg cycles per gather, per dimension. */
        double cy_min[dims];
        double cy_max[dims];
        double cy_avg[dims];
        for(int d = 0; d < dims; d++) {
            cy_min[d] = 100000.0;
            cy_max[d] = 0.0;
            cy_avg[d] = 0.0;
        }
        for(int i = 0; i < N_gathers_per_dim; ++i) {
            for(int d = 0; d < gathered_dims; d++) {
                const double cy_d = (double)(cycles[i * 3 + d]);
                cy_min[d] = MIN(cy_min[d], cy_d);
                cy_max[d] = MAX(cy_max[d], cy_d);
                cy_avg[d] += cy_d;
            }
        }
        for(int d = 0; d < gathered_dims; d++) {
            char tmp_str[64];
            cy_avg[d] /= (double) N_gathers_per_dim;
            snprintf(tmp_str, sizeof tmp_str, "%4.4f/%4.4f/%4.4f", cy_min[d], cy_max[d], cy_avg[d]);
            printf("%27s%c", tmp_str, (d < gathered_dims - 1) ? ',' : ' ');
        }
#endif
        printf("\n");
        free(a);
        free(idx);
#ifdef TEST
        free(t);
#endif
#ifdef MEASURE_GATHER_CYCLES
        free(cycles);
#endif
        MEM_TRACER_END;
    }

    LIKWID_MARKER_CLOSE;
    return EXIT_SUCCESS;
}

View File

@@ -1,166 +0,0 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* =======================================================================================
*/
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <limits.h>
#include <float.h>
//---
#include <likwid-marker.h>
//---
#include <timing.h>
#include <allocate.h>
#if !defined(ISA_avx2) && !defined (ISA_avx512)
#error "Invalid ISA macro, possible values are: avx2 and avx512"
#endif
#define HLINE "----------------------------------------------------------------------------\n"
#ifndef MIN
#define MIN(x,y) ((x)<(y)?(x):(y))
#endif
#ifndef MAX
#define MAX(x,y) ((x)>(y)?(x):(y))
#endif
#ifndef ABS
#define ABS(a) ((a) >= 0 ? (a) : -(a))
#endif
#define ARRAY_ALIGNMENT 64
#define SIZE 20000
#ifdef ISA_avx512
#define _VL_ 8
#define ISA_STRING "avx512"
#else
#define _VL_ 4
#define ISA_STRING "avx2"
#endif
#ifdef TEST
extern void gather(double*, int*, int, double*);
#else
extern void gather(double*, int*, int);
#endif
/*
 * Gather micro-benchmark driver.
 *
 * Usage: <prog> <stride (elems)> <freq (GHz)> [cache line size (B)]
 *
 * Sweeps the working-set size N geometrically (factor 1.5) and for each N
 * times repeated calls to the external gather() kernel, reporting cycles
 * per gather instruction and per gathered element. Returns -1 on bad
 * arguments, EXIT_FAILURE if the TEST verification fails, EXIT_SUCCESS
 * otherwise.
 */
int main (int argc, char** argv) {
    LIKWID_MARKER_INIT;
    LIKWID_MARKER_REGISTER("gather");

    if (argc < 3) {
        printf("Please provide stride and frequency\n");
        printf("%s <stride> <freq (GHz)> [cache line size (B)]\n", argv[0]);
        return -1;
    }

    int stride = atoi(argv[1]);
    double freq = atof(argv[2]);
    /* Optional third argument overrides the assumed 64 B cache line. */
    int cl_size = (argc == 3) ? 64 : atoi(argv[3]);

    /* Distinct cache lines touched by one vector gather: stride spreads the
     * _VL_ lanes over the array, clamped to [1, _VL_]. */
    size_t cacheLinesPerGather = MIN(MAX(stride * _VL_ / (cl_size / sizeof(double)), 1), _VL_);
    double E, S;

    printf("ISA,Stride (elems),Frequency (GHz),Cache Line Size (B),Vector Width (elems),Cache Lines/Gather\n");
    /* %zu: cacheLinesPerGather is size_t (was %lu, which is wrong on LLP64). */
    printf("%s,%d,%f,%d,%d,%zu\n\n", ISA_STRING, stride, freq, cl_size, _VL_, cacheLinesPerGather);
    printf("%14s,%14s,%14s,%14s,%14s,%14s\n", "N", "Size(kB)", "tot. time", "time/LUP(ms)", "cy/gather", "cy/elem");
    freq = freq * 1e9;          /* GHz -> Hz for cycle conversions below */

    for(int N = 1024; N < 400000; N = 1.5 * N) {
        /* Over-allocate 2x so idx can be filled for the full allocation. */
        int N_alloc = N * 2;
        double* a = (double*) allocate( ARRAY_ALIGNMENT, N_alloc * sizeof(double) );
        int* idx = (int*) allocate( ARRAY_ALIGNMENT, N_alloc * sizeof(int) );
        int rep;
        double time;
#ifdef TEST
        double* t = (double*) allocate( ARRAY_ALIGNMENT, N_alloc * sizeof(double) );
#endif

        for(int i = 0; i < N_alloc; ++i) {
            a[i] = i;
            idx[i] = (i * stride) % N;    /* gather pattern: strided, wrapped */
        }

        /* Warm-up and calibration: scale rep so the timed loop runs ~0.5 s. */
        S = getTimeStamp();
        for(int r = 0; r < 100; ++r) {
#ifdef TEST
            gather(a, idx, N, t);
#else
            gather(a, idx, N);
#endif
        }
        E = getTimeStamp();
        rep = 100 * (0.5 / (E - S));
        if (rep < 1) {
            rep = 1;    /* guard: a very slow warm-up must not zero the loop */
        }

        S = getTimeStamp();
        LIKWID_MARKER_START("gather");
        for(int r = 0; r < rep; ++r) {
#ifdef TEST
            gather(a, idx, N, t);
#else
            gather(a, idx, N);
#endif
        }
        LIKWID_MARKER_STOP("gather");
        E = getTimeStamp();
        time = E - S;

#ifdef TEST
        /* Verify: a[i] == i, so t[i] must equal idx[i] == (i*stride) % N. */
        int test_failed = 0;
        for(int i = 0; i < N; ++i) {
            if(t[i] != i * stride % N) {
                test_failed = 1;
                break;
            }
        }
        if(test_failed) {
            printf("Test failed!\n");
            return EXIT_FAILURE;
        } else {
            printf("Test passed!\n");
        }
#endif

        const double size = N * (sizeof(double) + sizeof(int)) / 1000.0;
        /* NOTE(review): header says "time/LUP(ms)" but this is microseconds
         * per element (time * 1e6 / work) — confirm intended unit. */
        const double time_per_it = time * 1e6 / ((double) N * rep);
        const double cy_per_gather = time * freq * _VL_ / ((double) N * rep);
        const double cy_per_elem = time * freq / ((double) N * rep);
        printf("%14d,%14.2f,%14.10f,%14.10f,%14.6f,%14.6f\n", N, size, time, time_per_it, cy_per_gather, cy_per_elem);

        free(a);
        free(idx);
#ifdef TEST
        free(t);
#endif
    }

    LIKWID_MARKER_CLOSE;
    return EXIT_SUCCESS;
}

View File

@@ -1,47 +0,0 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* =======================================================================================
*/
#include <stdlib.h>
#include <time.h>
/* Return the current monotonic clock reading as seconds (double). */
double getTimeStamp()
{
    struct timespec now;

    clock_gettime(CLOCK_MONOTONIC, &now);
    /* seconds plus nanoseconds scaled to seconds */
    return 1.e-9 * (double)now.tv_nsec + (double)now.tv_sec;
}
/* Return the resolution of the monotonic clock in seconds (double). */
double getTimeResolution()
{
    struct timespec res;

    clock_getres(CLOCK_MONOTONIC, &res);
    return 1.e-9 * (double)res.tv_nsec + (double)res.tv_sec;
}
/* Underscore-suffixed alias of getTimeStamp() — presumably for callers
 * that expect Fortran-style name mangling; TODO confirm. */
double getTimeStamp_()
{
    double stamp = getTimeStamp();
    return stamp;
}

View File

@@ -1,28 +0,0 @@
import sys
import re

# Emit xmgrace-formatted horizontal lines, one per tool prediction
# (IACA, LLVM-MCA, OSACA, uiCA), each scaled by a common divisor.
# Indentation reconstructed from syntax; logic unchanged.
if len(sys.argv) != 6:
    print("Usage: python preds.py <iaca> <mca> <osaca> <uica> <div_factor>")
    sys.exit(1)

iaca_pred = float(sys.argv[1])
mca_pred = float(sys.argv[2])
osaca_pred = float(sys.argv[3])
uica_pred = float(sys.argv[4])
div_factor = float(sys.argv[5])

preds = [x / div_factor for x in [iaca_pred, mca_pred, osaca_pred, uica_pred]]

# Horizontal line drawn as npoints samples across [start, end].
start = -4.0
end = 36.0
npoints = 50
offset = (end - start) / (npoints - 1)

for series, pred in enumerate(preds):
    # Prediction series start at grace set index 6 (sets 0..5 hold data).
    print(f"@target G0.S{series + 6}")
    print(f"@type xy")
    for step in range(npoints):
        pos = start + offset * step
        print("{:.6f} {}".format(pos, pred))
    print("&")

View File

@@ -1,34 +0,0 @@
import sys
import re

# Convert lines of decimal measurements into xmgrace bar-chart sets,
# scaling every value by a common divisor.
# Indentation reconstructed from syntax; logic unchanged.
if len(sys.argv) != 3:
    print("Usage: python string_to_agr.py <input_filename> <div_factor>")
    sys.exit(1)

input_filename = sys.argv[1]
div_factor = float(sys.argv[2])

# One list per input line, holding that line's scaled measurements.
result_list = []
with open(input_filename, 'r') as file:
    for line in file:
        numbers = re.findall(r'\d+\.\d+', line)
        result_list.append([float(number) / div_factor for number in numbers])

# Bar placement parameters (grace x-axis positions).
start = -2.5
bar_offset = 1.0
group_offset = 8.0

for set_idx, group in enumerate(result_list):
    print(f"@target G0.S{set_idx}")
    print(f"@type bar")
    for meas_idx, meas in enumerate(group):
        pos = start + set_idx * bar_offset + meas_idx * group_offset
        print(f"{pos} {meas}")
    print("&")