Compare commits
	
		
			33 Commits
		
	
	
		
			gromacs_ma
			...
			mucosim23
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 
						 | 
					a6a269703d | ||
| 
						 | 
					7ee250161a | ||
| 
						 | 
					c73efea786 | ||
| 
						 | 
					4cfa664533 | ||
| 
						 | 
					1837403326 | ||
| 
						 | 
					02629612a9 | ||
| 
						 | 
					ce00aa0042 | ||
| 
						 | 
					c4e5e87265 | ||
| 
						 | 
					da3b1dd53f | ||
| 
						 | 
					2f13291817 | ||
| 
						 | 
					a460fffa19 | ||
| 19209bdcce | |||
| 
						 | 
					151f0c0e6f | ||
| 
						 | 
					72f486f9bf | ||
| 
						 | 
					8253b31ee0 | ||
| 
						 | 
					e206c3566d | ||
| 
						 | 
					7ff1673399 | ||
| 
						 | 
					b6982d56f5 | ||
| 
						 | 
					1ad981a059 | ||
| 
						 | 
					c438fc6832 | ||
| 
						 | 
					17e239ed6d | ||
| 
						 | 
					d151b9b3e4 | ||
| 
						 | 
					98257b746c | ||
| 
						 | 
					a101f8588a | ||
| 
						 | 
					c14a6b2186 | ||
| 
						 | 
					300776f512 | ||
| 
						 | 
					4e5fe27c0f | ||
| 
						 | 
					989bec2c7d | ||
| 
						 | 
					2971ddcc63 | ||
| 
						 | 
					5341938b60 | ||
| 
						 | 
					039de0be99 | ||
| 
						 | 
					43259eb3cf | ||
| 
						 | 
					3eb7170a65 | 
							
								
								
									
										6
									
								
								Makefile
									
									
									
									
									
								
							
							
						
						
									
										6
									
								
								Makefile
									
									
									
									
									
								
							@@ -30,6 +30,10 @@ ifneq ($(ASM_SYNTAX), ATT)
 | 
			
		||||
    ASFLAGS += -masm=intel
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
ifeq ($(strip $(SORT_ATOMS)),true)
 | 
			
		||||
    DEFINES += -DSORT_ATOMS
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
ifeq ($(strip $(EXPLICIT_TYPES)),true)
 | 
			
		||||
    DEFINES += -DEXPLICIT_TYPES
 | 
			
		||||
endif
 | 
			
		||||
@@ -152,7 +156,7 @@ $(BUILD_DIR)/%.o:  %.s
 | 
			
		||||
clean:
 | 
			
		||||
	$(info ===>  CLEAN)
 | 
			
		||||
	@rm -rf $(BUILD_DIR)
 | 
			
		||||
	@rm -rf MDBench-$(IDENTIFIER)
 | 
			
		||||
	@rm -rf $(TARGET)*
 | 
			
		||||
	@rm -f tags
 | 
			
		||||
 | 
			
		||||
cleanall:
 | 
			
		||||
 
 | 
			
		||||
@@ -1,626 +0,0 @@
 | 
			
		||||
# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.0.5.281 Build 20190815";
 | 
			
		||||
# mark_description "-I/mnt/opt/likwid-5.2-dev/include -I./src/includes -S -D_GNU_SOURCE -DLIKWID_PERFMON -DAOS -DPRECISION=2 -DN";
 | 
			
		||||
# mark_description "EIGHBORS_LOOP_RUNS=1 -DVECTOR_WIDTH=8 -DALIGNMENT=64 -restrict -Ofast -xCORE-AVX512 -qopt-zmm-usage=high -o ";
 | 
			
		||||
# mark_description "ICC/force.s";
 | 
			
		||||
	.file "force.c"
 | 
			
		||||
	.text
 | 
			
		||||
..TXTST0:
 | 
			
		||||
.L_2__routine_start_computeForce_0:
 | 
			
		||||
# -- Begin  computeForce
 | 
			
		||||
	.text
 | 
			
		||||
# mark_begin;
 | 
			
		||||
       .align    16,0x90
 | 
			
		||||
	.globl computeForce
 | 
			
		||||
# --- computeForce(Parameter *, Atom *, Neighbor *, int, int, int)
 | 
			
		||||
computeForce:
 | 
			
		||||
# parameter 1: %rdi
 | 
			
		||||
# parameter 2: %rsi
 | 
			
		||||
# parameter 3: %rdx
 | 
			
		||||
# parameter 4: %ecx
 | 
			
		||||
# parameter 5: %r8d
 | 
			
		||||
# parameter 6: %r9d
 | 
			
		||||
..B1.1:                         # Preds ..B1.0
 | 
			
		||||
                                # Execution count [1.00e+00]
 | 
			
		||||
	.cfi_startproc
 | 
			
		||||
..___tag_value_computeForce.1:
 | 
			
		||||
..L2:
 | 
			
		||||
                                                          #121.112
 | 
			
		||||
        pushq     %rbp                                          #121.112
 | 
			
		||||
	.cfi_def_cfa_offset 16
 | 
			
		||||
        movq      %rsp, %rbp                                    #121.112
 | 
			
		||||
	.cfi_def_cfa 6, 16
 | 
			
		||||
	.cfi_offset 6, -16
 | 
			
		||||
        andq      $-64, %rsp                                    #121.112
 | 
			
		||||
        pushq     %r12                                          #121.112
 | 
			
		||||
        pushq     %r13                                          #121.112
 | 
			
		||||
        pushq     %r14                                          #121.112
 | 
			
		||||
        pushq     %r15                                          #121.112
 | 
			
		||||
        pushq     %rbx                                          #121.112
 | 
			
		||||
        subq      $88, %rsp                                     #121.112
 | 
			
		||||
        xorl      %eax, %eax                                    #124.16
 | 
			
		||||
	.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22
 | 
			
		||||
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
 | 
			
		||||
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
 | 
			
		||||
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
 | 
			
		||||
	.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22
 | 
			
		||||
        movq      %rdx, %r15                                    #121.112
 | 
			
		||||
        movq      %rsi, %r12                                    #121.112
 | 
			
		||||
        movq      %rdi, %rbx                                    #121.112
 | 
			
		||||
..___tag_value_computeForce.11:
 | 
			
		||||
#       getTimeStamp()
 | 
			
		||||
        call      getTimeStamp                                  #124.16
 | 
			
		||||
..___tag_value_computeForce.12:
 | 
			
		||||
                                # LOE rbx r12 r15 xmm0
 | 
			
		||||
..B1.51:                        # Preds ..B1.1
 | 
			
		||||
                                # Execution count [1.00e+00]
 | 
			
		||||
        vmovsd    %xmm0, 24(%rsp)                               #124.16[spill]
 | 
			
		||||
                                # LOE rbx r12 r15
 | 
			
		||||
..B1.2:                         # Preds ..B1.51
 | 
			
		||||
                                # Execution count [1.00e+00]
 | 
			
		||||
        movl      4(%r12), %r13d                                #125.18
 | 
			
		||||
        movq      64(%r12), %r9                                 #127.20
 | 
			
		||||
        movq      72(%r12), %r14                                #127.45
 | 
			
		||||
        movq      80(%r12), %r8                                 #127.70
 | 
			
		||||
        vmovsd    72(%rbx), %xmm2                               #129.27
 | 
			
		||||
        vmovsd    8(%rbx), %xmm1                                #130.23
 | 
			
		||||
        vmovsd    (%rbx), %xmm0                                 #131.24
 | 
			
		||||
        testl     %r13d, %r13d                                  #134.24
 | 
			
		||||
        jle       ..B1.43       # Prob 50%                      #134.24
 | 
			
		||||
                                # LOE r8 r9 r12 r14 r15 r13d xmm0 xmm1 xmm2
 | 
			
		||||
..B1.3:                         # Preds ..B1.2
 | 
			
		||||
                                # Execution count [1.00e+00]
 | 
			
		||||
        xorl      %ebx, %ebx                                    #134.5
 | 
			
		||||
        movl      %r13d, %edx                                   #134.5
 | 
			
		||||
        xorl      %ecx, %ecx                                    #134.5
 | 
			
		||||
        movl      $1, %esi                                      #134.5
 | 
			
		||||
        xorl      %eax, %eax                                    #135.17
 | 
			
		||||
        shrl      $1, %edx                                      #134.5
 | 
			
		||||
        je        ..B1.7        # Prob 9%                       #134.5
 | 
			
		||||
                                # LOE rax rdx rcx rbx r8 r9 r12 r14 r15 esi r13d xmm0 xmm1 xmm2
 | 
			
		||||
..B1.5:                         # Preds ..B1.3 ..B1.5
 | 
			
		||||
                                # Execution count [2.50e+00]
 | 
			
		||||
        movq      %rax, (%rcx,%r9)                              #135.9
 | 
			
		||||
        incq      %rbx                                          #134.5
 | 
			
		||||
        movq      %rax, (%rcx,%r14)                             #136.9
 | 
			
		||||
        movq      %rax, (%rcx,%r8)                              #137.9
 | 
			
		||||
        movq      %rax, 8(%rcx,%r9)                             #135.9
 | 
			
		||||
        movq      %rax, 8(%rcx,%r14)                            #136.9
 | 
			
		||||
        movq      %rax, 8(%rcx,%r8)                             #137.9
 | 
			
		||||
        addq      $16, %rcx                                     #134.5
 | 
			
		||||
        cmpq      %rdx, %rbx                                    #134.5
 | 
			
		||||
        jb        ..B1.5        # Prob 63%                      #134.5
 | 
			
		||||
                                # LOE rax rdx rcx rbx r8 r9 r12 r14 r15 r13d xmm0 xmm1 xmm2
 | 
			
		||||
..B1.6:                         # Preds ..B1.5
 | 
			
		||||
                                # Execution count [9.00e-01]
 | 
			
		||||
        lea       1(%rbx,%rbx), %esi                            #135.9
 | 
			
		||||
                                # LOE rax r8 r9 r12 r14 r15 esi r13d xmm0 xmm1 xmm2
 | 
			
		||||
..B1.7:                         # Preds ..B1.3 ..B1.6
 | 
			
		||||
                                # Execution count [1.00e+00]
 | 
			
		||||
        lea       -1(%rsi), %edx                                #134.5
 | 
			
		||||
        cmpl      %r13d, %edx                                   #134.5
 | 
			
		||||
        jae       ..B1.9        # Prob 9%                       #134.5
 | 
			
		||||
                                # LOE rax r8 r9 r12 r14 r15 esi r13d xmm0 xmm1 xmm2
 | 
			
		||||
..B1.8:                         # Preds ..B1.7
 | 
			
		||||
                                # Execution count [9.00e-01]
 | 
			
		||||
        movslq    %esi, %rsi                                    #134.5
 | 
			
		||||
        movq      %rax, -8(%r9,%rsi,8)                          #135.9
 | 
			
		||||
        movq      %rax, -8(%r14,%rsi,8)                         #136.9
 | 
			
		||||
        movq      %rax, -8(%r8,%rsi,8)                          #137.9
 | 
			
		||||
                                # LOE r8 r9 r12 r14 r15 r13d xmm0 xmm1 xmm2
 | 
			
		||||
..B1.9:                         # Preds ..B1.7 ..B1.8
 | 
			
		||||
                                # Execution count [5.00e-01]
 | 
			
		||||
        movl      $.L_2__STRING.0, %edi                         #141.5
 | 
			
		||||
        movq      %r8, 32(%rsp)                                 #141.5[spill]
 | 
			
		||||
        movq      %r9, 80(%rsp)                                 #141.5[spill]
 | 
			
		||||
        vmovsd    %xmm2, (%rsp)                                 #141.5[spill]
 | 
			
		||||
        vmovsd    %xmm1, 8(%rsp)                                #141.5[spill]
 | 
			
		||||
        vmovsd    %xmm0, 16(%rsp)                               #141.5[spill]
 | 
			
		||||
..___tag_value_computeForce.18:
 | 
			
		||||
#       likwid_markerStartRegion(const char *)
 | 
			
		||||
        call      likwid_markerStartRegion                      #141.5
 | 
			
		||||
..___tag_value_computeForce.19:
 | 
			
		||||
                                # LOE r12 r14 r15 r13d
 | 
			
		||||
..B1.10:                        # Preds ..B1.9
 | 
			
		||||
                                # Execution count [9.00e-01]
 | 
			
		||||
        vmovsd    16(%rsp), %xmm0                               #[spill]
 | 
			
		||||
        xorl      %esi, %esi                                    #143.15
 | 
			
		||||
        vmovsd    (%rsp), %xmm2                                 #[spill]
 | 
			
		||||
        xorl      %eax, %eax                                    #143.5
 | 
			
		||||
        vmulsd    %xmm2, %xmm2, %xmm13                          #129.45
 | 
			
		||||
        xorl      %edi, %edi                                    #143.5
 | 
			
		||||
        vmovdqu32 .L_2il0floatpacket.0(%rip), %ymm16            #173.13
 | 
			
		||||
        vmulsd    .L_2il0floatpacket.3(%rip), %xmm0, %xmm0      #197.45
 | 
			
		||||
        vmovdqu   .L_2il0floatpacket.1(%rip), %ymm15            #173.13
 | 
			
		||||
        vmovups   .L_2il0floatpacket.4(%rip), %zmm5             #197.58
 | 
			
		||||
        vmovsd    8(%rsp), %xmm1                                #[spill]
 | 
			
		||||
        vbroadcastsd %xmm13, %zmm14                             #129.25
 | 
			
		||||
        vbroadcastsd %xmm1, %zmm13                              #130.21
 | 
			
		||||
        vbroadcastsd %xmm0, %zmm9                               #197.45
 | 
			
		||||
        movslq    %r13d, %r13                                   #143.5
 | 
			
		||||
        movq      24(%r15), %r10                                #145.25
 | 
			
		||||
        movslq    16(%r15), %rdx                                #144.43
 | 
			
		||||
        movq      8(%r15), %rcx                                 #144.19
 | 
			
		||||
        movq      32(%rsp), %r8                                 #[spill]
 | 
			
		||||
        movq      16(%r12), %rbx                                #146.25
 | 
			
		||||
        shlq      $2, %rdx                                      #126.5
 | 
			
		||||
        movq      %r13, 64(%rsp)                                #143.5[spill]
 | 
			
		||||
        movq      %r10, 72(%rsp)                                #143.5[spill]
 | 
			
		||||
                                # LOE rax rdx rcx rbx rsi rdi r8 r14 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
 | 
			
		||||
..B1.11:                        # Preds ..B1.41 ..B1.10
 | 
			
		||||
                                # Execution count [5.00e+00]
 | 
			
		||||
        movq      72(%rsp), %r9                                 #145.25[spill]
 | 
			
		||||
        vxorpd    %xmm24, %xmm24, %xmm24                        #149.22
 | 
			
		||||
        vmovapd   %xmm24, %xmm18                                #150.22
 | 
			
		||||
        movl      (%r9,%rax,4), %r10d                           #145.25
 | 
			
		||||
        vmovapd   %xmm18, %xmm4                                 #151.22
 | 
			
		||||
        vmovsd    (%rdi,%rbx), %xmm10                           #146.25
 | 
			
		||||
        vmovsd    8(%rdi,%rbx), %xmm6                           #147.25
 | 
			
		||||
        vmovsd    16(%rdi,%rbx), %xmm12                         #148.25
 | 
			
		||||
        testl     %r10d, %r10d                                  #173.32
 | 
			
		||||
        jle       ..B1.41       # Prob 50%                      #173.32
 | 
			
		||||
                                # LOE rax rdx rcx rbx rsi rdi r8 r14 r10d xmm4 xmm6 xmm10 xmm12 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
 | 
			
		||||
..B1.12:                        # Preds ..B1.11
 | 
			
		||||
                                # Execution count [4.50e+00]
 | 
			
		||||
        vpxord    %zmm8, %zmm8, %zmm8                           #149.22
 | 
			
		||||
        vmovaps   %zmm8, %zmm7                                  #150.22
 | 
			
		||||
        vmovaps   %zmm7, %zmm11                                 #151.22
 | 
			
		||||
        cmpl      $8, %r10d                                     #173.13
 | 
			
		||||
        jl        ..B1.48       # Prob 10%                      #173.13
 | 
			
		||||
                                # LOE rax rdx rcx rbx rsi rdi r8 r14 r10d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.13:                        # Preds ..B1.12
 | 
			
		||||
                                # Execution count [4.50e+00]
 | 
			
		||||
        cmpl      $1200, %r10d                                  #173.13
 | 
			
		||||
        jl        ..B1.47       # Prob 10%                      #173.13
 | 
			
		||||
                                # LOE rax rdx rcx rbx rsi rdi r8 r14 r10d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.14:                        # Preds ..B1.13
 | 
			
		||||
                                # Execution count [4.50e+00]
 | 
			
		||||
        movq      %rdx, %r15                                    #144.43
 | 
			
		||||
        imulq     %rsi, %r15                                    #144.43
 | 
			
		||||
        addq      %rcx, %r15                                    #126.5
 | 
			
		||||
        movq      %r15, %r11                                    #173.13
 | 
			
		||||
        andq      $63, %r11                                     #173.13
 | 
			
		||||
        testl     $3, %r11d                                     #173.13
 | 
			
		||||
        je        ..B1.16       # Prob 50%                      #173.13
 | 
			
		||||
                                # LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.15:                        # Preds ..B1.14
 | 
			
		||||
                                # Execution count [2.25e+00]
 | 
			
		||||
        xorl      %r11d, %r11d                                  #173.13
 | 
			
		||||
        jmp       ..B1.18       # Prob 100%                     #173.13
 | 
			
		||||
                                # LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.16:                        # Preds ..B1.14
 | 
			
		||||
                                # Execution count [2.25e+00]
 | 
			
		||||
        testl     %r11d, %r11d                                  #173.13
 | 
			
		||||
        je        ..B1.18       # Prob 50%                      #173.13
 | 
			
		||||
                                # LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.17:                        # Preds ..B1.16
 | 
			
		||||
                                # Execution count [2.50e+01]
 | 
			
		||||
        negl      %r11d                                         #173.13
 | 
			
		||||
        addl      $64, %r11d                                    #173.13
 | 
			
		||||
        shrl      $2, %r11d                                     #173.13
 | 
			
		||||
        cmpl      %r11d, %r10d                                  #173.13
 | 
			
		||||
        cmovl     %r10d, %r11d                                  #173.13
 | 
			
		||||
                                # LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.18:                        # Preds ..B1.15 ..B1.17 ..B1.16
 | 
			
		||||
                                # Execution count [5.00e+00]
 | 
			
		||||
        movl      %r10d, %r13d                                  #173.13
 | 
			
		||||
        subl      %r11d, %r13d                                  #173.13
 | 
			
		||||
        andl      $7, %r13d                                     #173.13
 | 
			
		||||
        negl      %r13d                                         #173.13
 | 
			
		||||
        addl      %r10d, %r13d                                  #173.13
 | 
			
		||||
        cmpl      $1, %r11d                                     #173.13
 | 
			
		||||
        jb        ..B1.26       # Prob 50%                      #173.13
 | 
			
		||||
                                # LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.19:                        # Preds ..B1.18
 | 
			
		||||
                                # Execution count [4.50e+00]
 | 
			
		||||
        vmovdqa   %ymm15, %ymm4                                 #173.13
 | 
			
		||||
        xorl      %r12d, %r12d                                  #173.13
 | 
			
		||||
        vpbroadcastd %r11d, %ymm3                               #173.13
 | 
			
		||||
        vbroadcastsd %xmm10, %zmm2                              #146.23
 | 
			
		||||
        vbroadcastsd %xmm6, %zmm1                               #147.23
 | 
			
		||||
        vbroadcastsd %xmm12, %zmm0                              #148.23
 | 
			
		||||
        movslq    %r11d, %r9                                    #173.13
 | 
			
		||||
        movq      %r8, 32(%rsp)                                 #173.13[spill]
 | 
			
		||||
        movq      %r14, (%rsp)                                  #173.13[spill]
 | 
			
		||||
                                # LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.20:                        # Preds ..B1.24 ..B1.19
 | 
			
		||||
                                # Execution count [2.50e+01]
 | 
			
		||||
        vpcmpgtd  %ymm4, %ymm3, %k3                             #173.13
 | 
			
		||||
        vmovdqu32 (%r15,%r12,4), %ymm17{%k3}{z}                 #174.25
 | 
			
		||||
        kmovw     %k3, %r14d                                    #173.13
 | 
			
		||||
        vpaddd    %ymm17, %ymm17, %ymm18                        #175.40
 | 
			
		||||
        vpaddd    %ymm18, %ymm17, %ymm17                        #175.40
 | 
			
		||||
                                # LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 ymm17 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
 | 
			
		||||
..B1.23:                        # Preds ..B1.20
 | 
			
		||||
                                # Execution count [1.25e+01]
 | 
			
		||||
        kmovw     %k3, %k1                                      #175.40
 | 
			
		||||
        kmovw     %k3, %k2                                      #175.40
 | 
			
		||||
        vpxord    %zmm18, %zmm18, %zmm18                        #175.40
 | 
			
		||||
        vpxord    %zmm19, %zmm19, %zmm19                        #175.40
 | 
			
		||||
        vpxord    %zmm20, %zmm20, %zmm20                        #175.40
 | 
			
		||||
        vgatherdpd 16(%rbx,%ymm17,8), %zmm18{%k1}               #175.40
 | 
			
		||||
        vgatherdpd 8(%rbx,%ymm17,8), %zmm19{%k2}                #175.40
 | 
			
		||||
        vgatherdpd (%rbx,%ymm17,8), %zmm20{%k3}                 #175.40
 | 
			
		||||
                                # LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm18 zmm19 zmm20
 | 
			
		||||
..B1.24:                        # Preds ..B1.23
 | 
			
		||||
                                # Execution count [2.50e+01]
 | 
			
		||||
        addq      $8, %r12                                      #173.13
 | 
			
		||||
        #vpaddd    %ymm16, %ymm4, %ymm4                          #173.13
 | 
			
		||||
        #vsubpd    %zmm18, %zmm0, %zmm29                         #177.40
 | 
			
		||||
        #vsubpd    %zmm19, %zmm1, %zmm27                         #176.40
 | 
			
		||||
        #vsubpd    %zmm20, %zmm2, %zmm26                         #175.40
 | 
			
		||||
        #vmulpd    %zmm27, %zmm27, %zmm25                        #178.53
 | 
			
		||||
        #vfmadd231pd %zmm26, %zmm26, %zmm25                      #178.53
 | 
			
		||||
        #vfmadd231pd %zmm29, %zmm29, %zmm25                      #178.67
 | 
			
		||||
        #vrcp14pd  %zmm25, %zmm24                                #195.42
 | 
			
		||||
        #vcmppd    $1, %zmm14, %zmm25, %k2                       #194.26
 | 
			
		||||
        #vfpclasspd $30, %zmm24, %k0                             #195.42
 | 
			
		||||
        #kmovw     %k2, %r8d                                     #194.26
 | 
			
		||||
        #knotw     %k0, %k1                                      #195.42
 | 
			
		||||
        #vmovaps   %zmm25, %zmm17                                #195.42
 | 
			
		||||
        #andl      %r8d, %r14d                                   #194.26
 | 
			
		||||
        #vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm24, %zmm17 #195.42
 | 
			
		||||
        #kmovw     %r14d, %k3                                    #198.21
 | 
			
		||||
        #vmulpd    %zmm17, %zmm17, %zmm18                        #195.42
 | 
			
		||||
        #vfmadd213pd %zmm24, %zmm17, %zmm24{%k1}                 #195.42
 | 
			
		||||
        #vfmadd213pd %zmm24, %zmm18, %zmm24{%k1}                 #195.42
 | 
			
		||||
        #vmulpd    %zmm13, %zmm24, %zmm19                        #196.42
 | 
			
		||||
        #vmulpd    %zmm9, %zmm24, %zmm21                         #197.58
 | 
			
		||||
        #vmulpd    %zmm19, %zmm24, %zmm22                        #196.48
 | 
			
		||||
        #vmulpd    %zmm22, %zmm24, %zmm20                        #196.54
 | 
			
		||||
        #vfmsub213pd %zmm5, %zmm22, %zmm24                       #197.58
 | 
			
		||||
        #vmulpd    %zmm21, %zmm20, %zmm23                        #197.65
 | 
			
		||||
        #vmulpd    %zmm24, %zmm23, %zmm28                        #197.71
 | 
			
		||||
        #vfmadd231pd %zmm26, %zmm28, %zmm8{%k3}                  #198.21
 | 
			
		||||
        #vfmadd231pd %zmm27, %zmm28, %zmm7{%k3}                  #199.21
 | 
			
		||||
        #vfmadd231pd %zmm29, %zmm28, %zmm11{%k3}                 #200.21
 | 
			
		||||
        cmpq      %r9, %r12                                     #173.13
 | 
			
		||||
        jb        ..B1.20       # Prob 82%                      #173.13
 | 
			
		||||
                                # LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.25:                        # Preds ..B1.24
 | 
			
		||||
                                # Execution count [4.50e+00]
 | 
			
		||||
        movq      32(%rsp), %r8                                 #[spill]
 | 
			
		||||
        movq      (%rsp), %r14                                  #[spill]
 | 
			
		||||
        cmpl      %r11d, %r10d                                  #173.13
 | 
			
		||||
        je        ..B1.40       # Prob 10%                      #173.13
 | 
			
		||||
                                # LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.26:                        # Preds ..B1.25 ..B1.18 ..B1.47
 | 
			
		||||
                                # Execution count [2.50e+01]
 | 
			
		||||
        lea       8(%r11), %r9d                                 #173.13
 | 
			
		||||
        cmpl      %r9d, %r13d                                   #173.13
 | 
			
		||||
        jl        ..B1.34       # Prob 50%                      #173.13
 | 
			
		||||
                                # LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.27:                        # Preds ..B1.26
 | 
			
		||||
                                # Execution count [4.50e+00]
 | 
			
		||||
        movq      %rdx, %r12                                    #144.43
 | 
			
		||||
        imulq     %rsi, %r12                                    #144.43
 | 
			
		||||
        vbroadcastsd %xmm10, %zmm1                              #146.23
 | 
			
		||||
        vbroadcastsd %xmm6, %zmm0                               #147.23
 | 
			
		||||
        vbroadcastsd %xmm12, %zmm2                              #148.23
 | 
			
		||||
        movslq    %r11d, %r9                                    #173.13
 | 
			
		||||
        addq      %rcx, %r12                                    #126.5
 | 
			
		||||
        movq      %rdi, 8(%rsp)                                 #126.5[spill]
 | 
			
		||||
        movq      %rdx, 16(%rsp)                                #126.5[spill]
 | 
			
		||||
        movq      %rcx, 40(%rsp)                                #126.5[spill]
 | 
			
		||||
        movq      %rax, 48(%rsp)                                #126.5[spill]
 | 
			
		||||
        movq      %rsi, 56(%rsp)                                #126.5[spill]
 | 
			
		||||
        movq      %r8, 32(%rsp)                                 #126.5[spill]
 | 
			
		||||
        movq      %r14, (%rsp)                                  #126.5[spill]
 | 
			
		||||
                                # LOE rbx r9 r12 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.28:                        # Preds ..B1.32 ..B1.27
 | 
			
		||||
                                # Execution count [2.50e+01]
 | 
			
		||||
        vmovdqu   (%r12,%r9,4), %ymm3                           #174.25
 | 
			
		||||
        vpaddd    %ymm3, %ymm3, %ymm4                           #175.40
 | 
			
		||||
        vpaddd    %ymm4, %ymm3, %ymm3                           #175.40
 | 
			
		||||
        movl      (%r12,%r9,4), %r14d                           #174.25
 | 
			
		||||
        movl      4(%r12,%r9,4), %r8d                           #174.25
 | 
			
		||||
        movl      8(%r12,%r9,4), %edi                           #174.25
 | 
			
		||||
        movl      12(%r12,%r9,4), %esi                          #174.25
 | 
			
		||||
        lea       (%r14,%r14,2), %r14d                          #175.40
 | 
			
		||||
        movl      16(%r12,%r9,4), %ecx                          #174.25
 | 
			
		||||
        lea       (%r8,%r8,2), %r8d                             #175.40
 | 
			
		||||
        movl      20(%r12,%r9,4), %edx                          #174.25
 | 
			
		||||
        lea       (%rdi,%rdi,2), %edi                           #175.40
 | 
			
		||||
        movl      24(%r12,%r9,4), %eax                          #174.25
 | 
			
		||||
        lea       (%rsi,%rsi,2), %esi                           #175.40
 | 
			
		||||
        movl      28(%r12,%r9,4), %r15d                         #174.25
 | 
			
		||||
        lea       (%rcx,%rcx,2), %ecx                           #175.40
 | 
			
		||||
        lea       (%rdx,%rdx,2), %edx                           #175.40
 | 
			
		||||
        lea       (%rax,%rax,2), %eax                           #175.40
 | 
			
		||||
        lea       (%r15,%r15,2), %r15d                          #175.40
 | 
			
		||||
                                # LOE rbx r9 r12 eax edx ecx esi edi r8d r10d r11d r13d r14d r15d xmm6 xmm10 xmm12 ymm3 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.31:                        # Preds ..B1.28
 | 
			
		||||
                                # Execution count [1.25e+01]
 | 
			
		||||
        vpcmpeqb  %xmm0, %xmm0, %k1                             #175.40
 | 
			
		||||
        vpcmpeqb  %xmm0, %xmm0, %k2                             #175.40
 | 
			
		||||
        vpcmpeqb  %xmm0, %xmm0, %k3                             #175.40
 | 
			
		||||
        vpxord    %zmm4, %zmm4, %zmm4                           #175.40
 | 
			
		||||
        vpxord    %zmm17, %zmm17, %zmm17                        #175.40
 | 
			
		||||
        vpxord    %zmm18, %zmm18, %zmm18                        #175.40
 | 
			
		||||
        vgatherdpd 16(%rbx,%ymm3,8), %zmm4{%k1}                 #175.40
 | 
			
		||||
        vgatherdpd 8(%rbx,%ymm3,8), %zmm17{%k2}                 #175.40
 | 
			
		||||
        vgatherdpd (%rbx,%ymm3,8), %zmm18{%k3}                  #175.40
 | 
			
		||||
                                # LOE rbx r9 r12 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm17 zmm18
 | 
			
		||||
..B1.32:                        # Preds ..B1.31
 | 
			
		||||
                                # Execution count [2.50e+01]
 | 
			
		||||
        addl      $8, %r11d                                     #173.13
 | 
			
		||||
        addq      $8, %r9                                       #173.13
 | 
			
		||||
        #vsubpd    %zmm4, %zmm2, %zmm26                          #177.40
 | 
			
		||||
        #vsubpd    %zmm17, %zmm0, %zmm24                         #176.40
 | 
			
		||||
        #vsubpd    %zmm18, %zmm1, %zmm23                         #175.40
 | 
			
		||||
        #vmulpd    %zmm24, %zmm24, %zmm3                         #178.53
 | 
			
		||||
        #vfmadd231pd %zmm23, %zmm23, %zmm3                       #178.53
 | 
			
		||||
        #vfmadd231pd %zmm26, %zmm26, %zmm3                       #178.67
 | 
			
		||||
        #vrcp14pd  %zmm3, %zmm22                                 #195.42
 | 
			
		||||
        #vcmppd    $1, %zmm14, %zmm3, %k2                        #194.26
 | 
			
		||||
        #vfpclasspd $30, %zmm22, %k0                             #195.42
 | 
			
		||||
        #vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm22, %zmm3 #195.42
 | 
			
		||||
        #knotw     %k0, %k1                                      #195.42
 | 
			
		||||
        #vmulpd    %zmm3, %zmm3, %zmm4                           #195.42
 | 
			
		||||
        #vfmadd213pd %zmm22, %zmm3, %zmm22{%k1}                  #195.42
 | 
			
		||||
        #vfmadd213pd %zmm22, %zmm4, %zmm22{%k1}                  #195.42
 | 
			
		||||
        #vmulpd    %zmm13, %zmm22, %zmm17                        #196.42
 | 
			
		||||
        #vmulpd    %zmm9, %zmm22, %zmm19                         #197.58
 | 
			
		||||
        #vmulpd    %zmm17, %zmm22, %zmm20                        #196.48
 | 
			
		||||
        #vmulpd    %zmm20, %zmm22, %zmm18                        #196.54
 | 
			
		||||
        #vfmsub213pd %zmm5, %zmm20, %zmm22                       #197.58
 | 
			
		||||
        #vmulpd    %zmm19, %zmm18, %zmm21                        #197.65
 | 
			
		||||
        #vmulpd    %zmm22, %zmm21, %zmm25                        #197.71
 | 
			
		||||
        #vfmadd231pd %zmm23, %zmm25, %zmm8{%k2}                  #198.21
 | 
			
		||||
        #vfmadd231pd %zmm24, %zmm25, %zmm7{%k2}                  #199.21
 | 
			
		||||
        #vfmadd231pd %zmm26, %zmm25, %zmm11{%k2}                 #200.21
 | 
			
		||||
        cmpl      %r13d, %r11d                                  #173.13
 | 
			
		||||
        jb        ..B1.28       # Prob 82%                      #173.13
 | 
			
		||||
                                # LOE rbx r9 r12 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.33:                        # Preds ..B1.32
 | 
			
		||||
                                # Execution count [4.50e+00]
 | 
			
		||||
        movq      8(%rsp), %rdi                                 #[spill]
 | 
			
		||||
        movq      16(%rsp), %rdx                                #[spill]
 | 
			
		||||
        movq      40(%rsp), %rcx                                #[spill]
 | 
			
		||||
        movq      48(%rsp), %rax                                #[spill]
 | 
			
		||||
        movq      56(%rsp), %rsi                                #[spill]
 | 
			
		||||
        movq      32(%rsp), %r8                                 #[spill]
 | 
			
		||||
        movq      (%rsp), %r14                                  #[spill]
 | 
			
		||||
                                # LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.34:                        # Preds ..B1.33 ..B1.26 ..B1.48
 | 
			
		||||
                                # Execution count [5.00e+00]
 | 
			
		||||
        lea       1(%r13), %r9d                                 #173.13
 | 
			
		||||
        cmpl      %r10d, %r9d                                   #173.13
 | 
			
		||||
        ja        ..B1.40       # Prob 50%                      #173.13
 | 
			
		||||
                                # LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.35:                        # Preds ..B1.34
 | 
			
		||||
                                # Execution count [2.50e+01]
 | 
			
		||||
        imulq     %rdx, %rsi                                    #144.43
 | 
			
		||||
        vbroadcastsd %xmm10, %zmm4                              #146.23
 | 
			
		||||
        subl      %r13d, %r10d                                  #173.13
 | 
			
		||||
        addq      %rcx, %rsi                                    #126.5
 | 
			
		||||
        vpbroadcastd %r10d, %ymm0                               #173.13
 | 
			
		||||
        vpcmpgtd  %ymm15, %ymm0, %k3                            #173.13
 | 
			
		||||
        movslq    %r13d, %r13                                   #173.13
 | 
			
		||||
        kmovw     %k3, %r9d                                     #173.13
 | 
			
		||||
        vmovdqu32 (%rsi,%r13,4), %ymm1{%k3}{z}                  #174.25
 | 
			
		||||
        vpaddd    %ymm1, %ymm1, %ymm2                           #175.40
 | 
			
		||||
        vpaddd    %ymm2, %ymm1, %ymm0                           #175.40
 | 
			
		||||
                                # LOE rax rdx rcx rbx rdi r8 r14 r9d xmm6 xmm12 ymm0 ymm15 ymm16 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
 | 
			
		||||
..B1.38:                        # Preds ..B1.35
 | 
			
		||||
                                # Execution count [1.25e+01]
 | 
			
		||||
        kmovw     %k3, %k1                                      #175.40
 | 
			
		||||
        kmovw     %k3, %k2                                      #175.40
 | 
			
		||||
        vpxord    %zmm1, %zmm1, %zmm1                           #175.40
 | 
			
		||||
        vpxord    %zmm2, %zmm2, %zmm2                           #175.40
 | 
			
		||||
        vpxord    %zmm3, %zmm3, %zmm3                           #175.40
 | 
			
		||||
        vgatherdpd 16(%rbx,%ymm0,8), %zmm1{%k1}                 #175.40
 | 
			
		||||
        vgatherdpd 8(%rbx,%ymm0,8), %zmm2{%k2}                  #175.40
 | 
			
		||||
        vgatherdpd (%rbx,%ymm0,8), %zmm3{%k3}                   #175.40
 | 
			
		||||
                                # LOE rax rdx rcx rbx rdi r8 r14 r9d xmm6 xmm12 ymm15 ymm16 zmm1 zmm2 zmm3 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.39:                        # Preds ..B1.38
 | 
			
		||||
                                # Execution count [2.50e+01]
 | 
			
		||||
        #vbroadcastsd %xmm6, %zmm6                               #147.23
 | 
			
		||||
        #vbroadcastsd %xmm12, %zmm12                             #148.23
 | 
			
		||||
        #vsubpd    %zmm1, %zmm12, %zmm23                         #177.40
 | 
			
		||||
        #vsubpd    %zmm2, %zmm6, %zmm21                          #176.40
 | 
			
		||||
        #vsubpd    %zmm3, %zmm4, %zmm20                          #175.40
 | 
			
		||||
        #vmulpd    %zmm21, %zmm21, %zmm19                        #178.53
 | 
			
		||||
        #vfmadd231pd %zmm20, %zmm20, %zmm19                      #178.53
 | 
			
		||||
        #vfmadd231pd %zmm23, %zmm23, %zmm19                      #178.67
 | 
			
		||||
        #vrcp14pd  %zmm19, %zmm18                                #195.42
 | 
			
		||||
        #vcmppd    $1, %zmm14, %zmm19, %k2                       #194.26
 | 
			
		||||
        #vfpclasspd $30, %zmm18, %k0                             #195.42
 | 
			
		||||
        #kmovw     %k2, %esi                                     #194.26
 | 
			
		||||
        #knotw     %k0, %k1                                      #195.42
 | 
			
		||||
        #vmovaps   %zmm19, %zmm0                                 #195.42
 | 
			
		||||
        #andl      %esi, %r9d                                    #194.26
 | 
			
		||||
        #vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm18, %zmm0 #195.42
 | 
			
		||||
        #kmovw     %r9d, %k3                                     #198.21
 | 
			
		||||
        #vmulpd    %zmm0, %zmm0, %zmm1                           #195.42
 | 
			
		||||
        #vfmadd213pd %zmm18, %zmm0, %zmm18{%k1}                  #195.42
 | 
			
		||||
        #vfmadd213pd %zmm18, %zmm1, %zmm18{%k1}                  #195.42
 | 
			
		||||
        #vmulpd    %zmm13, %zmm18, %zmm2                         #196.42
 | 
			
		||||
        #vmulpd    %zmm9, %zmm18, %zmm4                          #197.58
 | 
			
		||||
        #vmulpd    %zmm2, %zmm18, %zmm10                         #196.48
 | 
			
		||||
        #vmulpd    %zmm10, %zmm18, %zmm3                         #196.54
 | 
			
		||||
        #vfmsub213pd %zmm5, %zmm10, %zmm18                       #197.58
 | 
			
		||||
        #vmulpd    %zmm4, %zmm3, %zmm17                          #197.65
 | 
			
		||||
        #vmulpd    %zmm18, %zmm17, %zmm22                        #197.71
 | 
			
		||||
        #vfmadd231pd %zmm20, %zmm22, %zmm8{%k3}                  #198.21
 | 
			
		||||
        #vfmadd231pd %zmm21, %zmm22, %zmm7{%k3}                  #199.21
 | 
			
		||||
        #vfmadd231pd %zmm23, %zmm22, %zmm11{%k3}                 #200.21
 | 
			
		||||
                                # LOE rax rdx rcx rbx rdi r8 r14 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.40:                        # Preds ..B1.25 ..B1.39 ..B1.34
 | 
			
		||||
                                # Execution count [4.50e+00]
 | 
			
		||||
        vmovups   .L_2il0floatpacket.10(%rip), %zmm19           #151.22
 | 
			
		||||
        vpermd    %zmm11, %zmm19, %zmm0                         #151.22
 | 
			
		||||
        vpermd    %zmm7, %zmm19, %zmm6                          #150.22
 | 
			
		||||
        vpermd    %zmm8, %zmm19, %zmm20                         #149.22
 | 
			
		||||
        vaddpd    %zmm11, %zmm0, %zmm11                         #151.22
 | 
			
		||||
        vaddpd    %zmm7, %zmm6, %zmm7                           #150.22
 | 
			
		||||
        vaddpd    %zmm8, %zmm20, %zmm8                          #149.22
 | 
			
		||||
        vpermpd   $78, %zmm11, %zmm1                            #151.22
 | 
			
		||||
        vpermpd   $78, %zmm7, %zmm10                            #150.22
 | 
			
		||||
        vpermpd   $78, %zmm8, %zmm21                            #149.22
 | 
			
		||||
        vaddpd    %zmm1, %zmm11, %zmm2                          #151.22
 | 
			
		||||
        vaddpd    %zmm10, %zmm7, %zmm12                         #150.22
 | 
			
		||||
        vaddpd    %zmm21, %zmm8, %zmm22                         #149.22
 | 
			
		||||
        vpermpd   $177, %zmm2, %zmm3                            #151.22
 | 
			
		||||
        vpermpd   $177, %zmm12, %zmm17                          #150.22
 | 
			
		||||
        vpermpd   $177, %zmm22, %zmm23                          #149.22
 | 
			
		||||
        vaddpd    %zmm3, %zmm2, %zmm4                           #151.22
 | 
			
		||||
        vaddpd    %zmm17, %zmm12, %zmm18                        #150.22
 | 
			
		||||
        vaddpd    %zmm23, %zmm22, %zmm24                        #149.22
 | 
			
		||||
                                # LOE rax rdx rcx rbx rdi r8 r14 xmm4 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
 | 
			
		||||
..B1.41:                        # Preds ..B1.40 ..B1.11
 | 
			
		||||
                                # Execution count [5.00e+00]
 | 
			
		||||
        movq      80(%rsp), %rsi                                #208.9[spill]
 | 
			
		||||
        addq      $24, %rdi                                     #143.5
 | 
			
		||||
        vaddsd    (%rsi,%rax,8), %xmm24, %xmm0                  #208.9
 | 
			
		||||
        vmovsd    %xmm0, (%rsi,%rax,8)                          #208.9
 | 
			
		||||
        movslq    %eax, %rsi                                    #143.32
 | 
			
		||||
        vaddsd    (%r14,%rax,8), %xmm18, %xmm1                  #209.9
 | 
			
		||||
        vmovsd    %xmm1, (%r14,%rax,8)                          #209.9
 | 
			
		||||
        incq      %rsi                                          #143.32
 | 
			
		||||
        vaddsd    (%r8,%rax,8), %xmm4, %xmm2                    #210.9
 | 
			
		||||
        vmovsd    %xmm2, (%r8,%rax,8)                           #210.9
 | 
			
		||||
        incq      %rax                                          #143.5
 | 
			
		||||
        cmpq      64(%rsp), %rax                                #143.5[spill]
 | 
			
		||||
        jb        ..B1.11       # Prob 82%                      #143.5
 | 
			
		||||
        jmp       ..B1.44       # Prob 100%                     #143.5
 | 
			
		||||
                                # LOE rax rdx rcx rbx rsi rdi r8 r14 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
 | 
			
		||||
..B1.43:                        # Preds ..B1.2
 | 
			
		||||
                                # Execution count [5.00e-01]
 | 
			
		||||
        movl      $.L_2__STRING.0, %edi                         #141.5
 | 
			
		||||
..___tag_value_computeForce.48:
 | 
			
		||||
#       likwid_markerStartRegion(const char *)
 | 
			
		||||
        call      likwid_markerStartRegion                      #141.5
 | 
			
		||||
..___tag_value_computeForce.49:
 | 
			
		||||
                                # LOE
 | 
			
		||||
..B1.44:                        # Preds ..B1.41 ..B1.43
 | 
			
		||||
                                # Execution count [1.00e+00]
 | 
			
		||||
        movl      $.L_2__STRING.0, %edi                         #219.5
 | 
			
		||||
        vzeroupper                                              #219.5
 | 
			
		||||
..___tag_value_computeForce.50:
 | 
			
		||||
#       likwid_markerStopRegion(const char *)
 | 
			
		||||
        call      likwid_markerStopRegion                       #219.5
 | 
			
		||||
..___tag_value_computeForce.51:
 | 
			
		||||
                                # LOE
 | 
			
		||||
..B1.45:                        # Preds ..B1.44
 | 
			
		||||
                                # Execution count [1.00e+00]
 | 
			
		||||
        xorl      %eax, %eax                                    #221.16
 | 
			
		||||
..___tag_value_computeForce.52:
 | 
			
		||||
#       getTimeStamp()
 | 
			
		||||
        call      getTimeStamp                                  #221.16
 | 
			
		||||
..___tag_value_computeForce.53:
 | 
			
		||||
                                # LOE xmm0
 | 
			
		||||
..B1.46:                        # Preds ..B1.45
 | 
			
		||||
                                # Execution count [1.00e+00]
 | 
			
		||||
        vsubsd    24(%rsp), %xmm0, %xmm0                        #224.14[spill]
 | 
			
		||||
        addq      $88, %rsp                                     #224.14
 | 
			
		||||
	.cfi_restore 3
 | 
			
		||||
        popq      %rbx                                          #224.14
 | 
			
		||||
	.cfi_restore 15
 | 
			
		||||
        popq      %r15                                          #224.14
 | 
			
		||||
	.cfi_restore 14
 | 
			
		||||
        popq      %r14                                          #224.14
 | 
			
		||||
	.cfi_restore 13
 | 
			
		||||
        popq      %r13                                          #224.14
 | 
			
		||||
	.cfi_restore 12
 | 
			
		||||
        popq      %r12                                          #224.14
 | 
			
		||||
        movq      %rbp, %rsp                                    #224.14
 | 
			
		||||
        popq      %rbp                                          #224.14
 | 
			
		||||
	.cfi_def_cfa 7, 8
 | 
			
		||||
	.cfi_restore 6
 | 
			
		||||
        ret                                                     #224.14
 | 
			
		||||
	.cfi_def_cfa 6, 16
 | 
			
		||||
	.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22
 | 
			
		||||
	.cfi_offset 6, -16
 | 
			
		||||
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
 | 
			
		||||
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
 | 
			
		||||
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
 | 
			
		||||
	.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22
 | 
			
		||||
                                # LOE
 | 
			
		||||
..B1.47:                        # Preds ..B1.13
 | 
			
		||||
                                # Execution count [4.50e-01]: Infreq
 | 
			
		||||
        movl      %r10d, %r13d                                  #173.13
 | 
			
		||||
        xorl      %r11d, %r11d                                  #173.13
 | 
			
		||||
        andl      $-8, %r13d                                    #173.13
 | 
			
		||||
        jmp       ..B1.26       # Prob 100%                     #173.13
 | 
			
		||||
                                # LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.48:                        # Preds ..B1.12
 | 
			
		||||
                                # Execution count [4.50e-01]: Infreq
 | 
			
		||||
        xorl      %r13d, %r13d                                  #173.13
 | 
			
		||||
        jmp       ..B1.34       # Prob 100%                     #173.13
 | 
			
		||||
        .align    16,0x90
 | 
			
		||||
                                # LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
	.cfi_endproc
 | 
			
		||||
# mark_end;
 | 
			
		||||
	.type	computeForce,@function
 | 
			
		||||
	.size	computeForce,.-computeForce
 | 
			
		||||
..LNcomputeForce.0:
 | 
			
		||||
	.data
 | 
			
		||||
# -- End  computeForce
 | 
			
		||||
	.section .rodata, "a"
 | 
			
		||||
	.align 64
 | 
			
		||||
	.align 64
 | 
			
		||||
.L_2il0floatpacket.2:
 | 
			
		||||
	# 64 bytes: eight IEEE-754 doubles, each 0x3ff0000000000000 = 1.0 (low dword listed first).
	.long	0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
 | 
			
		||||
	.type	.L_2il0floatpacket.2,@object
 | 
			
		||||
	.size	.L_2il0floatpacket.2,64
 | 
			
		||||
	.align 64
 | 
			
		||||
.L_2il0floatpacket.4:
 | 
			
		||||
	# 64 bytes: eight IEEE-754 doubles, each 0x3fe0000000000000 = 0.5 (low dword listed first).
	.long	0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
 | 
			
		||||
	.type	.L_2il0floatpacket.4,@object
 | 
			
		||||
	.size	.L_2il0floatpacket.4,64
 | 
			
		||||
	.align 64
 | 
			
		||||
.L_2il0floatpacket.5:
 | 
			
		||||
	# In memory order the first 24 bytes are 0x01,0x02,0x04,...,0x80, each repeated three times; the remaining 40 bytes are zero.
	# NOTE(review): the consumer of this table is not visible in this part of the listing - presumably a byte-granular bit-test/shuffle mask; confirm.
	.long	0x02010101,0x04040202,0x08080804,0x20101010,0x40402020,0x80808040,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000
 | 
			
		||||
	.type	.L_2il0floatpacket.5,@object
 | 
			
		||||
	.size	.L_2il0floatpacket.5,64
 | 
			
		||||
	.align 64
 | 
			
		||||
.L_2il0floatpacket.6:
 | 
			
		||||
	# Read as eight 64-bit values: 0, 4, 8, 12, 1, 5, 9, 13.
	# NOTE(review): presumably a qword permutation index vector; the instruction that loads it is not visible here - confirm.
	.long	0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000,0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000
 | 
			
		||||
	.type	.L_2il0floatpacket.6,@object
 | 
			
		||||
	.size	.L_2il0floatpacket.6,64
 | 
			
		||||
	.align 64
 | 
			
		||||
.L_2il0floatpacket.7:
 | 
			
		||||
	# Read as eight 64-bit values: 1, 5, 9, 13, 0, 4, 8, 12.
	# NOTE(review): presumably a qword permutation index vector; the instruction that loads it is not visible here - confirm.
	.long	0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000,0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000
 | 
			
		||||
	.type	.L_2il0floatpacket.7,@object
 | 
			
		||||
	.size	.L_2il0floatpacket.7,64
 | 
			
		||||
	.align 64
 | 
			
		||||
.L_2il0floatpacket.8:
 | 
			
		||||
	# Read as eight 64-bit values: 2, 6, 10, 14, 2, 6, 10, 14.
	# NOTE(review): presumably a qword permutation index vector; the instruction that loads it is not visible here - confirm.
	.long	0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000,0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000
 | 
			
		||||
	.type	.L_2il0floatpacket.8,@object
 | 
			
		||||
	.size	.L_2il0floatpacket.8,64
 | 
			
		||||
	.align 64
 | 
			
		||||
.L_2il0floatpacket.10:
 | 
			
		||||
	# Sixteen dword indices 8..15, 8..15. computeForce loads this into %zmm19 and uses it as the vpermd index operand,
	# so dwords 8-15 (the upper 256 bits) of each accumulator are copied into both halves before the first vaddpd of the reduction.
	.long	0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
 | 
			
		||||
	.type	.L_2il0floatpacket.10,@object
 | 
			
		||||
	.size	.L_2il0floatpacket.10,64
 | 
			
		||||
	.align 32
 | 
			
		||||
.L_2il0floatpacket.0:
 | 
			
		||||
	# 32 bytes: eight 32-bit integers, each equal to 8.
	.long	0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
 | 
			
		||||
	.type	.L_2il0floatpacket.0,@object
 | 
			
		||||
	.size	.L_2il0floatpacket.0,32
 | 
			
		||||
	.align 32
 | 
			
		||||
.L_2il0floatpacket.1:
 | 
			
		||||
	# 32 bytes: the eight 32-bit integers 0, 1, 2, ..., 7.
	.long	0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
 | 
			
		||||
	.type	.L_2il0floatpacket.1,@object
 | 
			
		||||
	.size	.L_2il0floatpacket.1,32
 | 
			
		||||
	.align 8
 | 
			
		||||
.L_2il0floatpacket.3:
 | 
			
		||||
	# One IEEE-754 double: 0x4048000000000000 = 48.0 (low dword listed first).
	.long	0x00000000,0x40480000
 | 
			
		||||
	.type	.L_2il0floatpacket.3,@object
 | 
			
		||||
	.size	.L_2il0floatpacket.3,8
 | 
			
		||||
	.align 8
 | 
			
		||||
.L_2il0floatpacket.9:
 | 
			
		||||
	# One IEEE-754 double: 0x3ff0000000000000 = 1.0; the only references in this listing are {1to8} broadcast operands of commented-out vfnmadd213pd instructions.
	.long	0x00000000,0x3ff00000
 | 
			
		||||
	.type	.L_2il0floatpacket.9,@object
 | 
			
		||||
	.size	.L_2il0floatpacket.9,8
 | 
			
		||||
	.section .rodata.str1.4, "aMS",@progbits,1
 | 
			
		||||
	.align 4
 | 
			
		||||
	.align 4
 | 
			
		||||
.L_2__STRING.0:
 | 
			
		||||
	# 1668444006 = 0x63726f66, i.e. bytes 'f','o','r','c' in memory order; with the following .word 101 ('e', NUL) this spells the 6-byte C string "force".
	.long	1668444006
 | 
			
		||||
	.word	101
 | 
			
		||||
	.type	.L_2__STRING.0,@object
 | 
			
		||||
	.size	.L_2__STRING.0,6
 | 
			
		||||
	.data
 | 
			
		||||
	.section .note.GNU-stack, ""
 | 
			
		||||
# End
 | 
			
		||||
@@ -1,585 +0,0 @@
 | 
			
		||||
# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.0.5.281 Build 20190815";
 | 
			
		||||
# mark_description "-I./src/includes -S -D_GNU_SOURCE -DAOS -DPRECISION=2 -DNEIGHBORS_LOOP_RUNS=1 -DVECTOR_WIDTH=8 -DALIGNMENT=6";
 | 
			
		||||
# mark_description "4 -restrict -Ofast -xCORE-AVX512 -qopt-zmm-usage=high -o ICC/force.s";
 | 
			
		||||
	.file "force.c"
 | 
			
		||||
	.text
 | 
			
		||||
..TXTST0:
 | 
			
		||||
.L_2__routine_start_computeForce_0:
 | 
			
		||||
# -- Begin  computeForce
 | 
			
		||||
	.text
 | 
			
		||||
# mark_begin;
 | 
			
		||||
       .align    16,0x90
 | 
			
		||||
	.globl computeForce
 | 
			
		||||
# --- computeForce(Parameter *, Atom *, Neighbor *, int)
 | 
			
		||||
computeForce:
 | 
			
		||||
# parameter 1: %rdi
 | 
			
		||||
# parameter 2: %rsi
 | 
			
		||||
# parameter 3: %rdx
 | 
			
		||||
# parameter 4: %ecx
 | 
			
		||||
..B1.1:                         # Preds ..B1.0
 | 
			
		||||
                                # Execution count [1.00e+00]
 | 
			
		||||
	.cfi_startproc
 | 
			
		||||
..___tag_value_computeForce.1:
 | 
			
		||||
..L2:
 | 
			
		||||
                                                          #103.87
 | 
			
		||||
        pushq     %rbp                                          #103.87
 | 
			
		||||
	.cfi_def_cfa_offset 16
 | 
			
		||||
        movq      %rsp, %rbp                                    #103.87
 | 
			
		||||
	.cfi_def_cfa 6, 16
 | 
			
		||||
	.cfi_offset 6, -16
 | 
			
		||||
        andq      $-64, %rsp                                    #103.87
 | 
			
		||||
        pushq     %r12                                          #103.87
 | 
			
		||||
        pushq     %r13                                          #103.87
 | 
			
		||||
        pushq     %r14                                          #103.87
 | 
			
		||||
        subq      $104, %rsp                                    #103.87
 | 
			
		||||
        xorl      %eax, %eax                                    #106.16
 | 
			
		||||
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
 | 
			
		||||
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
 | 
			
		||||
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
 | 
			
		||||
        movq      %rdx, %r14                                    #103.87
 | 
			
		||||
        movq      %rsi, %r13                                    #103.87
 | 
			
		||||
        movq      %rdi, %r12                                    #103.87
 | 
			
		||||
..___tag_value_computeForce.9:
 | 
			
		||||
#       getTimeStamp()
 | 
			
		||||
        call      getTimeStamp                                  #106.16
 | 
			
		||||
..___tag_value_computeForce.10:
 | 
			
		||||
                                # LOE rbx r12 r13 r14 r15 xmm0
 | 
			
		||||
..B1.48:                        # Preds ..B1.1
 | 
			
		||||
                                # Execution count [1.00e+00]
 | 
			
		||||
        vmovsd    %xmm0, 16(%rsp)                               #106.16[spill]
 | 
			
		||||
                                # LOE rbx r12 r13 r14 r15
 | 
			
		||||
..B1.2:                         # Preds ..B1.48
 | 
			
		||||
                                # Execution count [1.00e+00]
 | 
			
		||||
        movl      4(%r13), %ecx                                 #107.18
 | 
			
		||||
        movq      64(%r13), %r11                                #109.20
 | 
			
		||||
        movq      72(%r13), %r10                                #109.45
 | 
			
		||||
        movq      80(%r13), %r9                                 #109.70
 | 
			
		||||
        vmovsd    72(%r12), %xmm2                               #111.27
 | 
			
		||||
        vmovsd    8(%r12), %xmm1                                #112.23
 | 
			
		||||
        vmovsd    (%r12), %xmm0                                 #113.24
 | 
			
		||||
        testl     %ecx, %ecx                                    #116.24
 | 
			
		||||
        jle       ..B1.42       # Prob 50%                      #116.24
 | 
			
		||||
                                # LOE rbx r9 r10 r11 r13 r14 r15 ecx xmm0 xmm1 xmm2
 | 
			
		||||
..B1.3:                         # Preds ..B1.2
 | 
			
		||||
                                # Execution count [1.00e+00]
 | 
			
		||||
        xorl      %edi, %edi                                    #116.5
 | 
			
		||||
        movl      %ecx, %edx                                    #116.5
 | 
			
		||||
        xorl      %esi, %esi                                    #116.5
 | 
			
		||||
        movl      $1, %r8d                                      #116.5
 | 
			
		||||
        xorl      %eax, %eax                                    #117.17
 | 
			
		||||
        shrl      $1, %edx                                      #116.5
 | 
			
		||||
        je        ..B1.7        # Prob 9%                       #116.5
 | 
			
		||||
                                # LOE rax rdx rbx rsi rdi r9 r10 r11 r13 r14 r15 ecx r8d xmm0 xmm1 xmm2
 | 
			
		||||
..B1.5:                         # Preds ..B1.3 ..B1.5
 | 
			
		||||
                                # Execution count [2.50e+00]
 | 
			
		||||
        movq      %rax, (%rsi,%r11)                             #117.9
 | 
			
		||||
        incq      %rdi                                          #116.5
 | 
			
		||||
        movq      %rax, (%rsi,%r10)                             #118.9
 | 
			
		||||
        movq      %rax, (%rsi,%r9)                              #119.9
 | 
			
		||||
        movq      %rax, 8(%rsi,%r11)                            #117.9
 | 
			
		||||
        movq      %rax, 8(%rsi,%r10)                            #118.9
 | 
			
		||||
        movq      %rax, 8(%rsi,%r9)                             #119.9
 | 
			
		||||
        addq      $16, %rsi                                     #116.5
 | 
			
		||||
        cmpq      %rdx, %rdi                                    #116.5
 | 
			
		||||
        jb        ..B1.5        # Prob 63%                      #116.5
 | 
			
		||||
                                # LOE rax rdx rbx rsi rdi r9 r10 r11 r13 r14 r15 ecx xmm0 xmm1 xmm2
 | 
			
		||||
..B1.6:                         # Preds ..B1.5
 | 
			
		||||
                                # Execution count [9.00e-01]
 | 
			
		||||
        lea       1(%rdi,%rdi), %r8d                            #117.9
 | 
			
		||||
                                # LOE rax rbx r9 r10 r11 r13 r14 r15 ecx r8d xmm0 xmm1 xmm2
 | 
			
		||||
..B1.7:                         # Preds ..B1.3 ..B1.6
 | 
			
		||||
                                # Execution count [1.00e+00]
 | 
			
		||||
        lea       -1(%r8), %edx                                 #116.5
 | 
			
		||||
        cmpl      %ecx, %edx                                    #116.5
 | 
			
		||||
        jae       ..B1.9        # Prob 9%                       #116.5
 | 
			
		||||
                                # LOE rax rbx r9 r10 r11 r13 r14 r15 ecx r8d xmm0 xmm1 xmm2
 | 
			
		||||
..B1.8:                         # Preds ..B1.7
 | 
			
		||||
                                # Execution count [9.00e-01]
 | 
			
		||||
        movslq    %r8d, %r8                                     #116.5
 | 
			
		||||
        movq      %rax, -8(%r11,%r8,8)                          #117.9
 | 
			
		||||
        movq      %rax, -8(%r10,%r8,8)                          #118.9
 | 
			
		||||
        movq      %rax, -8(%r9,%r8,8)                           #119.9
 | 
			
		||||
                                # LOE rbx r9 r10 r11 r13 r14 r15 ecx xmm0 xmm1 xmm2
 | 
			
		||||
..B1.9:                         # Preds ..B1.7 ..B1.8
 | 
			
		||||
                                # Execution count [9.00e-01]
 | 
			
		||||
        vmulsd    %xmm2, %xmm2, %xmm13                          #111.45
 | 
			
		||||
        xorl      %edi, %edi                                    #124.15
 | 
			
		||||
        vmovdqu32 .L_2il0floatpacket.0(%rip), %ymm16            #153.13
 | 
			
		||||
        vmulsd    .L_2il0floatpacket.3(%rip), %xmm0, %xmm0      #177.45
 | 
			
		||||
        vmovdqu   .L_2il0floatpacket.1(%rip), %ymm15            #153.13
 | 
			
		||||
        vmovups   .L_2il0floatpacket.4(%rip), %zmm5             #177.58
 | 
			
		||||
        vbroadcastsd %xmm13, %zmm14                             #111.25
 | 
			
		||||
        vbroadcastsd %xmm1, %zmm13                              #112.21
 | 
			
		||||
        vbroadcastsd %xmm0, %zmm9                               #177.45
 | 
			
		||||
        movq      16(%r13), %rdx                                #127.25
 | 
			
		||||
        xorl      %r8d, %r8d                                    #124.5
 | 
			
		||||
        movslq    %ecx, %r12                                    #124.5
 | 
			
		||||
        xorl      %eax, %eax                                    #124.5
 | 
			
		||||
        movq      24(%r14), %r13                                #126.25
 | 
			
		||||
        movslq    16(%r14), %rcx                                #125.43
 | 
			
		||||
        movq      8(%r14), %rsi                                 #125.19
 | 
			
		||||
        shlq      $2, %rcx                                      #108.5
 | 
			
		||||
        movq      %r12, 80(%rsp)                                #124.5[spill]
 | 
			
		||||
        movq      %r13, 88(%rsp)                                #124.5[spill]
 | 
			
		||||
        movq      %r11, 96(%rsp)                                #124.5[spill]
 | 
			
		||||
        movq      %r15, 8(%rsp)                                 #124.5[spill]
 | 
			
		||||
        movq      %rbx, (%rsp)                                  #124.5[spill]
 | 
			
		||||
	.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
 | 
			
		||||
	.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
 | 
			
		||||
                                # LOE rax rdx rcx rsi rdi r8 r9 r10 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
 | 
			
		||||
..B1.10:                        # Preds ..B1.40 ..B1.9
 | 
			
		||||
                                # Execution count [5.00e+00]
 | 
			
		||||
        movq      88(%rsp), %rbx                                #126.25[spill]
 | 
			
		||||
        vxorpd    %xmm24, %xmm24, %xmm24                        #130.22
 | 
			
		||||
        vmovapd   %xmm24, %xmm18                                #131.22
 | 
			
		||||
        movl      (%rbx,%r8,4), %r11d                           #126.25
 | 
			
		||||
        vmovapd   %xmm18, %xmm4                                 #132.22
 | 
			
		||||
        vmovsd    (%rax,%rdx), %xmm10                           #127.25
 | 
			
		||||
        vmovsd    8(%rax,%rdx), %xmm6                           #128.25
 | 
			
		||||
        vmovsd    16(%rax,%rdx), %xmm12                         #129.25
 | 
			
		||||
        testl     %r11d, %r11d                                  #153.32
 | 
			
		||||
        jle       ..B1.40       # Prob 50%                      #153.32
 | 
			
		||||
                                # LOE rax rdx rcx rsi rdi r8 r9 r10 r11d xmm4 xmm6 xmm10 xmm12 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
 | 
			
		||||
..B1.11:                        # Preds ..B1.10
 | 
			
		||||
                                # Execution count [4.50e+00]
 | 
			
		||||
        vpxord    %zmm8, %zmm8, %zmm8                           #130.22
 | 
			
		||||
        vmovaps   %zmm8, %zmm7                                  #131.22
 | 
			
		||||
        vmovaps   %zmm7, %zmm11                                 #132.22
 | 
			
		||||
        cmpl      $8, %r11d                                     #153.13
 | 
			
		||||
        jl        ..B1.45       # Prob 10%                      #153.13
 | 
			
		||||
                                # LOE rax rdx rcx rsi rdi r8 r9 r10 r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.12:                        # Preds ..B1.11
 | 
			
		||||
                                # Execution count [4.50e+00]
 | 
			
		||||
        cmpl      $1200, %r11d                                  #153.13
 | 
			
		||||
        jl        ..B1.44       # Prob 10%                      #153.13
 | 
			
		||||
                                # LOE rax rdx rcx rsi rdi r8 r9 r10 r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.13:                        # Preds ..B1.12
 | 
			
		||||
                                # Execution count [4.50e+00]
 | 
			
		||||
        movq      %rcx, %r15                                    #125.43
 | 
			
		||||
        imulq     %rdi, %r15                                    #125.43
 | 
			
		||||
        addq      %rsi, %r15                                    #108.5
 | 
			
		||||
        movq      %r15, %r12                                    #153.13
 | 
			
		||||
        andq      $63, %r12                                     #153.13
 | 
			
		||||
        testl     $3, %r12d                                     #153.13
 | 
			
		||||
        je        ..B1.15       # Prob 50%                      #153.13
 | 
			
		||||
                                # LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.14:                        # Preds ..B1.13
 | 
			
		||||
                                # Execution count [2.25e+00]
 | 
			
		||||
        xorl      %r12d, %r12d                                  #153.13
 | 
			
		||||
        jmp       ..B1.17       # Prob 100%                     #153.13
 | 
			
		||||
                                # LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.15:                        # Preds ..B1.13
 | 
			
		||||
                                # Execution count [2.25e+00]
 | 
			
		||||
        testl     %r12d, %r12d                                  #153.13
 | 
			
		||||
        je        ..B1.17       # Prob 50%                      #153.13
 | 
			
		||||
                                # LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.16:                        # Preds ..B1.15
 | 
			
		||||
                                # Execution count [2.50e+01]
 | 
			
		||||
        negl      %r12d                                         #153.13
 | 
			
		||||
        addl      $64, %r12d                                    #153.13
 | 
			
		||||
        shrl      $2, %r12d                                     #153.13
 | 
			
		||||
        cmpl      %r12d, %r11d                                  #153.13
 | 
			
		||||
        cmovl     %r11d, %r12d                                  #153.13
 | 
			
		||||
                                # LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.17:                        # Preds ..B1.14 ..B1.16 ..B1.15
 | 
			
		||||
                                # Execution count [5.00e+00]
 | 
			
		||||
        movl      %r11d, %r14d                                  #153.13
 | 
			
		||||
        subl      %r12d, %r14d                                  #153.13
 | 
			
		||||
        andl      $7, %r14d                                     #153.13
 | 
			
		||||
        negl      %r14d                                         #153.13
 | 
			
		||||
        addl      %r11d, %r14d                                  #153.13
 | 
			
		||||
        cmpl      $1, %r12d                                     #153.13
 | 
			
		||||
        jb        ..B1.25       # Prob 50%                      #153.13
 | 
			
		||||
                                # LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.18:                        # Preds ..B1.17
 | 
			
		||||
                                # Execution count [4.50e+00]
 | 
			
		||||
        vmovdqa   %ymm15, %ymm4                                 #153.13
 | 
			
		||||
        xorl      %r13d, %r13d                                  #153.13
 | 
			
		||||
        vpbroadcastd %r12d, %ymm3                               #153.13
 | 
			
		||||
        vbroadcastsd %xmm10, %zmm2                              #127.23
 | 
			
		||||
        vbroadcastsd %xmm6, %zmm1                               #128.23
 | 
			
		||||
        vbroadcastsd %xmm12, %zmm0                              #129.23
 | 
			
		||||
        movslq    %r12d, %rbx                                   #153.13
 | 
			
		||||
        movq      %r9, 24(%rsp)                                 #153.13[spill]
 | 
			
		||||
        movq      %r10, 32(%rsp)                                #153.13[spill]
 | 
			
		||||
                                # LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.19:                        # Preds ..B1.23 ..B1.18
 | 
			
		||||
                                # Execution count [2.50e+01]
 | 
			
		||||
        vpcmpgtd  %ymm4, %ymm3, %k3                             #153.13
 | 
			
		||||
        vmovdqu32 (%r15,%r13,4), %ymm17{%k3}{z}                 #154.25
 | 
			
		||||
        kmovw     %k3, %r10d                                    #153.13
 | 
			
		||||
        vpaddd    %ymm17, %ymm17, %ymm18                        #155.40
 | 
			
		||||
        vpaddd    %ymm18, %ymm17, %ymm17                        #155.40
 | 
			
		||||
                                # LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r10d r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 ymm17 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
 | 
			
		||||
..B1.22:                        # Preds ..B1.19
 | 
			
		||||
                                # Execution count [1.25e+01]
 | 
			
		||||
        kmovw     %k3, %k1                                      #155.40
 | 
			
		||||
        kmovw     %k3, %k2                                      #155.40
 | 
			
		||||
        vpxord    %zmm18, %zmm18, %zmm18                        #155.40
 | 
			
		||||
        vpxord    %zmm19, %zmm19, %zmm19                        #155.40
 | 
			
		||||
        vpxord    %zmm20, %zmm20, %zmm20                        #155.40
 | 
			
		||||
        vgatherdpd 16(%rdx,%ymm17,8), %zmm18{%k1}               #155.40
 | 
			
		||||
        vgatherdpd 8(%rdx,%ymm17,8), %zmm19{%k2}                #155.40
 | 
			
		||||
        vgatherdpd (%rdx,%ymm17,8), %zmm20{%k3}                 #155.40
 | 
			
		||||
                                # LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r10d r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm18 zmm19 zmm20
 | 
			
		||||
..B1.23:                        # Preds ..B1.22
 | 
			
		||||
                                # Execution count [2.50e+01]
 | 
			
		||||
        addq      $8, %r13                                      #153.13
 | 
			
		||||
        #vpaddd    %ymm16, %ymm4, %ymm4                          #153.13
 | 
			
		||||
        #vsubpd    %zmm18, %zmm0, %zmm29                         #157.40
 | 
			
		||||
        #vsubpd    %zmm19, %zmm1, %zmm27                         #156.40
 | 
			
		||||
        #vsubpd    %zmm20, %zmm2, %zmm26                         #155.40
 | 
			
		||||
        #vmulpd    %zmm27, %zmm27, %zmm25                        #158.53
 | 
			
		||||
        #vfmadd231pd %zmm26, %zmm26, %zmm25                      #158.53
 | 
			
		||||
        #vfmadd231pd %zmm29, %zmm29, %zmm25                      #158.67
 | 
			
		||||
        #vrcp14pd  %zmm25, %zmm24                                #175.42
 | 
			
		||||
        #vcmppd    $1, %zmm14, %zmm25, %k2                       #174.26
 | 
			
		||||
        #vfpclasspd $30, %zmm24, %k0                             #175.42
 | 
			
		||||
        #kmovw     %k2, %r9d                                     #174.26
 | 
			
		||||
        #knotw     %k0, %k1                                      #175.42
 | 
			
		||||
        #vmovaps   %zmm25, %zmm17                                #175.42
 | 
			
		||||
        #andl      %r9d, %r10d                                   #174.26
 | 
			
		||||
        #vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm24, %zmm17 #175.42
 | 
			
		||||
        #kmovw     %r10d, %k3                                    #178.21
 | 
			
		||||
        #vmulpd    %zmm17, %zmm17, %zmm18                        #175.42
 | 
			
		||||
        #vfmadd213pd %zmm24, %zmm17, %zmm24{%k1}                 #175.42
 | 
			
		||||
        #vfmadd213pd %zmm24, %zmm18, %zmm24{%k1}                 #175.42
 | 
			
		||||
        #vmulpd    %zmm13, %zmm24, %zmm19                        #176.42
 | 
			
		||||
        #vmulpd    %zmm9, %zmm24, %zmm21                         #177.58
 | 
			
		||||
        #vmulpd    %zmm19, %zmm24, %zmm22                        #176.48
 | 
			
		||||
        #vmulpd    %zmm22, %zmm24, %zmm20                        #176.54
 | 
			
		||||
        #vfmsub213pd %zmm5, %zmm22, %zmm24                       #177.58
 | 
			
		||||
        #vmulpd    %zmm21, %zmm20, %zmm23                        #177.65
 | 
			
		||||
        #vmulpd    %zmm24, %zmm23, %zmm28                        #177.71
 | 
			
		||||
        #vfmadd231pd %zmm26, %zmm28, %zmm8{%k3}                  #178.21
 | 
			
		||||
        #vfmadd231pd %zmm27, %zmm28, %zmm7{%k3}                  #179.21
 | 
			
		||||
        #vfmadd231pd %zmm29, %zmm28, %zmm11{%k3}                 #180.21
 | 
			
		||||
        cmpq      %rbx, %r13                                    #153.13
 | 
			
		||||
        jb        ..B1.19       # Prob 82%                      #153.13
 | 
			
		||||
                                # LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.24:                        # Preds ..B1.23
 | 
			
		||||
                                # Execution count [4.50e+00]
 | 
			
		||||
        movq      24(%rsp), %r9                                 #[spill]
 | 
			
		||||
        movq      32(%rsp), %r10                                #[spill]
 | 
			
		||||
        cmpl      %r12d, %r11d                                  #153.13
 | 
			
		||||
        je        ..B1.39       # Prob 10%                      #153.13
 | 
			
		||||
                                # LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.25:                        # Preds ..B1.24 ..B1.17 ..B1.44
 | 
			
		||||
                                # Execution count [2.50e+01]
 | 
			
		||||
        lea       8(%r12), %ebx                                 #153.13
 | 
			
		||||
        cmpl      %ebx, %r14d                                   #153.13
 | 
			
		||||
        jl        ..B1.33       # Prob 50%                      #153.13
 | 
			
		||||
                                # LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.26:                        # Preds ..B1.25
 | 
			
		||||
                                # Execution count [4.50e+00]
 | 
			
		||||
        movq      %rcx, %r13                                    #125.43
 | 
			
		||||
        imulq     %rdi, %r13                                    #125.43
 | 
			
		||||
        vbroadcastsd %xmm10, %zmm1                              #127.23
 | 
			
		||||
        vbroadcastsd %xmm6, %zmm0                               #128.23
 | 
			
		||||
        vbroadcastsd %xmm12, %zmm2                              #129.23
 | 
			
		||||
        movslq    %r12d, %rbx                                   #153.13
 | 
			
		||||
        addq      %rsi, %r13                                    #108.5
 | 
			
		||||
        movq      %rax, 40(%rsp)                                #108.5[spill]
 | 
			
		||||
        movq      %rcx, 48(%rsp)                                #108.5[spill]
 | 
			
		||||
        movq      %rsi, 56(%rsp)                                #108.5[spill]
 | 
			
		||||
        movq      %r8, 64(%rsp)                                 #108.5[spill]
 | 
			
		||||
        movq      %rdi, 72(%rsp)                                #108.5[spill]
 | 
			
		||||
        movq      %r9, 24(%rsp)                                 #108.5[spill]
 | 
			
		||||
        movq      %r10, 32(%rsp)                                #108.5[spill]
 | 
			
		||||
                                # LOE rdx rbx r13 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.27:                        # Preds ..B1.31 ..B1.26
 | 
			
		||||
                                # Execution count [2.50e+01]
 | 
			
		||||
        vmovdqu   (%r13,%rbx,4), %ymm3                          #154.25
 | 
			
		||||
        vpaddd    %ymm3, %ymm3, %ymm4                           #155.40
 | 
			
		||||
        vpaddd    %ymm4, %ymm3, %ymm3                           #155.40
 | 
			
		||||
        movl      (%r13,%rbx,4), %r10d                          #154.25
 | 
			
		||||
        movl      4(%r13,%rbx,4), %r9d                          #154.25
 | 
			
		||||
        movl      8(%r13,%rbx,4), %r8d                          #154.25
 | 
			
		||||
        movl      12(%r13,%rbx,4), %edi                         #154.25
 | 
			
		||||
        lea       (%r10,%r10,2), %r10d                          #155.40
 | 
			
		||||
        movl      16(%r13,%rbx,4), %esi                         #154.25
 | 
			
		||||
        lea       (%r9,%r9,2), %r9d                             #155.40
 | 
			
		||||
        movl      20(%r13,%rbx,4), %ecx                         #154.25
 | 
			
		||||
        lea       (%r8,%r8,2), %r8d                             #155.40
 | 
			
		||||
        movl      24(%r13,%rbx,4), %eax                         #154.25
 | 
			
		||||
        lea       (%rdi,%rdi,2), %edi                           #155.40
 | 
			
		||||
        movl      28(%r13,%rbx,4), %r15d                        #154.25
 | 
			
		||||
        lea       (%rsi,%rsi,2), %esi                           #155.40
 | 
			
		||||
        lea       (%rcx,%rcx,2), %ecx                           #155.40
 | 
			
		||||
        lea       (%rax,%rax,2), %eax                           #155.40
 | 
			
		||||
        lea       (%r15,%r15,2), %r15d                          #155.40
 | 
			
		||||
                                # LOE rdx rbx r13 eax ecx esi edi r8d r9d r10d r11d r12d r14d r15d xmm6 xmm10 xmm12 ymm3 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.30:                        # Preds ..B1.27
 | 
			
		||||
                                # Execution count [1.25e+01]
 | 
			
		||||
        vpcmpeqb  %xmm0, %xmm0, %k1                             #155.40
 | 
			
		||||
        vpcmpeqb  %xmm0, %xmm0, %k2                             #155.40
 | 
			
		||||
        vpcmpeqb  %xmm0, %xmm0, %k3                             #155.40
 | 
			
		||||
        vpxord    %zmm4, %zmm4, %zmm4                           #155.40
 | 
			
		||||
        vpxord    %zmm17, %zmm17, %zmm17                        #155.40
 | 
			
		||||
        vpxord    %zmm18, %zmm18, %zmm18                        #155.40
 | 
			
		||||
        vgatherdpd 16(%rdx,%ymm3,8), %zmm4{%k1}                 #155.40
 | 
			
		||||
        vgatherdpd 8(%rdx,%ymm3,8), %zmm17{%k2}                 #155.40
 | 
			
		||||
        vgatherdpd (%rdx,%ymm3,8), %zmm18{%k3}                  #155.40
 | 
			
		||||
                                # LOE rdx rbx r13 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm17 zmm18
 | 
			
		||||
..B1.31:                        # Preds ..B1.30
 | 
			
		||||
                                # Execution count [2.50e+01]
 | 
			
		||||
        addl      $8, %r12d                                     #153.13
 | 
			
		||||
        addq      $8, %rbx                                      #153.13
 | 
			
		||||
        #vsubpd    %zmm4, %zmm2, %zmm26                          #157.40
 | 
			
		||||
        #vsubpd    %zmm17, %zmm0, %zmm24                         #156.40
 | 
			
		||||
        #vsubpd    %zmm18, %zmm1, %zmm23                         #155.40
 | 
			
		||||
        #vmulpd    %zmm24, %zmm24, %zmm3                         #158.53
 | 
			
		||||
        #vfmadd231pd %zmm23, %zmm23, %zmm3                       #158.53
 | 
			
		||||
        #vfmadd231pd %zmm26, %zmm26, %zmm3                       #158.67
 | 
			
		||||
        #vrcp14pd  %zmm3, %zmm22                                 #175.42
 | 
			
		||||
        #vcmppd    $1, %zmm14, %zmm3, %k2                        #174.26
 | 
			
		||||
        #vfpclasspd $30, %zmm22, %k0                             #175.42
 | 
			
		||||
        #vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm22, %zmm3 #175.42
 | 
			
		||||
        #knotw     %k0, %k1                                      #175.42
 | 
			
		||||
        #vmulpd    %zmm3, %zmm3, %zmm4                           #175.42
 | 
			
		||||
        #vfmadd213pd %zmm22, %zmm3, %zmm22{%k1}                  #175.42
 | 
			
		||||
        #vfmadd213pd %zmm22, %zmm4, %zmm22{%k1}                  #175.42
 | 
			
		||||
        #vmulpd    %zmm13, %zmm22, %zmm17                        #176.42
 | 
			
		||||
        #vmulpd    %zmm9, %zmm22, %zmm19                         #177.58
 | 
			
		||||
        #vmulpd    %zmm17, %zmm22, %zmm20                        #176.48
 | 
			
		||||
        #vmulpd    %zmm20, %zmm22, %zmm18                        #176.54
 | 
			
		||||
        #vfmsub213pd %zmm5, %zmm20, %zmm22                       #177.58
 | 
			
		||||
        #vmulpd    %zmm19, %zmm18, %zmm21                        #177.65
 | 
			
		||||
        #vmulpd    %zmm22, %zmm21, %zmm25                        #177.71
 | 
			
		||||
        #vfmadd231pd %zmm23, %zmm25, %zmm8{%k2}                  #178.21
 | 
			
		||||
        #vfmadd231pd %zmm24, %zmm25, %zmm7{%k2}                  #179.21
 | 
			
		||||
        #vfmadd231pd %zmm26, %zmm25, %zmm11{%k2}                 #180.21
 | 
			
		||||
        cmpl      %r14d, %r12d                                  #153.13
 | 
			
		||||
        jb        ..B1.27       # Prob 82%                      #153.13
 | 
			
		||||
                                # LOE rdx rbx r13 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.32:                        # Preds ..B1.31
 | 
			
		||||
                                # Execution count [4.50e+00]
 | 
			
		||||
        movq      40(%rsp), %rax                                #[spill]
 | 
			
		||||
        movq      48(%rsp), %rcx                                #[spill]
 | 
			
		||||
        movq      56(%rsp), %rsi                                #[spill]
 | 
			
		||||
        movq      64(%rsp), %r8                                 #[spill]
 | 
			
		||||
        movq      72(%rsp), %rdi                                #[spill]
 | 
			
		||||
        movq      24(%rsp), %r9                                 #[spill]
 | 
			
		||||
        movq      32(%rsp), %r10                                #[spill]
 | 
			
		||||
                                # LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.33:                        # Preds ..B1.32 ..B1.25 ..B1.45
 | 
			
		||||
                                # Execution count [5.00e+00]
 | 
			
		||||
        lea       1(%r14), %ebx                                 #153.13
 | 
			
		||||
        cmpl      %r11d, %ebx                                   #153.13
 | 
			
		||||
        ja        ..B1.39       # Prob 50%                      #153.13
 | 
			
		||||
                                # LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.34:                        # Preds ..B1.33
 | 
			
		||||
                                # Execution count [2.50e+01]
 | 
			
		||||
        imulq     %rcx, %rdi                                    #125.43
 | 
			
		||||
        vbroadcastsd %xmm10, %zmm4                              #127.23
 | 
			
		||||
        subl      %r14d, %r11d                                  #153.13
 | 
			
		||||
        addq      %rsi, %rdi                                    #108.5
 | 
			
		||||
        vpbroadcastd %r11d, %ymm0                               #153.13
 | 
			
		||||
        vpcmpgtd  %ymm15, %ymm0, %k3                            #153.13
 | 
			
		||||
        movslq    %r14d, %r14                                   #153.13
 | 
			
		||||
        vmovdqu32 (%rdi,%r14,4), %ymm1{%k3}{z}                  #154.25
 | 
			
		||||
        kmovw     %k3, %edi                                     #153.13
 | 
			
		||||
        vpaddd    %ymm1, %ymm1, %ymm2                           #155.40
 | 
			
		||||
        vpaddd    %ymm2, %ymm1, %ymm0                           #155.40
 | 
			
		||||
                                # LOE rax rdx rcx rsi r8 r9 r10 edi xmm6 xmm12 ymm0 ymm15 ymm16 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
 | 
			
		||||
..B1.37:                        # Preds ..B1.34
 | 
			
		||||
                                # Execution count [1.25e+01]
 | 
			
		||||
        kmovw     %k3, %k1                                      #155.40
 | 
			
		||||
        kmovw     %k3, %k2                                      #155.40
 | 
			
		||||
        vpxord    %zmm1, %zmm1, %zmm1                           #155.40
 | 
			
		||||
        vpxord    %zmm2, %zmm2, %zmm2                           #155.40
 | 
			
		||||
        vpxord    %zmm3, %zmm3, %zmm3                           #155.40
 | 
			
		||||
        vgatherdpd 16(%rdx,%ymm0,8), %zmm1{%k1}                 #155.40
 | 
			
		||||
        vgatherdpd 8(%rdx,%ymm0,8), %zmm2{%k2}                  #155.40
 | 
			
		||||
        vgatherdpd (%rdx,%ymm0,8), %zmm3{%k3}                   #155.40
 | 
			
		||||
                                # LOE rax rdx rcx rsi r8 r9 r10 edi xmm6 xmm12 ymm15 ymm16 zmm1 zmm2 zmm3 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.38:                        # Preds ..B1.37
 | 
			
		||||
                                # Execution count [2.50e+01]
 | 
			
		||||
        #vbroadcastsd %xmm6, %zmm6                               #128.23
 | 
			
		||||
        #vbroadcastsd %xmm12, %zmm12                             #129.23
 | 
			
		||||
        #vsubpd    %zmm1, %zmm12, %zmm23                         #157.40
 | 
			
		||||
        #vsubpd    %zmm2, %zmm6, %zmm21                          #156.40
 | 
			
		||||
        #vsubpd    %zmm3, %zmm4, %zmm20                          #155.40
 | 
			
		||||
        #vmulpd    %zmm21, %zmm21, %zmm19                        #158.53
 | 
			
		||||
        #vfmadd231pd %zmm20, %zmm20, %zmm19                      #158.53
 | 
			
		||||
        #vfmadd231pd %zmm23, %zmm23, %zmm19                      #158.67
 | 
			
		||||
        #vrcp14pd  %zmm19, %zmm18                                #175.42
 | 
			
		||||
        #vcmppd    $1, %zmm14, %zmm19, %k2                       #174.26
 | 
			
		||||
        #vfpclasspd $30, %zmm18, %k0                             #175.42
 | 
			
		||||
        #kmovw     %k2, %ebx                                     #174.26
 | 
			
		||||
        #knotw     %k0, %k1                                      #175.42
 | 
			
		||||
        #vmovaps   %zmm19, %zmm0                                 #175.42
 | 
			
		||||
        #andl      %ebx, %edi                                    #174.26
 | 
			
		||||
        #vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm18, %zmm0 #175.42
 | 
			
		||||
        #kmovw     %edi, %k3                                     #178.21
 | 
			
		||||
        #vmulpd    %zmm0, %zmm0, %zmm1                           #175.42
 | 
			
		||||
        #vfmadd213pd %zmm18, %zmm0, %zmm18{%k1}                  #175.42
 | 
			
		||||
        #vfmadd213pd %zmm18, %zmm1, %zmm18{%k1}                  #175.42
 | 
			
		||||
        #vmulpd    %zmm13, %zmm18, %zmm2                         #176.42
 | 
			
		||||
        #vmulpd    %zmm9, %zmm18, %zmm4                          #177.58
 | 
			
		||||
        #vmulpd    %zmm2, %zmm18, %zmm10                         #176.48
 | 
			
		||||
        #vmulpd    %zmm10, %zmm18, %zmm3                         #176.54
 | 
			
		||||
        #vfmsub213pd %zmm5, %zmm10, %zmm18                       #177.58
 | 
			
		||||
        #vmulpd    %zmm4, %zmm3, %zmm17                          #177.65
 | 
			
		||||
        #vmulpd    %zmm18, %zmm17, %zmm22                        #177.71
 | 
			
		||||
        #vfmadd231pd %zmm20, %zmm22, %zmm8{%k3}                  #178.21
 | 
			
		||||
        #vfmadd231pd %zmm21, %zmm22, %zmm7{%k3}                  #179.21
 | 
			
		||||
        #vfmadd231pd %zmm23, %zmm22, %zmm11{%k3}                 #180.21
 | 
			
		||||
                                # LOE rax rdx rcx rsi r8 r9 r10 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.39:                        # Preds ..B1.24 ..B1.38 ..B1.33
 | 
			
		||||
                                # Execution count [4.50e+00]
 | 
			
		||||
        vmovups   .L_2il0floatpacket.10(%rip), %zmm19           #132.22
 | 
			
		||||
        vpermd    %zmm11, %zmm19, %zmm0                         #132.22
 | 
			
		||||
        vpermd    %zmm7, %zmm19, %zmm6                          #131.22
 | 
			
		||||
        vpermd    %zmm8, %zmm19, %zmm20                         #130.22
 | 
			
		||||
        vaddpd    %zmm11, %zmm0, %zmm11                         #132.22
 | 
			
		||||
        vaddpd    %zmm7, %zmm6, %zmm7                           #131.22
 | 
			
		||||
        vaddpd    %zmm8, %zmm20, %zmm8                          #130.22
 | 
			
		||||
        vpermpd   $78, %zmm11, %zmm1                            #132.22
 | 
			
		||||
        vpermpd   $78, %zmm7, %zmm10                            #131.22
 | 
			
		||||
        vpermpd   $78, %zmm8, %zmm21                            #130.22
 | 
			
		||||
        vaddpd    %zmm1, %zmm11, %zmm2                          #132.22
 | 
			
		||||
        vaddpd    %zmm10, %zmm7, %zmm12                         #131.22
 | 
			
		||||
        vaddpd    %zmm21, %zmm8, %zmm22                         #130.22
 | 
			
		||||
        vpermpd   $177, %zmm2, %zmm3                            #132.22
 | 
			
		||||
        vpermpd   $177, %zmm12, %zmm17                          #131.22
 | 
			
		||||
        vpermpd   $177, %zmm22, %zmm23                          #130.22
 | 
			
		||||
        vaddpd    %zmm3, %zmm2, %zmm4                           #132.22
 | 
			
		||||
        vaddpd    %zmm17, %zmm12, %zmm18                        #131.22
 | 
			
		||||
        vaddpd    %zmm23, %zmm22, %zmm24                        #130.22
 | 
			
		||||
                                # LOE rax rdx rcx rsi r8 r9 r10 xmm4 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
 | 
			
		||||
..B1.40:                        # Preds ..B1.39 ..B1.10
 | 
			
		||||
                                # Execution count [5.00e+00]
 | 
			
		||||
        movq      96(%rsp), %rbx                                #188.9[spill]
 | 
			
		||||
        addq      $24, %rax                                     #124.5
 | 
			
		||||
        movslq    %r8d, %rdi                                    #124.32
 | 
			
		||||
        incq      %rdi                                          #124.32
 | 
			
		||||
        #vaddsd    (%rbx,%r8,8), %xmm24, %xmm0                   #188.9
 | 
			
		||||
        #vmovsd    %xmm0, (%rbx,%r8,8)                           #188.9
 | 
			
		||||
        #vaddsd    (%r10,%r8,8), %xmm18, %xmm1                   #189.9
 | 
			
		||||
        #vmovsd    %xmm1, (%r10,%r8,8)                           #189.9
 | 
			
		||||
        #vaddsd    (%r9,%r8,8), %xmm4, %xmm2                     #190.9
 | 
			
		||||
        #vmovsd    %xmm2, (%r9,%r8,8)                            #190.9
 | 
			
		||||
        incq      %r8                                           #124.5
 | 
			
		||||
        cmpq      80(%rsp), %r8                                 #124.5[spill]
 | 
			
		||||
        jb        ..B1.10       # Prob 82%                      #124.5
 | 
			
		||||
                                # LOE rax rdx rcx rsi rdi r8 r9 r10 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
 | 
			
		||||
..B1.41:                        # Preds ..B1.40
 | 
			
		||||
                                # Execution count [9.00e-01]
 | 
			
		||||
        movq      8(%rsp), %r15                                 #[spill]
 | 
			
		||||
	.cfi_restore 15
 | 
			
		||||
        movq      (%rsp), %rbx                                  #[spill]
 | 
			
		||||
	.cfi_restore 3
 | 
			
		||||
                                # LOE rbx r15
 | 
			
		||||
..B1.42:                        # Preds ..B1.2 ..B1.41
 | 
			
		||||
                                # Execution count [1.00e+00]
 | 
			
		||||
        xorl      %eax, %eax                                    #201.16
 | 
			
		||||
        vzeroupper                                              #201.16
 | 
			
		||||
..___tag_value_computeForce.43:
 | 
			
		||||
#       getTimeStamp()
 | 
			
		||||
        call      getTimeStamp                                  #201.16
 | 
			
		||||
..___tag_value_computeForce.44:
 | 
			
		||||
                                # LOE rbx r15 xmm0
 | 
			
		||||
..B1.43:                        # Preds ..B1.42
 | 
			
		||||
                                # Execution count [1.00e+00]
 | 
			
		||||
        vsubsd    16(%rsp), %xmm0, %xmm0                        #204.14[spill]
 | 
			
		||||
        addq      $104, %rsp                                    #204.14
 | 
			
		||||
	.cfi_restore 14
 | 
			
		||||
        popq      %r14                                          #204.14
 | 
			
		||||
	.cfi_restore 13
 | 
			
		||||
        popq      %r13                                          #204.14
 | 
			
		||||
	.cfi_restore 12
 | 
			
		||||
        popq      %r12                                          #204.14
 | 
			
		||||
        movq      %rbp, %rsp                                    #204.14
 | 
			
		||||
        popq      %rbp                                          #204.14
 | 
			
		||||
	.cfi_def_cfa 7, 8
 | 
			
		||||
	.cfi_restore 6
 | 
			
		||||
        ret                                                     #204.14
 | 
			
		||||
	.cfi_def_cfa 6, 16
 | 
			
		||||
	.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
 | 
			
		||||
	.cfi_offset 6, -16
 | 
			
		||||
	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
 | 
			
		||||
	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
 | 
			
		||||
	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
 | 
			
		||||
	.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
 | 
			
		||||
                                # LOE
 | 
			
		||||
..B1.44:                        # Preds ..B1.12
 | 
			
		||||
                                # Execution count [4.50e-01]: Infreq
 | 
			
		||||
        movl      %r11d, %r14d                                  #153.13
 | 
			
		||||
        xorl      %r12d, %r12d                                  #153.13
 | 
			
		||||
        andl      $-8, %r14d                                    #153.13
 | 
			
		||||
        jmp       ..B1.25       # Prob 100%                     #153.13
 | 
			
		||||
                                # LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
..B1.45:                        # Preds ..B1.11
 | 
			
		||||
                                # Execution count [4.50e-01]: Infreq
 | 
			
		||||
        xorl      %r14d, %r14d                                  #153.13
 | 
			
		||||
        jmp       ..B1.33       # Prob 100%                     #153.13
 | 
			
		||||
        .align    16,0x90
 | 
			
		||||
                                # LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
 | 
			
		||||
	.cfi_endproc
 | 
			
		||||
# mark_end;
 | 
			
		||||
	.type	computeForce,@function
 | 
			
		||||
	.size	computeForce,.-computeForce
 | 
			
		||||
..LNcomputeForce.0:
	.data
# -- End  computeForce

# ----------------------------------------------------------------------
# Read-only constant pool.
# Each packet keeps its original label, alignment, object type and size.
# Double-precision values are spelled as one little-endian .quad per
# element (identical bytes to the former low/high .long pairs).
# ----------------------------------------------------------------------
	.section .rodata, "a"
	.align 64

# 8 x 0x3ff0000000000000 (IEEE-754 double 1.0)
.L_2il0floatpacket.2:
	.quad	0x3ff0000000000000,0x3ff0000000000000,0x3ff0000000000000,0x3ff0000000000000
	.quad	0x3ff0000000000000,0x3ff0000000000000,0x3ff0000000000000,0x3ff0000000000000
	.type	.L_2il0floatpacket.2,@object
	.size	.L_2il0floatpacket.2,64

# 8 x 0x3fe0000000000000 (IEEE-754 double 0.5)
	.align 64
.L_2il0floatpacket.4:
	.quad	0x3fe0000000000000,0x3fe0000000000000,0x3fe0000000000000,0x3fe0000000000000
	.quad	0x3fe0000000000000,0x3fe0000000000000,0x3fe0000000000000,0x3fe0000000000000
	.type	.L_2il0floatpacket.4,@object
	.size	.L_2il0floatpacket.4,64

# 24 bytes of bit patterns followed by 40 zero bytes; the consumer is
# outside this section of the file.
	.align 64
.L_2il0floatpacket.5:
	.long	0x02010101,0x04040202,0x08080804,0x20101010
	.long	0x40402020,0x80808040,0x00000000,0x00000000
	.long	0x00000000,0x00000000,0x00000000,0x00000000
	.long	0x00000000,0x00000000,0x00000000,0x00000000
	.type	.L_2il0floatpacket.5,@object
	.size	.L_2il0floatpacket.5,64

# Eight 64-bit entries: 0,4,8,12,1,5,9,13
	.align 64
.L_2il0floatpacket.6:
	.quad	0x0000000000000000,0x0000000000000004,0x0000000000000008,0x000000000000000c
	.quad	0x0000000000000001,0x0000000000000005,0x0000000000000009,0x000000000000000d
	.type	.L_2il0floatpacket.6,@object
	.size	.L_2il0floatpacket.6,64

# Eight 64-bit entries: 1,5,9,13,0,4,8,12
	.align 64
.L_2il0floatpacket.7:
	.quad	0x0000000000000001,0x0000000000000005,0x0000000000000009,0x000000000000000d
	.quad	0x0000000000000000,0x0000000000000004,0x0000000000000008,0x000000000000000c
	.type	.L_2il0floatpacket.7,@object
	.size	.L_2il0floatpacket.7,64

# Eight 64-bit entries: 2,6,10,14,2,6,10,14
	.align 64
.L_2il0floatpacket.8:
	.quad	0x0000000000000002,0x0000000000000006,0x000000000000000a,0x000000000000000e
	.quad	0x0000000000000002,0x0000000000000006,0x000000000000000a,0x000000000000000e
	.type	.L_2il0floatpacket.8,@object
	.size	.L_2il0floatpacket.8,64

# Sixteen 32-bit entries: 8..15 repeated twice
	.align 64
.L_2il0floatpacket.10:
	.long	0x00000008,0x00000009,0x0000000a,0x0000000b
	.long	0x0000000c,0x0000000d,0x0000000e,0x0000000f
	.long	0x00000008,0x00000009,0x0000000a,0x0000000b
	.long	0x0000000c,0x0000000d,0x0000000e,0x0000000f
	.type	.L_2il0floatpacket.10,@object
	.size	.L_2il0floatpacket.10,64

# Eight 32-bit entries, all equal to 8
	.align 32
.L_2il0floatpacket.0:
	.long	0x00000008,0x00000008,0x00000008,0x00000008
	.long	0x00000008,0x00000008,0x00000008,0x00000008
	.type	.L_2il0floatpacket.0,@object
	.size	.L_2il0floatpacket.0,32

# Eight 32-bit entries: 0..7
	.align 32
.L_2il0floatpacket.1:
	.long	0x00000000,0x00000001,0x00000002,0x00000003
	.long	0x00000004,0x00000005,0x00000006,0x00000007
	.type	.L_2il0floatpacket.1,@object
	.size	.L_2il0floatpacket.1,32

# 0x4048000000000000 (IEEE-754 double 48.0)
	.align 8
.L_2il0floatpacket.3:
	.quad	0x4048000000000000
	.type	.L_2il0floatpacket.3,@object
	.size	.L_2il0floatpacket.3,8

# 0x3ff0000000000000 (IEEE-754 double 1.0)
	.align 8
.L_2il0floatpacket.9:
	.quad	0x3ff0000000000000
	.type	.L_2il0floatpacket.9,@object
	.size	.L_2il0floatpacket.9,8
	.data
	.section .note.GNU-stack, ""
# End
 | 
			
		||||
@@ -1,324 +0,0 @@
 | 
			
		||||
.intel_syntax noprefix
 | 
			
		||||
 | 
			
		||||
.text
 | 
			
		||||
.align    16,0x90
 | 
			
		||||
.globl computeForce
 | 
			
		||||
computeForce:
#-----------------------------------------------------------------------
# computeForce -- Lennard-Jones force kernel, AVX-512, AoS positions.
#
# Arguments (integer registers as used by the System V AMD64 ABI):
#   parameter 1: rdi Parameter*
#   parameter 2: rsi Atom*
#   parameter 3: rdx Neighbor*
# Returns: xmm0 = 0.0 (the getTimeStamp timing code is commented out).
#
# Steps:
#   1. zero fx/fy/fz for all Nlocal atoms (unrolled by two + tail),
#   2. for every local atom i, process its neighbor list eight entries
#      at a time (gathered loads), then a masked remainder,
#   3. horizontally reduce the eight lanes and add the sums to
#      fx[i], fy[i], fz[i].
#
# Stack: the six pushes save rbp, r12-r15 and rbx. Spills live below rsp
# at [-8+rsp] .. [-48+rsp] without adjusting rsp (red zone), which is
# only valid while this routine makes no calls.
# NOTE(review): re-enabling the commented-out `call getTimeStamp` would
# require a real stack frame (the call overwrites the slots below rsp)
# and a 16-byte aligned rsp at the call site.
#
# Register roles inside the atom loop:
#   r10 = i, rax = i * 24 (byte offset of atom i in the AoS array),
#   rdx = atom->x, r11 = neighbor->neighbors, r12 = maxneighs * 4,
#   rdi = atom->fz, zmm14 = 48 * epsilon, zmm15 = sigma6,
#   zmm16 = cutforce^2, zmm7 = 0.5, ymm17 = [0..7].
#-----------------------------------------------------------------------
        push      rbp
        push      r12
        push      r13
        push      r14
        push      r15
        push      rbx
        #call      getTimeStamp                                      # xmm0 <- getTimeStamp()
        #vmovsd    QWORD PTR [-56+rsp], xmm0                         # [-56+rsp] <- xmm0 [spill]
        mov       r9d, DWORD PTR [4+rsi]                            # r9d <- atom->Nlocal
        vmovsd    xmm2, QWORD PTR [96+rdi]                          # xmm2 <- param->cutforce
        vmovsd    xmm1, QWORD PTR [32+rdi]                          # xmm1 <- param->sigma6
        vmovsd    xmm0, QWORD PTR [24+rdi]                          # xmm0 <- param->epsilon
        mov       r13, QWORD PTR [64+rsi]                           # r13 <- atom->fx
        mov       r14, QWORD PTR [72+rsi]                           # r14 <- atom->fy
        mov       rdi, QWORD PTR [80+rsi]                           # rdi <- atom->fz
        test      r9d, r9d                                          # atom->Nlocal <= 0 ?
        # Nothing to do: go straight to the epilogue. The atom-loop tail
        # must not be entered here because the spill slots, r10 and the
        # force accumulators are not initialized yet.
        jle       ..compute_force_done
        xor       r10d, r10d                                        # r10d <- 0 (pair counter)
        mov       ecx, r9d                                          # ecx <- atom->Nlocal
        xor       r8d, r8d                                          # r8d <- 0 (byte offset)
        mov       r11d, 1                                           # r11d <- 1
        xor       eax, eax                                          # eax <- 0 (value stored into the forces)
        shr       ecx, 1                                            # ecx <- atom->Nlocal >> 1 (number of pairs)
        je        ..zero_last_element                               # no full pair (Nlocal == 1)

# Init forces to zero loop (unroll factor = 2)
..init_force_loop:
        mov       QWORD PTR [r8+r13], rax                           # fx[i] <- 0
        mov       QWORD PTR [r8+r14], rax                           # fy[i] <- 0
        mov       QWORD PTR [r8+rdi], rax                           # fz[i] <- 0
        mov       QWORD PTR [8+r8+r13], rax                         # fx[i + 1] <- 0
        mov       QWORD PTR [8+r8+r14], rax                         # fy[i + 1] <- 0
        mov       QWORD PTR [8+r8+rdi], rax                         # fz[i + 1] <- 0
        add       r8, 16                                            # advance byte offset by two doubles
        inc       r10                                               # next pair
        cmp       r10, rcx                                          # pair < Nlocal / 2
        jb        ..init_force_loop

# Trick to make r11d contain the index of the last element to be zeroed plus 1
# Maybe we can directly put r10+r10 here and zero r11d above, then remove the -1 below
        lea       r11d, DWORD PTR [1+r10+r10]                       # r11d <- pairs * 2 + 1
..zero_last_element:
        lea       ecx, DWORD PTR [-1+r11]                           # ecx <- number of elements zeroed so far
        cmp       ecx, r9d                                          # all Nlocal elements done (even Nlocal)?
        jae       ..before_atom_loop

        # Set last element to zero (odd Nlocal)
        movsxd    r11, r11d                                         # r11 <- index of last element + 1
        mov       QWORD PTR [-8+r13+r11*8], rax                     # fx[last] <- 0
        mov       QWORD PTR [-8+r14+r11*8], rax                     # fy[last] <- 0
        mov       QWORD PTR [-8+rdi+r11*8], rax                     # fz[last] <- 0

# Initialize registers to be used within atom loop
..before_atom_loop:
        vmulsd    xmm15, xmm2, xmm2                                 # xmm15 <- cutforcesq
        # NOTE(review): ymm18 is not read anywhere else in this routine.
        vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip]      # ymm18 <- [8, ...]
        vmulsd    xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip]   # xmm0 <- 48 *  epsilon
        vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip]      # ymm17 <- [0..7]
        vmovups   zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip]       # zmm7 <- [0.5, ...]
        vbroadcastsd zmm16, xmm15                                   # zmm16 <- [cutforcesq, ...]
        vbroadcastsd zmm15, xmm1                                    # zmm15 <- [param->sigma6, ...]
        vbroadcastsd zmm14, xmm0                                    # zmm14 <- [48 * epsilon, ...]
        movsxd    r9, r9d                                           # r9 <- atom->Nlocal
        xor       r10d, r10d                                        # r10d <- 0 (i)
        mov       rcx, QWORD PTR [24+rdx]                           # rcx <- neighbor->numneigh
        mov       r11, QWORD PTR [8+rdx]                            # r11 <- neighbor->neighbors
        movsxd    r12, DWORD PTR [16+rdx]                           # r12 <- neighbor->maxneighs
        mov       rdx, QWORD PTR [16+rsi]                           # rdx <- atom->x
        ### AOS
        xor       eax, eax                                          # rax <- 0 (byte offset of atom i, 24 per atom)
        ### SOA
        #mov       rax, QWORD PTR [24+rsi]                          # rax <- atom->y
        #mov       rsi, QWORD PTR [32+rsi]                          # rsi <- atom->z
        ###
        shl       r12, 2                                            # r12 <- neighbor->maxneighs * 4

        # Register spilling (slots below rsp, see header)
        mov       QWORD PTR [-32+rsp], r9                           # [-32+rsp] <- atom->Nlocal
        mov       QWORD PTR [-24+rsp], rcx                          # [-24+rsp] <- neighbor->numneigh
        mov       QWORD PTR [-16+rsp], r14                          # [-16+rsp] <- atom->fy
        mov       QWORD PTR [-8+rsp], r13                           # [-8+rsp] <- atom->fx
        # NOTE(review): the next two slots are never read back; r15 and
        # rbx are restored by the pops in the epilogue.
        mov       QWORD PTR [-40+rsp], r15                          # [-40+rsp] <- r15
        mov       QWORD PTR [-48+rsp], rbx                          # [-48+rsp] <- rbx

..atom_loop_begin:
        mov       rcx, QWORD PTR [-24+rsp]                          # rcx <- neighbor->numneigh
        vxorpd    xmm25, xmm25, xmm25                               # xmm25 <- 0 (fix)
        vmovapd   xmm20, xmm25                                      # xmm20 <- 0 (fiy)
        mov       r13d, DWORD PTR [rcx+r10*4]                       # r13d <- neighbor->numneigh[i] (numneighs)
        vmovapd   xmm4, xmm20                                       # xmm4 <- 0 (fiz)

        ### AOS
        vmovsd    xmm8, QWORD PTR[rdx+rax]                          # xmm8 <- atom->x[i * 3]
        vmovsd    xmm9, QWORD PTR[8+rdx+rax]                        # xmm9 <- atom->x[i * 3 + 1]
        vmovsd    xmm10, QWORD PTR[16+rdx+rax]                      # xmm10 <- atom->x[i * 3 + 2]
        ### SOA
        #vmovsd    xmm8, QWORD PTR [rdx+r10*8]                      # xmm8 <- atom->x[i]
        #vmovsd    xmm9, QWORD PTR [rax+r10*8]                      # xmm9 <- atom->y[i]
        #vmovsd    xmm10, QWORD PTR [rsi+r10*8]                     # xmm10 <- atom->z[i]
        ###
        vbroadcastsd zmm0, xmm8                                     # zmm0 <- atom_x(i)
        vbroadcastsd zmm1, xmm9                                     # zmm1 <- atom_y(i)
        vbroadcastsd zmm2, xmm10                                    # zmm2 <- atom_z(i)
        test      r13d, r13d                                        # numneighs <= 0 ?
        jle       ..atom_loop_exit                                  # adds the zeroed xmm25/xmm20/xmm4 to the forces

        vpxord    zmm13, zmm13, zmm13                               # zmm13 <- 0 (fix)
        vmovaps   zmm12, zmm13                                      # zmm12 <- 0 (fiy)
        vmovaps   zmm11, zmm12                                      # zmm11 <- 0 (fiz)
        mov       rcx, r12                                          # rcx <- neighbor->maxneighs * 4
        imul      rcx, r10                                          # rcx <- neighbor->maxneighs * 4 * i
        add       rcx, r11                                          # rcx <- &neighbor->neighbors[neighbor->maxneighs * i]
        xor       r9d, r9d                                          # r9d <- 0 (k)
        mov       r14d, r13d                                        # r14d <- numneighs (neighbors still to process)
        cmp       r14d, 8
        jl        ..compute_forces_remainder                        # fewer than 8 neighbors: masked path only

# Full vectors: eight neighbors per iteration
..compute_forces:
        # Gathers clear their mask register while completing, so the
        # all-ones masks are rebuilt on every iteration.
        vpcmpeqb  k1, xmm0, xmm0
        vpcmpeqb  k2, xmm0, xmm0
        vpcmpeqb  k3, xmm0, xmm0
        vmovdqu   ymm3, YMMWORD PTR [rcx+r9*4]                      # ymm3 <- neighbor indices j[k..k+7]
        vpxord    zmm5, zmm5, zmm5
        vpxord    zmm6, zmm6, zmm6

        ### AOS
        vpaddd     ymm4, ymm3, ymm3                                 # ymm4 <- j * 2
        vpaddd     ymm3, ymm3, ymm4                                 # ymm3 <- j * 3
        vpxord     zmm4, zmm4, zmm4
        vgatherdpd zmm4{k1}, [rdx+ymm3*8]                           # zmm4 <- atom->x[j * 3]
        vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]                         # zmm5 <- atom->x[j * 3 + 1]
        vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]                        # zmm6 <- atom->x[j * 3 + 2]
        ### SOA
        #vpxord     zmm4, zmm4, zmm4
        #vgatherdpd zmm5{k2}, [rax+ymm3*8]
        #vgatherdpd zmm4{k1}, [rdx+ymm3*8]
        #vgatherdpd zmm6{k3}, [rsi+ymm3*8]
        ###

        vsubpd    zmm29, zmm1, zmm5                                 # zmm29 <- atom_y(i) - atom_y(j) -- dely
        vsubpd    zmm28, zmm0, zmm4                                 # zmm28 <- atom_x(i) - atom_x(j) -- delx
        vsubpd    zmm31, zmm2, zmm6                                 # zmm31 <- atom_z(i) - atom_z(j) -- delz
        vmulpd    zmm20, zmm29, zmm29                               # zmm20 <- dely * dely
        vfmadd231pd zmm20, zmm28, zmm28                             # zmm20 <- dely * dely + delx * delx
        vfmadd231pd zmm20, zmm31, zmm31                             # zmm20 <- zmm20 + delz * delz --  rsq

        # Cutoff radius condition
        vrcp14pd  zmm27, zmm20                                      # zmm27 <- approx. 1.0 / rsq (sr2)
        vcmppd    k5, zmm20, zmm16, 1                               # k5 <- rsq < cutforcesq
        vmulpd    zmm22, zmm27, zmm15                               # zmm22 <-  sr2 * sigma6
        vmulpd    zmm24, zmm27, zmm14                               # zmm24 <- 48.0 * epsilon * sr2
        vmulpd    zmm25, zmm27, zmm22                               # zmm25 <- sr2 * sigma6 * sr2
        vmulpd    zmm23, zmm27, zmm25                               # zmm23 <- sr2 * sigma6 * sr2 * sr2
        vfmsub213pd zmm27, zmm25, zmm7                              # zmm27 <- sr2 * sigma6 * sr2 * sr2 - 0.5
        vmulpd    zmm26, zmm23, zmm24                               # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
        vmulpd    zmm30, zmm26, zmm27                               # zmm30 <- force
        vfmadd231pd zmm13{k5}, zmm30, zmm28                         # fix += force * delx
        vfmadd231pd zmm12{k5}, zmm30, zmm29                         # fiy += force * dely
        vfmadd231pd zmm11{k5}, zmm30, zmm31                         # fiz += force * delz
        sub       r14d, 8                                           # eight neighbors consumed
        add       r9, 8                                             # k += 8
        cmp       r14d, 8
        jge       ..compute_forces

# Check if there are remaining neighbors to be computed
..compute_forces_remainder:
        test      r14d, r14d
        jle       ..sum_up_forces

        vpbroadcastd ymm4, r14d                                     # ymm4 <- [remaining, ...]
        vpcmpgtd  k1, ymm4, ymm17                                   # k1 <- lane index < remaining
        kmovw     r15d, k1                                          # keep the lane mask; the gathers consume k1..k3
        vmovdqu32 ymm3{k1}{z}, YMMWORD PTR [rcx+r9*4]               # ymm3 <- remaining neighbor indices, others 0
        kmovw     k2, k1
        kmovw     k3, k1
        vpxord    zmm5, zmm5, zmm5
        vpxord    zmm6, zmm6, zmm6

        ### AOS
        vpaddd     ymm4, ymm3, ymm3                                 # ymm4 <- j * 2
        vpaddd     ymm3, ymm3, ymm4                                 # ymm3 <- j * 3
        vpxord     zmm4, zmm4, zmm4
        vgatherdpd zmm4{k1}, [rdx+ymm3*8]                           # zmm4 <- atom->x[j * 3]
        vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]                         # zmm5 <- atom->x[j * 3 + 1]
        vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]                        # zmm6 <- atom->x[j * 3 + 2]
        #### SOA
        #vpxord     zmm4, zmm4, zmm4
        #vgatherdpd zmm5{k2}, [rax+ymm3*8]
        #vgatherdpd zmm4{k1}, [rdx+ymm3*8]
        #vgatherdpd zmm6{k3}, [rsi+ymm3*8]
        ###

        vsubpd    zmm29, zmm1, zmm5                                 # zmm29 <- atom_y(i) - atom_y(j) -- dely
        vsubpd    zmm28, zmm0, zmm4                                 # zmm28 <- atom_x(i) - atom_x(j) -- delx
        vsubpd    zmm31, zmm2, zmm6                                 # zmm31 <- atom_z(i) - atom_z(j) -- delz
        vmulpd    zmm20, zmm29, zmm29                               # zmm20 <- dely * dely
        vfmadd231pd zmm20, zmm28, zmm28                             # zmm20 <- dely * dely + delx * delx
        vfmadd231pd zmm20, zmm31, zmm31                             # zmm20 <- zmm20 + delz * delz --  rsq

        # Cutoff radius condition
        vrcp14pd  zmm27, zmm20                                      # zmm27 <- approx. 1.0 / rsq (sr2)
        vcmppd    k5, zmm20, zmm16, 1                               # k5 <- rsq < cutforcesq
        kmovw     r9d, k5                                           # r9d <- rsq < cutforcesq (k is no longer needed)
        and       r15d, r9d                                         # r15d <- rsq < cutforcesq && k < numneighs
        kmovw     k3, r15d                                          # k3 <- rsq < cutforcesq && k < numneighs
        vmulpd    zmm22, zmm27, zmm15                               # zmm22 <-  sr2 * sigma6
        vmulpd    zmm24, zmm27, zmm14                               # zmm24 <- 48.0 * epsilon * sr2
        vmulpd    zmm25, zmm27, zmm22                               # zmm25 <- sr2 * sigma6 * sr2
        vmulpd    zmm23, zmm27, zmm25                               # zmm23 <- sr2 * sigma6 * sr2 * sr2
        vfmsub213pd zmm27, zmm25, zmm7                              # zmm27 <- sr2 * sigma6 * sr2 * sr2 - 0.5
        vmulpd    zmm26, zmm23, zmm24                               # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
        vmulpd    zmm30, zmm26, zmm27                               # zmm30 <- force
        vfmadd231pd zmm13{k3}, zmm30, zmm28                         # fix += force * delx
        vfmadd231pd zmm12{k3}, zmm30, zmm29                         # fiy += force * dely
        vfmadd231pd zmm11{k3}, zmm30, zmm31                         # fiz += force * delz

# Forces are currently separated in different lanes of zmm registers, hence it is necessary to permute
# and add them (reduction) to obtain the final contribution for the current atom
..sum_up_forces:
        vmovups   zmm10, ZMMWORD PTR .L_2il0floatpacket.6[rip]      # zmm10 <- dword indices 8..15, 8..15
        vpermd    zmm0, zmm10, zmm11                                # upper 256 bits copied into both halves
        vpermd    zmm5, zmm10, zmm12
        vpermd    zmm21, zmm10, zmm13
        vaddpd    zmm11, zmm0, zmm11                                # lanes 0..3 <- lane n + lane n+4
        vaddpd    zmm12, zmm5, zmm12
        vaddpd    zmm13, zmm21, zmm13
        vpermpd   zmm1, zmm11, 78                                   # swap 128-bit pairs within each 256-bit half
        vpermpd   zmm6, zmm12, 78
        vpermpd   zmm22, zmm13, 78
        vaddpd    zmm2, zmm11, zmm1
        vaddpd    zmm8, zmm12, zmm6
        vaddpd    zmm23, zmm13, zmm22
        vpermpd   zmm3, zmm2, 177                                   # swap adjacent doubles
        vpermpd   zmm9, zmm8, 177
        vpermpd   zmm24, zmm23, 177
        vaddpd    zmm4, zmm2, zmm3                                  # xmm4  low double <- sum of fiz lanes
        vaddpd    zmm20, zmm8, zmm9                                 # xmm20 low double <- sum of fiy lanes
        vaddpd    zmm25, zmm23, zmm24                               # xmm25 low double <- sum of fix lanes

# Tail of the atom loop: accumulate into fx/fy/fz and advance to atom i + 1
..atom_loop_exit:
        mov       rcx, QWORD PTR [-8+rsp]                       #84.9[spill]  rcx <- atom->fx
        mov       rbx, QWORD PTR [-16+rsp]                      #85.9[spill]  rbx <- atom->fy

        ### AOS
        add       rax, 24                                       # byte offset of the next atom (3 doubles)
        ###

        vaddsd    xmm0, xmm25, QWORD PTR [rcx+r10*8]            #84.9
        vmovsd    QWORD PTR [rcx+r10*8], xmm0                   #84.9  fx[i] += fix
        vaddsd    xmm1, xmm20, QWORD PTR [rbx+r10*8]            #85.9
        vmovsd    QWORD PTR [rbx+r10*8], xmm1                   #85.9  fy[i] += fiy
        vaddsd    xmm2, xmm4, QWORD PTR [rdi+r10*8]             #86.9
        vmovsd    QWORD PTR [rdi+r10*8], xmm2                   #86.9  fz[i] += fiz
        inc       r10                                           #55.5
        cmp       r10, QWORD PTR [-32+rsp]                      #55.5[spill]  i < Nlocal
        jb        ..atom_loop_begin

# Common epilogue, also reached directly when Nlocal <= 0
..compute_force_done:
        vzeroupper                                              #93.12
        vxorpd    xmm0, xmm0, xmm0                              #93.12  return value 0.0
        #call      getTimeStamp                                  # xmm0 <- getTimeStamp()
        #vsubsd    xmm0, xmm0, QWORD PTR [-56+rsp]               # xmm0 <- E-S
        pop       rbx
        pop       r15
        pop       r14                                           #93.12
        pop       r13                                           #93.12
        pop       r12                                           #93.12
        pop       rbp                                           #93.12
        ret                                                     #93.12

.type	computeForce,@function
.size	computeForce,.-computeForce
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
..LNcomputeForce.0:
	.data
# -- End  computeForce

# ----------------------------------------------------------------------
# Read-only constant pool.
# Each packet keeps its original label, alignment, object type and size.
# Double-precision values are spelled as one little-endian .quad per
# element (identical bytes to the former low/high .long pairs).
# ----------------------------------------------------------------------
	.section .rodata, "a"
	.align 64

# 8 x 0x3ff0000000000000 (IEEE-754 double 1.0)
.L_2il0floatpacket.2:
	.quad	0x3ff0000000000000,0x3ff0000000000000,0x3ff0000000000000,0x3ff0000000000000
	.quad	0x3ff0000000000000,0x3ff0000000000000,0x3ff0000000000000,0x3ff0000000000000
	.type	.L_2il0floatpacket.2,@object
	.size	.L_2il0floatpacket.2,64

# 8 x 0x3fe0000000000000 (IEEE-754 double 0.5)
	.align 64
.L_2il0floatpacket.4:
	.quad	0x3fe0000000000000,0x3fe0000000000000,0x3fe0000000000000,0x3fe0000000000000
	.quad	0x3fe0000000000000,0x3fe0000000000000,0x3fe0000000000000,0x3fe0000000000000
	.type	.L_2il0floatpacket.4,@object
	.size	.L_2il0floatpacket.4,64

# Sixteen 32-bit entries: 8..15 repeated twice
	.align 64
.L_2il0floatpacket.6:
	.long	0x00000008,0x00000009,0x0000000a,0x0000000b
	.long	0x0000000c,0x0000000d,0x0000000e,0x0000000f
	.long	0x00000008,0x00000009,0x0000000a,0x0000000b
	.long	0x0000000c,0x0000000d,0x0000000e,0x0000000f
	.type	.L_2il0floatpacket.6,@object
	.size	.L_2il0floatpacket.6,64

# Eight 32-bit entries, all equal to 8
	.align 32
.L_2il0floatpacket.0:
	.long	0x00000008,0x00000008,0x00000008,0x00000008
	.long	0x00000008,0x00000008,0x00000008,0x00000008
	.type	.L_2il0floatpacket.0,@object
	.size	.L_2il0floatpacket.0,32

# Eight 32-bit entries: 0..7
	.align 32
.L_2il0floatpacket.1:
	.long	0x00000000,0x00000001,0x00000002,0x00000003
	.long	0x00000004,0x00000005,0x00000006,0x00000007
	.type	.L_2il0floatpacket.1,@object
	.size	.L_2il0floatpacket.1,32

# 0x4048000000000000 (IEEE-754 double 48.0)
	.align 8
.L_2il0floatpacket.3:
	.quad	0x4048000000000000
	.type	.L_2il0floatpacket.3,@object
	.size	.L_2il0floatpacket.3,8

# 0x3ff0000000000000 (IEEE-754 double 1.0)
	.align 8
.L_2il0floatpacket.5:
	.quad	0x3ff0000000000000
	.type	.L_2il0floatpacket.5,@object
	.size	.L_2il0floatpacket.5,8
	.data
	.section .note.GNU-stack, ""
# End
 | 
			
		||||
@@ -1,326 +0,0 @@
 | 
			
		||||
.intel_syntax noprefix
 | 
			
		||||
 | 
			
		||||
.text
 | 
			
		||||
.align    16,0x90
 | 
			
		||||
.globl computeForceLJ
 | 
			
		||||
computeForceLJ:
 | 
			
		||||
# parameter 1: rdi Parameter*
 | 
			
		||||
# parameter 2: rsi Atom*
 | 
			
		||||
# parameter 3: rdx Neighbor*
 | 
			
		||||
        push      rbp
 | 
			
		||||
        push      r12
 | 
			
		||||
        push      r13
 | 
			
		||||
        push      r14
 | 
			
		||||
        push      r15
 | 
			
		||||
        push      rbx
 | 
			
		||||
        mov       r9d, DWORD PTR [4+rsi]                            # r9d <- atom->Nlocal
 | 
			
		||||
        vmovsd    xmm2, QWORD PTR [96+rdi]                          # xmm2 <- param->cutforce
 | 
			
		||||
        vmovsd    xmm1, QWORD PTR [32+rdi]                          # xmm1 <- param->sigma6
 | 
			
		||||
        vmovsd    xmm0, QWORD PTR [24+rdi]                          # xmm0 <- param->epsilon
 | 
			
		||||
        mov       r13, QWORD PTR [64+rsi]                           # r13 <- atom->fx
 | 
			
		||||
        mov       r14, QWORD PTR [72+rsi]                           # r14 <- atom->fy
 | 
			
		||||
        mov       rdi, QWORD PTR [80+rsi]                           # rdi <- atom->fz
 | 
			
		||||
        test      r9d, r9d                                          # atom->Nlocal <= 0
 | 
			
		||||
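        jle       ..atom_loop_exit                                  # NOTE(review): target reads spill slots [-8+rsp]/[-16+rsp]/[-32+rsp] and r10, which are only initialized further below -- confirm Nlocal <= 0 cannot occur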
 | 
			
		||||
        xor       r10d, r10d                                        # r10d <- 0
 | 
			
		||||
        mov       ecx, r9d                                          # ecx <- atom->Nlocal
 | 
			
		||||
        xor       r8d, r8d                                          # r8d <- 0
 | 
			
		||||
        mov       r11d, 1                                           # r11d <- 1
 | 
			
		||||
        xor       eax, eax                                          # eax <- 0
 | 
			
		||||
        shr       ecx, 1                                            # ecx <- atom->Nlocal >> 1
 | 
			
		||||
        je        ..zero_last_element                               # ecx == 0
 | 
			
		||||
 | 
			
		||||
# Init forces to zero loop (unroll factor = 2)
 | 
			
		||||
..init_force_loop:
 | 
			
		||||
        mov       QWORD PTR [r8+r13], rax                           # fx[i] <- 0
 | 
			
		||||
        mov       QWORD PTR [r8+r14], rax                           # fy[i] <- 0
 | 
			
		||||
        mov       QWORD PTR [r8+rdi], rax                           # fz[i] <- 0
 | 
			
		||||
        mov       QWORD PTR [8+r8+r13], rax                         # fx[i + 1] <- 0
 | 
			
		||||
        mov       QWORD PTR [8+r8+r14], rax                         # fy[i + 1] <- 0
 | 
			
		||||
        mov       QWORD PTR [8+r8+rdi], rax                         # fz[i + 1] <- 0
 | 
			
		||||
        add       r8, 16                                            # byte offset += 16 (two doubles per iteration)
 | 
			
		||||
        inc       r10                                               # i++
 | 
			
		||||
        cmp       r10, rcx                                          # i < (Nlocal >> 1)
 | 
			
		||||
        jb        ..init_force_loop
 | 
			
		||||
 | 
			
		||||
# Trick to make r11d contain value of last element to be zeroed plus 1
 | 
			
		||||
# Maybe we can directly put r10+r10 here and zero r11d above, then remove the -1 below
 | 
			
		||||
        lea       r11d, DWORD PTR [1+r10+r10]                       # r11d <- i * 2 + 1
 | 
			
		||||
..zero_last_element:
 | 
			
		||||
        lea       ecx, DWORD PTR [-1+r11]                           # ecx <- i * 2
 | 
			
		||||
        cmp       ecx, r9d                                          # i >= Nlocal
 | 
			
		||||
        jae       ..before_atom_loop
 | 
			
		||||
 | 
			
		||||
        # Set last element to zero
 | 
			
		||||
        movsxd    r11, r11d                                         # r11 <- i * 2 + 1 (sign-extended; the -8 displacement below compensates)
 | 
			
		||||
        mov       QWORD PTR [-8+r13+r11*8], rax                     # fx[i] <- 0
 | 
			
		||||
        mov       QWORD PTR [-8+r14+r11*8], rax                     # fy[i] <- 0
 | 
			
		||||
        mov       QWORD PTR [-8+rdi+r11*8], rax                     # fz[i] <- 0
 | 
			
		||||
 | 
			
		||||
# Initialize registers to be used within atom loop
 | 
			
		||||
..before_atom_loop:
 | 
			
		||||
        vmulsd    xmm15, xmm2, xmm2                                 # xmm15 <- cutforcesq
 | 
			
		||||
        vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip]      # ymm18 <- [8, ...]
 | 
			
		||||
        vmulsd    xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip]   # xmm0 <- 48 *  epsilon
 | 
			
		||||
        vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip]      # ymm17 <- [0..7]
 | 
			
		||||
        vmovups   zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip]       # zmm7 <- [0.5, ...]
 | 
			
		||||
        vbroadcastsd zmm16, xmm15                                   # zmm16 <- [cutforcesq, ...]
 | 
			
		||||
        vbroadcastsd zmm15, xmm1                                    # zmm15 <- [param->sigma6, ...]
 | 
			
		||||
        vbroadcastsd zmm14, xmm0                                    # zmm14 <- [48 * epsilon, ...]
 | 
			
		||||
        movsxd    r9, r9d                                           # r9 <- atom->Nlocal
 | 
			
		||||
        xor       r10d, r10d                                        # r10d <- 0 (i)
 | 
			
		||||
        mov       rcx, QWORD PTR [24+rdx]                           # rcx <- neighbor->numneigh
 | 
			
		||||
        mov       r11, QWORD PTR [8+rdx]                            # r11 <- neighbor->neighbors
 | 
			
		||||
        movsxd    r12, DWORD PTR [16+rdx]                           # r12 <- neighbor->maxneighs
 | 
			
		||||
        mov       rdx, QWORD PTR [16+rsi]                           # rdx <- atom->x
 | 
			
		||||
        ### AOS
 | 
			
		||||
        xor       eax, eax
 | 
			
		||||
        ### SOA
 | 
			
		||||
        #mov       rax, QWORD PTR [24+rsi]                          # rax <- atom->y
 | 
			
		||||
        #mov       rsi, QWORD PTR [32+rsi]                          # rsi <- atom->z
 | 
			
		||||
        ###
 | 
			
		||||
        shl       r12, 2                                            # r12 <- neighbor->maxneighs * 4
 | 
			
		||||
 | 
			
		||||
        # Register spilling
 | 
			
		||||
        mov       QWORD PTR [-32+rsp], r9                           # [-32+rsp] <- atom->Nlocal
 | 
			
		||||
        mov       QWORD PTR [-24+rsp], rcx                          # [-24+rsp] <- neighbor->numneigh
 | 
			
		||||
        mov       QWORD PTR [-16+rsp], r14                          # [-16+rsp] <- atom->fy
 | 
			
		||||
        mov       QWORD PTR [-8+rsp], r13                           # [-8+rsp] <- atom->fx
 | 
			
		||||
        mov       QWORD PTR [-40+rsp], r15                          # [-40+rsp] <- r15
 | 
			
		||||
        mov       QWORD PTR [-48+rsp], rbx                          # [-48+rsp] <- rbx
 | 
			
		||||
        #sub       rsp, 64
 | 
			
		||||
        #call      getTimeStamp                                      # xmm0 <- getTimeStamp()
 | 
			
		||||
        #vmovsd    QWORD PTR [-56+rsp], xmm0                         # [-56+rsp] <- xmm0 [spill]
 | 
			
		||||
        #add       rsp, 64
 | 
			
		||||
 | 
			
		||||
..atom_loop_begin:
 | 
			
		||||
        mov       rcx, QWORD PTR [-24+rsp]                          # rcx <- neighbor->numneigh
 | 
			
		||||
        vxorpd    xmm25, xmm25, xmm25                               # xmm25 <- 0 (fix)
 | 
			
		||||
        vmovapd   xmm20, xmm25                                      # xmm20 <- 0 (fiy)
 | 
			
		||||
        mov       r13d, DWORD PTR [rcx+r10*4]                       # r13d <- neighbor->numneigh[i] (numneighs)
 | 
			
		||||
        vmovapd   xmm4, xmm20                                       # xmm4 <- 0 (fiz)
 | 
			
		||||
 | 
			
		||||
        ### AOS
 | 
			
		||||
        vmovsd    xmm8, QWORD PTR[rdx+rax]                          # xmm8 <- atom->x[i * 3]
 | 
			
		||||
        vmovsd    xmm9, QWORD PTR[8+rdx+rax]                        # xmm9 <- atom->x[i * 3 + 1]
 | 
			
		||||
        vmovsd    xmm10, QWORD PTR[16+rdx+rax]                      # xmm10 <- atom->x[i * 3 + 2]
 | 
			
		||||
        ### SOA
 | 
			
		||||
        #vmovsd    xmm8, QWORD PTR [rdx+r10*8]                      # xmm8 <- atom->x[i]
 | 
			
		||||
        #vmovsd    xmm9, QWORD PTR [rax+r10*8]                      # xmm9 <- atom->y[i]
 | 
			
		||||
        #vmovsd    xmm10, QWORD PTR [rsi+r10*8]                     # xmm10 <- atom->z[i]
 | 
			
		||||
        ###
 | 
			
		||||
        vbroadcastsd zmm0, xmm8                                     # zmm0 <- atom_x(i)
 | 
			
		||||
        vbroadcastsd zmm1, xmm9                                     # zmm1 <- atom_y(i)
 | 
			
		||||
        vbroadcastsd zmm2, xmm10                                    # zmm2 <- atom_z(i)
 | 
			
		||||
        test      r13d, r13d                                        # numneighs <= 0
 | 
			
		||||
        jle       ..atom_loop_exit
 | 
			
		||||
 | 
			
		||||
        vpxord    zmm13, zmm13, zmm13                               # zmm13 <- 0 (fix)
 | 
			
		||||
        vmovaps   zmm12, zmm13                                      # zmm12 <- 0 (fiy)
 | 
			
		||||
        vmovaps   zmm11, zmm12                                      # zmm11 <- 0 (fiz)
 | 
			
		||||
        mov       rcx, r12                                          # rcx <- neighbor->maxneighs * 4
 | 
			
		||||
        imul      rcx, r10                                          # rcx <- neighbor->maxneighs * 4 * i
 | 
			
		||||
        add       rcx, r11                                          # rcx <- &neighbor->neighbors[neighbor->maxneighs * i]
 | 
			
		||||
        xor       r9d, r9d                                          # r9d <- 0 (k)
 | 
			
		||||
        mov       r14d, r13d                                        # r14d <- numneighs
 | 
			
		||||
        cmp       r14d, 8
 | 
			
		||||
        jl        ..compute_forces_remainder
 | 
			
		||||
 | 
			
		||||
..compute_forces:
 | 
			
		||||
        vpcmpeqb  k1, xmm0, xmm0
 | 
			
		||||
        vpcmpeqb  k2, xmm0, xmm0
 | 
			
		||||
        vpcmpeqb  k3, xmm0, xmm0
 | 
			
		||||
        vmovdqu   ymm3, YMMWORD PTR [rcx+r9*4]
 | 
			
		||||
        vpxord    zmm5, zmm5, zmm5
 | 
			
		||||
        vpxord    zmm6, zmm6, zmm6
 | 
			
		||||
 | 
			
		||||
        ### AOS
 | 
			
		||||
        vpaddd     ymm4, ymm3, ymm3
 | 
			
		||||
        vpaddd     ymm3, ymm3, ymm4
 | 
			
		||||
        vpxord     zmm4, zmm4, zmm4
 | 
			
		||||
        vgatherdpd zmm4{k1}, [rdx+ymm3*8]
 | 
			
		||||
        vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]
 | 
			
		||||
        vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]
 | 
			
		||||
        ### SOA
 | 
			
		||||
        #vpxord     zmm4, zmm4, zmm4
 | 
			
		||||
        #vgatherdpd zmm5{k2}, [rax+ymm3*8]
 | 
			
		||||
        #vgatherdpd zmm4{k1}, [rdx+ymm3*8]
 | 
			
		||||
        #vgatherdpd zmm6{k3}, [rsi+ymm3*8]
 | 
			
		||||
        ###
 | 
			
		||||
 | 
			
		||||
        vsubpd    zmm29, zmm1, zmm5                                 # zmm29 <- atom_y(i) - atom_y(j) -- dely
 | 
			
		||||
        vsubpd    zmm28, zmm0, zmm4                                 # zmm28 <- atom_x(i) - atom_x(j) -- delx
 | 
			
		||||
        vsubpd    zmm31, zmm2, zmm6                                 # zmm31 <- atom_z(i) - atom_z(j) -- delz
 | 
			
		||||
        vmulpd    zmm20, zmm29, zmm29                               # zmm20 <- dely * dely
 | 
			
		||||
        vfmadd231pd zmm20, zmm28, zmm28                             # zmm20 <- dely * dely + delx * delx
 | 
			
		||||
        vfmadd231pd zmm20, zmm31, zmm31                             # zmm20 <- zmm20 + delz * delz --  rsq
 | 
			
		||||
 | 
			
		||||
        # Cutoff radius condition
 | 
			
		||||
        vrcp14pd  zmm27, zmm20                                      # zmm27 <- 1.0 / rsq (sr2)
 | 
			
		||||
        vcmppd    k5, zmm20, zmm16, 1                               # k5 <- rsq < cutforcesq
 | 
			
		||||
        vmulpd    zmm22, zmm27, zmm15                               # zmm22 <-  sr2 * sigma6
 | 
			
		||||
        vmulpd    zmm24, zmm27, zmm14                               # zmm24 <- 48.0 * epsilon * sr2
 | 
			
		||||
        vmulpd    zmm25, zmm27, zmm22                               # zmm25 <- sr2 * sigma6 * sr2
 | 
			
		||||
        vmulpd    zmm23, zmm27, zmm25                               # zmm23 <- sr2 * sigma6 * sr2 * sr2
 | 
			
		||||
        vfmsub213pd zmm27, zmm25, zmm7                              # zmm27 <- sr2 * sigma6 * sr2 * sr2 - 0.5
 | 
			
		||||
        vmulpd    zmm26, zmm23, zmm24                               # zmm26 <- 48.0 * epsilon * sr2 * (sr2 * sigma6 * sr2 * sr2)
 | 
			
		||||
        vmulpd    zmm30, zmm26, zmm27                               # zmm30 <- force
 | 
			
		||||
        vfmadd231pd zmm13{k5}, zmm30, zmm28                         # fix += force * delx
 | 
			
		||||
        vfmadd231pd zmm12{k5}, zmm30, zmm29                         # fiy += force * dely
 | 
			
		||||
        vfmadd231pd zmm11{k5}, zmm30, zmm31                         # fiz += force * delz
 | 
			
		||||
        sub       r14d, 8
 | 
			
		||||
        add       r9, 8
 | 
			
		||||
        cmp       r14d, 8
 | 
			
		||||
        jge       ..compute_forces
 | 
			
		||||
 | 
			
		||||
# Check if there are remaining neighbors to be computed
 | 
			
		||||
..compute_forces_remainder:
 | 
			
		||||
        test      r14d, r14d
 | 
			
		||||
        jle       ..sum_up_forces
 | 
			
		||||
 | 
			
		||||
        vpbroadcastd ymm4, r14d
 | 
			
		||||
        vpcmpgtd  k1, ymm4, ymm17
 | 
			
		||||
        kmovw     r15d, k1
 | 
			
		||||
        vmovdqu32 ymm3{k1}{z}, YMMWORD PTR [rcx+r9*4]
 | 
			
		||||
        kmovw     k2, k1
 | 
			
		||||
        kmovw     k3, k1
 | 
			
		||||
        vpxord    zmm5, zmm5, zmm5
 | 
			
		||||
        vpxord    zmm6, zmm6, zmm6
 | 
			
		||||
 | 
			
		||||
        ### AOS
 | 
			
		||||
        vpaddd     ymm4, ymm3, ymm3
 | 
			
		||||
        vpaddd     ymm3, ymm3, ymm4
 | 
			
		||||
        vpxord     zmm4, zmm4, zmm4
 | 
			
		||||
        vgatherdpd zmm4{k1}, [rdx+ymm3*8]
 | 
			
		||||
        vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]
 | 
			
		||||
        vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]
 | 
			
		||||
        #### SOA
 | 
			
		||||
        #vpxord     zmm4, zmm4, zmm4
 | 
			
		||||
        #vgatherdpd zmm5{k2}, [rax+ymm3*8]
 | 
			
		||||
        #vgatherdpd zmm4{k1}, [rdx+ymm3*8]
 | 
			
		||||
        #vgatherdpd zmm6{k3}, [rsi+ymm3*8]
 | 
			
		||||
        ###
 | 
			
		||||
 | 
			
		||||
        vsubpd    zmm29, zmm1, zmm5                                 # zmm29 <- atom_y(i) - atom_y(j) -- dely
 | 
			
		||||
        vsubpd    zmm28, zmm0, zmm4                                 # zmm28 <- atom_x(i) - atom_x(j) -- delx
 | 
			
		||||
        vsubpd    zmm31, zmm2, zmm6                                 # zmm31 <- atom_z(i) - atom_z(j) -- delz
 | 
			
		||||
        vmulpd    zmm20, zmm29, zmm29                               # zmm20 <- dely * dely
 | 
			
		||||
        vfmadd231pd zmm20, zmm28, zmm28                             # zmm20 <- dely * dely + delx * delx
 | 
			
		||||
        vfmadd231pd zmm20, zmm31, zmm31                             # zmm20 <- zmm20 + delz * delz --  rsq
 | 
			
		||||
 | 
			
		||||
        # Cutoff radius condition
 | 
			
		||||
        vrcp14pd  zmm27, zmm20                                      # zmm27 <- 1.0 / rsq (sr2)
 | 
			
		||||
        vcmppd    k5, zmm20, zmm16, 1                               # k5 <- rsq < cutforcesq
 | 
			
		||||
        kmovw     r9d, k5                                           # r9d <- rsq < cutforcesq
 | 
			
		||||
        and       r15d, r9d                                         # r15d <- rsq < cutforcesq && k < numneighs
 | 
			
		||||
        kmovw     k3, r15d                                          # k3 <- rsq < cutforcesq && k < numneighs
 | 
			
		||||
        vmulpd    zmm22, zmm27, zmm15                               # zmm22 <-  sr2 * sigma6
 | 
			
		||||
        vmulpd    zmm24, zmm27, zmm14                               # zmm24 <- 48.0 * epsilon * sr2
 | 
			
		||||
        vmulpd    zmm25, zmm27, zmm22                               # zmm25 <- sr2 * sigma6 * sr2
 | 
			
		||||
        vmulpd    zmm23, zmm27, zmm25                               # zmm23 <- sr2 * sigma6 * sr2 * sr2
 | 
			
		||||
        vfmsub213pd zmm27, zmm25, zmm7                              # zmm27 <- sr2 * sigma6 * sr2 * sr2 - 0.5
 | 
			
		||||
        vmulpd    zmm26, zmm23, zmm24                               # zmm26 <- 48.0 * epsilon * sr2 * (sr2 * sigma6 * sr2 * sr2)
 | 
			
		||||
        vmulpd    zmm30, zmm26, zmm27                               # zmm30 <- force
 | 
			
		||||
        vfmadd231pd zmm13{k3}, zmm30, zmm28                         # fix += force * delx
 | 
			
		||||
        vfmadd231pd zmm12{k3}, zmm30, zmm29                         # fiy += force * dely
 | 
			
		||||
        vfmadd231pd zmm11{k3}, zmm30, zmm31                         # fiz += force * delz
 | 
			
		||||
 | 
			
		||||
# Forces are currently separated in different lanes of zmm registers, hence it is necessary to permute
 | 
			
		||||
# and add them (reduction) to obtain the final contribution for the current atom
 | 
			
		||||
..sum_up_forces:
 | 
			
		||||
        vmovups   zmm10, ZMMWORD PTR .L_2il0floatpacket.6[rip]
 | 
			
		||||
        vpermd    zmm0, zmm10, zmm11
 | 
			
		||||
        vpermd    zmm5, zmm10, zmm12
 | 
			
		||||
        vpermd    zmm21, zmm10, zmm13
 | 
			
		||||
        vaddpd    zmm11, zmm0, zmm11
 | 
			
		||||
        vaddpd    zmm12, zmm5, zmm12
 | 
			
		||||
        vaddpd    zmm13, zmm21, zmm13
 | 
			
		||||
        vpermpd   zmm1, zmm11, 78
 | 
			
		||||
        vpermpd   zmm6, zmm12, 78
 | 
			
		||||
        vpermpd   zmm22, zmm13, 78
 | 
			
		||||
        vaddpd    zmm2, zmm11, zmm1
 | 
			
		||||
        vaddpd    zmm8, zmm12, zmm6
 | 
			
		||||
        vaddpd    zmm23, zmm13, zmm22
 | 
			
		||||
        vpermpd   zmm3, zmm2, 177
 | 
			
		||||
        vpermpd   zmm9, zmm8, 177
 | 
			
		||||
        vpermpd   zmm24, zmm23, 177
 | 
			
		||||
        vaddpd    zmm4, zmm2, zmm3
 | 
			
		||||
        vaddpd    zmm20, zmm8, zmm9
 | 
			
		||||
        vaddpd    zmm25, zmm23, zmm24
 | 
			
		||||
 | 
			
		||||
..atom_loop_exit:
 | 
			
		||||
        mov       rcx, QWORD PTR [-8+rsp]                       #84.9[spill]
 | 
			
		||||
        mov       rbx, QWORD PTR [-16+rsp]                      #85.9[spill]
 | 
			
		||||
 | 
			
		||||
        ### AOS
 | 
			
		||||
        add       rax, 24
 | 
			
		||||
        ###
 | 
			
		||||
 | 
			
		||||
        vaddsd    xmm0, xmm25, QWORD PTR [rcx+r10*8]            #84.9
 | 
			
		||||
        vmovsd    QWORD PTR [rcx+r10*8], xmm0                   #84.9
 | 
			
		||||
        vaddsd    xmm1, xmm20, QWORD PTR [rbx+r10*8]            #85.9
 | 
			
		||||
        vmovsd    QWORD PTR [rbx+r10*8], xmm1                   #85.9
 | 
			
		||||
        vaddsd    xmm2, xmm4, QWORD PTR [rdi+r10*8]             #86.9
 | 
			
		||||
        vmovsd    QWORD PTR [rdi+r10*8], xmm2                   #86.9
 | 
			
		||||
        inc       r10                                           #55.5
 | 
			
		||||
        cmp       r10, QWORD PTR [-32+rsp]                      #55.5[spill]
 | 
			
		||||
        jb        ..atom_loop_begin
 | 
			
		||||
        vzeroupper                                              #93.12
 | 
			
		||||
        vxorpd    xmm0, xmm0, xmm0                              #93.12
 | 
			
		||||
        #call      getTimeStamp                                  # xmm0 <- getTimeStamp()
 | 
			
		||||
        #vsubsd    xmm0, xmm0, QWORD PTR [-56+rsp]               # xmm0 <- E-S
 | 
			
		||||
        pop       rbx
 | 
			
		||||
        pop       r15
 | 
			
		||||
        pop       r14                                           #93.12
 | 
			
		||||
        pop       r13                                           #93.12
 | 
			
		||||
        pop       r12                                           #93.12
 | 
			
		||||
        pop       rbp                                           #93.12
 | 
			
		||||
        ret                                                     #93.12
 | 
			
		||||
 | 
			
		||||
.type	computeForceLJ,@function
 | 
			
		||||
.size	computeForceLJ,.-computeForceLJ
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
..LNcomputeForce.0:
 | 
			
		||||
	.data
 | 
			
		||||
# -- End  computeForceLJ
 | 
			
		||||
	.section .rodata, "a"
 | 
			
		||||
	.align 64
 | 
			
		||||
	.align 64
 | 
			
		||||
.L_2il0floatpacket.2:
 | 
			
		||||
	.long	0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
 | 
			
		||||
	.type	.L_2il0floatpacket.2,@object
 | 
			
		||||
	.size	.L_2il0floatpacket.2,64
 | 
			
		||||
	.align 64
 | 
			
		||||
.L_2il0floatpacket.4:
 | 
			
		||||
	.long	0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
 | 
			
		||||
	.type	.L_2il0floatpacket.4,@object
 | 
			
		||||
	.size	.L_2il0floatpacket.4,64
 | 
			
		||||
	.align 64
 | 
			
		||||
.L_2il0floatpacket.6:
 | 
			
		||||
	.long	0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
 | 
			
		||||
	.type	.L_2il0floatpacket.6,@object
 | 
			
		||||
	.size	.L_2il0floatpacket.6,64
 | 
			
		||||
	.align 32
 | 
			
		||||
.L_2il0floatpacket.0:
 | 
			
		||||
	.long	0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
 | 
			
		||||
	.type	.L_2il0floatpacket.0,@object
 | 
			
		||||
	.size	.L_2il0floatpacket.0,32
 | 
			
		||||
	.align 32
 | 
			
		||||
.L_2il0floatpacket.1:
 | 
			
		||||
	.long	0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
 | 
			
		||||
	.type	.L_2il0floatpacket.1,@object
 | 
			
		||||
	.size	.L_2il0floatpacket.1,32
 | 
			
		||||
	.align 8
 | 
			
		||||
.L_2il0floatpacket.3:
 | 
			
		||||
	.long	0x00000000,0x40480000
 | 
			
		||||
	.type	.L_2il0floatpacket.3,@object
 | 
			
		||||
	.size	.L_2il0floatpacket.3,8
 | 
			
		||||
	.align 8
 | 
			
		||||
.L_2il0floatpacket.5:
 | 
			
		||||
	.long	0x00000000,0x3ff00000
 | 
			
		||||
	.type	.L_2il0floatpacket.5,@object
 | 
			
		||||
	.size	.L_2il0floatpacket.5,8
 | 
			
		||||
	.data
 | 
			
		||||
	.section .note.GNU-stack, ""
 | 
			
		||||
# End
 | 
			
		||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							@@ -21,6 +21,7 @@ typedef struct {
 | 
			
		||||
    char* input_file;
 | 
			
		||||
    char* vtk_file;
 | 
			
		||||
    char* xtc_file;
 | 
			
		||||
    char* write_atom_file;
 | 
			
		||||
    MD_FLOAT epsilon;
 | 
			
		||||
    MD_FLOAT sigma;
 | 
			
		||||
    MD_FLOAT sigma6;
 | 
			
		||||
 
 | 
			
		||||
@@ -48,11 +48,13 @@ static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_S
 | 
			
		||||
    t2 = _mm256_permute2f128_pd(t0, t1, 0x21);
 | 
			
		||||
    t0 = _mm256_add_pd(t0, t2);
 | 
			
		||||
    t1 = _mm256_add_pd(t1, t2);
 | 
			
		||||
    t0 = _mm256_blend_pd(t0, t1, 0b1100);
 | 
			
		||||
    t0 = _mm256_blend_pd(t0, t1, 0xC);
 | 
			
		||||
    //t0 = _mm256_blend_pd(t0, t1, 0b1100);
 | 
			
		||||
    t1 = _mm256_add_pd(t0, _mm256_load_pd(m));
 | 
			
		||||
    _mm256_store_pd(m, t1);
 | 
			
		||||
 | 
			
		||||
    t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101));
 | 
			
		||||
    t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0x5));
 | 
			
		||||
    //t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101));
 | 
			
		||||
    a0 = _mm256_castpd256_pd128(t0);
 | 
			
		||||
    a1 = _mm256_extractf128_pd(t0, 0x1);
 | 
			
		||||
    a0 = _mm_add_sd(a0, a1);
 | 
			
		||||
@@ -91,7 +93,7 @@ static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Functions used in LAMMPS kernel
 | 
			
		||||
static inline MD_SIMD_FLOAT simd_gather(MD_SIMD_INT vidx, const MD_FLOAT *m, int s) { return _mm256_i32gather_pd(m, vidx, s); }
 | 
			
		||||
#define simd_gather(vidx, m, s)     _mm256_i32gather_pd(m, vidx, s);
 | 
			
		||||
static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm_set1_epi32(scalar); }
 | 
			
		||||
static inline MD_SIMD_INT simd_int_zero() { return _mm_setzero_si128(); }
 | 
			
		||||
static inline MD_SIMD_INT simd_int_seq() { return _mm_set_epi32(3, 2, 1, 0); }
 | 
			
		||||
 
 | 
			
		||||
@@ -7,8 +7,8 @@
 | 
			
		||||
#ifndef __TIMING_H_
 | 
			
		||||
#define __TIMING_H_
 | 
			
		||||
 | 
			
		||||
extern double getTimeStamp();
 | 
			
		||||
extern double getTimeResolution();
 | 
			
		||||
extern double getTimeStamp_();
 | 
			
		||||
extern double getTimeStamp(void);
 | 
			
		||||
extern double getTimeResolution(void);
 | 
			
		||||
extern double getTimeStamp_(void);
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -39,8 +39,8 @@ extern double myrandom(int*);
 | 
			
		||||
extern void random_reset(int *seed, int ibase, double *coord);
 | 
			
		||||
extern int str2ff(const char *string);
 | 
			
		||||
extern const char* ff2str(int ff);
 | 
			
		||||
extern int get_num_threads();
 | 
			
		||||
extern void readline(char *line, FILE *fp);
 | 
			
		||||
extern void debug_printf(const char *format, ...);
 | 
			
		||||
extern int get_cuda_num_threads();
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -17,6 +17,7 @@ void initParameter(Parameter *param) {
 | 
			
		||||
    param->vtk_file = NULL;
 | 
			
		||||
    param->xtc_file = NULL;
 | 
			
		||||
    param->eam_file = NULL;
 | 
			
		||||
    param->write_atom_file = NULL;
 | 
			
		||||
    param->force_field = FF_LJ;
 | 
			
		||||
    param->epsilon = 1.0;
 | 
			
		||||
    param->sigma = 1.0;
 | 
			
		||||
@@ -169,6 +170,11 @@ void printParameter(Parameter *param) {
 | 
			
		||||
    printf("\tNumber of timesteps: %d\n", param->ntimes);
 | 
			
		||||
    printf("\tReport stats every (timesteps): %d\n", param->nstat);
 | 
			
		||||
    printf("\tReneighbor every (timesteps): %d\n", param->reneigh_every);
 | 
			
		||||
    #ifdef SORT_ATOMS
 | 
			
		||||
    printf("\tSort atoms when reneighboring: yes\n");
 | 
			
		||||
    #else
 | 
			
		||||
    printf("\tSort atoms when reneighboring: no\n");
 | 
			
		||||
    #endif
 | 
			
		||||
    printf("\tPrune every (timesteps): %d\n", param->prune_every);
 | 
			
		||||
    printf("\tOutput positions every (timesteps): %d\n", param->x_out_every);
 | 
			
		||||
    printf("\tOutput velocities every (timesteps): %d\n", param->v_out_every);
 | 
			
		||||
 
 | 
			
		||||
@@ -79,7 +79,7 @@ const char* ff2str(int ff) {
 | 
			
		||||
    return "invalid";
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
int get_num_threads() {
 | 
			
		||||
int get_cuda_num_threads() {
 | 
			
		||||
    const char *num_threads_env = getenv("NUM_THREADS");
 | 
			
		||||
    return (num_threads_env == NULL) ? 32 : atoi(num_threads_env);
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -3,7 +3,7 @@ TAG ?= ICC
 | 
			
		||||
# Instruction set (SSE/AVX/AVX_FMA/AVX2/AVX512)
 | 
			
		||||
ISA ?= AVX512
 | 
			
		||||
# Optimization scheme (lammps/gromacs/clusters_per_bin)
 | 
			
		||||
OPT_SCHEME ?= gromacs
 | 
			
		||||
OPT_SCHEME ?= lammps
 | 
			
		||||
# Enable likwid (true or false)
 | 
			
		||||
ENABLE_LIKWID ?= true
 | 
			
		||||
# SP or DP
 | 
			
		||||
@@ -15,6 +15,8 @@ ASM_SYNTAX ?= ATT
 | 
			
		||||
# Debug
 | 
			
		||||
DEBUG ?= false
 | 
			
		||||
 | 
			
		||||
# Sort atoms when reneighboring (true or false)
 | 
			
		||||
SORT_ATOMS ?= true
 | 
			
		||||
# Explicitly store and load atom types (true or false)
 | 
			
		||||
EXPLICIT_TYPES ?= false
 | 
			
		||||
# Trace memory addresses for cache simulator (true or false)
 | 
			
		||||
@@ -36,7 +38,7 @@ USE_REFERENCE_VERSION ?= false
 | 
			
		||||
# Enable XTC output
 | 
			
		||||
XTC_OUTPUT ?= false
 | 
			
		||||
# Check if cj is local when decreasing reaction force
 | 
			
		||||
HALF_NEIGHBOR_LISTS_CHECK_CJ ?= false
 | 
			
		||||
HALF_NEIGHBOR_LISTS_CHECK_CJ ?= true
 | 
			
		||||
 | 
			
		||||
# Configurations for CUDA
 | 
			
		||||
# Use CUDA host memory to optimize transfers
 | 
			
		||||
 
 | 
			
		||||
@@ -6,7 +6,7 @@ dt 0.001
 | 
			
		||||
temp 80
 | 
			
		||||
x_out_freq 500
 | 
			
		||||
v_out_freq 5
 | 
			
		||||
cutforce 0.9
 | 
			
		||||
skin 0.05
 | 
			
		||||
cutforce 1.8
 | 
			
		||||
skin 0.1
 | 
			
		||||
reneigh_every 100
 | 
			
		||||
nstat 125000
 | 
			
		||||
 
 | 
			
		||||
 Submodule gather-bench deleted from 2f654cb043
									
								
							@@ -45,7 +45,7 @@ static inline void gmx_load_simd_4xn_interactions(
 | 
			
		||||
double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
 | 
			
		||||
    DEBUG_MESSAGE("computeForceLJ begin\n");
 | 
			
		||||
    int Nlocal = atom->Nlocal;
 | 
			
		||||
    NeighborCluster* neighs;
 | 
			
		||||
    int *neighs;
 | 
			
		||||
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
 | 
			
		||||
    MD_FLOAT sigma6 = param->sigma6;
 | 
			
		||||
    MD_FLOAT epsilon = param->epsilon;
 | 
			
		||||
@@ -66,7 +66,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
 | 
			
		||||
    {
 | 
			
		||||
    LIKWID_MARKER_START("force");
 | 
			
		||||
 | 
			
		||||
    #pragma omp for
 | 
			
		||||
    #pragma omp for schedule(runtime)
 | 
			
		||||
    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
 | 
			
		||||
        int ci_cj0 = CJ0_FROM_CI(ci);
 | 
			
		||||
        int ci_cj1 = CJ1_FROM_CI(ci);
 | 
			
		||||
@@ -77,7 +77,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
 | 
			
		||||
        int numneighs = neighbor->numneigh[ci];
 | 
			
		||||
 | 
			
		||||
        for(int k = 0; k < numneighs; k++) {
 | 
			
		||||
            int cj = neighs[k].cj;
 | 
			
		||||
            int cj = neighs[k];
 | 
			
		||||
            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
 | 
			
		||||
            int any = 0;
 | 
			
		||||
            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
 | 
			
		||||
@@ -158,7 +158,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
 | 
			
		||||
double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
 | 
			
		||||
    DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
 | 
			
		||||
    int Nlocal = atom->Nlocal;
 | 
			
		||||
    NeighborCluster* neighs;
 | 
			
		||||
    int *neighs;
 | 
			
		||||
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
 | 
			
		||||
    MD_FLOAT sigma6 = param->sigma6;
 | 
			
		||||
    MD_FLOAT epsilon = param->epsilon;
 | 
			
		||||
@@ -213,7 +213,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
 | 
			
		||||
    #endif
 | 
			
		||||
    */
 | 
			
		||||
 | 
			
		||||
    #pragma omp for
 | 
			
		||||
    #pragma omp for schedule(runtime)
 | 
			
		||||
    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
 | 
			
		||||
        int ci_cj0 = CJ0_FROM_CI(ci);
 | 
			
		||||
        #if CLUSTER_M > CLUSTER_N
 | 
			
		||||
@@ -240,9 +240,9 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
 | 
			
		||||
        MD_SIMD_FLOAT fiz2 = simd_zero();
 | 
			
		||||
 | 
			
		||||
        for(int k = 0; k < numneighs_masked; k++) {
 | 
			
		||||
            int cj = neighs[k].cj;
 | 
			
		||||
            int cj = neighs[k];
 | 
			
		||||
            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
 | 
			
		||||
            //int imask = neighs[k].imask;
 | 
			
		||||
            //int imask = neighs_imask[k];
 | 
			
		||||
            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
 | 
			
		||||
            MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
 | 
			
		||||
            //MD_SIMD_MASK interact0;
 | 
			
		||||
@@ -331,7 +331,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        for(int k = numneighs_masked; k < numneighs; k++) {
 | 
			
		||||
            int cj = neighs[k].cj;
 | 
			
		||||
            int cj = neighs[k];
 | 
			
		||||
            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
 | 
			
		||||
            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
 | 
			
		||||
            MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
 | 
			
		||||
@@ -401,7 +401,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
 | 
			
		||||
double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
 | 
			
		||||
    DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
 | 
			
		||||
    int Nlocal = atom->Nlocal;
 | 
			
		||||
    NeighborCluster* neighs;
 | 
			
		||||
    int *neighs;
 | 
			
		||||
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
 | 
			
		||||
    MD_FLOAT sigma6 = param->sigma6;
 | 
			
		||||
    MD_FLOAT epsilon = param->epsilon;
 | 
			
		||||
@@ -427,7 +427,7 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
 | 
			
		||||
    {
 | 
			
		||||
    LIKWID_MARKER_START("force");
 | 
			
		||||
 | 
			
		||||
    #pragma omp for
 | 
			
		||||
    #pragma omp for schedule(runtime)
 | 
			
		||||
    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
 | 
			
		||||
        int ci_cj0 = CJ0_FROM_CI(ci);
 | 
			
		||||
        #if CLUSTER_M > CLUSTER_N
 | 
			
		||||
@@ -454,9 +454,8 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
 | 
			
		||||
        MD_SIMD_FLOAT fiz2 = simd_zero();
 | 
			
		||||
 | 
			
		||||
        for(int k = 0; k < numneighs_masked; k++) {
 | 
			
		||||
            int cj = neighs[k].cj;
 | 
			
		||||
            int cj = neighs[k];
 | 
			
		||||
            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
 | 
			
		||||
            int imask = neighs[k].imask;
 | 
			
		||||
            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
 | 
			
		||||
            unsigned int mask0, mask1, mask2, mask3;
 | 
			
		||||
 | 
			
		||||
@@ -507,7 +506,7 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        for(int k = numneighs_masked; k < numneighs; k++) {
 | 
			
		||||
            int cj = neighs[k].cj;
 | 
			
		||||
            int cj = neighs[k];
 | 
			
		||||
            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
 | 
			
		||||
            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
 | 
			
		||||
 | 
			
		||||
@@ -570,7 +569,7 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta
 | 
			
		||||
double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
 | 
			
		||||
    DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
 | 
			
		||||
    int Nlocal = atom->Nlocal;
 | 
			
		||||
    NeighborCluster* neighs;
 | 
			
		||||
    int *neighs;
 | 
			
		||||
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
 | 
			
		||||
    MD_FLOAT sigma6 = param->sigma6;
 | 
			
		||||
    MD_FLOAT epsilon = param->epsilon;
 | 
			
		||||
@@ -596,7 +595,7 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
 | 
			
		||||
    {
 | 
			
		||||
    LIKWID_MARKER_START("force");
 | 
			
		||||
 | 
			
		||||
    #pragma omp for
 | 
			
		||||
    #pragma omp for schedule(runtime)
 | 
			
		||||
    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
 | 
			
		||||
        int ci_cj0 = CJ0_FROM_CI(ci);
 | 
			
		||||
        #if CLUSTER_M > CLUSTER_N
 | 
			
		||||
@@ -635,9 +634,8 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
 | 
			
		||||
        MD_SIMD_FLOAT fiz3 = simd_zero();
 | 
			
		||||
 | 
			
		||||
        for(int k = 0; k < numneighs_masked; k++) {
 | 
			
		||||
            int cj = neighs[k].cj;
 | 
			
		||||
            int cj = neighs[k];
 | 
			
		||||
            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
 | 
			
		||||
            int imask = neighs[k].imask;
 | 
			
		||||
            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
 | 
			
		||||
            MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
 | 
			
		||||
            MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
 | 
			
		||||
@@ -741,9 +739,8 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        for(int k = numneighs_masked; k < numneighs; k++) {
 | 
			
		||||
            int cj = neighs[k].cj;
 | 
			
		||||
            int cj = neighs[k];
 | 
			
		||||
            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
 | 
			
		||||
            int imask = neighs[k].imask;
 | 
			
		||||
            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
 | 
			
		||||
            MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
 | 
			
		||||
            MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
 | 
			
		||||
@@ -846,7 +843,7 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
 | 
			
		||||
double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
 | 
			
		||||
    DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
 | 
			
		||||
    int Nlocal = atom->Nlocal;
 | 
			
		||||
    NeighborCluster* neighs;
 | 
			
		||||
    int *neighs;
 | 
			
		||||
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
 | 
			
		||||
    MD_FLOAT sigma6 = param->sigma6;
 | 
			
		||||
    MD_FLOAT epsilon = param->epsilon;
 | 
			
		||||
@@ -872,7 +869,7 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
 | 
			
		||||
    {
 | 
			
		||||
    LIKWID_MARKER_START("force");
 | 
			
		||||
 | 
			
		||||
    #pragma omp for
 | 
			
		||||
    #pragma omp for schedule(runtime)
 | 
			
		||||
    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
 | 
			
		||||
        int ci_cj0 = CJ0_FROM_CI(ci);
 | 
			
		||||
        #if CLUSTER_M > CLUSTER_N
 | 
			
		||||
@@ -911,9 +908,8 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
 | 
			
		||||
        MD_SIMD_FLOAT fiz3 = simd_zero();
 | 
			
		||||
 | 
			
		||||
        for(int k = 0; k < numneighs_masked; k++) {
 | 
			
		||||
            int cj = neighs[k].cj;
 | 
			
		||||
            int cj = neighs[k];
 | 
			
		||||
            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
 | 
			
		||||
            int imask = neighs[k].imask;
 | 
			
		||||
            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
 | 
			
		||||
            MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
 | 
			
		||||
            MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
 | 
			
		||||
@@ -991,9 +987,8 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        for(int k = numneighs_masked; k < numneighs; k++) {
 | 
			
		||||
            int cj = neighs[k].cj;
 | 
			
		||||
            int cj = neighs[k];
 | 
			
		||||
            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
 | 
			
		||||
            int imask = neighs[k].imask;
 | 
			
		||||
            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
 | 
			
		||||
            MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
 | 
			
		||||
            MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
 | 
			
		||||
 
 | 
			
		||||
@@ -25,11 +25,6 @@
 | 
			
		||||
#define NBNXN_INTERACTION_MASK_DIAG_J8_0 0xf0f8fcfeU
 | 
			
		||||
#define NBNXN_INTERACTION_MASK_DIAG_J8_1 0x0080c0e0U
 | 
			
		||||
 | 
			
		||||
typedef struct {
 | 
			
		||||
    int cj;
 | 
			
		||||
    unsigned int imask;
 | 
			
		||||
} NeighborCluster;
 | 
			
		||||
 | 
			
		||||
typedef struct {
 | 
			
		||||
    int every;
 | 
			
		||||
    int ncalls;
 | 
			
		||||
@@ -37,7 +32,8 @@ typedef struct {
 | 
			
		||||
    int* numneigh;
 | 
			
		||||
    int* numneigh_masked;
 | 
			
		||||
    int half_neigh;
 | 
			
		||||
    NeighborCluster* neighbors;
 | 
			
		||||
    int* neighbors;
 | 
			
		||||
    unsigned int* neighbors_imask;
 | 
			
		||||
} Neighbor;
 | 
			
		||||
 | 
			
		||||
extern void initNeighbor(Neighbor*, Parameter*);
 | 
			
		||||
 
 | 
			
		||||
@@ -60,18 +60,15 @@ void init(Parameter *param) {
 | 
			
		||||
    param->eam_file = NULL;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Show debug messages
 | 
			
		||||
#define DEBUG(msg)  printf(msg)
 | 
			
		||||
// Do not show debug messages
 | 
			
		||||
//#define DEBUG(msg)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps) {
 | 
			
		||||
void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps, int masked) {
 | 
			
		||||
    const int maxneighs = nneighs * nreps;
 | 
			
		||||
    const int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
 | 
			
		||||
    const int ncj = atom->Nclusters_local / jfac;
 | 
			
		||||
    const unsigned int imask = NBNXN_INTERACTION_MASK_ALL;
 | 
			
		||||
    neighbor->numneigh = (int*) malloc(atom->Nclusters_max * sizeof(int));
 | 
			
		||||
    neighbor->numneigh_masked = (int*) malloc(atom->Nclusters_max * sizeof(int));
 | 
			
		||||
    neighbor->neighbors = (int*) malloc(atom->Nclusters_max * maxneighs * sizeof(int));
 | 
			
		||||
    neighbor->neighbors_imask = (unsigned int*) malloc(atom->Nclusters_max * maxneighs * sizeof(unsigned int));
 | 
			
		||||
 | 
			
		||||
    if(pattern == P_RAND && ncj <= nneighs) {
 | 
			
		||||
        fprintf(stderr, "Error: P_RAND: Number of j-clusters should be higher than number of j-cluster neighbors per i-cluster!\n");
 | 
			
		||||
@@ -80,6 +77,7 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
 | 
			
		||||
 | 
			
		||||
    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
 | 
			
		||||
        int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
 | 
			
		||||
        unsigned int *neighptr_imask = &(neighbor->neighbors_imask[ci * neighbor->maxneighs]);
 | 
			
		||||
        int j = (pattern == P_SEQ) ? CJ0_FROM_CI(ci) : 0;
 | 
			
		||||
        int m = (pattern == P_SEQ) ? ncj : nneighs;
 | 
			
		||||
        int k = 0;
 | 
			
		||||
@@ -90,6 +88,7 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
 | 
			
		||||
                do {
 | 
			
		||||
                    int cj = rand() % ncj;
 | 
			
		||||
                    neighptr[k] = cj;
 | 
			
		||||
                    neighptr_imask[k] = imask;
 | 
			
		||||
                    found = 0;
 | 
			
		||||
                    for(int l = 0; l < k; l++) {
 | 
			
		||||
                        if(neighptr[l] == cj) {
 | 
			
		||||
@@ -99,6 +98,7 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
 | 
			
		||||
                } while(found == 1);
 | 
			
		||||
            } else {
 | 
			
		||||
                neighptr[k] = j;
 | 
			
		||||
                neighptr_imask[k] = imask;
 | 
			
		||||
                j = (j + 1) % m;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
@@ -106,10 +106,12 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
 | 
			
		||||
        for(int r = 1; r < nreps; r++) {
 | 
			
		||||
            for(int k = 0; k < nneighs; k++) {
 | 
			
		||||
                neighptr[r * nneighs + k] = neighptr[k];
 | 
			
		||||
                neighptr_imask[r * nneighs + k] = neighptr_imask[k];
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        neighbor->numneigh[ci] = nneighs * nreps;
 | 
			
		||||
        neighbor->numneigh_masked[ci] = (masked == 1) ? (nneighs * nreps) : 0;
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -125,12 +127,13 @@ int main(int argc, const char *argv[]) {
 | 
			
		||||
    int niclusters = 256;               // Number of local i-clusters
 | 
			
		||||
    int iclusters_natoms = CLUSTER_M;   // Number of valid atoms within i-clusters
 | 
			
		||||
    int nneighs = 9;                    // Number of j-cluster neighbors per i-cluster
 | 
			
		||||
    int masked = 0;                     // Use masked loop 
 | 
			
		||||
    int nreps = 1;
 | 
			
		||||
    int csv = 0;
 | 
			
		||||
 | 
			
		||||
    LIKWID_MARKER_INIT;
 | 
			
		||||
    LIKWID_MARKER_REGISTER("force");
 | 
			
		||||
    DEBUG("Initializing parameters...\n");
 | 
			
		||||
    DEBUG_MESSAGE("Initializing parameters...\n");
 | 
			
		||||
    init(&param);
 | 
			
		||||
 | 
			
		||||
    for(int i = 0; i < argc; i++) {
 | 
			
		||||
@@ -156,6 +159,10 @@ int main(int argc, const char *argv[]) {
 | 
			
		||||
            param.eam_file = strdup(argv[++i]);
 | 
			
		||||
            continue;
 | 
			
		||||
        }
 | 
			
		||||
        if((strcmp(argv[i], "-m") == 0)) {
 | 
			
		||||
            masked = 1;
 | 
			
		||||
            continue;
 | 
			
		||||
        }
 | 
			
		||||
        if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0)) {
 | 
			
		||||
            param.ntimes = atoi(argv[++i]);
 | 
			
		||||
            continue;
 | 
			
		||||
@@ -206,11 +213,11 @@ int main(int argc, const char *argv[]) {
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if(param.force_field == FF_EAM) {
 | 
			
		||||
        DEBUG("Initializing EAM parameters...\n");
 | 
			
		||||
        DEBUG_MESSAGE("Initializing EAM parameters...\n");
 | 
			
		||||
        initEam(&eam, &param);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    DEBUG("Initializing atoms...\n");
 | 
			
		||||
    DEBUG_MESSAGE("Initializing atoms...\n");
 | 
			
		||||
    initAtom(atom);
 | 
			
		||||
    initStats(&stats);
 | 
			
		||||
 | 
			
		||||
@@ -226,7 +233,7 @@ int main(int argc, const char *argv[]) {
 | 
			
		||||
        atom->cutforcesq[i] = param.cutforce * param.cutforce;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    DEBUG("Creating atoms...\n");
 | 
			
		||||
    DEBUG_MESSAGE("Creating atoms...\n");
 | 
			
		||||
    while(atom->Nmax < niclusters * iclusters_natoms) {
 | 
			
		||||
        growAtom(atom);
 | 
			
		||||
    }
 | 
			
		||||
@@ -281,13 +288,13 @@ int main(int argc, const char *argv[]) {
 | 
			
		||||
        printf("Estimated neighborlist data volume (kB): %.4f\n", estim_neighbors_volume / 1000.0);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    DEBUG("Defining j-clusters...\n");
 | 
			
		||||
    DEBUG_MESSAGE("Defining j-clusters...\n");
 | 
			
		||||
    defineJClusters(atom);
 | 
			
		||||
    DEBUG("Initializing neighbor lists...\n");
 | 
			
		||||
    DEBUG_MESSAGE("Initializing neighbor lists...\n");
 | 
			
		||||
    initNeighbor(&neighbor, &param);
 | 
			
		||||
    DEBUG("Creating neighbor lists...\n");
 | 
			
		||||
    createNeighbors(atom, &neighbor, pattern, nneighs, nreps);
 | 
			
		||||
    DEBUG("Computing forces...\n");
 | 
			
		||||
    DEBUG_MESSAGE("Creating neighbor lists...\n");
 | 
			
		||||
    createNeighbors(atom, &neighbor, pattern, nneighs, nreps, masked);
 | 
			
		||||
    DEBUG_MESSAGE("Computing forces...\n");
 | 
			
		||||
 | 
			
		||||
    double T_accum = 0.0;
 | 
			
		||||
    for(int i = 0; i < param.ntimes; i++) {
 | 
			
		||||
 
 | 
			
		||||
@@ -5,7 +5,9 @@
 | 
			
		||||
 * license that can be found in the LICENSE file.
 | 
			
		||||
 */
 | 
			
		||||
#include <stdio.h>
 | 
			
		||||
#include <string.h>
 | 
			
		||||
#include <math.h>
 | 
			
		||||
#include <omp.h>
 | 
			
		||||
//--
 | 
			
		||||
#include <likwid-marker.h>
 | 
			
		||||
//--
 | 
			
		||||
@@ -117,7 +119,7 @@ int main(int argc, char** argv) {
 | 
			
		||||
 | 
			
		||||
    initParameter(&param);
 | 
			
		||||
    for(int i = 0; i < argc; i++) {
 | 
			
		||||
        if((strcmp(argv[i], "-p") == 0)) {
 | 
			
		||||
        if((strcmp(argv[i], "-p") == 0) || (strcmp(argv[i], "--param") == 0)) {
 | 
			
		||||
            readParameter(&param, argv[++i]);
 | 
			
		||||
            continue;
 | 
			
		||||
        }
 | 
			
		||||
@@ -308,6 +310,30 @@ int main(int argc, char** argv) {
 | 
			
		||||
    printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
 | 
			
		||||
            timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]);
 | 
			
		||||
    printf(HLINE);
 | 
			
		||||
    
 | 
			
		||||
    int nthreads = 0;
 | 
			
		||||
    int chunkSize = 0;
 | 
			
		||||
    omp_sched_t schedKind;
 | 
			
		||||
    char schedType[10];
 | 
			
		||||
#pragma omp parallel
 | 
			
		||||
#pragma omp master
 | 
			
		||||
    {
 | 
			
		||||
	omp_get_schedule(&schedKind, &chunkSize);
 | 
			
		||||
 | 
			
		||||
    	switch (schedKind)
 | 
			
		||||
    	{
 | 
			
		||||
        	case omp_sched_static:  strcpy(schedType, "static"); break;
 | 
			
		||||
        	case omp_sched_dynamic: strcpy(schedType, "dynamic"); break;
 | 
			
		||||
        	case omp_sched_guided:  strcpy(schedType, "guided"); break;
 | 
			
		||||
        	case omp_sched_auto:    strcpy(schedType, "auto"); break;
 | 
			
		||||
    	}
 | 
			
		||||
 | 
			
		||||
    	nthreads = omp_get_max_threads();
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    printf("Num threads: %d\n", nthreads);
 | 
			
		||||
    printf("Schedule: (%s,%d)\n", schedType, chunkSize);
 | 
			
		||||
 | 
			
		||||
    printf("Performance: %.2f million atom updates per second\n",
 | 
			
		||||
            1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
 | 
			
		||||
    #ifdef COMPUTE_STATS
 | 
			
		||||
 
 | 
			
		||||
@@ -58,6 +58,7 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) {
 | 
			
		||||
    neighbor->numneigh = NULL;
 | 
			
		||||
    neighbor->numneigh_masked = NULL;
 | 
			
		||||
    neighbor->neighbors = NULL;
 | 
			
		||||
    neighbor->neighbors_imask = NULL;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void setupNeighbor(Parameter *param, Atom *atom) {
 | 
			
		||||
@@ -229,10 +230,13 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
 | 
			
		||||
    if(atom->Nclusters_local > nmax) {
 | 
			
		||||
        nmax = atom->Nclusters_local;
 | 
			
		||||
        if(neighbor->numneigh) free(neighbor->numneigh);
 | 
			
		||||
        if(neighbor->numneigh_masked) free(neighbor->numneigh_masked);
 | 
			
		||||
        if(neighbor->neighbors) free(neighbor->neighbors);
 | 
			
		||||
        if(neighbor->neighbors_imask) free(neighbor->neighbors_imask);
 | 
			
		||||
        neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
 | 
			
		||||
        neighbor->numneigh_masked = (int*) malloc(nmax * sizeof(int));
 | 
			
		||||
        neighbor->neighbors = (NeighborCluster*) malloc(nmax * neighbor->maxneighs * sizeof(NeighborCluster));
 | 
			
		||||
        neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int));
 | 
			
		||||
        neighbor->neighbors_imask = (unsigned int*) malloc(nmax * neighbor->maxneighs * sizeof(unsigned int));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    MD_FLOAT bbx = 0.5 * (binsizex + binsizex);
 | 
			
		||||
@@ -248,7 +252,8 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
 | 
			
		||||
 | 
			
		||||
        for(int ci = 0; ci < atom->Nclusters_local; ci++) {
 | 
			
		||||
            int ci_cj1 = CJ1_FROM_CI(ci);
 | 
			
		||||
            NeighborCluster *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
 | 
			
		||||
            int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
 | 
			
		||||
            unsigned int *neighptr_imask = &(neighbor->neighbors_imask[ci * neighbor->maxneighs]);
 | 
			
		||||
            int n = 0, nmasked = 0;
 | 
			
		||||
            int ibin = atom->icluster_bin[ci];
 | 
			
		||||
            MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx;
 | 
			
		||||
@@ -324,15 +329,17 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
 | 
			
		||||
                                    imask = get_imask_simd_4xn(1, ci, cj);
 | 
			
		||||
                                    #endif
 | 
			
		||||
 | 
			
		||||
                                    if(imask == NBNXN_INTERACTION_MASK_ALL) {
 | 
			
		||||
                                        neighptr[n].cj = cj;
 | 
			
		||||
                                        neighptr[n].imask = imask;
 | 
			
		||||
                                    } else {
 | 
			
		||||
                                        neighptr[n].cj = neighptr[nmasked].cj;
 | 
			
		||||
                                        neighptr[n].imask = neighptr[nmasked].imask;
 | 
			
		||||
                                        neighptr[nmasked].cj = cj;
 | 
			
		||||
                                        neighptr[nmasked].imask = imask;
 | 
			
		||||
                                        nmasked++;
 | 
			
		||||
                                    if(n < neighbor->maxneighs) {
 | 
			
		||||
                                        if(imask == NBNXN_INTERACTION_MASK_ALL) {
 | 
			
		||||
                                            neighptr[n] = cj;
 | 
			
		||||
                                            neighptr_imask[n] = imask;
 | 
			
		||||
                                        } else {
 | 
			
		||||
                                            neighptr[n] = neighptr[nmasked];
 | 
			
		||||
                                            neighptr_imask[n] = neighptr_imask[nmasked];
 | 
			
		||||
                                            neighptr[nmasked] = cj;
 | 
			
		||||
                                            neighptr_imask[nmasked] = imask;
 | 
			
		||||
                                            nmasked++;
 | 
			
		||||
                                        }
 | 
			
		||||
                                    }
 | 
			
		||||
 | 
			
		||||
                                    n++;
 | 
			
		||||
@@ -357,8 +364,8 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
 | 
			
		||||
            // Fill neighbor list with dummy values to fit vector width
 | 
			
		||||
            if(CLUSTER_N < VECTOR_WIDTH) {
 | 
			
		||||
                while(n % (VECTOR_WIDTH / CLUSTER_N)) {
 | 
			
		||||
                    neighptr[n].cj = atom->dummy_cj; // Last cluster is always a dummy cluster
 | 
			
		||||
                    neighptr[n].imask = 0;
 | 
			
		||||
                    neighptr[n] = atom->dummy_cj; // Last cluster is always a dummy cluster
 | 
			
		||||
                    neighptr_imask[n] = 0;
 | 
			
		||||
                    n++;
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
@@ -375,10 +382,12 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        if(resize) {
 | 
			
		||||
            fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
 | 
			
		||||
            neighbor->maxneighs = new_maxneighs * 1.2;
 | 
			
		||||
            fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
 | 
			
		||||
            free(neighbor->neighbors);
 | 
			
		||||
            neighbor->neighbors = (NeighborCluster*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
 | 
			
		||||
            free(neighbor->neighbors_imask);
 | 
			
		||||
            neighbor->neighbors = (int *) malloc(nmax * neighbor->maxneighs * sizeof(int));
 | 
			
		||||
            neighbor->neighbors_imask = (unsigned int *) malloc(nmax * neighbor->maxneighs * sizeof(unsigned int));
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
@@ -433,20 +442,21 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
 | 
			
		||||
    MD_FLOAT cutsq = cutneighsq;
 | 
			
		||||
 | 
			
		||||
    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
 | 
			
		||||
        NeighborCluster *neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
 | 
			
		||||
        int *neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
 | 
			
		||||
        unsigned int *neighs_imask = &neighbor->neighbors_imask[ci * neighbor->maxneighs];
 | 
			
		||||
        int numneighs = neighbor->numneigh[ci];
 | 
			
		||||
        int numneighs_masked = neighbor->numneigh_masked[ci];
 | 
			
		||||
        int k = 0;
 | 
			
		||||
 | 
			
		||||
        // Remove dummy clusters if necessary
 | 
			
		||||
        if(CLUSTER_N < VECTOR_WIDTH) {
 | 
			
		||||
            while(neighs[numneighs - 1].cj == atom->dummy_cj) {
 | 
			
		||||
            while(neighs[numneighs - 1] == atom->dummy_cj) {
 | 
			
		||||
                numneighs--;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        while(k < numneighs) {
 | 
			
		||||
            int cj = neighs[k].cj;
 | 
			
		||||
            int cj = neighs[k];
 | 
			
		||||
            if(atomDistanceInRange(atom, ci, cj, cutsq)) {
 | 
			
		||||
                k++;
 | 
			
		||||
            } else {
 | 
			
		||||
@@ -461,8 +471,8 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
 | 
			
		||||
        // Readd dummy clusters if necessary
 | 
			
		||||
        if(CLUSTER_N < VECTOR_WIDTH) {
 | 
			
		||||
            while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) {
 | 
			
		||||
                neighs[numneighs].cj = atom->dummy_cj; // Last cluster is always a dummy cluster
 | 
			
		||||
                neighs[numneighs].imask = 0;
 | 
			
		||||
                neighs[numneighs] = atom->dummy_cj; // Last cluster is always a dummy cluster
 | 
			
		||||
                neighs_imask[numneighs] = 0;
 | 
			
		||||
                numneighs++;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 
 | 
			
		||||
@@ -13,7 +13,8 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
 | 
			
		||||
    MEM_TRACER_INIT;
 | 
			
		||||
    INDEX_TRACER_INIT;
 | 
			
		||||
    int Nlocal = atom->Nlocal;
 | 
			
		||||
    NeighborCluster* neighs;
 | 
			
		||||
    int *neighs;
 | 
			
		||||
    unsigned int *neighs_imask;
 | 
			
		||||
    //MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
 | 
			
		||||
 | 
			
		||||
    INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
 | 
			
		||||
@@ -34,7 +35,7 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
 | 
			
		||||
        DIST_TRACE(neighs, numneighs);
 | 
			
		||||
 | 
			
		||||
        for(int k = 0; k < numneighs; k++) {
 | 
			
		||||
            int j = neighs[k].cj;
 | 
			
		||||
            int j = neighs[k];
 | 
			
		||||
            MEM_TRACE(j, 'R');
 | 
			
		||||
            MEM_TRACE(atom_x(j), 'R');
 | 
			
		||||
            MEM_TRACE(atom_y(j), 'R');
 | 
			
		||||
 
 | 
			
		||||
@@ -1,7 +1,7 @@
 | 
			
		||||
CC  = icc
 | 
			
		||||
LINKER = $(CC)
 | 
			
		||||
 | 
			
		||||
OPENMP  = #-qopenmp
 | 
			
		||||
OPENMP  = -qopenmp
 | 
			
		||||
PROFILE  = #-profile-functions -g  -pg
 | 
			
		||||
 | 
			
		||||
ifeq ($(ISA),AVX512)
 | 
			
		||||
 
 | 
			
		||||
@@ -502,6 +502,21 @@ int readAtom_in(Atom* atom, Parameter* param) {
 | 
			
		||||
    return natoms;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void writeAtom(Atom *atom, Parameter *param) {
 | 
			
		||||
    FILE *fp = fopen(param->write_atom_file, "w");
 | 
			
		||||
 | 
			
		||||
    for(int i = 0; i < atom->Nlocal; i++) {
 | 
			
		||||
        fprintf(fp, "%d,%f,%f,%f,%f,%f,%f,%f,0\n",
 | 
			
		||||
            atom->type[i], 1.0,
 | 
			
		||||
            atom_x(i), atom_y(i), atom_z(i),
 | 
			
		||||
            atom_vx(i), atom_vy(i), atom_vz(i));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    fclose(fp);
 | 
			
		||||
    fprintf(stdout, "Wrote input data to %s, grid size: %f, %f, %f\n",
 | 
			
		||||
        param->write_atom_file, param->xprd, param->yprd, param->zprd);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void growAtom(Atom *atom) {
 | 
			
		||||
    DeviceAtom *d_atom = &(atom->d_atom);
 | 
			
		||||
    int nold = atom->Nmax;
 | 
			
		||||
 
 | 
			
		||||
@@ -29,7 +29,7 @@ extern "C" {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// cuda kernel
 | 
			
		||||
__global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh) {
 | 
			
		||||
__global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh, int ntypes) {
 | 
			
		||||
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
 | 
			
		||||
    if(i >= Nlocal) {
 | 
			
		||||
        return;
 | 
			
		||||
@@ -46,6 +46,10 @@ __global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, M
 | 
			
		||||
    MD_FLOAT fiy = 0;
 | 
			
		||||
    MD_FLOAT fiz = 0;
 | 
			
		||||
 | 
			
		||||
#ifdef EXPLICIT_TYPES
 | 
			
		||||
    const int type_i = atom->type[i];
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
    for(int k = 0; k < numneighs; k++) {
 | 
			
		||||
        int j = neigh_neighbors[Nlocal * k + i];
 | 
			
		||||
        MD_FLOAT delx = xtmp - atom_x(j);
 | 
			
		||||
@@ -55,7 +59,7 @@ __global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, M
 | 
			
		||||
 | 
			
		||||
#ifdef EXPLICIT_TYPES
 | 
			
		||||
        const int type_j = atom->type[j];
 | 
			
		||||
        const int type_ij = type_i * atom->ntypes + type_j;
 | 
			
		||||
        const int type_ij = type_i * ntypes + type_j;
 | 
			
		||||
        const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
 | 
			
		||||
        const MD_FLOAT sigma6 = atom->sigma6[type_ij];
 | 
			
		||||
        const MD_FLOAT epsilon = atom->epsilon[type_ij];
 | 
			
		||||
@@ -109,7 +113,7 @@ extern "C" {
 | 
			
		||||
 | 
			
		||||
void finalIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
 | 
			
		||||
    const int Nlocal = atom->Nlocal;
 | 
			
		||||
    const int num_threads_per_block = get_num_threads();
 | 
			
		||||
    const int num_threads_per_block = get_cuda_num_threads();
 | 
			
		||||
    const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
 | 
			
		||||
 | 
			
		||||
    kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, atom->d_atom);
 | 
			
		||||
@@ -123,7 +127,7 @@ void finalIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
 | 
			
		||||
 | 
			
		||||
void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
 | 
			
		||||
    const int Nlocal = atom->Nlocal;
 | 
			
		||||
    const int num_threads_per_block = get_num_threads();
 | 
			
		||||
    const int num_threads_per_block = get_cuda_num_threads();
 | 
			
		||||
    const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
 | 
			
		||||
 | 
			
		||||
    kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, atom->d_atom);
 | 
			
		||||
@@ -136,13 +140,11 @@ void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neighbor) {
 | 
			
		||||
    const int num_threads_per_block = get_num_threads();
 | 
			
		||||
    const int num_threads_per_block = get_cuda_num_threads();
 | 
			
		||||
    int Nlocal = atom->Nlocal;
 | 
			
		||||
#ifndef EXPLICIT_TYPES
 | 
			
		||||
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
 | 
			
		||||
    MD_FLOAT sigma6 = param->sigma6;
 | 
			
		||||
    MD_FLOAT epsilon = param->epsilon;
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
    /*
 | 
			
		||||
    int nDevices;
 | 
			
		||||
@@ -165,7 +167,7 @@ double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neig
 | 
			
		||||
    double S = getTimeStamp();
 | 
			
		||||
    LIKWID_MARKER_START("force");
 | 
			
		||||
 | 
			
		||||
    calc_force <<< num_blocks, num_threads_per_block >>> (atom->d_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, neighbor->d_neighbor.neighbors, neighbor->d_neighbor.numneigh);
 | 
			
		||||
    calc_force <<< num_blocks, num_threads_per_block >>> (atom->d_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, neighbor->d_neighbor.neighbors, neighbor->d_neighbor.numneigh, atom->ntypes);
 | 
			
		||||
    cuda_assert("calc_force", cudaPeekAtLastError());
 | 
			
		||||
    cuda_assert("calc_force", cudaDeviceSynchronize());
 | 
			
		||||
    cudaProfilerStop();
 | 
			
		||||
 
 | 
			
		||||
@@ -120,7 +120,7 @@ __global__ void binatoms_kernel(DeviceAtom a, int nall, int* bincount, int* bins
 | 
			
		||||
 | 
			
		||||
__global__ void compute_neighborhood(
 | 
			
		||||
    DeviceAtom a, DeviceNeighbor neigh, Neighbor_params np, int nlocal, int maxneighs, int nstencil, int* stencil,
 | 
			
		||||
    int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq) {
 | 
			
		||||
    int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq, int ntypes) {
 | 
			
		||||
 | 
			
		||||
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
 | 
			
		||||
    if(i >= nlocal) {
 | 
			
		||||
@@ -157,7 +157,7 @@ __global__ void compute_neighborhood(
 | 
			
		||||
 | 
			
		||||
#ifdef EXPLICIT_TYPES
 | 
			
		||||
            int type_j = atom->type[j];
 | 
			
		||||
            const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j];
 | 
			
		||||
            const MD_FLOAT cutoff = atom->cutneighsq[type_i * ntypes + type_j];
 | 
			
		||||
#else
 | 
			
		||||
            const MD_FLOAT cutoff = cutneighsq;
 | 
			
		||||
#endif
 | 
			
		||||
@@ -206,7 +206,7 @@ void binatoms_cuda(Atom *atom, Binning *c_binning, int *c_resize_needed, Neighbo
 | 
			
		||||
 | 
			
		||||
void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor) {
 | 
			
		||||
    DeviceNeighbor *d_neighbor = &(neighbor->d_neighbor);
 | 
			
		||||
    const int num_threads_per_block = get_num_threads();
 | 
			
		||||
    const int num_threads_per_block = get_cuda_num_threads();
 | 
			
		||||
    int nall = atom->Nlocal + atom->Nghost;
 | 
			
		||||
 | 
			
		||||
    cudaProfilerStart();
 | 
			
		||||
@@ -269,7 +269,7 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor) {
 | 
			
		||||
                                                                    np, atom->Nlocal, neighbor->maxneighs, nstencil, c_stencil,
 | 
			
		||||
                                                                    c_binning.bins, c_binning.atoms_per_bin, c_binning.bincount,
 | 
			
		||||
                                                                    c_new_maxneighs,
 | 
			
		||||
								                                    cutneighsq);
 | 
			
		||||
								                                    cutneighsq, atom->ntypes);
 | 
			
		||||
 | 
			
		||||
        cuda_assert("compute_neighborhood", cudaPeekAtLastError());
 | 
			
		||||
        cuda_assert("compute_neighborhood", cudaDeviceSynchronize());
 | 
			
		||||
 
 | 
			
		||||
@@ -65,7 +65,7 @@ __global__ void computePbcUpdate(DeviceAtom a, int nlocal, int nghost, int* PBCx
 | 
			
		||||
/* update coordinates of ghost atoms */
 | 
			
		||||
/* uses mapping created in setupPbc */
 | 
			
		||||
void updatePbc_cuda(Atom *atom, Parameter *param, bool reneigh) {
 | 
			
		||||
    const int num_threads_per_block = get_num_threads();
 | 
			
		||||
    const int num_threads_per_block = get_cuda_num_threads();
 | 
			
		||||
 | 
			
		||||
    if(reneigh) {
 | 
			
		||||
        memcpyToGPU(atom->d_atom.x,     atom->x,    sizeof(MD_FLOAT) * atom->Nmax * 3);
 | 
			
		||||
@@ -98,7 +98,7 @@ void updatePbc_cuda(Atom *atom, Parameter *param, bool reneigh) {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void updateAtomsPbc_cuda(Atom* atom, Parameter *param) {
 | 
			
		||||
    const int num_threads_per_block = get_num_threads();
 | 
			
		||||
    const int num_threads_per_block = get_cuda_num_threads();
 | 
			
		||||
    MD_FLOAT xprd = param->xprd;
 | 
			
		||||
    MD_FLOAT yprd = param->yprd;
 | 
			
		||||
    MD_FLOAT zprd = param->zprd;
 | 
			
		||||
 
 | 
			
		||||
@@ -14,6 +14,7 @@ void initDevice(Atom *atom, Neighbor *neighbor) {
 | 
			
		||||
 | 
			
		||||
    d_atom->epsilon         =   (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
 | 
			
		||||
    d_atom->sigma6          =   (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
 | 
			
		||||
    d_atom->cutneighsq      =   (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
 | 
			
		||||
    d_atom->cutforcesq      =   (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
 | 
			
		||||
    d_neighbor->neighbors   =   (int *) allocateGPU(sizeof(int) * atom->Nmax * neighbor->maxneighs);
 | 
			
		||||
    d_neighbor->numneigh    =   (int *) allocateGPU(sizeof(int) * atom->Nmax);
 | 
			
		||||
@@ -22,6 +23,7 @@ void initDevice(Atom *atom, Neighbor *neighbor) {
 | 
			
		||||
    memcpyToGPU(d_atom->vx,             atom->vx,         sizeof(MD_FLOAT) * atom->Nmax * 3);
 | 
			
		||||
    memcpyToGPU(d_atom->sigma6,         atom->sigma6,     sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
 | 
			
		||||
    memcpyToGPU(d_atom->epsilon,        atom->epsilon,    sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
 | 
			
		||||
    memcpyToGPU(d_atom->cutneighsq,     atom->cutneighsq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
 | 
			
		||||
    memcpyToGPU(d_atom->cutforcesq,     atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
 | 
			
		||||
    memcpyToGPU(d_atom->type,           atom->type,       sizeof(int) * atom->Nmax);
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -41,7 +41,7 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
 | 
			
		||||
    {
 | 
			
		||||
    LIKWID_MARKER_START("force");
 | 
			
		||||
 | 
			
		||||
    #pragma omp for
 | 
			
		||||
    #pragma omp for schedule(runtime)
 | 
			
		||||
    for(int i = 0; i < Nlocal; i++) {
 | 
			
		||||
        neighs = &neighbor->neighbors[i * neighbor->maxneighs];
 | 
			
		||||
        int numneighs = neighbor->numneigh[i];
 | 
			
		||||
@@ -90,6 +90,12 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
 | 
			
		||||
        atom_fy(i) += fiy;
 | 
			
		||||
        atom_fz(i) += fiz;
 | 
			
		||||
 | 
			
		||||
        #ifdef USE_REFERENCE_VERSION
 | 
			
		||||
        if(numneighs % VECTOR_WIDTH > 0) {
 | 
			
		||||
            addStat(stats->atoms_outside_cutoff, VECTOR_WIDTH - (numneighs % VECTOR_WIDTH));
 | 
			
		||||
        }
 | 
			
		||||
        #endif
 | 
			
		||||
 | 
			
		||||
        addStat(stats->total_force_neighs, numneighs);
 | 
			
		||||
        addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
 | 
			
		||||
    }
 | 
			
		||||
@@ -125,7 +131,7 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
 | 
			
		||||
    {
 | 
			
		||||
    LIKWID_MARKER_START("forceLJ-halfneigh");
 | 
			
		||||
 | 
			
		||||
    #pragma omp for
 | 
			
		||||
    #pragma omp for schedule(runtime)
 | 
			
		||||
    for(int i = 0; i < Nlocal; i++) {
 | 
			
		||||
        neighs = &neighbor->neighbors[i * neighbor->maxneighs];
 | 
			
		||||
        int numneighs = neighbor->numneigh[i];
 | 
			
		||||
@@ -221,7 +227,7 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig
 | 
			
		||||
    {
 | 
			
		||||
    LIKWID_MARKER_START("force");
 | 
			
		||||
 | 
			
		||||
    #pragma omp for
 | 
			
		||||
    #pragma omp for schedule(runtime)
 | 
			
		||||
    for(int i = 0; i < Nlocal; i++) {
 | 
			
		||||
        neighs = &neighbor->neighbors[i * neighbor->maxneighs];
 | 
			
		||||
        int numneighs = neighbor->numneigh[i];
 | 
			
		||||
 
 | 
			
		||||
@@ -73,6 +73,7 @@ extern int readAtom_pdb(Atom*, Parameter*);
 | 
			
		||||
extern int readAtom_gro(Atom*, Parameter*);
 | 
			
		||||
extern int readAtom_dmp(Atom*, Parameter*);
 | 
			
		||||
extern int readAtom_in(Atom*, Parameter*);
 | 
			
		||||
extern void writeAtom(Atom*, Parameter*);
 | 
			
		||||
extern void growAtom(Atom*);
 | 
			
		||||
 | 
			
		||||
#ifdef AOS
 | 
			
		||||
 
 | 
			
		||||
@@ -59,12 +59,6 @@ void init(Parameter *param) {
 | 
			
		||||
    param->eam_file = NULL;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Show debug messages
 | 
			
		||||
#define DEBUG(msg)  printf(msg)
 | 
			
		||||
// Do not show debug messages
 | 
			
		||||
//#define DEBUG(msg)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps) {
 | 
			
		||||
    const int maxneighs = nneighs * nreps;
 | 
			
		||||
    neighbor->numneigh = (int*) malloc(atom->Nmax * sizeof(int));
 | 
			
		||||
@@ -125,7 +119,7 @@ int main(int argc, const char *argv[]) {
 | 
			
		||||
 | 
			
		||||
    LIKWID_MARKER_INIT;
 | 
			
		||||
    LIKWID_MARKER_REGISTER("force");
 | 
			
		||||
    DEBUG("Initializing parameters...\n");
 | 
			
		||||
    DEBUG_MESSAGE("Initializing parameters...\n");
 | 
			
		||||
    init(&param);
 | 
			
		||||
 | 
			
		||||
    for(int i = 0; i < argc; i++) {
 | 
			
		||||
@@ -196,11 +190,11 @@ int main(int argc, const char *argv[]) {
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if(param.force_field == FF_EAM) {
 | 
			
		||||
        DEBUG("Initializing EAM parameters...\n");
 | 
			
		||||
        DEBUG_MESSAGE("Initializing EAM parameters...\n");
 | 
			
		||||
        initEam(&eam, &param);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    DEBUG("Initializing atoms...\n");
 | 
			
		||||
    DEBUG_MESSAGE("Initializing atoms...\n");
 | 
			
		||||
    initAtom(atom);
 | 
			
		||||
    initStats(&stats);
 | 
			
		||||
 | 
			
		||||
@@ -216,7 +210,7 @@ int main(int argc, const char *argv[]) {
 | 
			
		||||
        atom->cutforcesq[i] = param.cutforce * param.cutforce;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    DEBUG("Creating atoms...\n");
 | 
			
		||||
    DEBUG_MESSAGE("Creating atoms...\n");
 | 
			
		||||
    for(int i = 0; i < natoms; ++i) {
 | 
			
		||||
        while(atom->Nlocal > atom->Nmax - natoms) {
 | 
			
		||||
            growAtom(atom);
 | 
			
		||||
@@ -247,11 +241,11 @@ int main(int argc, const char *argv[]) {
 | 
			
		||||
        printf("Estimated neighborlist data volume (kB): %.4f\n", estim_neighbors_volume / 1000.0);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    DEBUG("Initializing neighbor lists...\n");
 | 
			
		||||
    DEBUG_MESSAGE("Initializing neighbor lists...\n");
 | 
			
		||||
    initNeighbor(&neighbor, &param);
 | 
			
		||||
    DEBUG("Creating neighbor lists...\n");
 | 
			
		||||
    DEBUG_MESSAGE("Creating neighbor lists...\n");
 | 
			
		||||
    createNeighbors(atom, &neighbor, pattern, nneighs, nreps);
 | 
			
		||||
    DEBUG("Computing forces...\n");
 | 
			
		||||
    DEBUG_MESSAGE("Computing forces...\n");
 | 
			
		||||
 | 
			
		||||
    double T_accum = 0.0;
 | 
			
		||||
    for(int i = 0; i < param.ntimes; i++) {
 | 
			
		||||
 
 | 
			
		||||
@@ -11,6 +11,7 @@
 | 
			
		||||
#include <limits.h>
 | 
			
		||||
#include <math.h>
 | 
			
		||||
#include <float.h>
 | 
			
		||||
#include <omp.h>
 | 
			
		||||
 | 
			
		||||
#include <likwid-marker.h>
 | 
			
		||||
 | 
			
		||||
@@ -63,6 +64,10 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *
 | 
			
		||||
    setupNeighbor(param);
 | 
			
		||||
    setupThermo(param, atom->Natoms);
 | 
			
		||||
    if(param->input_file == NULL) { adjustThermo(param, atom); }
 | 
			
		||||
    #ifdef SORT_ATOMS
 | 
			
		||||
    atom->Nghost = 0;
 | 
			
		||||
    sortAtom(atom);
 | 
			
		||||
    #endif
 | 
			
		||||
    setupPbc(atom, param);
 | 
			
		||||
    initDevice(atom, neighbor);
 | 
			
		||||
    updatePbc(atom, param, true);
 | 
			
		||||
@@ -76,9 +81,12 @@ double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
 | 
			
		||||
    S = getTimeStamp();
 | 
			
		||||
    LIKWID_MARKER_START("reneighbour");
 | 
			
		||||
    updateAtomsPbc(atom, param);
 | 
			
		||||
    #ifdef SORT_ATOMS
 | 
			
		||||
    atom->Nghost = 0;
 | 
			
		||||
    sortAtom(atom);
 | 
			
		||||
    #endif
 | 
			
		||||
    setupPbc(atom, param);
 | 
			
		||||
    updatePbc(atom, param, true);
 | 
			
		||||
    //sortAtom(atom);
 | 
			
		||||
    buildNeighbor(atom, neighbor);
 | 
			
		||||
    LIKWID_MARKER_STOP("reneighbour");
 | 
			
		||||
    E = getTimeStamp();
 | 
			
		||||
@@ -145,7 +153,7 @@ int main(int argc, char** argv) {
 | 
			
		||||
 | 
			
		||||
    initParameter(&param);
 | 
			
		||||
    for(int i = 0; i < argc; i++) {
 | 
			
		||||
        if((strcmp(argv[i], "-p") == 0)) {
 | 
			
		||||
        if((strcmp(argv[i], "-p") == 0) || strcmp(argv[i], "--params") == 0) {
 | 
			
		||||
            readParameter(&param, argv[++i]);
 | 
			
		||||
            continue;
 | 
			
		||||
        }
 | 
			
		||||
@@ -200,19 +208,25 @@ int main(int argc, char** argv) {
 | 
			
		||||
            param.vtk_file = strdup(argv[++i]);
 | 
			
		||||
            continue;
 | 
			
		||||
        }
 | 
			
		||||
        if((strcmp(argv[i], "-w") == 0)) {
 | 
			
		||||
            param.write_atom_file = strdup(argv[++i]);
 | 
			
		||||
            continue;
 | 
			
		||||
        }
 | 
			
		||||
        if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
 | 
			
		||||
            printf("MD Bench: A minimalistic re-implementation of miniMD\n");
 | 
			
		||||
            printf(HLINE);
 | 
			
		||||
            printf("-p <string>:          file to read parameters from (can be specified more than once)\n");
 | 
			
		||||
            printf("-f <string>:          force field (lj, eam or dem), default lj\n");
 | 
			
		||||
            printf("-i <string>:          input file with atom positions (dump)\n");
 | 
			
		||||
            printf("-e <string>:          input file for EAM\n");
 | 
			
		||||
            printf("-n / --nsteps <int>:  set number of timesteps for simulation\n");
 | 
			
		||||
            printf("-nx/-ny/-nz <int>:    set linear dimension of systembox in x/y/z direction\n");
 | 
			
		||||
            printf("-r / --radius <real>: set cutoff radius\n");
 | 
			
		||||
            printf("-s / --skin <real>:   set skin (verlet buffer)\n");
 | 
			
		||||
            printf("--freq <real>:        processor frequency (GHz)\n");
 | 
			
		||||
            printf("--vtk <string>:       VTK file for visualization\n");
 | 
			
		||||
            printf("-p / --params <string>:     file to read parameters from (can be specified more than once)\n");
 | 
			
		||||
            printf("-f <string>:                force field (lj, eam or dem), default lj\n");
 | 
			
		||||
            printf("-i <string>:                input file with atom positions (dump)\n");
 | 
			
		||||
            printf("-e <string>:                input file for EAM\n");
 | 
			
		||||
            printf("-n / --nsteps <int>:        set number of timesteps for simulation\n");
 | 
			
		||||
            printf("-nx/-ny/-nz <int>:          set linear dimension of systembox in x/y/z direction\n");
 | 
			
		||||
            printf("-half <int>:                use half (1) or full (0) neighbor lists\n");
 | 
			
		||||
            printf("-r / --radius <real>:       set cutoff radius\n");
 | 
			
		||||
            printf("-s / --skin <real>:         set skin (verlet buffer)\n");
 | 
			
		||||
            printf("-w <file>:                  write input atoms to file\n");
 | 
			
		||||
            printf("--freq <real>:              processor frequency (GHz)\n");
 | 
			
		||||
            printf("--vtk <string>:             VTK file for visualization\n");
 | 
			
		||||
            printf(HLINE);
 | 
			
		||||
            exit(EXIT_SUCCESS);
 | 
			
		||||
        }
 | 
			
		||||
@@ -229,6 +243,10 @@ int main(int argc, char** argv) {
 | 
			
		||||
    traceAddresses(&param, &atom, &neighbor, n + 1);
 | 
			
		||||
    #endif
 | 
			
		||||
 | 
			
		||||
    if(param.write_atom_file != NULL) {
 | 
			
		||||
        writeAtom(&atom, &param);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    //writeInput(&param, &atom);
 | 
			
		||||
 | 
			
		||||
    timer[FORCE] = computeForce(&eam, &param, &atom, &neighbor, &stats);
 | 
			
		||||
@@ -275,6 +293,30 @@ int main(int argc, char** argv) {
 | 
			
		||||
    printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
 | 
			
		||||
            timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]);
 | 
			
		||||
    printf(HLINE);
 | 
			
		||||
 | 
			
		||||
    int nthreads = 0;
 | 
			
		||||
    int chunkSize = 0;
 | 
			
		||||
    omp_sched_t schedKind;
 | 
			
		||||
    char schedType[10];
 | 
			
		||||
#pragma omp parallel
 | 
			
		||||
#pragma omp master
 | 
			
		||||
    {
 | 
			
		||||
    	omp_get_schedule(&schedKind, &chunkSize);
 | 
			
		||||
 | 
			
		||||
    	switch (schedKind)
 | 
			
		||||
    	{
 | 
			
		||||
        	case omp_sched_static:  strcpy(schedType, "static"); break;
 | 
			
		||||
        	case omp_sched_dynamic: strcpy(schedType, "dynamic"); break;
 | 
			
		||||
        	case omp_sched_guided:  strcpy(schedType, "guided"); break;
 | 
			
		||||
        	case omp_sched_auto:    strcpy(schedType, "auto"); break;
 | 
			
		||||
    	}
 | 
			
		||||
	
 | 
			
		||||
	nthreads = omp_get_max_threads();
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    printf("Num threads: %d\n", nthreads);
 | 
			
		||||
    printf("Schedule: (%s,%d)\n", schedType, chunkSize);
 | 
			
		||||
    
 | 
			
		||||
    printf("Performance: %.2f million atom updates per second\n",
 | 
			
		||||
            1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
 | 
			
		||||
#ifdef COMPUTE_STATS
 | 
			
		||||
 
 | 
			
		||||
@@ -326,45 +326,45 @@ void sortAtom(Atom* atom) {
 | 
			
		||||
    int Nmax = atom->Nmax;
 | 
			
		||||
    int* binpos = bincount;
 | 
			
		||||
 | 
			
		||||
    for(int i=1; i<mbins; i++) {
 | 
			
		||||
        binpos[i] += binpos[i-1];
 | 
			
		||||
    for(int i = 1; i < mbins; i++) {
 | 
			
		||||
        binpos[i] += binpos[i - 1];
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
#ifdef AOS
 | 
			
		||||
    #ifdef AOS
 | 
			
		||||
    MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
 | 
			
		||||
    MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
 | 
			
		||||
#else
 | 
			
		||||
    #else
 | 
			
		||||
    MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
 | 
			
		||||
    MD_FLOAT* new_y = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
 | 
			
		||||
    MD_FLOAT* new_z = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
 | 
			
		||||
    MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
 | 
			
		||||
    MD_FLOAT* new_vy = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
 | 
			
		||||
    MD_FLOAT* new_vz = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
 | 
			
		||||
#endif
 | 
			
		||||
    #endif
 | 
			
		||||
    MD_FLOAT* old_x = atom->x; MD_FLOAT* old_y = atom->y; MD_FLOAT* old_z = atom->z;
 | 
			
		||||
    MD_FLOAT* old_vx = atom->vx; MD_FLOAT* old_vy = atom->vy; MD_FLOAT* old_vz = atom->vz;
 | 
			
		||||
 | 
			
		||||
    for(int mybin = 0; mybin<mbins; mybin++) {
 | 
			
		||||
        int start = mybin>0?binpos[mybin-1]:0;
 | 
			
		||||
    for(int mybin = 0; mybin < mbins; mybin++) {
 | 
			
		||||
        int start = mybin > 0 ? binpos[mybin - 1] : 0;
 | 
			
		||||
        int count = binpos[mybin] - start;
 | 
			
		||||
        for(int k=0; k<count; k++) {
 | 
			
		||||
        for(int k = 0; k < count; k++) {
 | 
			
		||||
            int new_i = start + k;
 | 
			
		||||
            int old_i = bins[mybin * atoms_per_bin + k];
 | 
			
		||||
#ifdef AOS
 | 
			
		||||
            #ifdef AOS
 | 
			
		||||
            new_x[new_i * 3 + 0] = old_x[old_i * 3 + 0];
 | 
			
		||||
            new_x[new_i * 3 + 1] = old_x[old_i * 3 + 1];
 | 
			
		||||
            new_x[new_i * 3 + 2] = old_x[old_i * 3 + 2];
 | 
			
		||||
            new_vx[new_i * 3 + 0] = old_vx[old_i * 3 + 0];
 | 
			
		||||
            new_vx[new_i * 3 + 1] = old_vx[old_i * 3 + 1];
 | 
			
		||||
            new_vx[new_i * 3 + 2] = old_vx[old_i * 3 + 2];
 | 
			
		||||
#else
 | 
			
		||||
            #else
 | 
			
		||||
            new_x[new_i] = old_x[old_i];
 | 
			
		||||
            new_y[new_i] = old_y[old_i];
 | 
			
		||||
            new_z[new_i] = old_z[old_i];
 | 
			
		||||
            new_vx[new_i] = old_vx[old_i];
 | 
			
		||||
            new_vy[new_i] = old_vy[old_i];
 | 
			
		||||
            new_vz[new_i] = old_vz[old_i];
 | 
			
		||||
#endif
 | 
			
		||||
            #endif
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
@@ -372,7 +372,7 @@ void sortAtom(Atom* atom) {
 | 
			
		||||
    free(atom->vx);
 | 
			
		||||
    atom->x = new_x;
 | 
			
		||||
    atom->vx = new_vx;
 | 
			
		||||
#ifndef AOS
 | 
			
		||||
    #ifndef AOS
 | 
			
		||||
    free(atom->y);
 | 
			
		||||
    free(atom->z);
 | 
			
		||||
    free(atom->vy);
 | 
			
		||||
@@ -381,5 +381,5 @@ void sortAtom(Atom* atom) {
 | 
			
		||||
    atom->z = new_z;
 | 
			
		||||
    atom->vy = new_vy;
 | 
			
		||||
    atom->vz = new_vz;
 | 
			
		||||
#endif
 | 
			
		||||
    #endif
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -125,7 +125,7 @@ void setupPbc(Atom *atom, Parameter *param) {
 | 
			
		||||
        if(param->pbc_x != 0 && param->pbc_y != 0 && param->pbc_z != 0) {
 | 
			
		||||
            if (x < Cutneigh         && y < Cutneigh         && z < Cutneigh)         { ADDGHOST(+1,+1,+1); }
 | 
			
		||||
            if (x < Cutneigh         && y >= (yprd-Cutneigh) && z < Cutneigh)         { ADDGHOST(+1,-1,+1); }
 | 
			
		||||
            if (x < Cutneigh         && y >= Cutneigh        && z >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); }
 | 
			
		||||
            if (x < Cutneigh         && y < Cutneigh        && z >= (zprd-Cutneigh))  { ADDGHOST(+1,+1,-1); }
 | 
			
		||||
            if (x < Cutneigh         && y >= (yprd-Cutneigh) && z >= (zprd-Cutneigh)) { ADDGHOST(+1,-1,-1); }
 | 
			
		||||
            if (x >= (xprd-Cutneigh) && y < Cutneigh         && z < Cutneigh)         { ADDGHOST(-1,+1,+1); }
 | 
			
		||||
            if (x >= (xprd-Cutneigh) && y >= (yprd-Cutneigh) && z < Cutneigh)         { ADDGHOST(-1,-1,+1); }
 | 
			
		||||
 
 | 
			
		||||
@@ -1,88 +0,0 @@
 | 
			
		||||
--------------------------------------------------------------------------------
 | 
			
		||||
CPU name:	Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
 | 
			
		||||
CPU type:	Intel Cascadelake SP processor
 | 
			
		||||
CPU clock:	2.49 GHz
 | 
			
		||||
--------------------------------------------------------------------------------
 | 
			
		||||
Initializing parameters...
 | 
			
		||||
Initializing atoms...
 | 
			
		||||
Creating atoms...
 | 
			
		||||
Pattern: seq
 | 
			
		||||
Number of timesteps: 200
 | 
			
		||||
Number of atoms: 256
 | 
			
		||||
Number of neighbors per atom: 1024
 | 
			
		||||
Number of times to replicate neighbor lists: 1
 | 
			
		||||
Estimated total data volume (kB): 1062.9120
 | 
			
		||||
Estimated atom data volume (kB): 6.1440
 | 
			
		||||
Estimated neighborlist data volume (kB): 1050.6240
 | 
			
		||||
Initializing neighbor lists...
 | 
			
		||||
Creating neighbor lists...
 | 
			
		||||
Computing forces...
 | 
			
		||||
Total time: 0.2735, Mega atom updates/s: 0.1872
 | 
			
		||||
Cycles per atom: 10682.8568, Cycles per neighbor: 10.4325
 | 
			
		||||
Statistics:
 | 
			
		||||
	Vector width: 8, Processor frequency: 2.0000 GHz
 | 
			
		||||
	Average neighbors per atom: 1018.9055
 | 
			
		||||
	Average SIMD iterations per atom: 127.3632
 | 
			
		||||
	Total number of computed pair interactions: 52428800
 | 
			
		||||
	Total number of SIMD iterations: 6553600
 | 
			
		||||
	Useful read data volume for force computation: 1.47GB
 | 
			
		||||
	Cycles/SIMD iteration: 83.4598
 | 
			
		||||
--------------------------------------------------------------------------------
 | 
			
		||||
Region force, Group 1: MEM_DP
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
|    Region Info    | HWThread 0 |
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
| RDTSC Runtime [s] |   0.110776 |
 | 
			
		||||
|     call count    |        200 |
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
 | 
			
		||||
+------------------------------------------+---------+------------+
 | 
			
		||||
|                   Event                  | Counter | HWThread 0 |
 | 
			
		||||
+------------------------------------------+---------+------------+
 | 
			
		||||
|             INSTR_RETIRED_ANY            |  FIXC0  |  267036300 |
 | 
			
		||||
|           CPU_CLK_UNHALTED_CORE          |  FIXC1  |  219034500 |
 | 
			
		||||
|           CPU_CLK_UNHALTED_REF           |  FIXC2  |  273793400 |
 | 
			
		||||
|              PWR_PKG_ENERGY              |   PWR0  |    10.9296 |
 | 
			
		||||
|              PWR_DRAM_ENERGY             |   PWR3  |          0 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE |   PMC0  |          0 |
 | 
			
		||||
|    FP_ARITH_INST_RETIRED_SCALAR_DOUBLE   |   PMC1  |     159400 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE |   PMC2  |          0 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE |   PMC3  |  197068800 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX0C0 |       8643 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX0C1 |       1367 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX1C0 |       9124 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX1C1 |       1354 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX2C0 |       9138 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX2C1 |       1356 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX3C0 |       5586 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX3C1 |       1297 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX4C0 |       5328 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX4C1 |       1269 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX5C0 |       5280 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX5C1 |       1295 |
 | 
			
		||||
+------------------------------------------+---------+------------+
 | 
			
		||||
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
|               Metric              | HWThread 0 |
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
|        Runtime (RDTSC) [s]        |     0.1108 |
 | 
			
		||||
|        Runtime unhalted [s]       |     0.0878 |
 | 
			
		||||
|            Clock [MHz]            |  1995.2564 |
 | 
			
		||||
|                CPI                |     0.8202 |
 | 
			
		||||
|             Energy [J]            |    10.9296 |
 | 
			
		||||
|             Power [W]             |    98.6643 |
 | 
			
		||||
|          Energy DRAM [J]          |          0 |
 | 
			
		||||
|           Power DRAM [W]          |          0 |
 | 
			
		||||
|            DP [MFLOP/s]           | 14233.3287 |
 | 
			
		||||
|          AVX DP [MFLOP/s]         | 14231.8898 |
 | 
			
		||||
|          Packed [MUOPS/s]         |  1778.9862 |
 | 
			
		||||
|          Scalar [MUOPS/s]         |     1.4389 |
 | 
			
		||||
|  Memory read bandwidth [MBytes/s] |    24.9001 |
 | 
			
		||||
|  Memory read data volume [GBytes] |     0.0028 |
 | 
			
		||||
| Memory write bandwidth [MBytes/s] |     4.5861 |
 | 
			
		||||
| Memory write data volume [GBytes] |     0.0005 |
 | 
			
		||||
|    Memory bandwidth [MBytes/s]    |    29.4863 |
 | 
			
		||||
|    Memory data volume [GBytes]    |     0.0033 |
 | 
			
		||||
|       Operational intensity       |   482.7104 |
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
 | 
			
		||||
@@ -1,168 +0,0 @@
 | 
			
		||||
--------------------------------------------------------------------------------
 | 
			
		||||
CPU name:	Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
 | 
			
		||||
CPU type:	Intel Cascadelake SP processor
 | 
			
		||||
CPU clock:	2.49 GHz
 | 
			
		||||
--------------------------------------------------------------------------------
 | 
			
		||||
Parameters:
 | 
			
		||||
	Force field: lj
 | 
			
		||||
	Kernel: plain-C
 | 
			
		||||
	Data layout: AoS
 | 
			
		||||
	Floating-point precision: double
 | 
			
		||||
	Unit cells (nx, ny, nz): 32, 32, 32
 | 
			
		||||
	Domain box sizes (x, y, z): 5.374708e+01, 5.374708e+01, 5.374708e+01
 | 
			
		||||
	Periodic (x, y, z): 1, 1, 1
 | 
			
		||||
	Lattice size: 1.679596e+00
 | 
			
		||||
	Epsilon: 1.000000e+00
 | 
			
		||||
	Sigma: 1.000000e+00
 | 
			
		||||
	Spring constant: 1.000000e+00
 | 
			
		||||
	Damping constant: 1.000000e+00
 | 
			
		||||
	Temperature: 1.440000e+00
 | 
			
		||||
	RHO: 8.442000e-01
 | 
			
		||||
	Mass: 1.000000e+00
 | 
			
		||||
	Number of types: 4
 | 
			
		||||
	Number of timesteps: 200
 | 
			
		||||
	Report stats every (timesteps): 100
 | 
			
		||||
	Reneighbor every (timesteps): 20
 | 
			
		||||
	Prune every (timesteps): 1000
 | 
			
		||||
	Output positions every (timesteps): 20
 | 
			
		||||
	Output velocities every (timesteps): 5
 | 
			
		||||
	Delta time (dt): 5.000000e-03
 | 
			
		||||
	Cutoff radius: 2.500000e+00
 | 
			
		||||
	Skin: 3.000000e-01
 | 
			
		||||
	Half neighbor lists: 0
 | 
			
		||||
	Processor frequency (GHz): 2.0000
 | 
			
		||||
----------------------------------------------------------------------------
 | 
			
		||||
step	temp		pressure
 | 
			
		||||
0	1.440000e+00	1.215639e+00
 | 
			
		||||
100	8.200895e-01	6.923143e-01
 | 
			
		||||
200	7.961495e-01	6.721043e-01
 | 
			
		||||
----------------------------------------------------------------------------
 | 
			
		||||
System: 131072 atoms 47265 ghost atoms, Steps: 200
 | 
			
		||||
TOTAL 11.50s FORCE 5.28s NEIGH 5.91s REST 0.31s
 | 
			
		||||
----------------------------------------------------------------------------
 | 
			
		||||
Performance: 2.28 million atom updates per second
 | 
			
		||||
Statistics:
 | 
			
		||||
	Vector width: 8, Processor frequency: 2.0000 GHz
 | 
			
		||||
	Average neighbors per atom: 76.0352
 | 
			
		||||
	Average SIMD iterations per atom: 9.9181
 | 
			
		||||
	Total number of computed pair interactions: 2003182862
 | 
			
		||||
	Total number of SIMD iterations: 261297661
 | 
			
		||||
	Useful read data volume for force computation: 57.46GB
 | 
			
		||||
	Cycles/SIMD iteration: 40.4432
 | 
			
		||||
--------------------------------------------------------------------------------
 | 
			
		||||
Region force, Group 1: MEM_DP
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
|    Region Info    | HWThread 0 |
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
| RDTSC Runtime [s] |   5.115807 |
 | 
			
		||||
|     call count    |        201 |
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
 | 
			
		||||
+------------------------------------------+---------+-------------+
 | 
			
		||||
|                   Event                  | Counter |  HWThread 0 |
 | 
			
		||||
+------------------------------------------+---------+-------------+
 | 
			
		||||
|             INSTR_RETIRED_ANY            |  FIXC0  | 12592470000 |
 | 
			
		||||
|           CPU_CLK_UNHALTED_CORE          |  FIXC1  | 10196910000 |
 | 
			
		||||
|           CPU_CLK_UNHALTED_REF           |  FIXC2  | 12746120000 |
 | 
			
		||||
|              PWR_PKG_ENERGY              |   PWR0  |    307.9429 |
 | 
			
		||||
|              PWR_DRAM_ENERGY             |   PWR3  |           0 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE |   PMC0  |           0 |
 | 
			
		||||
|    FP_ARITH_INST_RETIRED_SCALAR_DOUBLE   |   PMC1  |    79042240 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE |   PMC2  |           0 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE |   PMC3  |  8076039000 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX0C0 |    22734550 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX0C1 |     1147714 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX1C0 |    22755180 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX1C1 |     1144415 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX2C0 |    22762780 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX2C1 |     1129051 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX3C0 |    22905660 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX3C1 |     1143324 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX4C0 |    22914860 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX4C1 |     1169116 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX5C0 |    22890220 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX5C1 |     1180739 |
 | 
			
		||||
+------------------------------------------+---------+-------------+
 | 
			
		||||
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
|               Metric              | HWThread 0 |
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
|        Runtime (RDTSC) [s]        |     5.1158 |
 | 
			
		||||
|        Runtime unhalted [s]       |     4.0885 |
 | 
			
		||||
|            Clock [MHz]            |  1995.2508 |
 | 
			
		||||
|                CPI                |     0.8098 |
 | 
			
		||||
|             Energy [J]            |   307.9429 |
 | 
			
		||||
|             Power [W]             |    60.1944 |
 | 
			
		||||
|          Energy DRAM [J]          |          0 |
 | 
			
		||||
|           Power DRAM [W]          |          0 |
 | 
			
		||||
|            DP [MFLOP/s]           | 12644.6041 |
 | 
			
		||||
|          AVX DP [MFLOP/s]         | 12629.1535 |
 | 
			
		||||
|          Packed [MUOPS/s]         |  1578.6442 |
 | 
			
		||||
|          Scalar [MUOPS/s]         |    15.4506 |
 | 
			
		||||
|  Memory read bandwidth [MBytes/s] |  1713.4438 |
 | 
			
		||||
|  Memory read data volume [GBytes] |     8.7656 |
 | 
			
		||||
| Memory write bandwidth [MBytes/s] |    86.5003 |
 | 
			
		||||
| Memory write data volume [GBytes] |     0.4425 |
 | 
			
		||||
|    Memory bandwidth [MBytes/s]    |  1799.9442 |
 | 
			
		||||
|    Memory data volume [GBytes]    |     9.2082 |
 | 
			
		||||
|       Operational intensity       |     7.0250 |
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
 | 
			
		||||
Region reneighbour, Group 1: MEM_DP
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
|    Region Info    | HWThread 0 |
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
| RDTSC Runtime [s] |   5.897385 |
 | 
			
		||||
|     call count    |         10 |
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
 | 
			
		||||
+------------------------------------------+---------+-------------+
 | 
			
		||||
|                   Event                  | Counter |  HWThread 0 |
 | 
			
		||||
+------------------------------------------+---------+-------------+
 | 
			
		||||
|             INSTR_RETIRED_ANY            |  FIXC0  | 18212540000 |
 | 
			
		||||
|           CPU_CLK_UNHALTED_CORE          |  FIXC1  | 11728500000 |
 | 
			
		||||
|           CPU_CLK_UNHALTED_REF           |  FIXC2  | 14660630000 |
 | 
			
		||||
|              PWR_PKG_ENERGY              |   PWR0  |    338.9000 |
 | 
			
		||||
|              PWR_DRAM_ENERGY             |   PWR3  |           0 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE |   PMC0  |           0 |
 | 
			
		||||
|    FP_ARITH_INST_RETIRED_SCALAR_DOUBLE   |   PMC1  |  6240402000 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE |   PMC2  |           0 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE |   PMC3  |      983040 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX0C0 |     2086787 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX0C1 |     1115626 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX1C0 |     2089964 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX1C1 |     1117021 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX2C0 |     2103832 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX2C1 |     1117965 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX3C0 |     2086930 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX3C1 |     1102471 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX4C0 |     2094688 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX4C1 |     1103018 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX5C0 |     2097438 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX5C1 |     1102525 |
 | 
			
		||||
+------------------------------------------+---------+-------------+
 | 
			
		||||
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
|               Metric              | HWThread 0 |
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
|        Runtime (RDTSC) [s]        |     5.8974 |
 | 
			
		||||
|        Runtime unhalted [s]       |     4.7026 |
 | 
			
		||||
|            Clock [MHz]            |  1995.2473 |
 | 
			
		||||
|                CPI                |     0.6440 |
 | 
			
		||||
|             Energy [J]            |   338.9000 |
 | 
			
		||||
|             Power [W]             |    57.4661 |
 | 
			
		||||
|          Energy DRAM [J]          |          0 |
 | 
			
		||||
|           Power DRAM [W]          |          0 |
 | 
			
		||||
|            DP [MFLOP/s]           |  1059.4978 |
 | 
			
		||||
|          AVX DP [MFLOP/s]         |     1.3335 |
 | 
			
		||||
|          Packed [MUOPS/s]         |     0.1667 |
 | 
			
		||||
|          Scalar [MUOPS/s]         |  1058.1643 |
 | 
			
		||||
|  Memory read bandwidth [MBytes/s] |   136.3006 |
 | 
			
		||||
|  Memory read data volume [GBytes] |     0.8038 |
 | 
			
		||||
| Memory write bandwidth [MBytes/s] |    72.2612 |
 | 
			
		||||
| Memory write data volume [GBytes] |     0.4262 |
 | 
			
		||||
|    Memory bandwidth [MBytes/s]    |   208.5618 |
 | 
			
		||||
|    Memory data volume [GBytes]    |     1.2300 |
 | 
			
		||||
|       Operational intensity       |     5.0800 |
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
 | 
			
		||||
@@ -1,88 +0,0 @@
 | 
			
		||||
--------------------------------------------------------------------------------
 | 
			
		||||
CPU name:	Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
 | 
			
		||||
CPU type:	Intel Cascadelake SP processor
 | 
			
		||||
CPU clock:	2.49 GHz
 | 
			
		||||
--------------------------------------------------------------------------------
 | 
			
		||||
Initializing parameters...
 | 
			
		||||
Initializing atoms...
 | 
			
		||||
Creating atoms...
 | 
			
		||||
Pattern: seq
 | 
			
		||||
Number of timesteps: 200
 | 
			
		||||
Number of atoms: 256
 | 
			
		||||
Number of neighbors per atom: 1024
 | 
			
		||||
Number of times to replicate neighbor lists: 1
 | 
			
		||||
Estimated total data volume (kB): 1056.7680
 | 
			
		||||
Estimated atom data volume (kB): 3.0720
 | 
			
		||||
Estimated neighborlist data volume (kB): 1050.6240
 | 
			
		||||
Initializing neighbor lists...
 | 
			
		||||
Creating neighbor lists...
 | 
			
		||||
Computing forces...
 | 
			
		||||
Total time: 0.2466, Mega atom updates/s: 0.2076
 | 
			
		||||
Cycles per atom: 9631.9934, Cycles per neighbor: 9.4062
 | 
			
		||||
Statistics:
 | 
			
		||||
	Vector width: 16, Processor frequency: 2.0000 GHz
 | 
			
		||||
	Average neighbors per atom: 1018.9055
 | 
			
		||||
	Average SIMD iterations per atom: 63.6816
 | 
			
		||||
	Total number of computed pair interactions: 52428800
 | 
			
		||||
	Total number of SIMD iterations: 3276800
 | 
			
		||||
	Useful read data volume for force computation: 0.84GB
 | 
			
		||||
	Cycles/SIMD iteration: 150.4999
 | 
			
		||||
--------------------------------------------------------------------------------
 | 
			
		||||
Region force, Group 1: MEM_SP
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
|    Region Info    | HWThread 0 |
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
| RDTSC Runtime [s] |   0.085843 |
 | 
			
		||||
|     call count    |        200 |
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
 | 
			
		||||
+------------------------------------------+---------+------------+
 | 
			
		||||
|                   Event                  | Counter | HWThread 0 |
 | 
			
		||||
+------------------------------------------+---------+------------+
 | 
			
		||||
|             INSTR_RETIRED_ANY            |  FIXC0  |  129769100 |
 | 
			
		||||
|           CPU_CLK_UNHALTED_CORE          |  FIXC1  |  172300100 |
 | 
			
		||||
|           CPU_CLK_UNHALTED_REF           |  FIXC2  |  215371300 |
 | 
			
		||||
|              PWR_PKG_ENERGY              |   PWR0  |     9.2849 |
 | 
			
		||||
|              PWR_DRAM_ENERGY             |   PWR3  |          0 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE |   PMC0  |          0 |
 | 
			
		||||
|    FP_ARITH_INST_RETIRED_SCALAR_SINGLE   |   PMC1  |     154000 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE |   PMC2  |          0 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE |   PMC3  |   89088000 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX0C0 |       8354 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX0C1 |       1126 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX1C0 |       7863 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX1C1 |       1105 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX2C0 |       7990 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX2C1 |       1113 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX3C0 |       4775 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX3C1 |       1112 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX4C0 |       4201 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX4C1 |       1127 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX5C0 |       4035 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX5C1 |       1120 |
 | 
			
		||||
+------------------------------------------+---------+------------+
 | 
			
		||||
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
|               Metric              | HWThread 0 |
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
|        Runtime (RDTSC) [s]        |     0.0858 |
 | 
			
		||||
|        Runtime unhalted [s]       |     0.0691 |
 | 
			
		||||
|            Clock [MHz]            |  1995.2787 |
 | 
			
		||||
|                CPI                |     1.3277 |
 | 
			
		||||
|             Energy [J]            |     9.2849 |
 | 
			
		||||
|             Power [W]             |   108.1610 |
 | 
			
		||||
|          Energy DRAM [J]          |          0 |
 | 
			
		||||
|           Power DRAM [W]          |          0 |
 | 
			
		||||
|            SP [MFLOP/s]           | 16606.5397 |
 | 
			
		||||
|          AVX SP [MFLOP/s]         | 16604.7458 |
 | 
			
		||||
|          Packed [MUOPS/s]         |  1037.7966 |
 | 
			
		||||
|          Scalar [MUOPS/s]         |     1.7940 |
 | 
			
		||||
|  Memory read bandwidth [MBytes/s] |    27.7476 |
 | 
			
		||||
|  Memory read data volume [GBytes] |     0.0024 |
 | 
			
		||||
| Memory write bandwidth [MBytes/s] |     4.9974 |
 | 
			
		||||
| Memory write data volume [GBytes] |     0.0004 |
 | 
			
		||||
|    Memory bandwidth [MBytes/s]    |    32.7450 |
 | 
			
		||||
|    Memory data volume [GBytes]    |     0.0028 |
 | 
			
		||||
|       Operational intensity       |   507.1471 |
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
 | 
			
		||||
@@ -1,168 +0,0 @@
 | 
			
		||||
--------------------------------------------------------------------------------
 | 
			
		||||
CPU name:	Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
 | 
			
		||||
CPU type:	Intel Cascadelake SP processor
 | 
			
		||||
CPU clock:	2.49 GHz
 | 
			
		||||
--------------------------------------------------------------------------------
 | 
			
		||||
Parameters:
 | 
			
		||||
	Force field: lj
 | 
			
		||||
	Kernel: plain-C
 | 
			
		||||
	Data layout: AoS
 | 
			
		||||
	Floating-point precision: single
 | 
			
		||||
	Unit cells (nx, ny, nz): 32, 32, 32
 | 
			
		||||
	Domain box sizes (x, y, z): 5.374708e+01, 5.374708e+01, 5.374708e+01
 | 
			
		||||
	Periodic (x, y, z): 1, 1, 1
 | 
			
		||||
	Lattice size: 1.679596e+00
 | 
			
		||||
	Epsilon: 1.000000e+00
 | 
			
		||||
	Sigma: 1.000000e+00
 | 
			
		||||
	Spring constant: 1.000000e+00
 | 
			
		||||
	Damping constant: 1.000000e+00
 | 
			
		||||
	Temperature: 1.440000e+00
 | 
			
		||||
	RHO: 8.442000e-01
 | 
			
		||||
	Mass: 1.000000e+00
 | 
			
		||||
	Number of types: 4
 | 
			
		||||
	Number of timesteps: 200
 | 
			
		||||
	Report stats every (timesteps): 100
 | 
			
		||||
	Reneighbor every (timesteps): 20
 | 
			
		||||
	Prune every (timesteps): 1000
 | 
			
		||||
	Output positions every (timesteps): 20
 | 
			
		||||
	Output velocities every (timesteps): 5
 | 
			
		||||
	Delta time (dt): 5.000000e-03
 | 
			
		||||
	Cutoff radius: 2.500000e+00
 | 
			
		||||
	Skin: 3.000000e-01
 | 
			
		||||
	Half neighbor lists: 0
 | 
			
		||||
	Processor frequency (GHz): 2.0000
 | 
			
		||||
----------------------------------------------------------------------------
 | 
			
		||||
step	temp		pressure
 | 
			
		||||
0	1.440000e+00	1.215639e+00
 | 
			
		||||
100	8.200897e-01	6.923144e-01
 | 
			
		||||
200	7.961481e-01	6.721031e-01
 | 
			
		||||
----------------------------------------------------------------------------
 | 
			
		||||
System: 131072 atoms 47265 ghost atoms, Steps: 200
 | 
			
		||||
TOTAL 10.83s FORCE 4.62s NEIGH 5.94s REST 0.26s
 | 
			
		||||
----------------------------------------------------------------------------
 | 
			
		||||
Performance: 2.42 million atom updates per second
 | 
			
		||||
Statistics:
 | 
			
		||||
	Vector width: 16, Processor frequency: 2.0000 GHz
 | 
			
		||||
	Average neighbors per atom: 76.0351
 | 
			
		||||
	Average SIMD iterations per atom: 5.0875
 | 
			
		||||
	Total number of computed pair interactions: 2003181259
 | 
			
		||||
	Total number of SIMD iterations: 134032075
 | 
			
		||||
	Useful read data volume for force computation: 32.79GB
 | 
			
		||||
	Cycles/SIMD iteration: 68.9511
 | 
			
		||||
--------------------------------------------------------------------------------
 | 
			
		||||
Region force, Group 1: MEM_SP
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
|    Region Info    | HWThread 0 |
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
| RDTSC Runtime [s] |   4.452877 |
 | 
			
		||||
|     call count    |        201 |
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
 | 
			
		||||
+------------------------------------------+---------+-------------+
 | 
			
		||||
|                   Event                  | Counter |  HWThread 0 |
 | 
			
		||||
+------------------------------------------+---------+-------------+
 | 
			
		||||
|             INSTR_RETIRED_ANY            |  FIXC0  |  7428719000 |
 | 
			
		||||
|           CPU_CLK_UNHALTED_CORE          |  FIXC1  |  8875251000 |
 | 
			
		||||
|           CPU_CLK_UNHALTED_REF           |  FIXC2  | 11094050000 |
 | 
			
		||||
|              PWR_PKG_ENERGY              |   PWR0  |    265.5057 |
 | 
			
		||||
|              PWR_DRAM_ENERGY             |   PWR3  |           0 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE |   PMC0  |           0 |
 | 
			
		||||
|    FP_ARITH_INST_RETIRED_SCALAR_SINGLE   |   PMC1  |    79036820 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE |   PMC2  |           0 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE |   PMC3  |  3935012000 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX0C0 |    19716700 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX0C1 |      595747 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX1C0 |    19734880 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX1C1 |      597090 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX2C0 |    19732800 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX2C1 |      595219 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX3C0 |    19886430 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX3C1 |      632443 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX4C0 |    19887210 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX4C1 |      633169 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX5C0 |    19935560 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX5C1 |      634112 |
 | 
			
		||||
+------------------------------------------+---------+-------------+
 | 
			
		||||
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
|               Metric              | HWThread 0 |
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
|        Runtime (RDTSC) [s]        |     4.4529 |
 | 
			
		||||
|        Runtime unhalted [s]       |     3.5585 |
 | 
			
		||||
|            Clock [MHz]            |  1995.2693 |
 | 
			
		||||
|                CPI                |     1.1947 |
 | 
			
		||||
|             Energy [J]            |   265.5057 |
 | 
			
		||||
|             Power [W]             |    59.6257 |
 | 
			
		||||
|          Energy DRAM [J]          |          0 |
 | 
			
		||||
|           Power DRAM [W]          |          0 |
 | 
			
		||||
|            SP [MFLOP/s]           | 14156.9661 |
 | 
			
		||||
|          AVX SP [MFLOP/s]         | 14139.2165 |
 | 
			
		||||
|          Packed [MUOPS/s]         |   883.7010 |
 | 
			
		||||
|          Scalar [MUOPS/s]         |    17.7496 |
 | 
			
		||||
|  Memory read bandwidth [MBytes/s] |  1708.8254 |
 | 
			
		||||
|  Memory read data volume [GBytes] |     7.6092 |
 | 
			
		||||
| Memory write bandwidth [MBytes/s] |    53.0035 |
 | 
			
		||||
| Memory write data volume [GBytes] |     0.2360 |
 | 
			
		||||
|    Memory bandwidth [MBytes/s]    |  1761.8288 |
 | 
			
		||||
|    Memory data volume [GBytes]    |     7.8452 |
 | 
			
		||||
|       Operational intensity       |     8.0354 |
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
 | 
			
		||||
Region reneighbour, Group 1: MEM_SP
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
|    Region Info    | HWThread 0 |
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
| RDTSC Runtime [s] |   5.935627 |
 | 
			
		||||
|     call count    |         10 |
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
 | 
			
		||||
+------------------------------------------+---------+-------------+
 | 
			
		||||
|                   Event                  | Counter |  HWThread 0 |
 | 
			
		||||
+------------------------------------------+---------+-------------+
 | 
			
		||||
|             INSTR_RETIRED_ANY            |  FIXC0  | 18208530000 |
 | 
			
		||||
|           CPU_CLK_UNHALTED_CORE          |  FIXC1  | 11805500000 |
 | 
			
		||||
|           CPU_CLK_UNHALTED_REF           |  FIXC2  | 14756870000 |
 | 
			
		||||
|              PWR_PKG_ENERGY              |   PWR0  |    340.7903 |
 | 
			
		||||
|              PWR_DRAM_ENERGY             |   PWR3  |           0 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE |   PMC0  |           0 |
 | 
			
		||||
|    FP_ARITH_INST_RETIRED_SCALAR_SINGLE   |   PMC1  |  6240406000 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE |   PMC2  |           0 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE |   PMC3  |      491520 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX0C0 |     1772377 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX0C1 |      975760 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX1C0 |     1770611 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX1C1 |      977433 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX2C0 |     1771722 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX2C1 |      979122 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX3C0 |     1782901 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX3C1 |      967621 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX4C0 |     1780789 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX4C1 |      967179 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX5C0 |     1784733 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX5C1 |      969349 |
 | 
			
		||||
+------------------------------------------+---------+-------------+
 | 
			
		||||
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
|               Metric              | HWThread 0 |
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
|        Runtime (RDTSC) [s]        |     5.9356 |
 | 
			
		||||
|        Runtime unhalted [s]       |     4.7334 |
 | 
			
		||||
|            Clock [MHz]            |  1995.2675 |
 | 
			
		||||
|                CPI                |     0.6483 |
 | 
			
		||||
|             Energy [J]            |   340.7903 |
 | 
			
		||||
|             Power [W]             |    57.4144 |
 | 
			
		||||
|          Energy DRAM [J]          |          0 |
 | 
			
		||||
|           Power DRAM [W]          |          0 |
 | 
			
		||||
|            SP [MFLOP/s]           |  1052.6723 |
 | 
			
		||||
|          AVX SP [MFLOP/s]         |     1.3249 |
 | 
			
		||||
|          Packed [MUOPS/s]         |     0.0828 |
 | 
			
		||||
|          Scalar [MUOPS/s]         |  1051.3474 |
 | 
			
		||||
|  Memory read bandwidth [MBytes/s] |   114.9736 |
 | 
			
		||||
|  Memory read data volume [GBytes] |     0.6824 |
 | 
			
		||||
| Memory write bandwidth [MBytes/s] |    62.9308 |
 | 
			
		||||
| Memory write data volume [GBytes] |     0.3735 |
 | 
			
		||||
|    Memory bandwidth [MBytes/s]    |   177.9044 |
 | 
			
		||||
|    Memory data volume [GBytes]    |     1.0560 |
 | 
			
		||||
|       Operational intensity       |     5.9171 |
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
 | 
			
		||||
@@ -1,148 +0,0 @@
 | 
			
		||||
Intel(R) Architecture Code Analyzer Version -  v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
 | 
			
		||||
Analyzed File -  gromacs-avx512-dp-ICX.o
 | 
			
		||||
Binary Format - 64Bit
 | 
			
		||||
Architecture  -  SKX
 | 
			
		||||
Analysis Type - Throughput
 | 
			
		||||
 | 
			
		||||
Throughput Analysis Report
 | 
			
		||||
--------------------------
 | 
			
		||||
Block Throughput: 47.68 Cycles       Throughput Bottleneck: Backend
 | 
			
		||||
Loop Count:  22
 | 
			
		||||
Port Binding In Cycles Per Iteration:
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
|  Port  |   0   -  DV   |   1   |   2   -  D    |   3   -  D    |   4   |   5   |   6   |   7   |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
| Cycles | 42.0     0.0  | 12.5  |  5.0     5.0  |  5.0     5.0  |  0.0  | 42.0  | 12.5  |  0.0  |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
DV - Divider pipe (on port 0)
 | 
			
		||||
D - Data fetch pipe (on ports 2 and 3)
 | 
			
		||||
F - Macro Fusion with the previous instruction occurred
 | 
			
		||||
* - instruction micro-ops not bound to a port
 | 
			
		||||
^ - Micro Fusion occurred
 | 
			
		||||
# - ESP Tracking sync uop was issued
 | 
			
		||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
 | 
			
		||||
X - instruction not supported, was not accounted in Analysis
 | 
			
		||||
 | 
			
		||||
| Num Of   |                    Ports pressure in cycles                         |      |
 | 
			
		||||
|  Uops    |  0  - DV    |  1   |  2  -  D    |  3  -  D    |  4   |  5   |  6   |  7   |
 | 
			
		||||
-----------------------------------------------------------------------------------------
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | movsxd rbx, dword ptr [r12+r14*4]
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea rcx, ptr [rbx+rbx*2]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl rcx, 0x6
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovapd zmm29, zmmword ptr [rsi+rcx*1]
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovapd zmm30, zmmword ptr [rsi+rcx*1+0x40]
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovapd zmm31, zmmword ptr [rsi+rcx*1+0x80]
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovupd zmm3, zmmword ptr [rsp+0x40]
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubpd zmm4, zmm3, zmm29
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovupd zmm3, zmmword ptr [rsp+0x140]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubpd zmm3, zmm3, zmm30
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea ecx, ptr [rbx+rbx*1]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | cmp rdi, rcx
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | setnz dl
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | setz cl
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea ebx, ptr [rbx+rbx*1+0x1]
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubpd zmm17, zmm25, zmm31
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm18, zmm17, zmm17
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm18, zmm3, zmm3
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm18, zmm4, zmm4
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14pd zmm19, zmm18
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | cmp rdi, rbx
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | setz bl
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | mov ebp, ebx
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm20, zmm19, zmm22
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm21, zmm19, zmm19
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm20, zmm21, zmm20
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovupd zmm21, zmmword ptr [rsp+0x80]
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubpd zmm21, zmm21, zmm29
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl bpl, 0x4
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm19, zmm1, zmm19
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm19, zmm19, zmm20
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vaddpd zmm20, zmm20, zmm2
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm19, zmm19, zmm20
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovupd zmm20, zmmword ptr [rsp+0x100]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubpd zmm20, zmm20, zmm30
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | not bpl
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | sub bpl, cl
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovd k1, ebp
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmppd k1{k1}, zmm18, zmm0, 0x1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubpd zmm18, zmm26, zmm31
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm15{k1}, zmm19, zmm4
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm4, zmm18, zmm18
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm4, zmm20, zmm20
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm4, zmm21, zmm21
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm12{k1}, zmm19, zmm3
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14pd zmm3, zmm4
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea ecx, ptr [rdx+rdx*1]
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | mov eax, ebx
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm8{k1}, zmm19, zmm17
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm17, zmm3, zmm22
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm19, zmm3, zmm3
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm17, zmm19, zmm17
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovupd zmm19, zmmword ptr [rsp+0x1c0]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubpd zmm19, zmm19, zmm29
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl al, 0x5
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm3, zmm1, zmm3
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm3, zmm3, zmm17
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vaddpd zmm17, zmm17, zmm2
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm3, zmm3, zmm17
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubpd zmm17, zmm23, zmm30
 | 
			
		||||
|   1      |             | 0.5  |             |             |      |      | 0.5  |      | sub cl, al
 | 
			
		||||
|   1      |             | 0.5  |             |             |      |      | 0.5  |      | add cl, 0xfd
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovd k1, ecx
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmppd k1{k1}, zmm4, zmm0, 0x1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubpd zmm4, zmm27, zmm31
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm14{k1}, zmm3, zmm21
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm21, zmm4, zmm4
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm21, zmm17, zmm17
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm21, zmm19, zmm19
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm10{k1}, zmm3, zmm20
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14pd zmm20, zmm21
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm6{k1}, zmm3, zmm18
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm3, zmm20, zmm22
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm18, zmm20, zmm20
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm3, zmm18, zmm3
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm18, zmm1, zmm20
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm18, zmm18, zmm3
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vaddpd zmm3, zmm3, zmm2
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm3, zmm18, zmm3
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea eax, ptr [rdx*4]
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | mov ecx, ebx
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl cl, 0x6
 | 
			
		||||
|   1      |             | 0.5  |             |             |      |      | 0.5  |      | sub al, cl
 | 
			
		||||
|   1      |             | 0.5  |             |             |      |      | 0.5  |      | add al, 0xfb
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovd k1, eax
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmppd k1{k1}, zmm21, zmm0, 0x1
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovupd zmm18, zmmword ptr [rsp+0x180]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubpd zmm18, zmm18, zmm29
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubpd zmm20, zmm24, zmm30
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubpd zmm21, zmm28, zmm31
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm16{k1}, zmm3, zmm19
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm19, zmm21, zmm21
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm19, zmm20, zmm20
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm19, zmm18, zmm18
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm11{k1}, zmm3, zmm17
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14pd zmm17, zmm19
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm7{k1}, zmm3, zmm4
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm3, zmm17, zmm22
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm4, zmm17, zmm17
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm3, zmm4, zmm3
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm4, zmm1, zmm17
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm4, zmm4, zmm3
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vaddpd zmm3, zmm3, zmm2
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm3, zmm4, zmm3
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl dl, 0x3
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl bl, 0x7
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | sub dl, bl
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | add dl, 0xf7
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovd k1, edx
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmppd k1{k1}, zmm19, zmm0, 0x1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm13{k1}, zmm3, zmm18
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm9{k1}, zmm3, zmm20
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm5{k1}, zmm3, zmm21
 | 
			
		||||
|   1      |             | 0.5  |             |             |      |      | 0.5  |      | inc r14
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | cmp r11, r14
 | 
			
		||||
|   0*F    |             |      |             |             |      |      |      |      | jnz 0xfffffffffffffd99
 | 
			
		||||
Total Num Of Uops: 123
 | 
			
		||||
Analysis Notes:
 | 
			
		||||
Backend allocation was stalled due to unavailable allocation resources.
 | 
			
		||||
@@ -1,159 +0,0 @@
 | 
			
		||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
 | 
			
		||||
Analyzed file:      gromacs-avx512-dp-ICX.s
 | 
			
		||||
Architecture:       CSX
 | 
			
		||||
Timestamp:          2023-01-03 00:07:20
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
 | 
			
		||||
 * - Instruction micro-ops not bound to a port
 | 
			
		||||
 X - No throughput/latency information for this instruction in data file
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Combined Analysis Report
 | 
			
		||||
------------------------
 | 
			
		||||
                                      Port pressure in cycles                                       
 | 
			
		||||
     |  0   - 0DV  |  1   |  2   -  2D  |  3   -  3D  |  4   |  5   |  6   |  7   ||  CP  | LCD  |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
2287 |             |      |             |             |      |      |      |      ||      |      |   .LBB5_11:                               #
 | 
			
		||||
2288 |             |      |             |             |      |      |      |      ||      |      |   #   Parent Loop BB5_6 Depth=1
 | 
			
		||||
2289 |             |      |             |             |      |      |      |      ||      |      |   # =>  This Inner Loop Header: Depth=2
 | 
			
		||||
2290 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||  4.0 |      |   movslq (%r12,%r14,4), %rbx
 | 
			
		||||
2291 |             | 1.00 |             |             |      | 0.00 |      |      ||  1.0 |      |   leaq (%rbx,%rbx,2), %rcx
 | 
			
		||||
2292 | 0.00        |      |             |             |      |      | 1.00 |      ||  1.0 |      |   shlq $6, %rcx
 | 
			
		||||
2293 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovapd (%rsi,%rcx), %zmm29
 | 
			
		||||
2294 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovapd 64(%rsi,%rcx), %zmm30
 | 
			
		||||
2295 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||  0.0 |      |   vmovapd 128(%rsi,%rcx), %zmm31
 | 
			
		||||
2296 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovupd 64(%rsp), %zmm3         # 64-byte Reload
 | 
			
		||||
2297 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd %zmm29, %zmm3, %zmm4
 | 
			
		||||
2298 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovupd 320(%rsp), %zmm3        # 64-byte Reload
 | 
			
		||||
2299 | 0.75        |      |             |             |      | 0.25 |      |      ||      |      |   vsubpd %zmm30, %zmm3, %zmm3
 | 
			
		||||
2300 |             | 1.00 |             |             |      | 0.00 |      |      ||      |      |   leal (%rbx,%rbx), %ecx
 | 
			
		||||
2301 | 0.00        | 1.00 |             |             |      | 0.00 | 0.00 |      ||      |      |   cmpq %rcx, %rdi
 | 
			
		||||
2302 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   setne %dl
 | 
			
		||||
2303 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   sete %cl
 | 
			
		||||
2304 |             | 1.00 |             |             |      |      |      |      ||      |      |   leal 1(%rbx,%rbx), %ebx
 | 
			
		||||
2305 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vsubpd %zmm31, %zmm25, %zmm17
 | 
			
		||||
2306 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd %zmm17, %zmm17, %zmm18
 | 
			
		||||
2307 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vfmadd231pd %zmm3, %zmm3, %zmm18 # zmm18 = (zmm3 * zmm3) + zmm18
 | 
			
		||||
2308 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vfmadd231pd %zmm4, %zmm4, %zmm18 # zmm18 = (zmm4 * zmm4) + zmm18
 | 
			
		||||
2309 | 2.75        |      |             |             |      | 0.25 |      |      ||  8.0 |      |   vrcp14pd %zmm18, %zmm19
 | 
			
		||||
2310 | 0.00        | 0.75 |             |             |      | 0.00 | 0.25 |      ||      |      |   cmpq %rbx, %rdi
 | 
			
		||||
2311 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   sete %bl
 | 
			
		||||
2312 | 0.00        | 0.75 |             |             |      | 0.00 | 0.25 |      ||      |      |   movl %ebx, %ebp
 | 
			
		||||
2313 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd %zmm22, %zmm19, %zmm20
 | 
			
		||||
2314 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm19, %zmm19, %zmm21
 | 
			
		||||
2315 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd %zmm20, %zmm21, %zmm20
 | 
			
		||||
2316 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovupd 128(%rsp), %zmm21       # 64-byte Reload
 | 
			
		||||
2317 | 0.75        |      |             |             |      | 0.25 |      |      ||      |      |   vsubpd %zmm29, %zmm21, %zmm21
 | 
			
		||||
2318 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   shlb $4, %bpl
 | 
			
		||||
2319 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm19, %zmm1, %zmm19
 | 
			
		||||
2320 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd %zmm20, %zmm19, %zmm19
 | 
			
		||||
2321 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vaddpd %zmm2, %zmm20, %zmm20
 | 
			
		||||
2322 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd %zmm20, %zmm19, %zmm19
 | 
			
		||||
2323 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovupd 256(%rsp), %zmm20       # 64-byte Reload
 | 
			
		||||
2324 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd %zmm30, %zmm20, %zmm20
 | 
			
		||||
2325 | 0.00        | 0.50 |             |             |      | 0.00 | 0.50 |      ||      |      |   notb %bpl
 | 
			
		||||
2326 | 0.00        | 0.25 |             |             |      | 0.00 | 0.75 |      ||      |      |   subb %cl, %bpl
 | 
			
		||||
2327 | 1.00        |      |             |             |      |      |      |      ||      |      |   kmovd %ebp, %k1
 | 
			
		||||
2328 |             |      |             |             |      |      |      |      ||      |      | X vcmpltpd %zmm0, %zmm18, %k1 {%k1}
 | 
			
		||||
2329 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd %zmm31, %zmm26, %zmm18
 | 
			
		||||
2330 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vfmadd231pd %zmm4, %zmm19, %zmm15 {%k1} # zmm15 = (zmm19 * zmm4) + zmm15
 | 
			
		||||
2331 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm18, %zmm18, %zmm4
 | 
			
		||||
2332 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm20, %zmm20, %zmm4 # zmm4 = (zmm20 * zmm20) + zmm4
 | 
			
		||||
2333 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm21, %zmm21, %zmm4 # zmm4 = (zmm21 * zmm21) + zmm4
 | 
			
		||||
2334 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm3, %zmm19, %zmm12 {%k1} # zmm12 = (zmm19 * zmm3) + zmm12
 | 
			
		||||
2335 | 2.25        |      |             |             |      | 0.75 |      |      ||      |      |   vrcp14pd %zmm4, %zmm3
 | 
			
		||||
2336 |             | 1.00 |             |             |      | 0.00 |      |      ||      |      |   leal (%rdx,%rdx), %ecx
 | 
			
		||||
2337 | 0.00        | 0.75 |             |             |      | 0.00 | 0.25 |      ||      |      |   movl %ebx, %eax
 | 
			
		||||
2338 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm17, %zmm19, %zmm8 {%k1} # zmm8 = (zmm19 * zmm17) + zmm8
 | 
			
		||||
2339 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm22, %zmm3, %zmm17
 | 
			
		||||
2340 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm3, %zmm3, %zmm19
 | 
			
		||||
2341 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm17, %zmm19, %zmm17
 | 
			
		||||
2342 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovupd 448(%rsp), %zmm19       # 64-byte Reload
 | 
			
		||||
2343 | 0.75        |      |             |             |      | 0.25 |      |      ||      |      |   vsubpd %zmm29, %zmm19, %zmm19
 | 
			
		||||
2344 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   shlb $5, %al
 | 
			
		||||
2345 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm3, %zmm1, %zmm3
 | 
			
		||||
2346 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm17, %zmm3, %zmm3
 | 
			
		||||
2347 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vaddpd %zmm2, %zmm17, %zmm17
 | 
			
		||||
2348 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm17, %zmm3, %zmm3
 | 
			
		||||
2349 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd %zmm30, %zmm23, %zmm17
 | 
			
		||||
2350 | 0.00        | 0.50 |             |             |      | 0.00 | 0.50 |      ||      |      |   subb %al, %cl
 | 
			
		||||
2351 | 0.00        | 0.25 |             |             |      | 0.00 | 0.75 |      ||      |      |   addb $-3, %cl
 | 
			
		||||
2352 | 1.00        |      |             |             |      |      |      |      ||      |      |   kmovd %ecx, %k1
 | 
			
		||||
2353 |             |      |             |             |      |      |      |      ||      |      | X vcmpltpd %zmm0, %zmm4, %k1 {%k1}
 | 
			
		||||
2354 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd %zmm31, %zmm27, %zmm4
 | 
			
		||||
2355 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm21, %zmm3, %zmm14 {%k1} # zmm14 = (zmm3 * zmm21) + zmm14
 | 
			
		||||
2356 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm4, %zmm4, %zmm21
 | 
			
		||||
2357 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm17, %zmm17, %zmm21 # zmm21 = (zmm17 * zmm17) + zmm21
 | 
			
		||||
2358 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm19, %zmm19, %zmm21 # zmm21 = (zmm19 * zmm19) + zmm21
 | 
			
		||||
2359 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm20, %zmm3, %zmm10 {%k1} # zmm10 = (zmm3 * zmm20) + zmm10
 | 
			
		||||
2360 | 2.50        |      |             |             |      | 0.50 |      |      ||      |      |   vrcp14pd %zmm21, %zmm20
 | 
			
		||||
2361 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm18, %zmm3, %zmm6 {%k1} # zmm6 = (zmm3 * zmm18) + zmm6
 | 
			
		||||
2362 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm22, %zmm20, %zmm3
 | 
			
		||||
2363 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm20, %zmm20, %zmm18
 | 
			
		||||
2364 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm3, %zmm18, %zmm3
 | 
			
		||||
2365 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm20, %zmm1, %zmm18
 | 
			
		||||
2366 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm3, %zmm18, %zmm18
 | 
			
		||||
2367 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vaddpd %zmm2, %zmm3, %zmm3
 | 
			
		||||
2368 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm3, %zmm18, %zmm3
 | 
			
		||||
2369 |             | 1.00 |             |             |      | 0.00 |      |      ||      |      |   leal (,%rdx,4), %eax
 | 
			
		||||
2370 | 0.00        | 0.75 |             |             |      | 0.00 | 0.25 |      ||      |      |   movl %ebx, %ecx
 | 
			
		||||
2371 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   shlb $6, %cl
 | 
			
		||||
2372 | 0.00        | 0.50 |             |             |      | 0.00 | 0.50 |      ||      |      |   subb %cl, %al
 | 
			
		||||
2373 | 0.00        | 1.00 |             |             |      | 0.00 | 0.00 |      ||      |      |   addb $-5, %al
 | 
			
		||||
2374 | 1.00        |      |             |             |      |      |      |      ||      |      |   kmovd %eax, %k1
 | 
			
		||||
2375 |             |      |             |             |      |      |      |      ||      |      | X vcmpltpd %zmm0, %zmm21, %k1 {%k1}
 | 
			
		||||
2376 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovupd 384(%rsp), %zmm18       # 64-byte Reload
 | 
			
		||||
2377 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd %zmm29, %zmm18, %zmm18
 | 
			
		||||
2378 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd %zmm30, %zmm24, %zmm20
 | 
			
		||||
2379 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd %zmm31, %zmm28, %zmm21
 | 
			
		||||
2380 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm19, %zmm3, %zmm16 {%k1} # zmm16 = (zmm3 * zmm19) + zmm16
 | 
			
		||||
2381 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm21, %zmm21, %zmm19
 | 
			
		||||
2382 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm20, %zmm20, %zmm19 # zmm19 = (zmm20 * zmm20) + zmm19
 | 
			
		||||
2383 | 0.25        |      |             |             |      | 0.75 |      |      ||      |      |   vfmadd231pd %zmm18, %zmm18, %zmm19 # zmm19 = (zmm18 * zmm18) + zmm19
 | 
			
		||||
2384 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vfmadd231pd %zmm17, %zmm3, %zmm11 {%k1} # zmm11 = (zmm3 * zmm17) + zmm11
 | 
			
		||||
2385 | 2.00        |      |             |             |      | 1.00 |      |      ||      |      |   vrcp14pd %zmm19, %zmm17
 | 
			
		||||
2386 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vfmadd231pd %zmm4, %zmm3, %zmm7 {%k1} # zmm7 = (zmm3 * zmm4) + zmm7
 | 
			
		||||
2387 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vmulpd %zmm22, %zmm17, %zmm3
 | 
			
		||||
2388 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vmulpd %zmm17, %zmm17, %zmm4
 | 
			
		||||
2389 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vmulpd %zmm3, %zmm4, %zmm3
 | 
			
		||||
2390 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vmulpd %zmm17, %zmm1, %zmm4
 | 
			
		||||
2391 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vmulpd %zmm3, %zmm4, %zmm4
 | 
			
		||||
2392 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vaddpd %zmm2, %zmm3, %zmm3
 | 
			
		||||
2393 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vmulpd %zmm3, %zmm4, %zmm3
 | 
			
		||||
2394 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   shlb $3, %dl
 | 
			
		||||
2395 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   shlb $7, %bl
 | 
			
		||||
2396 | 0.00        | 0.50 |             |             |      | 0.00 | 0.50 |      ||      |      |   subb %bl, %dl
 | 
			
		||||
2397 | 0.00        | 0.50 |             |             |      | 0.00 | 0.50 |      ||      |      |   addb $-9, %dl
 | 
			
		||||
2398 | 1.00        |      |             |             |      |      |      |      ||      |      |   kmovd %edx, %k1
 | 
			
		||||
2399 |             |      |             |             |      |      |      |      ||      |      | X vcmpltpd %zmm0, %zmm19, %k1 {%k1}
 | 
			
		||||
2400 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vfmadd231pd %zmm18, %zmm3, %zmm13 {%k1} # zmm13 = (zmm3 * zmm18) + zmm13
 | 
			
		||||
2401 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vfmadd231pd %zmm20, %zmm3, %zmm9 {%k1} # zmm9 = (zmm3 * zmm20) + zmm9
 | 
			
		||||
2402 | 0.00        |      |             |             |      | 1.00 |      |      ||      |  4.0 |   vfmadd231pd %zmm21, %zmm3, %zmm5 {%k1} # zmm5 = (zmm3 * zmm21) + zmm5
 | 
			
		||||
2403 | 0.00        | 0.50 |             |             |      | 0.00 | 0.50 |      ||      |      |   incq %r14
 | 
			
		||||
2404 | 0.00        | 1.00 |             |             |      | 0.00 | 0.00 |      ||      |      |   cmpq %r14, %r11
 | 
			
		||||
2405 |             |      |             |             |      |      |      |      ||      |      | * jne .LBB5_11
 | 
			
		||||
 | 
			
		||||
       40.0          14.5   5.00   5.00   5.00   5.00          40.0   14.5           50.0    4.0  
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Loop-Carried Dependencies Analysis Report
 | 
			
		||||
-----------------------------------------
 | 
			
		||||
2402 |  4.0 | vfmadd231pd	%zmm21, %zmm3, %zmm5 {%k1} # zmm5 = (zmm3 * zmm21) + zmm5| [2402]
 | 
			
		||||
2401 |  4.0 | vfmadd231pd	%zmm20, %zmm3, %zmm9 {%k1} # zmm9 = (zmm3 * zmm20) + zmm9| [2401]
 | 
			
		||||
2400 |  4.0 | vfmadd231pd	%zmm18, %zmm3, %zmm13 {%k1} # zmm13 = (zmm3 * zmm18) + zmm13| [2400]
 | 
			
		||||
2386 |  4.0 | vfmadd231pd	%zmm4, %zmm3, %zmm7 {%k1} # zmm7 = (zmm3 * zmm4) + zmm7| [2386]
 | 
			
		||||
2384 |  4.0 | vfmadd231pd	%zmm17, %zmm3, %zmm11 {%k1} # zmm11 = (zmm3 * zmm17) + zmm11| [2384]
 | 
			
		||||
2380 |  4.0 | vfmadd231pd	%zmm19, %zmm3, %zmm16 {%k1} # zmm16 = (zmm3 * zmm19) + zmm16| [2380]
 | 
			
		||||
2361 |  4.0 | vfmadd231pd	%zmm18, %zmm3, %zmm6 {%k1} # zmm6 = (zmm3 * zmm18) + zmm6| [2361]
 | 
			
		||||
2359 |  4.0 | vfmadd231pd	%zmm20, %zmm3, %zmm10 {%k1} # zmm10 = (zmm3 * zmm20) + zmm10| [2359]
 | 
			
		||||
2355 |  4.0 | vfmadd231pd	%zmm21, %zmm3, %zmm14 {%k1} # zmm14 = (zmm3 * zmm21) + zmm14| [2355]
 | 
			
		||||
2338 |  4.0 | vfmadd231pd	%zmm17, %zmm19, %zmm8 {%k1} # zmm8 = (zmm19 * zmm17) + zmm8| [2338]
 | 
			
		||||
2334 |  4.0 | vfmadd231pd	%zmm3, %zmm19, %zmm12 {%k1} # zmm12 = (zmm19 * zmm3) + zmm12| [2334]
 | 
			
		||||
2330 |  4.0 | vfmadd231pd	%zmm4, %zmm19, %zmm15 {%k1} # zmm15 = (zmm19 * zmm4) + zmm15| [2330]
 | 
			
		||||
2394 |  3.0 | shlb	$3, %dl                        | [2394, 2396, 2397]
 | 
			
		||||
2318 |  3.0 | shlb	$4, %bpl                       | [2318, 2325, 2326]
 | 
			
		||||
2403 |  1.0 | incq	%r14                           | [2403]
 | 
			
		||||
 | 
			
		||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							@@ -1,198 +0,0 @@
 | 
			
		||||
Intel(R) Architecture Code Analyzer Version -  v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
 | 
			
		||||
Analyzed File -  gromacs-icc-avx512-dp.o
 | 
			
		||||
Binary Format - 64Bit
 | 
			
		||||
Architecture  -  SKX
 | 
			
		||||
Analysis Type - Throughput
 | 
			
		||||
 | 
			
		||||
Throughput Analysis Report
 | 
			
		||||
--------------------------
 | 
			
		||||
Block Throughput: 62.00 Cycles       Throughput Bottleneck: Backend
 | 
			
		||||
Loop Count:  22
 | 
			
		||||
Port Binding In Cycles Per Iteration:
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
|  Port  |   0   -  DV   |   1   |   2   -  D    |   3   -  D    |   4   |   5   |   6   |   7   |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
| Cycles | 58.0     0.0  | 16.0  | 16.0    15.0  | 16.0    15.0  |  2.0  | 58.0  | 16.0  |  0.0  |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
DV - Divider pipe (on port 0)
 | 
			
		||||
D - Data fetch pipe (on ports 2 and 3)
 | 
			
		||||
F - Macro Fusion with the previous instruction occurred
 | 
			
		||||
* - instruction micro-ops not bound to a port
 | 
			
		||||
^ - Micro Fusion occurred
 | 
			
		||||
# - ESP Tracking sync uop was issued
 | 
			
		||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
 | 
			
		||||
X - instruction not supported, was not accounted in Analysis
 | 
			
		||||
 | 
			
		||||
| Num Of   |                    Ports pressure in cycles                         |      |
 | 
			
		||||
|  Uops    |  0  - DV    |  1   |  2  -  D    |  3  -  D    |  4   |  5   |  6   |  7   |
 | 
			
		||||
-----------------------------------------------------------------------------------------
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | mov edx, dword ptr [r10+rsi*4]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | inc rsi
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovups zmm20, zmmword ptr [rsp+0x380]
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovups zmm25, zmmword ptr [rsp+0x340]
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovups zmm24, zmmword ptr [rsp+0x1c0]
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovups zmm23, zmmword ptr [rsp+0x2c0]
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovups zmm16, zmmword ptr [rsp+0x3c0]
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovups zmm14, zmmword ptr [rsp+0x300]
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovups zmm15, zmmword ptr [rsp+0x240]
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovups zmm12, zmmword ptr [rsp+0x180]
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovups zmm21, zmmword ptr [rsp+0x200]
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovups zmm18, zmmword ptr [rsp+0x140]
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovups zmm22, zmmword ptr [rsp+0x100]
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovups zmm17, zmmword ptr [rsp+0x280]
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea r12d, ptr [rdx+rdx*2]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl r12d, 0x3
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea r13d, ptr [rdx+rdx*1]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | movsxd r12, r12d
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | cmp r13d, r11d
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea eax, ptr [rdx+rdx*1+0x1]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | mov edx, 0x0
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | setz dl
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | cmp eax, r11d
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | mov eax, 0x0
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | mov r13d, edx
 | 
			
		||||
|   2      | 1.0         |      |             | 1.0     1.0 |      |      |      |      | vsubpd zmm29, zmm20, zmmword ptr [r8+r12*8+0x80]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | setz al
 | 
			
		||||
|   2      |             |      | 1.0     1.0 |             |      | 1.0  |      |      | vsubpd zmm26, zmm25, zmmword ptr [r8+r12*8+0x80]
 | 
			
		||||
|   2      | 1.0         |      |             | 1.0     1.0 |      |      |      |      | vsubpd zmm25, zmm24, zmmword ptr [r8+r12*8]
 | 
			
		||||
|   2      |             |      | 1.0     1.0 |             |      | 1.0  |      |      | vsubpd zmm24, zmm23, zmmword ptr [r8+r12*8+0x40]
 | 
			
		||||
|   2      | 1.0         |      |             | 1.0     1.0 |      |      |      |      | vsubpd zmm23, zmm16, zmmword ptr [r8+r12*8+0x80]
 | 
			
		||||
|   2      |             |      | 1.0     1.0 |             |      | 1.0  |      |      | vsubpd zmm20, zmm14, zmmword ptr [r8+r12*8+0x80]
 | 
			
		||||
|   2      | 1.0         |      |             | 1.0     1.0 |      |      |      |      | vsubpd zmm27, zmm12, zmmword ptr [r8+r12*8+0x40]
 | 
			
		||||
|   2      |             |      | 1.0     1.0 |             |      | 1.0  |      |      | vsubpd zmm28, zmm15, zmmword ptr [r8+r12*8]
 | 
			
		||||
|   2      | 1.0         |      |             | 1.0     1.0 |      |      |      |      | vsubpd zmm30, zmm21, zmmword ptr [r8+r12*8+0x40]
 | 
			
		||||
|   2      |             |      | 1.0     1.0 |             |      | 1.0  |      |      | vsubpd zmm21, zmm18, zmmword ptr [r8+r12*8+0x40]
 | 
			
		||||
|   2      | 1.0         |      |             | 1.0     1.0 |      |      |      |      | vsubpd zmm31, zmm22, zmmword ptr [r8+r12*8]
 | 
			
		||||
|   2      |             |      | 1.0     1.0 |             |      | 1.0  |      |      | vsubpd zmm22, zmm17, zmmword ptr [r8+r12*8]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm13, zmm29, zmm29
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm15, zmm26, zmm26
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm12, zmm23, zmm23
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm14, zmm20, zmm20
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovups zmm16, zmmword ptr [rsp+0xc0]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm13, zmm30, zmm30
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm15, zmm27, zmm27
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm12, zmm24, zmm24
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm14, zmm21, zmm21
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm13, zmm31, zmm31
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm15, zmm28, zmm28
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm12, zmm25, zmm25
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm14, zmm22, zmm22
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14pd zmm19, zmm13
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14pd zmm18, zmm15
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14pd zmm17, zmm12
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmppd k1, zmm13, zmm16, 0x11
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmppd k6, zmm15, zmm16, 0x11
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmppd k7, zmm12, zmm16, 0x11
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmppd k0, zmm14, zmm16, 0x11
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14pd zmm15, zmm14
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovups zmm16, zmmword ptr [rsp+0x40]
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovups zmm12, zmmword ptr [rsp+0x80]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm13, zmm19, zmm16
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm13, zmm19, zmm13
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | neg r13d
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm14, zmm19, zmm13
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | mov r12d, eax
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmsub213pd zmm13, zmm19, zmm1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm19, zmm19, zmm12
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm13, zmm13, zmm19
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | add r13d, 0xff
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm14, zmm14, zmm13
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | nop 
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovups zmm13, zmmword ptr [rsp+0x400]
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm19, zmm10, zmm14
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl r12d, 0x4
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | sub r13d, r12d
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovb k5, r13d
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kmovw r13d, k1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kmovb r12d, k5
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovb k5, r12d
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovb k1, r13d
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | mov r13d, eax
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kandb k5, k5, k1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kmovb r12d, k5
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovw k5, r12d
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea r12d, ptr [rdx+rdx*1]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm9{k5}, zmm19, zmm29
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | neg r12d
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm13{k5}, zmm19, zmm31
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovups zmm31, zmmword ptr [rsp+0x440]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm29, zmm18, zmm16
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm31{k5}, zmm19, zmm30
 | 
			
		||||
|   2^     |             |      | 1.0         |             | 1.0  |      |      |      | vmovups zmmword ptr [rsp+0x400], zmm13
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm30, zmm18, zmm29
 | 
			
		||||
|   2^     |             |      |             | 1.0         | 1.0  |      |      |      | vmovups zmmword ptr [rsp+0x440], zmm31
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm13, zmm18, zmm30
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmsub213pd zmm30, zmm18, zmm1
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm18, zmm18, zmm12
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm14, zmm30, zmm18
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | add r12d, 0xff
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm19, zmm13, zmm14
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm29, zmm10, zmm19
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl r13d, 0x5
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | sub r12d, r13d
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovb k1, r12d
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kmovw r12d, k6
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kmovb r13d, k1
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovb k1, r13d
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovb k6, r12d
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | mov r12d, eax
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kandb k1, k1, k6
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kmovb r13d, k1
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovw k1, r13d
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea r13d, ptr [rdx*4]
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm6{k1}, zmm29, zmm26
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | neg r13d
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm7{k1}, zmm29, zmm27
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm8{k1}, zmm29, zmm28
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm26, zmm17, zmm16
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm28, zmm17, zmm12
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm16, zmm15, zmm16
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm12, zmm15, zmm12
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm27, zmm17, zmm26
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm19, zmm15, zmm16
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm13, zmm17, zmm27
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmsub213pd zmm27, zmm17, zmm1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm14, zmm27, zmm28
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | add r13d, 0xff
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm17, zmm13, zmm14
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl edx, 0x3
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl r12d, 0x6
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | neg edx
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm18, zmm10, zmm17
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | sub r13d, r12d
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovb k6, r13d
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | add edx, 0xff
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl eax, 0x7
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | sub edx, eax
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kmovb eax, k6
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovb k6, eax
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kmovw eax, k7
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovb k7, eax
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kandb k7, k6, k7
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovb k6, edx
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kmovb edx, k7
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovw k7, edx
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kmovw edx, k0
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm11{k7}, zmm18, zmm23
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm4{k7}, zmm18, zmm24
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm5{k7}, zmm18, zmm25
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm23, zmm15, zmm19
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmsub213pd zmm19, zmm15, zmm1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm15, zmm19, zmm12
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm24, zmm23, zmm15
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm25, zmm10, zmm24
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kmovb eax, k6
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovb k6, eax
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovb k0, edx
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kandb k0, k6, k0
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kmovb r12d, k0
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovw k6, r12d
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm3{k6}, zmm25, zmm22
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm2{k6}, zmm25, zmm21
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm0{k6}, zmm25, zmm20
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | cmp rsi, rdi
 | 
			
		||||
|   0*F    |             |      |             |             |      |      |      |      | jl 0xfffffffffffffc6f
 | 
			
		||||
Total Num Of Uops: 187
 | 
			
		||||
Analysis Notes:
 | 
			
		||||
Backend allocation was stalled due to unavailable allocation resources.
 | 
			
		||||
@@ -1,152 +0,0 @@
 | 
			
		||||
Intel(R) Architecture Code Analyzer Version -  v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
 | 
			
		||||
Analyzed File -  gromacs-icc-avx512-sp.o
 | 
			
		||||
Binary Format - 64Bit
 | 
			
		||||
Architecture  -  SKX
 | 
			
		||||
Analysis Type - Throughput
 | 
			
		||||
 | 
			
		||||
Throughput Analysis Report
 | 
			
		||||
--------------------------
 | 
			
		||||
Block Throughput: 51.00 Cycles       Throughput Bottleneck: Backend
 | 
			
		||||
Loop Count:  22
 | 
			
		||||
Port Binding In Cycles Per Iteration:
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
|  Port  |   0   -  DV   |   1   |   2   -  D    |   3   -  D    |   4   |   5   |   6   |   7   |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
| Cycles | 47.5     0.0  |  9.0  | 11.0    11.0  | 11.0     8.0  |  3.0  | 47.5  |  9.0  |  0.0  |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
DV - Divider pipe (on port 0)
 | 
			
		||||
D - Data fetch pipe (on ports 2 and 3)
 | 
			
		||||
F - Macro Fusion with the previous instruction occurred
 | 
			
		||||
* - instruction micro-ops not bound to a port
 | 
			
		||||
^ - Micro Fusion occurred
 | 
			
		||||
# - ESP Tracking sync uop was issued
 | 
			
		||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
 | 
			
		||||
X - instruction not supported, was not accounted in Analysis
 | 
			
		||||
 | 
			
		||||
| Num Of   |                    Ports pressure in cycles                         |      |
 | 
			
		||||
|  Uops    |  0  - DV    |  1   |  2  -  D    |  3  -  D    |  4   |  5   |  6   |  7   |
 | 
			
		||||
-----------------------------------------------------------------------------------------
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | mov edi, dword ptr [rcx+rax*4]
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | mov r12d, r13d
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | movsxd rdi, edi
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | inc rax
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovups zmm10, zmmword ptr [rsp+0x140]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | test edi, 0x7fffffff
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovups zmm11, zmmword ptr [rsp+0x100]
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovups zmm9, zmmword ptr [rsp+0xc0]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | setz r12b
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea r14, ptr [rdi+rdi*2]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl r14, 0x5
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | mov r8d, r12d
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | neg r8d
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | mov r11d, r12d
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | add r8d, 0xff
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovw k0, r8d
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea r9d, ptr [r12+r12*2]
 | 
			
		||||
|   2      | 1.0         |      | 1.0     1.0 |             |      |      |      |      | vsubps zmm3, zmm13, zmmword ptr [r14+rbx*1+0x40]
 | 
			
		||||
|   2      | 0.5         |      |             | 1.0     1.0 |      | 0.5  |      |      | vsubps zmm4, zmm10, zmmword ptr [r14+rbx*1]
 | 
			
		||||
|   2      | 0.5         |      | 1.0     1.0 |             |      | 0.5  |      |      | vsubps zmm10, zmm11, zmmword ptr [r14+rbx*1+0x40]
 | 
			
		||||
|   2      | 0.5         |      |             | 1.0     1.0 |      | 0.5  |      |      | vsubps zmm5, zmm17, zmmword ptr [r14+rbx*1+0x20]
 | 
			
		||||
|   2      | 0.5         |      | 1.0     1.0 |             |      | 0.5  |      |      | vsubps zmm27, zmm19, zmmword ptr [r14+rbx*1]
 | 
			
		||||
|   2      | 0.5         |      |             | 1.0     1.0 |      | 0.5  |      |      | vsubps zmm8, zmm15, zmmword ptr [r14+rbx*1+0x20]
 | 
			
		||||
|   2      | 0.5         |      | 1.0     1.0 |             |      | 0.5  |      |      | vsubps zmm11, zmm0, zmmword ptr [r14+rbx*1+0x40]
 | 
			
		||||
|   2      | 0.5         |      |             | 1.0     1.0 |      | 0.5  |      |      | vsubps zmm7, zmm9, zmmword ptr [r14+rbx*1]
 | 
			
		||||
|   2      | 0.5         |      | 1.0     1.0 |             |      | 0.5  |      |      | vsubps zmm9, zmm14, zmmword ptr [r14+rbx*1+0x20]
 | 
			
		||||
|   2      | 0.5         |      |             | 1.0     1.0 |      | 0.5  |      |      | vsubps zmm29, zmm12, zmmword ptr [r14+rbx*1+0x40]
 | 
			
		||||
|   2      | 0.5         |      | 1.0     1.0 |             |      | 0.5  |      |      | vsubps zmm28, zmm16, zmmword ptr [r14+rbx*1+0x20]
 | 
			
		||||
|   2      | 0.5         |      |             | 1.0     1.0 |      | 0.5  |      |      | vsubps zmm25, zmm18, zmmword ptr [r14+rbx*1]
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm2, zmm3, zmm3
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm30, zmm10, zmm10
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm1, zmm11, zmm11
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm26, zmm29, zmm29
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm2, zmm5, zmm5
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm30, zmm8, zmm8
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm1, zmm9, zmm9
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm26, zmm28, zmm28
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm2, zmm27, zmm27
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm30, zmm4, zmm4
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm1, zmm7, zmm7
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm26, zmm25, zmm25
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14ps zmm31, zmm2
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmpps k7, zmm30, zmm24, 0x11
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14ps zmm6, zmm30
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmpps k3, zmm2, zmm24, 0x11
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14ps zmm2, zmm1
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmpps k5, zmm26, zmm24, 0x11
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14ps zmm26, zmm26
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm30, zmm31, zmm23
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kandw k2, k0, k3
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmpps k3, zmm1, zmm24, 0x11
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm1, zmm31, zmm30
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm30, zmm31, zmm1
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | neg r9d
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmsub213ps zmm1, zmm31, zmm20
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm31, zmm31, zmm22
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm31, zmm1, zmm31
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | add r9d, 0xff
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovw k4, r9d
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm30, zmm30, zmm31
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kandw k1, k4, k5
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm1, zmm21, zmm30
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm30, zmm26, zmm23
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm31, zmm26, zmm30
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea r10d, ptr [r12*8]
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm30, zmm26, zmm31
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | neg r10d
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmsub213ps zmm31, zmm26, zmm20
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm26, zmm26, zmm22
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm31, zmm31, zmm26
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | add r10d, r12d
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm30, zmm30, zmm31
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | add r10d, 0xff
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovw k6, r10d
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm26, zmm21, zmm30
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kandw k4, k6, k7
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm25{k1}{z}, zmm25, zmm26
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm31{k1}{z}, zmm28, zmm26
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm28, zmm6, zmm23
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm30{k1}{z}, zmm29, zmm26
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm29, zmm2, zmm23
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm25{k2}, zmm27, zmm1
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm31{k2}, zmm5, zmm1
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm27, zmm6, zmm28
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm30{k2}, zmm3, zmm1
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm1, zmm2, zmm29
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm5, zmm6, zmm27
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmsub213ps zmm27, zmm6, zmm20
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm6, zmm6, zmm22
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm3, zmm2, zmm1
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmsub213ps zmm1, zmm2, zmm20
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm2, zmm2, zmm22
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm26, zmm27, zmm6
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm1, zmm1, zmm2
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm5, zmm5, zmm26
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm3, zmm3, zmm1
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm6, zmm21, zmm5
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm27, zmm21, zmm3
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm25{k4}, zmm4, zmm6
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovups zmm4, zmmword ptr [r14+rsi*1]
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm31{k4}, zmm8, zmm6
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm30{k4}, zmm10, zmm6
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl r11d, 0x4
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | sub r12d, r11d
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | add r12d, 0xff
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovw k0, r12d
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kandw k5, k0, k3
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm25{k5}, zmm7, zmm27
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm31{k5}, zmm9, zmm27
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm30{k5}, zmm11, zmm27
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vsubps zmm7, zmm4, zmm25
 | 
			
		||||
|   2      |             |      |             | 1.0         | 1.0  |      |      |      | vmovups zmmword ptr [r14+rsi*1], zmm7
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovups zmm8, zmmword ptr [r14+rsi*1+0x20]
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vsubps zmm4, zmm8, zmm31
 | 
			
		||||
|   2      |             |      |             | 1.0         | 1.0  |      |      |      | vmovups zmmword ptr [r14+rsi*1+0x20], zmm4
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovups zmm1, zmmword ptr [r14+rsi*1+0x40]
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vsubps zmm2, zmm1, zmm30
 | 
			
		||||
|   2      |             |      |             | 1.0         | 1.0  |      |      |      | vmovups zmmword ptr [r14+rsi*1+0x40], zmm2
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | cmp rax, rdx
 | 
			
		||||
|   0*F    |             |      |             |             |      |      |      |      | jb 0xfffffffffffffd30
 | 
			
		||||
Total Num Of Uops: 142
 | 
			
		||||
Analysis Notes:
 | 
			
		||||
Backend allocation was stalled due to unavailable allocation resources.
 | 
			
		||||
@@ -1,154 +0,0 @@
 | 
			
		||||
Intel(R) Architecture Code Analyzer Version -  v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
 | 
			
		||||
Analyzed File -  gromacs-icx-avx512-dp.o
 | 
			
		||||
Binary Format - 64Bit
 | 
			
		||||
Architecture  -  SKX
 | 
			
		||||
Analysis Type - Throughput
 | 
			
		||||
 | 
			
		||||
Throughput Analysis Report
 | 
			
		||||
--------------------------
 | 
			
		||||
Block Throughput: 49.26 Cycles       Throughput Bottleneck: Backend
 | 
			
		||||
Loop Count:  22
 | 
			
		||||
Port Binding In Cycles Per Iteration:
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
|  Port  |   0   -  DV   |   1   |   2   -  D    |   3   -  D    |   4   |   5   |   6   |   7   |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
| Cycles | 44.0     0.0  | 13.5  |  5.5     5.5  |  5.5     5.5  |  0.0  | 44.0  | 13.5  |  0.0  |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
DV - Divider pipe (on port 0)
 | 
			
		||||
D - Data fetch pipe (on ports 2 and 3)
 | 
			
		||||
F - Macro Fusion with the previous instruction occurred
 | 
			
		||||
* - instruction micro-ops not bound to a port
 | 
			
		||||
^ - Micro Fusion occurred
 | 
			
		||||
# - ESP Tracking sync uop was issued
 | 
			
		||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
 | 
			
		||||
X - instruction not supported, was not accounted in Analysis
 | 
			
		||||
 | 
			
		||||
| Num Of   |                    Ports pressure in cycles                         |      |
 | 
			
		||||
|  Uops    |  0  - DV    |  1   |  2  -  D    |  3  -  D    |  4   |  5   |  6   |  7   |
 | 
			
		||||
-----------------------------------------------------------------------------------------
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | movsxd rcx, dword ptr [r10+rbx*4]
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea rdx, ptr [rcx+rcx*2]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl rdx, 0x6
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovupd zmm28, zmmword ptr [rsi+rdx*1]
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovupd zmm29, zmmword ptr [rsi+rdx*1+0x40]
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovupd zmm30, zmmword ptr [rsi+rdx*1+0x80]
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovupd zmm3, zmmword ptr [rsp+0x10]
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubpd zmm3, zmm3, zmm28
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubpd zmm31, zmm24, zmm30
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovupd zmm16, zmmword ptr [rsp+0x150]
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubpd zmm16, zmm16, zmm29
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm17, zmm31, zmm31
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm17, zmm16, zmm16
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm17, zmm3, zmm3
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14pd zmm18, zmm17
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm19, zmm21, zmm18
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm19, zmm18, zmm19
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm19, zmm18, zmm19
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vaddpd zmm20, zmm19, zmm1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm18, zmm22, zmm18
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm18, zmm18, zmm20
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubpd zmm20, zmm25, zmm30
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea edx, ptr [rcx+rcx*1]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | cmp r11, rdx
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | setnz dl
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | setz al
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | add ecx, ecx
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | inc ecx
 | 
			
		||||
|   1      |             | 0.5  |             |             |      |      | 0.5  |      | cmp r11, rcx
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | setz cl
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm18, zmm19, zmm18
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovupd zmm19, zmmword ptr [rsp+0x210]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubpd zmm19, zmm19, zmm28
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | setnz dil
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | mov ebp, edi
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl bpl, 0x4
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | sub bpl, al
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | add bpl, 0xef
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovd k1, ebp
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmppd k1{k1}, zmm17, zmm0, 0x1
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovupd zmm17, zmmword ptr [rsp+0x110]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubpd zmm17, zmm17, zmm29
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea eax, ptr [rdx+rdx*1]
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | mov ebp, edi
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm18, zmm18, zmm2
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm14{k1}, zmm3, zmm18
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm3, zmm20, zmm20
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm3, zmm17, zmm17
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm3, zmm19, zmm19
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm11{k1}, zmm16, zmm18
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14pd zmm16, zmm3
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm7{k1}, zmm31, zmm18
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm18, zmm21, zmm16
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm18, zmm16, zmm18
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm18, zmm16, zmm18
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vaddpd zmm31, zmm18, zmm1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm16, zmm22, zmm16
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm16, zmm16, zmm31
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovupd zmm31, zmmword ptr [rsp+0x1d0]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubpd zmm31, zmm31, zmm28
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl bpl, 0x5
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | or bpl, al
 | 
			
		||||
|   1      |             | 0.5  |             |             |      |      | 0.5  |      | or bpl, 0xdd
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovd k1, ebp
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmppd k1{k1}, zmm3, zmm0, 0x1
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovupd zmm3, zmmword ptr [rsp+0xd0]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubpd zmm3, zmm3, zmm29
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm16, zmm18, zmm16
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubpd zmm18, zmm26, zmm30
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm16, zmm16, zmm2
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm15{k1}, zmm19, zmm16
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm19, zmm18, zmm18
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm19, zmm3, zmm3
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm19, zmm31, zmm31
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm10{k1}, zmm17, zmm16
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14pd zmm17, zmm19
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm6{k1}, zmm20, zmm16
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm16, zmm21, zmm17
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm16, zmm17, zmm16
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm16, zmm17, zmm16
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vaddpd zmm20, zmm16, zmm1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm17, zmm22, zmm17
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm17, zmm17, zmm20
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm16, zmm16, zmm17
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea eax, ptr [rdx*4]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl dil, 0x6
 | 
			
		||||
|   1      |             | 0.5  |             |             |      |      | 0.5  |      | or dil, al
 | 
			
		||||
|   1      |             | 0.5  |             |             |      |      | 0.5  |      | or dil, 0xbb
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovd k1, edi
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmppd k1{k1}, zmm19, zmm0, 0x1
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovupd zmm17, zmmword ptr [rsp+0x190]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubpd zmm17, zmm17, zmm28
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubpd zmm19, zmm23, zmm29
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubpd zmm20, zmm27, zmm30
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm16, zmm16, zmm2
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm13{k1}, zmm31, zmm16
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm28, zmm20, zmm20
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm28, zmm19, zmm19
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm28, zmm17, zmm17
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm9{k1}, zmm3, zmm16
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14pd zmm3, zmm28
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm5{k1}, zmm18, zmm16
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm16, zmm21, zmm3
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm16, zmm3, zmm16
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm16, zmm3, zmm16
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vaddpd zmm18, zmm16, zmm1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm3, zmm22, zmm3
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm3, zmm3, zmm18
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm3, zmm16, zmm3
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl dl, 0x3
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl cl, 0x7
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | or cl, dl
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | add cl, 0xf7
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovd k1, ecx
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmppd k1{k1}, zmm28, zmm0, 0x1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm3, zmm3, zmm2
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm12{k1}, zmm17, zmm3
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm8{k1}, zmm19, zmm3
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm4{k1}, zmm20, zmm3
 | 
			
		||||
|   1      |             | 0.5  |             |             |      |      | 0.5  |      | inc rbx
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | cmp r9, rbx
 | 
			
		||||
|   0*F    |             |      |             |             |      |      |      |      | jnz 0xfffffffffffffd5a
 | 
			
		||||
Total Num Of Uops: 129
 | 
			
		||||
Analysis Notes:
 | 
			
		||||
Backend allocation was stalled due to unavailable allocation resources.
 | 
			
		||||
@@ -1,288 +0,0 @@
 | 
			
		||||
 | 
			
		||||
[0] Code Region
 | 
			
		||||
 | 
			
		||||
Iterations:        100
 | 
			
		||||
Instructions:      12200
 | 
			
		||||
Total Cycles:      4745
 | 
			
		||||
Total uOps:        14000
 | 
			
		||||
 | 
			
		||||
Dispatch Width:    6
 | 
			
		||||
uOps Per Cycle:    2.95
 | 
			
		||||
IPC:               2.57
 | 
			
		||||
Block RThroughput: 34.0
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Instruction Info:
 | 
			
		||||
[1]: #uOps
 | 
			
		||||
[2]: Latency
 | 
			
		||||
[3]: RThroughput
 | 
			
		||||
[4]: MayLoad
 | 
			
		||||
[5]: MayStore
 | 
			
		||||
[6]: HasSideEffects (U)
 | 
			
		||||
 | 
			
		||||
[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 | 
			
		||||
 1      5     0.50    *                   movslq	(%r10,%rbx,4), %rcx
 | 
			
		||||
 1      1     0.50                        leaq	(%rcx,%rcx,2), %rdx
 | 
			
		||||
 1      1     0.50                        shlq	$6, %rdx
 | 
			
		||||
 2      8     0.50    *                   vmovupd	(%rsi,%rdx), %zmm28
 | 
			
		||||
 2      8     0.50    *                   vmovupd	64(%rsi,%rdx), %zmm29
 | 
			
		||||
 2      8     0.50    *                   vmovupd	128(%rsi,%rdx), %zmm30
 | 
			
		||||
 2      8     0.50    *                   vmovupd	16(%rsp), %zmm3
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm28, %zmm3, %zmm3
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm30, %zmm24, %zmm31
 | 
			
		||||
 2      8     0.50    *                   vmovupd	336(%rsp), %zmm16
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm29, %zmm16, %zmm16
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm31, %zmm31, %zmm17
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm16, %zmm16, %zmm17
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm3, %zmm3, %zmm17
 | 
			
		||||
 3      4     2.00                        vrcp14pd	%zmm17, %zmm18
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm18, %zmm21, %zmm19
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm19, %zmm18, %zmm19
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm19, %zmm18, %zmm19
 | 
			
		||||
 1      4     0.50                        vaddpd	%zmm1, %zmm19, %zmm20
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm18, %zmm22, %zmm18
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm20, %zmm18, %zmm18
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm30, %zmm25, %zmm20
 | 
			
		||||
 1      1     0.50                        leal	(%rcx,%rcx), %edx
 | 
			
		||||
 1      1     0.25                        cmpq	%rdx, %r11
 | 
			
		||||
 1      1     0.50                        setne	%dl
 | 
			
		||||
 1      1     0.50                        sete	%al
 | 
			
		||||
 1      1     0.25                        addl	%ecx, %ecx
 | 
			
		||||
 1      1     0.25                        incl	%ecx
 | 
			
		||||
 1      1     0.25                        cmpq	%rcx, %r11
 | 
			
		||||
 1      1     0.50                        sete	%cl
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm18, %zmm19, %zmm18
 | 
			
		||||
 2      8     0.50    *                   vmovupd	528(%rsp), %zmm19
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm28, %zmm19, %zmm19
 | 
			
		||||
 1      1     0.50                        setne	%dil
 | 
			
		||||
 1      1     0.25                        movl	%edi, %ebp
 | 
			
		||||
 1      1     0.50                        shlb	$4, %bpl
 | 
			
		||||
 1      1     0.25                        subb	%al, %bpl
 | 
			
		||||
 1      1     0.25                        addb	$-17, %bpl
 | 
			
		||||
 1      1     1.00                        kmovd	%ebp, %k1
 | 
			
		||||
 1      4     1.00                        vcmpltpd	%zmm0, %zmm17, %k1 {%k1}
 | 
			
		||||
 2      8     0.50    *                   vmovupd	272(%rsp), %zmm17
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm29, %zmm17, %zmm17
 | 
			
		||||
 1      1     0.50                        leal	(%rdx,%rdx), %eax
 | 
			
		||||
 1      1     0.25                        movl	%edi, %ebp
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm2, %zmm18, %zmm18
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm18, %zmm3, %zmm14 {%k1}
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm20, %zmm20, %zmm3
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm17, %zmm17, %zmm3
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm19, %zmm19, %zmm3
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm18, %zmm16, %zmm11 {%k1}
 | 
			
		||||
 3      4     2.00                        vrcp14pd	%zmm3, %zmm16
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm18, %zmm31, %zmm7 {%k1}
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm16, %zmm21, %zmm18
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm18, %zmm16, %zmm18
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm18, %zmm16, %zmm18
 | 
			
		||||
 1      4     0.50                        vaddpd	%zmm1, %zmm18, %zmm31
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm16, %zmm22, %zmm16
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm31, %zmm16, %zmm16
 | 
			
		||||
 2      8     0.50    *                   vmovupd	464(%rsp), %zmm31
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm28, %zmm31, %zmm31
 | 
			
		||||
 1      1     0.50                        shlb	$5, %bpl
 | 
			
		||||
 1      1     0.25                        orb	%al, %bpl
 | 
			
		||||
 1      1     0.25                        orb	$-35, %bpl
 | 
			
		||||
 1      1     1.00                        kmovd	%ebp, %k1
 | 
			
		||||
 1      4     1.00                        vcmpltpd	%zmm0, %zmm3, %k1 {%k1}
 | 
			
		||||
 2      8     0.50    *                   vmovupd	208(%rsp), %zmm3
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm29, %zmm3, %zmm3
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm16, %zmm18, %zmm16
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm30, %zmm26, %zmm18
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm2, %zmm16, %zmm16
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm16, %zmm19, %zmm15 {%k1}
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm18, %zmm18, %zmm19
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm3, %zmm3, %zmm19
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm31, %zmm31, %zmm19
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm16, %zmm17, %zmm10 {%k1}
 | 
			
		||||
 3      4     2.00                        vrcp14pd	%zmm19, %zmm17
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm16, %zmm20, %zmm6 {%k1}
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm17, %zmm21, %zmm16
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm16, %zmm17, %zmm16
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm16, %zmm17, %zmm16
 | 
			
		||||
 1      4     0.50                        vaddpd	%zmm1, %zmm16, %zmm20
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm17, %zmm22, %zmm17
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm20, %zmm17, %zmm17
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm17, %zmm16, %zmm16
 | 
			
		||||
 1      1     0.50                        leal	(,%rdx,4), %eax
 | 
			
		||||
 1      1     0.50                        shlb	$6, %dil
 | 
			
		||||
 1      1     0.25                        orb	%al, %dil
 | 
			
		||||
 1      1     0.25                        orb	$-69, %dil
 | 
			
		||||
 1      1     1.00                        kmovd	%edi, %k1
 | 
			
		||||
 1      4     1.00                        vcmpltpd	%zmm0, %zmm19, %k1 {%k1}
 | 
			
		||||
 2      8     0.50    *                   vmovupd	400(%rsp), %zmm17
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm28, %zmm17, %zmm17
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm29, %zmm23, %zmm19
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm30, %zmm27, %zmm20
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm2, %zmm16, %zmm16
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm16, %zmm31, %zmm13 {%k1}
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm20, %zmm20, %zmm28
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm19, %zmm19, %zmm28
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm17, %zmm17, %zmm28
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm16, %zmm3, %zmm9 {%k1}
 | 
			
		||||
 3      4     2.00                        vrcp14pd	%zmm28, %zmm3
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm16, %zmm18, %zmm5 {%k1}
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm3, %zmm21, %zmm16
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm16, %zmm3, %zmm16
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm16, %zmm3, %zmm16
 | 
			
		||||
 1      4     0.50                        vaddpd	%zmm1, %zmm16, %zmm18
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm3, %zmm22, %zmm3
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm18, %zmm3, %zmm3
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm3, %zmm16, %zmm3
 | 
			
		||||
 1      1     0.50                        shlb	$3, %dl
 | 
			
		||||
 1      1     0.50                        shlb	$7, %cl
 | 
			
		||||
 1      1     0.25                        orb	%dl, %cl
 | 
			
		||||
 1      1     0.25                        addb	$-9, %cl
 | 
			
		||||
 1      1     1.00                        kmovd	%ecx, %k1
 | 
			
		||||
 1      4     1.00                        vcmpltpd	%zmm0, %zmm28, %k1 {%k1}
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm2, %zmm3, %zmm3
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm3, %zmm17, %zmm12 {%k1}
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm3, %zmm19, %zmm8 {%k1}
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm3, %zmm20, %zmm4 {%k1}
 | 
			
		||||
 1      1     0.25                        incq	%rbx
 | 
			
		||||
 1      1     0.25                        cmpq	%rbx, %r9
 | 
			
		||||
 1      1     0.50                        jne	.LBB5_12
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Resources:
 | 
			
		||||
[0]   - SKXDivider
 | 
			
		||||
[1]   - SKXFPDivider
 | 
			
		||||
[2]   - SKXPort0
 | 
			
		||||
[3]   - SKXPort1
 | 
			
		||||
[4]   - SKXPort2
 | 
			
		||||
[5]   - SKXPort3
 | 
			
		||||
[6]   - SKXPort4
 | 
			
		||||
[7]   - SKXPort5
 | 
			
		||||
[8]   - SKXPort6
 | 
			
		||||
[9]   - SKXPort7
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Resource pressure per iteration:
 | 
			
		||||
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    
 | 
			
		||||
 -      -     45.53  20.45  5.50   5.50    -     44.64  18.38   -     
 | 
			
		||||
 | 
			
		||||
Resource pressure by instruction:
 | 
			
		||||
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
 | 
			
		||||
 -      -      -      -     0.50   0.50    -      -      -      -     movslq	(%r10,%rbx,4), %rcx
 | 
			
		||||
 -      -      -     0.99    -      -      -     0.01    -      -     leaq	(%rcx,%rcx,2), %rdx
 | 
			
		||||
 -      -     0.01    -      -      -      -      -     0.99    -     shlq	$6, %rdx
 | 
			
		||||
 -      -     0.01   0.99   0.49   0.51    -      -      -      -     vmovupd	(%rsi,%rdx), %zmm28
 | 
			
		||||
 -      -     0.01   0.91   0.51   0.49    -     0.08    -      -     vmovupd	64(%rsi,%rdx), %zmm29
 | 
			
		||||
 -      -     0.01   0.56   0.49   0.51    -     0.43    -      -     vmovupd	128(%rsi,%rdx), %zmm30
 | 
			
		||||
 -      -      -     0.99   0.50   0.50    -     0.01    -      -     vmovupd	16(%rsp), %zmm3
 | 
			
		||||
 -      -     0.95    -      -      -      -     0.05    -      -     vsubpd	%zmm28, %zmm3, %zmm3
 | 
			
		||||
 -      -     0.48    -      -      -      -     0.52    -      -     vsubpd	%zmm30, %zmm24, %zmm31
 | 
			
		||||
 -      -      -     1.00   0.50   0.50    -      -      -      -     vmovupd	336(%rsp), %zmm16
 | 
			
		||||
 -      -     0.49    -      -      -      -     0.51    -      -     vsubpd	%zmm29, %zmm16, %zmm16
 | 
			
		||||
 -      -     0.48    -      -      -      -     0.52    -      -     vmulpd	%zmm31, %zmm31, %zmm17
 | 
			
		||||
 -      -     0.49    -      -      -      -     0.51    -      -     vfmadd231pd	%zmm16, %zmm16, %zmm17
 | 
			
		||||
 -      -     0.04    -      -      -      -     0.96    -      -     vfmadd231pd	%zmm3, %zmm3, %zmm17
 | 
			
		||||
 -      -     2.00    -      -      -      -     1.00    -      -     vrcp14pd	%zmm17, %zmm18
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulpd	%zmm18, %zmm21, %zmm19
 | 
			
		||||
 -      -     0.51    -      -      -      -     0.49    -      -     vmulpd	%zmm19, %zmm18, %zmm19
 | 
			
		||||
 -      -     0.49    -      -      -      -     0.51    -      -     vmulpd	%zmm19, %zmm18, %zmm19
 | 
			
		||||
 -      -     0.07    -      -      -      -     0.93    -      -     vaddpd	%zmm1, %zmm19, %zmm20
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulpd	%zmm18, %zmm22, %zmm18
 | 
			
		||||
 -      -     0.95    -      -      -      -     0.05    -      -     vmulpd	%zmm20, %zmm18, %zmm18
 | 
			
		||||
 -      -     0.92    -      -      -      -     0.08    -      -     vsubpd	%zmm30, %zmm25, %zmm20
 | 
			
		||||
 -      -      -     0.94    -      -      -     0.06    -      -     leal	(%rcx,%rcx), %edx
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     cmpq	%rdx, %r11
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     setne	%dl
 | 
			
		||||
 -      -     0.44    -      -      -      -      -     0.56    -     sete	%al
 | 
			
		||||
 -      -      -     0.07    -      -      -     0.02   0.91    -     addl	%ecx, %ecx
 | 
			
		||||
 -      -      -     0.53    -      -      -     0.46   0.01    -     incl	%ecx
 | 
			
		||||
 -      -      -     0.51    -      -      -     0.46   0.03    -     cmpq	%rcx, %r11
 | 
			
		||||
 -      -     0.02    -      -      -      -      -     0.98    -     sete	%cl
 | 
			
		||||
 -      -     0.94    -      -      -      -     0.06    -      -     vmulpd	%zmm18, %zmm19, %zmm18
 | 
			
		||||
 -      -     0.01   0.99   0.51   0.49    -      -      -      -     vmovupd	528(%rsp), %zmm19
 | 
			
		||||
 -      -     0.47    -      -      -      -     0.53    -      -     vsubpd	%zmm28, %zmm19, %zmm19
 | 
			
		||||
 -      -     0.04    -      -      -      -      -     0.96    -     setne	%dil
 | 
			
		||||
 -      -      -     0.95    -      -      -     0.02   0.03    -     movl	%edi, %ebp
 | 
			
		||||
 -      -     0.01    -      -      -      -      -     0.99    -     shlb	$4, %bpl
 | 
			
		||||
 -      -      -     0.96    -      -      -      -     0.04    -     subb	%al, %bpl
 | 
			
		||||
 -      -      -     0.06    -      -      -      -     0.94    -     addb	$-17, %bpl
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     kmovd	%ebp, %k1
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vcmpltpd	%zmm0, %zmm17, %k1 {%k1}
 | 
			
		||||
 -      -     0.02   0.97   0.50   0.50    -     0.01    -      -     vmovupd	272(%rsp), %zmm17
 | 
			
		||||
 -      -     0.96    -      -      -      -     0.04    -      -     vsubpd	%zmm29, %zmm17, %zmm17
 | 
			
		||||
 -      -      -     1.00    -      -      -      -      -      -     leal	(%rdx,%rdx), %eax
 | 
			
		||||
 -      -      -     0.05    -      -      -      -     0.95    -     movl	%edi, %ebp
 | 
			
		||||
 -      -     0.51    -      -      -      -     0.49    -      -     vmulpd	%zmm2, %zmm18, %zmm18
 | 
			
		||||
 -      -     0.53    -      -      -      -     0.47    -      -     vfmadd231pd	%zmm18, %zmm3, %zmm14 {%k1}
 | 
			
		||||
 -      -     0.45    -      -      -      -     0.55    -      -     vmulpd	%zmm20, %zmm20, %zmm3
 | 
			
		||||
 -      -     0.94    -      -      -      -     0.06    -      -     vfmadd231pd	%zmm17, %zmm17, %zmm3
 | 
			
		||||
 -      -     0.03    -      -      -      -     0.97    -      -     vfmadd231pd	%zmm19, %zmm19, %zmm3
 | 
			
		||||
 -      -     0.47    -      -      -      -     0.53    -      -     vfmadd231pd	%zmm18, %zmm16, %zmm11 {%k1}
 | 
			
		||||
 -      -     2.00    -      -      -      -     1.00    -      -     vrcp14pd	%zmm3, %zmm16
 | 
			
		||||
 -      -     0.53    -      -      -      -     0.47    -      -     vfmadd231pd	%zmm18, %zmm31, %zmm7 {%k1}
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vmulpd	%zmm16, %zmm21, %zmm18
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vmulpd	%zmm18, %zmm16, %zmm18
 | 
			
		||||
 -      -     0.97    -      -      -      -     0.03    -      -     vmulpd	%zmm18, %zmm16, %zmm18
 | 
			
		||||
 -      -     0.52    -      -      -      -     0.48    -      -     vaddpd	%zmm1, %zmm18, %zmm31
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -     vmulpd	%zmm16, %zmm22, %zmm16
 | 
			
		||||
 -      -     0.52    -      -      -      -     0.48    -      -     vmulpd	%zmm31, %zmm16, %zmm16
 | 
			
		||||
 -      -      -     0.99   0.50   0.50    -     0.01    -      -     vmovupd	464(%rsp), %zmm31
 | 
			
		||||
 -      -     0.03    -      -      -      -     0.97    -      -     vsubpd	%zmm28, %zmm31, %zmm31
 | 
			
		||||
 -      -     0.01    -      -      -      -      -     0.99    -     shlb	$5, %bpl
 | 
			
		||||
 -      -      -     0.94    -      -      -      -     0.06    -     orb	%al, %bpl
 | 
			
		||||
 -      -      -     0.04    -      -      -      -     0.96    -     orb	$-35, %bpl
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     kmovd	%ebp, %k1
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vcmpltpd	%zmm0, %zmm3, %k1 {%k1}
 | 
			
		||||
 -      -      -     0.99   0.50   0.50    -     0.01    -      -     vmovupd	208(%rsp), %zmm3
 | 
			
		||||
 -      -     0.95    -      -      -      -     0.05    -      -     vsubpd	%zmm29, %zmm3, %zmm3
 | 
			
		||||
 -      -     0.51    -      -      -      -     0.49    -      -     vmulpd	%zmm16, %zmm18, %zmm16
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -     vsubpd	%zmm30, %zmm26, %zmm18
 | 
			
		||||
 -      -     0.52    -      -      -      -     0.48    -      -     vmulpd	%zmm2, %zmm16, %zmm16
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vfmadd231pd	%zmm16, %zmm19, %zmm15 {%k1}
 | 
			
		||||
 -      -     0.03    -      -      -      -     0.97    -      -     vmulpd	%zmm18, %zmm18, %zmm19
 | 
			
		||||
 -      -     0.06    -      -      -      -     0.94    -      -     vfmadd231pd	%zmm3, %zmm3, %zmm19
 | 
			
		||||
 -      -     0.50    -      -      -      -     0.50    -      -     vfmadd231pd	%zmm31, %zmm31, %zmm19
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vfmadd231pd	%zmm16, %zmm17, %zmm10 {%k1}
 | 
			
		||||
 -      -     2.00    -      -      -      -     1.00    -      -     vrcp14pd	%zmm19, %zmm17
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vfmadd231pd	%zmm16, %zmm20, %zmm6 {%k1}
 | 
			
		||||
 -      -     0.07    -      -      -      -     0.93    -      -     vmulpd	%zmm17, %zmm21, %zmm16
 | 
			
		||||
 -      -     0.50    -      -      -      -     0.50    -      -     vmulpd	%zmm16, %zmm17, %zmm16
 | 
			
		||||
 -      -     0.09    -      -      -      -     0.91    -      -     vmulpd	%zmm16, %zmm17, %zmm16
 | 
			
		||||
 -      -     0.07    -      -      -      -     0.93    -      -     vaddpd	%zmm1, %zmm16, %zmm20
 | 
			
		||||
 -      -     0.93    -      -      -      -     0.07    -      -     vmulpd	%zmm17, %zmm22, %zmm17
 | 
			
		||||
 -      -     0.94    -      -      -      -     0.06    -      -     vmulpd	%zmm20, %zmm17, %zmm17
 | 
			
		||||
 -      -     0.51    -      -      -      -     0.49    -      -     vmulpd	%zmm17, %zmm16, %zmm16
 | 
			
		||||
 -      -      -     1.00    -      -      -      -      -      -     leal	(,%rdx,4), %eax
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     shlb	$6, %dil
 | 
			
		||||
 -      -      -     0.02    -      -      -      -     0.98    -     orb	%al, %dil
 | 
			
		||||
 -      -      -     0.48    -      -      -      -     0.52    -     orb	$-69, %dil
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     kmovd	%edi, %k1
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vcmpltpd	%zmm0, %zmm19, %k1 {%k1}
 | 
			
		||||
 -      -      -     1.00   0.50   0.50    -      -      -      -     vmovupd	400(%rsp), %zmm17
 | 
			
		||||
 -      -     0.49    -      -      -      -     0.51    -      -     vsubpd	%zmm28, %zmm17, %zmm17
 | 
			
		||||
 -      -     0.49    -      -      -      -     0.51    -      -     vsubpd	%zmm29, %zmm23, %zmm19
 | 
			
		||||
 -      -     0.02    -      -      -      -     0.98    -      -     vsubpd	%zmm30, %zmm27, %zmm20
 | 
			
		||||
 -      -     0.98    -      -      -      -     0.02    -      -     vmulpd	%zmm2, %zmm16, %zmm16
 | 
			
		||||
 -      -     0.50    -      -      -      -     0.50    -      -     vfmadd231pd	%zmm16, %zmm31, %zmm13 {%k1}
 | 
			
		||||
 -      -     0.94    -      -      -      -     0.06    -      -     vmulpd	%zmm20, %zmm20, %zmm28
 | 
			
		||||
 -      -     0.04    -      -      -      -     0.96    -      -     vfmadd231pd	%zmm19, %zmm19, %zmm28
 | 
			
		||||
 -      -     0.07    -      -      -      -     0.93    -      -     vfmadd231pd	%zmm17, %zmm17, %zmm28
 | 
			
		||||
 -      -     0.50    -      -      -      -     0.50    -      -     vfmadd231pd	%zmm16, %zmm3, %zmm9 {%k1}
 | 
			
		||||
 -      -     2.00    -      -      -      -     1.00    -      -     vrcp14pd	%zmm28, %zmm3
 | 
			
		||||
 -      -     0.50    -      -      -      -     0.50    -      -     vfmadd231pd	%zmm16, %zmm18, %zmm5 {%k1}
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulpd	%zmm3, %zmm21, %zmm16
 | 
			
		||||
 -      -     0.55    -      -      -      -     0.45    -      -     vmulpd	%zmm16, %zmm3, %zmm16
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vmulpd	%zmm16, %zmm3, %zmm16
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vaddpd	%zmm1, %zmm16, %zmm18
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulpd	%zmm3, %zmm22, %zmm3
 | 
			
		||||
 -      -     0.52    -      -      -      -     0.48    -      -     vmulpd	%zmm18, %zmm3, %zmm3
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vmulpd	%zmm3, %zmm16, %zmm3
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     shlb	$3, %dl
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     shlb	$7, %cl
 | 
			
		||||
 -      -      -     1.00    -      -      -      -      -      -     orb	%dl, %cl
 | 
			
		||||
 -      -      -     0.52    -      -      -      -     0.48    -     addb	$-9, %cl
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     kmovd	%ecx, %k1
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vcmpltpd	%zmm0, %zmm28, %k1 {%k1}
 | 
			
		||||
 -      -     0.98    -      -      -      -     0.02    -      -     vmulpd	%zmm2, %zmm3, %zmm3
 | 
			
		||||
 -      -     0.97    -      -      -      -     0.03    -      -     vfmadd231pd	%zmm3, %zmm17, %zmm12 {%k1}
 | 
			
		||||
 -      -     0.03    -      -      -      -     0.97    -      -     vfmadd231pd	%zmm3, %zmm19, %zmm8 {%k1}
 | 
			
		||||
 -      -     0.97    -      -      -      -     0.03    -      -     vfmadd231pd	%zmm3, %zmm20, %zmm4 {%k1}
 | 
			
		||||
 -      -      -     0.48    -      -      -      -     0.52    -     incq	%rbx
 | 
			
		||||
 -      -      -     0.52    -      -      -      -     0.48    -     cmpq	%rbx, %r9
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     jne	.LBB5_12
 | 
			
		||||
@@ -1,167 +0,0 @@
 | 
			
		||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
 | 
			
		||||
Analyzed file:      gromacs-icx-avx512-dp.s
 | 
			
		||||
Architecture:       ICX
 | 
			
		||||
Timestamp:          2023-02-14 12:51:57
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
 | 
			
		||||
 * - Instruction micro-ops not bound to a port
 | 
			
		||||
 X - No throughput/latency information for this instruction in data file
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Combined Analysis Report
 | 
			
		||||
------------------------
 | 
			
		||||
                                                 Port pressure in cycles                                                  
 | 
			
		||||
     |  0   - 0DV  |  1   - 1DV  |  2   -  2D  |  3   -  3D  |  4   |   5   |  6   |  7   |  8   |  9   ||  CP  | LCD  |
 | 
			
		||||
------------------------------------------------------------------------------------------------------------------------
 | 
			
		||||
2241 |             |             |             |             |      |       |      |      |      |      ||      |      |   # pointer_increment=64 da67166e5736661e6b03ea29ee7bfd67
 | 
			
		||||
2242 |             |             |             |             |      |       |      |      |      |      ||      |      |   # LLVM-MCA-BEGIN
 | 
			
		||||
2243 |             |             |             |             |      |       |      |      |      |      ||      |      |   .LBB5_12:                               #   Parent Loop BB5_7 Depth=1
 | 
			
		||||
2244 |             |             |             |             |      |       |      |      |      |      ||      |      |   # =>  This Inner Loop Header: Depth=2
 | 
			
		||||
2245 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||  5.0 |      |   movslq (%r10,%rbx,4), %rcx
 | 
			
		||||
2246 | 0.00        | 0.75        |             |             |      | 0.000 | 0.25 |      |      |      ||  6.0 |      |   leaq (%rcx,%rcx,2), %rdx
 | 
			
		||||
2247 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||  1.0 |      |   shlq $6, %rdx
 | 
			
		||||
2248 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||      |      |   vmovupd (%rsi,%rdx), %zmm28             # AlignMOV convert to UnAlignMOV
 | 
			
		||||
2249 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||      |      |   vmovupd 64(%rsi,%rdx), %zmm29           # AlignMOV convert to UnAlignMOV
 | 
			
		||||
2250 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||  5.0 |      |   vmovupd 128(%rsi,%rdx), %zmm30          # AlignMOV convert to UnAlignMOV
 | 
			
		||||
2251 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||      |      |   vmovupd 16(%rsp), %zmm3                 # 64-byte Reload
 | 
			
		||||
2252 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vsubpd %zmm28, %zmm3, %zmm3
 | 
			
		||||
2253 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vsubpd %zmm30, %zmm24, %zmm31
 | 
			
		||||
2254 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||      |      |   vmovupd 336(%rsp), %zmm16               # 64-byte Reload
 | 
			
		||||
2255 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vsubpd %zmm29, %zmm16, %zmm16
 | 
			
		||||
2256 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulpd %zmm31, %zmm31, %zmm17
 | 
			
		||||
2257 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vfmadd231pd %zmm16, %zmm16, %zmm17  # zmm17 = (zmm16 * zmm16) + zmm17
 | 
			
		||||
2258 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vfmadd231pd %zmm3, %zmm3, %zmm17    # zmm17 = (zmm3 * zmm3) + zmm17
 | 
			
		||||
2259 | 2.50        |             |             |             |      | 0.500 |      |      |      |      ||  6.0 |      |   vrcp14pd %zmm17, %zmm18
 | 
			
		||||
2260 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulpd %zmm18, %zmm21, %zmm19
 | 
			
		||||
2261 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulpd %zmm19, %zmm18, %zmm19
 | 
			
		||||
2262 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulpd %zmm19, %zmm18, %zmm19
 | 
			
		||||
2263 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vaddpd %zmm1, %zmm19, %zmm20
 | 
			
		||||
2264 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm18, %zmm22, %zmm18
 | 
			
		||||
2265 | 0.75        |             |             |             |      | 0.250 |      |      |      |      ||  4.0 |      |   vmulpd %zmm20, %zmm18, %zmm18
 | 
			
		||||
2266 | 1.00        |             |             |             |      | 0.000 |      |      |      |      ||      |      |   vsubpd %zmm30, %zmm25, %zmm20
 | 
			
		||||
2267 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |      |   leal (%rcx,%rcx), %edx
 | 
			
		||||
2268 | 0.00        | 1.00        |             |             |      | 0.000 | 0.00 |      |      |      ||      |      |   cmpq %rdx, %r11
 | 
			
		||||
2269 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   setne %dl
 | 
			
		||||
2270 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   sete %al
 | 
			
		||||
2271 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |      |   addl %ecx, %ecx
 | 
			
		||||
2272 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |      |   incl %ecx
 | 
			
		||||
2273 | 0.00        | 1.00        |             |             |      | 0.000 | 0.00 |      |      |      ||      |      |   cmpq %rcx, %r11
 | 
			
		||||
2274 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   sete %cl
 | 
			
		||||
2275 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulpd %zmm18, %zmm19, %zmm18
 | 
			
		||||
2276 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||      |      |   vmovupd 528(%rsp), %zmm19               # 64-byte Reload
 | 
			
		||||
2277 | 1.00        |             |             |             |      | 0.000 |      |      |      |      ||      |      |   vsubpd %zmm28, %zmm19, %zmm19
 | 
			
		||||
2278 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   setne %dil
 | 
			
		||||
2279 | 0.00        | 0.75        |             |             |      | 0.000 | 0.25 |      |      |      ||      |      |   movl %edi, %ebp
 | 
			
		||||
2280 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |  1.0 |   shlb $4, %bpl
 | 
			
		||||
2281 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |  1.0 |   subb %al, %bpl
 | 
			
		||||
2282 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |  1.0 |   addb $-17, %bpl
 | 
			
		||||
2283 | 1.00        |             |             |             |      |       |      |      |      |      ||      |      |   kmovd %ebp, %k1
 | 
			
		||||
2284 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vcmpltpd %zmm0, %zmm17, %k1 {%k1}
 | 
			
		||||
2285 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||      |      |   vmovupd 272(%rsp), %zmm17               # 64-byte Reload
 | 
			
		||||
2286 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vsubpd %zmm29, %zmm17, %zmm17
 | 
			
		||||
2287 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |      |   leal (%rdx,%rdx), %eax
 | 
			
		||||
2288 | 0.00        | 0.75        |             |             |      | 0.000 | 0.25 |      |      |      ||      |      |   movl %edi, %ebp
 | 
			
		||||
2289 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulpd %zmm2, %zmm18, %zmm18
 | 
			
		||||
2290 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14
 | 
			
		||||
2291 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm20, %zmm20, %zmm3
 | 
			
		||||
2292 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd231pd %zmm17, %zmm17, %zmm3   # zmm3 = (zmm17 * zmm17) + zmm3
 | 
			
		||||
2293 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd231pd %zmm19, %zmm19, %zmm3   # zmm3 = (zmm19 * zmm19) + zmm3
 | 
			
		||||
2294 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11
 | 
			
		||||
2295 | 2.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vrcp14pd %zmm3, %zmm16
 | 
			
		||||
2296 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7
 | 
			
		||||
2297 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm16, %zmm21, %zmm18
 | 
			
		||||
2298 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm18, %zmm16, %zmm18
 | 
			
		||||
2299 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm18, %zmm16, %zmm18
 | 
			
		||||
2300 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vaddpd %zmm1, %zmm18, %zmm31
 | 
			
		||||
2301 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm16, %zmm22, %zmm16
 | 
			
		||||
2302 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm31, %zmm16, %zmm16
 | 
			
		||||
2303 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||      |      |   vmovupd 464(%rsp), %zmm31               # 64-byte Reload
 | 
			
		||||
2304 | 0.75        |             |             |             |      | 0.250 |      |      |      |      ||      |      |   vsubpd %zmm28, %zmm31, %zmm31
 | 
			
		||||
2305 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |  1.0 |   shlb $5, %bpl
 | 
			
		||||
2306 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |  1.0 |   orb %al, %bpl
 | 
			
		||||
2307 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |  1.0 |   orb $-35, %bpl
 | 
			
		||||
2308 | 1.00        |             |             |             |      |       |      |      |      |      ||      |      |   kmovd %ebp, %k1
 | 
			
		||||
2309 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vcmpltpd %zmm0, %zmm3, %k1 {%k1}
 | 
			
		||||
2310 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||      |      |   vmovupd 208(%rsp), %zmm3                # 64-byte Reload
 | 
			
		||||
2311 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vsubpd %zmm29, %zmm3, %zmm3
 | 
			
		||||
2312 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm16, %zmm18, %zmm16
 | 
			
		||||
2313 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vsubpd %zmm30, %zmm26, %zmm18
 | 
			
		||||
2314 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm2, %zmm16, %zmm16
 | 
			
		||||
2315 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15
 | 
			
		||||
2316 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm18, %zmm18, %zmm19
 | 
			
		||||
2317 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd231pd %zmm3, %zmm3, %zmm19    # zmm19 = (zmm3 * zmm3) + zmm19
 | 
			
		||||
2318 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd231pd %zmm31, %zmm31, %zmm19  # zmm19 = (zmm31 * zmm31) + zmm19
 | 
			
		||||
2319 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10
 | 
			
		||||
2320 | 2.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vrcp14pd %zmm19, %zmm17
 | 
			
		||||
2321 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6
 | 
			
		||||
2322 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm17, %zmm21, %zmm16
 | 
			
		||||
2323 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm16, %zmm17, %zmm16
 | 
			
		||||
2324 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm16, %zmm17, %zmm16
 | 
			
		||||
2325 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vaddpd %zmm1, %zmm16, %zmm20
 | 
			
		||||
2326 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm17, %zmm22, %zmm17
 | 
			
		||||
2327 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm20, %zmm17, %zmm17
 | 
			
		||||
2328 | 0.75        |             |             |             |      | 0.250 |      |      |      |      ||      |      |   vmulpd %zmm17, %zmm16, %zmm16
 | 
			
		||||
2329 | 0.00        | 0.75        |             |             |      | 0.000 | 0.25 |      |      |      ||      |      |   leal (,%rdx,4), %eax
 | 
			
		||||
2330 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   shlb $6, %dil
 | 
			
		||||
2331 | 0.00        | 1.00        |             |             |      | 0.000 | 0.00 |      |      |      ||      |      |   orb %al, %dil
 | 
			
		||||
2332 | 0.00        | 1.00        |             |             |      | 0.000 | 0.00 |      |      |      ||      |      |   orb $-69, %dil
 | 
			
		||||
2333 | 1.00        |             |             |             |      |       |      |      |      |      ||      |      |   kmovd %edi, %k1
 | 
			
		||||
2334 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vcmpltpd %zmm0, %zmm19, %k1 {%k1}
 | 
			
		||||
2335 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||      |      |   vmovupd 400(%rsp), %zmm17               # 64-byte Reload
 | 
			
		||||
2336 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vsubpd %zmm28, %zmm17, %zmm17
 | 
			
		||||
2337 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vsubpd %zmm29, %zmm23, %zmm19
 | 
			
		||||
2338 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vsubpd %zmm30, %zmm27, %zmm20
 | 
			
		||||
2339 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm2, %zmm16, %zmm16
 | 
			
		||||
2340 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13
 | 
			
		||||
2341 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm20, %zmm20, %zmm28
 | 
			
		||||
2342 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd231pd %zmm19, %zmm19, %zmm28  # zmm28 = (zmm19 * zmm19) + zmm28
 | 
			
		||||
2343 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vfmadd231pd %zmm17, %zmm17, %zmm28  # zmm28 = (zmm17 * zmm17) + zmm28
 | 
			
		||||
2344 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9
 | 
			
		||||
2345 | 2.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vrcp14pd %zmm28, %zmm3
 | 
			
		||||
2346 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5
 | 
			
		||||
2347 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vmulpd %zmm3, %zmm21, %zmm16
 | 
			
		||||
2348 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vmulpd %zmm16, %zmm3, %zmm16
 | 
			
		||||
2349 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vmulpd %zmm16, %zmm3, %zmm16
 | 
			
		||||
2350 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vaddpd %zmm1, %zmm16, %zmm18
 | 
			
		||||
2351 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vmulpd %zmm3, %zmm22, %zmm3
 | 
			
		||||
2352 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vmulpd %zmm18, %zmm3, %zmm3
 | 
			
		||||
2353 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vmulpd %zmm3, %zmm16, %zmm3
 | 
			
		||||
2354 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   shlb $3, %dl
 | 
			
		||||
2355 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   shlb $7, %cl
 | 
			
		||||
2356 | 0.00        | 1.00        |             |             |      | 0.000 | 0.00 |      |      |      ||      |      |   orb %dl, %cl
 | 
			
		||||
2357 | 0.00        | 1.00        |             |             |      | 0.000 | 0.00 |      |      |      ||      |      |   addb $-9, %cl
 | 
			
		||||
2358 | 1.00        |             |             |             |      |       |      |      |      |      ||      |      |   kmovd %ecx, %k1
 | 
			
		||||
2359 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vcmpltpd %zmm0, %zmm28, %k1 {%k1}
 | 
			
		||||
2360 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vmulpd %zmm2, %zmm3, %zmm3
 | 
			
		||||
2361 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12
 | 
			
		||||
2362 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8
 | 
			
		||||
2363 | 0.24        |             |             |             |      | 0.760 |      |      |      |      ||      |      |   vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4
 | 
			
		||||
2364 | 0.00        | 1.00        |             |             |      | -0.01 | 0.00 |      |      |      ||      |      |   incq %rbx
 | 
			
		||||
2365 | 0.00        | 1.00        |             |             |      | -0.01 | 0.00 |      |      |      ||      |      |   cmpq %rbx, %r9
 | 
			
		||||
2366 |             |             |             |             |      |       |      |      |      |      ||      |      | * jne .LBB5_12
 | 
			
		||||
2367 |             |             |             |             |      |       |      |      |      |      ||      |      |   # LLVM-MCA-END
 | 
			
		||||
 | 
			
		||||
       44.0          15.0          5.50   5.50   5.50   5.50          43.99   15.0                           71    6.0  
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Loop-Carried Dependencies Analysis Report
 | 
			
		||||
-----------------------------------------
 | 
			
		||||
2280 |  6.0 | shlb	$4, %bpl                       | [2280, 2281, 2282, 2305, 2306, 2307]
 | 
			
		||||
2363 |  4.0 | vfmadd231pd	%zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4| [2363]
 | 
			
		||||
2362 |  4.0 | vfmadd231pd	%zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8| [2362]
 | 
			
		||||
2361 |  4.0 | vfmadd231pd	%zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12| [2361]
 | 
			
		||||
2346 |  4.0 | vfmadd231pd	%zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5| [2346]
 | 
			
		||||
2344 |  4.0 | vfmadd231pd	%zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9| [2344]
 | 
			
		||||
2340 |  4.0 | vfmadd231pd	%zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13| [2340]
 | 
			
		||||
2321 |  4.0 | vfmadd231pd	%zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6| [2321]
 | 
			
		||||
2319 |  4.0 | vfmadd231pd	%zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10| [2319]
 | 
			
		||||
2315 |  4.0 | vfmadd231pd	%zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15| [2315]
 | 
			
		||||
2296 |  4.0 | vfmadd231pd	%zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7| [2296]
 | 
			
		||||
2294 |  4.0 | vfmadd231pd	%zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11| [2294]
 | 
			
		||||
2290 |  4.0 | vfmadd231pd	%zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14| [2290]
 | 
			
		||||
2330 |  3.0 | shlb	$6, %dil                       | [2330, 2331, 2332]
 | 
			
		||||
2364 |  1.0 | incq	%rbx                           | [2364]
 | 
			
		||||
 | 
			
		||||
@@ -1,167 +0,0 @@
 | 
			
		||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
 | 
			
		||||
Analyzed file:      gromacs-icx-avx512-dp.s
 | 
			
		||||
Architecture:       CSX
 | 
			
		||||
Timestamp:          2023-02-10 16:30:53
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
 | 
			
		||||
 * - Instruction micro-ops not bound to a port
 | 
			
		||||
 X - No throughput/latency information for this instruction in data file
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Combined Analysis Report
 | 
			
		||||
------------------------
 | 
			
		||||
                                      Port pressure in cycles                                       
 | 
			
		||||
     |  0   - 0DV  |  1   |  2   -  2D  |  3   -  3D  |  4   |  5   |  6   |  7   ||  CP  | LCD  |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
2241 |             |      |             |             |      |      |      |      ||      |      |   # pointer_increment=64 da67166e5736661e6b03ea29ee7bfd67
 | 
			
		||||
2242 |             |      |             |             |      |      |      |      ||      |      |   # LLVM-MCA-BEGIN
 | 
			
		||||
2243 |             |      |             |             |      |      |      |      ||      |      |   .LBB5_12:                               #   Parent Loop BB5_7 Depth=1
 | 
			
		||||
2244 |             |      |             |             |      |      |      |      ||      |      |   # =>  This Inner Loop Header: Depth=2
 | 
			
		||||
2245 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||  4.0 |      |   movslq (%r10,%rbx,4), %rcx
 | 
			
		||||
2246 |             | 1.00 |             |             |      | 0.00 |      |      ||  1.0 |      |   leaq (%rcx,%rcx,2), %rdx
 | 
			
		||||
2247 | 0.00        |      |             |             |      |      | 1.00 |      ||  1.0 |      |   shlq $6, %rdx
 | 
			
		||||
2248 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovupd (%rsi,%rdx), %zmm28             # AlignMOV convert to UnAlignMOV
 | 
			
		||||
2249 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovupd 64(%rsi,%rdx), %zmm29           # AlignMOV convert to UnAlignMOV
 | 
			
		||||
2250 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||  4.0 |      |   vmovupd 128(%rsi,%rdx), %zmm30          # AlignMOV convert to UnAlignMOV
 | 
			
		||||
2251 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovupd 16(%rsp), %zmm3                 # 64-byte Reload
 | 
			
		||||
2252 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd %zmm28, %zmm3, %zmm3
 | 
			
		||||
2253 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vsubpd %zmm30, %zmm24, %zmm31
 | 
			
		||||
2254 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovupd 336(%rsp), %zmm16               # 64-byte Reload
 | 
			
		||||
2255 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd %zmm29, %zmm16, %zmm16
 | 
			
		||||
2256 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd %zmm31, %zmm31, %zmm17
 | 
			
		||||
2257 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vfmadd231pd %zmm16, %zmm16, %zmm17  # zmm17 = (zmm16 * zmm16) + zmm17
 | 
			
		||||
2258 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vfmadd231pd %zmm3, %zmm3, %zmm17    # zmm17 = (zmm3 * zmm3) + zmm17
 | 
			
		||||
2259 | 2.50        |      |             |             |      | 0.50 |      |      ||  8.0 |      |   vrcp14pd %zmm17, %zmm18
 | 
			
		||||
2260 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd %zmm18, %zmm21, %zmm19
 | 
			
		||||
2261 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd %zmm19, %zmm18, %zmm19
 | 
			
		||||
2262 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd %zmm19, %zmm18, %zmm19
 | 
			
		||||
2263 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vaddpd %zmm1, %zmm19, %zmm20
 | 
			
		||||
2264 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm18, %zmm22, %zmm18
 | 
			
		||||
2265 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd %zmm20, %zmm18, %zmm18
 | 
			
		||||
2266 | 1.00        |      |             |             |      | 0.00 |      |      ||      |      |   vsubpd %zmm30, %zmm25, %zmm20
 | 
			
		||||
2267 |             | 1.00 |             |             |      | 0.00 |      |      ||      |      |   leal (%rcx,%rcx), %edx
 | 
			
		||||
2268 | 0.00        | 1.00 |             |             |      | 0.00 | 0.00 |      ||      |      |   cmpq %rdx, %r11
 | 
			
		||||
2269 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   setne %dl
 | 
			
		||||
2270 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   sete %al
 | 
			
		||||
2271 | 0.00        | 0.50 |             |             |      | 0.00 | 0.50 |      ||      |      |   addl %ecx, %ecx
 | 
			
		||||
2272 | 0.00        | 0.50 |             |             |      | 0.00 | 0.50 |      ||      |      |   incl %ecx
 | 
			
		||||
2273 | 0.00        | 1.00 |             |             |      | 0.00 | 0.00 |      ||      |      |   cmpq %rcx, %r11
 | 
			
		||||
2274 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   sete %cl
 | 
			
		||||
2275 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd %zmm18, %zmm19, %zmm18
 | 
			
		||||
2276 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovupd 528(%rsp), %zmm19               # 64-byte Reload
 | 
			
		||||
2277 | 1.00        |      |             |             |      | 0.00 |      |      ||      |      |   vsubpd %zmm28, %zmm19, %zmm19
 | 
			
		||||
2278 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   setne %dil
 | 
			
		||||
2279 | 0.00        | 0.75 |             |             |      | 0.00 | 0.25 |      ||      |      |   movl %edi, %ebp
 | 
			
		||||
2280 | 0.00        |      |             |             |      |      | 1.00 |      ||      |  1.0 |   shlb $4, %bpl
 | 
			
		||||
2281 | 0.00        | 0.50 |             |             |      | 0.00 | 0.50 |      ||      |  1.0 |   subb %al, %bpl
 | 
			
		||||
2282 | 0.00        | 0.25 |             |             |      | 0.00 | 0.75 |      ||      |  1.0 |   addb $-17, %bpl
 | 
			
		||||
2283 | 1.00        |      |             |             |      |      |      |      ||      |      |   kmovd %ebp, %k1
 | 
			
		||||
2284 |             |      |             |             |      | 1.00 |      |      ||      |      |   vcmpltpd %zmm0, %zmm17, %k1 {%k1}
 | 
			
		||||
2285 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovupd 272(%rsp), %zmm17               # 64-byte Reload
 | 
			
		||||
2286 | 0.25        |      |             |             |      | 0.75 |      |      ||      |      |   vsubpd %zmm29, %zmm17, %zmm17
 | 
			
		||||
2287 |             | 1.00 |             |             |      | 0.00 |      |      ||      |      |   leal (%rdx,%rdx), %eax
 | 
			
		||||
2288 | 0.00        | 0.75 |             |             |      | 0.00 | 0.25 |      ||      |      |   movl %edi, %ebp
 | 
			
		||||
2289 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd %zmm2, %zmm18, %zmm18
 | 
			
		||||
2290 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14
 | 
			
		||||
2291 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm20, %zmm20, %zmm3
 | 
			
		||||
2292 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm17, %zmm17, %zmm3   # zmm3 = (zmm17 * zmm17) + zmm3
 | 
			
		||||
2293 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm19, %zmm19, %zmm3   # zmm3 = (zmm19 * zmm19) + zmm3
 | 
			
		||||
2294 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11
 | 
			
		||||
2295 | 2.50        |      |             |             |      | 0.50 |      |      ||      |      |   vrcp14pd %zmm3, %zmm16
 | 
			
		||||
2296 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7
 | 
			
		||||
2297 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm16, %zmm21, %zmm18
 | 
			
		||||
2298 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm18, %zmm16, %zmm18
 | 
			
		||||
2299 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm18, %zmm16, %zmm18
 | 
			
		||||
2300 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vaddpd %zmm1, %zmm18, %zmm31
 | 
			
		||||
2301 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm16, %zmm22, %zmm16
 | 
			
		||||
2302 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm31, %zmm16, %zmm16
 | 
			
		||||
2303 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovupd 464(%rsp), %zmm31               # 64-byte Reload
 | 
			
		||||
2304 | 0.75        |      |             |             |      | 0.25 |      |      ||      |      |   vsubpd %zmm28, %zmm31, %zmm31
 | 
			
		||||
2305 | 0.00        |      |             |             |      |      | 1.00 |      ||      |  1.0 |   shlb $5, %bpl
 | 
			
		||||
2306 | 0.00        | 0.50 |             |             |      | 0.00 | 0.50 |      ||      |  1.0 |   orb %al, %bpl
 | 
			
		||||
2307 | 0.00        | 0.50 |             |             |      | 0.00 | 0.50 |      ||      |  1.0 |   orb $-35, %bpl
 | 
			
		||||
2308 | 1.00        |      |             |             |      |      |      |      ||      |      |   kmovd %ebp, %k1
 | 
			
		||||
2309 |             |      |             |             |      | 1.00 |      |      ||      |      |   vcmpltpd %zmm0, %zmm3, %k1 {%k1}
 | 
			
		||||
2310 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovupd 208(%rsp), %zmm3                # 64-byte Reload
 | 
			
		||||
2311 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd %zmm29, %zmm3, %zmm3
 | 
			
		||||
2312 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm16, %zmm18, %zmm16
 | 
			
		||||
2313 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd %zmm30, %zmm26, %zmm18
 | 
			
		||||
2314 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm2, %zmm16, %zmm16
 | 
			
		||||
2315 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15
 | 
			
		||||
2316 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm18, %zmm18, %zmm19
 | 
			
		||||
2317 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm3, %zmm3, %zmm19    # zmm19 = (zmm3 * zmm3) + zmm19
 | 
			
		||||
2318 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm31, %zmm31, %zmm19  # zmm19 = (zmm31 * zmm31) + zmm19
 | 
			
		||||
2319 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10
 | 
			
		||||
2320 | 2.50        |      |             |             |      | 0.50 |      |      ||      |      |   vrcp14pd %zmm19, %zmm17
 | 
			
		||||
2321 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6
 | 
			
		||||
2322 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm17, %zmm21, %zmm16
 | 
			
		||||
2323 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm16, %zmm17, %zmm16
 | 
			
		||||
2324 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm16, %zmm17, %zmm16
 | 
			
		||||
2325 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vaddpd %zmm1, %zmm16, %zmm20
 | 
			
		||||
2326 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm17, %zmm22, %zmm17
 | 
			
		||||
2327 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm20, %zmm17, %zmm17
 | 
			
		||||
2328 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm17, %zmm16, %zmm16
 | 
			
		||||
2329 |             | 1.00 |             |             |      | 0.00 |      |      ||      |      |   leal (,%rdx,4), %eax
 | 
			
		||||
2330 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   shlb $6, %dil
 | 
			
		||||
2331 | 0.00        | 0.50 |             |             |      | 0.00 | 0.50 |      ||      |      |   orb %al, %dil
 | 
			
		||||
2332 | 0.00        | 1.00 |             |             |      | 0.00 | 0.00 |      ||      |      |   orb $-69, %dil
 | 
			
		||||
2333 | 1.00        |      |             |             |      |      |      |      ||      |      |   kmovd %edi, %k1
 | 
			
		||||
2334 |             |      |             |             |      | 1.00 |      |      ||      |      |   vcmpltpd %zmm0, %zmm19, %k1 {%k1}
 | 
			
		||||
2335 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovupd 400(%rsp), %zmm17               # 64-byte Reload
 | 
			
		||||
2336 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd %zmm28, %zmm17, %zmm17
 | 
			
		||||
2337 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd %zmm29, %zmm23, %zmm19
 | 
			
		||||
2338 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd %zmm30, %zmm27, %zmm20
 | 
			
		||||
2339 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm2, %zmm16, %zmm16
 | 
			
		||||
2340 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13
 | 
			
		||||
2341 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm20, %zmm20, %zmm28
 | 
			
		||||
2342 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm19, %zmm19, %zmm28  # zmm28 = (zmm19 * zmm19) + zmm28
 | 
			
		||||
2343 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm17, %zmm17, %zmm28  # zmm28 = (zmm17 * zmm17) + zmm28
 | 
			
		||||
2344 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9
 | 
			
		||||
2345 | 2.50        |      |             |             |      | 0.50 |      |      ||      |      |   vrcp14pd %zmm28, %zmm3
 | 
			
		||||
2346 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5
 | 
			
		||||
2347 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm3, %zmm21, %zmm16
 | 
			
		||||
2348 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vmulpd %zmm16, %zmm3, %zmm16
 | 
			
		||||
2349 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vmulpd %zmm16, %zmm3, %zmm16
 | 
			
		||||
2350 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vaddpd %zmm1, %zmm16, %zmm18
 | 
			
		||||
2351 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vmulpd %zmm3, %zmm22, %zmm3
 | 
			
		||||
2352 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vmulpd %zmm18, %zmm3, %zmm3
 | 
			
		||||
2353 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vmulpd %zmm3, %zmm16, %zmm3
 | 
			
		||||
2354 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   shlb $3, %dl
 | 
			
		||||
2355 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   shlb $7, %cl
 | 
			
		||||
2356 | 0.00        | 0.25 |             |             |      | 0.00 | 0.75 |      ||      |      |   orb %dl, %cl
 | 
			
		||||
2357 | 0.00        | 1.00 |             |             |      | 0.00 | 0.00 |      ||      |      |   addb $-9, %cl
 | 
			
		||||
2358 | 1.00        |      |             |             |      |      |      |      ||      |      |   kmovd %ecx, %k1
 | 
			
		||||
2359 |             |      |             |             |      | 1.00 |      |      ||      |      |   vcmpltpd %zmm0, %zmm28, %k1 {%k1}
 | 
			
		||||
2360 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vmulpd %zmm2, %zmm3, %zmm3
 | 
			
		||||
2361 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12
 | 
			
		||||
2362 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8
 | 
			
		||||
2363 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4
 | 
			
		||||
2364 | 0.00        | 1.00 |             |             |      | 0.00 | 0.00 |      ||      |      |   incq %rbx
 | 
			
		||||
2365 | 0.00        | 1.00 |             |             |      | 0.00 | 0.00 |      ||      |      |   cmpq %rbx, %r9
 | 
			
		||||
2366 |             |      |             |             |      |      |      |      ||      |      | * jne .LBB5_12
 | 
			
		||||
2367 |             |      |             |             |      |      |      |      ||      |      |   # LLVM-MCA-END
 | 
			
		||||
 | 
			
		||||
       44.0          15.0   5.50   5.50   5.50   5.50          44.0   15.0           66.0    6.0  
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Loop-Carried Dependencies Analysis Report
 | 
			
		||||
-----------------------------------------
 | 
			
		||||
2280 |  6.0 | shlb	$4, %bpl                       | [2280, 2281, 2282, 2305, 2306, 2307]
 | 
			
		||||
2363 |  4.0 | vfmadd231pd	%zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4| [2363]
 | 
			
		||||
2362 |  4.0 | vfmadd231pd	%zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8| [2362]
 | 
			
		||||
2361 |  4.0 | vfmadd231pd	%zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12| [2361]
 | 
			
		||||
2346 |  4.0 | vfmadd231pd	%zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5| [2346]
 | 
			
		||||
2344 |  4.0 | vfmadd231pd	%zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9| [2344]
 | 
			
		||||
2340 |  4.0 | vfmadd231pd	%zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13| [2340]
 | 
			
		||||
2321 |  4.0 | vfmadd231pd	%zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6| [2321]
 | 
			
		||||
2319 |  4.0 | vfmadd231pd	%zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10| [2319]
 | 
			
		||||
2315 |  4.0 | vfmadd231pd	%zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15| [2315]
 | 
			
		||||
2296 |  4.0 | vfmadd231pd	%zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7| [2296]
 | 
			
		||||
2294 |  4.0 | vfmadd231pd	%zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11| [2294]
 | 
			
		||||
2290 |  4.0 | vfmadd231pd	%zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14| [2290]
 | 
			
		||||
2330 |  3.0 | shlb	$6, %dil                       | [2330, 2331, 2332]
 | 
			
		||||
2364 |  1.0 | incq	%rbx                           | [2364]
 | 
			
		||||
 | 
			
		||||
@@ -1,162 +0,0 @@
 | 
			
		||||
Intel(R) Architecture Code Analyzer Version -  v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
 | 
			
		||||
Analyzed File -  gromacs-icx-avx512-sp.o
 | 
			
		||||
Binary Format - 64Bit
 | 
			
		||||
Architecture  -  SKX
 | 
			
		||||
Analysis Type - Throughput
 | 
			
		||||
 | 
			
		||||
Throughput Analysis Report
 | 
			
		||||
--------------------------
 | 
			
		||||
Block Throughput: 64.00 Cycles       Throughput Bottleneck: Backend
 | 
			
		||||
Loop Count:  22
 | 
			
		||||
Port Binding In Cycles Per Iteration:
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
|  Port  |   0   -  DV   |   1   |   2   -  D    |   3   -  D    |   4   |   5   |   6   |   7   |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
| Cycles | 50.0     0.0  |  7.0  |  9.5     8.1  |  9.5     7.9  |  3.0  | 50.0  |  7.0  |  0.0  |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
DV - Divider pipe (on port 0)
 | 
			
		||||
D - Data fetch pipe (on ports 2 and 3)
 | 
			
		||||
F - Macro Fusion with the previous instruction occurred
 | 
			
		||||
* - instruction micro-ops not bound to a port
 | 
			
		||||
^ - Micro Fusion occurred
 | 
			
		||||
# - ESP Tracking sync uop was issued
 | 
			
		||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
 | 
			
		||||
X - instruction not supported, was not accounted in Analysis
 | 
			
		||||
 | 
			
		||||
| Num Of   |                    Ports pressure in cycles                         |      |
 | 
			
		||||
|  Uops    |  0  - DV    |  1   |  2  -  D    |  3  -  D    |  4   |  5   |  6   |  7   |
 | 
			
		||||
-----------------------------------------------------------------------------------------
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | movsxd rax, dword ptr [r11+rdx*4]
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | mov rsi, rax
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl rsi, 0x5
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea rbx, ptr [rsi+rsi*2]
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovups zmm15, zmmword ptr [rdi+rbx*1]
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovups zmm16, zmmword ptr [rdi+rbx*1+0x20]
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovups zmm27, zmmword ptr [rdi+rbx*1+0x40]
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovups zmm1, zmmword ptr [rsp+0x80]
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubps zmm24, zmm1, zmm15
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovups zmm1, zmmword ptr [rsp+0x140]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubps zmm25, zmm1, zmm16
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubps zmm26, zmm9, zmm27
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovups zmm1, zmmword ptr [rsp]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubps zmm21, zmm1, zmm15
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovups zmm1, zmmword ptr [rsp+0x100]
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubps zmm22, zmm1, zmm16
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubps zmm23, zmm10, zmm27
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovups zmm1, zmmword ptr [rsp+0x1c0]
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubps zmm17, zmm1, zmm15
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovups zmm1, zmmword ptr [rsp+0xc0]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubps zmm19, zmm1, zmm16
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubps zmm20, zmm11, zmm27
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovups zmm1, zmmword ptr [rsp+0x180]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubps zmm18, zmm1, zmm15
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubps zmm16, zmm8, zmm16
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubps zmm15, zmm12, zmm27
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm27, zmm26, zmm26
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231ps zmm27, zmm25, zmm25
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231ps zmm27, zmm24, zmm24
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm28, zmm23, zmm23
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231ps zmm28, zmm22, zmm22
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231ps zmm28, zmm21, zmm21
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm29, zmm20, zmm20
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231ps zmm29, zmm19, zmm19
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231ps zmm29, zmm17, zmm17
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm30, zmm15, zmm15
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231ps zmm30, zmm16, zmm16
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14ps zmm31, zmm27
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14ps zmm1, zmm28
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14ps zmm2, zmm29
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231ps zmm30, zmm18, zmm18
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14ps zmm3, zmm30
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm4, zmm6, zmm31
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm4, zmm31, zmm4
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm4, zmm31, zmm4
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vaddps zmm5, zmm4, zmm13
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm31, zmm7, zmm31
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm5, zmm31, zmm5
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm31, zmm6, zmm1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm31, zmm1, zmm31
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm31, zmm1, zmm31
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm4, zmm4, zmm5
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vaddps zmm5, zmm31, zmm13
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm1, zmm7, zmm1
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm1, zmm1, zmm5
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm5, zmm6, zmm2
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm5, zmm2, zmm5
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm5, zmm2, zmm5
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm1, zmm31, zmm1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vaddps zmm31, zmm5, zmm13
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm2, zmm7, zmm2
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm2, zmm2, zmm31
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm31, zmm6, zmm3
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm31, zmm3, zmm31
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm31, zmm3, zmm31
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm2, zmm5, zmm2
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vaddps zmm5, zmm31, zmm13
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm3, zmm7, zmm3
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm3, zmm3, zmm5
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm3, zmm31, zmm3
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | xor esi, esi
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | xor edi, edi
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | test eax, 0x7fffffff
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | setz sil
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | setnz dil
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | mov eax, 0xff
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | cmovz eax, r8d
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | mov ecx, 0xff
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | cmovz ecx, r9d
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | xor esi, 0xff
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovd k1, esi
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmpps k1{k1}, zmm27, zmm0, 0x1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm4, zmm4, zmm14
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm5{k1}{z}, zmm24, zmm4
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm24{k1}{z}, zmm25, zmm4
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm4{k1}{z}, zmm26, zmm4
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea esi, ptr [rdi+rdi*2]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | or esi, 0xfc
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovd k1, esi
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmpps k1{k1}, zmm28, zmm0, 0x1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm1, zmm1, zmm14
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm21{k1}{z}, zmm21, zmm1
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vaddps zmm5, zmm5, zmm21
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm21{k1}{z}, zmm22, zmm1
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vaddps zmm21, zmm24, zmm21
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm1{k1}{z}, zmm23, zmm1
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vaddps zmm1, zmm4, zmm1
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovd k1, eax
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmpps k1{k1}, zmm29, zmm0, 0x1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm2, zmm2, zmm14
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm4{k1}{z}, zmm17, zmm2
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm17{k1}{z}, zmm19, zmm2
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm2{k1}{z}, zmm20, zmm2
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovd k1, ecx
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmpps k1{k1}, zmm30, zmm0, 0x1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm3, zmm3, zmm14
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm18{k1}{z}, zmm18, zmm3
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vaddps zmm4, zmm4, zmm18
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vaddps zmm4, zmm5, zmm4
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm5{k1}{z}, zmm16, zmm3
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vaddps zmm5, zmm17, zmm5
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vaddps zmm5, zmm21, zmm5
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm3{k1}{z}, zmm15, zmm3
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | mov rax, qword ptr [r15+0xb0]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vaddps zmm2, zmm2, zmm3
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovups zmm3, zmmword ptr [rax+rbx*1]
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubps zmm3, zmm3, zmm4
 | 
			
		||||
|   2      |             |      | 0.5         | 0.5         | 1.0  |      |      |      | vmovups zmmword ptr [rax+rbx*1], zmm3
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vaddps zmm1, zmm1, zmm2
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovups zmm2, zmmword ptr [rax+rbx*1+0x20]
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubps zmm2, zmm2, zmm5
 | 
			
		||||
|   2      |             |      | 0.5         | 0.5         | 1.0  |      |      |      | vmovups zmmword ptr [rax+rbx*1+0x20], zmm2
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovups zmm2, zmmword ptr [rax+rbx*1+0x40]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubps zmm1, zmm2, zmm1
 | 
			
		||||
|   2      |             |      | 0.5         | 0.5         | 1.0  |      |      |      | vmovups zmmword ptr [rax+rbx*1+0x40], zmm1
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | cmp r10, rdx
 | 
			
		||||
|   0*F    |             |      |             |             |      |      |      |      | jz 0x34
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | mov rdi, qword ptr [r15+0xa0]
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | inc rdx
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | jmp 0xfffffffffffffcfc
 | 
			
		||||
Total Num Of Uops: 140
 | 
			
		||||
Analysis Notes:
 | 
			
		||||
Backend allocation was stalled due to unavailable allocation resources.
 | 
			
		||||
@@ -1,304 +0,0 @@
 | 
			
		||||
 | 
			
		||||
[0] Code Region
 | 
			
		||||
 | 
			
		||||
Iterations:        100
 | 
			
		||||
Instructions:      13000
 | 
			
		||||
Total Cycles:      5640
 | 
			
		||||
Total uOps:        15400
 | 
			
		||||
 | 
			
		||||
Dispatch Width:    6
 | 
			
		||||
uOps Per Cycle:    2.73
 | 
			
		||||
IPC:               2.30
 | 
			
		||||
Block RThroughput: 40.0
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Instruction Info:
 | 
			
		||||
[1]: #uOps
 | 
			
		||||
[2]: Latency
 | 
			
		||||
[3]: RThroughput
 | 
			
		||||
[4]: MayLoad
 | 
			
		||||
[5]: MayStore
 | 
			
		||||
[6]: HasSideEffects (U)
 | 
			
		||||
 | 
			
		||||
[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 | 
			
		||||
 1      5     0.50    *                   movslq	(%r11,%rdx,4), %rax
 | 
			
		||||
 1      1     0.25                        movq	%rax, %rsi
 | 
			
		||||
 1      1     0.50                        shlq	$5, %rsi
 | 
			
		||||
 1      1     0.50                        leaq	(%rsi,%rsi,2), %rbx
 | 
			
		||||
 2      8     0.50    *                   vmovups	(%rdi,%rbx), %zmm15
 | 
			
		||||
 2      8     0.50    *                   vmovups	32(%rdi,%rbx), %zmm16
 | 
			
		||||
 2      8     0.50    *                   vmovups	64(%rdi,%rbx), %zmm27
 | 
			
		||||
 2      8     0.50    *                   vmovups	128(%rsp), %zmm1
 | 
			
		||||
 1      4     0.50                        vsubps	%zmm15, %zmm1, %zmm24
 | 
			
		||||
 2      8     0.50    *                   vmovups	320(%rsp), %zmm1
 | 
			
		||||
 1      4     0.50                        vsubps	%zmm16, %zmm1, %zmm25
 | 
			
		||||
 1      4     0.50                        vsubps	%zmm27, %zmm9, %zmm26
 | 
			
		||||
 2      8     0.50    *                   vmovups	(%rsp), %zmm1
 | 
			
		||||
 1      4     0.50                        vsubps	%zmm15, %zmm1, %zmm21
 | 
			
		||||
 2      8     0.50    *                   vmovups	256(%rsp), %zmm1
 | 
			
		||||
 1      4     0.50                        vsubps	%zmm16, %zmm1, %zmm22
 | 
			
		||||
 1      4     0.50                        vsubps	%zmm27, %zmm10, %zmm23
 | 
			
		||||
 2      8     0.50    *                   vmovups	448(%rsp), %zmm1
 | 
			
		||||
 1      4     0.50                        vsubps	%zmm15, %zmm1, %zmm17
 | 
			
		||||
 2      8     0.50    *                   vmovups	192(%rsp), %zmm1
 | 
			
		||||
 1      4     0.50                        vsubps	%zmm16, %zmm1, %zmm19
 | 
			
		||||
 1      4     0.50                        vsubps	%zmm27, %zmm11, %zmm20
 | 
			
		||||
 2      8     0.50    *                   vmovups	384(%rsp), %zmm1
 | 
			
		||||
 1      4     0.50                        vsubps	%zmm15, %zmm1, %zmm18
 | 
			
		||||
 1      4     0.50                        vsubps	%zmm16, %zmm8, %zmm16
 | 
			
		||||
 1      4     0.50                        vsubps	%zmm27, %zmm12, %zmm15
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm26, %zmm26, %zmm27
 | 
			
		||||
 1      4     0.50                        vfmadd231ps	%zmm25, %zmm25, %zmm27
 | 
			
		||||
 1      4     0.50                        vfmadd231ps	%zmm24, %zmm24, %zmm27
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm23, %zmm23, %zmm28
 | 
			
		||||
 1      4     0.50                        vfmadd231ps	%zmm22, %zmm22, %zmm28
 | 
			
		||||
 1      4     0.50                        vfmadd231ps	%zmm21, %zmm21, %zmm28
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm20, %zmm20, %zmm29
 | 
			
		||||
 1      4     0.50                        vfmadd231ps	%zmm19, %zmm19, %zmm29
 | 
			
		||||
 1      4     0.50                        vfmadd231ps	%zmm17, %zmm17, %zmm29
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm15, %zmm15, %zmm30
 | 
			
		||||
 1      4     0.50                        vfmadd231ps	%zmm16, %zmm16, %zmm30
 | 
			
		||||
 3      4     2.00                        vrcp14ps	%zmm27, %zmm31
 | 
			
		||||
 3      4     2.00                        vrcp14ps	%zmm28, %zmm1
 | 
			
		||||
 3      4     2.00                        vrcp14ps	%zmm29, %zmm2
 | 
			
		||||
 1      4     0.50                        vfmadd231ps	%zmm18, %zmm18, %zmm30
 | 
			
		||||
 3      4     2.00                        vrcp14ps	%zmm30, %zmm3
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm31, %zmm6, %zmm4
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm4, %zmm31, %zmm4
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm4, %zmm31, %zmm4
 | 
			
		||||
 1      4     0.50                        vaddps	%zmm13, %zmm4, %zmm5
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm31, %zmm7, %zmm31
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm5, %zmm31, %zmm5
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm1, %zmm6, %zmm31
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm31, %zmm1, %zmm31
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm31, %zmm1, %zmm31
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm5, %zmm4, %zmm4
 | 
			
		||||
 1      4     0.50                        vaddps	%zmm13, %zmm31, %zmm5
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm1, %zmm7, %zmm1
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm5, %zmm1, %zmm1
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm2, %zmm6, %zmm5
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm5, %zmm2, %zmm5
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm5, %zmm2, %zmm5
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm1, %zmm31, %zmm1
 | 
			
		||||
 1      4     0.50                        vaddps	%zmm13, %zmm5, %zmm31
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm2, %zmm7, %zmm2
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm31, %zmm2, %zmm2
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm3, %zmm6, %zmm31
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm31, %zmm3, %zmm31
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm31, %zmm3, %zmm31
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm2, %zmm5, %zmm2
 | 
			
		||||
 1      4     0.50                        vaddps	%zmm13, %zmm31, %zmm5
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm3, %zmm7, %zmm3
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm5, %zmm3, %zmm3
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm3, %zmm31, %zmm3
 | 
			
		||||
 1      0     0.17                        xorl	%esi, %esi
 | 
			
		||||
 1      0     0.17                        xorl	%edi, %edi
 | 
			
		||||
 1      1     0.25                        testl	$2147483647, %eax
 | 
			
		||||
 1      1     0.50                        sete	%sil
 | 
			
		||||
 1      1     0.50                        setne	%dil
 | 
			
		||||
 1      1     0.25                        movl	$255, %eax
 | 
			
		||||
 1      1     0.50                        cmovel	%r8d, %eax
 | 
			
		||||
 1      1     0.25                        movl	$255, %ecx
 | 
			
		||||
 1      1     0.50                        cmovel	%r9d, %ecx
 | 
			
		||||
 1      1     0.25                        xorl	$255, %esi
 | 
			
		||||
 1      1     1.00                        kmovd	%esi, %k1
 | 
			
		||||
 1      4     1.00                        vcmpltps	%zmm0, %zmm27, %k1 {%k1}
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm14, %zmm4, %zmm4
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm4, %zmm24, %zmm5 {%k1} {z}
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm4, %zmm25, %zmm24 {%k1} {z}
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm4, %zmm26, %zmm4 {%k1} {z}
 | 
			
		||||
 1      1     0.50                        leal	(%rdi,%rdi,2), %esi
 | 
			
		||||
 1      1     0.25                        orl	$252, %esi
 | 
			
		||||
 1      1     1.00                        kmovd	%esi, %k1
 | 
			
		||||
 1      4     1.00                        vcmpltps	%zmm0, %zmm28, %k1 {%k1}
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm14, %zmm1, %zmm1
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm1, %zmm21, %zmm21 {%k1} {z}
 | 
			
		||||
 1      4     0.50                        vaddps	%zmm21, %zmm5, %zmm5
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm1, %zmm22, %zmm21 {%k1} {z}
 | 
			
		||||
 1      4     0.50                        vaddps	%zmm21, %zmm24, %zmm21
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm1, %zmm23, %zmm1 {%k1} {z}
 | 
			
		||||
 1      4     0.50                        vaddps	%zmm1, %zmm4, %zmm1
 | 
			
		||||
 1      1     1.00                        kmovd	%eax, %k1
 | 
			
		||||
 1      4     1.00                        vcmpltps	%zmm0, %zmm29, %k1 {%k1}
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm14, %zmm2, %zmm2
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm2, %zmm17, %zmm4 {%k1} {z}
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm2, %zmm19, %zmm17 {%k1} {z}
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm2, %zmm20, %zmm2 {%k1} {z}
 | 
			
		||||
 1      1     1.00                        kmovd	%ecx, %k1
 | 
			
		||||
 1      4     1.00                        vcmpltps	%zmm0, %zmm30, %k1 {%k1}
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm14, %zmm3, %zmm3
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm3, %zmm18, %zmm18 {%k1} {z}
 | 
			
		||||
 1      4     0.50                        vaddps	%zmm18, %zmm4, %zmm4
 | 
			
		||||
 1      4     0.50                        vaddps	%zmm4, %zmm5, %zmm4
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm3, %zmm16, %zmm5 {%k1} {z}
 | 
			
		||||
 1      4     0.50                        vaddps	%zmm5, %zmm17, %zmm5
 | 
			
		||||
 1      4     0.50                        vaddps	%zmm5, %zmm21, %zmm5
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm3, %zmm15, %zmm3 {%k1} {z}
 | 
			
		||||
 1      5     0.50    *                   movq	176(%r15), %rax
 | 
			
		||||
 1      4     0.50                        vaddps	%zmm3, %zmm2, %zmm2
 | 
			
		||||
 2      8     0.50    *                   vmovups	(%rax,%rbx), %zmm3
 | 
			
		||||
 1      4     0.50                        vsubps	%zmm4, %zmm3, %zmm3
 | 
			
		||||
 2      1     1.00           *            vmovups	%zmm3, (%rax,%rbx)
 | 
			
		||||
 1      4     0.50                        vaddps	%zmm2, %zmm1, %zmm1
 | 
			
		||||
 2      8     0.50    *                   vmovups	32(%rax,%rbx), %zmm2
 | 
			
		||||
 1      4     0.50                        vsubps	%zmm5, %zmm2, %zmm2
 | 
			
		||||
 2      1     1.00           *            vmovups	%zmm2, 32(%rax,%rbx)
 | 
			
		||||
 2      8     0.50    *                   vmovups	64(%rax,%rbx), %zmm2
 | 
			
		||||
 1      4     0.50                        vsubps	%zmm1, %zmm2, %zmm1
 | 
			
		||||
 2      1     1.00           *            vmovups	%zmm1, 64(%rax,%rbx)
 | 
			
		||||
 1      1     0.25                        cmpq	%rdx, %r10
 | 
			
		||||
 1      1     0.50                        je	.LBB4_18
 | 
			
		||||
 1      5     0.50    *                   movq	160(%r15), %rdi
 | 
			
		||||
 1      1     0.25                        incq	%rdx
 | 
			
		||||
 1      1     0.50                        jmp	.LBB4_8
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Resources:
 | 
			
		||||
[0]   - SKXDivider
 | 
			
		||||
[1]   - SKXFPDivider
 | 
			
		||||
[2]   - SKXPort0
 | 
			
		||||
[3]   - SKXPort1
 | 
			
		||||
[4]   - SKXPort2
 | 
			
		||||
[5]   - SKXPort3
 | 
			
		||||
[6]   - SKXPort4
 | 
			
		||||
[7]   - SKXPort5
 | 
			
		||||
[8]   - SKXPort6
 | 
			
		||||
[9]   - SKXPort7
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Resource pressure per iteration:
 | 
			
		||||
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    
 | 
			
		||||
 -      -     52.01  14.97  8.49   8.51   3.00   52.02  11.00  2.00   
 | 
			
		||||
 | 
			
		||||
Resource pressure by instruction:
 | 
			
		||||
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
 | 
			
		||||
 -      -      -      -     0.49   0.51    -      -      -      -     movslq	(%r11,%rdx,4), %rax
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     movq	%rax, %rsi
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     shlq	$5, %rsi
 | 
			
		||||
 -      -      -     1.00    -      -      -      -      -      -     leaq	(%rsi,%rsi,2), %rbx
 | 
			
		||||
 -      -     0.01   0.99   0.50   0.50    -      -      -      -     vmovups	(%rdi,%rbx), %zmm15
 | 
			
		||||
 -      -      -      -     0.50   0.50    -     1.00    -      -     vmovups	32(%rdi,%rbx), %zmm16
 | 
			
		||||
 -      -      -     1.00   0.50   0.50    -      -      -      -     vmovups	64(%rdi,%rbx), %zmm27
 | 
			
		||||
 -      -      -     0.99   0.51   0.49    -     0.01    -      -     vmovups	128(%rsp), %zmm1
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -     vsubps	%zmm15, %zmm1, %zmm24
 | 
			
		||||
 -      -      -     1.00   0.49   0.51    -      -      -      -     vmovups	320(%rsp), %zmm1
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vsubps	%zmm16, %zmm1, %zmm25
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -     vsubps	%zmm27, %zmm9, %zmm26
 | 
			
		||||
 -      -     0.01   0.99   0.51   0.49    -      -      -      -     vmovups	(%rsp), %zmm1
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vsubps	%zmm15, %zmm1, %zmm21
 | 
			
		||||
 -      -      -      -     0.49   0.51    -     1.00    -      -     vmovups	256(%rsp), %zmm1
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -     vsubps	%zmm16, %zmm1, %zmm22
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vsubps	%zmm27, %zmm10, %zmm23
 | 
			
		||||
 -      -      -     1.00   0.51   0.49    -      -      -      -     vmovups	448(%rsp), %zmm1
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -     vsubps	%zmm15, %zmm1, %zmm17
 | 
			
		||||
 -      -     0.01    -     0.49   0.51    -     0.99    -      -     vmovups	192(%rsp), %zmm1
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vsubps	%zmm16, %zmm1, %zmm19
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -     vsubps	%zmm27, %zmm11, %zmm20
 | 
			
		||||
 -      -     0.99    -     0.50   0.50    -     0.01    -      -     vmovups	384(%rsp), %zmm1
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vsubps	%zmm15, %zmm1, %zmm18
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -     vsubps	%zmm16, %zmm8, %zmm16
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vsubps	%zmm27, %zmm12, %zmm15
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm26, %zmm26, %zmm27
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vfmadd231ps	%zmm25, %zmm25, %zmm27
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vfmadd231ps	%zmm24, %zmm24, %zmm27
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm23, %zmm23, %zmm28
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vfmadd231ps	%zmm22, %zmm22, %zmm28
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -     vfmadd231ps	%zmm21, %zmm21, %zmm28
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -     vmulps	%zmm20, %zmm20, %zmm29
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -     vfmadd231ps	%zmm19, %zmm19, %zmm29
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vfmadd231ps	%zmm17, %zmm17, %zmm29
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -     vmulps	%zmm15, %zmm15, %zmm30
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -     vfmadd231ps	%zmm16, %zmm16, %zmm30
 | 
			
		||||
 -      -     2.00    -      -      -      -     1.00    -      -     vrcp14ps	%zmm27, %zmm31
 | 
			
		||||
 -      -     2.00    -      -      -      -     1.00    -      -     vrcp14ps	%zmm28, %zmm1
 | 
			
		||||
 -      -     2.00    -      -      -      -     1.00    -      -     vrcp14ps	%zmm29, %zmm2
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vfmadd231ps	%zmm18, %zmm18, %zmm30
 | 
			
		||||
 -      -     2.00    -      -      -      -     1.00    -      -     vrcp14ps	%zmm30, %zmm3
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm31, %zmm6, %zmm4
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm4, %zmm31, %zmm4
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm4, %zmm31, %zmm4
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vaddps	%zmm13, %zmm4, %zmm5
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm31, %zmm7, %zmm31
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm5, %zmm31, %zmm5
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm1, %zmm6, %zmm31
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm31, %zmm1, %zmm31
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vmulps	%zmm31, %zmm1, %zmm31
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm5, %zmm4, %zmm4
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vaddps	%zmm13, %zmm31, %zmm5
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm1, %zmm7, %zmm1
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm5, %zmm1, %zmm1
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm2, %zmm6, %zmm5
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm5, %zmm2, %zmm5
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm5, %zmm2, %zmm5
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vmulps	%zmm1, %zmm31, %zmm1
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vaddps	%zmm13, %zmm5, %zmm31
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm2, %zmm7, %zmm2
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm31, %zmm2, %zmm2
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm3, %zmm6, %zmm31
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm31, %zmm3, %zmm31
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm31, %zmm3, %zmm31
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm2, %zmm5, %zmm2
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vaddps	%zmm13, %zmm31, %zmm5
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm3, %zmm7, %zmm3
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm5, %zmm3, %zmm3
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm3, %zmm31, %zmm3
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -     xorl	%esi, %esi
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -     xorl	%edi, %edi
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     testl	$2147483647, %eax
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     sete	%sil
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     setne	%dil
 | 
			
		||||
 -      -      -     1.00    -      -      -      -      -      -     movl	$255, %eax
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     cmovel	%r8d, %eax
 | 
			
		||||
 -      -      -     1.00    -      -      -      -      -      -     movl	$255, %ecx
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     cmovel	%r9d, %ecx
 | 
			
		||||
 -      -      -     1.00    -      -      -      -      -      -     xorl	$255, %esi
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     kmovd	%esi, %k1
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vcmpltps	%zmm0, %zmm27, %k1 {%k1}
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm14, %zmm4, %zmm4
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm4, %zmm24, %zmm5 {%k1} {z}
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm4, %zmm25, %zmm24 {%k1} {z}
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm4, %zmm26, %zmm4 {%k1} {z}
 | 
			
		||||
 -      -      -     1.00    -      -      -      -      -      -     leal	(%rdi,%rdi,2), %esi
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     orl	$252, %esi
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     kmovd	%esi, %k1
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vcmpltps	%zmm0, %zmm28, %k1 {%k1}
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vmulps	%zmm14, %zmm1, %zmm1
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vmulps	%zmm1, %zmm21, %zmm21 {%k1} {z}
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vaddps	%zmm21, %zmm5, %zmm5
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -     vmulps	%zmm1, %zmm22, %zmm21 {%k1} {z}
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vaddps	%zmm21, %zmm24, %zmm21
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vmulps	%zmm1, %zmm23, %zmm1 {%k1} {z}
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vaddps	%zmm1, %zmm4, %zmm1
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     kmovd	%eax, %k1
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vcmpltps	%zmm0, %zmm29, %k1 {%k1}
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm14, %zmm2, %zmm2
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm2, %zmm17, %zmm4 {%k1} {z}
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm2, %zmm19, %zmm17 {%k1} {z}
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm2, %zmm20, %zmm2 {%k1} {z}
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     kmovd	%ecx, %k1
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vcmpltps	%zmm0, %zmm30, %k1 {%k1}
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm14, %zmm3, %zmm3
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm3, %zmm18, %zmm18 {%k1} {z}
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vaddps	%zmm18, %zmm4, %zmm4
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vaddps	%zmm4, %zmm5, %zmm4
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm3, %zmm16, %zmm5 {%k1} {z}
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vaddps	%zmm5, %zmm17, %zmm5
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vaddps	%zmm5, %zmm21, %zmm5
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm3, %zmm15, %zmm3 {%k1} {z}
 | 
			
		||||
 -      -      -      -     1.00    -      -      -      -      -     movq	176(%r15), %rax
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vaddps	%zmm3, %zmm2, %zmm2
 | 
			
		||||
 -      -      -     1.00   0.50   0.50    -      -      -      -     vmovups	(%rax,%rbx), %zmm3
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vsubps	%zmm4, %zmm3, %zmm3
 | 
			
		||||
 -      -      -      -      -      -     1.00    -      -     1.00   vmovups	%zmm3, (%rax,%rbx)
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vaddps	%zmm2, %zmm1, %zmm1
 | 
			
		||||
 -      -      -     1.00   0.50   0.50    -      -      -      -     vmovups	32(%rax,%rbx), %zmm2
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vsubps	%zmm5, %zmm2, %zmm2
 | 
			
		||||
 -      -      -      -      -      -     1.00    -      -     1.00   vmovups	%zmm2, 32(%rax,%rbx)
 | 
			
		||||
 -      -      -     1.00   0.50   0.50    -      -      -      -     vmovups	64(%rax,%rbx), %zmm2
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vsubps	%zmm1, %zmm2, %zmm1
 | 
			
		||||
 -      -      -      -      -     1.00   1.00    -      -      -     vmovups	%zmm1, 64(%rax,%rbx)
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     cmpq	%rdx, %r10
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     je	.LBB4_18
 | 
			
		||||
 -      -      -      -     0.50   0.50    -      -      -      -     movq	160(%r15), %rdi
 | 
			
		||||
 -      -      -     1.00    -      -      -      -      -      -     incq	%rdx
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     jmp	.LBB4_8
 | 
			
		||||
@@ -1,116 +0,0 @@
 | 
			
		||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
 | 
			
		||||
Analyzed file:      gromacs-icx-avx512-sp.s
 | 
			
		||||
Architecture:       ICX
 | 
			
		||||
Timestamp:          2023-02-14 12:51:43
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
 | 
			
		||||
 * - Instruction micro-ops not bound to a port
 | 
			
		||||
 X - No throughput/latency information for this instruction in data file
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Combined Analysis Report
 | 
			
		||||
------------------------
 | 
			
		||||
                                                 Port pressure in cycles                                                  
 | 
			
		||||
     |  0   - 0DV  |  1   - 1DV  |  2   -  2D  |  3   -  3D  |  4   |   5   |  6   |  7   |  8   |  9   ||  CP  | LCD  |
 | 
			
		||||
------------------------------------------------------------------------------------------------------------------------
 | 
			
		||||
1338 |             |             |             |             |      |       |      |      |      |      ||      |      |   # pointer_increment=64 0f91ac4f7fe1a70d0c899f7f3e745649
 | 
			
		||||
1339 |             |             |             |             |      |       |      |      |      |      ||      |      |   # LLVM-MCA-BEGIN
 | 
			
		||||
1340 |             |             |             |             |      |       |      |      |      |      ||      |      |   .LBB2_12:                               #   Parent Loop BB2_7 Depth=1
 | 
			
		||||
1341 |             |             |             |             |      |       |      |      |      |      ||      |      |   # =>  This Inner Loop Header: Depth=2
 | 
			
		||||
1342 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||  5.0 |      |   movslq (%r11,%rax,4), %rcx
 | 
			
		||||
1343 | 0.00        | 0.75        |             |             |      | 0.000 | 0.25 |      |      |      ||  6.0 |      |   leaq (%rcx,%rcx,2), %rdx
 | 
			
		||||
1344 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||  1.0 |      |   shlq $5, %rdx
 | 
			
		||||
1345 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||      |      |   vmovupd (%rsi,%rdx), %zmm16
 | 
			
		||||
1346 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||  5.0 |      |   vbroadcastf64x4 64(%rsi,%rdx), %zmm20   # zmm20 = mem[0,1,2,3,0,1,2,3]
 | 
			
		||||
1347 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||      |      |   vbroadcastf64x4 (%rsi,%rdx), %zmm19     # zmm19 = mem[0,1,2,3,0,1,2,3]
 | 
			
		||||
1348 |             |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vshuff64x2 $238, %zmm16, %zmm16, %zmm21 # zmm21 = zmm16[4,5,6,7,4,5,6,7]
 | 
			
		||||
1349 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vsubps %zmm19, %zmm6, %zmm18
 | 
			
		||||
1350 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vsubps %zmm21, %zmm10, %zmm17
 | 
			
		||||
1351 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vsubps %zmm20, %zmm14, %zmm16
 | 
			
		||||
1352 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulps %zmm16, %zmm16, %zmm22
 | 
			
		||||
1353 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vfmadd231ps %zmm17, %zmm17, %zmm22  # zmm22 = (zmm17 * zmm17) + zmm22
 | 
			
		||||
1354 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vfmadd231ps %zmm18, %zmm18, %zmm22  # zmm22 = (zmm18 * zmm18) + zmm22
 | 
			
		||||
1355 | 2.50        |             |             |             |      | 0.500 |      |      |      |      ||  6.0 |      |   vrcp14ps %zmm22, %zmm23
 | 
			
		||||
1356 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulps %zmm23, %zmm26, %zmm24
 | 
			
		||||
1357 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulps %zmm24, %zmm23, %zmm24
 | 
			
		||||
1358 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulps %zmm24, %zmm23, %zmm24
 | 
			
		||||
1359 | 0.75        |             |             |             |      | 0.250 |      |      |      |      ||  4.0 |      |   vaddps %zmm1, %zmm24, %zmm25
 | 
			
		||||
1360 | 1.00        |             |             |             |      | 0.000 |      |      |      |      ||      |      |   vmulps %zmm23, %zmm27, %zmm23
 | 
			
		||||
1361 | 1.00        |             |             |             |      | 0.000 |      |      |      |      ||  4.0 |      |   vmulps %zmm25, %zmm23, %zmm23
 | 
			
		||||
1362 | 1.00        |             |             |             |      | 0.000 |      |      |      |      ||  4.0 |      |   vmulps %zmm23, %zmm24, %zmm23
 | 
			
		||||
1363 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |      |   leal (%rcx,%rcx), %edx
 | 
			
		||||
1364 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |      |   xorl %edi, %edi
 | 
			
		||||
1365 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |      |   xorl %ebp, %ebp
 | 
			
		||||
1366 | 0.00        | 0.75        |             |             |      | 0.000 | 0.25 |      |      |      ||      |      |   cmpq %rdx, %r12
 | 
			
		||||
1367 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   setne %dil
 | 
			
		||||
1368 | 0.00        | 0.75        |             |             |      | 0.000 | 0.25 |      |      |      ||      |      |   leal 1(%rcx,%rcx), %ecx
 | 
			
		||||
1369 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   sete %bpl
 | 
			
		||||
1370 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |      |   xorl %edx, %edx
 | 
			
		||||
1371 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |      |   xorl %ebx, %ebx
 | 
			
		||||
1372 | 0.00        | 0.75        |             |             |      | 0.000 | 0.25 |      |      |      ||      |      |   cmpq %rcx, %r12
 | 
			
		||||
1373 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   sete %dl
 | 
			
		||||
1374 | 0.00        | 1.00        |             |             |      | 0.000 | 0.00 |      |      |      ||      |      |   movl $0, %ecx
 | 
			
		||||
1375 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   setne %bl
 | 
			
		||||
1376 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   cmovel %r8d, %ecx
 | 
			
		||||
1377 | 0.00        | 0.75        |             |             |      | 0.000 | 0.25 |      |      |      ||      |      |   movl %ebx, %r14d
 | 
			
		||||
1378 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   shll $4, %r14d
 | 
			
		||||
1379 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |      |   subl %ebp, %r14d
 | 
			
		||||
1380 | 0.00        | 0.75        |             |             |      | 0.000 | 0.25 |      |      |      ||      |      |   leal (%rcx,%rdi,2), %ecx
 | 
			
		||||
1381 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   shll $8, %ecx
 | 
			
		||||
1382 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |      |   addl $239, %r14d
 | 
			
		||||
1383 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |      |   addl $-768, %ecx                     # imm = 0xFD00
 | 
			
		||||
1384 | 0.00        | 1.00        |             |             |      | 0.000 | 0.00 |      |      |      ||      |      |   orl %r14d, %ecx
 | 
			
		||||
1385 | 1.00        |             |             |             |      |       |      |      |      |      ||      |      |   kmovd %ecx, %k2
 | 
			
		||||
1386 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vcmpltps %zmm0, %zmm22, %k2 {%k2}
 | 
			
		||||
1387 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vsubps %zmm21, %zmm11, %zmm21
 | 
			
		||||
1388 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vsubps %zmm20, %zmm15, %zmm20
 | 
			
		||||
1389 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vsubps %zmm19, %zmm7, %zmm19
 | 
			
		||||
1390 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulps %zmm2, %zmm23, %zmm22
 | 
			
		||||
1391 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vfmadd231ps %zmm22, %zmm18, %zmm12 {%k2} # zmm12 {%k2} = (zmm18 * zmm22) + zmm12
 | 
			
		||||
1392 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulps %zmm20, %zmm20, %zmm18
 | 
			
		||||
1393 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd231ps %zmm21, %zmm21, %zmm18  # zmm18 = (zmm21 * zmm21) + zmm18
 | 
			
		||||
1394 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd231ps %zmm19, %zmm19, %zmm18  # zmm18 = (zmm19 * zmm19) + zmm18
 | 
			
		||||
1395 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd231ps %zmm22, %zmm17, %zmm9 {%k2} # zmm9 {%k2} = (zmm17 * zmm22) + zmm9
 | 
			
		||||
1396 | 2.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vrcp14ps %zmm18, %zmm17
 | 
			
		||||
1397 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd231ps %zmm22, %zmm16, %zmm5 {%k2} # zmm5 {%k2} = (zmm16 * zmm22) + zmm5
 | 
			
		||||
1398 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulps %zmm17, %zmm26, %zmm16
 | 
			
		||||
1399 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulps %zmm16, %zmm17, %zmm16
 | 
			
		||||
1400 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulps %zmm16, %zmm17, %zmm16
 | 
			
		||||
1401 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vaddps %zmm1, %zmm16, %zmm22
 | 
			
		||||
1402 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vmulps %zmm17, %zmm27, %zmm17
 | 
			
		||||
1403 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vmulps %zmm22, %zmm17, %zmm17
 | 
			
		||||
1404 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vmulps %zmm17, %zmm16, %zmm16
 | 
			
		||||
1405 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   shll $6, %ebx
 | 
			
		||||
1406 | 0.00        | 1.00        |             |             |      | 0.000 | 0.00 |      |      |      ||      |      |   leal (%rbx,%rdi,4), %ecx
 | 
			
		||||
1407 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   shll $7, %edx
 | 
			
		||||
1408 | 0.00        | 1.00        |             |             |      | 0.000 | 0.00 |      |      |      ||      |      |   leal (%rdx,%rdi,8), %edx
 | 
			
		||||
1409 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   shll $8, %edx
 | 
			
		||||
1410 | 0.00        | 1.00        |             |             |      | 0.000 | 0.00 |      |      |      ||      |      |   addl %edx, %ecx
 | 
			
		||||
1411 | 0.00        | 1.00        |             |             |      | 0.000 | 0.00 |      |      |      ||      |      |   addl $-2117, %ecx                    # imm = 0xF7BB
 | 
			
		||||
1412 | 1.00        |             |             |             |      |       |      |      |      |      ||      |      |   kmovd %ecx, %k2
 | 
			
		||||
1413 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vcmpltps %zmm0, %zmm18, %k2 {%k2}
 | 
			
		||||
1414 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vmulps %zmm2, %zmm16, %zmm16
 | 
			
		||||
1415 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vfmadd231ps %zmm16, %zmm19, %zmm13 {%k2} # zmm13 {%k2} = (zmm19 * zmm16) + zmm13
 | 
			
		||||
1416 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vfmadd231ps %zmm16, %zmm21, %zmm8 {%k2} # zmm8 {%k2} = (zmm21 * zmm16) + zmm8
 | 
			
		||||
1417 | 0.24        |             |             |             |      | 0.760 |      |      |      |      ||      |  4.0 |   vfmadd231ps %zmm16, %zmm20, %zmm4 {%k2} # zmm4 {%k2} = (zmm20 * zmm16) + zmm4
 | 
			
		||||
1418 | 0.00        | 1.00        |             |             |      | -0.01 | 0.00 |      |      |      ||      |      |   incq %rax
 | 
			
		||||
1419 | 0.00        | 1.00        |             |             |      | -0.01 | 0.00 |      |      |      ||      |      |   cmpq %rax, %r10
 | 
			
		||||
1420 |             |             |             |             |      |       |      |      |      |      ||      |      | * jne .LBB2_12
 | 
			
		||||
1421 |             |             |             |             |      |       |      |      |      |      ||      |      |   # LLVM-MCA-END
 | 
			
		||||
 | 
			
		||||
       22.5          16.5          2.00   2.00   2.00   2.00          22.49   16.5                           71    4.0  
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Loop-Carried Dependencies Analysis Report
 | 
			
		||||
-----------------------------------------
 | 
			
		||||
1417 |  4.0 | vfmadd231ps	%zmm16, %zmm20, %zmm4 {%k2} # zmm4 {%k2} = (zmm20 * zmm16) + zmm4| [1417]
 | 
			
		||||
1416 |  4.0 | vfmadd231ps	%zmm16, %zmm21, %zmm8 {%k2} # zmm8 {%k2} = (zmm21 * zmm16) + zmm8| [1416]
 | 
			
		||||
1415 |  4.0 | vfmadd231ps	%zmm16, %zmm19, %zmm13 {%k2} # zmm13 {%k2} = (zmm19 * zmm16) + zmm13| [1415]
 | 
			
		||||
1397 |  4.0 | vfmadd231ps	%zmm22, %zmm16, %zmm5 {%k2} # zmm5 {%k2} = (zmm16 * zmm22) + zmm5| [1397]
 | 
			
		||||
1395 |  4.0 | vfmadd231ps	%zmm22, %zmm17, %zmm9 {%k2} # zmm9 {%k2} = (zmm17 * zmm22) + zmm9| [1395]
 | 
			
		||||
1391 |  4.0 | vfmadd231ps	%zmm22, %zmm18, %zmm12 {%k2} # zmm12 {%k2} = (zmm18 * zmm22) + zmm12| [1391]
 | 
			
		||||
1418 |  1.0 | incq	%rax                           | [1418]
 | 
			
		||||
 | 
			
		||||
@@ -1,161 +0,0 @@
 | 
			
		||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
 | 
			
		||||
Analyzed file:      gromacs-icx-avx512-sp.s
 | 
			
		||||
Architecture:       CSX
 | 
			
		||||
Timestamp:          2023-02-10 16:31:04
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
 | 
			
		||||
 * - Instruction micro-ops not bound to a port
 | 
			
		||||
 X - No throughput/latency information for this instruction in data file
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Combined Analysis Report
 | 
			
		||||
------------------------
 | 
			
		||||
                                      Port pressure in cycles                                       
 | 
			
		||||
     |  0   - 0DV  |  1   |  2   -  2D  |  3   -  3D  |  4   |  5   |  6   |  7   ||  CP  | LCD  |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
1662 |             |      |             |             |      |      |      |      ||      |      |   # pointer_increment=64 0f91ac4f7fe1a70d0c899f7f3e745649
 | 
			
		||||
1663 |             |      |             |             |      |      |      |      ||      |      |   # LLVM-MCA-BEGIN
 | 
			
		||||
1664 |             |      |             |             |      |      |      |      ||      |      |   .LBB4_8:                                # =>This Inner Loop Header: Depth=1
 | 
			
		||||
1665 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||  4.0 |      |   movslq (%r11,%rdx,4), %rax
 | 
			
		||||
1666 | 0.00        | 0.50 |             |             |      | 0.00 | 0.50 |      ||  1.0 |      |   movq %rax, %rsi
 | 
			
		||||
1667 | 0.00        |      |             |             |      |      | 1.00 |      ||  1.0 |      |   shlq $5, %rsi
 | 
			
		||||
1668 |             | 1.00 |             |             |      | 0.00 |      |      ||  1.0 |      |   leaq (%rsi,%rsi,2), %rbx
 | 
			
		||||
1669 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovups (%rdi,%rbx), %zmm15             # AlignMOV convert to UnAlignMOV
 | 
			
		||||
1670 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovups 32(%rdi,%rbx), %zmm16           # AlignMOV convert to UnAlignMOV
 | 
			
		||||
1671 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||  4.0 |      |   vmovups 64(%rdi,%rbx), %zmm27           # AlignMOV convert to UnAlignMOV
 | 
			
		||||
1672 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovups 128(%rsp), %zmm1                # 64-byte Reload
 | 
			
		||||
1673 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubps %zmm15, %zmm1, %zmm24
 | 
			
		||||
1674 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovups 320(%rsp), %zmm1                # 64-byte Reload
 | 
			
		||||
1675 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubps %zmm16, %zmm1, %zmm25
 | 
			
		||||
1676 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vsubps %zmm27, %zmm9, %zmm26
 | 
			
		||||
1677 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovups (%rsp), %zmm1                   # 64-byte Reload
 | 
			
		||||
1678 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubps %zmm15, %zmm1, %zmm21
 | 
			
		||||
1679 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovups 256(%rsp), %zmm1                # 64-byte Reload
 | 
			
		||||
1680 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubps %zmm16, %zmm1, %zmm22
 | 
			
		||||
1681 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubps %zmm27, %zmm10, %zmm23
 | 
			
		||||
1682 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovups 448(%rsp), %zmm1                # 64-byte Reload
 | 
			
		||||
1683 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubps %zmm15, %zmm1, %zmm17
 | 
			
		||||
1684 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovups 192(%rsp), %zmm1                # 64-byte Reload
 | 
			
		||||
1685 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubps %zmm16, %zmm1, %zmm19
 | 
			
		||||
1686 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubps %zmm27, %zmm11, %zmm20
 | 
			
		||||
1687 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovups 384(%rsp), %zmm1                # 64-byte Reload
 | 
			
		||||
1688 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubps %zmm15, %zmm1, %zmm18
 | 
			
		||||
1689 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubps %zmm16, %zmm8, %zmm16
 | 
			
		||||
1690 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubps %zmm27, %zmm12, %zmm15
 | 
			
		||||
1691 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulps %zmm26, %zmm26, %zmm27
 | 
			
		||||
1692 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vfmadd231ps %zmm25, %zmm25, %zmm27  # zmm27 = (zmm25 * zmm25) + zmm27
 | 
			
		||||
1693 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vfmadd231ps %zmm24, %zmm24, %zmm27  # zmm27 = (zmm24 * zmm24) + zmm27
 | 
			
		||||
1694 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm23, %zmm23, %zmm28
 | 
			
		||||
1695 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231ps %zmm22, %zmm22, %zmm28  # zmm28 = (zmm22 * zmm22) + zmm28
 | 
			
		||||
1696 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231ps %zmm21, %zmm21, %zmm28  # zmm28 = (zmm21 * zmm21) + zmm28
 | 
			
		||||
1697 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm20, %zmm20, %zmm29
 | 
			
		||||
1698 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231ps %zmm19, %zmm19, %zmm29  # zmm29 = (zmm19 * zmm19) + zmm29
 | 
			
		||||
1699 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231ps %zmm17, %zmm17, %zmm29  # zmm29 = (zmm17 * zmm17) + zmm29
 | 
			
		||||
1700 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm15, %zmm15, %zmm30
 | 
			
		||||
1701 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231ps %zmm16, %zmm16, %zmm30  # zmm30 = (zmm16 * zmm16) + zmm30
 | 
			
		||||
1702 | 2.50        |      |             |             |      | 0.50 |      |      ||  8.0 |      |   vrcp14ps %zmm27, %zmm31
 | 
			
		||||
1703 | 2.50        |      |             |             |      | 0.50 |      |      ||      |      |   vrcp14ps %zmm28, %zmm1
 | 
			
		||||
1704 | 2.50        |      |             |             |      | 0.50 |      |      ||      |      |   vrcp14ps %zmm29, %zmm2
 | 
			
		||||
1705 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231ps %zmm18, %zmm18, %zmm30  # zmm30 = (zmm18 * zmm18) + zmm30
 | 
			
		||||
1706 | 2.50        |      |             |             |      | 0.50 |      |      ||      |      |   vrcp14ps %zmm30, %zmm3
 | 
			
		||||
1707 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulps %zmm31, %zmm6, %zmm4
 | 
			
		||||
1708 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulps %zmm4, %zmm31, %zmm4
 | 
			
		||||
1709 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulps %zmm4, %zmm31, %zmm4
 | 
			
		||||
1710 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vaddps %zmm13, %zmm4, %zmm5
 | 
			
		||||
1711 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm31, %zmm7, %zmm31
 | 
			
		||||
1712 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulps %zmm5, %zmm31, %zmm5
 | 
			
		||||
1713 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm1, %zmm6, %zmm31
 | 
			
		||||
1714 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm31, %zmm1, %zmm31
 | 
			
		||||
1715 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm31, %zmm1, %zmm31
 | 
			
		||||
1716 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulps %zmm5, %zmm4, %zmm4
 | 
			
		||||
1717 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vaddps %zmm13, %zmm31, %zmm5
 | 
			
		||||
1718 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm1, %zmm7, %zmm1
 | 
			
		||||
1719 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm5, %zmm1, %zmm1
 | 
			
		||||
1720 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm2, %zmm6, %zmm5
 | 
			
		||||
1721 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm5, %zmm2, %zmm5
 | 
			
		||||
1722 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm5, %zmm2, %zmm5
 | 
			
		||||
1723 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm1, %zmm31, %zmm1
 | 
			
		||||
1724 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vaddps %zmm13, %zmm5, %zmm31
 | 
			
		||||
1725 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm2, %zmm7, %zmm2
 | 
			
		||||
1726 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm31, %zmm2, %zmm2
 | 
			
		||||
1727 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm3, %zmm6, %zmm31
 | 
			
		||||
1728 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm31, %zmm3, %zmm31
 | 
			
		||||
1729 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm31, %zmm3, %zmm31
 | 
			
		||||
1730 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm2, %zmm5, %zmm2
 | 
			
		||||
1731 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vaddps %zmm13, %zmm31, %zmm5
 | 
			
		||||
1732 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm3, %zmm7, %zmm3
 | 
			
		||||
1733 | 1.00        |      |             |             |      | 0.00 |      |      ||      |      |   vmulps %zmm5, %zmm3, %zmm3
 | 
			
		||||
1734 | 1.00        |      |             |             |      | 0.00 |      |      ||      |      |   vmulps %zmm3, %zmm31, %zmm3
 | 
			
		||||
1735 | 0.00        | 0.50 |             |             |      | 0.00 | 0.50 |      ||      |      |   xorl %esi, %esi
 | 
			
		||||
1736 | 0.00        | 0.50 |             |             |      | 0.00 | 0.50 |      ||      |      |   xorl %edi, %edi
 | 
			
		||||
1737 | 0.00        | 1.00 |             |             |      | 0.00 | 0.00 |      ||      |      |   testl $2147483647, %eax               # imm = 0x7FFFFFFF
 | 
			
		||||
1738 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   sete %sil
 | 
			
		||||
1739 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   setne %dil
 | 
			
		||||
1740 | 0.00        | 0.75 |             |             |      | 0.00 | 0.25 |      ||      |      |   movl $255, %eax
 | 
			
		||||
1741 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   cmovel %r8d, %eax
 | 
			
		||||
1742 | 0.00        | 0.75 |             |             |      | 0.00 | 0.25 |      ||      |      |   movl $255, %ecx
 | 
			
		||||
1743 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   cmovel %r9d, %ecx
 | 
			
		||||
1744 | 0.00        | 0.25 |             |             |      | 0.00 | 0.75 |      ||      |      |   xorl $255, %esi
 | 
			
		||||
1745 | 1.00        |      |             |             |      |      |      |      ||      |      |   kmovd %esi, %k1
 | 
			
		||||
1746 |             |      |             |             |      | 1.00 |      |      ||      |      |   vcmpltps %zmm0, %zmm27, %k1 {%k1}
 | 
			
		||||
1747 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulps %zmm14, %zmm4, %zmm4
 | 
			
		||||
1748 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulps %zmm4, %zmm24, %zmm5 {%k1} {z}
 | 
			
		||||
1749 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm4, %zmm25, %zmm24 {%k1} {z}
 | 
			
		||||
1750 | 0.25        |      |             |             |      | 0.75 |      |      ||      |      |   vmulps %zmm4, %zmm26, %zmm4 {%k1} {z}
 | 
			
		||||
1751 |             | 1.00 |             |             |      | 0.00 |      |      ||      |      |   leal (%rdi,%rdi,2), %esi
 | 
			
		||||
1752 | 0.00        | 0.75 |             |             |      | 0.00 | 0.25 |      ||      |      |   orl $252, %esi
 | 
			
		||||
1753 | 1.00        |      |             |             |      |      |      |      ||      |      |   kmovd %esi, %k1
 | 
			
		||||
1754 |             |      |             |             |      | 1.00 |      |      ||      |      |   vcmpltps %zmm0, %zmm28, %k1 {%k1}
 | 
			
		||||
1755 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm14, %zmm1, %zmm1
 | 
			
		||||
1756 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm1, %zmm21, %zmm21 {%k1} {z}
 | 
			
		||||
1757 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vaddps %zmm21, %zmm5, %zmm5
 | 
			
		||||
1758 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm1, %zmm22, %zmm21 {%k1} {z}
 | 
			
		||||
1759 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vaddps %zmm21, %zmm24, %zmm21
 | 
			
		||||
1760 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm1, %zmm23, %zmm1 {%k1} {z}
 | 
			
		||||
1761 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vaddps %zmm1, %zmm4, %zmm1
 | 
			
		||||
1762 | 1.00        |      |             |             |      |      |      |      ||      |      |   kmovd %eax, %k1
 | 
			
		||||
1763 |             |      |             |             |      | 1.00 |      |      ||      |      |   vcmpltps %zmm0, %zmm29, %k1 {%k1}
 | 
			
		||||
1764 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm14, %zmm2, %zmm2
 | 
			
		||||
1765 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm2, %zmm17, %zmm4 {%k1} {z}
 | 
			
		||||
1766 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm2, %zmm19, %zmm17 {%k1} {z}
 | 
			
		||||
1767 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm2, %zmm20, %zmm2 {%k1} {z}
 | 
			
		||||
1768 | 1.00        |      |             |             |      |      |      |      ||      |      |   kmovd %ecx, %k1
 | 
			
		||||
1769 |             |      |             |             |      | 1.00 |      |      ||      |      |   vcmpltps %zmm0, %zmm30, %k1 {%k1}
 | 
			
		||||
1770 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm14, %zmm3, %zmm3
 | 
			
		||||
1771 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm3, %zmm18, %zmm18 {%k1} {z}
 | 
			
		||||
1772 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vaddps %zmm18, %zmm4, %zmm4
 | 
			
		||||
1773 | 0.25        |      |             |             |      | 0.75 |      |      ||  4.0 |      |   vaddps %zmm4, %zmm5, %zmm4
 | 
			
		||||
1774 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vmulps %zmm3, %zmm16, %zmm5 {%k1} {z}
 | 
			
		||||
1775 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vaddps %zmm5, %zmm17, %zmm5
 | 
			
		||||
1776 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vaddps %zmm5, %zmm21, %zmm5
 | 
			
		||||
1777 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vmulps %zmm3, %zmm15, %zmm3 {%k1} {z}
 | 
			
		||||
1778 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   movq 176(%r15), %rax
 | 
			
		||||
1779 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vaddps %zmm3, %zmm2, %zmm2
 | 
			
		||||
1780 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovups (%rax,%rbx), %zmm3              # AlignMOV convert to UnAlignMOV
 | 
			
		||||
1781 | 0.00        |      |             |             |      | 1.00 |      |      ||  4.0 |      |   vsubps %zmm4, %zmm3, %zmm3
 | 
			
		||||
1782 |             |      | 0.50        | 0.50        | 1.00 |      |      |      ||  0.0 |      |   vmovups %zmm3, (%rax,%rbx)              # AlignMOV convert to UnAlignMOV
 | 
			
		||||
1783 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vaddps %zmm2, %zmm1, %zmm1
 | 
			
		||||
1784 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovups 32(%rax,%rbx), %zmm2            # AlignMOV convert to UnAlignMOV
 | 
			
		||||
1785 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vsubps %zmm5, %zmm2, %zmm2
 | 
			
		||||
1786 |             |      | 0.50        | 0.50        | 1.00 |      |      |      ||      |      |   vmovups %zmm2, 32(%rax,%rbx)            # AlignMOV convert to UnAlignMOV
 | 
			
		||||
1787 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovups 64(%rax,%rbx), %zmm2            # AlignMOV convert to UnAlignMOV
 | 
			
		||||
1788 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vsubps %zmm1, %zmm2, %zmm1
 | 
			
		||||
1789 |             |      | 0.50        | 0.50        | 1.00 |      |      |      ||      |      |   vmovups %zmm1, 64(%rax,%rbx)            # AlignMOV convert to UnAlignMOV
 | 
			
		||||
1790 | 0.00        | 1.00 |             |             |      | 0.00 | 0.00 |      ||      |      |   cmpq %rdx, %r10
 | 
			
		||||
1791 |             |      |             |             |      |      |      |      ||      |      | * je .LBB4_18
 | 
			
		||||
1792 |             |      |             |             |      |      |      |      ||      |      |   # %bb.9:                                #   in Loop: Header=BB4_8 Depth=1
 | 
			
		||||
1793 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   movq 160(%r15), %rdi
 | 
			
		||||
1794 | 0.00        | 1.00 |             |             |      | 0.00 | 0.00 |      ||      |  1.0 |   incq %rdx
 | 
			
		||||
1795 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   jmp .LBB4_8
 | 
			
		||||
1796 |             |      |             |             |      |      |      |      ||      |      |   # LLVM-MCA-END
 | 
			
		||||
 | 
			
		||||
       50.0          9.00   9.50   8.00   9.50   8.00   3.00   50.0   9.00           79.0    1.0  
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Loop-Carried Dependencies Analysis Report
 | 
			
		||||
-----------------------------------------
 | 
			
		||||
1794 |  1.0 | incq	%rdx                           | [1794]
 | 
			
		||||
 | 
			
		||||
@@ -1,88 +0,0 @@
 | 
			
		||||
Intel(R) Architecture Code Analyzer Version -  v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
 | 
			
		||||
Analyzed File -  lammps-icc-avx2.o
 | 
			
		||||
Binary Format - 64Bit
 | 
			
		||||
Architecture  -  SKX
 | 
			
		||||
Analysis Type - Throughput
 | 
			
		||||
 | 
			
		||||
Throughput Analysis Report
 | 
			
		||||
--------------------------
 | 
			
		||||
Block Throughput: 25.58 Cycles       Throughput Bottleneck: Backend
 | 
			
		||||
Loop Count:  22
 | 
			
		||||
Port Binding In Cycles Per Iteration:
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
|  Port  |   0   -  DV   |   1   |   2   -  D    |   3   -  D    |   4   |   5   |   6   |   7   |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
| Cycles | 13.7     8.0  | 13.6  |  5.5     5.5  |  5.5     5.5  |  0.0  | 13.7  |  7.0  |  0.0  |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
DV - Divider pipe (on port 0)
 | 
			
		||||
D - Data fetch pipe (on ports 2 and 3)
 | 
			
		||||
F - Macro Fusion with the previous instruction occurred
 | 
			
		||||
* - instruction micro-ops not bound to a port
 | 
			
		||||
^ - Micro Fusion occurred
 | 
			
		||||
# - ESP Tracking sync uop was issued
 | 
			
		||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
 | 
			
		||||
X - instruction not supported, was not accounted in Analysis
 | 
			
		||||
 | 
			
		||||
| Num Of   |                    Ports pressure in cycles                         |      |
 | 
			
		||||
|  Uops    |  0  - DV    |  1   |  2  -  D    |  3  -  D    |  4   |  5   |  6   |  7   |
 | 
			
		||||
-----------------------------------------------------------------------------------------
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovdqu xmm0, xmmword ptr [rbx+rdx*4]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmovq rcx, xmm0
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vpunpckhqdq xmm2, xmm0, xmm0
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmovq r15, xmm2
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | mov r8d, ecx
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shr rcx, 0x20
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | lea r14d, ptr [rcx+rcx*2]
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | lea r8d, ptr [r8+r8*2]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | movsxd rcx, r8d
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | movsxd r8, r14d
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | mov r14d, r15d
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shr r15, 0x20
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovups xmm7, xmmword ptr [r11+rcx*8]
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovups xmm6, xmmword ptr [r11+r8*8]
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovq xmm14, qword ptr [r11+rcx*8+0x10]
 | 
			
		||||
|   1      |             | 0.3  |             |             |      | 0.7  |      |      | lea r14d, ptr [r14+r14*2]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | movsxd r14, r14d
 | 
			
		||||
|   1      |             | 0.7  |             |             |      | 0.3  |      |      | lea r15d, ptr [r15+r15*2]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | movsxd r15, r15d
 | 
			
		||||
|   2      |             |      | 0.5     0.5 | 0.5     0.5 |      | 1.0  |      |      | vmovhpd xmm15, xmm14, qword ptr [r11+r8*8+0x10]
 | 
			
		||||
|   2      |             | 1.0  | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vinsertf128 ymm1, ymm7, xmmword ptr [r11+r14*8], 0x1
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovq xmm0, qword ptr [r11+r14*8+0x10]
 | 
			
		||||
|   2      |             | 0.3  | 0.5     0.5 | 0.5     0.5 |      | 0.7  |      |      | vinsertf128 ymm6, ymm6, xmmword ptr [r11+r15*8], 0x1
 | 
			
		||||
|   2      |             |      | 0.5     0.5 | 0.5     0.5 |      | 1.0  |      |      | vmovhpd xmm2, xmm0, qword ptr [r11+r15*8+0x10]
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vunpcklpd ymm14, ymm1, ymm6
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vunpckhpd ymm1, ymm1, ymm6
 | 
			
		||||
|   1      | 0.7         | 0.3  |             |             |      |      |      |      | vsubpd ymm6, ymm10, ymm14
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vinsertf128 ymm7, ymm15, xmm2, 0x1
 | 
			
		||||
|   1      | 0.3         | 0.7  |             |             |      |      |      |      | vsubpd ymm2, ymm9, ymm1
 | 
			
		||||
|   1      | 0.7         | 0.3  |             |             |      |      |      |      | vsubpd ymm0, ymm8, ymm7
 | 
			
		||||
|   1      | 0.3         | 0.7  |             |             |      |      |      |      | vmulpd ymm14, ymm2, ymm2
 | 
			
		||||
|   1      | 0.7         | 0.3  |             |             |      |      |      |      | vfmadd231pd ymm14, ymm6, ymm6
 | 
			
		||||
|   1      | 0.3         | 0.7  |             |             |      |      |      |      | vfmadd231pd ymm14, ymm0, ymm0
 | 
			
		||||
|   1      | 0.7         | 0.3  |             |             |      |      |      |      | vcmppd ymm1, ymm14, ymm5, 0x1
 | 
			
		||||
|   1      | 0.3         | 0.7  |             |             |      |      |      |      | vpcmpeqd ymm7, ymm7, ymm7
 | 
			
		||||
|   2      | 1.0         |      |             |             |      | 1.0  |      |      | vptest ymm1, ymm7
 | 
			
		||||
|   1      | 1.0     8.0 |      |             |             |      |      |      |      | vdivpd ymm7, ymm4, ymm14
 | 
			
		||||
|   2^     |             | 1.0  | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmulpd ymm14, ymm7, ymmword ptr [rsp+0x60]
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | vmulpd ymm14, ymm7, ymm14
 | 
			
		||||
|   1      | 0.7         | 0.3  |             |             |      |      |      |      | vmulpd ymm15, ymm7, ymm14
 | 
			
		||||
|   1      | 0.3         | 0.7  |             |             |      |      |      |      | vfmsub213pd ymm14, ymm7, ymm3
 | 
			
		||||
|   2^     | 0.7         | 0.3  | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmulpd ymm7, ymm7, ymmword ptr [rsp+0x40]
 | 
			
		||||
|   1      | 0.3         | 0.7  |             |             |      |      |      |      | vmulpd ymm15, ymm15, ymm7
 | 
			
		||||
|   1      | 0.7         | 0.3  |             |             |      |      |      |      | vmulpd ymm7, ymm15, ymm14
 | 
			
		||||
|   1      | 0.3         | 0.7  |             |             |      |      |      |      | vmulpd ymm6, ymm6, ymm7
 | 
			
		||||
|   1      | 0.7         | 0.3  |             |             |      |      |      |      | vmulpd ymm2, ymm2, ymm7
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vandpd ymm6, ymm1, ymm6
 | 
			
		||||
|   1      | 0.3         | 0.7  |             |             |      |      |      |      | vaddpd ymm13, ymm13, ymm6
 | 
			
		||||
|   1      | 0.7         | 0.3  |             |             |      |      |      |      | vmulpd ymm6, ymm0, ymm7
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vandpd ymm0, ymm1, ymm2
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vandpd ymm1, ymm1, ymm6
 | 
			
		||||
|   1      | 0.3         | 0.7  |             |             |      |      |      |      | vaddpd ymm12, ymm12, ymm0
 | 
			
		||||
|   1      | 0.7         | 0.3  |             |             |      |      |      |      | vaddpd ymm11, ymm11, ymm1
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | add rdx, 0x4
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | cmp rdx, rsi
 | 
			
		||||
|   0*F    |             |      |             |             |      |      |      |      | jb 0xffffffffffffff02
 | 
			
		||||
Total Num Of Uops: 62
 | 
			
		||||
Analysis Notes:
 | 
			
		||||
Backend allocation was stalled due to unavailable allocation resources.
 | 
			
		||||
@@ -1,156 +0,0 @@
 | 
			
		||||
 | 
			
		||||
[0] Code Region
 | 
			
		||||
 | 
			
		||||
Iterations:        100
 | 
			
		||||
Instructions:      5600
 | 
			
		||||
Total Cycles:      2352
 | 
			
		||||
Total uOps:        6300
 | 
			
		||||
 | 
			
		||||
Dispatch Width:    6
 | 
			
		||||
uOps Per Cycle:    2.68
 | 
			
		||||
IPC:               2.38
 | 
			
		||||
Block RThroughput: 10.5
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Instruction Info:
 | 
			
		||||
[1]: #uOps
 | 
			
		||||
[2]: Latency
 | 
			
		||||
[3]: RThroughput
 | 
			
		||||
[4]: MayLoad
 | 
			
		||||
[5]: MayStore
 | 
			
		||||
[6]: HasSideEffects (U)
 | 
			
		||||
 | 
			
		||||
[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 | 
			
		||||
 1      6     0.50    *                   vmovdqu	(%rbx,%rdx,4), %xmm0
 | 
			
		||||
 1      2     1.00                        vmovq	%xmm0, %rcx
 | 
			
		||||
 1      1     1.00                        vpunpckhqdq	%xmm0, %xmm0, %xmm2
 | 
			
		||||
 1      2     1.00                        vmovq	%xmm2, %r15
 | 
			
		||||
 1      1     0.25                        movl	%ecx, %r8d
 | 
			
		||||
 1      1     0.50                        shrq	$32, %rcx
 | 
			
		||||
 1      1     0.50                        leal	(%rcx,%rcx,2), %r14d
 | 
			
		||||
 1      1     0.50                        leal	(%r8,%r8,2), %r8d
 | 
			
		||||
 1      1     0.25                        movslq	%r8d, %rcx
 | 
			
		||||
 1      1     0.25                        movslq	%r14d, %r8
 | 
			
		||||
 1      1     0.25                        movl	%r15d, %r14d
 | 
			
		||||
 1      1     0.50                        shrq	$32, %r15
 | 
			
		||||
 1      6     0.50    *                   vmovups	(%r11,%rcx,8), %xmm7
 | 
			
		||||
 1      6     0.50    *                   vmovups	(%r11,%r8,8), %xmm6
 | 
			
		||||
 1      5     0.50    *                   vmovq	16(%r11,%rcx,8), %xmm14
 | 
			
		||||
 1      1     0.50                        leal	(%r14,%r14,2), %r14d
 | 
			
		||||
 1      1     0.25                        movslq	%r14d, %r14
 | 
			
		||||
 1      1     0.50                        leal	(%r15,%r15,2), %r15d
 | 
			
		||||
 1      1     0.25                        movslq	%r15d, %r15
 | 
			
		||||
 2      6     1.00    *                   vmovhpd	16(%r11,%r8,8), %xmm14, %xmm15
 | 
			
		||||
 2      7     0.50    *                   vinsertf128	$1, (%r11,%r14,8), %ymm7, %ymm1
 | 
			
		||||
 1      5     0.50    *                   vmovq	16(%r11,%r14,8), %xmm0
 | 
			
		||||
 2      7     0.50    *                   vinsertf128	$1, (%r11,%r15,8), %ymm6, %ymm6
 | 
			
		||||
 2      6     1.00    *                   vmovhpd	16(%r11,%r15,8), %xmm0, %xmm2
 | 
			
		||||
 1      1     1.00                        vunpcklpd	%ymm6, %ymm1, %ymm14
 | 
			
		||||
 1      1     1.00                        vunpckhpd	%ymm6, %ymm1, %ymm1
 | 
			
		||||
 1      4     0.50                        vsubpd	%ymm14, %ymm10, %ymm6
 | 
			
		||||
 1      3     1.00                        vinsertf128	$1, %xmm2, %ymm15, %ymm7
 | 
			
		||||
 1      4     0.50                        vsubpd	%ymm1, %ymm9, %ymm2
 | 
			
		||||
 1      4     0.50                        vsubpd	%ymm7, %ymm8, %ymm0
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm2, %ymm2, %ymm14
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%ymm6, %ymm6, %ymm14
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%ymm0, %ymm0, %ymm14
 | 
			
		||||
 1      4     0.50                        vcmpltpd	%ymm5, %ymm14, %ymm1
 | 
			
		||||
 1      1     0.50                        vpcmpeqd	%ymm7, %ymm7, %ymm7
 | 
			
		||||
 2      3     1.00                        vptest	%ymm7, %ymm1
 | 
			
		||||
 1      14    5.00                        vdivpd	%ymm14, %ymm4, %ymm7
 | 
			
		||||
 2      11    0.50    *                   vmulpd	96(%rsp), %ymm7, %ymm14
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm14, %ymm7, %ymm14
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm14, %ymm7, %ymm15
 | 
			
		||||
 1      4     0.50                        vfmsub213pd	%ymm3, %ymm7, %ymm14
 | 
			
		||||
 2      11    0.50    *                   vmulpd	64(%rsp), %ymm7, %ymm7
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm7, %ymm15, %ymm15
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm14, %ymm15, %ymm7
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm7, %ymm6, %ymm6
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm7, %ymm2, %ymm2
 | 
			
		||||
 1      1     0.33                        vandpd	%ymm6, %ymm1, %ymm6
 | 
			
		||||
 1      4     0.50                        vaddpd	%ymm6, %ymm13, %ymm13
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm7, %ymm0, %ymm6
 | 
			
		||||
 1      1     0.33                        vandpd	%ymm2, %ymm1, %ymm0
 | 
			
		||||
 1      1     0.33                        vandpd	%ymm6, %ymm1, %ymm1
 | 
			
		||||
 1      4     0.50                        vaddpd	%ymm0, %ymm12, %ymm12
 | 
			
		||||
 1      4     0.50                        vaddpd	%ymm1, %ymm11, %ymm11
 | 
			
		||||
 1      1     0.25                        addq	$4, %rdx
 | 
			
		||||
 1      1     0.25                        cmpq	%rsi, %rdx
 | 
			
		||||
 1      1     0.50                        jb	..B1.22
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Resources:
 | 
			
		||||
[0]   - SKXDivider
 | 
			
		||||
[1]   - SKXFPDivider
 | 
			
		||||
[2]   - SKXPort0
 | 
			
		||||
[3]   - SKXPort1
 | 
			
		||||
[4]   - SKXPort2
 | 
			
		||||
[5]   - SKXPort3
 | 
			
		||||
[6]   - SKXPort4
 | 
			
		||||
[7]   - SKXPort5
 | 
			
		||||
[8]   - SKXPort6
 | 
			
		||||
[9]   - SKXPort7
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Resource pressure per iteration:
 | 
			
		||||
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    
 | 
			
		||||
 -     5.00   16.00  14.12  5.50   5.50    -     13.47  8.41    -     
 | 
			
		||||
 | 
			
		||||
Resource pressure by instruction:
 | 
			
		||||
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
 | 
			
		||||
 -      -      -      -     0.50   0.50    -      -      -      -     vmovdqu	(%rbx,%rdx,4), %xmm0
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmovq	%xmm0, %rcx
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vpunpckhqdq	%xmm0, %xmm0, %xmm2
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmovq	%xmm2, %r15
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     movl	%ecx, %r8d
 | 
			
		||||
 -      -     0.06    -      -      -      -      -     0.94    -     shrq	$32, %rcx
 | 
			
		||||
 -      -      -     0.02    -      -      -     0.98    -      -     leal	(%rcx,%rcx,2), %r14d
 | 
			
		||||
 -      -      -     0.02    -      -      -     0.98    -      -     leal	(%r8,%r8,2), %r8d
 | 
			
		||||
 -      -     0.47   0.02    -      -      -      -     0.51    -     movslq	%r8d, %rcx
 | 
			
		||||
 -      -     0.46   0.02    -      -      -     0.01   0.51    -     movslq	%r14d, %r8
 | 
			
		||||
 -      -     0.03   0.01    -      -      -     0.45   0.51    -     movl	%r15d, %r14d
 | 
			
		||||
 -      -     0.51    -      -      -      -      -     0.49    -     shrq	$32, %r15
 | 
			
		||||
 -      -      -      -     0.49   0.51    -      -      -      -     vmovups	(%r11,%rcx,8), %xmm7
 | 
			
		||||
 -      -      -      -     0.49   0.51    -      -      -      -     vmovups	(%r11,%r8,8), %xmm6
 | 
			
		||||
 -      -      -      -     0.52   0.48    -      -      -      -     vmovq	16(%r11,%rcx,8), %xmm14
 | 
			
		||||
 -      -      -     0.02    -      -      -     0.98    -      -     leal	(%r14,%r14,2), %r14d
 | 
			
		||||
 -      -     0.01   0.01    -      -      -     0.01   0.97    -     movslq	%r14d, %r14
 | 
			
		||||
 -      -      -     0.03    -      -      -     0.97    -      -     leal	(%r15,%r15,2), %r15d
 | 
			
		||||
 -      -     0.04    -      -      -      -      -     0.96    -     movslq	%r15d, %r15
 | 
			
		||||
 -      -      -      -     0.07   0.93    -     1.00    -      -     vmovhpd	16(%r11,%r8,8), %xmm14, %xmm15
 | 
			
		||||
 -      -     0.03   0.46   0.49   0.51    -     0.51    -      -     vinsertf128	$1, (%r11,%r14,8), %ymm7, %ymm1
 | 
			
		||||
 -      -      -      -     0.51   0.49    -      -      -      -     vmovq	16(%r11,%r14,8), %xmm0
 | 
			
		||||
 -      -     0.47   0.02   0.93   0.07    -     0.51    -      -     vinsertf128	$1, (%r11,%r15,8), %ymm6, %ymm6
 | 
			
		||||
 -      -      -      -     0.50   0.50    -     1.00    -      -     vmovhpd	16(%r11,%r15,8), %xmm0, %xmm2
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vunpcklpd	%ymm6, %ymm1, %ymm14
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vunpckhpd	%ymm6, %ymm1, %ymm1
 | 
			
		||||
 -      -     0.01   0.99    -      -      -      -      -      -     vsubpd	%ymm14, %ymm10, %ymm6
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vinsertf128	$1, %xmm2, %ymm15, %ymm7
 | 
			
		||||
 -      -     0.96   0.04    -      -      -      -      -      -     vsubpd	%ymm1, %ymm9, %ymm2
 | 
			
		||||
 -      -     0.49   0.51    -      -      -      -      -      -     vsubpd	%ymm7, %ymm8, %ymm0
 | 
			
		||||
 -      -     0.48   0.52    -      -      -      -      -      -     vmulpd	%ymm2, %ymm2, %ymm14
 | 
			
		||||
 -      -     0.03   0.97    -      -      -      -      -      -     vfmadd231pd	%ymm6, %ymm6, %ymm14
 | 
			
		||||
 -      -     0.94   0.06    -      -      -      -      -      -     vfmadd231pd	%ymm0, %ymm0, %ymm14
 | 
			
		||||
 -      -     0.47   0.53    -      -      -      -      -      -     vcmpltpd	%ymm5, %ymm14, %ymm1
 | 
			
		||||
 -      -     0.96   0.04    -      -      -      -      -      -     vpcmpeqd	%ymm7, %ymm7, %ymm7
 | 
			
		||||
 -      -     1.00    -      -      -      -     1.00    -      -     vptest	%ymm7, %ymm1
 | 
			
		||||
 -     5.00   1.00    -      -      -      -      -      -      -     vdivpd	%ymm14, %ymm4, %ymm7
 | 
			
		||||
 -      -     0.93   0.07   0.49   0.51    -      -      -      -     vmulpd	96(%rsp), %ymm7, %ymm14
 | 
			
		||||
 -      -     0.05   0.95    -      -      -      -      -      -     vmulpd	%ymm14, %ymm7, %ymm14
 | 
			
		||||
 -      -     0.02   0.98    -      -      -      -      -      -     vmulpd	%ymm14, %ymm7, %ymm15
 | 
			
		||||
 -      -     0.98   0.02    -      -      -      -      -      -     vfmsub213pd	%ymm3, %ymm7, %ymm14
 | 
			
		||||
 -      -     0.07   0.93   0.51   0.49    -      -      -      -     vmulpd	64(%rsp), %ymm7, %ymm7
 | 
			
		||||
 -      -     0.01   0.99    -      -      -      -      -      -     vmulpd	%ymm7, %ymm15, %ymm15
 | 
			
		||||
 -      -     0.01   0.99    -      -      -      -      -      -     vmulpd	%ymm14, %ymm15, %ymm7
 | 
			
		||||
 -      -     0.03   0.97    -      -      -      -      -      -     vmulpd	%ymm7, %ymm6, %ymm6
 | 
			
		||||
 -      -     0.97   0.03    -      -      -      -      -      -     vmulpd	%ymm7, %ymm2, %ymm2
 | 
			
		||||
 -      -     0.03   0.90    -      -      -     0.07    -      -     vandpd	%ymm6, %ymm1, %ymm6
 | 
			
		||||
 -      -     0.06   0.94    -      -      -      -      -      -     vaddpd	%ymm6, %ymm13, %ymm13
 | 
			
		||||
 -      -     0.03   0.97    -      -      -      -      -      -     vmulpd	%ymm7, %ymm0, %ymm6
 | 
			
		||||
 -      -     0.46   0.08    -      -      -     0.46    -      -     vandpd	%ymm2, %ymm1, %ymm0
 | 
			
		||||
 -      -     0.47   0.01    -      -      -     0.52    -      -     vandpd	%ymm6, %ymm1, %ymm1
 | 
			
		||||
 -      -     0.48   0.52    -      -      -      -      -      -     vaddpd	%ymm0, %ymm12, %ymm12
 | 
			
		||||
 -      -     0.52   0.48    -      -      -      -      -      -     vaddpd	%ymm1, %ymm11, %ymm11
 | 
			
		||||
 -      -     0.01    -      -      -      -      -     0.99    -     addq	$4, %rdx
 | 
			
		||||
 -      -      -      -      -      -      -     0.02   0.98    -     cmpq	%rsi, %rdx
 | 
			
		||||
 -      -     0.45    -      -      -      -      -     0.55    -     jb	..B1.22
 | 
			
		||||
@@ -1,158 +0,0 @@
 | 
			
		||||
 | 
			
		||||
[0] Code Region
 | 
			
		||||
 | 
			
		||||
Iterations:        100
 | 
			
		||||
Instructions:      5600
 | 
			
		||||
Total Cycles:      2306
 | 
			
		||||
Total uOps:        6300
 | 
			
		||||
 | 
			
		||||
Dispatch Width:    6
 | 
			
		||||
uOps Per Cycle:    2.73
 | 
			
		||||
IPC:               2.43
 | 
			
		||||
Block RThroughput: 10.5
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Instruction Info:
 | 
			
		||||
[1]: #uOps
 | 
			
		||||
[2]: Latency
 | 
			
		||||
[3]: RThroughput
 | 
			
		||||
[4]: MayLoad
 | 
			
		||||
[5]: MayStore
 | 
			
		||||
[6]: HasSideEffects (U)
 | 
			
		||||
 | 
			
		||||
[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 | 
			
		||||
 1      6     0.50    *                   vmovdqu	(%rbx,%rdx,4), %xmm0
 | 
			
		||||
 1      2     1.00                        vmovq	%xmm0, %rcx
 | 
			
		||||
 1      1     0.50                        vpunpckhqdq	%xmm0, %xmm0, %xmm2
 | 
			
		||||
 1      2     1.00                        vmovq	%xmm2, %r15
 | 
			
		||||
 1      1     0.25                        movl	%ecx, %r8d
 | 
			
		||||
 1      1     0.50                        shrq	$32, %rcx
 | 
			
		||||
 1      1     0.50                        leal	(%rcx,%rcx,2), %r14d
 | 
			
		||||
 1      1     0.50                        leal	(%r8,%r8,2), %r8d
 | 
			
		||||
 1      1     0.25                        movslq	%r8d, %rcx
 | 
			
		||||
 1      1     0.25                        movslq	%r14d, %r8
 | 
			
		||||
 1      1     0.25                        movl	%r15d, %r14d
 | 
			
		||||
 1      1     0.50                        shrq	$32, %r15
 | 
			
		||||
 1      6     0.50    *                   vmovups	(%r11,%rcx,8), %xmm7
 | 
			
		||||
 1      6     0.50    *                   vmovups	(%r11,%r8,8), %xmm6
 | 
			
		||||
 1      5     0.50    *                   vmovq	16(%r11,%rcx,8), %xmm14
 | 
			
		||||
 1      1     0.50                        leal	(%r14,%r14,2), %r14d
 | 
			
		||||
 1      1     0.25                        movslq	%r14d, %r14
 | 
			
		||||
 1      1     0.50                        leal	(%r15,%r15,2), %r15d
 | 
			
		||||
 1      1     0.25                        movslq	%r15d, %r15
 | 
			
		||||
 2      6     1.00    *                   vmovhpd	16(%r11,%r8,8), %xmm14, %xmm15
 | 
			
		||||
 2      7     0.50    *                   vinsertf128	$1, (%r11,%r14,8), %ymm7, %ymm1
 | 
			
		||||
 1      5     0.50    *                   vmovq	16(%r11,%r14,8), %xmm0
 | 
			
		||||
 2      7     0.50    *                   vinsertf128	$1, (%r11,%r15,8), %ymm6, %ymm6
 | 
			
		||||
 2      6     1.00    *                   vmovhpd	16(%r11,%r15,8), %xmm0, %xmm2
 | 
			
		||||
 1      1     1.00                        vunpcklpd	%ymm6, %ymm1, %ymm14
 | 
			
		||||
 1      1     1.00                        vunpckhpd	%ymm6, %ymm1, %ymm1
 | 
			
		||||
 1      4     0.50                        vsubpd	%ymm14, %ymm10, %ymm6
 | 
			
		||||
 1      3     1.00                        vinsertf128	$1, %xmm2, %ymm15, %ymm7
 | 
			
		||||
 1      4     0.50                        vsubpd	%ymm1, %ymm9, %ymm2
 | 
			
		||||
 1      4     0.50                        vsubpd	%ymm7, %ymm8, %ymm0
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm2, %ymm2, %ymm14
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%ymm6, %ymm6, %ymm14
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%ymm0, %ymm0, %ymm14
 | 
			
		||||
 1      4     0.50                        vcmpltpd	%ymm5, %ymm14, %ymm1
 | 
			
		||||
 1      1     0.50                        vpcmpeqd	%ymm7, %ymm7, %ymm7
 | 
			
		||||
 2      3     1.00                        vptest	%ymm7, %ymm1
 | 
			
		||||
 1      14    5.00                        vdivpd	%ymm14, %ymm4, %ymm7
 | 
			
		||||
 2      11    0.50    *                   vmulpd	96(%rsp), %ymm7, %ymm14
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm14, %ymm7, %ymm14
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm14, %ymm7, %ymm15
 | 
			
		||||
 1      4     0.50                        vfmsub213pd	%ymm3, %ymm7, %ymm14
 | 
			
		||||
 2      11    0.50    *                   vmulpd	64(%rsp), %ymm7, %ymm7
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm7, %ymm15, %ymm15
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm14, %ymm15, %ymm7
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm7, %ymm6, %ymm6
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm7, %ymm2, %ymm2
 | 
			
		||||
 1      1     0.33                        vandpd	%ymm6, %ymm1, %ymm6
 | 
			
		||||
 1      4     0.50                        vaddpd	%ymm6, %ymm13, %ymm13
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm7, %ymm0, %ymm6
 | 
			
		||||
 1      1     0.33                        vandpd	%ymm2, %ymm1, %ymm0
 | 
			
		||||
 1      1     0.33                        vandpd	%ymm6, %ymm1, %ymm1
 | 
			
		||||
 1      4     0.50                        vaddpd	%ymm0, %ymm12, %ymm12
 | 
			
		||||
 1      4     0.50                        vaddpd	%ymm1, %ymm11, %ymm11
 | 
			
		||||
 1      1     0.25                        addq	$4, %rdx
 | 
			
		||||
 1      1     0.25                        cmpq	%rsi, %rdx
 | 
			
		||||
 1      1     0.50                        jb	..B1.22
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Resources:
 | 
			
		||||
[0]   - ICXDivider
 | 
			
		||||
[1]   - ICXFPDivider
 | 
			
		||||
[2]   - ICXPort0
 | 
			
		||||
[3]   - ICXPort1
 | 
			
		||||
[4]   - ICXPort2
 | 
			
		||||
[5]   - ICXPort3
 | 
			
		||||
[6]   - ICXPort4
 | 
			
		||||
[7]   - ICXPort5
 | 
			
		||||
[8]   - ICXPort6
 | 
			
		||||
[9]   - ICXPort7
 | 
			
		||||
[10]  - ICXPort8
 | 
			
		||||
[11]  - ICXPort9
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Resource pressure per iteration:
 | 
			
		||||
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   
 | 
			
		||||
 -     5.00   15.12  15.03  5.50   5.50    -     13.45  8.40    -      -      -     
 | 
			
		||||
 | 
			
		||||
Resource pressure by instruction:
 | 
			
		||||
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   Instructions:
 | 
			
		||||
 -      -      -      -     0.50   0.50    -      -      -      -      -      -     vmovdqu	(%rbx,%rdx,4), %xmm0
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -      -      -     vmovq	%xmm0, %rcx
 | 
			
		||||
 -      -      -     0.46    -      -      -     0.54    -      -      -      -     vpunpckhqdq	%xmm0, %xmm0, %xmm2
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -      -      -     vmovq	%xmm2, %r15
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -      -      -     movl	%ecx, %r8d
 | 
			
		||||
 -      -     0.96    -      -      -      -      -     0.04    -      -      -     shrq	$32, %rcx
 | 
			
		||||
 -      -      -     0.01    -      -      -     0.99    -      -      -      -     leal	(%rcx,%rcx,2), %r14d
 | 
			
		||||
 -      -      -     0.03    -      -      -     0.97    -      -      -      -     leal	(%r8,%r8,2), %r8d
 | 
			
		||||
 -      -     0.48   0.01    -      -      -      -     0.51    -      -      -     movslq	%r8d, %rcx
 | 
			
		||||
 -      -     0.02   0.02    -      -      -     0.01   0.95    -      -      -     movslq	%r14d, %r8
 | 
			
		||||
 -      -     0.02    -      -      -      -      -     0.98    -      -      -     movl	%r15d, %r14d
 | 
			
		||||
 -      -     0.52    -      -      -      -      -     0.48    -      -      -     shrq	$32, %r15
 | 
			
		||||
 -      -      -      -     0.49   0.51    -      -      -      -      -      -     vmovups	(%r11,%rcx,8), %xmm7
 | 
			
		||||
 -      -      -      -     0.49   0.51    -      -      -      -      -      -     vmovups	(%r11,%r8,8), %xmm6
 | 
			
		||||
 -      -      -      -     0.52   0.48    -      -      -      -      -      -     vmovq	16(%r11,%rcx,8), %xmm14
 | 
			
		||||
 -      -      -     0.47    -      -      -     0.53    -      -      -      -     leal	(%r14,%r14,2), %r14d
 | 
			
		||||
 -      -     0.01   0.01    -      -      -     0.01   0.97    -      -      -     movslq	%r14d, %r14
 | 
			
		||||
 -      -      -     0.04    -      -      -     0.96    -      -      -      -     leal	(%r15,%r15,2), %r15d
 | 
			
		||||
 -      -     0.48    -      -      -      -     0.01   0.51    -      -      -     movslq	%r15d, %r15
 | 
			
		||||
 -      -      -      -     0.51   0.49    -     1.00    -      -      -      -     vmovhpd	16(%r11,%r8,8), %xmm14, %xmm15
 | 
			
		||||
 -      -     0.02   0.01   0.95   0.05    -     0.97    -      -      -      -     vinsertf128	$1, (%r11,%r14,8), %ymm7, %ymm1
 | 
			
		||||
 -      -      -      -     0.05   0.95    -      -      -      -      -      -     vmovq	16(%r11,%r14,8), %xmm0
 | 
			
		||||
 -      -     0.02   0.49   0.49   0.51    -     0.49    -      -      -      -     vinsertf128	$1, (%r11,%r15,8), %ymm6, %ymm6
 | 
			
		||||
 -      -      -      -     0.50   0.50    -     1.00    -      -      -      -     vmovhpd	16(%r11,%r15,8), %xmm0, %xmm2
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -      -      -     vunpcklpd	%ymm6, %ymm1, %ymm14
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -      -      -     vunpckhpd	%ymm6, %ymm1, %ymm1
 | 
			
		||||
 -      -     0.47   0.53    -      -      -      -      -      -      -      -     vsubpd	%ymm14, %ymm10, %ymm6
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -      -      -     vinsertf128	$1, %xmm2, %ymm15, %ymm7
 | 
			
		||||
 -      -     0.50   0.50    -      -      -      -      -      -      -      -     vsubpd	%ymm1, %ymm9, %ymm2
 | 
			
		||||
 -      -     0.94   0.06    -      -      -      -      -      -      -      -     vsubpd	%ymm7, %ymm8, %ymm0
 | 
			
		||||
 -      -     0.06   0.94    -      -      -      -      -      -      -      -     vmulpd	%ymm2, %ymm2, %ymm14
 | 
			
		||||
 -      -     0.04   0.96    -      -      -      -      -      -      -      -     vfmadd231pd	%ymm6, %ymm6, %ymm14
 | 
			
		||||
 -      -     0.95   0.05    -      -      -      -      -      -      -      -     vfmadd231pd	%ymm0, %ymm0, %ymm14
 | 
			
		||||
 -      -     0.02   0.98    -      -      -      -      -      -      -      -     vcmpltpd	%ymm5, %ymm14, %ymm1
 | 
			
		||||
 -      -     0.05   0.95    -      -      -      -      -      -      -      -     vpcmpeqd	%ymm7, %ymm7, %ymm7
 | 
			
		||||
 -      -     1.00    -      -      -      -     1.00    -      -      -      -     vptest	%ymm7, %ymm1
 | 
			
		||||
 -     5.00   1.00    -      -      -      -      -      -      -      -      -     vdivpd	%ymm14, %ymm4, %ymm7
 | 
			
		||||
 -      -     0.51   0.49   0.49   0.51    -      -      -      -      -      -     vmulpd	96(%rsp), %ymm7, %ymm14
 | 
			
		||||
 -      -     0.04   0.96    -      -      -      -      -      -      -      -     vmulpd	%ymm14, %ymm7, %ymm14
 | 
			
		||||
 -      -     0.01   0.99    -      -      -      -      -      -      -      -     vmulpd	%ymm14, %ymm7, %ymm15
 | 
			
		||||
 -      -     0.99   0.01    -      -      -      -      -      -      -      -     vfmsub213pd	%ymm3, %ymm7, %ymm14
 | 
			
		||||
 -      -     0.49   0.51   0.51   0.49    -      -      -      -      -      -     vmulpd	64(%rsp), %ymm7, %ymm7
 | 
			
		||||
 -      -     0.01   0.99    -      -      -      -      -      -      -      -     vmulpd	%ymm7, %ymm15, %ymm15
 | 
			
		||||
 -      -     0.01   0.99    -      -      -      -      -      -      -      -     vmulpd	%ymm14, %ymm15, %ymm7
 | 
			
		||||
 -      -     0.48   0.52    -      -      -      -      -      -      -      -     vmulpd	%ymm7, %ymm6, %ymm6
 | 
			
		||||
 -      -     0.52   0.48    -      -      -      -      -      -      -      -     vmulpd	%ymm7, %ymm2, %ymm2
 | 
			
		||||
 -      -     0.46   0.02    -      -      -     0.52    -      -      -      -     vandpd	%ymm6, %ymm1, %ymm6
 | 
			
		||||
 -      -     0.49   0.51    -      -      -      -      -      -      -      -     vaddpd	%ymm6, %ymm13, %ymm13
 | 
			
		||||
 -      -     0.48   0.52    -      -      -      -      -      -      -      -     vmulpd	%ymm7, %ymm0, %ymm6
 | 
			
		||||
 -      -     0.02   0.52    -      -      -     0.46    -      -      -      -     vandpd	%ymm2, %ymm1, %ymm0
 | 
			
		||||
 -      -     0.02    -      -      -      -     0.98    -      -      -      -     vandpd	%ymm6, %ymm1, %ymm1
 | 
			
		||||
 -      -     0.49   0.51    -      -      -      -      -      -      -      -     vaddpd	%ymm0, %ymm12, %ymm12
 | 
			
		||||
 -      -     0.51   0.49    -      -      -      -      -      -      -      -     vaddpd	%ymm1, %ymm11, %ymm11
 | 
			
		||||
 -      -     0.01    -      -      -      -      -     0.99    -      -      -     addq	$4, %rdx
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.01   0.98    -      -      -     cmpq	%rsi, %rdx
 | 
			
		||||
 -      -     0.01    -      -      -      -      -     0.99    -      -      -     jb	..B1.22
 | 
			
		||||
@@ -1,97 +0,0 @@
 | 
			
		||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
 | 
			
		||||
Analyzed file:      lammps-icc-avx2.s
 | 
			
		||||
Architecture:       CSX
 | 
			
		||||
Timestamp:          2023-02-10 16:29:58
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
 | 
			
		||||
 * - Instruction micro-ops not bound to a port
 | 
			
		||||
 X - No throughput/latency information for this instruction in data file
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Combined Analysis Report
 | 
			
		||||
------------------------
 | 
			
		||||
                                       Port pressure in cycles                                       
 | 
			
		||||
     |  0   - 0DV  |   1   |  2   -  2D  |  3   -  3D  |  4   |   5   |  6   |  7   ||  CP  | LCD  |
 | 
			
		||||
----------------------------------------------------------------------------------------------------
 | 
			
		||||
 256 |             |       |             |             |      |       |      |      ||      |      |   # pointer_increment=32 724d27eafcb27eabca1528ddfdbdba3e
 | 
			
		||||
 257 |             |       |             |             |      |       |      |      ||      |      |   # LLVM-MCA-BEGIN
 | 
			
		||||
 258 |             |       |             |             |      |       |      |      ||      |      |   ..B1.22:                        # Preds ..B1.24 ..B1.21
 | 
			
		||||
 259 |             |       |             |             |      |       |      |      ||      |      |   # Execution count [2.50e+01]
 | 
			
		||||
 260 |             |       | 0.50   0.50 | 0.50   0.50 |      |       |      |      ||  4.0 |      |   vmovdqu   (%rbx,%rdx,4), %xmm0                          #60.21
 | 
			
		||||
 261 | 1.00        |       |             |             |      |       |      |      ||  1.0 |      |   vmovq     %xmm0, %rcx                                   #60.21
 | 
			
		||||
 262 |             |       |             |             |      | 1.000 |      |      ||      |      |   vpunpckhqdq %xmm0, %xmm0, %xmm2                         #60.21
 | 
			
		||||
 263 | 1.00        |       |             |             |      |       |      |      ||      |      |   vmovq     %xmm2, %r15                                   #60.21
 | 
			
		||||
 264 | 0.00        | 0.000 |             |             |      | 0.000 | 1.00 |      ||  1.0 |      |   movl      %ecx, %r8d                                    #60.21
 | 
			
		||||
 265 | 0.00        |       |             |             |      |       | 1.00 |      ||      |      |   shrq      $32, %rcx                                     #60.21
 | 
			
		||||
 266 |             | 0.500 |             |             |      | 0.500 |      |      ||      |      |   lea       (%rcx,%rcx,2), %r14d                          #61.36
 | 
			
		||||
 267 |             | 0.500 |             |             |      | 0.500 |      |      ||  1.0 |      |   lea       (%r8,%r8,2), %r8d                             #61.36
 | 
			
		||||
 268 | 0.00        | 0.000 |             |             |      | 0.000 | 1.00 |      ||  1.0 |      |   movslq    %r8d, %rcx                                    #61.36
 | 
			
		||||
 269 | 0.00        | 0.000 |             |             |      | 0.000 | 1.00 |      ||      |      |   movslq    %r14d, %r8                                    #61.36
 | 
			
		||||
 270 | 0.00        | 0.000 |             |             |      | 0.000 | 1.00 |      ||      |      |   movl      %r15d, %r14d                                  #60.21
 | 
			
		||||
 271 | 0.00        |       |             |             |      |       | 1.00 |      ||      |      |   shrq      $32, %r15                                     #60.21
 | 
			
		||||
 272 |             |       | 0.50   0.50 | 0.50   0.50 |      |       |      |      ||  4.0 |      |   vmovups   (%r11,%rcx,8), %xmm7                          #61.36
 | 
			
		||||
 273 |             |       | 0.50   0.50 | 0.50   0.50 |      |       |      |      ||      |      |   vmovups   (%r11,%r8,8), %xmm6                           #61.36
 | 
			
		||||
 274 |             |       | 0.50   0.50 | 0.50   0.50 |      |       |      |      ||      |      |   vmovq     16(%r11,%rcx,8), %xmm14                       #61.36
 | 
			
		||||
 275 |             | 0.500 |             |             |      | 0.500 |      |      ||      |      |   lea       (%r14,%r14,2), %r14d                          #61.36
 | 
			
		||||
 276 | 0.00        | 0.000 |             |             |      | 0.000 | 1.00 |      ||      |      |   movslq    %r14d, %r14                                   #61.36
 | 
			
		||||
 277 |             | 0.500 |             |             |      | 0.500 |      |      ||      |      |   lea       (%r15,%r15,2), %r15d                          #61.36
 | 
			
		||||
 278 | 0.00        | 0.000 |             |             |      | 0.000 | 1.00 |      ||      |      |   movslq    %r15d, %r15                                   #61.36
 | 
			
		||||
 279 |             |       | 0.50   0.50 | 0.50   0.50 |      | 1.000 |      |      ||      |      |   vmovhpd   16(%r11,%r8,8), %xmm14, %xmm15                #61.36
 | 
			
		||||
 280 |             |       | 0.50   0.50 | 0.50   0.50 |      | 1.000 |      |      ||  3.0 |      |   vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1             #61.36
 | 
			
		||||
 281 |             |       | 0.50   0.50 | 0.50   0.50 |      |       |      |      ||      |      |   vmovq     16(%r11,%r14,8), %xmm0                        #61.36
 | 
			
		||||
 282 |             |       | 0.50   0.50 | 0.50   0.50 |      | 1.000 |      |      ||      |      |   vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6             #61.36
 | 
			
		||||
 283 |             |       | 0.50   0.50 | 0.50   0.50 |      | 1.000 |      |      ||      |      |   vmovhpd   16(%r11,%r15,8), %xmm0, %xmm2                 #61.36
 | 
			
		||||
 284 |             |       |             |             |      | 1.000 |      |      ||      |      |   vunpcklpd %ymm6, %ymm1, %ymm14                          #61.36
 | 
			
		||||
 285 |             |       |             |             |      | 1.000 |      |      ||  1.0 |      |   vunpckhpd %ymm6, %ymm1, %ymm1                           #61.36
 | 
			
		||||
 286 | 0.50        | 0.500 |             |             |      |       |      |      ||      |      |   vsubpd    %ymm14, %ymm10, %ymm6                         #61.36
 | 
			
		||||
 287 |             |       |             |             |      | 1.000 |      |      ||      |      |   vinsertf128 $1, %xmm2, %ymm15, %ymm7                    #61.36
 | 
			
		||||
 288 | 0.50        | 0.500 |             |             |      |       |      |      ||  4.0 |      |   vsubpd    %ymm1, %ymm9, %ymm2                           #62.36
 | 
			
		||||
 289 | 0.50        | 0.500 |             |             |      |       |      |      ||      |      |   vsubpd    %ymm7, %ymm8, %ymm0                           #63.36
 | 
			
		||||
 290 | 0.50        | 0.500 |             |             |      |       |      |      ||  4.0 |      |   vmulpd    %ymm2, %ymm2, %ymm14                          #64.49
 | 
			
		||||
 291 | 0.50        | 0.500 |             |             |      |       |      |      ||  4.0 |      |   vfmadd231pd %ymm6, %ymm6, %ymm14                        #64.49
 | 
			
		||||
 292 | 0.50        | 0.500 |             |             |      |       |      |      ||  4.0 |      |   vfmadd231pd %ymm0, %ymm0, %ymm14                        #64.63
 | 
			
		||||
 293 |             |       |             |             |      | 1.000 |      |      ||      |      |   vcmpltpd  %ymm5, %ymm14, %ymm1                          #74.22
 | 
			
		||||
 294 | 0.50        | 0.500 |             |             |      |       |      |      ||      |      |   vpcmpeqd  %ymm7, %ymm7, %ymm7                           #74.22
 | 
			
		||||
 295 | 1.00        |       |             |             |      | 1.000 |      |      ||      |      |   vptest    %ymm7, %ymm1                                  #74.22
 | 
			
		||||
 296 |             |       |             |             |      |       |      |      ||      |      |   #je        ..B1.24       # Prob 50%                      #74.22
 | 
			
		||||
 297 |             |       |             |             |      |       |      |      ||      |      |   # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ymm14
 | 
			
		||||
 298 |             |       |             |             |      |       |      |      ||      |      |   ..B1.23:                        # Preds ..B1.22
 | 
			
		||||
 299 |             |       |             |             |      |       |      |      ||      |      |   # Execution count [1.25e+01]
 | 
			
		||||
 300 | 1.00   8.00 |       |             |             |      |       |      |      || 15.0 |      |   vdivpd    %ymm14, %ymm4, %ymm7                          #75.39
 | 
			
		||||
 301 | 0.50        | 0.500 | 0.50   0.50 | 0.50   0.50 |      |       |      |      ||  4.0 |      |   vmulpd    96(%rsp), %ymm7, %ymm14                       #76.38[spill]
 | 
			
		||||
 302 | 0.50        | 0.500 |             |             |      |       |      |      ||  4.0 |      |   vmulpd    %ymm14, %ymm7, %ymm14                         #76.44
 | 
			
		||||
 303 | 0.50        | 0.500 |             |             |      |       |      |      ||  4.0 |      |   vmulpd    %ymm14, %ymm7, %ymm15                         #76.50
 | 
			
		||||
 304 | 0.50        | 0.500 |             |             |      |       |      |      ||      |      |   vfmsub213pd %ymm3, %ymm7, %ymm14                        #77.55
 | 
			
		||||
 305 | 0.50        | 0.500 | 0.50   0.50 | 0.50   0.50 |      |       |      |      ||      |      |   vmulpd    64(%rsp), %ymm7, %ymm7                        #77.55[spill]
 | 
			
		||||
 306 | 0.50        | 0.500 |             |             |      |       |      |      ||  4.0 |      |   vmulpd    %ymm7, %ymm15, %ymm15                         #77.64
 | 
			
		||||
 307 | 0.50        | 0.500 |             |             |      |       |      |      ||  4.0 |      |   vmulpd    %ymm14, %ymm15, %ymm7                         #77.70
 | 
			
		||||
 308 | 0.50        | 0.500 |             |             |      |       |      |      ||  4.0 |      |   vmulpd    %ymm7, %ymm6, %ymm6                           #78.31
 | 
			
		||||
 309 | 0.50        | 0.500 |             |             |      |       |      |      ||      |      |   vmulpd    %ymm7, %ymm2, %ymm2                           #79.31
 | 
			
		||||
 310 | 0.25        | 0.253 |             |             |      | 0.493 |      |      ||  1.0 |      |   vandpd    %ymm6, %ymm1, %ymm6                           #78.31
 | 
			
		||||
 311 | 0.50        | 0.500 |             |             |      |       |      |      ||  4.0 |      |   vaddpd    %ymm6, %ymm13, %ymm13                         #78.17
 | 
			
		||||
 312 | 0.25        | 0.750 |             |             |      |       |      |      ||      |      |   vmulpd    %ymm7, %ymm0, %ymm6                           #80.31
 | 
			
		||||
 313 | 0.16        | 0.417 |             |             |      | 0.423 |      |      ||      |      |   vandpd    %ymm2, %ymm1, %ymm0                           #79.31
 | 
			
		||||
 314 | 0.00        | 0.250 |             |             |      | 0.750 |      |      ||      |      |   vandpd    %ymm6, %ymm1, %ymm1                           #80.31
 | 
			
		||||
 315 | 0.00        | 1.000 |             |             |      |       |      |      ||      |      |   vaddpd    %ymm0, %ymm12, %ymm12                         #79.17
 | 
			
		||||
 316 | 0.50        | 0.500 |             |             |      |       |      |      ||      |  4.0 |   vaddpd    %ymm1, %ymm11, %ymm11                         #80.17
 | 
			
		||||
 317 |             |       |             |             |      |       |      |      ||      |      |   # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm3 ymm4 ymm5 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13
 | 
			
		||||
 318 |             |       |             |             |      |       |      |      ||      |      |   ..B1.24:                        # Preds ..B1.23 ..B1.22
 | 
			
		||||
 319 |             |       |             |             |      |       |      |      ||      |      |   # Execution count [2.50e+01]
 | 
			
		||||
 320 | 0.00        | 0.000 |             |             |      | -0.01 | 1.00 |      ||      |      |   addq      $4, %rdx                                      #59.9
 | 
			
		||||
 321 | 0.00        | -0.01 |             |             |      | 0.000 | 1.00 |      ||      |      |   cmpq      %rsi, %rdx                                    #59.9
 | 
			
		||||
 322 |             |       |             |             |      |       |      |      ||      |      | * jb        ..B1.22       # Prob 82%                      #59.9
 | 
			
		||||
 323 |             |       |             |             |      |       |      |      ||      |      |   # LLVM-MCA-END
 | 
			
		||||
 | 
			
		||||
       13.7   8.00   13.66   5.50   5.50   5.50   5.50          13.66   10.0           76.0    4.0  
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Loop-Carried Dependencies Analysis Report
 | 
			
		||||
-----------------------------------------
 | 
			
		||||
 316 |  4.0 | vaddpd    %ymm1, %ymm11, %ymm11                         #80.17| [316]
 | 
			
		||||
 315 |  4.0 | vaddpd    %ymm0, %ymm12, %ymm12                         #79.17| [315]
 | 
			
		||||
 311 |  4.0 | vaddpd    %ymm6, %ymm13, %ymm13                         #78.17| [311]
 | 
			
		||||
 320 |  1.0 | addq      $4, %rdx                                      #59.9| [320]
 | 
			
		||||
 | 
			
		||||
@@ -1,97 +0,0 @@
 | 
			
		||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
 | 
			
		||||
Analyzed file:      lammps-icc-avx2.s
 | 
			
		||||
Architecture:       ICX
 | 
			
		||||
Timestamp:          2023-02-10 16:29:48
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
 | 
			
		||||
 * - Instruction micro-ops not bound to a port
 | 
			
		||||
 X - No throughput/latency information for this instruction in data file
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Combined Analysis Report
 | 
			
		||||
------------------------
 | 
			
		||||
                                                Port pressure in cycles                                                 
 | 
			
		||||
     |  0   - 0DV  |  1   - 1DV  |  2   -  2D  |  3   -  3D  |  4   |  5   |  6   |  7   |  8   |  9   ||  CP  | LCD  |
 | 
			
		||||
-----------------------------------------------------------------------------------------------------------------------
 | 
			
		||||
 256 |             |             |             |             |      |      |      |      |      |      ||      |      |   # pointer_increment=32 724d27eafcb27eabca1528ddfdbdba3e
 | 
			
		||||
 257 |             |             |             |             |      |      |      |      |      |      ||      |      |   # LLVM-MCA-BEGIN
 | 
			
		||||
 258 |             |             |             |             |      |      |      |      |      |      ||      |      |   ..B1.22:                        # Preds ..B1.24 ..B1.21
 | 
			
		||||
 259 |             |             |             |             |      |      |      |      |      |      ||      |      |   # Execution count [2.50e+01]
 | 
			
		||||
 260 |             |             | 0.50   0.50 | 0.50   0.50 |      |      |      |      |      |      ||  5.0 |      |   vmovdqu   (%rbx,%rdx,4), %xmm0                          #60.21
 | 
			
		||||
 261 | 1.00        |             |             |             |      |      |      |      |      |      ||  1.0 |      |   vmovq     %xmm0, %rcx                                   #60.21
 | 
			
		||||
 262 |             | 0.50        |             |             |      | 0.50 |      |      |      |      ||      |      |   vpunpckhqdq %xmm0, %xmm0, %xmm2                         #60.21
 | 
			
		||||
 263 | 1.00        |             |             |             |      |      |      |      |      |      ||      |      |   vmovq     %xmm2, %r15                                   #60.21
 | 
			
		||||
 264 | 0.37        | 0.00        |             |             |      | 0.25 | 0.38 |      |      |      ||  1.0 |      |   movl      %ecx, %r8d                                    #60.21
 | 
			
		||||
 265 | 0.50        |             |             |             |      |      | 0.50 |      |      |      ||      |      |   shrq      $32, %rcx                                     #60.21
 | 
			
		||||
 266 | 0.13        | 0.00        |             |             |      | 0.00 | 0.87 |      |      |      ||      |      |   lea       (%rcx,%rcx,2), %r14d                          #61.36
 | 
			
		||||
 267 | 0.00        | 0.00        |             |             |      | 0.00 | 1.00 |      |      |      ||  6.0 |      |   lea       (%r8,%r8,2), %r8d                             #61.36
 | 
			
		||||
 268 | 0.00        | 0.00        |             |             |      | 0.00 | 1.00 |      |      |      ||  1.0 |      |   movslq    %r8d, %rcx                                    #61.36
 | 
			
		||||
 269 | 0.00        | 0.00        |             |             |      | 0.00 | 1.00 |      |      |      ||      |      |   movslq    %r14d, %r8                                    #61.36
 | 
			
		||||
 270 | 0.00        | 0.00        |             |             |      | 0.00 | 1.00 |      |      |      ||      |      |   movl      %r15d, %r14d                                  #60.21
 | 
			
		||||
 271 | 0.00        |             |             |             |      |      | 1.00 |      |      |      ||      |      |   shrq      $32, %r15                                     #60.21
 | 
			
		||||
 272 |             |             | 0.50   0.50 | 0.50   0.50 |      |      |      |      |      |      ||  5.0 |      |   vmovups   (%r11,%rcx,8), %xmm7                          #61.36
 | 
			
		||||
 273 |             |             | 0.50   0.50 | 0.50   0.50 |      |      |      |      |      |      ||      |      |   vmovups   (%r11,%r8,8), %xmm6                           #61.36
 | 
			
		||||
 274 |             |             | 0.50   0.50 | 0.50   0.50 |      |      |      |      |      |      ||      |      |   vmovq     16(%r11,%rcx,8), %xmm14                       #61.36
 | 
			
		||||
 275 | 0.00        | 0.00        |             |             |      | 0.00 | 1.00 |      |      |      ||      |      |   lea       (%r14,%r14,2), %r14d                          #61.36
 | 
			
		||||
 276 | 0.00        | 0.00        |             |             |      | 0.00 | 1.00 |      |      |      ||      |      |   movslq    %r14d, %r14                                   #61.36
 | 
			
		||||
 277 | 0.00        | 0.00        |             |             |      | 0.00 | 1.00 |      |      |      ||      |      |   lea       (%r15,%r15,2), %r15d                          #61.36
 | 
			
		||||
 278 | 0.00        | 0.00        |             |             |      | 0.00 | 1.00 |      |      |      ||      |      |   movslq    %r15d, %r15                                   #61.36
 | 
			
		||||
 279 |             |             | 0.50   0.50 | 0.50   0.50 |      | 1.00 |      |      |      |      ||      |      |   vmovhpd   16(%r11,%r8,8), %xmm14, %xmm15                #61.36
 | 
			
		||||
 280 |             |             | 0.50   0.50 | 0.50   0.50 |      | 1.00 |      |      |      |      ||  3.0 |      |   vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1             #61.36
 | 
			
		||||
 281 |             |             | 0.50   0.50 | 0.50   0.50 |      |      |      |      |      |      ||      |      |   vmovq     16(%r11,%r14,8), %xmm0                        #61.36
 | 
			
		||||
 282 |             |             | 0.50   0.50 | 0.50   0.50 |      | 1.00 |      |      |      |      ||      |      |   vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6             #61.36
 | 
			
		||||
 283 |             |             | 0.50   0.50 | 0.50   0.50 |      | 1.00 |      |      |      |      ||      |      |   vmovhpd   16(%r11,%r15,8), %xmm0, %xmm2                 #61.36
 | 
			
		||||
 284 |             |             |             |             |      | 1.00 |      |      |      |      ||      |      |   vunpcklpd %ymm6, %ymm1, %ymm14                          #61.36
 | 
			
		||||
 285 |             |             |             |             |      | 1.00 |      |      |      |      ||  1.0 |      |   vunpckhpd %ymm6, %ymm1, %ymm1                           #61.36
 | 
			
		||||
 286 | 0.50        | 0.50        |             |             |      |      |      |      |      |      ||      |      |   vsubpd    %ymm14, %ymm10, %ymm6                         #61.36
 | 
			
		||||
 287 |             |             |             |             |      | 1.00 |      |      |      |      ||      |      |   vinsertf128 $1, %xmm2, %ymm15, %ymm7                    #61.36
 | 
			
		||||
 288 | 0.50        | 0.50        |             |             |      |      |      |      |      |      ||  4.0 |      |   vsubpd    %ymm1, %ymm9, %ymm2                           #62.36
 | 
			
		||||
 289 | 0.50        | 0.50        |             |             |      |      |      |      |      |      ||      |      |   vsubpd    %ymm7, %ymm8, %ymm0                           #63.36
 | 
			
		||||
 290 | 0.50        | 0.50        |             |             |      |      |      |      |      |      ||  4.0 |      |   vmulpd    %ymm2, %ymm2, %ymm14                          #64.49
 | 
			
		||||
 291 | 0.50        | 0.50        |             |             |      |      |      |      |      |      ||  4.0 |      |   vfmadd231pd %ymm6, %ymm6, %ymm14                        #64.49
 | 
			
		||||
 292 | 0.75        | 0.25        |             |             |      |      |      |      |      |      ||  4.0 |      |   vfmadd231pd %ymm0, %ymm0, %ymm14                        #64.63
 | 
			
		||||
 293 | 0.00        |             |             |             |      | 1.00 |      |      |      |      ||      |      |   vcmpltpd  %ymm5, %ymm14, %ymm1                          #74.22
 | 
			
		||||
 294 | 0.50        | 0.50        |             |             |      |      |      |      |      |      ||      |      |   vpcmpeqd  %ymm7, %ymm7, %ymm7                           #74.22
 | 
			
		||||
 295 | 1.00        |             |             |             |      | 1.00 |      |      |      |      ||      |      |   vptest    %ymm7, %ymm1                                  #74.22
 | 
			
		||||
 296 |             |             |             |             |      |      |      |      |      |      ||      |      |   #je        ..B1.24       # Prob 50%                      #74.22
 | 
			
		||||
 297 |             |             |             |             |      |      |      |      |      |      ||      |      |   # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ymm14
 | 
			
		||||
 298 |             |             |             |             |      |      |      |      |      |      ||      |      |   ..B1.23:                        # Preds ..B1.22
 | 
			
		||||
 299 |             |             |             |             |      |      |      |      |      |      ||      |      |   # Execution count [1.25e+01]
 | 
			
		||||
 300 | 1.00   8.00 |             |             |             |      |      |      |      |      |      || 13.0 |      |   vdivpd    %ymm14, %ymm4, %ymm7                          #75.39
 | 
			
		||||
 301 | 0.50        | 0.50        | 0.50   0.50 | 0.50   0.50 |      |      |      |      |      |      ||  4.0 |      |   vmulpd    96(%rsp), %ymm7, %ymm14                       #76.38[spill]
 | 
			
		||||
 302 | 0.50        | 0.50        |             |             |      |      |      |      |      |      ||  4.0 |      |   vmulpd    %ymm14, %ymm7, %ymm14                         #76.44
 | 
			
		||||
 303 | 0.50        | 0.50        |             |             |      |      |      |      |      |      ||  4.0 |      |   vmulpd    %ymm14, %ymm7, %ymm15                         #76.50
 | 
			
		||||
 304 | 0.50        | 0.50        |             |             |      |      |      |      |      |      ||      |      |   vfmsub213pd %ymm3, %ymm7, %ymm14                        #77.55
 | 
			
		||||
 305 | 0.50        | 0.50        | 0.50   0.50 | 0.50   0.50 |      |      |      |      |      |      ||      |      |   vmulpd    64(%rsp), %ymm7, %ymm7                        #77.55[spill]
 | 
			
		||||
 306 | 0.50        | 0.50        |             |             |      |      |      |      |      |      ||  4.0 |      |   vmulpd    %ymm7, %ymm15, %ymm15                         #77.64
 | 
			
		||||
 307 | 0.50        | 0.50        |             |             |      |      |      |      |      |      ||  4.0 |      |   vmulpd    %ymm14, %ymm15, %ymm7                         #77.70
 | 
			
		||||
 308 | 0.50        | 0.50        |             |             |      |      |      |      |      |      ||  4.0 |      |   vmulpd    %ymm7, %ymm6, %ymm6                           #78.31
 | 
			
		||||
 309 | 0.00        | 1.00        |             |             |      |      |      |      |      |      ||      |      |   vmulpd    %ymm7, %ymm2, %ymm2                           #79.31
 | 
			
		||||
 310 | 0.00        | 0.00        |             |             |      | 1.00 |      |      |      |      ||  1.0 |      |   vandpd    %ymm6, %ymm1, %ymm6                           #78.31
 | 
			
		||||
 311 | 0.00        | 1.00        |             |             |      |      |      |      |      |      ||  4.0 |      |   vaddpd    %ymm6, %ymm13, %ymm13                         #78.17
 | 
			
		||||
 312 | 0.00        | 1.00        |             |             |      |      |      |      |      |      ||      |      |   vmulpd    %ymm7, %ymm0, %ymm6                           #80.31
 | 
			
		||||
 313 | 0.00        | 0.00        |             |             |      | 1.00 |      |      |      |      ||      |      |   vandpd    %ymm2, %ymm1, %ymm0                           #79.31
 | 
			
		||||
 314 | 0.00        | 0.00        |             |             |      | 1.00 |      |      |      |      ||      |      |   vandpd    %ymm6, %ymm1, %ymm1                           #80.31
 | 
			
		||||
 315 | 0.00        | 1.00        |             |             |      |      |      |      |      |      ||      |      |   vaddpd    %ymm0, %ymm12, %ymm12                         #79.17
 | 
			
		||||
 316 | 0.00        | 1.00        |             |             |      |      |      |      |      |      ||      |  4.0 |   vaddpd    %ymm1, %ymm11, %ymm11                         #80.17
 | 
			
		||||
 317 |             |             |             |             |      |      |      |      |      |      ||      |      |   # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm3 ymm4 ymm5 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13
 | 
			
		||||
 318 |             |             |             |             |      |      |      |      |      |      ||      |      |   ..B1.24:                        # Preds ..B1.23 ..B1.22
 | 
			
		||||
 319 |             |             |             |             |      |      |      |      |      |      ||      |      |   # Execution count [2.50e+01]
 | 
			
		||||
 320 | 0.00        | 0.00        |             |             |      | 0.00 | 1.00 |      |      |      ||      |      |   addq      $4, %rdx                                      #59.9
 | 
			
		||||
 321 | 0.00        | 0.00        |             |             |      | 0.00 | 1.00 |      |      |      ||      |      |   cmpq      %rsi, %rdx                                    #59.9
 | 
			
		||||
 322 |             |             |             |             |      |      |      |      |      |      ||      |      | * jb        ..B1.22       # Prob 82%                      #59.9
 | 
			
		||||
 323 |             |             |             |             |      |      |      |      |      |      ||      |      |   # LLVM-MCA-END
 | 
			
		||||
 | 
			
		||||
       12.8   8.00   12.8          5.50   5.50   5.50   5.50          12.8   12.8                           81    4.0  
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Loop-Carried Dependencies Analysis Report
 | 
			
		||||
-----------------------------------------
 | 
			
		||||
 316 |  4.0 | vaddpd    %ymm1, %ymm11, %ymm11                         #80.17| [316]
 | 
			
		||||
 315 |  4.0 | vaddpd    %ymm0, %ymm12, %ymm12                         #79.17| [315]
 | 
			
		||||
 311 |  4.0 | vaddpd    %ymm6, %ymm13, %ymm13                         #78.17| [311]
 | 
			
		||||
 320 |  1.0 | addq      $4, %rdx                                      #59.9| [320]
 | 
			
		||||
 | 
			
		||||
@@ -1,75 +0,0 @@
 | 
			
		||||
Intel(R) Architecture Code Analyzer Version -  v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
 | 
			
		||||
Analyzed File -  lammps-icc-avx512.o
 | 
			
		||||
Binary Format - 64Bit
 | 
			
		||||
Architecture  -  SKX
 | 
			
		||||
Analysis Type - Throughput
 | 
			
		||||
 | 
			
		||||
Throughput Analysis Report
 | 
			
		||||
--------------------------
 | 
			
		||||
Block Throughput: 30.89 Cycles       Throughput Bottleneck: Backend
 | 
			
		||||
Loop Count:  22
 | 
			
		||||
Port Binding In Cycles Per Iteration:
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
|  Port  |   0   -  DV   |   1   |   2   -  D    |   3   -  D    |   4   |   5   |   6   |   7   |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
| Cycles | 19.0     0.0  |  4.0  | 13.0    13.0  | 13.0    13.0  |  0.0  | 17.0  |  4.0  |  0.0  |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
DV - Divider pipe (on port 0)
 | 
			
		||||
D - Data fetch pipe (on ports 2 and 3)
 | 
			
		||||
F - Macro Fusion with the previous instruction occurred
 | 
			
		||||
* - instruction micro-ops not bound to a port
 | 
			
		||||
^ - Micro Fusion occurred
 | 
			
		||||
# - ESP Tracking sync uop was issued
 | 
			
		||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
 | 
			
		||||
X - instruction not supported, was not accounted in Analysis
 | 
			
		||||
 | 
			
		||||
| Num Of   |                    Ports pressure in cycles                         |      |
 | 
			
		||||
|  Uops    |  0  - DV    |  1   |  2  -  D    |  3  -  D    |  4   |  5   |  6   |  7   |
 | 
			
		||||
-----------------------------------------------------------------------------------------
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vpcmpgtd k5, ymm3, ymm4
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | vpaddd ymm4, ymm4, ymm15
 | 
			
		||||
|   2      |             | 1.0  | 1.0     1.0 |             |      |      |      |      | vmovdqu32 ymm17{k5}{z}, ymmword ptr [r10+r15*4]
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | vpaddd ymm18, ymm17, ymm17
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | add r15, 0x8
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | vpaddd ymm19, ymm17, ymm18
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kmovw k2, k5
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kmovw k3, k5
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kmovw k1, k5
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | vpxord zmm21, zmm21, zmm21
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | vpxord zmm20, zmm20, zmm20
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | vpxord zmm22, zmm22, zmm22
 | 
			
		||||
|   5^     | 1.0         |      | 4.0     4.0 | 4.0     4.0 |      | 1.0  | 1.0  |      | vgatherdpd zmm21, k2, zmmword ptr [rbx+ymm19*8+0x8]
 | 
			
		||||
|   5^     | 1.0         |      | 4.0     4.0 | 4.0     4.0 |      | 1.0  | 1.0  |      | vgatherdpd zmm20, k3, zmmword ptr [rbx+ymm19*8]
 | 
			
		||||
|   5^     | 1.0         |      | 4.0     4.0 | 4.0     4.0 |      | 1.0  | 1.0  |      | vgatherdpd zmm22, k1, zmmword ptr [rbx+ymm19*8+0x10]
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vsubpd zmm18, zmm1, zmm21
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vsubpd zmm17, zmm2, zmm20
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vsubpd zmm19, zmm0, zmm22
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulpd zmm31, zmm18, zmm18
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231pd zmm31, zmm17, zmm17
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231pd zmm31, zmm19, zmm19
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14pd zmm30, zmm31
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmppd k6{k5}, zmm31, zmm14, 0x1
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfpclasspd k0, zmm30, 0x1e
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | vmovaps zmm23, zmm31
 | 
			
		||||
|   2^     | 1.0         |      |             | 1.0     1.0 |      |      |      |      | vfnmadd213pd zmm23, zmm30, qword ptr [rip]{1to8}
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | knotw k4, k0
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm24, zmm23, zmm23
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd213pd zmm30{k4}, zmm23, zmm30
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd213pd zmm30{k4}, zmm24, zmm30
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulpd zmm25, zmm30, zmm13
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulpd zmm27, zmm30, zmm12
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulpd zmm28, zmm30, zmm25
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulpd zmm26, zmm30, zmm28
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmsub213pd zmm30, zmm28, zmm5
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulpd zmm29, zmm26, zmm27
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulpd zmm23, zmm29, zmm30
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231pd zmm10{k6}, zmm23, zmm17
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231pd zmm9{k6}, zmm23, zmm18
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231pd zmm8{k6}, zmm23, zmm19
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | cmp r15, r14
 | 
			
		||||
|   0*F    |             |      |             |             |      |      |      |      | jb 0xffffffffffffff0c
 | 
			
		||||
Total Num Of Uops: 57
 | 
			
		||||
Analysis Notes:
 | 
			
		||||
Backend allocation was stalled due to unavailable allocation resources.
 | 
			
		||||
There were bubbles in the frontend.
 | 
			
		||||
@@ -1,128 +0,0 @@
 | 
			
		||||
 | 
			
		||||
[0] Code Region
 | 
			
		||||
 | 
			
		||||
Iterations:        100
 | 
			
		||||
Instructions:      4200
 | 
			
		||||
Total Cycles:      2465
 | 
			
		||||
Total uOps:        5800
 | 
			
		||||
 | 
			
		||||
Dispatch Width:    6
 | 
			
		||||
uOps Per Cycle:    2.35
 | 
			
		||||
IPC:               1.70
 | 
			
		||||
Block RThroughput: 13.0
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Instruction Info:
 | 
			
		||||
[1]: #uOps
 | 
			
		||||
[2]: Latency
 | 
			
		||||
[3]: RThroughput
 | 
			
		||||
[4]: MayLoad
 | 
			
		||||
[5]: MayStore
 | 
			
		||||
[6]: HasSideEffects (U)
 | 
			
		||||
 | 
			
		||||
[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 | 
			
		||||
 1      4     1.00                        vpcmpgtd	%ymm4, %ymm3, %k5
 | 
			
		||||
 1      1     0.33                        vpaddd	%ymm15, %ymm4, %ymm4
 | 
			
		||||
 2      8     0.50    *                   vmovdqu32	(%r10,%r15,4), %ymm17 {%k5} {z}
 | 
			
		||||
 1      1     0.33                        vpaddd	%ymm17, %ymm17, %ymm18
 | 
			
		||||
 1      1     0.25                        addq	$8, %r15
 | 
			
		||||
 1      1     0.33                        vpaddd	%ymm18, %ymm17, %ymm19
 | 
			
		||||
 1      1     1.00                        kmovw	%k5, %k2
 | 
			
		||||
 1      1     1.00                        kmovw	%k5, %k3
 | 
			
		||||
 1      1     1.00                        kmovw	%k5, %k1
 | 
			
		||||
 1      0     0.17                        vpxord	%zmm21, %zmm21, %zmm21
 | 
			
		||||
 1      0     0.17                        vpxord	%zmm20, %zmm20, %zmm20
 | 
			
		||||
 1      0     0.17                        vpxord	%zmm22, %zmm22, %zmm22
 | 
			
		||||
 5      21    4.00    *                   vgatherdpd	8(%rbx,%ymm19,8), %zmm21 {%k2}
 | 
			
		||||
 5      21    4.00    *                   vgatherdpd	(%rbx,%ymm19,8), %zmm20 {%k3}
 | 
			
		||||
 5      21    4.00    *                   vgatherdpd	16(%rbx,%ymm19,8), %zmm22 {%k1}
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm21, %zmm1, %zmm18
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm20, %zmm2, %zmm17
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm22, %zmm0, %zmm19
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm18, %zmm18, %zmm31
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm17, %zmm17, %zmm31
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm19, %zmm19, %zmm31
 | 
			
		||||
 3      4     2.00                        vrcp14pd	%zmm31, %zmm30
 | 
			
		||||
 1      4     1.00                        vcmpltpd	%zmm14, %zmm31, %k6 {%k5}
 | 
			
		||||
 1      4     1.00                        vfpclasspd	$30, %zmm30, %k0
 | 
			
		||||
 1      1     0.50                        vmovaps	%zmm31, %zmm23
 | 
			
		||||
 2      11    0.50    *                   vfnmadd213pd	.L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
 | 
			
		||||
 1      1     1.00                        knotw	%k0, %k4
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm23, %zmm23, %zmm24
 | 
			
		||||
 1      4     0.50                        vfmadd213pd	%zmm30, %zmm23, %zmm30 {%k4}
 | 
			
		||||
 1      4     0.50                        vfmadd213pd	%zmm30, %zmm24, %zmm30 {%k4}
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm13, %zmm30, %zmm25
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm12, %zmm30, %zmm27
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm25, %zmm30, %zmm28
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm28, %zmm30, %zmm26
 | 
			
		||||
 1      4     0.50                        vfmsub213pd	%zmm5, %zmm28, %zmm30
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm27, %zmm26, %zmm29
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm30, %zmm29, %zmm23
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm17, %zmm23, %zmm10 {%k6}
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm18, %zmm23, %zmm9 {%k6}
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm19, %zmm23, %zmm8 {%k6}
 | 
			
		||||
 1      1     0.25                        cmpq	%r14, %r15
 | 
			
		||||
 1      1     0.50                        jb	..B1.16
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Resources:
 | 
			
		||||
[0]   - SKXDivider
 | 
			
		||||
[1]   - SKXFPDivider
 | 
			
		||||
[2]   - SKXPort0
 | 
			
		||||
[3]   - SKXPort1
 | 
			
		||||
[4]   - SKXPort2
 | 
			
		||||
[5]   - SKXPort3
 | 
			
		||||
[6]   - SKXPort4
 | 
			
		||||
[7]   - SKXPort5
 | 
			
		||||
[8]   - SKXPort6
 | 
			
		||||
[9]   - SKXPort7
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Resource pressure per iteration:
 | 
			
		||||
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    
 | 
			
		||||
 -      -     19.02  6.79   12.64  13.36   -     16.03  5.16    -     
 | 
			
		||||
 | 
			
		||||
Resource pressure by instruction:
 | 
			
		||||
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vpcmpgtd	%ymm4, %ymm3, %k5
 | 
			
		||||
 -      -     0.28   0.72    -      -      -      -      -      -     vpaddd	%ymm15, %ymm4, %ymm4
 | 
			
		||||
 -      -     0.14   0.71   0.55   0.45    -     0.15    -      -     vmovdqu32	(%r10,%r15,4), %ymm17 {%k5} {z}
 | 
			
		||||
 -      -      -     0.97    -      -      -     0.03    -      -     vpaddd	%ymm17, %ymm17, %ymm18
 | 
			
		||||
 -      -     0.14   0.41    -      -      -     0.13   0.32    -     addq	$8, %r15
 | 
			
		||||
 -      -      -     0.99    -      -      -     0.01    -      -     vpaddd	%ymm18, %ymm17, %ymm19
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     kmovw	%k5, %k2
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     kmovw	%k5, %k3
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     kmovw	%k5, %k1
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -     vpxord	%zmm21, %zmm21, %zmm21
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -     vpxord	%zmm20, %zmm20, %zmm20
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -     vpxord	%zmm22, %zmm22, %zmm22
 | 
			
		||||
 -      -     1.00   0.99   3.52   4.48    -     0.01   1.00    -     vgatherdpd	8(%rbx,%ymm19,8), %zmm21 {%k2}
 | 
			
		||||
 -      -     1.00   0.99   4.48   3.52    -     0.01   1.00    -     vgatherdpd	(%rbx,%ymm19,8), %zmm20 {%k3}
 | 
			
		||||
 -      -     1.00   1.00   3.52   4.48    -      -     1.00    -     vgatherdpd	16(%rbx,%ymm19,8), %zmm22 {%k1}
 | 
			
		||||
 -      -     0.02    -      -      -      -     0.98    -      -     vsubpd	%zmm21, %zmm1, %zmm18
 | 
			
		||||
 -      -     0.17    -      -      -      -     0.83    -      -     vsubpd	%zmm20, %zmm2, %zmm17
 | 
			
		||||
 -      -     0.18    -      -      -      -     0.82    -      -     vsubpd	%zmm22, %zmm0, %zmm19
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -     vmulpd	%zmm18, %zmm18, %zmm31
 | 
			
		||||
 -      -     0.69    -      -      -      -     0.31    -      -     vfmadd231pd	%zmm17, %zmm17, %zmm31
 | 
			
		||||
 -      -     0.68    -      -      -      -     0.32    -      -     vfmadd231pd	%zmm19, %zmm19, %zmm31
 | 
			
		||||
 -      -     2.00    -      -      -      -     1.00    -      -     vrcp14pd	%zmm31, %zmm30
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vcmpltpd	%zmm14, %zmm31, %k6 {%k5}
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vfpclasspd	$30, %zmm30, %k0
 | 
			
		||||
 -      -     0.83    -      -      -      -     0.17    -      -     vmovaps	%zmm31, %zmm23
 | 
			
		||||
 -      -     1.00    -     0.57   0.43    -      -      -      -     vfnmadd213pd	.L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     knotw	%k0, %k4
 | 
			
		||||
 -      -     0.44    -      -      -      -     0.56    -      -     vmulpd	%zmm23, %zmm23, %zmm24
 | 
			
		||||
 -      -     0.56    -      -      -      -     0.44    -      -     vfmadd213pd	%zmm30, %zmm23, %zmm30 {%k4}
 | 
			
		||||
 -      -     0.55    -      -      -      -     0.45    -      -     vfmadd213pd	%zmm30, %zmm24, %zmm30 {%k4}
 | 
			
		||||
 -      -     0.69    -      -      -      -     0.31    -      -     vmulpd	%zmm13, %zmm30, %zmm25
 | 
			
		||||
 -      -     0.31    -      -      -      -     0.69    -      -     vmulpd	%zmm12, %zmm30, %zmm27
 | 
			
		||||
 -      -     0.56    -      -      -      -     0.44    -      -     vmulpd	%zmm25, %zmm30, %zmm28
 | 
			
		||||
 -      -     0.02    -      -      -      -     0.98    -      -     vmulpd	%zmm28, %zmm30, %zmm26
 | 
			
		||||
 -      -     0.98    -      -      -      -     0.02    -      -     vfmsub213pd	%zmm5, %zmm28, %zmm30
 | 
			
		||||
 -      -     0.30    -      -      -      -     0.70    -      -     vmulpd	%zmm27, %zmm26, %zmm29
 | 
			
		||||
 -      -     0.16    -      -      -      -     0.84    -      -     vmulpd	%zmm30, %zmm29, %zmm23
 | 
			
		||||
 -      -     0.17    -      -      -      -     0.83    -      -     vfmadd231pd	%zmm17, %zmm23, %zmm10 {%k6}
 | 
			
		||||
 -      -     0.83    -      -      -      -     0.17    -      -     vfmadd231pd	%zmm18, %zmm23, %zmm9 {%k6}
 | 
			
		||||
 -      -     0.17    -      -      -      -     0.83    -      -     vfmadd231pd	%zmm19, %zmm23, %zmm8 {%k6}
 | 
			
		||||
 -      -      -     0.01    -      -      -     0.01   0.98    -     cmpq	%r14, %r15
 | 
			
		||||
 -      -     0.14    -      -      -      -      -     0.86    -     jb	..B1.16
 | 
			
		||||
@@ -1,130 +0,0 @@
 | 
			
		||||
 | 
			
		||||
[0] Code Region
 | 
			
		||||
 | 
			
		||||
Iterations:        100
 | 
			
		||||
Instructions:      4200
 | 
			
		||||
Total Cycles:      2465
 | 
			
		||||
Total uOps:        5800
 | 
			
		||||
 | 
			
		||||
Dispatch Width:    6
 | 
			
		||||
uOps Per Cycle:    2.35
 | 
			
		||||
IPC:               1.70
 | 
			
		||||
Block RThroughput: 13.0
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Instruction Info:
 | 
			
		||||
[1]: #uOps
 | 
			
		||||
[2]: Latency
 | 
			
		||||
[3]: RThroughput
 | 
			
		||||
[4]: MayLoad
 | 
			
		||||
[5]: MayStore
 | 
			
		||||
[6]: HasSideEffects (U)
 | 
			
		||||
 | 
			
		||||
[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 | 
			
		||||
 1      4     1.00                        vpcmpgtd	%ymm4, %ymm3, %k5
 | 
			
		||||
 1      1     0.33                        vpaddd	%ymm15, %ymm4, %ymm4
 | 
			
		||||
 2      8     0.50    *                   vmovdqu32	(%r10,%r15,4), %ymm17 {%k5} {z}
 | 
			
		||||
 1      1     0.33                        vpaddd	%ymm17, %ymm17, %ymm18
 | 
			
		||||
 1      1     0.25                        addq	$8, %r15
 | 
			
		||||
 1      1     0.33                        vpaddd	%ymm18, %ymm17, %ymm19
 | 
			
		||||
 1      1     1.00                        kmovw	%k5, %k2
 | 
			
		||||
 1      1     1.00                        kmovw	%k5, %k3
 | 
			
		||||
 1      1     1.00                        kmovw	%k5, %k1
 | 
			
		||||
 1      0     0.17                        vpxord	%zmm21, %zmm21, %zmm21
 | 
			
		||||
 1      0     0.17                        vpxord	%zmm20, %zmm20, %zmm20
 | 
			
		||||
 1      0     0.17                        vpxord	%zmm22, %zmm22, %zmm22
 | 
			
		||||
 5      21    4.00    *                   vgatherdpd	8(%rbx,%ymm19,8), %zmm21 {%k2}
 | 
			
		||||
 5      21    4.00    *                   vgatherdpd	(%rbx,%ymm19,8), %zmm20 {%k3}
 | 
			
		||||
 5      21    4.00    *                   vgatherdpd	16(%rbx,%ymm19,8), %zmm22 {%k1}
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm21, %zmm1, %zmm18
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm20, %zmm2, %zmm17
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm22, %zmm0, %zmm19
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm18, %zmm18, %zmm31
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm17, %zmm17, %zmm31
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm19, %zmm19, %zmm31
 | 
			
		||||
 3      4     2.00                        vrcp14pd	%zmm31, %zmm30
 | 
			
		||||
 1      4     1.00                        vcmpltpd	%zmm14, %zmm31, %k6 {%k5}
 | 
			
		||||
 1      4     1.00                        vfpclasspd	$30, %zmm30, %k0
 | 
			
		||||
 1      1     0.50                        vmovaps	%zmm31, %zmm23
 | 
			
		||||
 2      11    0.50    *                   vfnmadd213pd	.L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
 | 
			
		||||
 1      1     1.00                        knotw	%k0, %k4
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm23, %zmm23, %zmm24
 | 
			
		||||
 1      4     0.50                        vfmadd213pd	%zmm30, %zmm23, %zmm30 {%k4}
 | 
			
		||||
 1      4     0.50                        vfmadd213pd	%zmm30, %zmm24, %zmm30 {%k4}
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm13, %zmm30, %zmm25
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm12, %zmm30, %zmm27
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm25, %zmm30, %zmm28
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm28, %zmm30, %zmm26
 | 
			
		||||
 1      4     0.50                        vfmsub213pd	%zmm5, %zmm28, %zmm30
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm27, %zmm26, %zmm29
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm30, %zmm29, %zmm23
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm17, %zmm23, %zmm10 {%k6}
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm18, %zmm23, %zmm9 {%k6}
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm19, %zmm23, %zmm8 {%k6}
 | 
			
		||||
 1      1     0.25                        cmpq	%r14, %r15
 | 
			
		||||
 1      1     0.50                        jb	..B1.16
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Resources:
 | 
			
		||||
[0]   - ICXDivider
 | 
			
		||||
[1]   - ICXFPDivider
 | 
			
		||||
[2]   - ICXPort0
 | 
			
		||||
[3]   - ICXPort1
 | 
			
		||||
[4]   - ICXPort2
 | 
			
		||||
[5]   - ICXPort3
 | 
			
		||||
[6]   - ICXPort4
 | 
			
		||||
[7]   - ICXPort5
 | 
			
		||||
[8]   - ICXPort6
 | 
			
		||||
[9]   - ICXPort7
 | 
			
		||||
[10]  - ICXPort8
 | 
			
		||||
[11]  - ICXPort9
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Resource pressure per iteration:
 | 
			
		||||
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   
 | 
			
		||||
 -      -     19.02  6.79   12.64  13.36   -     16.03  5.16    -      -      -     
 | 
			
		||||
 | 
			
		||||
Resource pressure by instruction:
 | 
			
		||||
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   Instructions:
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -      -      -     vpcmpgtd	%ymm4, %ymm3, %k5
 | 
			
		||||
 -      -     0.28   0.72    -      -      -      -      -      -      -      -     vpaddd	%ymm15, %ymm4, %ymm4
 | 
			
		||||
 -      -     0.14   0.71   0.55   0.45    -     0.15    -      -      -      -     vmovdqu32	(%r10,%r15,4), %ymm17 {%k5} {z}
 | 
			
		||||
 -      -      -     0.97    -      -      -     0.03    -      -      -      -     vpaddd	%ymm17, %ymm17, %ymm18
 | 
			
		||||
 -      -     0.14   0.41    -      -      -     0.13   0.32    -      -      -     addq	$8, %r15
 | 
			
		||||
 -      -      -     0.99    -      -      -     0.01    -      -      -      -     vpaddd	%ymm18, %ymm17, %ymm19
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -      -      -     kmovw	%k5, %k2
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -      -      -     kmovw	%k5, %k3
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -      -      -     kmovw	%k5, %k1
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     vpxord	%zmm21, %zmm21, %zmm21
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     vpxord	%zmm20, %zmm20, %zmm20
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     vpxord	%zmm22, %zmm22, %zmm22
 | 
			
		||||
 -      -     1.00   0.99   3.52   4.48    -     0.01   1.00    -      -      -     vgatherdpd	8(%rbx,%ymm19,8), %zmm21 {%k2}
 | 
			
		||||
 -      -     1.00   0.99   4.48   3.52    -     0.01   1.00    -      -      -     vgatherdpd	(%rbx,%ymm19,8), %zmm20 {%k3}
 | 
			
		||||
 -      -     1.00   1.00   3.52   4.48    -      -     1.00    -      -      -     vgatherdpd	16(%rbx,%ymm19,8), %zmm22 {%k1}
 | 
			
		||||
 -      -     0.02    -      -      -      -     0.98    -      -      -      -     vsubpd	%zmm21, %zmm1, %zmm18
 | 
			
		||||
 -      -     0.17    -      -      -      -     0.83    -      -      -      -     vsubpd	%zmm20, %zmm2, %zmm17
 | 
			
		||||
 -      -     0.18    -      -      -      -     0.82    -      -      -      -     vsubpd	%zmm22, %zmm0, %zmm19
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -      -      -     vmulpd	%zmm18, %zmm18, %zmm31
 | 
			
		||||
 -      -     0.69    -      -      -      -     0.31    -      -      -      -     vfmadd231pd	%zmm17, %zmm17, %zmm31
 | 
			
		||||
 -      -     0.68    -      -      -      -     0.32    -      -      -      -     vfmadd231pd	%zmm19, %zmm19, %zmm31
 | 
			
		||||
 -      -     2.00    -      -      -      -     1.00    -      -      -      -     vrcp14pd	%zmm31, %zmm30
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -      -      -     vcmpltpd	%zmm14, %zmm31, %k6 {%k5}
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -      -      -     vfpclasspd	$30, %zmm30, %k0
 | 
			
		||||
 -      -     0.83    -      -      -      -     0.17    -      -      -      -     vmovaps	%zmm31, %zmm23
 | 
			
		||||
 -      -     1.00    -     0.57   0.43    -      -      -      -      -      -     vfnmadd213pd	.L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -      -      -     knotw	%k0, %k4
 | 
			
		||||
 -      -     0.44    -      -      -      -     0.56    -      -      -      -     vmulpd	%zmm23, %zmm23, %zmm24
 | 
			
		||||
 -      -     0.56    -      -      -      -     0.44    -      -      -      -     vfmadd213pd	%zmm30, %zmm23, %zmm30 {%k4}
 | 
			
		||||
 -      -     0.55    -      -      -      -     0.45    -      -      -      -     vfmadd213pd	%zmm30, %zmm24, %zmm30 {%k4}
 | 
			
		||||
 -      -     0.69    -      -      -      -     0.31    -      -      -      -     vmulpd	%zmm13, %zmm30, %zmm25
 | 
			
		||||
 -      -     0.31    -      -      -      -     0.69    -      -      -      -     vmulpd	%zmm12, %zmm30, %zmm27
 | 
			
		||||
 -      -     0.56    -      -      -      -     0.44    -      -      -      -     vmulpd	%zmm25, %zmm30, %zmm28
 | 
			
		||||
 -      -     0.02    -      -      -      -     0.98    -      -      -      -     vmulpd	%zmm28, %zmm30, %zmm26
 | 
			
		||||
 -      -     0.98    -      -      -      -     0.02    -      -      -      -     vfmsub213pd	%zmm5, %zmm28, %zmm30
 | 
			
		||||
 -      -     0.30    -      -      -      -     0.70    -      -      -      -     vmulpd	%zmm27, %zmm26, %zmm29
 | 
			
		||||
 -      -     0.16    -      -      -      -     0.84    -      -      -      -     vmulpd	%zmm30, %zmm29, %zmm23
 | 
			
		||||
 -      -     0.17    -      -      -      -     0.83    -      -      -      -     vfmadd231pd	%zmm17, %zmm23, %zmm10 {%k6}
 | 
			
		||||
 -      -     0.83    -      -      -      -     0.17    -      -      -      -     vfmadd231pd	%zmm18, %zmm23, %zmm9 {%k6}
 | 
			
		||||
 -      -     0.17    -      -      -      -     0.83    -      -      -      -     vfmadd231pd	%zmm19, %zmm23, %zmm8 {%k6}
 | 
			
		||||
 -      -      -     0.01    -      -      -     0.01   0.98    -      -      -     cmpq	%r14, %r15
 | 
			
		||||
 -      -     0.14    -      -      -      -      -     0.86    -      -      -     jb	..B1.16
 | 
			
		||||
@@ -1,77 +0,0 @@
 | 
			
		||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
 | 
			
		||||
Analyzed file:      lammps-icc-avx512.s
 | 
			
		||||
Architecture:       CSX
 | 
			
		||||
Timestamp:          2023-02-10 16:30:08
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
 | 
			
		||||
 * - Instruction micro-ops not bound to a port
 | 
			
		||||
 X - No throughput/latency information for this instruction in data file
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Combined Analysis Report
 | 
			
		||||
------------------------
 | 
			
		||||
                                      Port pressure in cycles                                      
 | 
			
		||||
     |  0   - 0DV  |  1   |  2   -  2D  |  3   -  3D  |  4   |  5   |  6   |  7   ||  CP  | LCD  |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
 200 |             |      |             |             |      |      |      |      ||      |      |   # pointer_increment=64 1303ca335e79351a96cfc07b8b9ec9d4
 | 
			
		||||
 201 |             |      |             |             |      |      |      |      ||      |      |   # LLVM-MCA-BEGIN
 | 
			
		||||
 202 |             |      |             |             |      |      |      |      ||      |      |   ..B1.16:                        # Preds ..B1.16 ..B1.15
 | 
			
		||||
 203 |             |      |             |             |      |      |      |      ||      |      |   # Execution count [2.50e+01]
 | 
			
		||||
 204 |             |      |             |             |      | 1.00 |      |      ||      |      |   vpcmpgtd  %ymm4, %ymm3, %k5                             #59.9
 | 
			
		||||
 205 | 0.00        | 1.00 |             |             |      | 0.00 |      |      ||      |      |   vpaddd    %ymm15, %ymm4, %ymm4                          #59.9
 | 
			
		||||
 206 | 0.00        | 1.00 | 0.50   0.50 | 0.50   0.50 |      | 0.00 |      |      ||  0.0 |      |   vmovdqu32 (%r10,%r15,4), %ymm17{%k5}{z}                 #60.21
 | 
			
		||||
 207 | 0.00        | 1.00 |             |             |      | 0.00 |      |      ||  1.0 |      |   vpaddd    %ymm17, %ymm17, %ymm18                        #61.36
 | 
			
		||||
 208 | 0.00        | 0.16 |             |             |      | 0.00 | 0.84 |      ||      |      |   addq      $8, %r15                                      #59.9
 | 
			
		||||
 209 | 0.00        | 1.00 |             |             |      | 0.00 |      |      ||  1.0 |      |   vpaddd    %ymm18, %ymm17, %ymm19                        #61.36
 | 
			
		||||
 210 | 1.00        |      |             |             |      |      |      |      ||      |      |   kmovw     %k5, %k2                                      #61.36
 | 
			
		||||
 211 | 1.00        |      |             |             |      |      |      |      ||      |      |   kmovw     %k5, %k3                                      #61.36
 | 
			
		||||
 212 | 1.00        |      |             |             |      |      |      |      ||      |      |   kmovw     %k5, %k1                                      #61.36
 | 
			
		||||
 213 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vpxord    %zmm21, %zmm21, %zmm21                        #61.36
 | 
			
		||||
 214 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vpxord    %zmm20, %zmm20, %zmm20                        #61.36
 | 
			
		||||
 215 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vpxord    %zmm22, %zmm22, %zmm22                        #61.36
 | 
			
		||||
 216 | 1.25        | 0.75 | 5.00   5.00 | 5.00   5.00 |      | 0.25 | 0.75 |      || 24.0 |      |   vgatherdpd 8(%rbx,%ymm19,8), %zmm21{%k2}                #61.36
 | 
			
		||||
 217 | 1.25        | 0.25 | 5.00   5.00 | 5.00   5.00 |      | 0.25 | 1.25 |      ||      |      |   vgatherdpd (%rbx,%ymm19,8), %zmm20{%k3}                 #61.36
 | 
			
		||||
 218 | 1.25        | 0.09 | 5.00   5.00 | 5.00   5.00 |      | 0.25 | 1.41 |      ||      |      |   vgatherdpd 16(%rbx,%ymm19,8), %zmm22{%k1}               #61.36
 | 
			
		||||
 219 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vsubpd    %zmm21, %zmm1, %zmm18                         #62.36
 | 
			
		||||
 220 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd    %zmm20, %zmm2, %zmm17                         #61.36
 | 
			
		||||
 221 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd    %zmm22, %zmm0, %zmm19                         #63.36
 | 
			
		||||
 222 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd    %zmm18, %zmm18, %zmm31                        #64.49
 | 
			
		||||
 223 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vfmadd231pd %zmm17, %zmm17, %zmm31                      #64.49
 | 
			
		||||
 224 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vfmadd231pd %zmm19, %zmm19, %zmm31                      #64.63
 | 
			
		||||
 225 | 2.50        |      |             |             |      | 0.50 |      |      ||  8.0 |      |   vrcp14pd  %zmm31, %zmm30                                #75.39
 | 
			
		||||
 226 |             |      |             |             |      | 1.00 |      |      ||      |      |   vcmppd    $1, %zmm14, %zmm31, %k6{%k5}                  #74.22
 | 
			
		||||
 227 |             |      |             |             |      | 1.00 |      |      ||      |      |   vfpclasspd $30, %zmm30, %k0                             #75.39
 | 
			
		||||
 228 |             |      |             |             |      |      |      |      ||      |      | * vmovaps   %zmm31, %zmm23                                #75.39
 | 
			
		||||
 229 | 0.50        |      | 0.50   0.50 | 0.50   0.50 |      | 0.50 |      |      ||  4.0 |      |   vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23 #75.39
 | 
			
		||||
 230 | 1.00        |      |             |             |      |      |      |      ||      |      |   knotw     %k0, %k4                                      #75.39
 | 
			
		||||
 231 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd    %zmm23, %zmm23, %zmm24                        #75.39
 | 
			
		||||
 232 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd213pd %zmm30, %zmm23, %zmm30{%k4}                 #75.39
 | 
			
		||||
 233 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vfmadd213pd %zmm30, %zmm24, %zmm30{%k4}                 #75.39
 | 
			
		||||
 234 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd    %zmm13, %zmm30, %zmm25                        #76.38
 | 
			
		||||
 235 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd    %zmm12, %zmm30, %zmm27                        #77.55
 | 
			
		||||
 236 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd    %zmm25, %zmm30, %zmm28                        #76.44
 | 
			
		||||
 237 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd    %zmm28, %zmm30, %zmm26                        #76.50
 | 
			
		||||
 238 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vfmsub213pd %zmm5, %zmm28, %zmm30                       #77.55
 | 
			
		||||
 239 | 0.00        |      |             |             |      | 1.00 |      |      ||  4.0 |      |   vmulpd    %zmm27, %zmm26, %zmm29                        #77.64
 | 
			
		||||
 240 | 0.00        |      |             |             |      | 1.00 |      |      ||  4.0 |      |   vmulpd    %zmm30, %zmm29, %zmm23                        #77.70
 | 
			
		||||
 241 | 0.00        |      |             |             |      | 1.00 |      |      ||  4.0 |      |   vfmadd231pd %zmm17, %zmm23, %zmm10{%k6}                 #78.17
 | 
			
		||||
 242 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vfmadd231pd %zmm18, %zmm23, %zmm9{%k6}                  #79.17
 | 
			
		||||
 243 | 0.00        |      |             |             |      | 1.00 |      |      ||      |  4.0 |   vfmadd231pd %zmm19, %zmm23, %zmm8{%k6}                  #80.17
 | 
			
		||||
 244 | 0.00        | 0.00 |             |             |      | 0.00 | 1.00 |      ||      |      |   cmpq      %r14, %r15                                    #59.9
 | 
			
		||||
 245 |             |      |             |             |      |      |      |      ||      |      | * jb        ..B1.16       # Prob 82%                      #59.9
 | 
			
		||||
 246 |             |      |             |             |      |      |      |      ||      |      |   # LLVM-MCA-END
 | 
			
		||||
 | 
			
		||||
       18.8          5.25   16.0   16.0   16.0   16.0          18.8   5.25           86.0    4.0  
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Loop-Carried Dependencies Analysis Report
 | 
			
		||||
-----------------------------------------
 | 
			
		||||
 243 |  4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6}                  #80.17| [243]
 | 
			
		||||
 242 |  4.0 | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6}                  #79.17| [242]
 | 
			
		||||
 241 |  4.0 | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6}                 #78.17| [241]
 | 
			
		||||
 208 |  1.0 | addq      $8, %r15                                      #59.9| [208]
 | 
			
		||||
 205 |  1.0 | vpaddd    %ymm15, %ymm4, %ymm4                          #59.9| [205]
 | 
			
		||||
 | 
			
		||||
@@ -1,77 +0,0 @@
 | 
			
		||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
 | 
			
		||||
Analyzed file:      lammps-icc-avx512.s
 | 
			
		||||
Architecture:       ICX
 | 
			
		||||
Timestamp:          2023-02-10 16:29:42
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
 | 
			
		||||
 * - Instruction micro-ops not bound to a port
 | 
			
		||||
 X - No throughput/latency information for this instruction in data file
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Combined Analysis Report
 | 
			
		||||
------------------------
 | 
			
		||||
                                                 Port pressure in cycles                                                 
 | 
			
		||||
     |  0   - 0DV  |  1   - 1DV  |  2   -  2D  |  3   -  3D  |  4   |   5   |  6   |  7   |  8   |  9   ||  CP  | LCD  |
 | 
			
		||||
------------------------------------------------------------------------------------------------------------------------
 | 
			
		||||
 200 |             |             |             |             |      |       |      |      |      |      ||      |      |   # pointer_increment=64 1303ca335e79351a96cfc07b8b9ec9d4
 | 
			
		||||
 201 |             |             |             |             |      |       |      |      |      |      ||      |      |   # LLVM-MCA-BEGIN
 | 
			
		||||
 202 |             |             |             |             |      |       |      |      |      |      ||      |      |   ..B1.16:                        # Preds ..B1.16 ..B1.15
 | 
			
		||||
 203 |             |             |             |             |      |       |      |      |      |      ||      |      |   # Execution count [2.50e+01]
 | 
			
		||||
 204 |             |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vpcmpgtd  %ymm4, %ymm3, %k5                             #59.9
 | 
			
		||||
 205 | 0.00        | 1.00        |             |             |      | 0.000 |      |      |      |      ||      |      |   vpaddd    %ymm15, %ymm4, %ymm4                          #59.9
 | 
			
		||||
 206 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||  5.0 |      |   vmovdqu32 (%r10,%r15,4), %ymm17{%k5}{z}                 #60.21
 | 
			
		||||
 207 | 0.00        | 1.00        |             |             |      | 0.000 |      |      |      |      ||  1.0 |      |   vpaddd    %ymm17, %ymm17, %ymm18                        #61.36
 | 
			
		||||
 208 | 0.00        | 0.00        |             |             |      | 0.000 | 1.00 |      |      |      ||      |      |   addq      $8, %r15                                      #59.9
 | 
			
		||||
 209 | 0.00        | 1.00        |             |             |      | 0.000 |      |      |      |      ||  1.0 |      |   vpaddd    %ymm18, %ymm17, %ymm19                        #61.36
 | 
			
		||||
 210 | 1.00        |             |             |             |      |       |      |      |      |      ||      |      |   kmovw     %k5, %k2                                      #61.36
 | 
			
		||||
 211 | 1.00        |             |             |             |      |       |      |      |      |      ||      |      |   kmovw     %k5, %k3                                      #61.36
 | 
			
		||||
 212 | 1.00        |             |             |             |      |       |      |      |      |      ||      |      |   kmovw     %k5, %k1                                      #61.36
 | 
			
		||||
 213 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vpxord    %zmm21, %zmm21, %zmm21                        #61.36
 | 
			
		||||
 214 | 0.24        |             |             |             |      | 0.760 |      |      |      |      ||      |      |   vpxord    %zmm20, %zmm20, %zmm20                        #61.36
 | 
			
		||||
 215 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vpxord    %zmm22, %zmm22, %zmm22                        #61.36
 | 
			
		||||
 216 | 0.67        | 2.33        | 7.00   7.00 | 7.00   7.00 |      | 0.000 |      |      |      |      || 24.0 |      |   vgatherdpd 8(%rbx,%ymm19,8), %zmm21{%k2}                #61.36
 | 
			
		||||
 217 | 0.67        | 2.33        | 7.00   7.00 | 7.00   7.00 |      | 0.000 |      |      |      |      ||      |      |   vgatherdpd (%rbx,%ymm19,8), %zmm20{%k3}                 #61.36
 | 
			
		||||
 218 | 0.67        | 2.33        | 7.00   7.00 | 7.00   7.00 |      | 0.000 |      |      |      |      ||      |      |   vgatherdpd 16(%rbx,%ymm19,8), %zmm22{%k1}               #61.36
 | 
			
		||||
 219 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vsubpd    %zmm21, %zmm1, %zmm18                         #62.36
 | 
			
		||||
 220 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vsubpd    %zmm20, %zmm2, %zmm17                         #61.36
 | 
			
		||||
 221 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vsubpd    %zmm22, %zmm0, %zmm19                         #63.36
 | 
			
		||||
 222 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulpd    %zmm18, %zmm18, %zmm31                        #64.49
 | 
			
		||||
 223 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vfmadd231pd %zmm17, %zmm17, %zmm31                      #64.49
 | 
			
		||||
 224 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vfmadd231pd %zmm19, %zmm19, %zmm31                      #64.63
 | 
			
		||||
 225 | 2.50        |             |             |             |      | 0.500 |      |      |      |      ||  6.0 |      |   vrcp14pd  %zmm31, %zmm30                                #75.39
 | 
			
		||||
 226 |             |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vcmppd    $1, %zmm14, %zmm31, %k6{%k5}                  #74.22
 | 
			
		||||
 227 |             |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vfpclasspd $30, %zmm30, %k0                             #75.39
 | 
			
		||||
 228 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmovaps   %zmm31, %zmm23                                #75.39
 | 
			
		||||
 229 | 0.50        |             | 0.50   0.50 | 0.50   0.50 |      | 0.500 |      |      |      |      ||  4.0 |      |   vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23 #75.39
 | 
			
		||||
 230 | 1.00        |             |             |             |      |       |      |      |      |      ||      |      |   knotw     %k0, %k4                                      #75.39
 | 
			
		||||
 231 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulpd    %zmm23, %zmm23, %zmm24                        #75.39
 | 
			
		||||
 232 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd213pd %zmm30, %zmm23, %zmm30{%k4}                 #75.39
 | 
			
		||||
 233 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vfmadd213pd %zmm30, %zmm24, %zmm30{%k4}                 #75.39
 | 
			
		||||
 234 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulpd    %zmm13, %zmm30, %zmm25                        #76.38
 | 
			
		||||
 235 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd    %zmm12, %zmm30, %zmm27                        #77.55
 | 
			
		||||
 236 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulpd    %zmm25, %zmm30, %zmm28                        #76.44
 | 
			
		||||
 237 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulpd    %zmm28, %zmm30, %zmm26                        #76.50
 | 
			
		||||
 238 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmsub213pd %zmm5, %zmm28, %zmm30                       #77.55
 | 
			
		||||
 239 | 0.25        |             |             |             |      | 0.750 |      |      |      |      ||  4.0 |      |   vmulpd    %zmm27, %zmm26, %zmm29                        #77.64
 | 
			
		||||
 240 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||  4.0 |      |   vmulpd    %zmm30, %zmm29, %zmm23                        #77.70
 | 
			
		||||
 241 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||  4.0 |      |   vfmadd231pd %zmm17, %zmm23, %zmm10{%k6}                 #78.17
 | 
			
		||||
 242 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vfmadd231pd %zmm18, %zmm23, %zmm9{%k6}                  #79.17
 | 
			
		||||
 243 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |  4.0 |   vfmadd231pd %zmm19, %zmm23, %zmm8{%k6}                  #80.17
 | 
			
		||||
 244 | 0.00        | 0.00        |             |             |      | -0.01 | 1.00 |      |      |      ||      |      |   cmpq      %r14, %r15                                    #59.9
 | 
			
		||||
 245 |             |             |             |             |      |       |      |      |      |      ||      |      | * jb        ..B1.16       # Prob 82%                      #59.9
 | 
			
		||||
 246 |             |             |             |             |      |       |      |      |      |      ||      |      |   # LLVM-MCA-END
 | 
			
		||||
 | 
			
		||||
       18.0          9.98          22.0   22.0   22.0   22.0          18.00   2.00                           89    4.0  
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Loop-Carried Dependencies Analysis Report
 | 
			
		||||
-----------------------------------------
 | 
			
		||||
 243 |  4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6}                  #80.17| [243]
 | 
			
		||||
 242 |  4.0 | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6}                  #79.17| [242]
 | 
			
		||||
 241 |  4.0 | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6}                 #78.17| [241]
 | 
			
		||||
 208 |  1.0 | addq      $8, %r15                                      #59.9| [208]
 | 
			
		||||
 205 |  1.0 | vpaddd    %ymm15, %ymm4, %ymm4                          #59.9| [205]
 | 
			
		||||
 | 
			
		||||
@@ -1,197 +0,0 @@
 | 
			
		||||
 | 
			
		||||
[0] Code Region
 | 
			
		||||
 | 
			
		||||
Iterations:        100
 | 
			
		||||
Instructions:      7000
 | 
			
		||||
Total Cycles:      3866
 | 
			
		||||
Total uOps:        7900
 | 
			
		||||
 | 
			
		||||
Dispatch Width:    6
 | 
			
		||||
uOps Per Cycle:    2.04
 | 
			
		||||
IPC:               1.81
 | 
			
		||||
Block RThroughput: 21.5
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Instruction Info:
 | 
			
		||||
[1]: #uOps
 | 
			
		||||
[2]: Latency
 | 
			
		||||
[3]: RThroughput
 | 
			
		||||
[4]: MayLoad
 | 
			
		||||
[5]: MayStore
 | 
			
		||||
[6]: HasSideEffects (U)
 | 
			
		||||
 | 
			
		||||
[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 | 
			
		||||
 1      8     0.50    *                   vpbroadcastd	.LCPI0_1(%rip), %xmm1
 | 
			
		||||
 1      10    0.50    *                   vpmulld	(%r11,%rbp,4), %xmm1, %xmm11
 | 
			
		||||
 2      4     1.50                        vpmovsxdq	%xmm11, %ymm1
 | 
			
		||||
 1      1     0.50                        vpsllq	$3, %ymm1, %ymm1
 | 
			
		||||
 1      1     0.25                        vpaddq	%ymm1, %ymm3, %ymm1
 | 
			
		||||
 1      1     1.00                        vmovq	%xmm1, %r14
 | 
			
		||||
 2      1     1.00                        vpextrq	$1, %xmm1, %r9
 | 
			
		||||
 1      4     1.00                        vextracti128	$1, %ymm1, %xmm1
 | 
			
		||||
 1      8     0.50    *                   vmovsd	(%r14), %xmm2
 | 
			
		||||
 1      8     0.50    *                   vpsubd	.LCPI0_5, %xmm11, %xmm6
 | 
			
		||||
 2      4     1.50                        vpmovsxdq	%xmm6, %ymm6
 | 
			
		||||
 1      1     0.50                        vpsllq	$3, %ymm6, %ymm6
 | 
			
		||||
 1      1     1.00                        vmovq	%xmm1, %rdi
 | 
			
		||||
 1      1     0.25                        vpaddq	%ymm6, %ymm3, %ymm6
 | 
			
		||||
 1      1     1.00                        vmovq	%xmm6, %rcx
 | 
			
		||||
 2      1     1.00                        vpextrq	$1, %xmm1, %rbx
 | 
			
		||||
 2      1     1.00                        vpextrq	$1, %xmm6, %rax
 | 
			
		||||
 1      4     1.00                        vextracti128	$1, %ymm6, %xmm1
 | 
			
		||||
 1      8     0.50    *                   vmovsd	(%rdi), %xmm6
 | 
			
		||||
 1      1     1.00                        vmovq	%xmm1, %rdi
 | 
			
		||||
 2      1     1.00                        vpextrq	$1, %xmm1, %rsi
 | 
			
		||||
 1      8     0.50    *                   vmovsd	(%rdi), %xmm1
 | 
			
		||||
 1      8     0.50    *                   vmovsd	(%rcx), %xmm7
 | 
			
		||||
 1      8     0.50    *                   vpbroadcastd	.LCPI0_2(%rip), %xmm12
 | 
			
		||||
 1      8     0.50    *                   vmovhpd	(%r9), %xmm2, %xmm2
 | 
			
		||||
 1      1     0.25                        vpaddd	%xmm12, %xmm11, %xmm4
 | 
			
		||||
 2      4     1.50                        vpmovsxdq	%xmm4, %ymm4
 | 
			
		||||
 1      8     0.50    *                   vmovhpd	(%rax), %xmm7, %xmm7
 | 
			
		||||
 1      1     0.50                        vpsllq	$3, %ymm4, %ymm4
 | 
			
		||||
 1      1     0.25                        vpaddq	%ymm4, %ymm3, %ymm4
 | 
			
		||||
 1      8     0.50    *                   vmovhpd	(%rbx), %xmm6, %xmm6
 | 
			
		||||
 2      1     1.00                        vpextrq	$1, %xmm4, %rax
 | 
			
		||||
 1      8     0.50    *                   vmovhpd	(%rsi), %xmm1, %xmm1
 | 
			
		||||
 1      1     1.00                        vmovq	%xmm4, %rcx
 | 
			
		||||
 1      4     1.00                        vextracti128	$1, %ymm4, %xmm4
 | 
			
		||||
 1      1     1.00                        vmovq	%xmm4, %rsi
 | 
			
		||||
 1      2     1.00                        vinsertf128	$1, %xmm6, %ymm2, %ymm2
 | 
			
		||||
 2      1     1.00                        vpextrq	$1, %xmm4, %rdi
 | 
			
		||||
 1      8     0.50    *                   vmovsd	(%rsi), %xmm4
 | 
			
		||||
 1      3     0.50                        vsubpd	%ymm2, %ymm14, %ymm2
 | 
			
		||||
 1      8     0.50    *                   vmovhpd	(%rdi), %xmm4, %xmm4
 | 
			
		||||
 1      8     0.50    *                   vmovsd	(%rcx), %xmm6
 | 
			
		||||
 1      2     1.00                        vinsertf128	$1, %xmm1, %ymm7, %ymm1
 | 
			
		||||
 1      8     0.50    *                   vmovhpd	(%rax), %xmm6, %xmm6
 | 
			
		||||
 1      2     1.00                        vinsertf128	$1, %xmm4, %ymm6, %ymm4
 | 
			
		||||
 1      3     0.50                        vsubpd	%ymm1, %ymm5, %ymm1
 | 
			
		||||
 1      3     0.50                        vsubpd	%ymm4, %ymm10, %ymm4
 | 
			
		||||
 1      3     0.50                        vmulpd	%ymm2, %ymm2, %ymm6
 | 
			
		||||
 1      4     1.00                        vfmadd231pd	%ymm1, %ymm1, %ymm6
 | 
			
		||||
 1      4     1.00                        vfmadd231pd	%ymm4, %ymm4, %ymm6
 | 
			
		||||
 1      8     0.50    *                   vbroadcastsd	.LCPI0_3(%rip), %ymm7
 | 
			
		||||
 1      13    5.00                        vdivpd	%ymm6, %ymm7, %ymm7
 | 
			
		||||
 1      3     0.50                        vmulpd	%ymm7, %ymm7, %ymm11
 | 
			
		||||
 1      3     0.50                        vmulpd	%ymm9, %ymm11, %ymm11
 | 
			
		||||
 1      8     0.50    *                   vbroadcastsd	.LCPI0_4(%rip), %ymm12
 | 
			
		||||
 1      3     0.50                        vmulpd	%ymm7, %ymm11, %ymm11
 | 
			
		||||
 1      3     0.50                        vaddpd	%ymm12, %ymm11, %ymm12
 | 
			
		||||
 1      10    0.50    *                   vmulpd	128(%rsp), %ymm7, %ymm7
 | 
			
		||||
 1      3     0.50                        vmulpd	%ymm7, %ymm11, %ymm7
 | 
			
		||||
 1      3     0.50                        vmulpd	%ymm7, %ymm12, %ymm7
 | 
			
		||||
 1      1     0.50                        vcmpltpd	%ymm8, %ymm6, %ymm6
 | 
			
		||||
 1      4     1.00                        vfmadd213pd	%ymm0, %ymm7, %ymm2
 | 
			
		||||
 1      1     0.50                        vblendvpd	%ymm6, %ymm2, %ymm0, %ymm0
 | 
			
		||||
 1      4     1.00                        vfmadd213pd	%ymm15, %ymm7, %ymm1
 | 
			
		||||
 1      4     1.00                        vfmadd213pd	%ymm13, %ymm7, %ymm4
 | 
			
		||||
 1      1     0.50                        vblendvpd	%ymm6, %ymm1, %ymm15, %ymm15
 | 
			
		||||
 1      1     0.50                        vblendvpd	%ymm6, %ymm4, %ymm13, %ymm13
 | 
			
		||||
 1      1     0.25                        addq	$4, %rbp
 | 
			
		||||
 1      1     0.25                        cmpq	%rdx, %rbp
 | 
			
		||||
 1      1     0.50                        jb	.LBB0_9
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Resources:
 | 
			
		||||
[0]   - Zn3AGU0
 | 
			
		||||
[1]   - Zn3AGU1
 | 
			
		||||
[2]   - Zn3AGU2
 | 
			
		||||
[3]   - Zn3ALU0
 | 
			
		||||
[4]   - Zn3ALU1
 | 
			
		||||
[5]   - Zn3ALU2
 | 
			
		||||
[6]   - Zn3ALU3
 | 
			
		||||
[7]   - Zn3BRU1
 | 
			
		||||
[8]   - Zn3FPP0
 | 
			
		||||
[9]   - Zn3FPP1
 | 
			
		||||
[10]  - Zn3FPP2
 | 
			
		||||
[11]  - Zn3FPP3
 | 
			
		||||
[12.0] - Zn3FPP45
 | 
			
		||||
[12.1] - Zn3FPP45
 | 
			
		||||
[13]  - Zn3FPSt
 | 
			
		||||
[14.0] - Zn3LSU
 | 
			
		||||
[14.1] - Zn3LSU
 | 
			
		||||
[14.2] - Zn3LSU
 | 
			
		||||
[15.0] - Zn3Load
 | 
			
		||||
[15.1] - Zn3Load
 | 
			
		||||
[15.2] - Zn3Load
 | 
			
		||||
[16.0] - Zn3Store
 | 
			
		||||
[16.1] - Zn3Store
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Resource pressure per iteration:
 | 
			
		||||
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] 
 | 
			
		||||
 -      -      -     0.60   0.60   0.60   0.60   0.60   16.84  23.53  16.30  7.33   21.50  21.50   -     6.33   6.33   6.34   6.33   6.33   6.34    -      -     
 | 
			
		||||
 | 
			
		||||
Resource pressure by instruction:
 | 
			
		||||
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     0.03   0.97    -     0.51   0.49    -     0.34   0.33   0.33   0.34   0.33   0.33    -      -     vpbroadcastd	.LCPI0_1(%rip), %xmm1
 | 
			
		||||
 -      -      -      -      -      -      -      -     0.65    -      -     0.35   0.34   0.66    -     0.49   0.05   0.46   0.49   0.05   0.46    -      -     vpmulld	(%r11,%rbp,4), %xmm1, %xmm11
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     0.06   2.94    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxdq	%xmm11, %ymm1
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     0.65   0.35    -      -      -      -      -      -      -      -      -      -      -      -     vpsllq	$3, %ymm1, %ymm1
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vpaddq	%ymm1, %ymm3, %ymm1
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -     vmovq	%xmm1, %r14
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -     vpextrq	$1, %xmm1, %r9
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     vextracti128	$1, %ymm1, %xmm1
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     0.48   0.35   0.17   0.48   0.35   0.17    -      -     vmovsd	(%r14), %xmm2
 | 
			
		||||
 -      -      -      -      -      -      -      -     0.01   0.18   0.17   0.64   0.47   0.53    -     0.34   0.33   0.33   0.34   0.33   0.33    -      -     vpsubd	.LCPI0_5, %xmm11, %xmm6
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     1.92   1.08    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxdq	%xmm6, %ymm6
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     0.32   0.68    -      -      -      -      -      -      -      -      -      -      -      -     vpsllq	$3, %ymm6, %ymm6
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     1.30   0.70    -      -      -      -      -      -      -      -      -     vmovq	%xmm1, %rdi
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -     0.32   0.68    -      -      -      -      -      -      -      -      -      -      -     vpaddq	%ymm6, %ymm3, %ymm6
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     1.34   0.66    -      -      -      -      -      -      -      -      -     vmovq	%xmm6, %rcx
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -     vpextrq	$1, %xmm1, %rbx
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -     vpextrq	$1, %xmm6, %rax
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     vextracti128	$1, %ymm6, %xmm1
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     0.03   0.65   0.32   0.03   0.65   0.32    -      -     vmovsd	(%rdi), %xmm6
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     0.36   1.64    -      -      -      -      -      -      -      -      -     vmovq	%xmm1, %rdi
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     1.64   0.36    -      -      -      -      -      -      -      -      -     vpextrq	$1, %xmm1, %rsi
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     0.32   0.68    -     0.51   0.33   0.16   0.51   0.33   0.16    -      -     vmovsd	(%rdi), %xmm1
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     0.68   0.32    -     0.49   0.01   0.50   0.49   0.01   0.50    -      -     vmovsd	(%rcx), %xmm7
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     0.48   0.52    -     0.67   0.33    -     0.17   0.62   0.21   0.17   0.62   0.21    -      -     vpbroadcastd	.LCPI0_2(%rip), %xmm12
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     0.01   0.99    -     0.17   0.83    -     0.02   0.64   0.34   0.02   0.64   0.34    -      -     vmovhpd	(%r9), %xmm2, %xmm2
 | 
			
		||||
 -      -      -      -      -      -      -      -     0.01    -      -     0.99    -      -      -      -      -      -      -      -      -      -      -     vpaddd	%xmm12, %xmm11, %xmm4
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     0.57   2.43    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxdq	%xmm4, %ymm4
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     0.34   0.66    -     0.82   0.18    -     0.49   0.35   0.16   0.49   0.35   0.16    -      -     vmovhpd	(%rax), %xmm7, %xmm7
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     0.34   0.66    -      -      -      -      -      -      -      -      -      -      -      -     vpsllq	$3, %ymm4, %ymm4
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -     0.01   0.99    -      -      -      -      -      -      -      -      -      -      -     vpaddq	%ymm4, %ymm3, %ymm4
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     0.51   0.49    -     0.49   0.51    -     0.35   0.16   0.49   0.35   0.16   0.49    -      -     vmovhpd	(%rbx), %xmm6, %xmm6
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     1.04   0.96    -      -      -      -      -      -      -      -      -     vpextrq	$1, %xmm4, %rax
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     0.49   0.51    -     0.17   0.83    -     0.16   0.49   0.35   0.16   0.49   0.35    -      -     vmovhpd	(%rsi), %xmm1, %xmm1
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -     vmovq	%xmm4, %rcx
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     vextracti128	$1, %ymm4, %xmm4
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -     vmovq	%xmm4, %rsi
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     vinsertf128	$1, %xmm6, %ymm2, %ymm2
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -     vpextrq	$1, %xmm4, %rdi
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     0.49   0.35   0.16   0.49   0.35   0.16    -      -     vmovsd	(%rsi), %xmm4
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -     0.31   0.69    -      -      -      -      -      -      -      -      -      -      -     vsubpd	%ymm2, %ymm14, %ymm2
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     0.49   0.51    -     0.48   0.52    -     0.35   0.16   0.49   0.35   0.16   0.49    -      -     vmovhpd	(%rdi), %xmm4, %xmm4
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     0.52   0.48    -     0.16   0.49   0.35   0.16   0.49   0.35    -      -     vmovsd	(%rcx), %xmm6
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     vinsertf128	$1, %xmm1, %ymm7, %ymm1
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     0.35   0.65    -     0.50   0.50    -     0.47   0.35   0.18   0.47   0.35   0.18    -      -     vmovhpd	(%rax), %xmm6, %xmm6
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     vinsertf128	$1, %xmm4, %ymm6, %ymm4
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -     0.33   0.67    -      -      -      -      -      -      -      -      -      -      -     vsubpd	%ymm1, %ymm5, %ymm1
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -     0.51   0.49    -      -      -      -      -      -      -      -      -      -      -     vsubpd	%ymm4, %ymm10, %ymm4
 | 
			
		||||
 -      -      -      -      -      -      -      -     0.52   0.48    -      -      -      -      -      -      -      -      -      -      -      -      -     vmulpd	%ymm2, %ymm2, %ymm6
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     vfmadd231pd	%ymm1, %ymm1, %ymm6
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.34   0.66    -      -      -      -      -      -      -      -      -      -      -      -      -     vfmadd231pd	%ymm4, %ymm4, %ymm6
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     0.66   0.34    -     0.51   0.49    -     0.19   0.32   0.49   0.19   0.32   0.49    -      -     vbroadcastsd	.LCPI0_3(%rip), %ymm7
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     5.00    -      -      -      -      -      -      -      -      -      -      -      -      -     vdivpd	%ymm6, %ymm7, %ymm7
 | 
			
		||||
 -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     vmulpd	%ymm7, %ymm7, %ymm11
 | 
			
		||||
 -      -      -      -      -      -      -      -     0.99   0.01    -      -      -      -      -      -      -      -      -      -      -      -      -     vmulpd	%ymm9, %ymm11, %ymm11
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     0.30   0.70    -     0.49   0.51    -     0.34   0.33   0.33   0.34   0.33   0.33    -      -     vbroadcastsd	.LCPI0_4(%rip), %ymm12
 | 
			
		||||
 -      -      -      -      -      -      -      -     0.82   0.18    -      -      -      -      -      -      -      -      -      -      -      -      -     vmulpd	%ymm7, %ymm11, %ymm11
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -     0.17   0.83    -      -      -      -      -      -      -      -      -      -      -     vaddpd	%ymm12, %ymm11, %ymm12
 | 
			
		||||
 -      -      -      -      -      -      -      -     0.01   0.99    -      -     0.18   0.82    -     0.46   0.02   0.52   0.46   0.02   0.52    -      -     vmulpd	128(%rsp), %ymm7, %ymm7
 | 
			
		||||
 -      -      -      -      -      -      -      -     0.99   0.01    -      -      -      -      -      -      -      -      -      -      -      -      -     vmulpd	%ymm7, %ymm11, %ymm7
 | 
			
		||||
 -      -      -      -      -      -      -      -     0.67   0.33    -      -      -      -      -      -      -      -      -      -      -      -      -     vmulpd	%ymm7, %ymm12, %ymm7
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     vcmpltpd	%ymm8, %ymm6, %ymm6
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.34   0.66    -      -      -      -      -      -      -      -      -      -      -      -      -     vfmadd213pd	%ymm0, %ymm7, %ymm2
 | 
			
		||||
 -      -      -      -      -      -      -      -     0.66   0.34    -      -      -      -      -      -      -      -      -      -      -      -      -     vblendvpd	%ymm6, %ymm2, %ymm0, %ymm0
 | 
			
		||||
 -      -      -      -      -      -      -      -     0.66   1.34    -      -      -      -      -      -      -      -      -      -      -      -      -     vfmadd213pd	%ymm15, %ymm7, %ymm1
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.34   0.66    -      -      -      -      -      -      -      -      -      -      -      -      -     vfmadd213pd	%ymm13, %ymm7, %ymm4
 | 
			
		||||
 -      -      -      -      -      -      -      -     0.34   0.66    -      -      -      -      -      -      -      -      -      -      -      -      -     vblendvpd	%ymm6, %ymm1, %ymm15, %ymm15
 | 
			
		||||
 -      -      -      -      -      -      -      -     0.99   0.01    -      -      -      -      -      -      -      -      -      -      -      -      -     vblendvpd	%ymm6, %ymm4, %ymm13, %ymm13
 | 
			
		||||
 -      -      -      -     0.40   0.20   0.40    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addq	$4, %rbp
 | 
			
		||||
 -      -      -     0.20   0.20   0.40   0.20    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpq	%rdx, %rbp
 | 
			
		||||
 -      -      -     0.40    -      -      -     0.60    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     jb	.LBB0_9
 | 
			
		||||
@@ -1,108 +0,0 @@
 | 
			
		||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
 | 
			
		||||
Analyzed file:      lammps-icx-avx2zen.s
 | 
			
		||||
Architecture:       ZEN3
 | 
			
		||||
Timestamp:          2023-02-10 16:31:30
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
 | 
			
		||||
 * - Instruction micro-ops not bound to a port
 | 
			
		||||
 X - No throughput/latency information for this instruction in data file
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Combined Analysis Report
 | 
			
		||||
------------------------
 | 
			
		||||
                                                           Port pressure in cycles                                                           
 | 
			
		||||
     |  0   |  1   |  2   |  3   | DV0  | DV1  |  4   |  5   |  6   |  7   |  8   - 8DV  |  9   |  10  |  11  |  12  |  13  ||  CP  | LCD  |
 | 
			
		||||
--------------------------------------------------------------------------------------------------------------------------------------------
 | 
			
		||||
 175 |      |      |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   # pointer_increment=16 e95035fc9e97f63299dd5188a0872bfc
 | 
			
		||||
 176 |      |      |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   # LLVM-MCA-BEGIN
 | 
			
		||||
 177 |      |      |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   .LBB0_9:                                #
 | 
			
		||||
 178 |      |      |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   #   Parent Loop BB0_6 Depth=1
 | 
			
		||||
 179 |      |      |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   # =>  This Inner Loop Header: Depth=2
 | 
			
		||||
 180 |      | 0.00 | 1.00 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||  1.0 |      |   vpbroadcastd .LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
 | 
			
		||||
 181 | 0.00 |      |      | 1.00 |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||  3.0 |      |   vpmulld (%r11,%rbp,4), %xmm1, %xmm11
 | 
			
		||||
 182 | 0.00 | 0.75 | 0.38 | 0.87 |      |      |      |      |      |      |             |      |      |      |      |      ||  4.0 |      |   vpmovsxdq %xmm11, %ymm1
 | 
			
		||||
 183 |      | 0.00 | 1.00 |      |      |      |      |      |      |      |             |      |      |      |      |      ||  1.0 |      |   vpsllq $3, %ymm1, %ymm1
 | 
			
		||||
 184 | 0.00 | 0.00 | 0.50 | 0.50 |      |      |      |      |      |      |             |      |      |      |      |      ||  1.0 |      |   vpaddq %ymm1, %ymm3, %ymm1
 | 
			
		||||
 185 | 0.00 | 0.00 | 0.50 | 0.50 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vmovq %xmm1, %r14
 | 
			
		||||
 186 | 0.12 | 1.88 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpextrq $1, %xmm1, %r9
 | 
			
		||||
 187 |      | 1.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vextracti128 $1, %ymm1, %xmm1
 | 
			
		||||
 188 |      |      |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovsd (%r14), %xmm2           # xmm2 = mem[0],zero
 | 
			
		||||
 189 | 0.00 | 0.00 | 0.50 | 0.50 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpsubd .LCPI0_5, %xmm11, %xmm6
 | 
			
		||||
 190 | 0.00 | 0.75 | 0.38 | 0.87 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpmovsxdq %xmm6, %ymm6
 | 
			
		||||
 191 |      | 0.00 | 1.00 |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpsllq $3, %ymm6, %ymm6
 | 
			
		||||
 192 | 0.00 | 0.00 | 0.49 | 0.51 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vmovq %xmm1, %rdi
 | 
			
		||||
 193 | 0.00 | 0.00 | 0.51 | 0.49 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpaddq %ymm6, %ymm3, %ymm6
 | 
			
		||||
 194 | 0.00 | 0.00 | 0.49 | 0.51 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vmovq %xmm6, %rcx
 | 
			
		||||
 195 | 0.13 | 1.87 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  6.0 |      |   vpextrq $1, %xmm1, %rbx
 | 
			
		||||
 196 | 0.00 | 2.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpextrq $1, %xmm6, %rax
 | 
			
		||||
 197 |      | 1.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vextracti128 $1, %ymm6, %xmm1
 | 
			
		||||
 198 |      |      |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovsd (%rdi), %xmm6           # xmm6 = mem[0],zero
 | 
			
		||||
 199 | 0.00 | 0.00 | 0.00 | 1.00 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vmovq %xmm1, %rdi
 | 
			
		||||
 200 | 0.00 | 2.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpextrq $1, %xmm1, %rsi
 | 
			
		||||
 201 |      |      |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovsd (%rdi), %xmm1           # xmm1 = mem[0],zero
 | 
			
		||||
 202 |      |      |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovsd (%rcx), %xmm7           # xmm7 = mem[0],zero
 | 
			
		||||
 203 |      | 0.00 | 1.00 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vpbroadcastd .LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
 | 
			
		||||
 204 |      | 0.00 | 1.00 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovhpd (%r9), %xmm2, %xmm2     # xmm2 = xmm2[0],mem[0]
 | 
			
		||||
 205 | 0.00 | 0.00 | 0.63 | 0.37 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpaddd %xmm12, %xmm11, %xmm4
 | 
			
		||||
 206 | 0.00 | 0.75 | 0.00 | 1.25 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpmovsxdq %xmm4, %ymm4
 | 
			
		||||
 207 |      | 0.00 | 1.00 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovhpd (%rax), %xmm7, %xmm7    # xmm7 = xmm7[0],mem[0]
 | 
			
		||||
 208 |      | 0.00 | 1.00 |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpsllq $3, %ymm4, %ymm4
 | 
			
		||||
 209 | 0.00 | 0.00 | 0.00 | 1.00 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpaddq %ymm4, %ymm3, %ymm4
 | 
			
		||||
 210 |      | 0.00 | 1.00 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||  5.0 |      |   vmovhpd (%rbx), %xmm6, %xmm6    # xmm6 = xmm6[0],mem[0]
 | 
			
		||||
 211 | 0.75 | 1.25 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpextrq $1, %xmm4, %rax
 | 
			
		||||
 212 |      | 0.00 | 1.00 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovhpd (%rsi), %xmm1, %xmm1    # xmm1 = xmm1[0],mem[0]
 | 
			
		||||
 213 | 0.00 | 0.00 | 0.50 | 0.50 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vmovq %xmm4, %rcx
 | 
			
		||||
 214 |      | 1.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vextracti128 $1, %ymm4, %xmm4
 | 
			
		||||
 215 | 0.00 | 0.00 | 0.00 | 1.00 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vmovq %xmm4, %rsi
 | 
			
		||||
 216 |      | 1.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  1.0 |      |   vinsertf128 $1, %xmm6, %ymm2, %ymm2
 | 
			
		||||
 217 | 1.00 | 1.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpextrq $1, %xmm4, %rdi
 | 
			
		||||
 218 |      |      |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovsd (%rsi), %xmm4           # xmm4 = mem[0],zero
 | 
			
		||||
 219 |      |      | 0.00 | 1.00 |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vsubpd %ymm2, %ymm14, %ymm2
 | 
			
		||||
 220 |      | 0.00 | 1.00 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovhpd (%rdi), %xmm4, %xmm4    # xmm4 = xmm4[0],mem[0]
 | 
			
		||||
 221 |      |      |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovsd (%rcx), %xmm6           # xmm6 = mem[0],zero
 | 
			
		||||
 222 |      | 1.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vinsertf128 $1, %xmm1, %ymm7, %ymm1
 | 
			
		||||
 223 |      | 0.00 | 1.00 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovhpd (%rax), %xmm6, %xmm6    # xmm6 = xmm6[0],mem[0]
 | 
			
		||||
 224 |      | 1.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vinsertf128 $1, %xmm4, %ymm6, %ymm4
 | 
			
		||||
 225 |      |      | 0.00 | 1.00 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vsubpd %ymm1, %ymm5, %ymm1
 | 
			
		||||
 226 |      |      | 0.00 | 1.00 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vsubpd %ymm4, %ymm10, %ymm4
 | 
			
		||||
 227 | 1.00 | 0.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vmulpd %ymm2, %ymm2, %ymm6
 | 
			
		||||
 228 | 1.00 | 0.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  4.0 |      |   vfmadd231pd %ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
 | 
			
		||||
 229 | 1.00 | 0.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  4.0 |      |   vfmadd231pd %ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
 | 
			
		||||
 230 | 1.00 |      |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vbroadcastsd .LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 | 
			
		||||
 231 |      |      |      |      | 4.50 | 4.50 |      |      |      |      |             |      |      |      |      |      || 13.0 |      |   vdivpd %ymm6, %ymm7, %ymm7
 | 
			
		||||
 232 | 1.00 | 0.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vmulpd %ymm7, %ymm7, %ymm11
 | 
			
		||||
 233 | 1.00 | 0.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vmulpd %ymm9, %ymm11, %ymm11
 | 
			
		||||
 234 | 1.00 |      |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vbroadcastsd .LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 | 
			
		||||
 235 | 1.00 | 0.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vmulpd %ymm7, %ymm11, %ymm11
 | 
			
		||||
 236 |      |      | 0.00 | 1.00 |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vaddpd %ymm12, %ymm11, %ymm12
 | 
			
		||||
 237 | 1.00 | 0.00 |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmulpd 128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
 | 
			
		||||
 238 | 1.00 | 0.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vmulpd %ymm7, %ymm11, %ymm7
 | 
			
		||||
 239 | 1.00 | 0.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vmulpd %ymm7, %ymm12, %ymm7
 | 
			
		||||
 240 |      |      | 0.00 | 1.00 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vcmpltpd %ymm8, %ymm6, %ymm6
 | 
			
		||||
 241 | 1.00 | 0.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  4.0 |      |   vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
 | 
			
		||||
 242 | 1.00 | 0.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  1.0 |      |   vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
 | 
			
		||||
 243 | 1.00 | 0.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
 | 
			
		||||
 244 | 1.00 | 0.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |  4.0 |   vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
 | 
			
		||||
 245 | 1.00 | 0.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
 | 
			
		||||
 246 | 0.75 | 0.25 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |  1.0 |   vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
 | 
			
		||||
 247 |      |      |      |      |      |      |      |      | 0.25 | 0.25 | 0.25        | 0.25 |      |      |      |      ||      |      |   addq $4, %rbp
 | 
			
		||||
 248 |      |      |      |      |      |      |      |      | 0.25 | 0.25 | 0.25        | 0.25 |      |      |      |      ||      |      |   cmpq %rdx, %rbp
 | 
			
		||||
 249 |      |      |      |      |      |      |      |      | 0.00 |      |             |      | 1.00 |      |      |      ||      |      |   jb .LBB0_9
 | 
			
		||||
 250 |      |      |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   # LLVM-MCA-END
 | 
			
		||||
 | 
			
		||||
       18.8   18.5   15.9   15.9   4.50   4.50                 0.50   0.50   0.50          0.50          9.00   9.00             72    5.0  
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Loop-Carried Dependencies Analysis Report
 | 
			
		||||
-----------------------------------------
 | 
			
		||||
 244 |  5.0 | vfmadd213pd	%ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13| [244, 246]
 | 
			
		||||
 243 |  5.0 | vfmadd213pd	%ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15| [243, 245]
 | 
			
		||||
 241 |  5.0 | vfmadd213pd	%ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0| [241, 242]
 | 
			
		||||
 247 |  1.0 | addq	$4, %rbp                       | [247]
 | 
			
		||||
 246 |  1.0 | vblendvpd	%ymm6, %ymm4, %ymm13, %ymm13| [246]
 | 
			
		||||
 245 |  1.0 | vblendvpd	%ymm6, %ymm1, %ymm15, %ymm15| [245]
 | 
			
		||||
 242 |  1.0 | vblendvpd	%ymm6, %ymm2, %ymm0, %ymm0| [242]
 | 
			
		||||
 | 
			
		||||
										
											Binary file not shown.
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
										
											Binary file not shown.
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
										
											Binary file not shown.
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
										
											Binary file not shown.
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
										
											Binary file not shown.
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
										
											Binary file not shown.
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
										
											Binary file not shown.
										
									
								
							@@ -1,640 +0,0 @@
 | 
			
		||||
	.text
 | 
			
		||||
	.file	"force_lj.c"
 | 
			
		||||
	.section	.rodata.cst8,"aM",@progbits,8
 | 
			
		||||
	.p2align	3               # -- Begin function computeForceLJFullNeigh_plain_c
 | 
			
		||||
.LCPI0_0:
 | 
			
		||||
	.quad	4631952216750555136     #  48
 | 
			
		||||
.LCPI0_3:
 | 
			
		||||
	.quad	4607182418800017408     #  1
 | 
			
		||||
.LCPI0_4:
 | 
			
		||||
	.quad	-4620693217682128896    #  -0.5
 | 
			
		||||
	.section	.rodata.cst4,"aM",@progbits,4
 | 
			
		||||
	.p2align	2
 | 
			
		||||
.LCPI0_1:
 | 
			
		||||
	.long	3                       # 0x3
 | 
			
		||||
.LCPI0_2:
 | 
			
		||||
	.long	2                       # 0x2
 | 
			
		||||
	.section	.rodata.cst16,"aM",@progbits,16
 | 
			
		||||
	.p2align	4
 | 
			
		||||
.LCPI0_5:
 | 
			
		||||
	.zero	16,255
 | 
			
		||||
	.text
 | 
			
		||||
	.globl	computeForceLJFullNeigh_plain_c
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
	.type	computeForceLJFullNeigh_plain_c,@function
 | 
			
		||||
computeForceLJFullNeigh_plain_c:        # 
 | 
			
		||||
.LcomputeForceLJFullNeigh_plain_c$local:
 | 
			
		||||
	.cfi_startproc
 | 
			
		||||
# %bb.0:                                # 
 | 
			
		||||
	pushq	%rbp
 | 
			
		||||
	.cfi_def_cfa_offset 16
 | 
			
		||||
	pushq	%r15
 | 
			
		||||
	.cfi_def_cfa_offset 24
 | 
			
		||||
	pushq	%r14
 | 
			
		||||
	.cfi_def_cfa_offset 32
 | 
			
		||||
	pushq	%r13
 | 
			
		||||
	.cfi_def_cfa_offset 40
 | 
			
		||||
	pushq	%r12
 | 
			
		||||
	.cfi_def_cfa_offset 48
 | 
			
		||||
	pushq	%rbx
 | 
			
		||||
	.cfi_def_cfa_offset 56
 | 
			
		||||
	subq	$264, %rsp              # imm = 0x108
 | 
			
		||||
	.cfi_def_cfa_offset 320
 | 
			
		||||
	.cfi_offset %rbx, -56
 | 
			
		||||
	.cfi_offset %r12, -48
 | 
			
		||||
	.cfi_offset %r13, -40
 | 
			
		||||
	.cfi_offset %r14, -32
 | 
			
		||||
	.cfi_offset %r15, -24
 | 
			
		||||
	.cfi_offset %rbp, -16
 | 
			
		||||
	movq	%rcx, %rbx
 | 
			
		||||
	movq	%rdx, %r15
 | 
			
		||||
	movq	%rsi, %r12
 | 
			
		||||
	movl	4(%rsi), %r14d
 | 
			
		||||
	vmovsd	144(%rdi), %xmm0        # xmm0 = mem[0],zero
 | 
			
		||||
	vmovsd	%xmm0, (%rsp)           # 8-byte Spill
 | 
			
		||||
	vmovsd	40(%rdi), %xmm0         # xmm0 = mem[0],zero
 | 
			
		||||
	vmovsd	%xmm0, 128(%rsp)        # 8-byte Spill
 | 
			
		||||
	vmovq	56(%rdi), %xmm0         # xmm0 = mem[0],zero
 | 
			
		||||
	vmovdqa	%xmm0, 80(%rsp)         # 16-byte Spill
 | 
			
		||||
	testl	%r14d, %r14d
 | 
			
		||||
	jle	.LBB0_2
 | 
			
		||||
# %bb.1:                                # 
 | 
			
		||||
	movq	64(%r12), %rdi
 | 
			
		||||
	leaq	(,%r14,8), %rax
 | 
			
		||||
	leaq	(%rax,%rax,2), %rdx
 | 
			
		||||
	xorl	%esi, %esi
 | 
			
		||||
	callq	_intel_fast_memset
 | 
			
		||||
.LBB0_2:                                # 
 | 
			
		||||
	xorl	%eax, %eax
 | 
			
		||||
	callq	getTimeStamp
 | 
			
		||||
	vmovq	%xmm0, 32(%rsp)         # 8-byte Folded Spill
 | 
			
		||||
	movl	$.L.str, %edi
 | 
			
		||||
	callq	likwid_markerStartRegion
 | 
			
		||||
	testl	%r14d, %r14d
 | 
			
		||||
	jle	.LBB0_19
 | 
			
		||||
# %bb.3:                                # 
 | 
			
		||||
	vmovsd	(%rsp), %xmm0           # 8-byte Reload
 | 
			
		||||
                                        # xmm0 = mem[0],zero
 | 
			
		||||
	vmulsd	%xmm0, %xmm0, %xmm13
 | 
			
		||||
	movq	16(%r15), %r11
 | 
			
		||||
	movq	24(%r15), %rsi
 | 
			
		||||
	movslq	8(%r15), %rdi
 | 
			
		||||
	movq	16(%r12), %r15
 | 
			
		||||
	movq	64(%r12), %r8
 | 
			
		||||
	vmovsd	128(%rsp), %xmm0        # 8-byte Reload
 | 
			
		||||
                                        # xmm0 = mem[0],zero
 | 
			
		||||
	vmulsd	.LCPI0_0(%rip), %xmm0, %xmm15
 | 
			
		||||
	movq	%rbx, 24(%rsp)          # 8-byte Spill
 | 
			
		||||
	vmovdqu	(%rbx), %xmm14
 | 
			
		||||
	decq	%r14
 | 
			
		||||
	vmovq	%r15, %xmm0
 | 
			
		||||
	vpbroadcastq	%xmm0, %ymm3
 | 
			
		||||
	vbroadcastsd	%xmm13, %ymm2
 | 
			
		||||
	vmovapd	80(%rsp), %xmm12        # 16-byte Reload
 | 
			
		||||
	vbroadcastsd	%xmm12, %ymm8
 | 
			
		||||
	vbroadcastsd	%xmm15, %ymm9
 | 
			
		||||
	shlq	$2, %rdi
 | 
			
		||||
	xorl	%r10d, %r10d
 | 
			
		||||
	movq	%r14, 56(%rsp)          # 8-byte Spill
 | 
			
		||||
	vmovapd	%xmm13, 192(%rsp)       # 16-byte Spill
 | 
			
		||||
	movq	%rsi, 48(%rsp)          # 8-byte Spill
 | 
			
		||||
	movq	%rdi, 40(%rsp)          # 8-byte Spill
 | 
			
		||||
	vmovapd	%xmm15, 176(%rsp)       # 16-byte Spill
 | 
			
		||||
	vmovupd	%ymm2, 224(%rsp)        # 32-byte Spill
 | 
			
		||||
	vmovupd	%ymm9, 128(%rsp)        # 32-byte Spill
 | 
			
		||||
	jmp	.LBB0_6
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
.LBB0_17:                               # 
 | 
			
		||||
                                        #   in Loop: Header=BB0_6 Depth=1
 | 
			
		||||
	movq	%r13, %rdx
 | 
			
		||||
.LBB0_5:                                # 
 | 
			
		||||
                                        #   in Loop: Header=BB0_6 Depth=1
 | 
			
		||||
	vaddsd	(%r8,%r12,8), %xmm10, %xmm0
 | 
			
		||||
	vmovsd	%xmm0, (%r8,%r12,8)
 | 
			
		||||
	vaddsd	(%r8,%rbx,8), %xmm11, %xmm0
 | 
			
		||||
	vmovsd	%xmm0, (%r8,%rbx,8)
 | 
			
		||||
	vaddsd	(%r8,%rbp,8), %xmm5, %xmm0
 | 
			
		||||
	vmovsd	%xmm0, (%r8,%rbp,8)
 | 
			
		||||
	leal	3(%r13), %eax
 | 
			
		||||
	addl	$6, %r13d
 | 
			
		||||
	testl	%eax, %eax
 | 
			
		||||
	cmovnsl	%eax, %r13d
 | 
			
		||||
	sarl	$2, %r13d
 | 
			
		||||
	movslq	%r13d, %rax
 | 
			
		||||
	vmovq	%rax, %xmm0
 | 
			
		||||
	vmovq	%rdx, %xmm1
 | 
			
		||||
	vpunpcklqdq	%xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
 | 
			
		||||
	vpaddq	%xmm0, %xmm14, %xmm14
 | 
			
		||||
	addq	%rdi, %r11
 | 
			
		||||
	cmpq	%r14, %r10
 | 
			
		||||
	leaq	1(%r10), %r10
 | 
			
		||||
	je	.LBB0_18
 | 
			
		||||
.LBB0_6:                                # 
 | 
			
		||||
                                        # =>This Loop Header: Depth=1
 | 
			
		||||
                                        #     Child Loop BB0_9 Depth 2
 | 
			
		||||
                                        #     Child Loop BB0_13 Depth 2
 | 
			
		||||
	movl	(%rsi,%r10,4), %r13d
 | 
			
		||||
	leal	(%r10,%r10,2), %r12d
 | 
			
		||||
	leal	(%r10,%r10,2), %ebx
 | 
			
		||||
	incl	%ebx
 | 
			
		||||
	leal	(%r10,%r10,2), %ebp
 | 
			
		||||
	addl	$2, %ebp
 | 
			
		||||
	testl	%r13d, %r13d
 | 
			
		||||
	jle	.LBB0_4
 | 
			
		||||
# %bb.7:                                # 
 | 
			
		||||
                                        #   in Loop: Header=BB0_6 Depth=1
 | 
			
		||||
	vmovsd	(%r15,%r12,8), %xmm0    # xmm0 = mem[0],zero
 | 
			
		||||
	vmovsd	(%r15,%rbx,8), %xmm1    # xmm1 = mem[0],zero
 | 
			
		||||
	vmovsd	(%r15,%rbp,8), %xmm2    # xmm2 = mem[0],zero
 | 
			
		||||
	movq	%r13, %rdx
 | 
			
		||||
	movl	$4294967292, %eax       # imm = 0xFFFFFFFC
 | 
			
		||||
	andq	%rax, %rdx
 | 
			
		||||
	vmovapd	%xmm0, 112(%rsp)        # 16-byte Spill
 | 
			
		||||
	vmovapd	%xmm1, 96(%rsp)         # 16-byte Spill
 | 
			
		||||
	vmovapd	%xmm2, (%rsp)           # 16-byte Spill
 | 
			
		||||
	je	.LBB0_16
 | 
			
		||||
# %bb.8:                                # 
 | 
			
		||||
                                        #   in Loop: Header=BB0_6 Depth=1
 | 
			
		||||
	movq	%rbp, 64(%rsp)          # 8-byte Spill
 | 
			
		||||
	movq	%rbx, 72(%rsp)          # 8-byte Spill
 | 
			
		||||
	vmovdqa	%xmm14, 208(%rsp)       # 16-byte Spill
 | 
			
		||||
	vbroadcastsd	%xmm0, %ymm14
 | 
			
		||||
	vbroadcastsd	%xmm1, %ymm5
 | 
			
		||||
	vbroadcastsd	%xmm2, %ymm10
 | 
			
		||||
	vxorpd	%xmm0, %xmm0, %xmm0
 | 
			
		||||
	vxorpd	%xmm15, %xmm15, %xmm15
 | 
			
		||||
	vxorpd	%xmm13, %xmm13, %xmm13
 | 
			
		||||
	xorl	%ebp, %ebp
 | 
			
		||||
	vmovapd	%ymm8, %ymm9
 | 
			
		||||
	vmovupd	224(%rsp), %ymm8        # 32-byte Reload
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
movl      $111, %ebx # OSACA START MARKER
 | 
			
		||||
.byte     100        # OSACA START MARKER
 | 
			
		||||
.byte     103        # OSACA START MARKER
 | 
			
		||||
.byte     144        # OSACA START MARKER
 | 
			
		||||
# pointer_increment=16 e95035fc9e97f63299dd5188a0872bfc
 | 
			
		||||
# LLVM-MCA-BEGIN
 | 
			
		||||
.LBB0_9:                                # 
 | 
			
		||||
                                        #   Parent Loop BB0_6 Depth=1
 | 
			
		||||
                                        # =>  This Inner Loop Header: Depth=2
 | 
			
		||||
	vpbroadcastd	.LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
 | 
			
		||||
	vpmulld	(%r11,%rbp,4), %xmm1, %xmm11
 | 
			
		||||
	vpmovsxdq	%xmm11, %ymm1
 | 
			
		||||
	vpsllq	$3, %ymm1, %ymm1
 | 
			
		||||
	vpaddq	%ymm1, %ymm3, %ymm1
 | 
			
		||||
	vmovq	%xmm1, %r14
 | 
			
		||||
	vpextrq	$1, %xmm1, %r9
 | 
			
		||||
	vextracti128	$1, %ymm1, %xmm1
 | 
			
		||||
	vmovsd	(%r14), %xmm2           # xmm2 = mem[0],zero
 | 
			
		||||
	vpsubd	.LCPI0_5, %xmm11, %xmm6
 | 
			
		||||
	vpmovsxdq	%xmm6, %ymm6
 | 
			
		||||
	vpsllq	$3, %ymm6, %ymm6
 | 
			
		||||
	vmovq	%xmm1, %rdi
 | 
			
		||||
	vpaddq	%ymm6, %ymm3, %ymm6
 | 
			
		||||
	vmovq	%xmm6, %rcx
 | 
			
		||||
	vpextrq	$1, %xmm1, %rbx
 | 
			
		||||
	vpextrq	$1, %xmm6, %rax
 | 
			
		||||
	vextracti128	$1, %ymm6, %xmm1
 | 
			
		||||
	vmovsd	(%rdi), %xmm6           # xmm6 = mem[0],zero
 | 
			
		||||
	vmovq	%xmm1, %rdi
 | 
			
		||||
	vpextrq	$1, %xmm1, %rsi
 | 
			
		||||
	vmovsd	(%rdi), %xmm1           # xmm1 = mem[0],zero
 | 
			
		||||
	vmovsd	(%rcx), %xmm7           # xmm7 = mem[0],zero
 | 
			
		||||
	vpbroadcastd	.LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
 | 
			
		||||
	vmovhpd	(%r9), %xmm2, %xmm2     # xmm2 = xmm2[0],mem[0]
 | 
			
		||||
	vpaddd	%xmm12, %xmm11, %xmm4
 | 
			
		||||
	vpmovsxdq	%xmm4, %ymm4
 | 
			
		||||
	vmovhpd	(%rax), %xmm7, %xmm7    # xmm7 = xmm7[0],mem[0]
 | 
			
		||||
	vpsllq	$3, %ymm4, %ymm4
 | 
			
		||||
	vpaddq	%ymm4, %ymm3, %ymm4
 | 
			
		||||
	vmovhpd	(%rbx), %xmm6, %xmm6    # xmm6 = xmm6[0],mem[0]
 | 
			
		||||
	vpextrq	$1, %xmm4, %rax
 | 
			
		||||
	vmovhpd	(%rsi), %xmm1, %xmm1    # xmm1 = xmm1[0],mem[0]
 | 
			
		||||
	vmovq	%xmm4, %rcx
 | 
			
		||||
	vextracti128	$1, %ymm4, %xmm4
 | 
			
		||||
	vmovq	%xmm4, %rsi
 | 
			
		||||
	vinsertf128	$1, %xmm6, %ymm2, %ymm2
 | 
			
		||||
	vpextrq	$1, %xmm4, %rdi
 | 
			
		||||
	vmovsd	(%rsi), %xmm4           # xmm4 = mem[0],zero
 | 
			
		||||
	vsubpd	%ymm2, %ymm14, %ymm2
 | 
			
		||||
	vmovhpd	(%rdi), %xmm4, %xmm4    # xmm4 = xmm4[0],mem[0]
 | 
			
		||||
	vmovsd	(%rcx), %xmm6           # xmm6 = mem[0],zero
 | 
			
		||||
	vinsertf128	$1, %xmm1, %ymm7, %ymm1
 | 
			
		||||
	vmovhpd	(%rax), %xmm6, %xmm6    # xmm6 = xmm6[0],mem[0]
 | 
			
		||||
	vinsertf128	$1, %xmm4, %ymm6, %ymm4
 | 
			
		||||
	vsubpd	%ymm1, %ymm5, %ymm1
 | 
			
		||||
	vsubpd	%ymm4, %ymm10, %ymm4
 | 
			
		||||
	vmulpd	%ymm2, %ymm2, %ymm6
 | 
			
		||||
	vfmadd231pd	%ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
 | 
			
		||||
	vfmadd231pd	%ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
 | 
			
		||||
	vbroadcastsd	.LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 | 
			
		||||
	vdivpd	%ymm6, %ymm7, %ymm7
 | 
			
		||||
	vmulpd	%ymm7, %ymm7, %ymm11
 | 
			
		||||
	vmulpd	%ymm9, %ymm11, %ymm11
 | 
			
		||||
	vbroadcastsd	.LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 | 
			
		||||
	vmulpd	%ymm7, %ymm11, %ymm11
 | 
			
		||||
	vaddpd	%ymm12, %ymm11, %ymm12
 | 
			
		||||
	vmulpd	128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
 | 
			
		||||
	vmulpd	%ymm7, %ymm11, %ymm7
 | 
			
		||||
	vmulpd	%ymm7, %ymm12, %ymm7
 | 
			
		||||
	vcmpltpd	%ymm8, %ymm6, %ymm6
 | 
			
		||||
	vfmadd213pd	%ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
 | 
			
		||||
	vblendvpd	%ymm6, %ymm2, %ymm0, %ymm0
 | 
			
		||||
	vfmadd213pd	%ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
 | 
			
		||||
	vfmadd213pd	%ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
 | 
			
		||||
	vblendvpd	%ymm6, %ymm1, %ymm15, %ymm15
 | 
			
		||||
	vblendvpd	%ymm6, %ymm4, %ymm13, %ymm13
 | 
			
		||||
	addq	$4, %rbp
 | 
			
		||||
	cmpq	%rdx, %rbp
 | 
			
		||||
	jb	.LBB0_9
 | 
			
		||||
# LLVM-MCA-END
 | 
			
		||||
movl      $222, %ebx # OSACA END MARKER
 | 
			
		||||
.byte     100        # OSACA END MARKER
 | 
			
		||||
.byte     103        # OSACA END MARKER
 | 
			
		||||
.byte     144        # OSACA END MARKER
 | 
			
		||||
# %bb.10:                               # 
 | 
			
		||||
                                        #   in Loop: Header=BB0_6 Depth=1
 | 
			
		||||
	vpermilpd	$1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
 | 
			
		||||
	vaddsd	%xmm1, %xmm0, %xmm1
 | 
			
		||||
	vextractf128	$1, %ymm0, %xmm0
 | 
			
		||||
	vaddsd	%xmm0, %xmm1, %xmm1
 | 
			
		||||
	vpermilpd	$1, %xmm0, %xmm0 # xmm0 = xmm0[1,0]
 | 
			
		||||
	vaddsd	%xmm0, %xmm1, %xmm10
 | 
			
		||||
	vpermilpd	$1, %xmm15, %xmm1 # xmm1 = xmm15[1,0]
 | 
			
		||||
	vaddsd	%xmm1, %xmm15, %xmm1
 | 
			
		||||
	vextractf128	$1, %ymm15, %xmm2
 | 
			
		||||
	vaddsd	%xmm2, %xmm1, %xmm1
 | 
			
		||||
	vpermilpd	$1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
 | 
			
		||||
	vaddsd	%xmm2, %xmm1, %xmm11
 | 
			
		||||
	vpermilpd	$1, %xmm13, %xmm1 # xmm1 = xmm13[1,0]
 | 
			
		||||
	vaddsd	%xmm1, %xmm13, %xmm1
 | 
			
		||||
	vextractf128	$1, %ymm13, %xmm2
 | 
			
		||||
	vaddsd	%xmm2, %xmm1, %xmm1
 | 
			
		||||
	vpermilpd	$1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
 | 
			
		||||
	vaddsd	%xmm2, %xmm1, %xmm5
 | 
			
		||||
	movq	56(%rsp), %r14          # 8-byte Reload
 | 
			
		||||
	vmovapd	80(%rsp), %xmm12        # 16-byte Reload
 | 
			
		||||
	vmovapd	192(%rsp), %xmm13       # 16-byte Reload
 | 
			
		||||
	movq	48(%rsp), %rsi          # 8-byte Reload
 | 
			
		||||
	movq	40(%rsp), %rdi          # 8-byte Reload
 | 
			
		||||
	vmovdqa	208(%rsp), %xmm14       # 16-byte Reload
 | 
			
		||||
	vmovapd	176(%rsp), %xmm15       # 16-byte Reload
 | 
			
		||||
	vmovapd	%ymm9, %ymm8
 | 
			
		||||
	movq	72(%rsp), %rbx          # 8-byte Reload
 | 
			
		||||
	movq	64(%rsp), %rbp          # 8-byte Reload
 | 
			
		||||
	vmovapd	112(%rsp), %xmm0        # 16-byte Reload
 | 
			
		||||
	cmpq	%r13, %rdx
 | 
			
		||||
	jae	.LBB0_17
 | 
			
		||||
	jmp	.LBB0_11
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
.LBB0_4:                                # 
 | 
			
		||||
                                        #   in Loop: Header=BB0_6 Depth=1
 | 
			
		||||
	movslq	%r13d, %rdx
 | 
			
		||||
	vxorpd	%xmm5, %xmm5, %xmm5
 | 
			
		||||
	vxorpd	%xmm11, %xmm11, %xmm11
 | 
			
		||||
	vxorpd	%xmm10, %xmm10, %xmm10
 | 
			
		||||
	jmp	.LBB0_5
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
.LBB0_16:                               # 
 | 
			
		||||
                                        #   in Loop: Header=BB0_6 Depth=1
 | 
			
		||||
	vxorpd	%xmm10, %xmm10, %xmm10
 | 
			
		||||
	vxorpd	%xmm11, %xmm11, %xmm11
 | 
			
		||||
	vxorpd	%xmm5, %xmm5, %xmm5
 | 
			
		||||
	cmpq	%r13, %rdx
 | 
			
		||||
	jae	.LBB0_17
 | 
			
		||||
.LBB0_11:                               # 
 | 
			
		||||
                                        #   in Loop: Header=BB0_6 Depth=1
 | 
			
		||||
	vmovapd	96(%rsp), %xmm4         # 16-byte Reload
 | 
			
		||||
	jmp	.LBB0_13
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
.LBB0_12:                               # 
 | 
			
		||||
                                        #   in Loop: Header=BB0_13 Depth=2
 | 
			
		||||
	incq	%rdx
 | 
			
		||||
	cmpq	%rdx, %r13
 | 
			
		||||
	je	.LBB0_17
 | 
			
		||||
.LBB0_13:                               # 
 | 
			
		||||
                                        #   Parent Loop BB0_6 Depth=1
 | 
			
		||||
                                        # =>  This Inner Loop Header: Depth=2
 | 
			
		||||
	movl	(%r11,%rdx,4), %eax
 | 
			
		||||
	leal	(%rax,%rax,2), %ecx
 | 
			
		||||
	movslq	%ecx, %rcx
 | 
			
		||||
	vsubsd	(%r15,%rcx,8), %xmm0, %xmm6
 | 
			
		||||
	leal	(%rax,%rax,2), %ecx
 | 
			
		||||
	incl	%ecx
 | 
			
		||||
	movslq	%ecx, %rcx
 | 
			
		||||
	vsubsd	(%r15,%rcx,8), %xmm4, %xmm2
 | 
			
		||||
	leal	2(%rax,%rax,2), %eax
 | 
			
		||||
	cltq
 | 
			
		||||
	vmovapd	(%rsp), %xmm1           # 16-byte Reload
 | 
			
		||||
	vsubsd	(%r15,%rax,8), %xmm1, %xmm1
 | 
			
		||||
	vmulsd	%xmm6, %xmm6, %xmm7
 | 
			
		||||
	vfmadd231sd	%xmm2, %xmm2, %xmm7 # xmm7 = (xmm2 * xmm2) + xmm7
 | 
			
		||||
	vfmadd231sd	%xmm1, %xmm1, %xmm7 # xmm7 = (xmm1 * xmm1) + xmm7
 | 
			
		||||
	vucomisd	%xmm13, %xmm7
 | 
			
		||||
	jae	.LBB0_12
 | 
			
		||||
# %bb.14:                               # 
 | 
			
		||||
                                        #   in Loop: Header=BB0_13 Depth=2
 | 
			
		||||
	vmovsd	.LCPI0_3(%rip), %xmm0   # xmm0 = mem[0],zero
 | 
			
		||||
	vdivsd	%xmm7, %xmm0, %xmm7
 | 
			
		||||
	vmulsd	%xmm7, %xmm7, %xmm0
 | 
			
		||||
	vmulsd	%xmm0, %xmm12, %xmm0
 | 
			
		||||
	vmulsd	%xmm7, %xmm0, %xmm0
 | 
			
		||||
	vaddsd	.LCPI0_4(%rip), %xmm0, %xmm4
 | 
			
		||||
	vmulsd	%xmm7, %xmm15, %xmm7
 | 
			
		||||
	vmulsd	%xmm0, %xmm7, %xmm0
 | 
			
		||||
	vmulsd	%xmm4, %xmm0, %xmm0
 | 
			
		||||
	vmovapd	96(%rsp), %xmm4         # 16-byte Reload
 | 
			
		||||
	vfmadd231sd	%xmm6, %xmm0, %xmm10 # xmm10 = (xmm0 * xmm6) + xmm10
 | 
			
		||||
	vfmadd231sd	%xmm2, %xmm0, %xmm11 # xmm11 = (xmm0 * xmm2) + xmm11
 | 
			
		||||
	vfmadd231sd	%xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5
 | 
			
		||||
	vmovapd	112(%rsp), %xmm0        # 16-byte Reload
 | 
			
		||||
	jmp	.LBB0_12
 | 
			
		||||
.LBB0_18:                               # 
 | 
			
		||||
	movq	24(%rsp), %rax          # 8-byte Reload
 | 
			
		||||
	vmovdqu	%xmm14, (%rax)
 | 
			
		||||
.LBB0_19:                               # 
 | 
			
		||||
	movl	$.L.str, %edi
 | 
			
		||||
	vzeroupper
 | 
			
		||||
	callq	likwid_markerStopRegion
 | 
			
		||||
	xorl	%eax, %eax
 | 
			
		||||
	callq	getTimeStamp
 | 
			
		||||
	vsubsd	32(%rsp), %xmm0, %xmm0  # 8-byte Folded Reload
 | 
			
		||||
	addq	$264, %rsp              # imm = 0x108
 | 
			
		||||
	.cfi_def_cfa_offset 56
 | 
			
		||||
	popq	%rbx
 | 
			
		||||
	.cfi_def_cfa_offset 48
 | 
			
		||||
	popq	%r12
 | 
			
		||||
	.cfi_def_cfa_offset 40
 | 
			
		||||
	popq	%r13
 | 
			
		||||
	.cfi_def_cfa_offset 32
 | 
			
		||||
	popq	%r14
 | 
			
		||||
	.cfi_def_cfa_offset 24
 | 
			
		||||
	popq	%r15
 | 
			
		||||
	.cfi_def_cfa_offset 16
 | 
			
		||||
	popq	%rbp
 | 
			
		||||
	.cfi_def_cfa_offset 8
 | 
			
		||||
	retq
 | 
			
		||||
.Lfunc_end0:
 | 
			
		||||
	.size	computeForceLJFullNeigh_plain_c, .Lfunc_end0-computeForceLJFullNeigh_plain_c
 | 
			
		||||
	.cfi_endproc
 | 
			
		||||
                                        # -- End function
 | 
			
		||||
	.section	.rodata.cst8,"aM",@progbits,8
 | 
			
		||||
	.p2align	3               # -- Begin function computeForceLJHalfNeigh
 | 
			
		||||
# 8-byte constants read RIP-relative by computeForceLJHalfNeigh; each .quad is
# the bit pattern of an IEEE-754 double written as a signed 64-bit integer.
.LCPI1_0:
 | 
			
		||||
	.quad	4631952216750555136     #  48   (0x4048000000000000)
 | 
			
		||||
.LCPI1_1:
 | 
			
		||||
	.quad	4607182418800017408     #  1    (0x3FF0000000000000)
 | 
			
		||||
.LCPI1_2:
 | 
			
		||||
	.quad	-4620693217682128896    #  -0.5 (0xBFE0000000000000)
 | 
			
		||||
	.text
 | 
			
		||||
	.globl	computeForceLJHalfNeigh
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
	.type	computeForceLJHalfNeigh,@function
 | 
			
		||||
#-----------------------------------------------------------------------
# computeForceLJHalfNeigh  (compiler-generated, AT&T syntax, SysV AMD64)
#
# In:   rdi = struct A; doubles read at offsets 144, 40 and 56
#             (presumably cutoff, epsilon, sigma^6 -- TODO confirm names)
#       rsi = struct B; 32-bit count n at +4, double* at +16 (read-only
#             triples) and double* at +64 (accumulated triples)
#       rdx = struct C; 32-bit row length at +8, int* index rows at +16,
#             int* per-row counts at +24
#       rcx = pointer to two 64-bit counters, updated in place
# Out:  xmm0 = getTimeStamp() after the loop minus getTimeStamp() before
# Frame: 6 callee-saved pushes + 40 bytes of spill slots (rsp 16-aligned
#        at every call).  Spill slots (%rsp) and 8(%rsp) are reused for
#        different values after the marker-start call.
#-----------------------------------------------------------------------
computeForceLJHalfNeigh:                # 
 | 
			
		||||
.LcomputeForceLJHalfNeigh$local:
 | 
			
		||||
	.cfi_startproc
 | 
			
		||||
# %bb.0:                                # prologue: save callee-saved regs, spill arguments
 | 
			
		||||
	pushq	%rbp
 | 
			
		||||
	.cfi_def_cfa_offset 16
 | 
			
		||||
	pushq	%r15
 | 
			
		||||
	.cfi_def_cfa_offset 24
 | 
			
		||||
	pushq	%r14
 | 
			
		||||
	.cfi_def_cfa_offset 32
 | 
			
		||||
	pushq	%r13
 | 
			
		||||
	.cfi_def_cfa_offset 40
 | 
			
		||||
	pushq	%r12
 | 
			
		||||
	.cfi_def_cfa_offset 48
 | 
			
		||||
	pushq	%rbx
 | 
			
		||||
	.cfi_def_cfa_offset 56
 | 
			
		||||
	subq	$40, %rsp
 | 
			
		||||
	.cfi_def_cfa_offset 96
 | 
			
		||||
	.cfi_offset %rbx, -56
 | 
			
		||||
	.cfi_offset %r12, -48
 | 
			
		||||
	.cfi_offset %r13, -40
 | 
			
		||||
	.cfi_offset %r14, -32
 | 
			
		||||
	.cfi_offset %r15, -24
 | 
			
		||||
	.cfi_offset %rbp, -16
 | 
			
		||||
	movq	%rcx, 16(%rsp)          # 8-byte Spill (arg 4: counter pointer)
 | 
			
		||||
	movq	%rdx, %r15              # r15 = arg 3, kept across the calls below
 | 
			
		||||
	movq	%rsi, %r12              # r12 = arg 2, kept across the calls below
 | 
			
		||||
	movl	4(%rsi), %r13d          # r13 = n (32-bit load zero-extends into r13)
 | 
			
		||||
	vmovsd	144(%rdi), %xmm0        # xmm0 = mem[0],zero
 | 
			
		||||
	vmovsd	%xmm0, 8(%rsp)          # 8-byte Spill
 | 
			
		||||
	vmovsd	40(%rdi), %xmm0         # xmm0 = mem[0],zero
 | 
			
		||||
	vmovsd	%xmm0, (%rsp)           # 8-byte Spill
 | 
			
		||||
	vmovsd	56(%rdi), %xmm0         # xmm0 = mem[0],zero
 | 
			
		||||
	vmovsd	%xmm0, 32(%rsp)         # 8-byte Spill
 | 
			
		||||
	testl	%r13d, %r13d
 | 
			
		||||
	jle	.LBB1_2                 # n <= 0: nothing to clear
 | 
			
		||||
# %bb.1:                                # zero the first 3*n doubles of the +64 array
 | 
			
		||||
	movq	64(%r12), %rdi
 | 
			
		||||
	leaq	(,%r13,8), %rax         # rax = 8*n
 | 
			
		||||
	leaq	(%rax,%rax,2), %rdx     # rdx = 24*n bytes
 | 
			
		||||
	xorl	%esi, %esi              # fill value 0
 | 
			
		||||
	callq	_intel_fast_memset
 | 
			
		||||
.LBB1_2:                                # take start timestamp and open the marker region
 | 
			
		||||
	xorl	%eax, %eax
 | 
			
		||||
	callq	getTimeStamp
 | 
			
		||||
	vmovsd	%xmm0, 24(%rsp)         # 8-byte Spill (start timestamp)
 | 
			
		||||
	movl	$.L.str.1, %edi         # NOTE(review): 32-bit absolute address; a PIE link would need leaq .L.str.1(%rip)
 | 
			
		||||
	callq	likwid_markerStartRegion
 | 
			
		||||
	testl	%r13d, %r13d
 | 
			
		||||
	jle	.LBB1_8                 # n <= 0: skip the whole loop nest
 | 
			
		||||
# %bb.3:                                # loop-invariant setup
 | 
			
		||||
	vmovsd	8(%rsp), %xmm0          # 8-byte Reload
 | 
			
		||||
                                        # xmm0 = mem[0],zero
 | 
			
		||||
	vmulsd	%xmm0, %xmm0, %xmm12    # xmm12 = (double at A+144)^2, the squared-distance threshold
 | 
			
		||||
	movq	16(%r15), %rax          # rax = current row of 32-bit indices
 | 
			
		||||
	movq	24(%r15), %rcx
 | 
			
		||||
	movq	%rcx, 8(%rsp)           # 8-byte Spill (slot reused: per-row count array)
 | 
			
		||||
	movslq	8(%r15), %rdx
 | 
			
		||||
	movq	16(%r12), %rsi          # rsi = read-only double array (3 per index)
 | 
			
		||||
	movq	64(%r12), %rdi          # rdi = accumulated double array (3 per index)
 | 
			
		||||
	vmovsd	(%rsp), %xmm0           # 8-byte Reload
 | 
			
		||||
                                        # xmm0 = mem[0],zero
 | 
			
		||||
	vmulsd	.LCPI1_0(%rip), %xmm0, %xmm11 # xmm11 = 48 * (double at A+40)
 | 
			
		||||
	movq	16(%rsp), %rcx          # 8-byte Reload
 | 
			
		||||
	vmovdqu	(%rcx), %xmm10          # xmm10 = the two 64-bit counters
 | 
			
		||||
	shlq	$2, %rdx                # row stride in bytes = 4 * (int at C+8)
 | 
			
		||||
	movq	%rdx, (%rsp)            # 8-byte Spill (slot reused: row stride)
 | 
			
		||||
	xorl	%r12d, %r12d            # r12 = i = 0
 | 
			
		||||
	jmp	.LBB1_4
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
.LBB1_5:                                # row count <= 0: zero partial sums
 | 
			
		||||
                                        #   in Loop: Header=BB1_4 Depth=1
 | 
			
		||||
	vxorpd	%xmm13, %xmm13, %xmm13
 | 
			
		||||
	movq	%r9, %rdx               # rdx = sign-extended row count for the counter update
 | 
			
		||||
	vxorpd	%xmm9, %xmm9, %xmm9
 | 
			
		||||
	vxorpd	%xmm14, %xmm14, %xmm14
 | 
			
		||||
.LBB1_6:                                # add partial sums to entry i, update counters
 | 
			
		||||
                                        #   in Loop: Header=BB1_4 Depth=1
 | 
			
		||||
	vaddsd	(%rdi,%r15,8), %xmm14, %xmm0
 | 
			
		||||
	vmovsd	%xmm0, (%rdi,%r15,8)
 | 
			
		||||
	vaddsd	(%rdi,%r10,8), %xmm9, %xmm0
 | 
			
		||||
	vmovsd	%xmm0, (%rdi,%r10,8)
 | 
			
		||||
	vaddsd	(%rdi,%r11,8), %xmm13, %xmm0
 | 
			
		||||
	vmovsd	%xmm0, (%rdi,%r11,8)
 | 
			
		||||
	leal	3(%r9), %ecx            # next 5 instructions: r9d = (count + 3) / 4, signed, truncating
 | 
			
		||||
	addl	$6, %r9d
 | 
			
		||||
	testl	%ecx, %ecx
 | 
			
		||||
	cmovnsl	%ecx, %r9d
 | 
			
		||||
	sarl	$2, %r9d
 | 
			
		||||
	movslq	%r9d, %rcx
 | 
			
		||||
	vmovq	%rcx, %xmm0
 | 
			
		||||
	vmovq	%rdx, %xmm1
 | 
			
		||||
	vpunpcklqdq	%xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
 | 
			
		||||
	vpaddq	%xmm0, %xmm10, %xmm10   # counters += { count, (count + 3) / 4 }
 | 
			
		||||
	incq	%r12                    # ++i
 | 
			
		||||
	addq	(%rsp), %rax            # 8-byte Folded Reload
 | 
			
		||||
	cmpq	%r13, %r12
 | 
			
		||||
	je	.LBB1_7                 # i == n: done
 | 
			
		||||
.LBB1_4:                                # outer loop over i in [0, n)
 | 
			
		||||
                                        # =>This Loop Header: Depth=1
 | 
			
		||||
                                        #     Child Loop BB1_10 Depth 2
 | 
			
		||||
	movq	8(%rsp), %rcx           # 8-byte Reload
 | 
			
		||||
	movslq	(%rcx,%r12,4), %r9      # r9 = row count for i
 | 
			
		||||
	leaq	(%r12,%r12,2), %rcx     # rcx = 3*i
 | 
			
		||||
	leal	1(%rcx), %r10d          # r10 = 3*i + 1
 | 
			
		||||
	leal	2(%rcx), %r11d          # r11 = 3*i + 2
 | 
			
		||||
	movl	%ecx, %r15d             # r15 = 3*i
 | 
			
		||||
	testq	%r9, %r9
 | 
			
		||||
	jle	.LBB1_5
 | 
			
		||||
# %bb.9:                                # load triple i and clear partial sums
 | 
			
		||||
                                        #   in Loop: Header=BB1_4 Depth=1
 | 
			
		||||
	vmovsd	(%rsi,%r15,8), %xmm15   # xmm15 = mem[0],zero
 | 
			
		||||
	vmovsd	(%rsi,%r10,8), %xmm4    # xmm4 = mem[0],zero
 | 
			
		||||
	vmovsd	(%rsi,%r11,8), %xmm1    # xmm1 = mem[0],zero
 | 
			
		||||
	movl	%r9d, %edx              # rdx = row count (inner trip count)
 | 
			
		||||
	vxorpd	%xmm14, %xmm14, %xmm14
 | 
			
		||||
	xorl	%ecx, %ecx              # rcx = k = 0
 | 
			
		||||
	vxorpd	%xmm9, %xmm9, %xmm9
 | 
			
		||||
	vxorpd	%xmm13, %xmm13, %xmm13
 | 
			
		||||
	jmp	.LBB1_10
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
.LBB1_13:                               # inner-loop latch: ++k
 | 
			
		||||
                                        #   in Loop: Header=BB1_10 Depth=2
 | 
			
		||||
	incq	%rcx
 | 
			
		||||
	cmpq	%rcx, %rdx
 | 
			
		||||
	je	.LBB1_6
 | 
			
		||||
.LBB1_10:                               # inner loop over the k-th index j of row i
 | 
			
		||||
                                        #   Parent Loop BB1_4 Depth=1
 | 
			
		||||
                                        # =>  This Inner Loop Header: Depth=2
 | 
			
		||||
	movslq	(%rax,%rcx,4), %r8      # r8 = j
 | 
			
		||||
	leaq	(%r8,%r8,2), %r14       # r14 = 3*j
 | 
			
		||||
	vsubsd	(%rsi,%r14,8), %xmm15, %xmm2 # xmm2 = d0 = in[3i]   - in[3j]
 | 
			
		||||
	movslq	%r14d, %rbp
 | 
			
		||||
	vsubsd	8(%rsi,%rbp,8), %xmm4, %xmm5 # xmm5 = d1 = in[3i+1] - in[3j+1]
 | 
			
		||||
	vsubsd	16(%rsi,%rbp,8), %xmm1, %xmm0 # xmm0 = d2 = in[3i+2] - in[3j+2]
 | 
			
		||||
	vmulsd	%xmm2, %xmm2, %xmm6
 | 
			
		||||
	vfmadd231sd	%xmm5, %xmm5, %xmm6 # xmm6 = (xmm5 * xmm5) + xmm6
 | 
			
		||||
	vfmadd231sd	%xmm0, %xmm0, %xmm6 # xmm6 = (xmm0 * xmm0) + xmm6
 | 
			
		||||
	vucomisd	%xmm12, %xmm6
 | 
			
		||||
	jae	.LBB1_13                # squared distance >= threshold: skip this pair
 | 
			
		||||
# %bb.11:                               # s = 1/dist2; f = 48*c40 * s * (c56*s^3) * (c56*s^3 - 0.5)
 | 
			
		||||
                                        #   in Loop: Header=BB1_10 Depth=2
 | 
			
		||||
	vmovsd	.LCPI1_1(%rip), %xmm3   # xmm3 = mem[0],zero
 | 
			
		||||
	vdivsd	%xmm6, %xmm3, %xmm6     # xmm6 = s = 1 / dist2
 | 
			
		||||
	vmulsd	32(%rsp), %xmm6, %xmm3  # 8-byte Folded Reload
 | 
			
		||||
	vmulsd	%xmm6, %xmm6, %xmm8
 | 
			
		||||
	vmulsd	%xmm3, %xmm8, %xmm3     # xmm3 = c56 * s^3
 | 
			
		||||
	vaddsd	.LCPI1_2(%rip), %xmm3, %xmm7 # xmm7 = c56*s^3 - 0.5
 | 
			
		||||
	vmulsd	%xmm6, %xmm11, %xmm6
 | 
			
		||||
	vmulsd	%xmm3, %xmm6, %xmm3
 | 
			
		||||
	vmulsd	%xmm7, %xmm3, %xmm3     # xmm3 = f
 | 
			
		||||
	vmulsd	%xmm2, %xmm3, %xmm6     # xmm6 = f * d0
 | 
			
		||||
	vaddsd	%xmm6, %xmm14, %xmm14
 | 
			
		||||
	vmulsd	%xmm5, %xmm3, %xmm2     # xmm2 = f * d1
 | 
			
		||||
	vaddsd	%xmm2, %xmm9, %xmm9
 | 
			
		||||
	vmulsd	%xmm0, %xmm3, %xmm0     # xmm0 = f * d2
 | 
			
		||||
	vaddsd	%xmm0, %xmm13, %xmm13
 | 
			
		||||
	cmpl	%r13d, %r8d
 | 
			
		||||
	jge	.LBB1_13                # j >= n (signed): entry j is not updated
 | 
			
		||||
# %bb.12:                               # subtract the same contribution from entry j
 | 
			
		||||
                                        #   in Loop: Header=BB1_10 Depth=2
 | 
			
		||||
	leaq	1(%rbp), %rbx
 | 
			
		||||
	addq	$2, %rbp
 | 
			
		||||
	vmovsd	(%rdi,%r14,8), %xmm3    # xmm3 = mem[0],zero
 | 
			
		||||
	vsubsd	%xmm6, %xmm3, %xmm3
 | 
			
		||||
	vmovsd	%xmm3, (%rdi,%r14,8)
 | 
			
		||||
	vmovsd	(%rdi,%rbx,8), %xmm3    # xmm3 = mem[0],zero
 | 
			
		||||
	vsubsd	%xmm2, %xmm3, %xmm2
 | 
			
		||||
	vmovsd	%xmm2, (%rdi,%rbx,8)
 | 
			
		||||
	vmovsd	(%rdi,%rbp,8), %xmm2    # xmm2 = mem[0],zero
 | 
			
		||||
	vsubsd	%xmm0, %xmm2, %xmm0
 | 
			
		||||
	vmovsd	%xmm0, (%rdi,%rbp,8)
 | 
			
		||||
	jmp	.LBB1_13
 | 
			
		||||
.LBB1_7:                                # write the two counters back through arg 4
 | 
			
		||||
	movq	16(%rsp), %rax          # 8-byte Reload
 | 
			
		||||
	vmovdqu	%xmm10, (%rax)
 | 
			
		||||
.LBB1_8:                                # close the marker region, return elapsed time
 | 
			
		||||
	movl	$.L.str.1, %edi
 | 
			
		||||
	callq	likwid_markerStopRegion
 | 
			
		||||
	xorl	%eax, %eax
 | 
			
		||||
	callq	getTimeStamp
 | 
			
		||||
	vsubsd	24(%rsp), %xmm0, %xmm0  # 8-byte Folded Reload
 | 
			
		||||
	addq	$40, %rsp
 | 
			
		||||
	.cfi_def_cfa_offset 56
 | 
			
		||||
	popq	%rbx
 | 
			
		||||
	.cfi_def_cfa_offset 48
 | 
			
		||||
	popq	%r12
 | 
			
		||||
	.cfi_def_cfa_offset 40
 | 
			
		||||
	popq	%r13
 | 
			
		||||
	.cfi_def_cfa_offset 32
 | 
			
		||||
	popq	%r14
 | 
			
		||||
	.cfi_def_cfa_offset 24
 | 
			
		||||
	popq	%r15
 | 
			
		||||
	.cfi_def_cfa_offset 16
 | 
			
		||||
	popq	%rbp
 | 
			
		||||
	.cfi_def_cfa_offset 8
 | 
			
		||||
	retq
 | 
			
		||||
.Lfunc_end1:
 | 
			
		||||
	.size	computeForceLJHalfNeigh, .Lfunc_end1-computeForceLJHalfNeigh
 | 
			
		||||
	.cfi_endproc
 | 
			
		||||
                                        # -- End function
 | 
			
		||||
	.globl	computeForceLJFullNeigh_simd # -- Begin function computeForceLJFullNeigh_simd
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
	.type	computeForceLJFullNeigh_simd,@function
 | 
			
		||||
#-----------------------------------------------------------------------
# computeForceLJFullNeigh_simd  (compiler-generated, AT&T syntax)
#
# Stub for this build: it clears the array at 64(arg2), takes a timestamp
# whose result is discarded, writes the 65-byte message .L.str.2 to
# stderr and calls exit(-1).  There is no return path.
# In:   rsi = struct with a 32-bit count at +4 and a pointer at +64
#-----------------------------------------------------------------------
computeForceLJFullNeigh_simd:           # 
 | 
			
		||||
.LcomputeForceLJFullNeigh_simd$local:
 | 
			
		||||
	.cfi_startproc
 | 
			
		||||
# %bb.0:                                # 
 | 
			
		||||
	pushq	%rax                    # 8-byte push only to make rsp 16-byte aligned for the calls below
 | 
			
		||||
	.cfi_def_cfa_offset 16
 | 
			
		||||
	movl	4(%rsi), %eax           # eax = count (zero-extends into rax)
 | 
			
		||||
	testl	%eax, %eax
 | 
			
		||||
	jle	.LBB2_2                 # count <= 0: nothing to clear
 | 
			
		||||
# %bb.1:                                # memset(ptr at +64, 0, 24 * count)
 | 
			
		||||
	movq	64(%rsi), %rdi
 | 
			
		||||
	shlq	$3, %rax                # rax = 8 * count
 | 
			
		||||
	leaq	(%rax,%rax,2), %rdx     # rdx = 24 * count bytes
 | 
			
		||||
	xorl	%esi, %esi              # fill value 0
 | 
			
		||||
	callq	_intel_fast_memset
 | 
			
		||||
.LBB2_2:                                # report the missing kernel and terminate
 | 
			
		||||
	xorl	%eax, %eax
 | 
			
		||||
	callq	getTimeStamp            # return value is not used
 | 
			
		||||
	movq	stderr(%rip), %rcx      # fwrite arg 4: stream
 | 
			
		||||
	movl	$.L.str.2, %edi         # fwrite arg 1: message (32-bit absolute address)
 | 
			
		||||
	movl	$65, %esi               # fwrite arg 2: 65 = message length without the NUL
 | 
			
		||||
	movl	$1, %edx                # fwrite arg 3: one item
 | 
			
		||||
	callq	fwrite
 | 
			
		||||
	movl	$-1, %edi
 | 
			
		||||
	callq	exit                    # no instruction follows this call
 | 
			
		||||
.Lfunc_end2:
 | 
			
		||||
	.size	computeForceLJFullNeigh_simd, .Lfunc_end2-computeForceLJFullNeigh_simd
 | 
			
		||||
	.cfi_endproc
 | 
			
		||||
                                        # -- End function
 | 
			
		||||
	.type	.L.str,@object          # marker-region name passed to likwid_markerStopRegion in computeForceLJFullNeigh_plain_c
 | 
			
		||||
	.section	.rodata.str1.1,"aMS",@progbits,1
 | 
			
		||||
.L.str:
 | 
			
		||||
	.asciz	"force"
 | 
			
		||||
	.size	.L.str, 6
 | 
			
		||||
	.type	.L.str.1,@object        # marker-region name used by computeForceLJHalfNeigh (start and stop)
 | 
			
		||||
.L.str.1:
 | 
			
		||||
	.asciz	"forceLJ-halfneigh"
 | 
			
		||||
	.size	.L.str.1, 18
 | 
			
		||||
	.type	.L.str.2,@object        # message written to stderr by computeForceLJFullNeigh_simd (65 chars + NUL)
 | 
			
		||||
.L.str.2:
 | 
			
		||||
	.asciz	"Error: SIMD kernel not implemented for specified instruction set!"
 | 
			
		||||
	.size	.L.str.2, 66
 | 
			
		||||
	.ident	"Intel(R) oneAPI DPC++ Compiler 2021.1-beta05 (2020.2.0.0304)"
 | 
			
		||||
	.section	".note.GNU-stack","",@progbits
 | 
			
		||||
@@ -1,105 +0,0 @@
 | 
			
		||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
 | 
			
		||||
Analyzed file:      force_lj_icx_avx2_markers.s
 | 
			
		||||
Architecture:       ZEN3
 | 
			
		||||
Timestamp:          2022-12-12 12:47:07
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
 | 
			
		||||
 * - Instruction micro-ops not bound to a port
 | 
			
		||||
 X - No throughput/latency information for this instruction in data file
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Combined Analysis Report
 | 
			
		||||
------------------------
 | 
			
		||||
                                                           Port pressure in cycles                                                            
 | 
			
		||||
     |  0   |   1   |  2   |  3   | DV0  | DV1  |  4   |  5   |  6   |  7   |  8   - 8DV  |  9   |  10  |  11  |  12  |  13  ||  CP  | LCD  |
 | 
			
		||||
---------------------------------------------------------------------------------------------------------------------------------------------
 | 
			
		||||
 172 |      |       |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   .LBB0_9:                                #
 | 
			
		||||
 173 |      |       |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   #   Parent Loop BB0_6 Depth=1
 | 
			
		||||
 174 |      |       |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   # =>  This Inner Loop Header: Depth=2
 | 
			
		||||
 175 |      | 0.250 | 0.75 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||  1.0 |      |   vpbroadcastd .LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
 | 
			
		||||
 176 | 0.00 |       |      | 1.00 |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||  3.0 |      |   vpmulld (%r11,%rbp,4), %xmm1, %xmm11
 | 
			
		||||
 177 | 0.00 | 1.010 | 0.25 | 0.74 |      |      |      |      |      |      |             |      |      |      |      |      ||  4.0 |      |   vpmovsxdq %xmm11, %ymm1
 | 
			
		||||
 178 |      | 0.000 | 1.00 |      |      |      |      |      |      |      |             |      |      |      |      |      ||  1.0 |      |   vpsllq $3, %ymm1, %ymm1
 | 
			
		||||
 179 | 0.00 | 0.000 | 0.49 | 0.51 |      |      |      |      |      |      |             |      |      |      |      |      ||  1.0 |      |   vpaddq %ymm1, %ymm3, %ymm1
 | 
			
		||||
 180 | 0.00 | 0.000 | 0.51 | 0.49 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vmovq %xmm1, %r14
 | 
			
		||||
 181 |      | 1.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpextrq $1, %xmm1, %r9
 | 
			
		||||
 182 |      | 1.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vextracti128 $1, %ymm1, %xmm1
 | 
			
		||||
 183 |      |       |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovsd (%r14), %xmm2           # xmm2 = mem[0],zero
 | 
			
		||||
 184 | 0.00 | 0.000 | 0.49 | 0.51 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpsubd .LCPI0_5, %xmm11, %xmm6
 | 
			
		||||
 185 | 0.00 | 0.750 | 0.38 | 0.87 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpmovsxdq %xmm6, %ymm6
 | 
			
		||||
 186 |      | 0.000 | 1.00 |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpsllq $3, %ymm6, %ymm6
 | 
			
		||||
 187 | 0.00 | 0.000 | 0.50 | 0.50 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vmovq %xmm1, %rdi
 | 
			
		||||
 188 | 0.00 | 0.000 | 0.50 | 0.50 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpaddq %ymm6, %ymm3, %ymm6
 | 
			
		||||
 189 | 0.00 | 0.000 | 0.50 | 0.50 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vmovq %xmm6, %rcx
 | 
			
		||||
 190 |      | 1.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  6.0 |      |   vpextrq $1, %xmm1, %rbx
 | 
			
		||||
 191 |      | 1.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpextrq $1, %xmm6, %rax
 | 
			
		||||
 192 |      | 1.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vextracti128 $1, %ymm6, %xmm1
 | 
			
		||||
 193 |      |       |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovsd (%rdi), %xmm6           # xmm6 = mem[0],zero
 | 
			
		||||
 194 | 0.00 | 0.000 | 0.00 | 1.00 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vmovq %xmm1, %rdi
 | 
			
		||||
 195 |      | 1.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpextrq $1, %xmm1, %rsi
 | 
			
		||||
 196 |      |       |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovsd (%rdi), %xmm1           # xmm1 = mem[0],zero
 | 
			
		||||
 197 |      |       |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovsd (%rcx), %xmm7           # xmm7 = mem[0],zero
 | 
			
		||||
 198 |      | 0.000 | 1.00 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vpbroadcastd .LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
 | 
			
		||||
 199 |      | 0.000 | 1.00 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovhpd (%r9), %xmm2, %xmm2     # xmm2 = xmm2[0],mem[0]
 | 
			
		||||
 200 | 0.00 | 0.000 | 0.62 | 0.38 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpaddd %xmm12, %xmm11, %xmm4
 | 
			
		||||
 201 | 0.00 | 0.750 | 0.00 | 1.25 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpmovsxdq %xmm4, %ymm4
 | 
			
		||||
 202 |      | 0.000 | 1.00 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovhpd (%rax), %xmm7, %xmm7    # xmm7 = xmm7[0],mem[0]
 | 
			
		||||
 203 |      | 0.000 | 1.00 |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpsllq $3, %ymm4, %ymm4
 | 
			
		||||
 204 | 0.00 | 0.000 | 0.00 | 1.00 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpaddq %ymm4, %ymm3, %ymm4
 | 
			
		||||
 205 |      | 0.000 | 1.00 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||  5.0 |      |   vmovhpd (%rbx), %xmm6, %xmm6    # xmm6 = xmm6[0],mem[0]
 | 
			
		||||
 206 |      | 1.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpextrq $1, %xmm4, %rax
 | 
			
		||||
 207 |      | 0.000 | 1.00 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovhpd (%rsi), %xmm1, %xmm1    # xmm1 = xmm1[0],mem[0]
 | 
			
		||||
 208 | 0.00 | 0.000 | 0.51 | 0.49 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vmovq %xmm4, %rcx
 | 
			
		||||
 209 |      | 1.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vextracti128 $1, %ymm4, %xmm4
 | 
			
		||||
 210 | 0.00 | -0.01 | 0.00 | 1.00 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vmovq %xmm4, %rsi
 | 
			
		||||
 211 |      | 1.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  1.0 |      |   vinsertf128 $1, %xmm6, %ymm2, %ymm2
 | 
			
		||||
 212 |      | 1.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpextrq $1, %xmm4, %rdi
 | 
			
		||||
 213 |      |       |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovsd (%rsi), %xmm4           # xmm4 = mem[0],zero
 | 
			
		||||
 214 |      |       | 0.00 | 1.00 |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vsubpd %ymm2, %ymm14, %ymm2
 | 
			
		||||
 215 |      | 0.000 | 1.00 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovhpd (%rdi), %xmm4, %xmm4    # xmm4 = xmm4[0],mem[0]
 | 
			
		||||
 216 |      |       |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovsd (%rcx), %xmm6           # xmm6 = mem[0],zero
 | 
			
		||||
 217 |      | 1.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vinsertf128 $1, %xmm1, %ymm7, %ymm1
 | 
			
		||||
 218 |      | 0.000 | 1.00 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovhpd (%rax), %xmm6, %xmm6    # xmm6 = xmm6[0],mem[0]
 | 
			
		||||
 219 |      | 1.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vinsertf128 $1, %xmm4, %ymm6, %ymm4
 | 
			
		||||
 220 |      |       | 0.00 | 1.00 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vsubpd %ymm1, %ymm5, %ymm1
 | 
			
		||||
 221 |      |       | 0.00 | 1.00 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vsubpd %ymm4, %ymm10, %ymm4
 | 
			
		||||
 222 | 1.00 | 0.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vmulpd %ymm2, %ymm2, %ymm6
 | 
			
		||||
 223 | 1.00 | 0.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  4.0 |      |   vfmadd231pd %ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
 | 
			
		||||
 224 | 1.00 | 0.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  4.0 |      |   vfmadd231pd %ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
 | 
			
		||||
 225 | 1.00 |       |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vbroadcastsd .LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 | 
			
		||||
 226 |      |       |      |      | 4.50 | 4.50 |      |      |      |      |             |      |      |      |      |      || 13.0 |      |   vdivpd %ymm6, %ymm7, %ymm7
 | 
			
		||||
 227 | 1.00 | 0.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vmulpd %ymm7, %ymm7, %ymm11
 | 
			
		||||
 228 | 1.00 | 0.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vmulpd %ymm9, %ymm11, %ymm11
 | 
			
		||||
 229 | 1.00 |       |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vbroadcastsd .LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 | 
			
		||||
 230 | 1.00 | 0.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vmulpd %ymm7, %ymm11, %ymm11
 | 
			
		||||
 231 |      |       | 0.00 | 1.00 |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vaddpd %ymm12, %ymm11, %ymm12
 | 
			
		||||
 232 | 1.00 | 0.000 |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmulpd 128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
 | 
			
		||||
 233 | 1.00 | 0.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vmulpd %ymm7, %ymm11, %ymm7
 | 
			
		||||
 234 | 1.00 | 0.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vmulpd %ymm7, %ymm12, %ymm7
 | 
			
		||||
 235 |      |       | 0.12 | 0.88 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vcmpltpd %ymm8, %ymm6, %ymm6
 | 
			
		||||
 236 | 1.00 | 0.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  4.0 |      |   vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
 | 
			
		||||
 237 | 1.00 | 0.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  1.0 |      |   vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
 | 
			
		||||
 238 | 1.00 | 0.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
 | 
			
		||||
 239 | 1.00 | 0.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |  4.0 |   vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
 | 
			
		||||
 240 | 0.62 | 0.380 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
 | 
			
		||||
 241 | 0.50 | 0.500 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |  1.0 |   vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
 | 
			
		||||
 242 |      |       |      |      |      |      |      |      | 0.25 | 0.25 | 0.25        | 0.25 |      |      |      |      ||      |      |   addq $4, %rbp
 | 
			
		||||
 243 |      |       |      |      |      |      |      |      | 0.25 | 0.25 | 0.25        | 0.25 |      |      |      |      ||      |      |   cmpq %rdx, %rbp
 | 
			
		||||
 244 |      |       |      |      |      |      |      |      | 0.00 |      |             |      | 1.00 |      |      |      ||      |      |   jb .LBB0_9
 | 
			
		||||
 | 
			
		||||
       16.1   15.63   15.6   15.6   4.50   4.50                 0.50   0.50   0.50          0.50          9.00   9.00             72    5.0  
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Loop-Carried Dependencies Analysis Report
 | 
			
		||||
-----------------------------------------
 | 
			
		||||
 239 |  5.0 | vfmadd213pd	%ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13| [239, 241]
 | 
			
		||||
 238 |  5.0 | vfmadd213pd	%ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15| [238, 240]
 | 
			
		||||
 236 |  5.0 | vfmadd213pd	%ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0| [236, 237]
 | 
			
		||||
 242 |  1.0 | addq	$4, %rbp                       | [242]
 | 
			
		||||
 241 |  1.0 | vblendvpd	%ymm6, %ymm4, %ymm13, %ymm13| [241]
 | 
			
		||||
 240 |  1.0 | vblendvpd	%ymm6, %ymm1, %ymm15, %ymm15| [240]
 | 
			
		||||
 237 |  1.0 | vblendvpd	%ymm6, %ymm2, %ymm0, %ymm0| [237]
 | 
			
		||||
 | 
			
		||||
@@ -1,638 +0,0 @@
 | 
			
		||||
	.text
 | 
			
		||||
	.file	"force_lj.c"
 | 
			
		||||
	.section	.rodata.cst8,"aM",@progbits,8
	.p2align	3               # -- Begin function computeForceLJFullNeigh_plain_c
.LCPI0_0:
	.quad	4631952216750555136     #  48
.LCPI0_3:
	.quad	4607182418800017408     #  1
.LCPI0_4:
	.quad	-4620693217682128896    #  -0.5
	.section	.rodata.cst4,"aM",@progbits,4
	.p2align	2
.LCPI0_1:
	.long	3                       # 0x3
.LCPI0_2:
	.long	2                       # 0x2
	.section	.rodata.cst16,"aM",@progbits,16
	.p2align	4
.LCPI0_5:
	.zero	16,255                  # four dwords of -1 (subtracting it adds 1)
	.text
	.globl	computeForceLJFullNeigh_plain_c
	.p2align	4, 0x90
	.type	computeForceLJFullNeigh_plain_c,@function
#-----------------------------------------------------------------------
# computeForceLJFullNeigh_plain_c
#
# Compiler-generated (source file "force_lj.c") AVX2/FMA kernel. For every
# atom i it walks that atom's neighbor index list, forms the displacement
# to each neighbor j and, where the squared distance rsq is below a
# cutoff, accumulates displacement * f into the per-atom output array:
#     f = c48 * sr2 * sr6 * (sr6 - 0.5),  sr2 = 1/rsq,  sr6 = sr2^3 * s
# with c48 = 48 * [40(%rdi)], s = [56(%rdi)], cutoff = [144(%rdi)]^2.
# Going by the function name these are presumably the Lennard-Jones
# epsilon, sigma^6 and cutoff radius -- confirm against force_lj.c.
#
# ABI:   System V AMD64 register usage.
# In:    %rdi  parameter block; doubles read at offsets 40, 56 and 144
#        %rsi  atom block; 32-bit atom count at 4, pointer at 16 (three
#              doubles per atom, read only), pointer at 64 (three doubles
#              per atom, zeroed first and then accumulated into)
#        %rdx  neighbor block; 32-bit row stride (in ints) at 8, pointer
#              to int neighbor indices at 16, pointer to the per-atom int
#              neighbor counts at 24
#        %rcx  pointer to two 64-bit counters; per atom, counter[0] grows
#              by the neighbor count and counter[1] by (count + 3) / 4
# Out:   %xmm0 getTimeStamp() after the loops minus getTimeStamp() before
# Calls: _intel_fast_memset, getTimeStamp, likwid_markerStartRegion,
#        likwid_markerStopRegion (region tag .L.str)
# Stack: six callee-saved pushes plus 264 bytes of spill slots, which
#        keeps %rsp 16-byte aligned at every call.
#
# All static data is addressed RIP-relative so the object also links into
# position-independent executables.
#-----------------------------------------------------------------------
computeForceLJFullNeigh_plain_c:
.LcomputeForceLJFullNeigh_plain_c$local:
	.cfi_startproc
# %bb.0:
	pushq	%rbp
	.cfi_def_cfa_offset 16
	pushq	%r15
	.cfi_def_cfa_offset 24
	pushq	%r14
	.cfi_def_cfa_offset 32
	pushq	%r13
	.cfi_def_cfa_offset 40
	pushq	%r12
	.cfi_def_cfa_offset 48
	pushq	%rbx
	.cfi_def_cfa_offset 56
	subq	$264, %rsp              # imm = 0x108
	.cfi_def_cfa_offset 320
	.cfi_offset %rbx, -56
	.cfi_offset %r12, -48
	.cfi_offset %r13, -40
	.cfi_offset %r14, -32
	.cfi_offset %r15, -24
	.cfi_offset %rbp, -16
	movq	%rcx, %rbx              # rbx = counter pointer (4th argument)
	movq	%rdx, %r15              # r15 = neighbor block (3rd argument)
	movq	%rsi, %r12              # r12 = atom block (2nd argument)
	movl	4(%rsi), %r14d          # r14 = atom count (zero-extended)
	vmovsd	144(%rdi), %xmm0        # xmm0 = mem[0],zero
	vmovsd	%xmm0, (%rsp)           # 8-byte Spill
	vmovsd	40(%rdi), %xmm0         # xmm0 = mem[0],zero
	vmovsd	%xmm0, 128(%rsp)        # 8-byte Spill
	vmovq	56(%rdi), %xmm0         # xmm0 = mem[0],zero
	vmovdqa	%xmm0, 80(%rsp)         # 16-byte Spill
	testl	%r14d, %r14d
	jle	.LBB0_2                 # no atoms: nothing to clear
# %bb.1:
	# Zero the output array: count * 24 bytes (three doubles per atom).
	movq	64(%r12), %rdi
	leaq	(,%r14,8), %rax
	leaq	(%rax,%rax,2), %rdx
	xorl	%esi, %esi
	callq	_intel_fast_memset
.LBB0_2:
	xorl	%eax, %eax              # %al = 0: no vector-register arguments
	callq	getTimeStamp
	vmovq	%xmm0, 32(%rsp)         # 8-byte Folded Spill (start time)
	leaq	.L.str(%rip), %rdi      # region tag, RIP-relative (PIE-safe)
	callq	likwid_markerStartRegion
	testl	%r14d, %r14d
	jle	.LBB0_19                # no atoms: skip straight to the epilogue
# %bb.3:
	# Loop-invariant setup.
	#   xmm13 = [144(arg0)]^2            (cutoff for rsq)
	#   xmm15 = 48 * [40(arg0)]
	#   xmm12 = [56(arg0)]
	#   r11 = neighbor index row, rsi = neighbor counts, rdi = row stride
	#   in bytes, r15 = input coordinates, r8 = output array,
	#   xmm14 = the two 64-bit counters, r10 = atom index i, r14 = count-1
	vmovsd	(%rsp), %xmm0           # 8-byte Reload
                                        # xmm0 = mem[0],zero
	vmulsd	%xmm0, %xmm0, %xmm13
	movq	16(%r15), %r11
	movq	24(%r15), %rsi
	movslq	8(%r15), %rdi
	movq	16(%r12), %r15
	movq	64(%r12), %r8
	vmovsd	128(%rsp), %xmm0        # 8-byte Reload
                                        # xmm0 = mem[0],zero
	vmulsd	.LCPI0_0(%rip), %xmm0, %xmm15
	movq	%rbx, 24(%rsp)          # 8-byte Spill
	vmovdqu	(%rbx), %xmm14
	decq	%r14
	vmovq	%r15, %xmm0
	vpbroadcastq	%xmm0, %ymm3    # ymm3 = coordinate base address x4
	vbroadcastsd	%xmm13, %ymm2
	vmovapd	80(%rsp), %xmm12        # 16-byte Reload
	vbroadcastsd	%xmm12, %ymm8
	vbroadcastsd	%xmm15, %ymm9
	shlq	$2, %rdi                # stride in ints -> stride in bytes
	xorl	%r10d, %r10d
	movq	%r14, 56(%rsp)          # 8-byte Spill
	vmovapd	%xmm13, 192(%rsp)       # 16-byte Spill
	movq	%rsi, 48(%rsp)          # 8-byte Spill
	movq	%rdi, 40(%rsp)          # 8-byte Spill
	vmovapd	%xmm15, 176(%rsp)       # 16-byte Spill
	vmovupd	%ymm2, 224(%rsp)        # 32-byte Spill
	vmovupd	%ymm9, 128(%rsp)        # 32-byte Spill
	jmp	.LBB0_6
	.p2align	4, 0x90
.LBB0_17:
                                        #   in Loop: Header=BB0_6 Depth=1
	movq	%r13, %rdx              # rdx = neighbor count of atom i
.LBB0_5:
                                        #   in Loop: Header=BB0_6 Depth=1
	# Outer-loop tail: add the three per-atom sums (xmm10, xmm11, xmm5)
	# to the output array at indices 3i, 3i+1, 3i+2.
	vaddsd	(%r8,%r12,8), %xmm10, %xmm0
	vmovsd	%xmm0, (%r8,%r12,8)
	vaddsd	(%r8,%rbx,8), %xmm11, %xmm0
	vmovsd	%xmm0, (%r8,%rbx,8)
	vaddsd	(%r8,%rbp,8), %xmm5, %xmm0
	vmovsd	%xmm0, (%r8,%rbp,8)
	# rax = (count + 3) / 4 as a signed division truncating toward zero.
	leal	3(%r13), %eax
	addl	$6, %r13d
	testl	%eax, %eax
	cmovnsl	%eax, %r13d
	sarl	$2, %r13d
	movslq	%r13d, %rax
	# counters += { rdx, rax }
	vmovq	%rax, %xmm0
	vmovq	%rdx, %xmm1
	vpunpcklqdq	%xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
	vpaddq	%xmm0, %xmm14, %xmm14
	addq	%rdi, %r11              # advance to the next neighbor row
	cmpq	%r14, %r10
	leaq	1(%r10), %r10           # i++ without touching the flags
	je	.LBB0_18
.LBB0_6:
                                        # =>This Loop Header: Depth=1
                                        #     Child Loop BB0_9 Depth 2
                                        #     Child Loop BB0_13 Depth 2
	movl	(%rsi,%r10,4), %r13d    # r13 = neighbor count of atom i
	leal	(%r10,%r10,2), %r12d    # r12 = 3i
	leal	(%r10,%r10,2), %ebx
	incl	%ebx                    # rbx = 3i + 1
	leal	(%r10,%r10,2), %ebp
	addl	$2, %ebp                # rbp = 3i + 2
	testl	%r13d, %r13d
	jle	.LBB0_4
# %bb.7:
                                        #   in Loop: Header=BB0_6 Depth=1
	vmovsd	(%r15,%r12,8), %xmm0    # xmm0 = mem[0],zero
	vmovsd	(%r15,%rbx,8), %xmm1    # xmm1 = mem[0],zero
	vmovsd	(%r15,%rbp,8), %xmm2    # xmm2 = mem[0],zero
	movq	%r13, %rdx
	movl	$4294967292, %eax       # imm = 0xFFFFFFFC
	andq	%rax, %rdx              # rdx = count rounded down to a multiple of 4
	vmovapd	%xmm0, 112(%rsp)        # 16-byte Spill
	vmovapd	%xmm1, 96(%rsp)         # 16-byte Spill
	vmovapd	%xmm2, (%rsp)           # 16-byte Spill
	je	.LBB0_16                # flags still from andq: fewer than 4 neighbors
# %bb.8:
                                        #   in Loop: Header=BB0_6 Depth=1
	# Vector-loop preheader: broadcast atom i's coordinates, clear the
	# three vector accumulators (ymm0, ymm15, ymm13), and move the
	# broadcast of [56(arg0)] to ymm9 so ymm8 can hold the cutoff.
	movq	%rbp, 64(%rsp)          # 8-byte Spill
	movq	%rbx, 72(%rsp)          # 8-byte Spill
	vmovdqa	%xmm14, 208(%rsp)       # 16-byte Spill
	vbroadcastsd	%xmm0, %ymm14
	vbroadcastsd	%xmm1, %ymm5
	vbroadcastsd	%xmm2, %ymm10
	vxorpd	%xmm0, %xmm0, %xmm0
	vxorpd	%xmm15, %xmm15, %xmm15
	vxorpd	%xmm13, %xmm13, %xmm13
	xorl	%ebp, %ebp              # rbp = neighbor position k
	vmovapd	%ymm8, %ymm9
	vmovupd	224(%rsp), %ymm8        # 32-byte Reload
	.p2align	4, 0x90
    # OSACA-BEGIN
    # LLVM-MCA-BEGIN
.LBB0_9:
                                        #   Parent Loop BB0_6 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
	# Four neighbors per iteration. Their coordinates are fetched with
	# scalar loads from addresses computed in vector registers:
	# base + 8 * (3j + {0,1,2}).
	vpbroadcastd	.LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
	vpmulld	(%r11,%rbp,4), %xmm1, %xmm11
	vpmovsxdq	%xmm11, %ymm1
	vpsllq	$3, %ymm1, %ymm1
	vpaddq	%ymm1, %ymm3, %ymm1
	vmovq	%xmm1, %r14
	vpextrq	$1, %xmm1, %r9
	vextracti128	$1, %ymm1, %xmm1
	vmovsd	(%r14), %xmm2           # xmm2 = mem[0],zero
	vpsubd	.LCPI0_5(%rip), %xmm11, %xmm6 # 3j - (-1) = 3j + 1
	vpmovsxdq	%xmm6, %ymm6
	vpsllq	$3, %ymm6, %ymm6
	vmovq	%xmm1, %rdi
	vpaddq	%ymm6, %ymm3, %ymm6
	vmovq	%xmm6, %rcx
	vpextrq	$1, %xmm1, %rbx
	vpextrq	$1, %xmm6, %rax
	vextracti128	$1, %ymm6, %xmm1
	vmovsd	(%rdi), %xmm6           # xmm6 = mem[0],zero
	vmovq	%xmm1, %rdi
	vpextrq	$1, %xmm1, %rsi
	vmovsd	(%rdi), %xmm1           # xmm1 = mem[0],zero
	vmovsd	(%rcx), %xmm7           # xmm7 = mem[0],zero
	vpbroadcastd	.LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
	vmovhpd	(%r9), %xmm2, %xmm2     # xmm2 = xmm2[0],mem[0]
	vpaddd	%xmm12, %xmm11, %xmm4
	vpmovsxdq	%xmm4, %ymm4
	vmovhpd	(%rax), %xmm7, %xmm7    # xmm7 = xmm7[0],mem[0]
	vpsllq	$3, %ymm4, %ymm4
	vpaddq	%ymm4, %ymm3, %ymm4
	vmovhpd	(%rbx), %xmm6, %xmm6    # xmm6 = xmm6[0],mem[0]
	vpextrq	$1, %xmm4, %rax
	vmovhpd	(%rsi), %xmm1, %xmm1    # xmm1 = xmm1[0],mem[0]
	vmovq	%xmm4, %rcx
	vextracti128	$1, %ymm4, %xmm4
	vmovq	%xmm4, %rsi
	vinsertf128	$1, %xmm6, %ymm2, %ymm2
	vpextrq	$1, %xmm4, %rdi
	vmovsd	(%rsi), %xmm4           # xmm4 = mem[0],zero
	vsubpd	%ymm2, %ymm14, %ymm2
	vmovhpd	(%rdi), %xmm4, %xmm4    # xmm4 = xmm4[0],mem[0]
	vmovsd	(%rcx), %xmm6           # xmm6 = mem[0],zero
	vinsertf128	$1, %xmm1, %ymm7, %ymm1
	vmovhpd	(%rax), %xmm6, %xmm6    # xmm6 = xmm6[0],mem[0]
	vinsertf128	$1, %xmm4, %ymm6, %ymm4
	vsubpd	%ymm1, %ymm5, %ymm1
	vsubpd	%ymm4, %ymm10, %ymm4
	# ymm2 / ymm1 / ymm4 = displacement components; ymm6 = rsq.
	vmulpd	%ymm2, %ymm2, %ymm6
	vfmadd231pd	%ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
	vfmadd231pd	%ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
	vbroadcastsd	.LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
	vdivpd	%ymm6, %ymm7, %ymm7     # sr2 = 1 / rsq
	vmulpd	%ymm7, %ymm7, %ymm11
	vmulpd	%ymm9, %ymm11, %ymm11
	vbroadcastsd	.LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
	vmulpd	%ymm7, %ymm11, %ymm11   # sr6 = sr2^3 * s
	vaddpd	%ymm12, %ymm11, %ymm12  # sr6 - 0.5
	vmulpd	128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
	vmulpd	%ymm7, %ymm11, %ymm7
	vmulpd	%ymm7, %ymm12, %ymm7    # f = c48 * sr2 * sr6 * (sr6 - 0.5)
	vcmpltpd	%ymm8, %ymm6, %ymm6 # lane mask: rsq < cutoff
	# Accumulate only in lanes that passed the cutoff test.
	vfmadd213pd	%ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
	vblendvpd	%ymm6, %ymm2, %ymm0, %ymm0
	vfmadd213pd	%ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
	vfmadd213pd	%ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
	vblendvpd	%ymm6, %ymm1, %ymm15, %ymm15
	vblendvpd	%ymm6, %ymm4, %ymm13, %ymm13
	addq	$4, %rbp
	cmpq	%rdx, %rbp
	jb	.LBB0_9
    # LLVM-MCA-END
    # OSACA-END
# %bb.10:
                                        #   in Loop: Header=BB0_6 Depth=1
	# Horizontal sums of the three accumulators into xmm10, xmm11, xmm5,
	# then reload everything the vector loop clobbered.
	vpermilpd	$1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
	vaddsd	%xmm1, %xmm0, %xmm1
	vextractf128	$1, %ymm0, %xmm0
	vaddsd	%xmm0, %xmm1, %xmm1
	vpermilpd	$1, %xmm0, %xmm0 # xmm0 = xmm0[1,0]
	vaddsd	%xmm0, %xmm1, %xmm10
	vpermilpd	$1, %xmm15, %xmm1 # xmm1 = xmm15[1,0]
	vaddsd	%xmm1, %xmm15, %xmm1
	vextractf128	$1, %ymm15, %xmm2
	vaddsd	%xmm2, %xmm1, %xmm1
	vpermilpd	$1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
	vaddsd	%xmm2, %xmm1, %xmm11
	vpermilpd	$1, %xmm13, %xmm1 # xmm1 = xmm13[1,0]
	vaddsd	%xmm1, %xmm13, %xmm1
	vextractf128	$1, %ymm13, %xmm2
	vaddsd	%xmm2, %xmm1, %xmm1
	vpermilpd	$1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
	vaddsd	%xmm2, %xmm1, %xmm5
	movq	56(%rsp), %r14          # 8-byte Reload
	vmovapd	80(%rsp), %xmm12        # 16-byte Reload
	vmovapd	192(%rsp), %xmm13       # 16-byte Reload
	movq	48(%rsp), %rsi          # 8-byte Reload
	movq	40(%rsp), %rdi          # 8-byte Reload
	vmovdqa	208(%rsp), %xmm14       # 16-byte Reload
	vmovapd	176(%rsp), %xmm15       # 16-byte Reload
	vmovapd	%ymm9, %ymm8
	movq	72(%rsp), %rbx          # 8-byte Reload
	movq	64(%rsp), %rbp          # 8-byte Reload
	vmovapd	112(%rsp), %xmm0        # 16-byte Reload
	cmpq	%r13, %rdx
	jae	.LBB0_17                # count was a multiple of 4: no remainder
	jmp	.LBB0_11
	.p2align	4, 0x90
.LBB0_4:
                                        #   in Loop: Header=BB0_6 Depth=1
	# Neighbor count <= 0: the sums are zero, only the counters change.
	movslq	%r13d, %rdx
	vxorpd	%xmm5, %xmm5, %xmm5
	vxorpd	%xmm11, %xmm11, %xmm11
	vxorpd	%xmm10, %xmm10, %xmm10
	jmp	.LBB0_5
	.p2align	4, 0x90
.LBB0_16:
                                        #   in Loop: Header=BB0_6 Depth=1
	# Fewer than four neighbors: skip the vector loop, start from zero.
	vxorpd	%xmm10, %xmm10, %xmm10
	vxorpd	%xmm11, %xmm11, %xmm11
	vxorpd	%xmm5, %xmm5, %xmm5
	cmpq	%r13, %rdx
	jae	.LBB0_17
.LBB0_11:
                                        #   in Loop: Header=BB0_6 Depth=1
	vmovapd	96(%rsp), %xmm4         # 16-byte Reload
	jmp	.LBB0_13
	.p2align	4, 0x90
.LBB0_12:
                                        #   in Loop: Header=BB0_13 Depth=2
	incq	%rdx
	cmpq	%rdx, %r13
	je	.LBB0_17
.LBB0_13:
                                        #   Parent Loop BB0_6 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
	# Scalar remainder loop: same arithmetic as the vector loop, one
	# neighbor at a time, with a branch instead of a lane mask.
	movl	(%r11,%rdx,4), %eax     # eax = neighbor index j
	leal	(%rax,%rax,2), %ecx
	movslq	%ecx, %rcx
	vsubsd	(%r15,%rcx,8), %xmm0, %xmm6
	leal	(%rax,%rax,2), %ecx
	incl	%ecx
	movslq	%ecx, %rcx
	vsubsd	(%r15,%rcx,8), %xmm4, %xmm2
	leal	2(%rax,%rax,2), %eax
	cltq
	vmovapd	(%rsp), %xmm1           # 16-byte Reload
	vsubsd	(%r15,%rax,8), %xmm1, %xmm1
	vmulsd	%xmm6, %xmm6, %xmm7
	vfmadd231sd	%xmm2, %xmm2, %xmm7 # xmm7 = (xmm2 * xmm2) + xmm7
	vfmadd231sd	%xmm1, %xmm1, %xmm7 # xmm7 = (xmm1 * xmm1) + xmm7
	vucomisd	%xmm13, %xmm7
	jae	.LBB0_12                # rsq >= cutoff: neighbor contributes nothing
# %bb.14:
                                        #   in Loop: Header=BB0_13 Depth=2
	vmovsd	.LCPI0_3(%rip), %xmm0   # xmm0 = mem[0],zero
	vdivsd	%xmm7, %xmm0, %xmm7
	vmulsd	%xmm7, %xmm7, %xmm0
	vmulsd	%xmm0, %xmm12, %xmm0
	vmulsd	%xmm7, %xmm0, %xmm0
	vaddsd	.LCPI0_4(%rip), %xmm0, %xmm4
	vmulsd	%xmm7, %xmm15, %xmm7
	vmulsd	%xmm0, %xmm7, %xmm0
	vmulsd	%xmm4, %xmm0, %xmm0
	vmovapd	96(%rsp), %xmm4         # 16-byte Reload
	vfmadd231sd	%xmm6, %xmm0, %xmm10 # xmm10 = (xmm0 * xmm6) + xmm10
	vfmadd231sd	%xmm2, %xmm0, %xmm11 # xmm11 = (xmm0 * xmm2) + xmm11
	vfmadd231sd	%xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5
	vmovapd	112(%rsp), %xmm0        # 16-byte Reload
	jmp	.LBB0_12
.LBB0_18:
	# All atoms done: write the two counters back through the 4th argument.
	movq	24(%rsp), %rax          # 8-byte Reload
	vmovdqu	%xmm14, (%rax)
.LBB0_19:
	leaq	.L.str(%rip), %rdi      # region tag, RIP-relative (PIE-safe)
	vzeroupper
	callq	likwid_markerStopRegion
	xorl	%eax, %eax              # %al = 0: no vector-register arguments
	callq	getTimeStamp
	vsubsd	32(%rsp), %xmm0, %xmm0  # 8-byte Folded Reload (end - start)
	addq	$264, %rsp              # imm = 0x108
	.cfi_def_cfa_offset 56
	popq	%rbx
	.cfi_def_cfa_offset 48
	popq	%r12
	.cfi_def_cfa_offset 40
	popq	%r13
	.cfi_def_cfa_offset 32
	popq	%r14
	.cfi_def_cfa_offset 24
	popq	%r15
	.cfi_def_cfa_offset 16
	popq	%rbp
	.cfi_def_cfa_offset 8
	retq
.Lfunc_end0:
	.size	computeForceLJFullNeigh_plain_c, .Lfunc_end0-computeForceLJFullNeigh_plain_c
	.cfi_endproc
                                        # -- End function
 | 
			
		||||
	.section	.rodata.cst8,"aM",@progbits,8
 | 
			
		||||
	.p2align	3               # -- Begin function computeForceLJHalfNeigh
 | 
			
		||||
.LCPI1_0:
 | 
			
		||||
	.quad	4631952216750555136     #  48
 | 
			
		||||
.LCPI1_1:
 | 
			
		||||
	.quad	4607182418800017408     #  1
 | 
			
		||||
.LCPI1_2:
 | 
			
		||||
	.quad	-4620693217682128896    #  -0.5
 | 
			
		||||
	.text
 | 
			
		||||
	.globl	computeForceLJHalfNeigh
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
	.type	computeForceLJHalfNeigh,@function
 | 
			
		||||
computeForceLJHalfNeigh:                # 
 | 
			
		||||
.LcomputeForceLJHalfNeigh$local:
 | 
			
		||||
	.cfi_startproc
 | 
			
		||||
# %bb.0:                                # 
 | 
			
		||||
	pushq	%rbp
 | 
			
		||||
	.cfi_def_cfa_offset 16
 | 
			
		||||
	pushq	%r15
 | 
			
		||||
	.cfi_def_cfa_offset 24
 | 
			
		||||
	pushq	%r14
 | 
			
		||||
	.cfi_def_cfa_offset 32
 | 
			
		||||
	pushq	%r13
 | 
			
		||||
	.cfi_def_cfa_offset 40
 | 
			
		||||
	pushq	%r12
 | 
			
		||||
	.cfi_def_cfa_offset 48
 | 
			
		||||
	pushq	%rbx
 | 
			
		||||
	.cfi_def_cfa_offset 56
 | 
			
		||||
	subq	$40, %rsp
 | 
			
		||||
	.cfi_def_cfa_offset 96
 | 
			
		||||
	.cfi_offset %rbx, -56
 | 
			
		||||
	.cfi_offset %r12, -48
 | 
			
		||||
	.cfi_offset %r13, -40
 | 
			
		||||
	.cfi_offset %r14, -32
 | 
			
		||||
	.cfi_offset %r15, -24
 | 
			
		||||
	.cfi_offset %rbp, -16
 | 
			
		||||
	movq	%rcx, 16(%rsp)          # 8-byte Spill
 | 
			
		||||
	movq	%rdx, %r15
 | 
			
		||||
	movq	%rsi, %r12
 | 
			
		||||
	movl	4(%rsi), %r13d
 | 
			
		||||
	vmovsd	144(%rdi), %xmm0        # xmm0 = mem[0],zero
 | 
			
		||||
	vmovsd	%xmm0, 8(%rsp)          # 8-byte Spill
 | 
			
		||||
	vmovsd	40(%rdi), %xmm0         # xmm0 = mem[0],zero
 | 
			
		||||
	vmovsd	%xmm0, (%rsp)           # 8-byte Spill
 | 
			
		||||
	vmovsd	56(%rdi), %xmm0         # xmm0 = mem[0],zero
 | 
			
		||||
	vmovsd	%xmm0, 32(%rsp)         # 8-byte Spill
 | 
			
		||||
	testl	%r13d, %r13d
 | 
			
		||||
	jle	.LBB1_2
 | 
			
		||||
# %bb.1:                                # 
 | 
			
		||||
	movq	64(%r12), %rdi
 | 
			
		||||
	leaq	(,%r13,8), %rax
 | 
			
		||||
	leaq	(%rax,%rax,2), %rdx
 | 
			
		||||
	xorl	%esi, %esi
 | 
			
		||||
	callq	_intel_fast_memset
 | 
			
		||||
.LBB1_2:                                # 
 | 
			
		||||
	xorl	%eax, %eax
 | 
			
		||||
	callq	getTimeStamp
 | 
			
		||||
	vmovsd	%xmm0, 24(%rsp)         # 8-byte Spill
 | 
			
		||||
	movl	$.L.str.1, %edi
 | 
			
		||||
	callq	likwid_markerStartRegion
 | 
			
		||||
	testl	%r13d, %r13d
 | 
			
		||||
	jle	.LBB1_8
 | 
			
		||||
# %bb.3:                                # 
 | 
			
		||||
	vmovsd	8(%rsp), %xmm0          # 8-byte Reload
 | 
			
		||||
                                        # xmm0 = mem[0],zero
 | 
			
		||||
	vmulsd	%xmm0, %xmm0, %xmm12
 | 
			
		||||
	movq	16(%r15), %rax
 | 
			
		||||
	movq	24(%r15), %rcx
 | 
			
		||||
	movq	%rcx, 8(%rsp)           # 8-byte Spill
 | 
			
		||||
	movslq	8(%r15), %rdx
 | 
			
		||||
	movq	16(%r12), %rsi
 | 
			
		||||
	movq	64(%r12), %rdi
 | 
			
		||||
	vmovsd	(%rsp), %xmm0           # 8-byte Reload
 | 
			
		||||
                                        # xmm0 = mem[0],zero
 | 
			
		||||
	vmulsd	.LCPI1_0(%rip), %xmm0, %xmm11
 | 
			
		||||
	movq	16(%rsp), %rcx          # 8-byte Reload
 | 
			
		||||
	vmovdqu	(%rcx), %xmm10
 | 
			
		||||
	shlq	$2, %rdx
 | 
			
		||||
	movq	%rdx, (%rsp)            # 8-byte Spill
 | 
			
		||||
	xorl	%r12d, %r12d
 | 
			
		||||
	jmp	.LBB1_4
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
.LBB1_5:                                # 
 | 
			
		||||
                                        #   in Loop: Header=BB1_4 Depth=1
 | 
			
		||||
	vxorpd	%xmm13, %xmm13, %xmm13
 | 
			
		||||
	movq	%r9, %rdx
 | 
			
		||||
	vxorpd	%xmm9, %xmm9, %xmm9
 | 
			
		||||
	vxorpd	%xmm14, %xmm14, %xmm14
 | 
			
		||||
.LBB1_6:                                # 
 | 
			
		||||
                                        #   in Loop: Header=BB1_4 Depth=1
 | 
			
		||||
	vaddsd	(%rdi,%r15,8), %xmm14, %xmm0
 | 
			
		||||
	vmovsd	%xmm0, (%rdi,%r15,8)
 | 
			
		||||
	vaddsd	(%rdi,%r10,8), %xmm9, %xmm0
 | 
			
		||||
	vmovsd	%xmm0, (%rdi,%r10,8)
 | 
			
		||||
	vaddsd	(%rdi,%r11,8), %xmm13, %xmm0
 | 
			
		||||
	vmovsd	%xmm0, (%rdi,%r11,8)
 | 
			
		||||
	leal	3(%r9), %ecx
 | 
			
		||||
	addl	$6, %r9d
 | 
			
		||||
	testl	%ecx, %ecx
 | 
			
		||||
	cmovnsl	%ecx, %r9d
 | 
			
		||||
	sarl	$2, %r9d
 | 
			
		||||
	movslq	%r9d, %rcx
 | 
			
		||||
	vmovq	%rcx, %xmm0
 | 
			
		||||
	vmovq	%rdx, %xmm1
 | 
			
		||||
	vpunpcklqdq	%xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
 | 
			
		||||
	vpaddq	%xmm0, %xmm10, %xmm10
 | 
			
		||||
	incq	%r12
 | 
			
		||||
	addq	(%rsp), %rax            # 8-byte Folded Reload
 | 
			
		||||
	cmpq	%r13, %r12
 | 
			
		||||
	je	.LBB1_7
 | 
			
		||||
.LBB1_4:                                # 
 | 
			
		||||
                                        # =>This Loop Header: Depth=1
 | 
			
		||||
                                        #     Child Loop BB1_10 Depth 2
 | 
			
		||||
	movq	8(%rsp), %rcx           # 8-byte Reload
 | 
			
		||||
	movslq	(%rcx,%r12,4), %r9
 | 
			
		||||
	leaq	(%r12,%r12,2), %rcx
 | 
			
		||||
	leal	1(%rcx), %r10d
 | 
			
		||||
	leal	2(%rcx), %r11d
 | 
			
		||||
	movl	%ecx, %r15d
 | 
			
		||||
	testq	%r9, %r9
 | 
			
		||||
	jle	.LBB1_5
 | 
			
		||||
# %bb.9:                                # 
 | 
			
		||||
                                        #   in Loop: Header=BB1_4 Depth=1
 | 
			
		||||
	vmovsd	(%rsi,%r15,8), %xmm15   # xmm15 = mem[0],zero
 | 
			
		||||
	vmovsd	(%rsi,%r10,8), %xmm4    # xmm4 = mem[0],zero
 | 
			
		||||
	vmovsd	(%rsi,%r11,8), %xmm1    # xmm1 = mem[0],zero
 | 
			
		||||
	movl	%r9d, %edx
 | 
			
		||||
	vxorpd	%xmm14, %xmm14, %xmm14
 | 
			
		||||
	xorl	%ecx, %ecx
 | 
			
		||||
	vxorpd	%xmm9, %xmm9, %xmm9
 | 
			
		||||
	vxorpd	%xmm13, %xmm13, %xmm13
 | 
			
		||||
	jmp	.LBB1_10
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
.LBB1_13:                               # 
 | 
			
		||||
                                        #   in Loop: Header=BB1_10 Depth=2
 | 
			
		||||
	incq	%rcx
 | 
			
		||||
	cmpq	%rcx, %rdx
 | 
			
		||||
	je	.LBB1_6
 | 
			
		||||
.LBB1_10:                               # 
 | 
			
		||||
                                        #   Parent Loop BB1_4 Depth=1
 | 
			
		||||
                                        # =>  This Inner Loop Header: Depth=2
 | 
			
		||||
	movslq	(%rax,%rcx,4), %r8
 | 
			
		||||
	leaq	(%r8,%r8,2), %r14
 | 
			
		||||
	vsubsd	(%rsi,%r14,8), %xmm15, %xmm2
 | 
			
		||||
	movslq	%r14d, %rbp
 | 
			
		||||
	vsubsd	8(%rsi,%rbp,8), %xmm4, %xmm5
 | 
			
		||||
	vsubsd	16(%rsi,%rbp,8), %xmm1, %xmm0
 | 
			
		||||
	vmulsd	%xmm2, %xmm2, %xmm6
 | 
			
		||||
	vfmadd231sd	%xmm5, %xmm5, %xmm6 # xmm6 = (xmm5 * xmm5) + xmm6
 | 
			
		||||
	vfmadd231sd	%xmm0, %xmm0, %xmm6 # xmm6 = (xmm0 * xmm0) + xmm6
 | 
			
		||||
	vucomisd	%xmm12, %xmm6
 | 
			
		||||
	jae	.LBB1_13
 | 
			
		||||
# %bb.11:                               # 
 | 
			
		||||
                                        #   in Loop: Header=BB1_10 Depth=2
 | 
			
		||||
	vmovsd	.LCPI1_1(%rip), %xmm3   # xmm3 = mem[0],zero
 | 
			
		||||
	vdivsd	%xmm6, %xmm3, %xmm6
 | 
			
		||||
	vmulsd	32(%rsp), %xmm6, %xmm3  # 8-byte Folded Reload
 | 
			
		||||
	vmulsd	%xmm6, %xmm6, %xmm8
 | 
			
		||||
	vmulsd	%xmm3, %xmm8, %xmm3
 | 
			
		||||
	vaddsd	.LCPI1_2(%rip), %xmm3, %xmm7
 | 
			
		||||
	vmulsd	%xmm6, %xmm11, %xmm6
 | 
			
		||||
	vmulsd	%xmm3, %xmm6, %xmm3
 | 
			
		||||
	vmulsd	%xmm7, %xmm3, %xmm3
 | 
			
		||||
	vmulsd	%xmm2, %xmm3, %xmm6
 | 
			
		||||
	vaddsd	%xmm6, %xmm14, %xmm14
 | 
			
		||||
	vmulsd	%xmm5, %xmm3, %xmm2
 | 
			
		||||
	vaddsd	%xmm2, %xmm9, %xmm9
 | 
			
		||||
	vmulsd	%xmm0, %xmm3, %xmm0
 | 
			
		||||
	vaddsd	%xmm0, %xmm13, %xmm13
 | 
			
		||||
	cmpl	%r13d, %r8d
 | 
			
		||||
	jge	.LBB1_13
 | 
			
		||||
# %bb.12:                               # 
 | 
			
		||||
                                        #   in Loop: Header=BB1_10 Depth=2
 | 
			
		||||
	leaq	1(%rbp), %rbx
 | 
			
		||||
	addq	$2, %rbp
 | 
			
		||||
	vmovsd	(%rdi,%r14,8), %xmm3    # xmm3 = mem[0],zero
 | 
			
		||||
	vsubsd	%xmm6, %xmm3, %xmm3
 | 
			
		||||
	vmovsd	%xmm3, (%rdi,%r14,8)
 | 
			
		||||
	vmovsd	(%rdi,%rbx,8), %xmm3    # xmm3 = mem[0],zero
 | 
			
		||||
	vsubsd	%xmm2, %xmm3, %xmm2
 | 
			
		||||
	vmovsd	%xmm2, (%rdi,%rbx,8)
 | 
			
		||||
	vmovsd	(%rdi,%rbp,8), %xmm2    # xmm2 = mem[0],zero
 | 
			
		||||
	vsubsd	%xmm0, %xmm2, %xmm0
 | 
			
		||||
	vmovsd	%xmm0, (%rdi,%rbp,8)
 | 
			
		||||
	jmp	.LBB1_13
 | 
			
		||||
.LBB1_7:                                # 
 | 
			
		||||
	movq	16(%rsp), %rax          # 8-byte Reload
 | 
			
		||||
	vmovdqu	%xmm10, (%rax)
 | 
			
		||||
.LBB1_8:                                # 
 | 
			
		||||
	movl	$.L.str.1, %edi
 | 
			
		||||
	callq	likwid_markerStopRegion
 | 
			
		||||
	xorl	%eax, %eax
 | 
			
		||||
	callq	getTimeStamp
 | 
			
		||||
	vsubsd	24(%rsp), %xmm0, %xmm0  # 8-byte Folded Reload
 | 
			
		||||
	addq	$40, %rsp
 | 
			
		||||
	.cfi_def_cfa_offset 56
 | 
			
		||||
	popq	%rbx
 | 
			
		||||
	.cfi_def_cfa_offset 48
 | 
			
		||||
	popq	%r12
 | 
			
		||||
	.cfi_def_cfa_offset 40
 | 
			
		||||
	popq	%r13
 | 
			
		||||
	.cfi_def_cfa_offset 32
 | 
			
		||||
	popq	%r14
 | 
			
		||||
	.cfi_def_cfa_offset 24
 | 
			
		||||
	popq	%r15
 | 
			
		||||
	.cfi_def_cfa_offset 16
 | 
			
		||||
	popq	%rbp
 | 
			
		||||
	.cfi_def_cfa_offset 8
 | 
			
		||||
	retq
 | 
			
		||||
.Lfunc_end1:
 | 
			
		||||
	.size	computeForceLJHalfNeigh, .Lfunc_end1-computeForceLJHalfNeigh
 | 
			
		||||
	.cfi_endproc
 | 
			
		||||
                                        # -- End function
 | 
			
		||||
# computeForceLJFullNeigh_simd (compiler-generated, AT&T syntax, x86-64).
#
# As compiled here the function is only a stub: it optionally clears a
# buffer, starts a LIKWID marker region, prints an error message to stderr
# and terminates the process with exit(-1). It never returns to its caller.
#
# Inputs used by the visible code:
#   %rsi  pointer to a structure; a 32-bit count is read from offset 4 and a
#         pointer from offset 64.
#         NOTE(review): presumably the atom structure (local atom count and
#         force array) -- confirm against the C source.
#   %rdi  not read before it is overwritten.
	.globl	computeForceLJFullNeigh_simd # -- Begin function computeForceLJFullNeigh_simd
	.p2align	4, 0x90
	.type	computeForceLJFullNeigh_simd,@function
computeForceLJFullNeigh_simd:           # 
.LcomputeForceLJFullNeigh_simd$local:
	.cfi_startproc
# %bb.0:                                # 
# The push only moves %rsp by 8 bytes (the value is never popped), so the
# stack is 16-byte aligned at the calls below.
	pushq	%rax
	.cfi_def_cfa_offset 16
# Load the 32-bit count; the 32-bit write zero-extends into %rax.
	movl	4(%rsi), %eax
	testl	%eax, %eax
# Skip the memset when the count is <= 0.
	jle	.LBB2_2
# %bb.1:                                # 
# memset(ptr_at_offset_64, 0, count * 24): count*8 first, then times 3.
	movq	64(%rsi), %rdi
	shlq	$3, %rax
	leaq	(%rax,%rax,2), %rdx
	xorl	%esi, %esi
	callq	_intel_fast_memset
.LBB2_2:                                # 
# Call getTimeStamp with %eax = 0; its result is not used by any
# instruction in this function.
	xorl	%eax, %eax
	callq	getTimeStamp
# Start the LIKWID marker region named by .L.str ("force").
	movl	$.L.str, %edi
	callq	likwid_markerStartRegion
# fwrite(.L.str.2, 65, 1, stderr): emit the 65-byte error message.
	movq	stderr(%rip), %rcx
	movl	$.L.str.2, %edi
	movl	$65, %esi
	movl	$1, %edx
	callq	fwrite
# exit(-1); there is no ret after this call.
	movl	$-1, %edi
	callq	exit
.Lfunc_end2:
	.size	computeForceLJFullNeigh_simd, .Lfunc_end2-computeForceLJFullNeigh_simd
	.cfi_endproc
                                        # -- End function
 | 
			
		||||
# NUL-terminated string constants referenced by the functions above. They
# live in a mergeable string section (flags "aMS", entry size 1).
#
# .L.str   -- region name passed to likwid_markerStartRegion.
	.type	.L.str,@object          # 
	.section	.rodata.str1.1,"aMS",@progbits,1
.L.str:
	.asciz	"force"
	.size	.L.str, 6

# .L.str.1 -- region name passed to likwid_markerStopRegion.
	.type	.L.str.1,@object        # 
.L.str.1:
	.asciz	"forceLJ-halfneigh"
	.size	.L.str.1, 18

# .L.str.2 -- error text written to stderr with fwrite (65 bytes, i.e. the
#             66-byte object without its terminating NUL).
	.type	.L.str.2,@object        # 
.L.str.2:
	.asciz	"Error: SIMD kernel not implemented for specified instruction set!"
	.size	.L.str.2, 66

# Compiler identification, followed by the empty .note.GNU-stack section
# that marks this object as not requiring an executable stack.
	.ident	"Intel(R) oneAPI DPC++ Compiler 2021.1-beta05 (2020.2.0.0304)"
	.section	".note.GNU-stack","",@progbits
 | 
			
		||||
@@ -15,7 +15,7 @@ ISA="${BIN_INFO##*-}"
 | 
			
		||||
CORE="${CORE:-0}"
 | 
			
		||||
FREQ="${FREQ:-2.4}"
 | 
			
		||||
NRUNS="${NRUNS:-3}"
 | 
			
		||||
LOG="${LOG:-latencies_and_cfds.log}"
 | 
			
		||||
LOG="${LOG:-latencies_and_cfds.$(hostname).log}"
 | 
			
		||||
STUB_ONLY="${STUB_ONLY:-false}"
 | 
			
		||||
SKIP_SET_FREQ="${SKIP_SET_FREQ:-false}"
 | 
			
		||||
 | 
			
		||||
@@ -37,10 +37,14 @@ CPU_VENDOR=$(lscpu | grep "Vendor ID" | tr -s ' ' | cut -d ' ' -f3)
 | 
			
		||||
 | 
			
		||||
if [ "$CPU_VENDOR" == "GenuineIntel" ]; then
 | 
			
		||||
    ALL_PREFETCHERS="HW_PREFETCHER,CL_PREFETCHER,DCU_PREFETCHER,IP_PREFETCHER"
 | 
			
		||||
    PREFETCHERS=("ALL HW_PREFETCHER CL_PREFETCHER DCU_PREFETCHER IP_PREFETCHER NONE")
 | 
			
		||||
    DEFAULT_PREFETCHERS=("ALL HW_PREFETCHER CL_PREFETCHER DCU_PREFETCHER IP_PREFETCHER NONE")
 | 
			
		||||
else
 | 
			
		||||
    ALL_PREFETCHERS=""
 | 
			
		||||
    PREFETCHERS=("IGNORE")
 | 
			
		||||
    DEFAULT_PREFETCHERS=("IGNORE")
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if [ -z ${PREFETCHERS+x} ]; then
 | 
			
		||||
    PREFETCHERS=${DEFAULT_PREFETCHERS}
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if [ "$OPT_SCHEME" == "gromacs" ]; then
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										52
									
								
								util/gather-bench/.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										52
									
								
								util/gather-bench/.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							@@ -0,0 +1,52 @@
 | 
			
		||||
# Prerequisites
 | 
			
		||||
*.d
 | 
			
		||||
 | 
			
		||||
# Object files
 | 
			
		||||
*.o
 | 
			
		||||
*.ko
 | 
			
		||||
*.obj
 | 
			
		||||
*.elf
 | 
			
		||||
 | 
			
		||||
# Linker output
 | 
			
		||||
*.ilk
 | 
			
		||||
*.map
 | 
			
		||||
*.exp
 | 
			
		||||
 | 
			
		||||
# Precompiled Headers
 | 
			
		||||
*.gch
 | 
			
		||||
*.pch
 | 
			
		||||
 | 
			
		||||
# Libraries
 | 
			
		||||
*.lib
 | 
			
		||||
*.a
 | 
			
		||||
*.la
 | 
			
		||||
*.lo
 | 
			
		||||
 | 
			
		||||
# Shared objects (inc. Windows DLLs)
 | 
			
		||||
*.dll
 | 
			
		||||
*.so
 | 
			
		||||
*.so.*
 | 
			
		||||
*.dylib
 | 
			
		||||
 | 
			
		||||
# Executables
 | 
			
		||||
*.exe
 | 
			
		||||
*.out
 | 
			
		||||
*.app
 | 
			
		||||
*.i*86
 | 
			
		||||
*.x86_64
 | 
			
		||||
*.hex
 | 
			
		||||
 | 
			
		||||
# Debug files
 | 
			
		||||
*.dSYM/
 | 
			
		||||
*.su
 | 
			
		||||
*.idb
 | 
			
		||||
*.pdb
 | 
			
		||||
 | 
			
		||||
# Kernel Module Compile Results
 | 
			
		||||
*.mod*
 | 
			
		||||
*.cmd
 | 
			
		||||
.tmp_versions/
 | 
			
		||||
modules.order
 | 
			
		||||
Module.symvers
 | 
			
		||||
Mkfile.old
 | 
			
		||||
dkms.conf
 | 
			
		||||
							
								
								
									
										21
									
								
								util/gather-bench/LICENSE
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										21
									
								
								util/gather-bench/LICENSE
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,21 @@
 | 
			
		||||
MIT License
 | 
			
		||||
 | 
			
		||||
Copyright (c) 2021 RRZE-HPC
 | 
			
		||||
 | 
			
		||||
Permission is hereby granted, free of charge, to any person obtaining a copy
 | 
			
		||||
of this software and associated documentation files (the "Software"), to deal
 | 
			
		||||
in the Software without restriction, including without limitation the rights
 | 
			
		||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 | 
			
		||||
copies of the Software, and to permit persons to whom the Software is
 | 
			
		||||
furnished to do so, subject to the following conditions:
 | 
			
		||||
 | 
			
		||||
The above copyright notice and this permission notice shall be included in all
 | 
			
		||||
copies or substantial portions of the Software.
 | 
			
		||||
 | 
			
		||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 | 
			
		||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 | 
			
		||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 | 
			
		||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 | 
			
		||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 | 
			
		||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 | 
			
		||||
SOFTWARE.
 | 
			
		||||
							
								
								
									
										126
									
								
								util/gather-bench/Makefile
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										126
									
								
								util/gather-bench/Makefile
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,126 @@
 | 
			
		||||
#CONFIGURE BUILD SYSTEM
TARGET     = gather-bench-$(TAG)
BUILD_DIR  = ./$(TAG)
SRC_DIR    = ./src
MAKE_DIR   = ./
ISA_DIR    = ./src/$(ISA)
# Set Q to an empty value on the command line (make Q=) to echo every command.
Q         ?= @

#DO NOT EDIT BELOW
# config.mk selects TAG/ISA and the feature switches; include_$(TAG).mk
# provides the compiler settings; include_LIKWID.mk adds optional LIKWID flags.
include $(MAKE_DIR)/config.mk
include $(MAKE_DIR)/include_$(TAG).mk
include $(MAKE_DIR)/include_LIKWID.mk
INCLUDES  += -I./src/includes

VPATH      = $(SRC_DIR) ${ISA_DIR}
ASM        = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
ASM       += $(patsubst $(SRC_DIR)/%.f90, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.f90))
# main*.c files are compiled directly in the link step, so they are filtered
# out of the object list.
OBJ        = $(filter-out $(BUILD_DIR)/main%, $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c)))
OBJ       += $(patsubst $(SRC_DIR)/%.cc, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cc))
OBJ       += $(patsubst $(SRC_DIR)/%.cpp, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cpp))
OBJ       += $(patsubst $(SRC_DIR)/%.f90, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.f90))
OBJ       += $(patsubst $(SRC_DIR)/%.F90, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.F90))
OBJ       += $(patsubst $(SRC_DIR)/%.s, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.s))
OBJ       += $(patsubst $(ISA_DIR)/%.S, $(BUILD_DIR)/%.o,$(wildcard $(ISA_DIR)/*.S))
CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(INCLUDES) -DISA_$(ISA)

# Build the variant binary by default when VARIANT is given.
# Indented with spaces on purpose: a leading tab would turn this line into a
# recipe line as soon as any rule precedes it.
ifneq ($(VARIANT),)
    .DEFAULT_GOAL := ${TARGET}-$(VARIANT)
endif

# Feature switches from config.mk are forwarded as preprocessor defines.
ifeq ($(strip $(DATA_LAYOUT)),AOS)
    CPPFLAGS += -DAOS
endif

ifeq ($(strip $(TEST)),true)
    CPPFLAGS += -DTEST
endif

ifeq ($(strip $(PADDING)),true)
    CPPFLAGS += -DPADDING
endif

ifeq ($(strip $(MEASURE_GATHER_CYCLES)),true)
    CPPFLAGS += -DMEASURE_GATHER_CYCLES
endif

ifeq ($(strip $(ONLY_FIRST_DIMENSION)),true)
    CPPFLAGS += -DONLY_FIRST_DIMENSION
endif

ifeq ($(strip $(MEM_TRACER)),true)
    CPPFLAGS += -DMEM_TRACER
endif

# $(BUILD_DIR) is an order-only prerequisite (after the '|') everywhere:
# it only has to exist, and its timestamp must never trigger a rebuild.
# Attaching it to every rule that writes into the directory also keeps
# parallel builds (make -j) from compiling before the directory exists.
${TARGET}: $(OBJ) $(SRC_DIR)/main.c | $(BUILD_DIR)
	@echo "===>  LINKING  $(TARGET)"
	$(Q)${LINKER} ${CPPFLAGS} ${LFLAGS} -o $(TARGET) $(SRC_DIR)/main.c $(OBJ) $(LIBS)

${TARGET}-%: $(OBJ) $(SRC_DIR)/main-%.c | $(BUILD_DIR)
	@echo "===>  LINKING  $(TARGET)-$* "
	$(Q)${LINKER} ${CPPFLAGS} ${LFLAGS} -o $(TARGET)-$* $(SRC_DIR)/main-$*.c $(OBJ) $(LIBS)

asm:  $(ASM) | $(BUILD_DIR)

# C/C++ objects also write a .d dependency file that is included below.
$(BUILD_DIR)/%.o:  %.c | $(BUILD_DIR)
	@echo "===>  COMPILE  $@"
	$(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
	$(Q)$(CC) $(CPPFLAGS) -MT $(@:.d=.o) -MM  $< > $(BUILD_DIR)/$*.d

$(BUILD_DIR)/%.s:  %.c | $(BUILD_DIR)
	@echo "===>  GENERATE ASM  $@"
	$(Q)$(CC) -S $(CPPFLAGS) $(CFLAGS) $< -o $@

$(BUILD_DIR)/%.s:  %.f90 | $(BUILD_DIR)
	@echo "===>  COMPILE  $@"
	$(Q)$(FC) -S  $(FCFLAGS) $< -o $@

$(BUILD_DIR)/%.o:  %.cc | $(BUILD_DIR)
	@echo "===>  COMPILE  $@"
	$(Q)$(CXX) -c $(CPPFLAGS) $(CXXFLAGS) $< -o $@
	$(Q)$(CXX) $(CPPFLAGS) -MT $(@:.d=.o) -MM  $< > $(BUILD_DIR)/$*.d

$(BUILD_DIR)/%.o:  %.cpp | $(BUILD_DIR)
	@echo "===>  COMPILE  $@"
	$(Q)$(CXX) -c $(CPPFLAGS) $(CXXFLAGS) $< -o $@
	$(Q)$(CXX) $(CPPFLAGS) -MT $(@:.d=.o) -MM  $< > $(BUILD_DIR)/$*.d

$(BUILD_DIR)/%.o:  %.f90 | $(BUILD_DIR)
	@echo "===>  COMPILE  $@"
	$(Q)$(FC) -c  $(FCFLAGS) $< -o $@

$(BUILD_DIR)/%.o:  %.F90 | $(BUILD_DIR)
	@echo "===>  COMPILE  $@"
	$(Q)$(FC) -c  $(CPPFLAGS)  $(FCFLAGS) $< -o $@

$(BUILD_DIR)/%.o:  %.s | $(BUILD_DIR)
	@echo "===>  ASSEMBLE  $@"
	$(Q)$(AS)  $(ASFLAGS) $< -o $@

# .S files go through the C compiler driver so that they are preprocessed.
$(BUILD_DIR)/%.o:  %.S | $(BUILD_DIR)
	@echo "===>  ASSEMBLE  $@"
	$(Q)$(CC) -c $(CPPFLAGS) $< -o $@

tags:
	@echo "===>  GENERATE  TAGS"
	$(Q)ctags -R


# -p: succeed when the directory already exists.
$(BUILD_DIR):
	@mkdir -p $(BUILD_DIR)

# Dependency files are not needed (and may not exist) when only cleaning.
ifeq ($(findstring $(MAKECMDGOALS),clean),)
-include $(OBJ:.o=.d)
endif

# asm and tags are phony so that they are re-run on every request.
.PHONY: asm tags clean distclean

clean:
	@echo "===>  CLEAN"
	@rm -rf $(BUILD_DIR)
	@rm -f tags

# Also removes the variant binaries produced by the ${TARGET}-% rule.
distclean: clean
	@echo "===>  DIST CLEAN"
	@rm -f $(TARGET) $(TARGET)-*
	@rm -f tags
 | 
			
		||||
							
								
								
									
										2
									
								
								util/gather-bench/README.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								util/gather-bench/README.md
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,2 @@
 | 
			
		||||
# gather-bench
 | 
			
		||||
An x86 gather instruction performance benchmark.
 | 
			
		||||
							
								
								
									
										22
									
								
								util/gather-bench/config.mk
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										22
									
								
								util/gather-bench/config.mk
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,22 @@
 | 
			
		||||
# Supported: GCC, CLANG, ICC
 | 
			
		||||
TAG ?= ICC
 | 
			
		||||
# Supported: avx2, avx512
 | 
			
		||||
ISA ?= avx512
 | 
			
		||||
# Use likwid?
 | 
			
		||||
ENABLE_LIKWID ?= false
 | 
			
		||||
 | 
			
		||||
# SP or DP
 | 
			
		||||
DATA_TYPE ?= DP
 | 
			
		||||
# AOS or SOA
 | 
			
		||||
DATA_LAYOUT ?= AOS
 | 
			
		||||
# Padding byte for AoS
 | 
			
		||||
PADDING ?= false
 | 
			
		||||
# Measure cycles for each gather separately
 | 
			
		||||
MEASURE_GATHER_CYCLES ?= false
 | 
			
		||||
# Gather data only for first dimension (one gather per iteration)
 | 
			
		||||
ONLY_FIRST_DIMENSION ?= false
 | 
			
		||||
 | 
			
		||||
# Trace memory addresses for cache simulator
 | 
			
		||||
MEM_TRACER ?= false
 | 
			
		||||
# Test correctness of gather kernels
 | 
			
		||||
TEST ?= false
 | 
			
		||||
							
								
								
									
										9
									
								
								util/gather-bench/include_CLANG.mk
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										9
									
								
								util/gather-bench/include_CLANG.mk
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,9 @@
 | 
			
		||||
CC  = clang
 | 
			
		||||
LINKER = $(CC)
 | 
			
		||||
 | 
			
		||||
OPENMP   =# -fopenmp
 | 
			
		||||
CFLAGS   = -Ofast -std=c11 -march=core-avx2 -mavx -mfma  $(OPENMP)
 | 
			
		||||
LFLAGS   = $(OPENMP) -march=core-avx2 -mavx -mfma
 | 
			
		||||
DEFINES  = -D_GNU_SOURCE
 | 
			
		||||
INCLUDES =
 | 
			
		||||
LIBS     =
 | 
			
		||||
							
								
								
									
										11
									
								
								util/gather-bench/include_GCC.mk
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										11
									
								
								util/gather-bench/include_GCC.mk
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,11 @@
 | 
			
		||||
CC  = gcc
 | 
			
		||||
AS  = as
 | 
			
		||||
LINKER = $(CC)
 | 
			
		||||
 | 
			
		||||
OPENMP   = -fopenmp
 | 
			
		||||
CFLAGS   = -Ofast -std=c11 -mavx2 -mfma $(OPENMP)
 | 
			
		||||
ASFLAGS  =
 | 
			
		||||
LFLAGS   = $(OPENMP) -mavx2 -mfma
 | 
			
		||||
DEFINES  = -D_GNU_SOURCE
 | 
			
		||||
INCLUDES =
 | 
			
		||||
LIBS     =
 | 
			
		||||
							
								
								
									
										9
									
								
								util/gather-bench/include_ICC.mk
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										9
									
								
								util/gather-bench/include_ICC.mk
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,9 @@
 | 
			
		||||
CC  = icc
 | 
			
		||||
LINKER = $(CC)
 | 
			
		||||
 | 
			
		||||
OPENMP   = -qopenmp
 | 
			
		||||
CFLAGS   = -Ofast -xhost -std=c11 $(OPENMP)
 | 
			
		||||
LFLAGS   = $(OPENMP)
 | 
			
		||||
DEFINES  = -D_GNU_SOURCE
 | 
			
		||||
INCLUDES =
 | 
			
		||||
LIBS     =
 | 
			
		||||
							
								
								
									
										10
									
								
								util/gather-bench/include_LIKWID.mk
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										10
									
								
								util/gather-bench/include_LIKWID.mk
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,10 @@
 | 
			
		||||
LIKWID_INC ?= -I/usr/local/include
 | 
			
		||||
LIKWID_DEFINES ?= -DLIKWID_PERFMON
 | 
			
		||||
LIKWID_LIB ?= -L/usr/local/lib
 | 
			
		||||
 | 
			
		||||
ifeq ($(strip $(ENABLE_LIKWID)),true)
 | 
			
		||||
INCLUDES += ${LIKWID_INC}
 | 
			
		||||
DEFINES +=  ${LIKWID_DEFINES}
 | 
			
		||||
LIBS += -llikwid
 | 
			
		||||
LFLAGS += ${LIKWID_LIB}
 | 
			
		||||
endif
 | 
			
		||||
							
								
								
									
										57
									
								
								util/gather-bench/src/allocate.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										57
									
								
								util/gather-bench/src/allocate.c
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,57 @@
 | 
			
		||||
/*
 | 
			
		||||
 * =======================================================================================
 | 
			
		||||
 *
 | 
			
		||||
 *      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
 | 
			
		||||
 *      Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
 | 
			
		||||
 *
 | 
			
		||||
 *      Permission is hereby granted, free of charge, to any person obtaining a copy
 | 
			
		||||
 *      of this software and associated documentation files (the "Software"), to deal
 | 
			
		||||
 *      in the Software without restriction, including without limitation the rights
 | 
			
		||||
 *      to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 | 
			
		||||
 *      copies of the Software, and to permit persons to whom the Software is
 | 
			
		||||
 *      furnished to do so, subject to the following conditions:
 | 
			
		||||
 *
 | 
			
		||||
 *      The above copyright notice and this permission notice shall be included in all
 | 
			
		||||
 *      copies or substantial portions of the Software.
 | 
			
		||||
 *
 | 
			
		||||
 *      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 | 
			
		||||
 *      IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 | 
			
		||||
 *      FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 | 
			
		||||
 *      AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 | 
			
		||||
 *      LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 | 
			
		||||
 *      OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 | 
			
		||||
 *      SOFTWARE.
 | 
			
		||||
 *
 | 
			
		||||
 * =======================================================================================
 | 
			
		||||
 */
 | 
			
		||||
#include <stdlib.h>
 | 
			
		||||
#include <stdio.h>
 | 
			
		||||
#include <errno.h>
 | 
			
		||||
 | 
			
		||||
/*
 * Allocate bytesize bytes whose address is a multiple of alignment.
 *
 * Thin wrapper around posix_memalign() that never returns on failure:
 * a message is printed to stderr and the process exits with EXIT_FAILURE.
 *
 * alignment  requested alignment in bytes; posix_memalign() requires a
 *            power of two that is a multiple of sizeof(void*)
 * bytesize   number of bytes to allocate
 *
 * Returns a non-NULL pointer that the caller releases with free().
 */
void* allocate (int alignment, size_t bytesize)
{
    /* Start from NULL: POSIX does not define the contents of the output
     * pointer when posix_memalign() fails, so it must not be read
     * uninitialised on an error path. */
    void* ptr = NULL;
    int errorCode = posix_memalign(&ptr, (size_t) alignment, bytesize);

    if (errorCode == EINVAL) {
        fprintf(stderr,
                "Error: Alignment parameter is not a power of two\n");
        exit(EXIT_FAILURE);
    }

    if (errorCode == ENOMEM) {
        fprintf(stderr,
                "Error: Insufficient memory to fulfill the request\n");
        exit(EXIT_FAILURE);
    }

    /* Any other error code, or a NULL result (which an implementation may
     * return for a zero-sized request), is treated as a failure too. */
    if (errorCode != 0 || ptr == NULL) {
        fprintf(stderr, "Error: posix_memalign failed!\n");
        exit(EXIT_FAILURE);
    }

    return ptr;
}
 | 
			
		||||
							
								
								
									
										63
									
								
								util/gather-bench/src/avx2/gather.S
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										63
									
								
								util/gather-bench/src/avx2/gather.S
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,63 @@
 | 
			
		||||
.intel_syntax noprefix
.data
.align 64
/* Eight doubles of 1.0; not referenced by the code in this file. */
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0

/*
 * gather -- AVX2 gather kernel (GAS, Intel syntax, System V AMD64 registers).
 *
 * In:    rdi = a    base address of the doubles that are gathered
 *        rsi = idx  array of 32-bit indices into a
 *        rdx = N    loop bound, compared (signed) against the counter in rax
 *        rcx = t    output buffer, only written when built with -DTEST;
 *                   the aligned stores need t to be 32-byte aligned
 * Clobb: rax, ymm0-ymm12, flags
 *
 * Every iteration consumes 16 indices (four groups of four) and issues four
 * 4-element gathers. The loop condition is tested at the bottom, so the body
 * runs at least once.
 * NOTE(review): N is presumably a positive multiple of 16 -- confirm with
 * the caller.
 */
.text
.globl gather
.type gather, @function
gather:
    push    rbp
    mov     rbp, rsp
    /* Callee-saved registers are preserved although this routine does not
     * modify them. */
    push    rbx
    push    r12
    push    r13
    push    r14
    push    r15

    xor     rax, rax                    /* rax = current element index */
    vpcmpeqd ymm0, ymm0, ymm0           /* ymm0 = all ones, master gather mask */
.align 16
1:
    /* Four groups of four 32-bit indices. */
    vmovups xmm1, [rsi + rax * 4]
    vmovups xmm2, [rsi + rax * 4 + 16]
    vmovups xmm3, [rsi + rax * 4 + 32]
    vmovups xmm4, [rsi + rax * 4 + 48]
    /* vgatherdpd clears its mask operand, so each gather gets a fresh copy. */
    vmovdqa ymm5, ymm0
    vmovdqa ymm6, ymm0
    vmovdqa ymm7, ymm0
    vmovdqa ymm8, ymm0
    /* Zero the destination registers before gathering into them. */
    vxorpd  ymm9,  ymm9,  ymm9
    vxorpd  ymm10, ymm10, ymm10
    vxorpd  ymm11, ymm11, ymm11
    vxorpd  ymm12, ymm12, ymm12
    /* ymmN[k] = a[idx[...]] for four elements per instruction. */
    vgatherdpd ymm9,  [rdi + xmm1 * 8], ymm5
    vgatherdpd ymm10, [rdi + xmm2 * 8], ymm6
    vgatherdpd ymm11, [rdi + xmm3 * 8], ymm7
    vgatherdpd ymm12, [rdi + xmm4 * 8], ymm8

#ifdef TEST
    /* Store the 16 gathered doubles to t[rax .. rax+15] for verification. */
    vmovapd [rcx + rax * 8],      ymm9
    vmovapd [rcx + rax * 8 + 32], ymm10
    vmovapd [rcx + rax * 8 + 64], ymm11
    vmovapd [rcx + rax * 8 + 96], ymm12
#endif

    add     rax, 16
    cmp     rax, rdx
    jl      1b

    /* Clear the upper ymm halves before returning to compiled code so that
     * later legacy-SSE instructions do not pay AVX/SSE transition penalties. */
    vzeroupper

    pop     r15
    pop     r14
    pop     r13
    pop     r12
    pop     rbx
    mov     rsp, rbp
    pop     rbp
    ret
.size gather, .-gather

/* Mark the object as not needing an executable stack. */
.section .note.GNU-stack,"",@progbits
 | 
			
		||||
							
								
								
									
										71
									
								
								util/gather-bench/src/avx2/gather_aos.S
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										71
									
								
								util/gather-bench/src/avx2/gather_aos.S
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,71 @@
 | 
			
		||||
.intel_syntax noprefix
.data
.align 64
/* Eight doubles of 1.0; not referenced by the code in this file. */
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0

/*
 * gather_aos -- AVX2 gather kernel for an array-of-structures layout
 * (GAS, Intel syntax, System V AMD64 registers).
 *
 * In:    rdi = a    base address of the records; each record starts with
 *                   three consecutive doubles
 *        rsi = idx  array of 32-bit record indices
 *        rdx = N    loop bound, compared (signed) against the counter in rax
 *        rcx = t    output buffer, only written when built with -DTEST
 * Clobb: rax, ymm0-ymm8, flags (rbx and r9 are written under TEST and
 *        restored by the epilogue)
 *
 * The record stride is 3 doubles, or 4 doubles when built with -DPADDING.
 * Every iteration consumes four indices and gathers the doubles at byte
 * offsets 0, 8 and 16 of each selected record. The loop condition is tested
 * at the bottom, so the body runs at least once.
 * NOTE(review): N is presumably a positive multiple of 4 -- confirm with
 * the caller.
 */
.text
.globl gather_aos
.type gather_aos, @function
gather_aos:
    push    rbp
    mov     rbp, rsp
    /* Only rbx and r9 are modified (TEST builds); the other registers are
     * saved and restored without being used. */
    push    rbx
    push    r9
    push    r10
    push    r11
    push    r12
    push    r13
    push    r14
    push    r15

    xor     rax, rax                    /* rax = current element index */
    vpcmpeqd ymm8, ymm8, ymm8           /* ymm8 = all ones, master gather mask */
.align 16
1:

    vmovups xmm3, XMMWORD PTR [rsi + rax * 4]   /* four record indices */
    vpaddd  xmm4, xmm3, xmm3                    /* xmm4 = 2 * idx */
#ifdef PADDING
    vpaddd  xmm3, xmm4, xmm4                    /* xmm3 = 4 * idx (padded record) */
#else
    vpaddd  xmm3, xmm3, xmm4                    /* xmm3 = 3 * idx */
#endif
    /* vgatherdpd clears its mask operand, so each gather gets a fresh copy. */
    vmovdqa ymm5, ymm8
    vmovdqa ymm6, ymm8
    vmovdqa ymm7, ymm8
    /* Zero the destination registers before gathering into them. */
    vxorpd  ymm0, ymm0, ymm0
    vxorpd  ymm1, ymm1, ymm1
    vxorpd  ymm2, ymm2, ymm2
    /* First, second and third double of each selected record. */
    vgatherdpd ymm0, [     rdi + xmm3 * 8], ymm5
    vgatherdpd ymm1, [8  + rdi + xmm3 * 8], ymm6
    vgatherdpd ymm2, [16 + rdi + xmm3 * 8], ymm7

#ifdef TEST
    /* Results go to three consecutive output arrays of N doubles each:
     * t[rax], t[N + rax] and t[2N + rax]. */
    vmovupd [rcx + rax * 8], ymm0
    lea     rbx, [rcx + rdx * 8]
    vmovupd [rbx + rax * 8], ymm1
    lea     r9,  [rbx + rdx * 8]
    vmovupd [r9  + rax * 8], ymm2
#endif

    add     rax, 4
    cmp     rax, rdx
    jl      1b

    /* Clear the upper ymm halves before returning to compiled code so that
     * later legacy-SSE instructions do not pay AVX/SSE transition penalties. */
    vzeroupper

    pop     r15
    pop     r14
    pop     r13
    pop     r12
    pop     r11
    pop     r10
    pop     r9
    pop     rbx
    mov     rsp, rbp
    pop     rbp
    ret
.size gather_aos, .-gather_aos

/* Mark the object as not needing an executable stack. */
.section .note.GNU-stack,"",@progbits
 | 
			
		||||
							
								
								
									
										67
									
								
								util/gather-bench/src/avx2/gather_soa.S
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										67
									
								
								util/gather-bench/src/avx2/gather_soa.S
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,67 @@
 | 
			
		||||
.intel_syntax noprefix
.data
.align 64
/* Eight doubles of 1.0; not referenced by the code in this file. */
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0

/*
 * gather_soa -- AVX2 gather kernel for a structure-of-arrays layout
 * (GAS, Intel syntax, System V AMD64 registers).
 *
 * In:    rdi = a    base address of three consecutive arrays of N doubles
 *        rsi = idx  array of 32-bit indices
 *        rdx = N    array length and loop bound, compared (signed) against
 *                   the counter in rax
 *        rcx = t    output buffer, only written when built with -DTEST
 * Clobb: rax, r8, ymm0-ymm8, flags (rbx, r9 and r10 are written and
 *        restored by the epilogue)
 *
 * Every iteration consumes four indices and gathers the same four positions
 * from each of the three arrays. The loop condition is tested at the bottom,
 * so the body runs at least once.
 * NOTE(review): N is presumably a positive multiple of 4 -- confirm with
 * the caller.
 */
.text
.globl gather_soa
.type gather_soa, @function
gather_soa:
    push    rbp
    mov     rbp, rsp
    /* rbx, r9 and r10 are modified below; the remaining registers are saved
     * and restored without being used. */
    push    rbx
    push    r9
    push    r10
    push    r11
    push    r12
    push    r13
    push    r14
    push    r15

    xor     rax, rax                    /* rax = current element index */
    vpcmpeqd ymm8, ymm8, ymm8           /* ymm8 = all ones, master gather mask */
    lea     r8, [rdi + rdx * 8]         /* r8 = second array (a + N doubles) */
    lea     r9, [r8  + rdx * 8]         /* r9 = third array (a + 2N doubles) */
.align 16
1:

    vmovups xmm3, XMMWORD PTR [rsi + rax * 4]   /* four indices */
    /* vgatherdpd clears its mask operand, so each gather gets a fresh copy. */
    vmovdqa ymm5, ymm8
    vmovdqa ymm6, ymm8
    vmovdqa ymm7, ymm8
    /* Zero the destination registers before gathering into them. */
    vxorpd  ymm0, ymm0, ymm0
    vxorpd  ymm1, ymm1, ymm1
    vxorpd  ymm2, ymm2, ymm2
    /* Same indices applied to each of the three arrays. */
    vgatherdpd ymm0, [rdi + xmm3 * 8], ymm5
    vgatherdpd ymm1, [r8  + xmm3 * 8], ymm6
    vgatherdpd ymm2, [r9  + xmm3 * 8], ymm7

#ifdef TEST
    /* Results go to three consecutive output arrays of N doubles each:
     * t[rax], t[N + rax] and t[2N + rax]. */
    vmovupd [rcx + rax * 8], ymm0
    lea     rbx, [rcx + rdx * 8]
    vmovupd [rbx + rax * 8], ymm1
    lea     r10, [rbx + rdx * 8]
    vmovupd [r10 + rax * 8], ymm2
#endif

    add     rax, 4
    cmp     rax, rdx
    jl      1b

    /* Clear the upper ymm halves before returning to compiled code so that
     * later legacy-SSE instructions do not pay AVX/SSE transition penalties. */
    vzeroupper

    pop     r15
    pop     r14
    pop     r13
    pop     r12
    pop     r11
    pop     r10
    pop     r9
    pop     rbx
    mov     rsp, rbp
    pop     rbp
    ret
.size gather_soa, .-gather_soa

/* Mark the object as not needing an executable stack. */
.section .note.GNU-stack,"",@progbits
 | 
			
		||||
							
								
								
									
										62
									
								
								util/gather-bench/src/avx512/gather.S
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										62
									
								
								util/gather-bench/src/avx512/gather.S
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,62 @@
 | 
			
		||||
.intel_syntax noprefix
.data
.align 64
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0

#-----------------------------------------------------------------------
# gather (AVX-512): gather doubles from one array through 32-bit indices
# ABI:   System V AMD64, leaf function, no stack usage
# In:    rdi -> a    source array of doubles
#        rsi -> idx  32-bit element indices
#        rdx -> N    number of indices; handled in whole groups of 32
#                    (four gathers of 8), so a trailing partial group
#                    is processed as a full group
#        rcx -> t    output buffer, written only in TEST builds with
#                    aligned 64-byte stores, so t must be 64-byte aligned
# Clobb: rax, zmm0-zmm7, k1-k4, flags
#-----------------------------------------------------------------------
.text
.globl gather
.type gather, @function
gather:
    xor     eax, eax                    # rax = element offset into idx
    test    rdx, rdx
    jle     2f                          # nothing to do for N <= 0

.align 16
1:
    # A register compared with itself yields an all-ones mask. The
    # gather clears its mask as elements complete, so the masks are
    # rebuilt on every pass.
    vpcmpeqb k1, xmm0, xmm0
    vpcmpeqb k2, xmm0, xmm0
    vpcmpeqb k3, xmm0, xmm0
    vpcmpeqb k4, xmm0, xmm0

    vmovdqu ymm0, [rsi + rax * 4]       # indices  0..7  of this group
    vmovdqu ymm1, [rsi + rax * 4 + 32]  # indices  8..15
    vmovdqu ymm2, [rsi + rax * 4 + 64]  # indices 16..23
    vmovdqu ymm3, [rsi + rax * 4 + 96]  # indices 24..31

    # Zeroing the destinations removes the dependency on the previous pass.
    vpxord  zmm4, zmm4, zmm4
    vpxord  zmm5, zmm5, zmm5
    vpxord  zmm6, zmm6, zmm6
    vpxord  zmm7, zmm7, zmm7

    vgatherdpd zmm4{k1}, [rdi + ymm0 * 8]
    vgatherdpd zmm5{k2}, [rdi + ymm1 * 8]
    vgatherdpd zmm6{k3}, [rdi + ymm2 * 8]
    vgatherdpd zmm7{k4}, [rdi + ymm3 * 8]

#ifdef TEST
    vmovapd [rcx + rax * 8],       zmm4
    vmovapd [rcx + rax * 8 + 64],  zmm5
    vmovapd [rcx + rax * 8 + 128], zmm6
    vmovapd [rcx + rax * 8 + 192], zmm7
#endif

    add     rax, 32
    cmp     rax, rdx
    jl      1b

2:
    vzeroupper                          # avoid AVX/SSE transition stalls in the caller
    ret
.size gather, .-gather
 | 
			
		||||
							
								
								
									
										151
									
								
								util/gather-bench/src/avx512/gather_aos.S
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										151
									
								
								util/gather-bench/src/avx512/gather_aos.S
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,151 @@
 | 
			
		||||
.intel_syntax noprefix
.data
.align 64
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0

#-----------------------------------------------------------------------
# gather_aos (AVX-512): gather the components of interleaved records
# ABI:   System V AMD64
# In:    rdi -> a       array of records of doubles; the record stride
#                       is 4 doubles with PADDING defined, otherwise 3
#        rsi -> idx     32-bit record indices
#        rdx -> N       number of indices; handled in whole groups of 8,
#                       so a trailing partial group is processed as a
#                       full group
#        rcx -> t       output buffer, written only in TEST builds
#                       (three regions of N doubles each)
#        r8  -> cycles  written only with MEASURE_GATHER_CYCLES: one
#                       64-bit slot per timed gather, three slots
#                       (24 bytes) reserved per group of 8 indices
# Build switches:
#        ONLY_FIRST_DIMENSION  gather the first component only
#        MEASURE_GATHER_CYCLES time every gather with lfence + rdtsc
# Clobb: rax, r9, r10, r11, zmm0-zmm4, k1-k3, flags
#        (rbx is used and therefore saved and restored)
#-----------------------------------------------------------------------
.text
.globl gather_aos
.type gather_aos, @function
gather_aos:
    push    rbx                         # callee-saved; used as scratch below

    xor     eax, eax                    # rax = element offset into idx
    test    rdx, rdx
    jle     2f                          # nothing to do for N <= 0

.align 16
1:
    # Turn record indices into double offsets: 3*i, or 4*i with PADDING.
    vmovdqu ymm3, YMMWORD PTR [rsi + rax * 4]
    vpaddd  ymm4, ymm3, ymm3            # 2*i
#ifdef PADDING
    vpaddd  ymm3, ymm4, ymm4            # 4*i
#else
    vpaddd  ymm3, ymm3, ymm4            # 3*i
#endif

    # A register compared with itself yields an all-ones mask. The
    # gather clears its mask as elements complete, so the masks are
    # rebuilt on every pass.
    vpcmpeqb k1, xmm5, xmm5
#ifndef ONLY_FIRST_DIMENSION
    vpcmpeqb k2, xmm5, xmm5
    vpcmpeqb k3, xmm5, xmm5
#endif

    # Zeroing the destinations removes the dependency on the previous pass.
    vpxord  zmm0, zmm0, zmm0
#ifndef ONLY_FIRST_DIMENSION
    vpxord  zmm1, zmm1, zmm1
    vpxord  zmm2, zmm2, zmm2
#endif

#ifdef MEASURE_GATHER_CYCLES

    # rdtsc overwrites rax and rdx, so the loop counter and N are parked
    # in r9 and r10 for the duration of the timed section.
    mov     r9,  rax
    mov     r10, rdx
    lea     r11, [rax + rax * 2]        # byte offset into cycles: 24 bytes per group

    lfence
    rdtsc
    mov     ebx, eax                    # start stamp (low 32 bits of the TSC)
    vgatherdpd zmm0{k1}, [rdi + ymm3 * 8]
    lfence
    rdtsc
    sub     eax, ebx                    # elapsed ticks, modulo 2^32
    movnti  [r8 + r11], rax

#ifndef ONLY_FIRST_DIMENSION
    lfence
    rdtsc
    mov     ebx, eax
    vgatherdpd zmm1{k2}, [8 + rdi + ymm3 * 8]
    lfence
    rdtsc
    sub     eax, ebx
    movnti  [8 + r8 + r11], rax

    lfence
    rdtsc
    mov     ebx, eax
    vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8]
    lfence
    rdtsc
    sub     eax, ebx
    movnti  [16 + r8 + r11], rax
#endif // ONLY_FIRST_DIMENSION

    mov     rax, r9
    mov     rdx, r10

#else // MEASURE_GATHER_CYCLES

    vgatherdpd zmm0{k1}, [     rdi + ymm3 * 8]
#ifndef ONLY_FIRST_DIMENSION
    vgatherdpd zmm1{k2}, [8 +  rdi + ymm3 * 8]
    vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8]
#endif

#endif // MEASURE_GATHER_CYCLES

#ifdef TEST
    # NOTE(review): with ONLY_FIRST_DIMENSION the second and third
    # stores write registers that were not gathered in this build.
    vmovupd [rcx + rax * 8], zmm0
    lea     rbx, [rcx + rdx * 8]        # second output region
    vmovupd [rbx + rax * 8], zmm1
    lea     r9,  [rbx + rdx * 8]        # third output region
    vmovupd [r9  + rax * 8], zmm2
#endif

    add     rax, 8
    cmp     rax, rdx
    jl      1b

2:
    vzeroupper                          # avoid AVX/SSE transition stalls in the caller
    pop     rbx
    ret
.size gather_aos, .-gather_aos
 | 
			
		||||
							
								
								
									
										147
									
								
								util/gather-bench/src/avx512/gather_md_aos.S
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										147
									
								
								util/gather-bench/src/avx512/gather_md_aos.S
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,147 @@
 | 
			
		||||
.intel_syntax noprefix
.data
.align 64
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0

.section .rodata, "a"
.align 64
# Lane numbers 0..7; compared against the leftover count to build the tail mask.
.ymm_reg_mask.1:
	.long	0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
	.type	.ymm_reg_mask.1,@object
	.size	.ymm_reg_mask.1,32
	.align 8

#-----------------------------------------------------------------------
# gather_md_aos (AVX-512): gather the components of the records named by
# one neighbor list, eight at a time, with a masked pass for the tail
# ABI:   System V AMD64, leaf function, no stack usage
# In:    rdi -> a            array of records of doubles; the record
#                            stride is 4 doubles with PADDING defined,
#                            otherwise 3
#        rsi -> neighbors    32-bit record indices
#        rdx =  numneighs[i] number of valid indices in the list
#        rcx -> &t[t_idx]    output buffer, written only in TEST builds
#        r8  =  ntest        distance in doubles between the three
#                            output regions (TEST builds only)
# Out:   rax = number of indices processed
# Build switches:
#        ONLY_FIRST_DIMENSION  gather the first component only
# Clobb: rax, rdx, r9, r10, zmm0-zmm4, ymm6, ymm7, k1-k3, flags
#-----------------------------------------------------------------------
.text
.globl gather_md_aos
.type gather_md_aos, @function
gather_md_aos:
    vmovdqu ymm7, YMMWORD PTR .ymm_reg_mask.1[rip]
#ifdef TEST
    lea     r9,  [rcx + r8 * 8]         # second output region
    lea     r10, [r9  + r8 * 8]         # third output region
#endif
    xor     eax, eax                    # rax = indices consumed so far

    # rdx counts the indices still to be processed. Lists shorter than
    # one full vector go straight to the masked tail so that no index
    # beyond the end of the list is ever loaded or dereferenced.
    cmp     rdx, 8
    jl      2f

.align 16
1:
    # Turn record indices into double offsets: 3*i, or 4*i with PADDING.
    vmovdqu ymm3, YMMWORD PTR [rsi + rax * 4]
    vpaddd  ymm4, ymm3, ymm3            # 2*i
#ifdef PADDING
    vpaddd  ymm3, ymm4, ymm4            # 4*i
#else
    vpaddd  ymm3, ymm3, ymm4            # 3*i
#endif

    # A register compared with itself yields an all-ones mask. The
    # gather clears its mask as elements complete, so the masks are
    # rebuilt on every pass.
    vpcmpeqb k1, xmm5, xmm5
#ifndef ONLY_FIRST_DIMENSION
    vpcmpeqb k2, xmm5, xmm5
    vpcmpeqb k3, xmm5, xmm5
#endif

    # Zeroing the destinations removes the dependency on the previous pass.
    vpxord  zmm0, zmm0, zmm0
#ifndef ONLY_FIRST_DIMENSION
    vpxord  zmm1, zmm1, zmm1
    vpxord  zmm2, zmm2, zmm2
#endif

    vgatherdpd zmm0{k1}, [     rdi + ymm3 * 8]
#ifndef ONLY_FIRST_DIMENSION
    vgatherdpd zmm1{k2}, [8 +  rdi + ymm3 * 8]
    vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8]
#endif

#ifdef TEST
    vmovupd [rcx + rax * 8], zmm0
    vmovupd [r9  + rax * 8], zmm1
    vmovupd [r10 + rax * 8], zmm2
#endif

    add     rax, 8
    sub     rdx, 8
    cmp     rdx, 8
    jge     1b

2:
    # Tail: between 1 and 7 indices are left (or none at all).
    test    rdx, rdx
    jle     3f

    vpbroadcastd ymm6, edx
    vpcmpgtd k1, ymm6, ymm7             # lane j is active when j < leftover count
    vmovdqu32 ymm3{k1}{z}, YMMWORD PTR [rsi + rax * 4]
    vpaddd  ymm4, ymm3, ymm3            # 2*i
#ifdef PADDING
    vpaddd  ymm3, ymm4, ymm4            # 4*i
#else
    vpaddd  ymm3, ymm3, ymm4            # 3*i
#endif

    # Inactive lanes keep the previous destination contents, so the
    # destinations must really be zero before the masked gathers.
    vpxord  zmm0, zmm0, zmm0
#ifndef ONLY_FIRST_DIMENSION
    kmovw   k2, k1                      # copy before the first gather consumes k1
    kmovw   k3, k1
    vpxord  zmm1, zmm1, zmm1
    vpxord  zmm2, zmm2, zmm2
#endif

    vgatherdpd zmm0{k1}, [     rdi + ymm3 * 8]
#ifndef ONLY_FIRST_DIMENSION
    vgatherdpd zmm1{k2}, [8 +  rdi + ymm3 * 8]
    vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8]
#endif

#ifdef TEST
    # Full-width stores: eight doubles are written per region even for
    # a partial tail, so the output regions need room for that.
    vmovupd [rcx + rax * 8], zmm0
    vmovupd [r9  + rax * 8], zmm1
    vmovupd [r10 + rax * 8], zmm2
#endif

    add     rax, rdx

3:
    vzeroupper                          # avoid AVX/SSE transition stalls in the caller
    ret
.size gather_md_aos, .-gather_md_aos
 | 
			
		||||
							
								
								
									
										67
									
								
								util/gather-bench/src/avx512/gather_soa.S
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										67
									
								
								util/gather-bench/src/avx512/gather_soa.S
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,67 @@
 | 
			
		||||
.intel_syntax noprefix
.data
.align 64
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0

#-----------------------------------------------------------------------
# gather_soa (AVX-512): gather doubles from three back-to-back arrays
# ABI:   System V AMD64, leaf function, no stack usage
# In:    rdi -> a    first array; the second and third arrays are read
#                    at a + N*8 and a + 2*N*8 bytes
#        rsi -> idx  32-bit element indices
#        rdx -> N    number of indices; handled in whole groups of 8,
#                    so a trailing partial group is processed as a
#                    full group
#        rcx -> t    output buffer, written only in TEST builds
#                    (three regions of N doubles each)
# Clobb: rax, r8, r9, r10, r11, zmm0-zmm3, k1-k3, flags
#-----------------------------------------------------------------------
.text
.globl gather_soa
.type gather_soa, @function
gather_soa:
    xor     eax, eax                    # rax = element offset into idx
    test    rdx, rdx
    jle     2f                          # nothing to do for N <= 0

    lea     r8, [rdi + rdx * 8]         # base of second array
    lea     r9, [r8  + rdx * 8]         # base of third array
#ifdef TEST
    lea     r10, [rcx + rdx * 8]        # second output region
    lea     r11, [r10 + rdx * 8]        # third output region
#endif

.align 16
1:
    vmovdqu ymm3, YMMWORD PTR [rsi + rax * 4]   # eight 32-bit indices

    # A register compared with itself yields an all-ones mask. The
    # gather clears its mask as elements complete, so the masks are
    # rebuilt on every pass.
    vpcmpeqb k1, xmm5, xmm5
    vpcmpeqb k2, xmm5, xmm5
    vpcmpeqb k3, xmm5, xmm5

    # Zeroing the destinations removes the dependency on the previous pass.
    vpxord  zmm0, zmm0, zmm0
    vpxord  zmm1, zmm1, zmm1
    vpxord  zmm2, zmm2, zmm2

    vgatherdpd zmm0{k1}, [rdi + ymm3 * 8]
    vgatherdpd zmm1{k2}, [r8  + ymm3 * 8]
    vgatherdpd zmm2{k3}, [r9  + ymm3 * 8]

#ifdef TEST
    vmovupd [rcx + rax * 8], zmm0
    vmovupd [r10 + rax * 8], zmm1
    vmovupd [r11 + rax * 8], zmm2
#endif

    add     rax, 8
    cmp     rax, rdx
    jl      1b

2:
    vzeroupper                          # avoid AVX/SSE transition stalls in the caller
    ret
.size gather_soa, .-gather_soa
 | 
			
		||||
							
								
								
									
										23
									
								
								util/gather-bench/src/avx512/load_aos.S
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								util/gather-bench/src/avx512/load_aos.S
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,23 @@
 | 
			
		||||
.intel_syntax noprefix
.data
.align 64
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0

#-----------------------------------------------------------------------
# load_aos (AVX-512): load three consecutive doubles and replicate each
# one across a full 512-bit register
# ABI:   System V AMD64, leaf function, no stack usage
# In:    rdi -> &a[i * snbytes], address of the first of three doubles
# Out:   nothing is returned; the replicated values are left in
#        zmm3, zmm4 and zmm5
# Clobb: xmm0-xmm2, zmm3-zmm5
#-----------------------------------------------------------------------
.text
.globl load_aos
.type load_aos, @function
load_aos:
    vmovsd       xmm0, QWORD PTR [rdi]          # first double
    vbroadcastsd zmm3, xmm0

    vmovsd       xmm1, QWORD PTR [8  + rdi]     # second double
    vbroadcastsd zmm4, xmm1

    vmovsd       xmm2, QWORD PTR [16 + rdi]     # third double
    vbroadcastsd zmm5, xmm2

    ret
.size load_aos, .-load_aos
 | 
			
		||||
							
								
								
									
										32
									
								
								util/gather-bench/src/includes/allocate.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										32
									
								
								util/gather-bench/src/includes/allocate.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,32 @@
 | 
			
		||||
/*
 | 
			
		||||
 * =======================================================================================
 | 
			
		||||
 *
 | 
			
		||||
 *      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
 | 
			
		||||
 *      Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
 | 
			
		||||
 *
 | 
			
		||||
 *      Permission is hereby granted, free of charge, to any person obtaining a copy
 | 
			
		||||
 *      of this software and associated documentation files (the "Software"), to deal
 | 
			
		||||
 *      in the Software without restriction, including without limitation the rights
 | 
			
		||||
 *      to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 | 
			
		||||
 *      copies of the Software, and to permit persons to whom the Software is
 | 
			
		||||
 *      furnished to do so, subject to the following conditions:
 | 
			
		||||
 *
 | 
			
		||||
 *      The above copyright notice and this permission notice shall be included in all
 | 
			
		||||
 *      copies or substantial portions of the Software.
 | 
			
		||||
 *
 | 
			
		||||
 *      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 | 
			
		||||
 *      IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 | 
			
		||||
 *      FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 | 
			
		||||
 *      AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 | 
			
		||||
 *      LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 | 
			
		||||
 *      OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 | 
			
		||||
 *      SOFTWARE.
 | 
			
		||||
 *
 | 
			
		||||
 * =======================================================================================
 | 
			
		||||
 */
 | 
			
		||||
#ifndef __ALLOCATE_H_
#define __ALLOCATE_H_

#include <stddef.h> /* size_t, so this header compiles on its own */

/*
 * Allocate a buffer of `bytesize` bytes.
 * NOTE(review): presumably the returned pointer is aligned to `alignment`
 * bytes; the implementation is not visible here, so confirm against the
 * definition before relying on it.
 */
extern void* allocate (int alignment, size_t bytesize);

#endif
 | 
			
		||||
Some files were not shown because too many files have changed in this diff Show More
		Reference in New Issue
	
	Block a user