33 Commits

Author SHA1 Message Date
rafaelravedutti
a6a269703d Merge pull request #7 from RRZE-HPC/mucosim23
Mucosim23
2024-01-17 15:14:08 +01:00
TejeshPala
7ee250161a omp_get_max_threads instead of omp_get_num_threads for gcc compiler adaption
Signed-off-by: TejeshPala <tejesh.pala@fau.de>
2024-01-13 15:09:03 +01:00
TejeshPala
c73efea786 include openmp in ICC
Signed-off-by: TejeshPala <tejesh.pala@fau.de>
2024-01-11 17:16:17 +01:00
TejeshPala
4cfa664533 schedule options for force kernels and to print in main fn
Signed-off-by: TejeshPala <tejesh.pala@fau.de>
2024-01-11 17:09:18 +01:00
Rafael Ravedutti
1837403326 Merge branch 'master' of github.com:RRZE-HPC/MD-Bench 2023-12-13 10:52:55 +01:00
Rafael Ravedutti
02629612a9 Fix explicit types for CUDA and provide option to write initial state of system
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2023-12-13 10:52:47 +01:00
TEJESH PALA
ce00aa0042 Merge pull request #6 from RRZE-HPC/mucosim23
omp print threads
2023-11-21 17:11:18 +01:00
TejeshPala
c4e5e87265 omp print threads 2023-11-21 15:31:27 +01:00
Rafael Ravedutti
da3b1dd53f Add extended parameter option --param
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2023-11-21 15:27:11 +01:00
Rafael Ravedutti
2f13291817 Change function get_num_threads to get_cuda_num_threads
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2023-11-21 14:40:19 +01:00
Rafael Ravedutti
a460fffa19 Fix PBC case
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2023-10-10 12:53:43 +02:00
19209bdcce Cleanup and move gather-bench to util folder 2023-08-15 15:21:21 +02:00
Rafael Ravedutti
151f0c0e6f Add extendend param option
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2023-05-29 02:27:32 +02:00
Rafael Ravedutti
72f486f9bf Merge branch 'master' of github.com:RRZE-HPC/MD-Bench 2023-04-09 03:44:53 +02:00
Rafael Ravedutti
8253b31ee0 Include masked out interactions from remainder in atoms_outside_cutoff
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2023-04-09 03:44:21 +02:00
Rafael Ravedutti
e206c3566d Merge branch 'master' of github.com:RRZE-HPC/MD-Bench 2023-04-09 01:23:45 +02:00
Rafael Ravedutti
7ff1673399 Update config.mk with SORT_ATOMS
Signed-off-by: Rafael Ravedutti <rafael.r.ravedutti@fau.de>
2023-04-09 01:23:39 +02:00
Rafael Ravedutti
b6982d56f5 Fix atom sorting
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2023-04-09 01:19:12 +02:00
Rafael Ravedutti
1ad981a059 Add static analysis for gromacs-avx2-dp on Zen3
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2023-04-09 00:07:04 +02:00
Rafael Ravedutti
c438fc6832 Fix GROMACS AVX2 code
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2023-04-07 21:54:07 +02:00
Rafael Ravedutti
17e239ed6d Add uiCA reference to its analyses
Signed-off-by: Rafael Ravedutti <rafael.r.ravedutti@fau.de>
2023-04-05 23:58:52 +02:00
Rafael Ravedutti
d151b9b3e4 Update scripts with division factor
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2023-04-05 23:56:35 +02:00
Rafael Ravedutti
98257b746c Add scripts to properly generate agr data
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2023-04-05 23:19:48 +02:00
Rafael Ravedutti
a101f8588a Add analyses with llvm-mca
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2023-04-05 22:11:55 +02:00
Rafael Ravedutti
c14a6b2186 Add outputs for uiCA
Signed-off-by: Rafael Ravedutti <rafael.r.ravedutti@fau.de>
2023-04-05 19:51:09 +02:00
Rafael Ravedutti
300776f512 Add outputs for new analyses
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2023-04-05 19:48:04 +02:00
Rafael Ravedutti
4e5fe27c0f Add object files for new static analyses
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2023-04-05 19:46:05 +02:00
Rafael Ravedutti
989bec2c7d Add first analyses with GROMACS changes
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2023-04-05 02:44:50 +02:00
Rafael Ravedutti
2971ddcc63 Separate log by hostname and allow to set prefetchers to be used
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2023-04-04 21:56:03 +02:00
Rafael Ravedutti
5341938b60 Increase cutoff for Argon case
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2023-04-03 15:06:32 +02:00
Rafael Ravedutti
039de0be99 Fix stubbed versions and debug messages
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2023-03-30 03:49:57 +02:00
Rafael Ravedutti
43259eb3cf Adjust neighbor lists layout to keep neighbor ids contiguous in memory
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2023-03-30 01:57:26 +02:00
Rafael Ravedutti
3eb7170a65 Adapt stubbed version for new neighbor lists in GROMACS
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2023-03-29 21:54:33 +02:00
108 changed files with 2425 additions and 28818 deletions

View File

@@ -30,6 +30,10 @@ ifneq ($(ASM_SYNTAX), ATT)
ASFLAGS += -masm=intel
endif
# Append -DSORT_ATOMS to DEFINES when SORT_ATOMS, with surrounding whitespace
# stripped, is exactly "true".
ifeq ($(strip $(SORT_ATOMS)),true)
DEFINES += -DSORT_ATOMS
endif
# Same pattern for EXPLICIT_TYPES: append -DEXPLICIT_TYPES to DEFINES when the
# stripped value is exactly "true".
ifeq ($(strip $(EXPLICIT_TYPES)),true)
DEFINES += -DEXPLICIT_TYPES
endif
@@ -152,7 +156,7 @@ $(BUILD_DIR)/%.o: %.s
# clean: print a banner, then remove the build directory, the
# MDBench-$(IDENTIFIER) path, everything matching $(TARGET)* and the tags file.
# The leading '@' keeps make from echoing each rm command.
clean:
$(info ===> CLEAN)
@rm -rf $(BUILD_DIR)
@rm -rf MDBench-$(IDENTIFIER)
@rm -rf $(TARGET)*
@rm -f tags
cleanall:

View File

View File

@@ -1,626 +0,0 @@
# computeForce -- ICC 19.0.5 generated listing (AT&T syntax, AVX-512), see the
# mark_description lines for the exact compiler flags.
# In this listing the floating-point instructions of all three inner loops are
# commented out; the neighbor-index loads, the vgatherdpd instructions and the
# loop control remain active.
# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.0.5.281 Build 20190815";
# mark_description "-I/mnt/opt/likwid-5.2-dev/include -I./src/includes -S -D_GNU_SOURCE -DLIKWID_PERFMON -DAOS -DPRECISION=2 -DN";
# mark_description "EIGHBORS_LOOP_RUNS=1 -DVECTOR_WIDTH=8 -DALIGNMENT=64 -restrict -Ofast -xCORE-AVX512 -qopt-zmm-usage=high -o ";
# mark_description "ICC/force.s";
.file "force.c"
.text
..TXTST0:
.L_2__routine_start_computeForce_0:
# -- Begin computeForce
.text
# mark_begin;
.align 16,0x90
.globl computeForce
# --- computeForce(Parameter *, Atom *, Neighbor *, int, int, int)
# Returns (in xmm0) the difference between two getTimeStamp() results taken
# at entry and just before the epilogue.
# The three int arguments (ecx, r8d, r9d) are overwritten before they are
# ever read in this listing.
computeForce:
# parameter 1: %rdi
# parameter 2: %rsi
# parameter 3: %rdx
# parameter 4: %ecx
# parameter 5: %r8d
# parameter 6: %r9d
..B1.1: # Preds ..B1.0
# Execution count [1.00e+00]
.cfi_startproc
..___tag_value_computeForce.1:
..L2:
#121.112
# Prologue: keep rbp as frame pointer, realign rsp to 64 bytes, then save the
# callee-saved registers used below. 5 pushes (40 bytes) + 88 bytes of locals
# = 128 bytes, so rsp is still 64-byte aligned at every call site.
pushq %rbp #121.112
.cfi_def_cfa_offset 16
movq %rsp, %rbp #121.112
.cfi_def_cfa 6, 16
.cfi_offset 6, -16
andq $-64, %rsp #121.112
pushq %r12 #121.112
pushq %r13 #121.112
pushq %r14 #121.112
pushq %r15 #121.112
pushq %rbx #121.112
subq $88, %rsp #121.112
# eax = 0 ahead of the call (SysV AMD64: al holds the vector-register count
# for callees that may be variadic).
xorl %eax, %eax #124.16
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22
# Move the three pointer arguments into callee-saved registers so they
# survive the call: r15 = Neighbor*, r12 = Atom*, rbx = Parameter*.
movq %rdx, %r15 #121.112
movq %rsi, %r12 #121.112
movq %rdi, %rbx #121.112
..___tag_value_computeForce.11:
# getTimeStamp()
call getTimeStamp #124.16
..___tag_value_computeForce.12:
# LOE rbx r12 r15 xmm0
..B1.51: # Preds ..B1.1
# Execution count [1.00e+00]
# Keep the start timestamp in the 24(%rsp) slot; it is subtracted from the
# end timestamp right before the epilogue.
vmovsd %xmm0, 24(%rsp) #124.16[spill]
# LOE rbx r12 r15
# Load the atom count and three array pointers from the Atom argument (r12)
# and three scalar doubles from the Parameter argument (rbx), then store zero
# into the first <count> 8-byte elements of each of the three arrays.
# NOTE(review): struct field names are not visible in this listing; offsets
# 64/72/80 are presumably the fx/fy/fz force arrays and offset 4 the local
# atom count -- confirm against the Atom declaration.
..B1.2: # Preds ..B1.51
# Execution count [1.00e+00]
movl 4(%r12), %r13d #125.18
movq 64(%r12), %r9 #127.20
movq 72(%r12), %r14 #127.45
movq 80(%r12), %r8 #127.70
vmovsd 72(%rbx), %xmm2 #129.27
vmovsd 8(%rbx), %xmm1 #130.23
vmovsd (%rbx), %xmm0 #131.24
# count <= 0: nothing to zero and no atom loop; go straight to the marker calls.
testl %r13d, %r13d #134.24
jle ..B1.43 # Prob 50% #134.24
# LOE r8 r9 r12 r14 r15 r13d xmm0 xmm1 xmm2
..B1.3: # Preds ..B1.2
# Execution count [1.00e+00]
# Zeroing loop setup, unrolled by two: rbx = pair counter, rcx = byte offset,
# rdx = count / 2 pairs, rax = 0 is the value stored, esi = 1 is the 1-based
# index of the tail element if the pair loop is skipped.
xorl %ebx, %ebx #134.5
movl %r13d, %edx #134.5
xorl %ecx, %ecx #134.5
movl $1, %esi #134.5
xorl %eax, %eax #135.17
shrl $1, %edx #134.5
je ..B1.7 # Prob 9% #134.5
# LOE rax rdx rcx rbx r8 r9 r12 r14 r15 esi r13d xmm0 xmm1 xmm2
..B1.5: # Preds ..B1.3 ..B1.5
# Execution count [2.50e+00]
# Each iteration clears two consecutive elements in each of the three arrays.
movq %rax, (%rcx,%r9) #135.9
incq %rbx #134.5
movq %rax, (%rcx,%r14) #136.9
movq %rax, (%rcx,%r8) #137.9
movq %rax, 8(%rcx,%r9) #135.9
movq %rax, 8(%rcx,%r14) #136.9
movq %rax, 8(%rcx,%r8) #137.9
addq $16, %rcx #134.5
cmpq %rdx, %rbx #134.5
jb ..B1.5 # Prob 63% #134.5
# LOE rax rdx rcx rbx r8 r9 r12 r14 r15 r13d xmm0 xmm1 xmm2
..B1.6: # Preds ..B1.5
# Execution count [9.00e-01]
# esi = 2 * pairs + 1: 1-based index of the first element not yet cleared.
lea 1(%rbx,%rbx), %esi #135.9
# LOE rax r8 r9 r12 r14 r15 esi r13d xmm0 xmm1 xmm2
..B1.7: # Preds ..B1.3 ..B1.6
# Execution count [1.00e+00]
# Tail: if (esi - 1) < count (unsigned) one odd element is left to clear.
lea -1(%rsi), %edx #134.5
cmpl %r13d, %edx #134.5
jae ..B1.9 # Prob 9% #134.5
# LOE rax r8 r9 r12 r14 r15 esi r13d xmm0 xmm1 xmm2
..B1.8: # Preds ..B1.7
# Execution count [9.00e-01]
# The -8 displacement converts the 1-based index in rsi to a 0-based offset.
movslq %esi, %rsi #134.5
movq %rax, -8(%r9,%rsi,8) #135.9
movq %rax, -8(%r14,%rsi,8) #136.9
movq %rax, -8(%r8,%rsi,8) #137.9
# LOE r8 r9 r12 r14 r15 r13d xmm0 xmm1 xmm2
# Open the LIKWID marker region and set up everything that is invariant
# across the atom loop.
..B1.9: # Preds ..B1.7 ..B1.8
# Execution count [5.00e-01]
# edi = address of .L_2__STRING.0 (the bytes "force\0"), loaded as a 32-bit
# absolute immediate.
# NOTE(review): this form is not position independent; a PIE/shared build
# would need leaq .L_2__STRING.0(%rip), %rdi instead.
movl $.L_2__STRING.0, %edi #141.5
# Caller-saved values that must survive the call go to the stack; r14 is
# callee-saved and stays in its register.
movq %r8, 32(%rsp) #141.5[spill]
movq %r9, 80(%rsp) #141.5[spill]
vmovsd %xmm2, (%rsp) #141.5[spill]
vmovsd %xmm1, 8(%rsp) #141.5[spill]
vmovsd %xmm0, 16(%rsp) #141.5[spill]
..___tag_value_computeForce.18:
# likwid_markerStartRegion(const char *)
call likwid_markerStartRegion #141.5
..___tag_value_computeForce.19:
# LOE r12 r14 r15 r13d
..B1.10: # Preds ..B1.9
# Execution count [9.00e-01]
# Reload the spilled scalars and build the loop-invariant vectors:
#   zmm14 = broadcast(xmm2 * xmm2)       zmm13 = broadcast(xmm1)
#   zmm9  = broadcast(48.0 * xmm0)       zmm5  = 8 x 0.5
#   ymm15 = dwords 0..7 (lane indices)   ymm16 = 8 x dword 8
# zmm5, zmm9, zmm13, zmm14 and ymm16 are consumed only by instructions that
# are commented out further down in this listing.
# rax (atom index), rsi (neighbor-table row) and rdi (byte offset into the
# position array) all start at zero.
vmovsd 16(%rsp), %xmm0 #[spill]
xorl %esi, %esi #143.15
vmovsd (%rsp), %xmm2 #[spill]
xorl %eax, %eax #143.5
vmulsd %xmm2, %xmm2, %xmm13 #129.45
xorl %edi, %edi #143.5
vmovdqu32 .L_2il0floatpacket.0(%rip), %ymm16 #173.13
vmulsd .L_2il0floatpacket.3(%rip), %xmm0, %xmm0 #197.45
vmovdqu .L_2il0floatpacket.1(%rip), %ymm15 #173.13
vmovups .L_2il0floatpacket.4(%rip), %zmm5 #197.58
vmovsd 8(%rsp), %xmm1 #[spill]
vbroadcastsd %xmm13, %zmm14 #129.25
vbroadcastsd %xmm1, %zmm13 #130.21
vbroadcastsd %xmm0, %zmm9 #197.45
movslq %r13d, %r13 #143.5
# From the Neighbor argument (r15): r10 = pointer at offset 24 (read below as
# one 32-bit neighbor count per atom), rdx = 32-bit value at offset 16
# (scaled to a byte stride for 4-byte entries by the shlq), rcx = pointer at
# offset 8 (base of the neighbor index table).
movq 24(%r15), %r10 #145.25
movslq 16(%r15), %rdx #144.43
movq 8(%r15), %rcx #144.19
movq 32(%rsp), %r8 #[spill]
# rbx = pointer at offset 16 of the Atom argument; it is the base address of
# every position load and gather in the loops.
movq 16(%r12), %rbx #146.25
shlq $2, %rdx #126.5
# The sign-extended atom count and the neighbor-count pointer live on the
# stack for the duration of the atom loop.
movq %r13, 64(%rsp) #143.5[spill]
movq %r10, 72(%rsp) #143.5[spill]
# LOE rax rdx rcx rbx rsi rdi r8 r14 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
# Head of the per-atom loop: load this atom's neighbor count and position,
# then split the neighbor range into an alignment peel [0, r11d), an 8-wide
# main part [r11d, r13d) and a masked remainder [r13d, r10d).
# Register roles here: rax = atom index, rdi = byte offset of the atom's
# position (24 bytes = 3 doubles per atom), rsi = neighbor-table row,
# rbx = position base, rcx = neighbor-table base, rdx = row stride in bytes.
..B1.11: # Preds ..B1.41 ..B1.10
# Execution count [5.00e+00]
movq 72(%rsp), %r9 #145.25[spill]
# xmm24 / xmm18 / xmm4 are the scalar per-atom sums that get added to the
# three output arrays at the bottom of the loop; start them at zero.
vxorpd %xmm24, %xmm24, %xmm24 #149.22
vmovapd %xmm24, %xmm18 #150.22
movl (%r9,%rax,4), %r10d #145.25
vmovapd %xmm18, %xmm4 #151.22
# Three consecutive doubles of the current atom's position.
vmovsd (%rdi,%rbx), %xmm10 #146.25
vmovsd 8(%rdi,%rbx), %xmm6 #147.25
vmovsd 16(%rdi,%rbx), %xmm12 #148.25
# No neighbors: skip directly to the write-back with all sums equal to zero.
testl %r10d, %r10d #173.32
jle ..B1.41 # Prob 50% #173.32
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d xmm4 xmm6 xmm10 xmm12 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
..B1.12: # Preds ..B1.11
# Execution count [4.50e+00]
# zmm8 / zmm7 / zmm11 are the vector accumulators. Every instruction that
# would add to them is commented out in this listing, so they stay zero.
vpxord %zmm8, %zmm8, %zmm8 #149.22
vmovaps %zmm8, %zmm7 #150.22
vmovaps %zmm7, %zmm11 #151.22
# Fewer than 8 neighbors: only the masked remainder runs.
cmpl $8, %r10d #173.13
jl ..B1.48 # Prob 10% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.13: # Preds ..B1.12
# Execution count [4.50e+00]
# Fewer than 1200 neighbors: skip the alignment peel entirely.
cmpl $1200, %r10d #173.13
jl ..B1.47 # Prob 10% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.14: # Preds ..B1.13
# Execution count [4.50e+00]
# r15 = rcx + rsi * rdx = address of this atom's neighbor row;
# r11 = misalignment of that address relative to a 64-byte boundary.
movq %rdx, %r15 #144.43
imulq %rsi, %r15 #144.43
addq %rcx, %r15 #126.5
movq %r15, %r11 #173.13
andq $63, %r11 #173.13
# If the row is not even 4-byte aligned, peeling whole 4-byte entries can
# never reach a 64-byte boundary, so no peel is done.
testl $3, %r11d #173.13
je ..B1.16 # Prob 50% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.15: # Preds ..B1.14
# Execution count [2.25e+00]
xorl %r11d, %r11d #173.13
jmp ..B1.18 # Prob 100% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.16: # Preds ..B1.14
# Execution count [2.25e+00]
# Already 64-byte aligned: peel count stays 0.
testl %r11d, %r11d #173.13
je ..B1.18 # Prob 50% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.17: # Preds ..B1.16
# Execution count [2.50e+01]
# r11d = (64 - misalignment) / 4 = entries up to the next 64-byte boundary,
# clamped to the neighbor count (signed min via cmovl).
negl %r11d #173.13
addl $64, %r11d #173.13
shrl $2, %r11d #173.13
cmpl %r11d, %r10d #173.13
cmovl %r10d, %r11d #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.18: # Preds ..B1.15 ..B1.17 ..B1.16
# Execution count [5.00e+00]
# r13d = r10d - ((r10d - r11d) & 7): end index of the 8-wide main part.
movl %r10d, %r13d #173.13
subl %r11d, %r13d #173.13
andl $7, %r13d #173.13
negl %r13d #173.13
addl %r10d, %r13d #173.13
# Unsigned r11d < 1 means a peel count of 0: skip the peel loop.
cmpl $1, %r11d #173.13
jb ..B1.26 # Prob 50% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# Alignment peel loop: processes neighbor entries [0, r11d) of the row at r15
# in masked groups of 8. Only the index load and the three gathers are
# active; the arithmetic that followed them is commented out.
..B1.19: # Preds ..B1.18
# Execution count [4.50e+00]
# ymm4 = lane indices 0..7, ymm3 = peel count in every lane, r12 = entry
# offset, r9 = peel count (loop bound). zmm0..zmm2 hold the broadcast
# position of the current atom; they are read only by commented-out code.
vmovdqa %ymm15, %ymm4 #173.13
xorl %r12d, %r12d #173.13
vpbroadcastd %r11d, %ymm3 #173.13
vbroadcastsd %xmm10, %zmm2 #146.23
vbroadcastsd %xmm6, %zmm1 #147.23
vbroadcastsd %xmm12, %zmm0 #148.23
movslq %r11d, %r9 #173.13
# r8 and r14 are about to be used as scratch (r14d receives the mask), so
# their array pointers are parked on the stack.
movq %r8, 32(%rsp) #173.13[spill]
movq %r14, (%rsp) #173.13[spill]
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.20: # Preds ..B1.24 ..B1.19
# Execution count [2.50e+01]
# k3 selects the lanes whose counter in ymm4 is below the peel count.
# NOTE(review): the vpaddd that advanced ymm4 by 8 per iteration is among the
# commented-out lines below, so ymm4 never changes inside this loop and k3 is
# the same on every pass.
vpcmpgtd %ymm4, %ymm3, %k3 #173.13
# Masked, zeroing load of up to 8 neighbor indices, then index * 3 computed
# as idx + 2 * idx (three doubles per atom).
vmovdqu32 (%r15,%r12,4), %ymm17{%k3}{z} #174.25
kmovw %k3, %r14d #173.13
vpaddd %ymm17, %ymm17, %ymm18 #175.40
vpaddd %ymm18, %ymm17, %ymm17 #175.40
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 ymm17 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
..B1.23: # Preds ..B1.20
# Execution count [1.25e+01]
# A gather uses its mask register as a completion mask, so each of the
# three gathers gets its own copy. Destinations are zeroed first so that
# masked-off lanes hold 0.
kmovw %k3, %k1 #175.40
kmovw %k3, %k2 #175.40
vpxord %zmm18, %zmm18, %zmm18 #175.40
vpxord %zmm19, %zmm19, %zmm19 #175.40
vpxord %zmm20, %zmm20, %zmm20 #175.40
# Gather the doubles at byte offsets 16, 8 and 0 of each neighbor's
# 24-byte position record.
vgatherdpd 16(%rbx,%ymm17,8), %zmm18{%k1} #175.40
vgatherdpd 8(%rbx,%ymm17,8), %zmm19{%k2} #175.40
vgatherdpd (%rbx,%ymm17,8), %zmm20{%k3} #175.40
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm18 zmm19 zmm20
..B1.24: # Preds ..B1.23
# Execution count [2.50e+01]
addq $8, %r12 #173.13
# Disabled: lane-counter update, distance computation, cutoff compare,
# reciprocal refinement and the masked accumulation into zmm8/zmm7/zmm11.
#vpaddd %ymm16, %ymm4, %ymm4 #173.13
#vsubpd %zmm18, %zmm0, %zmm29 #177.40
#vsubpd %zmm19, %zmm1, %zmm27 #176.40
#vsubpd %zmm20, %zmm2, %zmm26 #175.40
#vmulpd %zmm27, %zmm27, %zmm25 #178.53
#vfmadd231pd %zmm26, %zmm26, %zmm25 #178.53
#vfmadd231pd %zmm29, %zmm29, %zmm25 #178.67
#vrcp14pd %zmm25, %zmm24 #195.42
#vcmppd $1, %zmm14, %zmm25, %k2 #194.26
#vfpclasspd $30, %zmm24, %k0 #195.42
#kmovw %k2, %r8d #194.26
#knotw %k0, %k1 #195.42
#vmovaps %zmm25, %zmm17 #195.42
#andl %r8d, %r14d #194.26
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm24, %zmm17 #195.42
#kmovw %r14d, %k3 #198.21
#vmulpd %zmm17, %zmm17, %zmm18 #195.42
#vfmadd213pd %zmm24, %zmm17, %zmm24{%k1} #195.42
#vfmadd213pd %zmm24, %zmm18, %zmm24{%k1} #195.42
#vmulpd %zmm13, %zmm24, %zmm19 #196.42
#vmulpd %zmm9, %zmm24, %zmm21 #197.58
#vmulpd %zmm19, %zmm24, %zmm22 #196.48
#vmulpd %zmm22, %zmm24, %zmm20 #196.54
#vfmsub213pd %zmm5, %zmm22, %zmm24 #197.58
#vmulpd %zmm21, %zmm20, %zmm23 #197.65
#vmulpd %zmm24, %zmm23, %zmm28 #197.71
#vfmadd231pd %zmm26, %zmm28, %zmm8{%k3} #198.21
#vfmadd231pd %zmm27, %zmm28, %zmm7{%k3} #199.21
#vfmadd231pd %zmm29, %zmm28, %zmm11{%k3} #200.21
# Loop while the entry offset is below the peel count.
cmpq %r9, %r12 #173.13
jb ..B1.20 # Prob 82% #173.13
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.25: # Preds ..B1.24
# Execution count [4.50e+00]
# Restore the two array pointers; if the peel already covered every
# neighbor (count == peel count) go straight to the reduction.
movq 32(%rsp), %r8 #[spill]
movq (%rsp), %r14 #[spill]
cmpl %r11d, %r10d #173.13
je ..B1.40 # Prob 10% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# Main 8-wide loop over neighbor entries [r11d, r13d): unmasked index load
# followed by three full-mask gathers. The arithmetic on the gathered values
# is commented out.
..B1.26: # Preds ..B1.25 ..B1.18 ..B1.47
# Execution count [2.50e+01]
# Skip the main loop when not even one full group of 8 fits before r13d.
lea 8(%r11), %r9d #173.13
cmpl %r9d, %r13d #173.13
jl ..B1.34 # Prob 50% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.27: # Preds ..B1.26
# Execution count [4.50e+00]
# r12 = rcx + rsi * rdx = address of this atom's neighbor row; r9 = first
# entry of the main part. zmm0..zmm2 (broadcast atom position) are read only
# by commented-out code.
movq %rdx, %r12 #144.43
imulq %rsi, %r12 #144.43
vbroadcastsd %xmm10, %zmm1 #146.23
vbroadcastsd %xmm6, %zmm0 #147.23
vbroadcastsd %xmm12, %zmm2 #148.23
movslq %r11d, %r9 #173.13
addq %rcx, %r12 #126.5
# The loop body below overwrites rdi, rdx, rcx, rax, rsi, r8 and r14 with
# scalar copies of the indices, so their loop-level values are saved first.
movq %rdi, 8(%rsp) #126.5[spill]
movq %rdx, 16(%rsp) #126.5[spill]
movq %rcx, 40(%rsp) #126.5[spill]
movq %rax, 48(%rsp) #126.5[spill]
movq %rsi, 56(%rsp) #126.5[spill]
movq %r8, 32(%rsp) #126.5[spill]
movq %r14, (%rsp) #126.5[spill]
# LOE rbx r9 r12 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.28: # Preds ..B1.32 ..B1.27
# Execution count [2.50e+01]
# ymm3 = 8 neighbor indices, then ymm3 = 3 * index (idx + 2 * idx).
vmovdqu (%r12,%r9,4), %ymm3 #174.25
vpaddd %ymm3, %ymm3, %ymm4 #175.40
vpaddd %ymm4, %ymm3, %ymm3 #175.40
# The same 8 indices are also loaded one by one into general-purpose
# registers and tripled with lea. No instruction that is still active in
# this listing reads those scalar copies.
movl (%r12,%r9,4), %r14d #174.25
movl 4(%r12,%r9,4), %r8d #174.25
movl 8(%r12,%r9,4), %edi #174.25
movl 12(%r12,%r9,4), %esi #174.25
lea (%r14,%r14,2), %r14d #175.40
movl 16(%r12,%r9,4), %ecx #174.25
lea (%r8,%r8,2), %r8d #175.40
movl 20(%r12,%r9,4), %edx #174.25
lea (%rdi,%rdi,2), %edi #175.40
movl 24(%r12,%r9,4), %eax #174.25
lea (%rsi,%rsi,2), %esi #175.40
movl 28(%r12,%r9,4), %r15d #174.25
lea (%rcx,%rcx,2), %ecx #175.40
lea (%rdx,%rdx,2), %edx #175.40
lea (%rax,%rax,2), %eax #175.40
lea (%r15,%r15,2), %r15d #175.40
# LOE rbx r9 r12 eax edx ecx esi edi r8d r10d r11d r13d r14d r15d xmm6 xmm10 xmm12 ymm3 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.31: # Preds ..B1.28
# Execution count [1.25e+01]
# Comparing a register with itself for equality sets every mask bit: all 8
# lanes are gathered. One mask per gather because the gather clears it.
vpcmpeqb %xmm0, %xmm0, %k1 #175.40
vpcmpeqb %xmm0, %xmm0, %k2 #175.40
vpcmpeqb %xmm0, %xmm0, %k3 #175.40
vpxord %zmm4, %zmm4, %zmm4 #175.40
vpxord %zmm17, %zmm17, %zmm17 #175.40
vpxord %zmm18, %zmm18, %zmm18 #175.40
# Gather the doubles at byte offsets 16, 8 and 0 of each neighbor's
# 24-byte position record.
vgatherdpd 16(%rbx,%ymm3,8), %zmm4{%k1} #175.40
vgatherdpd 8(%rbx,%ymm3,8), %zmm17{%k2} #175.40
vgatherdpd (%rbx,%ymm3,8), %zmm18{%k3} #175.40
# LOE rbx r9 r12 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm17 zmm18
..B1.32: # Preds ..B1.31
# Execution count [2.50e+01]
# Advance both the 32-bit loop counter and the 64-bit entry index by 8.
addl $8, %r11d #173.13
addq $8, %r9 #173.13
# Disabled: distance computation, cutoff compare, reciprocal refinement and
# the masked accumulation into zmm8/zmm7/zmm11.
#vsubpd %zmm4, %zmm2, %zmm26 #177.40
#vsubpd %zmm17, %zmm0, %zmm24 #176.40
#vsubpd %zmm18, %zmm1, %zmm23 #175.40
#vmulpd %zmm24, %zmm24, %zmm3 #178.53
#vfmadd231pd %zmm23, %zmm23, %zmm3 #178.53
#vfmadd231pd %zmm26, %zmm26, %zmm3 #178.67
#vrcp14pd %zmm3, %zmm22 #195.42
#vcmppd $1, %zmm14, %zmm3, %k2 #194.26
#vfpclasspd $30, %zmm22, %k0 #195.42
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm22, %zmm3 #195.42
#knotw %k0, %k1 #195.42
#vmulpd %zmm3, %zmm3, %zmm4 #195.42
#vfmadd213pd %zmm22, %zmm3, %zmm22{%k1} #195.42
#vfmadd213pd %zmm22, %zmm4, %zmm22{%k1} #195.42
#vmulpd %zmm13, %zmm22, %zmm17 #196.42
#vmulpd %zmm9, %zmm22, %zmm19 #197.58
#vmulpd %zmm17, %zmm22, %zmm20 #196.48
#vmulpd %zmm20, %zmm22, %zmm18 #196.54
#vfmsub213pd %zmm5, %zmm20, %zmm22 #197.58
#vmulpd %zmm19, %zmm18, %zmm21 #197.65
#vmulpd %zmm22, %zmm21, %zmm25 #197.71
#vfmadd231pd %zmm23, %zmm25, %zmm8{%k2} #198.21
#vfmadd231pd %zmm24, %zmm25, %zmm7{%k2} #199.21
#vfmadd231pd %zmm26, %zmm25, %zmm11{%k2} #200.21
# Loop while the counter is below the end of the main part (unsigned).
cmpl %r13d, %r11d #173.13
jb ..B1.28 # Prob 82% #173.13
# LOE rbx r9 r12 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.33: # Preds ..B1.32
# Execution count [4.50e+00]
# Restore the loop-level registers that the body used as scratch.
movq 8(%rsp), %rdi #[spill]
movq 16(%rsp), %rdx #[spill]
movq 40(%rsp), %rcx #[spill]
movq 48(%rsp), %rax #[spill]
movq 56(%rsp), %rsi #[spill]
movq 32(%rsp), %r8 #[spill]
movq (%rsp), %r14 #[spill]
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# Masked remainder: handles neighbor entries [r13d, r10d), fewer than 8, in a
# single masked index load plus three masked gathers. The arithmetic on the
# gathered values is commented out.
..B1.34: # Preds ..B1.33 ..B1.26 ..B1.48
# Execution count [5.00e+00]
# r13d + 1 > r10d (unsigned) means nothing is left: go to the reduction.
lea 1(%r13), %r9d #173.13
cmpl %r10d, %r9d #173.13
ja ..B1.40 # Prob 50% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.35: # Preds ..B1.34
# Execution count [2.50e+01]
# rsi = rcx + rsi * rdx = address of this atom's neighbor row. rsi is
# overwritten here; it is assigned again at the bottom of the atom loop.
imulq %rdx, %rsi #144.43
vbroadcastsd %xmm10, %zmm4 #146.23
# r10d = number of remaining entries; k3 selects the lanes whose index
# (ymm15 = 0..7) is below that number.
subl %r13d, %r10d #173.13
addq %rcx, %rsi #126.5
vpbroadcastd %r10d, %ymm0 #173.13
vpcmpgtd %ymm15, %ymm0, %k3 #173.13
movslq %r13d, %r13 #173.13
kmovw %k3, %r9d #173.13
# Masked, zeroing load of the remaining indices, then index * 3.
vmovdqu32 (%rsi,%r13,4), %ymm1{%k3}{z} #174.25
vpaddd %ymm1, %ymm1, %ymm2 #175.40
vpaddd %ymm2, %ymm1, %ymm0 #175.40
# LOE rax rdx rcx rbx rdi r8 r14 r9d xmm6 xmm12 ymm0 ymm15 ymm16 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
..B1.38: # Preds ..B1.35
# Execution count [1.25e+01]
# One mask copy per gather (the gather clears its mask); destinations are
# zeroed so masked-off lanes hold 0.
kmovw %k3, %k1 #175.40
kmovw %k3, %k2 #175.40
vpxord %zmm1, %zmm1, %zmm1 #175.40
vpxord %zmm2, %zmm2, %zmm2 #175.40
vpxord %zmm3, %zmm3, %zmm3 #175.40
# Gather the doubles at byte offsets 16, 8 and 0 of each neighbor's
# 24-byte position record.
vgatherdpd 16(%rbx,%ymm0,8), %zmm1{%k1} #175.40
vgatherdpd 8(%rbx,%ymm0,8), %zmm2{%k2} #175.40
vgatherdpd (%rbx,%ymm0,8), %zmm3{%k3} #175.40
# LOE rax rdx rcx rbx rdi r8 r14 r9d xmm6 xmm12 ymm15 ymm16 zmm1 zmm2 zmm3 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.39: # Preds ..B1.38
# Execution count [2.50e+01]
# This basic block contains no active instruction: the whole computation on
# the gathered values is commented out and control falls through.
#vbroadcastsd %xmm6, %zmm6 #147.23
#vbroadcastsd %xmm12, %zmm12 #148.23
#vsubpd %zmm1, %zmm12, %zmm23 #177.40
#vsubpd %zmm2, %zmm6, %zmm21 #176.40
#vsubpd %zmm3, %zmm4, %zmm20 #175.40
#vmulpd %zmm21, %zmm21, %zmm19 #178.53
#vfmadd231pd %zmm20, %zmm20, %zmm19 #178.53
#vfmadd231pd %zmm23, %zmm23, %zmm19 #178.67
#vrcp14pd %zmm19, %zmm18 #195.42
#vcmppd $1, %zmm14, %zmm19, %k2 #194.26
#vfpclasspd $30, %zmm18, %k0 #195.42
#kmovw %k2, %esi #194.26
#knotw %k0, %k1 #195.42
#vmovaps %zmm19, %zmm0 #195.42
#andl %esi, %r9d #194.26
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm18, %zmm0 #195.42
#kmovw %r9d, %k3 #198.21
#vmulpd %zmm0, %zmm0, %zmm1 #195.42
#vfmadd213pd %zmm18, %zmm0, %zmm18{%k1} #195.42
#vfmadd213pd %zmm18, %zmm1, %zmm18{%k1} #195.42
#vmulpd %zmm13, %zmm18, %zmm2 #196.42
#vmulpd %zmm9, %zmm18, %zmm4 #197.58
#vmulpd %zmm2, %zmm18, %zmm10 #196.48
#vmulpd %zmm10, %zmm18, %zmm3 #196.54
#vfmsub213pd %zmm5, %zmm10, %zmm18 #197.58
#vmulpd %zmm4, %zmm3, %zmm17 #197.65
#vmulpd %zmm18, %zmm17, %zmm22 #197.71
#vfmadd231pd %zmm20, %zmm22, %zmm8{%k3} #198.21
#vfmadd231pd %zmm21, %zmm22, %zmm7{%k3} #199.21
#vfmadd231pd %zmm23, %zmm22, %zmm11{%k3} #200.21
# LOE rax rdx rcx rbx rdi r8 r14 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# Reduce the three 8-lane accumulators to scalars, add them to the current
# atom's entries in the three output arrays, and advance the atom loop.
..B1.40: # Preds ..B1.25 ..B1.39 ..B1.34
# Execution count [4.50e+00]
# Horizontal sum in three fold steps per accumulator:
#   1. vpermd with dword indices 8..15 copies the upper 256 bits over the
#      lower 256 bits, then add  -> 4 partial sums
#   2. vpermpd $78 swaps the 128-bit halves of each 256-bit group, then add
#   3. vpermpd $177 swaps neighbouring doubles, then add
# Element 0 of zmm24 / zmm18 / zmm4 then holds the sum of all 8 lanes of
# zmm8 / zmm7 / zmm11 respectively.
vmovups .L_2il0floatpacket.10(%rip), %zmm19 #151.22
vpermd %zmm11, %zmm19, %zmm0 #151.22
vpermd %zmm7, %zmm19, %zmm6 #150.22
vpermd %zmm8, %zmm19, %zmm20 #149.22
vaddpd %zmm11, %zmm0, %zmm11 #151.22
vaddpd %zmm7, %zmm6, %zmm7 #150.22
vaddpd %zmm8, %zmm20, %zmm8 #149.22
vpermpd $78, %zmm11, %zmm1 #151.22
vpermpd $78, %zmm7, %zmm10 #150.22
vpermpd $78, %zmm8, %zmm21 #149.22
vaddpd %zmm1, %zmm11, %zmm2 #151.22
vaddpd %zmm10, %zmm7, %zmm12 #150.22
vaddpd %zmm21, %zmm8, %zmm22 #149.22
vpermpd $177, %zmm2, %zmm3 #151.22
vpermpd $177, %zmm12, %zmm17 #150.22
vpermpd $177, %zmm22, %zmm23 #149.22
vaddpd %zmm3, %zmm2, %zmm4 #151.22
vaddpd %zmm17, %zmm12, %zmm18 #150.22
vaddpd %zmm23, %zmm22, %zmm24 #149.22
# LOE rax rdx rcx rbx rdi r8 r14 xmm4 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
..B1.41: # Preds ..B1.40 ..B1.11
# Execution count [5.00e+00]
# Write-back for atom rax: array[rax] += sum for the array whose pointer was
# spilled to 80(%rsp), then for the arrays at r14 and r8.
# rsi is used briefly for the spilled pointer and then set to rax + 1, the
# neighbor-table row of the next atom. rdi advances by 24 bytes (3 doubles).
movq 80(%rsp), %rsi #208.9[spill]
addq $24, %rdi #143.5
vaddsd (%rsi,%rax,8), %xmm24, %xmm0 #208.9
vmovsd %xmm0, (%rsi,%rax,8) #208.9
movslq %eax, %rsi #143.32
vaddsd (%r14,%rax,8), %xmm18, %xmm1 #209.9
vmovsd %xmm1, (%r14,%rax,8) #209.9
incq %rsi #143.32
vaddsd (%r8,%rax,8), %xmm4, %xmm2 #210.9
vmovsd %xmm2, (%r8,%rax,8) #210.9
incq %rax #143.5
# Continue while the atom index is below the count spilled to 64(%rsp).
cmpq 64(%rsp), %rax #143.5[spill]
jb ..B1.11 # Prob 82% #143.5
jmp ..B1.44 # Prob 100% #143.5
# LOE rax rdx rcx rbx rsi rdi r8 r14 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
# Exit path: close the LIKWID region, take the end timestamp, return the
# elapsed time in xmm0 and unwind the frame.
..B1.43: # Preds ..B1.2
# Execution count [5.00e-01]
# Reached only when the atom count is <= 0: the region is still opened so
# that the stop call below is paired with a start call.
movl $.L_2__STRING.0, %edi #141.5
..___tag_value_computeForce.48:
# likwid_markerStartRegion(const char *)
call likwid_markerStartRegion #141.5
..___tag_value_computeForce.49:
# LOE
..B1.44: # Preds ..B1.41 ..B1.43
# Execution count [1.00e+00]
# vzeroupper clears the upper vector state before calling external code.
movl $.L_2__STRING.0, %edi #219.5
vzeroupper #219.5
..___tag_value_computeForce.50:
# likwid_markerStopRegion(const char *)
call likwid_markerStopRegion #219.5
..___tag_value_computeForce.51:
# LOE
..B1.45: # Preds ..B1.44
# Execution count [1.00e+00]
xorl %eax, %eax #221.16
..___tag_value_computeForce.52:
# getTimeStamp()
call getTimeStamp #221.16
..___tag_value_computeForce.53:
# LOE xmm0
..B1.46: # Preds ..B1.45
# Execution count [1.00e+00]
# Return value: end timestamp minus the start timestamp stored at 24(%rsp).
vsubsd 24(%rsp), %xmm0, %xmm0 #224.14[spill]
# Epilogue: release the 88 bytes of locals, pop the saved registers in
# reverse push order, then restore rsp from rbp to undo the 64-byte
# realignment done in the prologue.
addq $88, %rsp #224.14
.cfi_restore 3
popq %rbx #224.14
.cfi_restore 15
popq %r15 #224.14
.cfi_restore 14
popq %r14 #224.14
.cfi_restore 13
popq %r13 #224.14
.cfi_restore 12
popq %r12 #224.14
movq %rbp, %rsp #224.14
popq %rbp #224.14
.cfi_def_cfa 7, 8
.cfi_restore 6
ret #224.14
# The CFI directives below re-state the in-frame unwind rules for the code
# that is placed after this ret.
.cfi_def_cfa 6, 16
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22
.cfi_offset 6, -16
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22
# LOE
# Infrequently executed blocks, placed after the ret, followed by the
# end-of-function bookkeeping directives.
..B1.47: # Preds ..B1.13
# Execution count [4.50e-01]: Infreq
# Neighbor count is >= 8 but < 1200: no alignment peel. r11d = 0 is the start
# of the 8-wide part and r13d = count rounded down to a multiple of 8 its end.
movl %r10d, %r13d #173.13
xorl %r11d, %r11d #173.13
andl $-8, %r13d #173.13
jmp ..B1.26 # Prob 100% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.48: # Preds ..B1.12
# Execution count [4.50e-01]: Infreq
# Fewer than 8 neighbors: the masked remainder starts at entry 0.
xorl %r13d, %r13d #173.13
jmp ..B1.34 # Prob 100% #173.13
.align 16,0x90
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
.cfi_endproc
# mark_end;
# Symbol type and size (end address minus the computeForce label).
.type computeForce,@function
.size computeForce,.-computeForce
..LNcomputeForce.0:
.data
# -- End computeForce
# Read-only constant pool for computeForce. Every packet keeps its label,
# alignment, size and exact byte contents; the data is spelled with
# .quad/.byte/.rept instead of raw 32-bit halves. Values are little-endian.
.section .rodata, "a"
.align 64
# 8 x double 1.0 (not referenced by the code in this listing)
.L_2il0floatpacket.2:
.rept 8
.quad 0x3ff0000000000000
.endr
.type .L_2il0floatpacket.2,@object
.size .L_2il0floatpacket.2,64
.align 64
# 8 x double 0.5 (loaded into zmm5)
.L_2il0floatpacket.4:
.rept 8
.quad 0x3fe0000000000000
.endr
.type .L_2il0floatpacket.4,@object
.size .L_2il0floatpacket.4,64
.align 64
# 24 byte-sized bit patterns followed by 40 zero bytes (not referenced by the
# code in this listing)
.L_2il0floatpacket.5:
.byte 0x01,0x01,0x01,0x02,0x02,0x02,0x04,0x04
.byte 0x04,0x08,0x08,0x08,0x10,0x10,0x10,0x20
.byte 0x20,0x20,0x40,0x40,0x40,0x80,0x80,0x80
.zero 40
.type .L_2il0floatpacket.5,@object
.size .L_2il0floatpacket.5,64
.align 64
# 8 x 64-bit index (not referenced by the code in this listing)
.L_2il0floatpacket.6:
.quad 0,4,8,12,1,5,9,13
.type .L_2il0floatpacket.6,@object
.size .L_2il0floatpacket.6,64
.align 64
# 8 x 64-bit index (not referenced by the code in this listing)
.L_2il0floatpacket.7:
.quad 1,5,9,13,0,4,8,12
.type .L_2il0floatpacket.7,@object
.size .L_2il0floatpacket.7,64
.align 64
# 8 x 64-bit index (not referenced by the code in this listing)
.L_2il0floatpacket.8:
.quad 2,6,10,14,2,6,10,14
.type .L_2il0floatpacket.8,@object
.size .L_2il0floatpacket.8,64
.align 64
# 16 x 32-bit index: 8..15 twice (vpermd control used by the lane reduction)
.L_2il0floatpacket.10:
.rept 2
.long 8,9,10,11,12,13,14,15
.endr
.type .L_2il0floatpacket.10,@object
.size .L_2il0floatpacket.10,64
.align 32
# 8 x 32-bit value 8 (loaded into ymm16)
.L_2il0floatpacket.0:
.rept 8
.long 8
.endr
.type .L_2il0floatpacket.0,@object
.size .L_2il0floatpacket.0,32
.align 32
# 32-bit lane indices 0..7 (loaded into ymm15)
.L_2il0floatpacket.1:
.long 0,1,2,3,4,5,6,7
.type .L_2il0floatpacket.1,@object
.size .L_2il0floatpacket.1,32
.align 8
# double 48.0 (multiplied into xmm0 during loop setup)
.L_2il0floatpacket.3:
.quad 0x4048000000000000
.type .L_2il0floatpacket.3,@object
.size .L_2il0floatpacket.3,8
.align 8
# double 1.0 (referenced only by commented-out instructions)
.L_2il0floatpacket.9:
.quad 0x3ff0000000000000
.type .L_2il0floatpacket.9,@object
.size .L_2il0floatpacket.9,8
# NUL-terminated region name handed to likwid_markerStartRegion and
# likwid_markerStopRegion. The 6 bytes 'f','o','r','c','e',0 are written as a
# string literal instead of a packed .long/.word pair.
.section .rodata.str1.4, "aMS",@progbits,1
.align 4
.L_2__STRING.0:
.asciz "force"
.type .L_2__STRING.0,@object
.size .L_2__STRING.0,6
.data
# Empty .note.GNU-stack section.
.section .note.GNU-stack, ""
# End

View File

@@ -1,585 +0,0 @@
# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.0.5.281 Build 20190815";
# mark_description "-I./src/includes -S -D_GNU_SOURCE -DAOS -DPRECISION=2 -DNEIGHBORS_LOOP_RUNS=1 -DVECTOR_WIDTH=8 -DALIGNMENT=6";
# mark_description "4 -restrict -Ofast -xCORE-AVX512 -qopt-zmm-usage=high -o ICC/force.s";
.file "force.c"
.text
..TXTST0:
# -----------------------------------------------------------------------------
# computeForce(Parameter *, Atom *, Neighbor *, int)
# ICC 19.0.5 output (AT&T syntax, System V AMD64 argument registers) in which the
# arithmetic of every neighbor loop and the final force stores are commented out.
# What still executes per atom: the neighbor-index loads, the index*3 scaling,
# three vgatherdpd per group of 8 neighbors, and the horizontal reductions of the
# (always zero) accumulators. The three arrays at Atom+64/+72/+80 are zeroed and
# never accumulated into.
# NOTE(review): presumably a gather-only timing variant of the kernel -- confirm.
#
# Out: %xmm0 = second getTimeStamp() result minus the first.
# Parameter 4 (%ecx) is never read; %ecx is overwritten in ..B1.2.
#
# Spill slots (offsets from the 64-byte aligned %rsp after the prologue):
#     0 = %rbx, 8 = %r15, 16 = start timestamp, 24 = %r9, 32 = %r10,
#    40 = %rax, 48 = %rcx, 56 = %rsi, 64 = %r8, 72 = %rdi,
#    80 = atom count, 88 = pointer loaded from Neighbor+24,
#    96 = pointer loaded from Atom+64.
# -----------------------------------------------------------------------------
.L_2__routine_start_computeForce_0:
# -- Begin computeForce
.text
# mark_begin;
.align 16,0x90
.globl computeForce
# --- computeForce(Parameter *, Atom *, Neighbor *, int)
computeForce:
# parameter 1: %rdi
# parameter 2: %rsi
# parameter 3: %rdx
# parameter 4: %ecx
# Prologue: frame pointer in %rbp, %rsp aligned down to 64 bytes, then
# 3 pushes (24 bytes) + 104 bytes = 128 bytes, so %rsp stays 64-byte aligned.
..B1.1: # Preds ..B1.0
# Execution count [1.00e+00]
.cfi_startproc
..___tag_value_computeForce.1:
..L2:
#103.87
pushq %rbp #103.87
.cfi_def_cfa_offset 16
movq %rsp, %rbp #103.87
.cfi_def_cfa 6, 16
.cfi_offset 6, -16
andq $-64, %rsp #103.87
pushq %r12 #103.87
pushq %r13 #103.87
pushq %r14 #103.87
subq $104, %rsp #103.87
# %eax = 0 before the call; presumably the vector-register count expected by a
# variadic/unprototyped callee -- TODO confirm getTimeStamp's declaration.
xorl %eax, %eax #106.16
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
# Keep the three pointer arguments in callee-saved registers across the call:
# %r14 = parameter 3, %r13 = parameter 2, %r12 = parameter 1.
movq %rdx, %r14 #103.87
movq %rsi, %r13 #103.87
movq %rdi, %r12 #103.87
..___tag_value_computeForce.9:
# getTimeStamp()
call getTimeStamp #106.16
..___tag_value_computeForce.10:
# LOE rbx r12 r13 r14 r15 xmm0
# Save the start timestamp; it is subtracted from the end timestamp in ..B1.43.
..B1.48: # Preds ..B1.1
# Execution count [1.00e+00]
vmovsd %xmm0, 16(%rsp) #106.16[spill]
# LOE rbx r12 r13 r14 r15
# Load the atom count (Atom+4), the three array pointers zeroed below
# (Atom+64/+72/+80) and three scalar doubles from Parameter+72/+8/+0.
# NOTE(review): by analogy with the annotated hand-written kernel these look like
# Nlocal, fx/fy/fz and cutforce/sigma6/epsilon -- confirm against the struct layout.
..B1.2: # Preds ..B1.48
# Execution count [1.00e+00]
movl 4(%r13), %ecx #107.18
movq 64(%r13), %r11 #109.20
movq 72(%r13), %r10 #109.45
movq 80(%r13), %r9 #109.70
vmovsd 72(%r12), %xmm2 #111.27
vmovsd 8(%r12), %xmm1 #112.23
vmovsd (%r12), %xmm0 #113.24
# Count <= 0: skip everything and go straight to the end timestamp.
testl %ecx, %ecx #116.24
jle ..B1.42 # Prob 50% #116.24
# LOE rbx r9 r10 r11 r13 r14 r15 ecx xmm0 xmm1 xmm2
# Zero the three arrays, two elements per iteration:
# %rdx = count / 2 (pair count), %rdi = pair index, %rsi = byte offset,
# %rax = 0 (the value stored), %r8d = 1-based index of the next element.
..B1.3: # Preds ..B1.2
# Execution count [1.00e+00]
xorl %edi, %edi #116.5
movl %ecx, %edx #116.5
xorl %esi, %esi #116.5
movl $1, %r8d #116.5
xorl %eax, %eax #117.17
shrl $1, %edx #116.5
je ..B1.7 # Prob 9% #116.5
# LOE rax rdx rbx rsi rdi r9 r10 r11 r13 r14 r15 ecx r8d xmm0 xmm1 xmm2
..B1.5: # Preds ..B1.3 ..B1.5
# Execution count [2.50e+00]
movq %rax, (%rsi,%r11) #117.9
incq %rdi #116.5
movq %rax, (%rsi,%r10) #118.9
movq %rax, (%rsi,%r9) #119.9
movq %rax, 8(%rsi,%r11) #117.9
movq %rax, 8(%rsi,%r10) #118.9
movq %rax, 8(%rsi,%r9) #119.9
addq $16, %rsi #116.5
cmpq %rdx, %rdi #116.5
jb ..B1.5 # Prob 63% #116.5
# LOE rax rdx rbx rsi rdi r9 r10 r11 r13 r14 r15 ecx xmm0 xmm1 xmm2
# %r8d = 2 * pairs + 1, i.e. the 1-based index of the first element not yet zeroed.
..B1.6: # Preds ..B1.5
# Execution count [9.00e-01]
lea 1(%rdi,%rdi), %r8d #117.9
# LOE rax rbx r9 r10 r11 r13 r14 r15 ecx r8d xmm0 xmm1 xmm2
# If (%r8d - 1) < count, one odd element is left over.
..B1.7: # Preds ..B1.3 ..B1.6
# Execution count [1.00e+00]
lea -1(%r8), %edx #116.5
cmpl %ecx, %edx #116.5
jae ..B1.9 # Prob 9% #116.5
# LOE rax rbx r9 r10 r11 r13 r14 r15 ecx r8d xmm0 xmm1 xmm2
# Zero that last element (the -8 displacement compensates for the 1-based index).
..B1.8: # Preds ..B1.7
# Execution count [9.00e-01]
movslq %r8d, %r8 #116.5
movq %rax, -8(%r11,%r8,8) #117.9
movq %rax, -8(%r10,%r8,8) #118.9
movq %rax, -8(%r9,%r8,8) #119.9
# LOE rbx r9 r10 r11 r13 r14 r15 ecx xmm0 xmm1 xmm2
# Loop-invariant setup for the atom loop.
#   %zmm14 = broadcast(xmm2 * xmm2)   %zmm13 = broadcast(xmm1)
#   %zmm9  = broadcast(48.0 * xmm0)   %zmm5  = 0.5 in every lane
#   %ymm16 = dwords of 8              %ymm15 = dwords 0..7
# Of these, only %ymm15 is consumed by instructions that are still live below;
# the rest feed commented-out arithmetic.
#   %rdx = pointer at Atom+16 (gather base), %rax = byte offset of the current
#          atom in that array (advances by 24 per atom),
#   %r8  = atom index, %rdi = atom index used to address the neighbor row,
#   %rsi = pointer at Neighbor+8, %rcx = 4 * (int at Neighbor+16) = row stride in bytes.
..B1.9: # Preds ..B1.7 ..B1.8
# Execution count [9.00e-01]
vmulsd %xmm2, %xmm2, %xmm13 #111.45
xorl %edi, %edi #124.15
vmovdqu32 .L_2il0floatpacket.0(%rip), %ymm16 #153.13
vmulsd .L_2il0floatpacket.3(%rip), %xmm0, %xmm0 #177.45
vmovdqu .L_2il0floatpacket.1(%rip), %ymm15 #153.13
vmovups .L_2il0floatpacket.4(%rip), %zmm5 #177.58
vbroadcastsd %xmm13, %zmm14 #111.25
vbroadcastsd %xmm1, %zmm13 #112.21
vbroadcastsd %xmm0, %zmm9 #177.45
movq 16(%r13), %rdx #127.25
xorl %r8d, %r8d #124.5
movslq %ecx, %r12 #124.5
xorl %eax, %eax #124.5
movq 24(%r14), %r13 #126.25
movslq 16(%r14), %rcx #125.43
movq 8(%r14), %rsi #125.19
shlq $2, %rcx #108.5
movq %r12, 80(%rsp) #124.5[spill]
movq %r13, 88(%rsp) #124.5[spill]
movq %r11, 96(%rsp) #124.5[spill]
# %r15 and %rbx are callee-saved; they are used as scratch below and restored in ..B1.41.
movq %r15, 8(%rsp) #124.5[spill]
movq %rbx, (%rsp) #124.5[spill]
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
# LOE rax rdx rcx rsi rdi r8 r9 r10 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
# ---- Atom loop head ----------------------------------------------------------
# Zero the three scalar results (%xmm24, %xmm18, %xmm4), read this atom's neighbor
# count into %r11d and its three coordinates (24-byte stride) into %xmm10/%xmm6/%xmm12.
# With no neighbors, jump directly to the loop tail.
..B1.10: # Preds ..B1.40 ..B1.9
# Execution count [5.00e+00]
movq 88(%rsp), %rbx #126.25[spill]
vxorpd %xmm24, %xmm24, %xmm24 #130.22
vmovapd %xmm24, %xmm18 #131.22
movl (%rbx,%r8,4), %r11d #126.25
vmovapd %xmm18, %xmm4 #132.22
vmovsd (%rax,%rdx), %xmm10 #127.25
vmovsd 8(%rax,%rdx), %xmm6 #128.25
vmovsd 16(%rax,%rdx), %xmm12 #129.25
testl %r11d, %r11d #153.32
jle ..B1.40 # Prob 50% #153.32
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d xmm4 xmm6 xmm10 xmm12 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
# Zero the three vector accumulators. Fewer than 8 neighbors: remainder path only.
..B1.11: # Preds ..B1.10
# Execution count [4.50e+00]
vpxord %zmm8, %zmm8, %zmm8 #130.22
vmovaps %zmm8, %zmm7 #131.22
vmovaps %zmm7, %zmm11 #132.22
cmpl $8, %r11d #153.13
jl ..B1.45 # Prob 10% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# Fewer than 1200 neighbors: skip the alignment peel (..B1.44 sets up the main loop).
..B1.12: # Preds ..B1.11
# Execution count [4.50e+00]
cmpl $1200, %r11d #153.13
jl ..B1.44 # Prob 10% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# %r15 = address of this atom's neighbor row = %rsi + %rdi * %rcx.
# %r12 = row address modulo 64; a peel is only possible if that is a multiple of 4.
..B1.13: # Preds ..B1.12
# Execution count [4.50e+00]
movq %rcx, %r15 #125.43
imulq %rdi, %r15 #125.43
addq %rsi, %r15 #108.5
movq %r15, %r12 #153.13
andq $63, %r12 #153.13
testl $3, %r12d #153.13
je ..B1.15 # Prob 50% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# Row is not 4-byte aligned: peel count = 0.
..B1.14: # Preds ..B1.13
# Execution count [2.25e+00]
xorl %r12d, %r12d #153.13
jmp ..B1.17 # Prob 100% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# Row already 64-byte aligned: peel count stays 0.
..B1.15: # Preds ..B1.13
# Execution count [2.25e+00]
testl %r12d, %r12d #153.13
je ..B1.17 # Prob 50% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# Peel count %r12d = min((64 - misalignment) / 4, neighbor count).
..B1.16: # Preds ..B1.15
# Execution count [2.50e+01]
negl %r12d #153.13
addl $64, %r12d #153.13
shrl $2, %r12d #153.13
cmpl %r12d, %r11d #153.13
cmovl %r11d, %r12d #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# %r14d = count - ((count - peel) & 7): end index of the unmasked 8-wide main loop.
# Peel count 0 -> go to the main loop check.
..B1.17: # Preds ..B1.14 ..B1.16 ..B1.15
# Execution count [5.00e+00]
movl %r11d, %r14d #153.13
subl %r12d, %r14d #153.13
andl $7, %r14d #153.13
negl %r14d #153.13
addl %r11d, %r14d #153.13
cmpl $1, %r12d #153.13
jb ..B1.25 # Prob 50% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# ---- Peel loop (masked, 8 lanes per iteration) -------------------------------
# %ymm4 = lane indices 0..7, %ymm3 = broadcast(peel count), %r13 = element index,
# %rbx = peel count. %r9/%r10 are spilled because %r9d/%r10d served as scratch in
# the (now commented-out) mask arithmetic; %r10d is still written by kmovw below.
..B1.18: # Preds ..B1.17
# Execution count [4.50e+00]
vmovdqa %ymm15, %ymm4 #153.13
xorl %r13d, %r13d #153.13
vpbroadcastd %r12d, %ymm3 #153.13
vbroadcastsd %xmm10, %zmm2 #127.23
vbroadcastsd %xmm6, %zmm1 #128.23
vbroadcastsd %xmm12, %zmm0 #129.23
movslq %r12d, %rbx #153.13
movq %r9, 24(%rsp) #153.13[spill]
movq %r10, 32(%rsp) #153.13[spill]
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# k3 = lanes whose index is below the peel count; masked load of up to 8 neighbor
# indices, then %ymm17 = 3 * index (three doubles per atom in the gathered array).
..B1.19: # Preds ..B1.23 ..B1.18
# Execution count [2.50e+01]
vpcmpgtd %ymm4, %ymm3, %k3 #153.13
vmovdqu32 (%r15,%r13,4), %ymm17{%k3}{z} #154.25
kmovw %k3, %r10d #153.13
vpaddd %ymm17, %ymm17, %ymm18 #155.40
vpaddd %ymm18, %ymm17, %ymm17 #155.40
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r10d r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 ymm17 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
# Each gather consumes its own copy of the mask (k1/k2/k3); destinations are
# zeroed first so inactive lanes read as 0. Displacements 16/8/0 select the third,
# second and first double of each gathered atom.
..B1.22: # Preds ..B1.19
# Execution count [1.25e+01]
kmovw %k3, %k1 #155.40
kmovw %k3, %k2 #155.40
vpxord %zmm18, %zmm18, %zmm18 #155.40
vpxord %zmm19, %zmm19, %zmm19 #155.40
vpxord %zmm20, %zmm20, %zmm20 #155.40
vgatherdpd 16(%rdx,%ymm17,8), %zmm18{%k1} #155.40
vgatherdpd 8(%rdx,%ymm17,8), %zmm19{%k2} #155.40
vgatherdpd (%rdx,%ymm17,8), %zmm20{%k3} #155.40
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r10d r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm18 zmm19 zmm20
# Advance by 8 elements. All arithmetic on the gathered values is disabled.
# NOTE(review): the first commented line below also disables the update of the
# lane counter %ymm4, so on a second peel iteration the mask is recomputed from
# the same lane indices 0..7 instead of 8..15 -- confirm this is acceptable.
..B1.23: # Preds ..B1.22
# Execution count [2.50e+01]
addq $8, %r13 #153.13
#vpaddd %ymm16, %ymm4, %ymm4 #153.13
#vsubpd %zmm18, %zmm0, %zmm29 #157.40
#vsubpd %zmm19, %zmm1, %zmm27 #156.40
#vsubpd %zmm20, %zmm2, %zmm26 #155.40
#vmulpd %zmm27, %zmm27, %zmm25 #158.53
#vfmadd231pd %zmm26, %zmm26, %zmm25 #158.53
#vfmadd231pd %zmm29, %zmm29, %zmm25 #158.67
#vrcp14pd %zmm25, %zmm24 #175.42
#vcmppd $1, %zmm14, %zmm25, %k2 #174.26
#vfpclasspd $30, %zmm24, %k0 #175.42
#kmovw %k2, %r9d #174.26
#knotw %k0, %k1 #175.42
#vmovaps %zmm25, %zmm17 #175.42
#andl %r9d, %r10d #174.26
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm24, %zmm17 #175.42
#kmovw %r10d, %k3 #178.21
#vmulpd %zmm17, %zmm17, %zmm18 #175.42
#vfmadd213pd %zmm24, %zmm17, %zmm24{%k1} #175.42
#vfmadd213pd %zmm24, %zmm18, %zmm24{%k1} #175.42
#vmulpd %zmm13, %zmm24, %zmm19 #176.42
#vmulpd %zmm9, %zmm24, %zmm21 #177.58
#vmulpd %zmm19, %zmm24, %zmm22 #176.48
#vmulpd %zmm22, %zmm24, %zmm20 #176.54
#vfmsub213pd %zmm5, %zmm22, %zmm24 #177.58
#vmulpd %zmm21, %zmm20, %zmm23 #177.65
#vmulpd %zmm24, %zmm23, %zmm28 #177.71
#vfmadd231pd %zmm26, %zmm28, %zmm8{%k3} #178.21
#vfmadd231pd %zmm27, %zmm28, %zmm7{%k3} #179.21
#vfmadd231pd %zmm29, %zmm28, %zmm11{%k3} #180.21
cmpq %rbx, %r13 #153.13
jb ..B1.19 # Prob 82% #153.13
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# Peel done: restore %r9/%r10. If the peel covered every neighbor, go reduce.
..B1.24: # Preds ..B1.23
# Execution count [4.50e+00]
movq 24(%rsp), %r9 #[spill]
movq 32(%rsp), %r10 #[spill]
cmpl %r12d, %r11d #153.13
je ..B1.39 # Prob 10% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# ---- Main loop (unmasked, 8 neighbors per iteration) -------------------------
# Entered only if at least one full group fits: start (%r12d) + 8 <= end (%r14d).
..B1.25: # Preds ..B1.24 ..B1.17 ..B1.44
# Execution count [2.50e+01]
lea 8(%r12), %ebx #153.13
cmpl %ebx, %r14d #153.13
jl ..B1.33 # Prob 50% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# %r13 = neighbor row address, %rbx = start index. Seven general registers are
# spilled because the loop body overwrites them with scalar index loads.
..B1.26: # Preds ..B1.25
# Execution count [4.50e+00]
movq %rcx, %r13 #125.43
imulq %rdi, %r13 #125.43
vbroadcastsd %xmm10, %zmm1 #127.23
vbroadcastsd %xmm6, %zmm0 #128.23
vbroadcastsd %xmm12, %zmm2 #129.23
movslq %r12d, %rbx #153.13
addq %rsi, %r13 #108.5
movq %rax, 40(%rsp) #108.5[spill]
movq %rcx, 48(%rsp) #108.5[spill]
movq %rsi, 56(%rsp) #108.5[spill]
movq %r8, 64(%rsp) #108.5[spill]
movq %rdi, 72(%rsp) #108.5[spill]
movq %r9, 24(%rsp) #108.5[spill]
movq %r10, 32(%rsp) #108.5[spill]
# LOE rdx rbx r13 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# Load 8 neighbor indices and scale by 3 in %ymm3 (the gather index vector).
# The same 8 indices are also loaded and scaled one by one into general registers;
# none of those scalar results is read by any later live instruction in this function.
..B1.27: # Preds ..B1.31 ..B1.26
# Execution count [2.50e+01]
vmovdqu (%r13,%rbx,4), %ymm3 #154.25
vpaddd %ymm3, %ymm3, %ymm4 #155.40
vpaddd %ymm4, %ymm3, %ymm3 #155.40
movl (%r13,%rbx,4), %r10d #154.25
movl 4(%r13,%rbx,4), %r9d #154.25
movl 8(%r13,%rbx,4), %r8d #154.25
movl 12(%r13,%rbx,4), %edi #154.25
lea (%r10,%r10,2), %r10d #155.40
movl 16(%r13,%rbx,4), %esi #154.25
lea (%r9,%r9,2), %r9d #155.40
movl 20(%r13,%rbx,4), %ecx #154.25
lea (%r8,%r8,2), %r8d #155.40
movl 24(%r13,%rbx,4), %eax #154.25
lea (%rdi,%rdi,2), %edi #155.40
movl 28(%r13,%rbx,4), %r15d #154.25
lea (%rsi,%rsi,2), %esi #155.40
lea (%rcx,%rcx,2), %ecx #155.40
lea (%rax,%rax,2), %eax #155.40
lea (%r15,%r15,2), %r15d #155.40
# LOE rdx rbx r13 eax ecx esi edi r8d r9d r10d r11d r12d r14d r15d xmm6 xmm10 xmm12 ymm3 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# Comparing a register with itself sets every mask bit: all 8 lanes are gathered.
..B1.30: # Preds ..B1.27
# Execution count [1.25e+01]
vpcmpeqb %xmm0, %xmm0, %k1 #155.40
vpcmpeqb %xmm0, %xmm0, %k2 #155.40
vpcmpeqb %xmm0, %xmm0, %k3 #155.40
vpxord %zmm4, %zmm4, %zmm4 #155.40
vpxord %zmm17, %zmm17, %zmm17 #155.40
vpxord %zmm18, %zmm18, %zmm18 #155.40
vgatherdpd 16(%rdx,%ymm3,8), %zmm4{%k1} #155.40
vgatherdpd 8(%rdx,%ymm3,8), %zmm17{%k2} #155.40
vgatherdpd (%rdx,%ymm3,8), %zmm18{%k3} #155.40
# LOE rdx rbx r13 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm17 zmm18
# Advance both index counters by 8; arithmetic on the gathered values is disabled.
..B1.31: # Preds ..B1.30
# Execution count [2.50e+01]
addl $8, %r12d #153.13
addq $8, %rbx #153.13
#vsubpd %zmm4, %zmm2, %zmm26 #157.40
#vsubpd %zmm17, %zmm0, %zmm24 #156.40
#vsubpd %zmm18, %zmm1, %zmm23 #155.40
#vmulpd %zmm24, %zmm24, %zmm3 #158.53
#vfmadd231pd %zmm23, %zmm23, %zmm3 #158.53
#vfmadd231pd %zmm26, %zmm26, %zmm3 #158.67
#vrcp14pd %zmm3, %zmm22 #175.42
#vcmppd $1, %zmm14, %zmm3, %k2 #174.26
#vfpclasspd $30, %zmm22, %k0 #175.42
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm22, %zmm3 #175.42
#knotw %k0, %k1 #175.42
#vmulpd %zmm3, %zmm3, %zmm4 #175.42
#vfmadd213pd %zmm22, %zmm3, %zmm22{%k1} #175.42
#vfmadd213pd %zmm22, %zmm4, %zmm22{%k1} #175.42
#vmulpd %zmm13, %zmm22, %zmm17 #176.42
#vmulpd %zmm9, %zmm22, %zmm19 #177.58
#vmulpd %zmm17, %zmm22, %zmm20 #176.48
#vmulpd %zmm20, %zmm22, %zmm18 #176.54
#vfmsub213pd %zmm5, %zmm20, %zmm22 #177.58
#vmulpd %zmm19, %zmm18, %zmm21 #177.65
#vmulpd %zmm22, %zmm21, %zmm25 #177.71
#vfmadd231pd %zmm23, %zmm25, %zmm8{%k2} #178.21
#vfmadd231pd %zmm24, %zmm25, %zmm7{%k2} #179.21
#vfmadd231pd %zmm26, %zmm25, %zmm11{%k2} #180.21
cmpl %r14d, %r12d #153.13
jb ..B1.27 # Prob 82% #153.13
# LOE rdx rbx r13 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# Main loop done: restore the registers spilled in ..B1.26.
..B1.32: # Preds ..B1.31
# Execution count [4.50e+00]
movq 40(%rsp), %rax #[spill]
movq 48(%rsp), %rcx #[spill]
movq 56(%rsp), %rsi #[spill]
movq 64(%rsp), %r8 #[spill]
movq 72(%rsp), %rdi #[spill]
movq 24(%rsp), %r9 #[spill]
movq 32(%rsp), %r10 #[spill]
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# ---- Remainder (fewer than 8 neighbors left, one masked pass) ----------------
# Skip if the main loop end (%r14d) already equals the neighbor count.
..B1.33: # Preds ..B1.32 ..B1.25 ..B1.45
# Execution count [5.00e+00]
lea 1(%r14), %ebx #153.13
cmpl %r11d, %ebx #153.13
ja ..B1.39 # Prob 50% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# %rdi = neighbor row address (overwrites the atom index; it is recomputed in
# ..B1.40), %r11d = remaining count, k3 = lanes below that count; masked index
# load starting at element %r14, then index * 3 in %ymm0.
..B1.34: # Preds ..B1.33
# Execution count [2.50e+01]
imulq %rcx, %rdi #125.43
vbroadcastsd %xmm10, %zmm4 #127.23
subl %r14d, %r11d #153.13
addq %rsi, %rdi #108.5
vpbroadcastd %r11d, %ymm0 #153.13
vpcmpgtd %ymm15, %ymm0, %k3 #153.13
movslq %r14d, %r14 #153.13
vmovdqu32 (%rdi,%r14,4), %ymm1{%k3}{z} #154.25
kmovw %k3, %edi #153.13
vpaddd %ymm1, %ymm1, %ymm2 #155.40
vpaddd %ymm2, %ymm1, %ymm0 #155.40
# LOE rax rdx rcx rsi r8 r9 r10 edi xmm6 xmm12 ymm0 ymm15 ymm16 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
# Masked gathers of the three coordinates, same pattern as in the peel loop.
..B1.37: # Preds ..B1.34
# Execution count [1.25e+01]
kmovw %k3, %k1 #155.40
kmovw %k3, %k2 #155.40
vpxord %zmm1, %zmm1, %zmm1 #155.40
vpxord %zmm2, %zmm2, %zmm2 #155.40
vpxord %zmm3, %zmm3, %zmm3 #155.40
vgatherdpd 16(%rdx,%ymm0,8), %zmm1{%k1} #155.40
vgatherdpd 8(%rdx,%ymm0,8), %zmm2{%k2} #155.40
vgatherdpd (%rdx,%ymm0,8), %zmm3{%k3} #155.40
# LOE rax rdx rcx rsi r8 r9 r10 edi xmm6 xmm12 ymm15 ymm16 zmm1 zmm2 zmm3 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# Every instruction of this basic block is commented out.
..B1.38: # Preds ..B1.37
# Execution count [2.50e+01]
#vbroadcastsd %xmm6, %zmm6 #128.23
#vbroadcastsd %xmm12, %zmm12 #129.23
#vsubpd %zmm1, %zmm12, %zmm23 #157.40
#vsubpd %zmm2, %zmm6, %zmm21 #156.40
#vsubpd %zmm3, %zmm4, %zmm20 #155.40
#vmulpd %zmm21, %zmm21, %zmm19 #158.53
#vfmadd231pd %zmm20, %zmm20, %zmm19 #158.53
#vfmadd231pd %zmm23, %zmm23, %zmm19 #158.67
#vrcp14pd %zmm19, %zmm18 #175.42
#vcmppd $1, %zmm14, %zmm19, %k2 #174.26
#vfpclasspd $30, %zmm18, %k0 #175.42
#kmovw %k2, %ebx #174.26
#knotw %k0, %k1 #175.42
#vmovaps %zmm19, %zmm0 #175.42
#andl %ebx, %edi #174.26
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm18, %zmm0 #175.42
#kmovw %edi, %k3 #178.21
#vmulpd %zmm0, %zmm0, %zmm1 #175.42
#vfmadd213pd %zmm18, %zmm0, %zmm18{%k1} #175.42
#vfmadd213pd %zmm18, %zmm1, %zmm18{%k1} #175.42
#vmulpd %zmm13, %zmm18, %zmm2 #176.42
#vmulpd %zmm9, %zmm18, %zmm4 #177.58
#vmulpd %zmm2, %zmm18, %zmm10 #176.48
#vmulpd %zmm10, %zmm18, %zmm3 #176.54
#vfmsub213pd %zmm5, %zmm10, %zmm18 #177.58
#vmulpd %zmm4, %zmm3, %zmm17 #177.65
#vmulpd %zmm18, %zmm17, %zmm22 #177.71
#vfmadd231pd %zmm20, %zmm22, %zmm8{%k3} #178.21
#vfmadd231pd %zmm21, %zmm22, %zmm7{%k3} #179.21
#vfmadd231pd %zmm23, %zmm22, %zmm11{%k3} #180.21
# LOE rax rdx rcx rsi r8 r9 r10 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# ---- Horizontal reduction ----------------------------------------------------
# Sum the 8 lanes of %zmm8, %zmm7, %zmm11 into element 0 of %xmm24, %xmm18, %xmm4:
# fold the upper 256 bits onto the lower (vpermd with dword indices 8..15), then
# swap 128-bit halves ($78), then swap adjacent doubles ($177), adding each time.
# With the accumulation disabled above, the accumulators are still zero here.
..B1.39: # Preds ..B1.24 ..B1.38 ..B1.33
# Execution count [4.50e+00]
vmovups .L_2il0floatpacket.10(%rip), %zmm19 #132.22
vpermd %zmm11, %zmm19, %zmm0 #132.22
vpermd %zmm7, %zmm19, %zmm6 #131.22
vpermd %zmm8, %zmm19, %zmm20 #130.22
vaddpd %zmm11, %zmm0, %zmm11 #132.22
vaddpd %zmm7, %zmm6, %zmm7 #131.22
vaddpd %zmm8, %zmm20, %zmm8 #130.22
vpermpd $78, %zmm11, %zmm1 #132.22
vpermpd $78, %zmm7, %zmm10 #131.22
vpermpd $78, %zmm8, %zmm21 #130.22
vaddpd %zmm1, %zmm11, %zmm2 #132.22
vaddpd %zmm10, %zmm7, %zmm12 #131.22
vaddpd %zmm21, %zmm8, %zmm22 #130.22
vpermpd $177, %zmm2, %zmm3 #132.22
vpermpd $177, %zmm12, %zmm17 #131.22
vpermpd $177, %zmm22, %zmm23 #130.22
vaddpd %zmm3, %zmm2, %zmm4 #132.22
vaddpd %zmm17, %zmm12, %zmm18 #131.22
vaddpd %zmm23, %zmm22, %zmm24 #130.22
# LOE rax rdx rcx rsi r8 r9 r10 xmm4 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
# ---- Atom loop tail ----------------------------------------------------------
# Advance the coordinate offset by 24 bytes, set %rdi = next atom index, and loop
# while the incremented index is below the count saved at 80(%rsp).
# The read-modify-write of the three output arrays is commented out.
..B1.40: # Preds ..B1.39 ..B1.10
# Execution count [5.00e+00]
movq 96(%rsp), %rbx #188.9[spill]
addq $24, %rax #124.5
movslq %r8d, %rdi #124.32
incq %rdi #124.32
#vaddsd (%rbx,%r8,8), %xmm24, %xmm0 #188.9
#vmovsd %xmm0, (%rbx,%r8,8) #188.9
#vaddsd (%r10,%r8,8), %xmm18, %xmm1 #189.9
#vmovsd %xmm1, (%r10,%r8,8) #189.9
#vaddsd (%r9,%r8,8), %xmm4, %xmm2 #190.9
#vmovsd %xmm2, (%r9,%r8,8) #190.9
incq %r8 #124.5
cmpq 80(%rsp), %r8 #124.5[spill]
jb ..B1.10 # Prob 82% #124.5
# LOE rax rdx rcx rsi rdi r8 r9 r10 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
# Restore the callee-saved registers that were spilled in ..B1.9.
..B1.41: # Preds ..B1.40
# Execution count [9.00e-01]
movq 8(%rsp), %r15 #[spill]
.cfi_restore 15
movq (%rsp), %rbx #[spill]
.cfi_restore 3
# LOE rbx r15
# Clear the upper vector state, then take the end timestamp.
..B1.42: # Preds ..B1.2 ..B1.41
# Execution count [1.00e+00]
xorl %eax, %eax #201.16
vzeroupper #201.16
..___tag_value_computeForce.43:
# getTimeStamp()
call getTimeStamp #201.16
..___tag_value_computeForce.44:
# LOE rbx r15 xmm0
# Return value: end timestamp minus the start timestamp saved at 16(%rsp).
# Epilogue mirrors the prologue: release 104 bytes, pop %r14/%r13/%r12,
# restore %rsp from %rbp, pop %rbp.
..B1.43: # Preds ..B1.42
# Execution count [1.00e+00]
vsubsd 16(%rsp), %xmm0, %xmm0 #204.14[spill]
addq $104, %rsp #204.14
.cfi_restore 14
popq %r14 #204.14
.cfi_restore 13
popq %r13 #204.14
.cfi_restore 12
popq %r12 #204.14
movq %rbp, %rsp #204.14
popq %rbp #204.14
.cfi_def_cfa 7, 8
.cfi_restore 6
ret #204.14
.cfi_def_cfa 6, 16
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
.cfi_offset 6, -16
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
# LOE
# Out-of-line path for 8 <= count < 1200: no peel (%r12d = 0), main loop end
# %r14d = count rounded down to a multiple of 8.
..B1.44: # Preds ..B1.12
# Execution count [4.50e-01]: Infreq
movl %r11d, %r14d #153.13
xorl %r12d, %r12d #153.13
andl $-8, %r14d #153.13
jmp ..B1.25 # Prob 100% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# Out-of-line path for count < 8: the remainder pass starts at element 0.
..B1.45: # Preds ..B1.11
# Execution count [4.50e-01]: Infreq
xorl %r14d, %r14d #153.13
jmp ..B1.33 # Prob 100% #153.13
.align 16,0x90
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
.cfi_endproc
# mark_end;
.type computeForce,@function
.size computeForce,.-computeForce
..LNcomputeForce.0:
.data
# -- End computeForce
# Read-only constant pool for computeForce. Live instructions in the function
# reference only .0, .1, .3, .4 and .10; .9 is referenced only from commented-out
# lines, and .2, .5, .6, .7, .8 are not referenced by computeForce at all.
.section .rodata, "a"
.align 64
.align 64
# 8 doubles, each 1.0 (0x3ff0000000000000).
.L_2il0floatpacket.2:
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
.type .L_2il0floatpacket.2,@object
.size .L_2il0floatpacket.2,64
# 8 doubles, each 0.5 (0x3fe0000000000000); loaded into %zmm5.
.align 64
.L_2il0floatpacket.4:
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
.type .L_2il0floatpacket.4,@object
.size .L_2il0floatpacket.4,64
# Byte table: each single-bit value 0x01, 0x02, ..., 0x80 repeated three times
# (24 bytes), followed by 40 zero bytes.
.align 64
.L_2il0floatpacket.5:
.long 0x02010101,0x04040202,0x08080804,0x20101010,0x40402020,0x80808040,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000
.type .L_2il0floatpacket.5,@object
.size .L_2il0floatpacket.5,64
# 8 qwords: 0, 4, 8, 12, 1, 5, 9, 13.
.align 64
.L_2il0floatpacket.6:
.long 0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000,0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000
.type .L_2il0floatpacket.6,@object
.size .L_2il0floatpacket.6,64
# 8 qwords: 1, 5, 9, 13, 0, 4, 8, 12.
.align 64
.L_2il0floatpacket.7:
.long 0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000,0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000
.type .L_2il0floatpacket.7,@object
.size .L_2il0floatpacket.7,64
# 8 qwords: 2, 6, 10, 14, 2, 6, 10, 14.
.align 64
.L_2il0floatpacket.8:
.long 0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000,0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000
.type .L_2il0floatpacket.8,@object
.size .L_2il0floatpacket.8,64
# 16 dwords: 8..15 twice; vpermd index vector used by the horizontal reduction
# to copy the upper 256 bits of a zmm register into both halves.
.align 64
.L_2il0floatpacket.10:
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
.type .L_2il0floatpacket.10,@object
.size .L_2il0floatpacket.10,64
# 8 dwords, each 8; loaded into %ymm16 (its only consumer is a commented-out vpaddd).
.align 32
.L_2il0floatpacket.0:
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
.type .L_2il0floatpacket.0,@object
.size .L_2il0floatpacket.0,32
# 8 dwords: 0..7; lane indices loaded into %ymm15 for the masked compares.
.align 32
.L_2il0floatpacket.1:
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
.type .L_2il0floatpacket.1,@object
.size .L_2il0floatpacket.1,32
# One double: 48.0 (0x4048000000000000); multiplied into %xmm0 in ..B1.9.
.align 8
.L_2il0floatpacket.3:
.long 0x00000000,0x40480000
.type .L_2il0floatpacket.3,@object
.size .L_2il0floatpacket.3,8
# One double: 1.0 (0x3ff0000000000000).
.align 8
.L_2il0floatpacket.9:
.long 0x00000000,0x3ff00000
.type .L_2il0floatpacket.9,@object
.size .L_2il0floatpacket.9,8
.data
.section .note.GNU-stack, ""
# End

View File

@@ -1,324 +0,0 @@
.intel_syntax noprefix
.text
#-----------------------------------------------------------------------
# computeForce(Parameter *param, Atom *atom, Neighbor *neighbor)
#   Hand-written AVX-512 Lennard-Jones-style force kernel for the AOS
#   position layout (x, y, z of atom j at atom->x[3*j .. 3*j+2]).
#   Zeroes fx/fy/fz, then for every local atom accumulates the force
#   contributions of its neighbors that lie inside the cutoff radius.
#
# Syntax: GAS with Intel operand order (dst, src)
# ABI:    System V AMD64
# In:     rdi = param, rsi = atom, rdx = neighbor
# Out:    xmm0 = 0.0 (the getTimeStamp calls are commented out)
# Saved:  rbp, rbx, r12-r15 are pushed on entry and popped on exit
# Stack:  leaf function; locals live in the red zone, [-32+rsp .. -8+rsp].
#         NOTE(review): re-enabling the getTimeStamp calls needs a real
#         stack frame first -- a call may overwrite the red zone, and rsp
#         is not 16-byte aligned after the six pushes.
#
# Register roles inside the atom loop:
#   r10 = i (atom index)            rax = i * 24 (byte offset into atom->x)
#   rdx = atom->x                   rdi = atom->fz
#   r11 = neighbor->neighbors       r12 = neighbor->maxneighs * 4
#   rcx = scratch / &neighbors[i * maxneighs]
#   r9  = k (neighbor index)        r13d = numneighs
#   r14d = neighbors still to do    r15d = remainder lane mask
#   zmm0/1/2 = x/y/z of atom i broadcast
#   zmm13/12/11 = fix/fiy/fiz accumulators (8 lanes each)
#   zmm16 = cutforce^2, zmm15 = sigma6, zmm14 = 48 * epsilon, zmm7 = 0.5
#   ymm17 = lane indices 0..7
#-----------------------------------------------------------------------
        .align 16,0x90
        .globl computeForce
computeForce:
# parameter 1: rdi Parameter*
# parameter 2: rsi Atom*
# parameter 3: rdx Neighbor*
        push    rbp
        push    r12
        push    r13
        push    r14
        push    r15
        push    rbx
        #call getTimeStamp                              # xmm0 <- getTimeStamp()
        #vmovsd QWORD PTR [-56+rsp], xmm0               # [-56+rsp] <- xmm0 [spill]
        mov     r9d, DWORD PTR [4+rsi]                  # r9d  <- atom->Nlocal
        vmovsd  xmm2, QWORD PTR [96+rdi]                # xmm2 <- param->cutforce
        vmovsd  xmm1, QWORD PTR [32+rdi]                # xmm1 <- param->sigma6
        vmovsd  xmm0, QWORD PTR [24+rdi]                # xmm0 <- param->epsilon
        mov     r13, QWORD PTR [64+rsi]                 # r13  <- atom->fx
        mov     r14, QWORD PTR [72+rsi]                 # r14  <- atom->fy
        mov     rdi, QWORD PTR [80+rsi]                 # rdi  <- atom->fz
        # No local atoms: leave through the epilogue. The atom loop must not
        # be entered here because its index, offsets and red-zone slots are
        # only initialized further down.
        test    r9d, r9d
        jle     ..function_exit

        # ---- Zero fx/fy/fz, two elements per iteration ----
        xor     r10d, r10d                              # r10 <- 0 (pair counter)
        mov     ecx, r9d                                # ecx <- atom->Nlocal
        xor     r8d, r8d                                # r8  <- 0 (byte offset)
        mov     r11d, 1                                 # r11d <- 1-based index of next element
        xor     eax, eax                                # rax <- 0 (value to store)
        shr     ecx, 1                                  # ecx <- Nlocal / 2 (number of pairs)
        je      ..zero_last_element                     # Nlocal == 1: only the odd element
..init_force_loop:
        mov     QWORD PTR [r8+r13], rax                 # fx[2p]     <- 0
        mov     QWORD PTR [r8+r14], rax                 # fy[2p]     <- 0
        mov     QWORD PTR [r8+rdi], rax                 # fz[2p]     <- 0
        mov     QWORD PTR [8+r8+r13], rax               # fx[2p + 1] <- 0
        mov     QWORD PTR [8+r8+r14], rax               # fy[2p + 1] <- 0
        mov     QWORD PTR [8+r8+rdi], rax               # fz[2p + 1] <- 0
        add     r8, 16                                  # advance two doubles
        inc     r10                                     # p++
        cmp     r10, rcx                                # p < Nlocal / 2
        jb      ..init_force_loop
        # r11d becomes the 1-based index of the first element not yet zeroed.
        # Possible simplification: zero r11d above, use r10+r10 here and drop
        # the -1 corrections below.
        lea     r11d, DWORD PTR [1+r10+r10]             # r11d <- 2 * p + 1
..zero_last_element:
        lea     ecx, DWORD PTR [-1+r11]                 # ecx <- 0-based index of next element
        cmp     ecx, r9d                                # already >= Nlocal (even count)?
        jae     ..before_atom_loop
        # Odd Nlocal: zero the last element (-8 undoes the 1-based index)
        movsxd  r11, r11d
        mov     QWORD PTR [-8+r13+r11*8], rax           # fx[Nlocal - 1] <- 0
        mov     QWORD PTR [-8+r14+r11*8], rax           # fy[Nlocal - 1] <- 0
        mov     QWORD PTR [-8+rdi+r11*8], rax           # fz[Nlocal - 1] <- 0

        # ---- Loop-invariant setup for the atom loop ----
..before_atom_loop:
        vmulsd  xmm15, xmm2, xmm2                                       # xmm15 <- cutforcesq
        vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip]          # ymm18 <- [8, ...] (not read again below)
        vmulsd  xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip]         # xmm0  <- 48 * epsilon
        vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip]          # ymm17 <- [0..7]
        vmovups zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip]             # zmm7  <- [0.5, ...]
        vbroadcastsd zmm16, xmm15                       # zmm16 <- [cutforcesq, ...]
        vbroadcastsd zmm15, xmm1                        # zmm15 <- [param->sigma6, ...]
        vbroadcastsd zmm14, xmm0                        # zmm14 <- [48 * epsilon, ...]
        movsxd  r9, r9d                                 # r9  <- atom->Nlocal
        xor     r10d, r10d                              # r10 <- 0 (i)
        mov     rcx, QWORD PTR [24+rdx]                 # rcx <- neighbor->numneigh
        mov     r11, QWORD PTR [8+rdx]                  # r11 <- neighbor->neighbors
        movsxd  r12, DWORD PTR [16+rdx]                 # r12 <- neighbor->maxneighs
        mov     rdx, QWORD PTR [16+rsi]                 # rdx <- atom->x
### AOS
        xor     eax, eax                                # rax <- 0 (byte offset of atom i in atom->x)
### SOA
        #mov rax, QWORD PTR [24+rsi]                    # rax <- atom->y
        #mov rsi, QWORD PTR [32+rsi]                    # rsi <- atom->z
###
        shl     r12, 2                                  # r12 <- neighbor->maxneighs * 4
        # Red-zone locals (rcx, r13 and r14 are reused as scratch in the loop)
        mov     QWORD PTR [-32+rsp], r9                 # [-32+rsp] <- atom->Nlocal
        mov     QWORD PTR [-24+rsp], rcx                # [-24+rsp] <- neighbor->numneigh
        mov     QWORD PTR [-16+rsp], r14                # [-16+rsp] <- atom->fy
        mov     QWORD PTR [-8+rsp], r13                 # [-8+rsp]  <- atom->fx

        # ---- Atom loop ----
..atom_loop_begin:
        mov     rcx, QWORD PTR [-24+rsp]                # rcx <- neighbor->numneigh
        vxorpd  xmm25, xmm25, xmm25                     # xmm25 <- 0 (fix)
        vmovapd xmm20, xmm25                            # xmm20 <- 0 (fiy)
        mov     r13d, DWORD PTR [rcx+r10*4]             # r13d <- neighbor->numneigh[i] (numneighs)
        vmovapd xmm4, xmm20                             # xmm4  <- 0 (fiz)
### AOS
        vmovsd  xmm8, QWORD PTR [rdx+rax]               # xmm8  <- atom->x[i * 3]
        vmovsd  xmm9, QWORD PTR [8+rdx+rax]             # xmm9  <- atom->x[i * 3 + 1]
        vmovsd  xmm10, QWORD PTR [16+rdx+rax]           # xmm10 <- atom->x[i * 3 + 2]
### SOA
        #vmovsd xmm8, QWORD PTR [rdx+r10*8]             # xmm8  <- atom->x[i]
        #vmovsd xmm9, QWORD PTR [rax+r10*8]             # xmm9  <- atom->y[i]
        #vmovsd xmm10, QWORD PTR [rsi+r10*8]            # xmm10 <- atom->z[i]
###
        vbroadcastsd zmm0, xmm8                         # zmm0 <- atom_x(i)
        vbroadcastsd zmm1, xmm9                         # zmm1 <- atom_y(i)
        vbroadcastsd zmm2, xmm10                        # zmm2 <- atom_z(i)
        # No neighbors: go to the loop tail, which then adds the zeroed
        # xmm25/xmm20/xmm4 to the force arrays.
        test    r13d, r13d
        jle     ..atom_loop_exit
        vpxord  zmm13, zmm13, zmm13                     # zmm13 <- 0 (fix)
        vmovaps zmm12, zmm13                            # zmm12 <- 0 (fiy)
        vmovaps zmm11, zmm12                            # zmm11 <- 0 (fiz)
        mov     rcx, r12                                # rcx <- neighbor->maxneighs * 4
        imul    rcx, r10                                # rcx <- neighbor->maxneighs * 4 * i
        add     rcx, r11                                # rcx <- &neighbor->neighbors[neighbor->maxneighs * i]
        xor     r9d, r9d                                # r9  <- 0 (k)
        mov     r14d, r13d                              # r14d <- numneighs (remaining)
        cmp     r14d, 8
        jl      ..compute_forces_remainder

        # ---- Full groups of 8 neighbors (all lanes active) ----
..compute_forces:
        vpcmpeqb k1, xmm0, xmm0                         # k1..k3 <- all ones; every gather
        vpcmpeqb k2, xmm0, xmm0                         # consumes (clears) its own mask
        vpcmpeqb k3, xmm0, xmm0
        vmovdqu ymm3, YMMWORD PTR [rcx+r9*4]            # ymm3 <- neighbor indices j[k .. k+7]
        vpxord  zmm5, zmm5, zmm5
        vpxord  zmm6, zmm6, zmm6
### AOS
        vpaddd  ymm4, ymm3, ymm3                        # ymm4 <- 2 * j
        vpaddd  ymm3, ymm3, ymm4                        # ymm3 <- 3 * j
        vpxord  zmm4, zmm4, zmm4
        vgatherdpd zmm4{k1}, [rdx+ymm3*8]               # zmm4 <- atom_x(j)
        vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]             # zmm5 <- atom_y(j)
        vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]            # zmm6 <- atom_z(j)
### SOA
        #vpxord zmm4, zmm4, zmm4
        #vgatherdpd zmm5{k2}, [rax+ymm3*8]
        #vgatherdpd zmm4{k1}, [rdx+ymm3*8]
        #vgatherdpd zmm6{k3}, [rsi+ymm3*8]
###
        vsubpd  zmm29, zmm1, zmm5                       # zmm29 <- atom_y(i) - atom_y(j) -- dely
        vsubpd  zmm28, zmm0, zmm4                       # zmm28 <- atom_x(i) - atom_x(j) -- delx
        vsubpd  zmm31, zmm2, zmm6                       # zmm31 <- atom_z(i) - atom_z(j) -- delz
        vmulpd  zmm20, zmm29, zmm29                     # zmm20 <- dely * dely
        vfmadd231pd zmm20, zmm28, zmm28                 # zmm20 <- dely * dely + delx * delx
        vfmadd231pd zmm20, zmm31, zmm31                 # zmm20 <- zmm20 + delz * delz -- rsq
        # Cutoff radius condition. vrcp14pd is an approximate reciprocal and
        # no refinement step follows it here.
        vrcp14pd zmm27, zmm20                           # zmm27 <- ~1.0 / rsq (sr2)
        vcmppd  k5, zmm20, zmm16, 1                     # k5 <- rsq < cutforcesq
        vmulpd  zmm22, zmm27, zmm15                     # zmm22 <- sr2 * sigma6
        vmulpd  zmm24, zmm27, zmm14                     # zmm24 <- 48.0 * epsilon * sr2
        vmulpd  zmm25, zmm27, zmm22                     # zmm25 <- sr2 * sr2 * sigma6
        vmulpd  zmm23, zmm27, zmm25                     # zmm23 <- sr2 * sr2 * sr2 * sigma6 (sr6)
        vfmsub213pd zmm27, zmm25, zmm7                  # zmm27 <- sr6 - 0.5
        vmulpd  zmm26, zmm23, zmm24                     # zmm26 <- 48.0 * epsilon * sr2 * sr6
        vmulpd  zmm30, zmm26, zmm27                     # zmm30 <- force
        vfmadd231pd zmm13{k5}, zmm30, zmm28             # fix += force * delx
        vfmadd231pd zmm12{k5}, zmm30, zmm29             # fiy += force * dely
        vfmadd231pd zmm11{k5}, zmm30, zmm31             # fiz += force * delz
        sub     r14d, 8                                 # 8 fewer neighbors remaining
        add     r9, 8                                   # k += 8
        cmp     r14d, 8
        jge     ..compute_forces

        # ---- Remainder: 1..7 neighbors, handled with a lane mask ----
..compute_forces_remainder:
        test    r14d, r14d
        jle     ..sum_up_forces
        vpbroadcastd ymm4, r14d                         # ymm4 <- [remaining, ...]
        vpcmpgtd k1, ymm4, ymm17                        # k1 <- lane < remaining
        kmovw   r15d, k1                                # keep a copy; the gather clears k1
        vmovdqu32 ymm3{k1}{z}, YMMWORD PTR [rcx+r9*4]   # ymm3 <- j for active lanes, 0 elsewhere
        kmovw   k2, k1
        kmovw   k3, k1
        vpxord  zmm5, zmm5, zmm5
        vpxord  zmm6, zmm6, zmm6
### AOS
        vpaddd  ymm4, ymm3, ymm3                        # ymm4 <- 2 * j
        vpaddd  ymm3, ymm3, ymm4                        # ymm3 <- 3 * j
        vpxord  zmm4, zmm4, zmm4
        vgatherdpd zmm4{k1}, [rdx+ymm3*8]               # zmm4 <- atom_x(j)
        vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]             # zmm5 <- atom_y(j)
        vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]            # zmm6 <- atom_z(j)
### SOA
        #vpxord zmm4, zmm4, zmm4
        #vgatherdpd zmm5{k2}, [rax+ymm3*8]
        #vgatherdpd zmm4{k1}, [rdx+ymm3*8]
        #vgatherdpd zmm6{k3}, [rsi+ymm3*8]
###
        vsubpd  zmm29, zmm1, zmm5                       # zmm29 <- atom_y(i) - atom_y(j) -- dely
        vsubpd  zmm28, zmm0, zmm4                       # zmm28 <- atom_x(i) - atom_x(j) -- delx
        vsubpd  zmm31, zmm2, zmm6                       # zmm31 <- atom_z(i) - atom_z(j) -- delz
        vmulpd  zmm20, zmm29, zmm29                     # zmm20 <- dely * dely
        vfmadd231pd zmm20, zmm28, zmm28                 # zmm20 <- dely * dely + delx * delx
        vfmadd231pd zmm20, zmm31, zmm31                 # zmm20 <- zmm20 + delz * delz -- rsq
        # Cutoff radius condition, combined with the lane mask
        vrcp14pd zmm27, zmm20                           # zmm27 <- ~1.0 / rsq (sr2)
        vcmppd  k5, zmm20, zmm16, 1                     # k5 <- rsq < cutforcesq
        kmovw   r9d, k5                                 # r9d <- rsq < cutforcesq (k is no longer needed)
        and     r15d, r9d                               # r15d <- rsq < cutforcesq && k < numneighs
        kmovw   k3, r15d                                # k3 <- rsq < cutforcesq && k < numneighs
        vmulpd  zmm22, zmm27, zmm15                     # zmm22 <- sr2 * sigma6
        vmulpd  zmm24, zmm27, zmm14                     # zmm24 <- 48.0 * epsilon * sr2
        vmulpd  zmm25, zmm27, zmm22                     # zmm25 <- sr2 * sr2 * sigma6
        vmulpd  zmm23, zmm27, zmm25                     # zmm23 <- sr2 * sr2 * sr2 * sigma6 (sr6)
        vfmsub213pd zmm27, zmm25, zmm7                  # zmm27 <- sr6 - 0.5
        vmulpd  zmm26, zmm23, zmm24                     # zmm26 <- 48.0 * epsilon * sr2 * sr6
        vmulpd  zmm30, zmm26, zmm27                     # zmm30 <- force
        vfmadd231pd zmm13{k3}, zmm30, zmm28             # fix += force * delx
        vfmadd231pd zmm12{k3}, zmm30, zmm29             # fiy += force * dely
        vfmadd231pd zmm11{k3}, zmm30, zmm31             # fiz += force * delz

        # ---- Horizontal reduction ----
        # The partial forces sit in the 8 lanes of zmm13/zmm12/zmm11. Fold the
        # upper 256 bits onto the lower, then swap 128-bit halves (78), then
        # swap adjacent doubles (177), adding each time; element 0 of
        # xmm25/xmm20/xmm4 ends up holding fix/fiy/fiz.
..sum_up_forces:
        vmovups zmm10, ZMMWORD PTR .L_2il0floatpacket.6[rip]            # dword indices 8..15, twice
        vpermd  zmm0, zmm10, zmm11
        vpermd  zmm5, zmm10, zmm12
        vpermd  zmm21, zmm10, zmm13
        vaddpd  zmm11, zmm0, zmm11
        vaddpd  zmm12, zmm5, zmm12
        vaddpd  zmm13, zmm21, zmm13
        vpermpd zmm1, zmm11, 78
        vpermpd zmm6, zmm12, 78
        vpermpd zmm22, zmm13, 78
        vaddpd  zmm2, zmm11, zmm1
        vaddpd  zmm8, zmm12, zmm6
        vaddpd  zmm23, zmm13, zmm22
        vpermpd zmm3, zmm2, 177
        vpermpd zmm9, zmm8, 177
        vpermpd zmm24, zmm23, 177
        vaddpd  zmm4, zmm2, zmm3                        # xmm4  <- fiz
        vaddpd  zmm20, zmm8, zmm9                       # xmm20 <- fiy
        vaddpd  zmm25, zmm23, zmm24                     # xmm25 <- fix

        # ---- Atom loop tail: add the per-atom force to fx/fy/fz ----
..atom_loop_exit:
        mov     rcx, QWORD PTR [-8+rsp]                 # rcx <- atom->fx
        mov     rbx, QWORD PTR [-16+rsp]                # rbx <- atom->fy
### AOS
        add     rax, 24                                 # next atom: 3 doubles further in atom->x
###
        vaddsd  xmm0, xmm25, QWORD PTR [rcx+r10*8]      # fx[i] += fix
        vmovsd  QWORD PTR [rcx+r10*8], xmm0
        vaddsd  xmm1, xmm20, QWORD PTR [rbx+r10*8]      # fy[i] += fiy
        vmovsd  QWORD PTR [rbx+r10*8], xmm1
        vaddsd  xmm2, xmm4, QWORD PTR [rdi+r10*8]       # fz[i] += fiz
        vmovsd  QWORD PTR [rdi+r10*8], xmm2
        inc     r10                                     # i++
        cmp     r10, QWORD PTR [-32+rsp]                # i < atom->Nlocal
        jb      ..atom_loop_begin

        # ---- Epilogue (also the target when there are no local atoms) ----
..function_exit:
        vzeroupper                                      # clear upper vector state before returning
        vxorpd  xmm0, xmm0, xmm0                        # return 0.0 while timing is disabled
        #call getTimeStamp                              # xmm0 <- getTimeStamp()
        #vsubsd xmm0, xmm0, QWORD PTR [-56+rsp]         # xmm0 <- E-S
        pop     rbx
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rbp
        ret
        .type computeForce,@function
        .size computeForce,.-computeForce
..LNcomputeForce.0:
.data
# -- End computeForce
# Read-only constant pool for the force kernel. Every 64-bit double below is
# written as two little-endian .long words (low word first, high word second).
.section .rodata, "a"
.align 64
.align 64
# 8 x double 1.0 (0x3ff0000000000000)
.L_2il0floatpacket.2:
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
.type .L_2il0floatpacket.2,@object
.size .L_2il0floatpacket.2,64
.align 64
# 8 x double 0.5 (0x3fe0000000000000)
.L_2il0floatpacket.4:
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
.type .L_2il0floatpacket.4,@object
.size .L_2il0floatpacket.4,64
.align 64
# 16 x dword index vector {8..15, 8..15}. Loaded by ..sum_up_forces as the
# vpermd control, which copies the upper 256 bits of a zmm register into both
# halves (first step of the horizontal force reduction).
.L_2il0floatpacket.6:
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
.type .L_2il0floatpacket.6,@object
.size .L_2il0floatpacket.6,64
.align 32
# 8 x dword 8
.L_2il0floatpacket.0:
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
.type .L_2il0floatpacket.0,@object
.size .L_2il0floatpacket.0,32
.align 32
# 8 x dword lane numbers {0, 1, ..., 7}
.L_2il0floatpacket.1:
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
.type .L_2il0floatpacket.1,@object
.size .L_2il0floatpacket.1,32
.align 8
# scalar double 48.0 (0x4048000000000000)
.L_2il0floatpacket.3:
.long 0x00000000,0x40480000
.type .L_2il0floatpacket.3,@object
.size .L_2il0floatpacket.3,8
.align 8
# scalar double 1.0 (0x3ff0000000000000)
.L_2il0floatpacket.5:
.long 0x00000000,0x3ff00000
.type .L_2il0floatpacket.5,@object
.size .L_2il0floatpacket.5,8
.data
.section .note.GNU-stack, ""
# End

View File

@@ -1,326 +0,0 @@
.intel_syntax noprefix
.text
.align 16,0x90
.globl computeForceLJ
#-----------------------------------------------------------------------
# computeForceLJ -- Lennard-Jones force kernel, AVX-512, double precision.
#                   Positions are read in AoS form (x[3*i .. 3*i+2]); forces
#                   are written to three separate arrays fx / fy / fz.
# Syntax: GAS, Intel syntax (noprefix)
# ABI:    System V AMD64
# In:     rdi = Parameter*   rsi = Atom*   rdx = Neighbor*
# Out:    xmm0 = 0.0 (the getTimeStamp calls are commented out)
# Stack:  six callee-saved registers are pushed; the spill slots
#         [-48+rsp] .. [-8+rsp] live below rsp (red zone), so this function
#         must stay a leaf while those slots are in use.
# NOTE:   every struct field offset below is hard-coded and must match the
#         C definitions of Parameter, Atom and Neighbor.
#
# Register roles inside the atom loop:
#   r10  = i (atom index)            rax = byte offset of atom i in atom->x
#   rdx  = atom->x                   rdi = atom->fz
#   r11  = neighbor->neighbors       r12 = neighbor->maxneighs * 4
#   r13d = numneighs                 r14d = neighbors still to process
#   r9   = k (neighbor index)        rcx = &neighbors[i * maxneighs]
#   zmm0/zmm1/zmm2    = x/y/z of atom i (broadcast)
#   zmm13/zmm12/zmm11 = per-lane fix/fiy/fiz accumulators
#   zmm7 = 0.5   zmm14 = 48*epsilon   zmm15 = sigma6   zmm16 = cutforcesq
#   ymm17 = lane numbers [0..7]
#-----------------------------------------------------------------------
computeForceLJ:
# parameter 1: rdi Parameter*
# parameter 2: rsi Atom*
# parameter 3: rdx Neighbor*
    push    rbp
    push    r12
    push    r13
    push    r14
    push    r15
    push    rbx
    mov     r9d, DWORD PTR [4+rsi]              # r9d <- atom->Nlocal
    vmovsd  xmm2, QWORD PTR [96+rdi]            # xmm2 <- param->cutforce
    vmovsd  xmm1, QWORD PTR [32+rdi]            # xmm1 <- param->sigma6
    vmovsd  xmm0, QWORD PTR [24+rdi]            # xmm0 <- param->epsilon
    mov     r13, QWORD PTR [64+rsi]             # r13 <- atom->fx
    mov     r14, QWORD PTR [72+rsi]             # r14 <- atom->fy
    mov     rdi, QWORD PTR [80+rsi]             # rdi <- atom->fz
    test    r9d, r9d                            # atom->Nlocal <= 0 ?
    # Nothing to compute: leave through the epilogue. The per-atom tail at
    # ..atom_loop_exit must not be used here because the spill slots, r10,
    # rax and the force accumulators are not initialised yet.
    jle     ..function_exit
    xor     r10d, r10d                          # r10 <- 0 (pair counter)
    mov     ecx, r9d                            # ecx <- atom->Nlocal
    xor     r8d, r8d                            # r8 <- 0 (byte offset into fx/fy/fz)
    mov     r11d, 1                             # r11d <- 1 (index of odd element + 1)
    xor     eax, eax                            # rax <- 0 (bit pattern of 0.0)
    shr     ecx, 1                              # ecx <- atom->Nlocal >> 1 (number of pairs)
    je      ..zero_last_element                 # no complete pair
# Init forces to zero loop (unroll factor = 2)
..init_force_loop:
    mov     QWORD PTR [r8+r13], rax             # fx[i] <- 0
    mov     QWORD PTR [r8+r14], rax             # fy[i] <- 0
    mov     QWORD PTR [r8+rdi], rax             # fz[i] <- 0
    mov     QWORD PTR [8+r8+r13], rax           # fx[i+1] <- 0
    mov     QWORD PTR [8+r8+r14], rax           # fy[i+1] <- 0
    mov     QWORD PTR [8+r8+rdi], rax           # fz[i+1] <- 0
    add     r8, 16                              # advance by two doubles
    inc     r10                                 # one more pair done
    cmp     r10, rcx                            # pairs done < Nlocal / 2 ?
    jb      ..init_force_loop
# Trick to make r11d contain the index of the last element to be zeroed plus 1.
# Maybe r10+r10 could be used directly here with r11d zeroed above, which
# would remove the -1 adjustments below.
    lea     r11d, DWORD PTR [1+r10+r10]         # r11d <- pairs * 2 + 1
..zero_last_element:
    lea     ecx, DWORD PTR [-1+r11]             # ecx <- index of the possible odd element
    cmp     ecx, r9d                            # index >= Nlocal ?
    jae     ..before_atom_loop                  # Nlocal was even: nothing left
# Set last element to zero
    movsxd  r11, r11d                           # r11 <- index + 1 (sign-extended)
    mov     QWORD PTR [-8+r13+r11*8], rax       # fx[index] <- 0
    mov     QWORD PTR [-8+r14+r11*8], rax       # fy[index] <- 0
    mov     QWORD PTR [-8+rdi+r11*8], rax       # fz[index] <- 0
# Initialize registers to be used within atom loop
..before_atom_loop:
    vmulsd  xmm15, xmm2, xmm2                   # xmm15 <- cutforcesq
    vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip] # ymm18 <- [8, ...] (not read again in this function)
    vmulsd  xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip] # xmm0 <- 48 * epsilon
    vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip] # ymm17 <- [0..7]
    vmovups zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip]    # zmm7 <- [0.5, ...]
    vbroadcastsd zmm16, xmm15                   # zmm16 <- [cutforcesq, ...]
    vbroadcastsd zmm15, xmm1                    # zmm15 <- [param->sigma6, ...]
    vbroadcastsd zmm14, xmm0                    # zmm14 <- [48 * epsilon, ...]
    movsxd  r9, r9d                             # r9 <- atom->Nlocal
    xor     r10d, r10d                          # r10 <- 0 (i)
    mov     rcx, QWORD PTR [24+rdx]             # rcx <- neighbor->numneigh
    mov     r11, QWORD PTR [8+rdx]              # r11 <- neighbor->neighbors
    movsxd  r12, DWORD PTR [16+rdx]             # r12 <- neighbor->maxneighs
    mov     rdx, QWORD PTR [16+rsi]             # rdx <- atom->x
### AOS
    xor     eax, eax                            # rax <- 0 (byte offset of atom 0 in atom->x)
### SOA
    #mov rax, QWORD PTR [24+rsi] # rax <- atom->y
    #mov rsi, QWORD PTR [32+rsi] # rsi <- atom->z
###
    shl     r12, 2                              # r12 <- neighbor->maxneighs * 4
# Register spilling (red zone)
    mov     QWORD PTR [-32+rsp], r9             # [-32+rsp] <- atom->Nlocal
    mov     QWORD PTR [-24+rsp], rcx            # [-24+rsp] <- neighbor->numneigh
    mov     QWORD PTR [-16+rsp], r14            # [-16+rsp] <- atom->fy
    mov     QWORD PTR [-8+rsp], r13             # [-8+rsp] <- atom->fx
    mov     QWORD PTR [-40+rsp], r15            # [-40+rsp] <- r15 (never reloaded; r15 is restored by pop)
    mov     QWORD PTR [-48+rsp], rbx            # [-48+rsp] <- rbx (never reloaded; rbx is restored by pop)
    #sub rsp, 64
    #call getTimeStamp # xmm0 <- getTimeStamp()
    #vmovsd QWORD PTR [-56+rsp], xmm0 # [-56+rsp] <- xmm0 [spill]
    #add rsp, 64
..atom_loop_begin:
    mov     rcx, QWORD PTR [-24+rsp]            # rcx <- neighbor->numneigh
    vxorpd  xmm25, xmm25, xmm25                 # xmm25 <- 0 (fix)
    vmovapd xmm20, xmm25                        # xmm20 <- 0 (fiy)
    mov     r13d, DWORD PTR [rcx+r10*4]         # r13d <- neighbor->numneigh[i] (numneighs)
    vmovapd xmm4, xmm20                         # xmm4 <- 0 (fiz)
### AOS
    vmovsd  xmm8, QWORD PTR [rdx+rax]           # xmm8 <- atom->x[i * 3]
    vmovsd  xmm9, QWORD PTR [8+rdx+rax]         # xmm9 <- atom->x[i * 3 + 1]
    vmovsd  xmm10, QWORD PTR [16+rdx+rax]       # xmm10 <- atom->x[i * 3 + 2]
### SOA
    #vmovsd xmm8, QWORD PTR [rdx+r10*8] # xmm8 <- atom->x[i]
    #vmovsd xmm9, QWORD PTR [rax+r10*8] # xmm9 <- atom->y[i]
    #vmovsd xmm10, QWORD PTR [rsi+r10*8] # xmm10 <- atom->z[i]
###
    vbroadcastsd zmm0, xmm8                     # zmm0 <- atom_x(i)
    vbroadcastsd zmm1, xmm9                     # zmm1 <- atom_y(i)
    vbroadcastsd zmm2, xmm10                    # zmm2 <- atom_z(i)
    test    r13d, r13d                          # numneighs <= 0 ?
    jle     ..atom_loop_exit                    # xmm25/xmm20/xmm4 are 0, so the tail adds 0
    vpxord  zmm13, zmm13, zmm13                 # zmm13 <- 0 (fix)
    vmovaps zmm12, zmm13                        # zmm12 <- 0 (fiy)
    vmovaps zmm11, zmm12                        # zmm11 <- 0 (fiz)
    mov     rcx, r12                            # rcx <- neighbor->maxneighs * 4
    imul    rcx, r10                            # rcx <- neighbor->maxneighs * 4 * i
    add     rcx, r11                            # rcx <- &neighbor->neighbors[neighbor->maxneighs * i]
    xor     r9d, r9d                            # r9 <- 0 (k)
    mov     r14d, r13d                          # r14d <- numneighs (remaining)
    cmp     r14d, 8
    jl      ..compute_forces_remainder          # fewer than 8 neighbors: masked path only
# Full-width body: 8 neighbors per iteration
..compute_forces:
    vpcmpeqb k1, xmm0, xmm0                     # k1 <- all ones (gather clears its mask, so rebuild each time)
    vpcmpeqb k2, xmm0, xmm0                     # k2 <- all ones
    vpcmpeqb k3, xmm0, xmm0                     # k3 <- all ones
    vmovdqu ymm3, YMMWORD PTR [rcx+r9*4]        # ymm3 <- neighbor indices j[k..k+7]
    vpxord  zmm5, zmm5, zmm5
    vpxord  zmm6, zmm6, zmm6
### AOS
    vpaddd  ymm4, ymm3, ymm3                    # ymm4 <- j * 2
    vpaddd  ymm3, ymm3, ymm4                    # ymm3 <- j * 3
    vpxord  zmm4, zmm4, zmm4
    vgatherdpd zmm4{k1}, [rdx+ymm3*8]           # zmm4 <- atom_x(j)
    vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]         # zmm5 <- atom_y(j)
    vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]        # zmm6 <- atom_z(j)
### SOA
    #vpxord zmm4, zmm4, zmm4
    #vgatherdpd zmm5{k2}, [rax+ymm3*8]
    #vgatherdpd zmm4{k1}, [rdx+ymm3*8]
    #vgatherdpd zmm6{k3}, [rsi+ymm3*8]
###
    vsubpd  zmm29, zmm1, zmm5                   # zmm29 <- atom_y(i) - atom_y(j) -- dely
    vsubpd  zmm28, zmm0, zmm4                   # zmm28 <- atom_x(i) - atom_x(j) -- delx
    vsubpd  zmm31, zmm2, zmm6                   # zmm31 <- atom_z(i) - atom_z(j) -- delz
    vmulpd  zmm20, zmm29, zmm29                 # zmm20 <- dely * dely
    vfmadd231pd zmm20, zmm28, zmm28             # zmm20 <- dely * dely + delx * delx
    vfmadd231pd zmm20, zmm31, zmm31             # zmm20 <- zmm20 + delz * delz -- rsq
# Cutoff radius condition
    vrcp14pd zmm27, zmm20                       # zmm27 <- approx. 1.0 / rsq (sr2)
    vcmppd  k5, zmm20, zmm16, 1                 # k5 <- rsq < cutforcesq
    vmulpd  zmm22, zmm27, zmm15                 # zmm22 <- sr2 * sigma6
    vmulpd  zmm24, zmm27, zmm14                 # zmm24 <- 48.0 * epsilon * sr2
    vmulpd  zmm25, zmm27, zmm22                 # zmm25 <- sr2 * sigma6 * sr2
    vmulpd  zmm23, zmm27, zmm25                 # zmm23 <- sr2 * sigma6 * sr2 * sr2 (sr6)
    vfmsub213pd zmm27, zmm25, zmm7              # zmm27 <- sr2 * sigma6 * sr2 * sr2 - 0.5
    vmulpd  zmm26, zmm23, zmm24                 # zmm26 <- 48.0 * epsilon * sr2 * sr6
    vmulpd  zmm30, zmm26, zmm27                 # zmm30 <- force
    vfmadd231pd zmm13{k5}, zmm30, zmm28         # fix += force * delx
    vfmadd231pd zmm12{k5}, zmm30, zmm29         # fiy += force * dely
    vfmadd231pd zmm11{k5}, zmm30, zmm31         # fiz += force * delz
    sub     r14d, 8                             # remaining -= 8
    add     r9, 8                               # k += 8
    cmp     r14d, 8
    jge     ..compute_forces
# Check if there are remaining neighbors to be computed
..compute_forces_remainder:
    test    r14d, r14d
    jle     ..sum_up_forces
    vpbroadcastd ymm4, r14d                     # ymm4 <- [remaining, ...]
    vpcmpgtd k1, ymm4, ymm17                    # k1 <- lane < remaining
    kmovw   r15d, k1                            # r15d <- lane < remaining (kept for the final mask)
    vmovdqu32 ymm3{k1}{z}, YMMWORD PTR [rcx+r9*4] # ymm3 <- remaining neighbor indices, 0 elsewhere
    kmovw   k2, k1
    kmovw   k3, k1
    vpxord  zmm5, zmm5, zmm5
    vpxord  zmm6, zmm6, zmm6
### AOS
    vpaddd  ymm4, ymm3, ymm3                    # ymm4 <- j * 2
    vpaddd  ymm3, ymm3, ymm4                    # ymm3 <- j * 3
    vpxord  zmm4, zmm4, zmm4
    vgatherdpd zmm4{k1}, [rdx+ymm3*8]           # zmm4 <- atom_x(j)
    vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]         # zmm5 <- atom_y(j)
    vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]        # zmm6 <- atom_z(j)
#### SOA
    #vpxord zmm4, zmm4, zmm4
    #vgatherdpd zmm5{k2}, [rax+ymm3*8]
    #vgatherdpd zmm4{k1}, [rdx+ymm3*8]
    #vgatherdpd zmm6{k3}, [rsi+ymm3*8]
###
    vsubpd  zmm29, zmm1, zmm5                   # zmm29 <- atom_y(i) - atom_y(j) -- dely
    vsubpd  zmm28, zmm0, zmm4                   # zmm28 <- atom_x(i) - atom_x(j) -- delx
    vsubpd  zmm31, zmm2, zmm6                   # zmm31 <- atom_z(i) - atom_z(j) -- delz
    vmulpd  zmm20, zmm29, zmm29                 # zmm20 <- dely * dely
    vfmadd231pd zmm20, zmm28, zmm28             # zmm20 <- dely * dely + delx * delx
    vfmadd231pd zmm20, zmm31, zmm31             # zmm20 <- zmm20 + delz * delz -- rsq
# Cutoff radius condition
    vrcp14pd zmm27, zmm20                       # zmm27 <- approx. 1.0 / rsq (sr2)
    vcmppd  k5, zmm20, zmm16, 1                 # k5 <- rsq < cutforcesq
    kmovw   r9d, k5                             # r9d <- rsq < cutforcesq (k is no longer needed for this atom)
    and     r15d, r9d                           # r15d <- rsq < cutforcesq && lane < remaining
    kmovw   k3, r15d                            # k3 <- rsq < cutforcesq && lane < remaining
    vmulpd  zmm22, zmm27, zmm15                 # zmm22 <- sr2 * sigma6
    vmulpd  zmm24, zmm27, zmm14                 # zmm24 <- 48.0 * epsilon * sr2
    vmulpd  zmm25, zmm27, zmm22                 # zmm25 <- sr2 * sigma6 * sr2
    vmulpd  zmm23, zmm27, zmm25                 # zmm23 <- sr2 * sigma6 * sr2 * sr2 (sr6)
    vfmsub213pd zmm27, zmm25, zmm7              # zmm27 <- sr2 * sigma6 * sr2 * sr2 - 0.5
    vmulpd  zmm26, zmm23, zmm24                 # zmm26 <- 48.0 * epsilon * sr2 * sr6
    vmulpd  zmm30, zmm26, zmm27                 # zmm30 <- force
    vfmadd231pd zmm13{k3}, zmm30, zmm28         # fix += force * delx
    vfmadd231pd zmm12{k3}, zmm30, zmm29         # fiy += force * dely
    vfmadd231pd zmm11{k3}, zmm30, zmm31         # fiz += force * delz
# Forces are currently spread over the lanes of zmm registers, hence it is
# necessary to permute and add them (reduction) to obtain the final
# contribution for the current atom. Lane 0 of zmm25 / zmm20 / zmm4 ends up
# holding the sum of all eight lanes of fix / fiy / fiz.
..sum_up_forces:
    vmovups zmm10, ZMMWORD PTR .L_2il0floatpacket.6[rip] # zmm10 <- dword indices {8..15, 8..15}
    vpermd  zmm0, zmm10, zmm11                  # upper 256 bits of fiz in both halves
    vpermd  zmm5, zmm10, zmm12                  # upper 256 bits of fiy in both halves
    vpermd  zmm21, zmm10, zmm13                 # upper 256 bits of fix in both halves
    vaddpd  zmm11, zmm0, zmm11                  # lanes 0-3: a[l] + a[l+4]
    vaddpd  zmm12, zmm5, zmm12
    vaddpd  zmm13, zmm21, zmm13
    vpermpd zmm1, zmm11, 78                     # swap 128-bit pairs inside each 256-bit half
    vpermpd zmm6, zmm12, 78
    vpermpd zmm22, zmm13, 78
    vaddpd  zmm2, zmm11, zmm1
    vaddpd  zmm8, zmm12, zmm6
    vaddpd  zmm23, zmm13, zmm22
    vpermpd zmm3, zmm2, 177                     # swap neighbouring doubles
    vpermpd zmm9, zmm8, 177
    vpermpd zmm24, zmm23, 177
    vaddpd  zmm4, zmm2, zmm3                    # xmm4 <- fiz
    vaddpd  zmm20, zmm8, zmm9                   # xmm20 <- fiy
    vaddpd  zmm25, zmm23, zmm24                 # xmm25 <- fix
# Per-atom tail: accumulate the reduced force into fx/fy/fz and advance i
..atom_loop_exit:
    mov     rcx, QWORD PTR [-8+rsp]             # rcx <- atom->fx (spill)
    mov     rbx, QWORD PTR [-16+rsp]            # rbx <- atom->fy (spill)
### AOS
    add     rax, 24                             # next atom position (3 doubles)
###
    vaddsd  xmm0, xmm25, QWORD PTR [rcx+r10*8]  # xmm0 <- fx[i] + fix
    vmovsd  QWORD PTR [rcx+r10*8], xmm0         # fx[i] <- xmm0
    vaddsd  xmm1, xmm20, QWORD PTR [rbx+r10*8]  # xmm1 <- fy[i] + fiy
    vmovsd  QWORD PTR [rbx+r10*8], xmm1         # fy[i] <- xmm1
    vaddsd  xmm2, xmm4, QWORD PTR [rdi+r10*8]   # xmm2 <- fz[i] + fiz
    vmovsd  QWORD PTR [rdi+r10*8], xmm2         # fz[i] <- xmm2
    inc     r10                                 # i++
    cmp     r10, QWORD PTR [-32+rsp]            # i < atom->Nlocal ? (spill)
    jb      ..atom_loop_begin
# Function epilogue; also the target of the Nlocal <= 0 early exit
..function_exit:
    vzeroupper                                  # leave AVX state clean for the caller
    vxorpd  xmm0, xmm0, xmm0                    # return value <- 0.0
    #call getTimeStamp # xmm0 <- getTimeStamp()
    #vsubsd xmm0, xmm0, QWORD PTR [-56+rsp] # xmm0 <- E-S
    pop     rbx
    pop     r15
    pop     r14
    pop     r13
    pop     r12
    pop     rbp
    ret
.type computeForceLJ,@function
.size computeForceLJ,.-computeForceLJ
..LNcomputeForce.0:
.data
# -- End computeForceLJ
# Read-only constant pool for computeForceLJ. Every 64-bit double below is
# written as two little-endian .long words (low word first, high word second).
.section .rodata, "a"
.align 64
.align 64
# 8 x double 1.0 (0x3ff0000000000000); not referenced by computeForceLJ
.L_2il0floatpacket.2:
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
.type .L_2il0floatpacket.2,@object
.size .L_2il0floatpacket.2,64
.align 64
# 8 x double 0.5 (0x3fe0000000000000); loaded into zmm7 for the "- 0.5" term
.L_2il0floatpacket.4:
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
.type .L_2il0floatpacket.4,@object
.size .L_2il0floatpacket.4,64
.align 64
# 16 x dword index vector {8..15, 8..15}. Loaded by ..sum_up_forces as the
# vpermd control, which copies the upper 256 bits of a zmm register into both
# halves (first step of the horizontal force reduction).
.L_2il0floatpacket.6:
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
.type .L_2il0floatpacket.6,@object
.size .L_2il0floatpacket.6,64
.align 32
# 8 x dword 8; loaded into ymm18
.L_2il0floatpacket.0:
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
.type .L_2il0floatpacket.0,@object
.size .L_2il0floatpacket.0,32
.align 32
# 8 x dword lane numbers {0, 1, ..., 7}; loaded into ymm17 and compared
# against the remaining-neighbor count to build the remainder mask
.L_2il0floatpacket.1:
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
.type .L_2il0floatpacket.1,@object
.size .L_2il0floatpacket.1,32
.align 8
# scalar double 48.0 (0x4048000000000000); multiplied with epsilon
.L_2il0floatpacket.3:
.long 0x00000000,0x40480000
.type .L_2il0floatpacket.3,@object
.size .L_2il0floatpacket.3,8
.align 8
# scalar double 1.0 (0x3ff0000000000000); not referenced by computeForceLJ
.L_2il0floatpacket.5:
.long 0x00000000,0x3ff00000
.type .L_2il0floatpacket.5,@object
.size .L_2il0floatpacket.5,8
.data
.section .note.GNU-stack, ""
# End

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -21,6 +21,7 @@ typedef struct {
char* input_file;
char* vtk_file;
char* xtc_file;
char* write_atom_file;
MD_FLOAT epsilon;
MD_FLOAT sigma;
MD_FLOAT sigma6;

View File

@@ -48,11 +48,13 @@ static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_S
t2 = _mm256_permute2f128_pd(t0, t1, 0x21);
t0 = _mm256_add_pd(t0, t2);
t1 = _mm256_add_pd(t1, t2);
t0 = _mm256_blend_pd(t0, t1, 0b1100);
t0 = _mm256_blend_pd(t0, t1, 0xC);
//t0 = _mm256_blend_pd(t0, t1, 0b1100);
t1 = _mm256_add_pd(t0, _mm256_load_pd(m));
_mm256_store_pd(m, t1);
t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101));
t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0x5));
//t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101));
a0 = _mm256_castpd256_pd128(t0);
a1 = _mm256_extractf128_pd(t0, 0x1);
a0 = _mm_add_sd(a0, a1);
@@ -91,7 +93,7 @@ static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1,
}
// Functions used in LAMMPS kernel
static inline MD_SIMD_FLOAT simd_gather(MD_SIMD_INT vidx, const MD_FLOAT *m, int s) { return _mm256_i32gather_pd(m, vidx, s); }
// Gather doubles from m at the 32-bit indices in vidx, scaled by s bytes.
// Kept as a macro so that s reaches the intrinsic as a literal argument.
// No trailing semicolon: the expansion must be usable inside expressions.
#define simd_gather(vidx, m, s) _mm256_i32gather_pd((m), (vidx), (s))
static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm_set1_epi32(scalar); }
static inline MD_SIMD_INT simd_int_zero() { return _mm_setzero_si128(); }
static inline MD_SIMD_INT simd_int_seq() { return _mm_set_epi32(3, 2, 1, 0); }

View File

@@ -7,8 +7,8 @@
#ifndef __TIMING_H_
#define __TIMING_H_
extern double getTimeStamp();
extern double getTimeResolution();
extern double getTimeStamp_();
extern double getTimeStamp(void);
extern double getTimeResolution(void);
extern double getTimeStamp_(void);
#endif

View File

@@ -39,8 +39,8 @@ extern double myrandom(int*);
extern void random_reset(int *seed, int ibase, double *coord);
extern int str2ff(const char *string);
extern const char* ff2str(int ff);
extern int get_num_threads();
extern void readline(char *line, FILE *fp);
extern void debug_printf(const char *format, ...);
extern int get_cuda_num_threads();
#endif

View File

@@ -17,6 +17,7 @@ void initParameter(Parameter *param) {
param->vtk_file = NULL;
param->xtc_file = NULL;
param->eam_file = NULL;
param->write_atom_file = NULL;
param->force_field = FF_LJ;
param->epsilon = 1.0;
param->sigma = 1.0;
@@ -169,6 +170,11 @@ void printParameter(Parameter *param) {
printf("\tNumber of timesteps: %d\n", param->ntimes);
printf("\tReport stats every (timesteps): %d\n", param->nstat);
printf("\tReneighbor every (timesteps): %d\n", param->reneigh_every);
#ifdef SORT_ATOMS
printf("\tSort atoms when reneighboring: yes\n");
#else
printf("\tSort atoms when reneighboring: no\n");
#endif
printf("\tPrune every (timesteps): %d\n", param->prune_every);
printf("\tOutput positions every (timesteps): %d\n", param->x_out_every);
printf("\tOutput velocities every (timesteps): %d\n", param->v_out_every);

View File

@@ -79,7 +79,7 @@ const char* ff2str(int ff) {
return "invalid";
}
int get_num_threads() {
// Returns the thread count requested through the NUM_THREADS environment
// variable. Falls back to 32 when the variable is unset, does not start with
// a decimal number, is not positive, or does not fit in an int.
int get_cuda_num_threads() {
    const int default_num_threads = 32;
    const char *num_threads_env = getenv("NUM_THREADS");
    if(num_threads_env == NULL) {
        return default_num_threads;
    }

    // strtol (unlike atoi) reports "no digits" and has defined overflow behavior
    char *end = NULL;
    long value = strtol(num_threads_env, &end, 10);
    if(end == num_threads_env || value <= 0 || value != (long)(int) value) {
        return default_num_threads;
    }

    return (int) value;
}

View File

@@ -3,7 +3,7 @@ TAG ?= ICC
# Instruction set (SSE/AVX/AVX_FMA/AVX2/AVX512)
ISA ?= AVX512
# Optimization scheme (lammps/gromacs/clusters_per_bin)
OPT_SCHEME ?= gromacs
OPT_SCHEME ?= lammps
# Enable likwid (true or false)
ENABLE_LIKWID ?= true
# SP or DP
@@ -15,6 +15,8 @@ ASM_SYNTAX ?= ATT
# Debug
DEBUG ?= false
# Sort atoms when reneighboring (true or false)
SORT_ATOMS ?= true
# Explicitly store and load atom types (true or false)
EXPLICIT_TYPES ?= false
# Trace memory addresses for cache simulator (true or false)
@@ -36,7 +38,7 @@ USE_REFERENCE_VERSION ?= false
# Enable XTC output
XTC_OUTPUT ?= false
# Check if cj is local when decreasing reaction force
HALF_NEIGHBOR_LISTS_CHECK_CJ ?= false
HALF_NEIGHBOR_LISTS_CHECK_CJ ?= true
# Configurations for CUDA
# Use CUDA host memory to optimize transfers

View File

@@ -6,7 +6,7 @@ dt 0.001
temp 80
x_out_freq 500
v_out_freq 5
cutforce 0.9
skin 0.05
cutforce 1.8
skin 0.1
reneigh_every 100
nstat 125000

Submodule gather-bench deleted from 2f654cb043

View File

@@ -45,7 +45,7 @@ static inline void gmx_load_simd_4xn_interactions(
double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJ begin\n");
int Nlocal = atom->Nlocal;
NeighborCluster* neighs;
int *neighs;
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
@@ -66,7 +66,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
{
LIKWID_MARKER_START("force");
#pragma omp for
#pragma omp for schedule(runtime)
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_cj0 = CJ0_FROM_CI(ci);
int ci_cj1 = CJ1_FROM_CI(ci);
@@ -77,7 +77,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
int numneighs = neighbor->numneigh[ci];
for(int k = 0; k < numneighs; k++) {
int cj = neighs[k].cj;
int cj = neighs[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int any = 0;
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
@@ -158,7 +158,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
int Nlocal = atom->Nlocal;
NeighborCluster* neighs;
int *neighs;
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
@@ -213,7 +213,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
#endif
*/
#pragma omp for
#pragma omp for schedule(runtime)
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_cj0 = CJ0_FROM_CI(ci);
#if CLUSTER_M > CLUSTER_N
@@ -240,9 +240,9 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
MD_SIMD_FLOAT fiz2 = simd_zero();
for(int k = 0; k < numneighs_masked; k++) {
int cj = neighs[k].cj;
int cj = neighs[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
//int imask = neighs[k].imask;
//int imask = neighs_imask[k];
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
//MD_SIMD_MASK interact0;
@@ -331,7 +331,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
}
for(int k = numneighs_masked; k < numneighs; k++) {
int cj = neighs[k].cj;
int cj = neighs[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
@@ -401,7 +401,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
int Nlocal = atom->Nlocal;
NeighborCluster* neighs;
int *neighs;
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
@@ -427,7 +427,7 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
{
LIKWID_MARKER_START("force");
#pragma omp for
#pragma omp for schedule(runtime)
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_cj0 = CJ0_FROM_CI(ci);
#if CLUSTER_M > CLUSTER_N
@@ -454,9 +454,8 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
MD_SIMD_FLOAT fiz2 = simd_zero();
for(int k = 0; k < numneighs_masked; k++) {
int cj = neighs[k].cj;
int cj = neighs[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int imask = neighs[k].imask;
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
unsigned int mask0, mask1, mask2, mask3;
@@ -507,7 +506,7 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
}
for(int k = numneighs_masked; k < numneighs; k++) {
int cj = neighs[k].cj;
int cj = neighs[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
@@ -570,7 +569,7 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta
double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
int Nlocal = atom->Nlocal;
NeighborCluster* neighs;
int *neighs;
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
@@ -596,7 +595,7 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
{
LIKWID_MARKER_START("force");
#pragma omp for
#pragma omp for schedule(runtime)
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_cj0 = CJ0_FROM_CI(ci);
#if CLUSTER_M > CLUSTER_N
@@ -635,9 +634,8 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
MD_SIMD_FLOAT fiz3 = simd_zero();
for(int k = 0; k < numneighs_masked; k++) {
int cj = neighs[k].cj;
int cj = neighs[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int imask = neighs[k].imask;
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
@@ -741,9 +739,8 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
}
for(int k = numneighs_masked; k < numneighs; k++) {
int cj = neighs[k].cj;
int cj = neighs[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int imask = neighs[k].imask;
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
@@ -846,7 +843,7 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
int Nlocal = atom->Nlocal;
NeighborCluster* neighs;
int *neighs;
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
@@ -872,7 +869,7 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
{
LIKWID_MARKER_START("force");
#pragma omp for
#pragma omp for schedule(runtime)
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_cj0 = CJ0_FROM_CI(ci);
#if CLUSTER_M > CLUSTER_N
@@ -911,9 +908,8 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
MD_SIMD_FLOAT fiz3 = simd_zero();
for(int k = 0; k < numneighs_masked; k++) {
int cj = neighs[k].cj;
int cj = neighs[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int imask = neighs[k].imask;
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
@@ -991,9 +987,8 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
}
for(int k = numneighs_masked; k < numneighs; k++) {
int cj = neighs[k].cj;
int cj = neighs[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int imask = neighs[k].imask;
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);

View File

@@ -25,11 +25,6 @@
#define NBNXN_INTERACTION_MASK_DIAG_J8_0 0xf0f8fcfeU
#define NBNXN_INTERACTION_MASK_DIAG_J8_1 0x0080c0e0U
typedef struct {
int cj;
unsigned int imask;
} NeighborCluster;
typedef struct {
int every;
int ncalls;
@@ -37,7 +32,8 @@ typedef struct {
int* numneigh;
int* numneigh_masked;
int half_neigh;
NeighborCluster* neighbors;
int* neighbors;
unsigned int* neighbors_imask;
} Neighbor;
extern void initNeighbor(Neighbor*, Parameter*);

View File

@@ -60,18 +60,15 @@ void init(Parameter *param) {
param->eam_file = NULL;
}
// Show debug messages
#define DEBUG(msg) printf(msg)
// Do not show debug messages
//#define DEBUG(msg)
void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps) {
void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps, int masked) {
const int maxneighs = nneighs * nreps;
const int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
const int ncj = atom->Nclusters_local / jfac;
const unsigned int imask = NBNXN_INTERACTION_MASK_ALL;
neighbor->numneigh = (int*) malloc(atom->Nclusters_max * sizeof(int));
neighbor->numneigh_masked = (int*) malloc(atom->Nclusters_max * sizeof(int));
neighbor->neighbors = (int*) malloc(atom->Nclusters_max * maxneighs * sizeof(int));
neighbor->neighbors_imask = (unsigned int*) malloc(atom->Nclusters_max * maxneighs * sizeof(unsigned int));
if(pattern == P_RAND && ncj <= nneighs) {
fprintf(stderr, "Error: P_RAND: Number of j-clusters should be higher than number of j-cluster neighbors per i-cluster!\n");
@@ -80,6 +77,7 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
unsigned int *neighptr_imask = &(neighbor->neighbors_imask[ci * neighbor->maxneighs]);
int j = (pattern == P_SEQ) ? CJ0_FROM_CI(ci) : 0;
int m = (pattern == P_SEQ) ? ncj : nneighs;
int k = 0;
@@ -90,6 +88,7 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
do {
int cj = rand() % ncj;
neighptr[k] = cj;
neighptr_imask[k] = imask;
found = 0;
for(int l = 0; l < k; l++) {
if(neighptr[l] == cj) {
@@ -99,6 +98,7 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
} while(found == 1);
} else {
neighptr[k] = j;
neighptr_imask[k] = imask;
j = (j + 1) % m;
}
}
@@ -106,10 +106,12 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
for(int r = 1; r < nreps; r++) {
for(int k = 0; k < nneighs; k++) {
neighptr[r * nneighs + k] = neighptr[k];
neighptr_imask[r * nneighs + k] = neighptr_imask[k];
}
}
neighbor->numneigh[ci] = nneighs * nreps;
neighbor->numneigh_masked[ci] = (masked == 1) ? (nneighs * nreps) : 0;
}
}
@@ -125,12 +127,13 @@ int main(int argc, const char *argv[]) {
int niclusters = 256; // Number of local i-clusters
int iclusters_natoms = CLUSTER_M; // Number of valid atoms within i-clusters
int nneighs = 9; // Number of j-cluster neighbors per i-cluster
int masked = 0; // Use masked loop
int nreps = 1;
int csv = 0;
LIKWID_MARKER_INIT;
LIKWID_MARKER_REGISTER("force");
DEBUG("Initializing parameters...\n");
DEBUG_MESSAGE("Initializing parameters...\n");
init(&param);
for(int i = 0; i < argc; i++) {
@@ -156,6 +159,10 @@ int main(int argc, const char *argv[]) {
param.eam_file = strdup(argv[++i]);
continue;
}
if((strcmp(argv[i], "-m") == 0)) {
masked = 1;
continue;
}
if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0)) {
param.ntimes = atoi(argv[++i]);
continue;
@@ -206,11 +213,11 @@ int main(int argc, const char *argv[]) {
}
if(param.force_field == FF_EAM) {
DEBUG("Initializing EAM parameters...\n");
DEBUG_MESSAGE("Initializing EAM parameters...\n");
initEam(&eam, &param);
}
DEBUG("Initializing atoms...\n");
DEBUG_MESSAGE("Initializing atoms...\n");
initAtom(atom);
initStats(&stats);
@@ -226,7 +233,7 @@ int main(int argc, const char *argv[]) {
atom->cutforcesq[i] = param.cutforce * param.cutforce;
}
DEBUG("Creating atoms...\n");
DEBUG_MESSAGE("Creating atoms...\n");
while(atom->Nmax < niclusters * iclusters_natoms) {
growAtom(atom);
}
@@ -281,13 +288,13 @@ int main(int argc, const char *argv[]) {
printf("Estimated neighborlist data volume (kB): %.4f\n", estim_neighbors_volume / 1000.0);
}
DEBUG("Defining j-clusters...\n");
DEBUG_MESSAGE("Defining j-clusters...\n");
defineJClusters(atom);
DEBUG("Initializing neighbor lists...\n");
DEBUG_MESSAGE("Initializing neighbor lists...\n");
initNeighbor(&neighbor, &param);
DEBUG("Creating neighbor lists...\n");
createNeighbors(atom, &neighbor, pattern, nneighs, nreps);
DEBUG("Computing forces...\n");
DEBUG_MESSAGE("Creating neighbor lists...\n");
createNeighbors(atom, &neighbor, pattern, nneighs, nreps, masked);
DEBUG_MESSAGE("Computing forces...\n");
double T_accum = 0.0;
for(int i = 0; i < param.ntimes; i++) {

View File

@@ -5,7 +5,9 @@
* license that can be found in the LICENSE file.
*/
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <omp.h>
//--
#include <likwid-marker.h>
//--
@@ -117,7 +119,7 @@ int main(int argc, char** argv) {
initParameter(&param);
for(int i = 0; i < argc; i++) {
if((strcmp(argv[i], "-p") == 0)) {
if((strcmp(argv[i], "-p") == 0) || (strcmp(argv[i], "--param") == 0)) {
readParameter(&param, argv[++i]);
continue;
}
@@ -308,6 +310,30 @@ int main(int argc, char** argv) {
printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]);
printf(HLINE);
int nthreads = 0;
int chunkSize = 0;
omp_sched_t schedKind;
char schedType[10];
#pragma omp parallel
#pragma omp master
{
omp_get_schedule(&schedKind, &chunkSize);
switch (schedKind)
{
case omp_sched_static: strcpy(schedType, "static"); break;
case omp_sched_dynamic: strcpy(schedType, "dynamic"); break;
case omp_sched_guided: strcpy(schedType, "guided"); break;
case omp_sched_auto: strcpy(schedType, "auto"); break;
}
nthreads = omp_get_max_threads();
}
printf("Num threads: %d\n", nthreads);
printf("Schedule: (%s,%d)\n", schedType, chunkSize);
printf("Performance: %.2f million atom updates per second\n",
1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
#ifdef COMPUTE_STATS

View File

@@ -58,6 +58,7 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) {
neighbor->numneigh = NULL;
neighbor->numneigh_masked = NULL;
neighbor->neighbors = NULL;
neighbor->neighbors_imask = NULL;
}
void setupNeighbor(Parameter *param, Atom *atom) {
@@ -229,10 +230,13 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
if(atom->Nclusters_local > nmax) {
nmax = atom->Nclusters_local;
if(neighbor->numneigh) free(neighbor->numneigh);
if(neighbor->numneigh_masked) free(neighbor->numneigh_masked);
if(neighbor->neighbors) free(neighbor->neighbors);
if(neighbor->neighbors_imask) free(neighbor->neighbors_imask);
neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
neighbor->numneigh_masked = (int*) malloc(nmax * sizeof(int));
neighbor->neighbors = (NeighborCluster*) malloc(nmax * neighbor->maxneighs * sizeof(NeighborCluster));
neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int));
neighbor->neighbors_imask = (unsigned int*) malloc(nmax * neighbor->maxneighs * sizeof(unsigned int));
}
MD_FLOAT bbx = 0.5 * (binsizex + binsizex);
@@ -248,7 +252,8 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_cj1 = CJ1_FROM_CI(ci);
NeighborCluster *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
unsigned int *neighptr_imask = &(neighbor->neighbors_imask[ci * neighbor->maxneighs]);
int n = 0, nmasked = 0;
int ibin = atom->icluster_bin[ci];
MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx;
@@ -324,15 +329,17 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
imask = get_imask_simd_4xn(1, ci, cj);
#endif
if(imask == NBNXN_INTERACTION_MASK_ALL) {
neighptr[n].cj = cj;
neighptr[n].imask = imask;
} else {
neighptr[n].cj = neighptr[nmasked].cj;
neighptr[n].imask = neighptr[nmasked].imask;
neighptr[nmasked].cj = cj;
neighptr[nmasked].imask = imask;
nmasked++;
if(n < neighbor->maxneighs) {
if(imask == NBNXN_INTERACTION_MASK_ALL) {
neighptr[n] = cj;
neighptr_imask[n] = imask;
} else {
neighptr[n] = neighptr[nmasked];
neighptr_imask[n] = neighptr_imask[nmasked];
neighptr[nmasked] = cj;
neighptr_imask[nmasked] = imask;
nmasked++;
}
}
n++;
@@ -357,8 +364,8 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
// Fill neighbor list with dummy values to fit vector width
if(CLUSTER_N < VECTOR_WIDTH) {
while(n % (VECTOR_WIDTH / CLUSTER_N)) {
neighptr[n].cj = atom->dummy_cj; // Last cluster is always a dummy cluster
neighptr[n].imask = 0;
neighptr[n] = atom->dummy_cj; // Last cluster is always a dummy cluster
neighptr_imask[n] = 0;
n++;
}
}
@@ -375,10 +382,12 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
}
if(resize) {
fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
neighbor->maxneighs = new_maxneighs * 1.2;
fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
free(neighbor->neighbors);
neighbor->neighbors = (NeighborCluster*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
free(neighbor->neighbors_imask);
neighbor->neighbors = (int *) malloc(nmax * neighbor->maxneighs * sizeof(int));
neighbor->neighbors_imask = (unsigned int *) malloc(nmax * neighbor->maxneighs * sizeof(unsigned int));
}
}
@@ -433,20 +442,21 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
MD_FLOAT cutsq = cutneighsq;
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
NeighborCluster *neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
int *neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
unsigned int *neighs_imask = &neighbor->neighbors_imask[ci * neighbor->maxneighs];
int numneighs = neighbor->numneigh[ci];
int numneighs_masked = neighbor->numneigh_masked[ci];
int k = 0;
// Remove dummy clusters if necessary
if(CLUSTER_N < VECTOR_WIDTH) {
while(neighs[numneighs - 1].cj == atom->dummy_cj) {
while(neighs[numneighs - 1] == atom->dummy_cj) {
numneighs--;
}
}
while(k < numneighs) {
int cj = neighs[k].cj;
int cj = neighs[k];
if(atomDistanceInRange(atom, ci, cj, cutsq)) {
k++;
} else {
@@ -461,8 +471,8 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
// Readd dummy clusters if necessary
if(CLUSTER_N < VECTOR_WIDTH) {
while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) {
neighs[numneighs].cj = atom->dummy_cj; // Last cluster is always a dummy cluster
neighs[numneighs].imask = 0;
neighs[numneighs] = atom->dummy_cj; // Last cluster is always a dummy cluster
neighs_imask[numneighs] = 0;
numneighs++;
}
}

View File

@@ -13,7 +13,8 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
MEM_TRACER_INIT;
INDEX_TRACER_INIT;
int Nlocal = atom->Nlocal;
NeighborCluster* neighs;
int *neighs;
unsigned int *neighs_imask;
//MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
@@ -34,7 +35,7 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
DIST_TRACE(neighs, numneighs);
for(int k = 0; k < numneighs; k++) {
int j = neighs[k].cj;
int j = neighs[k];
MEM_TRACE(j, 'R');
MEM_TRACE(atom_x(j), 'R');
MEM_TRACE(atom_y(j), 'R');

View File

@@ -1,7 +1,7 @@
CC = icc
LINKER = $(CC)
OPENMP = #-qopenmp
OPENMP = -qopenmp
PROFILE = #-profile-functions -g -pg
ifeq ($(ISA),AVX512)

View File

@@ -502,6 +502,21 @@ int readAtom_in(Atom* atom, Parameter* param) {
return natoms;
}
/*
 * Write the local atoms to the file named by param->write_atom_file.
 *
 * One comma-separated record is emitted per local atom (i in [0, Nlocal)):
 *     type,1.0,x,y,z,vx,vy,vz,0
 * The second field is always the literal 1.0 and the last field is always
 * the literal 0.
 * NOTE(review): the layout presumably mirrors what readAtom_in() parses
 * (the success message calls it "input data") -- confirm against the reader.
 *
 * On success a summary line with the file name and the box extents
 * (param->xprd, param->yprd, param->zprd) is printed to stdout.
 * If the file cannot be opened, or closing it reports an error, a message
 * is printed to stderr and the function returns without the success line.
 */
void writeAtom(Atom *atom, Parameter *param) {
    FILE *fp = fopen(param->write_atom_file, "w");

    // Without this check an unwritable path would make the fprintf() calls
    // below dereference a NULL stream.
    if(fp == NULL) {
        fprintf(stderr, "Could not open %s for writing atom data\n",
            param->write_atom_file);
        return;
    }

    for(int i = 0; i < atom->Nlocal; i++) {
        fprintf(fp, "%d,%f,%f,%f,%f,%f,%f,%f,0\n",
            atom->type[i], 1.0,
            atom_x(i), atom_y(i), atom_z(i),
            atom_vx(i), atom_vy(i), atom_vz(i));
    }

    // Buffered output is flushed on close, so a failure here means the file
    // may be incomplete; do not report success in that case.
    if(fclose(fp) != 0) {
        fprintf(stderr, "Error while closing %s, atom data may be incomplete\n",
            param->write_atom_file);
        return;
    }

    fprintf(stdout, "Wrote input data to %s, grid size: %f, %f, %f\n",
        param->write_atom_file, param->xprd, param->yprd, param->zprd);
}
void growAtom(Atom *atom) {
DeviceAtom *d_atom = &(atom->d_atom);
int nold = atom->Nmax;

View File

@@ -29,7 +29,7 @@ extern "C" {
}
// cuda kernel
__global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh) {
__global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh, int ntypes) {
const int i = blockIdx.x * blockDim.x + threadIdx.x;
if(i >= Nlocal) {
return;
@@ -46,6 +46,10 @@ __global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, M
MD_FLOAT fiy = 0;
MD_FLOAT fiz = 0;
#ifdef EXPLICIT_TYPES
const int type_i = atom->type[i];
#endif
for(int k = 0; k < numneighs; k++) {
int j = neigh_neighbors[Nlocal * k + i];
MD_FLOAT delx = xtmp - atom_x(j);
@@ -55,7 +59,7 @@ __global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, M
#ifdef EXPLICIT_TYPES
const int type_j = atom->type[j];
const int type_ij = type_i * atom->ntypes + type_j;
const int type_ij = type_i * ntypes + type_j;
const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
const MD_FLOAT sigma6 = atom->sigma6[type_ij];
const MD_FLOAT epsilon = atom->epsilon[type_ij];
@@ -109,7 +113,7 @@ extern "C" {
void finalIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
const int Nlocal = atom->Nlocal;
const int num_threads_per_block = get_num_threads();
const int num_threads_per_block = get_cuda_num_threads();
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, atom->d_atom);
@@ -123,7 +127,7 @@ void finalIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
const int Nlocal = atom->Nlocal;
const int num_threads_per_block = get_num_threads();
const int num_threads_per_block = get_cuda_num_threads();
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, atom->d_atom);
@@ -136,13 +140,11 @@ void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
}
double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neighbor) {
const int num_threads_per_block = get_num_threads();
const int num_threads_per_block = get_cuda_num_threads();
int Nlocal = atom->Nlocal;
#ifndef EXPLICIT_TYPES
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
#endif
/*
int nDevices;
@@ -165,7 +167,7 @@ double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neig
double S = getTimeStamp();
LIKWID_MARKER_START("force");
calc_force <<< num_blocks, num_threads_per_block >>> (atom->d_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, neighbor->d_neighbor.neighbors, neighbor->d_neighbor.numneigh);
calc_force <<< num_blocks, num_threads_per_block >>> (atom->d_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, neighbor->d_neighbor.neighbors, neighbor->d_neighbor.numneigh, atom->ntypes);
cuda_assert("calc_force", cudaPeekAtLastError());
cuda_assert("calc_force", cudaDeviceSynchronize());
cudaProfilerStop();

View File

@@ -120,7 +120,7 @@ __global__ void binatoms_kernel(DeviceAtom a, int nall, int* bincount, int* bins
__global__ void compute_neighborhood(
DeviceAtom a, DeviceNeighbor neigh, Neighbor_params np, int nlocal, int maxneighs, int nstencil, int* stencil,
int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq) {
int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq, int ntypes) {
const int i = blockIdx.x * blockDim.x + threadIdx.x;
if(i >= nlocal) {
@@ -157,7 +157,7 @@ __global__ void compute_neighborhood(
#ifdef EXPLICIT_TYPES
int type_j = atom->type[j];
const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j];
const MD_FLOAT cutoff = atom->cutneighsq[type_i * ntypes + type_j];
#else
const MD_FLOAT cutoff = cutneighsq;
#endif
@@ -206,7 +206,7 @@ void binatoms_cuda(Atom *atom, Binning *c_binning, int *c_resize_needed, Neighbo
void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor) {
DeviceNeighbor *d_neighbor = &(neighbor->d_neighbor);
const int num_threads_per_block = get_num_threads();
const int num_threads_per_block = get_cuda_num_threads();
int nall = atom->Nlocal + atom->Nghost;
cudaProfilerStart();
@@ -269,7 +269,7 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor) {
np, atom->Nlocal, neighbor->maxneighs, nstencil, c_stencil,
c_binning.bins, c_binning.atoms_per_bin, c_binning.bincount,
c_new_maxneighs,
cutneighsq);
cutneighsq, atom->ntypes);
cuda_assert("compute_neighborhood", cudaPeekAtLastError());
cuda_assert("compute_neighborhood", cudaDeviceSynchronize());

View File

@@ -65,7 +65,7 @@ __global__ void computePbcUpdate(DeviceAtom a, int nlocal, int nghost, int* PBCx
/* update coordinates of ghost atoms */
/* uses mapping created in setupPbc */
void updatePbc_cuda(Atom *atom, Parameter *param, bool reneigh) {
const int num_threads_per_block = get_num_threads();
const int num_threads_per_block = get_cuda_num_threads();
if(reneigh) {
memcpyToGPU(atom->d_atom.x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
@@ -98,7 +98,7 @@ void updatePbc_cuda(Atom *atom, Parameter *param, bool reneigh) {
}
void updateAtomsPbc_cuda(Atom* atom, Parameter *param) {
const int num_threads_per_block = get_num_threads();
const int num_threads_per_block = get_cuda_num_threads();
MD_FLOAT xprd = param->xprd;
MD_FLOAT yprd = param->yprd;
MD_FLOAT zprd = param->zprd;

View File

@@ -14,6 +14,7 @@ void initDevice(Atom *atom, Neighbor *neighbor) {
d_atom->epsilon = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
d_atom->sigma6 = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
d_atom->cutneighsq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
d_atom->cutforcesq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
d_neighbor->neighbors = (int *) allocateGPU(sizeof(int) * atom->Nmax * neighbor->maxneighs);
d_neighbor->numneigh = (int *) allocateGPU(sizeof(int) * atom->Nmax);
@@ -22,6 +23,7 @@ void initDevice(Atom *atom, Neighbor *neighbor) {
memcpyToGPU(d_atom->vx, atom->vx, sizeof(MD_FLOAT) * atom->Nmax * 3);
memcpyToGPU(d_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
memcpyToGPU(d_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
memcpyToGPU(d_atom->cutneighsq, atom->cutneighsq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
memcpyToGPU(d_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
memcpyToGPU(d_atom->type, atom->type, sizeof(int) * atom->Nmax);
}

View File

@@ -41,7 +41,7 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
{
LIKWID_MARKER_START("force");
#pragma omp for
#pragma omp for schedule(runtime)
for(int i = 0; i < Nlocal; i++) {
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
int numneighs = neighbor->numneigh[i];
@@ -90,6 +90,12 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
atom_fy(i) += fiy;
atom_fz(i) += fiz;
#ifdef USE_REFERENCE_VERSION
if(numneighs % VECTOR_WIDTH > 0) {
addStat(stats->atoms_outside_cutoff, VECTOR_WIDTH - (numneighs % VECTOR_WIDTH));
}
#endif
addStat(stats->total_force_neighs, numneighs);
addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
}
@@ -125,7 +131,7 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
{
LIKWID_MARKER_START("forceLJ-halfneigh");
#pragma omp for
#pragma omp for schedule(runtime)
for(int i = 0; i < Nlocal; i++) {
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
int numneighs = neighbor->numneigh[i];
@@ -221,7 +227,7 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig
{
LIKWID_MARKER_START("force");
#pragma omp for
#pragma omp for schedule(runtime)
for(int i = 0; i < Nlocal; i++) {
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
int numneighs = neighbor->numneigh[i];

View File

@@ -73,6 +73,7 @@ extern int readAtom_pdb(Atom*, Parameter*);
extern int readAtom_gro(Atom*, Parameter*);
extern int readAtom_dmp(Atom*, Parameter*);
extern int readAtom_in(Atom*, Parameter*);
extern void writeAtom(Atom*, Parameter*);
extern void growAtom(Atom*);
#ifdef AOS

View File

@@ -59,12 +59,6 @@ void init(Parameter *param) {
param->eam_file = NULL;
}
// Show debug messages
#define DEBUG(msg) printf(msg)
// Do not show debug messages
//#define DEBUG(msg)
void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps) {
const int maxneighs = nneighs * nreps;
neighbor->numneigh = (int*) malloc(atom->Nmax * sizeof(int));
@@ -125,7 +119,7 @@ int main(int argc, const char *argv[]) {
LIKWID_MARKER_INIT;
LIKWID_MARKER_REGISTER("force");
DEBUG("Initializing parameters...\n");
DEBUG_MESSAGE("Initializing parameters...\n");
init(&param);
for(int i = 0; i < argc; i++) {
@@ -196,11 +190,11 @@ int main(int argc, const char *argv[]) {
}
if(param.force_field == FF_EAM) {
DEBUG("Initializing EAM parameters...\n");
DEBUG_MESSAGE("Initializing EAM parameters...\n");
initEam(&eam, &param);
}
DEBUG("Initializing atoms...\n");
DEBUG_MESSAGE("Initializing atoms...\n");
initAtom(atom);
initStats(&stats);
@@ -216,7 +210,7 @@ int main(int argc, const char *argv[]) {
atom->cutforcesq[i] = param.cutforce * param.cutforce;
}
DEBUG("Creating atoms...\n");
DEBUG_MESSAGE("Creating atoms...\n");
for(int i = 0; i < natoms; ++i) {
while(atom->Nlocal > atom->Nmax - natoms) {
growAtom(atom);
@@ -247,11 +241,11 @@ int main(int argc, const char *argv[]) {
printf("Estimated neighborlist data volume (kB): %.4f\n", estim_neighbors_volume / 1000.0);
}
DEBUG("Initializing neighbor lists...\n");
DEBUG_MESSAGE("Initializing neighbor lists...\n");
initNeighbor(&neighbor, &param);
DEBUG("Creating neighbor lists...\n");
DEBUG_MESSAGE("Creating neighbor lists...\n");
createNeighbors(atom, &neighbor, pattern, nneighs, nreps);
DEBUG("Computing forces...\n");
DEBUG_MESSAGE("Computing forces...\n");
double T_accum = 0.0;
for(int i = 0; i < param.ntimes; i++) {

View File

@@ -11,6 +11,7 @@
#include <limits.h>
#include <math.h>
#include <float.h>
#include <omp.h>
#include <likwid-marker.h>
@@ -63,6 +64,10 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *
setupNeighbor(param);
setupThermo(param, atom->Natoms);
if(param->input_file == NULL) { adjustThermo(param, atom); }
#ifdef SORT_ATOMS
atom->Nghost = 0;
sortAtom(atom);
#endif
setupPbc(atom, param);
initDevice(atom, neighbor);
updatePbc(atom, param, true);
@@ -76,9 +81,12 @@ double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
S = getTimeStamp();
LIKWID_MARKER_START("reneighbour");
updateAtomsPbc(atom, param);
#ifdef SORT_ATOMS
atom->Nghost = 0;
sortAtom(atom);
#endif
setupPbc(atom, param);
updatePbc(atom, param, true);
//sortAtom(atom);
buildNeighbor(atom, neighbor);
LIKWID_MARKER_STOP("reneighbour");
E = getTimeStamp();
@@ -145,7 +153,7 @@ int main(int argc, char** argv) {
initParameter(&param);
for(int i = 0; i < argc; i++) {
if((strcmp(argv[i], "-p") == 0)) {
if((strcmp(argv[i], "-p") == 0) || strcmp(argv[i], "--params") == 0) {
readParameter(&param, argv[++i]);
continue;
}
@@ -200,19 +208,25 @@ int main(int argc, char** argv) {
param.vtk_file = strdup(argv[++i]);
continue;
}
if((strcmp(argv[i], "-w") == 0)) {
param.write_atom_file = strdup(argv[++i]);
continue;
}
if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
printf("MD Bench: A minimalistic re-implementation of miniMD\n");
printf(HLINE);
printf("-p <string>: file to read parameters from (can be specified more than once)\n");
printf("-f <string>: force field (lj, eam or dem), default lj\n");
printf("-i <string>: input file with atom positions (dump)\n");
printf("-e <string>: input file for EAM\n");
printf("-n / --nsteps <int>: set number of timesteps for simulation\n");
printf("-nx/-ny/-nz <int>: set linear dimension of systembox in x/y/z direction\n");
printf("-r / --radius <real>: set cutoff radius\n");
printf("-s / --skin <real>: set skin (verlet buffer)\n");
printf("--freq <real>: processor frequency (GHz)\n");
printf("--vtk <string>: VTK file for visualization\n");
printf("-p / --params <string>: file to read parameters from (can be specified more than once)\n");
printf("-f <string>: force field (lj, eam or dem), default lj\n");
printf("-i <string>: input file with atom positions (dump)\n");
printf("-e <string>: input file for EAM\n");
printf("-n / --nsteps <int>: set number of timesteps for simulation\n");
printf("-nx/-ny/-nz <int>: set linear dimension of systembox in x/y/z direction\n");
printf("-half <int>: use half (1) or full (0) neighbor lists\n");
printf("-r / --radius <real>: set cutoff radius\n");
printf("-s / --skin <real>: set skin (verlet buffer)\n");
printf("-w <file>: write input atoms to file\n");
printf("--freq <real>: processor frequency (GHz)\n");
printf("--vtk <string>: VTK file for visualization\n");
printf(HLINE);
exit(EXIT_SUCCESS);
}
@@ -229,6 +243,10 @@ int main(int argc, char** argv) {
traceAddresses(&param, &atom, &neighbor, n + 1);
#endif
if(param.write_atom_file != NULL) {
writeAtom(&atom, &param);
}
//writeInput(&param, &atom);
timer[FORCE] = computeForce(&eam, &param, &atom, &neighbor, &stats);
@@ -275,6 +293,30 @@ int main(int argc, char** argv) {
printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]);
printf(HLINE);
int nthreads = 0;
int chunkSize = 0;
omp_sched_t schedKind;
char schedType[10];
#pragma omp parallel
#pragma omp master
{
omp_get_schedule(&schedKind, &chunkSize);
switch (schedKind)
{
case omp_sched_static: strcpy(schedType, "static"); break;
case omp_sched_dynamic: strcpy(schedType, "dynamic"); break;
case omp_sched_guided: strcpy(schedType, "guided"); break;
case omp_sched_auto: strcpy(schedType, "auto"); break;
}
nthreads = omp_get_max_threads();
}
printf("Num threads: %d\n", nthreads);
printf("Schedule: (%s,%d)\n", schedType, chunkSize);
printf("Performance: %.2f million atom updates per second\n",
1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
#ifdef COMPUTE_STATS

View File

@@ -326,45 +326,45 @@ void sortAtom(Atom* atom) {
int Nmax = atom->Nmax;
int* binpos = bincount;
for(int i=1; i<mbins; i++) {
binpos[i] += binpos[i-1];
for(int i = 1; i < mbins; i++) {
binpos[i] += binpos[i - 1];
}
#ifdef AOS
#ifdef AOS
MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
#else
#else
MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
MD_FLOAT* new_y = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
MD_FLOAT* new_z = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
MD_FLOAT* new_vy = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
MD_FLOAT* new_vz = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
#endif
#endif
MD_FLOAT* old_x = atom->x; MD_FLOAT* old_y = atom->y; MD_FLOAT* old_z = atom->z;
MD_FLOAT* old_vx = atom->vx; MD_FLOAT* old_vy = atom->vy; MD_FLOAT* old_vz = atom->vz;
for(int mybin = 0; mybin<mbins; mybin++) {
int start = mybin>0?binpos[mybin-1]:0;
for(int mybin = 0; mybin < mbins; mybin++) {
int start = mybin > 0 ? binpos[mybin - 1] : 0;
int count = binpos[mybin] - start;
for(int k=0; k<count; k++) {
for(int k = 0; k < count; k++) {
int new_i = start + k;
int old_i = bins[mybin * atoms_per_bin + k];
#ifdef AOS
#ifdef AOS
new_x[new_i * 3 + 0] = old_x[old_i * 3 + 0];
new_x[new_i * 3 + 1] = old_x[old_i * 3 + 1];
new_x[new_i * 3 + 2] = old_x[old_i * 3 + 2];
new_vx[new_i * 3 + 0] = old_vx[old_i * 3 + 0];
new_vx[new_i * 3 + 1] = old_vx[old_i * 3 + 1];
new_vx[new_i * 3 + 2] = old_vx[old_i * 3 + 2];
#else
#else
new_x[new_i] = old_x[old_i];
new_y[new_i] = old_y[old_i];
new_z[new_i] = old_z[old_i];
new_vx[new_i] = old_vx[old_i];
new_vy[new_i] = old_vy[old_i];
new_vz[new_i] = old_vz[old_i];
#endif
#endif
}
}
@@ -372,7 +372,7 @@ void sortAtom(Atom* atom) {
free(atom->vx);
atom->x = new_x;
atom->vx = new_vx;
#ifndef AOS
#ifndef AOS
free(atom->y);
free(atom->z);
free(atom->vy);
@@ -381,5 +381,5 @@ void sortAtom(Atom* atom) {
atom->z = new_z;
atom->vy = new_vy;
atom->vz = new_vz;
#endif
#endif
}

View File

@@ -125,7 +125,7 @@ void setupPbc(Atom *atom, Parameter *param) {
if(param->pbc_x != 0 && param->pbc_y != 0 && param->pbc_z != 0) {
if (x < Cutneigh && y < Cutneigh && z < Cutneigh) { ADDGHOST(+1,+1,+1); }
if (x < Cutneigh && y >= (yprd-Cutneigh) && z < Cutneigh) { ADDGHOST(+1,-1,+1); }
if (x < Cutneigh && y >= Cutneigh && z >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); }
if (x < Cutneigh && y < Cutneigh && z >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); }
if (x < Cutneigh && y >= (yprd-Cutneigh) && z >= (zprd-Cutneigh)) { ADDGHOST(+1,-1,-1); }
if (x >= (xprd-Cutneigh) && y < Cutneigh && z < Cutneigh) { ADDGHOST(-1,+1,+1); }
if (x >= (xprd-Cutneigh) && y >= (yprd-Cutneigh) && z < Cutneigh) { ADDGHOST(-1,-1,+1); }

View File

@@ -1,88 +0,0 @@
--------------------------------------------------------------------------------
CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
CPU type: Intel Cascadelake SP processor
CPU clock: 2.49 GHz
--------------------------------------------------------------------------------
Initializing parameters...
Initializing atoms...
Creating atoms...
Pattern: seq
Number of timesteps: 200
Number of atoms: 256
Number of neighbors per atom: 1024
Number of times to replicate neighbor lists: 1
Estimated total data volume (kB): 1062.9120
Estimated atom data volume (kB): 6.1440
Estimated neighborlist data volume (kB): 1050.6240
Initializing neighbor lists...
Creating neighbor lists...
Computing forces...
Total time: 0.2735, Mega atom updates/s: 0.1872
Cycles per atom: 10682.8568, Cycles per neighbor: 10.4325
Statistics:
Vector width: 8, Processor frequency: 2.0000 GHz
Average neighbors per atom: 1018.9055
Average SIMD iterations per atom: 127.3632
Total number of computed pair interactions: 52428800
Total number of SIMD iterations: 6553600
Useful read data volume for force computation: 1.47GB
Cycles/SIMD iteration: 83.4598
--------------------------------------------------------------------------------
Region force, Group 1: MEM_DP
+-------------------+------------+
| Region Info | HWThread 0 |
+-------------------+------------+
| RDTSC Runtime [s] | 0.110776 |
| call count | 200 |
+-------------------+------------+
+------------------------------------------+---------+------------+
| Event | Counter | HWThread 0 |
+------------------------------------------+---------+------------+
| INSTR_RETIRED_ANY | FIXC0 | 267036300 |
| CPU_CLK_UNHALTED_CORE | FIXC1 | 219034500 |
| CPU_CLK_UNHALTED_REF | FIXC2 | 273793400 |
| PWR_PKG_ENERGY | PWR0 | 10.9296 |
| PWR_DRAM_ENERGY | PWR3 | 0 |
| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | PMC0 | 0 |
| FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | PMC1 | 159400 |
| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | PMC2 | 0 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 197068800 |
| CAS_COUNT_RD | MBOX0C0 | 8643 |
| CAS_COUNT_WR | MBOX0C1 | 1367 |
| CAS_COUNT_RD | MBOX1C0 | 9124 |
| CAS_COUNT_WR | MBOX1C1 | 1354 |
| CAS_COUNT_RD | MBOX2C0 | 9138 |
| CAS_COUNT_WR | MBOX2C1 | 1356 |
| CAS_COUNT_RD | MBOX3C0 | 5586 |
| CAS_COUNT_WR | MBOX3C1 | 1297 |
| CAS_COUNT_RD | MBOX4C0 | 5328 |
| CAS_COUNT_WR | MBOX4C1 | 1269 |
| CAS_COUNT_RD | MBOX5C0 | 5280 |
| CAS_COUNT_WR | MBOX5C1 | 1295 |
+------------------------------------------+---------+------------+
+-----------------------------------+------------+
| Metric | HWThread 0 |
+-----------------------------------+------------+
| Runtime (RDTSC) [s] | 0.1108 |
| Runtime unhalted [s] | 0.0878 |
| Clock [MHz] | 1995.2564 |
| CPI | 0.8202 |
| Energy [J] | 10.9296 |
| Power [W] | 98.6643 |
| Energy DRAM [J] | 0 |
| Power DRAM [W] | 0 |
| DP [MFLOP/s] | 14233.3287 |
| AVX DP [MFLOP/s] | 14231.8898 |
| Packed [MUOPS/s] | 1778.9862 |
| Scalar [MUOPS/s] | 1.4389 |
| Memory read bandwidth [MBytes/s] | 24.9001 |
| Memory read data volume [GBytes] | 0.0028 |
| Memory write bandwidth [MBytes/s] | 4.5861 |
| Memory write data volume [GBytes] | 0.0005 |
| Memory bandwidth [MBytes/s] | 29.4863 |
| Memory data volume [GBytes] | 0.0033 |
| Operational intensity | 482.7104 |
+-----------------------------------+------------+

View File

@@ -1,168 +0,0 @@
--------------------------------------------------------------------------------
CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
CPU type: Intel Cascadelake SP processor
CPU clock: 2.49 GHz
--------------------------------------------------------------------------------
Parameters:
Force field: lj
Kernel: plain-C
Data layout: AoS
Floating-point precision: double
Unit cells (nx, ny, nz): 32, 32, 32
Domain box sizes (x, y, z): 5.374708e+01, 5.374708e+01, 5.374708e+01
Periodic (x, y, z): 1, 1, 1
Lattice size: 1.679596e+00
Epsilon: 1.000000e+00
Sigma: 1.000000e+00
Spring constant: 1.000000e+00
Damping constant: 1.000000e+00
Temperature: 1.440000e+00
RHO: 8.442000e-01
Mass: 1.000000e+00
Number of types: 4
Number of timesteps: 200
Report stats every (timesteps): 100
Reneighbor every (timesteps): 20
Prune every (timesteps): 1000
Output positions every (timesteps): 20
Output velocities every (timesteps): 5
Delta time (dt): 5.000000e-03
Cutoff radius: 2.500000e+00
Skin: 3.000000e-01
Half neighbor lists: 0
Processor frequency (GHz): 2.0000
----------------------------------------------------------------------------
step temp pressure
0 1.440000e+00 1.215639e+00
100 8.200895e-01 6.923143e-01
200 7.961495e-01 6.721043e-01
----------------------------------------------------------------------------
System: 131072 atoms 47265 ghost atoms, Steps: 200
TOTAL 11.50s FORCE 5.28s NEIGH 5.91s REST 0.31s
----------------------------------------------------------------------------
Performance: 2.28 million atom updates per second
Statistics:
Vector width: 8, Processor frequency: 2.0000 GHz
Average neighbors per atom: 76.0352
Average SIMD iterations per atom: 9.9181
Total number of computed pair interactions: 2003182862
Total number of SIMD iterations: 261297661
Useful read data volume for force computation: 57.46GB
Cycles/SIMD iteration: 40.4432
--------------------------------------------------------------------------------
Region force, Group 1: MEM_DP
+-------------------+------------+
| Region Info | HWThread 0 |
+-------------------+------------+
| RDTSC Runtime [s] | 5.115807 |
| call count | 201 |
+-------------------+------------+
+------------------------------------------+---------+-------------+
| Event | Counter | HWThread 0 |
+------------------------------------------+---------+-------------+
| INSTR_RETIRED_ANY | FIXC0 | 12592470000 |
| CPU_CLK_UNHALTED_CORE | FIXC1 | 10196910000 |
| CPU_CLK_UNHALTED_REF | FIXC2 | 12746120000 |
| PWR_PKG_ENERGY | PWR0 | 307.9429 |
| PWR_DRAM_ENERGY | PWR3 | 0 |
| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | PMC0 | 0 |
| FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | PMC1 | 79042240 |
| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | PMC2 | 0 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 8076039000 |
| CAS_COUNT_RD | MBOX0C0 | 22734550 |
| CAS_COUNT_WR | MBOX0C1 | 1147714 |
| CAS_COUNT_RD | MBOX1C0 | 22755180 |
| CAS_COUNT_WR | MBOX1C1 | 1144415 |
| CAS_COUNT_RD | MBOX2C0 | 22762780 |
| CAS_COUNT_WR | MBOX2C1 | 1129051 |
| CAS_COUNT_RD | MBOX3C0 | 22905660 |
| CAS_COUNT_WR | MBOX3C1 | 1143324 |
| CAS_COUNT_RD | MBOX4C0 | 22914860 |
| CAS_COUNT_WR | MBOX4C1 | 1169116 |
| CAS_COUNT_RD | MBOX5C0 | 22890220 |
| CAS_COUNT_WR | MBOX5C1 | 1180739 |
+------------------------------------------+---------+-------------+
+-----------------------------------+------------+
| Metric | HWThread 0 |
+-----------------------------------+------------+
| Runtime (RDTSC) [s] | 5.1158 |
| Runtime unhalted [s] | 4.0885 |
| Clock [MHz] | 1995.2508 |
| CPI | 0.8098 |
| Energy [J] | 307.9429 |
| Power [W] | 60.1944 |
| Energy DRAM [J] | 0 |
| Power DRAM [W] | 0 |
| DP [MFLOP/s] | 12644.6041 |
| AVX DP [MFLOP/s] | 12629.1535 |
| Packed [MUOPS/s] | 1578.6442 |
| Scalar [MUOPS/s] | 15.4506 |
| Memory read bandwidth [MBytes/s] | 1713.4438 |
| Memory read data volume [GBytes] | 8.7656 |
| Memory write bandwidth [MBytes/s] | 86.5003 |
| Memory write data volume [GBytes] | 0.4425 |
| Memory bandwidth [MBytes/s] | 1799.9442 |
| Memory data volume [GBytes] | 9.2082 |
| Operational intensity | 7.0250 |
+-----------------------------------+------------+
Region reneighbour, Group 1: MEM_DP
+-------------------+------------+
| Region Info | HWThread 0 |
+-------------------+------------+
| RDTSC Runtime [s] | 5.897385 |
| call count | 10 |
+-------------------+------------+
+------------------------------------------+---------+-------------+
| Event | Counter | HWThread 0 |
+------------------------------------------+---------+-------------+
| INSTR_RETIRED_ANY | FIXC0 | 18212540000 |
| CPU_CLK_UNHALTED_CORE | FIXC1 | 11728500000 |
| CPU_CLK_UNHALTED_REF | FIXC2 | 14660630000 |
| PWR_PKG_ENERGY | PWR0 | 338.9000 |
| PWR_DRAM_ENERGY | PWR3 | 0 |
| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | PMC0 | 0 |
| FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | PMC1 | 6240402000 |
| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | PMC2 | 0 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 983040 |
| CAS_COUNT_RD | MBOX0C0 | 2086787 |
| CAS_COUNT_WR | MBOX0C1 | 1115626 |
| CAS_COUNT_RD | MBOX1C0 | 2089964 |
| CAS_COUNT_WR | MBOX1C1 | 1117021 |
| CAS_COUNT_RD | MBOX2C0 | 2103832 |
| CAS_COUNT_WR | MBOX2C1 | 1117965 |
| CAS_COUNT_RD | MBOX3C0 | 2086930 |
| CAS_COUNT_WR | MBOX3C1 | 1102471 |
| CAS_COUNT_RD | MBOX4C0 | 2094688 |
| CAS_COUNT_WR | MBOX4C1 | 1103018 |
| CAS_COUNT_RD | MBOX5C0 | 2097438 |
| CAS_COUNT_WR | MBOX5C1 | 1102525 |
+------------------------------------------+---------+-------------+
+-----------------------------------+------------+
| Metric | HWThread 0 |
+-----------------------------------+------------+
| Runtime (RDTSC) [s] | 5.8974 |
| Runtime unhalted [s] | 4.7026 |
| Clock [MHz] | 1995.2473 |
| CPI | 0.6440 |
| Energy [J] | 338.9000 |
| Power [W] | 57.4661 |
| Energy DRAM [J] | 0 |
| Power DRAM [W] | 0 |
| DP [MFLOP/s] | 1059.4978 |
| AVX DP [MFLOP/s] | 1.3335 |
| Packed [MUOPS/s] | 0.1667 |
| Scalar [MUOPS/s] | 1058.1643 |
| Memory read bandwidth [MBytes/s] | 136.3006 |
| Memory read data volume [GBytes] | 0.8038 |
| Memory write bandwidth [MBytes/s] | 72.2612 |
| Memory write data volume [GBytes] | 0.4262 |
| Memory bandwidth [MBytes/s] | 208.5618 |
| Memory data volume [GBytes] | 1.2300 |
| Operational intensity | 5.0800 |
+-----------------------------------+------------+

View File

@@ -1,88 +0,0 @@
--------------------------------------------------------------------------------
CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
CPU type: Intel Cascadelake SP processor
CPU clock: 2.49 GHz
--------------------------------------------------------------------------------
Initializing parameters...
Initializing atoms...
Creating atoms...
Pattern: seq
Number of timesteps: 200
Number of atoms: 256
Number of neighbors per atom: 1024
Number of times to replicate neighbor lists: 1
Estimated total data volume (kB): 1056.7680
Estimated atom data volume (kB): 3.0720
Estimated neighborlist data volume (kB): 1050.6240
Initializing neighbor lists...
Creating neighbor lists...
Computing forces...
Total time: 0.2466, Mega atom updates/s: 0.2076
Cycles per atom: 9631.9934, Cycles per neighbor: 9.4062
Statistics:
Vector width: 16, Processor frequency: 2.0000 GHz
Average neighbors per atom: 1018.9055
Average SIMD iterations per atom: 63.6816
Total number of computed pair interactions: 52428800
Total number of SIMD iterations: 3276800
Useful read data volume for force computation: 0.84GB
Cycles/SIMD iteration: 150.4999
--------------------------------------------------------------------------------
Region force, Group 1: MEM_SP
+-------------------+------------+
| Region Info | HWThread 0 |
+-------------------+------------+
| RDTSC Runtime [s] | 0.085843 |
| call count | 200 |
+-------------------+------------+
+------------------------------------------+---------+------------+
| Event | Counter | HWThread 0 |
+------------------------------------------+---------+------------+
| INSTR_RETIRED_ANY | FIXC0 | 129769100 |
| CPU_CLK_UNHALTED_CORE | FIXC1 | 172300100 |
| CPU_CLK_UNHALTED_REF | FIXC2 | 215371300 |
| PWR_PKG_ENERGY | PWR0 | 9.2849 |
| PWR_DRAM_ENERGY | PWR3 | 0 |
| FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE | PMC0 | 0 |
| FP_ARITH_INST_RETIRED_SCALAR_SINGLE | PMC1 | 154000 |
| FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | PMC2 | 0 |
| FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | PMC3 | 89088000 |
| CAS_COUNT_RD | MBOX0C0 | 8354 |
| CAS_COUNT_WR | MBOX0C1 | 1126 |
| CAS_COUNT_RD | MBOX1C0 | 7863 |
| CAS_COUNT_WR | MBOX1C1 | 1105 |
| CAS_COUNT_RD | MBOX2C0 | 7990 |
| CAS_COUNT_WR | MBOX2C1 | 1113 |
| CAS_COUNT_RD | MBOX3C0 | 4775 |
| CAS_COUNT_WR | MBOX3C1 | 1112 |
| CAS_COUNT_RD | MBOX4C0 | 4201 |
| CAS_COUNT_WR | MBOX4C1 | 1127 |
| CAS_COUNT_RD | MBOX5C0 | 4035 |
| CAS_COUNT_WR | MBOX5C1 | 1120 |
+------------------------------------------+---------+------------+
+-----------------------------------+------------+
| Metric | HWThread 0 |
+-----------------------------------+------------+
| Runtime (RDTSC) [s] | 0.0858 |
| Runtime unhalted [s] | 0.0691 |
| Clock [MHz] | 1995.2787 |
| CPI | 1.3277 |
| Energy [J] | 9.2849 |
| Power [W] | 108.1610 |
| Energy DRAM [J] | 0 |
| Power DRAM [W] | 0 |
| SP [MFLOP/s] | 16606.5397 |
| AVX SP [MFLOP/s] | 16604.7458 |
| Packed [MUOPS/s] | 1037.7966 |
| Scalar [MUOPS/s] | 1.7940 |
| Memory read bandwidth [MBytes/s] | 27.7476 |
| Memory read data volume [GBytes] | 0.0024 |
| Memory write bandwidth [MBytes/s] | 4.9974 |
| Memory write data volume [GBytes] | 0.0004 |
| Memory bandwidth [MBytes/s] | 32.7450 |
| Memory data volume [GBytes] | 0.0028 |
| Operational intensity | 507.1471 |
+-----------------------------------+------------+

View File

@@ -1,168 +0,0 @@
--------------------------------------------------------------------------------
CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
CPU type: Intel Cascadelake SP processor
CPU clock: 2.49 GHz
--------------------------------------------------------------------------------
Parameters:
Force field: lj
Kernel: plain-C
Data layout: AoS
Floating-point precision: single
Unit cells (nx, ny, nz): 32, 32, 32
Domain box sizes (x, y, z): 5.374708e+01, 5.374708e+01, 5.374708e+01
Periodic (x, y, z): 1, 1, 1
Lattice size: 1.679596e+00
Epsilon: 1.000000e+00
Sigma: 1.000000e+00
Spring constant: 1.000000e+00
Damping constant: 1.000000e+00
Temperature: 1.440000e+00
RHO: 8.442000e-01
Mass: 1.000000e+00
Number of types: 4
Number of timesteps: 200
Report stats every (timesteps): 100
Reneighbor every (timesteps): 20
Prune every (timesteps): 1000
Output positions every (timesteps): 20
Output velocities every (timesteps): 5
Delta time (dt): 5.000000e-03
Cutoff radius: 2.500000e+00
Skin: 3.000000e-01
Half neighbor lists: 0
Processor frequency (GHz): 2.0000
----------------------------------------------------------------------------
step temp pressure
0 1.440000e+00 1.215639e+00
100 8.200897e-01 6.923144e-01
200 7.961481e-01 6.721031e-01
----------------------------------------------------------------------------
System: 131072 atoms 47265 ghost atoms, Steps: 200
TOTAL 10.83s FORCE 4.62s NEIGH 5.94s REST 0.26s
----------------------------------------------------------------------------
Performance: 2.42 million atom updates per second
Statistics:
Vector width: 16, Processor frequency: 2.0000 GHz
Average neighbors per atom: 76.0351
Average SIMD iterations per atom: 5.0875
Total number of computed pair interactions: 2003181259
Total number of SIMD iterations: 134032075
Useful read data volume for force computation: 32.79GB
Cycles/SIMD iteration: 68.9511
--------------------------------------------------------------------------------
Region force, Group 1: MEM_SP
+-------------------+------------+
| Region Info | HWThread 0 |
+-------------------+------------+
| RDTSC Runtime [s] | 4.452877 |
| call count | 201 |
+-------------------+------------+
+------------------------------------------+---------+-------------+
| Event | Counter | HWThread 0 |
+------------------------------------------+---------+-------------+
| INSTR_RETIRED_ANY | FIXC0 | 7428719000 |
| CPU_CLK_UNHALTED_CORE | FIXC1 | 8875251000 |
| CPU_CLK_UNHALTED_REF | FIXC2 | 11094050000 |
| PWR_PKG_ENERGY | PWR0 | 265.5057 |
| PWR_DRAM_ENERGY | PWR3 | 0 |
| FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE | PMC0 | 0 |
| FP_ARITH_INST_RETIRED_SCALAR_SINGLE | PMC1 | 79036820 |
| FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | PMC2 | 0 |
| FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | PMC3 | 3935012000 |
| CAS_COUNT_RD | MBOX0C0 | 19716700 |
| CAS_COUNT_WR | MBOX0C1 | 595747 |
| CAS_COUNT_RD | MBOX1C0 | 19734880 |
| CAS_COUNT_WR | MBOX1C1 | 597090 |
| CAS_COUNT_RD | MBOX2C0 | 19732800 |
| CAS_COUNT_WR | MBOX2C1 | 595219 |
| CAS_COUNT_RD | MBOX3C0 | 19886430 |
| CAS_COUNT_WR | MBOX3C1 | 632443 |
| CAS_COUNT_RD | MBOX4C0 | 19887210 |
| CAS_COUNT_WR | MBOX4C1 | 633169 |
| CAS_COUNT_RD | MBOX5C0 | 19935560 |
| CAS_COUNT_WR | MBOX5C1 | 634112 |
+------------------------------------------+---------+-------------+
+-----------------------------------+------------+
| Metric | HWThread 0 |
+-----------------------------------+------------+
| Runtime (RDTSC) [s] | 4.4529 |
| Runtime unhalted [s] | 3.5585 |
| Clock [MHz] | 1995.2693 |
| CPI | 1.1947 |
| Energy [J] | 265.5057 |
| Power [W] | 59.6257 |
| Energy DRAM [J] | 0 |
| Power DRAM [W] | 0 |
| SP [MFLOP/s] | 14156.9661 |
| AVX SP [MFLOP/s] | 14139.2165 |
| Packed [MUOPS/s] | 883.7010 |
| Scalar [MUOPS/s] | 17.7496 |
| Memory read bandwidth [MBytes/s] | 1708.8254 |
| Memory read data volume [GBytes] | 7.6092 |
| Memory write bandwidth [MBytes/s] | 53.0035 |
| Memory write data volume [GBytes] | 0.2360 |
| Memory bandwidth [MBytes/s] | 1761.8288 |
| Memory data volume [GBytes] | 7.8452 |
| Operational intensity | 8.0354 |
+-----------------------------------+------------+
Region reneighbour, Group 1: MEM_SP
+-------------------+------------+
| Region Info | HWThread 0 |
+-------------------+------------+
| RDTSC Runtime [s] | 5.935627 |
| call count | 10 |
+-------------------+------------+
+------------------------------------------+---------+-------------+
| Event | Counter | HWThread 0 |
+------------------------------------------+---------+-------------+
| INSTR_RETIRED_ANY | FIXC0 | 18208530000 |
| CPU_CLK_UNHALTED_CORE | FIXC1 | 11805500000 |
| CPU_CLK_UNHALTED_REF | FIXC2 | 14756870000 |
| PWR_PKG_ENERGY | PWR0 | 340.7903 |
| PWR_DRAM_ENERGY | PWR3 | 0 |
| FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE | PMC0 | 0 |
| FP_ARITH_INST_RETIRED_SCALAR_SINGLE | PMC1 | 6240406000 |
| FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | PMC2 | 0 |
| FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | PMC3 | 491520 |
| CAS_COUNT_RD | MBOX0C0 | 1772377 |
| CAS_COUNT_WR | MBOX0C1 | 975760 |
| CAS_COUNT_RD | MBOX1C0 | 1770611 |
| CAS_COUNT_WR | MBOX1C1 | 977433 |
| CAS_COUNT_RD | MBOX2C0 | 1771722 |
| CAS_COUNT_WR | MBOX2C1 | 979122 |
| CAS_COUNT_RD | MBOX3C0 | 1782901 |
| CAS_COUNT_WR | MBOX3C1 | 967621 |
| CAS_COUNT_RD | MBOX4C0 | 1780789 |
| CAS_COUNT_WR | MBOX4C1 | 967179 |
| CAS_COUNT_RD | MBOX5C0 | 1784733 |
| CAS_COUNT_WR | MBOX5C1 | 969349 |
+------------------------------------------+---------+-------------+
+-----------------------------------+------------+
| Metric | HWThread 0 |
+-----------------------------------+------------+
| Runtime (RDTSC) [s] | 5.9356 |
| Runtime unhalted [s] | 4.7334 |
| Clock [MHz] | 1995.2675 |
| CPI | 0.6483 |
| Energy [J] | 340.7903 |
| Power [W] | 57.4144 |
| Energy DRAM [J] | 0 |
| Power DRAM [W] | 0 |
| SP [MFLOP/s] | 1052.6723 |
| AVX SP [MFLOP/s] | 1.3249 |
| Packed [MUOPS/s] | 0.0828 |
| Scalar [MUOPS/s] | 1051.3474 |
| Memory read bandwidth [MBytes/s] | 114.9736 |
| Memory read data volume [GBytes] | 0.6824 |
| Memory write bandwidth [MBytes/s] | 62.9308 |
| Memory write data volume [GBytes] | 0.3735 |
| Memory bandwidth [MBytes/s] | 177.9044 |
| Memory data volume [GBytes] | 1.0560 |
| Operational intensity | 5.9171 |
+-----------------------------------+------------+

View File

@@ -1,148 +0,0 @@
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
Analyzed File - gromacs-avx512-dp-ICX.o
Binary Format - 64Bit
Architecture - SKX
Analysis Type - Throughput
Throughput Analysis Report
--------------------------
Block Throughput: 47.68 Cycles Throughput Bottleneck: Backend
Loop Count: 22
Port Binding In Cycles Per Iteration:
--------------------------------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
--------------------------------------------------------------------------------------------------
| Cycles | 42.0 0.0 | 12.5 | 5.0 5.0 | 5.0 5.0 | 0.0 | 42.0 | 12.5 | 0.0 |
--------------------------------------------------------------------------------------------------
DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3)
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion occurred
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
X - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
-----------------------------------------------------------------------------------------
| 1 | | | 1.0 1.0 | | | | | | movsxd rbx, dword ptr [r12+r14*4]
| 1 | | 1.0 | | | | | | | lea rcx, ptr [rbx+rbx*2]
| 1 | | | | | | | 1.0 | | shl rcx, 0x6
| 1 | | | | 1.0 1.0 | | | | | vmovapd zmm29, zmmword ptr [rsi+rcx*1]
| 1 | | | 1.0 1.0 | | | | | | vmovapd zmm30, zmmword ptr [rsi+rcx*1+0x40]
| 1 | | | | 1.0 1.0 | | | | | vmovapd zmm31, zmmword ptr [rsi+rcx*1+0x80]
| 1 | | | 1.0 1.0 | | | | | | vmovupd zmm3, zmmword ptr [rsp+0x40]
| 1 | | | | | | 1.0 | | | vsubpd zmm4, zmm3, zmm29
| 1 | | | | 1.0 1.0 | | | | | vmovupd zmm3, zmmword ptr [rsp+0x140]
| 1 | 1.0 | | | | | | | | vsubpd zmm3, zmm3, zmm30
| 1 | | 1.0 | | | | | | | lea ecx, ptr [rbx+rbx*1]
| 1 | | | | | | | 1.0 | | cmp rdi, rcx
| 1 | | | | | | | 1.0 | | setnz dl
| 1 | | | | | | | 1.0 | | setz cl
| 1 | | 1.0 | | | | | | | lea ebx, ptr [rbx+rbx*1+0x1]
| 1 | | | | | | 1.0 | | | vsubpd zmm17, zmm25, zmm31
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm17, zmm17
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm18, zmm3, zmm3
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm18, zmm4, zmm4
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm19, zmm18
| 1 | | 1.0 | | | | | | | cmp rdi, rbx
| 1 | | | | | | | 1.0 | | setz bl
| 1* | | | | | | | | | mov ebp, ebx
| 1 | | | | | | 1.0 | | | vmulpd zmm20, zmm19, zmm22
| 1 | | | | | | 1.0 | | | vmulpd zmm21, zmm19, zmm19
| 1 | 1.0 | | | | | | | | vmulpd zmm20, zmm21, zmm20
| 1 | | | 1.0 1.0 | | | | | | vmovupd zmm21, zmmword ptr [rsp+0x80]
| 1 | | | | | | 1.0 | | | vsubpd zmm21, zmm21, zmm29
| 1 | | | | | | | 1.0 | | shl bpl, 0x4
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm1, zmm19
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm19, zmm20
| 1 | 1.0 | | | | | | | | vaddpd zmm20, zmm20, zmm2
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm19, zmm20
| 1 | | | | 1.0 1.0 | | | | | vmovupd zmm20, zmmword ptr [rsp+0x100]
| 1 | 1.0 | | | | | | | | vsubpd zmm20, zmm20, zmm30
| 1 | | 1.0 | | | | | | | not bpl
| 1 | | 1.0 | | | | | | | sub bpl, cl
| 1 | | | | | | 1.0 | | | kmovd k1, ebp
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm18, zmm0, 0x1
| 1 | 1.0 | | | | | | | | vsubpd zmm18, zmm26, zmm31
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm15{k1}, zmm19, zmm4
| 1 | | | | | | 1.0 | | | vmulpd zmm4, zmm18, zmm18
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm4, zmm20, zmm20
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm4, zmm21, zmm21
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12{k1}, zmm19, zmm3
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm3, zmm4
| 1 | | 1.0 | | | | | | | lea ecx, ptr [rdx+rdx*1]
| 1* | | | | | | | | | mov eax, ebx
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm8{k1}, zmm19, zmm17
| 1 | | | | | | 1.0 | | | vmulpd zmm17, zmm3, zmm22
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm3, zmm3
| 1 | | | | | | 1.0 | | | vmulpd zmm17, zmm19, zmm17
| 1 | | | 1.0 1.0 | | | | | | vmovupd zmm19, zmmword ptr [rsp+0x1c0]
| 1 | 1.0 | | | | | | | | vsubpd zmm19, zmm19, zmm29
| 1 | | | | | | | 1.0 | | shl al, 0x5
| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm1, zmm3
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm3, zmm17
| 1 | | | | | | 1.0 | | | vaddpd zmm17, zmm17, zmm2
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm3, zmm17
| 1 | | | | | | 1.0 | | | vsubpd zmm17, zmm23, zmm30
| 1 | | 0.5 | | | | | 0.5 | | sub cl, al
| 1 | | 0.5 | | | | | 0.5 | | add cl, 0xfd
| 1 | | | | | | 1.0 | | | kmovd k1, ecx
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm4, zmm0, 0x1
| 1 | 1.0 | | | | | | | | vsubpd zmm4, zmm27, zmm31
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm14{k1}, zmm3, zmm21
| 1 | 1.0 | | | | | | | | vmulpd zmm21, zmm4, zmm4
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm21, zmm17, zmm17
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm21, zmm19, zmm19
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm10{k1}, zmm3, zmm20
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm20, zmm21
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm6{k1}, zmm3, zmm18
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm20, zmm22
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm20, zmm20
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm18, zmm3
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm1, zmm20
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm18, zmm3
| 1 | | | | | | 1.0 | | | vaddpd zmm3, zmm3, zmm2
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm18, zmm3
| 1 | | 1.0 | | | | | | | lea eax, ptr [rdx*4]
| 1* | | | | | | | | | mov ecx, ebx
| 1 | | | | | | | 1.0 | | shl cl, 0x6
| 1 | | 0.5 | | | | | 0.5 | | sub al, cl
| 1 | | 0.5 | | | | | 0.5 | | add al, 0xfb
| 1 | | | | | | 1.0 | | | kmovd k1, eax
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm21, zmm0, 0x1
| 1 | | | | 1.0 1.0 | | | | | vmovupd zmm18, zmmword ptr [rsp+0x180]
| 1 | 1.0 | | | | | | | | vsubpd zmm18, zmm18, zmm29
| 1 | 1.0 | | | | | | | | vsubpd zmm20, zmm24, zmm30
| 1 | | | | | | 1.0 | | | vsubpd zmm21, zmm28, zmm31
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm16{k1}, zmm3, zmm19
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm21, zmm21
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm19, zmm20, zmm20
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm19, zmm18, zmm18
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm11{k1}, zmm3, zmm17
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm17, zmm19
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm7{k1}, zmm3, zmm4
| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm17, zmm22
| 1 | 1.0 | | | | | | | | vmulpd zmm4, zmm17, zmm17
| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm4, zmm3
| 1 | 1.0 | | | | | | | | vmulpd zmm4, zmm1, zmm17
| 1 | | | | | | 1.0 | | | vmulpd zmm4, zmm4, zmm3
| 1 | 1.0 | | | | | | | | vaddpd zmm3, zmm3, zmm2
| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm4, zmm3
| 1 | | | | | | | 1.0 | | shl dl, 0x3
| 1 | | | | | | | 1.0 | | shl bl, 0x7
| 1 | | 1.0 | | | | | | | sub dl, bl
| 1 | | 1.0 | | | | | | | add dl, 0xf7
| 1 | | | | | | 1.0 | | | kmovd k1, edx
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm19, zmm0, 0x1
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm13{k1}, zmm3, zmm18
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm9{k1}, zmm3, zmm20
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm5{k1}, zmm3, zmm21
| 1 | | 0.5 | | | | | 0.5 | | inc r14
| 1* | | | | | | | | | cmp r11, r14
| 0*F | | | | | | | | | jnz 0xfffffffffffffd99
Total Num Of Uops: 123
Analysis Notes:
Backend allocation was stalled due to unavailable allocation resources.

View File

@@ -1,159 +0,0 @@
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
Analyzed file: gromacs-avx512-dp-ICX.s
Architecture: CSX
Timestamp: 2023-01-03 00:07:20
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
--------------------------------------------------------------------------------------------------
2287 | | | | | | | | || | | .LBB5_11: #
2288 | | | | | | | | || | | # Parent Loop BB5_6 Depth=1
2289 | | | | | | | | || | | # => This Inner Loop Header: Depth=2
2290 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | movslq (%r12,%r14,4), %rbx
2291 | | 1.00 | | | | 0.00 | | || 1.0 | | leaq (%rbx,%rbx,2), %rcx
2292 | 0.00 | | | | | | 1.00 | || 1.0 | | shlq $6, %rcx
2293 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovapd (%rsi,%rcx), %zmm29
2294 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovapd 64(%rsi,%rcx), %zmm30
2295 | | | 0.50 0.50 | 0.50 0.50 | | | | || 0.0 | | vmovapd 128(%rsi,%rcx), %zmm31
2296 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 64(%rsp), %zmm3 # 64-byte Reload
2297 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm3, %zmm4
2298 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 320(%rsp), %zmm3 # 64-byte Reload
2299 | 0.75 | | | | | 0.25 | | || | | vsubpd %zmm30, %zmm3, %zmm3
2300 | | 1.00 | | | | 0.00 | | || | | leal (%rbx,%rbx), %ecx
2301 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rcx, %rdi
2302 | 0.00 | | | | | | 1.00 | || | | setne %dl
2303 | 0.00 | | | | | | 1.00 | || | | sete %cl
2304 | | 1.00 | | | | | | || | | leal 1(%rbx,%rbx), %ebx
2305 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm31, %zmm25, %zmm17
2306 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm17, %zmm17, %zmm18
2307 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm3, %zmm3, %zmm18 # zmm18 = (zmm3 * zmm3) + zmm18
2308 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm4, %zmm4, %zmm18 # zmm18 = (zmm4 * zmm4) + zmm18
2309 | 2.75 | | | | | 0.25 | | || 8.0 | | vrcp14pd %zmm18, %zmm19
2310 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | cmpq %rbx, %rdi
2311 | 0.00 | | | | | | 1.00 | || | | sete %bl
2312 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %ebx, %ebp
2313 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm22, %zmm19, %zmm20
2314 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm19, %zmm19, %zmm21
2315 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm21, %zmm20
2316 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 128(%rsp), %zmm21 # 64-byte Reload
2317 | 0.75 | | | | | 0.25 | | || | | vsubpd %zmm29, %zmm21, %zmm21
2318 | 0.00 | | | | | | 1.00 | || | | shlb $4, %bpl
2319 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm19, %zmm1, %zmm19
2320 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm19, %zmm19
2321 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm2, %zmm20, %zmm20
2322 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm19, %zmm19
2323 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 256(%rsp), %zmm20 # 64-byte Reload
2324 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm20, %zmm20
2325 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | notb %bpl
2326 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | | subb %cl, %bpl
2327 | 1.00 | | | | | | | || | | kmovd %ebp, %k1
2328 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm18, %k1 {%k1}
2329 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm31, %zmm26, %zmm18
2330 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm4, %zmm19, %zmm15 {%k1} # zmm15 = (zmm19 * zmm4) + zmm15
2331 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm18, %zmm4
2332 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm20, %zmm20, %zmm4 # zmm4 = (zmm20 * zmm20) + zmm4
2333 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm21, %zmm21, %zmm4 # zmm4 = (zmm21 * zmm21) + zmm4
2334 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm3, %zmm19, %zmm12 {%k1} # zmm12 = (zmm19 * zmm3) + zmm12
2335 | 2.25 | | | | | 0.75 | | || | | vrcp14pd %zmm4, %zmm3
2336 | | 1.00 | | | | 0.00 | | || | | leal (%rdx,%rdx), %ecx
2337 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %ebx, %eax
2338 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm17, %zmm19, %zmm8 {%k1} # zmm8 = (zmm19 * zmm17) + zmm8
2339 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm22, %zmm3, %zmm17
2340 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm3, %zmm19
2341 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm19, %zmm17
2342 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 448(%rsp), %zmm19 # 64-byte Reload
2343 | 0.75 | | | | | 0.25 | | || | | vsubpd %zmm29, %zmm19, %zmm19
2344 | 0.00 | | | | | | 1.00 | || | | shlb $5, %al
2345 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm1, %zmm3
2346 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm3, %zmm3
2347 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm2, %zmm17, %zmm17
2348 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm3, %zmm3
2349 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm23, %zmm17
2350 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | subb %al, %cl
2351 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | | addb $-3, %cl
2352 | 1.00 | | | | | | | || | | kmovd %ecx, %k1
2353 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm4, %k1 {%k1}
2354 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm31, %zmm27, %zmm4
2355 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm21, %zmm3, %zmm14 {%k1} # zmm14 = (zmm3 * zmm21) + zmm14
2356 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm4, %zmm4, %zmm21
2357 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm17, %zmm17, %zmm21 # zmm21 = (zmm17 * zmm17) + zmm21
2358 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm19, %zmm19, %zmm21 # zmm21 = (zmm19 * zmm19) + zmm21
2359 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm20, %zmm3, %zmm10 {%k1} # zmm10 = (zmm3 * zmm20) + zmm10
2360 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm21, %zmm20
2361 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm18, %zmm3, %zmm6 {%k1} # zmm6 = (zmm3 * zmm18) + zmm6
2362 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm22, %zmm20, %zmm3
2363 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm20, %zmm18
2364 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm18, %zmm3
2365 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm1, %zmm18
2366 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm18, %zmm18
2367 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm2, %zmm3, %zmm3
2368 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm18, %zmm3
2369 | | 1.00 | | | | 0.00 | | || | | leal (,%rdx,4), %eax
2370 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %ebx, %ecx
2371 | 0.00 | | | | | | 1.00 | || | | shlb $6, %cl
2372 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | subb %cl, %al
2373 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | addb $-5, %al
2374 | 1.00 | | | | | | | || | | kmovd %eax, %k1
2375 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm21, %k1 {%k1}
2376 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 384(%rsp), %zmm18 # 64-byte Reload
2377 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm18, %zmm18
2378 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm24, %zmm20
2379 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm31, %zmm28, %zmm21
2380 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm19, %zmm3, %zmm16 {%k1} # zmm16 = (zmm3 * zmm19) + zmm16
2381 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm21, %zmm21, %zmm19
2382 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm20, %zmm20, %zmm19 # zmm19 = (zmm20 * zmm20) + zmm19
2383 | 0.25 | | | | | 0.75 | | || | | vfmadd231pd %zmm18, %zmm18, %zmm19 # zmm19 = (zmm18 * zmm18) + zmm19
2384 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm17, %zmm3, %zmm11 {%k1} # zmm11 = (zmm3 * zmm17) + zmm11
2385 | 2.00 | | | | | 1.00 | | || | | vrcp14pd %zmm19, %zmm17
2386 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm4, %zmm3, %zmm7 {%k1} # zmm7 = (zmm3 * zmm4) + zmm7
2387 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm22, %zmm17, %zmm3
2388 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm17, %zmm17, %zmm4
2389 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm4, %zmm3
2390 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm17, %zmm1, %zmm4
2391 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm4, %zmm4
2392 | 0.00 | | | | | 1.00 | | || | | vaddpd %zmm2, %zmm3, %zmm3
2393 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm4, %zmm3
2394 | 0.00 | | | | | | 1.00 | || | | shlb $3, %dl
2395 | 0.00 | | | | | | 1.00 | || | | shlb $7, %bl
2396 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | subb %bl, %dl
2397 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | addb $-9, %dl
2398 | 1.00 | | | | | | | || | | kmovd %edx, %k1
2399 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm19, %k1 {%k1}
2400 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm18, %zmm3, %zmm13 {%k1} # zmm13 = (zmm3 * zmm18) + zmm13
2401 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm20, %zmm3, %zmm9 {%k1} # zmm9 = (zmm3 * zmm20) + zmm9
2402 | 0.00 | | | | | 1.00 | | || | 4.0 | vfmadd231pd %zmm21, %zmm3, %zmm5 {%k1} # zmm5 = (zmm3 * zmm21) + zmm5
2403 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | incq %r14
2404 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %r14, %r11
2405 | | | | | | | | || | | * jne .LBB5_11
40.0 14.5 5.00 5.00 5.00 5.00 40.0 14.5 50.0 4.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
2402 | 4.0 | vfmadd231pd %zmm21, %zmm3, %zmm5 {%k1} # zmm5 = (zmm3 * zmm21) + zmm5| [2402]
2401 | 4.0 | vfmadd231pd %zmm20, %zmm3, %zmm9 {%k1} # zmm9 = (zmm3 * zmm20) + zmm9| [2401]
2400 | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm13 {%k1} # zmm13 = (zmm3 * zmm18) + zmm13| [2400]
2386 | 4.0 | vfmadd231pd %zmm4, %zmm3, %zmm7 {%k1} # zmm7 = (zmm3 * zmm4) + zmm7| [2386]
2384 | 4.0 | vfmadd231pd %zmm17, %zmm3, %zmm11 {%k1} # zmm11 = (zmm3 * zmm17) + zmm11| [2384]
2380 | 4.0 | vfmadd231pd %zmm19, %zmm3, %zmm16 {%k1} # zmm16 = (zmm3 * zmm19) + zmm16| [2380]
2361 | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm6 {%k1} # zmm6 = (zmm3 * zmm18) + zmm6| [2361]
2359 | 4.0 | vfmadd231pd %zmm20, %zmm3, %zmm10 {%k1} # zmm10 = (zmm3 * zmm20) + zmm10| [2359]
2355 | 4.0 | vfmadd231pd %zmm21, %zmm3, %zmm14 {%k1} # zmm14 = (zmm3 * zmm21) + zmm14| [2355]
2338 | 4.0 | vfmadd231pd %zmm17, %zmm19, %zmm8 {%k1} # zmm8 = (zmm19 * zmm17) + zmm8| [2338]
2334 | 4.0 | vfmadd231pd %zmm3, %zmm19, %zmm12 {%k1} # zmm12 = (zmm19 * zmm3) + zmm12| [2334]
2330 | 4.0 | vfmadd231pd %zmm4, %zmm19, %zmm15 {%k1} # zmm15 = (zmm19 * zmm4) + zmm15| [2330]
2394 | 3.0 | shlb $3, %dl | [2394, 2396, 2397]
2318 | 3.0 | shlb $4, %bpl | [2318, 2325, 2326]
2403 | 1.0 | incq %r14 | [2403]

File diff suppressed because it is too large Load Diff

View File

@@ -1,198 +0,0 @@
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
Analyzed File - gromacs-icc-avx512-dp.o
Binary Format - 64Bit
Architecture - SKX
Analysis Type - Throughput
Throughput Analysis Report
--------------------------
Block Throughput: 62.00 Cycles Throughput Bottleneck: Backend
Loop Count: 22
Port Binding In Cycles Per Iteration:
--------------------------------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
--------------------------------------------------------------------------------------------------
| Cycles | 58.0 0.0 | 16.0 | 16.0 15.0 | 16.0 15.0 | 2.0 | 58.0 | 16.0 | 0.0 |
--------------------------------------------------------------------------------------------------
DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3)
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion occurred
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
X - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
-----------------------------------------------------------------------------------------
| 1 | | | 1.0 1.0 | | | | | | mov edx, dword ptr [r10+rsi*4]
| 1 | | | | | | | 1.0 | | inc rsi
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm20, zmmword ptr [rsp+0x380]
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm25, zmmword ptr [rsp+0x340]
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm24, zmmword ptr [rsp+0x1c0]
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm23, zmmword ptr [rsp+0x2c0]
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm16, zmmword ptr [rsp+0x3c0]
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm14, zmmword ptr [rsp+0x300]
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm15, zmmword ptr [rsp+0x240]
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm12, zmmword ptr [rsp+0x180]
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm21, zmmword ptr [rsp+0x200]
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm18, zmmword ptr [rsp+0x140]
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm22, zmmword ptr [rsp+0x100]
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm17, zmmword ptr [rsp+0x280]
| 1 | | 1.0 | | | | | | | lea r12d, ptr [rdx+rdx*2]
| 1 | | | | | | | 1.0 | | shl r12d, 0x3
| 1 | | 1.0 | | | | | | | lea r13d, ptr [rdx+rdx*1]
| 1 | | | | | | | 1.0 | | movsxd r12, r12d
| 1 | | 1.0 | | | | | | | cmp r13d, r11d
| 1 | | 1.0 | | | | | | | lea eax, ptr [rdx+rdx*1+0x1]
| 1 | | | | | | | 1.0 | | mov edx, 0x0
| 1 | | | | | | | 1.0 | | setz dl
| 1 | | 1.0 | | | | | | | cmp eax, r11d
| 1 | | | | | | | 1.0 | | mov eax, 0x0
| 1* | | | | | | | | | mov r13d, edx
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm29, zmm20, zmmword ptr [r8+r12*8+0x80]
| 1 | | | | | | | 1.0 | | setz al
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm26, zmm25, zmmword ptr [r8+r12*8+0x80]
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm25, zmm24, zmmword ptr [r8+r12*8]
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm24, zmm23, zmmword ptr [r8+r12*8+0x40]
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm23, zmm16, zmmword ptr [r8+r12*8+0x80]
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm20, zmm14, zmmword ptr [r8+r12*8+0x80]
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm27, zmm12, zmmword ptr [r8+r12*8+0x40]
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm28, zmm15, zmmword ptr [r8+r12*8]
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm30, zmm21, zmmword ptr [r8+r12*8+0x40]
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm21, zmm18, zmmword ptr [r8+r12*8+0x40]
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm31, zmm22, zmmword ptr [r8+r12*8]
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm22, zmm17, zmmword ptr [r8+r12*8]
| 1 | 1.0 | | | | | | | | vmulpd zmm13, zmm29, zmm29
| 1 | | | | | | 1.0 | | | vmulpd zmm15, zmm26, zmm26
| 1 | 1.0 | | | | | | | | vmulpd zmm12, zmm23, zmm23
| 1 | | | | | | 1.0 | | | vmulpd zmm14, zmm20, zmm20
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm16, zmmword ptr [rsp+0xc0]
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm13, zmm30, zmm30
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm15, zmm27, zmm27
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12, zmm24, zmm24
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm14, zmm21, zmm21
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm13, zmm31, zmm31
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm15, zmm28, zmm28
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12, zmm25, zmm25
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm14, zmm22, zmm22
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm19, zmm13
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm18, zmm15
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm17, zmm12
| 1 | | | | | | 1.0 | | | vcmppd k1, zmm13, zmm16, 0x11
| 1 | | | | | | 1.0 | | | vcmppd k6, zmm15, zmm16, 0x11
| 1 | | | | | | 1.0 | | | vcmppd k7, zmm12, zmm16, 0x11
| 1 | | | | | | 1.0 | | | vcmppd k0, zmm14, zmm16, 0x11
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm15, zmm14
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm16, zmmword ptr [rsp+0x40]
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm12, zmmword ptr [rsp+0x80]
| 1 | 1.0 | | | | | | | | vmulpd zmm13, zmm19, zmm16
| 1 | | | | | | 1.0 | | | vmulpd zmm13, zmm19, zmm13
| 1 | | 1.0 | | | | | | | neg r13d
| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm19, zmm13
| 1* | | | | | | | | | mov r12d, eax
| 1 | | | | | | 1.0 | | | vfmsub213pd zmm13, zmm19, zmm1
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm19, zmm12
| 1 | | | | | | 1.0 | | | vmulpd zmm13, zmm13, zmm19
| 1 | | 1.0 | | | | | | | add r13d, 0xff
| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm14, zmm13
| 1 | | | | | | | 1.0 | | nop
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm13, zmmword ptr [rsp+0x400]
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm10, zmm14
| 1 | | | | | | | 1.0 | | shl r12d, 0x4
| 1 | | 1.0 | | | | | | | sub r13d, r12d
| 1 | | | | | | 1.0 | | | kmovb k5, r13d
| 1 | 1.0 | | | | | | | | kmovw r13d, k1
| 1 | 1.0 | | | | | | | | kmovb r12d, k5
| 1 | | | | | | 1.0 | | | kmovb k5, r12d
| 1 | | | | | | 1.0 | | | kmovb k1, r13d
| 1* | | | | | | | | | mov r13d, eax
| 1 | 1.0 | | | | | | | | kandb k5, k5, k1
| 1 | 1.0 | | | | | | | | kmovb r12d, k5
| 1 | | | | | | 1.0 | | | kmovw k5, r12d
| 1 | | 1.0 | | | | | | | lea r12d, ptr [rdx+rdx*1]
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm9{k5}, zmm19, zmm29
| 1 | | | | | | | 1.0 | | neg r12d
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm13{k5}, zmm19, zmm31
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm31, zmmword ptr [rsp+0x440]
| 1 | 1.0 | | | | | | | | vmulpd zmm29, zmm18, zmm16
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm31{k5}, zmm19, zmm30
| 2^ | | | 1.0 | | 1.0 | | | | vmovups zmmword ptr [rsp+0x400], zmm13
| 1 | 1.0 | | | | | | | | vmulpd zmm30, zmm18, zmm29
| 2^ | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [rsp+0x440], zmm31
| 1 | | | | | | 1.0 | | | vmulpd zmm13, zmm18, zmm30
| 1 | 1.0 | | | | | | | | vfmsub213pd zmm30, zmm18, zmm1
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm18, zmm12
| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm30, zmm18
| 1 | | 1.0 | | | | | | | add r12d, 0xff
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm13, zmm14
| 1 | 1.0 | | | | | | | | vmulpd zmm29, zmm10, zmm19
| 1 | | | | | | | 1.0 | | shl r13d, 0x5
| 1 | | 1.0 | | | | | | | sub r12d, r13d
| 1 | | | | | | 1.0 | | | kmovb k1, r12d
| 1 | 1.0 | | | | | | | | kmovw r12d, k6
| 1 | 1.0 | | | | | | | | kmovb r13d, k1
| 1 | | | | | | 1.0 | | | kmovb k1, r13d
| 1 | | | | | | 1.0 | | | kmovb k6, r12d
| 1* | | | | | | | | | mov r12d, eax
| 1 | 1.0 | | | | | | | | kandb k1, k1, k6
| 1 | 1.0 | | | | | | | | kmovb r13d, k1
| 1 | | | | | | 1.0 | | | kmovw k1, r13d
| 1 | | 1.0 | | | | | | | lea r13d, ptr [rdx*4]
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm6{k1}, zmm29, zmm26
| 1 | | | | | | | 1.0 | | neg r13d
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm7{k1}, zmm29, zmm27
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm8{k1}, zmm29, zmm28
| 1 | 1.0 | | | | | | | | vmulpd zmm26, zmm17, zmm16
| 1 | | | | | | 1.0 | | | vmulpd zmm28, zmm17, zmm12
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm15, zmm16
| 1 | | | | | | 1.0 | | | vmulpd zmm12, zmm15, zmm12
| 1 | 1.0 | | | | | | | | vmulpd zmm27, zmm17, zmm26
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm15, zmm16
| 1 | 1.0 | | | | | | | | vmulpd zmm13, zmm17, zmm27
| 1 | | | | | | 1.0 | | | vfmsub213pd zmm27, zmm17, zmm1
| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm27, zmm28
| 1 | | | | | | | 1.0 | | add r13d, 0xff
| 1 | | | | | | 1.0 | | | vmulpd zmm17, zmm13, zmm14
| 1 | | | | | | | 1.0 | | shl edx, 0x3
| 1 | | | | | | | 1.0 | | shl r12d, 0x6
| 1 | | 1.0 | | | | | | | neg edx
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm10, zmm17
| 1 | | 1.0 | | | | | | | sub r13d, r12d
| 1 | | | | | | 1.0 | | | kmovb k6, r13d
| 1 | | 1.0 | | | | | | | add edx, 0xff
| 1 | | | | | | | 1.0 | | shl eax, 0x7
| 1 | | 1.0 | | | | | | | sub edx, eax
| 1 | 1.0 | | | | | | | | kmovb eax, k6
| 1 | | | | | | 1.0 | | | kmovb k6, eax
| 1 | 1.0 | | | | | | | | kmovw eax, k7
| 1 | | | | | | 1.0 | | | kmovb k7, eax
| 1 | 1.0 | | | | | | | | kandb k7, k6, k7
| 1 | | | | | | 1.0 | | | kmovb k6, edx
| 1 | 1.0 | | | | | | | | kmovb edx, k7
| 1 | | | | | | 1.0 | | | kmovw k7, edx
| 1 | 1.0 | | | | | | | | kmovw edx, k0
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm11{k7}, zmm18, zmm23
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm4{k7}, zmm18, zmm24
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm5{k7}, zmm18, zmm25
| 1 | 1.0 | | | | | | | | vmulpd zmm23, zmm15, zmm19
| 1 | | | | | | 1.0 | | | vfmsub213pd zmm19, zmm15, zmm1
| 1 | 1.0 | | | | | | | | vmulpd zmm15, zmm19, zmm12
| 1 | | | | | | 1.0 | | | vmulpd zmm24, zmm23, zmm15
| 1 | 1.0 | | | | | | | | vmulpd zmm25, zmm10, zmm24
| 1 | 1.0 | | | | | | | | kmovb eax, k6
| 1 | | | | | | 1.0 | | | kmovb k6, eax
| 1 | | | | | | 1.0 | | | kmovb k0, edx
| 1 | 1.0 | | | | | | | | kandb k0, k6, k0
| 1 | 1.0 | | | | | | | | kmovb r12d, k0
| 1 | | | | | | 1.0 | | | kmovw k6, r12d
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm3{k6}, zmm25, zmm22
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm2{k6}, zmm25, zmm21
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm0{k6}, zmm25, zmm20
| 1* | | | | | | | | | cmp rsi, rdi
| 0*F | | | | | | | | | jl 0xfffffffffffffc6f
Total Num Of Uops: 187
Analysis Notes:
Backend allocation was stalled due to unavailable allocation resources.

View File

@@ -1,152 +0,0 @@
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
Analyzed File - gromacs-icc-avx512-sp.o
Binary Format - 64Bit
Architecture - SKX
Analysis Type - Throughput
Throughput Analysis Report
--------------------------
Block Throughput: 51.00 Cycles Throughput Bottleneck: Backend
Loop Count: 22
Port Binding In Cycles Per Iteration:
--------------------------------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
--------------------------------------------------------------------------------------------------
| Cycles | 47.5 0.0 | 9.0 | 11.0 11.0 | 11.0 8.0 | 3.0 | 47.5 | 9.0 | 0.0 |
--------------------------------------------------------------------------------------------------
DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3)
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion occurred
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
X - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
-----------------------------------------------------------------------------------------
| 1 | | | 1.0 1.0 | | | | | | mov edi, dword ptr [rcx+rax*4]
| 1* | | | | | | | | | mov r12d, r13d
| 1 | | | | | | | 1.0 | | movsxd rdi, edi
| 1 | | 1.0 | | | | | | | inc rax
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm10, zmmword ptr [rsp+0x140]
| 1 | | | | | | | 1.0 | | test edi, 0x7fffffff
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm11, zmmword ptr [rsp+0x100]
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm9, zmmword ptr [rsp+0xc0]
| 1 | | | | | | | 1.0 | | setz r12b
| 1 | | 1.0 | | | | | | | lea r14, ptr [rdi+rdi*2]
| 1 | | | | | | | 1.0 | | shl r14, 0x5
| 1* | | | | | | | | | mov r8d, r12d
| 1 | | 1.0 | | | | | | | neg r8d
| 1* | | | | | | | | | mov r11d, r12d
| 1 | | 1.0 | | | | | | | add r8d, 0xff
| 1 | | | | | | 1.0 | | | kmovw k0, r8d
| 1 | | 1.0 | | | | | | | lea r9d, ptr [r12+r12*2]
| 2 | 1.0 | | 1.0 1.0 | | | | | | vsubps zmm3, zmm13, zmmword ptr [r14+rbx*1+0x40]
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm4, zmm10, zmmword ptr [r14+rbx*1]
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm10, zmm11, zmmword ptr [r14+rbx*1+0x40]
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm5, zmm17, zmmword ptr [r14+rbx*1+0x20]
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm27, zmm19, zmmword ptr [r14+rbx*1]
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm8, zmm15, zmmword ptr [r14+rbx*1+0x20]
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm11, zmm0, zmmword ptr [r14+rbx*1+0x40]
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm7, zmm9, zmmword ptr [r14+rbx*1]
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm9, zmm14, zmmword ptr [r14+rbx*1+0x20]
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm29, zmm12, zmmword ptr [r14+rbx*1+0x40]
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm28, zmm16, zmmword ptr [r14+rbx*1+0x20]
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm25, zmm18, zmmword ptr [r14+rbx*1]
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm2, zmm3, zmm3
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm10, zmm10
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm11, zmm11
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm26, zmm29, zmm29
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm2, zmm5, zmm5
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30, zmm8, zmm8
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm1, zmm9, zmm9
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm26, zmm28, zmm28
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm2, zmm27, zmm27
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30, zmm4, zmm4
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm1, zmm7, zmm7
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm26, zmm25, zmm25
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm31, zmm2
| 1 | | | | | | 1.0 | | | vcmpps k7, zmm30, zmm24, 0x11
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm6, zmm30
| 1 | | | | | | 1.0 | | | vcmpps k3, zmm2, zmm24, 0x11
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm2, zmm1
| 1 | | | | | | 1.0 | | | vcmpps k5, zmm26, zmm24, 0x11
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm26, zmm26
| 1 | | | | | | 1.0 | | | vmulps zmm30, zmm31, zmm23
| 1 | 1.0 | | | | | | | | kandw k2, k0, k3
| 1 | | | | | | 1.0 | | | vcmpps k3, zmm1, zmm24, 0x11
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm31, zmm30
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm31, zmm1
| 1 | | | | | | | 1.0 | | neg r9d
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm1, zmm31, zmm20
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm31, zmm22
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm1, zmm31
| 1 | | | | | | | 1.0 | | add r9d, 0xff
| 1 | | | | | | 1.0 | | | kmovw k4, r9d
| 1 | 1.0 | | | | | | | | vmulps zmm30, zmm30, zmm31
| 1 | 1.0 | | | | | | | | kandw k1, k4, k5
| 1 | | | | | | 1.0 | | | vmulps zmm1, zmm21, zmm30
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm26, zmm23
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm26, zmm30
| 1 | | 1.0 | | | | | | | lea r10d, ptr [r12*8]
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm26, zmm31
| 1 | | | | | | | 1.0 | | neg r10d
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm31, zmm26, zmm20
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm26, zmm26, zmm22
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm31, zmm26
| 1 | | 1.0 | | | | | | | add r10d, r12d
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm30, zmm31
| 1 | | | | | | | 1.0 | | add r10d, 0xff
| 1 | | | | | | 1.0 | | | kmovw k6, r10d
| 1 | 1.0 | | | | | | | | vmulps zmm26, zmm21, zmm30
| 1 | 1.0 | | | | | | | | kandw k4, k6, k7
| 1 | | | | | | 1.0 | | | vmulps zmm25{k1}{z}, zmm25, zmm26
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31{k1}{z}, zmm28, zmm26
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm28, zmm6, zmm23
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30{k1}{z}, zmm29, zmm26
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm29, zmm2, zmm23
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm25{k2}, zmm27, zmm1
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm31{k2}, zmm5, zmm1
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm27, zmm6, zmm28
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30{k2}, zmm3, zmm1
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm2, zmm29
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm5, zmm6, zmm27
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm27, zmm6, zmm20
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm6, zmm6, zmm22
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm3, zmm2, zmm1
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm1, zmm2, zmm20
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm2, zmm2, zmm22
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm26, zmm27, zmm6
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm1, zmm2
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm5, zmm5, zmm26
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm3, zmm3, zmm1
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm6, zmm21, zmm5
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm27, zmm21, zmm3
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm25{k4}, zmm4, zmm6
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm4, zmmword ptr [r14+rsi*1]
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm31{k4}, zmm8, zmm6
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30{k4}, zmm10, zmm6
| 1 | | | | | | | 1.0 | | shl r11d, 0x4
| 1 | | 1.0 | | | | | | | sub r12d, r11d
| 1 | | 1.0 | | | | | | | add r12d, 0xff
| 1 | | | | | | 1.0 | | | kmovw k0, r12d
| 1 | 1.0 | | | | | | | | kandw k5, k0, k3
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm25{k5}, zmm7, zmm27
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm31{k5}, zmm9, zmm27
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30{k5}, zmm11, zmm27
| 1 | 0.5 | | | | | 0.5 | | | vsubps zmm7, zmm4, zmm25
| 2 | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [r14+rsi*1], zmm7
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm8, zmmword ptr [r14+rsi*1+0x20]
| 1 | 0.5 | | | | | 0.5 | | | vsubps zmm4, zmm8, zmm31
| 2 | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [r14+rsi*1+0x20], zmm4
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm1, zmmword ptr [r14+rsi*1+0x40]
| 1 | 0.5 | | | | | 0.5 | | | vsubps zmm2, zmm1, zmm30
| 2 | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [r14+rsi*1+0x40], zmm2
| 1* | | | | | | | | | cmp rax, rdx
| 0*F | | | | | | | | | jb 0xfffffffffffffd30
Total Num Of Uops: 142
Analysis Notes:
Backend allocation was stalled due to unavailable allocation resources.

View File

@@ -1,154 +0,0 @@
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
Analyzed File - gromacs-icx-avx512-dp.o
Binary Format - 64Bit
Architecture - SKX
Analysis Type - Throughput
Throughput Analysis Report
--------------------------
Block Throughput: 49.26 Cycles Throughput Bottleneck: Backend
Loop Count: 22
Port Binding In Cycles Per Iteration:
--------------------------------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
--------------------------------------------------------------------------------------------------
| Cycles | 44.0 0.0 | 13.5 | 5.5 5.5 | 5.5 5.5 | 0.0 | 44.0 | 13.5 | 0.0 |
--------------------------------------------------------------------------------------------------
DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3)
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion occurred
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
X - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
-----------------------------------------------------------------------------------------
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | movsxd rcx, dword ptr [r10+rbx*4]
| 1 | | 1.0 | | | | | | | lea rdx, ptr [rcx+rcx*2]
| 1 | | | | | | | 1.0 | | shl rdx, 0x6
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm28, zmmword ptr [rsi+rdx*1]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm29, zmmword ptr [rsi+rdx*1+0x40]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm30, zmmword ptr [rsi+rdx*1+0x80]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm3, zmmword ptr [rsp+0x10]
| 1 | | | | | | 1.0 | | | vsubpd zmm3, zmm3, zmm28
| 1 | 1.0 | | | | | | | | vsubpd zmm31, zmm24, zmm30
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm16, zmmword ptr [rsp+0x150]
| 1 | | | | | | 1.0 | | | vsubpd zmm16, zmm16, zmm29
| 1 | 1.0 | | | | | | | | vmulpd zmm17, zmm31, zmm31
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm17, zmm16, zmm16
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm17, zmm3, zmm3
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm18, zmm17
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm21, zmm18
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm18, zmm19
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm18, zmm19
| 1 | | | | | | 1.0 | | | vaddpd zmm20, zmm19, zmm1
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm22, zmm18
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm18, zmm20
| 1 | 1.0 | | | | | | | | vsubpd zmm20, zmm25, zmm30
| 1 | | 1.0 | | | | | | | lea edx, ptr [rcx+rcx*1]
| 1 | | | | | | | 1.0 | | cmp r11, rdx
| 1 | | | | | | | 1.0 | | setnz dl
| 1 | | | | | | | 1.0 | | setz al
| 1 | | 1.0 | | | | | | | add ecx, ecx
| 1 | | 1.0 | | | | | | | inc ecx
| 1 | | 0.5 | | | | | 0.5 | | cmp r11, rcx
| 1 | | | | | | | 1.0 | | setz cl
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm19, zmm18
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm19, zmmword ptr [rsp+0x210]
| 1 | 1.0 | | | | | | | | vsubpd zmm19, zmm19, zmm28
| 1 | | | | | | | 1.0 | | setnz dil
| 1* | | | | | | | | | mov ebp, edi
| 1 | | | | | | | 1.0 | | shl bpl, 0x4
| 1 | | 1.0 | | | | | | | sub bpl, al
| 1 | | 1.0 | | | | | | | add bpl, 0xef
| 1 | | | | | | 1.0 | | | kmovd k1, ebp
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm17, zmm0, 0x1
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm17, zmmword ptr [rsp+0x110]
| 1 | 1.0 | | | | | | | | vsubpd zmm17, zmm17, zmm29
| 1 | | 1.0 | | | | | | | lea eax, ptr [rdx+rdx*1]
| 1* | | | | | | | | | mov ebp, edi
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm18, zmm2
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm14{k1}, zmm3, zmm18
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm20, zmm20
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm3, zmm17, zmm17
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm3, zmm19, zmm19
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm11{k1}, zmm16, zmm18
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm16, zmm3
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm7{k1}, zmm31, zmm18
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm21, zmm16
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm16, zmm18
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm16, zmm18
| 1 | | | | | | 1.0 | | | vaddpd zmm31, zmm18, zmm1
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm22, zmm16
| 1 | | | | | | 1.0 | | | vmulpd zmm16, zmm16, zmm31
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm31, zmmword ptr [rsp+0x1d0]
| 1 | 1.0 | | | | | | | | vsubpd zmm31, zmm31, zmm28
| 1 | | | | | | | 1.0 | | shl bpl, 0x5
| 1 | | 1.0 | | | | | | | or bpl, al
| 1 | | 0.5 | | | | | 0.5 | | or bpl, 0xdd
| 1 | | | | | | 1.0 | | | kmovd k1, ebp
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm3, zmm0, 0x1
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm3, zmmword ptr [rsp+0xd0]
| 1 | 1.0 | | | | | | | | vsubpd zmm3, zmm3, zmm29
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm18, zmm16
| 1 | | | | | | 1.0 | | | vsubpd zmm18, zmm26, zmm30
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm16, zmm2
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm15{k1}, zmm19, zmm16
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm18, zmm18
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm19, zmm3, zmm3
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm19, zmm31, zmm31
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm10{k1}, zmm17, zmm16
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm17, zmm19
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm6{k1}, zmm20, zmm16
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm21, zmm17
| 1 | | | | | | 1.0 | | | vmulpd zmm16, zmm17, zmm16
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm17, zmm16
| 1 | | | | | | 1.0 | | | vaddpd zmm20, zmm16, zmm1
| 1 | 1.0 | | | | | | | | vmulpd zmm17, zmm22, zmm17
| 1 | | | | | | 1.0 | | | vmulpd zmm17, zmm17, zmm20
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm16, zmm17
| 1 | | 1.0 | | | | | | | lea eax, ptr [rdx*4]
| 1 | | | | | | | 1.0 | | shl dil, 0x6
| 1 | | 0.5 | | | | | 0.5 | | or dil, al
| 1 | | 0.5 | | | | | 0.5 | | or dil, 0xbb
| 1 | | | | | | 1.0 | | | kmovd k1, edi
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm19, zmm0, 0x1
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm17, zmmword ptr [rsp+0x190]
| 1 | 1.0 | | | | | | | | vsubpd zmm17, zmm17, zmm28
| 1 | 1.0 | | | | | | | | vsubpd zmm19, zmm23, zmm29
| 1 | | | | | | 1.0 | | | vsubpd zmm20, zmm27, zmm30
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm16, zmm2
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm13{k1}, zmm31, zmm16
| 1 | 1.0 | | | | | | | | vmulpd zmm28, zmm20, zmm20
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm28, zmm19, zmm19
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm28, zmm17, zmm17
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm9{k1}, zmm3, zmm16
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm3, zmm28
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm5{k1}, zmm18, zmm16
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm21, zmm3
| 1 | | | | | | 1.0 | | | vmulpd zmm16, zmm3, zmm16
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm3, zmm16
| 1 | | | | | | 1.0 | | | vaddpd zmm18, zmm16, zmm1
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm22, zmm3
| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm3, zmm18
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm16, zmm3
| 1 | | | | | | | 1.0 | | shl dl, 0x3
| 1 | | | | | | | 1.0 | | shl cl, 0x7
| 1 | | 1.0 | | | | | | | or cl, dl
| 1 | | 1.0 | | | | | | | add cl, 0xf7
| 1 | | | | | | 1.0 | | | kmovd k1, ecx
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm28, zmm0, 0x1
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm3, zmm2
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12{k1}, zmm17, zmm3
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm8{k1}, zmm19, zmm3
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm4{k1}, zmm20, zmm3
| 1 | | 0.5 | | | | | 0.5 | | inc rbx
| 1* | | | | | | | | | cmp r9, rbx
| 0*F | | | | | | | | | jnz 0xfffffffffffffd5a
Total Num Of Uops: 129
Analysis Notes:
Backend allocation was stalled due to unavailable allocation resources.

View File

@@ -1,288 +0,0 @@
[0] Code Region
Iterations: 100
Instructions: 12200
Total Cycles: 4745
Total uOps: 14000
Dispatch Width: 6
uOps Per Cycle: 2.95
IPC: 2.57
Block RThroughput: 34.0
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 5 0.50 * movslq (%r10,%rbx,4), %rcx
1 1 0.50 leaq (%rcx,%rcx,2), %rdx
1 1 0.50 shlq $6, %rdx
2 8 0.50 * vmovupd (%rsi,%rdx), %zmm28
2 8 0.50 * vmovupd 64(%rsi,%rdx), %zmm29
2 8 0.50 * vmovupd 128(%rsi,%rdx), %zmm30
2 8 0.50 * vmovupd 16(%rsp), %zmm3
1 4 0.50 vsubpd %zmm28, %zmm3, %zmm3
1 4 0.50 vsubpd %zmm30, %zmm24, %zmm31
2 8 0.50 * vmovupd 336(%rsp), %zmm16
1 4 0.50 vsubpd %zmm29, %zmm16, %zmm16
1 4 0.50 vmulpd %zmm31, %zmm31, %zmm17
1 4 0.50 vfmadd231pd %zmm16, %zmm16, %zmm17
1 4 0.50 vfmadd231pd %zmm3, %zmm3, %zmm17
3 4 2.00 vrcp14pd %zmm17, %zmm18
1 4 0.50 vmulpd %zmm18, %zmm21, %zmm19
1 4 0.50 vmulpd %zmm19, %zmm18, %zmm19
1 4 0.50 vmulpd %zmm19, %zmm18, %zmm19
1 4 0.50 vaddpd %zmm1, %zmm19, %zmm20
1 4 0.50 vmulpd %zmm18, %zmm22, %zmm18
1 4 0.50 vmulpd %zmm20, %zmm18, %zmm18
1 4 0.50 vsubpd %zmm30, %zmm25, %zmm20
1 1 0.50 leal (%rcx,%rcx), %edx
1 1 0.25 cmpq %rdx, %r11
1 1 0.50 setne %dl
1 1 0.50 sete %al
1 1 0.25 addl %ecx, %ecx
1 1 0.25 incl %ecx
1 1 0.25 cmpq %rcx, %r11
1 1 0.50 sete %cl
1 4 0.50 vmulpd %zmm18, %zmm19, %zmm18
2 8 0.50 * vmovupd 528(%rsp), %zmm19
1 4 0.50 vsubpd %zmm28, %zmm19, %zmm19
1 1 0.50 setne %dil
1 1 0.25 movl %edi, %ebp
1 1 0.50 shlb $4, %bpl
1 1 0.25 subb %al, %bpl
1 1 0.25 addb $-17, %bpl
1 1 1.00 kmovd %ebp, %k1
1 4 1.00 vcmpltpd %zmm0, %zmm17, %k1 {%k1}
2 8 0.50 * vmovupd 272(%rsp), %zmm17
1 4 0.50 vsubpd %zmm29, %zmm17, %zmm17
1 1 0.50 leal (%rdx,%rdx), %eax
1 1 0.25 movl %edi, %ebp
1 4 0.50 vmulpd %zmm2, %zmm18, %zmm18
1 4 0.50 vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1}
1 4 0.50 vmulpd %zmm20, %zmm20, %zmm3
1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm3
1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm3
1 4 0.50 vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1}
3 4 2.00 vrcp14pd %zmm3, %zmm16
1 4 0.50 vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1}
1 4 0.50 vmulpd %zmm16, %zmm21, %zmm18
1 4 0.50 vmulpd %zmm18, %zmm16, %zmm18
1 4 0.50 vmulpd %zmm18, %zmm16, %zmm18
1 4 0.50 vaddpd %zmm1, %zmm18, %zmm31
1 4 0.50 vmulpd %zmm16, %zmm22, %zmm16
1 4 0.50 vmulpd %zmm31, %zmm16, %zmm16
2 8 0.50 * vmovupd 464(%rsp), %zmm31
1 4 0.50 vsubpd %zmm28, %zmm31, %zmm31
1 1 0.50 shlb $5, %bpl
1 1 0.25 orb %al, %bpl
1 1 0.25 orb $-35, %bpl
1 1 1.00 kmovd %ebp, %k1
1 4 1.00 vcmpltpd %zmm0, %zmm3, %k1 {%k1}
2 8 0.50 * vmovupd 208(%rsp), %zmm3
1 4 0.50 vsubpd %zmm29, %zmm3, %zmm3
1 4 0.50 vmulpd %zmm16, %zmm18, %zmm16
1 4 0.50 vsubpd %zmm30, %zmm26, %zmm18
1 4 0.50 vmulpd %zmm2, %zmm16, %zmm16
1 4 0.50 vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1}
1 4 0.50 vmulpd %zmm18, %zmm18, %zmm19
1 4 0.50 vfmadd231pd %zmm3, %zmm3, %zmm19
1 4 0.50 vfmadd231pd %zmm31, %zmm31, %zmm19
1 4 0.50 vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1}
3 4 2.00 vrcp14pd %zmm19, %zmm17
1 4 0.50 vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1}
1 4 0.50 vmulpd %zmm17, %zmm21, %zmm16
1 4 0.50 vmulpd %zmm16, %zmm17, %zmm16
1 4 0.50 vmulpd %zmm16, %zmm17, %zmm16
1 4 0.50 vaddpd %zmm1, %zmm16, %zmm20
1 4 0.50 vmulpd %zmm17, %zmm22, %zmm17
1 4 0.50 vmulpd %zmm20, %zmm17, %zmm17
1 4 0.50 vmulpd %zmm17, %zmm16, %zmm16
1 1 0.50 leal (,%rdx,4), %eax
1 1 0.50 shlb $6, %dil
1 1 0.25 orb %al, %dil
1 1 0.25 orb $-69, %dil
1 1 1.00 kmovd %edi, %k1
1 4 1.00 vcmpltpd %zmm0, %zmm19, %k1 {%k1}
2 8 0.50 * vmovupd 400(%rsp), %zmm17
1 4 0.50 vsubpd %zmm28, %zmm17, %zmm17
1 4 0.50 vsubpd %zmm29, %zmm23, %zmm19
1 4 0.50 vsubpd %zmm30, %zmm27, %zmm20
1 4 0.50 vmulpd %zmm2, %zmm16, %zmm16
1 4 0.50 vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1}
1 4 0.50 vmulpd %zmm20, %zmm20, %zmm28
1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm28
1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm28
1 4 0.50 vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1}
3 4 2.00 vrcp14pd %zmm28, %zmm3
1 4 0.50 vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1}
1 4 0.50 vmulpd %zmm3, %zmm21, %zmm16
1 4 0.50 vmulpd %zmm16, %zmm3, %zmm16
1 4 0.50 vmulpd %zmm16, %zmm3, %zmm16
1 4 0.50 vaddpd %zmm1, %zmm16, %zmm18
1 4 0.50 vmulpd %zmm3, %zmm22, %zmm3
1 4 0.50 vmulpd %zmm18, %zmm3, %zmm3
1 4 0.50 vmulpd %zmm3, %zmm16, %zmm3
1 1 0.50 shlb $3, %dl
1 1 0.50 shlb $7, %cl
1 1 0.25 orb %dl, %cl
1 1 0.25 addb $-9, %cl
1 1 1.00 kmovd %ecx, %k1
1 4 1.00 vcmpltpd %zmm0, %zmm28, %k1 {%k1}
1 4 0.50 vmulpd %zmm2, %zmm3, %zmm3
1 4 0.50 vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1}
1 4 0.50 vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1}
1 4 0.50 vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1}
1 1 0.25 incq %rbx
1 1 0.25 cmpq %rbx, %r9
1 1 0.50 jne .LBB5_12
Resources:
[0] - SKXDivider
[1] - SKXFPDivider
[2] - SKXPort0
[3] - SKXPort1
[4] - SKXPort2
[5] - SKXPort3
[6] - SKXPort4
[7] - SKXPort5
[8] - SKXPort6
[9] - SKXPort7
Resource pressure per iteration:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
- - 45.53 20.45 5.50 5.50 - 44.64 18.38 -
Resource pressure by instruction:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
- - - - 0.50 0.50 - - - - movslq (%r10,%rbx,4), %rcx
- - - 0.99 - - - 0.01 - - leaq (%rcx,%rcx,2), %rdx
- - 0.01 - - - - - 0.99 - shlq $6, %rdx
- - 0.01 0.99 0.49 0.51 - - - - vmovupd (%rsi,%rdx), %zmm28
- - 0.01 0.91 0.51 0.49 - 0.08 - - vmovupd 64(%rsi,%rdx), %zmm29
- - 0.01 0.56 0.49 0.51 - 0.43 - - vmovupd 128(%rsi,%rdx), %zmm30
- - - 0.99 0.50 0.50 - 0.01 - - vmovupd 16(%rsp), %zmm3
- - 0.95 - - - - 0.05 - - vsubpd %zmm28, %zmm3, %zmm3
- - 0.48 - - - - 0.52 - - vsubpd %zmm30, %zmm24, %zmm31
- - - 1.00 0.50 0.50 - - - - vmovupd 336(%rsp), %zmm16
- - 0.49 - - - - 0.51 - - vsubpd %zmm29, %zmm16, %zmm16
- - 0.48 - - - - 0.52 - - vmulpd %zmm31, %zmm31, %zmm17
- - 0.49 - - - - 0.51 - - vfmadd231pd %zmm16, %zmm16, %zmm17
- - 0.04 - - - - 0.96 - - vfmadd231pd %zmm3, %zmm3, %zmm17
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm17, %zmm18
- - 1.00 - - - - - - - vmulpd %zmm18, %zmm21, %zmm19
- - 0.51 - - - - 0.49 - - vmulpd %zmm19, %zmm18, %zmm19
- - 0.49 - - - - 0.51 - - vmulpd %zmm19, %zmm18, %zmm19
- - 0.07 - - - - 0.93 - - vaddpd %zmm1, %zmm19, %zmm20
- - - - - - - 1.00 - - vmulpd %zmm18, %zmm22, %zmm18
- - 0.95 - - - - 0.05 - - vmulpd %zmm20, %zmm18, %zmm18
- - 0.92 - - - - 0.08 - - vsubpd %zmm30, %zmm25, %zmm20
- - - 0.94 - - - 0.06 - - leal (%rcx,%rcx), %edx
- - - - - - - - 1.00 - cmpq %rdx, %r11
- - - - - - - - 1.00 - setne %dl
- - 0.44 - - - - - 0.56 - sete %al
- - - 0.07 - - - 0.02 0.91 - addl %ecx, %ecx
- - - 0.53 - - - 0.46 0.01 - incl %ecx
- - - 0.51 - - - 0.46 0.03 - cmpq %rcx, %r11
- - 0.02 - - - - - 0.98 - sete %cl
- - 0.94 - - - - 0.06 - - vmulpd %zmm18, %zmm19, %zmm18
- - 0.01 0.99 0.51 0.49 - - - - vmovupd 528(%rsp), %zmm19
- - 0.47 - - - - 0.53 - - vsubpd %zmm28, %zmm19, %zmm19
- - 0.04 - - - - - 0.96 - setne %dil
- - - 0.95 - - - 0.02 0.03 - movl %edi, %ebp
- - 0.01 - - - - - 0.99 - shlb $4, %bpl
- - - 0.96 - - - - 0.04 - subb %al, %bpl
- - - 0.06 - - - - 0.94 - addb $-17, %bpl
- - - - - - - 1.00 - - kmovd %ebp, %k1
- - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm17, %k1 {%k1}
- - 0.02 0.97 0.50 0.50 - 0.01 - - vmovupd 272(%rsp), %zmm17
- - 0.96 - - - - 0.04 - - vsubpd %zmm29, %zmm17, %zmm17
- - - 1.00 - - - - - - leal (%rdx,%rdx), %eax
- - - 0.05 - - - - 0.95 - movl %edi, %ebp
- - 0.51 - - - - 0.49 - - vmulpd %zmm2, %zmm18, %zmm18
- - 0.53 - - - - 0.47 - - vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1}
- - 0.45 - - - - 0.55 - - vmulpd %zmm20, %zmm20, %zmm3
- - 0.94 - - - - 0.06 - - vfmadd231pd %zmm17, %zmm17, %zmm3
- - 0.03 - - - - 0.97 - - vfmadd231pd %zmm19, %zmm19, %zmm3
- - 0.47 - - - - 0.53 - - vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1}
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm3, %zmm16
- - 0.53 - - - - 0.47 - - vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1}
- - 0.99 - - - - 0.01 - - vmulpd %zmm16, %zmm21, %zmm18
- - 0.99 - - - - 0.01 - - vmulpd %zmm18, %zmm16, %zmm18
- - 0.97 - - - - 0.03 - - vmulpd %zmm18, %zmm16, %zmm18
- - 0.52 - - - - 0.48 - - vaddpd %zmm1, %zmm18, %zmm31
- - 0.01 - - - - 0.99 - - vmulpd %zmm16, %zmm22, %zmm16
- - 0.52 - - - - 0.48 - - vmulpd %zmm31, %zmm16, %zmm16
- - - 0.99 0.50 0.50 - 0.01 - - vmovupd 464(%rsp), %zmm31
- - 0.03 - - - - 0.97 - - vsubpd %zmm28, %zmm31, %zmm31
- - 0.01 - - - - - 0.99 - shlb $5, %bpl
- - - 0.94 - - - - 0.06 - orb %al, %bpl
- - - 0.04 - - - - 0.96 - orb $-35, %bpl
- - - - - - - 1.00 - - kmovd %ebp, %k1
- - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm3, %k1 {%k1}
- - - 0.99 0.50 0.50 - 0.01 - - vmovupd 208(%rsp), %zmm3
- - 0.95 - - - - 0.05 - - vsubpd %zmm29, %zmm3, %zmm3
- - 0.51 - - - - 0.49 - - vmulpd %zmm16, %zmm18, %zmm16
- - 0.01 - - - - 0.99 - - vsubpd %zmm30, %zmm26, %zmm18
- - 0.52 - - - - 0.48 - - vmulpd %zmm2, %zmm16, %zmm16
- - 1.00 - - - - - - - vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1}
- - 0.03 - - - - 0.97 - - vmulpd %zmm18, %zmm18, %zmm19
- - 0.06 - - - - 0.94 - - vfmadd231pd %zmm3, %zmm3, %zmm19
- - 0.50 - - - - 0.50 - - vfmadd231pd %zmm31, %zmm31, %zmm19
- - - - - - - 1.00 - - vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1}
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm19, %zmm17
- - 1.00 - - - - - - - vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1}
- - 0.07 - - - - 0.93 - - vmulpd %zmm17, %zmm21, %zmm16
- - 0.50 - - - - 0.50 - - vmulpd %zmm16, %zmm17, %zmm16
- - 0.09 - - - - 0.91 - - vmulpd %zmm16, %zmm17, %zmm16
- - 0.07 - - - - 0.93 - - vaddpd %zmm1, %zmm16, %zmm20
- - 0.93 - - - - 0.07 - - vmulpd %zmm17, %zmm22, %zmm17
- - 0.94 - - - - 0.06 - - vmulpd %zmm20, %zmm17, %zmm17
- - 0.51 - - - - 0.49 - - vmulpd %zmm17, %zmm16, %zmm16
- - - 1.00 - - - - - - leal (,%rdx,4), %eax
- - - - - - - - 1.00 - shlb $6, %dil
- - - 0.02 - - - - 0.98 - orb %al, %dil
- - - 0.48 - - - - 0.52 - orb $-69, %dil
- - - - - - - 1.00 - - kmovd %edi, %k1
- - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm19, %k1 {%k1}
- - - 1.00 0.50 0.50 - - - - vmovupd 400(%rsp), %zmm17
- - 0.49 - - - - 0.51 - - vsubpd %zmm28, %zmm17, %zmm17
- - 0.49 - - - - 0.51 - - vsubpd %zmm29, %zmm23, %zmm19
- - 0.02 - - - - 0.98 - - vsubpd %zmm30, %zmm27, %zmm20
- - 0.98 - - - - 0.02 - - vmulpd %zmm2, %zmm16, %zmm16
- - 0.50 - - - - 0.50 - - vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1}
- - 0.94 - - - - 0.06 - - vmulpd %zmm20, %zmm20, %zmm28
- - 0.04 - - - - 0.96 - - vfmadd231pd %zmm19, %zmm19, %zmm28
- - 0.07 - - - - 0.93 - - vfmadd231pd %zmm17, %zmm17, %zmm28
- - 0.50 - - - - 0.50 - - vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1}
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm28, %zmm3
- - 0.50 - - - - 0.50 - - vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1}
- - 1.00 - - - - - - - vmulpd %zmm3, %zmm21, %zmm16
- - 0.55 - - - - 0.45 - - vmulpd %zmm16, %zmm3, %zmm16
- - 0.99 - - - - 0.01 - - vmulpd %zmm16, %zmm3, %zmm16
- - 0.99 - - - - 0.01 - - vaddpd %zmm1, %zmm16, %zmm18
- - - - - - - 1.00 - - vmulpd %zmm3, %zmm22, %zmm3
- - 0.52 - - - - 0.48 - - vmulpd %zmm18, %zmm3, %zmm3
- - 0.99 - - - - 0.01 - - vmulpd %zmm3, %zmm16, %zmm3
- - - - - - - - 1.00 - shlb $3, %dl
- - - - - - - - 1.00 - shlb $7, %cl
- - - 1.00 - - - - - - orb %dl, %cl
- - - 0.52 - - - - 0.48 - addb $-9, %cl
- - - - - - - 1.00 - - kmovd %ecx, %k1
- - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm28, %k1 {%k1}
- - 0.98 - - - - 0.02 - - vmulpd %zmm2, %zmm3, %zmm3
- - 0.97 - - - - 0.03 - - vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1}
- - 0.03 - - - - 0.97 - - vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1}
- - 0.97 - - - - 0.03 - - vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1}
- - - 0.48 - - - - 0.52 - incq %rbx
- - - 0.52 - - - - 0.48 - cmpq %rbx, %r9
- - - - - - - - 1.00 - jne .LBB5_12

View File

@@ -1,167 +0,0 @@
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
Analyzed file: gromacs-icx-avx512-dp.s
Architecture: ICX
Timestamp: 2023-02-14 12:51:57
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD |
------------------------------------------------------------------------------------------------------------------------
2241 | | | | | | | | | | || | | # pointer_increment=64 da67166e5736661e6b03ea29ee7bfd67
2242 | | | | | | | | | | || | | # LLVM-MCA-BEGIN
2243 | | | | | | | | | | || | | .LBB5_12: # Parent Loop BB5_7 Depth=1
2244 | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2
2245 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | movslq (%r10,%rbx,4), %rcx
2246 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || 6.0 | | leaq (%rcx,%rcx,2), %rdx
2247 | 0.00 | | | | | | 1.00 | | | || 1.0 | | shlq $6, %rdx
2248 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd (%rsi,%rdx), %zmm28 # AlignMOV convert to UnAlignMOV
2249 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 64(%rsi,%rdx), %zmm29 # AlignMOV convert to UnAlignMOV
2250 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovupd 128(%rsi,%rdx), %zmm30 # AlignMOV convert to UnAlignMOV
2251 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 16(%rsp), %zmm3 # 64-byte Reload
2252 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm28, %zmm3, %zmm3
2253 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vsubpd %zmm30, %zmm24, %zmm31
2254 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 336(%rsp), %zmm16 # 64-byte Reload
2255 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm16, %zmm16
2256 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm31, %zmm31, %zmm17
2257 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm16, %zmm16, %zmm17 # zmm17 = (zmm16 * zmm16) + zmm17
2258 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm3, %zmm3, %zmm17 # zmm17 = (zmm3 * zmm3) + zmm17
2259 | 2.50 | | | | | 0.500 | | | | || 6.0 | | vrcp14pd %zmm17, %zmm18
2260 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm18, %zmm21, %zmm19
2261 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm19, %zmm18, %zmm19
2262 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm19, %zmm18, %zmm19
2263 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vaddpd %zmm1, %zmm19, %zmm20
2264 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm18, %zmm22, %zmm18
2265 | 0.75 | | | | | 0.250 | | | | || 4.0 | | vmulpd %zmm20, %zmm18, %zmm18
2266 | 1.00 | | | | | 0.000 | | | | || | | vsubpd %zmm30, %zmm25, %zmm20
2267 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | leal (%rcx,%rcx), %edx
2268 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | cmpq %rdx, %r11
2269 | 0.00 | | | | | | 1.00 | | | || | | setne %dl
2270 | 0.00 | | | | | | 1.00 | | | || | | sete %al
2271 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | addl %ecx, %ecx
2272 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | incl %ecx
2273 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | cmpq %rcx, %r11
2274 | 0.00 | | | | | | 1.00 | | | || | | sete %cl
2275 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm18, %zmm19, %zmm18
2276 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 528(%rsp), %zmm19 # 64-byte Reload
2277 | 1.00 | | | | | 0.000 | | | | || | | vsubpd %zmm28, %zmm19, %zmm19
2278 | 0.00 | | | | | | 1.00 | | | || | | setne %dil
2279 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | movl %edi, %ebp
2280 | 0.00 | | | | | | 1.00 | | | || | 1.0 | shlb $4, %bpl
2281 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | 1.0 | subb %al, %bpl
2282 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | 1.0 | addb $-17, %bpl
2283 | 1.00 | | | | | | | | | || | | kmovd %ebp, %k1
2284 | 0.50 | | | | | 0.500 | | | | || | | vcmpltpd %zmm0, %zmm17, %k1 {%k1}
2285 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 272(%rsp), %zmm17 # 64-byte Reload
2286 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm17, %zmm17
2287 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | leal (%rdx,%rdx), %eax
2288 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | movl %edi, %ebp
2289 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm2, %zmm18, %zmm18
2290 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14
2291 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm20, %zmm20, %zmm3
2292 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm17, %zmm17, %zmm3 # zmm3 = (zmm17 * zmm17) + zmm3
2293 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm19, %zmm19, %zmm3 # zmm3 = (zmm19 * zmm19) + zmm3
2294 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11
2295 | 2.50 | | | | | 0.500 | | | | || | | vrcp14pd %zmm3, %zmm16
2296 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7
2297 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm21, %zmm18
2298 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm18, %zmm16, %zmm18
2299 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm18, %zmm16, %zmm18
2300 | 0.50 | | | | | 0.500 | | | | || | | vaddpd %zmm1, %zmm18, %zmm31
2301 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm22, %zmm16
2302 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm31, %zmm16, %zmm16
2303 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 464(%rsp), %zmm31 # 64-byte Reload
2304 | 0.75 | | | | | 0.250 | | | | || | | vsubpd %zmm28, %zmm31, %zmm31
2305 | 0.00 | | | | | | 1.00 | | | || | 1.0 | shlb $5, %bpl
2306 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | 1.0 | orb %al, %bpl
2307 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | 1.0 | orb $-35, %bpl
2308 | 1.00 | | | | | | | | | || | | kmovd %ebp, %k1
2309 | 0.50 | | | | | 0.500 | | | | || | | vcmpltpd %zmm0, %zmm3, %k1 {%k1}
2310 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 208(%rsp), %zmm3 # 64-byte Reload
2311 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm3, %zmm3
2312 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm18, %zmm16
2313 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm30, %zmm26, %zmm18
2314 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm2, %zmm16, %zmm16
2315 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15
2316 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm18, %zmm18, %zmm19
2317 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm3, %zmm3, %zmm19 # zmm19 = (zmm3 * zmm3) + zmm19
2318 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm31, %zmm31, %zmm19 # zmm19 = (zmm31 * zmm31) + zmm19
2319 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10
2320 | 2.50 | | | | | 0.500 | | | | || | | vrcp14pd %zmm19, %zmm17
2321 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6
2322 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm17, %zmm21, %zmm16
2323 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm17, %zmm16
2324 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm17, %zmm16
2325 | 0.50 | | | | | 0.500 | | | | || | | vaddpd %zmm1, %zmm16, %zmm20
2326 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm17, %zmm22, %zmm17
2327 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm20, %zmm17, %zmm17
2328 | 0.75 | | | | | 0.250 | | | | || | | vmulpd %zmm17, %zmm16, %zmm16
2329 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | leal (,%rdx,4), %eax
2330 | 0.00 | | | | | | 1.00 | | | || | | shlb $6, %dil
2331 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | orb %al, %dil
2332 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | orb $-69, %dil
2333 | 1.00 | | | | | | | | | || | | kmovd %edi, %k1
2334 | 0.50 | | | | | 0.500 | | | | || | | vcmpltpd %zmm0, %zmm19, %k1 {%k1}
2335 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 400(%rsp), %zmm17 # 64-byte Reload
2336 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm28, %zmm17, %zmm17
2337 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm23, %zmm19
2338 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm30, %zmm27, %zmm20
2339 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm2, %zmm16, %zmm16
2340 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13
2341 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm20, %zmm20, %zmm28
2342 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm19, %zmm19, %zmm28 # zmm28 = (zmm19 * zmm19) + zmm28
2343 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm17, %zmm17, %zmm28 # zmm28 = (zmm17 * zmm17) + zmm28
2344 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9
2345 | 2.00 | | | | | 1.000 | | | | || | | vrcp14pd %zmm28, %zmm3
2346 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5
2347 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm3, %zmm21, %zmm16
2348 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm16, %zmm3, %zmm16
2349 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm16, %zmm3, %zmm16
2350 | 0.00 | | | | | 1.000 | | | | || | | vaddpd %zmm1, %zmm16, %zmm18
2351 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm3, %zmm22, %zmm3
2352 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm18, %zmm3, %zmm3
2353 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm3, %zmm16, %zmm3
2354 | 0.00 | | | | | | 1.00 | | | || | | shlb $3, %dl
2355 | 0.00 | | | | | | 1.00 | | | || | | shlb $7, %cl
2356 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | orb %dl, %cl
2357 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | addb $-9, %cl
2358 | 1.00 | | | | | | | | | || | | kmovd %ecx, %k1
2359 | 0.00 | | | | | 1.000 | | | | || | | vcmpltpd %zmm0, %zmm28, %k1 {%k1}
2360 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm2, %zmm3, %zmm3
2361 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12
2362 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8
2363 | 0.24 | | | | | 0.760 | | | | || | | vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4
2364 | 0.00 | 1.00 | | | | -0.01 | 0.00 | | | || | | incq %rbx
2365 | 0.00 | 1.00 | | | | -0.01 | 0.00 | | | || | | cmpq %rbx, %r9
2366 | | | | | | | | | | || | | * jne .LBB5_12
2367 | | | | | | | | | | || | | # LLVM-MCA-END
44.0 15.0 5.50 5.50 5.50 5.50 43.99 15.0 71 6.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
2280 | 6.0 | shlb $4, %bpl | [2280, 2281, 2282, 2305, 2306, 2307]
2363 | 4.0 | vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4| [2363]
2362 | 4.0 | vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8| [2362]
2361 | 4.0 | vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12| [2361]
2346 | 4.0 | vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5| [2346]
2344 | 4.0 | vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9| [2344]
2340 | 4.0 | vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13| [2340]
2321 | 4.0 | vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6| [2321]
2319 | 4.0 | vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10| [2319]
2315 | 4.0 | vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15| [2315]
2296 | 4.0 | vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7| [2296]
2294 | 4.0 | vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11| [2294]
2290 | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14| [2290]
2330 | 3.0 | shlb $6, %dil | [2330, 2331, 2332]
2364 | 1.0 | incq %rbx | [2364]

View File

@@ -1,167 +0,0 @@
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
Analyzed file: gromacs-icx-avx512-dp.s
Architecture: CSX
Timestamp: 2023-02-10 16:30:53
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
--------------------------------------------------------------------------------------------------
2241 | | | | | | | | || | | # pointer_increment=64 da67166e5736661e6b03ea29ee7bfd67
2242 | | | | | | | | || | | # LLVM-MCA-BEGIN
2243 | | | | | | | | || | | .LBB5_12: # Parent Loop BB5_7 Depth=1
2244 | | | | | | | | || | | # => This Inner Loop Header: Depth=2
2245 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | movslq (%r10,%rbx,4), %rcx
2246 | | 1.00 | | | | 0.00 | | || 1.0 | | leaq (%rcx,%rcx,2), %rdx
2247 | 0.00 | | | | | | 1.00 | || 1.0 | | shlq $6, %rdx
2248 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd (%rsi,%rdx), %zmm28 # AlignMOV convert to UnAlignMOV
2249 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 64(%rsi,%rdx), %zmm29 # AlignMOV convert to UnAlignMOV
2250 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovupd 128(%rsi,%rdx), %zmm30 # AlignMOV convert to UnAlignMOV
2251 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 16(%rsp), %zmm3 # 64-byte Reload
2252 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm28, %zmm3, %zmm3
2253 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm30, %zmm24, %zmm31
2254 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 336(%rsp), %zmm16 # 64-byte Reload
2255 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm16, %zmm16
2256 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm31, %zmm31, %zmm17
2257 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm16, %zmm16, %zmm17 # zmm17 = (zmm16 * zmm16) + zmm17
2258 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm3, %zmm3, %zmm17 # zmm17 = (zmm3 * zmm3) + zmm17
2259 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm17, %zmm18
2260 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm18, %zmm21, %zmm19
2261 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm19, %zmm18, %zmm19
2262 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm19, %zmm18, %zmm19
2263 | 0.50 | | | | | 0.50 | | || 4.0 | | vaddpd %zmm1, %zmm19, %zmm20
2264 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm22, %zmm18
2265 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm18, %zmm18
2266 | 1.00 | | | | | 0.00 | | || | | vsubpd %zmm30, %zmm25, %zmm20
2267 | | 1.00 | | | | 0.00 | | || | | leal (%rcx,%rcx), %edx
2268 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rdx, %r11
2269 | 0.00 | | | | | | 1.00 | || | | setne %dl
2270 | 0.00 | | | | | | 1.00 | || | | sete %al
2271 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | addl %ecx, %ecx
2272 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | incl %ecx
2273 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rcx, %r11
2274 | 0.00 | | | | | | 1.00 | || | | sete %cl
2275 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm18, %zmm19, %zmm18
2276 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 528(%rsp), %zmm19 # 64-byte Reload
2277 | 1.00 | | | | | 0.00 | | || | | vsubpd %zmm28, %zmm19, %zmm19
2278 | 0.00 | | | | | | 1.00 | || | | setne %dil
2279 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %edi, %ebp
2280 | 0.00 | | | | | | 1.00 | || | 1.0 | shlb $4, %bpl
2281 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | 1.0 | subb %al, %bpl
2282 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | 1.0 | addb $-17, %bpl
2283 | 1.00 | | | | | | | || | | kmovd %ebp, %k1
2284 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm17, %k1 {%k1}
2285 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 272(%rsp), %zmm17 # 64-byte Reload
2286 | 0.25 | | | | | 0.75 | | || | | vsubpd %zmm29, %zmm17, %zmm17
2287 | | 1.00 | | | | 0.00 | | || | | leal (%rdx,%rdx), %eax
2288 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %edi, %ebp
2289 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm2, %zmm18, %zmm18
2290 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14
2291 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm20, %zmm3
2292 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm17, %zmm17, %zmm3 # zmm3 = (zmm17 * zmm17) + zmm3
2293 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm19, %zmm19, %zmm3 # zmm3 = (zmm19 * zmm19) + zmm3
2294 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11
2295 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm3, %zmm16
2296 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7
2297 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm21, %zmm18
2298 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm16, %zmm18
2299 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm16, %zmm18
2300 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm1, %zmm18, %zmm31
2301 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm22, %zmm16
2302 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm31, %zmm16, %zmm16
2303 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 464(%rsp), %zmm31 # 64-byte Reload
2304 | 0.75 | | | | | 0.25 | | || | | vsubpd %zmm28, %zmm31, %zmm31
2305 | 0.00 | | | | | | 1.00 | || | 1.0 | shlb $5, %bpl
2306 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | 1.0 | orb %al, %bpl
2307 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | 1.0 | orb $-35, %bpl
2308 | 1.00 | | | | | | | || | | kmovd %ebp, %k1
2309 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm3, %k1 {%k1}
2310 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 208(%rsp), %zmm3 # 64-byte Reload
2311 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm3, %zmm3
2312 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm18, %zmm16
2313 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm26, %zmm18
2314 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm2, %zmm16, %zmm16
2315 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15
2316 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm18, %zmm19
2317 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm3, %zmm3, %zmm19 # zmm19 = (zmm3 * zmm3) + zmm19
2318 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm31, %zmm31, %zmm19 # zmm19 = (zmm31 * zmm31) + zmm19
2319 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10
2320 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm19, %zmm17
2321 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6
2322 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm21, %zmm16
2323 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm17, %zmm16
2324 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm17, %zmm16
2325 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm1, %zmm16, %zmm20
2326 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm22, %zmm17
2327 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm17, %zmm17
2328 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm16, %zmm16
2329 | | 1.00 | | | | 0.00 | | || | | leal (,%rdx,4), %eax
2330 | 0.00 | | | | | | 1.00 | || | | shlb $6, %dil
2331 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | orb %al, %dil
2332 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | orb $-69, %dil
2333 | 1.00 | | | | | | | || | | kmovd %edi, %k1
2334 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm19, %k1 {%k1}
2335 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 400(%rsp), %zmm17 # 64-byte Reload
2336 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm28, %zmm17, %zmm17
2337 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm23, %zmm19
2338 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm27, %zmm20
2339 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm2, %zmm16, %zmm16
2340 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13
2341 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm20, %zmm28
2342 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm19, %zmm19, %zmm28 # zmm28 = (zmm19 * zmm19) + zmm28
2343 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm17, %zmm17, %zmm28 # zmm28 = (zmm17 * zmm17) + zmm28
2344 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9
2345 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm28, %zmm3
2346 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5
2347 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm21, %zmm16
2348 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm16, %zmm3, %zmm16
2349 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm16, %zmm3, %zmm16
2350 | 0.00 | | | | | 1.00 | | || | | vaddpd %zmm1, %zmm16, %zmm18
2351 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm22, %zmm3
2352 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm18, %zmm3, %zmm3
2353 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm16, %zmm3
2354 | 0.00 | | | | | | 1.00 | || | | shlb $3, %dl
2355 | 0.00 | | | | | | 1.00 | || | | shlb $7, %cl
2356 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | | orb %dl, %cl
2357 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | addb $-9, %cl
2358 | 1.00 | | | | | | | || | | kmovd %ecx, %k1
2359 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm28, %k1 {%k1}
2360 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm2, %zmm3, %zmm3
2361 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12
2362 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8
2363 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4
2364 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | incq %rbx
2365 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rbx, %r9
2366 | | | | | | | | || | | * jne .LBB5_12
2367 | | | | | | | | || | | # LLVM-MCA-END
44.0 15.0 5.50 5.50 5.50 5.50 44.0 15.0 66.0 6.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
2280 | 6.0 | shlb $4, %bpl | [2280, 2281, 2282, 2305, 2306, 2307]
2363 | 4.0 | vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4| [2363]
2362 | 4.0 | vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8| [2362]
2361 | 4.0 | vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12| [2361]
2346 | 4.0 | vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5| [2346]
2344 | 4.0 | vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9| [2344]
2340 | 4.0 | vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13| [2340]
2321 | 4.0 | vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6| [2321]
2319 | 4.0 | vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10| [2319]
2315 | 4.0 | vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15| [2315]
2296 | 4.0 | vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7| [2296]
2294 | 4.0 | vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11| [2294]
2290 | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14| [2290]
2330 | 3.0 | shlb $6, %dil | [2330, 2331, 2332]
2364 | 1.0 | incq %rbx | [2364]

View File

@@ -1,162 +0,0 @@
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
Analyzed File - gromacs-icx-avx512-sp.o
Binary Format - 64Bit
Architecture - SKX
Analysis Type - Throughput
Throughput Analysis Report
--------------------------
Block Throughput: 64.00 Cycles Throughput Bottleneck: Backend
Loop Count: 22
Port Binding In Cycles Per Iteration:
--------------------------------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
--------------------------------------------------------------------------------------------------
| Cycles | 50.0 0.0 | 7.0 | 9.5 8.1 | 9.5 7.9 | 3.0 | 50.0 | 7.0 | 0.0 |
--------------------------------------------------------------------------------------------------
DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3)
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion occurred
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
X - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
-----------------------------------------------------------------------------------------
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | movsxd rax, dword ptr [r11+rdx*4]
| 1* | | | | | | | | | mov rsi, rax
| 1 | | | | | | | 1.0 | | shl rsi, 0x5
| 1 | | 1.0 | | | | | | | lea rbx, ptr [rsi+rsi*2]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm15, zmmword ptr [rdi+rbx*1]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm16, zmmword ptr [rdi+rbx*1+0x20]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm27, zmmword ptr [rdi+rbx*1+0x40]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x80]
| 1 | | | | | | 1.0 | | | vsubps zmm24, zmm1, zmm15
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x140]
| 1 | 1.0 | | | | | | | | vsubps zmm25, zmm1, zmm16
| 1 | | | | | | 1.0 | | | vsubps zmm26, zmm9, zmm27
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp]
| 1 | 1.0 | | | | | | | | vsubps zmm21, zmm1, zmm15
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x100]
| 1 | | | | | | 1.0 | | | vsubps zmm22, zmm1, zmm16
| 1 | 1.0 | | | | | | | | vsubps zmm23, zmm10, zmm27
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x1c0]
| 1 | | | | | | 1.0 | | | vsubps zmm17, zmm1, zmm15
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0xc0]
| 1 | 1.0 | | | | | | | | vsubps zmm19, zmm1, zmm16
| 1 | | | | | | 1.0 | | | vsubps zmm20, zmm11, zmm27
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x180]
| 1 | 1.0 | | | | | | | | vsubps zmm18, zmm1, zmm15
| 1 | | | | | | 1.0 | | | vsubps zmm16, zmm8, zmm16
| 1 | 1.0 | | | | | | | | vsubps zmm15, zmm12, zmm27
| 1 | | | | | | 1.0 | | | vmulps zmm27, zmm26, zmm26
| 1 | 1.0 | | | | | | | | vfmadd231ps zmm27, zmm25, zmm25
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm27, zmm24, zmm24
| 1 | 1.0 | | | | | | | | vmulps zmm28, zmm23, zmm23
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm28, zmm22, zmm22
| 1 | 1.0 | | | | | | | | vfmadd231ps zmm28, zmm21, zmm21
| 1 | | | | | | 1.0 | | | vmulps zmm29, zmm20, zmm20
| 1 | 1.0 | | | | | | | | vfmadd231ps zmm29, zmm19, zmm19
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm29, zmm17, zmm17
| 1 | 1.0 | | | | | | | | vmulps zmm30, zmm15, zmm15
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm30, zmm16, zmm16
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm31, zmm27
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm1, zmm28
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm2, zmm29
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm30, zmm18, zmm18
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm3, zmm30
| 1 | | | | | | 1.0 | | | vmulps zmm4, zmm6, zmm31
| 1 | | | | | | 1.0 | | | vmulps zmm4, zmm31, zmm4
| 1 | | | | | | 1.0 | | | vmulps zmm4, zmm31, zmm4
| 1 | 1.0 | | | | | | | | vaddps zmm5, zmm4, zmm13
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm7, zmm31
| 1 | 1.0 | | | | | | | | vmulps zmm5, zmm31, zmm5
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm6, zmm1
| 1 | 1.0 | | | | | | | | vmulps zmm31, zmm1, zmm31
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm1, zmm31
| 1 | 1.0 | | | | | | | | vmulps zmm4, zmm4, zmm5
| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm31, zmm13
| 1 | 1.0 | | | | | | | | vmulps zmm1, zmm7, zmm1
| 1 | | | | | | 1.0 | | | vmulps zmm1, zmm1, zmm5
| 1 | 1.0 | | | | | | | | vmulps zmm5, zmm6, zmm2
| 1 | | | | | | 1.0 | | | vmulps zmm5, zmm2, zmm5
| 1 | 1.0 | | | | | | | | vmulps zmm5, zmm2, zmm5
| 1 | | | | | | 1.0 | | | vmulps zmm1, zmm31, zmm1
| 1 | 1.0 | | | | | | | | vaddps zmm31, zmm5, zmm13
| 1 | | | | | | 1.0 | | | vmulps zmm2, zmm7, zmm2
| 1 | 1.0 | | | | | | | | vmulps zmm2, zmm2, zmm31
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm6, zmm3
| 1 | 1.0 | | | | | | | | vmulps zmm31, zmm3, zmm31
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm3, zmm31
| 1 | 1.0 | | | | | | | | vmulps zmm2, zmm5, zmm2
| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm31, zmm13
| 1 | 1.0 | | | | | | | | vmulps zmm3, zmm7, zmm3
| 1 | | | | | | 1.0 | | | vmulps zmm3, zmm3, zmm5
| 1 | 1.0 | | | | | | | | vmulps zmm3, zmm31, zmm3
| 1* | | | | | | | | | xor esi, esi
| 1* | | | | | | | | | xor edi, edi
| 1 | | 1.0 | | | | | | | test eax, 0x7fffffff
| 1 | | | | | | | 1.0 | | setz sil
| 1 | | | | | | | 1.0 | | setnz dil
| 1 | | 1.0 | | | | | | | mov eax, 0xff
| 1 | | | | | | | 1.0 | | cmovz eax, r8d
| 1 | | 1.0 | | | | | | | mov ecx, 0xff
| 1 | | | | | | | 1.0 | | cmovz ecx, r9d
| 1 | | 1.0 | | | | | | | xor esi, 0xff
| 1 | | | | | | 1.0 | | | kmovd k1, esi
| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm27, zmm0, 0x1
| 1 | 1.0 | | | | | | | | vmulps zmm4, zmm4, zmm14
| 1 | 1.0 | | | | | | | | vmulps zmm5{k1}{z}, zmm24, zmm4
| 1 | | | | | | 1.0 | | | vmulps zmm24{k1}{z}, zmm25, zmm4
| 1 | 1.0 | | | | | | | | vmulps zmm4{k1}{z}, zmm26, zmm4
| 1 | | 1.0 | | | | | | | lea esi, ptr [rdi+rdi*2]
| 1 | | | | | | | 1.0 | | or esi, 0xfc
| 1 | | | | | | 1.0 | | | kmovd k1, esi
| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm28, zmm0, 0x1
| 1 | 1.0 | | | | | | | | vmulps zmm1, zmm1, zmm14
| 1 | 1.0 | | | | | | | | vmulps zmm21{k1}{z}, zmm21, zmm1
| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm5, zmm21
| 1 | 1.0 | | | | | | | | vmulps zmm21{k1}{z}, zmm22, zmm1
| 1 | | | | | | 1.0 | | | vaddps zmm21, zmm24, zmm21
| 1 | 1.0 | | | | | | | | vmulps zmm1{k1}{z}, zmm23, zmm1
| 1 | | | | | | 1.0 | | | vaddps zmm1, zmm4, zmm1
| 1 | | | | | | 1.0 | | | kmovd k1, eax
| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm29, zmm0, 0x1
| 1 | 1.0 | | | | | | | | vmulps zmm2, zmm2, zmm14
| 1 | 1.0 | | | | | | | | vmulps zmm4{k1}{z}, zmm17, zmm2
| 1 | 1.0 | | | | | | | | vmulps zmm17{k1}{z}, zmm19, zmm2
| 1 | | | | | | 1.0 | | | vmulps zmm2{k1}{z}, zmm20, zmm2
| 1 | | | | | | 1.0 | | | kmovd k1, ecx
| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm30, zmm0, 0x1
| 1 | 1.0 | | | | | | | | vmulps zmm3, zmm3, zmm14
| 1 | 1.0 | | | | | | | | vmulps zmm18{k1}{z}, zmm18, zmm3
| 1 | 1.0 | | | | | | | | vaddps zmm4, zmm4, zmm18
| 1 | | | | | | 1.0 | | | vaddps zmm4, zmm5, zmm4
| 1 | 1.0 | | | | | | | | vmulps zmm5{k1}{z}, zmm16, zmm3
| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm17, zmm5
| 1 | 1.0 | | | | | | | | vaddps zmm5, zmm21, zmm5
| 1 | | | | | | 1.0 | | | vmulps zmm3{k1}{z}, zmm15, zmm3
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov rax, qword ptr [r15+0xb0]
| 1 | 1.0 | | | | | | | | vaddps zmm2, zmm2, zmm3
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm3, zmmword ptr [rax+rbx*1]
| 1 | | | | | | 1.0 | | | vsubps zmm3, zmm3, zmm4
| 2 | | | 0.5 | 0.5 | 1.0 | | | | vmovups zmmword ptr [rax+rbx*1], zmm3
| 1 | 1.0 | | | | | | | | vaddps zmm1, zmm1, zmm2
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm2, zmmword ptr [rax+rbx*1+0x20]
| 1 | | | | | | 1.0 | | | vsubps zmm2, zmm2, zmm5
| 2 | | | 0.5 | 0.5 | 1.0 | | | | vmovups zmmword ptr [rax+rbx*1+0x20], zmm2
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm2, zmmword ptr [rax+rbx*1+0x40]
| 1 | 1.0 | | | | | | | | vsubps zmm1, zmm2, zmm1
| 2 | | | 0.5 | 0.5 | 1.0 | | | | vmovups zmmword ptr [rax+rbx*1+0x40], zmm1
| 1* | | | | | | | | | cmp r10, rdx
| 0*F | | | | | | | | | jz 0x34
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov rdi, qword ptr [r15+0xa0]
| 1 | | 1.0 | | | | | | | inc rdx
| 1 | | | | | | | 1.0 | | jmp 0xfffffffffffffcfc
Total Num Of Uops: 140
Analysis Notes:
Backend allocation was stalled due to unavailable allocation resources.

View File

@@ -1,304 +0,0 @@
[0] Code Region
Iterations: 100
Instructions: 13000
Total Cycles: 5640
Total uOps: 15400
Dispatch Width: 6
uOps Per Cycle: 2.73
IPC: 2.30
Block RThroughput: 40.0
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 5 0.50 * movslq (%r11,%rdx,4), %rax
1 1 0.25 movq %rax, %rsi
1 1 0.50 shlq $5, %rsi
1 1 0.50 leaq (%rsi,%rsi,2), %rbx
2 8 0.50 * vmovups (%rdi,%rbx), %zmm15
2 8 0.50 * vmovups 32(%rdi,%rbx), %zmm16
2 8 0.50 * vmovups 64(%rdi,%rbx), %zmm27
2 8 0.50 * vmovups 128(%rsp), %zmm1
1 4 0.50 vsubps %zmm15, %zmm1, %zmm24
2 8 0.50 * vmovups 320(%rsp), %zmm1
1 4 0.50 vsubps %zmm16, %zmm1, %zmm25
1 4 0.50 vsubps %zmm27, %zmm9, %zmm26
2 8 0.50 * vmovups (%rsp), %zmm1
1 4 0.50 vsubps %zmm15, %zmm1, %zmm21
2 8 0.50 * vmovups 256(%rsp), %zmm1
1 4 0.50 vsubps %zmm16, %zmm1, %zmm22
1 4 0.50 vsubps %zmm27, %zmm10, %zmm23
2 8 0.50 * vmovups 448(%rsp), %zmm1
1 4 0.50 vsubps %zmm15, %zmm1, %zmm17
2 8 0.50 * vmovups 192(%rsp), %zmm1
1 4 0.50 vsubps %zmm16, %zmm1, %zmm19
1 4 0.50 vsubps %zmm27, %zmm11, %zmm20
2 8 0.50 * vmovups 384(%rsp), %zmm1
1 4 0.50 vsubps %zmm15, %zmm1, %zmm18
1 4 0.50 vsubps %zmm16, %zmm8, %zmm16
1 4 0.50 vsubps %zmm27, %zmm12, %zmm15
1 4 0.50 vmulps %zmm26, %zmm26, %zmm27
1 4 0.50 vfmadd231ps %zmm25, %zmm25, %zmm27
1 4 0.50 vfmadd231ps %zmm24, %zmm24, %zmm27
1 4 0.50 vmulps %zmm23, %zmm23, %zmm28
1 4 0.50 vfmadd231ps %zmm22, %zmm22, %zmm28
1 4 0.50 vfmadd231ps %zmm21, %zmm21, %zmm28
1 4 0.50 vmulps %zmm20, %zmm20, %zmm29
1 4 0.50 vfmadd231ps %zmm19, %zmm19, %zmm29
1 4 0.50 vfmadd231ps %zmm17, %zmm17, %zmm29
1 4 0.50 vmulps %zmm15, %zmm15, %zmm30
1 4 0.50 vfmadd231ps %zmm16, %zmm16, %zmm30
3 4 2.00 vrcp14ps %zmm27, %zmm31
3 4 2.00 vrcp14ps %zmm28, %zmm1
3 4 2.00 vrcp14ps %zmm29, %zmm2
1 4 0.50 vfmadd231ps %zmm18, %zmm18, %zmm30
3 4 2.00 vrcp14ps %zmm30, %zmm3
1 4 0.50 vmulps %zmm31, %zmm6, %zmm4
1 4 0.50 vmulps %zmm4, %zmm31, %zmm4
1 4 0.50 vmulps %zmm4, %zmm31, %zmm4
1 4 0.50 vaddps %zmm13, %zmm4, %zmm5
1 4 0.50 vmulps %zmm31, %zmm7, %zmm31
1 4 0.50 vmulps %zmm5, %zmm31, %zmm5
1 4 0.50 vmulps %zmm1, %zmm6, %zmm31
1 4 0.50 vmulps %zmm31, %zmm1, %zmm31
1 4 0.50 vmulps %zmm31, %zmm1, %zmm31
1 4 0.50 vmulps %zmm5, %zmm4, %zmm4
1 4 0.50 vaddps %zmm13, %zmm31, %zmm5
1 4 0.50 vmulps %zmm1, %zmm7, %zmm1
1 4 0.50 vmulps %zmm5, %zmm1, %zmm1
1 4 0.50 vmulps %zmm2, %zmm6, %zmm5
1 4 0.50 vmulps %zmm5, %zmm2, %zmm5
1 4 0.50 vmulps %zmm5, %zmm2, %zmm5
1 4 0.50 vmulps %zmm1, %zmm31, %zmm1
1 4 0.50 vaddps %zmm13, %zmm5, %zmm31
1 4 0.50 vmulps %zmm2, %zmm7, %zmm2
1 4 0.50 vmulps %zmm31, %zmm2, %zmm2
1 4 0.50 vmulps %zmm3, %zmm6, %zmm31
1 4 0.50 vmulps %zmm31, %zmm3, %zmm31
1 4 0.50 vmulps %zmm31, %zmm3, %zmm31
1 4 0.50 vmulps %zmm2, %zmm5, %zmm2
1 4 0.50 vaddps %zmm13, %zmm31, %zmm5
1 4 0.50 vmulps %zmm3, %zmm7, %zmm3
1 4 0.50 vmulps %zmm5, %zmm3, %zmm3
1 4 0.50 vmulps %zmm3, %zmm31, %zmm3
1 0 0.17 xorl %esi, %esi
1 0 0.17 xorl %edi, %edi
1 1 0.25 testl $2147483647, %eax
1 1 0.50 sete %sil
1 1 0.50 setne %dil
1 1 0.25 movl $255, %eax
1 1 0.50 cmovel %r8d, %eax
1 1 0.25 movl $255, %ecx
1 1 0.50 cmovel %r9d, %ecx
1 1 0.25 xorl $255, %esi
1 1 1.00 kmovd %esi, %k1
1 4 1.00 vcmpltps %zmm0, %zmm27, %k1 {%k1}
1 4 0.50 vmulps %zmm14, %zmm4, %zmm4
1 4 0.50 vmulps %zmm4, %zmm24, %zmm5 {%k1} {z}
1 4 0.50 vmulps %zmm4, %zmm25, %zmm24 {%k1} {z}
1 4 0.50 vmulps %zmm4, %zmm26, %zmm4 {%k1} {z}
1 1 0.50 leal (%rdi,%rdi,2), %esi
1 1 0.25 orl $252, %esi
1 1 1.00 kmovd %esi, %k1
1 4 1.00 vcmpltps %zmm0, %zmm28, %k1 {%k1}
1 4 0.50 vmulps %zmm14, %zmm1, %zmm1
1 4 0.50 vmulps %zmm1, %zmm21, %zmm21 {%k1} {z}
1 4 0.50 vaddps %zmm21, %zmm5, %zmm5
1 4 0.50 vmulps %zmm1, %zmm22, %zmm21 {%k1} {z}
1 4 0.50 vaddps %zmm21, %zmm24, %zmm21
1 4 0.50 vmulps %zmm1, %zmm23, %zmm1 {%k1} {z}
1 4 0.50 vaddps %zmm1, %zmm4, %zmm1
1 1 1.00 kmovd %eax, %k1
1 4 1.00 vcmpltps %zmm0, %zmm29, %k1 {%k1}
1 4 0.50 vmulps %zmm14, %zmm2, %zmm2
1 4 0.50 vmulps %zmm2, %zmm17, %zmm4 {%k1} {z}
1 4 0.50 vmulps %zmm2, %zmm19, %zmm17 {%k1} {z}
1 4 0.50 vmulps %zmm2, %zmm20, %zmm2 {%k1} {z}
1 1 1.00 kmovd %ecx, %k1
1 4 1.00 vcmpltps %zmm0, %zmm30, %k1 {%k1}
1 4 0.50 vmulps %zmm14, %zmm3, %zmm3
1 4 0.50 vmulps %zmm3, %zmm18, %zmm18 {%k1} {z}
1 4 0.50 vaddps %zmm18, %zmm4, %zmm4
1 4 0.50 vaddps %zmm4, %zmm5, %zmm4
1 4 0.50 vmulps %zmm3, %zmm16, %zmm5 {%k1} {z}
1 4 0.50 vaddps %zmm5, %zmm17, %zmm5
1 4 0.50 vaddps %zmm5, %zmm21, %zmm5
1 4 0.50 vmulps %zmm3, %zmm15, %zmm3 {%k1} {z}
1 5 0.50 * movq 176(%r15), %rax
1 4 0.50 vaddps %zmm3, %zmm2, %zmm2
2 8 0.50 * vmovups (%rax,%rbx), %zmm3
1 4 0.50 vsubps %zmm4, %zmm3, %zmm3
2 1 1.00 * vmovups %zmm3, (%rax,%rbx)
1 4 0.50 vaddps %zmm2, %zmm1, %zmm1
2 8 0.50 * vmovups 32(%rax,%rbx), %zmm2
1 4 0.50 vsubps %zmm5, %zmm2, %zmm2
2 1 1.00 * vmovups %zmm2, 32(%rax,%rbx)
2 8 0.50 * vmovups 64(%rax,%rbx), %zmm2
1 4 0.50 vsubps %zmm1, %zmm2, %zmm1
2 1 1.00 * vmovups %zmm1, 64(%rax,%rbx)
1 1 0.25 cmpq %rdx, %r10
1 1 0.50 je .LBB4_18
1 5 0.50 * movq 160(%r15), %rdi
1 1 0.25 incq %rdx
1 1 0.50 jmp .LBB4_8
Resources:
[0] - SKXDivider
[1] - SKXFPDivider
[2] - SKXPort0
[3] - SKXPort1
[4] - SKXPort2
[5] - SKXPort3
[6] - SKXPort4
[7] - SKXPort5
[8] - SKXPort6
[9] - SKXPort7
Resource pressure per iteration:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
- - 52.01 14.97 8.49 8.51 3.00 52.02 11.00 2.00
Resource pressure by instruction:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
- - - - 0.49 0.51 - - - - movslq (%r11,%rdx,4), %rax
- - - - - - - - 1.00 - movq %rax, %rsi
- - - - - - - - 1.00 - shlq $5, %rsi
- - - 1.00 - - - - - - leaq (%rsi,%rsi,2), %rbx
- - 0.01 0.99 0.50 0.50 - - - - vmovups (%rdi,%rbx), %zmm15
- - - - 0.50 0.50 - 1.00 - - vmovups 32(%rdi,%rbx), %zmm16
- - - 1.00 0.50 0.50 - - - - vmovups 64(%rdi,%rbx), %zmm27
- - - 0.99 0.51 0.49 - 0.01 - - vmovups 128(%rsp), %zmm1
- - 0.01 - - - - 0.99 - - vsubps %zmm15, %zmm1, %zmm24
- - - 1.00 0.49 0.51 - - - - vmovups 320(%rsp), %zmm1
- - 0.99 - - - - 0.01 - - vsubps %zmm16, %zmm1, %zmm25
- - 0.01 - - - - 0.99 - - vsubps %zmm27, %zmm9, %zmm26
- - 0.01 0.99 0.51 0.49 - - - - vmovups (%rsp), %zmm1
- - - - - - - 1.00 - - vsubps %zmm15, %zmm1, %zmm21
- - - - 0.49 0.51 - 1.00 - - vmovups 256(%rsp), %zmm1
- - 0.01 - - - - 0.99 - - vsubps %zmm16, %zmm1, %zmm22
- - 0.99 - - - - 0.01 - - vsubps %zmm27, %zmm10, %zmm23
- - - 1.00 0.51 0.49 - - - - vmovups 448(%rsp), %zmm1
- - 0.01 - - - - 0.99 - - vsubps %zmm15, %zmm1, %zmm17
- - 0.01 - 0.49 0.51 - 0.99 - - vmovups 192(%rsp), %zmm1
- - - - - - - 1.00 - - vsubps %zmm16, %zmm1, %zmm19
- - 0.01 - - - - 0.99 - - vsubps %zmm27, %zmm11, %zmm20
- - 0.99 - 0.50 0.50 - 0.01 - - vmovups 384(%rsp), %zmm1
- - - - - - - 1.00 - - vsubps %zmm15, %zmm1, %zmm18
- - 0.01 - - - - 0.99 - - vsubps %zmm16, %zmm8, %zmm16
- - - - - - - 1.00 - - vsubps %zmm27, %zmm12, %zmm15
- - 1.00 - - - - - - - vmulps %zmm26, %zmm26, %zmm27
- - 1.00 - - - - - - - vfmadd231ps %zmm25, %zmm25, %zmm27
- - 0.99 - - - - 0.01 - - vfmadd231ps %zmm24, %zmm24, %zmm27
- - - - - - - 1.00 - - vmulps %zmm23, %zmm23, %zmm28
- - - - - - - 1.00 - - vfmadd231ps %zmm22, %zmm22, %zmm28
- - 0.01 - - - - 0.99 - - vfmadd231ps %zmm21, %zmm21, %zmm28
- - 0.01 - - - - 0.99 - - vmulps %zmm20, %zmm20, %zmm29
- - 0.01 - - - - 0.99 - - vfmadd231ps %zmm19, %zmm19, %zmm29
- - - - - - - 1.00 - - vfmadd231ps %zmm17, %zmm17, %zmm29
- - 0.01 - - - - 0.99 - - vmulps %zmm15, %zmm15, %zmm30
- - 0.01 - - - - 0.99 - - vfmadd231ps %zmm16, %zmm16, %zmm30
- - 2.00 - - - - 1.00 - - vrcp14ps %zmm27, %zmm31
- - 2.00 - - - - 1.00 - - vrcp14ps %zmm28, %zmm1
- - 2.00 - - - - 1.00 - - vrcp14ps %zmm29, %zmm2
- - 1.00 - - - - - - - vfmadd231ps %zmm18, %zmm18, %zmm30
- - 2.00 - - - - 1.00 - - vrcp14ps %zmm30, %zmm3
- - - - - - - 1.00 - - vmulps %zmm31, %zmm6, %zmm4
- - 1.00 - - - - - - - vmulps %zmm4, %zmm31, %zmm4
- - 1.00 - - - - - - - vmulps %zmm4, %zmm31, %zmm4
- - 1.00 - - - - - - - vaddps %zmm13, %zmm4, %zmm5
- - - - - - - 1.00 - - vmulps %zmm31, %zmm7, %zmm31
- - 1.00 - - - - - - - vmulps %zmm5, %zmm31, %zmm5
- - 1.00 - - - - - - - vmulps %zmm1, %zmm6, %zmm31
- - 1.00 - - - - - - - vmulps %zmm31, %zmm1, %zmm31
- - 0.99 - - - - 0.01 - - vmulps %zmm31, %zmm1, %zmm31
- - 1.00 - - - - - - - vmulps %zmm5, %zmm4, %zmm4
- - - - - - - 1.00 - - vaddps %zmm13, %zmm31, %zmm5
- - - - - - - 1.00 - - vmulps %zmm1, %zmm7, %zmm1
- - - - - - - 1.00 - - vmulps %zmm5, %zmm1, %zmm1
- - - - - - - 1.00 - - vmulps %zmm2, %zmm6, %zmm5
- - - - - - - 1.00 - - vmulps %zmm5, %zmm2, %zmm5
- - - - - - - 1.00 - - vmulps %zmm5, %zmm2, %zmm5
- - 0.99 - - - - 0.01 - - vmulps %zmm1, %zmm31, %zmm1
- - - - - - - 1.00 - - vaddps %zmm13, %zmm5, %zmm31
- - 1.00 - - - - - - - vmulps %zmm2, %zmm7, %zmm2
- - - - - - - 1.00 - - vmulps %zmm31, %zmm2, %zmm2
- - - - - - - 1.00 - - vmulps %zmm3, %zmm6, %zmm31
- - 1.00 - - - - - - - vmulps %zmm31, %zmm3, %zmm31
- - 1.00 - - - - - - - vmulps %zmm31, %zmm3, %zmm31
- - 1.00 - - - - - - - vmulps %zmm2, %zmm5, %zmm2
- - 1.00 - - - - - - - vaddps %zmm13, %zmm31, %zmm5
- - 1.00 - - - - - - - vmulps %zmm3, %zmm7, %zmm3
- - 1.00 - - - - - - - vmulps %zmm5, %zmm3, %zmm3
- - 1.00 - - - - - - - vmulps %zmm3, %zmm31, %zmm3
- - - - - - - - - - xorl %esi, %esi
- - - - - - - - - - xorl %edi, %edi
- - - - - - - - 1.00 - testl $2147483647, %eax
- - - - - - - - 1.00 - sete %sil
- - - - - - - - 1.00 - setne %dil
- - - 1.00 - - - - - - movl $255, %eax
- - - - - - - - 1.00 - cmovel %r8d, %eax
- - - 1.00 - - - - - - movl $255, %ecx
- - - - - - - - 1.00 - cmovel %r9d, %ecx
- - - 1.00 - - - - - - xorl $255, %esi
- - - - - - - 1.00 - - kmovd %esi, %k1
- - - - - - - 1.00 - - vcmpltps %zmm0, %zmm27, %k1 {%k1}
- - - - - - - 1.00 - - vmulps %zmm14, %zmm4, %zmm4
- - 1.00 - - - - - - - vmulps %zmm4, %zmm24, %zmm5 {%k1} {z}
- - - - - - - 1.00 - - vmulps %zmm4, %zmm25, %zmm24 {%k1} {z}
- - 1.00 - - - - - - - vmulps %zmm4, %zmm26, %zmm4 {%k1} {z}
- - - 1.00 - - - - - - leal (%rdi,%rdi,2), %esi
- - - - - - - - 1.00 - orl $252, %esi
- - - - - - - 1.00 - - kmovd %esi, %k1
- - - - - - - 1.00 - - vcmpltps %zmm0, %zmm28, %k1 {%k1}
- - 0.99 - - - - 0.01 - - vmulps %zmm14, %zmm1, %zmm1
- - 0.99 - - - - 0.01 - - vmulps %zmm1, %zmm21, %zmm21 {%k1} {z}
- - 1.00 - - - - - - - vaddps %zmm21, %zmm5, %zmm5
- - 0.01 - - - - 0.99 - - vmulps %zmm1, %zmm22, %zmm21 {%k1} {z}
- - - - - - - 1.00 - - vaddps %zmm21, %zmm24, %zmm21
- - 0.99 - - - - 0.01 - - vmulps %zmm1, %zmm23, %zmm1 {%k1} {z}
- - 1.00 - - - - - - - vaddps %zmm1, %zmm4, %zmm1
- - - - - - - 1.00 - - kmovd %eax, %k1
- - - - - - - 1.00 - - vcmpltps %zmm0, %zmm29, %k1 {%k1}
- - - - - - - 1.00 - - vmulps %zmm14, %zmm2, %zmm2
- - 1.00 - - - - - - - vmulps %zmm2, %zmm17, %zmm4 {%k1} {z}
- - - - - - - 1.00 - - vmulps %zmm2, %zmm19, %zmm17 {%k1} {z}
- - 1.00 - - - - - - - vmulps %zmm2, %zmm20, %zmm2 {%k1} {z}
- - - - - - - 1.00 - - kmovd %ecx, %k1
- - - - - - - 1.00 - - vcmpltps %zmm0, %zmm30, %k1 {%k1}
- - 1.00 - - - - - - - vmulps %zmm14, %zmm3, %zmm3
- - - - - - - 1.00 - - vmulps %zmm3, %zmm18, %zmm18 {%k1} {z}
- - 0.99 - - - - 0.01 - - vaddps %zmm18, %zmm4, %zmm4
- - 1.00 - - - - - - - vaddps %zmm4, %zmm5, %zmm4
- - - - - - - 1.00 - - vmulps %zmm3, %zmm16, %zmm5 {%k1} {z}
- - 1.00 - - - - - - - vaddps %zmm5, %zmm17, %zmm5
- - 0.99 - - - - 0.01 - - vaddps %zmm5, %zmm21, %zmm5
- - 1.00 - - - - - - - vmulps %zmm3, %zmm15, %zmm3 {%k1} {z}
- - - - 1.00 - - - - - movq 176(%r15), %rax
- - 0.99 - - - - 0.01 - - vaddps %zmm3, %zmm2, %zmm2
- - - 1.00 0.50 0.50 - - - - vmovups (%rax,%rbx), %zmm3
- - 0.99 - - - - 0.01 - - vsubps %zmm4, %zmm3, %zmm3
- - - - - - 1.00 - - 1.00 vmovups %zmm3, (%rax,%rbx)
- - 1.00 - - - - - - - vaddps %zmm2, %zmm1, %zmm1
- - - 1.00 0.50 0.50 - - - - vmovups 32(%rax,%rbx), %zmm2
- - 1.00 - - - - - - - vsubps %zmm5, %zmm2, %zmm2
- - - - - - 1.00 - - 1.00 vmovups %zmm2, 32(%rax,%rbx)
- - - 1.00 0.50 0.50 - - - - vmovups 64(%rax,%rbx), %zmm2
- - 0.99 - - - - 0.01 - - vsubps %zmm1, %zmm2, %zmm1
- - - - - 1.00 1.00 - - - vmovups %zmm1, 64(%rax,%rbx)
- - - - - - - - 1.00 - cmpq %rdx, %r10
- - - - - - - - 1.00 - je .LBB4_18
- - - - 0.50 0.50 - - - - movq 160(%r15), %rdi
- - - 1.00 - - - - - - incq %rdx
- - - - - - - - 1.00 - jmp .LBB4_8

View File

@@ -1,116 +0,0 @@
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
Analyzed file: gromacs-icx-avx512-sp.s
Architecture: ICX
Timestamp: 2023-02-14 12:51:43
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD |
------------------------------------------------------------------------------------------------------------------------
1338 | | | | | | | | | | || | | # pointer_increment=64 0f91ac4f7fe1a70d0c899f7f3e745649
1339 | | | | | | | | | | || | | # LLVM-MCA-BEGIN
1340 | | | | | | | | | | || | | .LBB2_12: # Parent Loop BB2_7 Depth=1
1341 | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2
1342 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | movslq (%r11,%rax,4), %rcx
1343 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || 6.0 | | leaq (%rcx,%rcx,2), %rdx
1344 | 0.00 | | | | | | 1.00 | | | || 1.0 | | shlq $5, %rdx
1345 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd (%rsi,%rdx), %zmm16
1346 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vbroadcastf64x4 64(%rsi,%rdx), %zmm20 # zmm20 = mem[0,1,2,3,0,1,2,3]
1347 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vbroadcastf64x4 (%rsi,%rdx), %zmm19 # zmm19 = mem[0,1,2,3,0,1,2,3]
1348 | | | | | | 1.000 | | | | || | | vshuff64x2 $238, %zmm16, %zmm16, %zmm21 # zmm21 = zmm16[4,5,6,7,4,5,6,7]
1349 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm19, %zmm6, %zmm18
1350 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm21, %zmm10, %zmm17
1351 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vsubps %zmm20, %zmm14, %zmm16
1352 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm16, %zmm16, %zmm22
1353 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231ps %zmm17, %zmm17, %zmm22 # zmm22 = (zmm17 * zmm17) + zmm22
1354 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231ps %zmm18, %zmm18, %zmm22 # zmm22 = (zmm18 * zmm18) + zmm22
1355 | 2.50 | | | | | 0.500 | | | | || 6.0 | | vrcp14ps %zmm22, %zmm23
1356 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm23, %zmm26, %zmm24
1357 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm24, %zmm23, %zmm24
1358 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm24, %zmm23, %zmm24
1359 | 0.75 | | | | | 0.250 | | | | || 4.0 | | vaddps %zmm1, %zmm24, %zmm25
1360 | 1.00 | | | | | 0.000 | | | | || | | vmulps %zmm23, %zmm27, %zmm23
1361 | 1.00 | | | | | 0.000 | | | | || 4.0 | | vmulps %zmm25, %zmm23, %zmm23
1362 | 1.00 | | | | | 0.000 | | | | || 4.0 | | vmulps %zmm23, %zmm24, %zmm23
1363 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | leal (%rcx,%rcx), %edx
1364 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | xorl %edi, %edi
1365 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | xorl %ebp, %ebp
1366 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | cmpq %rdx, %r12
1367 | 0.00 | | | | | | 1.00 | | | || | | setne %dil
1368 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | leal 1(%rcx,%rcx), %ecx
1369 | 0.00 | | | | | | 1.00 | | | || | | sete %bpl
1370 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | xorl %edx, %edx
1371 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | xorl %ebx, %ebx
1372 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | cmpq %rcx, %r12
1373 | 0.00 | | | | | | 1.00 | | | || | | sete %dl
1374 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | movl $0, %ecx
1375 | 0.00 | | | | | | 1.00 | | | || | | setne %bl
1376 | 0.00 | | | | | | 1.00 | | | || | | cmovel %r8d, %ecx
1377 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | movl %ebx, %r14d
1378 | 0.00 | | | | | | 1.00 | | | || | | shll $4, %r14d
1379 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | subl %ebp, %r14d
1380 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | leal (%rcx,%rdi,2), %ecx
1381 | 0.00 | | | | | | 1.00 | | | || | | shll $8, %ecx
1382 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | addl $239, %r14d
1383 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | addl $-768, %ecx # imm = 0xFD00
1384 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | orl %r14d, %ecx
1385 | 1.00 | | | | | | | | | || | | kmovd %ecx, %k2
1386 | 0.50 | | | | | 0.500 | | | | || | | vcmpltps %zmm0, %zmm22, %k2 {%k2}
1387 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm21, %zmm11, %zmm21
1388 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm20, %zmm15, %zmm20
1389 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm19, %zmm7, %zmm19
1390 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm2, %zmm23, %zmm22
1391 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231ps %zmm22, %zmm18, %zmm12 {%k2} # zmm12 {%k2} = (zmm18 * zmm22) + zmm12
1392 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm20, %zmm20, %zmm18
1393 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm21, %zmm21, %zmm18 # zmm18 = (zmm21 * zmm21) + zmm18
1394 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm19, %zmm19, %zmm18 # zmm18 = (zmm19 * zmm19) + zmm18
1395 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm22, %zmm17, %zmm9 {%k2} # zmm9 {%k2} = (zmm17 * zmm22) + zmm9
1396 | 2.50 | | | | | 0.500 | | | | || | | vrcp14ps %zmm18, %zmm17
1397 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm22, %zmm16, %zmm5 {%k2} # zmm5 {%k2} = (zmm16 * zmm22) + zmm5
1398 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm17, %zmm26, %zmm16
1399 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm16, %zmm17, %zmm16
1400 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm16, %zmm17, %zmm16
1401 | 0.00 | | | | | 1.000 | | | | || | | vaddps %zmm1, %zmm16, %zmm22
1402 | 0.00 | | | | | 1.000 | | | | || | | vmulps %zmm17, %zmm27, %zmm17
1403 | 0.00 | | | | | 1.000 | | | | || | | vmulps %zmm22, %zmm17, %zmm17
1404 | 0.00 | | | | | 1.000 | | | | || | | vmulps %zmm17, %zmm16, %zmm16
1405 | 0.00 | | | | | | 1.00 | | | || | | shll $6, %ebx
1406 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | leal (%rbx,%rdi,4), %ecx
1407 | 0.00 | | | | | | 1.00 | | | || | | shll $7, %edx
1408 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | leal (%rdx,%rdi,8), %edx
1409 | 0.00 | | | | | | 1.00 | | | || | | shll $8, %edx
1410 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | addl %edx, %ecx
1411 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | addl $-2117, %ecx # imm = 0xF7BB
1412 | 1.00 | | | | | | | | | || | | kmovd %ecx, %k2
1413 | 0.00 | | | | | 1.000 | | | | || | | vcmpltps %zmm0, %zmm18, %k2 {%k2}
1414 | 0.00 | | | | | 1.000 | | | | || | | vmulps %zmm2, %zmm16, %zmm16
1415 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231ps %zmm16, %zmm19, %zmm13 {%k2} # zmm13 {%k2} = (zmm19 * zmm16) + zmm13
1416 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231ps %zmm16, %zmm21, %zmm8 {%k2} # zmm8 {%k2} = (zmm21 * zmm16) + zmm8
1417 | 0.24 | | | | | 0.760 | | | | || | 4.0 | vfmadd231ps %zmm16, %zmm20, %zmm4 {%k2} # zmm4 {%k2} = (zmm20 * zmm16) + zmm4
1418 | 0.00 | 1.00 | | | | -0.01 | 0.00 | | | || | | incq %rax
1419 | 0.00 | 1.00 | | | | -0.01 | 0.00 | | | || | | cmpq %rax, %r10
1420 | | | | | | | | | | || | | * jne .LBB2_12
1421 | | | | | | | | | | || | | # LLVM-MCA-END
22.5 16.5 2.00 2.00 2.00 2.00 22.49 16.5 71 4.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
1417 | 4.0 | vfmadd231ps %zmm16, %zmm20, %zmm4 {%k2} # zmm4 {%k2} = (zmm20 * zmm16) + zmm4| [1417]
1416 | 4.0 | vfmadd231ps %zmm16, %zmm21, %zmm8 {%k2} # zmm8 {%k2} = (zmm21 * zmm16) + zmm8| [1416]
1415 | 4.0 | vfmadd231ps %zmm16, %zmm19, %zmm13 {%k2} # zmm13 {%k2} = (zmm19 * zmm16) + zmm13| [1415]
1397 | 4.0 | vfmadd231ps %zmm22, %zmm16, %zmm5 {%k2} # zmm5 {%k2} = (zmm16 * zmm22) + zmm5| [1397]
1395 | 4.0 | vfmadd231ps %zmm22, %zmm17, %zmm9 {%k2} # zmm9 {%k2} = (zmm17 * zmm22) + zmm9| [1395]
1391 | 4.0 | vfmadd231ps %zmm22, %zmm18, %zmm12 {%k2} # zmm12 {%k2} = (zmm18 * zmm22) + zmm12| [1391]
1418 | 1.0 | incq %rax | [1418]

View File

@@ -1,161 +0,0 @@
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
Analyzed file: gromacs-icx-avx512-sp.s
Architecture: CSX
Timestamp: 2023-02-10 16:31:04
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
--------------------------------------------------------------------------------------------------
1662 | | | | | | | | || | | # pointer_increment=64 0f91ac4f7fe1a70d0c899f7f3e745649
1663 | | | | | | | | || | | # LLVM-MCA-BEGIN
1664 | | | | | | | | || | | .LBB4_8: # =>This Inner Loop Header: Depth=1
1665 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | movslq (%r11,%rdx,4), %rax
1666 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || 1.0 | | movq %rax, %rsi
1667 | 0.00 | | | | | | 1.00 | || 1.0 | | shlq $5, %rsi
1668 | | 1.00 | | | | 0.00 | | || 1.0 | | leaq (%rsi,%rsi,2), %rbx
1669 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%rdi,%rbx), %zmm15 # AlignMOV convert to UnAlignMOV
1670 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 32(%rdi,%rbx), %zmm16 # AlignMOV convert to UnAlignMOV
1671 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovups 64(%rdi,%rbx), %zmm27 # AlignMOV convert to UnAlignMOV
1672 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 128(%rsp), %zmm1 # 64-byte Reload
1673 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm24
1674 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 320(%rsp), %zmm1 # 64-byte Reload
1675 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm1, %zmm25
1676 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubps %zmm27, %zmm9, %zmm26
1677 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%rsp), %zmm1 # 64-byte Reload
1678 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm21
1679 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 256(%rsp), %zmm1 # 64-byte Reload
1680 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm1, %zmm22
1681 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm27, %zmm10, %zmm23
1682 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 448(%rsp), %zmm1 # 64-byte Reload
1683 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm17
1684 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 192(%rsp), %zmm1 # 64-byte Reload
1685 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm1, %zmm19
1686 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm27, %zmm11, %zmm20
1687 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 384(%rsp), %zmm1 # 64-byte Reload
1688 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm18
1689 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm8, %zmm16
1690 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm27, %zmm12, %zmm15
1691 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm26, %zmm26, %zmm27
1692 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231ps %zmm25, %zmm25, %zmm27 # zmm27 = (zmm25 * zmm25) + zmm27
1693 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231ps %zmm24, %zmm24, %zmm27 # zmm27 = (zmm24 * zmm24) + zmm27
1694 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm23, %zmm23, %zmm28
1695 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm22, %zmm22, %zmm28 # zmm28 = (zmm22 * zmm22) + zmm28
1696 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm21, %zmm21, %zmm28 # zmm28 = (zmm21 * zmm21) + zmm28
1697 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm20, %zmm20, %zmm29
1698 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm19, %zmm19, %zmm29 # zmm29 = (zmm19 * zmm19) + zmm29
1699 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm17, %zmm17, %zmm29 # zmm29 = (zmm17 * zmm17) + zmm29
1700 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm15, %zmm15, %zmm30
1701 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm16, %zmm16, %zmm30 # zmm30 = (zmm16 * zmm16) + zmm30
1702 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14ps %zmm27, %zmm31
1703 | 2.50 | | | | | 0.50 | | || | | vrcp14ps %zmm28, %zmm1
1704 | 2.50 | | | | | 0.50 | | || | | vrcp14ps %zmm29, %zmm2
1705 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm18, %zmm18, %zmm30 # zmm30 = (zmm18 * zmm18) + zmm30
1706 | 2.50 | | | | | 0.50 | | || | | vrcp14ps %zmm30, %zmm3
1707 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm31, %zmm6, %zmm4
1708 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm4, %zmm31, %zmm4
1709 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm4, %zmm31, %zmm4
1710 | 0.50 | | | | | 0.50 | | || 4.0 | | vaddps %zmm13, %zmm4, %zmm5
1711 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm7, %zmm31
1712 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm5, %zmm31, %zmm5
1713 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm6, %zmm31
1714 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm1, %zmm31
1715 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm1, %zmm31
1716 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm5, %zmm4, %zmm4
1717 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm13, %zmm31, %zmm5
1718 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm7, %zmm1
1719 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm5, %zmm1, %zmm1
1720 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm6, %zmm5
1721 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm5, %zmm2, %zmm5
1722 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm5, %zmm2, %zmm5
1723 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm31, %zmm1
1724 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm13, %zmm5, %zmm31
1725 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm7, %zmm2
1726 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm2, %zmm2
1727 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm3, %zmm6, %zmm31
1728 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm3, %zmm31
1729 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm3, %zmm31
1730 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm5, %zmm2
1731 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm13, %zmm31, %zmm5
1732 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm3, %zmm7, %zmm3
1733 | 1.00 | | | | | 0.00 | | || | | vmulps %zmm5, %zmm3, %zmm3
1734 | 1.00 | | | | | 0.00 | | || | | vmulps %zmm3, %zmm31, %zmm3
1735 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | xorl %esi, %esi
1736 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | xorl %edi, %edi
1737 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | testl $2147483647, %eax # imm = 0x7FFFFFFF
1738 | 0.00 | | | | | | 1.00 | || | | sete %sil
1739 | 0.00 | | | | | | 1.00 | || | | setne %dil
1740 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl $255, %eax
1741 | 0.00 | | | | | | 1.00 | || | | cmovel %r8d, %eax
1742 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl $255, %ecx
1743 | 0.00 | | | | | | 1.00 | || | | cmovel %r9d, %ecx
1744 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | | xorl $255, %esi
1745 | 1.00 | | | | | | | || | | kmovd %esi, %k1
1746 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm27, %k1 {%k1}
1747 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm14, %zmm4, %zmm4
1748 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm4, %zmm24, %zmm5 {%k1} {z}
1749 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm4, %zmm25, %zmm24 {%k1} {z}
1750 | 0.25 | | | | | 0.75 | | || | | vmulps %zmm4, %zmm26, %zmm4 {%k1} {z}
1751 | | 1.00 | | | | 0.00 | | || | | leal (%rdi,%rdi,2), %esi
1752 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | orl $252, %esi
1753 | 1.00 | | | | | | | || | | kmovd %esi, %k1
1754 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm28, %k1 {%k1}
1755 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm14, %zmm1, %zmm1
1756 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm21, %zmm21 {%k1} {z}
1757 | 0.50 | | | | | 0.50 | | || 4.0 | | vaddps %zmm21, %zmm5, %zmm5
1758 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm22, %zmm21 {%k1} {z}
1759 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm21, %zmm24, %zmm21
1760 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm23, %zmm1 {%k1} {z}
1761 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm1, %zmm4, %zmm1
1762 | 1.00 | | | | | | | || | | kmovd %eax, %k1
1763 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm29, %k1 {%k1}
1764 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm14, %zmm2, %zmm2
1765 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm17, %zmm4 {%k1} {z}
1766 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm19, %zmm17 {%k1} {z}
1767 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm20, %zmm2 {%k1} {z}
1768 | 1.00 | | | | | | | || | | kmovd %ecx, %k1
1769 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm30, %k1 {%k1}
1770 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm14, %zmm3, %zmm3
1771 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm3, %zmm18, %zmm18 {%k1} {z}
1772 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm18, %zmm4, %zmm4
1773 | 0.25 | | | | | 0.75 | | || 4.0 | | vaddps %zmm4, %zmm5, %zmm4
1774 | 0.00 | | | | | 1.00 | | || | | vmulps %zmm3, %zmm16, %zmm5 {%k1} {z}
1775 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm5, %zmm17, %zmm5
1776 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm5, %zmm21, %zmm5
1777 | 0.00 | | | | | 1.00 | | || | | vmulps %zmm3, %zmm15, %zmm3 {%k1} {z}
1778 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | movq 176(%r15), %rax
1779 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm3, %zmm2, %zmm2
1780 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%rax,%rbx), %zmm3 # AlignMOV convert to UnAlignMOV
1781 | 0.00 | | | | | 1.00 | | || 4.0 | | vsubps %zmm4, %zmm3, %zmm3
1782 | | | 0.50 | 0.50 | 1.00 | | | || 0.0 | | vmovups %zmm3, (%rax,%rbx) # AlignMOV convert to UnAlignMOV
1783 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm2, %zmm1, %zmm1
1784 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 32(%rax,%rbx), %zmm2 # AlignMOV convert to UnAlignMOV
1785 | 0.00 | | | | | 1.00 | | || | | vsubps %zmm5, %zmm2, %zmm2
1786 | | | 0.50 | 0.50 | 1.00 | | | || | | vmovups %zmm2, 32(%rax,%rbx) # AlignMOV convert to UnAlignMOV
1787 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 64(%rax,%rbx), %zmm2 # AlignMOV convert to UnAlignMOV
1788 | 0.00 | | | | | 1.00 | | || | | vsubps %zmm1, %zmm2, %zmm1
1789 | | | 0.50 | 0.50 | 1.00 | | | || | | vmovups %zmm1, 64(%rax,%rbx) # AlignMOV convert to UnAlignMOV
1790 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rdx, %r10
1791 | | | | | | | | || | | * je .LBB4_18
1792 | | | | | | | | || | | # %bb.9: # in Loop: Header=BB4_8 Depth=1
1793 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | movq 160(%r15), %rdi
1794 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | 1.0 | incq %rdx
1795 | 0.00 | | | | | | 1.00 | || | | jmp .LBB4_8
1796 | | | | | | | | || | | # LLVM-MCA-END
50.0 9.00 9.50 8.00 9.50 8.00 3.00 50.0 9.00 79.0 1.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
1794 | 1.0 | incq %rdx | [1794]

View File

@@ -1,88 +0,0 @@
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
Analyzed File - lammps-icc-avx2.o
Binary Format - 64Bit
Architecture - SKX
Analysis Type - Throughput
Throughput Analysis Report
--------------------------
Block Throughput: 25.58 Cycles Throughput Bottleneck: Backend
Loop Count: 22
Port Binding In Cycles Per Iteration:
--------------------------------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
--------------------------------------------------------------------------------------------------
| Cycles | 13.7 8.0 | 13.6 | 5.5 5.5 | 5.5 5.5 | 0.0 | 13.7 | 7.0 | 0.0 |
--------------------------------------------------------------------------------------------------
DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3)
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion occurred
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
X - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
-----------------------------------------------------------------------------------------
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovdqu xmm0, xmmword ptr [rbx+rdx*4]
| 1 | 1.0 | | | | | | | | vmovq rcx, xmm0
| 1 | | | | | | 1.0 | | | vpunpckhqdq xmm2, xmm0, xmm0
| 1 | 1.0 | | | | | | | | vmovq r15, xmm2
| 1* | | | | | | | | | mov r8d, ecx
| 1 | | | | | | | 1.0 | | shr rcx, 0x20
| 1 | | | | | | 1.0 | | | lea r14d, ptr [rcx+rcx*2]
| 1 | | | | | | 1.0 | | | lea r8d, ptr [r8+r8*2]
| 1 | | | | | | | 1.0 | | movsxd rcx, r8d
| 1 | | | | | | | 1.0 | | movsxd r8, r14d
| 1* | | | | | | | | | mov r14d, r15d
| 1 | | | | | | | 1.0 | | shr r15, 0x20
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups xmm7, xmmword ptr [r11+rcx*8]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups xmm6, xmmword ptr [r11+r8*8]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovq xmm14, qword ptr [r11+rcx*8+0x10]
| 1 | | 0.3 | | | | 0.7 | | | lea r14d, ptr [r14+r14*2]
| 1 | | | | | | | 1.0 | | movsxd r14, r14d
| 1 | | 0.7 | | | | 0.3 | | | lea r15d, ptr [r15+r15*2]
| 1 | | | | | | | 1.0 | | movsxd r15, r15d
| 2 | | | 0.5 0.5 | 0.5 0.5 | | 1.0 | | | vmovhpd xmm15, xmm14, qword ptr [r11+r8*8+0x10]
| 2 | | 1.0 | 0.5 0.5 | 0.5 0.5 | | | | | vinsertf128 ymm1, ymm7, xmmword ptr [r11+r14*8], 0x1
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovq xmm0, qword ptr [r11+r14*8+0x10]
| 2 | | 0.3 | 0.5 0.5 | 0.5 0.5 | | 0.7 | | | vinsertf128 ymm6, ymm6, xmmword ptr [r11+r15*8], 0x1
| 2 | | | 0.5 0.5 | 0.5 0.5 | | 1.0 | | | vmovhpd xmm2, xmm0, qword ptr [r11+r15*8+0x10]
| 1 | | | | | | 1.0 | | | vunpcklpd ymm14, ymm1, ymm6
| 1 | | | | | | 1.0 | | | vunpckhpd ymm1, ymm1, ymm6
| 1 | 0.7 | 0.3 | | | | | | | vsubpd ymm6, ymm10, ymm14
| 1 | | | | | | 1.0 | | | vinsertf128 ymm7, ymm15, xmm2, 0x1
| 1 | 0.3 | 0.7 | | | | | | | vsubpd ymm2, ymm9, ymm1
| 1 | 0.7 | 0.3 | | | | | | | vsubpd ymm0, ymm8, ymm7
| 1 | 0.3 | 0.7 | | | | | | | vmulpd ymm14, ymm2, ymm2
| 1 | 0.7 | 0.3 | | | | | | | vfmadd231pd ymm14, ymm6, ymm6
| 1 | 0.3 | 0.7 | | | | | | | vfmadd231pd ymm14, ymm0, ymm0
| 1 | 0.7 | 0.3 | | | | | | | vcmppd ymm1, ymm14, ymm5, 0x1
| 1 | 0.3 | 0.7 | | | | | | | vpcmpeqd ymm7, ymm7, ymm7
| 2 | 1.0 | | | | | 1.0 | | | vptest ymm1, ymm7
| 1 | 1.0 8.0 | | | | | | | | vdivpd ymm7, ymm4, ymm14
| 2^ | | 1.0 | 0.5 0.5 | 0.5 0.5 | | | | | vmulpd ymm14, ymm7, ymmword ptr [rsp+0x60]
| 1 | | 1.0 | | | | | | | vmulpd ymm14, ymm7, ymm14
| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm15, ymm7, ymm14
| 1 | 0.3 | 0.7 | | | | | | | vfmsub213pd ymm14, ymm7, ymm3
| 2^ | 0.7 | 0.3 | 0.5 0.5 | 0.5 0.5 | | | | | vmulpd ymm7, ymm7, ymmword ptr [rsp+0x40]
| 1 | 0.3 | 0.7 | | | | | | | vmulpd ymm15, ymm15, ymm7
| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm7, ymm15, ymm14
| 1 | 0.3 | 0.7 | | | | | | | vmulpd ymm6, ymm6, ymm7
| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm2, ymm2, ymm7
| 1 | | | | | | 1.0 | | | vandpd ymm6, ymm1, ymm6
| 1 | 0.3 | 0.7 | | | | | | | vaddpd ymm13, ymm13, ymm6
| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm6, ymm0, ymm7
| 1 | | | | | | 1.0 | | | vandpd ymm0, ymm1, ymm2
| 1 | | | | | | 1.0 | | | vandpd ymm1, ymm1, ymm6
| 1 | 0.3 | 0.7 | | | | | | | vaddpd ymm12, ymm12, ymm0
| 1 | 0.7 | 0.3 | | | | | | | vaddpd ymm11, ymm11, ymm1
| 1 | | | | | | | 1.0 | | add rdx, 0x4
| 1* | | | | | | | | | cmp rdx, rsi
| 0*F | | | | | | | | | jb 0xffffffffffffff02
Total Num Of Uops: 62
Analysis Notes:
Backend allocation was stalled due to unavailable allocation resources.

View File

@@ -1,156 +0,0 @@
[0] Code Region
Iterations: 100
Instructions: 5600
Total Cycles: 2352
Total uOps: 6300
Dispatch Width: 6
uOps Per Cycle: 2.68
IPC: 2.38
Block RThroughput: 10.5
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 6 0.50 * vmovdqu (%rbx,%rdx,4), %xmm0
1 2 1.00 vmovq %xmm0, %rcx
1 1 1.00 vpunpckhqdq %xmm0, %xmm0, %xmm2
1 2 1.00 vmovq %xmm2, %r15
1 1 0.25 movl %ecx, %r8d
1 1 0.50 shrq $32, %rcx
1 1 0.50 leal (%rcx,%rcx,2), %r14d
1 1 0.50 leal (%r8,%r8,2), %r8d
1 1 0.25 movslq %r8d, %rcx
1 1 0.25 movslq %r14d, %r8
1 1 0.25 movl %r15d, %r14d
1 1 0.50 shrq $32, %r15
1 6 0.50 * vmovups (%r11,%rcx,8), %xmm7
1 6 0.50 * vmovups (%r11,%r8,8), %xmm6
1 5 0.50 * vmovq 16(%r11,%rcx,8), %xmm14
1 1 0.50 leal (%r14,%r14,2), %r14d
1 1 0.25 movslq %r14d, %r14
1 1 0.50 leal (%r15,%r15,2), %r15d
1 1 0.25 movslq %r15d, %r15
2 6 1.00 * vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15
2 7 0.50 * vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1
1 5 0.50 * vmovq 16(%r11,%r14,8), %xmm0
2 7 0.50 * vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6
2 6 1.00 * vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2
1 1 1.00 vunpcklpd %ymm6, %ymm1, %ymm14
1 1 1.00 vunpckhpd %ymm6, %ymm1, %ymm1
1 4 0.50 vsubpd %ymm14, %ymm10, %ymm6
1 3 1.00 vinsertf128 $1, %xmm2, %ymm15, %ymm7
1 4 0.50 vsubpd %ymm1, %ymm9, %ymm2
1 4 0.50 vsubpd %ymm7, %ymm8, %ymm0
1 4 0.50 vmulpd %ymm2, %ymm2, %ymm14
1 4 0.50 vfmadd231pd %ymm6, %ymm6, %ymm14
1 4 0.50 vfmadd231pd %ymm0, %ymm0, %ymm14
1 4 0.50 vcmpltpd %ymm5, %ymm14, %ymm1
1 1 0.50 vpcmpeqd %ymm7, %ymm7, %ymm7
2 3 1.00 vptest %ymm7, %ymm1
1 14 5.00 vdivpd %ymm14, %ymm4, %ymm7
2 11 0.50 * vmulpd 96(%rsp), %ymm7, %ymm14
1 4 0.50 vmulpd %ymm14, %ymm7, %ymm14
1 4 0.50 vmulpd %ymm14, %ymm7, %ymm15
1 4 0.50 vfmsub213pd %ymm3, %ymm7, %ymm14
2 11 0.50 * vmulpd 64(%rsp), %ymm7, %ymm7
1 4 0.50 vmulpd %ymm7, %ymm15, %ymm15
1 4 0.50 vmulpd %ymm14, %ymm15, %ymm7
1 4 0.50 vmulpd %ymm7, %ymm6, %ymm6
1 4 0.50 vmulpd %ymm7, %ymm2, %ymm2
1 1 0.33 vandpd %ymm6, %ymm1, %ymm6
1 4 0.50 vaddpd %ymm6, %ymm13, %ymm13
1 4 0.50 vmulpd %ymm7, %ymm0, %ymm6
1 1 0.33 vandpd %ymm2, %ymm1, %ymm0
1 1 0.33 vandpd %ymm6, %ymm1, %ymm1
1 4 0.50 vaddpd %ymm0, %ymm12, %ymm12
1 4 0.50 vaddpd %ymm1, %ymm11, %ymm11
1 1 0.25 addq $4, %rdx
1 1 0.25 cmpq %rsi, %rdx
1 1 0.50 jb ..B1.22
Resources:
[0] - SKXDivider
[1] - SKXFPDivider
[2] - SKXPort0
[3] - SKXPort1
[4] - SKXPort2
[5] - SKXPort3
[6] - SKXPort4
[7] - SKXPort5
[8] - SKXPort6
[9] - SKXPort7
Resource pressure per iteration:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
- 5.00 16.00 14.12 5.50 5.50 - 13.47 8.41 -
Resource pressure by instruction:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
- - - - 0.50 0.50 - - - - vmovdqu (%rbx,%rdx,4), %xmm0
- - 1.00 - - - - - - - vmovq %xmm0, %rcx
- - - - - - - 1.00 - - vpunpckhqdq %xmm0, %xmm0, %xmm2
- - 1.00 - - - - - - - vmovq %xmm2, %r15
- - - - - - - - 1.00 - movl %ecx, %r8d
- - 0.06 - - - - - 0.94 - shrq $32, %rcx
- - - 0.02 - - - 0.98 - - leal (%rcx,%rcx,2), %r14d
- - - 0.02 - - - 0.98 - - leal (%r8,%r8,2), %r8d
- - 0.47 0.02 - - - - 0.51 - movslq %r8d, %rcx
- - 0.46 0.02 - - - 0.01 0.51 - movslq %r14d, %r8
- - 0.03 0.01 - - - 0.45 0.51 - movl %r15d, %r14d
- - 0.51 - - - - - 0.49 - shrq $32, %r15
- - - - 0.49 0.51 - - - - vmovups (%r11,%rcx,8), %xmm7
- - - - 0.49 0.51 - - - - vmovups (%r11,%r8,8), %xmm6
- - - - 0.52 0.48 - - - - vmovq 16(%r11,%rcx,8), %xmm14
- - - 0.02 - - - 0.98 - - leal (%r14,%r14,2), %r14d
- - 0.01 0.01 - - - 0.01 0.97 - movslq %r14d, %r14
- - - 0.03 - - - 0.97 - - leal (%r15,%r15,2), %r15d
- - 0.04 - - - - - 0.96 - movslq %r15d, %r15
- - - - 0.07 0.93 - 1.00 - - vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15
- - 0.03 0.46 0.49 0.51 - 0.51 - - vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1
- - - - 0.51 0.49 - - - - vmovq 16(%r11,%r14,8), %xmm0
- - 0.47 0.02 0.93 0.07 - 0.51 - - vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6
- - - - 0.50 0.50 - 1.00 - - vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2
- - - - - - - 1.00 - - vunpcklpd %ymm6, %ymm1, %ymm14
- - - - - - - 1.00 - - vunpckhpd %ymm6, %ymm1, %ymm1
- - 0.01 0.99 - - - - - - vsubpd %ymm14, %ymm10, %ymm6
- - - - - - - 1.00 - - vinsertf128 $1, %xmm2, %ymm15, %ymm7
- - 0.96 0.04 - - - - - - vsubpd %ymm1, %ymm9, %ymm2
- - 0.49 0.51 - - - - - - vsubpd %ymm7, %ymm8, %ymm0
- - 0.48 0.52 - - - - - - vmulpd %ymm2, %ymm2, %ymm14
- - 0.03 0.97 - - - - - - vfmadd231pd %ymm6, %ymm6, %ymm14
- - 0.94 0.06 - - - - - - vfmadd231pd %ymm0, %ymm0, %ymm14
- - 0.47 0.53 - - - - - - vcmpltpd %ymm5, %ymm14, %ymm1
- - 0.96 0.04 - - - - - - vpcmpeqd %ymm7, %ymm7, %ymm7
- - 1.00 - - - - 1.00 - - vptest %ymm7, %ymm1
- 5.00 1.00 - - - - - - - vdivpd %ymm14, %ymm4, %ymm7
- - 0.93 0.07 0.49 0.51 - - - - vmulpd 96(%rsp), %ymm7, %ymm14
- - 0.05 0.95 - - - - - - vmulpd %ymm14, %ymm7, %ymm14
- - 0.02 0.98 - - - - - - vmulpd %ymm14, %ymm7, %ymm15
- - 0.98 0.02 - - - - - - vfmsub213pd %ymm3, %ymm7, %ymm14
- - 0.07 0.93 0.51 0.49 - - - - vmulpd 64(%rsp), %ymm7, %ymm7
- - 0.01 0.99 - - - - - - vmulpd %ymm7, %ymm15, %ymm15
- - 0.01 0.99 - - - - - - vmulpd %ymm14, %ymm15, %ymm7
- - 0.03 0.97 - - - - - - vmulpd %ymm7, %ymm6, %ymm6
- - 0.97 0.03 - - - - - - vmulpd %ymm7, %ymm2, %ymm2
- - 0.03 0.90 - - - 0.07 - - vandpd %ymm6, %ymm1, %ymm6
- - 0.06 0.94 - - - - - - vaddpd %ymm6, %ymm13, %ymm13
- - 0.03 0.97 - - - - - - vmulpd %ymm7, %ymm0, %ymm6
- - 0.46 0.08 - - - 0.46 - - vandpd %ymm2, %ymm1, %ymm0
- - 0.47 0.01 - - - 0.52 - - vandpd %ymm6, %ymm1, %ymm1
- - 0.48 0.52 - - - - - - vaddpd %ymm0, %ymm12, %ymm12
- - 0.52 0.48 - - - - - - vaddpd %ymm1, %ymm11, %ymm11
- - 0.01 - - - - - 0.99 - addq $4, %rdx
- - - - - - - 0.02 0.98 - cmpq %rsi, %rdx
- - 0.45 - - - - - 0.55 - jb ..B1.22

View File

@@ -1,158 +0,0 @@
[0] Code Region
Iterations: 100
Instructions: 5600
Total Cycles: 2306
Total uOps: 6300
Dispatch Width: 6
uOps Per Cycle: 2.73
IPC: 2.43
Block RThroughput: 10.5
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 6 0.50 * vmovdqu (%rbx,%rdx,4), %xmm0
1 2 1.00 vmovq %xmm0, %rcx
1 1 0.50 vpunpckhqdq %xmm0, %xmm0, %xmm2
1 2 1.00 vmovq %xmm2, %r15
1 1 0.25 movl %ecx, %r8d
1 1 0.50 shrq $32, %rcx
1 1 0.50 leal (%rcx,%rcx,2), %r14d
1 1 0.50 leal (%r8,%r8,2), %r8d
1 1 0.25 movslq %r8d, %rcx
1 1 0.25 movslq %r14d, %r8
1 1 0.25 movl %r15d, %r14d
1 1 0.50 shrq $32, %r15
1 6 0.50 * vmovups (%r11,%rcx,8), %xmm7
1 6 0.50 * vmovups (%r11,%r8,8), %xmm6
1 5 0.50 * vmovq 16(%r11,%rcx,8), %xmm14
1 1 0.50 leal (%r14,%r14,2), %r14d
1 1 0.25 movslq %r14d, %r14
1 1 0.50 leal (%r15,%r15,2), %r15d
1 1 0.25 movslq %r15d, %r15
2 6 1.00 * vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15
2 7 0.50 * vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1
1 5 0.50 * vmovq 16(%r11,%r14,8), %xmm0
2 7 0.50 * vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6
2 6 1.00 * vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2
1 1 1.00 vunpcklpd %ymm6, %ymm1, %ymm14
1 1 1.00 vunpckhpd %ymm6, %ymm1, %ymm1
1 4 0.50 vsubpd %ymm14, %ymm10, %ymm6
1 3 1.00 vinsertf128 $1, %xmm2, %ymm15, %ymm7
1 4 0.50 vsubpd %ymm1, %ymm9, %ymm2
1 4 0.50 vsubpd %ymm7, %ymm8, %ymm0
1 4 0.50 vmulpd %ymm2, %ymm2, %ymm14
1 4 0.50 vfmadd231pd %ymm6, %ymm6, %ymm14
1 4 0.50 vfmadd231pd %ymm0, %ymm0, %ymm14
1 4 0.50 vcmpltpd %ymm5, %ymm14, %ymm1
1 1 0.50 vpcmpeqd %ymm7, %ymm7, %ymm7
2 3 1.00 vptest %ymm7, %ymm1
1 14 5.00 vdivpd %ymm14, %ymm4, %ymm7
2 11 0.50 * vmulpd 96(%rsp), %ymm7, %ymm14
1 4 0.50 vmulpd %ymm14, %ymm7, %ymm14
1 4 0.50 vmulpd %ymm14, %ymm7, %ymm15
1 4 0.50 vfmsub213pd %ymm3, %ymm7, %ymm14
2 11 0.50 * vmulpd 64(%rsp), %ymm7, %ymm7
1 4 0.50 vmulpd %ymm7, %ymm15, %ymm15
1 4 0.50 vmulpd %ymm14, %ymm15, %ymm7
1 4 0.50 vmulpd %ymm7, %ymm6, %ymm6
1 4 0.50 vmulpd %ymm7, %ymm2, %ymm2
1 1 0.33 vandpd %ymm6, %ymm1, %ymm6
1 4 0.50 vaddpd %ymm6, %ymm13, %ymm13
1 4 0.50 vmulpd %ymm7, %ymm0, %ymm6
1 1 0.33 vandpd %ymm2, %ymm1, %ymm0
1 1 0.33 vandpd %ymm6, %ymm1, %ymm1
1 4 0.50 vaddpd %ymm0, %ymm12, %ymm12
1 4 0.50 vaddpd %ymm1, %ymm11, %ymm11
1 1 0.25 addq $4, %rdx
1 1 0.25 cmpq %rsi, %rdx
1 1 0.50 jb ..B1.22
Resources:
[0] - ICXDivider
[1] - ICXFPDivider
[2] - ICXPort0
[3] - ICXPort1
[4] - ICXPort2
[5] - ICXPort3
[6] - ICXPort4
[7] - ICXPort5
[8] - ICXPort6
[9] - ICXPort7
[10] - ICXPort8
[11] - ICXPort9
Resource pressure per iteration:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11]
- 5.00 15.12 15.03 5.50 5.50 - 13.45 8.40 - - -
Resource pressure by instruction:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions:
- - - - 0.50 0.50 - - - - - - vmovdqu (%rbx,%rdx,4), %xmm0
- - 1.00 - - - - - - - - - vmovq %xmm0, %rcx
- - - 0.46 - - - 0.54 - - - - vpunpckhqdq %xmm0, %xmm0, %xmm2
- - 1.00 - - - - - - - - - vmovq %xmm2, %r15
- - - - - - - - 1.00 - - - movl %ecx, %r8d
- - 0.96 - - - - - 0.04 - - - shrq $32, %rcx
- - - 0.01 - - - 0.99 - - - - leal (%rcx,%rcx,2), %r14d
- - - 0.03 - - - 0.97 - - - - leal (%r8,%r8,2), %r8d
- - 0.48 0.01 - - - - 0.51 - - - movslq %r8d, %rcx
- - 0.02 0.02 - - - 0.01 0.95 - - - movslq %r14d, %r8
- - 0.02 - - - - - 0.98 - - - movl %r15d, %r14d
- - 0.52 - - - - - 0.48 - - - shrq $32, %r15
- - - - 0.49 0.51 - - - - - - vmovups (%r11,%rcx,8), %xmm7
- - - - 0.49 0.51 - - - - - - vmovups (%r11,%r8,8), %xmm6
- - - - 0.52 0.48 - - - - - - vmovq 16(%r11,%rcx,8), %xmm14
- - - 0.47 - - - 0.53 - - - - leal (%r14,%r14,2), %r14d
- - 0.01 0.01 - - - 0.01 0.97 - - - movslq %r14d, %r14
- - - 0.04 - - - 0.96 - - - - leal (%r15,%r15,2), %r15d
- - 0.48 - - - - 0.01 0.51 - - - movslq %r15d, %r15
- - - - 0.51 0.49 - 1.00 - - - - vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15
- - 0.02 0.01 0.95 0.05 - 0.97 - - - - vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1
- - - - 0.05 0.95 - - - - - - vmovq 16(%r11,%r14,8), %xmm0
- - 0.02 0.49 0.49 0.51 - 0.49 - - - - vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6
- - - - 0.50 0.50 - 1.00 - - - - vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2
- - - - - - - 1.00 - - - - vunpcklpd %ymm6, %ymm1, %ymm14
- - - - - - - 1.00 - - - - vunpckhpd %ymm6, %ymm1, %ymm1
- - 0.47 0.53 - - - - - - - - vsubpd %ymm14, %ymm10, %ymm6
- - - - - - - 1.00 - - - - vinsertf128 $1, %xmm2, %ymm15, %ymm7
- - 0.50 0.50 - - - - - - - - vsubpd %ymm1, %ymm9, %ymm2
- - 0.94 0.06 - - - - - - - - vsubpd %ymm7, %ymm8, %ymm0
- - 0.06 0.94 - - - - - - - - vmulpd %ymm2, %ymm2, %ymm14
- - 0.04 0.96 - - - - - - - - vfmadd231pd %ymm6, %ymm6, %ymm14
- - 0.95 0.05 - - - - - - - - vfmadd231pd %ymm0, %ymm0, %ymm14
- - 0.02 0.98 - - - - - - - - vcmpltpd %ymm5, %ymm14, %ymm1
- - 0.05 0.95 - - - - - - - - vpcmpeqd %ymm7, %ymm7, %ymm7
- - 1.00 - - - - 1.00 - - - - vptest %ymm7, %ymm1
- 5.00 1.00 - - - - - - - - - vdivpd %ymm14, %ymm4, %ymm7
- - 0.51 0.49 0.49 0.51 - - - - - - vmulpd 96(%rsp), %ymm7, %ymm14
- - 0.04 0.96 - - - - - - - - vmulpd %ymm14, %ymm7, %ymm14
- - 0.01 0.99 - - - - - - - - vmulpd %ymm14, %ymm7, %ymm15
- - 0.99 0.01 - - - - - - - - vfmsub213pd %ymm3, %ymm7, %ymm14
- - 0.49 0.51 0.51 0.49 - - - - - - vmulpd 64(%rsp), %ymm7, %ymm7
- - 0.01 0.99 - - - - - - - - vmulpd %ymm7, %ymm15, %ymm15
- - 0.01 0.99 - - - - - - - - vmulpd %ymm14, %ymm15, %ymm7
- - 0.48 0.52 - - - - - - - - vmulpd %ymm7, %ymm6, %ymm6
- - 0.52 0.48 - - - - - - - - vmulpd %ymm7, %ymm2, %ymm2
- - 0.46 0.02 - - - 0.52 - - - - vandpd %ymm6, %ymm1, %ymm6
- - 0.49 0.51 - - - - - - - - vaddpd %ymm6, %ymm13, %ymm13
- - 0.48 0.52 - - - - - - - - vmulpd %ymm7, %ymm0, %ymm6
- - 0.02 0.52 - - - 0.46 - - - - vandpd %ymm2, %ymm1, %ymm0
- - 0.02 - - - - 0.98 - - - - vandpd %ymm6, %ymm1, %ymm1
- - 0.49 0.51 - - - - - - - - vaddpd %ymm0, %ymm12, %ymm12
- - 0.51 0.49 - - - - - - - - vaddpd %ymm1, %ymm11, %ymm11
- - 0.01 - - - - - 0.99 - - - addq $4, %rdx
- - 0.01 - - - - 0.01 0.98 - - - cmpq %rsi, %rdx
- - 0.01 - - - - - 0.99 - - - jb ..B1.22

View File

@@ -1,97 +0,0 @@
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
Analyzed file: lammps-icc-avx2.s
Architecture: CSX
Timestamp: 2023-02-10 16:29:58
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
----------------------------------------------------------------------------------------------------
256 | | | | | | | | || | | # pointer_increment=32 724d27eafcb27eabca1528ddfdbdba3e
257 | | | | | | | | || | | # LLVM-MCA-BEGIN
258 | | | | | | | | || | | ..B1.22: # Preds ..B1.24 ..B1.21
259 | | | | | | | | || | | # Execution count [2.50e+01]
260 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovdqu (%rbx,%rdx,4), %xmm0 #60.21
261 | 1.00 | | | | | | | || 1.0 | | vmovq %xmm0, %rcx #60.21
262 | | | | | | 1.000 | | || | | vpunpckhqdq %xmm0, %xmm0, %xmm2 #60.21
263 | 1.00 | | | | | | | || | | vmovq %xmm2, %r15 #60.21
264 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || 1.0 | | movl %ecx, %r8d #60.21
265 | 0.00 | | | | | | 1.00 | || | | shrq $32, %rcx #60.21
266 | | 0.500 | | | | 0.500 | | || | | lea (%rcx,%rcx,2), %r14d #61.36
267 | | 0.500 | | | | 0.500 | | || 1.0 | | lea (%r8,%r8,2), %r8d #61.36
268 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || 1.0 | | movslq %r8d, %rcx #61.36
269 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movslq %r14d, %r8 #61.36
270 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movl %r15d, %r14d #60.21
271 | 0.00 | | | | | | 1.00 | || | | shrq $32, %r15 #60.21
272 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovups (%r11,%rcx,8), %xmm7 #61.36
273 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%r11,%r8,8), %xmm6 #61.36
274 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovq 16(%r11,%rcx,8), %xmm14 #61.36
275 | | 0.500 | | | | 0.500 | | || | | lea (%r14,%r14,2), %r14d #61.36
276 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movslq %r14d, %r14 #61.36
277 | | 0.500 | | | | 0.500 | | || | | lea (%r15,%r15,2), %r15d #61.36
278 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movslq %r15d, %r15 #61.36
279 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || | | vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15 #61.36
280 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || 3.0 | | vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1 #61.36
281 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovq 16(%r11,%r14,8), %xmm0 #61.36
282 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || | | vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6 #61.36
283 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || | | vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2 #61.36
284 | | | | | | 1.000 | | || | | vunpcklpd %ymm6, %ymm1, %ymm14 #61.36
285 | | | | | | 1.000 | | || 1.0 | | vunpckhpd %ymm6, %ymm1, %ymm1 #61.36
286 | 0.50 | 0.500 | | | | | | || | | vsubpd %ymm14, %ymm10, %ymm6 #61.36
287 | | | | | | 1.000 | | || | | vinsertf128 $1, %xmm2, %ymm15, %ymm7 #61.36
288 | 0.50 | 0.500 | | | | | | || 4.0 | | vsubpd %ymm1, %ymm9, %ymm2 #62.36
289 | 0.50 | 0.500 | | | | | | || | | vsubpd %ymm7, %ymm8, %ymm0 #63.36
290 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm2, %ymm2, %ymm14 #64.49
291 | 0.50 | 0.500 | | | | | | || 4.0 | | vfmadd231pd %ymm6, %ymm6, %ymm14 #64.49
292 | 0.50 | 0.500 | | | | | | || 4.0 | | vfmadd231pd %ymm0, %ymm0, %ymm14 #64.63
293 | | | | | | 1.000 | | || | | vcmpltpd %ymm5, %ymm14, %ymm1 #74.22
294 | 0.50 | 0.500 | | | | | | || | | vpcmpeqd %ymm7, %ymm7, %ymm7 #74.22
295 | 1.00 | | | | | 1.000 | | || | | vptest %ymm7, %ymm1 #74.22
296 | | | | | | | | || | | #je ..B1.24 # Prob 50% #74.22
297 | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ymm14
298 | | | | | | | | || | | ..B1.23: # Preds ..B1.22
299 | | | | | | | | || | | # Execution count [1.25e+01]
300 | 1.00 8.00 | | | | | | | || 15.0 | | vdivpd %ymm14, %ymm4, %ymm7 #75.39
301 | 0.50 | 0.500 | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmulpd 96(%rsp), %ymm7, %ymm14 #76.38[spill]
302 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm14 #76.44
303 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm15 #76.50
304 | 0.50 | 0.500 | | | | | | || | | vfmsub213pd %ymm3, %ymm7, %ymm14 #77.55
305 | 0.50 | 0.500 | 0.50 0.50 | 0.50 0.50 | | | | || | | vmulpd 64(%rsp), %ymm7, %ymm7 #77.55[spill]
306 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm7, %ymm15, %ymm15 #77.64
307 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm14, %ymm15, %ymm7 #77.70
308 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm7, %ymm6, %ymm6 #78.31
309 | 0.50 | 0.500 | | | | | | || | | vmulpd %ymm7, %ymm2, %ymm2 #79.31
310 | 0.25 | 0.253 | | | | 0.493 | | || 1.0 | | vandpd %ymm6, %ymm1, %ymm6 #78.31
311 | 0.50 | 0.500 | | | | | | || 4.0 | | vaddpd %ymm6, %ymm13, %ymm13 #78.17
312 | 0.25 | 0.750 | | | | | | || | | vmulpd %ymm7, %ymm0, %ymm6 #80.31
313 | 0.16 | 0.417 | | | | 0.423 | | || | | vandpd %ymm2, %ymm1, %ymm0 #79.31
314 | 0.00 | 0.250 | | | | 0.750 | | || | | vandpd %ymm6, %ymm1, %ymm1 #80.31
315 | 0.00 | 1.000 | | | | | | || | | vaddpd %ymm0, %ymm12, %ymm12 #79.17
316 | 0.50 | 0.500 | | | | | | || | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17
317 | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm3 ymm4 ymm5 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13
318 | | | | | | | | || | | ..B1.24: # Preds ..B1.23 ..B1.22
319 | | | | | | | | || | | # Execution count [2.50e+01]
320 | 0.00 | 0.000 | | | | -0.01 | 1.00 | || | | addq $4, %rdx #59.9
321 | 0.00 | -0.01 | | | | 0.000 | 1.00 | || | | cmpq %rsi, %rdx #59.9
322 | | | | | | | | || | | * jb ..B1.22 # Prob 82% #59.9
323 | | | | | | | | || | | # LLVM-MCA-END
13.7 8.00 13.66 5.50 5.50 5.50 5.50 13.66 10.0 76.0 4.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
316 | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17| [316]
315 | 4.0 | vaddpd %ymm0, %ymm12, %ymm12 #79.17| [315]
311 | 4.0 | vaddpd %ymm6, %ymm13, %ymm13 #78.17| [311]
320 | 1.0 | addq $4, %rdx #59.9| [320]

View File

@@ -1,97 +0,0 @@
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
Analyzed file: lammps-icc-avx2.s
Architecture: ICX
Timestamp: 2023-02-10 16:29:48
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD |
-----------------------------------------------------------------------------------------------------------------------
256 | | | | | | | | | | || | | # pointer_increment=32 724d27eafcb27eabca1528ddfdbdba3e
257 | | | | | | | | | | || | | # LLVM-MCA-BEGIN
258 | | | | | | | | | | || | | ..B1.22: # Preds ..B1.24 ..B1.21
259 | | | | | | | | | | || | | # Execution count [2.50e+01]
260 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovdqu (%rbx,%rdx,4), %xmm0 #60.21
261 | 1.00 | | | | | | | | | || 1.0 | | vmovq %xmm0, %rcx #60.21
262 | | 0.50 | | | | 0.50 | | | | || | | vpunpckhqdq %xmm0, %xmm0, %xmm2 #60.21
263 | 1.00 | | | | | | | | | || | | vmovq %xmm2, %r15 #60.21
264 | 0.37 | 0.00 | | | | 0.25 | 0.38 | | | || 1.0 | | movl %ecx, %r8d #60.21
265 | 0.50 | | | | | | 0.50 | | | || | | shrq $32, %rcx #60.21
266 | 0.13 | 0.00 | | | | 0.00 | 0.87 | | | || | | lea (%rcx,%rcx,2), %r14d #61.36
267 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || 6.0 | | lea (%r8,%r8,2), %r8d #61.36
268 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || 1.0 | | movslq %r8d, %rcx #61.36
269 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movslq %r14d, %r8 #61.36
270 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movl %r15d, %r14d #60.21
271 | 0.00 | | | | | | 1.00 | | | || | | shrq $32, %r15 #60.21
272 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovups (%r11,%rcx,8), %xmm7 #61.36
273 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovups (%r11,%r8,8), %xmm6 #61.36
274 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovq 16(%r11,%rcx,8), %xmm14 #61.36
275 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | lea (%r14,%r14,2), %r14d #61.36
276 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movslq %r14d, %r14 #61.36
277 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | lea (%r15,%r15,2), %r15d #61.36
278 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movslq %r15d, %r15 #61.36
279 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || | | vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15 #61.36
280 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || 3.0 | | vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1 #61.36
281 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovq 16(%r11,%r14,8), %xmm0 #61.36
282 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || | | vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6 #61.36
283 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || | | vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2 #61.36
284 | | | | | | 1.00 | | | | || | | vunpcklpd %ymm6, %ymm1, %ymm14 #61.36
285 | | | | | | 1.00 | | | | || 1.0 | | vunpckhpd %ymm6, %ymm1, %ymm1 #61.36
286 | 0.50 | 0.50 | | | | | | | | || | | vsubpd %ymm14, %ymm10, %ymm6 #61.36
287 | | | | | | 1.00 | | | | || | | vinsertf128 $1, %xmm2, %ymm15, %ymm7 #61.36
288 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vsubpd %ymm1, %ymm9, %ymm2 #62.36
289 | 0.50 | 0.50 | | | | | | | | || | | vsubpd %ymm7, %ymm8, %ymm0 #63.36
290 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm2, %ymm2, %ymm14 #64.49
291 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vfmadd231pd %ymm6, %ymm6, %ymm14 #64.49
292 | 0.75 | 0.25 | | | | | | | | || 4.0 | | vfmadd231pd %ymm0, %ymm0, %ymm14 #64.63
293 | 0.00 | | | | | 1.00 | | | | || | | vcmpltpd %ymm5, %ymm14, %ymm1 #74.22
294 | 0.50 | 0.50 | | | | | | | | || | | vpcmpeqd %ymm7, %ymm7, %ymm7 #74.22
295 | 1.00 | | | | | 1.00 | | | | || | | vptest %ymm7, %ymm1 #74.22
296 | | | | | | | | | | || | | #je ..B1.24 # Prob 50% #74.22
297 | | | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ymm14
298 | | | | | | | | | | || | | ..B1.23: # Preds ..B1.22
299 | | | | | | | | | | || | | # Execution count [1.25e+01]
300 | 1.00 8.00 | | | | | | | | | || 13.0 | | vdivpd %ymm14, %ymm4, %ymm7 #75.39
301 | 0.50 | 0.50 | 0.50 0.50 | 0.50 0.50 | | | | | | || 4.0 | | vmulpd 96(%rsp), %ymm7, %ymm14 #76.38[spill]
302 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm14 #76.44
303 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm15 #76.50
304 | 0.50 | 0.50 | | | | | | | | || | | vfmsub213pd %ymm3, %ymm7, %ymm14 #77.55
305 | 0.50 | 0.50 | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmulpd 64(%rsp), %ymm7, %ymm7 #77.55[spill]
306 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm7, %ymm15, %ymm15 #77.64
307 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm14, %ymm15, %ymm7 #77.70
308 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm7, %ymm6, %ymm6 #78.31
309 | 0.00 | 1.00 | | | | | | | | || | | vmulpd %ymm7, %ymm2, %ymm2 #79.31
310 | 0.00 | 0.00 | | | | 1.00 | | | | || 1.0 | | vandpd %ymm6, %ymm1, %ymm6 #78.31
311 | 0.00 | 1.00 | | | | | | | | || 4.0 | | vaddpd %ymm6, %ymm13, %ymm13 #78.17
312 | 0.00 | 1.00 | | | | | | | | || | | vmulpd %ymm7, %ymm0, %ymm6 #80.31
313 | 0.00 | 0.00 | | | | 1.00 | | | | || | | vandpd %ymm2, %ymm1, %ymm0 #79.31
314 | 0.00 | 0.00 | | | | 1.00 | | | | || | | vandpd %ymm6, %ymm1, %ymm1 #80.31
315 | 0.00 | 1.00 | | | | | | | | || | | vaddpd %ymm0, %ymm12, %ymm12 #79.17
316 | 0.00 | 1.00 | | | | | | | | || | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17
317 | | | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm3 ymm4 ymm5 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13
318 | | | | | | | | | | || | | ..B1.24: # Preds ..B1.23 ..B1.22
319 | | | | | | | | | | || | | # Execution count [2.50e+01]
320 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | addq $4, %rdx #59.9
321 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | cmpq %rsi, %rdx #59.9
322 | | | | | | | | | | || | | * jb ..B1.22 # Prob 82% #59.9
323 | | | | | | | | | | || | | # LLVM-MCA-END
12.8 8.00 12.8 5.50 5.50 5.50 5.50 12.8 12.8 81 4.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
316 | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17| [316]
315 | 4.0 | vaddpd %ymm0, %ymm12, %ymm12 #79.17| [315]
311 | 4.0 | vaddpd %ymm6, %ymm13, %ymm13 #78.17| [311]
320 | 1.0 | addq $4, %rdx #59.9| [320]

View File

@@ -1,75 +0,0 @@
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
Analyzed File - lammps-icc-avx512.o
Binary Format - 64Bit
Architecture - SKX
Analysis Type - Throughput
Throughput Analysis Report
--------------------------
Block Throughput: 30.89 Cycles Throughput Bottleneck: Backend
Loop Count: 22
Port Binding In Cycles Per Iteration:
--------------------------------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
--------------------------------------------------------------------------------------------------
| Cycles | 19.0 0.0 | 4.0 | 13.0 13.0 | 13.0 13.0 | 0.0 | 17.0 | 4.0 | 0.0 |
--------------------------------------------------------------------------------------------------
DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3)
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion occurred
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
X - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
-----------------------------------------------------------------------------------------
| 1 | | | | | | 1.0 | | | vpcmpgtd k5, ymm3, ymm4
| 1 | | 1.0 | | | | | | | vpaddd ymm4, ymm4, ymm15
| 2 | | 1.0 | 1.0 1.0 | | | | | | vmovdqu32 ymm17{k5}{z}, ymmword ptr [r10+r15*4]
| 1 | | 1.0 | | | | | | | vpaddd ymm18, ymm17, ymm17
| 1 | | | | | | | 1.0 | | add r15, 0x8
| 1 | | 1.0 | | | | | | | vpaddd ymm19, ymm17, ymm18
| 1 | 1.0 | | | | | | | | kmovw k2, k5
| 1 | 1.0 | | | | | | | | kmovw k3, k5
| 1 | 1.0 | | | | | | | | kmovw k1, k5
| 1* | | | | | | | | | vpxord zmm21, zmm21, zmm21
| 1* | | | | | | | | | vpxord zmm20, zmm20, zmm20
| 1* | | | | | | | | | vpxord zmm22, zmm22, zmm22
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm21, k2, zmmword ptr [rbx+ymm19*8+0x8]
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm20, k3, zmmword ptr [rbx+ymm19*8]
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm22, k1, zmmword ptr [rbx+ymm19*8+0x10]
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm18, zmm1, zmm21
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm17, zmm2, zmm20
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm19, zmm0, zmm22
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm31, zmm18, zmm18
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm31, zmm17, zmm17
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm31, zmm19, zmm19
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm30, zmm31
| 1 | | | | | | 1.0 | | | vcmppd k6{k5}, zmm31, zmm14, 0x1
| 1 | | | | | | 1.0 | | | vfpclasspd k0, zmm30, 0x1e
| 1* | | | | | | | | | vmovaps zmm23, zmm31
| 2^ | 1.0 | | | 1.0 1.0 | | | | | vfnmadd213pd zmm23, zmm30, qword ptr [rip]{1to8}
| 1 | 1.0 | | | | | | | | knotw k4, k0
| 1 | | | | | | 1.0 | | | vmulpd zmm24, zmm23, zmm23
| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm30{k4}, zmm23, zmm30
| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm30{k4}, zmm24, zmm30
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm25, zmm30, zmm13
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm27, zmm30, zmm12
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm28, zmm30, zmm25
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm26, zmm30, zmm28
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213pd zmm30, zmm28, zmm5
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm29, zmm26, zmm27
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm23, zmm29, zmm30
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm10{k6}, zmm23, zmm17
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm9{k6}, zmm23, zmm18
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm8{k6}, zmm23, zmm19
| 1* | | | | | | | | | cmp r15, r14
| 0*F | | | | | | | | | jb 0xffffffffffffff0c
Total Num Of Uops: 57
Analysis Notes:
Backend allocation was stalled due to unavailable allocation resources.
There were bubbles in the frontend.

View File

@@ -1,128 +0,0 @@
[0] Code Region
Iterations: 100
Instructions: 4200
Total Cycles: 2465
Total uOps: 5800
Dispatch Width: 6
uOps Per Cycle: 2.35
IPC: 1.70
Block RThroughput: 13.0
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 4 1.00 vpcmpgtd %ymm4, %ymm3, %k5
1 1 0.33 vpaddd %ymm15, %ymm4, %ymm4
2 8 0.50 * vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z}
1 1 0.33 vpaddd %ymm17, %ymm17, %ymm18
1 1 0.25 addq $8, %r15
1 1 0.33 vpaddd %ymm18, %ymm17, %ymm19
1 1 1.00 kmovw %k5, %k2
1 1 1.00 kmovw %k5, %k3
1 1 1.00 kmovw %k5, %k1
1 0 0.17 vpxord %zmm21, %zmm21, %zmm21
1 0 0.17 vpxord %zmm20, %zmm20, %zmm20
1 0 0.17 vpxord %zmm22, %zmm22, %zmm22
5 21 4.00 * vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2}
5 21 4.00 * vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3}
5 21 4.00 * vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1}
1 4 0.50 vsubpd %zmm21, %zmm1, %zmm18
1 4 0.50 vsubpd %zmm20, %zmm2, %zmm17
1 4 0.50 vsubpd %zmm22, %zmm0, %zmm19
1 4 0.50 vmulpd %zmm18, %zmm18, %zmm31
1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm31
1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm31
3 4 2.00 vrcp14pd %zmm31, %zmm30
1 4 1.00 vcmpltpd %zmm14, %zmm31, %k6 {%k5}
1 4 1.00 vfpclasspd $30, %zmm30, %k0
1 1 0.50 vmovaps %zmm31, %zmm23
2 11 0.50 * vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
1 1 1.00 knotw %k0, %k4
1 4 0.50 vmulpd %zmm23, %zmm23, %zmm24
1 4 0.50 vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4}
1 4 0.50 vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4}
1 4 0.50 vmulpd %zmm13, %zmm30, %zmm25
1 4 0.50 vmulpd %zmm12, %zmm30, %zmm27
1 4 0.50 vmulpd %zmm25, %zmm30, %zmm28
1 4 0.50 vmulpd %zmm28, %zmm30, %zmm26
1 4 0.50 vfmsub213pd %zmm5, %zmm28, %zmm30
1 4 0.50 vmulpd %zmm27, %zmm26, %zmm29
1 4 0.50 vmulpd %zmm30, %zmm29, %zmm23
1 4 0.50 vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6}
1 4 0.50 vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6}
1 4 0.50 vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6}
1 1 0.25 cmpq %r14, %r15
1 1 0.50 jb ..B1.16
Resources:
[0] - SKXDivider
[1] - SKXFPDivider
[2] - SKXPort0
[3] - SKXPort1
[4] - SKXPort2
[5] - SKXPort3
[6] - SKXPort4
[7] - SKXPort5
[8] - SKXPort6
[9] - SKXPort7
Resource pressure per iteration:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
- - 19.02 6.79 12.64 13.36 - 16.03 5.16 -
Resource pressure by instruction:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
- - - - - - - 1.00 - - vpcmpgtd %ymm4, %ymm3, %k5
- - 0.28 0.72 - - - - - - vpaddd %ymm15, %ymm4, %ymm4
- - 0.14 0.71 0.55 0.45 - 0.15 - - vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z}
- - - 0.97 - - - 0.03 - - vpaddd %ymm17, %ymm17, %ymm18
- - 0.14 0.41 - - - 0.13 0.32 - addq $8, %r15
- - - 0.99 - - - 0.01 - - vpaddd %ymm18, %ymm17, %ymm19
- - 1.00 - - - - - - - kmovw %k5, %k2
- - 1.00 - - - - - - - kmovw %k5, %k3
- - 1.00 - - - - - - - kmovw %k5, %k1
- - - - - - - - - - vpxord %zmm21, %zmm21, %zmm21
- - - - - - - - - - vpxord %zmm20, %zmm20, %zmm20
- - - - - - - - - - vpxord %zmm22, %zmm22, %zmm22
- - 1.00 0.99 3.52 4.48 - 0.01 1.00 - vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2}
- - 1.00 0.99 4.48 3.52 - 0.01 1.00 - vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3}
- - 1.00 1.00 3.52 4.48 - - 1.00 - vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1}
- - 0.02 - - - - 0.98 - - vsubpd %zmm21, %zmm1, %zmm18
- - 0.17 - - - - 0.83 - - vsubpd %zmm20, %zmm2, %zmm17
- - 0.18 - - - - 0.82 - - vsubpd %zmm22, %zmm0, %zmm19
- - 0.01 - - - - 0.99 - - vmulpd %zmm18, %zmm18, %zmm31
- - 0.69 - - - - 0.31 - - vfmadd231pd %zmm17, %zmm17, %zmm31
- - 0.68 - - - - 0.32 - - vfmadd231pd %zmm19, %zmm19, %zmm31
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm31, %zmm30
- - - - - - - 1.00 - - vcmpltpd %zmm14, %zmm31, %k6 {%k5}
- - - - - - - 1.00 - - vfpclasspd $30, %zmm30, %k0
- - 0.83 - - - - 0.17 - - vmovaps %zmm31, %zmm23
- - 1.00 - 0.57 0.43 - - - - vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
- - 1.00 - - - - - - - knotw %k0, %k4
- - 0.44 - - - - 0.56 - - vmulpd %zmm23, %zmm23, %zmm24
- - 0.56 - - - - 0.44 - - vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4}
- - 0.55 - - - - 0.45 - - vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4}
- - 0.69 - - - - 0.31 - - vmulpd %zmm13, %zmm30, %zmm25
- - 0.31 - - - - 0.69 - - vmulpd %zmm12, %zmm30, %zmm27
- - 0.56 - - - - 0.44 - - vmulpd %zmm25, %zmm30, %zmm28
- - 0.02 - - - - 0.98 - - vmulpd %zmm28, %zmm30, %zmm26
- - 0.98 - - - - 0.02 - - vfmsub213pd %zmm5, %zmm28, %zmm30
- - 0.30 - - - - 0.70 - - vmulpd %zmm27, %zmm26, %zmm29
- - 0.16 - - - - 0.84 - - vmulpd %zmm30, %zmm29, %zmm23
- - 0.17 - - - - 0.83 - - vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6}
- - 0.83 - - - - 0.17 - - vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6}
- - 0.17 - - - - 0.83 - - vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6}
- - - 0.01 - - - 0.01 0.98 - cmpq %r14, %r15
- - 0.14 - - - - - 0.86 - jb ..B1.16

View File

@@ -1,130 +0,0 @@
[0] Code Region
Iterations: 100
Instructions: 4200
Total Cycles: 2465
Total uOps: 5800
Dispatch Width: 6
uOps Per Cycle: 2.35
IPC: 1.70
Block RThroughput: 13.0
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 4 1.00 vpcmpgtd %ymm4, %ymm3, %k5
1 1 0.33 vpaddd %ymm15, %ymm4, %ymm4
2 8 0.50 * vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z}
1 1 0.33 vpaddd %ymm17, %ymm17, %ymm18
1 1 0.25 addq $8, %r15
1 1 0.33 vpaddd %ymm18, %ymm17, %ymm19
1 1 1.00 kmovw %k5, %k2
1 1 1.00 kmovw %k5, %k3
1 1 1.00 kmovw %k5, %k1
1 0 0.17 vpxord %zmm21, %zmm21, %zmm21
1 0 0.17 vpxord %zmm20, %zmm20, %zmm20
1 0 0.17 vpxord %zmm22, %zmm22, %zmm22
5 21 4.00 * vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2}
5 21 4.00 * vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3}
5 21 4.00 * vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1}
1 4 0.50 vsubpd %zmm21, %zmm1, %zmm18
1 4 0.50 vsubpd %zmm20, %zmm2, %zmm17
1 4 0.50 vsubpd %zmm22, %zmm0, %zmm19
1 4 0.50 vmulpd %zmm18, %zmm18, %zmm31
1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm31
1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm31
3 4 2.00 vrcp14pd %zmm31, %zmm30
1 4 1.00 vcmpltpd %zmm14, %zmm31, %k6 {%k5}
1 4 1.00 vfpclasspd $30, %zmm30, %k0
1 1 0.50 vmovaps %zmm31, %zmm23
2 11 0.50 * vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
1 1 1.00 knotw %k0, %k4
1 4 0.50 vmulpd %zmm23, %zmm23, %zmm24
1 4 0.50 vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4}
1 4 0.50 vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4}
1 4 0.50 vmulpd %zmm13, %zmm30, %zmm25
1 4 0.50 vmulpd %zmm12, %zmm30, %zmm27
1 4 0.50 vmulpd %zmm25, %zmm30, %zmm28
1 4 0.50 vmulpd %zmm28, %zmm30, %zmm26
1 4 0.50 vfmsub213pd %zmm5, %zmm28, %zmm30
1 4 0.50 vmulpd %zmm27, %zmm26, %zmm29
1 4 0.50 vmulpd %zmm30, %zmm29, %zmm23
1 4 0.50 vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6}
1 4 0.50 vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6}
1 4 0.50 vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6}
1 1 0.25 cmpq %r14, %r15
1 1 0.50 jb ..B1.16
Resources:
[0] - ICXDivider
[1] - ICXFPDivider
[2] - ICXPort0
[3] - ICXPort1
[4] - ICXPort2
[5] - ICXPort3
[6] - ICXPort4
[7] - ICXPort5
[8] - ICXPort6
[9] - ICXPort7
[10] - ICXPort8
[11] - ICXPort9
Resource pressure per iteration:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11]
- - 19.02 6.79 12.64 13.36 - 16.03 5.16 - - -
Resource pressure by instruction:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions:
- - - - - - - 1.00 - - - - vpcmpgtd %ymm4, %ymm3, %k5
- - 0.28 0.72 - - - - - - - - vpaddd %ymm15, %ymm4, %ymm4
- - 0.14 0.71 0.55 0.45 - 0.15 - - - - vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z}
- - - 0.97 - - - 0.03 - - - - vpaddd %ymm17, %ymm17, %ymm18
- - 0.14 0.41 - - - 0.13 0.32 - - - addq $8, %r15
- - - 0.99 - - - 0.01 - - - - vpaddd %ymm18, %ymm17, %ymm19
- - 1.00 - - - - - - - - - kmovw %k5, %k2
- - 1.00 - - - - - - - - - kmovw %k5, %k3
- - 1.00 - - - - - - - - - kmovw %k5, %k1
- - - - - - - - - - - - vpxord %zmm21, %zmm21, %zmm21
- - - - - - - - - - - - vpxord %zmm20, %zmm20, %zmm20
- - - - - - - - - - - - vpxord %zmm22, %zmm22, %zmm22
- - 1.00 0.99 3.52 4.48 - 0.01 1.00 - - - vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2}
- - 1.00 0.99 4.48 3.52 - 0.01 1.00 - - - vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3}
- - 1.00 1.00 3.52 4.48 - - 1.00 - - - vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1}
- - 0.02 - - - - 0.98 - - - - vsubpd %zmm21, %zmm1, %zmm18
- - 0.17 - - - - 0.83 - - - - vsubpd %zmm20, %zmm2, %zmm17
- - 0.18 - - - - 0.82 - - - - vsubpd %zmm22, %zmm0, %zmm19
- - 0.01 - - - - 0.99 - - - - vmulpd %zmm18, %zmm18, %zmm31
- - 0.69 - - - - 0.31 - - - - vfmadd231pd %zmm17, %zmm17, %zmm31
- - 0.68 - - - - 0.32 - - - - vfmadd231pd %zmm19, %zmm19, %zmm31
- - 2.00 - - - - 1.00 - - - - vrcp14pd %zmm31, %zmm30
- - - - - - - 1.00 - - - - vcmpltpd %zmm14, %zmm31, %k6 {%k5}
- - - - - - - 1.00 - - - - vfpclasspd $30, %zmm30, %k0
- - 0.83 - - - - 0.17 - - - - vmovaps %zmm31, %zmm23
- - 1.00 - 0.57 0.43 - - - - - - vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
- - 1.00 - - - - - - - - - knotw %k0, %k4
- - 0.44 - - - - 0.56 - - - - vmulpd %zmm23, %zmm23, %zmm24
- - 0.56 - - - - 0.44 - - - - vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4}
- - 0.55 - - - - 0.45 - - - - vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4}
- - 0.69 - - - - 0.31 - - - - vmulpd %zmm13, %zmm30, %zmm25
- - 0.31 - - - - 0.69 - - - - vmulpd %zmm12, %zmm30, %zmm27
- - 0.56 - - - - 0.44 - - - - vmulpd %zmm25, %zmm30, %zmm28
- - 0.02 - - - - 0.98 - - - - vmulpd %zmm28, %zmm30, %zmm26
- - 0.98 - - - - 0.02 - - - - vfmsub213pd %zmm5, %zmm28, %zmm30
- - 0.30 - - - - 0.70 - - - - vmulpd %zmm27, %zmm26, %zmm29
- - 0.16 - - - - 0.84 - - - - vmulpd %zmm30, %zmm29, %zmm23
- - 0.17 - - - - 0.83 - - - - vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6}
- - 0.83 - - - - 0.17 - - - - vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6}
- - 0.17 - - - - 0.83 - - - - vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6}
- - - 0.01 - - - 0.01 0.98 - - - cmpq %r14, %r15
- - 0.14 - - - - - 0.86 - - - jb ..B1.16

View File

@@ -1,77 +0,0 @@
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
Analyzed file: lammps-icc-avx512.s
Architecture: CSX
Timestamp: 2023-02-10 16:30:08
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
--------------------------------------------------------------------------------------------------
200 | | | | | | | | || | | # pointer_increment=64 1303ca335e79351a96cfc07b8b9ec9d4
201 | | | | | | | | || | | # LLVM-MCA-BEGIN
202 | | | | | | | | || | | ..B1.16: # Preds ..B1.16 ..B1.15
203 | | | | | | | | || | | # Execution count [2.50e+01]
204 | | | | | | 1.00 | | || | | vpcmpgtd %ymm4, %ymm3, %k5 #59.9
205 | 0.00 | 1.00 | | | | 0.00 | | || | | vpaddd %ymm15, %ymm4, %ymm4 #59.9
206 | 0.00 | 1.00 | 0.50 0.50 | 0.50 0.50 | | 0.00 | | || 0.0 | | vmovdqu32 (%r10,%r15,4), %ymm17{%k5}{z} #60.21
207 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm17, %ymm17, %ymm18 #61.36
208 | 0.00 | 0.16 | | | | 0.00 | 0.84 | || | | addq $8, %r15 #59.9
209 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm18, %ymm17, %ymm19 #61.36
210 | 1.00 | | | | | | | || | | kmovw %k5, %k2 #61.36
211 | 1.00 | | | | | | | || | | kmovw %k5, %k3 #61.36
212 | 1.00 | | | | | | | || | | kmovw %k5, %k1 #61.36
213 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm21, %zmm21, %zmm21 #61.36
214 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm20, %zmm20, %zmm20 #61.36
215 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm22, %zmm22, %zmm22 #61.36
216 | 1.25 | 0.75 | 5.00 5.00 | 5.00 5.00 | | 0.25 | 0.75 | || 24.0 | | vgatherdpd 8(%rbx,%ymm19,8), %zmm21{%k2} #61.36
217 | 1.25 | 0.25 | 5.00 5.00 | 5.00 5.00 | | 0.25 | 1.25 | || | | vgatherdpd (%rbx,%ymm19,8), %zmm20{%k3} #61.36
218 | 1.25 | 0.09 | 5.00 5.00 | 5.00 5.00 | | 0.25 | 1.41 | || | | vgatherdpd 16(%rbx,%ymm19,8), %zmm22{%k1} #61.36
219 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm21, %zmm1, %zmm18 #62.36
220 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm20, %zmm2, %zmm17 #61.36
221 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm22, %zmm0, %zmm19 #63.36
222 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm18, %zmm18, %zmm31 #64.49
223 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm17, %zmm17, %zmm31 #64.49
224 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm19, %zmm19, %zmm31 #64.63
225 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm31, %zmm30 #75.39
226 | | | | | | 1.00 | | || | | vcmppd $1, %zmm14, %zmm31, %k6{%k5} #74.22
227 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm30, %k0 #75.39
228 | | | | | | | | || | | * vmovaps %zmm31, %zmm23 #75.39
229 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23 #75.39
230 | 1.00 | | | | | | | || | | knotw %k0, %k4 #75.39
231 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm23, %zmm23, %zmm24 #75.39
232 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm30, %zmm23, %zmm30{%k4} #75.39
233 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd213pd %zmm30, %zmm24, %zmm30{%k4} #75.39
234 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm13, %zmm30, %zmm25 #76.38
235 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm12, %zmm30, %zmm27 #77.55
236 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm25, %zmm30, %zmm28 #76.44
237 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm28, %zmm30, %zmm26 #76.50
238 | 0.00 | | | | | 1.00 | | || | | vfmsub213pd %zmm5, %zmm28, %zmm30 #77.55
239 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm27, %zmm26, %zmm29 #77.64
240 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm30, %zmm29, %zmm23 #77.70
241 | 0.00 | | | | | 1.00 | | || 4.0 | | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17
242 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17
243 | 0.00 | | | | | 1.00 | | || | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17
244 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || | | cmpq %r14, %r15 #59.9
245 | | | | | | | | || | | * jb ..B1.16 # Prob 82% #59.9
246 | | | | | | | | || | | # LLVM-MCA-END
18.8 5.25 16.0 16.0 16.0 16.0 18.8 5.25 86.0 4.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
243 | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17| [243]
242 | 4.0 | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17| [242]
241 | 4.0 | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17| [241]
208 | 1.0 | addq $8, %r15 #59.9| [208]
205 | 1.0 | vpaddd %ymm15, %ymm4, %ymm4 #59.9| [205]

View File

@@ -1,77 +0,0 @@
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
Analyzed file: lammps-icc-avx512.s
Architecture: ICX
Timestamp: 2023-02-10 16:29:42
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD |
------------------------------------------------------------------------------------------------------------------------
200 | | | | | | | | | | || | | # pointer_increment=64 1303ca335e79351a96cfc07b8b9ec9d4
201 | | | | | | | | | | || | | # LLVM-MCA-BEGIN
202 | | | | | | | | | | || | | ..B1.16: # Preds ..B1.16 ..B1.15
203 | | | | | | | | | | || | | # Execution count [2.50e+01]
204 | | | | | | 1.000 | | | | || | | vpcmpgtd %ymm4, %ymm3, %k5 #59.9
205 | 0.00 | 1.00 | | | | 0.000 | | | | || | | vpaddd %ymm15, %ymm4, %ymm4 #59.9
206 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovdqu32 (%r10,%r15,4), %ymm17{%k5}{z} #60.21
207 | 0.00 | 1.00 | | | | 0.000 | | | | || 1.0 | | vpaddd %ymm17, %ymm17, %ymm18 #61.36
208 | 0.00 | 0.00 | | | | 0.000 | 1.00 | | | || | | addq $8, %r15 #59.9
209 | 0.00 | 1.00 | | | | 0.000 | | | | || 1.0 | | vpaddd %ymm18, %ymm17, %ymm19 #61.36
210 | 1.00 | | | | | | | | | || | | kmovw %k5, %k2 #61.36
211 | 1.00 | | | | | | | | | || | | kmovw %k5, %k3 #61.36
212 | 1.00 | | | | | | | | | || | | kmovw %k5, %k1 #61.36
213 | 0.50 | | | | | 0.500 | | | | || | | vpxord %zmm21, %zmm21, %zmm21 #61.36
214 | 0.24 | | | | | 0.760 | | | | || | | vpxord %zmm20, %zmm20, %zmm20 #61.36
215 | 0.50 | | | | | 0.500 | | | | || | | vpxord %zmm22, %zmm22, %zmm22 #61.36
216 | 0.67 | 2.33 | 7.00 7.00 | 7.00 7.00 | | 0.000 | | | | || 24.0 | | vgatherdpd 8(%rbx,%ymm19,8), %zmm21{%k2} #61.36
217 | 0.67 | 2.33 | 7.00 7.00 | 7.00 7.00 | | 0.000 | | | | || | | vgatherdpd (%rbx,%ymm19,8), %zmm20{%k3} #61.36
218 | 0.67 | 2.33 | 7.00 7.00 | 7.00 7.00 | | 0.000 | | | | || | | vgatherdpd 16(%rbx,%ymm19,8), %zmm22{%k1} #61.36
219 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vsubpd %zmm21, %zmm1, %zmm18 #62.36
220 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm20, %zmm2, %zmm17 #61.36
221 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm22, %zmm0, %zmm19 #63.36
222 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm18, %zmm18, %zmm31 #64.49
223 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm17, %zmm17, %zmm31 #64.49
224 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm19, %zmm19, %zmm31 #64.63
225 | 2.50 | | | | | 0.500 | | | | || 6.0 | | vrcp14pd %zmm31, %zmm30 #75.39
226 | | | | | | 1.000 | | | | || | | vcmppd $1, %zmm14, %zmm31, %k6{%k5} #74.22
227 | | | | | | 1.000 | | | | || | | vfpclasspd $30, %zmm30, %k0 #75.39
228 | 0.50 | | | | | 0.500 | | | | || | | vmovaps %zmm31, %zmm23 #75.39
229 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.500 | | | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23 #75.39
230 | 1.00 | | | | | | | | | || | | knotw %k0, %k4 #75.39
231 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm23, %zmm23, %zmm24 #75.39
232 | 0.50 | | | | | 0.500 | | | | || | | vfmadd213pd %zmm30, %zmm23, %zmm30{%k4} #75.39
233 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd213pd %zmm30, %zmm24, %zmm30{%k4} #75.39
234 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm13, %zmm30, %zmm25 #76.38
235 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm12, %zmm30, %zmm27 #77.55
236 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm25, %zmm30, %zmm28 #76.44
237 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm28, %zmm30, %zmm26 #76.50
238 | 0.50 | | | | | 0.500 | | | | || | | vfmsub213pd %zmm5, %zmm28, %zmm30 #77.55
239 | 0.25 | | | | | 0.750 | | | | || 4.0 | | vmulpd %zmm27, %zmm26, %zmm29 #77.64
240 | 0.00 | | | | | 1.000 | | | | || 4.0 | | vmulpd %zmm30, %zmm29, %zmm23 #77.70
241 | 0.00 | | | | | 1.000 | | | | || 4.0 | | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17
242 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17
243 | 0.00 | | | | | 1.000 | | | | || | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17
244 | 0.00 | 0.00 | | | | -0.01 | 1.00 | | | || | | cmpq %r14, %r15 #59.9
245 | | | | | | | | | | || | | * jb ..B1.16 # Prob 82% #59.9
246 | | | | | | | | | | || | | # LLVM-MCA-END
18.0 9.98 22.0 22.0 22.0 22.0 18.00 2.00 89 4.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
243 | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17| [243]
242 | 4.0 | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17| [242]
241 | 4.0 | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17| [241]
208 | 1.0 | addq $8, %r15 #59.9| [208]
205 | 1.0 | vpaddd %ymm15, %ymm4, %ymm4 #59.9| [205]

View File

@@ -1,197 +0,0 @@
[0] Code Region
Iterations: 100
Instructions: 7000
Total Cycles: 3866
Total uOps: 7900
Dispatch Width: 6
uOps Per Cycle: 2.04
IPC: 1.81
Block RThroughput: 21.5
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 8 0.50 * vpbroadcastd .LCPI0_1(%rip), %xmm1
1 10 0.50 * vpmulld (%r11,%rbp,4), %xmm1, %xmm11
2 4 1.50 vpmovsxdq %xmm11, %ymm1
1 1 0.50 vpsllq $3, %ymm1, %ymm1
1 1 0.25 vpaddq %ymm1, %ymm3, %ymm1
1 1 1.00 vmovq %xmm1, %r14
2 1 1.00 vpextrq $1, %xmm1, %r9
1 4 1.00 vextracti128 $1, %ymm1, %xmm1
1 8 0.50 * vmovsd (%r14), %xmm2
1 8 0.50 * vpsubd .LCPI0_5, %xmm11, %xmm6
2 4 1.50 vpmovsxdq %xmm6, %ymm6
1 1 0.50 vpsllq $3, %ymm6, %ymm6
1 1 1.00 vmovq %xmm1, %rdi
1 1 0.25 vpaddq %ymm6, %ymm3, %ymm6
1 1 1.00 vmovq %xmm6, %rcx
2 1 1.00 vpextrq $1, %xmm1, %rbx
2 1 1.00 vpextrq $1, %xmm6, %rax
1 4 1.00 vextracti128 $1, %ymm6, %xmm1
1 8 0.50 * vmovsd (%rdi), %xmm6
1 1 1.00 vmovq %xmm1, %rdi
2 1 1.00 vpextrq $1, %xmm1, %rsi
1 8 0.50 * vmovsd (%rdi), %xmm1
1 8 0.50 * vmovsd (%rcx), %xmm7
1 8 0.50 * vpbroadcastd .LCPI0_2(%rip), %xmm12
1 8 0.50 * vmovhpd (%r9), %xmm2, %xmm2
1 1 0.25 vpaddd %xmm12, %xmm11, %xmm4
2 4 1.50 vpmovsxdq %xmm4, %ymm4
1 8 0.50 * vmovhpd (%rax), %xmm7, %xmm7
1 1 0.50 vpsllq $3, %ymm4, %ymm4
1 1 0.25 vpaddq %ymm4, %ymm3, %ymm4
1 8 0.50 * vmovhpd (%rbx), %xmm6, %xmm6
2 1 1.00 vpextrq $1, %xmm4, %rax
1 8 0.50 * vmovhpd (%rsi), %xmm1, %xmm1
1 1 1.00 vmovq %xmm4, %rcx
1 4 1.00 vextracti128 $1, %ymm4, %xmm4
1 1 1.00 vmovq %xmm4, %rsi
1 2 1.00 vinsertf128 $1, %xmm6, %ymm2, %ymm2
2 1 1.00 vpextrq $1, %xmm4, %rdi
1 8 0.50 * vmovsd (%rsi), %xmm4
1 3 0.50 vsubpd %ymm2, %ymm14, %ymm2
1 8 0.50 * vmovhpd (%rdi), %xmm4, %xmm4
1 8 0.50 * vmovsd (%rcx), %xmm6
1 2 1.00 vinsertf128 $1, %xmm1, %ymm7, %ymm1
1 8 0.50 * vmovhpd (%rax), %xmm6, %xmm6
1 2 1.00 vinsertf128 $1, %xmm4, %ymm6, %ymm4
1 3 0.50 vsubpd %ymm1, %ymm5, %ymm1
1 3 0.50 vsubpd %ymm4, %ymm10, %ymm4
1 3 0.50 vmulpd %ymm2, %ymm2, %ymm6
1 4 1.00 vfmadd231pd %ymm1, %ymm1, %ymm6
1 4 1.00 vfmadd231pd %ymm4, %ymm4, %ymm6
1 8 0.50 * vbroadcastsd .LCPI0_3(%rip), %ymm7
1 13 5.00 vdivpd %ymm6, %ymm7, %ymm7
1 3 0.50 vmulpd %ymm7, %ymm7, %ymm11
1 3 0.50 vmulpd %ymm9, %ymm11, %ymm11
1 8 0.50 * vbroadcastsd .LCPI0_4(%rip), %ymm12
1 3 0.50 vmulpd %ymm7, %ymm11, %ymm11
1 3 0.50 vaddpd %ymm12, %ymm11, %ymm12
1 10 0.50 * vmulpd 128(%rsp), %ymm7, %ymm7
1 3 0.50 vmulpd %ymm7, %ymm11, %ymm7
1 3 0.50 vmulpd %ymm7, %ymm12, %ymm7
1 1 0.50 vcmpltpd %ymm8, %ymm6, %ymm6
1 4 1.00 vfmadd213pd %ymm0, %ymm7, %ymm2
1 1 0.50 vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
1 4 1.00 vfmadd213pd %ymm15, %ymm7, %ymm1
1 4 1.00 vfmadd213pd %ymm13, %ymm7, %ymm4
1 1 0.50 vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
1 1 0.50 vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
1 1 0.25 addq $4, %rbp
1 1 0.25 cmpq %rdx, %rbp
1 1 0.50 jb .LBB0_9
Resources:
[0] - Zn3AGU0
[1] - Zn3AGU1
[2] - Zn3AGU2
[3] - Zn3ALU0
[4] - Zn3ALU1
[5] - Zn3ALU2
[6] - Zn3ALU3
[7] - Zn3BRU1
[8] - Zn3FPP0
[9] - Zn3FPP1
[10] - Zn3FPP2
[11] - Zn3FPP3
[12.0] - Zn3FPP45
[12.1] - Zn3FPP45
[13] - Zn3FPSt
[14.0] - Zn3LSU
[14.1] - Zn3LSU
[14.2] - Zn3LSU
[15.0] - Zn3Load
[15.1] - Zn3Load
[15.2] - Zn3Load
[16.0] - Zn3Store
[16.1] - Zn3Store
Resource pressure per iteration:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
- - - 0.60 0.60 0.60 0.60 0.60 16.84 23.53 16.30 7.33 21.50 21.50 - 6.33 6.33 6.34 6.33 6.33 6.34 - -
Resource pressure by instruction:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
- - - - - - - - - 0.03 0.97 - 0.51 0.49 - 0.34 0.33 0.33 0.34 0.33 0.33 - - vpbroadcastd .LCPI0_1(%rip), %xmm1
- - - - - - - - 0.65 - - 0.35 0.34 0.66 - 0.49 0.05 0.46 0.49 0.05 0.46 - - vpmulld (%r11,%rbp,4), %xmm1, %xmm11
- - - - - - - - - 0.06 2.94 - - - - - - - - - - - - vpmovsxdq %xmm11, %ymm1
- - - - - - - - - 0.65 0.35 - - - - - - - - - - - - vpsllq $3, %ymm1, %ymm1
- - - - - - - - - - - 1.00 - - - - - - - - - - - vpaddq %ymm1, %ymm3, %ymm1
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vmovq %xmm1, %r14
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm1, %r9
- - - - - - - - 1.00 - - - - - - - - - - - - - - vextracti128 $1, %ymm1, %xmm1
- - - - - - - - - - - - 0.50 0.50 - 0.48 0.35 0.17 0.48 0.35 0.17 - - vmovsd (%r14), %xmm2
- - - - - - - - 0.01 0.18 0.17 0.64 0.47 0.53 - 0.34 0.33 0.33 0.34 0.33 0.33 - - vpsubd .LCPI0_5, %xmm11, %xmm6
- - - - - - - - - 1.92 1.08 - - - - - - - - - - - - vpmovsxdq %xmm6, %ymm6
- - - - - - - - - 0.32 0.68 - - - - - - - - - - - - vpsllq $3, %ymm6, %ymm6
- - - - - - - - - - - - 1.30 0.70 - - - - - - - - - vmovq %xmm1, %rdi
- - - - - - - - - - 0.32 0.68 - - - - - - - - - - - vpaddq %ymm6, %ymm3, %ymm6
- - - - - - - - - - - - 1.34 0.66 - - - - - - - - - vmovq %xmm6, %rcx
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm1, %rbx
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm6, %rax
- - - - - - - - 1.00 - - - - - - - - - - - - - - vextracti128 $1, %ymm6, %xmm1
- - - - - - - - - - - - 0.50 0.50 - 0.03 0.65 0.32 0.03 0.65 0.32 - - vmovsd (%rdi), %xmm6
- - - - - - - - - - - - 0.36 1.64 - - - - - - - - - vmovq %xmm1, %rdi
- - - - - - - - - - - - 1.64 0.36 - - - - - - - - - vpextrq $1, %xmm1, %rsi
- - - - - - - - - - - - 0.32 0.68 - 0.51 0.33 0.16 0.51 0.33 0.16 - - vmovsd (%rdi), %xmm1
- - - - - - - - - - - - 0.68 0.32 - 0.49 0.01 0.50 0.49 0.01 0.50 - - vmovsd (%rcx), %xmm7
- - - - - - - - - 0.48 0.52 - 0.67 0.33 - 0.17 0.62 0.21 0.17 0.62 0.21 - - vpbroadcastd .LCPI0_2(%rip), %xmm12
- - - - - - - - - 0.01 0.99 - 0.17 0.83 - 0.02 0.64 0.34 0.02 0.64 0.34 - - vmovhpd (%r9), %xmm2, %xmm2
- - - - - - - - 0.01 - - 0.99 - - - - - - - - - - - vpaddd %xmm12, %xmm11, %xmm4
- - - - - - - - - 0.57 2.43 - - - - - - - - - - - - vpmovsxdq %xmm4, %ymm4
- - - - - - - - - 0.34 0.66 - 0.82 0.18 - 0.49 0.35 0.16 0.49 0.35 0.16 - - vmovhpd (%rax), %xmm7, %xmm7
- - - - - - - - - 0.34 0.66 - - - - - - - - - - - - vpsllq $3, %ymm4, %ymm4
- - - - - - - - - - 0.01 0.99 - - - - - - - - - - - vpaddq %ymm4, %ymm3, %ymm4
- - - - - - - - - 0.51 0.49 - 0.49 0.51 - 0.35 0.16 0.49 0.35 0.16 0.49 - - vmovhpd (%rbx), %xmm6, %xmm6
- - - - - - - - - - - - 1.04 0.96 - - - - - - - - - vpextrq $1, %xmm4, %rax
- - - - - - - - - 0.49 0.51 - 0.17 0.83 - 0.16 0.49 0.35 0.16 0.49 0.35 - - vmovhpd (%rsi), %xmm1, %xmm1
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vmovq %xmm4, %rcx
- - - - - - - - 1.00 - - - - - - - - - - - - - - vextracti128 $1, %ymm4, %xmm4
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vmovq %xmm4, %rsi
- - - - - - - - - 1.00 - - - - - - - - - - - - - vinsertf128 $1, %xmm6, %ymm2, %ymm2
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm4, %rdi
- - - - - - - - - - - - 0.50 0.50 - 0.49 0.35 0.16 0.49 0.35 0.16 - - vmovsd (%rsi), %xmm4
- - - - - - - - - - 0.31 0.69 - - - - - - - - - - - vsubpd %ymm2, %ymm14, %ymm2
- - - - - - - - - 0.49 0.51 - 0.48 0.52 - 0.35 0.16 0.49 0.35 0.16 0.49 - - vmovhpd (%rdi), %xmm4, %xmm4
- - - - - - - - - - - - 0.52 0.48 - 0.16 0.49 0.35 0.16 0.49 0.35 - - vmovsd (%rcx), %xmm6
- - - - - - - - - 1.00 - - - - - - - - - - - - - vinsertf128 $1, %xmm1, %ymm7, %ymm1
- - - - - - - - - 0.35 0.65 - 0.50 0.50 - 0.47 0.35 0.18 0.47 0.35 0.18 - - vmovhpd (%rax), %xmm6, %xmm6
- - - - - - - - - 1.00 - - - - - - - - - - - - - vinsertf128 $1, %xmm4, %ymm6, %ymm4
- - - - - - - - - - 0.33 0.67 - - - - - - - - - - - vsubpd %ymm1, %ymm5, %ymm1
- - - - - - - - - - 0.51 0.49 - - - - - - - - - - - vsubpd %ymm4, %ymm10, %ymm4
- - - - - - - - 0.52 0.48 - - - - - - - - - - - - - vmulpd %ymm2, %ymm2, %ymm6
- - - - - - - - 1.00 1.00 - - - - - - - - - - - - - vfmadd231pd %ymm1, %ymm1, %ymm6
- - - - - - - - 1.34 0.66 - - - - - - - - - - - - - vfmadd231pd %ymm4, %ymm4, %ymm6
- - - - - - - - - 0.66 0.34 - 0.51 0.49 - 0.19 0.32 0.49 0.19 0.32 0.49 - - vbroadcastsd .LCPI0_3(%rip), %ymm7
- - - - - - - - - 5.00 - - - - - - - - - - - - - vdivpd %ymm6, %ymm7, %ymm7
- - - - - - - - 0.50 0.50 - - - - - - - - - - - - - vmulpd %ymm7, %ymm7, %ymm11
- - - - - - - - 0.99 0.01 - - - - - - - - - - - - - vmulpd %ymm9, %ymm11, %ymm11
- - - - - - - - - 0.30 0.70 - 0.49 0.51 - 0.34 0.33 0.33 0.34 0.33 0.33 - - vbroadcastsd .LCPI0_4(%rip), %ymm12
- - - - - - - - 0.82 0.18 - - - - - - - - - - - - - vmulpd %ymm7, %ymm11, %ymm11
- - - - - - - - - - 0.17 0.83 - - - - - - - - - - - vaddpd %ymm12, %ymm11, %ymm12
- - - - - - - - 0.01 0.99 - - 0.18 0.82 - 0.46 0.02 0.52 0.46 0.02 0.52 - - vmulpd 128(%rsp), %ymm7, %ymm7
- - - - - - - - 0.99 0.01 - - - - - - - - - - - - - vmulpd %ymm7, %ymm11, %ymm7
- - - - - - - - 0.67 0.33 - - - - - - - - - - - - - vmulpd %ymm7, %ymm12, %ymm7
- - - - - - - - 1.00 - - - - - - - - - - - - - - vcmpltpd %ymm8, %ymm6, %ymm6
- - - - - - - - 1.34 0.66 - - - - - - - - - - - - - vfmadd213pd %ymm0, %ymm7, %ymm2
- - - - - - - - 0.66 0.34 - - - - - - - - - - - - - vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
- - - - - - - - 0.66 1.34 - - - - - - - - - - - - - vfmadd213pd %ymm15, %ymm7, %ymm1
- - - - - - - - 1.34 0.66 - - - - - - - - - - - - - vfmadd213pd %ymm13, %ymm7, %ymm4
- - - - - - - - 0.34 0.66 - - - - - - - - - - - - - vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
- - - - - - - - 0.99 0.01 - - - - - - - - - - - - - vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
- - - - 0.40 0.20 0.40 - - - - - - - - - - - - - - - - addq $4, %rbp
- - - 0.20 0.20 0.40 0.20 - - - - - - - - - - - - - - - - cmpq %rdx, %rbp
- - - 0.40 - - - 0.60 - - - - - - - - - - - - - - - jb .LBB0_9

View File

@@ -1,108 +0,0 @@
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
Analyzed file: lammps-icx-avx2zen.s
Architecture: ZEN3
Timestamp: 2023-02-10 16:31:30
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 | 1 | 2 | 3 | DV0 | DV1 | 4 | 5 | 6 | 7 | 8 - 8DV | 9 | 10 | 11 | 12 | 13 || CP | LCD |
--------------------------------------------------------------------------------------------------------------------------------------------
175 | | | | | | | | | | | | | | | | || | | # pointer_increment=16 e95035fc9e97f63299dd5188a0872bfc
176 | | | | | | | | | | | | | | | | || | | # LLVM-MCA-BEGIN
177 | | | | | | | | | | | | | | | | || | | .LBB0_9: #
178 | | | | | | | | | | | | | | | | || | | # Parent Loop BB0_6 Depth=1
179 | | | | | | | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2
180 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || 1.0 | | vpbroadcastd .LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
181 | 0.00 | | | 1.00 | | | | | | | | | | 0.50 | 0.50 | || 3.0 | | vpmulld (%r11,%rbp,4), %xmm1, %xmm11
182 | 0.00 | 0.75 | 0.38 | 0.87 | | | | | | | | | | | | || 4.0 | | vpmovsxdq %xmm11, %ymm1
183 | | 0.00 | 1.00 | | | | | | | | | | | | | || 1.0 | | vpsllq $3, %ymm1, %ymm1
184 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || 1.0 | | vpaddq %ymm1, %ymm3, %ymm1
185 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vmovq %xmm1, %r14
186 | 0.12 | 1.88 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm1, %r9
187 | | 1.00 | | | | | | | | | | | | | | || 3.0 | | vextracti128 $1, %ymm1, %xmm1
188 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%r14), %xmm2 # xmm2 = mem[0],zero
189 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vpsubd .LCPI0_5, %xmm11, %xmm6
190 | 0.00 | 0.75 | 0.38 | 0.87 | | | | | | | | | | | | || | | vpmovsxdq %xmm6, %ymm6
191 | | 0.00 | 1.00 | | | | | | | | | | | | | || | | vpsllq $3, %ymm6, %ymm6
192 | 0.00 | 0.00 | 0.49 | 0.51 | | | | | | | | | | | | || | | vmovq %xmm1, %rdi
193 | 0.00 | 0.00 | 0.51 | 0.49 | | | | | | | | | | | | || | | vpaddq %ymm6, %ymm3, %ymm6
194 | 0.00 | 0.00 | 0.49 | 0.51 | | | | | | | | | | | | || | | vmovq %xmm6, %rcx
195 | 0.13 | 1.87 | | | | | | | | | | | | | | || 6.0 | | vpextrq $1, %xmm1, %rbx
196 | 0.00 | 2.00 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm6, %rax
197 | | 1.00 | | | | | | | | | | | | | | || | | vextracti128 $1, %ymm6, %xmm1
198 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rdi), %xmm6 # xmm6 = mem[0],zero
199 | 0.00 | 0.00 | 0.00 | 1.00 | | | | | | | | | | | | || | | vmovq %xmm1, %rdi
200 | 0.00 | 2.00 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm1, %rsi
201 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
202 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rcx), %xmm7 # xmm7 = mem[0],zero
203 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vpbroadcastd .LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
204 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%r9), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0]
205 | 0.00 | 0.00 | 0.63 | 0.37 | | | | | | | | | | | | || | | vpaddd %xmm12, %xmm11, %xmm4
206 | 0.00 | 0.75 | 0.00 | 1.25 | | | | | | | | | | | | || | | vpmovsxdq %xmm4, %ymm4
207 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rax), %xmm7, %xmm7 # xmm7 = xmm7[0],mem[0]
208 | | 0.00 | 1.00 | | | | | | | | | | | | | || | | vpsllq $3, %ymm4, %ymm4
209 | 0.00 | 0.00 | 0.00 | 1.00 | | | | | | | | | | | | || | | vpaddq %ymm4, %ymm3, %ymm4
210 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || 5.0 | | vmovhpd (%rbx), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
211 | 0.75 | 1.25 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm4, %rax
212 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rsi), %xmm1, %xmm1 # xmm1 = xmm1[0],mem[0]
213 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vmovq %xmm4, %rcx
214 | | 1.00 | | | | | | | | | | | | | | || | | vextracti128 $1, %ymm4, %xmm4
215 | 0.00 | 0.00 | 0.00 | 1.00 | | | | | | | | | | | | || | | vmovq %xmm4, %rsi
216 | | 1.00 | | | | | | | | | | | | | | || 1.0 | | vinsertf128 $1, %xmm6, %ymm2, %ymm2
217 | 1.00 | 1.00 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm4, %rdi
218 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rsi), %xmm4 # xmm4 = mem[0],zero
219 | | | 0.00 | 1.00 | | | | | | | | | | | | || 3.0 | | vsubpd %ymm2, %ymm14, %ymm2
220 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rdi), %xmm4, %xmm4 # xmm4 = xmm4[0],mem[0]
221 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rcx), %xmm6 # xmm6 = mem[0],zero
222 | | 1.00 | | | | | | | | | | | | | | || | | vinsertf128 $1, %xmm1, %ymm7, %ymm1
223 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rax), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
224 | | 1.00 | | | | | | | | | | | | | | || | | vinsertf128 $1, %xmm4, %ymm6, %ymm4
225 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vsubpd %ymm1, %ymm5, %ymm1
226 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vsubpd %ymm4, %ymm10, %ymm4
227 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm2, %ymm2, %ymm6
228 | 1.00 | 0.00 | | | | | | | | | | | | | | || 4.0 | | vfmadd231pd %ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
229 | 1.00 | 0.00 | | | | | | | | | | | | | | || 4.0 | | vfmadd231pd %ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
230 | 1.00 | | | | | | | | | | | | | 0.50 | 0.50 | || | | vbroadcastsd .LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
231 | | | | | 4.50 | 4.50 | | | | | | | | | | || 13.0 | | vdivpd %ymm6, %ymm7, %ymm7
232 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm7, %ymm11
233 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm9, %ymm11, %ymm11
234 | 1.00 | | | | | | | | | | | | | 0.50 | 0.50 | || | | vbroadcastsd .LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
235 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm11, %ymm11
236 | | | 0.00 | 1.00 | | | | | | | | | | | | || 3.0 | | vaddpd %ymm12, %ymm11, %ymm12
237 | 1.00 | 0.00 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
238 | 1.00 | 0.00 | | | | | | | | | | | | | | || | | vmulpd %ymm7, %ymm11, %ymm7
239 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm12, %ymm7
240 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vcmpltpd %ymm8, %ymm6, %ymm6
241 | 1.00 | 0.00 | | | | | | | | | | | | | | || 4.0 | | vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
242 | 1.00 | 0.00 | | | | | | | | | | | | | | || 1.0 | | vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
243 | 1.00 | 0.00 | | | | | | | | | | | | | | || | | vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
244 | 1.00 | 0.00 | | | | | | | | | | | | | | || | 4.0 | vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
245 | 1.00 | 0.00 | | | | | | | | | | | | | | || | | vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
246 | 0.75 | 0.25 | | | | | | | | | | | | | | || | 1.0 | vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
247 | | | | | | | | | 0.25 | 0.25 | 0.25 | 0.25 | | | | || | | addq $4, %rbp
248 | | | | | | | | | 0.25 | 0.25 | 0.25 | 0.25 | | | | || | | cmpq %rdx, %rbp
249 | | | | | | | | | 0.00 | | | | 1.00 | | | || | | jb .LBB0_9
250 | | | | | | | | | | | | | | | | || | | # LLVM-MCA-END
18.8 18.5 15.9 15.9 4.50 4.50 0.50 0.50 0.50 0.50 9.00 9.00 72 5.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
244 | 5.0 | vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13| [244, 246]
243 | 5.0 | vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15| [243, 245]
241 | 5.0 | vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0| [241, 242]
247 | 1.0 | addq $4, %rbp | [247]
246 | 1.0 | vblendvpd %ymm6, %ymm4, %ymm13, %ymm13| [246]
245 | 1.0 | vblendvpd %ymm6, %ymm1, %ymm15, %ymm15| [245]
242 | 1.0 | vblendvpd %ymm6, %ymm2, %ymm0, %ymm0| [242]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,640 +0,0 @@
.text
.file "force_lj.c"
# Constants used by computeForceLJFullNeigh_plain_c:
#   .LCPI0_0 / .LCPI0_3 / .LCPI0_4 are the doubles 48, 1 and -0.5,
#   .LCPI0_1 / .LCPI0_2 are the 32-bit integers 3 and 2,
#   .LCPI0_5 is 16 bytes of 0xFF, i.e. -1 in every 32-bit lane.
.section .rodata.cst8,"aM",@progbits,8
.p2align 3 # -- Begin function computeForceLJFullNeigh_plain_c
.LCPI0_0:
.quad 4631952216750555136 # 48
.LCPI0_3:
.quad 4607182418800017408 # 1
.LCPI0_4:
.quad -4620693217682128896 # -0.5
.section .rodata.cst4,"aM",@progbits,4
.p2align 2
.LCPI0_1:
.long 3 # 0x3
.LCPI0_2:
.long 2 # 0x2
.section .rodata.cst16,"aM",@progbits,16
.p2align 4
.LCPI0_5:
.zero 16,255
.text
# ---------------------------------------------------------------------
# computeForceLJFullNeigh_plain_c (compiler-generated, AT&T syntax)
#
# Arguments arrive in %rdi, %rsi, %rdx, %rcx. The struct layouts are not
# visible in this file; the descriptions below only state what the code
# reads and writes (field meanings are assumptions to be confirmed
# against force_lj.c):
#   %rdi  object with doubles at offsets 144, 40 and 56
#         (presumably cutoff, epsilon and sigma^6 -- TODO confirm)
#   %rsi  object with an int count n at offset 4, a pointer at offset 16
#         to input doubles (3 per entry) and a pointer at offset 64 to
#         output doubles (3 per entry) that are zeroed and accumulated
#   %rdx  object with a pointer at offset 16 to int indices, a pointer
#         at offset 24 to per-row int counts and an int row stride at
#         offset 8
#   %rcx  pointer to two adjacent 64-bit counters that are updated
# Result: %xmm0 = getTimeStamp() after the loops minus getTimeStamp()
#         before them.
#
# Stack slots (offsets from %rsp after the subq below):
#   slot   0  first the 144(%rdi) double, later the per-row 3rd coordinate
#   slot  24  saved %rcx argument (counter pointer)
#   slot  32  start timestamp
#   slot  40, 48, 56  copies of %rdi, %rsi, %r14 used by the outer loop
#   slot  64, 72  copies of %rbp, %rbx (output indices 3i+2, 3i+1)
#   slot  80  the 56(%rdi) value
#   slot  96, 112  per-row 2nd / 1st coordinate
#   slot 128  first the 40(%rdi) double, later 4 lanes of 48 * that value
#   slot 176  scalar 48 * 40(%rdi) value
#   slot 192  scalar square of the 144(%rdi) value
#   slot 208  copy of the two counters (%xmm14)
#   slot 224  4 lanes of the squared 144(%rdi) value
# ---------------------------------------------------------------------
.globl computeForceLJFullNeigh_plain_c
.p2align 4, 0x90
.type computeForceLJFullNeigh_plain_c,@function
computeForceLJFullNeigh_plain_c: #
.LcomputeForceLJFullNeigh_plain_c$local:
.cfi_startproc
# %bb.0: #
# Prologue: save the six callee-saved GPRs and reserve 264 bytes of
# spill space (6 pushes + 264 + return address = CFA offset 320).
pushq %rbp
.cfi_def_cfa_offset 16
pushq %r15
.cfi_def_cfa_offset 24
pushq %r14
.cfi_def_cfa_offset 32
pushq %r13
.cfi_def_cfa_offset 40
pushq %r12
.cfi_def_cfa_offset 48
pushq %rbx
.cfi_def_cfa_offset 56
subq $264, %rsp # imm = 0x108
.cfi_def_cfa_offset 320
.cfi_offset %rbx, -56
.cfi_offset %r12, -48
.cfi_offset %r13, -40
.cfi_offset %r14, -32
.cfi_offset %r15, -24
.cfi_offset %rbp, -16
# Keep the arguments in callee-saved registers across the calls below
# and spill the three doubles read from the %rdi object.
movq %rcx, %rbx
movq %rdx, %r15
movq %rsi, %r12
movl 4(%rsi), %r14d
vmovsd 144(%rdi), %xmm0 # xmm0 = mem[0],zero
vmovsd %xmm0, (%rsp) # 8-byte Spill
vmovsd 40(%rdi), %xmm0 # xmm0 = mem[0],zero
vmovsd %xmm0, 128(%rsp) # 8-byte Spill
vmovq 56(%rdi), %xmm0 # xmm0 = mem[0],zero
vmovdqa %xmm0, 80(%rsp) # 16-byte Spill
testl %r14d, %r14d
jle .LBB0_2
# %bb.1: #
# n > 0: zero n * 24 bytes (3 doubles per entry) of the output array.
movq 64(%r12), %rdi
leaq (,%r14,8), %rax
leaq (%rax,%rax,2), %rdx
xorl %esi, %esi
callq _intel_fast_memset
.LBB0_2: #
# Take the start timestamp and open the marker region named by .L.str.
xorl %eax, %eax
callq getTimeStamp
vmovq %xmm0, 32(%rsp) # 8-byte Folded Spill
movl $.L.str, %edi
callq likwid_markerStartRegion
testl %r14d, %r14d
jle .LBB0_19
# %bb.3: #
# Loop setup:
#   %xmm13 = (144(%rdi) value)^2, %xmm15 = 48 * (40(%rdi) value),
#   %xmm12 = 56(%rdi) value, %xmm14 = the two counters,
#   %r11 = index list cursor, %rsi = per-row counts, %rdi = row stride
#   in bytes, %r15 = input array, %r8 = output array, %r14 = n - 1,
#   %ymm3 = input array address in all 4 lanes, %r10 = row index i.
vmovsd (%rsp), %xmm0 # 8-byte Reload
# xmm0 = mem[0],zero
vmulsd %xmm0, %xmm0, %xmm13
movq 16(%r15), %r11
movq 24(%r15), %rsi
movslq 8(%r15), %rdi
movq 16(%r12), %r15
movq 64(%r12), %r8
vmovsd 128(%rsp), %xmm0 # 8-byte Reload
# xmm0 = mem[0],zero
vmulsd .LCPI0_0(%rip), %xmm0, %xmm15
movq %rbx, 24(%rsp) # 8-byte Spill
vmovdqu (%rbx), %xmm14
decq %r14
vmovq %r15, %xmm0
vpbroadcastq %xmm0, %ymm3
vbroadcastsd %xmm13, %ymm2
vmovapd 80(%rsp), %xmm12 # 16-byte Reload
vbroadcastsd %xmm12, %ymm8
vbroadcastsd %xmm15, %ymm9
shlq $2, %rdi
xorl %r10d, %r10d
movq %r14, 56(%rsp) # 8-byte Spill
vmovapd %xmm13, 192(%rsp) # 16-byte Spill
movq %rsi, 48(%rsp) # 8-byte Spill
movq %rdi, 40(%rsp) # 8-byte Spill
vmovapd %xmm15, 176(%rsp) # 16-byte Spill
vmovupd %ymm2, 224(%rsp) # 32-byte Spill
vmovupd %ymm9, 128(%rsp) # 32-byte Spill
jmp .LBB0_6
.p2align 4, 0x90
.LBB0_17: #
# in Loop: Header=BB0_6 Depth=1
# Row finished after processing entries: count them all in %rdx.
movq %r13, %rdx
.LBB0_5: #
# in Loop: Header=BB0_6 Depth=1
# Row epilogue: add the three row sums (%xmm10, %xmm11, %xmm5) to the
# output array at indices %r12, %rbx, %rbp (3i, 3i+1, 3i+2).
vaddsd (%r8,%r12,8), %xmm10, %xmm0
vmovsd %xmm0, (%r8,%r12,8)
vaddsd (%r8,%rbx,8), %xmm11, %xmm0
vmovsd %xmm0, (%r8,%rbx,8)
vaddsd (%r8,%rbp,8), %xmm5, %xmm0
vmovsd %xmm0, (%r8,%rbp,8)
# Counter update: low qword += %rdx, high qword += (count + 3) / 4
# computed as a signed division that truncates toward zero.
leal 3(%r13), %eax
addl $6, %r13d
testl %eax, %eax
cmovnsl %eax, %r13d
sarl $2, %r13d
movslq %r13d, %rax
vmovq %rax, %xmm0
vmovq %rdx, %xmm1
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
vpaddq %xmm0, %xmm14, %xmm14
# Advance the index cursor by one row; leaq keeps the flags of the
# compare, so the loop exits after the row with i == n - 1.
addq %rdi, %r11
cmpq %r14, %r10
leaq 1(%r10), %r10
je .LBB0_18
.LBB0_6: #
# =>This Loop Header: Depth=1
# Child Loop BB0_9 Depth 2
# Child Loop BB0_13 Depth 2
# Row i: %r13d = entry count, %r12d/%ebx/%ebp = 3i, 3i+1, 3i+2.
movl (%rsi,%r10,4), %r13d
leal (%r10,%r10,2), %r12d
leal (%r10,%r10,2), %ebx
incl %ebx
leal (%r10,%r10,2), %ebp
addl $2, %ebp
testl %r13d, %r13d
jle .LBB0_4
# %bb.7: #
# in Loop: Header=BB0_6 Depth=1
# Load the three input values of row i and compute %rdx = count with
# its low two bits cleared (number of entries handled 4 at a time).
# The spills below do not touch the flags set by andq.
vmovsd (%r15,%r12,8), %xmm0 # xmm0 = mem[0],zero
vmovsd (%r15,%rbx,8), %xmm1 # xmm1 = mem[0],zero
vmovsd (%r15,%rbp,8), %xmm2 # xmm2 = mem[0],zero
movq %r13, %rdx
movl $4294967292, %eax # imm = 0xFFFFFFFC
andq %rax, %rdx
vmovapd %xmm0, 112(%rsp) # 16-byte Spill
vmovapd %xmm1, 96(%rsp) # 16-byte Spill
vmovapd %xmm2, (%rsp) # 16-byte Spill
je .LBB0_16
# %bb.8: #
# in Loop: Header=BB0_6 Depth=1
# Vector loop setup: save registers the loop reuses as scratch,
# broadcast the row values into %ymm14, %ymm5, %ymm10, clear the three
# accumulators (%ymm0, %ymm15, %ymm13), move the 56(%rdi) lanes to
# %ymm9 and load the squared-cutoff lanes into %ymm8.
movq %rbp, 64(%rsp) # 8-byte Spill
movq %rbx, 72(%rsp) # 8-byte Spill
vmovdqa %xmm14, 208(%rsp) # 16-byte Spill
vbroadcastsd %xmm0, %ymm14
vbroadcastsd %xmm1, %ymm5
vbroadcastsd %xmm2, %ymm10
vxorpd %xmm0, %xmm0, %xmm0
vxorpd %xmm15, %xmm15, %xmm15
vxorpd %xmm13, %xmm13, %xmm13
xorl %ebp, %ebp
vmovapd %ymm8, %ymm9
vmovupd 224(%rsp), %ymm8 # 32-byte Reload
.p2align 4, 0x90
# The byte sequence below is the OSACA start marker; the loop between
# the markers is the kernel the analyzer reports refer to.
movl $111, %ebx # OSACA START MARKER
.byte 100 # OSACA START MARKER
.byte 103 # OSACA START MARKER
.byte 144 # OSACA START MARKER
# pointer_increment=16 e95035fc9e97f63299dd5188a0872bfc
# LLVM-MCA-BEGIN
.LBB0_9: #
# Parent Loop BB0_6 Depth=1
# => This Inner Loop Header: Depth=2
# Load 4 indices j, form 3*j and the byte addresses base + 8*(3*j);
# the addresses are moved to GPRs and the values loaded one by one.
vpbroadcastd .LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
vpmulld (%r11,%rbp,4), %xmm1, %xmm11
vpmovsxdq %xmm11, %ymm1
vpsllq $3, %ymm1, %ymm1
vpaddq %ymm1, %ymm3, %ymm1
vmovq %xmm1, %r14
vpextrq $1, %xmm1, %r9
vextracti128 $1, %ymm1, %xmm1
vmovsd (%r14), %xmm2 # xmm2 = mem[0],zero
# Subtracting -1 (.LCPI0_5) gives 3*j + 1.
vpsubd .LCPI0_5, %xmm11, %xmm6
vpmovsxdq %xmm6, %ymm6
vpsllq $3, %ymm6, %ymm6
vmovq %xmm1, %rdi
vpaddq %ymm6, %ymm3, %ymm6
vmovq %xmm6, %rcx
vpextrq $1, %xmm1, %rbx
vpextrq $1, %xmm6, %rax
vextracti128 $1, %ymm6, %xmm1
vmovsd (%rdi), %xmm6 # xmm6 = mem[0],zero
vmovq %xmm1, %rdi
vpextrq $1, %xmm1, %rsi
vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
vmovsd (%rcx), %xmm7 # xmm7 = mem[0],zero
# Adding 2 gives 3*j + 2.
vpbroadcastd .LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
vmovhpd (%r9), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0]
vpaddd %xmm12, %xmm11, %xmm4
vpmovsxdq %xmm4, %ymm4
vmovhpd (%rax), %xmm7, %xmm7 # xmm7 = xmm7[0],mem[0]
vpsllq $3, %ymm4, %ymm4
vpaddq %ymm4, %ymm3, %ymm4
vmovhpd (%rbx), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
vpextrq $1, %xmm4, %rax
vmovhpd (%rsi), %xmm1, %xmm1 # xmm1 = xmm1[0],mem[0]
vmovq %xmm4, %rcx
vextracti128 $1, %ymm4, %xmm4
vmovq %xmm4, %rsi
vinsertf128 $1, %xmm6, %ymm2, %ymm2
vpextrq $1, %xmm4, %rdi
vmovsd (%rsi), %xmm4 # xmm4 = mem[0],zero
vsubpd %ymm2, %ymm14, %ymm2
vmovhpd (%rdi), %xmm4, %xmm4 # xmm4 = xmm4[0],mem[0]
vmovsd (%rcx), %xmm6 # xmm6 = mem[0],zero
vinsertf128 $1, %xmm1, %ymm7, %ymm1
vmovhpd (%rax), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
vinsertf128 $1, %xmm4, %ymm6, %ymm4
vsubpd %ymm1, %ymm5, %ymm1
vsubpd %ymm4, %ymm10, %ymm4
# ymm2/ymm1/ymm4 = row value minus loaded value per component;
# ymm6 = sum of their squares; ymm7 = 1.0 / ymm6.
vmulpd %ymm2, %ymm2, %ymm6
vfmadd231pd %ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
vfmadd231pd %ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
vbroadcastsd .LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
vdivpd %ymm6, %ymm7, %ymm7
# ymm11 = ymm7^3 * ymm9; the final ymm7 is
# ymm7 * (48 * 40(%rdi) value) * ymm11 * (ymm11 - 0.5).
vmulpd %ymm7, %ymm7, %ymm11
vmulpd %ymm9, %ymm11, %ymm11
vbroadcastsd .LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
vmulpd %ymm7, %ymm11, %ymm11
vaddpd %ymm12, %ymm11, %ymm12
vmulpd 128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
vmulpd %ymm7, %ymm11, %ymm7
vmulpd %ymm7, %ymm12, %ymm7
# Lane mask: squared distance < ymm8. The blends take the updated sum
# only in masked lanes and keep the old accumulator elsewhere.
vcmpltpd %ymm8, %ymm6, %ymm6
vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
addq $4, %rbp
cmpq %rdx, %rbp
jb .LBB0_9
# LLVM-MCA-END
movl $222, %ebx # OSACA END MARKER
.byte 100 # OSACA END MARKER
.byte 103 # OSACA END MARKER
.byte 144 # OSACA END MARKER
# %bb.10: #
# in Loop: Header=BB0_6 Depth=1
# Horizontal sums of the three 4-lane accumulators into %xmm10,
# %xmm11 and %xmm5, then reload everything the vector loop clobbered.
vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
vaddsd %xmm1, %xmm0, %xmm1
vextractf128 $1, %ymm0, %xmm0
vaddsd %xmm0, %xmm1, %xmm1
vpermilpd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0]
vaddsd %xmm0, %xmm1, %xmm10
vpermilpd $1, %xmm15, %xmm1 # xmm1 = xmm15[1,0]
vaddsd %xmm1, %xmm15, %xmm1
vextractf128 $1, %ymm15, %xmm2
vaddsd %xmm2, %xmm1, %xmm1
vpermilpd $1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
vaddsd %xmm2, %xmm1, %xmm11
vpermilpd $1, %xmm13, %xmm1 # xmm1 = xmm13[1,0]
vaddsd %xmm1, %xmm13, %xmm1
vextractf128 $1, %ymm13, %xmm2
vaddsd %xmm2, %xmm1, %xmm1
vpermilpd $1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
vaddsd %xmm2, %xmm1, %xmm5
movq 56(%rsp), %r14 # 8-byte Reload
vmovapd 80(%rsp), %xmm12 # 16-byte Reload
vmovapd 192(%rsp), %xmm13 # 16-byte Reload
movq 48(%rsp), %rsi # 8-byte Reload
movq 40(%rsp), %rdi # 8-byte Reload
vmovdqa 208(%rsp), %xmm14 # 16-byte Reload
vmovapd 176(%rsp), %xmm15 # 16-byte Reload
vmovapd %ymm9, %ymm8
movq 72(%rsp), %rbx # 8-byte Reload
movq 64(%rsp), %rbp # 8-byte Reload
vmovapd 112(%rsp), %xmm0 # 16-byte Reload
# Entries left over after the multiple-of-4 part go to the scalar loop.
cmpq %r13, %rdx
jae .LBB0_17
jmp .LBB0_11
.p2align 4, 0x90
.LBB0_4: #
# in Loop: Header=BB0_6 Depth=1
# Row with count <= 0: contribute zero sums and the (sign-extended)
# count itself to the counter update.
movslq %r13d, %rdx
vxorpd %xmm5, %xmm5, %xmm5
vxorpd %xmm11, %xmm11, %xmm11
vxorpd %xmm10, %xmm10, %xmm10
jmp .LBB0_5
.p2align 4, 0x90
.LBB0_16: #
# in Loop: Header=BB0_6 Depth=1
# Fewer than 4 entries: skip the vector loop and start from zero sums.
vxorpd %xmm10, %xmm10, %xmm10
vxorpd %xmm11, %xmm11, %xmm11
vxorpd %xmm5, %xmm5, %xmm5
cmpq %r13, %rdx
jae .LBB0_17
.LBB0_11: #
# in Loop: Header=BB0_6 Depth=1
vmovapd 96(%rsp), %xmm4 # 16-byte Reload
jmp .LBB0_13
.p2align 4, 0x90
.LBB0_12: #
# in Loop: Header=BB0_13 Depth=2
incq %rdx
cmpq %rdx, %r13
je .LBB0_17
.LBB0_13: #
# Parent Loop BB0_6 Depth=1
# => This Inner Loop Header: Depth=2
# Scalar remainder loop over entries %rdx .. %r13 - 1. It performs the
# same arithmetic as the vector loop for one index j at a time, with
# %xmm0 / %xmm4 / slot 0 holding the three row values.
movl (%r11,%rdx,4), %eax
leal (%rax,%rax,2), %ecx
movslq %ecx, %rcx
vsubsd (%r15,%rcx,8), %xmm0, %xmm6
leal (%rax,%rax,2), %ecx
incl %ecx
movslq %ecx, %rcx
vsubsd (%r15,%rcx,8), %xmm4, %xmm2
leal 2(%rax,%rax,2), %eax
cltq
vmovapd (%rsp), %xmm1 # 16-byte Reload
vsubsd (%r15,%rax,8), %xmm1, %xmm1
vmulsd %xmm6, %xmm6, %xmm7
vfmadd231sd %xmm2, %xmm2, %xmm7 # xmm7 = (xmm2 * xmm2) + xmm7
vfmadd231sd %xmm1, %xmm1, %xmm7 # xmm7 = (xmm1 * xmm1) + xmm7
# Skip the entry when the squared distance is not below %xmm13.
vucomisd %xmm13, %xmm7
jae .LBB0_12
# %bb.14: #
# in Loop: Header=BB0_13 Depth=2
# %xmm0 and %xmm4 are used as temporaries here and reloaded from
# slots 112 and 96 before the next iteration.
vmovsd .LCPI0_3(%rip), %xmm0 # xmm0 = mem[0],zero
vdivsd %xmm7, %xmm0, %xmm7
vmulsd %xmm7, %xmm7, %xmm0
vmulsd %xmm0, %xmm12, %xmm0
vmulsd %xmm7, %xmm0, %xmm0
vaddsd .LCPI0_4(%rip), %xmm0, %xmm4
vmulsd %xmm7, %xmm15, %xmm7
vmulsd %xmm0, %xmm7, %xmm0
vmulsd %xmm4, %xmm0, %xmm0
vmovapd 96(%rsp), %xmm4 # 16-byte Reload
vfmadd231sd %xmm6, %xmm0, %xmm10 # xmm10 = (xmm0 * xmm6) + xmm10
vfmadd231sd %xmm2, %xmm0, %xmm11 # xmm11 = (xmm0 * xmm2) + xmm11
vfmadd231sd %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5
vmovapd 112(%rsp), %xmm0 # 16-byte Reload
jmp .LBB0_12
.LBB0_18: #
# All rows done: write the two counters back through the saved pointer.
movq 24(%rsp), %rax # 8-byte Reload
vmovdqu %xmm14, (%rax)
.LBB0_19: #
# Close the marker region (vzeroupper first, since ymm registers were
# used), take the end timestamp and return end - start in %xmm0.
movl $.L.str, %edi
vzeroupper
callq likwid_markerStopRegion
xorl %eax, %eax
callq getTimeStamp
vsubsd 32(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload
addq $264, %rsp # imm = 0x108
.cfi_def_cfa_offset 56
popq %rbx
.cfi_def_cfa_offset 48
popq %r12
.cfi_def_cfa_offset 40
popq %r13
.cfi_def_cfa_offset 32
popq %r14
.cfi_def_cfa_offset 24
popq %r15
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
retq
.Lfunc_end0:
.size computeForceLJFullNeigh_plain_c, .Lfunc_end0-computeForceLJFullNeigh_plain_c
.cfi_endproc
# -- End function
# Constants used by computeForceLJHalfNeigh: the doubles 48, 1 and -0.5.
.section .rodata.cst8,"aM",@progbits,8
.p2align 3 # -- Begin function computeForceLJHalfNeigh
.LCPI1_0:
.quad 4631952216750555136 # 48
.LCPI1_1:
.quad 4607182418800017408 # 1
.LCPI1_2:
.quad -4620693217682128896 # -0.5
.text
# ---------------------------------------------------------------------
# computeForceLJHalfNeigh (compiler-generated, AT&T syntax, scalar only)
#
# Arguments arrive in %rdi, %rsi, %rdx, %rcx. Struct layouts are not
# visible in this file, so the list below only states what is accessed
# (field meanings are assumptions to be confirmed against force_lj.c):
#   %rdi  object with doubles at offsets 144, 40 and 56
#   %rsi  object with an int count n at offset 4, a pointer at offset 16
#         to input doubles (3 per entry) and a pointer at offset 64 to
#         output doubles (3 per entry)
#   %rdx  object with a pointer at offset 16 to int indices, a pointer
#         at offset 24 to per-row int counts and an int row stride at
#         offset 8
#   %rcx  pointer to two adjacent 64-bit counters that are updated
# Result: %xmm0 = getTimeStamp() after the loops minus getTimeStamp()
#         before them.
#
# Stack slots (offsets from %rsp after the subq below):
#   slot  0  first the 40(%rdi) double, later the row stride in bytes
#   slot  8  first the 144(%rdi) double, later the per-row counts pointer
#   slot 16  saved %rcx argument (counter pointer)
#   slot 24  start timestamp
#   slot 32  the 56(%rdi) value
# ---------------------------------------------------------------------
.globl computeForceLJHalfNeigh
.p2align 4, 0x90
.type computeForceLJHalfNeigh,@function
computeForceLJHalfNeigh: #
.LcomputeForceLJHalfNeigh$local:
.cfi_startproc
# %bb.0: #
# Prologue: save the six callee-saved GPRs and reserve 40 bytes.
pushq %rbp
.cfi_def_cfa_offset 16
pushq %r15
.cfi_def_cfa_offset 24
pushq %r14
.cfi_def_cfa_offset 32
pushq %r13
.cfi_def_cfa_offset 40
pushq %r12
.cfi_def_cfa_offset 48
pushq %rbx
.cfi_def_cfa_offset 56
subq $40, %rsp
.cfi_def_cfa_offset 96
.cfi_offset %rbx, -56
.cfi_offset %r12, -48
.cfi_offset %r13, -40
.cfi_offset %r14, -32
.cfi_offset %r15, -24
.cfi_offset %rbp, -16
# Keep the arguments across the calls below and spill the three
# doubles read from the %rdi object; %r13d = n.
movq %rcx, 16(%rsp) # 8-byte Spill
movq %rdx, %r15
movq %rsi, %r12
movl 4(%rsi), %r13d
vmovsd 144(%rdi), %xmm0 # xmm0 = mem[0],zero
vmovsd %xmm0, 8(%rsp) # 8-byte Spill
vmovsd 40(%rdi), %xmm0 # xmm0 = mem[0],zero
vmovsd %xmm0, (%rsp) # 8-byte Spill
vmovsd 56(%rdi), %xmm0 # xmm0 = mem[0],zero
vmovsd %xmm0, 32(%rsp) # 8-byte Spill
testl %r13d, %r13d
jle .LBB1_2
# %bb.1: #
# n > 0: zero n * 24 bytes (3 doubles per entry) of the output array.
movq 64(%r12), %rdi
leaq (,%r13,8), %rax
leaq (%rax,%rax,2), %rdx
xorl %esi, %esi
callq _intel_fast_memset
.LBB1_2: #
# Take the start timestamp and open the marker region named by .L.str.1.
xorl %eax, %eax
callq getTimeStamp
vmovsd %xmm0, 24(%rsp) # 8-byte Spill
movl $.L.str.1, %edi
callq likwid_markerStartRegion
testl %r13d, %r13d
jle .LBB1_8
# %bb.3: #
# Loop setup:
#   %xmm12 = (144(%rdi) value)^2, %xmm11 = 48 * (40(%rdi) value),
#   %xmm10 = the two counters, %rax = index list cursor,
#   %rsi = input array, %rdi = output array, %r12 = row index i.
vmovsd 8(%rsp), %xmm0 # 8-byte Reload
# xmm0 = mem[0],zero
vmulsd %xmm0, %xmm0, %xmm12
movq 16(%r15), %rax
movq 24(%r15), %rcx
movq %rcx, 8(%rsp) # 8-byte Spill
movslq 8(%r15), %rdx
movq 16(%r12), %rsi
movq 64(%r12), %rdi
vmovsd (%rsp), %xmm0 # 8-byte Reload
# xmm0 = mem[0],zero
vmulsd .LCPI1_0(%rip), %xmm0, %xmm11
movq 16(%rsp), %rcx # 8-byte Reload
vmovdqu (%rcx), %xmm10
shlq $2, %rdx
movq %rdx, (%rsp) # 8-byte Spill
xorl %r12d, %r12d
jmp .LBB1_4
.p2align 4, 0x90
.LBB1_5: #
# in Loop: Header=BB1_4 Depth=1
# Row with count <= 0: zero sums, and the count itself goes to %rdx.
vxorpd %xmm13, %xmm13, %xmm13
movq %r9, %rdx
vxorpd %xmm9, %xmm9, %xmm9
vxorpd %xmm14, %xmm14, %xmm14
.LBB1_6: #
# in Loop: Header=BB1_4 Depth=1
# Row epilogue: add the three row sums (%xmm14, %xmm9, %xmm13) to the
# output array at indices %r15, %r10, %r11 (3i, 3i+1, 3i+2).
vaddsd (%rdi,%r15,8), %xmm14, %xmm0
vmovsd %xmm0, (%rdi,%r15,8)
vaddsd (%rdi,%r10,8), %xmm9, %xmm0
vmovsd %xmm0, (%rdi,%r10,8)
vaddsd (%rdi,%r11,8), %xmm13, %xmm0
vmovsd %xmm0, (%rdi,%r11,8)
# Counter update: low qword += %rdx, high qword += (count + 3) / 4
# computed as a signed division that truncates toward zero.
leal 3(%r9), %ecx
addl $6, %r9d
testl %ecx, %ecx
cmovnsl %ecx, %r9d
sarl $2, %r9d
movslq %r9d, %rcx
vmovq %rcx, %xmm0
vmovq %rdx, %xmm1
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
vpaddq %xmm0, %xmm10, %xmm10
# Next row: advance the index cursor by the row stride in bytes.
incq %r12
addq (%rsp), %rax # 8-byte Folded Reload
cmpq %r13, %r12
je .LBB1_7
.LBB1_4: #
# =>This Loop Header: Depth=1
# Child Loop BB1_10 Depth 2
# Row i: %r9 = entry count, %r15d/%r10d/%r11d = 3i, 3i+1, 3i+2.
movq 8(%rsp), %rcx # 8-byte Reload
movslq (%rcx,%r12,4), %r9
leaq (%r12,%r12,2), %rcx
leal 1(%rcx), %r10d
leal 2(%rcx), %r11d
movl %ecx, %r15d
testq %r9, %r9
jle .LBB1_5
# %bb.9: #
# in Loop: Header=BB1_4 Depth=1
# Load the three input values of row i, clear the row sums and start
# the inner loop with %rcx = 0 and %rdx = count (zero-extended).
vmovsd (%rsi,%r15,8), %xmm15 # xmm15 = mem[0],zero
vmovsd (%rsi,%r10,8), %xmm4 # xmm4 = mem[0],zero
vmovsd (%rsi,%r11,8), %xmm1 # xmm1 = mem[0],zero
movl %r9d, %edx
vxorpd %xmm14, %xmm14, %xmm14
xorl %ecx, %ecx
vxorpd %xmm9, %xmm9, %xmm9
vxorpd %xmm13, %xmm13, %xmm13
jmp .LBB1_10
.p2align 4, 0x90
.LBB1_13: #
# in Loop: Header=BB1_10 Depth=2
incq %rcx
cmpq %rcx, %rdx
je .LBB1_6
.LBB1_10: #
# Parent Loop BB1_4 Depth=1
# => This Inner Loop Header: Depth=2
# Entry k of the row: %r8 = index j, %r14 = 3*j. The three differences
# (row value minus value at j) land in %xmm2, %xmm5, %xmm0 and the sum
# of their squares in %xmm6.
movslq (%rax,%rcx,4), %r8
leaq (%r8,%r8,2), %r14
vsubsd (%rsi,%r14,8), %xmm15, %xmm2
movslq %r14d, %rbp
vsubsd 8(%rsi,%rbp,8), %xmm4, %xmm5
vsubsd 16(%rsi,%rbp,8), %xmm1, %xmm0
vmulsd %xmm2, %xmm2, %xmm6
vfmadd231sd %xmm5, %xmm5, %xmm6 # xmm6 = (xmm5 * xmm5) + xmm6
vfmadd231sd %xmm0, %xmm0, %xmm6 # xmm6 = (xmm0 * xmm0) + xmm6
# Skip the entry when the squared distance is not below %xmm12.
vucomisd %xmm12, %xmm6
jae .LBB1_13
# %bb.11: #
# in Loop: Header=BB1_10 Depth=2
# With s = 1.0 / %xmm6 and t = s^3 * (56(%rdi) value), the factor in
# %xmm3 becomes s * (48 * 40(%rdi) value) * t * (t - 0.5). The three
# products factor * difference are added to the row sums.
vmovsd .LCPI1_1(%rip), %xmm3 # xmm3 = mem[0],zero
vdivsd %xmm6, %xmm3, %xmm6
vmulsd 32(%rsp), %xmm6, %xmm3 # 8-byte Folded Reload
vmulsd %xmm6, %xmm6, %xmm8
vmulsd %xmm3, %xmm8, %xmm3
vaddsd .LCPI1_2(%rip), %xmm3, %xmm7
vmulsd %xmm6, %xmm11, %xmm6
vmulsd %xmm3, %xmm6, %xmm3
vmulsd %xmm7, %xmm3, %xmm3
vmulsd %xmm2, %xmm3, %xmm6
vaddsd %xmm6, %xmm14, %xmm14
vmulsd %xmm5, %xmm3, %xmm2
vaddsd %xmm2, %xmm9, %xmm9
vmulsd %xmm0, %xmm3, %xmm0
vaddsd %xmm0, %xmm13, %xmm13
# Only indices j < n (signed compare) also receive the opposite
# contribution in the output array.
cmpl %r13d, %r8d
jge .LBB1_13
# %bb.12: #
# in Loop: Header=BB1_10 Depth=2
# Subtract the same three products from output entries 3j, 3j+1, 3j+2.
leaq 1(%rbp), %rbx
addq $2, %rbp
vmovsd (%rdi,%r14,8), %xmm3 # xmm3 = mem[0],zero
vsubsd %xmm6, %xmm3, %xmm3
vmovsd %xmm3, (%rdi,%r14,8)
vmovsd (%rdi,%rbx,8), %xmm3 # xmm3 = mem[0],zero
vsubsd %xmm2, %xmm3, %xmm2
vmovsd %xmm2, (%rdi,%rbx,8)
vmovsd (%rdi,%rbp,8), %xmm2 # xmm2 = mem[0],zero
vsubsd %xmm0, %xmm2, %xmm0
vmovsd %xmm0, (%rdi,%rbp,8)
jmp .LBB1_13
.LBB1_7: #
# All rows done: write the two counters back through the saved pointer.
movq 16(%rsp), %rax # 8-byte Reload
vmovdqu %xmm10, (%rax)
.LBB1_8: #
# Close the marker region, take the end timestamp and return
# end - start in %xmm0.
movl $.L.str.1, %edi
callq likwid_markerStopRegion
xorl %eax, %eax
callq getTimeStamp
vsubsd 24(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload
addq $40, %rsp
.cfi_def_cfa_offset 56
popq %rbx
.cfi_def_cfa_offset 48
popq %r12
.cfi_def_cfa_offset 40
popq %r13
.cfi_def_cfa_offset 32
popq %r14
.cfi_def_cfa_offset 24
popq %r15
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
retq
.Lfunc_end1:
.size computeForceLJHalfNeigh, .Lfunc_end1-computeForceLJHalfNeigh
.cfi_endproc
# -- End function
# ---------------------------------------------------------------------
# computeForceLJFullNeigh_simd (compiler-generated, AT&T syntax)
#
# Stub for builds without a SIMD kernel: it zeroes the output array of
# the %rsi object, calls getTimeStamp, writes the 65-byte message at
# .L.str.2 to stderr and calls exit(-1). There is no return path.
# Only %rsi is read (int count at offset 4, pointer at offset 64).
# ---------------------------------------------------------------------
.globl computeForceLJFullNeigh_simd # -- Begin function computeForceLJFullNeigh_simd
.p2align 4, 0x90
.type computeForceLJFullNeigh_simd,@function
computeForceLJFullNeigh_simd: #
.LcomputeForceLJFullNeigh_simd$local:
.cfi_startproc
# %bb.0: #
# The pushed value is never popped; presumably this push only adjusts
# the stack alignment for the calls below.
pushq %rax
.cfi_def_cfa_offset 16
movl 4(%rsi), %eax
testl %eax, %eax
jle .LBB2_2
# %bb.1: #
# n > 0: zero n * 24 bytes (3 doubles per entry) of the output array.
movq 64(%rsi), %rdi
shlq $3, %rax
leaq (%rax,%rax,2), %rdx
xorl %esi, %esi
callq _intel_fast_memset
.LBB2_2: #
# The timestamp result is not used afterwards.
xorl %eax, %eax
callq getTimeStamp
# fwrite(.L.str.2, 65, 1, stderr) followed by exit(-1).
movq stderr(%rip), %rcx
movl $.L.str.2, %edi
movl $65, %esi
movl $1, %edx
callq fwrite
movl $-1, %edi
callq exit
.Lfunc_end2:
.size computeForceLJFullNeigh_simd, .Lfunc_end2-computeForceLJFullNeigh_simd
.cfi_endproc
# -- End function
# String literals referenced by the functions above:
#   .L.str    region name passed to the likwid marker calls in
#             computeForceLJFullNeigh_plain_c
#   .L.str.1  region name passed to the likwid marker calls in
#             computeForceLJHalfNeigh
#   .L.str.2  message written to stderr by computeForceLJFullNeigh_simd
#             (65 characters plus the terminating NUL)
.type .L.str,@object #
.section .rodata.str1.1,"aMS",@progbits,1
.L.str:
.asciz "force"
.size .L.str, 6
.type .L.str.1,@object #
.L.str.1:
.asciz "forceLJ-halfneigh"
.size .L.str.1, 18
.type .L.str.2,@object #
.L.str.2:
.asciz "Error: SIMD kernel not implemented for specified instruction set!"
.size .L.str.2, 66
.ident "Intel(R) oneAPI DPC++ Compiler 2021.1-beta05 (2020.2.0.0304)"
.section ".note.GNU-stack","",@progbits

View File

@@ -1,105 +0,0 @@
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
Analyzed file: force_lj_icx_avx2_markers.s
Architecture: ZEN3
Timestamp: 2022-12-12 12:47:07
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 | 1 | 2 | 3 | DV0 | DV1 | 4 | 5 | 6 | 7 | 8 - 8DV | 9 | 10 | 11 | 12 | 13 || CP | LCD |
---------------------------------------------------------------------------------------------------------------------------------------------
172 | | | | | | | | | | | | | | | | || | | .LBB0_9: #
173 | | | | | | | | | | | | | | | | || | | # Parent Loop BB0_6 Depth=1
174 | | | | | | | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2
175 | | 0.250 | 0.75 | | | | | | | | | | | 0.50 | 0.50 | || 1.0 | | vpbroadcastd .LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
176 | 0.00 | | | 1.00 | | | | | | | | | | 0.50 | 0.50 | || 3.0 | | vpmulld (%r11,%rbp,4), %xmm1, %xmm11
177 | 0.00 | 1.010 | 0.25 | 0.74 | | | | | | | | | | | | || 4.0 | | vpmovsxdq %xmm11, %ymm1
178 | | 0.000 | 1.00 | | | | | | | | | | | | | || 1.0 | | vpsllq $3, %ymm1, %ymm1
179 | 0.00 | 0.000 | 0.49 | 0.51 | | | | | | | | | | | | || 1.0 | | vpaddq %ymm1, %ymm3, %ymm1
180 | 0.00 | 0.000 | 0.51 | 0.49 | | | | | | | | | | | | || | | vmovq %xmm1, %r14
181 | | 1.000 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm1, %r9
182 | | 1.000 | | | | | | | | | | | | | | || 3.0 | | vextracti128 $1, %ymm1, %xmm1
183 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%r14), %xmm2 # xmm2 = mem[0],zero
184 | 0.00 | 0.000 | 0.49 | 0.51 | | | | | | | | | | | | || | | vpsubd .LCPI0_5, %xmm11, %xmm6
185 | 0.00 | 0.750 | 0.38 | 0.87 | | | | | | | | | | | | || | | vpmovsxdq %xmm6, %ymm6
186 | | 0.000 | 1.00 | | | | | | | | | | | | | || | | vpsllq $3, %ymm6, %ymm6
187 | 0.00 | 0.000 | 0.50 | 0.50 | | | | | | | | | | | | || | | vmovq %xmm1, %rdi
188 | 0.00 | 0.000 | 0.50 | 0.50 | | | | | | | | | | | | || | | vpaddq %ymm6, %ymm3, %ymm6
189 | 0.00 | 0.000 | 0.50 | 0.50 | | | | | | | | | | | | || | | vmovq %xmm6, %rcx
190 | | 1.000 | | | | | | | | | | | | | | || 6.0 | | vpextrq $1, %xmm1, %rbx
191 | | 1.000 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm6, %rax
192 | | 1.000 | | | | | | | | | | | | | | || | | vextracti128 $1, %ymm6, %xmm1
193 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rdi), %xmm6 # xmm6 = mem[0],zero
194 | 0.00 | 0.000 | 0.00 | 1.00 | | | | | | | | | | | | || | | vmovq %xmm1, %rdi
195 | | 1.000 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm1, %rsi
196 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
197 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rcx), %xmm7 # xmm7 = mem[0],zero
198 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vpbroadcastd .LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
199 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%r9), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0]
200 | 0.00 | 0.000 | 0.62 | 0.38 | | | | | | | | | | | | || | | vpaddd %xmm12, %xmm11, %xmm4
201 | 0.00 | 0.750 | 0.00 | 1.25 | | | | | | | | | | | | || | | vpmovsxdq %xmm4, %ymm4
202 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rax), %xmm7, %xmm7 # xmm7 = xmm7[0],mem[0]
203 | | 0.000 | 1.00 | | | | | | | | | | | | | || | | vpsllq $3, %ymm4, %ymm4
204 | 0.00 | 0.000 | 0.00 | 1.00 | | | | | | | | | | | | || | | vpaddq %ymm4, %ymm3, %ymm4
205 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || 5.0 | | vmovhpd (%rbx), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
206 | | 1.000 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm4, %rax
207 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rsi), %xmm1, %xmm1 # xmm1 = xmm1[0],mem[0]
208 | 0.00 | 0.000 | 0.51 | 0.49 | | | | | | | | | | | | || | | vmovq %xmm4, %rcx
209 | | 1.000 | | | | | | | | | | | | | | || | | vextracti128 $1, %ymm4, %xmm4
210 | 0.00 | -0.01 | 0.00 | 1.00 | | | | | | | | | | | | || | | vmovq %xmm4, %rsi
211 | | 1.000 | | | | | | | | | | | | | | || 1.0 | | vinsertf128 $1, %xmm6, %ymm2, %ymm2
212 | | 1.000 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm4, %rdi
213 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rsi), %xmm4 # xmm4 = mem[0],zero
214 | | | 0.00 | 1.00 | | | | | | | | | | | | || 3.0 | | vsubpd %ymm2, %ymm14, %ymm2
215 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rdi), %xmm4, %xmm4 # xmm4 = xmm4[0],mem[0]
216 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rcx), %xmm6 # xmm6 = mem[0],zero
217 | | 1.000 | | | | | | | | | | | | | | || | | vinsertf128 $1, %xmm1, %ymm7, %ymm1
218 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rax), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
219 | | 1.000 | | | | | | | | | | | | | | || | | vinsertf128 $1, %xmm4, %ymm6, %ymm4
220 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vsubpd %ymm1, %ymm5, %ymm1
221 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vsubpd %ymm4, %ymm10, %ymm4
222 | 1.00 | 0.000 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm2, %ymm2, %ymm6
223 | 1.00 | 0.000 | | | | | | | | | | | | | | || 4.0 | | vfmadd231pd %ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
224 | 1.00 | 0.000 | | | | | | | | | | | | | | || 4.0 | | vfmadd231pd %ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
225 | 1.00 | | | | | | | | | | | | | 0.50 | 0.50 | || | | vbroadcastsd .LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
226 | | | | | 4.50 | 4.50 | | | | | | | | | | || 13.0 | | vdivpd %ymm6, %ymm7, %ymm7
227 | 1.00 | 0.000 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm7, %ymm11
228 | 1.00 | 0.000 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm9, %ymm11, %ymm11
229 | 1.00 | | | | | | | | | | | | | 0.50 | 0.50 | || | | vbroadcastsd .LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
230 | 1.00 | 0.000 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm11, %ymm11
231 | | | 0.00 | 1.00 | | | | | | | | | | | | || 3.0 | | vaddpd %ymm12, %ymm11, %ymm12
232 | 1.00 | 0.000 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
233 | 1.00 | 0.000 | | | | | | | | | | | | | | || | | vmulpd %ymm7, %ymm11, %ymm7
234 | 1.00 | 0.000 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm12, %ymm7
235 | | | 0.12 | 0.88 | | | | | | | | | | | | || | | vcmpltpd %ymm8, %ymm6, %ymm6
236 | 1.00 | 0.000 | | | | | | | | | | | | | | || 4.0 | | vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
237 | 1.00 | 0.000 | | | | | | | | | | | | | | || 1.0 | | vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
238 | 1.00 | 0.000 | | | | | | | | | | | | | | || | | vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
239 | 1.00 | 0.000 | | | | | | | | | | | | | | || | 4.0 | vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
240 | 0.62 | 0.380 | | | | | | | | | | | | | | || | | vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
241 | 0.50 | 0.500 | | | | | | | | | | | | | | || | 1.0 | vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
242 | | | | | | | | | 0.25 | 0.25 | 0.25 | 0.25 | | | | || | | addq $4, %rbp
243 | | | | | | | | | 0.25 | 0.25 | 0.25 | 0.25 | | | | || | | cmpq %rdx, %rbp
244 | | | | | | | | | 0.00 | | | | 1.00 | | | || | | jb .LBB0_9
16.1 15.63 15.6 15.6 4.50 4.50 0.50 0.50 0.50 0.50 9.00 9.00 72 5.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
239 | 5.0 | vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13| [239, 241]
238 | 5.0 | vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15| [238, 240]
236 | 5.0 | vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0| [236, 237]
242 | 1.0 | addq $4, %rbp | [242]
241 | 1.0 | vblendvpd %ymm6, %ymm4, %ymm13, %ymm13| [241]
240 | 1.0 | vblendvpd %ymm6, %ymm1, %ymm15, %ymm15| [240]
237 | 1.0 | vblendvpd %ymm6, %ymm2, %ymm0, %ymm0| [237]

View File

@@ -1,638 +0,0 @@
.text
.file "force_lj.c"
.section .rodata.cst8,"aM",@progbits,8
.p2align 3 # -- Begin function computeForceLJFullNeigh_plain_c
.LCPI0_0:
.quad 4631952216750555136 # 48
.LCPI0_3:
.quad 4607182418800017408 # 1
.LCPI0_4:
.quad -4620693217682128896 # -0.5
.section .rodata.cst4,"aM",@progbits,4
.p2align 2
.LCPI0_1:
.long 3 # 0x3
.LCPI0_2:
.long 2 # 0x2
.section .rodata.cst16,"aM",@progbits,16
.p2align 4
.LCPI0_5:
.zero 16,255
.text
# computeForceLJFullNeigh_plain_c (x86-64, AT&T syntax, compiler-generated)
# Reads four register arguments (%rdi, %rsi, %rdx, %rcx). Returns a double in
# %xmm0: the difference between the two getTimeStamp results taken around the
# kernel.
# Structure: an outer loop over the 32-bit count at 4(%rsi); per outer
# iteration a 4-wide AVX2 loop (.LBB0_9) followed by a scalar remainder loop
# (.LBB0_13). Results are accumulated into the array at 64(%rsi), and two
# 64-bit counters at (%rcx) are updated.
# NOTE(review): the struct layouts behind the fixed offsets (4/16/64 on %rsi,
# 40/56/144 on %rdi, 8/16/24 on %rdx) come from the C source, not this file.
.globl computeForceLJFullNeigh_plain_c
.p2align 4, 0x90
.type computeForceLJFullNeigh_plain_c,@function
computeForceLJFullNeigh_plain_c: #
.LcomputeForceLJFullNeigh_plain_c$local:
.cfi_startproc
# %bb.0: #
pushq %rbp
.cfi_def_cfa_offset 16
pushq %r15
.cfi_def_cfa_offset 24
pushq %r14
.cfi_def_cfa_offset 32
pushq %r13
.cfi_def_cfa_offset 40
pushq %r12
.cfi_def_cfa_offset 48
pushq %rbx
.cfi_def_cfa_offset 56
subq $264, %rsp # imm = 0x108
.cfi_def_cfa_offset 320
.cfi_offset %rbx, -56
.cfi_offset %r12, -48
.cfi_offset %r13, -40
.cfi_offset %r14, -32
.cfi_offset %r15, -24
.cfi_offset %rbp, -16
# Keep arguments 2-4 in callee-saved registers so they survive the calls below.
movq %rcx, %rbx
movq %rdx, %r15
movq %rsi, %r12
# r14d = 32-bit count at 4(%rsi).
movl 4(%rsi), %r14d
# Spill three doubles read from the first argument (offsets 144, 40, 56).
vmovsd 144(%rdi), %xmm0 # xmm0 = mem[0],zero
vmovsd %xmm0, (%rsp) # 8-byte Spill
vmovsd 40(%rdi), %xmm0 # xmm0 = mem[0],zero
vmovsd %xmm0, 128(%rsp) # 8-byte Spill
vmovq 56(%rdi), %xmm0 # xmm0 = mem[0],zero
vmovdqa %xmm0, 80(%rsp) # 16-byte Spill
testl %r14d, %r14d
jle .LBB0_2
# %bb.1: #
# _intel_fast_memset(64(%r12), 0, count * 24): rdx = (count * 8) * 3.
movq 64(%r12), %rdi
leaq (,%r14,8), %rax
leaq (%rax,%rax,2), %rdx
xorl %esi, %esi
callq _intel_fast_memset
.LBB0_2: #
# Start timestamp is kept at 32(%rsp); then the marker region .L.str is opened.
xorl %eax, %eax
callq getTimeStamp
vmovq %xmm0, 32(%rsp) # 8-byte Folded Spill
movl $.L.str, %edi
callq likwid_markerStartRegion
testl %r14d, %r14d
jle .LBB0_19
# %bb.3: #
# Loop-invariant setup:
#   xmm13 = (value from 144(%rdi))^2, used as the distance threshold
#   xmm15 = 48 * (value from 40(%rdi))
#   xmm12 = value from 56(%rdi)
#   r11 = 16(%r15), rsi = 24(%r15), rdi = 4 * sign-extended 8(%r15)
#   r15 = 16(%r12) (input array base), r8 = 64(%r12) (output array base)
#   xmm14 = the two 64-bit counters loaded from the fourth argument
#   r14 = count - 1 (outer-loop end value)
vmovsd (%rsp), %xmm0 # 8-byte Reload
# xmm0 = mem[0],zero
vmulsd %xmm0, %xmm0, %xmm13
movq 16(%r15), %r11
movq 24(%r15), %rsi
movslq 8(%r15), %rdi
movq 16(%r12), %r15
movq 64(%r12), %r8
vmovsd 128(%rsp), %xmm0 # 8-byte Reload
# xmm0 = mem[0],zero
vmulsd .LCPI0_0(%rip), %xmm0, %xmm15
movq %rbx, 24(%rsp) # 8-byte Spill
vmovdqu (%rbx), %xmm14
decq %r14
# Broadcast the input base address and the scalar constants to all four lanes.
vmovq %r15, %xmm0
vpbroadcastq %xmm0, %ymm3
vbroadcastsd %xmm13, %ymm2
vmovapd 80(%rsp), %xmm12 # 16-byte Reload
vbroadcastsd %xmm12, %ymm8
vbroadcastsd %xmm15, %ymm9
shlq $2, %rdi
xorl %r10d, %r10d
movq %r14, 56(%rsp) # 8-byte Spill
vmovapd %xmm13, 192(%rsp) # 16-byte Spill
movq %rsi, 48(%rsp) # 8-byte Spill
movq %rdi, 40(%rsp) # 8-byte Spill
vmovapd %xmm15, 176(%rsp) # 16-byte Spill
vmovupd %ymm2, 224(%rsp) # 32-byte Spill
vmovupd %ymm9, 128(%rsp) # 32-byte Spill
jmp .LBB0_6
.p2align 4, 0x90
.LBB0_17: #
# in Loop: Header=BB0_6 Depth=1
movq %r13, %rdx
.LBB0_5: #
# in Loop: Header=BB0_6 Depth=1
# End of one outer iteration: add the three accumulators (xmm10, xmm11, xmm5)
# into the output array at element indices r12, rbx, rbp.
vaddsd (%r8,%r12,8), %xmm10, %xmm0
vmovsd %xmm0, (%r8,%r12,8)
vaddsd (%r8,%rbx,8), %xmm11, %xmm0
vmovsd %xmm0, (%r8,%rbx,8)
vaddsd (%r8,%rbp,8), %xmm5, %xmm0
vmovsd %xmm0, (%r8,%rbp,8)
# r13d = (r13d + 3) / 4 as a signed division (add 3 more when negative, then
# arithmetic shift). Counters: lane 0 += rdx, lane 1 += that quotient.
leal 3(%r13), %eax
addl $6, %r13d
testl %eax, %eax
cmovnsl %eax, %r13d
sarl $2, %r13d
movslq %r13d, %rax
vmovq %rax, %xmm0
vmovq %rdx, %xmm1
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
vpaddq %xmm0, %xmm14, %xmm14
# Advance the index-list pointer by the row stride. The leaq increments r10
# without touching the flags that cmpq set for the je.
addq %rdi, %r11
cmpq %r14, %r10
leaq 1(%r10), %r10
je .LBB0_18
.LBB0_6: #
# =>This Loop Header: Depth=1
# Child Loop BB0_9 Depth 2
# Child Loop BB0_13 Depth 2
# r13d = 32-bit entry r10 of the array at %rsi (inner trip count);
# r12d / ebx / ebp = 3*r10, 3*r10 + 1, 3*r10 + 2.
movl (%rsi,%r10,4), %r13d
leal (%r10,%r10,2), %r12d
leal (%r10,%r10,2), %ebx
incl %ebx
leal (%r10,%r10,2), %ebp
addl $2, %ebp
testl %r13d, %r13d
jle .LBB0_4
# %bb.7: #
# in Loop: Header=BB0_6 Depth=1
# Load the three input values for element r10 and spill them for the scalar
# loop. rdx = r13 rounded down to a multiple of 4 (vector-loop trip count).
vmovsd (%r15,%r12,8), %xmm0 # xmm0 = mem[0],zero
vmovsd (%r15,%rbx,8), %xmm1 # xmm1 = mem[0],zero
vmovsd (%r15,%rbp,8), %xmm2 # xmm2 = mem[0],zero
movq %r13, %rdx
movl $4294967292, %eax # imm = 0xFFFFFFFC
andq %rax, %rdx
vmovapd %xmm0, 112(%rsp) # 16-byte Spill
vmovapd %xmm1, 96(%rsp) # 16-byte Spill
vmovapd %xmm2, (%rsp) # 16-byte Spill
# The je still sees the flags from the andq above: with no full group of
# four, only the scalar loop runs.
je .LBB0_16
# %bb.8: #
# in Loop: Header=BB0_6 Depth=1
# Vector-loop setup: ymm14 / ymm5 / ymm10 = the three values of element r10
# broadcast; ymm0 / ymm15 / ymm13 = zeroed accumulators; ymm9 = broadcast
# value from 56(%rdi); ymm8 = broadcast squared threshold.
movq %rbp, 64(%rsp) # 8-byte Spill
movq %rbx, 72(%rsp) # 8-byte Spill
vmovdqa %xmm14, 208(%rsp) # 16-byte Spill
vbroadcastsd %xmm0, %ymm14
vbroadcastsd %xmm1, %ymm5
vbroadcastsd %xmm2, %ymm10
vxorpd %xmm0, %xmm0, %xmm0
vxorpd %xmm15, %xmm15, %xmm15
vxorpd %xmm13, %xmm13, %xmm13
xorl %ebp, %ebp
vmovapd %ymm8, %ymm9
vmovupd 224(%rsp), %ymm8 # 32-byte Reload
.p2align 4, 0x90
# Vector loop: four 32-bit indices per iteration are read from (%r11,%rbp,4).
# Each index is multiplied by 3 (and offset by +1 / +2), sign-extended, scaled
# by 8 and added to the base in ymm3. The resulting addresses are moved to
# general-purpose registers and loaded with vmovsd/vmovhpd (no gather
# instruction is used). ymm6 holds the sum of the three squared differences;
# lanes where it is below ymm8 are blended into the accumulators.
# OSACA-BEGIN
# LLVM-MCA-BEGIN
.LBB0_9: #
# Parent Loop BB0_6 Depth=1
# => This Inner Loop Header: Depth=2
vpbroadcastd .LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
vpmulld (%r11,%rbp,4), %xmm1, %xmm11
vpmovsxdq %xmm11, %ymm1
vpsllq $3, %ymm1, %ymm1
vpaddq %ymm1, %ymm3, %ymm1
vmovq %xmm1, %r14
vpextrq $1, %xmm1, %r9
vextracti128 $1, %ymm1, %xmm1
vmovsd (%r14), %xmm2 # xmm2 = mem[0],zero
vpsubd .LCPI0_5, %xmm11, %xmm6
vpmovsxdq %xmm6, %ymm6
vpsllq $3, %ymm6, %ymm6
vmovq %xmm1, %rdi
vpaddq %ymm6, %ymm3, %ymm6
vmovq %xmm6, %rcx
vpextrq $1, %xmm1, %rbx
vpextrq $1, %xmm6, %rax
vextracti128 $1, %ymm6, %xmm1
vmovsd (%rdi), %xmm6 # xmm6 = mem[0],zero
vmovq %xmm1, %rdi
vpextrq $1, %xmm1, %rsi
vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
vmovsd (%rcx), %xmm7 # xmm7 = mem[0],zero
vpbroadcastd .LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
vmovhpd (%r9), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0]
vpaddd %xmm12, %xmm11, %xmm4
vpmovsxdq %xmm4, %ymm4
vmovhpd (%rax), %xmm7, %xmm7 # xmm7 = xmm7[0],mem[0]
vpsllq $3, %ymm4, %ymm4
vpaddq %ymm4, %ymm3, %ymm4
vmovhpd (%rbx), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
vpextrq $1, %xmm4, %rax
vmovhpd (%rsi), %xmm1, %xmm1 # xmm1 = xmm1[0],mem[0]
vmovq %xmm4, %rcx
vextracti128 $1, %ymm4, %xmm4
vmovq %xmm4, %rsi
vinsertf128 $1, %xmm6, %ymm2, %ymm2
vpextrq $1, %xmm4, %rdi
vmovsd (%rsi), %xmm4 # xmm4 = mem[0],zero
vsubpd %ymm2, %ymm14, %ymm2
vmovhpd (%rdi), %xmm4, %xmm4 # xmm4 = xmm4[0],mem[0]
vmovsd (%rcx), %xmm6 # xmm6 = mem[0],zero
vinsertf128 $1, %xmm1, %ymm7, %ymm1
vmovhpd (%rax), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
vinsertf128 $1, %xmm4, %ymm6, %ymm4
vsubpd %ymm1, %ymm5, %ymm1
vsubpd %ymm4, %ymm10, %ymm4
vmulpd %ymm2, %ymm2, %ymm6
vfmadd231pd %ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
vfmadd231pd %ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
vbroadcastsd .LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
vdivpd %ymm6, %ymm7, %ymm7
vmulpd %ymm7, %ymm7, %ymm11
vmulpd %ymm9, %ymm11, %ymm11
vbroadcastsd .LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
vmulpd %ymm7, %ymm11, %ymm11
vaddpd %ymm12, %ymm11, %ymm12
vmulpd 128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
vmulpd %ymm7, %ymm11, %ymm7
vmulpd %ymm7, %ymm12, %ymm7
vcmpltpd %ymm8, %ymm6, %ymm6
vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
addq $4, %rbp
cmpq %rdx, %rbp
jb .LBB0_9
# LLVM-MCA-END
# OSACA-END
# %bb.10: #
# in Loop: Header=BB0_6 Depth=1
# Horizontal reduction: sum the four lanes of ymm0, ymm15 and ymm13 into the
# scalars xmm10, xmm11 and xmm5.
vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
vaddsd %xmm1, %xmm0, %xmm1
vextractf128 $1, %ymm0, %xmm0
vaddsd %xmm0, %xmm1, %xmm1
vpermilpd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0]
vaddsd %xmm0, %xmm1, %xmm10
vpermilpd $1, %xmm15, %xmm1 # xmm1 = xmm15[1,0]
vaddsd %xmm1, %xmm15, %xmm1
vextractf128 $1, %ymm15, %xmm2
vaddsd %xmm2, %xmm1, %xmm1
vpermilpd $1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
vaddsd %xmm2, %xmm1, %xmm11
vpermilpd $1, %xmm13, %xmm1 # xmm1 = xmm13[1,0]
vaddsd %xmm1, %xmm13, %xmm1
vextractf128 $1, %ymm13, %xmm2
vaddsd %xmm2, %xmm1, %xmm1
vpermilpd $1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
vaddsd %xmm2, %xmm1, %xmm5
# Restore the registers that the vector loop reused.
movq 56(%rsp), %r14 # 8-byte Reload
vmovapd 80(%rsp), %xmm12 # 16-byte Reload
vmovapd 192(%rsp), %xmm13 # 16-byte Reload
movq 48(%rsp), %rsi # 8-byte Reload
movq 40(%rsp), %rdi # 8-byte Reload
vmovdqa 208(%rsp), %xmm14 # 16-byte Reload
vmovapd 176(%rsp), %xmm15 # 16-byte Reload
vmovapd %ymm9, %ymm8
movq 72(%rsp), %rbx # 8-byte Reload
movq 64(%rsp), %rbp # 8-byte Reload
vmovapd 112(%rsp), %xmm0 # 16-byte Reload
# No remainder when the vector loop already covered all r13 entries.
cmpq %r13, %rdx
jae .LBB0_17
jmp .LBB0_11
.p2align 4, 0x90
.LBB0_4: #
# in Loop: Header=BB0_6 Depth=1
# Inner trip count <= 0: the accumulators are zero and rdx = sign-extended r13d.
movslq %r13d, %rdx
vxorpd %xmm5, %xmm5, %xmm5
vxorpd %xmm11, %xmm11, %xmm11
vxorpd %xmm10, %xmm10, %xmm10
jmp .LBB0_5
.p2align 4, 0x90
.LBB0_16: #
# in Loop: Header=BB0_6 Depth=1
# Fewer than four entries: start the scalar loop with zeroed accumulators.
vxorpd %xmm10, %xmm10, %xmm10
vxorpd %xmm11, %xmm11, %xmm11
vxorpd %xmm5, %xmm5, %xmm5
cmpq %r13, %rdx
jae .LBB0_17
.LBB0_11: #
# in Loop: Header=BB0_6 Depth=1
vmovapd 96(%rsp), %xmm4 # 16-byte Reload
jmp .LBB0_13
.p2align 4, 0x90
.LBB0_12: #
# in Loop: Header=BB0_13 Depth=2
incq %rdx
cmpq %rdx, %r13
je .LBB0_17
.LBB0_13: #
# Parent Loop BB0_6 Depth=1
# => This Inner Loop Header: Depth=2
# Scalar remainder loop: one index per iteration, rdx running up to r13.
# xmm6 / xmm2 / xmm1 = the three differences, xmm7 = sum of their squares.
movl (%r11,%rdx,4), %eax
leal (%rax,%rax,2), %ecx
movslq %ecx, %rcx
vsubsd (%r15,%rcx,8), %xmm0, %xmm6
leal (%rax,%rax,2), %ecx
incl %ecx
movslq %ecx, %rcx
vsubsd (%r15,%rcx,8), %xmm4, %xmm2
leal 2(%rax,%rax,2), %eax
cltq
vmovapd (%rsp), %xmm1 # 16-byte Reload
vsubsd (%r15,%rax,8), %xmm1, %xmm1
vmulsd %xmm6, %xmm6, %xmm7
vfmadd231sd %xmm2, %xmm2, %xmm7 # xmm7 = (xmm2 * xmm2) + xmm7
vfmadd231sd %xmm1, %xmm1, %xmm7 # xmm7 = (xmm1 * xmm1) + xmm7
# Skip the update when xmm7 >= xmm13 (the squared threshold).
vucomisd %xmm13, %xmm7
jae .LBB0_12
# %bb.14: #
# in Loop: Header=BB0_13 Depth=2
# Same arithmetic as the vector loop on scalars; xmm0 and xmm4 are used as
# temporaries and reloaded from their spill slots afterwards.
vmovsd .LCPI0_3(%rip), %xmm0 # xmm0 = mem[0],zero
vdivsd %xmm7, %xmm0, %xmm7
vmulsd %xmm7, %xmm7, %xmm0
vmulsd %xmm0, %xmm12, %xmm0
vmulsd %xmm7, %xmm0, %xmm0
vaddsd .LCPI0_4(%rip), %xmm0, %xmm4
vmulsd %xmm7, %xmm15, %xmm7
vmulsd %xmm0, %xmm7, %xmm0
vmulsd %xmm4, %xmm0, %xmm0
vmovapd 96(%rsp), %xmm4 # 16-byte Reload
vfmadd231sd %xmm6, %xmm0, %xmm10 # xmm10 = (xmm0 * xmm6) + xmm10
vfmadd231sd %xmm2, %xmm0, %xmm11 # xmm11 = (xmm0 * xmm2) + xmm11
vfmadd231sd %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5
vmovapd 112(%rsp), %xmm0 # 16-byte Reload
jmp .LBB0_12
.LBB0_18: #
# Write the two updated counters back through the fourth argument.
movq 24(%rsp), %rax # 8-byte Reload
vmovdqu %xmm14, (%rax)
.LBB0_19: #
# Close the marker region, take the end timestamp and return end - start.
movl $.L.str, %edi
vzeroupper
callq likwid_markerStopRegion
xorl %eax, %eax
callq getTimeStamp
vsubsd 32(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload
addq $264, %rsp # imm = 0x108
.cfi_def_cfa_offset 56
popq %rbx
.cfi_def_cfa_offset 48
popq %r12
.cfi_def_cfa_offset 40
popq %r13
.cfi_def_cfa_offset 32
popq %r14
.cfi_def_cfa_offset 24
popq %r15
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
retq
.Lfunc_end0:
.size computeForceLJFullNeigh_plain_c, .Lfunc_end0-computeForceLJFullNeigh_plain_c
.cfi_endproc
# -- End function
.section .rodata.cst8,"aM",@progbits,8
.p2align 3 # -- Begin function computeForceLJHalfNeigh
.LCPI1_0:
.quad 4631952216750555136 # 48
.LCPI1_1:
.quad 4607182418800017408 # 1
.LCPI1_2:
.quad -4620693217682128896 # -0.5
.text
# computeForceLJHalfNeigh (x86-64, AT&T syntax, compiler-generated)
# Reads four register arguments (%rdi, %rsi, %rdx, %rcx). Returns a double in
# %xmm0: the difference between the two getTimeStamp results taken around the
# kernel.
# Scalar only (no ymm registers). For every processed index pair the
# contribution is added to the outer element's accumulators and, when the
# inner index is below the count, also subtracted from the inner element's
# entries in the output array at 64(%rsi).
# NOTE(review): the struct layouts behind the fixed offsets (4/16/64 on %rsi,
# 40/56/144 on %rdi, 8/16/24 on %rdx) come from the C source, not this file.
.globl computeForceLJHalfNeigh
.p2align 4, 0x90
.type computeForceLJHalfNeigh,@function
computeForceLJHalfNeigh: #
.LcomputeForceLJHalfNeigh$local:
.cfi_startproc
# %bb.0: #
pushq %rbp
.cfi_def_cfa_offset 16
pushq %r15
.cfi_def_cfa_offset 24
pushq %r14
.cfi_def_cfa_offset 32
pushq %r13
.cfi_def_cfa_offset 40
pushq %r12
.cfi_def_cfa_offset 48
pushq %rbx
.cfi_def_cfa_offset 56
subq $40, %rsp
.cfi_def_cfa_offset 96
.cfi_offset %rbx, -56
.cfi_offset %r12, -48
.cfi_offset %r13, -40
.cfi_offset %r14, -32
.cfi_offset %r15, -24
.cfi_offset %rbp, -16
# Preserve the arguments across the calls below: arg 4 on the stack, args 2
# and 3 in callee-saved registers. r13d = 32-bit count at 4(%rsi).
movq %rcx, 16(%rsp) # 8-byte Spill
movq %rdx, %r15
movq %rsi, %r12
movl 4(%rsi), %r13d
# Spill three doubles read from the first argument (offsets 144, 40, 56).
vmovsd 144(%rdi), %xmm0 # xmm0 = mem[0],zero
vmovsd %xmm0, 8(%rsp) # 8-byte Spill
vmovsd 40(%rdi), %xmm0 # xmm0 = mem[0],zero
vmovsd %xmm0, (%rsp) # 8-byte Spill
vmovsd 56(%rdi), %xmm0 # xmm0 = mem[0],zero
vmovsd %xmm0, 32(%rsp) # 8-byte Spill
testl %r13d, %r13d
jle .LBB1_2
# %bb.1: #
# _intel_fast_memset(64(%r12), 0, count * 24): rdx = (count * 8) * 3.
movq 64(%r12), %rdi
leaq (,%r13,8), %rax
leaq (%rax,%rax,2), %rdx
xorl %esi, %esi
callq _intel_fast_memset
.LBB1_2: #
# Start timestamp is kept at 24(%rsp); then the marker region .L.str.1 is opened.
xorl %eax, %eax
callq getTimeStamp
vmovsd %xmm0, 24(%rsp) # 8-byte Spill
movl $.L.str.1, %edi
callq likwid_markerStartRegion
testl %r13d, %r13d
jle .LBB1_8
# %bb.3: #
# Loop-invariant setup:
#   xmm12 = (value from 144(%rdi))^2, used as the distance threshold
#   xmm11 = 48 * (value from 40(%rdi))
#   rax = 16(%r15) (index list), 8(%rsp) = 24(%r15) (per-element trip counts)
#   (%rsp) = 4 * sign-extended 8(%r15) (row stride of the index list in bytes)
#   rsi = 16(%r12) (input array base), rdi = 64(%r12) (output array base)
#   xmm10 = the two 64-bit counters loaded from the fourth argument
vmovsd 8(%rsp), %xmm0 # 8-byte Reload
# xmm0 = mem[0],zero
vmulsd %xmm0, %xmm0, %xmm12
movq 16(%r15), %rax
movq 24(%r15), %rcx
movq %rcx, 8(%rsp) # 8-byte Spill
movslq 8(%r15), %rdx
movq 16(%r12), %rsi
movq 64(%r12), %rdi
vmovsd (%rsp), %xmm0 # 8-byte Reload
# xmm0 = mem[0],zero
vmulsd .LCPI1_0(%rip), %xmm0, %xmm11
movq 16(%rsp), %rcx # 8-byte Reload
vmovdqu (%rcx), %xmm10
shlq $2, %rdx
movq %rdx, (%rsp) # 8-byte Spill
xorl %r12d, %r12d
jmp .LBB1_4
.p2align 4, 0x90
.LBB1_5: #
# in Loop: Header=BB1_4 Depth=1
# Inner trip count <= 0: the accumulators are zero and rdx = r9.
vxorpd %xmm13, %xmm13, %xmm13
movq %r9, %rdx
vxorpd %xmm9, %xmm9, %xmm9
vxorpd %xmm14, %xmm14, %xmm14
.LBB1_6: #
# in Loop: Header=BB1_4 Depth=1
# End of one outer iteration: add the three accumulators (xmm14, xmm9, xmm13)
# into the output array at element indices r15, r10, r11.
vaddsd (%rdi,%r15,8), %xmm14, %xmm0
vmovsd %xmm0, (%rdi,%r15,8)
vaddsd (%rdi,%r10,8), %xmm9, %xmm0
vmovsd %xmm0, (%rdi,%r10,8)
vaddsd (%rdi,%r11,8), %xmm13, %xmm0
vmovsd %xmm0, (%rdi,%r11,8)
# r9d = (r9d + 3) / 4 as a signed division (add 3 more when negative, then
# arithmetic shift). Counters: lane 0 += rdx, lane 1 += that quotient.
leal 3(%r9), %ecx
addl $6, %r9d
testl %ecx, %ecx
cmovnsl %ecx, %r9d
sarl $2, %r9d
movslq %r9d, %rcx
vmovq %rcx, %xmm0
vmovq %rdx, %xmm1
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
vpaddq %xmm0, %xmm10, %xmm10
# Next outer element; advance the index-list pointer by the row stride.
incq %r12
addq (%rsp), %rax # 8-byte Folded Reload
cmpq %r13, %r12
je .LBB1_7
.LBB1_4: #
# =>This Loop Header: Depth=1
# Child Loop BB1_10 Depth 2
# r9 = sign-extended 32-bit trip count for element r12;
# r15d / r10d / r11d = 3*r12, 3*r12 + 1, 3*r12 + 2.
movq 8(%rsp), %rcx # 8-byte Reload
movslq (%rcx,%r12,4), %r9
leaq (%r12,%r12,2), %rcx
leal 1(%rcx), %r10d
leal 2(%rcx), %r11d
movl %ecx, %r15d
testq %r9, %r9
jle .LBB1_5
# %bb.9: #
# in Loop: Header=BB1_4 Depth=1
# Load the three input values of element r12 and zero the accumulators;
# edx = inner trip count, rcx = inner loop counter.
vmovsd (%rsi,%r15,8), %xmm15 # xmm15 = mem[0],zero
vmovsd (%rsi,%r10,8), %xmm4 # xmm4 = mem[0],zero
vmovsd (%rsi,%r11,8), %xmm1 # xmm1 = mem[0],zero
movl %r9d, %edx
vxorpd %xmm14, %xmm14, %xmm14
xorl %ecx, %ecx
vxorpd %xmm9, %xmm9, %xmm9
vxorpd %xmm13, %xmm13, %xmm13
jmp .LBB1_10
.p2align 4, 0x90
.LBB1_13: #
# in Loop: Header=BB1_10 Depth=2
incq %rcx
cmpq %rcx, %rdx
je .LBB1_6
.LBB1_10: #
# Parent Loop BB1_4 Depth=1
# => This Inner Loop Header: Depth=2
# r8 = sign-extended inner index read from the list, r14 = 3 * r8.
# xmm2 / xmm5 / xmm0 = the three differences, xmm6 = sum of their squares.
movslq (%rax,%rcx,4), %r8
leaq (%r8,%r8,2), %r14
vsubsd (%rsi,%r14,8), %xmm15, %xmm2
movslq %r14d, %rbp
vsubsd 8(%rsi,%rbp,8), %xmm4, %xmm5
vsubsd 16(%rsi,%rbp,8), %xmm1, %xmm0
vmulsd %xmm2, %xmm2, %xmm6
vfmadd231sd %xmm5, %xmm5, %xmm6 # xmm6 = (xmm5 * xmm5) + xmm6
vfmadd231sd %xmm0, %xmm0, %xmm6 # xmm6 = (xmm0 * xmm0) + xmm6
# Skip the update when xmm6 >= xmm12 (the squared threshold).
vucomisd %xmm12, %xmm6
jae .LBB1_13
# %bb.11: #
# in Loop: Header=BB1_10 Depth=2
# xmm6 = 1 / xmm6; xmm3 = xmm6^3 * (value from 56(%rdi));
# factor xmm3 = xmm11 * xmm6 * xmm3 * (xmm3 - 0.5).
# The per-component products stay in xmm6, xmm2 and xmm0 for the optional
# subtraction below.
vmovsd .LCPI1_1(%rip), %xmm3 # xmm3 = mem[0],zero
vdivsd %xmm6, %xmm3, %xmm6
vmulsd 32(%rsp), %xmm6, %xmm3 # 8-byte Folded Reload
vmulsd %xmm6, %xmm6, %xmm8
vmulsd %xmm3, %xmm8, %xmm3
vaddsd .LCPI1_2(%rip), %xmm3, %xmm7
vmulsd %xmm6, %xmm11, %xmm6
vmulsd %xmm3, %xmm6, %xmm3
vmulsd %xmm7, %xmm3, %xmm3
vmulsd %xmm2, %xmm3, %xmm6
vaddsd %xmm6, %xmm14, %xmm14
vmulsd %xmm5, %xmm3, %xmm2
vaddsd %xmm2, %xmm9, %xmm9
vmulsd %xmm0, %xmm3, %xmm0
vaddsd %xmm0, %xmm13, %xmm13
# Only inner indices below the count (r13d) also receive the opposite update.
cmpl %r13d, %r8d
jge .LBB1_13
# %bb.12: #
# in Loop: Header=BB1_10 Depth=2
# Subtract the same three contributions from the inner element's entries in
# the output array (element indices r14, rbp + 1, rbp + 2).
leaq 1(%rbp), %rbx
addq $2, %rbp
vmovsd (%rdi,%r14,8), %xmm3 # xmm3 = mem[0],zero
vsubsd %xmm6, %xmm3, %xmm3
vmovsd %xmm3, (%rdi,%r14,8)
vmovsd (%rdi,%rbx,8), %xmm3 # xmm3 = mem[0],zero
vsubsd %xmm2, %xmm3, %xmm2
vmovsd %xmm2, (%rdi,%rbx,8)
vmovsd (%rdi,%rbp,8), %xmm2 # xmm2 = mem[0],zero
vsubsd %xmm0, %xmm2, %xmm0
vmovsd %xmm0, (%rdi,%rbp,8)
jmp .LBB1_13
.LBB1_7: #
# Write the two updated counters back through the fourth argument.
movq 16(%rsp), %rax # 8-byte Reload
vmovdqu %xmm10, (%rax)
.LBB1_8: #
# Close the marker region, take the end timestamp and return end - start.
movl $.L.str.1, %edi
callq likwid_markerStopRegion
xorl %eax, %eax
callq getTimeStamp
vsubsd 24(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload
addq $40, %rsp
.cfi_def_cfa_offset 56
popq %rbx
.cfi_def_cfa_offset 48
popq %r12
.cfi_def_cfa_offset 40
popq %r13
.cfi_def_cfa_offset 32
popq %r14
.cfi_def_cfa_offset 24
popq %r15
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
retq
.Lfunc_end1:
.size computeForceLJHalfNeigh, .Lfunc_end1-computeForceLJHalfNeigh
.cfi_endproc
# -- End function
# computeForceLJFullNeigh_simd (x86-64, AT&T syntax, compiler-generated)
# Stub for builds without a SIMD kernel: it optionally clears one array,
# takes a timestamp, opens the marker region .L.str, writes an error message
# to stderr and calls exit(-1). There is no ret instruction and the marker
# region is never closed here; control leaves through exit.
# Only the second argument (%rsi) is read: a 32-bit count at offset 4 and a
# pointer at offset 64.
.globl computeForceLJFullNeigh_simd # -- Begin function computeForceLJFullNeigh_simd
.p2align 4, 0x90
.type computeForceLJFullNeigh_simd,@function
computeForceLJFullNeigh_simd: #
.LcomputeForceLJFullNeigh_simd$local:
.cfi_startproc
# %bb.0: #
# The pushed value is never popped; the push only moves %rsp by 8 bytes
# before the calls below.
pushq %rax
.cfi_def_cfa_offset 16
# eax = count; skip the memset when count <= 0.
movl 4(%rsi), %eax
testl %eax, %eax
jle .LBB2_2
# %bb.1: #
# _intel_fast_memset(64(%rsi), 0, count * 24): rdx = (count * 8) * 3.
movq 64(%rsi), %rdi
shlq $3, %rax
leaq (%rax,%rax,2), %rdx
xorl %esi, %esi
callq _intel_fast_memset
.LBB2_2: #
# The timestamp result is not used afterwards.
xorl %eax, %eax
callq getTimeStamp
movl $.L.str, %edi
callq likwid_markerStartRegion
# fwrite(.L.str.2, 65, 1, stderr), then exit(-1).
movq stderr(%rip), %rcx
movl $.L.str.2, %edi
movl $65, %esi
movl $1, %edx
callq fwrite
movl $-1, %edi
callq exit
.Lfunc_end2:
.size computeForceLJFullNeigh_simd, .Lfunc_end2-computeForceLJFullNeigh_simd
.cfi_endproc
# -- End function
.type .L.str,@object #
.section .rodata.str1.1,"aMS",@progbits,1
.L.str:
.asciz "force"
.size .L.str, 6
.type .L.str.1,@object #
.L.str.1:
.asciz "forceLJ-halfneigh"
.size .L.str.1, 18
.type .L.str.2,@object #
.L.str.2:
.asciz "Error: SIMD kernel not implemented for specified instruction set!"
.size .L.str.2, 66
.ident "Intel(R) oneAPI DPC++ Compiler 2021.1-beta05 (2020.2.0.0304)"
.section ".note.GNU-stack","",@progbits

View File

@@ -15,7 +15,7 @@ ISA="${BIN_INFO##*-}"
CORE="${CORE:-0}"
FREQ="${FREQ:-2.4}"
NRUNS="${NRUNS:-3}"
LOG="${LOG:-latencies_and_cfds.log}"
LOG="${LOG:-latencies_and_cfds.$(hostname).log}"
STUB_ONLY="${STUB_ONLY:-false}"
SKIP_SET_FREQ="${SKIP_SET_FREQ:-false}"
@@ -37,10 +37,14 @@ CPU_VENDOR=$(lscpu | grep "Vendor ID" | tr -s ' ' | cut -d ' ' -f3)
if [ "$CPU_VENDOR" == "GenuineIntel" ]; then
ALL_PREFETCHERS="HW_PREFETCHER,CL_PREFETCHER,DCU_PREFETCHER,IP_PREFETCHER"
PREFETCHERS=("ALL HW_PREFETCHER CL_PREFETCHER DCU_PREFETCHER IP_PREFETCHER NONE")
DEFAULT_PREFETCHERS=("ALL HW_PREFETCHER CL_PREFETCHER DCU_PREFETCHER IP_PREFETCHER NONE")
else
ALL_PREFETCHERS=""
PREFETCHERS=("IGNORE")
DEFAULT_PREFETCHERS=("IGNORE")
fi
if [ -z ${PREFETCHERS+x} ]; then
PREFETCHERS=${DEFAULT_PREFETCHERS}
fi
if [ "$OPT_SCHEME" == "gromacs" ]; then

52
util/gather-bench/.gitignore vendored Normal file
View File

@@ -0,0 +1,52 @@
# Prerequisites
*.d
# Object files
*.o
*.ko
*.obj
*.elf
# Linker output
*.ilk
*.map
*.exp
# Precompiled Headers
*.gch
*.pch
# Libraries
*.lib
*.a
*.la
*.lo
# Shared objects (inc. Windows DLLs)
*.dll
*.so
*.so.*
*.dylib
# Executables
*.exe
*.out
*.app
*.i*86
*.x86_64
*.hex
# Debug files
*.dSYM/
*.su
*.idb
*.pdb
# Kernel Module Compile Results
*.mod*
*.cmd
.tmp_versions/
modules.order
Module.symvers
Mkfile.old
dkms.conf

21
util/gather-bench/LICENSE Normal file
View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2021 RRZE-HPC
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

126
util/gather-bench/Makefile Normal file
View File

@@ -0,0 +1,126 @@
#CONFIGURE BUILD SYSTEM
# TAG and ISA are provided by config.mk (included below); all variables here
# are recursively expanded, so they may reference TAG/ISA before the include.
TARGET	   = gather-bench-$(TAG)
BUILD_DIR  = ./$(TAG)
SRC_DIR    = ./src
MAKE_DIR   = ./
ISA_DIR    = ./src/$(ISA)
# Set Q to empty on the command line (make Q=) to echo the executed commands.
Q         ?= @

#DO NOT EDIT BELOW
include $(MAKE_DIR)/config.mk
include $(MAKE_DIR)/include_$(TAG).mk
include $(MAKE_DIR)/include_LIKWID.mk
INCLUDES  += -I./src/includes

VPATH     = $(SRC_DIR) ${ISA_DIR}
ASM       = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
ASM      += $(patsubst $(SRC_DIR)/%.f90, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.f90))
# main*.c files are excluded from OBJ; they are compiled directly in the link step.
OBJ       = $(filter-out $(BUILD_DIR)/main%, $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c)))
OBJ      += $(patsubst $(SRC_DIR)/%.cc, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cc))
OBJ      += $(patsubst $(SRC_DIR)/%.cpp, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cpp))
OBJ      += $(patsubst $(SRC_DIR)/%.f90, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.f90))
OBJ      += $(patsubst $(SRC_DIR)/%.F90, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.F90))
OBJ      += $(patsubst $(SRC_DIR)/%.s, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.s))
OBJ      += $(patsubst $(ISA_DIR)/%.S, $(BUILD_DIR)/%.o,$(wildcard $(ISA_DIR)/*.S))
CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(INCLUDES) -DISA_$(ISA)

# With VARIANT set, the default goal becomes the main-$(VARIANT).c binary.
ifneq ($(VARIANT),)
    .DEFAULT_GOAL := ${TARGET}-$(VARIANT)
endif

ifeq ($(strip $(DATA_LAYOUT)),AOS)
CPPFLAGS += -DAOS
endif
ifeq ($(strip $(TEST)),true)
CPPFLAGS += -DTEST
endif
ifeq ($(strip $(PADDING)),true)
CPPFLAGS += -DPADDING
endif
ifeq ($(strip $(MEASURE_GATHER_CYCLES)),true)
CPPFLAGS += -DMEASURE_GATHER_CYCLES
endif
ifeq ($(strip $(ONLY_FIRST_DIMENSION)),true)
CPPFLAGS += -DONLY_FIRST_DIMENSION
endif
ifeq ($(strip $(MEM_TRACER)),true)
CPPFLAGS += -DMEM_TRACER
endif

# The build directory is an order-only prerequisite: it must exist before
# anything is written into it, but its timestamp (which changes whenever a
# file is added) must not force a relink.
${TARGET}: $(OBJ) $(SRC_DIR)/main.c | $(BUILD_DIR)
	@echo "===>  LINKING  $(TARGET)"
	$(Q)${LINKER} ${CPPFLAGS} ${LFLAGS} -o $(TARGET) $(SRC_DIR)/main.c $(OBJ) $(LIBS)

${TARGET}-%: $(OBJ) $(SRC_DIR)/main-%.c | $(BUILD_DIR)
	@echo "===>  LINKING  $(TARGET)-$* "
	$(Q)${LINKER} ${CPPFLAGS} ${LFLAGS} -o $(TARGET)-$* $(SRC_DIR)/main-$*.c $(OBJ) $(LIBS)

asm: $(ASM) | $(BUILD_DIR)

# Every generated object/assembly file needs the build directory first; without
# this a parallel build (make -j) can start compiling before mkdir has run.
$(OBJ) $(ASM): | $(BUILD_DIR)

$(BUILD_DIR)/%.o:  %.c
	@echo "===>  COMPILE  $@"
	$(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
	$(Q)$(CC) $(CPPFLAGS) -MT $(@:.d=.o) -MM  $< > $(BUILD_DIR)/$*.d

$(BUILD_DIR)/%.s:  %.c
	@echo "===>  GENERATE ASM  $@"
	$(Q)$(CC) -S $(CPPFLAGS) $(CFLAGS) $< -o $@

$(BUILD_DIR)/%.s:  %.f90
	@echo "===>  COMPILE  $@"
	$(Q)$(FC) -S  $(FCFLAGS) $< -o $@

$(BUILD_DIR)/%.o:  %.cc
	@echo "===>  COMPILE  $@"
	$(Q)$(CXX) -c $(CPPFLAGS) $(CXXFLAGS) $< -o $@
	$(Q)$(CXX) $(CPPFLAGS) -MT $(@:.d=.o) -MM  $< > $(BUILD_DIR)/$*.d

$(BUILD_DIR)/%.o:  %.cpp
	@echo "===>  COMPILE  $@"
	$(Q)$(CXX) -c $(CPPFLAGS) $(CXXFLAGS) $< -o $@
	$(Q)$(CXX) $(CPPFLAGS) -MT $(@:.d=.o) -MM  $< > $(BUILD_DIR)/$*.d

$(BUILD_DIR)/%.o:  %.f90
	@echo "===>  COMPILE  $@"
	$(Q)$(FC) -c  $(FCFLAGS) $< -o $@

$(BUILD_DIR)/%.o:  %.F90
	@echo "===>  COMPILE  $@"
	$(Q)$(FC) -c  $(CPPFLAGS)  $(FCFLAGS) $< -o $@

$(BUILD_DIR)/%.o:  %.s
	@echo "===>  ASSEMBLE  $@"
	$(Q)$(AS)  $(ASFLAGS) $< -o $@

$(BUILD_DIR)/%.o:  %.S
	@echo "===>  ASSEMBLE  $@"
	$(Q)$(CC) -c $(CPPFLAGS) $< -o $@

tags:
	@echo "===>  GENERATE  TAGS"
	$(Q)ctags -R

$(BUILD_DIR):
	@mkdir -p $(BUILD_DIR)

# Skip the generated dependency files for any goal containing "clean"
# (clean, distclean); they are about to be deleted anyway.
ifeq ($(findstring clean,$(MAKECMDGOALS)),)
-include $(OBJ:.o=.d)
endif

# asm and tags are phony so they are always re-evaluated, even if a file or
# directory with that name exists.
.PHONY: asm tags clean distclean

clean:
	@echo "===>  CLEAN"
	@rm -rf $(BUILD_DIR)
	@rm -f tags

distclean: clean
	@echo "===>  DIST CLEAN"
	@rm -f $(TARGET)
	@rm -f tags

View File

@@ -0,0 +1,2 @@
# gather-bench
An x86 gather instruction performance benchmark

View File

@@ -0,0 +1,22 @@
# Supported: GCC, CLANG, ICC
TAG ?= ICC
# Supported: avx2, avx512
ISA ?= avx512
# Use likwid?
ENABLE_LIKWID ?= false
# SP or DP
DATA_TYPE ?= DP
# AOS or SOA
DATA_LAYOUT ?= AOS
# Padding byte for AoS
PADDING ?= false
# Measure cycles for each gather separately
MEASURE_GATHER_CYCLES ?= false
# Gather data only for first dimension (one gather per iteration)
ONLY_FIRST_DIMENSION ?= false
# Trace memory addresses for cache simulator
MEM_TRACER ?= false
# Test correctness of gather kernels
TEST ?= false

View File

@@ -0,0 +1,9 @@
# Toolchain settings for building with clang.
# NOTE(review): presumably this is include_CLANG.mk, selected by TAG=CLANG — confirm.
CC = clang
LINKER = $(CC)
# The OpenMP flag is commented out, so $(OPENMP) expands to nothing here.
OPENMP =# -fopenmp
CFLAGS = -Ofast -std=c11 -march=core-avx2 -mavx -mfma $(OPENMP)
LFLAGS = $(OPENMP) -march=core-avx2 -mavx -mfma
DEFINES = -D_GNU_SOURCE
INCLUDES =
LIBS =

View File

@@ -0,0 +1,11 @@
# Toolchain settings for building with gcc (OpenMP enabled via -fopenmp).
# NOTE(review): presumably this is include_GCC.mk, selected by TAG=GCC — confirm.
CC = gcc
AS = as
LINKER = $(CC)
OPENMP = -fopenmp
CFLAGS = -Ofast -std=c11 -mavx2 -mfma $(OPENMP)
ASFLAGS =
LFLAGS = $(OPENMP) -mavx2 -mfma
DEFINES = -D_GNU_SOURCE
INCLUDES =
LIBS =

View File

@@ -0,0 +1,9 @@
# Toolchain settings for building with icc (OpenMP enabled via -qopenmp).
# NOTE(review): presumably this is include_ICC.mk, selected by TAG=ICC — confirm.
CC = icc
LINKER = $(CC)
OPENMP = -qopenmp
CFLAGS = -Ofast -xhost -std=c11 $(OPENMP)
LFLAGS = $(OPENMP)
DEFINES = -D_GNU_SOURCE
INCLUDES =
LIBS =

View File

@@ -0,0 +1,10 @@
# LIKWID locations; each can be overridden from the environment or the make
# command line because they are assigned with ?=.
LIKWID_INC ?= -I/usr/local/include
LIKWID_DEFINES ?= -DLIKWID_PERFMON
LIKWID_LIB ?= -L/usr/local/lib
# Only when ENABLE_LIKWID is true are the include path, define, library and
# library search path appended to the build flags.
ifeq ($(strip $(ENABLE_LIKWID)),true)
INCLUDES += ${LIKWID_INC}
DEFINES += ${LIKWID_DEFINES}
LIBS += -llikwid
LFLAGS += ${LIKWID_LIB}
endif

View File

@@ -0,0 +1,57 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* =======================================================================================
*/
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
/*
 * Allocate bytesize bytes whose address is a multiple of alignment.
 *
 * Thin wrapper around posix_memalign() that never returns on failure:
 * any error is reported on stderr and the process exits with EXIT_FAILURE.
 *
 * alignment  requested alignment in bytes; posix_memalign() requires a power
 *            of two that is also a multiple of sizeof(void*)
 * bytesize   number of bytes to allocate
 *
 * Returns a non-NULL pointer that the caller releases with free().
 */
void* allocate (int alignment, size_t bytesize)
{
    /* Start from NULL: posix_memalign() leaves ptr unspecified on failure. */
    void* ptr = NULL;
    int errorCode;

    errorCode = posix_memalign(&ptr, (size_t) alignment, bytesize);

    if (errorCode == EINVAL) {
        fprintf(stderr,
                "Error: Alignment parameter is not a power of two multiple of sizeof(void*)\n");
        exit(EXIT_FAILURE);
    }

    if (errorCode == ENOMEM) {
        fprintf(stderr,
                "Error: Insufficient memory to fulfill the request\n");
        exit(EXIT_FAILURE);
    }

    /* Any other error code, or a NULL result, is treated as a failure too. */
    if (errorCode != 0 || ptr == NULL) {
        fprintf(stderr, "Error: posix_memalign failed!\n");
        exit(EXIT_FAILURE);
    }

    return ptr;
}

View File

@@ -0,0 +1,63 @@
.intel_syntax noprefix
.data
.align 64
# Eight doubles of 1.0; not referenced by the code in this file.
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
# Kernel gather (AVX2): gathers 16 doubles per loop iteration.
# Register arguments as used below:
# rdi -> a      (base address of the gathered doubles)
# rsi -> idx    (32-bit indices into a)
# rdx -> N      (loop bound, compared against the running index)
# rcx -> t      (output buffer, written only when TEST is defined)
# NOTE(review): the loop body runs before the bound is checked and always
# handles 16 indices, so N is presumably a positive multiple of 16 - confirm
# against the caller.
.text
.globl gather
.type gather, @function
gather :
# Prologue: set up a frame and save rbx and r12-r15 (none of them is
# modified by the code below).
push rbp
mov rbp, rsp
push rbx
push r12
push r13
push r14
push r15
# Running element index.
xor rax, rax
# All-ones template for the gather masks.
vpcmpeqd ymm0, ymm0, ymm0
.align 16
1:
# Load 16 consecutive indices, four per xmm register.
vmovups xmm1, [rsi + rax * 4]
vmovups xmm2, [rsi + rax * 4 + 16]
vmovups xmm3, [rsi + rax * 4 + 32]
vmovups xmm4, [rsi + rax * 4 + 48]
# Reload the masks every iteration: vgatherdpd clears its mask operand.
vmovdqa ymm5, ymm0
vmovdqa ymm6, ymm0
vmovdqa ymm7, ymm0
vmovdqa ymm8, ymm0
# Clear the gather destinations.
vxorpd ymm9, ymm9, ymm9
vxorpd ymm10, ymm10, ymm10
vxorpd ymm11, ymm11, ymm11
vxorpd ymm12, ymm12, ymm12
# Four gathers of four doubles each; the index is scaled by 8 bytes.
vgatherdpd ymm9, [rdi + xmm1 * 8], ymm5
vgatherdpd ymm10, [rdi + xmm2 * 8], ymm6
vgatherdpd ymm11, [rdi + xmm3 * 8], ymm7
vgatherdpd ymm12, [rdi + xmm4 * 8], ymm8
#ifdef TEST
# Store the 16 gathered doubles at t + rax * 8. vmovapd faults on an address
# that is not 32-byte aligned, so t has to be aligned accordingly.
vmovapd [rcx + rax * 8], ymm9
vmovapd [rcx + rax * 8 + 32], ymm10
vmovapd [rcx + rax * 8 + 64], ymm11
vmovapd [rcx + rax * 8 + 96], ymm12
#endif
# Advance by 16 elements and repeat while rax < N (signed compare).
addq rax, 16
cmpq rax, rdx
jl 1b
# Epilogue: restore the saved registers in reverse order.
pop r15
pop r14
pop r13
pop r12
pop rbx
mov rsp, rbp
pop rbp
ret
.size gather, .-gather

View File

@@ -0,0 +1,71 @@
.intel_syntax noprefix
.data
.align 64
# Eight doubles of 1.0; not referenced by the code in this file.
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
# Kernel gather_aos (AVX2): for four indices per iteration, gathers the three
# consecutive doubles found at byte offsets 0, 8 and 16 of each indexed element.
# Register arguments as used below:
# rdi -> a      (base address of the element array)
# rsi -> idx    (32-bit element indices)
# rdx -> N      (loop bound; also the distance in doubles between the three
#                output planes written when TEST is defined)
# rcx -> t      (output buffer, written only when TEST is defined)
.text
.globl gather_aos
.type gather_aos, @function
gather_aos :
# Prologue: set up a frame and save rbx and r9-r15. Only rbx and r9 are
# modified below, and only when TEST is defined.
push rbp
mov rbp, rsp
push rbx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Running element index.
xor rax, rax
# All-ones template for the gather masks.
vpcmpeqd ymm8, ymm8, ymm8
.align 16
1:
# Load four indices and scale them to the element stride in doubles:
# xmm4 = 2 * idx, then xmm3 = 4 * idx with PADDING or 3 * idx without.
vmovups xmm3, XMMWORD PTR [rsi + rax * 4]
vpaddd xmm4, xmm3, xmm3
#ifdef PADDING
vpaddd xmm3, xmm4, xmm4
#else
vpaddd xmm3, xmm3, xmm4
#endif
# Reload the masks every iteration: vgatherdpd clears its mask operand.
vmovdqa ymm5, ymm8
vmovdqa ymm6, ymm8
vmovdqa ymm7, ymm8
# Clear the gather destinations.
vxorpd ymm0, ymm0, ymm0
vxorpd ymm1, ymm1, ymm1
vxorpd ymm2, ymm2, ymm2
# One gather per component: first, second and third double of each element.
vgatherdpd ymm0, [ rdi + xmm3 * 8], ymm5
vgatherdpd ymm1, [8 + rdi + xmm3 * 8], ymm6
vgatherdpd ymm2, [16 + rdi + xmm3 * 8], ymm7
#ifdef TEST
# Store the components into three planes of N doubles each:
# t[rax], t[N + rax] and t[2 * N + rax].
vmovupd [rcx + rax * 8], ymm0
lea rbx, [rcx + rdx * 8]
vmovupd [rbx + rax * 8], ymm1
lea r9, [rbx + rdx * 8]
vmovupd [r9 + rax * 8], ymm2
#endif
# Advance by four elements and repeat while rax < N (signed compare).
addq rax, 4
cmpq rax, rdx
jl 1b
# Epilogue: restore the saved registers in reverse order.
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rbx
mov rsp, rbp
pop rbp
ret
.size gather_aos, .-gather_aos

View File

@@ -0,0 +1,67 @@
.intel_syntax noprefix
.data
.align 64
# Eight doubles of 1.0; not referenced by the code in this file.
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
# Kernel gather_soa (AVX2): for four indices per iteration, gathers one double
# from each of three arrays located at a, a + N doubles and a + 2 * N doubles.
# Register arguments as used below:
# rdi -> a      (base address of the first array)
# rsi -> idx    (32-bit indices, shared by all three arrays)
# rdx -> N      (loop bound and distance in doubles between the arrays)
# rcx -> t      (output buffer, written only when TEST is defined)
.text
.globl gather_soa
.type gather_soa, @function
gather_soa :
# Prologue: set up a frame and save rbx and r9-r15. Of these, r9 is modified
# below, and rbx and r10 only when TEST is defined.
push rbp
mov rbp, rsp
push rbx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Running element index.
xor rax, rax
# All-ones template for the gather masks.
vpcmpeqd ymm8, ymm8, ymm8
# Base addresses of the second (r8) and third (r9) array.
lea r8, [rdi + rdx * 8]
lea r9, [r8 + rdx * 8]
.align 16
1:
# Load four indices.
vmovups xmm3, XMMWORD PTR [rsi + rax * 4]
# Reload the masks every iteration: vgatherdpd clears its mask operand.
vmovdqa ymm5, ymm8
vmovdqa ymm6, ymm8
vmovdqa ymm7, ymm8
# Clear the gather destinations.
vxorpd ymm0, ymm0, ymm0
vxorpd ymm1, ymm1, ymm1
vxorpd ymm2, ymm2, ymm2
# Same indices, three different base addresses.
vgatherdpd ymm0, [rdi + xmm3 * 8], ymm5
vgatherdpd ymm1, [r8 + xmm3 * 8], ymm6
vgatherdpd ymm2, [r9 + xmm3 * 8], ymm7
#ifdef TEST
# Store the results into three planes of N doubles each:
# t[rax], t[N + rax] and t[2 * N + rax].
vmovupd [rcx + rax * 8], ymm0
lea rbx, [rcx + rdx * 8]
vmovupd [rbx + rax * 8], ymm1
lea r10, [rbx + rdx * 8]
vmovupd [r10 + rax * 8], ymm2
#endif
# Advance by four elements and repeat while rax < N (signed compare).
addq rax, 4
cmpq rax, rdx
jl 1b
# Epilogue: restore the saved registers in reverse order.
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rbx
mov rsp, rbp
pop rbp
ret
.size gather_soa, .-gather_soa

View File

@@ -0,0 +1,62 @@
.intel_syntax noprefix
.data
.align 64
# Eight doubles of 1.0; not referenced by the code in this file.
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
# Kernel gather (AVX-512): gathers 32 doubles per loop iteration.
# Register arguments as used below:
# rdi -> a      (base address of the gathered doubles)
# rsi -> idx    (32-bit indices into a)
# rdx -> N      (loop bound, compared against the running index)
# rcx -> t      (output buffer, written only when TEST is defined)
# NOTE(review): the loop body runs before the bound is checked and always
# handles 32 indices, so N is presumably a positive multiple of 32 - confirm
# against the caller.
.text
.globl gather
.type gather, @function
gather :
# Prologue: set up a frame and save rbx and r12-r15 (none of them is
# modified by the code below).
push rbp
mov rbp, rsp
push rbx
push r12
push r13
push r14
push r15
# Running element index.
xor rax, rax
.align 16
1:
# Comparing a register with itself sets every mask bit. The masks are rebuilt
# every iteration because a gather clears its mask register.
vpcmpeqb k1, xmm0, xmm0
vpcmpeqb k2, xmm0, xmm0
vpcmpeqb k3, xmm0, xmm0
vpcmpeqb k4, xmm0, xmm0
# Load 32 consecutive indices, eight per ymm register.
vmovdqu ymm0, [rsi + rax * 4]
vmovdqu ymm1, [rsi + rax * 4 + 32]
vmovdqu ymm2, [rsi + rax * 4 + 64]
vmovdqu ymm3, [rsi + rax * 4 + 96]
# Clear the gather destinations.
vpxord zmm4, zmm4, zmm4
vpxord zmm5, zmm5, zmm5
vpxord zmm6, zmm6, zmm6
vpxord zmm7, zmm7, zmm7
# Four gathers of eight doubles each; the index is scaled by 8 bytes.
vgatherdpd zmm4{k1}, [rdi + ymm0 * 8]
vgatherdpd zmm5{k2}, [rdi + ymm1 * 8]
vgatherdpd zmm6{k3}, [rdi + ymm2 * 8]
vgatherdpd zmm7{k4}, [rdi + ymm3 * 8]
#ifdef TEST
# Store the 32 gathered doubles at t + rax * 8. vmovapd faults on an address
# that is not 64-byte aligned, so t has to be aligned accordingly.
vmovapd [rcx + rax * 8], zmm4
vmovapd [rcx + rax * 8 + 64], zmm5
vmovapd [rcx + rax * 8 + 128], zmm6
vmovapd [rcx + rax * 8 + 192], zmm7
#endif
# Advance by 32 elements and repeat while rax < N (signed compare).
addq rax, 32
cmpq rax, rdx
jl 1b
# Epilogue: restore the saved registers in reverse order.
pop r15
pop r14
pop r13
pop r12
pop rbx
mov rsp, rbp
pop rbp
ret
.size gather, .-gather

View File

@@ -0,0 +1,151 @@
.intel_syntax noprefix
.data
.align 64
# Eight doubles of 1.0; not referenced by the code in this file.
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
# Kernel gather_aos (AVX-512): for eight indices per iteration, gathers the
# consecutive doubles found at byte offsets 0, 8 and 16 of each indexed element
# (only offset 0 when ONLY_FIRST_DIMENSION is defined).
# Register arguments as used below:
# rdi -> a      (base address of the element array)
# rsi -> idx    (32-bit element indices)
# rdx -> N      (loop bound; also the distance in doubles between the three
#                output planes written when TEST is defined)
# rcx -> t      (output buffer, written only when TEST is defined)
# r8 -> cycles  (per-gather cycle counts, written only when
#                MEASURE_GATHER_CYCLES is defined)
.text
.globl gather_aos
.type gather_aos, @function
gather_aos :
# Prologue: set up a frame and save rbx and r9-r15. The code below modifies
# rbx, r9, r10 and r11 at most, depending on the build options.
push rbp
mov rbp, rsp
push rbx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Running element index.
xor rax, rax
.align 16
1:
# Load eight indices and scale them to the element stride in doubles:
# ymm4 = 2 * idx, then ymm3 = 4 * idx with PADDING or 3 * idx without.
vmovdqu ymm3, YMMWORD PTR [rsi + rax * 4]
vpaddd ymm4, ymm3, ymm3
#ifdef PADDING
vpaddd ymm3, ymm4, ymm4
#else
vpaddd ymm3, ymm3, ymm4
#endif
# Prefetching instructions
#mov ebx, DWORD PTR[rsi + rax*4]
#mov r9d, DWORD PTR[4 + rsi + rax*4]
#mov r10d, DWORD PTR[8 + rsi + rax*4]
#mov r11d, DWORD PTR[12 + rsi + rax*4]
#mov r12d, DWORD PTR[16 + rsi + rax*4]
#mov r13d, DWORD PTR[20 + rsi + rax*4]
#mov r14d, DWORD PTR[24 + rsi + rax*4]
#mov r15d, DWORD PTR[28 + rsi + rax*4]
#lea ebx, DWORD PTR[rbx]
#lea r9d, DWORD PTR[r9]
#lea r10d, DWORD PTR[r10]
#lea r11d, DWORD PTR[r11]
#lea r12d, DWORD PTR[r12]
#lea r13d, DWORD PTR[r13]
#lea r14d, DWORD PTR[r14]
#lea r15d, DWORD PTR[r15]
# Comparing a register with itself sets every mask bit. The masks are rebuilt
# every iteration because a gather clears its mask register.
vpcmpeqb k1, xmm5, xmm5
#ifndef ONLY_FIRST_DIMENSION
vpcmpeqb k2, xmm5, xmm5
vpcmpeqb k3, xmm5, xmm5
#endif
# Clear the gather destinations.
vpxord zmm0, zmm0, zmm0
#ifndef ONLY_FIRST_DIMENSION
vpxord zmm1, zmm1, zmm1
vpxord zmm2, zmm2, zmm2
#endif
#ifdef MEASURE_GATHER_CYCLES
# Timed variant. rdtsc overwrites eax and edx, so the loop index and N are
# parked in r9 and r10 and restored once the measurements are done.
mov r9, rax
mov r10, rdx
# Byte offset into cycles: r11 = 3 * rax. As rax grows by 8 per iteration this
# yields 24 bytes, that is three 8-byte slots, per iteration.
xor r11, r11
add r11, rax
add r11, rax
add r11, rax
#shr r11, 3
# Measurement pattern, repeated per gather: lfence + rdtsc before, lfence +
# rdtsc after, then the difference of the low 32 TSC bits is stored with a
# non-temporal 8-byte store (writing eax zero-extends into rax).
xor rbx, rbx
lfence
rdtsc
add ebx, eax
vgatherdpd zmm0{k1}, [rdi + ymm3 * 8]
lfence
rdtsc
sub eax, ebx
#movdiri [r8 + r11], rax
movnti [r8 + r11], rax
#ifndef ONLY_FIRST_DIMENSION
xor rbx, rbx
lfence
rdtsc
add ebx, eax
vgatherdpd zmm1{k2}, [8 + rdi + ymm3 * 8]
lfence
rdtsc
sub eax, ebx
#movdiri [8 + r8 + r11], rax
movnti [8 + r8 + r11], rax
xor rbx, rbx
lfence
rdtsc
add ebx, eax
vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8]
lfence
rdtsc
sub eax, ebx
#movdiri [16 + r8 + r11], rax
movnti [16 + r8 + r11], rax
#endif // ONLY_FIRST_DIMENSION
# Bring back the loop index and N.
mov rax, r9
mov rdx, r10
#else // MEASURE_GATHER_CYCLES
# Untimed variant: one gather per component.
vgatherdpd zmm0{k1}, [ rdi + ymm3 * 8]
#ifndef ONLY_FIRST_DIMENSION
vgatherdpd zmm1{k2}, [8 + rdi + ymm3 * 8]
vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8]
#endif
#endif // MEASURE_GATHER_CYCLES
#ifdef TEST
# Store the components into three planes of N doubles each:
# t[rax], t[N + rax] and t[2 * N + rax].
# NOTE(review): with ONLY_FIRST_DIMENSION defined, zmm1 and zmm2 are neither
# cleared nor gathered in this function, yet they are still stored here.
vmovupd [rcx + rax * 8], zmm0
lea rbx, [rcx + rdx * 8]
vmovupd [rbx + rax * 8], zmm1
lea r9, [rbx + rdx * 8]
vmovupd [r9 + rax * 8], zmm2
#endif
# Advance by eight elements and repeat while rax < N (signed compare).
addq rax, 8
cmpq rax, rdx
jl 1b
# Epilogue: restore the saved registers in reverse order.
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rbx
mov rsp, rbp
pop rbp
ret
.size gather_aos, .-gather_aos

View File

@@ -0,0 +1,147 @@
.intel_syntax noprefix
.data
.align 64
# Eight doubles of 1.0; not referenced by the code in this file.
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
.section .rodata, "a"
.align 64
.align 64
# Lane numbers 0..7. Compared against the remaining index count to build the
# lane mask of the final, partial group.
.ymm_reg_mask.1:
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
.type .ymm_reg_mask.1,@object
.size .ymm_reg_mask.1,32
.align 8
# Kernel gather_md_aos (AVX-512): gathers the consecutive doubles found at byte
# offsets 0, 8 and 16 of each indexed element (only offset 0 when
# ONLY_FIRST_DIMENSION is defined). Full groups of eight indices are gathered
# unmasked; a final group of one to seven indices is gathered under a lane mask.
# Register arguments as used below:
# rdi -> a
# rsi -> neighbors      (32-bit element indices)
# rdx -> numneighs[i]   (number of indices to process)
# rcx -> &t[t_idx]      (output buffer, written only when TEST is defined)
# r8 -> ntest           (distance in doubles between the three output planes)
# On return rax holds the number of indices that were processed.
.text
.globl gather_md_aos
.type gather_md_aos, @function
gather_md_aos :
# Prologue: set up a frame and save rbx and r10-r15. The code below modifies
# r15 always, and rbx and r10 when TEST is defined.
push rbp
mov rbp, rsp
push rbx
push r10
push r11
push r12
push r13
push r14
push r15
# Lane numbers for the remainder mask.
vmovdqu ymm7, YMMWORD PTR .ymm_reg_mask.1[rip]
# Remaining index count (r15) and number of indices consumed so far (rax).
mov r15, rdx
xor rax, rax
# Fewer than eight indices in total: skip the unmasked loop. Without this
# check the loop body would run once and gather eight indices regardless,
# reading past the end of the neighbor list.
cmpq r15, 8
jl .remainder
.align 16
1:
# Load eight indices and scale them to the element stride in doubles:
# ymm4 = 2 * idx, then ymm3 = 4 * idx with PADDING or 3 * idx without.
vmovdqu ymm3, YMMWORD PTR [rsi + rax * 4]
vpaddd ymm4, ymm3, ymm3
#ifdef PADDING
vpaddd ymm3, ymm4, ymm4
#else
vpaddd ymm3, ymm3, ymm4
#endif
# Prefetching instructions
#mov ebx, DWORD PTR[rsi + rax*4]
#mov r9d, DWORD PTR[4 + rsi + rax*4]
#mov r10d, DWORD PTR[8 + rsi + rax*4]
#mov r11d, DWORD PTR[12 + rsi + rax*4]
#mov r12d, DWORD PTR[16 + rsi + rax*4]
#mov r13d, DWORD PTR[20 + rsi + rax*4]
#mov r14d, DWORD PTR[24 + rsi + rax*4]
#mov r15d, DWORD PTR[28 + rsi + rax*4]
#lea ebx, DWORD PTR[rbx]
#lea r9d, DWORD PTR[r9]
#lea r10d, DWORD PTR[r10]
#lea r11d, DWORD PTR[r11]
#lea r12d, DWORD PTR[r12]
#lea r13d, DWORD PTR[r13]
#lea r14d, DWORD PTR[r14]
#lea r15d, DWORD PTR[r15]
# Comparing a register with itself sets every mask bit. The masks are rebuilt
# every iteration because a gather clears its mask register.
vpcmpeqb k1, xmm5, xmm5
#ifndef ONLY_FIRST_DIMENSION
vpcmpeqb k2, xmm5, xmm5
vpcmpeqb k3, xmm5, xmm5
#endif
# Clear the gather destinations.
vpxord zmm0, zmm0, zmm0
#ifndef ONLY_FIRST_DIMENSION
vpxord zmm1, zmm1, zmm1
vpxord zmm2, zmm2, zmm2
#endif
# One gather per component.
vgatherdpd zmm0{k1}, [ rdi + ymm3 * 8]
#ifndef ONLY_FIRST_DIMENSION
vgatherdpd zmm1{k2}, [8 + rdi + ymm3 * 8]
vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8]
#endif
#ifdef TEST
# Store the components into three planes that are ntest doubles apart.
vmovupd [rcx + rax * 8], zmm0
lea rbx, [rcx + r8 * 8]
vmovupd [rbx + rax * 8], zmm1
lea r10, [rbx + r8 * 8]
vmovupd [r10 + rax * 8], zmm2
#endif
# TODO: see if this logic can be optimized
# Account for the eight indices just handled; loop while a full group remains.
addq rax, 8
subq r15, 8
cmpq r15, 8
jge 1b
.remainder:
# Nothing left: done.
cmpq r15, 0
jle .end_func
# Remainder: lane i is enabled when the remaining count is greater than i.
vpbroadcastd ymm6, r15d
vpcmpgtd k1, ymm6, ymm7
# Masked index load; disabled lanes are zeroed and never dereferenced.
vmovdqu32 ymm3{k1}{z}, YMMWORD PTR [rsi + rax * 4]
vpaddd ymm4, ymm3, ymm3
#ifdef PADDING
vpaddd ymm3, ymm4, ymm4
#else
vpaddd ymm3, ymm3, ymm4
#endif
# Clear zmm0 so that lanes outside the mask hold zero instead of stale data
# (the previous operands zmm1, zmm2 produced the XOR of the last gathers).
vpxord zmm0, zmm0, zmm0
#ifndef ONLY_FIRST_DIMENSION
# Every gather consumes its own copy of the mask.
kmovw k2, k1
kmovw k3, k1
vpxord zmm1, zmm1, zmm1
vpxord zmm2, zmm2, zmm2
#endif
vgatherdpd zmm0{k1}, [ rdi + ymm3 * 8]
#ifndef ONLY_FIRST_DIMENSION
vgatherdpd zmm1{k2}, [8 + rdi + ymm3 * 8]
vgatherdpd zmm2{k3}, [16 + rdi + ymm3 * 8]
#endif
#ifdef TEST
# Full-width stores: all eight lanes are written, lanes outside the mask as zero.
vmovupd [rcx + rax * 8], zmm0
lea rbx, [rcx + r8 * 8]
vmovupd [rbx + rax * 8], zmm1
lea r10, [rbx + r8 * 8]
vmovupd [r10 + rax * 8], zmm2
#endif
# Count the remainder indices as processed.
addq rax, r15
.end_func:
# Epilogue: restore the saved registers in reverse order.
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop rbx
mov rsp, rbp
pop rbp
ret
.size gather_md_aos, .-gather_md_aos

View File

@@ -0,0 +1,67 @@
.intel_syntax noprefix
.data
.align 64
# Eight doubles of 1.0; not referenced by the code in this file.
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
# Kernel gather_soa (AVX-512): for eight indices per iteration, gathers one
# double from each of three arrays located at a, a + N doubles and
# a + 2 * N doubles.
# Register arguments as used below:
# rdi -> a      (base address of the first array)
# rsi -> idx    (32-bit indices, shared by all three arrays)
# rdx -> N      (loop bound and distance in doubles between the arrays)
# rcx -> t      (output buffer, written only when TEST is defined)
.text
.globl gather_soa
.type gather_soa, @function
gather_soa :
# Prologue: set up a frame and save rbx and r9-r15. Of these, r9 is modified
# below, and rbx and r10 only when TEST is defined.
push rbp
mov rbp, rsp
push rbx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Running element index.
xor rax, rax
# NOTE(review): ymm8 is set to all ones here but is not read anywhere else in
# this function; the gathers below take their masks from k1-k3.
vpcmpeqd ymm8, ymm8, ymm8
# Base addresses of the second (r8) and third (r9) array.
lea r8, [rdi + rdx * 8]
lea r9, [r8 + rdx * 8]
.align 16
1:
# Load eight indices.
vmovdqu ymm3, YMMWORD PTR [rsi + rax * 4]
# Comparing a register with itself sets every mask bit. The masks are rebuilt
# every iteration because a gather clears its mask register.
vpcmpeqb k1, xmm5, xmm5
vpcmpeqb k2, xmm5, xmm5
vpcmpeqb k3, xmm5, xmm5
# Clear the gather destinations.
vpxord zmm0, zmm0, zmm0
vpxord zmm1, zmm1, zmm1
vpxord zmm2, zmm2, zmm2
# Same indices, three different base addresses.
vgatherdpd zmm0{k1}, [rdi + ymm3 * 8]
vgatherdpd zmm1{k2}, [r8 + ymm3 * 8]
vgatherdpd zmm2{k3}, [r9 + ymm3 * 8]
#ifdef TEST
# Store the results into three planes of N doubles each:
# t[rax], t[N + rax] and t[2 * N + rax].
vmovupd [rcx + rax * 8], zmm0
lea rbx, [rcx + rdx * 8]
vmovupd [rbx + rax * 8], zmm1
lea r10, [rbx + rdx * 8]
vmovupd [r10 + rax * 8], zmm2
#endif
# Advance by eight elements and repeat while rax < N (signed compare).
addq rax, 8
cmpq rax, rdx
jl 1b
# Epilogue: restore the saved registers in reverse order.
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rbx
mov rsp, rbp
pop rbp
ret
.size gather_soa, .-gather_soa

View File

@@ -0,0 +1,23 @@
.intel_syntax noprefix
.data
.align 64
# Eight doubles of 1.0; not referenced by the code in this file.
SCALAR:
.double 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0
# Kernel load_aos (AVX-512): loads three consecutive doubles from the address in
# rdi and broadcasts each of them to all eight lanes of a zmm register.
# Nothing is written to memory; the results stay in zmm3, zmm4 and zmm5.
# rdi -> &a[i * snbytes]
.text
.globl load_aos
.type load_aos, @function
load_aos :
# Scalar loads of the doubles at byte offsets 0, 8 and 16.
vmovsd xmm0, QWORD PTR [rdi]
vmovsd xmm1, QWORD PTR [8 + rdi]
vmovsd xmm2, QWORD PTR [16 + rdi]
# Replicate each scalar across a full 512-bit register.
vbroadcastsd zmm3, xmm0
vbroadcastsd zmm4, xmm1
vbroadcastsd zmm5, xmm2
ret
.size load_aos, .-load_aos

View File

@@ -0,0 +1,32 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* =======================================================================================
*/
#ifndef __ALLOCATE_H_
#define __ALLOCATE_H_
/* size_t is used in the prototype below, so the header pulls in its own
 * definition instead of relying on the including file. */
#include <stddef.h>
/*
 * Allocate bytesize bytes aligned to alignment bytes.
 * On failure an error is printed to stderr and the process exits, so the
 * returned pointer is never NULL.
 */
extern void* allocate (int alignment, size_t bytesize);
#endif

Some files were not shown because too many files have changed in this diff Show More