Compare commits
35 Commits
gromacs_ma
...
0094c3c4e1
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0094c3c4e1 | ||
|
|
a13a0f3bae | ||
|
|
a6a269703d | ||
|
|
7ee250161a | ||
|
|
c73efea786 | ||
|
|
4cfa664533 | ||
|
|
1837403326 | ||
|
|
02629612a9 | ||
|
|
ce00aa0042 | ||
|
|
c4e5e87265 | ||
|
|
da3b1dd53f | ||
|
|
2f13291817 | ||
|
|
a460fffa19 | ||
| 19209bdcce | |||
|
|
151f0c0e6f | ||
|
|
72f486f9bf | ||
|
|
8253b31ee0 | ||
|
|
e206c3566d | ||
|
|
7ff1673399 | ||
|
|
b6982d56f5 | ||
|
|
1ad981a059 | ||
|
|
c438fc6832 | ||
|
|
17e239ed6d | ||
|
|
d151b9b3e4 | ||
|
|
98257b746c | ||
|
|
a101f8588a | ||
|
|
c14a6b2186 | ||
|
|
300776f512 | ||
|
|
4e5fe27c0f | ||
|
|
989bec2c7d | ||
|
|
2971ddcc63 | ||
|
|
5341938b60 | ||
|
|
039de0be99 | ||
|
|
43259eb3cf | ||
|
|
3eb7170a65 |
9
Makefile
9
Makefile
@@ -17,6 +17,9 @@ include $(MAKE_DIR)/include_ISA.mk
|
||||
include $(MAKE_DIR)/include_GROMACS.mk
|
||||
INCLUDES += -I./$(SRC_DIR)/includes -I./$(COMMON_DIR)/includes
|
||||
|
||||
ifeq ($(strip $(OPT_SCHEME)),gromacs)
|
||||
DEFINES += -DGROMACS
|
||||
endif
|
||||
ifeq ($(strip $(DATA_LAYOUT)),AOS)
|
||||
DEFINES += -DAOS
|
||||
endif
|
||||
@@ -30,6 +33,10 @@ ifneq ($(ASM_SYNTAX), ATT)
|
||||
ASFLAGS += -masm=intel
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(SORT_ATOMS)),true)
|
||||
DEFINES += -DSORT_ATOMS
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(EXPLICIT_TYPES)),true)
|
||||
DEFINES += -DEXPLICIT_TYPES
|
||||
endif
|
||||
@@ -152,7 +159,7 @@ $(BUILD_DIR)/%.o: %.s
|
||||
clean:
|
||||
$(info ===> CLEAN)
|
||||
@rm -rf $(BUILD_DIR)
|
||||
@rm -rf MDBench-$(IDENTIFIER)
|
||||
@rm -rf $(TARGET)*
|
||||
@rm -f tags
|
||||
|
||||
cleanall:
|
||||
|
||||
@@ -1,626 +0,0 @@
|
||||
# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.0.5.281 Build 20190815";
|
||||
# mark_description "-I/mnt/opt/likwid-5.2-dev/include -I./src/includes -S -D_GNU_SOURCE -DLIKWID_PERFMON -DAOS -DPRECISION=2 -DN";
|
||||
# mark_description "EIGHBORS_LOOP_RUNS=1 -DVECTOR_WIDTH=8 -DALIGNMENT=64 -restrict -Ofast -xCORE-AVX512 -qopt-zmm-usage=high -o ";
|
||||
# mark_description "ICC/force.s";
|
||||
.file "force.c"
|
||||
.text
|
||||
..TXTST0:
|
||||
.L_2__routine_start_computeForce_0:
|
||||
# -- Begin computeForce
|
||||
.text
|
||||
# mark_begin;
|
||||
.align 16,0x90
|
||||
.globl computeForce
|
||||
# --- computeForce(Parameter *, Atom *, Neighbor *, int, int, int)
|
||||
computeForce:
|
||||
# parameter 1: %rdi
|
||||
# parameter 2: %rsi
|
||||
# parameter 3: %rdx
|
||||
# parameter 4: %ecx
|
||||
# parameter 5: %r8d
|
||||
# parameter 6: %r9d
|
||||
..B1.1: # Preds ..B1.0
|
||||
# Execution count [1.00e+00]
|
||||
.cfi_startproc
|
||||
..___tag_value_computeForce.1:
|
||||
..L2:
|
||||
#121.112
|
||||
pushq %rbp #121.112
|
||||
.cfi_def_cfa_offset 16
|
||||
movq %rsp, %rbp #121.112
|
||||
.cfi_def_cfa 6, 16
|
||||
.cfi_offset 6, -16
|
||||
andq $-64, %rsp #121.112
|
||||
pushq %r12 #121.112
|
||||
pushq %r13 #121.112
|
||||
pushq %r14 #121.112
|
||||
pushq %r15 #121.112
|
||||
pushq %rbx #121.112
|
||||
subq $88, %rsp #121.112
|
||||
xorl %eax, %eax #124.16
|
||||
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22
|
||||
movq %rdx, %r15 #121.112
|
||||
movq %rsi, %r12 #121.112
|
||||
movq %rdi, %rbx #121.112
|
||||
..___tag_value_computeForce.11:
|
||||
# getTimeStamp()
|
||||
call getTimeStamp #124.16
|
||||
..___tag_value_computeForce.12:
|
||||
# LOE rbx r12 r15 xmm0
|
||||
..B1.51: # Preds ..B1.1
|
||||
# Execution count [1.00e+00]
|
||||
vmovsd %xmm0, 24(%rsp) #124.16[spill]
|
||||
# LOE rbx r12 r15
|
||||
..B1.2: # Preds ..B1.51
|
||||
# Execution count [1.00e+00]
|
||||
movl 4(%r12), %r13d #125.18
|
||||
movq 64(%r12), %r9 #127.20
|
||||
movq 72(%r12), %r14 #127.45
|
||||
movq 80(%r12), %r8 #127.70
|
||||
vmovsd 72(%rbx), %xmm2 #129.27
|
||||
vmovsd 8(%rbx), %xmm1 #130.23
|
||||
vmovsd (%rbx), %xmm0 #131.24
|
||||
testl %r13d, %r13d #134.24
|
||||
jle ..B1.43 # Prob 50% #134.24
|
||||
# LOE r8 r9 r12 r14 r15 r13d xmm0 xmm1 xmm2
|
||||
..B1.3: # Preds ..B1.2
|
||||
# Execution count [1.00e+00]
|
||||
xorl %ebx, %ebx #134.5
|
||||
movl %r13d, %edx #134.5
|
||||
xorl %ecx, %ecx #134.5
|
||||
movl $1, %esi #134.5
|
||||
xorl %eax, %eax #135.17
|
||||
shrl $1, %edx #134.5
|
||||
je ..B1.7 # Prob 9% #134.5
|
||||
# LOE rax rdx rcx rbx r8 r9 r12 r14 r15 esi r13d xmm0 xmm1 xmm2
|
||||
..B1.5: # Preds ..B1.3 ..B1.5
|
||||
# Execution count [2.50e+00]
|
||||
movq %rax, (%rcx,%r9) #135.9
|
||||
incq %rbx #134.5
|
||||
movq %rax, (%rcx,%r14) #136.9
|
||||
movq %rax, (%rcx,%r8) #137.9
|
||||
movq %rax, 8(%rcx,%r9) #135.9
|
||||
movq %rax, 8(%rcx,%r14) #136.9
|
||||
movq %rax, 8(%rcx,%r8) #137.9
|
||||
addq $16, %rcx #134.5
|
||||
cmpq %rdx, %rbx #134.5
|
||||
jb ..B1.5 # Prob 63% #134.5
|
||||
# LOE rax rdx rcx rbx r8 r9 r12 r14 r15 r13d xmm0 xmm1 xmm2
|
||||
..B1.6: # Preds ..B1.5
|
||||
# Execution count [9.00e-01]
|
||||
lea 1(%rbx,%rbx), %esi #135.9
|
||||
# LOE rax r8 r9 r12 r14 r15 esi r13d xmm0 xmm1 xmm2
|
||||
..B1.7: # Preds ..B1.3 ..B1.6
|
||||
# Execution count [1.00e+00]
|
||||
lea -1(%rsi), %edx #134.5
|
||||
cmpl %r13d, %edx #134.5
|
||||
jae ..B1.9 # Prob 9% #134.5
|
||||
# LOE rax r8 r9 r12 r14 r15 esi r13d xmm0 xmm1 xmm2
|
||||
..B1.8: # Preds ..B1.7
|
||||
# Execution count [9.00e-01]
|
||||
movslq %esi, %rsi #134.5
|
||||
movq %rax, -8(%r9,%rsi,8) #135.9
|
||||
movq %rax, -8(%r14,%rsi,8) #136.9
|
||||
movq %rax, -8(%r8,%rsi,8) #137.9
|
||||
# LOE r8 r9 r12 r14 r15 r13d xmm0 xmm1 xmm2
|
||||
..B1.9: # Preds ..B1.7 ..B1.8
|
||||
# Execution count [5.00e-01]
|
||||
movl $.L_2__STRING.0, %edi #141.5
|
||||
movq %r8, 32(%rsp) #141.5[spill]
|
||||
movq %r9, 80(%rsp) #141.5[spill]
|
||||
vmovsd %xmm2, (%rsp) #141.5[spill]
|
||||
vmovsd %xmm1, 8(%rsp) #141.5[spill]
|
||||
vmovsd %xmm0, 16(%rsp) #141.5[spill]
|
||||
..___tag_value_computeForce.18:
|
||||
# likwid_markerStartRegion(const char *)
|
||||
call likwid_markerStartRegion #141.5
|
||||
..___tag_value_computeForce.19:
|
||||
# LOE r12 r14 r15 r13d
|
||||
..B1.10: # Preds ..B1.9
|
||||
# Execution count [9.00e-01]
|
||||
vmovsd 16(%rsp), %xmm0 #[spill]
|
||||
xorl %esi, %esi #143.15
|
||||
vmovsd (%rsp), %xmm2 #[spill]
|
||||
xorl %eax, %eax #143.5
|
||||
vmulsd %xmm2, %xmm2, %xmm13 #129.45
|
||||
xorl %edi, %edi #143.5
|
||||
vmovdqu32 .L_2il0floatpacket.0(%rip), %ymm16 #173.13
|
||||
vmulsd .L_2il0floatpacket.3(%rip), %xmm0, %xmm0 #197.45
|
||||
vmovdqu .L_2il0floatpacket.1(%rip), %ymm15 #173.13
|
||||
vmovups .L_2il0floatpacket.4(%rip), %zmm5 #197.58
|
||||
vmovsd 8(%rsp), %xmm1 #[spill]
|
||||
vbroadcastsd %xmm13, %zmm14 #129.25
|
||||
vbroadcastsd %xmm1, %zmm13 #130.21
|
||||
vbroadcastsd %xmm0, %zmm9 #197.45
|
||||
movslq %r13d, %r13 #143.5
|
||||
movq 24(%r15), %r10 #145.25
|
||||
movslq 16(%r15), %rdx #144.43
|
||||
movq 8(%r15), %rcx #144.19
|
||||
movq 32(%rsp), %r8 #[spill]
|
||||
movq 16(%r12), %rbx #146.25
|
||||
shlq $2, %rdx #126.5
|
||||
movq %r13, 64(%rsp) #143.5[spill]
|
||||
movq %r10, 72(%rsp) #143.5[spill]
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
|
||||
..B1.11: # Preds ..B1.41 ..B1.10
|
||||
# Execution count [5.00e+00]
|
||||
movq 72(%rsp), %r9 #145.25[spill]
|
||||
vxorpd %xmm24, %xmm24, %xmm24 #149.22
|
||||
vmovapd %xmm24, %xmm18 #150.22
|
||||
movl (%r9,%rax,4), %r10d #145.25
|
||||
vmovapd %xmm18, %xmm4 #151.22
|
||||
vmovsd (%rdi,%rbx), %xmm10 #146.25
|
||||
vmovsd 8(%rdi,%rbx), %xmm6 #147.25
|
||||
vmovsd 16(%rdi,%rbx), %xmm12 #148.25
|
||||
testl %r10d, %r10d #173.32
|
||||
jle ..B1.41 # Prob 50% #173.32
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d xmm4 xmm6 xmm10 xmm12 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
|
||||
..B1.12: # Preds ..B1.11
|
||||
# Execution count [4.50e+00]
|
||||
vpxord %zmm8, %zmm8, %zmm8 #149.22
|
||||
vmovaps %zmm8, %zmm7 #150.22
|
||||
vmovaps %zmm7, %zmm11 #151.22
|
||||
cmpl $8, %r10d #173.13
|
||||
jl ..B1.48 # Prob 10% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.13: # Preds ..B1.12
|
||||
# Execution count [4.50e+00]
|
||||
cmpl $1200, %r10d #173.13
|
||||
jl ..B1.47 # Prob 10% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.14: # Preds ..B1.13
|
||||
# Execution count [4.50e+00]
|
||||
movq %rdx, %r15 #144.43
|
||||
imulq %rsi, %r15 #144.43
|
||||
addq %rcx, %r15 #126.5
|
||||
movq %r15, %r11 #173.13
|
||||
andq $63, %r11 #173.13
|
||||
testl $3, %r11d #173.13
|
||||
je ..B1.16 # Prob 50% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.15: # Preds ..B1.14
|
||||
# Execution count [2.25e+00]
|
||||
xorl %r11d, %r11d #173.13
|
||||
jmp ..B1.18 # Prob 100% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.16: # Preds ..B1.14
|
||||
# Execution count [2.25e+00]
|
||||
testl %r11d, %r11d #173.13
|
||||
je ..B1.18 # Prob 50% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.17: # Preds ..B1.16
|
||||
# Execution count [2.50e+01]
|
||||
negl %r11d #173.13
|
||||
addl $64, %r11d #173.13
|
||||
shrl $2, %r11d #173.13
|
||||
cmpl %r11d, %r10d #173.13
|
||||
cmovl %r10d, %r11d #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.18: # Preds ..B1.15 ..B1.17 ..B1.16
|
||||
# Execution count [5.00e+00]
|
||||
movl %r10d, %r13d #173.13
|
||||
subl %r11d, %r13d #173.13
|
||||
andl $7, %r13d #173.13
|
||||
negl %r13d #173.13
|
||||
addl %r10d, %r13d #173.13
|
||||
cmpl $1, %r11d #173.13
|
||||
jb ..B1.26 # Prob 50% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.19: # Preds ..B1.18
|
||||
# Execution count [4.50e+00]
|
||||
vmovdqa %ymm15, %ymm4 #173.13
|
||||
xorl %r12d, %r12d #173.13
|
||||
vpbroadcastd %r11d, %ymm3 #173.13
|
||||
vbroadcastsd %xmm10, %zmm2 #146.23
|
||||
vbroadcastsd %xmm6, %zmm1 #147.23
|
||||
vbroadcastsd %xmm12, %zmm0 #148.23
|
||||
movslq %r11d, %r9 #173.13
|
||||
movq %r8, 32(%rsp) #173.13[spill]
|
||||
movq %r14, (%rsp) #173.13[spill]
|
||||
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.20: # Preds ..B1.24 ..B1.19
|
||||
# Execution count [2.50e+01]
|
||||
vpcmpgtd %ymm4, %ymm3, %k3 #173.13
|
||||
vmovdqu32 (%r15,%r12,4), %ymm17{%k3}{z} #174.25
|
||||
kmovw %k3, %r14d #173.13
|
||||
vpaddd %ymm17, %ymm17, %ymm18 #175.40
|
||||
vpaddd %ymm18, %ymm17, %ymm17 #175.40
|
||||
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 ymm17 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
|
||||
..B1.23: # Preds ..B1.20
|
||||
# Execution count [1.25e+01]
|
||||
kmovw %k3, %k1 #175.40
|
||||
kmovw %k3, %k2 #175.40
|
||||
vpxord %zmm18, %zmm18, %zmm18 #175.40
|
||||
vpxord %zmm19, %zmm19, %zmm19 #175.40
|
||||
vpxord %zmm20, %zmm20, %zmm20 #175.40
|
||||
vgatherdpd 16(%rbx,%ymm17,8), %zmm18{%k1} #175.40
|
||||
vgatherdpd 8(%rbx,%ymm17,8), %zmm19{%k2} #175.40
|
||||
vgatherdpd (%rbx,%ymm17,8), %zmm20{%k3} #175.40
|
||||
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm18 zmm19 zmm20
|
||||
..B1.24: # Preds ..B1.23
|
||||
# Execution count [2.50e+01]
|
||||
addq $8, %r12 #173.13
|
||||
#vpaddd %ymm16, %ymm4, %ymm4 #173.13
|
||||
#vsubpd %zmm18, %zmm0, %zmm29 #177.40
|
||||
#vsubpd %zmm19, %zmm1, %zmm27 #176.40
|
||||
#vsubpd %zmm20, %zmm2, %zmm26 #175.40
|
||||
#vmulpd %zmm27, %zmm27, %zmm25 #178.53
|
||||
#vfmadd231pd %zmm26, %zmm26, %zmm25 #178.53
|
||||
#vfmadd231pd %zmm29, %zmm29, %zmm25 #178.67
|
||||
#vrcp14pd %zmm25, %zmm24 #195.42
|
||||
#vcmppd $1, %zmm14, %zmm25, %k2 #194.26
|
||||
#vfpclasspd $30, %zmm24, %k0 #195.42
|
||||
#kmovw %k2, %r8d #194.26
|
||||
#knotw %k0, %k1 #195.42
|
||||
#vmovaps %zmm25, %zmm17 #195.42
|
||||
#andl %r8d, %r14d #194.26
|
||||
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm24, %zmm17 #195.42
|
||||
#kmovw %r14d, %k3 #198.21
|
||||
#vmulpd %zmm17, %zmm17, %zmm18 #195.42
|
||||
#vfmadd213pd %zmm24, %zmm17, %zmm24{%k1} #195.42
|
||||
#vfmadd213pd %zmm24, %zmm18, %zmm24{%k1} #195.42
|
||||
#vmulpd %zmm13, %zmm24, %zmm19 #196.42
|
||||
#vmulpd %zmm9, %zmm24, %zmm21 #197.58
|
||||
#vmulpd %zmm19, %zmm24, %zmm22 #196.48
|
||||
#vmulpd %zmm22, %zmm24, %zmm20 #196.54
|
||||
#vfmsub213pd %zmm5, %zmm22, %zmm24 #197.58
|
||||
#vmulpd %zmm21, %zmm20, %zmm23 #197.65
|
||||
#vmulpd %zmm24, %zmm23, %zmm28 #197.71
|
||||
#vfmadd231pd %zmm26, %zmm28, %zmm8{%k3} #198.21
|
||||
#vfmadd231pd %zmm27, %zmm28, %zmm7{%k3} #199.21
|
||||
#vfmadd231pd %zmm29, %zmm28, %zmm11{%k3} #200.21
|
||||
cmpq %r9, %r12 #173.13
|
||||
jb ..B1.20 # Prob 82% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.25: # Preds ..B1.24
|
||||
# Execution count [4.50e+00]
|
||||
movq 32(%rsp), %r8 #[spill]
|
||||
movq (%rsp), %r14 #[spill]
|
||||
cmpl %r11d, %r10d #173.13
|
||||
je ..B1.40 # Prob 10% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.26: # Preds ..B1.25 ..B1.18 ..B1.47
|
||||
# Execution count [2.50e+01]
|
||||
lea 8(%r11), %r9d #173.13
|
||||
cmpl %r9d, %r13d #173.13
|
||||
jl ..B1.34 # Prob 50% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.27: # Preds ..B1.26
|
||||
# Execution count [4.50e+00]
|
||||
movq %rdx, %r12 #144.43
|
||||
imulq %rsi, %r12 #144.43
|
||||
vbroadcastsd %xmm10, %zmm1 #146.23
|
||||
vbroadcastsd %xmm6, %zmm0 #147.23
|
||||
vbroadcastsd %xmm12, %zmm2 #148.23
|
||||
movslq %r11d, %r9 #173.13
|
||||
addq %rcx, %r12 #126.5
|
||||
movq %rdi, 8(%rsp) #126.5[spill]
|
||||
movq %rdx, 16(%rsp) #126.5[spill]
|
||||
movq %rcx, 40(%rsp) #126.5[spill]
|
||||
movq %rax, 48(%rsp) #126.5[spill]
|
||||
movq %rsi, 56(%rsp) #126.5[spill]
|
||||
movq %r8, 32(%rsp) #126.5[spill]
|
||||
movq %r14, (%rsp) #126.5[spill]
|
||||
# LOE rbx r9 r12 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.28: # Preds ..B1.32 ..B1.27
|
||||
# Execution count [2.50e+01]
|
||||
vmovdqu (%r12,%r9,4), %ymm3 #174.25
|
||||
vpaddd %ymm3, %ymm3, %ymm4 #175.40
|
||||
vpaddd %ymm4, %ymm3, %ymm3 #175.40
|
||||
movl (%r12,%r9,4), %r14d #174.25
|
||||
movl 4(%r12,%r9,4), %r8d #174.25
|
||||
movl 8(%r12,%r9,4), %edi #174.25
|
||||
movl 12(%r12,%r9,4), %esi #174.25
|
||||
lea (%r14,%r14,2), %r14d #175.40
|
||||
movl 16(%r12,%r9,4), %ecx #174.25
|
||||
lea (%r8,%r8,2), %r8d #175.40
|
||||
movl 20(%r12,%r9,4), %edx #174.25
|
||||
lea (%rdi,%rdi,2), %edi #175.40
|
||||
movl 24(%r12,%r9,4), %eax #174.25
|
||||
lea (%rsi,%rsi,2), %esi #175.40
|
||||
movl 28(%r12,%r9,4), %r15d #174.25
|
||||
lea (%rcx,%rcx,2), %ecx #175.40
|
||||
lea (%rdx,%rdx,2), %edx #175.40
|
||||
lea (%rax,%rax,2), %eax #175.40
|
||||
lea (%r15,%r15,2), %r15d #175.40
|
||||
# LOE rbx r9 r12 eax edx ecx esi edi r8d r10d r11d r13d r14d r15d xmm6 xmm10 xmm12 ymm3 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.31: # Preds ..B1.28
|
||||
# Execution count [1.25e+01]
|
||||
vpcmpeqb %xmm0, %xmm0, %k1 #175.40
|
||||
vpcmpeqb %xmm0, %xmm0, %k2 #175.40
|
||||
vpcmpeqb %xmm0, %xmm0, %k3 #175.40
|
||||
vpxord %zmm4, %zmm4, %zmm4 #175.40
|
||||
vpxord %zmm17, %zmm17, %zmm17 #175.40
|
||||
vpxord %zmm18, %zmm18, %zmm18 #175.40
|
||||
vgatherdpd 16(%rbx,%ymm3,8), %zmm4{%k1} #175.40
|
||||
vgatherdpd 8(%rbx,%ymm3,8), %zmm17{%k2} #175.40
|
||||
vgatherdpd (%rbx,%ymm3,8), %zmm18{%k3} #175.40
|
||||
# LOE rbx r9 r12 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm17 zmm18
|
||||
..B1.32: # Preds ..B1.31
|
||||
# Execution count [2.50e+01]
|
||||
addl $8, %r11d #173.13
|
||||
addq $8, %r9 #173.13
|
||||
#vsubpd %zmm4, %zmm2, %zmm26 #177.40
|
||||
#vsubpd %zmm17, %zmm0, %zmm24 #176.40
|
||||
#vsubpd %zmm18, %zmm1, %zmm23 #175.40
|
||||
#vmulpd %zmm24, %zmm24, %zmm3 #178.53
|
||||
#vfmadd231pd %zmm23, %zmm23, %zmm3 #178.53
|
||||
#vfmadd231pd %zmm26, %zmm26, %zmm3 #178.67
|
||||
#vrcp14pd %zmm3, %zmm22 #195.42
|
||||
#vcmppd $1, %zmm14, %zmm3, %k2 #194.26
|
||||
#vfpclasspd $30, %zmm22, %k0 #195.42
|
||||
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm22, %zmm3 #195.42
|
||||
#knotw %k0, %k1 #195.42
|
||||
#vmulpd %zmm3, %zmm3, %zmm4 #195.42
|
||||
#vfmadd213pd %zmm22, %zmm3, %zmm22{%k1} #195.42
|
||||
#vfmadd213pd %zmm22, %zmm4, %zmm22{%k1} #195.42
|
||||
#vmulpd %zmm13, %zmm22, %zmm17 #196.42
|
||||
#vmulpd %zmm9, %zmm22, %zmm19 #197.58
|
||||
#vmulpd %zmm17, %zmm22, %zmm20 #196.48
|
||||
#vmulpd %zmm20, %zmm22, %zmm18 #196.54
|
||||
#vfmsub213pd %zmm5, %zmm20, %zmm22 #197.58
|
||||
#vmulpd %zmm19, %zmm18, %zmm21 #197.65
|
||||
#vmulpd %zmm22, %zmm21, %zmm25 #197.71
|
||||
#vfmadd231pd %zmm23, %zmm25, %zmm8{%k2} #198.21
|
||||
#vfmadd231pd %zmm24, %zmm25, %zmm7{%k2} #199.21
|
||||
#vfmadd231pd %zmm26, %zmm25, %zmm11{%k2} #200.21
|
||||
cmpl %r13d, %r11d #173.13
|
||||
jb ..B1.28 # Prob 82% #173.13
|
||||
# LOE rbx r9 r12 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.33: # Preds ..B1.32
|
||||
# Execution count [4.50e+00]
|
||||
movq 8(%rsp), %rdi #[spill]
|
||||
movq 16(%rsp), %rdx #[spill]
|
||||
movq 40(%rsp), %rcx #[spill]
|
||||
movq 48(%rsp), %rax #[spill]
|
||||
movq 56(%rsp), %rsi #[spill]
|
||||
movq 32(%rsp), %r8 #[spill]
|
||||
movq (%rsp), %r14 #[spill]
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.34: # Preds ..B1.33 ..B1.26 ..B1.48
|
||||
# Execution count [5.00e+00]
|
||||
lea 1(%r13), %r9d #173.13
|
||||
cmpl %r10d, %r9d #173.13
|
||||
ja ..B1.40 # Prob 50% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.35: # Preds ..B1.34
|
||||
# Execution count [2.50e+01]
|
||||
imulq %rdx, %rsi #144.43
|
||||
vbroadcastsd %xmm10, %zmm4 #146.23
|
||||
subl %r13d, %r10d #173.13
|
||||
addq %rcx, %rsi #126.5
|
||||
vpbroadcastd %r10d, %ymm0 #173.13
|
||||
vpcmpgtd %ymm15, %ymm0, %k3 #173.13
|
||||
movslq %r13d, %r13 #173.13
|
||||
kmovw %k3, %r9d #173.13
|
||||
vmovdqu32 (%rsi,%r13,4), %ymm1{%k3}{z} #174.25
|
||||
vpaddd %ymm1, %ymm1, %ymm2 #175.40
|
||||
vpaddd %ymm2, %ymm1, %ymm0 #175.40
|
||||
# LOE rax rdx rcx rbx rdi r8 r14 r9d xmm6 xmm12 ymm0 ymm15 ymm16 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
|
||||
..B1.38: # Preds ..B1.35
|
||||
# Execution count [1.25e+01]
|
||||
kmovw %k3, %k1 #175.40
|
||||
kmovw %k3, %k2 #175.40
|
||||
vpxord %zmm1, %zmm1, %zmm1 #175.40
|
||||
vpxord %zmm2, %zmm2, %zmm2 #175.40
|
||||
vpxord %zmm3, %zmm3, %zmm3 #175.40
|
||||
vgatherdpd 16(%rbx,%ymm0,8), %zmm1{%k1} #175.40
|
||||
vgatherdpd 8(%rbx,%ymm0,8), %zmm2{%k2} #175.40
|
||||
vgatherdpd (%rbx,%ymm0,8), %zmm3{%k3} #175.40
|
||||
# LOE rax rdx rcx rbx rdi r8 r14 r9d xmm6 xmm12 ymm15 ymm16 zmm1 zmm2 zmm3 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.39: # Preds ..B1.38
|
||||
# Execution count [2.50e+01]
|
||||
#vbroadcastsd %xmm6, %zmm6 #147.23
|
||||
#vbroadcastsd %xmm12, %zmm12 #148.23
|
||||
#vsubpd %zmm1, %zmm12, %zmm23 #177.40
|
||||
#vsubpd %zmm2, %zmm6, %zmm21 #176.40
|
||||
#vsubpd %zmm3, %zmm4, %zmm20 #175.40
|
||||
#vmulpd %zmm21, %zmm21, %zmm19 #178.53
|
||||
#vfmadd231pd %zmm20, %zmm20, %zmm19 #178.53
|
||||
#vfmadd231pd %zmm23, %zmm23, %zmm19 #178.67
|
||||
#vrcp14pd %zmm19, %zmm18 #195.42
|
||||
#vcmppd $1, %zmm14, %zmm19, %k2 #194.26
|
||||
#vfpclasspd $30, %zmm18, %k0 #195.42
|
||||
#kmovw %k2, %esi #194.26
|
||||
#knotw %k0, %k1 #195.42
|
||||
#vmovaps %zmm19, %zmm0 #195.42
|
||||
#andl %esi, %r9d #194.26
|
||||
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm18, %zmm0 #195.42
|
||||
#kmovw %r9d, %k3 #198.21
|
||||
#vmulpd %zmm0, %zmm0, %zmm1 #195.42
|
||||
#vfmadd213pd %zmm18, %zmm0, %zmm18{%k1} #195.42
|
||||
#vfmadd213pd %zmm18, %zmm1, %zmm18{%k1} #195.42
|
||||
#vmulpd %zmm13, %zmm18, %zmm2 #196.42
|
||||
#vmulpd %zmm9, %zmm18, %zmm4 #197.58
|
||||
#vmulpd %zmm2, %zmm18, %zmm10 #196.48
|
||||
#vmulpd %zmm10, %zmm18, %zmm3 #196.54
|
||||
#vfmsub213pd %zmm5, %zmm10, %zmm18 #197.58
|
||||
#vmulpd %zmm4, %zmm3, %zmm17 #197.65
|
||||
#vmulpd %zmm18, %zmm17, %zmm22 #197.71
|
||||
#vfmadd231pd %zmm20, %zmm22, %zmm8{%k3} #198.21
|
||||
#vfmadd231pd %zmm21, %zmm22, %zmm7{%k3} #199.21
|
||||
#vfmadd231pd %zmm23, %zmm22, %zmm11{%k3} #200.21
|
||||
# LOE rax rdx rcx rbx rdi r8 r14 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.40: # Preds ..B1.25 ..B1.39 ..B1.34
|
||||
# Execution count [4.50e+00]
|
||||
vmovups .L_2il0floatpacket.10(%rip), %zmm19 #151.22
|
||||
vpermd %zmm11, %zmm19, %zmm0 #151.22
|
||||
vpermd %zmm7, %zmm19, %zmm6 #150.22
|
||||
vpermd %zmm8, %zmm19, %zmm20 #149.22
|
||||
vaddpd %zmm11, %zmm0, %zmm11 #151.22
|
||||
vaddpd %zmm7, %zmm6, %zmm7 #150.22
|
||||
vaddpd %zmm8, %zmm20, %zmm8 #149.22
|
||||
vpermpd $78, %zmm11, %zmm1 #151.22
|
||||
vpermpd $78, %zmm7, %zmm10 #150.22
|
||||
vpermpd $78, %zmm8, %zmm21 #149.22
|
||||
vaddpd %zmm1, %zmm11, %zmm2 #151.22
|
||||
vaddpd %zmm10, %zmm7, %zmm12 #150.22
|
||||
vaddpd %zmm21, %zmm8, %zmm22 #149.22
|
||||
vpermpd $177, %zmm2, %zmm3 #151.22
|
||||
vpermpd $177, %zmm12, %zmm17 #150.22
|
||||
vpermpd $177, %zmm22, %zmm23 #149.22
|
||||
vaddpd %zmm3, %zmm2, %zmm4 #151.22
|
||||
vaddpd %zmm17, %zmm12, %zmm18 #150.22
|
||||
vaddpd %zmm23, %zmm22, %zmm24 #149.22
|
||||
# LOE rax rdx rcx rbx rdi r8 r14 xmm4 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
|
||||
..B1.41: # Preds ..B1.40 ..B1.11
|
||||
# Execution count [5.00e+00]
|
||||
movq 80(%rsp), %rsi #208.9[spill]
|
||||
addq $24, %rdi #143.5
|
||||
vaddsd (%rsi,%rax,8), %xmm24, %xmm0 #208.9
|
||||
vmovsd %xmm0, (%rsi,%rax,8) #208.9
|
||||
movslq %eax, %rsi #143.32
|
||||
vaddsd (%r14,%rax,8), %xmm18, %xmm1 #209.9
|
||||
vmovsd %xmm1, (%r14,%rax,8) #209.9
|
||||
incq %rsi #143.32
|
||||
vaddsd (%r8,%rax,8), %xmm4, %xmm2 #210.9
|
||||
vmovsd %xmm2, (%r8,%rax,8) #210.9
|
||||
incq %rax #143.5
|
||||
cmpq 64(%rsp), %rax #143.5[spill]
|
||||
jb ..B1.11 # Prob 82% #143.5
|
||||
jmp ..B1.44 # Prob 100% #143.5
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
|
||||
..B1.43: # Preds ..B1.2
|
||||
# Execution count [5.00e-01]
|
||||
movl $.L_2__STRING.0, %edi #141.5
|
||||
..___tag_value_computeForce.48:
|
||||
# likwid_markerStartRegion(const char *)
|
||||
call likwid_markerStartRegion #141.5
|
||||
..___tag_value_computeForce.49:
|
||||
# LOE
|
||||
..B1.44: # Preds ..B1.41 ..B1.43
|
||||
# Execution count [1.00e+00]
|
||||
movl $.L_2__STRING.0, %edi #219.5
|
||||
vzeroupper #219.5
|
||||
..___tag_value_computeForce.50:
|
||||
# likwid_markerStopRegion(const char *)
|
||||
call likwid_markerStopRegion #219.5
|
||||
..___tag_value_computeForce.51:
|
||||
# LOE
|
||||
..B1.45: # Preds ..B1.44
|
||||
# Execution count [1.00e+00]
|
||||
xorl %eax, %eax #221.16
|
||||
..___tag_value_computeForce.52:
|
||||
# getTimeStamp()
|
||||
call getTimeStamp #221.16
|
||||
..___tag_value_computeForce.53:
|
||||
# LOE xmm0
|
||||
..B1.46: # Preds ..B1.45
|
||||
# Execution count [1.00e+00]
|
||||
vsubsd 24(%rsp), %xmm0, %xmm0 #224.14[spill]
|
||||
addq $88, %rsp #224.14
|
||||
.cfi_restore 3
|
||||
popq %rbx #224.14
|
||||
.cfi_restore 15
|
||||
popq %r15 #224.14
|
||||
.cfi_restore 14
|
||||
popq %r14 #224.14
|
||||
.cfi_restore 13
|
||||
popq %r13 #224.14
|
||||
.cfi_restore 12
|
||||
popq %r12 #224.14
|
||||
movq %rbp, %rsp #224.14
|
||||
popq %rbp #224.14
|
||||
.cfi_def_cfa 7, 8
|
||||
.cfi_restore 6
|
||||
ret #224.14
|
||||
.cfi_def_cfa 6, 16
|
||||
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_offset 6, -16
|
||||
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22
|
||||
# LOE
|
||||
..B1.47: # Preds ..B1.13
|
||||
# Execution count [4.50e-01]: Infreq
|
||||
movl %r10d, %r13d #173.13
|
||||
xorl %r11d, %r11d #173.13
|
||||
andl $-8, %r13d #173.13
|
||||
jmp ..B1.26 # Prob 100% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.48: # Preds ..B1.12
|
||||
# Execution count [4.50e-01]: Infreq
|
||||
xorl %r13d, %r13d #173.13
|
||||
jmp ..B1.34 # Prob 100% #173.13
|
||||
.align 16,0x90
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
.cfi_endproc
|
||||
# mark_end;
|
||||
.type computeForce,@function
|
||||
.size computeForce,.-computeForce
|
||||
..LNcomputeForce.0:
|
||||
.data
|
||||
# -- End computeForce
|
||||
.section .rodata, "a"
|
||||
.align 64
|
||||
.align 64
|
||||
.L_2il0floatpacket.2:
|
||||
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
|
||||
.type .L_2il0floatpacket.2,@object
|
||||
.size .L_2il0floatpacket.2,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.4:
|
||||
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
|
||||
.type .L_2il0floatpacket.4,@object
|
||||
.size .L_2il0floatpacket.4,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.5:
|
||||
.long 0x02010101,0x04040202,0x08080804,0x20101010,0x40402020,0x80808040,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000
|
||||
.type .L_2il0floatpacket.5,@object
|
||||
.size .L_2il0floatpacket.5,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.6:
|
||||
.long 0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000,0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000
|
||||
.type .L_2il0floatpacket.6,@object
|
||||
.size .L_2il0floatpacket.6,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.7:
|
||||
.long 0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000,0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000
|
||||
.type .L_2il0floatpacket.7,@object
|
||||
.size .L_2il0floatpacket.7,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.8:
|
||||
.long 0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000,0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000
|
||||
.type .L_2il0floatpacket.8,@object
|
||||
.size .L_2il0floatpacket.8,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.10:
|
||||
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
|
||||
.type .L_2il0floatpacket.10,@object
|
||||
.size .L_2il0floatpacket.10,64
|
||||
.align 32
|
||||
.L_2il0floatpacket.0:
|
||||
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
|
||||
.type .L_2il0floatpacket.0,@object
|
||||
.size .L_2il0floatpacket.0,32
|
||||
.align 32
|
||||
.L_2il0floatpacket.1:
|
||||
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
|
||||
.type .L_2il0floatpacket.1,@object
|
||||
.size .L_2il0floatpacket.1,32
|
||||
.align 8
|
||||
.L_2il0floatpacket.3:
|
||||
.long 0x00000000,0x40480000
|
||||
.type .L_2il0floatpacket.3,@object
|
||||
.size .L_2il0floatpacket.3,8
|
||||
.align 8
|
||||
.L_2il0floatpacket.9:
|
||||
.long 0x00000000,0x3ff00000
|
||||
.type .L_2il0floatpacket.9,@object
|
||||
.size .L_2il0floatpacket.9,8
|
||||
.section .rodata.str1.4, "aMS",@progbits,1
|
||||
.align 4
|
||||
.align 4
|
||||
.L_2__STRING.0:
|
||||
.long 1668444006
|
||||
.word 101
|
||||
.type .L_2__STRING.0,@object
|
||||
.size .L_2__STRING.0,6
|
||||
.data
|
||||
.section .note.GNU-stack, ""
|
||||
# End
|
||||
@@ -1,585 +0,0 @@
|
||||
# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.0.5.281 Build 20190815";
|
||||
# mark_description "-I./src/includes -S -D_GNU_SOURCE -DAOS -DPRECISION=2 -DNEIGHBORS_LOOP_RUNS=1 -DVECTOR_WIDTH=8 -DALIGNMENT=6";
|
||||
# mark_description "4 -restrict -Ofast -xCORE-AVX512 -qopt-zmm-usage=high -o ICC/force.s";
|
||||
.file "force.c"
|
||||
.text
|
||||
..TXTST0:
|
||||
.L_2__routine_start_computeForce_0:
|
||||
# -- Begin computeForce
|
||||
.text
|
||||
# mark_begin;
|
||||
.align 16,0x90
|
||||
.globl computeForce
|
||||
# --- computeForce(Parameter *, Atom *, Neighbor *, int)
|
||||
computeForce:
|
||||
# Entry block of the ICC-generated (AT&T syntax) computeForce.
# Returns in %xmm0 the difference between the getTimeStamp() value taken at
# exit and the one taken here at entry.
# The incoming %ecx (parameter 4) is not read before it is overwritten in ..B1.2.
# parameter 1: %rdi
|
||||
# parameter 2: %rsi
|
||||
# parameter 3: %rdx
|
||||
# parameter 4: %ecx
|
||||
..B1.1: # Preds ..B1.0
|
||||
# Execution count [1.00e+00]
|
||||
.cfi_startproc
|
||||
..___tag_value_computeForce.1:
|
||||
..L2:
|
||||
#103.87
|
||||
pushq %rbp #103.87
|
||||
.cfi_def_cfa_offset 16
|
||||
movq %rsp, %rbp #103.87
|
||||
.cfi_def_cfa 6, 16
|
||||
.cfi_offset 6, -16
|
||||
# Round %rsp down to a 64-byte boundary; %rbp keeps the original frame address.
andq $-64, %rsp #103.87
|
||||
# r12-r14 are saved because they carry the three pointer arguments across the call.
pushq %r12 #103.87
|
||||
pushq %r13 #103.87
|
||||
pushq %r14 #103.87
|
||||
# 3 pushes (24 bytes) + 104 = 128 bytes, so %rsp is again 64-byte aligned.
subq $104, %rsp #103.87
|
||||
xorl %eax, %eax #106.16
|
||||
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
|
||||
# Keep the arguments in callee-saved registers: r14 = parameter 3,
# r13 = parameter 2, r12 = parameter 1.
movq %rdx, %r14 #103.87
|
||||
movq %rsi, %r13 #103.87
|
||||
movq %rdi, %r12 #103.87
|
||||
..___tag_value_computeForce.9:
|
||||
# getTimeStamp()
|
||||
call getTimeStamp #106.16
|
||||
..___tag_value_computeForce.10:
|
||||
# LOE rbx r12 r13 r14 r15 xmm0
|
||||
..B1.48: # Preds ..B1.1
|
||||
# Execution count [1.00e+00]
|
||||
# Save the start timestamp at 16(%rsp); it is subtracted from the end timestamp in ..B1.43.
vmovsd %xmm0, 16(%rsp) #106.16[spill]
|
||||
# LOE rbx r12 r13 r14 r15
|
||||
..B1.2: # Preds ..B1.48
|
||||
# Execution count [1.00e+00]
# Load the fields used below from parameter 2 (%r13) and parameter 1 (%r12).
# %ecx is the element count that bounds both the zeroing loop and the outer loop
# (presumably atom->Nlocal -- confirm against the struct definition).
movl 4(%r13), %ecx #107.18
|
||||
# r11, r10, r9 = three array pointers zeroed below (presumably fx, fy, fz -- confirm).
movq 64(%r13), %r11 #109.20
|
||||
movq 72(%r13), %r10 #109.45
|
||||
movq 80(%r13), %r9 #109.70
|
||||
# xmm2 is squared in ..B1.9; xmm1 is broadcast unchanged; xmm0 is scaled by 48.0.
# NOTE(review): presumably cutforce, sigma6 and epsilon -- confirm.
vmovsd 72(%r12), %xmm2 #111.27
|
||||
vmovsd 8(%r12), %xmm1 #112.23
|
||||
vmovsd (%r12), %xmm0 #113.24
|
||||
# Count <= 0: skip all work and go straight to the closing timestamp.
testl %ecx, %ecx #116.24
|
||||
jle ..B1.42 # Prob 50% #116.24
|
||||
# LOE rbx r9 r10 r11 r13 r14 r15 ecx xmm0 xmm1 xmm2
|
||||
..B1.3: # Preds ..B1.2
|
||||
# Execution count [1.00e+00]
# Set up the zeroing loop, unrolled by two: rdi = pair counter, rsi = byte offset,
# rdx = count >> 1 (number of pairs), rax = 0 (the value stored),
# r8d = 1 so that ..B1.7 sees index 0 when the pair loop is skipped.
xorl %edi, %edi #116.5
|
||||
movl %ecx, %edx #116.5
|
||||
xorl %esi, %esi #116.5
|
||||
movl $1, %r8d #116.5
|
||||
xorl %eax, %eax #117.17
|
||||
shrl $1, %edx #116.5
|
||||
je ..B1.7 # Prob 9% #116.5
|
||||
# LOE rax rdx rbx rsi rdi r9 r10 r11 r13 r14 r15 ecx r8d xmm0 xmm1 xmm2
|
||||
..B1.5: # Preds ..B1.3 ..B1.5
|
||||
# Execution count [2.50e+00]
# Store zero to two consecutive 8-byte elements of each of the three arrays.
movq %rax, (%rsi,%r11) #117.9
|
||||
incq %rdi #116.5
|
||||
movq %rax, (%rsi,%r10) #118.9
|
||||
movq %rax, (%rsi,%r9) #119.9
|
||||
movq %rax, 8(%rsi,%r11) #117.9
|
||||
movq %rax, 8(%rsi,%r10) #118.9
|
||||
movq %rax, 8(%rsi,%r9) #119.9
|
||||
addq $16, %rsi #116.5
|
||||
cmpq %rdx, %rdi #116.5
|
||||
jb ..B1.5 # Prob 63% #116.5
|
||||
# LOE rax rdx rbx rsi rdi r9 r10 r11 r13 r14 r15 ecx xmm0 xmm1 xmm2
|
||||
..B1.6: # Preds ..B1.5
|
||||
# Execution count [9.00e-01]
# r8d = 2 * pairs + 1, i.e. one past the index of the first element not yet zeroed.
lea 1(%rdi,%rdi), %r8d #117.9
|
||||
# LOE rax rbx r9 r10 r11 r13 r14 r15 ecx r8d xmm0 xmm1 xmm2
|
||||
..B1.7: # Preds ..B1.3 ..B1.6
|
||||
# Execution count [1.00e+00]
# edx = r8d - 1 = index of the possible odd tail element; skip it if it is >= count.
lea -1(%r8), %edx #116.5
|
||||
cmpl %ecx, %edx #116.5
|
||||
jae ..B1.9 # Prob 9% #116.5
|
||||
# LOE rax rbx r9 r10 r11 r13 r14 r15 ecx r8d xmm0 xmm1 xmm2
|
||||
..B1.8: # Preds ..B1.7
|
||||
# Execution count [9.00e-01]
# Zero the single remaining element (index r8 - 1) in each array.
movslq %r8d, %r8 #116.5
|
||||
movq %rax, -8(%r11,%r8,8) #117.9
|
||||
movq %rax, -8(%r10,%r8,8) #118.9
|
||||
movq %rax, -8(%r9,%r8,8) #119.9
|
||||
# LOE rbx r9 r10 r11 r13 r14 r15 ecx xmm0 xmm1 xmm2
|
||||
..B1.9: # Preds ..B1.7 ..B1.8
|
||||
# Execution count [9.00e-01]
# Loop-invariant setup for the outer loop that starts at ..B1.10.
# xmm13 = xmm2 * xmm2, broadcast to zmm14 below.
vmulsd %xmm2, %xmm2, %xmm13 #111.45
|
||||
# rdi = 0: outer-loop index used to compute the neighbor-row address.
xorl %edi, %edi #124.15
|
||||
# ymm16 = eight int32 values of 8 (only read by a commented-out vpaddd in ..B1.23).
vmovdqu32 .L_2il0floatpacket.0(%rip), %ymm16 #153.13
|
||||
# xmm0 *= 48.0 (.L_2il0floatpacket.3 encodes the double 48.0).
vmulsd .L_2il0floatpacket.3(%rip), %xmm0, %xmm0 #177.45
|
||||
# ymm15 = lane indices 0..7, compared against counts to build tail masks.
vmovdqu .L_2il0floatpacket.1(%rip), %ymm15 #153.13
|
||||
# zmm5 = eight doubles of 0.5.
vmovups .L_2il0floatpacket.4(%rip), %zmm5 #177.58
|
||||
vbroadcastsd %xmm13, %zmm14 #111.25
|
||||
vbroadcastsd %xmm1, %zmm13 #112.21
|
||||
vbroadcastsd %xmm0, %zmm9 #177.45
|
||||
# rdx = base pointer for the coordinate loads and gathers (x, y, z interleaved,
# 24 bytes per entry as the addressing in ..B1.10 and ..B1.40 shows).
movq 16(%r13), %rdx #127.25
|
||||
# r8 = outer-loop counter.
xorl %r8d, %r8d #124.5
|
||||
# r12 = sign-extended element count (outer-loop bound).
movslq %ecx, %r12 #124.5
|
||||
# rax = byte offset of the current entry within the coordinate array.
xorl %eax, %eax #124.5
|
||||
# r13 = pointer to the per-entry int32 neighbor counts (read in ..B1.10).
movq 24(%r14), %r13 #126.25
|
||||
# rcx = int32 row length of the neighbor table, scaled to bytes by the shlq below.
movslq 16(%r14), %rcx #125.43
|
||||
# rsi = base of the neighbor index table.
movq 8(%r14), %rsi #125.19
|
||||
shlq $2, %rcx #108.5
|
||||
# Spill slots: 80 = loop bound, 88 = neighbor-count pointer, 96 = first force array
# pointer (r11), 8 = caller's r15, 0 = caller's rbx (both restored in ..B1.41).
movq %r12, 80(%rsp) #124.5[spill]
|
||||
movq %r13, 88(%rsp) #124.5[spill]
|
||||
movq %r11, 96(%rsp) #124.5[spill]
|
||||
movq %r15, 8(%rsp) #124.5[spill]
|
||||
movq %rbx, (%rsp) #124.5[spill]
|
||||
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
|
||||
..B1.10: # Preds ..B1.40 ..B1.9
|
||||
# Execution count [5.00e+00]
# Head of the outer loop (one iteration per entry r8).
# Registers on entry: r8 = index, rax = 24 * index, rdi = index for row addressing.
movq 88(%rsp), %rbx #126.25[spill]
|
||||
# xmm24, xmm18, xmm4 = scalar accumulators cleared to 0.0.
vxorpd %xmm24, %xmm24, %xmm24 #130.22
|
||||
vmovapd %xmm24, %xmm18 #131.22
|
||||
# r11d = number of neighbors of entry r8.
movl (%rbx,%r8,4), %r11d #126.25
|
||||
vmovapd %xmm18, %xmm4 #132.22
|
||||
# xmm10, xmm6, xmm12 = the three consecutive doubles of entry r8 (offsets 0, 8, 16).
vmovsd (%rax,%rdx), %xmm10 #127.25
|
||||
vmovsd 8(%rax,%rdx), %xmm6 #128.25
|
||||
vmovsd 16(%rax,%rdx), %xmm12 #129.25
|
||||
# No neighbors: go to the loop tail.
testl %r11d, %r11d #153.32
|
||||
jle ..B1.40 # Prob 50% #153.32
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d xmm4 xmm6 xmm10 xmm12 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
|
||||
..B1.11: # Preds ..B1.10
|
||||
# Execution count [4.50e+00]
# zmm8, zmm7, zmm11 = vector accumulators cleared to zero (reduced in ..B1.39).
vpxord %zmm8, %zmm8, %zmm8 #130.22
|
||||
vmovaps %zmm8, %zmm7 #131.22
|
||||
vmovaps %zmm7, %zmm11 #132.22
|
||||
# Fewer than 8 neighbors: only the masked remainder path runs (via ..B1.45).
cmpl $8, %r11d #153.13
|
||||
jl ..B1.45 # Prob 10% #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.12: # Preds ..B1.11
|
||||
# Execution count [4.50e+00]
# Fewer than 1200 neighbors: skip the alignment peel (via ..B1.44).
cmpl $1200, %r11d #153.13
|
||||
jl ..B1.44 # Prob 10% #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.13: # Preds ..B1.12
|
||||
# Execution count [4.50e+00]
# r15 = rsi + rcx * rdi = address of this entry's row of neighbor indices.
movq %rcx, %r15 #125.43
|
||||
imulq %rdi, %r15 #125.43
|
||||
addq %rsi, %r15 #108.5
|
||||
# r12 = row address modulo 64.
movq %r15, %r12 #153.13
|
||||
andq $63, %r12 #153.13
|
||||
# If the address is not a multiple of 4, no element count can align it: no peel.
testl $3, %r12d #153.13
|
||||
je ..B1.15 # Prob 50% #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.14: # Preds ..B1.13
|
||||
# Execution count [2.25e+00]
xorl %r12d, %r12d #153.13
|
||||
jmp ..B1.17 # Prob 100% #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.15: # Preds ..B1.13
|
||||
# Execution count [2.25e+00]
# Row already 64-byte aligned: peel count stays 0.
testl %r12d, %r12d #153.13
|
||||
je ..B1.17 # Prob 50% #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.16: # Preds ..B1.15
|
||||
# Execution count [2.50e+01]
# r12d = (64 - misalignment) / 4 = int32 elements up to the next 64-byte boundary,
# clamped to the neighbor count by the cmovl.
negl %r12d #153.13
|
||||
addl $64, %r12d #153.13
|
||||
shrl $2, %r12d #153.13
|
||||
cmpl %r12d, %r11d #153.13
|
||||
cmovl %r11d, %r12d #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.17: # Preds ..B1.14 ..B1.16 ..B1.15
|
||||
# Execution count [5.00e+00]
# r14d = count - ((count - peel) & 7): end index of the full 8-wide iterations.
movl %r11d, %r14d #153.13
|
||||
subl %r12d, %r14d #153.13
|
||||
andl $7, %r14d #153.13
|
||||
negl %r14d #153.13
|
||||
addl %r11d, %r14d #153.13
|
||||
# Peel count of 0: skip the peel loop.
cmpl $1, %r12d #153.13
|
||||
jb ..B1.25 # Prob 50% #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.18: # Preds ..B1.17
|
||||
# Execution count [4.50e+00]
# Masked peel loop over the first r12d neighbors (up to the 64-byte boundary).
# ymm4 = lane indices 0..7, r13 = element offset into the row, ymm3 = peel count.
vmovdqa %ymm15, %ymm4 #153.13
|
||||
xorl %r13d, %r13d #153.13
|
||||
vpbroadcastd %r12d, %ymm3 #153.13
|
||||
# zmm2, zmm1, zmm0 = the current entry's three coordinates broadcast to all lanes.
vbroadcastsd %xmm10, %zmm2 #127.23
|
||||
vbroadcastsd %xmm6, %zmm1 #128.23
|
||||
vbroadcastsd %xmm12, %zmm0 #129.23
|
||||
movslq %r12d, %rbx #153.13
|
||||
# r9 and r10 are spilled because r10d is reused for the mask below
# (r9d is only written by a commented-out kmovw).
movq %r9, 24(%rsp) #153.13[spill]
|
||||
movq %r10, 32(%rsp) #153.13[spill]
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.19: # Preds ..B1.23 ..B1.18
|
||||
# Execution count [2.50e+01]
# k3 = lanes whose index in ymm4 is below the peel count.
vpcmpgtd %ymm4, %ymm3, %k3 #153.13
|
||||
# Load up to 8 neighbor indices; masked-off lanes are zeroed.
vmovdqu32 (%r15,%r13,4), %ymm17{%k3}{z} #154.25
|
||||
kmovw %k3, %r10d #153.13
|
||||
# ymm17 = 3 * index (computed as index + 2 * index), i.e. the offset in doubles.
vpaddd %ymm17, %ymm17, %ymm18 #155.40
|
||||
vpaddd %ymm18, %ymm17, %ymm17 #155.40
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r10d r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 ymm17 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
|
||||
..B1.22: # Preds ..B1.19
|
||||
# Execution count [1.25e+01]
# Each gather consumes its mask register, so k3 is copied to k1 and k2 first.
kmovw %k3, %k1 #155.40
|
||||
kmovw %k3, %k2 #155.40
|
||||
vpxord %zmm18, %zmm18, %zmm18 #155.40
|
||||
vpxord %zmm19, %zmm19, %zmm19 #155.40
|
||||
vpxord %zmm20, %zmm20, %zmm20 #155.40
|
||||
# Gather the neighbors' doubles at byte offsets 16, 8 and 0 of each 24-byte entry.
vgatherdpd 16(%rdx,%ymm17,8), %zmm18{%k1} #155.40
|
||||
vgatherdpd 8(%rdx,%ymm17,8), %zmm19{%k2} #155.40
|
||||
vgatherdpd (%rdx,%ymm17,8), %zmm20{%k3} #155.40
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r10d r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm18 zmm19 zmm20
|
||||
..B1.23: # Preds ..B1.22
|
||||
# Execution count [2.50e+01]
addq $8, %r13 #153.13
|
||||
# Everything below down to the cmpq is commented out, so the gathered values are
# never consumed and the accumulators zmm8/zmm7/zmm11 stay zero.
# NOTE(review): the lane-index increment (vpaddd into ymm4) is disabled as well, so
# k3 is rebuilt from the same 0..7 indices on every iteration of this loop.
#vpaddd %ymm16, %ymm4, %ymm4 #153.13
|
||||
#vsubpd %zmm18, %zmm0, %zmm29 #157.40
|
||||
#vsubpd %zmm19, %zmm1, %zmm27 #156.40
|
||||
#vsubpd %zmm20, %zmm2, %zmm26 #155.40
|
||||
#vmulpd %zmm27, %zmm27, %zmm25 #158.53
|
||||
#vfmadd231pd %zmm26, %zmm26, %zmm25 #158.53
|
||||
#vfmadd231pd %zmm29, %zmm29, %zmm25 #158.67
|
||||
#vrcp14pd %zmm25, %zmm24 #175.42
|
||||
#vcmppd $1, %zmm14, %zmm25, %k2 #174.26
|
||||
#vfpclasspd $30, %zmm24, %k0 #175.42
|
||||
#kmovw %k2, %r9d #174.26
|
||||
#knotw %k0, %k1 #175.42
|
||||
#vmovaps %zmm25, %zmm17 #175.42
|
||||
#andl %r9d, %r10d #174.26
|
||||
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm24, %zmm17 #175.42
|
||||
#kmovw %r10d, %k3 #178.21
|
||||
#vmulpd %zmm17, %zmm17, %zmm18 #175.42
|
||||
#vfmadd213pd %zmm24, %zmm17, %zmm24{%k1} #175.42
|
||||
#vfmadd213pd %zmm24, %zmm18, %zmm24{%k1} #175.42
|
||||
#vmulpd %zmm13, %zmm24, %zmm19 #176.42
|
||||
#vmulpd %zmm9, %zmm24, %zmm21 #177.58
|
||||
#vmulpd %zmm19, %zmm24, %zmm22 #176.48
|
||||
#vmulpd %zmm22, %zmm24, %zmm20 #176.54
|
||||
#vfmsub213pd %zmm5, %zmm22, %zmm24 #177.58
|
||||
#vmulpd %zmm21, %zmm20, %zmm23 #177.65
|
||||
#vmulpd %zmm24, %zmm23, %zmm28 #177.71
|
||||
#vfmadd231pd %zmm26, %zmm28, %zmm8{%k3} #178.21
|
||||
#vfmadd231pd %zmm27, %zmm28, %zmm7{%k3} #179.21
|
||||
#vfmadd231pd %zmm29, %zmm28, %zmm11{%k3} #180.21
|
||||
# Repeat while the element offset is below the peel count.
cmpq %rbx, %r13 #153.13
|
||||
jb ..B1.19 # Prob 82% #153.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.24: # Preds ..B1.23
|
||||
# Execution count [4.50e+00]
movq 24(%rsp), %r9 #[spill]
|
||||
movq 32(%rsp), %r10 #[spill]
|
||||
# If the peel covered every neighbor, go straight to the reduction.
cmpl %r12d, %r11d #153.13
|
||||
je ..B1.39 # Prob 10% #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.25: # Preds ..B1.24 ..B1.17 ..B1.44
|
||||
# Execution count [2.50e+01]
# Main unmasked loop: 8 neighbors per iteration for indices r12d .. r14d-1.
# Skip it when there is no room for a full group of 8.
lea 8(%r12), %ebx #153.13
|
||||
cmpl %ebx, %r14d #153.13
|
||||
jl ..B1.33 # Prob 50% #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.26: # Preds ..B1.25
|
||||
# Execution count [4.50e+00]
# r13 = rsi + rcx * rdi = address of this entry's row of neighbor indices.
movq %rcx, %r13 #125.43
|
||||
imulq %rdi, %r13 #125.43
|
||||
# zmm1, zmm0, zmm2 = the current entry's three coordinates broadcast to all lanes.
vbroadcastsd %xmm10, %zmm1 #127.23
|
||||
vbroadcastsd %xmm6, %zmm0 #128.23
|
||||
vbroadcastsd %xmm12, %zmm2 #129.23
|
||||
# rbx = element offset into the row, starting after the peeled elements.
movslq %r12d, %rbx #153.13
|
||||
addq %rsi, %r13 #108.5
|
||||
# Spill the outer-loop state; these registers are overwritten by the scalar
# index loads in ..B1.27 and reloaded in ..B1.32.
movq %rax, 40(%rsp) #108.5[spill]
|
||||
movq %rcx, 48(%rsp) #108.5[spill]
|
||||
movq %rsi, 56(%rsp) #108.5[spill]
|
||||
movq %r8, 64(%rsp) #108.5[spill]
|
||||
movq %rdi, 72(%rsp) #108.5[spill]
|
||||
movq %r9, 24(%rsp) #108.5[spill]
|
||||
movq %r10, 32(%rsp) #108.5[spill]
|
||||
# LOE rdx rbx r13 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.27: # Preds ..B1.31 ..B1.26
|
||||
# Execution count [2.50e+01]
# ymm3 = 8 neighbor indices, then 3 * index (index + 2 * index).
vmovdqu (%r13,%rbx,4), %ymm3 #154.25
|
||||
vpaddd %ymm3, %ymm3, %ymm4 #155.40
|
||||
vpaddd %ymm4, %ymm3, %ymm3 #155.40
|
||||
# The same 8 indices are also loaded into scalar registers and tripled with lea.
# None of these scalar results is read again in the code shown before the
# registers are reloaded or recomputed.
movl (%r13,%rbx,4), %r10d #154.25
|
||||
movl 4(%r13,%rbx,4), %r9d #154.25
|
||||
movl 8(%r13,%rbx,4), %r8d #154.25
|
||||
movl 12(%r13,%rbx,4), %edi #154.25
|
||||
lea (%r10,%r10,2), %r10d #155.40
|
||||
movl 16(%r13,%rbx,4), %esi #154.25
|
||||
lea (%r9,%r9,2), %r9d #155.40
|
||||
movl 20(%r13,%rbx,4), %ecx #154.25
|
||||
lea (%r8,%r8,2), %r8d #155.40
|
||||
movl 24(%r13,%rbx,4), %eax #154.25
|
||||
lea (%rdi,%rdi,2), %edi #155.40
|
||||
movl 28(%r13,%rbx,4), %r15d #154.25
|
||||
lea (%rsi,%rsi,2), %esi #155.40
|
||||
lea (%rcx,%rcx,2), %ecx #155.40
|
||||
lea (%rax,%rax,2), %eax #155.40
|
||||
lea (%r15,%r15,2), %r15d #155.40
|
||||
# LOE rdx rbx r13 eax ecx esi edi r8d r9d r10d r11d r12d r14d r15d xmm6 xmm10 xmm12 ymm3 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.30: # Preds ..B1.27
|
||||
# Execution count [1.25e+01]
# Comparing a register with itself sets every mask bit: k1-k3 = all lanes enabled.
# Each gather clears its own mask, hence three separate masks.
vpcmpeqb %xmm0, %xmm0, %k1 #155.40
|
||||
vpcmpeqb %xmm0, %xmm0, %k2 #155.40
|
||||
vpcmpeqb %xmm0, %xmm0, %k3 #155.40
|
||||
vpxord %zmm4, %zmm4, %zmm4 #155.40
|
||||
vpxord %zmm17, %zmm17, %zmm17 #155.40
|
||||
vpxord %zmm18, %zmm18, %zmm18 #155.40
|
||||
# Gather the neighbors' doubles at byte offsets 16, 8 and 0 of each 24-byte entry.
vgatherdpd 16(%rdx,%ymm3,8), %zmm4{%k1} #155.40
|
||||
vgatherdpd 8(%rdx,%ymm3,8), %zmm17{%k2} #155.40
|
||||
vgatherdpd (%rdx,%ymm3,8), %zmm18{%k3} #155.40
|
||||
# LOE rdx rbx r13 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm17 zmm18
|
||||
..B1.31: # Preds ..B1.30
|
||||
# Execution count [2.50e+01]
# Advance both the neighbor counter and the row offset by one group of 8.
addl $8, %r12d #153.13
|
||||
addq $8, %rbx #153.13
|
||||
# The arithmetic below is commented out: the gathered values are not consumed
# and the accumulators zmm8/zmm7/zmm11 are left untouched.
#vsubpd %zmm4, %zmm2, %zmm26 #157.40
|
||||
#vsubpd %zmm17, %zmm0, %zmm24 #156.40
|
||||
#vsubpd %zmm18, %zmm1, %zmm23 #155.40
|
||||
#vmulpd %zmm24, %zmm24, %zmm3 #158.53
|
||||
#vfmadd231pd %zmm23, %zmm23, %zmm3 #158.53
|
||||
#vfmadd231pd %zmm26, %zmm26, %zmm3 #158.67
|
||||
#vrcp14pd %zmm3, %zmm22 #175.42
|
||||
#vcmppd $1, %zmm14, %zmm3, %k2 #174.26
|
||||
#vfpclasspd $30, %zmm22, %k0 #175.42
|
||||
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm22, %zmm3 #175.42
|
||||
#knotw %k0, %k1 #175.42
|
||||
#vmulpd %zmm3, %zmm3, %zmm4 #175.42
|
||||
#vfmadd213pd %zmm22, %zmm3, %zmm22{%k1} #175.42
|
||||
#vfmadd213pd %zmm22, %zmm4, %zmm22{%k1} #175.42
|
||||
#vmulpd %zmm13, %zmm22, %zmm17 #176.42
|
||||
#vmulpd %zmm9, %zmm22, %zmm19 #177.58
|
||||
#vmulpd %zmm17, %zmm22, %zmm20 #176.48
|
||||
#vmulpd %zmm20, %zmm22, %zmm18 #176.54
|
||||
#vfmsub213pd %zmm5, %zmm20, %zmm22 #177.58
|
||||
#vmulpd %zmm19, %zmm18, %zmm21 #177.65
|
||||
#vmulpd %zmm22, %zmm21, %zmm25 #177.71
|
||||
#vfmadd231pd %zmm23, %zmm25, %zmm8{%k2} #178.21
|
||||
#vfmadd231pd %zmm24, %zmm25, %zmm7{%k2} #179.21
|
||||
#vfmadd231pd %zmm26, %zmm25, %zmm11{%k2} #180.21
|
||||
# Loop while the neighbor counter is below the end of the full groups.
cmpl %r14d, %r12d #153.13
|
||||
jb ..B1.27 # Prob 82% #153.13
|
||||
# LOE rdx rbx r13 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.32: # Preds ..B1.31
|
||||
# Execution count [4.50e+00]
# Reload the outer-loop state spilled in ..B1.26.
movq 40(%rsp), %rax #[spill]
|
||||
movq 48(%rsp), %rcx #[spill]
|
||||
movq 56(%rsp), %rsi #[spill]
|
||||
movq 64(%rsp), %r8 #[spill]
|
||||
movq 72(%rsp), %rdi #[spill]
|
||||
movq 24(%rsp), %r9 #[spill]
|
||||
movq 32(%rsp), %r10 #[spill]
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.33: # Preds ..B1.32 ..B1.25 ..B1.45
|
||||
# Execution count [5.00e+00]
# Masked remainder for the neighbors from index r14d up to the count in r11d.
# Nothing left (r14d + 1 > count): go to the reduction.
lea 1(%r14), %ebx #153.13
|
||||
cmpl %r11d, %ebx #153.13
|
||||
ja ..B1.39 # Prob 50% #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.34: # Preds ..B1.33
|
||||
# Execution count [2.50e+01]
# rdi = rsi + rcx * rdi = row address. This overwrites the index held in rdi;
# ..B1.40 recomputes rdi from r8.
imulq %rcx, %rdi #125.43
|
||||
vbroadcastsd %xmm10, %zmm4 #127.23
|
||||
# r11d = number of remaining neighbors.
subl %r14d, %r11d #153.13
|
||||
addq %rsi, %rdi #108.5
|
||||
# k3 = lanes whose index (0..7 in ymm15) is below the remaining count.
vpbroadcastd %r11d, %ymm0 #153.13
|
||||
vpcmpgtd %ymm15, %ymm0, %k3 #153.13
|
||||
movslq %r14d, %r14 #153.13
|
||||
# Load the remaining neighbor indices; masked-off lanes are zeroed.
vmovdqu32 (%rdi,%r14,4), %ymm1{%k3}{z} #154.25
|
||||
kmovw %k3, %edi #153.13
|
||||
# ymm0 = 3 * index (index + 2 * index).
vpaddd %ymm1, %ymm1, %ymm2 #155.40
|
||||
vpaddd %ymm2, %ymm1, %ymm0 #155.40
|
||||
# LOE rax rdx rcx rsi r8 r9 r10 edi xmm6 xmm12 ymm0 ymm15 ymm16 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
|
||||
..B1.37: # Preds ..B1.34
|
||||
# Execution count [1.25e+01]
# Each gather consumes its mask register, so k3 is copied to k1 and k2 first.
kmovw %k3, %k1 #155.40
|
||||
kmovw %k3, %k2 #155.40
|
||||
vpxord %zmm1, %zmm1, %zmm1 #155.40
|
||||
vpxord %zmm2, %zmm2, %zmm2 #155.40
|
||||
vpxord %zmm3, %zmm3, %zmm3 #155.40
|
||||
# Gather the neighbors' doubles at byte offsets 16, 8 and 0 of each 24-byte entry.
vgatherdpd 16(%rdx,%ymm0,8), %zmm1{%k1} #155.40
|
||||
vgatherdpd 8(%rdx,%ymm0,8), %zmm2{%k2} #155.40
|
||||
vgatherdpd (%rdx,%ymm0,8), %zmm3{%k3} #155.40
|
||||
# LOE rax rdx rcx rsi r8 r9 r10 edi xmm6 xmm12 ymm15 ymm16 zmm1 zmm2 zmm3 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.38: # Preds ..B1.37
|
||||
# Execution count [2.50e+01]
# This block contains no active instructions: all of its arithmetic is commented
# out, so control falls through to ..B1.39.
#vbroadcastsd %xmm6, %zmm6 #128.23
|
||||
#vbroadcastsd %xmm12, %zmm12 #129.23
|
||||
#vsubpd %zmm1, %zmm12, %zmm23 #157.40
|
||||
#vsubpd %zmm2, %zmm6, %zmm21 #156.40
|
||||
#vsubpd %zmm3, %zmm4, %zmm20 #155.40
|
||||
#vmulpd %zmm21, %zmm21, %zmm19 #158.53
|
||||
#vfmadd231pd %zmm20, %zmm20, %zmm19 #158.53
|
||||
#vfmadd231pd %zmm23, %zmm23, %zmm19 #158.67
|
||||
#vrcp14pd %zmm19, %zmm18 #175.42
|
||||
#vcmppd $1, %zmm14, %zmm19, %k2 #174.26
|
||||
#vfpclasspd $30, %zmm18, %k0 #175.42
|
||||
#kmovw %k2, %ebx #174.26
|
||||
#knotw %k0, %k1 #175.42
|
||||
#vmovaps %zmm19, %zmm0 #175.42
|
||||
#andl %ebx, %edi #174.26
|
||||
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm18, %zmm0 #175.42
|
||||
#kmovw %edi, %k3 #178.21
|
||||
#vmulpd %zmm0, %zmm0, %zmm1 #175.42
|
||||
#vfmadd213pd %zmm18, %zmm0, %zmm18{%k1} #175.42
|
||||
#vfmadd213pd %zmm18, %zmm1, %zmm18{%k1} #175.42
|
||||
#vmulpd %zmm13, %zmm18, %zmm2 #176.42
|
||||
#vmulpd %zmm9, %zmm18, %zmm4 #177.58
|
||||
#vmulpd %zmm2, %zmm18, %zmm10 #176.48
|
||||
#vmulpd %zmm10, %zmm18, %zmm3 #176.54
|
||||
#vfmsub213pd %zmm5, %zmm10, %zmm18 #177.58
|
||||
#vmulpd %zmm4, %zmm3, %zmm17 #177.65
|
||||
#vmulpd %zmm18, %zmm17, %zmm22 #177.71
|
||||
#vfmadd231pd %zmm20, %zmm22, %zmm8{%k3} #178.21
|
||||
#vfmadd231pd %zmm21, %zmm22, %zmm7{%k3} #179.21
|
||||
#vfmadd231pd %zmm23, %zmm22, %zmm11{%k3} #180.21
|
||||
# LOE rax rdx rcx rsi r8 r9 r10 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.39: # Preds ..B1.24 ..B1.38 ..B1.33
|
||||
# Execution count [4.50e+00]
# Horizontal sum of the three 8-lane accumulators zmm11, zmm7 and zmm8.
# Step 1: zmm19 holds dword indices 8..15 twice, so vpermd copies the upper
# 256 bits of the source into both halves; adding folds 8 lanes into 4.
vmovups .L_2il0floatpacket.10(%rip), %zmm19 #132.22
|
||||
vpermd %zmm11, %zmm19, %zmm0 #132.22
|
||||
vpermd %zmm7, %zmm19, %zmm6 #131.22
|
||||
vpermd %zmm8, %zmm19, %zmm20 #130.22
|
||||
vaddpd %zmm11, %zmm0, %zmm11 #132.22
|
||||
vaddpd %zmm7, %zmm6, %zmm7 #131.22
|
||||
vaddpd %zmm8, %zmm20, %zmm8 #130.22
|
||||
# Step 2: immediate 78 (0x4E) swaps the two 128-bit pairs inside each 256-bit
# half; adding folds 4 lanes into 2.
vpermpd $78, %zmm11, %zmm1 #132.22
|
||||
vpermpd $78, %zmm7, %zmm10 #131.22
|
||||
vpermpd $78, %zmm8, %zmm21 #130.22
|
||||
vaddpd %zmm1, %zmm11, %zmm2 #132.22
|
||||
vaddpd %zmm10, %zmm7, %zmm12 #131.22
|
||||
vaddpd %zmm21, %zmm8, %zmm22 #130.22
|
||||
# Step 3: immediate 177 (0xB1) swaps adjacent doubles; after the add, lane 0 of
# zmm4, zmm18 and zmm24 holds the full sum of zmm11, zmm7 and zmm8 respectively.
vpermpd $177, %zmm2, %zmm3 #132.22
|
||||
vpermpd $177, %zmm12, %zmm17 #131.22
|
||||
vpermpd $177, %zmm22, %zmm23 #130.22
|
||||
vaddpd %zmm3, %zmm2, %zmm4 #132.22
|
||||
vaddpd %zmm17, %zmm12, %zmm18 #131.22
|
||||
vaddpd %zmm23, %zmm22, %zmm24 #130.22
|
||||
# LOE rax rdx rcx rsi r8 r9 r10 xmm4 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
|
||||
..B1.40: # Preds ..B1.39 ..B1.10
|
||||
# Execution count [5.00e+00]
# Tail of the outer loop.
# rbx = first force array pointer (spilled from r11 in ..B1.9); only the
# commented-out stores below would read it.
movq 96(%rsp), %rbx #188.9[spill]
|
||||
# Advance to the next 24-byte coordinate entry.
addq $24, %rax #124.5
|
||||
# rdi = r8 + 1 = index of the next entry, used for the neighbor-row address.
movslq %r8d, %rdi #124.32
|
||||
incq %rdi #124.32
|
||||
# The accumulate-and-store into the three force arrays is commented out, so the
# arrays keep the zeros written by the initial zeroing loop.
#vaddsd (%rbx,%r8,8), %xmm24, %xmm0 #188.9
|
||||
#vmovsd %xmm0, (%rbx,%r8,8) #188.9
|
||||
#vaddsd (%r10,%r8,8), %xmm18, %xmm1 #189.9
|
||||
#vmovsd %xmm1, (%r10,%r8,8) #189.9
|
||||
#vaddsd (%r9,%r8,8), %xmm4, %xmm2 #190.9
|
||||
#vmovsd %xmm2, (%r9,%r8,8) #190.9
|
||||
# Next entry; loop while r8 is below the bound spilled at 80(%rsp).
incq %r8 #124.5
|
||||
cmpq 80(%rsp), %r8 #124.5[spill]
|
||||
jb ..B1.10 # Prob 82% #124.5
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
|
||||
..B1.41: # Preds ..B1.40
|
||||
# Execution count [9.00e-01]
# Restore the caller's r15 and rbx from the slots written in ..B1.9.
movq 8(%rsp), %r15 #[spill]
|
||||
.cfi_restore 15
|
||||
movq (%rsp), %rbx #[spill]
|
||||
.cfi_restore 3
|
||||
# LOE rbx r15
|
||||
..B1.42: # Preds ..B1.2 ..B1.41
|
||||
# Execution count [1.00e+00]
# Common exit path, also reached directly from ..B1.2 when the count is <= 0.
xorl %eax, %eax #201.16
|
||||
# Clear the upper vector state before calling out of this AVX-512 code.
vzeroupper #201.16
|
||||
..___tag_value_computeForce.43:
|
||||
# getTimeStamp()
|
||||
call getTimeStamp #201.16
|
||||
..___tag_value_computeForce.44:
|
||||
# LOE rbx r15 xmm0
|
||||
..B1.43: # Preds ..B1.42
|
||||
# Execution count [1.00e+00]
# Return value: xmm0 = end timestamp - start timestamp saved at 16(%rsp).
vsubsd 16(%rsp), %xmm0, %xmm0 #204.14[spill]
|
||||
# Undo the prologue in reverse order.
addq $104, %rsp #204.14
|
||||
.cfi_restore 14
|
||||
popq %r14 #204.14
|
||||
.cfi_restore 13
|
||||
popq %r13 #204.14
|
||||
.cfi_restore 12
|
||||
popq %r12 #204.14
|
||||
# Drop the 64-byte alignment padding by restoring rsp from the frame pointer.
movq %rbp, %rsp #204.14
|
||||
popq %rbp #204.14
|
||||
.cfi_def_cfa 7, 8
|
||||
.cfi_restore 6
|
||||
ret #204.14
|
||||
.cfi_def_cfa 6, 16
|
||||
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_offset 6, -16
|
||||
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
|
||||
# LOE
|
||||
..B1.44: # Preds ..B1.12
|
||||
# Execution count [4.50e-01]: Infreq
# Out-of-line path for 8 <= count < 1200: no peel (r12d = 0) and the full-group
# end index is the count rounded down to a multiple of 8.
movl %r11d, %r14d #153.13
|
||||
xorl %r12d, %r12d #153.13
|
||||
andl $-8, %r14d #153.13
|
||||
jmp ..B1.25 # Prob 100% #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.45: # Preds ..B1.11
|
||||
# Execution count [4.50e-01]: Infreq
# Out-of-line path for count < 8: the remainder starts at index 0.
xorl %r14d, %r14d #153.13
|
||||
jmp ..B1.33 # Prob 100% #153.13
|
||||
.align 16,0x90
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
.cfi_endproc
|
||||
# mark_end;
|
||||
.type computeForce,@function
|
||||
.size computeForce,.-computeForce
|
||||
..LNcomputeForce.0:
|
||||
.data
|
||||
# -- End computeForce
|
||||
.section .rodata, "a"
|
||||
# Read-only constants emitted by the compiler for computeForce.
# Each 64-bit double is written as two .long values, low dword first.
.align 64
|
||||
.align 64
|
||||
# Eight doubles of 1.0 (0x3ff0000000000000). Not referenced by the function above.
.L_2il0floatpacket.2:
|
||||
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
|
||||
.type .L_2il0floatpacket.2,@object
|
||||
.size .L_2il0floatpacket.2,64
|
||||
.align 64
|
||||
# Eight doubles of 0.5 (0x3fe0000000000000); loaded into zmm5 in ..B1.9.
.L_2il0floatpacket.4:
|
||||
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
|
||||
.type .L_2il0floatpacket.4,@object
|
||||
.size .L_2il0floatpacket.4,64
|
||||
.align 64
|
||||
# Byte table followed by zero padding. Not referenced by the function above.
.L_2il0floatpacket.5:
|
||||
.long 0x02010101,0x04040202,0x08080804,0x20101010,0x40402020,0x80808040,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000
|
||||
.type .L_2il0floatpacket.5,@object
|
||||
.size .L_2il0floatpacket.5,64
|
||||
.align 64
|
||||
# Eight 64-bit values 0,4,8,12,1,5,9,13. Not referenced by the function above.
.L_2il0floatpacket.6:
|
||||
.long 0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000,0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000
|
||||
.type .L_2il0floatpacket.6,@object
|
||||
.size .L_2il0floatpacket.6,64
|
||||
.align 64
|
||||
# Eight 64-bit values 1,5,9,13,0,4,8,12. Not referenced by the function above.
.L_2il0floatpacket.7:
|
||||
.long 0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000,0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000
|
||||
.type .L_2il0floatpacket.7,@object
|
||||
.size .L_2il0floatpacket.7,64
|
||||
.align 64
|
||||
# Eight 64-bit values 2,6,10,14,2,6,10,14. Not referenced by the function above.
.L_2il0floatpacket.8:
|
||||
.long 0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000,0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000
|
||||
.type .L_2il0floatpacket.8,@object
|
||||
.size .L_2il0floatpacket.8,64
|
||||
.align 64
|
||||
# Sixteen dword indices 8..15, 8..15: the vpermd selector used in ..B1.39 to copy
# the upper 256 bits of an accumulator into both halves.
.L_2il0floatpacket.10:
|
||||
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
|
||||
.type .L_2il0floatpacket.10,@object
|
||||
.size .L_2il0floatpacket.10,64
|
||||
.align 32
|
||||
# Eight int32 values of 8; loaded into ymm16 in ..B1.9.
.L_2il0floatpacket.0:
|
||||
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
|
||||
.type .L_2il0floatpacket.0,@object
|
||||
.size .L_2il0floatpacket.0,32
|
||||
.align 32
|
||||
# Eight int32 lane indices 0..7; loaded into ymm15 in ..B1.9 to build tail masks.
.L_2il0floatpacket.1:
|
||||
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
|
||||
.type .L_2il0floatpacket.1,@object
|
||||
.size .L_2il0floatpacket.1,32
|
||||
.align 8
|
||||
# The double 48.0 (0x4048000000000000); multiplies xmm0 in ..B1.9.
.L_2il0floatpacket.3:
|
||||
.long 0x00000000,0x40480000
|
||||
.type .L_2il0floatpacket.3,@object
|
||||
.size .L_2il0floatpacket.3,8
|
||||
.align 8
|
||||
# The double 1.0; referenced only by commented-out vfnmadd213pd instructions.
.L_2il0floatpacket.9:
|
||||
.long 0x00000000,0x3ff00000
|
||||
.type .L_2il0floatpacket.9,@object
|
||||
.size .L_2il0floatpacket.9,8
|
||||
.data
|
||||
.section .note.GNU-stack, ""
|
||||
# End
|
||||
@@ -1,324 +0,0 @@
|
||||
.intel_syntax noprefix
|
||||
|
||||
.text
|
||||
.align 16,0x90
|
||||
.globl computeForce
|
||||
computeForce:
|
||||
# parameter 1: rdi Parameter*
|
||||
# parameter 2: rsi Atom*
|
||||
# parameter 3: rdx Neighbor*
|
||||
push rbp
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
push rbx
|
||||
#call getTimeStamp # xmm0 <- getTimeStamp()
|
||||
#vmovsd QWORD PTR [-56+rsp], xmm0 # [-56+rsp] <- xmm0 [spill]
|
||||
mov r9d, DWORD PTR [4+rsi] # r9d <- atom->Nlocal
|
||||
vmovsd xmm2, QWORD PTR [96+rdi] # xmm2 <- param->cutforce
|
||||
vmovsd xmm1, QWORD PTR [32+rdi] # xmm1 <- param->sigma6
|
||||
vmovsd xmm0, QWORD PTR [24+rdi] # xmm0 <- param->epsilon
|
||||
mov r13, QWORD PTR [64+rsi] # r13 <- atom->fx
|
||||
mov r14, QWORD PTR [72+rsi] # r14 <- atom->fy
|
||||
mov rdi, QWORD PTR [80+rsi] # rdi <- atom->fz
|
||||
test r9d, r9d # atom->Nlocal <= 0
|
||||
jle ..atom_loop_exit
|
||||
xor r10d, r10d # r10d <- 0
|
||||
mov ecx, r9d # ecx <- atom->Nlocal
|
||||
xor r8d, r8d # r8d <- 0
|
||||
mov r11d, 1 # r11d <- 1
|
||||
xor eax, eax # eax <- 0
|
||||
shr ecx, 1 # ecx <- atom->Nlocal >> 1
|
||||
je ..zero_last_element # ecx == 0
|
||||
|
||||
# Init forces to zero loop (unroll factor = 2)
|
||||
..init_force_loop:
|
||||
mov QWORD PTR [r8+r13], rax # fx[i] <- 0
|
||||
mov QWORD PTR [r8+r14], rax # fy[i] <- 0
|
||||
mov QWORD PTR [r8+rdi], rax # fz[i] <- 0
|
||||
mov QWORD PTR [8+r8+r13], rax # fx[i] <- 0
|
||||
mov QWORD PTR [8+r8+r14], rax # fy[i] <- 0
|
||||
mov QWORD PTR [8+r8+rdi], rax # fz[i] <- 0
|
||||
add r8, 16 # i++
|
||||
inc r10 # i++
|
||||
cmp r10, rcx # i < Nlocal
|
||||
jb ..init_force_loop
|
||||
|
||||
# Trick to make r11d contain value of last element to be zeroed plus 1
|
||||
# Maybe we can directly put r10+r10 here and zero r11d above, then remove the -1 below
|
||||
lea r11d, DWORD PTR [1+r10+r10] # r11d <- i * 2 + 1
|
||||
..zero_last_element:
|
||||
lea ecx, DWORD PTR [-1+r11] # ecx <- i * 2
|
||||
cmp ecx, r9d # i >= Nlocal
|
||||
jae ..before_atom_loop
|
||||
|
||||
# Set last element to zero
|
||||
movsxd r11, r11d # r11 <- i * 2
|
||||
mov QWORD PTR [-8+r13+r11*8], rax # fx[i] <- 0
|
||||
mov QWORD PTR [-8+r14+r11*8], rax # fy[i] <- 0
|
||||
mov QWORD PTR [-8+rdi+r11*8], rax # fz[i] <- 0
|
||||
|
||||
# Initialize registers to be used within atom loop
|
||||
..before_atom_loop:
|
||||
vmulsd xmm15, xmm2, xmm2 # xmm15 <- cutforcesq
|
||||
vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip] # ymm18 <- [8, ...]
|
||||
vmulsd xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip] # xmm0 <- 48 * epsilon
|
||||
vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip] # ymm17 <- [0..7]
|
||||
vmovups zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip] # zmm7 <- [0.5, ...]
|
||||
vbroadcastsd zmm16, xmm15 # zmm16 <- [cutforcesq, ...]
|
||||
vbroadcastsd zmm15, xmm1 # zmm15 <- [param->sigma6, ...]
|
||||
vbroadcastsd zmm14, xmm0 # zmm14 <- [48 * epsilon, ...]
|
||||
movsxd r9, r9d # r9 <- atom->Nlocal
|
||||
xor r10d, r10d # r10d <- 0 (i)
|
||||
mov rcx, QWORD PTR [24+rdx] # rcx <- neighbor->numneigh
|
||||
mov r11, QWORD PTR [8+rdx] # r11 <- neighbor->neighbors
|
||||
movsxd r12, DWORD PTR [16+rdx] # r12 <- neighbor->maxneighs
|
||||
mov rdx, QWORD PTR [16+rsi] # rdx <- atom->x
|
||||
### AOS
|
||||
xor eax, eax
|
||||
### SOA
|
||||
#mov rax, QWORD PTR [24+rsi] # rax <- atom->y
|
||||
#mov rsi, QWORD PTR [32+rsi] # rsi <- atom->z
|
||||
###
|
||||
shl r12, 2 # r12 <- neighbor->maxneighs * 4
|
||||
|
||||
# Register spilling
|
||||
mov QWORD PTR [-32+rsp], r9 # [-32+rsp] <- atom->Nlocal
|
||||
mov QWORD PTR [-24+rsp], rcx # [-24+rsp] <- neighbor->numneigh
|
||||
mov QWORD PTR [-16+rsp], r14 # [-16+rsp] <- atom->fy
|
||||
mov QWORD PTR [-8+rsp], r13 # [-8+rsp] <- atom->fx
|
||||
mov QWORD PTR [-40+rsp], r15 # [-40+rsp] <- r15
|
||||
mov QWORD PTR [-48+rsp], rbx # [-48+rsp] <- rbx
|
||||
|
||||
..atom_loop_begin:
|
||||
mov rcx, QWORD PTR [-24+rsp] # rcx <- neighbor->numneigh
|
||||
vxorpd xmm25, xmm25, xmm25 # xmm25 <- 0 (fix)
|
||||
vmovapd xmm20, xmm25 # xmm20 <- 0 (fiy)
|
||||
mov r13d, DWORD PTR [rcx+r10*4] # r13d <- neighbor->numneigh[i] (numneighs)
|
||||
vmovapd xmm4, xmm20 # xmm4 <- 0 (fiz)
|
||||
|
||||
### AOS
|
||||
vmovsd xmm8, QWORD PTR[rdx+rax] # xmm8 <- atom->x[i * 3]
|
||||
vmovsd xmm9, QWORD PTR[8+rdx+rax] # xmm9 <- atom->x[i * 3 + 1]
|
||||
vmovsd xmm10, QWORD PTR[16+rdx+rax] # xmm10 <- atom->x[i * 3 + 2]
|
||||
### SOA
|
||||
#vmovsd xmm8, QWORD PTR [rdx+r10*8] # xmm8 <- atom->x[i]
|
||||
#vmovsd xmm9, QWORD PTR [rax+r10*8] # xmm9 <- atom->y[i]
|
||||
#vmovsd xmm10, QWORD PTR [rsi+r10*8] # xmm10 <- atom->z[i]
|
||||
###
|
||||
vbroadcastsd zmm0, xmm8 # zmm0 <- atom_x(i)
|
||||
vbroadcastsd zmm1, xmm9 # zmm1 <- atom_y(i)
|
||||
vbroadcastsd zmm2, xmm10 # zmm2 <- atom_z(i)
|
||||
test r13d, r13d # numneighs <= 0
|
||||
jle ..atom_loop_exit
|
||||
|
||||
vpxord zmm13, zmm13, zmm13 # zmm13 <- 0 (fix)
|
||||
vmovaps zmm12, zmm13 # zmm12 <- 0 (fiy)
|
||||
vmovaps zmm11, zmm12 # zmm11 <- 0 (fiz)
|
||||
mov rcx, r12 # rcx <- neighbor->maxneighs * 4
|
||||
imul rcx, r10 # rcx <- neighbor->maxneighs * 4 * i
|
||||
add rcx, r11 # rcx <- &neighbor->neighbors[neighbor->maxneighs * i]
|
||||
xor r9d, r9d # r9d <- 0 (k)
|
||||
mov r14d, r13d # r14d <- numneighs
|
||||
cmp r14d, 8
|
||||
jl ..compute_forces_remainder
|
||||
|
||||
..compute_forces:
|
||||
vpcmpeqb k1, xmm0, xmm0
|
||||
vpcmpeqb k2, xmm0, xmm0
|
||||
vpcmpeqb k3, xmm0, xmm0
|
||||
vmovdqu ymm3, YMMWORD PTR [rcx+r9*4]
|
||||
vpxord zmm5, zmm5, zmm5
|
||||
vpxord zmm6, zmm6, zmm6
|
||||
|
||||
### AOS
|
||||
vpaddd ymm4, ymm3, ymm3
|
||||
vpaddd ymm3, ymm3, ymm4
|
||||
vpxord zmm4, zmm4, zmm4
|
||||
vgatherdpd zmm4{k1}, [rdx+ymm3*8]
|
||||
vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]
|
||||
vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]
|
||||
### SOA
|
||||
#vpxord zmm4, zmm4, zmm4
|
||||
#vgatherdpd zmm5{k2}, [rax+ymm3*8]
|
||||
#vgatherdpd zmm4{k1}, [rdx+ymm3*8]
|
||||
#vgatherdpd zmm6{k3}, [rsi+ymm3*8]
|
||||
###
|
||||
|
||||
vsubpd zmm29, zmm1, zmm5 # zmm29 <- atom_y(i) - atom_y(j) -- dely
|
||||
vsubpd zmm28, zmm0, zmm4 # zmm28 <- atom_x(i) - atom_x(j) -- delx
|
||||
vsubpd zmm31, zmm2, zmm6 # zmm31 <- atom_z(i) - atom_z(j) -- delz
|
||||
vmulpd zmm20, zmm29, zmm29 # zmm20 <- dely * dely
|
||||
vfmadd231pd zmm20, zmm28, zmm28 # zmm20 <- dely * dely + delx * delx
|
||||
vfmadd231pd zmm20, zmm31, zmm31 # zmm20 <- zmm20 + delz * delz -- rsq
|
||||
|
||||
# Cutoff radius condition
|
||||
vrcp14pd zmm27, zmm20 # zmm27 <- 1.0 / rsq (sr2)
|
||||
vcmppd k5, zmm20, zmm16, 1 # k5 <- rsq < cutforcesq
|
||||
vmulpd zmm22, zmm27, zmm15 # zmm22 <- sr2 * sigma6
|
||||
vmulpd zmm24, zmm27, zmm14 # zmm24 <- 48.0 * epsilon * sr2
|
||||
vmulpd zmm25, zmm27, zmm22 # zmm25 <- sr2 * sigma6 * sr2
|
||||
vmulpd zmm23, zmm27, zmm25 # zmm23 <- sr2 * sigma6 * sr2 * sr2
|
||||
vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma * sr2 * sr2 - 0.5
|
||||
vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
|
||||
vmulpd zmm30, zmm26, zmm27 # zmm30 <- force
|
||||
vfmadd231pd zmm13{k5}, zmm30, zmm28 # fix += force * delx
|
||||
vfmadd231pd zmm12{k5}, zmm30, zmm29 # fiy += force * dely
|
||||
vfmadd231pd zmm11{k5}, zmm30, zmm31 # fiz += force * delz
|
||||
sub r14d, 8
|
||||
add r9, 8
|
||||
cmp r14d, 8
|
||||
jge ..compute_forces
|
||||
|
||||
# Check if there are remaining neighbors to be computed
|
||||
..compute_forces_remainder:
|
||||
test r14d, r14d
|
||||
jle ..sum_up_forces
|
||||
|
||||
vpbroadcastd ymm4, r14d
|
||||
vpcmpgtd k1, ymm4, ymm17
|
||||
kmovw r15d, k1
|
||||
vmovdqu32 ymm3{k1}{z}, YMMWORD PTR [rcx+r9*4]
|
||||
kmovw k2, k1
|
||||
kmovw k3, k1
|
||||
vpxord zmm5, zmm5, zmm5
|
||||
vpxord zmm6, zmm6, zmm6
|
||||
|
||||
### AOS
|
||||
vpaddd ymm4, ymm3, ymm3
|
||||
vpaddd ymm3, ymm3, ymm4
|
||||
vpxord zmm4, zmm4, zmm4
|
||||
vgatherdpd zmm4{k1}, [rdx+ymm3*8]
|
||||
vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]
|
||||
vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]
|
||||
#### SOA
|
||||
#vpxord zmm4, zmm4, zmm4
|
||||
#vgatherdpd zmm5{k2}, [rax+ymm3*8]
|
||||
#vgatherdpd zmm4{k1}, [rdx+ymm3*8]
|
||||
#vgatherdpd zmm6{k3}, [rsi+ymm3*8]
|
||||
###
|
||||
|
||||
vsubpd zmm29, zmm1, zmm5 # zmm29 <- atom_y(i) - atom_y(j) -- dely
|
||||
vsubpd zmm28, zmm0, zmm4 # zmm28 <- atom_x(i) - atom_x(j) -- delx
|
||||
vsubpd zmm31, zmm2, zmm6 # zmm31 <- atom_z(i) - atom_z(j) -- delz
|
||||
vmulpd zmm20, zmm29, zmm29 # zmm20 <- dely * dely
|
||||
vfmadd231pd zmm20, zmm28, zmm28 # zmm20 <- dely * dely + delx * delx
|
||||
vfmadd231pd zmm20, zmm31, zmm31 # zmm20 <- zmm20 + delz * delz -- rsq
|
||||
|
||||
# Cutoff radius condition
|
||||
vrcp14pd zmm27, zmm20 # zmm27 <- 1.0 / rsq (sr2)
|
||||
vcmppd k5, zmm20, zmm16, 1 # k5 <- rsq < cutforcesq
|
||||
kmovw r9d, k5 # r9d <- rsq < cutforcesq
|
||||
and r15d, r9d # r15d <- rsq < cutforcesq && k < numneighs
|
||||
kmovw k3, r15d # k3 <- rsq < cutforcesq && k < numneighs
|
||||
vmulpd zmm22, zmm27, zmm15 # zmm22 <- sr2 * sigma6
|
||||
vmulpd zmm24, zmm27, zmm14 # zmm24 <- 48.0 * epsilon * sr2
|
||||
vmulpd zmm25, zmm27, zmm22 # zmm25 <- sr2 * sigma6 * sr2
|
||||
vmulpd zmm23, zmm27, zmm25 # zmm23 <- sr2 * sigma6 * sr2 * sr2
|
||||
vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma * sr2 * sr2 - 0.5
|
||||
vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
|
||||
vmulpd zmm30, zmm26, zmm27 # zmm30 <- force
|
||||
vfmadd231pd zmm13{k3}, zmm30, zmm28 # fix += force * delx
|
||||
vfmadd231pd zmm12{k3}, zmm30, zmm29 # fiy += force * dely
|
||||
vfmadd231pd zmm11{k3}, zmm30, zmm31 # fiz += force * delz
|
||||
|
||||
# Forces are currently separated in different lanes of zmm registers, hence it is necessary to permutate
|
||||
# and add them (reduction) to obtain the final contribution for the current atom
|
||||
..sum_up_forces:
|
||||
vmovups zmm10, ZMMWORD PTR .L_2il0floatpacket.6[rip]
|
||||
vpermd zmm0, zmm10, zmm11
|
||||
vpermd zmm5, zmm10, zmm12
|
||||
vpermd zmm21, zmm10, zmm13
|
||||
vaddpd zmm11, zmm0, zmm11
|
||||
vaddpd zmm12, zmm5, zmm12
|
||||
vaddpd zmm13, zmm21, zmm13
|
||||
vpermpd zmm1, zmm11, 78
|
||||
vpermpd zmm6, zmm12, 78
|
||||
vpermpd zmm22, zmm13, 78
|
||||
vaddpd zmm2, zmm11, zmm1
|
||||
vaddpd zmm8, zmm12, zmm6
|
||||
vaddpd zmm23, zmm13, zmm22
|
||||
vpermpd zmm3, zmm2, 177
|
||||
vpermpd zmm9, zmm8, 177
|
||||
vpermpd zmm24, zmm23, 177
|
||||
vaddpd zmm4, zmm2, zmm3
|
||||
vaddpd zmm20, zmm8, zmm9
|
||||
vaddpd zmm25, zmm23, zmm24
|
||||
|
||||
..atom_loop_exit:
|
||||
mov rcx, QWORD PTR [-8+rsp] #84.9[spill]
|
||||
mov rbx, QWORD PTR [-16+rsp] #85.9[spill]
|
||||
|
||||
### AOS
|
||||
add rax, 24
|
||||
###
|
||||
|
||||
vaddsd xmm0, xmm25, QWORD PTR [rcx+r10*8] #84.9
|
||||
vmovsd QWORD PTR [rcx+r10*8], xmm0 #84.9
|
||||
vaddsd xmm1, xmm20, QWORD PTR [rbx+r10*8] #85.9
|
||||
vmovsd QWORD PTR [rbx+r10*8], xmm1 #85.9
|
||||
vaddsd xmm2, xmm4, QWORD PTR [rdi+r10*8] #86.9
|
||||
vmovsd QWORD PTR [rdi+r10*8], xmm2 #86.9
|
||||
inc r10 #55.5
|
||||
cmp r10, QWORD PTR [-32+rsp] #55.5[spill]
|
||||
jb ..atom_loop_begin
|
||||
vzeroupper #93.12
|
||||
vxorpd xmm0, xmm0, xmm0 #93.12
|
||||
#call getTimeStamp # xmm0 <- getTimeStamp()
|
||||
#vsubsd xmm0, xmm0, QWORD PTR [-56+rsp] # xmm0 <- E-S
|
||||
pop rbx
|
||||
pop r15
|
||||
pop r14 #93.12
|
||||
pop r13 #93.12
|
||||
pop r12 #93.12
|
||||
pop rbp #93.12
|
||||
ret #93.12
|
||||
|
||||
.type computeForce,@function
|
||||
.size computeForce,.-computeForce
|
||||
|
||||
|
||||
..LNcomputeForce.0:
|
||||
.data
|
||||
# -- End computeForce
|
||||
.section .rodata, "a"
|
||||
.align 64
|
||||
.align 64
|
||||
.L_2il0floatpacket.2:
|
||||
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
|
||||
.type .L_2il0floatpacket.2,@object
|
||||
.size .L_2il0floatpacket.2,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.4:
|
||||
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
|
||||
.type .L_2il0floatpacket.4,@object
|
||||
.size .L_2il0floatpacket.4,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.6:
|
||||
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
|
||||
.type .L_2il0floatpacket.6,@object
|
||||
.size .L_2il0floatpacket.6,64
|
||||
.align 32
|
||||
.L_2il0floatpacket.0:
|
||||
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
|
||||
.type .L_2il0floatpacket.0,@object
|
||||
.size .L_2il0floatpacket.0,32
|
||||
.align 32
|
||||
.L_2il0floatpacket.1:
|
||||
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
|
||||
.type .L_2il0floatpacket.1,@object
|
||||
.size .L_2il0floatpacket.1,32
|
||||
.align 8
|
||||
.L_2il0floatpacket.3:
|
||||
.long 0x00000000,0x40480000
|
||||
.type .L_2il0floatpacket.3,@object
|
||||
.size .L_2il0floatpacket.3,8
|
||||
.align 8
|
||||
.L_2il0floatpacket.5:
|
||||
.long 0x00000000,0x3ff00000
|
||||
.type .L_2il0floatpacket.5,@object
|
||||
.size .L_2il0floatpacket.5,8
|
||||
.data
|
||||
.section .note.GNU-stack, ""
|
||||
# End
|
||||
@@ -1,326 +0,0 @@
|
||||
.intel_syntax noprefix

.text
.align 16,0x90
.globl computeForceLJ
#-----------------------------------------------------------------------
# computeForceLJ -- Lennard-Jones force kernel (AVX-512, AoS positions)
#
# In:    rdi = Parameter*   (reads cutforce, sigma6, epsilon)
#        rsi = Atom*        (reads Nlocal, x; writes fx, fy, fz)
#        rdx = Neighbor*    (reads numneigh, neighbors, maxneighs)
# Out:   xmm0 = 0.0 (the timing calls are commented out)
# Saved: rbp, r12-r15, rbx are pushed on entry and popped on exit.
# Stack: spill slots live below rsp ([-8+rsp] .. [-48+rsp]) without
#        adjusting rsp; this relies on the function staying a leaf
#        (the getTimeStamp calls are commented out).
#
# Register roles inside the atom loop:
#   r10 = i (atom index)            rax = byte offset of atom i in x (i * 24)
#   rdx = atom->x                   rdi = atom->fz
#   r11 = neighbor->neighbors       r12 = neighbor->maxneighs * 4
#   rcx = &neighbors[maxneighs * i] r9  = k (neighbor index)
#   r14 = neighbors still to process
#   zmm0/1/2    = broadcast x/y/z of atom i
#   zmm11/12/13 = per-lane fiz/fiy/fix accumulators
#   zmm14 = 48 * epsilon, zmm15 = sigma6, zmm16 = cutforcesq, zmm7 = 0.5
#   ymm17 = [0..7] lane indices used to build the remainder mask
#-----------------------------------------------------------------------
computeForceLJ:
# parameter 1: rdi Parameter*
# parameter 2: rsi Atom*
# parameter 3: rdx Neighbor*
    push      rbp
    push      r12
    push      r13
    push      r14
    push      r15
    push      rbx
    mov       r9d, DWORD PTR [4+rsi]                        # r9d <- atom->Nlocal
    vmovsd    xmm2, QWORD PTR [96+rdi]                      # xmm2 <- param->cutforce
    vmovsd    xmm1, QWORD PTR [32+rdi]                      # xmm1 <- param->sigma6
    vmovsd    xmm0, QWORD PTR [24+rdi]                      # xmm0 <- param->epsilon
    mov       r13, QWORD PTR [64+rsi]                       # r13 <- atom->fx
    mov       r14, QWORD PTR [72+rsi]                       # r14 <- atom->fy
    mov       rdi, QWORD PTR [80+rsi]                       # rdi <- atom->fz
    test      r9d, r9d                                      # atom->Nlocal <= 0 ?
    # Nothing to do: go straight to the epilogue. Jumping to ..atom_loop_exit
    # here would use spill slots and a loop index that are not set up yet.
    jle       ..compute_force_lj_return
    xor       r10d, r10d                                    # r10d <- 0 (pair counter)
    mov       ecx, r9d                                      # ecx <- atom->Nlocal
    xor       r8d, r8d                                      # r8d <- 0 (byte offset)
    mov       r11d, 1                                       # r11d <- 1 (first unzeroed index + 1)
    xor       eax, eax                                      # eax <- 0 (value stored into forces)
    shr       ecx, 1                                        # ecx <- atom->Nlocal >> 1 (number of pairs)
    je        ..zero_last_element                           # no complete pair (Nlocal == 1)

# Init forces to zero loop (unroll factor = 2)
..init_force_loop:
    mov       QWORD PTR [r8+r13], rax                       # fx[2p] <- 0
    mov       QWORD PTR [r8+r14], rax                       # fy[2p] <- 0
    mov       QWORD PTR [r8+rdi], rax                       # fz[2p] <- 0
    mov       QWORD PTR [8+r8+r13], rax                     # fx[2p+1] <- 0
    mov       QWORD PTR [8+r8+r14], rax                     # fy[2p+1] <- 0
    mov       QWORD PTR [8+r8+rdi], rax                     # fz[2p+1] <- 0
    add       r8, 16                                        # advance two doubles
    inc       r10                                           # p++
    cmp       r10, rcx                                      # p < Nlocal / 2
    jb        ..init_force_loop

# Trick to make r11d contain the index of the last element to be zeroed plus 1
# Possible simplification (untested): compute r10+r10 here, zero r11d above,
# and drop the -1 adjustments below
    lea       r11d, DWORD PTR [1+r10+r10]                   # r11d <- p * 2 + 1
..zero_last_element:
    lea       ecx, DWORD PTR [-1+r11]                       # ecx <- p * 2 (first index not yet zeroed)
    cmp       ecx, r9d                                      # p * 2 >= Nlocal ?
    jae       ..before_atom_loop                            # even Nlocal: everything is zeroed

# Set last element to zero (odd Nlocal)
    movsxd    r11, r11d                                     # r11 <- p * 2 + 1
    mov       QWORD PTR [-8+r13+r11*8], rax                 # fx[p * 2] <- 0
    mov       QWORD PTR [-8+r14+r11*8], rax                 # fy[p * 2] <- 0
    mov       QWORD PTR [-8+rdi+r11*8], rax                 # fz[p * 2] <- 0

# Initialize registers to be used within atom loop
..before_atom_loop:
    vmulsd    xmm15, xmm2, xmm2                             # xmm15 <- cutforcesq
    vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip]  # ymm18 <- [8, ...]
    vmulsd    xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip] # xmm0 <- 48 * epsilon
    vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip]  # ymm17 <- [0..7]
    vmovups   zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip]   # zmm7 <- [0.5, ...]
    vbroadcastsd zmm16, xmm15                               # zmm16 <- [cutforcesq, ...]
    vbroadcastsd zmm15, xmm1                                # zmm15 <- [param->sigma6, ...]
    vbroadcastsd zmm14, xmm0                                # zmm14 <- [48 * epsilon, ...]
    movsxd    r9, r9d                                       # r9 <- atom->Nlocal
    xor       r10d, r10d                                    # r10d <- 0 (i)
    mov       rcx, QWORD PTR [24+rdx]                       # rcx <- neighbor->numneigh
    mov       r11, QWORD PTR [8+rdx]                        # r11 <- neighbor->neighbors
    movsxd    r12, DWORD PTR [16+rdx]                       # r12 <- neighbor->maxneighs
    mov       rdx, QWORD PTR [16+rsi]                       # rdx <- atom->x
### AOS
    xor       eax, eax                                      # rax <- 0 (byte offset of atom i in x)
### SOA
    #mov       rax, QWORD PTR [24+rsi]                      # rax <- atom->y
    #mov       rsi, QWORD PTR [32+rsi]                      # rsi <- atom->z
###
    shl       r12, 2                                        # r12 <- neighbor->maxneighs * 4

# Register spilling (slots are below rsp; see the stack note in the header)
    mov       QWORD PTR [-32+rsp], r9                       # [-32+rsp] <- atom->Nlocal
    mov       QWORD PTR [-24+rsp], rcx                      # [-24+rsp] <- neighbor->numneigh
    mov       QWORD PTR [-16+rsp], r14                      # [-16+rsp] <- atom->fy
    mov       QWORD PTR [-8+rsp], r13                       # [-8+rsp] <- atom->fx
    mov       QWORD PTR [-40+rsp], r15                      # [-40+rsp] <- r15 (never reloaded in this function)
    mov       QWORD PTR [-48+rsp], rbx                      # [-48+rsp] <- rbx (never reloaded in this function)
    #sub       rsp, 64
    #call      getTimeStamp                                 # xmm0 <- getTimeStamp()
    #vmovsd    QWORD PTR [-56+rsp], xmm0                    # [-56+rsp] <- xmm0 [spill]
    #add       rsp, 64

..atom_loop_begin:
    mov       rcx, QWORD PTR [-24+rsp]                      # rcx <- neighbor->numneigh
    vxorpd    xmm25, xmm25, xmm25                           # xmm25 <- 0 (fix)
    vmovapd   xmm20, xmm25                                  # xmm20 <- 0 (fiy)
    mov       r13d, DWORD PTR [rcx+r10*4]                   # r13d <- neighbor->numneigh[i] (numneighs)
    vmovapd   xmm4, xmm20                                   # xmm4 <- 0 (fiz)

### AOS
    vmovsd    xmm8, QWORD PTR[rdx+rax]                      # xmm8 <- atom->x[i * 3]
    vmovsd    xmm9, QWORD PTR[8+rdx+rax]                    # xmm9 <- atom->x[i * 3 + 1]
    vmovsd    xmm10, QWORD PTR[16+rdx+rax]                  # xmm10 <- atom->x[i * 3 + 2]
### SOA
    #vmovsd    xmm8, QWORD PTR [rdx+r10*8]                  # xmm8 <- atom->x[i]
    #vmovsd    xmm9, QWORD PTR [rax+r10*8]                  # xmm9 <- atom->y[i]
    #vmovsd    xmm10, QWORD PTR [rsi+r10*8]                 # xmm10 <- atom->z[i]
###
    vbroadcastsd zmm0, xmm8                                 # zmm0 <- atom_x(i)
    vbroadcastsd zmm1, xmm9                                 # zmm1 <- atom_y(i)
    vbroadcastsd zmm2, xmm10                                # zmm2 <- atom_z(i)
    test      r13d, r13d                                    # numneighs <= 0 ?
    jle       ..atom_loop_exit                              # no neighbors: add the zeroed fix/fiy/fiz

    vpxord    zmm13, zmm13, zmm13                           # zmm13 <- 0 (fix lanes)
    vmovaps   zmm12, zmm13                                  # zmm12 <- 0 (fiy lanes)
    vmovaps   zmm11, zmm12                                  # zmm11 <- 0 (fiz lanes)
    mov       rcx, r12                                      # rcx <- neighbor->maxneighs * 4
    imul      rcx, r10                                      # rcx <- neighbor->maxneighs * 4 * i
    add       rcx, r11                                      # rcx <- &neighbor->neighbors[neighbor->maxneighs * i]
    xor       r9d, r9d                                      # r9d <- 0 (k)
    mov       r14d, r13d                                    # r14d <- numneighs (remaining)
    cmp       r14d, 8
    jl        ..compute_forces_remainder                    # fewer than 8 neighbors: masked path only

# Full-width body: 8 neighbors per iteration
..compute_forces:
    vpcmpeqb  k1, xmm0, xmm0                                # k1 <- all ones (gather mask, consumed by the gather)
    vpcmpeqb  k2, xmm0, xmm0                                # k2 <- all ones
    vpcmpeqb  k3, xmm0, xmm0                                # k3 <- all ones
    vmovdqu   ymm3, YMMWORD PTR [rcx+r9*4]                  # ymm3 <- neighbor indices j[k..k+7]
    vpxord    zmm5, zmm5, zmm5
    vpxord    zmm6, zmm6, zmm6

### AOS
    vpaddd    ymm4, ymm3, ymm3                              # ymm4 <- j * 2
    vpaddd    ymm3, ymm3, ymm4                              # ymm3 <- j * 3
    vpxord    zmm4, zmm4, zmm4
    vgatherdpd zmm4{k1}, [rdx+ymm3*8]                       # zmm4 <- atom->x[j * 3]
    vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]                     # zmm5 <- atom->x[j * 3 + 1]
    vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]                    # zmm6 <- atom->x[j * 3 + 2]
### SOA
    #vpxord    zmm4, zmm4, zmm4
    #vgatherdpd zmm5{k2}, [rax+ymm3*8]
    #vgatherdpd zmm4{k1}, [rdx+ymm3*8]
    #vgatherdpd zmm6{k3}, [rsi+ymm3*8]
###

    vsubpd    zmm29, zmm1, zmm5                             # zmm29 <- atom_y(i) - atom_y(j) -- dely
    vsubpd    zmm28, zmm0, zmm4                             # zmm28 <- atom_x(i) - atom_x(j) -- delx
    vsubpd    zmm31, zmm2, zmm6                             # zmm31 <- atom_z(i) - atom_z(j) -- delz
    vmulpd    zmm20, zmm29, zmm29                           # zmm20 <- dely * dely
    vfmadd231pd zmm20, zmm28, zmm28                         # zmm20 <- dely * dely + delx * delx
    vfmadd231pd zmm20, zmm31, zmm31                         # zmm20 <- zmm20 + delz * delz -- rsq

# Cutoff radius condition
    vrcp14pd  zmm27, zmm20                                  # zmm27 <- approx. 1.0 / rsq (sr2)
    vcmppd    k5, zmm20, zmm16, 1                           # k5 <- rsq < cutforcesq
    vmulpd    zmm22, zmm27, zmm15                           # zmm22 <- sr2 * sigma6
    vmulpd    zmm24, zmm27, zmm14                           # zmm24 <- 48.0 * epsilon * sr2
    vmulpd    zmm25, zmm27, zmm22                           # zmm25 <- sr2 * sigma6 * sr2
    vmulpd    zmm23, zmm27, zmm25                           # zmm23 <- sr2 * sigma6 * sr2 * sr2 (sr6)
    vfmsub213pd zmm27, zmm25, zmm7                          # zmm27 <- sr2 * sigma6 * sr2 * sr2 - 0.5
    vmulpd    zmm26, zmm23, zmm24                           # zmm26 <- 48.0 * epsilon * sr2 * sr6
    vmulpd    zmm30, zmm26, zmm27                           # zmm30 <- force
    vfmadd231pd zmm13{k5}, zmm30, zmm28                     # fix += force * delx
    vfmadd231pd zmm12{k5}, zmm30, zmm29                     # fiy += force * dely
    vfmadd231pd zmm11{k5}, zmm30, zmm31                     # fiz += force * delz
    sub       r14d, 8                                       # remaining -= 8
    add       r9, 8                                         # k += 8
    cmp       r14d, 8
    jge       ..compute_forces

# Check if there are remaining neighbors to be computed
..compute_forces_remainder:
    test      r14d, r14d
    jle       ..sum_up_forces

    vpbroadcastd ymm4, r14d                                 # ymm4 <- [remaining, ...]
    vpcmpgtd  k1, ymm4, ymm17                               # k1 <- lane index < remaining
    kmovw     r15d, k1                                      # r15d <- copy of the lane mask (gathers consume k1..k3)
    vmovdqu32 ymm3{k1}{z}, YMMWORD PTR [rcx+r9*4]           # ymm3 <- valid neighbor indices, others zeroed
    kmovw     k2, k1
    kmovw     k3, k1
    vpxord    zmm5, zmm5, zmm5
    vpxord    zmm6, zmm6, zmm6

### AOS
    vpaddd    ymm4, ymm3, ymm3                              # ymm4 <- j * 2
    vpaddd    ymm3, ymm3, ymm4                              # ymm3 <- j * 3
    vpxord    zmm4, zmm4, zmm4
    vgatherdpd zmm4{k1}, [rdx+ymm3*8]                       # zmm4 <- atom->x[j * 3]
    vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]                     # zmm5 <- atom->x[j * 3 + 1]
    vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]                    # zmm6 <- atom->x[j * 3 + 2]
#### SOA
    #vpxord    zmm4, zmm4, zmm4
    #vgatherdpd zmm5{k2}, [rax+ymm3*8]
    #vgatherdpd zmm4{k1}, [rdx+ymm3*8]
    #vgatherdpd zmm6{k3}, [rsi+ymm3*8]
###

    vsubpd    zmm29, zmm1, zmm5                             # zmm29 <- atom_y(i) - atom_y(j) -- dely
    vsubpd    zmm28, zmm0, zmm4                             # zmm28 <- atom_x(i) - atom_x(j) -- delx
    vsubpd    zmm31, zmm2, zmm6                             # zmm31 <- atom_z(i) - atom_z(j) -- delz
    vmulpd    zmm20, zmm29, zmm29                           # zmm20 <- dely * dely
    vfmadd231pd zmm20, zmm28, zmm28                         # zmm20 <- dely * dely + delx * delx
    vfmadd231pd zmm20, zmm31, zmm31                         # zmm20 <- zmm20 + delz * delz -- rsq

# Cutoff radius condition
    vrcp14pd  zmm27, zmm20                                  # zmm27 <- approx. 1.0 / rsq (sr2)
    vcmppd    k5, zmm20, zmm16, 1                           # k5 <- rsq < cutforcesq
    kmovw     r9d, k5                                       # r9d <- rsq < cutforcesq (k is no longer needed)
    and       r15d, r9d                                     # r15d <- rsq < cutforcesq && k < numneighs
    kmovw     k3, r15d                                      # k3 <- rsq < cutforcesq && k < numneighs
    vmulpd    zmm22, zmm27, zmm15                           # zmm22 <- sr2 * sigma6
    vmulpd    zmm24, zmm27, zmm14                           # zmm24 <- 48.0 * epsilon * sr2
    vmulpd    zmm25, zmm27, zmm22                           # zmm25 <- sr2 * sigma6 * sr2
    vmulpd    zmm23, zmm27, zmm25                           # zmm23 <- sr2 * sigma6 * sr2 * sr2 (sr6)
    vfmsub213pd zmm27, zmm25, zmm7                          # zmm27 <- sr2 * sigma6 * sr2 * sr2 - 0.5
    vmulpd    zmm26, zmm23, zmm24                           # zmm26 <- 48.0 * epsilon * sr2 * sr6
    vmulpd    zmm30, zmm26, zmm27                           # zmm30 <- force
    vfmadd231pd zmm13{k3}, zmm30, zmm28                     # fix += force * delx
    vfmadd231pd zmm12{k3}, zmm30, zmm29                     # fiy += force * dely
    vfmadd231pd zmm11{k3}, zmm30, zmm31                     # fiz += force * delz

# Forces are currently spread over the lanes of zmm registers, hence it is necessary to permute
# and add them (reduction) to obtain the final contribution for the current atom
..sum_up_forces:
    vmovups   zmm10, ZMMWORD PTR .L_2il0floatpacket.6[rip]  # zmm10 <- dword indices selecting the upper 256 bits
    vpermd    zmm0, zmm10, zmm11
    vpermd    zmm5, zmm10, zmm12
    vpermd    zmm21, zmm10, zmm13
    vaddpd    zmm11, zmm0, zmm11                            # fold upper half onto lower half
    vaddpd    zmm12, zmm5, zmm12
    vaddpd    zmm13, zmm21, zmm13
    vpermpd   zmm1, zmm11, 78                               # swap 128-bit pairs within each 256-bit half
    vpermpd   zmm6, zmm12, 78
    vpermpd   zmm22, zmm13, 78
    vaddpd    zmm2, zmm11, zmm1
    vaddpd    zmm8, zmm12, zmm6
    vaddpd    zmm23, zmm13, zmm22
    vpermpd   zmm3, zmm2, 177                               # swap adjacent doubles
    vpermpd   zmm9, zmm8, 177
    vpermpd   zmm24, zmm23, 177
    vaddpd    zmm4, zmm2, zmm3                              # xmm4 (low lane) <- fiz
    vaddpd    zmm20, zmm8, zmm9                             # xmm20 (low lane) <- fiy
    vaddpd    zmm25, zmm23, zmm24                           # xmm25 (low lane) <- fix

# Tail of one atom iteration: accumulate fix/fiy/fiz into the force arrays
..atom_loop_exit:
    mov       rcx, QWORD PTR [-8+rsp]                       # rcx <- atom->fx [spill]
    mov       rbx, QWORD PTR [-16+rsp]                      # rbx <- atom->fy [spill]

### AOS
    add       rax, 24                                       # advance to the next atom's position (3 doubles)
###

    vaddsd    xmm0, xmm25, QWORD PTR [rcx+r10*8]            # fx[i] + fix
    vmovsd    QWORD PTR [rcx+r10*8], xmm0                   # fx[i] <- fx[i] + fix
    vaddsd    xmm1, xmm20, QWORD PTR [rbx+r10*8]            # fy[i] + fiy
    vmovsd    QWORD PTR [rbx+r10*8], xmm1                   # fy[i] <- fy[i] + fiy
    vaddsd    xmm2, xmm4, QWORD PTR [rdi+r10*8]             # fz[i] + fiz
    vmovsd    QWORD PTR [rdi+r10*8], xmm2                   # fz[i] <- fz[i] + fiz
    inc       r10                                           # i++
    cmp       r10, QWORD PTR [-32+rsp]                      # i < atom->Nlocal [spill]
    jb        ..atom_loop_begin

# Function epilogue; also the target of the Nlocal <= 0 early exit
..compute_force_lj_return:
    vzeroupper
    vxorpd    xmm0, xmm0, xmm0                              # return value <- 0.0
    #call      getTimeStamp                                 # xmm0 <- getTimeStamp()
    #vsubsd    xmm0, xmm0, QWORD PTR [-56+rsp]              # xmm0 <- E-S
    pop       rbx
    pop       r15
    pop       r14
    pop       r13
    pop       r12
    pop       rbp
    ret

.type computeForceLJ,@function
.size computeForceLJ,.-computeForceLJ
|
||||
|
||||
|
||||
..LNcomputeForce.0:
|
||||
.data
|
||||
# -- End computeForceLJ
|
||||
.section .rodata, "a"
|
||||
.align 64
|
||||
.align 64
|
||||
.L_2il0floatpacket.2:
|
||||
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
|
||||
.type .L_2il0floatpacket.2,@object
|
||||
.size .L_2il0floatpacket.2,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.4:
|
||||
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
|
||||
.type .L_2il0floatpacket.4,@object
|
||||
.size .L_2il0floatpacket.4,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.6:
|
||||
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
|
||||
.type .L_2il0floatpacket.6,@object
|
||||
.size .L_2il0floatpacket.6,64
|
||||
.align 32
|
||||
.L_2il0floatpacket.0:
|
||||
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
|
||||
.type .L_2il0floatpacket.0,@object
|
||||
.size .L_2il0floatpacket.0,32
|
||||
.align 32
|
||||
.L_2il0floatpacket.1:
|
||||
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
|
||||
.type .L_2il0floatpacket.1,@object
|
||||
.size .L_2il0floatpacket.1,32
|
||||
.align 8
|
||||
.L_2il0floatpacket.3:
|
||||
.long 0x00000000,0x40480000
|
||||
.type .L_2il0floatpacket.3,@object
|
||||
.size .L_2il0floatpacket.3,8
|
||||
.align 8
|
||||
.L_2il0floatpacket.5:
|
||||
.long 0x00000000,0x3ff00000
|
||||
.type .L_2il0floatpacket.5,@object
|
||||
.size .L_2il0floatpacket.5,8
|
||||
.data
|
||||
.section .note.GNU-stack, ""
|
||||
# End
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
97
common/box.c
Normal file
97
common/box.c
Normal file
@@ -0,0 +1,97 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <parameter.h>
|
||||
#include <util.h>
|
||||
#include <box.h>
|
||||
#include <mpi.h>
|
||||
|
||||
/*
 * Intersects `mybox` with `other` for one communication swap and stores the
 * resulting region in `cut`.
 *
 * dim      axis of the swap (index into lo/hi)
 * dir      0 or 1, the direction of the swap along `dim`
 * xprd     period length along `dim`
 * cutneigh width by which the intersection is widened along `dim`
 *
 * Returns 0 for a direct (non-periodic) overlap, 1 or -1 for an overlap found
 * through the periodic image (dir == 0 gives 1, otherwise -1), and -100 when
 * the boxes do not overlap. `cut` is written in every case.
 */
int overlapBox(int dim, int dir, const Box* mybox, const Box* other, Box* cut, MD_FLOAT xprd, MD_FLOAT cutneigh)
{
    MD_FLOAT lower[3], upper[3];
    int result = -100;

    /* Plain intersection of the two boxes on every axis. */
    lower[_x] = MAX(mybox->lo[_x], other->lo[_x]);
    lower[_y] = MAX(mybox->lo[_y], other->lo[_y]);
    lower[_z] = MAX(mybox->lo[_z], other->lo[_z]);
    upper[_x] = MIN(mybox->hi[_x], other->hi[_x]);
    upper[_y] = MIN(mybox->hi[_y], other->hi[_y]);
    upper[_z] = MIN(mybox->hi[_z], other->hi[_z]);

    /* Direct neighbour: only tried when the two boxes belong to different owners. */
    if (mybox->id != other->id) {
        if (dir == 0) {
            upper[dim] = MIN(mybox->hi[dim], other->hi[dim]+ cutneigh);
        }
        if (dir == 1) {
            lower[dim] = MAX(mybox->lo[dim], other->lo[dim]- cutneigh);
        }
        if ((lower[_x]<upper[_x]) && (lower[_y]<upper[_y]) && (lower[_z]<upper[_z])) {
            result = 0;
        }
    }

    /* No direct overlap: retry against the periodic image shifted by xprd. */
    if (result < 0) {
        if (dir == 0) {
            lower[dim] = MAX(mybox->lo[dim] , other->lo[dim]- xprd);
            upper[dim] = MIN(mybox->hi[dim] , other->hi[dim]- xprd + cutneigh);
        } else {
            lower[dim] = MAX(mybox->lo[dim], other->lo[dim]+ xprd - cutneigh);
            upper[dim] = MIN(mybox->hi[dim], other->hi[dim]+ xprd);
        }
        if ((lower[_x]<upper[_x]) && (lower[_y]<upper[_y]) && (lower[_z]<upper[_z])) {
            result = (dir == 0) ? 1 : -1;
        }
    }

    /* The cut is stored even when no overlap was found. */
    cut->lo[_x] = lower[_x];
    cut->lo[_y] = lower[_y];
    cut->lo[_z] = lower[_z];
    cut->hi[_x] = upper[_x];
    cut->hi[_y] = upper[_y];
    cut->hi[_z] = upper[_z];

    return result;
}
|
||||
|
||||
/*
 * Tests whether `mybox` overlaps `other` once `other` has been widened by
 * cutneigh[] on every axis, considering `other` itself and all images shifted
 * by -1, 0 or +1 periods (param->xprd/yprd/zprd) per axis.
 *
 * Returns 1 as soon as one of the 27 images overlaps, 0 otherwise.
 */
int overlapFullBox(Parameter* param, MD_FLOAT *cutneigh ,const Box* mybox, const Box* other)
{
    const MD_FLOAT xprd = param->xprd;
    const MD_FLOAT yprd = param->yprd;
    const MD_FLOAT zprd = param->zprd;

    /* image enumerates the 27 shifts; i varies fastest, then j, then k. */
    for (int image = 0; image < 27; image++) {
        const int i = image % 3 - 1;
        const int j = (image / 3) % 3 - 1;
        const int k = image / 9 - 1;
        MD_FLOAT lower, upper;

        lower = MAX(mybox->lo[_x], other->lo[_x]-cutneigh[_x] + i*xprd);
        upper = MIN(mybox->hi[_x], other->hi[_x]+cutneigh[_x] + i*xprd);
        if (!(lower<upper)) {
            continue;
        }

        lower = MAX(mybox->lo[_y], other->lo[_y]-cutneigh[_y] + j*yprd);
        upper = MIN(mybox->hi[_y], other->hi[_y]+cutneigh[_y] + j*yprd);
        if (!(lower<upper)) {
            continue;
        }

        lower = MAX(mybox->lo[_z], other->lo[_z]-cutneigh[_z] + k*zprd);
        upper = MIN(mybox->hi[_z], other->hi[_z]+cutneigh[_z] + k*zprd);
        if (lower<upper) {
            return 1;
        }
    }

    return 0;
}
|
||||
|
||||
/*
 * Widens `cut` by `cutneigh` on the sides where `me` reaches at least as far
 * as `other`. Swaps 2 and 3 widen along x only; swaps 4 and 5 widen along
 * x and y. Any other swap leaves `cut` unchanged.
 */
void expandBox(int iswap, const Box* me, const Box* other, Box* cut, MD_FLOAT cutneigh)
{
    const int growY = (iswap == 4 || iswap == 5);
    const int growX = growY || (iswap == 2 || iswap == 3);

    if (growX) {
        if (me->lo[_x] <= other->lo[_x]) {
            cut->lo[_x] -= cutneigh;
        }
        if (me->hi[_x] >= other->hi[_x]) {
            cut->hi[_x] += cutneigh;
        }
    }

    if (growY) {
        if (me->lo[_y] <= other->lo[_y]) {
            cut->lo[_y] -= cutneigh;
        }
        if (me->hi[_y] >= other->hi[_y]) {
            cut->hi[_y] += cutneigh;
        }
    }
}
|
||||
|
||||
556
common/comm.c
Normal file
556
common/comm.c
Normal file
@@ -0,0 +1,556 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <comm.h>
|
||||
#include <allocate.h>
|
||||
#include <mpi.h>
|
||||
#include <util.h>
|
||||
|
||||
#define NEIGHMIN 6
|
||||
#define BUFFACTOR 2
|
||||
#define BUFMIN 1000
|
||||
#define BUFEXTRA 100
|
||||
#define world MPI_COMM_WORLD
|
||||
|
||||
MPI_Datatype type = (sizeof(MD_FLOAT) == 4) ? MPI_FLOAT : MPI_DOUBLE;
|
||||
static inline void allocDynamicBuffers(Comm*);
|
||||
static inline void freeDynamicBuffers(Comm*);
|
||||
static inline void freeBuffers(Comm*);
|
||||
|
||||
/*
 * Builds the receive-side bookkeeping from the send lists.
 *
 * For every swap, the ranks sent to in the opposite swap of the same
 * dimension are copied into comm->nrecv, and comm->recvfrom/recvtill are set
 * to the range that swap occupies in nrecv. Afterwards comm->othersend[iswap]
 * is set to 0 when the swap's only send partner is this rank itself, and to 1
 * otherwise.
 */
void defineReverseList(Comm* comm){
    int nextRecv = 0;   /* next free slot in comm->nrecv */

    //Set the inverse list
    for(int iswap = 0; iswap<6; iswap++){
        int dim = comm->swapdim[iswap];
        int dir = comm->swapdir[iswap];
        int invswap = comm->swap[dim][(dir+1)%2];   /* same dimension, opposite direction */

        for(int ineigh = comm->sendfrom[invswap]; ineigh< comm->sendtill[invswap]; ineigh++)
            comm->nrecv[nextRecv++] = comm->nsend[ineigh];

        comm->recvfrom[iswap] = (iswap == 0) ? 0 : comm->recvtill[iswap-1];
        comm->recvtill[iswap] = nextRecv;
    }

    //set if myproc is unique in the swap
    for(int iswap = 0; iswap<6; iswap++){
        int sizeswap = comm->sendtill[iswap]-comm->sendfrom[iswap];
        /* Only look at nsend[] when the swap has exactly one entry: for an
         * empty swap, sendfrom[iswap] is not a valid element of nsend. */
        int onlySelf = (sizeswap == 1) && (comm->nsend[comm->sendfrom[iswap]] == comm->myproc);
        comm->othersend[iswap] = onlySelf ? 0 : 1;
    }
}
|
||||
|
||||
/*
 * Appends rank `newneigh` to comm->nexch, doubling the capacity of the list
 * first when it is already full.
 */
void addNeighToExchangeList(Comm* comm, int newneigh){
    const int slot = comm->numneighexch;

    if (slot >= comm->maxneighexch) {
        /* reallocate() needs the old size in bytes, so take it before growing. */
        const size_t previousBytes = comm->maxneighexch*sizeof(int);
        comm->maxneighexch = comm->maxneighexch * 2;
        comm->nexch = (int*) reallocate(comm->nexch, ALIGNMENT, comm->maxneighexch * sizeof(int), previousBytes);
    }

    comm->nexch[slot] = newneigh;
    comm->numneighexch = slot + 1;
}
|
||||
|
||||
//Exported functions
|
||||
/*
 * Rebuilds the neighbour lists of this rank from the box map in grid->map.
 *
 * 1. comm->nexch receives every other rank whose widened box overlaps the
 *    local box (overlapFullBox).
 * 2. For each of the six swaps, every rank (including this one) whose box
 *    overlaps along the swap's dimension/direction is appended to
 *    comm->nsend, together with the cut box and the periodic-shift flag for
 *    that dimension; comm->sendfrom/sendtill mark the range of each swap.
 * 3. The dynamic buffers are reallocated and the receive lists are derived.
 */
void neighComm(Comm *comm, Parameter* param, Grid *grid)
{
    const int PAD = 6;  //number of elements per processor in the map
    const int me = comm->myproc;
    const int numproc = comm->numproc;
    MD_FLOAT *map = grid->map;
    MD_FLOAT cutneigh = param->cutneigh;
    MD_FLOAT prd[3] = {param->xprd, param->yprd, param->zprd};
    Box mybox, other, cut;
    int ineigh = 0;     //number of send neighbours recorded so far

    //needed for rebalancing
    freeDynamicBuffers(comm);

    //Local box; MAP is stored as follows: xlo,ylo,zlo,xhi,yhi,zhi
    mybox.id = me;
    mybox.lo[_x] = map[me*PAD+0];
    mybox.lo[_y] = map[me*PAD+1];
    mybox.lo[_z] = map[me*PAD+2];
    mybox.hi[_x] = map[me*PAD+3];
    mybox.hi[_y] = map[me*PAD+4];
    mybox.hi[_z] = map[me*PAD+5];

    //Check all other ranks as possible neighbours for exchanging atoms
    comm->numneighexch = 0;
    for (int proc = 0; proc < numproc; proc++) {
        if (proc == me) {
            continue;
        }

        other.id = proc;
        other.lo[_x] = map[proc*PAD+0];
        other.lo[_y] = map[proc*PAD+1];
        other.lo[_z] = map[proc*PAD+2];
        other.hi[_x] = map[proc*PAD+3];
        other.hi[_y] = map[proc*PAD+4];
        other.hi[_z] = map[proc*PAD+5];

        if (overlapFullBox(param,grid->cutneigh,&mybox,&other)) {
            addNeighToExchangeList(comm,proc);
        }
    }

    //Neighbours along each dimension, for forwardComm, backwardComm and ghostComm
    for (int iswap = 0; iswap < 6; iswap++) {
        const int dir = comm->swapdir[iswap];
        const int dim = comm->swapdim[iswap];

        for (int proc = 0; proc < numproc; proc++) {
            other.id = proc;
            other.lo[_x] = map[proc*PAD+0];
            other.lo[_y] = map[proc*PAD+1];
            other.lo[_z] = map[proc*PAD+2];
            other.hi[_x] = map[proc*PAD+3];
            other.hi[_y] = map[proc*PAD+4];
            other.hi[_z] = map[proc*PAD+5];

            //-100: no intersection; 0, 1 and -1: intersection, one value per pbc case
            const int pbc = overlapBox(dim,dir,&mybox,&other,&cut,prd[dim],cutneigh);
            if (pbc == -100) {
                continue;
            }

            expandBox(iswap, &mybox, &other, &cut, cutneigh);

            //Grow all per-neighbour arrays together once they are full
            if (ineigh >= comm->maxneigh) {
                const size_t oldIntBytes = comm->maxneigh*sizeof(int);
                const size_t oldBoxBytes = comm->maxneigh*sizeof(Box);
                comm->maxneigh = 2*ineigh;
                comm->nsend = (int*) reallocate(comm->nsend, ALIGNMENT, comm->maxneigh * sizeof(int), oldIntBytes);
                comm->nrecv = (int*) reallocate(comm->nrecv, ALIGNMENT, comm->maxneigh * sizeof(int), oldIntBytes);
                comm->pbc_x = (int*) reallocate(comm->pbc_x, ALIGNMENT, comm->maxneigh * sizeof(int), oldIntBytes);
                comm->pbc_y = (int*) reallocate(comm->pbc_y, ALIGNMENT, comm->maxneigh * sizeof(int), oldIntBytes);
                comm->pbc_z = (int*) reallocate(comm->pbc_z, ALIGNMENT, comm->maxneigh * sizeof(int), oldIntBytes);
                comm->boxes = (Box*) reallocate(comm->boxes, ALIGNMENT, comm->maxneigh * sizeof(Box), oldBoxBytes);
            }

            //Record the neighbour; the pbc flag is only set on the swap's own dimension
            comm->boxes[ineigh] = cut;
            comm->nsend[ineigh] = proc;
            comm->pbc_x[ineigh] = (dim == _x) ? pbc : 0;
            comm->pbc_y[ineigh] = (dim == _y) ? pbc : 0;
            comm->pbc_z[ineigh] = (dim == _z) ? pbc : 0;
            ineigh++;
        }

        comm->sendfrom[iswap] = (iswap == 0) ? 0:comm->sendtill[iswap-1];
        comm->sendtill[iswap] = ineigh;
        comm->numneigh = ineigh;
    }

    allocDynamicBuffers(comm);
    defineReverseList(comm);
}
|
||||
|
||||
/*
 * Starts MPI, stores the communicator size and this process' rank in `comm`,
 * and resets the neighbour counters and every buffer pointer so that nothing
 * in `comm` refers to memory before setupComm() runs.
 */
void initComm(int* argc, char*** argv, Comm* comm)
{
    //MPI Initialize
    MPI_Init(argc, argv);
    MPI_Comm_size(world, &comm->numproc);
    MPI_Comm_rank(world, &comm->myproc);

    //No neighbours known yet
    comm->numneigh = 0;
    comm->numneighexch = 0;

    //Neighbour lists
    comm->nsend = NULL;
    comm->nrecv = NULL;
    comm->nexch = NULL;
    comm->pbc_x = NULL;
    comm->pbc_y = NULL;
    comm->pbc_z = NULL;
    comm->boxes = NULL;

    //Per-neighbour atom counts, offsets and send lists
    comm->atom_send = NULL;
    comm->atom_recv = NULL;
    comm->off_atom_send = NULL;
    comm->off_atom_recv = NULL;
    comm->maxsendlist = NULL;
    comm->sendlist = NULL;

    //Message buffers
    comm->buf_send = NULL;
    comm->buf_recv = NULL;
}
|
||||
|
||||
/*
 * Tears the communication layer down: resets the capacity counters,
 * releases the buffers held by `comm` and shuts MPI down.
 */
void endComm(Comm* comm)
{
    //Capacities are reset before the buffers are released
    comm->maxsend = 0;
    comm->maxrecv = 0;
    comm->maxneigh = 0;
    comm->maxneighexch = 0;

    freeBuffers(comm);
    MPI_Finalize();
}
|
||||
|
||||
/*
 * Prepares `comm` for use: fills the swap tables (two swaps per dimension,
 * direction 0 then 1), clears the send/receive ranges, sets the per-atom
 * message sizes, allocates the initial buffers and neighbour arrays, and
 * finally builds the neighbour lists with neighComm().
 */
void setupComm(Comm* comm, Parameter* param, Grid* grid){
    const int dims[3] = {_x, _y, _z};

    //Swaps 0,1 belong to x, swaps 2,3 to y and swaps 4,5 to z
    for (int d = 0; d < 3; d++) {
        comm->swap[dims[d]][0] = 2*d;
        comm->swap[dims[d]][1] = 2*d + 1;
    }

    for (int iswap = 0; iswap < 6; iswap++) {
        comm->swapdim[iswap] = dims[iswap/2];
        comm->swapdir[iswap] = iswap % 2;   //even swaps: direction 0, odd swaps: direction 1
        comm->sendfrom[iswap] = 0;
        comm->sendtill[iswap] = 0;
        comm->recvfrom[iswap] = 0;
        comm->recvtill[iswap] = 0;
    }

    comm->forwardSize = FORWARD_SIZE;       //send coordinates x,y,z
    comm->reverseSize = REVERSE_SIZE;       //return forces fx, fy, fz
    comm->ghostSize = GHOST_SIZE;           //send x,y,z,type
    comm->exchangeSize = EXCHANGE_SIZE;     //send x,y,z,vx,vy,vz,type

    //Allocate memory for the send buffer and the recv buffer
    comm->maxsend = BUFMIN;
    comm->maxrecv = BUFMIN;
    comm->buf_send = (MD_FLOAT*) allocate(ALIGNMENT,(comm->maxsend + BUFEXTRA) * sizeof(MD_FLOAT));
    comm->buf_recv = (MD_FLOAT*) allocate(ALIGNMENT, comm->maxrecv * sizeof(MD_FLOAT));

    //Exchange neighbour list
    comm->maxneighexch = NEIGHMIN;
    comm->nexch = (int*) allocate(ALIGNMENT, comm->maxneighexch * sizeof(int));

    //Per-swap neighbour arrays, all sized by maxneigh
    comm->maxneigh = NEIGHMIN;
    comm->nsend = (int*) allocate(ALIGNMENT, comm->maxneigh * sizeof(int));
    comm->nrecv = (int*) allocate(ALIGNMENT, comm->maxneigh * sizeof(int));
    comm->pbc_x = (int*) allocate(ALIGNMENT, comm->maxneigh * sizeof(int));
    comm->pbc_y = (int*) allocate(ALIGNMENT, comm->maxneigh * sizeof(int));
    comm->pbc_z = (int*) allocate(ALIGNMENT, comm->maxneigh * sizeof(int));
    comm->boxes = (Box*) allocate(ALIGNMENT, comm->maxneigh * sizeof(Box));

    neighComm(comm, param, grid);
}
|
||||
|
||||
/*
 * Forward communication for swap iswap.
 *
 * Packs the atoms listed in each send list into buf_send, exchanges the
 * data with the neighbours of the swap when comm->othersend[iswap] is set,
 * and unpacks the result starting at comm->firstrecv[iswap]. When
 * othersend is not set, no MPI traffic happens and the data is unpacked
 * straight from buf_send.
 */
void forwardComm(Comm* comm, Atom* atom, int iswap)
{
    int nrqst = 0, offset = 0, nsend = 0, nrecv = 0;
    int pbc[3];
    int size = comm->forwardSize;
    int maxrqst = comm->numneigh;
    MD_FLOAT* buf;
    /* A variable length array must have at least one element. */
    MPI_Request requests[maxrqst > 0 ? maxrqst : 1];

    /* Pack the outgoing data of every neighbour of this swap. */
    for (int ineigh = comm->sendfrom[iswap]; ineigh < comm->sendtill[iswap]; ineigh++) {
        offset = comm->off_atom_send[ineigh];
        pbc[_x] = comm->pbc_x[ineigh];
        pbc[_y] = comm->pbc_y[ineigh];
        pbc[_z] = comm->pbc_z[ineigh];
        packForward(atom, comm->atom_send[ineigh], comm->sendlist[ineigh],
            &comm->buf_send[offset * size], pbc);
    }

    if (comm->othersend[iswap]) {
        /* Post all receives before the blocking sends. */
        for (int ineigh = comm->recvfrom[iswap]; ineigh < comm->recvtill[iswap]; ineigh++) {
            offset = comm->off_atom_recv[ineigh] * size;
            nrecv  = comm->atom_recv[ineigh] * size;
            MPI_Irecv(&comm->buf_recv[offset], nrecv, type, comm->nrecv[ineigh],
                0, world, &requests[nrqst++]);
        }

        for (int ineigh = comm->sendfrom[iswap]; ineigh < comm->sendtill[iswap]; ineigh++) {
            offset = comm->off_atom_send[ineigh] * size;
            nsend  = comm->atom_send[ineigh] * size;
            MPI_Send(&comm->buf_send[offset], nsend, type, comm->nsend[ineigh], 0, world);
        }

        /* Array-of-requests calls take MPI_STATUSES_IGNORE, not MPI_STATUS_IGNORE. */
        MPI_Waitall(nrqst, requests, MPI_STATUSES_IGNORE);
        buf = comm->buf_recv;
    } else {
        buf = comm->buf_send;
    }

    /* Unpack the buffer. */
    for (int ineigh = comm->recvfrom[iswap]; ineigh < comm->recvtill[iswap]; ineigh++) {
        offset = comm->off_atom_recv[ineigh];
        unpackForward(atom, comm->atom_recv[ineigh], comm->firstrecv[iswap] + offset,
            &buf[offset * size]);
    }
}
|
||||
|
||||
/*
 * Reverse communication for swap iswap, the mirror image of forwardComm().
 *
 * Data for the atoms received in this swap is packed into buf_send and
 * sent back to the ranks in comm->nrecv; the ranks in comm->nsend answer
 * with the data for the atoms in the send lists, which is then unpacked.
 * When comm->othersend[iswap] is not set, no MPI traffic happens and the
 * data is unpacked straight from buf_send.
 */
void reverseComm(Comm* comm, Atom* atom, int iswap)
{
    int nrqst = 0, offset = 0, nsend = 0, nrecv = 0;
    int size = comm->reverseSize;
    int maxrqst = comm->numneigh;
    MD_FLOAT* buf;
    /* A variable length array must have at least one element. */
    MPI_Request requests[maxrqst > 0 ? maxrqst : 1];

    /* Pack the data belonging to the atoms received in this swap. */
    for (int ineigh = comm->recvfrom[iswap]; ineigh < comm->recvtill[iswap]; ineigh++) {
        offset = comm->off_atom_recv[ineigh];
        packReverse(atom, comm->atom_recv[ineigh], comm->firstrecv[iswap] + offset,
            &comm->buf_send[offset * size]);
    }

    if (comm->othersend[iswap]) {
        /* Post all receives before the blocking sends. */
        for (int ineigh = comm->sendfrom[iswap]; ineigh < comm->sendtill[iswap]; ineigh++) {
            offset = comm->off_atom_send[ineigh] * size;
            nrecv  = comm->atom_send[ineigh] * size;
            MPI_Irecv(&comm->buf_recv[offset], nrecv, type, comm->nsend[ineigh],
                0, world, &requests[nrqst++]);
        }

        for (int ineigh = comm->recvfrom[iswap]; ineigh < comm->recvtill[iswap]; ineigh++) {
            offset = comm->off_atom_recv[ineigh] * size;
            nsend  = comm->atom_recv[ineigh] * size;
            MPI_Send(&comm->buf_send[offset], nsend, type, comm->nrecv[ineigh], 0, world);
        }

        /* Array-of-requests calls take MPI_STATUSES_IGNORE, not MPI_STATUS_IGNORE. */
        MPI_Waitall(nrqst, requests, MPI_STATUSES_IGNORE);
        buf = comm->buf_recv;
    } else {
        buf = comm->buf_send;
    }

    /* Unpack the buffer. */
    for (int ineigh = comm->sendfrom[iswap]; ineigh < comm->sendtill[iswap]; ineigh++) {
        offset = comm->off_atom_send[ineigh];
        unpackReverse(atom, comm->atom_send[ineigh], comm->sendlist[ineigh],
            &buf[offset * size]);
    }
}
|
||||
|
||||
/*
 * Build the ghost layer for swap iswap.
 *
 * For every neighbour of the swap, the elements that satisfy
 * IsinRegionToSend() against that neighbour's box are recorded in the send
 * list and packed into buf_send. The per-neighbour counts are exchanged
 * first, then the packed data, and finally every received element is
 * handed to unpackGhost(). When comm->othersend[iswap] is not set, no MPI
 * traffic happens and the data is unpacked straight from buf_send.
 *
 * The locals xlo..zhi and the parameter name `atom` are referenced by the
 * IsinRegionToSend / LOCAL / GHOST macros and must keep these names.
 */
void ghostComm(Comm* comm, Atom* atom, int iswap)
{
    MD_FLOAT xlo = 0, xhi = 0, ylo = 0, yhi = 0, zlo = 0, zhi = 0;
    MD_FLOAT* buf;
    int nrqst = 0, nsend = 0, nrecv = 0, offset = 0, ineigh = 0, pbc[3];
    int all_recv = 0, all_send = 0, currentSend = 0;
    int size = comm->ghostSize;
    int maxrqst = comm->numneigh;
    /* A variable length array must have at least one element. */
    MPI_Request requests[maxrqst > 0 ? maxrqst : 1];

    /* Fixed: the original wrote requests[maxrqst] (one past the end) on every pass. */
    for (int i = 0; i < maxrqst; i++) {
        requests[i] = MPI_REQUEST_NULL;
    }

    /* The candidate range is refreshed only on even swaps. */
    if (iswap % 2 == 0) comm->iterAtom = LOCAL + GHOST;

    /* Select and pack the elements that fall into each neighbour's box. */
    for (int ineigh = comm->sendfrom[iswap]; ineigh < comm->sendtill[iswap]; ineigh++) {
        Box* tile = &comm->boxes[ineigh];

        xlo = tile->lo[_x]; ylo = tile->lo[_y]; zlo = tile->lo[_z];
        xhi = tile->hi[_x]; yhi = tile->hi[_y]; zhi = tile->hi[_z];
        pbc[_x] = comm->pbc_x[ineigh];
        pbc[_y] = comm->pbc_y[ineigh];
        pbc[_z] = comm->pbc_z[ineigh];
        nsend = 0;

        for (int i = 0; i < comm->iterAtom; i++) {
            if (IsinRegionToSend(i)) {
                if (nsend >= comm->maxsendlist[ineigh]) growList(comm, ineigh, nsend);
                if (currentSend + size >= comm->maxsend) growSend(comm, currentSend);
                comm->sendlist[ineigh][nsend++] = i;
                currentSend += packGhost(atom, i, &comm->buf_send[currentSend], pbc);
            }
        }

        comm->atom_send[ineigh]     = nsend;    /* elements sent to this neighbour */
        comm->off_atom_send[ineigh] = all_send; /* offset of this neighbour within the swap */
        all_send += nsend;                      /* elements sent in the whole swap */
    }

    /* Exchange the element counts. */
    if (comm->othersend[iswap]) {
        for (nrqst = 0, ineigh = comm->recvfrom[iswap]; ineigh < comm->recvtill[iswap]; ineigh++) {
            MPI_Irecv(&comm->atom_recv[ineigh], 1, MPI_INT, comm->nrecv[ineigh],
                0, world, &requests[nrqst++]);
        }

        for (int ineigh = comm->sendfrom[iswap]; ineigh < comm->sendtill[iswap]; ineigh++) {
            MPI_Send(&comm->atom_send[ineigh], 1, MPI_INT, comm->nsend[ineigh], 0, world);
        }

        /* Array-of-requests calls take MPI_STATUSES_IGNORE, not MPI_STATUS_IGNORE. */
        MPI_Waitall(nrqst, requests, MPI_STATUSES_IGNORE);
    } else {
        /* No MPI exchange: the count of the last packed neighbour is reused. */
        comm->atom_recv[comm->recvfrom[iswap]] = nsend;
    }

    /* Offsets of each neighbour's data inside the receive buffer. */
    for (int ineigh = comm->recvfrom[iswap]; ineigh < comm->recvtill[iswap]; ineigh++) {
        comm->off_atom_recv[ineigh] = all_recv;
        all_recv += comm->atom_recv[ineigh];
    }

    if (all_recv * size >= comm->maxrecv) growRecv(comm, all_recv * size);

    /* Exchange the packed elements. */
    if (comm->othersend[iswap]) {
        for (nrqst = 0, ineigh = comm->recvfrom[iswap]; ineigh < comm->recvtill[iswap]; ineigh++) {
            offset = comm->off_atom_recv[ineigh] * size;
            nrecv  = comm->atom_recv[ineigh] * size;
            MPI_Irecv(&comm->buf_recv[offset], nrecv, type, comm->nrecv[ineigh],
                0, world, &requests[nrqst++]);
        }

        for (int ineigh = comm->sendfrom[iswap]; ineigh < comm->sendtill[iswap]; ineigh++) {
            offset = comm->off_atom_send[ineigh] * size;
            nsend  = comm->atom_send[ineigh] * size;
            MPI_Send(&comm->buf_send[offset], nsend, type, comm->nsend[ineigh], 0, world);
        }

        MPI_Waitall(nrqst, requests, MPI_STATUSES_IGNORE);
        buf = comm->buf_recv;
    } else {
        buf = comm->buf_send;
    }

    /* Unpack; LOCAL + GHOST is re-evaluated on every call. */
    comm->firstrecv[iswap] = LOCAL + GHOST;
    for (int i = 0; i < all_recv; i++) {
        unpackGhost(atom, LOCAL + GHOST, &buf[i * size]);
    }

    /* Make sure later forward/reverse messages of this swap fit the buffers. */
    int max_size = MAX(comm->forwardSize, comm->reverseSize);
    int max_buf  = max_size * MAX(all_recv, all_send);
    if (max_buf >= comm->maxrecv) growRecv(comm, max_buf);
    if (max_buf >= comm->maxsend) growSend(comm, max_buf);
}
|
||||
|
||||
/*
 * Hand atoms that left the local box over to the exchange neighbours.
 *
 * After pbc(atom), every local atom outside [lo, hi) is packed into
 * buf_send and removed from the local list. The same packed buffer is sent
 * to every exchange neighbour; each receiver keeps only the atoms whose
 * coordinates fall inside its own box. Finally the global atom count is
 * compared against atom->Natoms and rank 0 prints a warning on mismatch.
 *
 * The parameter name `atom` is referenced by the atom_x/y/z macros.
 */
void exchangeComm(Comm* comm, Atom* atom)
{
    MD_FLOAT x, y, z;
    MD_FLOAT* lo = atom->mybox.lo;
    MD_FLOAT* hi = atom->mybox.hi;
    int size = comm->exchangeSize;
    int numneigh = comm->numneighexch;
    /* A variable length array must have at least one element. */
    int nslots = numneigh > 0 ? numneigh : 1;
    int offset_recv[nslots];
    int size_recv[nslots];
    MPI_Request requests[nslots];
    int i = 0, nsend = 0, nrecv = 0;
    int nrqst = 0;
    int nlocal, offset, m;

    /* Enforce periodic boundary conditions. */
    pbc(atom);

    if (comm->numneigh == 0) return;

    /* Pack and remove every atom that is outside the local box. */
    nlocal = atom->Nlocal;
    while (i < nlocal) {
        if (atom_x(i) < lo[_x] || atom_x(i) >= hi[_x] ||
            atom_y(i) < lo[_y] || atom_y(i) >= hi[_y] ||
            atom_z(i) < lo[_z] || atom_z(i) >= hi[_z]) {
            if (nsend + size >= comm->maxsend) growSend(comm, nsend);
            nsend += packExchange(atom, i, &comm->buf_send[nsend]);
            /* Overwrite slot i with the last atom and re-test slot i. */
            copy(atom, i, nlocal - 1);
            nlocal--;
        } else {
            i++;
        }
    }
    atom->Nlocal = nlocal;

    /* Exchange the number of values each neighbour is going to send. */
    for (int ineigh = 0; ineigh < numneigh; ineigh++) {
        MPI_Irecv(&size_recv[ineigh], 1, MPI_INT, comm->nexch[ineigh],
            0, world, &requests[nrqst++]);
    }
    for (int ineigh = 0; ineigh < numneigh; ineigh++) {
        MPI_Send(&nsend, 1, MPI_INT, comm->nexch[ineigh], 0, world);
    }
    /* Array-of-requests calls take MPI_STATUSES_IGNORE, not MPI_STATUS_IGNORE. */
    MPI_Waitall(nrqst, requests, MPI_STATUSES_IGNORE);

    /* Offsets of each neighbour's data inside the receive buffer. */
    for (int ineigh = 0; ineigh < numneigh; ineigh++) {
        offset_recv[ineigh] = nrecv;
        nrecv += size_recv[ineigh];
    }

    if (nrecv >= comm->maxrecv) growRecv(comm, nrecv);

    /* Exchange the packed atoms. */
    nrqst = 0;
    for (int ineigh = 0; ineigh < numneigh; ineigh++) {
        offset = offset_recv[ineigh];
        MPI_Irecv(&comm->buf_recv[offset], size_recv[ineigh], type, comm->nexch[ineigh],
            0, world, &requests[nrqst++]);
    }
    for (int ineigh = 0; ineigh < numneigh; ineigh++) {
        MPI_Send(comm->buf_send, nsend, type, comm->nexch[ineigh], 0, world);
    }
    MPI_Waitall(nrqst, requests, MPI_STATUSES_IGNORE);

    /* Keep the received atoms that lie inside the local box, skip the rest. */
    nlocal = atom->Nlocal;
    m = 0;
    while (m < nrecv) {
        x = comm->buf_recv[m + _x];
        y = comm->buf_recv[m + _y];
        z = comm->buf_recv[m + _z];

        if (x >= lo[_x] && x < hi[_x] &&
            y >= lo[_y] && y < hi[_y] &&
            z >= lo[_z] && z < hi[_z]) {
            m += unpackExchange(atom, nlocal++, &comm->buf_recv[m]);
        } else {
            m += size;
        }
    }
    atom->Nlocal = nlocal;

    /* Sanity check: the global atom count must be conserved. */
    int all_atoms = 0;
    MPI_Allreduce(&atom->Nlocal, &all_atoms, 1, MPI_INT, MPI_SUM, world);
    if (atom->Natoms != all_atoms && comm->myproc == 0) {
        printf("Losing atoms! current atoms:%d expected atoms:%d\n", all_atoms, atom->Natoms);
    }
}
|
||||
|
||||
//Internal functions
|
||||
|
||||
/*
 * Replace the receive buffer by a new one with room for BUFFACTOR * n
 * values. The old contents are discarded, not copied.
 */
inline void growRecv(Comm* comm, int n)
{
    comm->maxrecv = BUFFACTOR * n;

    /* free() accepts NULL, so no guard is required. */
    free(comm->buf_recv);
    comm->buf_recv = (MD_FLOAT*) allocate(ALIGNMENT, comm->maxrecv * sizeof(MD_FLOAT));
}
|
||||
|
||||
/*
 * Resize the send buffer to BUFFACTOR * n values plus BUFEXTRA spare slots
 * through reallocate(), which is given both the new and the old byte size.
 */
inline void growSend(Comm* comm, int n)
{
    const size_t bytesBefore = (comm->maxsend + BUFEXTRA) * sizeof(MD_FLOAT);

    comm->maxsend = BUFFACTOR * n;
    comm->buf_send = (MD_FLOAT*) reallocate(comm->buf_send, ALIGNMENT,
        (comm->maxsend + BUFEXTRA) * sizeof(MD_FLOAT), bytesBefore);
}
|
||||
|
||||
/*
 * Resize the send list of neighbour ineigh to BUFFACTOR * n entries
 * through reallocate(), which is given both the new and the old byte size.
 */
inline void growList(Comm* comm, int ineigh, int n)
{
    const size_t bytesBefore = comm->maxsendlist[ineigh] * sizeof(int);

    comm->maxsendlist[ineigh] = BUFFACTOR * n;
    comm->sendlist[ineigh] = (int*) reallocate(comm->sendlist[ineigh], ALIGNMENT,
        comm->maxsendlist[ineigh] * sizeof(int), bytesBefore);
}
|
||||
|
||||
/*
 * Allocate every table whose length is the current neighbour count
 * (comm->numneigh), plus one send list of BUFMIN entries per neighbour.
 */
static inline void allocDynamicBuffers(Comm* comm)
{
    const int count = comm->numneigh;

    comm->atom_send     = (int*) allocate(ALIGNMENT, count * sizeof(int));
    comm->atom_recv     = (int*) allocate(ALIGNMENT, count * sizeof(int));
    comm->off_atom_send = (int*) allocate(ALIGNMENT, count * sizeof(int));
    comm->off_atom_recv = (int*) allocate(ALIGNMENT, count * sizeof(int));
    comm->maxsendlist   = (int*) allocate(ALIGNMENT, count * sizeof(int));
    comm->sendlist      = (int**) allocate(ALIGNMENT, count * sizeof(int*));

    /* Every neighbour starts with a send list of the minimum capacity. */
    for (int k = 0; k < count; k++) {
        comm->maxsendlist[k] = BUFMIN;
        comm->sendlist[k] = (int*) allocate(ALIGNMENT, comm->maxsendlist[k] * sizeof(int));
    }
}
|
||||
|
||||
/*
 * Release every table sized by the neighbour count and the per-neighbour
 * send lists. comm->numneigh must still hold the count the send lists were
 * allocated with.
 *
 * Each pointer is reset to NULL after it is freed, so a later
 * freeBuffers() or a second call cannot free the same memory twice.
 */
static inline void freeDynamicBuffers(Comm* comm)
{
    int numneigh = comm->numneigh;

    /* free(NULL) is a no-op, so no guards are needed. */
    free(comm->atom_send);
    comm->atom_send = NULL;
    free(comm->atom_recv);
    comm->atom_recv = NULL;
    free(comm->off_atom_send);
    comm->off_atom_send = NULL;
    free(comm->off_atom_recv);
    comm->off_atom_recv = NULL;
    free(comm->maxsendlist);
    comm->maxsendlist = NULL;

    if (comm->sendlist) {
        for (int i = 0; i < numneigh; i++) {
            free(comm->sendlist[i]);
        }
        free(comm->sendlist);
        comm->sendlist = NULL;
    }
}
|
||||
|
||||
/*
 * Release every buffer owned by comm: the neighbour tables, the
 * per-neighbour send lists and the two message buffers.
 */
static inline void freeBuffers(Comm* comm)
{
    /* free(NULL) is a no-op, so unallocated buffers need no guard. */
    free(comm->nrecv);
    free(comm->nsend);
    free(comm->nexch);
    free(comm->pbc_x);
    free(comm->pbc_y);
    free(comm->pbc_z);
    free(comm->boxes);
    free(comm->atom_send);
    free(comm->atom_recv);
    free(comm->off_atom_send);
    free(comm->off_atom_recv);
    free(comm->maxsendlist);

    /* The inner lists go first, then the array that holds them. */
    if (comm->sendlist != NULL) {
        for (int k = 0; k < comm->numneigh; k++) {
            free(comm->sendlist[k]);
        }
        free(comm->sendlist);
    }

    free(comm->buf_send);
    free(comm->buf_recv);
}
|
||||
490
common/grid.c
Normal file
490
common/grid.c
Normal file
@@ -0,0 +1,490 @@
|
||||
#include <stdio.h>
|
||||
#include <grid.h>
|
||||
#include <mpi.h>
|
||||
#include <parameter.h>
|
||||
#include <allocate.h>
|
||||
#include <util.h>
|
||||
#include <math.h>
|
||||
|
||||
/* MPI datatype used for MD_FLOAT buffers: MPI_FLOAT when MD_FLOAT is 4 bytes wide, MPI_DOUBLE otherwise. */
static MPI_Datatype type = (sizeof(MD_FLOAT) == 4) ? MPI_FLOAT : MPI_DOUBLE;
|
||||
|
||||
//Gromacs Balancing
|
||||
/*
 * Clamp every entry of x from below by minx, store the result in fx and
 * scale fx so that its entries sum to one.
 *
 * x      input values, nprocs entries (not modified)
 * fx     output values, nprocs entries
 * minx   lower bound applied to every entry before normalising
 * nprocs number of entries
 *
 * Returns the sum of the clamped values before normalisation. The
 * original fell off the end of a non-void function without returning.
 */
MD_FLOAT f_normalization(MD_FLOAT* x, MD_FLOAT* fx, MD_FLOAT minx, int nprocs)
{
    MD_FLOAT sum = 0;

    for (int n = 0; n < nprocs; n++) {
        fx[n] = MAX(minx, x[n]);
        sum += fx[n];
    }

    for (int n = 0; n < nprocs; n++) {
        fx[n] /= sum;
    }

    return sum;
}
|
||||
|
||||
/*
 * Damped fixed-point iteration on the load fractions x0 (nprocs entries,
 * updated in place).
 *
 * Each step computes fx = f_normalization(x0) with lower bound minx,
 * blends it with the current iterate using weight alpha, and stores the
 * blend back into x0. The loop stops when no entry moved by tolerance or
 * more in a step, or after maxIterations steps.
 *
 * The scratch array is now released on every exit path; the original
 * leaked it on each call.
 */
void fixedPointIteration(MD_FLOAT* x0, int nprocs, MD_FLOAT minx)
{
    const MD_FLOAT tolerance = 1e-3;
    const MD_FLOAT alpha = 0.5;
    const int maxIterations = 100;

    MD_FLOAT* fx = (MD_FLOAT*) malloc(nprocs * sizeof(MD_FLOAT));
    if (fx == NULL) {
        fprintf(stderr, "fixedPointIteration: could not allocate %d entries\n", nprocs);
        return;
    }

    for (int i = 0; i < maxIterations; i++) {
        int converged = 1;

        f_normalization(x0, fx, minx, nprocs);

        /* Damping: move only a fraction alpha towards the normalised value. */
        for (int n = 0; n < nprocs; n++) {
            fx[n] = (1 - alpha) * x0[n] + alpha * fx[n];
        }

        for (int n = 0; n < nprocs; n++) {
            if (fabs(fx[n] - x0[n]) >= tolerance) {
                converged = 0;
                break;
            }
        }

        /* The new iterate is stored even on the converging step. */
        for (int n = 0; n < nprocs; n++) {
            x0[n] = fx[n];
        }

        if (converged) break;
    }

    free(fx);
}
|
||||
|
||||
/*
 * Staggered-grid load balancing.
 *
 * The time elapsed since the previous call (newTime - grid->Timer) is used
 * as the load of this process. Loads are gathered along z, then y, then x
 * through three sub-communicators; at each level the root normalises the
 * loads, and its sum is the value passed on to the next level. New cell
 * limits are then computed per dimension (cell size inversely proportional
 * to the smoothed load), scattered back, and stored in atom->mybox and
 * grid->map. Finally grid->cutneigh is widened by the largest boundary
 * shift seen on any process.
 *
 * Fixes relative to the original:
 *  - the elapsed time is kept in an MD_FLOAT so the buffer given to
 *    MPI_Gather matches the `type` datatype when MD_FLOAT is float;
 *  - cellSize is freed (it leaked on every call);
 *  - unused locals removed.
 */
void staggeredBalance(Grid* grid, Atom* atom, Parameter* param, double newTime)
{
    int me;
    MPI_Comm_rank(MPI_COMM_WORLD, &me);
    int* coord  = grid->coord;
    int* nprocs = grid->nprocs;

    /* Elapsed time since the last rebalance. */
    MD_FLOAT elapsed = (MD_FLOAT) (newTime - grid->Timer);
    grid->Timer = newTime;

    /* Remember the old box to measure how far the boundaries move. */
    MD_FLOAT lo[3], hi[3];
    for (int dim = 0; dim < 3; dim++) {
        lo[dim] = atom->mybox.lo[dim];
        hi[dim] = atom->mybox.hi[dim];
    }

    MPI_Comm subComm[3];
    int color[3] = { 0, 0, 0 };
    int id[3]    = { 0, 0, 0 };
    MD_FLOAT** load = (MD_FLOAT**) malloc(3 * sizeof(MD_FLOAT*));
    for (int dim = 0; dim < 3; dim++) {
        load[dim] = (MD_FLOAT*) malloc(nprocs[dim] * sizeof(MD_FLOAT));
    }

    int maxprocs = MAX(MAX(nprocs[_x], nprocs[_y]), nprocs[_z]);
    MD_FLOAT* cellSize = (MD_FLOAT*) malloc(maxprocs * sizeof(MD_FLOAT));
    /* limits: (x0, x1), (x1, x2), ... Interior values are repeated so that
       MPI_Scatter can hand two values to every process. */
    MD_FLOAT* limits = (MD_FLOAT*) malloc(2 * maxprocs * sizeof(MD_FLOAT));
    MD_FLOAT t_sum[3]        = { 0, 0, 0 };
    MD_FLOAT recv_buf[2]     = { 0, 0 };             /* lo and hi of one dimension */
    MD_FLOAT balancedLoad[3] = { 0, 0, 0 };          /* 1/nprocs */
    MD_FLOAT minLoad[3]      = { 0, 0, 0 };          /* 0.8 * (1/nprocs) */
    MD_FLOAT prd[3]          = { param->xprd, param->yprd, param->zprd };
    MD_FLOAT boundaries[6]   = { 0, 0, 0, 0, 0, 0 }; /* xlo,xhi,ylo,yhi,zlo,zhi */

    /* Create the sub-communicators along each dimension. */
    for (int dim = 0; dim < 3; dim++) {
        if (dim == _x) {
            color[_x] = (coord[_y] == 0 && coord[_z] == 0) ? 1 : MPI_UNDEFINED;
            id[_x]    = me;
        } else if (dim == _y) {
            color[_y] = coord[_z] == 0 ? coord[_x] : MPI_UNDEFINED;
            id[_y]    = (coord[_y] == 0 && coord[_z] == 0) ? 0 : me;
        } else {
            color[_z] = coord[_y] * nprocs[_x] + coord[_x];
            id[_z]    = coord[_z] == 0 ? 0 : me;
        }
        MPI_Comm_split(world, color[dim], id[dim], &subComm[dim]);
    }

    /* Balanced and minimum load fraction per dimension. */
    for (int dim = 0; dim < 3; dim++) {
        balancedLoad[dim] = 1. / nprocs[dim];
        minLoad[dim]      = 0.8 * balancedLoad[dim];
    }

    /* Gather the workload in reverse dimension order. */
    for (int dim = _z; dim >= _x; dim--) {
        if (subComm[dim] != MPI_COMM_NULL) {
            MPI_Gather(&elapsed, 1, type, load[dim], 1, type, 0, subComm[dim]);

            if (id[dim] == 0) {
                for (int n = 0; n < nprocs[dim]; n++) {
                    t_sum[dim] += load[dim][n];
                }
                for (int n = 0; n < nprocs[dim]; n++) {
                    load[dim][n] /= t_sum[dim];
                }
            }
            /* The sum of this level is what gets gathered at the next one. */
            elapsed = t_sum[dim];
        }
        MPI_Barrier(world);
    }

    /* Broadcast the new boundaries along the dimensions. */
    for (int dim = 0; dim < 3; dim++) {
        if (subComm[dim] != MPI_COMM_NULL) {
            MPI_Bcast(boundaries, 6, type, 0, subComm[dim]);

            if (id[dim] == 0) {
                fixedPointIteration(load[dim], nprocs[dim], minLoad[dim]);

                MD_FLOAT inv_sum = 0;
                for (int n = 0; n < nprocs[dim]; n++) {
                    inv_sum += (1 / load[dim][n]);
                }

                /* Cell size is inversely proportional to the load fraction. */
                for (int n = 0; n < nprocs[dim]; n++) {
                    cellSize[n] = (prd[dim] / load[dim][n]) * (1. / inv_sum);
                }

                MD_FLOAT sum = 0;
                for (int n = 0; n < nprocs[dim]; n++) {
                    limits[2 * n]     = sum;
                    limits[2 * n + 1] = sum + cellSize[n];
                    sum += cellSize[n];
                }
                /* Pin the last upper limit to the domain size. */
                limits[2 * nprocs[dim] - 1] = prd[dim];
            }

            MPI_Scatter(limits, 2, type, recv_buf, 2, type, 0, subComm[dim]);
            boundaries[2 * dim]     = recv_buf[0];
            boundaries[2 * dim + 1] = recv_buf[1];
        }
        MPI_Barrier(world);
    }

    atom->mybox.lo[_x] = boundaries[0]; atom->mybox.hi[_x] = boundaries[1];
    atom->mybox.lo[_y] = boundaries[2]; atom->mybox.hi[_y] = boundaries[3];
    atom->mybox.lo[_z] = boundaries[4]; atom->mybox.hi[_z] = boundaries[5];

    MD_FLOAT domain[6] = { boundaries[0], boundaries[2], boundaries[4],
                           boundaries[1], boundaries[3], boundaries[5] };
    MPI_Allgather(domain, 6, type, grid->map, 6, type, world);

    /* Cells change dynamically, so the neighbour exchange region has to grow
       by the largest boundary displacement. */
    for (int dim = _x; dim <= _z; dim++) {
        MD_FLOAT dr, dr_max;
        dr = MAX(fabs(lo[dim] - atom->mybox.lo[dim]), fabs(hi[dim] - atom->mybox.hi[dim]));
        MPI_Allreduce(&dr, &dr_max, 1, type, MPI_MAX, world);
        grid->cutneigh[dim] = param->cutneigh + dr_max;
    }

    for (int dim = 0; dim < 3; dim++) {
        if (subComm[dim] != MPI_COMM_NULL) {
            MPI_Comm_free(&subComm[dim]);
        }
        free(load[dim]);
    }
    free(load);
    free(cellSize);
    free(limits);
}
|
||||
|
||||
//RCB Balancing
|
||||
/*
 * Bisection position weighted by time: the local sum of atom_pos() is
 * multiplied by time, the local weight is Nlocal * time, both are summed
 * over subComm and the quotient is returned.
 *
 * The parameter names `atom` and `dim` are kept because the atom_pos()
 * macro may refer to them.
 */
MD_FLOAT meanTimeBisect(Atom *atom, MPI_Comm subComm, int dim, double time)
{
    MD_FLOAT localSum = 0, globalSum = 0;
    MD_FLOAT localWeight = 0, globalWeight = 0;
    MD_FLOAT result = 0;

    int k = 0;
    while (k < atom->Nlocal) {
        localSum += atom_pos(k);
        k++;
    }
    localSum *= time;
    localWeight = atom->Nlocal * time;

    MPI_Allreduce(&localSum, &globalSum, 1, type, MPI_SUM, subComm);
    MPI_Allreduce(&localWeight, &globalWeight, 1, type, MPI_SUM, subComm);

    result = globalSum / globalWeight;
    return result;
}
|
||||
|
||||
/*
 * Bisection position as the plain mean: the sum of atom_pos() over all
 * atoms of subComm divided by the number of atoms in subComm. The time
 * argument is not used.
 *
 * The parameter names `atom` and `dim` are kept because the atom_pos()
 * macro may refer to them.
 */
MD_FLOAT meanBisect(Atom* atom, MPI_Comm subComm, int dim, double time)
{
    int globalCount = 0;
    MD_FLOAT localSum = 0, globalSum = 0, result = 0;

    int k = 0;
    while (k < atom->Nlocal) {
        localSum += atom_pos(k);
        k++;
    }

    MPI_Allreduce(&localSum, &globalSum, 1, type, MPI_SUM, subComm);
    MPI_Allreduce(&atom->Nlocal, &globalCount, 1, MPI_INT, MPI_SUM, subComm);

    result = globalSum / globalCount;
    return result;
}
|
||||
|
||||
/*
 * Perform one level of the recursive bisection inside subComm.
 *
 * method() yields the cut position along dim. Ranks below half keep the
 * lower part (hi[dim] = cut, branch 0); the others keep the upper part
 * (lo[dim] = cut, branch 1). The branch bit is OR-ed into *color at bit
 * ilevel. Atoms that end up outside the new box are packed and sent to the
 * partner rank; with an odd communicator size the last rank sends to
 * rank 0 and receives nothing.
 *
 * Fix relative to the original: MPI_Waitall is given MPI_STATUSES_IGNORE,
 * the constant defined for array-of-statuses arguments.
 *
 * The parameter names `atom` and `dim` are kept because the atom_pos()
 * macro may refer to them.
 */
void nextBisectionLevel(Grid* grid, Atom* atom, RCB_Method method, MPI_Comm subComm, int dim, int* color, int ilevel, double time)
{
    int rank, size;
    int branch = 0, i = 0, m = 0;
    int nsend = 0, nrecv = 0, nrecv2 = 0;
    int values_per_atom = 7;
    MD_FLOAT bisection, pos;
    MPI_Request request[2] = { MPI_REQUEST_NULL, MPI_REQUEST_NULL };

    MPI_Comm_rank(subComm, &rank);
    MPI_Comm_size(subComm, &size);

    int odd       = size % 2;
    int extraProc = odd ? size - 1 : size;
    int half      = (int) (0.5 * size);
    int partner   = (rank < half) ? rank + half : rank - half;
    if (odd && rank == extraProc) partner = 0;

    /* Apply the bisection. */
    bisection = method(atom, subComm, dim, time);

    /* Define the new boundaries. */
    if (rank < half) {
        atom->mybox.hi[dim] = bisection;
        branch = 0;
    } else {
        atom->mybox.lo[dim] = bisection;
        branch = 1;
    }

    /* New color for the next communicator split. */
    *color = (branch << ilevel) | *color;

    /* Grow the send buffer so that all local atoms would fit. */
    if (atom->Nlocal >= grid->maxsend) {
        if (grid->buf_send) free(grid->buf_send);
        grid->buf_send = (MD_FLOAT*) malloc(atom->Nlocal * values_per_atom * sizeof(MD_FLOAT));
        grid->maxsend  = atom->Nlocal;
    }

    /* Pack and remove the atoms that are now outside the box along dim. */
    while (i < atom->Nlocal) {
        pos = atom_pos(i);
        if (pos < atom->mybox.lo[dim] || pos >= atom->mybox.hi[dim]) {
            nsend += packExchange(atom, i, &grid->buf_send[nsend]);
            /* Overwrite slot i with the last atom and re-test slot i. */
            copy(atom, i, atom->Nlocal - 1);
            atom->Nlocal--;
        } else {
            i++;
        }
    }

    /* Communicate the number of values to be sent. */
    if (rank < extraProc) {
        MPI_Irecv(&nrecv, 1, MPI_INT, partner, 0, subComm, &request[0]);
    }
    if (odd && rank == 0) {
        MPI_Irecv(&nrecv2, 1, MPI_INT, extraProc, 0, subComm, &request[1]);
    }
    MPI_Send(&nsend, 1, MPI_INT, partner, 0, subComm);
    /* Unused slots hold MPI_REQUEST_NULL, which MPI_Waitall skips. */
    MPI_Waitall(2, request, MPI_STATUSES_IGNORE);

    /* Grow the receive buffer. */
    if (nrecv + nrecv2 >= grid->maxrecv) {
        if (grid->buf_recv) free(grid->buf_recv);
        grid->buf_recv = (MD_FLOAT*) malloc((nrecv + nrecv2) * values_per_atom * sizeof(MD_FLOAT));
        grid->maxrecv  = nrecv + nrecv2;
    }

    /* Communicate the packed atoms. */
    request[0] = MPI_REQUEST_NULL;
    request[1] = MPI_REQUEST_NULL;

    if (rank < extraProc) {
        MPI_Irecv(grid->buf_recv, nrecv, type, partner, 0, subComm, &request[0]);
    }
    if (odd && rank == 0) {
        MPI_Irecv(&grid->buf_recv[nrecv], nrecv2, type, extraProc, 0, subComm, &request[1]);
    }
    MPI_Send(grid->buf_send, nsend, type, partner, 0, subComm);
    MPI_Waitall(2, request, MPI_STATUSES_IGNORE);

    /* Append the received atoms to the local atom list. */
    while (m < nrecv + nrecv2) {
        m += unpackExchange(atom, atom->Nlocal++, &grid->buf_recv[m]);
    }
}
|
||||
|
||||
/*
 * Recursive coordinate bisection over all processes.
 *
 * Starting from the full domain, the world communicator is split level by
 * level according to `color`; every sub-communicator with more than one
 * process is bisected by nextBisectionLevel() along the dimensions in
 * order of decreasing domain extent, cycling through the first ndim of
 * them. Afterwards grid->Timer, grid->map and grid->cutneigh are updated.
 *
 * Fixes relative to the original:
 *  - the extents are kept as MD_FLOAT (they were truncated to int before
 *    being compared, so extents differing by less than one unit could be
 *    ordered wrongly);
 *  - the swap temporary for the dimension indices is an int;
 *  - the box count per level uses an integer shift instead of pow().
 */
void rcbBalance(Grid* grid, Atom* atom, Parameter* param, RCB_Method method, int ndim, double newTime)
{
    int me, nprocs = 0, ilevel = 0, nboxes = 1;
    int color = 0, size = 0;
    int index;
    MD_FLOAT prd[3];
    MPI_Comm subComm;

    MPI_Comm_size(world, &nprocs);
    MPI_Comm_rank(world, &me);

    /* Elapsed time since the last dynamic balance. */
    double time = newTime - grid->Timer;

    prd[_x] = atom->mybox.xprd = param->xprd;
    prd[_y] = atom->mybox.yprd = param->yprd;
    prd[_z] = atom->mybox.zprd = param->zprd;

    /* Order the dimensions by decreasing extent. */
    int largerDim[3] = { _x, _y, _z };
    for (int i = 0; i < 2; i++) {
        for (int j = i + 1; j < 3; j++) {
            if (prd[largerDim[j]] > prd[largerDim[i]]) {
                int tmp      = largerDim[j];
                largerDim[j] = largerDim[i];
                largerDim[i] = tmp;
            }
        }
    }

    /* Initial partition: every process owns the whole domain. */
    atom->mybox.lo[_x] = 0; atom->mybox.hi[_x] = atom->mybox.xprd;
    atom->mybox.lo[_y] = 0; atom->mybox.hi[_y] = atom->mybox.yprd;
    atom->mybox.lo[_z] = 0; atom->mybox.hi[_z] = atom->mybox.zprd;

    /* Recursion tree: level L yields 2^L boxes. */
    while (nboxes < nprocs) {
        index = ilevel % ndim;
        MPI_Comm_split(world, color, me, &subComm);
        MPI_Comm_size(subComm, &size);
        if (size > 1) {
            nextBisectionLevel(grid, atom, method, subComm, largerDim[index], &color, ilevel, time);
        }
        MPI_Comm_free(&subComm);
        nboxes = 1 << (++ilevel);
    }

    /* Set the new timer of the grid. */
    grid->Timer = newTime;

    /* Create the global map. */
    MD_FLOAT domain[6] = { atom->mybox.lo[_x], atom->mybox.lo[_y], atom->mybox.lo[_z],
                           atom->mybox.hi[_x], atom->mybox.hi[_y], atom->mybox.hi[_z] };
    MPI_Allgather(domain, 6, type, grid->map, 6, type, world);

    /* Same neighbour cutoff in all dimensions for the exchange communication. */
    for (int dim = _x; dim <= _z; dim++) {
        grid->cutneigh[dim] = param->cutneigh;
    }
}
|
||||
|
||||
//Regular grid
|
||||
/*
 * Regular decomposition: lay the processes out on a periodic 3D Cartesian
 * grid and give each one an equally sized box with the origin at (0,0,0).
 *
 * Fills grid->nprocs, grid->coord, grid->map and grid->cutneigh, and the
 * xprd/yprd/zprd, lo and hi fields of box.
 */
void cartisian3d(Grid* grid, Parameter* param, Box* box)
{
    const int axes[3] = { _x, _y, _z };
    int rank, worldSize;
    int ndims = 3;
    int allowReorder = 0;
    int periodic[3] = { 1, 1, 1 };
    int coords[3]   = { 0, 0, 0 };
    int dims[3]     = { 0, 0, 0 };
    MD_FLOAT cell[3];
    MPI_Comm cart;

    MPI_Comm_size(MPI_COMM_WORLD, &worldSize);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    box->xprd = param->xprd;
    box->yprd = param->yprd;
    box->zprd = param->zprd;

    /* Let MPI choose the process grid and look up this rank's coordinates. */
    MPI_Dims_create(worldSize, ndims, dims);
    MPI_Cart_create(world, ndims, dims, periodic, allowReorder, &cart);
    MPI_Cart_coords(cart, rank, 3, coords);

    /* Edge length of one box along each axis. */
    cell[_x] = param->xprd / dims[_x];
    cell[_y] = param->yprd / dims[_y];
    cell[_z] = param->zprd / dims[_z];

    for (int k = 0; k < 3; k++) {
        const int d = axes[k];
        grid->nprocs[d] = dims[d];
        grid->coord[d]  = coords[d];
        box->lo[d] = coords[d] * cell[d];
        box->hi[d] = (coords[d] + 1) * cell[d];
    }

    /* Publish this box: the three lower bounds followed by the three upper bounds. */
    MD_FLOAT domain[6] = { box->lo[_x], box->lo[_y], box->lo[_z],
                           box->hi[_x], box->hi[_y], box->hi[_z] };
    MPI_Allgather(domain, 6, type, grid->map, 6, type, world);
    MPI_Comm_free(&cart);

    /* Same neighbour cutoff in all dimensions for the exchange communication. */
    for (int k = 0; k < 3; k++) {
        grid->cutneigh[axes[k]] = param->cutneigh;
    }
}
|
||||
|
||||
//Other Functions from the grid
|
||||
/*
 * Allocate the global box map (six values per process) and reset the
 * state used by the RCB and staggered balancers.
 */
void initGrid(Grid* grid)
{
    int worldSize;
    MPI_Comm_size(world, &worldSize);

    grid->map_size = 6 * worldSize;
    grid->map = (MD_FLOAT*) allocate(ALIGNMENT, grid->map_size * sizeof(MD_FLOAT));

    /* RCB: exchange buffers are allocated on demand. */
    grid->buf_send = NULL;
    grid->buf_recv = NULL;
    grid->maxsend  = 0;
    grid->maxrecv  = 0;

    /* Staggered: reference time of the last rebalance. */
    grid->Timer = 0.;
}
|
||||
|
||||
/*
 * Build the initial regular grid and drop every atom that lies outside
 * this process' box.
 *
 * When param->input_file is set, the coordinates are first shifted by
 * (param->xlo, param->ylo, param->zlo) so the domain origin is (0,0,0).
 * When param->balance is not set, the global atom count is summed into
 * atom->Natoms and each process prints its counts.
 *
 * The parameter name `atom` is referenced by the atom_x/y/z macros.
 */
void setupGrid(Grid* grid, Atom* atom, Parameter* param)
{
    int rank;
    MD_FLOAT xlo, ylo, zlo, xhi, yhi, zhi;

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    initGrid(grid);

    /* Set the origin at (0,0,0). */
    if (param->input_file) {
        for (int k = 0; k < atom->Nlocal; k++) {
            atom_x(k) -= param->xlo;
            atom_y(k) -= param->ylo;
            atom_z(k) -= param->zlo;
        }
    }

    cartisian3d(grid, param, &atom->mybox);

    xlo = atom->mybox.lo[_x]; xhi = atom->mybox.hi[_x];
    ylo = atom->mybox.lo[_y]; yhi = atom->mybox.hi[_y];
    zlo = atom->mybox.lo[_z]; zhi = atom->mybox.hi[_z];

    /* Keep atoms inside [lo, hi); overwrite the others with the last atom. */
    int k = 0;
    while (k < atom->Nlocal) {
        int inside = atom_x(k) >= xlo && atom_x(k) < xhi &&
                     atom_y(k) >= ylo && atom_y(k) < yhi &&
                     atom_z(k) >= zlo && atom_z(k) < zhi;
        if (!inside) {
            /* Slot k now holds a different atom, so it is tested again. */
            copy(atom, k, atom->Nlocal - 1);
            atom->Nlocal--;
            continue;
        }
        k++;
    }

    if (!param->balance) {
        MPI_Allreduce(&atom->Nlocal, &atom->Natoms, 1, MPI_INT, MPI_SUM, world);
        printf("Processor:%i, Local atoms:%i, Total atoms:%i\n", rank, atom->Nlocal, atom->Natoms);
        MPI_Barrier(world);
    }
}
|
||||
|
||||
/*
 * Print the box of every process from rank 0, then synchronise all ranks.
 * Process i owns grid->map[6*i .. 6*i+5]: three lower bounds followed by
 * three upper bounds.
 */
void printGrid(Grid* grid)
{
    int rank, worldSize;
    MPI_Comm_size(world, &worldSize);
    MPI_Comm_rank(world, &rank);

    if (rank == 0) {
        printf("GRID:\n");
        printf("===================================================================================================\n");
        for (int p = 0; p < worldSize; p++) {
            const MD_FLOAT* entry = &grid->map[6 * p];
            printf("Box:%i\txlo:%.4f\txhi:%.4f\tylo:%.4f\tyhi:%.4f\tzlo:%.4f\tzhi:%.4f\n", p, entry[0], entry[3], entry[1], entry[4], entry[2], entry[5]);
        }
        printf("\n\n");
    }

    MPI_Barrier(world);
}
|
||||
|
||||
|
||||
|
||||
22
common/includes/box.h
Normal file
22
common/includes/box.h
Normal file
@@ -0,0 +1,22 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <parameter.h>
|
||||
|
||||
#ifndef __BOX_H_
|
||||
#define __BOX_H_
|
||||
|
||||
typedef struct {
|
||||
int id;
|
||||
MD_FLOAT xprd, yprd, zprd; //Domain Dimension
|
||||
MD_FLOAT lo[3]; //smallest coordinate of my subdomain
|
||||
MD_FLOAT hi[3]; //Highest coordinate of my subdomain
|
||||
} Box;
|
||||
|
||||
int overlapBox(int, int , const Box*, const Box* , Box* , MD_FLOAT , MD_FLOAT);
|
||||
int overlapFullBox(Parameter*, MD_FLOAT*, const Box*, const Box*);
|
||||
void expandBox(int , const Box*, const Box* , Box* , MD_FLOAT);
|
||||
#endif
|
||||
104
common/includes/comm.h
Normal file
104
common/includes/comm.h
Normal file
@@ -0,0 +1,104 @@
|
||||
#include <atom.h>
|
||||
#include <parameter.h>
|
||||
#include <box.h>
|
||||
#include <grid.h>
|
||||
|
||||
#ifndef COMM_H
|
||||
#define COMM_H
|
||||
|
||||
#ifdef GROMACS
// Cluster-based scheme: the communicated unit is a j-cluster.
// NOTE(review): the *_SIZE values look like per-unit buffer widths (MD_FLOATs) — confirm against comm.c.
#define FORWARD_SIZE    (3*CLUSTER_N)
#define REVERSE_SIZE    (3*CLUSTER_N)
#define GHOST_SIZE      (4*CLUSTER_N+10)
#define EXCHANGE_SIZE   7

// Every expansion is fully parenthesized so the macros can be used inside
// larger expressions (e.g. `k * LOCAL`, `x / LOCAL`) without the surrounding
// operators regrouping the division.
#define JFAC            (MAX(1, CLUSTER_N / CLUSTER_M))
#define LOCAL           (atom->Nclusters_local / JFAC)
#define GHOST           (atom->Nclusters_ghost)

// True when the bounding box of j-cluster `cj` overlaps [lo, hi) on every axis.
// Expects `atom`, xlo/xhi, ylo/yhi and zlo/zhi to be in scope at the call site.
#define IsinRegionToSend(cj)                                                                \
    ((atom->jclusters[(cj)].bbminx >= xlo || atom->jclusters[(cj)].bbmaxx >= xlo) &&        \
     (atom->jclusters[(cj)].bbminx <  xhi || atom->jclusters[(cj)].bbmaxx <  xhi) &&        \
     (atom->jclusters[(cj)].bbminy >= ylo || atom->jclusters[(cj)].bbmaxy >= ylo) &&        \
     (atom->jclusters[(cj)].bbminy <  yhi || atom->jclusters[(cj)].bbmaxy <  yhi) &&        \
     (atom->jclusters[(cj)].bbminz >= zlo || atom->jclusters[(cj)].bbmaxz >= zlo) &&        \
     (atom->jclusters[(cj)].bbminz <  zhi || atom->jclusters[(cj)].bbmaxz <  zhi))

#else

// Atom-based scheme: the communicated unit is a single atom.
#define FORWARD_SIZE    3
#define REVERSE_SIZE    3
#define GHOST_SIZE      4
#define EXCHANGE_SIZE   7
#define LOCAL           (atom->Nlocal)
#define GHOST           (atom->Nghost)

// True when atom `i` lies inside [lo, hi) on every axis.
// Expects `atom` (via atom_x/y/z), xlo/xhi, ylo/yhi and zlo/zhi to be in scope.
#define IsinRegionToSend(i)                             \
    ((atom_x((i)) >= xlo && atom_x((i)) < xhi) &&       \
     (atom_y((i)) >= ylo && atom_y((i)) < yhi) &&       \
     (atom_z((i)) >= zlo && atom_z((i)) < zhi))

#endif
|
||||
|
||||
typedef struct {
|
||||
int myproc; // my proc ID
|
||||
int numproc; // # of processors
|
||||
|
||||
int numneigh; // # of all my neighs along all swaps
|
||||
int maxneigh; // Buffer size for my neighs
|
||||
int sendfrom[6]; //return the lowest neigh index to send in each swap
|
||||
int sendtill[6]; //return the highest neigh index to send in each swao
|
||||
int recvfrom[6]; //return the lowest neigh index to recv in each swap
|
||||
int recvtill[6]; //return the highest neigh index to recv in each swap
|
||||
int* nsend; // neigh whose I want to send
|
||||
int* nrecv; // neigh whose I want to recv
|
||||
|
||||
int* pbc_x; // if pbc in x
|
||||
int* pbc_y; // if pbc in y
|
||||
int* pbc_z; // if pbc in z
|
||||
|
||||
int* atom_send, *atom_recv; // # of atoms to send/recv for each of my neighs
|
||||
int* off_atom_send; // atom offset to send, inside of a swap
|
||||
int* off_atom_recv; // atom offset to recv, inside of a swap
|
||||
|
||||
int* nexch; //procs to exchange
|
||||
int numneighexch; //# of neighbours to exchange
|
||||
int maxneighexch; //max buff size to store neighbours
|
||||
|
||||
int numswap; // # of swaps to perform, it is 6
|
||||
int swapdim[6]; // dimension of the swap (_x, _y or _z)
|
||||
int swapdir[6]; // direction of the swap 0 or 1
|
||||
int swap[3][2]; // given a dim and dir, knows the swap
|
||||
int othersend[6]; // Determine if a proc interact with more procs in a given swap
|
||||
|
||||
int firstrecv[6]; // where to put 1st recv atom in each swap
|
||||
int** sendlist; // list of atoms to send in each swap
|
||||
int* maxsendlist; // max # of atoms send in each list-swap
|
||||
|
||||
int maxsend; // max elements in buff sender
|
||||
int maxrecv; // max elements in buff receiver
|
||||
MD_FLOAT* buf_send; // sender buffer for all comm
|
||||
MD_FLOAT* buf_recv; // receicer buffer for all comm
|
||||
|
||||
int forwardSize; // # of paramaters per atom in forward comm.
|
||||
int reverseSize; // # of parameters per atom in reverse
|
||||
int exchangeSize; // # of parameters per atom in exchange
|
||||
int ghostSize; // # of parameters per atom in ghost list
|
||||
|
||||
int iterAtom; //last atom to iterate in each swap.
|
||||
Box* boxes; // Boundaries to be sent to other procs as ghost.
|
||||
} Comm;
|
||||
|
||||
|
||||
void initComm(int*, char***, Comm*); //Init MPI
|
||||
void endComm(Comm*); //End MPI
|
||||
void setupComm(Comm*,Parameter*,Grid*); //Creates a 3d grid or rcb grid
|
||||
void neighComm(Comm*,Parameter*,Grid*); //Find neighbours within cut-off and defines ghost regions
|
||||
void forwardComm(Comm*,Atom*,int); //Send info in one direction
|
||||
void reverseComm(Comm*,Atom*,int); //Return info after forward communication
|
||||
void exchangeComm(Comm*,Atom*); //Exchange info between procs
|
||||
void ghostComm(Comm*, Atom*,int); //Build the ghost neighbours to send during next forwards
|
||||
void growSend(Comm*,int); //Grows the size of the buffer sender
|
||||
void growRecv(Comm*,int); //Grows the size of the buffer receiver
|
||||
void growList(Comm*, int, int); //Grows the size of the list to send
|
||||
#endif
|
||||
51
common/includes/grid.h
Normal file
51
common/includes/grid.h
Normal file
@@ -0,0 +1,51 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
|
||||
|
||||
#include <parameter.h>
|
||||
#include <box.h>
|
||||
#include <atom.h>
|
||||
#include <mpi.h>
|
||||
|
||||
#ifndef __MAP_H_
|
||||
#define __MAP_H_
|
||||
|
||||
#define world MPI_COMM_WORLD
|
||||
#define atom_pos(i) ((dim == _x) ? atom_x((i)) : (dim == _y) ? atom_y((i)) : atom_z((i)))
|
||||
|
||||
enum {RCB=1, meanTimeRCB, Staggered};
|
||||
|
||||
typedef struct {
|
||||
int balance_every;
|
||||
int map_size;
|
||||
MD_FLOAT* map;
|
||||
//===Param for Staggerd balance
|
||||
int nprocs[3];
|
||||
int coord[3];
|
||||
MD_FLOAT cutneigh[3];
|
||||
double Timer;
|
||||
//===Param for RCB balance
|
||||
MD_FLOAT* buf_send;
|
||||
MD_FLOAT* buf_recv;
|
||||
int maxsend;
|
||||
int maxrecv;
|
||||
} Grid;
|
||||
|
||||
|
||||
typedef MD_FLOAT(*RCB_Method)(Atom*,MPI_Comm,int,double);
|
||||
|
||||
void setupGrid(Grid*, Atom*, Parameter*);
|
||||
void cartisian3d(Grid*, Parameter*, Box*);
|
||||
void rcbBalance(Grid*, Atom*, Parameter* ,RCB_Method, int, double);
|
||||
void staggeredBalance(Grid*, Atom*, Parameter*, double);
|
||||
void printGrid(Grid*);
|
||||
//rcb methods
|
||||
MD_FLOAT meanBisect(Atom* , MPI_Comm, int, double);
|
||||
MD_FLOAT meanTimeBisect(Atom*, MPI_Comm, int, double);
|
||||
#endif
|
||||
|
||||
|
||||
@@ -21,6 +21,7 @@ typedef struct {
|
||||
char* input_file;
|
||||
char* vtk_file;
|
||||
char* xtc_file;
|
||||
char* write_atom_file;
|
||||
MD_FLOAT epsilon;
|
||||
MD_FLOAT sigma;
|
||||
MD_FLOAT sigma6;
|
||||
@@ -52,6 +53,10 @@ typedef struct {
|
||||
MD_FLOAT k_dn;
|
||||
MD_FLOAT gx, gy, gz;
|
||||
MD_FLOAT reflect_x, reflect_y, reflect_z;
|
||||
//MPI implementation
|
||||
int balance;
|
||||
int method;
|
||||
int balance_every;
|
||||
} Parameter;
|
||||
|
||||
void initParameter(Parameter*);
|
||||
|
||||
71
common/includes/shell_methods.h
Normal file
71
common/includes/shell_methods.h
Normal file
@@ -0,0 +1,71 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <limits.h>
|
||||
#include <math.h>
|
||||
#include <comm.h>
|
||||
#include <atom.h>
|
||||
#include <timing.h>
|
||||
#include <parameter.h>
|
||||
#include <util.h>
|
||||
|
||||
//static void addDummyCluster(Atom*);
|
||||
|
||||
/*
 * Run the forward communication swaps selected by param->method and return
 * the elapsed time as the difference of two getTimeStamp() readings.
 *
 *   halfShell  : swaps 0..4
 *   eightShell : swaps 0, 2, 4
 *   otherwise  : swaps 0..5
 */
double forward(Comm* comm, Atom *atom, Parameter* param){
    const double start = getTimeStamp();

    int limit = 6;      // one past the last swap index
    int step  = 1;      // stride between consecutive swaps

    if(param->method == halfShell) {
        limit = 5;
    } else if(param->method == eightShell) {
        step = 2;
    }

    for(int s = 0; s < limit; s += step) {
        forwardComm(comm, atom, s);
    }

    const double stop = getTimeStamp();
    return stop - start;
}
|
||||
|
||||
/*
 * Run the reverse communication swaps selected by param->method, in
 * descending swap order, and return the elapsed time as the difference of
 * two getTimeStamp() readings.
 *
 *   halfShell   : swaps 4..0
 *   eightShell  : swaps 4, 2, 0
 *   halfStencil : swaps 5..0
 *   otherwise   : no swaps (full shell reverse does nothing)
 */
double reverse(Comm* comm, Atom *atom, Parameter* param){
    const double start = getTimeStamp();

    int first = -1;     // highest swap index; -1 means the loop body never runs
    int step  = 1;      // stride between consecutive swaps

    switch(param->method) {
        case halfShell:
            first = 4;
            break;
        case eightShell:
            first = 4;
            step = 2;
            break;
        case halfStencil:
            first = 5;
            break;
        default:
            break;
    }

    for(int s = first; s >= 0; s -= step) {
        reverseComm(comm, atom, s);
    }

    const double stop = getTimeStamp();
    return stop - start;
}
|
||||
|
||||
/*
 * Reset the ghost counters on `atom` and call ghostComm for the swaps
 * selected by param->method.
 *
 *   halfShell  : swaps 0..4
 *   eightShell : swaps 0, 2, 4
 *   otherwise  : swaps 0..5
 */
void ghostNeighbor(Comm* comm, Atom* atom, Parameter* param)
{
    int limit = 6;      // one past the last swap index
    int step  = 1;      // stride between consecutive swaps

#ifdef GROMACS
    atom->Nclusters_ghost = 0;
#endif
    atom->Nghost = 0;

    if(param->method == halfShell) {
        limit = 5;
    } else if(param->method == eightShell) {
        step = 2;
    }

    for(int s = 0; s < limit; s += step) {
        ghostComm(comm, atom, s);
    }
}
|
||||
@@ -48,11 +48,13 @@ static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_S
|
||||
t2 = _mm256_permute2f128_pd(t0, t1, 0x21);
|
||||
t0 = _mm256_add_pd(t0, t2);
|
||||
t1 = _mm256_add_pd(t1, t2);
|
||||
t0 = _mm256_blend_pd(t0, t1, 0b1100);
|
||||
t0 = _mm256_blend_pd(t0, t1, 0xC);
|
||||
//t0 = _mm256_blend_pd(t0, t1, 0b1100);
|
||||
t1 = _mm256_add_pd(t0, _mm256_load_pd(m));
|
||||
_mm256_store_pd(m, t1);
|
||||
|
||||
t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101));
|
||||
t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0x5));
|
||||
//t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101));
|
||||
a0 = _mm256_castpd256_pd128(t0);
|
||||
a1 = _mm256_extractf128_pd(t0, 0x1);
|
||||
a0 = _mm_add_sd(a0, a1);
|
||||
@@ -91,7 +93,7 @@ static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1,
|
||||
}
|
||||
|
||||
// Functions used in LAMMPS kernel
|
||||
static inline MD_SIMD_FLOAT simd_gather(MD_SIMD_INT vidx, const MD_FLOAT *m, int s) { return _mm256_i32gather_pd(m, vidx, s); }
|
||||
#define simd_gather(vidx, m, s) _mm256_i32gather_pd(m, vidx, s);
|
||||
static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm_set1_epi32(scalar); }
|
||||
static inline MD_SIMD_INT simd_int_zero() { return _mm_setzero_si128(); }
|
||||
static inline MD_SIMD_INT simd_int_seq() { return _mm_set_epi32(3, 2, 1, 0); }
|
||||
|
||||
@@ -9,9 +9,15 @@
|
||||
|
||||
typedef enum {
|
||||
TOTAL = 0,
|
||||
NEIGH,
|
||||
FORCE,
|
||||
NEIGH,
|
||||
FORWARD,
|
||||
REVERSE,
|
||||
UPDATE,
|
||||
BALANCE,
|
||||
SETUP,
|
||||
REST,
|
||||
NUMTIMER
|
||||
} timertype;
|
||||
} timerComm;
|
||||
|
||||
#endif
|
||||
|
||||
@@ -7,8 +7,8 @@
|
||||
#ifndef __TIMING_H_
|
||||
#define __TIMING_H_
|
||||
|
||||
extern double getTimeStamp();
|
||||
extern double getTimeResolution();
|
||||
extern double getTimeStamp_();
|
||||
extern double getTimeStamp(void);
|
||||
extern double getTimeResolution(void);
|
||||
extern double getTimeStamp_(void);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -4,6 +4,8 @@
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <math.h>
|
||||
|
||||
#ifndef __UTIL_H_
|
||||
#define __UTIL_H_
|
||||
|
||||
@@ -35,12 +37,19 @@
|
||||
# define PRECISION_STRING "double"
|
||||
#endif
|
||||
|
||||
#define BigOrEqual(a,b) (fabs((a)-(b))<1e-9 || (a)>(b))
|
||||
#define Equal(a,b) (fabs((a)-(b))<1e-9)
|
||||
|
||||
enum {_x=0, _y, _z};
|
||||
enum {fullShell=0, halfShell, eightShell, halfStencil};
|
||||
|
||||
|
||||
extern double myrandom(int*);
|
||||
extern void random_reset(int *seed, int ibase, double *coord);
|
||||
extern int str2ff(const char *string);
|
||||
extern const char* ff2str(int ff);
|
||||
extern int get_num_threads();
|
||||
extern void readline(char *line, FILE *fp);
|
||||
extern void debug_printf(const char *format, ...);
|
||||
extern int get_cuda_num_threads();
|
||||
|
||||
#endif
|
||||
|
||||
@@ -11,12 +11,14 @@
|
||||
#include <atom.h>
|
||||
#include <parameter.h>
|
||||
#include <util.h>
|
||||
#include <mpi.h>
|
||||
|
||||
void initParameter(Parameter *param) {
|
||||
param->input_file = NULL;
|
||||
param->vtk_file = NULL;
|
||||
param->xtc_file = NULL;
|
||||
param->eam_file = NULL;
|
||||
param->write_atom_file = NULL;
|
||||
param->force_field = FF_LJ;
|
||||
param->epsilon = 1.0;
|
||||
param->sigma = 1.0;
|
||||
@@ -53,13 +55,17 @@ void initParameter(Parameter *param) {
|
||||
param->reflect_x = 0.0;
|
||||
param->reflect_y = 0.0;
|
||||
param->reflect_z = 0.0;
|
||||
//MPI
|
||||
param->balance = 0;
|
||||
param->method = 0;
|
||||
param->balance_every =param->reneigh_every;
|
||||
}
|
||||
|
||||
void readParameter(Parameter *param, const char *filename) {
|
||||
FILE *fp = fopen(filename, "r");
|
||||
char line[MAXLINE];
|
||||
int i;
|
||||
|
||||
|
||||
if(!fp) {
|
||||
fprintf(stderr, "Could not open parameter file: %s\n", filename);
|
||||
exit(-1);
|
||||
@@ -71,8 +77,8 @@ void readParameter(Parameter *param, const char *filename) {
|
||||
for(i = 0; line[i] != '\0' && line[i] != '#'; i++);
|
||||
line[i] = '\0';
|
||||
|
||||
char *tok = strtok(line, " ");
|
||||
char *val = strtok(NULL, " ");
|
||||
char *tok = strtok(line, "\t ");
|
||||
char *val = strtok(NULL, "\t ");
|
||||
|
||||
#define PARSE_PARAM(p,f) if(strncmp(tok, #p, sizeof(#p) / sizeof(#p[0]) - 1) == 0) { param->p = f(val); }
|
||||
#define PARSE_STRING(p) PARSE_PARAM(p, strdup)
|
||||
@@ -116,15 +122,20 @@ void readParameter(Parameter *param, const char *filename) {
|
||||
PARSE_INT(x_out_every);
|
||||
PARSE_INT(v_out_every);
|
||||
PARSE_INT(half_neigh);
|
||||
PARSE_INT(method);
|
||||
PARSE_INT(balance);
|
||||
PARSE_INT(balance_every);
|
||||
}
|
||||
}
|
||||
|
||||
// Update dtforce
|
||||
param->dtforce = 0.5 * param->dt;
|
||||
|
||||
// Update sigma6 parameter
|
||||
MD_FLOAT s2 = param->sigma * param->sigma;
|
||||
param->sigma6 = s2 * s2 * s2;
|
||||
|
||||
//Update balance parameter, 10 could be change
|
||||
param->balance_every *=param->reneigh_every;
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
@@ -169,6 +180,11 @@ void printParameter(Parameter *param) {
|
||||
printf("\tNumber of timesteps: %d\n", param->ntimes);
|
||||
printf("\tReport stats every (timesteps): %d\n", param->nstat);
|
||||
printf("\tReneighbor every (timesteps): %d\n", param->reneigh_every);
|
||||
#ifdef SORT_ATOMS
|
||||
printf("\tSort atoms when reneighboring: yes\n");
|
||||
#else
|
||||
printf("\tSort atoms when reneighboring: no\n");
|
||||
#endif
|
||||
printf("\tPrune every (timesteps): %d\n", param->prune_every);
|
||||
printf("\tOutput positions every (timesteps): %d\n", param->x_out_every);
|
||||
printf("\tOutput velocities every (timesteps): %d\n", param->v_out_every);
|
||||
@@ -177,4 +193,19 @@ void printParameter(Parameter *param) {
|
||||
printf("\tSkin: %e\n", param->skin);
|
||||
printf("\tHalf neighbor lists: %d\n", param->half_neigh);
|
||||
printf("\tProcessor frequency (GHz): %.4f\n", param->proc_freq);
|
||||
|
||||
// ================ New MPI features =============
|
||||
char str[20];
|
||||
strcpy(str, (param->method == 1) ? "Half Shell" :
|
||||
(param->method == 2) ? "Eight Shell" :
|
||||
(param->method == 3) ? "Half Stencil":
|
||||
"Full Shell");
|
||||
printf("\tMethod: %s\n", str);
|
||||
strcpy(str, (param->balance == 1) ? "mean RCB" :
|
||||
(param->balance == 2) ? "mean Time RCB" :
|
||||
(param->balance == 3) ? "Staggered" :
|
||||
"cartisian");
|
||||
printf("\tPartition: %s\n", str);
|
||||
if(param->balance)
|
||||
printf("\tRebalancing every (timesteps): %d\n",param->balance_every);
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
|
||||
#include <thermo.h>
|
||||
#include <util.h>
|
||||
#include <mpi.h>
|
||||
|
||||
static int *steparr;
|
||||
static MD_FLOAT *tmparr;
|
||||
@@ -24,6 +25,7 @@ static MD_FLOAT t_act;
|
||||
static MD_FLOAT p_act;
|
||||
static MD_FLOAT e_act;
|
||||
static int mstat;
|
||||
static MPI_Datatype type = (sizeof(MD_FLOAT) == 4) ? MPI_FLOAT : MPI_DOUBLE;
|
||||
|
||||
/* exported subroutines */
|
||||
void setupThermo(Parameter *param, int natoms)
|
||||
@@ -53,57 +55,73 @@ void setupThermo(Parameter *param, int natoms)
|
||||
|
||||
void computeThermo(int iflag, Parameter *param, Atom *atom)
|
||||
{
|
||||
MD_FLOAT t = 0.0, p;
|
||||
MD_FLOAT t_sum = 0.0, t = 0.0, p;
|
||||
int me;
|
||||
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
||||
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass;
|
||||
}
|
||||
|
||||
t = t * t_scale;
|
||||
p = (t * dof_boltz) * p_scale;
|
||||
int istep = iflag;
|
||||
MPI_Reduce(&t, &t_sum, 1, type, MPI_SUM, 0 ,MPI_COMM_WORLD);
|
||||
if(me == 0)
|
||||
{
|
||||
t = t_sum * t_scale;
|
||||
p = (t * dof_boltz) * p_scale;
|
||||
int istep = iflag;
|
||||
|
||||
if(iflag == -1){
|
||||
istep = param->ntimes;
|
||||
}
|
||||
if(iflag == 0){
|
||||
mstat = 0;
|
||||
}
|
||||
if(iflag == -1){
|
||||
istep = param->ntimes;
|
||||
}
|
||||
if(iflag == 0){
|
||||
mstat = 0;
|
||||
}
|
||||
|
||||
steparr[mstat] = istep;
|
||||
tmparr[mstat] = t;
|
||||
prsarr[mstat] = p;
|
||||
mstat++;
|
||||
fprintf(stdout, "%i\t%e\t%e\n", istep, t, p);
|
||||
steparr[mstat] = istep;
|
||||
tmparr[mstat] = t;
|
||||
prsarr[mstat] = p;
|
||||
mstat++;
|
||||
fprintf(stdout, "%i\t%e\t%e\n", istep, t, p);
|
||||
}
|
||||
}
|
||||
|
||||
void adjustThermo(Parameter *param, Atom *atom)
|
||||
{
|
||||
/* zero center-of-mass motion */
|
||||
MD_FLOAT vxtot = 0.0; MD_FLOAT vytot = 0.0; MD_FLOAT vztot = 0.0;
|
||||
|
||||
MD_FLOAT v_sum[3], vtot[3];
|
||||
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
vxtot += atom_vx(i);
|
||||
vytot += atom_vy(i);
|
||||
vztot += atom_vz(i);
|
||||
}
|
||||
|
||||
vtot[0] = vxtot; vtot[1] = vytot; vtot[2] = vztot;
|
||||
|
||||
vxtot = vxtot / atom->Natoms;
|
||||
vytot = vytot / atom->Natoms;
|
||||
vztot = vztot / atom->Natoms;
|
||||
MPI_Allreduce(vtot, v_sum, 3, type, MPI_SUM, MPI_COMM_WORLD);
|
||||
|
||||
vxtot = v_sum[0] / atom->Natoms;
|
||||
vytot = v_sum[1] / atom->Natoms;
|
||||
vztot = v_sum[2] / atom->Natoms;
|
||||
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
atom_vx(i) -= vxtot;
|
||||
atom_vy(i) -= vytot;
|
||||
atom_vz(i) -= vztot;
|
||||
}
|
||||
|
||||
t_act = 0;
|
||||
|
||||
MD_FLOAT t = 0.0;
|
||||
MD_FLOAT t_sum = 0.0;
|
||||
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass;
|
||||
}
|
||||
|
||||
MPI_Allreduce(&t, &t_sum, 1,type, MPI_SUM,MPI_COMM_WORLD);
|
||||
|
||||
t = t_sum;
|
||||
t *= t_scale;
|
||||
MD_FLOAT factor = sqrt(param->temp / t);
|
||||
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <util.h>
|
||||
#include <math.h>
|
||||
|
||||
/* Park/Miller RNG w/out MASKING, so as to be like f90s version */
|
||||
#define IA 16807
|
||||
@@ -79,13 +80,14 @@ const char* ff2str(int ff) {
|
||||
return "invalid";
|
||||
}
|
||||
|
||||
int get_num_threads() {
|
||||
int get_cuda_num_threads() {
|
||||
const char *num_threads_env = getenv("NUM_THREADS");
|
||||
return (num_threads_env == NULL) ? 32 : atoi(num_threads_env);
|
||||
}
|
||||
|
||||
void readline(char *line, FILE *fp) {
|
||||
if(fgets(line, MAXLINE, fp) == NULL) {
|
||||
printf("error %i\n",errno);
|
||||
if(errno != 0) {
|
||||
perror("readline()");
|
||||
exit(-1);
|
||||
|
||||
14
config.mk
14
config.mk
@@ -1,20 +1,22 @@
|
||||
# Compiler tag (GCC/CLANG/ICC/ICX/ONEAPI/NVCC)
|
||||
TAG ?= ICC
|
||||
TAG ?= MPIICC
|
||||
# Instruction set (SSE/AVX/AVX_FMA/AVX2/AVX512)
|
||||
ISA ?= AVX512
|
||||
# Optimization scheme (lammps/gromacs/clusters_per_bin)
|
||||
OPT_SCHEME ?= gromacs
|
||||
# Enable likwid (true or false)
|
||||
ENABLE_LIKWID ?= true
|
||||
ENABLE_LIKWID ?= false
|
||||
# SP or DP
|
||||
DATA_TYPE ?= DP
|
||||
# AOS or SOA
|
||||
DATA_LAYOUT ?= AOS
|
||||
DATA_LAYOUT ?= SOA
|
||||
# Assembly syntax to generate (ATT/INTEL)
|
||||
ASM_SYNTAX ?= ATT
|
||||
# Debug
|
||||
DEBUG ?= false
|
||||
|
||||
# Sort atoms when reneighboring (true or false)
|
||||
SORT_ATOMS ?= true
|
||||
# Explicitly store and load atom types (true or false)
|
||||
EXPLICIT_TYPES ?= false
|
||||
# Trace memory addresses for cache simulator (true or false)
|
||||
@@ -22,13 +24,13 @@ MEM_TRACER ?= false
|
||||
# Trace indexes and distances for gather-md (true or false)
|
||||
INDEX_TRACER ?= false
|
||||
# Compute statistics
|
||||
COMPUTE_STATS ?= true
|
||||
COMPUTE_STATS ?= false
|
||||
|
||||
# Configurations for lammps optimization scheme
|
||||
# Use omp simd pragma when running with half neighbor-lists
|
||||
ENABLE_OMP_SIMD ?= true
|
||||
ENABLE_OMP_SIMD ?= false
|
||||
# Use kernel with explicit SIMD intrinsics
|
||||
USE_SIMD_KERNEL ?= false
|
||||
USE_SIMD_KERNEL ?= true
|
||||
|
||||
# Configurations for gromacs optimization scheme
|
||||
# Use reference version
|
||||
|
||||
@@ -6,7 +6,7 @@ dt 0.001
|
||||
temp 80
|
||||
x_out_freq 500
|
||||
v_out_freq 5
|
||||
cutforce 0.9
|
||||
skin 0.05
|
||||
cutforce 1.8
|
||||
skin 0.1
|
||||
reneigh_every 100
|
||||
nstat 125000
|
||||
|
||||
Submodule gather-bench deleted from 2f654cb043
374
gromacs/atom.c
374
gromacs/atom.c
@@ -12,7 +12,8 @@
|
||||
#include <atom.h>
|
||||
#include <allocate.h>
|
||||
#include <util.h>
|
||||
|
||||
#include <mpi.h>
|
||||
|
||||
void initAtom(Atom *atom) {
|
||||
atom->x = NULL; atom->y = NULL; atom->z = NULL;
|
||||
atom->vx = NULL; atom->vy = NULL; atom->vz = NULL;
|
||||
@@ -27,6 +28,7 @@ void initAtom(Atom *atom) {
|
||||
atom->Nclusters = 0;
|
||||
atom->Nclusters_local = 0;
|
||||
atom->Nclusters_ghost = 0;
|
||||
atom->NmaxGhost = 0; //Temporal
|
||||
atom->Nclusters_max = 0;
|
||||
atom->type = NULL;
|
||||
atom->ntypes = 0;
|
||||
@@ -37,10 +39,19 @@ void initAtom(Atom *atom) {
|
||||
atom->iclusters = NULL;
|
||||
atom->jclusters = NULL;
|
||||
atom->icluster_bin = NULL;
|
||||
atom->PBCx = NULL;
|
||||
atom->PBCy = NULL;
|
||||
atom->PBCz = NULL;
|
||||
initMasks(atom);
|
||||
//MPI
|
||||
Box *mybox = &(atom->mybox);
|
||||
mybox->xprd = mybox->yprd = mybox->zprd = 0;
|
||||
mybox->lo[0] = mybox->lo[1] = mybox->lo[2] = 0;
|
||||
mybox->hi[0] = mybox->hi[1] = mybox->hi[2] = 0;
|
||||
}
|
||||
|
||||
void createAtom(Atom *atom, Parameter *param) {
|
||||
|
||||
MD_FLOAT xlo = 0.0; MD_FLOAT xhi = param->xprd;
|
||||
MD_FLOAT ylo = 0.0; MD_FLOAT yhi = param->yprd;
|
||||
MD_FLOAT zlo = 0.0; MD_FLOAT zhi = param->zprd;
|
||||
@@ -90,7 +101,7 @@ void createAtom(Atom *atom, Parameter *param) {
|
||||
ytmp = 0.5 * alat * j;
|
||||
ztmp = 0.5 * alat * k;
|
||||
|
||||
if(xtmp >= xlo && xtmp < xhi && ytmp >= ylo && ytmp < yhi && ztmp >= zlo && ztmp < zhi ) {
|
||||
if(xtmp >= xlo && xtmp < xhi && ytmp >= ylo && ytmp < yhi && ztmp >= zlo && ztmp < zhi ) {
|
||||
n = k * (2 * param->ny) * (2 * param->nx) + j * (2 * param->nx) + i + 1;
|
||||
for(m = 0; m < 5; m++) { myrandom(&n); }
|
||||
vxtmp = myrandom(&n);
|
||||
@@ -128,22 +139,26 @@ int type_str2int(const char *type) {
|
||||
}
|
||||
|
||||
/*
 * Dispatch to the reader matching the extension of param->input_file.
 *
 * Recognized extensions are ".pdb", ".gro" and ".dmp"; the return value of
 * the selected reader is passed through. For any other file name (including
 * names shorter than four characters) rank 0 prints an error message and
 * every rank terminates the process with exit(-1).
 */
int readAtom(Atom* atom, Parameter* param) {
    int me = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &me);
    int len = strlen(param->input_file);

    // Only look at the last four characters when the name is long enough;
    // indexing with len - 4 on a shorter name would read before the buffer.
    if(len >= 4) {
        const char *ext = &param->input_file[len - 4];
        if(strncmp(ext, ".pdb", 4) == 0) { return readAtom_pdb(atom, param); }
        if(strncmp(ext, ".gro", 4) == 0) { return readAtom_gro(atom, param); }
        if(strncmp(ext, ".dmp", 4) == 0) { return readAtom_dmp(atom, param); }
    }

    // Report once (rank 0) instead of once per MPI process.
    if(me==0) fprintf(stderr, "Invalid input file extension: %s\nValid choices are: pdb, gro, dmp\n", param->input_file);
    exit(-1);
    return -1;  // not reached; kept so every path has a return statement
}
|
||||
|
||||
int readAtom_pdb(Atom* atom, Parameter* param) {
|
||||
int me = 0;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
||||
FILE *fp = fopen(param->input_file, "r");
|
||||
char line[MAXLINE];
|
||||
int read_atoms = 0;
|
||||
|
||||
if(!fp) {
|
||||
fprintf(stderr, "Could not open input file: %s\n", param->input_file);
|
||||
if(me==0) fprintf(stderr, "Could not open input file: %s\n", param->input_file);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
@@ -153,11 +168,11 @@ int readAtom_pdb(Atom* atom, Parameter* param) {
|
||||
char *item = strtok(line, " ");
|
||||
if(strncmp(item, "CRYST1", 6) == 0) {
|
||||
param->xlo = 0.0;
|
||||
param->xhi = atof(strtok(NULL, " "));
|
||||
param->xhi = atof(strtok(NULL, "\t "));
|
||||
param->ylo = 0.0;
|
||||
param->yhi = atof(strtok(NULL, " "));
|
||||
param->yhi = atof(strtok(NULL, "\t "));
|
||||
param->zlo = 0.0;
|
||||
param->zhi = atof(strtok(NULL, " "));
|
||||
param->zhi = atof(strtok(NULL, "\t "));
|
||||
param->xprd = param->xhi - param->xlo;
|
||||
param->yprd = param->yhi - param->ylo;
|
||||
param->zprd = param->zhi - param->zlo;
|
||||
@@ -166,23 +181,23 @@ int readAtom_pdb(Atom* atom, Parameter* param) {
|
||||
char *label;
|
||||
int atom_id, comp_id;
|
||||
MD_FLOAT occupancy, charge;
|
||||
atom_id = atoi(strtok(NULL, " ")) - 1;
|
||||
atom_id = atoi(strtok(NULL, "\t ")) - 1;
|
||||
|
||||
while(atom_id + 1 >= atom->Nmax) {
|
||||
growAtom(atom);
|
||||
}
|
||||
|
||||
atom->type[atom_id] = type_str2int(strtok(NULL, " "));
|
||||
label = strtok(NULL, " ");
|
||||
comp_id = atoi(strtok(NULL, " "));
|
||||
atom_x(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_y(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_z(atom_id) = atof(strtok(NULL, " "));
|
||||
atom->type[atom_id] = type_str2int(strtok(NULL, "\t "));
|
||||
label = strtok(NULL, "\t ");
|
||||
comp_id = atoi(strtok(NULL, "\t "));
|
||||
atom_x(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_y(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_z(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom->vx[atom_id] = 0.0;
|
||||
atom->vy[atom_id] = 0.0;
|
||||
atom->vz[atom_id] = 0.0;
|
||||
occupancy = atof(strtok(NULL, " "));
|
||||
charge = atof(strtok(NULL, " "));
|
||||
occupancy = atof(strtok(NULL, "\t "));
|
||||
charge = atof(strtok(NULL, "\t "));
|
||||
atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
|
||||
atom->Natoms++;
|
||||
atom->Nlocal++;
|
||||
@@ -194,14 +209,14 @@ int readAtom_pdb(Atom* atom, Parameter* param) {
|
||||
strncmp(item, "ENDMDL", 6) == 0) {
|
||||
// Do nothing
|
||||
} else {
|
||||
fprintf(stderr, "Invalid item: %s\n", item);
|
||||
if(me==0) fprintf(stderr, "Invalid item: %s\n", item);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
if(!read_atoms) {
|
||||
fprintf(stderr, "Input error: No atoms read!\n");
|
||||
if(me==0) fprintf(stderr, "Input error: No atoms read!\n");
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
@@ -217,12 +232,15 @@ int readAtom_pdb(Atom* atom, Parameter* param) {
|
||||
atom->cutforcesq[i] = param->cutforce * param->cutforce;
|
||||
}
|
||||
|
||||
fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
|
||||
if(me==0) fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
|
||||
fclose(fp);
|
||||
return read_atoms;
|
||||
}
|
||||
|
||||
int readAtom_gro(Atom* atom, Parameter* param) {
|
||||
int me = 0;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
||||
|
||||
FILE *fp = fopen(param->input_file, "r");
|
||||
char line[MAXLINE];
|
||||
char desc[MAXLINE];
|
||||
@@ -231,7 +249,7 @@ int readAtom_gro(Atom* atom, Parameter* param) {
|
||||
int i = 0;
|
||||
|
||||
if(!fp) {
|
||||
fprintf(stderr, "Could not open input file: %s\n", param->input_file);
|
||||
if(me==0) fprintf(stderr, "Could not open input file: %s\n", param->input_file);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
@@ -241,25 +259,25 @@ int readAtom_gro(Atom* atom, Parameter* param) {
|
||||
desc[i] = '\0';
|
||||
readline(line, fp);
|
||||
atoms_to_read = atoi(strtok(line, " "));
|
||||
fprintf(stdout, "System: %s with %d atoms\n", desc, atoms_to_read);
|
||||
if(me==0) fprintf(stdout, "System: %s with %d atoms\n", desc, atoms_to_read);
|
||||
|
||||
while(!feof(fp) && read_atoms < atoms_to_read) {
|
||||
readline(line, fp);
|
||||
char *label = strtok(line, " ");
|
||||
int type = type_str2int(strtok(NULL, " "));
|
||||
int atom_id = atoi(strtok(NULL, " ")) - 1;
|
||||
char *label = strtok(line, "\t ");
|
||||
int type = type_str2int(strtok(NULL, "\t "));
|
||||
int atom_id = atoi(strtok(NULL, "\t ")) - 1;
|
||||
atom_id = read_atoms;
|
||||
while(atom_id + 1 >= atom->Nmax) {
|
||||
growAtom(atom);
|
||||
}
|
||||
|
||||
atom->type[atom_id] = type;
|
||||
atom_x(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_y(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_z(atom_id) = atof(strtok(NULL, " "));
|
||||
atom->vx[atom_id] = atof(strtok(NULL, " "));
|
||||
atom->vy[atom_id] = atof(strtok(NULL, " "));
|
||||
atom->vz[atom_id] = atof(strtok(NULL, " "));
|
||||
atom_x(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_y(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_z(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom->vx[atom_id] = atof(strtok(NULL, "\t "));
|
||||
atom->vy[atom_id] = atof(strtok(NULL, "\t "));
|
||||
atom->vz[atom_id] = atof(strtok(NULL, "\t "));
|
||||
atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
|
||||
atom->Natoms++;
|
||||
atom->Nlocal++;
|
||||
@@ -269,18 +287,18 @@ int readAtom_gro(Atom* atom, Parameter* param) {
|
||||
if(!feof(fp)) {
|
||||
readline(line, fp);
|
||||
param->xlo = 0.0;
|
||||
param->xhi = atof(strtok(line, " "));
|
||||
param->xhi = atof(strtok(line, "\t "));
|
||||
param->ylo = 0.0;
|
||||
param->yhi = atof(strtok(NULL, " "));
|
||||
param->yhi = atof(strtok(NULL, "\t "));
|
||||
param->zlo = 0.0;
|
||||
param->zhi = atof(strtok(NULL, " "));
|
||||
param->zhi = atof(strtok(NULL, "\t "));
|
||||
param->xprd = param->xhi - param->xlo;
|
||||
param->yprd = param->yhi - param->ylo;
|
||||
param->zprd = param->zhi - param->zlo;
|
||||
}
|
||||
|
||||
if(read_atoms != atoms_to_read) {
|
||||
fprintf(stderr, "Input error: Number of atoms read do not match (%d/%d).\n", read_atoms, atoms_to_read);
|
||||
if(me==0) fprintf(stderr, "Input error: Number of atoms read do not match (%d/%d).\n", read_atoms, atoms_to_read);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
@@ -296,12 +314,14 @@ int readAtom_gro(Atom* atom, Parameter* param) {
|
||||
atom->cutforcesq[i] = param->cutforce * param->cutforce;
|
||||
}
|
||||
|
||||
fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
|
||||
if(me==0) fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
|
||||
fclose(fp);
|
||||
return read_atoms;
|
||||
}
|
||||
|
||||
int readAtom_dmp(Atom* atom, Parameter* param) {
|
||||
int me = 0;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
||||
FILE *fp = fopen(param->input_file, "r");
|
||||
char line[MAXLINE];
|
||||
int natoms = 0;
|
||||
@@ -310,7 +330,7 @@ int readAtom_dmp(Atom* atom, Parameter* param) {
|
||||
int ts = -1;
|
||||
|
||||
if(!fp) {
|
||||
fprintf(stderr, "Could not open input file: %s\n", param->input_file);
|
||||
if(me==0) fprintf(stderr, "Could not open input file: %s\n", param->input_file);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
@@ -333,47 +353,47 @@ int readAtom_dmp(Atom* atom, Parameter* param) {
|
||||
}
|
||||
} else if(strncmp(item, "BOX BOUNDS pp pp pp", 19) == 0) {
|
||||
readline(line, fp);
|
||||
param->xlo = atof(strtok(line, " "));
|
||||
param->xhi = atof(strtok(NULL, " "));
|
||||
param->xlo = atof(strtok(line, "\t "));
|
||||
param->xhi = atof(strtok(NULL, "\t "));
|
||||
param->xprd = param->xhi - param->xlo;
|
||||
|
||||
readline(line, fp);
|
||||
param->ylo = atof(strtok(line, " "));
|
||||
param->yhi = atof(strtok(NULL, " "));
|
||||
param->ylo = atof(strtok(line, "\t "));
|
||||
param->yhi = atof(strtok(NULL, "\t "));
|
||||
param->yprd = param->yhi - param->ylo;
|
||||
|
||||
readline(line, fp);
|
||||
param->zlo = atof(strtok(line, " "));
|
||||
param->zhi = atof(strtok(NULL, " "));
|
||||
param->zlo = atof(strtok(line, "\t "));
|
||||
param->zhi = atof(strtok(NULL, "\t "));
|
||||
param->zprd = param->zhi - param->zlo;
|
||||
} else if(strncmp(item, "ATOMS id type x y z vx vy vz", 28) == 0) {
|
||||
for(int i = 0; i < natoms; i++) {
|
||||
readline(line, fp);
|
||||
atom_id = atoi(strtok(line, " ")) - 1;
|
||||
atom->type[atom_id] = atoi(strtok(NULL, " "));
|
||||
atom_x(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_y(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_z(atom_id) = atof(strtok(NULL, " "));
|
||||
atom->vx[atom_id] = atof(strtok(NULL, " "));
|
||||
atom->vy[atom_id] = atof(strtok(NULL, " "));
|
||||
atom->vz[atom_id] = atof(strtok(NULL, " "));
|
||||
atom_id = atoi(strtok(line, "\t ")) - 1;
|
||||
atom->type[atom_id] = atoi(strtok(NULL, "\t "));
|
||||
atom_x(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_y(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_z(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom->vx[atom_id] = atof(strtok(NULL, "\t "));
|
||||
atom->vy[atom_id] = atof(strtok(NULL, "\t "));
|
||||
atom->vz[atom_id] = atof(strtok(NULL, "\t "));
|
||||
atom->ntypes = MAX(atom->type[atom_id], atom->ntypes);
|
||||
read_atoms++;
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "Invalid item: %s\n", item);
|
||||
if(me==0) fprintf(stderr, "Invalid item: %s\n", item);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "Invalid input from file, expected item reference but got:\n%s\n", line);
|
||||
if(me==0) fprintf(stderr, "Invalid input from file, expected item reference but got:\n%s\n", line);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
if(ts < 0 || !natoms || !read_atoms) {
|
||||
fprintf(stderr, "Input error: atom data was not read!\n");
|
||||
if(me==0) fprintf(stderr, "Input error: atom data was not read!\n");
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
@@ -389,7 +409,7 @@ int readAtom_dmp(Atom* atom, Parameter* param) {
|
||||
atom->cutforcesq[i] = param->cutforce * param->cutforce;
|
||||
}
|
||||
|
||||
fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
|
||||
if(me==0) fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
|
||||
fclose(fp);
|
||||
return natoms;
|
||||
}
|
||||
@@ -530,3 +550,249 @@ void growClusters(Atom *atom) {
|
||||
atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
atom->cl_type = (int*) reallocate(atom->cl_type, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * sizeof(int), nold * CLUSTER_M * sizeof(int));
|
||||
}
|
||||
|
||||
/* MPI added*/
|
||||
/*
 * Enlarge the per-ghost-cluster periodic-boundary tracking arrays
 * (atom->PBCx, atom->PBCy, atom->PBCz) by DELTA entries.
 *
 * If at least one of the three arrays already exists, all three are resized
 * through reallocate(); otherwise fresh arrays are obtained with malloc().
 * atom->NmaxGhost is updated to the new capacity (in elements).
 */
void growPbc(Atom* atom) {
    const int prev_capacity = atom->NmaxGhost;
    atom->NmaxGhost += DELTA;

    const int have_arrays = (atom->PBCx || atom->PBCy || atom->PBCz);

    if(!have_arrays) {
        /* First call: nothing to preserve, plain allocation. */
        atom->PBCx = (int*) malloc(atom->NmaxGhost * sizeof(int));
        atom->PBCy = (int*) malloc(atom->NmaxGhost * sizeof(int));
        atom->PBCz = (int*) malloc(atom->NmaxGhost * sizeof(int));
        return;
    }

    /* Resize, handing reallocate() both the new and the old size in bytes. */
    atom->PBCx = (int*) reallocate(atom->PBCx, ALIGNMENT, atom->NmaxGhost * sizeof(int), prev_capacity * sizeof(int));
    atom->PBCy = (int*) reallocate(atom->PBCy, ALIGNMENT, atom->NmaxGhost * sizeof(int), prev_capacity * sizeof(int));
    atom->PBCz = (int*) reallocate(atom->PBCz, ALIGNMENT, atom->NmaxGhost * sizeof(int), prev_capacity * sizeof(int));
}
|
||||
|
||||
/*
 * Pack the positions of the j-clusters named in list[0..nc-1] into buf.
 *
 * Each cluster occupies 3*CLUSTER_N consecutive buffer values laid out as
 * x,y,z triples. Positions are shifted by pbc[_x|_y|_z] times the extent of
 * the local box (atom->mybox). Slots beyond the cluster's atom count are
 * filled with -1.
 */
void packForward(Atom* atom, int nc, int* list, MD_FLOAT* buf, int* pbc)
{
    for(int n = 0; n < nc; n++) {
        const int cluster = list[n];
        const int natoms = atom->jclusters[cluster].natoms;
        MD_FLOAT *pos = &atom->cl_x[CJ_VECTOR_BASE_INDEX(cluster)];
        const int first = n * CLUSTER_N;   /* first atom slot of this cluster in buf */
        int slot;

        /* Real atoms: shifted coordinates. */
        for(slot = 0; slot < natoms; slot++) {
            const int o = 3 * (first + slot);
            buf[o + 0] = pos[CL_X_OFFSET + slot] + pbc[_x] * atom->mybox.xprd;
            buf[o + 1] = pos[CL_Y_OFFSET + slot] + pbc[_y] * atom->mybox.yprd;
            buf[o + 2] = pos[CL_Z_OFFSET + slot] + pbc[_z] * atom->mybox.zprd;
        }

        /* Padding slots. */
        for(slot = natoms; slot < CLUSTER_N; slot++) {
            const int o = 3 * (first + slot);
            buf[o + 0] = -1; /* x */
            buf[o + 1] = -1; /* y */
            buf[o + 2] = -1; /* z */
        }
    }
}
|
||||
|
||||
/*
 * Unpack positions from buf into the nc consecutive j-clusters starting at
 * cluster index c0. The buffer layout is 3*CLUSTER_N values per cluster
 * (x,y,z triples).
 *
 * A coordinate is overwritten only if its current value is below INFINITY;
 * only the first jclusters[cj].natoms slots of each cluster are visited.
 */
void unpackForward(Atom* atom, int nc, int c0, MD_FLOAT* buf)
{
    for(int n = 0; n < nc; n++) {
        const int cluster = c0 + n;
        MD_FLOAT *pos = &atom->cl_x[CJ_VECTOR_BASE_INDEX(cluster)];
        const int first = n * CLUSTER_N;   /* first atom slot of this cluster in buf */

        for(int slot = 0; slot < atom->jclusters[cluster].natoms; slot++) {
            const int o = 3 * (first + slot);

            if(pos[CL_X_OFFSET + slot] < INFINITY) {
                pos[CL_X_OFFSET + slot] = buf[o + 0];
            }
            if(pos[CL_Y_OFFSET + slot] < INFINITY) {
                pos[CL_Y_OFFSET + slot] = buf[o + 1];
            }
            if(pos[CL_Z_OFFSET + slot] < INFINITY) {
                pos[CL_Z_OFFSET + slot] = buf[o + 2];
            }
        }
    }
}
|
||||
|
||||
/*
 * Serialize j-cluster cj into buf for ghost exchange.
 *
 * Buffer layout (all values stored as MD_FLOAT):
 *   natoms,
 *   x,y,z,type            for each of the CLUSTER_N slots (padding slots = -1),
 *   bbminx,bbmaxx,bbminy,bbmaxy,bbminz,bbmaxz,
 *   pbc tracking x,y,z
 * i.e. 4*CLUSTER_N + 10 values per cluster.
 *
 * Positions are shifted by pbc[_x|_y|_z] times the local box extent, and the
 * bounding box is recomputed from the shifted positions. When cj is at or
 * beyond atom->ncj the stored PBC tracking values of that ghost cluster are
 * added to pbc[].
 *
 * Returns the number of values written; 0 if the cluster has no atoms.
 */
int packGhost(Atom* atom, int cj, MD_FLOAT* buf, int* pbc)
{
    if(!(atom->jclusters[cj].natoms > 0)) {
        return 0;
    }

    int m = 0;
    const int vec_base = CJ_VECTOR_BASE_INDEX(cj);
    const int sca_base = CJ_SCALAR_BASE_INDEX(cj);
    MD_FLOAT *pos = &atom->cl_x[vec_base];

    MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
    MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
    MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;

    buf[m++] = atom->jclusters[cj].natoms;

    /* Real atoms: shifted position + type, accumulating the bounding box. */
    for(int slot = 0; slot < atom->jclusters[cj].natoms; slot++) {
        MD_FLOAT xtmp = pos[CL_X_OFFSET + slot] + pbc[_x] * atom->mybox.xprd;
        MD_FLOAT ytmp = pos[CL_Y_OFFSET + slot] + pbc[_y] * atom->mybox.yprd;
        MD_FLOAT ztmp = pos[CL_Z_OFFSET + slot] + pbc[_z] * atom->mybox.zprd;

        buf[m++] = xtmp;
        buf[m++] = ytmp;
        buf[m++] = ztmp;
        buf[m++] = atom->cl_type[sca_base + slot];

        if(bbminx > xtmp) { bbminx = xtmp; }
        if(bbmaxx < xtmp) { bbmaxx = xtmp; }
        if(bbminy > ytmp) { bbminy = ytmp; }
        if(bbmaxy < ytmp) { bbmaxy = ytmp; }
        if(bbminz > ztmp) { bbminz = ztmp; }
        if(bbmaxz < ztmp) { bbmaxz = ztmp; }
    }

    /* Padding slots. */
    for(int slot = atom->jclusters[cj].natoms; slot < CLUSTER_N; slot++) {
        buf[m++] = -1; /* x */
        buf[m++] = -1; /* y */
        buf[m++] = -1; /* z */
        buf[m++] = -1; /* type */
    }

    buf[m++] = bbminx;
    buf[m++] = bbmaxx;
    buf[m++] = bbminy;
    buf[m++] = bbmaxy;
    buf[m++] = bbminz;
    buf[m++] = bbmaxz;

    /* TODO: check atom->ncj */
    const int ghost_idx = cj - atom->ncj;
    int track_x = pbc[_x];
    int track_y = pbc[_y];
    int track_z = pbc[_z];

    /* Cluster is itself a ghost: carry over its accumulated PBC tracking. */
    if(ghost_idx >= 0) {
        track_x += atom->PBCx[ghost_idx];
        track_y += atom->PBCy[ghost_idx];
        track_z += atom->PBCz[ghost_idx];
    }

    buf[m++] = track_x;
    buf[m++] = track_y;
    buf[m++] = track_z;

    return m;
}
|
||||
|
||||
/*
 * Deserialize one ghost cluster from buf into j-cluster slot cj.
 *
 * Expects the layout written by packGhost():
 *   natoms, (x,y,z,type) * CLUSTER_N, 6 bounding-box values, 3 PBC values.
 * Padding slots in the buffer are skipped; the corresponding cluster slots
 * are set to INFINITY positions and type -1.
 *
 * Side effects: may grow the cluster arrays and the PBC tracking arrays,
 * increments atom->Nghost once per real atom and atom->Nclusters_ghost once.
 *
 * Returns the number of buffer values consumed. (The function is declared
 * int but previously had no return statement, so any caller using the result
 * invoked undefined behavior.)
 */
int unpackGhost(Atom* atom, int cj, MD_FLOAT* buf)
{
    int m = 0;
    int jfac = MAX(1, CLUSTER_N / CLUSTER_M);

    /* Make sure there is room before taking pointers into the arrays. */
    if(cj * jfac >= atom->Nclusters_max) growClusters(atom);
    if(atom->Nclusters_ghost >= atom->NmaxGhost) growPbc(atom);

    int cj_sca_base = CJ_SCALAR_BASE_INDEX(cj);
    int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
    MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];

    atom->jclusters[cj].natoms = buf[m++];

    /* Real atoms. */
    for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
        cj_x[CL_X_OFFSET + cjj] = buf[m++];
        cj_x[CL_Y_OFFSET + cjj] = buf[m++];
        cj_x[CL_Z_OFFSET + cjj] = buf[m++];
        atom->cl_type[cj_sca_base + cjj] = buf[m++];
        atom->Nghost++;
    }

    /* Padding slots: mark as empty locally and skip their 4 buffer values. */
    for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) {
        cj_x[CL_X_OFFSET + cjj] = INFINITY;
        cj_x[CL_Y_OFFSET + cjj] = INFINITY;
        cj_x[CL_Z_OFFSET + cjj] = INFINITY;
        atom->cl_type[cj_sca_base + cjj] = -1;
        m += 4;
    }

    atom->jclusters[cj].bbminx = buf[m++];
    atom->jclusters[cj].bbmaxx = buf[m++];
    atom->jclusters[cj].bbminy = buf[m++];
    atom->jclusters[cj].bbmaxy = buf[m++];
    atom->jclusters[cj].bbminz = buf[m++];
    atom->jclusters[cj].bbmaxz = buf[m++];

    atom->PBCx[atom->Nclusters_ghost] = buf[m++];
    atom->PBCy[atom->Nclusters_ghost] = buf[m++];
    atom->PBCz[atom->Nclusters_ghost] = buf[m++];
    atom->Nclusters_ghost++;

    return m;
}
|
||||
|
||||
/*
 * Pack the forces of the nc consecutive j-clusters starting at cluster index
 * c0 into buf. Each cluster occupies 3*CLUSTER_N buffer values (x,y,z
 * triples); slots beyond the cluster's atom count are filled with -1.
 */
void packReverse(Atom* atom, int nc, int c0, MD_FLOAT* buf)
{
    for(int n = 0; n < nc; n++) {
        const int cluster = c0 + n;
        const int natoms = atom->jclusters[cluster].natoms;
        MD_FLOAT *frc = &atom->cl_f[CJ_VECTOR_BASE_INDEX(cluster)];
        const int first = n * CLUSTER_N;   /* first atom slot of this cluster in buf */
        int slot;

        /* Real atoms. */
        for(slot = 0; slot < natoms; slot++) {
            const int o = 3 * (first + slot);
            buf[o + 0] = frc[CL_X_OFFSET + slot];
            buf[o + 1] = frc[CL_Y_OFFSET + slot];
            buf[o + 2] = frc[CL_Z_OFFSET + slot];
        }

        /* Padding slots. */
        for(slot = natoms; slot < CLUSTER_N; slot++) {
            const int o = 3 * (first + slot);
            buf[o + 0] = -1; /* x */
            buf[o + 1] = -1; /* y */
            buf[o + 2] = -1; /* z */
        }
    }
}
|
||||
|
||||
/*
 * Accumulate forces received in buf onto the j-clusters named in
 * list[0..nc-1]. The buffer holds 3*CLUSTER_N values per cluster (x,y,z
 * triples); only the first jclusters[cj].natoms slots of each cluster are
 * added.
 */
void unpackReverse(Atom* atom, int nc, int* list, MD_FLOAT* buf)
{
    for(int n = 0; n < nc; n++) {
        const int cluster = list[n];
        MD_FLOAT *frc = &atom->cl_f[CJ_VECTOR_BASE_INDEX(cluster)];
        const int first = n * CLUSTER_N;   /* first atom slot of this cluster in buf */

        for(int slot = 0; slot < atom->jclusters[cluster].natoms; slot++) {
            const int o = 3 * (first + slot);
            frc[CL_X_OFFSET + slot] += buf[o + 0];
            frc[CL_Y_OFFSET + slot] += buf[o + 1];
            frc[CL_Z_OFFSET + slot] += buf[o + 2];
        }
    }
}
|
||||
|
||||
/*
 * Serialize atom i into buf in the order x, y, z, vx, vy, vz, type.
 * Returns the number of values written.
 */
int packExchange(Atom* atom, int i, MD_FLOAT* buf)
{
    MD_FLOAT *out = buf;

    *out++ = atom_x(i);
    *out++ = atom_y(i);
    *out++ = atom_z(i);
    *out++ = atom_vx(i);
    *out++ = atom_vy(i);
    *out++ = atom_vz(i);
    *out++ = atom->type[i];

    return (int)(out - buf);
}
|
||||
|
||||
/*
 * Deserialize one atom from buf into atom slot i, reading the values in the
 * order x, y, z, vx, vy, vz, type. The atom arrays are grown first until
 * slot i fits. Returns the number of values consumed.
 */
int unpackExchange(Atom* atom, int i, MD_FLOAT* buf)
{
    /* Grow before touching slot i. */
    while(i >= atom->Nmax) {
        growAtom(atom);
    }

    MD_FLOAT *in = buf;

    atom_x(i) = *in++;
    atom_y(i) = *in++;
    atom_z(i) = *in++;
    atom_vx(i) = *in++;
    atom_vy(i) = *in++;
    atom_vz(i) = *in++;
    atom->type[i] = *in++;

    return (int)(in - buf);
}
|
||||
|
||||
/*
 * Wrap the positions of all local atoms back into the box
 * [0, xprd) x [0, yprd) x [0, zprd) taken from atom->mybox.
 *
 * Each coordinate is shifted by at most one box length per direction: first
 * upwards if negative, then downwards if at or beyond the box extent. The
 * two checks are deliberately sequential (not else-if).
 */
void pbc(Atom* atom)
{
    int idx = 0;

    while(idx < atom->Nlocal) {
        MD_FLOAT xprd = atom->mybox.xprd;
        MD_FLOAT yprd = atom->mybox.yprd;
        MD_FLOAT zprd = atom->mybox.zprd;

        /* x direction */
        if(atom_x(idx) < 0.0) { atom_x(idx) += xprd; }
        if(atom_x(idx) >= xprd) { atom_x(idx) -= xprd; }

        /* y direction */
        if(atom_y(idx) < 0.0) { atom_y(idx) += yprd; }
        if(atom_y(idx) >= yprd) { atom_y(idx) -= yprd; }

        /* z direction */
        if(atom_z(idx) < 0.0) { atom_z(idx) += zprd; }
        if(atom_z(idx) >= zprd) { atom_z(idx) -= zprd; }

        idx++;
    }
}
|
||||
|
||||
/*
 * Copy the per-atom state (type, position, velocity) of atom j into atom
 * slot i.
 */
void copy(Atom* atom, int i, int j)
{
    atom->type[i] = atom->type[j];

    /* position */
    atom_x(i) = atom_x(j);
    atom_y(i) = atom_y(j);
    atom_z(i) = atom_z(j);

    /* velocity */
    atom_vx(i) = atom_vx(j);
    atom_vy(i) = atom_vy(j);
    atom_vz(i) = atom_vz(j);
}
|
||||
|
||||
@@ -14,8 +14,9 @@
|
||||
#include <stats.h>
|
||||
#include <util.h>
|
||||
#include <simd.h>
|
||||
#include <math.h>
|
||||
|
||||
|
||||
void computeForceGhostShell(Parameter*, Atom*, Neighbor*);
|
||||
/*
|
||||
static inline void gmx_load_simd_2xnn_interactions(
|
||||
int excl,
|
||||
@@ -45,11 +46,10 @@ static inline void gmx_load_simd_4xn_interactions(
|
||||
double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
DEBUG_MESSAGE("computeForceLJ begin\n");
|
||||
int Nlocal = atom->Nlocal;
|
||||
NeighborCluster* neighs;
|
||||
int *neighs;
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||
@@ -60,6 +60,16 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
||||
}
|
||||
}
|
||||
|
||||
for(int cg = atom->ncj; cg < atom->ncj+atom->Nclusters_ghost; cg++) {
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cg);
|
||||
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
||||
for(int cjj = 0; cjj < atom->jclusters[cg].natoms; cjj++) {
|
||||
cj_f[CL_X_OFFSET + cjj] = 0.0;
|
||||
cj_f[CL_Y_OFFSET + cjj] = 0.0;
|
||||
cj_f[CL_Z_OFFSET + cjj] = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
double S = getTimeStamp();
|
||||
|
||||
#pragma omp parallel
|
||||
@@ -75,14 +85,14 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
||||
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[ci];
|
||||
|
||||
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int cj = neighs[k].cj;
|
||||
int cj = neighs[k];
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
int any = 0;
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
||||
|
||||
|
||||
for(int cii = 0; cii < CLUSTER_M; cii++) {
|
||||
MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii];
|
||||
MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii];
|
||||
@@ -103,6 +113,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
||||
cond = neighbor->half_neigh ? (ci_cj0 != cj || cii < cjj) && (ci_cj1 != cj || cii < cjj + CLUSTER_N) :
|
||||
(ci_cj0 != cj || cii != cjj) && (ci_cj1 != cj || cii != cjj + CLUSTER_N);
|
||||
#endif
|
||||
|
||||
if(cond) {
|
||||
MD_FLOAT delx = xtmp - cj_x[CL_X_OFFSET + cjj];
|
||||
MD_FLOAT dely = ytmp - cj_x[CL_Y_OFFSET + cjj];
|
||||
@@ -113,12 +124,11 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
||||
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
|
||||
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
|
||||
|
||||
if(neighbor->half_neigh) {
|
||||
if(neighbor->half_neigh || param->method) {
|
||||
cj_f[CL_X_OFFSET + cjj] -= delx * force;
|
||||
cj_f[CL_Y_OFFSET + cjj] -= dely * force;
|
||||
cj_f[CL_Z_OFFSET + cjj] -= delz * force;
|
||||
}
|
||||
|
||||
fix += delx * force;
|
||||
fiy += dely * force;
|
||||
fiz += delz * force;
|
||||
@@ -129,13 +139,11 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(any != 0) {
|
||||
addStat(stats->clusters_within_cutoff, 1);
|
||||
} else {
|
||||
addStat(stats->clusters_outside_cutoff, 1);
|
||||
}
|
||||
|
||||
ci_f[CL_X_OFFSET + cii] += fix;
|
||||
ci_f[CL_Y_OFFSET + cii] += fiy;
|
||||
ci_f[CL_Z_OFFSET + cii] += fiz;
|
||||
@@ -146,7 +154,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
||||
addStat(stats->num_neighs, numneighs);
|
||||
addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
|
||||
}
|
||||
|
||||
if(param->method == eightShell) computeForceGhostShell(param, atom, neighbor);
|
||||
LIKWID_MARKER_STOP("force");
|
||||
}
|
||||
|
||||
@@ -158,7 +166,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
||||
double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
|
||||
int Nlocal = atom->Nlocal;
|
||||
NeighborCluster* neighs;
|
||||
int *neighs;
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
@@ -168,7 +176,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
||||
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
for(int ci = 0; ci < atom->Nclusters_local+atom->Nclusters_ghost; ci++) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
|
||||
@@ -178,6 +186,16 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
}
|
||||
}
|
||||
|
||||
for(int cg = atom->ncj; cg < atom->ncj+atom->Nclusters_ghost; cg++) {
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cg);
|
||||
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
||||
for(int cjj = 0; cjj < atom->jclusters[cg].natoms; cjj++) {
|
||||
cj_f[CL_X_OFFSET + cjj] = 0.0;
|
||||
cj_f[CL_Y_OFFSET + cjj] = 0.0;
|
||||
cj_f[CL_Z_OFFSET + cjj] = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
double S = getTimeStamp();
|
||||
|
||||
#pragma omp parallel
|
||||
@@ -240,9 +258,9 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
MD_SIMD_FLOAT fiz2 = simd_zero();
|
||||
|
||||
for(int k = 0; k < numneighs_masked; k++) {
|
||||
int cj = neighs[k].cj;
|
||||
int cj = neighs[k];
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
//int imask = neighs[k].imask;
|
||||
//int imask = neighs_imask[k];
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
||||
//MD_SIMD_MASK interact0;
|
||||
@@ -322,7 +340,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
fiz2 += tz2;
|
||||
|
||||
#ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ
|
||||
if(cj < CJ1_FROM_CI(atom->Nlocal)) {
|
||||
if(cj < CJ1_FROM_CI(atom->Nlocal)|| param->method) {
|
||||
simd_h_decr3(cj_f, tx0 + tx2, ty0 + ty2, tz0 + tz2);
|
||||
}
|
||||
#else
|
||||
@@ -331,7 +349,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
}
|
||||
|
||||
for(int k = numneighs_masked; k < numneighs; k++) {
|
||||
int cj = neighs[k].cj;
|
||||
int cj = neighs[k];
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
||||
@@ -373,7 +391,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
fiz2 += tz2;
|
||||
|
||||
#ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ
|
||||
if(cj < CJ1_FROM_CI(atom->Nlocal)) {
|
||||
if(cj < CJ1_FROM_CI(atom->Nlocal) || param->method) {
|
||||
simd_h_decr3(cj_f, tx0 + tx2, ty0 + ty2, tz0 + tz2);
|
||||
}
|
||||
#else
|
||||
@@ -389,7 +407,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
addStat(stats->num_neighs, numneighs);
|
||||
addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
|
||||
}
|
||||
|
||||
if(param->method == eightShell) computeForceGhostShell(param, atom, neighbor);
|
||||
LIKWID_MARKER_STOP("force");
|
||||
}
|
||||
|
||||
@@ -401,7 +419,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
|
||||
int Nlocal = atom->Nlocal;
|
||||
NeighborCluster* neighs;
|
||||
int *neighs;
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
@@ -454,9 +472,8 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
MD_SIMD_FLOAT fiz2 = simd_zero();
|
||||
|
||||
for(int k = 0; k < numneighs_masked; k++) {
|
||||
int cj = neighs[k].cj;
|
||||
int cj = neighs[k];
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
int imask = neighs[k].imask;
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
unsigned int mask0, mask1, mask2, mask3;
|
||||
|
||||
@@ -507,7 +524,7 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
}
|
||||
|
||||
for(int k = numneighs_masked; k < numneighs; k++) {
|
||||
int cj = neighs[k].cj;
|
||||
int cj = neighs[k];
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
|
||||
@@ -563,14 +580,13 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta
|
||||
if(neighbor->half_neigh) {
|
||||
return computeForceLJ_2xnn_half(param, atom, neighbor, stats);
|
||||
}
|
||||
|
||||
return computeForceLJ_2xnn_full(param, atom, neighbor, stats);
|
||||
}
|
||||
|
||||
double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
|
||||
int Nlocal = atom->Nlocal;
|
||||
NeighborCluster* neighs;
|
||||
int *neighs;
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
@@ -590,6 +606,16 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
}
|
||||
}
|
||||
|
||||
for(int cg = atom->ncj; cg < atom->ncj+atom->Nclusters_ghost; cg++) {
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cg);
|
||||
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
||||
for(int cjj = 0; cjj < atom->jclusters[cg].natoms; cjj++) {
|
||||
cj_f[CL_X_OFFSET + cjj] = 0.0;
|
||||
cj_f[CL_Y_OFFSET + cjj] = 0.0;
|
||||
cj_f[CL_Z_OFFSET + cjj] = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
double S = getTimeStamp();
|
||||
|
||||
#pragma omp parallel
|
||||
@@ -635,9 +661,8 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_SIMD_FLOAT fiz3 = simd_zero();
|
||||
|
||||
for(int k = 0; k < numneighs_masked; k++) {
|
||||
int cj = neighs[k].cj;
|
||||
int cj = neighs[k];
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
int imask = neighs[k].imask;
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
||||
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
|
||||
@@ -728,7 +753,7 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
fiz3 = simd_add(fiz3, tz3);
|
||||
|
||||
#ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ
|
||||
if(cj < CJ1_FROM_CI(atom->Nlocal)) {
|
||||
if(cj < CJ1_FROM_CI(atom->Nlocal) || param->method) {
|
||||
simd_store(&cj_f[CL_X_OFFSET], simd_load(&cj_f[CL_X_OFFSET]) - (tx0 + tx1 + tx2 + tx3));
|
||||
simd_store(&cj_f[CL_Y_OFFSET], simd_load(&cj_f[CL_Y_OFFSET]) - (ty0 + ty1 + ty2 + ty3));
|
||||
simd_store(&cj_f[CL_Z_OFFSET], simd_load(&cj_f[CL_Z_OFFSET]) - (tz0 + tz1 + tz2 + tz3));
|
||||
@@ -741,9 +766,8 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
}
|
||||
|
||||
for(int k = numneighs_masked; k < numneighs; k++) {
|
||||
int cj = neighs[k].cj;
|
||||
int cj = neighs[k];
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
int imask = neighs[k].imask;
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
||||
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
|
||||
@@ -814,7 +838,7 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
fiz3 = simd_add(fiz3, tz3);
|
||||
|
||||
#ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ
|
||||
if(cj < CJ1_FROM_CI(atom->Nlocal)) {
|
||||
if(cj < CJ1_FROM_CI(atom->Nlocal) || param->method) {
|
||||
simd_store(&cj_f[CL_X_OFFSET], simd_load(&cj_f[CL_X_OFFSET]) - (tx0 + tx1 + tx2 + tx3));
|
||||
simd_store(&cj_f[CL_Y_OFFSET], simd_load(&cj_f[CL_Y_OFFSET]) - (ty0 + ty1 + ty2 + ty3));
|
||||
simd_store(&cj_f[CL_Z_OFFSET], simd_load(&cj_f[CL_Z_OFFSET]) - (tz0 + tz1 + tz2 + tz3));
|
||||
@@ -834,7 +858,7 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
addStat(stats->num_neighs, numneighs);
|
||||
addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
|
||||
}
|
||||
|
||||
if(param->method == eightShell) computeForceGhostShell(param, atom, neighbor);
|
||||
LIKWID_MARKER_STOP("force");
|
||||
}
|
||||
|
||||
@@ -846,7 +870,7 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
|
||||
int Nlocal = atom->Nlocal;
|
||||
NeighborCluster* neighs;
|
||||
int *neighs;
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
@@ -911,9 +935,8 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_SIMD_FLOAT fiz3 = simd_zero();
|
||||
|
||||
for(int k = 0; k < numneighs_masked; k++) {
|
||||
int cj = neighs[k].cj;
|
||||
int cj = neighs[k];
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
int imask = neighs[k].imask;
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
|
||||
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
|
||||
@@ -991,9 +1014,8 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
}
|
||||
|
||||
for(int k = numneighs_masked; k < numneighs; k++) {
|
||||
int cj = neighs[k].cj;
|
||||
int cj = neighs[k];
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
int imask = neighs[k].imask;
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
|
||||
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
|
||||
@@ -1075,3 +1097,120 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
||||
|
||||
return computeForceLJ_4xn_full(param, atom, neighbor, stats);
|
||||
}
|
||||
|
||||
//Routine for eight shell method
|
||||
/*
 * Force routine for the eight-shell method.
 *
 * For every shell cluster listed in neighbor->listshell, walks its neighbor
 * clusters from neighbor->neighshell and evaluates the Lennard-Jones force
 * for all CLUSTER_N x CLUSTER_N atom pairs whose squared distance is below
 * the squared force cutoff. The pair force is added to the shell-cluster
 * atom and subtracted from the neighbor-cluster atom in atom->cl_f.
 *
 * No statistics are collected here.
 */
void computeForceGhostShell(Parameter *param, Atom *atom, Neighbor *neighbor) {
    DEBUG_MESSAGE("computeForceLJ begin\n");

    int Nshell = neighbor->Nshell;
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
    MD_FLOAT sigma6 = param->sigma6;
    MD_FLOAT epsilon = param->epsilon;

    for(int s = 0; s < Nshell; s++) {
        int *shell_neighs = &neighbor->neighshell[s * neighbor->maxneighs];
        int nneighs = neighbor->numNeighShell[s];
        int cs = neighbor->listshell[s];
        int cs_vec_base = CJ_VECTOR_BASE_INDEX(cs);
        MD_FLOAT *cs_x = &atom->cl_x[cs_vec_base];
        MD_FLOAT *cs_f = &atom->cl_f[cs_vec_base];

        for(int k = 0; k < nneighs; k++) {
            int cj = shell_neighs[k];
            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
            MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];

            for(int a = 0; a < CLUSTER_N; a++) {
                MD_FLOAT x = cs_x[CL_X_OFFSET + a];
                MD_FLOAT y = cs_x[CL_Y_OFFSET + a];
                MD_FLOAT z = cs_x[CL_Z_OFFSET + a];

                /* Force accumulated on shell atom a from this neighbor cluster. */
                MD_FLOAT fix = 0;
                MD_FLOAT fiy = 0;
                MD_FLOAT fiz = 0;

                for(int b = 0; b < CLUSTER_N; b++) {
                    MD_FLOAT delx = x - cj_x[CL_X_OFFSET + b];
                    MD_FLOAT dely = y - cj_x[CL_Y_OFFSET + b];
                    MD_FLOAT delz = z - cj_x[CL_Z_OFFSET + b];
                    MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;

                    /* Outside the cutoff (or not comparable): no interaction. */
                    if(!(rsq < cutforcesq)) {
                        continue;
                    }

                    MD_FLOAT sr2 = 1.0 / rsq;
                    MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
                    MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;

                    /* Reaction on the neighbor atom. */
                    cj_f[CL_X_OFFSET + b] -= delx * force;
                    cj_f[CL_Y_OFFSET + b] -= dely * force;
                    cj_f[CL_Z_OFFSET + b] -= delz * force;

                    fix += delx * force;
                    fiy += dely * force;
                    fiz += delz * force;
                }

                cs_f[CL_X_OFFSET + a] += fix;
                cs_f[CL_Y_OFFSET + a] += fiy;
                cs_f[CL_Z_OFFSET + a] += fiz;
            }
        }
    }
}
|
||||
|
||||
/*
|
||||
void computeForceGhostShell(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
||||
|
||||
int Nshell = neighbor->Nshell;
|
||||
Pair* neighs;
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
|
||||
for(int ci = 0; ci < Nshell; ci++) {
|
||||
neighs = &neighbor->neighshell[ci * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numNeighShell[ci];
|
||||
int cs = neighbor->listshell[ci].cluster;
|
||||
int css = neighbor->listshell[ci].atom;
|
||||
int cs_vec_base = CJ_VECTOR_BASE_INDEX(cs);
|
||||
MD_FLOAT *cs_x = &atom->cl_x[cs_vec_base];
|
||||
MD_FLOAT *cs_f = &atom->cl_f[cs_vec_base];
|
||||
MD_FLOAT x = cs_x[CL_X_OFFSET + css];
|
||||
MD_FLOAT y = cs_x[CL_Y_OFFSET + css];
|
||||
MD_FLOAT z = cs_x[CL_Z_OFFSET + css];
|
||||
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int cj = neighs[k].cluster;
|
||||
int cjj = neighs[k].atom;
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
||||
|
||||
MD_FLOAT delx = x - cj_x[CL_X_OFFSET + cjj];
|
||||
MD_FLOAT dely = y - cj_x[CL_Y_OFFSET + cjj];
|
||||
MD_FLOAT delz = z - cj_x[CL_Z_OFFSET + cjj];
|
||||
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
if(rsq < cutforcesq) {
|
||||
MD_FLOAT sr2 = 1.0 / rsq;
|
||||
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
|
||||
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
|
||||
|
||||
cj_f[CL_X_OFFSET + cjj] -= delx * force;
|
||||
cj_f[CL_Y_OFFSET + cjj] -= dely * force;
|
||||
cj_f[CL_Z_OFFSET + cjj] -= delz * force;
|
||||
cs_f[CL_X_OFFSET + css] += delx * force;
|
||||
cs_f[CL_Y_OFFSET + css] += dely * force;
|
||||
cs_f[CL_Z_OFFSET + css] += delz * force;
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <parameter.h>
|
||||
#include <box.h>
|
||||
|
||||
#ifndef __ATOM_H_
|
||||
#define __ATOM_H_
|
||||
@@ -102,7 +103,7 @@ typedef struct {
|
||||
|
||||
typedef struct {
|
||||
int Natoms, Nlocal, Nghost, Nmax;
|
||||
int Nclusters, Nclusters_local, Nclusters_ghost, Nclusters_max;
|
||||
int Nclusters, Nclusters_local, Nclusters_ghost, Nclusters_max, NmaxGhost,ncj;
|
||||
MD_FLOAT *x, *y, *z;
|
||||
MD_FLOAT *vx, *vy, *vz;
|
||||
int *border_map;
|
||||
@@ -112,6 +113,7 @@ typedef struct {
|
||||
MD_FLOAT *sigma6;
|
||||
MD_FLOAT *cutforcesq;
|
||||
MD_FLOAT *cutneighsq;
|
||||
//track the movement of a particle along boundaries
|
||||
int *PBCx, *PBCy, *PBCz;
|
||||
// Data in cluster format
|
||||
MD_FLOAT *cl_x;
|
||||
@@ -128,6 +130,9 @@ typedef struct {
|
||||
unsigned int masks_2xnn_fn[8];
|
||||
unsigned int masks_4xn_hn[16];
|
||||
unsigned int masks_4xn_fn[16];
|
||||
|
||||
//Info Subdomain
|
||||
Box mybox;
|
||||
} Atom;
|
||||
|
||||
extern void initAtom(Atom*);
|
||||
@@ -140,6 +145,18 @@ extern int readAtom_dmp(Atom*, Parameter*);
|
||||
extern void growAtom(Atom*);
|
||||
extern void growClusters(Atom*);
|
||||
|
||||
int packGhost(Atom*, int, MD_FLOAT* , int*);
|
||||
int unpackGhost(Atom*, int, MD_FLOAT*);
|
||||
int packExchange(Atom*, int, MD_FLOAT*);
|
||||
int unpackExchange(Atom*, int, MD_FLOAT*);
|
||||
void packForward(Atom*, int, int*, MD_FLOAT*, int*);
|
||||
void unpackForward(Atom*, int, int, MD_FLOAT*);
|
||||
void packReverse(Atom* , int , int , MD_FLOAT*);
|
||||
void unpackReverse(Atom*, int, int*, MD_FLOAT*);
|
||||
void pbc(Atom*);
|
||||
void copy(Atom*, int, int);
|
||||
|
||||
|
||||
#ifdef AOS
|
||||
# define POS_DATA_LAYOUT "AoS"
|
||||
# define atom_x(i) atom->x[(i) * 3 + 0]
|
||||
|
||||
@@ -9,10 +9,13 @@
|
||||
#include <atom.h>
|
||||
#include <parameter.h>
|
||||
#include <util.h>
|
||||
|
||||
#include <timers.h>
|
||||
#include <timing.h>
|
||||
#include <simd.h>
|
||||
/*
|
||||
void cpuInitialIntegrate(Parameter *param, Atom *atom) {
|
||||
|
||||
DEBUG_MESSAGE("cpuInitialIntegrate start\n");
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
||||
@@ -32,9 +35,9 @@ void cpuInitialIntegrate(Parameter *param, Atom *atom) {
|
||||
DEBUG_MESSAGE("cpuInitialIntegrate end\n");
|
||||
}
|
||||
|
||||
void cpuFinalIntegrate(Parameter *param, Atom *atom) {
|
||||
DEBUG_MESSAGE("cpuFinalIntegrate start\n");
|
||||
void cpuFinalIntegrate(Parameter *param, Atom *atom) {
|
||||
|
||||
DEBUG_MESSAGE("cpuFinalIntegrate start\n");
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
|
||||
@@ -46,6 +49,56 @@ void cpuFinalIntegrate(Parameter *param, Atom *atom) {
|
||||
ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii];
|
||||
}
|
||||
}
|
||||
DEBUG_MESSAGE("cpuFinalIntegrate end\n");
|
||||
}
|
||||
*/
|
||||
|
||||
/*
 * SIMD variant of the initial integration step on the clustered data layout.
 *
 * For every visited cluster block the velocity lanes (X, Y, Z) are combined
 * with the force lanes scaled by param->dtforce, and the position lanes are
 * combined with the freshly computed velocities scaled by param->dt. Results
 * are written back in place to atom->cl_v and atom->cl_x.
 * (Assumes simd_fma(a, b, c) evaluates a * b + c -- confirm in simd.h.)
 *
 * param: provides dtforce and dt.
 * atom:  provides Nclusters_local and the cluster arrays cl_x, cl_v, cl_f.
 */
void cpuInitialIntegrate(Parameter *param, Atom *atom) {
    DEBUG_MESSAGE("cpuInitialIntegrate start\n");

    // Both factors are loop-invariant: broadcast them once instead of once per iteration.
    MD_SIMD_FLOAT dtforce = simd_broadcast(param->dtforce);
    MD_SIMD_FLOAT dt = simd_broadcast(param->dt);

    // NOTE(review): the stride of 2 presumably means one SIMD vector covers two
    // consecutive i-clusters; confirm against CI_VECTOR_BASE_INDEX and the vector
    // width, and check behavior for an odd Nclusters_local.
    for(int ci = 0; ci < atom->Nclusters_local; ci += 2) {
        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
        MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
        MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
        MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];

        // New velocities: force * dtforce + velocity
        MD_SIMD_FLOAT vx_vector = simd_fma(simd_load(&ci_f[CL_X_OFFSET]), dtforce, simd_load(&ci_v[CL_X_OFFSET]));
        MD_SIMD_FLOAT vy_vector = simd_fma(simd_load(&ci_f[CL_Y_OFFSET]), dtforce, simd_load(&ci_v[CL_Y_OFFSET]));
        MD_SIMD_FLOAT vz_vector = simd_fma(simd_load(&ci_f[CL_Z_OFFSET]), dtforce, simd_load(&ci_v[CL_Z_OFFSET]));

        // New positions: new velocity * dt + position
        MD_SIMD_FLOAT x_vector = simd_fma(vx_vector, dt, simd_load(&ci_x[CL_X_OFFSET]));
        MD_SIMD_FLOAT y_vector = simd_fma(vy_vector, dt, simd_load(&ci_x[CL_Y_OFFSET]));
        MD_SIMD_FLOAT z_vector = simd_fma(vz_vector, dt, simd_load(&ci_x[CL_Z_OFFSET]));

        // Write-back happens only after all loads for this block are done.
        simd_store(&ci_v[CL_X_OFFSET], vx_vector);
        simd_store(&ci_v[CL_Y_OFFSET], vy_vector);
        simd_store(&ci_v[CL_Z_OFFSET], vz_vector);
        simd_store(&ci_x[CL_X_OFFSET], x_vector);
        simd_store(&ci_x[CL_Y_OFFSET], y_vector);
        simd_store(&ci_x[CL_Z_OFFSET], z_vector);
    }

    DEBUG_MESSAGE("cpuInitialIntegrate end\n");
}
|
||||
|
||||
/*
 * SIMD variant of the final integration step on the clustered data layout.
 *
 * For every visited cluster block the velocity lanes (X, Y, Z) in atom->cl_v
 * are updated in place from the force lanes in atom->cl_f scaled by
 * param->dtforce, matching the scalar form v += dtforce * f.
 * (Assumes simd_fma(a, b, c) evaluates a * b + c -- confirm in simd.h.)
 *
 * param: provides dtforce.
 * atom:  provides Nclusters_local and the cluster arrays cl_v, cl_f.
 */
void cpuFinalIntegrate(Parameter *param, Atom *atom) {
    DEBUG_MESSAGE("cpuFinalIntegrate start\n");

    // Loop-invariant factor: broadcast once instead of once per iteration.
    MD_SIMD_FLOAT dtforce = simd_broadcast(param->dtforce);

    // NOTE(review): the stride of 2 presumably means one SIMD vector covers two
    // consecutive i-clusters; confirm against CI_VECTOR_BASE_INDEX and the vector
    // width, and check behavior for an odd Nclusters_local.
    for(int ci = 0; ci < atom->Nclusters_local; ci += 2) {
        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
        MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
        MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];

        // New velocities: force * dtforce + velocity
        MD_SIMD_FLOAT vx_vector = simd_fma(simd_load(&ci_f[CL_X_OFFSET]), dtforce, simd_load(&ci_v[CL_X_OFFSET]));
        MD_SIMD_FLOAT vy_vector = simd_fma(simd_load(&ci_f[CL_Y_OFFSET]), dtforce, simd_load(&ci_v[CL_Y_OFFSET]));
        MD_SIMD_FLOAT vz_vector = simd_fma(simd_load(&ci_f[CL_Z_OFFSET]), dtforce, simd_load(&ci_v[CL_Z_OFFSET]));

        simd_store(&ci_v[CL_X_OFFSET], vx_vector);
        simd_store(&ci_v[CL_Y_OFFSET], vy_vector);
        simd_store(&ci_v[CL_Z_OFFSET], vz_vector);
    }

    DEBUG_MESSAGE("cpuFinalIntegrate end\n");
}
|
||||
@@ -54,3 +107,6 @@ void cpuFinalIntegrate(Parameter *param, Atom *atom) {
|
||||
void cudaInitialIntegrate(Parameter*, Atom*);
|
||||
void cudaFinalIntegrate(Parameter*, Atom*);
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
@@ -26,9 +26,9 @@
|
||||
#define NBNXN_INTERACTION_MASK_DIAG_J8_1 0x0080c0e0U
|
||||
|
||||
typedef struct {
|
||||
int cj;
|
||||
unsigned int imask;
|
||||
} NeighborCluster;
|
||||
int cluster;
|
||||
int atom;
|
||||
} Pair;
|
||||
|
||||
typedef struct {
|
||||
int every;
|
||||
@@ -37,9 +37,22 @@ typedef struct {
|
||||
int* numneigh;
|
||||
int* numneigh_masked;
|
||||
int half_neigh;
|
||||
NeighborCluster* neighbors;
|
||||
int* neighbors;
|
||||
unsigned int* neighbors_imask;
|
||||
//MPI
|
||||
/*
|
||||
int Nshell; //# of atoms in listShell(Cluster here cover all possible ghost interactions)
|
||||
int *numNeighShell; //# of neighs for each atom in listShell
|
||||
Pair *neighshell; //list of neighs for each atom in listShell
|
||||
Pair *listshell; //Atoms to compute the force
|
||||
*/
|
||||
int Nshell; //# of cluster in listShell(Cluster here cover all possible ghost interactions)
|
||||
int *numNeighShell; //# of neighs for each atom in listShell
|
||||
int *neighshell; //list of neighs for each atom in listShell
|
||||
int *listshell; //Atoms to compute the force
|
||||
} Neighbor;
|
||||
|
||||
|
||||
extern void initNeighbor(Neighbor*, Parameter*);
|
||||
extern void setupNeighbor(Parameter*, Atom*);
|
||||
extern void binatoms(Atom*);
|
||||
|
||||
@@ -5,6 +5,8 @@
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <atom.h>
|
||||
#include <comm.h>
|
||||
#include <parameter.h>
|
||||
|
||||
#ifndef __VTK_H_
|
||||
#define __VTK_H_
|
||||
@@ -13,4 +15,5 @@ extern int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int t
|
||||
extern int write_ghost_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
|
||||
extern int write_local_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep);
|
||||
extern int write_ghost_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep);
|
||||
extern void printvtk(const char* filename, Comm* comm, Atom* atom ,Parameter* param, int timestep);
|
||||
#endif
|
||||
|
||||
@@ -60,18 +60,15 @@ void init(Parameter *param) {
|
||||
param->eam_file = NULL;
|
||||
}
|
||||
|
||||
// Show debug messages
|
||||
#define DEBUG(msg) printf(msg)
|
||||
// Do not show debug messages
|
||||
//#define DEBUG(msg)
|
||||
|
||||
|
||||
void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps) {
|
||||
void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps, int masked) {
|
||||
const int maxneighs = nneighs * nreps;
|
||||
const int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
|
||||
const int ncj = atom->Nclusters_local / jfac;
|
||||
const unsigned int imask = NBNXN_INTERACTION_MASK_ALL;
|
||||
neighbor->numneigh = (int*) malloc(atom->Nclusters_max * sizeof(int));
|
||||
neighbor->numneigh_masked = (int*) malloc(atom->Nclusters_max * sizeof(int));
|
||||
neighbor->neighbors = (int*) malloc(atom->Nclusters_max * maxneighs * sizeof(int));
|
||||
neighbor->neighbors_imask = (unsigned int*) malloc(atom->Nclusters_max * maxneighs * sizeof(unsigned int));
|
||||
|
||||
if(pattern == P_RAND && ncj <= nneighs) {
|
||||
fprintf(stderr, "Error: P_RAND: Number of j-clusters should be higher than number of j-cluster neighbors per i-cluster!\n");
|
||||
@@ -80,6 +77,7 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
|
||||
unsigned int *neighptr_imask = &(neighbor->neighbors_imask[ci * neighbor->maxneighs]);
|
||||
int j = (pattern == P_SEQ) ? CJ0_FROM_CI(ci) : 0;
|
||||
int m = (pattern == P_SEQ) ? ncj : nneighs;
|
||||
int k = 0;
|
||||
@@ -90,6 +88,7 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
|
||||
do {
|
||||
int cj = rand() % ncj;
|
||||
neighptr[k] = cj;
|
||||
neighptr_imask[k] = imask;
|
||||
found = 0;
|
||||
for(int l = 0; l < k; l++) {
|
||||
if(neighptr[l] == cj) {
|
||||
@@ -99,6 +98,7 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
|
||||
} while(found == 1);
|
||||
} else {
|
||||
neighptr[k] = j;
|
||||
neighptr_imask[k] = imask;
|
||||
j = (j + 1) % m;
|
||||
}
|
||||
}
|
||||
@@ -106,10 +106,12 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
|
||||
for(int r = 1; r < nreps; r++) {
|
||||
for(int k = 0; k < nneighs; k++) {
|
||||
neighptr[r * nneighs + k] = neighptr[k];
|
||||
neighptr_imask[r * nneighs + k] = neighptr_imask[k];
|
||||
}
|
||||
}
|
||||
|
||||
neighbor->numneigh[ci] = nneighs * nreps;
|
||||
neighbor->numneigh_masked[ci] = (masked == 1) ? (nneighs * nreps) : 0;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -125,12 +127,13 @@ int main(int argc, const char *argv[]) {
|
||||
int niclusters = 256; // Number of local i-clusters
|
||||
int iclusters_natoms = CLUSTER_M; // Number of valid atoms within i-clusters
|
||||
int nneighs = 9; // Number of j-cluster neighbors per i-cluster
|
||||
int masked = 0; // Use masked loop
|
||||
int nreps = 1;
|
||||
int csv = 0;
|
||||
|
||||
LIKWID_MARKER_INIT;
|
||||
LIKWID_MARKER_REGISTER("force");
|
||||
DEBUG("Initializing parameters...\n");
|
||||
DEBUG_MESSAGE("Initializing parameters...\n");
|
||||
init(¶m);
|
||||
|
||||
for(int i = 0; i < argc; i++) {
|
||||
@@ -156,6 +159,10 @@ int main(int argc, const char *argv[]) {
|
||||
param.eam_file = strdup(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-m") == 0)) {
|
||||
masked = 1;
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0)) {
|
||||
param.ntimes = atoi(argv[++i]);
|
||||
continue;
|
||||
@@ -206,11 +213,11 @@ int main(int argc, const char *argv[]) {
|
||||
}
|
||||
|
||||
if(param.force_field == FF_EAM) {
|
||||
DEBUG("Initializing EAM parameters...\n");
|
||||
DEBUG_MESSAGE("Initializing EAM parameters...\n");
|
||||
initEam(&eam, ¶m);
|
||||
}
|
||||
|
||||
DEBUG("Initializing atoms...\n");
|
||||
DEBUG_MESSAGE("Initializing atoms...\n");
|
||||
initAtom(atom);
|
||||
initStats(&stats);
|
||||
|
||||
@@ -226,7 +233,7 @@ int main(int argc, const char *argv[]) {
|
||||
atom->cutforcesq[i] = param.cutforce * param.cutforce;
|
||||
}
|
||||
|
||||
DEBUG("Creating atoms...\n");
|
||||
DEBUG_MESSAGE("Creating atoms...\n");
|
||||
while(atom->Nmax < niclusters * iclusters_natoms) {
|
||||
growAtom(atom);
|
||||
}
|
||||
@@ -281,13 +288,13 @@ int main(int argc, const char *argv[]) {
|
||||
printf("Estimated neighborlist data volume (kB): %.4f\n", estim_neighbors_volume / 1000.0);
|
||||
}
|
||||
|
||||
DEBUG("Defining j-clusters...\n");
|
||||
DEBUG_MESSAGE("Defining j-clusters...\n");
|
||||
defineJClusters(atom);
|
||||
DEBUG("Initializing neighbor lists...\n");
|
||||
DEBUG_MESSAGE("Initializing neighbor lists...\n");
|
||||
initNeighbor(&neighbor, ¶m);
|
||||
DEBUG("Creating neighbor lists...\n");
|
||||
createNeighbors(atom, &neighbor, pattern, nneighs, nreps);
|
||||
DEBUG("Computing forces...\n");
|
||||
DEBUG_MESSAGE("Creating neighbor lists...\n");
|
||||
createNeighbors(atom, &neighbor, pattern, nneighs, nreps, masked);
|
||||
DEBUG_MESSAGE("Computing forces...\n");
|
||||
|
||||
double T_accum = 0.0;
|
||||
for(int i = 0; i < param.ntimes; i++) {
|
||||
|
||||
216
gromacs/main.c
216
gromacs/main.c
@@ -24,6 +24,10 @@
|
||||
#include <util.h>
|
||||
#include <vtk.h>
|
||||
#include <xtc.h>
|
||||
#include <comm.h>
|
||||
#include <grid.h>
|
||||
#include <shell_methods.h>
|
||||
#include <mpi.h>
|
||||
|
||||
#define HLINE "----------------------------------------------------------------------------\n"
|
||||
|
||||
@@ -40,17 +44,55 @@ extern void copyDataFromCUDADevice(Atom *atom);
|
||||
extern void cudaDeviceFree();
|
||||
#endif
|
||||
|
||||
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
/*
 * Runs the load-balancing scheme selected by param->balance and returns the
 * elapsed time between two getTimeStamp() calls around it.
 *
 *   RCB          -> rcbBalance with meanBisect (time argument 0), then neighComm
 *   meanTimeRCB  -> rcbBalance with meanTimeBisect and `time`, then neighComm
 *   Staggered    -> staggeredBalance with `time`, then neighComm and exchangeComm
 *   anything else -> nothing is done
 *
 * time: timing value forwarded to the time-based balancing schemes.
 */
double dynamicBalance(Comm* comm, Grid* grid, Atom* atom, Parameter* param, double time)
{
    int ndims = 3; //TODO: Adjust to do in 3d and 2d
    int rebalanced = 1;     // set to 0 when no scheme matched
    int exchange_after = 0; // only the staggered scheme exchanges atoms afterwards
    double t_begin = getTimeStamp();

    if(param->balance == RCB) {
        rcbBalance(grid, atom, param, meanBisect, ndims, 0);
    } else if(param->balance == meanTimeRCB) {
        rcbBalance(grid, atom, param, meanTimeBisect, ndims, time);
    } else if(param->balance == Staggered) {
        staggeredBalance(grid, atom, param, time);
        exchange_after = 1;
    } else {
        rebalanced = 0; //Do nothing
    }

    // Every scheme that ran is followed by a neighComm call.
    if(rebalanced) {
        neighComm(comm, param, grid);
        if(exchange_after) {
            exchangeComm(comm, atom);
        }
    }
    //printGrid(grid);

    return getTimeStamp() - t_begin;
}
|
||||
|
||||
/*
 * Performs the initial load balance and returns the elapsed time between two
 * getTimeStamp() calls around it.
 *
 * When param->balance is RCB or meanTimeRCB, rcbBalance is run with meanBisect
 * followed by neighComm. In every case the per-rank atom->Nlocal values are
 * summed into atom->Natoms across `world`, each rank prints its counts, and
 * all ranks synchronize on a barrier before the end time is taken.
 *
 * eam, neighbor and stats are not used here; they are kept so the signature
 * stays unchanged for callers.
 */
double initialBalance(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats, Comm *comm, Grid *grid)
{
    const int dims = 3; // same dimension count that dynamicBalance passes to rcbBalance
    int me;
    MPI_Comm_rank(world, &me);

    double start = getTimeStamp();
    if(param->balance == meanTimeRCB || param->balance == RCB) {
        rcbBalance(grid, atom, param, meanBisect, dims, 0);
        neighComm(comm, param, grid);
    }

    // Global atom count = sum of the local counts over all ranks.
    MPI_Allreduce(&atom->Nlocal, &atom->Natoms, 1, MPI_INT, MPI_SUM, world);
    printf("Processor:%i, Local atoms:%i, Total atoms:%i\n",me, atom->Nlocal,atom->Natoms);
    MPI_Barrier(world);

    return getTimeStamp() - start;
}
|
||||
|
||||
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats, Comm *comm, Grid *grid) {
|
||||
if(param->force_field == FF_EAM) { initEam(eam, param); }
|
||||
double S, E;
|
||||
param->lattice = pow((4.0 / param->rho), (1.0 / 3.0));
|
||||
param->xprd = param->nx * param->lattice;
|
||||
param->yprd = param->ny * param->lattice;
|
||||
param->zprd = param->nz * param->lattice;
|
||||
|
||||
S = getTimeStamp();
|
||||
initAtom(atom);
|
||||
initPbc(atom);
|
||||
//initPbc(atom);
|
||||
initStats(stats);
|
||||
initNeighbor(neighbor, param);
|
||||
if(param->input_file == NULL) {
|
||||
@@ -58,13 +100,18 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *
|
||||
} else {
|
||||
readAtom(atom, param);
|
||||
}
|
||||
|
||||
setupGrid(grid,atom,param);
|
||||
setupNeighbor(param, atom);
|
||||
setupComm(comm, param, grid);
|
||||
if(param->balance){
|
||||
initialBalance(param, eam, atom, neighbor, stats, comm, grid);
|
||||
}
|
||||
setupThermo(param, atom->Natoms);
|
||||
if(param->input_file == NULL) { adjustThermo(param, atom); }
|
||||
buildClusters(atom);
|
||||
defineJClusters(atom);
|
||||
setupPbc(atom, param);
|
||||
//setupPbc(atom, param);
|
||||
ghostNeighbor(comm, atom, param); //change
|
||||
binClusters(atom);
|
||||
buildNeighbor(atom, neighbor);
|
||||
initDevice(atom, neighbor);
|
||||
@@ -72,15 +119,15 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *
|
||||
return E-S;
|
||||
}
|
||||
|
||||
double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
||||
double reneighbour(Comm* comm, Parameter *param, Atom *atom, Neighbor *neighbor) {
|
||||
double S, E;
|
||||
S = getTimeStamp();
|
||||
LIKWID_MARKER_START("reneighbour");
|
||||
updateSingleAtoms(atom);
|
||||
updateAtomsPbc(atom, param);
|
||||
//updateAtomsPbc(atom, param);
|
||||
buildClusters(atom);
|
||||
defineJClusters(atom);
|
||||
setupPbc(atom, param);
|
||||
//setupPbc(atom, param);
|
||||
ghostNeighbor(comm, atom, param);
|
||||
binClusters(atom);
|
||||
buildNeighbor(atom, neighbor);
|
||||
LIKWID_MARKER_STOP("reneighbour");
|
||||
@@ -88,15 +135,13 @@ double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
||||
return E-S;
|
||||
}
|
||||
|
||||
void printAtomState(Atom *atom) {
|
||||
printf("Atom counts: Natoms=%d Nlocal=%d Nghost=%d Nmax=%d\n",
|
||||
atom->Natoms, atom->Nlocal, atom->Nghost, atom->Nmax);
|
||||
|
||||
/* int nall = atom->Nlocal + atom->Nghost; */
|
||||
|
||||
/* for (int i=0; i<nall; i++) { */
|
||||
/* printf("%d %f %f %f\n", i, atom->x[i], atom->y[i], atom->z[i]); */
|
||||
/* } */
|
||||
/*
 * Calls updateSingleAtoms(atom) followed by exchangeComm(comm, atom) and
 * returns the elapsed time between two getTimeStamp() calls around them.
 */
double updateAtoms(Comm* comm, Atom* atom){
    const double t_begin = getTimeStamp();

    updateSingleAtoms(atom);
    exchangeComm(comm, atom);

    return getTimeStamp() - t_begin;
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
@@ -106,7 +151,8 @@ int main(int argc, char** argv) {
|
||||
Neighbor neighbor;
|
||||
Stats stats;
|
||||
Parameter param;
|
||||
|
||||
Comm comm;
|
||||
Grid grid;
|
||||
LIKWID_MARKER_INIT;
|
||||
#pragma omp parallel
|
||||
{
|
||||
@@ -114,10 +160,10 @@ int main(int argc, char** argv) {
|
||||
//LIKWID_MARKER_REGISTER("reneighbour");
|
||||
//LIKWID_MARKER_REGISTER("pbc");
|
||||
}
|
||||
|
||||
initComm(&argc, &argv, &comm); //change
|
||||
initParameter(¶m);
|
||||
for(int i = 0; i < argc; i++) {
|
||||
if((strcmp(argv[i], "-p") == 0)) {
|
||||
if((strcmp(argv[i], "-p") == 0) || (strcmp(argv[i], "--param") == 0)) {
|
||||
readParameter(¶m, argv[++i]);
|
||||
continue;
|
||||
}
|
||||
@@ -156,6 +202,24 @@ int main(int argc, char** argv) {
|
||||
param.half_neigh = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-method") == 0)) {
|
||||
param.method = atoi(argv[++i]);
|
||||
if (param.method>2 || param.method< 0){
|
||||
if(comm.myproc == 0) fprintf(stderr, "Method does not exist!\n");
|
||||
endComm(&comm);
|
||||
exit(0);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-bal") == 0)) {
|
||||
param.balance = atoi(argv[++i]);
|
||||
if (param.balance>3 || param.balance< 0){
|
||||
if(comm.myproc == 0) fprintf(stderr, "Load balance does not exist!\n");
|
||||
endComm(&comm);
|
||||
exit(0);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-m") == 0) || (strcmp(argv[i], "--mass") == 0)) {
|
||||
param.mass = atof(argv[++i]);
|
||||
continue;
|
||||
@@ -186,6 +250,7 @@ int main(int argc, char** argv) {
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
|
||||
//TODO: add the shell and ac print options
|
||||
printf("MD Bench: A minimalistic re-implementation of miniMD\n");
|
||||
printf(HLINE);
|
||||
printf("-p <string>: file to read parameters from (can be specified more than once)\n");
|
||||
@@ -203,98 +268,101 @@ int main(int argc, char** argv) {
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
}
|
||||
|
||||
if(param.balance>0 && param.method == 1){
|
||||
if(comm.myproc == 0) fprintf(stderr, "Half Shell is not supported by load balance!\n");
|
||||
endComm(&comm);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
param.cutneigh = param.cutforce + param.skin;
|
||||
setup(¶m, &eam, &atom, &neighbor, &stats);
|
||||
printParameter(¶m);
|
||||
printf(HLINE);
|
||||
|
||||
printf("step\ttemp\t\tpressure\n");
|
||||
timer[SETUP]=setup(¶m, &eam, &atom, &neighbor, &stats, &comm, &grid);
|
||||
if(comm.myproc == 0) printParameter(¶m);
|
||||
if(comm.myproc == 0) printf(HLINE);
|
||||
if(comm.myproc == 0) printf("step\ttemp\t\tpressure\n");
|
||||
computeThermo(0, ¶m, &atom);
|
||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
|
||||
traceAddresses(¶m, &atom, &neighbor, n + 1);
|
||||
#endif
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
copyDataToCUDADevice(&atom);
|
||||
#endif
|
||||
|
||||
if(param.force_field == FF_EAM) {
|
||||
timer[FORCE] = computeForceEam(&eam, ¶m, &atom, &neighbor, &stats);
|
||||
} else {
|
||||
timer[FORCE] = computeForceLJ(¶m, &atom, &neighbor, &stats);
|
||||
}
|
||||
|
||||
timer[NEIGH] = 0.0;
|
||||
timer[NEIGH] = 0.0;
|
||||
timer[FORWARD] = 0.0;
|
||||
timer[UPDATE] = 0.0;
|
||||
timer[BALANCE] = 0.0;
|
||||
timer[REVERSE] = reverse(&comm, &atom, ¶m);
|
||||
MPI_Barrier(world);
|
||||
timer[TOTAL] = getTimeStamp();
|
||||
|
||||
if(param.vtk_file != NULL) {
|
||||
write_data_to_vtk_file(param.vtk_file, &atom, 0);
|
||||
//write_data_to_vtk_file(param.vtk_file, &comm ,&atom, 0);
|
||||
printvtk(param.vtk_file, &comm, &atom, ¶m, 0);
|
||||
}
|
||||
|
||||
//TODO: modify xtc
|
||||
if(param.xtc_file != NULL) {
|
||||
xtc_init(param.xtc_file, &atom, 0);
|
||||
}
|
||||
|
||||
for(int n = 0; n < param.ntimes; n++) {
|
||||
double forceTime=0.0;
|
||||
double commTime=0.0;
|
||||
for(int n = 0; n < param.ntimes; n++) {
|
||||
initialIntegrate(¶m, &atom);
|
||||
|
||||
if((n + 1) % param.reneigh_every) {
|
||||
if(!((n + 1) % param.prune_every)) {
|
||||
timer[FORWARD]+=forward(&comm, &atom, ¶m);
|
||||
if(!((n + 1) % param.prune_every)){
|
||||
pruneNeighbor(¶m, &atom, &neighbor);
|
||||
}
|
||||
|
||||
updatePbc(&atom, ¶m, 0);
|
||||
} else {
|
||||
#ifdef CUDA_TARGET
|
||||
copyDataFromCUDADevice(&atom);
|
||||
#endif
|
||||
|
||||
timer[NEIGH] += reneighbour(¶m, &atom, &neighbor);
|
||||
|
||||
timer[UPDATE] +=updateAtoms(&comm,&atom);
|
||||
if(param.balance && !((n+1)%param.balance_every))
|
||||
timer[BALANCE] +=dynamicBalance(&comm, &grid, &atom , ¶m, timer[FORCE]);
|
||||
timer[NEIGH] += reneighbour(&comm, ¶m, &atom, &neighbor);
|
||||
#ifdef CUDA_TARGET
|
||||
copyDataToCUDADevice(&atom);
|
||||
isReneighboured = 1;
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
|
||||
traceAddresses(¶m, &atom, &neighbor, n + 1);
|
||||
#endif
|
||||
|
||||
if(param.force_field == FF_EAM) {
|
||||
timer[FORCE] += computeForceEam(&eam, ¶m, &atom, &neighbor, &stats);
|
||||
timer[FORCE] += computeForceEam(&eam, ¶m, &atom, &neighbor, &stats);
|
||||
} else {
|
||||
timer[FORCE] += computeForceLJ(¶m, &atom, &neighbor, &stats);
|
||||
}
|
||||
|
||||
}
|
||||
timer[REVERSE] += reverse(&comm, &atom, ¶m);
|
||||
finalIntegrate(¶m, &atom);
|
||||
|
||||
if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
|
||||
computeThermo(n + 1, ¶m, &atom);
|
||||
}
|
||||
|
||||
int write_pos = !((n + 1) % param.x_out_every);
|
||||
int write_vel = !((n + 1) % param.v_out_every);
|
||||
if(write_pos || write_vel) {
|
||||
if(param.vtk_file != NULL) {
|
||||
write_data_to_vtk_file(param.vtk_file, &atom, n + 1);
|
||||
printvtk(param.vtk_file, &comm, &atom, ¶m, n+1);
|
||||
}
|
||||
|
||||
//TODO: xtc file
|
||||
if(param.xtc_file != NULL) {
|
||||
xtc_write(&atom, n + 1, write_pos, write_vel);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
copyDataFromCUDADevice(&atom);
|
||||
#endif
|
||||
|
||||
MPI_Barrier(world);
|
||||
timer[TOTAL] = getTimeStamp() - timer[TOTAL];
|
||||
updateSingleAtoms(&atom);
|
||||
updateAtoms(&comm,&atom);
|
||||
computeThermo(-1, ¶m, &atom);
|
||||
|
||||
//TODO:
|
||||
if(param.xtc_file != NULL) {
|
||||
xtc_end();
|
||||
}
|
||||
@@ -302,17 +370,35 @@ int main(int argc, char** argv) {
|
||||
#ifdef CUDA_TARGET
|
||||
cudaDeviceFree();
|
||||
#endif
|
||||
|
||||
printf(HLINE);
|
||||
printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
|
||||
printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
|
||||
timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]);
|
||||
printf(HLINE);
|
||||
printf("Performance: %.2f million atom updates per second\n",
|
||||
1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
|
||||
#ifdef COMPUTE_STATS
|
||||
double mint[NUMTIMER];
|
||||
double maxt[NUMTIMER];
|
||||
double sumt[NUMTIMER];
|
||||
timer[REST] = timer[TOTAL]-timer[FORCE]-timer[NEIGH]-timer[BALANCE]-timer[FORWARD]-timer[REVERSE];
|
||||
MPI_Reduce(timer,mint,NUMTIMER,MPI_DOUBLE,MPI_MIN,0,world);
|
||||
MPI_Reduce(timer,maxt,NUMTIMER,MPI_DOUBLE,MPI_MAX,0,world);
|
||||
MPI_Reduce(timer,sumt,NUMTIMER,MPI_DOUBLE,MPI_SUM,0,world);
|
||||
int Nghost;
|
||||
MPI_Reduce(&atom.Nghost,&Nghost,1,MPI_INT,MPI_SUM,0,world);
|
||||
|
||||
if(comm.myproc == 0){
|
||||
int n = comm.numproc;
|
||||
printf(HLINE);
|
||||
printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, Nghost, param.ntimes);
|
||||
printf("TOTAL %.2fs\n\n",timer[TOTAL]);
|
||||
printf("%4s|%7s|%7s|%7s|%7s|%7s|%7s|%7s|%7s|\n","","FORCE ", "NEIGH ", "BALANCE", "FORWARD", "REVERSE","UPDATE","REST ","SETUP");
|
||||
printf("----|-------|-------|-------|-------|-------|-------|-------|-------|\n");
|
||||
printf("%4s|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|\n", "AVG", sumt[FORCE]/n,sumt[NEIGH]/n,sumt[BALANCE]/n,sumt[FORWARD]/n,sumt[REVERSE]/n,sumt[UPDATE]/n,sumt[REST]/n,sumt[SETUP]/n);
|
||||
printf("%4s|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|\n", "MIN", mint[FORCE],mint[NEIGH],mint[BALANCE],mint[FORWARD],mint[REVERSE],mint[UPDATE],mint[REST],mint[SETUP]);
|
||||
printf("%4s|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|\n", "MAX", maxt[FORCE],maxt[NEIGH],maxt[BALANCE],maxt[FORWARD],maxt[REVERSE],maxt[UPDATE],maxt[REST],maxt[SETUP]);
|
||||
printf(HLINE);
|
||||
printf("Performance: %.2f million atom updates per second\n",
|
||||
1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
|
||||
|
||||
#ifdef COMPUTE_STATS
|
||||
displayStatistics(&atom, ¶m, &stats, timer);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
endComm(&comm);
|
||||
LIKWID_MARKER_CLOSE;
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -7,15 +7,15 @@
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <atom.h>
|
||||
#include <util.h>
|
||||
#include <mpi.h>
|
||||
|
||||
#define SMALL 1.0e-6
|
||||
#define FACTOR 0.999
|
||||
|
||||
#define eps 1.0e-9
|
||||
static MD_FLOAT xprd, yprd, zprd;
|
||||
static MD_FLOAT bininvx, bininvy;
|
||||
static int mbinxlo, mbinylo;
|
||||
@@ -34,9 +34,16 @@ static int nmax;
|
||||
static int nstencil; // # of bins in stencil
|
||||
static int* stencil; // stencil list of bin offsets
|
||||
static MD_FLOAT binsizex, binsizey;
|
||||
int me; //rank
|
||||
int method; // method
|
||||
int shellMethod; //If shell method exist
|
||||
|
||||
static int coord2bin(MD_FLOAT, MD_FLOAT);
|
||||
static MD_FLOAT bindist(int, int);
|
||||
//static int ghostZone(Atom*, int);
|
||||
static int halfZoneCluster(Atom*,int);
|
||||
static int ghostClusterinRange(Atom*, int, int, MD_FLOAT);
|
||||
static void neighborGhost(Atom*, Neighbor*);
|
||||
|
||||
/* exported subroutines */
|
||||
void initNeighbor(Neighbor *neighbor, Parameter *param) {
|
||||
@@ -53,11 +60,25 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) {
|
||||
bincount = NULL;
|
||||
bin_clusters = NULL;
|
||||
bin_nclusters = NULL;
|
||||
neighbor->half_neigh = param->half_neigh;
|
||||
neighbor->maxneighs = 100;
|
||||
neighbor->maxneighs = 200;
|
||||
neighbor->numneigh = NULL;
|
||||
neighbor->numneigh_masked = NULL;
|
||||
neighbor->neighbors = NULL;
|
||||
neighbor->neighbors_imask = NULL;
|
||||
//MPI
|
||||
shellMethod = 0;
|
||||
method = param->method;
|
||||
if(method == halfShell || method == eightShell){
|
||||
param->half_neigh = 1;
|
||||
shellMethod = 1;
|
||||
}
|
||||
me = 0;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
||||
neighbor->half_neigh = param->half_neigh;
|
||||
neighbor->Nshell = 0;
|
||||
neighbor->numNeighShell = NULL;
|
||||
neighbor->neighshell = NULL;
|
||||
neighbor->listshell = NULL;
|
||||
}
|
||||
|
||||
void setupNeighbor(Parameter *param, Atom *atom) {
|
||||
@@ -76,7 +97,7 @@ void setupNeighbor(Parameter *param, Atom *atom) {
|
||||
MD_FLOAT ylo = 0.0; MD_FLOAT yhi = yprd;
|
||||
MD_FLOAT zlo = 0.0; MD_FLOAT zhi = zprd;
|
||||
|
||||
MD_FLOAT atom_density = ((MD_FLOAT)(atom->Nlocal)) / ((xhi - xlo) * (yhi - ylo) * (zhi - zlo));
|
||||
MD_FLOAT atom_density = ((MD_FLOAT)(atom->Natoms)) / ((xhi - xlo) * (yhi - ylo) * (zhi - zlo));
|
||||
MD_FLOAT atoms_in_cell = MAX(CLUSTER_M, CLUSTER_N);
|
||||
MD_FLOAT targetsizex = cbrt(atoms_in_cell / atom_density);
|
||||
MD_FLOAT targetsizey = cbrt(atoms_in_cell / atom_density);
|
||||
@@ -145,6 +166,7 @@ void setupNeighbor(Parameter *param, Atom *atom) {
|
||||
}
|
||||
|
||||
MD_FLOAT getBoundingBoxDistanceSq(Atom *atom, int ci, int cj) {
|
||||
|
||||
MD_FLOAT dl = atom->iclusters[ci].bbminx - atom->jclusters[cj].bbmaxx;
|
||||
MD_FLOAT dh = atom->jclusters[cj].bbminx - atom->iclusters[ci].bbmaxx;
|
||||
MD_FLOAT dm = MAX(dl, dh);
|
||||
@@ -162,6 +184,7 @@ MD_FLOAT getBoundingBoxDistanceSq(Atom *atom, int ci, int cj) {
|
||||
dm = MAX(dl, dh);
|
||||
dm0 = MAX(dm, 0.0);
|
||||
d2 += dm0 * dm0;
|
||||
|
||||
return d2;
|
||||
}
|
||||
|
||||
@@ -224,15 +247,17 @@ static unsigned int get_imask_simd_j8(int rdiag, int ci, int cj) {
|
||||
|
||||
void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||
DEBUG_MESSAGE("buildNeighbor start\n");
|
||||
|
||||
/* extend atom arrays if necessary */
|
||||
if(atom->Nclusters_local > nmax) {
|
||||
nmax = atom->Nclusters_local;
|
||||
if(neighbor->numneigh) free(neighbor->numneigh);
|
||||
if(neighbor->numneigh_masked) free(neighbor->numneigh_masked);
|
||||
if(neighbor->neighbors) free(neighbor->neighbors);
|
||||
if(neighbor->neighbors_imask) free(neighbor->neighbors_imask);
|
||||
neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
|
||||
neighbor->numneigh_masked = (int*) malloc(nmax * sizeof(int));
|
||||
neighbor->neighbors = (NeighborCluster*) malloc(nmax * neighbor->maxneighs * sizeof(NeighborCluster));
|
||||
neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int));
|
||||
neighbor->neighbors_imask = (unsigned int*) malloc(nmax * neighbor->maxneighs * sizeof(unsigned int));
|
||||
}
|
||||
|
||||
MD_FLOAT bbx = 0.5 * (binsizex + binsizex);
|
||||
@@ -245,10 +270,10 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||
while(resize) {
|
||||
int new_maxneighs = neighbor->maxneighs;
|
||||
resize = 0;
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_cj1 = CJ1_FROM_CI(ci);
|
||||
NeighborCluster *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
|
||||
int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
|
||||
unsigned int *neighptr_imask = &(neighbor->neighbors_imask[ci * neighbor->maxneighs]);
|
||||
int n = 0, nmasked = 0;
|
||||
int ibin = atom->icluster_bin[ci];
|
||||
MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx;
|
||||
@@ -257,14 +282,12 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||
MD_FLOAT ibb_ymax = atom->iclusters[ci].bbmaxy;
|
||||
MD_FLOAT ibb_zmin = atom->iclusters[ci].bbminz;
|
||||
MD_FLOAT ibb_zmax = atom->iclusters[ci].bbmaxz;
|
||||
|
||||
for(int k = 0; k < nstencil; k++) {
|
||||
int jbin = ibin + stencil[k];
|
||||
int *loc_bin = &bin_clusters[jbin * clusters_per_bin];
|
||||
int cj, m = -1;
|
||||
MD_FLOAT jbb_xmin, jbb_xmax, jbb_ymin, jbb_ymax, jbb_zmin, jbb_zmax;
|
||||
const int c = bin_nclusters[jbin];
|
||||
|
||||
if(c > 0) {
|
||||
MD_FLOAT dl, dh, dm, dm0, d_bb_sq;
|
||||
|
||||
@@ -274,6 +297,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||
if(neighbor->half_neigh && ci_cj1 > cj) {
|
||||
continue;
|
||||
}
|
||||
|
||||
jbb_zmin = atom->jclusters[cj].bbminz;
|
||||
jbb_zmax = atom->jclusters[cj].bbmaxz;
|
||||
dl = ibb_zmin - jbb_zmax;
|
||||
@@ -282,7 +306,6 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||
dm0 = MAX(dm, 0.0);
|
||||
d_bb_sq = dm0 * dm0;
|
||||
} while(m + 1 < c && d_bb_sq > cutneighsq);
|
||||
|
||||
jbb_xmin = atom->jclusters[cj].bbminx;
|
||||
jbb_xmax = atom->jclusters[cj].bbmaxx;
|
||||
jbb_ymin = atom->jclusters[cj].bbminy;
|
||||
@@ -324,15 +347,17 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||
imask = get_imask_simd_4xn(1, ci, cj);
|
||||
#endif
|
||||
|
||||
if(imask == NBNXN_INTERACTION_MASK_ALL) {
|
||||
neighptr[n].cj = cj;
|
||||
neighptr[n].imask = imask;
|
||||
} else {
|
||||
neighptr[n].cj = neighptr[nmasked].cj;
|
||||
neighptr[n].imask = neighptr[nmasked].imask;
|
||||
neighptr[nmasked].cj = cj;
|
||||
neighptr[nmasked].imask = imask;
|
||||
nmasked++;
|
||||
if(n < neighbor->maxneighs) {
|
||||
if(imask == NBNXN_INTERACTION_MASK_ALL) {
|
||||
neighptr[n] = cj;
|
||||
neighptr_imask[n] = imask;
|
||||
} else {
|
||||
neighptr[n] = neighptr[nmasked];
|
||||
neighptr_imask[n] = neighptr_imask[nmasked];
|
||||
neighptr[nmasked] = cj;
|
||||
neighptr_imask[nmasked] = imask;
|
||||
nmasked++;
|
||||
}
|
||||
}
|
||||
|
||||
n++;
|
||||
@@ -350,15 +375,15 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||
jbb_zmin = atom->jclusters[cj].bbminz;
|
||||
jbb_zmax = atom->jclusters[cj].bbmaxz;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fill neighbor list with dummy values to fit vector width
|
||||
if(CLUSTER_N < VECTOR_WIDTH) {
|
||||
while(n % (VECTOR_WIDTH / CLUSTER_N)) {
|
||||
neighptr[n].cj = atom->dummy_cj; // Last cluster is always a dummy cluster
|
||||
neighptr[n].imask = 0;
|
||||
neighptr[n] = atom->dummy_cj; // Last cluster is always a dummy cluster
|
||||
neighptr_imask[n] = 0;
|
||||
n++;
|
||||
}
|
||||
}
|
||||
@@ -375,13 +400,16 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||
}
|
||||
|
||||
if(resize) {
|
||||
fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
|
||||
neighbor->maxneighs = new_maxneighs * 1.2;
|
||||
fprintf(stdout, "RESIZE %d, PROC %d\n", neighbor->maxneighs,me);
|
||||
free(neighbor->neighbors);
|
||||
neighbor->neighbors = (NeighborCluster*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
|
||||
free(neighbor->neighbors_imask);
|
||||
neighbor->neighbors = (int *) malloc(nmax * neighbor->maxneighs * sizeof(int));
|
||||
neighbor->neighbors_imask = (unsigned int *) malloc(nmax * neighbor->maxneighs * sizeof(unsigned int));
|
||||
}
|
||||
}
|
||||
|
||||
if(method == eightShell) neighborGhost(atom, neighbor);
|
||||
|
||||
/*
|
||||
DEBUG_MESSAGE("\ncutneighsq = %f, rbb_sq = %f\n", cutneighsq, rbb_sq);
|
||||
for(int ci = 0; ci < 6; ci++) {
|
||||
@@ -433,20 +461,21 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
||||
MD_FLOAT cutsq = cutneighsq;
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
NeighborCluster *neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
||||
int *neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
||||
unsigned int *neighs_imask = &neighbor->neighbors_imask[ci * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[ci];
|
||||
int numneighs_masked = neighbor->numneigh_masked[ci];
|
||||
int k = 0;
|
||||
|
||||
// Remove dummy clusters if necessary
|
||||
if(CLUSTER_N < VECTOR_WIDTH) {
|
||||
while(neighs[numneighs - 1].cj == atom->dummy_cj) {
|
||||
while(neighs[numneighs - 1] == atom->dummy_cj) {
|
||||
numneighs--;
|
||||
}
|
||||
}
|
||||
|
||||
while(k < numneighs) {
|
||||
int cj = neighs[k].cj;
|
||||
int cj = neighs[k];
|
||||
if(atomDistanceInRange(atom, ci, cj, cutsq)) {
|
||||
k++;
|
||||
} else {
|
||||
@@ -461,8 +490,8 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
||||
// Readd dummy clusters if necessary
|
||||
if(CLUSTER_N < VECTOR_WIDTH) {
|
||||
while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) {
|
||||
neighs[numneighs].cj = atom->dummy_cj; // Last cluster is always a dummy cluster
|
||||
neighs[numneighs].imask = 0;
|
||||
neighs[numneighs] = atom->dummy_cj; // Last cluster is always a dummy cluster
|
||||
neighs_imask[numneighs] = 0;
|
||||
numneighs++;
|
||||
}
|
||||
}
|
||||
@@ -501,46 +530,44 @@ int coord2bin(MD_FLOAT xin, MD_FLOAT yin) {
|
||||
int ix, iy;
|
||||
|
||||
if(xin >= xprd) {
|
||||
ix = (int)((xin - xprd) * bininvx) + nbinx - mbinxlo;
|
||||
ix = (int)((xin + eps - xprd) * bininvx) + nbinx - mbinxlo;
|
||||
} else if(xin >= 0.0) {
|
||||
ix = (int)(xin * bininvx) - mbinxlo;
|
||||
ix = (int)((xin+eps) * bininvx) - mbinxlo;
|
||||
} else {
|
||||
ix = (int)(xin * bininvx) - mbinxlo - 1;
|
||||
ix = (int)((xin+eps) * bininvx) - mbinxlo - 1;
|
||||
}
|
||||
|
||||
if(yin >= yprd) {
|
||||
iy = (int)((yin - yprd) * bininvy) + nbiny - mbinylo;
|
||||
if(yin >= yprd) {
|
||||
iy = (int)(((yin+eps) - yprd) * bininvy) + nbiny - mbinylo;
|
||||
} else if(yin >= 0.0) {
|
||||
iy = (int)(yin * bininvy) - mbinylo;
|
||||
iy = (int)((yin+eps) * bininvy) - mbinylo;
|
||||
} else {
|
||||
iy = (int)(yin * bininvy) - mbinylo - 1;
|
||||
iy = (int)((yin+eps) * bininvy) - mbinylo - 1;
|
||||
}
|
||||
|
||||
|
||||
return (iy * mbinx + ix + 1);
|
||||
}
|
||||
|
||||
void coord2bin2D(MD_FLOAT xin, MD_FLOAT yin, int *ix, int *iy) {
|
||||
if(xin >= xprd) {
|
||||
*ix = (int)((xin - xprd) * bininvx) + nbinx - mbinxlo;
|
||||
*ix = (int)((xin + eps - xprd) * bininvx) + nbinx - mbinxlo;
|
||||
} else if(xin >= 0.0) {
|
||||
*ix = (int)(xin * bininvx) - mbinxlo;
|
||||
*ix = (int)((xin+eps) * bininvx) - mbinxlo;
|
||||
} else {
|
||||
*ix = (int)(xin * bininvx) - mbinxlo - 1;
|
||||
*ix = (int)((xin+eps) * bininvx) - mbinxlo - 1;
|
||||
}
|
||||
|
||||
if(yin >= yprd) {
|
||||
*iy = (int)((yin - yprd) * bininvy) + nbiny - mbinylo;
|
||||
if(yin >= yprd) {
|
||||
*iy = (int)((yin + eps - yprd) * bininvy) + nbiny - mbinylo;
|
||||
} else if(yin >= 0.0) {
|
||||
*iy = (int)(yin * bininvy) - mbinylo;
|
||||
*iy = (int)((yin+eps) * bininvy) - mbinylo;
|
||||
} else {
|
||||
*iy = (int)(yin * bininvy) - mbinylo - 1;
|
||||
*iy = (int)((yin+eps) * bininvy) - mbinylo - 1;
|
||||
}
|
||||
}
|
||||
|
||||
void binAtoms(Atom *atom) {
|
||||
DEBUG_MESSAGE("binAtoms start\n");
|
||||
int resize = 1;
|
||||
|
||||
while(resize > 0) {
|
||||
resize = 0;
|
||||
|
||||
@@ -557,7 +584,7 @@ void binAtoms(Atom *atom) {
|
||||
resize = 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if(resize) {
|
||||
free(bins);
|
||||
atoms_per_bin *= 2;
|
||||
@@ -605,8 +632,7 @@ void buildClusters(Atom *atom) {
|
||||
|
||||
/* bin local atoms */
|
||||
binAtoms(atom);
|
||||
sortAtomsByZCoord(atom);
|
||||
|
||||
sortAtomsByZCoord(atom);
|
||||
for(int bin = 0; bin < mbins; bin++) {
|
||||
int c = bincount[bin];
|
||||
int ac = 0;
|
||||
@@ -678,6 +704,9 @@ void buildClusters(Atom *atom) {
|
||||
void defineJClusters(Atom *atom) {
|
||||
DEBUG_MESSAGE("defineJClusters start\n");
|
||||
|
||||
const int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
|
||||
atom->ncj = atom->Nclusters_local / jfac;
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int cj0 = CJ0_FROM_CI(ci);
|
||||
|
||||
@@ -820,12 +849,11 @@ void binClusters(Atom *atom) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for(int cg = 0; cg < atom->Nclusters_ghost && !resize; cg++) {
|
||||
const int cj = ncj + cg;
|
||||
int ix = -1, iy = -1;
|
||||
MD_FLOAT xtmp, ytmp;
|
||||
|
||||
if(shellMethod == halfShell && !halfZoneCluster(atom, cj)) continue;
|
||||
if(atom->jclusters[cj].natoms > 0) {
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
@@ -836,6 +864,7 @@ void binClusters(Atom *atom) {
|
||||
coord2bin2D(xtmp, ytmp, &ix, &iy);
|
||||
ix = MAX(MIN(ix, mbinx - 1), 0);
|
||||
iy = MAX(MIN(iy, mbiny - 1), 0);
|
||||
|
||||
for(int cjj = 1; cjj < atom->jclusters[cj].natoms; cjj++) {
|
||||
int nix, niy;
|
||||
xtmp = cj_x[CL_X_OFFSET + cjj];
|
||||
@@ -843,7 +872,7 @@ void binClusters(Atom *atom) {
|
||||
coord2bin2D(xtmp, ytmp, &nix, &niy);
|
||||
nix = MAX(MIN(nix, mbinx - 1), 0);
|
||||
niy = MAX(MIN(niy, mbiny - 1), 0);
|
||||
|
||||
|
||||
// Always put the cluster on the bin of its innermost atom so
|
||||
// the cluster should be closer to local clusters
|
||||
if(atom->PBCx[cg] > 0 && ix > nix) { ix = nix; }
|
||||
@@ -851,7 +880,6 @@ void binClusters(Atom *atom) {
|
||||
if(atom->PBCy[cg] > 0 && iy > niy) { iy = niy; }
|
||||
if(atom->PBCy[cg] < 0 && iy < niy) { iy = niy; }
|
||||
}
|
||||
|
||||
int bin = iy * mbinx + ix + 1;
|
||||
int c = bin_nclusters[bin];
|
||||
if(c < clusters_per_bin) {
|
||||
@@ -873,25 +901,21 @@ void binClusters(Atom *atom) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if(!inserted) {
|
||||
bin_clusters[bin * clusters_per_bin + c] = cj;
|
||||
}
|
||||
|
||||
bin_nclusters[bin]++;
|
||||
} else {
|
||||
resize = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(resize) {
|
||||
free(bin_clusters);
|
||||
clusters_per_bin *= 2;
|
||||
bin_clusters = (int*) malloc(mbins * clusters_per_bin * sizeof(int));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
DEBUG_MESSAGE("bin_nclusters\n");
|
||||
for(int i = 0; i < mbins; i++) { DEBUG_MESSAGE("%d, ", bin_nclusters[i]); }
|
||||
@@ -909,7 +933,6 @@ void updateSingleAtoms(Atom *atom) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
||||
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
|
||||
|
||||
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
|
||||
atom_x(Natom) = ci_x[CL_X_OFFSET + cii];
|
||||
atom_y(Natom) = ci_x[CL_Y_OFFSET + cii];
|
||||
@@ -918,12 +941,174 @@ void updateSingleAtoms(Atom *atom) {
|
||||
atom->vy[Natom] = ci_v[CL_Y_OFFSET + cii];
|
||||
atom->vz[Natom] = ci_v[CL_Z_OFFSET + cii];
|
||||
Natom++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(Natom != atom->Nlocal) {
|
||||
fprintf(stderr, "updateSingleAtoms(): Number of atoms changed!\n");
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("updateSingleAtoms stop\n");
|
||||
}
|
||||
|
||||
//MPI Shell Methods
|
||||
|
||||
/*
 * Classifies j-cluster `cj` into one of eight zones relative to the upper
 * faces of the local box (atom->mybox.hi).
 *
 * A raw 3-bit code is built from the lower corner of the cluster bounding box
 * (bit 0: x, bit 1: y, bit 2: z; a bit is set when bbmin + eps >= hi on that
 * axis) and then translated through a fixed permutation table.
 *
 * Returns the permuted zone id in the range [0, 7].
 */
static int eightZoneCluster(Atom* atom, int cj)
{
    // Raw code -> zone id: 0->0, 1->1, 2->2, 3->6, 4->3, 5->5, 6->4, 7->7
    static const int zoneMapping[8] = {0, 1, 2, 6, 3, 5, 4, 7};
    const MD_FLOAT *boxHi = atom->mybox.hi;

    const int beyondX = (atom->jclusters[cj].bbminx + eps >= boxHi[_x]) ? 1 : 0;
    const int beyondY = (atom->jclusters[cj].bbminy + eps >= boxHi[_y]) ? 1 : 0;
    const int beyondZ = (atom->jclusters[cj].bbminz + eps >= boxHi[_z]) ? 1 : 0;

    return zoneMapping[beyondX | (beyondY << 1) | (beyondZ << 2)];
}
|
||||
|
||||
/*
 * Half-shell filter for j-cluster `cj`, based on the upper corner of its
 * bounding box compared against the local box bounds (atom->mybox.lo / .hi).
 *
 * Returns 0 when the cluster lies
 *   - below lo in x while below hi in both y and z, or
 *   - below lo in y while below hi in z, or
 *   - below lo in z;
 * returns 1 otherwise.
 */
static int halfZoneCluster(Atom* atom, int cj)
{
    const MD_FLOAT *boxLo = atom->mybox.lo;
    const MD_FLOAT *boxHi = atom->mybox.hi;
    const MD_FLOAT maxX = atom->jclusters[cj].bbmaxx;
    const MD_FLOAT maxY = atom->jclusters[cj].bbmaxy;
    const MD_FLOAT maxZ = atom->jclusters[cj].bbmaxz;

    const int lowInX = (maxX < boxLo[_x]) && (maxY < boxHi[_y]) && (maxZ < boxHi[_z]);
    const int lowInY = (maxY < boxLo[_y]) && (maxZ < boxHi[_z]);
    const int lowInZ = (maxZ < boxLo[_z]);

    return (lowInX || lowInY || lowInZ) ? 0 : 1;
}
|
||||
|
||||
/*
 * Bounding-box pre-check between j-clusters `ci` and `cj`.
 *
 * For every axis the gap between the two boxes is computed (0 when they
 * overlap on that axis) and squared. Returns 0 as soon as one axis alone has
 * a squared gap larger than cutneighsq, 1 otherwise.
 *
 * NOTE(review): the three squared gaps are compared individually and never
 * summed, so this is looser than a full box-to-box distance test; callers
 * that need an exact answer must follow up with a per-atom check.
 */
int BoxGhostDistance(Atom *atom, int ci, int cj) {
    MD_FLOAT lowGap, highGap, gap, clamped;

    // x axis
    lowGap  = atom->jclusters[ci].bbminx - atom->jclusters[cj].bbmaxx;
    highGap = atom->jclusters[cj].bbminx - atom->jclusters[ci].bbmaxx;
    gap     = MAX(lowGap, highGap);
    clamped = MAX(gap, 0.0);
    const MD_FLOAT gapSqX = clamped * clamped;

    // y axis
    lowGap  = atom->jclusters[ci].bbminy - atom->jclusters[cj].bbmaxy;
    highGap = atom->jclusters[cj].bbminy - atom->jclusters[ci].bbmaxy;
    gap     = MAX(lowGap, highGap);
    clamped = MAX(gap, 0.0);
    const MD_FLOAT gapSqY = clamped * clamped;

    // z axis
    lowGap  = atom->jclusters[ci].bbminz - atom->jclusters[cj].bbmaxz;
    highGap = atom->jclusters[cj].bbminz - atom->jclusters[ci].bbmaxz;
    gap     = MAX(lowGap, highGap);
    clamped = MAX(gap, 0.0);
    const MD_FLOAT gapSqZ = clamped * clamped;

    return !(gapSqX > cutneighsq || gapSqY > cutneighsq || gapSqZ > cutneighsq);
}
|
||||
|
||||
/*
 * Exact atom-level range test between j-clusters `cs` and `cg`.
 *
 * Returns 1 if at least one atom of `cs` and one atom of `cg` are closer than
 * sqrt(rsq) (strict comparison on squared distances), 0 otherwise.
 */
static int ghostClusterinRange(Atom *atom, int cs, int cg, MD_FLOAT rsq) {
    const MD_FLOAT *shellPos = &atom->cl_x[CJ_VECTOR_BASE_INDEX(cs)];
    const MD_FLOAT *ghostPos = &atom->cl_x[CJ_VECTOR_BASE_INDEX(cg)];
    const int nShell = atom->jclusters[cs].natoms;
    const int nGhost = atom->jclusters[cg].natoms;

    for(int i = 0; i < nShell; i++) {
        const MD_FLOAT xi = shellPos[CL_X_OFFSET + i];
        const MD_FLOAT yi = shellPos[CL_Y_OFFSET + i];
        const MD_FLOAT zi = shellPos[CL_Z_OFFSET + i];

        for(int j = 0; j < nGhost; j++) {
            const MD_FLOAT dx = xi - ghostPos[CL_X_OFFSET + j];
            const MD_FLOAT dy = yi - ghostPos[CL_Y_OFFSET + j];
            const MD_FLOAT dz = zi - ghostPos[CL_Z_OFFSET + j];

            if(dx * dx + dy * dy + dz * dz < rsq) {
                return 1;
            }
        }
    }

    return 0;
}
|
||||
|
||||
/*
 * Builds the ghost-to-ghost ("shell") neighbor lists.
 *
 * 1. Every ghost j-cluster (indices atom->ncj .. atom->ncj + Nclusters_ghost - 1)
 *    is sorted into one of eight zone buckets via eightZoneCluster().
 * 2. Clusters of zones 1..3 become the "i" side and are stored in
 *    neighbor->listshell (neighbor->Nshell entries).
 * 3. For each of them, clusters from higher-numbered zones (minus the
 *    excluded zone pairs below) that pass both the bounding-box and the
 *    per-atom range test are written to neighbor->neighshell, one row of
 *    neighbor->maxneighs entries per shell cluster; the row length goes to
 *    neighbor->numNeighShell.
 *
 * If a row does not fit, neighbor->maxneighs is enlarged and all rows are
 * rebuilt from scratch.
 *
 * NOTE(review): neighbor->maxneighs is also the row stride that
 * buildNeighbor()/pruneNeighbor() use for neighbor->neighbors and
 * neighbor->neighbors_imask. Enlarging it here does not re-lay-out those
 * arrays, which looks unsafe once a resize actually happens - confirm, and
 * consider a dedicated capacity field for the shell lists.
 */
static void neighborGhost(Atom *atom, Neighbor *neighbor) {
    const int Nclusterghost = atom->Nclusters_ghost;
    int Nshell = 0;
    int countCluster[8] = {0, 0, 0, 0, 0, 0, 0, 0};

    if(neighbor->listshell) free(neighbor->listshell);
    neighbor->listshell = (int*) malloc(Nclusterghost * sizeof(int));
    // One bucket of Nclusterghost slots per zone, so no bucket can overflow
    int* listzone = (int*) malloc(8 * Nclusterghost * sizeof(int));

    // Sort all ghost clusters into their zone bucket
    for(int cg = atom->ncj; cg < atom->ncj + Nclusterghost; cg++) {
        const int czone = eightZoneCluster(atom, cg);
        listzone[Nclusterghost * czone + countCluster[czone]] = cg;
        countCluster[czone]++;
    }

    // Neighbors only have to be searched for clusters in zones 1, 2 and 3
    for(int zone = 1; zone <= 3; zone++) {
        const int *list = &listzone[Nclusterghost * zone];
        for(int n = 0; n < countCluster[zone]; n++) {
            neighbor->listshell[Nshell++] = list[n];
        }
    }

    neighbor->Nshell = Nshell;
    if(neighbor->numNeighShell) free(neighbor->numNeighShell);
    if(neighbor->neighshell) free(neighbor->neighshell);
    neighbor->neighshell = (int*) malloc(Nshell * neighbor->maxneighs * sizeof(int));
    neighbor->numNeighShell = (int*) malloc(Nshell * sizeof(int));

    int resize = 1;

    while(resize) {
        resize = 0;

        for(int ic = 0; ic < Nshell; ic++) {
            int *neighshell = &(neighbor->neighshell[ic * neighbor->maxneighs]);
            int n = 0;
            const int icluster = neighbor->listshell[ic];
            const int iczone = eightZoneCluster(atom, icluster);

            for(int jczone = 0; jczone < 8; jczone++) {
                if(jczone <= iczone) continue;
                if(iczone == 1 && (jczone == 5 || jczone == 6 || jczone == 7)) continue;
                if(iczone == 2 && (jczone == 4 || jczone == 6 || jczone == 7)) continue;
                if(iczone == 3 && (jczone == 4 || jczone == 5 || jczone == 7)) continue;

                const int Ncluster = countCluster[jczone];
                const int *loc_zone = &listzone[jczone * Nclusterghost];

                for(int k = 0; k < Ncluster; k++) {
                    const int jcluster = loc_zone[k];

                    if(BoxGhostDistance(atom, icluster, jcluster) &&
                       ghostClusterinRange(atom, icluster, jcluster, cutneighsq)) {
                        // Keep counting past the row capacity so the resize below
                        // knows the required size, but never store out of bounds.
                        if(n < neighbor->maxneighs) {
                            neighshell[n] = jcluster;
                        }
                        n++;
                    }
                }
            }

            neighbor->numNeighShell[ic] = n;

            if(n >= neighbor->maxneighs) {
                resize = 1;
                neighbor->maxneighs = n * 1.2;
                fprintf(stdout, "RESIZE EIGHT SHELL %d, PROC %d\n", neighbor->maxneighs,me);
                break;
            }
        }

        if(resize) {
            // Row stride changed: drop everything and rebuild all rows
            free(neighbor->neighshell);
            neighbor->neighshell = (int*) malloc(Nshell * neighbor->maxneighs * sizeof(int));
        }
    }

    free(listzone);
}
|
||||
|
||||
@@ -13,7 +13,8 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
|
||||
MEM_TRACER_INIT;
|
||||
INDEX_TRACER_INIT;
|
||||
int Nlocal = atom->Nlocal;
|
||||
NeighborCluster* neighs;
|
||||
int *neighs;
|
||||
unsigned int *neighs_imask;
|
||||
//MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
|
||||
|
||||
INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
|
||||
@@ -34,7 +35,7 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
|
||||
DIST_TRACE(neighs, numneighs);
|
||||
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int j = neighs[k].cj;
|
||||
int j = neighs[k];
|
||||
MEM_TRACE(j, 'R');
|
||||
MEM_TRACE(atom_x(j), 'R');
|
||||
MEM_TRACE(atom_y(j), 'R');
|
||||
|
||||
130
gromacs/vtk.c
130
gromacs/vtk.c
@@ -9,6 +9,11 @@
|
||||
|
||||
#include <atom.h>
|
||||
#include <vtk.h>
|
||||
#include <mpi.h>
|
||||
#include <string.h>
|
||||
|
||||
static MPI_File _fh;
|
||||
static inline void flushBuffer(char*);
|
||||
|
||||
void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep) {
|
||||
write_local_atoms_to_vtk_file(filename, atom, timestep);
|
||||
@@ -188,3 +193,128 @@ int write_ghost_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int
|
||||
fclose(fp);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Collectively opens "<filename>_<timestep>.vtk" for writing through MPI-IO
 * (handle kept in the file-local _fh) and lets rank 0 write the VTK header.
 *
 * Must be called by all ranks of MPI_COMM_WORLD.
 *
 * Returns 0 on success, -1 if the file could not be opened (in which case
 * _fh is left as MPI_FILE_NULL).
 *
 * NOTE(review): the file is opened with MPI_MODE_WRONLY | MPI_MODE_CREATE and
 * all later writes are placed at the current file size, so an already
 * existing file of the same name appears to be appended to rather than
 * replaced - confirm whether truncation is wanted.
 */
int vtkOpen(const char* filename, Comm* comm, Atom* atom ,int timestep)
{
    char msg[256];
    char timestep_filename[128];
    snprintf(timestep_filename, sizeof timestep_filename, "%s_%d.vtk", filename, timestep);

    const int rc = MPI_File_open(MPI_COMM_WORLD, timestep_filename,
                                 MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &_fh);
    if(rc != MPI_SUCCESS || _fh == MPI_FILE_NULL) {
        _fh = MPI_FILE_NULL;    // make the failed state explicit for vtkVector()/vtkClose()
        if(comm->myproc == 0) fprintf(stderr, "Could not open VTK file for writing!\n");
        return -1;
    }

    if (comm->myproc==0){
        // Build the header in one bounded call; appending with
        // sprintf(msg, "%s...", msg) copies between overlapping objects,
        // which is undefined behaviour.
        snprintf(msg, sizeof msg,
                 "# vtk DataFile Version 2.0\n"
                 "Particle data\n"
                 "ASCII\n"
                 "DATASET UNSTRUCTURED_GRID\n"
                 "POINTS %d double\n",
                 atom->Natoms);
        flushBuffer(msg);
    }

    return 0;
}
|
||||
|
||||
int vtkVector(Comm* comm, Atom* atom, Parameter* param)
|
||||
{
|
||||
if (_fh == MPI_FILE_NULL) {
|
||||
if(comm->myproc==0) printf("vtk not initialize! Call vtkOpen first!\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
int sizeline= 25; //#initial guess of characters in "%.4f %.4f %.4f\n"
|
||||
int extrabuff = 100;
|
||||
int sizebuff = sizeline*atom->Nlocal+extrabuff;
|
||||
int mysize = 0;
|
||||
char* msg = (char*) malloc(sizebuff);
|
||||
sprintf(msg, "");
|
||||
for(int i = 0; i < atom->Nlocal; i++){
|
||||
if(mysize+extrabuff >= sizebuff){
|
||||
sizebuff*= 1.5;
|
||||
msg = (char*) realloc(msg, sizebuff);
|
||||
}
|
||||
//TODO: do not forget to add param->xlo, param->ylo, param->zlo
|
||||
sprintf(msg, "%s%.4f %.4f %.4f\n",msg, atom_x(i), atom_y(i), atom_z(i));
|
||||
mysize = strlen(msg);
|
||||
}
|
||||
int gatherSize[comm->numproc];
|
||||
|
||||
MPI_Allgather(&mysize, 1, MPI_INT, gatherSize, 1, MPI_INT, MPI_COMM_WORLD);
|
||||
int offset=0;
|
||||
int globalSize = 0;
|
||||
|
||||
for(int i = 0; i < comm->myproc; i++)
|
||||
offset+= gatherSize[i];
|
||||
|
||||
for(int i = 0; i < comm->numproc; i++)
|
||||
globalSize+= gatherSize[i];
|
||||
|
||||
MPI_Offset displ;
|
||||
MPI_Datatype FileType;
|
||||
int GlobalSize[] = {globalSize};
|
||||
int LocalSize[] = {mysize};
|
||||
int Start[] = {offset};
|
||||
|
||||
if(LocalSize[0]>0){
|
||||
MPI_Type_create_subarray(1, GlobalSize, LocalSize, Start, MPI_ORDER_C, MPI_CHAR, &FileType);
|
||||
} else {
|
||||
MPI_Type_vector(0,0,0,MPI_CHAR,&FileType);
|
||||
}
|
||||
MPI_Type_commit(&FileType);
|
||||
MPI_File_get_size(_fh, &displ);
|
||||
MPI_File_set_view(_fh, displ, MPI_CHAR, FileType, "native", MPI_INFO_NULL);
|
||||
MPI_File_write_all (_fh, msg, mysize , MPI_CHAR ,MPI_STATUS_IGNORE);
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
MPI_File_set_view(_fh,0,MPI_CHAR, MPI_CHAR, "native", MPI_INFO_NULL);
|
||||
|
||||
if (comm->myproc==0){
|
||||
|
||||
sprintf(msg, "\n\n");
|
||||
sprintf(msg, "%sCELLS %d %d\n", msg, atom->Natoms, atom->Natoms * 2);
|
||||
|
||||
for(int i = 0; i < atom->Natoms; i++)
|
||||
sprintf(msg, "%s1 %d\n", msg, i);
|
||||
flushBuffer(msg);
|
||||
|
||||
sprintf(msg, "\n\n");
|
||||
sprintf(msg, "%sCELL_TYPES %d\n",msg, atom->Natoms);
|
||||
for(int i = 0; i < atom->Natoms; i++)
|
||||
sprintf(msg, "%s1\n",msg);
|
||||
flushBuffer(msg);
|
||||
|
||||
sprintf(msg, "\n\n");
|
||||
sprintf(msg, "%sPOINT_DATA %d\n",msg,atom->Natoms);
|
||||
sprintf(msg, "%sSCALARS mass double\n",msg);
|
||||
sprintf(msg, "%sLOOKUP_TABLE default\n",msg);
|
||||
for(int i = 0; i < atom->Natoms; i++)
|
||||
sprintf(msg, "%s1.0\n",msg);
|
||||
sprintf(msg, "%s\n\n",msg);
|
||||
flushBuffer(msg);
|
||||
}
|
||||
}
|
||||
|
||||
void vtkClose()
|
||||
{
|
||||
MPI_File_close(&_fh);
|
||||
_fh=MPI_FILE_NULL;
|
||||
}
|
||||
|
||||
//TODO: print ghost and cluster using MPI
|
||||
/*
 * Writes the atoms of the given timestep to a VTK file.
 * With a single process the serial writer is used; with several processes
 * the MPI-IO path (vtkOpen / vtkVector / vtkClose) is taken.
 */
void printvtk(const char* filename, Comm* comm, Atom* atom ,Parameter* param, int timestep)
{
    if(comm->numproc != 1) {
        vtkOpen(filename, comm, atom, timestep);
        vtkVector(comm, atom, param);
        vtkClose();
    } else {
        write_data_to_vtk_file(filename, atom, timestep);
    }
}
|
||||
|
||||
static inline void flushBuffer(char* msg){
|
||||
MPI_Offset displ;
|
||||
MPI_File_get_size(_fh, &displ);
|
||||
MPI_File_write_at(_fh, displ, msg, strlen(msg), MPI_CHAR, MPI_STATUS_IGNORE);
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
CC = icc
|
||||
LINKER = $(CC)
|
||||
|
||||
OPENMP = #-qopenmp
|
||||
OPENMP = -qopenmp
|
||||
PROFILE = #-profile-functions -g -pg
|
||||
|
||||
ifeq ($(ISA),AVX512)
|
||||
|
||||
32
include_MPIICC.mk
Normal file
32
include_MPIICC.mk
Normal file
@@ -0,0 +1,32 @@
|
||||
CC = mpiicc
|
||||
LINKER = $(CC)
|
||||
|
||||
OPENMP = #-qopenmp
|
||||
PROFILE = #-profile-functions -g -pg
|
||||
|
||||
ifeq ($(ISA),AVX512)
|
||||
OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE) #-g -debug
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),AVX2)
|
||||
OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
|
||||
#OPTS = -Ofast -xAVX2 $(PROFILE)
|
||||
#OPTS = -Ofast -march=core-avx2 $(PROFILE)
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),AVX)
|
||||
OPTS = -Ofast -xAVX $(PROFILE)
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),SSE)
|
||||
OPTS = -Ofast -xSSE4.2 $(PROFILE)
|
||||
endif
|
||||
|
||||
#OPTS = -Ofast -no-vec $(PROFILE)
|
||||
#OPTS = -Ofast -xHost $(PROFILE)
|
||||
CFLAGS = $(PROFILE) -restrict $(OPENMP) $(OPTS)
|
||||
ASFLAGS = #-masm=intel
|
||||
LFLAGS = $(PROFILE) $(OPTS) $(OPENMP)
|
||||
DEFINES = -std=c11 -pedantic-errors -D_GNU_SOURCE -DNO_ZMM_INTRIN
|
||||
INCLUDES =
|
||||
LIBS = -lm
|
||||
312
lammps/atom.c
312
lammps/atom.c
@@ -9,10 +9,12 @@
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
|
||||
#include <parameter.h>
|
||||
#include <atom.h>
|
||||
#include <allocate.h>
|
||||
#include <device.h>
|
||||
#include <util.h>
|
||||
#include <mpi.h>
|
||||
|
||||
#define DELTA 20000
|
||||
|
||||
@@ -21,10 +23,10 @@
|
||||
#endif
|
||||
|
||||
#ifndef MAX
|
||||
#define MAX(a,b) ((a) > (b) ? (a) : (b))
|
||||
#define MAX(a,b) ((a) > (b) ? (a) : (b))
|
||||
#endif
|
||||
|
||||
void initAtom(Atom *atom) {
|
||||
void initAtom(Atom *atom){
|
||||
atom->x = NULL; atom->y = NULL; atom->z = NULL;
|
||||
atom->vx = NULL; atom->vy = NULL; atom->vz = NULL;
|
||||
atom->fx = NULL; atom->fy = NULL; atom->fz = NULL;
|
||||
@@ -41,6 +43,7 @@ void initAtom(Atom *atom) {
|
||||
atom->radius = NULL;
|
||||
atom->av = NULL;
|
||||
atom->r = NULL;
|
||||
atom->border_map = NULL;
|
||||
|
||||
DeviceAtom *d_atom = &(atom->d_atom);
|
||||
d_atom->x = NULL; d_atom->y = NULL; d_atom->z = NULL;
|
||||
@@ -52,12 +55,19 @@ void initAtom(Atom *atom) {
|
||||
d_atom->sigma6 = NULL;
|
||||
d_atom->cutforcesq = NULL;
|
||||
d_atom->cutneighsq = NULL;
|
||||
//MPI
|
||||
Box *mybox = &(atom->mybox);
|
||||
mybox->xprd = mybox->yprd = mybox->zprd = 0;
|
||||
mybox->lo[_x] = mybox->lo[_y] = mybox->lo[_z] = 0;
|
||||
mybox->hi[_x] = mybox->hi[_y] = mybox->hi[_z] = 0;
|
||||
}
|
||||
|
||||
void createAtom(Atom *atom, Parameter *param) {
|
||||
MD_FLOAT xlo = 0.0; MD_FLOAT xhi = param->xprd;
|
||||
MD_FLOAT ylo = 0.0; MD_FLOAT yhi = param->yprd;
|
||||
MD_FLOAT zlo = 0.0; MD_FLOAT zhi = param->zprd;
|
||||
|
||||
MD_FLOAT xlo = 0; MD_FLOAT xhi = param->xprd;
|
||||
MD_FLOAT ylo = 0; MD_FLOAT yhi = param->yprd;
|
||||
MD_FLOAT zlo = 0; MD_FLOAT zhi = param->zprd;
|
||||
|
||||
atom->Natoms = 4 * param->nx * param->ny * param->nz;
|
||||
atom->Nlocal = 0;
|
||||
atom->ntypes = param->ntypes;
|
||||
@@ -107,15 +117,15 @@ void createAtom(Atom *atom, Parameter *param) {
|
||||
xtmp = 0.5 * alat * i;
|
||||
ytmp = 0.5 * alat * j;
|
||||
ztmp = 0.5 * alat * k;
|
||||
|
||||
|
||||
if( xtmp >= xlo && xtmp < xhi &&
|
||||
ytmp >= ylo && ytmp < yhi &&
|
||||
ztmp >= zlo && ztmp < zhi ) {
|
||||
|
||||
|
||||
n = k * (2 * param->ny) * (2 * param->nx) +
|
||||
j * (2 * param->nx) +
|
||||
i + 1;
|
||||
|
||||
|
||||
for(m = 0; m < 5; m++) {
|
||||
myrandom(&n);
|
||||
}
|
||||
@@ -131,7 +141,7 @@ void createAtom(Atom *atom, Parameter *param) {
|
||||
}
|
||||
vztmp = myrandom(&n);
|
||||
|
||||
if(atom->Nlocal == atom->Nmax) {
|
||||
while(atom->Nlocal >= atom->Nmax) {
|
||||
growAtom(atom);
|
||||
}
|
||||
|
||||
@@ -163,38 +173,42 @@ int type_str2int(const char *type) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
int readAtom(Atom* atom, Parameter* param) {
|
||||
int readAtom(Atom *atom, Parameter *param) {
|
||||
int me = 0;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
||||
int len = strlen(param->input_file);
|
||||
if(strncmp(&param->input_file[len - 4], ".pdb", 4) == 0) { return readAtom_pdb(atom, param); }
|
||||
if(strncmp(&param->input_file[len - 4], ".gro", 4) == 0) { return readAtom_gro(atom, param); }
|
||||
if(strncmp(&param->input_file[len - 4], ".dmp", 4) == 0) { return readAtom_dmp(atom, param); }
|
||||
if(strncmp(&param->input_file[len - 3], ".in", 3) == 0) { return readAtom_in(atom, param); }
|
||||
fprintf(stderr, "Invalid input file extension: %s\nValid choices are: pdb, gro, dmp, in\n", param->input_file);
|
||||
if(me==0) fprintf(stderr, "Invalid input file extension: %s\nValid choices are: pdb, gro, dmp, in\n", param->input_file);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
int readAtom_pdb(Atom* atom, Parameter* param) {
|
||||
int me = 0;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
||||
FILE *fp = fopen(param->input_file, "r");
|
||||
char line[MAXLINE];
|
||||
int read_atoms = 0;
|
||||
|
||||
if(!fp) {
|
||||
fprintf(stderr, "Could not open input file: %s\n", param->input_file);
|
||||
if(me==0)fprintf(stderr, "Could not open input file: %s\n", param->input_file);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
while(!feof(fp)) {
|
||||
readline(line, fp);
|
||||
char *item = strtok(line, " ");
|
||||
char *item = strtok(line, "\t ");
|
||||
if(strncmp(item, "CRYST1", 6) == 0) {
|
||||
param->xlo = 0.0;
|
||||
param->xhi = atof(strtok(NULL, " "));
|
||||
param->xhi = atof(strtok(NULL, "\t "));
|
||||
param->ylo = 0.0;
|
||||
param->yhi = atof(strtok(NULL, " "));
|
||||
param->yhi = atof(strtok(NULL, "\t "));
|
||||
param->zlo = 0.0;
|
||||
param->zhi = atof(strtok(NULL, " "));
|
||||
param->zhi = atof(strtok(NULL, "\t "));
|
||||
param->xprd = param->xhi - param->xlo;
|
||||
param->yprd = param->yhi - param->ylo;
|
||||
param->zprd = param->zhi - param->zlo;
|
||||
@@ -203,23 +217,23 @@ int readAtom_pdb(Atom* atom, Parameter* param) {
|
||||
char *label;
|
||||
int atom_id, comp_id;
|
||||
MD_FLOAT occupancy, charge;
|
||||
atom_id = atoi(strtok(NULL, " ")) - 1;
|
||||
atom_id = atoi(strtok(NULL, "\t ")) - 1;
|
||||
|
||||
while(atom_id + 1 >= atom->Nmax) {
|
||||
growAtom(atom);
|
||||
}
|
||||
|
||||
atom->type[atom_id] = type_str2int(strtok(NULL, " "));
|
||||
label = strtok(NULL, " ");
|
||||
comp_id = atoi(strtok(NULL, " "));
|
||||
atom_x(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_y(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_z(atom_id) = atof(strtok(NULL, " "));
|
||||
atom->type[atom_id] = type_str2int(strtok(NULL, "\t "));
|
||||
label = strtok(NULL, "\t ");
|
||||
comp_id = atoi(strtok(NULL, "\t "));
|
||||
atom_x(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_y(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_z(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_vx(atom_id) = 0.0;
|
||||
atom_vy(atom_id) = 0.0;
|
||||
atom_vz(atom_id) = 0.0;
|
||||
occupancy = atof(strtok(NULL, " "));
|
||||
charge = atof(strtok(NULL, " "));
|
||||
occupancy = atof(strtok(NULL, "\t "));
|
||||
charge = atof(strtok(NULL, "\t "));
|
||||
atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
|
||||
atom->Natoms++;
|
||||
atom->Nlocal++;
|
||||
@@ -231,14 +245,14 @@ int readAtom_pdb(Atom* atom, Parameter* param) {
|
||||
strncmp(item, "ENDMDL", 6) == 0) {
|
||||
// Do nothing
|
||||
} else {
|
||||
fprintf(stderr, "Invalid item: %s\n", item);
|
||||
if(me==0)fprintf(stderr, "Invalid item: %s\n", item);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
if(!read_atoms) {
|
||||
fprintf(stderr, "Input error: No atoms read!\n");
|
||||
if(me==0)fprintf(stderr, "Input error: No atoms read!\n");
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
@@ -254,12 +268,15 @@ int readAtom_pdb(Atom* atom, Parameter* param) {
|
||||
atom->cutforcesq[i] = param->cutforce * param->cutforce;
|
||||
}
|
||||
|
||||
fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
|
||||
if(me==0)fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
|
||||
fclose(fp);
|
||||
return read_atoms;
|
||||
}
|
||||
|
||||
int readAtom_gro(Atom* atom, Parameter* param) {
|
||||
int me = 0;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
||||
|
||||
FILE *fp = fopen(param->input_file, "r");
|
||||
char line[MAXLINE];
|
||||
char desc[MAXLINE];
|
||||
@@ -268,7 +285,7 @@ int readAtom_gro(Atom* atom, Parameter* param) {
|
||||
int i = 0;
|
||||
|
||||
if(!fp) {
|
||||
fprintf(stderr, "Could not open input file: %s\n", param->input_file);
|
||||
if(me==0)fprintf(stderr, "Could not open input file: %s\n", param->input_file);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
@@ -277,26 +294,26 @@ int readAtom_gro(Atom* atom, Parameter* param) {
|
||||
for(i = 0; desc[i] != '\n'; i++);
|
||||
desc[i] = '\0';
|
||||
readline(line, fp);
|
||||
atoms_to_read = atoi(strtok(line, " "));
|
||||
fprintf(stdout, "System: %s with %d atoms\n", desc, atoms_to_read);
|
||||
atoms_to_read = atoi(strtok(line, "\t "));
|
||||
if(me==0)fprintf(stdout, "System: %s with %d atoms\n", desc, atoms_to_read);
|
||||
|
||||
while(!feof(fp) && read_atoms < atoms_to_read) {
|
||||
readline(line, fp);
|
||||
char *label = strtok(line, " ");
|
||||
int type = type_str2int(strtok(NULL, " "));
|
||||
int atom_id = atoi(strtok(NULL, " ")) - 1;
|
||||
char *label = strtok(line, "\t ");
|
||||
int type = type_str2int(strtok(NULL, "\t "));
|
||||
int atom_id = atoi(strtok(NULL, "\t ")) - 1;
|
||||
atom_id = read_atoms;
|
||||
while(atom_id + 1 >= atom->Nmax) {
|
||||
growAtom(atom);
|
||||
}
|
||||
|
||||
atom->type[atom_id] = type;
|
||||
atom_x(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_y(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_z(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_vx(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_vy(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_vz(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_x(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_y(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_z(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_vx(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_vy(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_vz(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
|
||||
atom->Natoms++;
|
||||
atom->Nlocal++;
|
||||
@@ -306,18 +323,18 @@ int readAtom_gro(Atom* atom, Parameter* param) {
|
||||
if(!feof(fp)) {
|
||||
readline(line, fp);
|
||||
param->xlo = 0.0;
|
||||
param->xhi = atof(strtok(line, " "));
|
||||
param->xhi = atof(strtok(line, "\t "));
|
||||
param->ylo = 0.0;
|
||||
param->yhi = atof(strtok(NULL, " "));
|
||||
param->yhi = atof(strtok(NULL, "\t "));
|
||||
param->zlo = 0.0;
|
||||
param->zhi = atof(strtok(NULL, " "));
|
||||
param->zhi = atof(strtok(NULL, "\t "));
|
||||
param->xprd = param->xhi - param->xlo;
|
||||
param->yprd = param->yhi - param->ylo;
|
||||
param->zprd = param->zhi - param->zlo;
|
||||
}
|
||||
|
||||
if(read_atoms != atoms_to_read) {
|
||||
fprintf(stderr, "Input error: Number of atoms read do not match (%d/%d).\n", read_atoms, atoms_to_read);
|
||||
if(me==0)fprintf(stderr, "Input error: Number of atoms read do not match (%d/%d).\n", read_atoms, atoms_to_read);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
@@ -333,12 +350,14 @@ int readAtom_gro(Atom* atom, Parameter* param) {
|
||||
atom->cutforcesq[i] = param->cutforce * param->cutforce;
|
||||
}
|
||||
|
||||
fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
|
||||
if(me==0)fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
|
||||
fclose(fp);
|
||||
return read_atoms;
|
||||
}
|
||||
|
||||
int readAtom_dmp(Atom* atom, Parameter* param) {
|
||||
int me = 0;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
||||
FILE *fp = fopen(param->input_file, "r");
|
||||
char line[MAXLINE];
|
||||
int natoms = 0;
|
||||
@@ -347,7 +366,7 @@ int readAtom_dmp(Atom* atom, Parameter* param) {
|
||||
int ts = -1;
|
||||
|
||||
if(!fp) {
|
||||
fprintf(stderr, "Could not open input file: %s\n", param->input_file);
|
||||
if(me==0)fprintf(stderr, "Could not open input file: %s\n", param->input_file);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
@@ -370,47 +389,47 @@ int readAtom_dmp(Atom* atom, Parameter* param) {
|
||||
}
|
||||
} else if(strncmp(item, "BOX BOUNDS pp pp pp", 19) == 0) {
|
||||
readline(line, fp);
|
||||
param->xlo = atof(strtok(line, " "));
|
||||
param->xhi = atof(strtok(NULL, " "));
|
||||
param->xlo = atof(strtok(line, "\t "));
|
||||
param->xhi = atof(strtok(NULL, "\t "));
|
||||
param->xprd = param->xhi - param->xlo;
|
||||
|
||||
readline(line, fp);
|
||||
param->ylo = atof(strtok(line, " "));
|
||||
param->yhi = atof(strtok(NULL, " "));
|
||||
param->ylo = atof(strtok(line, "\t "));
|
||||
param->yhi = atof(strtok(NULL, "\t "));
|
||||
param->yprd = param->yhi - param->ylo;
|
||||
|
||||
readline(line, fp);
|
||||
param->zlo = atof(strtok(line, " "));
|
||||
param->zhi = atof(strtok(NULL, " "));
|
||||
param->zlo = atof(strtok(line, "\t "));
|
||||
param->zhi = atof(strtok(NULL, "\t "));
|
||||
param->zprd = param->zhi - param->zlo;
|
||||
} else if(strncmp(item, "ATOMS id type x y z vx vy vz", 28) == 0) {
|
||||
for(int i = 0; i < natoms; i++) {
|
||||
readline(line, fp);
|
||||
atom_id = atoi(strtok(line, " ")) - 1;
|
||||
atom->type[atom_id] = atoi(strtok(NULL, " "));
|
||||
atom_x(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_y(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_z(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_vx(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_vy(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_vz(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_id = atoi(strtok(line, "\t ")) - 1;
|
||||
atom->type[atom_id] = atoi(strtok(NULL, "\t "));
|
||||
atom_x(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_y(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_z(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_vx(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_vy(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_vz(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom->ntypes = MAX(atom->type[atom_id], atom->ntypes);
|
||||
read_atoms++;
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "Invalid item: %s\n", item);
|
||||
if(me==0)fprintf(stderr, "Invalid item: %s\n", item);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "Invalid input from file, expected item reference but got:\n%s\n", line);
|
||||
if(me==0)fprintf(stderr, "Invalid input from file, expected item reference but got:\n%s\n", line);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
if(ts < 0 || !natoms || !read_atoms) {
|
||||
fprintf(stderr, "Input error: atom data was not read!\n");
|
||||
if(me==0)fprintf(stderr, "Input error: atom data was not read!\n");
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
@@ -426,30 +445,34 @@ int readAtom_dmp(Atom* atom, Parameter* param) {
|
||||
atom->cutforcesq[i] = param->cutforce * param->cutforce;
|
||||
}
|
||||
|
||||
fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
|
||||
if(me==0)fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
|
||||
return natoms;
|
||||
}
|
||||
|
||||
int readAtom_in(Atom* atom, Parameter* param) {
|
||||
int me = 0;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &me);
|
||||
FILE *fp = fopen(param->input_file, "r");
|
||||
char line[MAXLINE];
|
||||
int natoms = 0;
|
||||
int atom_id = 0;
|
||||
|
||||
if(!fp) {
|
||||
fprintf(stderr, "Could not open input file: %s\n", param->input_file);
|
||||
if(me==0) fprintf(stderr, "Could not open input file: %s\n", param->input_file);
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
readline(line, fp);
|
||||
natoms = atoi(strtok(line, " "));
|
||||
param->xlo = atof(strtok(NULL, " "));
|
||||
param->xhi = atof(strtok(NULL, " "));
|
||||
param->ylo = atof(strtok(NULL, " "));
|
||||
param->yhi = atof(strtok(NULL, " "));
|
||||
param->zlo = atof(strtok(NULL, " "));
|
||||
param->zhi = atof(strtok(NULL, " "));
|
||||
natoms = atoi(strtok(line, "\t "));
|
||||
param->xlo = atof(strtok(NULL, "\t "));
|
||||
param->xhi = atof(strtok(NULL, "\t "));
|
||||
param->ylo = atof(strtok(NULL, "\t "));
|
||||
param->yhi = atof(strtok(NULL, "\t "));
|
||||
param->zlo = atof(strtok(NULL, "\t "));
|
||||
param->zhi = atof(strtok(NULL, "\t "));
|
||||
param->xprd = param->xhi - param->xlo;
|
||||
param->yprd = param->yhi - param->ylo;
|
||||
param->zprd = param->zhi - param->zlo;
|
||||
atom->Natoms = natoms;
|
||||
atom->Nlocal = natoms;
|
||||
atom->ntypes = 1;
|
||||
@@ -462,27 +485,26 @@ int readAtom_in(Atom* atom, Parameter* param) {
|
||||
readline(line, fp);
|
||||
|
||||
// TODO: store mass per atom
|
||||
char *s_mass = strtok(line, " ");
|
||||
char *s_mass = strtok(line, "\t ");
|
||||
if(strncmp(s_mass, "inf", 3) == 0) {
|
||||
// Set atom's mass to INFINITY
|
||||
} else {
|
||||
param->mass = atof(s_mass);
|
||||
}
|
||||
|
||||
atom->radius[atom_id] = atof(strtok(NULL, " "));
|
||||
atom_x(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_y(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_z(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_vx(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_vy(atom_id) = atof(strtok(NULL, " "));
|
||||
atom_vz(atom_id) = atof(strtok(NULL, " "));
|
||||
|
||||
atom->radius[atom_id] = atof(strtok(NULL, "\t "));
|
||||
atom_x(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_y(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_z(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_vx(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_vy(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom_vz(atom_id) = atof(strtok(NULL, "\t "));
|
||||
atom->type[atom_id] = 0;
|
||||
atom->ntypes = MAX(atom->type[atom_id], atom->ntypes);
|
||||
atom_id++;
|
||||
}
|
||||
|
||||
if(!natoms) {
|
||||
fprintf(stderr, "Input error: atom data was not read!\n");
|
||||
if(me==0)fprintf(stderr, "Input error: atom data was not read!\n");
|
||||
exit(-1);
|
||||
return -1;
|
||||
}
|
||||
@@ -498,7 +520,7 @@ int readAtom_in(Atom* atom, Parameter* param) {
|
||||
atom->cutforcesq[i] = param->cutforce * param->cutforce;
|
||||
}
|
||||
|
||||
fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
|
||||
if(me==0)fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
|
||||
return natoms;
|
||||
}
|
||||
|
||||
@@ -530,7 +552,125 @@ void growAtom(Atom *atom) {
|
||||
REALLOC(type, int, atom->Nmax * sizeof(int), nold * sizeof(int));
|
||||
|
||||
// DEM
|
||||
atom->radius = (MD_FLOAT *) reallocate(atom->radius, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
atom->radius = (MD_FLOAT*) reallocate(atom->radius, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
atom->av = (MD_FLOAT*) reallocate(atom->av, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
|
||||
atom->r = (MD_FLOAT*) reallocate(atom->r, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 4, nold * sizeof(MD_FLOAT) * 4);
|
||||
}
|
||||
|
||||
/* MPI added*/
|
||||
/*
 * Pack the positions of the n atoms named in list into buf, three values per
 * atom (buf_x/buf_y/buf_z). Each coordinate is shifted by pbc[d] times the
 * matching extent of atom->mybox.
 */
void packForward(Atom* atom, int n, int* list, MD_FLOAT* buf, int* pbc)
{
    for(int slot = 0; slot < n; slot++) {
        const int src = list[slot];

        buf_x(slot) = atom_x(src) + pbc[0] * atom->mybox.xprd;
        buf_y(slot) = atom_y(src) + pbc[1] * atom->mybox.yprd;
        buf_z(slot) = atom_z(src) + pbc[2] * atom->mybox.zprd;
    }
}
|
||||
|
||||
/*
 * Copy n packed positions from buf into the consecutive atom slots
 * first, first + 1, ..., first + n - 1.
 */
void unpackForward(Atom* atom, int n, int first, MD_FLOAT* buf)
{
    for(int slot = 0; slot < n; slot++) {
        const int dst = first + slot;

        atom_x(dst) = buf_x(slot);
        atom_y(dst) = buf_y(slot);
        atom_z(dst) = buf_z(slot);
    }
}
|
||||
|
||||
/*
 * Write atom i into buf as four consecutive values: x, y, z (each shifted by
 * pbc[d] times the matching extent of atom->mybox) followed by the type.
 * Returns the number of buffer entries written (4).
 */
int packGhost(Atom* atom, int i, MD_FLOAT* buf, int* pbc)
{
    buf[0] = atom_x(i) + pbc[_x] * atom->mybox.xprd;
    buf[1] = atom_y(i) + pbc[_y] * atom->mybox.yprd;
    buf[2] = atom_z(i) + pbc[_z] * atom->mybox.zprd;
    buf[3] = atom->type[i];    // integer type travels as an MD_FLOAT entry
    return 4;
}
|
||||
|
||||
/*
 * Read one ghost atom (x, y, z, type) from buf into slot i and increment
 * atom->Nghost. growAtom is called until i is below atom->Nmax.
 * Returns the number of buffer entries read (4).
 */
int unpackGhost(Atom* atom, int i, MD_FLOAT* buf)
{
    // Make sure slot i exists before writing to it.
    while(atom->Nmax <= i) {
        growAtom(atom);
    }

    atom_x(i) = buf[0];
    atom_y(i) = buf[1];
    atom_z(i) = buf[2];
    atom->type[i] = buf[3];    // MD_FLOAT buffer entry converted on assignment
    atom->Nghost += 1;
    return 4;
}
|
||||
|
||||
/*
 * Pack the force components of the n consecutive atoms starting at index
 * first into buf, three values per atom.
 */
void packReverse(Atom* atom, int n, int first, MD_FLOAT* buf)
{
    for(int slot = 0; slot < n; slot++) {
        const int src = first + slot;

        buf_x(slot) = atom_fx(src);
        buf_y(slot) = atom_fy(src);
        buf_z(slot) = atom_fz(src);
    }
}
|
||||
|
||||
/*
 * Add the n packed force triples in buf onto the atoms named in list
 * (entry k of buf is accumulated into atom list[k]).
 */
void unpackReverse(Atom* atom, int n, int* list, MD_FLOAT* buf)
{
    for(int slot = 0; slot < n; slot++) {
        const int dst = list[slot];

        atom_fx(dst) += buf_x(slot);
        atom_fy(dst) += buf_y(slot);
        atom_fz(dst) += buf_z(slot);
    }
}
|
||||
|
||||
/*
 * Write atom i into buf as seven consecutive values:
 * x, y, z, vx, vy, vz, type.
 * Returns the number of buffer entries written (7).
 */
int packExchange(Atom* atom, int i, MD_FLOAT* buf)
{
    // position
    buf[0] = atom_x(i);
    buf[1] = atom_y(i);
    buf[2] = atom_z(i);
    // velocity
    buf[3] = atom_vx(i);
    buf[4] = atom_vy(i);
    buf[5] = atom_vz(i);
    // integer type travels as an MD_FLOAT entry
    buf[6] = atom->type[i];
    return 7;
}
|
||||
|
||||
/*
 * Read one atom (x, y, z, vx, vy, vz, type) from buf into slot i.
 * growAtom is called until i is below atom->Nmax.
 * Returns the number of buffer entries read (7).
 */
int unpackExchange(Atom* atom, int i, MD_FLOAT* buf)
{
    // Make sure slot i exists before writing to it.
    while(atom->Nmax <= i) {
        growAtom(atom);
    }

    // position
    atom_x(i) = buf[0];
    atom_y(i) = buf[1];
    atom_z(i) = buf[2];
    // velocity
    atom_vx(i) = buf[3];
    atom_vy(i) = buf[4];
    atom_vz(i) = buf[5];
    // MD_FLOAT buffer entry converted on assignment
    atom->type[i] = buf[6];
    return 7;
}
|
||||
|
||||
/*
 * Wrap the first atom->Nlocal atoms back into [0, prd) along each axis,
 * where prd is the matching extent of atom->mybox. A coordinate is shifted
 * by at most one box length in each direction check.
 */
void pbc(Atom* atom)
{
    // The box extents do not change while wrapping, so read them once.
    const MD_FLOAT xprd = atom->mybox.xprd;
    const MD_FLOAT yprd = atom->mybox.yprd;
    const MD_FLOAT zprd = atom->mybox.zprd;

    for(int i = 0; i < atom->Nlocal; i++) {
        // Lower bound first, then upper bound, independently per axis.
        if(atom_x(i) < 0.0)   { atom_x(i) += xprd; }
        if(atom_x(i) >= xprd) { atom_x(i) -= xprd; }

        if(atom_y(i) < 0.0)   { atom_y(i) += yprd; }
        if(atom_y(i) >= yprd) { atom_y(i) -= yprd; }

        if(atom_z(i) < 0.0)   { atom_z(i) += zprd; }
        if(atom_z(i) >= zprd) { atom_z(i) -= zprd; }
    }
}
|
||||
|
||||
/*
 * Overwrite atom slot i with the type, velocity and position stored in
 * slot j. Forces are not copied.
 */
void copy(Atom* atom, int i, int j)
{
    atom->type[i] = atom->type[j];

    // velocity
    atom_vx(i) = atom_vx(j);
    atom_vy(i) = atom_vy(j);
    atom_vz(i) = atom_vz(j);

    // position
    atom_x(i) = atom_x(j);
    atom_y(i) = atom_y(j);
    atom_z(i) = atom_z(j);
}
|
||||
|
||||
@@ -29,7 +29,7 @@ extern "C" {
|
||||
}
|
||||
|
||||
// cuda kernel
|
||||
__global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh) {
|
||||
__global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh, int ntypes) {
|
||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if(i >= Nlocal) {
|
||||
return;
|
||||
@@ -46,6 +46,10 @@ __global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, M
|
||||
MD_FLOAT fiy = 0;
|
||||
MD_FLOAT fiz = 0;
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_i = atom->type[i];
|
||||
#endif
|
||||
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int j = neigh_neighbors[Nlocal * k + i];
|
||||
MD_FLOAT delx = xtmp - atom_x(j);
|
||||
@@ -55,7 +59,7 @@ __global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, M
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_j = atom->type[j];
|
||||
const int type_ij = type_i * atom->ntypes + type_j;
|
||||
const int type_ij = type_i * ntypes + type_j;
|
||||
const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
|
||||
const MD_FLOAT sigma6 = atom->sigma6[type_ij];
|
||||
const MD_FLOAT epsilon = atom->epsilon[type_ij];
|
||||
@@ -109,7 +113,7 @@ extern "C" {
|
||||
|
||||
void finalIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
|
||||
const int Nlocal = atom->Nlocal;
|
||||
const int num_threads_per_block = get_num_threads();
|
||||
const int num_threads_per_block = get_cuda_num_threads();
|
||||
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
|
||||
|
||||
kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, atom->d_atom);
|
||||
@@ -123,7 +127,7 @@ void finalIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
|
||||
|
||||
void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
|
||||
const int Nlocal = atom->Nlocal;
|
||||
const int num_threads_per_block = get_num_threads();
|
||||
const int num_threads_per_block = get_cuda_num_threads();
|
||||
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
|
||||
|
||||
kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, atom->d_atom);
|
||||
@@ -136,13 +140,11 @@ void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
|
||||
}
|
||||
|
||||
double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
||||
const int num_threads_per_block = get_num_threads();
|
||||
const int num_threads_per_block = get_cuda_num_threads();
|
||||
int Nlocal = atom->Nlocal;
|
||||
#ifndef EXPLICIT_TYPES
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
#endif
|
||||
|
||||
/*
|
||||
int nDevices;
|
||||
@@ -165,7 +167,7 @@ double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neig
|
||||
double S = getTimeStamp();
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
calc_force <<< num_blocks, num_threads_per_block >>> (atom->d_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, neighbor->d_neighbor.neighbors, neighbor->d_neighbor.numneigh);
|
||||
calc_force <<< num_blocks, num_threads_per_block >>> (atom->d_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, neighbor->d_neighbor.neighbors, neighbor->d_neighbor.numneigh, atom->ntypes);
|
||||
cuda_assert("calc_force", cudaPeekAtLastError());
|
||||
cuda_assert("calc_force", cudaDeviceSynchronize());
|
||||
cudaProfilerStop();
|
||||
|
||||
@@ -120,7 +120,7 @@ __global__ void binatoms_kernel(DeviceAtom a, int nall, int* bincount, int* bins
|
||||
|
||||
__global__ void compute_neighborhood(
|
||||
DeviceAtom a, DeviceNeighbor neigh, Neighbor_params np, int nlocal, int maxneighs, int nstencil, int* stencil,
|
||||
int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq) {
|
||||
int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq, int ntypes) {
|
||||
|
||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if(i >= nlocal) {
|
||||
@@ -157,7 +157,7 @@ __global__ void compute_neighborhood(
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
int type_j = atom->type[j];
|
||||
const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j];
|
||||
const MD_FLOAT cutoff = atom->cutneighsq[type_i * ntypes + type_j];
|
||||
#else
|
||||
const MD_FLOAT cutoff = cutneighsq;
|
||||
#endif
|
||||
@@ -206,7 +206,7 @@ void binatoms_cuda(Atom *atom, Binning *c_binning, int *c_resize_needed, Neighbo
|
||||
|
||||
void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor) {
|
||||
DeviceNeighbor *d_neighbor = &(neighbor->d_neighbor);
|
||||
const int num_threads_per_block = get_num_threads();
|
||||
const int num_threads_per_block = get_cuda_num_threads();
|
||||
int nall = atom->Nlocal + atom->Nghost;
|
||||
|
||||
cudaProfilerStart();
|
||||
@@ -269,7 +269,7 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor) {
|
||||
np, atom->Nlocal, neighbor->maxneighs, nstencil, c_stencil,
|
||||
c_binning.bins, c_binning.atoms_per_bin, c_binning.bincount,
|
||||
c_new_maxneighs,
|
||||
cutneighsq);
|
||||
cutneighsq, atom->ntypes);
|
||||
|
||||
cuda_assert("compute_neighborhood", cudaPeekAtLastError());
|
||||
cuda_assert("compute_neighborhood", cudaDeviceSynchronize());
|
||||
|
||||
@@ -65,7 +65,7 @@ __global__ void computePbcUpdate(DeviceAtom a, int nlocal, int nghost, int* PBCx
|
||||
/* update coordinates of ghost atoms */
|
||||
/* uses mapping created in setupPbc */
|
||||
void updatePbc_cuda(Atom *atom, Parameter *param, bool reneigh) {
|
||||
const int num_threads_per_block = get_num_threads();
|
||||
const int num_threads_per_block = get_cuda_num_threads();
|
||||
|
||||
if(reneigh) {
|
||||
memcpyToGPU(atom->d_atom.x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
|
||||
@@ -98,7 +98,7 @@ void updatePbc_cuda(Atom *atom, Parameter *param, bool reneigh) {
|
||||
}
|
||||
|
||||
void updateAtomsPbc_cuda(Atom* atom, Parameter *param) {
|
||||
const int num_threads_per_block = get_num_threads();
|
||||
const int num_threads_per_block = get_cuda_num_threads();
|
||||
MD_FLOAT xprd = param->xprd;
|
||||
MD_FLOAT yprd = param->yprd;
|
||||
MD_FLOAT zprd = param->zprd;
|
||||
|
||||
@@ -14,6 +14,7 @@ void initDevice(Atom *atom, Neighbor *neighbor) {
|
||||
|
||||
d_atom->epsilon = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
d_atom->sigma6 = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
d_atom->cutneighsq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
d_atom->cutforcesq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
d_neighbor->neighbors = (int *) allocateGPU(sizeof(int) * atom->Nmax * neighbor->maxneighs);
|
||||
d_neighbor->numneigh = (int *) allocateGPU(sizeof(int) * atom->Nmax);
|
||||
@@ -22,6 +23,7 @@ void initDevice(Atom *atom, Neighbor *neighbor) {
|
||||
memcpyToGPU(d_atom->vx, atom->vx, sizeof(MD_FLOAT) * atom->Nmax * 3);
|
||||
memcpyToGPU(d_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
memcpyToGPU(d_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
memcpyToGPU(d_atom->cutneighsq, atom->cutneighsq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
memcpyToGPU(d_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
memcpyToGPU(d_atom->type, atom->type, sizeof(int) * atom->Nmax);
|
||||
}
|
||||
|
||||
@@ -13,13 +13,17 @@
|
||||
#include <parameter.h>
|
||||
#include <stats.h>
|
||||
#include <timing.h>
|
||||
|
||||
#include <mpi.h>
|
||||
#include <util.h>
|
||||
#ifdef __SIMD_KERNEL__
|
||||
#include <simd.h>
|
||||
#endif
|
||||
|
||||
void computeForceGhostShell(Parameter*, Atom*, Neighbor*);
|
||||
|
||||
double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
int Nlocal = atom->Nlocal;
|
||||
int Nghost = atom->Nghost;
|
||||
int* neighs;
|
||||
#ifndef EXPLICIT_TYPES
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
@@ -48,14 +52,14 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
|
||||
MD_FLOAT xtmp = atom_x(i);
|
||||
MD_FLOAT ytmp = atom_y(i);
|
||||
MD_FLOAT ztmp = atom_z(i);
|
||||
MD_FLOAT fix = 0;
|
||||
MD_FLOAT fiy = 0;
|
||||
MD_FLOAT fiz = 0;
|
||||
|
||||
MD_FLOAT fix = 0.0;
|
||||
MD_FLOAT fiy = 0.0;
|
||||
MD_FLOAT fiz = 0.0;
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_i = atom->type[i];
|
||||
#endif
|
||||
|
||||
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int j = neighs[k];
|
||||
MD_FLOAT delx = xtmp - atom_x(j);
|
||||
@@ -70,25 +74,30 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
|
||||
const MD_FLOAT sigma6 = atom->sigma6[type_ij];
|
||||
const MD_FLOAT epsilon = atom->epsilon[type_ij];
|
||||
#endif
|
||||
|
||||
if(rsq < cutforcesq) {
|
||||
MD_FLOAT sr2 = num1 / rsq;
|
||||
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
|
||||
MD_FLOAT force = num48 * sr6 * (sr6 - num05) * sr2 * epsilon;
|
||||
MD_FLOAT force = num48 * sr6 * (sr6 - num05) * sr2 * epsilon;
|
||||
fix += delx * force;
|
||||
fiy += dely * force;
|
||||
fiz += delz * force;
|
||||
fiz += delz * force;
|
||||
|
||||
#ifdef USE_REFERENCE_VERSION
|
||||
addStat(stats->atoms_within_cutoff, 1);
|
||||
} else {
|
||||
addStat(stats->atoms_outside_cutoff, 1);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
atom_fx(i) += fix;
|
||||
atom_fy(i) += fiy;
|
||||
atom_fz(i) += fiz;
|
||||
|
||||
#ifdef USE_REFERENCE_VERSION
|
||||
if(numneighs % VECTOR_WIDTH > 0) {
|
||||
addStat(stats->atoms_outside_cutoff, VECTOR_WIDTH - (numneighs % VECTOR_WIDTH));
|
||||
}
|
||||
#endif
|
||||
|
||||
addStat(stats->total_force_neighs, numneighs);
|
||||
addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
|
||||
@@ -96,13 +105,13 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
|
||||
|
||||
LIKWID_MARKER_STOP("force");
|
||||
}
|
||||
|
||||
double E = getTimeStamp();
|
||||
return E-S;
|
||||
}
|
||||
|
||||
double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
int Nlocal = atom->Nlocal;
|
||||
int Nghost = atom->Nghost;
|
||||
int* neighs;
|
||||
#ifndef EXPLICIT_TYPES
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
@@ -113,12 +122,11 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
const MD_FLOAT num48 = 48.0;
|
||||
const MD_FLOAT num05 = 0.5;
|
||||
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
for(int i = 0; i < Nlocal+Nghost; i++) {
|
||||
atom_fx(i) = 0.0;
|
||||
atom_fy(i) = 0.0;
|
||||
atom_fz(i) = 0.0;
|
||||
}
|
||||
|
||||
double S = getTimeStamp();
|
||||
|
||||
#pragma omp parallel
|
||||
@@ -166,16 +174,14 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
fix += delx * force;
|
||||
fiy += dely * force;
|
||||
fiz += delz * force;
|
||||
|
||||
// We do not need to update forces for ghost atoms
|
||||
if(j < Nlocal) {
|
||||
// We need to update forces for ghost atoms if shell_method or half stencil is required
|
||||
if((param->half_neigh && j<Nlocal) || param->method){
|
||||
atom_fx(j) -= delx * force;
|
||||
atom_fy(j) -= dely * force;
|
||||
atom_fz(j) -= delz * force;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
atom_fx(i) += fix;
|
||||
atom_fy(i) += fiy;
|
||||
atom_fz(i) += fiz;
|
||||
@@ -184,6 +190,7 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
|
||||
}
|
||||
|
||||
if(param->method == eightShell) computeForceGhostShell(param, atom, neighbor);
|
||||
LIKWID_MARKER_STOP("forceLJ-halfneigh");
|
||||
}
|
||||
|
||||
@@ -270,3 +277,58 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig
|
||||
double E = getTimeStamp();
|
||||
return E-S;
|
||||
}
|
||||
|
||||
/*
 * Accumulate Lennard-Jones forces for the atoms recorded in the shell lists
 * of neighbor.
 *
 * For every entry i in [0, neighbor->Nshell):
 *   - neighbor->listshell[i] is the atom the force is computed for,
 *   - neighbor->numNeighShell[i] is its neighbor count,
 *   - its neighbors start at neighbor->neighshell[i * neighbor->maxneighs].
 *
 * Each pair within the cutoff adds the force to the list atom and subtracts
 * it from the neighbor. Forces are accumulated, not reset, so the caller is
 * expected to have initialised them.
 *
 * Without EXPLICIT_TYPES the cutoff, sigma6 and epsilon come from param.
 * With EXPLICIT_TYPES they are looked up per pair in the atom->cutforcesq,
 * atom->sigma6 and atom->epsilon tables, indexed by
 * type_i * atom->ntypes + type_j.
 */
void computeForceGhostShell(Parameter *param, Atom *atom, Neighbor *neighbor) {
    const int Nshell = neighbor->Nshell;
#ifndef EXPLICIT_TYPES
    const MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
    const MD_FLOAT sigma6 = param->sigma6;
    const MD_FLOAT epsilon = param->epsilon;
#endif
    const MD_FLOAT num1 = 1.0;
    const MD_FLOAT num48 = 48.0;
    const MD_FLOAT num05 = 0.5;

    for(int i = 0; i < Nshell; i++) {
        const int *neighs = &(neighbor->neighshell[i * neighbor->maxneighs]);
        const int numneigh = neighbor->numNeighShell[i];
        const int iatom = neighbor->listshell[i];
        const MD_FLOAT xtmp = atom_x(iatom);
        const MD_FLOAT ytmp = atom_y(iatom);
        const MD_FLOAT ztmp = atom_z(iatom);
        MD_FLOAT fix = 0;
        MD_FLOAT fiy = 0;
        MD_FLOAT fiz = 0;

#ifdef EXPLICIT_TYPES
        // The type belongs to the atom (iatom), not to the list slot (i).
        const int type_i = atom->type[iatom];
#endif

        for(int k = 0; k < numneigh; k++) {
            const int jatom = neighs[k];
            const MD_FLOAT delx = xtmp - atom_x(jatom);
            const MD_FLOAT dely = ytmp - atom_y(jatom);
            const MD_FLOAT delz = ztmp - atom_z(jatom);
            const MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;

#ifdef EXPLICIT_TYPES
            // Per-pair parameters, same lookup as the other LJ kernels.
            const int type_j = atom->type[jatom];
            const int type_ij = type_i * atom->ntypes + type_j;
            const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
            const MD_FLOAT sigma6 = atom->sigma6[type_ij];
            const MD_FLOAT epsilon = atom->epsilon[type_ij];
#endif

            if(rsq < cutforcesq) {
                const MD_FLOAT sr2 = num1 / rsq;
                const MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
                const MD_FLOAT force = num48 * sr6 * (sr6 - num05) * sr2 * epsilon;

                fix += delx * force;
                fiy += dely * force;
                fiz += delz * force;

                // Equal and opposite contribution on the neighbor.
                atom_fx(jatom) -= delx * force;
                atom_fy(jatom) -= dely * force;
                atom_fz(jatom) -= delz * force;
            }
        }

        atom_fx(iatom) += fix;
        atom_fy(iatom) += fiy;
        atom_fz(iatom) += fiz;
    }
}
|
||||
|
||||
|
||||
@@ -4,8 +4,9 @@
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <parameter.h>
|
||||
|
||||
#include <box.h>
|
||||
#include <parameter.h>
|
||||
#ifndef __ATOM_H_
|
||||
#define __ATOM_H_
|
||||
|
||||
@@ -56,6 +57,8 @@ typedef struct {
|
||||
MD_FLOAT *sigma6;
|
||||
MD_FLOAT *cutforcesq;
|
||||
MD_FLOAT *cutneighsq;
|
||||
//TODO: insert the id number
|
||||
//MD_FLOAT *Atom_id;
|
||||
|
||||
// DEM
|
||||
MD_FLOAT *radius;
|
||||
@@ -64,6 +67,9 @@ typedef struct {
|
||||
|
||||
// Device data
|
||||
DeviceAtom d_atom;
|
||||
|
||||
//Info Subdomain
|
||||
Box mybox;
|
||||
} Atom;
|
||||
|
||||
extern void initAtom(Atom*);
|
||||
@@ -75,6 +81,17 @@ extern int readAtom_dmp(Atom*, Parameter*);
|
||||
extern int readAtom_in(Atom*, Parameter*);
|
||||
extern void growAtom(Atom*);
|
||||
|
||||
int packGhost(Atom*, int, MD_FLOAT*, int*);
|
||||
int unpackGhost(Atom*, int, MD_FLOAT*);
|
||||
int packExchange(Atom*, int, MD_FLOAT*);
|
||||
int unpackExchange(Atom*, int, MD_FLOAT*);
|
||||
void packForward(Atom*, int, int*, MD_FLOAT*, int*);
|
||||
void unpackForward(Atom*, int, int, MD_FLOAT*);
|
||||
void packReverse(Atom* , int , int , MD_FLOAT*);
|
||||
void unpackReverse(Atom*, int, int*, MD_FLOAT*);
|
||||
void pbc(Atom*);
|
||||
void copy(Atom*, int, int);
|
||||
|
||||
#ifdef AOS
|
||||
# define POS_DATA_LAYOUT "AoS"
|
||||
# define atom_x(i) atom->x[(i) * 3 + 0]
|
||||
@@ -99,4 +116,8 @@ extern void growAtom(Atom*);
|
||||
# define atom_fz(i) atom->fz[i]
|
||||
#endif
|
||||
|
||||
# define buf_x(i) buf[3*(i)]
|
||||
# define buf_y(i) buf[3*(i)+1]
|
||||
# define buf_z(i) buf[3*(i)+2]
|
||||
|
||||
#endif
|
||||
|
||||
@@ -20,9 +20,14 @@ typedef struct {
|
||||
int ncalls;
|
||||
int maxneighs;
|
||||
int half_neigh;
|
||||
int half_stencil;
|
||||
int *neighbors;
|
||||
int *numneigh;
|
||||
|
||||
//MPI
|
||||
int Nshell; //# of atoms in listShell
|
||||
int *numNeighShell; //# of neighs for each atom in listShell
|
||||
int *neighshell; //list of neighs for each atom in listShell
|
||||
int *listshell; //Atoms to compute the force
|
||||
// Device data
|
||||
DeviceNeighbor d_neighbor;
|
||||
} Neighbor;
|
||||
|
||||
@@ -5,8 +5,11 @@
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <atom.h>
|
||||
#include <comm.h>
|
||||
#include <parameter.h>
|
||||
|
||||
#ifndef __VTK_H_
|
||||
#define __VTK_H_
|
||||
extern int write_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
|
||||
extern void printvtk(const char* filename, Comm* comm, Atom* atom ,Parameter* param, int timestep);
|
||||
#endif
|
||||
|
||||
@@ -59,12 +59,6 @@ void init(Parameter *param) {
|
||||
param->eam_file = NULL;
|
||||
}
|
||||
|
||||
// Show debug messages
|
||||
#define DEBUG(msg) printf(msg)
|
||||
// Do not show debug messages
|
||||
//#define DEBUG(msg)
|
||||
|
||||
|
||||
void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps) {
|
||||
const int maxneighs = nneighs * nreps;
|
||||
neighbor->numneigh = (int*) malloc(atom->Nmax * sizeof(int));
|
||||
@@ -125,7 +119,7 @@ int main(int argc, const char *argv[]) {
|
||||
|
||||
LIKWID_MARKER_INIT;
|
||||
LIKWID_MARKER_REGISTER("force");
|
||||
DEBUG("Initializing parameters...\n");
|
||||
DEBUG_MESSAGE("Initializing parameters...\n");
|
||||
init(¶m);
|
||||
|
||||
for(int i = 0; i < argc; i++) {
|
||||
@@ -196,11 +190,11 @@ int main(int argc, const char *argv[]) {
|
||||
}
|
||||
|
||||
if(param.force_field == FF_EAM) {
|
||||
DEBUG("Initializing EAM parameters...\n");
|
||||
DEBUG_MESSAGE("Initializing EAM parameters...\n");
|
||||
initEam(&eam, ¶m);
|
||||
}
|
||||
|
||||
DEBUG("Initializing atoms...\n");
|
||||
DEBUG_MESSAGE("Initializing atoms...\n");
|
||||
initAtom(atom);
|
||||
initStats(&stats);
|
||||
|
||||
@@ -216,7 +210,7 @@ int main(int argc, const char *argv[]) {
|
||||
atom->cutforcesq[i] = param.cutforce * param.cutforce;
|
||||
}
|
||||
|
||||
DEBUG("Creating atoms...\n");
|
||||
DEBUG_MESSAGE("Creating atoms...\n");
|
||||
for(int i = 0; i < natoms; ++i) {
|
||||
while(atom->Nlocal > atom->Nmax - natoms) {
|
||||
growAtom(atom);
|
||||
@@ -247,11 +241,11 @@ int main(int argc, const char *argv[]) {
|
||||
printf("Estimated neighborlist data volume (kB): %.4f\n", estim_neighbors_volume / 1000.0);
|
||||
}
|
||||
|
||||
DEBUG("Initializing neighbor lists...\n");
|
||||
DEBUG_MESSAGE("Initializing neighbor lists...\n");
|
||||
initNeighbor(&neighbor, ¶m);
|
||||
DEBUG("Creating neighbor lists...\n");
|
||||
DEBUG_MESSAGE("Creating neighbor lists...\n");
|
||||
createNeighbors(atom, &neighbor, pattern, nneighs, nreps);
|
||||
DEBUG("Computing forces...\n");
|
||||
DEBUG_MESSAGE("Computing forces...\n");
|
||||
|
||||
double T_accum = 0.0;
|
||||
for(int i = 0; i < param.ntimes; i++) {
|
||||
|
||||
308
lammps/main.c
308
lammps/main.c
@@ -11,9 +11,7 @@
|
||||
#include <limits.h>
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
|
||||
#include <likwid-marker.h>
|
||||
|
||||
#include <allocate.h>
|
||||
#include <atom.h>
|
||||
#include <device.h>
|
||||
@@ -23,13 +21,19 @@
|
||||
#include <timing.h>
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <pbc.h>
|
||||
#include <stats.h>
|
||||
#include <timers.h>
|
||||
#include <util.h>
|
||||
#include <vtk.h>
|
||||
#include <comm.h>
|
||||
#include <grid.h>
|
||||
#include <shell_methods.h>
|
||||
#include <mpi.h>
|
||||
|
||||
#define HLINE "----------------------------------------------------------------------------\n"
|
||||
#ifdef CUDA_TARGET
|
||||
extern double computeForceLJFullNeigh_cuda(Parameter*, Atom*, Neighbor*);
|
||||
#endif
|
||||
|
||||
extern double computeForceLJFullNeigh_plain_c(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
extern double computeForceLJFullNeigh_simd(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
@@ -37,62 +41,6 @@ extern double computeForceLJHalfNeigh(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
|
||||
extern double computeForceDemFullNeigh(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
extern double computeForceLJFullNeigh_cuda(Parameter*, Atom*, Neighbor*);
|
||||
#endif
|
||||
|
||||
/*
 * Single-process set-up: derive the box extents from the density, create or
 * read the atoms, initialise thermo/PBC/device state and build the first
 * neighbour list.
 *
 * Returns the difference of the two getTimeStamp() readings taken around
 * everything that follows the box-size computation.
 */
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
    if(param->force_field == FF_EAM) {
        initEam(eam, param);
    }

    /* Lattice constant from the density, then the box length per dimension. */
    param->lattice = pow((4.0 / param->rho), (1.0 / 3.0));
    param->xprd = param->nx * param->lattice;
    param->yprd = param->ny * param->lattice;
    param->zprd = param->nz * param->lattice;

    const double tBegin = getTimeStamp();

    initAtom(atom);
    initPbc(atom);
    initStats(stats);
    initNeighbor(neighbor, param);

    /* Atoms come from the input file when one was given, otherwise they are generated. */
    if(param->input_file != NULL) {
        readAtom(atom, param);
    } else {
        createAtom(atom, param);
    }

    setupNeighbor(param);
    setupThermo(param, atom->Natoms);
    if(param->input_file == NULL) {
        adjustThermo(param, atom);
    }

    setupPbc(atom, param);
    initDevice(atom, neighbor);
    updatePbc(atom, param, true);
    buildNeighbor(atom, neighbor);

    const double tEnd = getTimeStamp();
    return tEnd - tBegin;
}
|
||||
|
||||
/*
 * Single-process re-neighbouring step: re-apply periodic boundaries to the
 * atoms, regenerate the ghost set-up and rebuild the neighbour list, all
 * inside the "reneighbour" LIKWID marker region.
 *
 * Returns the difference of the two getTimeStamp() readings.
 */
double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
    const double tBegin = getTimeStamp();

    LIKWID_MARKER_START("reneighbour");
    updateAtomsPbc(atom, param);
    setupPbc(atom, param);
    updatePbc(atom, param, true);
    /* Atom sorting is deliberately disabled here: sortAtom(atom); */
    buildNeighbor(atom, neighbor);
    LIKWID_MARKER_STOP("reneighbour");

    const double tEnd = getTimeStamp();
    return tEnd - tBegin;
}
|
||||
|
||||
/*
 * Debug helper: print the atom counters (total, local, ghost, capacity)
 * on a single line to stdout.
 */
void printAtomState(Atom *atom) {
    printf("Atom counts: Natoms=%d Nlocal=%d Nghost=%d Nmax=%d\n",
           atom->Natoms,
           atom->Nlocal,
           atom->Nghost,
           atom->Nmax);

    /*
     * Optional per-atom position dump, kept disabled:
     *
     *   int nall = atom->Nlocal + atom->Nghost;
     *   for (int i=0; i<nall; i++) {
     *       printf("%d  %f %f %f\n", i, atom->x[i], atom->y[i], atom->z[i]);
     *   }
     */
}
|
||||
|
||||
double computeForce(Eam *eam, Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
if(param->force_field == FF_EAM) {
|
||||
return computeForceEam(eam, param, atom, neighbor, stats);
|
||||
@@ -105,7 +53,7 @@ double computeForce(Eam *eam, Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
}
|
||||
}
|
||||
|
||||
if(param->half_neigh) {
|
||||
if(param->half_neigh || param->method) {
|
||||
return computeForceLJHalfNeigh(param, atom, neighbor, stats);
|
||||
}
|
||||
|
||||
@@ -116,6 +64,102 @@ double computeForce(Eam *eam, Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
 * Re-partition the domain with the scheme selected by param->balance and
 * refresh the neighbour communication afterwards.
 *
 *   RCB          -> rcbBalance with meanBisect (time argument 0)
 *   meanTimeRCB  -> rcbBalance with meanTimeBisect, weighted by `time`
 *   Staggered    -> staggeredBalance, followed by an atom exchange
 *   anything else: nothing is done
 *
 * Returns the difference of the two getTimeStamp() readings.
 */
double dynamicBalance(Comm* comm, Grid* grid, Atom* atom, Parameter* param, double time){
    const int dims = 3; //TODO: Adjust to do in 3d and 2d
    const double tBegin = getTimeStamp();

    int repartitioned = 1;   /* cleared when no balancing scheme applies   */
    int needExchange  = 0;   /* only the staggered scheme exchanges atoms  */

    if(param->balance == RCB) {
        rcbBalance(grid, atom, param, meanBisect, dims, 0);
    } else if(param->balance == meanTimeRCB) {
        rcbBalance(grid, atom, param, meanTimeBisect, dims, time);
    } else if(param->balance == Staggered) {
        staggeredBalance(grid, atom, param, time);
        needExchange = 1;
    } else {
        repartitioned = 0;   /* Do nothing */
    }

    if(repartitioned) {
        neighComm(comm, param, grid);
    }
    if(needExchange) {
        exchangeComm(comm, atom);
    }
    //printGrid(grid);

    const double tEnd = getTimeStamp();
    return tEnd - tBegin;
}
|
||||
|
||||
/*
 * Initial load balancing, run once during set-up.
 *
 * For the RCB and meanTimeRCB schemes the domain is bisected with meanBisect
 * and the neighbour communication is rebuilt. In every case the global atom
 * count is then recomputed as the sum of all ranks' local counts, and each
 * rank prints its own local and total counts.
 *
 * eam, neighbor and stats are not used here; they are kept so the signature
 * stays compatible with existing callers.
 *
 * Returns the difference of the two getTimeStamp() readings.
 */
double initialBalance(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats, Comm *comm, Grid *grid)
{
    /* Silence unused-parameter warnings without changing the interface. */
    (void) eam;
    (void) neighbor;
    (void) stats;

    int me;
    MPI_Comm_rank(world, &me);

    const double S = getTimeStamp();

    if(param->balance == meanTimeRCB || param->balance == RCB) {
        rcbBalance(grid, atom, param, meanBisect, 3, 0);
        neighComm(comm, param, grid);
    }

    /* Natoms becomes the sum of Nlocal over all ranks. */
    MPI_Allreduce(&atom->Nlocal, &atom->Natoms, 1, MPI_INT, MPI_SUM, world);
    printf("Processor:%i, Local atoms:%i, Total atoms:%i\n", me, atom->Nlocal, atom->Natoms);
    MPI_Barrier(world);

    const double E = getTimeStamp();
    return E - S;
}
|
||||
|
||||
/*
 * MPI set-up: derive the box extents from the density, create or read the
 * atoms, set up grid / neighbour bins / communication, optionally run the
 * initial load balance, then initialise thermo and device state, exchange
 * ghosts and build the first neighbour list.
 *
 * The call order below is significant and mirrors the original sequence.
 *
 * Returns the difference of the two getTimeStamp() readings taken around
 * everything that follows the box-size computation.
 */
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats, Comm *comm, Grid *grid) {
    if(param->force_field == FF_EAM) {
        initEam(eam, param);
    }

    /* Lattice constant from the density, then the box length per dimension. */
    param->lattice = pow((4.0 / param->rho), (1.0 / 3.0));
    param->xprd = param->nx * param->lattice;
    param->yprd = param->ny * param->lattice;
    param->zprd = param->nz * param->lattice;

    const double tBegin = getTimeStamp();

    initAtom(atom);
    initStats(stats);
    initNeighbor(neighbor, param);

    /* Atoms come from the input file when one was given, otherwise they are generated. */
    if(param->input_file != NULL) {
        readAtom(atom, param);
    } else {
        createAtom(atom, param);
    }

    setupGrid(grid, atom, param);
    setupNeighbor(param);
    setupComm(comm, param, grid);

    if(param->balance) {
        /* The timing returned by initialBalance is not recorded here. */
        initialBalance(param, eam, atom, neighbor, stats, comm, grid);
    }

    setupThermo(param, atom->Natoms);
    if(param->input_file == NULL) {
        adjustThermo(param, atom);
    }

#ifdef SORT_ATOMS
    /* The ghost count is reset before sorting. */
    atom->Nghost = 0;
    sortAtom(atom);
#endif

    initDevice(atom, neighbor);
    ghostNeighbor(comm, atom, param);
    buildNeighbor(atom, neighbor);

    const double tEnd = getTimeStamp();
    return tEnd - tBegin;
}
|
||||
|
||||
/*
 * MPI re-neighbouring step: optionally sort the atoms (SORT_ATOMS), exchange
 * ghost atoms and rebuild the neighbour list, all inside the "reneighbour"
 * LIKWID marker region.
 *
 * Returns the difference of the two getTimeStamp() readings.
 */
double reneighbour(Comm* comm, Parameter *param, Atom *atom, Neighbor *neighbor) {
    const double tBegin = getTimeStamp();

    LIKWID_MARKER_START("reneighbour");
#ifdef SORT_ATOMS
    /* The ghost count is reset before sorting. */
    atom->Nghost = 0;
    sortAtom(atom);
#endif
    ghostNeighbor(comm, atom, param);
    buildNeighbor(atom, neighbor);
    LIKWID_MARKER_STOP("reneighbour");

    const double tEnd = getTimeStamp();
    return tEnd - tBegin;
}
|
||||
|
||||
/*
 * Timed wrapper around exchangeComm(comm, atom).
 *
 * Returns the difference of the getTimeStamp() readings taken immediately
 * before and after the exchange.
 */
double updateAtoms(Comm* comm, Atom* atom){
    const double tBegin = getTimeStamp();
    exchangeComm(comm, atom);
    const double tEnd = getTimeStamp();
    return tEnd - tBegin;
}
|
||||
|
||||
void writeInput(Parameter *param, Atom *atom) {
|
||||
FILE *fpin = fopen("input.in", "w");
|
||||
fprintf(fpin, "0,%f,0,%f,0,%f\n", param->xprd, param->yprd, param->zprd);
|
||||
@@ -134,15 +178,16 @@ int main(int argc, char** argv) {
|
||||
Neighbor neighbor;
|
||||
Stats stats;
|
||||
Parameter param;
|
||||
|
||||
Comm comm;
|
||||
Grid grid;
|
||||
LIKWID_MARKER_INIT;
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_REGISTER("force");
|
||||
//LIKWID_MARKER_REGISTER("reneighbour");
|
||||
//LIKWID_MARKER_REGISTER("pbc");
|
||||
}
|
||||
|
||||
}
|
||||
initComm(&argc, &argv, &comm);
|
||||
initParameter(¶m);
|
||||
for(int i = 0; i < argc; i++) {
|
||||
if((strcmp(argv[i], "-p") == 0)) {
|
||||
@@ -183,6 +228,24 @@ int main(int argc, char** argv) {
|
||||
if((strcmp(argv[i], "-half") == 0)) {
|
||||
param.half_neigh = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-method") == 0)) {
|
||||
param.method = atoi(argv[++i]);
|
||||
if (param.method>3 || param.method< 0){
|
||||
if(comm.myproc == 0) fprintf(stderr, "Method does not exist!\n");
|
||||
endComm(&comm);
|
||||
exit(0);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-bal") == 0)) {
|
||||
param.balance = atoi(argv[++i]);
|
||||
if (param.balance>3 || param.balance< 0){
|
||||
if(comm.myproc == 0) fprintf(stderr, "Load Balance does not exist!\n");
|
||||
endComm(&comm);
|
||||
exit(0);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-r") == 0) || (strcmp(argv[i], "--radius") == 0)) {
|
||||
param.cutforce = atof(argv[++i]);
|
||||
@@ -201,60 +264,70 @@ int main(int argc, char** argv) {
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
|
||||
printf("MD Bench: A minimalistic re-implementation of miniMD\n");
|
||||
printf(HLINE);
|
||||
printf("-p <string>: file to read parameters from (can be specified more than once)\n");
|
||||
printf("-f <string>: force field (lj, eam or dem), default lj\n");
|
||||
printf("-i <string>: input file with atom positions (dump)\n");
|
||||
printf("-e <string>: input file for EAM\n");
|
||||
printf("-n / --nsteps <int>: set number of timesteps for simulation\n");
|
||||
printf("-nx/-ny/-nz <int>: set linear dimension of systembox in x/y/z direction\n");
|
||||
printf("-r / --radius <real>: set cutoff radius\n");
|
||||
printf("-s / --skin <real>: set skin (verlet buffer)\n");
|
||||
printf("--freq <real>: processor frequency (GHz)\n");
|
||||
printf("--vtk <string>: VTK file for visualization\n");
|
||||
printf(HLINE);
|
||||
exit(EXIT_SUCCESS);
|
||||
if(comm.myproc ==0 ){
|
||||
printf("MD Bench: A minimalistic re-implementation of miniMD\n");
|
||||
printf(HLINE);
|
||||
printf("-p <string>: file to read parameters from (can be specified more than once)\n");
|
||||
printf("-f <string>: force field (lj, eam or dem), default lj\n");
|
||||
printf("-i <string>: input file with atom positions (dump)\n");
|
||||
printf("-e <string>: input file for EAM\n");
|
||||
printf("-n / --nsteps <int>: set number of timesteps for simulation\n");
|
||||
printf("-nx/-ny/-nz <int>: set linear dimension of systembox in x/y/z direction\n");
|
||||
printf("-r / --radius <real>: set cutoff radius\n");
|
||||
printf("-s / --skin <real>: set skin (verlet buffer)\n");
|
||||
printf("--freq <real>: processor frequency (GHz)\n");
|
||||
printf("--vtk <string>: VTK file for visualization\n");
|
||||
printf(HLINE);
|
||||
}
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
}
|
||||
|
||||
if(param.balance>0 && param.method == 1){
|
||||
if(comm.myproc == 0) fprintf(stderr, "Half Shell is not supported by load balance!\n");
|
||||
endComm(&comm);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
param.cutneigh = param.cutforce + param.skin;
|
||||
setup(¶m, &eam, &atom, &neighbor, &stats);
|
||||
printParameter(¶m);
|
||||
printf(HLINE);
|
||||
|
||||
printf("step\ttemp\t\tpressure\n");
|
||||
timer[SETUP]=setup(¶m, &eam, &atom, &neighbor, &stats, &comm, &grid);
|
||||
if(comm.myproc == 0)printParameter(¶m);
|
||||
if(comm.myproc == 0)printf(HLINE);
|
||||
if(comm.myproc == 0) printf("step\ttemp\t\tpressure\n");
|
||||
computeThermo(0, ¶m, &atom);
|
||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
|
||||
traceAddresses(¶m, &atom, &neighbor, n + 1);
|
||||
traceAddresses(¶m, &atom, &neighbor, n + 1);// TODO: trace adress
|
||||
#endif
|
||||
|
||||
//writeInput(¶m, &atom);
|
||||
|
||||
timer[FORCE] = computeForce(&eam, ¶m, &atom, &neighbor, &stats);
|
||||
timer[NEIGH] = 0.0;
|
||||
timer[TOTAL] = getTimeStamp();
|
||||
|
||||
timer[FORCE] = computeForce(&eam, ¶m, &atom, &neighbor, &stats);
|
||||
timer[NEIGH] = 0.0;
|
||||
timer[FORWARD] = 0.0;
|
||||
timer[UPDATE] = 0.0;
|
||||
timer[BALANCE] = 0.0;
|
||||
timer[REVERSE] = reverse(&comm, &atom, ¶m);
|
||||
MPI_Barrier(world);
|
||||
timer[TOTAL] = getTimeStamp();
|
||||
if(param.vtk_file != NULL) {
|
||||
write_atoms_to_vtk_file(param.vtk_file, &atom, 0);
|
||||
}
|
||||
|
||||
printvtk(param.vtk_file, &comm, &atom, ¶m, 0);
|
||||
}
|
||||
for(int n = 0; n < param.ntimes; n++) {
|
||||
bool reneigh = (n + 1) % param.reneigh_every == 0;
|
||||
initialIntegrate(reneigh, ¶m, &atom);
|
||||
if((n + 1) % param.reneigh_every) {
|
||||
updatePbc(&atom, ¶m, false);
|
||||
if(reneigh) {
|
||||
timer[UPDATE] +=updateAtoms(&comm,&atom);
|
||||
if(param.balance && !((n+1)%param.balance_every))
|
||||
timer[BALANCE] +=dynamicBalance(&comm, &grid, &atom , ¶m, timer[FORCE]);
|
||||
timer[NEIGH] += reneighbour(&comm, ¶m, &atom, &neighbor);
|
||||
} else {
|
||||
timer[NEIGH] += reneighbour(¶m, &atom, &neighbor);
|
||||
}
|
||||
|
||||
timer[FORWARD] += forward(&comm, &atom, ¶m);
|
||||
}
|
||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
|
||||
traceAddresses(¶m, &atom, &neighbor, n + 1);
|
||||
#endif
|
||||
|
||||
timer[FORCE] += computeForce(&eam, ¶m, &atom, &neighbor, &stats);
|
||||
timer[REVERSE] += reverse(&comm, &atom, ¶m);
|
||||
finalIntegrate(reneigh, ¶m, &atom);
|
||||
|
||||
|
||||
if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
|
||||
#ifdef CUDA_TARGET
|
||||
memcpyFromGPU(atom.x, atom.d_atom.x, atom.Nmax * sizeof(MD_FLOAT) * 3);
|
||||
@@ -263,23 +336,42 @@ int main(int argc, char** argv) {
|
||||
}
|
||||
|
||||
if(param.vtk_file != NULL) {
|
||||
write_atoms_to_vtk_file(param.vtk_file, &atom, n + 1);
|
||||
}
|
||||
printvtk(param.vtk_file, &comm, &atom ,¶m, n+1);
|
||||
}
|
||||
}
|
||||
|
||||
MPI_Barrier(world);
|
||||
timer[TOTAL] = getTimeStamp() - timer[TOTAL];
|
||||
computeThermo(-1, ¶m, &atom);
|
||||
|
||||
double mint[NUMTIMER];
|
||||
double maxt[NUMTIMER];
|
||||
double sumt[NUMTIMER];
|
||||
timer[REST] = timer[TOTAL]-timer[FORCE]-timer[NEIGH]-timer[BALANCE]-timer[FORWARD]-timer[REVERSE];
|
||||
MPI_Reduce(timer,mint,NUMTIMER,MPI_DOUBLE,MPI_MIN,0,world);
|
||||
MPI_Reduce(timer,maxt,NUMTIMER,MPI_DOUBLE,MPI_MAX,0,world);
|
||||
MPI_Reduce(timer,sumt,NUMTIMER,MPI_DOUBLE,MPI_SUM,0,world);
|
||||
int Nghost;
|
||||
MPI_Reduce(&atom.Nghost,&Nghost,1,MPI_INT,MPI_SUM,0,world);
|
||||
|
||||
printf(HLINE);
|
||||
printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
|
||||
printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
|
||||
timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]);
|
||||
printf(HLINE);
|
||||
printf("Performance: %.2f million atom updates per second\n",
|
||||
if(comm.myproc == 0){
|
||||
int n = comm.numproc;
|
||||
printf(HLINE);
|
||||
printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, Nghost, param.ntimes);
|
||||
printf("TOTAL %.2fs\n\n",timer[TOTAL]);
|
||||
printf("%4s|%7s|%7s|%7s|%7s|%7s|%7s|%7s|%7s|\n","","FORCE ", "NEIGH ", "BALANCE", "FORWARD", "REVERSE","UPDATE","REST ","SETUP");
|
||||
printf("----|-------|-------|-------|-------|-------|-------|-------|-------|\n");
|
||||
printf("%4s|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|\n", "AVG", sumt[FORCE]/n,sumt[NEIGH]/n,sumt[BALANCE]/n,sumt[FORWARD]/n,sumt[REVERSE]/n,sumt[UPDATE]/n,sumt[REST]/n,sumt[SETUP]/n);
|
||||
printf("%4s|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|\n", "MIN", mint[FORCE],mint[NEIGH],mint[BALANCE],mint[FORWARD],mint[REVERSE],mint[UPDATE],mint[REST],mint[SETUP]);
|
||||
printf("%4s|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|%7.2f|\n", "MAX", maxt[FORCE],maxt[NEIGH],maxt[BALANCE],maxt[FORWARD],maxt[REVERSE],maxt[UPDATE],maxt[REST],maxt[SETUP]);
|
||||
printf(HLINE);
|
||||
printf("Performance: %.2f million atom updates per second\n",
|
||||
1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
|
||||
|
||||
#ifdef COMPUTE_STATS
|
||||
displayStatistics(&atom, ¶m, &stats, timer);
|
||||
#endif
|
||||
}
|
||||
endComm(&comm);
|
||||
LIKWID_MARKER_CLOSE;
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -11,27 +11,39 @@
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <atom.h>
|
||||
#include <util.h>
|
||||
#include <mpi.h>
|
||||
|
||||
#define SMALL 1.0e-6
|
||||
#define FACTOR 0.999
|
||||
|
||||
MD_FLOAT xprd, yprd, zprd;
|
||||
MD_FLOAT bininvx, bininvy, bininvz;
|
||||
int mbinxlo, mbinylo, mbinzlo;
|
||||
int pad_x, pad_y, pad_z;
|
||||
int nbinx, nbiny, nbinz;
|
||||
int mbinx, mbiny, mbinz; // n bins in x, y, z
|
||||
int mbinx, mbiny, mbinz; // m bins in x, y, z
|
||||
int *bincount;
|
||||
int *bins;
|
||||
int mbins; //total number of bins
|
||||
int atoms_per_bin; // max atoms per bin
|
||||
int mbins; //total number of bins
|
||||
int atoms_per_bin; // max atoms per bin
|
||||
MD_FLOAT cutneigh;
|
||||
MD_FLOAT cutneighsq; // neighbor cutoff squared
|
||||
MD_FLOAT cutneighsq; // neighbor cutoff squared
|
||||
int nmax;
|
||||
int nstencil; // # of bins in stencil
|
||||
int* stencil; // stencil list of bin offsets
|
||||
int nstencil; // # of bins in stencil
|
||||
int* stencil; // stencil list of bin offsets
|
||||
MD_FLOAT binsizex, binsizey, binsizez;
|
||||
int me; //rank
|
||||
int method; // method
|
||||
int half_stencil; //If half stencil exist
|
||||
int shellMethod; //If shell method exist
|
||||
|
||||
static int coord2bin(MD_FLOAT, MD_FLOAT , MD_FLOAT);
|
||||
static MD_FLOAT bindist(int, int, int);
|
||||
static int ghostZone(Atom*, int);
|
||||
static int eightZone(Atom*, int);
|
||||
static int halfZone(Atom*, int);
|
||||
static void neighborGhost(Atom*, Neighbor*);
|
||||
static inline int interaction(Atom* atom, int i, int j);
|
||||
|
||||
/* exported subroutines */
|
||||
void initNeighbor(Neighbor *neighbor, Parameter *param) {
|
||||
@@ -51,7 +63,25 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) {
|
||||
neighbor->maxneighs = 100;
|
||||
neighbor->numneigh = NULL;
|
||||
neighbor->neighbors = NULL;
|
||||
//========== MPI =============
|
||||
shellMethod = 0;
|
||||
half_stencil = 0;
|
||||
method = param->method;
|
||||
if(method == halfShell || method == eightShell){
|
||||
param->half_neigh = 1;
|
||||
shellMethod = 1;
|
||||
}
|
||||
if(method == halfStencil){
|
||||
param->half_neigh = 0;
|
||||
half_stencil = 1;
|
||||
}
|
||||
me = 0;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD,&me);
|
||||
neighbor->half_neigh = param->half_neigh;
|
||||
neighbor->Nshell = 0;
|
||||
neighbor->numNeighShell = NULL;
|
||||
neighbor->neighshell = NULL;
|
||||
neighbor->listshell = NULL;
|
||||
}
|
||||
|
||||
void setupNeighbor(Parameter* param) {
|
||||
@@ -64,7 +94,6 @@ void setupNeighbor(Parameter* param) {
|
||||
yprd = param->yprd;
|
||||
zprd = param->zprd;
|
||||
}
|
||||
|
||||
// TODO: update lo and hi for standard case and use them here instead
|
||||
MD_FLOAT xlo = 0.0; MD_FLOAT xhi = xprd;
|
||||
MD_FLOAT ylo = 0.0; MD_FLOAT yhi = yprd;
|
||||
@@ -93,54 +122,48 @@ void setupNeighbor(Parameter* param) {
|
||||
bininvy = 1.0 / binsizey;
|
||||
bininvz = 1.0 / binsizez;
|
||||
}
|
||||
|
||||
coord = xlo - cutneigh - SMALL * xprd;
|
||||
mbinxlo = (int) (coord * bininvx);
|
||||
if (coord < 0.0) { mbinxlo = mbinxlo - 1; }
|
||||
coord = xhi + cutneigh + SMALL * xprd;
|
||||
mbinxhi = (int) (coord * bininvx);
|
||||
|
||||
coord = ylo - cutneigh - SMALL * yprd;
|
||||
mbinylo = (int) (coord * bininvy);
|
||||
if (coord < 0.0) { mbinylo = mbinylo - 1; }
|
||||
coord = yhi + cutneigh + SMALL * yprd;
|
||||
mbinyhi = (int) (coord * bininvy);
|
||||
|
||||
coord = zlo - cutneigh - SMALL * zprd;
|
||||
mbinzlo = (int) (coord * bininvz);
|
||||
if (coord < 0.0) { mbinzlo = mbinzlo - 1; }
|
||||
coord = zhi + cutneigh + SMALL * zprd;
|
||||
mbinzhi = (int) (coord * bininvz);
|
||||
|
||||
mbinxlo = mbinxlo - 1;
|
||||
mbinxhi = mbinxhi + 1;
|
||||
mbinx = mbinxhi - mbinxlo + 1;
|
||||
|
||||
mbinylo = mbinylo - 1;
|
||||
mbinyhi = mbinyhi + 1;
|
||||
mbiny = mbinyhi - mbinylo + 1;
|
||||
|
||||
mbinzlo = mbinzlo - 1;
|
||||
mbinzhi = mbinzhi + 1;
|
||||
mbinz = mbinzhi - mbinzlo + 1;
|
||||
pad_x = (int)(cutneigh*bininvx);
|
||||
while(pad_x * binsizex < FACTOR * cutneigh) pad_x++;
|
||||
pad_y = (int)(cutneigh*bininvy);
|
||||
while(pad_y * binsizey < FACTOR * cutneigh) pad_y++;
|
||||
pad_z = (int)(cutneigh*bininvz);
|
||||
while(pad_z * binsizez < FACTOR * cutneigh) pad_z++;
|
||||
|
||||
nextx = (int) (cutneigh * bininvx);
|
||||
if(nextx * binsizex < FACTOR * cutneigh) nextx++;
|
||||
if(nextx * binsizex < FACTOR * cutneigh){
|
||||
nextx++;
|
||||
pad_x++;
|
||||
}
|
||||
nexty = (int) (cutneigh * bininvy);
|
||||
if(nexty * binsizey < FACTOR * cutneigh) nexty++;
|
||||
if(nexty * binsizey < FACTOR * cutneigh){
|
||||
nexty++;
|
||||
pad_y++;
|
||||
}
|
||||
nextz = (int) (cutneigh * bininvz);
|
||||
if(nextz * binsizez < FACTOR * cutneigh) nextz++;
|
||||
if(nextz * binsizez < FACTOR * cutneigh){
|
||||
nextz++;
|
||||
pad_z++;
|
||||
}
|
||||
|
||||
mbinx = nbinx+4*pad_x;
|
||||
mbiny = nbiny+4*pad_y;
|
||||
mbinz = nbinz+4*pad_z;
|
||||
|
||||
if (stencil) { free(stencil); }
|
||||
stencil = (int*) malloc((2 * nextz + 1) * (2 * nexty + 1) * (2 * nextx + 1) * sizeof(int));
|
||||
nstencil = 0;
|
||||
|
||||
int kstart = -nextz;
|
||||
|
||||
int jstart = -nexty;
|
||||
int istart = -nextx;
|
||||
int ibin = 0;
|
||||
for(int k = kstart; k <= nextz; k++) {
|
||||
for(int j = -nexty; j <= nexty; j++) {
|
||||
for(int i = -nextx; i <= nextx; i++) {
|
||||
if(bindist(i, j, k) < cutneighsq) {
|
||||
stencil[nstencil++] = k * mbiny * mbinx + j * mbinx + i;
|
||||
for(int j = jstart; j <= nexty; j++) {
|
||||
for(int i = istart; i <= nextx; i++) {
|
||||
if(bindist(i, j, k) < cutneighsq) {
|
||||
int jbin = k * mbiny * mbinx + j * mbinx + i;
|
||||
if(ibin>jbin && half_stencil) continue;
|
||||
stencil[nstencil++] = jbin;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -154,8 +177,7 @@ void setupNeighbor(Parameter* param) {
|
||||
}
|
||||
|
||||
void buildNeighbor_cpu(Atom *atom, Neighbor *neighbor) {
|
||||
int nall = atom->Nlocal + atom->Nghost;
|
||||
|
||||
int nall = atom->Nlocal + atom->Nghost;
|
||||
/* extend atom arrays if necessary */
|
||||
if(nall > nmax) {
|
||||
nmax = nall;
|
||||
@@ -164,16 +186,13 @@ void buildNeighbor_cpu(Atom *atom, Neighbor *neighbor) {
|
||||
neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
|
||||
neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*));
|
||||
}
|
||||
|
||||
/* bin local & ghost atoms */
|
||||
binatoms(atom);
|
||||
int resize = 1;
|
||||
|
||||
/* loop over each atom, storing neighbors */
|
||||
while(resize) {
|
||||
int new_maxneighs = neighbor->maxneighs;
|
||||
resize = 0;
|
||||
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
int* neighptr = &(neighbor->neighbors[i * neighbor->maxneighs]);
|
||||
int n = 0;
|
||||
@@ -184,21 +203,22 @@ void buildNeighbor_cpu(Atom *atom, Neighbor *neighbor) {
|
||||
#ifdef EXPLICIT_TYPES
|
||||
int type_i = atom->type[i];
|
||||
#endif
|
||||
|
||||
for(int k = 0; k < nstencil; k++) {
|
||||
int jbin = ibin + stencil[k];
|
||||
int* loc_bin = &bins[jbin * atoms_per_bin];
|
||||
|
||||
for(int m = 0; m < bincount[jbin]; m++) {
|
||||
for(int m = 0; m < bincount[jbin]; m++) {
|
||||
int j = loc_bin[m];
|
||||
if((j == i) || (neighbor->half_neigh && (j < i))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
if((j==i) || (neighbor->half_neigh && (j<i)))
|
||||
continue;
|
||||
if(half_stencil && ibin==jbin && !interaction(atom,i,j))
|
||||
continue;
|
||||
|
||||
MD_FLOAT delx = xtmp - atom_x(j);
|
||||
MD_FLOAT dely = ytmp - atom_y(j);
|
||||
MD_FLOAT dely = ytmp - atom_y(j);
|
||||
MD_FLOAT delz = ztmp - atom_z(j);
|
||||
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
int type_j = atom->type[j];
|
||||
const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j];
|
||||
@@ -210,8 +230,8 @@ void buildNeighbor_cpu(Atom *atom, Neighbor *neighbor) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
neighbor->numneigh[i] = n;
|
||||
|
||||
if(n >= neighbor->maxneighs) {
|
||||
resize = 1;
|
||||
|
||||
@@ -220,14 +240,15 @@ void buildNeighbor_cpu(Atom *atom, Neighbor *neighbor) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(resize) {
|
||||
printf("RESIZE %d\n", neighbor->maxneighs);
|
||||
printf("RESIZE %d, PROC %d\n", neighbor->maxneighs,me);
|
||||
neighbor->maxneighs = new_maxneighs * 1.2;
|
||||
free(neighbor->neighbors);
|
||||
neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
|
||||
}
|
||||
}
|
||||
|
||||
if(method == eightShell) neighborGhost(atom, neighbor);
|
||||
}
|
||||
|
||||
/* internal subroutines */
|
||||
@@ -257,44 +278,28 @@ MD_FLOAT bindist(int i, int j, int k) {
|
||||
} else {
|
||||
delz = (k + 1) * binsizez;
|
||||
}
|
||||
|
||||
return (delx * delx + dely * dely + delz * delz);
|
||||
}
|
||||
|
||||
int coord2bin(MD_FLOAT xin, MD_FLOAT yin, MD_FLOAT zin) {
|
||||
int ix, iy, iz;
|
||||
|
||||
if(xin >= xprd) {
|
||||
ix = (int)((xin - xprd) * bininvx) + nbinx - mbinxlo;
|
||||
} else if(xin >= 0.0) {
|
||||
ix = (int)(xin * bininvx) - mbinxlo;
|
||||
} else {
|
||||
ix = (int)(xin * bininvx) - mbinxlo - 1;
|
||||
}
|
||||
|
||||
if(yin >= yprd) {
|
||||
iy = (int)((yin - yprd) * bininvy) + nbiny - mbinylo;
|
||||
} else if(yin >= 0.0) {
|
||||
iy = (int)(yin * bininvy) - mbinylo;
|
||||
} else {
|
||||
iy = (int)(yin * bininvy) - mbinylo - 1;
|
||||
}
|
||||
|
||||
if(zin >= zprd) {
|
||||
iz = (int)((zin - zprd) * bininvz) + nbinz - mbinzlo;
|
||||
} else if(zin >= 0.0) {
|
||||
iz = (int)(zin * bininvz) - mbinzlo;
|
||||
} else {
|
||||
iz = (int)(zin * bininvz) - mbinzlo - 1;
|
||||
}
|
||||
|
||||
return (iz * mbiny * mbinx + iy * mbinx + ix + 1);
|
||||
int ix, iy, iz;
|
||||
MD_FLOAT eps = 1e-9;
|
||||
MD_FLOAT xlo=0.0; MD_FLOAT ylo=0.0; MD_FLOAT zlo=0.0;
|
||||
xlo = fabs(xlo - pad_x*binsizex)+eps;
|
||||
ylo = fabs(ylo - pad_y*binsizey)+eps;
|
||||
zlo = fabs(zlo - pad_z*binsizez)+eps;
|
||||
ix = (int) ((xin + xlo)*bininvx);
|
||||
iy = (int) ((yin + ylo)*bininvy);
|
||||
iz = (int) ((zin + zlo)*bininvz);
|
||||
|
||||
return (iz * mbiny * mbinx + iy * mbinx + ix);
|
||||
//return (iz * mbiny * mbinx + iy * mbinx + ix + 1);
|
||||
}
|
||||
|
||||
void binatoms(Atom *atom) {
|
||||
void binatoms(Atom *atom) {
|
||||
int nall = atom->Nlocal + atom->Nghost;
|
||||
int resize = 1;
|
||||
|
||||
|
||||
while(resize > 0) {
|
||||
resize = 0;
|
||||
|
||||
@@ -304,7 +309,7 @@ void binatoms(Atom *atom) {
|
||||
|
||||
for(int i = 0; i < nall; i++) {
|
||||
int ibin = coord2bin(atom_x(i), atom_y(i), atom_z(i));
|
||||
|
||||
if(shellMethod && !ghostZone(atom, i)) continue;
|
||||
if(bincount[ibin] < atoms_per_bin) {
|
||||
int ac = bincount[ibin]++;
|
||||
bins[ibin * atoms_per_bin + ac] = i;
|
||||
@@ -325,54 +330,51 @@ void sortAtom(Atom* atom) {
|
||||
binatoms(atom);
|
||||
int Nmax = atom->Nmax;
|
||||
int* binpos = bincount;
|
||||
|
||||
for(int i=1; i<mbins; i++) {
|
||||
binpos[i] += binpos[i-1];
|
||||
for(int i = 1; i < mbins; i++) {
|
||||
binpos[i] += binpos[i - 1];
|
||||
}
|
||||
|
||||
#ifdef AOS
|
||||
#ifdef AOS
|
||||
MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
|
||||
MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
|
||||
#else
|
||||
#else
|
||||
MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
|
||||
MD_FLOAT* new_y = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
|
||||
MD_FLOAT* new_z = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
|
||||
MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
|
||||
MD_FLOAT* new_vy = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
|
||||
MD_FLOAT* new_vz = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
|
||||
#endif
|
||||
#endif
|
||||
MD_FLOAT* old_x = atom->x; MD_FLOAT* old_y = atom->y; MD_FLOAT* old_z = atom->z;
|
||||
MD_FLOAT* old_vx = atom->vx; MD_FLOAT* old_vy = atom->vy; MD_FLOAT* old_vz = atom->vz;
|
||||
|
||||
for(int mybin = 0; mybin<mbins; mybin++) {
|
||||
int start = mybin>0?binpos[mybin-1]:0;
|
||||
for(int mybin = 0; mybin < mbins; mybin++) {
|
||||
int start = mybin > 0 ? binpos[mybin - 1] : 0;
|
||||
int count = binpos[mybin] - start;
|
||||
for(int k=0; k<count; k++) {
|
||||
for(int k = 0; k < count; k++) {
|
||||
int new_i = start + k;
|
||||
int old_i = bins[mybin * atoms_per_bin + k];
|
||||
#ifdef AOS
|
||||
#ifdef AOS
|
||||
new_x[new_i * 3 + 0] = old_x[old_i * 3 + 0];
|
||||
new_x[new_i * 3 + 1] = old_x[old_i * 3 + 1];
|
||||
new_x[new_i * 3 + 2] = old_x[old_i * 3 + 2];
|
||||
new_vx[new_i * 3 + 0] = old_vx[old_i * 3 + 0];
|
||||
new_vx[new_i * 3 + 1] = old_vx[old_i * 3 + 1];
|
||||
new_vx[new_i * 3 + 2] = old_vx[old_i * 3 + 2];
|
||||
#else
|
||||
#else
|
||||
new_x[new_i] = old_x[old_i];
|
||||
new_y[new_i] = old_y[old_i];
|
||||
new_z[new_i] = old_z[old_i];
|
||||
new_vx[new_i] = old_vx[old_i];
|
||||
new_vy[new_i] = old_vy[old_i];
|
||||
new_vz[new_i] = old_vz[old_i];
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
free(atom->x);
|
||||
free(atom->vx);
|
||||
atom->x = new_x;
|
||||
atom->vx = new_vx;
|
||||
#ifndef AOS
|
||||
#ifndef AOS
|
||||
free(atom->y);
|
||||
free(atom->z);
|
||||
free(atom->vy);
|
||||
@@ -381,5 +383,160 @@ void sortAtom(Atom* atom) {
|
||||
atom->z = new_z;
|
||||
atom->vy = new_vy;
|
||||
atom->vz = new_vz;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
/* internal subroutines
|
||||
Added with MPI*/
|
||||
|
||||
/*
 * Zone classification of atom i for the shell methods.
 *
 * Local atoms (i < Nlocal) always yield 1. For ghost atoms the result comes
 * from halfZone() or eightZone(), depending on the file-level `method`; any
 * other method yields 0.
 */
static int ghostZone(Atom* atom, int i){
    if(i < atom->Nlocal) {
        return 1;
    }
    if(method == halfShell) {
        return halfZone(atom, i);
    }
    if(method == eightShell) {
        return eightZone(atom, i);
    }
    return 0;
}
|
||||
|
||||
/*
 * Eight-shell zone index of atom i.
 *
 * A 3-bit code is formed from whether the atom's x, y and z coordinates are
 * at or beyond the upper bound of this rank's box (bit 0 = x, bit 1 = y,
 * bit 2 = z), and then translated through a fixed renumbering table.
 */
static int eightZone(Atom* atom, int i)
{
    /* Renumbering: 0->0, 1->1, 2->2, 3->6, 4->3, 5->5, 6->4, 7->7 */
    static const int zoneMapping[8] = {0, 1, 2, 6, 3, 5, 4, 7};
    MD_FLOAT *hi = atom->mybox.hi;
    int code = 0;

    if(BigOrEqual(atom_x(i), hi[_x])) { code |= 1; }
    if(BigOrEqual(atom_y(i), hi[_y])) { code |= 2; }
    if(BigOrEqual(atom_z(i), hi[_z])) { code |= 4; }

    return zoneMapping[code];
}
|
||||
|
||||
/*
 * Half-shell membership test for atom i.
 *
 * Returns 0 when the atom lies in one of the three excluded regions
 * (below the box in x while below the upper bounds in y and z; below the box
 * in y while below the upper bound in z; or below the box in z), and 1
 * otherwise.
 */
static int halfZone(Atom* atom, int i)
{
    MD_FLOAT *hi = atom->mybox.hi;
    MD_FLOAT *lo = atom->mybox.lo;

    const int excludedX = atom_x(i) < lo[_x] && atom_y(i) < hi[_y] && atom_z(i) < hi[_z];
    const int excludedY = atom_y(i) < lo[_y] && atom_z(i) < hi[_z];
    const int excludedZ = atom_z(i) < lo[_z];

    if(excludedX || excludedY || excludedZ) {
        return 0;
    }
    return 1;
}
|
||||
|
||||
/*
 * Build the ghost-to-ghost neighbour lists used by the eight-shell method.
 *
 * Step 1: every ghost atom is classified with ghostZone(); the ghosts of
 *         zones 1..3 are collected, zone by zone, into neighbor->listshell
 *         (neighbor->Nshell entries).
 * Step 2: for each of those shell atoms the stencil bins around it are
 *         scanned and every atom j that passes the zone filter and lies
 *         within the neighbour cutoff is stored in its row of
 *         neighbor->neighshell; the row length goes to numNeighShell.
 *
 * Rows are neighbor->maxneighs entries wide. If a row fills up, maxneighs is
 * enlarged, neighshell is reallocated and the whole scan restarts.
 *
 * NOTE(review): enlarging neighbor->maxneighs here also changes the row
 * stride that buildNeighbor_cpu() used when it filled neighbor->neighbors.
 * Confirm that consumers of neighbor->neighbors tolerate this, or that the
 * local list is rebuilt afterwards.
 */
static void neighborGhost(Atom *atom, Neighbor *neighbor) {
    int Nshell = 0;
    int Nlocal = atom->Nlocal;
    int Nghost = atom->Nghost;

    if(neighbor->listshell) free(neighbor->listshell);
    neighbor->listshell = (int*) malloc((size_t) Nghost * sizeof(int));
    /* One Nghost-sized slot per zone so no zone list can overflow. */
    int* listzone = (int*) malloc((size_t) 8 * (size_t) Nghost * sizeof(int));
    int countAtoms[8] = {0,0,0,0,0,0,0,0};

    //Selecting ghost atoms for interaction
    for(int i = Nlocal; i < Nlocal+Nghost; i++) {
        int izone = ghostZone(atom,i);
        int *list = &listzone[(size_t) Nghost * izone];
        list[countAtoms[izone]++] = i;
    }

    /* Only zones 1..3 own ghost-ghost interactions. */
    for(int zone = 1; zone<=3; zone++){
        int *list = &listzone[(size_t) Nghost * zone];
        for(int n=0; n<countAtoms[zone]; n++)
            neighbor->listshell[Nshell++] = list[n];
    }

    neighbor->Nshell = Nshell;
    if(neighbor->numNeighShell) free(neighbor->numNeighShell);
    if(neighbor->neighshell) free(neighbor->neighshell);
    neighbor->neighshell = (int*) malloc((size_t) Nshell * (size_t) neighbor->maxneighs * sizeof(int));
    neighbor->numNeighShell = (int*) malloc((size_t) Nshell * sizeof(int));
    int resize = 1;

    while(resize)
    {
        resize = 0;
        for(int i = 0; i < Nshell; i++) {
            int *neighshell = &(neighbor->neighshell[(size_t) i * (size_t) neighbor->maxneighs]);
            int n = 0;
            int iatom = neighbor->listshell[i];
            int izone = ghostZone(atom, iatom);
            MD_FLOAT xtmp = atom_x(iatom);
            MD_FLOAT ytmp = atom_y(iatom);
            MD_FLOAT ztmp = atom_z(iatom);
            int ibin = coord2bin(xtmp, ytmp, ztmp);

            #ifdef EXPLICIT_TYPES
            int type_i = atom->type[iatom];
            #endif

            for(int k = 0; k < nstencil; k++) {
                int jbin = ibin + stencil[k];
                int* loc_bin = &bins[jbin * atoms_per_bin];
                for(int m = 0; m < bincount[jbin]; m++) {
                    int jatom = loc_bin[m];

                    int jzone = ghostZone(atom,jatom);

                    /* Zone filter: each pair is taken once, from the lower zone,
                       and only between zones that share an interaction. */
                    if(jzone <=izone) continue;
                    if(izone == 1 && (jzone==5||jzone==6||jzone==7)) continue;
                    if(izone == 2 && (jzone==4||jzone==6||jzone==7)) continue;
                    if(izone == 3 && (jzone==4||jzone==5||jzone==7)) continue;

                    MD_FLOAT delx = xtmp - atom_x(jatom);
                    MD_FLOAT dely = ytmp - atom_y(jatom);
                    MD_FLOAT delz = ztmp - atom_z(jatom);
                    MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;

                    #ifdef EXPLICIT_TYPES
                    int type_j = atom->type[jatom];
                    const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j];
                    #else
                    const MD_FLOAT cutoff = cutneighsq;
                    #endif
                    if(rsq <= cutoff) {
                        /* Keep counting past the row capacity so the resize
                           below sees the real demand, but never write outside
                           this atom's row (the last row ends the buffer). */
                        if(n < neighbor->maxneighs) {
                            neighshell[n] = jatom;
                        }
                        n++;
                    }
                }
            }

            neighbor->numNeighShell[i] = n;
            if(n >= neighbor->maxneighs){
                /* Grow by 20%, but always by at least one slot so that the
                   integer truncation cannot leave the capacity unchanged. */
                int grown = (int) (n * 1.2);
                if(grown <= n) grown = n + 1;
                resize = 1;
                neighbor->maxneighs = grown;
                break;
            }
        }

        if(resize) {
            free(neighbor->neighshell);
            neighbor->neighshell = (int*) malloc((size_t) Nshell * (size_t) neighbor->maxneighs * sizeof(int));
        }
    }
    free(listzone);
}
|
||||
|
||||
/*
 * Decide whether the pair (i, j) is accepted; returns 1 if so, 0 otherwise.
 *
 *  - j is a local atom (j < atom->Nlocal): accepted exactly when i < j.
 *  - j is not local (j >= atom->Nlocal): accepted when, comparing coordinates
 *    in the order z, y, x, the first coordinate that is not Equal() satisfies
 *    z(j) > z(i), or y(j) < y(i), or x(j) < x(i) respectively.
 */
static inline int interaction(Atom* atom, int i, int j) {
    /* Local partner: plain index ordering decides. */
    if(j < atom->Nlocal) {
        return i < j;
    }

    /* Non-local partner: z first ... */
    if(atom_z(j) > atom_z(i)) return 1;
    if(!Equal(atom_z(j), atom_z(i))) return 0;

    /* ... then y on equal z ... */
    if(atom_y(j) < atom_y(i)) return 1;
    if(!Equal(atom_y(j), atom_y(i))) return 0;

    /* ... and finally x on equal z and y. */
    return atom_x(j) < atom_x(i);
}
|
||||
|
||||
@@ -125,7 +125,7 @@ void setupPbc(Atom *atom, Parameter *param) {
|
||||
if(param->pbc_x != 0 && param->pbc_y != 0 && param->pbc_z != 0) {
|
||||
if (x < Cutneigh && y < Cutneigh && z < Cutneigh) { ADDGHOST(+1,+1,+1); }
|
||||
if (x < Cutneigh && y >= (yprd-Cutneigh) && z < Cutneigh) { ADDGHOST(+1,-1,+1); }
|
||||
if (x < Cutneigh && y >= Cutneigh && z >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); }
|
||||
if (x < Cutneigh && y < Cutneigh && z >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); }
|
||||
if (x < Cutneigh && y >= (yprd-Cutneigh) && z >= (zprd-Cutneigh)) { ADDGHOST(+1,-1,-1); }
|
||||
if (x >= (xprd-Cutneigh) && y < Cutneigh && z < Cutneigh) { ADDGHOST(-1,+1,+1); }
|
||||
if (x >= (xprd-Cutneigh) && y >= (yprd-Cutneigh) && z < Cutneigh) { ADDGHOST(-1,-1,+1); }
|
||||
|
||||
173
lammps/vtk.c
173
lammps/vtk.c
@@ -6,8 +6,12 @@
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <vtk.h>
|
||||
#include <mpi.h>
|
||||
|
||||
#include <atom.h>
|
||||
static MPI_File _fh;
|
||||
static inline void flushBuffer(char*);
|
||||
|
||||
int write_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
|
||||
char timestep_filename[128];
|
||||
@@ -18,12 +22,12 @@ int write_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
|
||||
fprintf(stderr, "Could not open VTK file for writing!\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
fprintf(fp, "# vtk DataFile Version 2.0\n");
|
||||
fprintf(fp, "Particle data\n");
|
||||
fprintf(fp, "ASCII\n");
|
||||
fprintf(fp, "DATASET UNSTRUCTURED_GRID\n");
|
||||
fprintf(fp, "POINTS %d double\n", atom->Nlocal);
|
||||
|
||||
for(int i = 0; i < atom->Nlocal; ++i) {
|
||||
fprintf(fp, "%.4f %.4f %.4f\n", atom_x(i), atom_y(i), atom_z(i));
|
||||
}
|
||||
@@ -48,3 +52,168 @@ int write_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
|
||||
fclose(fp);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Open "<filename>_<timestep>.vtk" on MPI_COMM_WORLD for parallel writing and
 * let rank 0 emit the legacy-VTK header up to and including the POINTS line
 * (sized with atom->Natoms).
 *
 * The handle is kept in the file-static _fh for vtkVector()/vtkClose().
 * Returns 0 on success and -1 if the file could not be opened.
 *
 * NOTE(review): the file is opened with MPI_MODE_CREATE only, so an already
 * existing file is not truncated and flushBuffer() appends at its current
 * size - confirm this is intended.
 */
int vtkOpen(const char* filename, Comm* comm, Atom* atom ,int timestep)
{
    char msg[256];
    char timestep_filename[128];
    snprintf(timestep_filename, sizeof timestep_filename, "%s_%d.vtk", filename, timestep);

    // Check the return code as well: the handle value after a failed open is
    // not something to rely on.
    int err = MPI_File_open(MPI_COMM_WORLD, timestep_filename, MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &_fh);
    if(err != MPI_SUCCESS || _fh == MPI_FILE_NULL) {
        _fh = MPI_FILE_NULL;
        if(comm->myproc == 0) fprintf(stderr, "Could not open VTK file for writing!\n");
        return -1;
    }

    if (comm->myproc==0){
        // Format the whole header in one bounded call; feeding msg back into
        // sprintf as a "%s" argument overlaps source and destination.
        snprintf(msg, sizeof msg,
                 "# vtk DataFile Version 2.0\n"
                 "Particle data\n"
                 "ASCII\n"
                 "DATASET UNSTRUCTURED_GRID\n"
                 "POINTS %d double\n",
                 atom->Natoms);
        flushBuffer(msg);
    }
    return 0;
}
|
||||
|
||||
int vtkVector(Comm* comm, Atom* atom, Parameter* param)
|
||||
{
|
||||
if (_fh == MPI_FILE_NULL) {
|
||||
if(comm->myproc==0) printf("vtk not initialize! Call vtkOpen first!\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
int sizeline= 25; //#initial guess of characters in "%.4f %.4f %.4f\n"
|
||||
int extrabuff = 100;
|
||||
int sizebuff = sizeline*atom->Nlocal+extrabuff;
|
||||
int mysize = 0;
|
||||
char* msg = (char*) malloc(sizebuff);
|
||||
sprintf(msg, "");
|
||||
for(int i = 0; i < atom->Nlocal; i++){
|
||||
if(mysize+extrabuff >= sizebuff){
|
||||
sizebuff*= 1.5;
|
||||
msg = (char*) realloc(msg, sizebuff);
|
||||
}
|
||||
//TODO: do not forget to add param->xlo, param->ylo, param->zlo
|
||||
sprintf(msg, "%s%.4f %.4f %.4f\n",msg, atom_x(i), atom_y(i), atom_z(i));
|
||||
mysize = strlen(msg);
|
||||
}
|
||||
int gatherSize[comm->numproc];
|
||||
|
||||
MPI_Allgather(&mysize, 1, MPI_INT, gatherSize, 1, MPI_INT, MPI_COMM_WORLD);
|
||||
int offset=0;
|
||||
int globalSize = 0;
|
||||
|
||||
for(int i = 0; i < comm->myproc; i++)
|
||||
offset+= gatherSize[i];
|
||||
|
||||
for(int i = 0; i < comm->numproc; i++)
|
||||
globalSize+= gatherSize[i];
|
||||
|
||||
MPI_Offset displ;
|
||||
MPI_Datatype FileType;
|
||||
int GlobalSize[] = {globalSize};
|
||||
int LocalSize[] = {mysize};
|
||||
int Start[] = {offset};
|
||||
|
||||
if(LocalSize[0]>0){
|
||||
MPI_Type_create_subarray(1, GlobalSize, LocalSize, Start, MPI_ORDER_C, MPI_CHAR, &FileType);
|
||||
} else {
|
||||
MPI_Type_vector(0,0,0,MPI_CHAR,&FileType);
|
||||
}
|
||||
MPI_Type_commit(&FileType);
|
||||
MPI_File_get_size(_fh, &displ);
|
||||
MPI_File_set_view(_fh, displ, MPI_CHAR, FileType, "native", MPI_INFO_NULL);
|
||||
MPI_File_write_all (_fh, msg, mysize , MPI_CHAR ,MPI_STATUS_IGNORE);
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
MPI_File_set_view(_fh,0,MPI_CHAR, MPI_CHAR, "native", MPI_INFO_NULL);
|
||||
|
||||
if (comm->myproc==0){
|
||||
|
||||
sprintf(msg, "\n\n");
|
||||
sprintf(msg, "%sCELLS %d %d\n", msg, atom->Natoms, atom->Natoms * 2);
|
||||
|
||||
for(int i = 0; i < atom->Natoms; i++)
|
||||
sprintf(msg, "%s1 %d\n", msg, i);
|
||||
flushBuffer(msg);
|
||||
|
||||
sprintf(msg, "\n\n");
|
||||
sprintf(msg, "%sCELL_TYPES %d\n",msg, atom->Natoms);
|
||||
for(int i = 0; i < atom->Natoms; i++)
|
||||
sprintf(msg, "%s1\n",msg);
|
||||
flushBuffer(msg);
|
||||
|
||||
sprintf(msg, "\n\n");
|
||||
sprintf(msg, "%sPOINT_DATA %d\n",msg,atom->Natoms);
|
||||
sprintf(msg, "%sSCALARS mass double\n",msg);
|
||||
sprintf(msg, "%sLOOKUP_TABLE default\n",msg);
|
||||
for(int i = 0; i < atom->Natoms; i++)
|
||||
sprintf(msg, "%s1.0\n",msg);
|
||||
sprintf(msg, "%s\n\n",msg);
|
||||
flushBuffer(msg);
|
||||
}
|
||||
}
|
||||
|
||||
void vtkClose()
|
||||
{
|
||||
MPI_File_close(&_fh);
|
||||
_fh=MPI_FILE_NULL;
|
||||
}
|
||||
|
||||
/*
 * Dump the ghost atoms of one process to
 * "<filename>_<timestep>_ghost<me>.vtk" as a legacy-VTK unstructured grid
 * with one vertex cell per ghost atom.
 *
 * Ghost atoms are the entries atom->Nlocal .. atom->Nlocal+atom->Nghost-1.
 * Every section is sized with atom->Nghost, and cells reference the points
 * by their position in the POINTS list (0 .. Nghost-1), not by the atom
 * index.
 *
 * Returns 0 on success, -1 if the file could not be opened.
 */
int printGhost(const char* filename, Atom* atom, int timestep, int me) {
    char timestep_filename[128];
    snprintf(timestep_filename, sizeof timestep_filename, "%s_%d_ghost%i.vtk", filename, timestep,me);
    FILE* fp = fopen(timestep_filename, "wb");

    if(fp == NULL) {
        fprintf(stderr, "Could not open VTK file for writing!\n");
        return -1;
    }

    const int first = atom->Nlocal;
    const int nghost = atom->Nghost;

    fprintf(fp, "# vtk DataFile Version 2.0\n");
    fprintf(fp, "Particle data\n");
    fprintf(fp, "ASCII\n");
    fprintf(fp, "DATASET UNSTRUCTURED_GRID\n");
    fprintf(fp, "POINTS %d double\n", nghost);

    for(int i = first; i < first + nghost; ++i) {
        fprintf(fp, "%.4f %.4f %.4f\n", atom_x(i), atom_y(i), atom_z(i));
    }
    fprintf(fp, "\n\n");

    // One single-point cell per ghost atom: "1 <point index>".
    fprintf(fp, "CELLS %d %d\n", nghost, nghost * 2);
    for(int p = 0; p < nghost; ++p) {
        fprintf(fp, "1 %d\n", p);
    }
    fprintf(fp, "\n\n");

    fprintf(fp, "CELL_TYPES %d\n", nghost);
    for(int p = 0; p < nghost; ++p) {
        fprintf(fp, "1\n");
    }
    fprintf(fp, "\n\n");

    fprintf(fp, "POINT_DATA %d\n", nghost);
    fprintf(fp, "SCALARS mass double\n");
    fprintf(fp, "LOOKUP_TABLE default\n");
    for(int p = 0; p < nghost; ++p) {
        fprintf(fp, "1.0\n");
    }
    fprintf(fp, "\n\n");
    fclose(fp);
    return 0;
}
|
||||
|
||||
/*
 * Write the atom positions of the given timestep to a VTK file.
 * With more than one process the MPI-IO path (vtkOpen / vtkVector /
 * vtkClose) is taken; a single process uses write_atoms_to_vtk_file().
 */
void printvtk(const char* filename, Comm* comm, Atom* atom ,Parameter* param, int timestep)
{
    if(comm->numproc != 1) {
        vtkOpen(filename, comm, atom, timestep);
        vtkVector(comm, atom, param);
        vtkClose();
        //printGhost(filename, atom, timestep, comm->myproc);
    } else {
        write_atoms_to_vtk_file(filename, atom, timestep);
    }
}
|
||||
|
||||
static inline void flushBuffer(char* msg){
|
||||
MPI_Offset displ;
|
||||
MPI_File_get_size(_fh, &displ);
|
||||
MPI_File_write_at(_fh, displ, msg, strlen(msg), MPI_CHAR, MPI_STATUS_IGNORE);
|
||||
}
|
||||
@@ -1,88 +0,0 @@
|
||||
--------------------------------------------------------------------------------
|
||||
CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
|
||||
CPU type: Intel Cascadelake SP processor
|
||||
CPU clock: 2.49 GHz
|
||||
--------------------------------------------------------------------------------
|
||||
Initializing parameters...
|
||||
Initializing atoms...
|
||||
Creating atoms...
|
||||
Pattern: seq
|
||||
Number of timesteps: 200
|
||||
Number of atoms: 256
|
||||
Number of neighbors per atom: 1024
|
||||
Number of times to replicate neighbor lists: 1
|
||||
Estimated total data volume (kB): 1062.9120
|
||||
Estimated atom data volume (kB): 6.1440
|
||||
Estimated neighborlist data volume (kB): 1050.6240
|
||||
Initializing neighbor lists...
|
||||
Creating neighbor lists...
|
||||
Computing forces...
|
||||
Total time: 0.2735, Mega atom updates/s: 0.1872
|
||||
Cycles per atom: 10682.8568, Cycles per neighbor: 10.4325
|
||||
Statistics:
|
||||
Vector width: 8, Processor frequency: 2.0000 GHz
|
||||
Average neighbors per atom: 1018.9055
|
||||
Average SIMD iterations per atom: 127.3632
|
||||
Total number of computed pair interactions: 52428800
|
||||
Total number of SIMD iterations: 6553600
|
||||
Useful read data volume for force computation: 1.47GB
|
||||
Cycles/SIMD iteration: 83.4598
|
||||
--------------------------------------------------------------------------------
|
||||
Region force, Group 1: MEM_DP
|
||||
+-------------------+------------+
|
||||
| Region Info | HWThread 0 |
|
||||
+-------------------+------------+
|
||||
| RDTSC Runtime [s] | 0.110776 |
|
||||
| call count | 200 |
|
||||
+-------------------+------------+
|
||||
|
||||
+------------------------------------------+---------+------------+
|
||||
| Event | Counter | HWThread 0 |
|
||||
+------------------------------------------+---------+------------+
|
||||
| INSTR_RETIRED_ANY | FIXC0 | 267036300 |
|
||||
| CPU_CLK_UNHALTED_CORE | FIXC1 | 219034500 |
|
||||
| CPU_CLK_UNHALTED_REF | FIXC2 | 273793400 |
|
||||
| PWR_PKG_ENERGY | PWR0 | 10.9296 |
|
||||
| PWR_DRAM_ENERGY | PWR3 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | PMC0 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | PMC1 | 159400 |
|
||||
| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | PMC2 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 197068800 |
|
||||
| CAS_COUNT_RD | MBOX0C0 | 8643 |
|
||||
| CAS_COUNT_WR | MBOX0C1 | 1367 |
|
||||
| CAS_COUNT_RD | MBOX1C0 | 9124 |
|
||||
| CAS_COUNT_WR | MBOX1C1 | 1354 |
|
||||
| CAS_COUNT_RD | MBOX2C0 | 9138 |
|
||||
| CAS_COUNT_WR | MBOX2C1 | 1356 |
|
||||
| CAS_COUNT_RD | MBOX3C0 | 5586 |
|
||||
| CAS_COUNT_WR | MBOX3C1 | 1297 |
|
||||
| CAS_COUNT_RD | MBOX4C0 | 5328 |
|
||||
| CAS_COUNT_WR | MBOX4C1 | 1269 |
|
||||
| CAS_COUNT_RD | MBOX5C0 | 5280 |
|
||||
| CAS_COUNT_WR | MBOX5C1 | 1295 |
|
||||
+------------------------------------------+---------+------------+
|
||||
|
||||
+-----------------------------------+------------+
|
||||
| Metric | HWThread 0 |
|
||||
+-----------------------------------+------------+
|
||||
| Runtime (RDTSC) [s] | 0.1108 |
|
||||
| Runtime unhalted [s] | 0.0878 |
|
||||
| Clock [MHz] | 1995.2564 |
|
||||
| CPI | 0.8202 |
|
||||
| Energy [J] | 10.9296 |
|
||||
| Power [W] | 98.6643 |
|
||||
| Energy DRAM [J] | 0 |
|
||||
| Power DRAM [W] | 0 |
|
||||
| DP [MFLOP/s] | 14233.3287 |
|
||||
| AVX DP [MFLOP/s] | 14231.8898 |
|
||||
| Packed [MUOPS/s] | 1778.9862 |
|
||||
| Scalar [MUOPS/s] | 1.4389 |
|
||||
| Memory read bandwidth [MBytes/s] | 24.9001 |
|
||||
| Memory read data volume [GBytes] | 0.0028 |
|
||||
| Memory write bandwidth [MBytes/s] | 4.5861 |
|
||||
| Memory write data volume [GBytes] | 0.0005 |
|
||||
| Memory bandwidth [MBytes/s] | 29.4863 |
|
||||
| Memory data volume [GBytes] | 0.0033 |
|
||||
| Operational intensity | 482.7104 |
|
||||
+-----------------------------------+------------+
|
||||
|
||||
@@ -1,168 +0,0 @@
|
||||
--------------------------------------------------------------------------------
|
||||
CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
|
||||
CPU type: Intel Cascadelake SP processor
|
||||
CPU clock: 2.49 GHz
|
||||
--------------------------------------------------------------------------------
|
||||
Parameters:
|
||||
Force field: lj
|
||||
Kernel: plain-C
|
||||
Data layout: AoS
|
||||
Floating-point precision: double
|
||||
Unit cells (nx, ny, nz): 32, 32, 32
|
||||
Domain box sizes (x, y, z): 5.374708e+01, 5.374708e+01, 5.374708e+01
|
||||
Periodic (x, y, z): 1, 1, 1
|
||||
Lattice size: 1.679596e+00
|
||||
Epsilon: 1.000000e+00
|
||||
Sigma: 1.000000e+00
|
||||
Spring constant: 1.000000e+00
|
||||
Damping constant: 1.000000e+00
|
||||
Temperature: 1.440000e+00
|
||||
RHO: 8.442000e-01
|
||||
Mass: 1.000000e+00
|
||||
Number of types: 4
|
||||
Number of timesteps: 200
|
||||
Report stats every (timesteps): 100
|
||||
Reneighbor every (timesteps): 20
|
||||
Prune every (timesteps): 1000
|
||||
Output positions every (timesteps): 20
|
||||
Output velocities every (timesteps): 5
|
||||
Delta time (dt): 5.000000e-03
|
||||
Cutoff radius: 2.500000e+00
|
||||
Skin: 3.000000e-01
|
||||
Half neighbor lists: 0
|
||||
Processor frequency (GHz): 2.0000
|
||||
----------------------------------------------------------------------------
|
||||
step temp pressure
|
||||
0 1.440000e+00 1.215639e+00
|
||||
100 8.200895e-01 6.923143e-01
|
||||
200 7.961495e-01 6.721043e-01
|
||||
----------------------------------------------------------------------------
|
||||
System: 131072 atoms 47265 ghost atoms, Steps: 200
|
||||
TOTAL 11.50s FORCE 5.28s NEIGH 5.91s REST 0.31s
|
||||
----------------------------------------------------------------------------
|
||||
Performance: 2.28 million atom updates per second
|
||||
Statistics:
|
||||
Vector width: 8, Processor frequency: 2.0000 GHz
|
||||
Average neighbors per atom: 76.0352
|
||||
Average SIMD iterations per atom: 9.9181
|
||||
Total number of computed pair interactions: 2003182862
|
||||
Total number of SIMD iterations: 261297661
|
||||
Useful read data volume for force computation: 57.46GB
|
||||
Cycles/SIMD iteration: 40.4432
|
||||
--------------------------------------------------------------------------------
|
||||
Region force, Group 1: MEM_DP
|
||||
+-------------------+------------+
|
||||
| Region Info | HWThread 0 |
|
||||
+-------------------+------------+
|
||||
| RDTSC Runtime [s] | 5.115807 |
|
||||
| call count | 201 |
|
||||
+-------------------+------------+
|
||||
|
||||
+------------------------------------------+---------+-------------+
|
||||
| Event | Counter | HWThread 0 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
| INSTR_RETIRED_ANY | FIXC0 | 12592470000 |
|
||||
| CPU_CLK_UNHALTED_CORE | FIXC1 | 10196910000 |
|
||||
| CPU_CLK_UNHALTED_REF | FIXC2 | 12746120000 |
|
||||
| PWR_PKG_ENERGY | PWR0 | 307.9429 |
|
||||
| PWR_DRAM_ENERGY | PWR3 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | PMC0 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | PMC1 | 79042240 |
|
||||
| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | PMC2 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 8076039000 |
|
||||
| CAS_COUNT_RD | MBOX0C0 | 22734550 |
|
||||
| CAS_COUNT_WR | MBOX0C1 | 1147714 |
|
||||
| CAS_COUNT_RD | MBOX1C0 | 22755180 |
|
||||
| CAS_COUNT_WR | MBOX1C1 | 1144415 |
|
||||
| CAS_COUNT_RD | MBOX2C0 | 22762780 |
|
||||
| CAS_COUNT_WR | MBOX2C1 | 1129051 |
|
||||
| CAS_COUNT_RD | MBOX3C0 | 22905660 |
|
||||
| CAS_COUNT_WR | MBOX3C1 | 1143324 |
|
||||
| CAS_COUNT_RD | MBOX4C0 | 22914860 |
|
||||
| CAS_COUNT_WR | MBOX4C1 | 1169116 |
|
||||
| CAS_COUNT_RD | MBOX5C0 | 22890220 |
|
||||
| CAS_COUNT_WR | MBOX5C1 | 1180739 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
|
||||
+-----------------------------------+------------+
|
||||
| Metric | HWThread 0 |
|
||||
+-----------------------------------+------------+
|
||||
| Runtime (RDTSC) [s] | 5.1158 |
|
||||
| Runtime unhalted [s] | 4.0885 |
|
||||
| Clock [MHz] | 1995.2508 |
|
||||
| CPI | 0.8098 |
|
||||
| Energy [J] | 307.9429 |
|
||||
| Power [W] | 60.1944 |
|
||||
| Energy DRAM [J] | 0 |
|
||||
| Power DRAM [W] | 0 |
|
||||
| DP [MFLOP/s] | 12644.6041 |
|
||||
| AVX DP [MFLOP/s] | 12629.1535 |
|
||||
| Packed [MUOPS/s] | 1578.6442 |
|
||||
| Scalar [MUOPS/s] | 15.4506 |
|
||||
| Memory read bandwidth [MBytes/s] | 1713.4438 |
|
||||
| Memory read data volume [GBytes] | 8.7656 |
|
||||
| Memory write bandwidth [MBytes/s] | 86.5003 |
|
||||
| Memory write data volume [GBytes] | 0.4425 |
|
||||
| Memory bandwidth [MBytes/s] | 1799.9442 |
|
||||
| Memory data volume [GBytes] | 9.2082 |
|
||||
| Operational intensity | 7.0250 |
|
||||
+-----------------------------------+------------+
|
||||
|
||||
Region reneighbour, Group 1: MEM_DP
|
||||
+-------------------+------------+
|
||||
| Region Info | HWThread 0 |
|
||||
+-------------------+------------+
|
||||
| RDTSC Runtime [s] | 5.897385 |
|
||||
| call count | 10 |
|
||||
+-------------------+------------+
|
||||
|
||||
+------------------------------------------+---------+-------------+
|
||||
| Event | Counter | HWThread 0 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
| INSTR_RETIRED_ANY | FIXC0 | 18212540000 |
|
||||
| CPU_CLK_UNHALTED_CORE | FIXC1 | 11728500000 |
|
||||
| CPU_CLK_UNHALTED_REF | FIXC2 | 14660630000 |
|
||||
| PWR_PKG_ENERGY | PWR0 | 338.9000 |
|
||||
| PWR_DRAM_ENERGY | PWR3 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | PMC0 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | PMC1 | 6240402000 |
|
||||
| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | PMC2 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 983040 |
|
||||
| CAS_COUNT_RD | MBOX0C0 | 2086787 |
|
||||
| CAS_COUNT_WR | MBOX0C1 | 1115626 |
|
||||
| CAS_COUNT_RD | MBOX1C0 | 2089964 |
|
||||
| CAS_COUNT_WR | MBOX1C1 | 1117021 |
|
||||
| CAS_COUNT_RD | MBOX2C0 | 2103832 |
|
||||
| CAS_COUNT_WR | MBOX2C1 | 1117965 |
|
||||
| CAS_COUNT_RD | MBOX3C0 | 2086930 |
|
||||
| CAS_COUNT_WR | MBOX3C1 | 1102471 |
|
||||
| CAS_COUNT_RD | MBOX4C0 | 2094688 |
|
||||
| CAS_COUNT_WR | MBOX4C1 | 1103018 |
|
||||
| CAS_COUNT_RD | MBOX5C0 | 2097438 |
|
||||
| CAS_COUNT_WR | MBOX5C1 | 1102525 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
|
||||
+-----------------------------------+------------+
|
||||
| Metric | HWThread 0 |
|
||||
+-----------------------------------+------------+
|
||||
| Runtime (RDTSC) [s] | 5.8974 |
|
||||
| Runtime unhalted [s] | 4.7026 |
|
||||
| Clock [MHz] | 1995.2473 |
|
||||
| CPI | 0.6440 |
|
||||
| Energy [J] | 338.9000 |
|
||||
| Power [W] | 57.4661 |
|
||||
| Energy DRAM [J] | 0 |
|
||||
| Power DRAM [W] | 0 |
|
||||
| DP [MFLOP/s] | 1059.4978 |
|
||||
| AVX DP [MFLOP/s] | 1.3335 |
|
||||
| Packed [MUOPS/s] | 0.1667 |
|
||||
| Scalar [MUOPS/s] | 1058.1643 |
|
||||
| Memory read bandwidth [MBytes/s] | 136.3006 |
|
||||
| Memory read data volume [GBytes] | 0.8038 |
|
||||
| Memory write bandwidth [MBytes/s] | 72.2612 |
|
||||
| Memory write data volume [GBytes] | 0.4262 |
|
||||
| Memory bandwidth [MBytes/s] | 208.5618 |
|
||||
| Memory data volume [GBytes] | 1.2300 |
|
||||
| Operational intensity | 5.0800 |
|
||||
+-----------------------------------+------------+
|
||||
|
||||
@@ -1,88 +0,0 @@
|
||||
--------------------------------------------------------------------------------
|
||||
CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
|
||||
CPU type: Intel Cascadelake SP processor
|
||||
CPU clock: 2.49 GHz
|
||||
--------------------------------------------------------------------------------
|
||||
Initializing parameters...
|
||||
Initializing atoms...
|
||||
Creating atoms...
|
||||
Pattern: seq
|
||||
Number of timesteps: 200
|
||||
Number of atoms: 256
|
||||
Number of neighbors per atom: 1024
|
||||
Number of times to replicate neighbor lists: 1
|
||||
Estimated total data volume (kB): 1056.7680
|
||||
Estimated atom data volume (kB): 3.0720
|
||||
Estimated neighborlist data volume (kB): 1050.6240
|
||||
Initializing neighbor lists...
|
||||
Creating neighbor lists...
|
||||
Computing forces...
|
||||
Total time: 0.2466, Mega atom updates/s: 0.2076
|
||||
Cycles per atom: 9631.9934, Cycles per neighbor: 9.4062
|
||||
Statistics:
|
||||
Vector width: 16, Processor frequency: 2.0000 GHz
|
||||
Average neighbors per atom: 1018.9055
|
||||
Average SIMD iterations per atom: 63.6816
|
||||
Total number of computed pair interactions: 52428800
|
||||
Total number of SIMD iterations: 3276800
|
||||
Useful read data volume for force computation: 0.84GB
|
||||
Cycles/SIMD iteration: 150.4999
|
||||
--------------------------------------------------------------------------------
|
||||
Region force, Group 1: MEM_SP
|
||||
+-------------------+------------+
|
||||
| Region Info | HWThread 0 |
|
||||
+-------------------+------------+
|
||||
| RDTSC Runtime [s] | 0.085843 |
|
||||
| call count | 200 |
|
||||
+-------------------+------------+
|
||||
|
||||
+------------------------------------------+---------+------------+
|
||||
| Event | Counter | HWThread 0 |
|
||||
+------------------------------------------+---------+------------+
|
||||
| INSTR_RETIRED_ANY | FIXC0 | 129769100 |
|
||||
| CPU_CLK_UNHALTED_CORE | FIXC1 | 172300100 |
|
||||
| CPU_CLK_UNHALTED_REF | FIXC2 | 215371300 |
|
||||
| PWR_PKG_ENERGY | PWR0 | 9.2849 |
|
||||
| PWR_DRAM_ENERGY | PWR3 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE | PMC0 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_SCALAR_SINGLE | PMC1 | 154000 |
|
||||
| FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | PMC2 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | PMC3 | 89088000 |
|
||||
| CAS_COUNT_RD | MBOX0C0 | 8354 |
|
||||
| CAS_COUNT_WR | MBOX0C1 | 1126 |
|
||||
| CAS_COUNT_RD | MBOX1C0 | 7863 |
|
||||
| CAS_COUNT_WR | MBOX1C1 | 1105 |
|
||||
| CAS_COUNT_RD | MBOX2C0 | 7990 |
|
||||
| CAS_COUNT_WR | MBOX2C1 | 1113 |
|
||||
| CAS_COUNT_RD | MBOX3C0 | 4775 |
|
||||
| CAS_COUNT_WR | MBOX3C1 | 1112 |
|
||||
| CAS_COUNT_RD | MBOX4C0 | 4201 |
|
||||
| CAS_COUNT_WR | MBOX4C1 | 1127 |
|
||||
| CAS_COUNT_RD | MBOX5C0 | 4035 |
|
||||
| CAS_COUNT_WR | MBOX5C1 | 1120 |
|
||||
+------------------------------------------+---------+------------+
|
||||
|
||||
+-----------------------------------+------------+
|
||||
| Metric | HWThread 0 |
|
||||
+-----------------------------------+------------+
|
||||
| Runtime (RDTSC) [s] | 0.0858 |
|
||||
| Runtime unhalted [s] | 0.0691 |
|
||||
| Clock [MHz] | 1995.2787 |
|
||||
| CPI | 1.3277 |
|
||||
| Energy [J] | 9.2849 |
|
||||
| Power [W] | 108.1610 |
|
||||
| Energy DRAM [J] | 0 |
|
||||
| Power DRAM [W] | 0 |
|
||||
| SP [MFLOP/s] | 16606.5397 |
|
||||
| AVX SP [MFLOP/s] | 16604.7458 |
|
||||
| Packed [MUOPS/s] | 1037.7966 |
|
||||
| Scalar [MUOPS/s] | 1.7940 |
|
||||
| Memory read bandwidth [MBytes/s] | 27.7476 |
|
||||
| Memory read data volume [GBytes] | 0.0024 |
|
||||
| Memory write bandwidth [MBytes/s] | 4.9974 |
|
||||
| Memory write data volume [GBytes] | 0.0004 |
|
||||
| Memory bandwidth [MBytes/s] | 32.7450 |
|
||||
| Memory data volume [GBytes] | 0.0028 |
|
||||
| Operational intensity | 507.1471 |
|
||||
+-----------------------------------+------------+
|
||||
|
||||
@@ -1,168 +0,0 @@
|
||||
--------------------------------------------------------------------------------
|
||||
CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
|
||||
CPU type: Intel Cascadelake SP processor
|
||||
CPU clock: 2.49 GHz
|
||||
--------------------------------------------------------------------------------
|
||||
Parameters:
|
||||
Force field: lj
|
||||
Kernel: plain-C
|
||||
Data layout: AoS
|
||||
Floating-point precision: single
|
||||
Unit cells (nx, ny, nz): 32, 32, 32
|
||||
Domain box sizes (x, y, z): 5.374708e+01, 5.374708e+01, 5.374708e+01
|
||||
Periodic (x, y, z): 1, 1, 1
|
||||
Lattice size: 1.679596e+00
|
||||
Epsilon: 1.000000e+00
|
||||
Sigma: 1.000000e+00
|
||||
Spring constant: 1.000000e+00
|
||||
Damping constant: 1.000000e+00
|
||||
Temperature: 1.440000e+00
|
||||
RHO: 8.442000e-01
|
||||
Mass: 1.000000e+00
|
||||
Number of types: 4
|
||||
Number of timesteps: 200
|
||||
Report stats every (timesteps): 100
|
||||
Reneighbor every (timesteps): 20
|
||||
Prune every (timesteps): 1000
|
||||
Output positions every (timesteps): 20
|
||||
Output velocities every (timesteps): 5
|
||||
Delta time (dt): 5.000000e-03
|
||||
Cutoff radius: 2.500000e+00
|
||||
Skin: 3.000000e-01
|
||||
Half neighbor lists: 0
|
||||
Processor frequency (GHz): 2.0000
|
||||
----------------------------------------------------------------------------
|
||||
step temp pressure
|
||||
0 1.440000e+00 1.215639e+00
|
||||
100 8.200897e-01 6.923144e-01
|
||||
200 7.961481e-01 6.721031e-01
|
||||
----------------------------------------------------------------------------
|
||||
System: 131072 atoms 47265 ghost atoms, Steps: 200
|
||||
TOTAL 10.83s FORCE 4.62s NEIGH 5.94s REST 0.26s
|
||||
----------------------------------------------------------------------------
|
||||
Performance: 2.42 million atom updates per second
|
||||
Statistics:
|
||||
Vector width: 16, Processor frequency: 2.0000 GHz
|
||||
Average neighbors per atom: 76.0351
|
||||
Average SIMD iterations per atom: 5.0875
|
||||
Total number of computed pair interactions: 2003181259
|
||||
Total number of SIMD iterations: 134032075
|
||||
Useful read data volume for force computation: 32.79GB
|
||||
Cycles/SIMD iteration: 68.9511
|
||||
--------------------------------------------------------------------------------
|
||||
Region force, Group 1: MEM_SP
|
||||
+-------------------+------------+
|
||||
| Region Info | HWThread 0 |
|
||||
+-------------------+------------+
|
||||
| RDTSC Runtime [s] | 4.452877 |
|
||||
| call count | 201 |
|
||||
+-------------------+------------+
|
||||
|
||||
+------------------------------------------+---------+-------------+
|
||||
| Event | Counter | HWThread 0 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
| INSTR_RETIRED_ANY | FIXC0 | 7428719000 |
|
||||
| CPU_CLK_UNHALTED_CORE | FIXC1 | 8875251000 |
|
||||
| CPU_CLK_UNHALTED_REF | FIXC2 | 11094050000 |
|
||||
| PWR_PKG_ENERGY | PWR0 | 265.5057 |
|
||||
| PWR_DRAM_ENERGY | PWR3 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE | PMC0 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_SCALAR_SINGLE | PMC1 | 79036820 |
|
||||
| FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | PMC2 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | PMC3 | 3935012000 |
|
||||
| CAS_COUNT_RD | MBOX0C0 | 19716700 |
|
||||
| CAS_COUNT_WR | MBOX0C1 | 595747 |
|
||||
| CAS_COUNT_RD | MBOX1C0 | 19734880 |
|
||||
| CAS_COUNT_WR | MBOX1C1 | 597090 |
|
||||
| CAS_COUNT_RD | MBOX2C0 | 19732800 |
|
||||
| CAS_COUNT_WR | MBOX2C1 | 595219 |
|
||||
| CAS_COUNT_RD | MBOX3C0 | 19886430 |
|
||||
| CAS_COUNT_WR | MBOX3C1 | 632443 |
|
||||
| CAS_COUNT_RD | MBOX4C0 | 19887210 |
|
||||
| CAS_COUNT_WR | MBOX4C1 | 633169 |
|
||||
| CAS_COUNT_RD | MBOX5C0 | 19935560 |
|
||||
| CAS_COUNT_WR | MBOX5C1 | 634112 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
|
||||
+-----------------------------------+------------+
|
||||
| Metric | HWThread 0 |
|
||||
+-----------------------------------+------------+
|
||||
| Runtime (RDTSC) [s] | 4.4529 |
|
||||
| Runtime unhalted [s] | 3.5585 |
|
||||
| Clock [MHz] | 1995.2693 |
|
||||
| CPI | 1.1947 |
|
||||
| Energy [J] | 265.5057 |
|
||||
| Power [W] | 59.6257 |
|
||||
| Energy DRAM [J] | 0 |
|
||||
| Power DRAM [W] | 0 |
|
||||
| SP [MFLOP/s] | 14156.9661 |
|
||||
| AVX SP [MFLOP/s] | 14139.2165 |
|
||||
| Packed [MUOPS/s] | 883.7010 |
|
||||
| Scalar [MUOPS/s] | 17.7496 |
|
||||
| Memory read bandwidth [MBytes/s] | 1708.8254 |
|
||||
| Memory read data volume [GBytes] | 7.6092 |
|
||||
| Memory write bandwidth [MBytes/s] | 53.0035 |
|
||||
| Memory write data volume [GBytes] | 0.2360 |
|
||||
| Memory bandwidth [MBytes/s] | 1761.8288 |
|
||||
| Memory data volume [GBytes] | 7.8452 |
|
||||
| Operational intensity | 8.0354 |
|
||||
+-----------------------------------+------------+
|
||||
|
||||
Region reneighbour, Group 1: MEM_SP
|
||||
+-------------------+------------+
|
||||
| Region Info | HWThread 0 |
|
||||
+-------------------+------------+
|
||||
| RDTSC Runtime [s] | 5.935627 |
|
||||
| call count | 10 |
|
||||
+-------------------+------------+
|
||||
|
||||
+------------------------------------------+---------+-------------+
|
||||
| Event | Counter | HWThread 0 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
| INSTR_RETIRED_ANY | FIXC0 | 18208530000 |
|
||||
| CPU_CLK_UNHALTED_CORE | FIXC1 | 11805500000 |
|
||||
| CPU_CLK_UNHALTED_REF | FIXC2 | 14756870000 |
|
||||
| PWR_PKG_ENERGY | PWR0 | 340.7903 |
|
||||
| PWR_DRAM_ENERGY | PWR3 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE | PMC0 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_SCALAR_SINGLE | PMC1 | 6240406000 |
|
||||
| FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | PMC2 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | PMC3 | 491520 |
|
||||
| CAS_COUNT_RD | MBOX0C0 | 1772377 |
|
||||
| CAS_COUNT_WR | MBOX0C1 | 975760 |
|
||||
| CAS_COUNT_RD | MBOX1C0 | 1770611 |
|
||||
| CAS_COUNT_WR | MBOX1C1 | 977433 |
|
||||
| CAS_COUNT_RD | MBOX2C0 | 1771722 |
|
||||
| CAS_COUNT_WR | MBOX2C1 | 979122 |
|
||||
| CAS_COUNT_RD | MBOX3C0 | 1782901 |
|
||||
| CAS_COUNT_WR | MBOX3C1 | 967621 |
|
||||
| CAS_COUNT_RD | MBOX4C0 | 1780789 |
|
||||
| CAS_COUNT_WR | MBOX4C1 | 967179 |
|
||||
| CAS_COUNT_RD | MBOX5C0 | 1784733 |
|
||||
| CAS_COUNT_WR | MBOX5C1 | 969349 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
|
||||
+-----------------------------------+------------+
|
||||
| Metric | HWThread 0 |
|
||||
+-----------------------------------+------------+
|
||||
| Runtime (RDTSC) [s] | 5.9356 |
|
||||
| Runtime unhalted [s] | 4.7334 |
|
||||
| Clock [MHz] | 1995.2675 |
|
||||
| CPI | 0.6483 |
|
||||
| Energy [J] | 340.7903 |
|
||||
| Power [W] | 57.4144 |
|
||||
| Energy DRAM [J] | 0 |
|
||||
| Power DRAM [W] | 0 |
|
||||
| SP [MFLOP/s] | 1052.6723 |
|
||||
| AVX SP [MFLOP/s] | 1.3249 |
|
||||
| Packed [MUOPS/s] | 0.0828 |
|
||||
| Scalar [MUOPS/s] | 1051.3474 |
|
||||
| Memory read bandwidth [MBytes/s] | 114.9736 |
|
||||
| Memory read data volume [GBytes] | 0.6824 |
|
||||
| Memory write bandwidth [MBytes/s] | 62.9308 |
|
||||
| Memory write data volume [GBytes] | 0.3735 |
|
||||
| Memory bandwidth [MBytes/s] | 177.9044 |
|
||||
| Memory data volume [GBytes] | 1.0560 |
|
||||
| Operational intensity | 5.9171 |
|
||||
+-----------------------------------+------------+
|
||||
|
||||
@@ -1,148 +0,0 @@
|
||||
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||
Analyzed File - gromacs-avx512-dp-ICX.o
|
||||
Binary Format - 64Bit
|
||||
Architecture - SKX
|
||||
Analysis Type - Throughput
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
Block Throughput: 47.68 Cycles Throughput Bottleneck: Backend
|
||||
Loop Count: 22
|
||||
Port Binding In Cycles Per Iteration:
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Cycles | 42.0 0.0 | 12.5 | 5.0 5.0 | 5.0 5.0 | 0.0 | 42.0 | 12.5 | 0.0 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
|
||||
DV - Divider pipe (on port 0)
|
||||
D - Data fetch pipe (on ports 2 and 3)
|
||||
F - Macro Fusion with the previous instruction occurred
|
||||
* - instruction micro-ops not bound to a port
|
||||
^ - Micro Fusion occurred
|
||||
# - ESP Tracking sync uop was issued
|
||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||
X - instruction not supported, was not accounted in Analysis
|
||||
|
||||
| Num Of | Ports pressure in cycles | |
|
||||
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
-----------------------------------------------------------------------------------------
|
||||
| 1 | | | 1.0 1.0 | | | | | | movsxd rbx, dword ptr [r12+r14*4]
|
||||
| 1 | | 1.0 | | | | | | | lea rcx, ptr [rbx+rbx*2]
|
||||
| 1 | | | | | | | 1.0 | | shl rcx, 0x6
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovapd zmm29, zmmword ptr [rsi+rcx*1]
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovapd zmm30, zmmword ptr [rsi+rcx*1+0x40]
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovapd zmm31, zmmword ptr [rsi+rcx*1+0x80]
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovupd zmm3, zmmword ptr [rsp+0x40]
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm4, zmm3, zmm29
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovupd zmm3, zmmword ptr [rsp+0x140]
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm3, zmm3, zmm30
|
||||
| 1 | | 1.0 | | | | | | | lea ecx, ptr [rbx+rbx*1]
|
||||
| 1 | | | | | | | 1.0 | | cmp rdi, rcx
|
||||
| 1 | | | | | | | 1.0 | | setnz dl
|
||||
| 1 | | | | | | | 1.0 | | setz cl
|
||||
| 1 | | 1.0 | | | | | | | lea ebx, ptr [rbx+rbx*1+0x1]
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm17, zmm25, zmm31
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm17, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm18, zmm3, zmm3
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm18, zmm4, zmm4
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm19, zmm18
|
||||
| 1 | | 1.0 | | | | | | | cmp rdi, rbx
|
||||
| 1 | | | | | | | 1.0 | | setz bl
|
||||
| 1* | | | | | | | | | mov ebp, ebx
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm20, zmm19, zmm22
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm21, zmm19, zmm19
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm20, zmm21, zmm20
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovupd zmm21, zmmword ptr [rsp+0x80]
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm21, zmm21, zmm29
|
||||
| 1 | | | | | | | 1.0 | | shl bpl, 0x4
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm1, zmm19
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm19, zmm20
|
||||
| 1 | 1.0 | | | | | | | | vaddpd zmm20, zmm20, zmm2
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm19, zmm20
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovupd zmm20, zmmword ptr [rsp+0x100]
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm20, zmm20, zmm30
|
||||
| 1 | | 1.0 | | | | | | | not bpl
|
||||
| 1 | | 1.0 | | | | | | | sub bpl, cl
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, ebp
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm18, zmm0, 0x1
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm18, zmm26, zmm31
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm15{k1}, zmm19, zmm4
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm4, zmm18, zmm18
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm4, zmm20, zmm20
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm4, zmm21, zmm21
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12{k1}, zmm19, zmm3
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm3, zmm4
|
||||
| 1 | | 1.0 | | | | | | | lea ecx, ptr [rdx+rdx*1]
|
||||
| 1* | | | | | | | | | mov eax, ebx
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm8{k1}, zmm19, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm17, zmm3, zmm22
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm3, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm17, zmm19, zmm17
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovupd zmm19, zmmword ptr [rsp+0x1c0]
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm19, zmm19, zmm29
|
||||
| 1 | | | | | | | 1.0 | | shl al, 0x5
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm1, zmm3
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm3, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vaddpd zmm17, zmm17, zmm2
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm3, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm17, zmm23, zmm30
|
||||
| 1 | | 0.5 | | | | | 0.5 | | sub cl, al
|
||||
| 1 | | 0.5 | | | | | 0.5 | | add cl, 0xfd
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, ecx
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm4, zmm0, 0x1
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm4, zmm27, zmm31
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm14{k1}, zmm3, zmm21
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm21, zmm4, zmm4
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm21, zmm17, zmm17
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm21, zmm19, zmm19
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm10{k1}, zmm3, zmm20
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm20, zmm21
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm6{k1}, zmm3, zmm18
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm20, zmm22
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm20, zmm20
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm18, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm1, zmm20
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm18, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vaddpd zmm3, zmm3, zmm2
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm18, zmm3
|
||||
| 1 | | 1.0 | | | | | | | lea eax, ptr [rdx*4]
|
||||
| 1* | | | | | | | | | mov ecx, ebx
|
||||
| 1 | | | | | | | 1.0 | | shl cl, 0x6
|
||||
| 1 | | 0.5 | | | | | 0.5 | | sub al, cl
|
||||
| 1 | | 0.5 | | | | | 0.5 | | add al, 0xfb
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, eax
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm21, zmm0, 0x1
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovupd zmm18, zmmword ptr [rsp+0x180]
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm18, zmm18, zmm29
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm20, zmm24, zmm30
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm21, zmm28, zmm31
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm16{k1}, zmm3, zmm19
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm21, zmm21
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm19, zmm20, zmm20
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm19, zmm18, zmm18
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm11{k1}, zmm3, zmm17
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm17, zmm19
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm7{k1}, zmm3, zmm4
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm17, zmm22
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm4, zmm17, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm4, zmm3
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm4, zmm1, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm4, zmm4, zmm3
|
||||
| 1 | 1.0 | | | | | | | | vaddpd zmm3, zmm3, zmm2
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm4, zmm3
|
||||
| 1 | | | | | | | 1.0 | | shl dl, 0x3
|
||||
| 1 | | | | | | | 1.0 | | shl bl, 0x7
|
||||
| 1 | | 1.0 | | | | | | | sub dl, bl
|
||||
| 1 | | 1.0 | | | | | | | add dl, 0xf7
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, edx
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm19, zmm0, 0x1
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm13{k1}, zmm3, zmm18
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm9{k1}, zmm3, zmm20
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm5{k1}, zmm3, zmm21
|
||||
| 1 | | 0.5 | | | | | 0.5 | | inc r14
|
||||
| 1* | | | | | | | | | cmp r11, r14
|
||||
| 0*F | | | | | | | | | jnz 0xfffffffffffffd99
|
||||
Total Num Of Uops: 123
|
||||
Analysis Notes:
|
||||
Backend allocation was stalled due to unavailable allocation resources.
|
||||
@@ -1,159 +0,0 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: gromacs-avx512-dp-ICX.s
|
||||
Architecture: CSX
|
||||
Timestamp: 2023-01-03 00:07:20
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
2287 | | | | | | | | || | | .LBB5_11: #
|
||||
2288 | | | | | | | | || | | # Parent Loop BB5_6 Depth=1
|
||||
2289 | | | | | | | | || | | # => This Inner Loop Header: Depth=2
|
||||
2290 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | movslq (%r12,%r14,4), %rbx
|
||||
2291 | | 1.00 | | | | 0.00 | | || 1.0 | | leaq (%rbx,%rbx,2), %rcx
|
||||
2292 | 0.00 | | | | | | 1.00 | || 1.0 | | shlq $6, %rcx
|
||||
2293 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovapd (%rsi,%rcx), %zmm29
|
||||
2294 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovapd 64(%rsi,%rcx), %zmm30
|
||||
2295 | | | 0.50 0.50 | 0.50 0.50 | | | | || 0.0 | | vmovapd 128(%rsi,%rcx), %zmm31
|
||||
2296 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 64(%rsp), %zmm3 # 64-byte Reload
|
||||
2297 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm3, %zmm4
|
||||
2298 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 320(%rsp), %zmm3 # 64-byte Reload
|
||||
2299 | 0.75 | | | | | 0.25 | | || | | vsubpd %zmm30, %zmm3, %zmm3
|
||||
2300 | | 1.00 | | | | 0.00 | | || | | leal (%rbx,%rbx), %ecx
|
||||
2301 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rcx, %rdi
|
||||
2302 | 0.00 | | | | | | 1.00 | || | | setne %dl
|
||||
2303 | 0.00 | | | | | | 1.00 | || | | sete %cl
|
||||
2304 | | 1.00 | | | | | | || | | leal 1(%rbx,%rbx), %ebx
|
||||
2305 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm31, %zmm25, %zmm17
|
||||
2306 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm17, %zmm17, %zmm18
|
||||
2307 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm3, %zmm3, %zmm18 # zmm18 = (zmm3 * zmm3) + zmm18
|
||||
2308 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm4, %zmm4, %zmm18 # zmm18 = (zmm4 * zmm4) + zmm18
|
||||
2309 | 2.75 | | | | | 0.25 | | || 8.0 | | vrcp14pd %zmm18, %zmm19
|
||||
2310 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | cmpq %rbx, %rdi
|
||||
2311 | 0.00 | | | | | | 1.00 | || | | sete %bl
|
||||
2312 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %ebx, %ebp
|
||||
2313 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm22, %zmm19, %zmm20
|
||||
2314 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm19, %zmm19, %zmm21
|
||||
2315 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm21, %zmm20
|
||||
2316 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 128(%rsp), %zmm21 # 64-byte Reload
|
||||
2317 | 0.75 | | | | | 0.25 | | || | | vsubpd %zmm29, %zmm21, %zmm21
|
||||
2318 | 0.00 | | | | | | 1.00 | || | | shlb $4, %bpl
|
||||
2319 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm19, %zmm1, %zmm19
|
||||
2320 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm19, %zmm19
|
||||
2321 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm2, %zmm20, %zmm20
|
||||
2322 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm19, %zmm19
|
||||
2323 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 256(%rsp), %zmm20 # 64-byte Reload
|
||||
2324 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm20, %zmm20
|
||||
2325 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | notb %bpl
|
||||
2326 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | | subb %cl, %bpl
|
||||
2327 | 1.00 | | | | | | | || | | kmovd %ebp, %k1
|
||||
2328 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm18, %k1 {%k1}
|
||||
2329 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm31, %zmm26, %zmm18
|
||||
2330 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm4, %zmm19, %zmm15 {%k1} # zmm15 = (zmm19 * zmm4) + zmm15
|
||||
2331 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm18, %zmm4
|
||||
2332 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm20, %zmm20, %zmm4 # zmm4 = (zmm20 * zmm20) + zmm4
|
||||
2333 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm21, %zmm21, %zmm4 # zmm4 = (zmm21 * zmm21) + zmm4
|
||||
2334 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm3, %zmm19, %zmm12 {%k1} # zmm12 = (zmm19 * zmm3) + zmm12
|
||||
2335 | 2.25 | | | | | 0.75 | | || | | vrcp14pd %zmm4, %zmm3
|
||||
2336 | | 1.00 | | | | 0.00 | | || | | leal (%rdx,%rdx), %ecx
|
||||
2337 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %ebx, %eax
|
||||
2338 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm17, %zmm19, %zmm8 {%k1} # zmm8 = (zmm19 * zmm17) + zmm8
|
||||
2339 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm22, %zmm3, %zmm17
|
||||
2340 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm3, %zmm19
|
||||
2341 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm19, %zmm17
|
||||
2342 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 448(%rsp), %zmm19 # 64-byte Reload
|
||||
2343 | 0.75 | | | | | 0.25 | | || | | vsubpd %zmm29, %zmm19, %zmm19
|
||||
2344 | 0.00 | | | | | | 1.00 | || | | shlb $5, %al
|
||||
2345 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm1, %zmm3
|
||||
2346 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm3, %zmm3
|
||||
2347 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm2, %zmm17, %zmm17
|
||||
2348 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm3, %zmm3
|
||||
2349 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm23, %zmm17
|
||||
2350 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | subb %al, %cl
|
||||
2351 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | | addb $-3, %cl
|
||||
2352 | 1.00 | | | | | | | || | | kmovd %ecx, %k1
|
||||
2353 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm4, %k1 {%k1}
|
||||
2354 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm31, %zmm27, %zmm4
|
||||
2355 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm21, %zmm3, %zmm14 {%k1} # zmm14 = (zmm3 * zmm21) + zmm14
|
||||
2356 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm4, %zmm4, %zmm21
|
||||
2357 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm17, %zmm17, %zmm21 # zmm21 = (zmm17 * zmm17) + zmm21
|
||||
2358 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm19, %zmm19, %zmm21 # zmm21 = (zmm19 * zmm19) + zmm21
|
||||
2359 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm20, %zmm3, %zmm10 {%k1} # zmm10 = (zmm3 * zmm20) + zmm10
|
||||
2360 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm21, %zmm20
|
||||
2361 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm18, %zmm3, %zmm6 {%k1} # zmm6 = (zmm3 * zmm18) + zmm6
|
||||
2362 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm22, %zmm20, %zmm3
|
||||
2363 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm20, %zmm18
|
||||
2364 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm18, %zmm3
|
||||
2365 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm1, %zmm18
|
||||
2366 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm18, %zmm18
|
||||
2367 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm2, %zmm3, %zmm3
|
||||
2368 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm18, %zmm3
|
||||
2369 | | 1.00 | | | | 0.00 | | || | | leal (,%rdx,4), %eax
|
||||
2370 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %ebx, %ecx
|
||||
2371 | 0.00 | | | | | | 1.00 | || | | shlb $6, %cl
|
||||
2372 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | subb %cl, %al
|
||||
2373 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | addb $-5, %al
|
||||
2374 | 1.00 | | | | | | | || | | kmovd %eax, %k1
|
||||
2375 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm21, %k1 {%k1}
|
||||
2376 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 384(%rsp), %zmm18 # 64-byte Reload
|
||||
2377 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm18, %zmm18
|
||||
2378 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm24, %zmm20
|
||||
2379 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm31, %zmm28, %zmm21
|
||||
2380 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm19, %zmm3, %zmm16 {%k1} # zmm16 = (zmm3 * zmm19) + zmm16
|
||||
2381 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm21, %zmm21, %zmm19
|
||||
2382 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm20, %zmm20, %zmm19 # zmm19 = (zmm20 * zmm20) + zmm19
|
||||
2383 | 0.25 | | | | | 0.75 | | || | | vfmadd231pd %zmm18, %zmm18, %zmm19 # zmm19 = (zmm18 * zmm18) + zmm19
|
||||
2384 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm17, %zmm3, %zmm11 {%k1} # zmm11 = (zmm3 * zmm17) + zmm11
|
||||
2385 | 2.00 | | | | | 1.00 | | || | | vrcp14pd %zmm19, %zmm17
|
||||
2386 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm4, %zmm3, %zmm7 {%k1} # zmm7 = (zmm3 * zmm4) + zmm7
|
||||
2387 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm22, %zmm17, %zmm3
|
||||
2388 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm17, %zmm17, %zmm4
|
||||
2389 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm4, %zmm3
|
||||
2390 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm17, %zmm1, %zmm4
|
||||
2391 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm4, %zmm4
|
||||
2392 | 0.00 | | | | | 1.00 | | || | | vaddpd %zmm2, %zmm3, %zmm3
|
||||
2393 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm4, %zmm3
|
||||
2394 | 0.00 | | | | | | 1.00 | || | | shlb $3, %dl
|
||||
2395 | 0.00 | | | | | | 1.00 | || | | shlb $7, %bl
|
||||
2396 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | subb %bl, %dl
|
||||
2397 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | addb $-9, %dl
|
||||
2398 | 1.00 | | | | | | | || | | kmovd %edx, %k1
|
||||
2399 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm19, %k1 {%k1}
|
||||
2400 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm18, %zmm3, %zmm13 {%k1} # zmm13 = (zmm3 * zmm18) + zmm13
|
||||
2401 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm20, %zmm3, %zmm9 {%k1} # zmm9 = (zmm3 * zmm20) + zmm9
|
||||
2402 | 0.00 | | | | | 1.00 | | || | 4.0 | vfmadd231pd %zmm21, %zmm3, %zmm5 {%k1} # zmm5 = (zmm3 * zmm21) + zmm5
|
||||
2403 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | incq %r14
|
||||
2404 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %r14, %r11
|
||||
2405 | | | | | | | | || | | * jne .LBB5_11
|
||||
|
||||
40.0 14.5 5.00 5.00 5.00 5.00 40.0 14.5 50.0 4.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
2402 | 4.0 | vfmadd231pd %zmm21, %zmm3, %zmm5 {%k1} # zmm5 = (zmm3 * zmm21) + zmm5| [2402]
|
||||
2401 | 4.0 | vfmadd231pd %zmm20, %zmm3, %zmm9 {%k1} # zmm9 = (zmm3 * zmm20) + zmm9| [2401]
|
||||
2400 | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm13 {%k1} # zmm13 = (zmm3 * zmm18) + zmm13| [2400]
|
||||
2386 | 4.0 | vfmadd231pd %zmm4, %zmm3, %zmm7 {%k1} # zmm7 = (zmm3 * zmm4) + zmm7| [2386]
|
||||
2384 | 4.0 | vfmadd231pd %zmm17, %zmm3, %zmm11 {%k1} # zmm11 = (zmm3 * zmm17) + zmm11| [2384]
|
||||
2380 | 4.0 | vfmadd231pd %zmm19, %zmm3, %zmm16 {%k1} # zmm16 = (zmm3 * zmm19) + zmm16| [2380]
|
||||
2361 | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm6 {%k1} # zmm6 = (zmm3 * zmm18) + zmm6| [2361]
|
||||
2359 | 4.0 | vfmadd231pd %zmm20, %zmm3, %zmm10 {%k1} # zmm10 = (zmm3 * zmm20) + zmm10| [2359]
|
||||
2355 | 4.0 | vfmadd231pd %zmm21, %zmm3, %zmm14 {%k1} # zmm14 = (zmm3 * zmm21) + zmm14| [2355]
|
||||
2338 | 4.0 | vfmadd231pd %zmm17, %zmm19, %zmm8 {%k1} # zmm8 = (zmm19 * zmm17) + zmm8| [2338]
|
||||
2334 | 4.0 | vfmadd231pd %zmm3, %zmm19, %zmm12 {%k1} # zmm12 = (zmm19 * zmm3) + zmm12| [2334]
|
||||
2330 | 4.0 | vfmadd231pd %zmm4, %zmm19, %zmm15 {%k1} # zmm15 = (zmm19 * zmm4) + zmm15| [2330]
|
||||
2394 | 3.0 | shlb $3, %dl | [2394, 2396, 2397]
|
||||
2318 | 3.0 | shlb $4, %bpl | [2318, 2325, 2326]
|
||||
2403 | 1.0 | incq %r14 | [2403]
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,198 +0,0 @@
|
||||
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||
Analyzed File - gromacs-icc-avx512-dp.o
|
||||
Binary Format - 64Bit
|
||||
Architecture - SKX
|
||||
Analysis Type - Throughput
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
Block Throughput: 62.00 Cycles Throughput Bottleneck: Backend
|
||||
Loop Count: 22
|
||||
Port Binding In Cycles Per Iteration:
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Cycles | 58.0 0.0 | 16.0 | 16.0 15.0 | 16.0 15.0 | 2.0 | 58.0 | 16.0 | 0.0 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
|
||||
DV - Divider pipe (on port 0)
|
||||
D - Data fetch pipe (on ports 2 and 3)
|
||||
F - Macro Fusion with the previous instruction occurred
|
||||
* - instruction micro-ops not bound to a port
|
||||
^ - Micro Fusion occurred
|
||||
# - ESP Tracking sync uop was issued
|
||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||
X - instruction not supported, was not accounted in Analysis
|
||||
|
||||
| Num Of | Ports pressure in cycles | |
|
||||
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
-----------------------------------------------------------------------------------------
|
||||
| 1 | | | 1.0 1.0 | | | | | | mov edx, dword ptr [r10+rsi*4]
|
||||
| 1 | | | | | | | 1.0 | | inc rsi
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm20, zmmword ptr [rsp+0x380]
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm25, zmmword ptr [rsp+0x340]
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm24, zmmword ptr [rsp+0x1c0]
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm23, zmmword ptr [rsp+0x2c0]
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm16, zmmword ptr [rsp+0x3c0]
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm14, zmmword ptr [rsp+0x300]
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm15, zmmword ptr [rsp+0x240]
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm12, zmmword ptr [rsp+0x180]
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm21, zmmword ptr [rsp+0x200]
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm18, zmmword ptr [rsp+0x140]
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm22, zmmword ptr [rsp+0x100]
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm17, zmmword ptr [rsp+0x280]
|
||||
| 1 | | 1.0 | | | | | | | lea r12d, ptr [rdx+rdx*2]
|
||||
| 1 | | | | | | | 1.0 | | shl r12d, 0x3
|
||||
| 1 | | 1.0 | | | | | | | lea r13d, ptr [rdx+rdx*1]
|
||||
| 1 | | | | | | | 1.0 | | movsxd r12, r12d
|
||||
| 1 | | 1.0 | | | | | | | cmp r13d, r11d
|
||||
| 1 | | 1.0 | | | | | | | lea eax, ptr [rdx+rdx*1+0x1]
|
||||
| 1 | | | | | | | 1.0 | | mov edx, 0x0
|
||||
| 1 | | | | | | | 1.0 | | setz dl
|
||||
| 1 | | 1.0 | | | | | | | cmp eax, r11d
|
||||
| 1 | | | | | | | 1.0 | | mov eax, 0x0
|
||||
| 1* | | | | | | | | | mov r13d, edx
|
||||
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm29, zmm20, zmmword ptr [r8+r12*8+0x80]
|
||||
| 1 | | | | | | | 1.0 | | setz al
|
||||
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm26, zmm25, zmmword ptr [r8+r12*8+0x80]
|
||||
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm25, zmm24, zmmword ptr [r8+r12*8]
|
||||
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm24, zmm23, zmmword ptr [r8+r12*8+0x40]
|
||||
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm23, zmm16, zmmword ptr [r8+r12*8+0x80]
|
||||
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm20, zmm14, zmmword ptr [r8+r12*8+0x80]
|
||||
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm27, zmm12, zmmword ptr [r8+r12*8+0x40]
|
||||
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm28, zmm15, zmmword ptr [r8+r12*8]
|
||||
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm30, zmm21, zmmword ptr [r8+r12*8+0x40]
|
||||
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm21, zmm18, zmmword ptr [r8+r12*8+0x40]
|
||||
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm31, zmm22, zmmword ptr [r8+r12*8]
|
||||
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm22, zmm17, zmmword ptr [r8+r12*8]
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm13, zmm29, zmm29
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm15, zmm26, zmm26
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm12, zmm23, zmm23
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm14, zmm20, zmm20
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm16, zmmword ptr [rsp+0xc0]
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm13, zmm30, zmm30
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm15, zmm27, zmm27
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12, zmm24, zmm24
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm14, zmm21, zmm21
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm13, zmm31, zmm31
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm15, zmm28, zmm28
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12, zmm25, zmm25
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm14, zmm22, zmm22
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm19, zmm13
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm18, zmm15
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm17, zmm12
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k1, zmm13, zmm16, 0x11
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k6, zmm15, zmm16, 0x11
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k7, zmm12, zmm16, 0x11
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k0, zmm14, zmm16, 0x11
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm15, zmm14
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm16, zmmword ptr [rsp+0x40]
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm12, zmmword ptr [rsp+0x80]
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm13, zmm19, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm13, zmm19, zmm13
|
||||
| 1 | | 1.0 | | | | | | | neg r13d
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm19, zmm13
|
||||
| 1* | | | | | | | | | mov r12d, eax
|
||||
| 1 | | | | | | 1.0 | | | vfmsub213pd zmm13, zmm19, zmm1
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm19, zmm12
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm13, zmm13, zmm19
|
||||
| 1 | | 1.0 | | | | | | | add r13d, 0xff
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm14, zmm13
|
||||
| 1 | | | | | | | 1.0 | | nop
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm13, zmmword ptr [rsp+0x400]
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm10, zmm14
|
||||
| 1 | | | | | | | 1.0 | | shl r12d, 0x4
|
||||
| 1 | | 1.0 | | | | | | | sub r13d, r12d
|
||||
| 1 | | | | | | 1.0 | | | kmovb k5, r13d
|
||||
| 1 | 1.0 | | | | | | | | kmovw r13d, k1
|
||||
| 1 | 1.0 | | | | | | | | kmovb r12d, k5
|
||||
| 1 | | | | | | 1.0 | | | kmovb k5, r12d
|
||||
| 1 | | | | | | 1.0 | | | kmovb k1, r13d
|
||||
| 1* | | | | | | | | | mov r13d, eax
|
||||
| 1 | 1.0 | | | | | | | | kandb k5, k5, k1
|
||||
| 1 | 1.0 | | | | | | | | kmovb r12d, k5
|
||||
| 1 | | | | | | 1.0 | | | kmovw k5, r12d
|
||||
| 1 | | 1.0 | | | | | | | lea r12d, ptr [rdx+rdx*1]
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm9{k5}, zmm19, zmm29
|
||||
| 1 | | | | | | | 1.0 | | neg r12d
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm13{k5}, zmm19, zmm31
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm31, zmmword ptr [rsp+0x440]
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm29, zmm18, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm31{k5}, zmm19, zmm30
|
||||
| 2^ | | | 1.0 | | 1.0 | | | | vmovups zmmword ptr [rsp+0x400], zmm13
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm30, zmm18, zmm29
|
||||
| 2^ | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [rsp+0x440], zmm31
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm13, zmm18, zmm30
|
||||
| 1 | 1.0 | | | | | | | | vfmsub213pd zmm30, zmm18, zmm1
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm18, zmm12
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm30, zmm18
|
||||
| 1 | | 1.0 | | | | | | | add r12d, 0xff
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm13, zmm14
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm29, zmm10, zmm19
|
||||
| 1 | | | | | | | 1.0 | | shl r13d, 0x5
|
||||
| 1 | | 1.0 | | | | | | | sub r12d, r13d
|
||||
| 1 | | | | | | 1.0 | | | kmovb k1, r12d
|
||||
| 1 | 1.0 | | | | | | | | kmovw r12d, k6
|
||||
| 1 | 1.0 | | | | | | | | kmovb r13d, k1
|
||||
| 1 | | | | | | 1.0 | | | kmovb k1, r13d
|
||||
| 1 | | | | | | 1.0 | | | kmovb k6, r12d
|
||||
| 1* | | | | | | | | | mov r12d, eax
|
||||
| 1 | 1.0 | | | | | | | | kandb k1, k1, k6
|
||||
| 1 | 1.0 | | | | | | | | kmovb r13d, k1
|
||||
| 1 | | | | | | 1.0 | | | kmovw k1, r13d
|
||||
| 1 | | 1.0 | | | | | | | lea r13d, ptr [rdx*4]
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm6{k1}, zmm29, zmm26
|
||||
| 1 | | | | | | | 1.0 | | neg r13d
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm7{k1}, zmm29, zmm27
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm8{k1}, zmm29, zmm28
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm26, zmm17, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm28, zmm17, zmm12
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm15, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm12, zmm15, zmm12
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm27, zmm17, zmm26
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm15, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm13, zmm17, zmm27
|
||||
| 1 | | | | | | 1.0 | | | vfmsub213pd zmm27, zmm17, zmm1
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm27, zmm28
|
||||
| 1 | | | | | | | 1.0 | | add r13d, 0xff
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm17, zmm13, zmm14
|
||||
| 1 | | | | | | | 1.0 | | shl edx, 0x3
|
||||
| 1 | | | | | | | 1.0 | | shl r12d, 0x6
|
||||
| 1 | | 1.0 | | | | | | | neg edx
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm10, zmm17
|
||||
| 1 | | 1.0 | | | | | | | sub r13d, r12d
|
||||
| 1 | | | | | | 1.0 | | | kmovb k6, r13d
|
||||
| 1 | | 1.0 | | | | | | | add edx, 0xff
|
||||
| 1 | | | | | | | 1.0 | | shl eax, 0x7
|
||||
| 1 | | 1.0 | | | | | | | sub edx, eax
|
||||
| 1 | 1.0 | | | | | | | | kmovb eax, k6
|
||||
| 1 | | | | | | 1.0 | | | kmovb k6, eax
|
||||
| 1 | 1.0 | | | | | | | | kmovw eax, k7
|
||||
| 1 | | | | | | 1.0 | | | kmovb k7, eax
|
||||
| 1 | 1.0 | | | | | | | | kandb k7, k6, k7
|
||||
| 1 | | | | | | 1.0 | | | kmovb k6, edx
|
||||
| 1 | 1.0 | | | | | | | | kmovb edx, k7
|
||||
| 1 | | | | | | 1.0 | | | kmovw k7, edx
|
||||
| 1 | 1.0 | | | | | | | | kmovw edx, k0
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm11{k7}, zmm18, zmm23
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm4{k7}, zmm18, zmm24
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm5{k7}, zmm18, zmm25
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm23, zmm15, zmm19
|
||||
| 1 | | | | | | 1.0 | | | vfmsub213pd zmm19, zmm15, zmm1
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm15, zmm19, zmm12
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm24, zmm23, zmm15
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm25, zmm10, zmm24
|
||||
| 1 | 1.0 | | | | | | | | kmovb eax, k6
|
||||
| 1 | | | | | | 1.0 | | | kmovb k6, eax
|
||||
| 1 | | | | | | 1.0 | | | kmovb k0, edx
|
||||
| 1 | 1.0 | | | | | | | | kandb k0, k6, k0
|
||||
| 1 | 1.0 | | | | | | | | kmovb r12d, k0
|
||||
| 1 | | | | | | 1.0 | | | kmovw k6, r12d
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm3{k6}, zmm25, zmm22
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm2{k6}, zmm25, zmm21
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm0{k6}, zmm25, zmm20
|
||||
| 1* | | | | | | | | | cmp rsi, rdi
|
||||
| 0*F | | | | | | | | | jl 0xfffffffffffffc6f
|
||||
Total Num Of Uops: 187
|
||||
Analysis Notes:
|
||||
Backend allocation was stalled due to unavailable allocation resources.
|
||||
@@ -1,152 +0,0 @@
|
||||
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||
Analyzed File - gromacs-icc-avx512-sp.o
|
||||
Binary Format - 64Bit
|
||||
Architecture - SKX
|
||||
Analysis Type - Throughput
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
Block Throughput: 51.00 Cycles Throughput Bottleneck: Backend
|
||||
Loop Count: 22
|
||||
Port Binding In Cycles Per Iteration:
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Cycles | 47.5 0.0 | 9.0 | 11.0 11.0 | 11.0 8.0 | 3.0 | 47.5 | 9.0 | 0.0 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
|
||||
DV - Divider pipe (on port 0)
|
||||
D - Data fetch pipe (on ports 2 and 3)
|
||||
F - Macro Fusion with the previous instruction occurred
|
||||
* - instruction micro-ops not bound to a port
|
||||
^ - Micro Fusion occurred
|
||||
# - ESP Tracking sync uop was issued
|
||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||
X - instruction not supported, was not accounted in Analysis
|
||||
|
||||
| Num Of | Ports pressure in cycles | |
|
||||
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
-----------------------------------------------------------------------------------------
|
||||
| 1 | | | 1.0 1.0 | | | | | | mov edi, dword ptr [rcx+rax*4]
|
||||
| 1* | | | | | | | | | mov r12d, r13d
|
||||
| 1 | | | | | | | 1.0 | | movsxd rdi, edi
|
||||
| 1 | | 1.0 | | | | | | | inc rax
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm10, zmmword ptr [rsp+0x140]
|
||||
| 1 | | | | | | | 1.0 | | test edi, 0x7fffffff
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm11, zmmword ptr [rsp+0x100]
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm9, zmmword ptr [rsp+0xc0]
|
||||
| 1 | | | | | | | 1.0 | | setz r12b
|
||||
| 1 | | 1.0 | | | | | | | lea r14, ptr [rdi+rdi*2]
|
||||
| 1 | | | | | | | 1.0 | | shl r14, 0x5
|
||||
| 1* | | | | | | | | | mov r8d, r12d
|
||||
| 1 | | 1.0 | | | | | | | neg r8d
|
||||
| 1* | | | | | | | | | mov r11d, r12d
|
||||
| 1 | | 1.0 | | | | | | | add r8d, 0xff
|
||||
| 1 | | | | | | 1.0 | | | kmovw k0, r8d
|
||||
| 1 | | 1.0 | | | | | | | lea r9d, ptr [r12+r12*2]
|
||||
| 2 | 1.0 | | 1.0 1.0 | | | | | | vsubps zmm3, zmm13, zmmword ptr [r14+rbx*1+0x40]
|
||||
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm4, zmm10, zmmword ptr [r14+rbx*1]
|
||||
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm10, zmm11, zmmword ptr [r14+rbx*1+0x40]
|
||||
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm5, zmm17, zmmword ptr [r14+rbx*1+0x20]
|
||||
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm27, zmm19, zmmword ptr [r14+rbx*1]
|
||||
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm8, zmm15, zmmword ptr [r14+rbx*1+0x20]
|
||||
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm11, zmm0, zmmword ptr [r14+rbx*1+0x40]
|
||||
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm7, zmm9, zmmword ptr [r14+rbx*1]
|
||||
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm9, zmm14, zmmword ptr [r14+rbx*1+0x20]
|
||||
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm29, zmm12, zmmword ptr [r14+rbx*1+0x40]
|
||||
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm28, zmm16, zmmword ptr [r14+rbx*1+0x20]
|
||||
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm25, zmm18, zmmword ptr [r14+rbx*1]
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm2, zmm3, zmm3
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm10, zmm10
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm11, zmm11
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm26, zmm29, zmm29
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm2, zmm5, zmm5
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30, zmm8, zmm8
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm1, zmm9, zmm9
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm26, zmm28, zmm28
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm2, zmm27, zmm27
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30, zmm4, zmm4
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm1, zmm7, zmm7
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm26, zmm25, zmm25
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm31, zmm2
|
||||
| 1 | | | | | | 1.0 | | | vcmpps k7, zmm30, zmm24, 0x11
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm6, zmm30
|
||||
| 1 | | | | | | 1.0 | | | vcmpps k3, zmm2, zmm24, 0x11
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm2, zmm1
|
||||
| 1 | | | | | | 1.0 | | | vcmpps k5, zmm26, zmm24, 0x11
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm26, zmm26
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm30, zmm31, zmm23
|
||||
| 1 | 1.0 | | | | | | | | kandw k2, k0, k3
|
||||
| 1 | | | | | | 1.0 | | | vcmpps k3, zmm1, zmm24, 0x11
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm31, zmm30
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm31, zmm1
|
||||
| 1 | | | | | | | 1.0 | | neg r9d
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm1, zmm31, zmm20
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm31, zmm22
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm1, zmm31
|
||||
| 1 | | | | | | | 1.0 | | add r9d, 0xff
|
||||
| 1 | | | | | | 1.0 | | | kmovw k4, r9d
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm30, zmm30, zmm31
|
||||
| 1 | 1.0 | | | | | | | | kandw k1, k4, k5
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm1, zmm21, zmm30
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm26, zmm23
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm26, zmm30
|
||||
| 1 | | 1.0 | | | | | | | lea r10d, ptr [r12*8]
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm26, zmm31
|
||||
| 1 | | | | | | | 1.0 | | neg r10d
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm31, zmm26, zmm20
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm26, zmm26, zmm22
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm31, zmm26
|
||||
| 1 | | 1.0 | | | | | | | add r10d, r12d
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm30, zmm31
|
||||
| 1 | | | | | | | 1.0 | | add r10d, 0xff
|
||||
| 1 | | | | | | 1.0 | | | kmovw k6, r10d
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm26, zmm21, zmm30
|
||||
| 1 | 1.0 | | | | | | | | kandw k4, k6, k7
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm25{k1}{z}, zmm25, zmm26
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31{k1}{z}, zmm28, zmm26
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm28, zmm6, zmm23
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30{k1}{z}, zmm29, zmm26
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm29, zmm2, zmm23
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm25{k2}, zmm27, zmm1
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm31{k2}, zmm5, zmm1
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm27, zmm6, zmm28
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30{k2}, zmm3, zmm1
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm2, zmm29
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm5, zmm6, zmm27
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm27, zmm6, zmm20
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm6, zmm6, zmm22
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm3, zmm2, zmm1
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm1, zmm2, zmm20
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm2, zmm2, zmm22
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm26, zmm27, zmm6
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm1, zmm2
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm5, zmm5, zmm26
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm3, zmm3, zmm1
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm6, zmm21, zmm5
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm27, zmm21, zmm3
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm25{k4}, zmm4, zmm6
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm4, zmmword ptr [r14+rsi*1]
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm31{k4}, zmm8, zmm6
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30{k4}, zmm10, zmm6
|
||||
| 1 | | | | | | | 1.0 | | shl r11d, 0x4
|
||||
| 1 | | 1.0 | | | | | | | sub r12d, r11d
|
||||
| 1 | | 1.0 | | | | | | | add r12d, 0xff
|
||||
| 1 | | | | | | 1.0 | | | kmovw k0, r12d
|
||||
| 1 | 1.0 | | | | | | | | kandw k5, k0, k3
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm25{k5}, zmm7, zmm27
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm31{k5}, zmm9, zmm27
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30{k5}, zmm11, zmm27
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vsubps zmm7, zmm4, zmm25
|
||||
| 2 | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [r14+rsi*1], zmm7
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm8, zmmword ptr [r14+rsi*1+0x20]
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vsubps zmm4, zmm8, zmm31
|
||||
| 2 | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [r14+rsi*1+0x20], zmm4
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm1, zmmword ptr [r14+rsi*1+0x40]
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vsubps zmm2, zmm1, zmm30
|
||||
| 2 | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [r14+rsi*1+0x40], zmm2
|
||||
| 1* | | | | | | | | | cmp rax, rdx
|
||||
| 0*F | | | | | | | | | jb 0xfffffffffffffd30
|
||||
Total Num Of Uops: 142
|
||||
Analysis Notes:
|
||||
Backend allocation was stalled due to unavailable allocation resources.
|
||||
@@ -1,154 +0,0 @@
|
||||
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||
Analyzed File - gromacs-icx-avx512-dp.o
|
||||
Binary Format - 64Bit
|
||||
Architecture - SKX
|
||||
Analysis Type - Throughput
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
Block Throughput: 49.26 Cycles Throughput Bottleneck: Backend
|
||||
Loop Count: 22
|
||||
Port Binding In Cycles Per Iteration:
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Cycles | 44.0 0.0 | 13.5 | 5.5 5.5 | 5.5 5.5 | 0.0 | 44.0 | 13.5 | 0.0 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
|
||||
DV - Divider pipe (on port 0)
|
||||
D - Data fetch pipe (on ports 2 and 3)
|
||||
F - Macro Fusion with the previous instruction occurred
|
||||
* - instruction micro-ops not bound to a port
|
||||
^ - Micro Fusion occurred
|
||||
# - ESP Tracking sync uop was issued
|
||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||
X - instruction not supported, was not accounted in Analysis
|
||||
|
||||
| Num Of | Ports pressure in cycles | |
|
||||
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
-----------------------------------------------------------------------------------------
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | movsxd rcx, dword ptr [r10+rbx*4]
|
||||
| 1 | | 1.0 | | | | | | | lea rdx, ptr [rcx+rcx*2]
|
||||
| 1 | | | | | | | 1.0 | | shl rdx, 0x6
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm28, zmmword ptr [rsi+rdx*1]
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm29, zmmword ptr [rsi+rdx*1+0x40]
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm30, zmmword ptr [rsi+rdx*1+0x80]
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm3, zmmword ptr [rsp+0x10]
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm3, zmm3, zmm28
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm31, zmm24, zmm30
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm16, zmmword ptr [rsp+0x150]
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm16, zmm16, zmm29
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm17, zmm31, zmm31
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm17, zmm16, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm17, zmm3, zmm3
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm18, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm21, zmm18
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm18, zmm19
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm18, zmm19
|
||||
| 1 | | | | | | 1.0 | | | vaddpd zmm20, zmm19, zmm1
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm22, zmm18
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm18, zmm20
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm20, zmm25, zmm30
|
||||
| 1 | | 1.0 | | | | | | | lea edx, ptr [rcx+rcx*1]
|
||||
| 1 | | | | | | | 1.0 | | cmp r11, rdx
|
||||
| 1 | | | | | | | 1.0 | | setnz dl
|
||||
| 1 | | | | | | | 1.0 | | setz al
|
||||
| 1 | | 1.0 | | | | | | | add ecx, ecx
|
||||
| 1 | | 1.0 | | | | | | | inc ecx
|
||||
| 1 | | 0.5 | | | | | 0.5 | | cmp r11, rcx
|
||||
| 1 | | | | | | | 1.0 | | setz cl
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm19, zmm18
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm19, zmmword ptr [rsp+0x210]
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm19, zmm19, zmm28
|
||||
| 1 | | | | | | | 1.0 | | setnz dil
|
||||
| 1* | | | | | | | | | mov ebp, edi
|
||||
| 1 | | | | | | | 1.0 | | shl bpl, 0x4
|
||||
| 1 | | 1.0 | | | | | | | sub bpl, al
|
||||
| 1 | | 1.0 | | | | | | | add bpl, 0xef
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, ebp
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm17, zmm0, 0x1
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm17, zmmword ptr [rsp+0x110]
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm17, zmm17, zmm29
|
||||
| 1 | | 1.0 | | | | | | | lea eax, ptr [rdx+rdx*1]
|
||||
| 1* | | | | | | | | | mov ebp, edi
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm18, zmm2
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm14{k1}, zmm3, zmm18
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm20, zmm20
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm3, zmm17, zmm17
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm3, zmm19, zmm19
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm11{k1}, zmm16, zmm18
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm16, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm7{k1}, zmm31, zmm18
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm21, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm16, zmm18
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm16, zmm18
|
||||
| 1 | | | | | | 1.0 | | | vaddpd zmm31, zmm18, zmm1
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm22, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm16, zmm16, zmm31
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm31, zmmword ptr [rsp+0x1d0]
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm31, zmm31, zmm28
|
||||
| 1 | | | | | | | 1.0 | | shl bpl, 0x5
|
||||
| 1 | | 1.0 | | | | | | | or bpl, al
|
||||
| 1 | | 0.5 | | | | | 0.5 | | or bpl, 0xdd
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, ebp
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm3, zmm0, 0x1
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm3, zmmword ptr [rsp+0xd0]
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm3, zmm3, zmm29
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm18, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm18, zmm26, zmm30
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm16, zmm2
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm15{k1}, zmm19, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm18, zmm18
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm19, zmm3, zmm3
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm19, zmm31, zmm31
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm10{k1}, zmm17, zmm16
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm17, zmm19
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm6{k1}, zmm20, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm21, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm16, zmm17, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm17, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vaddpd zmm20, zmm16, zmm1
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm17, zmm22, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm17, zmm17, zmm20
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm16, zmm17
|
||||
| 1 | | 1.0 | | | | | | | lea eax, ptr [rdx*4]
|
||||
| 1 | | | | | | | 1.0 | | shl dil, 0x6
|
||||
| 1 | | 0.5 | | | | | 0.5 | | or dil, al
|
||||
| 1 | | 0.5 | | | | | 0.5 | | or dil, 0xbb
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, edi
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm19, zmm0, 0x1
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm17, zmmword ptr [rsp+0x190]
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm17, zmm17, zmm28
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm19, zmm23, zmm29
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm20, zmm27, zmm30
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm16, zmm2
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm13{k1}, zmm31, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm28, zmm20, zmm20
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm28, zmm19, zmm19
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm28, zmm17, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm9{k1}, zmm3, zmm16
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm3, zmm28
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm5{k1}, zmm18, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm21, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm16, zmm3, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm3, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vaddpd zmm18, zmm16, zmm1
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm22, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm3, zmm18
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm16, zmm3
|
||||
| 1 | | | | | | | 1.0 | | shl dl, 0x3
|
||||
| 1 | | | | | | | 1.0 | | shl cl, 0x7
|
||||
| 1 | | 1.0 | | | | | | | or cl, dl
|
||||
| 1 | | 1.0 | | | | | | | add cl, 0xf7
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, ecx
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm28, zmm0, 0x1
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm3, zmm2
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12{k1}, zmm17, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm8{k1}, zmm19, zmm3
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm4{k1}, zmm20, zmm3
|
||||
| 1 | | 0.5 | | | | | 0.5 | | inc rbx
|
||||
| 1* | | | | | | | | | cmp r9, rbx
|
||||
| 0*F | | | | | | | | | jnz 0xfffffffffffffd5a
|
||||
Total Num Of Uops: 129
|
||||
Analysis Notes:
|
||||
Backend allocation was stalled due to unavailable allocation resources.
|
||||
@@ -1,288 +0,0 @@
|
||||
|
||||
[0] Code Region
|
||||
|
||||
Iterations: 100
|
||||
Instructions: 12200
|
||||
Total Cycles: 4745
|
||||
Total uOps: 14000
|
||||
|
||||
Dispatch Width: 6
|
||||
uOps Per Cycle: 2.95
|
||||
IPC: 2.57
|
||||
Block RThroughput: 34.0
|
||||
|
||||
|
||||
Instruction Info:
|
||||
[1]: #uOps
|
||||
[2]: Latency
|
||||
[3]: RThroughput
|
||||
[4]: MayLoad
|
||||
[5]: MayStore
|
||||
[6]: HasSideEffects (U)
|
||||
|
||||
[1] [2] [3] [4] [5] [6] Instructions:
|
||||
1 5 0.50 * movslq (%r10,%rbx,4), %rcx
|
||||
1 1 0.50 leaq (%rcx,%rcx,2), %rdx
|
||||
1 1 0.50 shlq $6, %rdx
|
||||
2 8 0.50 * vmovupd (%rsi,%rdx), %zmm28
|
||||
2 8 0.50 * vmovupd 64(%rsi,%rdx), %zmm29
|
||||
2 8 0.50 * vmovupd 128(%rsi,%rdx), %zmm30
|
||||
2 8 0.50 * vmovupd 16(%rsp), %zmm3
|
||||
1 4 0.50 vsubpd %zmm28, %zmm3, %zmm3
|
||||
1 4 0.50 vsubpd %zmm30, %zmm24, %zmm31
|
||||
2 8 0.50 * vmovupd 336(%rsp), %zmm16
|
||||
1 4 0.50 vsubpd %zmm29, %zmm16, %zmm16
|
||||
1 4 0.50 vmulpd %zmm31, %zmm31, %zmm17
|
||||
1 4 0.50 vfmadd231pd %zmm16, %zmm16, %zmm17
|
||||
1 4 0.50 vfmadd231pd %zmm3, %zmm3, %zmm17
|
||||
3 4 2.00 vrcp14pd %zmm17, %zmm18
|
||||
1 4 0.50 vmulpd %zmm18, %zmm21, %zmm19
|
||||
1 4 0.50 vmulpd %zmm19, %zmm18, %zmm19
|
||||
1 4 0.50 vmulpd %zmm19, %zmm18, %zmm19
|
||||
1 4 0.50 vaddpd %zmm1, %zmm19, %zmm20
|
||||
1 4 0.50 vmulpd %zmm18, %zmm22, %zmm18
|
||||
1 4 0.50 vmulpd %zmm20, %zmm18, %zmm18
|
||||
1 4 0.50 vsubpd %zmm30, %zmm25, %zmm20
|
||||
1 1 0.50 leal (%rcx,%rcx), %edx
|
||||
1 1 0.25 cmpq %rdx, %r11
|
||||
1 1 0.50 setne %dl
|
||||
1 1 0.50 sete %al
|
||||
1 1 0.25 addl %ecx, %ecx
|
||||
1 1 0.25 incl %ecx
|
||||
1 1 0.25 cmpq %rcx, %r11
|
||||
1 1 0.50 sete %cl
|
||||
1 4 0.50 vmulpd %zmm18, %zmm19, %zmm18
|
||||
2 8 0.50 * vmovupd 528(%rsp), %zmm19
|
||||
1 4 0.50 vsubpd %zmm28, %zmm19, %zmm19
|
||||
1 1 0.50 setne %dil
|
||||
1 1 0.25 movl %edi, %ebp
|
||||
1 1 0.50 shlb $4, %bpl
|
||||
1 1 0.25 subb %al, %bpl
|
||||
1 1 0.25 addb $-17, %bpl
|
||||
1 1 1.00 kmovd %ebp, %k1
|
||||
1 4 1.00 vcmpltpd %zmm0, %zmm17, %k1 {%k1}
|
||||
2 8 0.50 * vmovupd 272(%rsp), %zmm17
|
||||
1 4 0.50 vsubpd %zmm29, %zmm17, %zmm17
|
||||
1 1 0.50 leal (%rdx,%rdx), %eax
|
||||
1 1 0.25 movl %edi, %ebp
|
||||
1 4 0.50 vmulpd %zmm2, %zmm18, %zmm18
|
||||
1 4 0.50 vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1}
|
||||
1 4 0.50 vmulpd %zmm20, %zmm20, %zmm3
|
||||
1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm3
|
||||
1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm3
|
||||
1 4 0.50 vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1}
|
||||
3 4 2.00 vrcp14pd %zmm3, %zmm16
|
||||
1 4 0.50 vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1}
|
||||
1 4 0.50 vmulpd %zmm16, %zmm21, %zmm18
|
||||
1 4 0.50 vmulpd %zmm18, %zmm16, %zmm18
|
||||
1 4 0.50 vmulpd %zmm18, %zmm16, %zmm18
|
||||
1 4 0.50 vaddpd %zmm1, %zmm18, %zmm31
|
||||
1 4 0.50 vmulpd %zmm16, %zmm22, %zmm16
|
||||
1 4 0.50 vmulpd %zmm31, %zmm16, %zmm16
|
||||
2 8 0.50 * vmovupd 464(%rsp), %zmm31
|
||||
1 4 0.50 vsubpd %zmm28, %zmm31, %zmm31
|
||||
1 1 0.50 shlb $5, %bpl
|
||||
1 1 0.25 orb %al, %bpl
|
||||
1 1 0.25 orb $-35, %bpl
|
||||
1 1 1.00 kmovd %ebp, %k1
|
||||
1 4 1.00 vcmpltpd %zmm0, %zmm3, %k1 {%k1}
|
||||
2 8 0.50 * vmovupd 208(%rsp), %zmm3
|
||||
1 4 0.50 vsubpd %zmm29, %zmm3, %zmm3
|
||||
1 4 0.50 vmulpd %zmm16, %zmm18, %zmm16
|
||||
1 4 0.50 vsubpd %zmm30, %zmm26, %zmm18
|
||||
1 4 0.50 vmulpd %zmm2, %zmm16, %zmm16
|
||||
1 4 0.50 vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1}
|
||||
1 4 0.50 vmulpd %zmm18, %zmm18, %zmm19
|
||||
1 4 0.50 vfmadd231pd %zmm3, %zmm3, %zmm19
|
||||
1 4 0.50 vfmadd231pd %zmm31, %zmm31, %zmm19
|
||||
1 4 0.50 vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1}
|
||||
3 4 2.00 vrcp14pd %zmm19, %zmm17
|
||||
1 4 0.50 vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1}
|
||||
1 4 0.50 vmulpd %zmm17, %zmm21, %zmm16
|
||||
1 4 0.50 vmulpd %zmm16, %zmm17, %zmm16
|
||||
1 4 0.50 vmulpd %zmm16, %zmm17, %zmm16
|
||||
1 4 0.50 vaddpd %zmm1, %zmm16, %zmm20
|
||||
1 4 0.50 vmulpd %zmm17, %zmm22, %zmm17
|
||||
1 4 0.50 vmulpd %zmm20, %zmm17, %zmm17
|
||||
1 4 0.50 vmulpd %zmm17, %zmm16, %zmm16
|
||||
1 1 0.50 leal (,%rdx,4), %eax
|
||||
1 1 0.50 shlb $6, %dil
|
||||
1 1 0.25 orb %al, %dil
|
||||
1 1 0.25 orb $-69, %dil
|
||||
1 1 1.00 kmovd %edi, %k1
|
||||
1 4 1.00 vcmpltpd %zmm0, %zmm19, %k1 {%k1}
|
||||
2 8 0.50 * vmovupd 400(%rsp), %zmm17
|
||||
1 4 0.50 vsubpd %zmm28, %zmm17, %zmm17
|
||||
1 4 0.50 vsubpd %zmm29, %zmm23, %zmm19
|
||||
1 4 0.50 vsubpd %zmm30, %zmm27, %zmm20
|
||||
1 4 0.50 vmulpd %zmm2, %zmm16, %zmm16
|
||||
1 4 0.50 vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1}
|
||||
1 4 0.50 vmulpd %zmm20, %zmm20, %zmm28
|
||||
1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm28
|
||||
1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm28
|
||||
1 4 0.50 vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1}
|
||||
3 4 2.00 vrcp14pd %zmm28, %zmm3
|
||||
1 4 0.50 vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1}
|
||||
1 4 0.50 vmulpd %zmm3, %zmm21, %zmm16
|
||||
1 4 0.50 vmulpd %zmm16, %zmm3, %zmm16
|
||||
1 4 0.50 vmulpd %zmm16, %zmm3, %zmm16
|
||||
1 4 0.50 vaddpd %zmm1, %zmm16, %zmm18
|
||||
1 4 0.50 vmulpd %zmm3, %zmm22, %zmm3
|
||||
1 4 0.50 vmulpd %zmm18, %zmm3, %zmm3
|
||||
1 4 0.50 vmulpd %zmm3, %zmm16, %zmm3
|
||||
1 1 0.50 shlb $3, %dl
|
||||
1 1 0.50 shlb $7, %cl
|
||||
1 1 0.25 orb %dl, %cl
|
||||
1 1 0.25 addb $-9, %cl
|
||||
1 1 1.00 kmovd %ecx, %k1
|
||||
1 4 1.00 vcmpltpd %zmm0, %zmm28, %k1 {%k1}
|
||||
1 4 0.50 vmulpd %zmm2, %zmm3, %zmm3
|
||||
1 4 0.50 vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1}
|
||||
1 4 0.50 vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1}
|
||||
1 4 0.50 vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1}
|
||||
1 1 0.25 incq %rbx
|
||||
1 1 0.25 cmpq %rbx, %r9
|
||||
1 1 0.50 jne .LBB5_12
|
||||
|
||||
|
||||
Resources:
|
||||
[0] - SKXDivider
|
||||
[1] - SKXFPDivider
|
||||
[2] - SKXPort0
|
||||
[3] - SKXPort1
|
||||
[4] - SKXPort2
|
||||
[5] - SKXPort3
|
||||
[6] - SKXPort4
|
||||
[7] - SKXPort5
|
||||
[8] - SKXPort6
|
||||
[9] - SKXPort7
|
||||
|
||||
|
||||
Resource pressure per iteration:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
|
||||
- - 45.53 20.45 5.50 5.50 - 44.64 18.38 -
|
||||
|
||||
Resource pressure by instruction:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
|
||||
- - - - 0.50 0.50 - - - - movslq (%r10,%rbx,4), %rcx
|
||||
- - - 0.99 - - - 0.01 - - leaq (%rcx,%rcx,2), %rdx
|
||||
- - 0.01 - - - - - 0.99 - shlq $6, %rdx
|
||||
- - 0.01 0.99 0.49 0.51 - - - - vmovupd (%rsi,%rdx), %zmm28
|
||||
- - 0.01 0.91 0.51 0.49 - 0.08 - - vmovupd 64(%rsi,%rdx), %zmm29
|
||||
- - 0.01 0.56 0.49 0.51 - 0.43 - - vmovupd 128(%rsi,%rdx), %zmm30
|
||||
- - - 0.99 0.50 0.50 - 0.01 - - vmovupd 16(%rsp), %zmm3
|
||||
- - 0.95 - - - - 0.05 - - vsubpd %zmm28, %zmm3, %zmm3
|
||||
- - 0.48 - - - - 0.52 - - vsubpd %zmm30, %zmm24, %zmm31
|
||||
- - - 1.00 0.50 0.50 - - - - vmovupd 336(%rsp), %zmm16
|
||||
- - 0.49 - - - - 0.51 - - vsubpd %zmm29, %zmm16, %zmm16
|
||||
- - 0.48 - - - - 0.52 - - vmulpd %zmm31, %zmm31, %zmm17
|
||||
- - 0.49 - - - - 0.51 - - vfmadd231pd %zmm16, %zmm16, %zmm17
|
||||
- - 0.04 - - - - 0.96 - - vfmadd231pd %zmm3, %zmm3, %zmm17
|
||||
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm17, %zmm18
|
||||
- - 1.00 - - - - - - - vmulpd %zmm18, %zmm21, %zmm19
|
||||
- - 0.51 - - - - 0.49 - - vmulpd %zmm19, %zmm18, %zmm19
|
||||
- - 0.49 - - - - 0.51 - - vmulpd %zmm19, %zmm18, %zmm19
|
||||
- - 0.07 - - - - 0.93 - - vaddpd %zmm1, %zmm19, %zmm20
|
||||
- - - - - - - 1.00 - - vmulpd %zmm18, %zmm22, %zmm18
|
||||
- - 0.95 - - - - 0.05 - - vmulpd %zmm20, %zmm18, %zmm18
|
||||
- - 0.92 - - - - 0.08 - - vsubpd %zmm30, %zmm25, %zmm20
|
||||
- - - 0.94 - - - 0.06 - - leal (%rcx,%rcx), %edx
|
||||
- - - - - - - - 1.00 - cmpq %rdx, %r11
|
||||
- - - - - - - - 1.00 - setne %dl
|
||||
- - 0.44 - - - - - 0.56 - sete %al
|
||||
- - - 0.07 - - - 0.02 0.91 - addl %ecx, %ecx
|
||||
- - - 0.53 - - - 0.46 0.01 - incl %ecx
|
||||
- - - 0.51 - - - 0.46 0.03 - cmpq %rcx, %r11
|
||||
- - 0.02 - - - - - 0.98 - sete %cl
|
||||
- - 0.94 - - - - 0.06 - - vmulpd %zmm18, %zmm19, %zmm18
|
||||
- - 0.01 0.99 0.51 0.49 - - - - vmovupd 528(%rsp), %zmm19
|
||||
- - 0.47 - - - - 0.53 - - vsubpd %zmm28, %zmm19, %zmm19
|
||||
- - 0.04 - - - - - 0.96 - setne %dil
|
||||
- - - 0.95 - - - 0.02 0.03 - movl %edi, %ebp
|
||||
- - 0.01 - - - - - 0.99 - shlb $4, %bpl
|
||||
- - - 0.96 - - - - 0.04 - subb %al, %bpl
|
||||
- - - 0.06 - - - - 0.94 - addb $-17, %bpl
|
||||
- - - - - - - 1.00 - - kmovd %ebp, %k1
|
||||
- - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm17, %k1 {%k1}
|
||||
- - 0.02 0.97 0.50 0.50 - 0.01 - - vmovupd 272(%rsp), %zmm17
|
||||
- - 0.96 - - - - 0.04 - - vsubpd %zmm29, %zmm17, %zmm17
|
||||
- - - 1.00 - - - - - - leal (%rdx,%rdx), %eax
|
||||
- - - 0.05 - - - - 0.95 - movl %edi, %ebp
|
||||
- - 0.51 - - - - 0.49 - - vmulpd %zmm2, %zmm18, %zmm18
|
||||
- - 0.53 - - - - 0.47 - - vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1}
|
||||
- - 0.45 - - - - 0.55 - - vmulpd %zmm20, %zmm20, %zmm3
|
||||
- - 0.94 - - - - 0.06 - - vfmadd231pd %zmm17, %zmm17, %zmm3
|
||||
- - 0.03 - - - - 0.97 - - vfmadd231pd %zmm19, %zmm19, %zmm3
|
||||
- - 0.47 - - - - 0.53 - - vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1}
|
||||
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm3, %zmm16
|
||||
- - 0.53 - - - - 0.47 - - vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1}
|
||||
- - 0.99 - - - - 0.01 - - vmulpd %zmm16, %zmm21, %zmm18
|
||||
- - 0.99 - - - - 0.01 - - vmulpd %zmm18, %zmm16, %zmm18
|
||||
- - 0.97 - - - - 0.03 - - vmulpd %zmm18, %zmm16, %zmm18
|
||||
- - 0.52 - - - - 0.48 - - vaddpd %zmm1, %zmm18, %zmm31
|
||||
- - 0.01 - - - - 0.99 - - vmulpd %zmm16, %zmm22, %zmm16
|
||||
- - 0.52 - - - - 0.48 - - vmulpd %zmm31, %zmm16, %zmm16
|
||||
- - - 0.99 0.50 0.50 - 0.01 - - vmovupd 464(%rsp), %zmm31
|
||||
- - 0.03 - - - - 0.97 - - vsubpd %zmm28, %zmm31, %zmm31
|
||||
- - 0.01 - - - - - 0.99 - shlb $5, %bpl
|
||||
- - - 0.94 - - - - 0.06 - orb %al, %bpl
|
||||
- - - 0.04 - - - - 0.96 - orb $-35, %bpl
|
||||
- - - - - - - 1.00 - - kmovd %ebp, %k1
|
||||
- - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm3, %k1 {%k1}
|
||||
- - - 0.99 0.50 0.50 - 0.01 - - vmovupd 208(%rsp), %zmm3
|
||||
- - 0.95 - - - - 0.05 - - vsubpd %zmm29, %zmm3, %zmm3
|
||||
- - 0.51 - - - - 0.49 - - vmulpd %zmm16, %zmm18, %zmm16
|
||||
- - 0.01 - - - - 0.99 - - vsubpd %zmm30, %zmm26, %zmm18
|
||||
- - 0.52 - - - - 0.48 - - vmulpd %zmm2, %zmm16, %zmm16
|
||||
- - 1.00 - - - - - - - vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1}
|
||||
- - 0.03 - - - - 0.97 - - vmulpd %zmm18, %zmm18, %zmm19
|
||||
- - 0.06 - - - - 0.94 - - vfmadd231pd %zmm3, %zmm3, %zmm19
|
||||
- - 0.50 - - - - 0.50 - - vfmadd231pd %zmm31, %zmm31, %zmm19
|
||||
- - - - - - - 1.00 - - vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1}
|
||||
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm19, %zmm17
|
||||
- - 1.00 - - - - - - - vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1}
|
||||
- - 0.07 - - - - 0.93 - - vmulpd %zmm17, %zmm21, %zmm16
|
||||
- - 0.50 - - - - 0.50 - - vmulpd %zmm16, %zmm17, %zmm16
|
||||
- - 0.09 - - - - 0.91 - - vmulpd %zmm16, %zmm17, %zmm16
|
||||
- - 0.07 - - - - 0.93 - - vaddpd %zmm1, %zmm16, %zmm20
|
||||
- - 0.93 - - - - 0.07 - - vmulpd %zmm17, %zmm22, %zmm17
|
||||
- - 0.94 - - - - 0.06 - - vmulpd %zmm20, %zmm17, %zmm17
|
||||
- - 0.51 - - - - 0.49 - - vmulpd %zmm17, %zmm16, %zmm16
|
||||
- - - 1.00 - - - - - - leal (,%rdx,4), %eax
|
||||
- - - - - - - - 1.00 - shlb $6, %dil
|
||||
- - - 0.02 - - - - 0.98 - orb %al, %dil
|
||||
- - - 0.48 - - - - 0.52 - orb $-69, %dil
|
||||
- - - - - - - 1.00 - - kmovd %edi, %k1
|
||||
- - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm19, %k1 {%k1}
|
||||
- - - 1.00 0.50 0.50 - - - - vmovupd 400(%rsp), %zmm17
|
||||
- - 0.49 - - - - 0.51 - - vsubpd %zmm28, %zmm17, %zmm17
|
||||
- - 0.49 - - - - 0.51 - - vsubpd %zmm29, %zmm23, %zmm19
|
||||
- - 0.02 - - - - 0.98 - - vsubpd %zmm30, %zmm27, %zmm20
|
||||
- - 0.98 - - - - 0.02 - - vmulpd %zmm2, %zmm16, %zmm16
|
||||
- - 0.50 - - - - 0.50 - - vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1}
|
||||
- - 0.94 - - - - 0.06 - - vmulpd %zmm20, %zmm20, %zmm28
|
||||
- - 0.04 - - - - 0.96 - - vfmadd231pd %zmm19, %zmm19, %zmm28
|
||||
- - 0.07 - - - - 0.93 - - vfmadd231pd %zmm17, %zmm17, %zmm28
|
||||
- - 0.50 - - - - 0.50 - - vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1}
|
||||
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm28, %zmm3
|
||||
- - 0.50 - - - - 0.50 - - vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1}
|
||||
- - 1.00 - - - - - - - vmulpd %zmm3, %zmm21, %zmm16
|
||||
- - 0.55 - - - - 0.45 - - vmulpd %zmm16, %zmm3, %zmm16
|
||||
- - 0.99 - - - - 0.01 - - vmulpd %zmm16, %zmm3, %zmm16
|
||||
- - 0.99 - - - - 0.01 - - vaddpd %zmm1, %zmm16, %zmm18
|
||||
- - - - - - - 1.00 - - vmulpd %zmm3, %zmm22, %zmm3
|
||||
- - 0.52 - - - - 0.48 - - vmulpd %zmm18, %zmm3, %zmm3
|
||||
- - 0.99 - - - - 0.01 - - vmulpd %zmm3, %zmm16, %zmm3
|
||||
- - - - - - - - 1.00 - shlb $3, %dl
|
||||
- - - - - - - - 1.00 - shlb $7, %cl
|
||||
- - - 1.00 - - - - - - orb %dl, %cl
|
||||
- - - 0.52 - - - - 0.48 - addb $-9, %cl
|
||||
- - - - - - - 1.00 - - kmovd %ecx, %k1
|
||||
- - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm28, %k1 {%k1}
|
||||
- - 0.98 - - - - 0.02 - - vmulpd %zmm2, %zmm3, %zmm3
|
||||
- - 0.97 - - - - 0.03 - - vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1}
|
||||
- - 0.03 - - - - 0.97 - - vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1}
|
||||
- - 0.97 - - - - 0.03 - - vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1}
|
||||
- - - 0.48 - - - - 0.52 - incq %rbx
|
||||
- - - 0.52 - - - - 0.48 - cmpq %rbx, %r9
|
||||
- - - - - - - - 1.00 - jne .LBB5_12
|
||||
@@ -1,167 +0,0 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: gromacs-icx-avx512-dp.s
|
||||
Architecture: ICX
|
||||
Timestamp: 2023-02-14 12:51:57
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD |
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
2241 | | | | | | | | | | || | | # pointer_increment=64 da67166e5736661e6b03ea29ee7bfd67
|
||||
2242 | | | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
2243 | | | | | | | | | | || | | .LBB5_12: # Parent Loop BB5_7 Depth=1
|
||||
2244 | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2
|
||||
2245 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | movslq (%r10,%rbx,4), %rcx
|
||||
2246 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || 6.0 | | leaq (%rcx,%rcx,2), %rdx
|
||||
2247 | 0.00 | | | | | | 1.00 | | | || 1.0 | | shlq $6, %rdx
|
||||
2248 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd (%rsi,%rdx), %zmm28 # AlignMOV convert to UnAlignMOV
|
||||
2249 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 64(%rsi,%rdx), %zmm29 # AlignMOV convert to UnAlignMOV
|
||||
2250 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovupd 128(%rsi,%rdx), %zmm30 # AlignMOV convert to UnAlignMOV
|
||||
2251 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 16(%rsp), %zmm3 # 64-byte Reload
|
||||
2252 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm28, %zmm3, %zmm3
|
||||
2253 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vsubpd %zmm30, %zmm24, %zmm31
|
||||
2254 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 336(%rsp), %zmm16 # 64-byte Reload
|
||||
2255 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm16, %zmm16
|
||||
2256 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm31, %zmm31, %zmm17
|
||||
2257 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm16, %zmm16, %zmm17 # zmm17 = (zmm16 * zmm16) + zmm17
|
||||
2258 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm3, %zmm3, %zmm17 # zmm17 = (zmm3 * zmm3) + zmm17
|
||||
2259 | 2.50 | | | | | 0.500 | | | | || 6.0 | | vrcp14pd %zmm17, %zmm18
|
||||
2260 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm18, %zmm21, %zmm19
|
||||
2261 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm19, %zmm18, %zmm19
|
||||
2262 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm19, %zmm18, %zmm19
|
||||
2263 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vaddpd %zmm1, %zmm19, %zmm20
|
||||
2264 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm18, %zmm22, %zmm18
|
||||
2265 | 0.75 | | | | | 0.250 | | | | || 4.0 | | vmulpd %zmm20, %zmm18, %zmm18
|
||||
2266 | 1.00 | | | | | 0.000 | | | | || | | vsubpd %zmm30, %zmm25, %zmm20
|
||||
2267 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | leal (%rcx,%rcx), %edx
|
||||
2268 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | cmpq %rdx, %r11
|
||||
2269 | 0.00 | | | | | | 1.00 | | | || | | setne %dl
|
||||
2270 | 0.00 | | | | | | 1.00 | | | || | | sete %al
|
||||
2271 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | addl %ecx, %ecx
|
||||
2272 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | incl %ecx
|
||||
2273 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | cmpq %rcx, %r11
|
||||
2274 | 0.00 | | | | | | 1.00 | | | || | | sete %cl
|
||||
2275 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm18, %zmm19, %zmm18
|
||||
2276 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 528(%rsp), %zmm19 # 64-byte Reload
|
||||
2277 | 1.00 | | | | | 0.000 | | | | || | | vsubpd %zmm28, %zmm19, %zmm19
|
||||
2278 | 0.00 | | | | | | 1.00 | | | || | | setne %dil
|
||||
2279 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | movl %edi, %ebp
|
||||
2280 | 0.00 | | | | | | 1.00 | | | || | 1.0 | shlb $4, %bpl
|
||||
2281 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | 1.0 | subb %al, %bpl
|
||||
2282 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | 1.0 | addb $-17, %bpl
|
||||
2283 | 1.00 | | | | | | | | | || | | kmovd %ebp, %k1
|
||||
2284 | 0.50 | | | | | 0.500 | | | | || | | vcmpltpd %zmm0, %zmm17, %k1 {%k1}
|
||||
2285 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 272(%rsp), %zmm17 # 64-byte Reload
|
||||
2286 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm17, %zmm17
|
||||
2287 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | leal (%rdx,%rdx), %eax
|
||||
2288 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | movl %edi, %ebp
|
||||
2289 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm2, %zmm18, %zmm18
|
||||
2290 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14
|
||||
2291 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm20, %zmm20, %zmm3
|
||||
2292 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm17, %zmm17, %zmm3 # zmm3 = (zmm17 * zmm17) + zmm3
|
||||
2293 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm19, %zmm19, %zmm3 # zmm3 = (zmm19 * zmm19) + zmm3
|
||||
2294 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11
|
||||
2295 | 2.50 | | | | | 0.500 | | | | || | | vrcp14pd %zmm3, %zmm16
|
||||
2296 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7
|
||||
2297 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm21, %zmm18
|
||||
2298 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm18, %zmm16, %zmm18
|
||||
2299 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm18, %zmm16, %zmm18
|
||||
2300 | 0.50 | | | | | 0.500 | | | | || | | vaddpd %zmm1, %zmm18, %zmm31
|
||||
2301 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm22, %zmm16
|
||||
2302 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm31, %zmm16, %zmm16
|
||||
2303 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 464(%rsp), %zmm31 # 64-byte Reload
|
||||
2304 | 0.75 | | | | | 0.250 | | | | || | | vsubpd %zmm28, %zmm31, %zmm31
|
||||
2305 | 0.00 | | | | | | 1.00 | | | || | 1.0 | shlb $5, %bpl
|
||||
2306 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | 1.0 | orb %al, %bpl
|
||||
2307 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | 1.0 | orb $-35, %bpl
|
||||
2308 | 1.00 | | | | | | | | | || | | kmovd %ebp, %k1
|
||||
2309 | 0.50 | | | | | 0.500 | | | | || | | vcmpltpd %zmm0, %zmm3, %k1 {%k1}
|
||||
2310 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 208(%rsp), %zmm3 # 64-byte Reload
|
||||
2311 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm3, %zmm3
|
||||
2312 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm18, %zmm16
|
||||
2313 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm30, %zmm26, %zmm18
|
||||
2314 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm2, %zmm16, %zmm16
|
||||
2315 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15
|
||||
2316 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm18, %zmm18, %zmm19
|
||||
2317 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm3, %zmm3, %zmm19 # zmm19 = (zmm3 * zmm3) + zmm19
|
||||
2318 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm31, %zmm31, %zmm19 # zmm19 = (zmm31 * zmm31) + zmm19
|
||||
2319 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10
|
||||
2320 | 2.50 | | | | | 0.500 | | | | || | | vrcp14pd %zmm19, %zmm17
|
||||
2321 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6
|
||||
2322 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm17, %zmm21, %zmm16
|
||||
2323 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm17, %zmm16
|
||||
2324 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm17, %zmm16
|
||||
2325 | 0.50 | | | | | 0.500 | | | | || | | vaddpd %zmm1, %zmm16, %zmm20
|
||||
2326 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm17, %zmm22, %zmm17
|
||||
2327 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm20, %zmm17, %zmm17
|
||||
2328 | 0.75 | | | | | 0.250 | | | | || | | vmulpd %zmm17, %zmm16, %zmm16
|
||||
2329 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | leal (,%rdx,4), %eax
|
||||
2330 | 0.00 | | | | | | 1.00 | | | || | | shlb $6, %dil
|
||||
2331 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | orb %al, %dil
|
||||
2332 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | orb $-69, %dil
|
||||
2333 | 1.00 | | | | | | | | | || | | kmovd %edi, %k1
|
||||
2334 | 0.50 | | | | | 0.500 | | | | || | | vcmpltpd %zmm0, %zmm19, %k1 {%k1}
|
||||
2335 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 400(%rsp), %zmm17 # 64-byte Reload
|
||||
2336 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm28, %zmm17, %zmm17
|
||||
2337 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm23, %zmm19
|
||||
2338 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm30, %zmm27, %zmm20
|
||||
2339 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm2, %zmm16, %zmm16
|
||||
2340 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13
|
||||
2341 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm20, %zmm20, %zmm28
|
||||
2342 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm19, %zmm19, %zmm28 # zmm28 = (zmm19 * zmm19) + zmm28
|
||||
2343 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm17, %zmm17, %zmm28 # zmm28 = (zmm17 * zmm17) + zmm28
|
||||
2344 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9
|
||||
2345 | 2.00 | | | | | 1.000 | | | | || | | vrcp14pd %zmm28, %zmm3
|
||||
2346 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5
|
||||
2347 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm3, %zmm21, %zmm16
|
||||
2348 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm16, %zmm3, %zmm16
|
||||
2349 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm16, %zmm3, %zmm16
|
||||
2350 | 0.00 | | | | | 1.000 | | | | || | | vaddpd %zmm1, %zmm16, %zmm18
|
||||
2351 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm3, %zmm22, %zmm3
|
||||
2352 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm18, %zmm3, %zmm3
|
||||
2353 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm3, %zmm16, %zmm3
|
||||
2354 | 0.00 | | | | | | 1.00 | | | || | | shlb $3, %dl
|
||||
2355 | 0.00 | | | | | | 1.00 | | | || | | shlb $7, %cl
|
||||
2356 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | orb %dl, %cl
|
||||
2357 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | addb $-9, %cl
|
||||
2358 | 1.00 | | | | | | | | | || | | kmovd %ecx, %k1
|
||||
2359 | 0.00 | | | | | 1.000 | | | | || | | vcmpltpd %zmm0, %zmm28, %k1 {%k1}
|
||||
2360 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm2, %zmm3, %zmm3
|
||||
2361 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12
|
||||
2362 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8
|
||||
2363 | 0.24 | | | | | 0.760 | | | | || | | vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4
|
||||
2364 | 0.00 | 1.00 | | | | -0.01 | 0.00 | | | || | | incq %rbx
|
||||
2365 | 0.00 | 1.00 | | | | -0.01 | 0.00 | | | || | | cmpq %rbx, %r9
|
||||
2366 | | | | | | | | | | || | | * jne .LBB5_12
|
||||
2367 | | | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
44.0 15.0 5.50 5.50 5.50 5.50 43.99 15.0 71 6.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
2280 | 6.0 | shlb $4, %bpl | [2280, 2281, 2282, 2305, 2306, 2307]
|
||||
2363 | 4.0 | vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4| [2363]
|
||||
2362 | 4.0 | vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8| [2362]
|
||||
2361 | 4.0 | vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12| [2361]
|
||||
2346 | 4.0 | vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5| [2346]
|
||||
2344 | 4.0 | vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9| [2344]
|
||||
2340 | 4.0 | vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13| [2340]
|
||||
2321 | 4.0 | vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6| [2321]
|
||||
2319 | 4.0 | vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10| [2319]
|
||||
2315 | 4.0 | vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15| [2315]
|
||||
2296 | 4.0 | vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7| [2296]
|
||||
2294 | 4.0 | vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11| [2294]
|
||||
2290 | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14| [2290]
|
||||
2330 | 3.0 | shlb $6, %dil | [2330, 2331, 2332]
|
||||
2364 | 1.0 | incq %rbx | [2364]
|
||||
|
||||
@@ -1,167 +0,0 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: gromacs-icx-avx512-dp.s
|
||||
Architecture: CSX
|
||||
Timestamp: 2023-02-10 16:30:53
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
2241 | | | | | | | | || | | # pointer_increment=64 da67166e5736661e6b03ea29ee7bfd67
|
||||
2242 | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
2243 | | | | | | | | || | | .LBB5_12: # Parent Loop BB5_7 Depth=1
|
||||
2244 | | | | | | | | || | | # => This Inner Loop Header: Depth=2
|
||||
2245 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | movslq (%r10,%rbx,4), %rcx
|
||||
2246 | | 1.00 | | | | 0.00 | | || 1.0 | | leaq (%rcx,%rcx,2), %rdx
|
||||
2247 | 0.00 | | | | | | 1.00 | || 1.0 | | shlq $6, %rdx
|
||||
2248 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd (%rsi,%rdx), %zmm28 # AlignMOV convert to UnAlignMOV
|
||||
2249 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 64(%rsi,%rdx), %zmm29 # AlignMOV convert to UnAlignMOV
|
||||
2250 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovupd 128(%rsi,%rdx), %zmm30 # AlignMOV convert to UnAlignMOV
|
||||
2251 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 16(%rsp), %zmm3 # 64-byte Reload
|
||||
2252 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm28, %zmm3, %zmm3
|
||||
2253 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm30, %zmm24, %zmm31
|
||||
2254 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 336(%rsp), %zmm16 # 64-byte Reload
|
||||
2255 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm16, %zmm16
|
||||
2256 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm31, %zmm31, %zmm17
|
||||
2257 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm16, %zmm16, %zmm17 # zmm17 = (zmm16 * zmm16) + zmm17
|
||||
2258 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm3, %zmm3, %zmm17 # zmm17 = (zmm3 * zmm3) + zmm17
|
||||
2259 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm17, %zmm18
|
||||
2260 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm18, %zmm21, %zmm19
|
||||
2261 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm19, %zmm18, %zmm19
|
||||
2262 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm19, %zmm18, %zmm19
|
||||
2263 | 0.50 | | | | | 0.50 | | || 4.0 | | vaddpd %zmm1, %zmm19, %zmm20
|
||||
2264 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm22, %zmm18
|
||||
2265 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm18, %zmm18
|
||||
2266 | 1.00 | | | | | 0.00 | | || | | vsubpd %zmm30, %zmm25, %zmm20
|
||||
2267 | | 1.00 | | | | 0.00 | | || | | leal (%rcx,%rcx), %edx
|
||||
2268 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rdx, %r11
|
||||
2269 | 0.00 | | | | | | 1.00 | || | | setne %dl
|
||||
2270 | 0.00 | | | | | | 1.00 | || | | sete %al
|
||||
2271 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | addl %ecx, %ecx
|
||||
2272 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | incl %ecx
|
||||
2273 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rcx, %r11
|
||||
2274 | 0.00 | | | | | | 1.00 | || | | sete %cl
|
||||
2275 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm18, %zmm19, %zmm18
|
||||
2276 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 528(%rsp), %zmm19 # 64-byte Reload
|
||||
2277 | 1.00 | | | | | 0.00 | | || | | vsubpd %zmm28, %zmm19, %zmm19
|
||||
2278 | 0.00 | | | | | | 1.00 | || | | setne %dil
|
||||
2279 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %edi, %ebp
|
||||
2280 | 0.00 | | | | | | 1.00 | || | 1.0 | shlb $4, %bpl
|
||||
2281 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | 1.0 | subb %al, %bpl
|
||||
2282 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | 1.0 | addb $-17, %bpl
|
||||
2283 | 1.00 | | | | | | | || | | kmovd %ebp, %k1
|
||||
2284 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm17, %k1 {%k1}
|
||||
2285 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 272(%rsp), %zmm17 # 64-byte Reload
|
||||
2286 | 0.25 | | | | | 0.75 | | || | | vsubpd %zmm29, %zmm17, %zmm17
|
||||
2287 | | 1.00 | | | | 0.00 | | || | | leal (%rdx,%rdx), %eax
|
||||
2288 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %edi, %ebp
|
||||
2289 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm2, %zmm18, %zmm18
|
||||
2290 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14
|
||||
2291 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm20, %zmm3
|
||||
2292 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm17, %zmm17, %zmm3 # zmm3 = (zmm17 * zmm17) + zmm3
|
||||
2293 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm19, %zmm19, %zmm3 # zmm3 = (zmm19 * zmm19) + zmm3
|
||||
2294 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11
|
||||
2295 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm3, %zmm16
|
||||
2296 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7
|
||||
2297 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm21, %zmm18
|
||||
2298 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm16, %zmm18
|
||||
2299 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm16, %zmm18
|
||||
2300 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm1, %zmm18, %zmm31
|
||||
2301 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm22, %zmm16
|
||||
2302 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm31, %zmm16, %zmm16
|
||||
2303 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 464(%rsp), %zmm31 # 64-byte Reload
|
||||
2304 | 0.75 | | | | | 0.25 | | || | | vsubpd %zmm28, %zmm31, %zmm31
|
||||
2305 | 0.00 | | | | | | 1.00 | || | 1.0 | shlb $5, %bpl
|
||||
2306 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | 1.0 | orb %al, %bpl
|
||||
2307 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | 1.0 | orb $-35, %bpl
|
||||
2308 | 1.00 | | | | | | | || | | kmovd %ebp, %k1
|
||||
2309 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm3, %k1 {%k1}
|
||||
2310 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 208(%rsp), %zmm3 # 64-byte Reload
|
||||
2311 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm3, %zmm3
|
||||
2312 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm18, %zmm16
|
||||
2313 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm26, %zmm18
|
||||
2314 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm2, %zmm16, %zmm16
|
||||
2315 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15
|
||||
2316 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm18, %zmm19
|
||||
2317 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm3, %zmm3, %zmm19 # zmm19 = (zmm3 * zmm3) + zmm19
|
||||
2318 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm31, %zmm31, %zmm19 # zmm19 = (zmm31 * zmm31) + zmm19
|
||||
2319 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10
|
||||
2320 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm19, %zmm17
|
||||
2321 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6
|
||||
2322 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm21, %zmm16
|
||||
2323 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm17, %zmm16
|
||||
2324 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm17, %zmm16
|
||||
2325 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm1, %zmm16, %zmm20
|
||||
2326 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm22, %zmm17
|
||||
2327 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm17, %zmm17
|
||||
2328 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm16, %zmm16
|
||||
2329 | | 1.00 | | | | 0.00 | | || | | leal (,%rdx,4), %eax
|
||||
2330 | 0.00 | | | | | | 1.00 | || | | shlb $6, %dil
|
||||
2331 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | orb %al, %dil
|
||||
2332 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | orb $-69, %dil
|
||||
2333 | 1.00 | | | | | | | || | | kmovd %edi, %k1
|
||||
2334 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm19, %k1 {%k1}
|
||||
2335 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 400(%rsp), %zmm17 # 64-byte Reload
|
||||
2336 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm28, %zmm17, %zmm17
|
||||
2337 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm23, %zmm19
|
||||
2338 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm27, %zmm20
|
||||
2339 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm2, %zmm16, %zmm16
|
||||
2340 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13
|
||||
2341 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm20, %zmm28
|
||||
2342 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm19, %zmm19, %zmm28 # zmm28 = (zmm19 * zmm19) + zmm28
|
||||
2343 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm17, %zmm17, %zmm28 # zmm28 = (zmm17 * zmm17) + zmm28
|
||||
2344 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9
|
||||
2345 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm28, %zmm3
|
||||
2346 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5
|
||||
2347 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm21, %zmm16
|
||||
2348 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm16, %zmm3, %zmm16
|
||||
2349 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm16, %zmm3, %zmm16
|
||||
2350 | 0.00 | | | | | 1.00 | | || | | vaddpd %zmm1, %zmm16, %zmm18
|
||||
2351 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm22, %zmm3
|
||||
2352 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm18, %zmm3, %zmm3
|
||||
2353 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm16, %zmm3
|
||||
2354 | 0.00 | | | | | | 1.00 | || | | shlb $3, %dl
|
||||
2355 | 0.00 | | | | | | 1.00 | || | | shlb $7, %cl
|
||||
2356 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | | orb %dl, %cl
|
||||
2357 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | addb $-9, %cl
|
||||
2358 | 1.00 | | | | | | | || | | kmovd %ecx, %k1
|
||||
2359 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm28, %k1 {%k1}
|
||||
2360 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm2, %zmm3, %zmm3
|
||||
2361 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12
|
||||
2362 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8
|
||||
2363 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4
|
||||
2364 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | incq %rbx
|
||||
2365 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rbx, %r9
|
||||
2366 | | | | | | | | || | | * jne .LBB5_12
|
||||
2367 | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
44.0 15.0 5.50 5.50 5.50 5.50 44.0 15.0 66.0 6.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
2280 | 6.0 | shlb $4, %bpl | [2280, 2281, 2282, 2305, 2306, 2307]
|
||||
2363 | 4.0 | vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4| [2363]
|
||||
2362 | 4.0 | vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8| [2362]
|
||||
2361 | 4.0 | vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12| [2361]
|
||||
2346 | 4.0 | vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5| [2346]
|
||||
2344 | 4.0 | vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9| [2344]
|
||||
2340 | 4.0 | vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13| [2340]
|
||||
2321 | 4.0 | vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6| [2321]
|
||||
2319 | 4.0 | vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10| [2319]
|
||||
2315 | 4.0 | vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15| [2315]
|
||||
2296 | 4.0 | vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7| [2296]
|
||||
2294 | 4.0 | vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11| [2294]
|
||||
2290 | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14| [2290]
|
||||
2330 | 3.0 | shlb $6, %dil | [2330, 2331, 2332]
|
||||
2364 | 1.0 | incq %rbx | [2364]
|
||||
|
||||
@@ -1,162 +0,0 @@
|
||||
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||
Analyzed File - gromacs-icx-avx512-sp.o
|
||||
Binary Format - 64Bit
|
||||
Architecture - SKX
|
||||
Analysis Type - Throughput
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
Block Throughput: 64.00 Cycles Throughput Bottleneck: Backend
|
||||
Loop Count: 22
|
||||
Port Binding In Cycles Per Iteration:
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Cycles | 50.0 0.0 | 7.0 | 9.5 8.1 | 9.5 7.9 | 3.0 | 50.0 | 7.0 | 0.0 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
|
||||
DV - Divider pipe (on port 0)
|
||||
D - Data fetch pipe (on ports 2 and 3)
|
||||
F - Macro Fusion with the previous instruction occurred
|
||||
* - instruction micro-ops not bound to a port
|
||||
^ - Micro Fusion occurred
|
||||
# - ESP Tracking sync uop was issued
|
||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||
X - instruction not supported, was not accounted in Analysis
|
||||
|
||||
| Num Of | Ports pressure in cycles | |
|
||||
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
-----------------------------------------------------------------------------------------
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | movsxd rax, dword ptr [r11+rdx*4]
|
||||
| 1* | | | | | | | | | mov rsi, rax
|
||||
| 1 | | | | | | | 1.0 | | shl rsi, 0x5
|
||||
| 1 | | 1.0 | | | | | | | lea rbx, ptr [rsi+rsi*2]
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm15, zmmword ptr [rdi+rbx*1]
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm16, zmmword ptr [rdi+rbx*1+0x20]
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm27, zmmword ptr [rdi+rbx*1+0x40]
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x80]
|
||||
| 1 | | | | | | 1.0 | | | vsubps zmm24, zmm1, zmm15
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x140]
|
||||
| 1 | 1.0 | | | | | | | | vsubps zmm25, zmm1, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vsubps zmm26, zmm9, zmm27
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp]
|
||||
| 1 | 1.0 | | | | | | | | vsubps zmm21, zmm1, zmm15
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x100]
|
||||
| 1 | | | | | | 1.0 | | | vsubps zmm22, zmm1, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vsubps zmm23, zmm10, zmm27
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x1c0]
|
||||
| 1 | | | | | | 1.0 | | | vsubps zmm17, zmm1, zmm15
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0xc0]
|
||||
| 1 | 1.0 | | | | | | | | vsubps zmm19, zmm1, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vsubps zmm20, zmm11, zmm27
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x180]
|
||||
| 1 | 1.0 | | | | | | | | vsubps zmm18, zmm1, zmm15
|
||||
| 1 | | | | | | 1.0 | | | vsubps zmm16, zmm8, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vsubps zmm15, zmm12, zmm27
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm27, zmm26, zmm26
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231ps zmm27, zmm25, zmm25
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm27, zmm24, zmm24
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm28, zmm23, zmm23
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm28, zmm22, zmm22
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231ps zmm28, zmm21, zmm21
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm29, zmm20, zmm20
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231ps zmm29, zmm19, zmm19
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm29, zmm17, zmm17
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm30, zmm15, zmm15
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm30, zmm16, zmm16
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm31, zmm27
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm1, zmm28
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm2, zmm29
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm30, zmm18, zmm18
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm3, zmm30
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm4, zmm6, zmm31
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm4, zmm31, zmm4
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm4, zmm31, zmm4
|
||||
| 1 | 1.0 | | | | | | | | vaddps zmm5, zmm4, zmm13
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm7, zmm31
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm5, zmm31, zmm5
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm6, zmm1
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm31, zmm1, zmm31
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm1, zmm31
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm4, zmm4, zmm5
|
||||
| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm31, zmm13
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm1, zmm7, zmm1
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm1, zmm1, zmm5
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm5, zmm6, zmm2
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm5, zmm2, zmm5
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm5, zmm2, zmm5
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm1, zmm31, zmm1
|
||||
| 1 | 1.0 | | | | | | | | vaddps zmm31, zmm5, zmm13
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm2, zmm7, zmm2
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm2, zmm2, zmm31
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm6, zmm3
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm31, zmm3, zmm31
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm3, zmm31
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm2, zmm5, zmm2
|
||||
| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm31, zmm13
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm3, zmm7, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm3, zmm3, zmm5
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm3, zmm31, zmm3
|
||||
| 1* | | | | | | | | | xor esi, esi
|
||||
| 1* | | | | | | | | | xor edi, edi
|
||||
| 1 | | 1.0 | | | | | | | test eax, 0x7fffffff
|
||||
| 1 | | | | | | | 1.0 | | setz sil
|
||||
| 1 | | | | | | | 1.0 | | setnz dil
|
||||
| 1 | | 1.0 | | | | | | | mov eax, 0xff
|
||||
| 1 | | | | | | | 1.0 | | cmovz eax, r8d
|
||||
| 1 | | 1.0 | | | | | | | mov ecx, 0xff
|
||||
| 1 | | | | | | | 1.0 | | cmovz ecx, r9d
|
||||
| 1 | | 1.0 | | | | | | | xor esi, 0xff
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, esi
|
||||
| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm27, zmm0, 0x1
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm4, zmm4, zmm14
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm5{k1}{z}, zmm24, zmm4
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm24{k1}{z}, zmm25, zmm4
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm4{k1}{z}, zmm26, zmm4
|
||||
| 1 | | 1.0 | | | | | | | lea esi, ptr [rdi+rdi*2]
|
||||
| 1 | | | | | | | 1.0 | | or esi, 0xfc
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, esi
|
||||
| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm28, zmm0, 0x1
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm1, zmm1, zmm14
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm21{k1}{z}, zmm21, zmm1
|
||||
| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm5, zmm21
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm21{k1}{z}, zmm22, zmm1
|
||||
| 1 | | | | | | 1.0 | | | vaddps zmm21, zmm24, zmm21
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm1{k1}{z}, zmm23, zmm1
|
||||
| 1 | | | | | | 1.0 | | | vaddps zmm1, zmm4, zmm1
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, eax
|
||||
| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm29, zmm0, 0x1
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm2, zmm2, zmm14
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm4{k1}{z}, zmm17, zmm2
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm17{k1}{z}, zmm19, zmm2
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm2{k1}{z}, zmm20, zmm2
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, ecx
|
||||
| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm30, zmm0, 0x1
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm3, zmm3, zmm14
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm18{k1}{z}, zmm18, zmm3
|
||||
| 1 | 1.0 | | | | | | | | vaddps zmm4, zmm4, zmm18
|
||||
| 1 | | | | | | 1.0 | | | vaddps zmm4, zmm5, zmm4
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm5{k1}{z}, zmm16, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm17, zmm5
|
||||
| 1 | 1.0 | | | | | | | | vaddps zmm5, zmm21, zmm5
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm3{k1}{z}, zmm15, zmm3
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov rax, qword ptr [r15+0xb0]
|
||||
| 1 | 1.0 | | | | | | | | vaddps zmm2, zmm2, zmm3
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm3, zmmword ptr [rax+rbx*1]
|
||||
| 1 | | | | | | 1.0 | | | vsubps zmm3, zmm3, zmm4
|
||||
| 2 | | | 0.5 | 0.5 | 1.0 | | | | vmovups zmmword ptr [rax+rbx*1], zmm3
|
||||
| 1 | 1.0 | | | | | | | | vaddps zmm1, zmm1, zmm2
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm2, zmmword ptr [rax+rbx*1+0x20]
|
||||
| 1 | | | | | | 1.0 | | | vsubps zmm2, zmm2, zmm5
|
||||
| 2 | | | 0.5 | 0.5 | 1.0 | | | | vmovups zmmword ptr [rax+rbx*1+0x20], zmm2
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm2, zmmword ptr [rax+rbx*1+0x40]
|
||||
| 1 | 1.0 | | | | | | | | vsubps zmm1, zmm2, zmm1
|
||||
| 2 | | | 0.5 | 0.5 | 1.0 | | | | vmovups zmmword ptr [rax+rbx*1+0x40], zmm1
|
||||
| 1* | | | | | | | | | cmp r10, rdx
|
||||
| 0*F | | | | | | | | | jz 0x34
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov rdi, qword ptr [r15+0xa0]
|
||||
| 1 | | 1.0 | | | | | | | inc rdx
|
||||
| 1 | | | | | | | 1.0 | | jmp 0xfffffffffffffcfc
|
||||
Total Num Of Uops: 140
|
||||
Analysis Notes:
|
||||
Backend allocation was stalled due to unavailable allocation resources.
|
||||
@@ -1,304 +0,0 @@
|
||||
|
||||
[0] Code Region
|
||||
|
||||
Iterations: 100
|
||||
Instructions: 13000
|
||||
Total Cycles: 5640
|
||||
Total uOps: 15400
|
||||
|
||||
Dispatch Width: 6
|
||||
uOps Per Cycle: 2.73
|
||||
IPC: 2.30
|
||||
Block RThroughput: 40.0
|
||||
|
||||
|
||||
Instruction Info:
|
||||
[1]: #uOps
|
||||
[2]: Latency
|
||||
[3]: RThroughput
|
||||
[4]: MayLoad
|
||||
[5]: MayStore
|
||||
[6]: HasSideEffects (U)
|
||||
|
||||
[1] [2] [3] [4] [5] [6] Instructions:
|
||||
1 5 0.50 * movslq (%r11,%rdx,4), %rax
|
||||
1 1 0.25 movq %rax, %rsi
|
||||
1 1 0.50 shlq $5, %rsi
|
||||
1 1 0.50 leaq (%rsi,%rsi,2), %rbx
|
||||
2 8 0.50 * vmovups (%rdi,%rbx), %zmm15
|
||||
2 8 0.50 * vmovups 32(%rdi,%rbx), %zmm16
|
||||
2 8 0.50 * vmovups 64(%rdi,%rbx), %zmm27
|
||||
2 8 0.50 * vmovups 128(%rsp), %zmm1
|
||||
1 4 0.50 vsubps %zmm15, %zmm1, %zmm24
|
||||
2 8 0.50 * vmovups 320(%rsp), %zmm1
|
||||
1 4 0.50 vsubps %zmm16, %zmm1, %zmm25
|
||||
1 4 0.50 vsubps %zmm27, %zmm9, %zmm26
|
||||
2 8 0.50 * vmovups (%rsp), %zmm1
|
||||
1 4 0.50 vsubps %zmm15, %zmm1, %zmm21
|
||||
2 8 0.50 * vmovups 256(%rsp), %zmm1
|
||||
1 4 0.50 vsubps %zmm16, %zmm1, %zmm22
|
||||
1 4 0.50 vsubps %zmm27, %zmm10, %zmm23
|
||||
2 8 0.50 * vmovups 448(%rsp), %zmm1
|
||||
1 4 0.50 vsubps %zmm15, %zmm1, %zmm17
|
||||
2 8 0.50 * vmovups 192(%rsp), %zmm1
|
||||
1 4 0.50 vsubps %zmm16, %zmm1, %zmm19
|
||||
1 4 0.50 vsubps %zmm27, %zmm11, %zmm20
|
||||
2 8 0.50 * vmovups 384(%rsp), %zmm1
|
||||
1 4 0.50 vsubps %zmm15, %zmm1, %zmm18
|
||||
1 4 0.50 vsubps %zmm16, %zmm8, %zmm16
|
||||
1 4 0.50 vsubps %zmm27, %zmm12, %zmm15
|
||||
1 4 0.50 vmulps %zmm26, %zmm26, %zmm27
|
||||
1 4 0.50 vfmadd231ps %zmm25, %zmm25, %zmm27
|
||||
1 4 0.50 vfmadd231ps %zmm24, %zmm24, %zmm27
|
||||
1 4 0.50 vmulps %zmm23, %zmm23, %zmm28
|
||||
1 4 0.50 vfmadd231ps %zmm22, %zmm22, %zmm28
|
||||
1 4 0.50 vfmadd231ps %zmm21, %zmm21, %zmm28
|
||||
1 4 0.50 vmulps %zmm20, %zmm20, %zmm29
|
||||
1 4 0.50 vfmadd231ps %zmm19, %zmm19, %zmm29
|
||||
1 4 0.50 vfmadd231ps %zmm17, %zmm17, %zmm29
|
||||
1 4 0.50 vmulps %zmm15, %zmm15, %zmm30
|
||||
1 4 0.50 vfmadd231ps %zmm16, %zmm16, %zmm30
|
||||
3 4 2.00 vrcp14ps %zmm27, %zmm31
|
||||
3 4 2.00 vrcp14ps %zmm28, %zmm1
|
||||
3 4 2.00 vrcp14ps %zmm29, %zmm2
|
||||
1 4 0.50 vfmadd231ps %zmm18, %zmm18, %zmm30
|
||||
3 4 2.00 vrcp14ps %zmm30, %zmm3
|
||||
1 4 0.50 vmulps %zmm31, %zmm6, %zmm4
|
||||
1 4 0.50 vmulps %zmm4, %zmm31, %zmm4
|
||||
1 4 0.50 vmulps %zmm4, %zmm31, %zmm4
|
||||
1 4 0.50 vaddps %zmm13, %zmm4, %zmm5
|
||||
1 4 0.50 vmulps %zmm31, %zmm7, %zmm31
|
||||
1 4 0.50 vmulps %zmm5, %zmm31, %zmm5
|
||||
1 4 0.50 vmulps %zmm1, %zmm6, %zmm31
|
||||
1 4 0.50 vmulps %zmm31, %zmm1, %zmm31
|
||||
1 4 0.50 vmulps %zmm31, %zmm1, %zmm31
|
||||
1 4 0.50 vmulps %zmm5, %zmm4, %zmm4
|
||||
1 4 0.50 vaddps %zmm13, %zmm31, %zmm5
|
||||
1 4 0.50 vmulps %zmm1, %zmm7, %zmm1
|
||||
1 4 0.50 vmulps %zmm5, %zmm1, %zmm1
|
||||
1 4 0.50 vmulps %zmm2, %zmm6, %zmm5
|
||||
1 4 0.50 vmulps %zmm5, %zmm2, %zmm5
|
||||
1 4 0.50 vmulps %zmm5, %zmm2, %zmm5
|
||||
1 4 0.50 vmulps %zmm1, %zmm31, %zmm1
|
||||
1 4 0.50 vaddps %zmm13, %zmm5, %zmm31
|
||||
1 4 0.50 vmulps %zmm2, %zmm7, %zmm2
|
||||
1 4 0.50 vmulps %zmm31, %zmm2, %zmm2
|
||||
1 4 0.50 vmulps %zmm3, %zmm6, %zmm31
|
||||
1 4 0.50 vmulps %zmm31, %zmm3, %zmm31
|
||||
1 4 0.50 vmulps %zmm31, %zmm3, %zmm31
|
||||
1 4 0.50 vmulps %zmm2, %zmm5, %zmm2
|
||||
1 4 0.50 vaddps %zmm13, %zmm31, %zmm5
|
||||
1 4 0.50 vmulps %zmm3, %zmm7, %zmm3
|
||||
1 4 0.50 vmulps %zmm5, %zmm3, %zmm3
|
||||
1 4 0.50 vmulps %zmm3, %zmm31, %zmm3
|
||||
1 0 0.17 xorl %esi, %esi
|
||||
1 0 0.17 xorl %edi, %edi
|
||||
1 1 0.25 testl $2147483647, %eax
|
||||
1 1 0.50 sete %sil
|
||||
1 1 0.50 setne %dil
|
||||
1 1 0.25 movl $255, %eax
|
||||
1 1 0.50 cmovel %r8d, %eax
|
||||
1 1 0.25 movl $255, %ecx
|
||||
1 1 0.50 cmovel %r9d, %ecx
|
||||
1 1 0.25 xorl $255, %esi
|
||||
1 1 1.00 kmovd %esi, %k1
|
||||
1 4 1.00 vcmpltps %zmm0, %zmm27, %k1 {%k1}
|
||||
1 4 0.50 vmulps %zmm14, %zmm4, %zmm4
|
||||
1 4 0.50 vmulps %zmm4, %zmm24, %zmm5 {%k1} {z}
|
||||
1 4 0.50 vmulps %zmm4, %zmm25, %zmm24 {%k1} {z}
|
||||
1 4 0.50 vmulps %zmm4, %zmm26, %zmm4 {%k1} {z}
|
||||
1 1 0.50 leal (%rdi,%rdi,2), %esi
|
||||
1 1 0.25 orl $252, %esi
|
||||
1 1 1.00 kmovd %esi, %k1
|
||||
1 4 1.00 vcmpltps %zmm0, %zmm28, %k1 {%k1}
|
||||
1 4 0.50 vmulps %zmm14, %zmm1, %zmm1
|
||||
1 4 0.50 vmulps %zmm1, %zmm21, %zmm21 {%k1} {z}
|
||||
1 4 0.50 vaddps %zmm21, %zmm5, %zmm5
|
||||
1 4 0.50 vmulps %zmm1, %zmm22, %zmm21 {%k1} {z}
|
||||
1 4 0.50 vaddps %zmm21, %zmm24, %zmm21
|
||||
1 4 0.50 vmulps %zmm1, %zmm23, %zmm1 {%k1} {z}
|
||||
1 4 0.50 vaddps %zmm1, %zmm4, %zmm1
|
||||
1 1 1.00 kmovd %eax, %k1
|
||||
1 4 1.00 vcmpltps %zmm0, %zmm29, %k1 {%k1}
|
||||
1 4 0.50 vmulps %zmm14, %zmm2, %zmm2
|
||||
1 4 0.50 vmulps %zmm2, %zmm17, %zmm4 {%k1} {z}
|
||||
1 4 0.50 vmulps %zmm2, %zmm19, %zmm17 {%k1} {z}
|
||||
1 4 0.50 vmulps %zmm2, %zmm20, %zmm2 {%k1} {z}
|
||||
1 1 1.00 kmovd %ecx, %k1
|
||||
1 4 1.00 vcmpltps %zmm0, %zmm30, %k1 {%k1}
|
||||
1 4 0.50 vmulps %zmm14, %zmm3, %zmm3
|
||||
1 4 0.50 vmulps %zmm3, %zmm18, %zmm18 {%k1} {z}
|
||||
1 4 0.50 vaddps %zmm18, %zmm4, %zmm4
|
||||
1 4 0.50 vaddps %zmm4, %zmm5, %zmm4
|
||||
1 4 0.50 vmulps %zmm3, %zmm16, %zmm5 {%k1} {z}
|
||||
1 4 0.50 vaddps %zmm5, %zmm17, %zmm5
|
||||
1 4 0.50 vaddps %zmm5, %zmm21, %zmm5
|
||||
1 4 0.50 vmulps %zmm3, %zmm15, %zmm3 {%k1} {z}
|
||||
1 5 0.50 * movq 176(%r15), %rax
|
||||
1 4 0.50 vaddps %zmm3, %zmm2, %zmm2
|
||||
2 8 0.50 * vmovups (%rax,%rbx), %zmm3
|
||||
1 4 0.50 vsubps %zmm4, %zmm3, %zmm3
|
||||
2 1 1.00 * vmovups %zmm3, (%rax,%rbx)
|
||||
1 4 0.50 vaddps %zmm2, %zmm1, %zmm1
|
||||
2 8 0.50 * vmovups 32(%rax,%rbx), %zmm2
|
||||
1 4 0.50 vsubps %zmm5, %zmm2, %zmm2
|
||||
2 1 1.00 * vmovups %zmm2, 32(%rax,%rbx)
|
||||
2 8 0.50 * vmovups 64(%rax,%rbx), %zmm2
|
||||
1 4 0.50 vsubps %zmm1, %zmm2, %zmm1
|
||||
2 1 1.00 * vmovups %zmm1, 64(%rax,%rbx)
|
||||
1 1 0.25 cmpq %rdx, %r10
|
||||
1 1 0.50 je .LBB4_18
|
||||
1 5 0.50 * movq 160(%r15), %rdi
|
||||
1 1 0.25 incq %rdx
|
||||
1 1 0.50 jmp .LBB4_8
|
||||
|
||||
|
||||
Resources:
|
||||
[0] - SKXDivider
|
||||
[1] - SKXFPDivider
|
||||
[2] - SKXPort0
|
||||
[3] - SKXPort1
|
||||
[4] - SKXPort2
|
||||
[5] - SKXPort3
|
||||
[6] - SKXPort4
|
||||
[7] - SKXPort5
|
||||
[8] - SKXPort6
|
||||
[9] - SKXPort7
|
||||
|
||||
|
||||
Resource pressure per iteration:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
|
||||
- - 52.01 14.97 8.49 8.51 3.00 52.02 11.00 2.00
|
||||
|
||||
Resource pressure by instruction:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
|
||||
- - - - 0.49 0.51 - - - - movslq (%r11,%rdx,4), %rax
|
||||
- - - - - - - - 1.00 - movq %rax, %rsi
|
||||
- - - - - - - - 1.00 - shlq $5, %rsi
|
||||
- - - 1.00 - - - - - - leaq (%rsi,%rsi,2), %rbx
|
||||
- - 0.01 0.99 0.50 0.50 - - - - vmovups (%rdi,%rbx), %zmm15
|
||||
- - - - 0.50 0.50 - 1.00 - - vmovups 32(%rdi,%rbx), %zmm16
|
||||
- - - 1.00 0.50 0.50 - - - - vmovups 64(%rdi,%rbx), %zmm27
|
||||
- - - 0.99 0.51 0.49 - 0.01 - - vmovups 128(%rsp), %zmm1
|
||||
- - 0.01 - - - - 0.99 - - vsubps %zmm15, %zmm1, %zmm24
|
||||
- - - 1.00 0.49 0.51 - - - - vmovups 320(%rsp), %zmm1
|
||||
- - 0.99 - - - - 0.01 - - vsubps %zmm16, %zmm1, %zmm25
|
||||
- - 0.01 - - - - 0.99 - - vsubps %zmm27, %zmm9, %zmm26
|
||||
- - 0.01 0.99 0.51 0.49 - - - - vmovups (%rsp), %zmm1
|
||||
- - - - - - - 1.00 - - vsubps %zmm15, %zmm1, %zmm21
|
||||
- - - - 0.49 0.51 - 1.00 - - vmovups 256(%rsp), %zmm1
|
||||
- - 0.01 - - - - 0.99 - - vsubps %zmm16, %zmm1, %zmm22
|
||||
- - 0.99 - - - - 0.01 - - vsubps %zmm27, %zmm10, %zmm23
|
||||
- - - 1.00 0.51 0.49 - - - - vmovups 448(%rsp), %zmm1
|
||||
- - 0.01 - - - - 0.99 - - vsubps %zmm15, %zmm1, %zmm17
|
||||
- - 0.01 - 0.49 0.51 - 0.99 - - vmovups 192(%rsp), %zmm1
|
||||
- - - - - - - 1.00 - - vsubps %zmm16, %zmm1, %zmm19
|
||||
- - 0.01 - - - - 0.99 - - vsubps %zmm27, %zmm11, %zmm20
|
||||
- - 0.99 - 0.50 0.50 - 0.01 - - vmovups 384(%rsp), %zmm1
|
||||
- - - - - - - 1.00 - - vsubps %zmm15, %zmm1, %zmm18
|
||||
- - 0.01 - - - - 0.99 - - vsubps %zmm16, %zmm8, %zmm16
|
||||
- - - - - - - 1.00 - - vsubps %zmm27, %zmm12, %zmm15
|
||||
- - 1.00 - - - - - - - vmulps %zmm26, %zmm26, %zmm27
|
||||
- - 1.00 - - - - - - - vfmadd231ps %zmm25, %zmm25, %zmm27
|
||||
- - 0.99 - - - - 0.01 - - vfmadd231ps %zmm24, %zmm24, %zmm27
|
||||
- - - - - - - 1.00 - - vmulps %zmm23, %zmm23, %zmm28
|
||||
- - - - - - - 1.00 - - vfmadd231ps %zmm22, %zmm22, %zmm28
|
||||
- - 0.01 - - - - 0.99 - - vfmadd231ps %zmm21, %zmm21, %zmm28
|
||||
- - 0.01 - - - - 0.99 - - vmulps %zmm20, %zmm20, %zmm29
|
||||
- - 0.01 - - - - 0.99 - - vfmadd231ps %zmm19, %zmm19, %zmm29
|
||||
- - - - - - - 1.00 - - vfmadd231ps %zmm17, %zmm17, %zmm29
|
||||
- - 0.01 - - - - 0.99 - - vmulps %zmm15, %zmm15, %zmm30
|
||||
- - 0.01 - - - - 0.99 - - vfmadd231ps %zmm16, %zmm16, %zmm30
|
||||
- - 2.00 - - - - 1.00 - - vrcp14ps %zmm27, %zmm31
|
||||
- - 2.00 - - - - 1.00 - - vrcp14ps %zmm28, %zmm1
|
||||
- - 2.00 - - - - 1.00 - - vrcp14ps %zmm29, %zmm2
|
||||
- - 1.00 - - - - - - - vfmadd231ps %zmm18, %zmm18, %zmm30
|
||||
- - 2.00 - - - - 1.00 - - vrcp14ps %zmm30, %zmm3
|
||||
- - - - - - - 1.00 - - vmulps %zmm31, %zmm6, %zmm4
|
||||
- - 1.00 - - - - - - - vmulps %zmm4, %zmm31, %zmm4
|
||||
- - 1.00 - - - - - - - vmulps %zmm4, %zmm31, %zmm4
|
||||
- - 1.00 - - - - - - - vaddps %zmm13, %zmm4, %zmm5
|
||||
- - - - - - - 1.00 - - vmulps %zmm31, %zmm7, %zmm31
|
||||
- - 1.00 - - - - - - - vmulps %zmm5, %zmm31, %zmm5
|
||||
- - 1.00 - - - - - - - vmulps %zmm1, %zmm6, %zmm31
|
||||
- - 1.00 - - - - - - - vmulps %zmm31, %zmm1, %zmm31
|
||||
- - 0.99 - - - - 0.01 - - vmulps %zmm31, %zmm1, %zmm31
|
||||
- - 1.00 - - - - - - - vmulps %zmm5, %zmm4, %zmm4
|
||||
- - - - - - - 1.00 - - vaddps %zmm13, %zmm31, %zmm5
|
||||
- - - - - - - 1.00 - - vmulps %zmm1, %zmm7, %zmm1
|
||||
- - - - - - - 1.00 - - vmulps %zmm5, %zmm1, %zmm1
|
||||
- - - - - - - 1.00 - - vmulps %zmm2, %zmm6, %zmm5
|
||||
- - - - - - - 1.00 - - vmulps %zmm5, %zmm2, %zmm5
|
||||
- - - - - - - 1.00 - - vmulps %zmm5, %zmm2, %zmm5
|
||||
- - 0.99 - - - - 0.01 - - vmulps %zmm1, %zmm31, %zmm1
|
||||
- - - - - - - 1.00 - - vaddps %zmm13, %zmm5, %zmm31
|
||||
- - 1.00 - - - - - - - vmulps %zmm2, %zmm7, %zmm2
|
||||
- - - - - - - 1.00 - - vmulps %zmm31, %zmm2, %zmm2
|
||||
- - - - - - - 1.00 - - vmulps %zmm3, %zmm6, %zmm31
|
||||
- - 1.00 - - - - - - - vmulps %zmm31, %zmm3, %zmm31
|
||||
- - 1.00 - - - - - - - vmulps %zmm31, %zmm3, %zmm31
|
||||
- - 1.00 - - - - - - - vmulps %zmm2, %zmm5, %zmm2
|
||||
- - 1.00 - - - - - - - vaddps %zmm13, %zmm31, %zmm5
|
||||
- - 1.00 - - - - - - - vmulps %zmm3, %zmm7, %zmm3
|
||||
- - 1.00 - - - - - - - vmulps %zmm5, %zmm3, %zmm3
|
||||
- - 1.00 - - - - - - - vmulps %zmm3, %zmm31, %zmm3
|
||||
- - - - - - - - - - xorl %esi, %esi
|
||||
- - - - - - - - - - xorl %edi, %edi
|
||||
- - - - - - - - 1.00 - testl $2147483647, %eax
|
||||
- - - - - - - - 1.00 - sete %sil
|
||||
- - - - - - - - 1.00 - setne %dil
|
||||
- - - 1.00 - - - - - - movl $255, %eax
|
||||
- - - - - - - - 1.00 - cmovel %r8d, %eax
|
||||
- - - 1.00 - - - - - - movl $255, %ecx
|
||||
- - - - - - - - 1.00 - cmovel %r9d, %ecx
|
||||
- - - 1.00 - - - - - - xorl $255, %esi
|
||||
- - - - - - - 1.00 - - kmovd %esi, %k1
|
||||
- - - - - - - 1.00 - - vcmpltps %zmm0, %zmm27, %k1 {%k1}
|
||||
- - - - - - - 1.00 - - vmulps %zmm14, %zmm4, %zmm4
|
||||
- - 1.00 - - - - - - - vmulps %zmm4, %zmm24, %zmm5 {%k1} {z}
|
||||
- - - - - - - 1.00 - - vmulps %zmm4, %zmm25, %zmm24 {%k1} {z}
|
||||
- - 1.00 - - - - - - - vmulps %zmm4, %zmm26, %zmm4 {%k1} {z}
|
||||
- - - 1.00 - - - - - - leal (%rdi,%rdi,2), %esi
|
||||
- - - - - - - - 1.00 - orl $252, %esi
|
||||
- - - - - - - 1.00 - - kmovd %esi, %k1
|
||||
- - - - - - - 1.00 - - vcmpltps %zmm0, %zmm28, %k1 {%k1}
|
||||
- - 0.99 - - - - 0.01 - - vmulps %zmm14, %zmm1, %zmm1
|
||||
- - 0.99 - - - - 0.01 - - vmulps %zmm1, %zmm21, %zmm21 {%k1} {z}
|
||||
- - 1.00 - - - - - - - vaddps %zmm21, %zmm5, %zmm5
|
||||
- - 0.01 - - - - 0.99 - - vmulps %zmm1, %zmm22, %zmm21 {%k1} {z}
|
||||
- - - - - - - 1.00 - - vaddps %zmm21, %zmm24, %zmm21
|
||||
- - 0.99 - - - - 0.01 - - vmulps %zmm1, %zmm23, %zmm1 {%k1} {z}
|
||||
- - 1.00 - - - - - - - vaddps %zmm1, %zmm4, %zmm1
|
||||
- - - - - - - 1.00 - - kmovd %eax, %k1
|
||||
- - - - - - - 1.00 - - vcmpltps %zmm0, %zmm29, %k1 {%k1}
|
||||
- - - - - - - 1.00 - - vmulps %zmm14, %zmm2, %zmm2
|
||||
- - 1.00 - - - - - - - vmulps %zmm2, %zmm17, %zmm4 {%k1} {z}
|
||||
- - - - - - - 1.00 - - vmulps %zmm2, %zmm19, %zmm17 {%k1} {z}
|
||||
- - 1.00 - - - - - - - vmulps %zmm2, %zmm20, %zmm2 {%k1} {z}
|
||||
- - - - - - - 1.00 - - kmovd %ecx, %k1
|
||||
- - - - - - - 1.00 - - vcmpltps %zmm0, %zmm30, %k1 {%k1}
|
||||
- - 1.00 - - - - - - - vmulps %zmm14, %zmm3, %zmm3
|
||||
- - - - - - - 1.00 - - vmulps %zmm3, %zmm18, %zmm18 {%k1} {z}
|
||||
- - 0.99 - - - - 0.01 - - vaddps %zmm18, %zmm4, %zmm4
|
||||
- - 1.00 - - - - - - - vaddps %zmm4, %zmm5, %zmm4
|
||||
- - - - - - - 1.00 - - vmulps %zmm3, %zmm16, %zmm5 {%k1} {z}
|
||||
- - 1.00 - - - - - - - vaddps %zmm5, %zmm17, %zmm5
|
||||
- - 0.99 - - - - 0.01 - - vaddps %zmm5, %zmm21, %zmm5
|
||||
- - 1.00 - - - - - - - vmulps %zmm3, %zmm15, %zmm3 {%k1} {z}
|
||||
- - - - 1.00 - - - - - movq 176(%r15), %rax
|
||||
- - 0.99 - - - - 0.01 - - vaddps %zmm3, %zmm2, %zmm2
|
||||
- - - 1.00 0.50 0.50 - - - - vmovups (%rax,%rbx), %zmm3
|
||||
- - 0.99 - - - - 0.01 - - vsubps %zmm4, %zmm3, %zmm3
|
||||
- - - - - - 1.00 - - 1.00 vmovups %zmm3, (%rax,%rbx)
|
||||
- - 1.00 - - - - - - - vaddps %zmm2, %zmm1, %zmm1
|
||||
- - - 1.00 0.50 0.50 - - - - vmovups 32(%rax,%rbx), %zmm2
|
||||
- - 1.00 - - - - - - - vsubps %zmm5, %zmm2, %zmm2
|
||||
- - - - - - 1.00 - - 1.00 vmovups %zmm2, 32(%rax,%rbx)
|
||||
- - - 1.00 0.50 0.50 - - - - vmovups 64(%rax,%rbx), %zmm2
|
||||
- - 0.99 - - - - 0.01 - - vsubps %zmm1, %zmm2, %zmm1
|
||||
- - - - - 1.00 1.00 - - - vmovups %zmm1, 64(%rax,%rbx)
|
||||
- - - - - - - - 1.00 - cmpq %rdx, %r10
|
||||
- - - - - - - - 1.00 - je .LBB4_18
|
||||
- - - - 0.50 0.50 - - - - movq 160(%r15), %rdi
|
||||
- - - 1.00 - - - - - - incq %rdx
|
||||
- - - - - - - - 1.00 - jmp .LBB4_8
|
||||
@@ -1,116 +0,0 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: gromacs-icx-avx512-sp.s
|
||||
Architecture: ICX
|
||||
Timestamp: 2023-02-14 12:51:43
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD |
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
1338 | | | | | | | | | | || | | # pointer_increment=64 0f91ac4f7fe1a70d0c899f7f3e745649
|
||||
1339 | | | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
1340 | | | | | | | | | | || | | .LBB2_12: # Parent Loop BB2_7 Depth=1
|
||||
1341 | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2
|
||||
1342 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | movslq (%r11,%rax,4), %rcx
|
||||
1343 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || 6.0 | | leaq (%rcx,%rcx,2), %rdx
|
||||
1344 | 0.00 | | | | | | 1.00 | | | || 1.0 | | shlq $5, %rdx
|
||||
1345 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd (%rsi,%rdx), %zmm16
|
||||
1346 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vbroadcastf64x4 64(%rsi,%rdx), %zmm20 # zmm20 = mem[0,1,2,3,0,1,2,3]
|
||||
1347 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vbroadcastf64x4 (%rsi,%rdx), %zmm19 # zmm19 = mem[0,1,2,3,0,1,2,3]
|
||||
1348 | | | | | | 1.000 | | | | || | | vshuff64x2 $238, %zmm16, %zmm16, %zmm21 # zmm21 = zmm16[4,5,6,7,4,5,6,7]
|
||||
1349 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm19, %zmm6, %zmm18
|
||||
1350 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm21, %zmm10, %zmm17
|
||||
1351 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vsubps %zmm20, %zmm14, %zmm16
|
||||
1352 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm16, %zmm16, %zmm22
|
||||
1353 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231ps %zmm17, %zmm17, %zmm22 # zmm22 = (zmm17 * zmm17) + zmm22
|
||||
1354 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231ps %zmm18, %zmm18, %zmm22 # zmm22 = (zmm18 * zmm18) + zmm22
|
||||
1355 | 2.50 | | | | | 0.500 | | | | || 6.0 | | vrcp14ps %zmm22, %zmm23
|
||||
1356 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm23, %zmm26, %zmm24
|
||||
1357 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm24, %zmm23, %zmm24
|
||||
1358 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm24, %zmm23, %zmm24
|
||||
1359 | 0.75 | | | | | 0.250 | | | | || 4.0 | | vaddps %zmm1, %zmm24, %zmm25
|
||||
1360 | 1.00 | | | | | 0.000 | | | | || | | vmulps %zmm23, %zmm27, %zmm23
|
||||
1361 | 1.00 | | | | | 0.000 | | | | || 4.0 | | vmulps %zmm25, %zmm23, %zmm23
|
||||
1362 | 1.00 | | | | | 0.000 | | | | || 4.0 | | vmulps %zmm23, %zmm24, %zmm23
|
||||
1363 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | leal (%rcx,%rcx), %edx
|
||||
1364 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | xorl %edi, %edi
|
||||
1365 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | xorl %ebp, %ebp
|
||||
1366 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | cmpq %rdx, %r12
|
||||
1367 | 0.00 | | | | | | 1.00 | | | || | | setne %dil
|
||||
1368 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | leal 1(%rcx,%rcx), %ecx
|
||||
1369 | 0.00 | | | | | | 1.00 | | | || | | sete %bpl
|
||||
1370 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | xorl %edx, %edx
|
||||
1371 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | xorl %ebx, %ebx
|
||||
1372 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | cmpq %rcx, %r12
|
||||
1373 | 0.00 | | | | | | 1.00 | | | || | | sete %dl
|
||||
1374 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | movl $0, %ecx
|
||||
1375 | 0.00 | | | | | | 1.00 | | | || | | setne %bl
|
||||
1376 | 0.00 | | | | | | 1.00 | | | || | | cmovel %r8d, %ecx
|
||||
1377 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | movl %ebx, %r14d
|
||||
1378 | 0.00 | | | | | | 1.00 | | | || | | shll $4, %r14d
|
||||
1379 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | subl %ebp, %r14d
|
||||
1380 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | leal (%rcx,%rdi,2), %ecx
|
||||
1381 | 0.00 | | | | | | 1.00 | | | || | | shll $8, %ecx
|
||||
1382 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | addl $239, %r14d
|
||||
1383 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | addl $-768, %ecx # imm = 0xFD00
|
||||
1384 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | orl %r14d, %ecx
|
||||
1385 | 1.00 | | | | | | | | | || | | kmovd %ecx, %k2
|
||||
1386 | 0.50 | | | | | 0.500 | | | | || | | vcmpltps %zmm0, %zmm22, %k2 {%k2}
|
||||
1387 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm21, %zmm11, %zmm21
|
||||
1388 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm20, %zmm15, %zmm20
|
||||
1389 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm19, %zmm7, %zmm19
|
||||
1390 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm2, %zmm23, %zmm22
|
||||
1391 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231ps %zmm22, %zmm18, %zmm12 {%k2} # zmm12 {%k2} = (zmm18 * zmm22) + zmm12
|
||||
1392 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm20, %zmm20, %zmm18
|
||||
1393 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm21, %zmm21, %zmm18 # zmm18 = (zmm21 * zmm21) + zmm18
|
||||
1394 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm19, %zmm19, %zmm18 # zmm18 = (zmm19 * zmm19) + zmm18
|
||||
1395 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm22, %zmm17, %zmm9 {%k2} # zmm9 {%k2} = (zmm17 * zmm22) + zmm9
|
||||
1396 | 2.50 | | | | | 0.500 | | | | || | | vrcp14ps %zmm18, %zmm17
|
||||
1397 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm22, %zmm16, %zmm5 {%k2} # zmm5 {%k2} = (zmm16 * zmm22) + zmm5
|
||||
1398 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm17, %zmm26, %zmm16
|
||||
1399 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm16, %zmm17, %zmm16
|
||||
1400 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm16, %zmm17, %zmm16
|
||||
1401 | 0.00 | | | | | 1.000 | | | | || | | vaddps %zmm1, %zmm16, %zmm22
|
||||
1402 | 0.00 | | | | | 1.000 | | | | || | | vmulps %zmm17, %zmm27, %zmm17
|
||||
1403 | 0.00 | | | | | 1.000 | | | | || | | vmulps %zmm22, %zmm17, %zmm17
|
||||
1404 | 0.00 | | | | | 1.000 | | | | || | | vmulps %zmm17, %zmm16, %zmm16
|
||||
1405 | 0.00 | | | | | | 1.00 | | | || | | shll $6, %ebx
|
||||
1406 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | leal (%rbx,%rdi,4), %ecx
|
||||
1407 | 0.00 | | | | | | 1.00 | | | || | | shll $7, %edx
|
||||
1408 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | leal (%rdx,%rdi,8), %edx
|
||||
1409 | 0.00 | | | | | | 1.00 | | | || | | shll $8, %edx
|
||||
1410 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | addl %edx, %ecx
|
||||
1411 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | addl $-2117, %ecx # imm = 0xF7BB
|
||||
1412 | 1.00 | | | | | | | | | || | | kmovd %ecx, %k2
|
||||
1413 | 0.00 | | | | | 1.000 | | | | || | | vcmpltps %zmm0, %zmm18, %k2 {%k2}
|
||||
1414 | 0.00 | | | | | 1.000 | | | | || | | vmulps %zmm2, %zmm16, %zmm16
|
||||
1415 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231ps %zmm16, %zmm19, %zmm13 {%k2} # zmm13 {%k2} = (zmm19 * zmm16) + zmm13
|
||||
1416 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231ps %zmm16, %zmm21, %zmm8 {%k2} # zmm8 {%k2} = (zmm21 * zmm16) + zmm8
|
||||
1417 | 0.24 | | | | | 0.760 | | | | || | 4.0 | vfmadd231ps %zmm16, %zmm20, %zmm4 {%k2} # zmm4 {%k2} = (zmm20 * zmm16) + zmm4
|
||||
1418 | 0.00 | 1.00 | | | | -0.01 | 0.00 | | | || | | incq %rax
|
||||
1419 | 0.00 | 1.00 | | | | -0.01 | 0.00 | | | || | | cmpq %rax, %r10
|
||||
1420 | | | | | | | | | | || | | * jne .LBB2_12
|
||||
1421 | | | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
22.5 16.5 2.00 2.00 2.00 2.00 22.49 16.5 71 4.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
1417 | 4.0 | vfmadd231ps %zmm16, %zmm20, %zmm4 {%k2} # zmm4 {%k2} = (zmm20 * zmm16) + zmm4| [1417]
|
||||
1416 | 4.0 | vfmadd231ps %zmm16, %zmm21, %zmm8 {%k2} # zmm8 {%k2} = (zmm21 * zmm16) + zmm8| [1416]
|
||||
1415 | 4.0 | vfmadd231ps %zmm16, %zmm19, %zmm13 {%k2} # zmm13 {%k2} = (zmm19 * zmm16) + zmm13| [1415]
|
||||
1397 | 4.0 | vfmadd231ps %zmm22, %zmm16, %zmm5 {%k2} # zmm5 {%k2} = (zmm16 * zmm22) + zmm5| [1397]
|
||||
1395 | 4.0 | vfmadd231ps %zmm22, %zmm17, %zmm9 {%k2} # zmm9 {%k2} = (zmm17 * zmm22) + zmm9| [1395]
|
||||
1391 | 4.0 | vfmadd231ps %zmm22, %zmm18, %zmm12 {%k2} # zmm12 {%k2} = (zmm18 * zmm22) + zmm12| [1391]
|
||||
1418 | 1.0 | incq %rax | [1418]
|
||||
|
||||
@@ -1,161 +0,0 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: gromacs-icx-avx512-sp.s
|
||||
Architecture: CSX
|
||||
Timestamp: 2023-02-10 16:31:04
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
1662 | | | | | | | | || | | # pointer_increment=64 0f91ac4f7fe1a70d0c899f7f3e745649
|
||||
1663 | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
1664 | | | | | | | | || | | .LBB4_8: # =>This Inner Loop Header: Depth=1
|
||||
1665 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | movslq (%r11,%rdx,4), %rax
|
||||
1666 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || 1.0 | | movq %rax, %rsi
|
||||
1667 | 0.00 | | | | | | 1.00 | || 1.0 | | shlq $5, %rsi
|
||||
1668 | | 1.00 | | | | 0.00 | | || 1.0 | | leaq (%rsi,%rsi,2), %rbx
|
||||
1669 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%rdi,%rbx), %zmm15 # AlignMOV convert to UnAlignMOV
|
||||
1670 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 32(%rdi,%rbx), %zmm16 # AlignMOV convert to UnAlignMOV
|
||||
1671 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovups 64(%rdi,%rbx), %zmm27 # AlignMOV convert to UnAlignMOV
|
||||
1672 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 128(%rsp), %zmm1 # 64-byte Reload
|
||||
1673 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm24
|
||||
1674 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 320(%rsp), %zmm1 # 64-byte Reload
|
||||
1675 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm1, %zmm25
|
||||
1676 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubps %zmm27, %zmm9, %zmm26
|
||||
1677 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%rsp), %zmm1 # 64-byte Reload
|
||||
1678 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm21
|
||||
1679 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 256(%rsp), %zmm1 # 64-byte Reload
|
||||
1680 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm1, %zmm22
|
||||
1681 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm27, %zmm10, %zmm23
|
||||
1682 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 448(%rsp), %zmm1 # 64-byte Reload
|
||||
1683 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm17
|
||||
1684 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 192(%rsp), %zmm1 # 64-byte Reload
|
||||
1685 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm1, %zmm19
|
||||
1686 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm27, %zmm11, %zmm20
|
||||
1687 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 384(%rsp), %zmm1 # 64-byte Reload
|
||||
1688 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm18
|
||||
1689 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm8, %zmm16
|
||||
1690 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm27, %zmm12, %zmm15
|
||||
1691 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm26, %zmm26, %zmm27
|
||||
1692 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231ps %zmm25, %zmm25, %zmm27 # zmm27 = (zmm25 * zmm25) + zmm27
|
||||
1693 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231ps %zmm24, %zmm24, %zmm27 # zmm27 = (zmm24 * zmm24) + zmm27
|
||||
1694 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm23, %zmm23, %zmm28
|
||||
1695 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm22, %zmm22, %zmm28 # zmm28 = (zmm22 * zmm22) + zmm28
|
||||
1696 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm21, %zmm21, %zmm28 # zmm28 = (zmm21 * zmm21) + zmm28
|
||||
1697 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm20, %zmm20, %zmm29
|
||||
1698 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm19, %zmm19, %zmm29 # zmm29 = (zmm19 * zmm19) + zmm29
|
||||
1699 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm17, %zmm17, %zmm29 # zmm29 = (zmm17 * zmm17) + zmm29
|
||||
1700 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm15, %zmm15, %zmm30
|
||||
1701 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm16, %zmm16, %zmm30 # zmm30 = (zmm16 * zmm16) + zmm30
|
||||
1702 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14ps %zmm27, %zmm31
|
||||
1703 | 2.50 | | | | | 0.50 | | || | | vrcp14ps %zmm28, %zmm1
|
||||
1704 | 2.50 | | | | | 0.50 | | || | | vrcp14ps %zmm29, %zmm2
|
||||
1705 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm18, %zmm18, %zmm30 # zmm30 = (zmm18 * zmm18) + zmm30
|
||||
1706 | 2.50 | | | | | 0.50 | | || | | vrcp14ps %zmm30, %zmm3
|
||||
1707 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm31, %zmm6, %zmm4
|
||||
1708 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm4, %zmm31, %zmm4
|
||||
1709 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm4, %zmm31, %zmm4
|
||||
1710 | 0.50 | | | | | 0.50 | | || 4.0 | | vaddps %zmm13, %zmm4, %zmm5
|
||||
1711 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm7, %zmm31
|
||||
1712 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm5, %zmm31, %zmm5
|
||||
1713 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm6, %zmm31
|
||||
1714 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm1, %zmm31
|
||||
1715 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm1, %zmm31
|
||||
1716 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm5, %zmm4, %zmm4
|
||||
1717 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm13, %zmm31, %zmm5
|
||||
1718 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm7, %zmm1
|
||||
1719 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm5, %zmm1, %zmm1
|
||||
1720 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm6, %zmm5
|
||||
1721 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm5, %zmm2, %zmm5
|
||||
1722 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm5, %zmm2, %zmm5
|
||||
1723 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm31, %zmm1
|
||||
1724 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm13, %zmm5, %zmm31
|
||||
1725 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm7, %zmm2
|
||||
1726 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm2, %zmm2
|
||||
1727 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm3, %zmm6, %zmm31
|
||||
1728 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm3, %zmm31
|
||||
1729 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm3, %zmm31
|
||||
1730 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm5, %zmm2
|
||||
1731 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm13, %zmm31, %zmm5
|
||||
1732 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm3, %zmm7, %zmm3
|
||||
1733 | 1.00 | | | | | 0.00 | | || | | vmulps %zmm5, %zmm3, %zmm3
|
||||
1734 | 1.00 | | | | | 0.00 | | || | | vmulps %zmm3, %zmm31, %zmm3
|
||||
1735 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | xorl %esi, %esi
|
||||
1736 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | xorl %edi, %edi
|
||||
1737 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | testl $2147483647, %eax # imm = 0x7FFFFFFF
|
||||
1738 | 0.00 | | | | | | 1.00 | || | | sete %sil
|
||||
1739 | 0.00 | | | | | | 1.00 | || | | setne %dil
|
||||
1740 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl $255, %eax
|
||||
1741 | 0.00 | | | | | | 1.00 | || | | cmovel %r8d, %eax
|
||||
1742 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl $255, %ecx
|
||||
1743 | 0.00 | | | | | | 1.00 | || | | cmovel %r9d, %ecx
|
||||
1744 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | | xorl $255, %esi
|
||||
1745 | 1.00 | | | | | | | || | | kmovd %esi, %k1
|
||||
1746 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm27, %k1 {%k1}
|
||||
1747 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm14, %zmm4, %zmm4
|
||||
1748 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm4, %zmm24, %zmm5 {%k1} {z}
|
||||
1749 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm4, %zmm25, %zmm24 {%k1} {z}
|
||||
1750 | 0.25 | | | | | 0.75 | | || | | vmulps %zmm4, %zmm26, %zmm4 {%k1} {z}
|
||||
1751 | | 1.00 | | | | 0.00 | | || | | leal (%rdi,%rdi,2), %esi
|
||||
1752 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | orl $252, %esi
|
||||
1753 | 1.00 | | | | | | | || | | kmovd %esi, %k1
|
||||
1754 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm28, %k1 {%k1}
|
||||
1755 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm14, %zmm1, %zmm1
|
||||
1756 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm21, %zmm21 {%k1} {z}
|
||||
1757 | 0.50 | | | | | 0.50 | | || 4.0 | | vaddps %zmm21, %zmm5, %zmm5
|
||||
1758 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm22, %zmm21 {%k1} {z}
|
||||
1759 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm21, %zmm24, %zmm21
|
||||
1760 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm23, %zmm1 {%k1} {z}
|
||||
1761 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm1, %zmm4, %zmm1
|
||||
1762 | 1.00 | | | | | | | || | | kmovd %eax, %k1
|
||||
1763 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm29, %k1 {%k1}
|
||||
1764 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm14, %zmm2, %zmm2
|
||||
1765 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm17, %zmm4 {%k1} {z}
|
||||
1766 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm19, %zmm17 {%k1} {z}
|
||||
1767 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm20, %zmm2 {%k1} {z}
|
||||
1768 | 1.00 | | | | | | | || | | kmovd %ecx, %k1
|
||||
1769 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm30, %k1 {%k1}
|
||||
1770 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm14, %zmm3, %zmm3
|
||||
1771 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm3, %zmm18, %zmm18 {%k1} {z}
|
||||
1772 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm18, %zmm4, %zmm4
|
||||
1773 | 0.25 | | | | | 0.75 | | || 4.0 | | vaddps %zmm4, %zmm5, %zmm4
|
||||
1774 | 0.00 | | | | | 1.00 | | || | | vmulps %zmm3, %zmm16, %zmm5 {%k1} {z}
|
||||
1775 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm5, %zmm17, %zmm5
|
||||
1776 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm5, %zmm21, %zmm5
|
||||
1777 | 0.00 | | | | | 1.00 | | || | | vmulps %zmm3, %zmm15, %zmm3 {%k1} {z}
|
||||
1778 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | movq 176(%r15), %rax
|
||||
1779 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm3, %zmm2, %zmm2
|
||||
1780 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%rax,%rbx), %zmm3 # AlignMOV convert to UnAlignMOV
|
||||
1781 | 0.00 | | | | | 1.00 | | || 4.0 | | vsubps %zmm4, %zmm3, %zmm3
|
||||
1782 | | | 0.50 | 0.50 | 1.00 | | | || 0.0 | | vmovups %zmm3, (%rax,%rbx) # AlignMOV convert to UnAlignMOV
|
||||
1783 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm2, %zmm1, %zmm1
|
||||
1784 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 32(%rax,%rbx), %zmm2 # AlignMOV convert to UnAlignMOV
|
||||
1785 | 0.00 | | | | | 1.00 | | || | | vsubps %zmm5, %zmm2, %zmm2
|
||||
1786 | | | 0.50 | 0.50 | 1.00 | | | || | | vmovups %zmm2, 32(%rax,%rbx) # AlignMOV convert to UnAlignMOV
|
||||
1787 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 64(%rax,%rbx), %zmm2 # AlignMOV convert to UnAlignMOV
|
||||
1788 | 0.00 | | | | | 1.00 | | || | | vsubps %zmm1, %zmm2, %zmm1
|
||||
1789 | | | 0.50 | 0.50 | 1.00 | | | || | | vmovups %zmm1, 64(%rax,%rbx) # AlignMOV convert to UnAlignMOV
|
||||
1790 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rdx, %r10
|
||||
1791 | | | | | | | | || | | * je .LBB4_18
|
||||
1792 | | | | | | | | || | | # %bb.9: # in Loop: Header=BB4_8 Depth=1
|
||||
1793 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | movq 160(%r15), %rdi
|
||||
1794 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | 1.0 | incq %rdx
|
||||
1795 | 0.00 | | | | | | 1.00 | || | | jmp .LBB4_8
|
||||
1796 | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
50.0 9.00 9.50 8.00 9.50 8.00 3.00 50.0 9.00 79.0 1.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
1794 | 1.0 | incq %rdx | [1794]
|
||||
|
||||
@@ -1,88 +0,0 @@
|
||||
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||
Analyzed File - lammps-icc-avx2.o
|
||||
Binary Format - 64Bit
|
||||
Architecture - SKX
|
||||
Analysis Type - Throughput
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
Block Throughput: 25.58 Cycles Throughput Bottleneck: Backend
|
||||
Loop Count: 22
|
||||
Port Binding In Cycles Per Iteration:
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Cycles | 13.7 8.0 | 13.6 | 5.5 5.5 | 5.5 5.5 | 0.0 | 13.7 | 7.0 | 0.0 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
|
||||
DV - Divider pipe (on port 0)
|
||||
D - Data fetch pipe (on ports 2 and 3)
|
||||
F - Macro Fusion with the previous instruction occurred
|
||||
* - instruction micro-ops not bound to a port
|
||||
^ - Micro Fusion occurred
|
||||
# - ESP Tracking sync uop was issued
|
||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||
X - instruction not supported, was not accounted in Analysis
|
||||
|
||||
| Num Of | Ports pressure in cycles | |
|
||||
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
-----------------------------------------------------------------------------------------
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovdqu xmm0, xmmword ptr [rbx+rdx*4]
|
||||
| 1 | 1.0 | | | | | | | | vmovq rcx, xmm0
|
||||
| 1 | | | | | | 1.0 | | | vpunpckhqdq xmm2, xmm0, xmm0
|
||||
| 1 | 1.0 | | | | | | | | vmovq r15, xmm2
|
||||
| 1* | | | | | | | | | mov r8d, ecx
|
||||
| 1 | | | | | | | 1.0 | | shr rcx, 0x20
|
||||
| 1 | | | | | | 1.0 | | | lea r14d, ptr [rcx+rcx*2]
|
||||
| 1 | | | | | | 1.0 | | | lea r8d, ptr [r8+r8*2]
|
||||
| 1 | | | | | | | 1.0 | | movsxd rcx, r8d
|
||||
| 1 | | | | | | | 1.0 | | movsxd r8, r14d
|
||||
| 1* | | | | | | | | | mov r14d, r15d
|
||||
| 1 | | | | | | | 1.0 | | shr r15, 0x20
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups xmm7, xmmword ptr [r11+rcx*8]
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups xmm6, xmmword ptr [r11+r8*8]
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovq xmm14, qword ptr [r11+rcx*8+0x10]
|
||||
| 1 | | 0.3 | | | | 0.7 | | | lea r14d, ptr [r14+r14*2]
|
||||
| 1 | | | | | | | 1.0 | | movsxd r14, r14d
|
||||
| 1 | | 0.7 | | | | 0.3 | | | lea r15d, ptr [r15+r15*2]
|
||||
| 1 | | | | | | | 1.0 | | movsxd r15, r15d
|
||||
| 2 | | | 0.5 0.5 | 0.5 0.5 | | 1.0 | | | vmovhpd xmm15, xmm14, qword ptr [r11+r8*8+0x10]
|
||||
| 2 | | 1.0 | 0.5 0.5 | 0.5 0.5 | | | | | vinsertf128 ymm1, ymm7, xmmword ptr [r11+r14*8], 0x1
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovq xmm0, qword ptr [r11+r14*8+0x10]
|
||||
| 2 | | 0.3 | 0.5 0.5 | 0.5 0.5 | | 0.7 | | | vinsertf128 ymm6, ymm6, xmmword ptr [r11+r15*8], 0x1
|
||||
| 2 | | | 0.5 0.5 | 0.5 0.5 | | 1.0 | | | vmovhpd xmm2, xmm0, qword ptr [r11+r15*8+0x10]
|
||||
| 1 | | | | | | 1.0 | | | vunpcklpd ymm14, ymm1, ymm6
|
||||
| 1 | | | | | | 1.0 | | | vunpckhpd ymm1, ymm1, ymm6
|
||||
| 1 | 0.7 | 0.3 | | | | | | | vsubpd ymm6, ymm10, ymm14
|
||||
| 1 | | | | | | 1.0 | | | vinsertf128 ymm7, ymm15, xmm2, 0x1
|
||||
| 1 | 0.3 | 0.7 | | | | | | | vsubpd ymm2, ymm9, ymm1
|
||||
| 1 | 0.7 | 0.3 | | | | | | | vsubpd ymm0, ymm8, ymm7
|
||||
| 1 | 0.3 | 0.7 | | | | | | | vmulpd ymm14, ymm2, ymm2
|
||||
| 1 | 0.7 | 0.3 | | | | | | | vfmadd231pd ymm14, ymm6, ymm6
|
||||
| 1 | 0.3 | 0.7 | | | | | | | vfmadd231pd ymm14, ymm0, ymm0
|
||||
| 1 | 0.7 | 0.3 | | | | | | | vcmppd ymm1, ymm14, ymm5, 0x1
|
||||
| 1 | 0.3 | 0.7 | | | | | | | vpcmpeqd ymm7, ymm7, ymm7
|
||||
| 2 | 1.0 | | | | | 1.0 | | | vptest ymm1, ymm7
|
||||
| 1 | 1.0 8.0 | | | | | | | | vdivpd ymm7, ymm4, ymm14
|
||||
| 2^ | | 1.0 | 0.5 0.5 | 0.5 0.5 | | | | | vmulpd ymm14, ymm7, ymmword ptr [rsp+0x60]
|
||||
| 1 | | 1.0 | | | | | | | vmulpd ymm14, ymm7, ymm14
|
||||
| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm15, ymm7, ymm14
|
||||
| 1 | 0.3 | 0.7 | | | | | | | vfmsub213pd ymm14, ymm7, ymm3
|
||||
| 2^ | 0.7 | 0.3 | 0.5 0.5 | 0.5 0.5 | | | | | vmulpd ymm7, ymm7, ymmword ptr [rsp+0x40]
|
||||
| 1 | 0.3 | 0.7 | | | | | | | vmulpd ymm15, ymm15, ymm7
|
||||
| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm7, ymm15, ymm14
|
||||
| 1 | 0.3 | 0.7 | | | | | | | vmulpd ymm6, ymm6, ymm7
|
||||
| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm2, ymm2, ymm7
|
||||
| 1 | | | | | | 1.0 | | | vandpd ymm6, ymm1, ymm6
|
||||
| 1 | 0.3 | 0.7 | | | | | | | vaddpd ymm13, ymm13, ymm6
|
||||
| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm6, ymm0, ymm7
|
||||
| 1 | | | | | | 1.0 | | | vandpd ymm0, ymm1, ymm2
|
||||
| 1 | | | | | | 1.0 | | | vandpd ymm1, ymm1, ymm6
|
||||
| 1 | 0.3 | 0.7 | | | | | | | vaddpd ymm12, ymm12, ymm0
|
||||
| 1 | 0.7 | 0.3 | | | | | | | vaddpd ymm11, ymm11, ymm1
|
||||
| 1 | | | | | | | 1.0 | | add rdx, 0x4
|
||||
| 1* | | | | | | | | | cmp rdx, rsi
|
||||
| 0*F | | | | | | | | | jb 0xffffffffffffff02
|
||||
Total Num Of Uops: 62
|
||||
Analysis Notes:
|
||||
Backend allocation was stalled due to unavailable allocation resources.
|
||||
@@ -1,156 +0,0 @@
|
||||
|
||||
[0] Code Region
|
||||
|
||||
Iterations: 100
|
||||
Instructions: 5600
|
||||
Total Cycles: 2352
|
||||
Total uOps: 6300
|
||||
|
||||
Dispatch Width: 6
|
||||
uOps Per Cycle: 2.68
|
||||
IPC: 2.38
|
||||
Block RThroughput: 10.5
|
||||
|
||||
|
||||
Instruction Info:
|
||||
[1]: #uOps
|
||||
[2]: Latency
|
||||
[3]: RThroughput
|
||||
[4]: MayLoad
|
||||
[5]: MayStore
|
||||
[6]: HasSideEffects (U)
|
||||
|
||||
[1] [2] [3] [4] [5] [6] Instructions:
|
||||
1 6 0.50 * vmovdqu (%rbx,%rdx,4), %xmm0
|
||||
1 2 1.00 vmovq %xmm0, %rcx
|
||||
1 1 1.00 vpunpckhqdq %xmm0, %xmm0, %xmm2
|
||||
1 2 1.00 vmovq %xmm2, %r15
|
||||
1 1 0.25 movl %ecx, %r8d
|
||||
1 1 0.50 shrq $32, %rcx
|
||||
1 1 0.50 leal (%rcx,%rcx,2), %r14d
|
||||
1 1 0.50 leal (%r8,%r8,2), %r8d
|
||||
1 1 0.25 movslq %r8d, %rcx
|
||||
1 1 0.25 movslq %r14d, %r8
|
||||
1 1 0.25 movl %r15d, %r14d
|
||||
1 1 0.50 shrq $32, %r15
|
||||
1 6 0.50 * vmovups (%r11,%rcx,8), %xmm7
|
||||
1 6 0.50 * vmovups (%r11,%r8,8), %xmm6
|
||||
1 5 0.50 * vmovq 16(%r11,%rcx,8), %xmm14
|
||||
1 1 0.50 leal (%r14,%r14,2), %r14d
|
||||
1 1 0.25 movslq %r14d, %r14
|
||||
1 1 0.50 leal (%r15,%r15,2), %r15d
|
||||
1 1 0.25 movslq %r15d, %r15
|
||||
2 6 1.00 * vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15
|
||||
2 7 0.50 * vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1
|
||||
1 5 0.50 * vmovq 16(%r11,%r14,8), %xmm0
|
||||
2 7 0.50 * vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6
|
||||
2 6 1.00 * vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2
|
||||
1 1 1.00 vunpcklpd %ymm6, %ymm1, %ymm14
|
||||
1 1 1.00 vunpckhpd %ymm6, %ymm1, %ymm1
|
||||
1 4 0.50 vsubpd %ymm14, %ymm10, %ymm6
|
||||
1 3 1.00 vinsertf128 $1, %xmm2, %ymm15, %ymm7
|
||||
1 4 0.50 vsubpd %ymm1, %ymm9, %ymm2
|
||||
1 4 0.50 vsubpd %ymm7, %ymm8, %ymm0
|
||||
1 4 0.50 vmulpd %ymm2, %ymm2, %ymm14
|
||||
1 4 0.50 vfmadd231pd %ymm6, %ymm6, %ymm14
|
||||
1 4 0.50 vfmadd231pd %ymm0, %ymm0, %ymm14
|
||||
1 4 0.50 vcmpltpd %ymm5, %ymm14, %ymm1
|
||||
1 1 0.50 vpcmpeqd %ymm7, %ymm7, %ymm7
|
||||
2 3 1.00 vptest %ymm7, %ymm1
|
||||
1 14 5.00 vdivpd %ymm14, %ymm4, %ymm7
|
||||
2 11 0.50 * vmulpd 96(%rsp), %ymm7, %ymm14
|
||||
1 4 0.50 vmulpd %ymm14, %ymm7, %ymm14
|
||||
1 4 0.50 vmulpd %ymm14, %ymm7, %ymm15
|
||||
1 4 0.50 vfmsub213pd %ymm3, %ymm7, %ymm14
|
||||
2 11 0.50 * vmulpd 64(%rsp), %ymm7, %ymm7
|
||||
1 4 0.50 vmulpd %ymm7, %ymm15, %ymm15
|
||||
1 4 0.50 vmulpd %ymm14, %ymm15, %ymm7
|
||||
1 4 0.50 vmulpd %ymm7, %ymm6, %ymm6
|
||||
1 4 0.50 vmulpd %ymm7, %ymm2, %ymm2
|
||||
1 1 0.33 vandpd %ymm6, %ymm1, %ymm6
|
||||
1 4 0.50 vaddpd %ymm6, %ymm13, %ymm13
|
||||
1 4 0.50 vmulpd %ymm7, %ymm0, %ymm6
|
||||
1 1 0.33 vandpd %ymm2, %ymm1, %ymm0
|
||||
1 1 0.33 vandpd %ymm6, %ymm1, %ymm1
|
||||
1 4 0.50 vaddpd %ymm0, %ymm12, %ymm12
|
||||
1 4 0.50 vaddpd %ymm1, %ymm11, %ymm11
|
||||
1 1 0.25 addq $4, %rdx
|
||||
1 1 0.25 cmpq %rsi, %rdx
|
||||
1 1 0.50 jb ..B1.22
|
||||
|
||||
|
||||
Resources:
|
||||
[0] - SKXDivider
|
||||
[1] - SKXFPDivider
|
||||
[2] - SKXPort0
|
||||
[3] - SKXPort1
|
||||
[4] - SKXPort2
|
||||
[5] - SKXPort3
|
||||
[6] - SKXPort4
|
||||
[7] - SKXPort5
|
||||
[8] - SKXPort6
|
||||
[9] - SKXPort7
|
||||
|
||||
|
||||
Resource pressure per iteration:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
|
||||
- 5.00 16.00 14.12 5.50 5.50 - 13.47 8.41 -
|
||||
|
||||
Resource pressure by instruction:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
|
||||
- - - - 0.50 0.50 - - - - vmovdqu (%rbx,%rdx,4), %xmm0
|
||||
- - 1.00 - - - - - - - vmovq %xmm0, %rcx
|
||||
- - - - - - - 1.00 - - vpunpckhqdq %xmm0, %xmm0, %xmm2
|
||||
- - 1.00 - - - - - - - vmovq %xmm2, %r15
|
||||
- - - - - - - - 1.00 - movl %ecx, %r8d
|
||||
- - 0.06 - - - - - 0.94 - shrq $32, %rcx
|
||||
- - - 0.02 - - - 0.98 - - leal (%rcx,%rcx,2), %r14d
|
||||
- - - 0.02 - - - 0.98 - - leal (%r8,%r8,2), %r8d
|
||||
- - 0.47 0.02 - - - - 0.51 - movslq %r8d, %rcx
|
||||
- - 0.46 0.02 - - - 0.01 0.51 - movslq %r14d, %r8
|
||||
- - 0.03 0.01 - - - 0.45 0.51 - movl %r15d, %r14d
|
||||
- - 0.51 - - - - - 0.49 - shrq $32, %r15
|
||||
- - - - 0.49 0.51 - - - - vmovups (%r11,%rcx,8), %xmm7
|
||||
- - - - 0.49 0.51 - - - - vmovups (%r11,%r8,8), %xmm6
|
||||
- - - - 0.52 0.48 - - - - vmovq 16(%r11,%rcx,8), %xmm14
|
||||
- - - 0.02 - - - 0.98 - - leal (%r14,%r14,2), %r14d
|
||||
- - 0.01 0.01 - - - 0.01 0.97 - movslq %r14d, %r14
|
||||
- - - 0.03 - - - 0.97 - - leal (%r15,%r15,2), %r15d
|
||||
- - 0.04 - - - - - 0.96 - movslq %r15d, %r15
|
||||
- - - - 0.07 0.93 - 1.00 - - vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15
|
||||
- - 0.03 0.46 0.49 0.51 - 0.51 - - vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1
|
||||
- - - - 0.51 0.49 - - - - vmovq 16(%r11,%r14,8), %xmm0
|
||||
- - 0.47 0.02 0.93 0.07 - 0.51 - - vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6
|
||||
- - - - 0.50 0.50 - 1.00 - - vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2
|
||||
- - - - - - - 1.00 - - vunpcklpd %ymm6, %ymm1, %ymm14
|
||||
- - - - - - - 1.00 - - vunpckhpd %ymm6, %ymm1, %ymm1
|
||||
- - 0.01 0.99 - - - - - - vsubpd %ymm14, %ymm10, %ymm6
|
||||
- - - - - - - 1.00 - - vinsertf128 $1, %xmm2, %ymm15, %ymm7
|
||||
- - 0.96 0.04 - - - - - - vsubpd %ymm1, %ymm9, %ymm2
|
||||
- - 0.49 0.51 - - - - - - vsubpd %ymm7, %ymm8, %ymm0
|
||||
- - 0.48 0.52 - - - - - - vmulpd %ymm2, %ymm2, %ymm14
|
||||
- - 0.03 0.97 - - - - - - vfmadd231pd %ymm6, %ymm6, %ymm14
|
||||
- - 0.94 0.06 - - - - - - vfmadd231pd %ymm0, %ymm0, %ymm14
|
||||
- - 0.47 0.53 - - - - - - vcmpltpd %ymm5, %ymm14, %ymm1
|
||||
- - 0.96 0.04 - - - - - - vpcmpeqd %ymm7, %ymm7, %ymm7
|
||||
- - 1.00 - - - - 1.00 - - vptest %ymm7, %ymm1
|
||||
- 5.00 1.00 - - - - - - - vdivpd %ymm14, %ymm4, %ymm7
|
||||
- - 0.93 0.07 0.49 0.51 - - - - vmulpd 96(%rsp), %ymm7, %ymm14
|
||||
- - 0.05 0.95 - - - - - - vmulpd %ymm14, %ymm7, %ymm14
|
||||
- - 0.02 0.98 - - - - - - vmulpd %ymm14, %ymm7, %ymm15
|
||||
- - 0.98 0.02 - - - - - - vfmsub213pd %ymm3, %ymm7, %ymm14
|
||||
- - 0.07 0.93 0.51 0.49 - - - - vmulpd 64(%rsp), %ymm7, %ymm7
|
||||
- - 0.01 0.99 - - - - - - vmulpd %ymm7, %ymm15, %ymm15
|
||||
- - 0.01 0.99 - - - - - - vmulpd %ymm14, %ymm15, %ymm7
|
||||
- - 0.03 0.97 - - - - - - vmulpd %ymm7, %ymm6, %ymm6
|
||||
- - 0.97 0.03 - - - - - - vmulpd %ymm7, %ymm2, %ymm2
|
||||
- - 0.03 0.90 - - - 0.07 - - vandpd %ymm6, %ymm1, %ymm6
|
||||
- - 0.06 0.94 - - - - - - vaddpd %ymm6, %ymm13, %ymm13
|
||||
- - 0.03 0.97 - - - - - - vmulpd %ymm7, %ymm0, %ymm6
|
||||
- - 0.46 0.08 - - - 0.46 - - vandpd %ymm2, %ymm1, %ymm0
|
||||
- - 0.47 0.01 - - - 0.52 - - vandpd %ymm6, %ymm1, %ymm1
|
||||
- - 0.48 0.52 - - - - - - vaddpd %ymm0, %ymm12, %ymm12
|
||||
- - 0.52 0.48 - - - - - - vaddpd %ymm1, %ymm11, %ymm11
|
||||
- - 0.01 - - - - - 0.99 - addq $4, %rdx
|
||||
- - - - - - - 0.02 0.98 - cmpq %rsi, %rdx
|
||||
- - 0.45 - - - - - 0.55 - jb ..B1.22
|
||||
@@ -1,158 +0,0 @@
|
||||
|
||||
[0] Code Region
|
||||
|
||||
Iterations: 100
|
||||
Instructions: 5600
|
||||
Total Cycles: 2306
|
||||
Total uOps: 6300
|
||||
|
||||
Dispatch Width: 6
|
||||
uOps Per Cycle: 2.73
|
||||
IPC: 2.43
|
||||
Block RThroughput: 10.5
|
||||
|
||||
|
||||
Instruction Info:
|
||||
[1]: #uOps
|
||||
[2]: Latency
|
||||
[3]: RThroughput
|
||||
[4]: MayLoad
|
||||
[5]: MayStore
|
||||
[6]: HasSideEffects (U)
|
||||
|
||||
[1] [2] [3] [4] [5] [6] Instructions:
|
||||
1 6 0.50 * vmovdqu (%rbx,%rdx,4), %xmm0
|
||||
1 2 1.00 vmovq %xmm0, %rcx
|
||||
1 1 0.50 vpunpckhqdq %xmm0, %xmm0, %xmm2
|
||||
1 2 1.00 vmovq %xmm2, %r15
|
||||
1 1 0.25 movl %ecx, %r8d
|
||||
1 1 0.50 shrq $32, %rcx
|
||||
1 1 0.50 leal (%rcx,%rcx,2), %r14d
|
||||
1 1 0.50 leal (%r8,%r8,2), %r8d
|
||||
1 1 0.25 movslq %r8d, %rcx
|
||||
1 1 0.25 movslq %r14d, %r8
|
||||
1 1 0.25 movl %r15d, %r14d
|
||||
1 1 0.50 shrq $32, %r15
|
||||
1 6 0.50 * vmovups (%r11,%rcx,8), %xmm7
|
||||
1 6 0.50 * vmovups (%r11,%r8,8), %xmm6
|
||||
1 5 0.50 * vmovq 16(%r11,%rcx,8), %xmm14
|
||||
1 1 0.50 leal (%r14,%r14,2), %r14d
|
||||
1 1 0.25 movslq %r14d, %r14
|
||||
1 1 0.50 leal (%r15,%r15,2), %r15d
|
||||
1 1 0.25 movslq %r15d, %r15
|
||||
2 6 1.00 * vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15
|
||||
2 7 0.50 * vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1
|
||||
1 5 0.50 * vmovq 16(%r11,%r14,8), %xmm0
|
||||
2 7 0.50 * vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6
|
||||
2 6 1.00 * vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2
|
||||
1 1 1.00 vunpcklpd %ymm6, %ymm1, %ymm14
|
||||
1 1 1.00 vunpckhpd %ymm6, %ymm1, %ymm1
|
||||
1 4 0.50 vsubpd %ymm14, %ymm10, %ymm6
|
||||
1 3 1.00 vinsertf128 $1, %xmm2, %ymm15, %ymm7
|
||||
1 4 0.50 vsubpd %ymm1, %ymm9, %ymm2
|
||||
1 4 0.50 vsubpd %ymm7, %ymm8, %ymm0
|
||||
1 4 0.50 vmulpd %ymm2, %ymm2, %ymm14
|
||||
1 4 0.50 vfmadd231pd %ymm6, %ymm6, %ymm14
|
||||
1 4 0.50 vfmadd231pd %ymm0, %ymm0, %ymm14
|
||||
1 4 0.50 vcmpltpd %ymm5, %ymm14, %ymm1
|
||||
1 1 0.50 vpcmpeqd %ymm7, %ymm7, %ymm7
|
||||
2 3 1.00 vptest %ymm7, %ymm1
|
||||
1 14 5.00 vdivpd %ymm14, %ymm4, %ymm7
|
||||
2 11 0.50 * vmulpd 96(%rsp), %ymm7, %ymm14
|
||||
1 4 0.50 vmulpd %ymm14, %ymm7, %ymm14
|
||||
1 4 0.50 vmulpd %ymm14, %ymm7, %ymm15
|
||||
1 4 0.50 vfmsub213pd %ymm3, %ymm7, %ymm14
|
||||
2 11 0.50 * vmulpd 64(%rsp), %ymm7, %ymm7
|
||||
1 4 0.50 vmulpd %ymm7, %ymm15, %ymm15
|
||||
1 4 0.50 vmulpd %ymm14, %ymm15, %ymm7
|
||||
1 4 0.50 vmulpd %ymm7, %ymm6, %ymm6
|
||||
1 4 0.50 vmulpd %ymm7, %ymm2, %ymm2
|
||||
1 1 0.33 vandpd %ymm6, %ymm1, %ymm6
|
||||
1 4 0.50 vaddpd %ymm6, %ymm13, %ymm13
|
||||
1 4 0.50 vmulpd %ymm7, %ymm0, %ymm6
|
||||
1 1 0.33 vandpd %ymm2, %ymm1, %ymm0
|
||||
1 1 0.33 vandpd %ymm6, %ymm1, %ymm1
|
||||
1 4 0.50 vaddpd %ymm0, %ymm12, %ymm12
|
||||
1 4 0.50 vaddpd %ymm1, %ymm11, %ymm11
|
||||
1 1 0.25 addq $4, %rdx
|
||||
1 1 0.25 cmpq %rsi, %rdx
|
||||
1 1 0.50 jb ..B1.22
|
||||
|
||||
|
||||
Resources:
|
||||
[0] - ICXDivider
|
||||
[1] - ICXFPDivider
|
||||
[2] - ICXPort0
|
||||
[3] - ICXPort1
|
||||
[4] - ICXPort2
|
||||
[5] - ICXPort3
|
||||
[6] - ICXPort4
|
||||
[7] - ICXPort5
|
||||
[8] - ICXPort6
|
||||
[9] - ICXPort7
|
||||
[10] - ICXPort8
|
||||
[11] - ICXPort9
|
||||
|
||||
|
||||
Resource pressure per iteration:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11]
|
||||
- 5.00 15.12 15.03 5.50 5.50 - 13.45 8.40 - - -
|
||||
|
||||
Resource pressure by instruction:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions:
|
||||
- - - - 0.50 0.50 - - - - - - vmovdqu (%rbx,%rdx,4), %xmm0
|
||||
- - 1.00 - - - - - - - - - vmovq %xmm0, %rcx
|
||||
- - - 0.46 - - - 0.54 - - - - vpunpckhqdq %xmm0, %xmm0, %xmm2
|
||||
- - 1.00 - - - - - - - - - vmovq %xmm2, %r15
|
||||
- - - - - - - - 1.00 - - - movl %ecx, %r8d
|
||||
- - 0.96 - - - - - 0.04 - - - shrq $32, %rcx
|
||||
- - - 0.01 - - - 0.99 - - - - leal (%rcx,%rcx,2), %r14d
|
||||
- - - 0.03 - - - 0.97 - - - - leal (%r8,%r8,2), %r8d
|
||||
- - 0.48 0.01 - - - - 0.51 - - - movslq %r8d, %rcx
|
||||
- - 0.02 0.02 - - - 0.01 0.95 - - - movslq %r14d, %r8
|
||||
- - 0.02 - - - - - 0.98 - - - movl %r15d, %r14d
|
||||
- - 0.52 - - - - - 0.48 - - - shrq $32, %r15
|
||||
- - - - 0.49 0.51 - - - - - - vmovups (%r11,%rcx,8), %xmm7
|
||||
- - - - 0.49 0.51 - - - - - - vmovups (%r11,%r8,8), %xmm6
|
||||
- - - - 0.52 0.48 - - - - - - vmovq 16(%r11,%rcx,8), %xmm14
|
||||
- - - 0.47 - - - 0.53 - - - - leal (%r14,%r14,2), %r14d
|
||||
- - 0.01 0.01 - - - 0.01 0.97 - - - movslq %r14d, %r14
|
||||
- - - 0.04 - - - 0.96 - - - - leal (%r15,%r15,2), %r15d
|
||||
- - 0.48 - - - - 0.01 0.51 - - - movslq %r15d, %r15
|
||||
- - - - 0.51 0.49 - 1.00 - - - - vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15
|
||||
- - 0.02 0.01 0.95 0.05 - 0.97 - - - - vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1
|
||||
- - - - 0.05 0.95 - - - - - - vmovq 16(%r11,%r14,8), %xmm0
|
||||
- - 0.02 0.49 0.49 0.51 - 0.49 - - - - vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6
|
||||
- - - - 0.50 0.50 - 1.00 - - - - vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2
|
||||
- - - - - - - 1.00 - - - - vunpcklpd %ymm6, %ymm1, %ymm14
|
||||
- - - - - - - 1.00 - - - - vunpckhpd %ymm6, %ymm1, %ymm1
|
||||
- - 0.47 0.53 - - - - - - - - vsubpd %ymm14, %ymm10, %ymm6
|
||||
- - - - - - - 1.00 - - - - vinsertf128 $1, %xmm2, %ymm15, %ymm7
|
||||
- - 0.50 0.50 - - - - - - - - vsubpd %ymm1, %ymm9, %ymm2
|
||||
- - 0.94 0.06 - - - - - - - - vsubpd %ymm7, %ymm8, %ymm0
|
||||
- - 0.06 0.94 - - - - - - - - vmulpd %ymm2, %ymm2, %ymm14
|
||||
- - 0.04 0.96 - - - - - - - - vfmadd231pd %ymm6, %ymm6, %ymm14
|
||||
- - 0.95 0.05 - - - - - - - - vfmadd231pd %ymm0, %ymm0, %ymm14
|
||||
- - 0.02 0.98 - - - - - - - - vcmpltpd %ymm5, %ymm14, %ymm1
|
||||
- - 0.05 0.95 - - - - - - - - vpcmpeqd %ymm7, %ymm7, %ymm7
|
||||
- - 1.00 - - - - 1.00 - - - - vptest %ymm7, %ymm1
|
||||
- 5.00 1.00 - - - - - - - - - vdivpd %ymm14, %ymm4, %ymm7
|
||||
- - 0.51 0.49 0.49 0.51 - - - - - - vmulpd 96(%rsp), %ymm7, %ymm14
|
||||
- - 0.04 0.96 - - - - - - - - vmulpd %ymm14, %ymm7, %ymm14
|
||||
- - 0.01 0.99 - - - - - - - - vmulpd %ymm14, %ymm7, %ymm15
|
||||
- - 0.99 0.01 - - - - - - - - vfmsub213pd %ymm3, %ymm7, %ymm14
|
||||
- - 0.49 0.51 0.51 0.49 - - - - - - vmulpd 64(%rsp), %ymm7, %ymm7
|
||||
- - 0.01 0.99 - - - - - - - - vmulpd %ymm7, %ymm15, %ymm15
|
||||
- - 0.01 0.99 - - - - - - - - vmulpd %ymm14, %ymm15, %ymm7
|
||||
- - 0.48 0.52 - - - - - - - - vmulpd %ymm7, %ymm6, %ymm6
|
||||
- - 0.52 0.48 - - - - - - - - vmulpd %ymm7, %ymm2, %ymm2
|
||||
- - 0.46 0.02 - - - 0.52 - - - - vandpd %ymm6, %ymm1, %ymm6
|
||||
- - 0.49 0.51 - - - - - - - - vaddpd %ymm6, %ymm13, %ymm13
|
||||
- - 0.48 0.52 - - - - - - - - vmulpd %ymm7, %ymm0, %ymm6
|
||||
- - 0.02 0.52 - - - 0.46 - - - - vandpd %ymm2, %ymm1, %ymm0
|
||||
- - 0.02 - - - - 0.98 - - - - vandpd %ymm6, %ymm1, %ymm1
|
||||
- - 0.49 0.51 - - - - - - - - vaddpd %ymm0, %ymm12, %ymm12
|
||||
- - 0.51 0.49 - - - - - - - - vaddpd %ymm1, %ymm11, %ymm11
|
||||
- - 0.01 - - - - - 0.99 - - - addq $4, %rdx
|
||||
- - 0.01 - - - - 0.01 0.98 - - - cmpq %rsi, %rdx
|
||||
- - 0.01 - - - - - 0.99 - - - jb ..B1.22
|
||||
@@ -1,97 +0,0 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: lammps-icc-avx2.s
|
||||
Architecture: CSX
|
||||
Timestamp: 2023-02-10 16:29:58
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||
----------------------------------------------------------------------------------------------------
|
||||
256 | | | | | | | | || | | # pointer_increment=32 724d27eafcb27eabca1528ddfdbdba3e
|
||||
257 | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
258 | | | | | | | | || | | ..B1.22: # Preds ..B1.24 ..B1.21
|
||||
259 | | | | | | | | || | | # Execution count [2.50e+01]
|
||||
260 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovdqu (%rbx,%rdx,4), %xmm0 #60.21
|
||||
261 | 1.00 | | | | | | | || 1.0 | | vmovq %xmm0, %rcx #60.21
|
||||
262 | | | | | | 1.000 | | || | | vpunpckhqdq %xmm0, %xmm0, %xmm2 #60.21
|
||||
263 | 1.00 | | | | | | | || | | vmovq %xmm2, %r15 #60.21
|
||||
264 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || 1.0 | | movl %ecx, %r8d #60.21
|
||||
265 | 0.00 | | | | | | 1.00 | || | | shrq $32, %rcx #60.21
|
||||
266 | | 0.500 | | | | 0.500 | | || | | lea (%rcx,%rcx,2), %r14d #61.36
|
||||
267 | | 0.500 | | | | 0.500 | | || 1.0 | | lea (%r8,%r8,2), %r8d #61.36
|
||||
268 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || 1.0 | | movslq %r8d, %rcx #61.36
|
||||
269 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movslq %r14d, %r8 #61.36
|
||||
270 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movl %r15d, %r14d #60.21
|
||||
271 | 0.00 | | | | | | 1.00 | || | | shrq $32, %r15 #60.21
|
||||
272 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovups (%r11,%rcx,8), %xmm7 #61.36
|
||||
273 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%r11,%r8,8), %xmm6 #61.36
|
||||
274 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovq 16(%r11,%rcx,8), %xmm14 #61.36
|
||||
275 | | 0.500 | | | | 0.500 | | || | | lea (%r14,%r14,2), %r14d #61.36
|
||||
276 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movslq %r14d, %r14 #61.36
|
||||
277 | | 0.500 | | | | 0.500 | | || | | lea (%r15,%r15,2), %r15d #61.36
|
||||
278 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movslq %r15d, %r15 #61.36
|
||||
279 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || | | vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15 #61.36
|
||||
280 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || 3.0 | | vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1 #61.36
|
||||
281 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovq 16(%r11,%r14,8), %xmm0 #61.36
|
||||
282 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || | | vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6 #61.36
|
||||
283 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || | | vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2 #61.36
|
||||
284 | | | | | | 1.000 | | || | | vunpcklpd %ymm6, %ymm1, %ymm14 #61.36
|
||||
285 | | | | | | 1.000 | | || 1.0 | | vunpckhpd %ymm6, %ymm1, %ymm1 #61.36
|
||||
286 | 0.50 | 0.500 | | | | | | || | | vsubpd %ymm14, %ymm10, %ymm6 #61.36
|
||||
287 | | | | | | 1.000 | | || | | vinsertf128 $1, %xmm2, %ymm15, %ymm7 #61.36
|
||||
288 | 0.50 | 0.500 | | | | | | || 4.0 | | vsubpd %ymm1, %ymm9, %ymm2 #62.36
|
||||
289 | 0.50 | 0.500 | | | | | | || | | vsubpd %ymm7, %ymm8, %ymm0 #63.36
|
||||
290 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm2, %ymm2, %ymm14 #64.49
|
||||
291 | 0.50 | 0.500 | | | | | | || 4.0 | | vfmadd231pd %ymm6, %ymm6, %ymm14 #64.49
|
||||
292 | 0.50 | 0.500 | | | | | | || 4.0 | | vfmadd231pd %ymm0, %ymm0, %ymm14 #64.63
|
||||
293 | | | | | | 1.000 | | || | | vcmpltpd %ymm5, %ymm14, %ymm1 #74.22
|
||||
294 | 0.50 | 0.500 | | | | | | || | | vpcmpeqd %ymm7, %ymm7, %ymm7 #74.22
|
||||
295 | 1.00 | | | | | 1.000 | | || | | vptest %ymm7, %ymm1 #74.22
|
||||
296 | | | | | | | | || | | #je ..B1.24 # Prob 50% #74.22
|
||||
297 | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ymm14
|
||||
298 | | | | | | | | || | | ..B1.23: # Preds ..B1.22
|
||||
299 | | | | | | | | || | | # Execution count [1.25e+01]
|
||||
300 | 1.00 8.00 | | | | | | | || 15.0 | | vdivpd %ymm14, %ymm4, %ymm7 #75.39
|
||||
301 | 0.50 | 0.500 | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmulpd 96(%rsp), %ymm7, %ymm14 #76.38[spill]
|
||||
302 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm14 #76.44
|
||||
303 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm15 #76.50
|
||||
304 | 0.50 | 0.500 | | | | | | || | | vfmsub213pd %ymm3, %ymm7, %ymm14 #77.55
|
||||
305 | 0.50 | 0.500 | 0.50 0.50 | 0.50 0.50 | | | | || | | vmulpd 64(%rsp), %ymm7, %ymm7 #77.55[spill]
|
||||
306 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm7, %ymm15, %ymm15 #77.64
|
||||
307 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm14, %ymm15, %ymm7 #77.70
|
||||
308 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm7, %ymm6, %ymm6 #78.31
|
||||
309 | 0.50 | 0.500 | | | | | | || | | vmulpd %ymm7, %ymm2, %ymm2 #79.31
|
||||
310 | 0.25 | 0.253 | | | | 0.493 | | || 1.0 | | vandpd %ymm6, %ymm1, %ymm6 #78.31
|
||||
311 | 0.50 | 0.500 | | | | | | || 4.0 | | vaddpd %ymm6, %ymm13, %ymm13 #78.17
|
||||
312 | 0.25 | 0.750 | | | | | | || | | vmulpd %ymm7, %ymm0, %ymm6 #80.31
|
||||
313 | 0.16 | 0.417 | | | | 0.423 | | || | | vandpd %ymm2, %ymm1, %ymm0 #79.31
|
||||
314 | 0.00 | 0.250 | | | | 0.750 | | || | | vandpd %ymm6, %ymm1, %ymm1 #80.31
|
||||
315 | 0.00 | 1.000 | | | | | | || | | vaddpd %ymm0, %ymm12, %ymm12 #79.17
|
||||
316 | 0.50 | 0.500 | | | | | | || | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17
|
||||
317 | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm3 ymm4 ymm5 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13
|
||||
318 | | | | | | | | || | | ..B1.24: # Preds ..B1.23 ..B1.22
|
||||
319 | | | | | | | | || | | # Execution count [2.50e+01]
|
||||
320 | 0.00 | 0.000 | | | | -0.01 | 1.00 | || | | addq $4, %rdx #59.9
|
||||
321 | 0.00 | -0.01 | | | | 0.000 | 1.00 | || | | cmpq %rsi, %rdx #59.9
|
||||
322 | | | | | | | | || | | * jb ..B1.22 # Prob 82% #59.9
|
||||
323 | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
13.7 8.00 13.66 5.50 5.50 5.50 5.50 13.66 10.0 76.0 4.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
316 | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17| [316]
|
||||
315 | 4.0 | vaddpd %ymm0, %ymm12, %ymm12 #79.17| [315]
|
||||
311 | 4.0 | vaddpd %ymm6, %ymm13, %ymm13 #78.17| [311]
|
||||
320 | 1.0 | addq $4, %rdx #59.9| [320]
|
||||
|
||||
@@ -1,97 +0,0 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: lammps-icc-avx2.s
|
||||
Architecture: ICX
|
||||
Timestamp: 2023-02-10 16:29:48
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD |
|
||||
-----------------------------------------------------------------------------------------------------------------------
|
||||
256 | | | | | | | | | | || | | # pointer_increment=32 724d27eafcb27eabca1528ddfdbdba3e
|
||||
257 | | | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
258 | | | | | | | | | | || | | ..B1.22: # Preds ..B1.24 ..B1.21
|
||||
259 | | | | | | | | | | || | | # Execution count [2.50e+01]
|
||||
260 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovdqu (%rbx,%rdx,4), %xmm0 #60.21
|
||||
261 | 1.00 | | | | | | | | | || 1.0 | | vmovq %xmm0, %rcx #60.21
|
||||
262 | | 0.50 | | | | 0.50 | | | | || | | vpunpckhqdq %xmm0, %xmm0, %xmm2 #60.21
|
||||
263 | 1.00 | | | | | | | | | || | | vmovq %xmm2, %r15 #60.21
|
||||
264 | 0.37 | 0.00 | | | | 0.25 | 0.38 | | | || 1.0 | | movl %ecx, %r8d #60.21
|
||||
265 | 0.50 | | | | | | 0.50 | | | || | | shrq $32, %rcx #60.21
|
||||
266 | 0.13 | 0.00 | | | | 0.00 | 0.87 | | | || | | lea (%rcx,%rcx,2), %r14d #61.36
|
||||
267 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || 6.0 | | lea (%r8,%r8,2), %r8d #61.36
|
||||
268 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || 1.0 | | movslq %r8d, %rcx #61.36
|
||||
269 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movslq %r14d, %r8 #61.36
|
||||
270 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movl %r15d, %r14d #60.21
|
||||
271 | 0.00 | | | | | | 1.00 | | | || | | shrq $32, %r15 #60.21
|
||||
272 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovups (%r11,%rcx,8), %xmm7 #61.36
|
||||
273 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovups (%r11,%r8,8), %xmm6 #61.36
|
||||
274 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovq 16(%r11,%rcx,8), %xmm14 #61.36
|
||||
275 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | lea (%r14,%r14,2), %r14d #61.36
|
||||
276 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movslq %r14d, %r14 #61.36
|
||||
277 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | lea (%r15,%r15,2), %r15d #61.36
|
||||
278 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movslq %r15d, %r15 #61.36
|
||||
279 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || | | vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15 #61.36
|
||||
280 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || 3.0 | | vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1 #61.36
|
||||
281 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovq 16(%r11,%r14,8), %xmm0 #61.36
|
||||
282 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || | | vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6 #61.36
|
||||
283 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || | | vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2 #61.36
|
||||
284 | | | | | | 1.00 | | | | || | | vunpcklpd %ymm6, %ymm1, %ymm14 #61.36
|
||||
285 | | | | | | 1.00 | | | | || 1.0 | | vunpckhpd %ymm6, %ymm1, %ymm1 #61.36
|
||||
286 | 0.50 | 0.50 | | | | | | | | || | | vsubpd %ymm14, %ymm10, %ymm6 #61.36
|
||||
287 | | | | | | 1.00 | | | | || | | vinsertf128 $1, %xmm2, %ymm15, %ymm7 #61.36
|
||||
288 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vsubpd %ymm1, %ymm9, %ymm2 #62.36
|
||||
289 | 0.50 | 0.50 | | | | | | | | || | | vsubpd %ymm7, %ymm8, %ymm0 #63.36
|
||||
290 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm2, %ymm2, %ymm14 #64.49
|
||||
291 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vfmadd231pd %ymm6, %ymm6, %ymm14 #64.49
|
||||
292 | 0.75 | 0.25 | | | | | | | | || 4.0 | | vfmadd231pd %ymm0, %ymm0, %ymm14 #64.63
|
||||
293 | 0.00 | | | | | 1.00 | | | | || | | vcmpltpd %ymm5, %ymm14, %ymm1 #74.22
|
||||
294 | 0.50 | 0.50 | | | | | | | | || | | vpcmpeqd %ymm7, %ymm7, %ymm7 #74.22
|
||||
295 | 1.00 | | | | | 1.00 | | | | || | | vptest %ymm7, %ymm1 #74.22
|
||||
296 | | | | | | | | | | || | | #je ..B1.24 # Prob 50% #74.22
|
||||
297 | | | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ymm14
|
||||
298 | | | | | | | | | | || | | ..B1.23: # Preds ..B1.22
|
||||
299 | | | | | | | | | | || | | # Execution count [1.25e+01]
|
||||
300 | 1.00 8.00 | | | | | | | | | || 13.0 | | vdivpd %ymm14, %ymm4, %ymm7 #75.39
|
||||
301 | 0.50 | 0.50 | 0.50 0.50 | 0.50 0.50 | | | | | | || 4.0 | | vmulpd 96(%rsp), %ymm7, %ymm14 #76.38[spill]
|
||||
302 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm14 #76.44
|
||||
303 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm15 #76.50
|
||||
304 | 0.50 | 0.50 | | | | | | | | || | | vfmsub213pd %ymm3, %ymm7, %ymm14 #77.55
|
||||
305 | 0.50 | 0.50 | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmulpd 64(%rsp), %ymm7, %ymm7 #77.55[spill]
|
||||
306 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm7, %ymm15, %ymm15 #77.64
|
||||
307 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm14, %ymm15, %ymm7 #77.70
|
||||
308 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm7, %ymm6, %ymm6 #78.31
|
||||
309 | 0.00 | 1.00 | | | | | | | | || | | vmulpd %ymm7, %ymm2, %ymm2 #79.31
|
||||
310 | 0.00 | 0.00 | | | | 1.00 | | | | || 1.0 | | vandpd %ymm6, %ymm1, %ymm6 #78.31
|
||||
311 | 0.00 | 1.00 | | | | | | | | || 4.0 | | vaddpd %ymm6, %ymm13, %ymm13 #78.17
|
||||
312 | 0.00 | 1.00 | | | | | | | | || | | vmulpd %ymm7, %ymm0, %ymm6 #80.31
|
||||
313 | 0.00 | 0.00 | | | | 1.00 | | | | || | | vandpd %ymm2, %ymm1, %ymm0 #79.31
|
||||
314 | 0.00 | 0.00 | | | | 1.00 | | | | || | | vandpd %ymm6, %ymm1, %ymm1 #80.31
|
||||
315 | 0.00 | 1.00 | | | | | | | | || | | vaddpd %ymm0, %ymm12, %ymm12 #79.17
|
||||
316 | 0.00 | 1.00 | | | | | | | | || | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17
|
||||
317 | | | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm3 ymm4 ymm5 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13
|
||||
318 | | | | | | | | | | || | | ..B1.24: # Preds ..B1.23 ..B1.22
|
||||
319 | | | | | | | | | | || | | # Execution count [2.50e+01]
|
||||
320 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | addq $4, %rdx #59.9
|
||||
321 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | cmpq %rsi, %rdx #59.9
|
||||
322 | | | | | | | | | | || | | * jb ..B1.22 # Prob 82% #59.9
|
||||
323 | | | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
12.8 8.00 12.8 5.50 5.50 5.50 5.50 12.8 12.8 81 4.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
316 | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17| [316]
|
||||
315 | 4.0 | vaddpd %ymm0, %ymm12, %ymm12 #79.17| [315]
|
||||
311 | 4.0 | vaddpd %ymm6, %ymm13, %ymm13 #78.17| [311]
|
||||
320 | 1.0 | addq $4, %rdx #59.9| [320]
|
||||
|
||||
@@ -1,75 +0,0 @@
|
||||
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||
Analyzed File - lammps-icc-avx512.o
|
||||
Binary Format - 64Bit
|
||||
Architecture - SKX
|
||||
Analysis Type - Throughput
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
Block Throughput: 30.89 Cycles Throughput Bottleneck: Backend
|
||||
Loop Count: 22
|
||||
Port Binding In Cycles Per Iteration:
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Cycles | 19.0 0.0 | 4.0 | 13.0 13.0 | 13.0 13.0 | 0.0 | 17.0 | 4.0 | 0.0 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
|
||||
DV - Divider pipe (on port 0)
|
||||
D - Data fetch pipe (on ports 2 and 3)
|
||||
F - Macro Fusion with the previous instruction occurred
|
||||
* - instruction micro-ops not bound to a port
|
||||
^ - Micro Fusion occurred
|
||||
# - ESP Tracking sync uop was issued
|
||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||
X - instruction not supported, was not accounted in Analysis
|
||||
|
||||
| Num Of | Ports pressure in cycles | |
|
||||
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
-----------------------------------------------------------------------------------------
|
||||
| 1 | | | | | | 1.0 | | | vpcmpgtd k5, ymm3, ymm4
|
||||
| 1 | | 1.0 | | | | | | | vpaddd ymm4, ymm4, ymm15
|
||||
| 2 | | 1.0 | 1.0 1.0 | | | | | | vmovdqu32 ymm17{k5}{z}, ymmword ptr [r10+r15*4]
|
||||
| 1 | | 1.0 | | | | | | | vpaddd ymm18, ymm17, ymm17
|
||||
| 1 | | | | | | | 1.0 | | add r15, 0x8
|
||||
| 1 | | 1.0 | | | | | | | vpaddd ymm19, ymm17, ymm18
|
||||
| 1 | 1.0 | | | | | | | | kmovw k2, k5
|
||||
| 1 | 1.0 | | | | | | | | kmovw k3, k5
|
||||
| 1 | 1.0 | | | | | | | | kmovw k1, k5
|
||||
| 1* | | | | | | | | | vpxord zmm21, zmm21, zmm21
|
||||
| 1* | | | | | | | | | vpxord zmm20, zmm20, zmm20
|
||||
| 1* | | | | | | | | | vpxord zmm22, zmm22, zmm22
|
||||
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm21, k2, zmmword ptr [rbx+ymm19*8+0x8]
|
||||
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm20, k3, zmmword ptr [rbx+ymm19*8]
|
||||
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm22, k1, zmmword ptr [rbx+ymm19*8+0x10]
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm18, zmm1, zmm21
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm17, zmm2, zmm20
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm19, zmm0, zmm22
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm31, zmm18, zmm18
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm31, zmm17, zmm17
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm31, zmm19, zmm19
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm30, zmm31
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k6{k5}, zmm31, zmm14, 0x1
|
||||
| 1 | | | | | | 1.0 | | | vfpclasspd k0, zmm30, 0x1e
|
||||
| 1* | | | | | | | | | vmovaps zmm23, zmm31
|
||||
| 2^ | 1.0 | | | 1.0 1.0 | | | | | vfnmadd213pd zmm23, zmm30, qword ptr [rip]{1to8}
|
||||
| 1 | 1.0 | | | | | | | | knotw k4, k0
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm24, zmm23, zmm23
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm30{k4}, zmm23, zmm30
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm30{k4}, zmm24, zmm30
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm25, zmm30, zmm13
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm27, zmm30, zmm12
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm28, zmm30, zmm25
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm26, zmm30, zmm28
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213pd zmm30, zmm28, zmm5
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm29, zmm26, zmm27
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm23, zmm29, zmm30
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm10{k6}, zmm23, zmm17
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm9{k6}, zmm23, zmm18
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm8{k6}, zmm23, zmm19
|
||||
| 1* | | | | | | | | | cmp r15, r14
|
||||
| 0*F | | | | | | | | | jb 0xffffffffffffff0c
|
||||
Total Num Of Uops: 57
|
||||
Analysis Notes:
|
||||
Backend allocation was stalled due to unavailable allocation resources.
|
||||
There were bubbles in the frontend.
|
||||
@@ -1,128 +0,0 @@
|
||||
|
||||
[0] Code Region
|
||||
|
||||
Iterations: 100
|
||||
Instructions: 4200
|
||||
Total Cycles: 2465
|
||||
Total uOps: 5800
|
||||
|
||||
Dispatch Width: 6
|
||||
uOps Per Cycle: 2.35
|
||||
IPC: 1.70
|
||||
Block RThroughput: 13.0
|
||||
|
||||
|
||||
Instruction Info:
|
||||
[1]: #uOps
|
||||
[2]: Latency
|
||||
[3]: RThroughput
|
||||
[4]: MayLoad
|
||||
[5]: MayStore
|
||||
[6]: HasSideEffects (U)
|
||||
|
||||
[1] [2] [3] [4] [5] [6] Instructions:
|
||||
1 4 1.00 vpcmpgtd %ymm4, %ymm3, %k5
|
||||
1 1 0.33 vpaddd %ymm15, %ymm4, %ymm4
|
||||
2 8 0.50 * vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z}
|
||||
1 1 0.33 vpaddd %ymm17, %ymm17, %ymm18
|
||||
1 1 0.25 addq $8, %r15
|
||||
1 1 0.33 vpaddd %ymm18, %ymm17, %ymm19
|
||||
1 1 1.00 kmovw %k5, %k2
|
||||
1 1 1.00 kmovw %k5, %k3
|
||||
1 1 1.00 kmovw %k5, %k1
|
||||
1 0 0.17 vpxord %zmm21, %zmm21, %zmm21
|
||||
1 0 0.17 vpxord %zmm20, %zmm20, %zmm20
|
||||
1 0 0.17 vpxord %zmm22, %zmm22, %zmm22
|
||||
5 21 4.00 * vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2}
|
||||
5 21 4.00 * vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3}
|
||||
5 21 4.00 * vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1}
|
||||
1 4 0.50 vsubpd %zmm21, %zmm1, %zmm18
|
||||
1 4 0.50 vsubpd %zmm20, %zmm2, %zmm17
|
||||
1 4 0.50 vsubpd %zmm22, %zmm0, %zmm19
|
||||
1 4 0.50 vmulpd %zmm18, %zmm18, %zmm31
|
||||
1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm31
|
||||
1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm31
|
||||
3 4 2.00 vrcp14pd %zmm31, %zmm30
|
||||
1 4 1.00 vcmpltpd %zmm14, %zmm31, %k6 {%k5}
|
||||
1 4 1.00 vfpclasspd $30, %zmm30, %k0
|
||||
1 1 0.50 vmovaps %zmm31, %zmm23
|
||||
2 11 0.50 * vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
|
||||
1 1 1.00 knotw %k0, %k4
|
||||
1 4 0.50 vmulpd %zmm23, %zmm23, %zmm24
|
||||
1 4 0.50 vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4}
|
||||
1 4 0.50 vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4}
|
||||
1 4 0.50 vmulpd %zmm13, %zmm30, %zmm25
|
||||
1 4 0.50 vmulpd %zmm12, %zmm30, %zmm27
|
||||
1 4 0.50 vmulpd %zmm25, %zmm30, %zmm28
|
||||
1 4 0.50 vmulpd %zmm28, %zmm30, %zmm26
|
||||
1 4 0.50 vfmsub213pd %zmm5, %zmm28, %zmm30
|
||||
1 4 0.50 vmulpd %zmm27, %zmm26, %zmm29
|
||||
1 4 0.50 vmulpd %zmm30, %zmm29, %zmm23
|
||||
1 4 0.50 vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6}
|
||||
1 4 0.50 vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6}
|
||||
1 4 0.50 vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6}
|
||||
1 1 0.25 cmpq %r14, %r15
|
||||
1 1 0.50 jb ..B1.16
|
||||
|
||||
|
||||
Resources:
|
||||
[0] - SKXDivider
|
||||
[1] - SKXFPDivider
|
||||
[2] - SKXPort0
|
||||
[3] - SKXPort1
|
||||
[4] - SKXPort2
|
||||
[5] - SKXPort3
|
||||
[6] - SKXPort4
|
||||
[7] - SKXPort5
|
||||
[8] - SKXPort6
|
||||
[9] - SKXPort7
|
||||
|
||||
|
||||
Resource pressure per iteration:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
|
||||
- - 19.02 6.79 12.64 13.36 - 16.03 5.16 -
|
||||
|
||||
Resource pressure by instruction:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
|
||||
- - - - - - - 1.00 - - vpcmpgtd %ymm4, %ymm3, %k5
|
||||
- - 0.28 0.72 - - - - - - vpaddd %ymm15, %ymm4, %ymm4
|
||||
- - 0.14 0.71 0.55 0.45 - 0.15 - - vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z}
|
||||
- - - 0.97 - - - 0.03 - - vpaddd %ymm17, %ymm17, %ymm18
|
||||
- - 0.14 0.41 - - - 0.13 0.32 - addq $8, %r15
|
||||
- - - 0.99 - - - 0.01 - - vpaddd %ymm18, %ymm17, %ymm19
|
||||
- - 1.00 - - - - - - - kmovw %k5, %k2
|
||||
- - 1.00 - - - - - - - kmovw %k5, %k3
|
||||
- - 1.00 - - - - - - - kmovw %k5, %k1
|
||||
- - - - - - - - - - vpxord %zmm21, %zmm21, %zmm21
|
||||
- - - - - - - - - - vpxord %zmm20, %zmm20, %zmm20
|
||||
- - - - - - - - - - vpxord %zmm22, %zmm22, %zmm22
|
||||
- - 1.00 0.99 3.52 4.48 - 0.01 1.00 - vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2}
|
||||
- - 1.00 0.99 4.48 3.52 - 0.01 1.00 - vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3}
|
||||
- - 1.00 1.00 3.52 4.48 - - 1.00 - vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1}
|
||||
- - 0.02 - - - - 0.98 - - vsubpd %zmm21, %zmm1, %zmm18
|
||||
- - 0.17 - - - - 0.83 - - vsubpd %zmm20, %zmm2, %zmm17
|
||||
- - 0.18 - - - - 0.82 - - vsubpd %zmm22, %zmm0, %zmm19
|
||||
- - 0.01 - - - - 0.99 - - vmulpd %zmm18, %zmm18, %zmm31
|
||||
- - 0.69 - - - - 0.31 - - vfmadd231pd %zmm17, %zmm17, %zmm31
|
||||
- - 0.68 - - - - 0.32 - - vfmadd231pd %zmm19, %zmm19, %zmm31
|
||||
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm31, %zmm30
|
||||
- - - - - - - 1.00 - - vcmpltpd %zmm14, %zmm31, %k6 {%k5}
|
||||
- - - - - - - 1.00 - - vfpclasspd $30, %zmm30, %k0
|
||||
- - 0.83 - - - - 0.17 - - vmovaps %zmm31, %zmm23
|
||||
- - 1.00 - 0.57 0.43 - - - - vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
|
||||
- - 1.00 - - - - - - - knotw %k0, %k4
|
||||
- - 0.44 - - - - 0.56 - - vmulpd %zmm23, %zmm23, %zmm24
|
||||
- - 0.56 - - - - 0.44 - - vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4}
|
||||
- - 0.55 - - - - 0.45 - - vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4}
|
||||
- - 0.69 - - - - 0.31 - - vmulpd %zmm13, %zmm30, %zmm25
|
||||
- - 0.31 - - - - 0.69 - - vmulpd %zmm12, %zmm30, %zmm27
|
||||
- - 0.56 - - - - 0.44 - - vmulpd %zmm25, %zmm30, %zmm28
|
||||
- - 0.02 - - - - 0.98 - - vmulpd %zmm28, %zmm30, %zmm26
|
||||
- - 0.98 - - - - 0.02 - - vfmsub213pd %zmm5, %zmm28, %zmm30
|
||||
- - 0.30 - - - - 0.70 - - vmulpd %zmm27, %zmm26, %zmm29
|
||||
- - 0.16 - - - - 0.84 - - vmulpd %zmm30, %zmm29, %zmm23
|
||||
- - 0.17 - - - - 0.83 - - vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6}
|
||||
- - 0.83 - - - - 0.17 - - vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6}
|
||||
- - 0.17 - - - - 0.83 - - vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6}
|
||||
- - - 0.01 - - - 0.01 0.98 - cmpq %r14, %r15
|
||||
- - 0.14 - - - - - 0.86 - jb ..B1.16
|
||||
@@ -1,130 +0,0 @@
|
||||
|
||||
[0] Code Region
|
||||
|
||||
Iterations: 100
|
||||
Instructions: 4200
|
||||
Total Cycles: 2465
|
||||
Total uOps: 5800
|
||||
|
||||
Dispatch Width: 6
|
||||
uOps Per Cycle: 2.35
|
||||
IPC: 1.70
|
||||
Block RThroughput: 13.0
|
||||
|
||||
|
||||
Instruction Info:
|
||||
[1]: #uOps
|
||||
[2]: Latency
|
||||
[3]: RThroughput
|
||||
[4]: MayLoad
|
||||
[5]: MayStore
|
||||
[6]: HasSideEffects (U)
|
||||
|
||||
[1] [2] [3] [4] [5] [6] Instructions:
|
||||
1 4 1.00 vpcmpgtd %ymm4, %ymm3, %k5
|
||||
1 1 0.33 vpaddd %ymm15, %ymm4, %ymm4
|
||||
2 8 0.50 * vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z}
|
||||
1 1 0.33 vpaddd %ymm17, %ymm17, %ymm18
|
||||
1 1 0.25 addq $8, %r15
|
||||
1 1 0.33 vpaddd %ymm18, %ymm17, %ymm19
|
||||
1 1 1.00 kmovw %k5, %k2
|
||||
1 1 1.00 kmovw %k5, %k3
|
||||
1 1 1.00 kmovw %k5, %k1
|
||||
1 0 0.17 vpxord %zmm21, %zmm21, %zmm21
|
||||
1 0 0.17 vpxord %zmm20, %zmm20, %zmm20
|
||||
1 0 0.17 vpxord %zmm22, %zmm22, %zmm22
|
||||
5 21 4.00 * vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2}
|
||||
5 21 4.00 * vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3}
|
||||
5 21 4.00 * vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1}
|
||||
1 4 0.50 vsubpd %zmm21, %zmm1, %zmm18
|
||||
1 4 0.50 vsubpd %zmm20, %zmm2, %zmm17
|
||||
1 4 0.50 vsubpd %zmm22, %zmm0, %zmm19
|
||||
1 4 0.50 vmulpd %zmm18, %zmm18, %zmm31
|
||||
1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm31
|
||||
1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm31
|
||||
3 4 2.00 vrcp14pd %zmm31, %zmm30
|
||||
1 4 1.00 vcmpltpd %zmm14, %zmm31, %k6 {%k5}
|
||||
1 4 1.00 vfpclasspd $30, %zmm30, %k0
|
||||
1 1 0.50 vmovaps %zmm31, %zmm23
|
||||
2 11 0.50 * vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
|
||||
1 1 1.00 knotw %k0, %k4
|
||||
1 4 0.50 vmulpd %zmm23, %zmm23, %zmm24
|
||||
1 4 0.50 vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4}
|
||||
1 4 0.50 vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4}
|
||||
1 4 0.50 vmulpd %zmm13, %zmm30, %zmm25
|
||||
1 4 0.50 vmulpd %zmm12, %zmm30, %zmm27
|
||||
1 4 0.50 vmulpd %zmm25, %zmm30, %zmm28
|
||||
1 4 0.50 vmulpd %zmm28, %zmm30, %zmm26
|
||||
1 4 0.50 vfmsub213pd %zmm5, %zmm28, %zmm30
|
||||
1 4 0.50 vmulpd %zmm27, %zmm26, %zmm29
|
||||
1 4 0.50 vmulpd %zmm30, %zmm29, %zmm23
|
||||
1 4 0.50 vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6}
|
||||
1 4 0.50 vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6}
|
||||
1 4 0.50 vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6}
|
||||
1 1 0.25 cmpq %r14, %r15
|
||||
1 1 0.50 jb ..B1.16
|
||||
|
||||
|
||||
Resources:
|
||||
[0] - ICXDivider
|
||||
[1] - ICXFPDivider
|
||||
[2] - ICXPort0
|
||||
[3] - ICXPort1
|
||||
[4] - ICXPort2
|
||||
[5] - ICXPort3
|
||||
[6] - ICXPort4
|
||||
[7] - ICXPort5
|
||||
[8] - ICXPort6
|
||||
[9] - ICXPort7
|
||||
[10] - ICXPort8
|
||||
[11] - ICXPort9
|
||||
|
||||
|
||||
Resource pressure per iteration:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11]
|
||||
- - 19.02 6.79 12.64 13.36 - 16.03 5.16 - - -
|
||||
|
||||
Resource pressure by instruction:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions:
|
||||
- - - - - - - 1.00 - - - - vpcmpgtd %ymm4, %ymm3, %k5
|
||||
- - 0.28 0.72 - - - - - - - - vpaddd %ymm15, %ymm4, %ymm4
|
||||
- - 0.14 0.71 0.55 0.45 - 0.15 - - - - vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z}
|
||||
- - - 0.97 - - - 0.03 - - - - vpaddd %ymm17, %ymm17, %ymm18
|
||||
- - 0.14 0.41 - - - 0.13 0.32 - - - addq $8, %r15
|
||||
- - - 0.99 - - - 0.01 - - - - vpaddd %ymm18, %ymm17, %ymm19
|
||||
- - 1.00 - - - - - - - - - kmovw %k5, %k2
|
||||
- - 1.00 - - - - - - - - - kmovw %k5, %k3
|
||||
- - 1.00 - - - - - - - - - kmovw %k5, %k1
|
||||
- - - - - - - - - - - - vpxord %zmm21, %zmm21, %zmm21
|
||||
- - - - - - - - - - - - vpxord %zmm20, %zmm20, %zmm20
|
||||
- - - - - - - - - - - - vpxord %zmm22, %zmm22, %zmm22
|
||||
- - 1.00 0.99 3.52 4.48 - 0.01 1.00 - - - vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2}
|
||||
- - 1.00 0.99 4.48 3.52 - 0.01 1.00 - - - vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3}
|
||||
- - 1.00 1.00 3.52 4.48 - - 1.00 - - - vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1}
|
||||
- - 0.02 - - - - 0.98 - - - - vsubpd %zmm21, %zmm1, %zmm18
|
||||
- - 0.17 - - - - 0.83 - - - - vsubpd %zmm20, %zmm2, %zmm17
|
||||
- - 0.18 - - - - 0.82 - - - - vsubpd %zmm22, %zmm0, %zmm19
|
||||
- - 0.01 - - - - 0.99 - - - - vmulpd %zmm18, %zmm18, %zmm31
|
||||
- - 0.69 - - - - 0.31 - - - - vfmadd231pd %zmm17, %zmm17, %zmm31
|
||||
- - 0.68 - - - - 0.32 - - - - vfmadd231pd %zmm19, %zmm19, %zmm31
|
||||
- - 2.00 - - - - 1.00 - - - - vrcp14pd %zmm31, %zmm30
|
||||
- - - - - - - 1.00 - - - - vcmpltpd %zmm14, %zmm31, %k6 {%k5}
|
||||
- - - - - - - 1.00 - - - - vfpclasspd $30, %zmm30, %k0
|
||||
- - 0.83 - - - - 0.17 - - - - vmovaps %zmm31, %zmm23
|
||||
- - 1.00 - 0.57 0.43 - - - - - - vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
|
||||
- - 1.00 - - - - - - - - - knotw %k0, %k4
|
||||
- - 0.44 - - - - 0.56 - - - - vmulpd %zmm23, %zmm23, %zmm24
|
||||
- - 0.56 - - - - 0.44 - - - - vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4}
|
||||
- - 0.55 - - - - 0.45 - - - - vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4}
|
||||
- - 0.69 - - - - 0.31 - - - - vmulpd %zmm13, %zmm30, %zmm25
|
||||
- - 0.31 - - - - 0.69 - - - - vmulpd %zmm12, %zmm30, %zmm27
|
||||
- - 0.56 - - - - 0.44 - - - - vmulpd %zmm25, %zmm30, %zmm28
|
||||
- - 0.02 - - - - 0.98 - - - - vmulpd %zmm28, %zmm30, %zmm26
|
||||
- - 0.98 - - - - 0.02 - - - - vfmsub213pd %zmm5, %zmm28, %zmm30
|
||||
- - 0.30 - - - - 0.70 - - - - vmulpd %zmm27, %zmm26, %zmm29
|
||||
- - 0.16 - - - - 0.84 - - - - vmulpd %zmm30, %zmm29, %zmm23
|
||||
- - 0.17 - - - - 0.83 - - - - vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6}
|
||||
- - 0.83 - - - - 0.17 - - - - vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6}
|
||||
- - 0.17 - - - - 0.83 - - - - vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6}
|
||||
- - - 0.01 - - - 0.01 0.98 - - - cmpq %r14, %r15
|
||||
- - 0.14 - - - - - 0.86 - - - jb ..B1.16
|
||||
@@ -1,77 +0,0 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: lammps-icc-avx512.s
|
||||
Architecture: CSX
|
||||
Timestamp: 2023-02-10 16:30:08
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
200 | | | | | | | | || | | # pointer_increment=64 1303ca335e79351a96cfc07b8b9ec9d4
|
||||
201 | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
202 | | | | | | | | || | | ..B1.16: # Preds ..B1.16 ..B1.15
|
||||
203 | | | | | | | | || | | # Execution count [2.50e+01]
|
||||
204 | | | | | | 1.00 | | || | | vpcmpgtd %ymm4, %ymm3, %k5 #59.9
|
||||
205 | 0.00 | 1.00 | | | | 0.00 | | || | | vpaddd %ymm15, %ymm4, %ymm4 #59.9
|
||||
206 | 0.00 | 1.00 | 0.50 0.50 | 0.50 0.50 | | 0.00 | | || 0.0 | | vmovdqu32 (%r10,%r15,4), %ymm17{%k5}{z} #60.21
|
||||
207 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm17, %ymm17, %ymm18 #61.36
|
||||
208 | 0.00 | 0.16 | | | | 0.00 | 0.84 | || | | addq $8, %r15 #59.9
|
||||
209 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm18, %ymm17, %ymm19 #61.36
|
||||
210 | 1.00 | | | | | | | || | | kmovw %k5, %k2 #61.36
|
||||
211 | 1.00 | | | | | | | || | | kmovw %k5, %k3 #61.36
|
||||
212 | 1.00 | | | | | | | || | | kmovw %k5, %k1 #61.36
|
||||
213 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm21, %zmm21, %zmm21 #61.36
|
||||
214 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm20, %zmm20, %zmm20 #61.36
|
||||
215 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm22, %zmm22, %zmm22 #61.36
|
||||
216 | 1.25 | 0.75 | 5.00 5.00 | 5.00 5.00 | | 0.25 | 0.75 | || 24.0 | | vgatherdpd 8(%rbx,%ymm19,8), %zmm21{%k2} #61.36
|
||||
217 | 1.25 | 0.25 | 5.00 5.00 | 5.00 5.00 | | 0.25 | 1.25 | || | | vgatherdpd (%rbx,%ymm19,8), %zmm20{%k3} #61.36
|
||||
218 | 1.25 | 0.09 | 5.00 5.00 | 5.00 5.00 | | 0.25 | 1.41 | || | | vgatherdpd 16(%rbx,%ymm19,8), %zmm22{%k1} #61.36
|
||||
219 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm21, %zmm1, %zmm18 #62.36
|
||||
220 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm20, %zmm2, %zmm17 #61.36
|
||||
221 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm22, %zmm0, %zmm19 #63.36
|
||||
222 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm18, %zmm18, %zmm31 #64.49
|
||||
223 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm17, %zmm17, %zmm31 #64.49
|
||||
224 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm19, %zmm19, %zmm31 #64.63
|
||||
225 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm31, %zmm30 #75.39
|
||||
226 | | | | | | 1.00 | | || | | vcmppd $1, %zmm14, %zmm31, %k6{%k5} #74.22
|
||||
227 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm30, %k0 #75.39
|
||||
228 | | | | | | | | || | | * vmovaps %zmm31, %zmm23 #75.39
|
||||
229 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23 #75.39
|
||||
230 | 1.00 | | | | | | | || | | knotw %k0, %k4 #75.39
|
||||
231 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm23, %zmm23, %zmm24 #75.39
|
||||
232 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm30, %zmm23, %zmm30{%k4} #75.39
|
||||
233 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd213pd %zmm30, %zmm24, %zmm30{%k4} #75.39
|
||||
234 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm13, %zmm30, %zmm25 #76.38
|
||||
235 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm12, %zmm30, %zmm27 #77.55
|
||||
236 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm25, %zmm30, %zmm28 #76.44
|
||||
237 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm28, %zmm30, %zmm26 #76.50
|
||||
238 | 0.00 | | | | | 1.00 | | || | | vfmsub213pd %zmm5, %zmm28, %zmm30 #77.55
|
||||
239 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm27, %zmm26, %zmm29 #77.64
|
||||
240 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm30, %zmm29, %zmm23 #77.70
|
||||
241 | 0.00 | | | | | 1.00 | | || 4.0 | | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17
|
||||
242 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17
|
||||
243 | 0.00 | | | | | 1.00 | | || | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17
|
||||
244 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || | | cmpq %r14, %r15 #59.9
|
||||
245 | | | | | | | | || | | * jb ..B1.16 # Prob 82% #59.9
|
||||
246 | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
18.8 5.25 16.0 16.0 16.0 16.0 18.8 5.25 86.0 4.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
243 | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17| [243]
|
||||
242 | 4.0 | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17| [242]
|
||||
241 | 4.0 | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17| [241]
|
||||
208 | 1.0 | addq $8, %r15 #59.9| [208]
|
||||
205 | 1.0 | vpaddd %ymm15, %ymm4, %ymm4 #59.9| [205]
|
||||
|
||||
@@ -1,77 +0,0 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: lammps-icc-avx512.s
|
||||
Architecture: ICX
|
||||
Timestamp: 2023-02-10 16:29:42
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD |
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
200 | | | | | | | | | | || | | # pointer_increment=64 1303ca335e79351a96cfc07b8b9ec9d4
|
||||
201 | | | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
202 | | | | | | | | | | || | | ..B1.16: # Preds ..B1.16 ..B1.15
|
||||
203 | | | | | | | | | | || | | # Execution count [2.50e+01]
|
||||
204 | | | | | | 1.000 | | | | || | | vpcmpgtd %ymm4, %ymm3, %k5 #59.9
|
||||
205 | 0.00 | 1.00 | | | | 0.000 | | | | || | | vpaddd %ymm15, %ymm4, %ymm4 #59.9
|
||||
206 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovdqu32 (%r10,%r15,4), %ymm17{%k5}{z} #60.21
|
||||
207 | 0.00 | 1.00 | | | | 0.000 | | | | || 1.0 | | vpaddd %ymm17, %ymm17, %ymm18 #61.36
|
||||
208 | 0.00 | 0.00 | | | | 0.000 | 1.00 | | | || | | addq $8, %r15 #59.9
|
||||
209 | 0.00 | 1.00 | | | | 0.000 | | | | || 1.0 | | vpaddd %ymm18, %ymm17, %ymm19 #61.36
|
||||
210 | 1.00 | | | | | | | | | || | | kmovw %k5, %k2 #61.36
|
||||
211 | 1.00 | | | | | | | | | || | | kmovw %k5, %k3 #61.36
|
||||
212 | 1.00 | | | | | | | | | || | | kmovw %k5, %k1 #61.36
|
||||
213 | 0.50 | | | | | 0.500 | | | | || | | vpxord %zmm21, %zmm21, %zmm21 #61.36
|
||||
214 | 0.24 | | | | | 0.760 | | | | || | | vpxord %zmm20, %zmm20, %zmm20 #61.36
|
||||
215 | 0.50 | | | | | 0.500 | | | | || | | vpxord %zmm22, %zmm22, %zmm22 #61.36
|
||||
216 | 0.67 | 2.33 | 7.00 7.00 | 7.00 7.00 | | 0.000 | | | | || 24.0 | | vgatherdpd 8(%rbx,%ymm19,8), %zmm21{%k2} #61.36
|
||||
217 | 0.67 | 2.33 | 7.00 7.00 | 7.00 7.00 | | 0.000 | | | | || | | vgatherdpd (%rbx,%ymm19,8), %zmm20{%k3} #61.36
|
||||
218 | 0.67 | 2.33 | 7.00 7.00 | 7.00 7.00 | | 0.000 | | | | || | | vgatherdpd 16(%rbx,%ymm19,8), %zmm22{%k1} #61.36
|
||||
219 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vsubpd %zmm21, %zmm1, %zmm18 #62.36
|
||||
220 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm20, %zmm2, %zmm17 #61.36
|
||||
221 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm22, %zmm0, %zmm19 #63.36
|
||||
222 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm18, %zmm18, %zmm31 #64.49
|
||||
223 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm17, %zmm17, %zmm31 #64.49
|
||||
224 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm19, %zmm19, %zmm31 #64.63
|
||||
225 | 2.50 | | | | | 0.500 | | | | || 6.0 | | vrcp14pd %zmm31, %zmm30 #75.39
|
||||
226 | | | | | | 1.000 | | | | || | | vcmppd $1, %zmm14, %zmm31, %k6{%k5} #74.22
|
||||
227 | | | | | | 1.000 | | | | || | | vfpclasspd $30, %zmm30, %k0 #75.39
|
||||
228 | 0.50 | | | | | 0.500 | | | | || | | vmovaps %zmm31, %zmm23 #75.39
|
||||
229 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.500 | | | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23 #75.39
|
||||
230 | 1.00 | | | | | | | | | || | | knotw %k0, %k4 #75.39
|
||||
231 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm23, %zmm23, %zmm24 #75.39
|
||||
232 | 0.50 | | | | | 0.500 | | | | || | | vfmadd213pd %zmm30, %zmm23, %zmm30{%k4} #75.39
|
||||
233 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd213pd %zmm30, %zmm24, %zmm30{%k4} #75.39
|
||||
234 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm13, %zmm30, %zmm25 #76.38
|
||||
235 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm12, %zmm30, %zmm27 #77.55
|
||||
236 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm25, %zmm30, %zmm28 #76.44
|
||||
237 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm28, %zmm30, %zmm26 #76.50
|
||||
238 | 0.50 | | | | | 0.500 | | | | || | | vfmsub213pd %zmm5, %zmm28, %zmm30 #77.55
|
||||
239 | 0.25 | | | | | 0.750 | | | | || 4.0 | | vmulpd %zmm27, %zmm26, %zmm29 #77.64
|
||||
240 | 0.00 | | | | | 1.000 | | | | || 4.0 | | vmulpd %zmm30, %zmm29, %zmm23 #77.70
|
||||
241 | 0.00 | | | | | 1.000 | | | | || 4.0 | | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17
|
||||
242 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17
|
||||
243 | 0.00 | | | | | 1.000 | | | | || | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17
|
||||
244 | 0.00 | 0.00 | | | | -0.01 | 1.00 | | | || | | cmpq %r14, %r15 #59.9
|
||||
245 | | | | | | | | | | || | | * jb ..B1.16 # Prob 82% #59.9
|
||||
246 | | | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
18.0 9.98 22.0 22.0 22.0 22.0 18.00 2.00 89 4.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
243 | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17| [243]
|
||||
242 | 4.0 | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17| [242]
|
||||
241 | 4.0 | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17| [241]
|
||||
208 | 1.0 | addq $8, %r15 #59.9| [208]
|
||||
205 | 1.0 | vpaddd %ymm15, %ymm4, %ymm4 #59.9| [205]
|
||||
|
||||
@@ -1,197 +0,0 @@
|
||||
|
||||
[0] Code Region
|
||||
|
||||
Iterations: 100
|
||||
Instructions: 7000
|
||||
Total Cycles: 3866
|
||||
Total uOps: 7900
|
||||
|
||||
Dispatch Width: 6
|
||||
uOps Per Cycle: 2.04
|
||||
IPC: 1.81
|
||||
Block RThroughput: 21.5
|
||||
|
||||
|
||||
Instruction Info:
|
||||
[1]: #uOps
|
||||
[2]: Latency
|
||||
[3]: RThroughput
|
||||
[4]: MayLoad
|
||||
[5]: MayStore
|
||||
[6]: HasSideEffects (U)
|
||||
|
||||
[1] [2] [3] [4] [5] [6] Instructions:
|
||||
1 8 0.50 * vpbroadcastd .LCPI0_1(%rip), %xmm1
|
||||
1 10 0.50 * vpmulld (%r11,%rbp,4), %xmm1, %xmm11
|
||||
2 4 1.50 vpmovsxdq %xmm11, %ymm1
|
||||
1 1 0.50 vpsllq $3, %ymm1, %ymm1
|
||||
1 1 0.25 vpaddq %ymm1, %ymm3, %ymm1
|
||||
1 1 1.00 vmovq %xmm1, %r14
|
||||
2 1 1.00 vpextrq $1, %xmm1, %r9
|
||||
1 4 1.00 vextracti128 $1, %ymm1, %xmm1
|
||||
1 8 0.50 * vmovsd (%r14), %xmm2
|
||||
1 8 0.50 * vpsubd .LCPI0_5, %xmm11, %xmm6
|
||||
2 4 1.50 vpmovsxdq %xmm6, %ymm6
|
||||
1 1 0.50 vpsllq $3, %ymm6, %ymm6
|
||||
1 1 1.00 vmovq %xmm1, %rdi
|
||||
1 1 0.25 vpaddq %ymm6, %ymm3, %ymm6
|
||||
1 1 1.00 vmovq %xmm6, %rcx
|
||||
2 1 1.00 vpextrq $1, %xmm1, %rbx
|
||||
2 1 1.00 vpextrq $1, %xmm6, %rax
|
||||
1 4 1.00 vextracti128 $1, %ymm6, %xmm1
|
||||
1 8 0.50 * vmovsd (%rdi), %xmm6
|
||||
1 1 1.00 vmovq %xmm1, %rdi
|
||||
2 1 1.00 vpextrq $1, %xmm1, %rsi
|
||||
1 8 0.50 * vmovsd (%rdi), %xmm1
|
||||
1 8 0.50 * vmovsd (%rcx), %xmm7
|
||||
1 8 0.50 * vpbroadcastd .LCPI0_2(%rip), %xmm12
|
||||
1 8 0.50 * vmovhpd (%r9), %xmm2, %xmm2
|
||||
1 1 0.25 vpaddd %xmm12, %xmm11, %xmm4
|
||||
2 4 1.50 vpmovsxdq %xmm4, %ymm4
|
||||
1 8 0.50 * vmovhpd (%rax), %xmm7, %xmm7
|
||||
1 1 0.50 vpsllq $3, %ymm4, %ymm4
|
||||
1 1 0.25 vpaddq %ymm4, %ymm3, %ymm4
|
||||
1 8 0.50 * vmovhpd (%rbx), %xmm6, %xmm6
|
||||
2 1 1.00 vpextrq $1, %xmm4, %rax
|
||||
1 8 0.50 * vmovhpd (%rsi), %xmm1, %xmm1
|
||||
1 1 1.00 vmovq %xmm4, %rcx
|
||||
1 4 1.00 vextracti128 $1, %ymm4, %xmm4
|
||||
1 1 1.00 vmovq %xmm4, %rsi
|
||||
1 2 1.00 vinsertf128 $1, %xmm6, %ymm2, %ymm2
|
||||
2 1 1.00 vpextrq $1, %xmm4, %rdi
|
||||
1 8 0.50 * vmovsd (%rsi), %xmm4
|
||||
1 3 0.50 vsubpd %ymm2, %ymm14, %ymm2
|
||||
1 8 0.50 * vmovhpd (%rdi), %xmm4, %xmm4
|
||||
1 8 0.50 * vmovsd (%rcx), %xmm6
|
||||
1 2 1.00 vinsertf128 $1, %xmm1, %ymm7, %ymm1
|
||||
1 8 0.50 * vmovhpd (%rax), %xmm6, %xmm6
|
||||
1 2 1.00 vinsertf128 $1, %xmm4, %ymm6, %ymm4
|
||||
1 3 0.50 vsubpd %ymm1, %ymm5, %ymm1
|
||||
1 3 0.50 vsubpd %ymm4, %ymm10, %ymm4
|
||||
1 3 0.50 vmulpd %ymm2, %ymm2, %ymm6
|
||||
1 4 1.00 vfmadd231pd %ymm1, %ymm1, %ymm6
|
||||
1 4 1.00 vfmadd231pd %ymm4, %ymm4, %ymm6
|
||||
1 8 0.50 * vbroadcastsd .LCPI0_3(%rip), %ymm7
|
||||
1 13 5.00 vdivpd %ymm6, %ymm7, %ymm7
|
||||
1 3 0.50 vmulpd %ymm7, %ymm7, %ymm11
|
||||
1 3 0.50 vmulpd %ymm9, %ymm11, %ymm11
|
||||
1 8 0.50 * vbroadcastsd .LCPI0_4(%rip), %ymm12
|
||||
1 3 0.50 vmulpd %ymm7, %ymm11, %ymm11
|
||||
1 3 0.50 vaddpd %ymm12, %ymm11, %ymm12
|
||||
1 10 0.50 * vmulpd 128(%rsp), %ymm7, %ymm7
|
||||
1 3 0.50 vmulpd %ymm7, %ymm11, %ymm7
|
||||
1 3 0.50 vmulpd %ymm7, %ymm12, %ymm7
|
||||
1 1 0.50 vcmpltpd %ymm8, %ymm6, %ymm6
|
||||
1 4 1.00 vfmadd213pd %ymm0, %ymm7, %ymm2
|
||||
1 1 0.50 vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
|
||||
1 4 1.00 vfmadd213pd %ymm15, %ymm7, %ymm1
|
||||
1 4 1.00 vfmadd213pd %ymm13, %ymm7, %ymm4
|
||||
1 1 0.50 vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
|
||||
1 1 0.50 vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
|
||||
1 1 0.25 addq $4, %rbp
|
||||
1 1 0.25 cmpq %rdx, %rbp
|
||||
1 1 0.50 jb .LBB0_9
|
||||
|
||||
|
||||
Resources:
|
||||
[0] - Zn3AGU0
|
||||
[1] - Zn3AGU1
|
||||
[2] - Zn3AGU2
|
||||
[3] - Zn3ALU0
|
||||
[4] - Zn3ALU1
|
||||
[5] - Zn3ALU2
|
||||
[6] - Zn3ALU3
|
||||
[7] - Zn3BRU1
|
||||
[8] - Zn3FPP0
|
||||
[9] - Zn3FPP1
|
||||
[10] - Zn3FPP2
|
||||
[11] - Zn3FPP3
|
||||
[12.0] - Zn3FPP45
|
||||
[12.1] - Zn3FPP45
|
||||
[13] - Zn3FPSt
|
||||
[14.0] - Zn3LSU
|
||||
[14.1] - Zn3LSU
|
||||
[14.2] - Zn3LSU
|
||||
[15.0] - Zn3Load
|
||||
[15.1] - Zn3Load
|
||||
[15.2] - Zn3Load
|
||||
[16.0] - Zn3Store
|
||||
[16.1] - Zn3Store
|
||||
|
||||
|
||||
Resource pressure per iteration:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
|
||||
- - - 0.60 0.60 0.60 0.60 0.60 16.84 23.53 16.30 7.33 21.50 21.50 - 6.33 6.33 6.34 6.33 6.33 6.34 - -
|
||||
|
||||
Resource pressure by instruction:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
|
||||
- - - - - - - - - 0.03 0.97 - 0.51 0.49 - 0.34 0.33 0.33 0.34 0.33 0.33 - - vpbroadcastd .LCPI0_1(%rip), %xmm1
|
||||
- - - - - - - - 0.65 - - 0.35 0.34 0.66 - 0.49 0.05 0.46 0.49 0.05 0.46 - - vpmulld (%r11,%rbp,4), %xmm1, %xmm11
|
||||
- - - - - - - - - 0.06 2.94 - - - - - - - - - - - - vpmovsxdq %xmm11, %ymm1
|
||||
- - - - - - - - - 0.65 0.35 - - - - - - - - - - - - vpsllq $3, %ymm1, %ymm1
|
||||
- - - - - - - - - - - 1.00 - - - - - - - - - - - vpaddq %ymm1, %ymm3, %ymm1
|
||||
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vmovq %xmm1, %r14
|
||||
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm1, %r9
|
||||
- - - - - - - - 1.00 - - - - - - - - - - - - - - vextracti128 $1, %ymm1, %xmm1
|
||||
- - - - - - - - - - - - 0.50 0.50 - 0.48 0.35 0.17 0.48 0.35 0.17 - - vmovsd (%r14), %xmm2
|
||||
- - - - - - - - 0.01 0.18 0.17 0.64 0.47 0.53 - 0.34 0.33 0.33 0.34 0.33 0.33 - - vpsubd .LCPI0_5, %xmm11, %xmm6
|
||||
- - - - - - - - - 1.92 1.08 - - - - - - - - - - - - vpmovsxdq %xmm6, %ymm6
|
||||
- - - - - - - - - 0.32 0.68 - - - - - - - - - - - - vpsllq $3, %ymm6, %ymm6
|
||||
- - - - - - - - - - - - 1.30 0.70 - - - - - - - - - vmovq %xmm1, %rdi
|
||||
- - - - - - - - - - 0.32 0.68 - - - - - - - - - - - vpaddq %ymm6, %ymm3, %ymm6
|
||||
- - - - - - - - - - - - 1.34 0.66 - - - - - - - - - vmovq %xmm6, %rcx
|
||||
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm1, %rbx
|
||||
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm6, %rax
|
||||
- - - - - - - - 1.00 - - - - - - - - - - - - - - vextracti128 $1, %ymm6, %xmm1
|
||||
- - - - - - - - - - - - 0.50 0.50 - 0.03 0.65 0.32 0.03 0.65 0.32 - - vmovsd (%rdi), %xmm6
|
||||
- - - - - - - - - - - - 0.36 1.64 - - - - - - - - - vmovq %xmm1, %rdi
|
||||
- - - - - - - - - - - - 1.64 0.36 - - - - - - - - - vpextrq $1, %xmm1, %rsi
|
||||
- - - - - - - - - - - - 0.32 0.68 - 0.51 0.33 0.16 0.51 0.33 0.16 - - vmovsd (%rdi), %xmm1
|
||||
- - - - - - - - - - - - 0.68 0.32 - 0.49 0.01 0.50 0.49 0.01 0.50 - - vmovsd (%rcx), %xmm7
|
||||
- - - - - - - - - 0.48 0.52 - 0.67 0.33 - 0.17 0.62 0.21 0.17 0.62 0.21 - - vpbroadcastd .LCPI0_2(%rip), %xmm12
|
||||
- - - - - - - - - 0.01 0.99 - 0.17 0.83 - 0.02 0.64 0.34 0.02 0.64 0.34 - - vmovhpd (%r9), %xmm2, %xmm2
|
||||
- - - - - - - - 0.01 - - 0.99 - - - - - - - - - - - vpaddd %xmm12, %xmm11, %xmm4
|
||||
- - - - - - - - - 0.57 2.43 - - - - - - - - - - - - vpmovsxdq %xmm4, %ymm4
|
||||
- - - - - - - - - 0.34 0.66 - 0.82 0.18 - 0.49 0.35 0.16 0.49 0.35 0.16 - - vmovhpd (%rax), %xmm7, %xmm7
|
||||
- - - - - - - - - 0.34 0.66 - - - - - - - - - - - - vpsllq $3, %ymm4, %ymm4
|
||||
- - - - - - - - - - 0.01 0.99 - - - - - - - - - - - vpaddq %ymm4, %ymm3, %ymm4
|
||||
- - - - - - - - - 0.51 0.49 - 0.49 0.51 - 0.35 0.16 0.49 0.35 0.16 0.49 - - vmovhpd (%rbx), %xmm6, %xmm6
|
||||
- - - - - - - - - - - - 1.04 0.96 - - - - - - - - - vpextrq $1, %xmm4, %rax
|
||||
- - - - - - - - - 0.49 0.51 - 0.17 0.83 - 0.16 0.49 0.35 0.16 0.49 0.35 - - vmovhpd (%rsi), %xmm1, %xmm1
|
||||
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vmovq %xmm4, %rcx
|
||||
- - - - - - - - 1.00 - - - - - - - - - - - - - - vextracti128 $1, %ymm4, %xmm4
|
||||
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vmovq %xmm4, %rsi
|
||||
- - - - - - - - - 1.00 - - - - - - - - - - - - - vinsertf128 $1, %xmm6, %ymm2, %ymm2
|
||||
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm4, %rdi
|
||||
- - - - - - - - - - - - 0.50 0.50 - 0.49 0.35 0.16 0.49 0.35 0.16 - - vmovsd (%rsi), %xmm4
|
||||
- - - - - - - - - - 0.31 0.69 - - - - - - - - - - - vsubpd %ymm2, %ymm14, %ymm2
|
||||
- - - - - - - - - 0.49 0.51 - 0.48 0.52 - 0.35 0.16 0.49 0.35 0.16 0.49 - - vmovhpd (%rdi), %xmm4, %xmm4
|
||||
- - - - - - - - - - - - 0.52 0.48 - 0.16 0.49 0.35 0.16 0.49 0.35 - - vmovsd (%rcx), %xmm6
|
||||
- - - - - - - - - 1.00 - - - - - - - - - - - - - vinsertf128 $1, %xmm1, %ymm7, %ymm1
|
||||
- - - - - - - - - 0.35 0.65 - 0.50 0.50 - 0.47 0.35 0.18 0.47 0.35 0.18 - - vmovhpd (%rax), %xmm6, %xmm6
|
||||
- - - - - - - - - 1.00 - - - - - - - - - - - - - vinsertf128 $1, %xmm4, %ymm6, %ymm4
|
||||
- - - - - - - - - - 0.33 0.67 - - - - - - - - - - - vsubpd %ymm1, %ymm5, %ymm1
|
||||
- - - - - - - - - - 0.51 0.49 - - - - - - - - - - - vsubpd %ymm4, %ymm10, %ymm4
|
||||
- - - - - - - - 0.52 0.48 - - - - - - - - - - - - - vmulpd %ymm2, %ymm2, %ymm6
|
||||
- - - - - - - - 1.00 1.00 - - - - - - - - - - - - - vfmadd231pd %ymm1, %ymm1, %ymm6
|
||||
- - - - - - - - 1.34 0.66 - - - - - - - - - - - - - vfmadd231pd %ymm4, %ymm4, %ymm6
|
||||
- - - - - - - - - 0.66 0.34 - 0.51 0.49 - 0.19 0.32 0.49 0.19 0.32 0.49 - - vbroadcastsd .LCPI0_3(%rip), %ymm7
|
||||
- - - - - - - - - 5.00 - - - - - - - - - - - - - vdivpd %ymm6, %ymm7, %ymm7
|
||||
- - - - - - - - 0.50 0.50 - - - - - - - - - - - - - vmulpd %ymm7, %ymm7, %ymm11
|
||||
- - - - - - - - 0.99 0.01 - - - - - - - - - - - - - vmulpd %ymm9, %ymm11, %ymm11
|
||||
- - - - - - - - - 0.30 0.70 - 0.49 0.51 - 0.34 0.33 0.33 0.34 0.33 0.33 - - vbroadcastsd .LCPI0_4(%rip), %ymm12
|
||||
- - - - - - - - 0.82 0.18 - - - - - - - - - - - - - vmulpd %ymm7, %ymm11, %ymm11
|
||||
- - - - - - - - - - 0.17 0.83 - - - - - - - - - - - vaddpd %ymm12, %ymm11, %ymm12
|
||||
- - - - - - - - 0.01 0.99 - - 0.18 0.82 - 0.46 0.02 0.52 0.46 0.02 0.52 - - vmulpd 128(%rsp), %ymm7, %ymm7
|
||||
- - - - - - - - 0.99 0.01 - - - - - - - - - - - - - vmulpd %ymm7, %ymm11, %ymm7
|
||||
- - - - - - - - 0.67 0.33 - - - - - - - - - - - - - vmulpd %ymm7, %ymm12, %ymm7
|
||||
- - - - - - - - 1.00 - - - - - - - - - - - - - - vcmpltpd %ymm8, %ymm6, %ymm6
|
||||
- - - - - - - - 1.34 0.66 - - - - - - - - - - - - - vfmadd213pd %ymm0, %ymm7, %ymm2
|
||||
- - - - - - - - 0.66 0.34 - - - - - - - - - - - - - vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
|
||||
- - - - - - - - 0.66 1.34 - - - - - - - - - - - - - vfmadd213pd %ymm15, %ymm7, %ymm1
|
||||
- - - - - - - - 1.34 0.66 - - - - - - - - - - - - - vfmadd213pd %ymm13, %ymm7, %ymm4
|
||||
- - - - - - - - 0.34 0.66 - - - - - - - - - - - - - vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
|
||||
- - - - - - - - 0.99 0.01 - - - - - - - - - - - - - vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
|
||||
- - - - 0.40 0.20 0.40 - - - - - - - - - - - - - - - - addq $4, %rbp
|
||||
- - - 0.20 0.20 0.40 0.20 - - - - - - - - - - - - - - - - cmpq %rdx, %rbp
|
||||
- - - 0.40 - - - 0.60 - - - - - - - - - - - - - - - jb .LBB0_9
|
||||
@@ -1,108 +0,0 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: lammps-icx-avx2zen.s
|
||||
Architecture: ZEN3
|
||||
Timestamp: 2023-02-10 16:31:30
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 | 1 | 2 | 3 | DV0 | DV1 | 4 | 5 | 6 | 7 | 8 - 8DV | 9 | 10 | 11 | 12 | 13 || CP | LCD |
|
||||
--------------------------------------------------------------------------------------------------------------------------------------------
|
||||
175 | | | | | | | | | | | | | | | | || | | # pointer_increment=16 e95035fc9e97f63299dd5188a0872bfc
|
||||
176 | | | | | | | | | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
177 | | | | | | | | | | | | | | | | || | | .LBB0_9: #
|
||||
178 | | | | | | | | | | | | | | | | || | | # Parent Loop BB0_6 Depth=1
|
||||
179 | | | | | | | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2
|
||||
180 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || 1.0 | | vpbroadcastd .LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
|
||||
181 | 0.00 | | | 1.00 | | | | | | | | | | 0.50 | 0.50 | || 3.0 | | vpmulld (%r11,%rbp,4), %xmm1, %xmm11
|
||||
182 | 0.00 | 0.75 | 0.38 | 0.87 | | | | | | | | | | | | || 4.0 | | vpmovsxdq %xmm11, %ymm1
|
||||
183 | | 0.00 | 1.00 | | | | | | | | | | | | | || 1.0 | | vpsllq $3, %ymm1, %ymm1
|
||||
184 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || 1.0 | | vpaddq %ymm1, %ymm3, %ymm1
|
||||
185 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vmovq %xmm1, %r14
|
||||
186 | 0.12 | 1.88 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm1, %r9
|
||||
187 | | 1.00 | | | | | | | | | | | | | | || 3.0 | | vextracti128 $1, %ymm1, %xmm1
|
||||
188 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%r14), %xmm2 # xmm2 = mem[0],zero
|
||||
189 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vpsubd .LCPI0_5, %xmm11, %xmm6
|
||||
190 | 0.00 | 0.75 | 0.38 | 0.87 | | | | | | | | | | | | || | | vpmovsxdq %xmm6, %ymm6
|
||||
191 | | 0.00 | 1.00 | | | | | | | | | | | | | || | | vpsllq $3, %ymm6, %ymm6
|
||||
192 | 0.00 | 0.00 | 0.49 | 0.51 | | | | | | | | | | | | || | | vmovq %xmm1, %rdi
|
||||
193 | 0.00 | 0.00 | 0.51 | 0.49 | | | | | | | | | | | | || | | vpaddq %ymm6, %ymm3, %ymm6
|
||||
194 | 0.00 | 0.00 | 0.49 | 0.51 | | | | | | | | | | | | || | | vmovq %xmm6, %rcx
|
||||
195 | 0.13 | 1.87 | | | | | | | | | | | | | | || 6.0 | | vpextrq $1, %xmm1, %rbx
|
||||
196 | 0.00 | 2.00 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm6, %rax
|
||||
197 | | 1.00 | | | | | | | | | | | | | | || | | vextracti128 $1, %ymm6, %xmm1
|
||||
198 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rdi), %xmm6 # xmm6 = mem[0],zero
|
||||
199 | 0.00 | 0.00 | 0.00 | 1.00 | | | | | | | | | | | | || | | vmovq %xmm1, %rdi
|
||||
200 | 0.00 | 2.00 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm1, %rsi
|
||||
201 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
|
||||
202 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rcx), %xmm7 # xmm7 = mem[0],zero
|
||||
203 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vpbroadcastd .LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
|
||||
204 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%r9), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0]
|
||||
205 | 0.00 | 0.00 | 0.63 | 0.37 | | | | | | | | | | | | || | | vpaddd %xmm12, %xmm11, %xmm4
|
||||
206 | 0.00 | 0.75 | 0.00 | 1.25 | | | | | | | | | | | | || | | vpmovsxdq %xmm4, %ymm4
|
||||
207 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rax), %xmm7, %xmm7 # xmm7 = xmm7[0],mem[0]
|
||||
208 | | 0.00 | 1.00 | | | | | | | | | | | | | || | | vpsllq $3, %ymm4, %ymm4
|
||||
209 | 0.00 | 0.00 | 0.00 | 1.00 | | | | | | | | | | | | || | | vpaddq %ymm4, %ymm3, %ymm4
|
||||
210 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || 5.0 | | vmovhpd (%rbx), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||
211 | 0.75 | 1.25 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm4, %rax
|
||||
212 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rsi), %xmm1, %xmm1 # xmm1 = xmm1[0],mem[0]
|
||||
213 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vmovq %xmm4, %rcx
|
||||
214 | | 1.00 | | | | | | | | | | | | | | || | | vextracti128 $1, %ymm4, %xmm4
|
||||
215 | 0.00 | 0.00 | 0.00 | 1.00 | | | | | | | | | | | | || | | vmovq %xmm4, %rsi
|
||||
216 | | 1.00 | | | | | | | | | | | | | | || 1.0 | | vinsertf128 $1, %xmm6, %ymm2, %ymm2
|
||||
217 | 1.00 | 1.00 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm4, %rdi
|
||||
218 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rsi), %xmm4 # xmm4 = mem[0],zero
|
||||
219 | | | 0.00 | 1.00 | | | | | | | | | | | | || 3.0 | | vsubpd %ymm2, %ymm14, %ymm2
|
||||
220 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rdi), %xmm4, %xmm4 # xmm4 = xmm4[0],mem[0]
|
||||
221 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rcx), %xmm6 # xmm6 = mem[0],zero
|
||||
222 | | 1.00 | | | | | | | | | | | | | | || | | vinsertf128 $1, %xmm1, %ymm7, %ymm1
|
||||
223 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rax), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||
224 | | 1.00 | | | | | | | | | | | | | | || | | vinsertf128 $1, %xmm4, %ymm6, %ymm4
|
||||
225 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vsubpd %ymm1, %ymm5, %ymm1
|
||||
226 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vsubpd %ymm4, %ymm10, %ymm4
|
||||
227 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm2, %ymm2, %ymm6
|
||||
228 | 1.00 | 0.00 | | | | | | | | | | | | | | || 4.0 | | vfmadd231pd %ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
|
||||
229 | 1.00 | 0.00 | | | | | | | | | | | | | | || 4.0 | | vfmadd231pd %ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
|
||||
230 | 1.00 | | | | | | | | | | | | | 0.50 | 0.50 | || | | vbroadcastsd .LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
|
||||
231 | | | | | 4.50 | 4.50 | | | | | | | | | | || 13.0 | | vdivpd %ymm6, %ymm7, %ymm7
|
||||
232 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm7, %ymm11
|
||||
233 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm9, %ymm11, %ymm11
|
||||
234 | 1.00 | | | | | | | | | | | | | 0.50 | 0.50 | || | | vbroadcastsd .LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
|
||||
235 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm11, %ymm11
|
||||
236 | | | 0.00 | 1.00 | | | | | | | | | | | | || 3.0 | | vaddpd %ymm12, %ymm11, %ymm12
|
||||
237 | 1.00 | 0.00 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
|
||||
238 | 1.00 | 0.00 | | | | | | | | | | | | | | || | | vmulpd %ymm7, %ymm11, %ymm7
|
||||
239 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm12, %ymm7
|
||||
240 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vcmpltpd %ymm8, %ymm6, %ymm6
|
||||
241 | 1.00 | 0.00 | | | | | | | | | | | | | | || 4.0 | | vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
|
||||
242 | 1.00 | 0.00 | | | | | | | | | | | | | | || 1.0 | | vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
|
||||
243 | 1.00 | 0.00 | | | | | | | | | | | | | | || | | vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
|
||||
244 | 1.00 | 0.00 | | | | | | | | | | | | | | || | 4.0 | vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
|
||||
245 | 1.00 | 0.00 | | | | | | | | | | | | | | || | | vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
|
||||
246 | 0.75 | 0.25 | | | | | | | | | | | | | | || | 1.0 | vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
|
||||
247 | | | | | | | | | 0.25 | 0.25 | 0.25 | 0.25 | | | | || | | addq $4, %rbp
|
||||
248 | | | | | | | | | 0.25 | 0.25 | 0.25 | 0.25 | | | | || | | cmpq %rdx, %rbp
|
||||
249 | | | | | | | | | 0.00 | | | | 1.00 | | | || | | jb .LBB0_9
|
||||
250 | | | | | | | | | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
18.8 18.5 15.9 15.9 4.50 4.50 0.50 0.50 0.50 0.50 9.00 9.00 72 5.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
244 | 5.0 | vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13| [244, 246]
|
||||
243 | 5.0 | vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15| [243, 245]
|
||||
241 | 5.0 | vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0| [241, 242]
|
||||
247 | 1.0 | addq $4, %rbp | [247]
|
||||
246 | 1.0 | vblendvpd %ymm6, %ymm4, %ymm13, %ymm13| [246]
|
||||
245 | 1.0 | vblendvpd %ymm6, %ymm1, %ymm15, %ymm15| [245]
|
||||
242 | 1.0 | vblendvpd %ymm6, %ymm2, %ymm0, %ymm0| [242]
|
||||
|
||||
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@@ -1,640 +0,0 @@
|
||||
.text
|
||||
.file "force_lj.c"
|
||||
.section .rodata.cst8,"aM",@progbits,8
|
||||
.p2align 3 # -- Begin function computeForceLJFullNeigh_plain_c
|
||||
.LCPI0_0:
|
||||
.quad 4631952216750555136 # 48
|
||||
.LCPI0_3:
|
||||
.quad 4607182418800017408 # 1
|
||||
.LCPI0_4:
|
||||
.quad -4620693217682128896 # -0.5
|
||||
.section .rodata.cst4,"aM",@progbits,4
|
||||
.p2align 2
|
||||
.LCPI0_1:
|
||||
.long 3 # 0x3
|
||||
.LCPI0_2:
|
||||
.long 2 # 0x2
|
||||
.section .rodata.cst16,"aM",@progbits,16
|
||||
.p2align 4
|
||||
.LCPI0_5:
|
||||
.zero 16,255
|
||||
.text
|
||||
.globl computeForceLJFullNeigh_plain_c
|
||||
.p2align 4, 0x90
|
||||
.type computeForceLJFullNeigh_plain_c,@function
|
||||
computeForceLJFullNeigh_plain_c: #
|
||||
.LcomputeForceLJFullNeigh_plain_c$local:
|
||||
.cfi_startproc
|
||||
# %bb.0: #
|
||||
pushq %rbp
|
||||
.cfi_def_cfa_offset 16
|
||||
pushq %r15
|
||||
.cfi_def_cfa_offset 24
|
||||
pushq %r14
|
||||
.cfi_def_cfa_offset 32
|
||||
pushq %r13
|
||||
.cfi_def_cfa_offset 40
|
||||
pushq %r12
|
||||
.cfi_def_cfa_offset 48
|
||||
pushq %rbx
|
||||
.cfi_def_cfa_offset 56
|
||||
subq $264, %rsp # imm = 0x108
|
||||
.cfi_def_cfa_offset 320
|
||||
.cfi_offset %rbx, -56
|
||||
.cfi_offset %r12, -48
|
||||
.cfi_offset %r13, -40
|
||||
.cfi_offset %r14, -32
|
||||
.cfi_offset %r15, -24
|
||||
.cfi_offset %rbp, -16
|
||||
movq %rcx, %rbx
|
||||
movq %rdx, %r15
|
||||
movq %rsi, %r12
|
||||
movl 4(%rsi), %r14d
|
||||
vmovsd 144(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, (%rsp) # 8-byte Spill
|
||||
vmovsd 40(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, 128(%rsp) # 8-byte Spill
|
||||
vmovq 56(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovdqa %xmm0, 80(%rsp) # 16-byte Spill
|
||||
testl %r14d, %r14d
|
||||
jle .LBB0_2
|
||||
# %bb.1: #
|
||||
movq 64(%r12), %rdi
|
||||
leaq (,%r14,8), %rax
|
||||
leaq (%rax,%rax,2), %rdx
|
||||
xorl %esi, %esi
|
||||
callq _intel_fast_memset
|
||||
.LBB0_2: #
|
||||
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
vmovq %xmm0, 32(%rsp) # 8-byte Folded Spill
|
||||
movl $.L.str, %edi
|
||||
callq likwid_markerStartRegion
|
||||
testl %r14d, %r14d
|
||||
jle .LBB0_19
|
||||
# %bb.3: #
|
||||
vmovsd (%rsp), %xmm0 # 8-byte Reload
|
||||
# xmm0 = mem[0],zero
|
||||
vmulsd %xmm0, %xmm0, %xmm13
|
||||
movq 16(%r15), %r11
|
||||
movq 24(%r15), %rsi
|
||||
movslq 8(%r15), %rdi
|
||||
movq 16(%r12), %r15
|
||||
movq 64(%r12), %r8
|
||||
vmovsd 128(%rsp), %xmm0 # 8-byte Reload
|
||||
# xmm0 = mem[0],zero
|
||||
vmulsd .LCPI0_0(%rip), %xmm0, %xmm15
|
||||
movq %rbx, 24(%rsp) # 8-byte Spill
|
||||
vmovdqu (%rbx), %xmm14
|
||||
decq %r14
|
||||
vmovq %r15, %xmm0
|
||||
vpbroadcastq %xmm0, %ymm3
|
||||
vbroadcastsd %xmm13, %ymm2
|
||||
vmovapd 80(%rsp), %xmm12 # 16-byte Reload
|
||||
vbroadcastsd %xmm12, %ymm8
|
||||
vbroadcastsd %xmm15, %ymm9
|
||||
shlq $2, %rdi
|
||||
xorl %r10d, %r10d
|
||||
movq %r14, 56(%rsp) # 8-byte Spill
|
||||
vmovapd %xmm13, 192(%rsp) # 16-byte Spill
|
||||
movq %rsi, 48(%rsp) # 8-byte Spill
|
||||
movq %rdi, 40(%rsp) # 8-byte Spill
|
||||
vmovapd %xmm15, 176(%rsp) # 16-byte Spill
|
||||
vmovupd %ymm2, 224(%rsp) # 32-byte Spill
|
||||
vmovupd %ymm9, 128(%rsp) # 32-byte Spill
|
||||
jmp .LBB0_6
|
||||
.p2align 4, 0x90
|
||||
.LBB0_17: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
movq %r13, %rdx
|
||||
.LBB0_5: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
vaddsd (%r8,%r12,8), %xmm10, %xmm0
|
||||
vmovsd %xmm0, (%r8,%r12,8)
|
||||
vaddsd (%r8,%rbx,8), %xmm11, %xmm0
|
||||
vmovsd %xmm0, (%r8,%rbx,8)
|
||||
vaddsd (%r8,%rbp,8), %xmm5, %xmm0
|
||||
vmovsd %xmm0, (%r8,%rbp,8)
|
||||
leal 3(%r13), %eax
|
||||
addl $6, %r13d
|
||||
testl %eax, %eax
|
||||
cmovnsl %eax, %r13d
|
||||
sarl $2, %r13d
|
||||
movslq %r13d, %rax
|
||||
vmovq %rax, %xmm0
|
||||
vmovq %rdx, %xmm1
|
||||
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
|
||||
vpaddq %xmm0, %xmm14, %xmm14
|
||||
addq %rdi, %r11
|
||||
cmpq %r14, %r10
|
||||
leaq 1(%r10), %r10
|
||||
je .LBB0_18
|
||||
.LBB0_6: #
|
||||
# =>This Loop Header: Depth=1
|
||||
# Child Loop BB0_9 Depth 2
|
||||
# Child Loop BB0_13 Depth 2
|
||||
movl (%rsi,%r10,4), %r13d
|
||||
leal (%r10,%r10,2), %r12d
|
||||
leal (%r10,%r10,2), %ebx
|
||||
incl %ebx
|
||||
leal (%r10,%r10,2), %ebp
|
||||
addl $2, %ebp
|
||||
testl %r13d, %r13d
|
||||
jle .LBB0_4
|
||||
# %bb.7: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
vmovsd (%r15,%r12,8), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd (%r15,%rbx,8), %xmm1 # xmm1 = mem[0],zero
|
||||
vmovsd (%r15,%rbp,8), %xmm2 # xmm2 = mem[0],zero
|
||||
movq %r13, %rdx
|
||||
movl $4294967292, %eax # imm = 0xFFFFFFFC
|
||||
andq %rax, %rdx
|
||||
vmovapd %xmm0, 112(%rsp) # 16-byte Spill
|
||||
vmovapd %xmm1, 96(%rsp) # 16-byte Spill
|
||||
vmovapd %xmm2, (%rsp) # 16-byte Spill
|
||||
je .LBB0_16
|
||||
# %bb.8: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
movq %rbp, 64(%rsp) # 8-byte Spill
|
||||
movq %rbx, 72(%rsp) # 8-byte Spill
|
||||
vmovdqa %xmm14, 208(%rsp) # 16-byte Spill
|
||||
vbroadcastsd %xmm0, %ymm14
|
||||
vbroadcastsd %xmm1, %ymm5
|
||||
vbroadcastsd %xmm2, %ymm10
|
||||
vxorpd %xmm0, %xmm0, %xmm0
|
||||
vxorpd %xmm15, %xmm15, %xmm15
|
||||
vxorpd %xmm13, %xmm13, %xmm13
|
||||
xorl %ebp, %ebp
|
||||
vmovapd %ymm8, %ymm9
|
||||
vmovupd 224(%rsp), %ymm8 # 32-byte Reload
|
||||
.p2align 4, 0x90
|
||||
movl $111, %ebx # OSACA START MARKER
|
||||
.byte 100 # OSACA START MARKER
|
||||
.byte 103 # OSACA START MARKER
|
||||
.byte 144 # OSACA START MARKER
|
||||
# pointer_increment=16 e95035fc9e97f63299dd5188a0872bfc
|
||||
# LLVM-MCA-BEGIN
|
||||
.LBB0_9: #
|
||||
# Parent Loop BB0_6 Depth=1
|
||||
# => This Inner Loop Header: Depth=2
|
||||
vpbroadcastd .LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
|
||||
vpmulld (%r11,%rbp,4), %xmm1, %xmm11
|
||||
vpmovsxdq %xmm11, %ymm1
|
||||
vpsllq $3, %ymm1, %ymm1
|
||||
vpaddq %ymm1, %ymm3, %ymm1
|
||||
vmovq %xmm1, %r14
|
||||
vpextrq $1, %xmm1, %r9
|
||||
vextracti128 $1, %ymm1, %xmm1
|
||||
vmovsd (%r14), %xmm2 # xmm2 = mem[0],zero
|
||||
vpsubd .LCPI0_5, %xmm11, %xmm6
|
||||
vpmovsxdq %xmm6, %ymm6
|
||||
vpsllq $3, %ymm6, %ymm6
|
||||
vmovq %xmm1, %rdi
|
||||
vpaddq %ymm6, %ymm3, %ymm6
|
||||
vmovq %xmm6, %rcx
|
||||
vpextrq $1, %xmm1, %rbx
|
||||
vpextrq $1, %xmm6, %rax
|
||||
vextracti128 $1, %ymm6, %xmm1
|
||||
vmovsd (%rdi), %xmm6 # xmm6 = mem[0],zero
|
||||
vmovq %xmm1, %rdi
|
||||
vpextrq $1, %xmm1, %rsi
|
||||
vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
|
||||
vmovsd (%rcx), %xmm7 # xmm7 = mem[0],zero
|
||||
vpbroadcastd .LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
|
||||
vmovhpd (%r9), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0]
|
||||
vpaddd %xmm12, %xmm11, %xmm4
|
||||
vpmovsxdq %xmm4, %ymm4
|
||||
vmovhpd (%rax), %xmm7, %xmm7 # xmm7 = xmm7[0],mem[0]
|
||||
vpsllq $3, %ymm4, %ymm4
|
||||
vpaddq %ymm4, %ymm3, %ymm4
|
||||
vmovhpd (%rbx), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||
vpextrq $1, %xmm4, %rax
|
||||
vmovhpd (%rsi), %xmm1, %xmm1 # xmm1 = xmm1[0],mem[0]
|
||||
vmovq %xmm4, %rcx
|
||||
vextracti128 $1, %ymm4, %xmm4
|
||||
vmovq %xmm4, %rsi
|
||||
vinsertf128 $1, %xmm6, %ymm2, %ymm2
|
||||
vpextrq $1, %xmm4, %rdi
|
||||
vmovsd (%rsi), %xmm4 # xmm4 = mem[0],zero
|
||||
vsubpd %ymm2, %ymm14, %ymm2
|
||||
vmovhpd (%rdi), %xmm4, %xmm4 # xmm4 = xmm4[0],mem[0]
|
||||
vmovsd (%rcx), %xmm6 # xmm6 = mem[0],zero
|
||||
vinsertf128 $1, %xmm1, %ymm7, %ymm1
|
||||
vmovhpd (%rax), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||
vinsertf128 $1, %xmm4, %ymm6, %ymm4
|
||||
vsubpd %ymm1, %ymm5, %ymm1
|
||||
vsubpd %ymm4, %ymm10, %ymm4
|
||||
vmulpd %ymm2, %ymm2, %ymm6
|
||||
vfmadd231pd %ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
|
||||
vfmadd231pd %ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
|
||||
vbroadcastsd .LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
|
||||
vdivpd %ymm6, %ymm7, %ymm7
|
||||
vmulpd %ymm7, %ymm7, %ymm11
|
||||
vmulpd %ymm9, %ymm11, %ymm11
|
||||
vbroadcastsd .LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
|
||||
vmulpd %ymm7, %ymm11, %ymm11
|
||||
vaddpd %ymm12, %ymm11, %ymm12
|
||||
vmulpd 128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
|
||||
vmulpd %ymm7, %ymm11, %ymm7
|
||||
vmulpd %ymm7, %ymm12, %ymm7
|
||||
vcmpltpd %ymm8, %ymm6, %ymm6
|
||||
vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
|
||||
vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
|
||||
vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
|
||||
vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
|
||||
vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
|
||||
vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
|
||||
addq $4, %rbp
|
||||
cmpq %rdx, %rbp
|
||||
jb .LBB0_9
|
||||
# LLVM-MCA-END
|
||||
movl $222, %ebx # OSACA END MARKER
|
||||
.byte 100 # OSACA END MARKER
|
||||
.byte 103 # OSACA END MARKER
|
||||
.byte 144 # OSACA END MARKER
|
||||
# %bb.10: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
|
||||
vaddsd %xmm1, %xmm0, %xmm1
|
||||
vextractf128 $1, %ymm0, %xmm0
|
||||
vaddsd %xmm0, %xmm1, %xmm1
|
||||
vpermilpd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0]
|
||||
vaddsd %xmm0, %xmm1, %xmm10
|
||||
vpermilpd $1, %xmm15, %xmm1 # xmm1 = xmm15[1,0]
|
||||
vaddsd %xmm1, %xmm15, %xmm1
|
||||
vextractf128 $1, %ymm15, %xmm2
|
||||
vaddsd %xmm2, %xmm1, %xmm1
|
||||
vpermilpd $1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
|
||||
vaddsd %xmm2, %xmm1, %xmm11
|
||||
vpermilpd $1, %xmm13, %xmm1 # xmm1 = xmm13[1,0]
|
||||
vaddsd %xmm1, %xmm13, %xmm1
|
||||
vextractf128 $1, %ymm13, %xmm2
|
||||
vaddsd %xmm2, %xmm1, %xmm1
|
||||
vpermilpd $1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
|
||||
vaddsd %xmm2, %xmm1, %xmm5
|
||||
movq 56(%rsp), %r14 # 8-byte Reload
|
||||
vmovapd 80(%rsp), %xmm12 # 16-byte Reload
|
||||
vmovapd 192(%rsp), %xmm13 # 16-byte Reload
|
||||
movq 48(%rsp), %rsi # 8-byte Reload
|
||||
movq 40(%rsp), %rdi # 8-byte Reload
|
||||
vmovdqa 208(%rsp), %xmm14 # 16-byte Reload
|
||||
vmovapd 176(%rsp), %xmm15 # 16-byte Reload
|
||||
vmovapd %ymm9, %ymm8
|
||||
movq 72(%rsp), %rbx # 8-byte Reload
|
||||
movq 64(%rsp), %rbp # 8-byte Reload
|
||||
vmovapd 112(%rsp), %xmm0 # 16-byte Reload
|
||||
cmpq %r13, %rdx
|
||||
jae .LBB0_17
|
||||
jmp .LBB0_11
|
||||
.p2align 4, 0x90
|
||||
.LBB0_4: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
movslq %r13d, %rdx
|
||||
vxorpd %xmm5, %xmm5, %xmm5
|
||||
vxorpd %xmm11, %xmm11, %xmm11
|
||||
vxorpd %xmm10, %xmm10, %xmm10
|
||||
jmp .LBB0_5
|
||||
.p2align 4, 0x90
|
||||
.LBB0_16: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
vxorpd %xmm10, %xmm10, %xmm10
|
||||
vxorpd %xmm11, %xmm11, %xmm11
|
||||
vxorpd %xmm5, %xmm5, %xmm5
|
||||
cmpq %r13, %rdx
|
||||
jae .LBB0_17
|
||||
.LBB0_11: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
vmovapd 96(%rsp), %xmm4 # 16-byte Reload
|
||||
jmp .LBB0_13
|
||||
.p2align 4, 0x90
|
||||
.LBB0_12: #
|
||||
# in Loop: Header=BB0_13 Depth=2
|
||||
incq %rdx
|
||||
cmpq %rdx, %r13
|
||||
je .LBB0_17
|
||||
.LBB0_13: #
|
||||
# Parent Loop BB0_6 Depth=1
|
||||
# => This Inner Loop Header: Depth=2
|
||||
movl (%r11,%rdx,4), %eax
|
||||
leal (%rax,%rax,2), %ecx
|
||||
movslq %ecx, %rcx
|
||||
vsubsd (%r15,%rcx,8), %xmm0, %xmm6
|
||||
leal (%rax,%rax,2), %ecx
|
||||
incl %ecx
|
||||
movslq %ecx, %rcx
|
||||
vsubsd (%r15,%rcx,8), %xmm4, %xmm2
|
||||
leal 2(%rax,%rax,2), %eax
|
||||
cltq
|
||||
vmovapd (%rsp), %xmm1 # 16-byte Reload
|
||||
vsubsd (%r15,%rax,8), %xmm1, %xmm1
|
||||
vmulsd %xmm6, %xmm6, %xmm7
|
||||
vfmadd231sd %xmm2, %xmm2, %xmm7 # xmm7 = (xmm2 * xmm2) + xmm7
|
||||
vfmadd231sd %xmm1, %xmm1, %xmm7 # xmm7 = (xmm1 * xmm1) + xmm7
|
||||
vucomisd %xmm13, %xmm7
|
||||
jae .LBB0_12
|
||||
# %bb.14: #
|
||||
# in Loop: Header=BB0_13 Depth=2
|
||||
vmovsd .LCPI0_3(%rip), %xmm0 # xmm0 = mem[0],zero
|
||||
vdivsd %xmm7, %xmm0, %xmm7
|
||||
vmulsd %xmm7, %xmm7, %xmm0
|
||||
vmulsd %xmm0, %xmm12, %xmm0
|
||||
vmulsd %xmm7, %xmm0, %xmm0
|
||||
vaddsd .LCPI0_4(%rip), %xmm0, %xmm4
|
||||
vmulsd %xmm7, %xmm15, %xmm7
|
||||
vmulsd %xmm0, %xmm7, %xmm0
|
||||
vmulsd %xmm4, %xmm0, %xmm0
|
||||
vmovapd 96(%rsp), %xmm4 # 16-byte Reload
|
||||
vfmadd231sd %xmm6, %xmm0, %xmm10 # xmm10 = (xmm0 * xmm6) + xmm10
|
||||
vfmadd231sd %xmm2, %xmm0, %xmm11 # xmm11 = (xmm0 * xmm2) + xmm11
|
||||
vfmadd231sd %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5
|
||||
vmovapd 112(%rsp), %xmm0 # 16-byte Reload
|
||||
jmp .LBB0_12
|
||||
.LBB0_18: #
|
||||
movq 24(%rsp), %rax # 8-byte Reload
|
||||
vmovdqu %xmm14, (%rax)
|
||||
.LBB0_19: #
|
||||
movl $.L.str, %edi
|
||||
vzeroupper
|
||||
callq likwid_markerStopRegion
|
||||
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
vsubsd 32(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload
|
||||
addq $264, %rsp # imm = 0x108
|
||||
.cfi_def_cfa_offset 56
|
||||
popq %rbx
|
||||
.cfi_def_cfa_offset 48
|
||||
popq %r12
|
||||
.cfi_def_cfa_offset 40
|
||||
popq %r13
|
||||
.cfi_def_cfa_offset 32
|
||||
popq %r14
|
||||
.cfi_def_cfa_offset 24
|
||||
popq %r15
|
||||
.cfi_def_cfa_offset 16
|
||||
popq %rbp
|
||||
.cfi_def_cfa_offset 8
|
||||
retq
|
||||
.Lfunc_end0:
|
||||
.size computeForceLJFullNeigh_plain_c, .Lfunc_end0-computeForceLJFullNeigh_plain_c
|
||||
.cfi_endproc
|
||||
# -- End function
|
||||
.section .rodata.cst8,"aM",@progbits,8
|
||||
.p2align 3 # -- Begin function computeForceLJHalfNeigh
|
||||
.LCPI1_0:
|
||||
.quad 4631952216750555136 # 48
|
||||
.LCPI1_1:
|
||||
.quad 4607182418800017408 # 1
|
||||
.LCPI1_2:
|
||||
.quad -4620693217682128896 # -0.5
|
||||
.text
|
||||
.globl computeForceLJHalfNeigh
|
||||
.p2align 4, 0x90
|
||||
.type computeForceLJHalfNeigh,@function
|
||||
computeForceLJHalfNeigh: #
|
||||
.LcomputeForceLJHalfNeigh$local:
|
||||
.cfi_startproc
|
||||
# %bb.0: #
|
||||
pushq %rbp
|
||||
.cfi_def_cfa_offset 16
|
||||
pushq %r15
|
||||
.cfi_def_cfa_offset 24
|
||||
pushq %r14
|
||||
.cfi_def_cfa_offset 32
|
||||
pushq %r13
|
||||
.cfi_def_cfa_offset 40
|
||||
pushq %r12
|
||||
.cfi_def_cfa_offset 48
|
||||
pushq %rbx
|
||||
.cfi_def_cfa_offset 56
|
||||
subq $40, %rsp
|
||||
.cfi_def_cfa_offset 96
|
||||
.cfi_offset %rbx, -56
|
||||
.cfi_offset %r12, -48
|
||||
.cfi_offset %r13, -40
|
||||
.cfi_offset %r14, -32
|
||||
.cfi_offset %r15, -24
|
||||
.cfi_offset %rbp, -16
|
||||
movq %rcx, 16(%rsp) # 8-byte Spill
|
||||
movq %rdx, %r15
|
||||
movq %rsi, %r12
|
||||
movl 4(%rsi), %r13d
|
||||
vmovsd 144(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, 8(%rsp) # 8-byte Spill
|
||||
vmovsd 40(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, (%rsp) # 8-byte Spill
|
||||
vmovsd 56(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, 32(%rsp) # 8-byte Spill
|
||||
testl %r13d, %r13d
|
||||
jle .LBB1_2
|
||||
# %bb.1: #
|
||||
movq 64(%r12), %rdi
|
||||
leaq (,%r13,8), %rax
|
||||
leaq (%rax,%rax,2), %rdx
|
||||
xorl %esi, %esi
|
||||
callq _intel_fast_memset
|
||||
.LBB1_2: #
|
||||
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
vmovsd %xmm0, 24(%rsp) # 8-byte Spill
|
||||
movl $.L.str.1, %edi
|
||||
callq likwid_markerStartRegion
|
||||
testl %r13d, %r13d
|
||||
jle .LBB1_8
|
||||
# %bb.3: #
|
||||
vmovsd 8(%rsp), %xmm0 # 8-byte Reload
|
||||
# xmm0 = mem[0],zero
|
||||
vmulsd %xmm0, %xmm0, %xmm12
|
||||
movq 16(%r15), %rax
|
||||
movq 24(%r15), %rcx
|
||||
movq %rcx, 8(%rsp) # 8-byte Spill
|
||||
movslq 8(%r15), %rdx
|
||||
movq 16(%r12), %rsi
|
||||
movq 64(%r12), %rdi
|
||||
vmovsd (%rsp), %xmm0 # 8-byte Reload
|
||||
# xmm0 = mem[0],zero
|
||||
vmulsd .LCPI1_0(%rip), %xmm0, %xmm11
|
||||
movq 16(%rsp), %rcx # 8-byte Reload
|
||||
vmovdqu (%rcx), %xmm10
|
||||
shlq $2, %rdx
|
||||
movq %rdx, (%rsp) # 8-byte Spill
|
||||
xorl %r12d, %r12d
|
||||
jmp .LBB1_4
|
||||
.p2align 4, 0x90
|
||||
.LBB1_5: #
|
||||
# in Loop: Header=BB1_4 Depth=1
|
||||
vxorpd %xmm13, %xmm13, %xmm13
|
||||
movq %r9, %rdx
|
||||
vxorpd %xmm9, %xmm9, %xmm9
|
||||
vxorpd %xmm14, %xmm14, %xmm14
|
||||
.LBB1_6: #
|
||||
# in Loop: Header=BB1_4 Depth=1
|
||||
vaddsd (%rdi,%r15,8), %xmm14, %xmm0
|
||||
vmovsd %xmm0, (%rdi,%r15,8)
|
||||
vaddsd (%rdi,%r10,8), %xmm9, %xmm0
|
||||
vmovsd %xmm0, (%rdi,%r10,8)
|
||||
vaddsd (%rdi,%r11,8), %xmm13, %xmm0
|
||||
vmovsd %xmm0, (%rdi,%r11,8)
|
||||
leal 3(%r9), %ecx
|
||||
addl $6, %r9d
|
||||
testl %ecx, %ecx
|
||||
cmovnsl %ecx, %r9d
|
||||
sarl $2, %r9d
|
||||
movslq %r9d, %rcx
|
||||
vmovq %rcx, %xmm0
|
||||
vmovq %rdx, %xmm1
|
||||
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
|
||||
vpaddq %xmm0, %xmm10, %xmm10
|
||||
incq %r12
|
||||
addq (%rsp), %rax # 8-byte Folded Reload
|
||||
cmpq %r13, %r12
|
||||
je .LBB1_7
|
||||
.LBB1_4: #
|
||||
# =>This Loop Header: Depth=1
|
||||
# Child Loop BB1_10 Depth 2
|
||||
movq 8(%rsp), %rcx # 8-byte Reload
|
||||
movslq (%rcx,%r12,4), %r9
|
||||
leaq (%r12,%r12,2), %rcx
|
||||
leal 1(%rcx), %r10d
|
||||
leal 2(%rcx), %r11d
|
||||
movl %ecx, %r15d
|
||||
testq %r9, %r9
|
||||
jle .LBB1_5
|
||||
# %bb.9: #
|
||||
# in Loop: Header=BB1_4 Depth=1
|
||||
vmovsd (%rsi,%r15,8), %xmm15 # xmm15 = mem[0],zero
|
||||
vmovsd (%rsi,%r10,8), %xmm4 # xmm4 = mem[0],zero
|
||||
vmovsd (%rsi,%r11,8), %xmm1 # xmm1 = mem[0],zero
|
||||
movl %r9d, %edx
|
||||
vxorpd %xmm14, %xmm14, %xmm14
|
||||
xorl %ecx, %ecx
|
||||
vxorpd %xmm9, %xmm9, %xmm9
|
||||
vxorpd %xmm13, %xmm13, %xmm13
|
||||
jmp .LBB1_10
|
||||
.p2align 4, 0x90
|
||||
.LBB1_13: #
|
||||
# in Loop: Header=BB1_10 Depth=2
|
||||
incq %rcx
|
||||
cmpq %rcx, %rdx
|
||||
je .LBB1_6
|
||||
.LBB1_10: #
|
||||
# Parent Loop BB1_4 Depth=1
|
||||
# => This Inner Loop Header: Depth=2
|
||||
movslq (%rax,%rcx,4), %r8
|
||||
leaq (%r8,%r8,2), %r14
|
||||
vsubsd (%rsi,%r14,8), %xmm15, %xmm2
|
||||
movslq %r14d, %rbp
|
||||
vsubsd 8(%rsi,%rbp,8), %xmm4, %xmm5
|
||||
vsubsd 16(%rsi,%rbp,8), %xmm1, %xmm0
|
||||
vmulsd %xmm2, %xmm2, %xmm6
|
||||
vfmadd231sd %xmm5, %xmm5, %xmm6 # xmm6 = (xmm5 * xmm5) + xmm6
|
||||
vfmadd231sd %xmm0, %xmm0, %xmm6 # xmm6 = (xmm0 * xmm0) + xmm6
|
||||
vucomisd %xmm12, %xmm6
|
||||
jae .LBB1_13
|
||||
# %bb.11: #
|
||||
# in Loop: Header=BB1_10 Depth=2
|
||||
vmovsd .LCPI1_1(%rip), %xmm3 # xmm3 = mem[0],zero
|
||||
vdivsd %xmm6, %xmm3, %xmm6
|
||||
vmulsd 32(%rsp), %xmm6, %xmm3 # 8-byte Folded Reload
|
||||
vmulsd %xmm6, %xmm6, %xmm8
|
||||
vmulsd %xmm3, %xmm8, %xmm3
|
||||
vaddsd .LCPI1_2(%rip), %xmm3, %xmm7
|
||||
vmulsd %xmm6, %xmm11, %xmm6
|
||||
vmulsd %xmm3, %xmm6, %xmm3
|
||||
vmulsd %xmm7, %xmm3, %xmm3
|
||||
vmulsd %xmm2, %xmm3, %xmm6
|
||||
vaddsd %xmm6, %xmm14, %xmm14
|
||||
vmulsd %xmm5, %xmm3, %xmm2
|
||||
vaddsd %xmm2, %xmm9, %xmm9
|
||||
vmulsd %xmm0, %xmm3, %xmm0
|
||||
vaddsd %xmm0, %xmm13, %xmm13
|
||||
cmpl %r13d, %r8d
|
||||
jge .LBB1_13
|
||||
# %bb.12: #
|
||||
# in Loop: Header=BB1_10 Depth=2
|
||||
leaq 1(%rbp), %rbx
|
||||
addq $2, %rbp
|
||||
vmovsd (%rdi,%r14,8), %xmm3 # xmm3 = mem[0],zero
|
||||
vsubsd %xmm6, %xmm3, %xmm3
|
||||
vmovsd %xmm3, (%rdi,%r14,8)
|
||||
vmovsd (%rdi,%rbx,8), %xmm3 # xmm3 = mem[0],zero
|
||||
vsubsd %xmm2, %xmm3, %xmm2
|
||||
vmovsd %xmm2, (%rdi,%rbx,8)
|
||||
vmovsd (%rdi,%rbp,8), %xmm2 # xmm2 = mem[0],zero
|
||||
vsubsd %xmm0, %xmm2, %xmm0
|
||||
vmovsd %xmm0, (%rdi,%rbp,8)
|
||||
jmp .LBB1_13
|
||||
.LBB1_7: #
|
||||
movq 16(%rsp), %rax # 8-byte Reload
|
||||
vmovdqu %xmm10, (%rax)
|
||||
.LBB1_8: #
|
||||
movl $.L.str.1, %edi
|
||||
callq likwid_markerStopRegion
|
||||
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
vsubsd 24(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload
|
||||
addq $40, %rsp
|
||||
.cfi_def_cfa_offset 56
|
||||
popq %rbx
|
||||
.cfi_def_cfa_offset 48
|
||||
popq %r12
|
||||
.cfi_def_cfa_offset 40
|
||||
popq %r13
|
||||
.cfi_def_cfa_offset 32
|
||||
popq %r14
|
||||
.cfi_def_cfa_offset 24
|
||||
popq %r15
|
||||
.cfi_def_cfa_offset 16
|
||||
popq %rbp
|
||||
.cfi_def_cfa_offset 8
|
||||
retq
|
||||
.Lfunc_end1:
|
||||
.size computeForceLJHalfNeigh, .Lfunc_end1-computeForceLJHalfNeigh
|
||||
.cfi_endproc
|
||||
# -- End function
|
||||
.globl computeForceLJFullNeigh_simd # -- Begin function computeForceLJFullNeigh_simd
|
||||
# computeForceLJFullNeigh_simd -- compiler-generated code, AT&T syntax.
# This build contains no SIMD kernel: the routine only reports an error and exits.
#   In:  %rsi = pointer to a structure with a 32-bit count at offset 4 and a
#        pointer at offset 64 (NOTE(review): presumably the atom struct -- confirm
#        against force_lj.c).
#   Effect: if count > 0, zero-fills count*24 bytes at the pointer from 64(%rsi);
#           calls getTimeStamp; writes the 65-byte message .L.str.2 to stderr;
#           calls exit(-1).  No ret instruction is emitted after the exit call.
.p2align 4, 0x90
|
||||
.type computeForceLJFullNeigh_simd,@function
|
||||
computeForceLJFullNeigh_simd: #
|
||||
.LcomputeForceLJFullNeigh_simd$local:
|
||||
.cfi_startproc
|
||||
# %bb.0: #
|
||||
# The push only moves %rsp down by 8 so the stack is 16-byte aligned at the
# calls below; the pushed value is never popped.
pushq %rax
|
||||
.cfi_def_cfa_offset 16
|
||||
# eax = count (the 32-bit write also clears the upper half of %rax).
movl 4(%rsi), %eax
|
||||
testl %eax, %eax
|
||||
# count <= 0: nothing to clear.
jle .LBB2_2
|
||||
# %bb.1: #
|
||||
# _intel_fast_memset(dst = 64(%rsi), value = 0, bytes = count * 8 * 3).
# NOTE(review): 24 bytes per entry looks like three doubles per atom -- confirm.
movq 64(%rsi), %rdi
|
||||
shlq $3, %rax
|
||||
leaq (%rax,%rax,2), %rdx
|
||||
xorl %esi, %esi
|
||||
callq _intel_fast_memset
|
||||
.LBB2_2: #
|
||||
# %al = 0 before the call: no vector-register arguments are passed.
# The timestamp returned by getTimeStamp is not used afterwards.
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
# fwrite(.L.str.2, 65, 1, stderr): size 65 is the message without its NUL.
movq stderr(%rip), %rcx
|
||||
movl $.L.str.2, %edi
|
||||
movl $65, %esi
|
||||
movl $1, %edx
|
||||
callq fwrite
|
||||
# exit(-1).
movl $-1, %edi
|
||||
callq exit
|
||||
.Lfunc_end2:
|
||||
.size computeForceLJFullNeigh_simd, .Lfunc_end2-computeForceLJFullNeigh_simd
|
||||
.cfi_endproc
|
||||
# -- End function
|
||||
.type .L.str,@object #
|
||||
.section .rodata.str1.1,"aMS",@progbits,1
|
||||
.L.str:
|
||||
.asciz "force"
|
||||
.size .L.str, 6
|
||||
.type .L.str.1,@object #
|
||||
.L.str.1:
|
||||
.asciz "forceLJ-halfneigh"
|
||||
.size .L.str.1, 18
|
||||
.type .L.str.2,@object #
|
||||
.L.str.2:
|
||||
.asciz "Error: SIMD kernel not implemented for specified instruction set!"
|
||||
.size .L.str.2, 66
|
||||
.ident "Intel(R) oneAPI DPC++ Compiler 2021.1-beta05 (2020.2.0.0304)"
|
||||
.section ".note.GNU-stack","",@progbits
|
||||
@@ -1,105 +0,0 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: force_lj_icx_avx2_markers.s
|
||||
Architecture: ZEN3
|
||||
Timestamp: 2022-12-12 12:47:07
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 | 1 | 2 | 3 | DV0 | DV1 | 4 | 5 | 6 | 7 | 8 - 8DV | 9 | 10 | 11 | 12 | 13 || CP | LCD |
|
||||
---------------------------------------------------------------------------------------------------------------------------------------------
|
||||
172 | | | | | | | | | | | | | | | | || | | .LBB0_9: #
|
||||
173 | | | | | | | | | | | | | | | | || | | # Parent Loop BB0_6 Depth=1
|
||||
174 | | | | | | | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2
|
||||
175 | | 0.250 | 0.75 | | | | | | | | | | | 0.50 | 0.50 | || 1.0 | | vpbroadcastd .LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
|
||||
176 | 0.00 | | | 1.00 | | | | | | | | | | 0.50 | 0.50 | || 3.0 | | vpmulld (%r11,%rbp,4), %xmm1, %xmm11
|
||||
177 | 0.00 | 1.010 | 0.25 | 0.74 | | | | | | | | | | | | || 4.0 | | vpmovsxdq %xmm11, %ymm1
|
||||
178 | | 0.000 | 1.00 | | | | | | | | | | | | | || 1.0 | | vpsllq $3, %ymm1, %ymm1
|
||||
179 | 0.00 | 0.000 | 0.49 | 0.51 | | | | | | | | | | | | || 1.0 | | vpaddq %ymm1, %ymm3, %ymm1
|
||||
180 | 0.00 | 0.000 | 0.51 | 0.49 | | | | | | | | | | | | || | | vmovq %xmm1, %r14
|
||||
181 | | 1.000 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm1, %r9
|
||||
182 | | 1.000 | | | | | | | | | | | | | | || 3.0 | | vextracti128 $1, %ymm1, %xmm1
|
||||
183 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%r14), %xmm2 # xmm2 = mem[0],zero
|
||||
184 | 0.00 | 0.000 | 0.49 | 0.51 | | | | | | | | | | | | || | | vpsubd .LCPI0_5, %xmm11, %xmm6
|
||||
185 | 0.00 | 0.750 | 0.38 | 0.87 | | | | | | | | | | | | || | | vpmovsxdq %xmm6, %ymm6
|
||||
186 | | 0.000 | 1.00 | | | | | | | | | | | | | || | | vpsllq $3, %ymm6, %ymm6
|
||||
187 | 0.00 | 0.000 | 0.50 | 0.50 | | | | | | | | | | | | || | | vmovq %xmm1, %rdi
|
||||
188 | 0.00 | 0.000 | 0.50 | 0.50 | | | | | | | | | | | | || | | vpaddq %ymm6, %ymm3, %ymm6
|
||||
189 | 0.00 | 0.000 | 0.50 | 0.50 | | | | | | | | | | | | || | | vmovq %xmm6, %rcx
|
||||
190 | | 1.000 | | | | | | | | | | | | | | || 6.0 | | vpextrq $1, %xmm1, %rbx
|
||||
191 | | 1.000 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm6, %rax
|
||||
192 | | 1.000 | | | | | | | | | | | | | | || | | vextracti128 $1, %ymm6, %xmm1
|
||||
193 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rdi), %xmm6 # xmm6 = mem[0],zero
|
||||
194 | 0.00 | 0.000 | 0.00 | 1.00 | | | | | | | | | | | | || | | vmovq %xmm1, %rdi
|
||||
195 | | 1.000 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm1, %rsi
|
||||
196 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
|
||||
197 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rcx), %xmm7 # xmm7 = mem[0],zero
|
||||
198 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vpbroadcastd .LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
|
||||
199 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%r9), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0]
|
||||
200 | 0.00 | 0.000 | 0.62 | 0.38 | | | | | | | | | | | | || | | vpaddd %xmm12, %xmm11, %xmm4
|
||||
201 | 0.00 | 0.750 | 0.00 | 1.25 | | | | | | | | | | | | || | | vpmovsxdq %xmm4, %ymm4
|
||||
202 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rax), %xmm7, %xmm7 # xmm7 = xmm7[0],mem[0]
|
||||
203 | | 0.000 | 1.00 | | | | | | | | | | | | | || | | vpsllq $3, %ymm4, %ymm4
|
||||
204 | 0.00 | 0.000 | 0.00 | 1.00 | | | | | | | | | | | | || | | vpaddq %ymm4, %ymm3, %ymm4
|
||||
205 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || 5.0 | | vmovhpd (%rbx), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||
206 | | 1.000 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm4, %rax
|
||||
207 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rsi), %xmm1, %xmm1 # xmm1 = xmm1[0],mem[0]
|
||||
208 | 0.00 | 0.000 | 0.51 | 0.49 | | | | | | | | | | | | || | | vmovq %xmm4, %rcx
|
||||
209 | | 1.000 | | | | | | | | | | | | | | || | | vextracti128 $1, %ymm4, %xmm4
|
||||
210 | 0.00 | -0.01 | 0.00 | 1.00 | | | | | | | | | | | | || | | vmovq %xmm4, %rsi
|
||||
211 | | 1.000 | | | | | | | | | | | | | | || 1.0 | | vinsertf128 $1, %xmm6, %ymm2, %ymm2
|
||||
212 | | 1.000 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm4, %rdi
|
||||
213 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rsi), %xmm4 # xmm4 = mem[0],zero
|
||||
214 | | | 0.00 | 1.00 | | | | | | | | | | | | || 3.0 | | vsubpd %ymm2, %ymm14, %ymm2
|
||||
215 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rdi), %xmm4, %xmm4 # xmm4 = xmm4[0],mem[0]
|
||||
216 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rcx), %xmm6 # xmm6 = mem[0],zero
|
||||
217 | | 1.000 | | | | | | | | | | | | | | || | | vinsertf128 $1, %xmm1, %ymm7, %ymm1
|
||||
218 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rax), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||
219 | | 1.000 | | | | | | | | | | | | | | || | | vinsertf128 $1, %xmm4, %ymm6, %ymm4
|
||||
220 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vsubpd %ymm1, %ymm5, %ymm1
|
||||
221 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vsubpd %ymm4, %ymm10, %ymm4
|
||||
222 | 1.00 | 0.000 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm2, %ymm2, %ymm6
|
||||
223 | 1.00 | 0.000 | | | | | | | | | | | | | | || 4.0 | | vfmadd231pd %ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
|
||||
224 | 1.00 | 0.000 | | | | | | | | | | | | | | || 4.0 | | vfmadd231pd %ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
|
||||
225 | 1.00 | | | | | | | | | | | | | 0.50 | 0.50 | || | | vbroadcastsd .LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
|
||||
226 | | | | | 4.50 | 4.50 | | | | | | | | | | || 13.0 | | vdivpd %ymm6, %ymm7, %ymm7
|
||||
227 | 1.00 | 0.000 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm7, %ymm11
|
||||
228 | 1.00 | 0.000 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm9, %ymm11, %ymm11
|
||||
229 | 1.00 | | | | | | | | | | | | | 0.50 | 0.50 | || | | vbroadcastsd .LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
|
||||
230 | 1.00 | 0.000 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm11, %ymm11
|
||||
231 | | | 0.00 | 1.00 | | | | | | | | | | | | || 3.0 | | vaddpd %ymm12, %ymm11, %ymm12
|
||||
232 | 1.00 | 0.000 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
|
||||
233 | 1.00 | 0.000 | | | | | | | | | | | | | | || | | vmulpd %ymm7, %ymm11, %ymm7
|
||||
234 | 1.00 | 0.000 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm12, %ymm7
|
||||
235 | | | 0.12 | 0.88 | | | | | | | | | | | | || | | vcmpltpd %ymm8, %ymm6, %ymm6
|
||||
236 | 1.00 | 0.000 | | | | | | | | | | | | | | || 4.0 | | vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
|
||||
237 | 1.00 | 0.000 | | | | | | | | | | | | | | || 1.0 | | vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
|
||||
238 | 1.00 | 0.000 | | | | | | | | | | | | | | || | | vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
|
||||
239 | 1.00 | 0.000 | | | | | | | | | | | | | | || | 4.0 | vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
|
||||
240 | 0.62 | 0.380 | | | | | | | | | | | | | | || | | vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
|
||||
241 | 0.50 | 0.500 | | | | | | | | | | | | | | || | 1.0 | vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
|
||||
242 | | | | | | | | | 0.25 | 0.25 | 0.25 | 0.25 | | | | || | | addq $4, %rbp
|
||||
243 | | | | | | | | | 0.25 | 0.25 | 0.25 | 0.25 | | | | || | | cmpq %rdx, %rbp
|
||||
244 | | | | | | | | | 0.00 | | | | 1.00 | | | || | | jb .LBB0_9
|
||||
|
||||
16.1 15.63 15.6 15.6 4.50 4.50 0.50 0.50 0.50 0.50 9.00 9.00 72 5.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
239 | 5.0 | vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13| [239, 241]
|
||||
238 | 5.0 | vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15| [238, 240]
|
||||
236 | 5.0 | vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0| [236, 237]
|
||||
242 | 1.0 | addq $4, %rbp | [242]
|
||||
241 | 1.0 | vblendvpd %ymm6, %ymm4, %ymm13, %ymm13| [241]
|
||||
240 | 1.0 | vblendvpd %ymm6, %ymm1, %ymm15, %ymm15| [240]
|
||||
237 | 1.0 | vblendvpd %ymm6, %ymm2, %ymm0, %ymm0| [237]
|
||||
|
||||
@@ -1,638 +0,0 @@
|
||||
.text
|
||||
.file "force_lj.c"
|
||||
.section .rodata.cst8,"aM",@progbits,8
|
||||
.p2align 3 # -- Begin function computeForceLJFullNeigh_plain_c
|
||||
# Constant pool of computeForceLJFullNeigh_plain_c.
# 8-byte doubles: 48.0, 1.0 and -0.5 (stored as their IEEE-754 bit patterns).
.LCPI0_0:
|
||||
.quad 4631952216750555136 # 48
|
||||
.LCPI0_3:
|
||||
.quad 4607182418800017408 # 1
|
||||
.LCPI0_4:
|
||||
.quad -4620693217682128896 # -0.5
|
||||
.section .rodata.cst4,"aM",@progbits,4
|
||||
.p2align 2
|
||||
# 32-bit integers 3 and 2, broadcast in the vector loop to form 3*j and 3*j+2.
.LCPI0_1:
|
||||
.long 3 # 0x3
|
||||
.LCPI0_2:
|
||||
.long 2 # 0x2
|
||||
.section .rodata.cst16,"aM",@progbits,16
|
||||
.p2align 4
|
||||
# 16 bytes of 0xFF, i.e. four 32-bit lanes of -1; the vector loop subtracts this
# constant (vpsubd) to add 1 to every lane.
.LCPI0_5:
|
||||
.zero 16,255
|
||||
.text
|
||||
.globl computeForceLJFullNeigh_plain_c
|
||||
.p2align 4, 0x90
|
||||
.type computeForceLJFullNeigh_plain_c,@function
|
||||
computeForceLJFullNeigh_plain_c: #
|
||||
# computeForceLJFullNeigh_plain_c -- compiler-generated code, AT&T syntax.
#   In:   %rdi, %rsi, %rdx, %rcx = the four pointer arguments.
#         NOTE(review): presumably the parameter, atom, neighbor-list and
#         statistics structures of force_lj.c -- confirm against the C source.
#   Out:  %xmm0 = getTimeStamp() after the kernel minus getTimeStamp() before it.
#   Stack: 264 bytes of spill space; rbp, r15, r14, r13, r12, rbx are saved and
#          restored.
#   Shape: an outer loop over entries (index r10); per entry a 4-wide vector loop
#          (.LBB0_9) followed by a scalar remainder loop (.LBB0_13).
.LcomputeForceLJFullNeigh_plain_c$local:
|
||||
.cfi_startproc
|
||||
# %bb.0: #
|
||||
pushq %rbp
|
||||
.cfi_def_cfa_offset 16
|
||||
pushq %r15
|
||||
.cfi_def_cfa_offset 24
|
||||
pushq %r14
|
||||
.cfi_def_cfa_offset 32
|
||||
pushq %r13
|
||||
.cfi_def_cfa_offset 40
|
||||
pushq %r12
|
||||
.cfi_def_cfa_offset 48
|
||||
pushq %rbx
|
||||
.cfi_def_cfa_offset 56
|
||||
subq $264, %rsp # imm = 0x108
|
||||
.cfi_def_cfa_offset 320
|
||||
.cfi_offset %rbx, -56
|
||||
.cfi_offset %r12, -48
|
||||
.cfi_offset %r13, -40
|
||||
.cfi_offset %r14, -32
|
||||
.cfi_offset %r15, -24
|
||||
.cfi_offset %rbp, -16
|
||||
# Keep arguments 4, 3 and 2 in callee-saved registers so they survive the calls.
movq %rcx, %rbx
|
||||
movq %rdx, %r15
|
||||
movq %rsi, %r12
|
||||
# r14d = 32-bit count at offset 4 of the second argument (outer trip count).
movl 4(%rsi), %r14d
|
||||
# Spill three doubles of the first argument (offsets 144, 40 and 56) because
# %rdi and the xmm registers are not preserved across the calls below.
vmovsd 144(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, (%rsp) # 8-byte Spill
|
||||
vmovsd 40(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, 128(%rsp) # 8-byte Spill
|
||||
vmovq 56(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovdqa %xmm0, 80(%rsp) # 16-byte Spill
|
||||
testl %r14d, %r14d
|
||||
jle .LBB0_2
|
||||
# %bb.1: #
|
||||
# count > 0: zero count*24 bytes of the output array at 64(%r12).
movq 64(%r12), %rdi
|
||||
leaq (,%r14,8), %rax
|
||||
leaq (%rax,%rax,2), %rdx
|
||||
xorl %esi, %esi
|
||||
callq _intel_fast_memset
|
||||
.LBB0_2: #
|
||||
# Start timestamp -> 32(%rsp), then open the marker region named by .L.str.
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
vmovq %xmm0, 32(%rsp) # 8-byte Folded Spill
|
||||
movl $.L.str, %edi
|
||||
callq likwid_markerStartRegion
|
||||
testl %r14d, %r14d
|
||||
jle .LBB0_19
|
||||
# %bb.3: #
|
||||
# Loop setup.
#   xmm13 = (double from 144(%rdi))^2: threshold for the squared distance.
vmovsd (%rsp), %xmm0 # 8-byte Reload
|
||||
# xmm0 = mem[0],zero
|
||||
vmulsd %xmm0, %xmm0, %xmm13
|
||||
#   r11 = 16(%r15): 32-bit index array; rsi = 24(%r15): per-entry index counts;
#   rdi = 8(%r15), later scaled by 4: byte stride between index rows.
movq 16(%r15), %r11
|
||||
movq 24(%r15), %rsi
|
||||
movslq 8(%r15), %rdi
|
||||
#   r15 = 16(%r12): input array of doubles, three per entry (read only here);
#   r8  = 64(%r12): output array of doubles, three per entry (accumulated into).
movq 16(%r12), %r15
|
||||
movq 64(%r12), %r8
|
||||
#   xmm15 = 48.0 * (double from 40(%rdi)).
vmovsd 128(%rsp), %xmm0 # 8-byte Reload
|
||||
# xmm0 = mem[0],zero
|
||||
vmulsd .LCPI0_0(%rip), %xmm0, %xmm15
|
||||
#   xmm14 = two 64-bit counters read through the fourth argument; they are
#   updated once per entry and written back at .LBB0_18.
movq %rbx, 24(%rsp) # 8-byte Spill
|
||||
vmovdqu (%rbx), %xmm14
|
||||
#   r14 = count - 1: the outer loop stops when r10 equals it.
decq %r14
|
||||
#   ymm3 = base address of the input array in every lane (used to build the
#   four load addresses of the emulated gather).
vmovq %r15, %xmm0
|
||||
vpbroadcastq %xmm0, %ymm3
|
||||
#   ymm2 = threshold, ymm8 = double from 56(%rdi), ymm9 = xmm15, each broadcast.
vbroadcastsd %xmm13, %ymm2
|
||||
vmovapd 80(%rsp), %xmm12 # 16-byte Reload
|
||||
vbroadcastsd %xmm12, %ymm8
|
||||
vbroadcastsd %xmm15, %ymm9
|
||||
shlq $2, %rdi
|
||||
# r10 = outer loop index.
xorl %r10d, %r10d
|
||||
# Spill everything the vector loop clobbers.  Note that 128(%rsp) is reused:
# from here on it holds the broadcast ymm9, not the scalar spilled at entry.
movq %r14, 56(%rsp) # 8-byte Spill
|
||||
vmovapd %xmm13, 192(%rsp) # 16-byte Spill
|
||||
movq %rsi, 48(%rsp) # 8-byte Spill
|
||||
movq %rdi, 40(%rsp) # 8-byte Spill
|
||||
vmovapd %xmm15, 176(%rsp) # 16-byte Spill
|
||||
vmovupd %ymm2, 224(%rsp) # 32-byte Spill
|
||||
vmovupd %ymm9, 128(%rsp) # 32-byte Spill
|
||||
jmp .LBB0_6
|
||||
.p2align 4, 0x90
|
||||
.LBB0_17: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
# All indices of this entry were processed: rdx = r13 for the counter update.
movq %r13, %rdx
|
||||
.LBB0_5: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
# Add the three sums (xmm10, xmm11, xmm5) to output elements 3*r10, 3*r10+1
# and 3*r10+2.
vaddsd (%r8,%r12,8), %xmm10, %xmm0
|
||||
vmovsd %xmm0, (%r8,%r12,8)
|
||||
vaddsd (%r8,%rbx,8), %xmm11, %xmm0
|
||||
vmovsd %xmm0, (%r8,%rbx,8)
|
||||
vaddsd (%r8,%rbp,8), %xmm5, %xmm0
|
||||
vmovsd %xmm0, (%r8,%rbp,8)
|
||||
# r13d = (r13d + 3) / 4 as a signed division: the cmov selects r13d+3 when it
# is non-negative and r13d+6 (bias for truncation toward zero) otherwise.
leal 3(%r13), %eax
|
||||
addl $6, %r13d
|
||||
testl %eax, %eax
|
||||
cmovnsl %eax, %r13d
|
||||
sarl $2, %r13d
|
||||
movslq %r13d, %rax
|
||||
# xmm14 += { rdx, (r13d + 3) / 4 }: update both 64-bit counters at once.
vmovq %rax, %xmm0
|
||||
vmovq %rdx, %xmm1
|
||||
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
|
||||
vpaddq %xmm0, %xmm14, %xmm14
|
||||
# Advance to the next index row.  The leaq increments r10 without touching the
# flags set by cmpq, so je still tests "old r10 == count - 1".
addq %rdi, %r11
|
||||
cmpq %r14, %r10
|
||||
leaq 1(%r10), %r10
|
||||
je .LBB0_18
|
||||
.LBB0_6: #
|
||||
# =>This Loop Header: Depth=1
|
||||
# Child Loop BB0_9 Depth 2
|
||||
# Child Loop BB0_13 Depth 2
|
||||
# r13d = number of indices for entry r10; r12d, ebx, ebp = 3*r10, 3*r10+1,
# 3*r10+2 (element positions of the entry in the input and output arrays).
movl (%rsi,%r10,4), %r13d
|
||||
leal (%r10,%r10,2), %r12d
|
||||
leal (%r10,%r10,2), %ebx
|
||||
incl %ebx
|
||||
leal (%r10,%r10,2), %ebp
|
||||
addl $2, %ebp
|
||||
testl %r13d, %r13d
|
||||
jle .LBB0_4
|
||||
# %bb.7: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
# xmm0, xmm1, xmm2 = the three input doubles of entry r10.
vmovsd (%r15,%r12,8), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd (%r15,%rbx,8), %xmm1 # xmm1 = mem[0],zero
|
||||
vmovsd (%r15,%rbp,8), %xmm2 # xmm2 = mem[0],zero
|
||||
# rdx = r13 & 0xFFFFFFFC: how many indices the 4-wide loop handles.
movq %r13, %rdx
|
||||
movl $4294967292, %eax # imm = 0xFFFFFFFC
|
||||
andq %rax, %rdx
|
||||
# The vector moves below do not modify flags, so the je still tests the result
# of the andq (zero means fewer than four indices).
vmovapd %xmm0, 112(%rsp) # 16-byte Spill
|
||||
vmovapd %xmm1, 96(%rsp) # 16-byte Spill
|
||||
vmovapd %xmm2, (%rsp) # 16-byte Spill
|
||||
je .LBB0_16
|
||||
# %bb.8: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
# Vector-loop preheader: spill rbp, rbx and the counters, broadcast the entry's
# three doubles (ymm14, ymm5, ymm10) and clear the accumulators (ymm0, ymm15,
# ymm13).  ebp becomes the position inside the index row.
movq %rbp, 64(%rsp) # 8-byte Spill
|
||||
movq %rbx, 72(%rsp) # 8-byte Spill
|
||||
vmovdqa %xmm14, 208(%rsp) # 16-byte Spill
|
||||
vbroadcastsd %xmm0, %ymm14
|
||||
vbroadcastsd %xmm1, %ymm5
|
||||
vbroadcastsd %xmm2, %ymm10
|
||||
vxorpd %xmm0, %xmm0, %xmm0
|
||||
vxorpd %xmm15, %xmm15, %xmm15
|
||||
vxorpd %xmm13, %xmm13, %xmm13
|
||||
xorl %ebp, %ebp
|
||||
# ymm9 = broadcast of the double from 56(%rdi) (previously in ymm8);
# ymm8 = broadcast threshold reloaded from 224(%rsp).
vmovapd %ymm8, %ymm9
|
||||
vmovupd 224(%rsp), %ymm8 # 32-byte Reload
|
||||
.p2align 4, 0x90
|
||||
# OSACA-BEGIN
|
||||
# LLVM-MCA-BEGIN
|
||||
.LBB0_9: #
|
||||
# Parent Loop BB0_6 Depth=1
|
||||
# => This Inner Loop Header: Depth=2
|
||||
# One iteration handles four indices j.
# xmm11 = 3*j; ymm1 = address of input element 3*j for each lane.
vpbroadcastd .LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
|
||||
vpmulld (%r11,%rbp,4), %xmm1, %xmm11
|
||||
vpmovsxdq %xmm11, %ymm1
|
||||
vpsllq $3, %ymm1, %ymm1
|
||||
vpaddq %ymm1, %ymm3, %ymm1
|
||||
# The addresses are moved to general registers and loaded with scalar
# vmovsd/vmovhpd, then merged with vinsertf128 (no gather instruction is used).
vmovq %xmm1, %r14
|
||||
vpextrq $1, %xmm1, %r9
|
||||
vextracti128 $1, %ymm1, %xmm1
|
||||
vmovsd (%r14), %xmm2 # xmm2 = mem[0],zero
|
||||
# xmm6 = 3*j + 1 (subtracting the all-ones constant adds 1); ymm6 = addresses.
vpsubd .LCPI0_5, %xmm11, %xmm6
|
||||
vpmovsxdq %xmm6, %ymm6
|
||||
vpsllq $3, %ymm6, %ymm6
|
||||
vmovq %xmm1, %rdi
|
||||
vpaddq %ymm6, %ymm3, %ymm6
|
||||
vmovq %xmm6, %rcx
|
||||
vpextrq $1, %xmm1, %rbx
|
||||
vpextrq $1, %xmm6, %rax
|
||||
vextracti128 $1, %ymm6, %xmm1
|
||||
vmovsd (%rdi), %xmm6 # xmm6 = mem[0],zero
|
||||
vmovq %xmm1, %rdi
|
||||
vpextrq $1, %xmm1, %rsi
|
||||
vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
|
||||
vmovsd (%rcx), %xmm7 # xmm7 = mem[0],zero
|
||||
# xmm4 = 3*j + 2; ymm4 = addresses of the third component.
vpbroadcastd .LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
|
||||
vmovhpd (%r9), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0]
|
||||
vpaddd %xmm12, %xmm11, %xmm4
|
||||
vpmovsxdq %xmm4, %ymm4
|
||||
vmovhpd (%rax), %xmm7, %xmm7 # xmm7 = xmm7[0],mem[0]
|
||||
vpsllq $3, %ymm4, %ymm4
|
||||
vpaddq %ymm4, %ymm3, %ymm4
|
||||
vmovhpd (%rbx), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||
vpextrq $1, %xmm4, %rax
|
||||
vmovhpd (%rsi), %xmm1, %xmm1 # xmm1 = xmm1[0],mem[0]
|
||||
vmovq %xmm4, %rcx
|
||||
vextracti128 $1, %ymm4, %xmm4
|
||||
vmovq %xmm4, %rsi
|
||||
vinsertf128 $1, %xmm6, %ymm2, %ymm2
|
||||
vpextrq $1, %xmm4, %rdi
|
||||
vmovsd (%rsi), %xmm4 # xmm4 = mem[0],zero
|
||||
# ymm2, ymm1, ymm4 = entry value minus indexed value, per component.
vsubpd %ymm2, %ymm14, %ymm2
|
||||
vmovhpd (%rdi), %xmm4, %xmm4 # xmm4 = xmm4[0],mem[0]
|
||||
vmovsd (%rcx), %xmm6 # xmm6 = mem[0],zero
|
||||
vinsertf128 $1, %xmm1, %ymm7, %ymm1
|
||||
vmovhpd (%rax), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||
vinsertf128 $1, %xmm4, %ymm6, %ymm4
|
||||
vsubpd %ymm1, %ymm5, %ymm1
|
||||
vsubpd %ymm4, %ymm10, %ymm4
|
||||
# ymm6 = sum of the three squared differences.
vmulpd %ymm2, %ymm2, %ymm6
|
||||
vfmadd231pd %ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
|
||||
vfmadd231pd %ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
|
||||
# ymm7 = 1.0 / ymm6; ymm11 = ymm7^3 * ymm9; ymm12 = ymm11 - 0.5;
# ymm7 = (broadcast at 128(%rsp)) * ymm7 * ymm11 * ymm12 = per-lane factor.
vbroadcastsd .LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
|
||||
vdivpd %ymm6, %ymm7, %ymm7
|
||||
vmulpd %ymm7, %ymm7, %ymm11
|
||||
vmulpd %ymm9, %ymm11, %ymm11
|
||||
vbroadcastsd .LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
|
||||
vmulpd %ymm7, %ymm11, %ymm11
|
||||
vaddpd %ymm12, %ymm11, %ymm12
|
||||
vmulpd 128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
|
||||
vmulpd %ymm7, %ymm11, %ymm7
|
||||
vmulpd %ymm7, %ymm12, %ymm7
|
||||
# ymm6 = lane mask "sum of squares < threshold".  Each blend takes the updated
# accumulator only in lanes where the mask is set and keeps the old value
# elsewhere.
vcmpltpd %ymm8, %ymm6, %ymm6
|
||||
vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
|
||||
vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
|
||||
vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
|
||||
vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
|
||||
vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
|
||||
vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
|
||||
addq $4, %rbp
|
||||
cmpq %rdx, %rbp
|
||||
jb .LBB0_9
|
||||
# LLVM-MCA-END
|
||||
# OSACA-END
|
||||
# %bb.10: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
# Horizontal sums of the four lanes: xmm10 <- ymm0, xmm11 <- ymm15,
# xmm5 <- ymm13.
vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
|
||||
vaddsd %xmm1, %xmm0, %xmm1
|
||||
vextractf128 $1, %ymm0, %xmm0
|
||||
vaddsd %xmm0, %xmm1, %xmm1
|
||||
vpermilpd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0]
|
||||
vaddsd %xmm0, %xmm1, %xmm10
|
||||
vpermilpd $1, %xmm15, %xmm1 # xmm1 = xmm15[1,0]
|
||||
vaddsd %xmm1, %xmm15, %xmm1
|
||||
vextractf128 $1, %ymm15, %xmm2
|
||||
vaddsd %xmm2, %xmm1, %xmm1
|
||||
vpermilpd $1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
|
||||
vaddsd %xmm2, %xmm1, %xmm11
|
||||
vpermilpd $1, %xmm13, %xmm1 # xmm1 = xmm13[1,0]
|
||||
vaddsd %xmm1, %xmm13, %xmm1
|
||||
vextractf128 $1, %ymm13, %xmm2
|
||||
vaddsd %xmm2, %xmm1, %xmm1
|
||||
vpermilpd $1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
|
||||
vaddsd %xmm2, %xmm1, %xmm5
|
||||
# Restore the registers the vector loop reused for other purposes.
movq 56(%rsp), %r14 # 8-byte Reload
|
||||
vmovapd 80(%rsp), %xmm12 # 16-byte Reload
|
||||
vmovapd 192(%rsp), %xmm13 # 16-byte Reload
|
||||
movq 48(%rsp), %rsi # 8-byte Reload
|
||||
movq 40(%rsp), %rdi # 8-byte Reload
|
||||
vmovdqa 208(%rsp), %xmm14 # 16-byte Reload
|
||||
vmovapd 176(%rsp), %xmm15 # 16-byte Reload
|
||||
vmovapd %ymm9, %ymm8
|
||||
movq 72(%rsp), %rbx # 8-byte Reload
|
||||
movq 64(%rsp), %rbp # 8-byte Reload
|
||||
vmovapd 112(%rsp), %xmm0 # 16-byte Reload
|
||||
# rdx >= r13: the index count was a multiple of four, so no remainder is left.
cmpq %r13, %rdx
|
||||
jae .LBB0_17
|
||||
jmp .LBB0_11
|
||||
.p2align 4, 0x90
|
||||
.LBB0_4: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
# Entry has no indices (r13d <= 0): all sums are zero, rdx = sign-extended r13d.
movslq %r13d, %rdx
|
||||
vxorpd %xmm5, %xmm5, %xmm5
|
||||
vxorpd %xmm11, %xmm11, %xmm11
|
||||
vxorpd %xmm10, %xmm10, %xmm10
|
||||
jmp .LBB0_5
|
||||
.p2align 4, 0x90
|
||||
.LBB0_16: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
# Fewer than four indices: start the scalar loop with zero sums (rdx is 0 here).
vxorpd %xmm10, %xmm10, %xmm10
|
||||
vxorpd %xmm11, %xmm11, %xmm11
|
||||
vxorpd %xmm5, %xmm5, %xmm5
|
||||
cmpq %r13, %rdx
|
||||
jae .LBB0_17
|
||||
.LBB0_11: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
# xmm4 = second input double of entry r10 (xmm0 already holds the first).
vmovapd 96(%rsp), %xmm4 # 16-byte Reload
|
||||
jmp .LBB0_13
|
||||
.p2align 4, 0x90
|
||||
.LBB0_12: #
|
||||
# in Loop: Header=BB0_13 Depth=2
|
||||
incq %rdx
|
||||
cmpq %rdx, %r13
|
||||
je .LBB0_17
|
||||
.LBB0_13: #
|
||||
# Parent Loop BB0_6 Depth=1
|
||||
# => This Inner Loop Header: Depth=2
|
||||
# Scalar remainder loop: one index j per iteration, positions rdx .. r13-1.
# xmm6, xmm2, xmm1 = component differences against elements 3*j, 3*j+1, 3*j+2.
movl (%r11,%rdx,4), %eax
|
||||
leal (%rax,%rax,2), %ecx
|
||||
movslq %ecx, %rcx
|
||||
vsubsd (%r15,%rcx,8), %xmm0, %xmm6
|
||||
leal (%rax,%rax,2), %ecx
|
||||
incl %ecx
|
||||
movslq %ecx, %rcx
|
||||
vsubsd (%r15,%rcx,8), %xmm4, %xmm2
|
||||
leal 2(%rax,%rax,2), %eax
|
||||
cltq
|
||||
vmovapd (%rsp), %xmm1 # 16-byte Reload
|
||||
vsubsd (%r15,%rax,8), %xmm1, %xmm1
|
||||
# xmm7 = sum of squares; skip this index when it is >= the threshold in xmm13.
vmulsd %xmm6, %xmm6, %xmm7
|
||||
vfmadd231sd %xmm2, %xmm2, %xmm7 # xmm7 = (xmm2 * xmm2) + xmm7
|
||||
vfmadd231sd %xmm1, %xmm1, %xmm7 # xmm7 = (xmm1 * xmm1) + xmm7
|
||||
vucomisd %xmm13, %xmm7
|
||||
jae .LBB0_12
|
||||
# %bb.14: #
|
||||
# in Loop: Header=BB0_13 Depth=2
|
||||
# Same formula as the vector loop: xmm7 = 1/sum, xmm0 = xmm7^3 * xmm12,
# factor = xmm15 * xmm7 * xmm0 * (xmm0 - 0.5).
vmovsd .LCPI0_3(%rip), %xmm0 # xmm0 = mem[0],zero
|
||||
vdivsd %xmm7, %xmm0, %xmm7
|
||||
vmulsd %xmm7, %xmm7, %xmm0
|
||||
vmulsd %xmm0, %xmm12, %xmm0
|
||||
vmulsd %xmm7, %xmm0, %xmm0
|
||||
vaddsd .LCPI0_4(%rip), %xmm0, %xmm4
|
||||
vmulsd %xmm7, %xmm15, %xmm7
|
||||
vmulsd %xmm0, %xmm7, %xmm0
|
||||
vmulsd %xmm4, %xmm0, %xmm0
|
||||
# xmm4 and xmm0 were used as scratch above; they are reloaded around the FMAs
# that add factor * difference to the three sums.
vmovapd 96(%rsp), %xmm4 # 16-byte Reload
|
||||
vfmadd231sd %xmm6, %xmm0, %xmm10 # xmm10 = (xmm0 * xmm6) + xmm10
|
||||
vfmadd231sd %xmm2, %xmm0, %xmm11 # xmm11 = (xmm0 * xmm2) + xmm11
|
||||
vfmadd231sd %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5
|
||||
vmovapd 112(%rsp), %xmm0 # 16-byte Reload
|
||||
jmp .LBB0_12
|
||||
.LBB0_18: #
|
||||
# Write the two counters back through the saved fourth argument.
movq 24(%rsp), %rax # 8-byte Reload
|
||||
vmovdqu %xmm14, (%rax)
|
||||
.LBB0_19: #
|
||||
# Close the marker region (vzeroupper first, since ymm registers were used),
# then return the elapsed time: end timestamp minus the start value at 32(%rsp).
movl $.L.str, %edi
|
||||
vzeroupper
|
||||
callq likwid_markerStopRegion
|
||||
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
vsubsd 32(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload
|
||||
addq $264, %rsp # imm = 0x108
|
||||
.cfi_def_cfa_offset 56
|
||||
popq %rbx
|
||||
.cfi_def_cfa_offset 48
|
||||
popq %r12
|
||||
.cfi_def_cfa_offset 40
|
||||
popq %r13
|
||||
.cfi_def_cfa_offset 32
|
||||
popq %r14
|
||||
.cfi_def_cfa_offset 24
|
||||
popq %r15
|
||||
.cfi_def_cfa_offset 16
|
||||
popq %rbp
|
||||
.cfi_def_cfa_offset 8
|
||||
retq
|
||||
.Lfunc_end0:
|
||||
.size computeForceLJFullNeigh_plain_c, .Lfunc_end0-computeForceLJFullNeigh_plain_c
|
||||
.cfi_endproc
|
||||
# -- End function
|
||||
.section .rodata.cst8,"aM",@progbits,8
|
||||
.p2align 3 # -- Begin function computeForceLJHalfNeigh
|
||||
# Constant pool of computeForceLJHalfNeigh: the doubles 48.0, 1.0 and -0.5
# (stored as their IEEE-754 bit patterns).
.LCPI1_0:
|
||||
.quad 4631952216750555136 # 48
|
||||
.LCPI1_1:
|
||||
.quad 4607182418800017408 # 1
|
||||
.LCPI1_2:
|
||||
.quad -4620693217682128896 # -0.5
|
||||
.text
|
||||
.globl computeForceLJHalfNeigh
|
||||
.p2align 4, 0x90
|
||||
.type computeForceLJHalfNeigh,@function
|
||||
computeForceLJHalfNeigh: #
|
||||
# computeForceLJHalfNeigh -- compiler-generated code, AT&T syntax, scalar only.
#   In:   %rdi, %rsi, %rdx, %rcx = the four pointer arguments.
#         NOTE(review): presumably the parameter, atom, neighbor-list and
#         statistics structures of force_lj.c -- confirm against the C source.
#   Out:  %xmm0 = getTimeStamp() after the kernel minus getTimeStamp() before it.
#   Stack: 40 bytes of spill space; rbp, r15, r14, r13, r12, rbx are saved and
#          restored.
#   Each accepted (entry, index j) pair adds its contribution to the entry's
#   sums and, when j < count, subtracts the same contribution from output
#   elements 3*j .. 3*j+2.
.LcomputeForceLJHalfNeigh$local:
|
||||
.cfi_startproc
|
||||
# %bb.0: #
|
||||
pushq %rbp
|
||||
.cfi_def_cfa_offset 16
|
||||
pushq %r15
|
||||
.cfi_def_cfa_offset 24
|
||||
pushq %r14
|
||||
.cfi_def_cfa_offset 32
|
||||
pushq %r13
|
||||
.cfi_def_cfa_offset 40
|
||||
pushq %r12
|
||||
.cfi_def_cfa_offset 48
|
||||
pushq %rbx
|
||||
.cfi_def_cfa_offset 56
|
||||
subq $40, %rsp
|
||||
.cfi_def_cfa_offset 96
|
||||
.cfi_offset %rbx, -56
|
||||
.cfi_offset %r12, -48
|
||||
.cfi_offset %r13, -40
|
||||
.cfi_offset %r14, -32
|
||||
.cfi_offset %r15, -24
|
||||
.cfi_offset %rbp, -16
|
||||
# Save the arguments: fourth -> 16(%rsp), third -> r15, second -> r12.
movq %rcx, 16(%rsp) # 8-byte Spill
|
||||
movq %rdx, %r15
|
||||
movq %rsi, %r12
|
||||
# r13d = 32-bit count at offset 4 of the second argument (outer trip count).
movl 4(%rsi), %r13d
|
||||
# Spill three doubles of the first argument: offset 144 -> 8(%rsp),
# offset 40 -> (%rsp), offset 56 -> 32(%rsp).
vmovsd 144(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, 8(%rsp) # 8-byte Spill
|
||||
vmovsd 40(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, (%rsp) # 8-byte Spill
|
||||
vmovsd 56(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, 32(%rsp) # 8-byte Spill
|
||||
testl %r13d, %r13d
|
||||
jle .LBB1_2
|
||||
# %bb.1: #
|
||||
# count > 0: zero count*24 bytes of the output array at 64(%r12).
movq 64(%r12), %rdi
|
||||
leaq (,%r13,8), %rax
|
||||
leaq (%rax,%rax,2), %rdx
|
||||
xorl %esi, %esi
|
||||
callq _intel_fast_memset
|
||||
.LBB1_2: #
|
||||
# Start timestamp -> 24(%rsp), then open the marker region named by .L.str.1.
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
vmovsd %xmm0, 24(%rsp) # 8-byte Spill
|
||||
movl $.L.str.1, %edi
|
||||
callq likwid_markerStartRegion
|
||||
testl %r13d, %r13d
|
||||
jle .LBB1_8
|
||||
# %bb.3: #
|
||||
# Loop setup.
#   xmm12 = (double from 144(%rdi))^2: threshold for the squared distance.
vmovsd 8(%rsp), %xmm0 # 8-byte Reload
|
||||
# xmm0 = mem[0],zero
|
||||
vmulsd %xmm0, %xmm0, %xmm12
|
||||
#   rax = 16(%r15): 32-bit index array (current row).
#   8(%rsp) is reused for the pointer from 24(%r15): per-entry index counts.
movq 16(%r15), %rax
|
||||
movq 24(%r15), %rcx
|
||||
movq %rcx, 8(%rsp) # 8-byte Spill
|
||||
movslq 8(%r15), %rdx
|
||||
#   rsi = 16(%r12): input array of doubles, three per entry;
#   rdi = 64(%r12): output array of doubles, three per entry.
movq 16(%r12), %rsi
|
||||
movq 64(%r12), %rdi
|
||||
#   xmm11 = 48.0 * (double from 40(%rdi)).
vmovsd (%rsp), %xmm0 # 8-byte Reload
|
||||
# xmm0 = mem[0],zero
|
||||
vmulsd .LCPI1_0(%rip), %xmm0, %xmm11
|
||||
#   xmm10 = two 64-bit counters read through the fourth argument; they are
#   written back at .LBB1_7.
movq 16(%rsp), %rcx # 8-byte Reload
|
||||
vmovdqu (%rcx), %xmm10
|
||||
#   (%rsp) is reused for 4 * 8(%r15): byte stride between index rows.
shlq $2, %rdx
|
||||
movq %rdx, (%rsp) # 8-byte Spill
|
||||
# r12 = outer loop index.
xorl %r12d, %r12d
|
||||
jmp .LBB1_4
|
||||
.p2align 4, 0x90
|
||||
.LBB1_5: #
|
||||
# in Loop: Header=BB1_4 Depth=1
|
||||
# Entry has no indices (r9 <= 0): all sums are zero, rdx = r9.
vxorpd %xmm13, %xmm13, %xmm13
|
||||
movq %r9, %rdx
|
||||
vxorpd %xmm9, %xmm9, %xmm9
|
||||
vxorpd %xmm14, %xmm14, %xmm14
|
||||
.LBB1_6: #
|
||||
# in Loop: Header=BB1_4 Depth=1
|
||||
# Add the three sums (xmm14, xmm9, xmm13) to output elements 3*r12, 3*r12+1
# and 3*r12+2.
vaddsd (%rdi,%r15,8), %xmm14, %xmm0
|
||||
vmovsd %xmm0, (%rdi,%r15,8)
|
||||
vaddsd (%rdi,%r10,8), %xmm9, %xmm0
|
||||
vmovsd %xmm0, (%rdi,%r10,8)
|
||||
vaddsd (%rdi,%r11,8), %xmm13, %xmm0
|
||||
vmovsd %xmm0, (%rdi,%r11,8)
|
||||
# r9d = (r9d + 3) / 4 as a signed division: the cmov selects r9d+3 when it is
# non-negative and r9d+6 (bias for truncation toward zero) otherwise.
leal 3(%r9), %ecx
|
||||
addl $6, %r9d
|
||||
testl %ecx, %ecx
|
||||
cmovnsl %ecx, %r9d
|
||||
sarl $2, %r9d
|
||||
movslq %r9d, %rcx
|
||||
# xmm10 += { rdx, (r9d + 3) / 4 }: update both 64-bit counters at once.
vmovq %rcx, %xmm0
|
||||
vmovq %rdx, %xmm1
|
||||
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
|
||||
vpaddq %xmm0, %xmm10, %xmm10
|
||||
# Next entry: advance rax by one index row; stop when r12 reaches the count.
incq %r12
|
||||
addq (%rsp), %rax # 8-byte Folded Reload
|
||||
cmpq %r13, %r12
|
||||
je .LBB1_7
|
||||
.LBB1_4: #
|
||||
# =>This Loop Header: Depth=1
|
||||
# Child Loop BB1_10 Depth 2
|
||||
# r9 = sign-extended number of indices for entry r12;
# r15d, r10d, r11d = 3*r12, 3*r12+1, 3*r12+2.
movq 8(%rsp), %rcx # 8-byte Reload
|
||||
movslq (%rcx,%r12,4), %r9
|
||||
leaq (%r12,%r12,2), %rcx
|
||||
leal 1(%rcx), %r10d
|
||||
leal 2(%rcx), %r11d
|
||||
movl %ecx, %r15d
|
||||
testq %r9, %r9
|
||||
jle .LBB1_5
|
||||
# %bb.9: #
|
||||
# in Loop: Header=BB1_4 Depth=1
|
||||
# xmm15, xmm4, xmm1 = the three input doubles of entry r12; clear the sums;
# edx = index count, rcx = position inside the index row.
vmovsd (%rsi,%r15,8), %xmm15 # xmm15 = mem[0],zero
|
||||
vmovsd (%rsi,%r10,8), %xmm4 # xmm4 = mem[0],zero
|
||||
vmovsd (%rsi,%r11,8), %xmm1 # xmm1 = mem[0],zero
|
||||
movl %r9d, %edx
|
||||
vxorpd %xmm14, %xmm14, %xmm14
|
||||
xorl %ecx, %ecx
|
||||
vxorpd %xmm9, %xmm9, %xmm9
|
||||
vxorpd %xmm13, %xmm13, %xmm13
|
||||
jmp .LBB1_10
|
||||
.p2align 4, 0x90
|
||||
.LBB1_13: #
|
||||
# in Loop: Header=BB1_10 Depth=2
|
||||
# Loop latch: next position; leave the inner loop once all indices are done.
incq %rcx
|
||||
cmpq %rcx, %rdx
|
||||
je .LBB1_6
|
||||
.LBB1_10: #
|
||||
# Parent Loop BB1_4 Depth=1
|
||||
# => This Inner Loop Header: Depth=2
|
||||
# r8 = index j; r14 = 3*j; rbp = 3*j truncated to 32 bits and sign-extended.
# xmm2, xmm5, xmm0 = component differences against elements 3*j, 3*j+1, 3*j+2.
movslq (%rax,%rcx,4), %r8
|
||||
leaq (%r8,%r8,2), %r14
|
||||
vsubsd (%rsi,%r14,8), %xmm15, %xmm2
|
||||
movslq %r14d, %rbp
|
||||
vsubsd 8(%rsi,%rbp,8), %xmm4, %xmm5
|
||||
vsubsd 16(%rsi,%rbp,8), %xmm1, %xmm0
|
||||
# xmm6 = sum of squares; skip this index when it is >= the threshold in xmm12.
vmulsd %xmm2, %xmm2, %xmm6
|
||||
vfmadd231sd %xmm5, %xmm5, %xmm6 # xmm6 = (xmm5 * xmm5) + xmm6
|
||||
vfmadd231sd %xmm0, %xmm0, %xmm6 # xmm6 = (xmm0 * xmm0) + xmm6
|
||||
vucomisd %xmm12, %xmm6
|
||||
jae .LBB1_13
|
||||
# %bb.11: #
|
||||
# in Loop: Header=BB1_10 Depth=2
|
||||
# xmm6 = 1/sum; xmm3 = xmm6^3 * (double at 32(%rsp)); xmm7 = xmm3 - 0.5;
# factor (xmm3) = xmm11 * xmm6 * xmm3 * xmm7.
vmovsd .LCPI1_1(%rip), %xmm3 # xmm3 = mem[0],zero
|
||||
vdivsd %xmm6, %xmm3, %xmm6
|
||||
vmulsd 32(%rsp), %xmm6, %xmm3 # 8-byte Folded Reload
|
||||
vmulsd %xmm6, %xmm6, %xmm8
|
||||
vmulsd %xmm3, %xmm8, %xmm3
|
||||
vaddsd .LCPI1_2(%rip), %xmm3, %xmm7
|
||||
vmulsd %xmm6, %xmm11, %xmm6
|
||||
vmulsd %xmm3, %xmm6, %xmm3
|
||||
vmulsd %xmm7, %xmm3, %xmm3
|
||||
# xmm6, xmm2, xmm0 = factor * difference per component, added to the entry's
# sums (xmm14, xmm9, xmm13).
vmulsd %xmm2, %xmm3, %xmm6
|
||||
vaddsd %xmm6, %xmm14, %xmm14
|
||||
vmulsd %xmm5, %xmm3, %xmm2
|
||||
vaddsd %xmm2, %xmm9, %xmm9
|
||||
vmulsd %xmm0, %xmm3, %xmm0
|
||||
vaddsd %xmm0, %xmm13, %xmm13
|
||||
# Only indices below the count (j < r13d) also receive the opposite update.
cmpl %r13d, %r8d
|
||||
jge .LBB1_13
|
||||
# %bb.12: #
|
||||
# in Loop: Header=BB1_10 Depth=2
|
||||
# Subtract the same three contributions from output elements 3*j, 3*j+1, 3*j+2.
leaq 1(%rbp), %rbx
|
||||
addq $2, %rbp
|
||||
vmovsd (%rdi,%r14,8), %xmm3 # xmm3 = mem[0],zero
|
||||
vsubsd %xmm6, %xmm3, %xmm3
|
||||
vmovsd %xmm3, (%rdi,%r14,8)
|
||||
vmovsd (%rdi,%rbx,8), %xmm3 # xmm3 = mem[0],zero
|
||||
vsubsd %xmm2, %xmm3, %xmm2
|
||||
vmovsd %xmm2, (%rdi,%rbx,8)
|
||||
vmovsd (%rdi,%rbp,8), %xmm2 # xmm2 = mem[0],zero
|
||||
vsubsd %xmm0, %xmm2, %xmm0
|
||||
vmovsd %xmm0, (%rdi,%rbp,8)
|
||||
jmp .LBB1_13
|
||||
.LBB1_7: #
|
||||
# Write the two counters back through the saved fourth argument.
movq 16(%rsp), %rax # 8-byte Reload
|
||||
vmovdqu %xmm10, (%rax)
|
||||
.LBB1_8: #
|
||||
# Close the marker region, then return the elapsed time: end timestamp minus
# the start value at 24(%rsp).
movl $.L.str.1, %edi
|
||||
callq likwid_markerStopRegion
|
||||
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
vsubsd 24(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload
|
||||
addq $40, %rsp
|
||||
.cfi_def_cfa_offset 56
|
||||
popq %rbx
|
||||
.cfi_def_cfa_offset 48
|
||||
popq %r12
|
||||
.cfi_def_cfa_offset 40
|
||||
popq %r13
|
||||
.cfi_def_cfa_offset 32
|
||||
popq %r14
|
||||
.cfi_def_cfa_offset 24
|
||||
popq %r15
|
||||
.cfi_def_cfa_offset 16
|
||||
popq %rbp
|
||||
.cfi_def_cfa_offset 8
|
||||
retq
|
||||
.Lfunc_end1:
|
||||
.size computeForceLJHalfNeigh, .Lfunc_end1-computeForceLJHalfNeigh
|
||||
.cfi_endproc
|
||||
# -- End function
|
||||
.globl computeForceLJFullNeigh_simd # -- Begin function computeForceLJFullNeigh_simd
|
||||
# computeForceLJFullNeigh_simd -- compiler-generated code, AT&T syntax.
# This build contains no SIMD kernel: the routine only reports an error and exits.
#   In:  %rsi = pointer to a structure with a 32-bit count at offset 4 and a
#        pointer at offset 64 (NOTE(review): presumably the atom struct -- confirm
#        against force_lj.c).
#   Effect: if count > 0, zero-fills count*24 bytes at the pointer from 64(%rsi);
#           calls getTimeStamp; opens the marker region named by .L.str; writes
#           the 65-byte message .L.str.2 to stderr; calls exit(-1).
#   No likwid_markerStopRegion call and no ret instruction follow: the function
#   ends in the call to exit.
.p2align 4, 0x90
|
||||
.type computeForceLJFullNeigh_simd,@function
|
||||
computeForceLJFullNeigh_simd: #
|
||||
.LcomputeForceLJFullNeigh_simd$local:
|
||||
.cfi_startproc
|
||||
# %bb.0: #
|
||||
# The push only moves %rsp down by 8 so the stack is 16-byte aligned at the
# calls below; the pushed value is never popped.
pushq %rax
|
||||
.cfi_def_cfa_offset 16
|
||||
# eax = count (the 32-bit write also clears the upper half of %rax).
movl 4(%rsi), %eax
|
||||
testl %eax, %eax
|
||||
# count <= 0: nothing to clear.
jle .LBB2_2
|
||||
# %bb.1: #
|
||||
# _intel_fast_memset(dst = 64(%rsi), value = 0, bytes = count * 8 * 3).
# NOTE(review): 24 bytes per entry looks like three doubles per atom -- confirm.
movq 64(%rsi), %rdi
|
||||
shlq $3, %rax
|
||||
leaq (%rax,%rax,2), %rdx
|
||||
xorl %esi, %esi
|
||||
callq _intel_fast_memset
|
||||
.LBB2_2: #
|
||||
# %al = 0 before the call: no vector-register arguments are passed.
# The timestamp returned by getTimeStamp is not used afterwards.
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
# Open the marker region named by .L.str.
movl $.L.str, %edi
|
||||
callq likwid_markerStartRegion
|
||||
# fwrite(.L.str.2, 65, 1, stderr): size 65 is the message without its NUL.
movq stderr(%rip), %rcx
|
||||
movl $.L.str.2, %edi
|
||||
movl $65, %esi
|
||||
movl $1, %edx
|
||||
callq fwrite
|
||||
# exit(-1).
movl $-1, %edi
|
||||
callq exit
|
||||
.Lfunc_end2:
|
||||
.size computeForceLJFullNeigh_simd, .Lfunc_end2-computeForceLJFullNeigh_simd
|
||||
.cfi_endproc
|
||||
# -- End function
|
||||
.type .L.str,@object #
|
||||
.section .rodata.str1.1,"aMS",@progbits,1
|
||||
.L.str:
|
||||
.asciz "force"
|
||||
.size .L.str, 6
|
||||
|
||||
.type .L.str.1,@object #
|
||||
.L.str.1:
|
||||
.asciz "forceLJ-halfneigh"
|
||||
.size .L.str.1, 18
|
||||
|
||||
.type .L.str.2,@object #
|
||||
.L.str.2:
|
||||
.asciz "Error: SIMD kernel not implemented for specified instruction set!"
|
||||
.size .L.str.2, 66
|
||||
|
||||
.ident "Intel(R) oneAPI DPC++ Compiler 2021.1-beta05 (2020.2.0.0304)"
|
||||
.section ".note.GNU-stack","",@progbits
|
||||
@@ -15,7 +15,7 @@ ISA="${BIN_INFO##*-}"
|
||||
CORE="${CORE:-0}"
|
||||
FREQ="${FREQ:-2.4}"
|
||||
NRUNS="${NRUNS:-3}"
|
||||
LOG="${LOG:-latencies_and_cfds.log}"
|
||||
LOG="${LOG:-latencies_and_cfds.$(hostname).log}"
|
||||
STUB_ONLY="${STUB_ONLY:-false}"
|
||||
SKIP_SET_FREQ="${SKIP_SET_FREQ:-false}"
|
||||
|
||||
@@ -37,10 +37,14 @@ CPU_VENDOR=$(lscpu | grep "Vendor ID" | tr -s ' ' | cut -d ' ' -f3)
|
||||
|
||||
if [ "$CPU_VENDOR" == "GenuineIntel" ]; then
|
||||
ALL_PREFETCHERS="HW_PREFETCHER,CL_PREFETCHER,DCU_PREFETCHER,IP_PREFETCHER"
|
||||
PREFETCHERS=("ALL HW_PREFETCHER CL_PREFETCHER DCU_PREFETCHER IP_PREFETCHER NONE")
|
||||
DEFAULT_PREFETCHERS=("ALL HW_PREFETCHER CL_PREFETCHER DCU_PREFETCHER IP_PREFETCHER NONE")
|
||||
else
|
||||
ALL_PREFETCHERS=""
|
||||
PREFETCHERS=("IGNORE")
|
||||
DEFAULT_PREFETCHERS=("IGNORE")
|
||||
fi
|
||||
|
||||
if [ -z ${PREFETCHERS+x} ]; then
|
||||
PREFETCHERS=${DEFAULT_PREFETCHERS}
|
||||
fi
|
||||
|
||||
if [ "$OPT_SCHEME" == "gromacs" ]; then
|
||||
|
||||
52
util/gather-bench/.gitignore
vendored
Normal file
52
util/gather-bench/.gitignore
vendored
Normal file
@@ -0,0 +1,52 @@
|
||||
# Prerequisites
|
||||
*.d
|
||||
|
||||
# Object files
|
||||
*.o
|
||||
*.ko
|
||||
*.obj
|
||||
*.elf
|
||||
|
||||
# Linker output
|
||||
*.ilk
|
||||
*.map
|
||||
*.exp
|
||||
|
||||
# Precompiled Headers
|
||||
*.gch
|
||||
*.pch
|
||||
|
||||
# Libraries
|
||||
*.lib
|
||||
*.a
|
||||
*.la
|
||||
*.lo
|
||||
|
||||
# Shared objects (inc. Windows DLLs)
|
||||
*.dll
|
||||
*.so
|
||||
*.so.*
|
||||
*.dylib
|
||||
|
||||
# Executables
|
||||
*.exe
|
||||
*.out
|
||||
*.app
|
||||
*.i*86
|
||||
*.x86_64
|
||||
*.hex
|
||||
|
||||
# Debug files
|
||||
*.dSYM/
|
||||
*.su
|
||||
*.idb
|
||||
*.pdb
|
||||
|
||||
# Kernel Module Compile Results
|
||||
*.mod*
|
||||
*.cmd
|
||||
.tmp_versions/
|
||||
modules.order
|
||||
Module.symvers
|
||||
Mkfile.old
|
||||
dkms.conf
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user