added asm files and analysis output

This commit is contained in:
JanLJL 2023-02-13 14:15:08 +01:00
parent d0277765c3
commit 95d63334fa
36 changed files with 13596 additions and 5383 deletions

View File

@ -0,0 +1,198 @@
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
Analyzed File - gromacs-icc-avx512-dp.o
Binary Format - 64Bit
Architecture - SKX
Analysis Type - Throughput
Throughput Analysis Report
--------------------------
Block Throughput: 62.00 Cycles Throughput Bottleneck: Backend
Loop Count: 22
Port Binding In Cycles Per Iteration:
--------------------------------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
--------------------------------------------------------------------------------------------------
| Cycles | 58.0 0.0 | 16.0 | 16.0 15.0 | 16.0 15.0 | 2.0 | 58.0 | 16.0 | 0.0 |
--------------------------------------------------------------------------------------------------
DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3)
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion occurred
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
X - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
-----------------------------------------------------------------------------------------
| 1 | | | 1.0 1.0 | | | | | | mov edx, dword ptr [r10+rsi*4]
| 1 | | | | | | | 1.0 | | inc rsi
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm20, zmmword ptr [rsp+0x380]
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm25, zmmword ptr [rsp+0x340]
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm24, zmmword ptr [rsp+0x1c0]
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm23, zmmword ptr [rsp+0x2c0]
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm16, zmmword ptr [rsp+0x3c0]
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm14, zmmword ptr [rsp+0x300]
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm15, zmmword ptr [rsp+0x240]
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm12, zmmword ptr [rsp+0x180]
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm21, zmmword ptr [rsp+0x200]
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm18, zmmword ptr [rsp+0x140]
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm22, zmmword ptr [rsp+0x100]
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm17, zmmword ptr [rsp+0x280]
| 1 | | 1.0 | | | | | | | lea r12d, ptr [rdx+rdx*2]
| 1 | | | | | | | 1.0 | | shl r12d, 0x3
| 1 | | 1.0 | | | | | | | lea r13d, ptr [rdx+rdx*1]
| 1 | | | | | | | 1.0 | | movsxd r12, r12d
| 1 | | 1.0 | | | | | | | cmp r13d, r11d
| 1 | | 1.0 | | | | | | | lea eax, ptr [rdx+rdx*1+0x1]
| 1 | | | | | | | 1.0 | | mov edx, 0x0
| 1 | | | | | | | 1.0 | | setz dl
| 1 | | 1.0 | | | | | | | cmp eax, r11d
| 1 | | | | | | | 1.0 | | mov eax, 0x0
| 1* | | | | | | | | | mov r13d, edx
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm29, zmm20, zmmword ptr [r8+r12*8+0x80]
| 1 | | | | | | | 1.0 | | setz al
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm26, zmm25, zmmword ptr [r8+r12*8+0x80]
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm25, zmm24, zmmword ptr [r8+r12*8]
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm24, zmm23, zmmword ptr [r8+r12*8+0x40]
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm23, zmm16, zmmword ptr [r8+r12*8+0x80]
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm20, zmm14, zmmword ptr [r8+r12*8+0x80]
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm27, zmm12, zmmword ptr [r8+r12*8+0x40]
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm28, zmm15, zmmword ptr [r8+r12*8]
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm30, zmm21, zmmword ptr [r8+r12*8+0x40]
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm21, zmm18, zmmword ptr [r8+r12*8+0x40]
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm31, zmm22, zmmword ptr [r8+r12*8]
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm22, zmm17, zmmword ptr [r8+r12*8]
| 1 | 1.0 | | | | | | | | vmulpd zmm13, zmm29, zmm29
| 1 | | | | | | 1.0 | | | vmulpd zmm15, zmm26, zmm26
| 1 | 1.0 | | | | | | | | vmulpd zmm12, zmm23, zmm23
| 1 | | | | | | 1.0 | | | vmulpd zmm14, zmm20, zmm20
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm16, zmmword ptr [rsp+0xc0]
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm13, zmm30, zmm30
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm15, zmm27, zmm27
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12, zmm24, zmm24
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm14, zmm21, zmm21
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm13, zmm31, zmm31
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm15, zmm28, zmm28
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12, zmm25, zmm25
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm14, zmm22, zmm22
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm19, zmm13
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm18, zmm15
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm17, zmm12
| 1 | | | | | | 1.0 | | | vcmppd k1, zmm13, zmm16, 0x11
| 1 | | | | | | 1.0 | | | vcmppd k6, zmm15, zmm16, 0x11
| 1 | | | | | | 1.0 | | | vcmppd k7, zmm12, zmm16, 0x11
| 1 | | | | | | 1.0 | | | vcmppd k0, zmm14, zmm16, 0x11
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm15, zmm14
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm16, zmmword ptr [rsp+0x40]
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm12, zmmword ptr [rsp+0x80]
| 1 | 1.0 | | | | | | | | vmulpd zmm13, zmm19, zmm16
| 1 | | | | | | 1.0 | | | vmulpd zmm13, zmm19, zmm13
| 1 | | 1.0 | | | | | | | neg r13d
| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm19, zmm13
| 1* | | | | | | | | | mov r12d, eax
| 1 | | | | | | 1.0 | | | vfmsub213pd zmm13, zmm19, zmm1
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm19, zmm12
| 1 | | | | | | 1.0 | | | vmulpd zmm13, zmm13, zmm19
| 1 | | 1.0 | | | | | | | add r13d, 0xff
| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm14, zmm13
| 1 | | | | | | | 1.0 | | nop
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm13, zmmword ptr [rsp+0x400]
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm10, zmm14
| 1 | | | | | | | 1.0 | | shl r12d, 0x4
| 1 | | 1.0 | | | | | | | sub r13d, r12d
| 1 | | | | | | 1.0 | | | kmovb k5, r13d
| 1 | 1.0 | | | | | | | | kmovw r13d, k1
| 1 | 1.0 | | | | | | | | kmovb r12d, k5
| 1 | | | | | | 1.0 | | | kmovb k5, r12d
| 1 | | | | | | 1.0 | | | kmovb k1, r13d
| 1* | | | | | | | | | mov r13d, eax
| 1 | 1.0 | | | | | | | | kandb k5, k5, k1
| 1 | 1.0 | | | | | | | | kmovb r12d, k5
| 1 | | | | | | 1.0 | | | kmovw k5, r12d
| 1 | | 1.0 | | | | | | | lea r12d, ptr [rdx+rdx*1]
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm9{k5}, zmm19, zmm29
| 1 | | | | | | | 1.0 | | neg r12d
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm13{k5}, zmm19, zmm31
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm31, zmmword ptr [rsp+0x440]
| 1 | 1.0 | | | | | | | | vmulpd zmm29, zmm18, zmm16
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm31{k5}, zmm19, zmm30
| 2^ | | | 1.0 | | 1.0 | | | | vmovups zmmword ptr [rsp+0x400], zmm13
| 1 | 1.0 | | | | | | | | vmulpd zmm30, zmm18, zmm29
| 2^ | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [rsp+0x440], zmm31
| 1 | | | | | | 1.0 | | | vmulpd zmm13, zmm18, zmm30
| 1 | 1.0 | | | | | | | | vfmsub213pd zmm30, zmm18, zmm1
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm18, zmm12
| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm30, zmm18
| 1 | | 1.0 | | | | | | | add r12d, 0xff
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm13, zmm14
| 1 | 1.0 | | | | | | | | vmulpd zmm29, zmm10, zmm19
| 1 | | | | | | | 1.0 | | shl r13d, 0x5
| 1 | | 1.0 | | | | | | | sub r12d, r13d
| 1 | | | | | | 1.0 | | | kmovb k1, r12d
| 1 | 1.0 | | | | | | | | kmovw r12d, k6
| 1 | 1.0 | | | | | | | | kmovb r13d, k1
| 1 | | | | | | 1.0 | | | kmovb k1, r13d
| 1 | | | | | | 1.0 | | | kmovb k6, r12d
| 1* | | | | | | | | | mov r12d, eax
| 1 | 1.0 | | | | | | | | kandb k1, k1, k6
| 1 | 1.0 | | | | | | | | kmovb r13d, k1
| 1 | | | | | | 1.0 | | | kmovw k1, r13d
| 1 | | 1.0 | | | | | | | lea r13d, ptr [rdx*4]
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm6{k1}, zmm29, zmm26
| 1 | | | | | | | 1.0 | | neg r13d
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm7{k1}, zmm29, zmm27
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm8{k1}, zmm29, zmm28
| 1 | 1.0 | | | | | | | | vmulpd zmm26, zmm17, zmm16
| 1 | | | | | | 1.0 | | | vmulpd zmm28, zmm17, zmm12
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm15, zmm16
| 1 | | | | | | 1.0 | | | vmulpd zmm12, zmm15, zmm12
| 1 | 1.0 | | | | | | | | vmulpd zmm27, zmm17, zmm26
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm15, zmm16
| 1 | 1.0 | | | | | | | | vmulpd zmm13, zmm17, zmm27
| 1 | | | | | | 1.0 | | | vfmsub213pd zmm27, zmm17, zmm1
| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm27, zmm28
| 1 | | | | | | | 1.0 | | add r13d, 0xff
| 1 | | | | | | 1.0 | | | vmulpd zmm17, zmm13, zmm14
| 1 | | | | | | | 1.0 | | shl edx, 0x3
| 1 | | | | | | | 1.0 | | shl r12d, 0x6
| 1 | | 1.0 | | | | | | | neg edx
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm10, zmm17
| 1 | | 1.0 | | | | | | | sub r13d, r12d
| 1 | | | | | | 1.0 | | | kmovb k6, r13d
| 1 | | 1.0 | | | | | | | add edx, 0xff
| 1 | | | | | | | 1.0 | | shl eax, 0x7
| 1 | | 1.0 | | | | | | | sub edx, eax
| 1 | 1.0 | | | | | | | | kmovb eax, k6
| 1 | | | | | | 1.0 | | | kmovb k6, eax
| 1 | 1.0 | | | | | | | | kmovw eax, k7
| 1 | | | | | | 1.0 | | | kmovb k7, eax
| 1 | 1.0 | | | | | | | | kandb k7, k6, k7
| 1 | | | | | | 1.0 | | | kmovb k6, edx
| 1 | 1.0 | | | | | | | | kmovb edx, k7
| 1 | | | | | | 1.0 | | | kmovw k7, edx
| 1 | 1.0 | | | | | | | | kmovw edx, k0
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm11{k7}, zmm18, zmm23
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm4{k7}, zmm18, zmm24
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm5{k7}, zmm18, zmm25
| 1 | 1.0 | | | | | | | | vmulpd zmm23, zmm15, zmm19
| 1 | | | | | | 1.0 | | | vfmsub213pd zmm19, zmm15, zmm1
| 1 | 1.0 | | | | | | | | vmulpd zmm15, zmm19, zmm12
| 1 | | | | | | 1.0 | | | vmulpd zmm24, zmm23, zmm15
| 1 | 1.0 | | | | | | | | vmulpd zmm25, zmm10, zmm24
| 1 | 1.0 | | | | | | | | kmovb eax, k6
| 1 | | | | | | 1.0 | | | kmovb k6, eax
| 1 | | | | | | 1.0 | | | kmovb k0, edx
| 1 | 1.0 | | | | | | | | kandb k0, k6, k0
| 1 | 1.0 | | | | | | | | kmovb r12d, k0
| 1 | | | | | | 1.0 | | | kmovw k6, r12d
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm3{k6}, zmm25, zmm22
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm2{k6}, zmm25, zmm21
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm0{k6}, zmm25, zmm20
| 1* | | | | | | | | | cmp rsi, rdi
| 0*F | | | | | | | | | jl 0xfffffffffffffc6f
Total Num Of Uops: 187
Analysis Notes:
Backend allocation was stalled due to unavailable allocation resources.

View File

@ -0,0 +1,152 @@
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
Analyzed File - gromacs-icc-avx512-sp.o
Binary Format - 64Bit
Architecture - SKX
Analysis Type - Throughput
Throughput Analysis Report
--------------------------
Block Throughput: 51.00 Cycles Throughput Bottleneck: Backend
Loop Count: 22
Port Binding In Cycles Per Iteration:
--------------------------------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
--------------------------------------------------------------------------------------------------
| Cycles | 47.5 0.0 | 9.0 | 11.0 11.0 | 11.0 8.0 | 3.0 | 47.5 | 9.0 | 0.0 |
--------------------------------------------------------------------------------------------------
DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3)
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion occurred
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
X - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
-----------------------------------------------------------------------------------------
| 1 | | | 1.0 1.0 | | | | | | mov edi, dword ptr [rcx+rax*4]
| 1* | | | | | | | | | mov r12d, r13d
| 1 | | | | | | | 1.0 | | movsxd rdi, edi
| 1 | | 1.0 | | | | | | | inc rax
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm10, zmmword ptr [rsp+0x140]
| 1 | | | | | | | 1.0 | | test edi, 0x7fffffff
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm11, zmmword ptr [rsp+0x100]
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm9, zmmword ptr [rsp+0xc0]
| 1 | | | | | | | 1.0 | | setz r12b
| 1 | | 1.0 | | | | | | | lea r14, ptr [rdi+rdi*2]
| 1 | | | | | | | 1.0 | | shl r14, 0x5
| 1* | | | | | | | | | mov r8d, r12d
| 1 | | 1.0 | | | | | | | neg r8d
| 1* | | | | | | | | | mov r11d, r12d
| 1 | | 1.0 | | | | | | | add r8d, 0xff
| 1 | | | | | | 1.0 | | | kmovw k0, r8d
| 1 | | 1.0 | | | | | | | lea r9d, ptr [r12+r12*2]
| 2 | 1.0 | | 1.0 1.0 | | | | | | vsubps zmm3, zmm13, zmmword ptr [r14+rbx*1+0x40]
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm4, zmm10, zmmword ptr [r14+rbx*1]
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm10, zmm11, zmmword ptr [r14+rbx*1+0x40]
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm5, zmm17, zmmword ptr [r14+rbx*1+0x20]
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm27, zmm19, zmmword ptr [r14+rbx*1]
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm8, zmm15, zmmword ptr [r14+rbx*1+0x20]
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm11, zmm0, zmmword ptr [r14+rbx*1+0x40]
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm7, zmm9, zmmword ptr [r14+rbx*1]
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm9, zmm14, zmmword ptr [r14+rbx*1+0x20]
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm29, zmm12, zmmword ptr [r14+rbx*1+0x40]
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm28, zmm16, zmmword ptr [r14+rbx*1+0x20]
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm25, zmm18, zmmword ptr [r14+rbx*1]
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm2, zmm3, zmm3
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm10, zmm10
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm11, zmm11
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm26, zmm29, zmm29
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm2, zmm5, zmm5
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30, zmm8, zmm8
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm1, zmm9, zmm9
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm26, zmm28, zmm28
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm2, zmm27, zmm27
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30, zmm4, zmm4
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm1, zmm7, zmm7
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm26, zmm25, zmm25
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm31, zmm2
| 1 | | | | | | 1.0 | | | vcmpps k7, zmm30, zmm24, 0x11
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm6, zmm30
| 1 | | | | | | 1.0 | | | vcmpps k3, zmm2, zmm24, 0x11
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm2, zmm1
| 1 | | | | | | 1.0 | | | vcmpps k5, zmm26, zmm24, 0x11
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm26, zmm26
| 1 | | | | | | 1.0 | | | vmulps zmm30, zmm31, zmm23
| 1 | 1.0 | | | | | | | | kandw k2, k0, k3
| 1 | | | | | | 1.0 | | | vcmpps k3, zmm1, zmm24, 0x11
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm31, zmm30
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm31, zmm1
| 1 | | | | | | | 1.0 | | neg r9d
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm1, zmm31, zmm20
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm31, zmm22
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm1, zmm31
| 1 | | | | | | | 1.0 | | add r9d, 0xff
| 1 | | | | | | 1.0 | | | kmovw k4, r9d
| 1 | 1.0 | | | | | | | | vmulps zmm30, zmm30, zmm31
| 1 | 1.0 | | | | | | | | kandw k1, k4, k5
| 1 | | | | | | 1.0 | | | vmulps zmm1, zmm21, zmm30
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm26, zmm23
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm26, zmm30
| 1 | | 1.0 | | | | | | | lea r10d, ptr [r12*8]
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm26, zmm31
| 1 | | | | | | | 1.0 | | neg r10d
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm31, zmm26, zmm20
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm26, zmm26, zmm22
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm31, zmm26
| 1 | | 1.0 | | | | | | | add r10d, r12d
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm30, zmm31
| 1 | | | | | | | 1.0 | | add r10d, 0xff
| 1 | | | | | | 1.0 | | | kmovw k6, r10d
| 1 | 1.0 | | | | | | | | vmulps zmm26, zmm21, zmm30
| 1 | 1.0 | | | | | | | | kandw k4, k6, k7
| 1 | | | | | | 1.0 | | | vmulps zmm25{k1}{z}, zmm25, zmm26
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31{k1}{z}, zmm28, zmm26
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm28, zmm6, zmm23
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30{k1}{z}, zmm29, zmm26
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm29, zmm2, zmm23
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm25{k2}, zmm27, zmm1
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm31{k2}, zmm5, zmm1
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm27, zmm6, zmm28
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30{k2}, zmm3, zmm1
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm2, zmm29
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm5, zmm6, zmm27
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm27, zmm6, zmm20
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm6, zmm6, zmm22
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm3, zmm2, zmm1
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm1, zmm2, zmm20
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm2, zmm2, zmm22
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm26, zmm27, zmm6
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm1, zmm2
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm5, zmm5, zmm26
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm3, zmm3, zmm1
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm6, zmm21, zmm5
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm27, zmm21, zmm3
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm25{k4}, zmm4, zmm6
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm4, zmmword ptr [r14+rsi*1]
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm31{k4}, zmm8, zmm6
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30{k4}, zmm10, zmm6
| 1 | | | | | | | 1.0 | | shl r11d, 0x4
| 1 | | 1.0 | | | | | | | sub r12d, r11d
| 1 | | 1.0 | | | | | | | add r12d, 0xff
| 1 | | | | | | 1.0 | | | kmovw k0, r12d
| 1 | 1.0 | | | | | | | | kandw k5, k0, k3
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm25{k5}, zmm7, zmm27
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm31{k5}, zmm9, zmm27
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30{k5}, zmm11, zmm27
| 1 | 0.5 | | | | | 0.5 | | | vsubps zmm7, zmm4, zmm25
| 2 | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [r14+rsi*1], zmm7
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm8, zmmword ptr [r14+rsi*1+0x20]
| 1 | 0.5 | | | | | 0.5 | | | vsubps zmm4, zmm8, zmm31
| 2 | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [r14+rsi*1+0x20], zmm4
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm1, zmmword ptr [r14+rsi*1+0x40]
| 1 | 0.5 | | | | | 0.5 | | | vsubps zmm2, zmm1, zmm30
| 2 | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [r14+rsi*1+0x40], zmm2
| 1* | | | | | | | | | cmp rax, rdx
| 0*F | | | | | | | | | jb 0xfffffffffffffd30
Total Num Of Uops: 142
Analysis Notes:
Backend allocation was stalled due to unavailable allocation resources.

View File

@ -0,0 +1,154 @@
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
Analyzed File - gromacs-icx-avx512-dp.o
Binary Format - 64Bit
Architecture - SKX
Analysis Type - Throughput
Throughput Analysis Report
--------------------------
Block Throughput: 49.26 Cycles Throughput Bottleneck: Backend
Loop Count: 22
Port Binding In Cycles Per Iteration:
--------------------------------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
--------------------------------------------------------------------------------------------------
| Cycles | 44.0 0.0 | 13.5 | 5.5 5.5 | 5.5 5.5 | 0.0 | 44.0 | 13.5 | 0.0 |
--------------------------------------------------------------------------------------------------
DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3)
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion occurred
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
X - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
-----------------------------------------------------------------------------------------
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | movsxd rcx, dword ptr [r10+rbx*4]
| 1 | | 1.0 | | | | | | | lea rdx, ptr [rcx+rcx*2]
| 1 | | | | | | | 1.0 | | shl rdx, 0x6
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm28, zmmword ptr [rsi+rdx*1]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm29, zmmword ptr [rsi+rdx*1+0x40]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm30, zmmword ptr [rsi+rdx*1+0x80]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm3, zmmword ptr [rsp+0x10]
| 1 | | | | | | 1.0 | | | vsubpd zmm3, zmm3, zmm28
| 1 | 1.0 | | | | | | | | vsubpd zmm31, zmm24, zmm30
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm16, zmmword ptr [rsp+0x150]
| 1 | | | | | | 1.0 | | | vsubpd zmm16, zmm16, zmm29
| 1 | 1.0 | | | | | | | | vmulpd zmm17, zmm31, zmm31
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm17, zmm16, zmm16
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm17, zmm3, zmm3
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm18, zmm17
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm21, zmm18
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm18, zmm19
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm18, zmm19
| 1 | | | | | | 1.0 | | | vaddpd zmm20, zmm19, zmm1
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm22, zmm18
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm18, zmm20
| 1 | 1.0 | | | | | | | | vsubpd zmm20, zmm25, zmm30
| 1 | | 1.0 | | | | | | | lea edx, ptr [rcx+rcx*1]
| 1 | | | | | | | 1.0 | | cmp r11, rdx
| 1 | | | | | | | 1.0 | | setnz dl
| 1 | | | | | | | 1.0 | | setz al
| 1 | | 1.0 | | | | | | | add ecx, ecx
| 1 | | 1.0 | | | | | | | inc ecx
| 1 | | 0.5 | | | | | 0.5 | | cmp r11, rcx
| 1 | | | | | | | 1.0 | | setz cl
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm19, zmm18
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm19, zmmword ptr [rsp+0x210]
| 1 | 1.0 | | | | | | | | vsubpd zmm19, zmm19, zmm28
| 1 | | | | | | | 1.0 | | setnz dil
| 1* | | | | | | | | | mov ebp, edi
| 1 | | | | | | | 1.0 | | shl bpl, 0x4
| 1 | | 1.0 | | | | | | | sub bpl, al
| 1 | | 1.0 | | | | | | | add bpl, 0xef
| 1 | | | | | | 1.0 | | | kmovd k1, ebp
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm17, zmm0, 0x1
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm17, zmmword ptr [rsp+0x110]
| 1 | 1.0 | | | | | | | | vsubpd zmm17, zmm17, zmm29
| 1 | | 1.0 | | | | | | | lea eax, ptr [rdx+rdx*1]
| 1* | | | | | | | | | mov ebp, edi
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm18, zmm2
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm14{k1}, zmm3, zmm18
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm20, zmm20
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm3, zmm17, zmm17
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm3, zmm19, zmm19
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm11{k1}, zmm16, zmm18
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm16, zmm3
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm7{k1}, zmm31, zmm18
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm21, zmm16
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm16, zmm18
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm16, zmm18
| 1 | | | | | | 1.0 | | | vaddpd zmm31, zmm18, zmm1
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm22, zmm16
| 1 | | | | | | 1.0 | | | vmulpd zmm16, zmm16, zmm31
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm31, zmmword ptr [rsp+0x1d0]
| 1 | 1.0 | | | | | | | | vsubpd zmm31, zmm31, zmm28
| 1 | | | | | | | 1.0 | | shl bpl, 0x5
| 1 | | 1.0 | | | | | | | or bpl, al
| 1 | | 0.5 | | | | | 0.5 | | or bpl, 0xdd
| 1 | | | | | | 1.0 | | | kmovd k1, ebp
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm3, zmm0, 0x1
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm3, zmmword ptr [rsp+0xd0]
| 1 | 1.0 | | | | | | | | vsubpd zmm3, zmm3, zmm29
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm18, zmm16
| 1 | | | | | | 1.0 | | | vsubpd zmm18, zmm26, zmm30
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm16, zmm2
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm15{k1}, zmm19, zmm16
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm18, zmm18
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm19, zmm3, zmm3
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm19, zmm31, zmm31
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm10{k1}, zmm17, zmm16
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm17, zmm19
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm6{k1}, zmm20, zmm16
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm21, zmm17
| 1 | | | | | | 1.0 | | | vmulpd zmm16, zmm17, zmm16
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm17, zmm16
| 1 | | | | | | 1.0 | | | vaddpd zmm20, zmm16, zmm1
| 1 | 1.0 | | | | | | | | vmulpd zmm17, zmm22, zmm17
| 1 | | | | | | 1.0 | | | vmulpd zmm17, zmm17, zmm20
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm16, zmm17
| 1 | | 1.0 | | | | | | | lea eax, ptr [rdx*4]
| 1 | | | | | | | 1.0 | | shl dil, 0x6
| 1 | | 0.5 | | | | | 0.5 | | or dil, al
| 1 | | 0.5 | | | | | 0.5 | | or dil, 0xbb
| 1 | | | | | | 1.0 | | | kmovd k1, edi
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm19, zmm0, 0x1
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm17, zmmword ptr [rsp+0x190]
| 1 | 1.0 | | | | | | | | vsubpd zmm17, zmm17, zmm28
| 1 | 1.0 | | | | | | | | vsubpd zmm19, zmm23, zmm29
| 1 | | | | | | 1.0 | | | vsubpd zmm20, zmm27, zmm30
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm16, zmm2
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm13{k1}, zmm31, zmm16
| 1 | 1.0 | | | | | | | | vmulpd zmm28, zmm20, zmm20
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm28, zmm19, zmm19
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm28, zmm17, zmm17
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm9{k1}, zmm3, zmm16
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm3, zmm28
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm5{k1}, zmm18, zmm16
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm21, zmm3
| 1 | | | | | | 1.0 | | | vmulpd zmm16, zmm3, zmm16
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm3, zmm16
| 1 | | | | | | 1.0 | | | vaddpd zmm18, zmm16, zmm1
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm22, zmm3
| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm3, zmm18
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm16, zmm3
| 1 | | | | | | | 1.0 | | shl dl, 0x3
| 1 | | | | | | | 1.0 | | shl cl, 0x7
| 1 | | 1.0 | | | | | | | or cl, dl
| 1 | | 1.0 | | | | | | | add cl, 0xf7
| 1 | | | | | | 1.0 | | | kmovd k1, ecx
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm28, zmm0, 0x1
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm3, zmm2
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12{k1}, zmm17, zmm3
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm8{k1}, zmm19, zmm3
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm4{k1}, zmm20, zmm3
| 1 | | 0.5 | | | | | 0.5 | | inc rbx
| 1* | | | | | | | | | cmp r9, rbx
| 0*F | | | | | | | | | jnz 0xfffffffffffffd5a
Total Num Of Uops: 129
Analysis Notes:
Backend allocation was stalled due to unavailable allocation resources.

View File

@ -0,0 +1,288 @@
[0] Code Region
Iterations: 100
Instructions: 12200
Total Cycles: 4745
Total uOps: 14000
Dispatch Width: 6
uOps Per Cycle: 2.95
IPC: 2.57
Block RThroughput: 34.0
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 5 0.50 * movslq (%r10,%rbx,4), %rcx
1 1 0.50 leaq (%rcx,%rcx,2), %rdx
1 1 0.50 shlq $6, %rdx
2 8 0.50 * vmovupd (%rsi,%rdx), %zmm28
2 8 0.50 * vmovupd 64(%rsi,%rdx), %zmm29
2 8 0.50 * vmovupd 128(%rsi,%rdx), %zmm30
2 8 0.50 * vmovupd 16(%rsp), %zmm3
1 4 0.50 vsubpd %zmm28, %zmm3, %zmm3
1 4 0.50 vsubpd %zmm30, %zmm24, %zmm31
2 8 0.50 * vmovupd 336(%rsp), %zmm16
1 4 0.50 vsubpd %zmm29, %zmm16, %zmm16
1 4 0.50 vmulpd %zmm31, %zmm31, %zmm17
1 4 0.50 vfmadd231pd %zmm16, %zmm16, %zmm17
1 4 0.50 vfmadd231pd %zmm3, %zmm3, %zmm17
3 4 2.00 vrcp14pd %zmm17, %zmm18
1 4 0.50 vmulpd %zmm18, %zmm21, %zmm19
1 4 0.50 vmulpd %zmm19, %zmm18, %zmm19
1 4 0.50 vmulpd %zmm19, %zmm18, %zmm19
1 4 0.50 vaddpd %zmm1, %zmm19, %zmm20
1 4 0.50 vmulpd %zmm18, %zmm22, %zmm18
1 4 0.50 vmulpd %zmm20, %zmm18, %zmm18
1 4 0.50 vsubpd %zmm30, %zmm25, %zmm20
1 1 0.50 leal (%rcx,%rcx), %edx
1 1 0.25 cmpq %rdx, %r11
1 1 0.50 setne %dl
1 1 0.50 sete %al
1 1 0.25 addl %ecx, %ecx
1 1 0.25 incl %ecx
1 1 0.25 cmpq %rcx, %r11
1 1 0.50 sete %cl
1 4 0.50 vmulpd %zmm18, %zmm19, %zmm18
2 8 0.50 * vmovupd 528(%rsp), %zmm19
1 4 0.50 vsubpd %zmm28, %zmm19, %zmm19
1 1 0.50 setne %dil
1 1 0.25 movl %edi, %ebp
1 1 0.50 shlb $4, %bpl
1 1 0.25 subb %al, %bpl
1 1 0.25 addb $-17, %bpl
1 1 1.00 kmovd %ebp, %k1
1 4 1.00 vcmpltpd %zmm0, %zmm17, %k1 {%k1}
2 8 0.50 * vmovupd 272(%rsp), %zmm17
1 4 0.50 vsubpd %zmm29, %zmm17, %zmm17
1 1 0.50 leal (%rdx,%rdx), %eax
1 1 0.25 movl %edi, %ebp
1 4 0.50 vmulpd %zmm2, %zmm18, %zmm18
1 4 0.50 vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1}
1 4 0.50 vmulpd %zmm20, %zmm20, %zmm3
1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm3
1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm3
1 4 0.50 vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1}
3 4 2.00 vrcp14pd %zmm3, %zmm16
1 4 0.50 vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1}
1 4 0.50 vmulpd %zmm16, %zmm21, %zmm18
1 4 0.50 vmulpd %zmm18, %zmm16, %zmm18
1 4 0.50 vmulpd %zmm18, %zmm16, %zmm18
1 4 0.50 vaddpd %zmm1, %zmm18, %zmm31
1 4 0.50 vmulpd %zmm16, %zmm22, %zmm16
1 4 0.50 vmulpd %zmm31, %zmm16, %zmm16
2 8 0.50 * vmovupd 464(%rsp), %zmm31
1 4 0.50 vsubpd %zmm28, %zmm31, %zmm31
1 1 0.50 shlb $5, %bpl
1 1 0.25 orb %al, %bpl
1 1 0.25 orb $-35, %bpl
1 1 1.00 kmovd %ebp, %k1
1 4 1.00 vcmpltpd %zmm0, %zmm3, %k1 {%k1}
2 8 0.50 * vmovupd 208(%rsp), %zmm3
1 4 0.50 vsubpd %zmm29, %zmm3, %zmm3
1 4 0.50 vmulpd %zmm16, %zmm18, %zmm16
1 4 0.50 vsubpd %zmm30, %zmm26, %zmm18
1 4 0.50 vmulpd %zmm2, %zmm16, %zmm16
1 4 0.50 vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1}
1 4 0.50 vmulpd %zmm18, %zmm18, %zmm19
1 4 0.50 vfmadd231pd %zmm3, %zmm3, %zmm19
1 4 0.50 vfmadd231pd %zmm31, %zmm31, %zmm19
1 4 0.50 vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1}
3 4 2.00 vrcp14pd %zmm19, %zmm17
1 4 0.50 vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1}
1 4 0.50 vmulpd %zmm17, %zmm21, %zmm16
1 4 0.50 vmulpd %zmm16, %zmm17, %zmm16
1 4 0.50 vmulpd %zmm16, %zmm17, %zmm16
1 4 0.50 vaddpd %zmm1, %zmm16, %zmm20
1 4 0.50 vmulpd %zmm17, %zmm22, %zmm17
1 4 0.50 vmulpd %zmm20, %zmm17, %zmm17
1 4 0.50 vmulpd %zmm17, %zmm16, %zmm16
1 1 0.50 leal (,%rdx,4), %eax
1 1 0.50 shlb $6, %dil
1 1 0.25 orb %al, %dil
1 1 0.25 orb $-69, %dil
1 1 1.00 kmovd %edi, %k1
1 4 1.00 vcmpltpd %zmm0, %zmm19, %k1 {%k1}
2 8 0.50 * vmovupd 400(%rsp), %zmm17
1 4 0.50 vsubpd %zmm28, %zmm17, %zmm17
1 4 0.50 vsubpd %zmm29, %zmm23, %zmm19
1 4 0.50 vsubpd %zmm30, %zmm27, %zmm20
1 4 0.50 vmulpd %zmm2, %zmm16, %zmm16
1 4 0.50 vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1}
1 4 0.50 vmulpd %zmm20, %zmm20, %zmm28
1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm28
1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm28
1 4 0.50 vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1}
3 4 2.00 vrcp14pd %zmm28, %zmm3
1 4 0.50 vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1}
1 4 0.50 vmulpd %zmm3, %zmm21, %zmm16
1 4 0.50 vmulpd %zmm16, %zmm3, %zmm16
1 4 0.50 vmulpd %zmm16, %zmm3, %zmm16
1 4 0.50 vaddpd %zmm1, %zmm16, %zmm18
1 4 0.50 vmulpd %zmm3, %zmm22, %zmm3
1 4 0.50 vmulpd %zmm18, %zmm3, %zmm3
1 4 0.50 vmulpd %zmm3, %zmm16, %zmm3
1 1 0.50 shlb $3, %dl
1 1 0.50 shlb $7, %cl
1 1 0.25 orb %dl, %cl
1 1 0.25 addb $-9, %cl
1 1 1.00 kmovd %ecx, %k1
1 4 1.00 vcmpltpd %zmm0, %zmm28, %k1 {%k1}
1 4 0.50 vmulpd %zmm2, %zmm3, %zmm3
1 4 0.50 vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1}
1 4 0.50 vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1}
1 4 0.50 vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1}
1 1 0.25 incq %rbx
1 1 0.25 cmpq %rbx, %r9
1 1 0.50 jne .LBB5_12
Resources:
[0] - SKXDivider
[1] - SKXFPDivider
[2] - SKXPort0
[3] - SKXPort1
[4] - SKXPort2
[5] - SKXPort3
[6] - SKXPort4
[7] - SKXPort5
[8] - SKXPort6
[9] - SKXPort7
Resource pressure per iteration:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
- - 45.53 20.45 5.50 5.50 - 44.64 18.38 -
Resource pressure by instruction:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
- - - - 0.50 0.50 - - - - movslq (%r10,%rbx,4), %rcx
- - - 0.99 - - - 0.01 - - leaq (%rcx,%rcx,2), %rdx
- - 0.01 - - - - - 0.99 - shlq $6, %rdx
- - 0.01 0.99 0.49 0.51 - - - - vmovupd (%rsi,%rdx), %zmm28
- - 0.01 0.91 0.51 0.49 - 0.08 - - vmovupd 64(%rsi,%rdx), %zmm29
- - 0.01 0.56 0.49 0.51 - 0.43 - - vmovupd 128(%rsi,%rdx), %zmm30
- - - 0.99 0.50 0.50 - 0.01 - - vmovupd 16(%rsp), %zmm3
- - 0.95 - - - - 0.05 - - vsubpd %zmm28, %zmm3, %zmm3
- - 0.48 - - - - 0.52 - - vsubpd %zmm30, %zmm24, %zmm31
- - - 1.00 0.50 0.50 - - - - vmovupd 336(%rsp), %zmm16
- - 0.49 - - - - 0.51 - - vsubpd %zmm29, %zmm16, %zmm16
- - 0.48 - - - - 0.52 - - vmulpd %zmm31, %zmm31, %zmm17
- - 0.49 - - - - 0.51 - - vfmadd231pd %zmm16, %zmm16, %zmm17
- - 0.04 - - - - 0.96 - - vfmadd231pd %zmm3, %zmm3, %zmm17
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm17, %zmm18
- - 1.00 - - - - - - - vmulpd %zmm18, %zmm21, %zmm19
- - 0.51 - - - - 0.49 - - vmulpd %zmm19, %zmm18, %zmm19
- - 0.49 - - - - 0.51 - - vmulpd %zmm19, %zmm18, %zmm19
- - 0.07 - - - - 0.93 - - vaddpd %zmm1, %zmm19, %zmm20
- - - - - - - 1.00 - - vmulpd %zmm18, %zmm22, %zmm18
- - 0.95 - - - - 0.05 - - vmulpd %zmm20, %zmm18, %zmm18
- - 0.92 - - - - 0.08 - - vsubpd %zmm30, %zmm25, %zmm20
- - - 0.94 - - - 0.06 - - leal (%rcx,%rcx), %edx
- - - - - - - - 1.00 - cmpq %rdx, %r11
- - - - - - - - 1.00 - setne %dl
- - 0.44 - - - - - 0.56 - sete %al
- - - 0.07 - - - 0.02 0.91 - addl %ecx, %ecx
- - - 0.53 - - - 0.46 0.01 - incl %ecx
- - - 0.51 - - - 0.46 0.03 - cmpq %rcx, %r11
- - 0.02 - - - - - 0.98 - sete %cl
- - 0.94 - - - - 0.06 - - vmulpd %zmm18, %zmm19, %zmm18
- - 0.01 0.99 0.51 0.49 - - - - vmovupd 528(%rsp), %zmm19
- - 0.47 - - - - 0.53 - - vsubpd %zmm28, %zmm19, %zmm19
- - 0.04 - - - - - 0.96 - setne %dil
- - - 0.95 - - - 0.02 0.03 - movl %edi, %ebp
- - 0.01 - - - - - 0.99 - shlb $4, %bpl
- - - 0.96 - - - - 0.04 - subb %al, %bpl
- - - 0.06 - - - - 0.94 - addb $-17, %bpl
- - - - - - - 1.00 - - kmovd %ebp, %k1
- - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm17, %k1 {%k1}
- - 0.02 0.97 0.50 0.50 - 0.01 - - vmovupd 272(%rsp), %zmm17
- - 0.96 - - - - 0.04 - - vsubpd %zmm29, %zmm17, %zmm17
- - - 1.00 - - - - - - leal (%rdx,%rdx), %eax
- - - 0.05 - - - - 0.95 - movl %edi, %ebp
- - 0.51 - - - - 0.49 - - vmulpd %zmm2, %zmm18, %zmm18
- - 0.53 - - - - 0.47 - - vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1}
- - 0.45 - - - - 0.55 - - vmulpd %zmm20, %zmm20, %zmm3
- - 0.94 - - - - 0.06 - - vfmadd231pd %zmm17, %zmm17, %zmm3
- - 0.03 - - - - 0.97 - - vfmadd231pd %zmm19, %zmm19, %zmm3
- - 0.47 - - - - 0.53 - - vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1}
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm3, %zmm16
- - 0.53 - - - - 0.47 - - vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1}
- - 0.99 - - - - 0.01 - - vmulpd %zmm16, %zmm21, %zmm18
- - 0.99 - - - - 0.01 - - vmulpd %zmm18, %zmm16, %zmm18
- - 0.97 - - - - 0.03 - - vmulpd %zmm18, %zmm16, %zmm18
- - 0.52 - - - - 0.48 - - vaddpd %zmm1, %zmm18, %zmm31
- - 0.01 - - - - 0.99 - - vmulpd %zmm16, %zmm22, %zmm16
- - 0.52 - - - - 0.48 - - vmulpd %zmm31, %zmm16, %zmm16
- - - 0.99 0.50 0.50 - 0.01 - - vmovupd 464(%rsp), %zmm31
- - 0.03 - - - - 0.97 - - vsubpd %zmm28, %zmm31, %zmm31
- - 0.01 - - - - - 0.99 - shlb $5, %bpl
- - - 0.94 - - - - 0.06 - orb %al, %bpl
- - - 0.04 - - - - 0.96 - orb $-35, %bpl
- - - - - - - 1.00 - - kmovd %ebp, %k1
- - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm3, %k1 {%k1}
- - - 0.99 0.50 0.50 - 0.01 - - vmovupd 208(%rsp), %zmm3
- - 0.95 - - - - 0.05 - - vsubpd %zmm29, %zmm3, %zmm3
- - 0.51 - - - - 0.49 - - vmulpd %zmm16, %zmm18, %zmm16
- - 0.01 - - - - 0.99 - - vsubpd %zmm30, %zmm26, %zmm18
- - 0.52 - - - - 0.48 - - vmulpd %zmm2, %zmm16, %zmm16
- - 1.00 - - - - - - - vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1}
- - 0.03 - - - - 0.97 - - vmulpd %zmm18, %zmm18, %zmm19
- - 0.06 - - - - 0.94 - - vfmadd231pd %zmm3, %zmm3, %zmm19
- - 0.50 - - - - 0.50 - - vfmadd231pd %zmm31, %zmm31, %zmm19
- - - - - - - 1.00 - - vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1}
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm19, %zmm17
- - 1.00 - - - - - - - vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1}
- - 0.07 - - - - 0.93 - - vmulpd %zmm17, %zmm21, %zmm16
- - 0.50 - - - - 0.50 - - vmulpd %zmm16, %zmm17, %zmm16
- - 0.09 - - - - 0.91 - - vmulpd %zmm16, %zmm17, %zmm16
- - 0.07 - - - - 0.93 - - vaddpd %zmm1, %zmm16, %zmm20
- - 0.93 - - - - 0.07 - - vmulpd %zmm17, %zmm22, %zmm17
- - 0.94 - - - - 0.06 - - vmulpd %zmm20, %zmm17, %zmm17
- - 0.51 - - - - 0.49 - - vmulpd %zmm17, %zmm16, %zmm16
- - - 1.00 - - - - - - leal (,%rdx,4), %eax
- - - - - - - - 1.00 - shlb $6, %dil
- - - 0.02 - - - - 0.98 - orb %al, %dil
- - - 0.48 - - - - 0.52 - orb $-69, %dil
- - - - - - - 1.00 - - kmovd %edi, %k1
- - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm19, %k1 {%k1}
- - - 1.00 0.50 0.50 - - - - vmovupd 400(%rsp), %zmm17
- - 0.49 - - - - 0.51 - - vsubpd %zmm28, %zmm17, %zmm17
- - 0.49 - - - - 0.51 - - vsubpd %zmm29, %zmm23, %zmm19
- - 0.02 - - - - 0.98 - - vsubpd %zmm30, %zmm27, %zmm20
- - 0.98 - - - - 0.02 - - vmulpd %zmm2, %zmm16, %zmm16
- - 0.50 - - - - 0.50 - - vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1}
- - 0.94 - - - - 0.06 - - vmulpd %zmm20, %zmm20, %zmm28
- - 0.04 - - - - 0.96 - - vfmadd231pd %zmm19, %zmm19, %zmm28
- - 0.07 - - - - 0.93 - - vfmadd231pd %zmm17, %zmm17, %zmm28
- - 0.50 - - - - 0.50 - - vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1}
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm28, %zmm3
- - 0.50 - - - - 0.50 - - vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1}
- - 1.00 - - - - - - - vmulpd %zmm3, %zmm21, %zmm16
- - 0.55 - - - - 0.45 - - vmulpd %zmm16, %zmm3, %zmm16
- - 0.99 - - - - 0.01 - - vmulpd %zmm16, %zmm3, %zmm16
- - 0.99 - - - - 0.01 - - vaddpd %zmm1, %zmm16, %zmm18
- - - - - - - 1.00 - - vmulpd %zmm3, %zmm22, %zmm3
- - 0.52 - - - - 0.48 - - vmulpd %zmm18, %zmm3, %zmm3
- - 0.99 - - - - 0.01 - - vmulpd %zmm3, %zmm16, %zmm3
- - - - - - - - 1.00 - shlb $3, %dl
- - - - - - - - 1.00 - shlb $7, %cl
- - - 1.00 - - - - - - orb %dl, %cl
- - - 0.52 - - - - 0.48 - addb $-9, %cl
- - - - - - - 1.00 - - kmovd %ecx, %k1
- - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm28, %k1 {%k1}
- - 0.98 - - - - 0.02 - - vmulpd %zmm2, %zmm3, %zmm3
- - 0.97 - - - - 0.03 - - vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1}
- - 0.03 - - - - 0.97 - - vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1}
- - 0.97 - - - - 0.03 - - vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1}
- - - 0.48 - - - - 0.52 - incq %rbx
- - - 0.52 - - - - 0.48 - cmpq %rbx, %r9
- - - - - - - - 1.00 - jne .LBB5_12

View File

@ -0,0 +1,167 @@
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
Analyzed file: gromacs-icx-avx512-dp.s
Architecture: CSX
Timestamp: 2023-02-10 16:30:53
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
--------------------------------------------------------------------------------------------------
2241 | | | | | | | | || | | # pointer_increment=64 da67166e5736661e6b03ea29ee7bfd67
2242 | | | | | | | | || | | # LLVM-MCA-BEGIN
2243 | | | | | | | | || | | .LBB5_12: # Parent Loop BB5_7 Depth=1
2244 | | | | | | | | || | | # => This Inner Loop Header: Depth=2
2245 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | movslq (%r10,%rbx,4), %rcx
2246 | | 1.00 | | | | 0.00 | | || 1.0 | | leaq (%rcx,%rcx,2), %rdx
2247 | 0.00 | | | | | | 1.00 | || 1.0 | | shlq $6, %rdx
2248 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd (%rsi,%rdx), %zmm28 # AlignMOV convert to UnAlignMOV
2249 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 64(%rsi,%rdx), %zmm29 # AlignMOV convert to UnAlignMOV
2250 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovupd 128(%rsi,%rdx), %zmm30 # AlignMOV convert to UnAlignMOV
2251 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 16(%rsp), %zmm3 # 64-byte Reload
2252 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm28, %zmm3, %zmm3
2253 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm30, %zmm24, %zmm31
2254 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 336(%rsp), %zmm16 # 64-byte Reload
2255 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm16, %zmm16
2256 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm31, %zmm31, %zmm17
2257 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm16, %zmm16, %zmm17 # zmm17 = (zmm16 * zmm16) + zmm17
2258 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm3, %zmm3, %zmm17 # zmm17 = (zmm3 * zmm3) + zmm17
2259 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm17, %zmm18
2260 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm18, %zmm21, %zmm19
2261 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm19, %zmm18, %zmm19
2262 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm19, %zmm18, %zmm19
2263 | 0.50 | | | | | 0.50 | | || 4.0 | | vaddpd %zmm1, %zmm19, %zmm20
2264 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm22, %zmm18
2265 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm18, %zmm18
2266 | 1.00 | | | | | 0.00 | | || | | vsubpd %zmm30, %zmm25, %zmm20
2267 | | 1.00 | | | | 0.00 | | || | | leal (%rcx,%rcx), %edx
2268 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rdx, %r11
2269 | 0.00 | | | | | | 1.00 | || | | setne %dl
2270 | 0.00 | | | | | | 1.00 | || | | sete %al
2271 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | addl %ecx, %ecx
2272 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | incl %ecx
2273 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rcx, %r11
2274 | 0.00 | | | | | | 1.00 | || | | sete %cl
2275 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm18, %zmm19, %zmm18
2276 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 528(%rsp), %zmm19 # 64-byte Reload
2277 | 1.00 | | | | | 0.00 | | || | | vsubpd %zmm28, %zmm19, %zmm19
2278 | 0.00 | | | | | | 1.00 | || | | setne %dil
2279 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %edi, %ebp
2280 | 0.00 | | | | | | 1.00 | || | 1.0 | shlb $4, %bpl
2281 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | 1.0 | subb %al, %bpl
2282 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | 1.0 | addb $-17, %bpl
2283 | 1.00 | | | | | | | || | | kmovd %ebp, %k1
2284 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm17, %k1 {%k1}
2285 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 272(%rsp), %zmm17 # 64-byte Reload
2286 | 0.25 | | | | | 0.75 | | || | | vsubpd %zmm29, %zmm17, %zmm17
2287 | | 1.00 | | | | 0.00 | | || | | leal (%rdx,%rdx), %eax
2288 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %edi, %ebp
2289 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm2, %zmm18, %zmm18
2290 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14
2291 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm20, %zmm3
2292 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm17, %zmm17, %zmm3 # zmm3 = (zmm17 * zmm17) + zmm3
2293 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm19, %zmm19, %zmm3 # zmm3 = (zmm19 * zmm19) + zmm3
2294 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11
2295 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm3, %zmm16
2296 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7
2297 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm21, %zmm18
2298 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm16, %zmm18
2299 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm16, %zmm18
2300 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm1, %zmm18, %zmm31
2301 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm22, %zmm16
2302 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm31, %zmm16, %zmm16
2303 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 464(%rsp), %zmm31 # 64-byte Reload
2304 | 0.75 | | | | | 0.25 | | || | | vsubpd %zmm28, %zmm31, %zmm31
2305 | 0.00 | | | | | | 1.00 | || | 1.0 | shlb $5, %bpl
2306 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | 1.0 | orb %al, %bpl
2307 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | 1.0 | orb $-35, %bpl
2308 | 1.00 | | | | | | | || | | kmovd %ebp, %k1
2309 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm3, %k1 {%k1}
2310 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 208(%rsp), %zmm3 # 64-byte Reload
2311 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm3, %zmm3
2312 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm18, %zmm16
2313 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm26, %zmm18
2314 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm2, %zmm16, %zmm16
2315 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15
2316 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm18, %zmm19
2317 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm3, %zmm3, %zmm19 # zmm19 = (zmm3 * zmm3) + zmm19
2318 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm31, %zmm31, %zmm19 # zmm19 = (zmm31 * zmm31) + zmm19
2319 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10
2320 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm19, %zmm17
2321 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6
2322 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm21, %zmm16
2323 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm17, %zmm16
2324 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm17, %zmm16
2325 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm1, %zmm16, %zmm20
2326 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm22, %zmm17
2327 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm17, %zmm17
2328 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm16, %zmm16
2329 | | 1.00 | | | | 0.00 | | || | | leal (,%rdx,4), %eax
2330 | 0.00 | | | | | | 1.00 | || | | shlb $6, %dil
2331 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | orb %al, %dil
2332 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | orb $-69, %dil
2333 | 1.00 | | | | | | | || | | kmovd %edi, %k1
2334 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm19, %k1 {%k1}
2335 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 400(%rsp), %zmm17 # 64-byte Reload
2336 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm28, %zmm17, %zmm17
2337 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm23, %zmm19
2338 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm27, %zmm20
2339 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm2, %zmm16, %zmm16
2340 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13
2341 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm20, %zmm28
2342 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm19, %zmm19, %zmm28 # zmm28 = (zmm19 * zmm19) + zmm28
2343 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm17, %zmm17, %zmm28 # zmm28 = (zmm17 * zmm17) + zmm28
2344 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9
2345 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm28, %zmm3
2346 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5
2347 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm21, %zmm16
2348 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm16, %zmm3, %zmm16
2349 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm16, %zmm3, %zmm16
2350 | 0.00 | | | | | 1.00 | | || | | vaddpd %zmm1, %zmm16, %zmm18
2351 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm22, %zmm3
2352 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm18, %zmm3, %zmm3
2353 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm16, %zmm3
2354 | 0.00 | | | | | | 1.00 | || | | shlb $3, %dl
2355 | 0.00 | | | | | | 1.00 | || | | shlb $7, %cl
2356 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | | orb %dl, %cl
2357 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | addb $-9, %cl
2358 | 1.00 | | | | | | | || | | kmovd %ecx, %k1
2359 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm28, %k1 {%k1}
2360 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm2, %zmm3, %zmm3
2361 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12
2362 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8
2363 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4
2364 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | incq %rbx
2365 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rbx, %r9
2366 | | | | | | | | || | | * jne .LBB5_12
2367 | | | | | | | | || | | # LLVM-MCA-END
44.0 15.0 5.50 5.50 5.50 5.50 44.0 15.0 66.0 6.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
2280 | 6.0 | shlb $4, %bpl | [2280, 2281, 2282, 2305, 2306, 2307]
2363 | 4.0 | vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4| [2363]
2362 | 4.0 | vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8| [2362]
2361 | 4.0 | vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12| [2361]
2346 | 4.0 | vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5| [2346]
2344 | 4.0 | vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9| [2344]
2340 | 4.0 | vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13| [2340]
2321 | 4.0 | vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6| [2321]
2319 | 4.0 | vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10| [2319]
2315 | 4.0 | vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15| [2315]
2296 | 4.0 | vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7| [2296]
2294 | 4.0 | vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11| [2294]
2290 | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14| [2290]
2330 | 3.0 | shlb $6, %dil | [2330, 2331, 2332]
2364 | 1.0 | incq %rbx | [2364]

View File

@ -0,0 +1,162 @@
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
Analyzed File - gromacs-icx-avx512-sp.o
Binary Format - 64Bit
Architecture - SKX
Analysis Type - Throughput
Throughput Analysis Report
--------------------------
Block Throughput: 64.00 Cycles Throughput Bottleneck: Backend
Loop Count: 22
Port Binding In Cycles Per Iteration:
--------------------------------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
--------------------------------------------------------------------------------------------------
| Cycles | 50.0 0.0 | 7.0 | 9.5 8.1 | 9.5 7.9 | 3.0 | 50.0 | 7.0 | 0.0 |
--------------------------------------------------------------------------------------------------
DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3)
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion occurred
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
X - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
-----------------------------------------------------------------------------------------
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | movsxd rax, dword ptr [r11+rdx*4]
| 1* | | | | | | | | | mov rsi, rax
| 1 | | | | | | | 1.0 | | shl rsi, 0x5
| 1 | | 1.0 | | | | | | | lea rbx, ptr [rsi+rsi*2]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm15, zmmword ptr [rdi+rbx*1]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm16, zmmword ptr [rdi+rbx*1+0x20]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm27, zmmword ptr [rdi+rbx*1+0x40]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x80]
| 1 | | | | | | 1.0 | | | vsubps zmm24, zmm1, zmm15
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x140]
| 1 | 1.0 | | | | | | | | vsubps zmm25, zmm1, zmm16
| 1 | | | | | | 1.0 | | | vsubps zmm26, zmm9, zmm27
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp]
| 1 | 1.0 | | | | | | | | vsubps zmm21, zmm1, zmm15
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x100]
| 1 | | | | | | 1.0 | | | vsubps zmm22, zmm1, zmm16
| 1 | 1.0 | | | | | | | | vsubps zmm23, zmm10, zmm27
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x1c0]
| 1 | | | | | | 1.0 | | | vsubps zmm17, zmm1, zmm15
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0xc0]
| 1 | 1.0 | | | | | | | | vsubps zmm19, zmm1, zmm16
| 1 | | | | | | 1.0 | | | vsubps zmm20, zmm11, zmm27
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x180]
| 1 | 1.0 | | | | | | | | vsubps zmm18, zmm1, zmm15
| 1 | | | | | | 1.0 | | | vsubps zmm16, zmm8, zmm16
| 1 | 1.0 | | | | | | | | vsubps zmm15, zmm12, zmm27
| 1 | | | | | | 1.0 | | | vmulps zmm27, zmm26, zmm26
| 1 | 1.0 | | | | | | | | vfmadd231ps zmm27, zmm25, zmm25
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm27, zmm24, zmm24
| 1 | 1.0 | | | | | | | | vmulps zmm28, zmm23, zmm23
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm28, zmm22, zmm22
| 1 | 1.0 | | | | | | | | vfmadd231ps zmm28, zmm21, zmm21
| 1 | | | | | | 1.0 | | | vmulps zmm29, zmm20, zmm20
| 1 | 1.0 | | | | | | | | vfmadd231ps zmm29, zmm19, zmm19
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm29, zmm17, zmm17
| 1 | 1.0 | | | | | | | | vmulps zmm30, zmm15, zmm15
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm30, zmm16, zmm16
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm31, zmm27
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm1, zmm28
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm2, zmm29
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm30, zmm18, zmm18
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm3, zmm30
| 1 | | | | | | 1.0 | | | vmulps zmm4, zmm6, zmm31
| 1 | | | | | | 1.0 | | | vmulps zmm4, zmm31, zmm4
| 1 | | | | | | 1.0 | | | vmulps zmm4, zmm31, zmm4
| 1 | 1.0 | | | | | | | | vaddps zmm5, zmm4, zmm13
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm7, zmm31
| 1 | 1.0 | | | | | | | | vmulps zmm5, zmm31, zmm5
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm6, zmm1
| 1 | 1.0 | | | | | | | | vmulps zmm31, zmm1, zmm31
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm1, zmm31
| 1 | 1.0 | | | | | | | | vmulps zmm4, zmm4, zmm5
| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm31, zmm13
| 1 | 1.0 | | | | | | | | vmulps zmm1, zmm7, zmm1
| 1 | | | | | | 1.0 | | | vmulps zmm1, zmm1, zmm5
| 1 | 1.0 | | | | | | | | vmulps zmm5, zmm6, zmm2
| 1 | | | | | | 1.0 | | | vmulps zmm5, zmm2, zmm5
| 1 | 1.0 | | | | | | | | vmulps zmm5, zmm2, zmm5
| 1 | | | | | | 1.0 | | | vmulps zmm1, zmm31, zmm1
| 1 | 1.0 | | | | | | | | vaddps zmm31, zmm5, zmm13
| 1 | | | | | | 1.0 | | | vmulps zmm2, zmm7, zmm2
| 1 | 1.0 | | | | | | | | vmulps zmm2, zmm2, zmm31
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm6, zmm3
| 1 | 1.0 | | | | | | | | vmulps zmm31, zmm3, zmm31
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm3, zmm31
| 1 | 1.0 | | | | | | | | vmulps zmm2, zmm5, zmm2
| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm31, zmm13
| 1 | 1.0 | | | | | | | | vmulps zmm3, zmm7, zmm3
| 1 | | | | | | 1.0 | | | vmulps zmm3, zmm3, zmm5
| 1 | 1.0 | | | | | | | | vmulps zmm3, zmm31, zmm3
| 1* | | | | | | | | | xor esi, esi
| 1* | | | | | | | | | xor edi, edi
| 1 | | 1.0 | | | | | | | test eax, 0x7fffffff
| 1 | | | | | | | 1.0 | | setz sil
| 1 | | | | | | | 1.0 | | setnz dil
| 1 | | 1.0 | | | | | | | mov eax, 0xff
| 1 | | | | | | | 1.0 | | cmovz eax, r8d
| 1 | | 1.0 | | | | | | | mov ecx, 0xff
| 1 | | | | | | | 1.0 | | cmovz ecx, r9d
| 1 | | 1.0 | | | | | | | xor esi, 0xff
| 1 | | | | | | 1.0 | | | kmovd k1, esi
| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm27, zmm0, 0x1
| 1 | 1.0 | | | | | | | | vmulps zmm4, zmm4, zmm14
| 1 | 1.0 | | | | | | | | vmulps zmm5{k1}{z}, zmm24, zmm4
| 1 | | | | | | 1.0 | | | vmulps zmm24{k1}{z}, zmm25, zmm4
| 1 | 1.0 | | | | | | | | vmulps zmm4{k1}{z}, zmm26, zmm4
| 1 | | 1.0 | | | | | | | lea esi, ptr [rdi+rdi*2]
| 1 | | | | | | | 1.0 | | or esi, 0xfc
| 1 | | | | | | 1.0 | | | kmovd k1, esi
| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm28, zmm0, 0x1
| 1 | 1.0 | | | | | | | | vmulps zmm1, zmm1, zmm14
| 1 | 1.0 | | | | | | | | vmulps zmm21{k1}{z}, zmm21, zmm1
| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm5, zmm21
| 1 | 1.0 | | | | | | | | vmulps zmm21{k1}{z}, zmm22, zmm1
| 1 | | | | | | 1.0 | | | vaddps zmm21, zmm24, zmm21
| 1 | 1.0 | | | | | | | | vmulps zmm1{k1}{z}, zmm23, zmm1
| 1 | | | | | | 1.0 | | | vaddps zmm1, zmm4, zmm1
| 1 | | | | | | 1.0 | | | kmovd k1, eax
| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm29, zmm0, 0x1
| 1 | 1.0 | | | | | | | | vmulps zmm2, zmm2, zmm14
| 1 | 1.0 | | | | | | | | vmulps zmm4{k1}{z}, zmm17, zmm2
| 1 | 1.0 | | | | | | | | vmulps zmm17{k1}{z}, zmm19, zmm2
| 1 | | | | | | 1.0 | | | vmulps zmm2{k1}{z}, zmm20, zmm2
| 1 | | | | | | 1.0 | | | kmovd k1, ecx
| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm30, zmm0, 0x1
| 1 | 1.0 | | | | | | | | vmulps zmm3, zmm3, zmm14
| 1 | 1.0 | | | | | | | | vmulps zmm18{k1}{z}, zmm18, zmm3
| 1 | 1.0 | | | | | | | | vaddps zmm4, zmm4, zmm18
| 1 | | | | | | 1.0 | | | vaddps zmm4, zmm5, zmm4
| 1 | 1.0 | | | | | | | | vmulps zmm5{k1}{z}, zmm16, zmm3
| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm17, zmm5
| 1 | 1.0 | | | | | | | | vaddps zmm5, zmm21, zmm5
| 1 | | | | | | 1.0 | | | vmulps zmm3{k1}{z}, zmm15, zmm3
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov rax, qword ptr [r15+0xb0]
| 1 | 1.0 | | | | | | | | vaddps zmm2, zmm2, zmm3
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm3, zmmword ptr [rax+rbx*1]
| 1 | | | | | | 1.0 | | | vsubps zmm3, zmm3, zmm4
| 2 | | | 0.5 | 0.5 | 1.0 | | | | vmovups zmmword ptr [rax+rbx*1], zmm3
| 1 | 1.0 | | | | | | | | vaddps zmm1, zmm1, zmm2
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm2, zmmword ptr [rax+rbx*1+0x20]
| 1 | | | | | | 1.0 | | | vsubps zmm2, zmm2, zmm5
| 2 | | | 0.5 | 0.5 | 1.0 | | | | vmovups zmmword ptr [rax+rbx*1+0x20], zmm2
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm2, zmmword ptr [rax+rbx*1+0x40]
| 1 | 1.0 | | | | | | | | vsubps zmm1, zmm2, zmm1
| 2 | | | 0.5 | 0.5 | 1.0 | | | | vmovups zmmword ptr [rax+rbx*1+0x40], zmm1
| 1* | | | | | | | | | cmp r10, rdx
| 0*F | | | | | | | | | jz 0x34
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov rdi, qword ptr [r15+0xa0]
| 1 | | 1.0 | | | | | | | inc rdx
| 1 | | | | | | | 1.0 | | jmp 0xfffffffffffffcfc
Total Num Of Uops: 140
Analysis Notes:
Backend allocation was stalled due to unavailable allocation resources.

View File

@ -0,0 +1,304 @@
[0] Code Region
Iterations: 100
Instructions: 13000
Total Cycles: 5640
Total uOps: 15400
Dispatch Width: 6
uOps Per Cycle: 2.73
IPC: 2.30
Block RThroughput: 40.0
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 5 0.50 * movslq (%r11,%rdx,4), %rax
1 1 0.25 movq %rax, %rsi
1 1 0.50 shlq $5, %rsi
1 1 0.50 leaq (%rsi,%rsi,2), %rbx
2 8 0.50 * vmovups (%rdi,%rbx), %zmm15
2 8 0.50 * vmovups 32(%rdi,%rbx), %zmm16
2 8 0.50 * vmovups 64(%rdi,%rbx), %zmm27
2 8 0.50 * vmovups 128(%rsp), %zmm1
1 4 0.50 vsubps %zmm15, %zmm1, %zmm24
2 8 0.50 * vmovups 320(%rsp), %zmm1
1 4 0.50 vsubps %zmm16, %zmm1, %zmm25
1 4 0.50 vsubps %zmm27, %zmm9, %zmm26
2 8 0.50 * vmovups (%rsp), %zmm1
1 4 0.50 vsubps %zmm15, %zmm1, %zmm21
2 8 0.50 * vmovups 256(%rsp), %zmm1
1 4 0.50 vsubps %zmm16, %zmm1, %zmm22
1 4 0.50 vsubps %zmm27, %zmm10, %zmm23
2 8 0.50 * vmovups 448(%rsp), %zmm1
1 4 0.50 vsubps %zmm15, %zmm1, %zmm17
2 8 0.50 * vmovups 192(%rsp), %zmm1
1 4 0.50 vsubps %zmm16, %zmm1, %zmm19
1 4 0.50 vsubps %zmm27, %zmm11, %zmm20
2 8 0.50 * vmovups 384(%rsp), %zmm1
1 4 0.50 vsubps %zmm15, %zmm1, %zmm18
1 4 0.50 vsubps %zmm16, %zmm8, %zmm16
1 4 0.50 vsubps %zmm27, %zmm12, %zmm15
1 4 0.50 vmulps %zmm26, %zmm26, %zmm27
1 4 0.50 vfmadd231ps %zmm25, %zmm25, %zmm27
1 4 0.50 vfmadd231ps %zmm24, %zmm24, %zmm27
1 4 0.50 vmulps %zmm23, %zmm23, %zmm28
1 4 0.50 vfmadd231ps %zmm22, %zmm22, %zmm28
1 4 0.50 vfmadd231ps %zmm21, %zmm21, %zmm28
1 4 0.50 vmulps %zmm20, %zmm20, %zmm29
1 4 0.50 vfmadd231ps %zmm19, %zmm19, %zmm29
1 4 0.50 vfmadd231ps %zmm17, %zmm17, %zmm29
1 4 0.50 vmulps %zmm15, %zmm15, %zmm30
1 4 0.50 vfmadd231ps %zmm16, %zmm16, %zmm30
3 4 2.00 vrcp14ps %zmm27, %zmm31
3 4 2.00 vrcp14ps %zmm28, %zmm1
3 4 2.00 vrcp14ps %zmm29, %zmm2
1 4 0.50 vfmadd231ps %zmm18, %zmm18, %zmm30
3 4 2.00 vrcp14ps %zmm30, %zmm3
1 4 0.50 vmulps %zmm31, %zmm6, %zmm4
1 4 0.50 vmulps %zmm4, %zmm31, %zmm4
1 4 0.50 vmulps %zmm4, %zmm31, %zmm4
1 4 0.50 vaddps %zmm13, %zmm4, %zmm5
1 4 0.50 vmulps %zmm31, %zmm7, %zmm31
1 4 0.50 vmulps %zmm5, %zmm31, %zmm5
1 4 0.50 vmulps %zmm1, %zmm6, %zmm31
1 4 0.50 vmulps %zmm31, %zmm1, %zmm31
1 4 0.50 vmulps %zmm31, %zmm1, %zmm31
1 4 0.50 vmulps %zmm5, %zmm4, %zmm4
1 4 0.50 vaddps %zmm13, %zmm31, %zmm5
1 4 0.50 vmulps %zmm1, %zmm7, %zmm1
1 4 0.50 vmulps %zmm5, %zmm1, %zmm1
1 4 0.50 vmulps %zmm2, %zmm6, %zmm5
1 4 0.50 vmulps %zmm5, %zmm2, %zmm5
1 4 0.50 vmulps %zmm5, %zmm2, %zmm5
1 4 0.50 vmulps %zmm1, %zmm31, %zmm1
1 4 0.50 vaddps %zmm13, %zmm5, %zmm31
1 4 0.50 vmulps %zmm2, %zmm7, %zmm2
1 4 0.50 vmulps %zmm31, %zmm2, %zmm2
1 4 0.50 vmulps %zmm3, %zmm6, %zmm31
1 4 0.50 vmulps %zmm31, %zmm3, %zmm31
1 4 0.50 vmulps %zmm31, %zmm3, %zmm31
1 4 0.50 vmulps %zmm2, %zmm5, %zmm2
1 4 0.50 vaddps %zmm13, %zmm31, %zmm5
1 4 0.50 vmulps %zmm3, %zmm7, %zmm3
1 4 0.50 vmulps %zmm5, %zmm3, %zmm3
1 4 0.50 vmulps %zmm3, %zmm31, %zmm3
1 0 0.17 xorl %esi, %esi
1 0 0.17 xorl %edi, %edi
1 1 0.25 testl $2147483647, %eax
1 1 0.50 sete %sil
1 1 0.50 setne %dil
1 1 0.25 movl $255, %eax
1 1 0.50 cmovel %r8d, %eax
1 1 0.25 movl $255, %ecx
1 1 0.50 cmovel %r9d, %ecx
1 1 0.25 xorl $255, %esi
1 1 1.00 kmovd %esi, %k1
1 4 1.00 vcmpltps %zmm0, %zmm27, %k1 {%k1}
1 4 0.50 vmulps %zmm14, %zmm4, %zmm4
1 4 0.50 vmulps %zmm4, %zmm24, %zmm5 {%k1} {z}
1 4 0.50 vmulps %zmm4, %zmm25, %zmm24 {%k1} {z}
1 4 0.50 vmulps %zmm4, %zmm26, %zmm4 {%k1} {z}
1 1 0.50 leal (%rdi,%rdi,2), %esi
1 1 0.25 orl $252, %esi
1 1 1.00 kmovd %esi, %k1
1 4 1.00 vcmpltps %zmm0, %zmm28, %k1 {%k1}
1 4 0.50 vmulps %zmm14, %zmm1, %zmm1
1 4 0.50 vmulps %zmm1, %zmm21, %zmm21 {%k1} {z}
1 4 0.50 vaddps %zmm21, %zmm5, %zmm5
1 4 0.50 vmulps %zmm1, %zmm22, %zmm21 {%k1} {z}
1 4 0.50 vaddps %zmm21, %zmm24, %zmm21
1 4 0.50 vmulps %zmm1, %zmm23, %zmm1 {%k1} {z}
1 4 0.50 vaddps %zmm1, %zmm4, %zmm1
1 1 1.00 kmovd %eax, %k1
1 4 1.00 vcmpltps %zmm0, %zmm29, %k1 {%k1}
1 4 0.50 vmulps %zmm14, %zmm2, %zmm2
1 4 0.50 vmulps %zmm2, %zmm17, %zmm4 {%k1} {z}
1 4 0.50 vmulps %zmm2, %zmm19, %zmm17 {%k1} {z}
1 4 0.50 vmulps %zmm2, %zmm20, %zmm2 {%k1} {z}
1 1 1.00 kmovd %ecx, %k1
1 4 1.00 vcmpltps %zmm0, %zmm30, %k1 {%k1}
1 4 0.50 vmulps %zmm14, %zmm3, %zmm3
1 4 0.50 vmulps %zmm3, %zmm18, %zmm18 {%k1} {z}
1 4 0.50 vaddps %zmm18, %zmm4, %zmm4
1 4 0.50 vaddps %zmm4, %zmm5, %zmm4
1 4 0.50 vmulps %zmm3, %zmm16, %zmm5 {%k1} {z}
1 4 0.50 vaddps %zmm5, %zmm17, %zmm5
1 4 0.50 vaddps %zmm5, %zmm21, %zmm5
1 4 0.50 vmulps %zmm3, %zmm15, %zmm3 {%k1} {z}
1 5 0.50 * movq 176(%r15), %rax
1 4 0.50 vaddps %zmm3, %zmm2, %zmm2
2 8 0.50 * vmovups (%rax,%rbx), %zmm3
1 4 0.50 vsubps %zmm4, %zmm3, %zmm3
2 1 1.00 * vmovups %zmm3, (%rax,%rbx)
1 4 0.50 vaddps %zmm2, %zmm1, %zmm1
2 8 0.50 * vmovups 32(%rax,%rbx), %zmm2
1 4 0.50 vsubps %zmm5, %zmm2, %zmm2
2 1 1.00 * vmovups %zmm2, 32(%rax,%rbx)
2 8 0.50 * vmovups 64(%rax,%rbx), %zmm2
1 4 0.50 vsubps %zmm1, %zmm2, %zmm1
2 1 1.00 * vmovups %zmm1, 64(%rax,%rbx)
1 1 0.25 cmpq %rdx, %r10
1 1 0.50 je .LBB4_18
1 5 0.50 * movq 160(%r15), %rdi
1 1 0.25 incq %rdx
1 1 0.50 jmp .LBB4_8
Resources:
[0] - SKXDivider
[1] - SKXFPDivider
[2] - SKXPort0
[3] - SKXPort1
[4] - SKXPort2
[5] - SKXPort3
[6] - SKXPort4
[7] - SKXPort5
[8] - SKXPort6
[9] - SKXPort7
Resource pressure per iteration:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
- - 52.01 14.97 8.49 8.51 3.00 52.02 11.00 2.00
Resource pressure by instruction:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
- - - - 0.49 0.51 - - - - movslq (%r11,%rdx,4), %rax
- - - - - - - - 1.00 - movq %rax, %rsi
- - - - - - - - 1.00 - shlq $5, %rsi
- - - 1.00 - - - - - - leaq (%rsi,%rsi,2), %rbx
- - 0.01 0.99 0.50 0.50 - - - - vmovups (%rdi,%rbx), %zmm15
- - - - 0.50 0.50 - 1.00 - - vmovups 32(%rdi,%rbx), %zmm16
- - - 1.00 0.50 0.50 - - - - vmovups 64(%rdi,%rbx), %zmm27
- - - 0.99 0.51 0.49 - 0.01 - - vmovups 128(%rsp), %zmm1
- - 0.01 - - - - 0.99 - - vsubps %zmm15, %zmm1, %zmm24
- - - 1.00 0.49 0.51 - - - - vmovups 320(%rsp), %zmm1
- - 0.99 - - - - 0.01 - - vsubps %zmm16, %zmm1, %zmm25
- - 0.01 - - - - 0.99 - - vsubps %zmm27, %zmm9, %zmm26
- - 0.01 0.99 0.51 0.49 - - - - vmovups (%rsp), %zmm1
- - - - - - - 1.00 - - vsubps %zmm15, %zmm1, %zmm21
- - - - 0.49 0.51 - 1.00 - - vmovups 256(%rsp), %zmm1
- - 0.01 - - - - 0.99 - - vsubps %zmm16, %zmm1, %zmm22
- - 0.99 - - - - 0.01 - - vsubps %zmm27, %zmm10, %zmm23
- - - 1.00 0.51 0.49 - - - - vmovups 448(%rsp), %zmm1
- - 0.01 - - - - 0.99 - - vsubps %zmm15, %zmm1, %zmm17
- - 0.01 - 0.49 0.51 - 0.99 - - vmovups 192(%rsp), %zmm1
- - - - - - - 1.00 - - vsubps %zmm16, %zmm1, %zmm19
- - 0.01 - - - - 0.99 - - vsubps %zmm27, %zmm11, %zmm20
- - 0.99 - 0.50 0.50 - 0.01 - - vmovups 384(%rsp), %zmm1
- - - - - - - 1.00 - - vsubps %zmm15, %zmm1, %zmm18
- - 0.01 - - - - 0.99 - - vsubps %zmm16, %zmm8, %zmm16
- - - - - - - 1.00 - - vsubps %zmm27, %zmm12, %zmm15
- - 1.00 - - - - - - - vmulps %zmm26, %zmm26, %zmm27
- - 1.00 - - - - - - - vfmadd231ps %zmm25, %zmm25, %zmm27
- - 0.99 - - - - 0.01 - - vfmadd231ps %zmm24, %zmm24, %zmm27
- - - - - - - 1.00 - - vmulps %zmm23, %zmm23, %zmm28
- - - - - - - 1.00 - - vfmadd231ps %zmm22, %zmm22, %zmm28
- - 0.01 - - - - 0.99 - - vfmadd231ps %zmm21, %zmm21, %zmm28
- - 0.01 - - - - 0.99 - - vmulps %zmm20, %zmm20, %zmm29
- - 0.01 - - - - 0.99 - - vfmadd231ps %zmm19, %zmm19, %zmm29
- - - - - - - 1.00 - - vfmadd231ps %zmm17, %zmm17, %zmm29
- - 0.01 - - - - 0.99 - - vmulps %zmm15, %zmm15, %zmm30
- - 0.01 - - - - 0.99 - - vfmadd231ps %zmm16, %zmm16, %zmm30
- - 2.00 - - - - 1.00 - - vrcp14ps %zmm27, %zmm31
- - 2.00 - - - - 1.00 - - vrcp14ps %zmm28, %zmm1
- - 2.00 - - - - 1.00 - - vrcp14ps %zmm29, %zmm2
- - 1.00 - - - - - - - vfmadd231ps %zmm18, %zmm18, %zmm30
- - 2.00 - - - - 1.00 - - vrcp14ps %zmm30, %zmm3
- - - - - - - 1.00 - - vmulps %zmm31, %zmm6, %zmm4
- - 1.00 - - - - - - - vmulps %zmm4, %zmm31, %zmm4
- - 1.00 - - - - - - - vmulps %zmm4, %zmm31, %zmm4
- - 1.00 - - - - - - - vaddps %zmm13, %zmm4, %zmm5
- - - - - - - 1.00 - - vmulps %zmm31, %zmm7, %zmm31
- - 1.00 - - - - - - - vmulps %zmm5, %zmm31, %zmm5
- - 1.00 - - - - - - - vmulps %zmm1, %zmm6, %zmm31
- - 1.00 - - - - - - - vmulps %zmm31, %zmm1, %zmm31
- - 0.99 - - - - 0.01 - - vmulps %zmm31, %zmm1, %zmm31
- - 1.00 - - - - - - - vmulps %zmm5, %zmm4, %zmm4
- - - - - - - 1.00 - - vaddps %zmm13, %zmm31, %zmm5
- - - - - - - 1.00 - - vmulps %zmm1, %zmm7, %zmm1
- - - - - - - 1.00 - - vmulps %zmm5, %zmm1, %zmm1
- - - - - - - 1.00 - - vmulps %zmm2, %zmm6, %zmm5
- - - - - - - 1.00 - - vmulps %zmm5, %zmm2, %zmm5
- - - - - - - 1.00 - - vmulps %zmm5, %zmm2, %zmm5
- - 0.99 - - - - 0.01 - - vmulps %zmm1, %zmm31, %zmm1
- - - - - - - 1.00 - - vaddps %zmm13, %zmm5, %zmm31
- - 1.00 - - - - - - - vmulps %zmm2, %zmm7, %zmm2
- - - - - - - 1.00 - - vmulps %zmm31, %zmm2, %zmm2
- - - - - - - 1.00 - - vmulps %zmm3, %zmm6, %zmm31
- - 1.00 - - - - - - - vmulps %zmm31, %zmm3, %zmm31
- - 1.00 - - - - - - - vmulps %zmm31, %zmm3, %zmm31
- - 1.00 - - - - - - - vmulps %zmm2, %zmm5, %zmm2
- - 1.00 - - - - - - - vaddps %zmm13, %zmm31, %zmm5
- - 1.00 - - - - - - - vmulps %zmm3, %zmm7, %zmm3
- - 1.00 - - - - - - - vmulps %zmm5, %zmm3, %zmm3
- - 1.00 - - - - - - - vmulps %zmm3, %zmm31, %zmm3
- - - - - - - - - - xorl %esi, %esi
- - - - - - - - - - xorl %edi, %edi
- - - - - - - - 1.00 - testl $2147483647, %eax
- - - - - - - - 1.00 - sete %sil
- - - - - - - - 1.00 - setne %dil
- - - 1.00 - - - - - - movl $255, %eax
- - - - - - - - 1.00 - cmovel %r8d, %eax
- - - 1.00 - - - - - - movl $255, %ecx
- - - - - - - - 1.00 - cmovel %r9d, %ecx
- - - 1.00 - - - - - - xorl $255, %esi
- - - - - - - 1.00 - - kmovd %esi, %k1
- - - - - - - 1.00 - - vcmpltps %zmm0, %zmm27, %k1 {%k1}
- - - - - - - 1.00 - - vmulps %zmm14, %zmm4, %zmm4
- - 1.00 - - - - - - - vmulps %zmm4, %zmm24, %zmm5 {%k1} {z}
- - - - - - - 1.00 - - vmulps %zmm4, %zmm25, %zmm24 {%k1} {z}
- - 1.00 - - - - - - - vmulps %zmm4, %zmm26, %zmm4 {%k1} {z}
- - - 1.00 - - - - - - leal (%rdi,%rdi,2), %esi
- - - - - - - - 1.00 - orl $252, %esi
- - - - - - - 1.00 - - kmovd %esi, %k1
- - - - - - - 1.00 - - vcmpltps %zmm0, %zmm28, %k1 {%k1}
- - 0.99 - - - - 0.01 - - vmulps %zmm14, %zmm1, %zmm1
- - 0.99 - - - - 0.01 - - vmulps %zmm1, %zmm21, %zmm21 {%k1} {z}
- - 1.00 - - - - - - - vaddps %zmm21, %zmm5, %zmm5
- - 0.01 - - - - 0.99 - - vmulps %zmm1, %zmm22, %zmm21 {%k1} {z}
- - - - - - - 1.00 - - vaddps %zmm21, %zmm24, %zmm21
- - 0.99 - - - - 0.01 - - vmulps %zmm1, %zmm23, %zmm1 {%k1} {z}
- - 1.00 - - - - - - - vaddps %zmm1, %zmm4, %zmm1
- - - - - - - 1.00 - - kmovd %eax, %k1
- - - - - - - 1.00 - - vcmpltps %zmm0, %zmm29, %k1 {%k1}
- - - - - - - 1.00 - - vmulps %zmm14, %zmm2, %zmm2
- - 1.00 - - - - - - - vmulps %zmm2, %zmm17, %zmm4 {%k1} {z}
- - - - - - - 1.00 - - vmulps %zmm2, %zmm19, %zmm17 {%k1} {z}
- - 1.00 - - - - - - - vmulps %zmm2, %zmm20, %zmm2 {%k1} {z}
- - - - - - - 1.00 - - kmovd %ecx, %k1
- - - - - - - 1.00 - - vcmpltps %zmm0, %zmm30, %k1 {%k1}
- - 1.00 - - - - - - - vmulps %zmm14, %zmm3, %zmm3
- - - - - - - 1.00 - - vmulps %zmm3, %zmm18, %zmm18 {%k1} {z}
- - 0.99 - - - - 0.01 - - vaddps %zmm18, %zmm4, %zmm4
- - 1.00 - - - - - - - vaddps %zmm4, %zmm5, %zmm4
- - - - - - - 1.00 - - vmulps %zmm3, %zmm16, %zmm5 {%k1} {z}
- - 1.00 - - - - - - - vaddps %zmm5, %zmm17, %zmm5
- - 0.99 - - - - 0.01 - - vaddps %zmm5, %zmm21, %zmm5
- - 1.00 - - - - - - - vmulps %zmm3, %zmm15, %zmm3 {%k1} {z}
- - - - 1.00 - - - - - movq 176(%r15), %rax
- - 0.99 - - - - 0.01 - - vaddps %zmm3, %zmm2, %zmm2
- - - 1.00 0.50 0.50 - - - - vmovups (%rax,%rbx), %zmm3
- - 0.99 - - - - 0.01 - - vsubps %zmm4, %zmm3, %zmm3
- - - - - - 1.00 - - 1.00 vmovups %zmm3, (%rax,%rbx)
- - 1.00 - - - - - - - vaddps %zmm2, %zmm1, %zmm1
- - - 1.00 0.50 0.50 - - - - vmovups 32(%rax,%rbx), %zmm2
- - 1.00 - - - - - - - vsubps %zmm5, %zmm2, %zmm2
- - - - - - 1.00 - - 1.00 vmovups %zmm2, 32(%rax,%rbx)
- - - 1.00 0.50 0.50 - - - - vmovups 64(%rax,%rbx), %zmm2
- - 0.99 - - - - 0.01 - - vsubps %zmm1, %zmm2, %zmm1
- - - - - 1.00 1.00 - - - vmovups %zmm1, 64(%rax,%rbx)
- - - - - - - - 1.00 - cmpq %rdx, %r10
- - - - - - - - 1.00 - je .LBB4_18
- - - - 0.50 0.50 - - - - movq 160(%r15), %rdi
- - - 1.00 - - - - - - incq %rdx
- - - - - - - - 1.00 - jmp .LBB4_8

View File

@ -0,0 +1,161 @@
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
Analyzed file: gromacs-icx-avx512-sp.s
Architecture: CSX
Timestamp: 2023-02-10 16:31:04
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
--------------------------------------------------------------------------------------------------
1662 | | | | | | | | || | | # pointer_increment=64 0f91ac4f7fe1a70d0c899f7f3e745649
1663 | | | | | | | | || | | # LLVM-MCA-BEGIN
1664 | | | | | | | | || | | .LBB4_8: # =>This Inner Loop Header: Depth=1
1665 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | movslq (%r11,%rdx,4), %rax
1666 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || 1.0 | | movq %rax, %rsi
1667 | 0.00 | | | | | | 1.00 | || 1.0 | | shlq $5, %rsi
1668 | | 1.00 | | | | 0.00 | | || 1.0 | | leaq (%rsi,%rsi,2), %rbx
1669 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%rdi,%rbx), %zmm15 # AlignMOV convert to UnAlignMOV
1670 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 32(%rdi,%rbx), %zmm16 # AlignMOV convert to UnAlignMOV
1671 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovups 64(%rdi,%rbx), %zmm27 # AlignMOV convert to UnAlignMOV
1672 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 128(%rsp), %zmm1 # 64-byte Reload
1673 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm24
1674 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 320(%rsp), %zmm1 # 64-byte Reload
1675 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm1, %zmm25
1676 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubps %zmm27, %zmm9, %zmm26
1677 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%rsp), %zmm1 # 64-byte Reload
1678 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm21
1679 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 256(%rsp), %zmm1 # 64-byte Reload
1680 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm1, %zmm22
1681 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm27, %zmm10, %zmm23
1682 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 448(%rsp), %zmm1 # 64-byte Reload
1683 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm17
1684 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 192(%rsp), %zmm1 # 64-byte Reload
1685 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm1, %zmm19
1686 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm27, %zmm11, %zmm20
1687 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 384(%rsp), %zmm1 # 64-byte Reload
1688 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm18
1689 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm8, %zmm16
1690 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm27, %zmm12, %zmm15
1691 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm26, %zmm26, %zmm27
1692 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231ps %zmm25, %zmm25, %zmm27 # zmm27 = (zmm25 * zmm25) + zmm27
1693 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231ps %zmm24, %zmm24, %zmm27 # zmm27 = (zmm24 * zmm24) + zmm27
1694 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm23, %zmm23, %zmm28
1695 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm22, %zmm22, %zmm28 # zmm28 = (zmm22 * zmm22) + zmm28
1696 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm21, %zmm21, %zmm28 # zmm28 = (zmm21 * zmm21) + zmm28
1697 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm20, %zmm20, %zmm29
1698 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm19, %zmm19, %zmm29 # zmm29 = (zmm19 * zmm19) + zmm29
1699 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm17, %zmm17, %zmm29 # zmm29 = (zmm17 * zmm17) + zmm29
1700 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm15, %zmm15, %zmm30
1701 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm16, %zmm16, %zmm30 # zmm30 = (zmm16 * zmm16) + zmm30
1702 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14ps %zmm27, %zmm31
1703 | 2.50 | | | | | 0.50 | | || | | vrcp14ps %zmm28, %zmm1
1704 | 2.50 | | | | | 0.50 | | || | | vrcp14ps %zmm29, %zmm2
1705 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm18, %zmm18, %zmm30 # zmm30 = (zmm18 * zmm18) + zmm30
1706 | 2.50 | | | | | 0.50 | | || | | vrcp14ps %zmm30, %zmm3
1707 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm31, %zmm6, %zmm4
1708 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm4, %zmm31, %zmm4
1709 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm4, %zmm31, %zmm4
1710 | 0.50 | | | | | 0.50 | | || 4.0 | | vaddps %zmm13, %zmm4, %zmm5
1711 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm7, %zmm31
1712 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm5, %zmm31, %zmm5
1713 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm6, %zmm31
1714 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm1, %zmm31
1715 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm1, %zmm31
1716 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm5, %zmm4, %zmm4
1717 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm13, %zmm31, %zmm5
1718 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm7, %zmm1
1719 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm5, %zmm1, %zmm1
1720 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm6, %zmm5
1721 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm5, %zmm2, %zmm5
1722 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm5, %zmm2, %zmm5
1723 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm31, %zmm1
1724 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm13, %zmm5, %zmm31
1725 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm7, %zmm2
1726 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm2, %zmm2
1727 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm3, %zmm6, %zmm31
1728 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm3, %zmm31
1729 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm3, %zmm31
1730 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm5, %zmm2
1731 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm13, %zmm31, %zmm5
1732 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm3, %zmm7, %zmm3
1733 | 1.00 | | | | | 0.00 | | || | | vmulps %zmm5, %zmm3, %zmm3
1734 | 1.00 | | | | | 0.00 | | || | | vmulps %zmm3, %zmm31, %zmm3
1735 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | xorl %esi, %esi
1736 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | xorl %edi, %edi
1737 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | testl $2147483647, %eax # imm = 0x7FFFFFFF
1738 | 0.00 | | | | | | 1.00 | || | | sete %sil
1739 | 0.00 | | | | | | 1.00 | || | | setne %dil
1740 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl $255, %eax
1741 | 0.00 | | | | | | 1.00 | || | | cmovel %r8d, %eax
1742 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl $255, %ecx
1743 | 0.00 | | | | | | 1.00 | || | | cmovel %r9d, %ecx
1744 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | | xorl $255, %esi
1745 | 1.00 | | | | | | | || | | kmovd %esi, %k1
1746 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm27, %k1 {%k1}
1747 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm14, %zmm4, %zmm4
1748 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm4, %zmm24, %zmm5 {%k1} {z}
1749 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm4, %zmm25, %zmm24 {%k1} {z}
1750 | 0.25 | | | | | 0.75 | | || | | vmulps %zmm4, %zmm26, %zmm4 {%k1} {z}
1751 | | 1.00 | | | | 0.00 | | || | | leal (%rdi,%rdi,2), %esi
1752 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | orl $252, %esi
1753 | 1.00 | | | | | | | || | | kmovd %esi, %k1
1754 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm28, %k1 {%k1}
1755 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm14, %zmm1, %zmm1
1756 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm21, %zmm21 {%k1} {z}
1757 | 0.50 | | | | | 0.50 | | || 4.0 | | vaddps %zmm21, %zmm5, %zmm5
1758 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm22, %zmm21 {%k1} {z}
1759 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm21, %zmm24, %zmm21
1760 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm23, %zmm1 {%k1} {z}
1761 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm1, %zmm4, %zmm1
1762 | 1.00 | | | | | | | || | | kmovd %eax, %k1
1763 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm29, %k1 {%k1}
1764 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm14, %zmm2, %zmm2
1765 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm17, %zmm4 {%k1} {z}
1766 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm19, %zmm17 {%k1} {z}
1767 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm20, %zmm2 {%k1} {z}
1768 | 1.00 | | | | | | | || | | kmovd %ecx, %k1
1769 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm30, %k1 {%k1}
1770 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm14, %zmm3, %zmm3
1771 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm3, %zmm18, %zmm18 {%k1} {z}
1772 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm18, %zmm4, %zmm4
1773 | 0.25 | | | | | 0.75 | | || 4.0 | | vaddps %zmm4, %zmm5, %zmm4
1774 | 0.00 | | | | | 1.00 | | || | | vmulps %zmm3, %zmm16, %zmm5 {%k1} {z}
1775 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm5, %zmm17, %zmm5
1776 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm5, %zmm21, %zmm5
1777 | 0.00 | | | | | 1.00 | | || | | vmulps %zmm3, %zmm15, %zmm3 {%k1} {z}
1778 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | movq 176(%r15), %rax
1779 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm3, %zmm2, %zmm2
1780 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%rax,%rbx), %zmm3 # AlignMOV convert to UnAlignMOV
1781 | 0.00 | | | | | 1.00 | | || 4.0 | | vsubps %zmm4, %zmm3, %zmm3
1782 | | | 0.50 | 0.50 | 1.00 | | | || 0.0 | | vmovups %zmm3, (%rax,%rbx) # AlignMOV convert to UnAlignMOV
1783 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm2, %zmm1, %zmm1
1784 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 32(%rax,%rbx), %zmm2 # AlignMOV convert to UnAlignMOV
1785 | 0.00 | | | | | 1.00 | | || | | vsubps %zmm5, %zmm2, %zmm2
1786 | | | 0.50 | 0.50 | 1.00 | | | || | | vmovups %zmm2, 32(%rax,%rbx) # AlignMOV convert to UnAlignMOV
1787 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 64(%rax,%rbx), %zmm2 # AlignMOV convert to UnAlignMOV
1788 | 0.00 | | | | | 1.00 | | || | | vsubps %zmm1, %zmm2, %zmm1
1789 | | | 0.50 | 0.50 | 1.00 | | | || | | vmovups %zmm1, 64(%rax,%rbx) # AlignMOV convert to UnAlignMOV
1790 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rdx, %r10
1791 | | | | | | | | || | | * je .LBB4_18
1792 | | | | | | | | || | | # %bb.9: # in Loop: Header=BB4_8 Depth=1
1793 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | movq 160(%r15), %rdi
1794 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | 1.0 | incq %rdx
1795 | 0.00 | | | | | | 1.00 | || | | jmp .LBB4_8
1796 | | | | | | | | || | | # LLVM-MCA-END
50.0 9.00 9.50 8.00 9.50 8.00 3.00 50.0 9.00 79.0 1.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
1794 | 1.0 | incq %rdx | [1794]

View File

@ -0,0 +1,88 @@
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
Analyzed File - lammps-icc-avx2.o
Binary Format - 64Bit
Architecture - SKX
Analysis Type - Throughput
Throughput Analysis Report
--------------------------
Block Throughput: 25.58 Cycles Throughput Bottleneck: Backend
Loop Count: 22
Port Binding In Cycles Per Iteration:
--------------------------------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
--------------------------------------------------------------------------------------------------
| Cycles | 13.7 8.0 | 13.6 | 5.5 5.5 | 5.5 5.5 | 0.0 | 13.7 | 7.0 | 0.0 |
--------------------------------------------------------------------------------------------------
DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3)
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion occurred
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
X - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
-----------------------------------------------------------------------------------------
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovdqu xmm0, xmmword ptr [rbx+rdx*4]
| 1 | 1.0 | | | | | | | | vmovq rcx, xmm0
| 1 | | | | | | 1.0 | | | vpunpckhqdq xmm2, xmm0, xmm0
| 1 | 1.0 | | | | | | | | vmovq r15, xmm2
| 1* | | | | | | | | | mov r8d, ecx
| 1 | | | | | | | 1.0 | | shr rcx, 0x20
| 1 | | | | | | 1.0 | | | lea r14d, ptr [rcx+rcx*2]
| 1 | | | | | | 1.0 | | | lea r8d, ptr [r8+r8*2]
| 1 | | | | | | | 1.0 | | movsxd rcx, r8d
| 1 | | | | | | | 1.0 | | movsxd r8, r14d
| 1* | | | | | | | | | mov r14d, r15d
| 1 | | | | | | | 1.0 | | shr r15, 0x20
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups xmm7, xmmword ptr [r11+rcx*8]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups xmm6, xmmword ptr [r11+r8*8]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovq xmm14, qword ptr [r11+rcx*8+0x10]
| 1 | | 0.3 | | | | 0.7 | | | lea r14d, ptr [r14+r14*2]
| 1 | | | | | | | 1.0 | | movsxd r14, r14d
| 1 | | 0.7 | | | | 0.3 | | | lea r15d, ptr [r15+r15*2]
| 1 | | | | | | | 1.0 | | movsxd r15, r15d
| 2 | | | 0.5 0.5 | 0.5 0.5 | | 1.0 | | | vmovhpd xmm15, xmm14, qword ptr [r11+r8*8+0x10]
| 2 | | 1.0 | 0.5 0.5 | 0.5 0.5 | | | | | vinsertf128 ymm1, ymm7, xmmword ptr [r11+r14*8], 0x1
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovq xmm0, qword ptr [r11+r14*8+0x10]
| 2 | | 0.3 | 0.5 0.5 | 0.5 0.5 | | 0.7 | | | vinsertf128 ymm6, ymm6, xmmword ptr [r11+r15*8], 0x1
| 2 | | | 0.5 0.5 | 0.5 0.5 | | 1.0 | | | vmovhpd xmm2, xmm0, qword ptr [r11+r15*8+0x10]
| 1 | | | | | | 1.0 | | | vunpcklpd ymm14, ymm1, ymm6
| 1 | | | | | | 1.0 | | | vunpckhpd ymm1, ymm1, ymm6
| 1 | 0.7 | 0.3 | | | | | | | vsubpd ymm6, ymm10, ymm14
| 1 | | | | | | 1.0 | | | vinsertf128 ymm7, ymm15, xmm2, 0x1
| 1 | 0.3 | 0.7 | | | | | | | vsubpd ymm2, ymm9, ymm1
| 1 | 0.7 | 0.3 | | | | | | | vsubpd ymm0, ymm8, ymm7
| 1 | 0.3 | 0.7 | | | | | | | vmulpd ymm14, ymm2, ymm2
| 1 | 0.7 | 0.3 | | | | | | | vfmadd231pd ymm14, ymm6, ymm6
| 1 | 0.3 | 0.7 | | | | | | | vfmadd231pd ymm14, ymm0, ymm0
| 1 | 0.7 | 0.3 | | | | | | | vcmppd ymm1, ymm14, ymm5, 0x1
| 1 | 0.3 | 0.7 | | | | | | | vpcmpeqd ymm7, ymm7, ymm7
| 2 | 1.0 | | | | | 1.0 | | | vptest ymm1, ymm7
| 1 | 1.0 8.0 | | | | | | | | vdivpd ymm7, ymm4, ymm14
| 2^ | | 1.0 | 0.5 0.5 | 0.5 0.5 | | | | | vmulpd ymm14, ymm7, ymmword ptr [rsp+0x60]
| 1 | | 1.0 | | | | | | | vmulpd ymm14, ymm7, ymm14
| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm15, ymm7, ymm14
| 1 | 0.3 | 0.7 | | | | | | | vfmsub213pd ymm14, ymm7, ymm3
| 2^ | 0.7 | 0.3 | 0.5 0.5 | 0.5 0.5 | | | | | vmulpd ymm7, ymm7, ymmword ptr [rsp+0x40]
| 1 | 0.3 | 0.7 | | | | | | | vmulpd ymm15, ymm15, ymm7
| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm7, ymm15, ymm14
| 1 | 0.3 | 0.7 | | | | | | | vmulpd ymm6, ymm6, ymm7
| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm2, ymm2, ymm7
| 1 | | | | | | 1.0 | | | vandpd ymm6, ymm1, ymm6
| 1 | 0.3 | 0.7 | | | | | | | vaddpd ymm13, ymm13, ymm6
| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm6, ymm0, ymm7
| 1 | | | | | | 1.0 | | | vandpd ymm0, ymm1, ymm2
| 1 | | | | | | 1.0 | | | vandpd ymm1, ymm1, ymm6
| 1 | 0.3 | 0.7 | | | | | | | vaddpd ymm12, ymm12, ymm0
| 1 | 0.7 | 0.3 | | | | | | | vaddpd ymm11, ymm11, ymm1
| 1 | | | | | | | 1.0 | | add rdx, 0x4
| 1* | | | | | | | | | cmp rdx, rsi
| 0*F | | | | | | | | | jb 0xffffffffffffff02
Total Num Of Uops: 62
Analysis Notes:
Backend allocation was stalled due to unavailable allocation resources.

View File

@ -0,0 +1,156 @@
[0] Code Region
Iterations: 100
Instructions: 5600
Total Cycles: 2352
Total uOps: 6300
Dispatch Width: 6
uOps Per Cycle: 2.68
IPC: 2.38
Block RThroughput: 10.5
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 6 0.50 * vmovdqu (%rbx,%rdx,4), %xmm0
1 2 1.00 vmovq %xmm0, %rcx
1 1 1.00 vpunpckhqdq %xmm0, %xmm0, %xmm2
1 2 1.00 vmovq %xmm2, %r15
1 1 0.25 movl %ecx, %r8d
1 1 0.50 shrq $32, %rcx
1 1 0.50 leal (%rcx,%rcx,2), %r14d
1 1 0.50 leal (%r8,%r8,2), %r8d
1 1 0.25 movslq %r8d, %rcx
1 1 0.25 movslq %r14d, %r8
1 1 0.25 movl %r15d, %r14d
1 1 0.50 shrq $32, %r15
1 6 0.50 * vmovups (%r11,%rcx,8), %xmm7
1 6 0.50 * vmovups (%r11,%r8,8), %xmm6
1 5 0.50 * vmovq 16(%r11,%rcx,8), %xmm14
1 1 0.50 leal (%r14,%r14,2), %r14d
1 1 0.25 movslq %r14d, %r14
1 1 0.50 leal (%r15,%r15,2), %r15d
1 1 0.25 movslq %r15d, %r15
2 6 1.00 * vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15
2 7 0.50 * vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1
1 5 0.50 * vmovq 16(%r11,%r14,8), %xmm0
2 7 0.50 * vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6
2 6 1.00 * vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2
1 1 1.00 vunpcklpd %ymm6, %ymm1, %ymm14
1 1 1.00 vunpckhpd %ymm6, %ymm1, %ymm1
1 4 0.50 vsubpd %ymm14, %ymm10, %ymm6
1 3 1.00 vinsertf128 $1, %xmm2, %ymm15, %ymm7
1 4 0.50 vsubpd %ymm1, %ymm9, %ymm2
1 4 0.50 vsubpd %ymm7, %ymm8, %ymm0
1 4 0.50 vmulpd %ymm2, %ymm2, %ymm14
1 4 0.50 vfmadd231pd %ymm6, %ymm6, %ymm14
1 4 0.50 vfmadd231pd %ymm0, %ymm0, %ymm14
1 4 0.50 vcmpltpd %ymm5, %ymm14, %ymm1
1 1 0.50 vpcmpeqd %ymm7, %ymm7, %ymm7
2 3 1.00 vptest %ymm7, %ymm1
1 14 5.00 vdivpd %ymm14, %ymm4, %ymm7
2 11 0.50 * vmulpd 96(%rsp), %ymm7, %ymm14
1 4 0.50 vmulpd %ymm14, %ymm7, %ymm14
1 4 0.50 vmulpd %ymm14, %ymm7, %ymm15
1 4 0.50 vfmsub213pd %ymm3, %ymm7, %ymm14
2 11 0.50 * vmulpd 64(%rsp), %ymm7, %ymm7
1 4 0.50 vmulpd %ymm7, %ymm15, %ymm15
1 4 0.50 vmulpd %ymm14, %ymm15, %ymm7
1 4 0.50 vmulpd %ymm7, %ymm6, %ymm6
1 4 0.50 vmulpd %ymm7, %ymm2, %ymm2
1 1 0.33 vandpd %ymm6, %ymm1, %ymm6
1 4 0.50 vaddpd %ymm6, %ymm13, %ymm13
1 4 0.50 vmulpd %ymm7, %ymm0, %ymm6
1 1 0.33 vandpd %ymm2, %ymm1, %ymm0
1 1 0.33 vandpd %ymm6, %ymm1, %ymm1
1 4 0.50 vaddpd %ymm0, %ymm12, %ymm12
1 4 0.50 vaddpd %ymm1, %ymm11, %ymm11
1 1 0.25 addq $4, %rdx
1 1 0.25 cmpq %rsi, %rdx
1 1 0.50 jb ..B1.22
Resources:
[0] - SKXDivider
[1] - SKXFPDivider
[2] - SKXPort0
[3] - SKXPort1
[4] - SKXPort2
[5] - SKXPort3
[6] - SKXPort4
[7] - SKXPort5
[8] - SKXPort6
[9] - SKXPort7
Resource pressure per iteration:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
- 5.00 16.00 14.12 5.50 5.50 - 13.47 8.41 -
Resource pressure by instruction:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
- - - - 0.50 0.50 - - - - vmovdqu (%rbx,%rdx,4), %xmm0
- - 1.00 - - - - - - - vmovq %xmm0, %rcx
- - - - - - - 1.00 - - vpunpckhqdq %xmm0, %xmm0, %xmm2
- - 1.00 - - - - - - - vmovq %xmm2, %r15
- - - - - - - - 1.00 - movl %ecx, %r8d
- - 0.06 - - - - - 0.94 - shrq $32, %rcx
- - - 0.02 - - - 0.98 - - leal (%rcx,%rcx,2), %r14d
- - - 0.02 - - - 0.98 - - leal (%r8,%r8,2), %r8d
- - 0.47 0.02 - - - - 0.51 - movslq %r8d, %rcx
- - 0.46 0.02 - - - 0.01 0.51 - movslq %r14d, %r8
- - 0.03 0.01 - - - 0.45 0.51 - movl %r15d, %r14d
- - 0.51 - - - - - 0.49 - shrq $32, %r15
- - - - 0.49 0.51 - - - - vmovups (%r11,%rcx,8), %xmm7
- - - - 0.49 0.51 - - - - vmovups (%r11,%r8,8), %xmm6
- - - - 0.52 0.48 - - - - vmovq 16(%r11,%rcx,8), %xmm14
- - - 0.02 - - - 0.98 - - leal (%r14,%r14,2), %r14d
- - 0.01 0.01 - - - 0.01 0.97 - movslq %r14d, %r14
- - - 0.03 - - - 0.97 - - leal (%r15,%r15,2), %r15d
- - 0.04 - - - - - 0.96 - movslq %r15d, %r15
- - - - 0.07 0.93 - 1.00 - - vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15
- - 0.03 0.46 0.49 0.51 - 0.51 - - vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1
- - - - 0.51 0.49 - - - - vmovq 16(%r11,%r14,8), %xmm0
- - 0.47 0.02 0.93 0.07 - 0.51 - - vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6
- - - - 0.50 0.50 - 1.00 - - vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2
- - - - - - - 1.00 - - vunpcklpd %ymm6, %ymm1, %ymm14
- - - - - - - 1.00 - - vunpckhpd %ymm6, %ymm1, %ymm1
- - 0.01 0.99 - - - - - - vsubpd %ymm14, %ymm10, %ymm6
- - - - - - - 1.00 - - vinsertf128 $1, %xmm2, %ymm15, %ymm7
- - 0.96 0.04 - - - - - - vsubpd %ymm1, %ymm9, %ymm2
- - 0.49 0.51 - - - - - - vsubpd %ymm7, %ymm8, %ymm0
- - 0.48 0.52 - - - - - - vmulpd %ymm2, %ymm2, %ymm14
- - 0.03 0.97 - - - - - - vfmadd231pd %ymm6, %ymm6, %ymm14
- - 0.94 0.06 - - - - - - vfmadd231pd %ymm0, %ymm0, %ymm14
- - 0.47 0.53 - - - - - - vcmpltpd %ymm5, %ymm14, %ymm1
- - 0.96 0.04 - - - - - - vpcmpeqd %ymm7, %ymm7, %ymm7
- - 1.00 - - - - 1.00 - - vptest %ymm7, %ymm1
- 5.00 1.00 - - - - - - - vdivpd %ymm14, %ymm4, %ymm7
- - 0.93 0.07 0.49 0.51 - - - - vmulpd 96(%rsp), %ymm7, %ymm14
- - 0.05 0.95 - - - - - - vmulpd %ymm14, %ymm7, %ymm14
- - 0.02 0.98 - - - - - - vmulpd %ymm14, %ymm7, %ymm15
- - 0.98 0.02 - - - - - - vfmsub213pd %ymm3, %ymm7, %ymm14
- - 0.07 0.93 0.51 0.49 - - - - vmulpd 64(%rsp), %ymm7, %ymm7
- - 0.01 0.99 - - - - - - vmulpd %ymm7, %ymm15, %ymm15
- - 0.01 0.99 - - - - - - vmulpd %ymm14, %ymm15, %ymm7
- - 0.03 0.97 - - - - - - vmulpd %ymm7, %ymm6, %ymm6
- - 0.97 0.03 - - - - - - vmulpd %ymm7, %ymm2, %ymm2
- - 0.03 0.90 - - - 0.07 - - vandpd %ymm6, %ymm1, %ymm6
- - 0.06 0.94 - - - - - - vaddpd %ymm6, %ymm13, %ymm13
- - 0.03 0.97 - - - - - - vmulpd %ymm7, %ymm0, %ymm6
- - 0.46 0.08 - - - 0.46 - - vandpd %ymm2, %ymm1, %ymm0
- - 0.47 0.01 - - - 0.52 - - vandpd %ymm6, %ymm1, %ymm1
- - 0.48 0.52 - - - - - - vaddpd %ymm0, %ymm12, %ymm12
- - 0.52 0.48 - - - - - - vaddpd %ymm1, %ymm11, %ymm11
- - 0.01 - - - - - 0.99 - addq $4, %rdx
- - - - - - - 0.02 0.98 - cmpq %rsi, %rdx
- - 0.45 - - - - - 0.55 - jb ..B1.22

View File

@ -0,0 +1,158 @@
[0] Code Region
Iterations: 100
Instructions: 5600
Total Cycles: 2306
Total uOps: 6300
Dispatch Width: 6
uOps Per Cycle: 2.73
IPC: 2.43
Block RThroughput: 10.5
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 6 0.50 * vmovdqu (%rbx,%rdx,4), %xmm0
1 2 1.00 vmovq %xmm0, %rcx
1 1 0.50 vpunpckhqdq %xmm0, %xmm0, %xmm2
1 2 1.00 vmovq %xmm2, %r15
1 1 0.25 movl %ecx, %r8d
1 1 0.50 shrq $32, %rcx
1 1 0.50 leal (%rcx,%rcx,2), %r14d
1 1 0.50 leal (%r8,%r8,2), %r8d
1 1 0.25 movslq %r8d, %rcx
1 1 0.25 movslq %r14d, %r8
1 1 0.25 movl %r15d, %r14d
1 1 0.50 shrq $32, %r15
1 6 0.50 * vmovups (%r11,%rcx,8), %xmm7
1 6 0.50 * vmovups (%r11,%r8,8), %xmm6
1 5 0.50 * vmovq 16(%r11,%rcx,8), %xmm14
1 1 0.50 leal (%r14,%r14,2), %r14d
1 1 0.25 movslq %r14d, %r14
1 1 0.50 leal (%r15,%r15,2), %r15d
1 1 0.25 movslq %r15d, %r15
2 6 1.00 * vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15
2 7 0.50 * vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1
1 5 0.50 * vmovq 16(%r11,%r14,8), %xmm0
2 7 0.50 * vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6
2 6 1.00 * vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2
1 1 1.00 vunpcklpd %ymm6, %ymm1, %ymm14
1 1 1.00 vunpckhpd %ymm6, %ymm1, %ymm1
1 4 0.50 vsubpd %ymm14, %ymm10, %ymm6
1 3 1.00 vinsertf128 $1, %xmm2, %ymm15, %ymm7
1 4 0.50 vsubpd %ymm1, %ymm9, %ymm2
1 4 0.50 vsubpd %ymm7, %ymm8, %ymm0
1 4 0.50 vmulpd %ymm2, %ymm2, %ymm14
1 4 0.50 vfmadd231pd %ymm6, %ymm6, %ymm14
1 4 0.50 vfmadd231pd %ymm0, %ymm0, %ymm14
1 4 0.50 vcmpltpd %ymm5, %ymm14, %ymm1
1 1 0.50 vpcmpeqd %ymm7, %ymm7, %ymm7
2 3 1.00 vptest %ymm7, %ymm1
1 14 5.00 vdivpd %ymm14, %ymm4, %ymm7
2 11 0.50 * vmulpd 96(%rsp), %ymm7, %ymm14
1 4 0.50 vmulpd %ymm14, %ymm7, %ymm14
1 4 0.50 vmulpd %ymm14, %ymm7, %ymm15
1 4 0.50 vfmsub213pd %ymm3, %ymm7, %ymm14
2 11 0.50 * vmulpd 64(%rsp), %ymm7, %ymm7
1 4 0.50 vmulpd %ymm7, %ymm15, %ymm15
1 4 0.50 vmulpd %ymm14, %ymm15, %ymm7
1 4 0.50 vmulpd %ymm7, %ymm6, %ymm6
1 4 0.50 vmulpd %ymm7, %ymm2, %ymm2
1 1 0.33 vandpd %ymm6, %ymm1, %ymm6
1 4 0.50 vaddpd %ymm6, %ymm13, %ymm13
1 4 0.50 vmulpd %ymm7, %ymm0, %ymm6
1 1 0.33 vandpd %ymm2, %ymm1, %ymm0
1 1 0.33 vandpd %ymm6, %ymm1, %ymm1
1 4 0.50 vaddpd %ymm0, %ymm12, %ymm12
1 4 0.50 vaddpd %ymm1, %ymm11, %ymm11
1 1 0.25 addq $4, %rdx
1 1 0.25 cmpq %rsi, %rdx
1 1 0.50 jb ..B1.22
Resources:
[0] - ICXDivider
[1] - ICXFPDivider
[2] - ICXPort0
[3] - ICXPort1
[4] - ICXPort2
[5] - ICXPort3
[6] - ICXPort4
[7] - ICXPort5
[8] - ICXPort6
[9] - ICXPort7
[10] - ICXPort8
[11] - ICXPort9
Resource pressure per iteration:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11]
- 5.00 15.12 15.03 5.50 5.50 - 13.45 8.40 - - -
Resource pressure by instruction:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions:
- - - - 0.50 0.50 - - - - - - vmovdqu (%rbx,%rdx,4), %xmm0
- - 1.00 - - - - - - - - - vmovq %xmm0, %rcx
- - - 0.46 - - - 0.54 - - - - vpunpckhqdq %xmm0, %xmm0, %xmm2
- - 1.00 - - - - - - - - - vmovq %xmm2, %r15
- - - - - - - - 1.00 - - - movl %ecx, %r8d
- - 0.96 - - - - - 0.04 - - - shrq $32, %rcx
- - - 0.01 - - - 0.99 - - - - leal (%rcx,%rcx,2), %r14d
- - - 0.03 - - - 0.97 - - - - leal (%r8,%r8,2), %r8d
- - 0.48 0.01 - - - - 0.51 - - - movslq %r8d, %rcx
- - 0.02 0.02 - - - 0.01 0.95 - - - movslq %r14d, %r8
- - 0.02 - - - - - 0.98 - - - movl %r15d, %r14d
- - 0.52 - - - - - 0.48 - - - shrq $32, %r15
- - - - 0.49 0.51 - - - - - - vmovups (%r11,%rcx,8), %xmm7
- - - - 0.49 0.51 - - - - - - vmovups (%r11,%r8,8), %xmm6
- - - - 0.52 0.48 - - - - - - vmovq 16(%r11,%rcx,8), %xmm14
- - - 0.47 - - - 0.53 - - - - leal (%r14,%r14,2), %r14d
- - 0.01 0.01 - - - 0.01 0.97 - - - movslq %r14d, %r14
- - - 0.04 - - - 0.96 - - - - leal (%r15,%r15,2), %r15d
- - 0.48 - - - - 0.01 0.51 - - - movslq %r15d, %r15
- - - - 0.51 0.49 - 1.00 - - - - vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15
- - 0.02 0.01 0.95 0.05 - 0.97 - - - - vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1
- - - - 0.05 0.95 - - - - - - vmovq 16(%r11,%r14,8), %xmm0
- - 0.02 0.49 0.49 0.51 - 0.49 - - - - vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6
- - - - 0.50 0.50 - 1.00 - - - - vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2
- - - - - - - 1.00 - - - - vunpcklpd %ymm6, %ymm1, %ymm14
- - - - - - - 1.00 - - - - vunpckhpd %ymm6, %ymm1, %ymm1
- - 0.47 0.53 - - - - - - - - vsubpd %ymm14, %ymm10, %ymm6
- - - - - - - 1.00 - - - - vinsertf128 $1, %xmm2, %ymm15, %ymm7
- - 0.50 0.50 - - - - - - - - vsubpd %ymm1, %ymm9, %ymm2
- - 0.94 0.06 - - - - - - - - vsubpd %ymm7, %ymm8, %ymm0
- - 0.06 0.94 - - - - - - - - vmulpd %ymm2, %ymm2, %ymm14
- - 0.04 0.96 - - - - - - - - vfmadd231pd %ymm6, %ymm6, %ymm14
- - 0.95 0.05 - - - - - - - - vfmadd231pd %ymm0, %ymm0, %ymm14
- - 0.02 0.98 - - - - - - - - vcmpltpd %ymm5, %ymm14, %ymm1
- - 0.05 0.95 - - - - - - - - vpcmpeqd %ymm7, %ymm7, %ymm7
- - 1.00 - - - - 1.00 - - - - vptest %ymm7, %ymm1
- 5.00 1.00 - - - - - - - - - vdivpd %ymm14, %ymm4, %ymm7
- - 0.51 0.49 0.49 0.51 - - - - - - vmulpd 96(%rsp), %ymm7, %ymm14
- - 0.04 0.96 - - - - - - - - vmulpd %ymm14, %ymm7, %ymm14
- - 0.01 0.99 - - - - - - - - vmulpd %ymm14, %ymm7, %ymm15
- - 0.99 0.01 - - - - - - - - vfmsub213pd %ymm3, %ymm7, %ymm14
- - 0.49 0.51 0.51 0.49 - - - - - - vmulpd 64(%rsp), %ymm7, %ymm7
- - 0.01 0.99 - - - - - - - - vmulpd %ymm7, %ymm15, %ymm15
- - 0.01 0.99 - - - - - - - - vmulpd %ymm14, %ymm15, %ymm7
- - 0.48 0.52 - - - - - - - - vmulpd %ymm7, %ymm6, %ymm6
- - 0.52 0.48 - - - - - - - - vmulpd %ymm7, %ymm2, %ymm2
- - 0.46 0.02 - - - 0.52 - - - - vandpd %ymm6, %ymm1, %ymm6
- - 0.49 0.51 - - - - - - - - vaddpd %ymm6, %ymm13, %ymm13
- - 0.48 0.52 - - - - - - - - vmulpd %ymm7, %ymm0, %ymm6
- - 0.02 0.52 - - - 0.46 - - - - vandpd %ymm2, %ymm1, %ymm0
- - 0.02 - - - - 0.98 - - - - vandpd %ymm6, %ymm1, %ymm1
- - 0.49 0.51 - - - - - - - - vaddpd %ymm0, %ymm12, %ymm12
- - 0.51 0.49 - - - - - - - - vaddpd %ymm1, %ymm11, %ymm11
- - 0.01 - - - - - 0.99 - - - addq $4, %rdx
- - 0.01 - - - - 0.01 0.98 - - - cmpq %rsi, %rdx
- - 0.01 - - - - - 0.99 - - - jb ..B1.22

View File

@ -0,0 +1,97 @@
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
Analyzed file: lammps-icc-avx2.s
Architecture: CSX
Timestamp: 2023-02-10 16:29:58
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
----------------------------------------------------------------------------------------------------
256 | | | | | | | | || | | # pointer_increment=32 724d27eafcb27eabca1528ddfdbdba3e
257 | | | | | | | | || | | # LLVM-MCA-BEGIN
258 | | | | | | | | || | | ..B1.22: # Preds ..B1.24 ..B1.21
259 | | | | | | | | || | | # Execution count [2.50e+01]
260 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovdqu (%rbx,%rdx,4), %xmm0 #60.21
261 | 1.00 | | | | | | | || 1.0 | | vmovq %xmm0, %rcx #60.21
262 | | | | | | 1.000 | | || | | vpunpckhqdq %xmm0, %xmm0, %xmm2 #60.21
263 | 1.00 | | | | | | | || | | vmovq %xmm2, %r15 #60.21
264 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || 1.0 | | movl %ecx, %r8d #60.21
265 | 0.00 | | | | | | 1.00 | || | | shrq $32, %rcx #60.21
266 | | 0.500 | | | | 0.500 | | || | | lea (%rcx,%rcx,2), %r14d #61.36
267 | | 0.500 | | | | 0.500 | | || 1.0 | | lea (%r8,%r8,2), %r8d #61.36
268 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || 1.0 | | movslq %r8d, %rcx #61.36
269 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movslq %r14d, %r8 #61.36
270 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movl %r15d, %r14d #60.21
271 | 0.00 | | | | | | 1.00 | || | | shrq $32, %r15 #60.21
272 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovups (%r11,%rcx,8), %xmm7 #61.36
273 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%r11,%r8,8), %xmm6 #61.36
274 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovq 16(%r11,%rcx,8), %xmm14 #61.36
275 | | 0.500 | | | | 0.500 | | || | | lea (%r14,%r14,2), %r14d #61.36
276 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movslq %r14d, %r14 #61.36
277 | | 0.500 | | | | 0.500 | | || | | lea (%r15,%r15,2), %r15d #61.36
278 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movslq %r15d, %r15 #61.36
279 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || | | vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15 #61.36
280 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || 3.0 | | vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1 #61.36
281 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovq 16(%r11,%r14,8), %xmm0 #61.36
282 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || | | vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6 #61.36
283 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || | | vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2 #61.36
284 | | | | | | 1.000 | | || | | vunpcklpd %ymm6, %ymm1, %ymm14 #61.36
285 | | | | | | 1.000 | | || 1.0 | | vunpckhpd %ymm6, %ymm1, %ymm1 #61.36
286 | 0.50 | 0.500 | | | | | | || | | vsubpd %ymm14, %ymm10, %ymm6 #61.36
287 | | | | | | 1.000 | | || | | vinsertf128 $1, %xmm2, %ymm15, %ymm7 #61.36
288 | 0.50 | 0.500 | | | | | | || 4.0 | | vsubpd %ymm1, %ymm9, %ymm2 #62.36
289 | 0.50 | 0.500 | | | | | | || | | vsubpd %ymm7, %ymm8, %ymm0 #63.36
290 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm2, %ymm2, %ymm14 #64.49
291 | 0.50 | 0.500 | | | | | | || 4.0 | | vfmadd231pd %ymm6, %ymm6, %ymm14 #64.49
292 | 0.50 | 0.500 | | | | | | || 4.0 | | vfmadd231pd %ymm0, %ymm0, %ymm14 #64.63
293 | | | | | | 1.000 | | || | | vcmpltpd %ymm5, %ymm14, %ymm1 #74.22
294 | 0.50 | 0.500 | | | | | | || | | vpcmpeqd %ymm7, %ymm7, %ymm7 #74.22
295 | 1.00 | | | | | 1.000 | | || | | vptest %ymm7, %ymm1 #74.22
296 | | | | | | | | || | | #je ..B1.24 # Prob 50% #74.22
297 | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ymm14
298 | | | | | | | | || | | ..B1.23: # Preds ..B1.22
299 | | | | | | | | || | | # Execution count [1.25e+01]
300 | 1.00 8.00 | | | | | | | || 15.0 | | vdivpd %ymm14, %ymm4, %ymm7 #75.39
301 | 0.50 | 0.500 | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmulpd 96(%rsp), %ymm7, %ymm14 #76.38[spill]
302 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm14 #76.44
303 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm15 #76.50
304 | 0.50 | 0.500 | | | | | | || | | vfmsub213pd %ymm3, %ymm7, %ymm14 #77.55
305 | 0.50 | 0.500 | 0.50 0.50 | 0.50 0.50 | | | | || | | vmulpd 64(%rsp), %ymm7, %ymm7 #77.55[spill]
306 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm7, %ymm15, %ymm15 #77.64
307 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm14, %ymm15, %ymm7 #77.70
308 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm7, %ymm6, %ymm6 #78.31
309 | 0.50 | 0.500 | | | | | | || | | vmulpd %ymm7, %ymm2, %ymm2 #79.31
310 | 0.25 | 0.253 | | | | 0.493 | | || 1.0 | | vandpd %ymm6, %ymm1, %ymm6 #78.31
311 | 0.50 | 0.500 | | | | | | || 4.0 | | vaddpd %ymm6, %ymm13, %ymm13 #78.17
312 | 0.25 | 0.750 | | | | | | || | | vmulpd %ymm7, %ymm0, %ymm6 #80.31
313 | 0.16 | 0.417 | | | | 0.423 | | || | | vandpd %ymm2, %ymm1, %ymm0 #79.31
314 | 0.00 | 0.250 | | | | 0.750 | | || | | vandpd %ymm6, %ymm1, %ymm1 #80.31
315 | 0.00 | 1.000 | | | | | | || | | vaddpd %ymm0, %ymm12, %ymm12 #79.17
316 | 0.50 | 0.500 | | | | | | || | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17
317 | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm3 ymm4 ymm5 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13
318 | | | | | | | | || | | ..B1.24: # Preds ..B1.23 ..B1.22
319 | | | | | | | | || | | # Execution count [2.50e+01]
320 | 0.00 | 0.000 | | | | -0.01 | 1.00 | || | | addq $4, %rdx #59.9
321 | 0.00 | -0.01 | | | | 0.000 | 1.00 | || | | cmpq %rsi, %rdx #59.9
322 | | | | | | | | || | | * jb ..B1.22 # Prob 82% #59.9
323 | | | | | | | | || | | # LLVM-MCA-END
13.7 8.00 13.66 5.50 5.50 5.50 5.50 13.66 10.0 76.0 4.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
316 | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17| [316]
315 | 4.0 | vaddpd %ymm0, %ymm12, %ymm12 #79.17| [315]
311 | 4.0 | vaddpd %ymm6, %ymm13, %ymm13 #78.17| [311]
320 | 1.0 | addq $4, %rdx #59.9| [320]

View File

@ -0,0 +1,97 @@
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
Analyzed file: lammps-icc-avx2.s
Architecture: ICX
Timestamp: 2023-02-10 16:29:48
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD |
-----------------------------------------------------------------------------------------------------------------------
256 | | | | | | | | | | || | | # pointer_increment=32 724d27eafcb27eabca1528ddfdbdba3e
257 | | | | | | | | | | || | | # LLVM-MCA-BEGIN
258 | | | | | | | | | | || | | ..B1.22: # Preds ..B1.24 ..B1.21
259 | | | | | | | | | | || | | # Execution count [2.50e+01]
260 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovdqu (%rbx,%rdx,4), %xmm0 #60.21
261 | 1.00 | | | | | | | | | || 1.0 | | vmovq %xmm0, %rcx #60.21
262 | | 0.50 | | | | 0.50 | | | | || | | vpunpckhqdq %xmm0, %xmm0, %xmm2 #60.21
263 | 1.00 | | | | | | | | | || | | vmovq %xmm2, %r15 #60.21
264 | 0.37 | 0.00 | | | | 0.25 | 0.38 | | | || 1.0 | | movl %ecx, %r8d #60.21
265 | 0.50 | | | | | | 0.50 | | | || | | shrq $32, %rcx #60.21
266 | 0.13 | 0.00 | | | | 0.00 | 0.87 | | | || | | lea (%rcx,%rcx,2), %r14d #61.36
267 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || 6.0 | | lea (%r8,%r8,2), %r8d #61.36
268 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || 1.0 | | movslq %r8d, %rcx #61.36
269 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movslq %r14d, %r8 #61.36
270 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movl %r15d, %r14d #60.21
271 | 0.00 | | | | | | 1.00 | | | || | | shrq $32, %r15 #60.21
272 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovups (%r11,%rcx,8), %xmm7 #61.36
273 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovups (%r11,%r8,8), %xmm6 #61.36
274 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovq 16(%r11,%rcx,8), %xmm14 #61.36
275 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | lea (%r14,%r14,2), %r14d #61.36
276 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movslq %r14d, %r14 #61.36
277 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | lea (%r15,%r15,2), %r15d #61.36
278 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movslq %r15d, %r15 #61.36
279 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || | | vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15 #61.36
280 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || 3.0 | | vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1 #61.36
281 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovq 16(%r11,%r14,8), %xmm0 #61.36
282 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || | | vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6 #61.36
283 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || | | vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2 #61.36
284 | | | | | | 1.00 | | | | || | | vunpcklpd %ymm6, %ymm1, %ymm14 #61.36
285 | | | | | | 1.00 | | | | || 1.0 | | vunpckhpd %ymm6, %ymm1, %ymm1 #61.36
286 | 0.50 | 0.50 | | | | | | | | || | | vsubpd %ymm14, %ymm10, %ymm6 #61.36
287 | | | | | | 1.00 | | | | || | | vinsertf128 $1, %xmm2, %ymm15, %ymm7 #61.36
288 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vsubpd %ymm1, %ymm9, %ymm2 #62.36
289 | 0.50 | 0.50 | | | | | | | | || | | vsubpd %ymm7, %ymm8, %ymm0 #63.36
290 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm2, %ymm2, %ymm14 #64.49
291 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vfmadd231pd %ymm6, %ymm6, %ymm14 #64.49
292 | 0.75 | 0.25 | | | | | | | | || 4.0 | | vfmadd231pd %ymm0, %ymm0, %ymm14 #64.63
293 | 0.00 | | | | | 1.00 | | | | || | | vcmpltpd %ymm5, %ymm14, %ymm1 #74.22
294 | 0.50 | 0.50 | | | | | | | | || | | vpcmpeqd %ymm7, %ymm7, %ymm7 #74.22
295 | 1.00 | | | | | 1.00 | | | | || | | vptest %ymm7, %ymm1 #74.22
296 | | | | | | | | | | || | | #je ..B1.24 # Prob 50% #74.22
297 | | | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ymm14
298 | | | | | | | | | | || | | ..B1.23: # Preds ..B1.22
299 | | | | | | | | | | || | | # Execution count [1.25e+01]
300 | 1.00 8.00 | | | | | | | | | || 13.0 | | vdivpd %ymm14, %ymm4, %ymm7 #75.39
301 | 0.50 | 0.50 | 0.50 0.50 | 0.50 0.50 | | | | | | || 4.0 | | vmulpd 96(%rsp), %ymm7, %ymm14 #76.38[spill]
302 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm14 #76.44
303 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm15 #76.50
304 | 0.50 | 0.50 | | | | | | | | || | | vfmsub213pd %ymm3, %ymm7, %ymm14 #77.55
305 | 0.50 | 0.50 | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmulpd 64(%rsp), %ymm7, %ymm7 #77.55[spill]
306 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm7, %ymm15, %ymm15 #77.64
307 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm14, %ymm15, %ymm7 #77.70
308 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm7, %ymm6, %ymm6 #78.31
309 | 0.00 | 1.00 | | | | | | | | || | | vmulpd %ymm7, %ymm2, %ymm2 #79.31
310 | 0.00 | 0.00 | | | | 1.00 | | | | || 1.0 | | vandpd %ymm6, %ymm1, %ymm6 #78.31
311 | 0.00 | 1.00 | | | | | | | | || 4.0 | | vaddpd %ymm6, %ymm13, %ymm13 #78.17
312 | 0.00 | 1.00 | | | | | | | | || | | vmulpd %ymm7, %ymm0, %ymm6 #80.31
313 | 0.00 | 0.00 | | | | 1.00 | | | | || | | vandpd %ymm2, %ymm1, %ymm0 #79.31
314 | 0.00 | 0.00 | | | | 1.00 | | | | || | | vandpd %ymm6, %ymm1, %ymm1 #80.31
315 | 0.00 | 1.00 | | | | | | | | || | | vaddpd %ymm0, %ymm12, %ymm12 #79.17
316 | 0.00 | 1.00 | | | | | | | | || | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17
317 | | | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm3 ymm4 ymm5 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13
318 | | | | | | | | | | || | | ..B1.24: # Preds ..B1.23 ..B1.22
319 | | | | | | | | | | || | | # Execution count [2.50e+01]
320 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | addq $4, %rdx #59.9
321 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | cmpq %rsi, %rdx #59.9
322 | | | | | | | | | | || | | * jb ..B1.22 # Prob 82% #59.9
323 | | | | | | | | | | || | | # LLVM-MCA-END
12.8 8.00 12.8 5.50 5.50 5.50 5.50 12.8 12.8 81 4.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
316 | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17| [316]
315 | 4.0 | vaddpd %ymm0, %ymm12, %ymm12 #79.17| [315]
311 | 4.0 | vaddpd %ymm6, %ymm13, %ymm13 #78.17| [311]
320 | 1.0 | addq $4, %rdx #59.9| [320]

View File

@ -0,0 +1,75 @@
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
Analyzed File - lammps-icc-avx512.o
Binary Format - 64Bit
Architecture - SKX
Analysis Type - Throughput
Throughput Analysis Report
--------------------------
Block Throughput: 30.89 Cycles Throughput Bottleneck: Backend
Loop Count: 22
Port Binding In Cycles Per Iteration:
--------------------------------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
--------------------------------------------------------------------------------------------------
| Cycles | 19.0 0.0 | 4.0 | 13.0 13.0 | 13.0 13.0 | 0.0 | 17.0 | 4.0 | 0.0 |
--------------------------------------------------------------------------------------------------
DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3)
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion occurred
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
X - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
-----------------------------------------------------------------------------------------
| 1 | | | | | | 1.0 | | | vpcmpgtd k5, ymm3, ymm4
| 1 | | 1.0 | | | | | | | vpaddd ymm4, ymm4, ymm15
| 2 | | 1.0 | 1.0 1.0 | | | | | | vmovdqu32 ymm17{k5}{z}, ymmword ptr [r10+r15*4]
| 1 | | 1.0 | | | | | | | vpaddd ymm18, ymm17, ymm17
| 1 | | | | | | | 1.0 | | add r15, 0x8
| 1 | | 1.0 | | | | | | | vpaddd ymm19, ymm17, ymm18
| 1 | 1.0 | | | | | | | | kmovw k2, k5
| 1 | 1.0 | | | | | | | | kmovw k3, k5
| 1 | 1.0 | | | | | | | | kmovw k1, k5
| 1* | | | | | | | | | vpxord zmm21, zmm21, zmm21
| 1* | | | | | | | | | vpxord zmm20, zmm20, zmm20
| 1* | | | | | | | | | vpxord zmm22, zmm22, zmm22
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm21, k2, zmmword ptr [rbx+ymm19*8+0x8]
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm20, k3, zmmword ptr [rbx+ymm19*8]
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm22, k1, zmmword ptr [rbx+ymm19*8+0x10]
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm18, zmm1, zmm21
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm17, zmm2, zmm20
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm19, zmm0, zmm22
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm31, zmm18, zmm18
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm31, zmm17, zmm17
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm31, zmm19, zmm19
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm30, zmm31
| 1 | | | | | | 1.0 | | | vcmppd k6{k5}, zmm31, zmm14, 0x1
| 1 | | | | | | 1.0 | | | vfpclasspd k0, zmm30, 0x1e
| 1* | | | | | | | | | vmovaps zmm23, zmm31
| 2^ | 1.0 | | | 1.0 1.0 | | | | | vfnmadd213pd zmm23, zmm30, qword ptr [rip]{1to8}
| 1 | 1.0 | | | | | | | | knotw k4, k0
| 1 | | | | | | 1.0 | | | vmulpd zmm24, zmm23, zmm23
| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm30{k4}, zmm23, zmm30
| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm30{k4}, zmm24, zmm30
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm25, zmm30, zmm13
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm27, zmm30, zmm12
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm28, zmm30, zmm25
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm26, zmm30, zmm28
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213pd zmm30, zmm28, zmm5
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm29, zmm26, zmm27
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm23, zmm29, zmm30
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm10{k6}, zmm23, zmm17
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm9{k6}, zmm23, zmm18
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm8{k6}, zmm23, zmm19
| 1* | | | | | | | | | cmp r15, r14
| 0*F | | | | | | | | | jb 0xffffffffffffff0c
Total Num Of Uops: 57
Analysis Notes:
Backend allocation was stalled due to unavailable allocation resources.
There were bubbles in the frontend.

View File

@ -0,0 +1,128 @@
[0] Code Region
Iterations: 100
Instructions: 4200
Total Cycles: 2465
Total uOps: 5800
Dispatch Width: 6
uOps Per Cycle: 2.35
IPC: 1.70
Block RThroughput: 13.0
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 4 1.00 vpcmpgtd %ymm4, %ymm3, %k5
1 1 0.33 vpaddd %ymm15, %ymm4, %ymm4
2 8 0.50 * vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z}
1 1 0.33 vpaddd %ymm17, %ymm17, %ymm18
1 1 0.25 addq $8, %r15
1 1 0.33 vpaddd %ymm18, %ymm17, %ymm19
1 1 1.00 kmovw %k5, %k2
1 1 1.00 kmovw %k5, %k3
1 1 1.00 kmovw %k5, %k1
1 0 0.17 vpxord %zmm21, %zmm21, %zmm21
1 0 0.17 vpxord %zmm20, %zmm20, %zmm20
1 0 0.17 vpxord %zmm22, %zmm22, %zmm22
5 21 4.00 * vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2}
5 21 4.00 * vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3}
5 21 4.00 * vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1}
1 4 0.50 vsubpd %zmm21, %zmm1, %zmm18
1 4 0.50 vsubpd %zmm20, %zmm2, %zmm17
1 4 0.50 vsubpd %zmm22, %zmm0, %zmm19
1 4 0.50 vmulpd %zmm18, %zmm18, %zmm31
1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm31
1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm31
3 4 2.00 vrcp14pd %zmm31, %zmm30
1 4 1.00 vcmpltpd %zmm14, %zmm31, %k6 {%k5}
1 4 1.00 vfpclasspd $30, %zmm30, %k0
1 1 0.50 vmovaps %zmm31, %zmm23
2 11 0.50 * vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
1 1 1.00 knotw %k0, %k4
1 4 0.50 vmulpd %zmm23, %zmm23, %zmm24
1 4 0.50 vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4}
1 4 0.50 vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4}
1 4 0.50 vmulpd %zmm13, %zmm30, %zmm25
1 4 0.50 vmulpd %zmm12, %zmm30, %zmm27
1 4 0.50 vmulpd %zmm25, %zmm30, %zmm28
1 4 0.50 vmulpd %zmm28, %zmm30, %zmm26
1 4 0.50 vfmsub213pd %zmm5, %zmm28, %zmm30
1 4 0.50 vmulpd %zmm27, %zmm26, %zmm29
1 4 0.50 vmulpd %zmm30, %zmm29, %zmm23
1 4 0.50 vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6}
1 4 0.50 vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6}
1 4 0.50 vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6}
1 1 0.25 cmpq %r14, %r15
1 1 0.50 jb ..B1.16
Resources:
[0] - SKXDivider
[1] - SKXFPDivider
[2] - SKXPort0
[3] - SKXPort1
[4] - SKXPort2
[5] - SKXPort3
[6] - SKXPort4
[7] - SKXPort5
[8] - SKXPort6
[9] - SKXPort7
Resource pressure per iteration:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
- - 19.02 6.79 12.64 13.36 - 16.03 5.16 -
Resource pressure by instruction:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
- - - - - - - 1.00 - - vpcmpgtd %ymm4, %ymm3, %k5
- - 0.28 0.72 - - - - - - vpaddd %ymm15, %ymm4, %ymm4
- - 0.14 0.71 0.55 0.45 - 0.15 - - vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z}
- - - 0.97 - - - 0.03 - - vpaddd %ymm17, %ymm17, %ymm18
- - 0.14 0.41 - - - 0.13 0.32 - addq $8, %r15
- - - 0.99 - - - 0.01 - - vpaddd %ymm18, %ymm17, %ymm19
- - 1.00 - - - - - - - kmovw %k5, %k2
- - 1.00 - - - - - - - kmovw %k5, %k3
- - 1.00 - - - - - - - kmovw %k5, %k1
- - - - - - - - - - vpxord %zmm21, %zmm21, %zmm21
- - - - - - - - - - vpxord %zmm20, %zmm20, %zmm20
- - - - - - - - - - vpxord %zmm22, %zmm22, %zmm22
- - 1.00 0.99 3.52 4.48 - 0.01 1.00 - vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2}
- - 1.00 0.99 4.48 3.52 - 0.01 1.00 - vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3}
- - 1.00 1.00 3.52 4.48 - - 1.00 - vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1}
- - 0.02 - - - - 0.98 - - vsubpd %zmm21, %zmm1, %zmm18
- - 0.17 - - - - 0.83 - - vsubpd %zmm20, %zmm2, %zmm17
- - 0.18 - - - - 0.82 - - vsubpd %zmm22, %zmm0, %zmm19
- - 0.01 - - - - 0.99 - - vmulpd %zmm18, %zmm18, %zmm31
- - 0.69 - - - - 0.31 - - vfmadd231pd %zmm17, %zmm17, %zmm31
- - 0.68 - - - - 0.32 - - vfmadd231pd %zmm19, %zmm19, %zmm31
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm31, %zmm30
- - - - - - - 1.00 - - vcmpltpd %zmm14, %zmm31, %k6 {%k5}
- - - - - - - 1.00 - - vfpclasspd $30, %zmm30, %k0
- - 0.83 - - - - 0.17 - - vmovaps %zmm31, %zmm23
- - 1.00 - 0.57 0.43 - - - - vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
- - 1.00 - - - - - - - knotw %k0, %k4
- - 0.44 - - - - 0.56 - - vmulpd %zmm23, %zmm23, %zmm24
- - 0.56 - - - - 0.44 - - vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4}
- - 0.55 - - - - 0.45 - - vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4}
- - 0.69 - - - - 0.31 - - vmulpd %zmm13, %zmm30, %zmm25
- - 0.31 - - - - 0.69 - - vmulpd %zmm12, %zmm30, %zmm27
- - 0.56 - - - - 0.44 - - vmulpd %zmm25, %zmm30, %zmm28
- - 0.02 - - - - 0.98 - - vmulpd %zmm28, %zmm30, %zmm26
- - 0.98 - - - - 0.02 - - vfmsub213pd %zmm5, %zmm28, %zmm30
- - 0.30 - - - - 0.70 - - vmulpd %zmm27, %zmm26, %zmm29
- - 0.16 - - - - 0.84 - - vmulpd %zmm30, %zmm29, %zmm23
- - 0.17 - - - - 0.83 - - vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6}
- - 0.83 - - - - 0.17 - - vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6}
- - 0.17 - - - - 0.83 - - vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6}
- - - 0.01 - - - 0.01 0.98 - cmpq %r14, %r15
- - 0.14 - - - - - 0.86 - jb ..B1.16

View File

@ -0,0 +1,130 @@
[0] Code Region
Iterations: 100
Instructions: 4200
Total Cycles: 2465
Total uOps: 5800
Dispatch Width: 6
uOps Per Cycle: 2.35
IPC: 1.70
Block RThroughput: 13.0
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 4 1.00 vpcmpgtd %ymm4, %ymm3, %k5
1 1 0.33 vpaddd %ymm15, %ymm4, %ymm4
2 8 0.50 * vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z}
1 1 0.33 vpaddd %ymm17, %ymm17, %ymm18
1 1 0.25 addq $8, %r15
1 1 0.33 vpaddd %ymm18, %ymm17, %ymm19
1 1 1.00 kmovw %k5, %k2
1 1 1.00 kmovw %k5, %k3
1 1 1.00 kmovw %k5, %k1
1 0 0.17 vpxord %zmm21, %zmm21, %zmm21
1 0 0.17 vpxord %zmm20, %zmm20, %zmm20
1 0 0.17 vpxord %zmm22, %zmm22, %zmm22
5 21 4.00 * vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2}
5 21 4.00 * vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3}
5 21 4.00 * vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1}
1 4 0.50 vsubpd %zmm21, %zmm1, %zmm18
1 4 0.50 vsubpd %zmm20, %zmm2, %zmm17
1 4 0.50 vsubpd %zmm22, %zmm0, %zmm19
1 4 0.50 vmulpd %zmm18, %zmm18, %zmm31
1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm31
1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm31
3 4 2.00 vrcp14pd %zmm31, %zmm30
1 4 1.00 vcmpltpd %zmm14, %zmm31, %k6 {%k5}
1 4 1.00 vfpclasspd $30, %zmm30, %k0
1 1 0.50 vmovaps %zmm31, %zmm23
2 11 0.50 * vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
1 1 1.00 knotw %k0, %k4
1 4 0.50 vmulpd %zmm23, %zmm23, %zmm24
1 4 0.50 vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4}
1 4 0.50 vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4}
1 4 0.50 vmulpd %zmm13, %zmm30, %zmm25
1 4 0.50 vmulpd %zmm12, %zmm30, %zmm27
1 4 0.50 vmulpd %zmm25, %zmm30, %zmm28
1 4 0.50 vmulpd %zmm28, %zmm30, %zmm26
1 4 0.50 vfmsub213pd %zmm5, %zmm28, %zmm30
1 4 0.50 vmulpd %zmm27, %zmm26, %zmm29
1 4 0.50 vmulpd %zmm30, %zmm29, %zmm23
1 4 0.50 vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6}
1 4 0.50 vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6}
1 4 0.50 vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6}
1 1 0.25 cmpq %r14, %r15
1 1 0.50 jb ..B1.16
Resources:
[0] - ICXDivider
[1] - ICXFPDivider
[2] - ICXPort0
[3] - ICXPort1
[4] - ICXPort2
[5] - ICXPort3
[6] - ICXPort4
[7] - ICXPort5
[8] - ICXPort6
[9] - ICXPort7
[10] - ICXPort8
[11] - ICXPort9
Resource pressure per iteration:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11]
- - 19.02 6.79 12.64 13.36 - 16.03 5.16 - - -
Resource pressure by instruction:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions:
- - - - - - - 1.00 - - - - vpcmpgtd %ymm4, %ymm3, %k5
- - 0.28 0.72 - - - - - - - - vpaddd %ymm15, %ymm4, %ymm4
- - 0.14 0.71 0.55 0.45 - 0.15 - - - - vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z}
- - - 0.97 - - - 0.03 - - - - vpaddd %ymm17, %ymm17, %ymm18
- - 0.14 0.41 - - - 0.13 0.32 - - - addq $8, %r15
- - - 0.99 - - - 0.01 - - - - vpaddd %ymm18, %ymm17, %ymm19
- - 1.00 - - - - - - - - - kmovw %k5, %k2
- - 1.00 - - - - - - - - - kmovw %k5, %k3
- - 1.00 - - - - - - - - - kmovw %k5, %k1
- - - - - - - - - - - - vpxord %zmm21, %zmm21, %zmm21
- - - - - - - - - - - - vpxord %zmm20, %zmm20, %zmm20
- - - - - - - - - - - - vpxord %zmm22, %zmm22, %zmm22
- - 1.00 0.99 3.52 4.48 - 0.01 1.00 - - - vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2}
- - 1.00 0.99 4.48 3.52 - 0.01 1.00 - - - vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3}
- - 1.00 1.00 3.52 4.48 - - 1.00 - - - vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1}
- - 0.02 - - - - 0.98 - - - - vsubpd %zmm21, %zmm1, %zmm18
- - 0.17 - - - - 0.83 - - - - vsubpd %zmm20, %zmm2, %zmm17
- - 0.18 - - - - 0.82 - - - - vsubpd %zmm22, %zmm0, %zmm19
- - 0.01 - - - - 0.99 - - - - vmulpd %zmm18, %zmm18, %zmm31
- - 0.69 - - - - 0.31 - - - - vfmadd231pd %zmm17, %zmm17, %zmm31
- - 0.68 - - - - 0.32 - - - - vfmadd231pd %zmm19, %zmm19, %zmm31
- - 2.00 - - - - 1.00 - - - - vrcp14pd %zmm31, %zmm30
- - - - - - - 1.00 - - - - vcmpltpd %zmm14, %zmm31, %k6 {%k5}
- - - - - - - 1.00 - - - - vfpclasspd $30, %zmm30, %k0
- - 0.83 - - - - 0.17 - - - - vmovaps %zmm31, %zmm23
- - 1.00 - 0.57 0.43 - - - - - - vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
- - 1.00 - - - - - - - - - knotw %k0, %k4
- - 0.44 - - - - 0.56 - - - - vmulpd %zmm23, %zmm23, %zmm24
- - 0.56 - - - - 0.44 - - - - vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4}
- - 0.55 - - - - 0.45 - - - - vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4}
- - 0.69 - - - - 0.31 - - - - vmulpd %zmm13, %zmm30, %zmm25
- - 0.31 - - - - 0.69 - - - - vmulpd %zmm12, %zmm30, %zmm27
- - 0.56 - - - - 0.44 - - - - vmulpd %zmm25, %zmm30, %zmm28
- - 0.02 - - - - 0.98 - - - - vmulpd %zmm28, %zmm30, %zmm26
- - 0.98 - - - - 0.02 - - - - vfmsub213pd %zmm5, %zmm28, %zmm30
- - 0.30 - - - - 0.70 - - - - vmulpd %zmm27, %zmm26, %zmm29
- - 0.16 - - - - 0.84 - - - - vmulpd %zmm30, %zmm29, %zmm23
- - 0.17 - - - - 0.83 - - - - vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6}
- - 0.83 - - - - 0.17 - - - - vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6}
- - 0.17 - - - - 0.83 - - - - vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6}
- - - 0.01 - - - 0.01 0.98 - - - cmpq %r14, %r15
- - 0.14 - - - - - 0.86 - - - jb ..B1.16

View File

@ -0,0 +1,77 @@
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
Analyzed file: lammps-icc-avx512.s
Architecture: CSX
Timestamp: 2023-02-10 16:30:08
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
--------------------------------------------------------------------------------------------------
200 | | | | | | | | || | | # pointer_increment=64 1303ca335e79351a96cfc07b8b9ec9d4
201 | | | | | | | | || | | # LLVM-MCA-BEGIN
202 | | | | | | | | || | | ..B1.16: # Preds ..B1.16 ..B1.15
203 | | | | | | | | || | | # Execution count [2.50e+01]
204 | | | | | | 1.00 | | || | | vpcmpgtd %ymm4, %ymm3, %k5 #59.9
205 | 0.00 | 1.00 | | | | 0.00 | | || | | vpaddd %ymm15, %ymm4, %ymm4 #59.9
206 | 0.00 | 1.00 | 0.50 0.50 | 0.50 0.50 | | 0.00 | | || 0.0 | | vmovdqu32 (%r10,%r15,4), %ymm17{%k5}{z} #60.21
207 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm17, %ymm17, %ymm18 #61.36
208 | 0.00 | 0.16 | | | | 0.00 | 0.84 | || | | addq $8, %r15 #59.9
209 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm18, %ymm17, %ymm19 #61.36
210 | 1.00 | | | | | | | || | | kmovw %k5, %k2 #61.36
211 | 1.00 | | | | | | | || | | kmovw %k5, %k3 #61.36
212 | 1.00 | | | | | | | || | | kmovw %k5, %k1 #61.36
213 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm21, %zmm21, %zmm21 #61.36
214 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm20, %zmm20, %zmm20 #61.36
215 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm22, %zmm22, %zmm22 #61.36
216 | 1.25 | 0.75 | 5.00 5.00 | 5.00 5.00 | | 0.25 | 0.75 | || 24.0 | | vgatherdpd 8(%rbx,%ymm19,8), %zmm21{%k2} #61.36
217 | 1.25 | 0.25 | 5.00 5.00 | 5.00 5.00 | | 0.25 | 1.25 | || | | vgatherdpd (%rbx,%ymm19,8), %zmm20{%k3} #61.36
218 | 1.25 | 0.09 | 5.00 5.00 | 5.00 5.00 | | 0.25 | 1.41 | || | | vgatherdpd 16(%rbx,%ymm19,8), %zmm22{%k1} #61.36
219 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm21, %zmm1, %zmm18 #62.36
220 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm20, %zmm2, %zmm17 #61.36
221 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm22, %zmm0, %zmm19 #63.36
222 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm18, %zmm18, %zmm31 #64.49
223 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm17, %zmm17, %zmm31 #64.49
224 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm19, %zmm19, %zmm31 #64.63
225 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm31, %zmm30 #75.39
226 | | | | | | 1.00 | | || | | vcmppd $1, %zmm14, %zmm31, %k6{%k5} #74.22
227 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm30, %k0 #75.39
228 | | | | | | | | || | | * vmovaps %zmm31, %zmm23 #75.39
229 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23 #75.39
230 | 1.00 | | | | | | | || | | knotw %k0, %k4 #75.39
231 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm23, %zmm23, %zmm24 #75.39
232 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm30, %zmm23, %zmm30{%k4} #75.39
233 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd213pd %zmm30, %zmm24, %zmm30{%k4} #75.39
234 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm13, %zmm30, %zmm25 #76.38
235 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm12, %zmm30, %zmm27 #77.55
236 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm25, %zmm30, %zmm28 #76.44
237 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm28, %zmm30, %zmm26 #76.50
238 | 0.00 | | | | | 1.00 | | || | | vfmsub213pd %zmm5, %zmm28, %zmm30 #77.55
239 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm27, %zmm26, %zmm29 #77.64
240 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm30, %zmm29, %zmm23 #77.70
241 | 0.00 | | | | | 1.00 | | || 4.0 | | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17
242 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17
243 | 0.00 | | | | | 1.00 | | || | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17
244 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || | | cmpq %r14, %r15 #59.9
245 | | | | | | | | || | | * jb ..B1.16 # Prob 82% #59.9
246 | | | | | | | | || | | # LLVM-MCA-END
18.8 5.25 16.0 16.0 16.0 16.0 18.8 5.25 86.0 4.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
243 | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17| [243]
242 | 4.0 | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17| [242]
241 | 4.0 | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17| [241]
208 | 1.0 | addq $8, %r15 #59.9| [208]
205 | 1.0 | vpaddd %ymm15, %ymm4, %ymm4 #59.9| [205]

View File

@ -0,0 +1,77 @@
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
Analyzed file: lammps-icc-avx512.s
Architecture: ICX
Timestamp: 2023-02-10 16:29:42
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD |
------------------------------------------------------------------------------------------------------------------------
200 | | | | | | | | | | || | | # pointer_increment=64 1303ca335e79351a96cfc07b8b9ec9d4
201 | | | | | | | | | | || | | # LLVM-MCA-BEGIN
202 | | | | | | | | | | || | | ..B1.16: # Preds ..B1.16 ..B1.15
203 | | | | | | | | | | || | | # Execution count [2.50e+01]
204 | | | | | | 1.000 | | | | || | | vpcmpgtd %ymm4, %ymm3, %k5 #59.9
205 | 0.00 | 1.00 | | | | 0.000 | | | | || | | vpaddd %ymm15, %ymm4, %ymm4 #59.9
206 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovdqu32 (%r10,%r15,4), %ymm17{%k5}{z} #60.21
207 | 0.00 | 1.00 | | | | 0.000 | | | | || 1.0 | | vpaddd %ymm17, %ymm17, %ymm18 #61.36
208 | 0.00 | 0.00 | | | | 0.000 | 1.00 | | | || | | addq $8, %r15 #59.9
209 | 0.00 | 1.00 | | | | 0.000 | | | | || 1.0 | | vpaddd %ymm18, %ymm17, %ymm19 #61.36
210 | 1.00 | | | | | | | | | || | | kmovw %k5, %k2 #61.36
211 | 1.00 | | | | | | | | | || | | kmovw %k5, %k3 #61.36
212 | 1.00 | | | | | | | | | || | | kmovw %k5, %k1 #61.36
213 | 0.50 | | | | | 0.500 | | | | || | | vpxord %zmm21, %zmm21, %zmm21 #61.36
214 | 0.24 | | | | | 0.760 | | | | || | | vpxord %zmm20, %zmm20, %zmm20 #61.36
215 | 0.50 | | | | | 0.500 | | | | || | | vpxord %zmm22, %zmm22, %zmm22 #61.36
216 | 0.67 | 2.33 | 7.00 7.00 | 7.00 7.00 | | 0.000 | | | | || 24.0 | | vgatherdpd 8(%rbx,%ymm19,8), %zmm21{%k2} #61.36
217 | 0.67 | 2.33 | 7.00 7.00 | 7.00 7.00 | | 0.000 | | | | || | | vgatherdpd (%rbx,%ymm19,8), %zmm20{%k3} #61.36
218 | 0.67 | 2.33 | 7.00 7.00 | 7.00 7.00 | | 0.000 | | | | || | | vgatherdpd 16(%rbx,%ymm19,8), %zmm22{%k1} #61.36
219 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vsubpd %zmm21, %zmm1, %zmm18 #62.36
220 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm20, %zmm2, %zmm17 #61.36
221 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm22, %zmm0, %zmm19 #63.36
222 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm18, %zmm18, %zmm31 #64.49
223 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm17, %zmm17, %zmm31 #64.49
224 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm19, %zmm19, %zmm31 #64.63
225 | 2.50 | | | | | 0.500 | | | | || 6.0 | | vrcp14pd %zmm31, %zmm30 #75.39
226 | | | | | | 1.000 | | | | || | | vcmppd $1, %zmm14, %zmm31, %k6{%k5} #74.22
227 | | | | | | 1.000 | | | | || | | vfpclasspd $30, %zmm30, %k0 #75.39
228 | 0.50 | | | | | 0.500 | | | | || | | vmovaps %zmm31, %zmm23 #75.39
229 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.500 | | | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23 #75.39
230 | 1.00 | | | | | | | | | || | | knotw %k0, %k4 #75.39
231 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm23, %zmm23, %zmm24 #75.39
232 | 0.50 | | | | | 0.500 | | | | || | | vfmadd213pd %zmm30, %zmm23, %zmm30{%k4} #75.39
233 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd213pd %zmm30, %zmm24, %zmm30{%k4} #75.39
234 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm13, %zmm30, %zmm25 #76.38
235 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm12, %zmm30, %zmm27 #77.55
236 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm25, %zmm30, %zmm28 #76.44
237 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm28, %zmm30, %zmm26 #76.50
238 | 0.50 | | | | | 0.500 | | | | || | | vfmsub213pd %zmm5, %zmm28, %zmm30 #77.55
239 | 0.25 | | | | | 0.750 | | | | || 4.0 | | vmulpd %zmm27, %zmm26, %zmm29 #77.64
240 | 0.00 | | | | | 1.000 | | | | || 4.0 | | vmulpd %zmm30, %zmm29, %zmm23 #77.70
241 | 0.00 | | | | | 1.000 | | | | || 4.0 | | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17
242 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17
243 | 0.00 | | | | | 1.000 | | | | || | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17
244 | 0.00 | 0.00 | | | | -0.01 | 1.00 | | | || | | cmpq %r14, %r15 #59.9
245 | | | | | | | | | | || | | * jb ..B1.16 # Prob 82% #59.9
246 | | | | | | | | | | || | | # LLVM-MCA-END
18.0 9.98 22.0 22.0 22.0 22.0 18.00 2.00 89 4.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
243 | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17| [243]
242 | 4.0 | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17| [242]
241 | 4.0 | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17| [241]
208 | 1.0 | addq $8, %r15 #59.9| [208]
205 | 1.0 | vpaddd %ymm15, %ymm4, %ymm4 #59.9| [205]

View File

@ -0,0 +1,197 @@
[0] Code Region
Iterations: 100
Instructions: 7000
Total Cycles: 3866
Total uOps: 7900
Dispatch Width: 6
uOps Per Cycle: 2.04
IPC: 1.81
Block RThroughput: 21.5
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 8 0.50 * vpbroadcastd .LCPI0_1(%rip), %xmm1
1 10 0.50 * vpmulld (%r11,%rbp,4), %xmm1, %xmm11
2 4 1.50 vpmovsxdq %xmm11, %ymm1
1 1 0.50 vpsllq $3, %ymm1, %ymm1
1 1 0.25 vpaddq %ymm1, %ymm3, %ymm1
1 1 1.00 vmovq %xmm1, %r14
2 1 1.00 vpextrq $1, %xmm1, %r9
1 4 1.00 vextracti128 $1, %ymm1, %xmm1
1 8 0.50 * vmovsd (%r14), %xmm2
1 8 0.50 * vpsubd .LCPI0_5, %xmm11, %xmm6
2 4 1.50 vpmovsxdq %xmm6, %ymm6
1 1 0.50 vpsllq $3, %ymm6, %ymm6
1 1 1.00 vmovq %xmm1, %rdi
1 1 0.25 vpaddq %ymm6, %ymm3, %ymm6
1 1 1.00 vmovq %xmm6, %rcx
2 1 1.00 vpextrq $1, %xmm1, %rbx
2 1 1.00 vpextrq $1, %xmm6, %rax
1 4 1.00 vextracti128 $1, %ymm6, %xmm1
1 8 0.50 * vmovsd (%rdi), %xmm6
1 1 1.00 vmovq %xmm1, %rdi
2 1 1.00 vpextrq $1, %xmm1, %rsi
1 8 0.50 * vmovsd (%rdi), %xmm1
1 8 0.50 * vmovsd (%rcx), %xmm7
1 8 0.50 * vpbroadcastd .LCPI0_2(%rip), %xmm12
1 8 0.50 * vmovhpd (%r9), %xmm2, %xmm2
1 1 0.25 vpaddd %xmm12, %xmm11, %xmm4
2 4 1.50 vpmovsxdq %xmm4, %ymm4
1 8 0.50 * vmovhpd (%rax), %xmm7, %xmm7
1 1 0.50 vpsllq $3, %ymm4, %ymm4
1 1 0.25 vpaddq %ymm4, %ymm3, %ymm4
1 8 0.50 * vmovhpd (%rbx), %xmm6, %xmm6
2 1 1.00 vpextrq $1, %xmm4, %rax
1 8 0.50 * vmovhpd (%rsi), %xmm1, %xmm1
1 1 1.00 vmovq %xmm4, %rcx
1 4 1.00 vextracti128 $1, %ymm4, %xmm4
1 1 1.00 vmovq %xmm4, %rsi
1 2 1.00 vinsertf128 $1, %xmm6, %ymm2, %ymm2
2 1 1.00 vpextrq $1, %xmm4, %rdi
1 8 0.50 * vmovsd (%rsi), %xmm4
1 3 0.50 vsubpd %ymm2, %ymm14, %ymm2
1 8 0.50 * vmovhpd (%rdi), %xmm4, %xmm4
1 8 0.50 * vmovsd (%rcx), %xmm6
1 2 1.00 vinsertf128 $1, %xmm1, %ymm7, %ymm1
1 8 0.50 * vmovhpd (%rax), %xmm6, %xmm6
1 2 1.00 vinsertf128 $1, %xmm4, %ymm6, %ymm4
1 3 0.50 vsubpd %ymm1, %ymm5, %ymm1
1 3 0.50 vsubpd %ymm4, %ymm10, %ymm4
1 3 0.50 vmulpd %ymm2, %ymm2, %ymm6
1 4 1.00 vfmadd231pd %ymm1, %ymm1, %ymm6
1 4 1.00 vfmadd231pd %ymm4, %ymm4, %ymm6
1 8 0.50 * vbroadcastsd .LCPI0_3(%rip), %ymm7
1 13 5.00 vdivpd %ymm6, %ymm7, %ymm7
1 3 0.50 vmulpd %ymm7, %ymm7, %ymm11
1 3 0.50 vmulpd %ymm9, %ymm11, %ymm11
1 8 0.50 * vbroadcastsd .LCPI0_4(%rip), %ymm12
1 3 0.50 vmulpd %ymm7, %ymm11, %ymm11
1 3 0.50 vaddpd %ymm12, %ymm11, %ymm12
1 10 0.50 * vmulpd 128(%rsp), %ymm7, %ymm7
1 3 0.50 vmulpd %ymm7, %ymm11, %ymm7
1 3 0.50 vmulpd %ymm7, %ymm12, %ymm7
1 1 0.50 vcmpltpd %ymm8, %ymm6, %ymm6
1 4 1.00 vfmadd213pd %ymm0, %ymm7, %ymm2
1 1 0.50 vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
1 4 1.00 vfmadd213pd %ymm15, %ymm7, %ymm1
1 4 1.00 vfmadd213pd %ymm13, %ymm7, %ymm4
1 1 0.50 vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
1 1 0.50 vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
1 1 0.25 addq $4, %rbp
1 1 0.25 cmpq %rdx, %rbp
1 1 0.50 jb .LBB0_9
Resources:
[0] - Zn3AGU0
[1] - Zn3AGU1
[2] - Zn3AGU2
[3] - Zn3ALU0
[4] - Zn3ALU1
[5] - Zn3ALU2
[6] - Zn3ALU3
[7] - Zn3BRU1
[8] - Zn3FPP0
[9] - Zn3FPP1
[10] - Zn3FPP2
[11] - Zn3FPP3
[12.0] - Zn3FPP45
[12.1] - Zn3FPP45
[13] - Zn3FPSt
[14.0] - Zn3LSU
[14.1] - Zn3LSU
[14.2] - Zn3LSU
[15.0] - Zn3Load
[15.1] - Zn3Load
[15.2] - Zn3Load
[16.0] - Zn3Store
[16.1] - Zn3Store
Resource pressure per iteration:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
- - - 0.60 0.60 0.60 0.60 0.60 16.84 23.53 16.30 7.33 21.50 21.50 - 6.33 6.33 6.34 6.33 6.33 6.34 - -
Resource pressure by instruction:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
- - - - - - - - - 0.03 0.97 - 0.51 0.49 - 0.34 0.33 0.33 0.34 0.33 0.33 - - vpbroadcastd .LCPI0_1(%rip), %xmm1
- - - - - - - - 0.65 - - 0.35 0.34 0.66 - 0.49 0.05 0.46 0.49 0.05 0.46 - - vpmulld (%r11,%rbp,4), %xmm1, %xmm11
- - - - - - - - - 0.06 2.94 - - - - - - - - - - - - vpmovsxdq %xmm11, %ymm1
- - - - - - - - - 0.65 0.35 - - - - - - - - - - - - vpsllq $3, %ymm1, %ymm1
- - - - - - - - - - - 1.00 - - - - - - - - - - - vpaddq %ymm1, %ymm3, %ymm1
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vmovq %xmm1, %r14
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm1, %r9
- - - - - - - - 1.00 - - - - - - - - - - - - - - vextracti128 $1, %ymm1, %xmm1
- - - - - - - - - - - - 0.50 0.50 - 0.48 0.35 0.17 0.48 0.35 0.17 - - vmovsd (%r14), %xmm2
- - - - - - - - 0.01 0.18 0.17 0.64 0.47 0.53 - 0.34 0.33 0.33 0.34 0.33 0.33 - - vpsubd .LCPI0_5, %xmm11, %xmm6
- - - - - - - - - 1.92 1.08 - - - - - - - - - - - - vpmovsxdq %xmm6, %ymm6
- - - - - - - - - 0.32 0.68 - - - - - - - - - - - - vpsllq $3, %ymm6, %ymm6
- - - - - - - - - - - - 1.30 0.70 - - - - - - - - - vmovq %xmm1, %rdi
- - - - - - - - - - 0.32 0.68 - - - - - - - - - - - vpaddq %ymm6, %ymm3, %ymm6
- - - - - - - - - - - - 1.34 0.66 - - - - - - - - - vmovq %xmm6, %rcx
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm1, %rbx
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm6, %rax
- - - - - - - - 1.00 - - - - - - - - - - - - - - vextracti128 $1, %ymm6, %xmm1
- - - - - - - - - - - - 0.50 0.50 - 0.03 0.65 0.32 0.03 0.65 0.32 - - vmovsd (%rdi), %xmm6
- - - - - - - - - - - - 0.36 1.64 - - - - - - - - - vmovq %xmm1, %rdi
- - - - - - - - - - - - 1.64 0.36 - - - - - - - - - vpextrq $1, %xmm1, %rsi
- - - - - - - - - - - - 0.32 0.68 - 0.51 0.33 0.16 0.51 0.33 0.16 - - vmovsd (%rdi), %xmm1
- - - - - - - - - - - - 0.68 0.32 - 0.49 0.01 0.50 0.49 0.01 0.50 - - vmovsd (%rcx), %xmm7
- - - - - - - - - 0.48 0.52 - 0.67 0.33 - 0.17 0.62 0.21 0.17 0.62 0.21 - - vpbroadcastd .LCPI0_2(%rip), %xmm12
- - - - - - - - - 0.01 0.99 - 0.17 0.83 - 0.02 0.64 0.34 0.02 0.64 0.34 - - vmovhpd (%r9), %xmm2, %xmm2
- - - - - - - - 0.01 - - 0.99 - - - - - - - - - - - vpaddd %xmm12, %xmm11, %xmm4
- - - - - - - - - 0.57 2.43 - - - - - - - - - - - - vpmovsxdq %xmm4, %ymm4
- - - - - - - - - 0.34 0.66 - 0.82 0.18 - 0.49 0.35 0.16 0.49 0.35 0.16 - - vmovhpd (%rax), %xmm7, %xmm7
- - - - - - - - - 0.34 0.66 - - - - - - - - - - - - vpsllq $3, %ymm4, %ymm4
- - - - - - - - - - 0.01 0.99 - - - - - - - - - - - vpaddq %ymm4, %ymm3, %ymm4
- - - - - - - - - 0.51 0.49 - 0.49 0.51 - 0.35 0.16 0.49 0.35 0.16 0.49 - - vmovhpd (%rbx), %xmm6, %xmm6
- - - - - - - - - - - - 1.04 0.96 - - - - - - - - - vpextrq $1, %xmm4, %rax
- - - - - - - - - 0.49 0.51 - 0.17 0.83 - 0.16 0.49 0.35 0.16 0.49 0.35 - - vmovhpd (%rsi), %xmm1, %xmm1
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vmovq %xmm4, %rcx
- - - - - - - - 1.00 - - - - - - - - - - - - - - vextracti128 $1, %ymm4, %xmm4
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vmovq %xmm4, %rsi
- - - - - - - - - 1.00 - - - - - - - - - - - - - vinsertf128 $1, %xmm6, %ymm2, %ymm2
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm4, %rdi
- - - - - - - - - - - - 0.50 0.50 - 0.49 0.35 0.16 0.49 0.35 0.16 - - vmovsd (%rsi), %xmm4
- - - - - - - - - - 0.31 0.69 - - - - - - - - - - - vsubpd %ymm2, %ymm14, %ymm2
- - - - - - - - - 0.49 0.51 - 0.48 0.52 - 0.35 0.16 0.49 0.35 0.16 0.49 - - vmovhpd (%rdi), %xmm4, %xmm4
- - - - - - - - - - - - 0.52 0.48 - 0.16 0.49 0.35 0.16 0.49 0.35 - - vmovsd (%rcx), %xmm6
- - - - - - - - - 1.00 - - - - - - - - - - - - - vinsertf128 $1, %xmm1, %ymm7, %ymm1
- - - - - - - - - 0.35 0.65 - 0.50 0.50 - 0.47 0.35 0.18 0.47 0.35 0.18 - - vmovhpd (%rax), %xmm6, %xmm6
- - - - - - - - - 1.00 - - - - - - - - - - - - - vinsertf128 $1, %xmm4, %ymm6, %ymm4
- - - - - - - - - - 0.33 0.67 - - - - - - - - - - - vsubpd %ymm1, %ymm5, %ymm1
- - - - - - - - - - 0.51 0.49 - - - - - - - - - - - vsubpd %ymm4, %ymm10, %ymm4
- - - - - - - - 0.52 0.48 - - - - - - - - - - - - - vmulpd %ymm2, %ymm2, %ymm6
- - - - - - - - 1.00 1.00 - - - - - - - - - - - - - vfmadd231pd %ymm1, %ymm1, %ymm6
- - - - - - - - 1.34 0.66 - - - - - - - - - - - - - vfmadd231pd %ymm4, %ymm4, %ymm6
- - - - - - - - - 0.66 0.34 - 0.51 0.49 - 0.19 0.32 0.49 0.19 0.32 0.49 - - vbroadcastsd .LCPI0_3(%rip), %ymm7
- - - - - - - - - 5.00 - - - - - - - - - - - - - vdivpd %ymm6, %ymm7, %ymm7
- - - - - - - - 0.50 0.50 - - - - - - - - - - - - - vmulpd %ymm7, %ymm7, %ymm11
- - - - - - - - 0.99 0.01 - - - - - - - - - - - - - vmulpd %ymm9, %ymm11, %ymm11
- - - - - - - - - 0.30 0.70 - 0.49 0.51 - 0.34 0.33 0.33 0.34 0.33 0.33 - - vbroadcastsd .LCPI0_4(%rip), %ymm12
- - - - - - - - 0.82 0.18 - - - - - - - - - - - - - vmulpd %ymm7, %ymm11, %ymm11
- - - - - - - - - - 0.17 0.83 - - - - - - - - - - - vaddpd %ymm12, %ymm11, %ymm12
- - - - - - - - 0.01 0.99 - - 0.18 0.82 - 0.46 0.02 0.52 0.46 0.02 0.52 - - vmulpd 128(%rsp), %ymm7, %ymm7
- - - - - - - - 0.99 0.01 - - - - - - - - - - - - - vmulpd %ymm7, %ymm11, %ymm7
- - - - - - - - 0.67 0.33 - - - - - - - - - - - - - vmulpd %ymm7, %ymm12, %ymm7
- - - - - - - - 1.00 - - - - - - - - - - - - - - vcmpltpd %ymm8, %ymm6, %ymm6
- - - - - - - - 1.34 0.66 - - - - - - - - - - - - - vfmadd213pd %ymm0, %ymm7, %ymm2
- - - - - - - - 0.66 0.34 - - - - - - - - - - - - - vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
- - - - - - - - 0.66 1.34 - - - - - - - - - - - - - vfmadd213pd %ymm15, %ymm7, %ymm1
- - - - - - - - 1.34 0.66 - - - - - - - - - - - - - vfmadd213pd %ymm13, %ymm7, %ymm4
- - - - - - - - 0.34 0.66 - - - - - - - - - - - - - vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
- - - - - - - - 0.99 0.01 - - - - - - - - - - - - - vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
- - - - 0.40 0.20 0.40 - - - - - - - - - - - - - - - - addq $4, %rbp
- - - 0.20 0.20 0.40 0.20 - - - - - - - - - - - - - - - - cmpq %rdx, %rbp
- - - 0.40 - - - 0.60 - - - - - - - - - - - - - - - jb .LBB0_9

View File

@ -0,0 +1,108 @@
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
Analyzed file: lammps-icx-avx2zen.s
Architecture: ZEN3
Timestamp: 2023-02-10 16:31:30
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 | 1 | 2 | 3 | DV0 | DV1 | 4 | 5 | 6 | 7 | 8 - 8DV | 9 | 10 | 11 | 12 | 13 || CP | LCD |
--------------------------------------------------------------------------------------------------------------------------------------------
175 | | | | | | | | | | | | | | | | || | | # pointer_increment=16 e95035fc9e97f63299dd5188a0872bfc
176 | | | | | | | | | | | | | | | | || | | # LLVM-MCA-BEGIN
177 | | | | | | | | | | | | | | | | || | | .LBB0_9: #
178 | | | | | | | | | | | | | | | | || | | # Parent Loop BB0_6 Depth=1
179 | | | | | | | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2
180 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || 1.0 | | vpbroadcastd .LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
181 | 0.00 | | | 1.00 | | | | | | | | | | 0.50 | 0.50 | || 3.0 | | vpmulld (%r11,%rbp,4), %xmm1, %xmm11
182 | 0.00 | 0.75 | 0.38 | 0.87 | | | | | | | | | | | | || 4.0 | | vpmovsxdq %xmm11, %ymm1
183 | | 0.00 | 1.00 | | | | | | | | | | | | | || 1.0 | | vpsllq $3, %ymm1, %ymm1
184 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || 1.0 | | vpaddq %ymm1, %ymm3, %ymm1
185 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vmovq %xmm1, %r14
186 | 0.12 | 1.88 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm1, %r9
187 | | 1.00 | | | | | | | | | | | | | | || 3.0 | | vextracti128 $1, %ymm1, %xmm1
188 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%r14), %xmm2 # xmm2 = mem[0],zero
189 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vpsubd .LCPI0_5, %xmm11, %xmm6
190 | 0.00 | 0.75 | 0.38 | 0.87 | | | | | | | | | | | | || | | vpmovsxdq %xmm6, %ymm6
191 | | 0.00 | 1.00 | | | | | | | | | | | | | || | | vpsllq $3, %ymm6, %ymm6
192 | 0.00 | 0.00 | 0.49 | 0.51 | | | | | | | | | | | | || | | vmovq %xmm1, %rdi
193 | 0.00 | 0.00 | 0.51 | 0.49 | | | | | | | | | | | | || | | vpaddq %ymm6, %ymm3, %ymm6
194 | 0.00 | 0.00 | 0.49 | 0.51 | | | | | | | | | | | | || | | vmovq %xmm6, %rcx
195 | 0.13 | 1.87 | | | | | | | | | | | | | | || 6.0 | | vpextrq $1, %xmm1, %rbx
196 | 0.00 | 2.00 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm6, %rax
197 | | 1.00 | | | | | | | | | | | | | | || | | vextracti128 $1, %ymm6, %xmm1
198 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rdi), %xmm6 # xmm6 = mem[0],zero
199 | 0.00 | 0.00 | 0.00 | 1.00 | | | | | | | | | | | | || | | vmovq %xmm1, %rdi
200 | 0.00 | 2.00 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm1, %rsi
201 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
202 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rcx), %xmm7 # xmm7 = mem[0],zero
203 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vpbroadcastd .LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
204 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%r9), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0]
205 | 0.00 | 0.00 | 0.63 | 0.37 | | | | | | | | | | | | || | | vpaddd %xmm12, %xmm11, %xmm4
206 | 0.00 | 0.75 | 0.00 | 1.25 | | | | | | | | | | | | || | | vpmovsxdq %xmm4, %ymm4
207 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rax), %xmm7, %xmm7 # xmm7 = xmm7[0],mem[0]
208 | | 0.00 | 1.00 | | | | | | | | | | | | | || | | vpsllq $3, %ymm4, %ymm4
209 | 0.00 | 0.00 | 0.00 | 1.00 | | | | | | | | | | | | || | | vpaddq %ymm4, %ymm3, %ymm4
210 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || 5.0 | | vmovhpd (%rbx), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
211 | 0.75 | 1.25 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm4, %rax
212 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rsi), %xmm1, %xmm1 # xmm1 = xmm1[0],mem[0]
213 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vmovq %xmm4, %rcx
214 | | 1.00 | | | | | | | | | | | | | | || | | vextracti128 $1, %ymm4, %xmm4
215 | 0.00 | 0.00 | 0.00 | 1.00 | | | | | | | | | | | | || | | vmovq %xmm4, %rsi
216 | | 1.00 | | | | | | | | | | | | | | || 1.0 | | vinsertf128 $1, %xmm6, %ymm2, %ymm2
217 | 1.00 | 1.00 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm4, %rdi
218 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rsi), %xmm4 # xmm4 = mem[0],zero
219 | | | 0.00 | 1.00 | | | | | | | | | | | | || 3.0 | | vsubpd %ymm2, %ymm14, %ymm2
220 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rdi), %xmm4, %xmm4 # xmm4 = xmm4[0],mem[0]
221 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rcx), %xmm6 # xmm6 = mem[0],zero
222 | | 1.00 | | | | | | | | | | | | | | || | | vinsertf128 $1, %xmm1, %ymm7, %ymm1
223 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rax), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
224 | | 1.00 | | | | | | | | | | | | | | || | | vinsertf128 $1, %xmm4, %ymm6, %ymm4
225 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vsubpd %ymm1, %ymm5, %ymm1
226 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vsubpd %ymm4, %ymm10, %ymm4
227 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm2, %ymm2, %ymm6
228 | 1.00 | 0.00 | | | | | | | | | | | | | | || 4.0 | | vfmadd231pd %ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
229 | 1.00 | 0.00 | | | | | | | | | | | | | | || 4.0 | | vfmadd231pd %ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
230 | 1.00 | | | | | | | | | | | | | 0.50 | 0.50 | || | | vbroadcastsd .LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
231 | | | | | 4.50 | 4.50 | | | | | | | | | | || 13.0 | | vdivpd %ymm6, %ymm7, %ymm7
232 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm7, %ymm11
233 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm9, %ymm11, %ymm11
234 | 1.00 | | | | | | | | | | | | | 0.50 | 0.50 | || | | vbroadcastsd .LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
235 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm11, %ymm11
236 | | | 0.00 | 1.00 | | | | | | | | | | | | || 3.0 | | vaddpd %ymm12, %ymm11, %ymm12
237 | 1.00 | 0.00 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
238 | 1.00 | 0.00 | | | | | | | | | | | | | | || | | vmulpd %ymm7, %ymm11, %ymm7
239 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm12, %ymm7
240 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vcmpltpd %ymm8, %ymm6, %ymm6
241 | 1.00 | 0.00 | | | | | | | | | | | | | | || 4.0 | | vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
242 | 1.00 | 0.00 | | | | | | | | | | | | | | || 1.0 | | vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
243 | 1.00 | 0.00 | | | | | | | | | | | | | | || | | vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
244 | 1.00 | 0.00 | | | | | | | | | | | | | | || | 4.0 | vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
245 | 1.00 | 0.00 | | | | | | | | | | | | | | || | | vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
246 | 0.75 | 0.25 | | | | | | | | | | | | | | || | 1.0 | vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
247 | | | | | | | | | 0.25 | 0.25 | 0.25 | 0.25 | | | | || | | addq $4, %rbp
248 | | | | | | | | | 0.25 | 0.25 | 0.25 | 0.25 | | | | || | | cmpq %rdx, %rbp
249 | | | | | | | | | 0.00 | | | | 1.00 | | | || | | jb .LBB0_9
250 | | | | | | | | | | | | | | | | || | | # LLVM-MCA-END
18.8 18.5 15.9 15.9 4.50 4.50 0.50 0.50 0.50 0.50 9.00 9.00 72 5.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
244 | 5.0 | vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13| [244, 246]
243 | 5.0 | vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15| [243, 245]
241 | 5.0 | vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0| [241, 242]
247 | 1.0 | addq $4, %rbp | [247]
246 | 1.0 | vblendvpd %ymm6, %ymm4, %ymm13, %ymm13| [246]
245 | 1.0 | vblendvpd %ymm6, %ymm1, %ymm15, %ymm15| [245]
242 | 1.0 | vblendvpd %ymm6, %ymm2, %ymm0, %ymm0| [242]

Binary file not shown.

File diff suppressed because it is too large Load Diff

Binary file not shown.

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@ -2238,8 +2238,8 @@ movl $111, %ebx # OSACA START MARKER
.byte 100 # OSACA START MARKER
.byte 103 # OSACA START MARKER
.byte 144 # OSACA START MARKER
# pointer_increment=64 da67166e5736661e6b03ea29ee7bfd67
# LLVM-MCA-BEGIN
# pointer_increment=256 da67166e5736661e6b03ea29ee7bfd67
.LBB5_12: # Parent Loop BB5_7 Depth=1
# => This Inner Loop Header: Depth=2
movslq (%r10,%rbx,4), %rcx

Binary file not shown.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -172,8 +172,8 @@ movl $111, %ebx # OSACA START MARKER
.byte 100 # OSACA START MARKER
.byte 103 # OSACA START MARKER
.byte 144 # OSACA START MARKER
# LLVM-MCA-BEGIN
# pointer_increment=16 e95035fc9e97f63299dd5188a0872bfc
# LLVM-MCA-BEGIN
.LBB0_9: #
# Parent Loop BB0_6 Depth=1
# => This Inner Loop Header: Depth=2
@ -386,8 +386,6 @@ movl $222, %ebx # OSACA END MARKER
.quad 4607182418800017408 # 1
.LCPI1_2:
.quad -4620693217682128896 # -0.5
.LCPI1_3:
.quad 4741671816366391296 # 1.0E+9
.text
.globl computeForceLJHalfNeigh
.p2align 4, 0x90
@ -408,128 +406,125 @@ computeForceLJHalfNeigh: #
.cfi_def_cfa_offset 48
pushq %rbx
.cfi_def_cfa_offset 56
subq $56, %rsp
.cfi_def_cfa_offset 112
subq $40, %rsp
.cfi_def_cfa_offset 96
.cfi_offset %rbx, -56
.cfi_offset %r12, -48
.cfi_offset %r13, -40
.cfi_offset %r14, -32
.cfi_offset %r15, -24
.cfi_offset %rbp, -16
movq %rcx, 24(%rsp) # 8-byte Spill
movq %rdx, %r12
movq %rsi, %r13
movl 4(%rsi), %r15d
movq %rcx, 16(%rsp) # 8-byte Spill
movq %rdx, %r15
movq %rsi, %r12
movl 4(%rsi), %r13d
vmovsd 144(%rdi), %xmm0 # xmm0 = mem[0],zero
vmovsd %xmm0, (%rsp) # 8-byte Spill
vmovsd 40(%rdi), %xmm0 # xmm0 = mem[0],zero
vmovsd %xmm0, 8(%rsp) # 8-byte Spill
movq %rdi, 40(%rsp) # 8-byte Spill
vmovsd 40(%rdi), %xmm0 # xmm0 = mem[0],zero
vmovsd %xmm0, (%rsp) # 8-byte Spill
vmovsd 56(%rdi), %xmm0 # xmm0 = mem[0],zero
vmovsd %xmm0, 16(%rsp) # 8-byte Spill
testl %r15d, %r15d
vmovsd %xmm0, 32(%rsp) # 8-byte Spill
testl %r13d, %r13d
jle .LBB1_2
# %bb.1: #
movq 64(%r13), %rdi
leaq (,%r15,8), %rax
movq 64(%r12), %rdi
leaq (,%r13,8), %rax
leaq (%rax,%rax,2), %rdx
xorl %esi, %esi
callq _intel_fast_memset
.LBB1_2: #
xorl %r14d, %r14d
xorl %eax, %eax
callq getTimeStamp
vmovsd %xmm0, 32(%rsp) # 8-byte Spill
vmovsd %xmm0, 24(%rsp) # 8-byte Spill
movl $.L.str.1, %edi
callq likwid_markerStartRegion
testl %r15d, %r15d
testl %r13d, %r13d
jle .LBB1_8
# %bb.3: #
vmovsd (%rsp), %xmm0 # 8-byte Reload
# xmm0 = mem[0],zero
vmulsd %xmm0, %xmm0, %xmm12
movq 16(%r12), %rax
movq 24(%r12), %rcx
movq %rcx, (%rsp) # 8-byte Spill
movslq 8(%r12), %rdx
movq 16(%r13), %rsi
movq 64(%r13), %rdi
vmovsd 8(%rsp), %xmm0 # 8-byte Reload
# xmm0 = mem[0],zero
vmulsd %xmm0, %xmm0, %xmm12
movq 16(%r15), %rax
movq 24(%r15), %rcx
movq %rcx, 8(%rsp) # 8-byte Spill
movslq 8(%r15), %rdx
movq 16(%r12), %rsi
movq 64(%r12), %rdi
vmovsd (%rsp), %xmm0 # 8-byte Reload
# xmm0 = mem[0],zero
vmulsd .LCPI1_0(%rip), %xmm0, %xmm11
movq 24(%rsp), %rcx # 8-byte Reload
movq 16(%rsp), %rcx # 8-byte Reload
vmovdqu (%rcx), %xmm10
shlq $2, %rdx
movq %rdx, 48(%rsp) # 8-byte Spill
xorl %r13d, %r13d
xorl %r14d, %r14d
movq %rdx, (%rsp) # 8-byte Spill
xorl %r12d, %r12d
jmp .LBB1_4
.p2align 4, 0x90
.LBB1_14: #
.LBB1_5: #
# in Loop: Header=BB1_4 Depth=1
movq 8(%rsp), %rbp # 8-byte Reload
vxorpd %xmm13, %xmm13, %xmm13
movq %r9, %rdx
vxorpd %xmm9, %xmm9, %xmm9
vxorpd %xmm14, %xmm14, %xmm14
.LBB1_6: #
# in Loop: Header=BB1_4 Depth=1
addl %r10d, %r14d
vaddsd (%rdi,%r12,8), %xmm14, %xmm0
vmovsd %xmm0, (%rdi,%r12,8)
vaddsd (%rdi,%rbp,8), %xmm15, %xmm0
vmovsd %xmm0, (%rdi,%rbp,8)
vaddsd (%rdi,%r15,8), %xmm14, %xmm0
vmovsd %xmm0, (%rdi,%r15,8)
vaddsd (%rdi,%r10,8), %xmm9, %xmm0
vmovsd %xmm0, (%rdi,%r10,8)
vaddsd (%rdi,%r11,8), %xmm13, %xmm0
vmovsd %xmm0, (%rdi,%r11,8)
leal 3(%r10), %ecx
addl $6, %r10d
leal 3(%r9), %ecx
addl $6, %r9d
testl %ecx, %ecx
cmovnsl %ecx, %r10d
sarl $2, %r10d
movslq %r10d, %rcx
cmovnsl %ecx, %r9d
sarl $2, %r9d
movslq %r9d, %rcx
vmovq %rcx, %xmm0
vmovq %rdx, %xmm1
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
vpaddq %xmm0, %xmm10, %xmm10
incq %r13
addq 48(%rsp), %rax # 8-byte Folded Reload
cmpq %r15, %r13
incq %r12
addq (%rsp), %rax # 8-byte Folded Reload
cmpq %r13, %r12
je .LBB1_7
.LBB1_4: #
# =>This Loop Header: Depth=1
# Child Loop BB1_10 Depth 2
movq (%rsp), %rcx # 8-byte Reload
movslq (%rcx,%r13,4), %r10
leaq (,%r13,2), %rcx
addq %r13, %rcx
leal 1(%rcx), %ebp
movq 8(%rsp), %rcx # 8-byte Reload
movslq (%rcx,%r12,4), %r9
leaq (%r12,%r12,2), %rcx
leal 1(%rcx), %r10d
leal 2(%rcx), %r11d
movl %ecx, %r12d
testq %r10, %r10
movl %ecx, %r15d
testq %r9, %r9
jle .LBB1_5
# %bb.9: #
# in Loop: Header=BB1_4 Depth=1
vmovsd (%rsi,%r12,8), %xmm9 # xmm9 = mem[0],zero
movq %rbp, 8(%rsp) # 8-byte Spill
vmovsd (%rsi,%rbp,8), %xmm4 # xmm4 = mem[0],zero
vmovsd (%rsi,%r15,8), %xmm15 # xmm15 = mem[0],zero
vmovsd (%rsi,%r10,8), %xmm4 # xmm4 = mem[0],zero
vmovsd (%rsi,%r11,8), %xmm1 # xmm1 = mem[0],zero
movl %r10d, %edx
movl %r9d, %edx
vxorpd %xmm14, %xmm14, %xmm14
xorl %ebx, %ebx
vxorpd %xmm15, %xmm15, %xmm15
xorl %ecx, %ecx
vxorpd %xmm9, %xmm9, %xmm9
vxorpd %xmm13, %xmm13, %xmm13
jmp .LBB1_10
.p2align 4, 0x90
.LBB1_13: #
# in Loop: Header=BB1_10 Depth=2
incq %rbx
cmpq %rbx, %rdx
je .LBB1_14
incq %rcx
cmpq %rcx, %rdx
je .LBB1_6
.LBB1_10: #
# Parent Loop BB1_4 Depth=1
# => This Inner Loop Header: Depth=2
movslq (%rax,%rbx,4), %r9
leaq (%r9,%r9,2), %r8
vsubsd (%rsi,%r8,8), %xmm9, %xmm2
movslq %r8d, %rcx
vsubsd 8(%rsi,%rcx,8), %xmm4, %xmm5
vsubsd 16(%rsi,%rcx,8), %xmm1, %xmm0
movslq (%rax,%rcx,4), %r8
leaq (%r8,%r8,2), %r14
vsubsd (%rsi,%r14,8), %xmm15, %xmm2
movslq %r14d, %rbp
vsubsd 8(%rsi,%rbp,8), %xmm4, %xmm5
vsubsd 16(%rsi,%rbp,8), %xmm1, %xmm0
vmulsd %xmm2, %xmm2, %xmm6
vfmadd231sd %xmm5, %xmm5, %xmm6 # xmm6 = (xmm5 * xmm5) + xmm6
vfmadd231sd %xmm0, %xmm0, %xmm6 # xmm6 = (xmm0 * xmm0) + xmm6
@ -539,70 +534,45 @@ computeForceLJHalfNeigh: #
# in Loop: Header=BB1_10 Depth=2
vmovsd .LCPI1_1(%rip), %xmm3 # xmm3 = mem[0],zero
vdivsd %xmm6, %xmm3, %xmm6
vmulsd 16(%rsp), %xmm6, %xmm7 # 8-byte Folded Reload
vmulsd 32(%rsp), %xmm6, %xmm3 # 8-byte Folded Reload
vmulsd %xmm6, %xmm6, %xmm8
vmulsd %xmm7, %xmm8, %xmm7
vaddsd .LCPI1_2(%rip), %xmm7, %xmm3
vmulsd %xmm3, %xmm8, %xmm3
vaddsd .LCPI1_2(%rip), %xmm3, %xmm7
vmulsd %xmm6, %xmm11, %xmm6
vmulsd %xmm7, %xmm6, %xmm6
vmulsd %xmm3, %xmm6, %xmm3
vmulsd %xmm7, %xmm3, %xmm3
vmulsd %xmm2, %xmm3, %xmm6
vaddsd %xmm6, %xmm14, %xmm14
vmulsd %xmm5, %xmm3, %xmm2
vaddsd %xmm2, %xmm15, %xmm15
vaddsd %xmm2, %xmm9, %xmm9
vmulsd %xmm0, %xmm3, %xmm0
vaddsd %xmm0, %xmm13, %xmm13
cmpl %r15d, %r9d
cmpl %r13d, %r8d
jge .LBB1_13
# %bb.12: #
# in Loop: Header=BB1_10 Depth=2
leaq 1(%rcx), %rbp
addq $2, %rcx
vmovsd (%rdi,%r8,8), %xmm3 # xmm3 = mem[0],zero
leaq 1(%rbp), %rbx
addq $2, %rbp
vmovsd (%rdi,%r14,8), %xmm3 # xmm3 = mem[0],zero
vsubsd %xmm6, %xmm3, %xmm3
vmovsd %xmm3, (%rdi,%r8,8)
vmovsd (%rdi,%rbp,8), %xmm3 # xmm3 = mem[0],zero
vmovsd %xmm3, (%rdi,%r14,8)
vmovsd (%rdi,%rbx,8), %xmm3 # xmm3 = mem[0],zero
vsubsd %xmm2, %xmm3, %xmm2
vmovsd %xmm2, (%rdi,%rbp,8)
vmovsd (%rdi,%rcx,8), %xmm2 # xmm2 = mem[0],zero
vmovsd %xmm2, (%rdi,%rbx,8)
vmovsd (%rdi,%rbp,8), %xmm2 # xmm2 = mem[0],zero
vsubsd %xmm0, %xmm2, %xmm0
vmovsd %xmm0, (%rdi,%rcx,8)
vmovsd %xmm0, (%rdi,%rbp,8)
jmp .LBB1_13
.p2align 4, 0x90
.LBB1_5: #
# in Loop: Header=BB1_4 Depth=1
vxorpd %xmm13, %xmm13, %xmm13
movq %r10, %rdx
vxorpd %xmm15, %xmm15, %xmm15
vxorpd %xmm14, %xmm14, %xmm14
jmp .LBB1_6
.LBB1_7: #
movq 24(%rsp), %rax # 8-byte Reload
movq 16(%rsp), %rax # 8-byte Reload
vmovdqu %xmm10, (%rax)
.LBB1_8: #
movl $.L.str.1, %edi
callq likwid_markerStopRegion
xorl %eax, %eax
callq getTimeStamp
movq 40(%rsp), %rax # 8-byte Reload
vmovsd 264(%rax), %xmm3 # xmm3 = mem[0],zero
vsubsd 32(%rsp), %xmm0, %xmm2 # 8-byte Folded Reload
vmulsd .LCPI1_3(%rip), %xmm3, %xmm0
vmulsd %xmm2, %xmm0, %xmm0
vmovapd %xmm2, %xmm1
vmovsd %xmm2, 16(%rsp) # 8-byte Spill
movl %r14d, %eax
vxorps %xmm12, %xmm12, %xmm12
vcvtsi2sd %rax, %xmm12, %xmm2
vdivsd %xmm2, %xmm0, %xmm2
movl $.L.str.2, %edi
movl %r14d, %esi
vmovapd %xmm3, %xmm0
movb $3, %al
callq printf
vmovsd 16(%rsp), %xmm0 # 8-byte Reload
# xmm0 = mem[0],zero
addq $56, %rsp
vsubsd 24(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload
addq $40, %rsp
.cfi_def_cfa_offset 56
popq %rbx
.cfi_def_cfa_offset 48
@ -642,10 +612,8 @@ computeForceLJFullNeigh_simd: #
.LBB2_2: #
xorl %eax, %eax
callq getTimeStamp
movl $.L.str, %edi
callq likwid_markerStartRegion
movq stderr(%rip), %rcx
movl $.L.str.3, %edi
movl $.L.str.2, %edi
movl $65, %esi
movl $1, %edx
callq fwrite
@ -666,11 +634,7 @@ computeForceLJFullNeigh_simd: #
.size .L.str.1, 18
.type .L.str.2,@object #
.L.str.2:
.asciz "Its: %u Freq: %f Time: %f\nCy/it: %f (half-neigh)\n"
.size .L.str.2, 52
.type .L.str.3,@object #
.L.str.3:
.asciz "Error: SIMD kernel not implemented for specified instruction set!"
.size .L.str.3, 66
.size .L.str.2, 66
.ident "Intel(R) oneAPI DPC++ Compiler 2021.1-beta05 (2020.2.0.0304)"
.section ".note.GNU-stack","",@progbits