added asm files and analysis output

This commit is contained in:
JanLJL
2023-02-13 14:15:08 +01:00
parent d0277765c3
commit 95d63334fa
36 changed files with 13596 additions and 5383 deletions

View File

@@ -0,0 +1,198 @@
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
Analyzed File - gromacs-icc-avx512-dp.o
Binary Format - 64Bit
Architecture - SKX
Analysis Type - Throughput
Throughput Analysis Report
--------------------------
Block Throughput: 62.00 Cycles Throughput Bottleneck: Backend
Loop Count: 22
Port Binding In Cycles Per Iteration:
--------------------------------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
--------------------------------------------------------------------------------------------------
| Cycles | 58.0 0.0 | 16.0 | 16.0 15.0 | 16.0 15.0 | 2.0 | 58.0 | 16.0 | 0.0 |
--------------------------------------------------------------------------------------------------
DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3)
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion occurred
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
X - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
-----------------------------------------------------------------------------------------
| 1 | | | 1.0 1.0 | | | | | | mov edx, dword ptr [r10+rsi*4]
| 1 | | | | | | | 1.0 | | inc rsi
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm20, zmmword ptr [rsp+0x380]
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm25, zmmword ptr [rsp+0x340]
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm24, zmmword ptr [rsp+0x1c0]
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm23, zmmword ptr [rsp+0x2c0]
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm16, zmmword ptr [rsp+0x3c0]
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm14, zmmword ptr [rsp+0x300]
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm15, zmmword ptr [rsp+0x240]
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm12, zmmword ptr [rsp+0x180]
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm21, zmmword ptr [rsp+0x200]
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm18, zmmword ptr [rsp+0x140]
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm22, zmmword ptr [rsp+0x100]
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm17, zmmword ptr [rsp+0x280]
| 1 | | 1.0 | | | | | | | lea r12d, ptr [rdx+rdx*2]
| 1 | | | | | | | 1.0 | | shl r12d, 0x3
| 1 | | 1.0 | | | | | | | lea r13d, ptr [rdx+rdx*1]
| 1 | | | | | | | 1.0 | | movsxd r12, r12d
| 1 | | 1.0 | | | | | | | cmp r13d, r11d
| 1 | | 1.0 | | | | | | | lea eax, ptr [rdx+rdx*1+0x1]
| 1 | | | | | | | 1.0 | | mov edx, 0x0
| 1 | | | | | | | 1.0 | | setz dl
| 1 | | 1.0 | | | | | | | cmp eax, r11d
| 1 | | | | | | | 1.0 | | mov eax, 0x0
| 1* | | | | | | | | | mov r13d, edx
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm29, zmm20, zmmword ptr [r8+r12*8+0x80]
| 1 | | | | | | | 1.0 | | setz al
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm26, zmm25, zmmword ptr [r8+r12*8+0x80]
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm25, zmm24, zmmword ptr [r8+r12*8]
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm24, zmm23, zmmword ptr [r8+r12*8+0x40]
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm23, zmm16, zmmword ptr [r8+r12*8+0x80]
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm20, zmm14, zmmword ptr [r8+r12*8+0x80]
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm27, zmm12, zmmword ptr [r8+r12*8+0x40]
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm28, zmm15, zmmword ptr [r8+r12*8]
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm30, zmm21, zmmword ptr [r8+r12*8+0x40]
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm21, zmm18, zmmword ptr [r8+r12*8+0x40]
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm31, zmm22, zmmword ptr [r8+r12*8]
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm22, zmm17, zmmword ptr [r8+r12*8]
| 1 | 1.0 | | | | | | | | vmulpd zmm13, zmm29, zmm29
| 1 | | | | | | 1.0 | | | vmulpd zmm15, zmm26, zmm26
| 1 | 1.0 | | | | | | | | vmulpd zmm12, zmm23, zmm23
| 1 | | | | | | 1.0 | | | vmulpd zmm14, zmm20, zmm20
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm16, zmmword ptr [rsp+0xc0]
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm13, zmm30, zmm30
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm15, zmm27, zmm27
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12, zmm24, zmm24
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm14, zmm21, zmm21
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm13, zmm31, zmm31
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm15, zmm28, zmm28
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12, zmm25, zmm25
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm14, zmm22, zmm22
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm19, zmm13
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm18, zmm15
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm17, zmm12
| 1 | | | | | | 1.0 | | | vcmppd k1, zmm13, zmm16, 0x11
| 1 | | | | | | 1.0 | | | vcmppd k6, zmm15, zmm16, 0x11
| 1 | | | | | | 1.0 | | | vcmppd k7, zmm12, zmm16, 0x11
| 1 | | | | | | 1.0 | | | vcmppd k0, zmm14, zmm16, 0x11
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm15, zmm14
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm16, zmmword ptr [rsp+0x40]
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm12, zmmword ptr [rsp+0x80]
| 1 | 1.0 | | | | | | | | vmulpd zmm13, zmm19, zmm16
| 1 | | | | | | 1.0 | | | vmulpd zmm13, zmm19, zmm13
| 1 | | 1.0 | | | | | | | neg r13d
| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm19, zmm13
| 1* | | | | | | | | | mov r12d, eax
| 1 | | | | | | 1.0 | | | vfmsub213pd zmm13, zmm19, zmm1
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm19, zmm12
| 1 | | | | | | 1.0 | | | vmulpd zmm13, zmm13, zmm19
| 1 | | 1.0 | | | | | | | add r13d, 0xff
| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm14, zmm13
| 1 | | | | | | | 1.0 | | nop
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm13, zmmword ptr [rsp+0x400]
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm10, zmm14
| 1 | | | | | | | 1.0 | | shl r12d, 0x4
| 1 | | 1.0 | | | | | | | sub r13d, r12d
| 1 | | | | | | 1.0 | | | kmovb k5, r13d
| 1 | 1.0 | | | | | | | | kmovw r13d, k1
| 1 | 1.0 | | | | | | | | kmovb r12d, k5
| 1 | | | | | | 1.0 | | | kmovb k5, r12d
| 1 | | | | | | 1.0 | | | kmovb k1, r13d
| 1* | | | | | | | | | mov r13d, eax
| 1 | 1.0 | | | | | | | | kandb k5, k5, k1
| 1 | 1.0 | | | | | | | | kmovb r12d, k5
| 1 | | | | | | 1.0 | | | kmovw k5, r12d
| 1 | | 1.0 | | | | | | | lea r12d, ptr [rdx+rdx*1]
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm9{k5}, zmm19, zmm29
| 1 | | | | | | | 1.0 | | neg r12d
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm13{k5}, zmm19, zmm31
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm31, zmmword ptr [rsp+0x440]
| 1 | 1.0 | | | | | | | | vmulpd zmm29, zmm18, zmm16
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm31{k5}, zmm19, zmm30
| 2^ | | | 1.0 | | 1.0 | | | | vmovups zmmword ptr [rsp+0x400], zmm13
| 1 | 1.0 | | | | | | | | vmulpd zmm30, zmm18, zmm29
| 2^ | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [rsp+0x440], zmm31
| 1 | | | | | | 1.0 | | | vmulpd zmm13, zmm18, zmm30
| 1 | 1.0 | | | | | | | | vfmsub213pd zmm30, zmm18, zmm1
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm18, zmm12
| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm30, zmm18
| 1 | | 1.0 | | | | | | | add r12d, 0xff
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm13, zmm14
| 1 | 1.0 | | | | | | | | vmulpd zmm29, zmm10, zmm19
| 1 | | | | | | | 1.0 | | shl r13d, 0x5
| 1 | | 1.0 | | | | | | | sub r12d, r13d
| 1 | | | | | | 1.0 | | | kmovb k1, r12d
| 1 | 1.0 | | | | | | | | kmovw r12d, k6
| 1 | 1.0 | | | | | | | | kmovb r13d, k1
| 1 | | | | | | 1.0 | | | kmovb k1, r13d
| 1 | | | | | | 1.0 | | | kmovb k6, r12d
| 1* | | | | | | | | | mov r12d, eax
| 1 | 1.0 | | | | | | | | kandb k1, k1, k6
| 1 | 1.0 | | | | | | | | kmovb r13d, k1
| 1 | | | | | | 1.0 | | | kmovw k1, r13d
| 1 | | 1.0 | | | | | | | lea r13d, ptr [rdx*4]
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm6{k1}, zmm29, zmm26
| 1 | | | | | | | 1.0 | | neg r13d
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm7{k1}, zmm29, zmm27
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm8{k1}, zmm29, zmm28
| 1 | 1.0 | | | | | | | | vmulpd zmm26, zmm17, zmm16
| 1 | | | | | | 1.0 | | | vmulpd zmm28, zmm17, zmm12
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm15, zmm16
| 1 | | | | | | 1.0 | | | vmulpd zmm12, zmm15, zmm12
| 1 | 1.0 | | | | | | | | vmulpd zmm27, zmm17, zmm26
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm15, zmm16
| 1 | 1.0 | | | | | | | | vmulpd zmm13, zmm17, zmm27
| 1 | | | | | | 1.0 | | | vfmsub213pd zmm27, zmm17, zmm1
| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm27, zmm28
| 1 | | | | | | | 1.0 | | add r13d, 0xff
| 1 | | | | | | 1.0 | | | vmulpd zmm17, zmm13, zmm14
| 1 | | | | | | | 1.0 | | shl edx, 0x3
| 1 | | | | | | | 1.0 | | shl r12d, 0x6
| 1 | | 1.0 | | | | | | | neg edx
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm10, zmm17
| 1 | | 1.0 | | | | | | | sub r13d, r12d
| 1 | | | | | | 1.0 | | | kmovb k6, r13d
| 1 | | 1.0 | | | | | | | add edx, 0xff
| 1 | | | | | | | 1.0 | | shl eax, 0x7
| 1 | | 1.0 | | | | | | | sub edx, eax
| 1 | 1.0 | | | | | | | | kmovb eax, k6
| 1 | | | | | | 1.0 | | | kmovb k6, eax
| 1 | 1.0 | | | | | | | | kmovw eax, k7
| 1 | | | | | | 1.0 | | | kmovb k7, eax
| 1 | 1.0 | | | | | | | | kandb k7, k6, k7
| 1 | | | | | | 1.0 | | | kmovb k6, edx
| 1 | 1.0 | | | | | | | | kmovb edx, k7
| 1 | | | | | | 1.0 | | | kmovw k7, edx
| 1 | 1.0 | | | | | | | | kmovw edx, k0
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm11{k7}, zmm18, zmm23
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm4{k7}, zmm18, zmm24
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm5{k7}, zmm18, zmm25
| 1 | 1.0 | | | | | | | | vmulpd zmm23, zmm15, zmm19
| 1 | | | | | | 1.0 | | | vfmsub213pd zmm19, zmm15, zmm1
| 1 | 1.0 | | | | | | | | vmulpd zmm15, zmm19, zmm12
| 1 | | | | | | 1.0 | | | vmulpd zmm24, zmm23, zmm15
| 1 | 1.0 | | | | | | | | vmulpd zmm25, zmm10, zmm24
| 1 | 1.0 | | | | | | | | kmovb eax, k6
| 1 | | | | | | 1.0 | | | kmovb k6, eax
| 1 | | | | | | 1.0 | | | kmovb k0, edx
| 1 | 1.0 | | | | | | | | kandb k0, k6, k0
| 1 | 1.0 | | | | | | | | kmovb r12d, k0
| 1 | | | | | | 1.0 | | | kmovw k6, r12d
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm3{k6}, zmm25, zmm22
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm2{k6}, zmm25, zmm21
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm0{k6}, zmm25, zmm20
| 1* | | | | | | | | | cmp rsi, rdi
| 0*F | | | | | | | | | jl 0xfffffffffffffc6f
Total Num Of Uops: 187
Analysis Notes:
Backend allocation was stalled due to unavailable allocation resources.

View File

@@ -0,0 +1,152 @@
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
Analyzed File - gromacs-icc-avx512-sp.o
Binary Format - 64Bit
Architecture - SKX
Analysis Type - Throughput
Throughput Analysis Report
--------------------------
Block Throughput: 51.00 Cycles Throughput Bottleneck: Backend
Loop Count: 22
Port Binding In Cycles Per Iteration:
--------------------------------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
--------------------------------------------------------------------------------------------------
| Cycles | 47.5 0.0 | 9.0 | 11.0 11.0 | 11.0 8.0 | 3.0 | 47.5 | 9.0 | 0.0 |
--------------------------------------------------------------------------------------------------
DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3)
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion occurred
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
X - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
-----------------------------------------------------------------------------------------
| 1 | | | 1.0 1.0 | | | | | | mov edi, dword ptr [rcx+rax*4]
| 1* | | | | | | | | | mov r12d, r13d
| 1 | | | | | | | 1.0 | | movsxd rdi, edi
| 1 | | 1.0 | | | | | | | inc rax
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm10, zmmword ptr [rsp+0x140]
| 1 | | | | | | | 1.0 | | test edi, 0x7fffffff
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm11, zmmword ptr [rsp+0x100]
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm9, zmmword ptr [rsp+0xc0]
| 1 | | | | | | | 1.0 | | setz r12b
| 1 | | 1.0 | | | | | | | lea r14, ptr [rdi+rdi*2]
| 1 | | | | | | | 1.0 | | shl r14, 0x5
| 1* | | | | | | | | | mov r8d, r12d
| 1 | | 1.0 | | | | | | | neg r8d
| 1* | | | | | | | | | mov r11d, r12d
| 1 | | 1.0 | | | | | | | add r8d, 0xff
| 1 | | | | | | 1.0 | | | kmovw k0, r8d
| 1 | | 1.0 | | | | | | | lea r9d, ptr [r12+r12*2]
| 2 | 1.0 | | 1.0 1.0 | | | | | | vsubps zmm3, zmm13, zmmword ptr [r14+rbx*1+0x40]
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm4, zmm10, zmmword ptr [r14+rbx*1]
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm10, zmm11, zmmword ptr [r14+rbx*1+0x40]
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm5, zmm17, zmmword ptr [r14+rbx*1+0x20]
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm27, zmm19, zmmword ptr [r14+rbx*1]
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm8, zmm15, zmmword ptr [r14+rbx*1+0x20]
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm11, zmm0, zmmword ptr [r14+rbx*1+0x40]
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm7, zmm9, zmmword ptr [r14+rbx*1]
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm9, zmm14, zmmword ptr [r14+rbx*1+0x20]
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm29, zmm12, zmmword ptr [r14+rbx*1+0x40]
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm28, zmm16, zmmword ptr [r14+rbx*1+0x20]
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm25, zmm18, zmmword ptr [r14+rbx*1]
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm2, zmm3, zmm3
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm10, zmm10
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm11, zmm11
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm26, zmm29, zmm29
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm2, zmm5, zmm5
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30, zmm8, zmm8
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm1, zmm9, zmm9
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm26, zmm28, zmm28
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm2, zmm27, zmm27
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30, zmm4, zmm4
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm1, zmm7, zmm7
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm26, zmm25, zmm25
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm31, zmm2
| 1 | | | | | | 1.0 | | | vcmpps k7, zmm30, zmm24, 0x11
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm6, zmm30
| 1 | | | | | | 1.0 | | | vcmpps k3, zmm2, zmm24, 0x11
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm2, zmm1
| 1 | | | | | | 1.0 | | | vcmpps k5, zmm26, zmm24, 0x11
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm26, zmm26
| 1 | | | | | | 1.0 | | | vmulps zmm30, zmm31, zmm23
| 1 | 1.0 | | | | | | | | kandw k2, k0, k3
| 1 | | | | | | 1.0 | | | vcmpps k3, zmm1, zmm24, 0x11
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm31, zmm30
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm31, zmm1
| 1 | | | | | | | 1.0 | | neg r9d
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm1, zmm31, zmm20
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm31, zmm22
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm1, zmm31
| 1 | | | | | | | 1.0 | | add r9d, 0xff
| 1 | | | | | | 1.0 | | | kmovw k4, r9d
| 1 | 1.0 | | | | | | | | vmulps zmm30, zmm30, zmm31
| 1 | 1.0 | | | | | | | | kandw k1, k4, k5
| 1 | | | | | | 1.0 | | | vmulps zmm1, zmm21, zmm30
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm26, zmm23
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm26, zmm30
| 1 | | 1.0 | | | | | | | lea r10d, ptr [r12*8]
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm26, zmm31
| 1 | | | | | | | 1.0 | | neg r10d
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm31, zmm26, zmm20
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm26, zmm26, zmm22
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm31, zmm26
| 1 | | 1.0 | | | | | | | add r10d, r12d
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm30, zmm31
| 1 | | | | | | | 1.0 | | add r10d, 0xff
| 1 | | | | | | 1.0 | | | kmovw k6, r10d
| 1 | 1.0 | | | | | | | | vmulps zmm26, zmm21, zmm30
| 1 | 1.0 | | | | | | | | kandw k4, k6, k7
| 1 | | | | | | 1.0 | | | vmulps zmm25{k1}{z}, zmm25, zmm26
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31{k1}{z}, zmm28, zmm26
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm28, zmm6, zmm23
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30{k1}{z}, zmm29, zmm26
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm29, zmm2, zmm23
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm25{k2}, zmm27, zmm1
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm31{k2}, zmm5, zmm1
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm27, zmm6, zmm28
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30{k2}, zmm3, zmm1
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm2, zmm29
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm5, zmm6, zmm27
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm27, zmm6, zmm20
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm6, zmm6, zmm22
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm3, zmm2, zmm1
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm1, zmm2, zmm20
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm2, zmm2, zmm22
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm26, zmm27, zmm6
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm1, zmm2
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm5, zmm5, zmm26
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm3, zmm3, zmm1
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm6, zmm21, zmm5
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm27, zmm21, zmm3
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm25{k4}, zmm4, zmm6
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm4, zmmword ptr [r14+rsi*1]
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm31{k4}, zmm8, zmm6
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30{k4}, zmm10, zmm6
| 1 | | | | | | | 1.0 | | shl r11d, 0x4
| 1 | | 1.0 | | | | | | | sub r12d, r11d
| 1 | | 1.0 | | | | | | | add r12d, 0xff
| 1 | | | | | | 1.0 | | | kmovw k0, r12d
| 1 | 1.0 | | | | | | | | kandw k5, k0, k3
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm25{k5}, zmm7, zmm27
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm31{k5}, zmm9, zmm27
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30{k5}, zmm11, zmm27
| 1 | 0.5 | | | | | 0.5 | | | vsubps zmm7, zmm4, zmm25
| 2 | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [r14+rsi*1], zmm7
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm8, zmmword ptr [r14+rsi*1+0x20]
| 1 | 0.5 | | | | | 0.5 | | | vsubps zmm4, zmm8, zmm31
| 2 | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [r14+rsi*1+0x20], zmm4
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm1, zmmword ptr [r14+rsi*1+0x40]
| 1 | 0.5 | | | | | 0.5 | | | vsubps zmm2, zmm1, zmm30
| 2 | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [r14+rsi*1+0x40], zmm2
| 1* | | | | | | | | | cmp rax, rdx
| 0*F | | | | | | | | | jb 0xfffffffffffffd30
Total Num Of Uops: 142
Analysis Notes:
Backend allocation was stalled due to unavailable allocation resources.

View File

@@ -0,0 +1,154 @@
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
Analyzed File - gromacs-icx-avx512-dp.o
Binary Format - 64Bit
Architecture - SKX
Analysis Type - Throughput
Throughput Analysis Report
--------------------------
Block Throughput: 49.26 Cycles Throughput Bottleneck: Backend
Loop Count: 22
Port Binding In Cycles Per Iteration:
--------------------------------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
--------------------------------------------------------------------------------------------------
| Cycles | 44.0 0.0 | 13.5 | 5.5 5.5 | 5.5 5.5 | 0.0 | 44.0 | 13.5 | 0.0 |
--------------------------------------------------------------------------------------------------
DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3)
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion occurred
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
X - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
-----------------------------------------------------------------------------------------
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | movsxd rcx, dword ptr [r10+rbx*4]
| 1 | | 1.0 | | | | | | | lea rdx, ptr [rcx+rcx*2]
| 1 | | | | | | | 1.0 | | shl rdx, 0x6
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm28, zmmword ptr [rsi+rdx*1]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm29, zmmword ptr [rsi+rdx*1+0x40]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm30, zmmword ptr [rsi+rdx*1+0x80]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm3, zmmword ptr [rsp+0x10]
| 1 | | | | | | 1.0 | | | vsubpd zmm3, zmm3, zmm28
| 1 | 1.0 | | | | | | | | vsubpd zmm31, zmm24, zmm30
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm16, zmmword ptr [rsp+0x150]
| 1 | | | | | | 1.0 | | | vsubpd zmm16, zmm16, zmm29
| 1 | 1.0 | | | | | | | | vmulpd zmm17, zmm31, zmm31
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm17, zmm16, zmm16
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm17, zmm3, zmm3
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm18, zmm17
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm21, zmm18
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm18, zmm19
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm18, zmm19
| 1 | | | | | | 1.0 | | | vaddpd zmm20, zmm19, zmm1
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm22, zmm18
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm18, zmm20
| 1 | 1.0 | | | | | | | | vsubpd zmm20, zmm25, zmm30
| 1 | | 1.0 | | | | | | | lea edx, ptr [rcx+rcx*1]
| 1 | | | | | | | 1.0 | | cmp r11, rdx
| 1 | | | | | | | 1.0 | | setnz dl
| 1 | | | | | | | 1.0 | | setz al
| 1 | | 1.0 | | | | | | | add ecx, ecx
| 1 | | 1.0 | | | | | | | inc ecx
| 1 | | 0.5 | | | | | 0.5 | | cmp r11, rcx
| 1 | | | | | | | 1.0 | | setz cl
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm19, zmm18
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm19, zmmword ptr [rsp+0x210]
| 1 | 1.0 | | | | | | | | vsubpd zmm19, zmm19, zmm28
| 1 | | | | | | | 1.0 | | setnz dil
| 1* | | | | | | | | | mov ebp, edi
| 1 | | | | | | | 1.0 | | shl bpl, 0x4
| 1 | | 1.0 | | | | | | | sub bpl, al
| 1 | | 1.0 | | | | | | | add bpl, 0xef
| 1 | | | | | | 1.0 | | | kmovd k1, ebp
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm17, zmm0, 0x1
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm17, zmmword ptr [rsp+0x110]
| 1 | 1.0 | | | | | | | | vsubpd zmm17, zmm17, zmm29
| 1 | | 1.0 | | | | | | | lea eax, ptr [rdx+rdx*1]
| 1* | | | | | | | | | mov ebp, edi
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm18, zmm2
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm14{k1}, zmm3, zmm18
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm20, zmm20
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm3, zmm17, zmm17
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm3, zmm19, zmm19
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm11{k1}, zmm16, zmm18
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm16, zmm3
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm7{k1}, zmm31, zmm18
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm21, zmm16
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm16, zmm18
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm16, zmm18
| 1 | | | | | | 1.0 | | | vaddpd zmm31, zmm18, zmm1
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm22, zmm16
| 1 | | | | | | 1.0 | | | vmulpd zmm16, zmm16, zmm31
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm31, zmmword ptr [rsp+0x1d0]
| 1 | 1.0 | | | | | | | | vsubpd zmm31, zmm31, zmm28
| 1 | | | | | | | 1.0 | | shl bpl, 0x5
| 1 | | 1.0 | | | | | | | or bpl, al
| 1 | | 0.5 | | | | | 0.5 | | or bpl, 0xdd
| 1 | | | | | | 1.0 | | | kmovd k1, ebp
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm3, zmm0, 0x1
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm3, zmmword ptr [rsp+0xd0]
| 1 | 1.0 | | | | | | | | vsubpd zmm3, zmm3, zmm29
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm18, zmm16
| 1 | | | | | | 1.0 | | | vsubpd zmm18, zmm26, zmm30
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm16, zmm2
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm15{k1}, zmm19, zmm16
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm18, zmm18
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm19, zmm3, zmm3
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm19, zmm31, zmm31
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm10{k1}, zmm17, zmm16
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm17, zmm19
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm6{k1}, zmm20, zmm16
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm21, zmm17
| 1 | | | | | | 1.0 | | | vmulpd zmm16, zmm17, zmm16
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm17, zmm16
| 1 | | | | | | 1.0 | | | vaddpd zmm20, zmm16, zmm1
| 1 | 1.0 | | | | | | | | vmulpd zmm17, zmm22, zmm17
| 1 | | | | | | 1.0 | | | vmulpd zmm17, zmm17, zmm20
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm16, zmm17
| 1 | | 1.0 | | | | | | | lea eax, ptr [rdx*4]
| 1 | | | | | | | 1.0 | | shl dil, 0x6
| 1 | | 0.5 | | | | | 0.5 | | or dil, al
| 1 | | 0.5 | | | | | 0.5 | | or dil, 0xbb
| 1 | | | | | | 1.0 | | | kmovd k1, edi
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm19, zmm0, 0x1
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm17, zmmword ptr [rsp+0x190]
| 1 | 1.0 | | | | | | | | vsubpd zmm17, zmm17, zmm28
| 1 | 1.0 | | | | | | | | vsubpd zmm19, zmm23, zmm29
| 1 | | | | | | 1.0 | | | vsubpd zmm20, zmm27, zmm30
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm16, zmm2
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm13{k1}, zmm31, zmm16
| 1 | 1.0 | | | | | | | | vmulpd zmm28, zmm20, zmm20
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm28, zmm19, zmm19
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm28, zmm17, zmm17
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm9{k1}, zmm3, zmm16
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm3, zmm28
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm5{k1}, zmm18, zmm16
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm21, zmm3
| 1 | | | | | | 1.0 | | | vmulpd zmm16, zmm3, zmm16
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm3, zmm16
| 1 | | | | | | 1.0 | | | vaddpd zmm18, zmm16, zmm1
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm22, zmm3
| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm3, zmm18
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm16, zmm3
| 1 | | | | | | | 1.0 | | shl dl, 0x3
| 1 | | | | | | | 1.0 | | shl cl, 0x7
| 1 | | 1.0 | | | | | | | or cl, dl
| 1 | | 1.0 | | | | | | | add cl, 0xf7
| 1 | | | | | | 1.0 | | | kmovd k1, ecx
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm28, zmm0, 0x1
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm3, zmm2
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12{k1}, zmm17, zmm3
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm8{k1}, zmm19, zmm3
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm4{k1}, zmm20, zmm3
| 1 | | 0.5 | | | | | 0.5 | | inc rbx
| 1* | | | | | | | | | cmp r9, rbx
| 0*F | | | | | | | | | jnz 0xfffffffffffffd5a
Total Num Of Uops: 129
Analysis Notes:
Backend allocation was stalled due to unavailable allocation resources.

View File

@@ -0,0 +1,288 @@
[0] Code Region
Iterations: 100
Instructions: 12200
Total Cycles: 4745
Total uOps: 14000
Dispatch Width: 6
uOps Per Cycle: 2.95
IPC: 2.57
Block RThroughput: 34.0
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 5 0.50 * movslq (%r10,%rbx,4), %rcx
1 1 0.50 leaq (%rcx,%rcx,2), %rdx
1 1 0.50 shlq $6, %rdx
2 8 0.50 * vmovupd (%rsi,%rdx), %zmm28
2 8 0.50 * vmovupd 64(%rsi,%rdx), %zmm29
2 8 0.50 * vmovupd 128(%rsi,%rdx), %zmm30
2 8 0.50 * vmovupd 16(%rsp), %zmm3
1 4 0.50 vsubpd %zmm28, %zmm3, %zmm3
1 4 0.50 vsubpd %zmm30, %zmm24, %zmm31
2 8 0.50 * vmovupd 336(%rsp), %zmm16
1 4 0.50 vsubpd %zmm29, %zmm16, %zmm16
1 4 0.50 vmulpd %zmm31, %zmm31, %zmm17
1 4 0.50 vfmadd231pd %zmm16, %zmm16, %zmm17
1 4 0.50 vfmadd231pd %zmm3, %zmm3, %zmm17
3 4 2.00 vrcp14pd %zmm17, %zmm18
1 4 0.50 vmulpd %zmm18, %zmm21, %zmm19
1 4 0.50 vmulpd %zmm19, %zmm18, %zmm19
1 4 0.50 vmulpd %zmm19, %zmm18, %zmm19
1 4 0.50 vaddpd %zmm1, %zmm19, %zmm20
1 4 0.50 vmulpd %zmm18, %zmm22, %zmm18
1 4 0.50 vmulpd %zmm20, %zmm18, %zmm18
1 4 0.50 vsubpd %zmm30, %zmm25, %zmm20
1 1 0.50 leal (%rcx,%rcx), %edx
1 1 0.25 cmpq %rdx, %r11
1 1 0.50 setne %dl
1 1 0.50 sete %al
1 1 0.25 addl %ecx, %ecx
1 1 0.25 incl %ecx
1 1 0.25 cmpq %rcx, %r11
1 1 0.50 sete %cl
1 4 0.50 vmulpd %zmm18, %zmm19, %zmm18
2 8 0.50 * vmovupd 528(%rsp), %zmm19
1 4 0.50 vsubpd %zmm28, %zmm19, %zmm19
1 1 0.50 setne %dil
1 1 0.25 movl %edi, %ebp
1 1 0.50 shlb $4, %bpl
1 1 0.25 subb %al, %bpl
1 1 0.25 addb $-17, %bpl
1 1 1.00 kmovd %ebp, %k1
1 4 1.00 vcmpltpd %zmm0, %zmm17, %k1 {%k1}
2 8 0.50 * vmovupd 272(%rsp), %zmm17
1 4 0.50 vsubpd %zmm29, %zmm17, %zmm17
1 1 0.50 leal (%rdx,%rdx), %eax
1 1 0.25 movl %edi, %ebp
1 4 0.50 vmulpd %zmm2, %zmm18, %zmm18
1 4 0.50 vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1}
1 4 0.50 vmulpd %zmm20, %zmm20, %zmm3
1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm3
1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm3
1 4 0.50 vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1}
3 4 2.00 vrcp14pd %zmm3, %zmm16
1 4 0.50 vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1}
1 4 0.50 vmulpd %zmm16, %zmm21, %zmm18
1 4 0.50 vmulpd %zmm18, %zmm16, %zmm18
1 4 0.50 vmulpd %zmm18, %zmm16, %zmm18
1 4 0.50 vaddpd %zmm1, %zmm18, %zmm31
1 4 0.50 vmulpd %zmm16, %zmm22, %zmm16
1 4 0.50 vmulpd %zmm31, %zmm16, %zmm16
2 8 0.50 * vmovupd 464(%rsp), %zmm31
1 4 0.50 vsubpd %zmm28, %zmm31, %zmm31
1 1 0.50 shlb $5, %bpl
1 1 0.25 orb %al, %bpl
1 1 0.25 orb $-35, %bpl
1 1 1.00 kmovd %ebp, %k1
1 4 1.00 vcmpltpd %zmm0, %zmm3, %k1 {%k1}
2 8 0.50 * vmovupd 208(%rsp), %zmm3
1 4 0.50 vsubpd %zmm29, %zmm3, %zmm3
1 4 0.50 vmulpd %zmm16, %zmm18, %zmm16
1 4 0.50 vsubpd %zmm30, %zmm26, %zmm18
1 4 0.50 vmulpd %zmm2, %zmm16, %zmm16
1 4 0.50 vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1}
1 4 0.50 vmulpd %zmm18, %zmm18, %zmm19
1 4 0.50 vfmadd231pd %zmm3, %zmm3, %zmm19
1 4 0.50 vfmadd231pd %zmm31, %zmm31, %zmm19
1 4 0.50 vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1}
3 4 2.00 vrcp14pd %zmm19, %zmm17
1 4 0.50 vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1}
1 4 0.50 vmulpd %zmm17, %zmm21, %zmm16
1 4 0.50 vmulpd %zmm16, %zmm17, %zmm16
1 4 0.50 vmulpd %zmm16, %zmm17, %zmm16
1 4 0.50 vaddpd %zmm1, %zmm16, %zmm20
1 4 0.50 vmulpd %zmm17, %zmm22, %zmm17
1 4 0.50 vmulpd %zmm20, %zmm17, %zmm17
1 4 0.50 vmulpd %zmm17, %zmm16, %zmm16
1 1 0.50 leal (,%rdx,4), %eax
1 1 0.50 shlb $6, %dil
1 1 0.25 orb %al, %dil
1 1 0.25 orb $-69, %dil
1 1 1.00 kmovd %edi, %k1
1 4 1.00 vcmpltpd %zmm0, %zmm19, %k1 {%k1}
2 8 0.50 * vmovupd 400(%rsp), %zmm17
1 4 0.50 vsubpd %zmm28, %zmm17, %zmm17
1 4 0.50 vsubpd %zmm29, %zmm23, %zmm19
1 4 0.50 vsubpd %zmm30, %zmm27, %zmm20
1 4 0.50 vmulpd %zmm2, %zmm16, %zmm16
1 4 0.50 vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1}
1 4 0.50 vmulpd %zmm20, %zmm20, %zmm28
1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm28
1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm28
1 4 0.50 vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1}
3 4 2.00 vrcp14pd %zmm28, %zmm3
1 4 0.50 vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1}
1 4 0.50 vmulpd %zmm3, %zmm21, %zmm16
1 4 0.50 vmulpd %zmm16, %zmm3, %zmm16
1 4 0.50 vmulpd %zmm16, %zmm3, %zmm16
1 4 0.50 vaddpd %zmm1, %zmm16, %zmm18
1 4 0.50 vmulpd %zmm3, %zmm22, %zmm3
1 4 0.50 vmulpd %zmm18, %zmm3, %zmm3
1 4 0.50 vmulpd %zmm3, %zmm16, %zmm3
1 1 0.50 shlb $3, %dl
1 1 0.50 shlb $7, %cl
1 1 0.25 orb %dl, %cl
1 1 0.25 addb $-9, %cl
1 1 1.00 kmovd %ecx, %k1
1 4 1.00 vcmpltpd %zmm0, %zmm28, %k1 {%k1}
1 4 0.50 vmulpd %zmm2, %zmm3, %zmm3
1 4 0.50 vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1}
1 4 0.50 vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1}
1 4 0.50 vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1}
1 1 0.25 incq %rbx
1 1 0.25 cmpq %rbx, %r9
1 1 0.50 jne .LBB5_12
Resources:
[0] - SKXDivider
[1] - SKXFPDivider
[2] - SKXPort0
[3] - SKXPort1
[4] - SKXPort2
[5] - SKXPort3
[6] - SKXPort4
[7] - SKXPort5
[8] - SKXPort6
[9] - SKXPort7
Resource pressure per iteration:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
- - 45.53 20.45 5.50 5.50 - 44.64 18.38 -
Resource pressure by instruction:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
- - - - 0.50 0.50 - - - - movslq (%r10,%rbx,4), %rcx
- - - 0.99 - - - 0.01 - - leaq (%rcx,%rcx,2), %rdx
- - 0.01 - - - - - 0.99 - shlq $6, %rdx
- - 0.01 0.99 0.49 0.51 - - - - vmovupd (%rsi,%rdx), %zmm28
- - 0.01 0.91 0.51 0.49 - 0.08 - - vmovupd 64(%rsi,%rdx), %zmm29
- - 0.01 0.56 0.49 0.51 - 0.43 - - vmovupd 128(%rsi,%rdx), %zmm30
- - - 0.99 0.50 0.50 - 0.01 - - vmovupd 16(%rsp), %zmm3
- - 0.95 - - - - 0.05 - - vsubpd %zmm28, %zmm3, %zmm3
- - 0.48 - - - - 0.52 - - vsubpd %zmm30, %zmm24, %zmm31
- - - 1.00 0.50 0.50 - - - - vmovupd 336(%rsp), %zmm16
- - 0.49 - - - - 0.51 - - vsubpd %zmm29, %zmm16, %zmm16
- - 0.48 - - - - 0.52 - - vmulpd %zmm31, %zmm31, %zmm17
- - 0.49 - - - - 0.51 - - vfmadd231pd %zmm16, %zmm16, %zmm17
- - 0.04 - - - - 0.96 - - vfmadd231pd %zmm3, %zmm3, %zmm17
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm17, %zmm18
- - 1.00 - - - - - - - vmulpd %zmm18, %zmm21, %zmm19
- - 0.51 - - - - 0.49 - - vmulpd %zmm19, %zmm18, %zmm19
- - 0.49 - - - - 0.51 - - vmulpd %zmm19, %zmm18, %zmm19
- - 0.07 - - - - 0.93 - - vaddpd %zmm1, %zmm19, %zmm20
- - - - - - - 1.00 - - vmulpd %zmm18, %zmm22, %zmm18
- - 0.95 - - - - 0.05 - - vmulpd %zmm20, %zmm18, %zmm18
- - 0.92 - - - - 0.08 - - vsubpd %zmm30, %zmm25, %zmm20
- - - 0.94 - - - 0.06 - - leal (%rcx,%rcx), %edx
- - - - - - - - 1.00 - cmpq %rdx, %r11
- - - - - - - - 1.00 - setne %dl
- - 0.44 - - - - - 0.56 - sete %al
- - - 0.07 - - - 0.02 0.91 - addl %ecx, %ecx
- - - 0.53 - - - 0.46 0.01 - incl %ecx
- - - 0.51 - - - 0.46 0.03 - cmpq %rcx, %r11
- - 0.02 - - - - - 0.98 - sete %cl
- - 0.94 - - - - 0.06 - - vmulpd %zmm18, %zmm19, %zmm18
- - 0.01 0.99 0.51 0.49 - - - - vmovupd 528(%rsp), %zmm19
- - 0.47 - - - - 0.53 - - vsubpd %zmm28, %zmm19, %zmm19
- - 0.04 - - - - - 0.96 - setne %dil
- - - 0.95 - - - 0.02 0.03 - movl %edi, %ebp
- - 0.01 - - - - - 0.99 - shlb $4, %bpl
- - - 0.96 - - - - 0.04 - subb %al, %bpl
- - - 0.06 - - - - 0.94 - addb $-17, %bpl
- - - - - - - 1.00 - - kmovd %ebp, %k1
- - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm17, %k1 {%k1}
- - 0.02 0.97 0.50 0.50 - 0.01 - - vmovupd 272(%rsp), %zmm17
- - 0.96 - - - - 0.04 - - vsubpd %zmm29, %zmm17, %zmm17
- - - 1.00 - - - - - - leal (%rdx,%rdx), %eax
- - - 0.05 - - - - 0.95 - movl %edi, %ebp
- - 0.51 - - - - 0.49 - - vmulpd %zmm2, %zmm18, %zmm18
- - 0.53 - - - - 0.47 - - vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1}
- - 0.45 - - - - 0.55 - - vmulpd %zmm20, %zmm20, %zmm3
- - 0.94 - - - - 0.06 - - vfmadd231pd %zmm17, %zmm17, %zmm3
- - 0.03 - - - - 0.97 - - vfmadd231pd %zmm19, %zmm19, %zmm3
- - 0.47 - - - - 0.53 - - vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1}
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm3, %zmm16
- - 0.53 - - - - 0.47 - - vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1}
- - 0.99 - - - - 0.01 - - vmulpd %zmm16, %zmm21, %zmm18
- - 0.99 - - - - 0.01 - - vmulpd %zmm18, %zmm16, %zmm18
- - 0.97 - - - - 0.03 - - vmulpd %zmm18, %zmm16, %zmm18
- - 0.52 - - - - 0.48 - - vaddpd %zmm1, %zmm18, %zmm31
- - 0.01 - - - - 0.99 - - vmulpd %zmm16, %zmm22, %zmm16
- - 0.52 - - - - 0.48 - - vmulpd %zmm31, %zmm16, %zmm16
- - - 0.99 0.50 0.50 - 0.01 - - vmovupd 464(%rsp), %zmm31
- - 0.03 - - - - 0.97 - - vsubpd %zmm28, %zmm31, %zmm31
- - 0.01 - - - - - 0.99 - shlb $5, %bpl
- - - 0.94 - - - - 0.06 - orb %al, %bpl
- - - 0.04 - - - - 0.96 - orb $-35, %bpl
- - - - - - - 1.00 - - kmovd %ebp, %k1
- - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm3, %k1 {%k1}
- - - 0.99 0.50 0.50 - 0.01 - - vmovupd 208(%rsp), %zmm3
- - 0.95 - - - - 0.05 - - vsubpd %zmm29, %zmm3, %zmm3
- - 0.51 - - - - 0.49 - - vmulpd %zmm16, %zmm18, %zmm16
- - 0.01 - - - - 0.99 - - vsubpd %zmm30, %zmm26, %zmm18
- - 0.52 - - - - 0.48 - - vmulpd %zmm2, %zmm16, %zmm16
- - 1.00 - - - - - - - vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1}
- - 0.03 - - - - 0.97 - - vmulpd %zmm18, %zmm18, %zmm19
- - 0.06 - - - - 0.94 - - vfmadd231pd %zmm3, %zmm3, %zmm19
- - 0.50 - - - - 0.50 - - vfmadd231pd %zmm31, %zmm31, %zmm19
- - - - - - - 1.00 - - vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1}
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm19, %zmm17
- - 1.00 - - - - - - - vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1}
- - 0.07 - - - - 0.93 - - vmulpd %zmm17, %zmm21, %zmm16
- - 0.50 - - - - 0.50 - - vmulpd %zmm16, %zmm17, %zmm16
- - 0.09 - - - - 0.91 - - vmulpd %zmm16, %zmm17, %zmm16
- - 0.07 - - - - 0.93 - - vaddpd %zmm1, %zmm16, %zmm20
- - 0.93 - - - - 0.07 - - vmulpd %zmm17, %zmm22, %zmm17
- - 0.94 - - - - 0.06 - - vmulpd %zmm20, %zmm17, %zmm17
- - 0.51 - - - - 0.49 - - vmulpd %zmm17, %zmm16, %zmm16
- - - 1.00 - - - - - - leal (,%rdx,4), %eax
- - - - - - - - 1.00 - shlb $6, %dil
- - - 0.02 - - - - 0.98 - orb %al, %dil
- - - 0.48 - - - - 0.52 - orb $-69, %dil
- - - - - - - 1.00 - - kmovd %edi, %k1
- - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm19, %k1 {%k1}
- - - 1.00 0.50 0.50 - - - - vmovupd 400(%rsp), %zmm17
- - 0.49 - - - - 0.51 - - vsubpd %zmm28, %zmm17, %zmm17
- - 0.49 - - - - 0.51 - - vsubpd %zmm29, %zmm23, %zmm19
- - 0.02 - - - - 0.98 - - vsubpd %zmm30, %zmm27, %zmm20
- - 0.98 - - - - 0.02 - - vmulpd %zmm2, %zmm16, %zmm16
- - 0.50 - - - - 0.50 - - vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1}
- - 0.94 - - - - 0.06 - - vmulpd %zmm20, %zmm20, %zmm28
- - 0.04 - - - - 0.96 - - vfmadd231pd %zmm19, %zmm19, %zmm28
- - 0.07 - - - - 0.93 - - vfmadd231pd %zmm17, %zmm17, %zmm28
- - 0.50 - - - - 0.50 - - vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1}
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm28, %zmm3
- - 0.50 - - - - 0.50 - - vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1}
- - 1.00 - - - - - - - vmulpd %zmm3, %zmm21, %zmm16
- - 0.55 - - - - 0.45 - - vmulpd %zmm16, %zmm3, %zmm16
- - 0.99 - - - - 0.01 - - vmulpd %zmm16, %zmm3, %zmm16
- - 0.99 - - - - 0.01 - - vaddpd %zmm1, %zmm16, %zmm18
- - - - - - - 1.00 - - vmulpd %zmm3, %zmm22, %zmm3
- - 0.52 - - - - 0.48 - - vmulpd %zmm18, %zmm3, %zmm3
- - 0.99 - - - - 0.01 - - vmulpd %zmm3, %zmm16, %zmm3
- - - - - - - - 1.00 - shlb $3, %dl
- - - - - - - - 1.00 - shlb $7, %cl
- - - 1.00 - - - - - - orb %dl, %cl
- - - 0.52 - - - - 0.48 - addb $-9, %cl
- - - - - - - 1.00 - - kmovd %ecx, %k1
- - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm28, %k1 {%k1}
- - 0.98 - - - - 0.02 - - vmulpd %zmm2, %zmm3, %zmm3
- - 0.97 - - - - 0.03 - - vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1}
- - 0.03 - - - - 0.97 - - vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1}
- - 0.97 - - - - 0.03 - - vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1}
- - - 0.48 - - - - 0.52 - incq %rbx
- - - 0.52 - - - - 0.48 - cmpq %rbx, %r9
- - - - - - - - 1.00 - jne .LBB5_12

View File

@@ -0,0 +1,167 @@
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
Analyzed file: gromacs-icx-avx512-dp.s
Architecture: CSX
Timestamp: 2023-02-10 16:30:53
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
--------------------------------------------------------------------------------------------------
2241 | | | | | | | | || | | # pointer_increment=64 da67166e5736661e6b03ea29ee7bfd67
2242 | | | | | | | | || | | # LLVM-MCA-BEGIN
2243 | | | | | | | | || | | .LBB5_12: # Parent Loop BB5_7 Depth=1
2244 | | | | | | | | || | | # => This Inner Loop Header: Depth=2
2245 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | movslq (%r10,%rbx,4), %rcx
2246 | | 1.00 | | | | 0.00 | | || 1.0 | | leaq (%rcx,%rcx,2), %rdx
2247 | 0.00 | | | | | | 1.00 | || 1.0 | | shlq $6, %rdx
2248 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd (%rsi,%rdx), %zmm28 # AlignMOV convert to UnAlignMOV
2249 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 64(%rsi,%rdx), %zmm29 # AlignMOV convert to UnAlignMOV
2250 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovupd 128(%rsi,%rdx), %zmm30 # AlignMOV convert to UnAlignMOV
2251 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 16(%rsp), %zmm3 # 64-byte Reload
2252 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm28, %zmm3, %zmm3
2253 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm30, %zmm24, %zmm31
2254 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 336(%rsp), %zmm16 # 64-byte Reload
2255 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm16, %zmm16
2256 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm31, %zmm31, %zmm17
2257 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm16, %zmm16, %zmm17 # zmm17 = (zmm16 * zmm16) + zmm17
2258 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm3, %zmm3, %zmm17 # zmm17 = (zmm3 * zmm3) + zmm17
2259 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm17, %zmm18
2260 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm18, %zmm21, %zmm19
2261 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm19, %zmm18, %zmm19
2262 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm19, %zmm18, %zmm19
2263 | 0.50 | | | | | 0.50 | | || 4.0 | | vaddpd %zmm1, %zmm19, %zmm20
2264 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm22, %zmm18
2265 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm18, %zmm18
2266 | 1.00 | | | | | 0.00 | | || | | vsubpd %zmm30, %zmm25, %zmm20
2267 | | 1.00 | | | | 0.00 | | || | | leal (%rcx,%rcx), %edx
2268 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rdx, %r11
2269 | 0.00 | | | | | | 1.00 | || | | setne %dl
2270 | 0.00 | | | | | | 1.00 | || | | sete %al
2271 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | addl %ecx, %ecx
2272 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | incl %ecx
2273 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rcx, %r11
2274 | 0.00 | | | | | | 1.00 | || | | sete %cl
2275 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm18, %zmm19, %zmm18
2276 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 528(%rsp), %zmm19 # 64-byte Reload
2277 | 1.00 | | | | | 0.00 | | || | | vsubpd %zmm28, %zmm19, %zmm19
2278 | 0.00 | | | | | | 1.00 | || | | setne %dil
2279 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %edi, %ebp
2280 | 0.00 | | | | | | 1.00 | || | 1.0 | shlb $4, %bpl
2281 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | 1.0 | subb %al, %bpl
2282 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | 1.0 | addb $-17, %bpl
2283 | 1.00 | | | | | | | || | | kmovd %ebp, %k1
2284 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm17, %k1 {%k1}
2285 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 272(%rsp), %zmm17 # 64-byte Reload
2286 | 0.25 | | | | | 0.75 | | || | | vsubpd %zmm29, %zmm17, %zmm17
2287 | | 1.00 | | | | 0.00 | | || | | leal (%rdx,%rdx), %eax
2288 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %edi, %ebp
2289 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm2, %zmm18, %zmm18
2290 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14
2291 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm20, %zmm3
2292 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm17, %zmm17, %zmm3 # zmm3 = (zmm17 * zmm17) + zmm3
2293 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm19, %zmm19, %zmm3 # zmm3 = (zmm19 * zmm19) + zmm3
2294 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11
2295 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm3, %zmm16
2296 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7
2297 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm21, %zmm18
2298 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm16, %zmm18
2299 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm16, %zmm18
2300 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm1, %zmm18, %zmm31
2301 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm22, %zmm16
2302 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm31, %zmm16, %zmm16
2303 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 464(%rsp), %zmm31 # 64-byte Reload
2304 | 0.75 | | | | | 0.25 | | || | | vsubpd %zmm28, %zmm31, %zmm31
2305 | 0.00 | | | | | | 1.00 | || | 1.0 | shlb $5, %bpl
2306 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | 1.0 | orb %al, %bpl
2307 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | 1.0 | orb $-35, %bpl
2308 | 1.00 | | | | | | | || | | kmovd %ebp, %k1
2309 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm3, %k1 {%k1}
2310 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 208(%rsp), %zmm3 # 64-byte Reload
2311 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm3, %zmm3
2312 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm18, %zmm16
2313 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm26, %zmm18
2314 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm2, %zmm16, %zmm16
2315 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15
2316 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm18, %zmm19
2317 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm3, %zmm3, %zmm19 # zmm19 = (zmm3 * zmm3) + zmm19
2318 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm31, %zmm31, %zmm19 # zmm19 = (zmm31 * zmm31) + zmm19
2319 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10
2320 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm19, %zmm17
2321 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6
2322 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm21, %zmm16
2323 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm17, %zmm16
2324 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm17, %zmm16
2325 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm1, %zmm16, %zmm20
2326 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm22, %zmm17
2327 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm17, %zmm17
2328 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm16, %zmm16
2329 | | 1.00 | | | | 0.00 | | || | | leal (,%rdx,4), %eax
2330 | 0.00 | | | | | | 1.00 | || | | shlb $6, %dil
2331 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | orb %al, %dil
2332 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | orb $-69, %dil
2333 | 1.00 | | | | | | | || | | kmovd %edi, %k1
2334 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm19, %k1 {%k1}
2335 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 400(%rsp), %zmm17 # 64-byte Reload
2336 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm28, %zmm17, %zmm17
2337 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm23, %zmm19
2338 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm27, %zmm20
2339 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm2, %zmm16, %zmm16
2340 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13
2341 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm20, %zmm28
2342 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm19, %zmm19, %zmm28 # zmm28 = (zmm19 * zmm19) + zmm28
2343 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm17, %zmm17, %zmm28 # zmm28 = (zmm17 * zmm17) + zmm28
2344 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9
2345 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm28, %zmm3
2346 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5
2347 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm21, %zmm16
2348 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm16, %zmm3, %zmm16
2349 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm16, %zmm3, %zmm16
2350 | 0.00 | | | | | 1.00 | | || | | vaddpd %zmm1, %zmm16, %zmm18
2351 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm22, %zmm3
2352 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm18, %zmm3, %zmm3
2353 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm16, %zmm3
2354 | 0.00 | | | | | | 1.00 | || | | shlb $3, %dl
2355 | 0.00 | | | | | | 1.00 | || | | shlb $7, %cl
2356 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | | orb %dl, %cl
2357 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | addb $-9, %cl
2358 | 1.00 | | | | | | | || | | kmovd %ecx, %k1
2359 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm28, %k1 {%k1}
2360 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm2, %zmm3, %zmm3
2361 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12
2362 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8
2363 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4
2364 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | incq %rbx
2365 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rbx, %r9
2366 | | | | | | | | || | | * jne .LBB5_12
2367 | | | | | | | | || | | # LLVM-MCA-END
44.0 15.0 5.50 5.50 5.50 5.50 44.0 15.0 66.0 6.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
2280 | 6.0 | shlb $4, %bpl | [2280, 2281, 2282, 2305, 2306, 2307]
2363 | 4.0 | vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4| [2363]
2362 | 4.0 | vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8| [2362]
2361 | 4.0 | vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12| [2361]
2346 | 4.0 | vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5| [2346]
2344 | 4.0 | vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9| [2344]
2340 | 4.0 | vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13| [2340]
2321 | 4.0 | vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6| [2321]
2319 | 4.0 | vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10| [2319]
2315 | 4.0 | vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15| [2315]
2296 | 4.0 | vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7| [2296]
2294 | 4.0 | vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11| [2294]
2290 | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14| [2290]
2330 | 3.0 | shlb $6, %dil | [2330, 2331, 2332]
2364 | 1.0 | incq %rbx | [2364]

View File

@@ -0,0 +1,162 @@
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
Analyzed File - gromacs-icx-avx512-sp.o
Binary Format - 64Bit
Architecture - SKX
Analysis Type - Throughput
Throughput Analysis Report
--------------------------
Block Throughput: 64.00 Cycles Throughput Bottleneck: Backend
Loop Count: 22
Port Binding In Cycles Per Iteration:
--------------------------------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
--------------------------------------------------------------------------------------------------
| Cycles | 50.0 0.0 | 7.0 | 9.5 8.1 | 9.5 7.9 | 3.0 | 50.0 | 7.0 | 0.0 |
--------------------------------------------------------------------------------------------------
DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3)
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion occurred
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
X - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
-----------------------------------------------------------------------------------------
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | movsxd rax, dword ptr [r11+rdx*4]
| 1* | | | | | | | | | mov rsi, rax
| 1 | | | | | | | 1.0 | | shl rsi, 0x5
| 1 | | 1.0 | | | | | | | lea rbx, ptr [rsi+rsi*2]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm15, zmmword ptr [rdi+rbx*1]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm16, zmmword ptr [rdi+rbx*1+0x20]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm27, zmmword ptr [rdi+rbx*1+0x40]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x80]
| 1 | | | | | | 1.0 | | | vsubps zmm24, zmm1, zmm15
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x140]
| 1 | 1.0 | | | | | | | | vsubps zmm25, zmm1, zmm16
| 1 | | | | | | 1.0 | | | vsubps zmm26, zmm9, zmm27
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp]
| 1 | 1.0 | | | | | | | | vsubps zmm21, zmm1, zmm15
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x100]
| 1 | | | | | | 1.0 | | | vsubps zmm22, zmm1, zmm16
| 1 | 1.0 | | | | | | | | vsubps zmm23, zmm10, zmm27
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x1c0]
| 1 | | | | | | 1.0 | | | vsubps zmm17, zmm1, zmm15
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0xc0]
| 1 | 1.0 | | | | | | | | vsubps zmm19, zmm1, zmm16
| 1 | | | | | | 1.0 | | | vsubps zmm20, zmm11, zmm27
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x180]
| 1 | 1.0 | | | | | | | | vsubps zmm18, zmm1, zmm15
| 1 | | | | | | 1.0 | | | vsubps zmm16, zmm8, zmm16
| 1 | 1.0 | | | | | | | | vsubps zmm15, zmm12, zmm27
| 1 | | | | | | 1.0 | | | vmulps zmm27, zmm26, zmm26
| 1 | 1.0 | | | | | | | | vfmadd231ps zmm27, zmm25, zmm25
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm27, zmm24, zmm24
| 1 | 1.0 | | | | | | | | vmulps zmm28, zmm23, zmm23
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm28, zmm22, zmm22
| 1 | 1.0 | | | | | | | | vfmadd231ps zmm28, zmm21, zmm21
| 1 | | | | | | 1.0 | | | vmulps zmm29, zmm20, zmm20
| 1 | 1.0 | | | | | | | | vfmadd231ps zmm29, zmm19, zmm19
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm29, zmm17, zmm17
| 1 | 1.0 | | | | | | | | vmulps zmm30, zmm15, zmm15
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm30, zmm16, zmm16
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm31, zmm27
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm1, zmm28
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm2, zmm29
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm30, zmm18, zmm18
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm3, zmm30
| 1 | | | | | | 1.0 | | | vmulps zmm4, zmm6, zmm31
| 1 | | | | | | 1.0 | | | vmulps zmm4, zmm31, zmm4
| 1 | | | | | | 1.0 | | | vmulps zmm4, zmm31, zmm4
| 1 | 1.0 | | | | | | | | vaddps zmm5, zmm4, zmm13
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm7, zmm31
| 1 | 1.0 | | | | | | | | vmulps zmm5, zmm31, zmm5
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm6, zmm1
| 1 | 1.0 | | | | | | | | vmulps zmm31, zmm1, zmm31
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm1, zmm31
| 1 | 1.0 | | | | | | | | vmulps zmm4, zmm4, zmm5
| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm31, zmm13
| 1 | 1.0 | | | | | | | | vmulps zmm1, zmm7, zmm1
| 1 | | | | | | 1.0 | | | vmulps zmm1, zmm1, zmm5
| 1 | 1.0 | | | | | | | | vmulps zmm5, zmm6, zmm2
| 1 | | | | | | 1.0 | | | vmulps zmm5, zmm2, zmm5
| 1 | 1.0 | | | | | | | | vmulps zmm5, zmm2, zmm5
| 1 | | | | | | 1.0 | | | vmulps zmm1, zmm31, zmm1
| 1 | 1.0 | | | | | | | | vaddps zmm31, zmm5, zmm13
| 1 | | | | | | 1.0 | | | vmulps zmm2, zmm7, zmm2
| 1 | 1.0 | | | | | | | | vmulps zmm2, zmm2, zmm31
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm6, zmm3
| 1 | 1.0 | | | | | | | | vmulps zmm31, zmm3, zmm31
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm3, zmm31
| 1 | 1.0 | | | | | | | | vmulps zmm2, zmm5, zmm2
| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm31, zmm13
| 1 | 1.0 | | | | | | | | vmulps zmm3, zmm7, zmm3
| 1 | | | | | | 1.0 | | | vmulps zmm3, zmm3, zmm5
| 1 | 1.0 | | | | | | | | vmulps zmm3, zmm31, zmm3
| 1* | | | | | | | | | xor esi, esi
| 1* | | | | | | | | | xor edi, edi
| 1 | | 1.0 | | | | | | | test eax, 0x7fffffff
| 1 | | | | | | | 1.0 | | setz sil
| 1 | | | | | | | 1.0 | | setnz dil
| 1 | | 1.0 | | | | | | | mov eax, 0xff
| 1 | | | | | | | 1.0 | | cmovz eax, r8d
| 1 | | 1.0 | | | | | | | mov ecx, 0xff
| 1 | | | | | | | 1.0 | | cmovz ecx, r9d
| 1 | | 1.0 | | | | | | | xor esi, 0xff
| 1 | | | | | | 1.0 | | | kmovd k1, esi
| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm27, zmm0, 0x1
| 1 | 1.0 | | | | | | | | vmulps zmm4, zmm4, zmm14
| 1 | 1.0 | | | | | | | | vmulps zmm5{k1}{z}, zmm24, zmm4
| 1 | | | | | | 1.0 | | | vmulps zmm24{k1}{z}, zmm25, zmm4
| 1 | 1.0 | | | | | | | | vmulps zmm4{k1}{z}, zmm26, zmm4
| 1 | | 1.0 | | | | | | | lea esi, ptr [rdi+rdi*2]
| 1 | | | | | | | 1.0 | | or esi, 0xfc
| 1 | | | | | | 1.0 | | | kmovd k1, esi
| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm28, zmm0, 0x1
| 1 | 1.0 | | | | | | | | vmulps zmm1, zmm1, zmm14
| 1 | 1.0 | | | | | | | | vmulps zmm21{k1}{z}, zmm21, zmm1
| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm5, zmm21
| 1 | 1.0 | | | | | | | | vmulps zmm21{k1}{z}, zmm22, zmm1
| 1 | | | | | | 1.0 | | | vaddps zmm21, zmm24, zmm21
| 1 | 1.0 | | | | | | | | vmulps zmm1{k1}{z}, zmm23, zmm1
| 1 | | | | | | 1.0 | | | vaddps zmm1, zmm4, zmm1
| 1 | | | | | | 1.0 | | | kmovd k1, eax
| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm29, zmm0, 0x1
| 1 | 1.0 | | | | | | | | vmulps zmm2, zmm2, zmm14
| 1 | 1.0 | | | | | | | | vmulps zmm4{k1}{z}, zmm17, zmm2
| 1 | 1.0 | | | | | | | | vmulps zmm17{k1}{z}, zmm19, zmm2
| 1 | | | | | | 1.0 | | | vmulps zmm2{k1}{z}, zmm20, zmm2
| 1 | | | | | | 1.0 | | | kmovd k1, ecx
| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm30, zmm0, 0x1
| 1 | 1.0 | | | | | | | | vmulps zmm3, zmm3, zmm14
| 1 | 1.0 | | | | | | | | vmulps zmm18{k1}{z}, zmm18, zmm3
| 1 | 1.0 | | | | | | | | vaddps zmm4, zmm4, zmm18
| 1 | | | | | | 1.0 | | | vaddps zmm4, zmm5, zmm4
| 1 | 1.0 | | | | | | | | vmulps zmm5{k1}{z}, zmm16, zmm3
| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm17, zmm5
| 1 | 1.0 | | | | | | | | vaddps zmm5, zmm21, zmm5
| 1 | | | | | | 1.0 | | | vmulps zmm3{k1}{z}, zmm15, zmm3
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov rax, qword ptr [r15+0xb0]
| 1 | 1.0 | | | | | | | | vaddps zmm2, zmm2, zmm3
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm3, zmmword ptr [rax+rbx*1]
| 1 | | | | | | 1.0 | | | vsubps zmm3, zmm3, zmm4
| 2 | | | 0.5 | 0.5 | 1.0 | | | | vmovups zmmword ptr [rax+rbx*1], zmm3
| 1 | 1.0 | | | | | | | | vaddps zmm1, zmm1, zmm2
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm2, zmmword ptr [rax+rbx*1+0x20]
| 1 | | | | | | 1.0 | | | vsubps zmm2, zmm2, zmm5
| 2 | | | 0.5 | 0.5 | 1.0 | | | | vmovups zmmword ptr [rax+rbx*1+0x20], zmm2
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm2, zmmword ptr [rax+rbx*1+0x40]
| 1 | 1.0 | | | | | | | | vsubps zmm1, zmm2, zmm1
| 2 | | | 0.5 | 0.5 | 1.0 | | | | vmovups zmmword ptr [rax+rbx*1+0x40], zmm1
| 1* | | | | | | | | | cmp r10, rdx
| 0*F | | | | | | | | | jz 0x34
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov rdi, qword ptr [r15+0xa0]
| 1 | | 1.0 | | | | | | | inc rdx
| 1 | | | | | | | 1.0 | | jmp 0xfffffffffffffcfc
Total Num Of Uops: 140
Analysis Notes:
Backend allocation was stalled due to unavailable allocation resources.

View File

@@ -0,0 +1,304 @@
[0] Code Region
Iterations: 100
Instructions: 13000
Total Cycles: 5640
Total uOps: 15400
Dispatch Width: 6
uOps Per Cycle: 2.73
IPC: 2.30
Block RThroughput: 40.0
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 5 0.50 * movslq (%r11,%rdx,4), %rax
1 1 0.25 movq %rax, %rsi
1 1 0.50 shlq $5, %rsi
1 1 0.50 leaq (%rsi,%rsi,2), %rbx
2 8 0.50 * vmovups (%rdi,%rbx), %zmm15
2 8 0.50 * vmovups 32(%rdi,%rbx), %zmm16
2 8 0.50 * vmovups 64(%rdi,%rbx), %zmm27
2 8 0.50 * vmovups 128(%rsp), %zmm1
1 4 0.50 vsubps %zmm15, %zmm1, %zmm24
2 8 0.50 * vmovups 320(%rsp), %zmm1
1 4 0.50 vsubps %zmm16, %zmm1, %zmm25
1 4 0.50 vsubps %zmm27, %zmm9, %zmm26
2 8 0.50 * vmovups (%rsp), %zmm1
1 4 0.50 vsubps %zmm15, %zmm1, %zmm21
2 8 0.50 * vmovups 256(%rsp), %zmm1
1 4 0.50 vsubps %zmm16, %zmm1, %zmm22
1 4 0.50 vsubps %zmm27, %zmm10, %zmm23
2 8 0.50 * vmovups 448(%rsp), %zmm1
1 4 0.50 vsubps %zmm15, %zmm1, %zmm17
2 8 0.50 * vmovups 192(%rsp), %zmm1
1 4 0.50 vsubps %zmm16, %zmm1, %zmm19
1 4 0.50 vsubps %zmm27, %zmm11, %zmm20
2 8 0.50 * vmovups 384(%rsp), %zmm1
1 4 0.50 vsubps %zmm15, %zmm1, %zmm18
1 4 0.50 vsubps %zmm16, %zmm8, %zmm16
1 4 0.50 vsubps %zmm27, %zmm12, %zmm15
1 4 0.50 vmulps %zmm26, %zmm26, %zmm27
1 4 0.50 vfmadd231ps %zmm25, %zmm25, %zmm27
1 4 0.50 vfmadd231ps %zmm24, %zmm24, %zmm27
1 4 0.50 vmulps %zmm23, %zmm23, %zmm28
1 4 0.50 vfmadd231ps %zmm22, %zmm22, %zmm28
1 4 0.50 vfmadd231ps %zmm21, %zmm21, %zmm28
1 4 0.50 vmulps %zmm20, %zmm20, %zmm29
1 4 0.50 vfmadd231ps %zmm19, %zmm19, %zmm29
1 4 0.50 vfmadd231ps %zmm17, %zmm17, %zmm29
1 4 0.50 vmulps %zmm15, %zmm15, %zmm30
1 4 0.50 vfmadd231ps %zmm16, %zmm16, %zmm30
3 4 2.00 vrcp14ps %zmm27, %zmm31
3 4 2.00 vrcp14ps %zmm28, %zmm1
3 4 2.00 vrcp14ps %zmm29, %zmm2
1 4 0.50 vfmadd231ps %zmm18, %zmm18, %zmm30
3 4 2.00 vrcp14ps %zmm30, %zmm3
1 4 0.50 vmulps %zmm31, %zmm6, %zmm4
1 4 0.50 vmulps %zmm4, %zmm31, %zmm4
1 4 0.50 vmulps %zmm4, %zmm31, %zmm4
1 4 0.50 vaddps %zmm13, %zmm4, %zmm5
1 4 0.50 vmulps %zmm31, %zmm7, %zmm31
1 4 0.50 vmulps %zmm5, %zmm31, %zmm5
1 4 0.50 vmulps %zmm1, %zmm6, %zmm31
1 4 0.50 vmulps %zmm31, %zmm1, %zmm31
1 4 0.50 vmulps %zmm31, %zmm1, %zmm31
1 4 0.50 vmulps %zmm5, %zmm4, %zmm4
1 4 0.50 vaddps %zmm13, %zmm31, %zmm5
1 4 0.50 vmulps %zmm1, %zmm7, %zmm1
1 4 0.50 vmulps %zmm5, %zmm1, %zmm1
1 4 0.50 vmulps %zmm2, %zmm6, %zmm5
1 4 0.50 vmulps %zmm5, %zmm2, %zmm5
1 4 0.50 vmulps %zmm5, %zmm2, %zmm5
1 4 0.50 vmulps %zmm1, %zmm31, %zmm1
1 4 0.50 vaddps %zmm13, %zmm5, %zmm31
1 4 0.50 vmulps %zmm2, %zmm7, %zmm2
1 4 0.50 vmulps %zmm31, %zmm2, %zmm2
1 4 0.50 vmulps %zmm3, %zmm6, %zmm31
1 4 0.50 vmulps %zmm31, %zmm3, %zmm31
1 4 0.50 vmulps %zmm31, %zmm3, %zmm31
1 4 0.50 vmulps %zmm2, %zmm5, %zmm2
1 4 0.50 vaddps %zmm13, %zmm31, %zmm5
1 4 0.50 vmulps %zmm3, %zmm7, %zmm3
1 4 0.50 vmulps %zmm5, %zmm3, %zmm3
1 4 0.50 vmulps %zmm3, %zmm31, %zmm3
1 0 0.17 xorl %esi, %esi
1 0 0.17 xorl %edi, %edi
1 1 0.25 testl $2147483647, %eax
1 1 0.50 sete %sil
1 1 0.50 setne %dil
1 1 0.25 movl $255, %eax
1 1 0.50 cmovel %r8d, %eax
1 1 0.25 movl $255, %ecx
1 1 0.50 cmovel %r9d, %ecx
1 1 0.25 xorl $255, %esi
1 1 1.00 kmovd %esi, %k1
1 4 1.00 vcmpltps %zmm0, %zmm27, %k1 {%k1}
1 4 0.50 vmulps %zmm14, %zmm4, %zmm4
1 4 0.50 vmulps %zmm4, %zmm24, %zmm5 {%k1} {z}
1 4 0.50 vmulps %zmm4, %zmm25, %zmm24 {%k1} {z}
1 4 0.50 vmulps %zmm4, %zmm26, %zmm4 {%k1} {z}
1 1 0.50 leal (%rdi,%rdi,2), %esi
1 1 0.25 orl $252, %esi
1 1 1.00 kmovd %esi, %k1
1 4 1.00 vcmpltps %zmm0, %zmm28, %k1 {%k1}
1 4 0.50 vmulps %zmm14, %zmm1, %zmm1
1 4 0.50 vmulps %zmm1, %zmm21, %zmm21 {%k1} {z}
1 4 0.50 vaddps %zmm21, %zmm5, %zmm5
1 4 0.50 vmulps %zmm1, %zmm22, %zmm21 {%k1} {z}
1 4 0.50 vaddps %zmm21, %zmm24, %zmm21
1 4 0.50 vmulps %zmm1, %zmm23, %zmm1 {%k1} {z}
1 4 0.50 vaddps %zmm1, %zmm4, %zmm1
1 1 1.00 kmovd %eax, %k1
1 4 1.00 vcmpltps %zmm0, %zmm29, %k1 {%k1}
1 4 0.50 vmulps %zmm14, %zmm2, %zmm2
1 4 0.50 vmulps %zmm2, %zmm17, %zmm4 {%k1} {z}
1 4 0.50 vmulps %zmm2, %zmm19, %zmm17 {%k1} {z}
1 4 0.50 vmulps %zmm2, %zmm20, %zmm2 {%k1} {z}
1 1 1.00 kmovd %ecx, %k1
1 4 1.00 vcmpltps %zmm0, %zmm30, %k1 {%k1}
1 4 0.50 vmulps %zmm14, %zmm3, %zmm3
1 4 0.50 vmulps %zmm3, %zmm18, %zmm18 {%k1} {z}
1 4 0.50 vaddps %zmm18, %zmm4, %zmm4
1 4 0.50 vaddps %zmm4, %zmm5, %zmm4
1 4 0.50 vmulps %zmm3, %zmm16, %zmm5 {%k1} {z}
1 4 0.50 vaddps %zmm5, %zmm17, %zmm5
1 4 0.50 vaddps %zmm5, %zmm21, %zmm5
1 4 0.50 vmulps %zmm3, %zmm15, %zmm3 {%k1} {z}
1 5 0.50 * movq 176(%r15), %rax
1 4 0.50 vaddps %zmm3, %zmm2, %zmm2
2 8 0.50 * vmovups (%rax,%rbx), %zmm3
1 4 0.50 vsubps %zmm4, %zmm3, %zmm3
2 1 1.00 * vmovups %zmm3, (%rax,%rbx)
1 4 0.50 vaddps %zmm2, %zmm1, %zmm1
2 8 0.50 * vmovups 32(%rax,%rbx), %zmm2
1 4 0.50 vsubps %zmm5, %zmm2, %zmm2
2 1 1.00 * vmovups %zmm2, 32(%rax,%rbx)
2 8 0.50 * vmovups 64(%rax,%rbx), %zmm2
1 4 0.50 vsubps %zmm1, %zmm2, %zmm1
2 1 1.00 * vmovups %zmm1, 64(%rax,%rbx)
1 1 0.25 cmpq %rdx, %r10
1 1 0.50 je .LBB4_18
1 5 0.50 * movq 160(%r15), %rdi
1 1 0.25 incq %rdx
1 1 0.50 jmp .LBB4_8
Resources:
[0] - SKXDivider
[1] - SKXFPDivider
[2] - SKXPort0
[3] - SKXPort1
[4] - SKXPort2
[5] - SKXPort3
[6] - SKXPort4
[7] - SKXPort5
[8] - SKXPort6
[9] - SKXPort7
Resource pressure per iteration:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
- - 52.01 14.97 8.49 8.51 3.00 52.02 11.00 2.00
Resource pressure by instruction:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
- - - - 0.49 0.51 - - - - movslq (%r11,%rdx,4), %rax
- - - - - - - - 1.00 - movq %rax, %rsi
- - - - - - - - 1.00 - shlq $5, %rsi
- - - 1.00 - - - - - - leaq (%rsi,%rsi,2), %rbx
- - 0.01 0.99 0.50 0.50 - - - - vmovups (%rdi,%rbx), %zmm15
- - - - 0.50 0.50 - 1.00 - - vmovups 32(%rdi,%rbx), %zmm16
- - - 1.00 0.50 0.50 - - - - vmovups 64(%rdi,%rbx), %zmm27
- - - 0.99 0.51 0.49 - 0.01 - - vmovups 128(%rsp), %zmm1
- - 0.01 - - - - 0.99 - - vsubps %zmm15, %zmm1, %zmm24
- - - 1.00 0.49 0.51 - - - - vmovups 320(%rsp), %zmm1
- - 0.99 - - - - 0.01 - - vsubps %zmm16, %zmm1, %zmm25
- - 0.01 - - - - 0.99 - - vsubps %zmm27, %zmm9, %zmm26
- - 0.01 0.99 0.51 0.49 - - - - vmovups (%rsp), %zmm1
- - - - - - - 1.00 - - vsubps %zmm15, %zmm1, %zmm21
- - - - 0.49 0.51 - 1.00 - - vmovups 256(%rsp), %zmm1
- - 0.01 - - - - 0.99 - - vsubps %zmm16, %zmm1, %zmm22
- - 0.99 - - - - 0.01 - - vsubps %zmm27, %zmm10, %zmm23
- - - 1.00 0.51 0.49 - - - - vmovups 448(%rsp), %zmm1
- - 0.01 - - - - 0.99 - - vsubps %zmm15, %zmm1, %zmm17
- - 0.01 - 0.49 0.51 - 0.99 - - vmovups 192(%rsp), %zmm1
- - - - - - - 1.00 - - vsubps %zmm16, %zmm1, %zmm19
- - 0.01 - - - - 0.99 - - vsubps %zmm27, %zmm11, %zmm20
- - 0.99 - 0.50 0.50 - 0.01 - - vmovups 384(%rsp), %zmm1
- - - - - - - 1.00 - - vsubps %zmm15, %zmm1, %zmm18
- - 0.01 - - - - 0.99 - - vsubps %zmm16, %zmm8, %zmm16
- - - - - - - 1.00 - - vsubps %zmm27, %zmm12, %zmm15
- - 1.00 - - - - - - - vmulps %zmm26, %zmm26, %zmm27
- - 1.00 - - - - - - - vfmadd231ps %zmm25, %zmm25, %zmm27
- - 0.99 - - - - 0.01 - - vfmadd231ps %zmm24, %zmm24, %zmm27
- - - - - - - 1.00 - - vmulps %zmm23, %zmm23, %zmm28
- - - - - - - 1.00 - - vfmadd231ps %zmm22, %zmm22, %zmm28
- - 0.01 - - - - 0.99 - - vfmadd231ps %zmm21, %zmm21, %zmm28
- - 0.01 - - - - 0.99 - - vmulps %zmm20, %zmm20, %zmm29
- - 0.01 - - - - 0.99 - - vfmadd231ps %zmm19, %zmm19, %zmm29
- - - - - - - 1.00 - - vfmadd231ps %zmm17, %zmm17, %zmm29
- - 0.01 - - - - 0.99 - - vmulps %zmm15, %zmm15, %zmm30
- - 0.01 - - - - 0.99 - - vfmadd231ps %zmm16, %zmm16, %zmm30
- - 2.00 - - - - 1.00 - - vrcp14ps %zmm27, %zmm31
- - 2.00 - - - - 1.00 - - vrcp14ps %zmm28, %zmm1
- - 2.00 - - - - 1.00 - - vrcp14ps %zmm29, %zmm2
- - 1.00 - - - - - - - vfmadd231ps %zmm18, %zmm18, %zmm30
- - 2.00 - - - - 1.00 - - vrcp14ps %zmm30, %zmm3
- - - - - - - 1.00 - - vmulps %zmm31, %zmm6, %zmm4
- - 1.00 - - - - - - - vmulps %zmm4, %zmm31, %zmm4
- - 1.00 - - - - - - - vmulps %zmm4, %zmm31, %zmm4
- - 1.00 - - - - - - - vaddps %zmm13, %zmm4, %zmm5
- - - - - - - 1.00 - - vmulps %zmm31, %zmm7, %zmm31
- - 1.00 - - - - - - - vmulps %zmm5, %zmm31, %zmm5
- - 1.00 - - - - - - - vmulps %zmm1, %zmm6, %zmm31
- - 1.00 - - - - - - - vmulps %zmm31, %zmm1, %zmm31
- - 0.99 - - - - 0.01 - - vmulps %zmm31, %zmm1, %zmm31
- - 1.00 - - - - - - - vmulps %zmm5, %zmm4, %zmm4
- - - - - - - 1.00 - - vaddps %zmm13, %zmm31, %zmm5
- - - - - - - 1.00 - - vmulps %zmm1, %zmm7, %zmm1
- - - - - - - 1.00 - - vmulps %zmm5, %zmm1, %zmm1
- - - - - - - 1.00 - - vmulps %zmm2, %zmm6, %zmm5
- - - - - - - 1.00 - - vmulps %zmm5, %zmm2, %zmm5
- - - - - - - 1.00 - - vmulps %zmm5, %zmm2, %zmm5
- - 0.99 - - - - 0.01 - - vmulps %zmm1, %zmm31, %zmm1
- - - - - - - 1.00 - - vaddps %zmm13, %zmm5, %zmm31
- - 1.00 - - - - - - - vmulps %zmm2, %zmm7, %zmm2
- - - - - - - 1.00 - - vmulps %zmm31, %zmm2, %zmm2
- - - - - - - 1.00 - - vmulps %zmm3, %zmm6, %zmm31
- - 1.00 - - - - - - - vmulps %zmm31, %zmm3, %zmm31
- - 1.00 - - - - - - - vmulps %zmm31, %zmm3, %zmm31
- - 1.00 - - - - - - - vmulps %zmm2, %zmm5, %zmm2
- - 1.00 - - - - - - - vaddps %zmm13, %zmm31, %zmm5
- - 1.00 - - - - - - - vmulps %zmm3, %zmm7, %zmm3
- - 1.00 - - - - - - - vmulps %zmm5, %zmm3, %zmm3
- - 1.00 - - - - - - - vmulps %zmm3, %zmm31, %zmm3
- - - - - - - - - - xorl %esi, %esi
- - - - - - - - - - xorl %edi, %edi
- - - - - - - - 1.00 - testl $2147483647, %eax
- - - - - - - - 1.00 - sete %sil
- - - - - - - - 1.00 - setne %dil
- - - 1.00 - - - - - - movl $255, %eax
- - - - - - - - 1.00 - cmovel %r8d, %eax
- - - 1.00 - - - - - - movl $255, %ecx
- - - - - - - - 1.00 - cmovel %r9d, %ecx
- - - 1.00 - - - - - - xorl $255, %esi
- - - - - - - 1.00 - - kmovd %esi, %k1
- - - - - - - 1.00 - - vcmpltps %zmm0, %zmm27, %k1 {%k1}
- - - - - - - 1.00 - - vmulps %zmm14, %zmm4, %zmm4
- - 1.00 - - - - - - - vmulps %zmm4, %zmm24, %zmm5 {%k1} {z}
- - - - - - - 1.00 - - vmulps %zmm4, %zmm25, %zmm24 {%k1} {z}
- - 1.00 - - - - - - - vmulps %zmm4, %zmm26, %zmm4 {%k1} {z}
- - - 1.00 - - - - - - leal (%rdi,%rdi,2), %esi
- - - - - - - - 1.00 - orl $252, %esi
- - - - - - - 1.00 - - kmovd %esi, %k1
- - - - - - - 1.00 - - vcmpltps %zmm0, %zmm28, %k1 {%k1}
- - 0.99 - - - - 0.01 - - vmulps %zmm14, %zmm1, %zmm1
- - 0.99 - - - - 0.01 - - vmulps %zmm1, %zmm21, %zmm21 {%k1} {z}
- - 1.00 - - - - - - - vaddps %zmm21, %zmm5, %zmm5
- - 0.01 - - - - 0.99 - - vmulps %zmm1, %zmm22, %zmm21 {%k1} {z}
- - - - - - - 1.00 - - vaddps %zmm21, %zmm24, %zmm21
- - 0.99 - - - - 0.01 - - vmulps %zmm1, %zmm23, %zmm1 {%k1} {z}
- - 1.00 - - - - - - - vaddps %zmm1, %zmm4, %zmm1
- - - - - - - 1.00 - - kmovd %eax, %k1
- - - - - - - 1.00 - - vcmpltps %zmm0, %zmm29, %k1 {%k1}
- - - - - - - 1.00 - - vmulps %zmm14, %zmm2, %zmm2
- - 1.00 - - - - - - - vmulps %zmm2, %zmm17, %zmm4 {%k1} {z}
- - - - - - - 1.00 - - vmulps %zmm2, %zmm19, %zmm17 {%k1} {z}
- - 1.00 - - - - - - - vmulps %zmm2, %zmm20, %zmm2 {%k1} {z}
- - - - - - - 1.00 - - kmovd %ecx, %k1
- - - - - - - 1.00 - - vcmpltps %zmm0, %zmm30, %k1 {%k1}
- - 1.00 - - - - - - - vmulps %zmm14, %zmm3, %zmm3
- - - - - - - 1.00 - - vmulps %zmm3, %zmm18, %zmm18 {%k1} {z}
- - 0.99 - - - - 0.01 - - vaddps %zmm18, %zmm4, %zmm4
- - 1.00 - - - - - - - vaddps %zmm4, %zmm5, %zmm4
- - - - - - - 1.00 - - vmulps %zmm3, %zmm16, %zmm5 {%k1} {z}
- - 1.00 - - - - - - - vaddps %zmm5, %zmm17, %zmm5
- - 0.99 - - - - 0.01 - - vaddps %zmm5, %zmm21, %zmm5
- - 1.00 - - - - - - - vmulps %zmm3, %zmm15, %zmm3 {%k1} {z}
- - - - 1.00 - - - - - movq 176(%r15), %rax
- - 0.99 - - - - 0.01 - - vaddps %zmm3, %zmm2, %zmm2
- - - 1.00 0.50 0.50 - - - - vmovups (%rax,%rbx), %zmm3
- - 0.99 - - - - 0.01 - - vsubps %zmm4, %zmm3, %zmm3
- - - - - - 1.00 - - 1.00 vmovups %zmm3, (%rax,%rbx)
- - 1.00 - - - - - - - vaddps %zmm2, %zmm1, %zmm1
- - - 1.00 0.50 0.50 - - - - vmovups 32(%rax,%rbx), %zmm2
- - 1.00 - - - - - - - vsubps %zmm5, %zmm2, %zmm2
- - - - - - 1.00 - - 1.00 vmovups %zmm2, 32(%rax,%rbx)
- - - 1.00 0.50 0.50 - - - - vmovups 64(%rax,%rbx), %zmm2
- - 0.99 - - - - 0.01 - - vsubps %zmm1, %zmm2, %zmm1
- - - - - 1.00 1.00 - - - vmovups %zmm1, 64(%rax,%rbx)
- - - - - - - - 1.00 - cmpq %rdx, %r10
- - - - - - - - 1.00 - je .LBB4_18
- - - - 0.50 0.50 - - - - movq 160(%r15), %rdi
- - - 1.00 - - - - - - incq %rdx
- - - - - - - - 1.00 - jmp .LBB4_8

View File

@@ -0,0 +1,161 @@
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
Analyzed file: gromacs-icx-avx512-sp.s
Architecture: CSX
Timestamp: 2023-02-10 16:31:04
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
--------------------------------------------------------------------------------------------------
1662 | | | | | | | | || | | # pointer_increment=64 0f91ac4f7fe1a70d0c899f7f3e745649
1663 | | | | | | | | || | | # LLVM-MCA-BEGIN
1664 | | | | | | | | || | | .LBB4_8: # =>This Inner Loop Header: Depth=1
1665 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | movslq (%r11,%rdx,4), %rax
1666 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || 1.0 | | movq %rax, %rsi
1667 | 0.00 | | | | | | 1.00 | || 1.0 | | shlq $5, %rsi
1668 | | 1.00 | | | | 0.00 | | || 1.0 | | leaq (%rsi,%rsi,2), %rbx
1669 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%rdi,%rbx), %zmm15 # AlignMOV convert to UnAlignMOV
1670 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 32(%rdi,%rbx), %zmm16 # AlignMOV convert to UnAlignMOV
1671 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovups 64(%rdi,%rbx), %zmm27 # AlignMOV convert to UnAlignMOV
1672 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 128(%rsp), %zmm1 # 64-byte Reload
1673 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm24
1674 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 320(%rsp), %zmm1 # 64-byte Reload
1675 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm1, %zmm25
1676 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubps %zmm27, %zmm9, %zmm26
1677 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%rsp), %zmm1 # 64-byte Reload
1678 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm21
1679 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 256(%rsp), %zmm1 # 64-byte Reload
1680 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm1, %zmm22
1681 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm27, %zmm10, %zmm23
1682 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 448(%rsp), %zmm1 # 64-byte Reload
1683 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm17
1684 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 192(%rsp), %zmm1 # 64-byte Reload
1685 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm1, %zmm19
1686 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm27, %zmm11, %zmm20
1687 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 384(%rsp), %zmm1 # 64-byte Reload
1688 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm18
1689 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm8, %zmm16
1690 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm27, %zmm12, %zmm15
1691 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm26, %zmm26, %zmm27
1692 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231ps %zmm25, %zmm25, %zmm27 # zmm27 = (zmm25 * zmm25) + zmm27
1693 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231ps %zmm24, %zmm24, %zmm27 # zmm27 = (zmm24 * zmm24) + zmm27
1694 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm23, %zmm23, %zmm28
1695 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm22, %zmm22, %zmm28 # zmm28 = (zmm22 * zmm22) + zmm28
1696 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm21, %zmm21, %zmm28 # zmm28 = (zmm21 * zmm21) + zmm28
1697 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm20, %zmm20, %zmm29
1698 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm19, %zmm19, %zmm29 # zmm29 = (zmm19 * zmm19) + zmm29
1699 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm17, %zmm17, %zmm29 # zmm29 = (zmm17 * zmm17) + zmm29
1700 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm15, %zmm15, %zmm30
1701 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm16, %zmm16, %zmm30 # zmm30 = (zmm16 * zmm16) + zmm30
1702 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14ps %zmm27, %zmm31
1703 | 2.50 | | | | | 0.50 | | || | | vrcp14ps %zmm28, %zmm1
1704 | 2.50 | | | | | 0.50 | | || | | vrcp14ps %zmm29, %zmm2
1705 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm18, %zmm18, %zmm30 # zmm30 = (zmm18 * zmm18) + zmm30
1706 | 2.50 | | | | | 0.50 | | || | | vrcp14ps %zmm30, %zmm3
1707 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm31, %zmm6, %zmm4
1708 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm4, %zmm31, %zmm4
1709 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm4, %zmm31, %zmm4
1710 | 0.50 | | | | | 0.50 | | || 4.0 | | vaddps %zmm13, %zmm4, %zmm5
1711 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm7, %zmm31
1712 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm5, %zmm31, %zmm5
1713 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm6, %zmm31
1714 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm1, %zmm31
1715 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm1, %zmm31
1716 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm5, %zmm4, %zmm4
1717 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm13, %zmm31, %zmm5
1718 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm7, %zmm1
1719 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm5, %zmm1, %zmm1
1720 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm6, %zmm5
1721 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm5, %zmm2, %zmm5
1722 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm5, %zmm2, %zmm5
1723 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm31, %zmm1
1724 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm13, %zmm5, %zmm31
1725 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm7, %zmm2
1726 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm2, %zmm2
1727 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm3, %zmm6, %zmm31
1728 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm3, %zmm31
1729 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm3, %zmm31
1730 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm5, %zmm2
1731 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm13, %zmm31, %zmm5
1732 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm3, %zmm7, %zmm3
1733 | 1.00 | | | | | 0.00 | | || | | vmulps %zmm5, %zmm3, %zmm3
1734 | 1.00 | | | | | 0.00 | | || | | vmulps %zmm3, %zmm31, %zmm3
1735 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | xorl %esi, %esi
1736 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | xorl %edi, %edi
1737 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | testl $2147483647, %eax # imm = 0x7FFFFFFF
1738 | 0.00 | | | | | | 1.00 | || | | sete %sil
1739 | 0.00 | | | | | | 1.00 | || | | setne %dil
1740 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl $255, %eax
1741 | 0.00 | | | | | | 1.00 | || | | cmovel %r8d, %eax
1742 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl $255, %ecx
1743 | 0.00 | | | | | | 1.00 | || | | cmovel %r9d, %ecx
1744 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | | xorl $255, %esi
1745 | 1.00 | | | | | | | || | | kmovd %esi, %k1
1746 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm27, %k1 {%k1}
1747 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm14, %zmm4, %zmm4
1748 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm4, %zmm24, %zmm5 {%k1} {z}
1749 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm4, %zmm25, %zmm24 {%k1} {z}
1750 | 0.25 | | | | | 0.75 | | || | | vmulps %zmm4, %zmm26, %zmm4 {%k1} {z}
1751 | | 1.00 | | | | 0.00 | | || | | leal (%rdi,%rdi,2), %esi
1752 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | orl $252, %esi
1753 | 1.00 | | | | | | | || | | kmovd %esi, %k1
1754 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm28, %k1 {%k1}
1755 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm14, %zmm1, %zmm1
1756 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm21, %zmm21 {%k1} {z}
1757 | 0.50 | | | | | 0.50 | | || 4.0 | | vaddps %zmm21, %zmm5, %zmm5
1758 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm22, %zmm21 {%k1} {z}
1759 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm21, %zmm24, %zmm21
1760 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm23, %zmm1 {%k1} {z}
1761 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm1, %zmm4, %zmm1
1762 | 1.00 | | | | | | | || | | kmovd %eax, %k1
1763 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm29, %k1 {%k1}
1764 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm14, %zmm2, %zmm2
1765 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm17, %zmm4 {%k1} {z}
1766 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm19, %zmm17 {%k1} {z}
1767 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm20, %zmm2 {%k1} {z}
1768 | 1.00 | | | | | | | || | | kmovd %ecx, %k1
1769 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm30, %k1 {%k1}
1770 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm14, %zmm3, %zmm3
1771 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm3, %zmm18, %zmm18 {%k1} {z}
1772 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm18, %zmm4, %zmm4
1773 | 0.25 | | | | | 0.75 | | || 4.0 | | vaddps %zmm4, %zmm5, %zmm4
1774 | 0.00 | | | | | 1.00 | | || | | vmulps %zmm3, %zmm16, %zmm5 {%k1} {z}
1775 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm5, %zmm17, %zmm5
1776 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm5, %zmm21, %zmm5
1777 | 0.00 | | | | | 1.00 | | || | | vmulps %zmm3, %zmm15, %zmm3 {%k1} {z}
1778 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | movq 176(%r15), %rax
1779 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm3, %zmm2, %zmm2
1780 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%rax,%rbx), %zmm3 # AlignMOV convert to UnAlignMOV
1781 | 0.00 | | | | | 1.00 | | || 4.0 | | vsubps %zmm4, %zmm3, %zmm3
1782 | | | 0.50 | 0.50 | 1.00 | | | || 0.0 | | vmovups %zmm3, (%rax,%rbx) # AlignMOV convert to UnAlignMOV
1783 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm2, %zmm1, %zmm1
1784 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 32(%rax,%rbx), %zmm2 # AlignMOV convert to UnAlignMOV
1785 | 0.00 | | | | | 1.00 | | || | | vsubps %zmm5, %zmm2, %zmm2
1786 | | | 0.50 | 0.50 | 1.00 | | | || | | vmovups %zmm2, 32(%rax,%rbx) # AlignMOV convert to UnAlignMOV
1787 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 64(%rax,%rbx), %zmm2 # AlignMOV convert to UnAlignMOV
1788 | 0.00 | | | | | 1.00 | | || | | vsubps %zmm1, %zmm2, %zmm1
1789 | | | 0.50 | 0.50 | 1.00 | | | || | | vmovups %zmm1, 64(%rax,%rbx) # AlignMOV convert to UnAlignMOV
1790 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rdx, %r10
1791 | | | | | | | | || | | * je .LBB4_18
1792 | | | | | | | | || | | # %bb.9: # in Loop: Header=BB4_8 Depth=1
1793 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | movq 160(%r15), %rdi
1794 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | 1.0 | incq %rdx
1795 | 0.00 | | | | | | 1.00 | || | | jmp .LBB4_8
1796 | | | | | | | | || | | # LLVM-MCA-END
50.0 9.00 9.50 8.00 9.50 8.00 3.00 50.0 9.00 79.0 1.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
1794 | 1.0 | incq %rdx | [1794]

View File

@@ -0,0 +1,88 @@
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
Analyzed File - lammps-icc-avx2.o
Binary Format - 64Bit
Architecture - SKX
Analysis Type - Throughput
Throughput Analysis Report
--------------------------
Block Throughput: 25.58 Cycles Throughput Bottleneck: Backend
Loop Count: 22
Port Binding In Cycles Per Iteration:
--------------------------------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
--------------------------------------------------------------------------------------------------
| Cycles | 13.7 8.0 | 13.6 | 5.5 5.5 | 5.5 5.5 | 0.0 | 13.7 | 7.0 | 0.0 |
--------------------------------------------------------------------------------------------------
DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3)
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion occurred
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
X - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
-----------------------------------------------------------------------------------------
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovdqu xmm0, xmmword ptr [rbx+rdx*4]
| 1 | 1.0 | | | | | | | | vmovq rcx, xmm0
| 1 | | | | | | 1.0 | | | vpunpckhqdq xmm2, xmm0, xmm0
| 1 | 1.0 | | | | | | | | vmovq r15, xmm2
| 1* | | | | | | | | | mov r8d, ecx
| 1 | | | | | | | 1.0 | | shr rcx, 0x20
| 1 | | | | | | 1.0 | | | lea r14d, ptr [rcx+rcx*2]
| 1 | | | | | | 1.0 | | | lea r8d, ptr [r8+r8*2]
| 1 | | | | | | | 1.0 | | movsxd rcx, r8d
| 1 | | | | | | | 1.0 | | movsxd r8, r14d
| 1* | | | | | | | | | mov r14d, r15d
| 1 | | | | | | | 1.0 | | shr r15, 0x20
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups xmm7, xmmword ptr [r11+rcx*8]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups xmm6, xmmword ptr [r11+r8*8]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovq xmm14, qword ptr [r11+rcx*8+0x10]
| 1 | | 0.3 | | | | 0.7 | | | lea r14d, ptr [r14+r14*2]
| 1 | | | | | | | 1.0 | | movsxd r14, r14d
| 1 | | 0.7 | | | | 0.3 | | | lea r15d, ptr [r15+r15*2]
| 1 | | | | | | | 1.0 | | movsxd r15, r15d
| 2 | | | 0.5 0.5 | 0.5 0.5 | | 1.0 | | | vmovhpd xmm15, xmm14, qword ptr [r11+r8*8+0x10]
| 2 | | 1.0 | 0.5 0.5 | 0.5 0.5 | | | | | vinsertf128 ymm1, ymm7, xmmword ptr [r11+r14*8], 0x1
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovq xmm0, qword ptr [r11+r14*8+0x10]
| 2 | | 0.3 | 0.5 0.5 | 0.5 0.5 | | 0.7 | | | vinsertf128 ymm6, ymm6, xmmword ptr [r11+r15*8], 0x1
| 2 | | | 0.5 0.5 | 0.5 0.5 | | 1.0 | | | vmovhpd xmm2, xmm0, qword ptr [r11+r15*8+0x10]
| 1 | | | | | | 1.0 | | | vunpcklpd ymm14, ymm1, ymm6
| 1 | | | | | | 1.0 | | | vunpckhpd ymm1, ymm1, ymm6
| 1 | 0.7 | 0.3 | | | | | | | vsubpd ymm6, ymm10, ymm14
| 1 | | | | | | 1.0 | | | vinsertf128 ymm7, ymm15, xmm2, 0x1
| 1 | 0.3 | 0.7 | | | | | | | vsubpd ymm2, ymm9, ymm1
| 1 | 0.7 | 0.3 | | | | | | | vsubpd ymm0, ymm8, ymm7
| 1 | 0.3 | 0.7 | | | | | | | vmulpd ymm14, ymm2, ymm2
| 1 | 0.7 | 0.3 | | | | | | | vfmadd231pd ymm14, ymm6, ymm6
| 1 | 0.3 | 0.7 | | | | | | | vfmadd231pd ymm14, ymm0, ymm0
| 1 | 0.7 | 0.3 | | | | | | | vcmppd ymm1, ymm14, ymm5, 0x1
| 1 | 0.3 | 0.7 | | | | | | | vpcmpeqd ymm7, ymm7, ymm7
| 2 | 1.0 | | | | | 1.0 | | | vptest ymm1, ymm7
| 1 | 1.0 8.0 | | | | | | | | vdivpd ymm7, ymm4, ymm14
| 2^ | | 1.0 | 0.5 0.5 | 0.5 0.5 | | | | | vmulpd ymm14, ymm7, ymmword ptr [rsp+0x60]
| 1 | | 1.0 | | | | | | | vmulpd ymm14, ymm7, ymm14
| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm15, ymm7, ymm14
| 1 | 0.3 | 0.7 | | | | | | | vfmsub213pd ymm14, ymm7, ymm3
| 2^ | 0.7 | 0.3 | 0.5 0.5 | 0.5 0.5 | | | | | vmulpd ymm7, ymm7, ymmword ptr [rsp+0x40]
| 1 | 0.3 | 0.7 | | | | | | | vmulpd ymm15, ymm15, ymm7
| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm7, ymm15, ymm14
| 1 | 0.3 | 0.7 | | | | | | | vmulpd ymm6, ymm6, ymm7
| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm2, ymm2, ymm7
| 1 | | | | | | 1.0 | | | vandpd ymm6, ymm1, ymm6
| 1 | 0.3 | 0.7 | | | | | | | vaddpd ymm13, ymm13, ymm6
| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm6, ymm0, ymm7
| 1 | | | | | | 1.0 | | | vandpd ymm0, ymm1, ymm2
| 1 | | | | | | 1.0 | | | vandpd ymm1, ymm1, ymm6
| 1 | 0.3 | 0.7 | | | | | | | vaddpd ymm12, ymm12, ymm0
| 1 | 0.7 | 0.3 | | | | | | | vaddpd ymm11, ymm11, ymm1
| 1 | | | | | | | 1.0 | | add rdx, 0x4
| 1* | | | | | | | | | cmp rdx, rsi
| 0*F | | | | | | | | | jb 0xffffffffffffff02
Total Num Of Uops: 62
Analysis Notes:
Backend allocation was stalled due to unavailable allocation resources.

View File

@@ -0,0 +1,156 @@
[0] Code Region
Iterations: 100
Instructions: 5600
Total Cycles: 2352
Total uOps: 6300
Dispatch Width: 6
uOps Per Cycle: 2.68
IPC: 2.38
Block RThroughput: 10.5
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 6 0.50 * vmovdqu (%rbx,%rdx,4), %xmm0
1 2 1.00 vmovq %xmm0, %rcx
1 1 1.00 vpunpckhqdq %xmm0, %xmm0, %xmm2
1 2 1.00 vmovq %xmm2, %r15
1 1 0.25 movl %ecx, %r8d
1 1 0.50 shrq $32, %rcx
1 1 0.50 leal (%rcx,%rcx,2), %r14d
1 1 0.50 leal (%r8,%r8,2), %r8d
1 1 0.25 movslq %r8d, %rcx
1 1 0.25 movslq %r14d, %r8
1 1 0.25 movl %r15d, %r14d
1 1 0.50 shrq $32, %r15
1 6 0.50 * vmovups (%r11,%rcx,8), %xmm7
1 6 0.50 * vmovups (%r11,%r8,8), %xmm6
1 5 0.50 * vmovq 16(%r11,%rcx,8), %xmm14
1 1 0.50 leal (%r14,%r14,2), %r14d
1 1 0.25 movslq %r14d, %r14
1 1 0.50 leal (%r15,%r15,2), %r15d
1 1 0.25 movslq %r15d, %r15
2 6 1.00 * vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15
2 7 0.50 * vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1
1 5 0.50 * vmovq 16(%r11,%r14,8), %xmm0
2 7 0.50 * vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6
2 6 1.00 * vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2
1 1 1.00 vunpcklpd %ymm6, %ymm1, %ymm14
1 1 1.00 vunpckhpd %ymm6, %ymm1, %ymm1
1 4 0.50 vsubpd %ymm14, %ymm10, %ymm6
1 3 1.00 vinsertf128 $1, %xmm2, %ymm15, %ymm7
1 4 0.50 vsubpd %ymm1, %ymm9, %ymm2
1 4 0.50 vsubpd %ymm7, %ymm8, %ymm0
1 4 0.50 vmulpd %ymm2, %ymm2, %ymm14
1 4 0.50 vfmadd231pd %ymm6, %ymm6, %ymm14
1 4 0.50 vfmadd231pd %ymm0, %ymm0, %ymm14
1 4 0.50 vcmpltpd %ymm5, %ymm14, %ymm1
1 1 0.50 vpcmpeqd %ymm7, %ymm7, %ymm7
2 3 1.00 vptest %ymm7, %ymm1
1 14 5.00 vdivpd %ymm14, %ymm4, %ymm7
2 11 0.50 * vmulpd 96(%rsp), %ymm7, %ymm14
1 4 0.50 vmulpd %ymm14, %ymm7, %ymm14
1 4 0.50 vmulpd %ymm14, %ymm7, %ymm15
1 4 0.50 vfmsub213pd %ymm3, %ymm7, %ymm14
2 11 0.50 * vmulpd 64(%rsp), %ymm7, %ymm7
1 4 0.50 vmulpd %ymm7, %ymm15, %ymm15
1 4 0.50 vmulpd %ymm14, %ymm15, %ymm7
1 4 0.50 vmulpd %ymm7, %ymm6, %ymm6
1 4 0.50 vmulpd %ymm7, %ymm2, %ymm2
1 1 0.33 vandpd %ymm6, %ymm1, %ymm6
1 4 0.50 vaddpd %ymm6, %ymm13, %ymm13
1 4 0.50 vmulpd %ymm7, %ymm0, %ymm6
1 1 0.33 vandpd %ymm2, %ymm1, %ymm0
1 1 0.33 vandpd %ymm6, %ymm1, %ymm1
1 4 0.50 vaddpd %ymm0, %ymm12, %ymm12
1 4 0.50 vaddpd %ymm1, %ymm11, %ymm11
1 1 0.25 addq $4, %rdx
1 1 0.25 cmpq %rsi, %rdx
1 1 0.50 jb ..B1.22
Resources:
[0] - SKXDivider
[1] - SKXFPDivider
[2] - SKXPort0
[3] - SKXPort1
[4] - SKXPort2
[5] - SKXPort3
[6] - SKXPort4
[7] - SKXPort5
[8] - SKXPort6
[9] - SKXPort7
Resource pressure per iteration:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
- 5.00 16.00 14.12 5.50 5.50 - 13.47 8.41 -
Resource pressure by instruction:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
- - - - 0.50 0.50 - - - - vmovdqu (%rbx,%rdx,4), %xmm0
- - 1.00 - - - - - - - vmovq %xmm0, %rcx
- - - - - - - 1.00 - - vpunpckhqdq %xmm0, %xmm0, %xmm2
- - 1.00 - - - - - - - vmovq %xmm2, %r15
- - - - - - - - 1.00 - movl %ecx, %r8d
- - 0.06 - - - - - 0.94 - shrq $32, %rcx
- - - 0.02 - - - 0.98 - - leal (%rcx,%rcx,2), %r14d
- - - 0.02 - - - 0.98 - - leal (%r8,%r8,2), %r8d
- - 0.47 0.02 - - - - 0.51 - movslq %r8d, %rcx
- - 0.46 0.02 - - - 0.01 0.51 - movslq %r14d, %r8
- - 0.03 0.01 - - - 0.45 0.51 - movl %r15d, %r14d
- - 0.51 - - - - - 0.49 - shrq $32, %r15
- - - - 0.49 0.51 - - - - vmovups (%r11,%rcx,8), %xmm7
- - - - 0.49 0.51 - - - - vmovups (%r11,%r8,8), %xmm6
- - - - 0.52 0.48 - - - - vmovq 16(%r11,%rcx,8), %xmm14
- - - 0.02 - - - 0.98 - - leal (%r14,%r14,2), %r14d
- - 0.01 0.01 - - - 0.01 0.97 - movslq %r14d, %r14
- - - 0.03 - - - 0.97 - - leal (%r15,%r15,2), %r15d
- - 0.04 - - - - - 0.96 - movslq %r15d, %r15
- - - - 0.07 0.93 - 1.00 - - vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15
- - 0.03 0.46 0.49 0.51 - 0.51 - - vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1
- - - - 0.51 0.49 - - - - vmovq 16(%r11,%r14,8), %xmm0
- - 0.47 0.02 0.93 0.07 - 0.51 - - vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6
- - - - 0.50 0.50 - 1.00 - - vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2
- - - - - - - 1.00 - - vunpcklpd %ymm6, %ymm1, %ymm14
- - - - - - - 1.00 - - vunpckhpd %ymm6, %ymm1, %ymm1
- - 0.01 0.99 - - - - - - vsubpd %ymm14, %ymm10, %ymm6
- - - - - - - 1.00 - - vinsertf128 $1, %xmm2, %ymm15, %ymm7
- - 0.96 0.04 - - - - - - vsubpd %ymm1, %ymm9, %ymm2
- - 0.49 0.51 - - - - - - vsubpd %ymm7, %ymm8, %ymm0
- - 0.48 0.52 - - - - - - vmulpd %ymm2, %ymm2, %ymm14
- - 0.03 0.97 - - - - - - vfmadd231pd %ymm6, %ymm6, %ymm14
- - 0.94 0.06 - - - - - - vfmadd231pd %ymm0, %ymm0, %ymm14
- - 0.47 0.53 - - - - - - vcmpltpd %ymm5, %ymm14, %ymm1
- - 0.96 0.04 - - - - - - vpcmpeqd %ymm7, %ymm7, %ymm7
- - 1.00 - - - - 1.00 - - vptest %ymm7, %ymm1
- 5.00 1.00 - - - - - - - vdivpd %ymm14, %ymm4, %ymm7
- - 0.93 0.07 0.49 0.51 - - - - vmulpd 96(%rsp), %ymm7, %ymm14
- - 0.05 0.95 - - - - - - vmulpd %ymm14, %ymm7, %ymm14
- - 0.02 0.98 - - - - - - vmulpd %ymm14, %ymm7, %ymm15
- - 0.98 0.02 - - - - - - vfmsub213pd %ymm3, %ymm7, %ymm14
- - 0.07 0.93 0.51 0.49 - - - - vmulpd 64(%rsp), %ymm7, %ymm7
- - 0.01 0.99 - - - - - - vmulpd %ymm7, %ymm15, %ymm15
- - 0.01 0.99 - - - - - - vmulpd %ymm14, %ymm15, %ymm7
- - 0.03 0.97 - - - - - - vmulpd %ymm7, %ymm6, %ymm6
- - 0.97 0.03 - - - - - - vmulpd %ymm7, %ymm2, %ymm2
- - 0.03 0.90 - - - 0.07 - - vandpd %ymm6, %ymm1, %ymm6
- - 0.06 0.94 - - - - - - vaddpd %ymm6, %ymm13, %ymm13
- - 0.03 0.97 - - - - - - vmulpd %ymm7, %ymm0, %ymm6
- - 0.46 0.08 - - - 0.46 - - vandpd %ymm2, %ymm1, %ymm0
- - 0.47 0.01 - - - 0.52 - - vandpd %ymm6, %ymm1, %ymm1
- - 0.48 0.52 - - - - - - vaddpd %ymm0, %ymm12, %ymm12
- - 0.52 0.48 - - - - - - vaddpd %ymm1, %ymm11, %ymm11
- - 0.01 - - - - - 0.99 - addq $4, %rdx
- - - - - - - 0.02 0.98 - cmpq %rsi, %rdx
- - 0.45 - - - - - 0.55 - jb ..B1.22

View File

@@ -0,0 +1,158 @@
[0] Code Region
Iterations: 100
Instructions: 5600
Total Cycles: 2306
Total uOps: 6300
Dispatch Width: 6
uOps Per Cycle: 2.73
IPC: 2.43
Block RThroughput: 10.5
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 6 0.50 * vmovdqu (%rbx,%rdx,4), %xmm0
1 2 1.00 vmovq %xmm0, %rcx
1 1 0.50 vpunpckhqdq %xmm0, %xmm0, %xmm2
1 2 1.00 vmovq %xmm2, %r15
1 1 0.25 movl %ecx, %r8d
1 1 0.50 shrq $32, %rcx
1 1 0.50 leal (%rcx,%rcx,2), %r14d
1 1 0.50 leal (%r8,%r8,2), %r8d
1 1 0.25 movslq %r8d, %rcx
1 1 0.25 movslq %r14d, %r8
1 1 0.25 movl %r15d, %r14d
1 1 0.50 shrq $32, %r15
1 6 0.50 * vmovups (%r11,%rcx,8), %xmm7
1 6 0.50 * vmovups (%r11,%r8,8), %xmm6
1 5 0.50 * vmovq 16(%r11,%rcx,8), %xmm14
1 1 0.50 leal (%r14,%r14,2), %r14d
1 1 0.25 movslq %r14d, %r14
1 1 0.50 leal (%r15,%r15,2), %r15d
1 1 0.25 movslq %r15d, %r15
2 6 1.00 * vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15
2 7 0.50 * vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1
1 5 0.50 * vmovq 16(%r11,%r14,8), %xmm0
2 7 0.50 * vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6
2 6 1.00 * vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2
1 1 1.00 vunpcklpd %ymm6, %ymm1, %ymm14
1 1 1.00 vunpckhpd %ymm6, %ymm1, %ymm1
1 4 0.50 vsubpd %ymm14, %ymm10, %ymm6
1 3 1.00 vinsertf128 $1, %xmm2, %ymm15, %ymm7
1 4 0.50 vsubpd %ymm1, %ymm9, %ymm2
1 4 0.50 vsubpd %ymm7, %ymm8, %ymm0
1 4 0.50 vmulpd %ymm2, %ymm2, %ymm14
1 4 0.50 vfmadd231pd %ymm6, %ymm6, %ymm14
1 4 0.50 vfmadd231pd %ymm0, %ymm0, %ymm14
1 4 0.50 vcmpltpd %ymm5, %ymm14, %ymm1
1 1 0.50 vpcmpeqd %ymm7, %ymm7, %ymm7
2 3 1.00 vptest %ymm7, %ymm1
1 14 5.00 vdivpd %ymm14, %ymm4, %ymm7
2 11 0.50 * vmulpd 96(%rsp), %ymm7, %ymm14
1 4 0.50 vmulpd %ymm14, %ymm7, %ymm14
1 4 0.50 vmulpd %ymm14, %ymm7, %ymm15
1 4 0.50 vfmsub213pd %ymm3, %ymm7, %ymm14
2 11 0.50 * vmulpd 64(%rsp), %ymm7, %ymm7
1 4 0.50 vmulpd %ymm7, %ymm15, %ymm15
1 4 0.50 vmulpd %ymm14, %ymm15, %ymm7
1 4 0.50 vmulpd %ymm7, %ymm6, %ymm6
1 4 0.50 vmulpd %ymm7, %ymm2, %ymm2
1 1 0.33 vandpd %ymm6, %ymm1, %ymm6
1 4 0.50 vaddpd %ymm6, %ymm13, %ymm13
1 4 0.50 vmulpd %ymm7, %ymm0, %ymm6
1 1 0.33 vandpd %ymm2, %ymm1, %ymm0
1 1 0.33 vandpd %ymm6, %ymm1, %ymm1
1 4 0.50 vaddpd %ymm0, %ymm12, %ymm12
1 4 0.50 vaddpd %ymm1, %ymm11, %ymm11
1 1 0.25 addq $4, %rdx
1 1 0.25 cmpq %rsi, %rdx
1 1 0.50 jb ..B1.22
Resources:
[0] - ICXDivider
[1] - ICXFPDivider
[2] - ICXPort0
[3] - ICXPort1
[4] - ICXPort2
[5] - ICXPort3
[6] - ICXPort4
[7] - ICXPort5
[8] - ICXPort6
[9] - ICXPort7
[10] - ICXPort8
[11] - ICXPort9
Resource pressure per iteration:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11]
- 5.00 15.12 15.03 5.50 5.50 - 13.45 8.40 - - -
Resource pressure by instruction:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions:
- - - - 0.50 0.50 - - - - - - vmovdqu (%rbx,%rdx,4), %xmm0
- - 1.00 - - - - - - - - - vmovq %xmm0, %rcx
- - - 0.46 - - - 0.54 - - - - vpunpckhqdq %xmm0, %xmm0, %xmm2
- - 1.00 - - - - - - - - - vmovq %xmm2, %r15
- - - - - - - - 1.00 - - - movl %ecx, %r8d
- - 0.96 - - - - - 0.04 - - - shrq $32, %rcx
- - - 0.01 - - - 0.99 - - - - leal (%rcx,%rcx,2), %r14d
- - - 0.03 - - - 0.97 - - - - leal (%r8,%r8,2), %r8d
- - 0.48 0.01 - - - - 0.51 - - - movslq %r8d, %rcx
- - 0.02 0.02 - - - 0.01 0.95 - - - movslq %r14d, %r8
- - 0.02 - - - - - 0.98 - - - movl %r15d, %r14d
- - 0.52 - - - - - 0.48 - - - shrq $32, %r15
- - - - 0.49 0.51 - - - - - - vmovups (%r11,%rcx,8), %xmm7
- - - - 0.49 0.51 - - - - - - vmovups (%r11,%r8,8), %xmm6
- - - - 0.52 0.48 - - - - - - vmovq 16(%r11,%rcx,8), %xmm14
- - - 0.47 - - - 0.53 - - - - leal (%r14,%r14,2), %r14d
- - 0.01 0.01 - - - 0.01 0.97 - - - movslq %r14d, %r14
- - - 0.04 - - - 0.96 - - - - leal (%r15,%r15,2), %r15d
- - 0.48 - - - - 0.01 0.51 - - - movslq %r15d, %r15
- - - - 0.51 0.49 - 1.00 - - - - vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15
- - 0.02 0.01 0.95 0.05 - 0.97 - - - - vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1
- - - - 0.05 0.95 - - - - - - vmovq 16(%r11,%r14,8), %xmm0
- - 0.02 0.49 0.49 0.51 - 0.49 - - - - vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6
- - - - 0.50 0.50 - 1.00 - - - - vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2
- - - - - - - 1.00 - - - - vunpcklpd %ymm6, %ymm1, %ymm14
- - - - - - - 1.00 - - - - vunpckhpd %ymm6, %ymm1, %ymm1
- - 0.47 0.53 - - - - - - - - vsubpd %ymm14, %ymm10, %ymm6
- - - - - - - 1.00 - - - - vinsertf128 $1, %xmm2, %ymm15, %ymm7
- - 0.50 0.50 - - - - - - - - vsubpd %ymm1, %ymm9, %ymm2
- - 0.94 0.06 - - - - - - - - vsubpd %ymm7, %ymm8, %ymm0
- - 0.06 0.94 - - - - - - - - vmulpd %ymm2, %ymm2, %ymm14
- - 0.04 0.96 - - - - - - - - vfmadd231pd %ymm6, %ymm6, %ymm14
- - 0.95 0.05 - - - - - - - - vfmadd231pd %ymm0, %ymm0, %ymm14
- - 0.02 0.98 - - - - - - - - vcmpltpd %ymm5, %ymm14, %ymm1
- - 0.05 0.95 - - - - - - - - vpcmpeqd %ymm7, %ymm7, %ymm7
- - 1.00 - - - - 1.00 - - - - vptest %ymm7, %ymm1
- 5.00 1.00 - - - - - - - - - vdivpd %ymm14, %ymm4, %ymm7
- - 0.51 0.49 0.49 0.51 - - - - - - vmulpd 96(%rsp), %ymm7, %ymm14
- - 0.04 0.96 - - - - - - - - vmulpd %ymm14, %ymm7, %ymm14
- - 0.01 0.99 - - - - - - - - vmulpd %ymm14, %ymm7, %ymm15
- - 0.99 0.01 - - - - - - - - vfmsub213pd %ymm3, %ymm7, %ymm14
- - 0.49 0.51 0.51 0.49 - - - - - - vmulpd 64(%rsp), %ymm7, %ymm7
- - 0.01 0.99 - - - - - - - - vmulpd %ymm7, %ymm15, %ymm15
- - 0.01 0.99 - - - - - - - - vmulpd %ymm14, %ymm15, %ymm7
- - 0.48 0.52 - - - - - - - - vmulpd %ymm7, %ymm6, %ymm6
- - 0.52 0.48 - - - - - - - - vmulpd %ymm7, %ymm2, %ymm2
- - 0.46 0.02 - - - 0.52 - - - - vandpd %ymm6, %ymm1, %ymm6
- - 0.49 0.51 - - - - - - - - vaddpd %ymm6, %ymm13, %ymm13
- - 0.48 0.52 - - - - - - - - vmulpd %ymm7, %ymm0, %ymm6
- - 0.02 0.52 - - - 0.46 - - - - vandpd %ymm2, %ymm1, %ymm0
- - 0.02 - - - - 0.98 - - - - vandpd %ymm6, %ymm1, %ymm1
- - 0.49 0.51 - - - - - - - - vaddpd %ymm0, %ymm12, %ymm12
- - 0.51 0.49 - - - - - - - - vaddpd %ymm1, %ymm11, %ymm11
- - 0.01 - - - - - 0.99 - - - addq $4, %rdx
- - 0.01 - - - - 0.01 0.98 - - - cmpq %rsi, %rdx
- - 0.01 - - - - - 0.99 - - - jb ..B1.22

View File

@@ -0,0 +1,97 @@
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
Analyzed file: lammps-icc-avx2.s
Architecture: CSX
Timestamp: 2023-02-10 16:29:58
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
----------------------------------------------------------------------------------------------------
256 | | | | | | | | || | | # pointer_increment=32 724d27eafcb27eabca1528ddfdbdba3e
257 | | | | | | | | || | | # LLVM-MCA-BEGIN
258 | | | | | | | | || | | ..B1.22: # Preds ..B1.24 ..B1.21
259 | | | | | | | | || | | # Execution count [2.50e+01]
260 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovdqu (%rbx,%rdx,4), %xmm0 #60.21
261 | 1.00 | | | | | | | || 1.0 | | vmovq %xmm0, %rcx #60.21
262 | | | | | | 1.000 | | || | | vpunpckhqdq %xmm0, %xmm0, %xmm2 #60.21
263 | 1.00 | | | | | | | || | | vmovq %xmm2, %r15 #60.21
264 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || 1.0 | | movl %ecx, %r8d #60.21
265 | 0.00 | | | | | | 1.00 | || | | shrq $32, %rcx #60.21
266 | | 0.500 | | | | 0.500 | | || | | lea (%rcx,%rcx,2), %r14d #61.36
267 | | 0.500 | | | | 0.500 | | || 1.0 | | lea (%r8,%r8,2), %r8d #61.36
268 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || 1.0 | | movslq %r8d, %rcx #61.36
269 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movslq %r14d, %r8 #61.36
270 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movl %r15d, %r14d #60.21
271 | 0.00 | | | | | | 1.00 | || | | shrq $32, %r15 #60.21
272 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovups (%r11,%rcx,8), %xmm7 #61.36
273 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%r11,%r8,8), %xmm6 #61.36
274 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovq 16(%r11,%rcx,8), %xmm14 #61.36
275 | | 0.500 | | | | 0.500 | | || | | lea (%r14,%r14,2), %r14d #61.36
276 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movslq %r14d, %r14 #61.36
277 | | 0.500 | | | | 0.500 | | || | | lea (%r15,%r15,2), %r15d #61.36
278 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movslq %r15d, %r15 #61.36
279 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || | | vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15 #61.36
280 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || 3.0 | | vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1 #61.36
281 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovq 16(%r11,%r14,8), %xmm0 #61.36
282 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || | | vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6 #61.36
283 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || | | vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2 #61.36
284 | | | | | | 1.000 | | || | | vunpcklpd %ymm6, %ymm1, %ymm14 #61.36
285 | | | | | | 1.000 | | || 1.0 | | vunpckhpd %ymm6, %ymm1, %ymm1 #61.36
286 | 0.50 | 0.500 | | | | | | || | | vsubpd %ymm14, %ymm10, %ymm6 #61.36
287 | | | | | | 1.000 | | || | | vinsertf128 $1, %xmm2, %ymm15, %ymm7 #61.36
288 | 0.50 | 0.500 | | | | | | || 4.0 | | vsubpd %ymm1, %ymm9, %ymm2 #62.36
289 | 0.50 | 0.500 | | | | | | || | | vsubpd %ymm7, %ymm8, %ymm0 #63.36
290 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm2, %ymm2, %ymm14 #64.49
291 | 0.50 | 0.500 | | | | | | || 4.0 | | vfmadd231pd %ymm6, %ymm6, %ymm14 #64.49
292 | 0.50 | 0.500 | | | | | | || 4.0 | | vfmadd231pd %ymm0, %ymm0, %ymm14 #64.63
293 | | | | | | 1.000 | | || | | vcmpltpd %ymm5, %ymm14, %ymm1 #74.22
294 | 0.50 | 0.500 | | | | | | || | | vpcmpeqd %ymm7, %ymm7, %ymm7 #74.22
295 | 1.00 | | | | | 1.000 | | || | | vptest %ymm7, %ymm1 #74.22
296 | | | | | | | | || | | #je ..B1.24 # Prob 50% #74.22
297 | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ymm14
298 | | | | | | | | || | | ..B1.23: # Preds ..B1.22
299 | | | | | | | | || | | # Execution count [1.25e+01]
300 | 1.00 8.00 | | | | | | | || 15.0 | | vdivpd %ymm14, %ymm4, %ymm7 #75.39
301 | 0.50 | 0.500 | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmulpd 96(%rsp), %ymm7, %ymm14 #76.38[spill]
302 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm14 #76.44
303 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm15 #76.50
304 | 0.50 | 0.500 | | | | | | || | | vfmsub213pd %ymm3, %ymm7, %ymm14 #77.55
305 | 0.50 | 0.500 | 0.50 0.50 | 0.50 0.50 | | | | || | | vmulpd 64(%rsp), %ymm7, %ymm7 #77.55[spill]
306 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm7, %ymm15, %ymm15 #77.64
307 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm14, %ymm15, %ymm7 #77.70
308 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm7, %ymm6, %ymm6 #78.31
309 | 0.50 | 0.500 | | | | | | || | | vmulpd %ymm7, %ymm2, %ymm2 #79.31
310 | 0.25 | 0.253 | | | | 0.493 | | || 1.0 | | vandpd %ymm6, %ymm1, %ymm6 #78.31
311 | 0.50 | 0.500 | | | | | | || 4.0 | | vaddpd %ymm6, %ymm13, %ymm13 #78.17
312 | 0.25 | 0.750 | | | | | | || | | vmulpd %ymm7, %ymm0, %ymm6 #80.31
313 | 0.16 | 0.417 | | | | 0.423 | | || | | vandpd %ymm2, %ymm1, %ymm0 #79.31
314 | 0.00 | 0.250 | | | | 0.750 | | || | | vandpd %ymm6, %ymm1, %ymm1 #80.31
315 | 0.00 | 1.000 | | | | | | || | | vaddpd %ymm0, %ymm12, %ymm12 #79.17
316 | 0.50 | 0.500 | | | | | | || | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17
317 | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm3 ymm4 ymm5 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13
318 | | | | | | | | || | | ..B1.24: # Preds ..B1.23 ..B1.22
319 | | | | | | | | || | | # Execution count [2.50e+01]
320 | 0.00 | 0.000 | | | | -0.01 | 1.00 | || | | addq $4, %rdx #59.9
321 | 0.00 | -0.01 | | | | 0.000 | 1.00 | || | | cmpq %rsi, %rdx #59.9
322 | | | | | | | | || | | * jb ..B1.22 # Prob 82% #59.9
323 | | | | | | | | || | | # LLVM-MCA-END
13.7 8.00 13.66 5.50 5.50 5.50 5.50 13.66 10.0 76.0 4.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
316 | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17| [316]
315 | 4.0 | vaddpd %ymm0, %ymm12, %ymm12 #79.17| [315]
311 | 4.0 | vaddpd %ymm6, %ymm13, %ymm13 #78.17| [311]
320 | 1.0 | addq $4, %rdx #59.9| [320]

View File

@@ -0,0 +1,97 @@
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
Analyzed file: lammps-icc-avx2.s
Architecture: ICX
Timestamp: 2023-02-10 16:29:48
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD |
-----------------------------------------------------------------------------------------------------------------------
256 | | | | | | | | | | || | | # pointer_increment=32 724d27eafcb27eabca1528ddfdbdba3e
257 | | | | | | | | | | || | | # LLVM-MCA-BEGIN
258 | | | | | | | | | | || | | ..B1.22: # Preds ..B1.24 ..B1.21
259 | | | | | | | | | | || | | # Execution count [2.50e+01]
260 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovdqu (%rbx,%rdx,4), %xmm0 #60.21
261 | 1.00 | | | | | | | | | || 1.0 | | vmovq %xmm0, %rcx #60.21
262 | | 0.50 | | | | 0.50 | | | | || | | vpunpckhqdq %xmm0, %xmm0, %xmm2 #60.21
263 | 1.00 | | | | | | | | | || | | vmovq %xmm2, %r15 #60.21
264 | 0.37 | 0.00 | | | | 0.25 | 0.38 | | | || 1.0 | | movl %ecx, %r8d #60.21
265 | 0.50 | | | | | | 0.50 | | | || | | shrq $32, %rcx #60.21
266 | 0.13 | 0.00 | | | | 0.00 | 0.87 | | | || | | lea (%rcx,%rcx,2), %r14d #61.36
267 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || 6.0 | | lea (%r8,%r8,2), %r8d #61.36
268 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || 1.0 | | movslq %r8d, %rcx #61.36
269 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movslq %r14d, %r8 #61.36
270 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movl %r15d, %r14d #60.21
271 | 0.00 | | | | | | 1.00 | | | || | | shrq $32, %r15 #60.21
272 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovups (%r11,%rcx,8), %xmm7 #61.36
273 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovups (%r11,%r8,8), %xmm6 #61.36
274 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovq 16(%r11,%rcx,8), %xmm14 #61.36
275 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | lea (%r14,%r14,2), %r14d #61.36
276 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movslq %r14d, %r14 #61.36
277 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | lea (%r15,%r15,2), %r15d #61.36
278 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movslq %r15d, %r15 #61.36
279 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || | | vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15 #61.36
280 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || 3.0 | | vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1 #61.36
281 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovq 16(%r11,%r14,8), %xmm0 #61.36
282 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || | | vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6 #61.36
283 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || | | vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2 #61.36
284 | | | | | | 1.00 | | | | || | | vunpcklpd %ymm6, %ymm1, %ymm14 #61.36
285 | | | | | | 1.00 | | | | || 1.0 | | vunpckhpd %ymm6, %ymm1, %ymm1 #61.36
286 | 0.50 | 0.50 | | | | | | | | || | | vsubpd %ymm14, %ymm10, %ymm6 #61.36
287 | | | | | | 1.00 | | | | || | | vinsertf128 $1, %xmm2, %ymm15, %ymm7 #61.36
288 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vsubpd %ymm1, %ymm9, %ymm2 #62.36
289 | 0.50 | 0.50 | | | | | | | | || | | vsubpd %ymm7, %ymm8, %ymm0 #63.36
290 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm2, %ymm2, %ymm14 #64.49
291 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vfmadd231pd %ymm6, %ymm6, %ymm14 #64.49
292 | 0.75 | 0.25 | | | | | | | | || 4.0 | | vfmadd231pd %ymm0, %ymm0, %ymm14 #64.63
293 | 0.00 | | | | | 1.00 | | | | || | | vcmpltpd %ymm5, %ymm14, %ymm1 #74.22
294 | 0.50 | 0.50 | | | | | | | | || | | vpcmpeqd %ymm7, %ymm7, %ymm7 #74.22
295 | 1.00 | | | | | 1.00 | | | | || | | vptest %ymm7, %ymm1 #74.22
296 | | | | | | | | | | || | | #je ..B1.24 # Prob 50% #74.22
297 | | | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ymm14
298 | | | | | | | | | | || | | ..B1.23: # Preds ..B1.22
299 | | | | | | | | | | || | | # Execution count [1.25e+01]
300 | 1.00 8.00 | | | | | | | | | || 13.0 | | vdivpd %ymm14, %ymm4, %ymm7 #75.39
301 | 0.50 | 0.50 | 0.50 0.50 | 0.50 0.50 | | | | | | || 4.0 | | vmulpd 96(%rsp), %ymm7, %ymm14 #76.38[spill]
302 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm14 #76.44
303 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm15 #76.50
304 | 0.50 | 0.50 | | | | | | | | || | | vfmsub213pd %ymm3, %ymm7, %ymm14 #77.55
305 | 0.50 | 0.50 | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmulpd 64(%rsp), %ymm7, %ymm7 #77.55[spill]
306 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm7, %ymm15, %ymm15 #77.64
307 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm14, %ymm15, %ymm7 #77.70
308 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm7, %ymm6, %ymm6 #78.31
309 | 0.00 | 1.00 | | | | | | | | || | | vmulpd %ymm7, %ymm2, %ymm2 #79.31
310 | 0.00 | 0.00 | | | | 1.00 | | | | || 1.0 | | vandpd %ymm6, %ymm1, %ymm6 #78.31
311 | 0.00 | 1.00 | | | | | | | | || 4.0 | | vaddpd %ymm6, %ymm13, %ymm13 #78.17
312 | 0.00 | 1.00 | | | | | | | | || | | vmulpd %ymm7, %ymm0, %ymm6 #80.31
313 | 0.00 | 0.00 | | | | 1.00 | | | | || | | vandpd %ymm2, %ymm1, %ymm0 #79.31
314 | 0.00 | 0.00 | | | | 1.00 | | | | || | | vandpd %ymm6, %ymm1, %ymm1 #80.31
315 | 0.00 | 1.00 | | | | | | | | || | | vaddpd %ymm0, %ymm12, %ymm12 #79.17
316 | 0.00 | 1.00 | | | | | | | | || | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17
317 | | | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm3 ymm4 ymm5 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13
318 | | | | | | | | | | || | | ..B1.24: # Preds ..B1.23 ..B1.22
319 | | | | | | | | | | || | | # Execution count [2.50e+01]
320 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | addq $4, %rdx #59.9
321 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | cmpq %rsi, %rdx #59.9
322 | | | | | | | | | | || | | * jb ..B1.22 # Prob 82% #59.9
323 | | | | | | | | | | || | | # LLVM-MCA-END
12.8 8.00 12.8 5.50 5.50 5.50 5.50 12.8 12.8 81 4.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
316 | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17| [316]
315 | 4.0 | vaddpd %ymm0, %ymm12, %ymm12 #79.17| [315]
311 | 4.0 | vaddpd %ymm6, %ymm13, %ymm13 #78.17| [311]
320 | 1.0 | addq $4, %rdx #59.9| [320]

View File

@@ -0,0 +1,75 @@
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
Analyzed File - lammps-icc-avx512.o
Binary Format - 64Bit
Architecture - SKX
Analysis Type - Throughput
Throughput Analysis Report
--------------------------
Block Throughput: 30.89 Cycles Throughput Bottleneck: Backend
Loop Count: 22
Port Binding In Cycles Per Iteration:
--------------------------------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
--------------------------------------------------------------------------------------------------
| Cycles | 19.0 0.0 | 4.0 | 13.0 13.0 | 13.0 13.0 | 0.0 | 17.0 | 4.0 | 0.0 |
--------------------------------------------------------------------------------------------------
DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3)
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion occurred
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
X - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
-----------------------------------------------------------------------------------------
| 1 | | | | | | 1.0 | | | vpcmpgtd k5, ymm3, ymm4
| 1 | | 1.0 | | | | | | | vpaddd ymm4, ymm4, ymm15
| 2 | | 1.0 | 1.0 1.0 | | | | | | vmovdqu32 ymm17{k5}{z}, ymmword ptr [r10+r15*4]
| 1 | | 1.0 | | | | | | | vpaddd ymm18, ymm17, ymm17
| 1 | | | | | | | 1.0 | | add r15, 0x8
| 1 | | 1.0 | | | | | | | vpaddd ymm19, ymm17, ymm18
| 1 | 1.0 | | | | | | | | kmovw k2, k5
| 1 | 1.0 | | | | | | | | kmovw k3, k5
| 1 | 1.0 | | | | | | | | kmovw k1, k5
| 1* | | | | | | | | | vpxord zmm21, zmm21, zmm21
| 1* | | | | | | | | | vpxord zmm20, zmm20, zmm20
| 1* | | | | | | | | | vpxord zmm22, zmm22, zmm22
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm21, k2, zmmword ptr [rbx+ymm19*8+0x8]
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm20, k3, zmmword ptr [rbx+ymm19*8]
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm22, k1, zmmword ptr [rbx+ymm19*8+0x10]
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm18, zmm1, zmm21
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm17, zmm2, zmm20
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm19, zmm0, zmm22
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm31, zmm18, zmm18
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm31, zmm17, zmm17
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm31, zmm19, zmm19
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm30, zmm31
| 1 | | | | | | 1.0 | | | vcmppd k6{k5}, zmm31, zmm14, 0x1
| 1 | | | | | | 1.0 | | | vfpclasspd k0, zmm30, 0x1e
| 1* | | | | | | | | | vmovaps zmm23, zmm31
| 2^ | 1.0 | | | 1.0 1.0 | | | | | vfnmadd213pd zmm23, zmm30, qword ptr [rip]{1to8}
| 1 | 1.0 | | | | | | | | knotw k4, k0
| 1 | | | | | | 1.0 | | | vmulpd zmm24, zmm23, zmm23
| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm30{k4}, zmm23, zmm30
| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm30{k4}, zmm24, zmm30
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm25, zmm30, zmm13
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm27, zmm30, zmm12
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm28, zmm30, zmm25
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm26, zmm30, zmm28
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213pd zmm30, zmm28, zmm5
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm29, zmm26, zmm27
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm23, zmm29, zmm30
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm10{k6}, zmm23, zmm17
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm9{k6}, zmm23, zmm18
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm8{k6}, zmm23, zmm19
| 1* | | | | | | | | | cmp r15, r14
| 0*F | | | | | | | | | jb 0xffffffffffffff0c
Total Num Of Uops: 57
Analysis Notes:
Backend allocation was stalled due to unavailable allocation resources.
There were bubbles in the frontend.

View File

@@ -0,0 +1,128 @@
[0] Code Region
Iterations: 100
Instructions: 4200
Total Cycles: 2465
Total uOps: 5800
Dispatch Width: 6
uOps Per Cycle: 2.35
IPC: 1.70
Block RThroughput: 13.0
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 4 1.00 vpcmpgtd %ymm4, %ymm3, %k5
1 1 0.33 vpaddd %ymm15, %ymm4, %ymm4
2 8 0.50 * vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z}
1 1 0.33 vpaddd %ymm17, %ymm17, %ymm18
1 1 0.25 addq $8, %r15
1 1 0.33 vpaddd %ymm18, %ymm17, %ymm19
1 1 1.00 kmovw %k5, %k2
1 1 1.00 kmovw %k5, %k3
1 1 1.00 kmovw %k5, %k1
1 0 0.17 vpxord %zmm21, %zmm21, %zmm21
1 0 0.17 vpxord %zmm20, %zmm20, %zmm20
1 0 0.17 vpxord %zmm22, %zmm22, %zmm22
5 21 4.00 * vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2}
5 21 4.00 * vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3}
5 21 4.00 * vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1}
1 4 0.50 vsubpd %zmm21, %zmm1, %zmm18
1 4 0.50 vsubpd %zmm20, %zmm2, %zmm17
1 4 0.50 vsubpd %zmm22, %zmm0, %zmm19
1 4 0.50 vmulpd %zmm18, %zmm18, %zmm31
1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm31
1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm31
3 4 2.00 vrcp14pd %zmm31, %zmm30
1 4 1.00 vcmpltpd %zmm14, %zmm31, %k6 {%k5}
1 4 1.00 vfpclasspd $30, %zmm30, %k0
1 1 0.50 vmovaps %zmm31, %zmm23
2 11 0.50 * vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
1 1 1.00 knotw %k0, %k4
1 4 0.50 vmulpd %zmm23, %zmm23, %zmm24
1 4 0.50 vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4}
1 4 0.50 vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4}
1 4 0.50 vmulpd %zmm13, %zmm30, %zmm25
1 4 0.50 vmulpd %zmm12, %zmm30, %zmm27
1 4 0.50 vmulpd %zmm25, %zmm30, %zmm28
1 4 0.50 vmulpd %zmm28, %zmm30, %zmm26
1 4 0.50 vfmsub213pd %zmm5, %zmm28, %zmm30
1 4 0.50 vmulpd %zmm27, %zmm26, %zmm29
1 4 0.50 vmulpd %zmm30, %zmm29, %zmm23
1 4 0.50 vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6}
1 4 0.50 vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6}
1 4 0.50 vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6}
1 1 0.25 cmpq %r14, %r15
1 1 0.50 jb ..B1.16
Resources:
[0] - SKXDivider
[1] - SKXFPDivider
[2] - SKXPort0
[3] - SKXPort1
[4] - SKXPort2
[5] - SKXPort3
[6] - SKXPort4
[7] - SKXPort5
[8] - SKXPort6
[9] - SKXPort7
Resource pressure per iteration:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
- - 19.02 6.79 12.64 13.36 - 16.03 5.16 -
Resource pressure by instruction:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
- - - - - - - 1.00 - - vpcmpgtd %ymm4, %ymm3, %k5
- - 0.28 0.72 - - - - - - vpaddd %ymm15, %ymm4, %ymm4
- - 0.14 0.71 0.55 0.45 - 0.15 - - vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z}
- - - 0.97 - - - 0.03 - - vpaddd %ymm17, %ymm17, %ymm18
- - 0.14 0.41 - - - 0.13 0.32 - addq $8, %r15
- - - 0.99 - - - 0.01 - - vpaddd %ymm18, %ymm17, %ymm19
- - 1.00 - - - - - - - kmovw %k5, %k2
- - 1.00 - - - - - - - kmovw %k5, %k3
- - 1.00 - - - - - - - kmovw %k5, %k1
- - - - - - - - - - vpxord %zmm21, %zmm21, %zmm21
- - - - - - - - - - vpxord %zmm20, %zmm20, %zmm20
- - - - - - - - - - vpxord %zmm22, %zmm22, %zmm22
- - 1.00 0.99 3.52 4.48 - 0.01 1.00 - vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2}
- - 1.00 0.99 4.48 3.52 - 0.01 1.00 - vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3}
- - 1.00 1.00 3.52 4.48 - - 1.00 - vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1}
- - 0.02 - - - - 0.98 - - vsubpd %zmm21, %zmm1, %zmm18
- - 0.17 - - - - 0.83 - - vsubpd %zmm20, %zmm2, %zmm17
- - 0.18 - - - - 0.82 - - vsubpd %zmm22, %zmm0, %zmm19
- - 0.01 - - - - 0.99 - - vmulpd %zmm18, %zmm18, %zmm31
- - 0.69 - - - - 0.31 - - vfmadd231pd %zmm17, %zmm17, %zmm31
- - 0.68 - - - - 0.32 - - vfmadd231pd %zmm19, %zmm19, %zmm31
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm31, %zmm30
- - - - - - - 1.00 - - vcmpltpd %zmm14, %zmm31, %k6 {%k5}
- - - - - - - 1.00 - - vfpclasspd $30, %zmm30, %k0
- - 0.83 - - - - 0.17 - - vmovaps %zmm31, %zmm23
- - 1.00 - 0.57 0.43 - - - - vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
- - 1.00 - - - - - - - knotw %k0, %k4
- - 0.44 - - - - 0.56 - - vmulpd %zmm23, %zmm23, %zmm24
- - 0.56 - - - - 0.44 - - vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4}
- - 0.55 - - - - 0.45 - - vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4}
- - 0.69 - - - - 0.31 - - vmulpd %zmm13, %zmm30, %zmm25
- - 0.31 - - - - 0.69 - - vmulpd %zmm12, %zmm30, %zmm27
- - 0.56 - - - - 0.44 - - vmulpd %zmm25, %zmm30, %zmm28
- - 0.02 - - - - 0.98 - - vmulpd %zmm28, %zmm30, %zmm26
- - 0.98 - - - - 0.02 - - vfmsub213pd %zmm5, %zmm28, %zmm30
- - 0.30 - - - - 0.70 - - vmulpd %zmm27, %zmm26, %zmm29
- - 0.16 - - - - 0.84 - - vmulpd %zmm30, %zmm29, %zmm23
- - 0.17 - - - - 0.83 - - vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6}
- - 0.83 - - - - 0.17 - - vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6}
- - 0.17 - - - - 0.83 - - vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6}
- - - 0.01 - - - 0.01 0.98 - cmpq %r14, %r15
- - 0.14 - - - - - 0.86 - jb ..B1.16

View File

@@ -0,0 +1,130 @@
[0] Code Region
Iterations: 100
Instructions: 4200
Total Cycles: 2465
Total uOps: 5800
Dispatch Width: 6
uOps Per Cycle: 2.35
IPC: 1.70
Block RThroughput: 13.0
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 4 1.00 vpcmpgtd %ymm4, %ymm3, %k5
1 1 0.33 vpaddd %ymm15, %ymm4, %ymm4
2 8 0.50 * vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z}
1 1 0.33 vpaddd %ymm17, %ymm17, %ymm18
1 1 0.25 addq $8, %r15
1 1 0.33 vpaddd %ymm18, %ymm17, %ymm19
1 1 1.00 kmovw %k5, %k2
1 1 1.00 kmovw %k5, %k3
1 1 1.00 kmovw %k5, %k1
1 0 0.17 vpxord %zmm21, %zmm21, %zmm21
1 0 0.17 vpxord %zmm20, %zmm20, %zmm20
1 0 0.17 vpxord %zmm22, %zmm22, %zmm22
5 21 4.00 * vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2}
5 21 4.00 * vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3}
5 21 4.00 * vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1}
1 4 0.50 vsubpd %zmm21, %zmm1, %zmm18
1 4 0.50 vsubpd %zmm20, %zmm2, %zmm17
1 4 0.50 vsubpd %zmm22, %zmm0, %zmm19
1 4 0.50 vmulpd %zmm18, %zmm18, %zmm31
1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm31
1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm31
3 4 2.00 vrcp14pd %zmm31, %zmm30
1 4 1.00 vcmpltpd %zmm14, %zmm31, %k6 {%k5}
1 4 1.00 vfpclasspd $30, %zmm30, %k0
1 1 0.50 vmovaps %zmm31, %zmm23
2 11 0.50 * vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
1 1 1.00 knotw %k0, %k4
1 4 0.50 vmulpd %zmm23, %zmm23, %zmm24
1 4 0.50 vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4}
1 4 0.50 vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4}
1 4 0.50 vmulpd %zmm13, %zmm30, %zmm25
1 4 0.50 vmulpd %zmm12, %zmm30, %zmm27
1 4 0.50 vmulpd %zmm25, %zmm30, %zmm28
1 4 0.50 vmulpd %zmm28, %zmm30, %zmm26
1 4 0.50 vfmsub213pd %zmm5, %zmm28, %zmm30
1 4 0.50 vmulpd %zmm27, %zmm26, %zmm29
1 4 0.50 vmulpd %zmm30, %zmm29, %zmm23
1 4 0.50 vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6}
1 4 0.50 vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6}
1 4 0.50 vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6}
1 1 0.25 cmpq %r14, %r15
1 1 0.50 jb ..B1.16
Resources:
[0] - ICXDivider
[1] - ICXFPDivider
[2] - ICXPort0
[3] - ICXPort1
[4] - ICXPort2
[5] - ICXPort3
[6] - ICXPort4
[7] - ICXPort5
[8] - ICXPort6
[9] - ICXPort7
[10] - ICXPort8
[11] - ICXPort9
Resource pressure per iteration:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11]
- - 19.02 6.79 12.64 13.36 - 16.03 5.16 - - -
Resource pressure by instruction:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions:
- - - - - - - 1.00 - - - - vpcmpgtd %ymm4, %ymm3, %k5
- - 0.28 0.72 - - - - - - - - vpaddd %ymm15, %ymm4, %ymm4
- - 0.14 0.71 0.55 0.45 - 0.15 - - - - vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z}
- - - 0.97 - - - 0.03 - - - - vpaddd %ymm17, %ymm17, %ymm18
- - 0.14 0.41 - - - 0.13 0.32 - - - addq $8, %r15
- - - 0.99 - - - 0.01 - - - - vpaddd %ymm18, %ymm17, %ymm19
- - 1.00 - - - - - - - - - kmovw %k5, %k2
- - 1.00 - - - - - - - - - kmovw %k5, %k3
- - 1.00 - - - - - - - - - kmovw %k5, %k1
- - - - - - - - - - - - vpxord %zmm21, %zmm21, %zmm21
- - - - - - - - - - - - vpxord %zmm20, %zmm20, %zmm20
- - - - - - - - - - - - vpxord %zmm22, %zmm22, %zmm22
- - 1.00 0.99 3.52 4.48 - 0.01 1.00 - - - vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2}
- - 1.00 0.99 4.48 3.52 - 0.01 1.00 - - - vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3}
- - 1.00 1.00 3.52 4.48 - - 1.00 - - - vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1}
- - 0.02 - - - - 0.98 - - - - vsubpd %zmm21, %zmm1, %zmm18
- - 0.17 - - - - 0.83 - - - - vsubpd %zmm20, %zmm2, %zmm17
- - 0.18 - - - - 0.82 - - - - vsubpd %zmm22, %zmm0, %zmm19
- - 0.01 - - - - 0.99 - - - - vmulpd %zmm18, %zmm18, %zmm31
- - 0.69 - - - - 0.31 - - - - vfmadd231pd %zmm17, %zmm17, %zmm31
- - 0.68 - - - - 0.32 - - - - vfmadd231pd %zmm19, %zmm19, %zmm31
- - 2.00 - - - - 1.00 - - - - vrcp14pd %zmm31, %zmm30
- - - - - - - 1.00 - - - - vcmpltpd %zmm14, %zmm31, %k6 {%k5}
- - - - - - - 1.00 - - - - vfpclasspd $30, %zmm30, %k0
- - 0.83 - - - - 0.17 - - - - vmovaps %zmm31, %zmm23
- - 1.00 - 0.57 0.43 - - - - - - vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
- - 1.00 - - - - - - - - - knotw %k0, %k4
- - 0.44 - - - - 0.56 - - - - vmulpd %zmm23, %zmm23, %zmm24
- - 0.56 - - - - 0.44 - - - - vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4}
- - 0.55 - - - - 0.45 - - - - vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4}
- - 0.69 - - - - 0.31 - - - - vmulpd %zmm13, %zmm30, %zmm25
- - 0.31 - - - - 0.69 - - - - vmulpd %zmm12, %zmm30, %zmm27
- - 0.56 - - - - 0.44 - - - - vmulpd %zmm25, %zmm30, %zmm28
- - 0.02 - - - - 0.98 - - - - vmulpd %zmm28, %zmm30, %zmm26
- - 0.98 - - - - 0.02 - - - - vfmsub213pd %zmm5, %zmm28, %zmm30
- - 0.30 - - - - 0.70 - - - - vmulpd %zmm27, %zmm26, %zmm29
- - 0.16 - - - - 0.84 - - - - vmulpd %zmm30, %zmm29, %zmm23
- - 0.17 - - - - 0.83 - - - - vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6}
- - 0.83 - - - - 0.17 - - - - vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6}
- - 0.17 - - - - 0.83 - - - - vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6}
- - - 0.01 - - - 0.01 0.98 - - - cmpq %r14, %r15
- - 0.14 - - - - - 0.86 - - - jb ..B1.16

View File

@@ -0,0 +1,77 @@
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
Analyzed file: lammps-icc-avx512.s
Architecture: CSX
Timestamp: 2023-02-10 16:30:08
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
--------------------------------------------------------------------------------------------------
200 | | | | | | | | || | | # pointer_increment=64 1303ca335e79351a96cfc07b8b9ec9d4
201 | | | | | | | | || | | # LLVM-MCA-BEGIN
202 | | | | | | | | || | | ..B1.16: # Preds ..B1.16 ..B1.15
203 | | | | | | | | || | | # Execution count [2.50e+01]
204 | | | | | | 1.00 | | || | | vpcmpgtd %ymm4, %ymm3, %k5 #59.9
205 | 0.00 | 1.00 | | | | 0.00 | | || | | vpaddd %ymm15, %ymm4, %ymm4 #59.9
206 | 0.00 | 1.00 | 0.50 0.50 | 0.50 0.50 | | 0.00 | | || 0.0 | | vmovdqu32 (%r10,%r15,4), %ymm17{%k5}{z} #60.21
207 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm17, %ymm17, %ymm18 #61.36
208 | 0.00 | 0.16 | | | | 0.00 | 0.84 | || | | addq $8, %r15 #59.9
209 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm18, %ymm17, %ymm19 #61.36
210 | 1.00 | | | | | | | || | | kmovw %k5, %k2 #61.36
211 | 1.00 | | | | | | | || | | kmovw %k5, %k3 #61.36
212 | 1.00 | | | | | | | || | | kmovw %k5, %k1 #61.36
213 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm21, %zmm21, %zmm21 #61.36
214 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm20, %zmm20, %zmm20 #61.36
215 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm22, %zmm22, %zmm22 #61.36
216 | 1.25 | 0.75 | 5.00 5.00 | 5.00 5.00 | | 0.25 | 0.75 | || 24.0 | | vgatherdpd 8(%rbx,%ymm19,8), %zmm21{%k2} #61.36
217 | 1.25 | 0.25 | 5.00 5.00 | 5.00 5.00 | | 0.25 | 1.25 | || | | vgatherdpd (%rbx,%ymm19,8), %zmm20{%k3} #61.36
218 | 1.25 | 0.09 | 5.00 5.00 | 5.00 5.00 | | 0.25 | 1.41 | || | | vgatherdpd 16(%rbx,%ymm19,8), %zmm22{%k1} #61.36
219 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm21, %zmm1, %zmm18 #62.36
220 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm20, %zmm2, %zmm17 #61.36
221 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm22, %zmm0, %zmm19 #63.36
222 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm18, %zmm18, %zmm31 #64.49
223 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm17, %zmm17, %zmm31 #64.49
224 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm19, %zmm19, %zmm31 #64.63
225 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm31, %zmm30 #75.39
226 | | | | | | 1.00 | | || | | vcmppd $1, %zmm14, %zmm31, %k6{%k5} #74.22
227 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm30, %k0 #75.39
228 | | | | | | | | || | | * vmovaps %zmm31, %zmm23 #75.39
229 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23 #75.39
230 | 1.00 | | | | | | | || | | knotw %k0, %k4 #75.39
231 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm23, %zmm23, %zmm24 #75.39
232 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm30, %zmm23, %zmm30{%k4} #75.39
233 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd213pd %zmm30, %zmm24, %zmm30{%k4} #75.39
234 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm13, %zmm30, %zmm25 #76.38
235 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm12, %zmm30, %zmm27 #77.55
236 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm25, %zmm30, %zmm28 #76.44
237 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm28, %zmm30, %zmm26 #76.50
238 | 0.00 | | | | | 1.00 | | || | | vfmsub213pd %zmm5, %zmm28, %zmm30 #77.55
239 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm27, %zmm26, %zmm29 #77.64
240 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm30, %zmm29, %zmm23 #77.70
241 | 0.00 | | | | | 1.00 | | || 4.0 | | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17
242 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17
243 | 0.00 | | | | | 1.00 | | || | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17
244 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || | | cmpq %r14, %r15 #59.9
245 | | | | | | | | || | | * jb ..B1.16 # Prob 82% #59.9
246 | | | | | | | | || | | # LLVM-MCA-END
18.8 5.25 16.0 16.0 16.0 16.0 18.8 5.25 86.0 4.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
243 | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17| [243]
242 | 4.0 | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17| [242]
241 | 4.0 | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17| [241]
208 | 1.0 | addq $8, %r15 #59.9| [208]
205 | 1.0 | vpaddd %ymm15, %ymm4, %ymm4 #59.9| [205]

View File

@@ -0,0 +1,77 @@
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
Analyzed file: lammps-icc-avx512.s
Architecture: ICX
Timestamp: 2023-02-10 16:29:42
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD |
------------------------------------------------------------------------------------------------------------------------
200 | | | | | | | | | | || | | # pointer_increment=64 1303ca335e79351a96cfc07b8b9ec9d4
201 | | | | | | | | | | || | | # LLVM-MCA-BEGIN
202 | | | | | | | | | | || | | ..B1.16: # Preds ..B1.16 ..B1.15
203 | | | | | | | | | | || | | # Execution count [2.50e+01]
204 | | | | | | 1.000 | | | | || | | vpcmpgtd %ymm4, %ymm3, %k5 #59.9
205 | 0.00 | 1.00 | | | | 0.000 | | | | || | | vpaddd %ymm15, %ymm4, %ymm4 #59.9
206 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovdqu32 (%r10,%r15,4), %ymm17{%k5}{z} #60.21
207 | 0.00 | 1.00 | | | | 0.000 | | | | || 1.0 | | vpaddd %ymm17, %ymm17, %ymm18 #61.36
208 | 0.00 | 0.00 | | | | 0.000 | 1.00 | | | || | | addq $8, %r15 #59.9
209 | 0.00 | 1.00 | | | | 0.000 | | | | || 1.0 | | vpaddd %ymm18, %ymm17, %ymm19 #61.36
210 | 1.00 | | | | | | | | | || | | kmovw %k5, %k2 #61.36
211 | 1.00 | | | | | | | | | || | | kmovw %k5, %k3 #61.36
212 | 1.00 | | | | | | | | | || | | kmovw %k5, %k1 #61.36
213 | 0.50 | | | | | 0.500 | | | | || | | vpxord %zmm21, %zmm21, %zmm21 #61.36
214 | 0.24 | | | | | 0.760 | | | | || | | vpxord %zmm20, %zmm20, %zmm20 #61.36
215 | 0.50 | | | | | 0.500 | | | | || | | vpxord %zmm22, %zmm22, %zmm22 #61.36
216 | 0.67 | 2.33 | 7.00 7.00 | 7.00 7.00 | | 0.000 | | | | || 24.0 | | vgatherdpd 8(%rbx,%ymm19,8), %zmm21{%k2} #61.36
217 | 0.67 | 2.33 | 7.00 7.00 | 7.00 7.00 | | 0.000 | | | | || | | vgatherdpd (%rbx,%ymm19,8), %zmm20{%k3} #61.36
218 | 0.67 | 2.33 | 7.00 7.00 | 7.00 7.00 | | 0.000 | | | | || | | vgatherdpd 16(%rbx,%ymm19,8), %zmm22{%k1} #61.36
219 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vsubpd %zmm21, %zmm1, %zmm18 #62.36
220 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm20, %zmm2, %zmm17 #61.36
221 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm22, %zmm0, %zmm19 #63.36
222 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm18, %zmm18, %zmm31 #64.49
223 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm17, %zmm17, %zmm31 #64.49
224 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm19, %zmm19, %zmm31 #64.63
225 | 2.50 | | | | | 0.500 | | | | || 6.0 | | vrcp14pd %zmm31, %zmm30 #75.39
226 | | | | | | 1.000 | | | | || | | vcmppd $1, %zmm14, %zmm31, %k6{%k5} #74.22
227 | | | | | | 1.000 | | | | || | | vfpclasspd $30, %zmm30, %k0 #75.39
228 | 0.50 | | | | | 0.500 | | | | || | | vmovaps %zmm31, %zmm23 #75.39
229 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.500 | | | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23 #75.39
230 | 1.00 | | | | | | | | | || | | knotw %k0, %k4 #75.39
231 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm23, %zmm23, %zmm24 #75.39
232 | 0.50 | | | | | 0.500 | | | | || | | vfmadd213pd %zmm30, %zmm23, %zmm30{%k4} #75.39
233 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd213pd %zmm30, %zmm24, %zmm30{%k4} #75.39
234 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm13, %zmm30, %zmm25 #76.38
235 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm12, %zmm30, %zmm27 #77.55
236 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm25, %zmm30, %zmm28 #76.44
237 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm28, %zmm30, %zmm26 #76.50
238 | 0.50 | | | | | 0.500 | | | | || | | vfmsub213pd %zmm5, %zmm28, %zmm30 #77.55
239 | 0.25 | | | | | 0.750 | | | | || 4.0 | | vmulpd %zmm27, %zmm26, %zmm29 #77.64
240 | 0.00 | | | | | 1.000 | | | | || 4.0 | | vmulpd %zmm30, %zmm29, %zmm23 #77.70
241 | 0.00 | | | | | 1.000 | | | | || 4.0 | | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17
242 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17
243 | 0.00 | | | | | 1.000 | | | | || | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17
244 | 0.00 | 0.00 | | | | -0.01 | 1.00 | | | || | | cmpq %r14, %r15 #59.9
245 | | | | | | | | | | || | | * jb ..B1.16 # Prob 82% #59.9
246 | | | | | | | | | | || | | # LLVM-MCA-END
18.0 9.98 22.0 22.0 22.0 22.0 18.00 2.00 89 4.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
243 | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17| [243]
242 | 4.0 | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17| [242]
241 | 4.0 | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17| [241]
208 | 1.0 | addq $8, %r15 #59.9| [208]
205 | 1.0 | vpaddd %ymm15, %ymm4, %ymm4 #59.9| [205]

View File

@@ -0,0 +1,197 @@
[0] Code Region
Iterations: 100
Instructions: 7000
Total Cycles: 3866
Total uOps: 7900
Dispatch Width: 6
uOps Per Cycle: 2.04
IPC: 1.81
Block RThroughput: 21.5
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 8 0.50 * vpbroadcastd .LCPI0_1(%rip), %xmm1
1 10 0.50 * vpmulld (%r11,%rbp,4), %xmm1, %xmm11
2 4 1.50 vpmovsxdq %xmm11, %ymm1
1 1 0.50 vpsllq $3, %ymm1, %ymm1
1 1 0.25 vpaddq %ymm1, %ymm3, %ymm1
1 1 1.00 vmovq %xmm1, %r14
2 1 1.00 vpextrq $1, %xmm1, %r9
1 4 1.00 vextracti128 $1, %ymm1, %xmm1
1 8 0.50 * vmovsd (%r14), %xmm2
1 8 0.50 * vpsubd .LCPI0_5, %xmm11, %xmm6
2 4 1.50 vpmovsxdq %xmm6, %ymm6
1 1 0.50 vpsllq $3, %ymm6, %ymm6
1 1 1.00 vmovq %xmm1, %rdi
1 1 0.25 vpaddq %ymm6, %ymm3, %ymm6
1 1 1.00 vmovq %xmm6, %rcx
2 1 1.00 vpextrq $1, %xmm1, %rbx
2 1 1.00 vpextrq $1, %xmm6, %rax
1 4 1.00 vextracti128 $1, %ymm6, %xmm1
1 8 0.50 * vmovsd (%rdi), %xmm6
1 1 1.00 vmovq %xmm1, %rdi
2 1 1.00 vpextrq $1, %xmm1, %rsi
1 8 0.50 * vmovsd (%rdi), %xmm1
1 8 0.50 * vmovsd (%rcx), %xmm7
1 8 0.50 * vpbroadcastd .LCPI0_2(%rip), %xmm12
1 8 0.50 * vmovhpd (%r9), %xmm2, %xmm2
1 1 0.25 vpaddd %xmm12, %xmm11, %xmm4
2 4 1.50 vpmovsxdq %xmm4, %ymm4
1 8 0.50 * vmovhpd (%rax), %xmm7, %xmm7
1 1 0.50 vpsllq $3, %ymm4, %ymm4
1 1 0.25 vpaddq %ymm4, %ymm3, %ymm4
1 8 0.50 * vmovhpd (%rbx), %xmm6, %xmm6
2 1 1.00 vpextrq $1, %xmm4, %rax
1 8 0.50 * vmovhpd (%rsi), %xmm1, %xmm1
1 1 1.00 vmovq %xmm4, %rcx
1 4 1.00 vextracti128 $1, %ymm4, %xmm4
1 1 1.00 vmovq %xmm4, %rsi
1 2 1.00 vinsertf128 $1, %xmm6, %ymm2, %ymm2
2 1 1.00 vpextrq $1, %xmm4, %rdi
1 8 0.50 * vmovsd (%rsi), %xmm4
1 3 0.50 vsubpd %ymm2, %ymm14, %ymm2
1 8 0.50 * vmovhpd (%rdi), %xmm4, %xmm4
1 8 0.50 * vmovsd (%rcx), %xmm6
1 2 1.00 vinsertf128 $1, %xmm1, %ymm7, %ymm1
1 8 0.50 * vmovhpd (%rax), %xmm6, %xmm6
1 2 1.00 vinsertf128 $1, %xmm4, %ymm6, %ymm4
1 3 0.50 vsubpd %ymm1, %ymm5, %ymm1
1 3 0.50 vsubpd %ymm4, %ymm10, %ymm4
1 3 0.50 vmulpd %ymm2, %ymm2, %ymm6
1 4 1.00 vfmadd231pd %ymm1, %ymm1, %ymm6
1 4 1.00 vfmadd231pd %ymm4, %ymm4, %ymm6
1 8 0.50 * vbroadcastsd .LCPI0_3(%rip), %ymm7
1 13 5.00 vdivpd %ymm6, %ymm7, %ymm7
1 3 0.50 vmulpd %ymm7, %ymm7, %ymm11
1 3 0.50 vmulpd %ymm9, %ymm11, %ymm11
1 8 0.50 * vbroadcastsd .LCPI0_4(%rip), %ymm12
1 3 0.50 vmulpd %ymm7, %ymm11, %ymm11
1 3 0.50 vaddpd %ymm12, %ymm11, %ymm12
1 10 0.50 * vmulpd 128(%rsp), %ymm7, %ymm7
1 3 0.50 vmulpd %ymm7, %ymm11, %ymm7
1 3 0.50 vmulpd %ymm7, %ymm12, %ymm7
1 1 0.50 vcmpltpd %ymm8, %ymm6, %ymm6
1 4 1.00 vfmadd213pd %ymm0, %ymm7, %ymm2
1 1 0.50 vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
1 4 1.00 vfmadd213pd %ymm15, %ymm7, %ymm1
1 4 1.00 vfmadd213pd %ymm13, %ymm7, %ymm4
1 1 0.50 vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
1 1 0.50 vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
1 1 0.25 addq $4, %rbp
1 1 0.25 cmpq %rdx, %rbp
1 1 0.50 jb .LBB0_9
Resources:
[0] - Zn3AGU0
[1] - Zn3AGU1
[2] - Zn3AGU2
[3] - Zn3ALU0
[4] - Zn3ALU1
[5] - Zn3ALU2
[6] - Zn3ALU3
[7] - Zn3BRU1
[8] - Zn3FPP0
[9] - Zn3FPP1
[10] - Zn3FPP2
[11] - Zn3FPP3
[12.0] - Zn3FPP45
[12.1] - Zn3FPP45
[13] - Zn3FPSt
[14.0] - Zn3LSU
[14.1] - Zn3LSU
[14.2] - Zn3LSU
[15.0] - Zn3Load
[15.1] - Zn3Load
[15.2] - Zn3Load
[16.0] - Zn3Store
[16.1] - Zn3Store
Resource pressure per iteration:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
- - - 0.60 0.60 0.60 0.60 0.60 16.84 23.53 16.30 7.33 21.50 21.50 - 6.33 6.33 6.34 6.33 6.33 6.34 - -
Resource pressure by instruction:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
- - - - - - - - - 0.03 0.97 - 0.51 0.49 - 0.34 0.33 0.33 0.34 0.33 0.33 - - vpbroadcastd .LCPI0_1(%rip), %xmm1
- - - - - - - - 0.65 - - 0.35 0.34 0.66 - 0.49 0.05 0.46 0.49 0.05 0.46 - - vpmulld (%r11,%rbp,4), %xmm1, %xmm11
- - - - - - - - - 0.06 2.94 - - - - - - - - - - - - vpmovsxdq %xmm11, %ymm1
- - - - - - - - - 0.65 0.35 - - - - - - - - - - - - vpsllq $3, %ymm1, %ymm1
- - - - - - - - - - - 1.00 - - - - - - - - - - - vpaddq %ymm1, %ymm3, %ymm1
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vmovq %xmm1, %r14
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm1, %r9
- - - - - - - - 1.00 - - - - - - - - - - - - - - vextracti128 $1, %ymm1, %xmm1
- - - - - - - - - - - - 0.50 0.50 - 0.48 0.35 0.17 0.48 0.35 0.17 - - vmovsd (%r14), %xmm2
- - - - - - - - 0.01 0.18 0.17 0.64 0.47 0.53 - 0.34 0.33 0.33 0.34 0.33 0.33 - - vpsubd .LCPI0_5, %xmm11, %xmm6
- - - - - - - - - 1.92 1.08 - - - - - - - - - - - - vpmovsxdq %xmm6, %ymm6
- - - - - - - - - 0.32 0.68 - - - - - - - - - - - - vpsllq $3, %ymm6, %ymm6
- - - - - - - - - - - - 1.30 0.70 - - - - - - - - - vmovq %xmm1, %rdi
- - - - - - - - - - 0.32 0.68 - - - - - - - - - - - vpaddq %ymm6, %ymm3, %ymm6
- - - - - - - - - - - - 1.34 0.66 - - - - - - - - - vmovq %xmm6, %rcx
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm1, %rbx
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm6, %rax
- - - - - - - - 1.00 - - - - - - - - - - - - - - vextracti128 $1, %ymm6, %xmm1
- - - - - - - - - - - - 0.50 0.50 - 0.03 0.65 0.32 0.03 0.65 0.32 - - vmovsd (%rdi), %xmm6
- - - - - - - - - - - - 0.36 1.64 - - - - - - - - - vmovq %xmm1, %rdi
- - - - - - - - - - - - 1.64 0.36 - - - - - - - - - vpextrq $1, %xmm1, %rsi
- - - - - - - - - - - - 0.32 0.68 - 0.51 0.33 0.16 0.51 0.33 0.16 - - vmovsd (%rdi), %xmm1
- - - - - - - - - - - - 0.68 0.32 - 0.49 0.01 0.50 0.49 0.01 0.50 - - vmovsd (%rcx), %xmm7
- - - - - - - - - 0.48 0.52 - 0.67 0.33 - 0.17 0.62 0.21 0.17 0.62 0.21 - - vpbroadcastd .LCPI0_2(%rip), %xmm12
- - - - - - - - - 0.01 0.99 - 0.17 0.83 - 0.02 0.64 0.34 0.02 0.64 0.34 - - vmovhpd (%r9), %xmm2, %xmm2
- - - - - - - - 0.01 - - 0.99 - - - - - - - - - - - vpaddd %xmm12, %xmm11, %xmm4
- - - - - - - - - 0.57 2.43 - - - - - - - - - - - - vpmovsxdq %xmm4, %ymm4
- - - - - - - - - 0.34 0.66 - 0.82 0.18 - 0.49 0.35 0.16 0.49 0.35 0.16 - - vmovhpd (%rax), %xmm7, %xmm7
- - - - - - - - - 0.34 0.66 - - - - - - - - - - - - vpsllq $3, %ymm4, %ymm4
- - - - - - - - - - 0.01 0.99 - - - - - - - - - - - vpaddq %ymm4, %ymm3, %ymm4
- - - - - - - - - 0.51 0.49 - 0.49 0.51 - 0.35 0.16 0.49 0.35 0.16 0.49 - - vmovhpd (%rbx), %xmm6, %xmm6
- - - - - - - - - - - - 1.04 0.96 - - - - - - - - - vpextrq $1, %xmm4, %rax
- - - - - - - - - 0.49 0.51 - 0.17 0.83 - 0.16 0.49 0.35 0.16 0.49 0.35 - - vmovhpd (%rsi), %xmm1, %xmm1
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vmovq %xmm4, %rcx
- - - - - - - - 1.00 - - - - - - - - - - - - - - vextracti128 $1, %ymm4, %xmm4
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vmovq %xmm4, %rsi
- - - - - - - - - 1.00 - - - - - - - - - - - - - vinsertf128 $1, %xmm6, %ymm2, %ymm2
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm4, %rdi
- - - - - - - - - - - - 0.50 0.50 - 0.49 0.35 0.16 0.49 0.35 0.16 - - vmovsd (%rsi), %xmm4
- - - - - - - - - - 0.31 0.69 - - - - - - - - - - - vsubpd %ymm2, %ymm14, %ymm2
- - - - - - - - - 0.49 0.51 - 0.48 0.52 - 0.35 0.16 0.49 0.35 0.16 0.49 - - vmovhpd (%rdi), %xmm4, %xmm4
- - - - - - - - - - - - 0.52 0.48 - 0.16 0.49 0.35 0.16 0.49 0.35 - - vmovsd (%rcx), %xmm6
- - - - - - - - - 1.00 - - - - - - - - - - - - - vinsertf128 $1, %xmm1, %ymm7, %ymm1
- - - - - - - - - 0.35 0.65 - 0.50 0.50 - 0.47 0.35 0.18 0.47 0.35 0.18 - - vmovhpd (%rax), %xmm6, %xmm6
- - - - - - - - - 1.00 - - - - - - - - - - - - - vinsertf128 $1, %xmm4, %ymm6, %ymm4
- - - - - - - - - - 0.33 0.67 - - - - - - - - - - - vsubpd %ymm1, %ymm5, %ymm1
- - - - - - - - - - 0.51 0.49 - - - - - - - - - - - vsubpd %ymm4, %ymm10, %ymm4
- - - - - - - - 0.52 0.48 - - - - - - - - - - - - - vmulpd %ymm2, %ymm2, %ymm6
- - - - - - - - 1.00 1.00 - - - - - - - - - - - - - vfmadd231pd %ymm1, %ymm1, %ymm6
- - - - - - - - 1.34 0.66 - - - - - - - - - - - - - vfmadd231pd %ymm4, %ymm4, %ymm6
- - - - - - - - - 0.66 0.34 - 0.51 0.49 - 0.19 0.32 0.49 0.19 0.32 0.49 - - vbroadcastsd .LCPI0_3(%rip), %ymm7
- - - - - - - - - 5.00 - - - - - - - - - - - - - vdivpd %ymm6, %ymm7, %ymm7
- - - - - - - - 0.50 0.50 - - - - - - - - - - - - - vmulpd %ymm7, %ymm7, %ymm11
- - - - - - - - 0.99 0.01 - - - - - - - - - - - - - vmulpd %ymm9, %ymm11, %ymm11
- - - - - - - - - 0.30 0.70 - 0.49 0.51 - 0.34 0.33 0.33 0.34 0.33 0.33 - - vbroadcastsd .LCPI0_4(%rip), %ymm12
- - - - - - - - 0.82 0.18 - - - - - - - - - - - - - vmulpd %ymm7, %ymm11, %ymm11
- - - - - - - - - - 0.17 0.83 - - - - - - - - - - - vaddpd %ymm12, %ymm11, %ymm12
- - - - - - - - 0.01 0.99 - - 0.18 0.82 - 0.46 0.02 0.52 0.46 0.02 0.52 - - vmulpd 128(%rsp), %ymm7, %ymm7
- - - - - - - - 0.99 0.01 - - - - - - - - - - - - - vmulpd %ymm7, %ymm11, %ymm7
- - - - - - - - 0.67 0.33 - - - - - - - - - - - - - vmulpd %ymm7, %ymm12, %ymm7
- - - - - - - - 1.00 - - - - - - - - - - - - - - vcmpltpd %ymm8, %ymm6, %ymm6
- - - - - - - - 1.34 0.66 - - - - - - - - - - - - - vfmadd213pd %ymm0, %ymm7, %ymm2
- - - - - - - - 0.66 0.34 - - - - - - - - - - - - - vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
- - - - - - - - 0.66 1.34 - - - - - - - - - - - - - vfmadd213pd %ymm15, %ymm7, %ymm1
- - - - - - - - 1.34 0.66 - - - - - - - - - - - - - vfmadd213pd %ymm13, %ymm7, %ymm4
- - - - - - - - 0.34 0.66 - - - - - - - - - - - - - vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
- - - - - - - - 0.99 0.01 - - - - - - - - - - - - - vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
- - - - 0.40 0.20 0.40 - - - - - - - - - - - - - - - - addq $4, %rbp
- - - 0.20 0.20 0.40 0.20 - - - - - - - - - - - - - - - - cmpq %rdx, %rbp
- - - 0.40 - - - 0.60 - - - - - - - - - - - - - - - jb .LBB0_9

View File

@@ -0,0 +1,108 @@
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
Analyzed file: lammps-icx-avx2zen.s
Architecture: ZEN3
Timestamp: 2023-02-10 16:31:30
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 | 1 | 2 | 3 | DV0 | DV1 | 4 | 5 | 6 | 7 | 8 - 8DV | 9 | 10 | 11 | 12 | 13 || CP | LCD |
--------------------------------------------------------------------------------------------------------------------------------------------
175 | | | | | | | | | | | | | | | | || | | # pointer_increment=16 e95035fc9e97f63299dd5188a0872bfc
176 | | | | | | | | | | | | | | | | || | | # LLVM-MCA-BEGIN
177 | | | | | | | | | | | | | | | | || | | .LBB0_9: #
178 | | | | | | | | | | | | | | | | || | | # Parent Loop BB0_6 Depth=1
179 | | | | | | | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2
180 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || 1.0 | | vpbroadcastd .LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
181 | 0.00 | | | 1.00 | | | | | | | | | | 0.50 | 0.50 | || 3.0 | | vpmulld (%r11,%rbp,4), %xmm1, %xmm11
182 | 0.00 | 0.75 | 0.38 | 0.87 | | | | | | | | | | | | || 4.0 | | vpmovsxdq %xmm11, %ymm1
183 | | 0.00 | 1.00 | | | | | | | | | | | | | || 1.0 | | vpsllq $3, %ymm1, %ymm1
184 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || 1.0 | | vpaddq %ymm1, %ymm3, %ymm1
185 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vmovq %xmm1, %r14
186 | 0.12 | 1.88 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm1, %r9
187 | | 1.00 | | | | | | | | | | | | | | || 3.0 | | vextracti128 $1, %ymm1, %xmm1
188 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%r14), %xmm2 # xmm2 = mem[0],zero
189 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vpsubd .LCPI0_5, %xmm11, %xmm6
190 | 0.00 | 0.75 | 0.38 | 0.87 | | | | | | | | | | | | || | | vpmovsxdq %xmm6, %ymm6
191 | | 0.00 | 1.00 | | | | | | | | | | | | | || | | vpsllq $3, %ymm6, %ymm6
192 | 0.00 | 0.00 | 0.49 | 0.51 | | | | | | | | | | | | || | | vmovq %xmm1, %rdi
193 | 0.00 | 0.00 | 0.51 | 0.49 | | | | | | | | | | | | || | | vpaddq %ymm6, %ymm3, %ymm6
194 | 0.00 | 0.00 | 0.49 | 0.51 | | | | | | | | | | | | || | | vmovq %xmm6, %rcx
195 | 0.13 | 1.87 | | | | | | | | | | | | | | || 6.0 | | vpextrq $1, %xmm1, %rbx
196 | 0.00 | 2.00 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm6, %rax
197 | | 1.00 | | | | | | | | | | | | | | || | | vextracti128 $1, %ymm6, %xmm1
198 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rdi), %xmm6 # xmm6 = mem[0],zero
199 | 0.00 | 0.00 | 0.00 | 1.00 | | | | | | | | | | | | || | | vmovq %xmm1, %rdi
200 | 0.00 | 2.00 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm1, %rsi
201 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
202 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rcx), %xmm7 # xmm7 = mem[0],zero
203 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vpbroadcastd .LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
204 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%r9), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0]
205 | 0.00 | 0.00 | 0.63 | 0.37 | | | | | | | | | | | | || | | vpaddd %xmm12, %xmm11, %xmm4
206 | 0.00 | 0.75 | 0.00 | 1.25 | | | | | | | | | | | | || | | vpmovsxdq %xmm4, %ymm4
207 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rax), %xmm7, %xmm7 # xmm7 = xmm7[0],mem[0]
208 | | 0.00 | 1.00 | | | | | | | | | | | | | || | | vpsllq $3, %ymm4, %ymm4
209 | 0.00 | 0.00 | 0.00 | 1.00 | | | | | | | | | | | | || | | vpaddq %ymm4, %ymm3, %ymm4
210 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || 5.0 | | vmovhpd (%rbx), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
211 | 0.75 | 1.25 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm4, %rax
212 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rsi), %xmm1, %xmm1 # xmm1 = xmm1[0],mem[0]
213 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vmovq %xmm4, %rcx
214 | | 1.00 | | | | | | | | | | | | | | || | | vextracti128 $1, %ymm4, %xmm4
215 | 0.00 | 0.00 | 0.00 | 1.00 | | | | | | | | | | | | || | | vmovq %xmm4, %rsi
216 | | 1.00 | | | | | | | | | | | | | | || 1.0 | | vinsertf128 $1, %xmm6, %ymm2, %ymm2
217 | 1.00 | 1.00 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm4, %rdi
218 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rsi), %xmm4 # xmm4 = mem[0],zero
219 | | | 0.00 | 1.00 | | | | | | | | | | | | || 3.0 | | vsubpd %ymm2, %ymm14, %ymm2
220 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rdi), %xmm4, %xmm4 # xmm4 = xmm4[0],mem[0]
221 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rcx), %xmm6 # xmm6 = mem[0],zero
222 | | 1.00 | | | | | | | | | | | | | | || | | vinsertf128 $1, %xmm1, %ymm7, %ymm1
223 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rax), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
224 | | 1.00 | | | | | | | | | | | | | | || | | vinsertf128 $1, %xmm4, %ymm6, %ymm4
225 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vsubpd %ymm1, %ymm5, %ymm1
226 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vsubpd %ymm4, %ymm10, %ymm4
227 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm2, %ymm2, %ymm6
228 | 1.00 | 0.00 | | | | | | | | | | | | | | || 4.0 | | vfmadd231pd %ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
229 | 1.00 | 0.00 | | | | | | | | | | | | | | || 4.0 | | vfmadd231pd %ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
230 | 1.00 | | | | | | | | | | | | | 0.50 | 0.50 | || | | vbroadcastsd .LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
231 | | | | | 4.50 | 4.50 | | | | | | | | | | || 13.0 | | vdivpd %ymm6, %ymm7, %ymm7
232 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm7, %ymm11
233 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm9, %ymm11, %ymm11
234 | 1.00 | | | | | | | | | | | | | 0.50 | 0.50 | || | | vbroadcastsd .LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
235 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm11, %ymm11
236 | | | 0.00 | 1.00 | | | | | | | | | | | | || 3.0 | | vaddpd %ymm12, %ymm11, %ymm12
237 | 1.00 | 0.00 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
238 | 1.00 | 0.00 | | | | | | | | | | | | | | || | | vmulpd %ymm7, %ymm11, %ymm7
239 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm12, %ymm7
240 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vcmpltpd %ymm8, %ymm6, %ymm6
241 | 1.00 | 0.00 | | | | | | | | | | | | | | || 4.0 | | vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
242 | 1.00 | 0.00 | | | | | | | | | | | | | | || 1.0 | | vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
243 | 1.00 | 0.00 | | | | | | | | | | | | | | || | | vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
244 | 1.00 | 0.00 | | | | | | | | | | | | | | || | 4.0 | vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
245 | 1.00 | 0.00 | | | | | | | | | | | | | | || | | vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
246 | 0.75 | 0.25 | | | | | | | | | | | | | | || | 1.0 | vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
247 | | | | | | | | | 0.25 | 0.25 | 0.25 | 0.25 | | | | || | | addq $4, %rbp
248 | | | | | | | | | 0.25 | 0.25 | 0.25 | 0.25 | | | | || | | cmpq %rdx, %rbp
249 | | | | | | | | | 0.00 | | | | 1.00 | | | || | | jb .LBB0_9
250 | | | | | | | | | | | | | | | | || | | # LLVM-MCA-END
18.8 18.5 15.9 15.9 4.50 4.50 0.50 0.50 0.50 0.50 9.00 9.00 72 5.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
244 | 5.0 | vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13| [244, 246]
243 | 5.0 | vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15| [243, 245]
241 | 5.0 | vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0| [241, 242]
247 | 1.0 | addq $4, %rbp | [247]
246 | 1.0 | vblendvpd %ymm6, %ymm4, %ymm13, %ymm13| [246]
245 | 1.0 | vblendvpd %ymm6, %ymm1, %ymm15, %ymm15| [245]
242 | 1.0 | vblendvpd %ymm6, %ymm2, %ymm0, %ymm0| [242]