Add outputs for new analyses
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
This commit is contained in:
parent
4e5fe27c0f
commit
300776f512
120
static_analysis/rafael/analyses/gromacs-icx-avx512-dp-iaca.out
Normal file
120
static_analysis/rafael/analyses/gromacs-icx-avx512-dp-iaca.out
Normal file
@ -0,0 +1,120 @@
|
||||
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||
Analyzed File - gromacs-avx512-dp.o
|
||||
Binary Format - 64Bit
|
||||
Architecture - SKX
|
||||
Analysis Type - Throughput
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
Block Throughput: 45.95 Cycles Throughput Bottleneck: Backend
|
||||
Loop Count: 22
|
||||
Port Binding In Cycles Per Iteration:
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Cycles | 40.0 0.0 | 2.0 | 5.0 5.0 | 5.0 5.0 | 0.0 | 40.0 | 2.0 | 0.0 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
|
||||
DV - Divider pipe (on port 0)
|
||||
D - Data fetch pipe (on ports 2 and 3)
|
||||
F - Macro Fusion with the previous instruction occurred
|
||||
* - instruction micro-ops not bound to a port
|
||||
^ - Micro Fusion occurred
|
||||
# - ESP Tracking sync uop was issued
|
||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||
X - instruction not supported, was not accounted in Analysis
|
||||
|
||||
| Num Of | Ports pressure in cycles | |
|
||||
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
-----------------------------------------------------------------------------------------
|
||||
| 1 | | | 1.0 1.0 | | | | | | movsxd rax, dword ptr [rdx+rcx*4]
|
||||
| 1 | | 1.0 | | | | | | | lea rax, ptr [rax+rax*2]
|
||||
| 1 | | | | | | | 1.0 | | shl rax, 0x6
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovapd zmm28, zmmword ptr [rdi+rax*1]
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovapd zmm29, zmmword ptr [rdi+rax*1+0x40]
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovapd zmm30, zmmword ptr [rdi+rax*1+0x80]
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovupd zmm3, zmmword ptr [rsp+0x40]
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm14, zmm3, zmm28
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm12, zmm26, zmm29
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm31, zmm25, zmm30
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm3, zmm23, zmm28
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm15, zmm31, zmm31
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm15, zmm12, zmm12
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm15, zmm14, zmm14
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm16, zmm15
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k1, zmm15, zmm0, 0x1
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm15, zmm16, zmm20
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm16, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm15, zmm18, zmm15
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovupd zmm18, zmmword ptr [rsp+0x200]
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm18, zmm18, zmm29
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm1, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm16, zmm16, zmm15
|
||||
| 1 | 1.0 | | | | | | | | vaddpd zmm15, zmm15, zmm2
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm15, zmm16, zmm15
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovupd zmm16, zmmword ptr [rsp+0x1c0]
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm16, zmm16, zmm30
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm19{k1}, zmm15, zmm14
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm16, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm14, zmm18, zmm18
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm14, zmm3, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k2, zmm14, zmm0, 0x1
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm14, zmm14
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm11{k1}, zmm15, zmm12
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm7{k1}, zmm15, zmm31
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm12, zmm14, zmm20
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm15, zmm14, zmm14
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm12, zmm15, zmm12
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm15, zmm24, zmm28
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm14, zmm1, zmm14
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm14, zmm12
|
||||
| 1 | | | | | | 1.0 | | | vaddpd zmm12, zmm12, zmm2
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm12, zmm14, zmm12
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm14, zmm22, zmm29
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm17{k2}, zmm12, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm3, zmm27, zmm30
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm9{k2}, zmm12, zmm18
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm3, zmm3
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm18, zmm14, zmm14
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm18, zmm15, zmm15
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k1, zmm18, zmm0, 0x1
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm18, zmm18
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm5{k2}, zmm12, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm12, zmm18, zmm20
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm18, zmm18
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm12, zmm16, zmm12
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm1, zmm18
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm16, zmm16, zmm12
|
||||
| 1 | 1.0 | | | | | | | | vaddpd zmm12, zmm12, zmm2
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm12, zmm16, zmm12
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm21{k1}, zmm12, zmm15
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovupd zmm15, zmmword ptr [rsp+0x240]
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm15, zmm15, zmm28
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovupd zmm16, zmmword ptr [rsp+0x80]
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm16, zmm16, zmm29
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovupd zmm18, zmmword ptr [rsp+0x180]
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm18, zmm18, zmm30
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm10{k1}, zmm12, zmm14
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm14, zmm18, zmm18
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm14, zmm16, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm14, zmm15, zmm15
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k2, zmm14, zmm0, 0x1
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm14, zmm14
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm6{k1}, zmm12, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm14, zmm20
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm12, zmm14, zmm14
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm12, zmm3
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm12, zmm1, zmm14
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm12, zmm12, zmm3
|
||||
| 1 | 1.0 | | | | | | | | vaddpd zmm3, zmm3, zmm2
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm12, zmm3
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm13{k2}, zmm3, zmm15
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm8{k2}, zmm3, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm4{k2}, zmm3, zmm18
|
||||
| 1 | | 1.0 | | | | | | | inc rcx
|
||||
| 1* | | | | | | | | | cmp r11, rcx
|
||||
| 0*F | | | | | | | | | jnz 0xfffffffffffffdf3
|
||||
| 1 | | | | | | | 1.0 | | jmp 0xfffffffffffff7a5
|
||||
Total Num Of Uops: 95
|
||||
Analysis Notes:
|
||||
Backend allocation was stalled due to unavailable allocation resources.
|
@ -0,0 +1,132 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: gromacs-avx512-dp.s
|
||||
Architecture: CSX
|
||||
Timestamp: 2023-04-05 00:41:22
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
2814 | | | | | | | | || | | # pointer_increment=64 da67166e5736661e6b03ea29ee7bfd67
|
||||
2815 | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
2816 | | | | | | | | || | | .LBB5_11: #
|
||||
2817 | | | | | | | | || | | # Parent Loop BB5_7 Depth=1
|
||||
2818 | | | | | | | | || | | # => This Inner Loop Header: Depth=2
|
||||
2819 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | movslq (%rdx,%rcx,4), %rax
|
||||
2820 | | 1.00 | | | | 0.00 | | || 1.0 | | leaq (%rax,%rax,2), %rax
|
||||
2821 | 0.00 | | | | | | 1.00 | || 1.0 | | shlq $6, %rax
|
||||
2822 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovapd (%rdi,%rax), %zmm28
|
||||
2823 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovapd 64(%rdi,%rax), %zmm29
|
||||
2824 | | | 0.50 0.50 | 0.50 0.50 | | | | || 0.0 | | vmovapd 128(%rdi,%rax), %zmm30
|
||||
2825 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 64(%rsp), %zmm3 # 64-byte Reload
|
||||
2826 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm28, %zmm3, %zmm14
|
||||
2827 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm26, %zmm12
|
||||
2828 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm30, %zmm25, %zmm31
|
||||
2829 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm28, %zmm23, %zmm3
|
||||
2830 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm31, %zmm31, %zmm15
|
||||
2831 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm12, %zmm12, %zmm15 # zmm15 = (zmm12 * zmm12) + zmm15
|
||||
2832 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm14, %zmm14, %zmm15 # zmm15 = (zmm14 * zmm14) + zmm15
|
||||
2833 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm15, %zmm16
|
||||
2834 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm15, %k1
|
||||
2835 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm16, %zmm15
|
||||
2836 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm16, %zmm18
|
||||
2837 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm15, %zmm18, %zmm15
|
||||
2838 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 512(%rsp), %zmm18 # 64-byte Reload
|
||||
2839 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm18, %zmm18
|
||||
2840 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm1, %zmm16
|
||||
2841 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm15, %zmm16, %zmm16
|
||||
2842 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm2, %zmm15, %zmm15
|
||||
2843 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm15, %zmm16, %zmm15
|
||||
2844 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 448(%rsp), %zmm16 # 64-byte Reload
|
||||
2845 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm16, %zmm16
|
||||
2846 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm14, %zmm15, %zmm19 {%k1} # zmm19 = (zmm15 * zmm14) + zmm19
|
||||
2847 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm16, %zmm14
|
||||
2848 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm18, %zmm18, %zmm14 # zmm14 = (zmm18 * zmm18) + zmm14
|
||||
2849 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm3, %zmm3, %zmm14 # zmm14 = (zmm3 * zmm3) + zmm14
|
||||
2850 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm14, %k2
|
||||
2851 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm14, %zmm14
|
||||
2852 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm12, %zmm15, %zmm11 {%k1} # zmm11 = (zmm15 * zmm12) + zmm11
|
||||
2853 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm31, %zmm15, %zmm7 {%k1} # zmm7 = (zmm15 * zmm31) + zmm7
|
||||
2854 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm14, %zmm12
|
||||
2855 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm14, %zmm14, %zmm15
|
||||
2856 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm12, %zmm15, %zmm12
|
||||
2857 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm28, %zmm24, %zmm15
|
||||
2858 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm14, %zmm1, %zmm14
|
||||
2859 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm12, %zmm14, %zmm14
|
||||
2860 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm2, %zmm12, %zmm12
|
||||
2861 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm12, %zmm14, %zmm12
|
||||
2862 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm22, %zmm14
|
||||
2863 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm3, %zmm12, %zmm17 {%k2} # zmm17 = (zmm12 * zmm3) + zmm17
|
||||
2864 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm27, %zmm3
|
||||
2865 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm18, %zmm12, %zmm9 {%k2} # zmm9 = (zmm12 * zmm18) + zmm9
|
||||
2866 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm3, %zmm18
|
||||
2867 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm14, %zmm14, %zmm18 # zmm18 = (zmm14 * zmm14) + zmm18
|
||||
2868 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm15, %zmm15, %zmm18 # zmm18 = (zmm15 * zmm15) + zmm18
|
||||
2869 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm18, %k1
|
||||
2870 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm18, %zmm18
|
||||
2871 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm12, %zmm5 {%k2} # zmm5 = (zmm12 * zmm16) + zmm5
|
||||
2872 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm18, %zmm12
|
||||
2873 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm18, %zmm16
|
||||
2874 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm12, %zmm16, %zmm12
|
||||
2875 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm1, %zmm16
|
||||
2876 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm12, %zmm16, %zmm16
|
||||
2877 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm2, %zmm12, %zmm12
|
||||
2878 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm12, %zmm16, %zmm12
|
||||
2879 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm15, %zmm12, %zmm21 {%k1} # zmm21 = (zmm12 * zmm15) + zmm21
|
||||
2880 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 576(%rsp), %zmm15 # 64-byte Reload
|
||||
2881 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm28, %zmm15, %zmm15
|
||||
2882 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 128(%rsp), %zmm16 # 64-byte Reload
|
||||
2883 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm16, %zmm16
|
||||
2884 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 384(%rsp), %zmm18 # 64-byte Reload
|
||||
2885 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm18, %zmm18
|
||||
2886 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm14, %zmm12, %zmm10 {%k1} # zmm10 = (zmm12 * zmm14) + zmm10
|
||||
2887 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm18, %zmm14
|
||||
2888 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm16, %zmm14 # zmm14 = (zmm16 * zmm16) + zmm14
|
||||
2889 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm15, %zmm15, %zmm14 # zmm14 = (zmm15 * zmm15) + zmm14
|
||||
2890 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm14, %k2
|
||||
2891 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm14, %zmm14
|
||||
2892 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm3, %zmm12, %zmm6 {%k1} # zmm6 = (zmm12 * zmm3) + zmm6
|
||||
2893 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm14, %zmm3
|
||||
2894 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm14, %zmm14, %zmm12
|
||||
2895 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm12, %zmm3
|
||||
2896 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm14, %zmm1, %zmm12
|
||||
2897 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm12, %zmm12
|
||||
2898 | 0.00 | | | | | 1.00 | | || | | vaddpd %zmm2, %zmm3, %zmm3
|
||||
2899 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm12, %zmm3
|
||||
2900 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm15, %zmm3, %zmm13 {%k2} # zmm13 = (zmm3 * zmm15) + zmm13
|
||||
2901 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm16, %zmm3, %zmm8 {%k2} # zmm8 = (zmm3 * zmm16) + zmm8
|
||||
2902 | 0.00 | | | | | 1.00 | | || | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm4 {%k2} # zmm4 = (zmm3 * zmm18) + zmm4
|
||||
2903 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | incq %rcx
|
||||
2904 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rcx, %r11
|
||||
2905 | | | | | | | | || | | * jne .LBB5_11
|
||||
2906 | 0.00 | | | | | | 1.00 | || | | jmp .LBB5_12
|
||||
2907 | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
38.0 2.50 5.00 5.00 5.00 5.00 38.0 2.50 50.0 4.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
2902 | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm4 {%k2} # zmm4 = (zmm3 * zmm18) + zmm4| [2902]
|
||||
2901 | 4.0 | vfmadd231pd %zmm16, %zmm3, %zmm8 {%k2} # zmm8 = (zmm3 * zmm16) + zmm8| [2901]
|
||||
2900 | 4.0 | vfmadd231pd %zmm15, %zmm3, %zmm13 {%k2} # zmm13 = (zmm3 * zmm15) + zmm13| [2900]
|
||||
2892 | 4.0 | vfmadd231pd %zmm3, %zmm12, %zmm6 {%k1} # zmm6 = (zmm12 * zmm3) + zmm6| [2892]
|
||||
2886 | 4.0 | vfmadd231pd %zmm14, %zmm12, %zmm10 {%k1} # zmm10 = (zmm12 * zmm14) + zmm10| [2886]
|
||||
2879 | 4.0 | vfmadd231pd %zmm15, %zmm12, %zmm21 {%k1} # zmm21 = (zmm12 * zmm15) + zmm21| [2879]
|
||||
2871 | 4.0 | vfmadd231pd %zmm16, %zmm12, %zmm5 {%k2} # zmm5 = (zmm12 * zmm16) + zmm5| [2871]
|
||||
2865 | 4.0 | vfmadd231pd %zmm18, %zmm12, %zmm9 {%k2} # zmm9 = (zmm12 * zmm18) + zmm9| [2865]
|
||||
2863 | 4.0 | vfmadd231pd %zmm3, %zmm12, %zmm17 {%k2} # zmm17 = (zmm12 * zmm3) + zmm17| [2863]
|
||||
2853 | 4.0 | vfmadd231pd %zmm31, %zmm15, %zmm7 {%k1} # zmm7 = (zmm15 * zmm31) + zmm7| [2853]
|
||||
2852 | 4.0 | vfmadd231pd %zmm12, %zmm15, %zmm11 {%k1} # zmm11 = (zmm15 * zmm12) + zmm11| [2852]
|
||||
2846 | 4.0 | vfmadd231pd %zmm14, %zmm15, %zmm19 {%k1} # zmm19 = (zmm15 * zmm14) + zmm19| [2846]
|
||||
2903 | 1.0 | incq %rcx | [2903]
|
||||
|
@ -0,0 +1,132 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: gromacs-avx512-dp.s
|
||||
Architecture: ICX
|
||||
Timestamp: 2023-04-05 00:41:45
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD |
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
2814 | | | | | | | | | | || | | # pointer_increment=64 da67166e5736661e6b03ea29ee7bfd67
|
||||
2815 | | | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
2816 | | | | | | | | | | || | | .LBB5_11: #
|
||||
2817 | | | | | | | | | | || | | # Parent Loop BB5_7 Depth=1
|
||||
2818 | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2
|
||||
2819 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | movslq (%rdx,%rcx,4), %rax
|
||||
2820 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || 6.0 | | leaq (%rax,%rax,2), %rax
|
||||
2821 | 0.00 | | | | | | 1.00 | | | || 1.0 | | shlq $6, %rax
|
||||
2822 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovapd (%rdi,%rax), %zmm28
|
||||
2823 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovapd 64(%rdi,%rax), %zmm29
|
||||
2824 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovapd 128(%rdi,%rax), %zmm30
|
||||
2825 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 64(%rsp), %zmm3 # 64-byte Reload
|
||||
2826 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm28, %zmm3, %zmm14
|
||||
2827 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm26, %zmm12
|
||||
2828 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vsubpd %zmm30, %zmm25, %zmm31
|
||||
2829 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm28, %zmm23, %zmm3
|
||||
2830 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm31, %zmm31, %zmm15
|
||||
2831 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm12, %zmm12, %zmm15 # zmm15 = (zmm12 * zmm12) + zmm15
|
||||
2832 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm14, %zmm14, %zmm15 # zmm15 = (zmm14 * zmm14) + zmm15
|
||||
2833 | 2.50 | | | | | 0.500 | | | | || 6.0 | | vrcp14pd %zmm15, %zmm16
|
||||
2834 | 0.50 | | | | | 0.500 | | | | || | | vcmpltpd %zmm0, %zmm15, %k1
|
||||
2835 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm20, %zmm16, %zmm15
|
||||
2836 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm16, %zmm18
|
||||
2837 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm15, %zmm18, %zmm15
|
||||
2838 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 512(%rsp), %zmm18 # 64-byte Reload
|
||||
2839 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm18, %zmm18
|
||||
2840 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm1, %zmm16
|
||||
2841 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm15, %zmm16, %zmm16
|
||||
2842 | 0.50 | | | | | 0.500 | | | | || | | vaddpd %zmm2, %zmm15, %zmm15
|
||||
2843 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm15, %zmm16, %zmm15
|
||||
2844 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 448(%rsp), %zmm16 # 64-byte Reload
|
||||
2845 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm30, %zmm16, %zmm16
|
||||
2846 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm14, %zmm15, %zmm19 {%k1} # zmm19 = (zmm15 * zmm14) + zmm19
|
||||
2847 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm16, %zmm14
|
||||
2848 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm18, %zmm18, %zmm14 # zmm14 = (zmm18 * zmm18) + zmm14
|
||||
2849 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm3, %zmm3, %zmm14 # zmm14 = (zmm3 * zmm3) + zmm14
|
||||
2850 | 0.50 | | | | | 0.500 | | | | || | | vcmpltpd %zmm0, %zmm14, %k2
|
||||
2851 | 2.50 | | | | | 0.500 | | | | || | | vrcp14pd %zmm14, %zmm14
|
||||
2852 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm12, %zmm15, %zmm11 {%k1} # zmm11 = (zmm15 * zmm12) + zmm11
|
||||
2853 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm31, %zmm15, %zmm7 {%k1} # zmm7 = (zmm15 * zmm31) + zmm7
|
||||
2854 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm20, %zmm14, %zmm12
|
||||
2855 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm14, %zmm14, %zmm15
|
||||
2856 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm12, %zmm15, %zmm12
|
||||
2857 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm28, %zmm24, %zmm15
|
||||
2858 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm14, %zmm1, %zmm14
|
||||
2859 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm12, %zmm14, %zmm14
|
||||
2860 | 0.50 | | | | | 0.500 | | | | || | | vaddpd %zmm2, %zmm12, %zmm12
|
||||
2861 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm12, %zmm14, %zmm12
|
||||
2862 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm22, %zmm14
|
||||
2863 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm3, %zmm12, %zmm17 {%k2} # zmm17 = (zmm12 * zmm3) + zmm17
|
||||
2864 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm30, %zmm27, %zmm3
|
||||
2865 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm18, %zmm12, %zmm9 {%k2} # zmm9 = (zmm12 * zmm18) + zmm9
|
||||
2866 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm3, %zmm3, %zmm18
|
||||
2867 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm14, %zmm14, %zmm18 # zmm18 = (zmm14 * zmm14) + zmm18
|
||||
2868 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm15, %zmm15, %zmm18 # zmm18 = (zmm15 * zmm15) + zmm18
|
||||
2869 | 0.50 | | | | | 0.500 | | | | || | | vcmpltpd %zmm0, %zmm18, %k1
|
||||
2870 | 2.50 | | | | | 0.500 | | | | || | | vrcp14pd %zmm18, %zmm18
|
||||
2871 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm16, %zmm12, %zmm5 {%k2} # zmm5 = (zmm12 * zmm16) + zmm5
|
||||
2872 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm20, %zmm18, %zmm12
|
||||
2873 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm18, %zmm18, %zmm16
|
||||
2874 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm12, %zmm16, %zmm12
|
||||
2875 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm18, %zmm1, %zmm16
|
||||
2876 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm12, %zmm16, %zmm16
|
||||
2877 | 0.50 | | | | | 0.500 | | | | || | | vaddpd %zmm2, %zmm12, %zmm12
|
||||
2878 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm12, %zmm16, %zmm12
|
||||
2879 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm15, %zmm12, %zmm21 {%k1} # zmm21 = (zmm12 * zmm15) + zmm21
|
||||
2880 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 576(%rsp), %zmm15 # 64-byte Reload
|
||||
2881 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm28, %zmm15, %zmm15
|
||||
2882 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 128(%rsp), %zmm16 # 64-byte Reload
|
||||
2883 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm16, %zmm16
|
||||
2884 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 384(%rsp), %zmm18 # 64-byte Reload
|
||||
2885 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm30, %zmm18, %zmm18
|
||||
2886 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm14, %zmm12, %zmm10 {%k1} # zmm10 = (zmm12 * zmm14) + zmm10
|
||||
2887 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm18, %zmm18, %zmm14
|
||||
2888 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm16, %zmm16, %zmm14 # zmm14 = (zmm16 * zmm16) + zmm14
|
||||
2889 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm15, %zmm15, %zmm14 # zmm14 = (zmm15 * zmm15) + zmm14
|
||||
2890 | 0.50 | | | | | 0.500 | | | | || | | vcmpltpd %zmm0, %zmm14, %k2
|
||||
2891 | 2.50 | | | | | 0.500 | | | | || | | vrcp14pd %zmm14, %zmm14
|
||||
2892 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm3, %zmm12, %zmm6 {%k1} # zmm6 = (zmm12 * zmm3) + zmm6
|
||||
2893 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm20, %zmm14, %zmm3
|
||||
2894 | 0.25 | | | | | 0.750 | | | | || | | vmulpd %zmm14, %zmm14, %zmm12
|
||||
2895 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm3, %zmm12, %zmm3
|
||||
2896 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm14, %zmm1, %zmm12
|
||||
2897 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm3, %zmm12, %zmm12
|
||||
2898 | 0.00 | | | | | 1.000 | | | | || | | vaddpd %zmm2, %zmm3, %zmm3
|
||||
2899 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm3, %zmm12, %zmm3
|
||||
2900 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm15, %zmm3, %zmm13 {%k2} # zmm13 = (zmm3 * zmm15) + zmm13
|
||||
2901 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm16, %zmm3, %zmm8 {%k2} # zmm8 = (zmm3 * zmm16) + zmm8
|
||||
2902 | 0.24 | | | | | 0.760 | | | | || | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm4 {%k2} # zmm4 = (zmm3 * zmm18) + zmm4
|
||||
2903 | 0.00 | 0.50 | | | | -0.01 | 0.50 | | | || | | incq %rcx
|
||||
2904 | 0.00 | 0.75 | | | | -0.01 | 0.25 | | | || | | cmpq %rcx, %r11
|
||||
2905 | | | | | | | | | | || | | * jne .LBB5_11
|
||||
2906 | | | | | | | | | | || | | * jmp .LBB5_12
|
||||
2907 | | | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
40.0 2.00 5.00 5.00 5.00 5.00 39.99 2.00 59 4.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
2902 | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm4 {%k2} # zmm4 = (zmm3 * zmm18) + zmm4| [2902]
|
||||
2901 | 4.0 | vfmadd231pd %zmm16, %zmm3, %zmm8 {%k2} # zmm8 = (zmm3 * zmm16) + zmm8| [2901]
|
||||
2900 | 4.0 | vfmadd231pd %zmm15, %zmm3, %zmm13 {%k2} # zmm13 = (zmm3 * zmm15) + zmm13| [2900]
|
||||
2892 | 4.0 | vfmadd231pd %zmm3, %zmm12, %zmm6 {%k1} # zmm6 = (zmm12 * zmm3) + zmm6| [2892]
|
||||
2886 | 4.0 | vfmadd231pd %zmm14, %zmm12, %zmm10 {%k1} # zmm10 = (zmm12 * zmm14) + zmm10| [2886]
|
||||
2879 | 4.0 | vfmadd231pd %zmm15, %zmm12, %zmm21 {%k1} # zmm21 = (zmm12 * zmm15) + zmm21| [2879]
|
||||
2871 | 4.0 | vfmadd231pd %zmm16, %zmm12, %zmm5 {%k2} # zmm5 = (zmm12 * zmm16) + zmm5| [2871]
|
||||
2865 | 4.0 | vfmadd231pd %zmm18, %zmm12, %zmm9 {%k2} # zmm9 = (zmm12 * zmm18) + zmm9| [2865]
|
||||
2863 | 4.0 | vfmadd231pd %zmm3, %zmm12, %zmm17 {%k2} # zmm17 = (zmm12 * zmm3) + zmm17| [2863]
|
||||
2853 | 4.0 | vfmadd231pd %zmm31, %zmm15, %zmm7 {%k1} # zmm7 = (zmm15 * zmm31) + zmm7| [2853]
|
||||
2852 | 4.0 | vfmadd231pd %zmm12, %zmm15, %zmm11 {%k1} # zmm11 = (zmm15 * zmm12) + zmm11| [2852]
|
||||
2846 | 4.0 | vfmadd231pd %zmm14, %zmm15, %zmm19 {%k1} # zmm19 = (zmm15 * zmm14) + zmm19| [2846]
|
||||
2903 | 1.0 | incq %rcx | [2903]
|
||||
|
@ -0,0 +1,79 @@
|
||||
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||
Analyzed File - gromacs-avx512-sp.o
|
||||
Binary Format - 64Bit
|
||||
Architecture - SKX
|
||||
Analysis Type - Throughput
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
Block Throughput: 25.21 Cycles Throughput Bottleneck: Backend
|
||||
Loop Count: 22
|
||||
Port Binding In Cycles Per Iteration:
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Cycles | 21.0 0.0 | 2.0 | 2.0 2.0 | 2.0 2.0 | 0.0 | 21.0 | 2.0 | 0.0 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
|
||||
DV - Divider pipe (on port 0)
|
||||
D - Data fetch pipe (on ports 2 and 3)
|
||||
F - Macro Fusion with the previous instruction occurred
|
||||
* - instruction micro-ops not bound to a port
|
||||
^ - Micro Fusion occurred
|
||||
# - ESP Tracking sync uop was issued
|
||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||
X - instruction not supported, was not accounted in Analysis
|
||||
|
||||
| Num Of | Ports pressure in cycles | |
|
||||
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
-----------------------------------------------------------------------------------------
|
||||
| 1 | | | 1.0 1.0 | | | | | | movsxd rdi, dword ptr [rsi+rdx*4]
|
||||
| 1 | | 1.0 | | | | | | | lea rdi, ptr [rdi+rdi*2]
|
||||
| 1 | | | | | | | 1.0 | | shl rdi, 0x5
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovupd zmm16, zmmword ptr [rcx+rdi*1]
|
||||
| 2 | | | 1.0 1.0 | | | 1.0 | | | vinsertf64x4 zmm17, zmm16, ymmword ptr [rcx+rdi*1], 0x1
|
||||
| 1 | | | | 1.0 1.0 | | | | | vbroadcastf64x4 zmm18, ymmword ptr [rcx+rdi*1+0x40]
|
||||
| 1 | | | | | | 1.0 | | | vshuff64x2 zmm16, zmm16, zmm16, 0xee
|
||||
| 1 | 1.0 | | | | | | | | vsubps zmm19, zmm6, zmm17
|
||||
| 1 | 1.0 | | | | | | | | vsubps zmm20, zmm10, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vsubps zmm21, zmm12, zmm18
|
||||
| 1 | 1.0 | | | | | | | | vsubps zmm17, zmm9, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vsubps zmm18, zmm14, zmm18
|
||||
| 1 | 1.0 | | | | | | | | vsubps zmm16, zmm11, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm22, zmm21, zmm21
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231ps zmm22, zmm20, zmm20
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm22, zmm19, zmm19
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm23, zmm18, zmm18
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm23, zmm16, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231ps zmm23, zmm17, zmm17
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm24, zmm22
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm25, zmm23
|
||||
| 1 | | | | | | 1.0 | | | vcmpps k2, zmm22, zmm0, 0x1
|
||||
| 1 | | | | | | 1.0 | | | vcmpps k1, zmm23, zmm0, 0x1
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm22, zmm24, zmm29
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm23, zmm24, zmm24
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm26, zmm25, zmm29
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm22, zmm23, zmm22
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm23, zmm25, zmm25
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm23, zmm23, zmm26
|
||||
| 1 | | | | | | 1.0 | | | vaddps zmm26, zmm22, zmm2
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm24, zmm1, zmm24
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm22, zmm24, zmm22
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm22, zmm22, zmm26
|
||||
| 1 | | | | | | 1.0 | | | vaddps zmm24, zmm23, zmm2
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm25, zmm1, zmm25
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm23, zmm25, zmm23
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm23, zmm23, zmm24
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm13{k2}, zmm22, zmm19
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231ps zmm8{k2}, zmm22, zmm20
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm5{k2}, zmm22, zmm21
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231ps zmm15{k1}, zmm23, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm7{k1}, zmm23, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231ps zmm4{k1}, zmm23, zmm18
|
||||
| 1 | | 1.0 | | | | | | | inc rdx
|
||||
| 1* | | | | | | | | | cmp r12, rdx
|
||||
| 0*F | | | | | | | | | jnz 0xfffffffffffffef6
|
||||
| 1 | | | | | | | 1.0 | | jmp 0xfffffffffffffb58
|
||||
Total Num Of Uops: 51
|
||||
Analysis Notes:
|
||||
Backend allocation was stalled due to unavailable allocation resources.
|
@ -0,0 +1,84 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: gromacs-avx512-sp.s
|
||||
Architecture: CSX
|
||||
Timestamp: 2023-04-05 00:42:20
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
1615 | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
1616 | | | | | | | | || | | .LBB2_11: #
|
||||
1617 | | | | | | | | || | | # Parent Loop BB2_7 Depth=1
|
||||
1618 | | | | | | | | || | | # => This Inner Loop Header: Depth=2
|
||||
1619 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | movslq (%rsi,%rdx,4), %rdi
|
||||
1620 | | 1.00 | | | | 0.00 | | || 1.0 | | leaq (%rdi,%rdi,2), %rdi
|
||||
1621 | 0.00 | | | | | | 1.00 | || 1.0 | | shlq $5, %rdi
|
||||
1622 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovupd (%rcx,%rdi), %zmm16
|
||||
1623 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || | | vinsertf64x4 $1, (%rcx,%rdi), %zmm16, %zmm17
|
||||
1624 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vbroadcastf64x4 64(%rcx,%rdi), %zmm18 # zmm18 = mem[0,1,2,3,0,1,2,3]
|
||||
1625 | | | | | | 1.00 | | || 3.0 | | vshuff64x2 $238, %zmm16, %zmm16, %zmm16 # zmm16 = zmm16[4,5,6,7,4,5,6,7]
|
||||
1626 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm17, %zmm6, %zmm19
|
||||
1627 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubps %zmm16, %zmm10, %zmm20
|
||||
1628 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm18, %zmm12, %zmm21
|
||||
1629 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm17, %zmm9, %zmm17
|
||||
1630 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm18, %zmm14, %zmm18
|
||||
1631 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm11, %zmm16
|
||||
1632 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm21, %zmm21, %zmm22
|
||||
1633 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231ps %zmm20, %zmm20, %zmm22 # zmm22 = (zmm20 * zmm20) + zmm22
|
||||
1634 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231ps %zmm19, %zmm19, %zmm22 # zmm22 = (zmm19 * zmm19) + zmm22
|
||||
1635 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm18, %zmm18, %zmm23
|
||||
1636 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm16, %zmm16, %zmm23 # zmm23 = (zmm16 * zmm16) + zmm23
|
||||
1637 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm17, %zmm17, %zmm23 # zmm23 = (zmm17 * zmm17) + zmm23
|
||||
1638 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14ps %zmm22, %zmm24
|
||||
1639 | 2.50 | | | | | 0.50 | | || | | vrcp14ps %zmm23, %zmm25
|
||||
1640 | | | | | | | | || | | X vcmpltps %zmm0, %zmm22, %k2
|
||||
1641 | | | | | | | | || | | X vcmpltps %zmm0, %zmm23, %k1
|
||||
1642 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm29, %zmm24, %zmm22
|
||||
1643 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm24, %zmm24, %zmm23
|
||||
1644 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm29, %zmm25, %zmm26
|
||||
1645 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm22, %zmm23, %zmm22
|
||||
1646 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm25, %zmm25, %zmm23
|
||||
1647 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm26, %zmm23, %zmm23
|
||||
1648 | 0.50 | | | | | 0.50 | | || 4.0 | | vaddps %zmm2, %zmm22, %zmm26
|
||||
1649 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm24, %zmm1, %zmm24
|
||||
1650 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm22, %zmm24, %zmm22
|
||||
1651 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm26, %zmm22, %zmm22
|
||||
1652 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm2, %zmm23, %zmm24
|
||||
1653 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm25, %zmm1, %zmm25
|
||||
1654 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm23, %zmm25, %zmm23
|
||||
1655 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm24, %zmm23, %zmm23
|
||||
1656 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231ps %zmm19, %zmm22, %zmm13 {%k2} # zmm13 = (zmm22 * zmm19) + zmm13
|
||||
1657 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm20, %zmm22, %zmm8 {%k2} # zmm8 = (zmm22 * zmm20) + zmm8
|
||||
1658 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm21, %zmm22, %zmm5 {%k2} # zmm5 = (zmm22 * zmm21) + zmm5
|
||||
1659 | 0.00 | | | | | 1.00 | | || | | vfmadd231ps %zmm17, %zmm23, %zmm15 {%k1} # zmm15 = (zmm23 * zmm17) + zmm15
|
||||
1660 | 0.00 | | | | | 1.00 | | || | | vfmadd231ps %zmm16, %zmm23, %zmm7 {%k1} # zmm7 = (zmm23 * zmm16) + zmm7
|
||||
1661 | 0.00 | | | | | 1.00 | | || | 4.0 | vfmadd231ps %zmm18, %zmm23, %zmm4 {%k1} # zmm4 = (zmm23 * zmm18) + zmm4
|
||||
1662 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | incq %rdx
|
||||
1663 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rdx, %r12
|
||||
1664 | | | | | | | | || | | * jne .LBB2_11
|
||||
1665 | 0.00 | | | | | | 1.00 | || | | jmp .LBB2_12
|
||||
1666 | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
20.0 2.50 2.00 2.00 2.00 2.00 20.0 2.50 53.0 4.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
1661 | 4.0 | vfmadd231ps %zmm18, %zmm23, %zmm4 {%k1} # zmm4 = (zmm23 * zmm18) + zmm4| [1661]
|
||||
1660 | 4.0 | vfmadd231ps %zmm16, %zmm23, %zmm7 {%k1} # zmm7 = (zmm23 * zmm16) + zmm7| [1660]
|
||||
1659 | 4.0 | vfmadd231ps %zmm17, %zmm23, %zmm15 {%k1} # zmm15 = (zmm23 * zmm17) + zmm15| [1659]
|
||||
1658 | 4.0 | vfmadd231ps %zmm21, %zmm22, %zmm5 {%k2} # zmm5 = (zmm22 * zmm21) + zmm5| [1658]
|
||||
1657 | 4.0 | vfmadd231ps %zmm20, %zmm22, %zmm8 {%k2} # zmm8 = (zmm22 * zmm20) + zmm8| [1657]
|
||||
1656 | 4.0 | vfmadd231ps %zmm19, %zmm22, %zmm13 {%k2} # zmm13 = (zmm22 * zmm19) + zmm13| [1656]
|
||||
1662 | 1.0 | incq %rdx | [1662]
|
||||
|
@ -0,0 +1,84 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: gromacs-avx512-sp.s
|
||||
Architecture: ICX
|
||||
Timestamp: 2023-04-05 00:42:45
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD |
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
1615 | | | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
1616 | | | | | | | | | | || | | .LBB2_11: #
|
||||
1617 | | | | | | | | | | || | | # Parent Loop BB2_7 Depth=1
|
||||
1618 | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2
|
||||
1619 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | movslq (%rsi,%rdx,4), %rdi
|
||||
1620 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || 6.0 | | leaq (%rdi,%rdi,2), %rdi
|
||||
1621 | 0.00 | | | | | | 1.00 | | | || 1.0 | | shlq $5, %rdi
|
||||
1622 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovupd (%rcx,%rdi), %zmm16
|
||||
1623 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | | | || | | vinsertf64x4 $1, (%rcx,%rdi), %zmm16, %zmm17
|
||||
1624 | | | | | | | | | | || | | X vbroadcastf64x4 64(%rcx,%rdi), %zmm18 # zmm18 = mem[0,1,2,3,0,1,2,3]
|
||||
1625 | | | | | | | | | | || 0.0 | | X vshuff64x2 $238, %zmm16, %zmm16, %zmm16 # zmm16 = zmm16[4,5,6,7,4,5,6,7]
|
||||
1626 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm17, %zmm6, %zmm19
|
||||
1627 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vsubps %zmm16, %zmm10, %zmm20
|
||||
1628 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm18, %zmm12, %zmm21
|
||||
1629 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm17, %zmm9, %zmm17
|
||||
1630 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm18, %zmm14, %zmm18
|
||||
1631 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm16, %zmm11, %zmm16
|
||||
1632 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm21, %zmm21, %zmm22
|
||||
1633 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231ps %zmm20, %zmm20, %zmm22 # zmm22 = (zmm20 * zmm20) + zmm22
|
||||
1634 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231ps %zmm19, %zmm19, %zmm22 # zmm22 = (zmm19 * zmm19) + zmm22
|
||||
1635 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm18, %zmm18, %zmm23
|
||||
1636 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm16, %zmm16, %zmm23 # zmm23 = (zmm16 * zmm16) + zmm23
|
||||
1637 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm17, %zmm17, %zmm23 # zmm23 = (zmm17 * zmm17) + zmm23
|
||||
1638 | 2.50 | | | | | 0.500 | | | | || 6.0 | | vrcp14ps %zmm22, %zmm24
|
||||
1639 | 2.50 | | | | | 0.500 | | | | || | | vrcp14ps %zmm23, %zmm25
|
||||
1640 | | | | | | | | | | || | | X vcmpltps %zmm0, %zmm22, %k2
|
||||
1641 | | | | | | | | | | || | | X vcmpltps %zmm0, %zmm23, %k1
|
||||
1642 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm29, %zmm24, %zmm22
|
||||
1643 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm24, %zmm24, %zmm23
|
||||
1644 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm29, %zmm25, %zmm26
|
||||
1645 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm22, %zmm23, %zmm22
|
||||
1646 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm25, %zmm25, %zmm23
|
||||
1647 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm26, %zmm23, %zmm23
|
||||
1648 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vaddps %zmm2, %zmm22, %zmm26
|
||||
1649 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm24, %zmm1, %zmm24
|
||||
1650 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm22, %zmm24, %zmm22
|
||||
1651 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm26, %zmm22, %zmm22
|
||||
1652 | 0.50 | | | | | 0.500 | | | | || | | vaddps %zmm2, %zmm23, %zmm24
|
||||
1653 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm25, %zmm1, %zmm25
|
||||
1654 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm23, %zmm25, %zmm23
|
||||
1655 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm24, %zmm23, %zmm23
|
||||
1656 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231ps %zmm19, %zmm22, %zmm13 {%k2} # zmm13 = (zmm22 * zmm19) + zmm13
|
||||
1657 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm20, %zmm22, %zmm8 {%k2} # zmm8 = (zmm22 * zmm20) + zmm8
|
||||
1658 | 0.25 | | | | | 0.750 | | | | || | | vfmadd231ps %zmm21, %zmm22, %zmm5 {%k2} # zmm5 = (zmm22 * zmm21) + zmm5
|
||||
1659 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231ps %zmm17, %zmm23, %zmm15 {%k1} # zmm15 = (zmm23 * zmm17) + zmm15
|
||||
1660 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231ps %zmm16, %zmm23, %zmm7 {%k1} # zmm7 = (zmm23 * zmm16) + zmm7
|
||||
1661 | 0.24 | | | | | 0.760 | | | | || | 4.0 | vfmadd231ps %zmm18, %zmm23, %zmm4 {%k1} # zmm4 = (zmm23 * zmm18) + zmm4
|
||||
1662 | 0.00 | 0.50 | | | | -0.01 | 0.50 | | | || | | incq %rdx
|
||||
1663 | 0.00 | 0.75 | | | | -0.01 | 0.25 | | | || | | cmpq %rdx, %r12
|
||||
1664 | | | | | | | | | | || | | * jne .LBB2_11
|
||||
1665 | | | | | | | | | | || | | * jmp .LBB2_12
|
||||
1666 | | | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
19.5 2.00 1.50 1.50 1.50 1.50 19.49 2.00 55.0 4.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
1661 | 4.0 | vfmadd231ps %zmm18, %zmm23, %zmm4 {%k1} # zmm4 = (zmm23 * zmm18) + zmm4| [1661]
|
||||
1660 | 4.0 | vfmadd231ps %zmm16, %zmm23, %zmm7 {%k1} # zmm7 = (zmm23 * zmm16) + zmm7| [1660]
|
||||
1659 | 4.0 | vfmadd231ps %zmm17, %zmm23, %zmm15 {%k1} # zmm15 = (zmm23 * zmm17) + zmm15| [1659]
|
||||
1658 | 4.0 | vfmadd231ps %zmm21, %zmm22, %zmm5 {%k2} # zmm5 = (zmm22 * zmm21) + zmm5| [1658]
|
||||
1657 | 4.0 | vfmadd231ps %zmm20, %zmm22, %zmm8 {%k2} # zmm8 = (zmm22 * zmm20) + zmm8| [1657]
|
||||
1656 | 4.0 | vfmadd231ps %zmm19, %zmm22, %zmm13 {%k2} # zmm13 = (zmm22 * zmm19) + zmm13| [1656]
|
||||
1662 | 1.0 | incq %rdx | [1662]
|
||||
|
Loading…
Reference in New Issue
Block a user