diff --git a/static_analysis/rafael/analyses/gromacs-icx-avx512-dp-iaca.out b/static_analysis/rafael/analyses/gromacs-icx-avx512-dp-iaca.out new file mode 100644 index 0000000..4a73eb8 --- /dev/null +++ b/static_analysis/rafael/analyses/gromacs-icx-avx512-dp-iaca.out @@ -0,0 +1,120 @@ +Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45 +Analyzed File - gromacs-avx512-dp.o +Binary Format - 64Bit +Architecture - SKX +Analysis Type - Throughput + +Throughput Analysis Report +-------------------------- +Block Throughput: 45.95 Cycles Throughput Bottleneck: Backend +Loop Count: 22 +Port Binding In Cycles Per Iteration: +-------------------------------------------------------------------------------------------------- +| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | +-------------------------------------------------------------------------------------------------- +| Cycles | 40.0 0.0 | 2.0 | 5.0 5.0 | 5.0 5.0 | 0.0 | 40.0 | 2.0 | 0.0 | +-------------------------------------------------------------------------------------------------- + +DV - Divider pipe (on port 0) +D - Data fetch pipe (on ports 2 and 3) +F - Macro Fusion with the previous instruction occurred +* - instruction micro-ops not bound to a port +^ - Micro Fusion occurred +# - ESP Tracking sync uop was issued +@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected +X - instruction not supported, was not accounted in Analysis + +| Num Of | Ports pressure in cycles | | +| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | +----------------------------------------------------------------------------------------- +| 1 | | | 1.0 1.0 | | | | | | movsxd rax, dword ptr [rdx+rcx*4] +| 1 | | 1.0 | | | | | | | lea rax, ptr [rax+rax*2] +| 1 | | | | | | | 1.0 | | shl rax, 0x6 +| 1 | | | | 1.0 1.0 | | | | | vmovapd zmm28, zmmword ptr [rdi+rax*1] +| 1 | | | 1.0 1.0 | | | | | | vmovapd zmm29, zmmword ptr [rdi+rax*1+0x40] +| 1 | | | | 1.0 1.0 | | | | | vmovapd zmm30, zmmword ptr [rdi+rax*1+0x80] +| 1 | | | 1.0 1.0 | | | | | | vmovupd zmm3, zmmword ptr [rsp+0x40] +| 1 | | | | | | 1.0 | | | vsubpd zmm14, zmm3, zmm28 +| 1 | 1.0 | | | | | | | | vsubpd zmm12, zmm26, zmm29 +| 1 | | | | | | 1.0 | | | vsubpd zmm31, zmm25, zmm30 +| 1 | 1.0 | | | | | | | | vsubpd zmm3, zmm23, zmm28 +| 1 | | | | | | 1.0 | | | vmulpd zmm15, zmm31, zmm31 +| 1 | 1.0 | | | | | | | | vfmadd231pd zmm15, zmm12, zmm12 +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm15, zmm14, zmm14 +| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm16, zmm15 +| 1 | | | | | | 1.0 | | | vcmppd k1, zmm15, zmm0, 0x1 +| 1 | 1.0 | | | | | | | | vmulpd zmm15, zmm16, zmm20 +| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm16, zmm16 +| 1 | 1.0 | | | | | | | | vmulpd zmm15, zmm18, zmm15 +| 1 | | | | 1.0 1.0 | | | | | vmovupd zmm18, zmmword ptr [rsp+0x200] +| 1 | | | | | | 1.0 | | | vsubpd zmm18, zmm18, zmm29 +| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm1, zmm16 +| 1 | | | | | | 1.0 | | | vmulpd zmm16, zmm16, zmm15 +| 1 | 1.0 | | | | | | | | vaddpd zmm15, zmm15, zmm2 +| 1 | | | | | | 1.0 | | | vmulpd zmm15, zmm16, zmm15 +| 1 | | | 1.0 1.0 | | | | | | vmovupd zmm16, zmmword ptr [rsp+0x1c0] +| 1 | 1.0 | | | | | | | | vsubpd zmm16, zmm16, zmm30 +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm19{k1}, zmm15, zmm14 +| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm16, zmm16 +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm14, zmm18, zmm18 +| 1 | 1.0 | | | | | | | | vfmadd231pd zmm14, zmm3, zmm3 +| 1 | | | | | | 1.0 | | | vcmppd k2, zmm14, zmm0, 0x1 +| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm14, zmm14 +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm11{k1}, zmm15, zmm12 +| 1 | 1.0 | | | | | | | | vfmadd231pd zmm7{k1}, zmm15, zmm31 +| 1 | | | | | | 1.0 | | | vmulpd zmm12, zmm14, zmm20 +| 1 | 1.0 | | | | | | | | vmulpd zmm15, zmm14, zmm14 +| 1 | | | | | | 1.0 | | | vmulpd zmm12, zmm15, zmm12 +| 1 | 1.0 | | | | | | | | vsubpd zmm15, zmm24, zmm28 +| 1 | | | | | | 1.0 | | | vmulpd zmm14, zmm1, zmm14 +| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm14, zmm12 +| 1 | | | | | | 1.0 | | | vaddpd zmm12, zmm12, zmm2 +| 1 | 1.0 | | | | | | | | vmulpd zmm12, zmm14, zmm12 +| 1 | | | | | | 1.0 | | | vsubpd zmm14, zmm22, zmm29 +| 1 | 1.0 | | | | | | | | vfmadd231pd zmm17{k2}, zmm12, zmm3 +| 1 | | | | | | 1.0 | | | vsubpd zmm3, zmm27, zmm30 +| 1 | 1.0 | | | | | | | | vfmadd231pd zmm9{k2}, zmm12, zmm18 +| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm3, zmm3 +| 1 | 1.0 | | | | | | | | vfmadd231pd zmm18, zmm14, zmm14 +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm18, zmm15, zmm15 +| 1 | | | | | | 1.0 | | | vcmppd k1, zmm18, zmm0, 0x1 +| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm18, zmm18 +| 1 | 1.0 | | | | | | | | vfmadd231pd zmm5{k2}, zmm12, zmm16 +| 1 | | | | | | 1.0 | | | vmulpd zmm12, zmm18, zmm20 +| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm18, zmm18 +| 1 | | | | | | 1.0 | | | vmulpd zmm12, zmm16, zmm12 +| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm1, zmm18 +| 1 | | | | | | 1.0 | | | vmulpd zmm16, zmm16, zmm12 +| 1 | 1.0 | | | | | | | | vaddpd zmm12, zmm12, zmm2 +| 1 | | | | | | 1.0 | | | vmulpd zmm12, zmm16, zmm12 +| 1 | 1.0 | | | | | | | | vfmadd231pd zmm21{k1}, zmm12, zmm15 +| 1 | | | | 1.0 1.0 | | | | | vmovupd zmm15, zmmword ptr [rsp+0x240] +| 1 | | | | | | 1.0 | | | vsubpd zmm15, zmm15, zmm28 +| 1 | | | 1.0 1.0 | | | | | | vmovupd zmm16, zmmword ptr [rsp+0x80] +| 1 | 1.0 | | | | | | | | vsubpd zmm16, zmm16, zmm29 +| 1 | | | | 1.0 1.0 | | | | | vmovupd zmm18, zmmword ptr [rsp+0x180] +| 1 | | | | | | 1.0 | | | vsubpd zmm18, zmm18, zmm30 +| 1 | 1.0 | | | | | | | | vfmadd231pd zmm10{k1}, zmm12, zmm14 +| 1 | | | | | | 1.0 | | | vmulpd zmm14, zmm18, zmm18 +| 1 | 1.0 | | | | | | | | vfmadd231pd zmm14, zmm16, zmm16 +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm14, zmm15, zmm15 +| 1 | | | | | | 1.0 | | | vcmppd k2, zmm14, zmm0, 0x1 +| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm14, zmm14 +| 1 | 1.0 | | | | | | | | vfmadd231pd zmm6{k1}, zmm12, zmm3 +| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm14, zmm20 +| 1 | 1.0 | | | | | | | | vmulpd zmm12, zmm14, zmm14 +| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm12, zmm3 +| 1 | 1.0 | | | | | | | | vmulpd zmm12, zmm1, zmm14 +| 1 | | | | | | 1.0 | | | vmulpd zmm12, zmm12, zmm3 +| 1 | 1.0 | | | | | | | | vaddpd zmm3, zmm3, zmm2 +| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm12, zmm3 +| 1 | 1.0 | | | | | | | | vfmadd231pd zmm13{k2}, zmm3, zmm15 +| 1 | | | | | | 1.0 | | | vfmadd231pd zmm8{k2}, zmm3, zmm16 +| 1 | 1.0 | | | | | | | | vfmadd231pd zmm4{k2}, zmm3, zmm18 +| 1 | | 1.0 | | | | | | | inc rcx +| 1* | | | | | | | | | cmp r11, rcx +| 0*F | | | | | | | | | jnz 0xfffffffffffffdf3 +| 1 | | | | | | | 1.0 | | jmp 0xfffffffffffff7a5 +Total Num Of Uops: 95 +Analysis Notes: +Backend allocation was stalled due to unavailable allocation resources. diff --git a/static_analysis/rafael/analyses/gromacs-icx-avx512-dp-osaca-csx.out b/static_analysis/rafael/analyses/gromacs-icx-avx512-dp-osaca-csx.out new file mode 100644 index 0000000..884a5c7 --- /dev/null +++ b/static_analysis/rafael/analyses/gromacs-icx-avx512-dp-osaca-csx.out @@ -0,0 +1,132 @@ +Open Source Architecture Code Analyzer (OSACA) - 0.4.12 +Analyzed file: gromacs-avx512-dp.s +Architecture: CSX +Timestamp: 2023-04-05 00:41:22 + + + P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction + * - Instruction micro-ops not bound to a port + X - No throughput/latency information for this instruction in data file + + +Combined Analysis Report +------------------------ + Port pressure in cycles + | 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD | +-------------------------------------------------------------------------------------------------- +2814 | | | | | | | | || | | # pointer_increment=64 da67166e5736661e6b03ea29ee7bfd67 +2815 | | | | | | | | || | | # LLVM-MCA-BEGIN +2816 | | | | | | | | || | | .LBB5_11: # +2817 | | | | | | | | || | | # Parent Loop BB5_7 Depth=1 +2818 | | | | | | | | || | | # => This Inner Loop Header: Depth=2 +2819 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | movslq (%rdx,%rcx,4), %rax +2820 | | 1.00 | | | | 0.00 | | || 1.0 | | leaq (%rax,%rax,2), %rax +2821 | 0.00 | | | | | | 1.00 | || 1.0 | | shlq $6, %rax +2822 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovapd (%rdi,%rax), %zmm28 +2823 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovapd 64(%rdi,%rax), %zmm29 +2824 | | | 0.50 0.50 | 0.50 0.50 | | | | || 0.0 | | vmovapd 128(%rdi,%rax), %zmm30 +2825 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 64(%rsp), %zmm3 # 64-byte Reload +2826 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm28, %zmm3, %zmm14 +2827 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm26, %zmm12 +2828 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm30, %zmm25, %zmm31 +2829 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm28, %zmm23, %zmm3 +2830 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm31, %zmm31, %zmm15 +2831 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm12, %zmm12, %zmm15 # zmm15 = (zmm12 * zmm12) + zmm15 +2832 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm14, %zmm14, %zmm15 # zmm15 = (zmm14 * zmm14) + zmm15 +2833 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm15, %zmm16 +2834 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm15, %k1 +2835 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm16, %zmm15 +2836 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm16, %zmm18 +2837 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm15, %zmm18, %zmm15 +2838 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 512(%rsp), %zmm18 # 64-byte Reload +2839 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm18, %zmm18 +2840 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm1, %zmm16 +2841 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm15, %zmm16, %zmm16 +2842 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm2, %zmm15, %zmm15 +2843 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm15, %zmm16, %zmm15 +2844 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 448(%rsp), %zmm16 # 64-byte Reload +2845 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm16, %zmm16 +2846 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm14, %zmm15, %zmm19 {%k1} # zmm19 = (zmm15 * zmm14) + zmm19 +2847 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm16, %zmm14 +2848 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm18, %zmm18, %zmm14 # zmm14 = (zmm18 * zmm18) + zmm14 +2849 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm3, %zmm3, %zmm14 # zmm14 = (zmm3 * zmm3) + zmm14 +2850 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm14, %k2 +2851 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm14, %zmm14 +2852 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm12, %zmm15, %zmm11 {%k1} # zmm11 = (zmm15 * zmm12) + zmm11 +2853 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm31, %zmm15, %zmm7 {%k1} # zmm7 = (zmm15 * zmm31) + zmm7 +2854 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm14, %zmm12 +2855 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm14, %zmm14, %zmm15 +2856 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm12, %zmm15, %zmm12 +2857 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm28, %zmm24, %zmm15 +2858 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm14, %zmm1, %zmm14 +2859 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm12, %zmm14, %zmm14 +2860 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm2, %zmm12, %zmm12 +2861 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm12, %zmm14, %zmm12 +2862 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm22, %zmm14 +2863 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm3, %zmm12, %zmm17 {%k2} # zmm17 = (zmm12 * zmm3) + zmm17 +2864 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm27, %zmm3 +2865 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm18, %zmm12, %zmm9 {%k2} # zmm9 = (zmm12 * zmm18) + zmm9 +2866 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm3, %zmm18 +2867 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm14, %zmm14, %zmm18 # zmm18 = (zmm14 * zmm14) + zmm18 +2868 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm15, %zmm15, %zmm18 # zmm18 = (zmm15 * zmm15) + zmm18 +2869 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm18, %k1 +2870 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm18, %zmm18 +2871 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm12, %zmm5 {%k2} # zmm5 = (zmm12 * zmm16) + zmm5 +2872 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm18, %zmm12 +2873 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm18, %zmm16 +2874 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm12, %zmm16, %zmm12 +2875 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm1, %zmm16 +2876 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm12, %zmm16, %zmm16 +2877 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm2, %zmm12, %zmm12 +2878 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm12, %zmm16, %zmm12 +2879 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm15, %zmm12, %zmm21 {%k1} # zmm21 = (zmm12 * zmm15) + zmm21 +2880 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 576(%rsp), %zmm15 # 64-byte Reload +2881 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm28, %zmm15, %zmm15 +2882 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 128(%rsp), %zmm16 # 64-byte Reload +2883 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm16, %zmm16 +2884 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 384(%rsp), %zmm18 # 64-byte Reload +2885 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm18, %zmm18 +2886 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm14, %zmm12, %zmm10 {%k1} # zmm10 = (zmm12 * zmm14) + zmm10 +2887 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm18, %zmm14 +2888 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm16, %zmm14 # zmm14 = (zmm16 * zmm16) + zmm14 +2889 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm15, %zmm15, %zmm14 # zmm14 = (zmm15 * zmm15) + zmm14 +2890 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm14, %k2 +2891 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm14, %zmm14 +2892 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm3, %zmm12, %zmm6 {%k1} # zmm6 = (zmm12 * zmm3) + zmm6 +2893 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm14, %zmm3 +2894 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm14, %zmm14, %zmm12 +2895 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm12, %zmm3 +2896 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm14, %zmm1, %zmm12 +2897 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm12, %zmm12 +2898 | 0.00 | | | | | 1.00 | | || | | vaddpd %zmm2, %zmm3, %zmm3 +2899 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm12, %zmm3 +2900 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm15, %zmm3, %zmm13 {%k2} # zmm13 = (zmm3 * zmm15) + zmm13 +2901 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm16, %zmm3, %zmm8 {%k2} # zmm8 = (zmm3 * zmm16) + zmm8 +2902 | 0.00 | | | | | 1.00 | | || | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm4 {%k2} # zmm4 = (zmm3 * zmm18) + zmm4 +2903 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | incq %rcx +2904 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rcx, %r11 +2905 | | | | | | | | || | | * jne .LBB5_11 +2906 | 0.00 | | | | | | 1.00 | || | | jmp .LBB5_12 +2907 | | | | | | | | || | | # LLVM-MCA-END + + 38.0 2.50 5.00 5.00 5.00 5.00 38.0 2.50 50.0 4.0 + + + + +Loop-Carried Dependencies Analysis Report +----------------------------------------- +2902 | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm4 {%k2} # zmm4 = (zmm3 * zmm18) + zmm4| [2902] +2901 | 4.0 | vfmadd231pd %zmm16, %zmm3, %zmm8 {%k2} # zmm8 = (zmm3 * zmm16) + zmm8| [2901] +2900 | 4.0 | vfmadd231pd %zmm15, %zmm3, %zmm13 {%k2} # zmm13 = (zmm3 * zmm15) + zmm13| [2900] +2892 | 4.0 | vfmadd231pd %zmm3, %zmm12, %zmm6 {%k1} # zmm6 = (zmm12 * zmm3) + zmm6| [2892] +2886 | 4.0 | vfmadd231pd %zmm14, %zmm12, %zmm10 {%k1} # zmm10 = (zmm12 * zmm14) + zmm10| [2886] +2879 | 4.0 | vfmadd231pd %zmm15, %zmm12, %zmm21 {%k1} # zmm21 = (zmm12 * zmm15) + zmm21| [2879] +2871 | 4.0 | vfmadd231pd %zmm16, %zmm12, %zmm5 {%k2} # zmm5 = (zmm12 * zmm16) + zmm5| [2871] +2865 | 4.0 | vfmadd231pd %zmm18, %zmm12, %zmm9 {%k2} # zmm9 = (zmm12 * zmm18) + zmm9| [2865] +2863 | 4.0 | vfmadd231pd %zmm3, %zmm12, %zmm17 {%k2} # zmm17 = (zmm12 * zmm3) + zmm17| [2863] +2853 | 4.0 | vfmadd231pd %zmm31, %zmm15, %zmm7 {%k1} # zmm7 = (zmm15 * zmm31) + zmm7| [2853] +2852 | 4.0 | vfmadd231pd %zmm12, %zmm15, %zmm11 {%k1} # zmm11 = (zmm15 * zmm12) + zmm11| [2852] +2846 | 4.0 | vfmadd231pd %zmm14, %zmm15, %zmm19 {%k1} # zmm19 = (zmm15 * zmm14) + zmm19| [2846] +2903 | 1.0 | incq %rcx | [2903] + diff --git a/static_analysis/rafael/analyses/gromacs-icx-avx512-dp-osaca-icx.out b/static_analysis/rafael/analyses/gromacs-icx-avx512-dp-osaca-icx.out new file mode 100644 index 0000000..72cda89 --- /dev/null +++ b/static_analysis/rafael/analyses/gromacs-icx-avx512-dp-osaca-icx.out @@ -0,0 +1,132 @@ +Open Source Architecture Code Analyzer (OSACA) - 0.4.12 +Analyzed file: gromacs-avx512-dp.s +Architecture: ICX +Timestamp: 2023-04-05 00:41:45 + + + P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction + * - Instruction micro-ops not bound to a port + X - No throughput/latency information for this instruction in data file + + +Combined Analysis Report +------------------------ + Port pressure in cycles + | 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD | +------------------------------------------------------------------------------------------------------------------------ +2814 | | | | | | | | | | || | | # pointer_increment=64 da67166e5736661e6b03ea29ee7bfd67 +2815 | | | | | | | | | | || | | # LLVM-MCA-BEGIN +2816 | | | | | | | | | | || | | .LBB5_11: # +2817 | | | | | | | | | | || | | # Parent Loop BB5_7 Depth=1 +2818 | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2 +2819 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | movslq (%rdx,%rcx,4), %rax +2820 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || 6.0 | | leaq (%rax,%rax,2), %rax +2821 | 0.00 | | | | | | 1.00 | | | || 1.0 | | shlq $6, %rax +2822 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovapd (%rdi,%rax), %zmm28 +2823 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovapd 64(%rdi,%rax), %zmm29 +2824 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovapd 128(%rdi,%rax), %zmm30 +2825 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 64(%rsp), %zmm3 # 64-byte Reload +2826 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm28, %zmm3, %zmm14 +2827 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm26, %zmm12 +2828 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vsubpd %zmm30, %zmm25, %zmm31 +2829 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm28, %zmm23, %zmm3 +2830 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm31, %zmm31, %zmm15 +2831 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm12, %zmm12, %zmm15 # zmm15 = (zmm12 * zmm12) + zmm15 +2832 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm14, %zmm14, %zmm15 # zmm15 = (zmm14 * zmm14) + zmm15 +2833 | 2.50 | | | | | 0.500 | | | | || 6.0 | | vrcp14pd %zmm15, %zmm16 +2834 | 0.50 | | | | | 0.500 | | | | || | | vcmpltpd %zmm0, %zmm15, %k1 +2835 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm20, %zmm16, %zmm15 +2836 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm16, %zmm18 +2837 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm15, %zmm18, %zmm15 +2838 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 512(%rsp), %zmm18 # 64-byte Reload +2839 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm18, %zmm18 +2840 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm1, %zmm16 +2841 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm15, %zmm16, %zmm16 +2842 | 0.50 | | | | | 0.500 | | | | || | | vaddpd %zmm2, %zmm15, %zmm15 +2843 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm15, %zmm16, %zmm15 +2844 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 448(%rsp), %zmm16 # 64-byte Reload +2845 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm30, %zmm16, %zmm16 +2846 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm14, %zmm15, %zmm19 {%k1} # zmm19 = (zmm15 * zmm14) + zmm19 +2847 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm16, %zmm14 +2848 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm18, %zmm18, %zmm14 # zmm14 = (zmm18 * zmm18) + zmm14 +2849 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm3, %zmm3, %zmm14 # zmm14 = (zmm3 * zmm3) + zmm14 +2850 | 0.50 | | | | | 0.500 | | | | || | | vcmpltpd %zmm0, %zmm14, %k2 +2851 | 2.50 | | | | | 0.500 | | | | || | | vrcp14pd %zmm14, %zmm14 +2852 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm12, %zmm15, %zmm11 {%k1} # zmm11 = (zmm15 * zmm12) + zmm11 +2853 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm31, %zmm15, %zmm7 {%k1} # zmm7 = (zmm15 * zmm31) + zmm7 +2854 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm20, %zmm14, %zmm12 +2855 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm14, %zmm14, %zmm15 +2856 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm12, %zmm15, %zmm12 +2857 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm28, %zmm24, %zmm15 +2858 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm14, %zmm1, %zmm14 +2859 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm12, %zmm14, %zmm14 +2860 | 0.50 | | | | | 0.500 | | | | || | | vaddpd %zmm2, %zmm12, %zmm12 +2861 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm12, %zmm14, %zmm12 +2862 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm22, %zmm14 +2863 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm3, %zmm12, %zmm17 {%k2} # zmm17 = (zmm12 * zmm3) + zmm17 +2864 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm30, %zmm27, %zmm3 +2865 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm18, %zmm12, %zmm9 {%k2} # zmm9 = (zmm12 * zmm18) + zmm9 +2866 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm3, %zmm3, %zmm18 +2867 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm14, %zmm14, %zmm18 # zmm18 = (zmm14 * zmm14) + zmm18 +2868 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm15, %zmm15, %zmm18 # zmm18 = (zmm15 * zmm15) + zmm18 +2869 | 0.50 | | | | | 0.500 | | | | || | | vcmpltpd %zmm0, %zmm18, %k1 +2870 | 2.50 | | | | | 0.500 | | | | || | | vrcp14pd %zmm18, %zmm18 +2871 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm16, %zmm12, %zmm5 {%k2} # zmm5 = (zmm12 * zmm16) + zmm5 +2872 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm20, %zmm18, %zmm12 +2873 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm18, %zmm18, %zmm16 +2874 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm12, %zmm16, %zmm12 +2875 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm18, %zmm1, %zmm16 +2876 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm12, %zmm16, %zmm16 +2877 | 0.50 | | | | | 0.500 | | | | || | | vaddpd %zmm2, %zmm12, %zmm12 +2878 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm12, %zmm16, %zmm12 +2879 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm15, %zmm12, %zmm21 {%k1} # zmm21 = (zmm12 * zmm15) + zmm21 +2880 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 576(%rsp), %zmm15 # 64-byte Reload +2881 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm28, %zmm15, %zmm15 +2882 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 128(%rsp), %zmm16 # 64-byte Reload +2883 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm16, %zmm16 +2884 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 384(%rsp), %zmm18 # 64-byte Reload +2885 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm30, %zmm18, %zmm18 +2886 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm14, %zmm12, %zmm10 {%k1} # zmm10 = (zmm12 * zmm14) + zmm10 +2887 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm18, %zmm18, %zmm14 +2888 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm16, %zmm16, %zmm14 # zmm14 = (zmm16 * zmm16) + zmm14 +2889 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm15, %zmm15, %zmm14 # zmm14 = (zmm15 * zmm15) + zmm14 +2890 | 0.50 | | | | | 0.500 | | | | || | | vcmpltpd %zmm0, %zmm14, %k2 +2891 | 2.50 | | | | | 0.500 | | | | || | | vrcp14pd %zmm14, %zmm14 +2892 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm3, %zmm12, %zmm6 {%k1} # zmm6 = (zmm12 * zmm3) + zmm6 +2893 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm20, %zmm14, %zmm3 +2894 | 0.25 | | | | | 0.750 | | | | || | | vmulpd %zmm14, %zmm14, %zmm12 +2895 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm3, %zmm12, %zmm3 +2896 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm14, %zmm1, %zmm12 +2897 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm3, %zmm12, %zmm12 +2898 | 0.00 | | | | | 1.000 | | | | || | | vaddpd %zmm2, %zmm3, %zmm3 +2899 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm3, %zmm12, %zmm3 +2900 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm15, %zmm3, %zmm13 {%k2} # zmm13 = (zmm3 * zmm15) + zmm13 +2901 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm16, %zmm3, %zmm8 {%k2} # zmm8 = (zmm3 * zmm16) + zmm8 +2902 | 0.24 | | | | | 0.760 | | | | || | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm4 {%k2} # zmm4 = (zmm3 * zmm18) + zmm4 +2903 | 0.00 | 0.50 | | | | -0.01 | 0.50 | | | || | | incq %rcx +2904 | 0.00 | 0.75 | | | | -0.01 | 0.25 | | | || | | cmpq %rcx, %r11 +2905 | | | | | | | | | | || | | * jne .LBB5_11 +2906 | | | | | | | | | | || | | * jmp .LBB5_12 +2907 | | | | | | | | | | || | | # LLVM-MCA-END + + 40.0 2.00 5.00 5.00 5.00 5.00 39.99 2.00 59 4.0 + + + + +Loop-Carried Dependencies Analysis Report +----------------------------------------- +2902 | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm4 {%k2} # zmm4 = (zmm3 * zmm18) + zmm4| [2902] +2901 | 4.0 | vfmadd231pd %zmm16, %zmm3, %zmm8 {%k2} # zmm8 = (zmm3 * zmm16) + zmm8| [2901] +2900 | 4.0 | vfmadd231pd %zmm15, %zmm3, %zmm13 {%k2} # zmm13 = (zmm3 * zmm15) + zmm13| [2900] +2892 | 4.0 | vfmadd231pd %zmm3, %zmm12, %zmm6 {%k1} # zmm6 = (zmm12 * zmm3) + zmm6| [2892] +2886 | 4.0 | vfmadd231pd %zmm14, %zmm12, %zmm10 {%k1} # zmm10 = (zmm12 * zmm14) + zmm10| [2886] +2879 | 4.0 | vfmadd231pd %zmm15, %zmm12, %zmm21 {%k1} # zmm21 = (zmm12 * zmm15) + zmm21| [2879] +2871 | 4.0 | vfmadd231pd %zmm16, %zmm12, %zmm5 {%k2} # zmm5 = (zmm12 * zmm16) + zmm5| [2871] +2865 | 4.0 | vfmadd231pd %zmm18, %zmm12, %zmm9 {%k2} # zmm9 = (zmm12 * zmm18) + zmm9| [2865] +2863 | 4.0 | vfmadd231pd %zmm3, %zmm12, %zmm17 {%k2} # zmm17 = (zmm12 * zmm3) + zmm17| [2863] +2853 | 4.0 | vfmadd231pd %zmm31, %zmm15, %zmm7 {%k1} # zmm7 = (zmm15 * zmm31) + zmm7| [2853] +2852 | 4.0 | vfmadd231pd %zmm12, %zmm15, %zmm11 {%k1} # zmm11 = (zmm15 * zmm12) + zmm11| [2852] +2846 | 4.0 | vfmadd231pd %zmm14, %zmm15, %zmm19 {%k1} # zmm19 = (zmm15 * zmm14) + zmm19| [2846] +2903 | 1.0 | incq %rcx | [2903] + diff --git a/static_analysis/rafael/analyses/gromacs-icx-avx512-sp-iaca.out b/static_analysis/rafael/analyses/gromacs-icx-avx512-sp-iaca.out new file mode 100644 index 0000000..4fb518b --- /dev/null +++ b/static_analysis/rafael/analyses/gromacs-icx-avx512-sp-iaca.out @@ -0,0 +1,79 @@ +Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45 +Analyzed File - gromacs-avx512-sp.o +Binary Format - 64Bit +Architecture - SKX +Analysis Type - Throughput + +Throughput Analysis Report +-------------------------- +Block Throughput: 25.21 Cycles Throughput Bottleneck: Backend +Loop Count: 22 +Port Binding In Cycles Per Iteration: +-------------------------------------------------------------------------------------------------- +| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | +-------------------------------------------------------------------------------------------------- +| Cycles | 21.0 0.0 | 2.0 | 2.0 2.0 | 2.0 2.0 | 0.0 | 21.0 | 2.0 | 0.0 | +-------------------------------------------------------------------------------------------------- + +DV - Divider pipe (on port 0) +D - Data fetch pipe (on ports 2 and 3) +F - Macro Fusion with the previous instruction occurred +* - instruction micro-ops not bound to a port +^ - Micro Fusion occurred +# - ESP Tracking sync uop was issued +@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected +X - instruction not supported, was not accounted in Analysis + +| Num Of | Ports pressure in cycles | | +| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | +----------------------------------------------------------------------------------------- +| 1 | | | 1.0 1.0 | | | | | | movsxd rdi, dword ptr [rsi+rdx*4] +| 1 | | 1.0 | | | | | | | lea rdi, ptr [rdi+rdi*2] +| 1 | | | | | | | 1.0 | | shl rdi, 0x5 +| 1 | | | | 1.0 1.0 | | | | | vmovupd zmm16, zmmword ptr [rcx+rdi*1] +| 2 | | | 1.0 1.0 | | | 1.0 | | | vinsertf64x4 zmm17, zmm16, ymmword ptr [rcx+rdi*1], 0x1 +| 1 | | | | 1.0 1.0 | | | | | vbroadcastf64x4 zmm18, ymmword ptr [rcx+rdi*1+0x40] +| 1 | | | | | | 1.0 | | | vshuff64x2 zmm16, zmm16, zmm16, 0xee +| 1 | 1.0 | | | | | | | | vsubps zmm19, zmm6, zmm17 +| 1 | 1.0 | | | | | | | | vsubps zmm20, zmm10, zmm16 +| 1 | | | | | | 1.0 | | | vsubps zmm21, zmm12, zmm18 +| 1 | 1.0 | | | | | | | | vsubps zmm17, zmm9, zmm17 +| 1 | | | | | | 1.0 | | | vsubps zmm18, zmm14, zmm18 +| 1 | 1.0 | | | | | | | | vsubps zmm16, zmm11, zmm16 +| 1 | | | | | | 1.0 | | | vmulps zmm22, zmm21, zmm21 +| 1 | 1.0 | | | | | | | | vfmadd231ps zmm22, zmm20, zmm20 +| 1 | | | | | | 1.0 | | | vfmadd231ps zmm22, zmm19, zmm19 +| 1 | 1.0 | | | | | | | | vmulps zmm23, zmm18, zmm18 +| 1 | | | | | | 1.0 | | | vfmadd231ps zmm23, zmm16, zmm16 +| 1 | 1.0 | | | | | | | | vfmadd231ps zmm23, zmm17, zmm17 +| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm24, zmm22 +| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm25, zmm23 +| 1 | | | | | | 1.0 | | | vcmpps k2, zmm22, zmm0, 0x1 +| 1 | | | | | | 1.0 | | | vcmpps k1, zmm23, zmm0, 0x1 +| 1 | | | | | | 1.0 | | | vmulps zmm22, zmm24, zmm29 +| 1 | 1.0 | | | | | | | | vmulps zmm23, zmm24, zmm24 +| 1 | | | | | | 1.0 | | | vmulps zmm26, zmm25, zmm29 +| 1 | 1.0 | | | | | | | | vmulps zmm22, zmm23, zmm22 +| 1 | | | | | | 1.0 | | | vmulps zmm23, zmm25, zmm25 +| 1 | 1.0 | | | | | | | | vmulps zmm23, zmm23, zmm26 +| 1 | | | | | | 1.0 | | | vaddps zmm26, zmm22, zmm2 +| 1 | 1.0 | | | | | | | | vmulps zmm24, zmm1, zmm24 +| 1 | | | | | | 1.0 | | | vmulps zmm22, zmm24, zmm22 +| 1 | 1.0 | | | | | | | | vmulps zmm22, zmm22, zmm26 +| 1 | | | | | | 1.0 | | | vaddps zmm24, zmm23, zmm2 +| 1 | 1.0 | | | | | | | | vmulps zmm25, zmm1, zmm25 +| 1 | | | | | | 1.0 | | | vmulps zmm23, zmm25, zmm23 +| 1 | 1.0 | | | | | | | | vmulps zmm23, zmm23, zmm24 +| 1 | | | | | | 1.0 | | | vfmadd231ps zmm13{k2}, zmm22, zmm19 +| 1 | 1.0 | | | | | | | | vfmadd231ps zmm8{k2}, zmm22, zmm20 +| 1 | | | | | | 1.0 | | | vfmadd231ps zmm5{k2}, zmm22, zmm21 +| 1 | 1.0 | | | | | | | | vfmadd231ps zmm15{k1}, zmm23, zmm17 +| 1 | | | | | | 1.0 | | | vfmadd231ps zmm7{k1}, zmm23, zmm16 +| 1 | 1.0 | | | | | | | | vfmadd231ps zmm4{k1}, zmm23, zmm18 +| 1 | | 1.0 | | | | | | | inc rdx +| 1* | | | | | | | | | cmp r12, rdx +| 0*F | | | | | | | | | jnz 0xfffffffffffffef6 +| 1 | | | | | | | 1.0 | | jmp 0xfffffffffffffb58 +Total Num Of Uops: 51 +Analysis Notes: +Backend allocation was stalled due to unavailable allocation resources. diff --git a/static_analysis/rafael/analyses/gromacs-icx-avx512-sp-osaca-csx.out b/static_analysis/rafael/analyses/gromacs-icx-avx512-sp-osaca-csx.out new file mode 100644 index 0000000..65ba71e --- /dev/null +++ b/static_analysis/rafael/analyses/gromacs-icx-avx512-sp-osaca-csx.out @@ -0,0 +1,84 @@ +Open Source Architecture Code Analyzer (OSACA) - 0.4.12 +Analyzed file: gromacs-avx512-sp.s +Architecture: CSX +Timestamp: 2023-04-05 00:42:20 + + + P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction + * - Instruction micro-ops not bound to a port + X - No throughput/latency information for this instruction in data file + + +Combined Analysis Report +------------------------ + Port pressure in cycles + | 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD | +-------------------------------------------------------------------------------------------------- +1615 | | | | | | | | || | | # LLVM-MCA-BEGIN +1616 | | | | | | | | || | | .LBB2_11: # +1617 | | | | | | | | || | | # Parent Loop BB2_7 Depth=1 +1618 | | | | | | | | || | | # => This Inner Loop Header: Depth=2 +1619 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | movslq (%rsi,%rdx,4), %rdi +1620 | | 1.00 | | | | 0.00 | | || 1.0 | | leaq (%rdi,%rdi,2), %rdi +1621 | 0.00 | | | | | | 1.00 | || 1.0 | | shlq $5, %rdi +1622 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovupd (%rcx,%rdi), %zmm16 +1623 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || | | vinsertf64x4 $1, (%rcx,%rdi), %zmm16, %zmm17 +1624 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vbroadcastf64x4 64(%rcx,%rdi), %zmm18 # zmm18 = mem[0,1,2,3,0,1,2,3] +1625 | | | | | | 1.00 | | || 3.0 | | vshuff64x2 $238, %zmm16, %zmm16, %zmm16 # zmm16 = zmm16[4,5,6,7,4,5,6,7] +1626 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm17, %zmm6, %zmm19 +1627 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubps %zmm16, %zmm10, %zmm20 +1628 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm18, %zmm12, %zmm21 +1629 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm17, %zmm9, %zmm17 +1630 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm18, %zmm14, %zmm18 +1631 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm11, %zmm16 +1632 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm21, %zmm21, %zmm22 +1633 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231ps %zmm20, %zmm20, %zmm22 # zmm22 = (zmm20 * zmm20) + zmm22 +1634 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231ps %zmm19, %zmm19, %zmm22 # zmm22 = (zmm19 * zmm19) + zmm22 +1635 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm18, %zmm18, %zmm23 +1636 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm16, %zmm16, %zmm23 # zmm23 = (zmm16 * zmm16) + zmm23 +1637 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm17, %zmm17, %zmm23 # zmm23 = (zmm17 * zmm17) + zmm23 +1638 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14ps %zmm22, %zmm24 +1639 | 2.50 | | | | | 0.50 | | || | | vrcp14ps %zmm23, %zmm25 +1640 | | | | | | | | || | | X vcmpltps %zmm0, %zmm22, %k2 +1641 | | | | | | | | || | | X vcmpltps %zmm0, %zmm23, %k1 +1642 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm29, %zmm24, %zmm22 +1643 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm24, %zmm24, %zmm23 +1644 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm29, %zmm25, %zmm26 +1645 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm22, %zmm23, %zmm22 +1646 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm25, %zmm25, %zmm23 +1647 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm26, %zmm23, %zmm23 +1648 | 0.50 | | | | | 0.50 | | || 4.0 | | vaddps %zmm2, %zmm22, %zmm26 +1649 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm24, %zmm1, %zmm24 +1650 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm22, %zmm24, %zmm22 +1651 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm26, %zmm22, %zmm22 +1652 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm2, %zmm23, %zmm24 +1653 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm25, %zmm1, %zmm25 +1654 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm23, %zmm25, %zmm23 +1655 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm24, %zmm23, %zmm23 +1656 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231ps %zmm19, %zmm22, %zmm13 {%k2} # zmm13 = (zmm22 * zmm19) + zmm13 +1657 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm20, %zmm22, %zmm8 {%k2} # zmm8 = (zmm22 * zmm20) + zmm8 +1658 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm21, %zmm22, %zmm5 {%k2} # zmm5 = (zmm22 * zmm21) + zmm5 +1659 | 0.00 | | | | | 1.00 | | || | | vfmadd231ps %zmm17, %zmm23, %zmm15 {%k1} # zmm15 = (zmm23 * zmm17) + zmm15 +1660 | 0.00 | | | | | 1.00 | | || | | vfmadd231ps %zmm16, %zmm23, %zmm7 {%k1} # zmm7 = (zmm23 * zmm16) + zmm7 +1661 | 0.00 | | | | | 1.00 | | || | 4.0 | vfmadd231ps %zmm18, %zmm23, %zmm4 {%k1} # zmm4 = (zmm23 * zmm18) + zmm4 +1662 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | incq %rdx +1663 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rdx, %r12 +1664 | | | | | | | | || | | * jne .LBB2_11 +1665 | 0.00 | | | | | | 1.00 | || | | jmp .LBB2_12 +1666 | | | | | | | | || | | # LLVM-MCA-END + + 20.0 2.50 2.00 2.00 2.00 2.00 20.0 2.50 53.0 4.0 + + + + +Loop-Carried Dependencies Analysis Report +----------------------------------------- +1661 | 4.0 | vfmadd231ps %zmm18, %zmm23, %zmm4 {%k1} # zmm4 = (zmm23 * zmm18) + zmm4| [1661] +1660 | 4.0 | vfmadd231ps %zmm16, %zmm23, %zmm7 {%k1} # zmm7 = (zmm23 * zmm16) + zmm7| [1660] +1659 | 4.0 | vfmadd231ps %zmm17, %zmm23, %zmm15 {%k1} # zmm15 = (zmm23 * zmm17) + zmm15| [1659] +1658 | 4.0 | vfmadd231ps %zmm21, %zmm22, %zmm5 {%k2} # zmm5 = (zmm22 * zmm21) + zmm5| [1658] +1657 | 4.0 | vfmadd231ps %zmm20, %zmm22, %zmm8 {%k2} # zmm8 = (zmm22 * zmm20) + zmm8| [1657] +1656 | 4.0 | vfmadd231ps %zmm19, %zmm22, %zmm13 {%k2} # zmm13 = (zmm22 * zmm19) + zmm13| [1656] +1662 | 1.0 | incq %rdx | [1662] + diff --git a/static_analysis/rafael/analyses/gromacs-icx-avx512-sp-osaca-icx.out b/static_analysis/rafael/analyses/gromacs-icx-avx512-sp-osaca-icx.out new file mode 100644 index 0000000..62242d8 --- /dev/null +++ b/static_analysis/rafael/analyses/gromacs-icx-avx512-sp-osaca-icx.out @@ -0,0 +1,84 @@ +Open Source Architecture Code Analyzer (OSACA) - 0.4.12 +Analyzed file: gromacs-avx512-sp.s +Architecture: ICX +Timestamp: 2023-04-05 00:42:45 + + + P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction + * - Instruction micro-ops not bound to a port + X - No throughput/latency information for this instruction in data file + + +Combined Analysis Report +------------------------ + Port pressure in cycles + | 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD | +------------------------------------------------------------------------------------------------------------------------ +1615 | | | | | | | | | | || | | # LLVM-MCA-BEGIN +1616 | | | | | | | | | | || | | .LBB2_11: # +1617 | | | | | | | | | | || | | # Parent Loop BB2_7 Depth=1 +1618 | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2 +1619 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | movslq (%rsi,%rdx,4), %rdi +1620 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || 6.0 | | leaq (%rdi,%rdi,2), %rdi +1621 | 0.00 | | | | | | 1.00 | | | || 1.0 | | shlq $5, %rdi +1622 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovupd (%rcx,%rdi), %zmm16 +1623 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | | | || | | vinsertf64x4 $1, (%rcx,%rdi), %zmm16, %zmm17 +1624 | | | | | | | | | | || | | X vbroadcastf64x4 64(%rcx,%rdi), %zmm18 # zmm18 = mem[0,1,2,3,0,1,2,3] +1625 | | | | | | | | | | || 0.0 | | X vshuff64x2 $238, %zmm16, %zmm16, %zmm16 # zmm16 = zmm16[4,5,6,7,4,5,6,7] +1626 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm17, %zmm6, %zmm19 +1627 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vsubps %zmm16, %zmm10, %zmm20 +1628 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm18, %zmm12, %zmm21 +1629 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm17, %zmm9, %zmm17 +1630 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm18, %zmm14, %zmm18 +1631 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm16, %zmm11, %zmm16 +1632 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm21, %zmm21, %zmm22 +1633 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231ps %zmm20, %zmm20, %zmm22 # zmm22 = (zmm20 * zmm20) + zmm22 +1634 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231ps %zmm19, %zmm19, %zmm22 # zmm22 = (zmm19 * zmm19) + zmm22 +1635 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm18, %zmm18, %zmm23 +1636 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm16, %zmm16, %zmm23 # zmm23 = (zmm16 * zmm16) + zmm23 +1637 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm17, %zmm17, %zmm23 # zmm23 = (zmm17 * zmm17) + zmm23 +1638 | 2.50 | | | | | 0.500 | | | | || 6.0 | | vrcp14ps %zmm22, %zmm24 +1639 | 2.50 | | | | | 0.500 | | | | || | | vrcp14ps %zmm23, %zmm25 +1640 | | | | | | | | | | || | | X vcmpltps %zmm0, %zmm22, %k2 +1641 | | | | | | | | | | || | | X vcmpltps %zmm0, %zmm23, %k1 +1642 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm29, %zmm24, %zmm22 +1643 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm24, %zmm24, %zmm23 +1644 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm29, %zmm25, %zmm26 +1645 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm22, %zmm23, %zmm22 +1646 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm25, %zmm25, %zmm23 +1647 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm26, %zmm23, %zmm23 +1648 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vaddps %zmm2, %zmm22, %zmm26 +1649 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm24, %zmm1, %zmm24 +1650 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm22, %zmm24, %zmm22 +1651 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm26, %zmm22, %zmm22 +1652 | 0.50 | | | | | 0.500 | | | | || | | vaddps %zmm2, %zmm23, %zmm24 +1653 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm25, %zmm1, %zmm25 +1654 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm23, %zmm25, %zmm23 +1655 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm24, %zmm23, %zmm23 +1656 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231ps %zmm19, %zmm22, %zmm13 {%k2} # zmm13 = (zmm22 * zmm19) + zmm13 +1657 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm20, %zmm22, %zmm8 {%k2} # zmm8 = (zmm22 * zmm20) + zmm8 +1658 | 0.25 | | | | | 0.750 | | | | || | | vfmadd231ps %zmm21, %zmm22, %zmm5 {%k2} # zmm5 = (zmm22 * zmm21) + zmm5 +1659 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231ps %zmm17, %zmm23, %zmm15 {%k1} # zmm15 = (zmm23 * zmm17) + zmm15 +1660 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231ps %zmm16, %zmm23, %zmm7 {%k1} # zmm7 = (zmm23 * zmm16) + zmm7 +1661 | 0.24 | | | | | 0.760 | | | | || | 4.0 | vfmadd231ps %zmm18, %zmm23, %zmm4 {%k1} # zmm4 = (zmm23 * zmm18) + zmm4 +1662 | 0.00 | 0.50 | | | | -0.01 | 0.50 | | | || | | incq %rdx +1663 | 0.00 | 0.75 | | | | -0.01 | 0.25 | | | || | | cmpq %rdx, %r12 +1664 | | | | | | | | | | || | | * jne .LBB2_11 +1665 | | | | | | | | | | || | | * jmp .LBB2_12 +1666 | | | | | | | | | | || | | # LLVM-MCA-END + + 19.5 2.00 1.50 1.50 1.50 1.50 19.49 2.00 55.0 4.0 + + + + +Loop-Carried Dependencies Analysis Report +----------------------------------------- +1661 | 4.0 | vfmadd231ps %zmm18, %zmm23, %zmm4 {%k1} # zmm4 = (zmm23 * zmm18) + zmm4| [1661] +1660 | 4.0 | vfmadd231ps %zmm16, %zmm23, %zmm7 {%k1} # zmm7 = (zmm23 * zmm16) + zmm7| [1660] +1659 | 4.0 | vfmadd231ps %zmm17, %zmm23, %zmm15 {%k1} # zmm15 = (zmm23 * zmm17) + zmm15| [1659] +1658 | 4.0 | vfmadd231ps %zmm21, %zmm22, %zmm5 {%k2} # zmm5 = (zmm22 * zmm21) + zmm5| [1658] +1657 | 4.0 | vfmadd231ps %zmm20, %zmm22, %zmm8 {%k2} # zmm8 = (zmm22 * zmm20) + zmm8| [1657] +1656 | 4.0 | vfmadd231ps %zmm19, %zmm22, %zmm13 {%k2} # zmm13 = (zmm22 * zmm19) + zmm13| [1656] +1662 | 1.0 | incq %rdx | [1662] +