Open Source Architecture Code Analyzer (OSACA) - 0.4.12 Analyzed file: gromacs-avx512-dp.s Architecture: CSX Timestamp: 2023-04-05 00:41:22 P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction * - Instruction micro-ops not bound to a port X - No throughput/latency information for this instruction in data file Combined Analysis Report ------------------------ Port pressure in cycles | 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD | -------------------------------------------------------------------------------------------------- 2814 | | | | | | | | || | | # pointer_increment=64 da67166e5736661e6b03ea29ee7bfd67 2815 | | | | | | | | || | | # LLVM-MCA-BEGIN 2816 | | | | | | | | || | | .LBB5_11: # 2817 | | | | | | | | || | | # Parent Loop BB5_7 Depth=1 2818 | | | | | | | | || | | # => This Inner Loop Header: Depth=2 2819 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | movslq (%rdx,%rcx,4), %rax 2820 | | 1.00 | | | | 0.00 | | || 1.0 | | leaq (%rax,%rax,2), %rax 2821 | 0.00 | | | | | | 1.00 | || 1.0 | | shlq $6, %rax 2822 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovapd (%rdi,%rax), %zmm28 2823 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovapd 64(%rdi,%rax), %zmm29 2824 | | | 0.50 0.50 | 0.50 0.50 | | | | || 0.0 | | vmovapd 128(%rdi,%rax), %zmm30 2825 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 64(%rsp), %zmm3 # 64-byte Reload 2826 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm28, %zmm3, %zmm14 2827 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm26, %zmm12 2828 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm30, %zmm25, %zmm31 2829 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm28, %zmm23, %zmm3 2830 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm31, %zmm31, %zmm15 2831 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm12, %zmm12, %zmm15 # zmm15 = (zmm12 * zmm12) + zmm15 2832 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm14, %zmm14, %zmm15 # zmm15 = (zmm14 * zmm14) + zmm15 2833 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm15, %zmm16 2834 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm15, %k1 2835 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm16, %zmm15 2836 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm16, %zmm18 2837 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm15, %zmm18, %zmm15 2838 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 512(%rsp), %zmm18 # 64-byte Reload 2839 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm18, %zmm18 2840 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm1, %zmm16 2841 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm15, %zmm16, %zmm16 2842 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm2, %zmm15, %zmm15 2843 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm15, %zmm16, %zmm15 2844 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 448(%rsp), %zmm16 # 64-byte Reload 2845 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm16, %zmm16 2846 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm14, %zmm15, %zmm19 {%k1} # zmm19 = (zmm15 * zmm14) + zmm19 2847 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm16, %zmm14 2848 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm18, %zmm18, %zmm14 # zmm14 = (zmm18 * zmm18) + zmm14 2849 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm3, %zmm3, %zmm14 # zmm14 = (zmm3 * zmm3) + zmm14 2850 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm14, %k2 2851 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm14, %zmm14 2852 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm12, %zmm15, %zmm11 {%k1} # zmm11 = (zmm15 * zmm12) + zmm11 2853 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm31, %zmm15, %zmm7 {%k1} # zmm7 = (zmm15 * zmm31) + zmm7 2854 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm14, %zmm12 2855 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm14, %zmm14, %zmm15 2856 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm12, %zmm15, %zmm12 2857 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm28, %zmm24, %zmm15 2858 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm14, %zmm1, %zmm14 2859 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm12, %zmm14, %zmm14 2860 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm2, %zmm12, %zmm12 2861 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm12, %zmm14, %zmm12 2862 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm22, %zmm14 2863 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm3, %zmm12, %zmm17 {%k2} # zmm17 = (zmm12 * zmm3) + zmm17 2864 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm27, %zmm3 2865 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm18, %zmm12, %zmm9 {%k2} # zmm9 = (zmm12 * zmm18) + zmm9 2866 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm3, %zmm18 2867 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm14, %zmm14, %zmm18 # zmm18 = (zmm14 * zmm14) + zmm18 2868 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm15, %zmm15, %zmm18 # zmm18 = (zmm15 * zmm15) + zmm18 2869 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm18, %k1 2870 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm18, %zmm18 2871 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm12, %zmm5 {%k2} # zmm5 = (zmm12 * zmm16) + zmm5 2872 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm18, %zmm12 2873 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm18, %zmm16 2874 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm12, %zmm16, %zmm12 2875 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm1, %zmm16 2876 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm12, %zmm16, %zmm16 2877 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm2, %zmm12, %zmm12 2878 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm12, %zmm16, %zmm12 2879 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm15, %zmm12, %zmm21 {%k1} # zmm21 = (zmm12 * zmm15) + zmm21 2880 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 576(%rsp), %zmm15 # 64-byte Reload 2881 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm28, %zmm15, %zmm15 2882 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 128(%rsp), %zmm16 # 64-byte Reload 2883 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm16, %zmm16 2884 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 384(%rsp), %zmm18 # 64-byte Reload 2885 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm18, %zmm18 2886 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm14, %zmm12, %zmm10 {%k1} # zmm10 = (zmm12 * zmm14) + zmm10 2887 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm18, %zmm14 2888 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm16, %zmm14 # zmm14 = (zmm16 * zmm16) + zmm14 2889 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm15, %zmm15, %zmm14 # zmm14 = (zmm15 * zmm15) + zmm14 2890 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm14, %k2 2891 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm14, %zmm14 2892 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm3, %zmm12, %zmm6 {%k1} # zmm6 = (zmm12 * zmm3) + zmm6 2893 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm14, %zmm3 2894 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm14, %zmm14, %zmm12 2895 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm12, %zmm3 2896 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm14, %zmm1, %zmm12 2897 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm12, %zmm12 2898 | 0.00 | | | | | 1.00 | | || | | vaddpd %zmm2, %zmm3, %zmm3 2899 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm12, %zmm3 2900 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm15, %zmm3, %zmm13 {%k2} # zmm13 = (zmm3 * zmm15) + zmm13 2901 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm16, %zmm3, %zmm8 {%k2} # zmm8 = (zmm3 * zmm16) + zmm8 2902 | 0.00 | | | | | 1.00 | | || | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm4 {%k2} # zmm4 = (zmm3 * zmm18) + zmm4 2903 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | incq %rcx 2904 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rcx, %r11 2905 | | | | | | | | || | | * jne .LBB5_11 2906 | 0.00 | | | | | | 1.00 | || | | jmp .LBB5_12 2907 | | | | | | | | || | | # LLVM-MCA-END 38.0 2.50 5.00 5.00 5.00 5.00 38.0 2.50 50.0 4.0 Loop-Carried Dependencies Analysis Report ----------------------------------------- 2902 | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm4 {%k2} # zmm4 = (zmm3 * zmm18) + zmm4| [2902] 2901 | 4.0 | vfmadd231pd %zmm16, %zmm3, %zmm8 {%k2} # zmm8 = (zmm3 * zmm16) + zmm8| [2901] 2900 | 4.0 | vfmadd231pd %zmm15, %zmm3, %zmm13 {%k2} # zmm13 = (zmm3 * zmm15) + zmm13| [2900] 2892 | 4.0 | vfmadd231pd %zmm3, %zmm12, %zmm6 {%k1} # zmm6 = (zmm12 * zmm3) + zmm6| [2892] 2886 | 4.0 | vfmadd231pd %zmm14, %zmm12, %zmm10 {%k1} # zmm10 = (zmm12 * zmm14) + zmm10| [2886] 2879 | 4.0 | vfmadd231pd %zmm15, %zmm12, %zmm21 {%k1} # zmm21 = (zmm12 * zmm15) + zmm21| [2879] 2871 | 4.0 | vfmadd231pd %zmm16, %zmm12, %zmm5 {%k2} # zmm5 = (zmm12 * zmm16) + zmm5| [2871] 2865 | 4.0 | vfmadd231pd %zmm18, %zmm12, %zmm9 {%k2} # zmm9 = (zmm12 * zmm18) + zmm9| [2865] 2863 | 4.0 | vfmadd231pd %zmm3, %zmm12, %zmm17 {%k2} # zmm17 = (zmm12 * zmm3) + zmm17| [2863] 2853 | 4.0 | vfmadd231pd %zmm31, %zmm15, %zmm7 {%k1} # zmm7 = (zmm15 * zmm31) + zmm7| [2853] 2852 | 4.0 | vfmadd231pd %zmm12, %zmm15, %zmm11 {%k1} # zmm11 = (zmm15 * zmm12) + zmm11| [2852] 2846 | 4.0 | vfmadd231pd %zmm14, %zmm15, %zmm19 {%k1} # zmm19 = (zmm15 * zmm14) + zmm19| [2846] 2903 | 1.0 | incq %rcx | [2903]