Open Source Architecture Code Analyzer (OSACA) - 0.4.12 Analyzed file: gromacs-avx512-sp.s Architecture: CSX Timestamp: 2023-04-05 00:42:20 P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction * - Instruction micro-ops not bound to a port X - No throughput/latency information for this instruction in data file Combined Analysis Report ------------------------ Port pressure in cycles | 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD | -------------------------------------------------------------------------------------------------- 1615 | | | | | | | | || | | # LLVM-MCA-BEGIN 1616 | | | | | | | | || | | .LBB2_11: # 1617 | | | | | | | | || | | # Parent Loop BB2_7 Depth=1 1618 | | | | | | | | || | | # => This Inner Loop Header: Depth=2 1619 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | movslq (%rsi,%rdx,4), %rdi 1620 | | 1.00 | | | | 0.00 | | || 1.0 | | leaq (%rdi,%rdi,2), %rdi 1621 | 0.00 | | | | | | 1.00 | || 1.0 | | shlq $5, %rdi 1622 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovupd (%rcx,%rdi), %zmm16 1623 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || | | vinsertf64x4 $1, (%rcx,%rdi), %zmm16, %zmm17 1624 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vbroadcastf64x4 64(%rcx,%rdi), %zmm18 # zmm18 = mem[0,1,2,3,0,1,2,3] 1625 | | | | | | 1.00 | | || 3.0 | | vshuff64x2 $238, %zmm16, %zmm16, %zmm16 # zmm16 = zmm16[4,5,6,7,4,5,6,7] 1626 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm17, %zmm6, %zmm19 1627 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubps %zmm16, %zmm10, %zmm20 1628 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm18, %zmm12, %zmm21 1629 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm17, %zmm9, %zmm17 1630 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm18, %zmm14, %zmm18 1631 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm11, %zmm16 1632 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm21, %zmm21, %zmm22 1633 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231ps %zmm20, %zmm20, %zmm22 # zmm22 = (zmm20 * zmm20) + zmm22 1634 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231ps %zmm19, %zmm19, %zmm22 # zmm22 = (zmm19 * zmm19) + zmm22 1635 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm18, %zmm18, %zmm23 1636 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm16, %zmm16, %zmm23 # zmm23 = (zmm16 * zmm16) + zmm23 1637 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm17, %zmm17, %zmm23 # zmm23 = (zmm17 * zmm17) + zmm23 1638 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14ps %zmm22, %zmm24 1639 | 2.50 | | | | | 0.50 | | || | | vrcp14ps %zmm23, %zmm25 1640 | | | | | | | | || | | X vcmpltps %zmm0, %zmm22, %k2 1641 | | | | | | | | || | | X vcmpltps %zmm0, %zmm23, %k1 1642 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm29, %zmm24, %zmm22 1643 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm24, %zmm24, %zmm23 1644 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm29, %zmm25, %zmm26 1645 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm22, %zmm23, %zmm22 1646 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm25, %zmm25, %zmm23 1647 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm26, %zmm23, %zmm23 1648 | 0.50 | | | | | 0.50 | | || 4.0 | | vaddps %zmm2, %zmm22, %zmm26 1649 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm24, %zmm1, %zmm24 1650 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm22, %zmm24, %zmm22 1651 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm26, %zmm22, %zmm22 1652 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm2, %zmm23, %zmm24 1653 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm25, %zmm1, %zmm25 1654 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm23, %zmm25, %zmm23 1655 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm24, %zmm23, %zmm23 1656 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231ps %zmm19, %zmm22, %zmm13 {%k2} # zmm13 = (zmm22 * zmm19) + zmm13 1657 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm20, %zmm22, %zmm8 {%k2} # zmm8 = (zmm22 * zmm20) + zmm8 1658 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm21, %zmm22, %zmm5 {%k2} # zmm5 = (zmm22 * zmm21) + zmm5 1659 | 0.00 | | | | | 1.00 | | || | | vfmadd231ps %zmm17, %zmm23, %zmm15 {%k1} # zmm15 = (zmm23 * zmm17) + zmm15 1660 | 0.00 | | | | | 1.00 | | || | | vfmadd231ps %zmm16, %zmm23, %zmm7 {%k1} # zmm7 = (zmm23 * zmm16) + zmm7 1661 | 0.00 | | | | | 1.00 | | || | 4.0 | vfmadd231ps %zmm18, %zmm23, %zmm4 {%k1} # zmm4 = (zmm23 * zmm18) + zmm4 1662 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | incq %rdx 1663 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rdx, %r12 1664 | | | | | | | | || | | * jne .LBB2_11 1665 | 0.00 | | | | | | 1.00 | || | | jmp .LBB2_12 1666 | | | | | | | | || | | # LLVM-MCA-END 20.0 2.50 2.00 2.00 2.00 2.00 20.0 2.50 53.0 4.0 Loop-Carried Dependencies Analysis Report ----------------------------------------- 1661 | 4.0 | vfmadd231ps %zmm18, %zmm23, %zmm4 {%k1} # zmm4 = (zmm23 * zmm18) + zmm4| [1661] 1660 | 4.0 | vfmadd231ps %zmm16, %zmm23, %zmm7 {%k1} # zmm7 = (zmm23 * zmm16) + zmm7| [1660] 1659 | 4.0 | vfmadd231ps %zmm17, %zmm23, %zmm15 {%k1} # zmm15 = (zmm23 * zmm17) + zmm15| [1659] 1658 | 4.0 | vfmadd231ps %zmm21, %zmm22, %zmm5 {%k2} # zmm5 = (zmm22 * zmm21) + zmm5| [1658] 1657 | 4.0 | vfmadd231ps %zmm20, %zmm22, %zmm8 {%k2} # zmm8 = (zmm22 * zmm20) + zmm8| [1657] 1656 | 4.0 | vfmadd231ps %zmm19, %zmm22, %zmm13 {%k2} # zmm13 = (zmm22 * zmm19) + zmm13| [1656] 1662 | 1.0 | incq %rdx | [1662]