Open Source Architecture Code Analyzer (OSACA) - 0.4.12 Analyzed file: gromacs-icx-avx512-sp.s Architecture: ICX Timestamp: 2023-02-14 12:51:43 P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction * - Instruction micro-ops not bound to a port X - No throughput/latency information for this instruction in data file Combined Analysis Report ------------------------ Port pressure in cycles | 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD | ------------------------------------------------------------------------------------------------------------------------ 1338 | | | | | | | | | | || | | # pointer_increment=64 0f91ac4f7fe1a70d0c899f7f3e745649 1339 | | | | | | | | | | || | | # LLVM-MCA-BEGIN 1340 | | | | | | | | | | || | | .LBB2_12: # Parent Loop BB2_7 Depth=1 1341 | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2 1342 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | movslq (%r11,%rax,4), %rcx 1343 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || 6.0 | | leaq (%rcx,%rcx,2), %rdx 1344 | 0.00 | | | | | | 1.00 | | | || 1.0 | | shlq $5, %rdx 1345 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd (%rsi,%rdx), %zmm16 1346 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vbroadcastf64x4 64(%rsi,%rdx), %zmm20 # zmm20 = mem[0,1,2,3,0,1,2,3] 1347 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vbroadcastf64x4 (%rsi,%rdx), %zmm19 # zmm19 = mem[0,1,2,3,0,1,2,3] 1348 | | | | | | 1.000 | | | | || | | vshuff64x2 $238, %zmm16, %zmm16, %zmm21 # zmm21 = zmm16[4,5,6,7,4,5,6,7] 1349 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm19, %zmm6, %zmm18 1350 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm21, %zmm10, %zmm17 1351 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vsubps %zmm20, %zmm14, %zmm16 1352 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm16, %zmm16, %zmm22 1353 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231ps %zmm17, %zmm17, %zmm22 # zmm22 = (zmm17 * zmm17) + zmm22 1354 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231ps %zmm18, %zmm18, %zmm22 # zmm22 = (zmm18 * zmm18) + zmm22 1355 | 2.50 | | | | | 0.500 | | | | || 6.0 | | vrcp14ps %zmm22, %zmm23 1356 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm23, %zmm26, %zmm24 1357 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm24, %zmm23, %zmm24 1358 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm24, %zmm23, %zmm24 1359 | 0.75 | | | | | 0.250 | | | | || 4.0 | | vaddps %zmm1, %zmm24, %zmm25 1360 | 1.00 | | | | | 0.000 | | | | || | | vmulps %zmm23, %zmm27, %zmm23 1361 | 1.00 | | | | | 0.000 | | | | || 4.0 | | vmulps %zmm25, %zmm23, %zmm23 1362 | 1.00 | | | | | 0.000 | | | | || 4.0 | | vmulps %zmm23, %zmm24, %zmm23 1363 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | leal (%rcx,%rcx), %edx 1364 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | xorl %edi, %edi 1365 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | xorl %ebp, %ebp 1366 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | cmpq %rdx, %r12 1367 | 0.00 | | | | | | 1.00 | | | || | | setne %dil 1368 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | leal 1(%rcx,%rcx), %ecx 1369 | 0.00 | | | | | | 1.00 | | | || | | sete %bpl 1370 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | xorl %edx, %edx 1371 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | xorl %ebx, %ebx 1372 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | cmpq %rcx, %r12 1373 | 0.00 | | | | | | 1.00 | | | || | | sete %dl 1374 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | movl $0, %ecx 1375 | 0.00 | | | | | | 1.00 | | | || | | setne %bl 1376 | 0.00 | | | | | | 1.00 | | | || | | cmovel %r8d, %ecx 1377 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | movl %ebx, %r14d 1378 | 0.00 | | | | | | 1.00 | | | || | | shll $4, %r14d 1379 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | subl %ebp, %r14d 1380 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | leal (%rcx,%rdi,2), %ecx 1381 | 0.00 | | | | | | 1.00 | | | || | | shll $8, %ecx 1382 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | addl $239, %r14d 1383 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | addl $-768, %ecx # imm = 0xFD00 1384 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | orl %r14d, %ecx 1385 | 1.00 | | | | | | | | | || | | kmovd %ecx, %k2 1386 | 0.50 | | | | | 0.500 | | | | || | | vcmpltps %zmm0, %zmm22, %k2 {%k2} 1387 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm21, %zmm11, %zmm21 1388 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm20, %zmm15, %zmm20 1389 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm19, %zmm7, %zmm19 1390 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm2, %zmm23, %zmm22 1391 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231ps %zmm22, %zmm18, %zmm12 {%k2} # zmm12 {%k2} = (zmm18 * zmm22) + zmm12 1392 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm20, %zmm20, %zmm18 1393 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm21, %zmm21, %zmm18 # zmm18 = (zmm21 * zmm21) + zmm18 1394 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm19, %zmm19, %zmm18 # zmm18 = (zmm19 * zmm19) + zmm18 1395 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm22, %zmm17, %zmm9 {%k2} # zmm9 {%k2} = (zmm17 * zmm22) + zmm9 1396 | 2.50 | | | | | 0.500 | | | | || | | vrcp14ps %zmm18, %zmm17 1397 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm22, %zmm16, %zmm5 {%k2} # zmm5 {%k2} = (zmm16 * zmm22) + zmm5 1398 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm17, %zmm26, %zmm16 1399 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm16, %zmm17, %zmm16 1400 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm16, %zmm17, %zmm16 1401 | 0.00 | | | | | 1.000 | | | | || | | vaddps %zmm1, %zmm16, %zmm22 1402 | 0.00 | | | | | 1.000 | | | | || | | vmulps %zmm17, %zmm27, %zmm17 1403 | 0.00 | | | | | 1.000 | | | | || | | vmulps %zmm22, %zmm17, %zmm17 1404 | 0.00 | | | | | 1.000 | | | | || | | vmulps %zmm17, %zmm16, %zmm16 1405 | 0.00 | | | | | | 1.00 | | | || | | shll $6, %ebx 1406 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | leal (%rbx,%rdi,4), %ecx 1407 | 0.00 | | | | | | 1.00 | | | || | | shll $7, %edx 1408 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | leal (%rdx,%rdi,8), %edx 1409 | 0.00 | | | | | | 1.00 | | | || | | shll $8, %edx 1410 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | addl %edx, %ecx 1411 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | addl $-2117, %ecx # imm = 0xF7BB 1412 | 1.00 | | | | | | | | | || | | kmovd %ecx, %k2 1413 | 0.00 | | | | | 1.000 | | | | || | | vcmpltps %zmm0, %zmm18, %k2 {%k2} 1414 | 0.00 | | | | | 1.000 | | | | || | | vmulps %zmm2, %zmm16, %zmm16 1415 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231ps %zmm16, %zmm19, %zmm13 {%k2} # zmm13 {%k2} = (zmm19 * zmm16) + zmm13 1416 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231ps %zmm16, %zmm21, %zmm8 {%k2} # zmm8 {%k2} = (zmm21 * zmm16) + zmm8 1417 | 0.24 | | | | | 0.760 | | | | || | 4.0 | vfmadd231ps %zmm16, %zmm20, %zmm4 {%k2} # zmm4 {%k2} = (zmm20 * zmm16) + zmm4 1418 | 0.00 | 1.00 | | | | -0.01 | 0.00 | | | || | | incq %rax 1419 | 0.00 | 1.00 | | | | -0.01 | 0.00 | | | || | | cmpq %rax, %r10 1420 | | | | | | | | | | || | | * jne .LBB2_12 1421 | | | | | | | | | | || | | # LLVM-MCA-END 22.5 16.5 2.00 2.00 2.00 2.00 22.49 16.5 71 4.0 Loop-Carried Dependencies Analysis Report ----------------------------------------- 1417 | 4.0 | vfmadd231ps %zmm16, %zmm20, %zmm4 {%k2} # zmm4 {%k2} = (zmm20 * zmm16) + zmm4| [1417] 1416 | 4.0 | vfmadd231ps %zmm16, %zmm21, %zmm8 {%k2} # zmm8 {%k2} = (zmm21 * zmm16) + zmm8| [1416] 1415 | 4.0 | vfmadd231ps %zmm16, %zmm19, %zmm13 {%k2} # zmm13 {%k2} = (zmm19 * zmm16) + zmm13| [1415] 1397 | 4.0 | vfmadd231ps %zmm22, %zmm16, %zmm5 {%k2} # zmm5 {%k2} = (zmm16 * zmm22) + zmm5| [1397] 1395 | 4.0 | vfmadd231ps %zmm22, %zmm17, %zmm9 {%k2} # zmm9 {%k2} = (zmm17 * zmm22) + zmm9| [1395] 1391 | 4.0 | vfmadd231ps %zmm22, %zmm18, %zmm12 {%k2} # zmm12 {%k2} = (zmm18 * zmm22) + zmm12| [1391] 1418 | 1.0 | incq %rax | [1418]