diff --git a/arch_analysis/iaca_force_aos_geq1200.txt b/arch_analysis/iaca_force_aos_geq1200.txt deleted file mode 100644 index a10a778..0000000 --- a/arch_analysis/iaca_force_aos_geq1200.txt +++ /dev/null @@ -1,79 +0,0 @@ -iwia021h@testfront1:~/MD-Bench/asm$ iaca -arch SKX force_aos_geq1200_markers.o -Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45 -Analyzed File - force_aos_geq1200_markers.o -Binary Format - 64Bit -Architecture - SKX -Analysis Type - Throughput - -Throughput Analysis Report --------------------------- -Block Throughput: 33.05 Cycles Throughput Bottleneck: Backend -Loop Count: 22 -Port Binding In Cycles Per Iteration: --------------------------------------------------------------------------------------------------- -| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | --------------------------------------------------------------------------------------------------- -| Cycles | 20.0 0.0 | 4.5 | 13.0 13.0 | 13.0 13.0 | 0.0 | 18.0 | 4.5 | 0.0 | --------------------------------------------------------------------------------------------------- - -DV - Divider pipe (on port 0) -D - Data fetch pipe (on ports 2 and 3) -F - Macro Fusion with the previous instruction occurred -* - instruction micro-ops not bound to a port -^ - Micro Fusion occurred -# - ESP Tracking sync uop was issued -@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected -X - instruction not supported, was not accounted in Analysis - -| Num Of | Ports pressure in cycles | | -| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | ------------------------------------------------------------------------------------------ -| 1 | | | | | | 1.0 | | | vpcmpgtd k3, ymm2, ymm3 -| 2 | | 1.0 | 1.0 1.0 | | | | | | vmovdqu32 ymm17{k3}{z}, ymmword ptr [r15+r13*4] -| 1 | 1.0 | | | | | | | | kmovw r9d, k3 -| 1 | | 1.0 | | | | | | | vpaddd ymm18, ymm17, ymm17 -| 1 | | 1.0 | | | | | | | vpaddd ymm17, ymm17, ymm18 -| 1 | 1.0 | | | | | | | | kmovw k1, k3 -| 1 | 1.0 | | | | | | | | kmovw k2, k3 -| 1* | | | | | | | | | vpxord zmm18, zmm18, zmm18 -| 1* | | | | | | | | | vpxord zmm19, zmm19, zmm19 -| 1* | | | | | | | | | vpxord zmm20, zmm20, zmm20 -| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm18, k1, zmmword ptr [rdi+ymm17*8+0x10] -| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm19, k2, zmmword ptr [rdi+ymm17*8+0x8] -| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm20, k3, zmmword ptr [rdi+ymm17*8] -| 1 | | 0.5 | | | | | 0.5 | | add r13, 0x8 -| 1 | | 1.0 | | | | | | | vpaddd ymm3, ymm3, ymm16 -| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm29, zmm4, zmm18 -| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm27, zmm0, zmm19 -| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm26, zmm1, zmm20 -| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm25, zmm27, zmm27 -| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm25, zmm26, zmm26 -| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm25, zmm29, zmm29 -| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm24, zmm25 -| 1 | | | | | | 1.0 | | | vcmppd k2, zmm25, zmm14, 0x1 -| 1 | | | | | | 1.0 | | | vfpclasspd k0, zmm24, 0x1e -| 1 | 1.0 | | | | | | | | kmovw edx, k2 -| 1 | 1.0 | | | | | | | | knotw k1, k0 -| 1* | | | | | | | | | vmovaps zmm17, zmm25 -| 1 | | | | | | | 1.0 | | and r9d, edx -| 2^ | | | | 1.0 1.0 | | 1.0 | | | vfnmadd213pd zmm17, zmm24, qword ptr [rip]{1to8} -| 1 | | | | | | 1.0 | | | kmovw k3, r9d -| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm17, zmm17 -| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm24{k1}, zmm17, zmm24 -| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm24{k1}, zmm18, zmm24 -| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm19, zmm24, zmm13 -| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm21, zmm24, zmm10 -| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm22, zmm24, zmm19 -| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm20, zmm24, zmm22 -| 1 | 0.5 | | | | | 0.5 | | | vfmsub213pd zmm24, zmm22, zmm5 -| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm23, zmm20, zmm21 -| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm28, zmm23, zmm24 -| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm9{k3}, zmm28, zmm26 -| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm8{k3}, zmm28, zmm27 -| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm11{k3}, zmm28, zmm29 -| 1* | | | | | | | | | cmp r13, rbx -| 0*F | | | | | | | | | jb 0xfffffffffffffef7 -Total Num Of Uops: 60 -Analysis Notes: -Backend allocation was stalled due to unavailable allocation resources. -There were bubbles in the frontend. diff --git a/arch_analysis/iaca_force_aos_lt1200.txt b/arch_analysis/iaca_force_aos_lt1200.txt deleted file mode 100644 index 451061a..0000000 --- a/arch_analysis/iaca_force_aos_lt1200.txt +++ /dev/null @@ -1,104 +0,0 @@ -iwia021h@testfront1:~/MD-Bench/ICC$ iaca -arch SKX force.o -Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45 -Analyzed File - force.o -Binary Format - 64Bit -Architecture - SKX -Analysis Type - Throughput - -Throughput Analysis Report --------------------------- -Block Throughput: 36.70 Cycles Throughput Bottleneck: Backend -Loop Count: 23 -Port Binding In Cycles Per Iteration: --------------------------------------------------------------------------------------------------- -| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | --------------------------------------------------------------------------------------------------- -| Cycles | 17.5 0.0 | 11.0 | 20.5 17.0 | 20.5 17.0 | 7.0 | 20.5 | 7.0 | 0.0 | --------------------------------------------------------------------------------------------------- - -DV - Divider pipe (on port 0) -D - Data fetch pipe (on ports 2 and 3) -F - Macro Fusion with the previous instruction occurred -* - instruction micro-ops not bound to a port -^ - Micro Fusion occurred -# - ESP Tracking sync uop was issued -@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected -X - instruction not supported, was not accounted in Analysis - -| Num Of | Ports pressure in cycles | | -| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | ------------------------------------------------------------------------------------------ -| 1* | | | | | | | | | mov r13, r8 -| 1 | | 1.0 | | | | | | | imul r13, rcx -| 1 | | | | | | 1.0 | | | vbroadcastsd zmm2, xmm6 -| 1 | | | | | | 1.0 | | | vbroadcastsd zmm1, xmm7 -| 1 | | | | | | 1.0 | | | vbroadcastsd zmm0, xmm12 -| 1 | | | | | | | 1.0 | | movsxd rbx, r12d -| 1 | | | | | | | 1.0 | | add r13, r10 -| 2^ | | | 0.5 | 0.5 | 1.0 | | | | mov qword ptr [rsp-0x40], rax -| 2^ | | | 0.5 | 0.5 | 1.0 | | | | mov qword ptr [rsp-0x38], r8 -| 2^ | | | 0.5 | 0.5 | 1.0 | | | | mov qword ptr [rsp-0x30], r10 -| 2^ | | | 0.5 | 0.5 | 1.0 | | | | mov qword ptr [rsp-0x28], rsi -| 2^ | | | 0.5 | 0.5 | 1.0 | | | | mov qword ptr [rsp-0x20], rcx -| 2^ | | | 0.5 | 0.5 | 1.0 | | | | mov qword ptr [rsp-0x50], r9 -| 2^ | | | 0.5 | 0.5 | 1.0 | | | | mov qword ptr [rsp-0x48], rdx -| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovdqu ymm3, ymmword ptr [r13+rbx*4] -| 1 | | 1.0 | | | | | | | vpaddd ymm4, ymm3, ymm3 -| 1 | | 1.0 | | | | | | | vpaddd ymm3, ymm3, ymm4 -| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov r10d, dword ptr [r13+rbx*4] -| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov r9d, dword ptr [r13+rbx*4+0x4] -| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov r8d, dword ptr [r13+rbx*4+0x8] -| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov esi, dword ptr [r13+rbx*4+0xc] -| 1 | | 1.0 | | | | | | | lea r10d, ptr [r10+r10*2] -| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov ecx, dword ptr [r13+rbx*4+0x10] -| 1 | | 1.0 | | | | | | | lea r9d, ptr [r9+r9*2] -| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov edx, dword ptr [r13+rbx*4+0x14] -| 1 | | 1.0 | | | | | | | lea r8d, ptr [r8+r8*2] -| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov eax, dword ptr [r13+rbx*4+0x18] -| 1 | | 1.0 | | | | | | | lea esi, ptr [rsi+rsi*2] -| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov r15d, dword ptr [r13+rbx*4+0x1c] -| 1 | | 1.0 | | | | | | | lea ecx, ptr [rcx+rcx*2] -| 1 | | 1.0 | | | | | | | lea edx, ptr [rdx+rdx*2] -| 1 | | 1.0 | | | | | | | lea eax, ptr [rax+rax*2] -| 1 | | 1.0 | | | | | | | lea r15d, ptr [r15+r15*2] -| 1 | | | | | | 1.0 | | | vpcmpeqb k1, xmm0, xmm0 -| 1 | | | | | | 1.0 | | | vpcmpeqb k2, xmm0, xmm0 -| 1 | | | | | | 1.0 | | | vpcmpeqb k3, xmm0, xmm0 -| 1* | | | | | | | | | vpxord zmm4, zmm4, zmm4 -| 1* | | | | | | | | | vpxord zmm17, zmm17, zmm17 -| 1* | | | | | | | | | vpxord zmm18, zmm18, zmm18 -| 5^ | 2.0 | | 4.0 4.0 | 4.0 4.0 | | | 1.0 | | vgatherdpd zmm4, k1, zmmword ptr [rdi+ymm3*8+0x10] -| 5^ | 1.5 | | 4.0 4.0 | 4.0 4.0 | | 0.5 | 1.0 | | vgatherdpd zmm17, k2, zmmword ptr [rdi+ymm3*8+0x8] -| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm18, k3, zmmword ptr [rdi+ymm3*8] -| 1 | | | | | | | 1.0 | | add r12d, 0x8 -| 1 | | | | | | | 1.0 | | add rbx, 0x8 -| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm26, zmm0, zmm4 -| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm24, zmm1, zmm17 -| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm23, zmm2, zmm18 -| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm3, zmm24, zmm24 -| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm3, zmm23, zmm23 -| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm3, zmm26, zmm26 -| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm22, zmm3 -| 1 | | | | | | 1.0 | | | vcmppd k2, zmm3, zmm14, 0x1 -| 1 | | | | | | 1.0 | | | vfpclasspd k0, zmm22, 0x1e -| 2^ | 1.0 | | 0.5 0.5 | 0.5 0.5 | | | | | vfnmadd213pd zmm3, zmm22, qword ptr [rip]{1to8} -| 1 | 1.0 | | | | | | | | knotw k1, k0 -| 1 | | | | | | 1.0 | | | vmulpd zmm4, zmm3, zmm3 -| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm22{k1}, zmm3, zmm22 -| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm22{k1}, zmm4, zmm22 -| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm17, zmm22, zmm13 -| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm19, zmm22, zmm10 -| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm20, zmm22, zmm17 -| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm18, zmm22, zmm20 -| 1 | 0.5 | | | | | 0.5 | | | vfmsub213pd zmm22, zmm20, zmm5 -| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm21, zmm18, zmm19 -| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm25, zmm21, zmm22 -| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm9{k2}, zmm25, zmm23 -| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm8{k2}, zmm25, zmm24 -| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm11{k2}, zmm25, zmm26 -| 1* | | | | | | | | | cmp r12d, r14d -| 0*F | | | | | | | | | jb 0xfffffffffffffed3 -Total Num Of Uops: 91 -Analysis Notes: -Backend allocation was stalled due to unavailable allocation resources. -There were bubbles in the frontend. diff --git a/arch_analysis/iaca_force_aos_lt8.txt b/arch_analysis/iaca_force_aos_lt8.txt deleted file mode 100644 index 973db7c..0000000 --- a/arch_analysis/iaca_force_aos_lt8.txt +++ /dev/null @@ -1,82 +0,0 @@ -Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45 -Analyzed File - force_aos_lt8_markers.o -Binary Format - 64Bit -Architecture - SKX -Analysis Type - Throughput - -Throughput Analysis Report --------------------------- -Block Throughput: 69.79 Cycles Throughput Bottleneck: Backend -Loop Count: 22 -Port Binding In Cycles Per Iteration: --------------------------------------------------------------------------------------------------- -| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | --------------------------------------------------------------------------------------------------- -| Cycles | 21.0 0.0 | 5.5 | 13.0 13.0 | 13.0 13.0 | 0.0 | 21.0 | 5.5 | 0.0 | --------------------------------------------------------------------------------------------------- - -DV - Divider pipe (on port 0) -D - Data fetch pipe (on ports 2 and 3) -F - Macro Fusion with the previous instruction occurred -* - instruction micro-ops not bound to a port -^ - Micro Fusion occurred -# - ESP Tracking sync uop was issued -@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected -X - instruction not supported, was not accounted in Analysis - -| Num Of | Ports pressure in cycles | | -| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | ------------------------------------------------------------------------------------------ -| 1 | | 1.0 | | | | | | | imul rcx, r8 -| 1 | | | | | | 1.0 | | | vbroadcastsd zmm4, xmm6 -| 1 | | | | | | | 1.0 | | sub r11d, r14d -| 1 | | 0.5 | | | | | 0.5 | | add rcx, r10 -| 1 | | | | | | 1.0 | | | vpbroadcastd ymm0, r11d -| 1 | | | | | | 1.0 | | | vpcmpgtd k3, ymm0, ymm15 -| 1 | | 0.5 | | | | | 0.5 | | movsxd r14, r14d -| 1 | 1.0 | | | | | | | | kmovw ebx, k3 -| 2 | | 1.0 | 1.0 1.0 | | | | | | vmovdqu32 ymm1{k3}{z}, ymmword ptr [rcx+r14*4] -| 1 | | 1.0 | | | | | | | vpaddd ymm2, ymm1, ymm1 -| 1 | | 1.0 | | | | | | | vpaddd ymm0, ymm1, ymm2 -| 1 | 1.0 | | | | | | | | kmovw k1, k3 -| 1 | 1.0 | | | | | | | | kmovw k2, k3 -| 1* | | | | | | | | | vpxord zmm1, zmm1, zmm1 -| 1* | | | | | | | | | vpxord zmm2, zmm2, zmm2 -| 1* | | | | | | | | | vpxord zmm3, zmm3, zmm3 -| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm1, k1, zmmword ptr [rdi+ymm0*8+0x10] -| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm2, k2, zmmword ptr [rdi+ymm0*8+0x8] -| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm3, k3, zmmword ptr [rdi+ymm0*8] -| 1 | | | | | | 1.0 | | | vbroadcastsd zmm7, xmm7 -| 1 | | | | | | 1.0 | | | vbroadcastsd zmm12, xmm12 -| 1 | 1.0 | | | | | | | | vsubpd zmm23, zmm12, zmm1 -| 1 | 1.0 | | | | | | | | vsubpd zmm21, zmm7, zmm2 -| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm20, zmm4, zmm3 -| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm19, zmm21, zmm21 -| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm19, zmm20, zmm20 -| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm19, zmm23, zmm23 -| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm18, zmm19 -| 1 | | | | | | 1.0 | | | vcmppd k2, zmm19, zmm14, 0x1 -| 1 | | | | | | 1.0 | | | vfpclasspd k0, zmm18, 0x1e -| 1 | 1.0 | | | | | | | | kmovw ecx, k2 -| 1 | 1.0 | | | | | | | | knotw k1, k0 -| 1* | | | | | | | | | vmovaps zmm0, zmm19 -| 1 | | 0.5 | | | | | 0.5 | | and ebx, ecx -| 2^ | | | | 1.0 1.0 | | 1.0 | | | vfnmadd213pd zmm0, zmm18, qword ptr [rip]{1to8} -| 1 | | | | | | 1.0 | | | kmovw k3, ebx -| 1 | 1.0 | | | | | | | | vmulpd zmm1, zmm0, zmm0 -| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm18{k1}, zmm0, zmm18 -| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm18{k1}, zmm1, zmm18 -| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm2, zmm18, zmm13 -| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm4, zmm18, zmm10 -| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm6, zmm18, zmm2 -| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm3, zmm18, zmm6 -| 1 | 0.5 | | | | | 0.5 | | | vfmsub213pd zmm18, zmm6, zmm5 -| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm17, zmm3, zmm4 -| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm22, zmm17, zmm18 -| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm9{k3}, zmm22, zmm20 -| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm8{k3}, zmm22, zmm21 -| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm11{k3}, zmm22, zmm23 -Total Num Of Uops: 65 -Analysis Notes: -Backend allocation was stalled due to unavailable allocation resources. -There were bubbles in the frontend. diff --git a/arch_analysis/iaca_force_soa_geq1200.txt b/arch_analysis/iaca_force_soa_geq1200.txt deleted file mode 100644 index 1deb2bc..0000000 --- a/arch_analysis/iaca_force_soa_geq1200.txt +++ /dev/null @@ -1,74 +0,0 @@ -iwia021h@testfront1:~/MD-Bench/asm$ iaca -arch SKX force_soa_geq1200_markers.o -Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45 -Analyzed File - force_soa_geq1200_markers.o -Binary Format - 64Bit -Architecture - SKX -Analysis Type - Throughput - -Throughput Analysis Report --------------------------- -Block Throughput: 31.47 Cycles Throughput Bottleneck: Backend -Loop Count: 22 -Port Binding In Cycles Per Iteration: --------------------------------------------------------------------------------------------------- -| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | --------------------------------------------------------------------------------------------------- -| Cycles | 18.0 0.0 | 3.0 | 13.0 13.0 | 13.0 13.0 | 0.0 | 18.0 | 3.0 | 0.0 | --------------------------------------------------------------------------------------------------- - -DV - Divider pipe (on port 0) -D - Data fetch pipe (on ports 2 and 3) -F - Macro Fusion with the previous instruction occurred -* - instruction micro-ops not bound to a port -^ - Micro Fusion occurred -# - ESP Tracking sync uop was issued -@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected -X - instruction not supported, was not accounted in Analysis - -| Num Of | Ports pressure in cycles | | -| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | ------------------------------------------------------------------------------------------ -| 1 | | | | | | 1.0 | | | vpcmpgtd k5, ymm3, ymm4 -| 1 | | 1.0 | | | | | | | vpaddd ymm4, ymm4, ymm18 -| 2 | | 1.0 | 1.0 1.0 | | | | | | vmovdqu32 ymm20{k5}{z}, ymmword ptr [rcx+r15*4] -| 1* | | | | | | | | | vmovaps zmm22, zmm19 -| 1 | | 1.0 | | | | | | | add r15, 0x8 -| 1 | 1.0 | | | | | | | | kmovw k2, k5 -| 1* | | | | | | | | | vmovaps zmm21, zmm19 -| 1 | 1.0 | | | | | | | | kmovw k1, k5 -| 1* | | | | | | | | | vmovaps zmm23, zmm19 -| 1 | 1.0 | | | | | | | | kmovw k3, k5 -| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm23, k3, zmmword ptr [rsi+ymm20*8] -| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm22, k2, zmmword ptr [rax+ymm20*8] -| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm21, k1, zmmword ptr [rdx+ymm20*8] -| 1 | | | | | | 1.0 | | | vsubpd zmm0, zmm5, zmm22 -| 1 | | | | | | 1.0 | | | vsubpd zmm1, zmm2, zmm21 -| 1 | | | | | | 1.0 | | | vsubpd zmm21, zmm6, zmm23 -| 1 | 1.0 | | | | | | | | vmulpd zmm20, zmm0, zmm0 -| 1 | | | | | | 1.0 | | | vfmadd231pd zmm20, zmm1, zmm1 -| 1 | 1.0 | | | | | | | | vfmadd231pd zmm20, zmm21, zmm21 -| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm31, zmm20 -| 1 | | | | | | 1.0 | | | vcmppd k6{k5}, zmm20, zmm16, 0x1 -| 1 | | | | | | 1.0 | | | vfpclasspd k0, zmm31, 0x1e -| 1* | | | | | | | | | vmovaps zmm24, zmm20 -| 2^ | 1.0 | | | 1.0 1.0 | | | | | vfnmadd213pd zmm24, zmm31, qword ptr [rip]{1to8} -| 1 | 1.0 | | | | | | | | knotw k4, k0 -| 1 | | | | | | 1.0 | | | vmulpd zmm25, zmm24, zmm24 -| 1 | | | | | | 1.0 | | | vfmadd213pd zmm31{k4}, zmm24, zmm31 -| 1 | 1.0 | | | | | | | | vfmadd213pd zmm31{k4}, zmm25, zmm31 -| 1 | | | | | | 1.0 | | | vmulpd zmm26, zmm31, zmm15 -| 1 | 1.0 | | | | | | | | vmulpd zmm28, zmm31, zmm14 -| 1 | | | | | | 1.0 | | | vmulpd zmm29, zmm31, zmm26 -| 1 | 1.0 | | | | | | | | vmulpd zmm27, zmm31, zmm29 -| 1 | | | | | | 1.0 | | | vfmsub213pd zmm31, zmm29, zmm7 -| 1 | 1.0 | | | | | | | | vmulpd zmm30, zmm27, zmm28 -| 1 | | | | | | 1.0 | | | vmulpd zmm24, zmm30, zmm31 -| 1 | 1.0 | | | | | | | | vfmadd231pd zmm13{k6}, zmm24, zmm1 -| 1 | | | | | | 1.0 | | | vfmadd231pd zmm12{k6}, zmm24, zmm0 -| 1 | 1.0 | | | | | | | | vfmadd231pd zmm11{k6}, zmm24, zmm21 -| 1* | | | | | | | | | cmp r15, r14 -| 0*F | | | | | | | | | jb 0xffffffffffffff19 -Total Num Of Uops: 55 -Analysis Notes: -Backend allocation was stalled due to unavailable allocation resources. -There were bubbles in the frontend. diff --git a/arch_analysis/iaca_force_soa_lt1200.txt b/arch_analysis/iaca_force_soa_lt1200.txt deleted file mode 100644 index 38f7014..0000000 --- a/arch_analysis/iaca_force_soa_lt1200.txt +++ /dev/null @@ -1,72 +0,0 @@ -iwia021h@testfront1:~/MD-Bench/asm$ iaca -arch SKX force_soa_lt1200_markers.o -Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45 -Analyzed File - force_soa_lt1200_markers.o -Binary Format - 64Bit -Architecture - SKX -Analysis Type - Throughput - -Throughput Analysis Report --------------------------- -Block Throughput: 30.25 Cycles Throughput Bottleneck: Backend -Loop Count: 23 -Port Binding In Cycles Per Iteration: --------------------------------------------------------------------------------------------------- -| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | --------------------------------------------------------------------------------------------------- -| Cycles | 16.0 0.0 | 2.0 | 13.0 13.0 | 13.0 13.0 | 0.0 | 19.0 | 3.0 | 0.0 | --------------------------------------------------------------------------------------------------- - -DV - Divider pipe (on port 0) -D - Data fetch pipe (on ports 2 and 3) -F - Macro Fusion with the previous instruction occurred -* - instruction micro-ops not bound to a port -^ - Micro Fusion occurred -# - ESP Tracking sync uop was issued -@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected -X - instruction not supported, was not accounted in Analysis - -| Num Of | Ports pressure in cycles | | -| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | ------------------------------------------------------------------------------------------ -| 1 | | | | | | 1.0 | | | vpcmpeqb k2, xmm0, xmm0 -| 1 | | 1.0 | | | | | | | add r9d, 0x8 -| 1 | | | | | | 1.0 | | | vpcmpeqb k1, xmm0, xmm0 -| 1 | | | | | | 1.0 | | | vpcmpeqb k3, xmm0, xmm0 -| 1 | | | 1.0 1.0 | | | | | | vmovdqu ymm3, ymmword ptr [rcx+r14*4] -| 1 | | 1.0 | | | | | | | add r14, 0x8 -| 1* | | | | | | | | | vpxord zmm5, zmm5, zmm5 -| 1* | | | | | | | | | vpxord zmm4, zmm4, zmm4 -| 1* | | | | | | | | | vpxord zmm6, zmm6, zmm6 -| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm5, k2, zmmword ptr [rax+ymm3*8] -| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm4, k1, zmmword ptr [rdx+ymm3*8] -| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm6, k3, zmmword ptr [rsi+ymm3*8] -| 1 | | | | | | 1.0 | | | vsubpd zmm29, zmm1, zmm5 -| 1 | 1.0 | | | | | | | | vsubpd zmm28, zmm0, zmm4 -| 1 | | | | | | 1.0 | | | vsubpd zmm31, zmm2, zmm6 -| 1 | 1.0 | | | | | | | | vmulpd zmm20, zmm29, zmm29 -| 1 | | | | | | 1.0 | | | vfmadd231pd zmm20, zmm28, zmm28 -| 1 | 1.0 | | | | | | | | vfmadd231pd zmm20, zmm31, zmm31 -| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm27, zmm20 -| 1 | | | | | | 1.0 | | | vcmppd k5, zmm20, zmm16, 0x1 -| 1 | | | | | | 1.0 | | | vfpclasspd k0, zmm27, 0x1e -| 2^ | 1.0 | | | 1.0 1.0 | | | | | vfnmadd213pd zmm20, zmm27, qword ptr [rip]{1to8} -| 1 | 1.0 | | | | | | | | knotw k4, k0 -| 1 | | | | | | 1.0 | | | vmulpd zmm21, zmm20, zmm20 -| 1 | | | | | | 1.0 | | | vfmadd213pd zmm27{k4}, zmm20, zmm27 -| 1 | 1.0 | | | | | | | | vfmadd213pd zmm27{k4}, zmm21, zmm27 -| 1 | | | | | | 1.0 | | | vmulpd zmm22, zmm27, zmm15 -| 1 | 1.0 | | | | | | | | vmulpd zmm24, zmm27, zmm14 -| 1 | | | | | | 1.0 | | | vmulpd zmm25, zmm27, zmm22 -| 1 | 1.0 | | | | | | | | vmulpd zmm23, zmm27, zmm25 -| 1 | | | | | | 1.0 | | | vfmsub213pd zmm27, zmm25, zmm7 -| 1 | 1.0 | | | | | | | | vmulpd zmm26, zmm23, zmm24 -| 1 | | | | | | 1.0 | | | vmulpd zmm30, zmm26, zmm27 -| 1 | 1.0 | | | | | | | | vfmadd231pd zmm13{k5}, zmm30, zmm28 -| 1 | | | | | | 1.0 | | | vfmadd231pd zmm12{k5}, zmm30, zmm29 -| 1 | 1.0 | | | | | | | | vfmadd231pd zmm11{k5}, zmm30, zmm31 -| 1* | | | | | | | | | cmp r9d, ebx -| 0*F | | | | | | | | | jb 0xffffffffffffff22 -Total Num Of Uops: 52 -Analysis Notes: -Backend allocation was stalled due to unavailable allocation resources. -There were bubbles in the frontend. diff --git a/arch_analysis/iaca_force_soa_lt8.txt b/arch_analysis/iaca_force_soa_lt8.txt deleted file mode 100644 index c9782f9..0000000 --- a/arch_analysis/iaca_force_soa_lt8.txt +++ /dev/null @@ -1,78 +0,0 @@ -iwia021h@testfront1:~/MD-Bench/asm$ iaca -arch SKX force_soa_lt8_markers.o -Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45 -Analyzed File - force_soa_lt8_markers.o -Binary Format - 64Bit -Architecture - SKX -Analysis Type - Throughput - -Throughput Analysis Report --------------------------- -Block Throughput: 35.00 Cycles Throughput Bottleneck: Backend -Loop Count: 22 -Port Binding In Cycles Per Iteration: --------------------------------------------------------------------------------------------------- -| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | --------------------------------------------------------------------------------------------------- -| Cycles | 20.0 0.0 | 4.0 | 13.0 13.0 | 13.0 13.0 | 0.0 | 20.0 | 4.0 | 0.0 | --------------------------------------------------------------------------------------------------- - -DV - Divider pipe (on port 0) -D - Data fetch pipe (on ports 2 and 3) -F - Macro Fusion with the previous instruction occurred -* - instruction micro-ops not bound to a port -^ - Micro Fusion occurred -# - ESP Tracking sync uop was issued -@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected -X - instruction not supported, was not accounted in Analysis - -| Num Of | Ports pressure in cycles | | -| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 | ------------------------------------------------------------------------------------------ -| 1 | | 1.0 | | | | | | | imul r8, r12 -| 1 | | | | | | 1.0 | | | vbroadcastsd zmm9, xmm9 -| 1 | | | | | | 1.0 | | | vbroadcastsd zmm2, xmm8 -| 1 | | | | | | 1.0 | | | vbroadcastsd zmm10, xmm10 -| 1 | | 1.0 | | | | | | | sub r13d, ebx -| 1 | | 1.0 | | | | | | | add r8, r11 -| 1 | | | | | | 1.0 | | | vpbroadcastd ymm0, r13d -| 1 | | | | | | 1.0 | | | vpcmpgtd k5, ymm0, ymm17 -| 1 | | | | | | | 1.0 | | movsxd rbx, ebx -| 1* | | | | | | | | | vmovaps zmm4, zmm19 -| 1 | 1.0 | | | | | | | | kmovw k2, k5 -| 1* | | | | | | | | | vmovaps zmm3, zmm19 -| 2 | | 1.0 | 1.0 1.0 | | | | | | vmovdqu32 ymm1{k5}{z}, ymmword ptr [r8+rbx*4] -| 1 | 1.0 | | | | | | | | kmovw k1, k5 -| 1* | | | | | | | | | vmovaps zmm5, zmm19 -| 1 | 1.0 | | | | | | | | kmovw k3, k5 -| 5^ | 2.0 | | 4.0 4.0 | 4.0 4.0 | | | 1.0 | | vgatherdpd zmm5, k3, zmmword ptr [rsi+ymm1*8] -| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm4, k2, zmmword ptr [rax+ymm1*8] -| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm3, k1, zmmword ptr [rdx+ymm1*8] -| 1 | 1.0 | | | | | | | | vsubpd zmm30, zmm10, zmm5 -| 1 | | | | | | 1.0 | | | vsubpd zmm28, zmm9, zmm4 -| 1 | 1.0 | | | | | | | | vsubpd zmm27, zmm2, zmm3 -| 1 | | | | | | 1.0 | | | vmulpd zmm26, zmm28, zmm28 -| 1 | 1.0 | | | | | | | | vfmadd231pd zmm26, zmm27, zmm27 -| 1 | | | | | | 1.0 | | | vfmadd231pd zmm26, zmm30, zmm30 -| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm25, zmm26 -| 1 | | | | | | 1.0 | | | vcmppd k6{k5}, zmm26, zmm16, 0x1 -| 1 | | | | | | 1.0 | | | vfpclasspd k0, zmm25, 0x1e -| 1* | | | | | | | | | vmovaps zmm6, zmm26 -| 2^ | 1.0 | | | 1.0 1.0 | | | | | vfnmadd213pd zmm6, zmm25, qword ptr [rip]{1to8} -| 1 | 1.0 | | | | | | | | knotw k4, k0 -| 1 | | | | | | 1.0 | | | vmulpd zmm8, zmm6, zmm6 -| 1 | 1.0 | | | | | | | | vfmadd213pd zmm25{k4}, zmm6, zmm25 -| 1 | | | | | | 1.0 | | | vfmadd213pd zmm25{k4}, zmm8, zmm25 -| 1 | 1.0 | | | | | | | | vmulpd zmm20, zmm25, zmm15 -| 1 | | | | | | 1.0 | | | vmulpd zmm22, zmm25, zmm14 -| 1 | 1.0 | | | | | | | | vmulpd zmm23, zmm25, zmm20 -| 1 | | | | | | 1.0 | | | vmulpd zmm21, zmm25, zmm23 -| 1 | 1.0 | | | | | | | | vfmsub213pd zmm25, zmm23, zmm7 -| 1 | | | | | | 1.0 | | | vmulpd zmm24, zmm21, zmm22 -| 1 | 1.0 | | | | | | | | vmulpd zmm29, zmm24, zmm25 -| 1 | | | | | | 1.0 | | | vfmadd231pd zmm13{k6}, zmm29, zmm27 -| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12{k6}, zmm29, zmm28 -| 1 | | | | | | 1.0 | | | vfmadd231pd zmm11{k6}, zmm29, zmm30 -Total Num Of Uops: 60 -Analysis Notes: -Backend allocation was stalled due to unavailable allocation resources. -There were bubbles in the frontend. diff --git a/arch_analysis/osaca_force_aos_geq1200.txt b/arch_analysis/osaca_force_aos_geq1200.txt deleted file mode 100644 index 5e0907a..0000000 --- a/arch_analysis/osaca_force_aos_geq1200.txt +++ /dev/null @@ -1,80 +0,0 @@ -iwia021h@testfront1:~/MD-Bench/asm$ /home/hpc/iwia/iwia021h/.local/bin/osaca --ignore-unknown --arch=CSX force_aos_geq1200_markers.s -Open Source Architecture Code Analyzer (OSACA) - 0.3.14 -Analyzed file: force_aos_geq1200_markers.s -Architecture: CSX -Timestamp: 2021-04-29 15:53:50 - - - P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction - * - Instruction micro-ops not bound to a port - X - No throughput/latency information for this instruction in data file - - -Combined Analysis Report ------------------------- - Port pressure in cycles - | 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD | -------------------------------------------------------------------------------------------------- - 196 | | | | | | 1.00 | | || | | vpcmpgtd %ymm3, %ymm2, %k3 #67.9 - 197 | 0.00 | 1.00 | 0.50 0.50 | 0.50 0.50 | | 0.00 | | || 4.0 | | vmovdqu32 (%r15,%r13,4), %ymm17{%k3}{z} #68.21 - 198 | 1.00 | | | | | | | || | | kmovw %k3, %r9d #67.9 - 199 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm17, %ymm17, %ymm18 #69.36 - 200 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm18, %ymm17, %ymm17 #69.36 - 201 | | | | | | | | || | | # LOE rax rcx rbx rbp rsi rdi r8 r10 r13 r15 r9d r11d r12d r14d xmm6 xmm7 xmm12 ymm2 ymm3 ymm15 ymm16 ymm17 zmm0 zmm1 zmm4 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14 k3 - 202 | | | | | | | | || | | ..B1.21: # Preds ..B1.18 - 203 | | | | | | | | || | | # Execution count [1.25e+01] - 204 | 1.00 | | | | | | | || | | kmovw %k3, %k1 #69.36 - 205 | 1.00 | | | | | | | || | | kmovw %k3, %k2 #69.36 - 206 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm18, %zmm18, %zmm18 #69.36 - 207 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm19, %zmm19, %zmm19 #69.36 - 208 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm20, %zmm20, %zmm20 #69.36 - 209 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || | | vgatherdpd 16(%rdi,%ymm17,8), %zmm18{%k1} #69.36 - 210 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || 4.0 | | vgatherdpd 8(%rdi,%ymm17,8), %zmm19{%k2} #69.36 - 211 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || | | vgatherdpd (%rdi,%ymm17,8), %zmm20{%k3} #69.36 - 212 | | | | | | | | || | | # LOE rax rcx rbx rbp rsi rdi r8 r10 r13 r15 r9d r11d r12d r14d xmm6 xmm7 xmm12 ymm2 ymm3 ymm15 ymm16 zmm0 zmm1 zmm4 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14 zmm18 zmm19 zmm20 - 213 | | | | | | | | || | | ..B1.22: # Preds ..B1.21 - 214 | | | | | | | | || | | # Execution count [2.50e+01] - 215 | 0.00 | 0.16 | | | | 0.00 | 0.84 | || | | addq $8, %r13 #67.9 - 216 | 0.00 | 1.00 | | | | 0.00 | | || | | vpaddd %ymm16, %ymm3, %ymm3 #67.9 - 217 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm18, %zmm4, %zmm29 #71.36 - 218 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm19, %zmm0, %zmm27 #70.36 - 219 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm20, %zmm1, %zmm26 #69.36 - 220 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm27, %zmm27, %zmm25 #72.49 - 221 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm26, %zmm26, %zmm25 #72.49 - 222 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm29, %zmm29, %zmm25 #72.63 - 223 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm25, %zmm24 #75.38 - 224 | | | | | | 1.00 | | || | | vcmppd $1, %zmm14, %zmm25, %k2 #74.22 - 225 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm24, %k0 #75.38 - 226 | 1.00 | | | | | | | || | | kmovw %k2, %edx #74.22 - 227 | 1.00 | | | | | | | || | | knotw %k0, %k1 #75.38 - 228 | | | | | | | | || | | * vmovaps %zmm25, %zmm17 #75.38 - 229 | 0.00 | 0.34 | | | | 0.00 | 0.66 | || | | andl %edx, %r9d #74.22 - 230 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm24, %zmm17 #75.38 - 231 | 1.00 | | | | | | | || | | kmovw %r9d, %k3 #78.17 - 232 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm17, %zmm17, %zmm18 #75.38 - 233 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm24, %zmm17, %zmm24{%k1} #75.38 - 234 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd213pd %zmm24, %zmm18, %zmm24{%k1} #75.38 - 235 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm13, %zmm24, %zmm19 #76.38 - 236 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm10, %zmm24, %zmm21 #77.54 - 237 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm19, %zmm24, %zmm22 #76.44 - 238 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm22, %zmm24, %zmm20 #76.50 - 239 | 0.00 | | | | | 1.00 | | || | | vfmsub213pd %zmm5, %zmm22, %zmm24 #77.54 - 240 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm21, %zmm20, %zmm23 #77.61 - 241 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm24, %zmm23, %zmm28 #77.67 - 242 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm26, %zmm28, %zmm9{%k3} #78.17 - 243 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm27, %zmm28, %zmm8{%k3} #79.17 - 244 | 0.00 | | | | | 1.00 | | || 4.0 | 4.0 | vfmadd231pd %zmm29, %zmm28, %zmm11{%k3} #80.17 - 245 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || | | cmpq %rbx, %r13 #67.9 - 246 | | | | | | | | || | | * jb ..B1.18 # Prob 82% #67.9 - - 20.5 6.00 13.0 2.50 13.0 2.50 20.5 4.00 70.0 4 - - -Loop-Carried Dependencies Analysis Report ------------------------------------------ - 215 | 1.0 | addq $8, %r13 #67.9| [215] - 216 | 1.0 | vpaddd %ymm16, %ymm3, %ymm3 #67.9| [216] - 244 | 4.0 | vfmadd231pd %zmm29, %zmm28, %zmm11{%k3} #80.17| [244] - 243 | 4.0 | vfmadd231pd %zmm27, %zmm28, %zmm8{%k3} #79.17| [243] - 242 | 4.0 | vfmadd231pd %zmm26, %zmm28, %zmm9{%k3} #78.17| [242] - diff --git a/arch_analysis/osaca_force_aos_lt1200.txt b/arch_analysis/osaca_force_aos_lt1200.txt deleted file mode 100644 index a30919e..0000000 --- a/arch_analysis/osaca_force_aos_lt1200.txt +++ /dev/null @@ -1,112 +0,0 @@ -iwia021h@testfront1:~/MD-Bench/ICC$ /home/hpc/iwia/iwia021h/.local/bin/osaca --ignore-unknown --arch=CSX force.s -Open Source Architecture Code Analyzer (OSACA) - 0.3.14 -Analyzed file: force.s -Architecture: CSX -Timestamp: 2021-04-26 22:33:06 - - - P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction - * - Instruction micro-ops not bound to a port - X - No throughput/latency information for this instruction in data file - - -Combined Analysis Report ------------------------- - Port pressure in cycles - | 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD | -------------------------------------------------------------------------------------------------- - 261 | | | | | | | | || | | ..B1.25: # Preds ..B1.24 - 262 | | | | | | | | || | | # Execution count [4.50e+00] - 263 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || 1.0 | 1.0 | movq %r8, %r13 #56.43 - 264 | | 1.00 | | | | | | || 3.0 | 3.0 | imulq %rcx, %r13 #56.43 - 265 | | | | | | 1.00 | | || | | vbroadcastsd %xmm6, %zmm2 #58.23 - 266 | | | | | | 1.00 | | || | | vbroadcastsd %xmm7, %zmm1 #59.23 - 267 | | | | | | 1.00 | | || | | vbroadcastsd %xmm12, %zmm0 #60.23 - 268 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || | | movslq %r12d, %rbx #67.9 - 269 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || 1.0 | 1.0 | addq %r10, %r13 #37.5 - 270 | | | 0.00 | 0.00 | 1.00 | | | 1.00 || | | movq %rax, -64(%rsp) #37.5[spill] - 271 | | | 0.00 | 0.00 | 1.00 | | | 1.00 || | | movq %r8, -56(%rsp) #37.5[spill] - 272 | | | 0.00 | 0.00 | 1.00 | | | 1.00 || | | movq %r10, -48(%rsp) #37.5[spill] - 273 | | | 0.00 | 0.00 | 1.00 | | | 1.00 || | | movq %rsi, -40(%rsp) #37.5[spill] - 274 | | | 0.00 | 0.00 | 1.00 | | | 1.00 || | | movq %rcx, -32(%rsp) #37.5[spill] - 275 | | | 0.00 | 0.00 | 1.00 | | | 1.00 || | | movq %r9, -80(%rsp) #37.5[spill] - 276 | | | 0.00 | 0.00 | 1.00 | | | 1.00 || | | movq %rdx, -72(%rsp) #37.5[spill] - 277 | | | | | | | | || | | # LOE rbx rbp rdi r13 r11d r12d r14d xmm6 xmm7 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14 - 278 | | | | | | | | || | | ..B1.26: # Preds ..B1.30 ..B1.25 - 279 | | | | | | | | || | | # Execution count [2.50e+01] - 280 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovdqu (%r13,%rbx,4), %ymm3 #68.21 - 281 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm3, %ymm3, %ymm4 #69.36 - 282 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm4, %ymm3, %ymm3 #69.36 - 283 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | movl (%r13,%rbx,4), %r10d #68.21 - 284 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | movl 4(%r13,%rbx,4), %r9d #68.21 - 285 | | | 0.50 0.50 | 0.50 0.50 | | | | || | 4.0 | movl 8(%r13,%rbx,4), %r8d #68.21 - 286 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | movl 12(%r13,%rbx,4), %esi #68.21 - 287 | | 1.00 | | | | 0.00 | | || | | lea (%r10,%r10,2), %r10d #69.36 - 288 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | movl 16(%r13,%rbx,4), %ecx #68.21 - 289 | | 1.00 | | | | 0.00 | | || | | lea (%r9,%r9,2), %r9d #69.36 - 290 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | movl 20(%r13,%rbx,4), %edx #68.21 - 291 | | 1.00 | | | | 0.00 | | || | 1.0 | lea (%r8,%r8,2), %r8d #69.36 - 292 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | movl 24(%r13,%rbx,4), %eax #68.21 - 293 | | 1.00 | | | | 0.00 | | || | | lea (%rsi,%rsi,2), %esi #69.36 - 294 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | movl 28(%r13,%rbx,4), %r15d #68.21 - 295 | | 1.00 | | | | 0.00 | | || | | lea (%rcx,%rcx,2), %ecx #69.36 - 296 | | 1.00 | | | | 0.00 | | || | | lea (%rdx,%rdx,2), %edx #69.36 - 297 | | 1.00 | | | | 0.00 | | || | | lea (%rax,%rax,2), %eax #69.36 - 298 | | 1.00 | | | | 0.00 | | || | | lea (%r15,%r15,2), %r15d #69.36 - 299 | | | | | | | | || | | # LOE rbx rbp rdi r13 eax edx ecx esi r8d r9d r10d r11d r12d r14d r15d xmm6 xmm7 xmm12 ymm3 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14 - 300 | | | | | | | | || | | ..B1.29: # Preds ..B1.26 - 301 | | | | | | | | || | | # Execution count [1.25e+01] - 302 | | | | | | | | || | | X vpcmpeqb %xmm0, %xmm0, %k1 #69.36 - 303 | | | | | | | | || | | X vpcmpeqb %xmm0, %xmm0, %k2 #69.36 - 304 | | | | | | | | || | | X vpcmpeqb %xmm0, %xmm0, %k3 #69.36 - 305 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm4, %zmm4, %zmm4 #69.36 - 306 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm17, %zmm17, %zmm17 #69.36 - 307 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm18, %zmm18, %zmm18 #69.36 - 308 | 1.50 | 0.17 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.83 | || | | vgatherdpd 16(%rdi,%ymm3,8), %zmm4{%k1} #69.36 - 309 | 1.50 | 0.00 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 1.00 | || 4.0 | | vgatherdpd 8(%rdi,%ymm3,8), %zmm17{%k2} #69.36 - 310 | 1.50 | 0.00 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 1.00 | || | | vgatherdpd (%rdi,%ymm3,8), %zmm18{%k3} #69.36 - 311 | | | | | | | | || | | # LOE rbx rbp rdi r13 r11d r12d r14d xmm6 xmm7 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm4 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14 zmm17 zmm18 - 312 | | | | | | | | || | | ..B1.30: # Preds ..B1.29 - 313 | | | | | | | | || | | # Execution count [2.50e+01] - 314 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || | | addl $8, %r12d #67.9 - 315 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || | | addq $8, %rbx #67.9 - 316 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm4, %zmm0, %zmm26 #71.36 - 317 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm17, %zmm1, %zmm24 #70.36 - 318 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm18, %zmm2, %zmm23 #69.36 - 319 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm24, %zmm24, %zmm3 #72.49 - 320 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm23, %zmm23, %zmm3 #72.49 - 321 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm26, %zmm26, %zmm3 #72.63 - 322 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm3, %zmm22 #75.38 - 323 | | | | | | 1.00 | | || | | vcmppd $1, %zmm14, %zmm3, %k2 #74.22 - 324 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm22, %k0 #75.38 - 325 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm22, %zmm3 #75.38 - 326 | 1.00 | | | | | | | || | | knotw %k0, %k1 #75.38 - 327 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm3, %zmm3, %zmm4 #75.38 - 328 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm22, %zmm3, %zmm22{%k1} #75.38 - 329 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd213pd %zmm22, %zmm4, %zmm22{%k1} #75.38 - 330 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm13, %zmm22, %zmm17 #76.38 - 331 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm10, %zmm22, %zmm19 #77.54 - 332 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm17, %zmm22, %zmm20 #76.44 - 333 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm22, %zmm18 #76.50 - 334 | 0.50 | | | | | 0.50 | | || | | vfmsub213pd %zmm5, %zmm20, %zmm22 #77.54 - 335 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm19, %zmm18, %zmm21 #77.61 - 336 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm22, %zmm21, %zmm25 #77.67 - 337 | 1.00 | | | | | 0.00 | | || | | vfmadd231pd %zmm23, %zmm25, %zmm9{%k2} #78.17 - 338 | 1.00 | | | | | 0.00 | | || | | vfmadd231pd %zmm24, %zmm25, %zmm8{%k2} #79.17 - 339 | 1.00 | | | | | 0.00 | | || 4.0 | | vfmadd231pd %zmm26, %zmm25, %zmm11{%k2} #80.17 - 340 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || | | cmpl %r14d, %r12d #67.9 - 341 | | | | | | | | || | | * jb ..B1.26 # Prob 82% #67.9 - 342 | | | | | | | | || | | # LOE rbx rbp rdi r13 r11d r12d r14d xmm6 xmm7 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14 - - 21.0 11.2 17.0 6.50 17.0 6.50 7.00 17.0 8.83 7.00 75.0 10.0 - - -Loop-Carried Dependencies Analysis Report ------------------------------------------ - 287 | 6.0 | lea (%r10,%r10,2), %r10d #69.36| [269, 283, 287] - 291 | 10.0 | lea (%r8,%r8,2), %r8d #69.36| [263, 264, 269, 285, 291] - 295 | 9.0 | lea (%rcx,%rcx,2), %ecx #69.36| [264, 269, 288, 295] - 314 | 1.0 | addl $8, %r12d #67.9| [314] - 339 | 4.0 | vfmadd231pd %zmm26, %zmm25, %zmm11{%k2} #80.17| [339] - 338 | 4.0 | vfmadd231pd %zmm24, %zmm25, %zmm8{%k2} #79.17| [338] - 337 | 4.0 | vfmadd231pd %zmm23, %zmm25, %zmm9{%k2} #78.17| [337] diff --git a/arch_analysis/osaca_force_aos_lt8.txt b/arch_analysis/osaca_force_aos_lt8.txt deleted file mode 100644 index 810dae9..0000000 --- a/arch_analysis/osaca_force_aos_lt8.txt +++ /dev/null @@ -1,91 +0,0 @@ -iwia021h@testfront1:~/MD-Bench/asm$ /home/hpc/iwia/iwia021h/.local/bin/osaca --ignore-unknown --arch=CSX force_aos_lt8_markers.s -Open Source Architecture Code Analyzer (OSACA) - 0.3.14 -Analyzed file: force_aos_lt8_markers.s -Architecture: CSX -Timestamp: 2021-04-29 15:49:27 - - - P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction - * - Instruction micro-ops not bound to a port - X - No throughput/latency information for this instruction in data file - - -Combined Analysis Report ------------------------- - Port pressure in cycles - | 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD | -------------------------------------------------------------------------------------------------- - 358 | | | | | | | | || | | # LOE rax rdx rcx rbp rsi rdi r8 r9 r10 r11d r14d xmm6 xmm7 xmm12 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14 - 359 | | | | | | | | || | | ..B1.33: # Preds ..B1.32 - 360 | | | | | | | | || | | # Execution count [2.50e+01] - 361 | | 1.00 | | | | | | || 3.0 | | imulq %r8, %rcx #56.43 - 362 | | | | | | 1.00 | | || | 3.0 | vbroadcastsd %xmm6, %zmm4 #58.23 - 363 | | | | | | | | || | | X subl %r14d, %r11d #67.9 - 364 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || 1.0 | | addq %r10, %rcx #37.5 - 365 | | | | | | | | || | | X vpbroadcastd %r11d, %ymm0 #67.9 - 366 | | | | | | 1.00 | | || | | vpcmpgtd %ymm15, %ymm0, %k3 #67.9 - 367 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || | | movslq %r14d, %r14 #67.9 - 368 | 1.00 | | | | | | | || | | kmovw %k3, %ebx #67.9 - 369 | 0.00 | 1.00 | 0.50 0.50 | 0.50 0.50 | | 0.00 | | || 0.0 | | vmovdqu32 (%rcx,%r14,4), %ymm1{%k3}{z} #68.21 - 370 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm1, %ymm1, %ymm2 #69.36 - 371 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm2, %ymm1, %ymm0 #69.36 - 372 | | | | | | | | || | | # LOE rax rdx rbp rsi rdi r8 r9 r10 ebx xmm7 xmm12 ymm0 ymm15 ymm16 zmm4 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14 k3 - 373 | | | | | | | | || | | ..B1.36: # Preds ..B1.33 - 374 | | | | | | | | || | | # Execution count [1.25e+01] - 375 | 1.00 | | | | | | | || | | kmovw %k3, %k1 #69.36 - 376 | 1.00 | | | | | | | || | | kmovw %k3, %k2 #69.36 - 377 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm1, %zmm1, %zmm1 #69.36 - 378 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm2, %zmm2, %zmm2 #69.36 - 379 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm3, %zmm3, %zmm3 #69.36 - 380 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || | | vgatherdpd 16(%rdi,%ymm0,8), %zmm1{%k1} #69.36 - 381 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || 4.0 | | vgatherdpd 8(%rdi,%ymm0,8), %zmm2{%k2} #69.36 - 382 | 1.50 | 0.00 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 1.00 | || | | vgatherdpd (%rdi,%ymm0,8), %zmm3{%k3} #69.36 - 383 | | | | | | | | || | | # LOE rax rdx rbp rsi rdi r8 r9 r10 ebx xmm7 xmm12 ymm15 ymm16 zmm1 zmm2 zmm3 zmm4 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14 - 384 | | | | | | | | || | | ..B1.37: # Preds ..B1.36 - 385 | | | | | | | | || | | # Execution count [2.50e+01] - 386 | | | | | | 1.00 | | || | | vbroadcastsd %xmm7, %zmm7 #59.23 - 387 | | | | | | 1.00 | | || | | vbroadcastsd %xmm12, %zmm12 #60.23 - 388 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm1, %zmm12, %zmm23 #71.36 - 389 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm2, %zmm7, %zmm21 #70.36 - 390 | 0.50 | | | | | 0.50 | | || | 4.0 | vsubpd %zmm3, %zmm4, %zmm20 #69.36 - 391 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm21, %zmm21, %zmm19 #72.49 - 392 | 0.50 | | | | | 0.50 | | || 4.0 | 4.0 | vfmadd231pd %zmm20, %zmm20, %zmm19 #72.49 - 393 | 0.50 | | | | | 0.50 | | || 4.0 | 4.0 | vfmadd231pd %zmm23, %zmm23, %zmm19 #72.63 - 394 | 2.50 | | | | | 0.50 | | || 8.0 | 8.0 | vrcp14pd %zmm19, %zmm18 #75.38 - 395 | | | | | | 1.00 | | || | | vcmppd $1, %zmm14, %zmm19, %k2 #74.22 - 396 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm18, %k0 #75.38 - 397 | 1.00 | | | | | | | || | | kmovw %k2, %ecx #74.22 - 398 | 1.00 | | | | | | | || | | knotw %k0, %k1 #75.38 - 399 | | | | | | | | || | | * vmovaps %zmm19, %zmm0 #75.38 - 400 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || | | andl %ecx, %ebx #74.22 - 401 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm18, %zmm0 #75.38 - 402 | 1.00 | | | | | | | || | | kmovw %ebx, %k3 #78.17 - 403 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm0, %zmm0, %zmm1 #75.38 - 404 | 0.50 | | | | | 0.50 | | || | 4.0 | vfmadd213pd %zmm18, %zmm0, %zmm18{%k1} #75.38 - 405 | 0.50 | | | | | 0.50 | | || 4.0 | 4.0 | vfmadd213pd %zmm18, %zmm1, %zmm18{%k1} #75.38 - 406 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm13, %zmm18, %zmm2 #76.38 - 407 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm10, %zmm18, %zmm4 #77.54 - 408 | 0.50 | | | | | 0.50 | | || 4.0 | 4.0 | vmulpd %zmm2, %zmm18, %zmm6 #76.44 - 409 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm6, %zmm18, %zmm3 #76.50 - 410 | 0.50 | | | | | 0.50 | | || | | vfmsub213pd %zmm5, %zmm6, %zmm18 #77.54 - 411 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm4, %zmm3, %zmm17 #77.61 - 412 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm18, %zmm17, %zmm22 #77.67 - 413 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm20, %zmm22, %zmm9{%k3} #78.17 - 414 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm21, %zmm22, %zmm8{%k3} #79.17 - 415 | 0.00 | | | | | 1.00 | | || 4.0 | | vfmadd231pd %zmm23, %zmm22, %zmm11{%k3} #80.17 - - 22.0 5.00 13.0 2.50 13.0 2.50 22.0 5.00 70.0 35.0 - - -Loop-Carried Dependencies Analysis Report ------------------------------------------ - 363 | 0.0 | subl %r14d, %r11d #67.9| [363] - 367 | 1.0 | movslq %r14d, %r14 #67.9| [367] - 386 | 3.0 | vbroadcastsd %xmm7, %zmm7 #59.23| [386] - 387 | 3.0 | vbroadcastsd %xmm12, %zmm12 #60.23| [387] - 415 | 4.0 | vfmadd231pd %zmm23, %zmm22, %zmm11{%k3} #80.17| [415] - 414 | 4.0 | vfmadd231pd %zmm21, %zmm22, %zmm8{%k3} #79.17| [414] - 413 | 4.0 | vfmadd231pd %zmm20, %zmm22, %zmm9{%k3} #78.17| [413] - 397 | 28.0 | kmovw %k2, %ecx #74.22| [361, 364, 369, 371, 382, 390, 392, 393, 395, 397] - 408 | 35.0 | vmulpd %zmm2, %zmm18, %zmm6 #76.44| [362, 390, 392, 393, 394, 404, 405, 408] - diff --git a/arch_analysis/osaca_force_soa_geq1200.txt b/arch_analysis/osaca_force_soa_geq1200.txt deleted file mode 100644 index a98d11b..0000000 --- a/arch_analysis/osaca_force_soa_geq1200.txt +++ /dev/null @@ -1,71 +0,0 @@ -iwia021h@testfront1:~/MD-Bench/asm$ /home/hpc/iwia/iwia021h/.local/bin/osaca --ignore-unknown --arch=CSX force_soa_geq1200_markers.s -Open Source Architecture Code Analyzer (OSACA) - 0.3.14 -Analyzed file: force_soa_geq1200_markers.s -Architecture: CSX -Timestamp: 2021-04-29 15:54:23 - - - P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction - * - Instruction micro-ops not bound to a port - X - No throughput/latency information for this instruction in data file - - -Combined Analysis Report ------------------------- - Port pressure in cycles - | 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD | -------------------------------------------------------------------------------------------------- - 189 | | | | | | | | || | | # LOE rax rdx rcx rbp rsi rdi r8 r10 r11 r12 r14 r15 ebx r9d r13d xmm8 xmm9 xmm10 ymm3 ymm4 ymm17 ymm18 zmm2 zmm5 zmm6 zmm7 zmm11 zmm12 zmm13 zmm14 zmm15 zmm16 zmm19 - 190 | | | | | | | | || | | ..B1.18: # Preds ..B1.18 ..B1.17 - 191 | | | | | | | | || | | # Execution count [2.50e+01] - 192 | | | | | | 1.00 | | || | | vpcmpgtd %ymm4, %ymm3, %k5 #67.9 - 193 | 0.00 | 1.00 | | | | 0.00 | | || | | vpaddd %ymm18, %ymm4, %ymm4 #67.9 - 194 | 0.00 | 1.00 | 0.50 0.50 | 0.50 0.50 | | 0.00 | | || 4.0 | | vmovdqu32 (%rcx,%r15,4), %ymm20{%k5}{z} #68.21 - 195 | | | | | | | | || | | * vmovaps %zmm19, %zmm22 #70.36 - 196 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | addq $8, %r15 #67.9 - 197 | 1.00 | | | | | | | || | | kmovw %k5, %k2 #70.36 - 198 | | | | | | | | || | | * vmovaps %zmm19, %zmm21 #69.36 - 199 | 1.00 | | | | | | | || | | kmovw %k5, %k1 #69.36 - 200 | | | | | | | | || | | * vmovaps %zmm19, %zmm23 #71.36 - 201 | 1.00 | | | | | | | || | | kmovw %k5, %k3 #71.36 - 202 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || | | vgatherdpd (%rsi,%ymm20,8), %zmm23{%k3} #71.36 - 203 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || 4.0 | | vgatherdpd (%rax,%ymm20,8), %zmm22{%k2} #70.36 - 204 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || | | vgatherdpd (%rdx,%ymm20,8), %zmm21{%k1} #69.36 - 205 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm22, %zmm5, %zmm0 #70.36 - 206 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm21, %zmm2, %zmm1 #69.36 - 207 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm23, %zmm6, %zmm21 #71.36 - 208 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm0, %zmm0, %zmm20 #72.49 - 209 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm1, %zmm1, %zmm20 #72.49 - 210 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm21, %zmm21, %zmm20 #72.63 - 211 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm20, %zmm31 #75.38 - 212 | | | | | | 1.00 | | || | | vcmppd $1, %zmm16, %zmm20, %k6{%k5} #74.22 - 213 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm31, %k0 #75.38 - 214 | | | | | | | | || | | * vmovaps %zmm20, %zmm24 #75.38 - 215 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm31, %zmm24 #75.38 - 216 | 1.00 | | | | | | | || | | knotw %k0, %k4 #75.38 - 217 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm24, %zmm24, %zmm25 #75.38 - 218 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm31, %zmm24, %zmm31{%k4} #75.38 - 219 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd213pd %zmm31, %zmm25, %zmm31{%k4} #75.38 - 220 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm15, %zmm31, %zmm26 #76.38 - 221 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm14, %zmm31, %zmm28 #77.54 - 222 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm26, %zmm31, %zmm29 #76.44 - 223 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm29, %zmm31, %zmm27 #76.50 - 224 | 0.00 | | | | | 1.00 | | || | | vfmsub213pd %zmm7, %zmm29, %zmm31 #77.54 - 225 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm28, %zmm27, %zmm30 #77.61 - 226 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm31, %zmm30, %zmm24 #77.67 - 227 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm1, %zmm24, %zmm13{%k6} #78.17 - 228 | 0.00 | | | | | 1.00 | | || | 4.0 | vfmadd231pd %zmm0, %zmm24, %zmm12{%k6} #79.17 - 229 | 0.00 | | | | | 1.00 | | || 4.0 | | vfmadd231pd %zmm21, %zmm24, %zmm11{%k6} #80.17 - 230 | 0.00 | 0.17 | | | | 0.00 | 0.83 | || | | cmpq %r14, %r15 #67.9 - 231 | | | | | | | | || | | * jb ..B1.18 # Prob 82% #67.9 - - 18.0 4.17 13.0 2.50 13.0 2.50 18.0 2.83 68.0 4 - - -Loop-Carried Dependencies Analysis Report ------------------------------------------ - 193 | 1.0 | vpaddd %ymm18, %ymm4, %ymm4 #67.9| [193] - 196 | 1.0 | addq $8, %r15 #67.9| [196] - 228 | 4.0 | vfmadd231pd %zmm0, %zmm24, %zmm12{%k6} #79.17| [228] - 227 | 4.0 | vfmadd231pd %zmm1, %zmm24, %zmm13{%k6} #78.17| [227] - 229 | 4.0 | vfmadd231pd %zmm21, %zmm24, %zmm11{%k6} #80.17| [229] diff --git a/arch_analysis/osaca_force_soa_lt1200.txt b/arch_analysis/osaca_force_soa_lt1200.txt deleted file mode 100644 index ded0878..0000000 --- a/arch_analysis/osaca_force_soa_lt1200.txt +++ /dev/null @@ -1,69 +0,0 @@ -iwia021h@testfront1:~/MD-Bench/asm$ /home/hpc/iwia/iwia021h/.local/bin/osaca --ignore-unknown --arch=CSX force_soa_lt1200_markers.s -Open Source Architecture Code Analyzer (OSACA) - 0.3.14 -Analyzed file: force_soa_lt1200_markers.s -Architecture: CSX -Timestamp: 2021-04-29 15:39:58 - - - P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction - * - Instruction micro-ops not bound to a port - X - No throughput/latency information for this instruction in data file - - -Combined Analysis Report ------------------------- - Port pressure in cycles - | 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD | -------------------------------------------------------------------------------------------------- - 253 | | | | | | | | || | | # LOE rax rdx rcx rbp rsi rdi r8 r10 r11 r12 r14 ebx r9d r13d xmm8 xmm9 xmm10 ymm17 ymm18 zmm0 zmm1 zmm2 zmm7 zmm11 zmm12 zmm13 zmm14 zmm15 zmm16 zmm19 - 254 | | | | | | | | || | | ..B1.22: # Preds ..B1.22 ..B1.21 - 255 | | | | | | | | || | | # Execution count [2.50e+01] - 256 | | | | | | | | || | | X vpcmpeqb %xmm0, %xmm0, %k2 #70.36 - 257 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | addl $8, %r9d #67.9 - 258 | | | | | | | | || | | X vpcmpeqb %xmm0, %xmm0, %k1 #69.36 - 259 | | | | | | | | || | | X vpcmpeqb %xmm0, %xmm0, %k3 #71.36 - 260 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovdqu (%rcx,%r14,4), %ymm3 #68.21 - 261 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | addq $8, %r14 #67.9 - 262 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm5, %zmm5, %zmm5 #70.36 - 263 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm4, %zmm4, %zmm4 #69.36 - 264 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm6, %zmm6, %zmm6 #71.36 - 265 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || 4.0 | | vgatherdpd (%rax,%ymm3,8), %zmm5{%k2} #70.36 - 266 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || | | vgatherdpd (%rdx,%ymm3,8), %zmm4{%k1} #69.36 - 267 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || | | vgatherdpd (%rsi,%ymm3,8), %zmm6{%k3} #71.36 - 268 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm5, %zmm1, %zmm29 #70.36 - 269 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm4, %zmm0, %zmm28 #69.36 - 270 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm6, %zmm2, %zmm31 #71.36 - 271 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm29, %zmm29, %zmm20 #72.49 - 272 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm28, %zmm28, %zmm20 #72.49 - 273 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm31, %zmm31, %zmm20 #72.63 - 274 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm20, %zmm27 #75.38 - 275 | | | | | | 1.00 | | || | | vcmppd $1, %zmm16, %zmm20, %k5 #74.22 - 276 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm27, %k0 #75.38 - 277 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm27, %zmm20 #75.38 - 278 | 1.00 | | | | | | | || | | knotw %k0, %k4 #75.38 - 279 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm20, %zmm21 #75.38 - 280 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm27, %zmm20, %zmm27{%k4} #75.38 - 281 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd213pd %zmm27, %zmm21, %zmm27{%k4} #75.38 - 282 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm15, %zmm27, %zmm22 #76.38 - 283 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm14, %zmm27, %zmm24 #77.54 - 284 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm22, %zmm27, %zmm25 #76.44 - 285 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm25, %zmm27, %zmm23 #76.50 - 286 | 0.50 | | | | | 0.50 | | || | | vfmsub213pd %zmm7, %zmm25, %zmm27 #77.54 - 287 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm24, %zmm23, %zmm26 #77.61 - 288 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm27, %zmm26, %zmm30 #77.67 - 289 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm28, %zmm30, %zmm13{%k5} #78.17 - 290 | 0.00 | | | | | 1.00 | | || | 4.0 | vfmadd231pd %zmm29, %zmm30, %zmm12{%k5} #79.17 - 291 | 0.00 | | | | | 1.00 | | || 4.0 | | vfmadd231pd %zmm31, %zmm30, %zmm11{%k5} #80.17 - 292 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | cmpl %ebx, %r9d #67.9 - 293 | | | | | | | | || | | * jb ..B1.22 # Prob 82% #67.9 - - 17.5 3.00 13.0 2.50 13.0 2.50 17.5 3.00 68.0 4 - - -Loop-Carried Dependencies Analysis Report ------------------------------------------ - 257 | 1.0 | addl $8, %r9d #67.9| [257] - 261 | 1.0 | addq $8, %r14 #67.9| [261] - 290 | 4.0 | vfmadd231pd %zmm29, %zmm30, %zmm12{%k5} #79.17| [290] - 289 | 4.0 | vfmadd231pd %zmm28, %zmm30, %zmm13{%k5} #78.17| [289] - 291 | 4.0 | vfmadd231pd %zmm31, %zmm30, %zmm11{%k5} #80.17| [291] diff --git a/arch_analysis/osaca_force_soa_lt8.txt b/arch_analysis/osaca_force_soa_lt8.txt deleted file mode 100644 index bf328fa..0000000 --- a/arch_analysis/osaca_force_soa_lt8.txt +++ /dev/null @@ -1,79 +0,0 @@ -iwia021h@testfront1:~/MD-Bench/asm$ /home/hpc/iwia/iwia021h/.local/bin/osaca --ignore-unknown --arch=CSX force_soa_lt8_markers.s -Open Source Architecture Code Analyzer (OSACA) - 0.3.14 -Analyzed file: force_soa_lt8_markers.s -Architecture: CSX -Timestamp: 2021-04-29 15:52:48 - - - P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction - * - Instruction micro-ops not bound to a port - X - No throughput/latency information for this instruction in data file - - -Combined Analysis Report ------------------------- - Port pressure in cycles - | 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD | -------------------------------------------------------------------------------------------------- - 300 | | | | | | | | || | | # LOE rax rdx rbp rsi rdi r8 r10 r11 r12 ebx r13d xmm8 xmm9 xmm10 ymm17 ymm18 zmm7 zmm11 zmm12 zmm13 zmm14 zmm15 zmm16 zmm19 - 301 | | | | | | | | || | | ..B1.25: # Preds ..B1.24 - 302 | | | | | | | | || | | # Execution count [2.50e+01] - 303 | | 1.00 | | | | | | || 3.0 | | imulq %r12, %r8 #56.43 - 304 | | | | | | 1.00 | | || | | vbroadcastsd %xmm9, %zmm9 #59.23 - 305 | | | | | | 1.00 | | || | 3.0 | vbroadcastsd %xmm8, %zmm2 #58.23 - 306 | | | | | | 1.00 | | || | | vbroadcastsd %xmm10, %zmm10 #60.23 - 307 | | | | | | | | || | | X subl %ebx, %r13d #67.9 - 308 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || 1.0 | | addq %r11, %r8 #37.5 - 309 | | | | | | | | || | | X vpbroadcastd %r13d, %ymm0 #67.9 - 310 | | | | | | 1.00 | | || | | vpcmpgtd %ymm17, %ymm0, %k5 #67.9 - 311 | 0.00 | 0.16 | | | | 0.00 | 0.84 | || | | movslq %ebx, %rbx #67.9 - 312 | | | | | | | | || | | * vmovaps %zmm19, %zmm4 #70.36 - 313 | 1.00 | | | | | | | || | | kmovw %k5, %k2 #70.36 - 314 | | | | | | | | || | | * vmovaps %zmm19, %zmm3 #69.36 - 315 | 0.00 | 1.00 | 0.50 0.50 | 0.50 0.50 | | 0.00 | | || 0.0 | | vmovdqu32 (%r8,%rbx,4), %ymm1{%k5}{z} #68.21 - 316 | 1.00 | | | | | | | || | | kmovw %k5, %k1 #69.36 - 317 | | | | | | | | || | | * vmovaps %zmm19, %zmm5 #71.36 - 318 | 1.00 | | | | | | | || | | kmovw %k5, %k3 #71.36 - 319 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || | | vgatherdpd (%rsi,%ymm1,8), %zmm5{%k3} #71.36 - 320 | 1.50 | 0.34 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.66 | || 4.0 | | vgatherdpd (%rax,%ymm1,8), %zmm4{%k2} #70.36 - 321 | 1.50 | 0.00 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 1.00 | || | | vgatherdpd (%rdx,%ymm1,8), %zmm3{%k1} #69.36 - 322 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm5, %zmm10, %zmm30 #71.36 - 323 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm4, %zmm9, %zmm28 #70.36 - 324 | 0.50 | | | | | 0.50 | | || | 4.0 | vsubpd %zmm3, %zmm2, %zmm27 #69.36 - 325 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm28, %zmm28, %zmm26 #72.49 - 326 | 0.50 | | | | | 0.50 | | || 4.0 | 4.0 | vfmadd231pd %zmm27, %zmm27, %zmm26 #72.49 - 327 | 0.50 | | | | | 0.50 | | || 4.0 | 4.0 | vfmadd231pd %zmm30, %zmm30, %zmm26 #72.63 - 328 | 2.50 | | | | | 0.50 | | || 8.0 | 8.0 | vrcp14pd %zmm26, %zmm25 #75.38 - 329 | | | | | | 1.00 | | || | | vcmppd $1, %zmm16, %zmm26, %k6{%k5} #74.22 - 330 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm25, %k0 #75.38 - 331 | | | | | | | | || | | * vmovaps %zmm26, %zmm6 #75.38 - 332 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | 4.0 | vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm25, %zmm6 #75.38 - 333 | 1.00 | | | | | | | || | | knotw %k0, %k4 #75.38 - 334 | 0.50 | | | | | 0.50 | | || 4.0 | 4.0 | vmulpd %zmm6, %zmm6, %zmm8 #75.38 - 335 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm25, %zmm6, %zmm25{%k4} #75.38 - 336 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd213pd %zmm25, %zmm8, %zmm25{%k4} #75.38 - 337 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm15, %zmm25, %zmm20 #76.38 - 338 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm14, %zmm25, %zmm22 #77.54 - 339 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm25, %zmm23 #76.44 - 340 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm23, %zmm25, %zmm21 #76.50 - 341 | 0.50 | | | | | 0.50 | | || | | vfmsub213pd %zmm7, %zmm23, %zmm25 #77.54 - 342 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm22, %zmm21, %zmm24 #77.61 - 343 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm25, %zmm24, %zmm29 #77.67 - 344 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm27, %zmm29, %zmm13{%k6} #78.17 - 345 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm28, %zmm29, %zmm12{%k6} #79.17 - 346 | 0.00 | | | | | 1.00 | | || 4.0 | | vfmadd231pd %zmm30, %zmm29, %zmm11{%k6} #80.17 - - 19.5 3.50 13.0 2.50 13.0 2.50 19.5 3.50 68.0 31.0 - - -Loop-Carried Dependencies Analysis Report ------------------------------------------ - 308 | 4.0 | addq %r11, %r8 #37.5| [303, 308] - 304 | 3.0 | vbroadcastsd %xmm9, %zmm9 #59.23| [304] - 306 | 3.0 | vbroadcastsd %xmm10, %zmm10 #60.23| [306] - 307 | 0.0 | subl %ebx, %r13d #67.9| [307] - 311 | 1.0 | movslq %ebx, %rbx #67.9| [311] - 346 | 4.0 | vfmadd231pd %zmm30, %zmm29, %zmm11{%k6} #80.17| [346] - 345 | 4.0 | vfmadd231pd %zmm28, %zmm29, %zmm12{%k6} #79.17| [345] - 344 | 4.0 | vfmadd231pd %zmm27, %zmm29, %zmm13{%k6} #78.17| [344] - 334 | 31.0 | vmulpd %zmm6, %zmm6, %zmm8 #75.38| [305, 324, 326, 327, 328, 332, 334] diff --git a/arch_analysis/osaca_force_soa_lt8_iln1000.txt b/arch_analysis/osaca_force_soa_lt8_iln1000.txt deleted file mode 100644 index 8ff88d5..0000000 --- a/arch_analysis/osaca_force_soa_lt8_iln1000.txt +++ /dev/null @@ -1,70 +0,0 @@ -iwia021h@testfront1:~/MD-Bench$ /home/hpc/iwia/iwia021h/.local/bin/osaca --ignore-unknown --arch=CSX ICC/force.s -Open Source Architecture Code Analyzer (OSACA) - 0.3.14 -Analyzed file: ICC/force.s -Architecture: CSX -Timestamp: 2021-04-30 16:08:44 - - - P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction - * - Instruction micro-ops not bound to a port - X - No throughput/latency information for this instruction in data file - - -Combined Analysis Report ------------------------- - Port pressure in cycles - | 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD | -------------------------------------------------------------------------------------------------- - 306 | | | | | | | | || | | # LOE rbp rdi r8 r9 r10 edx ecx r11d r12d r13d r14d r15d ymm13 ymm14 zmm2 zmm3 zmm4 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm11 zmm12 zmm15 - 307 | | | | | | | | || | | ..B1.29: # Preds ..B1.28 - 308 | | | | | | | | || | | # Execution count [2.50e+04] - 309 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | movl %r14d, %eax #64.13 - 310 | | | | | | | | || | | X subl %ecx, %eax #64.13 - 311 | | | | | | | | || | | X vpbroadcastd %eax, %ymm0 #64.13 - 312 | | | | | | 1.00 | | || | | vpcmpgtd %ymm14, %ymm0, %k5 #64.13 - 313 | 0.00 | 0.16 | | | | 0.00 | 0.84 | || | | movslq %ecx, %rcx #64.13 - 314 | | | | | | | | || | | * vmovaps %zmm15, %zmm17 #67.40 - 315 | 1.00 | | | | | | | || | | kmovw %k5, %k2 #67.40 - 316 | | | | | | | | || | | * vmovaps %zmm15, %zmm16 #66.40 - 317 | 0.00 | 1.00 | 0.50 0.50 | 0.50 0.50 | | 0.00 | | || 4.0 | | vmovdqu32 (%rdi,%rcx,4), %ymm1{%k5}{z} #65.25 - 318 | 1.00 | | | | | | | || | | kmovw %k5, %k1 #66.40 - 319 | | | | | | | | || | | * vmovaps %zmm15, %zmm18 #68.40 - 320 | 1.00 | | | | | | | || | | kmovw %k5, %k3 #68.40 - 321 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || | | vgatherdpd (%r8,%ymm1,8), %zmm18{%k3} #68.40 - 322 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || 4.0 | | vgatherdpd (%r9,%ymm1,8), %zmm17{%k2} #67.40 - 323 | 1.50 | 0.34 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.66 | || | | vgatherdpd (%r10,%ymm1,8), %zmm16{%k1} #66.40 - 324 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm18, %zmm3, %zmm31 #68.40 - 325 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm17, %zmm2, %zmm29 #67.40 - 326 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm16, %zmm4, %zmm28 #66.40 - 327 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm29, %zmm29, %zmm27 #69.53 - 328 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm28, %zmm28, %zmm27 #69.53 - 329 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm31, %zmm31, %zmm27 #69.67 - 330 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm27, %zmm26 #72.42 - 331 | | | | | | 1.00 | | || | | vcmppd $1, %zmm12, %zmm27, %k6{%k5} #71.26 - 332 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm26, %k0 #72.42 - 333 | | | | | | | | || | | * vmovaps %zmm27, %zmm19 #72.42 - 334 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm26, %zmm19 #72.42 - 335 | 1.00 | | | | | | | || | | knotw %k0, %k4 #72.42 - 336 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm19, %zmm19, %zmm20 #72.42 - 337 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm26, %zmm19, %zmm26{%k4} #72.42 - 338 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd213pd %zmm26, %zmm20, %zmm26{%k4} #72.42 - 339 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm11, %zmm26, %zmm21 #73.42 - 340 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm9, %zmm26, %zmm23 #74.58 - 341 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm21, %zmm26, %zmm24 #73.48 - 342 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm24, %zmm26, %zmm22 #73.54 - 343 | 0.00 | | | | | 1.00 | | || | | vfmsub213pd %zmm10, %zmm24, %zmm26 #74.58 - 344 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm23, %zmm22, %zmm25 #74.65 - 345 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm26, %zmm25, %zmm30 #74.71 - 346 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm28, %zmm30, %zmm7{%k6} #75.21 - 347 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm29, %zmm30, %zmm6{%k6} #76.21 - 348 | 0.00 | | | | | 1.00 | | || 4.0 | 4.0 | vfmadd231pd %zmm31, %zmm30, %zmm5{%k6} #77.21 - - 18.0 3.00 13.0 2.50 13.0 2.50 18.0 3.00 68.0 4 - - -Loop-Carried Dependencies Analysis Report ------------------------------------------ - 313 | 1.0 | movslq %ecx, %rcx #64.13| [313] - 348 | 4.0 | vfmadd231pd %zmm31, %zmm30, %zmm5{%k6} #77.21| [348] - 347 | 4.0 | vfmadd231pd %zmm29, %zmm30, %zmm6{%k6} #76.21| [347] - 346 | 4.0 | vfmadd231pd %zmm28, %zmm30, %zmm7{%k6} #75.21| [346]