Update results for arch_analysis and stub script
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
This commit is contained in:
79
arch_analysis/iaca_force_aos_geq1200.txt
Normal file
79
arch_analysis/iaca_force_aos_geq1200.txt
Normal file
@@ -0,0 +1,79 @@
|
||||
iwia021h@testfront1:~/MD-Bench/asm$ iaca -arch SKX force_aos_geq1200_markers.o
|
||||
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||
Analyzed File - force_aos_geq1200_markers.o
|
||||
Binary Format - 64Bit
|
||||
Architecture - SKX
|
||||
Analysis Type - Throughput
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
Block Throughput: 33.05 Cycles Throughput Bottleneck: Backend
|
||||
Loop Count: 22
|
||||
Port Binding In Cycles Per Iteration:
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Cycles | 20.0 0.0 | 4.5 | 13.0 13.0 | 13.0 13.0 | 0.0 | 18.0 | 4.5 | 0.0 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
|
||||
DV - Divider pipe (on port 0)
|
||||
D - Data fetch pipe (on ports 2 and 3)
|
||||
F - Macro Fusion with the previous instruction occurred
|
||||
* - instruction micro-ops not bound to a port
|
||||
^ - Micro Fusion occurred
|
||||
# - ESP Tracking sync uop was issued
|
||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||
X - instruction not supported, was not accounted in Analysis
|
||||
|
||||
| Num Of | Ports pressure in cycles | |
|
||||
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
-----------------------------------------------------------------------------------------
|
||||
| 1 | | | | | | 1.0 | | | vpcmpgtd k3, ymm2, ymm3
|
||||
| 2 | | 1.0 | 1.0 1.0 | | | | | | vmovdqu32 ymm17{k3}{z}, ymmword ptr [r15+r13*4]
|
||||
| 1 | 1.0 | | | | | | | | kmovw r9d, k3
|
||||
| 1 | | 1.0 | | | | | | | vpaddd ymm18, ymm17, ymm17
|
||||
| 1 | | 1.0 | | | | | | | vpaddd ymm17, ymm17, ymm18
|
||||
| 1 | 1.0 | | | | | | | | kmovw k1, k3
|
||||
| 1 | 1.0 | | | | | | | | kmovw k2, k3
|
||||
| 1* | | | | | | | | | vpxord zmm18, zmm18, zmm18
|
||||
| 1* | | | | | | | | | vpxord zmm19, zmm19, zmm19
|
||||
| 1* | | | | | | | | | vpxord zmm20, zmm20, zmm20
|
||||
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm18, k1, zmmword ptr [rdi+ymm17*8+0x10]
|
||||
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm19, k2, zmmword ptr [rdi+ymm17*8+0x8]
|
||||
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm20, k3, zmmword ptr [rdi+ymm17*8]
|
||||
| 1 | | 0.5 | | | | | 0.5 | | add r13, 0x8
|
||||
| 1 | | 1.0 | | | | | | | vpaddd ymm3, ymm3, ymm16
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm29, zmm4, zmm18
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm27, zmm0, zmm19
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm26, zmm1, zmm20
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm25, zmm27, zmm27
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm25, zmm26, zmm26
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm25, zmm29, zmm29
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm24, zmm25
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k2, zmm25, zmm14, 0x1
|
||||
| 1 | | | | | | 1.0 | | | vfpclasspd k0, zmm24, 0x1e
|
||||
| 1 | 1.0 | | | | | | | | kmovw edx, k2
|
||||
| 1 | 1.0 | | | | | | | | knotw k1, k0
|
||||
| 1* | | | | | | | | | vmovaps zmm17, zmm25
|
||||
| 1 | | | | | | | 1.0 | | and r9d, edx
|
||||
| 2^ | | | | 1.0 1.0 | | 1.0 | | | vfnmadd213pd zmm17, zmm24, qword ptr [rip]{1to8}
|
||||
| 1 | | | | | | 1.0 | | | kmovw k3, r9d
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm17, zmm17
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm24{k1}, zmm17, zmm24
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm24{k1}, zmm18, zmm24
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm19, zmm24, zmm13
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm21, zmm24, zmm10
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm22, zmm24, zmm19
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm20, zmm24, zmm22
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213pd zmm24, zmm22, zmm5
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm23, zmm20, zmm21
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm28, zmm23, zmm24
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm9{k3}, zmm28, zmm26
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm8{k3}, zmm28, zmm27
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm11{k3}, zmm28, zmm29
|
||||
| 1* | | | | | | | | | cmp r13, rbx
|
||||
| 0*F | | | | | | | | | jb 0xfffffffffffffef7
|
||||
Total Num Of Uops: 60
|
||||
Analysis Notes:
|
||||
Backend allocation was stalled due to unavailable allocation resources.
|
||||
There were bubbles in the frontend.
|
82
arch_analysis/iaca_force_aos_lt8.txt
Normal file
82
arch_analysis/iaca_force_aos_lt8.txt
Normal file
@@ -0,0 +1,82 @@
|
||||
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||
Analyzed File - force_aos_lt8_markers.o
|
||||
Binary Format - 64Bit
|
||||
Architecture - SKX
|
||||
Analysis Type - Throughput
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
Block Throughput: 69.79 Cycles Throughput Bottleneck: Backend
|
||||
Loop Count: 22
|
||||
Port Binding In Cycles Per Iteration:
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Cycles | 21.0 0.0 | 5.5 | 13.0 13.0 | 13.0 13.0 | 0.0 | 21.0 | 5.5 | 0.0 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
|
||||
DV - Divider pipe (on port 0)
|
||||
D - Data fetch pipe (on ports 2 and 3)
|
||||
F - Macro Fusion with the previous instruction occurred
|
||||
* - instruction micro-ops not bound to a port
|
||||
^ - Micro Fusion occurred
|
||||
# - ESP Tracking sync uop was issued
|
||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||
X - instruction not supported, was not accounted in Analysis
|
||||
|
||||
| Num Of | Ports pressure in cycles | |
|
||||
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
-----------------------------------------------------------------------------------------
|
||||
| 1 | | 1.0 | | | | | | | imul rcx, r8
|
||||
| 1 | | | | | | 1.0 | | | vbroadcastsd zmm4, xmm6
|
||||
| 1 | | | | | | | 1.0 | | sub r11d, r14d
|
||||
| 1 | | 0.5 | | | | | 0.5 | | add rcx, r10
|
||||
| 1 | | | | | | 1.0 | | | vpbroadcastd ymm0, r11d
|
||||
| 1 | | | | | | 1.0 | | | vpcmpgtd k3, ymm0, ymm15
|
||||
| 1 | | 0.5 | | | | | 0.5 | | movsxd r14, r14d
|
||||
| 1 | 1.0 | | | | | | | | kmovw ebx, k3
|
||||
| 2 | | 1.0 | 1.0 1.0 | | | | | | vmovdqu32 ymm1{k3}{z}, ymmword ptr [rcx+r14*4]
|
||||
| 1 | | 1.0 | | | | | | | vpaddd ymm2, ymm1, ymm1
|
||||
| 1 | | 1.0 | | | | | | | vpaddd ymm0, ymm1, ymm2
|
||||
| 1 | 1.0 | | | | | | | | kmovw k1, k3
|
||||
| 1 | 1.0 | | | | | | | | kmovw k2, k3
|
||||
| 1* | | | | | | | | | vpxord zmm1, zmm1, zmm1
|
||||
| 1* | | | | | | | | | vpxord zmm2, zmm2, zmm2
|
||||
| 1* | | | | | | | | | vpxord zmm3, zmm3, zmm3
|
||||
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm1, k1, zmmword ptr [rdi+ymm0*8+0x10]
|
||||
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm2, k2, zmmword ptr [rdi+ymm0*8+0x8]
|
||||
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm3, k3, zmmword ptr [rdi+ymm0*8]
|
||||
| 1 | | | | | | 1.0 | | | vbroadcastsd zmm7, xmm7
|
||||
| 1 | | | | | | 1.0 | | | vbroadcastsd zmm12, xmm12
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm23, zmm12, zmm1
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm21, zmm7, zmm2
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm20, zmm4, zmm3
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm19, zmm21, zmm21
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm19, zmm20, zmm20
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm19, zmm23, zmm23
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm18, zmm19
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k2, zmm19, zmm14, 0x1
|
||||
| 1 | | | | | | 1.0 | | | vfpclasspd k0, zmm18, 0x1e
|
||||
| 1 | 1.0 | | | | | | | | kmovw ecx, k2
|
||||
| 1 | 1.0 | | | | | | | | knotw k1, k0
|
||||
| 1* | | | | | | | | | vmovaps zmm0, zmm19
|
||||
| 1 | | 0.5 | | | | | 0.5 | | and ebx, ecx
|
||||
| 2^ | | | | 1.0 1.0 | | 1.0 | | | vfnmadd213pd zmm0, zmm18, qword ptr [rip]{1to8}
|
||||
| 1 | | | | | | 1.0 | | | kmovw k3, ebx
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm1, zmm0, zmm0
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm18{k1}, zmm0, zmm18
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm18{k1}, zmm1, zmm18
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm2, zmm18, zmm13
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm4, zmm18, zmm10
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm6, zmm18, zmm2
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm3, zmm18, zmm6
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213pd zmm18, zmm6, zmm5
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm17, zmm3, zmm4
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm22, zmm17, zmm18
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm9{k3}, zmm22, zmm20
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm8{k3}, zmm22, zmm21
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm11{k3}, zmm22, zmm23
|
||||
Total Num Of Uops: 65
|
||||
Analysis Notes:
|
||||
Backend allocation was stalled due to unavailable allocation resources.
|
||||
There were bubbles in the frontend.
|
74
arch_analysis/iaca_force_soa_geq1200.txt
Normal file
74
arch_analysis/iaca_force_soa_geq1200.txt
Normal file
@@ -0,0 +1,74 @@
|
||||
iwia021h@testfront1:~/MD-Bench/asm$ iaca -arch SKX force_soa_geq1200_markers.o
|
||||
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||
Analyzed File - force_soa_geq1200_markers.o
|
||||
Binary Format - 64Bit
|
||||
Architecture - SKX
|
||||
Analysis Type - Throughput
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
Block Throughput: 31.47 Cycles Throughput Bottleneck: Backend
|
||||
Loop Count: 22
|
||||
Port Binding In Cycles Per Iteration:
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Cycles | 18.0 0.0 | 3.0 | 13.0 13.0 | 13.0 13.0 | 0.0 | 18.0 | 3.0 | 0.0 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
|
||||
DV - Divider pipe (on port 0)
|
||||
D - Data fetch pipe (on ports 2 and 3)
|
||||
F - Macro Fusion with the previous instruction occurred
|
||||
* - instruction micro-ops not bound to a port
|
||||
^ - Micro Fusion occurred
|
||||
# - ESP Tracking sync uop was issued
|
||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||
X - instruction not supported, was not accounted in Analysis
|
||||
|
||||
| Num Of | Ports pressure in cycles | |
|
||||
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
-----------------------------------------------------------------------------------------
|
||||
| 1 | | | | | | 1.0 | | | vpcmpgtd k5, ymm3, ymm4
|
||||
| 1 | | 1.0 | | | | | | | vpaddd ymm4, ymm4, ymm18
|
||||
| 2 | | 1.0 | 1.0 1.0 | | | | | | vmovdqu32 ymm20{k5}{z}, ymmword ptr [rcx+r15*4]
|
||||
| 1* | | | | | | | | | vmovaps zmm22, zmm19
|
||||
| 1 | | 1.0 | | | | | | | add r15, 0x8
|
||||
| 1 | 1.0 | | | | | | | | kmovw k2, k5
|
||||
| 1* | | | | | | | | | vmovaps zmm21, zmm19
|
||||
| 1 | 1.0 | | | | | | | | kmovw k1, k5
|
||||
| 1* | | | | | | | | | vmovaps zmm23, zmm19
|
||||
| 1 | 1.0 | | | | | | | | kmovw k3, k5
|
||||
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm23, k3, zmmword ptr [rsi+ymm20*8]
|
||||
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm22, k2, zmmword ptr [rax+ymm20*8]
|
||||
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm21, k1, zmmword ptr [rdx+ymm20*8]
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm0, zmm5, zmm22
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm1, zmm2, zmm21
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm21, zmm6, zmm23
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm20, zmm0, zmm0
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm20, zmm1, zmm1
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm20, zmm21, zmm21
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm31, zmm20
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k6{k5}, zmm20, zmm16, 0x1
|
||||
| 1 | | | | | | 1.0 | | | vfpclasspd k0, zmm31, 0x1e
|
||||
| 1* | | | | | | | | | vmovaps zmm24, zmm20
|
||||
| 2^ | 1.0 | | | 1.0 1.0 | | | | | vfnmadd213pd zmm24, zmm31, qword ptr [rip]{1to8}
|
||||
| 1 | 1.0 | | | | | | | | knotw k4, k0
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm25, zmm24, zmm24
|
||||
| 1 | | | | | | 1.0 | | | vfmadd213pd zmm31{k4}, zmm24, zmm31
|
||||
| 1 | 1.0 | | | | | | | | vfmadd213pd zmm31{k4}, zmm25, zmm31
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm26, zmm31, zmm15
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm28, zmm31, zmm14
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm29, zmm31, zmm26
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm27, zmm31, zmm29
|
||||
| 1 | | | | | | 1.0 | | | vfmsub213pd zmm31, zmm29, zmm7
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm30, zmm27, zmm28
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm24, zmm30, zmm31
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm13{k6}, zmm24, zmm1
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm12{k6}, zmm24, zmm0
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm11{k6}, zmm24, zmm21
|
||||
| 1* | | | | | | | | | cmp r15, r14
|
||||
| 0*F | | | | | | | | | jb 0xffffffffffffff19
|
||||
Total Num Of Uops: 55
|
||||
Analysis Notes:
|
||||
Backend allocation was stalled due to unavailable allocation resources.
|
||||
There were bubbles in the frontend.
|
72
arch_analysis/iaca_force_soa_lt1200.txt
Normal file
72
arch_analysis/iaca_force_soa_lt1200.txt
Normal file
@@ -0,0 +1,72 @@
|
||||
iwia021h@testfront1:~/MD-Bench/asm$ iaca -arch SKX force_soa_lt1200_markers.o
|
||||
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||
Analyzed File - force_soa_lt1200_markers.o
|
||||
Binary Format - 64Bit
|
||||
Architecture - SKX
|
||||
Analysis Type - Throughput
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
Block Throughput: 30.25 Cycles Throughput Bottleneck: Backend
|
||||
Loop Count: 23
|
||||
Port Binding In Cycles Per Iteration:
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Cycles | 16.0 0.0 | 2.0 | 13.0 13.0 | 13.0 13.0 | 0.0 | 19.0 | 3.0 | 0.0 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
|
||||
DV - Divider pipe (on port 0)
|
||||
D - Data fetch pipe (on ports 2 and 3)
|
||||
F - Macro Fusion with the previous instruction occurred
|
||||
* - instruction micro-ops not bound to a port
|
||||
^ - Micro Fusion occurred
|
||||
# - ESP Tracking sync uop was issued
|
||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||
X - instruction not supported, was not accounted in Analysis
|
||||
|
||||
| Num Of | Ports pressure in cycles | |
|
||||
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
-----------------------------------------------------------------------------------------
|
||||
| 1 | | | | | | 1.0 | | | vpcmpeqb k2, xmm0, xmm0
|
||||
| 1 | | 1.0 | | | | | | | add r9d, 0x8
|
||||
| 1 | | | | | | 1.0 | | | vpcmpeqb k1, xmm0, xmm0
|
||||
| 1 | | | | | | 1.0 | | | vpcmpeqb k3, xmm0, xmm0
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovdqu ymm3, ymmword ptr [rcx+r14*4]
|
||||
| 1 | | 1.0 | | | | | | | add r14, 0x8
|
||||
| 1* | | | | | | | | | vpxord zmm5, zmm5, zmm5
|
||||
| 1* | | | | | | | | | vpxord zmm4, zmm4, zmm4
|
||||
| 1* | | | | | | | | | vpxord zmm6, zmm6, zmm6
|
||||
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm5, k2, zmmword ptr [rax+ymm3*8]
|
||||
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm4, k1, zmmword ptr [rdx+ymm3*8]
|
||||
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm6, k3, zmmword ptr [rsi+ymm3*8]
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm29, zmm1, zmm5
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm28, zmm0, zmm4
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm31, zmm2, zmm6
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm20, zmm29, zmm29
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm20, zmm28, zmm28
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm20, zmm31, zmm31
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm27, zmm20
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k5, zmm20, zmm16, 0x1
|
||||
| 1 | | | | | | 1.0 | | | vfpclasspd k0, zmm27, 0x1e
|
||||
| 2^ | 1.0 | | | 1.0 1.0 | | | | | vfnmadd213pd zmm20, zmm27, qword ptr [rip]{1to8}
|
||||
| 1 | 1.0 | | | | | | | | knotw k4, k0
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm21, zmm20, zmm20
|
||||
| 1 | | | | | | 1.0 | | | vfmadd213pd zmm27{k4}, zmm20, zmm27
|
||||
| 1 | 1.0 | | | | | | | | vfmadd213pd zmm27{k4}, zmm21, zmm27
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm22, zmm27, zmm15
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm24, zmm27, zmm14
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm25, zmm27, zmm22
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm23, zmm27, zmm25
|
||||
| 1 | | | | | | 1.0 | | | vfmsub213pd zmm27, zmm25, zmm7
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm26, zmm23, zmm24
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm30, zmm26, zmm27
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm13{k5}, zmm30, zmm28
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm12{k5}, zmm30, zmm29
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm11{k5}, zmm30, zmm31
|
||||
| 1* | | | | | | | | | cmp r9d, ebx
|
||||
| 0*F | | | | | | | | | jb 0xffffffffffffff22
|
||||
Total Num Of Uops: 52
|
||||
Analysis Notes:
|
||||
Backend allocation was stalled due to unavailable allocation resources.
|
||||
There were bubbles in the frontend.
|
78
arch_analysis/iaca_force_soa_lt8.txt
Normal file
78
arch_analysis/iaca_force_soa_lt8.txt
Normal file
@@ -0,0 +1,78 @@
|
||||
iwia021h@testfront1:~/MD-Bench/asm$ iaca -arch SKX force_soa_lt8_markers.o
|
||||
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||
Analyzed File - force_soa_lt8_markers.o
|
||||
Binary Format - 64Bit
|
||||
Architecture - SKX
|
||||
Analysis Type - Throughput
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
Block Throughput: 35.00 Cycles Throughput Bottleneck: Backend
|
||||
Loop Count: 22
|
||||
Port Binding In Cycles Per Iteration:
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Cycles | 20.0 0.0 | 4.0 | 13.0 13.0 | 13.0 13.0 | 0.0 | 20.0 | 4.0 | 0.0 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
|
||||
DV - Divider pipe (on port 0)
|
||||
D - Data fetch pipe (on ports 2 and 3)
|
||||
F - Macro Fusion with the previous instruction occurred
|
||||
* - instruction micro-ops not bound to a port
|
||||
^ - Micro Fusion occurred
|
||||
# - ESP Tracking sync uop was issued
|
||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||
X - instruction not supported, was not accounted in Analysis
|
||||
|
||||
| Num Of | Ports pressure in cycles | |
|
||||
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
-----------------------------------------------------------------------------------------
|
||||
| 1 | | 1.0 | | | | | | | imul r8, r12
|
||||
| 1 | | | | | | 1.0 | | | vbroadcastsd zmm9, xmm9
|
||||
| 1 | | | | | | 1.0 | | | vbroadcastsd zmm2, xmm8
|
||||
| 1 | | | | | | 1.0 | | | vbroadcastsd zmm10, xmm10
|
||||
| 1 | | 1.0 | | | | | | | sub r13d, ebx
|
||||
| 1 | | 1.0 | | | | | | | add r8, r11
|
||||
| 1 | | | | | | 1.0 | | | vpbroadcastd ymm0, r13d
|
||||
| 1 | | | | | | 1.0 | | | vpcmpgtd k5, ymm0, ymm17
|
||||
| 1 | | | | | | | 1.0 | | movsxd rbx, ebx
|
||||
| 1* | | | | | | | | | vmovaps zmm4, zmm19
|
||||
| 1 | 1.0 | | | | | | | | kmovw k2, k5
|
||||
| 1* | | | | | | | | | vmovaps zmm3, zmm19
|
||||
| 2 | | 1.0 | 1.0 1.0 | | | | | | vmovdqu32 ymm1{k5}{z}, ymmword ptr [r8+rbx*4]
|
||||
| 1 | 1.0 | | | | | | | | kmovw k1, k5
|
||||
| 1* | | | | | | | | | vmovaps zmm5, zmm19
|
||||
| 1 | 1.0 | | | | | | | | kmovw k3, k5
|
||||
| 5^ | 2.0 | | 4.0 4.0 | 4.0 4.0 | | | 1.0 | | vgatherdpd zmm5, k3, zmmword ptr [rsi+ymm1*8]
|
||||
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm4, k2, zmmword ptr [rax+ymm1*8]
|
||||
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm3, k1, zmmword ptr [rdx+ymm1*8]
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm30, zmm10, zmm5
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm28, zmm9, zmm4
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm27, zmm2, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm26, zmm28, zmm28
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm26, zmm27, zmm27
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm26, zmm30, zmm30
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm25, zmm26
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k6{k5}, zmm26, zmm16, 0x1
|
||||
| 1 | | | | | | 1.0 | | | vfpclasspd k0, zmm25, 0x1e
|
||||
| 1* | | | | | | | | | vmovaps zmm6, zmm26
|
||||
| 2^ | 1.0 | | | 1.0 1.0 | | | | | vfnmadd213pd zmm6, zmm25, qword ptr [rip]{1to8}
|
||||
| 1 | 1.0 | | | | | | | | knotw k4, k0
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm8, zmm6, zmm6
|
||||
| 1 | 1.0 | | | | | | | | vfmadd213pd zmm25{k4}, zmm6, zmm25
|
||||
| 1 | | | | | | 1.0 | | | vfmadd213pd zmm25{k4}, zmm8, zmm25
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm20, zmm25, zmm15
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm22, zmm25, zmm14
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm23, zmm25, zmm20
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm21, zmm25, zmm23
|
||||
| 1 | 1.0 | | | | | | | | vfmsub213pd zmm25, zmm23, zmm7
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm24, zmm21, zmm22
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm29, zmm24, zmm25
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm13{k6}, zmm29, zmm27
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12{k6}, zmm29, zmm28
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm11{k6}, zmm29, zmm30
|
||||
Total Num Of Uops: 60
|
||||
Analysis Notes:
|
||||
Backend allocation was stalled due to unavailable allocation resources.
|
||||
There were bubbles in the frontend.
|
80
arch_analysis/osaca_force_aos_geq1200.txt
Normal file
80
arch_analysis/osaca_force_aos_geq1200.txt
Normal file
@@ -0,0 +1,80 @@
|
||||
iwia021h@testfront1:~/MD-Bench/asm$ /home/hpc/iwia/iwia021h/.local/bin/osaca --ignore-unknown --arch=CSX force_aos_geq1200_markers.s
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.3.14
|
||||
Analyzed file: force_aos_geq1200_markers.s
|
||||
Architecture: CSX
|
||||
Timestamp: 2021-04-29 15:53:50
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||
-------------------------------------------------------------------------------------------------
|
||||
196 | | | | | | 1.00 | | || | | vpcmpgtd %ymm3, %ymm2, %k3 #67.9
|
||||
197 | 0.00 | 1.00 | 0.50 0.50 | 0.50 0.50 | | 0.00 | | || 4.0 | | vmovdqu32 (%r15,%r13,4), %ymm17{%k3}{z} #68.21
|
||||
198 | 1.00 | | | | | | | || | | kmovw %k3, %r9d #67.9
|
||||
199 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm17, %ymm17, %ymm18 #69.36
|
||||
200 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm18, %ymm17, %ymm17 #69.36
|
||||
201 | | | | | | | | || | | # LOE rax rcx rbx rbp rsi rdi r8 r10 r13 r15 r9d r11d r12d r14d xmm6 xmm7 xmm12 ymm2 ymm3 ymm15 ymm16 ymm17 zmm0 zmm1 zmm4 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14 k3
|
||||
202 | | | | | | | | || | | ..B1.21: # Preds ..B1.18
|
||||
203 | | | | | | | | || | | # Execution count [1.25e+01]
|
||||
204 | 1.00 | | | | | | | || | | kmovw %k3, %k1 #69.36
|
||||
205 | 1.00 | | | | | | | || | | kmovw %k3, %k2 #69.36
|
||||
206 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm18, %zmm18, %zmm18 #69.36
|
||||
207 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm19, %zmm19, %zmm19 #69.36
|
||||
208 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm20, %zmm20, %zmm20 #69.36
|
||||
209 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || | | vgatherdpd 16(%rdi,%ymm17,8), %zmm18{%k1} #69.36
|
||||
210 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || 4.0 | | vgatherdpd 8(%rdi,%ymm17,8), %zmm19{%k2} #69.36
|
||||
211 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || | | vgatherdpd (%rdi,%ymm17,8), %zmm20{%k3} #69.36
|
||||
212 | | | | | | | | || | | # LOE rax rcx rbx rbp rsi rdi r8 r10 r13 r15 r9d r11d r12d r14d xmm6 xmm7 xmm12 ymm2 ymm3 ymm15 ymm16 zmm0 zmm1 zmm4 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14 zmm18 zmm19 zmm20
|
||||
213 | | | | | | | | || | | ..B1.22: # Preds ..B1.21
|
||||
214 | | | | | | | | || | | # Execution count [2.50e+01]
|
||||
215 | 0.00 | 0.16 | | | | 0.00 | 0.84 | || | | addq $8, %r13 #67.9
|
||||
216 | 0.00 | 1.00 | | | | 0.00 | | || | | vpaddd %ymm16, %ymm3, %ymm3 #67.9
|
||||
217 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm18, %zmm4, %zmm29 #71.36
|
||||
218 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm19, %zmm0, %zmm27 #70.36
|
||||
219 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm20, %zmm1, %zmm26 #69.36
|
||||
220 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm27, %zmm27, %zmm25 #72.49
|
||||
221 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm26, %zmm26, %zmm25 #72.49
|
||||
222 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm29, %zmm29, %zmm25 #72.63
|
||||
223 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm25, %zmm24 #75.38
|
||||
224 | | | | | | 1.00 | | || | | vcmppd $1, %zmm14, %zmm25, %k2 #74.22
|
||||
225 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm24, %k0 #75.38
|
||||
226 | 1.00 | | | | | | | || | | kmovw %k2, %edx #74.22
|
||||
227 | 1.00 | | | | | | | || | | knotw %k0, %k1 #75.38
|
||||
228 | | | | | | | | || | | * vmovaps %zmm25, %zmm17 #75.38
|
||||
229 | 0.00 | 0.34 | | | | 0.00 | 0.66 | || | | andl %edx, %r9d #74.22
|
||||
230 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm24, %zmm17 #75.38
|
||||
231 | 1.00 | | | | | | | || | | kmovw %r9d, %k3 #78.17
|
||||
232 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm17, %zmm17, %zmm18 #75.38
|
||||
233 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm24, %zmm17, %zmm24{%k1} #75.38
|
||||
234 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd213pd %zmm24, %zmm18, %zmm24{%k1} #75.38
|
||||
235 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm13, %zmm24, %zmm19 #76.38
|
||||
236 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm10, %zmm24, %zmm21 #77.54
|
||||
237 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm19, %zmm24, %zmm22 #76.44
|
||||
238 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm22, %zmm24, %zmm20 #76.50
|
||||
239 | 0.00 | | | | | 1.00 | | || | | vfmsub213pd %zmm5, %zmm22, %zmm24 #77.54
|
||||
240 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm21, %zmm20, %zmm23 #77.61
|
||||
241 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm24, %zmm23, %zmm28 #77.67
|
||||
242 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm26, %zmm28, %zmm9{%k3} #78.17
|
||||
243 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm27, %zmm28, %zmm8{%k3} #79.17
|
||||
244 | 0.00 | | | | | 1.00 | | || 4.0 | 4.0 | vfmadd231pd %zmm29, %zmm28, %zmm11{%k3} #80.17
|
||||
245 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || | | cmpq %rbx, %r13 #67.9
|
||||
246 | | | | | | | | || | | * jb ..B1.18 # Prob 82% #67.9
|
||||
|
||||
20.5 6.00 13.0 2.50 13.0 2.50 20.5 4.00 70.0 4
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
215 | 1.0 | addq $8, %r13 #67.9| [215]
|
||||
216 | 1.0 | vpaddd %ymm16, %ymm3, %ymm3 #67.9| [216]
|
||||
244 | 4.0 | vfmadd231pd %zmm29, %zmm28, %zmm11{%k3} #80.17| [244]
|
||||
243 | 4.0 | vfmadd231pd %zmm27, %zmm28, %zmm8{%k3} #79.17| [243]
|
||||
242 | 4.0 | vfmadd231pd %zmm26, %zmm28, %zmm9{%k3} #78.17| [242]
|
||||
|
91
arch_analysis/osaca_force_aos_lt8.txt
Normal file
91
arch_analysis/osaca_force_aos_lt8.txt
Normal file
@@ -0,0 +1,91 @@
|
||||
iwia021h@testfront1:~/MD-Bench/asm$ /home/hpc/iwia/iwia021h/.local/bin/osaca --ignore-unknown --arch=CSX force_aos_lt8_markers.s
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.3.14
|
||||
Analyzed file: force_aos_lt8_markers.s
|
||||
Architecture: CSX
|
||||
Timestamp: 2021-04-29 15:49:27
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||
-------------------------------------------------------------------------------------------------
|
||||
358 | | | | | | | | || | | # LOE rax rdx rcx rbp rsi rdi r8 r9 r10 r11d r14d xmm6 xmm7 xmm12 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14
|
||||
359 | | | | | | | | || | | ..B1.33: # Preds ..B1.32
|
||||
360 | | | | | | | | || | | # Execution count [2.50e+01]
|
||||
361 | | 1.00 | | | | | | || 3.0 | | imulq %r8, %rcx #56.43
|
||||
362 | | | | | | 1.00 | | || | 3.0 | vbroadcastsd %xmm6, %zmm4 #58.23
|
||||
363 | | | | | | | | || | | X subl %r14d, %r11d #67.9
|
||||
364 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || 1.0 | | addq %r10, %rcx #37.5
|
||||
365 | | | | | | | | || | | X vpbroadcastd %r11d, %ymm0 #67.9
|
||||
366 | | | | | | 1.00 | | || | | vpcmpgtd %ymm15, %ymm0, %k3 #67.9
|
||||
367 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || | | movslq %r14d, %r14 #67.9
|
||||
368 | 1.00 | | | | | | | || | | kmovw %k3, %ebx #67.9
|
||||
369 | 0.00 | 1.00 | 0.50 0.50 | 0.50 0.50 | | 0.00 | | || 0.0 | | vmovdqu32 (%rcx,%r14,4), %ymm1{%k3}{z} #68.21
|
||||
370 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm1, %ymm1, %ymm2 #69.36
|
||||
371 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm2, %ymm1, %ymm0 #69.36
|
||||
372 | | | | | | | | || | | # LOE rax rdx rbp rsi rdi r8 r9 r10 ebx xmm7 xmm12 ymm0 ymm15 ymm16 zmm4 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14 k3
|
||||
373 | | | | | | | | || | | ..B1.36: # Preds ..B1.33
|
||||
374 | | | | | | | | || | | # Execution count [1.25e+01]
|
||||
375 | 1.00 | | | | | | | || | | kmovw %k3, %k1 #69.36
|
||||
376 | 1.00 | | | | | | | || | | kmovw %k3, %k2 #69.36
|
||||
377 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm1, %zmm1, %zmm1 #69.36
|
||||
378 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm2, %zmm2, %zmm2 #69.36
|
||||
379 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm3, %zmm3, %zmm3 #69.36
|
||||
380 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || | | vgatherdpd 16(%rdi,%ymm0,8), %zmm1{%k1} #69.36
|
||||
381 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || 4.0 | | vgatherdpd 8(%rdi,%ymm0,8), %zmm2{%k2} #69.36
|
||||
382 | 1.50 | 0.00 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 1.00 | || | | vgatherdpd (%rdi,%ymm0,8), %zmm3{%k3} #69.36
|
||||
383 | | | | | | | | || | | # LOE rax rdx rbp rsi rdi r8 r9 r10 ebx xmm7 xmm12 ymm15 ymm16 zmm1 zmm2 zmm3 zmm4 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14
|
||||
384 | | | | | | | | || | | ..B1.37: # Preds ..B1.36
|
||||
385 | | | | | | | | || | | # Execution count [2.50e+01]
|
||||
386 | | | | | | 1.00 | | || | | vbroadcastsd %xmm7, %zmm7 #59.23
|
||||
387 | | | | | | 1.00 | | || | | vbroadcastsd %xmm12, %zmm12 #60.23
|
||||
388 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm1, %zmm12, %zmm23 #71.36
|
||||
389 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm2, %zmm7, %zmm21 #70.36
|
||||
390 | 0.50 | | | | | 0.50 | | || | 4.0 | vsubpd %zmm3, %zmm4, %zmm20 #69.36
|
||||
391 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm21, %zmm21, %zmm19 #72.49
|
||||
392 | 0.50 | | | | | 0.50 | | || 4.0 | 4.0 | vfmadd231pd %zmm20, %zmm20, %zmm19 #72.49
|
||||
393 | 0.50 | | | | | 0.50 | | || 4.0 | 4.0 | vfmadd231pd %zmm23, %zmm23, %zmm19 #72.63
|
||||
394 | 2.50 | | | | | 0.50 | | || 8.0 | 8.0 | vrcp14pd %zmm19, %zmm18 #75.38
|
||||
395 | | | | | | 1.00 | | || | | vcmppd $1, %zmm14, %zmm19, %k2 #74.22
|
||||
396 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm18, %k0 #75.38
|
||||
397 | 1.00 | | | | | | | || | | kmovw %k2, %ecx #74.22
|
||||
398 | 1.00 | | | | | | | || | | knotw %k0, %k1 #75.38
|
||||
399 | | | | | | | | || | | * vmovaps %zmm19, %zmm0 #75.38
|
||||
400 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || | | andl %ecx, %ebx #74.22
|
||||
401 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm18, %zmm0 #75.38
|
||||
402 | 1.00 | | | | | | | || | | kmovw %ebx, %k3 #78.17
|
||||
403 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm0, %zmm0, %zmm1 #75.38
|
||||
404 | 0.50 | | | | | 0.50 | | || | 4.0 | vfmadd213pd %zmm18, %zmm0, %zmm18{%k1} #75.38
|
||||
405 | 0.50 | | | | | 0.50 | | || 4.0 | 4.0 | vfmadd213pd %zmm18, %zmm1, %zmm18{%k1} #75.38
|
||||
406 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm13, %zmm18, %zmm2 #76.38
|
||||
407 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm10, %zmm18, %zmm4 #77.54
|
||||
408 | 0.50 | | | | | 0.50 | | || 4.0 | 4.0 | vmulpd %zmm2, %zmm18, %zmm6 #76.44
|
||||
409 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm6, %zmm18, %zmm3 #76.50
|
||||
410 | 0.50 | | | | | 0.50 | | || | | vfmsub213pd %zmm5, %zmm6, %zmm18 #77.54
|
||||
411 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm4, %zmm3, %zmm17 #77.61
|
||||
412 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm18, %zmm17, %zmm22 #77.67
|
||||
413 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm20, %zmm22, %zmm9{%k3} #78.17
|
||||
414 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm21, %zmm22, %zmm8{%k3} #79.17
|
||||
415 | 0.00 | | | | | 1.00 | | || 4.0 | | vfmadd231pd %zmm23, %zmm22, %zmm11{%k3} #80.17
|
||||
|
||||
22.0 5.00 13.0 2.50 13.0 2.50 22.0 5.00 70.0 35.0
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
363 | 0.0 | subl %r14d, %r11d #67.9| [363]
|
||||
367 | 1.0 | movslq %r14d, %r14 #67.9| [367]
|
||||
386 | 3.0 | vbroadcastsd %xmm7, %zmm7 #59.23| [386]
|
||||
387 | 3.0 | vbroadcastsd %xmm12, %zmm12 #60.23| [387]
|
||||
415 | 4.0 | vfmadd231pd %zmm23, %zmm22, %zmm11{%k3} #80.17| [415]
|
||||
414 | 4.0 | vfmadd231pd %zmm21, %zmm22, %zmm8{%k3} #79.17| [414]
|
||||
413 | 4.0 | vfmadd231pd %zmm20, %zmm22, %zmm9{%k3} #78.17| [413]
|
||||
397 | 28.0 | kmovw %k2, %ecx #74.22| [361, 364, 369, 371, 382, 390, 392, 393, 395, 397]
|
||||
408 | 35.0 | vmulpd %zmm2, %zmm18, %zmm6 #76.44| [362, 390, 392, 393, 394, 404, 405, 408]
|
||||
|
71
arch_analysis/osaca_force_soa_geq1200.txt
Normal file
71
arch_analysis/osaca_force_soa_geq1200.txt
Normal file
@@ -0,0 +1,71 @@
|
||||
iwia021h@testfront1:~/MD-Bench/asm$ /home/hpc/iwia/iwia021h/.local/bin/osaca --ignore-unknown --arch=CSX force_soa_geq1200_markers.s
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.3.14
|
||||
Analyzed file: force_soa_geq1200_markers.s
|
||||
Architecture: CSX
|
||||
Timestamp: 2021-04-29 15:54:23
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||
-------------------------------------------------------------------------------------------------
|
||||
189 | | | | | | | | || | | # LOE rax rdx rcx rbp rsi rdi r8 r10 r11 r12 r14 r15 ebx r9d r13d xmm8 xmm9 xmm10 ymm3 ymm4 ymm17 ymm18 zmm2 zmm5 zmm6 zmm7 zmm11 zmm12 zmm13 zmm14 zmm15 zmm16 zmm19
|
||||
190 | | | | | | | | || | | ..B1.18: # Preds ..B1.18 ..B1.17
|
||||
191 | | | | | | | | || | | # Execution count [2.50e+01]
|
||||
192 | | | | | | 1.00 | | || | | vpcmpgtd %ymm4, %ymm3, %k5 #67.9
|
||||
193 | 0.00 | 1.00 | | | | 0.00 | | || | | vpaddd %ymm18, %ymm4, %ymm4 #67.9
|
||||
194 | 0.00 | 1.00 | 0.50 0.50 | 0.50 0.50 | | 0.00 | | || 4.0 | | vmovdqu32 (%rcx,%r15,4), %ymm20{%k5}{z} #68.21
|
||||
195 | | | | | | | | || | | * vmovaps %zmm19, %zmm22 #70.36
|
||||
196 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | addq $8, %r15 #67.9
|
||||
197 | 1.00 | | | | | | | || | | kmovw %k5, %k2 #70.36
|
||||
198 | | | | | | | | || | | * vmovaps %zmm19, %zmm21 #69.36
|
||||
199 | 1.00 | | | | | | | || | | kmovw %k5, %k1 #69.36
|
||||
200 | | | | | | | | || | | * vmovaps %zmm19, %zmm23 #71.36
|
||||
201 | 1.00 | | | | | | | || | | kmovw %k5, %k3 #71.36
|
||||
202 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || | | vgatherdpd (%rsi,%ymm20,8), %zmm23{%k3} #71.36
|
||||
203 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || 4.0 | | vgatherdpd (%rax,%ymm20,8), %zmm22{%k2} #70.36
|
||||
204 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || | | vgatherdpd (%rdx,%ymm20,8), %zmm21{%k1} #69.36
|
||||
205 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm22, %zmm5, %zmm0 #70.36
|
||||
206 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm21, %zmm2, %zmm1 #69.36
|
||||
207 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm23, %zmm6, %zmm21 #71.36
|
||||
208 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm0, %zmm0, %zmm20 #72.49
|
||||
209 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm1, %zmm1, %zmm20 #72.49
|
||||
210 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm21, %zmm21, %zmm20 #72.63
|
||||
211 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm20, %zmm31 #75.38
|
||||
212 | | | | | | 1.00 | | || | | vcmppd $1, %zmm16, %zmm20, %k6{%k5} #74.22
|
||||
213 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm31, %k0 #75.38
|
||||
214 | | | | | | | | || | | * vmovaps %zmm20, %zmm24 #75.38
|
||||
215 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm31, %zmm24 #75.38
|
||||
216 | 1.00 | | | | | | | || | | knotw %k0, %k4 #75.38
|
||||
217 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm24, %zmm24, %zmm25 #75.38
|
||||
218 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm31, %zmm24, %zmm31{%k4} #75.38
|
||||
219 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd213pd %zmm31, %zmm25, %zmm31{%k4} #75.38
|
||||
220 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm15, %zmm31, %zmm26 #76.38
|
||||
221 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm14, %zmm31, %zmm28 #77.54
|
||||
222 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm26, %zmm31, %zmm29 #76.44
|
||||
223 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm29, %zmm31, %zmm27 #76.50
|
||||
224 | 0.00 | | | | | 1.00 | | || | | vfmsub213pd %zmm7, %zmm29, %zmm31 #77.54
|
||||
225 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm28, %zmm27, %zmm30 #77.61
|
||||
226 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm31, %zmm30, %zmm24 #77.67
|
||||
227 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm1, %zmm24, %zmm13{%k6} #78.17
|
||||
228 | 0.00 | | | | | 1.00 | | || | 4.0 | vfmadd231pd %zmm0, %zmm24, %zmm12{%k6} #79.17
|
||||
229 | 0.00 | | | | | 1.00 | | || 4.0 | | vfmadd231pd %zmm21, %zmm24, %zmm11{%k6} #80.17
|
||||
230 | 0.00 | 0.17 | | | | 0.00 | 0.83 | || | | cmpq %r14, %r15 #67.9
|
||||
231 | | | | | | | | || | | * jb ..B1.18 # Prob 82% #67.9
|
||||
|
||||
18.0 4.17 13.0 2.50 13.0 2.50 18.0 2.83 68.0 4
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
193 | 1.0 | vpaddd %ymm18, %ymm4, %ymm4 #67.9| [193]
|
||||
196 | 1.0 | addq $8, %r15 #67.9| [196]
|
||||
228 | 4.0 | vfmadd231pd %zmm0, %zmm24, %zmm12{%k6} #79.17| [228]
|
||||
227 | 4.0 | vfmadd231pd %zmm1, %zmm24, %zmm13{%k6} #78.17| [227]
|
||||
229 | 4.0 | vfmadd231pd %zmm21, %zmm24, %zmm11{%k6} #80.17| [229]
|
69
arch_analysis/osaca_force_soa_lt1200.txt
Normal file
69
arch_analysis/osaca_force_soa_lt1200.txt
Normal file
@@ -0,0 +1,69 @@
|
||||
iwia021h@testfront1:~/MD-Bench/asm$ /home/hpc/iwia/iwia021h/.local/bin/osaca --ignore-unknown --arch=CSX force_soa_lt1200_markers.s
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.3.14
|
||||
Analyzed file: force_soa_lt1200_markers.s
|
||||
Architecture: CSX
|
||||
Timestamp: 2021-04-29 15:39:58
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||
-------------------------------------------------------------------------------------------------
|
||||
253 | | | | | | | | || | | # LOE rax rdx rcx rbp rsi rdi r8 r10 r11 r12 r14 ebx r9d r13d xmm8 xmm9 xmm10 ymm17 ymm18 zmm0 zmm1 zmm2 zmm7 zmm11 zmm12 zmm13 zmm14 zmm15 zmm16 zmm19
|
||||
254 | | | | | | | | || | | ..B1.22: # Preds ..B1.22 ..B1.21
|
||||
255 | | | | | | | | || | | # Execution count [2.50e+01]
|
||||
256 | | | | | | | | || | | X vpcmpeqb %xmm0, %xmm0, %k2 #70.36
|
||||
257 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | addl $8, %r9d #67.9
|
||||
258 | | | | | | | | || | | X vpcmpeqb %xmm0, %xmm0, %k1 #69.36
|
||||
259 | | | | | | | | || | | X vpcmpeqb %xmm0, %xmm0, %k3 #71.36
|
||||
260 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovdqu (%rcx,%r14,4), %ymm3 #68.21
|
||||
261 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | addq $8, %r14 #67.9
|
||||
262 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm5, %zmm5, %zmm5 #70.36
|
||||
263 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm4, %zmm4, %zmm4 #69.36
|
||||
264 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm6, %zmm6, %zmm6 #71.36
|
||||
265 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || 4.0 | | vgatherdpd (%rax,%ymm3,8), %zmm5{%k2} #70.36
|
||||
266 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || | | vgatherdpd (%rdx,%ymm3,8), %zmm4{%k1} #69.36
|
||||
267 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || | | vgatherdpd (%rsi,%ymm3,8), %zmm6{%k3} #71.36
|
||||
268 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm5, %zmm1, %zmm29 #70.36
|
||||
269 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm4, %zmm0, %zmm28 #69.36
|
||||
270 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm6, %zmm2, %zmm31 #71.36
|
||||
271 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm29, %zmm29, %zmm20 #72.49
|
||||
272 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm28, %zmm28, %zmm20 #72.49
|
||||
273 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm31, %zmm31, %zmm20 #72.63
|
||||
274 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm20, %zmm27 #75.38
|
||||
275 | | | | | | 1.00 | | || | | vcmppd $1, %zmm16, %zmm20, %k5 #74.22
|
||||
276 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm27, %k0 #75.38
|
||||
277 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm27, %zmm20 #75.38
|
||||
278 | 1.00 | | | | | | | || | | knotw %k0, %k4 #75.38
|
||||
279 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm20, %zmm21 #75.38
|
||||
280 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm27, %zmm20, %zmm27{%k4} #75.38
|
||||
281 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd213pd %zmm27, %zmm21, %zmm27{%k4} #75.38
|
||||
282 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm15, %zmm27, %zmm22 #76.38
|
||||
283 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm14, %zmm27, %zmm24 #77.54
|
||||
284 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm22, %zmm27, %zmm25 #76.44
|
||||
285 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm25, %zmm27, %zmm23 #76.50
|
||||
286 | 0.50 | | | | | 0.50 | | || | | vfmsub213pd %zmm7, %zmm25, %zmm27 #77.54
|
||||
287 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm24, %zmm23, %zmm26 #77.61
|
||||
288 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm27, %zmm26, %zmm30 #77.67
|
||||
289 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm28, %zmm30, %zmm13{%k5} #78.17
|
||||
290 | 0.00 | | | | | 1.00 | | || | 4.0 | vfmadd231pd %zmm29, %zmm30, %zmm12{%k5} #79.17
|
||||
291 | 0.00 | | | | | 1.00 | | || 4.0 | | vfmadd231pd %zmm31, %zmm30, %zmm11{%k5} #80.17
|
||||
292 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | cmpl %ebx, %r9d #67.9
|
||||
293 | | | | | | | | || | | * jb ..B1.22 # Prob 82% #67.9
|
||||
|
||||
17.5 3.00 13.0 2.50 13.0 2.50 17.5 3.00 68.0 4
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
257 | 1.0 | addl $8, %r9d #67.9| [257]
|
||||
261 | 1.0 | addq $8, %r14 #67.9| [261]
|
||||
290 | 4.0 | vfmadd231pd %zmm29, %zmm30, %zmm12{%k5} #79.17| [290]
|
||||
289 | 4.0 | vfmadd231pd %zmm28, %zmm30, %zmm13{%k5} #78.17| [289]
|
||||
291 | 4.0 | vfmadd231pd %zmm31, %zmm30, %zmm11{%k5} #80.17| [291]
|
79
arch_analysis/osaca_force_soa_lt8.txt
Normal file
79
arch_analysis/osaca_force_soa_lt8.txt
Normal file
@@ -0,0 +1,79 @@
|
||||
iwia021h@testfront1:~/MD-Bench/asm$ /home/hpc/iwia/iwia021h/.local/bin/osaca --ignore-unknown --arch=CSX force_soa_lt8_markers.s
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.3.14
|
||||
Analyzed file: force_soa_lt8_markers.s
|
||||
Architecture: CSX
|
||||
Timestamp: 2021-04-29 15:52:48
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||
-------------------------------------------------------------------------------------------------
|
||||
300 | | | | | | | | || | | # LOE rax rdx rbp rsi rdi r8 r10 r11 r12 ebx r13d xmm8 xmm9 xmm10 ymm17 ymm18 zmm7 zmm11 zmm12 zmm13 zmm14 zmm15 zmm16 zmm19
|
||||
301 | | | | | | | | || | | ..B1.25: # Preds ..B1.24
|
||||
302 | | | | | | | | || | | # Execution count [2.50e+01]
|
||||
303 | | 1.00 | | | | | | || 3.0 | | imulq %r12, %r8 #56.43
|
||||
304 | | | | | | 1.00 | | || | | vbroadcastsd %xmm9, %zmm9 #59.23
|
||||
305 | | | | | | 1.00 | | || | 3.0 | vbroadcastsd %xmm8, %zmm2 #58.23
|
||||
306 | | | | | | 1.00 | | || | | vbroadcastsd %xmm10, %zmm10 #60.23
|
||||
307 | | | | | | | | || | | X subl %ebx, %r13d #67.9
|
||||
308 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || 1.0 | | addq %r11, %r8 #37.5
|
||||
309 | | | | | | | | || | | X vpbroadcastd %r13d, %ymm0 #67.9
|
||||
310 | | | | | | 1.00 | | || | | vpcmpgtd %ymm17, %ymm0, %k5 #67.9
|
||||
311 | 0.00 | 0.16 | | | | 0.00 | 0.84 | || | | movslq %ebx, %rbx #67.9
|
||||
312 | | | | | | | | || | | * vmovaps %zmm19, %zmm4 #70.36
|
||||
313 | 1.00 | | | | | | | || | | kmovw %k5, %k2 #70.36
|
||||
314 | | | | | | | | || | | * vmovaps %zmm19, %zmm3 #69.36
|
||||
315 | 0.00 | 1.00 | 0.50 0.50 | 0.50 0.50 | | 0.00 | | || 0.0 | | vmovdqu32 (%r8,%rbx,4), %ymm1{%k5}{z} #68.21
|
||||
316 | 1.00 | | | | | | | || | | kmovw %k5, %k1 #69.36
|
||||
317 | | | | | | | | || | | * vmovaps %zmm19, %zmm5 #71.36
|
||||
318 | 1.00 | | | | | | | || | | kmovw %k5, %k3 #71.36
|
||||
319 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || | | vgatherdpd (%rsi,%ymm1,8), %zmm5{%k3} #71.36
|
||||
320 | 1.50 | 0.34 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.66 | || 4.0 | | vgatherdpd (%rax,%ymm1,8), %zmm4{%k2} #70.36
|
||||
321 | 1.50 | 0.00 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 1.00 | || | | vgatherdpd (%rdx,%ymm1,8), %zmm3{%k1} #69.36
|
||||
322 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm5, %zmm10, %zmm30 #71.36
|
||||
323 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm4, %zmm9, %zmm28 #70.36
|
||||
324 | 0.50 | | | | | 0.50 | | || | 4.0 | vsubpd %zmm3, %zmm2, %zmm27 #69.36
|
||||
325 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm28, %zmm28, %zmm26 #72.49
|
||||
326 | 0.50 | | | | | 0.50 | | || 4.0 | 4.0 | vfmadd231pd %zmm27, %zmm27, %zmm26 #72.49
|
||||
327 | 0.50 | | | | | 0.50 | | || 4.0 | 4.0 | vfmadd231pd %zmm30, %zmm30, %zmm26 #72.63
|
||||
328 | 2.50 | | | | | 0.50 | | || 8.0 | 8.0 | vrcp14pd %zmm26, %zmm25 #75.38
|
||||
329 | | | | | | 1.00 | | || | | vcmppd $1, %zmm16, %zmm26, %k6{%k5} #74.22
|
||||
330 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm25, %k0 #75.38
|
||||
331 | | | | | | | | || | | * vmovaps %zmm26, %zmm6 #75.38
|
||||
332 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | 4.0 | vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm25, %zmm6 #75.38
|
||||
333 | 1.00 | | | | | | | || | | knotw %k0, %k4 #75.38
|
||||
334 | 0.50 | | | | | 0.50 | | || 4.0 | 4.0 | vmulpd %zmm6, %zmm6, %zmm8 #75.38
|
||||
335 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm25, %zmm6, %zmm25{%k4} #75.38
|
||||
336 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd213pd %zmm25, %zmm8, %zmm25{%k4} #75.38
|
||||
337 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm15, %zmm25, %zmm20 #76.38
|
||||
338 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm14, %zmm25, %zmm22 #77.54
|
||||
339 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm25, %zmm23 #76.44
|
||||
340 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm23, %zmm25, %zmm21 #76.50
|
||||
341 | 0.50 | | | | | 0.50 | | || | | vfmsub213pd %zmm7, %zmm23, %zmm25 #77.54
|
||||
342 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm22, %zmm21, %zmm24 #77.61
|
||||
343 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm25, %zmm24, %zmm29 #77.67
|
||||
344 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm27, %zmm29, %zmm13{%k6} #78.17
|
||||
345 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm28, %zmm29, %zmm12{%k6} #79.17
|
||||
346 | 0.00 | | | | | 1.00 | | || 4.0 | | vfmadd231pd %zmm30, %zmm29, %zmm11{%k6} #80.17
|
||||
|
||||
19.5 3.50 13.0 2.50 13.0 2.50 19.5 3.50 68.0 31.0
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
308 | 4.0 | addq %r11, %r8 #37.5| [303, 308]
|
||||
304 | 3.0 | vbroadcastsd %xmm9, %zmm9 #59.23| [304]
|
||||
306 | 3.0 | vbroadcastsd %xmm10, %zmm10 #60.23| [306]
|
||||
307 | 0.0 | subl %ebx, %r13d #67.9| [307]
|
||||
311 | 1.0 | movslq %ebx, %rbx #67.9| [311]
|
||||
346 | 4.0 | vfmadd231pd %zmm30, %zmm29, %zmm11{%k6} #80.17| [346]
|
||||
345 | 4.0 | vfmadd231pd %zmm28, %zmm29, %zmm12{%k6} #79.17| [345]
|
||||
344 | 4.0 | vfmadd231pd %zmm27, %zmm29, %zmm13{%k6} #78.17| [344]
|
||||
334 | 31.0 | vmulpd %zmm6, %zmm6, %zmm8 #75.38| [305, 324, 326, 327, 328, 332, 334]
|
Reference in New Issue
Block a user