Add arch_analysis directory and first AVX2 results

Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
This commit is contained in:
Rafael Ravedutti 2021-05-06 23:18:28 +02:00
parent 327cc302b8
commit 9c28ff1e9e
17 changed files with 26360 additions and 0 deletions

View File

@ -0,0 +1,74 @@
iwia021h@testfront1:~/MD-Bench/asm/avx2$ /home/hpc/iwia/iwia021h/.local/bin/osaca --ignore-unknown --arch=BDW force_soa_lt600_markers.s
Open Source Architecture Code Analyzer (OSACA) - 0.3.14
Analyzed file: force_soa_lt600_markers.s
Architecture: BDW
Timestamp: 2021-05-06 15:40:52
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
----------------------------------------------------------------------------------------------------
12343 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovups (%rax,%rdx,4), %xmm2 #60.25
12344 | | 1.00 | | | | 0.00 | | || | 1.0 | vpcmpeqd %ymm13, %ymm13, %ymm13 #61.40
12345 | | | | | | 1.00 | | || | | vxorpd %ymm8, %ymm8, %ymm8 #61.40
12346 | | | | | | | | || | | * vmovdqa %ymm13, %ymm0 #61.40
12347 | | | | | | 1.00 | | || | | vxorpd %ymm14, %ymm14, %ymm14 #62.40
12348 | | | | | | | | || | | * vmovdqa %ymm13, %ymm7 #62.40
12349 | 0.00 | 0.00 | 2.00 | 2.00 | 1.00 | 3.00 | 1.00 | || | | vgatherdpd %ymm0, (%r8,%xmm2,8), %ymm8 #61.40
12350 | 0.00 | 0.00 | 2.00 | 2.00 | 1.00 | 3.00 | 1.00 | || 5.0 | | vgatherdpd %ymm7, (%rdi,%xmm2,8), %ymm14 #62.40
12351 | | 1.00 | | | | | | || | | vsubpd %ymm8, %ymm5, %ymm0 #61.40
12352 | | 1.00 | | | | | | || 3.0 | | vsubpd %ymm14, %ymm4, %ymm8 #62.40
12353 | | | | | | 1.00 | | || | | vxorpd %ymm7, %ymm7, %ymm7 #63.40
12354 | | | | | | | | || | 0.0 | * vmovdqa %ymm13, %ymm15 #63.40
12355 | 0.00 | 0.00 | 2.00 | 2.00 | 1.00 | 3.00 | 1.00 | || | 5.0 | vgatherdpd %ymm15, (%rsi,%xmm2,8), %ymm7 #63.40
12356 | 0.50 | 0.50 | | | | | | || 3.0 | | vmulpd %ymm8, %ymm8, %ymm14 #64.53
12357 | | 1.00 | | | | | | || | 3.0 | vsubpd %ymm7, %ymm6, %ymm2 #63.40
12358 | 0.50 | 0.50 | | | | | | || 5.0 | | vfmadd231pd %ymm0, %ymm0, %ymm14 #64.53
12359 | 0.50 | 0.50 | | | | | | || 5.0 | 5.0 | vfmadd231pd %ymm2, %ymm2, %ymm14 #64.67
12360 | | | | | | | | || | | X vcmpltpd %ymm11, %ymm14, %ymm7 #66.26
12361 | 1.00 | | | | | 1.00 | | || | | vptest %ymm13, %ymm7 #66.26
12362 | | | | | | | | || | | X je ..B1.26 # Prob 50% #66.26
12363 | | | | | | | | || | | # LOE rax rdx rsi rdi r8 r9 r12 r14 ecx ebx r10d r13d r15d ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm7 ymm8 ymm9 ymm10 ymm11 ymm12 ymm14
12364 | | | | | | | | || | | ..B1.25: # Preds ..B1.24
12365 | | | | | | | | || | | # Execution count [1.25e+04]
12366 | 2.50 16.00 | 0.50 | | | | | | || 23.0 | 23.0 | vdivpd %ymm14, %ymm10, %ymm13 #67.42
12367 | 0.50 | 0.50 | 0.50 0.50 | 0.50 0.50 | | | | || 3.0 | | vmulpd 352(%rsp), %ymm13, %ymm14 #68.42[spill]
12368 | 0.50 | 0.50 | | | | | | || 3.0 | | vmulpd %ymm14, %ymm13, %ymm14 #68.48
12369 | 0.50 | 0.50 | | | | | | || 3.0 | | vmulpd %ymm14, %ymm13, %ymm15 #68.54
12370 | 0.50 | 0.50 | | | | | | || | | vfmsub213pd %ymm9, %ymm13, %ymm14 #69.58
12371 | 0.50 | 0.50 | 0.50 0.50 | 0.50 0.50 | | | | || | 3.0 | vmulpd 320(%rsp), %ymm13, %ymm13 #69.58[spill]
12372 | 0.50 | 0.50 | | | | | | || 3.0 | 3.0 | vmulpd %ymm13, %ymm15, %ymm15 #69.65
12373 | 0.75 | 0.25 | | | | | | || 3.0 | 3.0 | vmulpd %ymm14, %ymm15, %ymm13 #69.71
12374 | 1.00 | 0.00 | | | | | | || | | vmulpd %ymm13, %ymm0, %ymm0 #70.35
12375 | 1.00 | 0.00 | | | | | | || | | vmulpd %ymm13, %ymm8, %ymm8 #71.35
12376 | | | | | | 1.00 | | || | | vandpd %ymm0, %ymm7, %ymm0 #70.35
12377 | | 1.00 | | | | | | || | | vaddpd %ymm0, %ymm12, %ymm12 #70.21
12378 | 1.00 | 0.00 | | | | | | || 3.0 | | vmulpd %ymm13, %ymm2, %ymm0 #72.35
12379 | | | | | | 1.00 | | || | | vandpd %ymm8, %ymm7, %ymm2 #71.35
12380 | | | | | | 1.00 | | || 1.0 | | vandpd %ymm0, %ymm7, %ymm7 #72.35
12381 | | 1.00 | | | | | | || | | vaddpd %ymm2, %ymm1, %ymm1 #71.21
12382 | | 1.00 | | | | | | || 3.0 | | vaddpd %ymm7, %ymm3, %ymm3 #72.21
12383 | | | | | | | | || | | # LOE rax rdx rsi rdi r8 r9 r12 r14 ecx ebx r10d r13d r15d ymm1 ymm3 ymm4 ymm5 ymm6 ymm9 ymm10 ymm11 ymm12
12384 | | | | | | | | || | | ..B1.26: # Preds ..B1.25 ..B1.24
12385 | | | | | | | | || | | # Execution count [2.50e+04]
12386 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || | | addq $4, %rdx #59.13
12387 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || | | cmpq %r14, %rdx #59.13
12388 | | | | | | | | || | | * jb ..B1.24 # Prob 82% #59.13
11.8 16.00 12.2 7.50 1.50 7.50 1.50 3.00 16.0 5.00 70 46
Loop-Carried Dependencies Analysis Report
-----------------------------------------
12373 | 46.0 | vmulpd %ymm14, %ymm15, %ymm13 #69.71| [12344, 12354, 12355, 12357, 12359, 12366, 12371, 12372, 12373]
12377 | 3.0 | vaddpd %ymm0, %ymm12, %ymm12 #70.21| [12377]
12381 | 3.0 | vaddpd %ymm2, %ymm1, %ymm1 #71.21| [12381]
12382 | 3.0 | vaddpd %ymm7, %ymm3, %ymm3 #72.21| [12382]
12386 | 1.0 | addq $4, %rdx #59.13| [12386]

View File

@ -0,0 +1,79 @@
iwia021h@testfront1:~/MD-Bench/asm$ iaca -arch SKX force_aos_geq1200_markers.o
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
Analyzed File - force_aos_geq1200_markers.o
Binary Format - 64Bit
Architecture - SKX
Analysis Type - Throughput
Throughput Analysis Report
--------------------------
Block Throughput: 33.05 Cycles Throughput Bottleneck: Backend
Loop Count: 22
Port Binding In Cycles Per Iteration:
--------------------------------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
--------------------------------------------------------------------------------------------------
| Cycles | 20.0 0.0 | 4.5 | 13.0 13.0 | 13.0 13.0 | 0.0 | 18.0 | 4.5 | 0.0 |
--------------------------------------------------------------------------------------------------
DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3)
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion occurred
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
X - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
-----------------------------------------------------------------------------------------
| 1 | | | | | | 1.0 | | | vpcmpgtd k3, ymm2, ymm3
| 2 | | 1.0 | 1.0 1.0 | | | | | | vmovdqu32 ymm17{k3}{z}, ymmword ptr [r15+r13*4]
| 1 | 1.0 | | | | | | | | kmovw r9d, k3
| 1 | | 1.0 | | | | | | | vpaddd ymm18, ymm17, ymm17
| 1 | | 1.0 | | | | | | | vpaddd ymm17, ymm17, ymm18
| 1 | 1.0 | | | | | | | | kmovw k1, k3
| 1 | 1.0 | | | | | | | | kmovw k2, k3
| 1* | | | | | | | | | vpxord zmm18, zmm18, zmm18
| 1* | | | | | | | | | vpxord zmm19, zmm19, zmm19
| 1* | | | | | | | | | vpxord zmm20, zmm20, zmm20
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm18, k1, zmmword ptr [rdi+ymm17*8+0x10]
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm19, k2, zmmword ptr [rdi+ymm17*8+0x8]
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm20, k3, zmmword ptr [rdi+ymm17*8]
| 1 | | 0.5 | | | | | 0.5 | | add r13, 0x8
| 1 | | 1.0 | | | | | | | vpaddd ymm3, ymm3, ymm16
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm29, zmm4, zmm18
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm27, zmm0, zmm19
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm26, zmm1, zmm20
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm25, zmm27, zmm27
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm25, zmm26, zmm26
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm25, zmm29, zmm29
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm24, zmm25
| 1 | | | | | | 1.0 | | | vcmppd k2, zmm25, zmm14, 0x1
| 1 | | | | | | 1.0 | | | vfpclasspd k0, zmm24, 0x1e
| 1 | 1.0 | | | | | | | | kmovw edx, k2
| 1 | 1.0 | | | | | | | | knotw k1, k0
| 1* | | | | | | | | | vmovaps zmm17, zmm25
| 1 | | | | | | | 1.0 | | and r9d, edx
| 2^ | | | | 1.0 1.0 | | 1.0 | | | vfnmadd213pd zmm17, zmm24, qword ptr [rip]{1to8}
| 1 | | | | | | 1.0 | | | kmovw k3, r9d
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm17, zmm17
| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm24{k1}, zmm17, zmm24
| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm24{k1}, zmm18, zmm24
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm19, zmm24, zmm13
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm21, zmm24, zmm10
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm22, zmm24, zmm19
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm20, zmm24, zmm22
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213pd zmm24, zmm22, zmm5
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm23, zmm20, zmm21
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm28, zmm23, zmm24
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm9{k3}, zmm28, zmm26
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm8{k3}, zmm28, zmm27
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm11{k3}, zmm28, zmm29
| 1* | | | | | | | | | cmp r13, rbx
| 0*F | | | | | | | | | jb 0xfffffffffffffef7
Total Num Of Uops: 60
Analysis Notes:
Backend allocation was stalled due to unavailable allocation resources.
There were bubbles in the frontend.

View File

@ -0,0 +1,104 @@
iwia021h@testfront1:~/MD-Bench/ICC$ iaca -arch SKX force.o
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
Analyzed File - force.o
Binary Format - 64Bit
Architecture - SKX
Analysis Type - Throughput
Throughput Analysis Report
--------------------------
Block Throughput: 36.70 Cycles Throughput Bottleneck: Backend
Loop Count: 23
Port Binding In Cycles Per Iteration:
--------------------------------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
--------------------------------------------------------------------------------------------------
| Cycles | 17.5 0.0 | 11.0 | 20.5 17.0 | 20.5 17.0 | 7.0 | 20.5 | 7.0 | 0.0 |
--------------------------------------------------------------------------------------------------
DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3)
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion occurred
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
X - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
-----------------------------------------------------------------------------------------
| 1* | | | | | | | | | mov r13, r8
| 1 | | 1.0 | | | | | | | imul r13, rcx
| 1 | | | | | | 1.0 | | | vbroadcastsd zmm2, xmm6
| 1 | | | | | | 1.0 | | | vbroadcastsd zmm1, xmm7
| 1 | | | | | | 1.0 | | | vbroadcastsd zmm0, xmm12
| 1 | | | | | | | 1.0 | | movsxd rbx, r12d
| 1 | | | | | | | 1.0 | | add r13, r10
| 2^ | | | 0.5 | 0.5 | 1.0 | | | | mov qword ptr [rsp-0x40], rax
| 2^ | | | 0.5 | 0.5 | 1.0 | | | | mov qword ptr [rsp-0x38], r8
| 2^ | | | 0.5 | 0.5 | 1.0 | | | | mov qword ptr [rsp-0x30], r10
| 2^ | | | 0.5 | 0.5 | 1.0 | | | | mov qword ptr [rsp-0x28], rsi
| 2^ | | | 0.5 | 0.5 | 1.0 | | | | mov qword ptr [rsp-0x20], rcx
| 2^ | | | 0.5 | 0.5 | 1.0 | | | | mov qword ptr [rsp-0x50], r9
| 2^ | | | 0.5 | 0.5 | 1.0 | | | | mov qword ptr [rsp-0x48], rdx
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovdqu ymm3, ymmword ptr [r13+rbx*4]
| 1 | | 1.0 | | | | | | | vpaddd ymm4, ymm3, ymm3
| 1 | | 1.0 | | | | | | | vpaddd ymm3, ymm3, ymm4
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov r10d, dword ptr [r13+rbx*4]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov r9d, dword ptr [r13+rbx*4+0x4]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov r8d, dword ptr [r13+rbx*4+0x8]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov esi, dword ptr [r13+rbx*4+0xc]
| 1 | | 1.0 | | | | | | | lea r10d, ptr [r10+r10*2]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov ecx, dword ptr [r13+rbx*4+0x10]
| 1 | | 1.0 | | | | | | | lea r9d, ptr [r9+r9*2]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov edx, dword ptr [r13+rbx*4+0x14]
| 1 | | 1.0 | | | | | | | lea r8d, ptr [r8+r8*2]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov eax, dword ptr [r13+rbx*4+0x18]
| 1 | | 1.0 | | | | | | | lea esi, ptr [rsi+rsi*2]
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov r15d, dword ptr [r13+rbx*4+0x1c]
| 1 | | 1.0 | | | | | | | lea ecx, ptr [rcx+rcx*2]
| 1 | | 1.0 | | | | | | | lea edx, ptr [rdx+rdx*2]
| 1 | | 1.0 | | | | | | | lea eax, ptr [rax+rax*2]
| 1 | | 1.0 | | | | | | | lea r15d, ptr [r15+r15*2]
| 1 | | | | | | 1.0 | | | vpcmpeqb k1, xmm0, xmm0
| 1 | | | | | | 1.0 | | | vpcmpeqb k2, xmm0, xmm0
| 1 | | | | | | 1.0 | | | vpcmpeqb k3, xmm0, xmm0
| 1* | | | | | | | | | vpxord zmm4, zmm4, zmm4
| 1* | | | | | | | | | vpxord zmm17, zmm17, zmm17
| 1* | | | | | | | | | vpxord zmm18, zmm18, zmm18
| 5^ | 2.0 | | 4.0 4.0 | 4.0 4.0 | | | 1.0 | | vgatherdpd zmm4, k1, zmmword ptr [rdi+ymm3*8+0x10]
| 5^ | 1.5 | | 4.0 4.0 | 4.0 4.0 | | 0.5 | 1.0 | | vgatherdpd zmm17, k2, zmmword ptr [rdi+ymm3*8+0x8]
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm18, k3, zmmword ptr [rdi+ymm3*8]
| 1 | | | | | | | 1.0 | | add r12d, 0x8
| 1 | | | | | | | 1.0 | | add rbx, 0x8
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm26, zmm0, zmm4
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm24, zmm1, zmm17
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm23, zmm2, zmm18
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm3, zmm24, zmm24
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm3, zmm23, zmm23
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm3, zmm26, zmm26
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm22, zmm3
| 1 | | | | | | 1.0 | | | vcmppd k2, zmm3, zmm14, 0x1
| 1 | | | | | | 1.0 | | | vfpclasspd k0, zmm22, 0x1e
| 2^ | 1.0 | | 0.5 0.5 | 0.5 0.5 | | | | | vfnmadd213pd zmm3, zmm22, qword ptr [rip]{1to8}
| 1 | 1.0 | | | | | | | | knotw k1, k0
| 1 | | | | | | 1.0 | | | vmulpd zmm4, zmm3, zmm3
| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm22{k1}, zmm3, zmm22
| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm22{k1}, zmm4, zmm22
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm17, zmm22, zmm13
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm19, zmm22, zmm10
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm20, zmm22, zmm17
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm18, zmm22, zmm20
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213pd zmm22, zmm20, zmm5
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm21, zmm18, zmm19
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm25, zmm21, zmm22
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm9{k2}, zmm25, zmm23
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm8{k2}, zmm25, zmm24
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm11{k2}, zmm25, zmm26
| 1* | | | | | | | | | cmp r12d, r14d
| 0*F | | | | | | | | | jb 0xfffffffffffffed3
Total Num Of Uops: 91
Analysis Notes:
Backend allocation was stalled due to unavailable allocation resources.
There were bubbles in the frontend.

View File

@ -0,0 +1,82 @@
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
Analyzed File - force_aos_lt8_markers.o
Binary Format - 64Bit
Architecture - SKX
Analysis Type - Throughput
Throughput Analysis Report
--------------------------
Block Throughput: 69.79 Cycles Throughput Bottleneck: Backend
Loop Count: 22
Port Binding In Cycles Per Iteration:
--------------------------------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
--------------------------------------------------------------------------------------------------
| Cycles | 21.0 0.0 | 5.5 | 13.0 13.0 | 13.0 13.0 | 0.0 | 21.0 | 5.5 | 0.0 |
--------------------------------------------------------------------------------------------------
DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3)
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion occurred
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
X - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
-----------------------------------------------------------------------------------------
| 1 | | 1.0 | | | | | | | imul rcx, r8
| 1 | | | | | | 1.0 | | | vbroadcastsd zmm4, xmm6
| 1 | | | | | | | 1.0 | | sub r11d, r14d
| 1 | | 0.5 | | | | | 0.5 | | add rcx, r10
| 1 | | | | | | 1.0 | | | vpbroadcastd ymm0, r11d
| 1 | | | | | | 1.0 | | | vpcmpgtd k3, ymm0, ymm15
| 1 | | 0.5 | | | | | 0.5 | | movsxd r14, r14d
| 1 | 1.0 | | | | | | | | kmovw ebx, k3
| 2 | | 1.0 | 1.0 1.0 | | | | | | vmovdqu32 ymm1{k3}{z}, ymmword ptr [rcx+r14*4]
| 1 | | 1.0 | | | | | | | vpaddd ymm2, ymm1, ymm1
| 1 | | 1.0 | | | | | | | vpaddd ymm0, ymm1, ymm2
| 1 | 1.0 | | | | | | | | kmovw k1, k3
| 1 | 1.0 | | | | | | | | kmovw k2, k3
| 1* | | | | | | | | | vpxord zmm1, zmm1, zmm1
| 1* | | | | | | | | | vpxord zmm2, zmm2, zmm2
| 1* | | | | | | | | | vpxord zmm3, zmm3, zmm3
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm1, k1, zmmword ptr [rdi+ymm0*8+0x10]
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm2, k2, zmmword ptr [rdi+ymm0*8+0x8]
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm3, k3, zmmword ptr [rdi+ymm0*8]
| 1 | | | | | | 1.0 | | | vbroadcastsd zmm7, xmm7
| 1 | | | | | | 1.0 | | | vbroadcastsd zmm12, xmm12
| 1 | 1.0 | | | | | | | | vsubpd zmm23, zmm12, zmm1
| 1 | 1.0 | | | | | | | | vsubpd zmm21, zmm7, zmm2
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm20, zmm4, zmm3
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm19, zmm21, zmm21
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm19, zmm20, zmm20
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm19, zmm23, zmm23
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm18, zmm19
| 1 | | | | | | 1.0 | | | vcmppd k2, zmm19, zmm14, 0x1
| 1 | | | | | | 1.0 | | | vfpclasspd k0, zmm18, 0x1e
| 1 | 1.0 | | | | | | | | kmovw ecx, k2
| 1 | 1.0 | | | | | | | | knotw k1, k0
| 1* | | | | | | | | | vmovaps zmm0, zmm19
| 1 | | 0.5 | | | | | 0.5 | | and ebx, ecx
| 2^ | | | | 1.0 1.0 | | 1.0 | | | vfnmadd213pd zmm0, zmm18, qword ptr [rip]{1to8}
| 1 | | | | | | 1.0 | | | kmovw k3, ebx
| 1 | 1.0 | | | | | | | | vmulpd zmm1, zmm0, zmm0
| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm18{k1}, zmm0, zmm18
| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm18{k1}, zmm1, zmm18
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm2, zmm18, zmm13
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm4, zmm18, zmm10
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm6, zmm18, zmm2
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm3, zmm18, zmm6
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213pd zmm18, zmm6, zmm5
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm17, zmm3, zmm4
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm22, zmm17, zmm18
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm9{k3}, zmm22, zmm20
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm8{k3}, zmm22, zmm21
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm11{k3}, zmm22, zmm23
Total Num Of Uops: 65
Analysis Notes:
Backend allocation was stalled due to unavailable allocation resources.
There were bubbles in the frontend.

View File

@ -0,0 +1,74 @@
iwia021h@testfront1:~/MD-Bench/asm$ iaca -arch SKX force_soa_geq1200_markers.o
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
Analyzed File - force_soa_geq1200_markers.o
Binary Format - 64Bit
Architecture - SKX
Analysis Type - Throughput
Throughput Analysis Report
--------------------------
Block Throughput: 31.47 Cycles Throughput Bottleneck: Backend
Loop Count: 22
Port Binding In Cycles Per Iteration:
--------------------------------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
--------------------------------------------------------------------------------------------------
| Cycles | 18.0 0.0 | 3.0 | 13.0 13.0 | 13.0 13.0 | 0.0 | 18.0 | 3.0 | 0.0 |
--------------------------------------------------------------------------------------------------
DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3)
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion occurred
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
X - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
-----------------------------------------------------------------------------------------
| 1 | | | | | | 1.0 | | | vpcmpgtd k5, ymm3, ymm4
| 1 | | 1.0 | | | | | | | vpaddd ymm4, ymm4, ymm18
| 2 | | 1.0 | 1.0 1.0 | | | | | | vmovdqu32 ymm20{k5}{z}, ymmword ptr [rcx+r15*4]
| 1* | | | | | | | | | vmovaps zmm22, zmm19
| 1 | | 1.0 | | | | | | | add r15, 0x8
| 1 | 1.0 | | | | | | | | kmovw k2, k5
| 1* | | | | | | | | | vmovaps zmm21, zmm19
| 1 | 1.0 | | | | | | | | kmovw k1, k5
| 1* | | | | | | | | | vmovaps zmm23, zmm19
| 1 | 1.0 | | | | | | | | kmovw k3, k5
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm23, k3, zmmword ptr [rsi+ymm20*8]
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm22, k2, zmmword ptr [rax+ymm20*8]
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm21, k1, zmmword ptr [rdx+ymm20*8]
| 1 | | | | | | 1.0 | | | vsubpd zmm0, zmm5, zmm22
| 1 | | | | | | 1.0 | | | vsubpd zmm1, zmm2, zmm21
| 1 | | | | | | 1.0 | | | vsubpd zmm21, zmm6, zmm23
| 1 | 1.0 | | | | | | | | vmulpd zmm20, zmm0, zmm0
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm20, zmm1, zmm1
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm20, zmm21, zmm21
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm31, zmm20
| 1 | | | | | | 1.0 | | | vcmppd k6{k5}, zmm20, zmm16, 0x1
| 1 | | | | | | 1.0 | | | vfpclasspd k0, zmm31, 0x1e
| 1* | | | | | | | | | vmovaps zmm24, zmm20
| 2^ | 1.0 | | | 1.0 1.0 | | | | | vfnmadd213pd zmm24, zmm31, qword ptr [rip]{1to8}
| 1 | 1.0 | | | | | | | | knotw k4, k0
| 1 | | | | | | 1.0 | | | vmulpd zmm25, zmm24, zmm24
| 1 | | | | | | 1.0 | | | vfmadd213pd zmm31{k4}, zmm24, zmm31
| 1 | 1.0 | | | | | | | | vfmadd213pd zmm31{k4}, zmm25, zmm31
| 1 | | | | | | 1.0 | | | vmulpd zmm26, zmm31, zmm15
| 1 | 1.0 | | | | | | | | vmulpd zmm28, zmm31, zmm14
| 1 | | | | | | 1.0 | | | vmulpd zmm29, zmm31, zmm26
| 1 | 1.0 | | | | | | | | vmulpd zmm27, zmm31, zmm29
| 1 | | | | | | 1.0 | | | vfmsub213pd zmm31, zmm29, zmm7
| 1 | 1.0 | | | | | | | | vmulpd zmm30, zmm27, zmm28
| 1 | | | | | | 1.0 | | | vmulpd zmm24, zmm30, zmm31
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm13{k6}, zmm24, zmm1
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm12{k6}, zmm24, zmm0
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm11{k6}, zmm24, zmm21
| 1* | | | | | | | | | cmp r15, r14
| 0*F | | | | | | | | | jb 0xffffffffffffff19
Total Num Of Uops: 55
Analysis Notes:
Backend allocation was stalled due to unavailable allocation resources.
There were bubbles in the frontend.

View File

@ -0,0 +1,72 @@
iwia021h@testfront1:~/MD-Bench/asm$ iaca -arch SKX force_soa_lt1200_markers.o
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
Analyzed File - force_soa_lt1200_markers.o
Binary Format - 64Bit
Architecture - SKX
Analysis Type - Throughput
Throughput Analysis Report
--------------------------
Block Throughput: 30.25 Cycles Throughput Bottleneck: Backend
Loop Count: 23
Port Binding In Cycles Per Iteration:
--------------------------------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
--------------------------------------------------------------------------------------------------
| Cycles | 16.0 0.0 | 2.0 | 13.0 13.0 | 13.0 13.0 | 0.0 | 19.0 | 3.0 | 0.0 |
--------------------------------------------------------------------------------------------------
DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3)
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion occurred
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
X - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
-----------------------------------------------------------------------------------------
| 1 | | | | | | 1.0 | | | vpcmpeqb k2, xmm0, xmm0
| 1 | | 1.0 | | | | | | | add r9d, 0x8
| 1 | | | | | | 1.0 | | | vpcmpeqb k1, xmm0, xmm0
| 1 | | | | | | 1.0 | | | vpcmpeqb k3, xmm0, xmm0
| 1 | | | 1.0 1.0 | | | | | | vmovdqu ymm3, ymmword ptr [rcx+r14*4]
| 1 | | 1.0 | | | | | | | add r14, 0x8
| 1* | | | | | | | | | vpxord zmm5, zmm5, zmm5
| 1* | | | | | | | | | vpxord zmm4, zmm4, zmm4
| 1* | | | | | | | | | vpxord zmm6, zmm6, zmm6
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm5, k2, zmmword ptr [rax+ymm3*8]
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm4, k1, zmmword ptr [rdx+ymm3*8]
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm6, k3, zmmword ptr [rsi+ymm3*8]
| 1 | | | | | | 1.0 | | | vsubpd zmm29, zmm1, zmm5
| 1 | 1.0 | | | | | | | | vsubpd zmm28, zmm0, zmm4
| 1 | | | | | | 1.0 | | | vsubpd zmm31, zmm2, zmm6
| 1 | 1.0 | | | | | | | | vmulpd zmm20, zmm29, zmm29
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm20, zmm28, zmm28
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm20, zmm31, zmm31
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm27, zmm20
| 1 | | | | | | 1.0 | | | vcmppd k5, zmm20, zmm16, 0x1
| 1 | | | | | | 1.0 | | | vfpclasspd k0, zmm27, 0x1e
| 2^ | 1.0 | | | 1.0 1.0 | | | | | vfnmadd213pd zmm20, zmm27, qword ptr [rip]{1to8}
| 1 | 1.0 | | | | | | | | knotw k4, k0
| 1 | | | | | | 1.0 | | | vmulpd zmm21, zmm20, zmm20
| 1 | | | | | | 1.0 | | | vfmadd213pd zmm27{k4}, zmm20, zmm27
| 1 | 1.0 | | | | | | | | vfmadd213pd zmm27{k4}, zmm21, zmm27
| 1 | | | | | | 1.0 | | | vmulpd zmm22, zmm27, zmm15
| 1 | 1.0 | | | | | | | | vmulpd zmm24, zmm27, zmm14
| 1 | | | | | | 1.0 | | | vmulpd zmm25, zmm27, zmm22
| 1 | 1.0 | | | | | | | | vmulpd zmm23, zmm27, zmm25
| 1 | | | | | | 1.0 | | | vfmsub213pd zmm27, zmm25, zmm7
| 1 | 1.0 | | | | | | | | vmulpd zmm26, zmm23, zmm24
| 1 | | | | | | 1.0 | | | vmulpd zmm30, zmm26, zmm27
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm13{k5}, zmm30, zmm28
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm12{k5}, zmm30, zmm29
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm11{k5}, zmm30, zmm31
| 1* | | | | | | | | | cmp r9d, ebx
| 0*F | | | | | | | | | jb 0xffffffffffffff22
Total Num Of Uops: 52
Analysis Notes:
Backend allocation was stalled due to unavailable allocation resources.
There were bubbles in the frontend.

View File

@ -0,0 +1,78 @@
iwia021h@testfront1:~/MD-Bench/asm$ iaca -arch SKX force_soa_lt8_markers.o
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
Analyzed File - force_soa_lt8_markers.o
Binary Format - 64Bit
Architecture - SKX
Analysis Type - Throughput
Throughput Analysis Report
--------------------------
Block Throughput: 35.00 Cycles Throughput Bottleneck: Backend
Loop Count: 22
Port Binding In Cycles Per Iteration:
--------------------------------------------------------------------------------------------------
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
--------------------------------------------------------------------------------------------------
| Cycles | 20.0 0.0 | 4.0 | 13.0 13.0 | 13.0 13.0 | 0.0 | 20.0 | 4.0 | 0.0 |
--------------------------------------------------------------------------------------------------
DV - Divider pipe (on port 0)
D - Data fetch pipe (on ports 2 and 3)
F - Macro Fusion with the previous instruction occurred
* - instruction micro-ops not bound to a port
^ - Micro Fusion occurred
# - ESP Tracking sync uop was issued
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
X - instruction not supported, was not accounted in Analysis
| Num Of | Ports pressure in cycles | |
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
-----------------------------------------------------------------------------------------
| 1 | | 1.0 | | | | | | | imul r8, r12
| 1 | | | | | | 1.0 | | | vbroadcastsd zmm9, xmm9
| 1 | | | | | | 1.0 | | | vbroadcastsd zmm2, xmm8
| 1 | | | | | | 1.0 | | | vbroadcastsd zmm10, xmm10
| 1 | | 1.0 | | | | | | | sub r13d, ebx
| 1 | | 1.0 | | | | | | | add r8, r11
| 1 | | | | | | 1.0 | | | vpbroadcastd ymm0, r13d
| 1 | | | | | | 1.0 | | | vpcmpgtd k5, ymm0, ymm17
| 1 | | | | | | | 1.0 | | movsxd rbx, ebx
| 1* | | | | | | | | | vmovaps zmm4, zmm19
| 1 | 1.0 | | | | | | | | kmovw k2, k5
| 1* | | | | | | | | | vmovaps zmm3, zmm19
| 2 | | 1.0 | 1.0 1.0 | | | | | | vmovdqu32 ymm1{k5}{z}, ymmword ptr [r8+rbx*4]
| 1 | 1.0 | | | | | | | | kmovw k1, k5
| 1* | | | | | | | | | vmovaps zmm5, zmm19
| 1 | 1.0 | | | | | | | | kmovw k3, k5
| 5^ | 2.0 | | 4.0 4.0 | 4.0 4.0 | | | 1.0 | | vgatherdpd zmm5, k3, zmmword ptr [rsi+ymm1*8]
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm4, k2, zmmword ptr [rax+ymm1*8]
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm3, k1, zmmword ptr [rdx+ymm1*8]
| 1 | 1.0 | | | | | | | | vsubpd zmm30, zmm10, zmm5
| 1 | | | | | | 1.0 | | | vsubpd zmm28, zmm9, zmm4
| 1 | 1.0 | | | | | | | | vsubpd zmm27, zmm2, zmm3
| 1 | | | | | | 1.0 | | | vmulpd zmm26, zmm28, zmm28
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm26, zmm27, zmm27
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm26, zmm30, zmm30
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm25, zmm26
| 1 | | | | | | 1.0 | | | vcmppd k6{k5}, zmm26, zmm16, 0x1
| 1 | | | | | | 1.0 | | | vfpclasspd k0, zmm25, 0x1e
| 1* | | | | | | | | | vmovaps zmm6, zmm26
| 2^ | 1.0 | | | 1.0 1.0 | | | | | vfnmadd213pd zmm6, zmm25, qword ptr [rip]{1to8}
| 1 | 1.0 | | | | | | | | knotw k4, k0
| 1 | | | | | | 1.0 | | | vmulpd zmm8, zmm6, zmm6
| 1 | 1.0 | | | | | | | | vfmadd213pd zmm25{k4}, zmm6, zmm25
| 1 | | | | | | 1.0 | | | vfmadd213pd zmm25{k4}, zmm8, zmm25
| 1 | 1.0 | | | | | | | | vmulpd zmm20, zmm25, zmm15
| 1 | | | | | | 1.0 | | | vmulpd zmm22, zmm25, zmm14
| 1 | 1.0 | | | | | | | | vmulpd zmm23, zmm25, zmm20
| 1 | | | | | | 1.0 | | | vmulpd zmm21, zmm25, zmm23
| 1 | 1.0 | | | | | | | | vfmsub213pd zmm25, zmm23, zmm7
| 1 | | | | | | 1.0 | | | vmulpd zmm24, zmm21, zmm22
| 1 | 1.0 | | | | | | | | vmulpd zmm29, zmm24, zmm25
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm13{k6}, zmm29, zmm27
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12{k6}, zmm29, zmm28
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm11{k6}, zmm29, zmm30
Total Num Of Uops: 60
Analysis Notes:
Backend allocation was stalled due to unavailable allocation resources.
There were bubbles in the frontend.

View File

@ -0,0 +1,80 @@
iwia021h@testfront1:~/MD-Bench/asm$ /home/hpc/iwia/iwia021h/.local/bin/osaca --ignore-unknown --arch=CSX force_aos_geq1200_markers.s
Open Source Architecture Code Analyzer (OSACA) - 0.3.14
Analyzed file: force_aos_geq1200_markers.s
Architecture: CSX
Timestamp: 2021-04-29 15:53:50
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
-------------------------------------------------------------------------------------------------
196 | | | | | | 1.00 | | || | | vpcmpgtd %ymm3, %ymm2, %k3 #67.9
197 | 0.00 | 1.00 | 0.50 0.50 | 0.50 0.50 | | 0.00 | | || 4.0 | | vmovdqu32 (%r15,%r13,4), %ymm17{%k3}{z} #68.21
198 | 1.00 | | | | | | | || | | kmovw %k3, %r9d #67.9
199 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm17, %ymm17, %ymm18 #69.36
200 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm18, %ymm17, %ymm17 #69.36
201 | | | | | | | | || | | # LOE rax rcx rbx rbp rsi rdi r8 r10 r13 r15 r9d r11d r12d r14d xmm6 xmm7 xmm12 ymm2 ymm3 ymm15 ymm16 ymm17 zmm0 zmm1 zmm4 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14 k3
202 | | | | | | | | || | | ..B1.21: # Preds ..B1.18
203 | | | | | | | | || | | # Execution count [1.25e+01]
204 | 1.00 | | | | | | | || | | kmovw %k3, %k1 #69.36
205 | 1.00 | | | | | | | || | | kmovw %k3, %k2 #69.36
206 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm18, %zmm18, %zmm18 #69.36
207 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm19, %zmm19, %zmm19 #69.36
208 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm20, %zmm20, %zmm20 #69.36
209 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || | | vgatherdpd 16(%rdi,%ymm17,8), %zmm18{%k1} #69.36
210 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || 4.0 | | vgatherdpd 8(%rdi,%ymm17,8), %zmm19{%k2} #69.36
211 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || | | vgatherdpd (%rdi,%ymm17,8), %zmm20{%k3} #69.36
212 | | | | | | | | || | | # LOE rax rcx rbx rbp rsi rdi r8 r10 r13 r15 r9d r11d r12d r14d xmm6 xmm7 xmm12 ymm2 ymm3 ymm15 ymm16 zmm0 zmm1 zmm4 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14 zmm18 zmm19 zmm20
213 | | | | | | | | || | | ..B1.22: # Preds ..B1.21
214 | | | | | | | | || | | # Execution count [2.50e+01]
215 | 0.00 | 0.16 | | | | 0.00 | 0.84 | || | | addq $8, %r13 #67.9
216 | 0.00 | 1.00 | | | | 0.00 | | || | | vpaddd %ymm16, %ymm3, %ymm3 #67.9
217 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm18, %zmm4, %zmm29 #71.36
218 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm19, %zmm0, %zmm27 #70.36
219 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm20, %zmm1, %zmm26 #69.36
220 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm27, %zmm27, %zmm25 #72.49
221 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm26, %zmm26, %zmm25 #72.49
222 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm29, %zmm29, %zmm25 #72.63
223 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm25, %zmm24 #75.38
224 | | | | | | 1.00 | | || | | vcmppd $1, %zmm14, %zmm25, %k2 #74.22
225 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm24, %k0 #75.38
226 | 1.00 | | | | | | | || | | kmovw %k2, %edx #74.22
227 | 1.00 | | | | | | | || | | knotw %k0, %k1 #75.38
228 | | | | | | | | || | | * vmovaps %zmm25, %zmm17 #75.38
229 | 0.00 | 0.34 | | | | 0.00 | 0.66 | || | | andl %edx, %r9d #74.22
230 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm24, %zmm17 #75.38
231 | 1.00 | | | | | | | || | | kmovw %r9d, %k3 #78.17
232 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm17, %zmm17, %zmm18 #75.38
233 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm24, %zmm17, %zmm24{%k1} #75.38
234 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd213pd %zmm24, %zmm18, %zmm24{%k1} #75.38
235 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm13, %zmm24, %zmm19 #76.38
236 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm10, %zmm24, %zmm21 #77.54
237 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm19, %zmm24, %zmm22 #76.44
238 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm22, %zmm24, %zmm20 #76.50
239 | 0.00 | | | | | 1.00 | | || | | vfmsub213pd %zmm5, %zmm22, %zmm24 #77.54
240 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm21, %zmm20, %zmm23 #77.61
241 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm24, %zmm23, %zmm28 #77.67
242 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm26, %zmm28, %zmm9{%k3} #78.17
243 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm27, %zmm28, %zmm8{%k3} #79.17
244 | 0.00 | | | | | 1.00 | | || 4.0 | 4.0 | vfmadd231pd %zmm29, %zmm28, %zmm11{%k3} #80.17
245 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || | | cmpq %rbx, %r13 #67.9
246 | | | | | | | | || | | * jb ..B1.18 # Prob 82% #67.9
20.5 6.00 13.0 2.50 13.0 2.50 20.5 4.00 70.0 4
Loop-Carried Dependencies Analysis Report
-----------------------------------------
215 | 1.0 | addq $8, %r13 #67.9| [215]
216 | 1.0 | vpaddd %ymm16, %ymm3, %ymm3 #67.9| [216]
244 | 4.0 | vfmadd231pd %zmm29, %zmm28, %zmm11{%k3} #80.17| [244]
243 | 4.0 | vfmadd231pd %zmm27, %zmm28, %zmm8{%k3} #79.17| [243]
242 | 4.0 | vfmadd231pd %zmm26, %zmm28, %zmm9{%k3} #78.17| [242]

View File

@ -0,0 +1,112 @@
iwia021h@testfront1:~/MD-Bench/ICC$ /home/hpc/iwia/iwia021h/.local/bin/osaca --ignore-unknown --arch=CSX force.s
Open Source Architecture Code Analyzer (OSACA) - 0.3.14
Analyzed file: force.s
Architecture: CSX
Timestamp: 2021-04-26 22:33:06
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
-------------------------------------------------------------------------------------------------
261 | | | | | | | | || | | ..B1.25: # Preds ..B1.24
262 | | | | | | | | || | | # Execution count [4.50e+00]
263 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || 1.0 | 1.0 | movq %r8, %r13 #56.43
264 | | 1.00 | | | | | | || 3.0 | 3.0 | imulq %rcx, %r13 #56.43
265 | | | | | | 1.00 | | || | | vbroadcastsd %xmm6, %zmm2 #58.23
266 | | | | | | 1.00 | | || | | vbroadcastsd %xmm7, %zmm1 #59.23
267 | | | | | | 1.00 | | || | | vbroadcastsd %xmm12, %zmm0 #60.23
268 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || | | movslq %r12d, %rbx #67.9
269 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || 1.0 | 1.0 | addq %r10, %r13 #37.5
270 | | | 0.00 | 0.00 | 1.00 | | | 1.00 || | | movq %rax, -64(%rsp) #37.5[spill]
271 | | | 0.00 | 0.00 | 1.00 | | | 1.00 || | | movq %r8, -56(%rsp) #37.5[spill]
272 | | | 0.00 | 0.00 | 1.00 | | | 1.00 || | | movq %r10, -48(%rsp) #37.5[spill]
273 | | | 0.00 | 0.00 | 1.00 | | | 1.00 || | | movq %rsi, -40(%rsp) #37.5[spill]
274 | | | 0.00 | 0.00 | 1.00 | | | 1.00 || | | movq %rcx, -32(%rsp) #37.5[spill]
275 | | | 0.00 | 0.00 | 1.00 | | | 1.00 || | | movq %r9, -80(%rsp) #37.5[spill]
276 | | | 0.00 | 0.00 | 1.00 | | | 1.00 || | | movq %rdx, -72(%rsp) #37.5[spill]
277 | | | | | | | | || | | # LOE rbx rbp rdi r13 r11d r12d r14d xmm6 xmm7 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14
278 | | | | | | | | || | | ..B1.26: # Preds ..B1.30 ..B1.25
279 | | | | | | | | || | | # Execution count [2.50e+01]
280 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovdqu (%r13,%rbx,4), %ymm3 #68.21
281 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm3, %ymm3, %ymm4 #69.36
282 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm4, %ymm3, %ymm3 #69.36
283 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | movl (%r13,%rbx,4), %r10d #68.21
284 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | movl 4(%r13,%rbx,4), %r9d #68.21
285 | | | 0.50 0.50 | 0.50 0.50 | | | | || | 4.0 | movl 8(%r13,%rbx,4), %r8d #68.21
286 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | movl 12(%r13,%rbx,4), %esi #68.21
287 | | 1.00 | | | | 0.00 | | || | | lea (%r10,%r10,2), %r10d #69.36
288 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | movl 16(%r13,%rbx,4), %ecx #68.21
289 | | 1.00 | | | | 0.00 | | || | | lea (%r9,%r9,2), %r9d #69.36
290 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | movl 20(%r13,%rbx,4), %edx #68.21
291 | | 1.00 | | | | 0.00 | | || | 1.0 | lea (%r8,%r8,2), %r8d #69.36
292 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | movl 24(%r13,%rbx,4), %eax #68.21
293 | | 1.00 | | | | 0.00 | | || | | lea (%rsi,%rsi,2), %esi #69.36
294 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | movl 28(%r13,%rbx,4), %r15d #68.21
295 | | 1.00 | | | | 0.00 | | || | | lea (%rcx,%rcx,2), %ecx #69.36
296 | | 1.00 | | | | 0.00 | | || | | lea (%rdx,%rdx,2), %edx #69.36
297 | | 1.00 | | | | 0.00 | | || | | lea (%rax,%rax,2), %eax #69.36
298 | | 1.00 | | | | 0.00 | | || | | lea (%r15,%r15,2), %r15d #69.36
299 | | | | | | | | || | | # LOE rbx rbp rdi r13 eax edx ecx esi r8d r9d r10d r11d r12d r14d r15d xmm6 xmm7 xmm12 ymm3 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14
300 | | | | | | | | || | | ..B1.29: # Preds ..B1.26
301 | | | | | | | | || | | # Execution count [1.25e+01]
302 | | | | | | | | || | | X vpcmpeqb %xmm0, %xmm0, %k1 #69.36
303 | | | | | | | | || | | X vpcmpeqb %xmm0, %xmm0, %k2 #69.36
304 | | | | | | | | || | | X vpcmpeqb %xmm0, %xmm0, %k3 #69.36
305 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm4, %zmm4, %zmm4 #69.36
306 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm17, %zmm17, %zmm17 #69.36
307 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm18, %zmm18, %zmm18 #69.36
308 | 1.50 | 0.17 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.83 | || | | vgatherdpd 16(%rdi,%ymm3,8), %zmm4{%k1} #69.36
309 | 1.50 | 0.00 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 1.00 | || 4.0 | | vgatherdpd 8(%rdi,%ymm3,8), %zmm17{%k2} #69.36
310 | 1.50 | 0.00 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 1.00 | || | | vgatherdpd (%rdi,%ymm3,8), %zmm18{%k3} #69.36
311 | | | | | | | | || | | # LOE rbx rbp rdi r13 r11d r12d r14d xmm6 xmm7 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm4 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14 zmm17 zmm18
312 | | | | | | | | || | | ..B1.30: # Preds ..B1.29
313 | | | | | | | | || | | # Execution count [2.50e+01]
314 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || | | addl $8, %r12d #67.9
315 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || | | addq $8, %rbx #67.9
316 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm4, %zmm0, %zmm26 #71.36
317 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm17, %zmm1, %zmm24 #70.36
318 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm18, %zmm2, %zmm23 #69.36
319 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm24, %zmm24, %zmm3 #72.49
320 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm23, %zmm23, %zmm3 #72.49
321 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm26, %zmm26, %zmm3 #72.63
322 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm3, %zmm22 #75.38
323 | | | | | | 1.00 | | || | | vcmppd $1, %zmm14, %zmm3, %k2 #74.22
324 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm22, %k0 #75.38
325 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm22, %zmm3 #75.38
326 | 1.00 | | | | | | | || | | knotw %k0, %k1 #75.38
327 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm3, %zmm3, %zmm4 #75.38
328 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm22, %zmm3, %zmm22{%k1} #75.38
329 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd213pd %zmm22, %zmm4, %zmm22{%k1} #75.38
330 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm13, %zmm22, %zmm17 #76.38
331 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm10, %zmm22, %zmm19 #77.54
332 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm17, %zmm22, %zmm20 #76.44
333 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm22, %zmm18 #76.50
334 | 0.50 | | | | | 0.50 | | || | | vfmsub213pd %zmm5, %zmm20, %zmm22 #77.54
335 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm19, %zmm18, %zmm21 #77.61
336 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm22, %zmm21, %zmm25 #77.67
337 | 1.00 | | | | | 0.00 | | || | | vfmadd231pd %zmm23, %zmm25, %zmm9{%k2} #78.17
338 | 1.00 | | | | | 0.00 | | || | | vfmadd231pd %zmm24, %zmm25, %zmm8{%k2} #79.17
339 | 1.00 | | | | | 0.00 | | || 4.0 | | vfmadd231pd %zmm26, %zmm25, %zmm11{%k2} #80.17
340 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || | | cmpl %r14d, %r12d #67.9
341 | | | | | | | | || | | * jb ..B1.26 # Prob 82% #67.9
342 | | | | | | | | || | | # LOE rbx rbp rdi r13 r11d r12d r14d xmm6 xmm7 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14
21.0 11.2 17.0 6.50 17.0 6.50 7.00 17.0 8.83 7.00 75.0 10.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
287 | 6.0 | lea (%r10,%r10,2), %r10d #69.36| [269, 283, 287]
291 | 10.0 | lea (%r8,%r8,2), %r8d #69.36| [263, 264, 269, 285, 291]
295 | 9.0 | lea (%rcx,%rcx,2), %ecx #69.36| [264, 269, 288, 295]
314 | 1.0 | addl $8, %r12d #67.9| [314]
339 | 4.0 | vfmadd231pd %zmm26, %zmm25, %zmm11{%k2} #80.17| [339]
338 | 4.0 | vfmadd231pd %zmm24, %zmm25, %zmm8{%k2} #79.17| [338]
337 | 4.0 | vfmadd231pd %zmm23, %zmm25, %zmm9{%k2} #78.17| [337]

View File

@ -0,0 +1,91 @@
iwia021h@testfront1:~/MD-Bench/asm$ /home/hpc/iwia/iwia021h/.local/bin/osaca --ignore-unknown --arch=CSX force_aos_lt8_markers.s
Open Source Architecture Code Analyzer (OSACA) - 0.3.14
Analyzed file: force_aos_lt8_markers.s
Architecture: CSX
Timestamp: 2021-04-29 15:49:27
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
-------------------------------------------------------------------------------------------------
358 | | | | | | | | || | | # LOE rax rdx rcx rbp rsi rdi r8 r9 r10 r11d r14d xmm6 xmm7 xmm12 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14
359 | | | | | | | | || | | ..B1.33: # Preds ..B1.32
360 | | | | | | | | || | | # Execution count [2.50e+01]
361 | | 1.00 | | | | | | || 3.0 | | imulq %r8, %rcx #56.43
362 | | | | | | 1.00 | | || | 3.0 | vbroadcastsd %xmm6, %zmm4 #58.23
363 | | | | | | | | || | | X subl %r14d, %r11d #67.9
364 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || 1.0 | | addq %r10, %rcx #37.5
365 | | | | | | | | || | | X vpbroadcastd %r11d, %ymm0 #67.9
366 | | | | | | 1.00 | | || | | vpcmpgtd %ymm15, %ymm0, %k3 #67.9
367 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || | | movslq %r14d, %r14 #67.9
368 | 1.00 | | | | | | | || | | kmovw %k3, %ebx #67.9
369 | 0.00 | 1.00 | 0.50 0.50 | 0.50 0.50 | | 0.00 | | || 0.0 | | vmovdqu32 (%rcx,%r14,4), %ymm1{%k3}{z} #68.21
370 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm1, %ymm1, %ymm2 #69.36
371 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm2, %ymm1, %ymm0 #69.36
372 | | | | | | | | || | | # LOE rax rdx rbp rsi rdi r8 r9 r10 ebx xmm7 xmm12 ymm0 ymm15 ymm16 zmm4 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14 k3
373 | | | | | | | | || | | ..B1.36: # Preds ..B1.33
374 | | | | | | | | || | | # Execution count [1.25e+01]
375 | 1.00 | | | | | | | || | | kmovw %k3, %k1 #69.36
376 | 1.00 | | | | | | | || | | kmovw %k3, %k2 #69.36
377 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm1, %zmm1, %zmm1 #69.36
378 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm2, %zmm2, %zmm2 #69.36
379 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm3, %zmm3, %zmm3 #69.36
380 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || | | vgatherdpd 16(%rdi,%ymm0,8), %zmm1{%k1} #69.36
381 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || 4.0 | | vgatherdpd 8(%rdi,%ymm0,8), %zmm2{%k2} #69.36
382 | 1.50 | 0.00 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 1.00 | || | | vgatherdpd (%rdi,%ymm0,8), %zmm3{%k3} #69.36
383 | | | | | | | | || | | # LOE rax rdx rbp rsi rdi r8 r9 r10 ebx xmm7 xmm12 ymm15 ymm16 zmm1 zmm2 zmm3 zmm4 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14
384 | | | | | | | | || | | ..B1.37: # Preds ..B1.36
385 | | | | | | | | || | | # Execution count [2.50e+01]
386 | | | | | | 1.00 | | || | | vbroadcastsd %xmm7, %zmm7 #59.23
387 | | | | | | 1.00 | | || | | vbroadcastsd %xmm12, %zmm12 #60.23
388 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm1, %zmm12, %zmm23 #71.36
389 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm2, %zmm7, %zmm21 #70.36
390 | 0.50 | | | | | 0.50 | | || | 4.0 | vsubpd %zmm3, %zmm4, %zmm20 #69.36
391 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm21, %zmm21, %zmm19 #72.49
392 | 0.50 | | | | | 0.50 | | || 4.0 | 4.0 | vfmadd231pd %zmm20, %zmm20, %zmm19 #72.49
393 | 0.50 | | | | | 0.50 | | || 4.0 | 4.0 | vfmadd231pd %zmm23, %zmm23, %zmm19 #72.63
394 | 2.50 | | | | | 0.50 | | || 8.0 | 8.0 | vrcp14pd %zmm19, %zmm18 #75.38
395 | | | | | | 1.00 | | || | | vcmppd $1, %zmm14, %zmm19, %k2 #74.22
396 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm18, %k0 #75.38
397 | 1.00 | | | | | | | || | | kmovw %k2, %ecx #74.22
398 | 1.00 | | | | | | | || | | knotw %k0, %k1 #75.38
399 | | | | | | | | || | | * vmovaps %zmm19, %zmm0 #75.38
400 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || | | andl %ecx, %ebx #74.22
401 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm18, %zmm0 #75.38
402 | 1.00 | | | | | | | || | | kmovw %ebx, %k3 #78.17
403 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm0, %zmm0, %zmm1 #75.38
404 | 0.50 | | | | | 0.50 | | || | 4.0 | vfmadd213pd %zmm18, %zmm0, %zmm18{%k1} #75.38
405 | 0.50 | | | | | 0.50 | | || 4.0 | 4.0 | vfmadd213pd %zmm18, %zmm1, %zmm18{%k1} #75.38
406 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm13, %zmm18, %zmm2 #76.38
407 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm10, %zmm18, %zmm4 #77.54
408 | 0.50 | | | | | 0.50 | | || 4.0 | 4.0 | vmulpd %zmm2, %zmm18, %zmm6 #76.44
409 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm6, %zmm18, %zmm3 #76.50
410 | 0.50 | | | | | 0.50 | | || | | vfmsub213pd %zmm5, %zmm6, %zmm18 #77.54
411 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm4, %zmm3, %zmm17 #77.61
412 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm18, %zmm17, %zmm22 #77.67
413 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm20, %zmm22, %zmm9{%k3} #78.17
414 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm21, %zmm22, %zmm8{%k3} #79.17
415 | 0.00 | | | | | 1.00 | | || 4.0 | | vfmadd231pd %zmm23, %zmm22, %zmm11{%k3} #80.17
22.0 5.00 13.0 2.50 13.0 2.50 22.0 5.00 70.0 35.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
363 | 0.0 | subl %r14d, %r11d #67.9| [363]
367 | 1.0 | movslq %r14d, %r14 #67.9| [367]
386 | 3.0 | vbroadcastsd %xmm7, %zmm7 #59.23| [386]
387 | 3.0 | vbroadcastsd %xmm12, %zmm12 #60.23| [387]
415 | 4.0 | vfmadd231pd %zmm23, %zmm22, %zmm11{%k3} #80.17| [415]
414 | 4.0 | vfmadd231pd %zmm21, %zmm22, %zmm8{%k3} #79.17| [414]
413 | 4.0 | vfmadd231pd %zmm20, %zmm22, %zmm9{%k3} #78.17| [413]
397 | 28.0 | kmovw %k2, %ecx #74.22| [361, 364, 369, 371, 382, 390, 392, 393, 395, 397]
408 | 35.0 | vmulpd %zmm2, %zmm18, %zmm6 #76.44| [362, 390, 392, 393, 394, 404, 405, 408]

View File

@ -0,0 +1,71 @@
iwia021h@testfront1:~/MD-Bench/asm$ /home/hpc/iwia/iwia021h/.local/bin/osaca --ignore-unknown --arch=CSX force_soa_geq1200_markers.s
Open Source Architecture Code Analyzer (OSACA) - 0.3.14
Analyzed file: force_soa_geq1200_markers.s
Architecture: CSX
Timestamp: 2021-04-29 15:54:23
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
-------------------------------------------------------------------------------------------------
189 | | | | | | | | || | | # LOE rax rdx rcx rbp rsi rdi r8 r10 r11 r12 r14 r15 ebx r9d r13d xmm8 xmm9 xmm10 ymm3 ymm4 ymm17 ymm18 zmm2 zmm5 zmm6 zmm7 zmm11 zmm12 zmm13 zmm14 zmm15 zmm16 zmm19
190 | | | | | | | | || | | ..B1.18: # Preds ..B1.18 ..B1.17
191 | | | | | | | | || | | # Execution count [2.50e+01]
192 | | | | | | 1.00 | | || | | vpcmpgtd %ymm4, %ymm3, %k5 #67.9
193 | 0.00 | 1.00 | | | | 0.00 | | || | | vpaddd %ymm18, %ymm4, %ymm4 #67.9
194 | 0.00 | 1.00 | 0.50 0.50 | 0.50 0.50 | | 0.00 | | || 4.0 | | vmovdqu32 (%rcx,%r15,4), %ymm20{%k5}{z} #68.21
195 | | | | | | | | || | | * vmovaps %zmm19, %zmm22 #70.36
196 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | addq $8, %r15 #67.9
197 | 1.00 | | | | | | | || | | kmovw %k5, %k2 #70.36
198 | | | | | | | | || | | * vmovaps %zmm19, %zmm21 #69.36
199 | 1.00 | | | | | | | || | | kmovw %k5, %k1 #69.36
200 | | | | | | | | || | | * vmovaps %zmm19, %zmm23 #71.36
201 | 1.00 | | | | | | | || | | kmovw %k5, %k3 #71.36
202 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || | | vgatherdpd (%rsi,%ymm20,8), %zmm23{%k3} #71.36
203 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || 4.0 | | vgatherdpd (%rax,%ymm20,8), %zmm22{%k2} #70.36
204 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || | | vgatherdpd (%rdx,%ymm20,8), %zmm21{%k1} #69.36
205 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm22, %zmm5, %zmm0 #70.36
206 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm21, %zmm2, %zmm1 #69.36
207 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm23, %zmm6, %zmm21 #71.36
208 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm0, %zmm0, %zmm20 #72.49
209 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm1, %zmm1, %zmm20 #72.49
210 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm21, %zmm21, %zmm20 #72.63
211 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm20, %zmm31 #75.38
212 | | | | | | 1.00 | | || | | vcmppd $1, %zmm16, %zmm20, %k6{%k5} #74.22
213 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm31, %k0 #75.38
214 | | | | | | | | || | | * vmovaps %zmm20, %zmm24 #75.38
215 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm31, %zmm24 #75.38
216 | 1.00 | | | | | | | || | | knotw %k0, %k4 #75.38
217 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm24, %zmm24, %zmm25 #75.38
218 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm31, %zmm24, %zmm31{%k4} #75.38
219 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd213pd %zmm31, %zmm25, %zmm31{%k4} #75.38
220 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm15, %zmm31, %zmm26 #76.38
221 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm14, %zmm31, %zmm28 #77.54
222 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm26, %zmm31, %zmm29 #76.44
223 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm29, %zmm31, %zmm27 #76.50
224 | 0.00 | | | | | 1.00 | | || | | vfmsub213pd %zmm7, %zmm29, %zmm31 #77.54
225 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm28, %zmm27, %zmm30 #77.61
226 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm31, %zmm30, %zmm24 #77.67
227 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm1, %zmm24, %zmm13{%k6} #78.17
228 | 0.00 | | | | | 1.00 | | || | 4.0 | vfmadd231pd %zmm0, %zmm24, %zmm12{%k6} #79.17
229 | 0.00 | | | | | 1.00 | | || 4.0 | | vfmadd231pd %zmm21, %zmm24, %zmm11{%k6} #80.17
230 | 0.00 | 0.17 | | | | 0.00 | 0.83 | || | | cmpq %r14, %r15 #67.9
231 | | | | | | | | || | | * jb ..B1.18 # Prob 82% #67.9
18.0 4.17 13.0 2.50 13.0 2.50 18.0 2.83 68.0 4
Loop-Carried Dependencies Analysis Report
-----------------------------------------
193 | 1.0 | vpaddd %ymm18, %ymm4, %ymm4 #67.9| [193]
196 | 1.0 | addq $8, %r15 #67.9| [196]
228 | 4.0 | vfmadd231pd %zmm0, %zmm24, %zmm12{%k6} #79.17| [228]
227 | 4.0 | vfmadd231pd %zmm1, %zmm24, %zmm13{%k6} #78.17| [227]
229 | 4.0 | vfmadd231pd %zmm21, %zmm24, %zmm11{%k6} #80.17| [229]

View File

@ -0,0 +1,69 @@
iwia021h@testfront1:~/MD-Bench/asm$ /home/hpc/iwia/iwia021h/.local/bin/osaca --ignore-unknown --arch=CSX force_soa_lt1200_markers.s
Open Source Architecture Code Analyzer (OSACA) - 0.3.14
Analyzed file: force_soa_lt1200_markers.s
Architecture: CSX
Timestamp: 2021-04-29 15:39:58
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
-------------------------------------------------------------------------------------------------
253 | | | | | | | | || | | # LOE rax rdx rcx rbp rsi rdi r8 r10 r11 r12 r14 ebx r9d r13d xmm8 xmm9 xmm10 ymm17 ymm18 zmm0 zmm1 zmm2 zmm7 zmm11 zmm12 zmm13 zmm14 zmm15 zmm16 zmm19
254 | | | | | | | | || | | ..B1.22: # Preds ..B1.22 ..B1.21
255 | | | | | | | | || | | # Execution count [2.50e+01]
256 | | | | | | | | || | | X vpcmpeqb %xmm0, %xmm0, %k2 #70.36
257 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | addl $8, %r9d #67.9
258 | | | | | | | | || | | X vpcmpeqb %xmm0, %xmm0, %k1 #69.36
259 | | | | | | | | || | | X vpcmpeqb %xmm0, %xmm0, %k3 #71.36
260 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovdqu (%rcx,%r14,4), %ymm3 #68.21
261 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | addq $8, %r14 #67.9
262 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm5, %zmm5, %zmm5 #70.36
263 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm4, %zmm4, %zmm4 #69.36
264 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm6, %zmm6, %zmm6 #71.36
265 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || 4.0 | | vgatherdpd (%rax,%ymm3,8), %zmm5{%k2} #70.36
266 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || | | vgatherdpd (%rdx,%ymm3,8), %zmm4{%k1} #69.36
267 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || | | vgatherdpd (%rsi,%ymm3,8), %zmm6{%k3} #71.36
268 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm5, %zmm1, %zmm29 #70.36
269 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm4, %zmm0, %zmm28 #69.36
270 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm6, %zmm2, %zmm31 #71.36
271 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm29, %zmm29, %zmm20 #72.49
272 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm28, %zmm28, %zmm20 #72.49
273 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm31, %zmm31, %zmm20 #72.63
274 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm20, %zmm27 #75.38
275 | | | | | | 1.00 | | || | | vcmppd $1, %zmm16, %zmm20, %k5 #74.22
276 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm27, %k0 #75.38
277 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm27, %zmm20 #75.38
278 | 1.00 | | | | | | | || | | knotw %k0, %k4 #75.38
279 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm20, %zmm21 #75.38
280 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm27, %zmm20, %zmm27{%k4} #75.38
281 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd213pd %zmm27, %zmm21, %zmm27{%k4} #75.38
282 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm15, %zmm27, %zmm22 #76.38
283 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm14, %zmm27, %zmm24 #77.54
284 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm22, %zmm27, %zmm25 #76.44
285 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm25, %zmm27, %zmm23 #76.50
286 | 0.50 | | | | | 0.50 | | || | | vfmsub213pd %zmm7, %zmm25, %zmm27 #77.54
287 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm24, %zmm23, %zmm26 #77.61
288 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm27, %zmm26, %zmm30 #77.67
289 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm28, %zmm30, %zmm13{%k5} #78.17
290 | 0.00 | | | | | 1.00 | | || | 4.0 | vfmadd231pd %zmm29, %zmm30, %zmm12{%k5} #79.17
291 | 0.00 | | | | | 1.00 | | || 4.0 | | vfmadd231pd %zmm31, %zmm30, %zmm11{%k5} #80.17
292 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | cmpl %ebx, %r9d #67.9
293 | | | | | | | | || | | * jb ..B1.22 # Prob 82% #67.9
17.5 3.00 13.0 2.50 13.0 2.50 17.5 3.00 68.0 4
Loop-Carried Dependencies Analysis Report
-----------------------------------------
257 | 1.0 | addl $8, %r9d #67.9| [257]
261 | 1.0 | addq $8, %r14 #67.9| [261]
290 | 4.0 | vfmadd231pd %zmm29, %zmm30, %zmm12{%k5} #79.17| [290]
289 | 4.0 | vfmadd231pd %zmm28, %zmm30, %zmm13{%k5} #78.17| [289]
291 | 4.0 | vfmadd231pd %zmm31, %zmm30, %zmm11{%k5} #80.17| [291]

View File

@ -0,0 +1,79 @@
iwia021h@testfront1:~/MD-Bench/asm$ /home/hpc/iwia/iwia021h/.local/bin/osaca --ignore-unknown --arch=CSX force_soa_lt8_markers.s
Open Source Architecture Code Analyzer (OSACA) - 0.3.14
Analyzed file: force_soa_lt8_markers.s
Architecture: CSX
Timestamp: 2021-04-29 15:52:48
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
-------------------------------------------------------------------------------------------------
300 | | | | | | | | || | | # LOE rax rdx rbp rsi rdi r8 r10 r11 r12 ebx r13d xmm8 xmm9 xmm10 ymm17 ymm18 zmm7 zmm11 zmm12 zmm13 zmm14 zmm15 zmm16 zmm19
301 | | | | | | | | || | | ..B1.25: # Preds ..B1.24
302 | | | | | | | | || | | # Execution count [2.50e+01]
303 | | 1.00 | | | | | | || 3.0 | | imulq %r12, %r8 #56.43
304 | | | | | | 1.00 | | || | | vbroadcastsd %xmm9, %zmm9 #59.23
305 | | | | | | 1.00 | | || | 3.0 | vbroadcastsd %xmm8, %zmm2 #58.23
306 | | | | | | 1.00 | | || | | vbroadcastsd %xmm10, %zmm10 #60.23
307 | | | | | | | | || | | X subl %ebx, %r13d #67.9
308 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || 1.0 | | addq %r11, %r8 #37.5
309 | | | | | | | | || | | X vpbroadcastd %r13d, %ymm0 #67.9
310 | | | | | | 1.00 | | || | | vpcmpgtd %ymm17, %ymm0, %k5 #67.9
311 | 0.00 | 0.16 | | | | 0.00 | 0.84 | || | | movslq %ebx, %rbx #67.9
312 | | | | | | | | || | | * vmovaps %zmm19, %zmm4 #70.36
313 | 1.00 | | | | | | | || | | kmovw %k5, %k2 #70.36
314 | | | | | | | | || | | * vmovaps %zmm19, %zmm3 #69.36
315 | 0.00 | 1.00 | 0.50 0.50 | 0.50 0.50 | | 0.00 | | || 0.0 | | vmovdqu32 (%r8,%rbx,4), %ymm1{%k5}{z} #68.21
316 | 1.00 | | | | | | | || | | kmovw %k5, %k1 #69.36
317 | | | | | | | | || | | * vmovaps %zmm19, %zmm5 #71.36
318 | 1.00 | | | | | | | || | | kmovw %k5, %k3 #71.36
319 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || | | vgatherdpd (%rsi,%ymm1,8), %zmm5{%k3} #71.36
320 | 1.50 | 0.34 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.66 | || 4.0 | | vgatherdpd (%rax,%ymm1,8), %zmm4{%k2} #70.36
321 | 1.50 | 0.00 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 1.00 | || | | vgatherdpd (%rdx,%ymm1,8), %zmm3{%k1} #69.36
322 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm5, %zmm10, %zmm30 #71.36
323 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm4, %zmm9, %zmm28 #70.36
324 | 0.50 | | | | | 0.50 | | || | 4.0 | vsubpd %zmm3, %zmm2, %zmm27 #69.36
325 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm28, %zmm28, %zmm26 #72.49
326 | 0.50 | | | | | 0.50 | | || 4.0 | 4.0 | vfmadd231pd %zmm27, %zmm27, %zmm26 #72.49
327 | 0.50 | | | | | 0.50 | | || 4.0 | 4.0 | vfmadd231pd %zmm30, %zmm30, %zmm26 #72.63
328 | 2.50 | | | | | 0.50 | | || 8.0 | 8.0 | vrcp14pd %zmm26, %zmm25 #75.38
329 | | | | | | 1.00 | | || | | vcmppd $1, %zmm16, %zmm26, %k6{%k5} #74.22
330 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm25, %k0 #75.38
331 | | | | | | | | || | | * vmovaps %zmm26, %zmm6 #75.38
332 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | 4.0 | vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm25, %zmm6 #75.38
333 | 1.00 | | | | | | | || | | knotw %k0, %k4 #75.38
334 | 0.50 | | | | | 0.50 | | || 4.0 | 4.0 | vmulpd %zmm6, %zmm6, %zmm8 #75.38
335 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm25, %zmm6, %zmm25{%k4} #75.38
336 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd213pd %zmm25, %zmm8, %zmm25{%k4} #75.38
337 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm15, %zmm25, %zmm20 #76.38
338 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm14, %zmm25, %zmm22 #77.54
339 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm25, %zmm23 #76.44
340 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm23, %zmm25, %zmm21 #76.50
341 | 0.50 | | | | | 0.50 | | || | | vfmsub213pd %zmm7, %zmm23, %zmm25 #77.54
342 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm22, %zmm21, %zmm24 #77.61
343 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm25, %zmm24, %zmm29 #77.67
344 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm27, %zmm29, %zmm13{%k6} #78.17
345 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm28, %zmm29, %zmm12{%k6} #79.17
346 | 0.00 | | | | | 1.00 | | || 4.0 | | vfmadd231pd %zmm30, %zmm29, %zmm11{%k6} #80.17
19.5 3.50 13.0 2.50 13.0 2.50 19.5 3.50 68.0 31.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
308 | 4.0 | addq %r11, %r8 #37.5| [303, 308]
304 | 3.0 | vbroadcastsd %xmm9, %zmm9 #59.23| [304]
306 | 3.0 | vbroadcastsd %xmm10, %zmm10 #60.23| [306]
307 | 0.0 | subl %ebx, %r13d #67.9| [307]
311 | 1.0 | movslq %ebx, %rbx #67.9| [311]
346 | 4.0 | vfmadd231pd %zmm30, %zmm29, %zmm11{%k6} #80.17| [346]
345 | 4.0 | vfmadd231pd %zmm28, %zmm29, %zmm12{%k6} #79.17| [345]
344 | 4.0 | vfmadd231pd %zmm27, %zmm29, %zmm13{%k6} #78.17| [344]
334 | 31.0 | vmulpd %zmm6, %zmm6, %zmm8 #75.38| [305, 324, 326, 327, 328, 332, 334]

View File

@ -0,0 +1,70 @@
iwia021h@testfront1:~/MD-Bench$ /home/hpc/iwia/iwia021h/.local/bin/osaca --ignore-unknown --arch=CSX ICC/force.s
Open Source Architecture Code Analyzer (OSACA) - 0.3.14
Analyzed file: ICC/force.s
Architecture: CSX
Timestamp: 2021-04-30 16:08:44
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
-------------------------------------------------------------------------------------------------
306 | | | | | | | | || | | # LOE rbp rdi r8 r9 r10 edx ecx r11d r12d r13d r14d r15d ymm13 ymm14 zmm2 zmm3 zmm4 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm11 zmm12 zmm15
307 | | | | | | | | || | | ..B1.29: # Preds ..B1.28
308 | | | | | | | | || | | # Execution count [2.50e+04]
309 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | movl %r14d, %eax #64.13
310 | | | | | | | | || | | X subl %ecx, %eax #64.13
311 | | | | | | | | || | | X vpbroadcastd %eax, %ymm0 #64.13
312 | | | | | | 1.00 | | || | | vpcmpgtd %ymm14, %ymm0, %k5 #64.13
313 | 0.00 | 0.16 | | | | 0.00 | 0.84 | || | | movslq %ecx, %rcx #64.13
314 | | | | | | | | || | | * vmovaps %zmm15, %zmm17 #67.40
315 | 1.00 | | | | | | | || | | kmovw %k5, %k2 #67.40
316 | | | | | | | | || | | * vmovaps %zmm15, %zmm16 #66.40
317 | 0.00 | 1.00 | 0.50 0.50 | 0.50 0.50 | | 0.00 | | || 4.0 | | vmovdqu32 (%rdi,%rcx,4), %ymm1{%k5}{z} #65.25
318 | 1.00 | | | | | | | || | | kmovw %k5, %k1 #66.40
319 | | | | | | | | || | | * vmovaps %zmm15, %zmm18 #68.40
320 | 1.00 | | | | | | | || | | kmovw %k5, %k3 #68.40
321 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || | | vgatherdpd (%r8,%ymm1,8), %zmm18{%k3} #68.40
322 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || 4.0 | | vgatherdpd (%r9,%ymm1,8), %zmm17{%k2} #67.40
323 | 1.50 | 0.34 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.66 | || | | vgatherdpd (%r10,%ymm1,8), %zmm16{%k1} #66.40
324 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm18, %zmm3, %zmm31 #68.40
325 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm17, %zmm2, %zmm29 #67.40
326 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm16, %zmm4, %zmm28 #66.40
327 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm29, %zmm29, %zmm27 #69.53
328 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm28, %zmm28, %zmm27 #69.53
329 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm31, %zmm31, %zmm27 #69.67
330 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm27, %zmm26 #72.42
331 | | | | | | 1.00 | | || | | vcmppd $1, %zmm12, %zmm27, %k6{%k5} #71.26
332 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm26, %k0 #72.42
333 | | | | | | | | || | | * vmovaps %zmm27, %zmm19 #72.42
334 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm26, %zmm19 #72.42
335 | 1.00 | | | | | | | || | | knotw %k0, %k4 #72.42
336 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm19, %zmm19, %zmm20 #72.42
337 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm26, %zmm19, %zmm26{%k4} #72.42
338 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd213pd %zmm26, %zmm20, %zmm26{%k4} #72.42
339 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm11, %zmm26, %zmm21 #73.42
340 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm9, %zmm26, %zmm23 #74.58
341 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm21, %zmm26, %zmm24 #73.48
342 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm24, %zmm26, %zmm22 #73.54
343 | 0.00 | | | | | 1.00 | | || | | vfmsub213pd %zmm10, %zmm24, %zmm26 #74.58
344 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm23, %zmm22, %zmm25 #74.65
345 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm26, %zmm25, %zmm30 #74.71
346 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm28, %zmm30, %zmm7{%k6} #75.21
347 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm29, %zmm30, %zmm6{%k6} #76.21
348 | 0.00 | | | | | 1.00 | | || 4.0 | 4.0 | vfmadd231pd %zmm31, %zmm30, %zmm5{%k6} #77.21
18.0 3.00 13.0 2.50 13.0 2.50 18.0 3.00 68.0 4
Loop-Carried Dependencies Analysis Report
-----------------------------------------
313 | 1.0 | movslq %ecx, %rcx #64.13| [313]
348 | 4.0 | vfmadd231pd %zmm31, %zmm30, %zmm5{%k6} #77.21| [348]
347 | 4.0 | vfmadd231pd %zmm29, %zmm30, %zmm6{%k6} #76.21| [347]
346 | 4.0 | vfmadd231pd %zmm28, %zmm30, %zmm7{%k6} #75.21| [346]

12567
asm/avx2/force_soa_att.s Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,87 @@
200,4x4x2,8,256,21.5040,6.1440,9.2160,2.1360,23.9701,95.9944,13.7135
200,4x4x4,8,512,43.0080,12.2880,18.4320,4.3167,23.7221,96.9983,13.8569
200,4x4x8,8,1024,86.0160,24.5760,36.8640,8.5194,24.0392,95.7186,13.6741
200,4x4x16,8,2048,172.0320,49.1520,73.7280,17.4337,23.4947,97.9370,13.9910
200,4x8x2,8,512,43.0080,12.2880,18.4320,4.2566,24.0568,95.6488,13.6641
200,4x8x4,8,1024,86.0160,24.5760,36.8640,9.2081,22.2412,103.4567,14.7795
200,4x8x8,8,2048,172.0320,49.1520,73.7280,17.5763,23.3041,98.7380,14.1054
200,4x8x16,8,4096,344.0640,98.3040,147.4560,34.0899,24.0306,95.7531,13.6790
200,4x16x2,8,1024,86.0160,24.5760,36.8640,8.5118,24.0608,95.6326,13.6618
200,4x16x4,8,2048,172.0320,49.1520,73.7280,17.0260,24.0574,95.6463,13.6638
200,4x16x8,8,4096,344.0640,98.3040,147.4560,34.2228,23.9373,96.1263,13.7323
200,4x16x16,8,8192,688.1280,196.6080,294.9120,70.1974,23.3399,98.5865,14.0838
200,8x4x2,8,512,43.0080,12.2880,18.4320,4.2777,23.9379,96.1236,13.7319
200,8x4x4,8,1024,86.0160,24.5760,36.8640,8.5118,24.0608,95.6328,13.6618
200,8x4x8,8,2048,172.0320,49.1520,73.7280,17.0249,24.0589,95.6404,13.6629
200,8x4x16,8,4096,344.0640,98.3040,147.4560,34.0480,24.0601,95.6354,13.6622
200,8x8x2,8,1024,86.0160,24.5760,36.8640,8.5123,24.0593,95.6388,13.6627
200,8x8x4,8,2048,172.0320,49.1520,73.7280,17.0434,24.0327,95.7445,13.6778
200,8x8x8,8,4096,344.0640,98.3040,147.4560,34.0453,24.0621,95.6277,13.6611
200,8x8x16,8,8192,688.1280,196.6080,294.9120,68.7834,23.8197,96.6007,13.8001
200,8x16x2,8,2048,172.0320,49.1520,73.7280,17.1665,23.8604,96.4357,13.7765
200,8x16x4,8,4096,344.0640,98.3040,147.4560,34.7349,23.5844,97.5647,13.9378
200,8x16x8,8,8192,688.1280,196.6080,294.9120,68.0974,24.0597,95.6373,13.6625
200,8x16x16,8,16384,1376.2560,393.2160,589.8240,137.8436,-7.3864,-311.5186,-44.5027
200,16x4x2,8,1024,86.0160,24.5760,36.8640,8.5148,24.0524,95.6662,13.6666
200,16x4x4,8,2048,172.0320,49.1520,73.7280,17.0260,24.0573,95.6467,13.6638
200,16x4x8,8,4096,344.0640,98.3040,147.4560,34.0505,24.0584,95.6424,13.6632
200,16x4x16,8,8192,688.1280,196.6080,294.9120,69.0856,23.7155,97.0251,13.8607
200,16x8x2,8,2048,172.0320,49.1520,73.7280,17.4353,23.4925,97.9460,13.9923
200,16x8x4,8,4096,344.0640,98.3040,147.4560,34.0727,24.0427,95.7047,13.6721
200,16x8x8,8,8192,688.1280,196.6080,294.9120,69.3569,23.6227,97.4062,13.9152
200,16x8x16,8,16384,1376.2560,393.2160,589.8240,137.4764,-7.4061,-310.6889,-44.3841
200,16x16x2,8,4096,344.0640,98.3040,147.4560,34.0576,24.0534,95.6622,13.6660
200,16x16x4,8,8192,688.1280,196.6080,294.9120,71.0575,23.0574,99.7945,14.2564
200,16x16x8,8,16384,1376.2560,393.2160,589.8240,136.6013,-7.4536,-308.7110,-44.1016
200,16x16x16,8,32768,2752.5120,786.4320,1179.6480,274.6401,-7.4146,-310.3355,-44.3336
200,4x4x2,16,512,59.3920,12.2880,34.8160,7.2058,14.2108,161.9192,10.7946
200,4x4x4,16,1024,118.7840,24.5760,69.6320,14.4343,14.1884,162.1750,10.8117
200,4x4x8,16,2048,237.5680,49.1520,139.2640,28.8264,14.2092,161.9376,10.7958
200,4x4x16,16,4096,475.1360,98.3040,278.5280,58.3994,14.0275,164.0345,10.9356
200,4x8x2,16,1024,118.7840,24.5760,69.6320,14.4088,14.2135,161.8880,10.7925
200,4x8x4,16,2048,237.5680,49.1520,139.2640,29.4276,13.9189,165.3146,11.0210
200,4x8x8,16,4096,475.1360,98.3040,278.5280,59.7154,13.7184,167.7309,11.1821
200,4x8x16,16,8192,950.2720,196.6080,557.0560,115.2898,14.2111,161.9152,10.7943
200,4x16x2,16,2048,237.5680,49.1520,139.2640,28.8249,14.2100,161.9287,10.7952
200,4x16x4,16,4096,475.1360,98.3040,278.5280,58.7774,13.9373,165.0962,11.0064
200,4x16x8,16,8192,950.2720,196.6080,557.0560,116.0715,14.1154,163.0130,10.8675
200,4x16x16,16,16384,1900.5440,393.2160,1114.1120,231.6558,-4.3952,-523.5290,-34.9019
200,8x4x2,16,1024,118.7840,24.5760,69.6320,14.4120,14.2104,161.9233,10.7949
200,8x4x4,16,2048,237.5680,49.1520,139.2640,28.8246,14.2101,161.9270,10.7951
200,8x4x8,16,4096,475.1360,98.3040,278.5280,58.9677,13.8924,165.6307,11.0420
200,8x4x16,16,8192,950.2720,196.6080,557.0560,117.3380,13.9631,164.7917,10.9861
200,8x8x2,16,2048,237.5680,49.1520,139.2640,29.2781,13.9900,164.4749,10.9650
200,8x8x4,16,4096,475.1360,98.3040,278.5280,59.7120,13.7192,167.7213,11.1814
200,8x8x8,16,8192,950.2720,196.6080,557.0560,116.6388,14.0468,163.8097,10.9206
200,8x8x16,16,16384,1900.5440,393.2160,1114.1120,235.1777,-4.3294,-531.4882,-35.4325
200,8x16x2,16,4096,475.1360,98.3040,278.5280,57.6495,14.2100,161.9280,10.7952
200,8x16x4,16,8192,950.2720,196.6080,557.0560,116.3551,14.0810,163.4112,10.8941
200,8x16x8,16,16384,1900.5440,393.2160,1114.1120,230.6178,-4.4150,-521.1831,-34.7455
200,8x16x16,16,32768,3801.0880,786.4320,2228.2240,475.6845,-4.2809,-537.5100,-35.8340
200,16x4x2,16,2048,237.5680,49.1520,139.2640,28.8208,14.2120,161.9058,10.7937
200,16x4x4,16,4096,475.1360,98.3040,278.5280,57.6418,14.2119,161.9066,10.7938
200,16x4x8,16,8192,950.2720,196.6080,557.0560,117.6560,13.9253,165.2383,11.0159
200,16x4x16,16,16384,1900.5440,393.2160,1114.1120,239.2523,-4.2556,-540.6966,-36.0464
200,16x8x2,16,4096,475.1360,98.3040,278.5280,58.7259,13.9495,164.9516,10.9968
200,16x8x4,16,8192,950.2720,196.6080,557.0560,118.8258,13.7883,166.8811,11.1254
200,16x8x8,16,16384,1900.5440,393.2160,1114.1120,231.1737,-4.4043,-522.4394,-34.8293
200,16x8x16,16,32768,3801.0880,786.4320,2228.2240,472.1670,-4.3127,-533.5352,-35.5690
200,16x16x2,16,8192,950.2720,196.6080,557.0560,116.7953,14.0280,164.0296,10.9353
200,16x16x4,16,16384,1900.5440,393.2160,1114.1120,230.5746,-4.4158,-521.0854,-34.7390
200,16x16x8,16,32768,3801.0880,786.4320,2228.2240,464.7158,-4.3819,-525.1156,-35.0077
200,16x16x16,16,65536,7602.1760,1572.8640,4456.4480,925.1097,0.2403,9575.7780,638.3852
200,4x4x2,32,1024,184.3200,24.5760,135.1680,26.0030,7.8760,292.1528,9.4243
200,4x4x4,32,2048,368.6400,49.1520,270.3360,51.8944,7.8930,291.5258,9.4041
200,4x4x8,32,4096,737.2800,98.3040,540.6720,107.3953,7.6279,301.6561,9.7308
200,4x4x16,32,8192,1474.5600,196.6080,1081.3440,213.9949,7.6563,300.5385,9.6948
200,4x8x2,32,2048,368.6400,49.1520,270.3360,51.8926,7.8932,291.5157,9.4037
200,4x8x4,32,4096,737.2800,98.3040,540.6720,104.0172,7.8756,292.1675,9.4248
200,4x8x8,32,8192,1474.5600,196.6080,1081.3440,209.5100,7.8202,294.2398,9.4916
200,4x8x16,32,16384,2949.1200,393.2160,2162.6880,429.5337,-2.3704,-970.7216,-31.3136
200,4x16x2,32,4096,737.2800,98.3040,540.6720,103.7967,7.8923,291.5482,9.4048
200,4x16x4,32,8192,1474.5600,196.6080,1081.3440,211.2654,7.7552,296.7051,9.5711
200,4x16x8,32,16384,2949.1200,393.2160,2162.6880,422.9005,-2.4076,-955.7310,-30.8300
200,4x16x16,32,32768,5898.2400,786.4320,4325.3760,833.3645,-2.4435,-941.6781,-30.3767
200,8x4x2,32,2048,368.6400,49.1520,270.3360,52.0055,7.8761,292.1499,9.4242
200,8x4x4,32,4096,737.2800,98.3040,540.6720,106.8542,7.6665,300.1363,9.6818
200,8x4x8,32,8192,1474.5600,196.6080,1081.3440,209.6327,7.8156,294.4121,9.4972