From 4e99f7a62328b93f5525bd8316a42249b7d5089f Mon Sep 17 00:00:00 2001 From: JanLJL Date: Tue, 14 Feb 2023 13:52:59 +0100 Subject: [PATCH] fixed wrong markers and added OSACA output for ICX --- .../gromacs-icx-avx512-dp-osaca-icx.out | 167 ++++++++++++++++++ .../gromacs-icx-avx512-sp-osaca-icx.out | 116 ++++++++++++ static_analysis/jan/gromacs-icx-avx512-sp.o | Bin 13184 -> 13152 bytes static_analysis/jan/gromacs-icx-avx512-sp.s | 22 +-- 4 files changed, 294 insertions(+), 11 deletions(-) create mode 100644 static_analysis/jan/analyses/gromacs-icx-avx512-dp-osaca-icx.out create mode 100644 static_analysis/jan/analyses/gromacs-icx-avx512-sp-osaca-icx.out diff --git a/static_analysis/jan/analyses/gromacs-icx-avx512-dp-osaca-icx.out b/static_analysis/jan/analyses/gromacs-icx-avx512-dp-osaca-icx.out new file mode 100644 index 0000000..1593c72 --- /dev/null +++ b/static_analysis/jan/analyses/gromacs-icx-avx512-dp-osaca-icx.out @@ -0,0 +1,167 @@ +Open Source Architecture Code Analyzer (OSACA) - 0.4.12 +Analyzed file: gromacs-icx-avx512-dp.s +Architecture: ICX +Timestamp: 2023-02-14 12:51:57 + + + P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction + * - Instruction micro-ops not bound to a port + X - No throughput/latency information for this instruction in data file + + +Combined Analysis Report +------------------------ + Port pressure in cycles + | 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD | +------------------------------------------------------------------------------------------------------------------------ +2241 | | | | | | | | | | || | | # pointer_increment=64 da67166e5736661e6b03ea29ee7bfd67 +2242 | | | | | | | | | | || | | # LLVM-MCA-BEGIN +2243 | | | | | | | | | | || | | .LBB5_12: # Parent Loop BB5_7 Depth=1 +2244 | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2 +2245 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | movslq (%r10,%rbx,4), %rcx +2246 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || 6.0 | | leaq (%rcx,%rcx,2), %rdx +2247 | 0.00 | | | | | | 1.00 | | | || 1.0 | | shlq $6, %rdx +2248 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd (%rsi,%rdx), %zmm28 # AlignMOV convert to UnAlignMOV +2249 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 64(%rsi,%rdx), %zmm29 # AlignMOV convert to UnAlignMOV +2250 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovupd 128(%rsi,%rdx), %zmm30 # AlignMOV convert to UnAlignMOV +2251 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 16(%rsp), %zmm3 # 64-byte Reload +2252 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm28, %zmm3, %zmm3 +2253 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vsubpd %zmm30, %zmm24, %zmm31 +2254 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 336(%rsp), %zmm16 # 64-byte Reload +2255 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm16, %zmm16 +2256 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm31, %zmm31, %zmm17 +2257 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm16, %zmm16, %zmm17 # zmm17 = (zmm16 * zmm16) + zmm17 +2258 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm3, %zmm3, %zmm17 # zmm17 = (zmm3 * zmm3) + zmm17 +2259 | 2.50 | | | | | 0.500 | | | | || 6.0 | | vrcp14pd %zmm17, %zmm18 +2260 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm18, %zmm21, %zmm19 +2261 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm19, %zmm18, %zmm19 +2262 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm19, %zmm18, %zmm19 +2263 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vaddpd %zmm1, %zmm19, %zmm20 +2264 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm18, %zmm22, %zmm18 +2265 | 0.75 | | | | | 0.250 | | | | || 4.0 | | vmulpd %zmm20, %zmm18, %zmm18 +2266 | 1.00 | | | | | 0.000 | | | | || | | vsubpd %zmm30, %zmm25, %zmm20 +2267 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | leal (%rcx,%rcx), %edx +2268 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | cmpq %rdx, %r11 +2269 | 0.00 | | | | | | 1.00 | | | || | | setne %dl +2270 | 0.00 | | | | | | 1.00 | | | || | | sete %al +2271 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | addl %ecx, %ecx +2272 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | incl %ecx +2273 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | cmpq %rcx, %r11 +2274 | 0.00 | | | | | | 1.00 | | | || | | sete %cl +2275 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm18, %zmm19, %zmm18 +2276 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 528(%rsp), %zmm19 # 64-byte Reload +2277 | 1.00 | | | | | 0.000 | | | | || | | vsubpd %zmm28, %zmm19, %zmm19 +2278 | 0.00 | | | | | | 1.00 | | | || | | setne %dil +2279 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | movl %edi, %ebp +2280 | 0.00 | | | | | | 1.00 | | | || | 1.0 | shlb $4, %bpl +2281 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | 1.0 | subb %al, %bpl +2282 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | 1.0 | addb $-17, %bpl +2283 | 1.00 | | | | | | | | | || | | kmovd %ebp, %k1 +2284 | 0.50 | | | | | 0.500 | | | | || | | vcmpltpd %zmm0, %zmm17, %k1 {%k1} +2285 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 272(%rsp), %zmm17 # 64-byte Reload +2286 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm17, %zmm17 +2287 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | leal (%rdx,%rdx), %eax +2288 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | movl %edi, %ebp +2289 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm2, %zmm18, %zmm18 +2290 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14 +2291 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm20, %zmm20, %zmm3 +2292 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm17, %zmm17, %zmm3 # zmm3 = (zmm17 * zmm17) + zmm3 +2293 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm19, %zmm19, %zmm3 # zmm3 = (zmm19 * zmm19) + zmm3 +2294 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11 +2295 | 2.50 | | | | | 0.500 | | | | || | | vrcp14pd %zmm3, %zmm16 +2296 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7 +2297 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm21, %zmm18 +2298 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm18, %zmm16, %zmm18 +2299 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm18, %zmm16, %zmm18 +2300 | 0.50 | | | | | 0.500 | | | | || | | vaddpd %zmm1, %zmm18, %zmm31 +2301 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm22, %zmm16 +2302 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm31, %zmm16, %zmm16 +2303 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 464(%rsp), %zmm31 # 64-byte Reload +2304 | 0.75 | | | | | 0.250 | | | | || | | vsubpd %zmm28, %zmm31, %zmm31 +2305 | 0.00 | | | | | | 1.00 | | | || | 1.0 | shlb $5, %bpl +2306 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | 1.0 | orb %al, %bpl +2307 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | 1.0 | orb $-35, %bpl +2308 | 1.00 | | | | | | | | | || | | kmovd %ebp, %k1 +2309 | 0.50 | | | | | 0.500 | | | | || | | vcmpltpd %zmm0, %zmm3, %k1 {%k1} +2310 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 208(%rsp), %zmm3 # 64-byte Reload +2311 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm3, %zmm3 +2312 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm18, %zmm16 +2313 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm30, %zmm26, %zmm18 +2314 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm2, %zmm16, %zmm16 +2315 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15 +2316 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm18, %zmm18, %zmm19 +2317 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm3, %zmm3, %zmm19 # zmm19 = (zmm3 * zmm3) + zmm19 +2318 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm31, %zmm31, %zmm19 # zmm19 = (zmm31 * zmm31) + zmm19 +2319 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10 +2320 | 2.50 | | | | | 0.500 | | | | || | | vrcp14pd %zmm19, %zmm17 +2321 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6 +2322 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm17, %zmm21, %zmm16 +2323 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm17, %zmm16 +2324 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm17, %zmm16 +2325 | 0.50 | | | | | 0.500 | | | | || | | vaddpd %zmm1, %zmm16, %zmm20 +2326 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm17, %zmm22, %zmm17 +2327 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm20, %zmm17, %zmm17 +2328 | 0.75 | | | | | 0.250 | | | | || | | vmulpd %zmm17, %zmm16, %zmm16 +2329 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | leal (,%rdx,4), %eax +2330 | 0.00 | | | | | | 1.00 | | | || | | shlb $6, %dil +2331 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | orb %al, %dil +2332 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | orb $-69, %dil +2333 | 1.00 | | | | | | | | | || | | kmovd %edi, %k1 +2334 | 0.50 | | | | | 0.500 | | | | || | | vcmpltpd %zmm0, %zmm19, %k1 {%k1} +2335 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 400(%rsp), %zmm17 # 64-byte Reload +2336 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm28, %zmm17, %zmm17 +2337 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm23, %zmm19 +2338 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm30, %zmm27, %zmm20 +2339 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm2, %zmm16, %zmm16 +2340 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13 +2341 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm20, %zmm20, %zmm28 +2342 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm19, %zmm19, %zmm28 # zmm28 = (zmm19 * zmm19) + zmm28 +2343 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm17, %zmm17, %zmm28 # zmm28 = (zmm17 * zmm17) + zmm28 +2344 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9 +2345 | 2.00 | | | | | 1.000 | | | | || | | vrcp14pd %zmm28, %zmm3 +2346 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5 +2347 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm3, %zmm21, %zmm16 +2348 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm16, %zmm3, %zmm16 +2349 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm16, %zmm3, %zmm16 +2350 | 0.00 | | | | | 1.000 | | | | || | | vaddpd %zmm1, %zmm16, %zmm18 +2351 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm3, %zmm22, %zmm3 +2352 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm18, %zmm3, %zmm3 +2353 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm3, %zmm16, %zmm3 +2354 | 0.00 | | | | | | 1.00 | | | || | | shlb $3, %dl +2355 | 0.00 | | | | | | 1.00 | | | || | | shlb $7, %cl +2356 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | orb %dl, %cl +2357 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | addb $-9, %cl +2358 | 1.00 | | | | | | | | | || | | kmovd %ecx, %k1 +2359 | 0.00 | | | | | 1.000 | | | | || | | vcmpltpd %zmm0, %zmm28, %k1 {%k1} +2360 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm2, %zmm3, %zmm3 +2361 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12 +2362 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8 +2363 | 0.24 | | | | | 0.760 | | | | || | | vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4 +2364 | 0.00 | 1.00 | | | | -0.01 | 0.00 | | | || | | incq %rbx +2365 | 0.00 | 1.00 | | | | -0.01 | 0.00 | | | || | | cmpq %rbx, %r9 +2366 | | | | | | | | | | || | | * jne .LBB5_12 +2367 | | | | | | | | | | || | | # LLVM-MCA-END + + 44.0 15.0 5.50 5.50 5.50 5.50 43.99 15.0 71 6.0 + + + + +Loop-Carried Dependencies Analysis Report +----------------------------------------- +2280 | 6.0 | shlb $4, %bpl | [2280, 2281, 2282, 2305, 2306, 2307] +2363 | 4.0 | vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4| [2363] +2362 | 4.0 | vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8| [2362] +2361 | 4.0 | vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12| [2361] +2346 | 4.0 | vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5| [2346] +2344 | 4.0 | vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9| [2344] +2340 | 4.0 | vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13| [2340] +2321 | 4.0 | vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6| [2321] +2319 | 4.0 | vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10| [2319] +2315 | 4.0 | vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15| [2315] +2296 | 4.0 | vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7| [2296] +2294 | 4.0 | vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11| [2294] +2290 | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14| [2290] +2330 | 3.0 | shlb $6, %dil | [2330, 2331, 2332] +2364 | 1.0 | incq %rbx | [2364] + diff --git a/static_analysis/jan/analyses/gromacs-icx-avx512-sp-osaca-icx.out b/static_analysis/jan/analyses/gromacs-icx-avx512-sp-osaca-icx.out new file mode 100644 index 0000000..1b138a4 --- /dev/null +++ b/static_analysis/jan/analyses/gromacs-icx-avx512-sp-osaca-icx.out @@ -0,0 +1,116 @@ +Open Source Architecture Code Analyzer (OSACA) - 0.4.12 +Analyzed file: gromacs-icx-avx512-sp.s +Architecture: ICX +Timestamp: 2023-02-14 12:51:43 + + + P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction + * - Instruction micro-ops not bound to a port + X - No throughput/latency information for this instruction in data file + + +Combined Analysis Report +------------------------ + Port pressure in cycles + | 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD | +------------------------------------------------------------------------------------------------------------------------ +1338 | | | | | | | | | | || | | # pointer_increment=64 0f91ac4f7fe1a70d0c899f7f3e745649 +1339 | | | | | | | | | | || | | # LLVM-MCA-BEGIN +1340 | | | | | | | | | | || | | .LBB2_12: # Parent Loop BB2_7 Depth=1 +1341 | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2 +1342 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | movslq (%r11,%rax,4), %rcx +1343 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || 6.0 | | leaq (%rcx,%rcx,2), %rdx +1344 | 0.00 | | | | | | 1.00 | | | || 1.0 | | shlq $5, %rdx +1345 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd (%rsi,%rdx), %zmm16 +1346 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vbroadcastf64x4 64(%rsi,%rdx), %zmm20 # zmm20 = mem[0,1,2,3,0,1,2,3] +1347 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vbroadcastf64x4 (%rsi,%rdx), %zmm19 # zmm19 = mem[0,1,2,3,0,1,2,3] +1348 | | | | | | 1.000 | | | | || | | vshuff64x2 $238, %zmm16, %zmm16, %zmm21 # zmm21 = zmm16[4,5,6,7,4,5,6,7] +1349 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm19, %zmm6, %zmm18 +1350 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm21, %zmm10, %zmm17 +1351 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vsubps %zmm20, %zmm14, %zmm16 +1352 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm16, %zmm16, %zmm22 +1353 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231ps %zmm17, %zmm17, %zmm22 # zmm22 = (zmm17 * zmm17) + zmm22 +1354 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231ps %zmm18, %zmm18, %zmm22 # zmm22 = (zmm18 * zmm18) + zmm22 +1355 | 2.50 | | | | | 0.500 | | | | || 6.0 | | vrcp14ps %zmm22, %zmm23 +1356 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm23, %zmm26, %zmm24 +1357 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm24, %zmm23, %zmm24 +1358 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm24, %zmm23, %zmm24 +1359 | 0.75 | | | | | 0.250 | | | | || 4.0 | | vaddps %zmm1, %zmm24, %zmm25 +1360 | 1.00 | | | | | 0.000 | | | | || | | vmulps %zmm23, %zmm27, %zmm23 +1361 | 1.00 | | | | | 0.000 | | | | || 4.0 | | vmulps %zmm25, %zmm23, %zmm23 +1362 | 1.00 | | | | | 0.000 | | | | || 4.0 | | vmulps %zmm23, %zmm24, %zmm23 +1363 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | leal (%rcx,%rcx), %edx +1364 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | xorl %edi, %edi +1365 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | xorl %ebp, %ebp +1366 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | cmpq %rdx, %r12 +1367 | 0.00 | | | | | | 1.00 | | | || | | setne %dil +1368 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | leal 1(%rcx,%rcx), %ecx +1369 | 0.00 | | | | | | 1.00 | | | || | | sete %bpl +1370 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | xorl %edx, %edx +1371 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | xorl %ebx, %ebx +1372 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | cmpq %rcx, %r12 +1373 | 0.00 | | | | | | 1.00 | | | || | | sete %dl +1374 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | movl $0, %ecx +1375 | 0.00 | | | | | | 1.00 | | | || | | setne %bl +1376 | 0.00 | | | | | | 1.00 | | | || | | cmovel %r8d, %ecx +1377 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | movl %ebx, %r14d +1378 | 0.00 | | | | | | 1.00 | | | || | | shll $4, %r14d +1379 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | subl %ebp, %r14d +1380 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | leal (%rcx,%rdi,2), %ecx +1381 | 0.00 | | | | | | 1.00 | | | || | | shll $8, %ecx +1382 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | addl $239, %r14d +1383 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | addl $-768, %ecx # imm = 0xFD00 +1384 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | orl %r14d, %ecx +1385 | 1.00 | | | | | | | | | || | | kmovd %ecx, %k2 +1386 | 0.50 | | | | | 0.500 | | | | || | | vcmpltps %zmm0, %zmm22, %k2 {%k2} +1387 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm21, %zmm11, %zmm21 +1388 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm20, %zmm15, %zmm20 +1389 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm19, %zmm7, %zmm19 +1390 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm2, %zmm23, %zmm22 +1391 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231ps %zmm22, %zmm18, %zmm12 {%k2} # zmm12 {%k2} = (zmm18 * zmm22) + zmm12 +1392 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm20, %zmm20, %zmm18 +1393 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm21, %zmm21, %zmm18 # zmm18 = (zmm21 * zmm21) + zmm18 +1394 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm19, %zmm19, %zmm18 # zmm18 = (zmm19 * zmm19) + zmm18 +1395 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm22, %zmm17, %zmm9 {%k2} # zmm9 {%k2} = (zmm17 * zmm22) + zmm9 +1396 | 2.50 | | | | | 0.500 | | | | || | | vrcp14ps %zmm18, %zmm17 +1397 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm22, %zmm16, %zmm5 {%k2} # zmm5 {%k2} = (zmm16 * zmm22) + zmm5 +1398 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm17, %zmm26, %zmm16 +1399 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm16, %zmm17, %zmm16 +1400 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm16, %zmm17, %zmm16 +1401 | 0.00 | | | | | 1.000 | | | | || | | vaddps %zmm1, %zmm16, %zmm22 +1402 | 0.00 | | | | | 1.000 | | | | || | | vmulps %zmm17, %zmm27, %zmm17 +1403 | 0.00 | | | | | 1.000 | | | | || | | vmulps %zmm22, %zmm17, %zmm17 +1404 | 0.00 | | | | | 1.000 | | | | || | | vmulps %zmm17, %zmm16, %zmm16 +1405 | 0.00 | | | | | | 1.00 | | | || | | shll $6, %ebx +1406 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | leal (%rbx,%rdi,4), %ecx +1407 | 0.00 | | | | | | 1.00 | | | || | | shll $7, %edx +1408 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | leal (%rdx,%rdi,8), %edx +1409 | 0.00 | | | | | | 1.00 | | | || | | shll $8, %edx +1410 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | addl %edx, %ecx +1411 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | addl $-2117, %ecx # imm = 0xF7BB +1412 | 1.00 | | | | | | | | | || | | kmovd %ecx, %k2 +1413 | 0.00 | | | | | 1.000 | | | | || | | vcmpltps %zmm0, %zmm18, %k2 {%k2} +1414 | 0.00 | | | | | 1.000 | | | | || | | vmulps %zmm2, %zmm16, %zmm16 +1415 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231ps %zmm16, %zmm19, %zmm13 {%k2} # zmm13 {%k2} = (zmm19 * zmm16) + zmm13 +1416 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231ps %zmm16, %zmm21, %zmm8 {%k2} # zmm8 {%k2} = (zmm21 * zmm16) + zmm8 +1417 | 0.24 | | | | | 0.760 | | | | || | 4.0 | vfmadd231ps %zmm16, %zmm20, %zmm4 {%k2} # zmm4 {%k2} = (zmm20 * zmm16) + zmm4 +1418 | 0.00 | 1.00 | | | | -0.01 | 0.00 | | | || | | incq %rax +1419 | 0.00 | 1.00 | | | | -0.01 | 0.00 | | | || | | cmpq %rax, %r10 +1420 | | | | | | | | | | || | | * jne .LBB2_12 +1421 | | | | | | | | | | || | | # LLVM-MCA-END + + 22.5 16.5 2.00 2.00 2.00 2.00 22.49 16.5 71 4.0 + + + + +Loop-Carried Dependencies Analysis Report +----------------------------------------- +1417 | 4.0 | vfmadd231ps %zmm16, %zmm20, %zmm4 {%k2} # zmm4 {%k2} = (zmm20 * zmm16) + zmm4| [1417] +1416 | 4.0 | vfmadd231ps %zmm16, %zmm21, %zmm8 {%k2} # zmm8 {%k2} = (zmm21 * zmm16) + zmm8| [1416] +1415 | 4.0 | vfmadd231ps %zmm16, %zmm19, %zmm13 {%k2} # zmm13 {%k2} = (zmm19 * zmm16) + zmm13| [1415] +1397 | 4.0 | vfmadd231ps %zmm22, %zmm16, %zmm5 {%k2} # zmm5 {%k2} = (zmm16 * zmm22) + zmm5| [1397] +1395 | 4.0 | vfmadd231ps %zmm22, %zmm17, %zmm9 {%k2} # zmm9 {%k2} = (zmm17 * zmm22) + zmm9| [1395] +1391 | 4.0 | vfmadd231ps %zmm22, %zmm18, %zmm12 {%k2} # zmm12 {%k2} = (zmm18 * zmm22) + zmm12| [1391] +1418 | 1.0 | incq %rax | [1418] + diff --git a/static_analysis/jan/gromacs-icx-avx512-sp.o b/static_analysis/jan/gromacs-icx-avx512-sp.o index 119d7f91edfd9dd989140b09573efc0c0801e182..41366741eb6b2a75026334f08563f8e1c990ec8b 100644 GIT binary patch delta 573 zcmZoje~>mogE3*FW*Il5>Es363LLxh85kH+(kE;_z|F(Rcw+Kd9zDj$$$xn6^QNWg z@yoXWWf>UKCTxDd>%ycp;rIXlyYE3Xcyzm{aQJk0EC6ynx;r)iDbN3hUrzn||3AOH z%j7r#CB~%9jRM^qOdpsh^GQD1{6q2!lT3+}XZL=vmj8!d{{Hj-Kawq*6=c6NGKv1! zEGfU6iRlOH=4*;kjEomH8!DHBxRY0@=1DTh0__2UrHl}I3Xm3JU}V^^*-&jg8{?J9 z4|T*nZ^}R0w2A`0`)sVz_Cp+qjONv0P1nH7Sl69M0 z2$T(!0}25F%q}qqmmzHOLZECC4%zg{7j?xM^FSsrR!nBp6KAZSEU7nN@&eQ)IZ*F0 z0Zjv8hRGL!YBo%MsrQ}n=;Vj`=D?VeG_Yr4m7kobBsRIgKm*3yV4%UsG5Mx}J*$B{ zNZ&^l;mHPu8jKE;BMt2t6DH3DlEIU28rm~WQ2^^TFw$V0F*(x6o>5@(Od#no`6iH5 Vm@H{*&v`))WH$rDgvp8W@&GvOpfCUc delta 586 zcmZ`$PbdUo6#w4W(yqnMQpV)ZtUs$!X6$ZCPD43x_|^OKd%ySI_kDej>CCjDKql9@6b2O!!ZGtITnTW4 zca<7UVxa1wEFupwW?EH)Jh1!eIM^?U^4_=Oq|;hNUSlqNX++L2hT}pjjG0+sGsuMi zR!h&p!aVg&O&g{qz4M;PnnrYZ^ZbCqkaPJ~{BoSaz1QN~=1Yy^rXuil5IIk}^ zPg}+SEHm1+>{8r_&q=W++CVr2FK$z;VZd(8Xva}GyyHceIBYk1Vg-c|*^Wn9urwSX z9w(z4{i4QK@9@=CY*9t2$EP;<)HIHgy3kA|!rxhS9%W0IC3W>5bscl62wUz3*u^LE z`)I5Cl2y9-Z0UKNwz(F=oF>wrU28>n!He!*ctyKg=W!VaTI+Bmq5P#rLIDNy3B7DY zra`=IuSFrLzy$Usb;#ff`9;hpb$$~2ofT3FoMB%|hXk&WpTRu&eza41$a4RXVBml( F{{R54olpP( diff --git a/static_analysis/jan/gromacs-icx-avx512-sp.s b/static_analysis/jan/gromacs-icx-avx512-sp.s index b1506e1..8325527 100644 --- a/static_analysis/jan/gromacs-icx-avx512-sp.s +++ b/static_analysis/jan/gromacs-icx-avx512-sp.s @@ -1331,6 +1331,12 @@ computeForceLJ_2xnn_full: # vxorps %xmm8, %xmm8, %xmm8 vxorps %xmm4, %xmm4, %xmm4 .p2align 4, 0x90 +movl $111, %ebx # OSACA START MARKER +.byte 100 # OSACA START MARKER +.byte 103 # OSACA START MARKER +.byte 144 # OSACA START MARKER +# pointer_increment=64 0f91ac4f7fe1a70d0c899f7f3e745649 +# LLVM-MCA-BEGIN .LBB2_12: # Parent Loop BB2_7 Depth=1 # => This Inner Loop Header: Depth=2 movslq (%r11,%rax,4), %rcx @@ -1412,6 +1418,11 @@ computeForceLJ_2xnn_full: # incq %rax cmpq %rax, %r10 jne .LBB2_12 +# LLVM-MCA-END +movl $222, %ebx # OSACA END MARKER +.byte 100 # OSACA END MARKER +.byte 103 # OSACA END MARKER +.byte 144 # OSACA END MARKER # %bb.13: # in Loop: Header=BB2_7 Depth=1 movq %r15, %r14 movq 8(%rsp), %rbp # 8-byte Reload @@ -1655,12 +1666,6 @@ computeForceLJ_4xn_half: # vmovups 64(%rsp), %zmm6 # 64-byte Reload vmovups 512(%rsp), %zmm7 # 64-byte Reload .p2align 4, 0x90 -movl $111, %ebx # OSACA START MARKER -.byte 100 # OSACA START MARKER -.byte 103 # OSACA START MARKER -.byte 144 # OSACA START MARKER -# pointer_increment=64 0f91ac4f7fe1a70d0c899f7f3e745649 -# LLVM-MCA-BEGIN .LBB4_8: # =>This Inner Loop Header: Depth=1 movslq (%r11,%rdx,4), %rax movq %rax, %rsi @@ -1793,11 +1798,6 @@ movl $111, %ebx # OSACA START MARKER movq 160(%r15), %rdi incq %rdx jmp .LBB4_8 -# LLVM-MCA-END -movl $222, %ebx # OSACA END MARKER -.byte 100 # OSACA END MARKER -.byte 103 # OSACA END MARKER -.byte 144 # OSACA END MARKER .p2align 5, 0x90 .LBB4_18: vzeroupper