From c438fc6832edd6123b5190e1c4dcdde6d848b542 Mon Sep 17 00:00:00 2001 From: Rafael Ravedutti Date: Fri, 7 Apr 2023 21:54:07 +0200 Subject: [PATCH 1/3] Fix GROMACS AVX2 code Signed-off-by: Rafael Ravedutti --- common/includes/simd/avx2_double.h | 8 +++++--- gromacs/neighbor.c | 30 ++++++++++++++++++------------ 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/common/includes/simd/avx2_double.h b/common/includes/simd/avx2_double.h index c48fe6c..e57df9b 100644 --- a/common/includes/simd/avx2_double.h +++ b/common/includes/simd/avx2_double.h @@ -48,11 +48,13 @@ static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_S t2 = _mm256_permute2f128_pd(t0, t1, 0x21); t0 = _mm256_add_pd(t0, t2); t1 = _mm256_add_pd(t1, t2); - t0 = _mm256_blend_pd(t0, t1, 0b1100); + t0 = _mm256_blend_pd(t0, t1, 0xC); + //t0 = _mm256_blend_pd(t0, t1, 0b1100); t1 = _mm256_add_pd(t0, _mm256_load_pd(m)); _mm256_store_pd(m, t1); - t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101)); + t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0x5)); + //t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101)); a0 = _mm256_castpd256_pd128(t0); a1 = _mm256_extractf128_pd(t0, 0x1); a0 = _mm_add_sd(a0, a1); @@ -91,7 +93,7 @@ static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, } // Functions used in LAMMPS kernel -static inline MD_SIMD_FLOAT simd_gather(MD_SIMD_INT vidx, const MD_FLOAT *m, int s) { return _mm256_i32gather_pd(m, vidx, s); } +#define simd_gather(vidx, m, s) _mm256_i32gather_pd(m, vidx, s); static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm_set1_epi32(scalar); } static inline MD_SIMD_INT simd_int_zero() { return _mm_setzero_si128(); } static inline MD_SIMD_INT simd_int_seq() { return _mm_set_epi32(3, 2, 1, 0); } diff --git a/gromacs/neighbor.c b/gromacs/neighbor.c index dfba8f5..ba9fc82 100644 --- a/gromacs/neighbor.c +++ b/gromacs/neighbor.c @@ -58,6 +58,7 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) { neighbor->numneigh = NULL; neighbor->numneigh_masked = NULL; neighbor->neighbors = NULL; + neighbor->neighbors_imask = NULL; } void setupNeighbor(Parameter *param, Atom *atom) { @@ -229,7 +230,9 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { if(atom->Nclusters_local > nmax) { nmax = atom->Nclusters_local; if(neighbor->numneigh) free(neighbor->numneigh); + if(neighbor->numneigh_masked) free(neighbor->numneigh_masked); if(neighbor->neighbors) free(neighbor->neighbors); + if(neighbor->neighbors_imask) free(neighbor->neighbors_imask); neighbor->numneigh = (int*) malloc(nmax * sizeof(int)); neighbor->numneigh_masked = (int*) malloc(nmax * sizeof(int)); neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int)); @@ -326,15 +329,17 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { imask = get_imask_simd_4xn(1, ci, cj); #endif - if(imask == NBNXN_INTERACTION_MASK_ALL) { - neighptr[n] = cj; - neighptr_imask[n] = imask; - } else { - neighptr[n] = neighptr[nmasked]; - neighptr_imask[n] = neighptr_imask[nmasked]; - neighptr[nmasked] = cj; - neighptr_imask[nmasked] = imask; - nmasked++; + if(n < neighbor->maxneighs) { + if(imask == NBNXN_INTERACTION_MASK_ALL) { + neighptr[n] = cj; + neighptr_imask[n] = imask; + } else { + neighptr[n] = neighptr[nmasked]; + neighptr_imask[n] = neighptr_imask[nmasked]; + neighptr[nmasked] = cj; + neighptr_imask[nmasked] = imask; + nmasked++; + } } n++; @@ -377,11 +382,12 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { } if(resize) { - fprintf(stdout, "RESIZE %d\n", 
neighbor->maxneighs); neighbor->maxneighs = new_maxneighs * 1.2; + fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs); free(neighbor->neighbors); - neighbor->neighbors = (int *) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int)); - neighbor->neighbors_imask = (unsigned int *) malloc(atom->Nmax * neighbor->maxneighs * sizeof(unsigned int)); + free(neighbor->neighbors_imask); + neighbor->neighbors = (int *) malloc(nmax * neighbor->maxneighs * sizeof(int)); + neighbor->neighbors_imask = (unsigned int *) malloc(nmax * neighbor->maxneighs * sizeof(unsigned int)); } } From 1ad981a059155d73161270ebe76719398877c6f7 Mon Sep 17 00:00:00 2001 From: Rafael Ravedutti Date: Sun, 9 Apr 2023 00:07:04 +0200 Subject: [PATCH 2/3] Add static analysis for gromacs-avx2-dp on Zen3 Signed-off-by: Rafael Ravedutti --- .../gromacs-icx-avx2-dp-osaca-icx.out | 213 ++ static_analysis/rafael/gromacs-avx2-dp.s | 2706 +++++++++++++++++ 2 files changed, 2919 insertions(+) create mode 100644 static_analysis/rafael/analyses/gromacs-icx-avx2-dp-osaca-icx.out create mode 100644 static_analysis/rafael/gromacs-avx2-dp.s diff --git a/static_analysis/rafael/analyses/gromacs-icx-avx2-dp-osaca-icx.out b/static_analysis/rafael/analyses/gromacs-icx-avx2-dp-osaca-icx.out new file mode 100644 index 0000000..964d318 --- /dev/null +++ b/static_analysis/rafael/analyses/gromacs-icx-avx2-dp-osaca-icx.out @@ -0,0 +1,213 @@ +Open Source Architecture Code Analyzer (OSACA) - 0.4.12 +Analyzed file: gromacs-avx2-dp.s +Architecture: ZEN3 +Timestamp: 2023-04-08 22:04:23 + + + P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction + * - Instruction micro-ops not bound to a port + X - No throughput/latency information for this instruction in data file + + +Combined Analysis Report +------------------------ + Port pressure in cycles + | 0 | 1 | 2 | 3 | DV0 | DV1 | 4 | 5 | 6 | 7 | 8 - 8DV | 9 | 10 | 11 | 12 | 13 || CP | LCD | +-------------------------------------------------------------------------------------------------------------------------------------------- +2436 | | | | | | | | | | | | | | | | || | | # pointer_increment=64 da67166e5736661e6b03ea29ee7bfd67 +2437 | | | | | | | | | | | | | | | | || | | # LLVM-MCA-BEGIN +2438 | | | | | | | | | | | | | | | | || | | .LBB6_26: # +2439 | | | | | | | | | | | | | | | | || | | # Parent Loop BB6_8 Depth=1 +2440 | | | | | | | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2 +2441 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovups %ymm6, 96(%rsp) # 32-byte Spill +2442 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm14, 32(%rsp) # 32-byte Spill +2443 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm15, 192(%rsp) # 32-byte Spill +2444 | | | | | | | 1.00 | | | | | | | | | 1.00 || | 0.0 | vmovupd %ymm2, (%rsp) # 32-byte Spill +2445 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm4, 224(%rsp) # 32-byte Spill +2446 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm12, 256(%rsp) # 32-byte Spill +2447 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm8, 288(%rsp) # 32-byte Spill +2448 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm7, 320(%rsp) # 32-byte Spill +2449 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm10, 352(%rsp) # 32-byte Spill +2450 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm9, 384(%rsp) # 32-byte Spill +2451 | | | | | | | | | | | | | | 0.50 | 0.50 | 0.00 || 4.0 | | movslq (%rdx,%rcx,4), %rax +2452 | | | | | | | | | 0.25 | 0.25 | 0.25 | 
0.25 | | 0.50 | 0.50 | 0.00 || 5.0 | | leaq (%rax,%rax,2), %rax +2453 | | | | | | | | | | 0.50 | 0.50 | | | | | || 1.0 | | shlq $5, %rax +2454 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovapd (%rdi,%rax), %ymm5 +2455 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovapd 32(%rdi,%rax), %ymm12 +2456 | | | | | | | | | | | | | | 0.50 | 0.50 | || 4.0 | | vmovapd 64(%rdi,%rax), %ymm14 +2457 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 896(%rsp), %ymm0 # 32-byte Reload +2458 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vsubpd %ymm5, %ymm0, %ymm11 +2459 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm11, 640(%rsp) # 32-byte Spill +2460 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 832(%rsp), %ymm0 # 32-byte Reload +2461 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vsubpd %ymm12, %ymm0, %ymm10 +2462 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm10, 672(%rsp) # 32-byte Spill +2463 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 1024(%rsp), %ymm0 # 32-byte Reload +2464 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vsubpd %ymm14, %ymm0, %ymm1 +2465 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm1, 160(%rsp) # 32-byte Spill +2466 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 1088(%rsp), %ymm0 # 32-byte Reload +2467 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vsubpd %ymm5, %ymm0, %ymm8 +2468 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm8, 448(%rsp) # 32-byte Spill +2469 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 1120(%rsp), %ymm0 # 32-byte Reload +2470 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vsubpd %ymm12, %ymm0, %ymm4 +2471 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm4, 704(%rsp) # 32-byte Spill +2472 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 992(%rsp), %ymm0 # 32-byte Reload +2473 | | | 0.50 | 0.50 | | | | | | | | | | | | || 3.0 | | vsubpd %ymm14, %ymm0, %ymm2 +2474 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm2, 416(%rsp) # 32-byte Spill +2475 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 1184(%rsp), %ymm0 # 32-byte Reload +2476 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vsubpd %ymm12, %ymm0, %ymm7 +2477 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm7, 576(%rsp) # 32-byte Spill +2478 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 960(%rsp), %ymm0 # 32-byte Reload +2479 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vsubpd %ymm14, %ymm0, %ymm6 +2480 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm6, 608(%rsp) # 32-byte Spill +2481 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm1, %ymm1, %ymm9 +2482 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vfmadd231pd %ymm10, %ymm10, %ymm9 # ymm9 = (ymm10 * ymm10) + ymm9 +2483 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vfmadd231pd %ymm11, %ymm11, %ymm9 # ymm9 = (ymm11 * ymm11) + ymm9 +2484 | 0.50 | 0.50 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm2, %ymm2, %ymm1 +2485 | 0.50 | 0.50 | | | | | | | | | | | | | | || 4.0 | | vfmadd231pd %ymm4, %ymm4, %ymm1 # ymm1 = (ymm4 * ymm4) + ymm1 +2486 | | 1.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vcvtpd2ps %ymm9, %xmm0 +2487 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vrcpps %xmm0, %xmm0 +2488 | | 1.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vcvtps2pd %xmm0, %ymm0 +2489 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 800(%rsp), %ymm11 # 32-byte Reload +2490 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm0, 
%ymm11, %ymm2 +2491 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vmovapd %ymm3, %ymm4 +2492 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm0, %ymm0, %ymm3 +2493 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm2, %ymm3, %ymm3 +2494 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm6, %ymm6, %ymm2 +2495 | 0.50 | 0.50 | | | | | | | | | | | | | | || 4.0 | | vfmadd231pd %ymm8, %ymm8, %ymm1 # ymm1 = (ymm8 * ymm8) + ymm1 +2496 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vfmadd231pd %ymm7, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm7) + ymm2 +2497 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 864(%rsp), %ymm6 # 32-byte Reload +2498 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vsubpd %ymm5, %ymm6, %ymm6 +2499 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm6, 480(%rsp) # 32-byte Spill +2500 | | 1.00 | 0.50 | 0.50 | | | | | | | | | | | | || 6.0 | | vcvtpd2ps %ymm1, %xmm7 +2501 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vfmadd231pd %ymm6, %ymm6, %ymm2 # ymm2 = (ymm6 * ymm6) + ymm2 +2502 | 0.50 | 0.50 | | | | | | | | | | | | | | || 3.0 | | vrcpps %xmm7, %xmm7 +2503 | | 1.00 | 0.50 | 0.50 | | | | | | | | | | | | || 4.0 | | vcvtps2pd %xmm7, %ymm7 +2504 | | 1.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vcvtpd2ps %ymm2, %xmm6 +2505 | 0.50 | 0.50 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm11, %ymm10 +2506 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm7, %ymm7, %ymm15 +2507 | 0.50 | 0.50 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm10, %ymm15, %ymm10 +2508 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vrcpps %xmm6, %xmm6 +2509 | | 1.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vcvtps2pd %xmm6, %ymm6 +2510 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm6, %ymm11, %ymm15 +2511 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm6, %ymm6, %ymm13 +2512 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm15, %ymm13, %ymm13 +2513 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 1152(%rsp), %ymm8 # 32-byte Reload +2514 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vsubpd %ymm12, %ymm8, %ymm12 +2515 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm12, 512(%rsp) # 32-byte Spill +2516 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 928(%rsp), %ymm8 # 32-byte Reload +2517 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vsubpd %ymm14, %ymm8, %ymm8 +2518 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm8, 544(%rsp) # 32-byte Spill +2519 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm8, %ymm8, %ymm14 +2520 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vfmadd231pd %ymm12, %ymm12, %ymm14 # ymm14 = (ymm12 * ymm12) + ymm14 +2521 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 1056(%rsp), %ymm8 # 32-byte Reload +2522 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vsubpd %ymm5, %ymm8, %ymm5 +2523 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm5, 1216(%rsp) # 32-byte Spill +2524 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vfmadd231pd %ymm5, %ymm5, %ymm14 # ymm14 = (ymm5 * ymm5) + ymm14 +2525 | | 1.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vcvtpd2ps %ymm14, %xmm8 +2526 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vrcpps %xmm8, %xmm5 +2527 | | 1.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vcvtps2pd %xmm5, %ymm5 +2528 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm5, %ymm11, %ymm8 +2529 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm5, %ymm5, %ymm12 +2530 
| 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm8, %ymm12, %ymm8 +2531 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 736(%rsp), %ymm11 # 32-byte Reload +2532 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm0, %ymm11, %ymm0 +2533 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm3, %ymm0, %ymm0 +2534 | 1.00 | | | | | | | | | | | | | 0.50 | 0.50 | || | | vbroadcastsd .LCPI6_2(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +2535 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vaddpd %ymm3, %ymm12, %ymm3 +2536 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm3, %ymm0, %ymm3 +2537 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm7, %ymm11, %ymm0 +2538 | 0.50 | 0.50 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm0, %ymm10, %ymm0 +2539 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vaddpd %ymm12, %ymm10, %ymm7 +2540 | 0.50 | 0.50 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm0, %ymm7 +2541 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm6, %ymm11, %ymm0 +2542 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm0, %ymm13, %ymm0 +2543 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vaddpd %ymm12, %ymm13, %ymm6 +2544 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 192(%rsp), %ymm15 # 32-byte Reload +2545 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm6, %ymm0, %ymm6 +2546 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm5, %ymm11, %ymm0 +2547 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vaddpd %ymm12, %ymm8, %ymm5 +2548 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 256(%rsp), %ymm12 # 32-byte Reload +2549 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 352(%rsp), %ymm10 # 32-byte Reload +2550 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm0, %ymm8, %ymm0 +2551 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm5, %ymm0, %ymm0 +2552 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 768(%rsp), %ymm13 # 32-byte Reload +2553 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vcmpltpd %ymm13, %ymm9, %ymm5 +2554 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 384(%rsp), %ymm9 # 32-byte Reload +2555 | 0.50 | 0.50 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 640(%rsp), %ymm3, %ymm8 # 32-byte Folded Reload +2556 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vandpd %ymm5, %ymm8, %ymm8 +2557 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vaddpd %ymm8, %ymm9, %ymm9 +2558 | 0.50 | 0.50 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 672(%rsp), %ymm3, %ymm8 # 32-byte Folded Reload +2559 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vandpd %ymm5, %ymm8, %ymm8 +2560 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vaddpd %ymm8, %ymm12, %ymm12 +2561 | 0.50 | 0.50 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 160(%rsp), %ymm3, %ymm3 # 32-byte Folded Reload +2562 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vandpd %ymm5, %ymm3, %ymm3 +2563 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vaddpd %ymm3, %ymm15, %ymm15 +2564 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vcmpltpd %ymm13, %ymm1, %ymm1 +2565 | 0.50 | 0.50 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 448(%rsp), %ymm7, %ymm3 # 32-byte Folded Reload +2566 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vandpd %ymm1, %ymm3, %ymm3 +2567 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vaddpd %ymm3, %ymm10, %ymm10 +2568 | 0.50 | 0.50 | | | | | | | | | | | | 0.50 | 0.50 | || | | 
vmulpd 704(%rsp), %ymm7, %ymm3 # 32-byte Folded Reload +2569 | 1.00 | 0.00 | | | | | | | | | | | | 0.50 | 0.50 | || 3.0 | | vmulpd 416(%rsp), %ymm7, %ymm5 # 32-byte Folded Reload +2570 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vandpd %ymm1, %ymm3, %ymm3 +2571 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vaddpd %ymm3, %ymm4, %ymm3 +2572 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 224(%rsp), %ymm4 # 32-byte Reload +2573 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || 1.0 | | vandpd %ymm1, %ymm5, %ymm1 +2574 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 64(%rsp), %ymm5 # 32-byte Reload +2575 | | | 0.50 | 0.50 | | | | | | | | | | | | || 3.0 | | vaddpd %ymm1, %ymm5, %ymm5 +2576 | | | | | | | 1.00 | | | | | | | | | 1.00 || 0.0 | | vmovupd %ymm5, 64(%rsp) # 32-byte Spill +2577 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vcmpltpd %ymm13, %ymm2, %ymm1 +2578 | 1.00 | 0.00 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 480(%rsp), %ymm6, %ymm2 # 32-byte Folded Reload +2579 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 288(%rsp), %ymm8 # 32-byte Reload +2580 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 320(%rsp), %ymm7 # 32-byte Reload +2581 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vandpd %ymm1, %ymm2, %ymm2 +2582 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vaddpd %ymm2, %ymm7, %ymm7 +2583 | 1.00 | 0.00 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 576(%rsp), %ymm6, %ymm2 # 32-byte Folded Reload +2584 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vandpd %ymm1, %ymm2, %ymm2 +2585 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vaddpd %ymm2, %ymm4, %ymm4 +2586 | 1.00 | 0.00 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 608(%rsp), %ymm6, %ymm2 # 32-byte Folded Reload +2587 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 96(%rsp), %ymm6 # 32-byte Reload +2588 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vandpd %ymm1, %ymm2, %ymm1 +2589 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vaddpd %ymm1, %ymm6, %ymm6 +2590 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vcmpltpd %ymm13, %ymm14, %ymm1 +2591 | 1.00 | 0.00 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 1216(%rsp), %ymm0, %ymm2 # 32-byte Folded Reload +2592 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vandpd %ymm1, %ymm2, %ymm2 +2593 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vaddpd %ymm2, %ymm8, %ymm8 +2594 | 1.00 | 0.00 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 512(%rsp), %ymm0, %ymm2 # 32-byte Folded Reload +2595 | 1.00 | 0.00 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 544(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +2596 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vandpd %ymm1, %ymm2, %ymm2 +2597 | | | | | | | | | | | | | | 0.50 | 0.50 | || | 4.0 | vmovupd (%rsp), %ymm5 # 32-byte Reload +2598 | | | 0.50 | 0.50 | | | | | | | | | | | | || | 3.0 | vaddpd %ymm2, %ymm5, %ymm5 +2599 | | | | | | | 1.00 | | | | | | | | | 1.00 || | 0.0 | vmovupd %ymm5, (%rsp) # 32-byte Spill +2600 | | | | | | | | | | | | | | 0.50 | 0.50 | || | 4.0 | vmovupd (%rsp), %ymm2 # 32-byte Reload +2601 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vandpd %ymm1, %ymm0, %ymm0 +2602 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 32(%rsp), %ymm1 # 32-byte Reload +2603 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vaddpd %ymm0, %ymm1, %ymm1 +2604 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm1, 32(%rsp) # 32-byte Spill +2605 | | | | 
| | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 32(%rsp), %ymm14 # 32-byte Reload +2606 | | | | | | | | | 0.25 | 0.25 | 0.25 | 0.25 | | | | || | | incq %rcx +2607 | | | | | | | | | 0.50 | 0.00 | 0.00 | 0.50 | | | | || | | cmpq %rcx, %r12 +2608 | | | | | | | | | 0.00 | | | | 1.00 | | | || | | jne .LBB6_26 +2609 | | | | | | | | | 0.00 | | | | 1.00 | | | || | | jmp .LBB6_12 +2610 | | | | | | | | | | | | | | | | || | | # LLVM-MCA-END + + 30.5 30.5 26.5 26.5 25.0 1.00 1.00 1.00 1.00 23.0 23.0 25.0 60 11.0 + + + + +Loop-Carried Dependencies Analysis Report +----------------------------------------- +2444 | 11.0 | vmovupd %ymm2, (%rsp) # 32-byte Spill| [2444, 2597, 2598, 2599, 2600] +2442 | 11.0 | vmovupd %ymm14, 32(%rsp) # 32-byte Spill| [2442, 2602, 2603, 2604, 2605] +2574 | 7.0 | vmovupd 64(%rsp), %ymm5 # 32-byte Reload| [2574, 2575, 2576] +2450 | 7.0 | vmovupd %ymm9, 384(%rsp) # 32-byte Spill| [2450, 2554, 2557] +2449 | 7.0 | vmovupd %ymm10, 352(%rsp) # 32-byte Spill| [2449, 2549, 2567] +2448 | 7.0 | vmovupd %ymm7, 320(%rsp) # 32-byte Spill| [2448, 2580, 2582] +2447 | 7.0 | vmovupd %ymm8, 288(%rsp) # 32-byte Spill| [2447, 2579, 2593] +2446 | 7.0 | vmovupd %ymm12, 256(%rsp) # 32-byte Spill| [2446, 2548, 2560] +2445 | 7.0 | vmovupd %ymm4, 224(%rsp) # 32-byte Spill| [2445, 2572, 2585] +2443 | 7.0 | vmovupd %ymm15, 192(%rsp) # 32-byte Spill| [2443, 2544, 2563] +2441 | 7.0 | vmovups %ymm6, 96(%rsp) # 32-byte Spill| [2441, 2587, 2589] +2491 | 4.0 | vmovapd %ymm3, %ymm4 | [2491, 2571] +2606 | 1.0 | incq %rcx | [2606] + diff --git a/static_analysis/rafael/gromacs-avx2-dp.s b/static_analysis/rafael/gromacs-avx2-dp.s new file mode 100644 index 0000000..b15b852 --- /dev/null +++ b/static_analysis/rafael/gromacs-avx2-dp.s @@ -0,0 +1,2706 @@ + .text + .file "force_lj.c" + .section .rodata.cst8,"aM",@progbits,8 + .p2align 3 # -- Begin function computeForceLJ_ref +.LCPI0_0: + .quad 4631952216750555136 # 48 +.LCPI0_1: + .quad 4607182418800017408 # 1 +.LCPI0_2: + .quad -4620693217682128896 # -0.5 + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI0_3: + .quad 1 # 0x1 + .zero 8 + .text + .globl computeForceLJ_ref + .p2align 4, 0x90 + .type computeForceLJ_ref,@function +computeForceLJ_ref: # +.LcomputeForceLJ_ref$local: + .cfi_startproc +# %bb.0: # + pushq %rbp + .cfi_def_cfa_offset 16 + pushq %r15 + .cfi_def_cfa_offset 24 + pushq %r14 + .cfi_def_cfa_offset 32 + pushq %r13 + .cfi_def_cfa_offset 40 + pushq %r12 + .cfi_def_cfa_offset 48 + pushq %rbx + .cfi_def_cfa_offset 56 + subq $168, %rsp + .cfi_def_cfa_offset 224 + .cfi_offset %rbx, -56 + .cfi_offset %r12, -48 + .cfi_offset %r13, -40 + .cfi_offset %r14, -32 + .cfi_offset %r15, -24 + .cfi_offset %rbp, -16 + movq %rcx, %r14 + movq %rdx, 24(%rsp) # 8-byte Spill + movq %rsi, %r12 + movq %rdi, %rbx + movl $.L.str, %edi + xorl %eax, %eax + callq debug_printf + vmovsd 144(%rbx), %xmm0 # xmm0 = mem[0],zero + vmovsd %xmm0, 8(%rsp) # 8-byte Spill + vmovsd 40(%rbx), %xmm0 # xmm0 = mem[0],zero + vmovsd %xmm0, 16(%rsp) # 8-byte Spill + vmovsd 56(%rbx), %xmm1 # xmm1 = mem[0],zero + movl 20(%r12), %r9d + testl %r9d, %r9d + jle .LBB0_6 +# %bb.1: # + movq 176(%r12), %rcx + movq 192(%r12), %r8 + xorl %r10d, %r10d + vxorps %xmm0, %xmm0, %xmm0 + leaq 288(%rcx), %rdi + jmp .LBB0_2 + .p2align 4, 0x90 +.LBB0_22: # + # in Loop: Header=BB0_2 Depth=1 + incq %r10 + addq $96, %rdi + addq $96, %rcx + cmpq %r9, %r10 + jae .LBB0_6 +.LBB0_2: # + # =>This Loop Header: Depth=1 + # Child Loop BB0_17 Depth 2 + # Child Loop BB0_20 Depth 2 + # Child Loop BB0_5 Depth 2 + imulq $56, 
%r10, %rax + movl (%r8,%rax), %ebx + testl %ebx, %ebx + jle .LBB0_22 +# %bb.3: # + # in Loop: Header=BB0_2 Depth=1 + cmpl $3, %ebx + ja .LBB0_14 +# %bb.4: # + # in Loop: Header=BB0_2 Depth=1 + xorl %ebp, %ebp + jmp .LBB0_5 + .p2align 4, 0x90 +.LBB0_14: # + # in Loop: Header=BB0_2 Depth=1 + movl %ebx, %ebp + andl $-4, %ebp + leaq -4(%rbp), %rdx + movq %rdx, %rax + shrq $2, %rax + incq %rax + movl %eax, %esi + andl $7, %esi + cmpq $28, %rdx + jae .LBB0_16 +# %bb.15: # + # in Loop: Header=BB0_2 Depth=1 + xorl %eax, %eax + jmp .LBB0_18 +.LBB0_16: # + # in Loop: Header=BB0_2 Depth=1 + movq %rsi, %rdx + subq %rax, %rdx + xorl %eax, %eax + .p2align 4, 0x90 +.LBB0_17: # + # Parent Loop BB0_2 Depth=1 + # => This Inner Loop Header: Depth=2 + vmovups %ymm0, -288(%rdi,%rax,8) + vmovups %ymm0, -256(%rdi,%rax,8) + vmovups %ymm0, -224(%rdi,%rax,8) + vmovups %ymm0, -192(%rdi,%rax,8) + vmovups %ymm0, -160(%rdi,%rax,8) + vmovups %ymm0, -128(%rdi,%rax,8) + vmovups %ymm0, -96(%rdi,%rax,8) + vmovups %ymm0, -64(%rdi,%rax,8) + vmovups %ymm0, -32(%rdi,%rax,8) + vmovups %ymm0, (%rdi,%rax,8) + addq $32, %rax + addq $8, %rdx + jne .LBB0_17 +.LBB0_18: # + # in Loop: Header=BB0_2 Depth=1 + testq %rsi, %rsi + je .LBB0_21 +# %bb.19: # + # in Loop: Header=BB0_2 Depth=1 + leaq (%rcx,%rax,8), %rax + shlq $5, %rsi + xorl %edx, %edx + .p2align 4, 0x90 +.LBB0_20: # + # Parent Loop BB0_2 Depth=1 + # => This Inner Loop Header: Depth=2 + vmovups %ymm0, (%rax,%rdx) + vmovups %ymm0, 32(%rax,%rdx) + vmovups %ymm0, 64(%rax,%rdx) + addq $32, %rdx + cmpq %rdx, %rsi + jne .LBB0_20 +.LBB0_21: # + # in Loop: Header=BB0_2 Depth=1 + cmpq %rbx, %rbp + je .LBB0_22 + .p2align 4, 0x90 +.LBB0_5: # + # Parent Loop BB0_2 Depth=1 + # => This Inner Loop Header: Depth=2 + movq $0, (%rcx,%rbp,8) + movq $0, 32(%rcx,%rbp,8) + movq $0, 64(%rcx,%rbp,8) + incq %rbp + cmpq %rbx, %rbp + jb .LBB0_5 + jmp .LBB0_22 +.LBB0_6: # + vmovsd %xmm1, (%rsp) # 8-byte Spill + xorl %r13d, %r13d + xorl %eax, %eax + vzeroupper + callq getTimeStamp + vmovsd %xmm0, 56(%rsp) # 8-byte Spill + movl $.L.str.1, %edi + callq likwid_markerStartRegion + movl 20(%r12), %eax + movq %rax, 104(%rsp) # 8-byte Spill + testl %eax, %eax + jle .LBB0_24 +# %bb.7: # + vmovsd 8(%rsp), %xmm0 # 8-byte Reload + # xmm0 = mem[0],zero + vmulsd %xmm0, %xmm0, %xmm8 + movq 160(%r12), %rcx + movq 176(%r12), %rdx + movq 24(%rsp), %rax # 8-byte Reload + movq 16(%rax), %rsi + movq %rsi, 96(%rsp) # 8-byte Spill + movq 40(%rax), %rsi + movq %rsi, 88(%rsp) # 8-byte Spill + movslq 8(%rax), %rax + movq %rax, 80(%rsp) # 8-byte Spill + vmovsd 16(%rsp), %xmm0 # 8-byte Reload + # xmm0 = mem[0],zero + vmulsd .LCPI0_0(%rip), %xmm0, %xmm12 + leaq 32(%r14), %r8 + leaq 24(%r14), %rsi + leaq 40(%r14), %rax + leaq 48(%r14), %rdi + movq %rdi, 16(%rsp) # 8-byte Spill + vmovupd (%r14), %xmm0 + vmovapd %xmm0, 32(%rsp) # 16-byte Spill + movq %r14, 48(%rsp) # 8-byte Spill + movq 16(%r14), %rdi + movq %rdx, 128(%rsp) # 8-byte Spill + leaq 64(%rdx), %rdx + movq %rdx, 72(%rsp) # 8-byte Spill + movq %rcx, 136(%rsp) # 8-byte Spill + leaq 64(%rcx), %rcx + movq %rcx, 64(%rsp) # 8-byte Spill + vmovsd .LCPI0_1(%rip), %xmm9 # xmm9 = mem[0],zero + vmovsd .LCPI0_2(%rip), %xmm13 # xmm13 = mem[0],zero + xorl %r15d, %r15d + vmovsd %xmm8, 8(%rsp) # 8-byte Spill + jmp .LBB0_8 + .p2align 4, 0x90 +.LBB0_26: # + # in Loop: Header=BB0_8 Depth=1 + movq 120(%rsp), %r13 # 8-byte Reload + movq 112(%rsp), %rdi # 8-byte Reload +.LBB0_27: # + # in Loop: Header=BB0_8 Depth=1 + vmovdqa .LCPI0_3(%rip), %xmm0 # xmm0 = <1,u> + vpinsrq $1, %rbp, %xmm0, %xmm0 + 
vmovdqa 32(%rsp), %xmm1 # 16-byte Reload + vpaddq %xmm0, %xmm1, %xmm1 + vmovdqa %xmm1, 32(%rsp) # 16-byte Spill + addq %rbp, %rdi + incq %r15 + addl $3, %r13d + cmpq 104(%rsp), %r15 # 8-byte Folded Reload + jae .LBB0_23 +.LBB0_8: # + # =>This Loop Header: Depth=1 + # Child Loop BB0_10 Depth 2 + # Child Loop BB0_11 Depth 3 + movq 96(%rsp), %rcx # 8-byte Reload + movslq (%rcx,%r15,4), %rbp + testq %rbp, %rbp + jle .LBB0_27 +# %bb.9: # + # in Loop: Header=BB0_8 Depth=1 + movq %rdi, 112(%rsp) # 8-byte Spill + movq %r13, 120(%rsp) # 8-byte Spill + # kill: def $r13d killed $r13d killed $r13 def $r13 + andl $1073741823, %r13d # imm = 0x3FFFFFFF + shlq $5, %r13 + movq 72(%rsp), %rcx # 8-byte Reload + leaq (%rcx,%r13), %r9 + addq 64(%rsp), %r13 # 8-byte Folded Reload + movq %r15, %rcx + imulq 80(%rsp), %rcx # 8-byte Folded Reload + movq 88(%rsp), %rdx # 8-byte Reload + leaq (%rdx,%rcx,4), %rcx + movq %rcx, 144(%rsp) # 8-byte Spill + movq 24(%rsp), %rcx # 8-byte Reload + movl 32(%rcx), %r10d + movl %ebp, %ebp + xorl %edx, %edx + movq %rbp, 152(%rsp) # 8-byte Spill + jmp .LBB0_10 + .p2align 4, 0x90 +.LBB0_25: # + # in Loop: Header=BB0_10 Depth=2 + movq 160(%rsp), %rdx # 8-byte Reload + incq %rdx + movq 152(%rsp), %rbp # 8-byte Reload + cmpq %rbp, %rdx + je .LBB0_26 +.LBB0_10: # + # Parent Loop BB0_8 Depth=1 + # => This Loop Header: Depth=2 + # Child Loop BB0_11 Depth 3 + movq 144(%rsp), %rcx # 8-byte Reload + movq %rdx, 160(%rsp) # 8-byte Spill + movl (%rcx,%rdx,4), %r12d + movslq %r12d, %rcx + shlq $5, %rcx + leaq (%rcx,%rcx,2), %rbp + movq 136(%rsp), %rcx # 8-byte Reload + addq %rbp, %rcx + addq 128(%rsp), %rbp # 8-byte Folded Reload + xorl %edx, %edx + xorl %r14d, %r14d + jmp .LBB0_11 + .p2align 4, 0x90 +.LBB0_41: # + # in Loop: Header=BB0_11 Depth=3 + incq (%r8) +.LBB0_42: # + # in Loop: Header=BB0_11 Depth=3 + testl %r14d, %r14d + movq %rax, %rdi + cmoveq 16(%rsp), %rdi # 8-byte Folded Reload +.LBB0_44: # + # in Loop: Header=BB0_11 Depth=3 + incq (%rdi) + vaddsd -64(%r9,%rdx,8), %xmm5, %xmm0 + vmovsd %xmm0, -64(%r9,%rdx,8) + vaddsd -32(%r9,%rdx,8), %xmm11, %xmm0 + vmovsd %xmm0, -32(%r9,%rdx,8) + vaddsd (%r9,%rdx,8), %xmm6, %xmm0 + vmovsd %xmm0, (%r9,%rdx,8) + incq %rdx + cmpq $4, %rdx + je .LBB0_25 +.LBB0_11: # + # Parent Loop BB0_8 Depth=1 + # Parent Loop BB0_10 Depth=2 + # => This Inner Loop Header: Depth=3 + vmovsd -64(%r13,%rdx,8), %xmm10 # xmm10 = mem[0],zero + vmovsd -32(%r13,%rdx,8), %xmm15 # xmm15 = mem[0],zero + vmovsd (%r13,%rdx,8), %xmm14 # xmm14 = mem[0],zero + cmpq %r12, %r15 + jne .LBB0_35 +# %bb.12: # + # in Loop: Header=BB0_11 Depth=3 + vxorpd %xmm6, %xmm6, %xmm6 + testl %r10d, %r10d + jne .LBB0_13 +# %bb.28: # + # in Loop: Header=BB0_11 Depth=3 + testq %rdx, %rdx + je .LBB0_13 +# %bb.29: # + # in Loop: Header=BB0_11 Depth=3 + vsubsd (%rcx), %xmm10, %xmm0 + vsubsd 32(%rcx), %xmm15, %xmm1 + vsubsd 64(%rcx), %xmm14, %xmm2 + vmulsd %xmm0, %xmm0, %xmm3 + vfmadd231sd %xmm1, %xmm1, %xmm3 # xmm3 = (xmm1 * xmm1) + xmm3 + vfmadd231sd %xmm2, %xmm2, %xmm3 # xmm3 = (xmm2 * xmm2) + xmm3 + vxorpd %xmm6, %xmm6, %xmm6 + vucomisd %xmm8, %xmm3 + movq %r8, %rdi + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm5, %xmm5, %xmm5 + jae .LBB0_31 +# %bb.30: # + # in Loop: Header=BB0_11 Depth=3 + vdivsd %xmm3, %xmm9, %xmm3 + vmulsd (%rsp), %xmm3, %xmm5 # 8-byte Folded Reload + vmulsd %xmm3, %xmm3, %xmm6 + vmulsd %xmm5, %xmm6, %xmm5 + vaddsd %xmm5, %xmm13, %xmm6 + vmulsd %xmm3, %xmm12, %xmm3 + vmulsd %xmm5, %xmm3, %xmm3 + vmulsd %xmm6, %xmm3, %xmm3 + vmulsd %xmm0, %xmm3, %xmm5 + vmulsd %xmm1, %xmm3, %xmm11 + 
vmulsd %xmm2, %xmm3, %xmm6 + movl $1, %r14d + movq %rsi, %rdi +.LBB0_31: # + # in Loop: Header=BB0_11 Depth=3 + incq (%rdi) + jmp .LBB0_32 + .p2align 4, 0x90 +.LBB0_35: # + # in Loop: Header=BB0_11 Depth=3 + vsubsd (%rcx), %xmm10, %xmm1 + vsubsd 32(%rcx), %xmm15, %xmm0 + vsubsd 64(%rcx), %xmm14, %xmm2 + vmulsd %xmm1, %xmm1, %xmm3 + vfmadd231sd %xmm0, %xmm0, %xmm3 # xmm3 = (xmm0 * xmm0) + xmm3 + vfmadd231sd %xmm2, %xmm2, %xmm3 # xmm3 = (xmm2 * xmm2) + xmm3 + vxorpd %xmm6, %xmm6, %xmm6 + vucomisd %xmm8, %xmm3 + movq %r8, %rdi + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm5, %xmm5, %xmm5 + jae .LBB0_39 +# %bb.36: # + # in Loop: Header=BB0_11 Depth=3 + vdivsd %xmm3, %xmm9, %xmm3 + vmulsd (%rsp), %xmm3, %xmm5 # 8-byte Folded Reload + vmulsd %xmm3, %xmm3, %xmm6 + vmulsd %xmm5, %xmm6, %xmm5 + vaddsd %xmm5, %xmm13, %xmm6 + vmulsd %xmm3, %xmm12, %xmm3 + vmulsd %xmm5, %xmm3, %xmm3 + vmulsd %xmm6, %xmm3, %xmm3 + vmulsd %xmm1, %xmm3, %xmm5 + vmulsd %xmm0, %xmm3, %xmm11 + vmulsd %xmm2, %xmm3, %xmm6 + movl $1, %r14d + testl %r10d, %r10d + je .LBB0_38 +# %bb.37: # + # in Loop: Header=BB0_11 Depth=3 + vmovsd (%rbp), %xmm0 # xmm0 = mem[0],zero + vmovsd 32(%rbp), %xmm1 # xmm1 = mem[0],zero + vsubsd %xmm5, %xmm0, %xmm0 + vmovsd %xmm0, (%rbp) + vsubsd %xmm11, %xmm1, %xmm0 + vmovsd %xmm0, 32(%rbp) + vmovsd 64(%rbp), %xmm0 # xmm0 = mem[0],zero + vsubsd %xmm6, %xmm0, %xmm0 + vmovsd %xmm0, 64(%rbp) +.LBB0_38: # + # in Loop: Header=BB0_11 Depth=3 + movq %rsi, %rdi +.LBB0_39: # + # in Loop: Header=BB0_11 Depth=3 + incq (%rdi) + vsubsd 8(%rcx), %xmm10, %xmm0 + vsubsd 40(%rcx), %xmm15, %xmm1 + vsubsd 72(%rcx), %xmm14, %xmm2 + vmulsd %xmm0, %xmm0, %xmm3 + vfmadd231sd %xmm1, %xmm1, %xmm3 # xmm3 = (xmm1 * xmm1) + xmm3 + vfmadd231sd %xmm2, %xmm2, %xmm3 # xmm3 = (xmm2 * xmm2) + xmm3 + vucomisd %xmm8, %xmm3 + jae .LBB0_40 +# %bb.61: # + # in Loop: Header=BB0_11 Depth=3 + vdivsd %xmm3, %xmm9, %xmm3 + vmulsd (%rsp), %xmm3, %xmm4 # 8-byte Folded Reload + vmulsd %xmm3, %xmm3, %xmm7 + vmulsd %xmm4, %xmm7, %xmm4 + vaddsd %xmm4, %xmm13, %xmm7 + vmulsd %xmm3, %xmm12, %xmm3 + vmulsd %xmm4, %xmm3, %xmm3 + vmulsd %xmm7, %xmm3, %xmm3 + vmulsd %xmm0, %xmm3, %xmm0 + vmulsd %xmm1, %xmm3, %xmm1 + vmulsd %xmm2, %xmm3, %xmm2 + testl %r10d, %r10d + je .LBB0_63 +# %bb.62: # + # in Loop: Header=BB0_11 Depth=3 + vmovsd 8(%rbp), %xmm3 # xmm3 = mem[0],zero + vmovsd 40(%rbp), %xmm4 # xmm4 = mem[0],zero + vsubsd %xmm0, %xmm3, %xmm3 + vmovsd %xmm3, 8(%rbp) + vsubsd %xmm1, %xmm4, %xmm3 + vmovsd %xmm3, 40(%rbp) + vmovsd 72(%rbp), %xmm3 # xmm3 = mem[0],zero + vsubsd %xmm2, %xmm3, %xmm3 + vmovsd %xmm3, 72(%rbp) +.LBB0_63: # + # in Loop: Header=BB0_11 Depth=3 + vaddsd %xmm5, %xmm0, %xmm5 + vaddsd %xmm1, %xmm11, %xmm11 + vaddsd %xmm6, %xmm2, %xmm6 + movl $1, %r14d + movq %rsi, %rdi + jmp .LBB0_64 + .p2align 4, 0x90 +.LBB0_13: # + # in Loop: Header=BB0_11 Depth=3 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm5, %xmm5, %xmm5 +.LBB0_32: # + # in Loop: Header=BB0_11 Depth=3 + xorl %edi, %edi + cmpq $1, %rdx + setne %dil + xorl %r11d, %r11d + testq %rdx, %rdx + sete %r11b + testl %r10d, %r10d + cmovel %edi, %r11d + testb %r11b, %r11b + je .LBB0_49 +# %bb.33: # + # in Loop: Header=BB0_11 Depth=3 + vsubsd 8(%rcx), %xmm10, %xmm0 + vsubsd 40(%rcx), %xmm15, %xmm1 + vsubsd 72(%rcx), %xmm14, %xmm2 + vmulsd %xmm0, %xmm0, %xmm3 + vfmadd231sd %xmm1, %xmm1, %xmm3 # xmm3 = (xmm1 * xmm1) + xmm3 + vfmadd231sd %xmm2, %xmm2, %xmm3 # xmm3 = (xmm2 * xmm2) + xmm3 + vucomisd %xmm8, %xmm3 + jae .LBB0_34 +# %bb.45: # + # in Loop: Header=BB0_11 Depth=3 + vmovapd %xmm13, %xmm7 + vdivsd 
%xmm3, %xmm9, %xmm3 + vmulsd (%rsp), %xmm3, %xmm8 # 8-byte Folded Reload + vmulsd %xmm3, %xmm3, %xmm9 + vmulsd %xmm8, %xmm9, %xmm4 + vaddsd %xmm4, %xmm13, %xmm7 + vmovapd %xmm12, %xmm8 + vmulsd %xmm3, %xmm12, %xmm3 + vmulsd %xmm4, %xmm3, %xmm3 + vmulsd %xmm7, %xmm3, %xmm3 + vmulsd %xmm0, %xmm3, %xmm0 + vmulsd %xmm1, %xmm3, %xmm1 + vmulsd %xmm2, %xmm3, %xmm2 + testl %r10d, %r10d + je .LBB0_47 +# %bb.46: # + # in Loop: Header=BB0_11 Depth=3 + vmovsd 8(%rbp), %xmm3 # xmm3 = mem[0],zero + vmovsd 40(%rbp), %xmm4 # xmm4 = mem[0],zero + vsubsd %xmm0, %xmm3, %xmm3 + vmovsd %xmm3, 8(%rbp) + vsubsd %xmm1, %xmm4, %xmm3 + vmovsd %xmm3, 40(%rbp) + vmovsd 72(%rbp), %xmm3 # xmm3 = mem[0],zero + vsubsd %xmm2, %xmm3, %xmm3 + vmovsd %xmm3, 72(%rbp) +.LBB0_47: # + # in Loop: Header=BB0_11 Depth=3 + vaddsd %xmm5, %xmm0, %xmm5 + vaddsd %xmm1, %xmm11, %xmm11 + vaddsd %xmm6, %xmm2, %xmm6 + movl $1, %r14d + movq %rsi, %rdi + vmovsd .LCPI0_1(%rip), %xmm9 # xmm9 = mem[0],zero + vmovapd %xmm8, %xmm12 + vmovsd 8(%rsp), %xmm8 # 8-byte Reload + # xmm8 = mem[0],zero + jmp .LBB0_48 + .p2align 4, 0x90 +.LBB0_40: # + # in Loop: Header=BB0_11 Depth=3 + movq %r8, %rdi +.LBB0_64: # + # in Loop: Header=BB0_11 Depth=3 + incq (%rdi) + vsubsd 16(%rcx), %xmm10, %xmm0 + vsubsd 48(%rcx), %xmm15, %xmm1 + vsubsd 80(%rcx), %xmm14, %xmm2 + vmulsd %xmm0, %xmm0, %xmm3 + vfmadd231sd %xmm1, %xmm1, %xmm3 # xmm3 = (xmm1 * xmm1) + xmm3 + vfmadd231sd %xmm2, %xmm2, %xmm3 # xmm3 = (xmm2 * xmm2) + xmm3 + vucomisd %xmm8, %xmm3 + jae .LBB0_65 +# %bb.66: # + # in Loop: Header=BB0_11 Depth=3 + vdivsd %xmm3, %xmm9, %xmm3 + vmulsd (%rsp), %xmm3, %xmm4 # 8-byte Folded Reload + vmulsd %xmm3, %xmm3, %xmm7 + vmulsd %xmm4, %xmm7, %xmm4 + vaddsd %xmm4, %xmm13, %xmm7 + vmulsd %xmm3, %xmm12, %xmm3 + vmulsd %xmm4, %xmm3, %xmm3 + vmulsd %xmm7, %xmm3, %xmm3 + vmulsd %xmm0, %xmm3, %xmm0 + vmulsd %xmm1, %xmm3, %xmm1 + vmulsd %xmm2, %xmm3, %xmm2 + testl %r10d, %r10d + je .LBB0_68 +# %bb.67: # + # in Loop: Header=BB0_11 Depth=3 + vmovsd 16(%rbp), %xmm3 # xmm3 = mem[0],zero + vmovsd 48(%rbp), %xmm4 # xmm4 = mem[0],zero + vsubsd %xmm0, %xmm3, %xmm3 + vmovsd %xmm3, 16(%rbp) + vsubsd %xmm1, %xmm4, %xmm3 + vmovsd %xmm3, 48(%rbp) + vmovsd 80(%rbp), %xmm3 # xmm3 = mem[0],zero + vsubsd %xmm2, %xmm3, %xmm3 + vmovsd %xmm3, 80(%rbp) +.LBB0_68: # + # in Loop: Header=BB0_11 Depth=3 + vaddsd %xmm5, %xmm0, %xmm5 + vaddsd %xmm1, %xmm11, %xmm11 + vaddsd %xmm6, %xmm2, %xmm6 + movl $1, %r14d + movq %rsi, %rdi + incq (%rsi) + jmp .LBB0_57 + .p2align 4, 0x90 +.LBB0_65: # + # in Loop: Header=BB0_11 Depth=3 + movq %r8, %rdi + incq (%r8) + jmp .LBB0_57 +.LBB0_34: # + # in Loop: Header=BB0_11 Depth=3 + movq %r8, %rdi +.LBB0_48: # + # in Loop: Header=BB0_11 Depth=3 + incq (%rdi) +.LBB0_49: # + # in Loop: Header=BB0_11 Depth=3 + xorl %edi, %edi + xorl %ebx, %ebx + cmpq $2, %rdx + setne %dil + setb %bl + testl %r10d, %r10d + cmovel %edi, %ebx + cmpb $1, %bl + jne .LBB0_56 +# %bb.50: # + # in Loop: Header=BB0_11 Depth=3 + vsubsd 16(%rcx), %xmm10, %xmm0 + vsubsd 48(%rcx), %xmm15, %xmm1 + vsubsd 80(%rcx), %xmm14, %xmm2 + vmulsd %xmm0, %xmm0, %xmm3 + vfmadd231sd %xmm1, %xmm1, %xmm3 # xmm3 = (xmm1 * xmm1) + xmm3 + vfmadd231sd %xmm2, %xmm2, %xmm3 # xmm3 = (xmm2 * xmm2) + xmm3 + vucomisd %xmm8, %xmm3 + jae .LBB0_51 +# %bb.52: # + # in Loop: Header=BB0_11 Depth=3 + vdivsd %xmm3, %xmm9, %xmm3 + vmulsd (%rsp), %xmm3, %xmm4 # 8-byte Folded Reload + vmulsd %xmm3, %xmm3, %xmm7 + vmulsd %xmm4, %xmm7, %xmm4 + vaddsd %xmm4, %xmm13, %xmm7 + vmulsd %xmm3, %xmm12, %xmm3 + vmulsd %xmm4, %xmm3, %xmm3 + vmulsd %xmm7, 
%xmm3, %xmm3 + vmulsd %xmm0, %xmm3, %xmm0 + vmulsd %xmm1, %xmm3, %xmm1 + vmulsd %xmm2, %xmm3, %xmm2 + testl %r10d, %r10d + je .LBB0_54 +# %bb.53: # + # in Loop: Header=BB0_11 Depth=3 + vmovsd 16(%rbp), %xmm3 # xmm3 = mem[0],zero + vmovsd 48(%rbp), %xmm4 # xmm4 = mem[0],zero + vsubsd %xmm0, %xmm3, %xmm3 + vmovsd %xmm3, 16(%rbp) + vsubsd %xmm1, %xmm4, %xmm3 + vmovsd %xmm3, 48(%rbp) + vmovsd 80(%rbp), %xmm3 # xmm3 = mem[0],zero + vsubsd %xmm2, %xmm3, %xmm3 + vmovsd %xmm3, 80(%rbp) +.LBB0_54: # + # in Loop: Header=BB0_11 Depth=3 + vaddsd %xmm5, %xmm0, %xmm5 + vaddsd %xmm1, %xmm11, %xmm11 + vaddsd %xmm6, %xmm2, %xmm6 + movl $1, %r14d + movq %rsi, %rdi + jmp .LBB0_55 +.LBB0_51: # + # in Loop: Header=BB0_11 Depth=3 + movq %r8, %rdi +.LBB0_55: # + # in Loop: Header=BB0_11 Depth=3 + incq (%rdi) +.LBB0_56: # + # in Loop: Header=BB0_11 Depth=3 + xorl %edi, %edi + xorl %ebx, %ebx + cmpq $3, %rdx + setne %dil + setb %bl + testl %r10d, %r10d + cmovel %edi, %ebx + cmpb $1, %bl + jne .LBB0_42 +.LBB0_57: # + # in Loop: Header=BB0_11 Depth=3 + vsubsd 24(%rcx), %xmm10, %xmm0 + vsubsd 56(%rcx), %xmm15, %xmm1 + vsubsd 88(%rcx), %xmm14, %xmm2 + vmulsd %xmm0, %xmm0, %xmm3 + vfmadd231sd %xmm1, %xmm1, %xmm3 # xmm3 = (xmm1 * xmm1) + xmm3 + vfmadd231sd %xmm2, %xmm2, %xmm3 # xmm3 = (xmm2 * xmm2) + xmm3 + vucomisd %xmm8, %xmm3 + jae .LBB0_41 +# %bb.58: # + # in Loop: Header=BB0_11 Depth=3 + vdivsd %xmm3, %xmm9, %xmm3 + vmulsd (%rsp), %xmm3, %xmm4 # 8-byte Folded Reload + vmulsd %xmm3, %xmm3, %xmm7 + vmulsd %xmm4, %xmm7, %xmm4 + vaddsd %xmm4, %xmm13, %xmm7 + vmulsd %xmm3, %xmm12, %xmm3 + vmulsd %xmm4, %xmm3, %xmm3 + vmulsd %xmm7, %xmm3, %xmm3 + vmulsd %xmm0, %xmm3, %xmm0 + vmulsd %xmm1, %xmm3, %xmm1 + vmulsd %xmm2, %xmm3, %xmm2 + testl %r10d, %r10d + je .LBB0_60 +# %bb.59: # + # in Loop: Header=BB0_11 Depth=3 + vmovsd 24(%rbp), %xmm3 # xmm3 = mem[0],zero + vmovsd 56(%rbp), %xmm4 # xmm4 = mem[0],zero + vsubsd %xmm0, %xmm3, %xmm3 + vmovsd %xmm3, 24(%rbp) + vsubsd %xmm1, %xmm4, %xmm3 + vmovsd %xmm3, 56(%rbp) + vmovsd 88(%rbp), %xmm3 # xmm3 = mem[0],zero + vsubsd %xmm2, %xmm3, %xmm3 + vmovsd %xmm3, 88(%rbp) +.LBB0_60: # + # in Loop: Header=BB0_11 Depth=3 + vaddsd %xmm5, %xmm0, %xmm5 + vaddsd %xmm1, %xmm11, %xmm11 + vaddsd %xmm6, %xmm2, %xmm6 +# %bb.43: # + # in Loop: Header=BB0_11 Depth=3 + incq (%rsi) + movl $1, %r14d + movq %rax, %rdi + jmp .LBB0_44 +.LBB0_23: # + movq 48(%rsp), %rax # 8-byte Reload + vmovaps 32(%rsp), %xmm0 # 16-byte Reload + vmovups %xmm0, (%rax) + movq %rdi, 16(%rax) +.LBB0_24: # + movl $.L.str.1, %edi + callq likwid_markerStopRegion + xorl %eax, %eax + callq getTimeStamp + vmovsd %xmm0, (%rsp) # 8-byte Spill + movl $.L.str.2, %edi + xorl %eax, %eax + callq debug_printf + vmovsd (%rsp), %xmm0 # 8-byte Reload + # xmm0 = mem[0],zero + vsubsd 56(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload + addq $168, %rsp + .cfi_def_cfa_offset 56 + popq %rbx + .cfi_def_cfa_offset 48 + popq %r12 + .cfi_def_cfa_offset 40 + popq %r13 + .cfi_def_cfa_offset 32 + popq %r14 + .cfi_def_cfa_offset 24 + popq %r15 + .cfi_def_cfa_offset 16 + popq %rbp + .cfi_def_cfa_offset 8 + retq +.Lfunc_end0: + .size computeForceLJ_ref, .Lfunc_end0-computeForceLJ_ref + .cfi_endproc + # -- End function + .globl computeForceLJ_2xnn_half # -- Begin function computeForceLJ_2xnn_half + .p2align 4, 0x90 + .type computeForceLJ_2xnn_half,@function +computeForceLJ_2xnn_half: # +.LcomputeForceLJ_2xnn_half$local: + .cfi_startproc +# %bb.0: # + pushq %r14 + .cfi_def_cfa_offset 16 + pushq %rbx + .cfi_def_cfa_offset 24 + subq $24, %rsp + .cfi_def_cfa_offset 48 
+ .cfi_offset %rbx, -24 + .cfi_offset %r14, -16 + movq %rsi, %r14 + movl $.L.str.3, %edi + xorl %eax, %eax + callq debug_printf + movl 20(%r14), %r9d + testl %r9d, %r9d + jle .LBB1_15 +# %bb.1: # + movq 176(%r14), %rcx + movq 192(%r14), %r8 + xorl %r10d, %r10d + vxorps %xmm0, %xmm0, %xmm0 + leaq 288(%rcx), %rdi + jmp .LBB1_2 + .p2align 4, 0x90 +.LBB1_14: # + # in Loop: Header=BB1_2 Depth=1 + incq %r10 + addq $96, %rdi + addq $96, %rcx + cmpq %r9, %r10 + jae .LBB1_15 +.LBB1_2: # + # =>This Loop Header: Depth=1 + # Child Loop BB1_9 Depth 2 + # Child Loop BB1_12 Depth 2 + # Child Loop BB1_5 Depth 2 + imulq $56, %r10, %rax + movl (%r8,%rax), %edx + testl %edx, %edx + jle .LBB1_14 +# %bb.3: # + # in Loop: Header=BB1_2 Depth=1 + cmpl $3, %edx + ja .LBB1_6 +# %bb.4: # + # in Loop: Header=BB1_2 Depth=1 + xorl %ebx, %ebx + jmp .LBB1_5 + .p2align 4, 0x90 +.LBB1_6: # + # in Loop: Header=BB1_2 Depth=1 + movl %edx, %ebx + andl $-4, %ebx + leaq -4(%rbx), %rax + movq %rax, %rsi + shrq $2, %rsi + incq %rsi + movl %esi, %r11d + andl $7, %r11d + cmpq $28, %rax + jae .LBB1_8 +# %bb.7: # + # in Loop: Header=BB1_2 Depth=1 + xorl %esi, %esi + jmp .LBB1_10 +.LBB1_8: # + # in Loop: Header=BB1_2 Depth=1 + movq %r11, %rax + subq %rsi, %rax + xorl %esi, %esi + .p2align 4, 0x90 +.LBB1_9: # + # Parent Loop BB1_2 Depth=1 + # => This Inner Loop Header: Depth=2 + vmovups %ymm0, -288(%rdi,%rsi,8) + vmovups %ymm0, -256(%rdi,%rsi,8) + vmovups %ymm0, -224(%rdi,%rsi,8) + vmovups %ymm0, -192(%rdi,%rsi,8) + vmovups %ymm0, -160(%rdi,%rsi,8) + vmovups %ymm0, -128(%rdi,%rsi,8) + vmovups %ymm0, -96(%rdi,%rsi,8) + vmovups %ymm0, -64(%rdi,%rsi,8) + vmovups %ymm0, -32(%rdi,%rsi,8) + vmovups %ymm0, (%rdi,%rsi,8) + addq $32, %rsi + addq $8, %rax + jne .LBB1_9 +.LBB1_10: # + # in Loop: Header=BB1_2 Depth=1 + testq %r11, %r11 + je .LBB1_13 +# %bb.11: # + # in Loop: Header=BB1_2 Depth=1 + leaq (%rcx,%rsi,8), %rax + shlq $5, %r11 + xorl %esi, %esi + .p2align 4, 0x90 +.LBB1_12: # + # Parent Loop BB1_2 Depth=1 + # => This Inner Loop Header: Depth=2 + vmovups %ymm0, (%rax,%rsi) + vmovups %ymm0, 32(%rax,%rsi) + vmovups %ymm0, 64(%rax,%rsi) + addq $32, %rsi + cmpq %rsi, %r11 + jne .LBB1_12 +.LBB1_13: # + # in Loop: Header=BB1_2 Depth=1 + cmpq %rdx, %rbx + je .LBB1_14 + .p2align 4, 0x90 +.LBB1_5: # + # Parent Loop BB1_2 Depth=1 + # => This Inner Loop Header: Depth=2 + movq $0, (%rcx,%rbx,8) + movq $0, 32(%rcx,%rbx,8) + movq $0, 64(%rcx,%rbx,8) + incq %rbx + cmpq %rdx, %rbx + jb .LBB1_5 + jmp .LBB1_14 +.LBB1_15: # + xorl %eax, %eax + vzeroupper + callq getTimeStamp + vmovsd %xmm0, 16(%rsp) # 8-byte Spill + movl $.L.str.1, %edi + callq likwid_markerStartRegion + cmpl $0, 20(%r14) + jg .LBB1_17 +# %bb.16: # + movl $.L.str.1, %edi + callq likwid_markerStopRegion + xorl %eax, %eax + callq getTimeStamp + vmovsd %xmm0, 8(%rsp) # 8-byte Spill + movl $.L.str.4, %edi + xorl %eax, %eax + callq debug_printf + vmovsd 8(%rsp), %xmm0 # 8-byte Reload + # xmm0 = mem[0],zero + vsubsd 16(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload + addq $24, %rsp + .cfi_def_cfa_offset 24 + popq %rbx + .cfi_def_cfa_offset 16 + popq %r14 + .cfi_def_cfa_offset 8 + retq +.LBB1_17: # + .cfi_def_cfa_offset 48 + callq simd_load_h_dual +.Lfunc_end1: + .size computeForceLJ_2xnn_half, .Lfunc_end1-computeForceLJ_2xnn_half + .cfi_endproc + # -- End function + .p2align 4, 0x90 # -- Begin function simd_load_h_dual + .type simd_load_h_dual,@function +simd_load_h_dual: # + .cfi_startproc +# %bb.0: # + pushq %rax + .cfi_def_cfa_offset 16 + movq stderr(%rip), %rcx + movl $.L.str.7, %edi + movl $67, 
%esi + movl $1, %edx + callq fwrite + movl $-1, %edi + callq exit +.Lfunc_end2: + .size simd_load_h_dual, .Lfunc_end2-simd_load_h_dual + .cfi_endproc + # -- End function + .globl computeForceLJ_2xnn_full # -- Begin function computeForceLJ_2xnn_full + .p2align 4, 0x90 + .type computeForceLJ_2xnn_full,@function +computeForceLJ_2xnn_full: # +.LcomputeForceLJ_2xnn_full$local: + .cfi_startproc +# %bb.0: # + pushq %r14 + .cfi_def_cfa_offset 16 + pushq %rbx + .cfi_def_cfa_offset 24 + subq $24, %rsp + .cfi_def_cfa_offset 48 + .cfi_offset %rbx, -24 + .cfi_offset %r14, -16 + movq %rsi, %r14 + movl $.L.str.3, %edi + xorl %eax, %eax + callq debug_printf + movl 20(%r14), %r9d + testl %r9d, %r9d + jle .LBB3_15 +# %bb.1: # + movq 176(%r14), %rcx + movq 192(%r14), %r8 + xorl %r10d, %r10d + vxorps %xmm0, %xmm0, %xmm0 + leaq 288(%rcx), %rdi + jmp .LBB3_2 + .p2align 4, 0x90 +.LBB3_14: # + # in Loop: Header=BB3_2 Depth=1 + incq %r10 + addq $96, %rdi + addq $96, %rcx + cmpq %r9, %r10 + jae .LBB3_15 +.LBB3_2: # + # =>This Loop Header: Depth=1 + # Child Loop BB3_9 Depth 2 + # Child Loop BB3_12 Depth 2 + # Child Loop BB3_5 Depth 2 + imulq $56, %r10, %rax + movl (%r8,%rax), %edx + testl %edx, %edx + jle .LBB3_14 +# %bb.3: # + # in Loop: Header=BB3_2 Depth=1 + cmpl $3, %edx + ja .LBB3_6 +# %bb.4: # + # in Loop: Header=BB3_2 Depth=1 + xorl %ebx, %ebx + jmp .LBB3_5 + .p2align 4, 0x90 +.LBB3_6: # + # in Loop: Header=BB3_2 Depth=1 + movl %edx, %ebx + andl $-4, %ebx + leaq -4(%rbx), %rax + movq %rax, %rsi + shrq $2, %rsi + incq %rsi + movl %esi, %r11d + andl $7, %r11d + cmpq $28, %rax + jae .LBB3_8 +# %bb.7: # + # in Loop: Header=BB3_2 Depth=1 + xorl %esi, %esi + jmp .LBB3_10 +.LBB3_8: # + # in Loop: Header=BB3_2 Depth=1 + movq %r11, %rax + subq %rsi, %rax + xorl %esi, %esi + .p2align 4, 0x90 +.LBB3_9: # + # Parent Loop BB3_2 Depth=1 + # => This Inner Loop Header: Depth=2 + vmovups %ymm0, -288(%rdi,%rsi,8) + vmovups %ymm0, -256(%rdi,%rsi,8) + vmovups %ymm0, -224(%rdi,%rsi,8) + vmovups %ymm0, -192(%rdi,%rsi,8) + vmovups %ymm0, -160(%rdi,%rsi,8) + vmovups %ymm0, -128(%rdi,%rsi,8) + vmovups %ymm0, -96(%rdi,%rsi,8) + vmovups %ymm0, -64(%rdi,%rsi,8) + vmovups %ymm0, -32(%rdi,%rsi,8) + vmovups %ymm0, (%rdi,%rsi,8) + addq $32, %rsi + addq $8, %rax + jne .LBB3_9 +.LBB3_10: # + # in Loop: Header=BB3_2 Depth=1 + testq %r11, %r11 + je .LBB3_13 +# %bb.11: # + # in Loop: Header=BB3_2 Depth=1 + leaq (%rcx,%rsi,8), %rax + shlq $5, %r11 + xorl %esi, %esi + .p2align 4, 0x90 +.LBB3_12: # + # Parent Loop BB3_2 Depth=1 + # => This Inner Loop Header: Depth=2 + vmovups %ymm0, (%rax,%rsi) + vmovups %ymm0, 32(%rax,%rsi) + vmovups %ymm0, 64(%rax,%rsi) + addq $32, %rsi + cmpq %rsi, %r11 + jne .LBB3_12 +.LBB3_13: # + # in Loop: Header=BB3_2 Depth=1 + cmpq %rdx, %rbx + je .LBB3_14 + .p2align 4, 0x90 +.LBB3_5: # + # Parent Loop BB3_2 Depth=1 + # => This Inner Loop Header: Depth=2 + movq $0, (%rcx,%rbx,8) + movq $0, 32(%rcx,%rbx,8) + movq $0, 64(%rcx,%rbx,8) + incq %rbx + cmpq %rdx, %rbx + jb .LBB3_5 + jmp .LBB3_14 +.LBB3_15: # + xorl %eax, %eax + vzeroupper + callq getTimeStamp + vmovsd %xmm0, 16(%rsp) # 8-byte Spill + movl $.L.str.1, %edi + callq likwid_markerStartRegion + cmpl $0, 20(%r14) + jg .LBB3_17 +# %bb.16: # + movl $.L.str.1, %edi + callq likwid_markerStopRegion + xorl %eax, %eax + callq getTimeStamp + vmovsd %xmm0, 8(%rsp) # 8-byte Spill + movl $.L.str.4, %edi + xorl %eax, %eax + callq debug_printf + vmovsd 8(%rsp), %xmm0 # 8-byte Reload + # xmm0 = mem[0],zero + vsubsd 16(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload + addq $24, %rsp + 
.cfi_def_cfa_offset 24 + popq %rbx + .cfi_def_cfa_offset 16 + popq %r14 + .cfi_def_cfa_offset 8 + retq +.LBB3_17: # + .cfi_def_cfa_offset 48 + callq simd_load_h_dual +.Lfunc_end3: + .size computeForceLJ_2xnn_full, .Lfunc_end3-computeForceLJ_2xnn_full + .cfi_endproc + # -- End function + .globl computeForceLJ_2xnn # -- Begin function computeForceLJ_2xnn + .p2align 4, 0x90 + .type computeForceLJ_2xnn,@function +computeForceLJ_2xnn: # +.LcomputeForceLJ_2xnn$local: + .cfi_startproc +# %bb.0: # + cmpl $0, 32(%rdx) + je .LBB4_2 +# %bb.1: # + jmp .LcomputeForceLJ_2xnn_half$local # TAILCALL +.LBB4_2: # + jmp .LcomputeForceLJ_2xnn_full$local # TAILCALL +.Lfunc_end4: + .size computeForceLJ_2xnn, .Lfunc_end4-computeForceLJ_2xnn + .cfi_endproc + # -- End function + .section .rodata.cst8,"aM",@progbits,8 + .p2align 3 # -- Begin function computeForceLJ_4xn_half +.LCPI5_0: + .quad 4631952216750555136 # 48 +.LCPI5_2: + .quad -4620693217682128896 # -0.5 + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI5_1: + .long 31 # 0x1f + .long 30 # 0x1e + .long 29 # 0x1d + .long 28 # 0x1c +.LCPI5_3: + .quad 1 # 0x1 + .zero 8 + .text + .globl computeForceLJ_4xn_half + .p2align 4, 0x90 + .type computeForceLJ_4xn_half,@function +computeForceLJ_4xn_half: # +.LcomputeForceLJ_4xn_half$local: + .cfi_startproc +# %bb.0: # + pushq %rbp + .cfi_def_cfa_offset 16 + pushq %r15 + .cfi_def_cfa_offset 24 + pushq %r14 + .cfi_def_cfa_offset 32 + pushq %r13 + .cfi_def_cfa_offset 40 + pushq %r12 + .cfi_def_cfa_offset 48 + pushq %rbx + .cfi_def_cfa_offset 56 + subq $1256, %rsp # imm = 0x4E8 + .cfi_def_cfa_offset 1312 + .cfi_offset %rbx, -56 + .cfi_offset %r12, -48 + .cfi_offset %r13, -40 + .cfi_offset %r14, -32 + .cfi_offset %r15, -24 + .cfi_offset %rbp, -16 + movq %rcx, 80(%rsp) # 8-byte Spill + movq %rdx, 72(%rsp) # 8-byte Spill + movq %rsi, %rbx + movq %rdi, %r12 + movl $.L.str.5, %edi + xorl %eax, %eax + callq debug_printf + vmovsd 144(%r12), %xmm0 # xmm0 = mem[0],zero + vmovsd %xmm0, 96(%rsp) # 8-byte Spill + vbroadcastsd 56(%r12), %ymm0 + vmovups %ymm0, 832(%rsp) # 32-byte Spill + vmovsd 40(%r12), %xmm0 # xmm0 = mem[0],zero + vmovupd %ymm0, 32(%rsp) # 32-byte Spill + movl 20(%rbx), %r9d + testl %r9d, %r9d + jle .LBB5_6 +# %bb.1: # + movq 176(%rbx), %rcx + movq 192(%rbx), %r8 + xorl %r10d, %r10d + vxorpd %xmm0, %xmm0, %xmm0 + leaq 288(%rcx), %rdi + jmp .LBB5_2 + .p2align 4, 0x90 +.LBB5_25: # + # in Loop: Header=BB5_2 Depth=1 + incq %r10 + addq $96, %rdi + addq $96, %rcx + cmpq %r9, %r10 + jae .LBB5_6 +.LBB5_2: # + # =>This Loop Header: Depth=1 + # Child Loop BB5_20 Depth 2 + # Child Loop BB5_23 Depth 2 + # Child Loop BB5_5 Depth 2 + imulq $56, %r10, %rax + movl (%r8,%rax), %edx + testl %edx, %edx + jle .LBB5_25 +# %bb.3: # + # in Loop: Header=BB5_2 Depth=1 + cmpl $3, %edx + ja .LBB5_17 +# %bb.4: # + # in Loop: Header=BB5_2 Depth=1 + xorl %ebp, %ebp + jmp .LBB5_5 + .p2align 4, 0x90 +.LBB5_17: # + # in Loop: Header=BB5_2 Depth=1 + movl %edx, %ebp + andl $-4, %ebp + leaq -4(%rbp), %rax + movq %rax, %rsi + shrq $2, %rsi + incq %rsi + movl %esi, %r11d + andl $7, %r11d + cmpq $28, %rax + jae .LBB5_19 +# %bb.18: # + # in Loop: Header=BB5_2 Depth=1 + xorl %esi, %esi + jmp .LBB5_21 +.LBB5_19: # + # in Loop: Header=BB5_2 Depth=1 + movq %r11, %rax + subq %rsi, %rax + xorl %esi, %esi + .p2align 4, 0x90 +.LBB5_20: # + # Parent Loop BB5_2 Depth=1 + # => This Inner Loop Header: Depth=2 + vmovupd %ymm0, -288(%rdi,%rsi,8) + vmovupd %ymm0, -256(%rdi,%rsi,8) + vmovupd %ymm0, -224(%rdi,%rsi,8) + vmovupd %ymm0, -192(%rdi,%rsi,8) + vmovupd 
%ymm0, -160(%rdi,%rsi,8) + vmovupd %ymm0, -128(%rdi,%rsi,8) + vmovupd %ymm0, -96(%rdi,%rsi,8) + vmovupd %ymm0, -64(%rdi,%rsi,8) + vmovupd %ymm0, -32(%rdi,%rsi,8) + vmovupd %ymm0, (%rdi,%rsi,8) + addq $32, %rsi + addq $8, %rax + jne .LBB5_20 +.LBB5_21: # + # in Loop: Header=BB5_2 Depth=1 + testq %r11, %r11 + je .LBB5_24 +# %bb.22: # + # in Loop: Header=BB5_2 Depth=1 + leaq (%rcx,%rsi,8), %rax + shlq $5, %r11 + xorl %esi, %esi + .p2align 4, 0x90 +.LBB5_23: # + # Parent Loop BB5_2 Depth=1 + # => This Inner Loop Header: Depth=2 + vmovupd %ymm0, (%rax,%rsi) + vmovupd %ymm0, 32(%rax,%rsi) + vmovupd %ymm0, 64(%rax,%rsi) + addq $32, %rsi + cmpq %rsi, %r11 + jne .LBB5_23 +.LBB5_24: # + # in Loop: Header=BB5_2 Depth=1 + cmpq %rdx, %rbp + je .LBB5_25 + .p2align 4, 0x90 +.LBB5_5: # + # Parent Loop BB5_2 Depth=1 + # => This Inner Loop Header: Depth=2 + movq $0, (%rcx,%rbp,8) + movq $0, 32(%rcx,%rbp,8) + movq $0, 64(%rcx,%rbp,8) + incq %rbp + cmpq %rdx, %rbp + jb .LBB5_5 + jmp .LBB5_25 +.LBB5_6: # + xorl %eax, %eax + vzeroupper + callq getTimeStamp + vmovsd %xmm0, 64(%rsp) # 8-byte Spill + movl $.L.str.1, %edi + callq likwid_markerStartRegion + cmpl $0, 20(%rbx) + jle .LBB5_16 +# %bb.7: # + vmovsd 96(%rsp), %xmm0 # 8-byte Reload + # xmm0 = mem[0],zero + vmulsd %xmm0, %xmm0, %xmm0 + vbroadcastsd %xmm0, %ymm0 + vmovupd %ymm0, 800(%rsp) # 32-byte Spill + vmovupd 32(%rsp), %ymm0 # 32-byte Reload + vmulsd .LCPI5_0(%rip), %xmm0, %xmm0 + vbroadcastsd %xmm0, %ymm0 + vmovupd %ymm0, 768(%rsp) # 32-byte Spill + xorl %r8d, %r8d + jmp .LBB5_8 + .p2align 4, 0x90 +.LBB5_15: # + # in Loop: Header=BB5_8 Depth=1 + vhaddpd %ymm0, %ymm1, %ymm0 + vhaddpd %ymm15, %ymm14, %ymm1 + vhaddpd %ymm4, %ymm11, %ymm2 + vhaddpd %ymm3, %ymm13, %ymm3 + vhaddpd %ymm7, %ymm5, %ymm4 + vhaddpd %ymm6, %ymm12, %ymm5 + vperm2f128 $33, %ymm4, %ymm5, %ymm6 # ymm6 = ymm5[2,3],ymm4[0,1] + vaddpd %ymm5, %ymm6, %ymm5 + vaddpd %ymm4, %ymm6, %ymm4 + vblendpd $12, %ymm4, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm4[2,3] + vaddpd (%r11,%r12,8), %ymm4, %ymm4 + vmovapd %ymm4, (%r11,%r12,8) + vperm2f128 $33, %ymm2, %ymm3, %ymm4 # ymm4 = ymm3[2,3],ymm2[0,1] + vaddpd %ymm3, %ymm4, %ymm3 + vaddpd %ymm2, %ymm4, %ymm2 + vblendpd $12, %ymm2, %ymm3, %ymm2 # ymm2 = ymm3[0,1],ymm2[2,3] + vaddpd 32(%r11,%r12,8), %ymm2, %ymm2 + vmovapd %ymm2, 32(%r11,%r12,8) + vperm2f128 $33, %ymm0, %ymm1, %ymm2 # ymm2 = ymm1[2,3],ymm0[0,1] + vaddpd %ymm1, %ymm2, %ymm1 + vaddpd %ymm0, %ymm2, %ymm0 + vblendpd $12, %ymm0, %ymm1, %ymm0 # ymm0 = ymm1[0,1],ymm0[2,3] + vaddpd 64(%r11,%r12,8), %ymm0, %ymm0 + vmovapd %ymm0, 64(%r11,%r12,8) + vmovdqa .LCPI5_3(%rip), %xmm0 # xmm0 = <1,u> + vpinsrq $1, %r13, %xmm0, %xmm0 + movq 80(%rsp), %rax # 8-byte Reload + vpaddq (%rax), %xmm0, %xmm0 + vmovdqu %xmm0, (%rax) + addq %r13, 16(%rax) + incq %r8 + movslq 20(%rbx), %rax + cmpq %rax, %r8 + jge .LBB5_16 +.LBB5_8: # + # =>This Loop Header: Depth=1 + # Child Loop BB5_27 Depth 2 + # Child Loop BB5_12 Depth 2 + leaq (,%r8,4), %rax + leaq (%rax,%rax,2), %r12 + movq 160(%rbx), %r15 + movq 176(%rbx), %r11 + movq 72(%rsp), %rcx # 8-byte Reload + movq 40(%rcx), %rdx + movl 8(%rcx), %esi + movq 16(%rcx), %rax + movslq (%rax,%r8,4), %r13 + movq 24(%rcx), %rax + movl (%rax,%r8,4), %r10d + vbroadcastsd (%r15,%r12,8), %ymm0 + vmovups %ymm0, 1216(%rsp) # 32-byte Spill + vbroadcastsd 8(%r15,%r12,8), %ymm0 + vmovups %ymm0, 1184(%rsp) # 32-byte Spill + vbroadcastsd 16(%r15,%r12,8), %ymm0 + vmovups %ymm0, 1152(%rsp) # 32-byte Spill + vbroadcastsd 24(%r15,%r12,8), %ymm0 + vmovups %ymm0, 1120(%rsp) # 32-byte Spill + vbroadcastsd 
32(%r15,%r12,8), %ymm0 + vmovups %ymm0, 1088(%rsp) # 32-byte Spill + vbroadcastsd 40(%r15,%r12,8), %ymm0 + vmovups %ymm0, 1056(%rsp) # 32-byte Spill + vbroadcastsd 48(%r15,%r12,8), %ymm0 + vmovups %ymm0, 1024(%rsp) # 32-byte Spill + vbroadcastsd 56(%r15,%r12,8), %ymm0 + vmovups %ymm0, 992(%rsp) # 32-byte Spill + vbroadcastsd 64(%r15,%r12,8), %ymm0 + vmovups %ymm0, 960(%rsp) # 32-byte Spill + vbroadcastsd 72(%r15,%r12,8), %ymm0 + vmovups %ymm0, 928(%rsp) # 32-byte Spill + vbroadcastsd 80(%r15,%r12,8), %ymm0 + vmovups %ymm0, 896(%rsp) # 32-byte Spill + vbroadcastsd 88(%r15,%r12,8), %ymm0 + vmovupd %ymm0, 864(%rsp) # 32-byte Spill + testl %r10d, %r10d + jle .LBB5_9 +# %bb.26: # + # in Loop: Header=BB5_8 Depth=1 + movl %esi, 28(%rsp) # 4-byte Spill + movl %esi, %eax + imull %r8d, %eax + cltq + movq %rdx, 88(%rsp) # 8-byte Spill + leaq (%rdx,%rax,4), %r9 + leaq -1(%r10), %rdx + vxorpd %xmm0, %xmm0, %xmm0 + vmovupd %ymm0, 32(%rsp) # 32-byte Spill + movq %r11, %rsi + movl $0, %ecx + vxorpd %xmm0, %xmm0, %xmm0 + vmovupd %ymm0, 384(%rsp) # 32-byte Spill + vxorpd %xmm0, %xmm0, %xmm0 + vmovupd %ymm0, 256(%rsp) # 32-byte Spill + vxorpd %xmm0, %xmm0, %xmm0 + vmovupd %ymm0, 96(%rsp) # 32-byte Spill + vxorpd %xmm0, %xmm0, %xmm0 + vmovupd %ymm0, 352(%rsp) # 32-byte Spill + vxorpd %xmm0, %xmm0, %xmm0 + vmovupd %ymm0, 224(%rsp) # 32-byte Spill + vxorpd %xmm0, %xmm0, %xmm0 + vmovupd %ymm0, 448(%rsp) # 32-byte Spill + vxorpd %xmm0, %xmm0, %xmm0 + vmovupd %ymm0, 320(%rsp) # 32-byte Spill + vxorpd %xmm0, %xmm0, %xmm0 + vmovupd %ymm0, 160(%rsp) # 32-byte Spill + vxorpd %xmm0, %xmm0, %xmm0 + vmovupd %ymm0, 416(%rsp) # 32-byte Spill + vxorpd %xmm0, %xmm0, %xmm0 + vmovupd %ymm0, 288(%rsp) # 32-byte Spill + vxorpd %xmm0, %xmm0, %xmm0 + vmovupd %ymm0, 192(%rsp) # 32-byte Spill + .p2align 4, 0x90 +.LBB5_27: # + # Parent Loop BB5_8 Depth=1 + # => This Inner Loop Header: Depth=2 + movslq (%r9,%rcx,4), %r14 + movq %r14, %rax + shlq $2, %rax + leaq (%rax,%rax,2), %rbp + vmovapd (%r15,%rbp,8), %ymm4 + vmovapd 32(%r15,%rbp,8), %ymm15 + vmovapd 64(%r15,%rbp,8), %ymm2 + vmovupd 1216(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm4, %ymm0, %ymm13 + vmovupd %ymm13, 608(%rsp) # 32-byte Spill + vmovupd 1088(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm15, %ymm0, %ymm12 + vmovupd %ymm12, 672(%rsp) # 32-byte Spill + vmovupd 960(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm2, %ymm0, %ymm11 + vmovupd %ymm11, 128(%rsp) # 32-byte Spill + vmovupd 1184(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm4, %ymm0, %ymm10 + vmovupd %ymm10, 640(%rsp) # 32-byte Spill + vmovupd 1056(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm15, %ymm0, %ymm9 + vmovupd %ymm9, 704(%rsp) # 32-byte Spill + vmovupd 928(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm2, %ymm0, %ymm6 + vmovupd %ymm6, 736(%rsp) # 32-byte Spill + movl %r14d, %edi + xorl %eax, %eax + cmpq %rdi, %r8 + sete %al + leal (,%rax,4), %edi + vpbroadcastd 312(%rbx,%rdi,4), %xmm0 + vmovdqa .LCPI5_1(%rip), %xmm7 # xmm7 = [31,30,29,28] + vpsllvd %xmm7, %xmm0, %xmm0 + vmovupd 1024(%rsp), %ymm1 # 32-byte Reload + vsubpd %ymm15, %ymm1, %ymm14 + vmovupd %ymm14, 544(%rsp) # 32-byte Spill + vpsrad $31, %xmm0, %xmm0 + vpmovsxdq %xmm0, %ymm5 + leal 1(,%rax,4), %edi + vpbroadcastd 312(%rbx,%rdi,4), %xmm0 + vpsllvd %xmm7, %xmm0, %xmm0 + vpsrad $31, %xmm0, %xmm0 + vpmovsxdq %xmm0, %ymm3 + leal 2(,%rax,4), %edi + vpbroadcastd 312(%rbx,%rdi,4), %xmm0 + vpsllvd %xmm7, %xmm0, %xmm8 + vmulpd %ymm11, %ymm11, %ymm0 + vfmadd231pd %ymm12, %ymm12, %ymm0 # ymm0 = (ymm12 * ymm12) + ymm0 + vfmadd231pd %ymm13, %ymm13, %ymm0 # ymm0 = (ymm13 * 
ymm13) + ymm0 + vmulpd %ymm6, %ymm6, %ymm1 + vfmadd231pd %ymm9, %ymm9, %ymm1 # ymm1 = (ymm9 * ymm9) + ymm1 + vfmadd231pd %ymm10, %ymm10, %ymm1 # ymm1 = (ymm10 * ymm10) + ymm1 + vmovupd 800(%rsp), %ymm9 # 32-byte Reload + vcmpltpd %ymm9, %ymm0, %ymm10 + vandpd %ymm5, %ymm10, %ymm5 + vcmpltpd %ymm9, %ymm1, %ymm10 + vandpd %ymm3, %ymm10, %ymm11 + vmovupd 896(%rsp), %ymm3 # 32-byte Reload + vsubpd %ymm2, %ymm3, %ymm6 + vmovupd %ymm6, 576(%rsp) # 32-byte Spill + vpsrad $31, %xmm8, %xmm3 + vpmovsxdq %xmm3, %ymm8 + vmulpd %ymm6, %ymm6, %ymm3 + vfmadd231pd %ymm14, %ymm14, %ymm3 # ymm3 = (ymm14 * ymm14) + ymm3 + vmovupd 1152(%rsp), %ymm6 # 32-byte Reload + vsubpd %ymm4, %ymm6, %ymm13 + vfmadd231pd %ymm13, %ymm13, %ymm3 # ymm3 = (ymm13 * ymm13) + ymm3 + vcmpltpd %ymm9, %ymm3, %ymm10 + vandpd %ymm8, %ymm10, %ymm10 + vmovupd 1120(%rsp), %ymm6 # 32-byte Reload + vsubpd %ymm4, %ymm6, %ymm12 + vmovupd %ymm12, 480(%rsp) # 32-byte Spill + leal 3(,%rax,4), %eax + vpbroadcastd 312(%rbx,%rax,4), %xmm6 + vpsllvd %xmm7, %xmm6, %xmm6 + vmovupd 992(%rsp), %ymm4 # 32-byte Reload + vsubpd %ymm15, %ymm4, %ymm8 + vmovupd 864(%rsp), %ymm4 # 32-byte Reload + vsubpd %ymm2, %ymm4, %ymm2 + vmovupd %ymm2, 512(%rsp) # 32-byte Spill + vpsrad $31, %xmm6, %xmm6 + vpmovsxdq %xmm6, %ymm15 + vmulpd %ymm2, %ymm2, %ymm6 + vfmadd231pd %ymm8, %ymm8, %ymm6 # ymm6 = (ymm8 * ymm8) + ymm6 + vfmadd231pd %ymm12, %ymm12, %ymm6 # ymm6 = (ymm12 * ymm12) + ymm6 + vcmpltpd %ymm9, %ymm6, %ymm9 + vandpd %ymm15, %ymm9, %ymm15 + vcvtpd2ps %ymm0, %xmm0 + vrcpps %xmm0, %xmm0 + vcvtps2pd %xmm0, %ymm0 + vmovupd 832(%rsp), %ymm2 # 32-byte Reload + vmulpd %ymm2, %ymm0, %ymm9 + vmulpd %ymm0, %ymm0, %ymm12 + vmulpd %ymm9, %ymm12, %ymm9 + vcvtpd2ps %ymm1, %xmm1 + vrcpps %xmm1, %xmm1 + vcvtps2pd %xmm1, %ymm1 + vmulpd %ymm2, %ymm1, %ymm12 + vmulpd %ymm1, %ymm1, %ymm14 + vmulpd %ymm12, %ymm14, %ymm12 + vcvtpd2ps %ymm3, %xmm3 + vrcpps %xmm3, %xmm3 + vcvtps2pd %xmm3, %ymm3 + vmulpd %ymm2, %ymm3, %ymm14 + vmulpd %ymm3, %ymm3, %ymm7 + vmulpd %ymm7, %ymm14, %ymm7 + vcvtpd2ps %ymm6, %xmm6 + vrcpps %xmm6, %xmm6 + vcvtps2pd %xmm6, %ymm6 + vmulpd %ymm2, %ymm6, %ymm14 + vmulpd %ymm6, %ymm6, %ymm2 + vmulpd %ymm2, %ymm14, %ymm4 + vmovupd 768(%rsp), %ymm2 # 32-byte Reload + vmulpd %ymm0, %ymm2, %ymm0 + vmulpd %ymm0, %ymm9, %ymm0 + vbroadcastsd .LCPI5_2(%rip), %ymm14 # ymm14 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] + vaddpd %ymm14, %ymm9, %ymm9 + vmulpd %ymm0, %ymm9, %ymm0 + vmulpd %ymm1, %ymm2, %ymm1 + vmulpd %ymm1, %ymm12, %ymm1 + vaddpd %ymm14, %ymm12, %ymm9 + vmulpd %ymm1, %ymm9, %ymm9 + vmulpd %ymm3, %ymm2, %ymm1 + vmulpd %ymm7, %ymm1, %ymm1 + vaddpd %ymm7, %ymm14, %ymm3 + vmulpd %ymm3, %ymm1, %ymm7 + vaddpd %ymm4, %ymm14, %ymm1 + vmulpd %ymm6, %ymm2, %ymm3 + vmulpd %ymm4, %ymm3, %ymm2 + vmulpd %ymm1, %ymm2, %ymm2 + vmulpd 608(%rsp), %ymm0, %ymm1 # 32-byte Folded Reload + vmulpd 672(%rsp), %ymm0, %ymm3 # 32-byte Folded Reload + vmulpd 128(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload + vandpd %ymm1, %ymm5, %ymm12 + vandpd %ymm3, %ymm5, %ymm6 + vandpd %ymm0, %ymm5, %ymm1 + vmulpd 640(%rsp), %ymm9, %ymm0 # 32-byte Folded Reload + vmulpd 704(%rsp), %ymm9, %ymm3 # 32-byte Folded Reload + vmulpd 736(%rsp), %ymm9, %ymm9 # 32-byte Folded Reload + vandpd %ymm0, %ymm11, %ymm5 + vandpd %ymm3, %ymm11, %ymm3 + vandpd %ymm9, %ymm11, %ymm0 + vmulpd %ymm7, %ymm13, %ymm9 + vmulpd 544(%rsp), %ymm7, %ymm13 # 32-byte Folded Reload + vmulpd 576(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload + vandpd %ymm9, %ymm10, %ymm11 + vandpd %ymm13, %ymm10, %ymm9 + vandpd %ymm7, %ymm10, %ymm7 + vmulpd 
480(%rsp), %ymm2, %ymm4 # 32-byte Folded Reload + vmulpd %ymm2, %ymm8, %ymm10 + vmulpd 512(%rsp), %ymm2, %ymm2 # 32-byte Folded Reload + vandpd %ymm4, %ymm15, %ymm8 + vandpd %ymm10, %ymm15, %ymm4 + vandpd %ymm2, %ymm15, %ymm2 + cmpl 4(%rbx), %r14d + jge .LBB5_29 +# %bb.28: # + # in Loop: Header=BB5_27 Depth=2 + vmovapd 32(%rsi,%rbp,8), %ymm10 + vmovapd 64(%rsi,%rbp,8), %ymm13 + vaddpd %ymm5, %ymm12, %ymm14 + vaddpd %ymm11, %ymm8, %ymm15 + vsubpd (%rsi,%rbp,8), %ymm14, %ymm14 + vaddpd %ymm15, %ymm14, %ymm14 + vxorpd %xmm15, %xmm15, %xmm15 + vsubpd %ymm14, %ymm15, %ymm14 + vmovapd %ymm14, (%rsi,%rbp,8) + vaddpd %ymm3, %ymm6, %ymm14 + vsubpd %ymm10, %ymm14, %ymm10 + vaddpd %ymm4, %ymm9, %ymm14 + vaddpd %ymm14, %ymm10, %ymm10 + vsubpd %ymm10, %ymm15, %ymm10 + vmovapd %ymm10, 32(%rsi,%rbp,8) + vaddpd %ymm0, %ymm1, %ymm10 + vsubpd %ymm13, %ymm10, %ymm10 + vaddpd %ymm7, %ymm2, %ymm13 + vaddpd %ymm13, %ymm10, %ymm10 + vsubpd %ymm10, %ymm15, %ymm10 + vmovapd %ymm10, 64(%rsi,%rbp,8) +.LBB5_29: # + # in Loop: Header=BB5_27 Depth=2 + vmovupd 32(%rsp), %ymm10 # 32-byte Reload + vaddpd %ymm12, %ymm10, %ymm12 + vmovupd 384(%rsp), %ymm13 # 32-byte Reload + vaddpd %ymm6, %ymm13, %ymm13 + vmovupd 256(%rsp), %ymm14 # 32-byte Reload + vaddpd %ymm1, %ymm14, %ymm14 + vmovupd 96(%rsp), %ymm6 # 32-byte Reload + vaddpd %ymm5, %ymm6, %ymm6 + vmovupd 352(%rsp), %ymm1 # 32-byte Reload + vaddpd %ymm3, %ymm1, %ymm3 + vmovupd 224(%rsp), %ymm15 # 32-byte Reload + vaddpd %ymm0, %ymm15, %ymm15 + vmovupd 448(%rsp), %ymm5 # 32-byte Reload + vaddpd %ymm5, %ymm11, %ymm5 + vmovupd 320(%rsp), %ymm11 # 32-byte Reload + vaddpd %ymm9, %ymm11, %ymm11 + vmovupd 160(%rsp), %ymm1 # 32-byte Reload + vaddpd %ymm7, %ymm1, %ymm1 + vmovupd 416(%rsp), %ymm7 # 32-byte Reload + vaddpd %ymm7, %ymm8, %ymm7 + vmovupd 288(%rsp), %ymm0 # 32-byte Reload + vaddpd %ymm4, %ymm0, %ymm4 + vmovupd 192(%rsp), %ymm0 # 32-byte Reload + vaddpd %ymm2, %ymm0, %ymm0 + cmpq %rcx, %rdx + je .LBB5_30 +# %bb.31: # + # in Loop: Header=BB5_27 Depth=2 + vmovupd %ymm1, 160(%rsp) # 32-byte Spill + vmovupd %ymm0, 192(%rsp) # 32-byte Spill + vmovupd %ymm15, 224(%rsp) # 32-byte Spill + vmovupd %ymm14, 256(%rsp) # 32-byte Spill + vmovupd %ymm4, 288(%rsp) # 32-byte Spill + vmovupd %ymm11, 320(%rsp) # 32-byte Spill + vmovupd %ymm3, 352(%rsp) # 32-byte Spill + vmovupd %ymm13, 384(%rsp) # 32-byte Spill + vmovupd %ymm7, 416(%rsp) # 32-byte Spill + vmovupd %ymm5, 448(%rsp) # 32-byte Spill + vmovupd %ymm6, 96(%rsp) # 32-byte Spill + vmovupd %ymm12, 32(%rsp) # 32-byte Spill + incq %rcx + movq 160(%rbx), %r15 + movq 176(%rbx), %rsi + jmp .LBB5_27 + .p2align 4, 0x90 +.LBB5_30: # + # in Loop: Header=BB5_8 Depth=1 + movq 88(%rsp), %rdx # 8-byte Reload + movl 28(%rsp), %esi # 4-byte Reload + cmpl %r13d, %r10d + jge .LBB5_15 + jmp .LBB5_11 + .p2align 4, 0x90 +.LBB5_9: # + # in Loop: Header=BB5_8 Depth=1 + vxorpd %xmm0, %xmm0, %xmm0 + vxorpd %xmm4, %xmm4, %xmm4 + vxorpd %xmm7, %xmm7, %xmm7 + vxorpd %xmm1, %xmm1, %xmm1 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm5, %xmm5, %xmm5 + vxorpd %xmm15, %xmm15, %xmm15 + vxorpd %xmm3, %xmm3, %xmm3 + vxorpd %xmm6, %xmm6, %xmm6 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm12, %xmm12, %xmm12 + cmpl %r13d, %r10d + jge .LBB5_15 +.LBB5_11: # + # in Loop: Header=BB5_8 Depth=1 + movslq %r10d, %rcx + imull %r8d, %esi + movslq %esi, %rax + leaq (%rdx,%rax,4), %rdx + jmp .LBB5_12 + .p2align 4, 0x90 +.LBB5_14: # + # in Loop: Header=BB5_12 Depth=2 + vmovupd 32(%rsp), %ymm12 # 32-byte Reload + vaddpd %ymm3, %ymm12, %ymm12 + vmovupd 
384(%rsp), %ymm13 # 32-byte Reload + vaddpd %ymm14, %ymm13, %ymm13 + vmovupd 256(%rsp), %ymm14 # 32-byte Reload + vaddpd %ymm6, %ymm14, %ymm14 + vmovupd 96(%rsp), %ymm3 # 32-byte Reload + vaddpd %ymm4, %ymm3, %ymm6 + vmovupd 352(%rsp), %ymm3 # 32-byte Reload + vaddpd %ymm5, %ymm3, %ymm3 + vmovupd 224(%rsp), %ymm15 # 32-byte Reload + vaddpd %ymm2, %ymm15, %ymm15 + vmovupd 448(%rsp), %ymm5 # 32-byte Reload + vaddpd %ymm5, %ymm8, %ymm5 + vmovupd 320(%rsp), %ymm11 # 32-byte Reload + vaddpd %ymm1, %ymm11, %ymm11 + vmovupd 160(%rsp), %ymm1 # 32-byte Reload + vaddpd %ymm0, %ymm1, %ymm1 + vmovupd 416(%rsp), %ymm0 # 32-byte Reload + vaddpd %ymm7, %ymm0, %ymm7 + vmovupd 288(%rsp), %ymm4 # 32-byte Reload + vaddpd %ymm4, %ymm10, %ymm4 + vmovupd 192(%rsp), %ymm0 # 32-byte Reload + vaddpd %ymm0, %ymm9, %ymm0 + incq %rcx + cmpq %rcx, %r13 + je .LBB5_15 +.LBB5_12: # + # Parent Loop BB5_8 Depth=1 + # => This Inner Loop Header: Depth=2 + vmovupd %ymm1, 160(%rsp) # 32-byte Spill + vmovupd %ymm0, 192(%rsp) # 32-byte Spill + vmovupd %ymm15, 224(%rsp) # 32-byte Spill + vmovupd %ymm14, 256(%rsp) # 32-byte Spill + vmovupd %ymm4, 288(%rsp) # 32-byte Spill + vmovupd %ymm11, 320(%rsp) # 32-byte Spill + vmovupd %ymm3, 352(%rsp) # 32-byte Spill + vmovupd %ymm13, 384(%rsp) # 32-byte Spill + vmovupd %ymm7, 416(%rsp) # 32-byte Spill + vmovupd %ymm5, 448(%rsp) # 32-byte Spill + vmovupd %ymm6, 96(%rsp) # 32-byte Spill + vmovupd %ymm12, 32(%rsp) # 32-byte Spill + movslq (%rdx,%rcx,4), %rdi + movq %rdi, %rax + shlq $2, %rax + leaq (%rax,%rax,2), %rsi + movq 160(%rbx), %rax + vmovapd (%rax,%rsi,8), %ymm2 + vmovapd 32(%rax,%rsi,8), %ymm13 + vmovapd 64(%rax,%rsi,8), %ymm5 + vmovupd 1216(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm2, %ymm0, %ymm10 + vmovupd %ymm10, 640(%rsp) # 32-byte Spill + vmovupd 1088(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm13, %ymm0, %ymm9 + vmovupd %ymm9, 672(%rsp) # 32-byte Spill + vmovupd 960(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm5, %ymm0, %ymm4 + vmovupd %ymm4, 128(%rsp) # 32-byte Spill + vmovupd 1184(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm2, %ymm0, %ymm8 + vmovupd %ymm8, 608(%rsp) # 32-byte Spill + vmovupd 1056(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm13, %ymm0, %ymm3 + vmovupd %ymm3, 704(%rsp) # 32-byte Spill + vmovupd 928(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm5, %ymm0, %ymm1 + vmovupd %ymm1, 736(%rsp) # 32-byte Spill + vmovupd 1024(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm13, %ymm0, %ymm7 + vmovupd %ymm7, 544(%rsp) # 32-byte Spill + vmovupd 896(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm5, %ymm0, %ymm6 + vmovupd %ymm6, 576(%rsp) # 32-byte Spill + vmulpd %ymm4, %ymm4, %ymm4 + vfmadd231pd %ymm9, %ymm9, %ymm4 # ymm4 = (ymm9 * ymm9) + ymm4 + vfmadd231pd %ymm10, %ymm10, %ymm4 # ymm4 = (ymm10 * ymm10) + ymm4 + vmulpd %ymm1, %ymm1, %ymm12 + vfmadd231pd %ymm3, %ymm3, %ymm12 # ymm12 = (ymm3 * ymm3) + ymm12 + vcvtpd2ps %ymm4, %xmm0 + vfmadd231pd %ymm8, %ymm8, %ymm12 # ymm12 = (ymm8 * ymm8) + ymm12 + vrcpps %xmm0, %xmm0 + vcvtps2pd %xmm0, %ymm0 + vmovupd 832(%rsp), %ymm10 # 32-byte Reload + vmulpd %ymm0, %ymm10, %ymm1 + vmulpd %ymm0, %ymm0, %ymm11 + vmulpd %ymm1, %ymm11, %ymm3 + vmulpd %ymm6, %ymm6, %ymm11 + vfmadd231pd %ymm7, %ymm7, %ymm11 # ymm11 = (ymm7 * ymm7) + ymm11 + vmovupd 1152(%rsp), %ymm1 # 32-byte Reload + vsubpd %ymm2, %ymm1, %ymm15 + vcvtpd2ps %ymm12, %xmm7 + vfmadd231pd %ymm15, %ymm15, %ymm11 # ymm11 = (ymm15 * ymm15) + ymm11 + vrcpps %xmm7, %xmm7 + vcvtps2pd %xmm7, %ymm8 + vcvtpd2ps %ymm11, %xmm7 + vmulpd %ymm10, %ymm8, %ymm1 + vmulpd %ymm8, %ymm8, %ymm9 + vmulpd %ymm1, 
%ymm9, %ymm1 + vrcpps %xmm7, %xmm7 + vcvtps2pd %xmm7, %ymm9 + vmulpd %ymm10, %ymm9, %ymm7 + vmulpd %ymm9, %ymm9, %ymm6 + vmulpd %ymm7, %ymm6, %ymm6 + vmovupd 1120(%rsp), %ymm7 # 32-byte Reload + vsubpd %ymm2, %ymm7, %ymm7 + vmovupd 992(%rsp), %ymm2 # 32-byte Reload + vsubpd %ymm13, %ymm2, %ymm14 + vmovupd %ymm14, 480(%rsp) # 32-byte Spill + vmovupd 864(%rsp), %ymm2 # 32-byte Reload + vsubpd %ymm5, %ymm2, %ymm2 + vmovupd %ymm2, 512(%rsp) # 32-byte Spill + vmulpd %ymm2, %ymm2, %ymm13 + vfmadd231pd %ymm14, %ymm14, %ymm13 # ymm13 = (ymm14 * ymm14) + ymm13 + vfmadd231pd %ymm7, %ymm7, %ymm13 # ymm13 = (ymm7 * ymm7) + ymm13 + vcvtpd2ps %ymm13, %xmm14 + vrcpps %xmm14, %xmm5 + vcvtps2pd %xmm5, %ymm5 + vmulpd %ymm5, %ymm10, %ymm14 + vmulpd %ymm5, %ymm5, %ymm10 + vmulpd %ymm14, %ymm10, %ymm10 + vmovupd 768(%rsp), %ymm2 # 32-byte Reload + vmulpd %ymm0, %ymm2, %ymm0 + vmulpd %ymm3, %ymm0, %ymm0 + vbroadcastsd .LCPI5_2(%rip), %ymm14 # ymm14 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] + vaddpd %ymm3, %ymm14, %ymm3 + vmulpd %ymm3, %ymm0, %ymm0 + vmulpd %ymm2, %ymm8, %ymm3 + vmulpd %ymm1, %ymm3, %ymm3 + vaddpd %ymm1, %ymm14, %ymm1 + vmulpd %ymm1, %ymm3, %ymm1 + vmulpd %ymm2, %ymm9, %ymm3 + vmulpd %ymm6, %ymm3, %ymm3 + vaddpd %ymm6, %ymm14, %ymm6 + vmulpd %ymm6, %ymm3, %ymm8 + vaddpd %ymm14, %ymm10, %ymm3 + vmulpd %ymm5, %ymm2, %ymm5 + vmulpd %ymm5, %ymm10, %ymm5 + vmulpd %ymm3, %ymm5, %ymm9 + vmulpd 640(%rsp), %ymm0, %ymm3 # 32-byte Folded Reload + vmulpd 672(%rsp), %ymm0, %ymm5 # 32-byte Folded Reload + vmulpd 128(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload + vmovupd 800(%rsp), %ymm14 # 32-byte Reload + vcmpltpd %ymm14, %ymm4, %ymm4 + vandpd %ymm4, %ymm3, %ymm2 + vmovupd %ymm2, 128(%rsp) # 32-byte Spill + vandpd %ymm4, %ymm5, %ymm3 + vandpd %ymm4, %ymm0, %ymm6 + vmulpd 608(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload + vmulpd 704(%rsp), %ymm1, %ymm5 # 32-byte Folded Reload + vmulpd 736(%rsp), %ymm1, %ymm1 # 32-byte Folded Reload + vcmpltpd %ymm14, %ymm12, %ymm2 + vandpd %ymm2, %ymm0, %ymm4 + vandpd %ymm2, %ymm5, %ymm5 + vandpd %ymm2, %ymm1, %ymm2 + vmulpd %ymm15, %ymm8, %ymm0 + vmulpd 544(%rsp), %ymm8, %ymm1 # 32-byte Folded Reload + vmulpd 576(%rsp), %ymm8, %ymm10 # 32-byte Folded Reload + vcmpltpd %ymm14, %ymm11, %ymm11 + vandpd %ymm0, %ymm11, %ymm8 + vandpd %ymm1, %ymm11, %ymm1 + vandpd %ymm11, %ymm10, %ymm0 + vmulpd %ymm7, %ymm9, %ymm7 + vmulpd 480(%rsp), %ymm9, %ymm10 # 32-byte Folded Reload + vmulpd 512(%rsp), %ymm9, %ymm9 # 32-byte Folded Reload + vcmpltpd %ymm14, %ymm13, %ymm11 + vmovapd %ymm3, %ymm14 + vmovupd 128(%rsp), %ymm3 # 32-byte Reload + vandpd %ymm7, %ymm11, %ymm7 + vandpd %ymm11, %ymm10, %ymm10 + vandpd %ymm11, %ymm9, %ymm9 + cmpl 4(%rbx), %edi + jge .LBB5_14 +# %bb.13: # + # in Loop: Header=BB5_12 Depth=2 + movq 176(%rbx), %rax + vmovapd 32(%rax,%rsi,8), %ymm11 + vmovapd 64(%rax,%rsi,8), %ymm12 + vaddpd %ymm4, %ymm3, %ymm13 + vaddpd %ymm7, %ymm8, %ymm15 + vsubpd (%rax,%rsi,8), %ymm13, %ymm13 + vaddpd %ymm15, %ymm13, %ymm13 + vxorpd %xmm15, %xmm15, %xmm15 + vsubpd %ymm13, %ymm15, %ymm13 + vmovapd %ymm13, (%rax,%rsi,8) + vaddpd %ymm5, %ymm14, %ymm13 + vsubpd %ymm11, %ymm13, %ymm11 + vaddpd %ymm1, %ymm10, %ymm13 + vaddpd %ymm13, %ymm11, %ymm11 + vsubpd %ymm11, %ymm15, %ymm11 + vmovapd %ymm11, 32(%rax,%rsi,8) + vaddpd %ymm2, %ymm6, %ymm11 + vsubpd %ymm12, %ymm11, %ymm11 + vaddpd %ymm0, %ymm9, %ymm12 + vaddpd %ymm12, %ymm11, %ymm11 + vsubpd %ymm11, %ymm15, %ymm11 + vmovapd %ymm11, 64(%rax,%rsi,8) + jmp .LBB5_14 +.LBB5_16: # + movl $.L.str.1, %edi + vzeroupper + callq likwid_markerStopRegion + xorl 
%eax, %eax + callq getTimeStamp + vmovsd %xmm0, 32(%rsp) # 8-byte Spill + movl $.L.str.6, %edi + xorl %eax, %eax + callq debug_printf + vmovsd 32(%rsp), %xmm0 # 8-byte Reload + # xmm0 = mem[0],zero + vsubsd 64(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload + addq $1256, %rsp # imm = 0x4E8 + .cfi_def_cfa_offset 56 + popq %rbx + .cfi_def_cfa_offset 48 + popq %r12 + .cfi_def_cfa_offset 40 + popq %r13 + .cfi_def_cfa_offset 32 + popq %r14 + .cfi_def_cfa_offset 24 + popq %r15 + .cfi_def_cfa_offset 16 + popq %rbp + .cfi_def_cfa_offset 8 + retq +.Lfunc_end5: + .size computeForceLJ_4xn_half, .Lfunc_end5-computeForceLJ_4xn_half + .cfi_endproc + # -- End function + .section .rodata.cst8,"aM",@progbits,8 + .p2align 3 # -- Begin function computeForceLJ_4xn_full +.LCPI6_0: + .quad 4631952216750555136 # 48 +.LCPI6_2: + .quad -4620693217682128896 # -0.5 + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI6_1: + .long 31 # 0x1f + .long 30 # 0x1e + .long 29 # 0x1d + .long 28 # 0x1c +.LCPI6_3: + .quad 1 # 0x1 + .zero 8 + .text + .globl computeForceLJ_4xn_full + .p2align 4, 0x90 + .type computeForceLJ_4xn_full,@function +computeForceLJ_4xn_full: # +.LcomputeForceLJ_4xn_full$local: + .cfi_startproc +# %bb.0: # + pushq %rbp + .cfi_def_cfa_offset 16 + pushq %r15 + .cfi_def_cfa_offset 24 + pushq %r14 + .cfi_def_cfa_offset 32 + pushq %r13 + .cfi_def_cfa_offset 40 + pushq %r12 + .cfi_def_cfa_offset 48 + pushq %rbx + .cfi_def_cfa_offset 56 + subq $1256, %rsp # imm = 0x4E8 + .cfi_def_cfa_offset 1312 + .cfi_offset %rbx, -56 + .cfi_offset %r12, -48 + .cfi_offset %r13, -40 + .cfi_offset %r14, -32 + .cfi_offset %r15, -24 + .cfi_offset %rbp, -16 + movq %rcx, %r14 + movq %rdx, %r15 + movq %rsi, %rbp + movq %rdi, %r12 + movl $.L.str.5, %edi + xorl %eax, %eax + callq debug_printf + vmovsd 144(%r12), %xmm0 # xmm0 = mem[0],zero + vmovsd %xmm0, 32(%rsp) # 8-byte Spill + vbroadcastsd 56(%r12), %ymm1 + vmovsd 40(%r12), %xmm0 # xmm0 = mem[0],zero + vmovupd %ymm0, (%rsp) # 32-byte Spill + movl 20(%rbp), %r9d + testl %r9d, %r9d + jle .LBB6_6 +# %bb.1: # + movq 176(%rbp), %rcx + movq 192(%rbp), %r8 + xorl %r10d, %r10d + vxorpd %xmm0, %xmm0, %xmm0 + leaq 288(%rcx), %rdi + jmp .LBB6_2 + .p2align 4, 0x90 +.LBB6_22: # + # in Loop: Header=BB6_2 Depth=1 + incq %r10 + addq $96, %rdi + addq $96, %rcx + cmpq %r9, %r10 + jae .LBB6_6 +.LBB6_2: # + # =>This Loop Header: Depth=1 + # Child Loop BB6_17 Depth 2 + # Child Loop BB6_20 Depth 2 + # Child Loop BB6_5 Depth 2 + imulq $56, %r10, %rax + movl (%r8,%rax), %eax + testl %eax, %eax + jle .LBB6_22 +# %bb.3: # + # in Loop: Header=BB6_2 Depth=1 + cmpl $3, %eax + ja .LBB6_14 +# %bb.4: # + # in Loop: Header=BB6_2 Depth=1 + xorl %edx, %edx + jmp .LBB6_5 + .p2align 4, 0x90 +.LBB6_14: # + # in Loop: Header=BB6_2 Depth=1 + movl %eax, %edx + andl $-4, %edx + leaq -4(%rdx), %rbx + movq %rbx, %rsi + shrq $2, %rsi + incq %rsi + movl %esi, %r11d + andl $7, %r11d + cmpq $28, %rbx + jae .LBB6_16 +# %bb.15: # + # in Loop: Header=BB6_2 Depth=1 + xorl %esi, %esi + jmp .LBB6_18 +.LBB6_16: # + # in Loop: Header=BB6_2 Depth=1 + movq %r11, %rbx + subq %rsi, %rbx + xorl %esi, %esi + .p2align 4, 0x90 +.LBB6_17: # + # Parent Loop BB6_2 Depth=1 + # => This Inner Loop Header: Depth=2 + vmovupd %ymm0, -288(%rdi,%rsi,8) + vmovupd %ymm0, -256(%rdi,%rsi,8) + vmovupd %ymm0, -224(%rdi,%rsi,8) + vmovupd %ymm0, -192(%rdi,%rsi,8) + vmovupd %ymm0, -160(%rdi,%rsi,8) + vmovupd %ymm0, -128(%rdi,%rsi,8) + vmovupd %ymm0, -96(%rdi,%rsi,8) + vmovupd %ymm0, -64(%rdi,%rsi,8) + vmovupd %ymm0, -32(%rdi,%rsi,8) + vmovupd %ymm0, 
(%rdi,%rsi,8) + addq $32, %rsi + addq $8, %rbx + jne .LBB6_17 +.LBB6_18: # + # in Loop: Header=BB6_2 Depth=1 + testq %r11, %r11 + je .LBB6_21 +# %bb.19: # + # in Loop: Header=BB6_2 Depth=1 + leaq (%rcx,%rsi,8), %rsi + shlq $5, %r11 + xorl %ebx, %ebx + .p2align 4, 0x90 +.LBB6_20: # + # Parent Loop BB6_2 Depth=1 + # => This Inner Loop Header: Depth=2 + vmovupd %ymm0, (%rsi,%rbx) + vmovupd %ymm0, 32(%rsi,%rbx) + vmovupd %ymm0, 64(%rsi,%rbx) + addq $32, %rbx + cmpq %rbx, %r11 + jne .LBB6_20 +.LBB6_21: # + # in Loop: Header=BB6_2 Depth=1 + cmpq %rax, %rdx + je .LBB6_22 + .p2align 4, 0x90 +.LBB6_5: # + # Parent Loop BB6_2 Depth=1 + # => This Inner Loop Header: Depth=2 + movq $0, (%rcx,%rdx,8) + movq $0, 32(%rcx,%rdx,8) + movq $0, 64(%rcx,%rdx,8) + incq %rdx + cmpq %rax, %rdx + jb .LBB6_5 + jmp .LBB6_22 +.LBB6_6: # + vmovupd %ymm1, 800(%rsp) # 32-byte Spill + xorl %eax, %eax + vzeroupper + callq getTimeStamp + vmovsd %xmm0, 152(%rsp) # 8-byte Spill + movl $.L.str.1, %edi + callq likwid_markerStartRegion + cmpl $0, 20(%rbp) + jle .LBB6_13 +# %bb.7: # + vmovsd 32(%rsp), %xmm0 # 8-byte Reload + # xmm0 = mem[0],zero + vmulsd %xmm0, %xmm0, %xmm0 + vbroadcastsd %xmm0, %ymm1 + vmovupd (%rsp), %ymm0 # 32-byte Reload + vmulsd .LCPI6_0(%rip), %xmm0, %xmm0 + vbroadcastsd %xmm0, %ymm0 + xorl %r13d, %r13d + vmovupd %ymm1, 768(%rsp) # 32-byte Spill + vmovupd %ymm0, 736(%rsp) # 32-byte Spill + jmp .LBB6_8 + .p2align 4, 0x90 +.LBB6_11: # + # in Loop: Header=BB6_8 Depth=1 + vmovapd %ymm13, %ymm3 + vmovapd %ymm15, %ymm4 + vmovapd %ymm11, %ymm15 + vmovupd 96(%rsp), %ymm6 # 32-byte Reload +.LBB6_12: # + # in Loop: Header=BB6_8 Depth=1 + vhaddpd %ymm14, %ymm6, %ymm0 + vhaddpd 64(%rsp), %ymm15, %ymm1 # 32-byte Folded Reload + vhaddpd %ymm2, %ymm4, %ymm2 + vhaddpd %ymm3, %ymm12, %ymm3 + vhaddpd %ymm8, %ymm7, %ymm4 + vhaddpd %ymm10, %ymm9, %ymm5 + vperm2f128 $33, %ymm4, %ymm5, %ymm6 # ymm6 = ymm5[2,3],ymm4[0,1] + vaddpd %ymm5, %ymm6, %ymm5 + vaddpd %ymm4, %ymm6, %ymm4 + vblendpd $12, %ymm4, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm4[2,3] + vaddpd (%r10,%r11,8), %ymm4, %ymm4 + vmovapd %ymm4, (%r10,%r11,8) + vperm2f128 $33, %ymm2, %ymm3, %ymm4 # ymm4 = ymm3[2,3],ymm2[0,1] + vaddpd %ymm3, %ymm4, %ymm3 + vaddpd %ymm2, %ymm4, %ymm2 + vblendpd $12, %ymm2, %ymm3, %ymm2 # ymm2 = ymm3[0,1],ymm2[2,3] + vaddpd 32(%r10,%r11,8), %ymm2, %ymm2 + vmovapd %ymm2, 32(%r10,%r11,8) + vperm2f128 $33, %ymm0, %ymm1, %ymm2 # ymm2 = ymm1[2,3],ymm0[0,1] + vaddpd %ymm1, %ymm2, %ymm1 + vaddpd %ymm0, %ymm2, %ymm0 + vblendpd $12, %ymm0, %ymm1, %ymm0 # ymm0 = ymm1[0,1],ymm0[2,3] + vaddpd 64(%r10,%r11,8), %ymm0, %ymm0 + vmovapd %ymm0, 64(%r10,%r11,8) + vmovdqa .LCPI6_3(%rip), %xmm0 # xmm0 = <1,u> + vpinsrq $1, %r12, %xmm0, %xmm0 + vpaddq (%r14), %xmm0, %xmm0 + vmovdqu %xmm0, (%r14) + addq %r12, 16(%r14) + incq %r13 + movslq 20(%rbp), %rax + cmpq %rax, %r13 + jge .LBB6_13 +.LBB6_8: # + # =>This Loop Header: Depth=1 + # Child Loop BB6_24 Depth 2 + # Child Loop BB6_26 Depth 2 + leaq (,%r13,4), %rax + leaq (%rax,%rax,2), %r11 + movq 160(%rbp), %rdi + movq 176(%rbp), %r10 + movq 40(%r15), %r8 + movl 8(%r15), %r9d + movq 16(%r15), %rax + movslq (%rax,%r13,4), %r12 + movq 24(%r15), %rax + movl (%rax,%r13,4), %edx + vbroadcastsd (%rdi,%r11,8), %ymm3 + vbroadcastsd 8(%rdi,%r11,8), %ymm4 + vbroadcastsd 16(%rdi,%r11,8), %ymm5 + vbroadcastsd 24(%rdi,%r11,8), %ymm6 + vbroadcastsd 32(%rdi,%r11,8), %ymm7 + vbroadcastsd 40(%rdi,%r11,8), %ymm8 + vbroadcastsd 48(%rdi,%r11,8), %ymm9 + vbroadcastsd 56(%rdi,%r11,8), %ymm10 + vbroadcastsd 64(%rdi,%r11,8), %ymm11 + vbroadcastsd 
72(%rdi,%r11,8), %ymm12 + vbroadcastsd 80(%rdi,%r11,8), %ymm13 + vbroadcastsd 88(%rdi,%r11,8), %ymm14 + testl %edx, %edx + vmovupd %ymm9, 1184(%rsp) # 32-byte Spill + vmovupd %ymm10, 1152(%rsp) # 32-byte Spill + vmovupd %ymm8, 1120(%rsp) # 32-byte Spill + vmovupd %ymm4, 1088(%rsp) # 32-byte Spill + vmovupd %ymm6, 1056(%rsp) # 32-byte Spill + vmovupd %ymm11, 1024(%rsp) # 32-byte Spill + vmovupd %ymm12, 992(%rsp) # 32-byte Spill + vmovupd %ymm13, 960(%rsp) # 32-byte Spill + vmovupd %ymm14, 928(%rsp) # 32-byte Spill + vmovupd %ymm3, 896(%rsp) # 32-byte Spill + vmovupd %ymm5, 864(%rsp) # 32-byte Spill + vmovupd %ymm7, 832(%rsp) # 32-byte Spill + jle .LBB6_9 +# %bb.23: # + # in Loop: Header=BB6_8 Depth=1 + movl %r9d, %eax + imull %r13d, %eax + cltq + leaq (%r8,%rax,4), %rcx + vxorps %xmm15, %xmm15, %xmm15 + vmovups %ymm15, 384(%rsp) # 32-byte Spill + movl $0, %esi + vxorps %xmm15, %xmm15, %xmm15 + vmovups %ymm15, 256(%rsp) # 32-byte Spill + vxorps %xmm15, %xmm15, %xmm15 + vmovups %ymm15, 192(%rsp) # 32-byte Spill + vxorps %xmm15, %xmm15, %xmm15 + vmovups %ymm15, 352(%rsp) # 32-byte Spill + vxorps %xmm15, %xmm15, %xmm15 + vmovups %ymm15, 160(%rsp) # 32-byte Spill + vxorps %xmm15, %xmm15, %xmm15 + vmovups %ymm15, 64(%rsp) # 32-byte Spill + vxorps %xmm15, %xmm15, %xmm15 + vmovups %ymm15, 320(%rsp) # 32-byte Spill + vxorps %xmm15, %xmm15, %xmm15 + vmovups %ymm15, 224(%rsp) # 32-byte Spill + vxorps %xmm15, %xmm15, %xmm15 + vmovups %ymm15, 96(%rsp) # 32-byte Spill + vxorps %xmm15, %xmm15, %xmm15 + vmovups %ymm15, 288(%rsp) # 32-byte Spill + vxorps %xmm15, %xmm15, %xmm15 + vmovups %ymm15, (%rsp) # 32-byte Spill + vxorps %xmm15, %xmm15, %xmm15 + vmovups %ymm15, 32(%rsp) # 32-byte Spill + .p2align 4, 0x90 +.LBB6_24: # + # Parent Loop BB6_8 Depth=1 + # => This Inner Loop Header: Depth=2 + movl (%rcx,%rsi,4), %ebx + movslq %ebx, %rax + leaq (%rax,%rax,2), %rax + shlq $5, %rax + vmovapd (%rdi,%rax), %ymm6 + vmovapd 32(%rdi,%rax), %ymm13 + vmovapd 64(%rdi,%rax), %ymm1 + vmovupd 896(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm6, %ymm0, %ymm11 + vmovupd %ymm11, 448(%rsp) # 32-byte Spill + vmovupd 832(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm13, %ymm0, %ymm10 + vmovupd %ymm10, 576(%rsp) # 32-byte Spill + vmovupd 1024(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm1, %ymm0, %ymm9 + vmovupd %ymm9, 672(%rsp) # 32-byte Spill + vmovupd 1088(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm6, %ymm0, %ymm8 + vmovupd %ymm8, 544(%rsp) # 32-byte Spill + xorl %eax, %eax + vmovupd 1120(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm13, %ymm0, %ymm12 + vmovupd %ymm12, 640(%rsp) # 32-byte Spill + cmpq %rbx, %r13 + sete %al + leal (,%rax,4), %ebx + vmovupd 992(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm1, %ymm0, %ymm14 + vmovupd %ymm14, 704(%rsp) # 32-byte Spill + vpbroadcastd 376(%rbp,%rbx,4), %xmm0 + leal 1(,%rax,4), %ebx + vpbroadcastd 376(%rbp,%rbx,4), %xmm2 + vmovupd 1184(%rsp), %ymm3 # 32-byte Reload + vsubpd %ymm13, %ymm3, %ymm3 + vmovupd %ymm3, 416(%rsp) # 32-byte Spill + leal 2(,%rax,4), %ebx + vmovdqa .LCPI6_1(%rip), %xmm5 # xmm5 = [31,30,29,28] + vpsllvd %xmm5, %xmm0, %xmm0 + vpbroadcastd 376(%rbp,%rbx,4), %xmm4 + vpsllvd %xmm5, %xmm2, %xmm2 + vpsllvd %xmm5, %xmm4, %xmm7 + vpsrad $31, %xmm0, %xmm0 + vpmovsxdq %xmm0, %ymm0 + vpsrad $31, %xmm2, %xmm2 + vmulpd %ymm9, %ymm9, %ymm4 + vpmovsxdq %xmm2, %ymm9 + vfmadd231pd %ymm10, %ymm10, %ymm4 # ymm4 = (ymm10 * ymm10) + ymm4 + vmulpd %ymm14, %ymm14, %ymm2 + vfmadd231pd %ymm12, %ymm12, %ymm2 # ymm2 = (ymm12 * ymm12) + ymm2 + vfmadd231pd %ymm11, %ymm11, %ymm4 # ymm4 = (ymm11 * ymm11) + 
ymm4 + vfmadd231pd %ymm8, %ymm8, %ymm2 # ymm2 = (ymm8 * ymm8) + ymm2 + vmovupd 768(%rsp), %ymm3 # 32-byte Reload + vcmpltpd %ymm3, %ymm4, %ymm11 + vandpd %ymm0, %ymm11, %ymm15 + vcmpltpd %ymm3, %ymm2, %ymm0 + vandpd %ymm0, %ymm9, %ymm9 + vmovupd 960(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm1, %ymm0, %ymm0 + vmovupd %ymm0, 512(%rsp) # 32-byte Spill + vpsrad $31, %xmm7, %xmm7 + vpmovsxdq %xmm7, %ymm7 + vmulpd %ymm0, %ymm0, %ymm8 + vmovupd 416(%rsp), %ymm0 # 32-byte Reload + vfmadd231pd %ymm0, %ymm0, %ymm8 # ymm8 = (ymm0 * ymm0) + ymm8 + vmovupd 864(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm6, %ymm0, %ymm14 + vfmadd231pd %ymm14, %ymm14, %ymm8 # ymm8 = (ymm14 * ymm14) + ymm8 + vcmpltpd %ymm3, %ymm8, %ymm11 + vandpd %ymm7, %ymm11, %ymm12 + leal 3(,%rax,4), %eax + vmovupd 1056(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm6, %ymm0, %ymm11 + vpbroadcastd 376(%rbp,%rax,4), %xmm6 + vpsllvd %xmm5, %xmm6, %xmm7 + vmovupd 1152(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm13, %ymm0, %ymm5 + vmovupd %ymm5, 480(%rsp) # 32-byte Spill + vmovupd 928(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm1, %ymm0, %ymm0 + vmovupd %ymm0, 608(%rsp) # 32-byte Spill + vpsrad $31, %xmm7, %xmm7 + vpmovsxdq %xmm7, %ymm7 + vmulpd %ymm0, %ymm0, %ymm10 + vfmadd231pd %ymm5, %ymm5, %ymm10 # ymm10 = (ymm5 * ymm5) + ymm10 + vfmadd231pd %ymm11, %ymm11, %ymm10 # ymm10 = (ymm11 * ymm11) + ymm10 + vcmpltpd %ymm3, %ymm10, %ymm13 + vandpd %ymm7, %ymm13, %ymm13 + vcvtpd2ps %ymm4, %xmm4 + vrcpps %xmm4, %xmm4 + vcvtps2pd %xmm4, %ymm4 + vmovupd 800(%rsp), %ymm0 # 32-byte Reload + vmulpd %ymm0, %ymm4, %ymm7 + vmulpd %ymm4, %ymm4, %ymm5 + vmulpd %ymm7, %ymm5, %ymm5 + vcvtpd2ps %ymm2, %xmm2 + vrcpps %xmm2, %xmm2 + vcvtps2pd %xmm2, %ymm2 + vmulpd %ymm0, %ymm2, %ymm7 + vmulpd %ymm2, %ymm2, %ymm3 + vmulpd %ymm7, %ymm3, %ymm3 + vcvtpd2ps %ymm8, %xmm7 + vrcpps %xmm7, %xmm7 + vcvtps2pd %xmm7, %ymm7 + vmulpd %ymm0, %ymm7, %ymm8 + vmulpd %ymm7, %ymm7, %ymm1 + vmulpd %ymm1, %ymm8, %ymm1 + vcvtpd2ps %ymm10, %xmm6 + vrcpps %xmm6, %xmm6 + vcvtps2pd %xmm6, %ymm6 + vmulpd %ymm0, %ymm6, %ymm8 + vmulpd %ymm6, %ymm6, %ymm10 + vmulpd %ymm8, %ymm10, %ymm8 + vmovupd 736(%rsp), %ymm0 # 32-byte Reload + vmulpd %ymm4, %ymm0, %ymm4 + vmulpd %ymm5, %ymm4, %ymm4 + vbroadcastsd .LCPI6_2(%rip), %ymm10 # ymm10 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] + vaddpd %ymm5, %ymm10, %ymm5 + vmulpd %ymm5, %ymm4, %ymm4 + vmulpd %ymm2, %ymm0, %ymm2 + vmulpd %ymm3, %ymm2, %ymm2 + vaddpd %ymm3, %ymm10, %ymm3 + vmulpd %ymm3, %ymm2, %ymm3 + vmulpd %ymm7, %ymm0, %ymm2 + vmulpd %ymm1, %ymm2, %ymm2 + vaddpd %ymm1, %ymm10, %ymm1 + vmulpd %ymm1, %ymm2, %ymm1 + vmulpd %ymm6, %ymm0, %ymm2 + vaddpd %ymm10, %ymm8, %ymm5 + vmulpd %ymm2, %ymm8, %ymm2 + vmulpd %ymm5, %ymm2, %ymm2 + vmulpd 448(%rsp), %ymm4, %ymm5 # 32-byte Folded Reload + vandpd %ymm5, %ymm15, %ymm5 + vmovupd 384(%rsp), %ymm6 # 32-byte Reload + vaddpd %ymm5, %ymm6, %ymm6 + vmovupd %ymm6, 384(%rsp) # 32-byte Spill + vmulpd 576(%rsp), %ymm4, %ymm5 # 32-byte Folded Reload + vandpd %ymm5, %ymm15, %ymm5 + vmovupd 256(%rsp), %ymm6 # 32-byte Reload + vaddpd %ymm5, %ymm6, %ymm6 + vmovupd %ymm6, 256(%rsp) # 32-byte Spill + vmulpd 672(%rsp), %ymm4, %ymm4 # 32-byte Folded Reload + vandpd %ymm4, %ymm15, %ymm4 + vmovupd 192(%rsp), %ymm5 # 32-byte Reload + vaddpd %ymm4, %ymm5, %ymm5 + vmovupd %ymm5, 192(%rsp) # 32-byte Spill + vmulpd 544(%rsp), %ymm3, %ymm4 # 32-byte Folded Reload + vandpd %ymm4, %ymm9, %ymm4 + vmovupd 352(%rsp), %ymm5 # 32-byte Reload + vaddpd %ymm4, %ymm5, %ymm5 + vmovupd %ymm5, 352(%rsp) # 32-byte Spill + vmulpd 640(%rsp), %ymm3, %ymm4 # 
32-byte Folded Reload + vandpd %ymm4, %ymm9, %ymm4 + vmovupd 160(%rsp), %ymm5 # 32-byte Reload + vaddpd %ymm4, %ymm5, %ymm5 + vmovupd %ymm5, 160(%rsp) # 32-byte Spill + vmulpd 704(%rsp), %ymm3, %ymm3 # 32-byte Folded Reload + vandpd %ymm3, %ymm9, %ymm0 + vmovupd 64(%rsp), %ymm3 # 32-byte Reload + vaddpd %ymm0, %ymm3, %ymm3 + vmovupd %ymm3, 64(%rsp) # 32-byte Spill + vmulpd %ymm1, %ymm14, %ymm0 + vandpd %ymm0, %ymm12, %ymm0 + vmovupd 320(%rsp), %ymm3 # 32-byte Reload + vaddpd %ymm0, %ymm3, %ymm3 + vmovupd %ymm3, 320(%rsp) # 32-byte Spill + vmulpd 416(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload + vmulpd 512(%rsp), %ymm1, %ymm1 # 32-byte Folded Reload + vandpd %ymm0, %ymm12, %ymm0 + vandpd %ymm1, %ymm12, %ymm1 + vmovupd 224(%rsp), %ymm3 # 32-byte Reload + vaddpd %ymm0, %ymm3, %ymm3 + vmovupd %ymm3, 224(%rsp) # 32-byte Spill + vmovupd 96(%rsp), %ymm0 # 32-byte Reload + vaddpd %ymm1, %ymm0, %ymm0 + vmovupd %ymm0, 96(%rsp) # 32-byte Spill + vmulpd %ymm2, %ymm11, %ymm0 + vandpd %ymm0, %ymm13, %ymm0 + vmovupd 288(%rsp), %ymm1 # 32-byte Reload + vaddpd %ymm0, %ymm1, %ymm1 + vmovupd %ymm1, 288(%rsp) # 32-byte Spill + vmulpd 480(%rsp), %ymm2, %ymm0 # 32-byte Folded Reload + vmulpd 608(%rsp), %ymm2, %ymm1 # 32-byte Folded Reload + vandpd %ymm0, %ymm13, %ymm0 + vmovupd (%rsp), %ymm2 # 32-byte Reload + vaddpd %ymm0, %ymm2, %ymm2 + vmovupd %ymm2, (%rsp) # 32-byte Spill + vandpd %ymm1, %ymm13, %ymm0 + vmovupd 32(%rsp), %ymm1 # 32-byte Reload + vaddpd %ymm0, %ymm1, %ymm1 + vmovupd %ymm1, 32(%rsp) # 32-byte Spill + incq %rsi + cmpq %rsi, %rdx + jne .LBB6_24 + jmp .LBB6_10 + .p2align 4, 0x90 +.LBB6_9: # + # in Loop: Header=BB6_8 Depth=1 + vxorpd %xmm2, %xmm2, %xmm2 + vmovupd %ymm2, 32(%rsp) # 32-byte Spill + vxorpd %xmm2, %xmm2, %xmm2 + vmovupd %ymm2, (%rsp) # 32-byte Spill + vxorpd %xmm2, %xmm2, %xmm2 + vmovupd %ymm2, 288(%rsp) # 32-byte Spill + vxorpd %xmm2, %xmm2, %xmm2 + vmovupd %ymm2, 96(%rsp) # 32-byte Spill + vxorpd %xmm2, %xmm2, %xmm2 + vmovupd %ymm2, 224(%rsp) # 32-byte Spill + vxorpd %xmm2, %xmm2, %xmm2 + vmovupd %ymm2, 320(%rsp) # 32-byte Spill + vxorpd %xmm2, %xmm2, %xmm2 + vmovupd %ymm2, 64(%rsp) # 32-byte Spill + vxorpd %xmm2, %xmm2, %xmm2 + vmovupd %ymm2, 160(%rsp) # 32-byte Spill + vxorpd %xmm2, %xmm2, %xmm2 + vmovupd %ymm2, 352(%rsp) # 32-byte Spill + vxorpd %xmm2, %xmm2, %xmm2 + vmovupd %ymm2, 192(%rsp) # 32-byte Spill + vxorpd %xmm2, %xmm2, %xmm2 + vmovupd %ymm2, 256(%rsp) # 32-byte Spill + vxorpd %xmm2, %xmm2, %xmm2 + vmovupd %ymm2, 384(%rsp) # 32-byte Spill +.LBB6_10: # + # in Loop: Header=BB6_8 Depth=1 + vmovupd 32(%rsp), %ymm14 # 32-byte Reload + vmovupd 64(%rsp), %ymm5 # 32-byte Reload + vmovupd 192(%rsp), %ymm11 # 32-byte Reload + vmovupd (%rsp), %ymm2 # 32-byte Reload + vmovupd 224(%rsp), %ymm15 # 32-byte Reload + vmovupd 160(%rsp), %ymm13 # 32-byte Reload + vmovupd 256(%rsp), %ymm12 # 32-byte Reload + vmovupd 288(%rsp), %ymm8 # 32-byte Reload + vmovupd 320(%rsp), %ymm7 # 32-byte Reload + vmovupd 352(%rsp), %ymm10 # 32-byte Reload + vmovupd 384(%rsp), %ymm9 # 32-byte Reload + cmpl %r12d, %edx + vmovupd %ymm5, 64(%rsp) # 32-byte Spill + jge .LBB6_11 +# %bb.25: # + # in Loop: Header=BB6_8 Depth=1 + movslq %edx, %rcx + imull %r13d, %r9d + movslq %r9d, %rax + leaq (%r8,%rax,4), %rdx + vmovapd %ymm13, %ymm3 + vmovapd %ymm15, %ymm4 + vmovapd %ymm11, %ymm15 + vmovups 96(%rsp), %ymm6 # 32-byte Reload + .p2align 4, 0x90 +movl $111, %ebx # OSACA START MARKER +.byte 100 # OSACA START MARKER +.byte 103 # OSACA START MARKER +.byte 144 # OSACA START MARKER +# pointer_increment=64 
da67166e5736661e6b03ea29ee7bfd67 +# LLVM-MCA-BEGIN +.LBB6_26: # + # Parent Loop BB6_8 Depth=1 + # => This Inner Loop Header: Depth=2 + vmovups %ymm6, 96(%rsp) # 32-byte Spill + vmovupd %ymm14, 32(%rsp) # 32-byte Spill + vmovupd %ymm15, 192(%rsp) # 32-byte Spill + vmovupd %ymm2, (%rsp) # 32-byte Spill + vmovupd %ymm4, 224(%rsp) # 32-byte Spill + vmovupd %ymm12, 256(%rsp) # 32-byte Spill + vmovupd %ymm8, 288(%rsp) # 32-byte Spill + vmovupd %ymm7, 320(%rsp) # 32-byte Spill + vmovupd %ymm10, 352(%rsp) # 32-byte Spill + vmovupd %ymm9, 384(%rsp) # 32-byte Spill + movslq (%rdx,%rcx,4), %rax + leaq (%rax,%rax,2), %rax + shlq $5, %rax + vmovapd (%rdi,%rax), %ymm5 + vmovapd 32(%rdi,%rax), %ymm12 + vmovapd 64(%rdi,%rax), %ymm14 + vmovupd 896(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm5, %ymm0, %ymm11 + vmovupd %ymm11, 640(%rsp) # 32-byte Spill + vmovupd 832(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm12, %ymm0, %ymm10 + vmovupd %ymm10, 672(%rsp) # 32-byte Spill + vmovupd 1024(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm14, %ymm0, %ymm1 + vmovupd %ymm1, 160(%rsp) # 32-byte Spill + vmovupd 1088(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm5, %ymm0, %ymm8 + vmovupd %ymm8, 448(%rsp) # 32-byte Spill + vmovupd 1120(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm12, %ymm0, %ymm4 + vmovupd %ymm4, 704(%rsp) # 32-byte Spill + vmovupd 992(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm14, %ymm0, %ymm2 + vmovupd %ymm2, 416(%rsp) # 32-byte Spill + vmovupd 1184(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm12, %ymm0, %ymm7 + vmovupd %ymm7, 576(%rsp) # 32-byte Spill + vmovupd 960(%rsp), %ymm0 # 32-byte Reload + vsubpd %ymm14, %ymm0, %ymm6 + vmovupd %ymm6, 608(%rsp) # 32-byte Spill + vmulpd %ymm1, %ymm1, %ymm9 + vfmadd231pd %ymm10, %ymm10, %ymm9 # ymm9 = (ymm10 * ymm10) + ymm9 + vfmadd231pd %ymm11, %ymm11, %ymm9 # ymm9 = (ymm11 * ymm11) + ymm9 + vmulpd %ymm2, %ymm2, %ymm1 + vfmadd231pd %ymm4, %ymm4, %ymm1 # ymm1 = (ymm4 * ymm4) + ymm1 + vcvtpd2ps %ymm9, %xmm0 + vrcpps %xmm0, %xmm0 + vcvtps2pd %xmm0, %ymm0 + vmovupd 800(%rsp), %ymm11 # 32-byte Reload + vmulpd %ymm0, %ymm11, %ymm2 + vmovapd %ymm3, %ymm4 + vmulpd %ymm0, %ymm0, %ymm3 + vmulpd %ymm2, %ymm3, %ymm3 + vmulpd %ymm6, %ymm6, %ymm2 + vfmadd231pd %ymm8, %ymm8, %ymm1 # ymm1 = (ymm8 * ymm8) + ymm1 + vfmadd231pd %ymm7, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm7) + ymm2 + vmovupd 864(%rsp), %ymm6 # 32-byte Reload + vsubpd %ymm5, %ymm6, %ymm6 + vmovupd %ymm6, 480(%rsp) # 32-byte Spill + vcvtpd2ps %ymm1, %xmm7 + vfmadd231pd %ymm6, %ymm6, %ymm2 # ymm2 = (ymm6 * ymm6) + ymm2 + vrcpps %xmm7, %xmm7 + vcvtps2pd %xmm7, %ymm7 + vcvtpd2ps %ymm2, %xmm6 + vmulpd %ymm7, %ymm11, %ymm10 + vmulpd %ymm7, %ymm7, %ymm15 + vmulpd %ymm10, %ymm15, %ymm10 + vrcpps %xmm6, %xmm6 + vcvtps2pd %xmm6, %ymm6 + vmulpd %ymm6, %ymm11, %ymm15 + vmulpd %ymm6, %ymm6, %ymm13 + vmulpd %ymm15, %ymm13, %ymm13 + vmovupd 1152(%rsp), %ymm8 # 32-byte Reload + vsubpd %ymm12, %ymm8, %ymm12 + vmovupd %ymm12, 512(%rsp) # 32-byte Spill + vmovupd 928(%rsp), %ymm8 # 32-byte Reload + vsubpd %ymm14, %ymm8, %ymm8 + vmovupd %ymm8, 544(%rsp) # 32-byte Spill + vmulpd %ymm8, %ymm8, %ymm14 + vfmadd231pd %ymm12, %ymm12, %ymm14 # ymm14 = (ymm12 * ymm12) + ymm14 + vmovupd 1056(%rsp), %ymm8 # 32-byte Reload + vsubpd %ymm5, %ymm8, %ymm5 + vmovupd %ymm5, 1216(%rsp) # 32-byte Spill + vfmadd231pd %ymm5, %ymm5, %ymm14 # ymm14 = (ymm5 * ymm5) + ymm14 + vcvtpd2ps %ymm14, %xmm8 + vrcpps %xmm8, %xmm5 + vcvtps2pd %xmm5, %ymm5 + vmulpd %ymm5, %ymm11, %ymm8 + vmulpd %ymm5, %ymm5, %ymm12 + vmulpd %ymm8, %ymm12, %ymm8 + vmovupd 736(%rsp), %ymm11 # 32-byte Reload + 
vmulpd %ymm0, %ymm11, %ymm0 + vmulpd %ymm3, %ymm0, %ymm0 + vbroadcastsd .LCPI6_2(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] + vaddpd %ymm3, %ymm12, %ymm3 + vmulpd %ymm3, %ymm0, %ymm3 + vmulpd %ymm7, %ymm11, %ymm0 + vmulpd %ymm0, %ymm10, %ymm0 + vaddpd %ymm12, %ymm10, %ymm7 + vmulpd %ymm7, %ymm0, %ymm7 + vmulpd %ymm6, %ymm11, %ymm0 + vmulpd %ymm0, %ymm13, %ymm0 + vaddpd %ymm12, %ymm13, %ymm6 + vmovupd 192(%rsp), %ymm15 # 32-byte Reload + vmulpd %ymm6, %ymm0, %ymm6 + vmulpd %ymm5, %ymm11, %ymm0 + vaddpd %ymm12, %ymm8, %ymm5 + vmovupd 256(%rsp), %ymm12 # 32-byte Reload + vmovupd 352(%rsp), %ymm10 # 32-byte Reload + vmulpd %ymm0, %ymm8, %ymm0 + vmulpd %ymm5, %ymm0, %ymm0 + vmovupd 768(%rsp), %ymm13 # 32-byte Reload + vcmpltpd %ymm13, %ymm9, %ymm5 + vmovupd 384(%rsp), %ymm9 # 32-byte Reload + vmulpd 640(%rsp), %ymm3, %ymm8 # 32-byte Folded Reload + vandpd %ymm5, %ymm8, %ymm8 + vaddpd %ymm8, %ymm9, %ymm9 + vmulpd 672(%rsp), %ymm3, %ymm8 # 32-byte Folded Reload + vandpd %ymm5, %ymm8, %ymm8 + vaddpd %ymm8, %ymm12, %ymm12 + vmulpd 160(%rsp), %ymm3, %ymm3 # 32-byte Folded Reload + vandpd %ymm5, %ymm3, %ymm3 + vaddpd %ymm3, %ymm15, %ymm15 + vcmpltpd %ymm13, %ymm1, %ymm1 + vmulpd 448(%rsp), %ymm7, %ymm3 # 32-byte Folded Reload + vandpd %ymm1, %ymm3, %ymm3 + vaddpd %ymm3, %ymm10, %ymm10 + vmulpd 704(%rsp), %ymm7, %ymm3 # 32-byte Folded Reload + vmulpd 416(%rsp), %ymm7, %ymm5 # 32-byte Folded Reload + vandpd %ymm1, %ymm3, %ymm3 + vaddpd %ymm3, %ymm4, %ymm3 + vmovupd 224(%rsp), %ymm4 # 32-byte Reload + vandpd %ymm1, %ymm5, %ymm1 + vmovupd 64(%rsp), %ymm5 # 32-byte Reload + vaddpd %ymm1, %ymm5, %ymm5 + vmovupd %ymm5, 64(%rsp) # 32-byte Spill + vcmpltpd %ymm13, %ymm2, %ymm1 + vmulpd 480(%rsp), %ymm6, %ymm2 # 32-byte Folded Reload + vmovupd 288(%rsp), %ymm8 # 32-byte Reload + vmovupd 320(%rsp), %ymm7 # 32-byte Reload + vandpd %ymm1, %ymm2, %ymm2 + vaddpd %ymm2, %ymm7, %ymm7 + vmulpd 576(%rsp), %ymm6, %ymm2 # 32-byte Folded Reload + vandpd %ymm1, %ymm2, %ymm2 + vaddpd %ymm2, %ymm4, %ymm4 + vmulpd 608(%rsp), %ymm6, %ymm2 # 32-byte Folded Reload + vmovupd 96(%rsp), %ymm6 # 32-byte Reload + vandpd %ymm1, %ymm2, %ymm1 + vaddpd %ymm1, %ymm6, %ymm6 + vcmpltpd %ymm13, %ymm14, %ymm1 + vmulpd 1216(%rsp), %ymm0, %ymm2 # 32-byte Folded Reload + vandpd %ymm1, %ymm2, %ymm2 + vaddpd %ymm2, %ymm8, %ymm8 + vmulpd 512(%rsp), %ymm0, %ymm2 # 32-byte Folded Reload + vmulpd 544(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload + vandpd %ymm1, %ymm2, %ymm2 + vmovupd (%rsp), %ymm5 # 32-byte Reload + vaddpd %ymm2, %ymm5, %ymm5 + vmovupd %ymm5, (%rsp) # 32-byte Spill + vmovupd (%rsp), %ymm2 # 32-byte Reload + vandpd %ymm1, %ymm0, %ymm0 + vmovupd 32(%rsp), %ymm1 # 32-byte Reload + vaddpd %ymm0, %ymm1, %ymm1 + vmovupd %ymm1, 32(%rsp) # 32-byte Spill + vmovupd 32(%rsp), %ymm14 # 32-byte Reload + incq %rcx + cmpq %rcx, %r12 + jne .LBB6_26 + jmp .LBB6_12 +# LLVM-MCA-END +movl $222, %ebx # OSACA END MARKER +.byte 100 # OSACA END MARKER +.byte 103 # OSACA END MARKER +.byte 144 # OSACA END MARKER +.LBB6_13: # + movl $.L.str.1, %edi + vzeroupper + callq likwid_markerStopRegion + xorl %eax, %eax + callq getTimeStamp + vmovsd %xmm0, (%rsp) # 8-byte Spill + movl $.L.str.6, %edi + xorl %eax, %eax + callq debug_printf + vmovsd (%rsp), %xmm0 # 8-byte Reload + # xmm0 = mem[0],zero + vsubsd 152(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload + addq $1256, %rsp # imm = 0x4E8 + .cfi_def_cfa_offset 56 + popq %rbx + .cfi_def_cfa_offset 48 + popq %r12 + .cfi_def_cfa_offset 40 + popq %r13 + .cfi_def_cfa_offset 32 + popq %r14 + .cfi_def_cfa_offset 24 + 
popq	%r15
+	.cfi_def_cfa_offset 16
+	popq	%rbp
+	.cfi_def_cfa_offset 8
+	retq
+.Lfunc_end6:
+	.size	computeForceLJ_4xn_full, .Lfunc_end6-computeForceLJ_4xn_full
+	.cfi_endproc
+                                        # -- End function
+	.globl	computeForceLJ_4xn              # -- Begin function computeForceLJ_4xn
+	.p2align	4, 0x90
+	.type	computeForceLJ_4xn,@function
+computeForceLJ_4xn:                     #
+.LcomputeForceLJ_4xn$local:
+	.cfi_startproc
+# %bb.0:                                #
+	cmpl	$0, 32(%rdx)
+	je	.LBB7_2
+# %bb.1:                                #
+	jmp	.LcomputeForceLJ_4xn_half$local # TAILCALL
+.LBB7_2:                                #
+	jmp	.LcomputeForceLJ_4xn_full$local # TAILCALL
+.Lfunc_end7:
+	.size	computeForceLJ_4xn, .Lfunc_end7-computeForceLJ_4xn
+	.cfi_endproc
+                                        # -- End function
+	.type	.L.str,@object                  #
+	.section	.rodata.str1.1,"aMS",@progbits,1
+.L.str:
+	.asciz	"computeForceLJ begin\n"
+	.size	.L.str, 22
+
+	.type	.L.str.1,@object                #
+.L.str.1:
+	.asciz	"force"
+	.size	.L.str.1, 6
+
+	.type	.L.str.2,@object                #
+.L.str.2:
+	.asciz	"computeForceLJ end\n"
+	.size	.L.str.2, 20
+
+	.type	.L.str.3,@object                #
+.L.str.3:
+	.asciz	"computeForceLJ_2xnn begin\n"
+	.size	.L.str.3, 27
+
+	.type	.L.str.4,@object                #
+.L.str.4:
+	.asciz	"computeForceLJ_2xnn end\n"
+	.size	.L.str.4, 25
+
+	.type	.L.str.5,@object                #
+.L.str.5:
+	.asciz	"computeForceLJ_4xn begin\n"
+	.size	.L.str.5, 26
+
+	.type	.L.str.6,@object                #
+.L.str.6:
+	.asciz	"computeForceLJ_4xn end\n"
+	.size	.L.str.6, 24
+
+	.type	.L.str.7,@object                #
+.L.str.7:
+	.asciz	"simd_load_h_dual(): Not implemented for AVX2 with double precision!"
+	.size	.L.str.7, 68
+
+	.ident	"Intel(R) oneAPI DPC++ Compiler 2021.1-beta05 (2020.2.0.0304)"
+	.section	".note.GNU-stack","",@progbits

From b6982d56f59dcf26ec87346ddfeb983c1b65fcfb Mon Sep 17 00:00:00 2001
From: Rafael Ravedutti
Date: Sun, 9 Apr 2023 01:19:12 +0200
Subject: [PATCH 3/3] Fix atom sorting

Signed-off-by: Rafael Ravedutti
---
 Makefile           |  4 ++++
 common/parameter.c |  5 +++++
 lammps/main.c      |  9 ++++++++-
 lammps/neighbor.c  | 26 +++++++++++++-------------
 4 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/Makefile b/Makefile
index a4d42b1..defffb6 100644
--- a/Makefile
+++ b/Makefile
@@ -30,6 +30,10 @@ ifneq ($(ASM_SYNTAX), ATT)
     ASFLAGS += -masm=intel
 endif
 
+ifeq ($(strip $(SORT_ATOMS)),true)
+    DEFINES += -DSORT_ATOMS
+endif
+
 ifeq ($(strip $(EXPLICIT_TYPES)),true)
     DEFINES += -DEXPLICIT_TYPES
 endif
diff --git a/common/parameter.c b/common/parameter.c
index 0ac5468..1d6d593 100644
--- a/common/parameter.c
+++ b/common/parameter.c
@@ -169,6 +169,11 @@ void printParameter(Parameter *param) {
     printf("\tNumber of timesteps: %d\n", param->ntimes);
     printf("\tReport stats every (timesteps): %d\n", param->nstat);
     printf("\tReneighbor every (timesteps): %d\n", param->reneigh_every);
+    #ifdef SORT_ATOMS
+    printf("\tSort atoms when reneighboring: yes\n");
+    #else
+    printf("\tSort atoms when reneighboring: no\n");
+    #endif
     printf("\tPrune every (timesteps): %d\n", param->prune_every);
     printf("\tOutput positions every (timesteps): %d\n", param->x_out_every);
     printf("\tOutput velocities every (timesteps): %d\n", param->v_out_every);
diff --git a/lammps/main.c b/lammps/main.c
index 1cc5c46..713c86e 100644
--- a/lammps/main.c
+++ b/lammps/main.c
@@ -63,6 +63,10 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *
     setupNeighbor(param);
     setupThermo(param, atom->Natoms);
     if(param->input_file == NULL) { adjustThermo(param, atom); }
+    #ifdef SORT_ATOMS
+    atom->Nghost = 0;
+    sortAtom(atom);
+    #endif
     setupPbc(atom, param);
     initDevice(atom, neighbor);
     updatePbc(atom, param, true);
@@ -76,9 +80,12 @@ double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
     S = getTimeStamp();
     LIKWID_MARKER_START("reneighbour");
     updateAtomsPbc(atom, param);
+    #ifdef SORT_ATOMS
+    atom->Nghost = 0;
+    sortAtom(atom);
+    #endif
     setupPbc(atom, param);
     updatePbc(atom, param, true);
-    //sortAtom(atom);
     buildNeighbor(atom, neighbor);
     LIKWID_MARKER_STOP("reneighbour");
     E = getTimeStamp();
diff --git a/lammps/neighbor.c b/lammps/neighbor.c
index 1c88096..c06f678 100644
--- a/lammps/neighbor.c
+++ b/lammps/neighbor.c
@@ -326,45 +326,45 @@ void sortAtom(Atom* atom) {
     int Nmax = atom->Nmax;
     int* binpos = bincount;
 
-    for(int i=1; i<mbins; i++) { binpos[i] += binpos[i-1]; }
+    for(int i = 1; i < mbins; i++) { binpos[i] += binpos[i - 1]; }
 
-#ifdef AOS
+    #ifdef AOS
     MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
     MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
-#else
+    #else
     MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
     MD_FLOAT* new_y = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
    MD_FLOAT* new_z = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
     MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
     MD_FLOAT* new_vy = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
     MD_FLOAT* new_vz = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
-#endif
+    #endif
     MD_FLOAT* old_x = atom->x; MD_FLOAT* old_y = atom->y; MD_FLOAT* old_z = atom->z;
     MD_FLOAT* old_vx = atom->vx; MD_FLOAT* old_vy = atom->vy; MD_FLOAT* old_vz = atom->vz;
 
-    for(int mybin = 0; mybin<mbins; mybin++) {
-        int start = mybin>0?binpos[mybin-1]:0;
+    for(int mybin = 0; mybin < mbins; mybin++) {
+        int start = mybin > 0 ? binpos[mybin - 1] : 0;
         int count = binpos[mybin] - start;
-        for(int k=0; k<count; k++) {
+        for(int k = 0; k < count; k++) {
             int new_i = start + k;
             int old_i = bins[mybin * atoms_per_bin + k];
-#ifdef AOS
+            #ifdef AOS
             new_x[new_i * 3 + 0] = old_x[old_i * 3 + 0];
             new_x[new_i * 3 + 1] = old_x[old_i * 3 + 1];
             new_x[new_i * 3 + 2] = old_x[old_i * 3 + 2];
             new_vx[new_i * 3 + 0] = old_vx[old_i * 3 + 0];
             new_vx[new_i * 3 + 1] = old_vx[old_i * 3 + 1];
             new_vx[new_i * 3 + 2] = old_vx[old_i * 3 + 2];
-#else
+            #else
             new_x[new_i] = old_x[old_i];
             new_y[new_i] = old_y[old_i];
             new_z[new_i] = old_z[old_i];
             new_vx[new_i] = old_vx[old_i];
             new_vy[new_i] = old_vy[old_i];
             new_vz[new_i] = old_vz[old_i];
-#endif
+            #endif
         }
     }
 
     free(atom->x);
     free(atom->vx);
     atom->x = new_x;
     atom->vx = new_vx;
-#ifndef AOS
+    #ifndef AOS
     free(atom->y);
     free(atom->z);
     free(atom->vy);
@@ -381,5 +381,5 @@ void sortAtom(Atom* atom) {
     atom->z = new_z;
     atom->vy = new_vy;
     atom->vz = new_vz;
-#endif
+    #endif
 }
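
Reviewer note on the sortAtom() hunk above: the function is a counting sort over
spatial bins. The first loop turns the per-bin atom counts in bincount (aliased by
binpos) into an in-place inclusive prefix sum, so binpos[b] is the end offset of bin
b in the sorted arrays, and bin b owns the index range [start, binpos[b]) with
start = binpos[b - 1] (or 0 for the first bin). Below is a minimal standalone C
sketch of just this index arithmetic; the bin counts are made up and the names are
illustrative only, it is not part of the patch or the build:

    #include <stdio.h>

    int main(void) {
        enum { MBINS = 4 };
        int bincount[MBINS] = { 3, 1, 0, 2 };  /* atoms counted per spatial bin (made up) */
        int* binpos = bincount;                /* prefix sum done in place, as in sortAtom() */

        /* Inclusive prefix sum: binpos[i] becomes the end offset of bin i. */
        for(int i = 1; i < MBINS; i++) { binpos[i] += binpos[i - 1]; }

        /* Each bin owns the contiguous slice [start, binpos[mybin]) of the
         * sorted position/velocity arrays. */
        for(int mybin = 0; mybin < MBINS; mybin++) {
            int start = mybin > 0 ? binpos[mybin - 1] : 0;
            int count = binpos[mybin] - start;
            printf("bin %d: %d atom(s) -> sorted indices [%d, %d)\n",
                   mybin, count, start, binpos[mybin]);
        }

        return 0;
    }

The copy loop in sortAtom() then writes atom k of bin mybin to index start + k, so
atoms that are spatially close end up contiguous in memory before the neighbor lists
are rebuilt. The lammps/main.c hunks reset atom->Nghost to zero and sort before
setupPbc() runs, presumably so that the ghost atoms are regenerated from the
already-sorted local atoms rather than being shuffled by the sort themselves.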