Merge branch 'master' of github.com:RRZE-HPC/MD-Bench

This commit is contained in:
Rafael Ravedutti 2023-04-09 01:23:45 +02:00
commit e206c3566d
8 changed files with 2972 additions and 29 deletions

View File

@ -30,6 +30,10 @@ ifneq ($(ASM_SYNTAX), ATT)
ASFLAGS += -masm=intel ASFLAGS += -masm=intel
endif endif
ifeq ($(strip $(SORT_ATOMS)),true)
DEFINES += -DSORT_ATOMS
endif
ifeq ($(strip $(EXPLICIT_TYPES)),true) ifeq ($(strip $(EXPLICIT_TYPES)),true)
DEFINES += -DEXPLICIT_TYPES DEFINES += -DEXPLICIT_TYPES
endif endif

View File

@ -48,11 +48,13 @@ static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_S
t2 = _mm256_permute2f128_pd(t0, t1, 0x21); t2 = _mm256_permute2f128_pd(t0, t1, 0x21);
t0 = _mm256_add_pd(t0, t2); t0 = _mm256_add_pd(t0, t2);
t1 = _mm256_add_pd(t1, t2); t1 = _mm256_add_pd(t1, t2);
t0 = _mm256_blend_pd(t0, t1, 0b1100); t0 = _mm256_blend_pd(t0, t1, 0xC);
//t0 = _mm256_blend_pd(t0, t1, 0b1100);
t1 = _mm256_add_pd(t0, _mm256_load_pd(m)); t1 = _mm256_add_pd(t0, _mm256_load_pd(m));
_mm256_store_pd(m, t1); _mm256_store_pd(m, t1);
t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101)); t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0x5));
//t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101));
a0 = _mm256_castpd256_pd128(t0); a0 = _mm256_castpd256_pd128(t0);
a1 = _mm256_extractf128_pd(t0, 0x1); a1 = _mm256_extractf128_pd(t0, 0x1);
a0 = _mm_add_sd(a0, a1); a0 = _mm_add_sd(a0, a1);
@ -91,7 +93,7 @@ static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1,
} }
// Functions used in LAMMPS kernel // Functions used in LAMMPS kernel
static inline MD_SIMD_FLOAT simd_gather(MD_SIMD_INT vidx, const MD_FLOAT *m, int s) { return _mm256_i32gather_pd(m, vidx, s); } #define simd_gather(vidx, m, s) _mm256_i32gather_pd(m, vidx, s);
static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm_set1_epi32(scalar); } static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm_set1_epi32(scalar); }
static inline MD_SIMD_INT simd_int_zero() { return _mm_setzero_si128(); } static inline MD_SIMD_INT simd_int_zero() { return _mm_setzero_si128(); }
static inline MD_SIMD_INT simd_int_seq() { return _mm_set_epi32(3, 2, 1, 0); } static inline MD_SIMD_INT simd_int_seq() { return _mm_set_epi32(3, 2, 1, 0); }

View File

@ -169,6 +169,11 @@ void printParameter(Parameter *param) {
printf("\tNumber of timesteps: %d\n", param->ntimes); printf("\tNumber of timesteps: %d\n", param->ntimes);
printf("\tReport stats every (timesteps): %d\n", param->nstat); printf("\tReport stats every (timesteps): %d\n", param->nstat);
printf("\tReneighbor every (timesteps): %d\n", param->reneigh_every); printf("\tReneighbor every (timesteps): %d\n", param->reneigh_every);
#ifdef SORT_ATOMS
printf("\tSort atoms when reneighboring: yes\n");
#else
printf("\tSort atoms when reneighboring: no\n");
#endif
printf("\tPrune every (timesteps): %d\n", param->prune_every); printf("\tPrune every (timesteps): %d\n", param->prune_every);
printf("\tOutput positions every (timesteps): %d\n", param->x_out_every); printf("\tOutput positions every (timesteps): %d\n", param->x_out_every);
printf("\tOutput velocities every (timesteps): %d\n", param->v_out_every); printf("\tOutput velocities every (timesteps): %d\n", param->v_out_every);

View File

@ -58,6 +58,7 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) {
neighbor->numneigh = NULL; neighbor->numneigh = NULL;
neighbor->numneigh_masked = NULL; neighbor->numneigh_masked = NULL;
neighbor->neighbors = NULL; neighbor->neighbors = NULL;
neighbor->neighbors_imask = NULL;
} }
void setupNeighbor(Parameter *param, Atom *atom) { void setupNeighbor(Parameter *param, Atom *atom) {
@ -229,7 +230,9 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
if(atom->Nclusters_local > nmax) { if(atom->Nclusters_local > nmax) {
nmax = atom->Nclusters_local; nmax = atom->Nclusters_local;
if(neighbor->numneigh) free(neighbor->numneigh); if(neighbor->numneigh) free(neighbor->numneigh);
if(neighbor->numneigh_masked) free(neighbor->numneigh_masked);
if(neighbor->neighbors) free(neighbor->neighbors); if(neighbor->neighbors) free(neighbor->neighbors);
if(neighbor->neighbors_imask) free(neighbor->neighbors_imask);
neighbor->numneigh = (int*) malloc(nmax * sizeof(int)); neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
neighbor->numneigh_masked = (int*) malloc(nmax * sizeof(int)); neighbor->numneigh_masked = (int*) malloc(nmax * sizeof(int));
neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int)); neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int));
@ -326,6 +329,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
imask = get_imask_simd_4xn(1, ci, cj); imask = get_imask_simd_4xn(1, ci, cj);
#endif #endif
if(n < neighbor->maxneighs) {
if(imask == NBNXN_INTERACTION_MASK_ALL) { if(imask == NBNXN_INTERACTION_MASK_ALL) {
neighptr[n] = cj; neighptr[n] = cj;
neighptr_imask[n] = imask; neighptr_imask[n] = imask;
@ -336,6 +340,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
neighptr_imask[nmasked] = imask; neighptr_imask[nmasked] = imask;
nmasked++; nmasked++;
} }
}
n++; n++;
} }
@ -377,11 +382,12 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
} }
if(resize) { if(resize) {
fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
neighbor->maxneighs = new_maxneighs * 1.2; neighbor->maxneighs = new_maxneighs * 1.2;
fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
free(neighbor->neighbors); free(neighbor->neighbors);
neighbor->neighbors = (int *) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int)); free(neighbor->neighbors_imask);
neighbor->neighbors_imask = (unsigned int *) malloc(atom->Nmax * neighbor->maxneighs * sizeof(unsigned int)); neighbor->neighbors = (int *) malloc(nmax * neighbor->maxneighs * sizeof(int));
neighbor->neighbors_imask = (unsigned int *) malloc(nmax * neighbor->maxneighs * sizeof(unsigned int));
} }
} }

View File

@ -63,6 +63,10 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *
setupNeighbor(param); setupNeighbor(param);
setupThermo(param, atom->Natoms); setupThermo(param, atom->Natoms);
if(param->input_file == NULL) { adjustThermo(param, atom); } if(param->input_file == NULL) { adjustThermo(param, atom); }
#ifdef SORT_ATOMS
atom->Nghost = 0;
sortAtom(atom);
#endif
setupPbc(atom, param); setupPbc(atom, param);
initDevice(atom, neighbor); initDevice(atom, neighbor);
updatePbc(atom, param, true); updatePbc(atom, param, true);
@ -76,9 +80,12 @@ double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
S = getTimeStamp(); S = getTimeStamp();
LIKWID_MARKER_START("reneighbour"); LIKWID_MARKER_START("reneighbour");
updateAtomsPbc(atom, param); updateAtomsPbc(atom, param);
#ifdef SORT_ATOMS
atom->Nghost = 0;
sortAtom(atom);
#endif
setupPbc(atom, param); setupPbc(atom, param);
updatePbc(atom, param, true); updatePbc(atom, param, true);
//sortAtom(atom);
buildNeighbor(atom, neighbor); buildNeighbor(atom, neighbor);
LIKWID_MARKER_STOP("reneighbour"); LIKWID_MARKER_STOP("reneighbour");
E = getTimeStamp(); E = getTimeStamp();

View File

@ -326,45 +326,45 @@ void sortAtom(Atom* atom) {
int Nmax = atom->Nmax; int Nmax = atom->Nmax;
int* binpos = bincount; int* binpos = bincount;
for(int i=1; i<mbins; i++) { for(int i = 1; i < mbins; i++) {
binpos[i] += binpos[i-1]; binpos[i] += binpos[i - 1];
} }
#ifdef AOS #ifdef AOS
MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3); MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3); MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
#else #else
MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT)); MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
MD_FLOAT* new_y = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT)); MD_FLOAT* new_y = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
MD_FLOAT* new_z = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT)); MD_FLOAT* new_z = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT)); MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
MD_FLOAT* new_vy = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT)); MD_FLOAT* new_vy = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
MD_FLOAT* new_vz = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT)); MD_FLOAT* new_vz = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
#endif #endif
MD_FLOAT* old_x = atom->x; MD_FLOAT* old_y = atom->y; MD_FLOAT* old_z = atom->z; MD_FLOAT* old_x = atom->x; MD_FLOAT* old_y = atom->y; MD_FLOAT* old_z = atom->z;
MD_FLOAT* old_vx = atom->vx; MD_FLOAT* old_vy = atom->vy; MD_FLOAT* old_vz = atom->vz; MD_FLOAT* old_vx = atom->vx; MD_FLOAT* old_vy = atom->vy; MD_FLOAT* old_vz = atom->vz;
for(int mybin = 0; mybin<mbins; mybin++) { for(int mybin = 0; mybin < mbins; mybin++) {
int start = mybin>0?binpos[mybin-1]:0; int start = mybin > 0 ? binpos[mybin - 1] : 0;
int count = binpos[mybin] - start; int count = binpos[mybin] - start;
for(int k=0; k<count; k++) { for(int k = 0; k < count; k++) {
int new_i = start + k; int new_i = start + k;
int old_i = bins[mybin * atoms_per_bin + k]; int old_i = bins[mybin * atoms_per_bin + k];
#ifdef AOS #ifdef AOS
new_x[new_i * 3 + 0] = old_x[old_i * 3 + 0]; new_x[new_i * 3 + 0] = old_x[old_i * 3 + 0];
new_x[new_i * 3 + 1] = old_x[old_i * 3 + 1]; new_x[new_i * 3 + 1] = old_x[old_i * 3 + 1];
new_x[new_i * 3 + 2] = old_x[old_i * 3 + 2]; new_x[new_i * 3 + 2] = old_x[old_i * 3 + 2];
new_vx[new_i * 3 + 0] = old_vx[old_i * 3 + 0]; new_vx[new_i * 3 + 0] = old_vx[old_i * 3 + 0];
new_vx[new_i * 3 + 1] = old_vx[old_i * 3 + 1]; new_vx[new_i * 3 + 1] = old_vx[old_i * 3 + 1];
new_vx[new_i * 3 + 2] = old_vx[old_i * 3 + 2]; new_vx[new_i * 3 + 2] = old_vx[old_i * 3 + 2];
#else #else
new_x[new_i] = old_x[old_i]; new_x[new_i] = old_x[old_i];
new_y[new_i] = old_y[old_i]; new_y[new_i] = old_y[old_i];
new_z[new_i] = old_z[old_i]; new_z[new_i] = old_z[old_i];
new_vx[new_i] = old_vx[old_i]; new_vx[new_i] = old_vx[old_i];
new_vy[new_i] = old_vy[old_i]; new_vy[new_i] = old_vy[old_i];
new_vz[new_i] = old_vz[old_i]; new_vz[new_i] = old_vz[old_i];
#endif #endif
} }
} }
@ -372,7 +372,7 @@ void sortAtom(Atom* atom) {
free(atom->vx); free(atom->vx);
atom->x = new_x; atom->x = new_x;
atom->vx = new_vx; atom->vx = new_vx;
#ifndef AOS #ifndef AOS
free(atom->y); free(atom->y);
free(atom->z); free(atom->z);
free(atom->vy); free(atom->vy);
@ -381,5 +381,5 @@ void sortAtom(Atom* atom) {
atom->z = new_z; atom->z = new_z;
atom->vy = new_vy; atom->vy = new_vy;
atom->vz = new_vz; atom->vz = new_vz;
#endif #endif
} }

View File

@ -0,0 +1,213 @@
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
Analyzed file: gromacs-avx2-dp.s
Architecture: ZEN3
Timestamp: 2023-04-08 22:04:23
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 | 1 | 2 | 3 | DV0 | DV1 | 4 | 5 | 6 | 7 | 8 - 8DV | 9 | 10 | 11 | 12 | 13 || CP | LCD |
--------------------------------------------------------------------------------------------------------------------------------------------
2436 | | | | | | | | | | | | | | | | || | | # pointer_increment=64 da67166e5736661e6b03ea29ee7bfd67
2437 | | | | | | | | | | | | | | | | || | | # LLVM-MCA-BEGIN
2438 | | | | | | | | | | | | | | | | || | | .LBB6_26: #
2439 | | | | | | | | | | | | | | | | || | | # Parent Loop BB6_8 Depth=1
2440 | | | | | | | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2
2441 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovups %ymm6, 96(%rsp) # 32-byte Spill
2442 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm14, 32(%rsp) # 32-byte Spill
2443 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm15, 192(%rsp) # 32-byte Spill
2444 | | | | | | | 1.00 | | | | | | | | | 1.00 || | 0.0 | vmovupd %ymm2, (%rsp) # 32-byte Spill
2445 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm4, 224(%rsp) # 32-byte Spill
2446 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm12, 256(%rsp) # 32-byte Spill
2447 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm8, 288(%rsp) # 32-byte Spill
2448 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm7, 320(%rsp) # 32-byte Spill
2449 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm10, 352(%rsp) # 32-byte Spill
2450 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm9, 384(%rsp) # 32-byte Spill
2451 | | | | | | | | | | | | | | 0.50 | 0.50 | 0.00 || 4.0 | | movslq (%rdx,%rcx,4), %rax
2452 | | | | | | | | | 0.25 | 0.25 | 0.25 | 0.25 | | 0.50 | 0.50 | 0.00 || 5.0 | | leaq (%rax,%rax,2), %rax
2453 | | | | | | | | | | 0.50 | 0.50 | | | | | || 1.0 | | shlq $5, %rax
2454 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovapd (%rdi,%rax), %ymm5
2455 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovapd 32(%rdi,%rax), %ymm12
2456 | | | | | | | | | | | | | | 0.50 | 0.50 | || 4.0 | | vmovapd 64(%rdi,%rax), %ymm14
2457 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 896(%rsp), %ymm0 # 32-byte Reload
2458 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vsubpd %ymm5, %ymm0, %ymm11
2459 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm11, 640(%rsp) # 32-byte Spill
2460 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 832(%rsp), %ymm0 # 32-byte Reload
2461 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vsubpd %ymm12, %ymm0, %ymm10
2462 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm10, 672(%rsp) # 32-byte Spill
2463 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 1024(%rsp), %ymm0 # 32-byte Reload
2464 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vsubpd %ymm14, %ymm0, %ymm1
2465 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm1, 160(%rsp) # 32-byte Spill
2466 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 1088(%rsp), %ymm0 # 32-byte Reload
2467 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vsubpd %ymm5, %ymm0, %ymm8
2468 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm8, 448(%rsp) # 32-byte Spill
2469 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 1120(%rsp), %ymm0 # 32-byte Reload
2470 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vsubpd %ymm12, %ymm0, %ymm4
2471 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm4, 704(%rsp) # 32-byte Spill
2472 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 992(%rsp), %ymm0 # 32-byte Reload
2473 | | | 0.50 | 0.50 | | | | | | | | | | | | || 3.0 | | vsubpd %ymm14, %ymm0, %ymm2
2474 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm2, 416(%rsp) # 32-byte Spill
2475 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 1184(%rsp), %ymm0 # 32-byte Reload
2476 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vsubpd %ymm12, %ymm0, %ymm7
2477 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm7, 576(%rsp) # 32-byte Spill
2478 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 960(%rsp), %ymm0 # 32-byte Reload
2479 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vsubpd %ymm14, %ymm0, %ymm6
2480 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm6, 608(%rsp) # 32-byte Spill
2481 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm1, %ymm1, %ymm9
2482 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vfmadd231pd %ymm10, %ymm10, %ymm9 # ymm9 = (ymm10 * ymm10) + ymm9
2483 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vfmadd231pd %ymm11, %ymm11, %ymm9 # ymm9 = (ymm11 * ymm11) + ymm9
2484 | 0.50 | 0.50 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm2, %ymm2, %ymm1
2485 | 0.50 | 0.50 | | | | | | | | | | | | | | || 4.0 | | vfmadd231pd %ymm4, %ymm4, %ymm1 # ymm1 = (ymm4 * ymm4) + ymm1
2486 | | 1.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vcvtpd2ps %ymm9, %xmm0
2487 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vrcpps %xmm0, %xmm0
2488 | | 1.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vcvtps2pd %xmm0, %ymm0
2489 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 800(%rsp), %ymm11 # 32-byte Reload
2490 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm0, %ymm11, %ymm2
2491 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vmovapd %ymm3, %ymm4
2492 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm0, %ymm0, %ymm3
2493 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm2, %ymm3, %ymm3
2494 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm6, %ymm6, %ymm2
2495 | 0.50 | 0.50 | | | | | | | | | | | | | | || 4.0 | | vfmadd231pd %ymm8, %ymm8, %ymm1 # ymm1 = (ymm8 * ymm8) + ymm1
2496 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vfmadd231pd %ymm7, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm7) + ymm2
2497 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 864(%rsp), %ymm6 # 32-byte Reload
2498 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vsubpd %ymm5, %ymm6, %ymm6
2499 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm6, 480(%rsp) # 32-byte Spill
2500 | | 1.00 | 0.50 | 0.50 | | | | | | | | | | | | || 6.0 | | vcvtpd2ps %ymm1, %xmm7
2501 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vfmadd231pd %ymm6, %ymm6, %ymm2 # ymm2 = (ymm6 * ymm6) + ymm2
2502 | 0.50 | 0.50 | | | | | | | | | | | | | | || 3.0 | | vrcpps %xmm7, %xmm7
2503 | | 1.00 | 0.50 | 0.50 | | | | | | | | | | | | || 4.0 | | vcvtps2pd %xmm7, %ymm7
2504 | | 1.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vcvtpd2ps %ymm2, %xmm6
2505 | 0.50 | 0.50 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm11, %ymm10
2506 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm7, %ymm7, %ymm15
2507 | 0.50 | 0.50 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm10, %ymm15, %ymm10
2508 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vrcpps %xmm6, %xmm6
2509 | | 1.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vcvtps2pd %xmm6, %ymm6
2510 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm6, %ymm11, %ymm15
2511 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm6, %ymm6, %ymm13
2512 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm15, %ymm13, %ymm13
2513 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 1152(%rsp), %ymm8 # 32-byte Reload
2514 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vsubpd %ymm12, %ymm8, %ymm12
2515 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm12, 512(%rsp) # 32-byte Spill
2516 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 928(%rsp), %ymm8 # 32-byte Reload
2517 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vsubpd %ymm14, %ymm8, %ymm8
2518 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm8, 544(%rsp) # 32-byte Spill
2519 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm8, %ymm8, %ymm14
2520 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vfmadd231pd %ymm12, %ymm12, %ymm14 # ymm14 = (ymm12 * ymm12) + ymm14
2521 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 1056(%rsp), %ymm8 # 32-byte Reload
2522 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vsubpd %ymm5, %ymm8, %ymm5
2523 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm5, 1216(%rsp) # 32-byte Spill
2524 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vfmadd231pd %ymm5, %ymm5, %ymm14 # ymm14 = (ymm5 * ymm5) + ymm14
2525 | | 1.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vcvtpd2ps %ymm14, %xmm8
2526 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vrcpps %xmm8, %xmm5
2527 | | 1.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vcvtps2pd %xmm5, %ymm5
2528 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm5, %ymm11, %ymm8
2529 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm5, %ymm5, %ymm12
2530 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm8, %ymm12, %ymm8
2531 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 736(%rsp), %ymm11 # 32-byte Reload
2532 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm0, %ymm11, %ymm0
2533 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm3, %ymm0, %ymm0
2534 | 1.00 | | | | | | | | | | | | | 0.50 | 0.50 | || | | vbroadcastsd .LCPI6_2(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
2535 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vaddpd %ymm3, %ymm12, %ymm3
2536 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm3, %ymm0, %ymm3
2537 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm7, %ymm11, %ymm0
2538 | 0.50 | 0.50 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm0, %ymm10, %ymm0
2539 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vaddpd %ymm12, %ymm10, %ymm7
2540 | 0.50 | 0.50 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm0, %ymm7
2541 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm6, %ymm11, %ymm0
2542 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm0, %ymm13, %ymm0
2543 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vaddpd %ymm12, %ymm13, %ymm6
2544 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 192(%rsp), %ymm15 # 32-byte Reload
2545 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm6, %ymm0, %ymm6
2546 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm5, %ymm11, %ymm0
2547 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vaddpd %ymm12, %ymm8, %ymm5
2548 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 256(%rsp), %ymm12 # 32-byte Reload
2549 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 352(%rsp), %ymm10 # 32-byte Reload
2550 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm0, %ymm8, %ymm0
2551 | 0.50 | 0.50 | | | | | | | | | | | | | | || | | vmulpd %ymm5, %ymm0, %ymm0
2552 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 768(%rsp), %ymm13 # 32-byte Reload
2553 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vcmpltpd %ymm13, %ymm9, %ymm5
2554 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 384(%rsp), %ymm9 # 32-byte Reload
2555 | 0.50 | 0.50 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 640(%rsp), %ymm3, %ymm8 # 32-byte Folded Reload
2556 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vandpd %ymm5, %ymm8, %ymm8
2557 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vaddpd %ymm8, %ymm9, %ymm9
2558 | 0.50 | 0.50 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 672(%rsp), %ymm3, %ymm8 # 32-byte Folded Reload
2559 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vandpd %ymm5, %ymm8, %ymm8
2560 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vaddpd %ymm8, %ymm12, %ymm12
2561 | 0.50 | 0.50 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 160(%rsp), %ymm3, %ymm3 # 32-byte Folded Reload
2562 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vandpd %ymm5, %ymm3, %ymm3
2563 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vaddpd %ymm3, %ymm15, %ymm15
2564 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vcmpltpd %ymm13, %ymm1, %ymm1
2565 | 0.50 | 0.50 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 448(%rsp), %ymm7, %ymm3 # 32-byte Folded Reload
2566 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vandpd %ymm1, %ymm3, %ymm3
2567 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vaddpd %ymm3, %ymm10, %ymm10
2568 | 0.50 | 0.50 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 704(%rsp), %ymm7, %ymm3 # 32-byte Folded Reload
2569 | 1.00 | 0.00 | | | | | | | | | | | | 0.50 | 0.50 | || 3.0 | | vmulpd 416(%rsp), %ymm7, %ymm5 # 32-byte Folded Reload
2570 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vandpd %ymm1, %ymm3, %ymm3
2571 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vaddpd %ymm3, %ymm4, %ymm3
2572 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 224(%rsp), %ymm4 # 32-byte Reload
2573 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || 1.0 | | vandpd %ymm1, %ymm5, %ymm1
2574 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 64(%rsp), %ymm5 # 32-byte Reload
2575 | | | 0.50 | 0.50 | | | | | | | | | | | | || 3.0 | | vaddpd %ymm1, %ymm5, %ymm5
2576 | | | | | | | 1.00 | | | | | | | | | 1.00 || 0.0 | | vmovupd %ymm5, 64(%rsp) # 32-byte Spill
2577 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vcmpltpd %ymm13, %ymm2, %ymm1
2578 | 1.00 | 0.00 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 480(%rsp), %ymm6, %ymm2 # 32-byte Folded Reload
2579 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 288(%rsp), %ymm8 # 32-byte Reload
2580 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 320(%rsp), %ymm7 # 32-byte Reload
2581 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vandpd %ymm1, %ymm2, %ymm2
2582 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vaddpd %ymm2, %ymm7, %ymm7
2583 | 1.00 | 0.00 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 576(%rsp), %ymm6, %ymm2 # 32-byte Folded Reload
2584 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vandpd %ymm1, %ymm2, %ymm2
2585 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vaddpd %ymm2, %ymm4, %ymm4
2586 | 1.00 | 0.00 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 608(%rsp), %ymm6, %ymm2 # 32-byte Folded Reload
2587 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 96(%rsp), %ymm6 # 32-byte Reload
2588 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vandpd %ymm1, %ymm2, %ymm1
2589 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vaddpd %ymm1, %ymm6, %ymm6
2590 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vcmpltpd %ymm13, %ymm14, %ymm1
2591 | 1.00 | 0.00 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 1216(%rsp), %ymm0, %ymm2 # 32-byte Folded Reload
2592 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vandpd %ymm1, %ymm2, %ymm2
2593 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vaddpd %ymm2, %ymm8, %ymm8
2594 | 1.00 | 0.00 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 512(%rsp), %ymm0, %ymm2 # 32-byte Folded Reload
2595 | 1.00 | 0.00 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 544(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
2596 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vandpd %ymm1, %ymm2, %ymm2
2597 | | | | | | | | | | | | | | 0.50 | 0.50 | || | 4.0 | vmovupd (%rsp), %ymm5 # 32-byte Reload
2598 | | | 0.50 | 0.50 | | | | | | | | | | | | || | 3.0 | vaddpd %ymm2, %ymm5, %ymm5
2599 | | | | | | | 1.00 | | | | | | | | | 1.00 || | 0.0 | vmovupd %ymm5, (%rsp) # 32-byte Spill
2600 | | | | | | | | | | | | | | 0.50 | 0.50 | || | 4.0 | vmovupd (%rsp), %ymm2 # 32-byte Reload
2601 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vandpd %ymm1, %ymm0, %ymm0
2602 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 32(%rsp), %ymm1 # 32-byte Reload
2603 | | | 0.50 | 0.50 | | | | | | | | | | | | || | | vaddpd %ymm0, %ymm1, %ymm1
2604 | | | | | | | 1.00 | | | | | | | | | 1.00 || | | vmovupd %ymm1, 32(%rsp) # 32-byte Spill
2605 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovupd 32(%rsp), %ymm14 # 32-byte Reload
2606 | | | | | | | | | 0.25 | 0.25 | 0.25 | 0.25 | | | | || | | incq %rcx
2607 | | | | | | | | | 0.50 | 0.00 | 0.00 | 0.50 | | | | || | | cmpq %rcx, %r12
2608 | | | | | | | | | 0.00 | | | | 1.00 | | | || | | jne .LBB6_26
2609 | | | | | | | | | 0.00 | | | | 1.00 | | | || | | jmp .LBB6_12
2610 | | | | | | | | | | | | | | | | || | | # LLVM-MCA-END
30.5 30.5 26.5 26.5 25.0 1.00 1.00 1.00 1.00 23.0 23.0 25.0 60 11.0
Loop-Carried Dependencies Analysis Report
-----------------------------------------
2444 | 11.0 | vmovupd %ymm2, (%rsp) # 32-byte Spill| [2444, 2597, 2598, 2599, 2600]
2442 | 11.0 | vmovupd %ymm14, 32(%rsp) # 32-byte Spill| [2442, 2602, 2603, 2604, 2605]
2574 | 7.0 | vmovupd 64(%rsp), %ymm5 # 32-byte Reload| [2574, 2575, 2576]
2450 | 7.0 | vmovupd %ymm9, 384(%rsp) # 32-byte Spill| [2450, 2554, 2557]
2449 | 7.0 | vmovupd %ymm10, 352(%rsp) # 32-byte Spill| [2449, 2549, 2567]
2448 | 7.0 | vmovupd %ymm7, 320(%rsp) # 32-byte Spill| [2448, 2580, 2582]
2447 | 7.0 | vmovupd %ymm8, 288(%rsp) # 32-byte Spill| [2447, 2579, 2593]
2446 | 7.0 | vmovupd %ymm12, 256(%rsp) # 32-byte Spill| [2446, 2548, 2560]
2445 | 7.0 | vmovupd %ymm4, 224(%rsp) # 32-byte Spill| [2445, 2572, 2585]
2443 | 7.0 | vmovupd %ymm15, 192(%rsp) # 32-byte Spill| [2443, 2544, 2563]
2441 | 7.0 | vmovups %ymm6, 96(%rsp) # 32-byte Spill| [2441, 2587, 2589]
2491 | 4.0 | vmovapd %ymm3, %ymm4 | [2491, 2571]
2606 | 1.0 | incq %rcx | [2606]

File diff suppressed because it is too large Load Diff