Compare commits
45 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 59145644e3 | |||
| 4a460b2c88 | |||
| b15aa2f461 | |||
| 5c000444a4 | |||
| 04ade6bcec | |||
| 85f1484449 | |||
| 965fda3879 | |||
| a86d214c73 | |||
| d138f975f6 | |||
| 296a4c4e01 | |||
| f5fd3e265a | |||
| 1fbf9dbdac | |||
| 89e1b9a9b6 | |||
| 4e99f7a623 | |||
| 4607202752 | |||
| 301274c9b6 | |||
| 95d63334fa | |||
| d0277765c3 | |||
| 5814a86125 | |||
| 98583cdade | |||
| cb5598bc91 | |||
| 3b076cdb49 | |||
| 122a23e2b8 | |||
| 32e004944f | |||
| 6126d74aa9 | |||
| 016f07dcaa | |||
| 90f30d26a3 | |||
| 01cc05a5d6 | |||
| c61cf9a0ac | |||
| d545ca65d4 | |||
| 5833f00894 | |||
| 8aad7e87a0 | |||
| ffad9d40f3 | |||
| 99da76d59c | |||
| cfe888c132 | |||
| c7b136f629 | |||
| 07f2f74561 | |||
| fd368609e8 | |||
| db5f8cf1c6 | |||
| f467d10ed3 | |||
| fe86c948a8 | |||
| ae1cfa2800 | |||
| e5c233e072 | |||
| 8d5e10f635 | |||
| 56ff0d19af |
+13
-10
@@ -51,14 +51,17 @@ Module.symvers
|
||||
Mkfile.old
|
||||
dkms.conf
|
||||
|
||||
# Logs
|
||||
*.log
|
||||
|
||||
# TODO list
|
||||
todo.txt
|
||||
|
||||
# Build directories and executables
|
||||
GCC/
|
||||
ICC/
|
||||
ICX/
|
||||
CLANG/
|
||||
NVCC/
|
||||
MDBench-GCC*
|
||||
MDBench-ICC*
|
||||
MDBench-ICX*
|
||||
MDBench-CLANG*
|
||||
MDBench-NVCC*
|
||||
#GCC-*/
|
||||
#ICC-*/
|
||||
#ICX-*/
|
||||
#CLANG-*/
|
||||
#NVCC-*/
|
||||
build-*/
|
||||
MDBench-*
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#CONFIGURE BUILD SYSTEM
|
||||
TARGET = MDBench-$(TAG)-$(OPT_SCHEME)
|
||||
BUILD_DIR = ./$(TAG)-$(OPT_SCHEME)
|
||||
IDENTIFIER = $(OPT_SCHEME)-$(TAG)-$(ISA)-$(DATA_TYPE)
|
||||
TARGET = MDBench-$(IDENTIFIER)
|
||||
BUILD_DIR = ./build-$(IDENTIFIER)
|
||||
SRC_DIR = ./$(OPT_SCHEME)
|
||||
ASM_DIR = ./asm
|
||||
COMMON_DIR = ./common
|
||||
@@ -151,6 +152,13 @@ $(BUILD_DIR)/%.o: %.s
|
||||
clean:
|
||||
$(info ===> CLEAN)
|
||||
@rm -rf $(BUILD_DIR)
|
||||
@rm -rf MDBench-$(IDENTIFIER)
|
||||
@rm -f tags
|
||||
|
||||
cleanall:
|
||||
$(info ===> CLEAN)
|
||||
@rm -rf build-*
|
||||
@rm -rf MDBench-*
|
||||
@rm -f tags
|
||||
|
||||
distclean: clean
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -8,9 +8,11 @@
|
||||
#define __PARAMETER_H_
|
||||
|
||||
#if PRECISION == 1
|
||||
#define MD_FLOAT float
|
||||
# define MD_FLOAT float
|
||||
# define MD_UINT unsigned int
|
||||
#else
|
||||
#define MD_FLOAT double
|
||||
# define MD_FLOAT double
|
||||
# define MD_UINT unsigned long long int
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
|
||||
@@ -9,10 +9,13 @@
|
||||
# include <zmmintrin.h>
|
||||
#endif
|
||||
|
||||
#define MD_SIMD_FLOAT __m512d
|
||||
#define MD_SIMD_MASK __mmask8
|
||||
#define MD_SIMD_INT __m256i
|
||||
#define MD_SIMD_FLOAT __m512d
|
||||
#define MD_SIMD_MASK __mmask8
|
||||
#define MD_SIMD_INT __m256i
|
||||
#define MD_SIMD_BITMASK MD_SIMD_INT
|
||||
#define MD_SIMD_IBOOL __mmask16
|
||||
|
||||
/* Narrow an integer-comparison mask (MD_SIMD_IBOOL) to the floating-point lane mask type by casting it to __mmask8. */
static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) {
    const MD_SIMD_MASK narrowed = (__mmask8)(a);
    return narrowed;
}
|
||||
/* Return a vector with every lane set to `scalar` (via _mm512_set1_pd). */
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) {
    const MD_SIMD_FLOAT splat = _mm512_set1_pd(scalar);
    return splat;
}
|
||||
/*
 * Return a vector with every lane set to 0.0.
 * Declared with (void) so the definition is a real prototype and calls that
 * pass arguments by mistake are diagnosed by the compiler.
 */
static inline MD_SIMD_FLOAT simd_zero(void) { return _mm512_set1_pd(0.0); }
|
||||
/* Lane-wise sum of `a` and `b` (via _mm512_add_pd). */
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) {
    const MD_SIMD_FLOAT sum = _mm512_add_pd(a, b);
    return sum;
}
|
||||
|
||||
@@ -7,11 +7,30 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <immintrin.h>
|
||||
#include <zmmintrin.h>
|
||||
#ifndef NO_ZMM_INTRIN
|
||||
# include <zmmintrin.h>
|
||||
#endif
|
||||
|
||||
#define MD_SIMD_FLOAT __m512
|
||||
#define MD_SIMD_MASK __mmask16
|
||||
#define MD_SIMD_INT __m256i
|
||||
#define MD_SIMD_IBOOL __mmask16
|
||||
#define MD_SIMD_INT32 __m512i
|
||||
#define MD_SIMD_BITMASK MD_SIMD_INT32
|
||||
|
||||
static inline MD_SIMD_BITMASK simd_load_bitmask(const int *m) {
|
||||
return _mm512_load_si512(m);
|
||||
}
|
||||
|
||||
static inline MD_SIMD_INT32 simd_int32_broadcast(int a) {
|
||||
return _mm512_set1_epi32(a);
|
||||
}
|
||||
|
||||
/*
 * Build a per-lane mask from the raw bit pattern of `a`: the vector is
 * reinterpreted as 32-bit integers and tested against itself with
 * _mm512_test_epi32_mask, so a lane's mask bit is set when that lane has
 * any bit set.
 */
static inline MD_SIMD_IBOOL simd_test_bits(MD_SIMD_FLOAT a) {
    const __m512i raw_bits = _mm512_castps_si512(a);
    return _mm512_test_epi32_mask(raw_bits, raw_bits);
}
|
||||
|
||||
/* Convert an integer-comparison mask to the floating-point lane mask type; the value is returned unchanged. */
static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) {
    const MD_SIMD_MASK converted = a;
    return converted;
}
|
||||
static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); }
|
||||
/*
 * Return a vector with every lane set to 0.0f.
 * Declared with (void) so the definition is a real prototype and calls that
 * pass arguments by mistake are diagnosed by the compiler.
 */
static inline MD_SIMD_FLOAT simd_zero(void) { return _mm512_set1_ps(0.0f); }
|
||||
/* Lane-wise sum of `a` and `b` (via _mm512_add_ps). */
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) {
    const MD_SIMD_FLOAT sum = _mm512_add_ps(a, b);
    return sum;
}
|
||||
@@ -69,7 +88,7 @@ static inline MD_FLOAT simd_h_dual_incr_reduced_sum(float* m, MD_SIMD_FLOAT v0,
|
||||
return _mm_cvtss_f32(t3);
|
||||
}
|
||||
|
||||
inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
|
||||
static inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
|
||||
__m256 t;
|
||||
a = _mm512_add_ps(a, _mm512_shuffle_f32x4(a, a, 0xee));
|
||||
t = _mm256_load_ps(m);
|
||||
|
||||
+4
-4
@@ -131,19 +131,19 @@ void readParameter(Parameter *param, const char *filename) {
|
||||
void printParameter(Parameter *param) {
|
||||
printf("Parameters:\n");
|
||||
if(param->input_file != NULL) {
|
||||
printf("Input file: %s\n", param->input_file);
|
||||
printf("\tInput file: %s\n", param->input_file);
|
||||
}
|
||||
|
||||
if(param->vtk_file != NULL) {
|
||||
printf("VTK file: %s\n", param->vtk_file);
|
||||
printf("\tVTK file: %s\n", param->vtk_file);
|
||||
}
|
||||
|
||||
if(param->xtc_file != NULL) {
|
||||
printf("XTC file: %s\n", param->xtc_file);
|
||||
printf("\tXTC file: %s\n", param->xtc_file);
|
||||
}
|
||||
|
||||
if(param->eam_file != NULL) {
|
||||
printf("EAM file: %s\n", param->eam_file);
|
||||
printf("\tEAM file: %s\n", param->eam_file);
|
||||
}
|
||||
|
||||
printf("\tForce field: %s\n", ff2str(param->force_field));
|
||||
|
||||
@@ -7,6 +7,6 @@ temp 80
|
||||
x_out_freq 500
|
||||
v_out_freq 5
|
||||
cutforce 0.9
|
||||
skin 0.0
|
||||
skin 0.05
|
||||
reneigh_every 100
|
||||
nstat 125000
|
||||
|
||||
+109
@@ -37,6 +37,7 @@ void initAtom(Atom *atom) {
|
||||
atom->iclusters = NULL;
|
||||
atom->jclusters = NULL;
|
||||
atom->icluster_bin = NULL;
|
||||
initMasks(atom);
|
||||
}
|
||||
|
||||
void createAtom(Atom *atom, Parameter *param) {
|
||||
@@ -50,6 +51,7 @@ void createAtom(Atom *atom, Parameter *param) {
|
||||
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
|
||||
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
||||
atom->epsilon[i] = param->epsilon;
|
||||
atom->sigma6[i] = param->sigma6;
|
||||
@@ -392,6 +394,113 @@ int readAtom_dmp(Atom* atom, Parameter* param) {
|
||||
return natoms;
|
||||
}
|
||||
|
||||
/*
 * Allocate and fill the mask/filter lookup tables stored on `atom`:
 *
 *   - exclusion_filter:         CLUSTER_M * VECTOR_WIDTH entries, entry i = 1U << i
 *   - diagonal_4xn_j_minus_i:   MAX(CLUSTER_M, VECTOR_WIDTH) entries, entry j = j - 0.5
 *   - diagonal_2xnn_j_minus_i:  VECTOR_WIDTH entries; the lower half holds
 *                               j - 0.5 and the upper half (j - 1) - 0.5
 *   - masks_2xnn_hn / masks_2xnn_fn and masks_4xn_hn / masks_4xn_fn:
 *                               precomputed exclusion bit masks, indexed by
 *                               the 0/1 flags cond0 (and cond1 when
 *                               CLUSTER_M != CLUSTER_N). The 2xnn tables hold
 *                               two entries per flag combination, each packing
 *                               two masks as (maskB << VECTOR_WIDTH/2) | maskA;
 *                               the 4xn tables hold the four masks unpacked.
 *
 * The masks_* tables themselves are not allocated here; they are written in
 * place, so they must already provide enough storage (up to 8 entries for
 * 2xnn and 16 entries for 4xn).
 *
 * atom: structure receiving the tables; previous pointer values of the three
 *       allocated arrays are overwritten without being freed.
 */
void initMasks(Atom *atom) {
    const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
    unsigned int mask0, mask1, mask2, mask3;

    atom->exclusion_filter = allocate(ALIGNMENT, CLUSTER_M * VECTOR_WIDTH * sizeof(MD_UINT));
    /* NOTE(review): both diagonal tables are sized with sizeof(MD_UINT) but are
     * filled with MD_FLOAT values below; presumably the element type has the
     * same size as MD_UINT -- confirm against the Atom declaration. */
    atom->diagonal_4xn_j_minus_i = allocate(ALIGNMENT, MAX(CLUSTER_M, VECTOR_WIDTH) * sizeof(MD_UINT));
    atom->diagonal_2xnn_j_minus_i = allocate(ALIGNMENT, VECTOR_WIDTH * sizeof(MD_UINT));

    for(int j = 0; j < MAX(CLUSTER_M, VECTOR_WIDTH); j++) {
        atom->diagonal_4xn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5;
    }

    for(int j = 0; j < VECTOR_WIDTH / 2; j++) {
        atom->diagonal_2xnn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5;
        atom->diagonal_2xnn_j_minus_i[VECTOR_WIDTH / 2 + j] = (MD_FLOAT)(j - 1) - 0.5;
    }

    /* NOTE(review): 1U is a 32-bit operand, so this shift is only defined
     * while CLUSTER_M * VECTOR_WIDTH <= 32 -- confirm for all supported
     * cluster/vector configurations. */
    for(int i = 0; i < CLUSTER_M * VECTOR_WIDTH; i++) {
        atom->exclusion_filter[i] = (1U << i);
    }

#if CLUSTER_M == CLUSTER_N
    for(unsigned int cond0 = 0; cond0 < 2; cond0++) {
        /* "hn" tables (used by the half-neighbor-list kernels). */
        mask0 = (unsigned int)(0xf - 0x1 * cond0);
        mask1 = (unsigned int)(0xf - 0x3 * cond0);
        mask2 = (unsigned int)(0xf - 0x7 * cond0);
        mask3 = (unsigned int)(0xf - 0xf * cond0);
        atom->masks_2xnn_hn[cond0 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
        atom->masks_2xnn_hn[cond0 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
        atom->masks_4xn_hn[cond0 * 4 + 0] = mask0;
        atom->masks_4xn_hn[cond0 * 4 + 1] = mask1;
        atom->masks_4xn_hn[cond0 * 4 + 2] = mask2;
        atom->masks_4xn_hn[cond0 * 4 + 3] = mask3;

        /* "fn" tables (used by the full-neighbor-list kernels). */
        mask0 = (unsigned int)(0xf - 0x1 * cond0);
        mask1 = (unsigned int)(0xf - 0x2 * cond0);
        mask2 = (unsigned int)(0xf - 0x4 * cond0);
        mask3 = (unsigned int)(0xf - 0x8 * cond0);
        atom->masks_2xnn_fn[cond0 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
        atom->masks_2xnn_fn[cond0 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
        atom->masks_4xn_fn[cond0 * 4 + 0] = mask0;
        atom->masks_4xn_fn[cond0 * 4 + 1] = mask1;
        atom->masks_4xn_fn[cond0 * 4 + 2] = mask2;
        atom->masks_4xn_fn[cond0 * 4 + 3] = mask3;
    }
#else
    for(unsigned int cond0 = 0; cond0 < 2; cond0++) {
        for(unsigned int cond1 = 0; cond1 < 2; cond1++) {
            const unsigned int idx_2xnn = cond0 * 4 + cond1 * 2;
            const unsigned int idx_4xn = cond0 * 8 + cond1 * 4;

            /* "hn" tables (used by the half-neighbor-list kernels). */
#if CLUSTER_M < CLUSTER_N
            mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
            mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
            mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
            mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
#else
            mask0 = (unsigned int)(0x3 - 0x1 * cond0);
            mask1 = (unsigned int)(0x3 - 0x3 * cond0);
            mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1);
            mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1);
#endif
            atom->masks_2xnn_hn[idx_2xnn + 0] = (mask1 << half_mask_bits) | mask0;
            atom->masks_2xnn_hn[idx_2xnn + 1] = (mask3 << half_mask_bits) | mask2;
            atom->masks_4xn_hn[idx_4xn + 0] = mask0;
            atom->masks_4xn_hn[idx_4xn + 1] = mask1;
            atom->masks_4xn_hn[idx_4xn + 2] = mask2;
            atom->masks_4xn_hn[idx_4xn + 3] = mask3;

            /* "fn" tables (used by the full-neighbor-list kernels). */
#if CLUSTER_M < CLUSTER_N
            mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
            mask1 = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
            mask2 = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
            mask3 = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
#else
            mask0 = (unsigned int)(0x3 - 0x1 * cond0);
            mask1 = (unsigned int)(0x3 - 0x2 * cond0);
            mask2 = (unsigned int)(0x3 - 0x1 * cond1);
            mask3 = (unsigned int)(0x3 - 0x2 * cond1);
#endif
            atom->masks_2xnn_fn[idx_2xnn + 0] = (mask1 << half_mask_bits) | mask0;
            atom->masks_2xnn_fn[idx_2xnn + 1] = (mask3 << half_mask_bits) | mask2;
            /* Each of the four masks gets its own slot. Previously, in the
             * CLUSTER_M > CLUSTER_N case, all four stores targeted slot +0,
             * leaving slots +1..+3 unset. */
            atom->masks_4xn_fn[idx_4xn + 0] = mask0;
            atom->masks_4xn_fn[idx_4xn + 1] = mask1;
            atom->masks_4xn_fn[idx_4xn + 2] = mask2;
            atom->masks_4xn_fn[idx_4xn + 3] = mask3;
        }
    }
#endif
}
|
||||
|
||||
void growAtom(Atom *atom) {
|
||||
int nold = atom->Nmax;
|
||||
atom->Nmax += DELTA;
|
||||
|
||||
+511
-196
@@ -16,10 +16,36 @@
|
||||
#include <simd.h>
|
||||
|
||||
|
||||
/*
|
||||
static inline void gmx_load_simd_2xnn_interactions(
|
||||
int excl,
|
||||
MD_SIMD_BITMASK filter0, MD_SIMD_BITMASK filter2,
|
||||
MD_SIMD_MASK *interact0, MD_SIMD_MASK *interact2) {
|
||||
|
||||
//SimdInt32 mask_pr_S(excl);
|
||||
MD_SIMD_INT32 mask_pr_S = simd_int32_broadcast(excl);
|
||||
*interact0 = cvtIB2B(simd_test_bits(mask_pr_S & filter0));
|
||||
*interact2 = cvtIB2B(simd_test_bits(mask_pr_S & filter2));
|
||||
}
|
||||
|
||||
static inline void gmx_load_simd_4xn_interactions(
|
||||
int excl,
|
||||
MD_SIMD_BITMASK filter0, MD_SIMD_BITMASK filter1, MD_SIMD_BITMASK filter2, MD_SIMD_BITMASK filter3,
|
||||
MD_SIMD_MASK *interact0, MD_SIMD_MASK *interact1, MD_SIMD_MASK *interact2, MD_SIMD_MASK *interact3) {
|
||||
|
||||
//SimdInt32 mask_pr_S(excl);
|
||||
MD_SIMD_INT32 mask_pr_S = simd_int32_broadcast(excl);
|
||||
*interact0 = cvtIB2B(simd_test_bits(mask_pr_S & filter0));
|
||||
*interact1 = cvtIB2B(simd_test_bits(mask_pr_S & filter1));
|
||||
*interact2 = cvtIB2B(simd_test_bits(mask_pr_S & filter2));
|
||||
*interact3 = cvtIB2B(simd_test_bits(mask_pr_S & filter3));
|
||||
}
|
||||
*/
|
||||
|
||||
double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
DEBUG_MESSAGE("computeForceLJ begin\n");
|
||||
int Nlocal = atom->Nlocal;
|
||||
int* neighs;
|
||||
NeighborCluster* neighs;
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
@@ -35,9 +61,12 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
||||
}
|
||||
|
||||
double S = getTimeStamp();
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
#pragma omp parallel for
|
||||
#pragma omp for
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_cj0 = CJ0_FROM_CI(ci);
|
||||
int ci_cj1 = CJ1_FROM_CI(ci);
|
||||
@@ -48,7 +77,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
||||
int numneighs = neighbor->numneigh[ci];
|
||||
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int cj = neighs[k];
|
||||
int cj = neighs[k].cj;
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
int any = 0;
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
@@ -119,6 +148,8 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force");
|
||||
}
|
||||
|
||||
double E = getTimeStamp();
|
||||
DEBUG_MESSAGE("computeForceLJ end\n");
|
||||
return E-S;
|
||||
@@ -127,7 +158,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
||||
double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
|
||||
int Nlocal = atom->Nlocal;
|
||||
int* neighs;
|
||||
NeighborCluster* neighs;
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
@@ -136,7 +167,6 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
|
||||
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
||||
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
||||
const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
@@ -149,9 +179,41 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
}
|
||||
|
||||
double S = getTimeStamp();
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
#pragma omp parallel for
|
||||
/*
|
||||
MD_SIMD_BITMASK filter0 = simd_load_bitmask((const int *) &atom->exclusion_filter[0 * (VECTOR_WIDTH / UNROLL_J)]);
|
||||
MD_SIMD_BITMASK filter2 = simd_load_bitmask((const int *) &atom->exclusion_filter[2 * (VECTOR_WIDTH / UNROLL_J)]);
|
||||
|
||||
MD_SIMD_FLOAT diagonal_jmi_S = simd_load(atom->diagonal_2xnn_j_minus_i);
|
||||
MD_SIMD_FLOAT zero_S = simd_broadcast(0.0);
|
||||
MD_SIMD_FLOAT one_S = simd_broadcast(1.0);
|
||||
|
||||
#if CLUSTER_M <= CLUSTER_N
|
||||
MD_SIMD_MASK diagonal_mask0, diagonal_mask2;
|
||||
diagonal_mask0 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
|
||||
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||
diagonal_mask2 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
|
||||
#else
|
||||
MD_SIMD_MASK diagonal_mask00, diagonal_mask02, diagonal_mask10, diagonal_mask12;
|
||||
diagonal_mask00 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
|
||||
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||
diagonal_mask02 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
|
||||
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||
diagonal_mask10 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
|
||||
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||
diagonal_mask12 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
|
||||
#endif
|
||||
*/
|
||||
|
||||
#pragma omp for
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_cj0 = CJ0_FROM_CI(ci);
|
||||
#if CLUSTER_M > CLUSTER_N
|
||||
@@ -162,6 +224,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[ci];
|
||||
int numneighs_masked = neighbor->numneigh_masked[ci];
|
||||
|
||||
MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]);
|
||||
MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]);
|
||||
@@ -176,76 +239,138 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
MD_SIMD_FLOAT fiy2 = simd_zero();
|
||||
MD_SIMD_FLOAT fiz2 = simd_zero();
|
||||
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int cj = neighs[k];
|
||||
for(int k = 0; k < numneighs_masked; k++) {
|
||||
int cj = neighs[k].cj;
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
//int imask = neighs[k].imask;
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
||||
unsigned int mask0, mask1, mask2, mask3;
|
||||
//MD_SIMD_MASK interact0;
|
||||
//MD_SIMD_MASK interact2;
|
||||
|
||||
//gmx_load_simd_2xnn_interactions((int)imask, filter0, filter2, &interact0, &interact2);
|
||||
|
||||
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
|
||||
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
|
||||
MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
|
||||
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||
|
||||
#if CLUSTER_M == CLUSTER_N
|
||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||
mask0 = (unsigned int)(0xf - 0x1 * cond0);
|
||||
mask1 = (unsigned int)(0xf - 0x3 * cond0);
|
||||
mask2 = (unsigned int)(0xf - 0x7 * cond0);
|
||||
mask3 = (unsigned int)(0xf - 0xf * cond0);
|
||||
#elif CLUSTER_M < CLUSTER_N
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_2xnn_hn[cond0 * 2 + 0]);
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_2xnn_hn[cond0 * 2 + 1]);
|
||||
#else
|
||||
#if CLUSTER_M < CLUSTER_N
|
||||
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
|
||||
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
|
||||
mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
|
||||
mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
|
||||
mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
|
||||
mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
|
||||
#else
|
||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
|
||||
mask0 = (unsigned int)(0x3 - 0x1 * cond0);
|
||||
mask1 = (unsigned int)(0x3 - 0x3 * cond0);
|
||||
mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1);
|
||||
mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1);
|
||||
#endif
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 0]);
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 1]);
|
||||
#endif
|
||||
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0);
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2);
|
||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec);
|
||||
MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec);
|
||||
cutoff_mask0 = simd_mask_and(cutoff_mask0, excl_mask0);
|
||||
cutoff_mask2 = simd_mask_and(cutoff_mask2, excl_mask2);
|
||||
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
|
||||
|
||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
|
||||
MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
|
||||
/*
|
||||
#if CLUSTER_M <= CLUSTER_N
|
||||
if(ci == ci_cj0) {
|
||||
cutoff_mask0 = simd_mask_and(cutoff_mask0, diagonal_mask0);
|
||||
cutoff_mask2 = simd_mask_and(cutoff_mask2, diagonal_mask2);
|
||||
}
|
||||
#else
|
||||
if(ci == ci_cj0) {
|
||||
cutoff_mask0 = cutoff_mask0 && diagonal_mask00;
|
||||
cutoff_mask2 = cutoff_mask2 && diagonal_mask02;
|
||||
} else if(ci == ci_cj1) {
|
||||
cutoff_mask0 = cutoff_mask0 && diagonal_mask10;
|
||||
cutoff_mask2 = cutoff_mask2 && diagonal_mask12;
|
||||
}
|
||||
#endif
|
||||
*/
|
||||
|
||||
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
||||
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||
|
||||
MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
|
||||
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
|
||||
MD_SIMD_FLOAT tx0 = select_by_mask(delx0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT ty0 = select_by_mask(dely0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT tz0 = select_by_mask(delz0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT tx2 = select_by_mask(delx2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT ty2 = select_by_mask(dely2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT tz2 = select_by_mask(delz2 * force2, cutoff_mask2);
|
||||
|
||||
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
|
||||
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
|
||||
fix0 += tx0;
|
||||
fiy0 += ty0;
|
||||
fiz0 += tz0;
|
||||
fix2 += tx2;
|
||||
fiy2 += ty2;
|
||||
fiz2 += tz2;
|
||||
|
||||
MD_SIMD_FLOAT tx0 = select_by_mask(simd_mul(delx0, force0), cutoff_mask0);
|
||||
MD_SIMD_FLOAT ty0 = select_by_mask(simd_mul(dely0, force0), cutoff_mask0);
|
||||
MD_SIMD_FLOAT tz0 = select_by_mask(simd_mul(delz0, force0), cutoff_mask0);
|
||||
MD_SIMD_FLOAT tx2 = select_by_mask(simd_mul(delx2, force2), cutoff_mask2);
|
||||
MD_SIMD_FLOAT ty2 = select_by_mask(simd_mul(dely2, force2), cutoff_mask2);
|
||||
MD_SIMD_FLOAT tz2 = select_by_mask(simd_mul(delz2, force2), cutoff_mask2);
|
||||
#ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ
|
||||
if(cj < CJ1_FROM_CI(atom->Nlocal)) {
|
||||
simd_h_decr3(cj_f, tx0 + tx2, ty0 + ty2, tz0 + tz2);
|
||||
}
|
||||
#else
|
||||
simd_h_decr3(cj_f, tx0 + tx2, ty0 + ty2, tz0 + tz2);
|
||||
#endif
|
||||
}
|
||||
|
||||
fix0 = simd_add(fix0, tx0);
|
||||
fiy0 = simd_add(fiy0, ty0);
|
||||
fiz0 = simd_add(fiz0, tz0);
|
||||
fix2 = simd_add(fix2, tx2);
|
||||
fiy2 = simd_add(fiy2, ty2);
|
||||
fiz2 = simd_add(fiz2, tz2);
|
||||
for(int k = numneighs_masked; k < numneighs; k++) {
|
||||
int cj = neighs[k].cj;
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
||||
|
||||
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
|
||||
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
|
||||
MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
|
||||
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||
|
||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec);
|
||||
MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec);
|
||||
|
||||
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
||||
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||
|
||||
MD_SIMD_FLOAT tx0 = select_by_mask(delx0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT ty0 = select_by_mask(dely0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT tz0 = select_by_mask(delz0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT tx2 = select_by_mask(delx2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT ty2 = select_by_mask(dely2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT tz2 = select_by_mask(delz2 * force2, cutoff_mask2);
|
||||
|
||||
fix0 += tx0;
|
||||
fiy0 += ty0;
|
||||
fiz0 += tz0;
|
||||
fix2 += tx2;
|
||||
fiy2 += ty2;
|
||||
fiz2 += tz2;
|
||||
|
||||
#ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ
|
||||
if(cj < CJ1_FROM_CI(atom->Nlocal)) {
|
||||
@@ -266,6 +391,8 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force");
|
||||
}
|
||||
|
||||
double E = getTimeStamp();
|
||||
DEBUG_MESSAGE("computeForceLJ_2xnn end\n");
|
||||
return E-S;
|
||||
@@ -274,7 +401,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
|
||||
int Nlocal = atom->Nlocal;
|
||||
int* neighs;
|
||||
NeighborCluster* neighs;
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
@@ -283,7 +410,6 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
|
||||
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
||||
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
||||
const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
@@ -296,9 +422,12 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
}
|
||||
|
||||
double S = getTimeStamp();
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
#pragma omp parallel for
|
||||
#pragma omp for
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_cj0 = CJ0_FROM_CI(ci);
|
||||
#if CLUSTER_M > CLUSTER_N
|
||||
@@ -309,6 +438,7 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[ci];
|
||||
int numneighs_masked = neighbor->numneigh_masked[ci];
|
||||
|
||||
MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]);
|
||||
MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]);
|
||||
@@ -323,61 +453,85 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
MD_SIMD_FLOAT fiy2 = simd_zero();
|
||||
MD_SIMD_FLOAT fiz2 = simd_zero();
|
||||
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int cj = neighs[k];
|
||||
for(int k = 0; k < numneighs_masked; k++) {
|
||||
int cj = neighs[k].cj;
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
int imask = neighs[k].imask;
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
unsigned int mask0, mask1, mask2, mask3;
|
||||
|
||||
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
|
||||
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
|
||||
MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
|
||||
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||
|
||||
#if CLUSTER_M == CLUSTER_N
|
||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||
mask0 = (unsigned int)(0xf - 0x1 * cond0);
|
||||
mask1 = (unsigned int)(0xf - 0x2 * cond0);
|
||||
mask2 = (unsigned int)(0xf - 0x4 * cond0);
|
||||
mask3 = (unsigned int)(0xf - 0x8 * cond0);
|
||||
#elif CLUSTER_M < CLUSTER_N
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_2xnn_fn[cond0 * 2 + 0]);
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_2xnn_fn[cond0 * 2 + 1]);
|
||||
#else
|
||||
#if CLUSTER_M < CLUSTER_N
|
||||
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
|
||||
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
|
||||
mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
|
||||
mask1 = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
|
||||
mask2 = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
|
||||
mask3 = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
|
||||
#else
|
||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
|
||||
mask0 = (unsigned int)(0x3 - 0x1 * cond0);
|
||||
mask1 = (unsigned int)(0x3 - 0x2 * cond0);
|
||||
mask2 = (unsigned int)(0x3 - 0x1 * cond1);
|
||||
mask3 = (unsigned int)(0x3 - 0x2 * cond1);
|
||||
#endif
|
||||
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0);
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2);
|
||||
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 0]);
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 1]);
|
||||
#endif
|
||||
|
||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
|
||||
MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
|
||||
|
||||
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
||||
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||
|
||||
MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
|
||||
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
|
||||
fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
|
||||
fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
|
||||
fiz0 = simd_masked_add(fiz0, simd_mul(delz0, force0), cutoff_mask0);
|
||||
fix2 = simd_masked_add(fix2, simd_mul(delx2, force2), cutoff_mask2);
|
||||
fiy2 = simd_masked_add(fiy2, simd_mul(dely2, force2), cutoff_mask2);
|
||||
fiz2 = simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2);
|
||||
}
|
||||
|
||||
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
|
||||
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
|
||||
for(int k = numneighs_masked; k < numneighs; k++) {
|
||||
int cj = neighs[k].cj;
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
|
||||
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
|
||||
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
|
||||
MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
|
||||
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||
|
||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec);
|
||||
MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec);
|
||||
|
||||
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
||||
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||
|
||||
fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
|
||||
fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
|
||||
@@ -398,6 +552,8 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force");
|
||||
}
|
||||
|
||||
double E = getTimeStamp();
|
||||
DEBUG_MESSAGE("computeForceLJ_2xnn end\n");
|
||||
return E-S;
|
||||
@@ -414,7 +570,7 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta
|
||||
double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
|
||||
int Nlocal = atom->Nlocal;
|
||||
int* neighs;
|
||||
NeighborCluster* neighs;
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
@@ -423,8 +579,6 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
|
||||
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
||||
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
||||
double S = getTimeStamp();
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
@@ -436,7 +590,13 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
}
|
||||
}
|
||||
|
||||
#pragma omp parallel for
|
||||
double S = getTimeStamp();
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
#pragma omp for
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_cj0 = CJ0_FROM_CI(ci);
|
||||
#if CLUSTER_M > CLUSTER_N
|
||||
@@ -447,6 +607,7 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[ci];
|
||||
int numneighs_masked = neighbor->numneigh_masked[ci];
|
||||
|
||||
MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]);
|
||||
MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]);
|
||||
@@ -473,53 +634,52 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_SIMD_FLOAT fiy3 = simd_zero();
|
||||
MD_SIMD_FLOAT fiz3 = simd_zero();
|
||||
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int cj = neighs[k];
|
||||
for(int k = 0; k < numneighs_masked; k++) {
|
||||
int cj = neighs[k].cj;
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
int imask = neighs[k].imask;
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
||||
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
|
||||
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
|
||||
MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
|
||||
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx1 = simd_sub(xi1_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely1 = simd_sub(yi1_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz1 = simd_sub(zi1_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx3 = simd_sub(xi3_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx1 = xi1_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely1 = yi1_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz1 = zi1_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx3 = xi3_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely3 = yi3_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz3 = zi3_tmp - zj_tmp;
|
||||
|
||||
#if CLUSTER_M == CLUSTER_N
|
||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x3 * cond0));
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x7 * cond0));
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0xf * cond0));
|
||||
#elif CLUSTER_M < CLUSTER_N
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 4 + 0]);
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 4 + 1]);
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 4 + 2]);
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 4 + 3]);
|
||||
#else
|
||||
#if CLUSTER_M < CLUSTER_N
|
||||
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
|
||||
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1));
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1));
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1));
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0xf * cond0 - 0xff * cond1));
|
||||
#else
|
||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond0));
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0x3 - 0x3 * cond0));
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0x3 - 0x3 * cond0 - 0x1 * cond1));
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0x3 - 0x3 * cond0 - 0x3 * cond1));
|
||||
#endif
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 0]);
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 1]);
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 2]);
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 3]);
|
||||
#endif
|
||||
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
|
||||
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, simd_mul(delz1, delz1)));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
|
||||
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, simd_mul(delz3, delz3)));
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, delz1 * delz1));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, delz3 * delz3));
|
||||
|
||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
|
||||
MD_SIMD_MASK cutoff_mask1 = simd_mask_and(excl_mask1, simd_mask_cond_lt(rsq1, cutforcesq_vec));
|
||||
@@ -531,28 +691,114 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||
MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
|
||||
|
||||
MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
|
||||
MD_SIMD_FLOAT sr6_1 = simd_mul(sr2_1, simd_mul(sr2_1, simd_mul(sr2_1, sigma6_vec)));
|
||||
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
|
||||
MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec)));
|
||||
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_1 = sr2_1 * sr2_1 * sr2_1 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_3 = sr2_3 * sr2_3 * sr2_3 * sigma6_vec;
|
||||
|
||||
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
|
||||
MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, eps_vec))));
|
||||
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
|
||||
MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, eps_vec))));
|
||||
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||
MD_SIMD_FLOAT force1 = c48_vec * sr6_1 * (sr6_1 - c05_vec) * sr2_1 * eps_vec;
|
||||
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||
MD_SIMD_FLOAT force3 = c48_vec * sr6_3 * (sr6_3 - c05_vec) * sr2_3 * eps_vec;
|
||||
|
||||
MD_SIMD_FLOAT tx0 = select_by_mask(simd_mul(delx0, force0), cutoff_mask0);
|
||||
MD_SIMD_FLOAT ty0 = select_by_mask(simd_mul(dely0, force0), cutoff_mask0);
|
||||
MD_SIMD_FLOAT tz0 = select_by_mask(simd_mul(delz0, force0), cutoff_mask0);
|
||||
MD_SIMD_FLOAT tx1 = select_by_mask(simd_mul(delx1, force1), cutoff_mask1);
|
||||
MD_SIMD_FLOAT ty1 = select_by_mask(simd_mul(dely1, force1), cutoff_mask1);
|
||||
MD_SIMD_FLOAT tz1 = select_by_mask(simd_mul(delz1, force1), cutoff_mask1);
|
||||
MD_SIMD_FLOAT tx2 = select_by_mask(simd_mul(delx2, force2), cutoff_mask2);
|
||||
MD_SIMD_FLOAT ty2 = select_by_mask(simd_mul(dely2, force2), cutoff_mask2);
|
||||
MD_SIMD_FLOAT tz2 = select_by_mask(simd_mul(delz2, force2), cutoff_mask2);
|
||||
MD_SIMD_FLOAT tx3 = select_by_mask(simd_mul(delx3, force3), cutoff_mask3);
|
||||
MD_SIMD_FLOAT ty3 = select_by_mask(simd_mul(dely3, force3), cutoff_mask3);
|
||||
MD_SIMD_FLOAT tz3 = select_by_mask(simd_mul(delz3, force3), cutoff_mask3);
|
||||
MD_SIMD_FLOAT tx0 = select_by_mask(delx0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT ty0 = select_by_mask(dely0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT tz0 = select_by_mask(delz0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT tx1 = select_by_mask(delx1 * force1, cutoff_mask1);
|
||||
MD_SIMD_FLOAT ty1 = select_by_mask(dely1 * force1, cutoff_mask1);
|
||||
MD_SIMD_FLOAT tz1 = select_by_mask(delz1 * force1, cutoff_mask1);
|
||||
MD_SIMD_FLOAT tx2 = select_by_mask(delx2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT ty2 = select_by_mask(dely2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT tz2 = select_by_mask(delz2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT tx3 = select_by_mask(delx3 * force3, cutoff_mask3);
|
||||
MD_SIMD_FLOAT ty3 = select_by_mask(dely3 * force3, cutoff_mask3);
|
||||
MD_SIMD_FLOAT tz3 = select_by_mask(delz3 * force3, cutoff_mask3);
|
||||
|
||||
fix0 = simd_add(fix0, tx0);
|
||||
fiy0 = simd_add(fiy0, ty0);
|
||||
fiz0 = simd_add(fiz0, tz0);
|
||||
fix1 = simd_add(fix1, tx1);
|
||||
fiy1 = simd_add(fiy1, ty1);
|
||||
fiz1 = simd_add(fiz1, tz1);
|
||||
fix2 = simd_add(fix2, tx2);
|
||||
fiy2 = simd_add(fiy2, ty2);
|
||||
fiz2 = simd_add(fiz2, tz2);
|
||||
fix3 = simd_add(fix3, tx3);
|
||||
fiy3 = simd_add(fiy3, ty3);
|
||||
fiz3 = simd_add(fiz3, tz3);
|
||||
|
||||
#ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ
|
||||
if(cj < CJ1_FROM_CI(atom->Nlocal)) {
|
||||
simd_store(&cj_f[CL_X_OFFSET], simd_load(&cj_f[CL_X_OFFSET]) - (tx0 + tx1 + tx2 + tx3));
|
||||
simd_store(&cj_f[CL_Y_OFFSET], simd_load(&cj_f[CL_Y_OFFSET]) - (ty0 + ty1 + ty2 + ty3));
|
||||
simd_store(&cj_f[CL_Z_OFFSET], simd_load(&cj_f[CL_Z_OFFSET]) - (tz0 + tz1 + tz2 + tz3));
|
||||
}
|
||||
#else
|
||||
simd_store(&cj_f[CL_X_OFFSET], simd_load(&cj_f[CL_X_OFFSET]) - (tx0 + tx1 + tx2 + tx3));
|
||||
simd_store(&cj_f[CL_Y_OFFSET], simd_load(&cj_f[CL_Y_OFFSET]) - (ty0 + ty1 + ty2 + ty3));
|
||||
simd_store(&cj_f[CL_Z_OFFSET], simd_load(&cj_f[CL_Z_OFFSET]) - (tz0 + tz1 + tz2 + tz3));
|
||||
#endif
|
||||
}
|
||||
|
||||
for(int k = numneighs_masked; k < numneighs; k++) {
|
||||
int cj = neighs[k].cj;
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
int imask = neighs[k].imask;
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
||||
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
|
||||
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
|
||||
MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
|
||||
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx1 = xi1_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely1 = yi1_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz1 = zi1_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx3 = xi3_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely3 = yi3_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz3 = zi3_tmp - zj_tmp;
|
||||
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, delz1 * delz1));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, delz3 * delz3));
|
||||
|
||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec);
|
||||
MD_SIMD_MASK cutoff_mask1 = simd_mask_cond_lt(rsq1, cutforcesq_vec);
|
||||
MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec);
|
||||
MD_SIMD_MASK cutoff_mask3 = simd_mask_cond_lt(rsq3, cutforcesq_vec);
|
||||
|
||||
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
||||
MD_SIMD_FLOAT sr2_1 = simd_reciprocal(rsq1);
|
||||
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||
MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
|
||||
|
||||
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_1 = sr2_1 * sr2_1 * sr2_1 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_3 = sr2_3 * sr2_3 * sr2_3 * sigma6_vec;
|
||||
|
||||
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||
MD_SIMD_FLOAT force1 = c48_vec * sr6_1 * (sr6_1 - c05_vec) * sr2_1 * eps_vec;
|
||||
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||
MD_SIMD_FLOAT force3 = c48_vec * sr6_3 * (sr6_3 - c05_vec) * sr2_3 * eps_vec;
|
||||
|
||||
MD_SIMD_FLOAT tx0 = select_by_mask(delx0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT ty0 = select_by_mask(dely0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT tz0 = select_by_mask(delz0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT tx1 = select_by_mask(delx1 * force1, cutoff_mask1);
|
||||
MD_SIMD_FLOAT ty1 = select_by_mask(dely1 * force1, cutoff_mask1);
|
||||
MD_SIMD_FLOAT tz1 = select_by_mask(delz1 * force1, cutoff_mask1);
|
||||
MD_SIMD_FLOAT tx2 = select_by_mask(delx2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT ty2 = select_by_mask(dely2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT tz2 = select_by_mask(delz2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT tx3 = select_by_mask(delx3 * force3, cutoff_mask3);
|
||||
MD_SIMD_FLOAT ty3 = select_by_mask(dely3 * force3, cutoff_mask3);
|
||||
MD_SIMD_FLOAT tz3 = select_by_mask(delz3 * force3, cutoff_mask3);
|
||||
|
||||
fix0 = simd_add(fix0, tx0);
|
||||
fiy0 = simd_add(fiy0, ty0);
|
||||
@@ -590,6 +836,8 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force");
|
||||
}
|
||||
|
||||
double E = getTimeStamp();
|
||||
DEBUG_MESSAGE("computeForceLJ_4xn end\n");
|
||||
return E-S;
|
||||
@@ -598,7 +846,7 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
|
||||
int Nlocal = atom->Nlocal;
|
||||
int* neighs;
|
||||
NeighborCluster* neighs;
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
@@ -607,8 +855,6 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
|
||||
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
||||
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
||||
double S = getTimeStamp();
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
@@ -620,7 +866,13 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
}
|
||||
}
|
||||
|
||||
#pragma omp parallel for
|
||||
double S = getTimeStamp();
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
#pragma omp for
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_cj0 = CJ0_FROM_CI(ci);
|
||||
#if CLUSTER_M > CLUSTER_N
|
||||
@@ -631,6 +883,7 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[ci];
|
||||
int numneighs_masked = neighbor->numneigh_masked[ci];
|
||||
|
||||
MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]);
|
||||
MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]);
|
||||
@@ -657,52 +910,51 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_SIMD_FLOAT fiy3 = simd_zero();
|
||||
MD_SIMD_FLOAT fiz3 = simd_zero();
|
||||
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int cj = neighs[k];
|
||||
for(int k = 0; k < numneighs_masked; k++) {
|
||||
int cj = neighs[k].cj;
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
int imask = neighs[k].imask;
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
|
||||
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
|
||||
MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
|
||||
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx1 = simd_sub(xi1_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely1 = simd_sub(yi1_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz1 = simd_sub(zi1_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx3 = simd_sub(xi3_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx1 = xi1_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely1 = yi1_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz1 = zi1_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx3 = xi3_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely3 = yi3_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz3 = zi3_tmp - zj_tmp;
|
||||
|
||||
#if CLUSTER_M == CLUSTER_N
|
||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0));
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0));
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0));
|
||||
#elif CLUSTER_M < CLUSTER_N
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 4 + 0]);
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 4 + 1]);
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 4 + 2]);
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 4 + 3]);
|
||||
#else
|
||||
#if CLUSTER_M < CLUSTER_N
|
||||
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
|
||||
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1));
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1));
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1));
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1));
|
||||
#else
|
||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond0));
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond0));
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond1));
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond1));
|
||||
#endif
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0]);
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 1]);
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 2]);
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 3]);
|
||||
#endif
|
||||
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
|
||||
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, simd_mul(delz1, delz1)));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
|
||||
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, simd_mul(delz3, delz3)));
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, delz1 * delz1));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, delz3 * delz3));
|
||||
|
||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
|
||||
MD_SIMD_MASK cutoff_mask1 = simd_mask_and(excl_mask1, simd_mask_cond_lt(rsq1, cutforcesq_vec));
|
||||
@@ -714,28 +966,88 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||
MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
|
||||
|
||||
MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
|
||||
MD_SIMD_FLOAT sr6_1 = simd_mul(sr2_1, simd_mul(sr2_1, simd_mul(sr2_1, sigma6_vec)));
|
||||
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
|
||||
MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec)));
|
||||
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_1 = sr2_1 * sr2_1 * sr2_1 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_3 = sr2_3 * sr2_3 * sr2_3 * sigma6_vec;
|
||||
|
||||
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
|
||||
MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, eps_vec))));
|
||||
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
|
||||
MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, eps_vec))));
|
||||
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||
MD_SIMD_FLOAT force1 = c48_vec * sr6_1 * (sr6_1 - c05_vec) * sr2_1 * eps_vec;
|
||||
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||
MD_SIMD_FLOAT force3 = c48_vec * sr6_3 * (sr6_3 - c05_vec) * sr2_3 * eps_vec;
|
||||
|
||||
fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
|
||||
fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
|
||||
fiz0 = simd_masked_add(fiz0, simd_mul(delz0, force0), cutoff_mask0);
|
||||
fix1 = simd_masked_add(fix1, simd_mul(delx1, force1), cutoff_mask1);
|
||||
fiy1 = simd_masked_add(fiy1, simd_mul(dely1, force1), cutoff_mask1);
|
||||
fiz1 = simd_masked_add(fiz1, simd_mul(delz1, force1), cutoff_mask1);
|
||||
fix2 = simd_masked_add(fix2, simd_mul(delx2, force2), cutoff_mask2);
|
||||
fiy2 = simd_masked_add(fiy2, simd_mul(dely2, force2), cutoff_mask2);
|
||||
fiz2 = simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2);
|
||||
fix3 = simd_masked_add(fix3, simd_mul(delx3, force3), cutoff_mask3);
|
||||
fiy3 = simd_masked_add(fiy3, simd_mul(dely3, force3), cutoff_mask3);
|
||||
fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3);
|
||||
fix0 = simd_masked_add(fix0, delx0 * force0, cutoff_mask0);
|
||||
fiy0 = simd_masked_add(fiy0, dely0 * force0, cutoff_mask0);
|
||||
fiz0 = simd_masked_add(fiz0, delz0 * force0, cutoff_mask0);
|
||||
fix1 = simd_masked_add(fix1, delx1 * force1, cutoff_mask1);
|
||||
fiy1 = simd_masked_add(fiy1, dely1 * force1, cutoff_mask1);
|
||||
fiz1 = simd_masked_add(fiz1, delz1 * force1, cutoff_mask1);
|
||||
fix2 = simd_masked_add(fix2, delx2 * force2, cutoff_mask2);
|
||||
fiy2 = simd_masked_add(fiy2, dely2 * force2, cutoff_mask2);
|
||||
fiz2 = simd_masked_add(fiz2, delz2 * force2, cutoff_mask2);
|
||||
fix3 = simd_masked_add(fix3, delx3 * force3, cutoff_mask3);
|
||||
fiy3 = simd_masked_add(fiy3, dely3 * force3, cutoff_mask3);
|
||||
fiz3 = simd_masked_add(fiz3, delz3 * force3, cutoff_mask3);
|
||||
}
|
||||
|
||||
for(int k = numneighs_masked; k < numneighs; k++) {
|
||||
int cj = neighs[k].cj;
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
int imask = neighs[k].imask;
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
|
||||
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
|
||||
MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
|
||||
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx1 = xi1_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely1 = yi1_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz1 = zi1_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx3 = xi3_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely3 = yi3_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz3 = zi3_tmp - zj_tmp;
|
||||
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, delz1 * delz1));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, delz3 * delz3));
|
||||
|
||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec);
|
||||
MD_SIMD_MASK cutoff_mask1 = simd_mask_cond_lt(rsq1, cutforcesq_vec);
|
||||
MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec);
|
||||
MD_SIMD_MASK cutoff_mask3 = simd_mask_cond_lt(rsq3, cutforcesq_vec);
|
||||
|
||||
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
||||
MD_SIMD_FLOAT sr2_1 = simd_reciprocal(rsq1);
|
||||
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||
MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
|
||||
|
||||
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_1 = sr2_1 * sr2_1 * sr2_1 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_3 = sr2_3 * sr2_3 * sr2_3 * sigma6_vec;
|
||||
|
||||
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||
MD_SIMD_FLOAT force1 = c48_vec * sr6_1 * (sr6_1 - c05_vec) * sr2_1 * eps_vec;
|
||||
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||
MD_SIMD_FLOAT force3 = c48_vec * sr6_3 * (sr6_3 - c05_vec) * sr2_3 * eps_vec;
|
||||
|
||||
fix0 = simd_masked_add(fix0, delx0 * force0, cutoff_mask0);
|
||||
fiy0 = simd_masked_add(fiy0, dely0 * force0, cutoff_mask0);
|
||||
fiz0 = simd_masked_add(fiz0, delz0 * force0, cutoff_mask0);
|
||||
fix1 = simd_masked_add(fix1, delx1 * force1, cutoff_mask1);
|
||||
fiy1 = simd_masked_add(fiy1, dely1 * force1, cutoff_mask1);
|
||||
fiz1 = simd_masked_add(fiz1, delz1 * force1, cutoff_mask1);
|
||||
fix2 = simd_masked_add(fix2, delx2 * force2, cutoff_mask2);
|
||||
fiy2 = simd_masked_add(fiy2, dely2 * force2, cutoff_mask2);
|
||||
fiz2 = simd_masked_add(fiz2, delz2 * force2, cutoff_mask2);
|
||||
fix3 = simd_masked_add(fix3, delx3 * force3, cutoff_mask3);
|
||||
fiy3 = simd_masked_add(fiy3, dely3 * force3, cutoff_mask3);
|
||||
fiz3 = simd_masked_add(fiz3, delz3 * force3, cutoff_mask3);
|
||||
}
|
||||
|
||||
simd_incr_reduced_sum(&ci_f[CL_X_OFFSET], fix0, fix1, fix2, fix3);
|
||||
@@ -744,10 +1056,13 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
|
||||
addStat(stats->calculated_forces, 1);
|
||||
addStat(stats->num_neighs, numneighs);
|
||||
addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
|
||||
addStat(stats->force_iters, (long long int)((double)numneighs));
|
||||
//addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force");
|
||||
}
|
||||
|
||||
double E = getTimeStamp();
|
||||
DEBUG_MESSAGE("computeForceLJ_4xn end\n");
|
||||
return E-S;
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
# define KERNEL_NAME "CUDA"
|
||||
# define CLUSTER_M 8
|
||||
# define CLUSTER_N VECTOR_WIDTH
|
||||
# define UNROLL_J 1
|
||||
# define computeForceLJ computeForceLJ_cuda
|
||||
# define initialIntegrate cudaInitialIntegrate
|
||||
# define finalIntegrate cudaFinalIntegrate
|
||||
@@ -32,11 +33,15 @@
|
||||
# if VECTOR_WIDTH > CLUSTER_M * 2
|
||||
# define KERNEL_NAME "Simd2xNN"
|
||||
# define CLUSTER_N (VECTOR_WIDTH / 2)
|
||||
# define UNROLL_I 4
|
||||
# define UNROLL_J 2
|
||||
# define computeForceLJ computeForceLJ_2xnn
|
||||
// Simd4xN
|
||||
# else
|
||||
# define KERNEL_NAME "Simd4xN"
|
||||
# define CLUSTER_N VECTOR_WIDTH
|
||||
# define UNROLL_I 4
|
||||
# define UNROLL_J 1
|
||||
# define computeForceLJ computeForceLJ_4xn
|
||||
# endif
|
||||
# ifdef USE_REFERENCE_VERSION
|
||||
@@ -116,9 +121,17 @@ typedef struct {
|
||||
Cluster *iclusters, *jclusters;
|
||||
int *icluster_bin;
|
||||
int dummy_cj;
|
||||
MD_UINT *exclusion_filter;
|
||||
MD_FLOAT *diagonal_4xn_j_minus_i;
|
||||
MD_FLOAT *diagonal_2xnn_j_minus_i;
|
||||
unsigned int masks_2xnn_hn[8];
|
||||
unsigned int masks_2xnn_fn[8];
|
||||
unsigned int masks_4xn_hn[16];
|
||||
unsigned int masks_4xn_fn[16];
|
||||
} Atom;
|
||||
|
||||
extern void initAtom(Atom*);
|
||||
extern void initMasks(Atom*);
|
||||
extern void createAtom(Atom*, Parameter*);
|
||||
extern int readAtom(Atom*, Parameter*);
|
||||
extern int readAtom_pdb(Atom*, Parameter*);
|
||||
|
||||
@@ -9,13 +9,35 @@
|
||||
|
||||
#ifndef __NEIGHBOR_H_
|
||||
#define __NEIGHBOR_H_
|
||||
// Interaction masks from GROMACS, things to remember (maybe these confused just me):
|
||||
// 1. These are not "exclusion" masks as the name suggests in GROMACS, but rather
|
||||
// interaction masks (1 = interaction, 0 = no interaction)
|
||||
// 2. These are inverted (maybe because that is how they are used in AVX2/AVX512 masking),
|
||||
// so read them from right to left (least significant to most significant bit)
|
||||
// The all-interactions mask is the same for all kernels
|
||||
#define NBNXN_INTERACTION_MASK_ALL 0xffffffffU
|
||||
// 4x4 kernel diagonal mask
|
||||
#define NBNXN_INTERACTION_MASK_DIAG 0x08ceU
|
||||
// 4x2 kernel diagonal masks
|
||||
#define NBNXN_INTERACTION_MASK_DIAG_J2_0 0x0002U
|
||||
#define NBNXN_INTERACTION_MASK_DIAG_J2_1 0x002fU
|
||||
// 4x8 kernel diagonal masks
|
||||
#define NBNXN_INTERACTION_MASK_DIAG_J8_0 0xf0f8fcfeU
|
||||
#define NBNXN_INTERACTION_MASK_DIAG_J8_1 0x0080c0e0U
|
||||
|
||||
/* One entry of a cluster neighbor list: a j-cluster index plus the
 * interaction mask that applies to the (i-cluster, j-cluster) pair. */
typedef struct {
|
||||
int cj; // index of the neighboring j-cluster (read back as neighs[k].cj)
|
||||
unsigned int imask; // interaction mask for this pair; the list builder stores the get_imask_* result here and 0 for dummy padding entries
|
||||
} NeighborCluster;
|
||||
|
||||
typedef struct {
|
||||
int every;
|
||||
int ncalls;
|
||||
int* neighbors;
|
||||
int maxneighs;
|
||||
int* numneigh;
|
||||
int* numneigh_masked;
|
||||
int half_neigh;
|
||||
NeighborCluster* neighbors;
|
||||
} Neighbor;
|
||||
|
||||
extern void initNeighbor(Neighbor*, Parameter*);
|
||||
|
||||
+80
-10
@@ -56,6 +56,7 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) {
|
||||
neighbor->half_neigh = param->half_neigh;
|
||||
neighbor->maxneighs = 100;
|
||||
neighbor->numneigh = NULL;
|
||||
neighbor->numneigh_masked = NULL;
|
||||
neighbor->neighbors = NULL;
|
||||
}
|
||||
|
||||
@@ -184,6 +185,43 @@ int atomDistanceInRange(Atom *atom, int ci, int cj, MD_FLOAT rsq) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Returns a diagonal or off-diagonal interaction mask for plain C lists */
|
||||
static unsigned int get_imask(int rdiag, int ci, int cj) {
|
||||
return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
|
||||
}
|
||||
|
||||
/* Returns a diagonal or off-diagonal interaction mask for cj-size=2 */
|
||||
static unsigned int get_imask_simd_j2(int rdiag, int ci, int cj) {
|
||||
return (rdiag && ci * 2 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_0
|
||||
: (rdiag && ci * 2 + 1 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_1
|
||||
: NBNXN_INTERACTION_MASK_ALL));
|
||||
}
|
||||
|
||||
/* Returns a diagonal or off-diagonal interaction mask for cj-size=4 */
|
||||
static unsigned int get_imask_simd_j4(int rdiag, int ci, int cj) {
|
||||
return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
|
||||
}
|
||||
|
||||
/* Returns a diagonal or off-diagonal interaction mask for cj-size=8 */
|
||||
static unsigned int get_imask_simd_j8(int rdiag, int ci, int cj) {
|
||||
return (rdiag && ci == cj * 2 ? NBNXN_INTERACTION_MASK_DIAG_J8_0
|
||||
: (rdiag && ci == cj * 2 + 1 ? NBNXN_INTERACTION_MASK_DIAG_J8_1
|
||||
: NBNXN_INTERACTION_MASK_ALL));
|
||||
}
|
||||
|
||||
#if VECTOR_WIDTH == 2
|
||||
# define get_imask_simd_4xn get_imask_simd_j2
|
||||
#elif VECTOR_WIDTH== 4
|
||||
# define get_imask_simd_4xn get_imask_simd_j4
|
||||
#elif VECTOR_WIDTH == 8
|
||||
# define get_imask_simd_4xn get_imask_simd_j8
|
||||
# define get_imask_simd_2xnn get_imask_simd_j4
|
||||
#elif VECTOR_WIDTH == 16
|
||||
# define get_imask_simd_2xnn get_imask_simd_j8
|
||||
#else
|
||||
# error "Invalid cluster configuration"
|
||||
#endif
|
||||
|
||||
void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||
DEBUG_MESSAGE("buildNeighbor start\n");
|
||||
|
||||
@@ -193,7 +231,8 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||
if(neighbor->numneigh) free(neighbor->numneigh);
|
||||
if(neighbor->neighbors) free(neighbor->neighbors);
|
||||
neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
|
||||
neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*));
|
||||
neighbor->numneigh_masked = (int*) malloc(nmax * sizeof(int));
|
||||
neighbor->neighbors = (NeighborCluster*) malloc(nmax * neighbor->maxneighs * sizeof(NeighborCluster));
|
||||
}
|
||||
|
||||
MD_FLOAT bbx = 0.5 * (binsizex + binsizex);
|
||||
@@ -209,8 +248,8 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_cj1 = CJ1_FROM_CI(ci);
|
||||
int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
|
||||
int n = 0;
|
||||
NeighborCluster *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
|
||||
int n = 0, nmasked = 0;
|
||||
int ibin = atom->icluster_bin[ci];
|
||||
MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx;
|
||||
MD_FLOAT ibb_xmax = atom->iclusters[ci].bbmaxx;
|
||||
@@ -275,7 +314,28 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||
|
||||
if(d_bb_sq < cutneighsq) {
|
||||
if(d_bb_sq < rbb_sq || atomDistanceInRange(atom, ci, cj, cutneighsq)) {
|
||||
neighptr[n++] = cj;
|
||||
// We use true (1) for rdiag because we only care if there are masks
|
||||
// at all, and when this is set to false (0) the self-exclusions are
|
||||
// not accounted for, which breaks the optimized version!
|
||||
unsigned int imask;
|
||||
#if CLUSTER_N == (VECTOR_WIDTH / 2) // 2xnn
|
||||
imask = get_imask_simd_2xnn(1, ci, cj);
|
||||
#else // 4xn
|
||||
imask = get_imask_simd_4xn(1, ci, cj);
|
||||
#endif
|
||||
|
||||
if(imask == NBNXN_INTERACTION_MASK_ALL) {
|
||||
neighptr[n].cj = cj;
|
||||
neighptr[n].imask = imask;
|
||||
} else {
|
||||
neighptr[n].cj = neighptr[nmasked].cj;
|
||||
neighptr[n].imask = neighptr[nmasked].imask;
|
||||
neighptr[nmasked].cj = cj;
|
||||
neighptr[nmasked].imask = imask;
|
||||
nmasked++;
|
||||
}
|
||||
|
||||
n++;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -297,11 +357,14 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||
// Fill neighbor list with dummy values to fit vector width
|
||||
if(CLUSTER_N < VECTOR_WIDTH) {
|
||||
while(n % (VECTOR_WIDTH / CLUSTER_N)) {
|
||||
neighptr[n++] = atom->dummy_cj; // Last cluster is always a dummy cluster
|
||||
neighptr[n].cj = atom->dummy_cj; // Last cluster is always a dummy cluster
|
||||
neighptr[n].imask = 0;
|
||||
n++;
|
||||
}
|
||||
}
|
||||
|
||||
neighbor->numneigh[ci] = n;
|
||||
neighbor->numneigh_masked[ci] = nmasked;
|
||||
if(n >= neighbor->maxneighs) {
|
||||
resize = 1;
|
||||
|
||||
@@ -315,7 +378,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||
fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
|
||||
neighbor->maxneighs = new_maxneighs * 1.2;
|
||||
free(neighbor->neighbors);
|
||||
neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
|
||||
// Each list entry is a NeighborCluster (cj + imask), so the buffer must be sized per NeighborCluster, not per int
neighbor->neighbors = (NeighborCluster*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(NeighborCluster));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -370,23 +433,27 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
||||
MD_FLOAT cutsq = cutneighsq;
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int *neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
||||
NeighborCluster *neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[ci];
|
||||
int numneighs_masked = neighbor->numneigh_masked[ci];
|
||||
int k = 0;
|
||||
|
||||
// Remove dummy clusters if necessary
|
||||
if(CLUSTER_N < VECTOR_WIDTH) {
|
||||
while(neighs[numneighs - 1] == atom->dummy_cj) {
|
||||
while(neighs[numneighs - 1].cj == atom->dummy_cj) {
|
||||
numneighs--;
|
||||
}
|
||||
}
|
||||
|
||||
while(k < numneighs) {
|
||||
int cj = neighs[k];
|
||||
int cj = neighs[k].cj;
|
||||
if(atomDistanceInRange(atom, ci, cj, cutsq)) {
|
||||
k++;
|
||||
} else {
|
||||
numneighs--;
|
||||
if(k < numneighs_masked) {
|
||||
numneighs_masked--;
|
||||
}
|
||||
neighs[k] = neighs[numneighs];
|
||||
}
|
||||
}
|
||||
@@ -394,11 +461,14 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
||||
// Re-add dummy clusters if necessary
|
||||
if(CLUSTER_N < VECTOR_WIDTH) {
|
||||
while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) {
|
||||
neighs[numneighs++] = atom->dummy_cj; // Last cluster is always a dummy cluster
|
||||
neighs[numneighs].cj = atom->dummy_cj; // Last cluster is always a dummy cluster
|
||||
neighs[numneighs].imask = 0;
|
||||
numneighs++;
|
||||
}
|
||||
}
|
||||
|
||||
neighbor->numneigh[ci] = numneighs;
|
||||
neighbor->numneigh_masked[ci] = numneighs_masked;
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("pruneNeighbor end\n");
|
||||
|
||||
+3
-2
@@ -13,7 +13,7 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
|
||||
MEM_TRACER_INIT;
|
||||
INDEX_TRACER_INIT;
|
||||
int Nlocal = atom->Nlocal;
|
||||
int* neighs;
|
||||
NeighborCluster* neighs;
|
||||
//MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
|
||||
|
||||
INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
|
||||
@@ -34,7 +34,8 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
|
||||
DIST_TRACE(neighs, numneighs);
|
||||
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
MEM_TRACE(neighs[k], 'R');
|
||||
int j = neighs[k].cj;
|
||||
MEM_TRACE(j, 'R');
|
||||
MEM_TRACE(atom_x(j), 'R');
|
||||
MEM_TRACE(atom_y(j), 'R');
|
||||
MEM_TRACE(atom_z(j), 'R');
|
||||
|
||||
@@ -7,6 +7,7 @@ ANSI_CFLAGS += -pedantic
|
||||
ANSI_CFLAGS += -Wextra
|
||||
|
||||
CFLAGS = -Ofast -march=native $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
|
||||
#CFLAGS = -Ofast -march=core-avx2 $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
|
||||
#CFLAGS = -O3 -march=cascadelake $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
|
||||
#CFLAGS = -Ofast $(ANSI_CFLAGS) -g #-Xpreprocessor -fopenmp -g
|
||||
ASFLAGS = -masm=intel
|
||||
|
||||
+20
-4
@@ -6,13 +6,29 @@ ANSI_CFLAGS += -std=c99
|
||||
ANSI_CFLAGS += -pedantic
|
||||
ANSI_CFLAGS += -Wextra
|
||||
|
||||
#CFLAGS = -O0 -g -std=c99 -fargument-noalias
|
||||
ifeq ($(ISA),AVX512)
|
||||
CFLAGS = -Ofast -mavx512f -mavx512vl -mavx512bw -mavx512dq -mavx512cd -ffast-math -funroll-loops # -fopenmp
|
||||
#CFLAGS = -O3 -march=cascadelake -ffast-math -funroll-loops # -fopenmp
|
||||
CFLAGS = -Ofast -march=native -ffast-math -funroll-loops # -fopenmp
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),AVX2)
|
||||
#CFLAGS = -Ofast -march=native -mavx2 -ffast-math -funroll-loops # -fopenmp
|
||||
#CFLAGS = -Ofast -march=native -ffast-math -funroll-loops # -fopenmp
|
||||
#CFLAGS = -O3 -march=native -ffast-math -funroll-loops # -fopenmp
|
||||
#CFLAGS = -O3 -march=znver1 -ffast-math -funroll-loops # -fopenmp
|
||||
#CFLAGS = -Ofast -mavx2 -ffast-math -funroll-loops # -fopenmp
|
||||
CFLAGS = -Ofast -mavx2 -mfma -ffast-math -funroll-loops # -fopenmp
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),AVX)
|
||||
CFLAGS = -Ofast -mavx -ffast-math -funroll-loops # -fopenmp
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),SSE)
|
||||
CFLAGS = -Ofast -msse4.2 -ffast-math -funroll-loops # -fopenmp
|
||||
endif
|
||||
|
||||
#CFLAGS = -O0 -g -std=c99 -fargument-noalias
|
||||
#CFLAGS = -Ofast -march=native -ffast-math -funroll-loops # -fopenmp
|
||||
#CFLAGS = -O3 -march=native -ffast-math -funroll-loops # -fopenmp
|
||||
ASFLAGS = #-masm=intel
|
||||
LFLAGS =
|
||||
DEFINES = -D_GNU_SOURCE -DNO_ZMM_INTRIN
|
||||
|
||||
+17
-3
@@ -3,11 +3,25 @@ LINKER = $(CC)
|
||||
|
||||
OPENMP = #-qopenmp
|
||||
PROFILE = #-profile-functions -g -pg
|
||||
|
||||
ifeq ($(ISA),AVX512)
|
||||
OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
|
||||
#OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
|
||||
#OPTS = -Ofast -xAVX $(PROFILE)
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),AVX2)
|
||||
OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
|
||||
#OPTS = -Ofast -xAVX2 $(PROFILE)
|
||||
#OPTS = -Ofast -xSSE4.2 $(PROFILE)
|
||||
#OPTS = -Ofast -march=core-avx2 $(PROFILE)
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),AVX)
|
||||
OPTS = -Ofast -xAVX $(PROFILE)
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),SSE)
|
||||
OPTS = -Ofast -xSSE4.2 $(PROFILE)
|
||||
endif
|
||||
|
||||
#OPTS = -Ofast -no-vec $(PROFILE)
|
||||
#OPTS = -Ofast -xHost $(PROFILE)
|
||||
CFLAGS = $(PROFILE) -restrict $(OPENMP) $(OPTS)
|
||||
|
||||
+21
-6
@@ -3,13 +3,28 @@ LINKER = $(CC)
|
||||
|
||||
OPENMP = #-qopenmp
|
||||
PROFILE = #-profile-functions -g -pg
|
||||
#OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
|
||||
#OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
|
||||
#OPTS = -Ofast -xAVX $(PROFILE)
|
||||
#OPTS = -Ofast -xAVX2 $(PROFILE)
|
||||
#OPTS = -Ofast -xSSE4.2 $(PROFILE)
|
||||
|
||||
ifeq ($(ISA),AVX512)
|
||||
OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
|
||||
#OPTS = -Ofast -march=cascadelake -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),AVX2)
|
||||
OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
|
||||
#OPTS = -Ofast -xHost $(PROFILE)
|
||||
#OPTS = -Ofast -march=core-avx2 $(PROFILE)
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),AVX)
|
||||
OPTS = -Ofast -xAVX $(PROFILE)
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),SSE)
|
||||
OPTS = -Ofast -xSSE4.2 $(PROFILE)
|
||||
endif
|
||||
|
||||
#OPTS = -Ofast -no-vec $(PROFILE)
|
||||
OPTS = -Ofast -xHost $(PROFILE)
|
||||
#OPTS = -Ofast -xHost $(PROFILE)
|
||||
CFLAGS = $(PROFILE) $(OPENMP) $(OPTS)
|
||||
ASFLAGS = #-masm=intel
|
||||
LFLAGS = $(PROFILE) $(OPTS) $(OPENMP)
|
||||
|
||||
+4
-2
@@ -9,13 +9,15 @@ else ifeq ($(strip $(ISA)), AVX_FMA)
|
||||
__ISA_AVX_FMA__=true
|
||||
__SIMD_WIDTH_DBL__=4
|
||||
else ifeq ($(strip $(ISA)), AVX2)
|
||||
__ISA_AVX2__=true
|
||||
#__SIMD_KERNEL__=true
|
||||
__ISA_AVX2__=true
|
||||
__SIMD_WIDTH_DBL__=4
|
||||
else ifeq ($(strip $(ISA)), AVX512)
|
||||
__ISA_AVX512__=true
|
||||
__SIMD_KERNEL__=true
|
||||
__SIMD_WIDTH_DBL__=8
|
||||
ifeq ($(strip $(DATA_TYPE)), DP)
|
||||
__SIMD_KERNEL__=true
|
||||
endif
|
||||
endif
|
||||
|
||||
# SIMD width is specified in double-precision, hence it may
|
||||
|
||||
+13
-1
@@ -31,8 +31,12 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
|
||||
int nrho = eam->nrho; int nrho_tot = eam->nrho_tot;
|
||||
double S = getTimeStamp();
|
||||
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force_eam_fp");
|
||||
#pragma omp parallel for
|
||||
|
||||
#pragma omp for
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
@@ -95,13 +99,19 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force_eam_fp");
|
||||
}
|
||||
|
||||
// We still need to update fp for PBC atoms
|
||||
for(int i = 0; i < atom->Nghost; i++) {
|
||||
fp[Nlocal + i] = fp[atom->border_map[i]];
|
||||
}
|
||||
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force_eam");
|
||||
|
||||
#pragma omp for
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
@@ -192,6 +202,8 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force_eam");
|
||||
}
|
||||
|
||||
double E = getTimeStamp();
|
||||
return E-S;
|
||||
}
|
||||
|
||||
+31
-9
@@ -26,17 +26,22 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
#endif
|
||||
const MD_FLOAT num1 = 1.0;
|
||||
const MD_FLOAT num48 = 48.0;
|
||||
const MD_FLOAT num05 = 0.5;
|
||||
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
atom_fx(i) = 0.0;
|
||||
atom_fy(i) = 0.0;
|
||||
atom_fz(i) = 0.0;
|
||||
}
|
||||
|
||||
double S = getTimeStamp();
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
#pragma omp parallel for
|
||||
#pragma omp for
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
@@ -67,9 +72,9 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
|
||||
#endif
|
||||
|
||||
if(rsq < cutforcesq) {
|
||||
MD_FLOAT sr2 = 1.0 / rsq;
|
||||
MD_FLOAT sr2 = num1 / rsq;
|
||||
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
|
||||
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
|
||||
MD_FLOAT force = num48 * sr6 * (sr6 - num05) * sr2 * epsilon;
|
||||
fix += delx * force;
|
||||
fiy += dely * force;
|
||||
fiz += delz * force;
|
||||
@@ -90,6 +95,8 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force");
|
||||
}
|
||||
|
||||
double E = getTimeStamp();
|
||||
return E-S;
|
||||
}
|
||||
@@ -102,6 +109,9 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
#endif
|
||||
const MD_FLOAT num1 = 1.0;
|
||||
const MD_FLOAT num48 = 48.0;
|
||||
const MD_FLOAT num05 = 0.5;
|
||||
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
atom_fx(i) = 0.0;
|
||||
@@ -110,8 +120,12 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
}
|
||||
|
||||
double S = getTimeStamp();
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("forceLJ-halfneigh");
|
||||
|
||||
#pragma omp for
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
@@ -146,9 +160,9 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
#endif
|
||||
|
||||
if(rsq < cutforcesq) {
|
||||
MD_FLOAT sr2 = 1.0 / rsq;
|
||||
MD_FLOAT sr2 = num1 / rsq;
|
||||
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
|
||||
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
|
||||
MD_FLOAT force = num48 * sr6 * (sr6 - num05) * sr2 * epsilon;
|
||||
fix += delx * force;
|
||||
fiy += dely * force;
|
||||
fiz += delz * force;
|
||||
@@ -171,6 +185,8 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("forceLJ-halfneigh");
|
||||
}
|
||||
|
||||
double E = getTimeStamp();
|
||||
return E-S;
|
||||
}
|
||||
@@ -189,7 +205,6 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig
|
||||
}
|
||||
|
||||
double S = getTimeStamp();
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
#ifndef __SIMD_KERNEL__
|
||||
fprintf(stderr, "Error: SIMD kernel not implemented for specified instruction set!");
|
||||
@@ -201,7 +216,12 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig
|
||||
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
||||
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
||||
|
||||
#pragma omp parallel for
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
#pragma omp for
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
@@ -242,9 +262,11 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig
|
||||
atom_fy(i) += simd_h_reduce_sum(fiy);
|
||||
atom_fz(i) += simd_h_reduce_sum(fiz);
|
||||
}
|
||||
#endif
|
||||
|
||||
LIKWID_MARKER_STOP("force");
|
||||
}
|
||||
#endif
|
||||
|
||||
double E = getTimeStamp();
|
||||
return E-S;
|
||||
}
|
||||
|
||||
@@ -0,0 +1,88 @@
|
||||
--------------------------------------------------------------------------------
|
||||
CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
|
||||
CPU type: Intel Cascadelake SP processor
|
||||
CPU clock: 2.49 GHz
|
||||
--------------------------------------------------------------------------------
|
||||
Initializing parameters...
|
||||
Initializing atoms...
|
||||
Creating atoms...
|
||||
Pattern: seq
|
||||
Number of timesteps: 200
|
||||
Number of atoms: 256
|
||||
Number of neighbors per atom: 1024
|
||||
Number of times to replicate neighbor lists: 1
|
||||
Estimated total data volume (kB): 1062.9120
|
||||
Estimated atom data volume (kB): 6.1440
|
||||
Estimated neighborlist data volume (kB): 1050.6240
|
||||
Initializing neighbor lists...
|
||||
Creating neighbor lists...
|
||||
Computing forces...
|
||||
Total time: 0.2735, Mega atom updates/s: 0.1872
|
||||
Cycles per atom: 10682.8568, Cycles per neighbor: 10.4325
|
||||
Statistics:
|
||||
Vector width: 8, Processor frequency: 2.0000 GHz
|
||||
Average neighbors per atom: 1018.9055
|
||||
Average SIMD iterations per atom: 127.3632
|
||||
Total number of computed pair interactions: 52428800
|
||||
Total number of SIMD iterations: 6553600
|
||||
Useful read data volume for force computation: 1.47GB
|
||||
Cycles/SIMD iteration: 83.4598
|
||||
--------------------------------------------------------------------------------
|
||||
Region force, Group 1: MEM_DP
|
||||
+-------------------+------------+
|
||||
| Region Info | HWThread 0 |
|
||||
+-------------------+------------+
|
||||
| RDTSC Runtime [s] | 0.110776 |
|
||||
| call count | 200 |
|
||||
+-------------------+------------+
|
||||
|
||||
+------------------------------------------+---------+------------+
|
||||
| Event | Counter | HWThread 0 |
|
||||
+------------------------------------------+---------+------------+
|
||||
| INSTR_RETIRED_ANY | FIXC0 | 267036300 |
|
||||
| CPU_CLK_UNHALTED_CORE | FIXC1 | 219034500 |
|
||||
| CPU_CLK_UNHALTED_REF | FIXC2 | 273793400 |
|
||||
| PWR_PKG_ENERGY | PWR0 | 10.9296 |
|
||||
| PWR_DRAM_ENERGY | PWR3 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | PMC0 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | PMC1 | 159400 |
|
||||
| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | PMC2 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 197068800 |
|
||||
| CAS_COUNT_RD | MBOX0C0 | 8643 |
|
||||
| CAS_COUNT_WR | MBOX0C1 | 1367 |
|
||||
| CAS_COUNT_RD | MBOX1C0 | 9124 |
|
||||
| CAS_COUNT_WR | MBOX1C1 | 1354 |
|
||||
| CAS_COUNT_RD | MBOX2C0 | 9138 |
|
||||
| CAS_COUNT_WR | MBOX2C1 | 1356 |
|
||||
| CAS_COUNT_RD | MBOX3C0 | 5586 |
|
||||
| CAS_COUNT_WR | MBOX3C1 | 1297 |
|
||||
| CAS_COUNT_RD | MBOX4C0 | 5328 |
|
||||
| CAS_COUNT_WR | MBOX4C1 | 1269 |
|
||||
| CAS_COUNT_RD | MBOX5C0 | 5280 |
|
||||
| CAS_COUNT_WR | MBOX5C1 | 1295 |
|
||||
+------------------------------------------+---------+------------+
|
||||
|
||||
+-----------------------------------+------------+
|
||||
| Metric | HWThread 0 |
|
||||
+-----------------------------------+------------+
|
||||
| Runtime (RDTSC) [s] | 0.1108 |
|
||||
| Runtime unhalted [s] | 0.0878 |
|
||||
| Clock [MHz] | 1995.2564 |
|
||||
| CPI | 0.8202 |
|
||||
| Energy [J] | 10.9296 |
|
||||
| Power [W] | 98.6643 |
|
||||
| Energy DRAM [J] | 0 |
|
||||
| Power DRAM [W] | 0 |
|
||||
| DP [MFLOP/s] | 14233.3287 |
|
||||
| AVX DP [MFLOP/s] | 14231.8898 |
|
||||
| Packed [MUOPS/s] | 1778.9862 |
|
||||
| Scalar [MUOPS/s] | 1.4389 |
|
||||
| Memory read bandwidth [MBytes/s] | 24.9001 |
|
||||
| Memory read data volume [GBytes] | 0.0028 |
|
||||
| Memory write bandwidth [MBytes/s] | 4.5861 |
|
||||
| Memory write data volume [GBytes] | 0.0005 |
|
||||
| Memory bandwidth [MBytes/s] | 29.4863 |
|
||||
| Memory data volume [GBytes] | 0.0033 |
|
||||
| Operational intensity | 482.7104 |
|
||||
+-----------------------------------+------------+
|
||||
|
||||
@@ -0,0 +1,168 @@
|
||||
--------------------------------------------------------------------------------
|
||||
CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
|
||||
CPU type: Intel Cascadelake SP processor
|
||||
CPU clock: 2.49 GHz
|
||||
--------------------------------------------------------------------------------
|
||||
Parameters:
|
||||
Force field: lj
|
||||
Kernel: plain-C
|
||||
Data layout: AoS
|
||||
Floating-point precision: double
|
||||
Unit cells (nx, ny, nz): 32, 32, 32
|
||||
Domain box sizes (x, y, z): 5.374708e+01, 5.374708e+01, 5.374708e+01
|
||||
Periodic (x, y, z): 1, 1, 1
|
||||
Lattice size: 1.679596e+00
|
||||
Epsilon: 1.000000e+00
|
||||
Sigma: 1.000000e+00
|
||||
Spring constant: 1.000000e+00
|
||||
Damping constant: 1.000000e+00
|
||||
Temperature: 1.440000e+00
|
||||
RHO: 8.442000e-01
|
||||
Mass: 1.000000e+00
|
||||
Number of types: 4
|
||||
Number of timesteps: 200
|
||||
Report stats every (timesteps): 100
|
||||
Reneighbor every (timesteps): 20
|
||||
Prune every (timesteps): 1000
|
||||
Output positions every (timesteps): 20
|
||||
Output velocities every (timesteps): 5
|
||||
Delta time (dt): 5.000000e-03
|
||||
Cutoff radius: 2.500000e+00
|
||||
Skin: 3.000000e-01
|
||||
Half neighbor lists: 0
|
||||
Processor frequency (GHz): 2.0000
|
||||
----------------------------------------------------------------------------
|
||||
step temp pressure
|
||||
0 1.440000e+00 1.215639e+00
|
||||
100 8.200895e-01 6.923143e-01
|
||||
200 7.961495e-01 6.721043e-01
|
||||
----------------------------------------------------------------------------
|
||||
System: 131072 atoms 47265 ghost atoms, Steps: 200
|
||||
TOTAL 11.50s FORCE 5.28s NEIGH 5.91s REST 0.31s
|
||||
----------------------------------------------------------------------------
|
||||
Performance: 2.28 million atom updates per second
|
||||
Statistics:
|
||||
Vector width: 8, Processor frequency: 2.0000 GHz
|
||||
Average neighbors per atom: 76.0352
|
||||
Average SIMD iterations per atom: 9.9181
|
||||
Total number of computed pair interactions: 2003182862
|
||||
Total number of SIMD iterations: 261297661
|
||||
Useful read data volume for force computation: 57.46GB
|
||||
Cycles/SIMD iteration: 40.4432
|
||||
--------------------------------------------------------------------------------
|
||||
Region force, Group 1: MEM_DP
|
||||
+-------------------+------------+
|
||||
| Region Info | HWThread 0 |
|
||||
+-------------------+------------+
|
||||
| RDTSC Runtime [s] | 5.115807 |
|
||||
| call count | 201 |
|
||||
+-------------------+------------+
|
||||
|
||||
+------------------------------------------+---------+-------------+
|
||||
| Event | Counter | HWThread 0 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
| INSTR_RETIRED_ANY | FIXC0 | 12592470000 |
|
||||
| CPU_CLK_UNHALTED_CORE | FIXC1 | 10196910000 |
|
||||
| CPU_CLK_UNHALTED_REF | FIXC2 | 12746120000 |
|
||||
| PWR_PKG_ENERGY | PWR0 | 307.9429 |
|
||||
| PWR_DRAM_ENERGY | PWR3 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | PMC0 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | PMC1 | 79042240 |
|
||||
| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | PMC2 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 8076039000 |
|
||||
| CAS_COUNT_RD | MBOX0C0 | 22734550 |
|
||||
| CAS_COUNT_WR | MBOX0C1 | 1147714 |
|
||||
| CAS_COUNT_RD | MBOX1C0 | 22755180 |
|
||||
| CAS_COUNT_WR | MBOX1C1 | 1144415 |
|
||||
| CAS_COUNT_RD | MBOX2C0 | 22762780 |
|
||||
| CAS_COUNT_WR | MBOX2C1 | 1129051 |
|
||||
| CAS_COUNT_RD | MBOX3C0 | 22905660 |
|
||||
| CAS_COUNT_WR | MBOX3C1 | 1143324 |
|
||||
| CAS_COUNT_RD | MBOX4C0 | 22914860 |
|
||||
| CAS_COUNT_WR | MBOX4C1 | 1169116 |
|
||||
| CAS_COUNT_RD | MBOX5C0 | 22890220 |
|
||||
| CAS_COUNT_WR | MBOX5C1 | 1180739 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
|
||||
+-----------------------------------+------------+
|
||||
| Metric | HWThread 0 |
|
||||
+-----------------------------------+------------+
|
||||
| Runtime (RDTSC) [s] | 5.1158 |
|
||||
| Runtime unhalted [s] | 4.0885 |
|
||||
| Clock [MHz] | 1995.2508 |
|
||||
| CPI | 0.8098 |
|
||||
| Energy [J] | 307.9429 |
|
||||
| Power [W] | 60.1944 |
|
||||
| Energy DRAM [J] | 0 |
|
||||
| Power DRAM [W] | 0 |
|
||||
| DP [MFLOP/s] | 12644.6041 |
|
||||
| AVX DP [MFLOP/s] | 12629.1535 |
|
||||
| Packed [MUOPS/s] | 1578.6442 |
|
||||
| Scalar [MUOPS/s] | 15.4506 |
|
||||
| Memory read bandwidth [MBytes/s] | 1713.4438 |
|
||||
| Memory read data volume [GBytes] | 8.7656 |
|
||||
| Memory write bandwidth [MBytes/s] | 86.5003 |
|
||||
| Memory write data volume [GBytes] | 0.4425 |
|
||||
| Memory bandwidth [MBytes/s] | 1799.9442 |
|
||||
| Memory data volume [GBytes] | 9.2082 |
|
||||
| Operational intensity | 7.0250 |
|
||||
+-----------------------------------+------------+
|
||||
|
||||
Region reneighbour, Group 1: MEM_DP
|
||||
+-------------------+------------+
|
||||
| Region Info | HWThread 0 |
|
||||
+-------------------+------------+
|
||||
| RDTSC Runtime [s] | 5.897385 |
|
||||
| call count | 10 |
|
||||
+-------------------+------------+
|
||||
|
||||
+------------------------------------------+---------+-------------+
|
||||
| Event | Counter | HWThread 0 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
| INSTR_RETIRED_ANY | FIXC0 | 18212540000 |
|
||||
| CPU_CLK_UNHALTED_CORE | FIXC1 | 11728500000 |
|
||||
| CPU_CLK_UNHALTED_REF | FIXC2 | 14660630000 |
|
||||
| PWR_PKG_ENERGY | PWR0 | 338.9000 |
|
||||
| PWR_DRAM_ENERGY | PWR3 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | PMC0 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | PMC1 | 6240402000 |
|
||||
| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | PMC2 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 983040 |
|
||||
| CAS_COUNT_RD | MBOX0C0 | 2086787 |
|
||||
| CAS_COUNT_WR | MBOX0C1 | 1115626 |
|
||||
| CAS_COUNT_RD | MBOX1C0 | 2089964 |
|
||||
| CAS_COUNT_WR | MBOX1C1 | 1117021 |
|
||||
| CAS_COUNT_RD | MBOX2C0 | 2103832 |
|
||||
| CAS_COUNT_WR | MBOX2C1 | 1117965 |
|
||||
| CAS_COUNT_RD | MBOX3C0 | 2086930 |
|
||||
| CAS_COUNT_WR | MBOX3C1 | 1102471 |
|
||||
| CAS_COUNT_RD | MBOX4C0 | 2094688 |
|
||||
| CAS_COUNT_WR | MBOX4C1 | 1103018 |
|
||||
| CAS_COUNT_RD | MBOX5C0 | 2097438 |
|
||||
| CAS_COUNT_WR | MBOX5C1 | 1102525 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
|
||||
+-----------------------------------+------------+
|
||||
| Metric | HWThread 0 |
|
||||
+-----------------------------------+------------+
|
||||
| Runtime (RDTSC) [s] | 5.8974 |
|
||||
| Runtime unhalted [s] | 4.7026 |
|
||||
| Clock [MHz] | 1995.2473 |
|
||||
| CPI | 0.6440 |
|
||||
| Energy [J] | 338.9000 |
|
||||
| Power [W] | 57.4661 |
|
||||
| Energy DRAM [J] | 0 |
|
||||
| Power DRAM [W] | 0 |
|
||||
| DP [MFLOP/s] | 1059.4978 |
|
||||
| AVX DP [MFLOP/s] | 1.3335 |
|
||||
| Packed [MUOPS/s] | 0.1667 |
|
||||
| Scalar [MUOPS/s] | 1058.1643 |
|
||||
| Memory read bandwidth [MBytes/s] | 136.3006 |
|
||||
| Memory read data volume [GBytes] | 0.8038 |
|
||||
| Memory write bandwidth [MBytes/s] | 72.2612 |
|
||||
| Memory write data volume [GBytes] | 0.4262 |
|
||||
| Memory bandwidth [MBytes/s] | 208.5618 |
|
||||
| Memory data volume [GBytes] | 1.2300 |
|
||||
| Operational intensity | 5.0800 |
|
||||
+-----------------------------------+------------+
|
||||
|
||||
@@ -0,0 +1,88 @@
|
||||
--------------------------------------------------------------------------------
|
||||
CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
|
||||
CPU type: Intel Cascadelake SP processor
|
||||
CPU clock: 2.49 GHz
|
||||
--------------------------------------------------------------------------------
|
||||
Initializing parameters...
|
||||
Initializing atoms...
|
||||
Creating atoms...
|
||||
Pattern: seq
|
||||
Number of timesteps: 200
|
||||
Number of atoms: 256
|
||||
Number of neighbors per atom: 1024
|
||||
Number of times to replicate neighbor lists: 1
|
||||
Estimated total data volume (kB): 1056.7680
|
||||
Estimated atom data volume (kB): 3.0720
|
||||
Estimated neighborlist data volume (kB): 1050.6240
|
||||
Initializing neighbor lists...
|
||||
Creating neighbor lists...
|
||||
Computing forces...
|
||||
Total time: 0.2466, Mega atom updates/s: 0.2076
|
||||
Cycles per atom: 9631.9934, Cycles per neighbor: 9.4062
|
||||
Statistics:
|
||||
Vector width: 16, Processor frequency: 2.0000 GHz
|
||||
Average neighbors per atom: 1018.9055
|
||||
Average SIMD iterations per atom: 63.6816
|
||||
Total number of computed pair interactions: 52428800
|
||||
Total number of SIMD iterations: 3276800
|
||||
Useful read data volume for force computation: 0.84GB
|
||||
Cycles/SIMD iteration: 150.4999
|
||||
--------------------------------------------------------------------------------
|
||||
Region force, Group 1: MEM_SP
|
||||
+-------------------+------------+
|
||||
| Region Info | HWThread 0 |
|
||||
+-------------------+------------+
|
||||
| RDTSC Runtime [s] | 0.085843 |
|
||||
| call count | 200 |
|
||||
+-------------------+------------+
|
||||
|
||||
+------------------------------------------+---------+------------+
|
||||
| Event | Counter | HWThread 0 |
|
||||
+------------------------------------------+---------+------------+
|
||||
| INSTR_RETIRED_ANY | FIXC0 | 129769100 |
|
||||
| CPU_CLK_UNHALTED_CORE | FIXC1 | 172300100 |
|
||||
| CPU_CLK_UNHALTED_REF | FIXC2 | 215371300 |
|
||||
| PWR_PKG_ENERGY | PWR0 | 9.2849 |
|
||||
| PWR_DRAM_ENERGY | PWR3 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE | PMC0 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_SCALAR_SINGLE | PMC1 | 154000 |
|
||||
| FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | PMC2 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | PMC3 | 89088000 |
|
||||
| CAS_COUNT_RD | MBOX0C0 | 8354 |
|
||||
| CAS_COUNT_WR | MBOX0C1 | 1126 |
|
||||
| CAS_COUNT_RD | MBOX1C0 | 7863 |
|
||||
| CAS_COUNT_WR | MBOX1C1 | 1105 |
|
||||
| CAS_COUNT_RD | MBOX2C0 | 7990 |
|
||||
| CAS_COUNT_WR | MBOX2C1 | 1113 |
|
||||
| CAS_COUNT_RD | MBOX3C0 | 4775 |
|
||||
| CAS_COUNT_WR | MBOX3C1 | 1112 |
|
||||
| CAS_COUNT_RD | MBOX4C0 | 4201 |
|
||||
| CAS_COUNT_WR | MBOX4C1 | 1127 |
|
||||
| CAS_COUNT_RD | MBOX5C0 | 4035 |
|
||||
| CAS_COUNT_WR | MBOX5C1 | 1120 |
|
||||
+------------------------------------------+---------+------------+
|
||||
|
||||
+-----------------------------------+------------+
|
||||
| Metric | HWThread 0 |
|
||||
+-----------------------------------+------------+
|
||||
| Runtime (RDTSC) [s] | 0.0858 |
|
||||
| Runtime unhalted [s] | 0.0691 |
|
||||
| Clock [MHz] | 1995.2787 |
|
||||
| CPI | 1.3277 |
|
||||
| Energy [J] | 9.2849 |
|
||||
| Power [W] | 108.1610 |
|
||||
| Energy DRAM [J] | 0 |
|
||||
| Power DRAM [W] | 0 |
|
||||
| SP [MFLOP/s] | 16606.5397 |
|
||||
| AVX SP [MFLOP/s] | 16604.7458 |
|
||||
| Packed [MUOPS/s] | 1037.7966 |
|
||||
| Scalar [MUOPS/s] | 1.7940 |
|
||||
| Memory read bandwidth [MBytes/s] | 27.7476 |
|
||||
| Memory read data volume [GBytes] | 0.0024 |
|
||||
| Memory write bandwidth [MBytes/s] | 4.9974 |
|
||||
| Memory write data volume [GBytes] | 0.0004 |
|
||||
| Memory bandwidth [MBytes/s] | 32.7450 |
|
||||
| Memory data volume [GBytes] | 0.0028 |
|
||||
| Operational intensity | 507.1471 |
|
||||
+-----------------------------------+------------+
|
||||
|
||||
@@ -0,0 +1,168 @@
|
||||
--------------------------------------------------------------------------------
|
||||
CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
|
||||
CPU type: Intel Cascadelake SP processor
|
||||
CPU clock: 2.49 GHz
|
||||
--------------------------------------------------------------------------------
|
||||
Parameters:
|
||||
Force field: lj
|
||||
Kernel: plain-C
|
||||
Data layout: AoS
|
||||
Floating-point precision: single
|
||||
Unit cells (nx, ny, nz): 32, 32, 32
|
||||
Domain box sizes (x, y, z): 5.374708e+01, 5.374708e+01, 5.374708e+01
|
||||
Periodic (x, y, z): 1, 1, 1
|
||||
Lattice size: 1.679596e+00
|
||||
Epsilon: 1.000000e+00
|
||||
Sigma: 1.000000e+00
|
||||
Spring constant: 1.000000e+00
|
||||
Damping constant: 1.000000e+00
|
||||
Temperature: 1.440000e+00
|
||||
RHO: 8.442000e-01
|
||||
Mass: 1.000000e+00
|
||||
Number of types: 4
|
||||
Number of timesteps: 200
|
||||
Report stats every (timesteps): 100
|
||||
Reneighbor every (timesteps): 20
|
||||
Prune every (timesteps): 1000
|
||||
Output positions every (timesteps): 20
|
||||
Output velocities every (timesteps): 5
|
||||
Delta time (dt): 5.000000e-03
|
||||
Cutoff radius: 2.500000e+00
|
||||
Skin: 3.000000e-01
|
||||
Half neighbor lists: 0
|
||||
Processor frequency (GHz): 2.0000
|
||||
----------------------------------------------------------------------------
|
||||
step temp pressure
|
||||
0 1.440000e+00 1.215639e+00
|
||||
100 8.200897e-01 6.923144e-01
|
||||
200 7.961481e-01 6.721031e-01
|
||||
----------------------------------------------------------------------------
|
||||
System: 131072 atoms 47265 ghost atoms, Steps: 200
|
||||
TOTAL 10.83s FORCE 4.62s NEIGH 5.94s REST 0.26s
|
||||
----------------------------------------------------------------------------
|
||||
Performance: 2.42 million atom updates per second
|
||||
Statistics:
|
||||
Vector width: 16, Processor frequency: 2.0000 GHz
|
||||
Average neighbors per atom: 76.0351
|
||||
Average SIMD iterations per atom: 5.0875
|
||||
Total number of computed pair interactions: 2003181259
|
||||
Total number of SIMD iterations: 134032075
|
||||
Useful read data volume for force computation: 32.79GB
|
||||
Cycles/SIMD iteration: 68.9511
|
||||
--------------------------------------------------------------------------------
|
||||
Region force, Group 1: MEM_SP
|
||||
+-------------------+------------+
|
||||
| Region Info | HWThread 0 |
|
||||
+-------------------+------------+
|
||||
| RDTSC Runtime [s] | 4.452877 |
|
||||
| call count | 201 |
|
||||
+-------------------+------------+
|
||||
|
||||
+------------------------------------------+---------+-------------+
|
||||
| Event | Counter | HWThread 0 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
| INSTR_RETIRED_ANY | FIXC0 | 7428719000 |
|
||||
| CPU_CLK_UNHALTED_CORE | FIXC1 | 8875251000 |
|
||||
| CPU_CLK_UNHALTED_REF | FIXC2 | 11094050000 |
|
||||
| PWR_PKG_ENERGY | PWR0 | 265.5057 |
|
||||
| PWR_DRAM_ENERGY | PWR3 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE | PMC0 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_SCALAR_SINGLE | PMC1 | 79036820 |
|
||||
| FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | PMC2 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | PMC3 | 3935012000 |
|
||||
| CAS_COUNT_RD | MBOX0C0 | 19716700 |
|
||||
| CAS_COUNT_WR | MBOX0C1 | 595747 |
|
||||
| CAS_COUNT_RD | MBOX1C0 | 19734880 |
|
||||
| CAS_COUNT_WR | MBOX1C1 | 597090 |
|
||||
| CAS_COUNT_RD | MBOX2C0 | 19732800 |
|
||||
| CAS_COUNT_WR | MBOX2C1 | 595219 |
|
||||
| CAS_COUNT_RD | MBOX3C0 | 19886430 |
|
||||
| CAS_COUNT_WR | MBOX3C1 | 632443 |
|
||||
| CAS_COUNT_RD | MBOX4C0 | 19887210 |
|
||||
| CAS_COUNT_WR | MBOX4C1 | 633169 |
|
||||
| CAS_COUNT_RD | MBOX5C0 | 19935560 |
|
||||
| CAS_COUNT_WR | MBOX5C1 | 634112 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
|
||||
+-----------------------------------+------------+
|
||||
| Metric | HWThread 0 |
|
||||
+-----------------------------------+------------+
|
||||
| Runtime (RDTSC) [s] | 4.4529 |
|
||||
| Runtime unhalted [s] | 3.5585 |
|
||||
| Clock [MHz] | 1995.2693 |
|
||||
| CPI | 1.1947 |
|
||||
| Energy [J] | 265.5057 |
|
||||
| Power [W] | 59.6257 |
|
||||
| Energy DRAM [J] | 0 |
|
||||
| Power DRAM [W] | 0 |
|
||||
| SP [MFLOP/s] | 14156.9661 |
|
||||
| AVX SP [MFLOP/s] | 14139.2165 |
|
||||
| Packed [MUOPS/s] | 883.7010 |
|
||||
| Scalar [MUOPS/s] | 17.7496 |
|
||||
| Memory read bandwidth [MBytes/s] | 1708.8254 |
|
||||
| Memory read data volume [GBytes] | 7.6092 |
|
||||
| Memory write bandwidth [MBytes/s] | 53.0035 |
|
||||
| Memory write data volume [GBytes] | 0.2360 |
|
||||
| Memory bandwidth [MBytes/s] | 1761.8288 |
|
||||
| Memory data volume [GBytes] | 7.8452 |
|
||||
| Operational intensity | 8.0354 |
|
||||
+-----------------------------------+------------+
|
||||
|
||||
Region reneighbour, Group 1: MEM_SP
|
||||
+-------------------+------------+
|
||||
| Region Info | HWThread 0 |
|
||||
+-------------------+------------+
|
||||
| RDTSC Runtime [s] | 5.935627 |
|
||||
| call count | 10 |
|
||||
+-------------------+------------+
|
||||
|
||||
+------------------------------------------+---------+-------------+
|
||||
| Event | Counter | HWThread 0 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
| INSTR_RETIRED_ANY | FIXC0 | 18208530000 |
|
||||
| CPU_CLK_UNHALTED_CORE | FIXC1 | 11805500000 |
|
||||
| CPU_CLK_UNHALTED_REF | FIXC2 | 14756870000 |
|
||||
| PWR_PKG_ENERGY | PWR0 | 340.7903 |
|
||||
| PWR_DRAM_ENERGY | PWR3 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE | PMC0 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_SCALAR_SINGLE | PMC1 | 6240406000 |
|
||||
| FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | PMC2 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | PMC3 | 491520 |
|
||||
| CAS_COUNT_RD | MBOX0C0 | 1772377 |
|
||||
| CAS_COUNT_WR | MBOX0C1 | 975760 |
|
||||
| CAS_COUNT_RD | MBOX1C0 | 1770611 |
|
||||
| CAS_COUNT_WR | MBOX1C1 | 977433 |
|
||||
| CAS_COUNT_RD | MBOX2C0 | 1771722 |
|
||||
| CAS_COUNT_WR | MBOX2C1 | 979122 |
|
||||
| CAS_COUNT_RD | MBOX3C0 | 1782901 |
|
||||
| CAS_COUNT_WR | MBOX3C1 | 967621 |
|
||||
| CAS_COUNT_RD | MBOX4C0 | 1780789 |
|
||||
| CAS_COUNT_WR | MBOX4C1 | 967179 |
|
||||
| CAS_COUNT_RD | MBOX5C0 | 1784733 |
|
||||
| CAS_COUNT_WR | MBOX5C1 | 969349 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
|
||||
+-----------------------------------+------------+
|
||||
| Metric | HWThread 0 |
|
||||
+-----------------------------------+------------+
|
||||
| Runtime (RDTSC) [s] | 5.9356 |
|
||||
| Runtime unhalted [s] | 4.7334 |
|
||||
| Clock [MHz] | 1995.2675 |
|
||||
| CPI | 0.6483 |
|
||||
| Energy [J] | 340.7903 |
|
||||
| Power [W] | 57.4144 |
|
||||
| Energy DRAM [J] | 0 |
|
||||
| Power DRAM [W] | 0 |
|
||||
| SP [MFLOP/s] | 1052.6723 |
|
||||
| AVX SP [MFLOP/s] | 1.3249 |
|
||||
| Packed [MUOPS/s] | 0.0828 |
|
||||
| Scalar [MUOPS/s] | 1051.3474 |
|
||||
| Memory read bandwidth [MBytes/s] | 114.9736 |
|
||||
| Memory read data volume [GBytes] | 0.6824 |
|
||||
| Memory write bandwidth [MBytes/s] | 62.9308 |
|
||||
| Memory write data volume [GBytes] | 0.3735 |
|
||||
| Memory bandwidth [MBytes/s] | 177.9044 |
|
||||
| Memory data volume [GBytes] | 1.0560 |
|
||||
| Operational intensity | 5.9171 |
|
||||
+-----------------------------------+------------+
|
||||
|
||||
@@ -0,0 +1,148 @@
|
||||
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||
Analyzed File - gromacs-avx512-dp-ICX.o
|
||||
Binary Format - 64Bit
|
||||
Architecture - SKX
|
||||
Analysis Type - Throughput
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
Block Throughput: 47.68 Cycles Throughput Bottleneck: Backend
|
||||
Loop Count: 22
|
||||
Port Binding In Cycles Per Iteration:
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Cycles | 42.0 0.0 | 12.5 | 5.0 5.0 | 5.0 5.0 | 0.0 | 42.0 | 12.5 | 0.0 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
|
||||
DV - Divider pipe (on port 0)
|
||||
D - Data fetch pipe (on ports 2 and 3)
|
||||
F - Macro Fusion with the previous instruction occurred
|
||||
* - instruction micro-ops not bound to a port
|
||||
^ - Micro Fusion occurred
|
||||
# - ESP Tracking sync uop was issued
|
||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||
X - instruction not supported, was not accounted in Analysis
|
||||
|
||||
| Num Of | Ports pressure in cycles | |
|
||||
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
-----------------------------------------------------------------------------------------
|
||||
| 1 | | | 1.0 1.0 | | | | | | movsxd rbx, dword ptr [r12+r14*4]
|
||||
| 1 | | 1.0 | | | | | | | lea rcx, ptr [rbx+rbx*2]
|
||||
| 1 | | | | | | | 1.0 | | shl rcx, 0x6
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovapd zmm29, zmmword ptr [rsi+rcx*1]
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovapd zmm30, zmmword ptr [rsi+rcx*1+0x40]
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovapd zmm31, zmmword ptr [rsi+rcx*1+0x80]
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovupd zmm3, zmmword ptr [rsp+0x40]
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm4, zmm3, zmm29
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovupd zmm3, zmmword ptr [rsp+0x140]
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm3, zmm3, zmm30
|
||||
| 1 | | 1.0 | | | | | | | lea ecx, ptr [rbx+rbx*1]
|
||||
| 1 | | | | | | | 1.0 | | cmp rdi, rcx
|
||||
| 1 | | | | | | | 1.0 | | setnz dl
|
||||
| 1 | | | | | | | 1.0 | | setz cl
|
||||
| 1 | | 1.0 | | | | | | | lea ebx, ptr [rbx+rbx*1+0x1]
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm17, zmm25, zmm31
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm17, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm18, zmm3, zmm3
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm18, zmm4, zmm4
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm19, zmm18
|
||||
| 1 | | 1.0 | | | | | | | cmp rdi, rbx
|
||||
| 1 | | | | | | | 1.0 | | setz bl
|
||||
| 1* | | | | | | | | | mov ebp, ebx
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm20, zmm19, zmm22
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm21, zmm19, zmm19
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm20, zmm21, zmm20
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovupd zmm21, zmmword ptr [rsp+0x80]
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm21, zmm21, zmm29
|
||||
| 1 | | | | | | | 1.0 | | shl bpl, 0x4
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm1, zmm19
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm19, zmm20
|
||||
| 1 | 1.0 | | | | | | | | vaddpd zmm20, zmm20, zmm2
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm19, zmm20
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovupd zmm20, zmmword ptr [rsp+0x100]
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm20, zmm20, zmm30
|
||||
| 1 | | 1.0 | | | | | | | not bpl
|
||||
| 1 | | 1.0 | | | | | | | sub bpl, cl
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, ebp
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm18, zmm0, 0x1
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm18, zmm26, zmm31
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm15{k1}, zmm19, zmm4
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm4, zmm18, zmm18
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm4, zmm20, zmm20
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm4, zmm21, zmm21
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12{k1}, zmm19, zmm3
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm3, zmm4
|
||||
| 1 | | 1.0 | | | | | | | lea ecx, ptr [rdx+rdx*1]
|
||||
| 1* | | | | | | | | | mov eax, ebx
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm8{k1}, zmm19, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm17, zmm3, zmm22
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm3, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm17, zmm19, zmm17
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovupd zmm19, zmmword ptr [rsp+0x1c0]
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm19, zmm19, zmm29
|
||||
| 1 | | | | | | | 1.0 | | shl al, 0x5
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm1, zmm3
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm3, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vaddpd zmm17, zmm17, zmm2
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm3, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm17, zmm23, zmm30
|
||||
| 1 | | 0.5 | | | | | 0.5 | | sub cl, al
|
||||
| 1 | | 0.5 | | | | | 0.5 | | add cl, 0xfd
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, ecx
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm4, zmm0, 0x1
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm4, zmm27, zmm31
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm14{k1}, zmm3, zmm21
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm21, zmm4, zmm4
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm21, zmm17, zmm17
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm21, zmm19, zmm19
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm10{k1}, zmm3, zmm20
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm20, zmm21
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm6{k1}, zmm3, zmm18
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm20, zmm22
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm20, zmm20
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm18, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm1, zmm20
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm18, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vaddpd zmm3, zmm3, zmm2
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm18, zmm3
|
||||
| 1 | | 1.0 | | | | | | | lea eax, ptr [rdx*4]
|
||||
| 1* | | | | | | | | | mov ecx, ebx
|
||||
| 1 | | | | | | | 1.0 | | shl cl, 0x6
|
||||
| 1 | | 0.5 | | | | | 0.5 | | sub al, cl
|
||||
| 1 | | 0.5 | | | | | 0.5 | | add al, 0xfb
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, eax
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm21, zmm0, 0x1
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovupd zmm18, zmmword ptr [rsp+0x180]
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm18, zmm18, zmm29
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm20, zmm24, zmm30
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm21, zmm28, zmm31
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm16{k1}, zmm3, zmm19
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm21, zmm21
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm19, zmm20, zmm20
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm19, zmm18, zmm18
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm11{k1}, zmm3, zmm17
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm17, zmm19
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm7{k1}, zmm3, zmm4
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm17, zmm22
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm4, zmm17, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm4, zmm3
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm4, zmm1, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm4, zmm4, zmm3
|
||||
| 1 | 1.0 | | | | | | | | vaddpd zmm3, zmm3, zmm2
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm4, zmm3
|
||||
| 1 | | | | | | | 1.0 | | shl dl, 0x3
|
||||
| 1 | | | | | | | 1.0 | | shl bl, 0x7
|
||||
| 1 | | 1.0 | | | | | | | sub dl, bl
|
||||
| 1 | | 1.0 | | | | | | | add dl, 0xf7
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, edx
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm19, zmm0, 0x1
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm13{k1}, zmm3, zmm18
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm9{k1}, zmm3, zmm20
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm5{k1}, zmm3, zmm21
|
||||
| 1 | | 0.5 | | | | | 0.5 | | inc r14
|
||||
| 1* | | | | | | | | | cmp r11, r14
|
||||
| 0*F | | | | | | | | | jnz 0xfffffffffffffd99
|
||||
Total Num Of Uops: 123
|
||||
Analysis Notes:
|
||||
Backend allocation was stalled due to unavailable allocation resources.
|
||||
@@ -0,0 +1,159 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: gromacs-avx512-dp-ICX.s
|
||||
Architecture: CSX
|
||||
Timestamp: 2023-01-03 00:07:20
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
2287 | | | | | | | | || | | .LBB5_11: #
|
||||
2288 | | | | | | | | || | | # Parent Loop BB5_6 Depth=1
|
||||
2289 | | | | | | | | || | | # => This Inner Loop Header: Depth=2
|
||||
2290 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | movslq (%r12,%r14,4), %rbx
|
||||
2291 | | 1.00 | | | | 0.00 | | || 1.0 | | leaq (%rbx,%rbx,2), %rcx
|
||||
2292 | 0.00 | | | | | | 1.00 | || 1.0 | | shlq $6, %rcx
|
||||
2293 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovapd (%rsi,%rcx), %zmm29
|
||||
2294 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovapd 64(%rsi,%rcx), %zmm30
|
||||
2295 | | | 0.50 0.50 | 0.50 0.50 | | | | || 0.0 | | vmovapd 128(%rsi,%rcx), %zmm31
|
||||
2296 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 64(%rsp), %zmm3 # 64-byte Reload
|
||||
2297 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm3, %zmm4
|
||||
2298 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 320(%rsp), %zmm3 # 64-byte Reload
|
||||
2299 | 0.75 | | | | | 0.25 | | || | | vsubpd %zmm30, %zmm3, %zmm3
|
||||
2300 | | 1.00 | | | | 0.00 | | || | | leal (%rbx,%rbx), %ecx
|
||||
2301 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rcx, %rdi
|
||||
2302 | 0.00 | | | | | | 1.00 | || | | setne %dl
|
||||
2303 | 0.00 | | | | | | 1.00 | || | | sete %cl
|
||||
2304 | | 1.00 | | | | | | || | | leal 1(%rbx,%rbx), %ebx
|
||||
2305 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm31, %zmm25, %zmm17
|
||||
2306 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm17, %zmm17, %zmm18
|
||||
2307 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm3, %zmm3, %zmm18 # zmm18 = (zmm3 * zmm3) + zmm18
|
||||
2308 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm4, %zmm4, %zmm18 # zmm18 = (zmm4 * zmm4) + zmm18
|
||||
2309 | 2.75 | | | | | 0.25 | | || 8.0 | | vrcp14pd %zmm18, %zmm19
|
||||
2310 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | cmpq %rbx, %rdi
|
||||
2311 | 0.00 | | | | | | 1.00 | || | | sete %bl
|
||||
2312 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %ebx, %ebp
|
||||
2313 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm22, %zmm19, %zmm20
|
||||
2314 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm19, %zmm19, %zmm21
|
||||
2315 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm21, %zmm20
|
||||
2316 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 128(%rsp), %zmm21 # 64-byte Reload
|
||||
2317 | 0.75 | | | | | 0.25 | | || | | vsubpd %zmm29, %zmm21, %zmm21
|
||||
2318 | 0.00 | | | | | | 1.00 | || | | shlb $4, %bpl
|
||||
2319 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm19, %zmm1, %zmm19
|
||||
2320 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm19, %zmm19
|
||||
2321 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm2, %zmm20, %zmm20
|
||||
2322 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm19, %zmm19
|
||||
2323 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 256(%rsp), %zmm20 # 64-byte Reload
|
||||
2324 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm20, %zmm20
|
||||
2325 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | notb %bpl
|
||||
2326 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | | subb %cl, %bpl
|
||||
2327 | 1.00 | | | | | | | || | | kmovd %ebp, %k1
|
||||
2328 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm18, %k1 {%k1}
|
||||
2329 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm31, %zmm26, %zmm18
|
||||
2330 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm4, %zmm19, %zmm15 {%k1} # zmm15 = (zmm19 * zmm4) + zmm15
|
||||
2331 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm18, %zmm4
|
||||
2332 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm20, %zmm20, %zmm4 # zmm4 = (zmm20 * zmm20) + zmm4
|
||||
2333 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm21, %zmm21, %zmm4 # zmm4 = (zmm21 * zmm21) + zmm4
|
||||
2334 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm3, %zmm19, %zmm12 {%k1} # zmm12 = (zmm19 * zmm3) + zmm12
|
||||
2335 | 2.25 | | | | | 0.75 | | || | | vrcp14pd %zmm4, %zmm3
|
||||
2336 | | 1.00 | | | | 0.00 | | || | | leal (%rdx,%rdx), %ecx
|
||||
2337 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %ebx, %eax
|
||||
2338 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm17, %zmm19, %zmm8 {%k1} # zmm8 = (zmm19 * zmm17) + zmm8
|
||||
2339 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm22, %zmm3, %zmm17
|
||||
2340 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm3, %zmm19
|
||||
2341 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm19, %zmm17
|
||||
2342 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 448(%rsp), %zmm19 # 64-byte Reload
|
||||
2343 | 0.75 | | | | | 0.25 | | || | | vsubpd %zmm29, %zmm19, %zmm19
|
||||
2344 | 0.00 | | | | | | 1.00 | || | | shlb $5, %al
|
||||
2345 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm1, %zmm3
|
||||
2346 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm3, %zmm3
|
||||
2347 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm2, %zmm17, %zmm17
|
||||
2348 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm3, %zmm3
|
||||
2349 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm23, %zmm17
|
||||
2350 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | subb %al, %cl
|
||||
2351 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | | addb $-3, %cl
|
||||
2352 | 1.00 | | | | | | | || | | kmovd %ecx, %k1
|
||||
2353 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm4, %k1 {%k1}
|
||||
2354 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm31, %zmm27, %zmm4
|
||||
2355 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm21, %zmm3, %zmm14 {%k1} # zmm14 = (zmm3 * zmm21) + zmm14
|
||||
2356 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm4, %zmm4, %zmm21
|
||||
2357 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm17, %zmm17, %zmm21 # zmm21 = (zmm17 * zmm17) + zmm21
|
||||
2358 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm19, %zmm19, %zmm21 # zmm21 = (zmm19 * zmm19) + zmm21
|
||||
2359 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm20, %zmm3, %zmm10 {%k1} # zmm10 = (zmm3 * zmm20) + zmm10
|
||||
2360 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm21, %zmm20
|
||||
2361 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm18, %zmm3, %zmm6 {%k1} # zmm6 = (zmm3 * zmm18) + zmm6
|
||||
2362 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm22, %zmm20, %zmm3
|
||||
2363 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm20, %zmm18
|
||||
2364 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm18, %zmm3
|
||||
2365 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm1, %zmm18
|
||||
2366 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm18, %zmm18
|
||||
2367 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm2, %zmm3, %zmm3
|
||||
2368 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm18, %zmm3
|
||||
2369 | | 1.00 | | | | 0.00 | | || | | leal (,%rdx,4), %eax
|
||||
2370 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %ebx, %ecx
|
||||
2371 | 0.00 | | | | | | 1.00 | || | | shlb $6, %cl
|
||||
2372 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | subb %cl, %al
|
||||
2373 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | addb $-5, %al
|
||||
2374 | 1.00 | | | | | | | || | | kmovd %eax, %k1
|
||||
2375 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm21, %k1 {%k1}
|
||||
2376 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 384(%rsp), %zmm18 # 64-byte Reload
|
||||
2377 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm18, %zmm18
|
||||
2378 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm24, %zmm20
|
||||
2379 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm31, %zmm28, %zmm21
|
||||
2380 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm19, %zmm3, %zmm16 {%k1} # zmm16 = (zmm3 * zmm19) + zmm16
|
||||
2381 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm21, %zmm21, %zmm19
|
||||
2382 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm20, %zmm20, %zmm19 # zmm19 = (zmm20 * zmm20) + zmm19
|
||||
2383 | 0.25 | | | | | 0.75 | | || | | vfmadd231pd %zmm18, %zmm18, %zmm19 # zmm19 = (zmm18 * zmm18) + zmm19
|
||||
2384 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm17, %zmm3, %zmm11 {%k1} # zmm11 = (zmm3 * zmm17) + zmm11
|
||||
2385 | 2.00 | | | | | 1.00 | | || | | vrcp14pd %zmm19, %zmm17
|
||||
2386 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm4, %zmm3, %zmm7 {%k1} # zmm7 = (zmm3 * zmm4) + zmm7
|
||||
2387 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm22, %zmm17, %zmm3
|
||||
2388 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm17, %zmm17, %zmm4
|
||||
2389 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm4, %zmm3
|
||||
2390 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm17, %zmm1, %zmm4
|
||||
2391 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm4, %zmm4
|
||||
2392 | 0.00 | | | | | 1.00 | | || | | vaddpd %zmm2, %zmm3, %zmm3
|
||||
2393 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm4, %zmm3
|
||||
2394 | 0.00 | | | | | | 1.00 | || | | shlb $3, %dl
|
||||
2395 | 0.00 | | | | | | 1.00 | || | | shlb $7, %bl
|
||||
2396 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | subb %bl, %dl
|
||||
2397 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | addb $-9, %dl
|
||||
2398 | 1.00 | | | | | | | || | | kmovd %edx, %k1
|
||||
2399 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm19, %k1 {%k1}
|
||||
2400 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm18, %zmm3, %zmm13 {%k1} # zmm13 = (zmm3 * zmm18) + zmm13
|
||||
2401 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm20, %zmm3, %zmm9 {%k1} # zmm9 = (zmm3 * zmm20) + zmm9
|
||||
2402 | 0.00 | | | | | 1.00 | | || | 4.0 | vfmadd231pd %zmm21, %zmm3, %zmm5 {%k1} # zmm5 = (zmm3 * zmm21) + zmm5
|
||||
2403 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | incq %r14
|
||||
2404 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %r14, %r11
|
||||
2405 | | | | | | | | || | | * jne .LBB5_11
|
||||
|
||||
40.0 14.5 5.00 5.00 5.00 5.00 40.0 14.5 50.0 4.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
2402 | 4.0 | vfmadd231pd %zmm21, %zmm3, %zmm5 {%k1} # zmm5 = (zmm3 * zmm21) + zmm5| [2402]
|
||||
2401 | 4.0 | vfmadd231pd %zmm20, %zmm3, %zmm9 {%k1} # zmm9 = (zmm3 * zmm20) + zmm9| [2401]
|
||||
2400 | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm13 {%k1} # zmm13 = (zmm3 * zmm18) + zmm13| [2400]
|
||||
2386 | 4.0 | vfmadd231pd %zmm4, %zmm3, %zmm7 {%k1} # zmm7 = (zmm3 * zmm4) + zmm7| [2386]
|
||||
2384 | 4.0 | vfmadd231pd %zmm17, %zmm3, %zmm11 {%k1} # zmm11 = (zmm3 * zmm17) + zmm11| [2384]
|
||||
2380 | 4.0 | vfmadd231pd %zmm19, %zmm3, %zmm16 {%k1} # zmm16 = (zmm3 * zmm19) + zmm16| [2380]
|
||||
2361 | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm6 {%k1} # zmm6 = (zmm3 * zmm18) + zmm6| [2361]
|
||||
2359 | 4.0 | vfmadd231pd %zmm20, %zmm3, %zmm10 {%k1} # zmm10 = (zmm3 * zmm20) + zmm10| [2359]
|
||||
2355 | 4.0 | vfmadd231pd %zmm21, %zmm3, %zmm14 {%k1} # zmm14 = (zmm3 * zmm21) + zmm14| [2355]
|
||||
2338 | 4.0 | vfmadd231pd %zmm17, %zmm19, %zmm8 {%k1} # zmm8 = (zmm19 * zmm17) + zmm8| [2338]
|
||||
2334 | 4.0 | vfmadd231pd %zmm3, %zmm19, %zmm12 {%k1} # zmm12 = (zmm19 * zmm3) + zmm12| [2334]
|
||||
2330 | 4.0 | vfmadd231pd %zmm4, %zmm19, %zmm15 {%k1} # zmm15 = (zmm19 * zmm4) + zmm15| [2330]
|
||||
2394 | 3.0 | shlb $3, %dl | [2394, 2396, 2397]
|
||||
2318 | 3.0 | shlb $4, %bpl | [2318, 2325, 2326]
|
||||
2403 | 1.0 | incq %r14 | [2403]
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,198 @@
|
||||
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||
Analyzed File - gromacs-icc-avx512-dp.o
|
||||
Binary Format - 64Bit
|
||||
Architecture - SKX
|
||||
Analysis Type - Throughput
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
Block Throughput: 62.00 Cycles Throughput Bottleneck: Backend
|
||||
Loop Count: 22
|
||||
Port Binding In Cycles Per Iteration:
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Cycles | 58.0 0.0 | 16.0 | 16.0 15.0 | 16.0 15.0 | 2.0 | 58.0 | 16.0 | 0.0 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
|
||||
DV - Divider pipe (on port 0)
|
||||
D - Data fetch pipe (on ports 2 and 3)
|
||||
F - Macro Fusion with the previous instruction occurred
|
||||
* - instruction micro-ops not bound to a port
|
||||
^ - Micro Fusion occurred
|
||||
# - ESP Tracking sync uop was issued
|
||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||
X - instruction not supported, was not accounted in Analysis
|
||||
|
||||
| Num Of | Ports pressure in cycles | |
|
||||
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
-----------------------------------------------------------------------------------------
|
||||
| 1 | | | 1.0 1.0 | | | | | | mov edx, dword ptr [r10+rsi*4]
|
||||
| 1 | | | | | | | 1.0 | | inc rsi
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm20, zmmword ptr [rsp+0x380]
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm25, zmmword ptr [rsp+0x340]
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm24, zmmword ptr [rsp+0x1c0]
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm23, zmmword ptr [rsp+0x2c0]
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm16, zmmword ptr [rsp+0x3c0]
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm14, zmmword ptr [rsp+0x300]
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm15, zmmword ptr [rsp+0x240]
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm12, zmmword ptr [rsp+0x180]
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm21, zmmword ptr [rsp+0x200]
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm18, zmmword ptr [rsp+0x140]
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm22, zmmword ptr [rsp+0x100]
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm17, zmmword ptr [rsp+0x280]
|
||||
| 1 | | 1.0 | | | | | | | lea r12d, ptr [rdx+rdx*2]
|
||||
| 1 | | | | | | | 1.0 | | shl r12d, 0x3
|
||||
| 1 | | 1.0 | | | | | | | lea r13d, ptr [rdx+rdx*1]
|
||||
| 1 | | | | | | | 1.0 | | movsxd r12, r12d
|
||||
| 1 | | 1.0 | | | | | | | cmp r13d, r11d
|
||||
| 1 | | 1.0 | | | | | | | lea eax, ptr [rdx+rdx*1+0x1]
|
||||
| 1 | | | | | | | 1.0 | | mov edx, 0x0
|
||||
| 1 | | | | | | | 1.0 | | setz dl
|
||||
| 1 | | 1.0 | | | | | | | cmp eax, r11d
|
||||
| 1 | | | | | | | 1.0 | | mov eax, 0x0
|
||||
| 1* | | | | | | | | | mov r13d, edx
|
||||
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm29, zmm20, zmmword ptr [r8+r12*8+0x80]
|
||||
| 1 | | | | | | | 1.0 | | setz al
|
||||
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm26, zmm25, zmmword ptr [r8+r12*8+0x80]
|
||||
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm25, zmm24, zmmword ptr [r8+r12*8]
|
||||
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm24, zmm23, zmmword ptr [r8+r12*8+0x40]
|
||||
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm23, zmm16, zmmword ptr [r8+r12*8+0x80]
|
||||
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm20, zmm14, zmmword ptr [r8+r12*8+0x80]
|
||||
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm27, zmm12, zmmword ptr [r8+r12*8+0x40]
|
||||
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm28, zmm15, zmmword ptr [r8+r12*8]
|
||||
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm30, zmm21, zmmword ptr [r8+r12*8+0x40]
|
||||
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm21, zmm18, zmmword ptr [r8+r12*8+0x40]
|
||||
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm31, zmm22, zmmword ptr [r8+r12*8]
|
||||
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm22, zmm17, zmmword ptr [r8+r12*8]
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm13, zmm29, zmm29
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm15, zmm26, zmm26
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm12, zmm23, zmm23
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm14, zmm20, zmm20
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm16, zmmword ptr [rsp+0xc0]
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm13, zmm30, zmm30
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm15, zmm27, zmm27
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12, zmm24, zmm24
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm14, zmm21, zmm21
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm13, zmm31, zmm31
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm15, zmm28, zmm28
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12, zmm25, zmm25
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm14, zmm22, zmm22
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm19, zmm13
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm18, zmm15
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm17, zmm12
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k1, zmm13, zmm16, 0x11
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k6, zmm15, zmm16, 0x11
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k7, zmm12, zmm16, 0x11
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k0, zmm14, zmm16, 0x11
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm15, zmm14
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm16, zmmword ptr [rsp+0x40]
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm12, zmmword ptr [rsp+0x80]
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm13, zmm19, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm13, zmm19, zmm13
|
||||
| 1 | | 1.0 | | | | | | | neg r13d
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm19, zmm13
|
||||
| 1* | | | | | | | | | mov r12d, eax
|
||||
| 1 | | | | | | 1.0 | | | vfmsub213pd zmm13, zmm19, zmm1
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm19, zmm12
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm13, zmm13, zmm19
|
||||
| 1 | | 1.0 | | | | | | | add r13d, 0xff
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm14, zmm13
|
||||
| 1 | | | | | | | 1.0 | | nop
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm13, zmmword ptr [rsp+0x400]
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm10, zmm14
|
||||
| 1 | | | | | | | 1.0 | | shl r12d, 0x4
|
||||
| 1 | | 1.0 | | | | | | | sub r13d, r12d
|
||||
| 1 | | | | | | 1.0 | | | kmovb k5, r13d
|
||||
| 1 | 1.0 | | | | | | | | kmovw r13d, k1
|
||||
| 1 | 1.0 | | | | | | | | kmovb r12d, k5
|
||||
| 1 | | | | | | 1.0 | | | kmovb k5, r12d
|
||||
| 1 | | | | | | 1.0 | | | kmovb k1, r13d
|
||||
| 1* | | | | | | | | | mov r13d, eax
|
||||
| 1 | 1.0 | | | | | | | | kandb k5, k5, k1
|
||||
| 1 | 1.0 | | | | | | | | kmovb r12d, k5
|
||||
| 1 | | | | | | 1.0 | | | kmovw k5, r12d
|
||||
| 1 | | 1.0 | | | | | | | lea r12d, ptr [rdx+rdx*1]
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm9{k5}, zmm19, zmm29
|
||||
| 1 | | | | | | | 1.0 | | neg r12d
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm13{k5}, zmm19, zmm31
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm31, zmmword ptr [rsp+0x440]
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm29, zmm18, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm31{k5}, zmm19, zmm30
|
||||
| 2^ | | | 1.0 | | 1.0 | | | | vmovups zmmword ptr [rsp+0x400], zmm13
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm30, zmm18, zmm29
|
||||
| 2^ | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [rsp+0x440], zmm31
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm13, zmm18, zmm30
|
||||
| 1 | 1.0 | | | | | | | | vfmsub213pd zmm30, zmm18, zmm1
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm18, zmm12
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm30, zmm18
|
||||
| 1 | | 1.0 | | | | | | | add r12d, 0xff
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm13, zmm14
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm29, zmm10, zmm19
|
||||
| 1 | | | | | | | 1.0 | | shl r13d, 0x5
|
||||
| 1 | | 1.0 | | | | | | | sub r12d, r13d
|
||||
| 1 | | | | | | 1.0 | | | kmovb k1, r12d
|
||||
| 1 | 1.0 | | | | | | | | kmovw r12d, k6
|
||||
| 1 | 1.0 | | | | | | | | kmovb r13d, k1
|
||||
| 1 | | | | | | 1.0 | | | kmovb k1, r13d
|
||||
| 1 | | | | | | 1.0 | | | kmovb k6, r12d
|
||||
| 1* | | | | | | | | | mov r12d, eax
|
||||
| 1 | 1.0 | | | | | | | | kandb k1, k1, k6
|
||||
| 1 | 1.0 | | | | | | | | kmovb r13d, k1
|
||||
| 1 | | | | | | 1.0 | | | kmovw k1, r13d
|
||||
| 1 | | 1.0 | | | | | | | lea r13d, ptr [rdx*4]
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm6{k1}, zmm29, zmm26
|
||||
| 1 | | | | | | | 1.0 | | neg r13d
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm7{k1}, zmm29, zmm27
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm8{k1}, zmm29, zmm28
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm26, zmm17, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm28, zmm17, zmm12
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm15, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm12, zmm15, zmm12
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm27, zmm17, zmm26
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm15, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm13, zmm17, zmm27
|
||||
| 1 | | | | | | 1.0 | | | vfmsub213pd zmm27, zmm17, zmm1
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm27, zmm28
|
||||
| 1 | | | | | | | 1.0 | | add r13d, 0xff
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm17, zmm13, zmm14
|
||||
| 1 | | | | | | | 1.0 | | shl edx, 0x3
|
||||
| 1 | | | | | | | 1.0 | | shl r12d, 0x6
|
||||
| 1 | | 1.0 | | | | | | | neg edx
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm10, zmm17
|
||||
| 1 | | 1.0 | | | | | | | sub r13d, r12d
|
||||
| 1 | | | | | | 1.0 | | | kmovb k6, r13d
|
||||
| 1 | | 1.0 | | | | | | | add edx, 0xff
|
||||
| 1 | | | | | | | 1.0 | | shl eax, 0x7
|
||||
| 1 | | 1.0 | | | | | | | sub edx, eax
|
||||
| 1 | 1.0 | | | | | | | | kmovb eax, k6
|
||||
| 1 | | | | | | 1.0 | | | kmovb k6, eax
|
||||
| 1 | 1.0 | | | | | | | | kmovw eax, k7
|
||||
| 1 | | | | | | 1.0 | | | kmovb k7, eax
|
||||
| 1 | 1.0 | | | | | | | | kandb k7, k6, k7
|
||||
| 1 | | | | | | 1.0 | | | kmovb k6, edx
|
||||
| 1 | 1.0 | | | | | | | | kmovb edx, k7
|
||||
| 1 | | | | | | 1.0 | | | kmovw k7, edx
|
||||
| 1 | 1.0 | | | | | | | | kmovw edx, k0
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm11{k7}, zmm18, zmm23
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm4{k7}, zmm18, zmm24
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm5{k7}, zmm18, zmm25
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm23, zmm15, zmm19
|
||||
| 1 | | | | | | 1.0 | | | vfmsub213pd zmm19, zmm15, zmm1
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm15, zmm19, zmm12
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm24, zmm23, zmm15
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm25, zmm10, zmm24
|
||||
| 1 | 1.0 | | | | | | | | kmovb eax, k6
|
||||
| 1 | | | | | | 1.0 | | | kmovb k6, eax
|
||||
| 1 | | | | | | 1.0 | | | kmovb k0, edx
|
||||
| 1 | 1.0 | | | | | | | | kandb k0, k6, k0
|
||||
| 1 | 1.0 | | | | | | | | kmovb r12d, k0
|
||||
| 1 | | | | | | 1.0 | | | kmovw k6, r12d
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm3{k6}, zmm25, zmm22
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm2{k6}, zmm25, zmm21
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm0{k6}, zmm25, zmm20
|
||||
| 1* | | | | | | | | | cmp rsi, rdi
|
||||
| 0*F | | | | | | | | | jl 0xfffffffffffffc6f
|
||||
Total Num Of Uops: 187
|
||||
Analysis Notes:
|
||||
Backend allocation was stalled due to unavailable allocation resources.
|
||||
@@ -0,0 +1,152 @@
|
||||
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||
Analyzed File - gromacs-icc-avx512-sp.o
|
||||
Binary Format - 64Bit
|
||||
Architecture - SKX
|
||||
Analysis Type - Throughput
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
Block Throughput: 51.00 Cycles Throughput Bottleneck: Backend
|
||||
Loop Count: 22
|
||||
Port Binding In Cycles Per Iteration:
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Cycles | 47.5 0.0 | 9.0 | 11.0 11.0 | 11.0 8.0 | 3.0 | 47.5 | 9.0 | 0.0 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
|
||||
DV - Divider pipe (on port 0)
|
||||
D - Data fetch pipe (on ports 2 and 3)
|
||||
F - Macro Fusion with the previous instruction occurred
|
||||
* - instruction micro-ops not bound to a port
|
||||
^ - Micro Fusion occurred
|
||||
# - ESP Tracking sync uop was issued
|
||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||
X - instruction not supported, was not accounted in Analysis
|
||||
|
||||
| Num Of | Ports pressure in cycles | |
|
||||
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
-----------------------------------------------------------------------------------------
|
||||
| 1 | | | 1.0 1.0 | | | | | | mov edi, dword ptr [rcx+rax*4]
|
||||
| 1* | | | | | | | | | mov r12d, r13d
|
||||
| 1 | | | | | | | 1.0 | | movsxd rdi, edi
|
||||
| 1 | | 1.0 | | | | | | | inc rax
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm10, zmmword ptr [rsp+0x140]
|
||||
| 1 | | | | | | | 1.0 | | test edi, 0x7fffffff
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm11, zmmword ptr [rsp+0x100]
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm9, zmmword ptr [rsp+0xc0]
|
||||
| 1 | | | | | | | 1.0 | | setz r12b
|
||||
| 1 | | 1.0 | | | | | | | lea r14, ptr [rdi+rdi*2]
|
||||
| 1 | | | | | | | 1.0 | | shl r14, 0x5
|
||||
| 1* | | | | | | | | | mov r8d, r12d
|
||||
| 1 | | 1.0 | | | | | | | neg r8d
|
||||
| 1* | | | | | | | | | mov r11d, r12d
|
||||
| 1 | | 1.0 | | | | | | | add r8d, 0xff
|
||||
| 1 | | | | | | 1.0 | | | kmovw k0, r8d
|
||||
| 1 | | 1.0 | | | | | | | lea r9d, ptr [r12+r12*2]
|
||||
| 2 | 1.0 | | 1.0 1.0 | | | | | | vsubps zmm3, zmm13, zmmword ptr [r14+rbx*1+0x40]
|
||||
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm4, zmm10, zmmword ptr [r14+rbx*1]
|
||||
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm10, zmm11, zmmword ptr [r14+rbx*1+0x40]
|
||||
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm5, zmm17, zmmword ptr [r14+rbx*1+0x20]
|
||||
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm27, zmm19, zmmword ptr [r14+rbx*1]
|
||||
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm8, zmm15, zmmword ptr [r14+rbx*1+0x20]
|
||||
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm11, zmm0, zmmword ptr [r14+rbx*1+0x40]
|
||||
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm7, zmm9, zmmword ptr [r14+rbx*1]
|
||||
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm9, zmm14, zmmword ptr [r14+rbx*1+0x20]
|
||||
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm29, zmm12, zmmword ptr [r14+rbx*1+0x40]
|
||||
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm28, zmm16, zmmword ptr [r14+rbx*1+0x20]
|
||||
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm25, zmm18, zmmword ptr [r14+rbx*1]
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm2, zmm3, zmm3
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm10, zmm10
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm11, zmm11
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm26, zmm29, zmm29
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm2, zmm5, zmm5
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30, zmm8, zmm8
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm1, zmm9, zmm9
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm26, zmm28, zmm28
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm2, zmm27, zmm27
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30, zmm4, zmm4
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm1, zmm7, zmm7
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm26, zmm25, zmm25
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm31, zmm2
|
||||
| 1 | | | | | | 1.0 | | | vcmpps k7, zmm30, zmm24, 0x11
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm6, zmm30
|
||||
| 1 | | | | | | 1.0 | | | vcmpps k3, zmm2, zmm24, 0x11
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm2, zmm1
|
||||
| 1 | | | | | | 1.0 | | | vcmpps k5, zmm26, zmm24, 0x11
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm26, zmm26
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm30, zmm31, zmm23
|
||||
| 1 | 1.0 | | | | | | | | kandw k2, k0, k3
|
||||
| 1 | | | | | | 1.0 | | | vcmpps k3, zmm1, zmm24, 0x11
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm31, zmm30
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm31, zmm1
|
||||
| 1 | | | | | | | 1.0 | | neg r9d
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm1, zmm31, zmm20
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm31, zmm22
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm1, zmm31
|
||||
| 1 | | | | | | | 1.0 | | add r9d, 0xff
|
||||
| 1 | | | | | | 1.0 | | | kmovw k4, r9d
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm30, zmm30, zmm31
|
||||
| 1 | 1.0 | | | | | | | | kandw k1, k4, k5
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm1, zmm21, zmm30
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm26, zmm23
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm26, zmm30
|
||||
| 1 | | 1.0 | | | | | | | lea r10d, ptr [r12*8]
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm26, zmm31
|
||||
| 1 | | | | | | | 1.0 | | neg r10d
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm31, zmm26, zmm20
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm26, zmm26, zmm22
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm31, zmm26
|
||||
| 1 | | 1.0 | | | | | | | add r10d, r12d
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm30, zmm31
|
||||
| 1 | | | | | | | 1.0 | | add r10d, 0xff
|
||||
| 1 | | | | | | 1.0 | | | kmovw k6, r10d
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm26, zmm21, zmm30
|
||||
| 1 | 1.0 | | | | | | | | kandw k4, k6, k7
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm25{k1}{z}, zmm25, zmm26
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31{k1}{z}, zmm28, zmm26
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm28, zmm6, zmm23
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30{k1}{z}, zmm29, zmm26
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm29, zmm2, zmm23
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm25{k2}, zmm27, zmm1
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm31{k2}, zmm5, zmm1
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm27, zmm6, zmm28
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30{k2}, zmm3, zmm1
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm2, zmm29
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm5, zmm6, zmm27
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm27, zmm6, zmm20
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm6, zmm6, zmm22
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm3, zmm2, zmm1
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm1, zmm2, zmm20
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm2, zmm2, zmm22
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm26, zmm27, zmm6
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm1, zmm2
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm5, zmm5, zmm26
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm3, zmm3, zmm1
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm6, zmm21, zmm5
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm27, zmm21, zmm3
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm25{k4}, zmm4, zmm6
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm4, zmmword ptr [r14+rsi*1]
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm31{k4}, zmm8, zmm6
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30{k4}, zmm10, zmm6
|
||||
| 1 | | | | | | | 1.0 | | shl r11d, 0x4
|
||||
| 1 | | 1.0 | | | | | | | sub r12d, r11d
|
||||
| 1 | | 1.0 | | | | | | | add r12d, 0xff
|
||||
| 1 | | | | | | 1.0 | | | kmovw k0, r12d
|
||||
| 1 | 1.0 | | | | | | | | kandw k5, k0, k3
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm25{k5}, zmm7, zmm27
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm31{k5}, zmm9, zmm27
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30{k5}, zmm11, zmm27
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vsubps zmm7, zmm4, zmm25
|
||||
| 2 | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [r14+rsi*1], zmm7
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm8, zmmword ptr [r14+rsi*1+0x20]
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vsubps zmm4, zmm8, zmm31
|
||||
| 2 | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [r14+rsi*1+0x20], zmm4
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm1, zmmword ptr [r14+rsi*1+0x40]
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vsubps zmm2, zmm1, zmm30
|
||||
| 2 | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [r14+rsi*1+0x40], zmm2
|
||||
| 1* | | | | | | | | | cmp rax, rdx
|
||||
| 0*F | | | | | | | | | jb 0xfffffffffffffd30
|
||||
Total Num Of Uops: 142
|
||||
Analysis Notes:
|
||||
Backend allocation was stalled due to unavailable allocation resources.
|
||||
@@ -0,0 +1,154 @@
|
||||
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||
Analyzed File - gromacs-icx-avx512-dp.o
|
||||
Binary Format - 64Bit
|
||||
Architecture - SKX
|
||||
Analysis Type - Throughput
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
Block Throughput: 49.26 Cycles Throughput Bottleneck: Backend
|
||||
Loop Count: 22
|
||||
Port Binding In Cycles Per Iteration:
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Cycles | 44.0 0.0 | 13.5 | 5.5 5.5 | 5.5 5.5 | 0.0 | 44.0 | 13.5 | 0.0 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
|
||||
DV - Divider pipe (on port 0)
|
||||
D - Data fetch pipe (on ports 2 and 3)
|
||||
F - Macro Fusion with the previous instruction occurred
|
||||
* - instruction micro-ops not bound to a port
|
||||
^ - Micro Fusion occurred
|
||||
# - ESP Tracking sync uop was issued
|
||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||
X - instruction not supported, was not accounted in Analysis
|
||||
|
||||
| Num Of | Ports pressure in cycles | |
|
||||
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
-----------------------------------------------------------------------------------------
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | movsxd rcx, dword ptr [r10+rbx*4]
|
||||
| 1 | | 1.0 | | | | | | | lea rdx, ptr [rcx+rcx*2]
|
||||
| 1 | | | | | | | 1.0 | | shl rdx, 0x6
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm28, zmmword ptr [rsi+rdx*1]
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm29, zmmword ptr [rsi+rdx*1+0x40]
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm30, zmmword ptr [rsi+rdx*1+0x80]
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm3, zmmword ptr [rsp+0x10]
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm3, zmm3, zmm28
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm31, zmm24, zmm30
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm16, zmmword ptr [rsp+0x150]
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm16, zmm16, zmm29
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm17, zmm31, zmm31
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm17, zmm16, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm17, zmm3, zmm3
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm18, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm21, zmm18
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm18, zmm19
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm18, zmm19
|
||||
| 1 | | | | | | 1.0 | | | vaddpd zmm20, zmm19, zmm1
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm22, zmm18
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm18, zmm20
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm20, zmm25, zmm30
|
||||
| 1 | | 1.0 | | | | | | | lea edx, ptr [rcx+rcx*1]
|
||||
| 1 | | | | | | | 1.0 | | cmp r11, rdx
|
||||
| 1 | | | | | | | 1.0 | | setnz dl
|
||||
| 1 | | | | | | | 1.0 | | setz al
|
||||
| 1 | | 1.0 | | | | | | | add ecx, ecx
|
||||
| 1 | | 1.0 | | | | | | | inc ecx
|
||||
| 1 | | 0.5 | | | | | 0.5 | | cmp r11, rcx
|
||||
| 1 | | | | | | | 1.0 | | setz cl
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm19, zmm18
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm19, zmmword ptr [rsp+0x210]
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm19, zmm19, zmm28
|
||||
| 1 | | | | | | | 1.0 | | setnz dil
|
||||
| 1* | | | | | | | | | mov ebp, edi
|
||||
| 1 | | | | | | | 1.0 | | shl bpl, 0x4
|
||||
| 1 | | 1.0 | | | | | | | sub bpl, al
|
||||
| 1 | | 1.0 | | | | | | | add bpl, 0xef
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, ebp
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm17, zmm0, 0x1
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm17, zmmword ptr [rsp+0x110]
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm17, zmm17, zmm29
|
||||
| 1 | | 1.0 | | | | | | | lea eax, ptr [rdx+rdx*1]
|
||||
| 1* | | | | | | | | | mov ebp, edi
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm18, zmm2
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm14{k1}, zmm3, zmm18
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm20, zmm20
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm3, zmm17, zmm17
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm3, zmm19, zmm19
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm11{k1}, zmm16, zmm18
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm16, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm7{k1}, zmm31, zmm18
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm21, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm16, zmm18
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm16, zmm18
|
||||
| 1 | | | | | | 1.0 | | | vaddpd zmm31, zmm18, zmm1
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm22, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm16, zmm16, zmm31
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm31, zmmword ptr [rsp+0x1d0]
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm31, zmm31, zmm28
|
||||
| 1 | | | | | | | 1.0 | | shl bpl, 0x5
|
||||
| 1 | | 1.0 | | | | | | | or bpl, al
|
||||
| 1 | | 0.5 | | | | | 0.5 | | or bpl, 0xdd
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, ebp
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm3, zmm0, 0x1
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm3, zmmword ptr [rsp+0xd0]
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm3, zmm3, zmm29
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm18, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm18, zmm26, zmm30
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm16, zmm2
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm15{k1}, zmm19, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm18, zmm18
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm19, zmm3, zmm3
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm19, zmm31, zmm31
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm10{k1}, zmm17, zmm16
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm17, zmm19
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm6{k1}, zmm20, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm21, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm16, zmm17, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm17, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vaddpd zmm20, zmm16, zmm1
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm17, zmm22, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm17, zmm17, zmm20
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm16, zmm17
|
||||
| 1 | | 1.0 | | | | | | | lea eax, ptr [rdx*4]
|
||||
| 1 | | | | | | | 1.0 | | shl dil, 0x6
|
||||
| 1 | | 0.5 | | | | | 0.5 | | or dil, al
|
||||
| 1 | | 0.5 | | | | | 0.5 | | or dil, 0xbb
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, edi
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm19, zmm0, 0x1
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm17, zmmword ptr [rsp+0x190]
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm17, zmm17, zmm28
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm19, zmm23, zmm29
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm20, zmm27, zmm30
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm16, zmm2
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm13{k1}, zmm31, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm28, zmm20, zmm20
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm28, zmm19, zmm19
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm28, zmm17, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm9{k1}, zmm3, zmm16
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm3, zmm28
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm5{k1}, zmm18, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm21, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm16, zmm3, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm3, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vaddpd zmm18, zmm16, zmm1
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm22, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm3, zmm18
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm16, zmm3
|
||||
| 1 | | | | | | | 1.0 | | shl dl, 0x3
|
||||
| 1 | | | | | | | 1.0 | | shl cl, 0x7
|
||||
| 1 | | 1.0 | | | | | | | or cl, dl
|
||||
| 1 | | 1.0 | | | | | | | add cl, 0xf7
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, ecx
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm28, zmm0, 0x1
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm3, zmm2
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12{k1}, zmm17, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm8{k1}, zmm19, zmm3
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm4{k1}, zmm20, zmm3
|
||||
| 1 | | 0.5 | | | | | 0.5 | | inc rbx
|
||||
| 1* | | | | | | | | | cmp r9, rbx
|
||||
| 0*F | | | | | | | | | jnz 0xfffffffffffffd5a
|
||||
Total Num Of Uops: 129
|
||||
Analysis Notes:
|
||||
Backend allocation was stalled due to unavailable allocation resources.
|
||||
@@ -0,0 +1,288 @@
|
||||
|
||||
[0] Code Region
|
||||
|
||||
Iterations: 100
|
||||
Instructions: 12200
|
||||
Total Cycles: 4745
|
||||
Total uOps: 14000
|
||||
|
||||
Dispatch Width: 6
|
||||
uOps Per Cycle: 2.95
|
||||
IPC: 2.57
|
||||
Block RThroughput: 34.0
|
||||
|
||||
|
||||
Instruction Info:
|
||||
[1]: #uOps
|
||||
[2]: Latency
|
||||
[3]: RThroughput
|
||||
[4]: MayLoad
|
||||
[5]: MayStore
|
||||
[6]: HasSideEffects (U)
|
||||
|
||||
[1] [2] [3] [4] [5] [6] Instructions:
|
||||
1 5 0.50 * movslq (%r10,%rbx,4), %rcx
|
||||
1 1 0.50 leaq (%rcx,%rcx,2), %rdx
|
||||
1 1 0.50 shlq $6, %rdx
|
||||
2 8 0.50 * vmovupd (%rsi,%rdx), %zmm28
|
||||
2 8 0.50 * vmovupd 64(%rsi,%rdx), %zmm29
|
||||
2 8 0.50 * vmovupd 128(%rsi,%rdx), %zmm30
|
||||
2 8 0.50 * vmovupd 16(%rsp), %zmm3
|
||||
1 4 0.50 vsubpd %zmm28, %zmm3, %zmm3
|
||||
1 4 0.50 vsubpd %zmm30, %zmm24, %zmm31
|
||||
2 8 0.50 * vmovupd 336(%rsp), %zmm16
|
||||
1 4 0.50 vsubpd %zmm29, %zmm16, %zmm16
|
||||
1 4 0.50 vmulpd %zmm31, %zmm31, %zmm17
|
||||
1 4 0.50 vfmadd231pd %zmm16, %zmm16, %zmm17
|
||||
1 4 0.50 vfmadd231pd %zmm3, %zmm3, %zmm17
|
||||
3 4 2.00 vrcp14pd %zmm17, %zmm18
|
||||
1 4 0.50 vmulpd %zmm18, %zmm21, %zmm19
|
||||
1 4 0.50 vmulpd %zmm19, %zmm18, %zmm19
|
||||
1 4 0.50 vmulpd %zmm19, %zmm18, %zmm19
|
||||
1 4 0.50 vaddpd %zmm1, %zmm19, %zmm20
|
||||
1 4 0.50 vmulpd %zmm18, %zmm22, %zmm18
|
||||
1 4 0.50 vmulpd %zmm20, %zmm18, %zmm18
|
||||
1 4 0.50 vsubpd %zmm30, %zmm25, %zmm20
|
||||
1 1 0.50 leal (%rcx,%rcx), %edx
|
||||
1 1 0.25 cmpq %rdx, %r11
|
||||
1 1 0.50 setne %dl
|
||||
1 1 0.50 sete %al
|
||||
1 1 0.25 addl %ecx, %ecx
|
||||
1 1 0.25 incl %ecx
|
||||
1 1 0.25 cmpq %rcx, %r11
|
||||
1 1 0.50 sete %cl
|
||||
1 4 0.50 vmulpd %zmm18, %zmm19, %zmm18
|
||||
2 8 0.50 * vmovupd 528(%rsp), %zmm19
|
||||
1 4 0.50 vsubpd %zmm28, %zmm19, %zmm19
|
||||
1 1 0.50 setne %dil
|
||||
1 1 0.25 movl %edi, %ebp
|
||||
1 1 0.50 shlb $4, %bpl
|
||||
1 1 0.25 subb %al, %bpl
|
||||
1 1 0.25 addb $-17, %bpl
|
||||
1 1 1.00 kmovd %ebp, %k1
|
||||
1 4 1.00 vcmpltpd %zmm0, %zmm17, %k1 {%k1}
|
||||
2 8 0.50 * vmovupd 272(%rsp), %zmm17
|
||||
1 4 0.50 vsubpd %zmm29, %zmm17, %zmm17
|
||||
1 1 0.50 leal (%rdx,%rdx), %eax
|
||||
1 1 0.25 movl %edi, %ebp
|
||||
1 4 0.50 vmulpd %zmm2, %zmm18, %zmm18
|
||||
1 4 0.50 vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1}
|
||||
1 4 0.50 vmulpd %zmm20, %zmm20, %zmm3
|
||||
1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm3
|
||||
1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm3
|
||||
1 4 0.50 vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1}
|
||||
3 4 2.00 vrcp14pd %zmm3, %zmm16
|
||||
1 4 0.50 vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1}
|
||||
1 4 0.50 vmulpd %zmm16, %zmm21, %zmm18
|
||||
1 4 0.50 vmulpd %zmm18, %zmm16, %zmm18
|
||||
1 4 0.50 vmulpd %zmm18, %zmm16, %zmm18
|
||||
1 4 0.50 vaddpd %zmm1, %zmm18, %zmm31
|
||||
1 4 0.50 vmulpd %zmm16, %zmm22, %zmm16
|
||||
1 4 0.50 vmulpd %zmm31, %zmm16, %zmm16
|
||||
2 8 0.50 * vmovupd 464(%rsp), %zmm31
|
||||
1 4 0.50 vsubpd %zmm28, %zmm31, %zmm31
|
||||
1 1 0.50 shlb $5, %bpl
|
||||
1 1 0.25 orb %al, %bpl
|
||||
1 1 0.25 orb $-35, %bpl
|
||||
1 1 1.00 kmovd %ebp, %k1
|
||||
1 4 1.00 vcmpltpd %zmm0, %zmm3, %k1 {%k1}
|
||||
2 8 0.50 * vmovupd 208(%rsp), %zmm3
|
||||
1 4 0.50 vsubpd %zmm29, %zmm3, %zmm3
|
||||
1 4 0.50 vmulpd %zmm16, %zmm18, %zmm16
|
||||
1 4 0.50 vsubpd %zmm30, %zmm26, %zmm18
|
||||
1 4 0.50 vmulpd %zmm2, %zmm16, %zmm16
|
||||
1 4 0.50 vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1}
|
||||
1 4 0.50 vmulpd %zmm18, %zmm18, %zmm19
|
||||
1 4 0.50 vfmadd231pd %zmm3, %zmm3, %zmm19
|
||||
1 4 0.50 vfmadd231pd %zmm31, %zmm31, %zmm19
|
||||
1 4 0.50 vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1}
|
||||
3 4 2.00 vrcp14pd %zmm19, %zmm17
|
||||
1 4 0.50 vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1}
|
||||
1 4 0.50 vmulpd %zmm17, %zmm21, %zmm16
|
||||
1 4 0.50 vmulpd %zmm16, %zmm17, %zmm16
|
||||
1 4 0.50 vmulpd %zmm16, %zmm17, %zmm16
|
||||
1 4 0.50 vaddpd %zmm1, %zmm16, %zmm20
|
||||
1 4 0.50 vmulpd %zmm17, %zmm22, %zmm17
|
||||
1 4 0.50 vmulpd %zmm20, %zmm17, %zmm17
|
||||
1 4 0.50 vmulpd %zmm17, %zmm16, %zmm16
|
||||
1 1 0.50 leal (,%rdx,4), %eax
|
||||
1 1 0.50 shlb $6, %dil
|
||||
1 1 0.25 orb %al, %dil
|
||||
1 1 0.25 orb $-69, %dil
|
||||
1 1 1.00 kmovd %edi, %k1
|
||||
1 4 1.00 vcmpltpd %zmm0, %zmm19, %k1 {%k1}
|
||||
2 8 0.50 * vmovupd 400(%rsp), %zmm17
|
||||
1 4 0.50 vsubpd %zmm28, %zmm17, %zmm17
|
||||
1 4 0.50 vsubpd %zmm29, %zmm23, %zmm19
|
||||
1 4 0.50 vsubpd %zmm30, %zmm27, %zmm20
|
||||
1 4 0.50 vmulpd %zmm2, %zmm16, %zmm16
|
||||
1 4 0.50 vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1}
|
||||
1 4 0.50 vmulpd %zmm20, %zmm20, %zmm28
|
||||
1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm28
|
||||
1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm28
|
||||
1 4 0.50 vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1}
|
||||
3 4 2.00 vrcp14pd %zmm28, %zmm3
|
||||
1 4 0.50 vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1}
|
||||
1 4 0.50 vmulpd %zmm3, %zmm21, %zmm16
|
||||
1 4 0.50 vmulpd %zmm16, %zmm3, %zmm16
|
||||
1 4 0.50 vmulpd %zmm16, %zmm3, %zmm16
|
||||
1 4 0.50 vaddpd %zmm1, %zmm16, %zmm18
|
||||
1 4 0.50 vmulpd %zmm3, %zmm22, %zmm3
|
||||
1 4 0.50 vmulpd %zmm18, %zmm3, %zmm3
|
||||
1 4 0.50 vmulpd %zmm3, %zmm16, %zmm3
|
||||
1 1 0.50 shlb $3, %dl
|
||||
1 1 0.50 shlb $7, %cl
|
||||
1 1 0.25 orb %dl, %cl
|
||||
1 1 0.25 addb $-9, %cl
|
||||
1 1 1.00 kmovd %ecx, %k1
|
||||
1 4 1.00 vcmpltpd %zmm0, %zmm28, %k1 {%k1}
|
||||
1 4 0.50 vmulpd %zmm2, %zmm3, %zmm3
|
||||
1 4 0.50 vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1}
|
||||
1 4 0.50 vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1}
|
||||
1 4 0.50 vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1}
|
||||
1 1 0.25 incq %rbx
|
||||
1 1 0.25 cmpq %rbx, %r9
|
||||
1 1 0.50 jne .LBB5_12
|
||||
|
||||
|
||||
Resources:
|
||||
[0] - SKXDivider
|
||||
[1] - SKXFPDivider
|
||||
[2] - SKXPort0
|
||||
[3] - SKXPort1
|
||||
[4] - SKXPort2
|
||||
[5] - SKXPort3
|
||||
[6] - SKXPort4
|
||||
[7] - SKXPort5
|
||||
[8] - SKXPort6
|
||||
[9] - SKXPort7
|
||||
|
||||
|
||||
Resource pressure per iteration:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
|
||||
- - 45.53 20.45 5.50 5.50 - 44.64 18.38 -
|
||||
|
||||
Resource pressure by instruction:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
|
||||
- - - - 0.50 0.50 - - - - movslq (%r10,%rbx,4), %rcx
|
||||
- - - 0.99 - - - 0.01 - - leaq (%rcx,%rcx,2), %rdx
|
||||
- - 0.01 - - - - - 0.99 - shlq $6, %rdx
|
||||
- - 0.01 0.99 0.49 0.51 - - - - vmovupd (%rsi,%rdx), %zmm28
|
||||
- - 0.01 0.91 0.51 0.49 - 0.08 - - vmovupd 64(%rsi,%rdx), %zmm29
|
||||
- - 0.01 0.56 0.49 0.51 - 0.43 - - vmovupd 128(%rsi,%rdx), %zmm30
|
||||
- - - 0.99 0.50 0.50 - 0.01 - - vmovupd 16(%rsp), %zmm3
|
||||
- - 0.95 - - - - 0.05 - - vsubpd %zmm28, %zmm3, %zmm3
|
||||
- - 0.48 - - - - 0.52 - - vsubpd %zmm30, %zmm24, %zmm31
|
||||
- - - 1.00 0.50 0.50 - - - - vmovupd 336(%rsp), %zmm16
|
||||
- - 0.49 - - - - 0.51 - - vsubpd %zmm29, %zmm16, %zmm16
|
||||
- - 0.48 - - - - 0.52 - - vmulpd %zmm31, %zmm31, %zmm17
|
||||
- - 0.49 - - - - 0.51 - - vfmadd231pd %zmm16, %zmm16, %zmm17
|
||||
- - 0.04 - - - - 0.96 - - vfmadd231pd %zmm3, %zmm3, %zmm17
|
||||
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm17, %zmm18
|
||||
- - 1.00 - - - - - - - vmulpd %zmm18, %zmm21, %zmm19
|
||||
- - 0.51 - - - - 0.49 - - vmulpd %zmm19, %zmm18, %zmm19
|
||||
- - 0.49 - - - - 0.51 - - vmulpd %zmm19, %zmm18, %zmm19
|
||||
- - 0.07 - - - - 0.93 - - vaddpd %zmm1, %zmm19, %zmm20
|
||||
- - - - - - - 1.00 - - vmulpd %zmm18, %zmm22, %zmm18
|
||||
- - 0.95 - - - - 0.05 - - vmulpd %zmm20, %zmm18, %zmm18
|
||||
- - 0.92 - - - - 0.08 - - vsubpd %zmm30, %zmm25, %zmm20
|
||||
- - - 0.94 - - - 0.06 - - leal (%rcx,%rcx), %edx
|
||||
- - - - - - - - 1.00 - cmpq %rdx, %r11
|
||||
- - - - - - - - 1.00 - setne %dl
|
||||
- - 0.44 - - - - - 0.56 - sete %al
|
||||
- - - 0.07 - - - 0.02 0.91 - addl %ecx, %ecx
|
||||
- - - 0.53 - - - 0.46 0.01 - incl %ecx
|
||||
- - - 0.51 - - - 0.46 0.03 - cmpq %rcx, %r11
|
||||
- - 0.02 - - - - - 0.98 - sete %cl
|
||||
- - 0.94 - - - - 0.06 - - vmulpd %zmm18, %zmm19, %zmm18
|
||||
- - 0.01 0.99 0.51 0.49 - - - - vmovupd 528(%rsp), %zmm19
|
||||
- - 0.47 - - - - 0.53 - - vsubpd %zmm28, %zmm19, %zmm19
|
||||
- - 0.04 - - - - - 0.96 - setne %dil
|
||||
- - - 0.95 - - - 0.02 0.03 - movl %edi, %ebp
|
||||
- - 0.01 - - - - - 0.99 - shlb $4, %bpl
|
||||
- - - 0.96 - - - - 0.04 - subb %al, %bpl
|
||||
- - - 0.06 - - - - 0.94 - addb $-17, %bpl
|
||||
- - - - - - - 1.00 - - kmovd %ebp, %k1
|
||||
- - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm17, %k1 {%k1}
|
||||
- - 0.02 0.97 0.50 0.50 - 0.01 - - vmovupd 272(%rsp), %zmm17
|
||||
- - 0.96 - - - - 0.04 - - vsubpd %zmm29, %zmm17, %zmm17
|
||||
- - - 1.00 - - - - - - leal (%rdx,%rdx), %eax
|
||||
- - - 0.05 - - - - 0.95 - movl %edi, %ebp
|
||||
- - 0.51 - - - - 0.49 - - vmulpd %zmm2, %zmm18, %zmm18
|
||||
- - 0.53 - - - - 0.47 - - vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1}
|
||||
- - 0.45 - - - - 0.55 - - vmulpd %zmm20, %zmm20, %zmm3
|
||||
- - 0.94 - - - - 0.06 - - vfmadd231pd %zmm17, %zmm17, %zmm3
|
||||
- - 0.03 - - - - 0.97 - - vfmadd231pd %zmm19, %zmm19, %zmm3
|
||||
- - 0.47 - - - - 0.53 - - vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1}
|
||||
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm3, %zmm16
|
||||
- - 0.53 - - - - 0.47 - - vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1}
|
||||
- - 0.99 - - - - 0.01 - - vmulpd %zmm16, %zmm21, %zmm18
|
||||
- - 0.99 - - - - 0.01 - - vmulpd %zmm18, %zmm16, %zmm18
|
||||
- - 0.97 - - - - 0.03 - - vmulpd %zmm18, %zmm16, %zmm18
|
||||
- - 0.52 - - - - 0.48 - - vaddpd %zmm1, %zmm18, %zmm31
|
||||
- - 0.01 - - - - 0.99 - - vmulpd %zmm16, %zmm22, %zmm16
|
||||
- - 0.52 - - - - 0.48 - - vmulpd %zmm31, %zmm16, %zmm16
|
||||
- - - 0.99 0.50 0.50 - 0.01 - - vmovupd 464(%rsp), %zmm31
|
||||
- - 0.03 - - - - 0.97 - - vsubpd %zmm28, %zmm31, %zmm31
|
||||
- - 0.01 - - - - - 0.99 - shlb $5, %bpl
|
||||
- - - 0.94 - - - - 0.06 - orb %al, %bpl
|
||||
- - - 0.04 - - - - 0.96 - orb $-35, %bpl
|
||||
- - - - - - - 1.00 - - kmovd %ebp, %k1
|
||||
- - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm3, %k1 {%k1}
|
||||
- - - 0.99 0.50 0.50 - 0.01 - - vmovupd 208(%rsp), %zmm3
|
||||
- - 0.95 - - - - 0.05 - - vsubpd %zmm29, %zmm3, %zmm3
|
||||
- - 0.51 - - - - 0.49 - - vmulpd %zmm16, %zmm18, %zmm16
|
||||
- - 0.01 - - - - 0.99 - - vsubpd %zmm30, %zmm26, %zmm18
|
||||
- - 0.52 - - - - 0.48 - - vmulpd %zmm2, %zmm16, %zmm16
|
||||
- - 1.00 - - - - - - - vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1}
|
||||
- - 0.03 - - - - 0.97 - - vmulpd %zmm18, %zmm18, %zmm19
|
||||
- - 0.06 - - - - 0.94 - - vfmadd231pd %zmm3, %zmm3, %zmm19
|
||||
- - 0.50 - - - - 0.50 - - vfmadd231pd %zmm31, %zmm31, %zmm19
|
||||
- - - - - - - 1.00 - - vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1}
|
||||
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm19, %zmm17
|
||||
- - 1.00 - - - - - - - vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1}
|
||||
- - 0.07 - - - - 0.93 - - vmulpd %zmm17, %zmm21, %zmm16
|
||||
- - 0.50 - - - - 0.50 - - vmulpd %zmm16, %zmm17, %zmm16
|
||||
- - 0.09 - - - - 0.91 - - vmulpd %zmm16, %zmm17, %zmm16
|
||||
- - 0.07 - - - - 0.93 - - vaddpd %zmm1, %zmm16, %zmm20
|
||||
- - 0.93 - - - - 0.07 - - vmulpd %zmm17, %zmm22, %zmm17
|
||||
- - 0.94 - - - - 0.06 - - vmulpd %zmm20, %zmm17, %zmm17
|
||||
- - 0.51 - - - - 0.49 - - vmulpd %zmm17, %zmm16, %zmm16
|
||||
- - - 1.00 - - - - - - leal (,%rdx,4), %eax
|
||||
- - - - - - - - 1.00 - shlb $6, %dil
|
||||
- - - 0.02 - - - - 0.98 - orb %al, %dil
|
||||
- - - 0.48 - - - - 0.52 - orb $-69, %dil
|
||||
- - - - - - - 1.00 - - kmovd %edi, %k1
|
||||
- - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm19, %k1 {%k1}
|
||||
- - - 1.00 0.50 0.50 - - - - vmovupd 400(%rsp), %zmm17
|
||||
- - 0.49 - - - - 0.51 - - vsubpd %zmm28, %zmm17, %zmm17
|
||||
- - 0.49 - - - - 0.51 - - vsubpd %zmm29, %zmm23, %zmm19
|
||||
- - 0.02 - - - - 0.98 - - vsubpd %zmm30, %zmm27, %zmm20
|
||||
- - 0.98 - - - - 0.02 - - vmulpd %zmm2, %zmm16, %zmm16
|
||||
- - 0.50 - - - - 0.50 - - vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1}
|
||||
- - 0.94 - - - - 0.06 - - vmulpd %zmm20, %zmm20, %zmm28
|
||||
- - 0.04 - - - - 0.96 - - vfmadd231pd %zmm19, %zmm19, %zmm28
|
||||
- - 0.07 - - - - 0.93 - - vfmadd231pd %zmm17, %zmm17, %zmm28
|
||||
- - 0.50 - - - - 0.50 - - vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1}
|
||||
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm28, %zmm3
|
||||
- - 0.50 - - - - 0.50 - - vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1}
|
||||
- - 1.00 - - - - - - - vmulpd %zmm3, %zmm21, %zmm16
|
||||
- - 0.55 - - - - 0.45 - - vmulpd %zmm16, %zmm3, %zmm16
|
||||
- - 0.99 - - - - 0.01 - - vmulpd %zmm16, %zmm3, %zmm16
|
||||
- - 0.99 - - - - 0.01 - - vaddpd %zmm1, %zmm16, %zmm18
|
||||
- - - - - - - 1.00 - - vmulpd %zmm3, %zmm22, %zmm3
|
||||
- - 0.52 - - - - 0.48 - - vmulpd %zmm18, %zmm3, %zmm3
|
||||
- - 0.99 - - - - 0.01 - - vmulpd %zmm3, %zmm16, %zmm3
|
||||
- - - - - - - - 1.00 - shlb $3, %dl
|
||||
- - - - - - - - 1.00 - shlb $7, %cl
|
||||
- - - 1.00 - - - - - - orb %dl, %cl
|
||||
- - - 0.52 - - - - 0.48 - addb $-9, %cl
|
||||
- - - - - - - 1.00 - - kmovd %ecx, %k1
|
||||
- - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm28, %k1 {%k1}
|
||||
- - 0.98 - - - - 0.02 - - vmulpd %zmm2, %zmm3, %zmm3
|
||||
- - 0.97 - - - - 0.03 - - vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1}
|
||||
- - 0.03 - - - - 0.97 - - vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1}
|
||||
- - 0.97 - - - - 0.03 - - vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1}
|
||||
- - - 0.48 - - - - 0.52 - incq %rbx
|
||||
- - - 0.52 - - - - 0.48 - cmpq %rbx, %r9
|
||||
- - - - - - - - 1.00 - jne .LBB5_12
|
||||
@@ -0,0 +1,167 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: gromacs-icx-avx512-dp.s
|
||||
Architecture: ICX
|
||||
Timestamp: 2023-02-14 12:51:57
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD |
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
2241 | | | | | | | | | | || | | # pointer_increment=64 da67166e5736661e6b03ea29ee7bfd67
|
||||
2242 | | | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
2243 | | | | | | | | | | || | | .LBB5_12: # Parent Loop BB5_7 Depth=1
|
||||
2244 | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2
|
||||
2245 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | movslq (%r10,%rbx,4), %rcx
|
||||
2246 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || 6.0 | | leaq (%rcx,%rcx,2), %rdx
|
||||
2247 | 0.00 | | | | | | 1.00 | | | || 1.0 | | shlq $6, %rdx
|
||||
2248 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd (%rsi,%rdx), %zmm28 # AlignMOV convert to UnAlignMOV
|
||||
2249 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 64(%rsi,%rdx), %zmm29 # AlignMOV convert to UnAlignMOV
|
||||
2250 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovupd 128(%rsi,%rdx), %zmm30 # AlignMOV convert to UnAlignMOV
|
||||
2251 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 16(%rsp), %zmm3 # 64-byte Reload
|
||||
2252 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm28, %zmm3, %zmm3
|
||||
2253 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vsubpd %zmm30, %zmm24, %zmm31
|
||||
2254 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 336(%rsp), %zmm16 # 64-byte Reload
|
||||
2255 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm16, %zmm16
|
||||
2256 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm31, %zmm31, %zmm17
|
||||
2257 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm16, %zmm16, %zmm17 # zmm17 = (zmm16 * zmm16) + zmm17
|
||||
2258 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm3, %zmm3, %zmm17 # zmm17 = (zmm3 * zmm3) + zmm17
|
||||
2259 | 2.50 | | | | | 0.500 | | | | || 6.0 | | vrcp14pd %zmm17, %zmm18
|
||||
2260 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm18, %zmm21, %zmm19
|
||||
2261 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm19, %zmm18, %zmm19
|
||||
2262 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm19, %zmm18, %zmm19
|
||||
2263 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vaddpd %zmm1, %zmm19, %zmm20
|
||||
2264 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm18, %zmm22, %zmm18
|
||||
2265 | 0.75 | | | | | 0.250 | | | | || 4.0 | | vmulpd %zmm20, %zmm18, %zmm18
|
||||
2266 | 1.00 | | | | | 0.000 | | | | || | | vsubpd %zmm30, %zmm25, %zmm20
|
||||
2267 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | leal (%rcx,%rcx), %edx
|
||||
2268 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | cmpq %rdx, %r11
|
||||
2269 | 0.00 | | | | | | 1.00 | | | || | | setne %dl
|
||||
2270 | 0.00 | | | | | | 1.00 | | | || | | sete %al
|
||||
2271 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | addl %ecx, %ecx
|
||||
2272 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | incl %ecx
|
||||
2273 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | cmpq %rcx, %r11
|
||||
2274 | 0.00 | | | | | | 1.00 | | | || | | sete %cl
|
||||
2275 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm18, %zmm19, %zmm18
|
||||
2276 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 528(%rsp), %zmm19 # 64-byte Reload
|
||||
2277 | 1.00 | | | | | 0.000 | | | | || | | vsubpd %zmm28, %zmm19, %zmm19
|
||||
2278 | 0.00 | | | | | | 1.00 | | | || | | setne %dil
|
||||
2279 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | movl %edi, %ebp
|
||||
2280 | 0.00 | | | | | | 1.00 | | | || | 1.0 | shlb $4, %bpl
|
||||
2281 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | 1.0 | subb %al, %bpl
|
||||
2282 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | 1.0 | addb $-17, %bpl
|
||||
2283 | 1.00 | | | | | | | | | || | | kmovd %ebp, %k1
|
||||
2284 | 0.50 | | | | | 0.500 | | | | || | | vcmpltpd %zmm0, %zmm17, %k1 {%k1}
|
||||
2285 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 272(%rsp), %zmm17 # 64-byte Reload
|
||||
2286 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm17, %zmm17
|
||||
2287 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | leal (%rdx,%rdx), %eax
|
||||
2288 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | movl %edi, %ebp
|
||||
2289 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm2, %zmm18, %zmm18
|
||||
2290 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14
|
||||
2291 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm20, %zmm20, %zmm3
|
||||
2292 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm17, %zmm17, %zmm3 # zmm3 = (zmm17 * zmm17) + zmm3
|
||||
2293 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm19, %zmm19, %zmm3 # zmm3 = (zmm19 * zmm19) + zmm3
|
||||
2294 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11
|
||||
2295 | 2.50 | | | | | 0.500 | | | | || | | vrcp14pd %zmm3, %zmm16
|
||||
2296 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7
|
||||
2297 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm21, %zmm18
|
||||
2298 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm18, %zmm16, %zmm18
|
||||
2299 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm18, %zmm16, %zmm18
|
||||
2300 | 0.50 | | | | | 0.500 | | | | || | | vaddpd %zmm1, %zmm18, %zmm31
|
||||
2301 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm22, %zmm16
|
||||
2302 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm31, %zmm16, %zmm16
|
||||
2303 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 464(%rsp), %zmm31 # 64-byte Reload
|
||||
2304 | 0.75 | | | | | 0.250 | | | | || | | vsubpd %zmm28, %zmm31, %zmm31
|
||||
2305 | 0.00 | | | | | | 1.00 | | | || | 1.0 | shlb $5, %bpl
|
||||
2306 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | 1.0 | orb %al, %bpl
|
||||
2307 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | 1.0 | orb $-35, %bpl
|
||||
2308 | 1.00 | | | | | | | | | || | | kmovd %ebp, %k1
|
||||
2309 | 0.50 | | | | | 0.500 | | | | || | | vcmpltpd %zmm0, %zmm3, %k1 {%k1}
|
||||
2310 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 208(%rsp), %zmm3 # 64-byte Reload
|
||||
2311 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm3, %zmm3
|
||||
2312 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm18, %zmm16
|
||||
2313 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm30, %zmm26, %zmm18
|
||||
2314 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm2, %zmm16, %zmm16
|
||||
2315 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15
|
||||
2316 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm18, %zmm18, %zmm19
|
||||
2317 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm3, %zmm3, %zmm19 # zmm19 = (zmm3 * zmm3) + zmm19
|
||||
2318 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm31, %zmm31, %zmm19 # zmm19 = (zmm31 * zmm31) + zmm19
|
||||
2319 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10
|
||||
2320 | 2.50 | | | | | 0.500 | | | | || | | vrcp14pd %zmm19, %zmm17
|
||||
2321 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6
|
||||
2322 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm17, %zmm21, %zmm16
|
||||
2323 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm17, %zmm16
|
||||
2324 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm17, %zmm16
|
||||
2325 | 0.50 | | | | | 0.500 | | | | || | | vaddpd %zmm1, %zmm16, %zmm20
|
||||
2326 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm17, %zmm22, %zmm17
|
||||
2327 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm20, %zmm17, %zmm17
|
||||
2328 | 0.75 | | | | | 0.250 | | | | || | | vmulpd %zmm17, %zmm16, %zmm16
|
||||
2329 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | leal (,%rdx,4), %eax
|
||||
2330 | 0.00 | | | | | | 1.00 | | | || | | shlb $6, %dil
|
||||
2331 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | orb %al, %dil
|
||||
2332 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | orb $-69, %dil
|
||||
2333 | 1.00 | | | | | | | | | || | | kmovd %edi, %k1
|
||||
2334 | 0.50 | | | | | 0.500 | | | | || | | vcmpltpd %zmm0, %zmm19, %k1 {%k1}
|
||||
2335 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 400(%rsp), %zmm17 # 64-byte Reload
|
||||
2336 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm28, %zmm17, %zmm17
|
||||
2337 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm23, %zmm19
|
||||
2338 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm30, %zmm27, %zmm20
|
||||
2339 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm2, %zmm16, %zmm16
|
||||
2340 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13
|
||||
2341 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm20, %zmm20, %zmm28
|
||||
2342 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm19, %zmm19, %zmm28 # zmm28 = (zmm19 * zmm19) + zmm28
|
||||
2343 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm17, %zmm17, %zmm28 # zmm28 = (zmm17 * zmm17) + zmm28
|
||||
2344 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9
|
||||
2345 | 2.00 | | | | | 1.000 | | | | || | | vrcp14pd %zmm28, %zmm3
|
||||
2346 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5
|
||||
2347 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm3, %zmm21, %zmm16
|
||||
2348 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm16, %zmm3, %zmm16
|
||||
2349 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm16, %zmm3, %zmm16
|
||||
2350 | 0.00 | | | | | 1.000 | | | | || | | vaddpd %zmm1, %zmm16, %zmm18
|
||||
2351 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm3, %zmm22, %zmm3
|
||||
2352 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm18, %zmm3, %zmm3
|
||||
2353 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm3, %zmm16, %zmm3
|
||||
2354 | 0.00 | | | | | | 1.00 | | | || | | shlb $3, %dl
|
||||
2355 | 0.00 | | | | | | 1.00 | | | || | | shlb $7, %cl
|
||||
2356 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | orb %dl, %cl
|
||||
2357 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | addb $-9, %cl
|
||||
2358 | 1.00 | | | | | | | | | || | | kmovd %ecx, %k1
|
||||
2359 | 0.00 | | | | | 1.000 | | | | || | | vcmpltpd %zmm0, %zmm28, %k1 {%k1}
|
||||
2360 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm2, %zmm3, %zmm3
|
||||
2361 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12
|
||||
2362 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8
|
||||
2363 | 0.24 | | | | | 0.760 | | | | || | | vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4
|
||||
2364 | 0.00 | 1.00 | | | | -0.01 | 0.00 | | | || | | incq %rbx
|
||||
2365 | 0.00 | 1.00 | | | | -0.01 | 0.00 | | | || | | cmpq %rbx, %r9
|
||||
2366 | | | | | | | | | | || | | * jne .LBB5_12
|
||||
2367 | | | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
44.0 15.0 5.50 5.50 5.50 5.50 43.99 15.0 71 6.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
2280 | 6.0 | shlb $4, %bpl | [2280, 2281, 2282, 2305, 2306, 2307]
|
||||
2363 | 4.0 | vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4| [2363]
|
||||
2362 | 4.0 | vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8| [2362]
|
||||
2361 | 4.0 | vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12| [2361]
|
||||
2346 | 4.0 | vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5| [2346]
|
||||
2344 | 4.0 | vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9| [2344]
|
||||
2340 | 4.0 | vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13| [2340]
|
||||
2321 | 4.0 | vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6| [2321]
|
||||
2319 | 4.0 | vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10| [2319]
|
||||
2315 | 4.0 | vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15| [2315]
|
||||
2296 | 4.0 | vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7| [2296]
|
||||
2294 | 4.0 | vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11| [2294]
|
||||
2290 | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14| [2290]
|
||||
2330 | 3.0 | shlb $6, %dil | [2330, 2331, 2332]
|
||||
2364 | 1.0 | incq %rbx | [2364]
|
||||
|
||||
@@ -0,0 +1,167 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: gromacs-icx-avx512-dp.s
|
||||
Architecture: CSX
|
||||
Timestamp: 2023-02-10 16:30:53
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
2241 | | | | | | | | || | | # pointer_increment=64 da67166e5736661e6b03ea29ee7bfd67
|
||||
2242 | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
2243 | | | | | | | | || | | .LBB5_12: # Parent Loop BB5_7 Depth=1
|
||||
2244 | | | | | | | | || | | # => This Inner Loop Header: Depth=2
|
||||
2245 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | movslq (%r10,%rbx,4), %rcx
|
||||
2246 | | 1.00 | | | | 0.00 | | || 1.0 | | leaq (%rcx,%rcx,2), %rdx
|
||||
2247 | 0.00 | | | | | | 1.00 | || 1.0 | | shlq $6, %rdx
|
||||
2248 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd (%rsi,%rdx), %zmm28 # AlignMOV convert to UnAlignMOV
|
||||
2249 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 64(%rsi,%rdx), %zmm29 # AlignMOV convert to UnAlignMOV
|
||||
2250 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovupd 128(%rsi,%rdx), %zmm30 # AlignMOV convert to UnAlignMOV
|
||||
2251 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 16(%rsp), %zmm3 # 64-byte Reload
|
||||
2252 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm28, %zmm3, %zmm3
|
||||
2253 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm30, %zmm24, %zmm31
|
||||
2254 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 336(%rsp), %zmm16 # 64-byte Reload
|
||||
2255 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm16, %zmm16
|
||||
2256 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm31, %zmm31, %zmm17
|
||||
2257 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm16, %zmm16, %zmm17 # zmm17 = (zmm16 * zmm16) + zmm17
|
||||
2258 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm3, %zmm3, %zmm17 # zmm17 = (zmm3 * zmm3) + zmm17
|
||||
2259 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm17, %zmm18
|
||||
2260 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm18, %zmm21, %zmm19
|
||||
2261 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm19, %zmm18, %zmm19
|
||||
2262 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm19, %zmm18, %zmm19
|
||||
2263 | 0.50 | | | | | 0.50 | | || 4.0 | | vaddpd %zmm1, %zmm19, %zmm20
|
||||
2264 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm22, %zmm18
|
||||
2265 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm18, %zmm18
|
||||
2266 | 1.00 | | | | | 0.00 | | || | | vsubpd %zmm30, %zmm25, %zmm20
|
||||
2267 | | 1.00 | | | | 0.00 | | || | | leal (%rcx,%rcx), %edx
|
||||
2268 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rdx, %r11
|
||||
2269 | 0.00 | | | | | | 1.00 | || | | setne %dl
|
||||
2270 | 0.00 | | | | | | 1.00 | || | | sete %al
|
||||
2271 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | addl %ecx, %ecx
|
||||
2272 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | incl %ecx
|
||||
2273 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rcx, %r11
|
||||
2274 | 0.00 | | | | | | 1.00 | || | | sete %cl
|
||||
2275 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm18, %zmm19, %zmm18
|
||||
2276 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 528(%rsp), %zmm19 # 64-byte Reload
|
||||
2277 | 1.00 | | | | | 0.00 | | || | | vsubpd %zmm28, %zmm19, %zmm19
|
||||
2278 | 0.00 | | | | | | 1.00 | || | | setne %dil
|
||||
2279 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %edi, %ebp
|
||||
2280 | 0.00 | | | | | | 1.00 | || | 1.0 | shlb $4, %bpl
|
||||
2281 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | 1.0 | subb %al, %bpl
|
||||
2282 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | 1.0 | addb $-17, %bpl
|
||||
2283 | 1.00 | | | | | | | || | | kmovd %ebp, %k1
|
||||
2284 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm17, %k1 {%k1}
|
||||
2285 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 272(%rsp), %zmm17 # 64-byte Reload
|
||||
2286 | 0.25 | | | | | 0.75 | | || | | vsubpd %zmm29, %zmm17, %zmm17
|
||||
2287 | | 1.00 | | | | 0.00 | | || | | leal (%rdx,%rdx), %eax
|
||||
2288 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %edi, %ebp
|
||||
2289 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm2, %zmm18, %zmm18
|
||||
2290 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14
|
||||
2291 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm20, %zmm3
|
||||
2292 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm17, %zmm17, %zmm3 # zmm3 = (zmm17 * zmm17) + zmm3
|
||||
2293 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm19, %zmm19, %zmm3 # zmm3 = (zmm19 * zmm19) + zmm3
|
||||
2294 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11
|
||||
2295 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm3, %zmm16
|
||||
2296 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7
|
||||
2297 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm21, %zmm18
|
||||
2298 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm16, %zmm18
|
||||
2299 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm16, %zmm18
|
||||
2300 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm1, %zmm18, %zmm31
|
||||
2301 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm22, %zmm16
|
||||
2302 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm31, %zmm16, %zmm16
|
||||
2303 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 464(%rsp), %zmm31 # 64-byte Reload
|
||||
2304 | 0.75 | | | | | 0.25 | | || | | vsubpd %zmm28, %zmm31, %zmm31
|
||||
2305 | 0.00 | | | | | | 1.00 | || | 1.0 | shlb $5, %bpl
|
||||
2306 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | 1.0 | orb %al, %bpl
|
||||
2307 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | 1.0 | orb $-35, %bpl
|
||||
2308 | 1.00 | | | | | | | || | | kmovd %ebp, %k1
|
||||
2309 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm3, %k1 {%k1}
|
||||
2310 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 208(%rsp), %zmm3 # 64-byte Reload
|
||||
2311 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm3, %zmm3
|
||||
2312 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm18, %zmm16
|
||||
2313 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm26, %zmm18
|
||||
2314 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm2, %zmm16, %zmm16
|
||||
2315 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15
|
||||
2316 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm18, %zmm19
|
||||
2317 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm3, %zmm3, %zmm19 # zmm19 = (zmm3 * zmm3) + zmm19
|
||||
2318 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm31, %zmm31, %zmm19 # zmm19 = (zmm31 * zmm31) + zmm19
|
||||
2319 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10
|
||||
2320 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm19, %zmm17
|
||||
2321 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6
|
||||
2322 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm21, %zmm16
|
||||
2323 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm17, %zmm16
|
||||
2324 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm17, %zmm16
|
||||
2325 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm1, %zmm16, %zmm20
|
||||
2326 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm22, %zmm17
|
||||
2327 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm17, %zmm17
|
||||
2328 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm16, %zmm16
|
||||
2329 | | 1.00 | | | | 0.00 | | || | | leal (,%rdx,4), %eax
|
||||
2330 | 0.00 | | | | | | 1.00 | || | | shlb $6, %dil
|
||||
2331 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | orb %al, %dil
|
||||
2332 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | orb $-69, %dil
|
||||
2333 | 1.00 | | | | | | | || | | kmovd %edi, %k1
|
||||
2334 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm19, %k1 {%k1}
|
||||
2335 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 400(%rsp), %zmm17 # 64-byte Reload
|
||||
2336 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm28, %zmm17, %zmm17
|
||||
2337 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm23, %zmm19
|
||||
2338 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm27, %zmm20
|
||||
2339 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm2, %zmm16, %zmm16
|
||||
2340 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13
|
||||
2341 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm20, %zmm28
|
||||
2342 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm19, %zmm19, %zmm28 # zmm28 = (zmm19 * zmm19) + zmm28
|
||||
2343 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm17, %zmm17, %zmm28 # zmm28 = (zmm17 * zmm17) + zmm28
|
||||
2344 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9
|
||||
2345 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm28, %zmm3
|
||||
2346 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5
|
||||
2347 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm21, %zmm16
|
||||
2348 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm16, %zmm3, %zmm16
|
||||
2349 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm16, %zmm3, %zmm16
|
||||
2350 | 0.00 | | | | | 1.00 | | || | | vaddpd %zmm1, %zmm16, %zmm18
|
||||
2351 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm22, %zmm3
|
||||
2352 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm18, %zmm3, %zmm3
|
||||
2353 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm16, %zmm3
|
||||
2354 | 0.00 | | | | | | 1.00 | || | | shlb $3, %dl
|
||||
2355 | 0.00 | | | | | | 1.00 | || | | shlb $7, %cl
|
||||
2356 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | | orb %dl, %cl
|
||||
2357 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | addb $-9, %cl
|
||||
2358 | 1.00 | | | | | | | || | | kmovd %ecx, %k1
|
||||
2359 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm28, %k1 {%k1}
|
||||
2360 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm2, %zmm3, %zmm3
|
||||
2361 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12
|
||||
2362 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8
|
||||
2363 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4
|
||||
2364 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | incq %rbx
|
||||
2365 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rbx, %r9
|
||||
2366 | | | | | | | | || | | * jne .LBB5_12
|
||||
2367 | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
44.0 15.0 5.50 5.50 5.50 5.50 44.0 15.0 66.0 6.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
2280 | 6.0 | shlb $4, %bpl | [2280, 2281, 2282, 2305, 2306, 2307]
|
||||
2363 | 4.0 | vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4| [2363]
|
||||
2362 | 4.0 | vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8| [2362]
|
||||
2361 | 4.0 | vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12| [2361]
|
||||
2346 | 4.0 | vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5| [2346]
|
||||
2344 | 4.0 | vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9| [2344]
|
||||
2340 | 4.0 | vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13| [2340]
|
||||
2321 | 4.0 | vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6| [2321]
|
||||
2319 | 4.0 | vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10| [2319]
|
||||
2315 | 4.0 | vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15| [2315]
|
||||
2296 | 4.0 | vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7| [2296]
|
||||
2294 | 4.0 | vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11| [2294]
|
||||
2290 | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14| [2290]
|
||||
2330 | 3.0 | shlb $6, %dil | [2330, 2331, 2332]
|
||||
2364 | 1.0 | incq %rbx | [2364]
|
||||
|
||||
@@ -0,0 +1,162 @@
|
||||
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||
Analyzed File - gromacs-icx-avx512-sp.o
|
||||
Binary Format - 64Bit
|
||||
Architecture - SKX
|
||||
Analysis Type - Throughput
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
Block Throughput: 64.00 Cycles Throughput Bottleneck: Backend
|
||||
Loop Count: 22
|
||||
Port Binding In Cycles Per Iteration:
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Cycles | 50.0 0.0 | 7.0 | 9.5 8.1 | 9.5 7.9 | 3.0 | 50.0 | 7.0 | 0.0 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
|
||||
DV - Divider pipe (on port 0)
|
||||
D - Data fetch pipe (on ports 2 and 3)
|
||||
F - Macro Fusion with the previous instruction occurred
|
||||
* - instruction micro-ops not bound to a port
|
||||
^ - Micro Fusion occurred
|
||||
# - ESP Tracking sync uop was issued
|
||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||
X - instruction not supported, was not accounted in Analysis
|
||||
|
||||
| Num Of | Ports pressure in cycles | |
|
||||
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
-----------------------------------------------------------------------------------------
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | movsxd rax, dword ptr [r11+rdx*4]
|
||||
| 1* | | | | | | | | | mov rsi, rax
|
||||
| 1 | | | | | | | 1.0 | | shl rsi, 0x5
|
||||
| 1 | | 1.0 | | | | | | | lea rbx, ptr [rsi+rsi*2]
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm15, zmmword ptr [rdi+rbx*1]
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm16, zmmword ptr [rdi+rbx*1+0x20]
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm27, zmmword ptr [rdi+rbx*1+0x40]
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x80]
|
||||
| 1 | | | | | | 1.0 | | | vsubps zmm24, zmm1, zmm15
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x140]
|
||||
| 1 | 1.0 | | | | | | | | vsubps zmm25, zmm1, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vsubps zmm26, zmm9, zmm27
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp]
|
||||
| 1 | 1.0 | | | | | | | | vsubps zmm21, zmm1, zmm15
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x100]
|
||||
| 1 | | | | | | 1.0 | | | vsubps zmm22, zmm1, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vsubps zmm23, zmm10, zmm27
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x1c0]
|
||||
| 1 | | | | | | 1.0 | | | vsubps zmm17, zmm1, zmm15
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0xc0]
|
||||
| 1 | 1.0 | | | | | | | | vsubps zmm19, zmm1, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vsubps zmm20, zmm11, zmm27
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x180]
|
||||
| 1 | 1.0 | | | | | | | | vsubps zmm18, zmm1, zmm15
|
||||
| 1 | | | | | | 1.0 | | | vsubps zmm16, zmm8, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vsubps zmm15, zmm12, zmm27
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm27, zmm26, zmm26
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231ps zmm27, zmm25, zmm25
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm27, zmm24, zmm24
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm28, zmm23, zmm23
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm28, zmm22, zmm22
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231ps zmm28, zmm21, zmm21
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm29, zmm20, zmm20
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231ps zmm29, zmm19, zmm19
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm29, zmm17, zmm17
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm30, zmm15, zmm15
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm30, zmm16, zmm16
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm31, zmm27
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm1, zmm28
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm2, zmm29
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm30, zmm18, zmm18
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm3, zmm30
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm4, zmm6, zmm31
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm4, zmm31, zmm4
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm4, zmm31, zmm4
|
||||
| 1 | 1.0 | | | | | | | | vaddps zmm5, zmm4, zmm13
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm7, zmm31
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm5, zmm31, zmm5
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm6, zmm1
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm31, zmm1, zmm31
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm1, zmm31
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm4, zmm4, zmm5
|
||||
| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm31, zmm13
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm1, zmm7, zmm1
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm1, zmm1, zmm5
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm5, zmm6, zmm2
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm5, zmm2, zmm5
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm5, zmm2, zmm5
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm1, zmm31, zmm1
|
||||
| 1 | 1.0 | | | | | | | | vaddps zmm31, zmm5, zmm13
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm2, zmm7, zmm2
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm2, zmm2, zmm31
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm6, zmm3
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm31, zmm3, zmm31
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm3, zmm31
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm2, zmm5, zmm2
|
||||
| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm31, zmm13
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm3, zmm7, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm3, zmm3, zmm5
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm3, zmm31, zmm3
|
||||
| 1* | | | | | | | | | xor esi, esi
|
||||
| 1* | | | | | | | | | xor edi, edi
|
||||
| 1 | | 1.0 | | | | | | | test eax, 0x7fffffff
|
||||
| 1 | | | | | | | 1.0 | | setz sil
|
||||
| 1 | | | | | | | 1.0 | | setnz dil
|
||||
| 1 | | 1.0 | | | | | | | mov eax, 0xff
|
||||
| 1 | | | | | | | 1.0 | | cmovz eax, r8d
|
||||
| 1 | | 1.0 | | | | | | | mov ecx, 0xff
|
||||
| 1 | | | | | | | 1.0 | | cmovz ecx, r9d
|
||||
| 1 | | 1.0 | | | | | | | xor esi, 0xff
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, esi
|
||||
| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm27, zmm0, 0x1
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm4, zmm4, zmm14
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm5{k1}{z}, zmm24, zmm4
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm24{k1}{z}, zmm25, zmm4
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm4{k1}{z}, zmm26, zmm4
|
||||
| 1 | | 1.0 | | | | | | | lea esi, ptr [rdi+rdi*2]
|
||||
| 1 | | | | | | | 1.0 | | or esi, 0xfc
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, esi
|
||||
| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm28, zmm0, 0x1
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm1, zmm1, zmm14
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm21{k1}{z}, zmm21, zmm1
|
||||
| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm5, zmm21
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm21{k1}{z}, zmm22, zmm1
|
||||
| 1 | | | | | | 1.0 | | | vaddps zmm21, zmm24, zmm21
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm1{k1}{z}, zmm23, zmm1
|
||||
| 1 | | | | | | 1.0 | | | vaddps zmm1, zmm4, zmm1
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, eax
|
||||
| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm29, zmm0, 0x1
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm2, zmm2, zmm14
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm4{k1}{z}, zmm17, zmm2
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm17{k1}{z}, zmm19, zmm2
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm2{k1}{z}, zmm20, zmm2
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, ecx
|
||||
| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm30, zmm0, 0x1
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm3, zmm3, zmm14
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm18{k1}{z}, zmm18, zmm3
|
||||
| 1 | 1.0 | | | | | | | | vaddps zmm4, zmm4, zmm18
|
||||
| 1 | | | | | | 1.0 | | | vaddps zmm4, zmm5, zmm4
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm5{k1}{z}, zmm16, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm17, zmm5
|
||||
| 1 | 1.0 | | | | | | | | vaddps zmm5, zmm21, zmm5
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm3{k1}{z}, zmm15, zmm3
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov rax, qword ptr [r15+0xb0]
|
||||
| 1 | 1.0 | | | | | | | | vaddps zmm2, zmm2, zmm3
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm3, zmmword ptr [rax+rbx*1]
|
||||
| 1 | | | | | | 1.0 | | | vsubps zmm3, zmm3, zmm4
|
||||
| 2 | | | 0.5 | 0.5 | 1.0 | | | | vmovups zmmword ptr [rax+rbx*1], zmm3
|
||||
| 1 | 1.0 | | | | | | | | vaddps zmm1, zmm1, zmm2
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm2, zmmword ptr [rax+rbx*1+0x20]
|
||||
| 1 | | | | | | 1.0 | | | vsubps zmm2, zmm2, zmm5
|
||||
| 2 | | | 0.5 | 0.5 | 1.0 | | | | vmovups zmmword ptr [rax+rbx*1+0x20], zmm2
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm2, zmmword ptr [rax+rbx*1+0x40]
|
||||
| 1 | 1.0 | | | | | | | | vsubps zmm1, zmm2, zmm1
|
||||
| 2 | | | 0.5 | 0.5 | 1.0 | | | | vmovups zmmword ptr [rax+rbx*1+0x40], zmm1
|
||||
| 1* | | | | | | | | | cmp r10, rdx
|
||||
| 0*F | | | | | | | | | jz 0x34
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov rdi, qword ptr [r15+0xa0]
|
||||
| 1 | | 1.0 | | | | | | | inc rdx
|
||||
| 1 | | | | | | | 1.0 | | jmp 0xfffffffffffffcfc
|
||||
Total Num Of Uops: 140
|
||||
Analysis Notes:
|
||||
Backend allocation was stalled due to unavailable allocation resources.
|
||||
@@ -0,0 +1,304 @@
|
||||
|
||||
[0] Code Region
|
||||
|
||||
Iterations: 100
|
||||
Instructions: 13000
|
||||
Total Cycles: 5640
|
||||
Total uOps: 15400
|
||||
|
||||
Dispatch Width: 6
|
||||
uOps Per Cycle: 2.73
|
||||
IPC: 2.30
|
||||
Block RThroughput: 40.0
|
||||
|
||||
|
||||
Instruction Info:
|
||||
[1]: #uOps
|
||||
[2]: Latency
|
||||
[3]: RThroughput
|
||||
[4]: MayLoad
|
||||
[5]: MayStore
|
||||
[6]: HasSideEffects (U)
|
||||
|
||||
[1] [2] [3] [4] [5] [6] Instructions:
|
||||
1 5 0.50 * movslq (%r11,%rdx,4), %rax
|
||||
1 1 0.25 movq %rax, %rsi
|
||||
1 1 0.50 shlq $5, %rsi
|
||||
1 1 0.50 leaq (%rsi,%rsi,2), %rbx
|
||||
2 8 0.50 * vmovups (%rdi,%rbx), %zmm15
|
||||
2 8 0.50 * vmovups 32(%rdi,%rbx), %zmm16
|
||||
2 8 0.50 * vmovups 64(%rdi,%rbx), %zmm27
|
||||
2 8 0.50 * vmovups 128(%rsp), %zmm1
|
||||
1 4 0.50 vsubps %zmm15, %zmm1, %zmm24
|
||||
2 8 0.50 * vmovups 320(%rsp), %zmm1
|
||||
1 4 0.50 vsubps %zmm16, %zmm1, %zmm25
|
||||
1 4 0.50 vsubps %zmm27, %zmm9, %zmm26
|
||||
2 8 0.50 * vmovups (%rsp), %zmm1
|
||||
1 4 0.50 vsubps %zmm15, %zmm1, %zmm21
|
||||
2 8 0.50 * vmovups 256(%rsp), %zmm1
|
||||
1 4 0.50 vsubps %zmm16, %zmm1, %zmm22
|
||||
1 4 0.50 vsubps %zmm27, %zmm10, %zmm23
|
||||
2 8 0.50 * vmovups 448(%rsp), %zmm1
|
||||
1 4 0.50 vsubps %zmm15, %zmm1, %zmm17
|
||||
2 8 0.50 * vmovups 192(%rsp), %zmm1
|
||||
1 4 0.50 vsubps %zmm16, %zmm1, %zmm19
|
||||
1 4 0.50 vsubps %zmm27, %zmm11, %zmm20
|
||||
2 8 0.50 * vmovups 384(%rsp), %zmm1
|
||||
1 4 0.50 vsubps %zmm15, %zmm1, %zmm18
|
||||
1 4 0.50 vsubps %zmm16, %zmm8, %zmm16
|
||||
1 4 0.50 vsubps %zmm27, %zmm12, %zmm15
|
||||
1 4 0.50 vmulps %zmm26, %zmm26, %zmm27
|
||||
1 4 0.50 vfmadd231ps %zmm25, %zmm25, %zmm27
|
||||
1 4 0.50 vfmadd231ps %zmm24, %zmm24, %zmm27
|
||||
1 4 0.50 vmulps %zmm23, %zmm23, %zmm28
|
||||
1 4 0.50 vfmadd231ps %zmm22, %zmm22, %zmm28
|
||||
1 4 0.50 vfmadd231ps %zmm21, %zmm21, %zmm28
|
||||
1 4 0.50 vmulps %zmm20, %zmm20, %zmm29
|
||||
1 4 0.50 vfmadd231ps %zmm19, %zmm19, %zmm29
|
||||
1 4 0.50 vfmadd231ps %zmm17, %zmm17, %zmm29
|
||||
1 4 0.50 vmulps %zmm15, %zmm15, %zmm30
|
||||
1 4 0.50 vfmadd231ps %zmm16, %zmm16, %zmm30
|
||||
3 4 2.00 vrcp14ps %zmm27, %zmm31
|
||||
3 4 2.00 vrcp14ps %zmm28, %zmm1
|
||||
3 4 2.00 vrcp14ps %zmm29, %zmm2
|
||||
1 4 0.50 vfmadd231ps %zmm18, %zmm18, %zmm30
|
||||
3 4 2.00 vrcp14ps %zmm30, %zmm3
|
||||
1 4 0.50 vmulps %zmm31, %zmm6, %zmm4
|
||||
1 4 0.50 vmulps %zmm4, %zmm31, %zmm4
|
||||
1 4 0.50 vmulps %zmm4, %zmm31, %zmm4
|
||||
1 4 0.50 vaddps %zmm13, %zmm4, %zmm5
|
||||
1 4 0.50 vmulps %zmm31, %zmm7, %zmm31
|
||||
1 4 0.50 vmulps %zmm5, %zmm31, %zmm5
|
||||
1 4 0.50 vmulps %zmm1, %zmm6, %zmm31
|
||||
1 4 0.50 vmulps %zmm31, %zmm1, %zmm31
|
||||
1 4 0.50 vmulps %zmm31, %zmm1, %zmm31
|
||||
1 4 0.50 vmulps %zmm5, %zmm4, %zmm4
|
||||
1 4 0.50 vaddps %zmm13, %zmm31, %zmm5
|
||||
1 4 0.50 vmulps %zmm1, %zmm7, %zmm1
|
||||
1 4 0.50 vmulps %zmm5, %zmm1, %zmm1
|
||||
1 4 0.50 vmulps %zmm2, %zmm6, %zmm5
|
||||
1 4 0.50 vmulps %zmm5, %zmm2, %zmm5
|
||||
1 4 0.50 vmulps %zmm5, %zmm2, %zmm5
|
||||
1 4 0.50 vmulps %zmm1, %zmm31, %zmm1
|
||||
1 4 0.50 vaddps %zmm13, %zmm5, %zmm31
|
||||
1 4 0.50 vmulps %zmm2, %zmm7, %zmm2
|
||||
1 4 0.50 vmulps %zmm31, %zmm2, %zmm2
|
||||
1 4 0.50 vmulps %zmm3, %zmm6, %zmm31
|
||||
1 4 0.50 vmulps %zmm31, %zmm3, %zmm31
|
||||
1 4 0.50 vmulps %zmm31, %zmm3, %zmm31
|
||||
1 4 0.50 vmulps %zmm2, %zmm5, %zmm2
|
||||
1 4 0.50 vaddps %zmm13, %zmm31, %zmm5
|
||||
1 4 0.50 vmulps %zmm3, %zmm7, %zmm3
|
||||
1 4 0.50 vmulps %zmm5, %zmm3, %zmm3
|
||||
1 4 0.50 vmulps %zmm3, %zmm31, %zmm3
|
||||
1 0 0.17 xorl %esi, %esi
|
||||
1 0 0.17 xorl %edi, %edi
|
||||
1 1 0.25 testl $2147483647, %eax
|
||||
1 1 0.50 sete %sil
|
||||
1 1 0.50 setne %dil
|
||||
1 1 0.25 movl $255, %eax
|
||||
1 1 0.50 cmovel %r8d, %eax
|
||||
1 1 0.25 movl $255, %ecx
|
||||
1 1 0.50 cmovel %r9d, %ecx
|
||||
1 1 0.25 xorl $255, %esi
|
||||
1 1 1.00 kmovd %esi, %k1
|
||||
1 4 1.00 vcmpltps %zmm0, %zmm27, %k1 {%k1}
|
||||
1 4 0.50 vmulps %zmm14, %zmm4, %zmm4
|
||||
1 4 0.50 vmulps %zmm4, %zmm24, %zmm5 {%k1} {z}
|
||||
1 4 0.50 vmulps %zmm4, %zmm25, %zmm24 {%k1} {z}
|
||||
1 4 0.50 vmulps %zmm4, %zmm26, %zmm4 {%k1} {z}
|
||||
1 1 0.50 leal (%rdi,%rdi,2), %esi
|
||||
1 1 0.25 orl $252, %esi
|
||||
1 1 1.00 kmovd %esi, %k1
|
||||
1 4 1.00 vcmpltps %zmm0, %zmm28, %k1 {%k1}
|
||||
1 4 0.50 vmulps %zmm14, %zmm1, %zmm1
|
||||
1 4 0.50 vmulps %zmm1, %zmm21, %zmm21 {%k1} {z}
|
||||
1 4 0.50 vaddps %zmm21, %zmm5, %zmm5
|
||||
1 4 0.50 vmulps %zmm1, %zmm22, %zmm21 {%k1} {z}
|
||||
1 4 0.50 vaddps %zmm21, %zmm24, %zmm21
|
||||
1 4 0.50 vmulps %zmm1, %zmm23, %zmm1 {%k1} {z}
|
||||
1 4 0.50 vaddps %zmm1, %zmm4, %zmm1
|
||||
1 1 1.00 kmovd %eax, %k1
|
||||
1 4 1.00 vcmpltps %zmm0, %zmm29, %k1 {%k1}
|
||||
1 4 0.50 vmulps %zmm14, %zmm2, %zmm2
|
||||
1 4 0.50 vmulps %zmm2, %zmm17, %zmm4 {%k1} {z}
|
||||
1 4 0.50 vmulps %zmm2, %zmm19, %zmm17 {%k1} {z}
|
||||
1 4 0.50 vmulps %zmm2, %zmm20, %zmm2 {%k1} {z}
|
||||
1 1 1.00 kmovd %ecx, %k1
|
||||
1 4 1.00 vcmpltps %zmm0, %zmm30, %k1 {%k1}
|
||||
1 4 0.50 vmulps %zmm14, %zmm3, %zmm3
|
||||
1 4 0.50 vmulps %zmm3, %zmm18, %zmm18 {%k1} {z}
|
||||
1 4 0.50 vaddps %zmm18, %zmm4, %zmm4
|
||||
1 4 0.50 vaddps %zmm4, %zmm5, %zmm4
|
||||
1 4 0.50 vmulps %zmm3, %zmm16, %zmm5 {%k1} {z}
|
||||
1 4 0.50 vaddps %zmm5, %zmm17, %zmm5
|
||||
1 4 0.50 vaddps %zmm5, %zmm21, %zmm5
|
||||
1 4 0.50 vmulps %zmm3, %zmm15, %zmm3 {%k1} {z}
|
||||
1 5 0.50 * movq 176(%r15), %rax
|
||||
1 4 0.50 vaddps %zmm3, %zmm2, %zmm2
|
||||
2 8 0.50 * vmovups (%rax,%rbx), %zmm3
|
||||
1 4 0.50 vsubps %zmm4, %zmm3, %zmm3
|
||||
2 1 1.00 * vmovups %zmm3, (%rax,%rbx)
|
||||
1 4 0.50 vaddps %zmm2, %zmm1, %zmm1
|
||||
2 8 0.50 * vmovups 32(%rax,%rbx), %zmm2
|
||||
1 4 0.50 vsubps %zmm5, %zmm2, %zmm2
|
||||
2 1 1.00 * vmovups %zmm2, 32(%rax,%rbx)
|
||||
2 8 0.50 * vmovups 64(%rax,%rbx), %zmm2
|
||||
1 4 0.50 vsubps %zmm1, %zmm2, %zmm1
|
||||
2 1 1.00 * vmovups %zmm1, 64(%rax,%rbx)
|
||||
1 1 0.25 cmpq %rdx, %r10
|
||||
1 1 0.50 je .LBB4_18
|
||||
1 5 0.50 * movq 160(%r15), %rdi
|
||||
1 1 0.25 incq %rdx
|
||||
1 1 0.50 jmp .LBB4_8
|
||||
|
||||
|
||||
Resources:
|
||||
[0] - SKXDivider
|
||||
[1] - SKXFPDivider
|
||||
[2] - SKXPort0
|
||||
[3] - SKXPort1
|
||||
[4] - SKXPort2
|
||||
[5] - SKXPort3
|
||||
[6] - SKXPort4
|
||||
[7] - SKXPort5
|
||||
[8] - SKXPort6
|
||||
[9] - SKXPort7
|
||||
|
||||
|
||||
Resource pressure per iteration:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
|
||||
- - 52.01 14.97 8.49 8.51 3.00 52.02 11.00 2.00
|
||||
|
||||
Resource pressure by instruction:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
|
||||
- - - - 0.49 0.51 - - - - movslq (%r11,%rdx,4), %rax
|
||||
- - - - - - - - 1.00 - movq %rax, %rsi
|
||||
- - - - - - - - 1.00 - shlq $5, %rsi
|
||||
- - - 1.00 - - - - - - leaq (%rsi,%rsi,2), %rbx
|
||||
- - 0.01 0.99 0.50 0.50 - - - - vmovups (%rdi,%rbx), %zmm15
|
||||
- - - - 0.50 0.50 - 1.00 - - vmovups 32(%rdi,%rbx), %zmm16
|
||||
- - - 1.00 0.50 0.50 - - - - vmovups 64(%rdi,%rbx), %zmm27
|
||||
- - - 0.99 0.51 0.49 - 0.01 - - vmovups 128(%rsp), %zmm1
|
||||
- - 0.01 - - - - 0.99 - - vsubps %zmm15, %zmm1, %zmm24
|
||||
- - - 1.00 0.49 0.51 - - - - vmovups 320(%rsp), %zmm1
|
||||
- - 0.99 - - - - 0.01 - - vsubps %zmm16, %zmm1, %zmm25
|
||||
- - 0.01 - - - - 0.99 - - vsubps %zmm27, %zmm9, %zmm26
|
||||
- - 0.01 0.99 0.51 0.49 - - - - vmovups (%rsp), %zmm1
|
||||
- - - - - - - 1.00 - - vsubps %zmm15, %zmm1, %zmm21
|
||||
- - - - 0.49 0.51 - 1.00 - - vmovups 256(%rsp), %zmm1
|
||||
- - 0.01 - - - - 0.99 - - vsubps %zmm16, %zmm1, %zmm22
|
||||
- - 0.99 - - - - 0.01 - - vsubps %zmm27, %zmm10, %zmm23
|
||||
- - - 1.00 0.51 0.49 - - - - vmovups 448(%rsp), %zmm1
|
||||
- - 0.01 - - - - 0.99 - - vsubps %zmm15, %zmm1, %zmm17
|
||||
- - 0.01 - 0.49 0.51 - 0.99 - - vmovups 192(%rsp), %zmm1
|
||||
- - - - - - - 1.00 - - vsubps %zmm16, %zmm1, %zmm19
|
||||
- - 0.01 - - - - 0.99 - - vsubps %zmm27, %zmm11, %zmm20
|
||||
- - 0.99 - 0.50 0.50 - 0.01 - - vmovups 384(%rsp), %zmm1
|
||||
- - - - - - - 1.00 - - vsubps %zmm15, %zmm1, %zmm18
|
||||
- - 0.01 - - - - 0.99 - - vsubps %zmm16, %zmm8, %zmm16
|
||||
- - - - - - - 1.00 - - vsubps %zmm27, %zmm12, %zmm15
|
||||
- - 1.00 - - - - - - - vmulps %zmm26, %zmm26, %zmm27
|
||||
- - 1.00 - - - - - - - vfmadd231ps %zmm25, %zmm25, %zmm27
|
||||
- - 0.99 - - - - 0.01 - - vfmadd231ps %zmm24, %zmm24, %zmm27
|
||||
- - - - - - - 1.00 - - vmulps %zmm23, %zmm23, %zmm28
|
||||
- - - - - - - 1.00 - - vfmadd231ps %zmm22, %zmm22, %zmm28
|
||||
- - 0.01 - - - - 0.99 - - vfmadd231ps %zmm21, %zmm21, %zmm28
|
||||
- - 0.01 - - - - 0.99 - - vmulps %zmm20, %zmm20, %zmm29
|
||||
- - 0.01 - - - - 0.99 - - vfmadd231ps %zmm19, %zmm19, %zmm29
|
||||
- - - - - - - 1.00 - - vfmadd231ps %zmm17, %zmm17, %zmm29
|
||||
- - 0.01 - - - - 0.99 - - vmulps %zmm15, %zmm15, %zmm30
|
||||
- - 0.01 - - - - 0.99 - - vfmadd231ps %zmm16, %zmm16, %zmm30
|
||||
- - 2.00 - - - - 1.00 - - vrcp14ps %zmm27, %zmm31
|
||||
- - 2.00 - - - - 1.00 - - vrcp14ps %zmm28, %zmm1
|
||||
- - 2.00 - - - - 1.00 - - vrcp14ps %zmm29, %zmm2
|
||||
- - 1.00 - - - - - - - vfmadd231ps %zmm18, %zmm18, %zmm30
|
||||
- - 2.00 - - - - 1.00 - - vrcp14ps %zmm30, %zmm3
|
||||
- - - - - - - 1.00 - - vmulps %zmm31, %zmm6, %zmm4
|
||||
- - 1.00 - - - - - - - vmulps %zmm4, %zmm31, %zmm4
|
||||
- - 1.00 - - - - - - - vmulps %zmm4, %zmm31, %zmm4
|
||||
- - 1.00 - - - - - - - vaddps %zmm13, %zmm4, %zmm5
|
||||
- - - - - - - 1.00 - - vmulps %zmm31, %zmm7, %zmm31
|
||||
- - 1.00 - - - - - - - vmulps %zmm5, %zmm31, %zmm5
|
||||
- - 1.00 - - - - - - - vmulps %zmm1, %zmm6, %zmm31
|
||||
- - 1.00 - - - - - - - vmulps %zmm31, %zmm1, %zmm31
|
||||
- - 0.99 - - - - 0.01 - - vmulps %zmm31, %zmm1, %zmm31
|
||||
- - 1.00 - - - - - - - vmulps %zmm5, %zmm4, %zmm4
|
||||
- - - - - - - 1.00 - - vaddps %zmm13, %zmm31, %zmm5
|
||||
- - - - - - - 1.00 - - vmulps %zmm1, %zmm7, %zmm1
|
||||
- - - - - - - 1.00 - - vmulps %zmm5, %zmm1, %zmm1
|
||||
- - - - - - - 1.00 - - vmulps %zmm2, %zmm6, %zmm5
|
||||
- - - - - - - 1.00 - - vmulps %zmm5, %zmm2, %zmm5
|
||||
- - - - - - - 1.00 - - vmulps %zmm5, %zmm2, %zmm5
|
||||
- - 0.99 - - - - 0.01 - - vmulps %zmm1, %zmm31, %zmm1
|
||||
- - - - - - - 1.00 - - vaddps %zmm13, %zmm5, %zmm31
|
||||
- - 1.00 - - - - - - - vmulps %zmm2, %zmm7, %zmm2
|
||||
- - - - - - - 1.00 - - vmulps %zmm31, %zmm2, %zmm2
|
||||
- - - - - - - 1.00 - - vmulps %zmm3, %zmm6, %zmm31
|
||||
- - 1.00 - - - - - - - vmulps %zmm31, %zmm3, %zmm31
|
||||
- - 1.00 - - - - - - - vmulps %zmm31, %zmm3, %zmm31
|
||||
- - 1.00 - - - - - - - vmulps %zmm2, %zmm5, %zmm2
|
||||
- - 1.00 - - - - - - - vaddps %zmm13, %zmm31, %zmm5
|
||||
- - 1.00 - - - - - - - vmulps %zmm3, %zmm7, %zmm3
|
||||
- - 1.00 - - - - - - - vmulps %zmm5, %zmm3, %zmm3
|
||||
- - 1.00 - - - - - - - vmulps %zmm3, %zmm31, %zmm3
|
||||
- - - - - - - - - - xorl %esi, %esi
|
||||
- - - - - - - - - - xorl %edi, %edi
|
||||
- - - - - - - - 1.00 - testl $2147483647, %eax
|
||||
- - - - - - - - 1.00 - sete %sil
|
||||
- - - - - - - - 1.00 - setne %dil
|
||||
- - - 1.00 - - - - - - movl $255, %eax
|
||||
- - - - - - - - 1.00 - cmovel %r8d, %eax
|
||||
- - - 1.00 - - - - - - movl $255, %ecx
|
||||
- - - - - - - - 1.00 - cmovel %r9d, %ecx
|
||||
- - - 1.00 - - - - - - xorl $255, %esi
|
||||
- - - - - - - 1.00 - - kmovd %esi, %k1
|
||||
- - - - - - - 1.00 - - vcmpltps %zmm0, %zmm27, %k1 {%k1}
|
||||
- - - - - - - 1.00 - - vmulps %zmm14, %zmm4, %zmm4
|
||||
- - 1.00 - - - - - - - vmulps %zmm4, %zmm24, %zmm5 {%k1} {z}
|
||||
- - - - - - - 1.00 - - vmulps %zmm4, %zmm25, %zmm24 {%k1} {z}
|
||||
- - 1.00 - - - - - - - vmulps %zmm4, %zmm26, %zmm4 {%k1} {z}
|
||||
- - - 1.00 - - - - - - leal (%rdi,%rdi,2), %esi
|
||||
- - - - - - - - 1.00 - orl $252, %esi
|
||||
- - - - - - - 1.00 - - kmovd %esi, %k1
|
||||
- - - - - - - 1.00 - - vcmpltps %zmm0, %zmm28, %k1 {%k1}
|
||||
- - 0.99 - - - - 0.01 - - vmulps %zmm14, %zmm1, %zmm1
|
||||
- - 0.99 - - - - 0.01 - - vmulps %zmm1, %zmm21, %zmm21 {%k1} {z}
|
||||
- - 1.00 - - - - - - - vaddps %zmm21, %zmm5, %zmm5
|
||||
- - 0.01 - - - - 0.99 - - vmulps %zmm1, %zmm22, %zmm21 {%k1} {z}
|
||||
- - - - - - - 1.00 - - vaddps %zmm21, %zmm24, %zmm21
|
||||
- - 0.99 - - - - 0.01 - - vmulps %zmm1, %zmm23, %zmm1 {%k1} {z}
|
||||
- - 1.00 - - - - - - - vaddps %zmm1, %zmm4, %zmm1
|
||||
- - - - - - - 1.00 - - kmovd %eax, %k1
|
||||
- - - - - - - 1.00 - - vcmpltps %zmm0, %zmm29, %k1 {%k1}
|
||||
- - - - - - - 1.00 - - vmulps %zmm14, %zmm2, %zmm2
|
||||
- - 1.00 - - - - - - - vmulps %zmm2, %zmm17, %zmm4 {%k1} {z}
|
||||
- - - - - - - 1.00 - - vmulps %zmm2, %zmm19, %zmm17 {%k1} {z}
|
||||
- - 1.00 - - - - - - - vmulps %zmm2, %zmm20, %zmm2 {%k1} {z}
|
||||
- - - - - - - 1.00 - - kmovd %ecx, %k1
|
||||
- - - - - - - 1.00 - - vcmpltps %zmm0, %zmm30, %k1 {%k1}
|
||||
- - 1.00 - - - - - - - vmulps %zmm14, %zmm3, %zmm3
|
||||
- - - - - - - 1.00 - - vmulps %zmm3, %zmm18, %zmm18 {%k1} {z}
|
||||
- - 0.99 - - - - 0.01 - - vaddps %zmm18, %zmm4, %zmm4
|
||||
- - 1.00 - - - - - - - vaddps %zmm4, %zmm5, %zmm4
|
||||
- - - - - - - 1.00 - - vmulps %zmm3, %zmm16, %zmm5 {%k1} {z}
|
||||
- - 1.00 - - - - - - - vaddps %zmm5, %zmm17, %zmm5
|
||||
- - 0.99 - - - - 0.01 - - vaddps %zmm5, %zmm21, %zmm5
|
||||
- - 1.00 - - - - - - - vmulps %zmm3, %zmm15, %zmm3 {%k1} {z}
|
||||
- - - - 1.00 - - - - - movq 176(%r15), %rax
|
||||
- - 0.99 - - - - 0.01 - - vaddps %zmm3, %zmm2, %zmm2
|
||||
- - - 1.00 0.50 0.50 - - - - vmovups (%rax,%rbx), %zmm3
|
||||
- - 0.99 - - - - 0.01 - - vsubps %zmm4, %zmm3, %zmm3
|
||||
- - - - - - 1.00 - - 1.00 vmovups %zmm3, (%rax,%rbx)
|
||||
- - 1.00 - - - - - - - vaddps %zmm2, %zmm1, %zmm1
|
||||
- - - 1.00 0.50 0.50 - - - - vmovups 32(%rax,%rbx), %zmm2
|
||||
- - 1.00 - - - - - - - vsubps %zmm5, %zmm2, %zmm2
|
||||
- - - - - - 1.00 - - 1.00 vmovups %zmm2, 32(%rax,%rbx)
|
||||
- - - 1.00 0.50 0.50 - - - - vmovups 64(%rax,%rbx), %zmm2
|
||||
- - 0.99 - - - - 0.01 - - vsubps %zmm1, %zmm2, %zmm1
|
||||
- - - - - 1.00 1.00 - - - vmovups %zmm1, 64(%rax,%rbx)
|
||||
- - - - - - - - 1.00 - cmpq %rdx, %r10
|
||||
- - - - - - - - 1.00 - je .LBB4_18
|
||||
- - - - 0.50 0.50 - - - - movq 160(%r15), %rdi
|
||||
- - - 1.00 - - - - - - incq %rdx
|
||||
- - - - - - - - 1.00 - jmp .LBB4_8
|
||||
@@ -0,0 +1,116 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: gromacs-icx-avx512-sp.s
|
||||
Architecture: ICX
|
||||
Timestamp: 2023-02-14 12:51:43
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD |
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
1338 | | | | | | | | | | || | | # pointer_increment=64 0f91ac4f7fe1a70d0c899f7f3e745649
|
||||
1339 | | | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
1340 | | | | | | | | | | || | | .LBB2_12: # Parent Loop BB2_7 Depth=1
|
||||
1341 | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2
|
||||
1342 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | movslq (%r11,%rax,4), %rcx
|
||||
1343 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || 6.0 | | leaq (%rcx,%rcx,2), %rdx
|
||||
1344 | 0.00 | | | | | | 1.00 | | | || 1.0 | | shlq $5, %rdx
|
||||
1345 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd (%rsi,%rdx), %zmm16
|
||||
1346 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vbroadcastf64x4 64(%rsi,%rdx), %zmm20 # zmm20 = mem[0,1,2,3,0,1,2,3]
|
||||
1347 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vbroadcastf64x4 (%rsi,%rdx), %zmm19 # zmm19 = mem[0,1,2,3,0,1,2,3]
|
||||
1348 | | | | | | 1.000 | | | | || | | vshuff64x2 $238, %zmm16, %zmm16, %zmm21 # zmm21 = zmm16[4,5,6,7,4,5,6,7]
|
||||
1349 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm19, %zmm6, %zmm18
|
||||
1350 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm21, %zmm10, %zmm17
|
||||
1351 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vsubps %zmm20, %zmm14, %zmm16
|
||||
1352 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm16, %zmm16, %zmm22
|
||||
1353 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231ps %zmm17, %zmm17, %zmm22 # zmm22 = (zmm17 * zmm17) + zmm22
|
||||
1354 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231ps %zmm18, %zmm18, %zmm22 # zmm22 = (zmm18 * zmm18) + zmm22
|
||||
1355 | 2.50 | | | | | 0.500 | | | | || 6.0 | | vrcp14ps %zmm22, %zmm23
|
||||
1356 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm23, %zmm26, %zmm24
|
||||
1357 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm24, %zmm23, %zmm24
|
||||
1358 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm24, %zmm23, %zmm24
|
||||
1359 | 0.75 | | | | | 0.250 | | | | || 4.0 | | vaddps %zmm1, %zmm24, %zmm25
|
||||
1360 | 1.00 | | | | | 0.000 | | | | || | | vmulps %zmm23, %zmm27, %zmm23
|
||||
1361 | 1.00 | | | | | 0.000 | | | | || 4.0 | | vmulps %zmm25, %zmm23, %zmm23
|
||||
1362 | 1.00 | | | | | 0.000 | | | | || 4.0 | | vmulps %zmm23, %zmm24, %zmm23
|
||||
1363 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | leal (%rcx,%rcx), %edx
|
||||
1364 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | xorl %edi, %edi
|
||||
1365 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | xorl %ebp, %ebp
|
||||
1366 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | cmpq %rdx, %r12
|
||||
1367 | 0.00 | | | | | | 1.00 | | | || | | setne %dil
|
||||
1368 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | leal 1(%rcx,%rcx), %ecx
|
||||
1369 | 0.00 | | | | | | 1.00 | | | || | | sete %bpl
|
||||
1370 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | xorl %edx, %edx
|
||||
1371 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | xorl %ebx, %ebx
|
||||
1372 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | cmpq %rcx, %r12
|
||||
1373 | 0.00 | | | | | | 1.00 | | | || | | sete %dl
|
||||
1374 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | movl $0, %ecx
|
||||
1375 | 0.00 | | | | | | 1.00 | | | || | | setne %bl
|
||||
1376 | 0.00 | | | | | | 1.00 | | | || | | cmovel %r8d, %ecx
|
||||
1377 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | movl %ebx, %r14d
|
||||
1378 | 0.00 | | | | | | 1.00 | | | || | | shll $4, %r14d
|
||||
1379 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | subl %ebp, %r14d
|
||||
1380 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | leal (%rcx,%rdi,2), %ecx
|
||||
1381 | 0.00 | | | | | | 1.00 | | | || | | shll $8, %ecx
|
||||
1382 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | addl $239, %r14d
|
||||
1383 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | addl $-768, %ecx # imm = 0xFD00
|
||||
1384 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | orl %r14d, %ecx
|
||||
1385 | 1.00 | | | | | | | | | || | | kmovd %ecx, %k2
|
||||
1386 | 0.50 | | | | | 0.500 | | | | || | | vcmpltps %zmm0, %zmm22, %k2 {%k2}
|
||||
1387 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm21, %zmm11, %zmm21
|
||||
1388 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm20, %zmm15, %zmm20
|
||||
1389 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm19, %zmm7, %zmm19
|
||||
1390 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm2, %zmm23, %zmm22
|
||||
1391 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231ps %zmm22, %zmm18, %zmm12 {%k2} # zmm12 {%k2} = (zmm18 * zmm22) + zmm12
|
||||
1392 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm20, %zmm20, %zmm18
|
||||
1393 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm21, %zmm21, %zmm18 # zmm18 = (zmm21 * zmm21) + zmm18
|
||||
1394 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm19, %zmm19, %zmm18 # zmm18 = (zmm19 * zmm19) + zmm18
|
||||
1395 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm22, %zmm17, %zmm9 {%k2} # zmm9 {%k2} = (zmm17 * zmm22) + zmm9
|
||||
1396 | 2.50 | | | | | 0.500 | | | | || | | vrcp14ps %zmm18, %zmm17
|
||||
1397 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm22, %zmm16, %zmm5 {%k2} # zmm5 {%k2} = (zmm16 * zmm22) + zmm5
|
||||
1398 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm17, %zmm26, %zmm16
|
||||
1399 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm16, %zmm17, %zmm16
|
||||
1400 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm16, %zmm17, %zmm16
|
||||
1401 | 0.00 | | | | | 1.000 | | | | || | | vaddps %zmm1, %zmm16, %zmm22
|
||||
1402 | 0.00 | | | | | 1.000 | | | | || | | vmulps %zmm17, %zmm27, %zmm17
|
||||
1403 | 0.00 | | | | | 1.000 | | | | || | | vmulps %zmm22, %zmm17, %zmm17
|
||||
1404 | 0.00 | | | | | 1.000 | | | | || | | vmulps %zmm17, %zmm16, %zmm16
|
||||
1405 | 0.00 | | | | | | 1.00 | | | || | | shll $6, %ebx
|
||||
1406 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | leal (%rbx,%rdi,4), %ecx
|
||||
1407 | 0.00 | | | | | | 1.00 | | | || | | shll $7, %edx
|
||||
1408 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | leal (%rdx,%rdi,8), %edx
|
||||
1409 | 0.00 | | | | | | 1.00 | | | || | | shll $8, %edx
|
||||
1410 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | addl %edx, %ecx
|
||||
1411 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | addl $-2117, %ecx # imm = 0xF7BB
|
||||
1412 | 1.00 | | | | | | | | | || | | kmovd %ecx, %k2
|
||||
1413 | 0.00 | | | | | 1.000 | | | | || | | vcmpltps %zmm0, %zmm18, %k2 {%k2}
|
||||
1414 | 0.00 | | | | | 1.000 | | | | || | | vmulps %zmm2, %zmm16, %zmm16
|
||||
1415 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231ps %zmm16, %zmm19, %zmm13 {%k2} # zmm13 {%k2} = (zmm19 * zmm16) + zmm13
|
||||
1416 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231ps %zmm16, %zmm21, %zmm8 {%k2} # zmm8 {%k2} = (zmm21 * zmm16) + zmm8
|
||||
1417 | 0.24 | | | | | 0.760 | | | | || | 4.0 | vfmadd231ps %zmm16, %zmm20, %zmm4 {%k2} # zmm4 {%k2} = (zmm20 * zmm16) + zmm4
|
||||
1418 | 0.00 | 1.00 | | | | -0.01 | 0.00 | | | || | | incq %rax
|
||||
1419 | 0.00 | 1.00 | | | | -0.01 | 0.00 | | | || | | cmpq %rax, %r10
|
||||
1420 | | | | | | | | | | || | | * jne .LBB2_12
|
||||
1421 | | | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
22.5 16.5 2.00 2.00 2.00 2.00 22.49 16.5 71 4.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
1417 | 4.0 | vfmadd231ps %zmm16, %zmm20, %zmm4 {%k2} # zmm4 {%k2} = (zmm20 * zmm16) + zmm4| [1417]
|
||||
1416 | 4.0 | vfmadd231ps %zmm16, %zmm21, %zmm8 {%k2} # zmm8 {%k2} = (zmm21 * zmm16) + zmm8| [1416]
|
||||
1415 | 4.0 | vfmadd231ps %zmm16, %zmm19, %zmm13 {%k2} # zmm13 {%k2} = (zmm19 * zmm16) + zmm13| [1415]
|
||||
1397 | 4.0 | vfmadd231ps %zmm22, %zmm16, %zmm5 {%k2} # zmm5 {%k2} = (zmm16 * zmm22) + zmm5| [1397]
|
||||
1395 | 4.0 | vfmadd231ps %zmm22, %zmm17, %zmm9 {%k2} # zmm9 {%k2} = (zmm17 * zmm22) + zmm9| [1395]
|
||||
1391 | 4.0 | vfmadd231ps %zmm22, %zmm18, %zmm12 {%k2} # zmm12 {%k2} = (zmm18 * zmm22) + zmm12| [1391]
|
||||
1418 | 1.0 | incq %rax | [1418]
|
||||
|
||||
@@ -0,0 +1,161 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: gromacs-icx-avx512-sp.s
|
||||
Architecture: CSX
|
||||
Timestamp: 2023-02-10 16:31:04
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
1662 | | | | | | | | || | | # pointer_increment=64 0f91ac4f7fe1a70d0c899f7f3e745649
|
||||
1663 | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
1664 | | | | | | | | || | | .LBB4_8: # =>This Inner Loop Header: Depth=1
|
||||
1665 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | movslq (%r11,%rdx,4), %rax
|
||||
1666 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || 1.0 | | movq %rax, %rsi
|
||||
1667 | 0.00 | | | | | | 1.00 | || 1.0 | | shlq $5, %rsi
|
||||
1668 | | 1.00 | | | | 0.00 | | || 1.0 | | leaq (%rsi,%rsi,2), %rbx
|
||||
1669 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%rdi,%rbx), %zmm15 # AlignMOV convert to UnAlignMOV
|
||||
1670 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 32(%rdi,%rbx), %zmm16 # AlignMOV convert to UnAlignMOV
|
||||
1671 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovups 64(%rdi,%rbx), %zmm27 # AlignMOV convert to UnAlignMOV
|
||||
1672 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 128(%rsp), %zmm1 # 64-byte Reload
|
||||
1673 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm24
|
||||
1674 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 320(%rsp), %zmm1 # 64-byte Reload
|
||||
1675 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm1, %zmm25
|
||||
1676 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubps %zmm27, %zmm9, %zmm26
|
||||
1677 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%rsp), %zmm1 # 64-byte Reload
|
||||
1678 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm21
|
||||
1679 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 256(%rsp), %zmm1 # 64-byte Reload
|
||||
1680 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm1, %zmm22
|
||||
1681 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm27, %zmm10, %zmm23
|
||||
1682 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 448(%rsp), %zmm1 # 64-byte Reload
|
||||
1683 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm17
|
||||
1684 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 192(%rsp), %zmm1 # 64-byte Reload
|
||||
1685 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm1, %zmm19
|
||||
1686 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm27, %zmm11, %zmm20
|
||||
1687 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 384(%rsp), %zmm1 # 64-byte Reload
|
||||
1688 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm18
|
||||
1689 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm8, %zmm16
|
||||
1690 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm27, %zmm12, %zmm15
|
||||
1691 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm26, %zmm26, %zmm27
|
||||
1692 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231ps %zmm25, %zmm25, %zmm27 # zmm27 = (zmm25 * zmm25) + zmm27
|
||||
1693 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231ps %zmm24, %zmm24, %zmm27 # zmm27 = (zmm24 * zmm24) + zmm27
|
||||
1694 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm23, %zmm23, %zmm28
|
||||
1695 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm22, %zmm22, %zmm28 # zmm28 = (zmm22 * zmm22) + zmm28
|
||||
1696 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm21, %zmm21, %zmm28 # zmm28 = (zmm21 * zmm21) + zmm28
|
||||
1697 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm20, %zmm20, %zmm29
|
||||
1698 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm19, %zmm19, %zmm29 # zmm29 = (zmm19 * zmm19) + zmm29
|
||||
1699 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm17, %zmm17, %zmm29 # zmm29 = (zmm17 * zmm17) + zmm29
|
||||
1700 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm15, %zmm15, %zmm30
|
||||
1701 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm16, %zmm16, %zmm30 # zmm30 = (zmm16 * zmm16) + zmm30
|
||||
1702 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14ps %zmm27, %zmm31
|
||||
1703 | 2.50 | | | | | 0.50 | | || | | vrcp14ps %zmm28, %zmm1
|
||||
1704 | 2.50 | | | | | 0.50 | | || | | vrcp14ps %zmm29, %zmm2
|
||||
1705 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm18, %zmm18, %zmm30 # zmm30 = (zmm18 * zmm18) + zmm30
|
||||
1706 | 2.50 | | | | | 0.50 | | || | | vrcp14ps %zmm30, %zmm3
|
||||
1707 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm31, %zmm6, %zmm4
|
||||
1708 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm4, %zmm31, %zmm4
|
||||
1709 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm4, %zmm31, %zmm4
|
||||
1710 | 0.50 | | | | | 0.50 | | || 4.0 | | vaddps %zmm13, %zmm4, %zmm5
|
||||
1711 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm7, %zmm31
|
||||
1712 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm5, %zmm31, %zmm5
|
||||
1713 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm6, %zmm31
|
||||
1714 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm1, %zmm31
|
||||
1715 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm1, %zmm31
|
||||
1716 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm5, %zmm4, %zmm4
|
||||
1717 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm13, %zmm31, %zmm5
|
||||
1718 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm7, %zmm1
|
||||
1719 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm5, %zmm1, %zmm1
|
||||
1720 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm6, %zmm5
|
||||
1721 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm5, %zmm2, %zmm5
|
||||
1722 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm5, %zmm2, %zmm5
|
||||
1723 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm31, %zmm1
|
||||
1724 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm13, %zmm5, %zmm31
|
||||
1725 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm7, %zmm2
|
||||
1726 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm2, %zmm2
|
||||
1727 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm3, %zmm6, %zmm31
|
||||
1728 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm3, %zmm31
|
||||
1729 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm3, %zmm31
|
||||
1730 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm5, %zmm2
|
||||
1731 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm13, %zmm31, %zmm5
|
||||
1732 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm3, %zmm7, %zmm3
|
||||
1733 | 1.00 | | | | | 0.00 | | || | | vmulps %zmm5, %zmm3, %zmm3
|
||||
1734 | 1.00 | | | | | 0.00 | | || | | vmulps %zmm3, %zmm31, %zmm3
|
||||
1735 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | xorl %esi, %esi
|
||||
1736 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | xorl %edi, %edi
|
||||
1737 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | testl $2147483647, %eax # imm = 0x7FFFFFFF
|
||||
1738 | 0.00 | | | | | | 1.00 | || | | sete %sil
|
||||
1739 | 0.00 | | | | | | 1.00 | || | | setne %dil
|
||||
1740 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl $255, %eax
|
||||
1741 | 0.00 | | | | | | 1.00 | || | | cmovel %r8d, %eax
|
||||
1742 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl $255, %ecx
|
||||
1743 | 0.00 | | | | | | 1.00 | || | | cmovel %r9d, %ecx
|
||||
1744 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | | xorl $255, %esi
|
||||
1745 | 1.00 | | | | | | | || | | kmovd %esi, %k1
|
||||
1746 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm27, %k1 {%k1}
|
||||
1747 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm14, %zmm4, %zmm4
|
||||
1748 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm4, %zmm24, %zmm5 {%k1} {z}
|
||||
1749 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm4, %zmm25, %zmm24 {%k1} {z}
|
||||
1750 | 0.25 | | | | | 0.75 | | || | | vmulps %zmm4, %zmm26, %zmm4 {%k1} {z}
|
||||
1751 | | 1.00 | | | | 0.00 | | || | | leal (%rdi,%rdi,2), %esi
|
||||
1752 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | orl $252, %esi
|
||||
1753 | 1.00 | | | | | | | || | | kmovd %esi, %k1
|
||||
1754 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm28, %k1 {%k1}
|
||||
1755 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm14, %zmm1, %zmm1
|
||||
1756 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm21, %zmm21 {%k1} {z}
|
||||
1757 | 0.50 | | | | | 0.50 | | || 4.0 | | vaddps %zmm21, %zmm5, %zmm5
|
||||
1758 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm22, %zmm21 {%k1} {z}
|
||||
1759 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm21, %zmm24, %zmm21
|
||||
1760 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm23, %zmm1 {%k1} {z}
|
||||
1761 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm1, %zmm4, %zmm1
|
||||
1762 | 1.00 | | | | | | | || | | kmovd %eax, %k1
|
||||
1763 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm29, %k1 {%k1}
|
||||
1764 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm14, %zmm2, %zmm2
|
||||
1765 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm17, %zmm4 {%k1} {z}
|
||||
1766 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm19, %zmm17 {%k1} {z}
|
||||
1767 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm20, %zmm2 {%k1} {z}
|
||||
1768 | 1.00 | | | | | | | || | | kmovd %ecx, %k1
|
||||
1769 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm30, %k1 {%k1}
|
||||
1770 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm14, %zmm3, %zmm3
|
||||
1771 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm3, %zmm18, %zmm18 {%k1} {z}
|
||||
1772 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm18, %zmm4, %zmm4
|
||||
1773 | 0.25 | | | | | 0.75 | | || 4.0 | | vaddps %zmm4, %zmm5, %zmm4
|
||||
1774 | 0.00 | | | | | 1.00 | | || | | vmulps %zmm3, %zmm16, %zmm5 {%k1} {z}
|
||||
1775 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm5, %zmm17, %zmm5
|
||||
1776 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm5, %zmm21, %zmm5
|
||||
1777 | 0.00 | | | | | 1.00 | | || | | vmulps %zmm3, %zmm15, %zmm3 {%k1} {z}
|
||||
1778 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | movq 176(%r15), %rax
|
||||
1779 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm3, %zmm2, %zmm2
|
||||
1780 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%rax,%rbx), %zmm3 # AlignMOV convert to UnAlignMOV
|
||||
1781 | 0.00 | | | | | 1.00 | | || 4.0 | | vsubps %zmm4, %zmm3, %zmm3
|
||||
1782 | | | 0.50 | 0.50 | 1.00 | | | || 0.0 | | vmovups %zmm3, (%rax,%rbx) # AlignMOV convert to UnAlignMOV
|
||||
1783 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm2, %zmm1, %zmm1
|
||||
1784 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 32(%rax,%rbx), %zmm2 # AlignMOV convert to UnAlignMOV
|
||||
1785 | 0.00 | | | | | 1.00 | | || | | vsubps %zmm5, %zmm2, %zmm2
|
||||
1786 | | | 0.50 | 0.50 | 1.00 | | | || | | vmovups %zmm2, 32(%rax,%rbx) # AlignMOV convert to UnAlignMOV
|
||||
1787 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 64(%rax,%rbx), %zmm2 # AlignMOV convert to UnAlignMOV
|
||||
1788 | 0.00 | | | | | 1.00 | | || | | vsubps %zmm1, %zmm2, %zmm1
|
||||
1789 | | | 0.50 | 0.50 | 1.00 | | | || | | vmovups %zmm1, 64(%rax,%rbx) # AlignMOV convert to UnAlignMOV
|
||||
1790 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rdx, %r10
|
||||
1791 | | | | | | | | || | | * je .LBB4_18
|
||||
1792 | | | | | | | | || | | # %bb.9: # in Loop: Header=BB4_8 Depth=1
|
||||
1793 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | movq 160(%r15), %rdi
|
||||
1794 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | 1.0 | incq %rdx
|
||||
1795 | 0.00 | | | | | | 1.00 | || | | jmp .LBB4_8
|
||||
1796 | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
50.0 9.00 9.50 8.00 9.50 8.00 3.00 50.0 9.00 79.0 1.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
1794 | 1.0 | incq %rdx | [1794]
|
||||
|
||||
@@ -0,0 +1,88 @@
|
||||
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||
Analyzed File - lammps-icc-avx2.o
|
||||
Binary Format - 64Bit
|
||||
Architecture - SKX
|
||||
Analysis Type - Throughput
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
Block Throughput: 25.58 Cycles Throughput Bottleneck: Backend
|
||||
Loop Count: 22
|
||||
Port Binding In Cycles Per Iteration:
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Cycles | 13.7 8.0 | 13.6 | 5.5 5.5 | 5.5 5.5 | 0.0 | 13.7 | 7.0 | 0.0 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
|
||||
DV - Divider pipe (on port 0)
|
||||
D - Data fetch pipe (on ports 2 and 3)
|
||||
F - Macro Fusion with the previous instruction occurred
|
||||
* - instruction micro-ops not bound to a port
|
||||
^ - Micro Fusion occurred
|
||||
# - ESP Tracking sync uop was issued
|
||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||
X - instruction not supported, was not accounted in Analysis
|
||||
|
||||
| Num Of | Ports pressure in cycles | |
|
||||
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
-----------------------------------------------------------------------------------------
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovdqu xmm0, xmmword ptr [rbx+rdx*4]
|
||||
| 1 | 1.0 | | | | | | | | vmovq rcx, xmm0
|
||||
| 1 | | | | | | 1.0 | | | vpunpckhqdq xmm2, xmm0, xmm0
|
||||
| 1 | 1.0 | | | | | | | | vmovq r15, xmm2
|
||||
| 1* | | | | | | | | | mov r8d, ecx
|
||||
| 1 | | | | | | | 1.0 | | shr rcx, 0x20
|
||||
| 1 | | | | | | 1.0 | | | lea r14d, ptr [rcx+rcx*2]
|
||||
| 1 | | | | | | 1.0 | | | lea r8d, ptr [r8+r8*2]
|
||||
| 1 | | | | | | | 1.0 | | movsxd rcx, r8d
|
||||
| 1 | | | | | | | 1.0 | | movsxd r8, r14d
|
||||
| 1* | | | | | | | | | mov r14d, r15d
|
||||
| 1 | | | | | | | 1.0 | | shr r15, 0x20
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups xmm7, xmmword ptr [r11+rcx*8]
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups xmm6, xmmword ptr [r11+r8*8]
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovq xmm14, qword ptr [r11+rcx*8+0x10]
|
||||
| 1 | | 0.3 | | | | 0.7 | | | lea r14d, ptr [r14+r14*2]
|
||||
| 1 | | | | | | | 1.0 | | movsxd r14, r14d
|
||||
| 1 | | 0.7 | | | | 0.3 | | | lea r15d, ptr [r15+r15*2]
|
||||
| 1 | | | | | | | 1.0 | | movsxd r15, r15d
|
||||
| 2 | | | 0.5 0.5 | 0.5 0.5 | | 1.0 | | | vmovhpd xmm15, xmm14, qword ptr [r11+r8*8+0x10]
|
||||
| 2 | | 1.0 | 0.5 0.5 | 0.5 0.5 | | | | | vinsertf128 ymm1, ymm7, xmmword ptr [r11+r14*8], 0x1
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovq xmm0, qword ptr [r11+r14*8+0x10]
|
||||
| 2 | | 0.3 | 0.5 0.5 | 0.5 0.5 | | 0.7 | | | vinsertf128 ymm6, ymm6, xmmword ptr [r11+r15*8], 0x1
|
||||
| 2 | | | 0.5 0.5 | 0.5 0.5 | | 1.0 | | | vmovhpd xmm2, xmm0, qword ptr [r11+r15*8+0x10]
|
||||
| 1 | | | | | | 1.0 | | | vunpcklpd ymm14, ymm1, ymm6
|
||||
| 1 | | | | | | 1.0 | | | vunpckhpd ymm1, ymm1, ymm6
|
||||
| 1 | 0.7 | 0.3 | | | | | | | vsubpd ymm6, ymm10, ymm14
|
||||
| 1 | | | | | | 1.0 | | | vinsertf128 ymm7, ymm15, xmm2, 0x1
|
||||
| 1 | 0.3 | 0.7 | | | | | | | vsubpd ymm2, ymm9, ymm1
|
||||
| 1 | 0.7 | 0.3 | | | | | | | vsubpd ymm0, ymm8, ymm7
|
||||
| 1 | 0.3 | 0.7 | | | | | | | vmulpd ymm14, ymm2, ymm2
|
||||
| 1 | 0.7 | 0.3 | | | | | | | vfmadd231pd ymm14, ymm6, ymm6
|
||||
| 1 | 0.3 | 0.7 | | | | | | | vfmadd231pd ymm14, ymm0, ymm0
|
||||
| 1 | 0.7 | 0.3 | | | | | | | vcmppd ymm1, ymm14, ymm5, 0x1
|
||||
| 1 | 0.3 | 0.7 | | | | | | | vpcmpeqd ymm7, ymm7, ymm7
|
||||
| 2 | 1.0 | | | | | 1.0 | | | vptest ymm1, ymm7
|
||||
| 1 | 1.0 8.0 | | | | | | | | vdivpd ymm7, ymm4, ymm14
|
||||
| 2^ | | 1.0 | 0.5 0.5 | 0.5 0.5 | | | | | vmulpd ymm14, ymm7, ymmword ptr [rsp+0x60]
|
||||
| 1 | | 1.0 | | | | | | | vmulpd ymm14, ymm7, ymm14
|
||||
| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm15, ymm7, ymm14
|
||||
| 1 | 0.3 | 0.7 | | | | | | | vfmsub213pd ymm14, ymm7, ymm3
|
||||
| 2^ | 0.7 | 0.3 | 0.5 0.5 | 0.5 0.5 | | | | | vmulpd ymm7, ymm7, ymmword ptr [rsp+0x40]
|
||||
| 1 | 0.3 | 0.7 | | | | | | | vmulpd ymm15, ymm15, ymm7
|
||||
| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm7, ymm15, ymm14
|
||||
| 1 | 0.3 | 0.7 | | | | | | | vmulpd ymm6, ymm6, ymm7
|
||||
| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm2, ymm2, ymm7
|
||||
| 1 | | | | | | 1.0 | | | vandpd ymm6, ymm1, ymm6
|
||||
| 1 | 0.3 | 0.7 | | | | | | | vaddpd ymm13, ymm13, ymm6
|
||||
| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm6, ymm0, ymm7
|
||||
| 1 | | | | | | 1.0 | | | vandpd ymm0, ymm1, ymm2
|
||||
| 1 | | | | | | 1.0 | | | vandpd ymm1, ymm1, ymm6
|
||||
| 1 | 0.3 | 0.7 | | | | | | | vaddpd ymm12, ymm12, ymm0
|
||||
| 1 | 0.7 | 0.3 | | | | | | | vaddpd ymm11, ymm11, ymm1
|
||||
| 1 | | | | | | | 1.0 | | add rdx, 0x4
|
||||
| 1* | | | | | | | | | cmp rdx, rsi
|
||||
| 0*F | | | | | | | | | jb 0xffffffffffffff02
|
||||
Total Num Of Uops: 62
|
||||
Analysis Notes:
|
||||
Backend allocation was stalled due to unavailable allocation resources.
|
||||
@@ -0,0 +1,156 @@
|
||||
|
||||
[0] Code Region
|
||||
|
||||
Iterations: 100
|
||||
Instructions: 5600
|
||||
Total Cycles: 2352
|
||||
Total uOps: 6300
|
||||
|
||||
Dispatch Width: 6
|
||||
uOps Per Cycle: 2.68
|
||||
IPC: 2.38
|
||||
Block RThroughput: 10.5
|
||||
|
||||
|
||||
Instruction Info:
|
||||
[1]: #uOps
|
||||
[2]: Latency
|
||||
[3]: RThroughput
|
||||
[4]: MayLoad
|
||||
[5]: MayStore
|
||||
[6]: HasSideEffects (U)
|
||||
|
||||
[1] [2] [3] [4] [5] [6] Instructions:
|
||||
1 6 0.50 * vmovdqu (%rbx,%rdx,4), %xmm0
|
||||
1 2 1.00 vmovq %xmm0, %rcx
|
||||
1 1 1.00 vpunpckhqdq %xmm0, %xmm0, %xmm2
|
||||
1 2 1.00 vmovq %xmm2, %r15
|
||||
1 1 0.25 movl %ecx, %r8d
|
||||
1 1 0.50 shrq $32, %rcx
|
||||
1 1 0.50 leal (%rcx,%rcx,2), %r14d
|
||||
1 1 0.50 leal (%r8,%r8,2), %r8d
|
||||
1 1 0.25 movslq %r8d, %rcx
|
||||
1 1 0.25 movslq %r14d, %r8
|
||||
1 1 0.25 movl %r15d, %r14d
|
||||
1 1 0.50 shrq $32, %r15
|
||||
1 6 0.50 * vmovups (%r11,%rcx,8), %xmm7
|
||||
1 6 0.50 * vmovups (%r11,%r8,8), %xmm6
|
||||
1 5 0.50 * vmovq 16(%r11,%rcx,8), %xmm14
|
||||
1 1 0.50 leal (%r14,%r14,2), %r14d
|
||||
1 1 0.25 movslq %r14d, %r14
|
||||
1 1 0.50 leal (%r15,%r15,2), %r15d
|
||||
1 1 0.25 movslq %r15d, %r15
|
||||
2 6 1.00 * vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15
|
||||
2 7 0.50 * vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1
|
||||
1 5 0.50 * vmovq 16(%r11,%r14,8), %xmm0
|
||||
2 7 0.50 * vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6
|
||||
2 6 1.00 * vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2
|
||||
1 1 1.00 vunpcklpd %ymm6, %ymm1, %ymm14
|
||||
1 1 1.00 vunpckhpd %ymm6, %ymm1, %ymm1
|
||||
1 4 0.50 vsubpd %ymm14, %ymm10, %ymm6
|
||||
1 3 1.00 vinsertf128 $1, %xmm2, %ymm15, %ymm7
|
||||
1 4 0.50 vsubpd %ymm1, %ymm9, %ymm2
|
||||
1 4 0.50 vsubpd %ymm7, %ymm8, %ymm0
|
||||
1 4 0.50 vmulpd %ymm2, %ymm2, %ymm14
|
||||
1 4 0.50 vfmadd231pd %ymm6, %ymm6, %ymm14
|
||||
1 4 0.50 vfmadd231pd %ymm0, %ymm0, %ymm14
|
||||
1 4 0.50 vcmpltpd %ymm5, %ymm14, %ymm1
|
||||
1 1 0.50 vpcmpeqd %ymm7, %ymm7, %ymm7
|
||||
2 3 1.00 vptest %ymm7, %ymm1
|
||||
1 14 5.00 vdivpd %ymm14, %ymm4, %ymm7
|
||||
2 11 0.50 * vmulpd 96(%rsp), %ymm7, %ymm14
|
||||
1 4 0.50 vmulpd %ymm14, %ymm7, %ymm14
|
||||
1 4 0.50 vmulpd %ymm14, %ymm7, %ymm15
|
||||
1 4 0.50 vfmsub213pd %ymm3, %ymm7, %ymm14
|
||||
2 11 0.50 * vmulpd 64(%rsp), %ymm7, %ymm7
|
||||
1 4 0.50 vmulpd %ymm7, %ymm15, %ymm15
|
||||
1 4 0.50 vmulpd %ymm14, %ymm15, %ymm7
|
||||
1 4 0.50 vmulpd %ymm7, %ymm6, %ymm6
|
||||
1 4 0.50 vmulpd %ymm7, %ymm2, %ymm2
|
||||
1 1 0.33 vandpd %ymm6, %ymm1, %ymm6
|
||||
1 4 0.50 vaddpd %ymm6, %ymm13, %ymm13
|
||||
1 4 0.50 vmulpd %ymm7, %ymm0, %ymm6
|
||||
1 1 0.33 vandpd %ymm2, %ymm1, %ymm0
|
||||
1 1 0.33 vandpd %ymm6, %ymm1, %ymm1
|
||||
1 4 0.50 vaddpd %ymm0, %ymm12, %ymm12
|
||||
1 4 0.50 vaddpd %ymm1, %ymm11, %ymm11
|
||||
1 1 0.25 addq $4, %rdx
|
||||
1 1 0.25 cmpq %rsi, %rdx
|
||||
1 1 0.50 jb ..B1.22
|
||||
|
||||
|
||||
Resources:
|
||||
[0] - SKXDivider
|
||||
[1] - SKXFPDivider
|
||||
[2] - SKXPort0
|
||||
[3] - SKXPort1
|
||||
[4] - SKXPort2
|
||||
[5] - SKXPort3
|
||||
[6] - SKXPort4
|
||||
[7] - SKXPort5
|
||||
[8] - SKXPort6
|
||||
[9] - SKXPort7
|
||||
|
||||
|
||||
Resource pressure per iteration:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
|
||||
- 5.00 16.00 14.12 5.50 5.50 - 13.47 8.41 -
|
||||
|
||||
Resource pressure by instruction:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
|
||||
- - - - 0.50 0.50 - - - - vmovdqu (%rbx,%rdx,4), %xmm0
|
||||
- - 1.00 - - - - - - - vmovq %xmm0, %rcx
|
||||
- - - - - - - 1.00 - - vpunpckhqdq %xmm0, %xmm0, %xmm2
|
||||
- - 1.00 - - - - - - - vmovq %xmm2, %r15
|
||||
- - - - - - - - 1.00 - movl %ecx, %r8d
|
||||
- - 0.06 - - - - - 0.94 - shrq $32, %rcx
|
||||
- - - 0.02 - - - 0.98 - - leal (%rcx,%rcx,2), %r14d
|
||||
- - - 0.02 - - - 0.98 - - leal (%r8,%r8,2), %r8d
|
||||
- - 0.47 0.02 - - - - 0.51 - movslq %r8d, %rcx
|
||||
- - 0.46 0.02 - - - 0.01 0.51 - movslq %r14d, %r8
|
||||
- - 0.03 0.01 - - - 0.45 0.51 - movl %r15d, %r14d
|
||||
- - 0.51 - - - - - 0.49 - shrq $32, %r15
|
||||
- - - - 0.49 0.51 - - - - vmovups (%r11,%rcx,8), %xmm7
|
||||
- - - - 0.49 0.51 - - - - vmovups (%r11,%r8,8), %xmm6
|
||||
- - - - 0.52 0.48 - - - - vmovq 16(%r11,%rcx,8), %xmm14
|
||||
- - - 0.02 - - - 0.98 - - leal (%r14,%r14,2), %r14d
|
||||
- - 0.01 0.01 - - - 0.01 0.97 - movslq %r14d, %r14
|
||||
- - - 0.03 - - - 0.97 - - leal (%r15,%r15,2), %r15d
|
||||
- - 0.04 - - - - - 0.96 - movslq %r15d, %r15
|
||||
- - - - 0.07 0.93 - 1.00 - - vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15
|
||||
- - 0.03 0.46 0.49 0.51 - 0.51 - - vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1
|
||||
- - - - 0.51 0.49 - - - - vmovq 16(%r11,%r14,8), %xmm0
|
||||
- - 0.47 0.02 0.93 0.07 - 0.51 - - vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6
|
||||
- - - - 0.50 0.50 - 1.00 - - vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2
|
||||
- - - - - - - 1.00 - - vunpcklpd %ymm6, %ymm1, %ymm14
|
||||
- - - - - - - 1.00 - - vunpckhpd %ymm6, %ymm1, %ymm1
|
||||
- - 0.01 0.99 - - - - - - vsubpd %ymm14, %ymm10, %ymm6
|
||||
- - - - - - - 1.00 - - vinsertf128 $1, %xmm2, %ymm15, %ymm7
|
||||
- - 0.96 0.04 - - - - - - vsubpd %ymm1, %ymm9, %ymm2
|
||||
- - 0.49 0.51 - - - - - - vsubpd %ymm7, %ymm8, %ymm0
|
||||
- - 0.48 0.52 - - - - - - vmulpd %ymm2, %ymm2, %ymm14
|
||||
- - 0.03 0.97 - - - - - - vfmadd231pd %ymm6, %ymm6, %ymm14
|
||||
- - 0.94 0.06 - - - - - - vfmadd231pd %ymm0, %ymm0, %ymm14
|
||||
- - 0.47 0.53 - - - - - - vcmpltpd %ymm5, %ymm14, %ymm1
|
||||
- - 0.96 0.04 - - - - - - vpcmpeqd %ymm7, %ymm7, %ymm7
|
||||
- - 1.00 - - - - 1.00 - - vptest %ymm7, %ymm1
|
||||
- 5.00 1.00 - - - - - - - vdivpd %ymm14, %ymm4, %ymm7
|
||||
- - 0.93 0.07 0.49 0.51 - - - - vmulpd 96(%rsp), %ymm7, %ymm14
|
||||
- - 0.05 0.95 - - - - - - vmulpd %ymm14, %ymm7, %ymm14
|
||||
- - 0.02 0.98 - - - - - - vmulpd %ymm14, %ymm7, %ymm15
|
||||
- - 0.98 0.02 - - - - - - vfmsub213pd %ymm3, %ymm7, %ymm14
|
||||
- - 0.07 0.93 0.51 0.49 - - - - vmulpd 64(%rsp), %ymm7, %ymm7
|
||||
- - 0.01 0.99 - - - - - - vmulpd %ymm7, %ymm15, %ymm15
|
||||
- - 0.01 0.99 - - - - - - vmulpd %ymm14, %ymm15, %ymm7
|
||||
- - 0.03 0.97 - - - - - - vmulpd %ymm7, %ymm6, %ymm6
|
||||
- - 0.97 0.03 - - - - - - vmulpd %ymm7, %ymm2, %ymm2
|
||||
- - 0.03 0.90 - - - 0.07 - - vandpd %ymm6, %ymm1, %ymm6
|
||||
- - 0.06 0.94 - - - - - - vaddpd %ymm6, %ymm13, %ymm13
|
||||
- - 0.03 0.97 - - - - - - vmulpd %ymm7, %ymm0, %ymm6
|
||||
- - 0.46 0.08 - - - 0.46 - - vandpd %ymm2, %ymm1, %ymm0
|
||||
- - 0.47 0.01 - - - 0.52 - - vandpd %ymm6, %ymm1, %ymm1
|
||||
- - 0.48 0.52 - - - - - - vaddpd %ymm0, %ymm12, %ymm12
|
||||
- - 0.52 0.48 - - - - - - vaddpd %ymm1, %ymm11, %ymm11
|
||||
- - 0.01 - - - - - 0.99 - addq $4, %rdx
|
||||
- - - - - - - 0.02 0.98 - cmpq %rsi, %rdx
|
||||
- - 0.45 - - - - - 0.55 - jb ..B1.22
|
||||
@@ -0,0 +1,158 @@
|
||||
|
||||
[0] Code Region
|
||||
|
||||
Iterations: 100
|
||||
Instructions: 5600
|
||||
Total Cycles: 2306
|
||||
Total uOps: 6300
|
||||
|
||||
Dispatch Width: 6
|
||||
uOps Per Cycle: 2.73
|
||||
IPC: 2.43
|
||||
Block RThroughput: 10.5
|
||||
|
||||
|
||||
Instruction Info:
|
||||
[1]: #uOps
|
||||
[2]: Latency
|
||||
[3]: RThroughput
|
||||
[4]: MayLoad
|
||||
[5]: MayStore
|
||||
[6]: HasSideEffects (U)
|
||||
|
||||
[1] [2] [3] [4] [5] [6] Instructions:
|
||||
1 6 0.50 * vmovdqu (%rbx,%rdx,4), %xmm0
|
||||
1 2 1.00 vmovq %xmm0, %rcx
|
||||
1 1 0.50 vpunpckhqdq %xmm0, %xmm0, %xmm2
|
||||
1 2 1.00 vmovq %xmm2, %r15
|
||||
1 1 0.25 movl %ecx, %r8d
|
||||
1 1 0.50 shrq $32, %rcx
|
||||
1 1 0.50 leal (%rcx,%rcx,2), %r14d
|
||||
1 1 0.50 leal (%r8,%r8,2), %r8d
|
||||
1 1 0.25 movslq %r8d, %rcx
|
||||
1 1 0.25 movslq %r14d, %r8
|
||||
1 1 0.25 movl %r15d, %r14d
|
||||
1 1 0.50 shrq $32, %r15
|
||||
1 6 0.50 * vmovups (%r11,%rcx,8), %xmm7
|
||||
1 6 0.50 * vmovups (%r11,%r8,8), %xmm6
|
||||
1 5 0.50 * vmovq 16(%r11,%rcx,8), %xmm14
|
||||
1 1 0.50 leal (%r14,%r14,2), %r14d
|
||||
1 1 0.25 movslq %r14d, %r14
|
||||
1 1 0.50 leal (%r15,%r15,2), %r15d
|
||||
1 1 0.25 movslq %r15d, %r15
|
||||
2 6 1.00 * vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15
|
||||
2 7 0.50 * vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1
|
||||
1 5 0.50 * vmovq 16(%r11,%r14,8), %xmm0
|
||||
2 7 0.50 * vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6
|
||||
2 6 1.00 * vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2
|
||||
1 1 1.00 vunpcklpd %ymm6, %ymm1, %ymm14
|
||||
1 1 1.00 vunpckhpd %ymm6, %ymm1, %ymm1
|
||||
1 4 0.50 vsubpd %ymm14, %ymm10, %ymm6
|
||||
1 3 1.00 vinsertf128 $1, %xmm2, %ymm15, %ymm7
|
||||
1 4 0.50 vsubpd %ymm1, %ymm9, %ymm2
|
||||
1 4 0.50 vsubpd %ymm7, %ymm8, %ymm0
|
||||
1 4 0.50 vmulpd %ymm2, %ymm2, %ymm14
|
||||
1 4 0.50 vfmadd231pd %ymm6, %ymm6, %ymm14
|
||||
1 4 0.50 vfmadd231pd %ymm0, %ymm0, %ymm14
|
||||
1 4 0.50 vcmpltpd %ymm5, %ymm14, %ymm1
|
||||
1 1 0.50 vpcmpeqd %ymm7, %ymm7, %ymm7
|
||||
2 3 1.00 vptest %ymm7, %ymm1
|
||||
1 14 5.00 vdivpd %ymm14, %ymm4, %ymm7
|
||||
2 11 0.50 * vmulpd 96(%rsp), %ymm7, %ymm14
|
||||
1 4 0.50 vmulpd %ymm14, %ymm7, %ymm14
|
||||
1 4 0.50 vmulpd %ymm14, %ymm7, %ymm15
|
||||
1 4 0.50 vfmsub213pd %ymm3, %ymm7, %ymm14
|
||||
2 11 0.50 * vmulpd 64(%rsp), %ymm7, %ymm7
|
||||
1 4 0.50 vmulpd %ymm7, %ymm15, %ymm15
|
||||
1 4 0.50 vmulpd %ymm14, %ymm15, %ymm7
|
||||
1 4 0.50 vmulpd %ymm7, %ymm6, %ymm6
|
||||
1 4 0.50 vmulpd %ymm7, %ymm2, %ymm2
|
||||
1 1 0.33 vandpd %ymm6, %ymm1, %ymm6
|
||||
1 4 0.50 vaddpd %ymm6, %ymm13, %ymm13
|
||||
1 4 0.50 vmulpd %ymm7, %ymm0, %ymm6
|
||||
1 1 0.33 vandpd %ymm2, %ymm1, %ymm0
|
||||
1 1 0.33 vandpd %ymm6, %ymm1, %ymm1
|
||||
1 4 0.50 vaddpd %ymm0, %ymm12, %ymm12
|
||||
1 4 0.50 vaddpd %ymm1, %ymm11, %ymm11
|
||||
1 1 0.25 addq $4, %rdx
|
||||
1 1 0.25 cmpq %rsi, %rdx
|
||||
1 1 0.50 jb ..B1.22
|
||||
|
||||
|
||||
Resources:
|
||||
[0] - ICXDivider
|
||||
[1] - ICXFPDivider
|
||||
[2] - ICXPort0
|
||||
[3] - ICXPort1
|
||||
[4] - ICXPort2
|
||||
[5] - ICXPort3
|
||||
[6] - ICXPort4
|
||||
[7] - ICXPort5
|
||||
[8] - ICXPort6
|
||||
[9] - ICXPort7
|
||||
[10] - ICXPort8
|
||||
[11] - ICXPort9
|
||||
|
||||
|
||||
Resource pressure per iteration:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11]
|
||||
- 5.00 15.12 15.03 5.50 5.50 - 13.45 8.40 - - -
|
||||
|
||||
Resource pressure by instruction:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions:
|
||||
- - - - 0.50 0.50 - - - - - - vmovdqu (%rbx,%rdx,4), %xmm0
|
||||
- - 1.00 - - - - - - - - - vmovq %xmm0, %rcx
|
||||
- - - 0.46 - - - 0.54 - - - - vpunpckhqdq %xmm0, %xmm0, %xmm2
|
||||
- - 1.00 - - - - - - - - - vmovq %xmm2, %r15
|
||||
- - - - - - - - 1.00 - - - movl %ecx, %r8d
|
||||
- - 0.96 - - - - - 0.04 - - - shrq $32, %rcx
|
||||
- - - 0.01 - - - 0.99 - - - - leal (%rcx,%rcx,2), %r14d
|
||||
- - - 0.03 - - - 0.97 - - - - leal (%r8,%r8,2), %r8d
|
||||
- - 0.48 0.01 - - - - 0.51 - - - movslq %r8d, %rcx
|
||||
- - 0.02 0.02 - - - 0.01 0.95 - - - movslq %r14d, %r8
|
||||
- - 0.02 - - - - - 0.98 - - - movl %r15d, %r14d
|
||||
- - 0.52 - - - - - 0.48 - - - shrq $32, %r15
|
||||
- - - - 0.49 0.51 - - - - - - vmovups (%r11,%rcx,8), %xmm7
|
||||
- - - - 0.49 0.51 - - - - - - vmovups (%r11,%r8,8), %xmm6
|
||||
- - - - 0.52 0.48 - - - - - - vmovq 16(%r11,%rcx,8), %xmm14
|
||||
- - - 0.47 - - - 0.53 - - - - leal (%r14,%r14,2), %r14d
|
||||
- - 0.01 0.01 - - - 0.01 0.97 - - - movslq %r14d, %r14
|
||||
- - - 0.04 - - - 0.96 - - - - leal (%r15,%r15,2), %r15d
|
||||
- - 0.48 - - - - 0.01 0.51 - - - movslq %r15d, %r15
|
||||
- - - - 0.51 0.49 - 1.00 - - - - vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15
|
||||
- - 0.02 0.01 0.95 0.05 - 0.97 - - - - vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1
|
||||
- - - - 0.05 0.95 - - - - - - vmovq 16(%r11,%r14,8), %xmm0
|
||||
- - 0.02 0.49 0.49 0.51 - 0.49 - - - - vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6
|
||||
- - - - 0.50 0.50 - 1.00 - - - - vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2
|
||||
- - - - - - - 1.00 - - - - vunpcklpd %ymm6, %ymm1, %ymm14
|
||||
- - - - - - - 1.00 - - - - vunpckhpd %ymm6, %ymm1, %ymm1
|
||||
- - 0.47 0.53 - - - - - - - - vsubpd %ymm14, %ymm10, %ymm6
|
||||
- - - - - - - 1.00 - - - - vinsertf128 $1, %xmm2, %ymm15, %ymm7
|
||||
- - 0.50 0.50 - - - - - - - - vsubpd %ymm1, %ymm9, %ymm2
|
||||
- - 0.94 0.06 - - - - - - - - vsubpd %ymm7, %ymm8, %ymm0
|
||||
- - 0.06 0.94 - - - - - - - - vmulpd %ymm2, %ymm2, %ymm14
|
||||
- - 0.04 0.96 - - - - - - - - vfmadd231pd %ymm6, %ymm6, %ymm14
|
||||
- - 0.95 0.05 - - - - - - - - vfmadd231pd %ymm0, %ymm0, %ymm14
|
||||
- - 0.02 0.98 - - - - - - - - vcmpltpd %ymm5, %ymm14, %ymm1
|
||||
- - 0.05 0.95 - - - - - - - - vpcmpeqd %ymm7, %ymm7, %ymm7
|
||||
- - 1.00 - - - - 1.00 - - - - vptest %ymm7, %ymm1
|
||||
- 5.00 1.00 - - - - - - - - - vdivpd %ymm14, %ymm4, %ymm7
|
||||
- - 0.51 0.49 0.49 0.51 - - - - - - vmulpd 96(%rsp), %ymm7, %ymm14
|
||||
- - 0.04 0.96 - - - - - - - - vmulpd %ymm14, %ymm7, %ymm14
|
||||
- - 0.01 0.99 - - - - - - - - vmulpd %ymm14, %ymm7, %ymm15
|
||||
- - 0.99 0.01 - - - - - - - - vfmsub213pd %ymm3, %ymm7, %ymm14
|
||||
- - 0.49 0.51 0.51 0.49 - - - - - - vmulpd 64(%rsp), %ymm7, %ymm7
|
||||
- - 0.01 0.99 - - - - - - - - vmulpd %ymm7, %ymm15, %ymm15
|
||||
- - 0.01 0.99 - - - - - - - - vmulpd %ymm14, %ymm15, %ymm7
|
||||
- - 0.48 0.52 - - - - - - - - vmulpd %ymm7, %ymm6, %ymm6
|
||||
- - 0.52 0.48 - - - - - - - - vmulpd %ymm7, %ymm2, %ymm2
|
||||
- - 0.46 0.02 - - - 0.52 - - - - vandpd %ymm6, %ymm1, %ymm6
|
||||
- - 0.49 0.51 - - - - - - - - vaddpd %ymm6, %ymm13, %ymm13
|
||||
- - 0.48 0.52 - - - - - - - - vmulpd %ymm7, %ymm0, %ymm6
|
||||
- - 0.02 0.52 - - - 0.46 - - - - vandpd %ymm2, %ymm1, %ymm0
|
||||
- - 0.02 - - - - 0.98 - - - - vandpd %ymm6, %ymm1, %ymm1
|
||||
- - 0.49 0.51 - - - - - - - - vaddpd %ymm0, %ymm12, %ymm12
|
||||
- - 0.51 0.49 - - - - - - - - vaddpd %ymm1, %ymm11, %ymm11
|
||||
- - 0.01 - - - - - 0.99 - - - addq $4, %rdx
|
||||
- - 0.01 - - - - 0.01 0.98 - - - cmpq %rsi, %rdx
|
||||
- - 0.01 - - - - - 0.99 - - - jb ..B1.22
|
||||
@@ -0,0 +1,97 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: lammps-icc-avx2.s
|
||||
Architecture: CSX
|
||||
Timestamp: 2023-02-10 16:29:58
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||
----------------------------------------------------------------------------------------------------
|
||||
256 | | | | | | | | || | | # pointer_increment=32 724d27eafcb27eabca1528ddfdbdba3e
|
||||
257 | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
258 | | | | | | | | || | | ..B1.22: # Preds ..B1.24 ..B1.21
|
||||
259 | | | | | | | | || | | # Execution count [2.50e+01]
|
||||
260 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovdqu (%rbx,%rdx,4), %xmm0 #60.21
|
||||
261 | 1.00 | | | | | | | || 1.0 | | vmovq %xmm0, %rcx #60.21
|
||||
262 | | | | | | 1.000 | | || | | vpunpckhqdq %xmm0, %xmm0, %xmm2 #60.21
|
||||
263 | 1.00 | | | | | | | || | | vmovq %xmm2, %r15 #60.21
|
||||
264 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || 1.0 | | movl %ecx, %r8d #60.21
|
||||
265 | 0.00 | | | | | | 1.00 | || | | shrq $32, %rcx #60.21
|
||||
266 | | 0.500 | | | | 0.500 | | || | | lea (%rcx,%rcx,2), %r14d #61.36
|
||||
267 | | 0.500 | | | | 0.500 | | || 1.0 | | lea (%r8,%r8,2), %r8d #61.36
|
||||
268 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || 1.0 | | movslq %r8d, %rcx #61.36
|
||||
269 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movslq %r14d, %r8 #61.36
|
||||
270 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movl %r15d, %r14d #60.21
|
||||
271 | 0.00 | | | | | | 1.00 | || | | shrq $32, %r15 #60.21
|
||||
272 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovups (%r11,%rcx,8), %xmm7 #61.36
|
||||
273 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%r11,%r8,8), %xmm6 #61.36
|
||||
274 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovq 16(%r11,%rcx,8), %xmm14 #61.36
|
||||
275 | | 0.500 | | | | 0.500 | | || | | lea (%r14,%r14,2), %r14d #61.36
|
||||
276 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movslq %r14d, %r14 #61.36
|
||||
277 | | 0.500 | | | | 0.500 | | || | | lea (%r15,%r15,2), %r15d #61.36
|
||||
278 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movslq %r15d, %r15 #61.36
|
||||
279 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || | | vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15 #61.36
|
||||
280 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || 3.0 | | vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1 #61.36
|
||||
281 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovq 16(%r11,%r14,8), %xmm0 #61.36
|
||||
282 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || | | vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6 #61.36
|
||||
283 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || | | vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2 #61.36
|
||||
284 | | | | | | 1.000 | | || | | vunpcklpd %ymm6, %ymm1, %ymm14 #61.36
|
||||
285 | | | | | | 1.000 | | || 1.0 | | vunpckhpd %ymm6, %ymm1, %ymm1 #61.36
|
||||
286 | 0.50 | 0.500 | | | | | | || | | vsubpd %ymm14, %ymm10, %ymm6 #61.36
|
||||
287 | | | | | | 1.000 | | || | | vinsertf128 $1, %xmm2, %ymm15, %ymm7 #61.36
|
||||
288 | 0.50 | 0.500 | | | | | | || 4.0 | | vsubpd %ymm1, %ymm9, %ymm2 #62.36
|
||||
289 | 0.50 | 0.500 | | | | | | || | | vsubpd %ymm7, %ymm8, %ymm0 #63.36
|
||||
290 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm2, %ymm2, %ymm14 #64.49
|
||||
291 | 0.50 | 0.500 | | | | | | || 4.0 | | vfmadd231pd %ymm6, %ymm6, %ymm14 #64.49
|
||||
292 | 0.50 | 0.500 | | | | | | || 4.0 | | vfmadd231pd %ymm0, %ymm0, %ymm14 #64.63
|
||||
293 | | | | | | 1.000 | | || | | vcmpltpd %ymm5, %ymm14, %ymm1 #74.22
|
||||
294 | 0.50 | 0.500 | | | | | | || | | vpcmpeqd %ymm7, %ymm7, %ymm7 #74.22
|
||||
295 | 1.00 | | | | | 1.000 | | || | | vptest %ymm7, %ymm1 #74.22
|
||||
296 | | | | | | | | || | | #je ..B1.24 # Prob 50% #74.22
|
||||
297 | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ymm14
|
||||
298 | | | | | | | | || | | ..B1.23: # Preds ..B1.22
|
||||
299 | | | | | | | | || | | # Execution count [1.25e+01]
|
||||
300 | 1.00 8.00 | | | | | | | || 15.0 | | vdivpd %ymm14, %ymm4, %ymm7 #75.39
|
||||
301 | 0.50 | 0.500 | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmulpd 96(%rsp), %ymm7, %ymm14 #76.38[spill]
|
||||
302 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm14 #76.44
|
||||
303 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm15 #76.50
|
||||
304 | 0.50 | 0.500 | | | | | | || | | vfmsub213pd %ymm3, %ymm7, %ymm14 #77.55
|
||||
305 | 0.50 | 0.500 | 0.50 0.50 | 0.50 0.50 | | | | || | | vmulpd 64(%rsp), %ymm7, %ymm7 #77.55[spill]
|
||||
306 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm7, %ymm15, %ymm15 #77.64
|
||||
307 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm14, %ymm15, %ymm7 #77.70
|
||||
308 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm7, %ymm6, %ymm6 #78.31
|
||||
309 | 0.50 | 0.500 | | | | | | || | | vmulpd %ymm7, %ymm2, %ymm2 #79.31
|
||||
310 | 0.25 | 0.253 | | | | 0.493 | | || 1.0 | | vandpd %ymm6, %ymm1, %ymm6 #78.31
|
||||
311 | 0.50 | 0.500 | | | | | | || 4.0 | | vaddpd %ymm6, %ymm13, %ymm13 #78.17
|
||||
312 | 0.25 | 0.750 | | | | | | || | | vmulpd %ymm7, %ymm0, %ymm6 #80.31
|
||||
313 | 0.16 | 0.417 | | | | 0.423 | | || | | vandpd %ymm2, %ymm1, %ymm0 #79.31
|
||||
314 | 0.00 | 0.250 | | | | 0.750 | | || | | vandpd %ymm6, %ymm1, %ymm1 #80.31
|
||||
315 | 0.00 | 1.000 | | | | | | || | | vaddpd %ymm0, %ymm12, %ymm12 #79.17
|
||||
316 | 0.50 | 0.500 | | | | | | || | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17
|
||||
317 | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm3 ymm4 ymm5 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13
|
||||
318 | | | | | | | | || | | ..B1.24: # Preds ..B1.23 ..B1.22
|
||||
319 | | | | | | | | || | | # Execution count [2.50e+01]
|
||||
320 | 0.00 | 0.000 | | | | -0.01 | 1.00 | || | | addq $4, %rdx #59.9
|
||||
321 | 0.00 | -0.01 | | | | 0.000 | 1.00 | || | | cmpq %rsi, %rdx #59.9
|
||||
322 | | | | | | | | || | | * jb ..B1.22 # Prob 82% #59.9
|
||||
323 | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
13.7 8.00 13.66 5.50 5.50 5.50 5.50 13.66 10.0 76.0 4.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
316 | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17| [316]
|
||||
315 | 4.0 | vaddpd %ymm0, %ymm12, %ymm12 #79.17| [315]
|
||||
311 | 4.0 | vaddpd %ymm6, %ymm13, %ymm13 #78.17| [311]
|
||||
320 | 1.0 | addq $4, %rdx #59.9| [320]
|
||||
|
||||
@@ -0,0 +1,97 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: lammps-icc-avx2.s
|
||||
Architecture: ICX
|
||||
Timestamp: 2023-02-10 16:29:48
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD |
|
||||
-----------------------------------------------------------------------------------------------------------------------
|
||||
256 | | | | | | | | | | || | | # pointer_increment=32 724d27eafcb27eabca1528ddfdbdba3e
|
||||
257 | | | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
258 | | | | | | | | | | || | | ..B1.22: # Preds ..B1.24 ..B1.21
|
||||
259 | | | | | | | | | | || | | # Execution count [2.50e+01]
|
||||
260 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovdqu (%rbx,%rdx,4), %xmm0 #60.21
|
||||
261 | 1.00 | | | | | | | | | || 1.0 | | vmovq %xmm0, %rcx #60.21
|
||||
262 | | 0.50 | | | | 0.50 | | | | || | | vpunpckhqdq %xmm0, %xmm0, %xmm2 #60.21
|
||||
263 | 1.00 | | | | | | | | | || | | vmovq %xmm2, %r15 #60.21
|
||||
264 | 0.37 | 0.00 | | | | 0.25 | 0.38 | | | || 1.0 | | movl %ecx, %r8d #60.21
|
||||
265 | 0.50 | | | | | | 0.50 | | | || | | shrq $32, %rcx #60.21
|
||||
266 | 0.13 | 0.00 | | | | 0.00 | 0.87 | | | || | | lea (%rcx,%rcx,2), %r14d #61.36
|
||||
267 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || 6.0 | | lea (%r8,%r8,2), %r8d #61.36
|
||||
268 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || 1.0 | | movslq %r8d, %rcx #61.36
|
||||
269 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movslq %r14d, %r8 #61.36
|
||||
270 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movl %r15d, %r14d #60.21
|
||||
271 | 0.00 | | | | | | 1.00 | | | || | | shrq $32, %r15 #60.21
|
||||
272 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovups (%r11,%rcx,8), %xmm7 #61.36
|
||||
273 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovups (%r11,%r8,8), %xmm6 #61.36
|
||||
274 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovq 16(%r11,%rcx,8), %xmm14 #61.36
|
||||
275 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | lea (%r14,%r14,2), %r14d #61.36
|
||||
276 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movslq %r14d, %r14 #61.36
|
||||
277 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | lea (%r15,%r15,2), %r15d #61.36
|
||||
278 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movslq %r15d, %r15 #61.36
|
||||
279 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || | | vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15 #61.36
|
||||
280 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || 3.0 | | vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1 #61.36
|
||||
281 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovq 16(%r11,%r14,8), %xmm0 #61.36
|
||||
282 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || | | vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6 #61.36
|
||||
283 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || | | vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2 #61.36
|
||||
284 | | | | | | 1.00 | | | | || | | vunpcklpd %ymm6, %ymm1, %ymm14 #61.36
|
||||
285 | | | | | | 1.00 | | | | || 1.0 | | vunpckhpd %ymm6, %ymm1, %ymm1 #61.36
|
||||
286 | 0.50 | 0.50 | | | | | | | | || | | vsubpd %ymm14, %ymm10, %ymm6 #61.36
|
||||
287 | | | | | | 1.00 | | | | || | | vinsertf128 $1, %xmm2, %ymm15, %ymm7 #61.36
|
||||
288 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vsubpd %ymm1, %ymm9, %ymm2 #62.36
|
||||
289 | 0.50 | 0.50 | | | | | | | | || | | vsubpd %ymm7, %ymm8, %ymm0 #63.36
|
||||
290 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm2, %ymm2, %ymm14 #64.49
|
||||
291 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vfmadd231pd %ymm6, %ymm6, %ymm14 #64.49
|
||||
292 | 0.75 | 0.25 | | | | | | | | || 4.0 | | vfmadd231pd %ymm0, %ymm0, %ymm14 #64.63
|
||||
293 | 0.00 | | | | | 1.00 | | | | || | | vcmpltpd %ymm5, %ymm14, %ymm1 #74.22
|
||||
294 | 0.50 | 0.50 | | | | | | | | || | | vpcmpeqd %ymm7, %ymm7, %ymm7 #74.22
|
||||
295 | 1.00 | | | | | 1.00 | | | | || | | vptest %ymm7, %ymm1 #74.22
|
||||
296 | | | | | | | | | | || | | #je ..B1.24 # Prob 50% #74.22
|
||||
297 | | | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ymm14
|
||||
298 | | | | | | | | | | || | | ..B1.23: # Preds ..B1.22
|
||||
299 | | | | | | | | | | || | | # Execution count [1.25e+01]
|
||||
300 | 1.00 8.00 | | | | | | | | | || 13.0 | | vdivpd %ymm14, %ymm4, %ymm7 #75.39
|
||||
301 | 0.50 | 0.50 | 0.50 0.50 | 0.50 0.50 | | | | | | || 4.0 | | vmulpd 96(%rsp), %ymm7, %ymm14 #76.38[spill]
|
||||
302 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm14 #76.44
|
||||
303 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm15 #76.50
|
||||
304 | 0.50 | 0.50 | | | | | | | | || | | vfmsub213pd %ymm3, %ymm7, %ymm14 #77.55
|
||||
305 | 0.50 | 0.50 | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmulpd 64(%rsp), %ymm7, %ymm7 #77.55[spill]
|
||||
306 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm7, %ymm15, %ymm15 #77.64
|
||||
307 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm14, %ymm15, %ymm7 #77.70
|
||||
308 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm7, %ymm6, %ymm6 #78.31
|
||||
309 | 0.00 | 1.00 | | | | | | | | || | | vmulpd %ymm7, %ymm2, %ymm2 #79.31
|
||||
310 | 0.00 | 0.00 | | | | 1.00 | | | | || 1.0 | | vandpd %ymm6, %ymm1, %ymm6 #78.31
|
||||
311 | 0.00 | 1.00 | | | | | | | | || 4.0 | | vaddpd %ymm6, %ymm13, %ymm13 #78.17
|
||||
312 | 0.00 | 1.00 | | | | | | | | || | | vmulpd %ymm7, %ymm0, %ymm6 #80.31
|
||||
313 | 0.00 | 0.00 | | | | 1.00 | | | | || | | vandpd %ymm2, %ymm1, %ymm0 #79.31
|
||||
314 | 0.00 | 0.00 | | | | 1.00 | | | | || | | vandpd %ymm6, %ymm1, %ymm1 #80.31
|
||||
315 | 0.00 | 1.00 | | | | | | | | || | | vaddpd %ymm0, %ymm12, %ymm12 #79.17
|
||||
316 | 0.00 | 1.00 | | | | | | | | || | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17
|
||||
317 | | | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm3 ymm4 ymm5 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13
|
||||
318 | | | | | | | | | | || | | ..B1.24: # Preds ..B1.23 ..B1.22
|
||||
319 | | | | | | | | | | || | | # Execution count [2.50e+01]
|
||||
320 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | addq $4, %rdx #59.9
|
||||
321 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | cmpq %rsi, %rdx #59.9
|
||||
322 | | | | | | | | | | || | | * jb ..B1.22 # Prob 82% #59.9
|
||||
323 | | | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
12.8 8.00 12.8 5.50 5.50 5.50 5.50 12.8 12.8 81 4.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
316 | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17| [316]
|
||||
315 | 4.0 | vaddpd %ymm0, %ymm12, %ymm12 #79.17| [315]
|
||||
311 | 4.0 | vaddpd %ymm6, %ymm13, %ymm13 #78.17| [311]
|
||||
320 | 1.0 | addq $4, %rdx #59.9| [320]
|
||||
|
||||
@@ -0,0 +1,75 @@
|
||||
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||
Analyzed File - lammps-icc-avx512.o
|
||||
Binary Format - 64Bit
|
||||
Architecture - SKX
|
||||
Analysis Type - Throughput
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
Block Throughput: 30.89 Cycles Throughput Bottleneck: Backend
|
||||
Loop Count: 22
|
||||
Port Binding In Cycles Per Iteration:
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Cycles | 19.0 0.0 | 4.0 | 13.0 13.0 | 13.0 13.0 | 0.0 | 17.0 | 4.0 | 0.0 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
|
||||
DV - Divider pipe (on port 0)
|
||||
D - Data fetch pipe (on ports 2 and 3)
|
||||
F - Macro Fusion with the previous instruction occurred
|
||||
* - instruction micro-ops not bound to a port
|
||||
^ - Micro Fusion occurred
|
||||
# - ESP Tracking sync uop was issued
|
||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||
X - instruction not supported, was not accounted in Analysis
|
||||
|
||||
| Num Of | Ports pressure in cycles | |
|
||||
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
-----------------------------------------------------------------------------------------
|
||||
| 1 | | | | | | 1.0 | | | vpcmpgtd k5, ymm3, ymm4
|
||||
| 1 | | 1.0 | | | | | | | vpaddd ymm4, ymm4, ymm15
|
||||
| 2 | | 1.0 | 1.0 1.0 | | | | | | vmovdqu32 ymm17{k5}{z}, ymmword ptr [r10+r15*4]
|
||||
| 1 | | 1.0 | | | | | | | vpaddd ymm18, ymm17, ymm17
|
||||
| 1 | | | | | | | 1.0 | | add r15, 0x8
|
||||
| 1 | | 1.0 | | | | | | | vpaddd ymm19, ymm17, ymm18
|
||||
| 1 | 1.0 | | | | | | | | kmovw k2, k5
|
||||
| 1 | 1.0 | | | | | | | | kmovw k3, k5
|
||||
| 1 | 1.0 | | | | | | | | kmovw k1, k5
|
||||
| 1* | | | | | | | | | vpxord zmm21, zmm21, zmm21
|
||||
| 1* | | | | | | | | | vpxord zmm20, zmm20, zmm20
|
||||
| 1* | | | | | | | | | vpxord zmm22, zmm22, zmm22
|
||||
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm21, k2, zmmword ptr [rbx+ymm19*8+0x8]
|
||||
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm20, k3, zmmword ptr [rbx+ymm19*8]
|
||||
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm22, k1, zmmword ptr [rbx+ymm19*8+0x10]
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm18, zmm1, zmm21
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm17, zmm2, zmm20
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm19, zmm0, zmm22
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm31, zmm18, zmm18
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm31, zmm17, zmm17
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm31, zmm19, zmm19
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm30, zmm31
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k6{k5}, zmm31, zmm14, 0x1
|
||||
| 1 | | | | | | 1.0 | | | vfpclasspd k0, zmm30, 0x1e
|
||||
| 1* | | | | | | | | | vmovaps zmm23, zmm31
|
||||
| 2^ | 1.0 | | | 1.0 1.0 | | | | | vfnmadd213pd zmm23, zmm30, qword ptr [rip]{1to8}
|
||||
| 1 | 1.0 | | | | | | | | knotw k4, k0
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm24, zmm23, zmm23
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm30{k4}, zmm23, zmm30
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm30{k4}, zmm24, zmm30
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm25, zmm30, zmm13
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm27, zmm30, zmm12
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm28, zmm30, zmm25
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm26, zmm30, zmm28
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213pd zmm30, zmm28, zmm5
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm29, zmm26, zmm27
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm23, zmm29, zmm30
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm10{k6}, zmm23, zmm17
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm9{k6}, zmm23, zmm18
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm8{k6}, zmm23, zmm19
|
||||
| 1* | | | | | | | | | cmp r15, r14
|
||||
| 0*F | | | | | | | | | jb 0xffffffffffffff0c
|
||||
Total Num Of Uops: 57
|
||||
Analysis Notes:
|
||||
Backend allocation was stalled due to unavailable allocation resources.
|
||||
There were bubbles in the frontend.
|
||||
@@ -0,0 +1,128 @@
|
||||
|
||||
[0] Code Region
|
||||
|
||||
Iterations: 100
|
||||
Instructions: 4200
|
||||
Total Cycles: 2465
|
||||
Total uOps: 5800
|
||||
|
||||
Dispatch Width: 6
|
||||
uOps Per Cycle: 2.35
|
||||
IPC: 1.70
|
||||
Block RThroughput: 13.0
|
||||
|
||||
|
||||
Instruction Info:
|
||||
[1]: #uOps
|
||||
[2]: Latency
|
||||
[3]: RThroughput
|
||||
[4]: MayLoad
|
||||
[5]: MayStore
|
||||
[6]: HasSideEffects (U)
|
||||
|
||||
[1] [2] [3] [4] [5] [6] Instructions:
|
||||
1 4 1.00 vpcmpgtd %ymm4, %ymm3, %k5
|
||||
1 1 0.33 vpaddd %ymm15, %ymm4, %ymm4
|
||||
2 8 0.50 * vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z}
|
||||
1 1 0.33 vpaddd %ymm17, %ymm17, %ymm18
|
||||
1 1 0.25 addq $8, %r15
|
||||
1 1 0.33 vpaddd %ymm18, %ymm17, %ymm19
|
||||
1 1 1.00 kmovw %k5, %k2
|
||||
1 1 1.00 kmovw %k5, %k3
|
||||
1 1 1.00 kmovw %k5, %k1
|
||||
1 0 0.17 vpxord %zmm21, %zmm21, %zmm21
|
||||
1 0 0.17 vpxord %zmm20, %zmm20, %zmm20
|
||||
1 0 0.17 vpxord %zmm22, %zmm22, %zmm22
|
||||
5 21 4.00 * vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2}
|
||||
5 21 4.00 * vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3}
|
||||
5 21 4.00 * vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1}
|
||||
1 4 0.50 vsubpd %zmm21, %zmm1, %zmm18
|
||||
1 4 0.50 vsubpd %zmm20, %zmm2, %zmm17
|
||||
1 4 0.50 vsubpd %zmm22, %zmm0, %zmm19
|
||||
1 4 0.50 vmulpd %zmm18, %zmm18, %zmm31
|
||||
1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm31
|
||||
1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm31
|
||||
3 4 2.00 vrcp14pd %zmm31, %zmm30
|
||||
1 4 1.00 vcmpltpd %zmm14, %zmm31, %k6 {%k5}
|
||||
1 4 1.00 vfpclasspd $30, %zmm30, %k0
|
||||
1 1 0.50 vmovaps %zmm31, %zmm23
|
||||
2 11 0.50 * vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
|
||||
1 1 1.00 knotw %k0, %k4
|
||||
1 4 0.50 vmulpd %zmm23, %zmm23, %zmm24
|
||||
1 4 0.50 vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4}
|
||||
1 4 0.50 vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4}
|
||||
1 4 0.50 vmulpd %zmm13, %zmm30, %zmm25
|
||||
1 4 0.50 vmulpd %zmm12, %zmm30, %zmm27
|
||||
1 4 0.50 vmulpd %zmm25, %zmm30, %zmm28
|
||||
1 4 0.50 vmulpd %zmm28, %zmm30, %zmm26
|
||||
1 4 0.50 vfmsub213pd %zmm5, %zmm28, %zmm30
|
||||
1 4 0.50 vmulpd %zmm27, %zmm26, %zmm29
|
||||
1 4 0.50 vmulpd %zmm30, %zmm29, %zmm23
|
||||
1 4 0.50 vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6}
|
||||
1 4 0.50 vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6}
|
||||
1 4 0.50 vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6}
|
||||
1 1 0.25 cmpq %r14, %r15
|
||||
1 1 0.50 jb ..B1.16
|
||||
|
||||
|
||||
Resources:
|
||||
[0] - SKXDivider
|
||||
[1] - SKXFPDivider
|
||||
[2] - SKXPort0
|
||||
[3] - SKXPort1
|
||||
[4] - SKXPort2
|
||||
[5] - SKXPort3
|
||||
[6] - SKXPort4
|
||||
[7] - SKXPort5
|
||||
[8] - SKXPort6
|
||||
[9] - SKXPort7
|
||||
|
||||
|
||||
Resource pressure per iteration:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
|
||||
- - 19.02 6.79 12.64 13.36 - 16.03 5.16 -
|
||||
|
||||
Resource pressure by instruction:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
|
||||
- - - - - - - 1.00 - - vpcmpgtd %ymm4, %ymm3, %k5
|
||||
- - 0.28 0.72 - - - - - - vpaddd %ymm15, %ymm4, %ymm4
|
||||
- - 0.14 0.71 0.55 0.45 - 0.15 - - vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z}
|
||||
- - - 0.97 - - - 0.03 - - vpaddd %ymm17, %ymm17, %ymm18
|
||||
- - 0.14 0.41 - - - 0.13 0.32 - addq $8, %r15
|
||||
- - - 0.99 - - - 0.01 - - vpaddd %ymm18, %ymm17, %ymm19
|
||||
- - 1.00 - - - - - - - kmovw %k5, %k2
|
||||
- - 1.00 - - - - - - - kmovw %k5, %k3
|
||||
- - 1.00 - - - - - - - kmovw %k5, %k1
|
||||
- - - - - - - - - - vpxord %zmm21, %zmm21, %zmm21
|
||||
- - - - - - - - - - vpxord %zmm20, %zmm20, %zmm20
|
||||
- - - - - - - - - - vpxord %zmm22, %zmm22, %zmm22
|
||||
- - 1.00 0.99 3.52 4.48 - 0.01 1.00 - vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2}
|
||||
- - 1.00 0.99 4.48 3.52 - 0.01 1.00 - vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3}
|
||||
- - 1.00 1.00 3.52 4.48 - - 1.00 - vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1}
|
||||
- - 0.02 - - - - 0.98 - - vsubpd %zmm21, %zmm1, %zmm18
|
||||
- - 0.17 - - - - 0.83 - - vsubpd %zmm20, %zmm2, %zmm17
|
||||
- - 0.18 - - - - 0.82 - - vsubpd %zmm22, %zmm0, %zmm19
|
||||
- - 0.01 - - - - 0.99 - - vmulpd %zmm18, %zmm18, %zmm31
|
||||
- - 0.69 - - - - 0.31 - - vfmadd231pd %zmm17, %zmm17, %zmm31
|
||||
- - 0.68 - - - - 0.32 - - vfmadd231pd %zmm19, %zmm19, %zmm31
|
||||
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm31, %zmm30
|
||||
- - - - - - - 1.00 - - vcmpltpd %zmm14, %zmm31, %k6 {%k5}
|
||||
- - - - - - - 1.00 - - vfpclasspd $30, %zmm30, %k0
|
||||
- - 0.83 - - - - 0.17 - - vmovaps %zmm31, %zmm23
|
||||
- - 1.00 - 0.57 0.43 - - - - vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
|
||||
- - 1.00 - - - - - - - knotw %k0, %k4
|
||||
- - 0.44 - - - - 0.56 - - vmulpd %zmm23, %zmm23, %zmm24
|
||||
- - 0.56 - - - - 0.44 - - vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4}
|
||||
- - 0.55 - - - - 0.45 - - vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4}
|
||||
- - 0.69 - - - - 0.31 - - vmulpd %zmm13, %zmm30, %zmm25
|
||||
- - 0.31 - - - - 0.69 - - vmulpd %zmm12, %zmm30, %zmm27
|
||||
- - 0.56 - - - - 0.44 - - vmulpd %zmm25, %zmm30, %zmm28
|
||||
- - 0.02 - - - - 0.98 - - vmulpd %zmm28, %zmm30, %zmm26
|
||||
- - 0.98 - - - - 0.02 - - vfmsub213pd %zmm5, %zmm28, %zmm30
|
||||
- - 0.30 - - - - 0.70 - - vmulpd %zmm27, %zmm26, %zmm29
|
||||
- - 0.16 - - - - 0.84 - - vmulpd %zmm30, %zmm29, %zmm23
|
||||
- - 0.17 - - - - 0.83 - - vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6}
|
||||
- - 0.83 - - - - 0.17 - - vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6}
|
||||
- - 0.17 - - - - 0.83 - - vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6}
|
||||
- - - 0.01 - - - 0.01 0.98 - cmpq %r14, %r15
|
||||
- - 0.14 - - - - - 0.86 - jb ..B1.16
|
||||
@@ -0,0 +1,130 @@
|
||||
|
||||
[0] Code Region
|
||||
|
||||
Iterations: 100
|
||||
Instructions: 4200
|
||||
Total Cycles: 2465
|
||||
Total uOps: 5800
|
||||
|
||||
Dispatch Width: 6
|
||||
uOps Per Cycle: 2.35
|
||||
IPC: 1.70
|
||||
Block RThroughput: 13.0
|
||||
|
||||
|
||||
Instruction Info:
|
||||
[1]: #uOps
|
||||
[2]: Latency
|
||||
[3]: RThroughput
|
||||
[4]: MayLoad
|
||||
[5]: MayStore
|
||||
[6]: HasSideEffects (U)
|
||||
|
||||
[1] [2] [3] [4] [5] [6] Instructions:
|
||||
1 4 1.00 vpcmpgtd %ymm4, %ymm3, %k5
|
||||
1 1 0.33 vpaddd %ymm15, %ymm4, %ymm4
|
||||
2 8 0.50 * vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z}
|
||||
1 1 0.33 vpaddd %ymm17, %ymm17, %ymm18
|
||||
1 1 0.25 addq $8, %r15
|
||||
1 1 0.33 vpaddd %ymm18, %ymm17, %ymm19
|
||||
1 1 1.00 kmovw %k5, %k2
|
||||
1 1 1.00 kmovw %k5, %k3
|
||||
1 1 1.00 kmovw %k5, %k1
|
||||
1 0 0.17 vpxord %zmm21, %zmm21, %zmm21
|
||||
1 0 0.17 vpxord %zmm20, %zmm20, %zmm20
|
||||
1 0 0.17 vpxord %zmm22, %zmm22, %zmm22
|
||||
5 21 4.00 * vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2}
|
||||
5 21 4.00 * vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3}
|
||||
5 21 4.00 * vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1}
|
||||
1 4 0.50 vsubpd %zmm21, %zmm1, %zmm18
|
||||
1 4 0.50 vsubpd %zmm20, %zmm2, %zmm17
|
||||
1 4 0.50 vsubpd %zmm22, %zmm0, %zmm19
|
||||
1 4 0.50 vmulpd %zmm18, %zmm18, %zmm31
|
||||
1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm31
|
||||
1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm31
|
||||
3 4 2.00 vrcp14pd %zmm31, %zmm30
|
||||
1 4 1.00 vcmpltpd %zmm14, %zmm31, %k6 {%k5}
|
||||
1 4 1.00 vfpclasspd $30, %zmm30, %k0
|
||||
1 1 0.50 vmovaps %zmm31, %zmm23
|
||||
2 11 0.50 * vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
|
||||
1 1 1.00 knotw %k0, %k4
|
||||
1 4 0.50 vmulpd %zmm23, %zmm23, %zmm24
|
||||
1 4 0.50 vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4}
|
||||
1 4 0.50 vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4}
|
||||
1 4 0.50 vmulpd %zmm13, %zmm30, %zmm25
|
||||
1 4 0.50 vmulpd %zmm12, %zmm30, %zmm27
|
||||
1 4 0.50 vmulpd %zmm25, %zmm30, %zmm28
|
||||
1 4 0.50 vmulpd %zmm28, %zmm30, %zmm26
|
||||
1 4 0.50 vfmsub213pd %zmm5, %zmm28, %zmm30
|
||||
1 4 0.50 vmulpd %zmm27, %zmm26, %zmm29
|
||||
1 4 0.50 vmulpd %zmm30, %zmm29, %zmm23
|
||||
1 4 0.50 vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6}
|
||||
1 4 0.50 vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6}
|
||||
1 4 0.50 vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6}
|
||||
1 1 0.25 cmpq %r14, %r15
|
||||
1 1 0.50 jb ..B1.16
|
||||
|
||||
|
||||
Resources:
|
||||
[0] - ICXDivider
|
||||
[1] - ICXFPDivider
|
||||
[2] - ICXPort0
|
||||
[3] - ICXPort1
|
||||
[4] - ICXPort2
|
||||
[5] - ICXPort3
|
||||
[6] - ICXPort4
|
||||
[7] - ICXPort5
|
||||
[8] - ICXPort6
|
||||
[9] - ICXPort7
|
||||
[10] - ICXPort8
|
||||
[11] - ICXPort9
|
||||
|
||||
|
||||
Resource pressure per iteration:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11]
|
||||
- - 19.02 6.79 12.64 13.36 - 16.03 5.16 - - -
|
||||
|
||||
Resource pressure by instruction:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions:
|
||||
- - - - - - - 1.00 - - - - vpcmpgtd %ymm4, %ymm3, %k5
|
||||
- - 0.28 0.72 - - - - - - - - vpaddd %ymm15, %ymm4, %ymm4
|
||||
- - 0.14 0.71 0.55 0.45 - 0.15 - - - - vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z}
|
||||
- - - 0.97 - - - 0.03 - - - - vpaddd %ymm17, %ymm17, %ymm18
|
||||
- - 0.14 0.41 - - - 0.13 0.32 - - - addq $8, %r15
|
||||
- - - 0.99 - - - 0.01 - - - - vpaddd %ymm18, %ymm17, %ymm19
|
||||
- - 1.00 - - - - - - - - - kmovw %k5, %k2
|
||||
- - 1.00 - - - - - - - - - kmovw %k5, %k3
|
||||
- - 1.00 - - - - - - - - - kmovw %k5, %k1
|
||||
- - - - - - - - - - - - vpxord %zmm21, %zmm21, %zmm21
|
||||
- - - - - - - - - - - - vpxord %zmm20, %zmm20, %zmm20
|
||||
- - - - - - - - - - - - vpxord %zmm22, %zmm22, %zmm22
|
||||
- - 1.00 0.99 3.52 4.48 - 0.01 1.00 - - - vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2}
|
||||
- - 1.00 0.99 4.48 3.52 - 0.01 1.00 - - - vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3}
|
||||
- - 1.00 1.00 3.52 4.48 - - 1.00 - - - vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1}
|
||||
- - 0.02 - - - - 0.98 - - - - vsubpd %zmm21, %zmm1, %zmm18
|
||||
- - 0.17 - - - - 0.83 - - - - vsubpd %zmm20, %zmm2, %zmm17
|
||||
- - 0.18 - - - - 0.82 - - - - vsubpd %zmm22, %zmm0, %zmm19
|
||||
- - 0.01 - - - - 0.99 - - - - vmulpd %zmm18, %zmm18, %zmm31
|
||||
- - 0.69 - - - - 0.31 - - - - vfmadd231pd %zmm17, %zmm17, %zmm31
|
||||
- - 0.68 - - - - 0.32 - - - - vfmadd231pd %zmm19, %zmm19, %zmm31
|
||||
- - 2.00 - - - - 1.00 - - - - vrcp14pd %zmm31, %zmm30
|
||||
- - - - - - - 1.00 - - - - vcmpltpd %zmm14, %zmm31, %k6 {%k5}
|
||||
- - - - - - - 1.00 - - - - vfpclasspd $30, %zmm30, %k0
|
||||
- - 0.83 - - - - 0.17 - - - - vmovaps %zmm31, %zmm23
|
||||
- - 1.00 - 0.57 0.43 - - - - - - vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
|
||||
- - 1.00 - - - - - - - - - knotw %k0, %k4
|
||||
- - 0.44 - - - - 0.56 - - - - vmulpd %zmm23, %zmm23, %zmm24
|
||||
- - 0.56 - - - - 0.44 - - - - vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4}
|
||||
- - 0.55 - - - - 0.45 - - - - vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4}
|
||||
- - 0.69 - - - - 0.31 - - - - vmulpd %zmm13, %zmm30, %zmm25
|
||||
- - 0.31 - - - - 0.69 - - - - vmulpd %zmm12, %zmm30, %zmm27
|
||||
- - 0.56 - - - - 0.44 - - - - vmulpd %zmm25, %zmm30, %zmm28
|
||||
- - 0.02 - - - - 0.98 - - - - vmulpd %zmm28, %zmm30, %zmm26
|
||||
- - 0.98 - - - - 0.02 - - - - vfmsub213pd %zmm5, %zmm28, %zmm30
|
||||
- - 0.30 - - - - 0.70 - - - - vmulpd %zmm27, %zmm26, %zmm29
|
||||
- - 0.16 - - - - 0.84 - - - - vmulpd %zmm30, %zmm29, %zmm23
|
||||
- - 0.17 - - - - 0.83 - - - - vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6}
|
||||
- - 0.83 - - - - 0.17 - - - - vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6}
|
||||
- - 0.17 - - - - 0.83 - - - - vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6}
|
||||
- - - 0.01 - - - 0.01 0.98 - - - cmpq %r14, %r15
|
||||
- - 0.14 - - - - - 0.86 - - - jb ..B1.16
|
||||
@@ -0,0 +1,77 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: lammps-icc-avx512.s
|
||||
Architecture: CSX
|
||||
Timestamp: 2023-02-10 16:30:08
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
200 | | | | | | | | || | | # pointer_increment=64 1303ca335e79351a96cfc07b8b9ec9d4
|
||||
201 | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
202 | | | | | | | | || | | ..B1.16: # Preds ..B1.16 ..B1.15
|
||||
203 | | | | | | | | || | | # Execution count [2.50e+01]
|
||||
204 | | | | | | 1.00 | | || | | vpcmpgtd %ymm4, %ymm3, %k5 #59.9
|
||||
205 | 0.00 | 1.00 | | | | 0.00 | | || | | vpaddd %ymm15, %ymm4, %ymm4 #59.9
|
||||
206 | 0.00 | 1.00 | 0.50 0.50 | 0.50 0.50 | | 0.00 | | || 0.0 | | vmovdqu32 (%r10,%r15,4), %ymm17{%k5}{z} #60.21
|
||||
207 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm17, %ymm17, %ymm18 #61.36
|
||||
208 | 0.00 | 0.16 | | | | 0.00 | 0.84 | || | | addq $8, %r15 #59.9
|
||||
209 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm18, %ymm17, %ymm19 #61.36
|
||||
210 | 1.00 | | | | | | | || | | kmovw %k5, %k2 #61.36
|
||||
211 | 1.00 | | | | | | | || | | kmovw %k5, %k3 #61.36
|
||||
212 | 1.00 | | | | | | | || | | kmovw %k5, %k1 #61.36
|
||||
213 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm21, %zmm21, %zmm21 #61.36
|
||||
214 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm20, %zmm20, %zmm20 #61.36
|
||||
215 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm22, %zmm22, %zmm22 #61.36
|
||||
216 | 1.25 | 0.75 | 5.00 5.00 | 5.00 5.00 | | 0.25 | 0.75 | || 24.0 | | vgatherdpd 8(%rbx,%ymm19,8), %zmm21{%k2} #61.36
|
||||
217 | 1.25 | 0.25 | 5.00 5.00 | 5.00 5.00 | | 0.25 | 1.25 | || | | vgatherdpd (%rbx,%ymm19,8), %zmm20{%k3} #61.36
|
||||
218 | 1.25 | 0.09 | 5.00 5.00 | 5.00 5.00 | | 0.25 | 1.41 | || | | vgatherdpd 16(%rbx,%ymm19,8), %zmm22{%k1} #61.36
|
||||
219 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm21, %zmm1, %zmm18 #62.36
|
||||
220 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm20, %zmm2, %zmm17 #61.36
|
||||
221 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm22, %zmm0, %zmm19 #63.36
|
||||
222 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm18, %zmm18, %zmm31 #64.49
|
||||
223 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm17, %zmm17, %zmm31 #64.49
|
||||
224 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm19, %zmm19, %zmm31 #64.63
|
||||
225 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm31, %zmm30 #75.39
|
||||
226 | | | | | | 1.00 | | || | | vcmppd $1, %zmm14, %zmm31, %k6{%k5} #74.22
|
||||
227 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm30, %k0 #75.39
|
||||
228 | | | | | | | | || | | * vmovaps %zmm31, %zmm23 #75.39
|
||||
229 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23 #75.39
|
||||
230 | 1.00 | | | | | | | || | | knotw %k0, %k4 #75.39
|
||||
231 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm23, %zmm23, %zmm24 #75.39
|
||||
232 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm30, %zmm23, %zmm30{%k4} #75.39
|
||||
233 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd213pd %zmm30, %zmm24, %zmm30{%k4} #75.39
|
||||
234 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm13, %zmm30, %zmm25 #76.38
|
||||
235 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm12, %zmm30, %zmm27 #77.55
|
||||
236 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm25, %zmm30, %zmm28 #76.44
|
||||
237 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm28, %zmm30, %zmm26 #76.50
|
||||
238 | 0.00 | | | | | 1.00 | | || | | vfmsub213pd %zmm5, %zmm28, %zmm30 #77.55
|
||||
239 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm27, %zmm26, %zmm29 #77.64
|
||||
240 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm30, %zmm29, %zmm23 #77.70
|
||||
241 | 0.00 | | | | | 1.00 | | || 4.0 | | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17
|
||||
242 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17
|
||||
243 | 0.00 | | | | | 1.00 | | || | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17
|
||||
244 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || | | cmpq %r14, %r15 #59.9
|
||||
245 | | | | | | | | || | | * jb ..B1.16 # Prob 82% #59.9
|
||||
246 | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
18.8 5.25 16.0 16.0 16.0 16.0 18.8 5.25 86.0 4.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
243 | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17| [243]
|
||||
242 | 4.0 | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17| [242]
|
||||
241 | 4.0 | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17| [241]
|
||||
208 | 1.0 | addq $8, %r15 #59.9| [208]
|
||||
205 | 1.0 | vpaddd %ymm15, %ymm4, %ymm4 #59.9| [205]
|
||||
|
||||
@@ -0,0 +1,77 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: lammps-icc-avx512.s
|
||||
Architecture: ICX
|
||||
Timestamp: 2023-02-10 16:29:42
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD |
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
200 | | | | | | | | | | || | | # pointer_increment=64 1303ca335e79351a96cfc07b8b9ec9d4
|
||||
201 | | | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
202 | | | | | | | | | | || | | ..B1.16: # Preds ..B1.16 ..B1.15
|
||||
203 | | | | | | | | | | || | | # Execution count [2.50e+01]
|
||||
204 | | | | | | 1.000 | | | | || | | vpcmpgtd %ymm4, %ymm3, %k5 #59.9
|
||||
205 | 0.00 | 1.00 | | | | 0.000 | | | | || | | vpaddd %ymm15, %ymm4, %ymm4 #59.9
|
||||
206 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovdqu32 (%r10,%r15,4), %ymm17{%k5}{z} #60.21
|
||||
207 | 0.00 | 1.00 | | | | 0.000 | | | | || 1.0 | | vpaddd %ymm17, %ymm17, %ymm18 #61.36
|
||||
208 | 0.00 | 0.00 | | | | 0.000 | 1.00 | | | || | | addq $8, %r15 #59.9
|
||||
209 | 0.00 | 1.00 | | | | 0.000 | | | | || 1.0 | | vpaddd %ymm18, %ymm17, %ymm19 #61.36
|
||||
210 | 1.00 | | | | | | | | | || | | kmovw %k5, %k2 #61.36
|
||||
211 | 1.00 | | | | | | | | | || | | kmovw %k5, %k3 #61.36
|
||||
212 | 1.00 | | | | | | | | | || | | kmovw %k5, %k1 #61.36
|
||||
213 | 0.50 | | | | | 0.500 | | | | || | | vpxord %zmm21, %zmm21, %zmm21 #61.36
|
||||
214 | 0.24 | | | | | 0.760 | | | | || | | vpxord %zmm20, %zmm20, %zmm20 #61.36
|
||||
215 | 0.50 | | | | | 0.500 | | | | || | | vpxord %zmm22, %zmm22, %zmm22 #61.36
|
||||
216 | 0.67 | 2.33 | 7.00 7.00 | 7.00 7.00 | | 0.000 | | | | || 24.0 | | vgatherdpd 8(%rbx,%ymm19,8), %zmm21{%k2} #61.36
|
||||
217 | 0.67 | 2.33 | 7.00 7.00 | 7.00 7.00 | | 0.000 | | | | || | | vgatherdpd (%rbx,%ymm19,8), %zmm20{%k3} #61.36
|
||||
218 | 0.67 | 2.33 | 7.00 7.00 | 7.00 7.00 | | 0.000 | | | | || | | vgatherdpd 16(%rbx,%ymm19,8), %zmm22{%k1} #61.36
|
||||
219 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vsubpd %zmm21, %zmm1, %zmm18 #62.36
|
||||
220 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm20, %zmm2, %zmm17 #61.36
|
||||
221 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm22, %zmm0, %zmm19 #63.36
|
||||
222 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm18, %zmm18, %zmm31 #64.49
|
||||
223 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm17, %zmm17, %zmm31 #64.49
|
||||
224 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm19, %zmm19, %zmm31 #64.63
|
||||
225 | 2.50 | | | | | 0.500 | | | | || 6.0 | | vrcp14pd %zmm31, %zmm30 #75.39
|
||||
226 | | | | | | 1.000 | | | | || | | vcmppd $1, %zmm14, %zmm31, %k6{%k5} #74.22
|
||||
227 | | | | | | 1.000 | | | | || | | vfpclasspd $30, %zmm30, %k0 #75.39
|
||||
228 | 0.50 | | | | | 0.500 | | | | || | | vmovaps %zmm31, %zmm23 #75.39
|
||||
229 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.500 | | | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23 #75.39
|
||||
230 | 1.00 | | | | | | | | | || | | knotw %k0, %k4 #75.39
|
||||
231 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm23, %zmm23, %zmm24 #75.39
|
||||
232 | 0.50 | | | | | 0.500 | | | | || | | vfmadd213pd %zmm30, %zmm23, %zmm30{%k4} #75.39
|
||||
233 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd213pd %zmm30, %zmm24, %zmm30{%k4} #75.39
|
||||
234 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm13, %zmm30, %zmm25 #76.38
|
||||
235 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm12, %zmm30, %zmm27 #77.55
|
||||
236 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm25, %zmm30, %zmm28 #76.44
|
||||
237 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm28, %zmm30, %zmm26 #76.50
|
||||
238 | 0.50 | | | | | 0.500 | | | | || | | vfmsub213pd %zmm5, %zmm28, %zmm30 #77.55
|
||||
239 | 0.25 | | | | | 0.750 | | | | || 4.0 | | vmulpd %zmm27, %zmm26, %zmm29 #77.64
|
||||
240 | 0.00 | | | | | 1.000 | | | | || 4.0 | | vmulpd %zmm30, %zmm29, %zmm23 #77.70
|
||||
241 | 0.00 | | | | | 1.000 | | | | || 4.0 | | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17
|
||||
242 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17
|
||||
243 | 0.00 | | | | | 1.000 | | | | || | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17
|
||||
244 | 0.00 | 0.00 | | | | -0.01 | 1.00 | | | || | | cmpq %r14, %r15 #59.9
|
||||
245 | | | | | | | | | | || | | * jb ..B1.16 # Prob 82% #59.9
|
||||
246 | | | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
18.0 9.98 22.0 22.0 22.0 22.0 18.00 2.00 89 4.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
243 | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17| [243]
|
||||
242 | 4.0 | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17| [242]
|
||||
241 | 4.0 | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17| [241]
|
||||
208 | 1.0 | addq $8, %r15 #59.9| [208]
|
||||
205 | 1.0 | vpaddd %ymm15, %ymm4, %ymm4 #59.9| [205]
|
||||
|
||||
@@ -0,0 +1,197 @@
|
||||
|
||||
[0] Code Region
|
||||
|
||||
Iterations: 100
|
||||
Instructions: 7000
|
||||
Total Cycles: 3866
|
||||
Total uOps: 7900
|
||||
|
||||
Dispatch Width: 6
|
||||
uOps Per Cycle: 2.04
|
||||
IPC: 1.81
|
||||
Block RThroughput: 21.5
|
||||
|
||||
|
||||
Instruction Info:
|
||||
[1]: #uOps
|
||||
[2]: Latency
|
||||
[3]: RThroughput
|
||||
[4]: MayLoad
|
||||
[5]: MayStore
|
||||
[6]: HasSideEffects (U)
|
||||
|
||||
[1] [2] [3] [4] [5] [6] Instructions:
|
||||
1 8 0.50 * vpbroadcastd .LCPI0_1(%rip), %xmm1
|
||||
1 10 0.50 * vpmulld (%r11,%rbp,4), %xmm1, %xmm11
|
||||
2 4 1.50 vpmovsxdq %xmm11, %ymm1
|
||||
1 1 0.50 vpsllq $3, %ymm1, %ymm1
|
||||
1 1 0.25 vpaddq %ymm1, %ymm3, %ymm1
|
||||
1 1 1.00 vmovq %xmm1, %r14
|
||||
2 1 1.00 vpextrq $1, %xmm1, %r9
|
||||
1 4 1.00 vextracti128 $1, %ymm1, %xmm1
|
||||
1 8 0.50 * vmovsd (%r14), %xmm2
|
||||
1 8 0.50 * vpsubd .LCPI0_5, %xmm11, %xmm6
|
||||
2 4 1.50 vpmovsxdq %xmm6, %ymm6
|
||||
1 1 0.50 vpsllq $3, %ymm6, %ymm6
|
||||
1 1 1.00 vmovq %xmm1, %rdi
|
||||
1 1 0.25 vpaddq %ymm6, %ymm3, %ymm6
|
||||
1 1 1.00 vmovq %xmm6, %rcx
|
||||
2 1 1.00 vpextrq $1, %xmm1, %rbx
|
||||
2 1 1.00 vpextrq $1, %xmm6, %rax
|
||||
1 4 1.00 vextracti128 $1, %ymm6, %xmm1
|
||||
1 8 0.50 * vmovsd (%rdi), %xmm6
|
||||
1 1 1.00 vmovq %xmm1, %rdi
|
||||
2 1 1.00 vpextrq $1, %xmm1, %rsi
|
||||
1 8 0.50 * vmovsd (%rdi), %xmm1
|
||||
1 8 0.50 * vmovsd (%rcx), %xmm7
|
||||
1 8 0.50 * vpbroadcastd .LCPI0_2(%rip), %xmm12
|
||||
1 8 0.50 * vmovhpd (%r9), %xmm2, %xmm2
|
||||
1 1 0.25 vpaddd %xmm12, %xmm11, %xmm4
|
||||
2 4 1.50 vpmovsxdq %xmm4, %ymm4
|
||||
1 8 0.50 * vmovhpd (%rax), %xmm7, %xmm7
|
||||
1 1 0.50 vpsllq $3, %ymm4, %ymm4
|
||||
1 1 0.25 vpaddq %ymm4, %ymm3, %ymm4
|
||||
1 8 0.50 * vmovhpd (%rbx), %xmm6, %xmm6
|
||||
2 1 1.00 vpextrq $1, %xmm4, %rax
|
||||
1 8 0.50 * vmovhpd (%rsi), %xmm1, %xmm1
|
||||
1 1 1.00 vmovq %xmm4, %rcx
|
||||
1 4 1.00 vextracti128 $1, %ymm4, %xmm4
|
||||
1 1 1.00 vmovq %xmm4, %rsi
|
||||
1 2 1.00 vinsertf128 $1, %xmm6, %ymm2, %ymm2
|
||||
2 1 1.00 vpextrq $1, %xmm4, %rdi
|
||||
1 8 0.50 * vmovsd (%rsi), %xmm4
|
||||
1 3 0.50 vsubpd %ymm2, %ymm14, %ymm2
|
||||
1 8 0.50 * vmovhpd (%rdi), %xmm4, %xmm4
|
||||
1 8 0.50 * vmovsd (%rcx), %xmm6
|
||||
1 2 1.00 vinsertf128 $1, %xmm1, %ymm7, %ymm1
|
||||
1 8 0.50 * vmovhpd (%rax), %xmm6, %xmm6
|
||||
1 2 1.00 vinsertf128 $1, %xmm4, %ymm6, %ymm4
|
||||
1 3 0.50 vsubpd %ymm1, %ymm5, %ymm1
|
||||
1 3 0.50 vsubpd %ymm4, %ymm10, %ymm4
|
||||
1 3 0.50 vmulpd %ymm2, %ymm2, %ymm6
|
||||
1 4 1.00 vfmadd231pd %ymm1, %ymm1, %ymm6
|
||||
1 4 1.00 vfmadd231pd %ymm4, %ymm4, %ymm6
|
||||
1 8 0.50 * vbroadcastsd .LCPI0_3(%rip), %ymm7
|
||||
1 13 5.00 vdivpd %ymm6, %ymm7, %ymm7
|
||||
1 3 0.50 vmulpd %ymm7, %ymm7, %ymm11
|
||||
1 3 0.50 vmulpd %ymm9, %ymm11, %ymm11
|
||||
1 8 0.50 * vbroadcastsd .LCPI0_4(%rip), %ymm12
|
||||
1 3 0.50 vmulpd %ymm7, %ymm11, %ymm11
|
||||
1 3 0.50 vaddpd %ymm12, %ymm11, %ymm12
|
||||
1 10 0.50 * vmulpd 128(%rsp), %ymm7, %ymm7
|
||||
1 3 0.50 vmulpd %ymm7, %ymm11, %ymm7
|
||||
1 3 0.50 vmulpd %ymm7, %ymm12, %ymm7
|
||||
1 1 0.50 vcmpltpd %ymm8, %ymm6, %ymm6
|
||||
1 4 1.00 vfmadd213pd %ymm0, %ymm7, %ymm2
|
||||
1 1 0.50 vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
|
||||
1 4 1.00 vfmadd213pd %ymm15, %ymm7, %ymm1
|
||||
1 4 1.00 vfmadd213pd %ymm13, %ymm7, %ymm4
|
||||
1 1 0.50 vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
|
||||
1 1 0.50 vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
|
||||
1 1 0.25 addq $4, %rbp
|
||||
1 1 0.25 cmpq %rdx, %rbp
|
||||
1 1 0.50 jb .LBB0_9
|
||||
|
||||
|
||||
Resources:
|
||||
[0] - Zn3AGU0
|
||||
[1] - Zn3AGU1
|
||||
[2] - Zn3AGU2
|
||||
[3] - Zn3ALU0
|
||||
[4] - Zn3ALU1
|
||||
[5] - Zn3ALU2
|
||||
[6] - Zn3ALU3
|
||||
[7] - Zn3BRU1
|
||||
[8] - Zn3FPP0
|
||||
[9] - Zn3FPP1
|
||||
[10] - Zn3FPP2
|
||||
[11] - Zn3FPP3
|
||||
[12.0] - Zn3FPP45
|
||||
[12.1] - Zn3FPP45
|
||||
[13] - Zn3FPSt
|
||||
[14.0] - Zn3LSU
|
||||
[14.1] - Zn3LSU
|
||||
[14.2] - Zn3LSU
|
||||
[15.0] - Zn3Load
|
||||
[15.1] - Zn3Load
|
||||
[15.2] - Zn3Load
|
||||
[16.0] - Zn3Store
|
||||
[16.1] - Zn3Store
|
||||
|
||||
|
||||
Resource pressure per iteration:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
|
||||
- - - 0.60 0.60 0.60 0.60 0.60 16.84 23.53 16.30 7.33 21.50 21.50 - 6.33 6.33 6.34 6.33 6.33 6.34 - -
|
||||
|
||||
Resource pressure by instruction:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
|
||||
- - - - - - - - - 0.03 0.97 - 0.51 0.49 - 0.34 0.33 0.33 0.34 0.33 0.33 - - vpbroadcastd .LCPI0_1(%rip), %xmm1
|
||||
- - - - - - - - 0.65 - - 0.35 0.34 0.66 - 0.49 0.05 0.46 0.49 0.05 0.46 - - vpmulld (%r11,%rbp,4), %xmm1, %xmm11
|
||||
- - - - - - - - - 0.06 2.94 - - - - - - - - - - - - vpmovsxdq %xmm11, %ymm1
|
||||
- - - - - - - - - 0.65 0.35 - - - - - - - - - - - - vpsllq $3, %ymm1, %ymm1
|
||||
- - - - - - - - - - - 1.00 - - - - - - - - - - - vpaddq %ymm1, %ymm3, %ymm1
|
||||
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vmovq %xmm1, %r14
|
||||
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm1, %r9
|
||||
- - - - - - - - 1.00 - - - - - - - - - - - - - - vextracti128 $1, %ymm1, %xmm1
|
||||
- - - - - - - - - - - - 0.50 0.50 - 0.48 0.35 0.17 0.48 0.35 0.17 - - vmovsd (%r14), %xmm2
|
||||
- - - - - - - - 0.01 0.18 0.17 0.64 0.47 0.53 - 0.34 0.33 0.33 0.34 0.33 0.33 - - vpsubd .LCPI0_5, %xmm11, %xmm6
|
||||
- - - - - - - - - 1.92 1.08 - - - - - - - - - - - - vpmovsxdq %xmm6, %ymm6
|
||||
- - - - - - - - - 0.32 0.68 - - - - - - - - - - - - vpsllq $3, %ymm6, %ymm6
|
||||
- - - - - - - - - - - - 1.30 0.70 - - - - - - - - - vmovq %xmm1, %rdi
|
||||
- - - - - - - - - - 0.32 0.68 - - - - - - - - - - - vpaddq %ymm6, %ymm3, %ymm6
|
||||
- - - - - - - - - - - - 1.34 0.66 - - - - - - - - - vmovq %xmm6, %rcx
|
||||
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm1, %rbx
|
||||
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm6, %rax
|
||||
- - - - - - - - 1.00 - - - - - - - - - - - - - - vextracti128 $1, %ymm6, %xmm1
|
||||
- - - - - - - - - - - - 0.50 0.50 - 0.03 0.65 0.32 0.03 0.65 0.32 - - vmovsd (%rdi), %xmm6
|
||||
- - - - - - - - - - - - 0.36 1.64 - - - - - - - - - vmovq %xmm1, %rdi
|
||||
- - - - - - - - - - - - 1.64 0.36 - - - - - - - - - vpextrq $1, %xmm1, %rsi
|
||||
- - - - - - - - - - - - 0.32 0.68 - 0.51 0.33 0.16 0.51 0.33 0.16 - - vmovsd (%rdi), %xmm1
|
||||
- - - - - - - - - - - - 0.68 0.32 - 0.49 0.01 0.50 0.49 0.01 0.50 - - vmovsd (%rcx), %xmm7
|
||||
- - - - - - - - - 0.48 0.52 - 0.67 0.33 - 0.17 0.62 0.21 0.17 0.62 0.21 - - vpbroadcastd .LCPI0_2(%rip), %xmm12
|
||||
- - - - - - - - - 0.01 0.99 - 0.17 0.83 - 0.02 0.64 0.34 0.02 0.64 0.34 - - vmovhpd (%r9), %xmm2, %xmm2
|
||||
- - - - - - - - 0.01 - - 0.99 - - - - - - - - - - - vpaddd %xmm12, %xmm11, %xmm4
|
||||
- - - - - - - - - 0.57 2.43 - - - - - - - - - - - - vpmovsxdq %xmm4, %ymm4
|
||||
- - - - - - - - - 0.34 0.66 - 0.82 0.18 - 0.49 0.35 0.16 0.49 0.35 0.16 - - vmovhpd (%rax), %xmm7, %xmm7
|
||||
- - - - - - - - - 0.34 0.66 - - - - - - - - - - - - vpsllq $3, %ymm4, %ymm4
|
||||
- - - - - - - - - - 0.01 0.99 - - - - - - - - - - - vpaddq %ymm4, %ymm3, %ymm4
|
||||
- - - - - - - - - 0.51 0.49 - 0.49 0.51 - 0.35 0.16 0.49 0.35 0.16 0.49 - - vmovhpd (%rbx), %xmm6, %xmm6
|
||||
- - - - - - - - - - - - 1.04 0.96 - - - - - - - - - vpextrq $1, %xmm4, %rax
|
||||
- - - - - - - - - 0.49 0.51 - 0.17 0.83 - 0.16 0.49 0.35 0.16 0.49 0.35 - - vmovhpd (%rsi), %xmm1, %xmm1
|
||||
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vmovq %xmm4, %rcx
|
||||
- - - - - - - - 1.00 - - - - - - - - - - - - - - vextracti128 $1, %ymm4, %xmm4
|
||||
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vmovq %xmm4, %rsi
|
||||
- - - - - - - - - 1.00 - - - - - - - - - - - - - vinsertf128 $1, %xmm6, %ymm2, %ymm2
|
||||
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm4, %rdi
|
||||
- - - - - - - - - - - - 0.50 0.50 - 0.49 0.35 0.16 0.49 0.35 0.16 - - vmovsd (%rsi), %xmm4
|
||||
- - - - - - - - - - 0.31 0.69 - - - - - - - - - - - vsubpd %ymm2, %ymm14, %ymm2
|
||||
- - - - - - - - - 0.49 0.51 - 0.48 0.52 - 0.35 0.16 0.49 0.35 0.16 0.49 - - vmovhpd (%rdi), %xmm4, %xmm4
|
||||
- - - - - - - - - - - - 0.52 0.48 - 0.16 0.49 0.35 0.16 0.49 0.35 - - vmovsd (%rcx), %xmm6
|
||||
- - - - - - - - - 1.00 - - - - - - - - - - - - - vinsertf128 $1, %xmm1, %ymm7, %ymm1
|
||||
- - - - - - - - - 0.35 0.65 - 0.50 0.50 - 0.47 0.35 0.18 0.47 0.35 0.18 - - vmovhpd (%rax), %xmm6, %xmm6
|
||||
- - - - - - - - - 1.00 - - - - - - - - - - - - - vinsertf128 $1, %xmm4, %ymm6, %ymm4
|
||||
- - - - - - - - - - 0.33 0.67 - - - - - - - - - - - vsubpd %ymm1, %ymm5, %ymm1
|
||||
- - - - - - - - - - 0.51 0.49 - - - - - - - - - - - vsubpd %ymm4, %ymm10, %ymm4
|
||||
- - - - - - - - 0.52 0.48 - - - - - - - - - - - - - vmulpd %ymm2, %ymm2, %ymm6
|
||||
- - - - - - - - 1.00 1.00 - - - - - - - - - - - - - vfmadd231pd %ymm1, %ymm1, %ymm6
|
||||
- - - - - - - - 1.34 0.66 - - - - - - - - - - - - - vfmadd231pd %ymm4, %ymm4, %ymm6
|
||||
- - - - - - - - - 0.66 0.34 - 0.51 0.49 - 0.19 0.32 0.49 0.19 0.32 0.49 - - vbroadcastsd .LCPI0_3(%rip), %ymm7
|
||||
- - - - - - - - - 5.00 - - - - - - - - - - - - - vdivpd %ymm6, %ymm7, %ymm7
|
||||
- - - - - - - - 0.50 0.50 - - - - - - - - - - - - - vmulpd %ymm7, %ymm7, %ymm11
|
||||
- - - - - - - - 0.99 0.01 - - - - - - - - - - - - - vmulpd %ymm9, %ymm11, %ymm11
|
||||
- - - - - - - - - 0.30 0.70 - 0.49 0.51 - 0.34 0.33 0.33 0.34 0.33 0.33 - - vbroadcastsd .LCPI0_4(%rip), %ymm12
|
||||
- - - - - - - - 0.82 0.18 - - - - - - - - - - - - - vmulpd %ymm7, %ymm11, %ymm11
|
||||
- - - - - - - - - - 0.17 0.83 - - - - - - - - - - - vaddpd %ymm12, %ymm11, %ymm12
|
||||
- - - - - - - - 0.01 0.99 - - 0.18 0.82 - 0.46 0.02 0.52 0.46 0.02 0.52 - - vmulpd 128(%rsp), %ymm7, %ymm7
|
||||
- - - - - - - - 0.99 0.01 - - - - - - - - - - - - - vmulpd %ymm7, %ymm11, %ymm7
|
||||
- - - - - - - - 0.67 0.33 - - - - - - - - - - - - - vmulpd %ymm7, %ymm12, %ymm7
|
||||
- - - - - - - - 1.00 - - - - - - - - - - - - - - vcmpltpd %ymm8, %ymm6, %ymm6
|
||||
- - - - - - - - 1.34 0.66 - - - - - - - - - - - - - vfmadd213pd %ymm0, %ymm7, %ymm2
|
||||
- - - - - - - - 0.66 0.34 - - - - - - - - - - - - - vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
|
||||
- - - - - - - - 0.66 1.34 - - - - - - - - - - - - - vfmadd213pd %ymm15, %ymm7, %ymm1
|
||||
- - - - - - - - 1.34 0.66 - - - - - - - - - - - - - vfmadd213pd %ymm13, %ymm7, %ymm4
|
||||
- - - - - - - - 0.34 0.66 - - - - - - - - - - - - - vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
|
||||
- - - - - - - - 0.99 0.01 - - - - - - - - - - - - - vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
|
||||
- - - - 0.40 0.20 0.40 - - - - - - - - - - - - - - - - addq $4, %rbp
|
||||
- - - 0.20 0.20 0.40 0.20 - - - - - - - - - - - - - - - - cmpq %rdx, %rbp
|
||||
- - - 0.40 - - - 0.60 - - - - - - - - - - - - - - - jb .LBB0_9
|
||||
@@ -0,0 +1,108 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: lammps-icx-avx2zen.s
|
||||
Architecture: ZEN3
|
||||
Timestamp: 2023-02-10 16:31:30
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 | 1 | 2 | 3 | DV0 | DV1 | 4 | 5 | 6 | 7 | 8 - 8DV | 9 | 10 | 11 | 12 | 13 || CP | LCD |
|
||||
--------------------------------------------------------------------------------------------------------------------------------------------
|
||||
175 | | | | | | | | | | | | | | | | || | | # pointer_increment=16 e95035fc9e97f63299dd5188a0872bfc
|
||||
176 | | | | | | | | | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
177 | | | | | | | | | | | | | | | | || | | .LBB0_9: #
|
||||
178 | | | | | | | | | | | | | | | | || | | # Parent Loop BB0_6 Depth=1
|
||||
179 | | | | | | | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2
|
||||
180 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || 1.0 | | vpbroadcastd .LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
|
||||
181 | 0.00 | | | 1.00 | | | | | | | | | | 0.50 | 0.50 | || 3.0 | | vpmulld (%r11,%rbp,4), %xmm1, %xmm11
|
||||
182 | 0.00 | 0.75 | 0.38 | 0.87 | | | | | | | | | | | | || 4.0 | | vpmovsxdq %xmm11, %ymm1
|
||||
183 | | 0.00 | 1.00 | | | | | | | | | | | | | || 1.0 | | vpsllq $3, %ymm1, %ymm1
|
||||
184 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || 1.0 | | vpaddq %ymm1, %ymm3, %ymm1
|
||||
185 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vmovq %xmm1, %r14
|
||||
186 | 0.12 | 1.88 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm1, %r9
|
||||
187 | | 1.00 | | | | | | | | | | | | | | || 3.0 | | vextracti128 $1, %ymm1, %xmm1
|
||||
188 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%r14), %xmm2 # xmm2 = mem[0],zero
|
||||
189 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vpsubd .LCPI0_5, %xmm11, %xmm6
|
||||
190 | 0.00 | 0.75 | 0.38 | 0.87 | | | | | | | | | | | | || | | vpmovsxdq %xmm6, %ymm6
|
||||
191 | | 0.00 | 1.00 | | | | | | | | | | | | | || | | vpsllq $3, %ymm6, %ymm6
|
||||
192 | 0.00 | 0.00 | 0.49 | 0.51 | | | | | | | | | | | | || | | vmovq %xmm1, %rdi
|
||||
193 | 0.00 | 0.00 | 0.51 | 0.49 | | | | | | | | | | | | || | | vpaddq %ymm6, %ymm3, %ymm6
|
||||
194 | 0.00 | 0.00 | 0.49 | 0.51 | | | | | | | | | | | | || | | vmovq %xmm6, %rcx
|
||||
195 | 0.13 | 1.87 | | | | | | | | | | | | | | || 6.0 | | vpextrq $1, %xmm1, %rbx
|
||||
196 | 0.00 | 2.00 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm6, %rax
|
||||
197 | | 1.00 | | | | | | | | | | | | | | || | | vextracti128 $1, %ymm6, %xmm1
|
||||
198 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rdi), %xmm6 # xmm6 = mem[0],zero
|
||||
199 | 0.00 | 0.00 | 0.00 | 1.00 | | | | | | | | | | | | || | | vmovq %xmm1, %rdi
|
||||
200 | 0.00 | 2.00 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm1, %rsi
|
||||
201 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
|
||||
202 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rcx), %xmm7 # xmm7 = mem[0],zero
|
||||
203 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vpbroadcastd .LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
|
||||
204 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%r9), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0]
|
||||
205 | 0.00 | 0.00 | 0.63 | 0.37 | | | | | | | | | | | | || | | vpaddd %xmm12, %xmm11, %xmm4
|
||||
206 | 0.00 | 0.75 | 0.00 | 1.25 | | | | | | | | | | | | || | | vpmovsxdq %xmm4, %ymm4
|
||||
207 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rax), %xmm7, %xmm7 # xmm7 = xmm7[0],mem[0]
|
||||
208 | | 0.00 | 1.00 | | | | | | | | | | | | | || | | vpsllq $3, %ymm4, %ymm4
|
||||
209 | 0.00 | 0.00 | 0.00 | 1.00 | | | | | | | | | | | | || | | vpaddq %ymm4, %ymm3, %ymm4
|
||||
210 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || 5.0 | | vmovhpd (%rbx), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||
211 | 0.75 | 1.25 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm4, %rax
|
||||
212 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rsi), %xmm1, %xmm1 # xmm1 = xmm1[0],mem[0]
|
||||
213 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vmovq %xmm4, %rcx
|
||||
214 | | 1.00 | | | | | | | | | | | | | | || | | vextracti128 $1, %ymm4, %xmm4
|
||||
215 | 0.00 | 0.00 | 0.00 | 1.00 | | | | | | | | | | | | || | | vmovq %xmm4, %rsi
|
||||
216 | | 1.00 | | | | | | | | | | | | | | || 1.0 | | vinsertf128 $1, %xmm6, %ymm2, %ymm2
|
||||
217 | 1.00 | 1.00 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm4, %rdi
|
||||
218 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rsi), %xmm4 # xmm4 = mem[0],zero
|
||||
219 | | | 0.00 | 1.00 | | | | | | | | | | | | || 3.0 | | vsubpd %ymm2, %ymm14, %ymm2
|
||||
220 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rdi), %xmm4, %xmm4 # xmm4 = xmm4[0],mem[0]
|
||||
221 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rcx), %xmm6 # xmm6 = mem[0],zero
|
||||
222 | | 1.00 | | | | | | | | | | | | | | || | | vinsertf128 $1, %xmm1, %ymm7, %ymm1
|
||||
223 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rax), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||
224 | | 1.00 | | | | | | | | | | | | | | || | | vinsertf128 $1, %xmm4, %ymm6, %ymm4
|
||||
225 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vsubpd %ymm1, %ymm5, %ymm1
|
||||
226 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vsubpd %ymm4, %ymm10, %ymm4
|
||||
227 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm2, %ymm2, %ymm6
|
||||
228 | 1.00 | 0.00 | | | | | | | | | | | | | | || 4.0 | | vfmadd231pd %ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
|
||||
229 | 1.00 | 0.00 | | | | | | | | | | | | | | || 4.0 | | vfmadd231pd %ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
|
||||
230 | 1.00 | | | | | | | | | | | | | 0.50 | 0.50 | || | | vbroadcastsd .LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
|
||||
231 | | | | | 4.50 | 4.50 | | | | | | | | | | || 13.0 | | vdivpd %ymm6, %ymm7, %ymm7
|
||||
232 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm7, %ymm11
|
||||
233 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm9, %ymm11, %ymm11
|
||||
234 | 1.00 | | | | | | | | | | | | | 0.50 | 0.50 | || | | vbroadcastsd .LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
|
||||
235 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm11, %ymm11
|
||||
236 | | | 0.00 | 1.00 | | | | | | | | | | | | || 3.0 | | vaddpd %ymm12, %ymm11, %ymm12
|
||||
237 | 1.00 | 0.00 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
|
||||
238 | 1.00 | 0.00 | | | | | | | | | | | | | | || | | vmulpd %ymm7, %ymm11, %ymm7
|
||||
239 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm12, %ymm7
|
||||
240 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vcmpltpd %ymm8, %ymm6, %ymm6
|
||||
241 | 1.00 | 0.00 | | | | | | | | | | | | | | || 4.0 | | vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
|
||||
242 | 1.00 | 0.00 | | | | | | | | | | | | | | || 1.0 | | vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
|
||||
243 | 1.00 | 0.00 | | | | | | | | | | | | | | || | | vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
|
||||
244 | 1.00 | 0.00 | | | | | | | | | | | | | | || | 4.0 | vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
|
||||
245 | 1.00 | 0.00 | | | | | | | | | | | | | | || | | vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
|
||||
246 | 0.75 | 0.25 | | | | | | | | | | | | | | || | 1.0 | vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
|
||||
247 | | | | | | | | | 0.25 | 0.25 | 0.25 | 0.25 | | | | || | | addq $4, %rbp
|
||||
248 | | | | | | | | | 0.25 | 0.25 | 0.25 | 0.25 | | | | || | | cmpq %rdx, %rbp
|
||||
249 | | | | | | | | | 0.00 | | | | 1.00 | | | || | | jb .LBB0_9
|
||||
250 | | | | | | | | | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
18.8 18.5 15.9 15.9 4.50 4.50 0.50 0.50 0.50 0.50 9.00 9.00 72 5.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
244 | 5.0 | vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13| [244, 246]
|
||||
243 | 5.0 | vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15| [243, 245]
|
||||
241 | 5.0 | vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0| [241, 242]
|
||||
247 | 1.0 | addq $4, %rbp | [247]
|
||||
246 | 1.0 | vblendvpd %ymm6, %ymm4, %ymm13, %ymm13| [246]
|
||||
245 | 1.0 | vblendvpd %ymm6, %ymm1, %ymm15, %ymm15| [245]
|
||||
242 | 1.0 | vblendvpd %ymm6, %ymm2, %ymm0, %ymm0| [242]
|
||||
|
||||
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@@ -0,0 +1,640 @@
|
||||
.text
|
||||
.file "force_lj.c"
|
||||
.section .rodata.cst8,"aM",@progbits,8
|
||||
.p2align 3 # -- Begin function computeForceLJFullNeigh_plain_c
|
||||
.LCPI0_0:
|
||||
.quad 4631952216750555136 # 48
|
||||
.LCPI0_3:
|
||||
.quad 4607182418800017408 # 1
|
||||
.LCPI0_4:
|
||||
.quad -4620693217682128896 # -0.5
|
||||
.section .rodata.cst4,"aM",@progbits,4
|
||||
.p2align 2
|
||||
.LCPI0_1:
|
||||
.long 3 # 0x3
|
||||
.LCPI0_2:
|
||||
.long 2 # 0x2
|
||||
.section .rodata.cst16,"aM",@progbits,16
|
||||
.p2align 4
|
||||
.LCPI0_5:
|
||||
.zero 16,255
|
||||
.text
|
||||
.globl computeForceLJFullNeigh_plain_c
|
||||
.p2align 4, 0x90
|
||||
.type computeForceLJFullNeigh_plain_c,@function
|
||||
computeForceLJFullNeigh_plain_c: #
|
||||
.LcomputeForceLJFullNeigh_plain_c$local:
|
||||
.cfi_startproc
|
||||
# %bb.0: #
|
||||
pushq %rbp
|
||||
.cfi_def_cfa_offset 16
|
||||
pushq %r15
|
||||
.cfi_def_cfa_offset 24
|
||||
pushq %r14
|
||||
.cfi_def_cfa_offset 32
|
||||
pushq %r13
|
||||
.cfi_def_cfa_offset 40
|
||||
pushq %r12
|
||||
.cfi_def_cfa_offset 48
|
||||
pushq %rbx
|
||||
.cfi_def_cfa_offset 56
|
||||
subq $264, %rsp # imm = 0x108
|
||||
.cfi_def_cfa_offset 320
|
||||
.cfi_offset %rbx, -56
|
||||
.cfi_offset %r12, -48
|
||||
.cfi_offset %r13, -40
|
||||
.cfi_offset %r14, -32
|
||||
.cfi_offset %r15, -24
|
||||
.cfi_offset %rbp, -16
|
||||
movq %rcx, %rbx
|
||||
movq %rdx, %r15
|
||||
movq %rsi, %r12
|
||||
movl 4(%rsi), %r14d
|
||||
vmovsd 144(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, (%rsp) # 8-byte Spill
|
||||
vmovsd 40(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, 128(%rsp) # 8-byte Spill
|
||||
vmovq 56(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovdqa %xmm0, 80(%rsp) # 16-byte Spill
|
||||
testl %r14d, %r14d
|
||||
jle .LBB0_2
|
||||
# %bb.1: #
|
||||
movq 64(%r12), %rdi
|
||||
leaq (,%r14,8), %rax
|
||||
leaq (%rax,%rax,2), %rdx
|
||||
xorl %esi, %esi
|
||||
callq _intel_fast_memset
|
||||
.LBB0_2: #
|
||||
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
vmovq %xmm0, 32(%rsp) # 8-byte Folded Spill
|
||||
movl $.L.str, %edi
|
||||
callq likwid_markerStartRegion
|
||||
testl %r14d, %r14d
|
||||
jle .LBB0_19
|
||||
# %bb.3: #
|
||||
vmovsd (%rsp), %xmm0 # 8-byte Reload
|
||||
# xmm0 = mem[0],zero
|
||||
vmulsd %xmm0, %xmm0, %xmm13
|
||||
movq 16(%r15), %r11
|
||||
movq 24(%r15), %rsi
|
||||
movslq 8(%r15), %rdi
|
||||
movq 16(%r12), %r15
|
||||
movq 64(%r12), %r8
|
||||
vmovsd 128(%rsp), %xmm0 # 8-byte Reload
|
||||
# xmm0 = mem[0],zero
|
||||
vmulsd .LCPI0_0(%rip), %xmm0, %xmm15
|
||||
movq %rbx, 24(%rsp) # 8-byte Spill
|
||||
vmovdqu (%rbx), %xmm14
|
||||
decq %r14
|
||||
vmovq %r15, %xmm0
|
||||
vpbroadcastq %xmm0, %ymm3
|
||||
vbroadcastsd %xmm13, %ymm2
|
||||
vmovapd 80(%rsp), %xmm12 # 16-byte Reload
|
||||
vbroadcastsd %xmm12, %ymm8
|
||||
vbroadcastsd %xmm15, %ymm9
|
||||
shlq $2, %rdi
|
||||
xorl %r10d, %r10d
|
||||
movq %r14, 56(%rsp) # 8-byte Spill
|
||||
vmovapd %xmm13, 192(%rsp) # 16-byte Spill
|
||||
movq %rsi, 48(%rsp) # 8-byte Spill
|
||||
movq %rdi, 40(%rsp) # 8-byte Spill
|
||||
vmovapd %xmm15, 176(%rsp) # 16-byte Spill
|
||||
vmovupd %ymm2, 224(%rsp) # 32-byte Spill
|
||||
vmovupd %ymm9, 128(%rsp) # 32-byte Spill
|
||||
jmp .LBB0_6
|
||||
.p2align 4, 0x90
|
||||
.LBB0_17: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
movq %r13, %rdx
|
||||
.LBB0_5: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
vaddsd (%r8,%r12,8), %xmm10, %xmm0
|
||||
vmovsd %xmm0, (%r8,%r12,8)
|
||||
vaddsd (%r8,%rbx,8), %xmm11, %xmm0
|
||||
vmovsd %xmm0, (%r8,%rbx,8)
|
||||
vaddsd (%r8,%rbp,8), %xmm5, %xmm0
|
||||
vmovsd %xmm0, (%r8,%rbp,8)
|
||||
leal 3(%r13), %eax
|
||||
addl $6, %r13d
|
||||
testl %eax, %eax
|
||||
cmovnsl %eax, %r13d
|
||||
sarl $2, %r13d
|
||||
movslq %r13d, %rax
|
||||
vmovq %rax, %xmm0
|
||||
vmovq %rdx, %xmm1
|
||||
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
|
||||
vpaddq %xmm0, %xmm14, %xmm14
|
||||
addq %rdi, %r11
|
||||
cmpq %r14, %r10
|
||||
leaq 1(%r10), %r10
|
||||
je .LBB0_18
|
||||
.LBB0_6: #
|
||||
# =>This Loop Header: Depth=1
|
||||
# Child Loop BB0_9 Depth 2
|
||||
# Child Loop BB0_13 Depth 2
|
||||
movl (%rsi,%r10,4), %r13d
|
||||
leal (%r10,%r10,2), %r12d
|
||||
leal (%r10,%r10,2), %ebx
|
||||
incl %ebx
|
||||
leal (%r10,%r10,2), %ebp
|
||||
addl $2, %ebp
|
||||
testl %r13d, %r13d
|
||||
jle .LBB0_4
|
||||
# %bb.7: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
vmovsd (%r15,%r12,8), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd (%r15,%rbx,8), %xmm1 # xmm1 = mem[0],zero
|
||||
vmovsd (%r15,%rbp,8), %xmm2 # xmm2 = mem[0],zero
|
||||
movq %r13, %rdx
|
||||
movl $4294967292, %eax # imm = 0xFFFFFFFC
|
||||
andq %rax, %rdx
|
||||
vmovapd %xmm0, 112(%rsp) # 16-byte Spill
|
||||
vmovapd %xmm1, 96(%rsp) # 16-byte Spill
|
||||
vmovapd %xmm2, (%rsp) # 16-byte Spill
|
||||
je .LBB0_16
|
||||
# %bb.8: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
movq %rbp, 64(%rsp) # 8-byte Spill
|
||||
movq %rbx, 72(%rsp) # 8-byte Spill
|
||||
vmovdqa %xmm14, 208(%rsp) # 16-byte Spill
|
||||
vbroadcastsd %xmm0, %ymm14
|
||||
vbroadcastsd %xmm1, %ymm5
|
||||
vbroadcastsd %xmm2, %ymm10
|
||||
vxorpd %xmm0, %xmm0, %xmm0
|
||||
vxorpd %xmm15, %xmm15, %xmm15
|
||||
vxorpd %xmm13, %xmm13, %xmm13
|
||||
xorl %ebp, %ebp
|
||||
vmovapd %ymm8, %ymm9
|
||||
vmovupd 224(%rsp), %ymm8 # 32-byte Reload
|
||||
.p2align 4, 0x90
|
||||
movl $111, %ebx # OSACA START MARKER
|
||||
.byte 100 # OSACA START MARKER
|
||||
.byte 103 # OSACA START MARKER
|
||||
.byte 144 # OSACA START MARKER
|
||||
# pointer_increment=16 e95035fc9e97f63299dd5188a0872bfc
|
||||
# LLVM-MCA-BEGIN
|
||||
.LBB0_9: #
|
||||
# Parent Loop BB0_6 Depth=1
|
||||
# => This Inner Loop Header: Depth=2
|
||||
vpbroadcastd .LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
|
||||
vpmulld (%r11,%rbp,4), %xmm1, %xmm11
|
||||
vpmovsxdq %xmm11, %ymm1
|
||||
vpsllq $3, %ymm1, %ymm1
|
||||
vpaddq %ymm1, %ymm3, %ymm1
|
||||
vmovq %xmm1, %r14
|
||||
vpextrq $1, %xmm1, %r9
|
||||
vextracti128 $1, %ymm1, %xmm1
|
||||
vmovsd (%r14), %xmm2 # xmm2 = mem[0],zero
|
||||
vpsubd .LCPI0_5, %xmm11, %xmm6
|
||||
vpmovsxdq %xmm6, %ymm6
|
||||
vpsllq $3, %ymm6, %ymm6
|
||||
vmovq %xmm1, %rdi
|
||||
vpaddq %ymm6, %ymm3, %ymm6
|
||||
vmovq %xmm6, %rcx
|
||||
vpextrq $1, %xmm1, %rbx
|
||||
vpextrq $1, %xmm6, %rax
|
||||
vextracti128 $1, %ymm6, %xmm1
|
||||
vmovsd (%rdi), %xmm6 # xmm6 = mem[0],zero
|
||||
vmovq %xmm1, %rdi
|
||||
vpextrq $1, %xmm1, %rsi
|
||||
vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
|
||||
vmovsd (%rcx), %xmm7 # xmm7 = mem[0],zero
|
||||
vpbroadcastd .LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
|
||||
vmovhpd (%r9), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0]
|
||||
vpaddd %xmm12, %xmm11, %xmm4
|
||||
vpmovsxdq %xmm4, %ymm4
|
||||
vmovhpd (%rax), %xmm7, %xmm7 # xmm7 = xmm7[0],mem[0]
|
||||
vpsllq $3, %ymm4, %ymm4
|
||||
vpaddq %ymm4, %ymm3, %ymm4
|
||||
vmovhpd (%rbx), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||
vpextrq $1, %xmm4, %rax
|
||||
vmovhpd (%rsi), %xmm1, %xmm1 # xmm1 = xmm1[0],mem[0]
|
||||
vmovq %xmm4, %rcx
|
||||
vextracti128 $1, %ymm4, %xmm4
|
||||
vmovq %xmm4, %rsi
|
||||
vinsertf128 $1, %xmm6, %ymm2, %ymm2
|
||||
vpextrq $1, %xmm4, %rdi
|
||||
vmovsd (%rsi), %xmm4 # xmm4 = mem[0],zero
|
||||
vsubpd %ymm2, %ymm14, %ymm2
|
||||
vmovhpd (%rdi), %xmm4, %xmm4 # xmm4 = xmm4[0],mem[0]
|
||||
vmovsd (%rcx), %xmm6 # xmm6 = mem[0],zero
|
||||
vinsertf128 $1, %xmm1, %ymm7, %ymm1
|
||||
vmovhpd (%rax), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||
vinsertf128 $1, %xmm4, %ymm6, %ymm4
|
||||
vsubpd %ymm1, %ymm5, %ymm1
|
||||
vsubpd %ymm4, %ymm10, %ymm4
|
||||
vmulpd %ymm2, %ymm2, %ymm6
|
||||
vfmadd231pd %ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
|
||||
vfmadd231pd %ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
|
||||
vbroadcastsd .LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
|
||||
vdivpd %ymm6, %ymm7, %ymm7
|
||||
vmulpd %ymm7, %ymm7, %ymm11
|
||||
vmulpd %ymm9, %ymm11, %ymm11
|
||||
vbroadcastsd .LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
|
||||
vmulpd %ymm7, %ymm11, %ymm11
|
||||
vaddpd %ymm12, %ymm11, %ymm12
|
||||
vmulpd 128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
|
||||
vmulpd %ymm7, %ymm11, %ymm7
|
||||
vmulpd %ymm7, %ymm12, %ymm7
|
||||
vcmpltpd %ymm8, %ymm6, %ymm6
|
||||
vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
|
||||
vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
|
||||
vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
|
||||
vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
|
||||
vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
|
||||
vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
|
||||
addq $4, %rbp
|
||||
cmpq %rdx, %rbp
|
||||
jb .LBB0_9
|
||||
# LLVM-MCA-END
|
||||
movl $222, %ebx # OSACA END MARKER
|
||||
.byte 100 # OSACA END MARKER
|
||||
.byte 103 # OSACA END MARKER
|
||||
.byte 144 # OSACA END MARKER
|
||||
# %bb.10: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
|
||||
vaddsd %xmm1, %xmm0, %xmm1
|
||||
vextractf128 $1, %ymm0, %xmm0
|
||||
vaddsd %xmm0, %xmm1, %xmm1
|
||||
vpermilpd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0]
|
||||
vaddsd %xmm0, %xmm1, %xmm10
|
||||
vpermilpd $1, %xmm15, %xmm1 # xmm1 = xmm15[1,0]
|
||||
vaddsd %xmm1, %xmm15, %xmm1
|
||||
vextractf128 $1, %ymm15, %xmm2
|
||||
vaddsd %xmm2, %xmm1, %xmm1
|
||||
vpermilpd $1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
|
||||
vaddsd %xmm2, %xmm1, %xmm11
|
||||
vpermilpd $1, %xmm13, %xmm1 # xmm1 = xmm13[1,0]
|
||||
vaddsd %xmm1, %xmm13, %xmm1
|
||||
vextractf128 $1, %ymm13, %xmm2
|
||||
vaddsd %xmm2, %xmm1, %xmm1
|
||||
vpermilpd $1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
|
||||
vaddsd %xmm2, %xmm1, %xmm5
|
||||
movq 56(%rsp), %r14 # 8-byte Reload
|
||||
vmovapd 80(%rsp), %xmm12 # 16-byte Reload
|
||||
vmovapd 192(%rsp), %xmm13 # 16-byte Reload
|
||||
movq 48(%rsp), %rsi # 8-byte Reload
|
||||
movq 40(%rsp), %rdi # 8-byte Reload
|
||||
vmovdqa 208(%rsp), %xmm14 # 16-byte Reload
|
||||
vmovapd 176(%rsp), %xmm15 # 16-byte Reload
|
||||
vmovapd %ymm9, %ymm8
|
||||
movq 72(%rsp), %rbx # 8-byte Reload
|
||||
movq 64(%rsp), %rbp # 8-byte Reload
|
||||
vmovapd 112(%rsp), %xmm0 # 16-byte Reload
|
||||
cmpq %r13, %rdx
|
||||
jae .LBB0_17
|
||||
jmp .LBB0_11
|
||||
.p2align 4, 0x90
|
||||
.LBB0_4: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
movslq %r13d, %rdx
|
||||
vxorpd %xmm5, %xmm5, %xmm5
|
||||
vxorpd %xmm11, %xmm11, %xmm11
|
||||
vxorpd %xmm10, %xmm10, %xmm10
|
||||
jmp .LBB0_5
|
||||
.p2align 4, 0x90
|
||||
.LBB0_16: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
vxorpd %xmm10, %xmm10, %xmm10
|
||||
vxorpd %xmm11, %xmm11, %xmm11
|
||||
vxorpd %xmm5, %xmm5, %xmm5
|
||||
cmpq %r13, %rdx
|
||||
jae .LBB0_17
|
||||
.LBB0_11: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
vmovapd 96(%rsp), %xmm4 # 16-byte Reload
|
||||
jmp .LBB0_13
|
||||
.p2align 4, 0x90
|
||||
.LBB0_12: #
|
||||
# in Loop: Header=BB0_13 Depth=2
|
||||
incq %rdx
|
||||
cmpq %rdx, %r13
|
||||
je .LBB0_17
|
||||
.LBB0_13: #
|
||||
# Parent Loop BB0_6 Depth=1
|
||||
# => This Inner Loop Header: Depth=2
|
||||
movl (%r11,%rdx,4), %eax
|
||||
leal (%rax,%rax,2), %ecx
|
||||
movslq %ecx, %rcx
|
||||
vsubsd (%r15,%rcx,8), %xmm0, %xmm6
|
||||
leal (%rax,%rax,2), %ecx
|
||||
incl %ecx
|
||||
movslq %ecx, %rcx
|
||||
vsubsd (%r15,%rcx,8), %xmm4, %xmm2
|
||||
leal 2(%rax,%rax,2), %eax
|
||||
cltq
|
||||
vmovapd (%rsp), %xmm1 # 16-byte Reload
|
||||
vsubsd (%r15,%rax,8), %xmm1, %xmm1
|
||||
vmulsd %xmm6, %xmm6, %xmm7
|
||||
vfmadd231sd %xmm2, %xmm2, %xmm7 # xmm7 = (xmm2 * xmm2) + xmm7
|
||||
vfmadd231sd %xmm1, %xmm1, %xmm7 # xmm7 = (xmm1 * xmm1) + xmm7
|
||||
vucomisd %xmm13, %xmm7
|
||||
jae .LBB0_12
|
||||
# %bb.14: #
|
||||
# in Loop: Header=BB0_13 Depth=2
|
||||
vmovsd .LCPI0_3(%rip), %xmm0 # xmm0 = mem[0],zero
|
||||
vdivsd %xmm7, %xmm0, %xmm7
|
||||
vmulsd %xmm7, %xmm7, %xmm0
|
||||
vmulsd %xmm0, %xmm12, %xmm0
|
||||
vmulsd %xmm7, %xmm0, %xmm0
|
||||
vaddsd .LCPI0_4(%rip), %xmm0, %xmm4
|
||||
vmulsd %xmm7, %xmm15, %xmm7
|
||||
vmulsd %xmm0, %xmm7, %xmm0
|
||||
vmulsd %xmm4, %xmm0, %xmm0
|
||||
vmovapd 96(%rsp), %xmm4 # 16-byte Reload
|
||||
vfmadd231sd %xmm6, %xmm0, %xmm10 # xmm10 = (xmm0 * xmm6) + xmm10
|
||||
vfmadd231sd %xmm2, %xmm0, %xmm11 # xmm11 = (xmm0 * xmm2) + xmm11
|
||||
vfmadd231sd %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5
|
||||
vmovapd 112(%rsp), %xmm0 # 16-byte Reload
|
||||
jmp .LBB0_12
|
||||
.LBB0_18: #
|
||||
movq 24(%rsp), %rax # 8-byte Reload
|
||||
vmovdqu %xmm14, (%rax)
|
||||
.LBB0_19: #
|
||||
movl $.L.str, %edi
|
||||
vzeroupper
|
||||
callq likwid_markerStopRegion
|
||||
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
vsubsd 32(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload
|
||||
addq $264, %rsp # imm = 0x108
|
||||
.cfi_def_cfa_offset 56
|
||||
popq %rbx
|
||||
.cfi_def_cfa_offset 48
|
||||
popq %r12
|
||||
.cfi_def_cfa_offset 40
|
||||
popq %r13
|
||||
.cfi_def_cfa_offset 32
|
||||
popq %r14
|
||||
.cfi_def_cfa_offset 24
|
||||
popq %r15
|
||||
.cfi_def_cfa_offset 16
|
||||
popq %rbp
|
||||
.cfi_def_cfa_offset 8
|
||||
retq
|
||||
.Lfunc_end0:
|
||||
.size computeForceLJFullNeigh_plain_c, .Lfunc_end0-computeForceLJFullNeigh_plain_c
|
||||
.cfi_endproc
|
||||
# -- End function
|
||||
.section .rodata.cst8,"aM",@progbits,8
|
||||
.p2align 3 # -- Begin function computeForceLJHalfNeigh
|
||||
.LCPI1_0:
|
||||
.quad 4631952216750555136 # 48
|
||||
.LCPI1_1:
|
||||
.quad 4607182418800017408 # 1
|
||||
.LCPI1_2:
|
||||
.quad -4620693217682128896 # -0.5
|
||||
.text
|
||||
.globl computeForceLJHalfNeigh
|
||||
.p2align 4, 0x90
|
||||
.type computeForceLJHalfNeigh,@function
|
||||
computeForceLJHalfNeigh: #
|
||||
.LcomputeForceLJHalfNeigh$local:
|
||||
.cfi_startproc
|
||||
# %bb.0: #
|
||||
pushq %rbp
|
||||
.cfi_def_cfa_offset 16
|
||||
pushq %r15
|
||||
.cfi_def_cfa_offset 24
|
||||
pushq %r14
|
||||
.cfi_def_cfa_offset 32
|
||||
pushq %r13
|
||||
.cfi_def_cfa_offset 40
|
||||
pushq %r12
|
||||
.cfi_def_cfa_offset 48
|
||||
pushq %rbx
|
||||
.cfi_def_cfa_offset 56
|
||||
subq $40, %rsp
|
||||
.cfi_def_cfa_offset 96
|
||||
.cfi_offset %rbx, -56
|
||||
.cfi_offset %r12, -48
|
||||
.cfi_offset %r13, -40
|
||||
.cfi_offset %r14, -32
|
||||
.cfi_offset %r15, -24
|
||||
.cfi_offset %rbp, -16
|
||||
movq %rcx, 16(%rsp) # 8-byte Spill
|
||||
movq %rdx, %r15
|
||||
movq %rsi, %r12
|
||||
movl 4(%rsi), %r13d
|
||||
vmovsd 144(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, 8(%rsp) # 8-byte Spill
|
||||
vmovsd 40(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, (%rsp) # 8-byte Spill
|
||||
vmovsd 56(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, 32(%rsp) # 8-byte Spill
|
||||
testl %r13d, %r13d
|
||||
jle .LBB1_2
|
||||
# %bb.1: #
|
||||
movq 64(%r12), %rdi
|
||||
leaq (,%r13,8), %rax
|
||||
leaq (%rax,%rax,2), %rdx
|
||||
xorl %esi, %esi
|
||||
callq _intel_fast_memset
|
||||
.LBB1_2: #
|
||||
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
vmovsd %xmm0, 24(%rsp) # 8-byte Spill
|
||||
movl $.L.str.1, %edi
|
||||
callq likwid_markerStartRegion
|
||||
testl %r13d, %r13d
|
||||
jle .LBB1_8
|
||||
# %bb.3: #
|
||||
vmovsd 8(%rsp), %xmm0 # 8-byte Reload
|
||||
# xmm0 = mem[0],zero
|
||||
vmulsd %xmm0, %xmm0, %xmm12
|
||||
movq 16(%r15), %rax
|
||||
movq 24(%r15), %rcx
|
||||
movq %rcx, 8(%rsp) # 8-byte Spill
|
||||
movslq 8(%r15), %rdx
|
||||
movq 16(%r12), %rsi
|
||||
movq 64(%r12), %rdi
|
||||
vmovsd (%rsp), %xmm0 # 8-byte Reload
|
||||
# xmm0 = mem[0],zero
|
||||
vmulsd .LCPI1_0(%rip), %xmm0, %xmm11
|
||||
movq 16(%rsp), %rcx # 8-byte Reload
|
||||
vmovdqu (%rcx), %xmm10
|
||||
shlq $2, %rdx
|
||||
movq %rdx, (%rsp) # 8-byte Spill
|
||||
xorl %r12d, %r12d
|
||||
jmp .LBB1_4
|
||||
.p2align 4, 0x90
|
||||
.LBB1_5: #
|
||||
# in Loop: Header=BB1_4 Depth=1
|
||||
vxorpd %xmm13, %xmm13, %xmm13
|
||||
movq %r9, %rdx
|
||||
vxorpd %xmm9, %xmm9, %xmm9
|
||||
vxorpd %xmm14, %xmm14, %xmm14
|
||||
.LBB1_6: #
|
||||
# in Loop: Header=BB1_4 Depth=1
|
||||
vaddsd (%rdi,%r15,8), %xmm14, %xmm0
|
||||
vmovsd %xmm0, (%rdi,%r15,8)
|
||||
vaddsd (%rdi,%r10,8), %xmm9, %xmm0
|
||||
vmovsd %xmm0, (%rdi,%r10,8)
|
||||
vaddsd (%rdi,%r11,8), %xmm13, %xmm0
|
||||
vmovsd %xmm0, (%rdi,%r11,8)
|
||||
leal 3(%r9), %ecx
|
||||
addl $6, %r9d
|
||||
testl %ecx, %ecx
|
||||
cmovnsl %ecx, %r9d
|
||||
sarl $2, %r9d
|
||||
movslq %r9d, %rcx
|
||||
vmovq %rcx, %xmm0
|
||||
vmovq %rdx, %xmm1
|
||||
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
|
||||
vpaddq %xmm0, %xmm10, %xmm10
|
||||
incq %r12
|
||||
addq (%rsp), %rax # 8-byte Folded Reload
|
||||
cmpq %r13, %r12
|
||||
je .LBB1_7
|
||||
.LBB1_4: #
|
||||
# =>This Loop Header: Depth=1
|
||||
# Child Loop BB1_10 Depth 2
|
||||
movq 8(%rsp), %rcx # 8-byte Reload
|
||||
movslq (%rcx,%r12,4), %r9
|
||||
leaq (%r12,%r12,2), %rcx
|
||||
leal 1(%rcx), %r10d
|
||||
leal 2(%rcx), %r11d
|
||||
movl %ecx, %r15d
|
||||
testq %r9, %r9
|
||||
jle .LBB1_5
|
||||
# %bb.9: #
|
||||
# in Loop: Header=BB1_4 Depth=1
|
||||
vmovsd (%rsi,%r15,8), %xmm15 # xmm15 = mem[0],zero
|
||||
vmovsd (%rsi,%r10,8), %xmm4 # xmm4 = mem[0],zero
|
||||
vmovsd (%rsi,%r11,8), %xmm1 # xmm1 = mem[0],zero
|
||||
movl %r9d, %edx
|
||||
vxorpd %xmm14, %xmm14, %xmm14
|
||||
xorl %ecx, %ecx
|
||||
vxorpd %xmm9, %xmm9, %xmm9
|
||||
vxorpd %xmm13, %xmm13, %xmm13
|
||||
jmp .LBB1_10
|
||||
.p2align 4, 0x90
|
||||
.LBB1_13: #
|
||||
# in Loop: Header=BB1_10 Depth=2
|
||||
incq %rcx
|
||||
cmpq %rcx, %rdx
|
||||
je .LBB1_6
|
||||
.LBB1_10: #
|
||||
# Parent Loop BB1_4 Depth=1
|
||||
# => This Inner Loop Header: Depth=2
|
||||
movslq (%rax,%rcx,4), %r8
|
||||
leaq (%r8,%r8,2), %r14
|
||||
vsubsd (%rsi,%r14,8), %xmm15, %xmm2
|
||||
movslq %r14d, %rbp
|
||||
vsubsd 8(%rsi,%rbp,8), %xmm4, %xmm5
|
||||
vsubsd 16(%rsi,%rbp,8), %xmm1, %xmm0
|
||||
vmulsd %xmm2, %xmm2, %xmm6
|
||||
vfmadd231sd %xmm5, %xmm5, %xmm6 # xmm6 = (xmm5 * xmm5) + xmm6
|
||||
vfmadd231sd %xmm0, %xmm0, %xmm6 # xmm6 = (xmm0 * xmm0) + xmm6
|
||||
vucomisd %xmm12, %xmm6
|
||||
jae .LBB1_13
|
||||
# %bb.11: #
|
||||
# in Loop: Header=BB1_10 Depth=2
|
||||
vmovsd .LCPI1_1(%rip), %xmm3 # xmm3 = mem[0],zero
|
||||
vdivsd %xmm6, %xmm3, %xmm6
|
||||
vmulsd 32(%rsp), %xmm6, %xmm3 # 8-byte Folded Reload
|
||||
vmulsd %xmm6, %xmm6, %xmm8
|
||||
vmulsd %xmm3, %xmm8, %xmm3
|
||||
vaddsd .LCPI1_2(%rip), %xmm3, %xmm7
|
||||
vmulsd %xmm6, %xmm11, %xmm6
|
||||
vmulsd %xmm3, %xmm6, %xmm3
|
||||
vmulsd %xmm7, %xmm3, %xmm3
|
||||
vmulsd %xmm2, %xmm3, %xmm6
|
||||
vaddsd %xmm6, %xmm14, %xmm14
|
||||
vmulsd %xmm5, %xmm3, %xmm2
|
||||
vaddsd %xmm2, %xmm9, %xmm9
|
||||
vmulsd %xmm0, %xmm3, %xmm0
|
||||
vaddsd %xmm0, %xmm13, %xmm13
|
||||
cmpl %r13d, %r8d
|
||||
jge .LBB1_13
|
||||
# %bb.12: #
|
||||
# in Loop: Header=BB1_10 Depth=2
|
||||
leaq 1(%rbp), %rbx
|
||||
addq $2, %rbp
|
||||
vmovsd (%rdi,%r14,8), %xmm3 # xmm3 = mem[0],zero
|
||||
vsubsd %xmm6, %xmm3, %xmm3
|
||||
vmovsd %xmm3, (%rdi,%r14,8)
|
||||
vmovsd (%rdi,%rbx,8), %xmm3 # xmm3 = mem[0],zero
|
||||
vsubsd %xmm2, %xmm3, %xmm2
|
||||
vmovsd %xmm2, (%rdi,%rbx,8)
|
||||
vmovsd (%rdi,%rbp,8), %xmm2 # xmm2 = mem[0],zero
|
||||
vsubsd %xmm0, %xmm2, %xmm0
|
||||
vmovsd %xmm0, (%rdi,%rbp,8)
|
||||
jmp .LBB1_13
|
||||
.LBB1_7: #
|
||||
movq 16(%rsp), %rax # 8-byte Reload
|
||||
vmovdqu %xmm10, (%rax)
|
||||
.LBB1_8: #
|
||||
movl $.L.str.1, %edi
|
||||
callq likwid_markerStopRegion
|
||||
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
vsubsd 24(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload
|
||||
addq $40, %rsp
|
||||
.cfi_def_cfa_offset 56
|
||||
popq %rbx
|
||||
.cfi_def_cfa_offset 48
|
||||
popq %r12
|
||||
.cfi_def_cfa_offset 40
|
||||
popq %r13
|
||||
.cfi_def_cfa_offset 32
|
||||
popq %r14
|
||||
.cfi_def_cfa_offset 24
|
||||
popq %r15
|
||||
.cfi_def_cfa_offset 16
|
||||
popq %rbp
|
||||
.cfi_def_cfa_offset 8
|
||||
retq
|
||||
.Lfunc_end1:
|
||||
.size computeForceLJHalfNeigh, .Lfunc_end1-computeForceLJHalfNeigh
|
||||
.cfi_endproc
|
||||
# -- End function
|
||||
.globl computeForceLJFullNeigh_simd # -- Begin function computeForceLJFullNeigh_simd
|
||||
.p2align 4, 0x90
|
||||
.type computeForceLJFullNeigh_simd,@function
|
||||
computeForceLJFullNeigh_simd: #
|
||||
.LcomputeForceLJFullNeigh_simd$local:
|
||||
.cfi_startproc
|
||||
# %bb.0: #
|
||||
pushq %rax
|
||||
.cfi_def_cfa_offset 16
|
||||
movl 4(%rsi), %eax
|
||||
testl %eax, %eax
|
||||
jle .LBB2_2
|
||||
# %bb.1: #
|
||||
movq 64(%rsi), %rdi
|
||||
shlq $3, %rax
|
||||
leaq (%rax,%rax,2), %rdx
|
||||
xorl %esi, %esi
|
||||
callq _intel_fast_memset
|
||||
.LBB2_2: #
|
||||
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
movq stderr(%rip), %rcx
|
||||
movl $.L.str.2, %edi
|
||||
movl $65, %esi
|
||||
movl $1, %edx
|
||||
callq fwrite
|
||||
movl $-1, %edi
|
||||
callq exit
|
||||
.Lfunc_end2:
|
||||
.size computeForceLJFullNeigh_simd, .Lfunc_end2-computeForceLJFullNeigh_simd
|
||||
.cfi_endproc
|
||||
# -- End function
|
||||
.type .L.str,@object #
|
||||
.section .rodata.str1.1,"aMS",@progbits,1
|
||||
.L.str:
|
||||
.asciz "force"
|
||||
.size .L.str, 6
|
||||
.type .L.str.1,@object #
|
||||
.L.str.1:
|
||||
.asciz "forceLJ-halfneigh"
|
||||
.size .L.str.1, 18
|
||||
.type .L.str.2,@object #
|
||||
.L.str.2:
|
||||
.asciz "Error: SIMD kernel not implemented for specified instruction set!"
|
||||
.size .L.str.2, 66
|
||||
.ident "Intel(R) oneAPI DPC++ Compiler 2021.1-beta05 (2020.2.0.0304)"
|
||||
.section ".note.GNU-stack","",@progbits
|
||||
@@ -0,0 +1,105 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: force_lj_icx_avx2_markers.s
|
||||
Architecture: ZEN3
|
||||
Timestamp: 2022-12-12 12:47:07
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 | 1 | 2 | 3 | DV0 | DV1 | 4 | 5 | 6 | 7 | 8 - 8DV | 9 | 10 | 11 | 12 | 13 || CP | LCD |
|
||||
---------------------------------------------------------------------------------------------------------------------------------------------
|
||||
172 | | | | | | | | | | | | | | | | || | | .LBB0_9: #
|
||||
173 | | | | | | | | | | | | | | | | || | | # Parent Loop BB0_6 Depth=1
|
||||
174 | | | | | | | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2
|
||||
175 | | 0.250 | 0.75 | | | | | | | | | | | 0.50 | 0.50 | || 1.0 | | vpbroadcastd .LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
|
||||
176 | 0.00 | | | 1.00 | | | | | | | | | | 0.50 | 0.50 | || 3.0 | | vpmulld (%r11,%rbp,4), %xmm1, %xmm11
|
||||
177 | 0.00 | 1.010 | 0.25 | 0.74 | | | | | | | | | | | | || 4.0 | | vpmovsxdq %xmm11, %ymm1
|
||||
178 | | 0.000 | 1.00 | | | | | | | | | | | | | || 1.0 | | vpsllq $3, %ymm1, %ymm1
|
||||
179 | 0.00 | 0.000 | 0.49 | 0.51 | | | | | | | | | | | | || 1.0 | | vpaddq %ymm1, %ymm3, %ymm1
|
||||
180 | 0.00 | 0.000 | 0.51 | 0.49 | | | | | | | | | | | | || | | vmovq %xmm1, %r14
|
||||
181 | | 1.000 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm1, %r9
|
||||
182 | | 1.000 | | | | | | | | | | | | | | || 3.0 | | vextracti128 $1, %ymm1, %xmm1
|
||||
183 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%r14), %xmm2 # xmm2 = mem[0],zero
|
||||
184 | 0.00 | 0.000 | 0.49 | 0.51 | | | | | | | | | | | | || | | vpsubd .LCPI0_5, %xmm11, %xmm6
|
||||
185 | 0.00 | 0.750 | 0.38 | 0.87 | | | | | | | | | | | | || | | vpmovsxdq %xmm6, %ymm6
|
||||
186 | | 0.000 | 1.00 | | | | | | | | | | | | | || | | vpsllq $3, %ymm6, %ymm6
|
||||
187 | 0.00 | 0.000 | 0.50 | 0.50 | | | | | | | | | | | | || | | vmovq %xmm1, %rdi
|
||||
188 | 0.00 | 0.000 | 0.50 | 0.50 | | | | | | | | | | | | || | | vpaddq %ymm6, %ymm3, %ymm6
|
||||
189 | 0.00 | 0.000 | 0.50 | 0.50 | | | | | | | | | | | | || | | vmovq %xmm6, %rcx
|
||||
190 | | 1.000 | | | | | | | | | | | | | | || 6.0 | | vpextrq $1, %xmm1, %rbx
|
||||
191 | | 1.000 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm6, %rax
|
||||
192 | | 1.000 | | | | | | | | | | | | | | || | | vextracti128 $1, %ymm6, %xmm1
|
||||
193 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rdi), %xmm6 # xmm6 = mem[0],zero
|
||||
194 | 0.00 | 0.000 | 0.00 | 1.00 | | | | | | | | | | | | || | | vmovq %xmm1, %rdi
|
||||
195 | | 1.000 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm1, %rsi
|
||||
196 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
|
||||
197 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rcx), %xmm7 # xmm7 = mem[0],zero
|
||||
198 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vpbroadcastd .LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
|
||||
199 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%r9), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0]
|
||||
200 | 0.00 | 0.000 | 0.62 | 0.38 | | | | | | | | | | | | || | | vpaddd %xmm12, %xmm11, %xmm4
|
||||
201 | 0.00 | 0.750 | 0.00 | 1.25 | | | | | | | | | | | | || | | vpmovsxdq %xmm4, %ymm4
|
||||
202 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rax), %xmm7, %xmm7 # xmm7 = xmm7[0],mem[0]
|
||||
203 | | 0.000 | 1.00 | | | | | | | | | | | | | || | | vpsllq $3, %ymm4, %ymm4
|
||||
204 | 0.00 | 0.000 | 0.00 | 1.00 | | | | | | | | | | | | || | | vpaddq %ymm4, %ymm3, %ymm4
|
||||
205 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || 5.0 | | vmovhpd (%rbx), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||
206 | | 1.000 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm4, %rax
|
||||
207 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rsi), %xmm1, %xmm1 # xmm1 = xmm1[0],mem[0]
|
||||
208 | 0.00 | 0.000 | 0.51 | 0.49 | | | | | | | | | | | | || | | vmovq %xmm4, %rcx
|
||||
209 | | 1.000 | | | | | | | | | | | | | | || | | vextracti128 $1, %ymm4, %xmm4
|
||||
210 | 0.00 | -0.01 | 0.00 | 1.00 | | | | | | | | | | | | || | | vmovq %xmm4, %rsi
|
||||
211 | | 1.000 | | | | | | | | | | | | | | || 1.0 | | vinsertf128 $1, %xmm6, %ymm2, %ymm2
|
||||
212 | | 1.000 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm4, %rdi
|
||||
213 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rsi), %xmm4 # xmm4 = mem[0],zero
|
||||
214 | | | 0.00 | 1.00 | | | | | | | | | | | | || 3.0 | | vsubpd %ymm2, %ymm14, %ymm2
|
||||
215 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rdi), %xmm4, %xmm4 # xmm4 = xmm4[0],mem[0]
|
||||
216 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rcx), %xmm6 # xmm6 = mem[0],zero
|
||||
217 | | 1.000 | | | | | | | | | | | | | | || | | vinsertf128 $1, %xmm1, %ymm7, %ymm1
|
||||
218 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rax), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||
219 | | 1.000 | | | | | | | | | | | | | | || | | vinsertf128 $1, %xmm4, %ymm6, %ymm4
|
||||
220 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vsubpd %ymm1, %ymm5, %ymm1
|
||||
221 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vsubpd %ymm4, %ymm10, %ymm4
|
||||
222 | 1.00 | 0.000 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm2, %ymm2, %ymm6
|
||||
223 | 1.00 | 0.000 | | | | | | | | | | | | | | || 4.0 | | vfmadd231pd %ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
|
||||
224 | 1.00 | 0.000 | | | | | | | | | | | | | | || 4.0 | | vfmadd231pd %ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
|
||||
225 | 1.00 | | | | | | | | | | | | | 0.50 | 0.50 | || | | vbroadcastsd .LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
|
||||
226 | | | | | 4.50 | 4.50 | | | | | | | | | | || 13.0 | | vdivpd %ymm6, %ymm7, %ymm7
|
||||
227 | 1.00 | 0.000 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm7, %ymm11
|
||||
228 | 1.00 | 0.000 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm9, %ymm11, %ymm11
|
||||
229 | 1.00 | | | | | | | | | | | | | 0.50 | 0.50 | || | | vbroadcastsd .LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
|
||||
230 | 1.00 | 0.000 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm11, %ymm11
|
||||
231 | | | 0.00 | 1.00 | | | | | | | | | | | | || 3.0 | | vaddpd %ymm12, %ymm11, %ymm12
|
||||
232 | 1.00 | 0.000 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
|
||||
233 | 1.00 | 0.000 | | | | | | | | | | | | | | || | | vmulpd %ymm7, %ymm11, %ymm7
|
||||
234 | 1.00 | 0.000 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm12, %ymm7
|
||||
235 | | | 0.12 | 0.88 | | | | | | | | | | | | || | | vcmpltpd %ymm8, %ymm6, %ymm6
|
||||
236 | 1.00 | 0.000 | | | | | | | | | | | | | | || 4.0 | | vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
|
||||
237 | 1.00 | 0.000 | | | | | | | | | | | | | | || 1.0 | | vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
|
||||
238 | 1.00 | 0.000 | | | | | | | | | | | | | | || | | vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
|
||||
239 | 1.00 | 0.000 | | | | | | | | | | | | | | || | 4.0 | vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
|
||||
240 | 0.62 | 0.380 | | | | | | | | | | | | | | || | | vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
|
||||
241 | 0.50 | 0.500 | | | | | | | | | | | | | | || | 1.0 | vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
|
||||
242 | | | | | | | | | 0.25 | 0.25 | 0.25 | 0.25 | | | | || | | addq $4, %rbp
|
||||
243 | | | | | | | | | 0.25 | 0.25 | 0.25 | 0.25 | | | | || | | cmpq %rdx, %rbp
|
||||
244 | | | | | | | | | 0.00 | | | | 1.00 | | | || | | jb .LBB0_9
|
||||
|
||||
16.1 15.63 15.6 15.6 4.50 4.50 0.50 0.50 0.50 0.50 9.00 9.00 72 5.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
239 | 5.0 | vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13| [239, 241]
|
||||
238 | 5.0 | vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15| [238, 240]
|
||||
236 | 5.0 | vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0| [236, 237]
|
||||
242 | 1.0 | addq $4, %rbp | [242]
|
||||
241 | 1.0 | vblendvpd %ymm6, %ymm4, %ymm13, %ymm13| [241]
|
||||
240 | 1.0 | vblendvpd %ymm6, %ymm1, %ymm15, %ymm15| [240]
|
||||
237 | 1.0 | vblendvpd %ymm6, %ymm2, %ymm0, %ymm0| [237]
|
||||
|
||||
@@ -0,0 +1,638 @@
|
||||
.text
|
||||
.file "force_lj.c"
|
||||
.section .rodata.cst8,"aM",@progbits,8
|
||||
.p2align 3 # -- Begin function computeForceLJFullNeigh_plain_c
|
||||
.LCPI0_0:
|
||||
.quad 4631952216750555136 # 48
|
||||
.LCPI0_3:
|
||||
.quad 4607182418800017408 # 1
|
||||
.LCPI0_4:
|
||||
.quad -4620693217682128896 # -0.5
|
||||
.section .rodata.cst4,"aM",@progbits,4
|
||||
.p2align 2
|
||||
.LCPI0_1:
|
||||
.long 3 # 0x3
|
||||
.LCPI0_2:
|
||||
.long 2 # 0x2
|
||||
.section .rodata.cst16,"aM",@progbits,16
|
||||
.p2align 4
|
||||
.LCPI0_5:
|
||||
.zero 16,255
|
||||
.text
|
||||
.globl computeForceLJFullNeigh_plain_c
|
||||
.p2align 4, 0x90
|
||||
.type computeForceLJFullNeigh_plain_c,@function
|
||||
computeForceLJFullNeigh_plain_c: #
|
||||
.LcomputeForceLJFullNeigh_plain_c$local:
|
||||
.cfi_startproc
|
||||
# %bb.0: #
|
||||
pushq %rbp
|
||||
.cfi_def_cfa_offset 16
|
||||
pushq %r15
|
||||
.cfi_def_cfa_offset 24
|
||||
pushq %r14
|
||||
.cfi_def_cfa_offset 32
|
||||
pushq %r13
|
||||
.cfi_def_cfa_offset 40
|
||||
pushq %r12
|
||||
.cfi_def_cfa_offset 48
|
||||
pushq %rbx
|
||||
.cfi_def_cfa_offset 56
|
||||
subq $264, %rsp # imm = 0x108
|
||||
.cfi_def_cfa_offset 320
|
||||
.cfi_offset %rbx, -56
|
||||
.cfi_offset %r12, -48
|
||||
.cfi_offset %r13, -40
|
||||
.cfi_offset %r14, -32
|
||||
.cfi_offset %r15, -24
|
||||
.cfi_offset %rbp, -16
|
||||
movq %rcx, %rbx
|
||||
movq %rdx, %r15
|
||||
movq %rsi, %r12
|
||||
movl 4(%rsi), %r14d
|
||||
vmovsd 144(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, (%rsp) # 8-byte Spill
|
||||
vmovsd 40(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, 128(%rsp) # 8-byte Spill
|
||||
vmovq 56(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovdqa %xmm0, 80(%rsp) # 16-byte Spill
|
||||
testl %r14d, %r14d
|
||||
jle .LBB0_2
|
||||
# %bb.1: #
|
||||
movq 64(%r12), %rdi
|
||||
leaq (,%r14,8), %rax
|
||||
leaq (%rax,%rax,2), %rdx
|
||||
xorl %esi, %esi
|
||||
callq _intel_fast_memset
|
||||
.LBB0_2: #
|
||||
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
vmovq %xmm0, 32(%rsp) # 8-byte Folded Spill
|
||||
movl $.L.str, %edi
|
||||
callq likwid_markerStartRegion
|
||||
testl %r14d, %r14d
|
||||
jle .LBB0_19
|
||||
# %bb.3: #
|
||||
vmovsd (%rsp), %xmm0 # 8-byte Reload
|
||||
# xmm0 = mem[0],zero
|
||||
vmulsd %xmm0, %xmm0, %xmm13
|
||||
movq 16(%r15), %r11
|
||||
movq 24(%r15), %rsi
|
||||
movslq 8(%r15), %rdi
|
||||
movq 16(%r12), %r15
|
||||
movq 64(%r12), %r8
|
||||
vmovsd 128(%rsp), %xmm0 # 8-byte Reload
|
||||
# xmm0 = mem[0],zero
|
||||
vmulsd .LCPI0_0(%rip), %xmm0, %xmm15
|
||||
movq %rbx, 24(%rsp) # 8-byte Spill
|
||||
vmovdqu (%rbx), %xmm14
|
||||
decq %r14
|
||||
vmovq %r15, %xmm0
|
||||
vpbroadcastq %xmm0, %ymm3
|
||||
vbroadcastsd %xmm13, %ymm2
|
||||
vmovapd 80(%rsp), %xmm12 # 16-byte Reload
|
||||
vbroadcastsd %xmm12, %ymm8
|
||||
vbroadcastsd %xmm15, %ymm9
|
||||
shlq $2, %rdi
|
||||
xorl %r10d, %r10d
|
||||
movq %r14, 56(%rsp) # 8-byte Spill
|
||||
vmovapd %xmm13, 192(%rsp) # 16-byte Spill
|
||||
movq %rsi, 48(%rsp) # 8-byte Spill
|
||||
movq %rdi, 40(%rsp) # 8-byte Spill
|
||||
vmovapd %xmm15, 176(%rsp) # 16-byte Spill
|
||||
vmovupd %ymm2, 224(%rsp) # 32-byte Spill
|
||||
vmovupd %ymm9, 128(%rsp) # 32-byte Spill
|
||||
jmp .LBB0_6
|
||||
.p2align 4, 0x90
|
||||
.LBB0_17: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
movq %r13, %rdx
|
||||
.LBB0_5: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
vaddsd (%r8,%r12,8), %xmm10, %xmm0
|
||||
vmovsd %xmm0, (%r8,%r12,8)
|
||||
vaddsd (%r8,%rbx,8), %xmm11, %xmm0
|
||||
vmovsd %xmm0, (%r8,%rbx,8)
|
||||
vaddsd (%r8,%rbp,8), %xmm5, %xmm0
|
||||
vmovsd %xmm0, (%r8,%rbp,8)
|
||||
leal 3(%r13), %eax
|
||||
addl $6, %r13d
|
||||
testl %eax, %eax
|
||||
cmovnsl %eax, %r13d
|
||||
sarl $2, %r13d
|
||||
movslq %r13d, %rax
|
||||
vmovq %rax, %xmm0
|
||||
vmovq %rdx, %xmm1
|
||||
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
|
||||
vpaddq %xmm0, %xmm14, %xmm14
|
||||
addq %rdi, %r11
|
||||
cmpq %r14, %r10
|
||||
leaq 1(%r10), %r10
|
||||
je .LBB0_18
|
||||
.LBB0_6: #
|
||||
# =>This Loop Header: Depth=1
|
||||
# Child Loop BB0_9 Depth 2
|
||||
# Child Loop BB0_13 Depth 2
|
||||
movl (%rsi,%r10,4), %r13d
|
||||
leal (%r10,%r10,2), %r12d
|
||||
leal (%r10,%r10,2), %ebx
|
||||
incl %ebx
|
||||
leal (%r10,%r10,2), %ebp
|
||||
addl $2, %ebp
|
||||
testl %r13d, %r13d
|
||||
jle .LBB0_4
|
||||
# %bb.7: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
vmovsd (%r15,%r12,8), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd (%r15,%rbx,8), %xmm1 # xmm1 = mem[0],zero
|
||||
vmovsd (%r15,%rbp,8), %xmm2 # xmm2 = mem[0],zero
|
||||
movq %r13, %rdx
|
||||
movl $4294967292, %eax # imm = 0xFFFFFFFC
|
||||
andq %rax, %rdx
|
||||
vmovapd %xmm0, 112(%rsp) # 16-byte Spill
|
||||
vmovapd %xmm1, 96(%rsp) # 16-byte Spill
|
||||
vmovapd %xmm2, (%rsp) # 16-byte Spill
|
||||
je .LBB0_16
|
||||
# %bb.8: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
movq %rbp, 64(%rsp) # 8-byte Spill
|
||||
movq %rbx, 72(%rsp) # 8-byte Spill
|
||||
vmovdqa %xmm14, 208(%rsp) # 16-byte Spill
|
||||
vbroadcastsd %xmm0, %ymm14
|
||||
vbroadcastsd %xmm1, %ymm5
|
||||
vbroadcastsd %xmm2, %ymm10
|
||||
vxorpd %xmm0, %xmm0, %xmm0
|
||||
vxorpd %xmm15, %xmm15, %xmm15
|
||||
vxorpd %xmm13, %xmm13, %xmm13
|
||||
xorl %ebp, %ebp
|
||||
vmovapd %ymm8, %ymm9
|
||||
vmovupd 224(%rsp), %ymm8 # 32-byte Reload
|
||||
.p2align 4, 0x90
|
||||
# OSACA-BEGIN
|
||||
# LLVM-MCA-BEGIN
|
||||
.LBB0_9: #
|
||||
# Parent Loop BB0_6 Depth=1
|
||||
# => This Inner Loop Header: Depth=2
|
||||
vpbroadcastd .LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
|
||||
vpmulld (%r11,%rbp,4), %xmm1, %xmm11
|
||||
vpmovsxdq %xmm11, %ymm1
|
||||
vpsllq $3, %ymm1, %ymm1
|
||||
vpaddq %ymm1, %ymm3, %ymm1
|
||||
vmovq %xmm1, %r14
|
||||
vpextrq $1, %xmm1, %r9
|
||||
vextracti128 $1, %ymm1, %xmm1
|
||||
vmovsd (%r14), %xmm2 # xmm2 = mem[0],zero
|
||||
vpsubd .LCPI0_5, %xmm11, %xmm6
|
||||
vpmovsxdq %xmm6, %ymm6
|
||||
vpsllq $3, %ymm6, %ymm6
|
||||
vmovq %xmm1, %rdi
|
||||
vpaddq %ymm6, %ymm3, %ymm6
|
||||
vmovq %xmm6, %rcx
|
||||
vpextrq $1, %xmm1, %rbx
|
||||
vpextrq $1, %xmm6, %rax
|
||||
vextracti128 $1, %ymm6, %xmm1
|
||||
vmovsd (%rdi), %xmm6 # xmm6 = mem[0],zero
|
||||
vmovq %xmm1, %rdi
|
||||
vpextrq $1, %xmm1, %rsi
|
||||
vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
|
||||
vmovsd (%rcx), %xmm7 # xmm7 = mem[0],zero
|
||||
vpbroadcastd .LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
|
||||
vmovhpd (%r9), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0]
|
||||
vpaddd %xmm12, %xmm11, %xmm4
|
||||
vpmovsxdq %xmm4, %ymm4
|
||||
vmovhpd (%rax), %xmm7, %xmm7 # xmm7 = xmm7[0],mem[0]
|
||||
vpsllq $3, %ymm4, %ymm4
|
||||
vpaddq %ymm4, %ymm3, %ymm4
|
||||
vmovhpd (%rbx), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||
vpextrq $1, %xmm4, %rax
|
||||
vmovhpd (%rsi), %xmm1, %xmm1 # xmm1 = xmm1[0],mem[0]
|
||||
vmovq %xmm4, %rcx
|
||||
vextracti128 $1, %ymm4, %xmm4
|
||||
vmovq %xmm4, %rsi
|
||||
vinsertf128 $1, %xmm6, %ymm2, %ymm2
|
||||
vpextrq $1, %xmm4, %rdi
|
||||
vmovsd (%rsi), %xmm4 # xmm4 = mem[0],zero
|
||||
vsubpd %ymm2, %ymm14, %ymm2
|
||||
vmovhpd (%rdi), %xmm4, %xmm4 # xmm4 = xmm4[0],mem[0]
|
||||
vmovsd (%rcx), %xmm6 # xmm6 = mem[0],zero
|
||||
vinsertf128 $1, %xmm1, %ymm7, %ymm1
|
||||
vmovhpd (%rax), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||
vinsertf128 $1, %xmm4, %ymm6, %ymm4
|
||||
vsubpd %ymm1, %ymm5, %ymm1
|
||||
vsubpd %ymm4, %ymm10, %ymm4
|
||||
vmulpd %ymm2, %ymm2, %ymm6
|
||||
vfmadd231pd %ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
|
||||
vfmadd231pd %ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
|
||||
vbroadcastsd .LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
|
||||
vdivpd %ymm6, %ymm7, %ymm7
|
||||
vmulpd %ymm7, %ymm7, %ymm11
|
||||
vmulpd %ymm9, %ymm11, %ymm11
|
||||
vbroadcastsd .LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
|
||||
vmulpd %ymm7, %ymm11, %ymm11
|
||||
vaddpd %ymm12, %ymm11, %ymm12
|
||||
vmulpd 128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
|
||||
vmulpd %ymm7, %ymm11, %ymm7
|
||||
vmulpd %ymm7, %ymm12, %ymm7
|
||||
vcmpltpd %ymm8, %ymm6, %ymm6
|
||||
vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
|
||||
vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
|
||||
vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
|
||||
vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
|
||||
vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
|
||||
vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
|
||||
addq $4, %rbp
|
||||
cmpq %rdx, %rbp
|
||||
jb .LBB0_9
|
||||
# LLVM-MCA-END
|
||||
# OSACA-END
|
||||
# %bb.10: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
|
||||
vaddsd %xmm1, %xmm0, %xmm1
|
||||
vextractf128 $1, %ymm0, %xmm0
|
||||
vaddsd %xmm0, %xmm1, %xmm1
|
||||
vpermilpd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0]
|
||||
vaddsd %xmm0, %xmm1, %xmm10
|
||||
vpermilpd $1, %xmm15, %xmm1 # xmm1 = xmm15[1,0]
|
||||
vaddsd %xmm1, %xmm15, %xmm1
|
||||
vextractf128 $1, %ymm15, %xmm2
|
||||
vaddsd %xmm2, %xmm1, %xmm1
|
||||
vpermilpd $1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
|
||||
vaddsd %xmm2, %xmm1, %xmm11
|
||||
vpermilpd $1, %xmm13, %xmm1 # xmm1 = xmm13[1,0]
|
||||
vaddsd %xmm1, %xmm13, %xmm1
|
||||
vextractf128 $1, %ymm13, %xmm2
|
||||
vaddsd %xmm2, %xmm1, %xmm1
|
||||
vpermilpd $1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
|
||||
vaddsd %xmm2, %xmm1, %xmm5
|
||||
movq 56(%rsp), %r14 # 8-byte Reload
|
||||
vmovapd 80(%rsp), %xmm12 # 16-byte Reload
|
||||
vmovapd 192(%rsp), %xmm13 # 16-byte Reload
|
||||
movq 48(%rsp), %rsi # 8-byte Reload
|
||||
movq 40(%rsp), %rdi # 8-byte Reload
|
||||
vmovdqa 208(%rsp), %xmm14 # 16-byte Reload
|
||||
vmovapd 176(%rsp), %xmm15 # 16-byte Reload
|
||||
vmovapd %ymm9, %ymm8
|
||||
movq 72(%rsp), %rbx # 8-byte Reload
|
||||
movq 64(%rsp), %rbp # 8-byte Reload
|
||||
vmovapd 112(%rsp), %xmm0 # 16-byte Reload
|
||||
cmpq %r13, %rdx
|
||||
jae .LBB0_17
|
||||
jmp .LBB0_11
|
||||
.p2align 4, 0x90
|
||||
.LBB0_4: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
movslq %r13d, %rdx
|
||||
vxorpd %xmm5, %xmm5, %xmm5
|
||||
vxorpd %xmm11, %xmm11, %xmm11
|
||||
vxorpd %xmm10, %xmm10, %xmm10
|
||||
jmp .LBB0_5
|
||||
.p2align 4, 0x90
|
||||
.LBB0_16: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
vxorpd %xmm10, %xmm10, %xmm10
|
||||
vxorpd %xmm11, %xmm11, %xmm11
|
||||
vxorpd %xmm5, %xmm5, %xmm5
|
||||
cmpq %r13, %rdx
|
||||
jae .LBB0_17
|
||||
.LBB0_11: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
vmovapd 96(%rsp), %xmm4 # 16-byte Reload
|
||||
jmp .LBB0_13
|
||||
.p2align 4, 0x90
|
||||
.LBB0_12: #
|
||||
# in Loop: Header=BB0_13 Depth=2
|
||||
incq %rdx
|
||||
cmpq %rdx, %r13
|
||||
je .LBB0_17
|
||||
.LBB0_13: #
|
||||
# Parent Loop BB0_6 Depth=1
|
||||
# => This Inner Loop Header: Depth=2
|
||||
movl (%r11,%rdx,4), %eax
|
||||
leal (%rax,%rax,2), %ecx
|
||||
movslq %ecx, %rcx
|
||||
vsubsd (%r15,%rcx,8), %xmm0, %xmm6
|
||||
leal (%rax,%rax,2), %ecx
|
||||
incl %ecx
|
||||
movslq %ecx, %rcx
|
||||
vsubsd (%r15,%rcx,8), %xmm4, %xmm2
|
||||
leal 2(%rax,%rax,2), %eax
|
||||
cltq
|
||||
vmovapd (%rsp), %xmm1 # 16-byte Reload
|
||||
vsubsd (%r15,%rax,8), %xmm1, %xmm1
|
||||
vmulsd %xmm6, %xmm6, %xmm7
|
||||
vfmadd231sd %xmm2, %xmm2, %xmm7 # xmm7 = (xmm2 * xmm2) + xmm7
|
||||
vfmadd231sd %xmm1, %xmm1, %xmm7 # xmm7 = (xmm1 * xmm1) + xmm7
|
||||
vucomisd %xmm13, %xmm7
|
||||
jae .LBB0_12
|
||||
# %bb.14: #
|
||||
# in Loop: Header=BB0_13 Depth=2
|
||||
vmovsd .LCPI0_3(%rip), %xmm0 # xmm0 = mem[0],zero
|
||||
vdivsd %xmm7, %xmm0, %xmm7
|
||||
vmulsd %xmm7, %xmm7, %xmm0
|
||||
vmulsd %xmm0, %xmm12, %xmm0
|
||||
vmulsd %xmm7, %xmm0, %xmm0
|
||||
vaddsd .LCPI0_4(%rip), %xmm0, %xmm4
|
||||
vmulsd %xmm7, %xmm15, %xmm7
|
||||
vmulsd %xmm0, %xmm7, %xmm0
|
||||
vmulsd %xmm4, %xmm0, %xmm0
|
||||
vmovapd 96(%rsp), %xmm4 # 16-byte Reload
|
||||
vfmadd231sd %xmm6, %xmm0, %xmm10 # xmm10 = (xmm0 * xmm6) + xmm10
|
||||
vfmadd231sd %xmm2, %xmm0, %xmm11 # xmm11 = (xmm0 * xmm2) + xmm11
|
||||
vfmadd231sd %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5
|
||||
vmovapd 112(%rsp), %xmm0 # 16-byte Reload
|
||||
jmp .LBB0_12
|
||||
.LBB0_18: #
|
||||
movq 24(%rsp), %rax # 8-byte Reload
|
||||
vmovdqu %xmm14, (%rax)
|
||||
.LBB0_19: #
|
||||
movl $.L.str, %edi
|
||||
vzeroupper
|
||||
callq likwid_markerStopRegion
|
||||
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
vsubsd 32(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload
|
||||
addq $264, %rsp # imm = 0x108
|
||||
.cfi_def_cfa_offset 56
|
||||
popq %rbx
|
||||
.cfi_def_cfa_offset 48
|
||||
popq %r12
|
||||
.cfi_def_cfa_offset 40
|
||||
popq %r13
|
||||
.cfi_def_cfa_offset 32
|
||||
popq %r14
|
||||
.cfi_def_cfa_offset 24
|
||||
popq %r15
|
||||
.cfi_def_cfa_offset 16
|
||||
popq %rbp
|
||||
.cfi_def_cfa_offset 8
|
||||
retq
|
||||
.Lfunc_end0:
|
||||
.size computeForceLJFullNeigh_plain_c, .Lfunc_end0-computeForceLJFullNeigh_plain_c
|
||||
.cfi_endproc
|
||||
# -- End function
|
||||
.section .rodata.cst8,"aM",@progbits,8
|
||||
.p2align 3 # -- Begin function computeForceLJHalfNeigh
|
||||
.LCPI1_0:
|
||||
.quad 4631952216750555136 # 48
|
||||
.LCPI1_1:
|
||||
.quad 4607182418800017408 # 1
|
||||
.LCPI1_2:
|
||||
.quad -4620693217682128896 # -0.5
|
||||
.text
|
||||
.globl computeForceLJHalfNeigh
|
||||
.p2align 4, 0x90
|
||||
.type computeForceLJHalfNeigh,@function
|
||||
computeForceLJHalfNeigh: #
|
||||
.LcomputeForceLJHalfNeigh$local:
|
||||
.cfi_startproc
|
||||
# %bb.0: #
|
||||
pushq %rbp
|
||||
.cfi_def_cfa_offset 16
|
||||
pushq %r15
|
||||
.cfi_def_cfa_offset 24
|
||||
pushq %r14
|
||||
.cfi_def_cfa_offset 32
|
||||
pushq %r13
|
||||
.cfi_def_cfa_offset 40
|
||||
pushq %r12
|
||||
.cfi_def_cfa_offset 48
|
||||
pushq %rbx
|
||||
.cfi_def_cfa_offset 56
|
||||
subq $40, %rsp
|
||||
.cfi_def_cfa_offset 96
|
||||
.cfi_offset %rbx, -56
|
||||
.cfi_offset %r12, -48
|
||||
.cfi_offset %r13, -40
|
||||
.cfi_offset %r14, -32
|
||||
.cfi_offset %r15, -24
|
||||
.cfi_offset %rbp, -16
|
||||
movq %rcx, 16(%rsp) # 8-byte Spill
|
||||
movq %rdx, %r15
|
||||
movq %rsi, %r12
|
||||
movl 4(%rsi), %r13d
|
||||
vmovsd 144(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, 8(%rsp) # 8-byte Spill
|
||||
vmovsd 40(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, (%rsp) # 8-byte Spill
|
||||
vmovsd 56(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, 32(%rsp) # 8-byte Spill
|
||||
testl %r13d, %r13d
|
||||
jle .LBB1_2
|
||||
# %bb.1: #
|
||||
movq 64(%r12), %rdi
|
||||
leaq (,%r13,8), %rax
|
||||
leaq (%rax,%rax,2), %rdx
|
||||
xorl %esi, %esi
|
||||
callq _intel_fast_memset
|
||||
.LBB1_2: #
|
||||
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
vmovsd %xmm0, 24(%rsp) # 8-byte Spill
|
||||
movl $.L.str.1, %edi
|
||||
callq likwid_markerStartRegion
|
||||
testl %r13d, %r13d
|
||||
jle .LBB1_8
|
||||
# %bb.3: #
|
||||
vmovsd 8(%rsp), %xmm0 # 8-byte Reload
|
||||
# xmm0 = mem[0],zero
|
||||
vmulsd %xmm0, %xmm0, %xmm12
|
||||
movq 16(%r15), %rax
|
||||
movq 24(%r15), %rcx
|
||||
movq %rcx, 8(%rsp) # 8-byte Spill
|
||||
movslq 8(%r15), %rdx
|
||||
movq 16(%r12), %rsi
|
||||
movq 64(%r12), %rdi
|
||||
vmovsd (%rsp), %xmm0 # 8-byte Reload
|
||||
# xmm0 = mem[0],zero
|
||||
vmulsd .LCPI1_0(%rip), %xmm0, %xmm11
|
||||
movq 16(%rsp), %rcx # 8-byte Reload
|
||||
vmovdqu (%rcx), %xmm10
|
||||
shlq $2, %rdx
|
||||
movq %rdx, (%rsp) # 8-byte Spill
|
||||
xorl %r12d, %r12d
|
||||
jmp .LBB1_4
|
||||
.p2align 4, 0x90
|
||||
.LBB1_5: #
|
||||
# in Loop: Header=BB1_4 Depth=1
|
||||
vxorpd %xmm13, %xmm13, %xmm13
|
||||
movq %r9, %rdx
|
||||
vxorpd %xmm9, %xmm9, %xmm9
|
||||
vxorpd %xmm14, %xmm14, %xmm14
|
||||
.LBB1_6: #
|
||||
# in Loop: Header=BB1_4 Depth=1
|
||||
vaddsd (%rdi,%r15,8), %xmm14, %xmm0
|
||||
vmovsd %xmm0, (%rdi,%r15,8)
|
||||
vaddsd (%rdi,%r10,8), %xmm9, %xmm0
|
||||
vmovsd %xmm0, (%rdi,%r10,8)
|
||||
vaddsd (%rdi,%r11,8), %xmm13, %xmm0
|
||||
vmovsd %xmm0, (%rdi,%r11,8)
|
||||
leal 3(%r9), %ecx
|
||||
addl $6, %r9d
|
||||
testl %ecx, %ecx
|
||||
cmovnsl %ecx, %r9d
|
||||
sarl $2, %r9d
|
||||
movslq %r9d, %rcx
|
||||
vmovq %rcx, %xmm0
|
||||
vmovq %rdx, %xmm1
|
||||
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
|
||||
vpaddq %xmm0, %xmm10, %xmm10
|
||||
incq %r12
|
||||
addq (%rsp), %rax # 8-byte Folded Reload
|
||||
cmpq %r13, %r12
|
||||
je .LBB1_7
|
||||
.LBB1_4: #
|
||||
# =>This Loop Header: Depth=1
|
||||
# Child Loop BB1_10 Depth 2
|
||||
movq 8(%rsp), %rcx # 8-byte Reload
|
||||
movslq (%rcx,%r12,4), %r9
|
||||
leaq (%r12,%r12,2), %rcx
|
||||
leal 1(%rcx), %r10d
|
||||
leal 2(%rcx), %r11d
|
||||
movl %ecx, %r15d
|
||||
testq %r9, %r9
|
||||
jle .LBB1_5
|
||||
# %bb.9: #
|
||||
# in Loop: Header=BB1_4 Depth=1
|
||||
vmovsd (%rsi,%r15,8), %xmm15 # xmm15 = mem[0],zero
|
||||
vmovsd (%rsi,%r10,8), %xmm4 # xmm4 = mem[0],zero
|
||||
vmovsd (%rsi,%r11,8), %xmm1 # xmm1 = mem[0],zero
|
||||
movl %r9d, %edx
|
||||
vxorpd %xmm14, %xmm14, %xmm14
|
||||
xorl %ecx, %ecx
|
||||
vxorpd %xmm9, %xmm9, %xmm9
|
||||
vxorpd %xmm13, %xmm13, %xmm13
|
||||
jmp .LBB1_10
|
||||
.p2align 4, 0x90
|
||||
.LBB1_13: #
|
||||
# in Loop: Header=BB1_10 Depth=2
|
||||
incq %rcx
|
||||
cmpq %rcx, %rdx
|
||||
je .LBB1_6
|
||||
.LBB1_10: #
|
||||
# Parent Loop BB1_4 Depth=1
|
||||
# => This Inner Loop Header: Depth=2
|
||||
movslq (%rax,%rcx,4), %r8
|
||||
leaq (%r8,%r8,2), %r14
|
||||
vsubsd (%rsi,%r14,8), %xmm15, %xmm2
|
||||
movslq %r14d, %rbp
|
||||
vsubsd 8(%rsi,%rbp,8), %xmm4, %xmm5
|
||||
vsubsd 16(%rsi,%rbp,8), %xmm1, %xmm0
|
||||
vmulsd %xmm2, %xmm2, %xmm6
|
||||
vfmadd231sd %xmm5, %xmm5, %xmm6 # xmm6 = (xmm5 * xmm5) + xmm6
|
||||
vfmadd231sd %xmm0, %xmm0, %xmm6 # xmm6 = (xmm0 * xmm0) + xmm6
|
||||
vucomisd %xmm12, %xmm6
|
||||
jae .LBB1_13
|
||||
# %bb.11: #
|
||||
# in Loop: Header=BB1_10 Depth=2
|
||||
vmovsd .LCPI1_1(%rip), %xmm3 # xmm3 = mem[0],zero
|
||||
vdivsd %xmm6, %xmm3, %xmm6
|
||||
vmulsd 32(%rsp), %xmm6, %xmm3 # 8-byte Folded Reload
|
||||
vmulsd %xmm6, %xmm6, %xmm8
|
||||
vmulsd %xmm3, %xmm8, %xmm3
|
||||
vaddsd .LCPI1_2(%rip), %xmm3, %xmm7
|
||||
vmulsd %xmm6, %xmm11, %xmm6
|
||||
vmulsd %xmm3, %xmm6, %xmm3
|
||||
vmulsd %xmm7, %xmm3, %xmm3
|
||||
vmulsd %xmm2, %xmm3, %xmm6
|
||||
vaddsd %xmm6, %xmm14, %xmm14
|
||||
vmulsd %xmm5, %xmm3, %xmm2
|
||||
vaddsd %xmm2, %xmm9, %xmm9
|
||||
vmulsd %xmm0, %xmm3, %xmm0
|
||||
vaddsd %xmm0, %xmm13, %xmm13
|
||||
cmpl %r13d, %r8d
|
||||
jge .LBB1_13
|
||||
# %bb.12: #
|
||||
# in Loop: Header=BB1_10 Depth=2
|
||||
leaq 1(%rbp), %rbx
|
||||
addq $2, %rbp
|
||||
vmovsd (%rdi,%r14,8), %xmm3 # xmm3 = mem[0],zero
|
||||
vsubsd %xmm6, %xmm3, %xmm3
|
||||
vmovsd %xmm3, (%rdi,%r14,8)
|
||||
vmovsd (%rdi,%rbx,8), %xmm3 # xmm3 = mem[0],zero
|
||||
vsubsd %xmm2, %xmm3, %xmm2
|
||||
vmovsd %xmm2, (%rdi,%rbx,8)
|
||||
vmovsd (%rdi,%rbp,8), %xmm2 # xmm2 = mem[0],zero
|
||||
vsubsd %xmm0, %xmm2, %xmm0
|
||||
vmovsd %xmm0, (%rdi,%rbp,8)
|
||||
jmp .LBB1_13
|
||||
.LBB1_7: #
|
||||
movq 16(%rsp), %rax # 8-byte Reload
|
||||
vmovdqu %xmm10, (%rax)
|
||||
.LBB1_8: #
|
||||
movl $.L.str.1, %edi
|
||||
callq likwid_markerStopRegion
|
||||
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
vsubsd 24(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload
|
||||
addq $40, %rsp
|
||||
.cfi_def_cfa_offset 56
|
||||
popq %rbx
|
||||
.cfi_def_cfa_offset 48
|
||||
popq %r12
|
||||
.cfi_def_cfa_offset 40
|
||||
popq %r13
|
||||
.cfi_def_cfa_offset 32
|
||||
popq %r14
|
||||
.cfi_def_cfa_offset 24
|
||||
popq %r15
|
||||
.cfi_def_cfa_offset 16
|
||||
popq %rbp
|
||||
.cfi_def_cfa_offset 8
|
||||
retq
|
||||
.Lfunc_end1:
|
||||
.size computeForceLJHalfNeigh, .Lfunc_end1-computeForceLJHalfNeigh
|
||||
.cfi_endproc
|
||||
# -- End function
|
||||
.globl computeForceLJFullNeigh_simd # -- Begin function computeForceLJFullNeigh_simd
|
||||
.p2align 4, 0x90
|
||||
.type computeForceLJFullNeigh_simd,@function
|
||||
computeForceLJFullNeigh_simd: #
|
||||
.LcomputeForceLJFullNeigh_simd$local:
|
||||
.cfi_startproc
|
||||
# %bb.0: #
|
||||
pushq %rax
|
||||
.cfi_def_cfa_offset 16
|
||||
movl 4(%rsi), %eax
|
||||
testl %eax, %eax
|
||||
jle .LBB2_2
|
||||
# %bb.1: #
|
||||
movq 64(%rsi), %rdi
|
||||
shlq $3, %rax
|
||||
leaq (%rax,%rax,2), %rdx
|
||||
xorl %esi, %esi
|
||||
callq _intel_fast_memset
|
||||
.LBB2_2: #
|
||||
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
movl $.L.str, %edi
|
||||
callq likwid_markerStartRegion
|
||||
movq stderr(%rip), %rcx
|
||||
movl $.L.str.2, %edi
|
||||
movl $65, %esi
|
||||
movl $1, %edx
|
||||
callq fwrite
|
||||
movl $-1, %edi
|
||||
callq exit
|
||||
.Lfunc_end2:
|
||||
.size computeForceLJFullNeigh_simd, .Lfunc_end2-computeForceLJFullNeigh_simd
|
||||
.cfi_endproc
|
||||
# -- End function
|
||||
.type .L.str,@object #
|
||||
.section .rodata.str1.1,"aMS",@progbits,1
|
||||
.L.str:
|
||||
.asciz "force"
|
||||
.size .L.str, 6
|
||||
|
||||
.type .L.str.1,@object #
|
||||
.L.str.1:
|
||||
.asciz "forceLJ-halfneigh"
|
||||
.size .L.str.1, 18
|
||||
|
||||
.type .L.str.2,@object #
|
||||
.L.str.2:
|
||||
.asciz "Error: SIMD kernel not implemented for specified instruction set!"
|
||||
.size .L.str.2, 66
|
||||
|
||||
.ident "Intel(R) oneAPI DPC++ Compiler 2021.1-beta05 (2020.2.0.0304)"
|
||||
.section ".note.GNU-stack","",@progbits
|
||||
@@ -1,46 +1,112 @@
|
||||
#!/bin/bash
|
||||
|
||||
TAG=ICX
|
||||
OPT_SCHEME=gromacs
|
||||
MDBENCH_BIN=./MDBench-$TAG-$OPT_SCHEME
|
||||
FREQ=2.4
|
||||
NRUNS=3
|
||||
FIXED_PARAMS=--freq $FREQ
|
||||
[[ -z "$1" ]] && echo "Use: $0 <binary> [-c <core>] [-f <freq>] [-n <nruns>] [-l <log>] [-s]" && exit
|
||||
[[ ! -f "$1" ]] && echo "Binary file not found, make sure to use 'make'" && exit
|
||||
[[ ! -f "$1-stub" ]] && echo "Binary file for stubbed case not found, make sure to use 'make VARIANT=stub'" && exit
|
||||
|
||||
if [ "$OPT_SCHEME" = "gromacs" ]; then
|
||||
STUB1_NAME=Stub-33
|
||||
STUB1_PARAMS=-na 4 -nn 33
|
||||
STUB2_NAME=Stub-128
|
||||
STUB2_PARAMS=-na 4 -nn 128
|
||||
MDBENCH_BIN=$1
|
||||
BIN_INFO="${MDBENCH_BIN#*-}" # $OPT_SCHEME-$TAG-$ISA-$PREC
|
||||
OPT_SCHEME="${BIN_INFO%%-*}"
|
||||
PREC="${BIN_INFO##*-}"
|
||||
BIN_INFO="${BIN_INFO#*-}" # $TAG-$ISA-$PREC
|
||||
BIN_INFO="${BIN_INFO%-*}" # $TAG-$ISA
|
||||
TAG="${BIN_INFO%%-*}"
|
||||
ISA="${BIN_INFO##*-}"
|
||||
CORE="${CORE:-0}"
|
||||
FREQ="${FREQ:-2.4}"
|
||||
NRUNS="${NRUNS:-3}"
|
||||
LOG="${LOG:-latencies_and_cfds.log}"
|
||||
STUB_ONLY="${STUB_ONLY:-false}"
|
||||
SKIP_SET_FREQ="${SKIP_SET_FREQ:-false}"
|
||||
|
||||
OPTIND=2
|
||||
while getopts "c:f:n:l:s" flag; do
|
||||
case "${flag}" in
|
||||
c) CORE=${OPTARG};;
|
||||
f) FREQ=${OPTARG};;
|
||||
n) NRUNS=${OPTARG};;
|
||||
l) LOG=${OPTARG};;
|
||||
s) STUB_ONLY=true;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Other useful variables
|
||||
MDBENCH_BIN=./MDBench-$OPT_SCHEME-$TAG-$ISA-$PREC
|
||||
FIXED_PARAMS="--freq $FREQ"
|
||||
CPU_VENDOR=$(lscpu | grep "Vendor ID" | tr -s ' ' | cut -d ' ' -f3)
|
||||
|
||||
if [ "$CPU_VENDOR" == "GenuineIntel" ]; then
|
||||
ALL_PREFETCHERS="HW_PREFETCHER,CL_PREFETCHER,DCU_PREFETCHER,IP_PREFETCHER"
|
||||
PREFETCHERS=("ALL HW_PREFETCHER CL_PREFETCHER DCU_PREFETCHER IP_PREFETCHER NONE")
|
||||
else
|
||||
STUB1_NAME=Stub-76
|
||||
STUB1_PARAMS=-nn 76
|
||||
STUB2_NAME=Stub-1024
|
||||
STUB2_PARAMS=-nn 1024
|
||||
ALL_PREFETCHERS=""
|
||||
PREFETCHERS=("IGNORE")
|
||||
fi
|
||||
|
||||
if [ "$OPT_SCHEME" == "gromacs" ]; then
|
||||
STUB1_NAME=stub-33
|
||||
STUB1_PARAMS="-na 4 -nn 33"
|
||||
STUB2_NAME=stub-128
|
||||
STUB2_PARAMS="-na 4 -nn 128"
|
||||
else
|
||||
STUB1_NAME=stub-76
|
||||
STUB1_PARAMS="-nn 76"
|
||||
STUB2_NAME=stub-1024
|
||||
STUB2_PARAMS="-nn 1024"
|
||||
fi
|
||||
|
||||
# run_benchmark <command> [args...]
# Runs the given command NRUNS times under likwid-pin and leaves the smallest
# "Cycles/SIMD iteration" value seen in the global variable BEST (BEST is not
# declared local, so callers read it after the call).
function run_benchmark() {
|
||||
# Starting value for the minimum search; any measured value below it replaces it.
BEST=10000000
|
||||
for i in $(seq $NRUNS); do
|
||||
# NOTE(review): this text looks like a rendered diff without +/- markers. The
# uncaptured pipeline below (hard-coded "-c 0") is likely the removed version of
# the RES= line that follows -- confirm against the actual script.
likwid-pin -c 0 "$* $FIXED_PARAMS" 2>&1 | grep "Cycles/SIMD iteration" | cut -d ' ' -f3
|
||||
# All arguments plus $FIXED_PARAMS are joined into a single quoted word for likwid-pin.
# stderr is merged into stdout; the third space-separated field of the line
# containing "Cycles/SIMD iteration" is captured into RES.
RES=$(likwid-pin -c $CORE "$* $FIXED_PARAMS" 2>&1 | grep "Cycles/SIMD iteration" | cut -d ' ' -f3)
|
||||
# The comparison is delegated to bc; (( )) then tests bc's printed result.
if (( $(echo "$BEST > $RES" | bc -l ) )); then
|
||||
BEST=$RES
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
# Print the run configuration.
# NOTE(review): the five plain echo lines and the "| tee -a $LOG" lines that follow
# overlap in content; in a rendered diff these are likely the old and new versions
# of the same statements -- confirm which set is current.
echo "Tag: $TAG"
|
||||
echo "Optimization scheme: $OPT_SCHEME"
|
||||
echo "Binary: $MDBENCH_BIN(-stub)"
|
||||
echo "Frequency: $FREQ"
|
||||
echo "Number of runs: $NRUNS"
|
||||
# Same information, printed and also appended to the file named by $LOG.
echo "Tag: $TAG" | tee -a $LOG
|
||||
echo "Optimization scheme: $OPT_SCHEME" | tee -a $LOG
|
||||
echo "Instruction set: $ISA" | tee -a $LOG
|
||||
echo "Precision: $PREC" | tee -a $LOG
|
||||
echo "Binary: $MDBENCH_BIN(-stub)" | tee -a $LOG
|
||||
echo "Frequency: $FREQ" | tee -a $LOG
|
||||
echo "Number of runs: $NRUNS" | tee -a $LOG
|
||||
echo "Run only stubbed cases: $STUB_ONLY" | tee -a $LOG
|
||||
|
||||
# NOTE(review): the frequency is set unconditionally here and again inside the
# SKIP_SET_FREQ guard below; the unconditional pair is likely the removed diff
# side -- confirm.
echo "Fixing frequencies..."
|
||||
likwid-setFrequencies -f $FREQ -t 0
|
||||
# Frequency setup runs only while SKIP_SET_FREQ still holds the string "false".
if [ "$SKIP_SET_FREQ" == "false" ]; then
|
||||
echo "Fixing frequencies..."
|
||||
likwid-setFrequencies -f $FREQ -t 0
|
||||
fi
|
||||
|
||||
# One run_benchmark call per case, each preceded by a label. These calls do not
# read BEST afterwards.
# NOTE(review): the loop that follows runs the same five cases and records BEST;
# these standalone calls may be the removed diff side -- confirm.
echo "Standard"
|
||||
run_benchmark $MDBENCH_BIN
|
||||
echo "Melt"
|
||||
run_benchmark $MDBENCH_BIN -i data/copper_melting/input_lj_cu_one_atomtype_20x20x20.dmp
|
||||
echo "Argon"
|
||||
run_benchmark $MDBENCH_BIN -p data/argon_1000/mdbench_params.conf -i data/argon_1000/tprout.gro
|
||||
echo "$STUB1_NAME"
|
||||
run_benchmark $MDBENCH_BIN-stub $STUB1_PARAMS
|
||||
echo "$STUB2_NAME"
|
||||
run_benchmark $MDBENCH_BIN-stub $STUB2_PARAMS
|
||||
# Sweep over prefetcher configurations, benchmarking each one and logging one
# summary line per configuration.
# PREFETCHERS is assigned above as a one-element array holding a space-separated
# string; the unquoted $PREFETCHERS expands to that element and is word-split here.
for p in $PREFETCHERS; do
|
||||
# "IGNORE" (the value used on the non-GenuineIntel path) skips all likwid-features calls.
if [ "$p" != "IGNORE" ]; then
|
||||
if [ "$p" == "ALL" ]; then
|
||||
# -e with the full comma-separated list (presumably "enable", per likwid-features usage).
likwid-features -c $CORE -e $ALL_PREFETCHERS
|
||||
elif [ "$p" == "NONE" ]; then
|
||||
# -d with the full list (presumably "disable", per likwid-features usage).
likwid-features -c $CORE -d $ALL_PREFETCHERS
|
||||
else
|
||||
# Single prefetcher: apply -d to the whole list first, then -e to just $p.
likwid-features -c $CORE -d $ALL_PREFETCHERS
|
||||
likwid-features -c $CORE -e $p
|
||||
fi
|
||||
|
||||
echo "Prefetcher settings: $p"
|
||||
# -l output is printed to the terminal only; it is not appended to $LOG.
likwid-features -c $CORE -l
|
||||
fi
|
||||
|
||||
# Build the summary line "<setting>: name=value, ..." from BEST after each run.
MSG="$p: "
|
||||
# The three full-input cases are skipped when STUB_ONLY is anything other than "false".
if [ "$STUB_ONLY" == "false" ]; then
|
||||
run_benchmark $MDBENCH_BIN
|
||||
MSG+="standard=$BEST, "
|
||||
run_benchmark $MDBENCH_BIN -i data/copper_melting/input_lj_cu_one_atomtype_20x20x20.dmp
|
||||
MSG+="melt=$BEST, "
|
||||
run_benchmark $MDBENCH_BIN -p data/argon_1000/mdbench_params.conf -i data/argon_1000/tprout.gro
|
||||
MSG+="argon=$BEST, "
|
||||
fi
|
||||
|
||||
# The two stub cases always run, using the "-stub" binary and the STUB*_PARAMS set above.
run_benchmark $MDBENCH_BIN-stub $STUB1_PARAMS
|
||||
MSG+="$STUB1_NAME=$BEST, "
|
||||
run_benchmark $MDBENCH_BIN-stub $STUB2_PARAMS
|
||||
MSG+="$STUB2_NAME=$BEST"
|
||||
# Print the summary line and append it to the file named by $LOG.
echo $MSG | tee -a $LOG
|
||||
done
|
||||
|
||||
Reference in New Issue
Block a user