Compare commits
45 Commits
superclust
...
gromacs_ma
Author | SHA1 | Date | |
---|---|---|---|
|
59145644e3 | ||
|
4a460b2c88 | ||
|
b15aa2f461 | ||
|
5c000444a4 | ||
|
04ade6bcec | ||
|
85f1484449 | ||
|
965fda3879 | ||
|
a86d214c73 | ||
|
d138f975f6 | ||
|
296a4c4e01 | ||
|
f5fd3e265a | ||
|
1fbf9dbdac | ||
|
89e1b9a9b6 | ||
|
4e99f7a623 | ||
|
4607202752 | ||
|
301274c9b6 | ||
|
95d63334fa | ||
|
d0277765c3 | ||
|
5814a86125 | ||
|
98583cdade | ||
|
cb5598bc91 | ||
|
3b076cdb49 | ||
|
122a23e2b8 | ||
|
32e004944f | ||
|
6126d74aa9 | ||
|
016f07dcaa | ||
|
90f30d26a3 | ||
|
01cc05a5d6 | ||
|
c61cf9a0ac | ||
|
d545ca65d4 | ||
|
5833f00894 | ||
|
8aad7e87a0 | ||
|
ffad9d40f3 | ||
|
99da76d59c | ||
|
cfe888c132 | ||
|
c7b136f629 | ||
|
07f2f74561 | ||
|
fd368609e8 | ||
|
db5f8cf1c6 | ||
|
f467d10ed3 | ||
|
fe86c948a8 | ||
|
ae1cfa2800 | ||
|
e5c233e072 | ||
|
8d5e10f635 | ||
|
56ff0d19af |
23
.gitignore
vendored
23
.gitignore
vendored
@@ -51,14 +51,17 @@ Module.symvers
|
||||
Mkfile.old
|
||||
dkms.conf
|
||||
|
||||
# Logs
|
||||
*.log
|
||||
|
||||
# TODO list
|
||||
todo.txt
|
||||
|
||||
# Build directories and executables
|
||||
GCC/
|
||||
ICC/
|
||||
ICX/
|
||||
CLANG/
|
||||
NVCC/
|
||||
MDBench-GCC*
|
||||
MDBench-ICC*
|
||||
MDBench-ICX*
|
||||
MDBench-CLANG*
|
||||
MDBench-NVCC*
|
||||
#GCC-*/
|
||||
#ICC-*/
|
||||
#ICX-*/
|
||||
#CLANG-*/
|
||||
#NVCC-*/
|
||||
build-*/
|
||||
MDBench-*
|
||||
|
12
Makefile
12
Makefile
@@ -1,6 +1,7 @@
|
||||
#CONFIGURE BUILD SYSTEM
|
||||
TARGET = MDBench-$(TAG)-$(OPT_SCHEME)
|
||||
BUILD_DIR = ./$(TAG)-$(OPT_SCHEME)
|
||||
IDENTIFIER = $(OPT_SCHEME)-$(TAG)-$(ISA)-$(DATA_TYPE)
|
||||
TARGET = MDBench-$(IDENTIFIER)
|
||||
BUILD_DIR = ./build-$(IDENTIFIER)
|
||||
SRC_DIR = ./$(OPT_SCHEME)
|
||||
ASM_DIR = ./asm
|
||||
COMMON_DIR = ./common
|
||||
@@ -151,6 +152,13 @@ $(BUILD_DIR)/%.o: %.s
|
||||
clean:
|
||||
$(info ===> CLEAN)
|
||||
@rm -rf $(BUILD_DIR)
|
||||
@rm -rf MDBench-$(IDENTIFIER)
|
||||
@rm -f tags
|
||||
|
||||
cleanall:
|
||||
$(info ===> CLEAN)
|
||||
@rm -rf build-*
|
||||
@rm -rf MDBench-*
|
||||
@rm -f tags
|
||||
|
||||
distclean: clean
|
||||
|
1541
asm/unused/force_lj_lammps_avx512_dp_no_newton_raphson.s
Normal file
1541
asm/unused/force_lj_lammps_avx512_dp_no_newton_raphson.s
Normal file
File diff suppressed because it is too large
Load Diff
1421
asm/unused/force_lj_lammps_avx512_sp_no_newton_raphson.s
Normal file
1421
asm/unused/force_lj_lammps_avx512_sp_no_newton_raphson.s
Normal file
File diff suppressed because it is too large
Load Diff
@@ -8,9 +8,11 @@
|
||||
#define __PARAMETER_H_
|
||||
|
||||
#if PRECISION == 1
|
||||
#define MD_FLOAT float
|
||||
# define MD_FLOAT float
|
||||
# define MD_UINT unsigned int
|
||||
#else
|
||||
#define MD_FLOAT double
|
||||
# define MD_FLOAT double
|
||||
# define MD_UINT unsigned long long int
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
|
@@ -9,10 +9,13 @@
|
||||
# include <zmmintrin.h>
|
||||
#endif
|
||||
|
||||
#define MD_SIMD_FLOAT __m512d
|
||||
#define MD_SIMD_MASK __mmask8
|
||||
#define MD_SIMD_INT __m256i
|
||||
#define MD_SIMD_FLOAT __m512d
|
||||
#define MD_SIMD_MASK __mmask8
|
||||
#define MD_SIMD_INT __m256i
|
||||
#define MD_SIMD_BITMASK MD_SIMD_INT
|
||||
#define MD_SIMD_IBOOL __mmask16
|
||||
|
||||
/* Converts an integer-boolean mask (MD_SIMD_IBOOL, a __mmask16 here) into the
 * floating-point lane mask type (MD_SIMD_MASK, a __mmask8 here) through a
 * narrowing cast. */
static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) {
    const __mmask8 narrowed = (__mmask8)(a);
    return narrowed;
}
|
||||
/* Returns a 512-bit vector whose double-precision lanes all hold `scalar`. */
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) {
    const MD_SIMD_FLOAT lanes = _mm512_set1_pd(scalar);
    return lanes;
}
|
||||
/* Returns a 512-bit vector whose double-precision lanes are all 0.0.
 * Declared with (void) so the definition carries a real prototype and a call
 * with stray arguments is diagnosed by the compiler. */
static inline MD_SIMD_FLOAT simd_zero(void) { return _mm512_set1_pd(0.0); }
|
||||
/* Lane-wise sum of two double-precision vectors. */
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) {
    const MD_SIMD_FLOAT sum = _mm512_add_pd(a, b);
    return sum;
}
|
||||
|
@@ -7,11 +7,30 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <immintrin.h>
|
||||
#include <zmmintrin.h>
|
||||
#ifndef NO_ZMM_INTRIN
|
||||
# include <zmmintrin.h>
|
||||
#endif
|
||||
|
||||
#define MD_SIMD_FLOAT __m512
|
||||
#define MD_SIMD_MASK __mmask16
|
||||
#define MD_SIMD_INT __m256i
|
||||
#define MD_SIMD_IBOOL __mmask16
|
||||
#define MD_SIMD_INT32 __m512i
|
||||
#define MD_SIMD_BITMASK MD_SIMD_INT32
|
||||
|
||||
/* Loads 512 bits of integer data starting at `m` into a bitmask register.
 * NOTE(review): _mm512_load_si512 is the aligned load, so `m` is presumably
 * expected to be 64-byte aligned -- confirm against the callers. */
static inline MD_SIMD_BITMASK simd_load_bitmask(const int *m) {
    const MD_SIMD_BITMASK loaded = _mm512_load_si512(m);
    return loaded;
}
|
||||
|
||||
/* Returns a 512-bit integer vector whose 32-bit lanes all hold `a`. */
static inline MD_SIMD_INT32 simd_int32_broadcast(int a) {
    const MD_SIMD_INT32 lanes = _mm512_set1_epi32(a);
    return lanes;
}
|
||||
|
||||
/* Reinterprets `a` as 32-bit integer lanes and tests each lane against itself:
 * the returned mask has a bit set for every lane that has at least one bit set. */
static inline MD_SIMD_IBOOL simd_test_bits(MD_SIMD_FLOAT a) {
    const __m512i raw_bits = _mm512_castps_si512(a);
    return _mm512_test_epi32_mask(raw_bits, raw_bits);
}
|
||||
|
||||
/* Converts an integer-boolean mask into the floating-point lane mask type.
 * Both types are __mmask16 in this header, so the value passes through as-is. */
static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) {
    const MD_SIMD_MASK same_bits = a;
    return same_bits;
}
|
||||
/* Returns a 512-bit vector whose single-precision lanes all hold `scalar`. */
static inline MD_SIMD_FLOAT simd_broadcast(float scalar) {
    const MD_SIMD_FLOAT lanes = _mm512_set1_ps(scalar);
    return lanes;
}
|
||||
/* Returns a 512-bit vector whose single-precision lanes are all 0.0f.
 * Declared with (void) so the definition carries a real prototype and a call
 * with stray arguments is diagnosed by the compiler. */
static inline MD_SIMD_FLOAT simd_zero(void) { return _mm512_set1_ps(0.0f); }
|
||||
/* Lane-wise sum of two single-precision vectors. */
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) {
    const MD_SIMD_FLOAT sum = _mm512_add_ps(a, b);
    return sum;
}
|
||||
@@ -69,7 +88,7 @@ static inline MD_FLOAT simd_h_dual_incr_reduced_sum(float* m, MD_SIMD_FLOAT v0,
|
||||
return _mm_cvtss_f32(t3);
|
||||
}
|
||||
|
||||
inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
|
||||
static inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
|
||||
__m256 t;
|
||||
a = _mm512_add_ps(a, _mm512_shuffle_f32x4(a, a, 0xee));
|
||||
t = _mm256_load_ps(m);
|
||||
|
@@ -131,19 +131,19 @@ void readParameter(Parameter *param, const char *filename) {
|
||||
void printParameter(Parameter *param) {
|
||||
printf("Parameters:\n");
|
||||
if(param->input_file != NULL) {
|
||||
printf("Input file: %s\n", param->input_file);
|
||||
printf("\tInput file: %s\n", param->input_file);
|
||||
}
|
||||
|
||||
if(param->vtk_file != NULL) {
|
||||
printf("VTK file: %s\n", param->vtk_file);
|
||||
printf("\tVTK file: %s\n", param->vtk_file);
|
||||
}
|
||||
|
||||
if(param->xtc_file != NULL) {
|
||||
printf("XTC file: %s\n", param->xtc_file);
|
||||
printf("\tXTC file: %s\n", param->xtc_file);
|
||||
}
|
||||
|
||||
if(param->eam_file != NULL) {
|
||||
printf("EAM file: %s\n", param->eam_file);
|
||||
printf("\tEAM file: %s\n", param->eam_file);
|
||||
}
|
||||
|
||||
printf("\tForce field: %s\n", ff2str(param->force_field));
|
||||
|
@@ -7,6 +7,6 @@ temp 80
|
||||
x_out_freq 500
|
||||
v_out_freq 5
|
||||
cutforce 0.9
|
||||
skin 0.0
|
||||
skin 0.05
|
||||
reneigh_every 100
|
||||
nstat 125000
|
||||
|
109
gromacs/atom.c
109
gromacs/atom.c
@@ -37,6 +37,7 @@ void initAtom(Atom *atom) {
|
||||
atom->iclusters = NULL;
|
||||
atom->jclusters = NULL;
|
||||
atom->icluster_bin = NULL;
|
||||
initMasks(atom);
|
||||
}
|
||||
|
||||
void createAtom(Atom *atom, Parameter *param) {
|
||||
@@ -50,6 +51,7 @@ void createAtom(Atom *atom, Parameter *param) {
|
||||
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
|
||||
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
||||
atom->epsilon[i] = param->epsilon;
|
||||
atom->sigma6[i] = param->sigma6;
|
||||
@@ -392,6 +394,113 @@ int readAtom_dmp(Atom* atom, Parameter* param) {
|
||||
return natoms;
|
||||
}
|
||||
|
||||
/*
 * Allocates and fills the mask lookup tables stored on `atom`:
 *   - exclusion_filter:          one single-bit value per entry,
 *   - diagonal_4xn_j_minus_i /
 *     diagonal_2xnn_j_minus_i:   "j minus i" offsets shifted by -0.5,
 *   - masks_{2xnn,4xn}_{hn,fn}:  precomputed masks indexed by the cond0/cond1
 *                                flags; the 2xnn force kernels read the _hn
 *                                tables in their half-neighbor-list variant and
 *                                the _fn tables in their full-list variant.
 *
 * atom: structure whose mask tables are (re)initialised. The three pointer
 *       fields are overwritten with fresh allocations; the masks_* tables are
 *       written in place (presumably fixed-size members of Atom -- their
 *       declaration is not visible here, confirm their capacity).
 */
void initMasks(Atom *atom) {
    const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
    unsigned int mask0, mask1, mask2, mask3;

    atom->exclusion_filter = allocate(ALIGNMENT, CLUSTER_M * VECTOR_WIDTH * sizeof(MD_UINT));
    atom->diagonal_4xn_j_minus_i = allocate(ALIGNMENT, MAX(CLUSTER_M, VECTOR_WIDTH) * sizeof(MD_UINT));
    atom->diagonal_2xnn_j_minus_i = allocate(ALIGNMENT, VECTOR_WIDTH * sizeof(MD_UINT));

    /* NOTE(review): the diagonal tables are sized with sizeof(MD_UINT) but are
     * filled with MD_FLOAT values; this relies on both types having the same
     * size for the selected PRECISION -- confirm against the Atom declaration. */
    for(int j = 0; j < MAX(CLUSTER_M, VECTOR_WIDTH); j++) {
        atom->diagonal_4xn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5;
    }

    /* The upper half of the 2xnn table repeats the lower half shifted by one. */
    for(int j = 0; j < VECTOR_WIDTH / 2; j++) {
        atom->diagonal_2xnn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5;
        atom->diagonal_2xnn_j_minus_i[VECTOR_WIDTH / 2 + j] = (MD_FLOAT)(j - 1) - 0.5;
    }

    /* NOTE(review): `1U << i` is undefined once i reaches the bit width of
     * unsigned int; CLUSTER_M * VECTOR_WIDTH is not visible from here --
     * confirm it cannot exceed that width for any supported configuration. */
    for(int i = 0; i < CLUSTER_M * VECTOR_WIDTH; i++) {
        atom->exclusion_filter[i] = (1U << i);
    }

#if CLUSTER_M == CLUSTER_N
    /* Equal cluster sizes: a single flag (cond0) selects the masked variant. */
    for(unsigned int cond0 = 0; cond0 < 2; cond0++) {
        mask0 = (unsigned int)(0xf - 0x1 * cond0);
        mask1 = (unsigned int)(0xf - 0x3 * cond0);
        mask2 = (unsigned int)(0xf - 0x7 * cond0);
        mask3 = (unsigned int)(0xf - 0xf * cond0);
        /* 2xnn packs two row masks into one word: odd row in the upper half. */
        atom->masks_2xnn_hn[cond0 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
        atom->masks_2xnn_hn[cond0 * 2 + 1] = (mask3 << half_mask_bits) | mask2;

        mask0 = (unsigned int)(0xf - 0x1 * cond0);
        mask1 = (unsigned int)(0xf - 0x2 * cond0);
        mask2 = (unsigned int)(0xf - 0x4 * cond0);
        mask3 = (unsigned int)(0xf - 0x8 * cond0);
        atom->masks_2xnn_fn[cond0 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
        atom->masks_2xnn_fn[cond0 * 2 + 1] = (mask3 << half_mask_bits) | mask2;

        atom->masks_4xn_hn[cond0 * 4 + 0] = (unsigned int)(0xf - 0x1 * cond0);
        atom->masks_4xn_hn[cond0 * 4 + 1] = (unsigned int)(0xf - 0x3 * cond0);
        atom->masks_4xn_hn[cond0 * 4 + 2] = (unsigned int)(0xf - 0x7 * cond0);
        atom->masks_4xn_hn[cond0 * 4 + 3] = (unsigned int)(0xf - 0xf * cond0);

        atom->masks_4xn_fn[cond0 * 4 + 0] = (unsigned int)(0xf - 0x1 * cond0);
        atom->masks_4xn_fn[cond0 * 4 + 1] = (unsigned int)(0xf - 0x2 * cond0);
        atom->masks_4xn_fn[cond0 * 4 + 2] = (unsigned int)(0xf - 0x4 * cond0);
        atom->masks_4xn_fn[cond0 * 4 + 3] = (unsigned int)(0xf - 0x8 * cond0);
    }
#else
    /* Unequal cluster sizes: two flags (cond0, cond1) select the variant. */
    for(unsigned int cond0 = 0; cond0 < 2; cond0++) {
        for(unsigned int cond1 = 0; cond1 < 2; cond1++) {
            #if CLUSTER_M < CLUSTER_N
            mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
            mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
            mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
            mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
            #else
            mask0 = (unsigned int)(0x3 - 0x1 * cond0);
            mask1 = (unsigned int)(0x3 - 0x3 * cond0);
            mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1);
            mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1);
            #endif

            atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
            atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 1] = (mask3 << half_mask_bits) | mask2;

            #if CLUSTER_M < CLUSTER_N
            mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
            mask1 = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
            mask2 = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
            mask3 = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
            #else
            mask0 = (unsigned int)(0x3 - 0x1 * cond0);
            mask1 = (unsigned int)(0x3 - 0x2 * cond0);
            mask2 = (unsigned int)(0x3 - 0x1 * cond1);
            mask3 = (unsigned int)(0x3 - 0x2 * cond1);
            #endif

            atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
            atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 1] = (mask3 << half_mask_bits) | mask2;

            #if CLUSTER_M < CLUSTER_N
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);

            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
            #else
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x1 * cond0);
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0x3 - 0x3 * cond0);
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0x3 - 0x3 * cond0 - 0x1 * cond1);
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0x3 - 0x3 * cond0 - 0x3 * cond1);

            /* One table slot per row (0..3), holding the same four values that
             * are packed into masks_2xnn_fn above. Writing all four to slot 0
             * would leave slots 1..3 unset. */
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x1 * cond0);
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0x3 - 0x2 * cond0);
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0x3 - 0x1 * cond1);
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0x3 - 0x2 * cond1);
            #endif
        }
    }
#endif
}
|
||||
|
||||
void growAtom(Atom *atom) {
|
||||
int nold = atom->Nmax;
|
||||
atom->Nmax += DELTA;
|
||||
|
@@ -16,10 +16,36 @@
|
||||
#include <simd.h>
|
||||
|
||||
|
||||
/*
|
||||
static inline void gmx_load_simd_2xnn_interactions(
|
||||
int excl,
|
||||
MD_SIMD_BITMASK filter0, MD_SIMD_BITMASK filter2,
|
||||
MD_SIMD_MASK *interact0, MD_SIMD_MASK *interact2) {
|
||||
|
||||
//SimdInt32 mask_pr_S(excl);
|
||||
MD_SIMD_INT32 mask_pr_S = simd_int32_broadcast(excl);
|
||||
*interact0 = cvtIB2B(simd_test_bits(mask_pr_S & filter0));
|
||||
*interact2 = cvtIB2B(simd_test_bits(mask_pr_S & filter2));
|
||||
}
|
||||
|
||||
static inline void gmx_load_simd_4xn_interactions(
|
||||
int excl,
|
||||
MD_SIMD_BITMASK filter0, MD_SIMD_BITMASK filter1, MD_SIMD_BITMASK filter2, MD_SIMD_BITMASK filter3,
|
||||
MD_SIMD_MASK *interact0, MD_SIMD_MASK *interact1, MD_SIMD_MASK *interact2, MD_SIMD_MASK *interact3) {
|
||||
|
||||
//SimdInt32 mask_pr_S(excl);
|
||||
MD_SIMD_INT32 mask_pr_S = simd_int32_broadcast(excl);
|
||||
*interact0 = cvtIB2B(simd_test_bits(mask_pr_S & filter0));
|
||||
*interact1 = cvtIB2B(simd_test_bits(mask_pr_S & filter1));
|
||||
*interact2 = cvtIB2B(simd_test_bits(mask_pr_S & filter2));
|
||||
*interact3 = cvtIB2B(simd_test_bits(mask_pr_S & filter3));
|
||||
}
|
||||
*/
|
||||
|
||||
double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
DEBUG_MESSAGE("computeForceLJ begin\n");
|
||||
int Nlocal = atom->Nlocal;
|
||||
int* neighs;
|
||||
NeighborCluster* neighs;
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
@@ -35,9 +61,12 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
||||
}
|
||||
|
||||
double S = getTimeStamp();
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
#pragma omp parallel for
|
||||
#pragma omp for
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_cj0 = CJ0_FROM_CI(ci);
|
||||
int ci_cj1 = CJ1_FROM_CI(ci);
|
||||
@@ -48,7 +77,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
||||
int numneighs = neighbor->numneigh[ci];
|
||||
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int cj = neighs[k];
|
||||
int cj = neighs[k].cj;
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
int any = 0;
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
@@ -119,6 +148,8 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force");
|
||||
}
|
||||
|
||||
double E = getTimeStamp();
|
||||
DEBUG_MESSAGE("computeForceLJ end\n");
|
||||
return E-S;
|
||||
@@ -127,7 +158,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
||||
double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
|
||||
int Nlocal = atom->Nlocal;
|
||||
int* neighs;
|
||||
NeighborCluster* neighs;
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
@@ -136,7 +167,6 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
|
||||
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
||||
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
||||
const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
@@ -149,9 +179,41 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
}
|
||||
|
||||
double S = getTimeStamp();
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
#pragma omp parallel for
|
||||
/*
|
||||
MD_SIMD_BITMASK filter0 = simd_load_bitmask((const int *) &atom->exclusion_filter[0 * (VECTOR_WIDTH / UNROLL_J)]);
|
||||
MD_SIMD_BITMASK filter2 = simd_load_bitmask((const int *) &atom->exclusion_filter[2 * (VECTOR_WIDTH / UNROLL_J)]);
|
||||
|
||||
MD_SIMD_FLOAT diagonal_jmi_S = simd_load(atom->diagonal_2xnn_j_minus_i);
|
||||
MD_SIMD_FLOAT zero_S = simd_broadcast(0.0);
|
||||
MD_SIMD_FLOAT one_S = simd_broadcast(1.0);
|
||||
|
||||
#if CLUSTER_M <= CLUSTER_N
|
||||
MD_SIMD_MASK diagonal_mask0, diagonal_mask2;
|
||||
diagonal_mask0 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
|
||||
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||
diagonal_mask2 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
|
||||
#else
|
||||
MD_SIMD_MASK diagonal_mask00, diagonal_mask02, diagonal_mask10, diagonal_mask12;
|
||||
diagonal_mask00 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
|
||||
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||
diagonal_mask02 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
|
||||
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||
diagonal_mask10 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
|
||||
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||
diagonal_mask12 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
|
||||
#endif
|
||||
*/
|
||||
|
||||
#pragma omp for
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_cj0 = CJ0_FROM_CI(ci);
|
||||
#if CLUSTER_M > CLUSTER_N
|
||||
@@ -162,6 +224,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[ci];
|
||||
int numneighs_masked = neighbor->numneigh_masked[ci];
|
||||
|
||||
MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]);
|
||||
MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]);
|
||||
@@ -176,76 +239,138 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
MD_SIMD_FLOAT fiy2 = simd_zero();
|
||||
MD_SIMD_FLOAT fiz2 = simd_zero();
|
||||
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int cj = neighs[k];
|
||||
for(int k = 0; k < numneighs_masked; k++) {
|
||||
int cj = neighs[k].cj;
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
//int imask = neighs[k].imask;
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
||||
unsigned int mask0, mask1, mask2, mask3;
|
||||
//MD_SIMD_MASK interact0;
|
||||
//MD_SIMD_MASK interact2;
|
||||
|
||||
//gmx_load_simd_2xnn_interactions((int)imask, filter0, filter2, &interact0, &interact2);
|
||||
|
||||
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
|
||||
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
|
||||
MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
|
||||
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||
|
||||
#if CLUSTER_M == CLUSTER_N
|
||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||
mask0 = (unsigned int)(0xf - 0x1 * cond0);
|
||||
mask1 = (unsigned int)(0xf - 0x3 * cond0);
|
||||
mask2 = (unsigned int)(0xf - 0x7 * cond0);
|
||||
mask3 = (unsigned int)(0xf - 0xf * cond0);
|
||||
#elif CLUSTER_M < CLUSTER_N
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_2xnn_hn[cond0 * 2 + 0]);
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_2xnn_hn[cond0 * 2 + 1]);
|
||||
#else
|
||||
#if CLUSTER_M < CLUSTER_N
|
||||
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
|
||||
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
|
||||
mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
|
||||
mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
|
||||
mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
|
||||
mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
|
||||
#else
|
||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
|
||||
mask0 = (unsigned int)(0x3 - 0x1 * cond0);
|
||||
mask1 = (unsigned int)(0x3 - 0x3 * cond0);
|
||||
mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1);
|
||||
mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1);
|
||||
#endif
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 0]);
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 1]);
|
||||
#endif
|
||||
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0);
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2);
|
||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec);
|
||||
MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec);
|
||||
cutoff_mask0 = simd_mask_and(cutoff_mask0, excl_mask0);
|
||||
cutoff_mask2 = simd_mask_and(cutoff_mask2, excl_mask2);
|
||||
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
|
||||
|
||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
|
||||
MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
|
||||
/*
|
||||
#if CLUSTER_M <= CLUSTER_N
|
||||
if(ci == ci_cj0) {
|
||||
cutoff_mask0 = simd_mask_and(cutoff_mask0, diagonal_mask0);
|
||||
cutoff_mask2 = simd_mask_and(cutoff_mask2, diagonal_mask2);
|
||||
}
|
||||
#else
|
||||
if(ci == ci_cj0) {
|
||||
cutoff_mask0 = cutoff_mask0 && diagonal_mask00;
|
||||
cutoff_mask2 = cutoff_mask2 && diagonal_mask02;
|
||||
} else if(ci == ci_cj1) {
|
||||
cutoff_mask0 = cutoff_mask0 && diagonal_mask10;
|
||||
cutoff_mask2 = cutoff_mask2 && diagonal_mask12;
|
||||
}
|
||||
#endif
|
||||
*/
|
||||
|
||||
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
||||
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||
|
||||
MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
|
||||
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
|
||||
MD_SIMD_FLOAT tx0 = select_by_mask(delx0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT ty0 = select_by_mask(dely0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT tz0 = select_by_mask(delz0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT tx2 = select_by_mask(delx2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT ty2 = select_by_mask(dely2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT tz2 = select_by_mask(delz2 * force2, cutoff_mask2);
|
||||
|
||||
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
|
||||
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
|
||||
fix0 += tx0;
|
||||
fiy0 += ty0;
|
||||
fiz0 += tz0;
|
||||
fix2 += tx2;
|
||||
fiy2 += ty2;
|
||||
fiz2 += tz2;
|
||||
|
||||
MD_SIMD_FLOAT tx0 = select_by_mask(simd_mul(delx0, force0), cutoff_mask0);
|
||||
MD_SIMD_FLOAT ty0 = select_by_mask(simd_mul(dely0, force0), cutoff_mask0);
|
||||
MD_SIMD_FLOAT tz0 = select_by_mask(simd_mul(delz0, force0), cutoff_mask0);
|
||||
MD_SIMD_FLOAT tx2 = select_by_mask(simd_mul(delx2, force2), cutoff_mask2);
|
||||
MD_SIMD_FLOAT ty2 = select_by_mask(simd_mul(dely2, force2), cutoff_mask2);
|
||||
MD_SIMD_FLOAT tz2 = select_by_mask(simd_mul(delz2, force2), cutoff_mask2);
|
||||
#ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ
|
||||
if(cj < CJ1_FROM_CI(atom->Nlocal)) {
|
||||
simd_h_decr3(cj_f, tx0 + tx2, ty0 + ty2, tz0 + tz2);
|
||||
}
|
||||
#else
|
||||
simd_h_decr3(cj_f, tx0 + tx2, ty0 + ty2, tz0 + tz2);
|
||||
#endif
|
||||
}
|
||||
|
||||
fix0 = simd_add(fix0, tx0);
|
||||
fiy0 = simd_add(fiy0, ty0);
|
||||
fiz0 = simd_add(fiz0, tz0);
|
||||
fix2 = simd_add(fix2, tx2);
|
||||
fiy2 = simd_add(fiy2, ty2);
|
||||
fiz2 = simd_add(fiz2, tz2);
|
||||
for(int k = numneighs_masked; k < numneighs; k++) {
|
||||
int cj = neighs[k].cj;
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
||||
|
||||
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
|
||||
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
|
||||
MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
|
||||
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||
|
||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec);
|
||||
MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec);
|
||||
|
||||
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
||||
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||
|
||||
MD_SIMD_FLOAT tx0 = select_by_mask(delx0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT ty0 = select_by_mask(dely0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT tz0 = select_by_mask(delz0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT tx2 = select_by_mask(delx2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT ty2 = select_by_mask(dely2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT tz2 = select_by_mask(delz2 * force2, cutoff_mask2);
|
||||
|
||||
fix0 += tx0;
|
||||
fiy0 += ty0;
|
||||
fiz0 += tz0;
|
||||
fix2 += tx2;
|
||||
fiy2 += ty2;
|
||||
fiz2 += tz2;
|
||||
|
||||
#ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ
|
||||
if(cj < CJ1_FROM_CI(atom->Nlocal)) {
|
||||
@@ -266,6 +391,8 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force");
|
||||
}
|
||||
|
||||
double E = getTimeStamp();
|
||||
DEBUG_MESSAGE("computeForceLJ_2xnn end\n");
|
||||
return E-S;
|
||||
@@ -274,7 +401,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
|
||||
int Nlocal = atom->Nlocal;
|
||||
int* neighs;
|
||||
NeighborCluster* neighs;
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
@@ -283,7 +410,6 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
|
||||
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
||||
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
||||
const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
@@ -296,9 +422,12 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
}
|
||||
|
||||
double S = getTimeStamp();
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
#pragma omp parallel for
|
||||
#pragma omp for
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_cj0 = CJ0_FROM_CI(ci);
|
||||
#if CLUSTER_M > CLUSTER_N
|
||||
@@ -309,6 +438,7 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[ci];
|
||||
int numneighs_masked = neighbor->numneigh_masked[ci];
|
||||
|
||||
MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]);
|
||||
MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]);
|
||||
@@ -323,61 +453,85 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
MD_SIMD_FLOAT fiy2 = simd_zero();
|
||||
MD_SIMD_FLOAT fiz2 = simd_zero();
|
||||
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int cj = neighs[k];
|
||||
for(int k = 0; k < numneighs_masked; k++) {
|
||||
int cj = neighs[k].cj;
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
int imask = neighs[k].imask;
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
unsigned int mask0, mask1, mask2, mask3;
|
||||
|
||||
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
|
||||
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
|
||||
MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
|
||||
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||
|
||||
#if CLUSTER_M == CLUSTER_N
|
||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||
mask0 = (unsigned int)(0xf - 0x1 * cond0);
|
||||
mask1 = (unsigned int)(0xf - 0x2 * cond0);
|
||||
mask2 = (unsigned int)(0xf - 0x4 * cond0);
|
||||
mask3 = (unsigned int)(0xf - 0x8 * cond0);
|
||||
#elif CLUSTER_M < CLUSTER_N
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_2xnn_fn[cond0 * 2 + 0]);
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_2xnn_fn[cond0 * 2 + 1]);
|
||||
#else
|
||||
#if CLUSTER_M < CLUSTER_N
|
||||
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
|
||||
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
|
||||
mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
|
||||
mask1 = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
|
||||
mask2 = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
|
||||
mask3 = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
|
||||
#else
|
||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
|
||||
mask0 = (unsigned int)(0x3 - 0x1 * cond0);
|
||||
mask1 = (unsigned int)(0x3 - 0x2 * cond0);
|
||||
mask2 = (unsigned int)(0x3 - 0x1 * cond1);
|
||||
mask3 = (unsigned int)(0x3 - 0x2 * cond1);
|
||||
#endif
|
||||
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0);
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2);
|
||||
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 0]);
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 1]);
|
||||
#endif
|
||||
|
||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
|
||||
MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
|
||||
|
||||
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
||||
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||
|
||||
MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
|
||||
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
|
||||
fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
|
||||
fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
|
||||
fiz0 = simd_masked_add(fiz0, simd_mul(delz0, force0), cutoff_mask0);
|
||||
fix2 = simd_masked_add(fix2, simd_mul(delx2, force2), cutoff_mask2);
|
||||
fiy2 = simd_masked_add(fiy2, simd_mul(dely2, force2), cutoff_mask2);
|
||||
fiz2 = simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2);
|
||||
}
|
||||
|
||||
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
|
||||
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
|
||||
for(int k = numneighs_masked; k < numneighs; k++) {
|
||||
int cj = neighs[k].cj;
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
|
||||
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
|
||||
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
|
||||
MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
|
||||
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||
|
||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec);
|
||||
MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec);
|
||||
|
||||
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
||||
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||
|
||||
fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
|
||||
fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
|
||||
@@ -398,6 +552,8 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force");
|
||||
}
|
||||
|
||||
double E = getTimeStamp();
|
||||
DEBUG_MESSAGE("computeForceLJ_2xnn end\n");
|
||||
return E-S;
|
||||
@@ -414,7 +570,7 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta
|
||||
double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
|
||||
int Nlocal = atom->Nlocal;
|
||||
int* neighs;
|
||||
NeighborCluster* neighs;
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
@@ -423,8 +579,6 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
|
||||
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
||||
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
||||
double S = getTimeStamp();
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
@@ -436,7 +590,13 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
}
|
||||
}
|
||||
|
||||
#pragma omp parallel for
|
||||
double S = getTimeStamp();
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
#pragma omp for
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_cj0 = CJ0_FROM_CI(ci);
|
||||
#if CLUSTER_M > CLUSTER_N
|
||||
@@ -447,6 +607,7 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[ci];
|
||||
int numneighs_masked = neighbor->numneigh_masked[ci];
|
||||
|
||||
MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]);
|
||||
MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]);
|
||||
@@ -473,53 +634,52 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_SIMD_FLOAT fiy3 = simd_zero();
|
||||
MD_SIMD_FLOAT fiz3 = simd_zero();
|
||||
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int cj = neighs[k];
|
||||
for(int k = 0; k < numneighs_masked; k++) {
|
||||
int cj = neighs[k].cj;
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
int imask = neighs[k].imask;
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
||||
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
|
||||
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
|
||||
MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
|
||||
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx1 = simd_sub(xi1_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely1 = simd_sub(yi1_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz1 = simd_sub(zi1_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx3 = simd_sub(xi3_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx1 = xi1_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely1 = yi1_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz1 = zi1_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx3 = xi3_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely3 = yi3_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz3 = zi3_tmp - zj_tmp;
|
||||
|
||||
#if CLUSTER_M == CLUSTER_N
|
||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x3 * cond0));
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x7 * cond0));
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0xf * cond0));
|
||||
#elif CLUSTER_M < CLUSTER_N
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 4 + 0]);
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 4 + 1]);
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 4 + 2]);
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 4 + 3]);
|
||||
#else
|
||||
#if CLUSTER_M < CLUSTER_N
|
||||
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
|
||||
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1));
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1));
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1));
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0xf * cond0 - 0xff * cond1));
|
||||
#else
|
||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond0));
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0x3 - 0x3 * cond0));
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0x3 - 0x3 * cond0 - 0x1 * cond1));
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0x3 - 0x3 * cond0 - 0x3 * cond1));
|
||||
#endif
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 0]);
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 1]);
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 2]);
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 3]);
|
||||
#endif
|
||||
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
|
||||
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, simd_mul(delz1, delz1)));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
|
||||
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, simd_mul(delz3, delz3)));
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, delz1 * delz1));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, delz3 * delz3));
|
||||
|
||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
|
||||
MD_SIMD_MASK cutoff_mask1 = simd_mask_and(excl_mask1, simd_mask_cond_lt(rsq1, cutforcesq_vec));
|
||||
@@ -531,28 +691,114 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||
MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
|
||||
|
||||
MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
|
||||
MD_SIMD_FLOAT sr6_1 = simd_mul(sr2_1, simd_mul(sr2_1, simd_mul(sr2_1, sigma6_vec)));
|
||||
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
|
||||
MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec)));
|
||||
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_1 = sr2_1 * sr2_1 * sr2_1 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_3 = sr2_3 * sr2_3 * sr2_3 * sigma6_vec;
|
||||
|
||||
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
|
||||
MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, eps_vec))));
|
||||
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
|
||||
MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, eps_vec))));
|
||||
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||
MD_SIMD_FLOAT force1 = c48_vec * sr6_1 * (sr6_1 - c05_vec) * sr2_1 * eps_vec;
|
||||
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||
MD_SIMD_FLOAT force3 = c48_vec * sr6_3 * (sr6_3 - c05_vec) * sr2_3 * eps_vec;
|
||||
|
||||
MD_SIMD_FLOAT tx0 = select_by_mask(simd_mul(delx0, force0), cutoff_mask0);
|
||||
MD_SIMD_FLOAT ty0 = select_by_mask(simd_mul(dely0, force0), cutoff_mask0);
|
||||
MD_SIMD_FLOAT tz0 = select_by_mask(simd_mul(delz0, force0), cutoff_mask0);
|
||||
MD_SIMD_FLOAT tx1 = select_by_mask(simd_mul(delx1, force1), cutoff_mask1);
|
||||
MD_SIMD_FLOAT ty1 = select_by_mask(simd_mul(dely1, force1), cutoff_mask1);
|
||||
MD_SIMD_FLOAT tz1 = select_by_mask(simd_mul(delz1, force1), cutoff_mask1);
|
||||
MD_SIMD_FLOAT tx2 = select_by_mask(simd_mul(delx2, force2), cutoff_mask2);
|
||||
MD_SIMD_FLOAT ty2 = select_by_mask(simd_mul(dely2, force2), cutoff_mask2);
|
||||
MD_SIMD_FLOAT tz2 = select_by_mask(simd_mul(delz2, force2), cutoff_mask2);
|
||||
MD_SIMD_FLOAT tx3 = select_by_mask(simd_mul(delx3, force3), cutoff_mask3);
|
||||
MD_SIMD_FLOAT ty3 = select_by_mask(simd_mul(dely3, force3), cutoff_mask3);
|
||||
MD_SIMD_FLOAT tz3 = select_by_mask(simd_mul(delz3, force3), cutoff_mask3);
|
||||
MD_SIMD_FLOAT tx0 = select_by_mask(delx0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT ty0 = select_by_mask(dely0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT tz0 = select_by_mask(delz0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT tx1 = select_by_mask(delx1 * force1, cutoff_mask1);
|
||||
MD_SIMD_FLOAT ty1 = select_by_mask(dely1 * force1, cutoff_mask1);
|
||||
MD_SIMD_FLOAT tz1 = select_by_mask(delz1 * force1, cutoff_mask1);
|
||||
MD_SIMD_FLOAT tx2 = select_by_mask(delx2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT ty2 = select_by_mask(dely2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT tz2 = select_by_mask(delz2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT tx3 = select_by_mask(delx3 * force3, cutoff_mask3);
|
||||
MD_SIMD_FLOAT ty3 = select_by_mask(dely3 * force3, cutoff_mask3);
|
||||
MD_SIMD_FLOAT tz3 = select_by_mask(delz3 * force3, cutoff_mask3);
|
||||
|
||||
fix0 = simd_add(fix0, tx0);
|
||||
fiy0 = simd_add(fiy0, ty0);
|
||||
fiz0 = simd_add(fiz0, tz0);
|
||||
fix1 = simd_add(fix1, tx1);
|
||||
fiy1 = simd_add(fiy1, ty1);
|
||||
fiz1 = simd_add(fiz1, tz1);
|
||||
fix2 = simd_add(fix2, tx2);
|
||||
fiy2 = simd_add(fiy2, ty2);
|
||||
fiz2 = simd_add(fiz2, tz2);
|
||||
fix3 = simd_add(fix3, tx3);
|
||||
fiy3 = simd_add(fiy3, ty3);
|
||||
fiz3 = simd_add(fiz3, tz3);
|
||||
|
||||
#ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ
|
||||
if(cj < CJ1_FROM_CI(atom->Nlocal)) {
|
||||
simd_store(&cj_f[CL_X_OFFSET], simd_load(&cj_f[CL_X_OFFSET]) - (tx0 + tx1 + tx2 + tx3));
|
||||
simd_store(&cj_f[CL_Y_OFFSET], simd_load(&cj_f[CL_Y_OFFSET]) - (ty0 + ty1 + ty2 + ty3));
|
||||
simd_store(&cj_f[CL_Z_OFFSET], simd_load(&cj_f[CL_Z_OFFSET]) - (tz0 + tz1 + tz2 + tz3));
|
||||
}
|
||||
#else
|
||||
simd_store(&cj_f[CL_X_OFFSET], simd_load(&cj_f[CL_X_OFFSET]) - (tx0 + tx1 + tx2 + tx3));
|
||||
simd_store(&cj_f[CL_Y_OFFSET], simd_load(&cj_f[CL_Y_OFFSET]) - (ty0 + ty1 + ty2 + ty3));
|
||||
simd_store(&cj_f[CL_Z_OFFSET], simd_load(&cj_f[CL_Z_OFFSET]) - (tz0 + tz1 + tz2 + tz3));
|
||||
#endif
|
||||
}
|
||||
|
||||
for(int k = numneighs_masked; k < numneighs; k++) {
|
||||
int cj = neighs[k].cj;
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
int imask = neighs[k].imask;
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
||||
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
|
||||
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
|
||||
MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
|
||||
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx1 = xi1_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely1 = yi1_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz1 = zi1_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx3 = xi3_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely3 = yi3_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz3 = zi3_tmp - zj_tmp;
|
||||
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, delz1 * delz1));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, delz3 * delz3));
|
||||
|
||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec);
|
||||
MD_SIMD_MASK cutoff_mask1 = simd_mask_cond_lt(rsq1, cutforcesq_vec);
|
||||
MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec);
|
||||
MD_SIMD_MASK cutoff_mask3 = simd_mask_cond_lt(rsq3, cutforcesq_vec);
|
||||
|
||||
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
||||
MD_SIMD_FLOAT sr2_1 = simd_reciprocal(rsq1);
|
||||
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||
MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
|
||||
|
||||
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_1 = sr2_1 * sr2_1 * sr2_1 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_3 = sr2_3 * sr2_3 * sr2_3 * sigma6_vec;
|
||||
|
||||
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||
MD_SIMD_FLOAT force1 = c48_vec * sr6_1 * (sr6_1 - c05_vec) * sr2_1 * eps_vec;
|
||||
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||
MD_SIMD_FLOAT force3 = c48_vec * sr6_3 * (sr6_3 - c05_vec) * sr2_3 * eps_vec;
|
||||
|
||||
MD_SIMD_FLOAT tx0 = select_by_mask(delx0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT ty0 = select_by_mask(dely0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT tz0 = select_by_mask(delz0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT tx1 = select_by_mask(delx1 * force1, cutoff_mask1);
|
||||
MD_SIMD_FLOAT ty1 = select_by_mask(dely1 * force1, cutoff_mask1);
|
||||
MD_SIMD_FLOAT tz1 = select_by_mask(delz1 * force1, cutoff_mask1);
|
||||
MD_SIMD_FLOAT tx2 = select_by_mask(delx2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT ty2 = select_by_mask(dely2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT tz2 = select_by_mask(delz2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT tx3 = select_by_mask(delx3 * force3, cutoff_mask3);
|
||||
MD_SIMD_FLOAT ty3 = select_by_mask(dely3 * force3, cutoff_mask3);
|
||||
MD_SIMD_FLOAT tz3 = select_by_mask(delz3 * force3, cutoff_mask3);
|
||||
|
||||
fix0 = simd_add(fix0, tx0);
|
||||
fiy0 = simd_add(fiy0, ty0);
|
||||
@@ -590,6 +836,8 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force");
|
||||
}
|
||||
|
||||
double E = getTimeStamp();
|
||||
DEBUG_MESSAGE("computeForceLJ_4xn end\n");
|
||||
return E-S;
|
||||
@@ -598,7 +846,7 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
|
||||
int Nlocal = atom->Nlocal;
|
||||
int* neighs;
|
||||
NeighborCluster* neighs;
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
@@ -607,8 +855,6 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
|
||||
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
||||
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
||||
double S = getTimeStamp();
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
@@ -620,7 +866,13 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
}
|
||||
}
|
||||
|
||||
#pragma omp parallel for
|
||||
double S = getTimeStamp();
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
#pragma omp for
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_cj0 = CJ0_FROM_CI(ci);
|
||||
#if CLUSTER_M > CLUSTER_N
|
||||
@@ -631,6 +883,7 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[ci];
|
||||
int numneighs_masked = neighbor->numneigh_masked[ci];
|
||||
|
||||
MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]);
|
||||
MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]);
|
||||
@@ -657,52 +910,51 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_SIMD_FLOAT fiy3 = simd_zero();
|
||||
MD_SIMD_FLOAT fiz3 = simd_zero();
|
||||
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int cj = neighs[k];
|
||||
for(int k = 0; k < numneighs_masked; k++) {
|
||||
int cj = neighs[k].cj;
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
int imask = neighs[k].imask;
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
|
||||
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
|
||||
MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
|
||||
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx1 = simd_sub(xi1_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely1 = simd_sub(yi1_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz1 = simd_sub(zi1_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx3 = simd_sub(xi3_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx1 = xi1_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely1 = yi1_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz1 = zi1_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx3 = xi3_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely3 = yi3_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz3 = zi3_tmp - zj_tmp;
|
||||
|
||||
#if CLUSTER_M == CLUSTER_N
|
||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0));
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0));
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0));
|
||||
#elif CLUSTER_M < CLUSTER_N
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 4 + 0]);
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 4 + 1]);
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 4 + 2]);
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 4 + 3]);
|
||||
#else
|
||||
#if CLUSTER_M < CLUSTER_N
|
||||
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
|
||||
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1));
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1));
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1));
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1));
|
||||
#else
|
||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond0));
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond0));
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond1));
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond1));
|
||||
#endif
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0]);
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 1]);
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 2]);
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 3]);
|
||||
#endif
|
||||
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
|
||||
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, simd_mul(delz1, delz1)));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
|
||||
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, simd_mul(delz3, delz3)));
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, delz1 * delz1));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, delz3 * delz3));
|
||||
|
||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
|
||||
MD_SIMD_MASK cutoff_mask1 = simd_mask_and(excl_mask1, simd_mask_cond_lt(rsq1, cutforcesq_vec));
|
||||
@@ -714,28 +966,88 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||
MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
|
||||
|
||||
MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
|
||||
MD_SIMD_FLOAT sr6_1 = simd_mul(sr2_1, simd_mul(sr2_1, simd_mul(sr2_1, sigma6_vec)));
|
||||
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
|
||||
MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec)));
|
||||
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_1 = sr2_1 * sr2_1 * sr2_1 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_3 = sr2_3 * sr2_3 * sr2_3 * sigma6_vec;
|
||||
|
||||
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
|
||||
MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, eps_vec))));
|
||||
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
|
||||
MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, eps_vec))));
|
||||
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||
MD_SIMD_FLOAT force1 = c48_vec * sr6_1 * (sr6_1 - c05_vec) * sr2_1 * eps_vec;
|
||||
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||
MD_SIMD_FLOAT force3 = c48_vec * sr6_3 * (sr6_3 - c05_vec) * sr2_3 * eps_vec;
|
||||
|
||||
fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
|
||||
fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
|
||||
fiz0 = simd_masked_add(fiz0, simd_mul(delz0, force0), cutoff_mask0);
|
||||
fix1 = simd_masked_add(fix1, simd_mul(delx1, force1), cutoff_mask1);
|
||||
fiy1 = simd_masked_add(fiy1, simd_mul(dely1, force1), cutoff_mask1);
|
||||
fiz1 = simd_masked_add(fiz1, simd_mul(delz1, force1), cutoff_mask1);
|
||||
fix2 = simd_masked_add(fix2, simd_mul(delx2, force2), cutoff_mask2);
|
||||
fiy2 = simd_masked_add(fiy2, simd_mul(dely2, force2), cutoff_mask2);
|
||||
fiz2 = simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2);
|
||||
fix3 = simd_masked_add(fix3, simd_mul(delx3, force3), cutoff_mask3);
|
||||
fiy3 = simd_masked_add(fiy3, simd_mul(dely3, force3), cutoff_mask3);
|
||||
fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3);
|
||||
fix0 = simd_masked_add(fix0, delx0 * force0, cutoff_mask0);
|
||||
fiy0 = simd_masked_add(fiy0, dely0 * force0, cutoff_mask0);
|
||||
fiz0 = simd_masked_add(fiz0, delz0 * force0, cutoff_mask0);
|
||||
fix1 = simd_masked_add(fix1, delx1 * force1, cutoff_mask1);
|
||||
fiy1 = simd_masked_add(fiy1, dely1 * force1, cutoff_mask1);
|
||||
fiz1 = simd_masked_add(fiz1, delz1 * force1, cutoff_mask1);
|
||||
fix2 = simd_masked_add(fix2, delx2 * force2, cutoff_mask2);
|
||||
fiy2 = simd_masked_add(fiy2, dely2 * force2, cutoff_mask2);
|
||||
fiz2 = simd_masked_add(fiz2, delz2 * force2, cutoff_mask2);
|
||||
fix3 = simd_masked_add(fix3, delx3 * force3, cutoff_mask3);
|
||||
fiy3 = simd_masked_add(fiy3, dely3 * force3, cutoff_mask3);
|
||||
fiz3 = simd_masked_add(fiz3, delz3 * force3, cutoff_mask3);
|
||||
}
|
||||
|
||||
for(int k = numneighs_masked; k < numneighs; k++) {
|
||||
int cj = neighs[k].cj;
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
int imask = neighs[k].imask;
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
|
||||
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
|
||||
MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
|
||||
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx1 = xi1_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely1 = yi1_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz1 = zi1_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx3 = xi3_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely3 = yi3_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz3 = zi3_tmp - zj_tmp;
|
||||
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, delz1 * delz1));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, delz3 * delz3));
|
||||
|
||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec);
|
||||
MD_SIMD_MASK cutoff_mask1 = simd_mask_cond_lt(rsq1, cutforcesq_vec);
|
||||
MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec);
|
||||
MD_SIMD_MASK cutoff_mask3 = simd_mask_cond_lt(rsq3, cutforcesq_vec);
|
||||
|
||||
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
||||
MD_SIMD_FLOAT sr2_1 = simd_reciprocal(rsq1);
|
||||
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||
MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
|
||||
|
||||
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_1 = sr2_1 * sr2_1 * sr2_1 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_3 = sr2_3 * sr2_3 * sr2_3 * sigma6_vec;
|
||||
|
||||
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||
MD_SIMD_FLOAT force1 = c48_vec * sr6_1 * (sr6_1 - c05_vec) * sr2_1 * eps_vec;
|
||||
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||
MD_SIMD_FLOAT force3 = c48_vec * sr6_3 * (sr6_3 - c05_vec) * sr2_3 * eps_vec;
|
||||
|
||||
fix0 = simd_masked_add(fix0, delx0 * force0, cutoff_mask0);
|
||||
fiy0 = simd_masked_add(fiy0, dely0 * force0, cutoff_mask0);
|
||||
fiz0 = simd_masked_add(fiz0, delz0 * force0, cutoff_mask0);
|
||||
fix1 = simd_masked_add(fix1, delx1 * force1, cutoff_mask1);
|
||||
fiy1 = simd_masked_add(fiy1, dely1 * force1, cutoff_mask1);
|
||||
fiz1 = simd_masked_add(fiz1, delz1 * force1, cutoff_mask1);
|
||||
fix2 = simd_masked_add(fix2, delx2 * force2, cutoff_mask2);
|
||||
fiy2 = simd_masked_add(fiy2, dely2 * force2, cutoff_mask2);
|
||||
fiz2 = simd_masked_add(fiz2, delz2 * force2, cutoff_mask2);
|
||||
fix3 = simd_masked_add(fix3, delx3 * force3, cutoff_mask3);
|
||||
fiy3 = simd_masked_add(fiy3, dely3 * force3, cutoff_mask3);
|
||||
fiz3 = simd_masked_add(fiz3, delz3 * force3, cutoff_mask3);
|
||||
}
|
||||
|
||||
simd_incr_reduced_sum(&ci_f[CL_X_OFFSET], fix0, fix1, fix2, fix3);
|
||||
@@ -744,10 +1056,13 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
|
||||
addStat(stats->calculated_forces, 1);
|
||||
addStat(stats->num_neighs, numneighs);
|
||||
addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
|
||||
addStat(stats->force_iters, (long long int)((double)numneighs));
|
||||
//addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force");
|
||||
}
|
||||
|
||||
double E = getTimeStamp();
|
||||
DEBUG_MESSAGE("computeForceLJ_4xn end\n");
|
||||
return E-S;
|
||||
|
@@ -22,6 +22,7 @@
|
||||
# define KERNEL_NAME "CUDA"
|
||||
# define CLUSTER_M 8
|
||||
# define CLUSTER_N VECTOR_WIDTH
|
||||
# define UNROLL_J 1
|
||||
# define computeForceLJ computeForceLJ_cuda
|
||||
# define initialIntegrate cudaInitialIntegrate
|
||||
# define finalIntegrate cudaFinalIntegrate
|
||||
@@ -32,11 +33,15 @@
|
||||
# if VECTOR_WIDTH > CLUSTER_M * 2
|
||||
# define KERNEL_NAME "Simd2xNN"
|
||||
# define CLUSTER_N (VECTOR_WIDTH / 2)
|
||||
# define UNROLL_I 4
|
||||
# define UNROLL_J 2
|
||||
# define computeForceLJ computeForceLJ_2xnn
|
||||
// Simd4xN
|
||||
# else
|
||||
# define KERNEL_NAME "Simd4xN"
|
||||
# define CLUSTER_N VECTOR_WIDTH
|
||||
# define UNROLL_I 4
|
||||
# define UNROLL_J 1
|
||||
# define computeForceLJ computeForceLJ_4xn
|
||||
# endif
|
||||
# ifdef USE_REFERENCE_VERSION
|
||||
@@ -116,9 +121,17 @@ typedef struct {
|
||||
Cluster *iclusters, *jclusters;
|
||||
int *icluster_bin;
|
||||
int dummy_cj;
|
||||
MD_UINT *exclusion_filter;
|
||||
MD_FLOAT *diagonal_4xn_j_minus_i;
|
||||
MD_FLOAT *diagonal_2xnn_j_minus_i;
|
||||
unsigned int masks_2xnn_hn[8];
|
||||
unsigned int masks_2xnn_fn[8];
|
||||
unsigned int masks_4xn_hn[16];
|
||||
unsigned int masks_4xn_fn[16];
|
||||
} Atom;
|
||||
|
||||
extern void initAtom(Atom*);
|
||||
extern void initMasks(Atom*);
|
||||
extern void createAtom(Atom*, Parameter*);
|
||||
extern int readAtom(Atom*, Parameter*);
|
||||
extern int readAtom_pdb(Atom*, Parameter*);
|
||||
|
@@ -9,13 +9,35 @@
|
||||
|
||||
#ifndef __NEIGHBOR_H_
|
||||
#define __NEIGHBOR_H_
|
||||
// Interaction masks from GROMACS, things to remember (maybe these confused just me):
|
||||
// 1. These are not "exclusion" masks as the name suggests in GROMACS, but rather
|
||||
// interaction masks (1 = interaction, 0 = no interaction)
|
||||
// 2. These are inverted (maybe because that is how they are used in AVX2/AVX512 masking),
|
||||
// so read them from right to left (least significant to most significant bit)
|
||||
// The all-interactions mask is the same for all kernels
|
||||
#define NBNXN_INTERACTION_MASK_ALL 0xffffffffU
|
||||
// 4x4 kernel diagonal mask
|
||||
#define NBNXN_INTERACTION_MASK_DIAG 0x08ceU
|
||||
// 4x2 kernel diagonal masks
|
||||
#define NBNXN_INTERACTION_MASK_DIAG_J2_0 0x0002U
|
||||
#define NBNXN_INTERACTION_MASK_DIAG_J2_1 0x002fU
|
||||
// 4x8 kernel diagonal masks
|
||||
#define NBNXN_INTERACTION_MASK_DIAG_J8_0 0xf0f8fcfeU
|
||||
#define NBNXN_INTERACTION_MASK_DIAG_J8_1 0x0080c0e0U
|
||||
|
||||
// One neighbor-list entry: a neighboring j-cluster together with its interaction mask
typedef struct {
|
||||
int cj; // index of the neighboring j-cluster
|
||||
unsigned int imask; // interaction mask bits (1 = interaction, 0 = no interaction)
|
||||
} NeighborCluster;
|
||||
|
||||
typedef struct {
|
||||
int every;
|
||||
int ncalls;
|
||||
int* neighbors;
|
||||
int maxneighs;
|
||||
int* numneigh;
|
||||
int* numneigh_masked;
|
||||
int half_neigh;
|
||||
NeighborCluster* neighbors;
|
||||
} Neighbor;
|
||||
|
||||
extern void initNeighbor(Neighbor*, Parameter*);
|
||||
|
@@ -56,6 +56,7 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) {
|
||||
neighbor->half_neigh = param->half_neigh;
|
||||
neighbor->maxneighs = 100;
|
||||
neighbor->numneigh = NULL;
|
||||
neighbor->numneigh_masked = NULL;
|
||||
neighbor->neighbors = NULL;
|
||||
}
|
||||
|
||||
@@ -184,6 +185,43 @@ int atomDistanceInRange(Atom *atom, int ci, int cj, MD_FLOAT rsq) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Returns a diagonal or off-diagonal interaction mask for plain C lists */
|
||||
static unsigned int get_imask(int rdiag, int ci, int cj) {
|
||||
return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
|
||||
}
|
||||
|
||||
/* Returns a diagonal or off-diagonal interaction mask for cj-size=2 */
|
||||
static unsigned int get_imask_simd_j2(int rdiag, int ci, int cj) {
|
||||
return (rdiag && ci * 2 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_0
|
||||
: (rdiag && ci * 2 + 1 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_1
|
||||
: NBNXN_INTERACTION_MASK_ALL));
|
||||
}
|
||||
|
||||
/* Returns a diagonal or off-diagonal interaction mask for cj-size=4 */
|
||||
static unsigned int get_imask_simd_j4(int rdiag, int ci, int cj) {
|
||||
return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
|
||||
}
|
||||
|
||||
/* Returns a diagonal or off-diagonal interaction mask for cj-size=8 */
|
||||
static unsigned int get_imask_simd_j8(int rdiag, int ci, int cj) {
|
||||
return (rdiag && ci == cj * 2 ? NBNXN_INTERACTION_MASK_DIAG_J8_0
|
||||
: (rdiag && ci == cj * 2 + 1 ? NBNXN_INTERACTION_MASK_DIAG_J8_1
|
||||
: NBNXN_INTERACTION_MASK_ALL));
|
||||
}
|
||||
|
||||
#if VECTOR_WIDTH == 2
|
||||
# define get_imask_simd_4xn get_imask_simd_j2
|
||||
#elif VECTOR_WIDTH== 4
|
||||
# define get_imask_simd_4xn get_imask_simd_j4
|
||||
#elif VECTOR_WIDTH == 8
|
||||
# define get_imask_simd_4xn get_imask_simd_j8
|
||||
# define get_imask_simd_2xnn get_imask_simd_j4
|
||||
#elif VECTOR_WIDTH == 16
|
||||
# define get_imask_simd_2xnn get_imask_simd_j8
|
||||
#else
|
||||
# error "Invalid cluster configuration"
|
||||
#endif
|
||||
|
||||
void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||
DEBUG_MESSAGE("buildNeighbor start\n");
|
||||
|
||||
@@ -193,7 +231,8 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||
if(neighbor->numneigh) free(neighbor->numneigh);
|
||||
if(neighbor->neighbors) free(neighbor->neighbors);
|
||||
neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
|
||||
neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*));
|
||||
neighbor->numneigh_masked = (int*) malloc(nmax * sizeof(int));
|
||||
neighbor->neighbors = (NeighborCluster*) malloc(nmax * neighbor->maxneighs * sizeof(NeighborCluster));
|
||||
}
|
||||
|
||||
MD_FLOAT bbx = 0.5 * (binsizex + binsizex);
|
||||
@@ -209,8 +248,8 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_cj1 = CJ1_FROM_CI(ci);
|
||||
int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
|
||||
int n = 0;
|
||||
NeighborCluster *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
|
||||
int n = 0, nmasked = 0;
|
||||
int ibin = atom->icluster_bin[ci];
|
||||
MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx;
|
||||
MD_FLOAT ibb_xmax = atom->iclusters[ci].bbmaxx;
|
||||
@@ -275,7 +314,28 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||
|
||||
if(d_bb_sq < cutneighsq) {
|
||||
if(d_bb_sq < rbb_sq || atomDistanceInRange(atom, ci, cj, cutneighsq)) {
|
||||
neighptr[n++] = cj;
|
||||
// We use true (1) for rdiag because we only care if there are masks
|
||||
// at all, and when this is set to false (0) the self-exclusions are
|
||||
// not accounted for, which breaks the optimized version!
|
||||
unsigned int imask;
|
||||
#if CLUSTER_N == (VECTOR_WIDTH / 2) // 2xnn
|
||||
imask = get_imask_simd_2xnn(1, ci, cj);
|
||||
#else // 4xn
|
||||
imask = get_imask_simd_4xn(1, ci, cj);
|
||||
#endif
|
||||
|
||||
if(imask == NBNXN_INTERACTION_MASK_ALL) {
|
||||
neighptr[n].cj = cj;
|
||||
neighptr[n].imask = imask;
|
||||
} else {
|
||||
neighptr[n].cj = neighptr[nmasked].cj;
|
||||
neighptr[n].imask = neighptr[nmasked].imask;
|
||||
neighptr[nmasked].cj = cj;
|
||||
neighptr[nmasked].imask = imask;
|
||||
nmasked++;
|
||||
}
|
||||
|
||||
n++;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -297,11 +357,14 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||
// Fill neighbor list with dummy values to fit vector width
|
||||
if(CLUSTER_N < VECTOR_WIDTH) {
|
||||
while(n % (VECTOR_WIDTH / CLUSTER_N)) {
|
||||
neighptr[n++] = atom->dummy_cj; // Last cluster is always a dummy cluster
|
||||
neighptr[n].cj = atom->dummy_cj; // Last cluster is always a dummy cluster
|
||||
neighptr[n].imask = 0;
|
||||
n++;
|
||||
}
|
||||
}
|
||||
|
||||
neighbor->numneigh[ci] = n;
|
||||
neighbor->numneigh_masked[ci] = nmasked;
|
||||
if(n >= neighbor->maxneighs) {
|
||||
resize = 1;
|
||||
|
||||
@@ -315,7 +378,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||
fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
|
||||
neighbor->maxneighs = new_maxneighs * 1.2;
|
||||
free(neighbor->neighbors);
|
||||
neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
|
||||
// Entries are NeighborCluster (cj + imask), not int: size the resized buffer per entry to avoid under-allocation
neighbor->neighbors = (NeighborCluster*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(NeighborCluster));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -370,23 +433,27 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
||||
MD_FLOAT cutsq = cutneighsq;
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int *neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
||||
NeighborCluster *neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[ci];
|
||||
int numneighs_masked = neighbor->numneigh_masked[ci];
|
||||
int k = 0;
|
||||
|
||||
// Remove dummy clusters if necessary
|
||||
if(CLUSTER_N < VECTOR_WIDTH) {
|
||||
while(neighs[numneighs - 1] == atom->dummy_cj) {
|
||||
while(neighs[numneighs - 1].cj == atom->dummy_cj) {
|
||||
numneighs--;
|
||||
}
|
||||
}
|
||||
|
||||
while(k < numneighs) {
|
||||
int cj = neighs[k];
|
||||
int cj = neighs[k].cj;
|
||||
if(atomDistanceInRange(atom, ci, cj, cutsq)) {
|
||||
k++;
|
||||
} else {
|
||||
numneighs--;
|
||||
if(k < numneighs_masked) {
|
||||
numneighs_masked--;
|
||||
}
|
||||
neighs[k] = neighs[numneighs];
|
||||
}
|
||||
}
|
||||
@@ -394,11 +461,14 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
||||
// Readd dummy clusters if necessary
|
||||
if(CLUSTER_N < VECTOR_WIDTH) {
|
||||
while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) {
|
||||
neighs[numneighs++] = atom->dummy_cj; // Last cluster is always a dummy cluster
|
||||
neighs[numneighs].cj = atom->dummy_cj; // Last cluster is always a dummy cluster
|
||||
neighs[numneighs].imask = 0;
|
||||
numneighs++;
|
||||
}
|
||||
}
|
||||
|
||||
neighbor->numneigh[ci] = numneighs;
|
||||
neighbor->numneigh_masked[ci] = numneighs_masked;
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("pruneNeighbor end\n");
|
||||
|
@@ -13,7 +13,7 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
|
||||
MEM_TRACER_INIT;
|
||||
INDEX_TRACER_INIT;
|
||||
int Nlocal = atom->Nlocal;
|
||||
int* neighs;
|
||||
NeighborCluster* neighs;
|
||||
//MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
|
||||
|
||||
INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
|
||||
@@ -34,7 +34,8 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
|
||||
DIST_TRACE(neighs, numneighs);
|
||||
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
MEM_TRACE(neighs[k], 'R');
|
||||
int j = neighs[k].cj;
|
||||
MEM_TRACE(j, 'R');
|
||||
MEM_TRACE(atom_x(j), 'R');
|
||||
MEM_TRACE(atom_y(j), 'R');
|
||||
MEM_TRACE(atom_z(j), 'R');
|
||||
|
@@ -7,6 +7,7 @@ ANSI_CFLAGS += -pedantic
|
||||
ANSI_CFLAGS += -Wextra
|
||||
|
||||
CFLAGS = -Ofast -march=native $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
|
||||
#CFLAGS = -Ofast -march=core-avx2 $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
|
||||
#CFLAGS = -O3 -march=cascadelake $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
|
||||
#CFLAGS = -Ofast $(ANSI_CFLAGS) -g #-Xpreprocessor -fopenmp -g
|
||||
ASFLAGS = -masm=intel
|
||||
|
@@ -6,13 +6,29 @@ ANSI_CFLAGS += -std=c99
|
||||
ANSI_CFLAGS += -pedantic
|
||||
ANSI_CFLAGS += -Wextra
|
||||
|
||||
#CFLAGS = -O0 -g -std=c99 -fargument-noalias
|
||||
ifeq ($(ISA),AVX512)
|
||||
CFLAGS = -Ofast -mavx512f -mavx512vl -mavx512bw -mavx512dq -mavx512cd -ffast-math -funroll-loops # -fopenmp
|
||||
#CFLAGS = -O3 -march=cascadelake -ffast-math -funroll-loops # -fopenmp
|
||||
CFLAGS = -Ofast -march=native -ffast-math -funroll-loops # -fopenmp
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),AVX2)
|
||||
#CFLAGS = -Ofast -march=native -mavx2 -ffast-math -funroll-loops # -fopenmp
|
||||
#CFLAGS = -Ofast -march=native -ffast-math -funroll-loops # -fopenmp
|
||||
#CFLAGS = -O3 -march=native -ffast-math -funroll-loops # -fopenmp
|
||||
#CFLAGS = -O3 -march=znver1 -ffast-math -funroll-loops # -fopenmp
|
||||
#CFLAGS = -Ofast -mavx2 -ffast-math -funroll-loops # -fopenmp
|
||||
CFLAGS = -Ofast -mavx2 -mfma -ffast-math -funroll-loops # -fopenmp
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),AVX)
|
||||
CFLAGS = -Ofast -mavx -ffast-math -funroll-loops # -fopenmp
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),SSE)
|
||||
CFLAGS = -Ofast -msse4.2 -ffast-math -funroll-loops # -fopenmp
|
||||
endif
|
||||
|
||||
#CFLAGS = -O0 -g -std=c99 -fargument-noalias
|
||||
#CFLAGS = -Ofast -march=native -ffast-math -funroll-loops # -fopenmp
|
||||
#CFLAGS = -O3 -march=native -ffast-math -funroll-loops # -fopenmp
|
||||
ASFLAGS = #-masm=intel
|
||||
LFLAGS =
|
||||
DEFINES = -D_GNU_SOURCE -DNO_ZMM_INTRIN
|
||||
|
@@ -3,11 +3,25 @@ LINKER = $(CC)
|
||||
|
||||
OPENMP = #-qopenmp
|
||||
PROFILE = #-profile-functions -g -pg
|
||||
|
||||
ifeq ($(ISA),AVX512)
|
||||
OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
|
||||
#OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
|
||||
#OPTS = -Ofast -xAVX $(PROFILE)
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),AVX2)
|
||||
OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
|
||||
#OPTS = -Ofast -xAVX2 $(PROFILE)
|
||||
#OPTS = -Ofast -xSSE4.2 $(PROFILE)
|
||||
#OPTS = -Ofast -march=core-avx2 $(PROFILE)
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),AVX)
|
||||
OPTS = -Ofast -xAVX $(PROFILE)
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),SSE)
|
||||
OPTS = -Ofast -xSSE4.2 $(PROFILE)
|
||||
endif
|
||||
|
||||
#OPTS = -Ofast -no-vec $(PROFILE)
|
||||
#OPTS = -Ofast -xHost $(PROFILE)
|
||||
CFLAGS = $(PROFILE) -restrict $(OPENMP) $(OPTS)
|
||||
|
@@ -3,13 +3,28 @@ LINKER = $(CC)
|
||||
|
||||
OPENMP = #-qopenmp
|
||||
PROFILE = #-profile-functions -g -pg
|
||||
#OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
|
||||
#OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
|
||||
#OPTS = -Ofast -xAVX $(PROFILE)
|
||||
#OPTS = -Ofast -xAVX2 $(PROFILE)
|
||||
#OPTS = -Ofast -xSSE4.2 $(PROFILE)
|
||||
|
||||
ifeq ($(ISA),AVX512)
|
||||
OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
|
||||
#OPTS = -Ofast -march=cascadelake -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),AVX2)
|
||||
OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
|
||||
#OPTS = -Ofast -xHost $(PROFILE)
|
||||
#OPTS = -Ofast -march=core-avx2 $(PROFILE)
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),AVX)
|
||||
OPTS = -Ofast -xAVX $(PROFILE)
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),SSE)
|
||||
OPTS = -Ofast -xSSE4.2 $(PROFILE)
|
||||
endif
|
||||
|
||||
#OPTS = -Ofast -no-vec $(PROFILE)
|
||||
OPTS = -Ofast -xHost $(PROFILE)
|
||||
#OPTS = -Ofast -xHost $(PROFILE)
|
||||
CFLAGS = $(PROFILE) $(OPENMP) $(OPTS)
|
||||
ASFLAGS = #-masm=intel
|
||||
LFLAGS = $(PROFILE) $(OPTS) $(OPENMP)
|
||||
|
@@ -9,13 +9,15 @@ else ifeq ($(strip $(ISA)), AVX_FMA)
|
||||
__ISA_AVX_FMA__=true
|
||||
__SIMD_WIDTH_DBL__=4
|
||||
else ifeq ($(strip $(ISA)), AVX2)
|
||||
__ISA_AVX2__=true
|
||||
#__SIMD_KERNEL__=true
|
||||
__ISA_AVX2__=true
|
||||
__SIMD_WIDTH_DBL__=4
|
||||
else ifeq ($(strip $(ISA)), AVX512)
|
||||
__ISA_AVX512__=true
|
||||
__SIMD_KERNEL__=true
|
||||
__SIMD_WIDTH_DBL__=8
|
||||
ifeq ($(strip $(DATA_TYPE)), DP)
|
||||
__SIMD_KERNEL__=true
|
||||
endif
|
||||
endif
|
||||
|
||||
# SIMD width is specified in double-precision, hence it may
|
||||
|
@@ -31,8 +31,12 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
|
||||
int nrho = eam->nrho; int nrho_tot = eam->nrho_tot;
|
||||
double S = getTimeStamp();
|
||||
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force_eam_fp");
|
||||
#pragma omp parallel for
|
||||
|
||||
#pragma omp for
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
@@ -95,13 +99,19 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force_eam_fp");
|
||||
}
|
||||
|
||||
// We still need to update fp for PBC atoms
|
||||
for(int i = 0; i < atom->Nghost; i++) {
|
||||
fp[Nlocal + i] = fp[atom->border_map[i]];
|
||||
}
|
||||
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force_eam");
|
||||
|
||||
#pragma omp for
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
@@ -192,6 +202,8 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force_eam");
|
||||
}
|
||||
|
||||
double E = getTimeStamp();
|
||||
return E-S;
|
||||
}
|
||||
|
@@ -26,17 +26,22 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
#endif
|
||||
const MD_FLOAT num1 = 1.0;
|
||||
const MD_FLOAT num48 = 48.0;
|
||||
const MD_FLOAT num05 = 0.5;
|
||||
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
atom_fx(i) = 0.0;
|
||||
atom_fy(i) = 0.0;
|
||||
atom_fz(i) = 0.0;
|
||||
}
|
||||
|
||||
double S = getTimeStamp();
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
#pragma omp parallel for
|
||||
#pragma omp for
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
@@ -67,9 +72,9 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
|
||||
#endif
|
||||
|
||||
if(rsq < cutforcesq) {
|
||||
MD_FLOAT sr2 = 1.0 / rsq;
|
||||
MD_FLOAT sr2 = num1 / rsq;
|
||||
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
|
||||
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
|
||||
MD_FLOAT force = num48 * sr6 * (sr6 - num05) * sr2 * epsilon;
|
||||
fix += delx * force;
|
||||
fiy += dely * force;
|
||||
fiz += delz * force;
|
||||
@@ -90,6 +95,8 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force");
|
||||
}
|
||||
|
||||
double E = getTimeStamp();
|
||||
return E-S;
|
||||
}
|
||||
@@ -102,6 +109,9 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
#endif
|
||||
const MD_FLOAT num1 = 1.0;
|
||||
const MD_FLOAT num48 = 48.0;
|
||||
const MD_FLOAT num05 = 0.5;
|
||||
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
atom_fx(i) = 0.0;
|
||||
@@ -110,8 +120,12 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
}
|
||||
|
||||
double S = getTimeStamp();
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("forceLJ-halfneigh");
|
||||
|
||||
#pragma omp for
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
@@ -146,9 +160,9 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
#endif
|
||||
|
||||
if(rsq < cutforcesq) {
|
||||
MD_FLOAT sr2 = 1.0 / rsq;
|
||||
MD_FLOAT sr2 = num1 / rsq;
|
||||
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
|
||||
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
|
||||
MD_FLOAT force = num48 * sr6 * (sr6 - num05) * sr2 * epsilon;
|
||||
fix += delx * force;
|
||||
fiy += dely * force;
|
||||
fiz += delz * force;
|
||||
@@ -171,6 +185,8 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("forceLJ-halfneigh");
|
||||
}
|
||||
|
||||
double E = getTimeStamp();
|
||||
return E-S;
|
||||
}
|
||||
@@ -189,7 +205,6 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig
|
||||
}
|
||||
|
||||
double S = getTimeStamp();
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
#ifndef __SIMD_KERNEL__
|
||||
fprintf(stderr, "Error: SIMD kernel not implemented for specified instruction set!");
|
||||
@@ -201,7 +216,12 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig
|
||||
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
||||
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
||||
|
||||
#pragma omp parallel for
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
#pragma omp for
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
@@ -242,9 +262,11 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig
|
||||
atom_fy(i) += simd_h_reduce_sum(fiy);
|
||||
atom_fz(i) += simd_h_reduce_sum(fiz);
|
||||
}
|
||||
#endif
|
||||
|
||||
LIKWID_MARKER_STOP("force");
|
||||
}
|
||||
#endif
|
||||
|
||||
double E = getTimeStamp();
|
||||
return E-S;
|
||||
}
|
||||
|
88
likwid-outputs/csx-lammps-dp-mem_dp-stub.out
Normal file
88
likwid-outputs/csx-lammps-dp-mem_dp-stub.out
Normal file
@@ -0,0 +1,88 @@
|
||||
--------------------------------------------------------------------------------
|
||||
CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
|
||||
CPU type: Intel Cascadelake SP processor
|
||||
CPU clock: 2.49 GHz
|
||||
--------------------------------------------------------------------------------
|
||||
Initializing parameters...
|
||||
Initializing atoms...
|
||||
Creating atoms...
|
||||
Pattern: seq
|
||||
Number of timesteps: 200
|
||||
Number of atoms: 256
|
||||
Number of neighbors per atom: 1024
|
||||
Number of times to replicate neighbor lists: 1
|
||||
Estimated total data volume (kB): 1062.9120
|
||||
Estimated atom data volume (kB): 6.1440
|
||||
Estimated neighborlist data volume (kB): 1050.6240
|
||||
Initializing neighbor lists...
|
||||
Creating neighbor lists...
|
||||
Computing forces...
|
||||
Total time: 0.2735, Mega atom updates/s: 0.1872
|
||||
Cycles per atom: 10682.8568, Cycles per neighbor: 10.4325
|
||||
Statistics:
|
||||
Vector width: 8, Processor frequency: 2.0000 GHz
|
||||
Average neighbors per atom: 1018.9055
|
||||
Average SIMD iterations per atom: 127.3632
|
||||
Total number of computed pair interactions: 52428800
|
||||
Total number of SIMD iterations: 6553600
|
||||
Useful read data volume for force computation: 1.47GB
|
||||
Cycles/SIMD iteration: 83.4598
|
||||
--------------------------------------------------------------------------------
|
||||
Region force, Group 1: MEM_DP
|
||||
+-------------------+------------+
|
||||
| Region Info | HWThread 0 |
|
||||
+-------------------+------------+
|
||||
| RDTSC Runtime [s] | 0.110776 |
|
||||
| call count | 200 |
|
||||
+-------------------+------------+
|
||||
|
||||
+------------------------------------------+---------+------------+
|
||||
| Event | Counter | HWThread 0 |
|
||||
+------------------------------------------+---------+------------+
|
||||
| INSTR_RETIRED_ANY | FIXC0 | 267036300 |
|
||||
| CPU_CLK_UNHALTED_CORE | FIXC1 | 219034500 |
|
||||
| CPU_CLK_UNHALTED_REF | FIXC2 | 273793400 |
|
||||
| PWR_PKG_ENERGY | PWR0 | 10.9296 |
|
||||
| PWR_DRAM_ENERGY | PWR3 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | PMC0 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | PMC1 | 159400 |
|
||||
| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | PMC2 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 197068800 |
|
||||
| CAS_COUNT_RD | MBOX0C0 | 8643 |
|
||||
| CAS_COUNT_WR | MBOX0C1 | 1367 |
|
||||
| CAS_COUNT_RD | MBOX1C0 | 9124 |
|
||||
| CAS_COUNT_WR | MBOX1C1 | 1354 |
|
||||
| CAS_COUNT_RD | MBOX2C0 | 9138 |
|
||||
| CAS_COUNT_WR | MBOX2C1 | 1356 |
|
||||
| CAS_COUNT_RD | MBOX3C0 | 5586 |
|
||||
| CAS_COUNT_WR | MBOX3C1 | 1297 |
|
||||
| CAS_COUNT_RD | MBOX4C0 | 5328 |
|
||||
| CAS_COUNT_WR | MBOX4C1 | 1269 |
|
||||
| CAS_COUNT_RD | MBOX5C0 | 5280 |
|
||||
| CAS_COUNT_WR | MBOX5C1 | 1295 |
|
||||
+------------------------------------------+---------+------------+
|
||||
|
||||
+-----------------------------------+------------+
|
||||
| Metric | HWThread 0 |
|
||||
+-----------------------------------+------------+
|
||||
| Runtime (RDTSC) [s] | 0.1108 |
|
||||
| Runtime unhalted [s] | 0.0878 |
|
||||
| Clock [MHz] | 1995.2564 |
|
||||
| CPI | 0.8202 |
|
||||
| Energy [J] | 10.9296 |
|
||||
| Power [W] | 98.6643 |
|
||||
| Energy DRAM [J] | 0 |
|
||||
| Power DRAM [W] | 0 |
|
||||
| DP [MFLOP/s] | 14233.3287 |
|
||||
| AVX DP [MFLOP/s] | 14231.8898 |
|
||||
| Packed [MUOPS/s] | 1778.9862 |
|
||||
| Scalar [MUOPS/s] | 1.4389 |
|
||||
| Memory read bandwidth [MBytes/s] | 24.9001 |
|
||||
| Memory read data volume [GBytes] | 0.0028 |
|
||||
| Memory write bandwidth [MBytes/s] | 4.5861 |
|
||||
| Memory write data volume [GBytes] | 0.0005 |
|
||||
| Memory bandwidth [MBytes/s] | 29.4863 |
|
||||
| Memory data volume [GBytes] | 0.0033 |
|
||||
| Operational intensity | 482.7104 |
|
||||
+-----------------------------------+------------+
|
||||
|
168
likwid-outputs/csx-lammps-dp-mem_dp.out
Normal file
168
likwid-outputs/csx-lammps-dp-mem_dp.out
Normal file
@@ -0,0 +1,168 @@
|
||||
--------------------------------------------------------------------------------
|
||||
CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
|
||||
CPU type: Intel Cascadelake SP processor
|
||||
CPU clock: 2.49 GHz
|
||||
--------------------------------------------------------------------------------
|
||||
Parameters:
|
||||
Force field: lj
|
||||
Kernel: plain-C
|
||||
Data layout: AoS
|
||||
Floating-point precision: double
|
||||
Unit cells (nx, ny, nz): 32, 32, 32
|
||||
Domain box sizes (x, y, z): 5.374708e+01, 5.374708e+01, 5.374708e+01
|
||||
Periodic (x, y, z): 1, 1, 1
|
||||
Lattice size: 1.679596e+00
|
||||
Epsilon: 1.000000e+00
|
||||
Sigma: 1.000000e+00
|
||||
Spring constant: 1.000000e+00
|
||||
Damping constant: 1.000000e+00
|
||||
Temperature: 1.440000e+00
|
||||
RHO: 8.442000e-01
|
||||
Mass: 1.000000e+00
|
||||
Number of types: 4
|
||||
Number of timesteps: 200
|
||||
Report stats every (timesteps): 100
|
||||
Reneighbor every (timesteps): 20
|
||||
Prune every (timesteps): 1000
|
||||
Output positions every (timesteps): 20
|
||||
Output velocities every (timesteps): 5
|
||||
Delta time (dt): 5.000000e-03
|
||||
Cutoff radius: 2.500000e+00
|
||||
Skin: 3.000000e-01
|
||||
Half neighbor lists: 0
|
||||
Processor frequency (GHz): 2.0000
|
||||
----------------------------------------------------------------------------
|
||||
step temp pressure
|
||||
0 1.440000e+00 1.215639e+00
|
||||
100 8.200895e-01 6.923143e-01
|
||||
200 7.961495e-01 6.721043e-01
|
||||
----------------------------------------------------------------------------
|
||||
System: 131072 atoms 47265 ghost atoms, Steps: 200
|
||||
TOTAL 11.50s FORCE 5.28s NEIGH 5.91s REST 0.31s
|
||||
----------------------------------------------------------------------------
|
||||
Performance: 2.28 million atom updates per second
|
||||
Statistics:
|
||||
Vector width: 8, Processor frequency: 2.0000 GHz
|
||||
Average neighbors per atom: 76.0352
|
||||
Average SIMD iterations per atom: 9.9181
|
||||
Total number of computed pair interactions: 2003182862
|
||||
Total number of SIMD iterations: 261297661
|
||||
Useful read data volume for force computation: 57.46GB
|
||||
Cycles/SIMD iteration: 40.4432
|
||||
--------------------------------------------------------------------------------
|
||||
Region force, Group 1: MEM_DP
|
||||
+-------------------+------------+
|
||||
| Region Info | HWThread 0 |
|
||||
+-------------------+------------+
|
||||
| RDTSC Runtime [s] | 5.115807 |
|
||||
| call count | 201 |
|
||||
+-------------------+------------+
|
||||
|
||||
+------------------------------------------+---------+-------------+
|
||||
| Event | Counter | HWThread 0 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
| INSTR_RETIRED_ANY | FIXC0 | 12592470000 |
|
||||
| CPU_CLK_UNHALTED_CORE | FIXC1 | 10196910000 |
|
||||
| CPU_CLK_UNHALTED_REF | FIXC2 | 12746120000 |
|
||||
| PWR_PKG_ENERGY | PWR0 | 307.9429 |
|
||||
| PWR_DRAM_ENERGY | PWR3 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | PMC0 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | PMC1 | 79042240 |
|
||||
| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | PMC2 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 8076039000 |
|
||||
| CAS_COUNT_RD | MBOX0C0 | 22734550 |
|
||||
| CAS_COUNT_WR | MBOX0C1 | 1147714 |
|
||||
| CAS_COUNT_RD | MBOX1C0 | 22755180 |
|
||||
| CAS_COUNT_WR | MBOX1C1 | 1144415 |
|
||||
| CAS_COUNT_RD | MBOX2C0 | 22762780 |
|
||||
| CAS_COUNT_WR | MBOX2C1 | 1129051 |
|
||||
| CAS_COUNT_RD | MBOX3C0 | 22905660 |
|
||||
| CAS_COUNT_WR | MBOX3C1 | 1143324 |
|
||||
| CAS_COUNT_RD | MBOX4C0 | 22914860 |
|
||||
| CAS_COUNT_WR | MBOX4C1 | 1169116 |
|
||||
| CAS_COUNT_RD | MBOX5C0 | 22890220 |
|
||||
| CAS_COUNT_WR | MBOX5C1 | 1180739 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
|
||||
+-----------------------------------+------------+
|
||||
| Metric | HWThread 0 |
|
||||
+-----------------------------------+------------+
|
||||
| Runtime (RDTSC) [s] | 5.1158 |
|
||||
| Runtime unhalted [s] | 4.0885 |
|
||||
| Clock [MHz] | 1995.2508 |
|
||||
| CPI | 0.8098 |
|
||||
| Energy [J] | 307.9429 |
|
||||
| Power [W] | 60.1944 |
|
||||
| Energy DRAM [J] | 0 |
|
||||
| Power DRAM [W] | 0 |
|
||||
| DP [MFLOP/s] | 12644.6041 |
|
||||
| AVX DP [MFLOP/s] | 12629.1535 |
|
||||
| Packed [MUOPS/s] | 1578.6442 |
|
||||
| Scalar [MUOPS/s] | 15.4506 |
|
||||
| Memory read bandwidth [MBytes/s] | 1713.4438 |
|
||||
| Memory read data volume [GBytes] | 8.7656 |
|
||||
| Memory write bandwidth [MBytes/s] | 86.5003 |
|
||||
| Memory write data volume [GBytes] | 0.4425 |
|
||||
| Memory bandwidth [MBytes/s] | 1799.9442 |
|
||||
| Memory data volume [GBytes] | 9.2082 |
|
||||
| Operational intensity | 7.0250 |
|
||||
+-----------------------------------+------------+
|
||||
|
||||
Region reneighbour, Group 1: MEM_DP
|
||||
+-------------------+------------+
|
||||
| Region Info | HWThread 0 |
|
||||
+-------------------+------------+
|
||||
| RDTSC Runtime [s] | 5.897385 |
|
||||
| call count | 10 |
|
||||
+-------------------+------------+
|
||||
|
||||
+------------------------------------------+---------+-------------+
|
||||
| Event | Counter | HWThread 0 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
| INSTR_RETIRED_ANY | FIXC0 | 18212540000 |
|
||||
| CPU_CLK_UNHALTED_CORE | FIXC1 | 11728500000 |
|
||||
| CPU_CLK_UNHALTED_REF | FIXC2 | 14660630000 |
|
||||
| PWR_PKG_ENERGY | PWR0 | 338.9000 |
|
||||
| PWR_DRAM_ENERGY | PWR3 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | PMC0 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | PMC1 | 6240402000 |
|
||||
| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | PMC2 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 983040 |
|
||||
| CAS_COUNT_RD | MBOX0C0 | 2086787 |
|
||||
| CAS_COUNT_WR | MBOX0C1 | 1115626 |
|
||||
| CAS_COUNT_RD | MBOX1C0 | 2089964 |
|
||||
| CAS_COUNT_WR | MBOX1C1 | 1117021 |
|
||||
| CAS_COUNT_RD | MBOX2C0 | 2103832 |
|
||||
| CAS_COUNT_WR | MBOX2C1 | 1117965 |
|
||||
| CAS_COUNT_RD | MBOX3C0 | 2086930 |
|
||||
| CAS_COUNT_WR | MBOX3C1 | 1102471 |
|
||||
| CAS_COUNT_RD | MBOX4C0 | 2094688 |
|
||||
| CAS_COUNT_WR | MBOX4C1 | 1103018 |
|
||||
| CAS_COUNT_RD | MBOX5C0 | 2097438 |
|
||||
| CAS_COUNT_WR | MBOX5C1 | 1102525 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
|
||||
+-----------------------------------+------------+
|
||||
| Metric | HWThread 0 |
|
||||
+-----------------------------------+------------+
|
||||
| Runtime (RDTSC) [s] | 5.8974 |
|
||||
| Runtime unhalted [s] | 4.7026 |
|
||||
| Clock [MHz] | 1995.2473 |
|
||||
| CPI | 0.6440 |
|
||||
| Energy [J] | 338.9000 |
|
||||
| Power [W] | 57.4661 |
|
||||
| Energy DRAM [J] | 0 |
|
||||
| Power DRAM [W] | 0 |
|
||||
| DP [MFLOP/s] | 1059.4978 |
|
||||
| AVX DP [MFLOP/s] | 1.3335 |
|
||||
| Packed [MUOPS/s] | 0.1667 |
|
||||
| Scalar [MUOPS/s] | 1058.1643 |
|
||||
| Memory read bandwidth [MBytes/s] | 136.3006 |
|
||||
| Memory read data volume [GBytes] | 0.8038 |
|
||||
| Memory write bandwidth [MBytes/s] | 72.2612 |
|
||||
| Memory write data volume [GBytes] | 0.4262 |
|
||||
| Memory bandwidth [MBytes/s] | 208.5618 |
|
||||
| Memory data volume [GBytes] | 1.2300 |
|
||||
| Operational intensity | 5.0800 |
|
||||
+-----------------------------------+------------+
|
||||
|
88
likwid-outputs/csx-lammps-sp-mem_sp-stub.out
Normal file
88
likwid-outputs/csx-lammps-sp-mem_sp-stub.out
Normal file
@@ -0,0 +1,88 @@
|
||||
--------------------------------------------------------------------------------
|
||||
CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
|
||||
CPU type: Intel Cascadelake SP processor
|
||||
CPU clock: 2.49 GHz
|
||||
--------------------------------------------------------------------------------
|
||||
Initializing parameters...
|
||||
Initializing atoms...
|
||||
Creating atoms...
|
||||
Pattern: seq
|
||||
Number of timesteps: 200
|
||||
Number of atoms: 256
|
||||
Number of neighbors per atom: 1024
|
||||
Number of times to replicate neighbor lists: 1
|
||||
Estimated total data volume (kB): 1056.7680
|
||||
Estimated atom data volume (kB): 3.0720
|
||||
Estimated neighborlist data volume (kB): 1050.6240
|
||||
Initializing neighbor lists...
|
||||
Creating neighbor lists...
|
||||
Computing forces...
|
||||
Total time: 0.2466, Mega atom updates/s: 0.2076
|
||||
Cycles per atom: 9631.9934, Cycles per neighbor: 9.4062
|
||||
Statistics:
|
||||
Vector width: 16, Processor frequency: 2.0000 GHz
|
||||
Average neighbors per atom: 1018.9055
|
||||
Average SIMD iterations per atom: 63.6816
|
||||
Total number of computed pair interactions: 52428800
|
||||
Total number of SIMD iterations: 3276800
|
||||
Useful read data volume for force computation: 0.84GB
|
||||
Cycles/SIMD iteration: 150.4999
|
||||
--------------------------------------------------------------------------------
|
||||
Region force, Group 1: MEM_SP
|
||||
+-------------------+------------+
|
||||
| Region Info | HWThread 0 |
|
||||
+-------------------+------------+
|
||||
| RDTSC Runtime [s] | 0.085843 |
|
||||
| call count | 200 |
|
||||
+-------------------+------------+
|
||||
|
||||
+------------------------------------------+---------+------------+
|
||||
| Event | Counter | HWThread 0 |
|
||||
+------------------------------------------+---------+------------+
|
||||
| INSTR_RETIRED_ANY | FIXC0 | 129769100 |
|
||||
| CPU_CLK_UNHALTED_CORE | FIXC1 | 172300100 |
|
||||
| CPU_CLK_UNHALTED_REF | FIXC2 | 215371300 |
|
||||
| PWR_PKG_ENERGY | PWR0 | 9.2849 |
|
||||
| PWR_DRAM_ENERGY | PWR3 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE | PMC0 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_SCALAR_SINGLE | PMC1 | 154000 |
|
||||
| FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | PMC2 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | PMC3 | 89088000 |
|
||||
| CAS_COUNT_RD | MBOX0C0 | 8354 |
|
||||
| CAS_COUNT_WR | MBOX0C1 | 1126 |
|
||||
| CAS_COUNT_RD | MBOX1C0 | 7863 |
|
||||
| CAS_COUNT_WR | MBOX1C1 | 1105 |
|
||||
| CAS_COUNT_RD | MBOX2C0 | 7990 |
|
||||
| CAS_COUNT_WR | MBOX2C1 | 1113 |
|
||||
| CAS_COUNT_RD | MBOX3C0 | 4775 |
|
||||
| CAS_COUNT_WR | MBOX3C1 | 1112 |
|
||||
| CAS_COUNT_RD | MBOX4C0 | 4201 |
|
||||
| CAS_COUNT_WR | MBOX4C1 | 1127 |
|
||||
| CAS_COUNT_RD | MBOX5C0 | 4035 |
|
||||
| CAS_COUNT_WR | MBOX5C1 | 1120 |
|
||||
+------------------------------------------+---------+------------+
|
||||
|
||||
+-----------------------------------+------------+
|
||||
| Metric | HWThread 0 |
|
||||
+-----------------------------------+------------+
|
||||
| Runtime (RDTSC) [s] | 0.0858 |
|
||||
| Runtime unhalted [s] | 0.0691 |
|
||||
| Clock [MHz] | 1995.2787 |
|
||||
| CPI | 1.3277 |
|
||||
| Energy [J] | 9.2849 |
|
||||
| Power [W] | 108.1610 |
|
||||
| Energy DRAM [J] | 0 |
|
||||
| Power DRAM [W] | 0 |
|
||||
| SP [MFLOP/s] | 16606.5397 |
|
||||
| AVX SP [MFLOP/s] | 16604.7458 |
|
||||
| Packed [MUOPS/s] | 1037.7966 |
|
||||
| Scalar [MUOPS/s] | 1.7940 |
|
||||
| Memory read bandwidth [MBytes/s] | 27.7476 |
|
||||
| Memory read data volume [GBytes] | 0.0024 |
|
||||
| Memory write bandwidth [MBytes/s] | 4.9974 |
|
||||
| Memory write data volume [GBytes] | 0.0004 |
|
||||
| Memory bandwidth [MBytes/s] | 32.7450 |
|
||||
| Memory data volume [GBytes] | 0.0028 |
|
||||
| Operational intensity | 507.1471 |
|
||||
+-----------------------------------+------------+
|
||||
|
168
likwid-outputs/csx-lammps-sp-mem_sp.out
Normal file
168
likwid-outputs/csx-lammps-sp-mem_sp.out
Normal file
@@ -0,0 +1,168 @@
|
||||
--------------------------------------------------------------------------------
|
||||
CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
|
||||
CPU type: Intel Cascadelake SP processor
|
||||
CPU clock: 2.49 GHz
|
||||
--------------------------------------------------------------------------------
|
||||
Parameters:
|
||||
Force field: lj
|
||||
Kernel: plain-C
|
||||
Data layout: AoS
|
||||
Floating-point precision: single
|
||||
Unit cells (nx, ny, nz): 32, 32, 32
|
||||
Domain box sizes (x, y, z): 5.374708e+01, 5.374708e+01, 5.374708e+01
|
||||
Periodic (x, y, z): 1, 1, 1
|
||||
Lattice size: 1.679596e+00
|
||||
Epsilon: 1.000000e+00
|
||||
Sigma: 1.000000e+00
|
||||
Spring constant: 1.000000e+00
|
||||
Damping constant: 1.000000e+00
|
||||
Temperature: 1.440000e+00
|
||||
RHO: 8.442000e-01
|
||||
Mass: 1.000000e+00
|
||||
Number of types: 4
|
||||
Number of timesteps: 200
|
||||
Report stats every (timesteps): 100
|
||||
Reneighbor every (timesteps): 20
|
||||
Prune every (timesteps): 1000
|
||||
Output positions every (timesteps): 20
|
||||
Output velocities every (timesteps): 5
|
||||
Delta time (dt): 5.000000e-03
|
||||
Cutoff radius: 2.500000e+00
|
||||
Skin: 3.000000e-01
|
||||
Half neighbor lists: 0
|
||||
Processor frequency (GHz): 2.0000
|
||||
----------------------------------------------------------------------------
|
||||
step temp pressure
|
||||
0 1.440000e+00 1.215639e+00
|
||||
100 8.200897e-01 6.923144e-01
|
||||
200 7.961481e-01 6.721031e-01
|
||||
----------------------------------------------------------------------------
|
||||
System: 131072 atoms 47265 ghost atoms, Steps: 200
|
||||
TOTAL 10.83s FORCE 4.62s NEIGH 5.94s REST 0.26s
|
||||
----------------------------------------------------------------------------
|
||||
Performance: 2.42 million atom updates per second
|
||||
Statistics:
|
||||
Vector width: 16, Processor frequency: 2.0000 GHz
|
||||
Average neighbors per atom: 76.0351
|
||||
Average SIMD iterations per atom: 5.0875
|
||||
Total number of computed pair interactions: 2003181259
|
||||
Total number of SIMD iterations: 134032075
|
||||
Useful read data volume for force computation: 32.79GB
|
||||
Cycles/SIMD iteration: 68.9511
|
||||
--------------------------------------------------------------------------------
|
||||
Region force, Group 1: MEM_SP
|
||||
+-------------------+------------+
|
||||
| Region Info | HWThread 0 |
|
||||
+-------------------+------------+
|
||||
| RDTSC Runtime [s] | 4.452877 |
|
||||
| call count | 201 |
|
||||
+-------------------+------------+
|
||||
|
||||
+------------------------------------------+---------+-------------+
|
||||
| Event | Counter | HWThread 0 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
| INSTR_RETIRED_ANY | FIXC0 | 7428719000 |
|
||||
| CPU_CLK_UNHALTED_CORE | FIXC1 | 8875251000 |
|
||||
| CPU_CLK_UNHALTED_REF | FIXC2 | 11094050000 |
|
||||
| PWR_PKG_ENERGY | PWR0 | 265.5057 |
|
||||
| PWR_DRAM_ENERGY | PWR3 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE | PMC0 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_SCALAR_SINGLE | PMC1 | 79036820 |
|
||||
| FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | PMC2 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | PMC3 | 3935012000 |
|
||||
| CAS_COUNT_RD | MBOX0C0 | 19716700 |
|
||||
| CAS_COUNT_WR | MBOX0C1 | 595747 |
|
||||
| CAS_COUNT_RD | MBOX1C0 | 19734880 |
|
||||
| CAS_COUNT_WR | MBOX1C1 | 597090 |
|
||||
| CAS_COUNT_RD | MBOX2C0 | 19732800 |
|
||||
| CAS_COUNT_WR | MBOX2C1 | 595219 |
|
||||
| CAS_COUNT_RD | MBOX3C0 | 19886430 |
|
||||
| CAS_COUNT_WR | MBOX3C1 | 632443 |
|
||||
| CAS_COUNT_RD | MBOX4C0 | 19887210 |
|
||||
| CAS_COUNT_WR | MBOX4C1 | 633169 |
|
||||
| CAS_COUNT_RD | MBOX5C0 | 19935560 |
|
||||
| CAS_COUNT_WR | MBOX5C1 | 634112 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
|
||||
+-----------------------------------+------------+
|
||||
| Metric | HWThread 0 |
|
||||
+-----------------------------------+------------+
|
||||
| Runtime (RDTSC) [s] | 4.4529 |
|
||||
| Runtime unhalted [s] | 3.5585 |
|
||||
| Clock [MHz] | 1995.2693 |
|
||||
| CPI | 1.1947 |
|
||||
| Energy [J] | 265.5057 |
|
||||
| Power [W] | 59.6257 |
|
||||
| Energy DRAM [J] | 0 |
|
||||
| Power DRAM [W] | 0 |
|
||||
| SP [MFLOP/s] | 14156.9661 |
|
||||
| AVX SP [MFLOP/s] | 14139.2165 |
|
||||
| Packed [MUOPS/s] | 883.7010 |
|
||||
| Scalar [MUOPS/s] | 17.7496 |
|
||||
| Memory read bandwidth [MBytes/s] | 1708.8254 |
|
||||
| Memory read data volume [GBytes] | 7.6092 |
|
||||
| Memory write bandwidth [MBytes/s] | 53.0035 |
|
||||
| Memory write data volume [GBytes] | 0.2360 |
|
||||
| Memory bandwidth [MBytes/s] | 1761.8288 |
|
||||
| Memory data volume [GBytes] | 7.8452 |
|
||||
| Operational intensity | 8.0354 |
|
||||
+-----------------------------------+------------+
|
||||
|
||||
Region reneighbour, Group 1: MEM_SP
|
||||
+-------------------+------------+
|
||||
| Region Info | HWThread 0 |
|
||||
+-------------------+------------+
|
||||
| RDTSC Runtime [s] | 5.935627 |
|
||||
| call count | 10 |
|
||||
+-------------------+------------+
|
||||
|
||||
+------------------------------------------+---------+-------------+
|
||||
| Event | Counter | HWThread 0 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
| INSTR_RETIRED_ANY | FIXC0 | 18208530000 |
|
||||
| CPU_CLK_UNHALTED_CORE | FIXC1 | 11805500000 |
|
||||
| CPU_CLK_UNHALTED_REF | FIXC2 | 14756870000 |
|
||||
| PWR_PKG_ENERGY | PWR0 | 340.7903 |
|
||||
| PWR_DRAM_ENERGY | PWR3 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE | PMC0 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_SCALAR_SINGLE | PMC1 | 6240406000 |
|
||||
| FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | PMC2 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | PMC3 | 491520 |
|
||||
| CAS_COUNT_RD | MBOX0C0 | 1772377 |
|
||||
| CAS_COUNT_WR | MBOX0C1 | 975760 |
|
||||
| CAS_COUNT_RD | MBOX1C0 | 1770611 |
|
||||
| CAS_COUNT_WR | MBOX1C1 | 977433 |
|
||||
| CAS_COUNT_RD | MBOX2C0 | 1771722 |
|
||||
| CAS_COUNT_WR | MBOX2C1 | 979122 |
|
||||
| CAS_COUNT_RD | MBOX3C0 | 1782901 |
|
||||
| CAS_COUNT_WR | MBOX3C1 | 967621 |
|
||||
| CAS_COUNT_RD | MBOX4C0 | 1780789 |
|
||||
| CAS_COUNT_WR | MBOX4C1 | 967179 |
|
||||
| CAS_COUNT_RD | MBOX5C0 | 1784733 |
|
||||
| CAS_COUNT_WR | MBOX5C1 | 969349 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
|
||||
+-----------------------------------+------------+
|
||||
| Metric | HWThread 0 |
|
||||
+-----------------------------------+------------+
|
||||
| Runtime (RDTSC) [s] | 5.9356 |
|
||||
| Runtime unhalted [s] | 4.7334 |
|
||||
| Clock [MHz] | 1995.2675 |
|
||||
| CPI | 0.6483 |
|
||||
| Energy [J] | 340.7903 |
|
||||
| Power [W] | 57.4144 |
|
||||
| Energy DRAM [J] | 0 |
|
||||
| Power DRAM [W] | 0 |
|
||||
| SP [MFLOP/s] | 1052.6723 |
|
||||
| AVX SP [MFLOP/s] | 1.3249 |
|
||||
| Packed [MUOPS/s] | 0.0828 |
|
||||
| Scalar [MUOPS/s] | 1051.3474 |
|
||||
| Memory read bandwidth [MBytes/s] | 114.9736 |
|
||||
| Memory read data volume [GBytes] | 0.6824 |
|
||||
| Memory write bandwidth [MBytes/s] | 62.9308 |
|
||||
| Memory write data volume [GBytes] | 0.3735 |
|
||||
| Memory bandwidth [MBytes/s] | 177.9044 |
|
||||
| Memory data volume [GBytes] | 1.0560 |
|
||||
| Operational intensity | 5.9171 |
|
||||
+-----------------------------------+------------+
|
||||
|
148
static_analysis/gromacs-avx512-dp-ICX-iaca.txt
Normal file
148
static_analysis/gromacs-avx512-dp-ICX-iaca.txt
Normal file
@@ -0,0 +1,148 @@
|
||||
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||
Analyzed File - gromacs-avx512-dp-ICX.o
|
||||
Binary Format - 64Bit
|
||||
Architecture - SKX
|
||||
Analysis Type - Throughput
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
Block Throughput: 47.68 Cycles Throughput Bottleneck: Backend
|
||||
Loop Count: 22
|
||||
Port Binding In Cycles Per Iteration:
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Cycles | 42.0 0.0 | 12.5 | 5.0 5.0 | 5.0 5.0 | 0.0 | 42.0 | 12.5 | 0.0 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
|
||||
DV - Divider pipe (on port 0)
|
||||
D - Data fetch pipe (on ports 2 and 3)
|
||||
F - Macro Fusion with the previous instruction occurred
|
||||
* - instruction micro-ops not bound to a port
|
||||
^ - Micro Fusion occurred
|
||||
# - ESP Tracking sync uop was issued
|
||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||
X - instruction not supported, was not accounted in Analysis
|
||||
|
||||
| Num Of | Ports pressure in cycles | |
|
||||
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
-----------------------------------------------------------------------------------------
|
||||
| 1 | | | 1.0 1.0 | | | | | | movsxd rbx, dword ptr [r12+r14*4]
|
||||
| 1 | | 1.0 | | | | | | | lea rcx, ptr [rbx+rbx*2]
|
||||
| 1 | | | | | | | 1.0 | | shl rcx, 0x6
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovapd zmm29, zmmword ptr [rsi+rcx*1]
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovapd zmm30, zmmword ptr [rsi+rcx*1+0x40]
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovapd zmm31, zmmword ptr [rsi+rcx*1+0x80]
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovupd zmm3, zmmword ptr [rsp+0x40]
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm4, zmm3, zmm29
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovupd zmm3, zmmword ptr [rsp+0x140]
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm3, zmm3, zmm30
|
||||
| 1 | | 1.0 | | | | | | | lea ecx, ptr [rbx+rbx*1]
|
||||
| 1 | | | | | | | 1.0 | | cmp rdi, rcx
|
||||
| 1 | | | | | | | 1.0 | | setnz dl
|
||||
| 1 | | | | | | | 1.0 | | setz cl
|
||||
| 1 | | 1.0 | | | | | | | lea ebx, ptr [rbx+rbx*1+0x1]
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm17, zmm25, zmm31
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm17, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm18, zmm3, zmm3
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm18, zmm4, zmm4
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm19, zmm18
|
||||
| 1 | | 1.0 | | | | | | | cmp rdi, rbx
|
||||
| 1 | | | | | | | 1.0 | | setz bl
|
||||
| 1* | | | | | | | | | mov ebp, ebx
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm20, zmm19, zmm22
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm21, zmm19, zmm19
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm20, zmm21, zmm20
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovupd zmm21, zmmword ptr [rsp+0x80]
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm21, zmm21, zmm29
|
||||
| 1 | | | | | | | 1.0 | | shl bpl, 0x4
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm1, zmm19
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm19, zmm20
|
||||
| 1 | 1.0 | | | | | | | | vaddpd zmm20, zmm20, zmm2
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm19, zmm20
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovupd zmm20, zmmword ptr [rsp+0x100]
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm20, zmm20, zmm30
|
||||
| 1 | | 1.0 | | | | | | | not bpl
|
||||
| 1 | | 1.0 | | | | | | | sub bpl, cl
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, ebp
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm18, zmm0, 0x1
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm18, zmm26, zmm31
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm15{k1}, zmm19, zmm4
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm4, zmm18, zmm18
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm4, zmm20, zmm20
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm4, zmm21, zmm21
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12{k1}, zmm19, zmm3
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm3, zmm4
|
||||
| 1 | | 1.0 | | | | | | | lea ecx, ptr [rdx+rdx*1]
|
||||
| 1* | | | | | | | | | mov eax, ebx
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm8{k1}, zmm19, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm17, zmm3, zmm22
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm3, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm17, zmm19, zmm17
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovupd zmm19, zmmword ptr [rsp+0x1c0]
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm19, zmm19, zmm29
|
||||
| 1 | | | | | | | 1.0 | | shl al, 0x5
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm1, zmm3
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm3, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vaddpd zmm17, zmm17, zmm2
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm3, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm17, zmm23, zmm30
|
||||
| 1 | | 0.5 | | | | | 0.5 | | sub cl, al
|
||||
| 1 | | 0.5 | | | | | 0.5 | | add cl, 0xfd
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, ecx
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm4, zmm0, 0x1
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm4, zmm27, zmm31
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm14{k1}, zmm3, zmm21
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm21, zmm4, zmm4
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm21, zmm17, zmm17
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm21, zmm19, zmm19
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm10{k1}, zmm3, zmm20
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm20, zmm21
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm6{k1}, zmm3, zmm18
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm20, zmm22
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm20, zmm20
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm18, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm1, zmm20
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm18, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vaddpd zmm3, zmm3, zmm2
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm18, zmm3
|
||||
| 1 | | 1.0 | | | | | | | lea eax, ptr [rdx*4]
|
||||
| 1* | | | | | | | | | mov ecx, ebx
|
||||
| 1 | | | | | | | 1.0 | | shl cl, 0x6
|
||||
| 1 | | 0.5 | | | | | 0.5 | | sub al, cl
|
||||
| 1 | | 0.5 | | | | | 0.5 | | add al, 0xfb
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, eax
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm21, zmm0, 0x1
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovupd zmm18, zmmword ptr [rsp+0x180]
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm18, zmm18, zmm29
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm20, zmm24, zmm30
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm21, zmm28, zmm31
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm16{k1}, zmm3, zmm19
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm21, zmm21
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm19, zmm20, zmm20
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm19, zmm18, zmm18
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm11{k1}, zmm3, zmm17
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm17, zmm19
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm7{k1}, zmm3, zmm4
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm17, zmm22
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm4, zmm17, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm4, zmm3
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm4, zmm1, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm4, zmm4, zmm3
|
||||
| 1 | 1.0 | | | | | | | | vaddpd zmm3, zmm3, zmm2
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm4, zmm3
|
||||
| 1 | | | | | | | 1.0 | | shl dl, 0x3
|
||||
| 1 | | | | | | | 1.0 | | shl bl, 0x7
|
||||
| 1 | | 1.0 | | | | | | | sub dl, bl
|
||||
| 1 | | 1.0 | | | | | | | add dl, 0xf7
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, edx
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm19, zmm0, 0x1
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm13{k1}, zmm3, zmm18
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm9{k1}, zmm3, zmm20
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm5{k1}, zmm3, zmm21
|
||||
| 1 | | 0.5 | | | | | 0.5 | | inc r14
|
||||
| 1* | | | | | | | | | cmp r11, r14
|
||||
| 0*F | | | | | | | | | jnz 0xfffffffffffffd99
|
||||
Total Num Of Uops: 123
|
||||
Analysis Notes:
|
||||
Backend allocation was stalled due to unavailable allocation resources.
|
159
static_analysis/gromacs-avx512-dp-ICX-osaca.txt
Normal file
159
static_analysis/gromacs-avx512-dp-ICX-osaca.txt
Normal file
@@ -0,0 +1,159 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: gromacs-avx512-dp-ICX.s
|
||||
Architecture: CSX
|
||||
Timestamp: 2023-01-03 00:07:20
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
2287 | | | | | | | | || | | .LBB5_11: #
|
||||
2288 | | | | | | | | || | | # Parent Loop BB5_6 Depth=1
|
||||
2289 | | | | | | | | || | | # => This Inner Loop Header: Depth=2
|
||||
2290 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | movslq (%r12,%r14,4), %rbx
|
||||
2291 | | 1.00 | | | | 0.00 | | || 1.0 | | leaq (%rbx,%rbx,2), %rcx
|
||||
2292 | 0.00 | | | | | | 1.00 | || 1.0 | | shlq $6, %rcx
|
||||
2293 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovapd (%rsi,%rcx), %zmm29
|
||||
2294 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovapd 64(%rsi,%rcx), %zmm30
|
||||
2295 | | | 0.50 0.50 | 0.50 0.50 | | | | || 0.0 | | vmovapd 128(%rsi,%rcx), %zmm31
|
||||
2296 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 64(%rsp), %zmm3 # 64-byte Reload
|
||||
2297 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm3, %zmm4
|
||||
2298 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 320(%rsp), %zmm3 # 64-byte Reload
|
||||
2299 | 0.75 | | | | | 0.25 | | || | | vsubpd %zmm30, %zmm3, %zmm3
|
||||
2300 | | 1.00 | | | | 0.00 | | || | | leal (%rbx,%rbx), %ecx
|
||||
2301 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rcx, %rdi
|
||||
2302 | 0.00 | | | | | | 1.00 | || | | setne %dl
|
||||
2303 | 0.00 | | | | | | 1.00 | || | | sete %cl
|
||||
2304 | | 1.00 | | | | | | || | | leal 1(%rbx,%rbx), %ebx
|
||||
2305 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm31, %zmm25, %zmm17
|
||||
2306 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm17, %zmm17, %zmm18
|
||||
2307 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm3, %zmm3, %zmm18 # zmm18 = (zmm3 * zmm3) + zmm18
|
||||
2308 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm4, %zmm4, %zmm18 # zmm18 = (zmm4 * zmm4) + zmm18
|
||||
2309 | 2.75 | | | | | 0.25 | | || 8.0 | | vrcp14pd %zmm18, %zmm19
|
||||
2310 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | cmpq %rbx, %rdi
|
||||
2311 | 0.00 | | | | | | 1.00 | || | | sete %bl
|
||||
2312 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %ebx, %ebp
|
||||
2313 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm22, %zmm19, %zmm20
|
||||
2314 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm19, %zmm19, %zmm21
|
||||
2315 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm21, %zmm20
|
||||
2316 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 128(%rsp), %zmm21 # 64-byte Reload
|
||||
2317 | 0.75 | | | | | 0.25 | | || | | vsubpd %zmm29, %zmm21, %zmm21
|
||||
2318 | 0.00 | | | | | | 1.00 | || | | shlb $4, %bpl
|
||||
2319 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm19, %zmm1, %zmm19
|
||||
2320 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm19, %zmm19
|
||||
2321 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm2, %zmm20, %zmm20
|
||||
2322 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm19, %zmm19
|
||||
2323 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 256(%rsp), %zmm20 # 64-byte Reload
|
||||
2324 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm20, %zmm20
|
||||
2325 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | notb %bpl
|
||||
2326 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | | subb %cl, %bpl
|
||||
2327 | 1.00 | | | | | | | || | | kmovd %ebp, %k1
|
||||
2328 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm18, %k1 {%k1}
|
||||
2329 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm31, %zmm26, %zmm18
|
||||
2330 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm4, %zmm19, %zmm15 {%k1} # zmm15 = (zmm19 * zmm4) + zmm15
|
||||
2331 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm18, %zmm4
|
||||
2332 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm20, %zmm20, %zmm4 # zmm4 = (zmm20 * zmm20) + zmm4
|
||||
2333 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm21, %zmm21, %zmm4 # zmm4 = (zmm21 * zmm21) + zmm4
|
||||
2334 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm3, %zmm19, %zmm12 {%k1} # zmm12 = (zmm19 * zmm3) + zmm12
|
||||
2335 | 2.25 | | | | | 0.75 | | || | | vrcp14pd %zmm4, %zmm3
|
||||
2336 | | 1.00 | | | | 0.00 | | || | | leal (%rdx,%rdx), %ecx
|
||||
2337 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %ebx, %eax
|
||||
2338 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm17, %zmm19, %zmm8 {%k1} # zmm8 = (zmm19 * zmm17) + zmm8
|
||||
2339 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm22, %zmm3, %zmm17
|
||||
2340 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm3, %zmm19
|
||||
2341 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm19, %zmm17
|
||||
2342 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 448(%rsp), %zmm19 # 64-byte Reload
|
||||
2343 | 0.75 | | | | | 0.25 | | || | | vsubpd %zmm29, %zmm19, %zmm19
|
||||
2344 | 0.00 | | | | | | 1.00 | || | | shlb $5, %al
|
||||
2345 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm1, %zmm3
|
||||
2346 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm3, %zmm3
|
||||
2347 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm2, %zmm17, %zmm17
|
||||
2348 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm3, %zmm3
|
||||
2349 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm23, %zmm17
|
||||
2350 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | subb %al, %cl
|
||||
2351 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | | addb $-3, %cl
|
||||
2352 | 1.00 | | | | | | | || | | kmovd %ecx, %k1
|
||||
2353 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm4, %k1 {%k1}
|
||||
2354 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm31, %zmm27, %zmm4
|
||||
2355 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm21, %zmm3, %zmm14 {%k1} # zmm14 = (zmm3 * zmm21) + zmm14
|
||||
2356 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm4, %zmm4, %zmm21
|
||||
2357 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm17, %zmm17, %zmm21 # zmm21 = (zmm17 * zmm17) + zmm21
|
||||
2358 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm19, %zmm19, %zmm21 # zmm21 = (zmm19 * zmm19) + zmm21
|
||||
2359 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm20, %zmm3, %zmm10 {%k1} # zmm10 = (zmm3 * zmm20) + zmm10
|
||||
2360 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm21, %zmm20
|
||||
2361 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm18, %zmm3, %zmm6 {%k1} # zmm6 = (zmm3 * zmm18) + zmm6
|
||||
2362 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm22, %zmm20, %zmm3
|
||||
2363 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm20, %zmm18
|
||||
2364 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm18, %zmm3
|
||||
2365 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm1, %zmm18
|
||||
2366 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm18, %zmm18
|
||||
2367 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm2, %zmm3, %zmm3
|
||||
2368 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm18, %zmm3
|
||||
2369 | | 1.00 | | | | 0.00 | | || | | leal (,%rdx,4), %eax
|
||||
2370 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %ebx, %ecx
|
||||
2371 | 0.00 | | | | | | 1.00 | || | | shlb $6, %cl
|
||||
2372 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | subb %cl, %al
|
||||
2373 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | addb $-5, %al
|
||||
2374 | 1.00 | | | | | | | || | | kmovd %eax, %k1
|
||||
2375 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm21, %k1 {%k1}
|
||||
2376 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 384(%rsp), %zmm18 # 64-byte Reload
|
||||
2377 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm18, %zmm18
|
||||
2378 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm24, %zmm20
|
||||
2379 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm31, %zmm28, %zmm21
|
||||
2380 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm19, %zmm3, %zmm16 {%k1} # zmm16 = (zmm3 * zmm19) + zmm16
|
||||
2381 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm21, %zmm21, %zmm19
|
||||
2382 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm20, %zmm20, %zmm19 # zmm19 = (zmm20 * zmm20) + zmm19
|
||||
2383 | 0.25 | | | | | 0.75 | | || | | vfmadd231pd %zmm18, %zmm18, %zmm19 # zmm19 = (zmm18 * zmm18) + zmm19
|
||||
2384 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm17, %zmm3, %zmm11 {%k1} # zmm11 = (zmm3 * zmm17) + zmm11
|
||||
2385 | 2.00 | | | | | 1.00 | | || | | vrcp14pd %zmm19, %zmm17
|
||||
2386 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm4, %zmm3, %zmm7 {%k1} # zmm7 = (zmm3 * zmm4) + zmm7
|
||||
2387 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm22, %zmm17, %zmm3
|
||||
2388 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm17, %zmm17, %zmm4
|
||||
2389 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm4, %zmm3
|
||||
2390 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm17, %zmm1, %zmm4
|
||||
2391 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm4, %zmm4
|
||||
2392 | 0.00 | | | | | 1.00 | | || | | vaddpd %zmm2, %zmm3, %zmm3
|
||||
2393 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm4, %zmm3
|
||||
2394 | 0.00 | | | | | | 1.00 | || | | shlb $3, %dl
|
||||
2395 | 0.00 | | | | | | 1.00 | || | | shlb $7, %bl
|
||||
2396 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | subb %bl, %dl
|
||||
2397 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | addb $-9, %dl
|
||||
2398 | 1.00 | | | | | | | || | | kmovd %edx, %k1
|
||||
2399 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm19, %k1 {%k1}
|
||||
2400 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm18, %zmm3, %zmm13 {%k1} # zmm13 = (zmm3 * zmm18) + zmm13
|
||||
2401 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm20, %zmm3, %zmm9 {%k1} # zmm9 = (zmm3 * zmm20) + zmm9
|
||||
2402 | 0.00 | | | | | 1.00 | | || | 4.0 | vfmadd231pd %zmm21, %zmm3, %zmm5 {%k1} # zmm5 = (zmm3 * zmm21) + zmm5
|
||||
2403 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | incq %r14
|
||||
2404 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %r14, %r11
|
||||
2405 | | | | | | | | || | | * jne .LBB5_11
|
||||
|
||||
40.0 14.5 5.00 5.00 5.00 5.00 40.0 14.5 50.0 4.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
2402 | 4.0 | vfmadd231pd %zmm21, %zmm3, %zmm5 {%k1} # zmm5 = (zmm3 * zmm21) + zmm5| [2402]
|
||||
2401 | 4.0 | vfmadd231pd %zmm20, %zmm3, %zmm9 {%k1} # zmm9 = (zmm3 * zmm20) + zmm9| [2401]
|
||||
2400 | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm13 {%k1} # zmm13 = (zmm3 * zmm18) + zmm13| [2400]
|
||||
2386 | 4.0 | vfmadd231pd %zmm4, %zmm3, %zmm7 {%k1} # zmm7 = (zmm3 * zmm4) + zmm7| [2386]
|
||||
2384 | 4.0 | vfmadd231pd %zmm17, %zmm3, %zmm11 {%k1} # zmm11 = (zmm3 * zmm17) + zmm11| [2384]
|
||||
2380 | 4.0 | vfmadd231pd %zmm19, %zmm3, %zmm16 {%k1} # zmm16 = (zmm3 * zmm19) + zmm16| [2380]
|
||||
2361 | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm6 {%k1} # zmm6 = (zmm3 * zmm18) + zmm6| [2361]
|
||||
2359 | 4.0 | vfmadd231pd %zmm20, %zmm3, %zmm10 {%k1} # zmm10 = (zmm3 * zmm20) + zmm10| [2359]
|
||||
2355 | 4.0 | vfmadd231pd %zmm21, %zmm3, %zmm14 {%k1} # zmm14 = (zmm3 * zmm21) + zmm14| [2355]
|
||||
2338 | 4.0 | vfmadd231pd %zmm17, %zmm19, %zmm8 {%k1} # zmm8 = (zmm19 * zmm17) + zmm8| [2338]
|
||||
2334 | 4.0 | vfmadd231pd %zmm3, %zmm19, %zmm12 {%k1} # zmm12 = (zmm19 * zmm3) + zmm12| [2334]
|
||||
2330 | 4.0 | vfmadd231pd %zmm4, %zmm19, %zmm15 {%k1} # zmm15 = (zmm19 * zmm4) + zmm15| [2330]
|
||||
2394 | 3.0 | shlb $3, %dl | [2394, 2396, 2397]
|
||||
2318 | 3.0 | shlb $4, %bpl | [2318, 2325, 2326]
|
||||
2403 | 1.0 | incq %r14 | [2403]
|
||||
|
2596
static_analysis/gromacs-avx512-dp-ICX.s
Normal file
2596
static_analysis/gromacs-avx512-dp-ICX.s
Normal file
File diff suppressed because it is too large
Load Diff
198
static_analysis/jan/analyses/gromacs-icc-avx512-dp-iaca.out
Normal file
198
static_analysis/jan/analyses/gromacs-icc-avx512-dp-iaca.out
Normal file
@@ -0,0 +1,198 @@
|
||||
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||
Analyzed File - gromacs-icc-avx512-dp.o
|
||||
Binary Format - 64Bit
|
||||
Architecture - SKX
|
||||
Analysis Type - Throughput
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
Block Throughput: 62.00 Cycles Throughput Bottleneck: Backend
|
||||
Loop Count: 22
|
||||
Port Binding In Cycles Per Iteration:
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Cycles | 58.0 0.0 | 16.0 | 16.0 15.0 | 16.0 15.0 | 2.0 | 58.0 | 16.0 | 0.0 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
|
||||
DV - Divider pipe (on port 0)
|
||||
D - Data fetch pipe (on ports 2 and 3)
|
||||
F - Macro Fusion with the previous instruction occurred
|
||||
* - instruction micro-ops not bound to a port
|
||||
^ - Micro Fusion occurred
|
||||
# - ESP Tracking sync uop was issued
|
||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||
X - instruction not supported, was not accounted in Analysis
|
||||
|
||||
| Num Of | Ports pressure in cycles | |
|
||||
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
-----------------------------------------------------------------------------------------
|
||||
| 1 | | | 1.0 1.0 | | | | | | mov edx, dword ptr [r10+rsi*4]
|
||||
| 1 | | | | | | | 1.0 | | inc rsi
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm20, zmmword ptr [rsp+0x380]
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm25, zmmword ptr [rsp+0x340]
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm24, zmmword ptr [rsp+0x1c0]
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm23, zmmword ptr [rsp+0x2c0]
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm16, zmmword ptr [rsp+0x3c0]
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm14, zmmword ptr [rsp+0x300]
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm15, zmmword ptr [rsp+0x240]
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm12, zmmword ptr [rsp+0x180]
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm21, zmmword ptr [rsp+0x200]
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm18, zmmword ptr [rsp+0x140]
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm22, zmmword ptr [rsp+0x100]
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm17, zmmword ptr [rsp+0x280]
|
||||
| 1 | | 1.0 | | | | | | | lea r12d, ptr [rdx+rdx*2]
|
||||
| 1 | | | | | | | 1.0 | | shl r12d, 0x3
|
||||
| 1 | | 1.0 | | | | | | | lea r13d, ptr [rdx+rdx*1]
|
||||
| 1 | | | | | | | 1.0 | | movsxd r12, r12d
|
||||
| 1 | | 1.0 | | | | | | | cmp r13d, r11d
|
||||
| 1 | | 1.0 | | | | | | | lea eax, ptr [rdx+rdx*1+0x1]
|
||||
| 1 | | | | | | | 1.0 | | mov edx, 0x0
|
||||
| 1 | | | | | | | 1.0 | | setz dl
|
||||
| 1 | | 1.0 | | | | | | | cmp eax, r11d
|
||||
| 1 | | | | | | | 1.0 | | mov eax, 0x0
|
||||
| 1* | | | | | | | | | mov r13d, edx
|
||||
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm29, zmm20, zmmword ptr [r8+r12*8+0x80]
|
||||
| 1 | | | | | | | 1.0 | | setz al
|
||||
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm26, zmm25, zmmword ptr [r8+r12*8+0x80]
|
||||
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm25, zmm24, zmmword ptr [r8+r12*8]
|
||||
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm24, zmm23, zmmword ptr [r8+r12*8+0x40]
|
||||
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm23, zmm16, zmmword ptr [r8+r12*8+0x80]
|
||||
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm20, zmm14, zmmword ptr [r8+r12*8+0x80]
|
||||
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm27, zmm12, zmmword ptr [r8+r12*8+0x40]
|
||||
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm28, zmm15, zmmword ptr [r8+r12*8]
|
||||
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm30, zmm21, zmmword ptr [r8+r12*8+0x40]
|
||||
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm21, zmm18, zmmword ptr [r8+r12*8+0x40]
|
||||
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm31, zmm22, zmmword ptr [r8+r12*8]
|
||||
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm22, zmm17, zmmword ptr [r8+r12*8]
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm13, zmm29, zmm29
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm15, zmm26, zmm26
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm12, zmm23, zmm23
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm14, zmm20, zmm20
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm16, zmmword ptr [rsp+0xc0]
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm13, zmm30, zmm30
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm15, zmm27, zmm27
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12, zmm24, zmm24
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm14, zmm21, zmm21
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm13, zmm31, zmm31
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm15, zmm28, zmm28
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12, zmm25, zmm25
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm14, zmm22, zmm22
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm19, zmm13
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm18, zmm15
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm17, zmm12
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k1, zmm13, zmm16, 0x11
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k6, zmm15, zmm16, 0x11
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k7, zmm12, zmm16, 0x11
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k0, zmm14, zmm16, 0x11
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm15, zmm14
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm16, zmmword ptr [rsp+0x40]
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm12, zmmword ptr [rsp+0x80]
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm13, zmm19, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm13, zmm19, zmm13
|
||||
| 1 | | 1.0 | | | | | | | neg r13d
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm19, zmm13
|
||||
| 1* | | | | | | | | | mov r12d, eax
|
||||
| 1 | | | | | | 1.0 | | | vfmsub213pd zmm13, zmm19, zmm1
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm19, zmm12
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm13, zmm13, zmm19
|
||||
| 1 | | 1.0 | | | | | | | add r13d, 0xff
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm14, zmm13
|
||||
| 1 | | | | | | | 1.0 | | nop
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm13, zmmword ptr [rsp+0x400]
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm10, zmm14
|
||||
| 1 | | | | | | | 1.0 | | shl r12d, 0x4
|
||||
| 1 | | 1.0 | | | | | | | sub r13d, r12d
|
||||
| 1 | | | | | | 1.0 | | | kmovb k5, r13d
|
||||
| 1 | 1.0 | | | | | | | | kmovw r13d, k1
|
||||
| 1 | 1.0 | | | | | | | | kmovb r12d, k5
|
||||
| 1 | | | | | | 1.0 | | | kmovb k5, r12d
|
||||
| 1 | | | | | | 1.0 | | | kmovb k1, r13d
|
||||
| 1* | | | | | | | | | mov r13d, eax
|
||||
| 1 | 1.0 | | | | | | | | kandb k5, k5, k1
|
||||
| 1 | 1.0 | | | | | | | | kmovb r12d, k5
|
||||
| 1 | | | | | | 1.0 | | | kmovw k5, r12d
|
||||
| 1 | | 1.0 | | | | | | | lea r12d, ptr [rdx+rdx*1]
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm9{k5}, zmm19, zmm29
|
||||
| 1 | | | | | | | 1.0 | | neg r12d
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm13{k5}, zmm19, zmm31
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm31, zmmword ptr [rsp+0x440]
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm29, zmm18, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm31{k5}, zmm19, zmm30
|
||||
| 2^ | | | 1.0 | | 1.0 | | | | vmovups zmmword ptr [rsp+0x400], zmm13
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm30, zmm18, zmm29
|
||||
| 2^ | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [rsp+0x440], zmm31
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm13, zmm18, zmm30
|
||||
| 1 | 1.0 | | | | | | | | vfmsub213pd zmm30, zmm18, zmm1
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm18, zmm12
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm30, zmm18
|
||||
| 1 | | 1.0 | | | | | | | add r12d, 0xff
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm13, zmm14
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm29, zmm10, zmm19
|
||||
| 1 | | | | | | | 1.0 | | shl r13d, 0x5
|
||||
| 1 | | 1.0 | | | | | | | sub r12d, r13d
|
||||
| 1 | | | | | | 1.0 | | | kmovb k1, r12d
|
||||
| 1 | 1.0 | | | | | | | | kmovw r12d, k6
|
||||
| 1 | 1.0 | | | | | | | | kmovb r13d, k1
|
||||
| 1 | | | | | | 1.0 | | | kmovb k1, r13d
|
||||
| 1 | | | | | | 1.0 | | | kmovb k6, r12d
|
||||
| 1* | | | | | | | | | mov r12d, eax
|
||||
| 1 | 1.0 | | | | | | | | kandb k1, k1, k6
|
||||
| 1 | 1.0 | | | | | | | | kmovb r13d, k1
|
||||
| 1 | | | | | | 1.0 | | | kmovw k1, r13d
|
||||
| 1 | | 1.0 | | | | | | | lea r13d, ptr [rdx*4]
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm6{k1}, zmm29, zmm26
|
||||
| 1 | | | | | | | 1.0 | | neg r13d
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm7{k1}, zmm29, zmm27
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm8{k1}, zmm29, zmm28
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm26, zmm17, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm28, zmm17, zmm12
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm15, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm12, zmm15, zmm12
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm27, zmm17, zmm26
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm15, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm13, zmm17, zmm27
|
||||
| 1 | | | | | | 1.0 | | | vfmsub213pd zmm27, zmm17, zmm1
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm27, zmm28
|
||||
| 1 | | | | | | | 1.0 | | add r13d, 0xff
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm17, zmm13, zmm14
|
||||
| 1 | | | | | | | 1.0 | | shl edx, 0x3
|
||||
| 1 | | | | | | | 1.0 | | shl r12d, 0x6
|
||||
| 1 | | 1.0 | | | | | | | neg edx
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm10, zmm17
|
||||
| 1 | | 1.0 | | | | | | | sub r13d, r12d
|
||||
| 1 | | | | | | 1.0 | | | kmovb k6, r13d
|
||||
| 1 | | 1.0 | | | | | | | add edx, 0xff
|
||||
| 1 | | | | | | | 1.0 | | shl eax, 0x7
|
||||
| 1 | | 1.0 | | | | | | | sub edx, eax
|
||||
| 1 | 1.0 | | | | | | | | kmovb eax, k6
|
||||
| 1 | | | | | | 1.0 | | | kmovb k6, eax
|
||||
| 1 | 1.0 | | | | | | | | kmovw eax, k7
|
||||
| 1 | | | | | | 1.0 | | | kmovb k7, eax
|
||||
| 1 | 1.0 | | | | | | | | kandb k7, k6, k7
|
||||
| 1 | | | | | | 1.0 | | | kmovb k6, edx
|
||||
| 1 | 1.0 | | | | | | | | kmovb edx, k7
|
||||
| 1 | | | | | | 1.0 | | | kmovw k7, edx
|
||||
| 1 | 1.0 | | | | | | | | kmovw edx, k0
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm11{k7}, zmm18, zmm23
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm4{k7}, zmm18, zmm24
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm5{k7}, zmm18, zmm25
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm23, zmm15, zmm19
|
||||
| 1 | | | | | | 1.0 | | | vfmsub213pd zmm19, zmm15, zmm1
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm15, zmm19, zmm12
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm24, zmm23, zmm15
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm25, zmm10, zmm24
|
||||
| 1 | 1.0 | | | | | | | | kmovb eax, k6
|
||||
| 1 | | | | | | 1.0 | | | kmovb k6, eax
|
||||
| 1 | | | | | | 1.0 | | | kmovb k0, edx
|
||||
| 1 | 1.0 | | | | | | | | kandb k0, k6, k0
|
||||
| 1 | 1.0 | | | | | | | | kmovb r12d, k0
|
||||
| 1 | | | | | | 1.0 | | | kmovw k6, r12d
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm3{k6}, zmm25, zmm22
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm2{k6}, zmm25, zmm21
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm0{k6}, zmm25, zmm20
|
||||
| 1* | | | | | | | | | cmp rsi, rdi
|
||||
| 0*F | | | | | | | | | jl 0xfffffffffffffc6f
|
||||
Total Num Of Uops: 187
|
||||
Analysis Notes:
|
||||
Backend allocation was stalled due to unavailable allocation resources.
|
152
static_analysis/jan/analyses/gromacs-icc-avx512-sp-iaca.out
Normal file
152
static_analysis/jan/analyses/gromacs-icc-avx512-sp-iaca.out
Normal file
@@ -0,0 +1,152 @@
|
||||
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||
Analyzed File - gromacs-icc-avx512-sp.o
|
||||
Binary Format - 64Bit
|
||||
Architecture - SKX
|
||||
Analysis Type - Throughput
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
Block Throughput: 51.00 Cycles Throughput Bottleneck: Backend
|
||||
Loop Count: 22
|
||||
Port Binding In Cycles Per Iteration:
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Cycles | 47.5 0.0 | 9.0 | 11.0 11.0 | 11.0 8.0 | 3.0 | 47.5 | 9.0 | 0.0 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
|
||||
DV - Divider pipe (on port 0)
|
||||
D - Data fetch pipe (on ports 2 and 3)
|
||||
F - Macro Fusion with the previous instruction occurred
|
||||
* - instruction micro-ops not bound to a port
|
||||
^ - Micro Fusion occurred
|
||||
# - ESP Tracking sync uop was issued
|
||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||
X - instruction not supported, was not accounted in Analysis
|
||||
|
||||
| Num Of | Ports pressure in cycles | |
|
||||
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
-----------------------------------------------------------------------------------------
|
||||
| 1 | | | 1.0 1.0 | | | | | | mov edi, dword ptr [rcx+rax*4]
|
||||
| 1* | | | | | | | | | mov r12d, r13d
|
||||
| 1 | | | | | | | 1.0 | | movsxd rdi, edi
|
||||
| 1 | | 1.0 | | | | | | | inc rax
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm10, zmmword ptr [rsp+0x140]
|
||||
| 1 | | | | | | | 1.0 | | test edi, 0x7fffffff
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm11, zmmword ptr [rsp+0x100]
|
||||
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm9, zmmword ptr [rsp+0xc0]
|
||||
| 1 | | | | | | | 1.0 | | setz r12b
|
||||
| 1 | | 1.0 | | | | | | | lea r14, ptr [rdi+rdi*2]
|
||||
| 1 | | | | | | | 1.0 | | shl r14, 0x5
|
||||
| 1* | | | | | | | | | mov r8d, r12d
|
||||
| 1 | | 1.0 | | | | | | | neg r8d
|
||||
| 1* | | | | | | | | | mov r11d, r12d
|
||||
| 1 | | 1.0 | | | | | | | add r8d, 0xff
|
||||
| 1 | | | | | | 1.0 | | | kmovw k0, r8d
|
||||
| 1 | | 1.0 | | | | | | | lea r9d, ptr [r12+r12*2]
|
||||
| 2 | 1.0 | | 1.0 1.0 | | | | | | vsubps zmm3, zmm13, zmmword ptr [r14+rbx*1+0x40]
|
||||
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm4, zmm10, zmmword ptr [r14+rbx*1]
|
||||
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm10, zmm11, zmmword ptr [r14+rbx*1+0x40]
|
||||
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm5, zmm17, zmmword ptr [r14+rbx*1+0x20]
|
||||
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm27, zmm19, zmmword ptr [r14+rbx*1]
|
||||
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm8, zmm15, zmmword ptr [r14+rbx*1+0x20]
|
||||
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm11, zmm0, zmmword ptr [r14+rbx*1+0x40]
|
||||
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm7, zmm9, zmmword ptr [r14+rbx*1]
|
||||
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm9, zmm14, zmmword ptr [r14+rbx*1+0x20]
|
||||
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm29, zmm12, zmmword ptr [r14+rbx*1+0x40]
|
||||
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm28, zmm16, zmmword ptr [r14+rbx*1+0x20]
|
||||
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm25, zmm18, zmmword ptr [r14+rbx*1]
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm2, zmm3, zmm3
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm10, zmm10
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm11, zmm11
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm26, zmm29, zmm29
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm2, zmm5, zmm5
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30, zmm8, zmm8
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm1, zmm9, zmm9
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm26, zmm28, zmm28
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm2, zmm27, zmm27
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30, zmm4, zmm4
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm1, zmm7, zmm7
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm26, zmm25, zmm25
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm31, zmm2
|
||||
| 1 | | | | | | 1.0 | | | vcmpps k7, zmm30, zmm24, 0x11
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm6, zmm30
|
||||
| 1 | | | | | | 1.0 | | | vcmpps k3, zmm2, zmm24, 0x11
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm2, zmm1
|
||||
| 1 | | | | | | 1.0 | | | vcmpps k5, zmm26, zmm24, 0x11
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm26, zmm26
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm30, zmm31, zmm23
|
||||
| 1 | 1.0 | | | | | | | | kandw k2, k0, k3
|
||||
| 1 | | | | | | 1.0 | | | vcmpps k3, zmm1, zmm24, 0x11
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm31, zmm30
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm31, zmm1
|
||||
| 1 | | | | | | | 1.0 | | neg r9d
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm1, zmm31, zmm20
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm31, zmm22
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm1, zmm31
|
||||
| 1 | | | | | | | 1.0 | | add r9d, 0xff
|
||||
| 1 | | | | | | 1.0 | | | kmovw k4, r9d
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm30, zmm30, zmm31
|
||||
| 1 | 1.0 | | | | | | | | kandw k1, k4, k5
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm1, zmm21, zmm30
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm26, zmm23
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm26, zmm30
|
||||
| 1 | | 1.0 | | | | | | | lea r10d, ptr [r12*8]
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm26, zmm31
|
||||
| 1 | | | | | | | 1.0 | | neg r10d
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm31, zmm26, zmm20
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm26, zmm26, zmm22
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm31, zmm26
|
||||
| 1 | | 1.0 | | | | | | | add r10d, r12d
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm30, zmm31
|
||||
| 1 | | | | | | | 1.0 | | add r10d, 0xff
|
||||
| 1 | | | | | | 1.0 | | | kmovw k6, r10d
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm26, zmm21, zmm30
|
||||
| 1 | 1.0 | | | | | | | | kandw k4, k6, k7
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm25{k1}{z}, zmm25, zmm26
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31{k1}{z}, zmm28, zmm26
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm28, zmm6, zmm23
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30{k1}{z}, zmm29, zmm26
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm29, zmm2, zmm23
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm25{k2}, zmm27, zmm1
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm31{k2}, zmm5, zmm1
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm27, zmm6, zmm28
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30{k2}, zmm3, zmm1
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm2, zmm29
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm5, zmm6, zmm27
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm27, zmm6, zmm20
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm6, zmm6, zmm22
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm3, zmm2, zmm1
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm1, zmm2, zmm20
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm2, zmm2, zmm22
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm26, zmm27, zmm6
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm1, zmm2
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm5, zmm5, zmm26
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm3, zmm3, zmm1
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm6, zmm21, zmm5
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm27, zmm21, zmm3
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm25{k4}, zmm4, zmm6
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm4, zmmword ptr [r14+rsi*1]
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm31{k4}, zmm8, zmm6
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30{k4}, zmm10, zmm6
|
||||
| 1 | | | | | | | 1.0 | | shl r11d, 0x4
|
||||
| 1 | | 1.0 | | | | | | | sub r12d, r11d
|
||||
| 1 | | 1.0 | | | | | | | add r12d, 0xff
|
||||
| 1 | | | | | | 1.0 | | | kmovw k0, r12d
|
||||
| 1 | 1.0 | | | | | | | | kandw k5, k0, k3
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm25{k5}, zmm7, zmm27
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm31{k5}, zmm9, zmm27
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30{k5}, zmm11, zmm27
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vsubps zmm7, zmm4, zmm25
|
||||
| 2 | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [r14+rsi*1], zmm7
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm8, zmmword ptr [r14+rsi*1+0x20]
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vsubps zmm4, zmm8, zmm31
|
||||
| 2 | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [r14+rsi*1+0x20], zmm4
|
||||
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm1, zmmword ptr [r14+rsi*1+0x40]
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vsubps zmm2, zmm1, zmm30
|
||||
| 2 | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [r14+rsi*1+0x40], zmm2
|
||||
| 1* | | | | | | | | | cmp rax, rdx
|
||||
| 0*F | | | | | | | | | jb 0xfffffffffffffd30
|
||||
Total Num Of Uops: 142
|
||||
Analysis Notes:
|
||||
Backend allocation was stalled due to unavailable allocation resources.
|
154
static_analysis/jan/analyses/gromacs-icx-avx512-dp-iaca.out
Normal file
154
static_analysis/jan/analyses/gromacs-icx-avx512-dp-iaca.out
Normal file
@@ -0,0 +1,154 @@
|
||||
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||
Analyzed File - gromacs-icx-avx512-dp.o
|
||||
Binary Format - 64Bit
|
||||
Architecture - SKX
|
||||
Analysis Type - Throughput
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
Block Throughput: 49.26 Cycles Throughput Bottleneck: Backend
|
||||
Loop Count: 22
|
||||
Port Binding In Cycles Per Iteration:
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Cycles | 44.0 0.0 | 13.5 | 5.5 5.5 | 5.5 5.5 | 0.0 | 44.0 | 13.5 | 0.0 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
|
||||
DV - Divider pipe (on port 0)
|
||||
D - Data fetch pipe (on ports 2 and 3)
|
||||
F - Macro Fusion with the previous instruction occurred
|
||||
* - instruction micro-ops not bound to a port
|
||||
^ - Micro Fusion occurred
|
||||
# - ESP Tracking sync uop was issued
|
||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||
X - instruction not supported, was not accounted in Analysis
|
||||
|
||||
| Num Of | Ports pressure in cycles | |
|
||||
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
-----------------------------------------------------------------------------------------
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | movsxd rcx, dword ptr [r10+rbx*4]
|
||||
| 1 | | 1.0 | | | | | | | lea rdx, ptr [rcx+rcx*2]
|
||||
| 1 | | | | | | | 1.0 | | shl rdx, 0x6
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm28, zmmword ptr [rsi+rdx*1]
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm29, zmmword ptr [rsi+rdx*1+0x40]
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm30, zmmword ptr [rsi+rdx*1+0x80]
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm3, zmmword ptr [rsp+0x10]
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm3, zmm3, zmm28
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm31, zmm24, zmm30
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm16, zmmword ptr [rsp+0x150]
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm16, zmm16, zmm29
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm17, zmm31, zmm31
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm17, zmm16, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm17, zmm3, zmm3
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm18, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm21, zmm18
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm18, zmm19
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm18, zmm19
|
||||
| 1 | | | | | | 1.0 | | | vaddpd zmm20, zmm19, zmm1
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm22, zmm18
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm18, zmm20
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm20, zmm25, zmm30
|
||||
| 1 | | 1.0 | | | | | | | lea edx, ptr [rcx+rcx*1]
|
||||
| 1 | | | | | | | 1.0 | | cmp r11, rdx
|
||||
| 1 | | | | | | | 1.0 | | setnz dl
|
||||
| 1 | | | | | | | 1.0 | | setz al
|
||||
| 1 | | 1.0 | | | | | | | add ecx, ecx
|
||||
| 1 | | 1.0 | | | | | | | inc ecx
|
||||
| 1 | | 0.5 | | | | | 0.5 | | cmp r11, rcx
|
||||
| 1 | | | | | | | 1.0 | | setz cl
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm19, zmm18
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm19, zmmword ptr [rsp+0x210]
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm19, zmm19, zmm28
|
||||
| 1 | | | | | | | 1.0 | | setnz dil
|
||||
| 1* | | | | | | | | | mov ebp, edi
|
||||
| 1 | | | | | | | 1.0 | | shl bpl, 0x4
|
||||
| 1 | | 1.0 | | | | | | | sub bpl, al
|
||||
| 1 | | 1.0 | | | | | | | add bpl, 0xef
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, ebp
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm17, zmm0, 0x1
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm17, zmmword ptr [rsp+0x110]
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm17, zmm17, zmm29
|
||||
| 1 | | 1.0 | | | | | | | lea eax, ptr [rdx+rdx*1]
|
||||
| 1* | | | | | | | | | mov ebp, edi
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm18, zmm2
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm14{k1}, zmm3, zmm18
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm20, zmm20
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm3, zmm17, zmm17
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm3, zmm19, zmm19
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm11{k1}, zmm16, zmm18
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm16, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm7{k1}, zmm31, zmm18
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm21, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm16, zmm18
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm16, zmm18
|
||||
| 1 | | | | | | 1.0 | | | vaddpd zmm31, zmm18, zmm1
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm22, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm16, zmm16, zmm31
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm31, zmmword ptr [rsp+0x1d0]
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm31, zmm31, zmm28
|
||||
| 1 | | | | | | | 1.0 | | shl bpl, 0x5
|
||||
| 1 | | 1.0 | | | | | | | or bpl, al
|
||||
| 1 | | 0.5 | | | | | 0.5 | | or bpl, 0xdd
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, ebp
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm3, zmm0, 0x1
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm3, zmmword ptr [rsp+0xd0]
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm3, zmm3, zmm29
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm18, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm18, zmm26, zmm30
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm16, zmm2
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm15{k1}, zmm19, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm18, zmm18
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm19, zmm3, zmm3
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm19, zmm31, zmm31
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm10{k1}, zmm17, zmm16
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm17, zmm19
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm6{k1}, zmm20, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm21, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm16, zmm17, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm17, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vaddpd zmm20, zmm16, zmm1
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm17, zmm22, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm17, zmm17, zmm20
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm16, zmm17
|
||||
| 1 | | 1.0 | | | | | | | lea eax, ptr [rdx*4]
|
||||
| 1 | | | | | | | 1.0 | | shl dil, 0x6
|
||||
| 1 | | 0.5 | | | | | 0.5 | | or dil, al
|
||||
| 1 | | 0.5 | | | | | 0.5 | | or dil, 0xbb
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, edi
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm19, zmm0, 0x1
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm17, zmmword ptr [rsp+0x190]
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm17, zmm17, zmm28
|
||||
| 1 | 1.0 | | | | | | | | vsubpd zmm19, zmm23, zmm29
|
||||
| 1 | | | | | | 1.0 | | | vsubpd zmm20, zmm27, zmm30
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm16, zmm2
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm13{k1}, zmm31, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm28, zmm20, zmm20
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm28, zmm19, zmm19
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm28, zmm17, zmm17
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm9{k1}, zmm3, zmm16
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm3, zmm28
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm5{k1}, zmm18, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm21, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm16, zmm3, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm3, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vaddpd zmm18, zmm16, zmm1
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm22, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm3, zmm18
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm16, zmm3
|
||||
| 1 | | | | | | | 1.0 | | shl dl, 0x3
|
||||
| 1 | | | | | | | 1.0 | | shl cl, 0x7
|
||||
| 1 | | 1.0 | | | | | | | or cl, dl
|
||||
| 1 | | 1.0 | | | | | | | add cl, 0xf7
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, ecx
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm28, zmm0, 0x1
|
||||
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm3, zmm2
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12{k1}, zmm17, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm8{k1}, zmm19, zmm3
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm4{k1}, zmm20, zmm3
|
||||
| 1 | | 0.5 | | | | | 0.5 | | inc rbx
|
||||
| 1* | | | | | | | | | cmp r9, rbx
|
||||
| 0*F | | | | | | | | | jnz 0xfffffffffffffd5a
|
||||
Total Num Of Uops: 129
|
||||
Analysis Notes:
|
||||
Backend allocation was stalled due to unavailable allocation resources.
|
288
static_analysis/jan/analyses/gromacs-icx-avx512-dp-mca.out
Normal file
288
static_analysis/jan/analyses/gromacs-icx-avx512-dp-mca.out
Normal file
@@ -0,0 +1,288 @@
|
||||
|
||||
[0] Code Region
|
||||
|
||||
Iterations: 100
|
||||
Instructions: 12200
|
||||
Total Cycles: 4745
|
||||
Total uOps: 14000
|
||||
|
||||
Dispatch Width: 6
|
||||
uOps Per Cycle: 2.95
|
||||
IPC: 2.57
|
||||
Block RThroughput: 34.0
|
||||
|
||||
|
||||
Instruction Info:
|
||||
[1]: #uOps
|
||||
[2]: Latency
|
||||
[3]: RThroughput
|
||||
[4]: MayLoad
|
||||
[5]: MayStore
|
||||
[6]: HasSideEffects (U)
|
||||
|
||||
[1] [2] [3] [4] [5] [6] Instructions:
|
||||
1 5 0.50 * movslq (%r10,%rbx,4), %rcx
|
||||
1 1 0.50 leaq (%rcx,%rcx,2), %rdx
|
||||
1 1 0.50 shlq $6, %rdx
|
||||
2 8 0.50 * vmovupd (%rsi,%rdx), %zmm28
|
||||
2 8 0.50 * vmovupd 64(%rsi,%rdx), %zmm29
|
||||
2 8 0.50 * vmovupd 128(%rsi,%rdx), %zmm30
|
||||
2 8 0.50 * vmovupd 16(%rsp), %zmm3
|
||||
1 4 0.50 vsubpd %zmm28, %zmm3, %zmm3
|
||||
1 4 0.50 vsubpd %zmm30, %zmm24, %zmm31
|
||||
2 8 0.50 * vmovupd 336(%rsp), %zmm16
|
||||
1 4 0.50 vsubpd %zmm29, %zmm16, %zmm16
|
||||
1 4 0.50 vmulpd %zmm31, %zmm31, %zmm17
|
||||
1 4 0.50 vfmadd231pd %zmm16, %zmm16, %zmm17
|
||||
1 4 0.50 vfmadd231pd %zmm3, %zmm3, %zmm17
|
||||
3 4 2.00 vrcp14pd %zmm17, %zmm18
|
||||
1 4 0.50 vmulpd %zmm18, %zmm21, %zmm19
|
||||
1 4 0.50 vmulpd %zmm19, %zmm18, %zmm19
|
||||
1 4 0.50 vmulpd %zmm19, %zmm18, %zmm19
|
||||
1 4 0.50 vaddpd %zmm1, %zmm19, %zmm20
|
||||
1 4 0.50 vmulpd %zmm18, %zmm22, %zmm18
|
||||
1 4 0.50 vmulpd %zmm20, %zmm18, %zmm18
|
||||
1 4 0.50 vsubpd %zmm30, %zmm25, %zmm20
|
||||
1 1 0.50 leal (%rcx,%rcx), %edx
|
||||
1 1 0.25 cmpq %rdx, %r11
|
||||
1 1 0.50 setne %dl
|
||||
1 1 0.50 sete %al
|
||||
1 1 0.25 addl %ecx, %ecx
|
||||
1 1 0.25 incl %ecx
|
||||
1 1 0.25 cmpq %rcx, %r11
|
||||
1 1 0.50 sete %cl
|
||||
1 4 0.50 vmulpd %zmm18, %zmm19, %zmm18
|
||||
2 8 0.50 * vmovupd 528(%rsp), %zmm19
|
||||
1 4 0.50 vsubpd %zmm28, %zmm19, %zmm19
|
||||
1 1 0.50 setne %dil
|
||||
1 1 0.25 movl %edi, %ebp
|
||||
1 1 0.50 shlb $4, %bpl
|
||||
1 1 0.25 subb %al, %bpl
|
||||
1 1 0.25 addb $-17, %bpl
|
||||
1 1 1.00 kmovd %ebp, %k1
|
||||
1 4 1.00 vcmpltpd %zmm0, %zmm17, %k1 {%k1}
|
||||
2 8 0.50 * vmovupd 272(%rsp), %zmm17
|
||||
1 4 0.50 vsubpd %zmm29, %zmm17, %zmm17
|
||||
1 1 0.50 leal (%rdx,%rdx), %eax
|
||||
1 1 0.25 movl %edi, %ebp
|
||||
1 4 0.50 vmulpd %zmm2, %zmm18, %zmm18
|
||||
1 4 0.50 vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1}
|
||||
1 4 0.50 vmulpd %zmm20, %zmm20, %zmm3
|
||||
1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm3
|
||||
1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm3
|
||||
1 4 0.50 vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1}
|
||||
3 4 2.00 vrcp14pd %zmm3, %zmm16
|
||||
1 4 0.50 vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1}
|
||||
1 4 0.50 vmulpd %zmm16, %zmm21, %zmm18
|
||||
1 4 0.50 vmulpd %zmm18, %zmm16, %zmm18
|
||||
1 4 0.50 vmulpd %zmm18, %zmm16, %zmm18
|
||||
1 4 0.50 vaddpd %zmm1, %zmm18, %zmm31
|
||||
1 4 0.50 vmulpd %zmm16, %zmm22, %zmm16
|
||||
1 4 0.50 vmulpd %zmm31, %zmm16, %zmm16
|
||||
2 8 0.50 * vmovupd 464(%rsp), %zmm31
|
||||
1 4 0.50 vsubpd %zmm28, %zmm31, %zmm31
|
||||
1 1 0.50 shlb $5, %bpl
|
||||
1 1 0.25 orb %al, %bpl
|
||||
1 1 0.25 orb $-35, %bpl
|
||||
1 1 1.00 kmovd %ebp, %k1
|
||||
1 4 1.00 vcmpltpd %zmm0, %zmm3, %k1 {%k1}
|
||||
2 8 0.50 * vmovupd 208(%rsp), %zmm3
|
||||
1 4 0.50 vsubpd %zmm29, %zmm3, %zmm3
|
||||
1 4 0.50 vmulpd %zmm16, %zmm18, %zmm16
|
||||
1 4 0.50 vsubpd %zmm30, %zmm26, %zmm18
|
||||
1 4 0.50 vmulpd %zmm2, %zmm16, %zmm16
|
||||
1 4 0.50 vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1}
|
||||
1 4 0.50 vmulpd %zmm18, %zmm18, %zmm19
|
||||
1 4 0.50 vfmadd231pd %zmm3, %zmm3, %zmm19
|
||||
1 4 0.50 vfmadd231pd %zmm31, %zmm31, %zmm19
|
||||
1 4 0.50 vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1}
|
||||
3 4 2.00 vrcp14pd %zmm19, %zmm17
|
||||
1 4 0.50 vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1}
|
||||
1 4 0.50 vmulpd %zmm17, %zmm21, %zmm16
|
||||
1 4 0.50 vmulpd %zmm16, %zmm17, %zmm16
|
||||
1 4 0.50 vmulpd %zmm16, %zmm17, %zmm16
|
||||
1 4 0.50 vaddpd %zmm1, %zmm16, %zmm20
|
||||
1 4 0.50 vmulpd %zmm17, %zmm22, %zmm17
|
||||
1 4 0.50 vmulpd %zmm20, %zmm17, %zmm17
|
||||
1 4 0.50 vmulpd %zmm17, %zmm16, %zmm16
|
||||
1 1 0.50 leal (,%rdx,4), %eax
|
||||
1 1 0.50 shlb $6, %dil
|
||||
1 1 0.25 orb %al, %dil
|
||||
1 1 0.25 orb $-69, %dil
|
||||
1 1 1.00 kmovd %edi, %k1
|
||||
1 4 1.00 vcmpltpd %zmm0, %zmm19, %k1 {%k1}
|
||||
2 8 0.50 * vmovupd 400(%rsp), %zmm17
|
||||
1 4 0.50 vsubpd %zmm28, %zmm17, %zmm17
|
||||
1 4 0.50 vsubpd %zmm29, %zmm23, %zmm19
|
||||
1 4 0.50 vsubpd %zmm30, %zmm27, %zmm20
|
||||
1 4 0.50 vmulpd %zmm2, %zmm16, %zmm16
|
||||
1 4 0.50 vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1}
|
||||
1 4 0.50 vmulpd %zmm20, %zmm20, %zmm28
|
||||
1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm28
|
||||
1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm28
|
||||
1 4 0.50 vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1}
|
||||
3 4 2.00 vrcp14pd %zmm28, %zmm3
|
||||
1 4 0.50 vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1}
|
||||
1 4 0.50 vmulpd %zmm3, %zmm21, %zmm16
|
||||
1 4 0.50 vmulpd %zmm16, %zmm3, %zmm16
|
||||
1 4 0.50 vmulpd %zmm16, %zmm3, %zmm16
|
||||
1 4 0.50 vaddpd %zmm1, %zmm16, %zmm18
|
||||
1 4 0.50 vmulpd %zmm3, %zmm22, %zmm3
|
||||
1 4 0.50 vmulpd %zmm18, %zmm3, %zmm3
|
||||
1 4 0.50 vmulpd %zmm3, %zmm16, %zmm3
|
||||
1 1 0.50 shlb $3, %dl
|
||||
1 1 0.50 shlb $7, %cl
|
||||
1 1 0.25 orb %dl, %cl
|
||||
1 1 0.25 addb $-9, %cl
|
||||
1 1 1.00 kmovd %ecx, %k1
|
||||
1 4 1.00 vcmpltpd %zmm0, %zmm28, %k1 {%k1}
|
||||
1 4 0.50 vmulpd %zmm2, %zmm3, %zmm3
|
||||
1 4 0.50 vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1}
|
||||
1 4 0.50 vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1}
|
||||
1 4 0.50 vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1}
|
||||
1 1 0.25 incq %rbx
|
||||
1 1 0.25 cmpq %rbx, %r9
|
||||
1 1 0.50 jne .LBB5_12
|
||||
|
||||
|
||||
Resources:
|
||||
[0] - SKXDivider
|
||||
[1] - SKXFPDivider
|
||||
[2] - SKXPort0
|
||||
[3] - SKXPort1
|
||||
[4] - SKXPort2
|
||||
[5] - SKXPort3
|
||||
[6] - SKXPort4
|
||||
[7] - SKXPort5
|
||||
[8] - SKXPort6
|
||||
[9] - SKXPort7
|
||||
|
||||
|
||||
Resource pressure per iteration:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
|
||||
- - 45.53 20.45 5.50 5.50 - 44.64 18.38 -
|
||||
|
||||
Resource pressure by instruction:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
|
||||
- - - - 0.50 0.50 - - - - movslq (%r10,%rbx,4), %rcx
|
||||
- - - 0.99 - - - 0.01 - - leaq (%rcx,%rcx,2), %rdx
|
||||
- - 0.01 - - - - - 0.99 - shlq $6, %rdx
|
||||
- - 0.01 0.99 0.49 0.51 - - - - vmovupd (%rsi,%rdx), %zmm28
|
||||
- - 0.01 0.91 0.51 0.49 - 0.08 - - vmovupd 64(%rsi,%rdx), %zmm29
|
||||
- - 0.01 0.56 0.49 0.51 - 0.43 - - vmovupd 128(%rsi,%rdx), %zmm30
|
||||
- - - 0.99 0.50 0.50 - 0.01 - - vmovupd 16(%rsp), %zmm3
|
||||
- - 0.95 - - - - 0.05 - - vsubpd %zmm28, %zmm3, %zmm3
|
||||
- - 0.48 - - - - 0.52 - - vsubpd %zmm30, %zmm24, %zmm31
|
||||
- - - 1.00 0.50 0.50 - - - - vmovupd 336(%rsp), %zmm16
|
||||
- - 0.49 - - - - 0.51 - - vsubpd %zmm29, %zmm16, %zmm16
|
||||
- - 0.48 - - - - 0.52 - - vmulpd %zmm31, %zmm31, %zmm17
|
||||
- - 0.49 - - - - 0.51 - - vfmadd231pd %zmm16, %zmm16, %zmm17
|
||||
- - 0.04 - - - - 0.96 - - vfmadd231pd %zmm3, %zmm3, %zmm17
|
||||
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm17, %zmm18
|
||||
- - 1.00 - - - - - - - vmulpd %zmm18, %zmm21, %zmm19
|
||||
- - 0.51 - - - - 0.49 - - vmulpd %zmm19, %zmm18, %zmm19
|
||||
- - 0.49 - - - - 0.51 - - vmulpd %zmm19, %zmm18, %zmm19
|
||||
- - 0.07 - - - - 0.93 - - vaddpd %zmm1, %zmm19, %zmm20
|
||||
- - - - - - - 1.00 - - vmulpd %zmm18, %zmm22, %zmm18
|
||||
- - 0.95 - - - - 0.05 - - vmulpd %zmm20, %zmm18, %zmm18
|
||||
- - 0.92 - - - - 0.08 - - vsubpd %zmm30, %zmm25, %zmm20
|
||||
- - - 0.94 - - - 0.06 - - leal (%rcx,%rcx), %edx
|
||||
- - - - - - - - 1.00 - cmpq %rdx, %r11
|
||||
- - - - - - - - 1.00 - setne %dl
|
||||
- - 0.44 - - - - - 0.56 - sete %al
|
||||
- - - 0.07 - - - 0.02 0.91 - addl %ecx, %ecx
|
||||
- - - 0.53 - - - 0.46 0.01 - incl %ecx
|
||||
- - - 0.51 - - - 0.46 0.03 - cmpq %rcx, %r11
|
||||
- - 0.02 - - - - - 0.98 - sete %cl
|
||||
- - 0.94 - - - - 0.06 - - vmulpd %zmm18, %zmm19, %zmm18
|
||||
- - 0.01 0.99 0.51 0.49 - - - - vmovupd 528(%rsp), %zmm19
|
||||
- - 0.47 - - - - 0.53 - - vsubpd %zmm28, %zmm19, %zmm19
|
||||
- - 0.04 - - - - - 0.96 - setne %dil
|
||||
- - - 0.95 - - - 0.02 0.03 - movl %edi, %ebp
|
||||
- - 0.01 - - - - - 0.99 - shlb $4, %bpl
|
||||
- - - 0.96 - - - - 0.04 - subb %al, %bpl
|
||||
- - - 0.06 - - - - 0.94 - addb $-17, %bpl
|
||||
- - - - - - - 1.00 - - kmovd %ebp, %k1
|
||||
- - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm17, %k1 {%k1}
|
||||
- - 0.02 0.97 0.50 0.50 - 0.01 - - vmovupd 272(%rsp), %zmm17
|
||||
- - 0.96 - - - - 0.04 - - vsubpd %zmm29, %zmm17, %zmm17
|
||||
- - - 1.00 - - - - - - leal (%rdx,%rdx), %eax
|
||||
- - - 0.05 - - - - 0.95 - movl %edi, %ebp
|
||||
- - 0.51 - - - - 0.49 - - vmulpd %zmm2, %zmm18, %zmm18
|
||||
- - 0.53 - - - - 0.47 - - vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1}
|
||||
- - 0.45 - - - - 0.55 - - vmulpd %zmm20, %zmm20, %zmm3
|
||||
- - 0.94 - - - - 0.06 - - vfmadd231pd %zmm17, %zmm17, %zmm3
|
||||
- - 0.03 - - - - 0.97 - - vfmadd231pd %zmm19, %zmm19, %zmm3
|
||||
- - 0.47 - - - - 0.53 - - vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1}
|
||||
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm3, %zmm16
|
||||
- - 0.53 - - - - 0.47 - - vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1}
|
||||
- - 0.99 - - - - 0.01 - - vmulpd %zmm16, %zmm21, %zmm18
|
||||
- - 0.99 - - - - 0.01 - - vmulpd %zmm18, %zmm16, %zmm18
|
||||
- - 0.97 - - - - 0.03 - - vmulpd %zmm18, %zmm16, %zmm18
|
||||
- - 0.52 - - - - 0.48 - - vaddpd %zmm1, %zmm18, %zmm31
|
||||
- - 0.01 - - - - 0.99 - - vmulpd %zmm16, %zmm22, %zmm16
|
||||
- - 0.52 - - - - 0.48 - - vmulpd %zmm31, %zmm16, %zmm16
|
||||
- - - 0.99 0.50 0.50 - 0.01 - - vmovupd 464(%rsp), %zmm31
|
||||
- - 0.03 - - - - 0.97 - - vsubpd %zmm28, %zmm31, %zmm31
|
||||
- - 0.01 - - - - - 0.99 - shlb $5, %bpl
|
||||
- - - 0.94 - - - - 0.06 - orb %al, %bpl
|
||||
- - - 0.04 - - - - 0.96 - orb $-35, %bpl
|
||||
- - - - - - - 1.00 - - kmovd %ebp, %k1
|
||||
- - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm3, %k1 {%k1}
|
||||
- - - 0.99 0.50 0.50 - 0.01 - - vmovupd 208(%rsp), %zmm3
|
||||
- - 0.95 - - - - 0.05 - - vsubpd %zmm29, %zmm3, %zmm3
|
||||
- - 0.51 - - - - 0.49 - - vmulpd %zmm16, %zmm18, %zmm16
|
||||
- - 0.01 - - - - 0.99 - - vsubpd %zmm30, %zmm26, %zmm18
|
||||
- - 0.52 - - - - 0.48 - - vmulpd %zmm2, %zmm16, %zmm16
|
||||
- - 1.00 - - - - - - - vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1}
|
||||
- - 0.03 - - - - 0.97 - - vmulpd %zmm18, %zmm18, %zmm19
|
||||
- - 0.06 - - - - 0.94 - - vfmadd231pd %zmm3, %zmm3, %zmm19
|
||||
- - 0.50 - - - - 0.50 - - vfmadd231pd %zmm31, %zmm31, %zmm19
|
||||
- - - - - - - 1.00 - - vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1}
|
||||
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm19, %zmm17
|
||||
- - 1.00 - - - - - - - vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1}
|
||||
- - 0.07 - - - - 0.93 - - vmulpd %zmm17, %zmm21, %zmm16
|
||||
- - 0.50 - - - - 0.50 - - vmulpd %zmm16, %zmm17, %zmm16
|
||||
- - 0.09 - - - - 0.91 - - vmulpd %zmm16, %zmm17, %zmm16
|
||||
- - 0.07 - - - - 0.93 - - vaddpd %zmm1, %zmm16, %zmm20
|
||||
- - 0.93 - - - - 0.07 - - vmulpd %zmm17, %zmm22, %zmm17
|
||||
- - 0.94 - - - - 0.06 - - vmulpd %zmm20, %zmm17, %zmm17
|
||||
- - 0.51 - - - - 0.49 - - vmulpd %zmm17, %zmm16, %zmm16
|
||||
- - - 1.00 - - - - - - leal (,%rdx,4), %eax
|
||||
- - - - - - - - 1.00 - shlb $6, %dil
|
||||
- - - 0.02 - - - - 0.98 - orb %al, %dil
|
||||
- - - 0.48 - - - - 0.52 - orb $-69, %dil
|
||||
- - - - - - - 1.00 - - kmovd %edi, %k1
|
||||
- - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm19, %k1 {%k1}
|
||||
- - - 1.00 0.50 0.50 - - - - vmovupd 400(%rsp), %zmm17
|
||||
- - 0.49 - - - - 0.51 - - vsubpd %zmm28, %zmm17, %zmm17
|
||||
- - 0.49 - - - - 0.51 - - vsubpd %zmm29, %zmm23, %zmm19
|
||||
- - 0.02 - - - - 0.98 - - vsubpd %zmm30, %zmm27, %zmm20
|
||||
- - 0.98 - - - - 0.02 - - vmulpd %zmm2, %zmm16, %zmm16
|
||||
- - 0.50 - - - - 0.50 - - vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1}
|
||||
- - 0.94 - - - - 0.06 - - vmulpd %zmm20, %zmm20, %zmm28
|
||||
- - 0.04 - - - - 0.96 - - vfmadd231pd %zmm19, %zmm19, %zmm28
|
||||
- - 0.07 - - - - 0.93 - - vfmadd231pd %zmm17, %zmm17, %zmm28
|
||||
- - 0.50 - - - - 0.50 - - vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1}
|
||||
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm28, %zmm3
|
||||
- - 0.50 - - - - 0.50 - - vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1}
|
||||
- - 1.00 - - - - - - - vmulpd %zmm3, %zmm21, %zmm16
|
||||
- - 0.55 - - - - 0.45 - - vmulpd %zmm16, %zmm3, %zmm16
|
||||
- - 0.99 - - - - 0.01 - - vmulpd %zmm16, %zmm3, %zmm16
|
||||
- - 0.99 - - - - 0.01 - - vaddpd %zmm1, %zmm16, %zmm18
|
||||
- - - - - - - 1.00 - - vmulpd %zmm3, %zmm22, %zmm3
|
||||
- - 0.52 - - - - 0.48 - - vmulpd %zmm18, %zmm3, %zmm3
|
||||
- - 0.99 - - - - 0.01 - - vmulpd %zmm3, %zmm16, %zmm3
|
||||
- - - - - - - - 1.00 - shlb $3, %dl
|
||||
- - - - - - - - 1.00 - shlb $7, %cl
|
||||
- - - 1.00 - - - - - - orb %dl, %cl
|
||||
- - - 0.52 - - - - 0.48 - addb $-9, %cl
|
||||
- - - - - - - 1.00 - - kmovd %ecx, %k1
|
||||
- - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm28, %k1 {%k1}
|
||||
- - 0.98 - - - - 0.02 - - vmulpd %zmm2, %zmm3, %zmm3
|
||||
- - 0.97 - - - - 0.03 - - vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1}
|
||||
- - 0.03 - - - - 0.97 - - vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1}
|
||||
- - 0.97 - - - - 0.03 - - vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1}
|
||||
- - - 0.48 - - - - 0.52 - incq %rbx
|
||||
- - - 0.52 - - - - 0.48 - cmpq %rbx, %r9
|
||||
- - - - - - - - 1.00 - jne .LBB5_12
|
167
static_analysis/jan/analyses/gromacs-icx-avx512-dp-osaca-icx.out
Normal file
167
static_analysis/jan/analyses/gromacs-icx-avx512-dp-osaca-icx.out
Normal file
@@ -0,0 +1,167 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: gromacs-icx-avx512-dp.s
|
||||
Architecture: ICX
|
||||
Timestamp: 2023-02-14 12:51:57
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD |
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
2241 | | | | | | | | | | || | | # pointer_increment=64 da67166e5736661e6b03ea29ee7bfd67
|
||||
2242 | | | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
2243 | | | | | | | | | | || | | .LBB5_12: # Parent Loop BB5_7 Depth=1
|
||||
2244 | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2
|
||||
2245 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | movslq (%r10,%rbx,4), %rcx
|
||||
2246 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || 6.0 | | leaq (%rcx,%rcx,2), %rdx
|
||||
2247 | 0.00 | | | | | | 1.00 | | | || 1.0 | | shlq $6, %rdx
|
||||
2248 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd (%rsi,%rdx), %zmm28 # AlignMOV convert to UnAlignMOV
|
||||
2249 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 64(%rsi,%rdx), %zmm29 # AlignMOV convert to UnAlignMOV
|
||||
2250 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovupd 128(%rsi,%rdx), %zmm30 # AlignMOV convert to UnAlignMOV
|
||||
2251 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 16(%rsp), %zmm3 # 64-byte Reload
|
||||
2252 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm28, %zmm3, %zmm3
|
||||
2253 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vsubpd %zmm30, %zmm24, %zmm31
|
||||
2254 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 336(%rsp), %zmm16 # 64-byte Reload
|
||||
2255 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm16, %zmm16
|
||||
2256 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm31, %zmm31, %zmm17
|
||||
2257 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm16, %zmm16, %zmm17 # zmm17 = (zmm16 * zmm16) + zmm17
|
||||
2258 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm3, %zmm3, %zmm17 # zmm17 = (zmm3 * zmm3) + zmm17
|
||||
2259 | 2.50 | | | | | 0.500 | | | | || 6.0 | | vrcp14pd %zmm17, %zmm18
|
||||
2260 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm18, %zmm21, %zmm19
|
||||
2261 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm19, %zmm18, %zmm19
|
||||
2262 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm19, %zmm18, %zmm19
|
||||
2263 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vaddpd %zmm1, %zmm19, %zmm20
|
||||
2264 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm18, %zmm22, %zmm18
|
||||
2265 | 0.75 | | | | | 0.250 | | | | || 4.0 | | vmulpd %zmm20, %zmm18, %zmm18
|
||||
2266 | 1.00 | | | | | 0.000 | | | | || | | vsubpd %zmm30, %zmm25, %zmm20
|
||||
2267 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | leal (%rcx,%rcx), %edx
|
||||
2268 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | cmpq %rdx, %r11
|
||||
2269 | 0.00 | | | | | | 1.00 | | | || | | setne %dl
|
||||
2270 | 0.00 | | | | | | 1.00 | | | || | | sete %al
|
||||
2271 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | addl %ecx, %ecx
|
||||
2272 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | incl %ecx
|
||||
2273 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | cmpq %rcx, %r11
|
||||
2274 | 0.00 | | | | | | 1.00 | | | || | | sete %cl
|
||||
2275 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm18, %zmm19, %zmm18
|
||||
2276 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 528(%rsp), %zmm19 # 64-byte Reload
|
||||
2277 | 1.00 | | | | | 0.000 | | | | || | | vsubpd %zmm28, %zmm19, %zmm19
|
||||
2278 | 0.00 | | | | | | 1.00 | | | || | | setne %dil
|
||||
2279 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | movl %edi, %ebp
|
||||
2280 | 0.00 | | | | | | 1.00 | | | || | 1.0 | shlb $4, %bpl
|
||||
2281 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | 1.0 | subb %al, %bpl
|
||||
2282 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | 1.0 | addb $-17, %bpl
|
||||
2283 | 1.00 | | | | | | | | | || | | kmovd %ebp, %k1
|
||||
2284 | 0.50 | | | | | 0.500 | | | | || | | vcmpltpd %zmm0, %zmm17, %k1 {%k1}
|
||||
2285 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 272(%rsp), %zmm17 # 64-byte Reload
|
||||
2286 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm17, %zmm17
|
||||
2287 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | leal (%rdx,%rdx), %eax
|
||||
2288 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | movl %edi, %ebp
|
||||
2289 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm2, %zmm18, %zmm18
|
||||
2290 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14
|
||||
2291 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm20, %zmm20, %zmm3
|
||||
2292 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm17, %zmm17, %zmm3 # zmm3 = (zmm17 * zmm17) + zmm3
|
||||
2293 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm19, %zmm19, %zmm3 # zmm3 = (zmm19 * zmm19) + zmm3
|
||||
2294 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11
|
||||
2295 | 2.50 | | | | | 0.500 | | | | || | | vrcp14pd %zmm3, %zmm16
|
||||
2296 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7
|
||||
2297 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm21, %zmm18
|
||||
2298 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm18, %zmm16, %zmm18
|
||||
2299 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm18, %zmm16, %zmm18
|
||||
2300 | 0.50 | | | | | 0.500 | | | | || | | vaddpd %zmm1, %zmm18, %zmm31
|
||||
2301 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm22, %zmm16
|
||||
2302 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm31, %zmm16, %zmm16
|
||||
2303 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 464(%rsp), %zmm31 # 64-byte Reload
|
||||
2304 | 0.75 | | | | | 0.250 | | | | || | | vsubpd %zmm28, %zmm31, %zmm31
|
||||
2305 | 0.00 | | | | | | 1.00 | | | || | 1.0 | shlb $5, %bpl
|
||||
2306 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | 1.0 | orb %al, %bpl
|
||||
2307 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | 1.0 | orb $-35, %bpl
|
||||
2308 | 1.00 | | | | | | | | | || | | kmovd %ebp, %k1
|
||||
2309 | 0.50 | | | | | 0.500 | | | | || | | vcmpltpd %zmm0, %zmm3, %k1 {%k1}
|
||||
2310 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 208(%rsp), %zmm3 # 64-byte Reload
|
||||
2311 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm3, %zmm3
|
||||
2312 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm18, %zmm16
|
||||
2313 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm30, %zmm26, %zmm18
|
||||
2314 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm2, %zmm16, %zmm16
|
||||
2315 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15
|
||||
2316 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm18, %zmm18, %zmm19
|
||||
2317 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm3, %zmm3, %zmm19 # zmm19 = (zmm3 * zmm3) + zmm19
|
||||
2318 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm31, %zmm31, %zmm19 # zmm19 = (zmm31 * zmm31) + zmm19
|
||||
2319 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10
|
||||
2320 | 2.50 | | | | | 0.500 | | | | || | | vrcp14pd %zmm19, %zmm17
|
||||
2321 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6
|
||||
2322 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm17, %zmm21, %zmm16
|
||||
2323 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm17, %zmm16
|
||||
2324 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm17, %zmm16
|
||||
2325 | 0.50 | | | | | 0.500 | | | | || | | vaddpd %zmm1, %zmm16, %zmm20
|
||||
2326 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm17, %zmm22, %zmm17
|
||||
2327 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm20, %zmm17, %zmm17
|
||||
2328 | 0.75 | | | | | 0.250 | | | | || | | vmulpd %zmm17, %zmm16, %zmm16
|
||||
2329 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | leal (,%rdx,4), %eax
|
||||
2330 | 0.00 | | | | | | 1.00 | | | || | | shlb $6, %dil
|
||||
2331 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | orb %al, %dil
|
||||
2332 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | orb $-69, %dil
|
||||
2333 | 1.00 | | | | | | | | | || | | kmovd %edi, %k1
|
||||
2334 | 0.50 | | | | | 0.500 | | | | || | | vcmpltpd %zmm0, %zmm19, %k1 {%k1}
|
||||
2335 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 400(%rsp), %zmm17 # 64-byte Reload
|
||||
2336 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm28, %zmm17, %zmm17
|
||||
2337 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm23, %zmm19
|
||||
2338 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm30, %zmm27, %zmm20
|
||||
2339 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm2, %zmm16, %zmm16
|
||||
2340 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13
|
||||
2341 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm20, %zmm20, %zmm28
|
||||
2342 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm19, %zmm19, %zmm28 # zmm28 = (zmm19 * zmm19) + zmm28
|
||||
2343 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm17, %zmm17, %zmm28 # zmm28 = (zmm17 * zmm17) + zmm28
|
||||
2344 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9
|
||||
2345 | 2.00 | | | | | 1.000 | | | | || | | vrcp14pd %zmm28, %zmm3
|
||||
2346 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5
|
||||
2347 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm3, %zmm21, %zmm16
|
||||
2348 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm16, %zmm3, %zmm16
|
||||
2349 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm16, %zmm3, %zmm16
|
||||
2350 | 0.00 | | | | | 1.000 | | | | || | | vaddpd %zmm1, %zmm16, %zmm18
|
||||
2351 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm3, %zmm22, %zmm3
|
||||
2352 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm18, %zmm3, %zmm3
|
||||
2353 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm3, %zmm16, %zmm3
|
||||
2354 | 0.00 | | | | | | 1.00 | | | || | | shlb $3, %dl
|
||||
2355 | 0.00 | | | | | | 1.00 | | | || | | shlb $7, %cl
|
||||
2356 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | orb %dl, %cl
|
||||
2357 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | addb $-9, %cl
|
||||
2358 | 1.00 | | | | | | | | | || | | kmovd %ecx, %k1
|
||||
2359 | 0.00 | | | | | 1.000 | | | | || | | vcmpltpd %zmm0, %zmm28, %k1 {%k1}
|
||||
2360 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm2, %zmm3, %zmm3
|
||||
2361 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12
|
||||
2362 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8
|
||||
2363 | 0.24 | | | | | 0.760 | | | | || | | vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4
|
||||
2364 | 0.00 | 1.00 | | | | -0.01 | 0.00 | | | || | | incq %rbx
|
||||
2365 | 0.00 | 1.00 | | | | -0.01 | 0.00 | | | || | | cmpq %rbx, %r9
|
||||
2366 | | | | | | | | | | || | | * jne .LBB5_12
|
||||
2367 | | | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
44.0 15.0 5.50 5.50 5.50 5.50 43.99 15.0 71 6.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
2280 | 6.0 | shlb $4, %bpl | [2280, 2281, 2282, 2305, 2306, 2307]
|
||||
2363 | 4.0 | vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4| [2363]
|
||||
2362 | 4.0 | vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8| [2362]
|
||||
2361 | 4.0 | vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12| [2361]
|
||||
2346 | 4.0 | vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5| [2346]
|
||||
2344 | 4.0 | vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9| [2344]
|
||||
2340 | 4.0 | vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13| [2340]
|
||||
2321 | 4.0 | vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6| [2321]
|
||||
2319 | 4.0 | vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10| [2319]
|
||||
2315 | 4.0 | vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15| [2315]
|
||||
2296 | 4.0 | vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7| [2296]
|
||||
2294 | 4.0 | vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11| [2294]
|
||||
2290 | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14| [2290]
|
||||
2330 | 3.0 | shlb $6, %dil | [2330, 2331, 2332]
|
||||
2364 | 1.0 | incq %rbx | [2364]
|
||||
|
167
static_analysis/jan/analyses/gromacs-icx-avx512-dp-osaca.out
Normal file
167
static_analysis/jan/analyses/gromacs-icx-avx512-dp-osaca.out
Normal file
@@ -0,0 +1,167 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: gromacs-icx-avx512-dp.s
|
||||
Architecture: CSX
|
||||
Timestamp: 2023-02-10 16:30:53
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
2241 | | | | | | | | || | | # pointer_increment=64 da67166e5736661e6b03ea29ee7bfd67
|
||||
2242 | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
2243 | | | | | | | | || | | .LBB5_12: # Parent Loop BB5_7 Depth=1
|
||||
2244 | | | | | | | | || | | # => This Inner Loop Header: Depth=2
|
||||
2245 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | movslq (%r10,%rbx,4), %rcx
|
||||
2246 | | 1.00 | | | | 0.00 | | || 1.0 | | leaq (%rcx,%rcx,2), %rdx
|
||||
2247 | 0.00 | | | | | | 1.00 | || 1.0 | | shlq $6, %rdx
|
||||
2248 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd (%rsi,%rdx), %zmm28 # AlignMOV convert to UnAlignMOV
|
||||
2249 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 64(%rsi,%rdx), %zmm29 # AlignMOV convert to UnAlignMOV
|
||||
2250 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovupd 128(%rsi,%rdx), %zmm30 # AlignMOV convert to UnAlignMOV
|
||||
2251 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 16(%rsp), %zmm3 # 64-byte Reload
|
||||
2252 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm28, %zmm3, %zmm3
|
||||
2253 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm30, %zmm24, %zmm31
|
||||
2254 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 336(%rsp), %zmm16 # 64-byte Reload
|
||||
2255 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm16, %zmm16
|
||||
2256 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm31, %zmm31, %zmm17
|
||||
2257 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm16, %zmm16, %zmm17 # zmm17 = (zmm16 * zmm16) + zmm17
|
||||
2258 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm3, %zmm3, %zmm17 # zmm17 = (zmm3 * zmm3) + zmm17
|
||||
2259 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm17, %zmm18
|
||||
2260 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm18, %zmm21, %zmm19
|
||||
2261 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm19, %zmm18, %zmm19
|
||||
2262 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm19, %zmm18, %zmm19
|
||||
2263 | 0.50 | | | | | 0.50 | | || 4.0 | | vaddpd %zmm1, %zmm19, %zmm20
|
||||
2264 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm22, %zmm18
|
||||
2265 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm18, %zmm18
|
||||
2266 | 1.00 | | | | | 0.00 | | || | | vsubpd %zmm30, %zmm25, %zmm20
|
||||
2267 | | 1.00 | | | | 0.00 | | || | | leal (%rcx,%rcx), %edx
|
||||
2268 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rdx, %r11
|
||||
2269 | 0.00 | | | | | | 1.00 | || | | setne %dl
|
||||
2270 | 0.00 | | | | | | 1.00 | || | | sete %al
|
||||
2271 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | addl %ecx, %ecx
|
||||
2272 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | incl %ecx
|
||||
2273 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rcx, %r11
|
||||
2274 | 0.00 | | | | | | 1.00 | || | | sete %cl
|
||||
2275 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm18, %zmm19, %zmm18
|
||||
2276 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 528(%rsp), %zmm19 # 64-byte Reload
|
||||
2277 | 1.00 | | | | | 0.00 | | || | | vsubpd %zmm28, %zmm19, %zmm19
|
||||
2278 | 0.00 | | | | | | 1.00 | || | | setne %dil
|
||||
2279 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %edi, %ebp
|
||||
2280 | 0.00 | | | | | | 1.00 | || | 1.0 | shlb $4, %bpl
|
||||
2281 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | 1.0 | subb %al, %bpl
|
||||
2282 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | 1.0 | addb $-17, %bpl
|
||||
2283 | 1.00 | | | | | | | || | | kmovd %ebp, %k1
|
||||
2284 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm17, %k1 {%k1}
|
||||
2285 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 272(%rsp), %zmm17 # 64-byte Reload
|
||||
2286 | 0.25 | | | | | 0.75 | | || | | vsubpd %zmm29, %zmm17, %zmm17
|
||||
2287 | | 1.00 | | | | 0.00 | | || | | leal (%rdx,%rdx), %eax
|
||||
2288 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %edi, %ebp
|
||||
2289 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm2, %zmm18, %zmm18
|
||||
2290 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14
|
||||
2291 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm20, %zmm3
|
||||
2292 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm17, %zmm17, %zmm3 # zmm3 = (zmm17 * zmm17) + zmm3
|
||||
2293 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm19, %zmm19, %zmm3 # zmm3 = (zmm19 * zmm19) + zmm3
|
||||
2294 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11
|
||||
2295 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm3, %zmm16
|
||||
2296 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7
|
||||
2297 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm21, %zmm18
|
||||
2298 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm16, %zmm18
|
||||
2299 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm16, %zmm18
|
||||
2300 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm1, %zmm18, %zmm31
|
||||
2301 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm22, %zmm16
|
||||
2302 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm31, %zmm16, %zmm16
|
||||
2303 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 464(%rsp), %zmm31 # 64-byte Reload
|
||||
2304 | 0.75 | | | | | 0.25 | | || | | vsubpd %zmm28, %zmm31, %zmm31
|
||||
2305 | 0.00 | | | | | | 1.00 | || | 1.0 | shlb $5, %bpl
|
||||
2306 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | 1.0 | orb %al, %bpl
|
||||
2307 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | 1.0 | orb $-35, %bpl
|
||||
2308 | 1.00 | | | | | | | || | | kmovd %ebp, %k1
|
||||
2309 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm3, %k1 {%k1}
|
||||
2310 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 208(%rsp), %zmm3 # 64-byte Reload
|
||||
2311 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm3, %zmm3
|
||||
2312 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm18, %zmm16
|
||||
2313 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm26, %zmm18
|
||||
2314 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm2, %zmm16, %zmm16
|
||||
2315 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15
|
||||
2316 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm18, %zmm19
|
||||
2317 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm3, %zmm3, %zmm19 # zmm19 = (zmm3 * zmm3) + zmm19
|
||||
2318 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm31, %zmm31, %zmm19 # zmm19 = (zmm31 * zmm31) + zmm19
|
||||
2319 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10
|
||||
2320 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm19, %zmm17
|
||||
2321 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6
|
||||
2322 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm21, %zmm16
|
||||
2323 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm17, %zmm16
|
||||
2324 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm17, %zmm16
|
||||
2325 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm1, %zmm16, %zmm20
|
||||
2326 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm22, %zmm17
|
||||
2327 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm17, %zmm17
|
||||
2328 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm16, %zmm16
|
||||
2329 | | 1.00 | | | | 0.00 | | || | | leal (,%rdx,4), %eax
|
||||
2330 | 0.00 | | | | | | 1.00 | || | | shlb $6, %dil
|
||||
2331 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | orb %al, %dil
|
||||
2332 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | orb $-69, %dil
|
||||
2333 | 1.00 | | | | | | | || | | kmovd %edi, %k1
|
||||
2334 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm19, %k1 {%k1}
|
||||
2335 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 400(%rsp), %zmm17 # 64-byte Reload
|
||||
2336 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm28, %zmm17, %zmm17
|
||||
2337 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm23, %zmm19
|
||||
2338 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm27, %zmm20
|
||||
2339 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm2, %zmm16, %zmm16
|
||||
2340 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13
|
||||
2341 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm20, %zmm28
|
||||
2342 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm19, %zmm19, %zmm28 # zmm28 = (zmm19 * zmm19) + zmm28
|
||||
2343 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm17, %zmm17, %zmm28 # zmm28 = (zmm17 * zmm17) + zmm28
|
||||
2344 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9
|
||||
2345 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm28, %zmm3
|
||||
2346 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5
|
||||
2347 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm21, %zmm16
|
||||
2348 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm16, %zmm3, %zmm16
|
||||
2349 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm16, %zmm3, %zmm16
|
||||
2350 | 0.00 | | | | | 1.00 | | || | | vaddpd %zmm1, %zmm16, %zmm18
|
||||
2351 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm22, %zmm3
|
||||
2352 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm18, %zmm3, %zmm3
|
||||
2353 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm16, %zmm3
|
||||
2354 | 0.00 | | | | | | 1.00 | || | | shlb $3, %dl
|
||||
2355 | 0.00 | | | | | | 1.00 | || | | shlb $7, %cl
|
||||
2356 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | | orb %dl, %cl
|
||||
2357 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | addb $-9, %cl
|
||||
2358 | 1.00 | | | | | | | || | | kmovd %ecx, %k1
|
||||
2359 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm28, %k1 {%k1}
|
||||
2360 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm2, %zmm3, %zmm3
|
||||
2361 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12
|
||||
2362 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8
|
||||
2363 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4
|
||||
2364 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | incq %rbx
|
||||
2365 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rbx, %r9
|
||||
2366 | | | | | | | | || | | * jne .LBB5_12
|
||||
2367 | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
44.0 15.0 5.50 5.50 5.50 5.50 44.0 15.0 66.0 6.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
2280 | 6.0 | shlb $4, %bpl | [2280, 2281, 2282, 2305, 2306, 2307]
|
||||
2363 | 4.0 | vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4| [2363]
|
||||
2362 | 4.0 | vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8| [2362]
|
||||
2361 | 4.0 | vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12| [2361]
|
||||
2346 | 4.0 | vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5| [2346]
|
||||
2344 | 4.0 | vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9| [2344]
|
||||
2340 | 4.0 | vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13| [2340]
|
||||
2321 | 4.0 | vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6| [2321]
|
||||
2319 | 4.0 | vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10| [2319]
|
||||
2315 | 4.0 | vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15| [2315]
|
||||
2296 | 4.0 | vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7| [2296]
|
||||
2294 | 4.0 | vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11| [2294]
|
||||
2290 | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14| [2290]
|
||||
2330 | 3.0 | shlb $6, %dil | [2330, 2331, 2332]
|
||||
2364 | 1.0 | incq %rbx | [2364]
|
||||
|
162
static_analysis/jan/analyses/gromacs-icx-avx512-sp-iaca.out
Normal file
162
static_analysis/jan/analyses/gromacs-icx-avx512-sp-iaca.out
Normal file
@@ -0,0 +1,162 @@
|
||||
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||
Analyzed File - gromacs-icx-avx512-sp.o
|
||||
Binary Format - 64Bit
|
||||
Architecture - SKX
|
||||
Analysis Type - Throughput
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
Block Throughput: 64.00 Cycles Throughput Bottleneck: Backend
|
||||
Loop Count: 22
|
||||
Port Binding In Cycles Per Iteration:
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Cycles | 50.0 0.0 | 7.0 | 9.5 8.1 | 9.5 7.9 | 3.0 | 50.0 | 7.0 | 0.0 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
|
||||
DV - Divider pipe (on port 0)
|
||||
D - Data fetch pipe (on ports 2 and 3)
|
||||
F - Macro Fusion with the previous instruction occurred
|
||||
* - instruction micro-ops not bound to a port
|
||||
^ - Micro Fusion occurred
|
||||
# - ESP Tracking sync uop was issued
|
||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||
X - instruction not supported, was not accounted in Analysis
|
||||
|
||||
| Num Of | Ports pressure in cycles | |
|
||||
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
-----------------------------------------------------------------------------------------
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | movsxd rax, dword ptr [r11+rdx*4]
|
||||
| 1* | | | | | | | | | mov rsi, rax
|
||||
| 1 | | | | | | | 1.0 | | shl rsi, 0x5
|
||||
| 1 | | 1.0 | | | | | | | lea rbx, ptr [rsi+rsi*2]
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm15, zmmword ptr [rdi+rbx*1]
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm16, zmmword ptr [rdi+rbx*1+0x20]
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm27, zmmword ptr [rdi+rbx*1+0x40]
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x80]
|
||||
| 1 | | | | | | 1.0 | | | vsubps zmm24, zmm1, zmm15
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x140]
|
||||
| 1 | 1.0 | | | | | | | | vsubps zmm25, zmm1, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vsubps zmm26, zmm9, zmm27
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp]
|
||||
| 1 | 1.0 | | | | | | | | vsubps zmm21, zmm1, zmm15
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x100]
|
||||
| 1 | | | | | | 1.0 | | | vsubps zmm22, zmm1, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vsubps zmm23, zmm10, zmm27
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x1c0]
|
||||
| 1 | | | | | | 1.0 | | | vsubps zmm17, zmm1, zmm15
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0xc0]
|
||||
| 1 | 1.0 | | | | | | | | vsubps zmm19, zmm1, zmm16
|
||||
| 1 | | | | | | 1.0 | | | vsubps zmm20, zmm11, zmm27
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x180]
|
||||
| 1 | 1.0 | | | | | | | | vsubps zmm18, zmm1, zmm15
|
||||
| 1 | | | | | | 1.0 | | | vsubps zmm16, zmm8, zmm16
|
||||
| 1 | 1.0 | | | | | | | | vsubps zmm15, zmm12, zmm27
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm27, zmm26, zmm26
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231ps zmm27, zmm25, zmm25
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm27, zmm24, zmm24
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm28, zmm23, zmm23
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm28, zmm22, zmm22
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231ps zmm28, zmm21, zmm21
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm29, zmm20, zmm20
|
||||
| 1 | 1.0 | | | | | | | | vfmadd231ps zmm29, zmm19, zmm19
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm29, zmm17, zmm17
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm30, zmm15, zmm15
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm30, zmm16, zmm16
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm31, zmm27
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm1, zmm28
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm2, zmm29
|
||||
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm30, zmm18, zmm18
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm3, zmm30
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm4, zmm6, zmm31
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm4, zmm31, zmm4
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm4, zmm31, zmm4
|
||||
| 1 | 1.0 | | | | | | | | vaddps zmm5, zmm4, zmm13
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm7, zmm31
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm5, zmm31, zmm5
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm6, zmm1
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm31, zmm1, zmm31
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm1, zmm31
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm4, zmm4, zmm5
|
||||
| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm31, zmm13
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm1, zmm7, zmm1
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm1, zmm1, zmm5
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm5, zmm6, zmm2
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm5, zmm2, zmm5
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm5, zmm2, zmm5
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm1, zmm31, zmm1
|
||||
| 1 | 1.0 | | | | | | | | vaddps zmm31, zmm5, zmm13
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm2, zmm7, zmm2
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm2, zmm2, zmm31
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm6, zmm3
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm31, zmm3, zmm31
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm3, zmm31
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm2, zmm5, zmm2
|
||||
| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm31, zmm13
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm3, zmm7, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm3, zmm3, zmm5
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm3, zmm31, zmm3
|
||||
| 1* | | | | | | | | | xor esi, esi
|
||||
| 1* | | | | | | | | | xor edi, edi
|
||||
| 1 | | 1.0 | | | | | | | test eax, 0x7fffffff
|
||||
| 1 | | | | | | | 1.0 | | setz sil
|
||||
| 1 | | | | | | | 1.0 | | setnz dil
|
||||
| 1 | | 1.0 | | | | | | | mov eax, 0xff
|
||||
| 1 | | | | | | | 1.0 | | cmovz eax, r8d
|
||||
| 1 | | 1.0 | | | | | | | mov ecx, 0xff
|
||||
| 1 | | | | | | | 1.0 | | cmovz ecx, r9d
|
||||
| 1 | | 1.0 | | | | | | | xor esi, 0xff
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, esi
|
||||
| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm27, zmm0, 0x1
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm4, zmm4, zmm14
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm5{k1}{z}, zmm24, zmm4
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm24{k1}{z}, zmm25, zmm4
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm4{k1}{z}, zmm26, zmm4
|
||||
| 1 | | 1.0 | | | | | | | lea esi, ptr [rdi+rdi*2]
|
||||
| 1 | | | | | | | 1.0 | | or esi, 0xfc
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, esi
|
||||
| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm28, zmm0, 0x1
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm1, zmm1, zmm14
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm21{k1}{z}, zmm21, zmm1
|
||||
| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm5, zmm21
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm21{k1}{z}, zmm22, zmm1
|
||||
| 1 | | | | | | 1.0 | | | vaddps zmm21, zmm24, zmm21
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm1{k1}{z}, zmm23, zmm1
|
||||
| 1 | | | | | | 1.0 | | | vaddps zmm1, zmm4, zmm1
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, eax
|
||||
| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm29, zmm0, 0x1
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm2, zmm2, zmm14
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm4{k1}{z}, zmm17, zmm2
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm17{k1}{z}, zmm19, zmm2
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm2{k1}{z}, zmm20, zmm2
|
||||
| 1 | | | | | | 1.0 | | | kmovd k1, ecx
|
||||
| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm30, zmm0, 0x1
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm3, zmm3, zmm14
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm18{k1}{z}, zmm18, zmm3
|
||||
| 1 | 1.0 | | | | | | | | vaddps zmm4, zmm4, zmm18
|
||||
| 1 | | | | | | 1.0 | | | vaddps zmm4, zmm5, zmm4
|
||||
| 1 | 1.0 | | | | | | | | vmulps zmm5{k1}{z}, zmm16, zmm3
|
||||
| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm17, zmm5
|
||||
| 1 | 1.0 | | | | | | | | vaddps zmm5, zmm21, zmm5
|
||||
| 1 | | | | | | 1.0 | | | vmulps zmm3{k1}{z}, zmm15, zmm3
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov rax, qword ptr [r15+0xb0]
|
||||
| 1 | 1.0 | | | | | | | | vaddps zmm2, zmm2, zmm3
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm3, zmmword ptr [rax+rbx*1]
|
||||
| 1 | | | | | | 1.0 | | | vsubps zmm3, zmm3, zmm4
|
||||
| 2 | | | 0.5 | 0.5 | 1.0 | | | | vmovups zmmword ptr [rax+rbx*1], zmm3
|
||||
| 1 | 1.0 | | | | | | | | vaddps zmm1, zmm1, zmm2
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm2, zmmword ptr [rax+rbx*1+0x20]
|
||||
| 1 | | | | | | 1.0 | | | vsubps zmm2, zmm2, zmm5
|
||||
| 2 | | | 0.5 | 0.5 | 1.0 | | | | vmovups zmmword ptr [rax+rbx*1+0x20], zmm2
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm2, zmmword ptr [rax+rbx*1+0x40]
|
||||
| 1 | 1.0 | | | | | | | | vsubps zmm1, zmm2, zmm1
|
||||
| 2 | | | 0.5 | 0.5 | 1.0 | | | | vmovups zmmword ptr [rax+rbx*1+0x40], zmm1
|
||||
| 1* | | | | | | | | | cmp r10, rdx
|
||||
| 0*F | | | | | | | | | jz 0x34
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov rdi, qword ptr [r15+0xa0]
|
||||
| 1 | | 1.0 | | | | | | | inc rdx
|
||||
| 1 | | | | | | | 1.0 | | jmp 0xfffffffffffffcfc
|
||||
Total Num Of Uops: 140
|
||||
Analysis Notes:
|
||||
Backend allocation was stalled due to unavailable allocation resources.
|
304
static_analysis/jan/analyses/gromacs-icx-avx512-sp-mca.out
Normal file
304
static_analysis/jan/analyses/gromacs-icx-avx512-sp-mca.out
Normal file
@@ -0,0 +1,304 @@
|
||||
|
||||
[0] Code Region
|
||||
|
||||
Iterations: 100
|
||||
Instructions: 13000
|
||||
Total Cycles: 5640
|
||||
Total uOps: 15400
|
||||
|
||||
Dispatch Width: 6
|
||||
uOps Per Cycle: 2.73
|
||||
IPC: 2.30
|
||||
Block RThroughput: 40.0
|
||||
|
||||
|
||||
Instruction Info:
|
||||
[1]: #uOps
|
||||
[2]: Latency
|
||||
[3]: RThroughput
|
||||
[4]: MayLoad
|
||||
[5]: MayStore
|
||||
[6]: HasSideEffects (U)
|
||||
|
||||
[1] [2] [3] [4] [5] [6] Instructions:
|
||||
1 5 0.50 * movslq (%r11,%rdx,4), %rax
|
||||
1 1 0.25 movq %rax, %rsi
|
||||
1 1 0.50 shlq $5, %rsi
|
||||
1 1 0.50 leaq (%rsi,%rsi,2), %rbx
|
||||
2 8 0.50 * vmovups (%rdi,%rbx), %zmm15
|
||||
2 8 0.50 * vmovups 32(%rdi,%rbx), %zmm16
|
||||
2 8 0.50 * vmovups 64(%rdi,%rbx), %zmm27
|
||||
2 8 0.50 * vmovups 128(%rsp), %zmm1
|
||||
1 4 0.50 vsubps %zmm15, %zmm1, %zmm24
|
||||
2 8 0.50 * vmovups 320(%rsp), %zmm1
|
||||
1 4 0.50 vsubps %zmm16, %zmm1, %zmm25
|
||||
1 4 0.50 vsubps %zmm27, %zmm9, %zmm26
|
||||
2 8 0.50 * vmovups (%rsp), %zmm1
|
||||
1 4 0.50 vsubps %zmm15, %zmm1, %zmm21
|
||||
2 8 0.50 * vmovups 256(%rsp), %zmm1
|
||||
1 4 0.50 vsubps %zmm16, %zmm1, %zmm22
|
||||
1 4 0.50 vsubps %zmm27, %zmm10, %zmm23
|
||||
2 8 0.50 * vmovups 448(%rsp), %zmm1
|
||||
1 4 0.50 vsubps %zmm15, %zmm1, %zmm17
|
||||
2 8 0.50 * vmovups 192(%rsp), %zmm1
|
||||
1 4 0.50 vsubps %zmm16, %zmm1, %zmm19
|
||||
1 4 0.50 vsubps %zmm27, %zmm11, %zmm20
|
||||
2 8 0.50 * vmovups 384(%rsp), %zmm1
|
||||
1 4 0.50 vsubps %zmm15, %zmm1, %zmm18
|
||||
1 4 0.50 vsubps %zmm16, %zmm8, %zmm16
|
||||
1 4 0.50 vsubps %zmm27, %zmm12, %zmm15
|
||||
1 4 0.50 vmulps %zmm26, %zmm26, %zmm27
|
||||
1 4 0.50 vfmadd231ps %zmm25, %zmm25, %zmm27
|
||||
1 4 0.50 vfmadd231ps %zmm24, %zmm24, %zmm27
|
||||
1 4 0.50 vmulps %zmm23, %zmm23, %zmm28
|
||||
1 4 0.50 vfmadd231ps %zmm22, %zmm22, %zmm28
|
||||
1 4 0.50 vfmadd231ps %zmm21, %zmm21, %zmm28
|
||||
1 4 0.50 vmulps %zmm20, %zmm20, %zmm29
|
||||
1 4 0.50 vfmadd231ps %zmm19, %zmm19, %zmm29
|
||||
1 4 0.50 vfmadd231ps %zmm17, %zmm17, %zmm29
|
||||
1 4 0.50 vmulps %zmm15, %zmm15, %zmm30
|
||||
1 4 0.50 vfmadd231ps %zmm16, %zmm16, %zmm30
|
||||
3 4 2.00 vrcp14ps %zmm27, %zmm31
|
||||
3 4 2.00 vrcp14ps %zmm28, %zmm1
|
||||
3 4 2.00 vrcp14ps %zmm29, %zmm2
|
||||
1 4 0.50 vfmadd231ps %zmm18, %zmm18, %zmm30
|
||||
3 4 2.00 vrcp14ps %zmm30, %zmm3
|
||||
1 4 0.50 vmulps %zmm31, %zmm6, %zmm4
|
||||
1 4 0.50 vmulps %zmm4, %zmm31, %zmm4
|
||||
1 4 0.50 vmulps %zmm4, %zmm31, %zmm4
|
||||
1 4 0.50 vaddps %zmm13, %zmm4, %zmm5
|
||||
1 4 0.50 vmulps %zmm31, %zmm7, %zmm31
|
||||
1 4 0.50 vmulps %zmm5, %zmm31, %zmm5
|
||||
1 4 0.50 vmulps %zmm1, %zmm6, %zmm31
|
||||
1 4 0.50 vmulps %zmm31, %zmm1, %zmm31
|
||||
1 4 0.50 vmulps %zmm31, %zmm1, %zmm31
|
||||
1 4 0.50 vmulps %zmm5, %zmm4, %zmm4
|
||||
1 4 0.50 vaddps %zmm13, %zmm31, %zmm5
|
||||
1 4 0.50 vmulps %zmm1, %zmm7, %zmm1
|
||||
1 4 0.50 vmulps %zmm5, %zmm1, %zmm1
|
||||
1 4 0.50 vmulps %zmm2, %zmm6, %zmm5
|
||||
1 4 0.50 vmulps %zmm5, %zmm2, %zmm5
|
||||
1 4 0.50 vmulps %zmm5, %zmm2, %zmm5
|
||||
1 4 0.50 vmulps %zmm1, %zmm31, %zmm1
|
||||
1 4 0.50 vaddps %zmm13, %zmm5, %zmm31
|
||||
1 4 0.50 vmulps %zmm2, %zmm7, %zmm2
|
||||
1 4 0.50 vmulps %zmm31, %zmm2, %zmm2
|
||||
1 4 0.50 vmulps %zmm3, %zmm6, %zmm31
|
||||
1 4 0.50 vmulps %zmm31, %zmm3, %zmm31
|
||||
1 4 0.50 vmulps %zmm31, %zmm3, %zmm31
|
||||
1 4 0.50 vmulps %zmm2, %zmm5, %zmm2
|
||||
1 4 0.50 vaddps %zmm13, %zmm31, %zmm5
|
||||
1 4 0.50 vmulps %zmm3, %zmm7, %zmm3
|
||||
1 4 0.50 vmulps %zmm5, %zmm3, %zmm3
|
||||
1 4 0.50 vmulps %zmm3, %zmm31, %zmm3
|
||||
1 0 0.17 xorl %esi, %esi
|
||||
1 0 0.17 xorl %edi, %edi
|
||||
1 1 0.25 testl $2147483647, %eax
|
||||
1 1 0.50 sete %sil
|
||||
1 1 0.50 setne %dil
|
||||
1 1 0.25 movl $255, %eax
|
||||
1 1 0.50 cmovel %r8d, %eax
|
||||
1 1 0.25 movl $255, %ecx
|
||||
1 1 0.50 cmovel %r9d, %ecx
|
||||
1 1 0.25 xorl $255, %esi
|
||||
1 1 1.00 kmovd %esi, %k1
|
||||
1 4 1.00 vcmpltps %zmm0, %zmm27, %k1 {%k1}
|
||||
1 4 0.50 vmulps %zmm14, %zmm4, %zmm4
|
||||
1 4 0.50 vmulps %zmm4, %zmm24, %zmm5 {%k1} {z}
|
||||
1 4 0.50 vmulps %zmm4, %zmm25, %zmm24 {%k1} {z}
|
||||
1 4 0.50 vmulps %zmm4, %zmm26, %zmm4 {%k1} {z}
|
||||
1 1 0.50 leal (%rdi,%rdi,2), %esi
|
||||
1 1 0.25 orl $252, %esi
|
||||
1 1 1.00 kmovd %esi, %k1
|
||||
1 4 1.00 vcmpltps %zmm0, %zmm28, %k1 {%k1}
|
||||
1 4 0.50 vmulps %zmm14, %zmm1, %zmm1
|
||||
1 4 0.50 vmulps %zmm1, %zmm21, %zmm21 {%k1} {z}
|
||||
1 4 0.50 vaddps %zmm21, %zmm5, %zmm5
|
||||
1 4 0.50 vmulps %zmm1, %zmm22, %zmm21 {%k1} {z}
|
||||
1 4 0.50 vaddps %zmm21, %zmm24, %zmm21
|
||||
1 4 0.50 vmulps %zmm1, %zmm23, %zmm1 {%k1} {z}
|
||||
1 4 0.50 vaddps %zmm1, %zmm4, %zmm1
|
||||
1 1 1.00 kmovd %eax, %k1
|
||||
1 4 1.00 vcmpltps %zmm0, %zmm29, %k1 {%k1}
|
||||
1 4 0.50 vmulps %zmm14, %zmm2, %zmm2
|
||||
1 4 0.50 vmulps %zmm2, %zmm17, %zmm4 {%k1} {z}
|
||||
1 4 0.50 vmulps %zmm2, %zmm19, %zmm17 {%k1} {z}
|
||||
1 4 0.50 vmulps %zmm2, %zmm20, %zmm2 {%k1} {z}
|
||||
1 1 1.00 kmovd %ecx, %k1
|
||||
1 4 1.00 vcmpltps %zmm0, %zmm30, %k1 {%k1}
|
||||
1 4 0.50 vmulps %zmm14, %zmm3, %zmm3
|
||||
1 4 0.50 vmulps %zmm3, %zmm18, %zmm18 {%k1} {z}
|
||||
1 4 0.50 vaddps %zmm18, %zmm4, %zmm4
|
||||
1 4 0.50 vaddps %zmm4, %zmm5, %zmm4
|
||||
1 4 0.50 vmulps %zmm3, %zmm16, %zmm5 {%k1} {z}
|
||||
1 4 0.50 vaddps %zmm5, %zmm17, %zmm5
|
||||
1 4 0.50 vaddps %zmm5, %zmm21, %zmm5
|
||||
1 4 0.50 vmulps %zmm3, %zmm15, %zmm3 {%k1} {z}
|
||||
1 5 0.50 * movq 176(%r15), %rax
|
||||
1 4 0.50 vaddps %zmm3, %zmm2, %zmm2
|
||||
2 8 0.50 * vmovups (%rax,%rbx), %zmm3
|
||||
1 4 0.50 vsubps %zmm4, %zmm3, %zmm3
|
||||
2 1 1.00 * vmovups %zmm3, (%rax,%rbx)
|
||||
1 4 0.50 vaddps %zmm2, %zmm1, %zmm1
|
||||
2 8 0.50 * vmovups 32(%rax,%rbx), %zmm2
|
||||
1 4 0.50 vsubps %zmm5, %zmm2, %zmm2
|
||||
2 1 1.00 * vmovups %zmm2, 32(%rax,%rbx)
|
||||
2 8 0.50 * vmovups 64(%rax,%rbx), %zmm2
|
||||
1 4 0.50 vsubps %zmm1, %zmm2, %zmm1
|
||||
2 1 1.00 * vmovups %zmm1, 64(%rax,%rbx)
|
||||
1 1 0.25 cmpq %rdx, %r10
|
||||
1 1 0.50 je .LBB4_18
|
||||
1 5 0.50 * movq 160(%r15), %rdi
|
||||
1 1 0.25 incq %rdx
|
||||
1 1 0.50 jmp .LBB4_8
|
||||
|
||||
|
||||
Resources:
|
||||
[0] - SKXDivider
|
||||
[1] - SKXFPDivider
|
||||
[2] - SKXPort0
|
||||
[3] - SKXPort1
|
||||
[4] - SKXPort2
|
||||
[5] - SKXPort3
|
||||
[6] - SKXPort4
|
||||
[7] - SKXPort5
|
||||
[8] - SKXPort6
|
||||
[9] - SKXPort7
|
||||
|
||||
|
||||
Resource pressure per iteration:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
|
||||
- - 52.01 14.97 8.49 8.51 3.00 52.02 11.00 2.00
|
||||
|
||||
Resource pressure by instruction:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
|
||||
- - - - 0.49 0.51 - - - - movslq (%r11,%rdx,4), %rax
|
||||
- - - - - - - - 1.00 - movq %rax, %rsi
|
||||
- - - - - - - - 1.00 - shlq $5, %rsi
|
||||
- - - 1.00 - - - - - - leaq (%rsi,%rsi,2), %rbx
|
||||
- - 0.01 0.99 0.50 0.50 - - - - vmovups (%rdi,%rbx), %zmm15
|
||||
- - - - 0.50 0.50 - 1.00 - - vmovups 32(%rdi,%rbx), %zmm16
|
||||
- - - 1.00 0.50 0.50 - - - - vmovups 64(%rdi,%rbx), %zmm27
|
||||
- - - 0.99 0.51 0.49 - 0.01 - - vmovups 128(%rsp), %zmm1
|
||||
- - 0.01 - - - - 0.99 - - vsubps %zmm15, %zmm1, %zmm24
|
||||
- - - 1.00 0.49 0.51 - - - - vmovups 320(%rsp), %zmm1
|
||||
- - 0.99 - - - - 0.01 - - vsubps %zmm16, %zmm1, %zmm25
|
||||
- - 0.01 - - - - 0.99 - - vsubps %zmm27, %zmm9, %zmm26
|
||||
- - 0.01 0.99 0.51 0.49 - - - - vmovups (%rsp), %zmm1
|
||||
- - - - - - - 1.00 - - vsubps %zmm15, %zmm1, %zmm21
|
||||
- - - - 0.49 0.51 - 1.00 - - vmovups 256(%rsp), %zmm1
|
||||
- - 0.01 - - - - 0.99 - - vsubps %zmm16, %zmm1, %zmm22
|
||||
- - 0.99 - - - - 0.01 - - vsubps %zmm27, %zmm10, %zmm23
|
||||
- - - 1.00 0.51 0.49 - - - - vmovups 448(%rsp), %zmm1
|
||||
- - 0.01 - - - - 0.99 - - vsubps %zmm15, %zmm1, %zmm17
|
||||
- - 0.01 - 0.49 0.51 - 0.99 - - vmovups 192(%rsp), %zmm1
|
||||
- - - - - - - 1.00 - - vsubps %zmm16, %zmm1, %zmm19
|
||||
- - 0.01 - - - - 0.99 - - vsubps %zmm27, %zmm11, %zmm20
|
||||
- - 0.99 - 0.50 0.50 - 0.01 - - vmovups 384(%rsp), %zmm1
|
||||
- - - - - - - 1.00 - - vsubps %zmm15, %zmm1, %zmm18
|
||||
- - 0.01 - - - - 0.99 - - vsubps %zmm16, %zmm8, %zmm16
|
||||
- - - - - - - 1.00 - - vsubps %zmm27, %zmm12, %zmm15
|
||||
- - 1.00 - - - - - - - vmulps %zmm26, %zmm26, %zmm27
|
||||
- - 1.00 - - - - - - - vfmadd231ps %zmm25, %zmm25, %zmm27
|
||||
- - 0.99 - - - - 0.01 - - vfmadd231ps %zmm24, %zmm24, %zmm27
|
||||
- - - - - - - 1.00 - - vmulps %zmm23, %zmm23, %zmm28
|
||||
- - - - - - - 1.00 - - vfmadd231ps %zmm22, %zmm22, %zmm28
|
||||
- - 0.01 - - - - 0.99 - - vfmadd231ps %zmm21, %zmm21, %zmm28
|
||||
- - 0.01 - - - - 0.99 - - vmulps %zmm20, %zmm20, %zmm29
|
||||
- - 0.01 - - - - 0.99 - - vfmadd231ps %zmm19, %zmm19, %zmm29
|
||||
- - - - - - - 1.00 - - vfmadd231ps %zmm17, %zmm17, %zmm29
|
||||
- - 0.01 - - - - 0.99 - - vmulps %zmm15, %zmm15, %zmm30
|
||||
- - 0.01 - - - - 0.99 - - vfmadd231ps %zmm16, %zmm16, %zmm30
|
||||
- - 2.00 - - - - 1.00 - - vrcp14ps %zmm27, %zmm31
|
||||
- - 2.00 - - - - 1.00 - - vrcp14ps %zmm28, %zmm1
|
||||
- - 2.00 - - - - 1.00 - - vrcp14ps %zmm29, %zmm2
|
||||
- - 1.00 - - - - - - - vfmadd231ps %zmm18, %zmm18, %zmm30
|
||||
- - 2.00 - - - - 1.00 - - vrcp14ps %zmm30, %zmm3
|
||||
- - - - - - - 1.00 - - vmulps %zmm31, %zmm6, %zmm4
|
||||
- - 1.00 - - - - - - - vmulps %zmm4, %zmm31, %zmm4
|
||||
- - 1.00 - - - - - - - vmulps %zmm4, %zmm31, %zmm4
|
||||
- - 1.00 - - - - - - - vaddps %zmm13, %zmm4, %zmm5
|
||||
- - - - - - - 1.00 - - vmulps %zmm31, %zmm7, %zmm31
|
||||
- - 1.00 - - - - - - - vmulps %zmm5, %zmm31, %zmm5
|
||||
- - 1.00 - - - - - - - vmulps %zmm1, %zmm6, %zmm31
|
||||
- - 1.00 - - - - - - - vmulps %zmm31, %zmm1, %zmm31
|
||||
- - 0.99 - - - - 0.01 - - vmulps %zmm31, %zmm1, %zmm31
|
||||
- - 1.00 - - - - - - - vmulps %zmm5, %zmm4, %zmm4
|
||||
- - - - - - - 1.00 - - vaddps %zmm13, %zmm31, %zmm5
|
||||
- - - - - - - 1.00 - - vmulps %zmm1, %zmm7, %zmm1
|
||||
- - - - - - - 1.00 - - vmulps %zmm5, %zmm1, %zmm1
|
||||
- - - - - - - 1.00 - - vmulps %zmm2, %zmm6, %zmm5
|
||||
- - - - - - - 1.00 - - vmulps %zmm5, %zmm2, %zmm5
|
||||
- - - - - - - 1.00 - - vmulps %zmm5, %zmm2, %zmm5
|
||||
- - 0.99 - - - - 0.01 - - vmulps %zmm1, %zmm31, %zmm1
|
||||
- - - - - - - 1.00 - - vaddps %zmm13, %zmm5, %zmm31
|
||||
- - 1.00 - - - - - - - vmulps %zmm2, %zmm7, %zmm2
|
||||
- - - - - - - 1.00 - - vmulps %zmm31, %zmm2, %zmm2
|
||||
- - - - - - - 1.00 - - vmulps %zmm3, %zmm6, %zmm31
|
||||
- - 1.00 - - - - - - - vmulps %zmm31, %zmm3, %zmm31
|
||||
- - 1.00 - - - - - - - vmulps %zmm31, %zmm3, %zmm31
|
||||
- - 1.00 - - - - - - - vmulps %zmm2, %zmm5, %zmm2
|
||||
- - 1.00 - - - - - - - vaddps %zmm13, %zmm31, %zmm5
|
||||
- - 1.00 - - - - - - - vmulps %zmm3, %zmm7, %zmm3
|
||||
- - 1.00 - - - - - - - vmulps %zmm5, %zmm3, %zmm3
|
||||
- - 1.00 - - - - - - - vmulps %zmm3, %zmm31, %zmm3
|
||||
- - - - - - - - - - xorl %esi, %esi
|
||||
- - - - - - - - - - xorl %edi, %edi
|
||||
- - - - - - - - 1.00 - testl $2147483647, %eax
|
||||
- - - - - - - - 1.00 - sete %sil
|
||||
- - - - - - - - 1.00 - setne %dil
|
||||
- - - 1.00 - - - - - - movl $255, %eax
|
||||
- - - - - - - - 1.00 - cmovel %r8d, %eax
|
||||
- - - 1.00 - - - - - - movl $255, %ecx
|
||||
- - - - - - - - 1.00 - cmovel %r9d, %ecx
|
||||
- - - 1.00 - - - - - - xorl $255, %esi
|
||||
- - - - - - - 1.00 - - kmovd %esi, %k1
|
||||
- - - - - - - 1.00 - - vcmpltps %zmm0, %zmm27, %k1 {%k1}
|
||||
- - - - - - - 1.00 - - vmulps %zmm14, %zmm4, %zmm4
|
||||
- - 1.00 - - - - - - - vmulps %zmm4, %zmm24, %zmm5 {%k1} {z}
|
||||
- - - - - - - 1.00 - - vmulps %zmm4, %zmm25, %zmm24 {%k1} {z}
|
||||
- - 1.00 - - - - - - - vmulps %zmm4, %zmm26, %zmm4 {%k1} {z}
|
||||
- - - 1.00 - - - - - - leal (%rdi,%rdi,2), %esi
|
||||
- - - - - - - - 1.00 - orl $252, %esi
|
||||
- - - - - - - 1.00 - - kmovd %esi, %k1
|
||||
- - - - - - - 1.00 - - vcmpltps %zmm0, %zmm28, %k1 {%k1}
|
||||
- - 0.99 - - - - 0.01 - - vmulps %zmm14, %zmm1, %zmm1
|
||||
- - 0.99 - - - - 0.01 - - vmulps %zmm1, %zmm21, %zmm21 {%k1} {z}
|
||||
- - 1.00 - - - - - - - vaddps %zmm21, %zmm5, %zmm5
|
||||
- - 0.01 - - - - 0.99 - - vmulps %zmm1, %zmm22, %zmm21 {%k1} {z}
|
||||
- - - - - - - 1.00 - - vaddps %zmm21, %zmm24, %zmm21
|
||||
- - 0.99 - - - - 0.01 - - vmulps %zmm1, %zmm23, %zmm1 {%k1} {z}
|
||||
- - 1.00 - - - - - - - vaddps %zmm1, %zmm4, %zmm1
|
||||
- - - - - - - 1.00 - - kmovd %eax, %k1
|
||||
- - - - - - - 1.00 - - vcmpltps %zmm0, %zmm29, %k1 {%k1}
|
||||
- - - - - - - 1.00 - - vmulps %zmm14, %zmm2, %zmm2
|
||||
- - 1.00 - - - - - - - vmulps %zmm2, %zmm17, %zmm4 {%k1} {z}
|
||||
- - - - - - - 1.00 - - vmulps %zmm2, %zmm19, %zmm17 {%k1} {z}
|
||||
- - 1.00 - - - - - - - vmulps %zmm2, %zmm20, %zmm2 {%k1} {z}
|
||||
- - - - - - - 1.00 - - kmovd %ecx, %k1
|
||||
- - - - - - - 1.00 - - vcmpltps %zmm0, %zmm30, %k1 {%k1}
|
||||
- - 1.00 - - - - - - - vmulps %zmm14, %zmm3, %zmm3
|
||||
- - - - - - - 1.00 - - vmulps %zmm3, %zmm18, %zmm18 {%k1} {z}
|
||||
- - 0.99 - - - - 0.01 - - vaddps %zmm18, %zmm4, %zmm4
|
||||
- - 1.00 - - - - - - - vaddps %zmm4, %zmm5, %zmm4
|
||||
- - - - - - - 1.00 - - vmulps %zmm3, %zmm16, %zmm5 {%k1} {z}
|
||||
- - 1.00 - - - - - - - vaddps %zmm5, %zmm17, %zmm5
|
||||
- - 0.99 - - - - 0.01 - - vaddps %zmm5, %zmm21, %zmm5
|
||||
- - 1.00 - - - - - - - vmulps %zmm3, %zmm15, %zmm3 {%k1} {z}
|
||||
- - - - 1.00 - - - - - movq 176(%r15), %rax
|
||||
- - 0.99 - - - - 0.01 - - vaddps %zmm3, %zmm2, %zmm2
|
||||
- - - 1.00 0.50 0.50 - - - - vmovups (%rax,%rbx), %zmm3
|
||||
- - 0.99 - - - - 0.01 - - vsubps %zmm4, %zmm3, %zmm3
|
||||
- - - - - - 1.00 - - 1.00 vmovups %zmm3, (%rax,%rbx)
|
||||
- - 1.00 - - - - - - - vaddps %zmm2, %zmm1, %zmm1
|
||||
- - - 1.00 0.50 0.50 - - - - vmovups 32(%rax,%rbx), %zmm2
|
||||
- - 1.00 - - - - - - - vsubps %zmm5, %zmm2, %zmm2
|
||||
- - - - - - 1.00 - - 1.00 vmovups %zmm2, 32(%rax,%rbx)
|
||||
- - - 1.00 0.50 0.50 - - - - vmovups 64(%rax,%rbx), %zmm2
|
||||
- - 0.99 - - - - 0.01 - - vsubps %zmm1, %zmm2, %zmm1
|
||||
- - - - - 1.00 1.00 - - - vmovups %zmm1, 64(%rax,%rbx)
|
||||
- - - - - - - - 1.00 - cmpq %rdx, %r10
|
||||
- - - - - - - - 1.00 - je .LBB4_18
|
||||
- - - - 0.50 0.50 - - - - movq 160(%r15), %rdi
|
||||
- - - 1.00 - - - - - - incq %rdx
|
||||
- - - - - - - - 1.00 - jmp .LBB4_8
|
116
static_analysis/jan/analyses/gromacs-icx-avx512-sp-osaca-icx.out
Normal file
116
static_analysis/jan/analyses/gromacs-icx-avx512-sp-osaca-icx.out
Normal file
@@ -0,0 +1,116 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: gromacs-icx-avx512-sp.s
|
||||
Architecture: ICX
|
||||
Timestamp: 2023-02-14 12:51:43
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD |
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
1338 | | | | | | | | | | || | | # pointer_increment=64 0f91ac4f7fe1a70d0c899f7f3e745649
|
||||
1339 | | | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
1340 | | | | | | | | | | || | | .LBB2_12: # Parent Loop BB2_7 Depth=1
|
||||
1341 | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2
|
||||
1342 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | movslq (%r11,%rax,4), %rcx
|
||||
1343 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || 6.0 | | leaq (%rcx,%rcx,2), %rdx
|
||||
1344 | 0.00 | | | | | | 1.00 | | | || 1.0 | | shlq $5, %rdx
|
||||
1345 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd (%rsi,%rdx), %zmm16
|
||||
1346 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vbroadcastf64x4 64(%rsi,%rdx), %zmm20 # zmm20 = mem[0,1,2,3,0,1,2,3]
|
||||
1347 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vbroadcastf64x4 (%rsi,%rdx), %zmm19 # zmm19 = mem[0,1,2,3,0,1,2,3]
|
||||
1348 | | | | | | 1.000 | | | | || | | vshuff64x2 $238, %zmm16, %zmm16, %zmm21 # zmm21 = zmm16[4,5,6,7,4,5,6,7]
|
||||
1349 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm19, %zmm6, %zmm18
|
||||
1350 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm21, %zmm10, %zmm17
|
||||
1351 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vsubps %zmm20, %zmm14, %zmm16
|
||||
1352 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm16, %zmm16, %zmm22
|
||||
1353 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231ps %zmm17, %zmm17, %zmm22 # zmm22 = (zmm17 * zmm17) + zmm22
|
||||
1354 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231ps %zmm18, %zmm18, %zmm22 # zmm22 = (zmm18 * zmm18) + zmm22
|
||||
1355 | 2.50 | | | | | 0.500 | | | | || 6.0 | | vrcp14ps %zmm22, %zmm23
|
||||
1356 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm23, %zmm26, %zmm24
|
||||
1357 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm24, %zmm23, %zmm24
|
||||
1358 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm24, %zmm23, %zmm24
|
||||
1359 | 0.75 | | | | | 0.250 | | | | || 4.0 | | vaddps %zmm1, %zmm24, %zmm25
|
||||
1360 | 1.00 | | | | | 0.000 | | | | || | | vmulps %zmm23, %zmm27, %zmm23
|
||||
1361 | 1.00 | | | | | 0.000 | | | | || 4.0 | | vmulps %zmm25, %zmm23, %zmm23
|
||||
1362 | 1.00 | | | | | 0.000 | | | | || 4.0 | | vmulps %zmm23, %zmm24, %zmm23
|
||||
1363 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | leal (%rcx,%rcx), %edx
|
||||
1364 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | xorl %edi, %edi
|
||||
1365 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | xorl %ebp, %ebp
|
||||
1366 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | cmpq %rdx, %r12
|
||||
1367 | 0.00 | | | | | | 1.00 | | | || | | setne %dil
|
||||
1368 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | leal 1(%rcx,%rcx), %ecx
|
||||
1369 | 0.00 | | | | | | 1.00 | | | || | | sete %bpl
|
||||
1370 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | xorl %edx, %edx
|
||||
1371 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | xorl %ebx, %ebx
|
||||
1372 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | cmpq %rcx, %r12
|
||||
1373 | 0.00 | | | | | | 1.00 | | | || | | sete %dl
|
||||
1374 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | movl $0, %ecx
|
||||
1375 | 0.00 | | | | | | 1.00 | | | || | | setne %bl
|
||||
1376 | 0.00 | | | | | | 1.00 | | | || | | cmovel %r8d, %ecx
|
||||
1377 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | movl %ebx, %r14d
|
||||
1378 | 0.00 | | | | | | 1.00 | | | || | | shll $4, %r14d
|
||||
1379 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | subl %ebp, %r14d
|
||||
1380 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | leal (%rcx,%rdi,2), %ecx
|
||||
1381 | 0.00 | | | | | | 1.00 | | | || | | shll $8, %ecx
|
||||
1382 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | addl $239, %r14d
|
||||
1383 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | addl $-768, %ecx # imm = 0xFD00
|
||||
1384 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | orl %r14d, %ecx
|
||||
1385 | 1.00 | | | | | | | | | || | | kmovd %ecx, %k2
|
||||
1386 | 0.50 | | | | | 0.500 | | | | || | | vcmpltps %zmm0, %zmm22, %k2 {%k2}
|
||||
1387 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm21, %zmm11, %zmm21
|
||||
1388 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm20, %zmm15, %zmm20
|
||||
1389 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm19, %zmm7, %zmm19
|
||||
1390 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm2, %zmm23, %zmm22
|
||||
1391 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231ps %zmm22, %zmm18, %zmm12 {%k2} # zmm12 {%k2} = (zmm18 * zmm22) + zmm12
|
||||
1392 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm20, %zmm20, %zmm18
|
||||
1393 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm21, %zmm21, %zmm18 # zmm18 = (zmm21 * zmm21) + zmm18
|
||||
1394 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm19, %zmm19, %zmm18 # zmm18 = (zmm19 * zmm19) + zmm18
|
||||
1395 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm22, %zmm17, %zmm9 {%k2} # zmm9 {%k2} = (zmm17 * zmm22) + zmm9
|
||||
1396 | 2.50 | | | | | 0.500 | | | | || | | vrcp14ps %zmm18, %zmm17
|
||||
1397 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm22, %zmm16, %zmm5 {%k2} # zmm5 {%k2} = (zmm16 * zmm22) + zmm5
|
||||
1398 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm17, %zmm26, %zmm16
|
||||
1399 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm16, %zmm17, %zmm16
|
||||
1400 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm16, %zmm17, %zmm16
|
||||
1401 | 0.00 | | | | | 1.000 | | | | || | | vaddps %zmm1, %zmm16, %zmm22
|
||||
1402 | 0.00 | | | | | 1.000 | | | | || | | vmulps %zmm17, %zmm27, %zmm17
|
||||
1403 | 0.00 | | | | | 1.000 | | | | || | | vmulps %zmm22, %zmm17, %zmm17
|
||||
1404 | 0.00 | | | | | 1.000 | | | | || | | vmulps %zmm17, %zmm16, %zmm16
|
||||
1405 | 0.00 | | | | | | 1.00 | | | || | | shll $6, %ebx
|
||||
1406 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | leal (%rbx,%rdi,4), %ecx
|
||||
1407 | 0.00 | | | | | | 1.00 | | | || | | shll $7, %edx
|
||||
1408 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | leal (%rdx,%rdi,8), %edx
|
||||
1409 | 0.00 | | | | | | 1.00 | | | || | | shll $8, %edx
|
||||
1410 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | addl %edx, %ecx
|
||||
1411 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | addl $-2117, %ecx # imm = 0xF7BB
|
||||
1412 | 1.00 | | | | | | | | | || | | kmovd %ecx, %k2
|
||||
1413 | 0.00 | | | | | 1.000 | | | | || | | vcmpltps %zmm0, %zmm18, %k2 {%k2}
|
||||
1414 | 0.00 | | | | | 1.000 | | | | || | | vmulps %zmm2, %zmm16, %zmm16
|
||||
1415 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231ps %zmm16, %zmm19, %zmm13 {%k2} # zmm13 {%k2} = (zmm19 * zmm16) + zmm13
|
||||
1416 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231ps %zmm16, %zmm21, %zmm8 {%k2} # zmm8 {%k2} = (zmm21 * zmm16) + zmm8
|
||||
1417 | 0.24 | | | | | 0.760 | | | | || | 4.0 | vfmadd231ps %zmm16, %zmm20, %zmm4 {%k2} # zmm4 {%k2} = (zmm20 * zmm16) + zmm4
|
||||
1418 | 0.00 | 1.00 | | | | -0.01 | 0.00 | | | || | | incq %rax
|
||||
1419 | 0.00 | 1.00 | | | | -0.01 | 0.00 | | | || | | cmpq %rax, %r10
|
||||
1420 | | | | | | | | | | || | | * jne .LBB2_12
|
||||
1421 | | | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
22.5 16.5 2.00 2.00 2.00 2.00 22.49 16.5 71 4.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
1417 | 4.0 | vfmadd231ps %zmm16, %zmm20, %zmm4 {%k2} # zmm4 {%k2} = (zmm20 * zmm16) + zmm4| [1417]
|
||||
1416 | 4.0 | vfmadd231ps %zmm16, %zmm21, %zmm8 {%k2} # zmm8 {%k2} = (zmm21 * zmm16) + zmm8| [1416]
|
||||
1415 | 4.0 | vfmadd231ps %zmm16, %zmm19, %zmm13 {%k2} # zmm13 {%k2} = (zmm19 * zmm16) + zmm13| [1415]
|
||||
1397 | 4.0 | vfmadd231ps %zmm22, %zmm16, %zmm5 {%k2} # zmm5 {%k2} = (zmm16 * zmm22) + zmm5| [1397]
|
||||
1395 | 4.0 | vfmadd231ps %zmm22, %zmm17, %zmm9 {%k2} # zmm9 {%k2} = (zmm17 * zmm22) + zmm9| [1395]
|
||||
1391 | 4.0 | vfmadd231ps %zmm22, %zmm18, %zmm12 {%k2} # zmm12 {%k2} = (zmm18 * zmm22) + zmm12| [1391]
|
||||
1418 | 1.0 | incq %rax | [1418]
|
||||
|
161
static_analysis/jan/analyses/gromacs-icx-avx512-sp-osaca.out
Normal file
161
static_analysis/jan/analyses/gromacs-icx-avx512-sp-osaca.out
Normal file
@@ -0,0 +1,161 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: gromacs-icx-avx512-sp.s
|
||||
Architecture: CSX
|
||||
Timestamp: 2023-02-10 16:31:04
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
1662 | | | | | | | | || | | # pointer_increment=64 0f91ac4f7fe1a70d0c899f7f3e745649
|
||||
1663 | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
1664 | | | | | | | | || | | .LBB4_8: # =>This Inner Loop Header: Depth=1
|
||||
1665 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | movslq (%r11,%rdx,4), %rax
|
||||
1666 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || 1.0 | | movq %rax, %rsi
|
||||
1667 | 0.00 | | | | | | 1.00 | || 1.0 | | shlq $5, %rsi
|
||||
1668 | | 1.00 | | | | 0.00 | | || 1.0 | | leaq (%rsi,%rsi,2), %rbx
|
||||
1669 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%rdi,%rbx), %zmm15 # AlignMOV convert to UnAlignMOV
|
||||
1670 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 32(%rdi,%rbx), %zmm16 # AlignMOV convert to UnAlignMOV
|
||||
1671 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovups 64(%rdi,%rbx), %zmm27 # AlignMOV convert to UnAlignMOV
|
||||
1672 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 128(%rsp), %zmm1 # 64-byte Reload
|
||||
1673 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm24
|
||||
1674 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 320(%rsp), %zmm1 # 64-byte Reload
|
||||
1675 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm1, %zmm25
|
||||
1676 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubps %zmm27, %zmm9, %zmm26
|
||||
1677 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%rsp), %zmm1 # 64-byte Reload
|
||||
1678 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm21
|
||||
1679 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 256(%rsp), %zmm1 # 64-byte Reload
|
||||
1680 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm1, %zmm22
|
||||
1681 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm27, %zmm10, %zmm23
|
||||
1682 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 448(%rsp), %zmm1 # 64-byte Reload
|
||||
1683 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm17
|
||||
1684 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 192(%rsp), %zmm1 # 64-byte Reload
|
||||
1685 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm1, %zmm19
|
||||
1686 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm27, %zmm11, %zmm20
|
||||
1687 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 384(%rsp), %zmm1 # 64-byte Reload
|
||||
1688 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm18
|
||||
1689 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm8, %zmm16
|
||||
1690 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm27, %zmm12, %zmm15
|
||||
1691 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm26, %zmm26, %zmm27
|
||||
1692 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231ps %zmm25, %zmm25, %zmm27 # zmm27 = (zmm25 * zmm25) + zmm27
|
||||
1693 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231ps %zmm24, %zmm24, %zmm27 # zmm27 = (zmm24 * zmm24) + zmm27
|
||||
1694 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm23, %zmm23, %zmm28
|
||||
1695 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm22, %zmm22, %zmm28 # zmm28 = (zmm22 * zmm22) + zmm28
|
||||
1696 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm21, %zmm21, %zmm28 # zmm28 = (zmm21 * zmm21) + zmm28
|
||||
1697 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm20, %zmm20, %zmm29
|
||||
1698 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm19, %zmm19, %zmm29 # zmm29 = (zmm19 * zmm19) + zmm29
|
||||
1699 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm17, %zmm17, %zmm29 # zmm29 = (zmm17 * zmm17) + zmm29
|
||||
1700 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm15, %zmm15, %zmm30
|
||||
1701 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm16, %zmm16, %zmm30 # zmm30 = (zmm16 * zmm16) + zmm30
|
||||
1702 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14ps %zmm27, %zmm31
|
||||
1703 | 2.50 | | | | | 0.50 | | || | | vrcp14ps %zmm28, %zmm1
|
||||
1704 | 2.50 | | | | | 0.50 | | || | | vrcp14ps %zmm29, %zmm2
|
||||
1705 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm18, %zmm18, %zmm30 # zmm30 = (zmm18 * zmm18) + zmm30
|
||||
1706 | 2.50 | | | | | 0.50 | | || | | vrcp14ps %zmm30, %zmm3
|
||||
1707 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm31, %zmm6, %zmm4
|
||||
1708 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm4, %zmm31, %zmm4
|
||||
1709 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm4, %zmm31, %zmm4
|
||||
1710 | 0.50 | | | | | 0.50 | | || 4.0 | | vaddps %zmm13, %zmm4, %zmm5
|
||||
1711 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm7, %zmm31
|
||||
1712 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm5, %zmm31, %zmm5
|
||||
1713 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm6, %zmm31
|
||||
1714 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm1, %zmm31
|
||||
1715 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm1, %zmm31
|
||||
1716 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm5, %zmm4, %zmm4
|
||||
1717 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm13, %zmm31, %zmm5
|
||||
1718 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm7, %zmm1
|
||||
1719 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm5, %zmm1, %zmm1
|
||||
1720 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm6, %zmm5
|
||||
1721 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm5, %zmm2, %zmm5
|
||||
1722 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm5, %zmm2, %zmm5
|
||||
1723 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm31, %zmm1
|
||||
1724 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm13, %zmm5, %zmm31
|
||||
1725 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm7, %zmm2
|
||||
1726 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm2, %zmm2
|
||||
1727 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm3, %zmm6, %zmm31
|
||||
1728 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm3, %zmm31
|
||||
1729 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm3, %zmm31
|
||||
1730 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm5, %zmm2
|
||||
1731 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm13, %zmm31, %zmm5
|
||||
1732 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm3, %zmm7, %zmm3
|
||||
1733 | 1.00 | | | | | 0.00 | | || | | vmulps %zmm5, %zmm3, %zmm3
|
||||
1734 | 1.00 | | | | | 0.00 | | || | | vmulps %zmm3, %zmm31, %zmm3
|
||||
1735 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | xorl %esi, %esi
|
||||
1736 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | xorl %edi, %edi
|
||||
1737 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | testl $2147483647, %eax # imm = 0x7FFFFFFF
|
||||
1738 | 0.00 | | | | | | 1.00 | || | | sete %sil
|
||||
1739 | 0.00 | | | | | | 1.00 | || | | setne %dil
|
||||
1740 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl $255, %eax
|
||||
1741 | 0.00 | | | | | | 1.00 | || | | cmovel %r8d, %eax
|
||||
1742 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl $255, %ecx
|
||||
1743 | 0.00 | | | | | | 1.00 | || | | cmovel %r9d, %ecx
|
||||
1744 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | | xorl $255, %esi
|
||||
1745 | 1.00 | | | | | | | || | | kmovd %esi, %k1
|
||||
1746 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm27, %k1 {%k1}
|
||||
1747 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm14, %zmm4, %zmm4
|
||||
1748 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm4, %zmm24, %zmm5 {%k1} {z}
|
||||
1749 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm4, %zmm25, %zmm24 {%k1} {z}
|
||||
1750 | 0.25 | | | | | 0.75 | | || | | vmulps %zmm4, %zmm26, %zmm4 {%k1} {z}
|
||||
1751 | | 1.00 | | | | 0.00 | | || | | leal (%rdi,%rdi,2), %esi
|
||||
1752 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | orl $252, %esi
|
||||
1753 | 1.00 | | | | | | | || | | kmovd %esi, %k1
|
||||
1754 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm28, %k1 {%k1}
|
||||
1755 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm14, %zmm1, %zmm1
|
||||
1756 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm21, %zmm21 {%k1} {z}
|
||||
1757 | 0.50 | | | | | 0.50 | | || 4.0 | | vaddps %zmm21, %zmm5, %zmm5
|
||||
1758 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm22, %zmm21 {%k1} {z}
|
||||
1759 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm21, %zmm24, %zmm21
|
||||
1760 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm23, %zmm1 {%k1} {z}
|
||||
1761 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm1, %zmm4, %zmm1
|
||||
1762 | 1.00 | | | | | | | || | | kmovd %eax, %k1
|
||||
1763 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm29, %k1 {%k1}
|
||||
1764 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm14, %zmm2, %zmm2
|
||||
1765 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm17, %zmm4 {%k1} {z}
|
||||
1766 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm19, %zmm17 {%k1} {z}
|
||||
1767 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm20, %zmm2 {%k1} {z}
|
||||
1768 | 1.00 | | | | | | | || | | kmovd %ecx, %k1
|
||||
1769 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm30, %k1 {%k1}
|
||||
1770 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm14, %zmm3, %zmm3
|
||||
1771 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm3, %zmm18, %zmm18 {%k1} {z}
|
||||
1772 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm18, %zmm4, %zmm4
|
||||
1773 | 0.25 | | | | | 0.75 | | || 4.0 | | vaddps %zmm4, %zmm5, %zmm4
|
||||
1774 | 0.00 | | | | | 1.00 | | || | | vmulps %zmm3, %zmm16, %zmm5 {%k1} {z}
|
||||
1775 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm5, %zmm17, %zmm5
|
||||
1776 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm5, %zmm21, %zmm5
|
||||
1777 | 0.00 | | | | | 1.00 | | || | | vmulps %zmm3, %zmm15, %zmm3 {%k1} {z}
|
||||
1778 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | movq 176(%r15), %rax
|
||||
1779 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm3, %zmm2, %zmm2
|
||||
1780 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%rax,%rbx), %zmm3 # AlignMOV convert to UnAlignMOV
|
||||
1781 | 0.00 | | | | | 1.00 | | || 4.0 | | vsubps %zmm4, %zmm3, %zmm3
|
||||
1782 | | | 0.50 | 0.50 | 1.00 | | | || 0.0 | | vmovups %zmm3, (%rax,%rbx) # AlignMOV convert to UnAlignMOV
|
||||
1783 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm2, %zmm1, %zmm1
|
||||
1784 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 32(%rax,%rbx), %zmm2 # AlignMOV convert to UnAlignMOV
|
||||
1785 | 0.00 | | | | | 1.00 | | || | | vsubps %zmm5, %zmm2, %zmm2
|
||||
1786 | | | 0.50 | 0.50 | 1.00 | | | || | | vmovups %zmm2, 32(%rax,%rbx) # AlignMOV convert to UnAlignMOV
|
||||
1787 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 64(%rax,%rbx), %zmm2 # AlignMOV convert to UnAlignMOV
|
||||
1788 | 0.00 | | | | | 1.00 | | || | | vsubps %zmm1, %zmm2, %zmm1
|
||||
1789 | | | 0.50 | 0.50 | 1.00 | | | || | | vmovups %zmm1, 64(%rax,%rbx) # AlignMOV convert to UnAlignMOV
|
||||
1790 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rdx, %r10
|
||||
1791 | | | | | | | | || | | * je .LBB4_18
|
||||
1792 | | | | | | | | || | | # %bb.9: # in Loop: Header=BB4_8 Depth=1
|
||||
1793 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | movq 160(%r15), %rdi
|
||||
1794 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | 1.0 | incq %rdx
|
||||
1795 | 0.00 | | | | | | 1.00 | || | | jmp .LBB4_8
|
||||
1796 | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
50.0 9.00 9.50 8.00 9.50 8.00 3.00 50.0 9.00 79.0 1.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
1794 | 1.0 | incq %rdx | [1794]
|
||||
|
88
static_analysis/jan/analyses/lammps-icc-avx2-iaca.out
Normal file
88
static_analysis/jan/analyses/lammps-icc-avx2-iaca.out
Normal file
@@ -0,0 +1,88 @@
|
||||
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||
Analyzed File - lammps-icc-avx2.o
|
||||
Binary Format - 64Bit
|
||||
Architecture - SKX
|
||||
Analysis Type - Throughput
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
Block Throughput: 25.58 Cycles Throughput Bottleneck: Backend
|
||||
Loop Count: 22
|
||||
Port Binding In Cycles Per Iteration:
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Cycles | 13.7 8.0 | 13.6 | 5.5 5.5 | 5.5 5.5 | 0.0 | 13.7 | 7.0 | 0.0 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
|
||||
DV - Divider pipe (on port 0)
|
||||
D - Data fetch pipe (on ports 2 and 3)
|
||||
F - Macro Fusion with the previous instruction occurred
|
||||
* - instruction micro-ops not bound to a port
|
||||
^ - Micro Fusion occurred
|
||||
# - ESP Tracking sync uop was issued
|
||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||
X - instruction not supported, was not accounted in Analysis
|
||||
|
||||
| Num Of | Ports pressure in cycles | |
|
||||
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
-----------------------------------------------------------------------------------------
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovdqu xmm0, xmmword ptr [rbx+rdx*4]
|
||||
| 1 | 1.0 | | | | | | | | vmovq rcx, xmm0
|
||||
| 1 | | | | | | 1.0 | | | vpunpckhqdq xmm2, xmm0, xmm0
|
||||
| 1 | 1.0 | | | | | | | | vmovq r15, xmm2
|
||||
| 1* | | | | | | | | | mov r8d, ecx
|
||||
| 1 | | | | | | | 1.0 | | shr rcx, 0x20
|
||||
| 1 | | | | | | 1.0 | | | lea r14d, ptr [rcx+rcx*2]
|
||||
| 1 | | | | | | 1.0 | | | lea r8d, ptr [r8+r8*2]
|
||||
| 1 | | | | | | | 1.0 | | movsxd rcx, r8d
|
||||
| 1 | | | | | | | 1.0 | | movsxd r8, r14d
|
||||
| 1* | | | | | | | | | mov r14d, r15d
|
||||
| 1 | | | | | | | 1.0 | | shr r15, 0x20
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups xmm7, xmmword ptr [r11+rcx*8]
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups xmm6, xmmword ptr [r11+r8*8]
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovq xmm14, qword ptr [r11+rcx*8+0x10]
|
||||
| 1 | | 0.3 | | | | 0.7 | | | lea r14d, ptr [r14+r14*2]
|
||||
| 1 | | | | | | | 1.0 | | movsxd r14, r14d
|
||||
| 1 | | 0.7 | | | | 0.3 | | | lea r15d, ptr [r15+r15*2]
|
||||
| 1 | | | | | | | 1.0 | | movsxd r15, r15d
|
||||
| 2 | | | 0.5 0.5 | 0.5 0.5 | | 1.0 | | | vmovhpd xmm15, xmm14, qword ptr [r11+r8*8+0x10]
|
||||
| 2 | | 1.0 | 0.5 0.5 | 0.5 0.5 | | | | | vinsertf128 ymm1, ymm7, xmmword ptr [r11+r14*8], 0x1
|
||||
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovq xmm0, qword ptr [r11+r14*8+0x10]
|
||||
| 2 | | 0.3 | 0.5 0.5 | 0.5 0.5 | | 0.7 | | | vinsertf128 ymm6, ymm6, xmmword ptr [r11+r15*8], 0x1
|
||||
| 2 | | | 0.5 0.5 | 0.5 0.5 | | 1.0 | | | vmovhpd xmm2, xmm0, qword ptr [r11+r15*8+0x10]
|
||||
| 1 | | | | | | 1.0 | | | vunpcklpd ymm14, ymm1, ymm6
|
||||
| 1 | | | | | | 1.0 | | | vunpckhpd ymm1, ymm1, ymm6
|
||||
| 1 | 0.7 | 0.3 | | | | | | | vsubpd ymm6, ymm10, ymm14
|
||||
| 1 | | | | | | 1.0 | | | vinsertf128 ymm7, ymm15, xmm2, 0x1
|
||||
| 1 | 0.3 | 0.7 | | | | | | | vsubpd ymm2, ymm9, ymm1
|
||||
| 1 | 0.7 | 0.3 | | | | | | | vsubpd ymm0, ymm8, ymm7
|
||||
| 1 | 0.3 | 0.7 | | | | | | | vmulpd ymm14, ymm2, ymm2
|
||||
| 1 | 0.7 | 0.3 | | | | | | | vfmadd231pd ymm14, ymm6, ymm6
|
||||
| 1 | 0.3 | 0.7 | | | | | | | vfmadd231pd ymm14, ymm0, ymm0
|
||||
| 1 | 0.7 | 0.3 | | | | | | | vcmppd ymm1, ymm14, ymm5, 0x1
|
||||
| 1 | 0.3 | 0.7 | | | | | | | vpcmpeqd ymm7, ymm7, ymm7
|
||||
| 2 | 1.0 | | | | | 1.0 | | | vptest ymm1, ymm7
|
||||
| 1 | 1.0 8.0 | | | | | | | | vdivpd ymm7, ymm4, ymm14
|
||||
| 2^ | | 1.0 | 0.5 0.5 | 0.5 0.5 | | | | | vmulpd ymm14, ymm7, ymmword ptr [rsp+0x60]
|
||||
| 1 | | 1.0 | | | | | | | vmulpd ymm14, ymm7, ymm14
|
||||
| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm15, ymm7, ymm14
|
||||
| 1 | 0.3 | 0.7 | | | | | | | vfmsub213pd ymm14, ymm7, ymm3
|
||||
| 2^ | 0.7 | 0.3 | 0.5 0.5 | 0.5 0.5 | | | | | vmulpd ymm7, ymm7, ymmword ptr [rsp+0x40]
|
||||
| 1 | 0.3 | 0.7 | | | | | | | vmulpd ymm15, ymm15, ymm7
|
||||
| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm7, ymm15, ymm14
|
||||
| 1 | 0.3 | 0.7 | | | | | | | vmulpd ymm6, ymm6, ymm7
|
||||
| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm2, ymm2, ymm7
|
||||
| 1 | | | | | | 1.0 | | | vandpd ymm6, ymm1, ymm6
|
||||
| 1 | 0.3 | 0.7 | | | | | | | vaddpd ymm13, ymm13, ymm6
|
||||
| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm6, ymm0, ymm7
|
||||
| 1 | | | | | | 1.0 | | | vandpd ymm0, ymm1, ymm2
|
||||
| 1 | | | | | | 1.0 | | | vandpd ymm1, ymm1, ymm6
|
||||
| 1 | 0.3 | 0.7 | | | | | | | vaddpd ymm12, ymm12, ymm0
|
||||
| 1 | 0.7 | 0.3 | | | | | | | vaddpd ymm11, ymm11, ymm1
|
||||
| 1 | | | | | | | 1.0 | | add rdx, 0x4
|
||||
| 1* | | | | | | | | | cmp rdx, rsi
|
||||
| 0*F | | | | | | | | | jb 0xffffffffffffff02
|
||||
Total Num Of Uops: 62
|
||||
Analysis Notes:
|
||||
Backend allocation was stalled due to unavailable allocation resources.
|
156
static_analysis/jan/analyses/lammps-icc-avx2-mca-csx.out
Normal file
156
static_analysis/jan/analyses/lammps-icc-avx2-mca-csx.out
Normal file
@@ -0,0 +1,156 @@
|
||||
|
||||
[0] Code Region
|
||||
|
||||
Iterations: 100
|
||||
Instructions: 5600
|
||||
Total Cycles: 2352
|
||||
Total uOps: 6300
|
||||
|
||||
Dispatch Width: 6
|
||||
uOps Per Cycle: 2.68
|
||||
IPC: 2.38
|
||||
Block RThroughput: 10.5
|
||||
|
||||
|
||||
Instruction Info:
|
||||
[1]: #uOps
|
||||
[2]: Latency
|
||||
[3]: RThroughput
|
||||
[4]: MayLoad
|
||||
[5]: MayStore
|
||||
[6]: HasSideEffects (U)
|
||||
|
||||
[1] [2] [3] [4] [5] [6] Instructions:
|
||||
1 6 0.50 * vmovdqu (%rbx,%rdx,4), %xmm0
|
||||
1 2 1.00 vmovq %xmm0, %rcx
|
||||
1 1 1.00 vpunpckhqdq %xmm0, %xmm0, %xmm2
|
||||
1 2 1.00 vmovq %xmm2, %r15
|
||||
1 1 0.25 movl %ecx, %r8d
|
||||
1 1 0.50 shrq $32, %rcx
|
||||
1 1 0.50 leal (%rcx,%rcx,2), %r14d
|
||||
1 1 0.50 leal (%r8,%r8,2), %r8d
|
||||
1 1 0.25 movslq %r8d, %rcx
|
||||
1 1 0.25 movslq %r14d, %r8
|
||||
1 1 0.25 movl %r15d, %r14d
|
||||
1 1 0.50 shrq $32, %r15
|
||||
1 6 0.50 * vmovups (%r11,%rcx,8), %xmm7
|
||||
1 6 0.50 * vmovups (%r11,%r8,8), %xmm6
|
||||
1 5 0.50 * vmovq 16(%r11,%rcx,8), %xmm14
|
||||
1 1 0.50 leal (%r14,%r14,2), %r14d
|
||||
1 1 0.25 movslq %r14d, %r14
|
||||
1 1 0.50 leal (%r15,%r15,2), %r15d
|
||||
1 1 0.25 movslq %r15d, %r15
|
||||
2 6 1.00 * vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15
|
||||
2 7 0.50 * vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1
|
||||
1 5 0.50 * vmovq 16(%r11,%r14,8), %xmm0
|
||||
2 7 0.50 * vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6
|
||||
2 6 1.00 * vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2
|
||||
1 1 1.00 vunpcklpd %ymm6, %ymm1, %ymm14
|
||||
1 1 1.00 vunpckhpd %ymm6, %ymm1, %ymm1
|
||||
1 4 0.50 vsubpd %ymm14, %ymm10, %ymm6
|
||||
1 3 1.00 vinsertf128 $1, %xmm2, %ymm15, %ymm7
|
||||
1 4 0.50 vsubpd %ymm1, %ymm9, %ymm2
|
||||
1 4 0.50 vsubpd %ymm7, %ymm8, %ymm0
|
||||
1 4 0.50 vmulpd %ymm2, %ymm2, %ymm14
|
||||
1 4 0.50 vfmadd231pd %ymm6, %ymm6, %ymm14
|
||||
1 4 0.50 vfmadd231pd %ymm0, %ymm0, %ymm14
|
||||
1 4 0.50 vcmpltpd %ymm5, %ymm14, %ymm1
|
||||
1 1 0.50 vpcmpeqd %ymm7, %ymm7, %ymm7
|
||||
2 3 1.00 vptest %ymm7, %ymm1
|
||||
1 14 5.00 vdivpd %ymm14, %ymm4, %ymm7
|
||||
2 11 0.50 * vmulpd 96(%rsp), %ymm7, %ymm14
|
||||
1 4 0.50 vmulpd %ymm14, %ymm7, %ymm14
|
||||
1 4 0.50 vmulpd %ymm14, %ymm7, %ymm15
|
||||
1 4 0.50 vfmsub213pd %ymm3, %ymm7, %ymm14
|
||||
2 11 0.50 * vmulpd 64(%rsp), %ymm7, %ymm7
|
||||
1 4 0.50 vmulpd %ymm7, %ymm15, %ymm15
|
||||
1 4 0.50 vmulpd %ymm14, %ymm15, %ymm7
|
||||
1 4 0.50 vmulpd %ymm7, %ymm6, %ymm6
|
||||
1 4 0.50 vmulpd %ymm7, %ymm2, %ymm2
|
||||
1 1 0.33 vandpd %ymm6, %ymm1, %ymm6
|
||||
1 4 0.50 vaddpd %ymm6, %ymm13, %ymm13
|
||||
1 4 0.50 vmulpd %ymm7, %ymm0, %ymm6
|
||||
1 1 0.33 vandpd %ymm2, %ymm1, %ymm0
|
||||
1 1 0.33 vandpd %ymm6, %ymm1, %ymm1
|
||||
1 4 0.50 vaddpd %ymm0, %ymm12, %ymm12
|
||||
1 4 0.50 vaddpd %ymm1, %ymm11, %ymm11
|
||||
1 1 0.25 addq $4, %rdx
|
||||
1 1 0.25 cmpq %rsi, %rdx
|
||||
1 1 0.50 jb ..B1.22
|
||||
|
||||
|
||||
Resources:
|
||||
[0] - SKXDivider
|
||||
[1] - SKXFPDivider
|
||||
[2] - SKXPort0
|
||||
[3] - SKXPort1
|
||||
[4] - SKXPort2
|
||||
[5] - SKXPort3
|
||||
[6] - SKXPort4
|
||||
[7] - SKXPort5
|
||||
[8] - SKXPort6
|
||||
[9] - SKXPort7
|
||||
|
||||
|
||||
Resource pressure per iteration:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
|
||||
- 5.00 16.00 14.12 5.50 5.50 - 13.47 8.41 -
|
||||
|
||||
Resource pressure by instruction:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
|
||||
- - - - 0.50 0.50 - - - - vmovdqu (%rbx,%rdx,4), %xmm0
|
||||
- - 1.00 - - - - - - - vmovq %xmm0, %rcx
|
||||
- - - - - - - 1.00 - - vpunpckhqdq %xmm0, %xmm0, %xmm2
|
||||
- - 1.00 - - - - - - - vmovq %xmm2, %r15
|
||||
- - - - - - - - 1.00 - movl %ecx, %r8d
|
||||
- - 0.06 - - - - - 0.94 - shrq $32, %rcx
|
||||
- - - 0.02 - - - 0.98 - - leal (%rcx,%rcx,2), %r14d
|
||||
- - - 0.02 - - - 0.98 - - leal (%r8,%r8,2), %r8d
|
||||
- - 0.47 0.02 - - - - 0.51 - movslq %r8d, %rcx
|
||||
- - 0.46 0.02 - - - 0.01 0.51 - movslq %r14d, %r8
|
||||
- - 0.03 0.01 - - - 0.45 0.51 - movl %r15d, %r14d
|
||||
- - 0.51 - - - - - 0.49 - shrq $32, %r15
|
||||
- - - - 0.49 0.51 - - - - vmovups (%r11,%rcx,8), %xmm7
|
||||
- - - - 0.49 0.51 - - - - vmovups (%r11,%r8,8), %xmm6
|
||||
- - - - 0.52 0.48 - - - - vmovq 16(%r11,%rcx,8), %xmm14
|
||||
- - - 0.02 - - - 0.98 - - leal (%r14,%r14,2), %r14d
|
||||
- - 0.01 0.01 - - - 0.01 0.97 - movslq %r14d, %r14
|
||||
- - - 0.03 - - - 0.97 - - leal (%r15,%r15,2), %r15d
|
||||
- - 0.04 - - - - - 0.96 - movslq %r15d, %r15
|
||||
- - - - 0.07 0.93 - 1.00 - - vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15
|
||||
- - 0.03 0.46 0.49 0.51 - 0.51 - - vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1
|
||||
- - - - 0.51 0.49 - - - - vmovq 16(%r11,%r14,8), %xmm0
|
||||
- - 0.47 0.02 0.93 0.07 - 0.51 - - vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6
|
||||
- - - - 0.50 0.50 - 1.00 - - vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2
|
||||
- - - - - - - 1.00 - - vunpcklpd %ymm6, %ymm1, %ymm14
|
||||
- - - - - - - 1.00 - - vunpckhpd %ymm6, %ymm1, %ymm1
|
||||
- - 0.01 0.99 - - - - - - vsubpd %ymm14, %ymm10, %ymm6
|
||||
- - - - - - - 1.00 - - vinsertf128 $1, %xmm2, %ymm15, %ymm7
|
||||
- - 0.96 0.04 - - - - - - vsubpd %ymm1, %ymm9, %ymm2
|
||||
- - 0.49 0.51 - - - - - - vsubpd %ymm7, %ymm8, %ymm0
|
||||
- - 0.48 0.52 - - - - - - vmulpd %ymm2, %ymm2, %ymm14
|
||||
- - 0.03 0.97 - - - - - - vfmadd231pd %ymm6, %ymm6, %ymm14
|
||||
- - 0.94 0.06 - - - - - - vfmadd231pd %ymm0, %ymm0, %ymm14
|
||||
- - 0.47 0.53 - - - - - - vcmpltpd %ymm5, %ymm14, %ymm1
|
||||
- - 0.96 0.04 - - - - - - vpcmpeqd %ymm7, %ymm7, %ymm7
|
||||
- - 1.00 - - - - 1.00 - - vptest %ymm7, %ymm1
|
||||
- 5.00 1.00 - - - - - - - vdivpd %ymm14, %ymm4, %ymm7
|
||||
- - 0.93 0.07 0.49 0.51 - - - - vmulpd 96(%rsp), %ymm7, %ymm14
|
||||
- - 0.05 0.95 - - - - - - vmulpd %ymm14, %ymm7, %ymm14
|
||||
- - 0.02 0.98 - - - - - - vmulpd %ymm14, %ymm7, %ymm15
|
||||
- - 0.98 0.02 - - - - - - vfmsub213pd %ymm3, %ymm7, %ymm14
|
||||
- - 0.07 0.93 0.51 0.49 - - - - vmulpd 64(%rsp), %ymm7, %ymm7
|
||||
- - 0.01 0.99 - - - - - - vmulpd %ymm7, %ymm15, %ymm15
|
||||
- - 0.01 0.99 - - - - - - vmulpd %ymm14, %ymm15, %ymm7
|
||||
- - 0.03 0.97 - - - - - - vmulpd %ymm7, %ymm6, %ymm6
|
||||
- - 0.97 0.03 - - - - - - vmulpd %ymm7, %ymm2, %ymm2
|
||||
- - 0.03 0.90 - - - 0.07 - - vandpd %ymm6, %ymm1, %ymm6
|
||||
- - 0.06 0.94 - - - - - - vaddpd %ymm6, %ymm13, %ymm13
|
||||
- - 0.03 0.97 - - - - - - vmulpd %ymm7, %ymm0, %ymm6
|
||||
- - 0.46 0.08 - - - 0.46 - - vandpd %ymm2, %ymm1, %ymm0
|
||||
- - 0.47 0.01 - - - 0.52 - - vandpd %ymm6, %ymm1, %ymm1
|
||||
- - 0.48 0.52 - - - - - - vaddpd %ymm0, %ymm12, %ymm12
|
||||
- - 0.52 0.48 - - - - - - vaddpd %ymm1, %ymm11, %ymm11
|
||||
- - 0.01 - - - - - 0.99 - addq $4, %rdx
|
||||
- - - - - - - 0.02 0.98 - cmpq %rsi, %rdx
|
||||
- - 0.45 - - - - - 0.55 - jb ..B1.22
|
158
static_analysis/jan/analyses/lammps-icc-avx2-mca-icx.out
Normal file
158
static_analysis/jan/analyses/lammps-icc-avx2-mca-icx.out
Normal file
@@ -0,0 +1,158 @@
|
||||
|
||||
[0] Code Region
|
||||
|
||||
Iterations: 100
|
||||
Instructions: 5600
|
||||
Total Cycles: 2306
|
||||
Total uOps: 6300
|
||||
|
||||
Dispatch Width: 6
|
||||
uOps Per Cycle: 2.73
|
||||
IPC: 2.43
|
||||
Block RThroughput: 10.5
|
||||
|
||||
|
||||
Instruction Info:
|
||||
[1]: #uOps
|
||||
[2]: Latency
|
||||
[3]: RThroughput
|
||||
[4]: MayLoad
|
||||
[5]: MayStore
|
||||
[6]: HasSideEffects (U)
|
||||
|
||||
[1] [2] [3] [4] [5] [6] Instructions:
|
||||
1 6 0.50 * vmovdqu (%rbx,%rdx,4), %xmm0
|
||||
1 2 1.00 vmovq %xmm0, %rcx
|
||||
1 1 0.50 vpunpckhqdq %xmm0, %xmm0, %xmm2
|
||||
1 2 1.00 vmovq %xmm2, %r15
|
||||
1 1 0.25 movl %ecx, %r8d
|
||||
1 1 0.50 shrq $32, %rcx
|
||||
1 1 0.50 leal (%rcx,%rcx,2), %r14d
|
||||
1 1 0.50 leal (%r8,%r8,2), %r8d
|
||||
1 1 0.25 movslq %r8d, %rcx
|
||||
1 1 0.25 movslq %r14d, %r8
|
||||
1 1 0.25 movl %r15d, %r14d
|
||||
1 1 0.50 shrq $32, %r15
|
||||
1 6 0.50 * vmovups (%r11,%rcx,8), %xmm7
|
||||
1 6 0.50 * vmovups (%r11,%r8,8), %xmm6
|
||||
1 5 0.50 * vmovq 16(%r11,%rcx,8), %xmm14
|
||||
1 1 0.50 leal (%r14,%r14,2), %r14d
|
||||
1 1 0.25 movslq %r14d, %r14
|
||||
1 1 0.50 leal (%r15,%r15,2), %r15d
|
||||
1 1 0.25 movslq %r15d, %r15
|
||||
2 6 1.00 * vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15
|
||||
2 7 0.50 * vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1
|
||||
1 5 0.50 * vmovq 16(%r11,%r14,8), %xmm0
|
||||
2 7 0.50 * vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6
|
||||
2 6 1.00 * vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2
|
||||
1 1 1.00 vunpcklpd %ymm6, %ymm1, %ymm14
|
||||
1 1 1.00 vunpckhpd %ymm6, %ymm1, %ymm1
|
||||
1 4 0.50 vsubpd %ymm14, %ymm10, %ymm6
|
||||
1 3 1.00 vinsertf128 $1, %xmm2, %ymm15, %ymm7
|
||||
1 4 0.50 vsubpd %ymm1, %ymm9, %ymm2
|
||||
1 4 0.50 vsubpd %ymm7, %ymm8, %ymm0
|
||||
1 4 0.50 vmulpd %ymm2, %ymm2, %ymm14
|
||||
1 4 0.50 vfmadd231pd %ymm6, %ymm6, %ymm14
|
||||
1 4 0.50 vfmadd231pd %ymm0, %ymm0, %ymm14
|
||||
1 4 0.50 vcmpltpd %ymm5, %ymm14, %ymm1
|
||||
1 1 0.50 vpcmpeqd %ymm7, %ymm7, %ymm7
|
||||
2 3 1.00 vptest %ymm7, %ymm1
|
||||
1 14 5.00 vdivpd %ymm14, %ymm4, %ymm7
|
||||
2 11 0.50 * vmulpd 96(%rsp), %ymm7, %ymm14
|
||||
1 4 0.50 vmulpd %ymm14, %ymm7, %ymm14
|
||||
1 4 0.50 vmulpd %ymm14, %ymm7, %ymm15
|
||||
1 4 0.50 vfmsub213pd %ymm3, %ymm7, %ymm14
|
||||
2 11 0.50 * vmulpd 64(%rsp), %ymm7, %ymm7
|
||||
1 4 0.50 vmulpd %ymm7, %ymm15, %ymm15
|
||||
1 4 0.50 vmulpd %ymm14, %ymm15, %ymm7
|
||||
1 4 0.50 vmulpd %ymm7, %ymm6, %ymm6
|
||||
1 4 0.50 vmulpd %ymm7, %ymm2, %ymm2
|
||||
1 1 0.33 vandpd %ymm6, %ymm1, %ymm6
|
||||
1 4 0.50 vaddpd %ymm6, %ymm13, %ymm13
|
||||
1 4 0.50 vmulpd %ymm7, %ymm0, %ymm6
|
||||
1 1 0.33 vandpd %ymm2, %ymm1, %ymm0
|
||||
1 1 0.33 vandpd %ymm6, %ymm1, %ymm1
|
||||
1 4 0.50 vaddpd %ymm0, %ymm12, %ymm12
|
||||
1 4 0.50 vaddpd %ymm1, %ymm11, %ymm11
|
||||
1 1 0.25 addq $4, %rdx
|
||||
1 1 0.25 cmpq %rsi, %rdx
|
||||
1 1 0.50 jb ..B1.22
|
||||
|
||||
|
||||
Resources:
|
||||
[0] - ICXDivider
|
||||
[1] - ICXFPDivider
|
||||
[2] - ICXPort0
|
||||
[3] - ICXPort1
|
||||
[4] - ICXPort2
|
||||
[5] - ICXPort3
|
||||
[6] - ICXPort4
|
||||
[7] - ICXPort5
|
||||
[8] - ICXPort6
|
||||
[9] - ICXPort7
|
||||
[10] - ICXPort8
|
||||
[11] - ICXPort9
|
||||
|
||||
|
||||
Resource pressure per iteration:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11]
|
||||
- 5.00 15.12 15.03 5.50 5.50 - 13.45 8.40 - - -
|
||||
|
||||
Resource pressure by instruction:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions:
|
||||
- - - - 0.50 0.50 - - - - - - vmovdqu (%rbx,%rdx,4), %xmm0
|
||||
- - 1.00 - - - - - - - - - vmovq %xmm0, %rcx
|
||||
- - - 0.46 - - - 0.54 - - - - vpunpckhqdq %xmm0, %xmm0, %xmm2
|
||||
- - 1.00 - - - - - - - - - vmovq %xmm2, %r15
|
||||
- - - - - - - - 1.00 - - - movl %ecx, %r8d
|
||||
- - 0.96 - - - - - 0.04 - - - shrq $32, %rcx
|
||||
- - - 0.01 - - - 0.99 - - - - leal (%rcx,%rcx,2), %r14d
|
||||
- - - 0.03 - - - 0.97 - - - - leal (%r8,%r8,2), %r8d
|
||||
- - 0.48 0.01 - - - - 0.51 - - - movslq %r8d, %rcx
|
||||
- - 0.02 0.02 - - - 0.01 0.95 - - - movslq %r14d, %r8
|
||||
- - 0.02 - - - - - 0.98 - - - movl %r15d, %r14d
|
||||
- - 0.52 - - - - - 0.48 - - - shrq $32, %r15
|
||||
- - - - 0.49 0.51 - - - - - - vmovups (%r11,%rcx,8), %xmm7
|
||||
- - - - 0.49 0.51 - - - - - - vmovups (%r11,%r8,8), %xmm6
|
||||
- - - - 0.52 0.48 - - - - - - vmovq 16(%r11,%rcx,8), %xmm14
|
||||
- - - 0.47 - - - 0.53 - - - - leal (%r14,%r14,2), %r14d
|
||||
- - 0.01 0.01 - - - 0.01 0.97 - - - movslq %r14d, %r14
|
||||
- - - 0.04 - - - 0.96 - - - - leal (%r15,%r15,2), %r15d
|
||||
- - 0.48 - - - - 0.01 0.51 - - - movslq %r15d, %r15
|
||||
- - - - 0.51 0.49 - 1.00 - - - - vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15
|
||||
- - 0.02 0.01 0.95 0.05 - 0.97 - - - - vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1
|
||||
- - - - 0.05 0.95 - - - - - - vmovq 16(%r11,%r14,8), %xmm0
|
||||
- - 0.02 0.49 0.49 0.51 - 0.49 - - - - vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6
|
||||
- - - - 0.50 0.50 - 1.00 - - - - vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2
|
||||
- - - - - - - 1.00 - - - - vunpcklpd %ymm6, %ymm1, %ymm14
|
||||
- - - - - - - 1.00 - - - - vunpckhpd %ymm6, %ymm1, %ymm1
|
||||
- - 0.47 0.53 - - - - - - - - vsubpd %ymm14, %ymm10, %ymm6
|
||||
- - - - - - - 1.00 - - - - vinsertf128 $1, %xmm2, %ymm15, %ymm7
|
||||
- - 0.50 0.50 - - - - - - - - vsubpd %ymm1, %ymm9, %ymm2
|
||||
- - 0.94 0.06 - - - - - - - - vsubpd %ymm7, %ymm8, %ymm0
|
||||
- - 0.06 0.94 - - - - - - - - vmulpd %ymm2, %ymm2, %ymm14
|
||||
- - 0.04 0.96 - - - - - - - - vfmadd231pd %ymm6, %ymm6, %ymm14
|
||||
- - 0.95 0.05 - - - - - - - - vfmadd231pd %ymm0, %ymm0, %ymm14
|
||||
- - 0.02 0.98 - - - - - - - - vcmpltpd %ymm5, %ymm14, %ymm1
|
||||
- - 0.05 0.95 - - - - - - - - vpcmpeqd %ymm7, %ymm7, %ymm7
|
||||
- - 1.00 - - - - 1.00 - - - - vptest %ymm7, %ymm1
|
||||
- 5.00 1.00 - - - - - - - - - vdivpd %ymm14, %ymm4, %ymm7
|
||||
- - 0.51 0.49 0.49 0.51 - - - - - - vmulpd 96(%rsp), %ymm7, %ymm14
|
||||
- - 0.04 0.96 - - - - - - - - vmulpd %ymm14, %ymm7, %ymm14
|
||||
- - 0.01 0.99 - - - - - - - - vmulpd %ymm14, %ymm7, %ymm15
|
||||
- - 0.99 0.01 - - - - - - - - vfmsub213pd %ymm3, %ymm7, %ymm14
|
||||
- - 0.49 0.51 0.51 0.49 - - - - - - vmulpd 64(%rsp), %ymm7, %ymm7
|
||||
- - 0.01 0.99 - - - - - - - - vmulpd %ymm7, %ymm15, %ymm15
|
||||
- - 0.01 0.99 - - - - - - - - vmulpd %ymm14, %ymm15, %ymm7
|
||||
- - 0.48 0.52 - - - - - - - - vmulpd %ymm7, %ymm6, %ymm6
|
||||
- - 0.52 0.48 - - - - - - - - vmulpd %ymm7, %ymm2, %ymm2
|
||||
- - 0.46 0.02 - - - 0.52 - - - - vandpd %ymm6, %ymm1, %ymm6
|
||||
- - 0.49 0.51 - - - - - - - - vaddpd %ymm6, %ymm13, %ymm13
|
||||
- - 0.48 0.52 - - - - - - - - vmulpd %ymm7, %ymm0, %ymm6
|
||||
- - 0.02 0.52 - - - 0.46 - - - - vandpd %ymm2, %ymm1, %ymm0
|
||||
- - 0.02 - - - - 0.98 - - - - vandpd %ymm6, %ymm1, %ymm1
|
||||
- - 0.49 0.51 - - - - - - - - vaddpd %ymm0, %ymm12, %ymm12
|
||||
- - 0.51 0.49 - - - - - - - - vaddpd %ymm1, %ymm11, %ymm11
|
||||
- - 0.01 - - - - - 0.99 - - - addq $4, %rdx
|
||||
- - 0.01 - - - - 0.01 0.98 - - - cmpq %rsi, %rdx
|
||||
- - 0.01 - - - - - 0.99 - - - jb ..B1.22
|
97
static_analysis/jan/analyses/lammps-icc-avx2-osaca-csx.out
Normal file
97
static_analysis/jan/analyses/lammps-icc-avx2-osaca-csx.out
Normal file
@@ -0,0 +1,97 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: lammps-icc-avx2.s
|
||||
Architecture: CSX
|
||||
Timestamp: 2023-02-10 16:29:58
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||
----------------------------------------------------------------------------------------------------
|
||||
256 | | | | | | | | || | | # pointer_increment=32 724d27eafcb27eabca1528ddfdbdba3e
|
||||
257 | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
258 | | | | | | | | || | | ..B1.22: # Preds ..B1.24 ..B1.21
|
||||
259 | | | | | | | | || | | # Execution count [2.50e+01]
|
||||
260 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovdqu (%rbx,%rdx,4), %xmm0 #60.21
|
||||
261 | 1.00 | | | | | | | || 1.0 | | vmovq %xmm0, %rcx #60.21
|
||||
262 | | | | | | 1.000 | | || | | vpunpckhqdq %xmm0, %xmm0, %xmm2 #60.21
|
||||
263 | 1.00 | | | | | | | || | | vmovq %xmm2, %r15 #60.21
|
||||
264 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || 1.0 | | movl %ecx, %r8d #60.21
|
||||
265 | 0.00 | | | | | | 1.00 | || | | shrq $32, %rcx #60.21
|
||||
266 | | 0.500 | | | | 0.500 | | || | | lea (%rcx,%rcx,2), %r14d #61.36
|
||||
267 | | 0.500 | | | | 0.500 | | || 1.0 | | lea (%r8,%r8,2), %r8d #61.36
|
||||
268 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || 1.0 | | movslq %r8d, %rcx #61.36
|
||||
269 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movslq %r14d, %r8 #61.36
|
||||
270 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movl %r15d, %r14d #60.21
|
||||
271 | 0.00 | | | | | | 1.00 | || | | shrq $32, %r15 #60.21
|
||||
272 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovups (%r11,%rcx,8), %xmm7 #61.36
|
||||
273 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%r11,%r8,8), %xmm6 #61.36
|
||||
274 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovq 16(%r11,%rcx,8), %xmm14 #61.36
|
||||
275 | | 0.500 | | | | 0.500 | | || | | lea (%r14,%r14,2), %r14d #61.36
|
||||
276 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movslq %r14d, %r14 #61.36
|
||||
277 | | 0.500 | | | | 0.500 | | || | | lea (%r15,%r15,2), %r15d #61.36
|
||||
278 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movslq %r15d, %r15 #61.36
|
||||
279 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || | | vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15 #61.36
|
||||
280 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || 3.0 | | vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1 #61.36
|
||||
281 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovq 16(%r11,%r14,8), %xmm0 #61.36
|
||||
282 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || | | vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6 #61.36
|
||||
283 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || | | vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2 #61.36
|
||||
284 | | | | | | 1.000 | | || | | vunpcklpd %ymm6, %ymm1, %ymm14 #61.36
|
||||
285 | | | | | | 1.000 | | || 1.0 | | vunpckhpd %ymm6, %ymm1, %ymm1 #61.36
|
||||
286 | 0.50 | 0.500 | | | | | | || | | vsubpd %ymm14, %ymm10, %ymm6 #61.36
|
||||
287 | | | | | | 1.000 | | || | | vinsertf128 $1, %xmm2, %ymm15, %ymm7 #61.36
|
||||
288 | 0.50 | 0.500 | | | | | | || 4.0 | | vsubpd %ymm1, %ymm9, %ymm2 #62.36
|
||||
289 | 0.50 | 0.500 | | | | | | || | | vsubpd %ymm7, %ymm8, %ymm0 #63.36
|
||||
290 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm2, %ymm2, %ymm14 #64.49
|
||||
291 | 0.50 | 0.500 | | | | | | || 4.0 | | vfmadd231pd %ymm6, %ymm6, %ymm14 #64.49
|
||||
292 | 0.50 | 0.500 | | | | | | || 4.0 | | vfmadd231pd %ymm0, %ymm0, %ymm14 #64.63
|
||||
293 | | | | | | 1.000 | | || | | vcmpltpd %ymm5, %ymm14, %ymm1 #74.22
|
||||
294 | 0.50 | 0.500 | | | | | | || | | vpcmpeqd %ymm7, %ymm7, %ymm7 #74.22
|
||||
295 | 1.00 | | | | | 1.000 | | || | | vptest %ymm7, %ymm1 #74.22
|
||||
296 | | | | | | | | || | | #je ..B1.24 # Prob 50% #74.22
|
||||
297 | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ymm14
|
||||
298 | | | | | | | | || | | ..B1.23: # Preds ..B1.22
|
||||
299 | | | | | | | | || | | # Execution count [1.25e+01]
|
||||
300 | 1.00 8.00 | | | | | | | || 15.0 | | vdivpd %ymm14, %ymm4, %ymm7 #75.39
|
||||
301 | 0.50 | 0.500 | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmulpd 96(%rsp), %ymm7, %ymm14 #76.38[spill]
|
||||
302 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm14 #76.44
|
||||
303 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm15 #76.50
|
||||
304 | 0.50 | 0.500 | | | | | | || | | vfmsub213pd %ymm3, %ymm7, %ymm14 #77.55
|
||||
305 | 0.50 | 0.500 | 0.50 0.50 | 0.50 0.50 | | | | || | | vmulpd 64(%rsp), %ymm7, %ymm7 #77.55[spill]
|
||||
306 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm7, %ymm15, %ymm15 #77.64
|
||||
307 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm14, %ymm15, %ymm7 #77.70
|
||||
308 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm7, %ymm6, %ymm6 #78.31
|
||||
309 | 0.50 | 0.500 | | | | | | || | | vmulpd %ymm7, %ymm2, %ymm2 #79.31
|
||||
310 | 0.25 | 0.253 | | | | 0.493 | | || 1.0 | | vandpd %ymm6, %ymm1, %ymm6 #78.31
|
||||
311 | 0.50 | 0.500 | | | | | | || 4.0 | | vaddpd %ymm6, %ymm13, %ymm13 #78.17
|
||||
312 | 0.25 | 0.750 | | | | | | || | | vmulpd %ymm7, %ymm0, %ymm6 #80.31
|
||||
313 | 0.16 | 0.417 | | | | 0.423 | | || | | vandpd %ymm2, %ymm1, %ymm0 #79.31
|
||||
314 | 0.00 | 0.250 | | | | 0.750 | | || | | vandpd %ymm6, %ymm1, %ymm1 #80.31
|
||||
315 | 0.00 | 1.000 | | | | | | || | | vaddpd %ymm0, %ymm12, %ymm12 #79.17
|
||||
316 | 0.50 | 0.500 | | | | | | || | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17
|
||||
317 | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm3 ymm4 ymm5 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13
|
||||
318 | | | | | | | | || | | ..B1.24: # Preds ..B1.23 ..B1.22
|
||||
319 | | | | | | | | || | | # Execution count [2.50e+01]
|
||||
320 | 0.00 | 0.000 | | | | -0.01 | 1.00 | || | | addq $4, %rdx #59.9
|
||||
321 | 0.00 | -0.01 | | | | 0.000 | 1.00 | || | | cmpq %rsi, %rdx #59.9
|
||||
322 | | | | | | | | || | | * jb ..B1.22 # Prob 82% #59.9
|
||||
323 | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
13.7 8.00 13.66 5.50 5.50 5.50 5.50 13.66 10.0 76.0 4.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
316 | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17| [316]
|
||||
315 | 4.0 | vaddpd %ymm0, %ymm12, %ymm12 #79.17| [315]
|
||||
311 | 4.0 | vaddpd %ymm6, %ymm13, %ymm13 #78.17| [311]
|
||||
320 | 1.0 | addq $4, %rdx #59.9| [320]
|
||||
|
97
static_analysis/jan/analyses/lammps-icc-avx2-osaca-icx.out
Normal file
97
static_analysis/jan/analyses/lammps-icc-avx2-osaca-icx.out
Normal file
@@ -0,0 +1,97 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: lammps-icc-avx2.s
|
||||
Architecture: ICX
|
||||
Timestamp: 2023-02-10 16:29:48
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD |
|
||||
-----------------------------------------------------------------------------------------------------------------------
|
||||
256 | | | | | | | | | | || | | # pointer_increment=32 724d27eafcb27eabca1528ddfdbdba3e
|
||||
257 | | | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
258 | | | | | | | | | | || | | ..B1.22: # Preds ..B1.24 ..B1.21
|
||||
259 | | | | | | | | | | || | | # Execution count [2.50e+01]
|
||||
260 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovdqu (%rbx,%rdx,4), %xmm0 #60.21
|
||||
261 | 1.00 | | | | | | | | | || 1.0 | | vmovq %xmm0, %rcx #60.21
|
||||
262 | | 0.50 | | | | 0.50 | | | | || | | vpunpckhqdq %xmm0, %xmm0, %xmm2 #60.21
|
||||
263 | 1.00 | | | | | | | | | || | | vmovq %xmm2, %r15 #60.21
|
||||
264 | 0.37 | 0.00 | | | | 0.25 | 0.38 | | | || 1.0 | | movl %ecx, %r8d #60.21
|
||||
265 | 0.50 | | | | | | 0.50 | | | || | | shrq $32, %rcx #60.21
|
||||
266 | 0.13 | 0.00 | | | | 0.00 | 0.87 | | | || | | lea (%rcx,%rcx,2), %r14d #61.36
|
||||
267 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || 6.0 | | lea (%r8,%r8,2), %r8d #61.36
|
||||
268 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || 1.0 | | movslq %r8d, %rcx #61.36
|
||||
269 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movslq %r14d, %r8 #61.36
|
||||
270 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movl %r15d, %r14d #60.21
|
||||
271 | 0.00 | | | | | | 1.00 | | | || | | shrq $32, %r15 #60.21
|
||||
272 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovups (%r11,%rcx,8), %xmm7 #61.36
|
||||
273 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovups (%r11,%r8,8), %xmm6 #61.36
|
||||
274 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovq 16(%r11,%rcx,8), %xmm14 #61.36
|
||||
275 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | lea (%r14,%r14,2), %r14d #61.36
|
||||
276 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movslq %r14d, %r14 #61.36
|
||||
277 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | lea (%r15,%r15,2), %r15d #61.36
|
||||
278 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movslq %r15d, %r15 #61.36
|
||||
279 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || | | vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15 #61.36
|
||||
280 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || 3.0 | | vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1 #61.36
|
||||
281 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovq 16(%r11,%r14,8), %xmm0 #61.36
|
||||
282 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || | | vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6 #61.36
|
||||
283 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || | | vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2 #61.36
|
||||
284 | | | | | | 1.00 | | | | || | | vunpcklpd %ymm6, %ymm1, %ymm14 #61.36
|
||||
285 | | | | | | 1.00 | | | | || 1.0 | | vunpckhpd %ymm6, %ymm1, %ymm1 #61.36
|
||||
286 | 0.50 | 0.50 | | | | | | | | || | | vsubpd %ymm14, %ymm10, %ymm6 #61.36
|
||||
287 | | | | | | 1.00 | | | | || | | vinsertf128 $1, %xmm2, %ymm15, %ymm7 #61.36
|
||||
288 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vsubpd %ymm1, %ymm9, %ymm2 #62.36
|
||||
289 | 0.50 | 0.50 | | | | | | | | || | | vsubpd %ymm7, %ymm8, %ymm0 #63.36
|
||||
290 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm2, %ymm2, %ymm14 #64.49
|
||||
291 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vfmadd231pd %ymm6, %ymm6, %ymm14 #64.49
|
||||
292 | 0.75 | 0.25 | | | | | | | | || 4.0 | | vfmadd231pd %ymm0, %ymm0, %ymm14 #64.63
|
||||
293 | 0.00 | | | | | 1.00 | | | | || | | vcmpltpd %ymm5, %ymm14, %ymm1 #74.22
|
||||
294 | 0.50 | 0.50 | | | | | | | | || | | vpcmpeqd %ymm7, %ymm7, %ymm7 #74.22
|
||||
295 | 1.00 | | | | | 1.00 | | | | || | | vptest %ymm7, %ymm1 #74.22
|
||||
296 | | | | | | | | | | || | | #je ..B1.24 # Prob 50% #74.22
|
||||
297 | | | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ymm14
|
||||
298 | | | | | | | | | | || | | ..B1.23: # Preds ..B1.22
|
||||
299 | | | | | | | | | | || | | # Execution count [1.25e+01]
|
||||
300 | 1.00 8.00 | | | | | | | | | || 13.0 | | vdivpd %ymm14, %ymm4, %ymm7 #75.39
|
||||
301 | 0.50 | 0.50 | 0.50 0.50 | 0.50 0.50 | | | | | | || 4.0 | | vmulpd 96(%rsp), %ymm7, %ymm14 #76.38[spill]
|
||||
302 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm14 #76.44
|
||||
303 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm15 #76.50
|
||||
304 | 0.50 | 0.50 | | | | | | | | || | | vfmsub213pd %ymm3, %ymm7, %ymm14 #77.55
|
||||
305 | 0.50 | 0.50 | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmulpd 64(%rsp), %ymm7, %ymm7 #77.55[spill]
|
||||
306 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm7, %ymm15, %ymm15 #77.64
|
||||
307 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm14, %ymm15, %ymm7 #77.70
|
||||
308 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm7, %ymm6, %ymm6 #78.31
|
||||
309 | 0.00 | 1.00 | | | | | | | | || | | vmulpd %ymm7, %ymm2, %ymm2 #79.31
|
||||
310 | 0.00 | 0.00 | | | | 1.00 | | | | || 1.0 | | vandpd %ymm6, %ymm1, %ymm6 #78.31
|
||||
311 | 0.00 | 1.00 | | | | | | | | || 4.0 | | vaddpd %ymm6, %ymm13, %ymm13 #78.17
|
||||
312 | 0.00 | 1.00 | | | | | | | | || | | vmulpd %ymm7, %ymm0, %ymm6 #80.31
|
||||
313 | 0.00 | 0.00 | | | | 1.00 | | | | || | | vandpd %ymm2, %ymm1, %ymm0 #79.31
|
||||
314 | 0.00 | 0.00 | | | | 1.00 | | | | || | | vandpd %ymm6, %ymm1, %ymm1 #80.31
|
||||
315 | 0.00 | 1.00 | | | | | | | | || | | vaddpd %ymm0, %ymm12, %ymm12 #79.17
|
||||
316 | 0.00 | 1.00 | | | | | | | | || | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17
|
||||
317 | | | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm3 ymm4 ymm5 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13
|
||||
318 | | | | | | | | | | || | | ..B1.24: # Preds ..B1.23 ..B1.22
|
||||
319 | | | | | | | | | | || | | # Execution count [2.50e+01]
|
||||
320 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | addq $4, %rdx #59.9
|
||||
321 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | cmpq %rsi, %rdx #59.9
|
||||
322 | | | | | | | | | | || | | * jb ..B1.22 # Prob 82% #59.9
|
||||
323 | | | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
12.8 8.00 12.8 5.50 5.50 5.50 5.50 12.8 12.8 81 4.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
316 | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17| [316]
|
||||
315 | 4.0 | vaddpd %ymm0, %ymm12, %ymm12 #79.17| [315]
|
||||
311 | 4.0 | vaddpd %ymm6, %ymm13, %ymm13 #78.17| [311]
|
||||
320 | 1.0 | addq $4, %rdx #59.9| [320]
|
||||
|
75
static_analysis/jan/analyses/lammps-icc-avx512-iaca.out
Normal file
75
static_analysis/jan/analyses/lammps-icc-avx512-iaca.out
Normal file
@@ -0,0 +1,75 @@
|
||||
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||
Analyzed File - lammps-icc-avx512.o
|
||||
Binary Format - 64Bit
|
||||
Architecture - SKX
|
||||
Analysis Type - Throughput
|
||||
|
||||
Throughput Analysis Report
|
||||
--------------------------
|
||||
Block Throughput: 30.89 Cycles Throughput Bottleneck: Backend
|
||||
Loop Count: 22
|
||||
Port Binding In Cycles Per Iteration:
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
| Cycles | 19.0 0.0 | 4.0 | 13.0 13.0 | 13.0 13.0 | 0.0 | 17.0 | 4.0 | 0.0 |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
|
||||
DV - Divider pipe (on port 0)
|
||||
D - Data fetch pipe (on ports 2 and 3)
|
||||
F - Macro Fusion with the previous instruction occurred
|
||||
* - instruction micro-ops not bound to a port
|
||||
^ - Micro Fusion occurred
|
||||
# - ESP Tracking sync uop was issued
|
||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||
X - instruction not supported, was not accounted in Analysis
|
||||
|
||||
| Num Of | Ports pressure in cycles | |
|
||||
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||
-----------------------------------------------------------------------------------------
|
||||
| 1 | | | | | | 1.0 | | | vpcmpgtd k5, ymm3, ymm4
|
||||
| 1 | | 1.0 | | | | | | | vpaddd ymm4, ymm4, ymm15
|
||||
| 2 | | 1.0 | 1.0 1.0 | | | | | | vmovdqu32 ymm17{k5}{z}, ymmword ptr [r10+r15*4]
|
||||
| 1 | | 1.0 | | | | | | | vpaddd ymm18, ymm17, ymm17
|
||||
| 1 | | | | | | | 1.0 | | add r15, 0x8
|
||||
| 1 | | 1.0 | | | | | | | vpaddd ymm19, ymm17, ymm18
|
||||
| 1 | 1.0 | | | | | | | | kmovw k2, k5
|
||||
| 1 | 1.0 | | | | | | | | kmovw k3, k5
|
||||
| 1 | 1.0 | | | | | | | | kmovw k1, k5
|
||||
| 1* | | | | | | | | | vpxord zmm21, zmm21, zmm21
|
||||
| 1* | | | | | | | | | vpxord zmm20, zmm20, zmm20
|
||||
| 1* | | | | | | | | | vpxord zmm22, zmm22, zmm22
|
||||
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm21, k2, zmmword ptr [rbx+ymm19*8+0x8]
|
||||
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm20, k3, zmmword ptr [rbx+ymm19*8]
|
||||
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm22, k1, zmmword ptr [rbx+ymm19*8+0x10]
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm18, zmm1, zmm21
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm17, zmm2, zmm20
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm19, zmm0, zmm22
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm31, zmm18, zmm18
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm31, zmm17, zmm17
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm31, zmm19, zmm19
|
||||
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm30, zmm31
|
||||
| 1 | | | | | | 1.0 | | | vcmppd k6{k5}, zmm31, zmm14, 0x1
|
||||
| 1 | | | | | | 1.0 | | | vfpclasspd k0, zmm30, 0x1e
|
||||
| 1* | | | | | | | | | vmovaps zmm23, zmm31
|
||||
| 2^ | 1.0 | | | 1.0 1.0 | | | | | vfnmadd213pd zmm23, zmm30, qword ptr [rip]{1to8}
|
||||
| 1 | 1.0 | | | | | | | | knotw k4, k0
|
||||
| 1 | | | | | | 1.0 | | | vmulpd zmm24, zmm23, zmm23
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm30{k4}, zmm23, zmm30
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm30{k4}, zmm24, zmm30
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm25, zmm30, zmm13
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm27, zmm30, zmm12
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm28, zmm30, zmm25
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm26, zmm30, zmm28
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213pd zmm30, zmm28, zmm5
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm29, zmm26, zmm27
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm23, zmm29, zmm30
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm10{k6}, zmm23, zmm17
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm9{k6}, zmm23, zmm18
|
||||
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm8{k6}, zmm23, zmm19
|
||||
| 1* | | | | | | | | | cmp r15, r14
|
||||
| 0*F | | | | | | | | | jb 0xffffffffffffff0c
|
||||
Total Num Of Uops: 57
|
||||
Analysis Notes:
|
||||
Backend allocation was stalled due to unavailable allocation resources.
|
||||
There were bubbles in the frontend.
|
128
static_analysis/jan/analyses/lammps-icc-avx512-mca-csx.out
Normal file
128
static_analysis/jan/analyses/lammps-icc-avx512-mca-csx.out
Normal file
@@ -0,0 +1,128 @@
|
||||
|
||||
[0] Code Region
|
||||
|
||||
Iterations: 100
|
||||
Instructions: 4200
|
||||
Total Cycles: 2465
|
||||
Total uOps: 5800
|
||||
|
||||
Dispatch Width: 6
|
||||
uOps Per Cycle: 2.35
|
||||
IPC: 1.70
|
||||
Block RThroughput: 13.0
|
||||
|
||||
|
||||
Instruction Info:
|
||||
[1]: #uOps
|
||||
[2]: Latency
|
||||
[3]: RThroughput
|
||||
[4]: MayLoad
|
||||
[5]: MayStore
|
||||
[6]: HasSideEffects (U)
|
||||
|
||||
[1] [2] [3] [4] [5] [6] Instructions:
|
||||
1 4 1.00 vpcmpgtd %ymm4, %ymm3, %k5
|
||||
1 1 0.33 vpaddd %ymm15, %ymm4, %ymm4
|
||||
2 8 0.50 * vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z}
|
||||
1 1 0.33 vpaddd %ymm17, %ymm17, %ymm18
|
||||
1 1 0.25 addq $8, %r15
|
||||
1 1 0.33 vpaddd %ymm18, %ymm17, %ymm19
|
||||
1 1 1.00 kmovw %k5, %k2
|
||||
1 1 1.00 kmovw %k5, %k3
|
||||
1 1 1.00 kmovw %k5, %k1
|
||||
1 0 0.17 vpxord %zmm21, %zmm21, %zmm21
|
||||
1 0 0.17 vpxord %zmm20, %zmm20, %zmm20
|
||||
1 0 0.17 vpxord %zmm22, %zmm22, %zmm22
|
||||
5 21 4.00 * vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2}
|
||||
5 21 4.00 * vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3}
|
||||
5 21 4.00 * vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1}
|
||||
1 4 0.50 vsubpd %zmm21, %zmm1, %zmm18
|
||||
1 4 0.50 vsubpd %zmm20, %zmm2, %zmm17
|
||||
1 4 0.50 vsubpd %zmm22, %zmm0, %zmm19
|
||||
1 4 0.50 vmulpd %zmm18, %zmm18, %zmm31
|
||||
1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm31
|
||||
1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm31
|
||||
3 4 2.00 vrcp14pd %zmm31, %zmm30
|
||||
1 4 1.00 vcmpltpd %zmm14, %zmm31, %k6 {%k5}
|
||||
1 4 1.00 vfpclasspd $30, %zmm30, %k0
|
||||
1 1 0.50 vmovaps %zmm31, %zmm23
|
||||
2 11 0.50 * vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
|
||||
1 1 1.00 knotw %k0, %k4
|
||||
1 4 0.50 vmulpd %zmm23, %zmm23, %zmm24
|
||||
1 4 0.50 vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4}
|
||||
1 4 0.50 vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4}
|
||||
1 4 0.50 vmulpd %zmm13, %zmm30, %zmm25
|
||||
1 4 0.50 vmulpd %zmm12, %zmm30, %zmm27
|
||||
1 4 0.50 vmulpd %zmm25, %zmm30, %zmm28
|
||||
1 4 0.50 vmulpd %zmm28, %zmm30, %zmm26
|
||||
1 4 0.50 vfmsub213pd %zmm5, %zmm28, %zmm30
|
||||
1 4 0.50 vmulpd %zmm27, %zmm26, %zmm29
|
||||
1 4 0.50 vmulpd %zmm30, %zmm29, %zmm23
|
||||
1 4 0.50 vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6}
|
||||
1 4 0.50 vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6}
|
||||
1 4 0.50 vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6}
|
||||
1 1 0.25 cmpq %r14, %r15
|
||||
1 1 0.50 jb ..B1.16
|
||||
|
||||
|
||||
Resources:
|
||||
[0] - SKXDivider
|
||||
[1] - SKXFPDivider
|
||||
[2] - SKXPort0
|
||||
[3] - SKXPort1
|
||||
[4] - SKXPort2
|
||||
[5] - SKXPort3
|
||||
[6] - SKXPort4
|
||||
[7] - SKXPort5
|
||||
[8] - SKXPort6
|
||||
[9] - SKXPort7
|
||||
|
||||
|
||||
Resource pressure per iteration:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
|
||||
- - 19.02 6.79 12.64 13.36 - 16.03 5.16 -
|
||||
|
||||
Resource pressure by instruction:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
|
||||
- - - - - - - 1.00 - - vpcmpgtd %ymm4, %ymm3, %k5
|
||||
- - 0.28 0.72 - - - - - - vpaddd %ymm15, %ymm4, %ymm4
|
||||
- - 0.14 0.71 0.55 0.45 - 0.15 - - vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z}
|
||||
- - - 0.97 - - - 0.03 - - vpaddd %ymm17, %ymm17, %ymm18
|
||||
- - 0.14 0.41 - - - 0.13 0.32 - addq $8, %r15
|
||||
- - - 0.99 - - - 0.01 - - vpaddd %ymm18, %ymm17, %ymm19
|
||||
- - 1.00 - - - - - - - kmovw %k5, %k2
|
||||
- - 1.00 - - - - - - - kmovw %k5, %k3
|
||||
- - 1.00 - - - - - - - kmovw %k5, %k1
|
||||
- - - - - - - - - - vpxord %zmm21, %zmm21, %zmm21
|
||||
- - - - - - - - - - vpxord %zmm20, %zmm20, %zmm20
|
||||
- - - - - - - - - - vpxord %zmm22, %zmm22, %zmm22
|
||||
- - 1.00 0.99 3.52 4.48 - 0.01 1.00 - vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2}
|
||||
- - 1.00 0.99 4.48 3.52 - 0.01 1.00 - vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3}
|
||||
- - 1.00 1.00 3.52 4.48 - - 1.00 - vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1}
|
||||
- - 0.02 - - - - 0.98 - - vsubpd %zmm21, %zmm1, %zmm18
|
||||
- - 0.17 - - - - 0.83 - - vsubpd %zmm20, %zmm2, %zmm17
|
||||
- - 0.18 - - - - 0.82 - - vsubpd %zmm22, %zmm0, %zmm19
|
||||
- - 0.01 - - - - 0.99 - - vmulpd %zmm18, %zmm18, %zmm31
|
||||
- - 0.69 - - - - 0.31 - - vfmadd231pd %zmm17, %zmm17, %zmm31
|
||||
- - 0.68 - - - - 0.32 - - vfmadd231pd %zmm19, %zmm19, %zmm31
|
||||
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm31, %zmm30
|
||||
- - - - - - - 1.00 - - vcmpltpd %zmm14, %zmm31, %k6 {%k5}
|
||||
- - - - - - - 1.00 - - vfpclasspd $30, %zmm30, %k0
|
||||
- - 0.83 - - - - 0.17 - - vmovaps %zmm31, %zmm23
|
||||
- - 1.00 - 0.57 0.43 - - - - vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
|
||||
- - 1.00 - - - - - - - knotw %k0, %k4
|
||||
- - 0.44 - - - - 0.56 - - vmulpd %zmm23, %zmm23, %zmm24
|
||||
- - 0.56 - - - - 0.44 - - vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4}
|
||||
- - 0.55 - - - - 0.45 - - vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4}
|
||||
- - 0.69 - - - - 0.31 - - vmulpd %zmm13, %zmm30, %zmm25
|
||||
- - 0.31 - - - - 0.69 - - vmulpd %zmm12, %zmm30, %zmm27
|
||||
- - 0.56 - - - - 0.44 - - vmulpd %zmm25, %zmm30, %zmm28
|
||||
- - 0.02 - - - - 0.98 - - vmulpd %zmm28, %zmm30, %zmm26
|
||||
- - 0.98 - - - - 0.02 - - vfmsub213pd %zmm5, %zmm28, %zmm30
|
||||
- - 0.30 - - - - 0.70 - - vmulpd %zmm27, %zmm26, %zmm29
|
||||
- - 0.16 - - - - 0.84 - - vmulpd %zmm30, %zmm29, %zmm23
|
||||
- - 0.17 - - - - 0.83 - - vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6}
|
||||
- - 0.83 - - - - 0.17 - - vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6}
|
||||
- - 0.17 - - - - 0.83 - - vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6}
|
||||
- - - 0.01 - - - 0.01 0.98 - cmpq %r14, %r15
|
||||
- - 0.14 - - - - - 0.86 - jb ..B1.16
|
130
static_analysis/jan/analyses/lammps-icc-avx512-mca-icx.out
Normal file
130
static_analysis/jan/analyses/lammps-icc-avx512-mca-icx.out
Normal file
@@ -0,0 +1,130 @@
|
||||
|
||||
[0] Code Region
|
||||
|
||||
Iterations: 100
|
||||
Instructions: 4200
|
||||
Total Cycles: 2465
|
||||
Total uOps: 5800
|
||||
|
||||
Dispatch Width: 6
|
||||
uOps Per Cycle: 2.35
|
||||
IPC: 1.70
|
||||
Block RThroughput: 13.0
|
||||
|
||||
|
||||
Instruction Info:
|
||||
[1]: #uOps
|
||||
[2]: Latency
|
||||
[3]: RThroughput
|
||||
[4]: MayLoad
|
||||
[5]: MayStore
|
||||
[6]: HasSideEffects (U)
|
||||
|
||||
[1] [2] [3] [4] [5] [6] Instructions:
|
||||
1 4 1.00 vpcmpgtd %ymm4, %ymm3, %k5
|
||||
1 1 0.33 vpaddd %ymm15, %ymm4, %ymm4
|
||||
2 8 0.50 * vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z}
|
||||
1 1 0.33 vpaddd %ymm17, %ymm17, %ymm18
|
||||
1 1 0.25 addq $8, %r15
|
||||
1 1 0.33 vpaddd %ymm18, %ymm17, %ymm19
|
||||
1 1 1.00 kmovw %k5, %k2
|
||||
1 1 1.00 kmovw %k5, %k3
|
||||
1 1 1.00 kmovw %k5, %k1
|
||||
1 0 0.17 vpxord %zmm21, %zmm21, %zmm21
|
||||
1 0 0.17 vpxord %zmm20, %zmm20, %zmm20
|
||||
1 0 0.17 vpxord %zmm22, %zmm22, %zmm22
|
||||
5 21 4.00 * vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2}
|
||||
5 21 4.00 * vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3}
|
||||
5 21 4.00 * vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1}
|
||||
1 4 0.50 vsubpd %zmm21, %zmm1, %zmm18
|
||||
1 4 0.50 vsubpd %zmm20, %zmm2, %zmm17
|
||||
1 4 0.50 vsubpd %zmm22, %zmm0, %zmm19
|
||||
1 4 0.50 vmulpd %zmm18, %zmm18, %zmm31
|
||||
1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm31
|
||||
1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm31
|
||||
3 4 2.00 vrcp14pd %zmm31, %zmm30
|
||||
1 4 1.00 vcmpltpd %zmm14, %zmm31, %k6 {%k5}
|
||||
1 4 1.00 vfpclasspd $30, %zmm30, %k0
|
||||
1 1 0.50 vmovaps %zmm31, %zmm23
|
||||
2 11 0.50 * vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
|
||||
1 1 1.00 knotw %k0, %k4
|
||||
1 4 0.50 vmulpd %zmm23, %zmm23, %zmm24
|
||||
1 4 0.50 vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4}
|
||||
1 4 0.50 vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4}
|
||||
1 4 0.50 vmulpd %zmm13, %zmm30, %zmm25
|
||||
1 4 0.50 vmulpd %zmm12, %zmm30, %zmm27
|
||||
1 4 0.50 vmulpd %zmm25, %zmm30, %zmm28
|
||||
1 4 0.50 vmulpd %zmm28, %zmm30, %zmm26
|
||||
1 4 0.50 vfmsub213pd %zmm5, %zmm28, %zmm30
|
||||
1 4 0.50 vmulpd %zmm27, %zmm26, %zmm29
|
||||
1 4 0.50 vmulpd %zmm30, %zmm29, %zmm23
|
||||
1 4 0.50 vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6}
|
||||
1 4 0.50 vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6}
|
||||
1 4 0.50 vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6}
|
||||
1 1 0.25 cmpq %r14, %r15
|
||||
1 1 0.50 jb ..B1.16
|
||||
|
||||
|
||||
Resources:
|
||||
[0] - ICXDivider
|
||||
[1] - ICXFPDivider
|
||||
[2] - ICXPort0
|
||||
[3] - ICXPort1
|
||||
[4] - ICXPort2
|
||||
[5] - ICXPort3
|
||||
[6] - ICXPort4
|
||||
[7] - ICXPort5
|
||||
[8] - ICXPort6
|
||||
[9] - ICXPort7
|
||||
[10] - ICXPort8
|
||||
[11] - ICXPort9
|
||||
|
||||
|
||||
Resource pressure per iteration:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11]
|
||||
- - 19.02 6.79 12.64 13.36 - 16.03 5.16 - - -
|
||||
|
||||
Resource pressure by instruction:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions:
|
||||
- - - - - - - 1.00 - - - - vpcmpgtd %ymm4, %ymm3, %k5
|
||||
- - 0.28 0.72 - - - - - - - - vpaddd %ymm15, %ymm4, %ymm4
|
||||
- - 0.14 0.71 0.55 0.45 - 0.15 - - - - vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z}
|
||||
- - - 0.97 - - - 0.03 - - - - vpaddd %ymm17, %ymm17, %ymm18
|
||||
- - 0.14 0.41 - - - 0.13 0.32 - - - addq $8, %r15
|
||||
- - - 0.99 - - - 0.01 - - - - vpaddd %ymm18, %ymm17, %ymm19
|
||||
- - 1.00 - - - - - - - - - kmovw %k5, %k2
|
||||
- - 1.00 - - - - - - - - - kmovw %k5, %k3
|
||||
- - 1.00 - - - - - - - - - kmovw %k5, %k1
|
||||
- - - - - - - - - - - - vpxord %zmm21, %zmm21, %zmm21
|
||||
- - - - - - - - - - - - vpxord %zmm20, %zmm20, %zmm20
|
||||
- - - - - - - - - - - - vpxord %zmm22, %zmm22, %zmm22
|
||||
- - 1.00 0.99 3.52 4.48 - 0.01 1.00 - - - vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2}
|
||||
- - 1.00 0.99 4.48 3.52 - 0.01 1.00 - - - vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3}
|
||||
- - 1.00 1.00 3.52 4.48 - - 1.00 - - - vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1}
|
||||
- - 0.02 - - - - 0.98 - - - - vsubpd %zmm21, %zmm1, %zmm18
|
||||
- - 0.17 - - - - 0.83 - - - - vsubpd %zmm20, %zmm2, %zmm17
|
||||
- - 0.18 - - - - 0.82 - - - - vsubpd %zmm22, %zmm0, %zmm19
|
||||
- - 0.01 - - - - 0.99 - - - - vmulpd %zmm18, %zmm18, %zmm31
|
||||
- - 0.69 - - - - 0.31 - - - - vfmadd231pd %zmm17, %zmm17, %zmm31
|
||||
- - 0.68 - - - - 0.32 - - - - vfmadd231pd %zmm19, %zmm19, %zmm31
|
||||
- - 2.00 - - - - 1.00 - - - - vrcp14pd %zmm31, %zmm30
|
||||
- - - - - - - 1.00 - - - - vcmpltpd %zmm14, %zmm31, %k6 {%k5}
|
||||
- - - - - - - 1.00 - - - - vfpclasspd $30, %zmm30, %k0
|
||||
- - 0.83 - - - - 0.17 - - - - vmovaps %zmm31, %zmm23
|
||||
- - 1.00 - 0.57 0.43 - - - - - - vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
|
||||
- - 1.00 - - - - - - - - - knotw %k0, %k4
|
||||
- - 0.44 - - - - 0.56 - - - - vmulpd %zmm23, %zmm23, %zmm24
|
||||
- - 0.56 - - - - 0.44 - - - - vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4}
|
||||
- - 0.55 - - - - 0.45 - - - - vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4}
|
||||
- - 0.69 - - - - 0.31 - - - - vmulpd %zmm13, %zmm30, %zmm25
|
||||
- - 0.31 - - - - 0.69 - - - - vmulpd %zmm12, %zmm30, %zmm27
|
||||
- - 0.56 - - - - 0.44 - - - - vmulpd %zmm25, %zmm30, %zmm28
|
||||
- - 0.02 - - - - 0.98 - - - - vmulpd %zmm28, %zmm30, %zmm26
|
||||
- - 0.98 - - - - 0.02 - - - - vfmsub213pd %zmm5, %zmm28, %zmm30
|
||||
- - 0.30 - - - - 0.70 - - - - vmulpd %zmm27, %zmm26, %zmm29
|
||||
- - 0.16 - - - - 0.84 - - - - vmulpd %zmm30, %zmm29, %zmm23
|
||||
- - 0.17 - - - - 0.83 - - - - vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6}
|
||||
- - 0.83 - - - - 0.17 - - - - vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6}
|
||||
- - 0.17 - - - - 0.83 - - - - vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6}
|
||||
- - - 0.01 - - - 0.01 0.98 - - - cmpq %r14, %r15
|
||||
- - 0.14 - - - - - 0.86 - - - jb ..B1.16
|
77
static_analysis/jan/analyses/lammps-icc-avx512-osaca-csx.out
Normal file
77
static_analysis/jan/analyses/lammps-icc-avx512-osaca-csx.out
Normal file
@@ -0,0 +1,77 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: lammps-icc-avx512.s
|
||||
Architecture: CSX
|
||||
Timestamp: 2023-02-10 16:30:08
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||
--------------------------------------------------------------------------------------------------
|
||||
200 | | | | | | | | || | | # pointer_increment=64 1303ca335e79351a96cfc07b8b9ec9d4
|
||||
201 | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
202 | | | | | | | | || | | ..B1.16: # Preds ..B1.16 ..B1.15
|
||||
203 | | | | | | | | || | | # Execution count [2.50e+01]
|
||||
204 | | | | | | 1.00 | | || | | vpcmpgtd %ymm4, %ymm3, %k5 #59.9
|
||||
205 | 0.00 | 1.00 | | | | 0.00 | | || | | vpaddd %ymm15, %ymm4, %ymm4 #59.9
|
||||
206 | 0.00 | 1.00 | 0.50 0.50 | 0.50 0.50 | | 0.00 | | || 0.0 | | vmovdqu32 (%r10,%r15,4), %ymm17{%k5}{z} #60.21
|
||||
207 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm17, %ymm17, %ymm18 #61.36
|
||||
208 | 0.00 | 0.16 | | | | 0.00 | 0.84 | || | | addq $8, %r15 #59.9
|
||||
209 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm18, %ymm17, %ymm19 #61.36
|
||||
210 | 1.00 | | | | | | | || | | kmovw %k5, %k2 #61.36
|
||||
211 | 1.00 | | | | | | | || | | kmovw %k5, %k3 #61.36
|
||||
212 | 1.00 | | | | | | | || | | kmovw %k5, %k1 #61.36
|
||||
213 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm21, %zmm21, %zmm21 #61.36
|
||||
214 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm20, %zmm20, %zmm20 #61.36
|
||||
215 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm22, %zmm22, %zmm22 #61.36
|
||||
216 | 1.25 | 0.75 | 5.00 5.00 | 5.00 5.00 | | 0.25 | 0.75 | || 24.0 | | vgatherdpd 8(%rbx,%ymm19,8), %zmm21{%k2} #61.36
|
||||
217 | 1.25 | 0.25 | 5.00 5.00 | 5.00 5.00 | | 0.25 | 1.25 | || | | vgatherdpd (%rbx,%ymm19,8), %zmm20{%k3} #61.36
|
||||
218 | 1.25 | 0.09 | 5.00 5.00 | 5.00 5.00 | | 0.25 | 1.41 | || | | vgatherdpd 16(%rbx,%ymm19,8), %zmm22{%k1} #61.36
|
||||
219 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm21, %zmm1, %zmm18 #62.36
|
||||
220 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm20, %zmm2, %zmm17 #61.36
|
||||
221 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm22, %zmm0, %zmm19 #63.36
|
||||
222 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm18, %zmm18, %zmm31 #64.49
|
||||
223 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm17, %zmm17, %zmm31 #64.49
|
||||
224 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm19, %zmm19, %zmm31 #64.63
|
||||
225 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm31, %zmm30 #75.39
|
||||
226 | | | | | | 1.00 | | || | | vcmppd $1, %zmm14, %zmm31, %k6{%k5} #74.22
|
||||
227 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm30, %k0 #75.39
|
||||
228 | | | | | | | | || | | * vmovaps %zmm31, %zmm23 #75.39
|
||||
229 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23 #75.39
|
||||
230 | 1.00 | | | | | | | || | | knotw %k0, %k4 #75.39
|
||||
231 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm23, %zmm23, %zmm24 #75.39
|
||||
232 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm30, %zmm23, %zmm30{%k4} #75.39
|
||||
233 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd213pd %zmm30, %zmm24, %zmm30{%k4} #75.39
|
||||
234 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm13, %zmm30, %zmm25 #76.38
|
||||
235 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm12, %zmm30, %zmm27 #77.55
|
||||
236 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm25, %zmm30, %zmm28 #76.44
|
||||
237 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm28, %zmm30, %zmm26 #76.50
|
||||
238 | 0.00 | | | | | 1.00 | | || | | vfmsub213pd %zmm5, %zmm28, %zmm30 #77.55
|
||||
239 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm27, %zmm26, %zmm29 #77.64
|
||||
240 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm30, %zmm29, %zmm23 #77.70
|
||||
241 | 0.00 | | | | | 1.00 | | || 4.0 | | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17
|
||||
242 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17
|
||||
243 | 0.00 | | | | | 1.00 | | || | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17
|
||||
244 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || | | cmpq %r14, %r15 #59.9
|
||||
245 | | | | | | | | || | | * jb ..B1.16 # Prob 82% #59.9
|
||||
246 | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
18.8 5.25 16.0 16.0 16.0 16.0 18.8 5.25 86.0 4.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
243 | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17| [243]
|
||||
242 | 4.0 | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17| [242]
|
||||
241 | 4.0 | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17| [241]
|
||||
208 | 1.0 | addq $8, %r15 #59.9| [208]
|
||||
205 | 1.0 | vpaddd %ymm15, %ymm4, %ymm4 #59.9| [205]
|
||||
|
77
static_analysis/jan/analyses/lammps-icc-avx512-osaca-icx.out
Normal file
77
static_analysis/jan/analyses/lammps-icc-avx512-osaca-icx.out
Normal file
@@ -0,0 +1,77 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: lammps-icc-avx512.s
|
||||
Architecture: ICX
|
||||
Timestamp: 2023-02-10 16:29:42
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD |
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
200 | | | | | | | | | | || | | # pointer_increment=64 1303ca335e79351a96cfc07b8b9ec9d4
|
||||
201 | | | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
202 | | | | | | | | | | || | | ..B1.16: # Preds ..B1.16 ..B1.15
|
||||
203 | | | | | | | | | | || | | # Execution count [2.50e+01]
|
||||
204 | | | | | | 1.000 | | | | || | | vpcmpgtd %ymm4, %ymm3, %k5 #59.9
|
||||
205 | 0.00 | 1.00 | | | | 0.000 | | | | || | | vpaddd %ymm15, %ymm4, %ymm4 #59.9
|
||||
206 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovdqu32 (%r10,%r15,4), %ymm17{%k5}{z} #60.21
|
||||
207 | 0.00 | 1.00 | | | | 0.000 | | | | || 1.0 | | vpaddd %ymm17, %ymm17, %ymm18 #61.36
|
||||
208 | 0.00 | 0.00 | | | | 0.000 | 1.00 | | | || | | addq $8, %r15 #59.9
|
||||
209 | 0.00 | 1.00 | | | | 0.000 | | | | || 1.0 | | vpaddd %ymm18, %ymm17, %ymm19 #61.36
|
||||
210 | 1.00 | | | | | | | | | || | | kmovw %k5, %k2 #61.36
|
||||
211 | 1.00 | | | | | | | | | || | | kmovw %k5, %k3 #61.36
|
||||
212 | 1.00 | | | | | | | | | || | | kmovw %k5, %k1 #61.36
|
||||
213 | 0.50 | | | | | 0.500 | | | | || | | vpxord %zmm21, %zmm21, %zmm21 #61.36
|
||||
214 | 0.24 | | | | | 0.760 | | | | || | | vpxord %zmm20, %zmm20, %zmm20 #61.36
|
||||
215 | 0.50 | | | | | 0.500 | | | | || | | vpxord %zmm22, %zmm22, %zmm22 #61.36
|
||||
216 | 0.67 | 2.33 | 7.00 7.00 | 7.00 7.00 | | 0.000 | | | | || 24.0 | | vgatherdpd 8(%rbx,%ymm19,8), %zmm21{%k2} #61.36
|
||||
217 | 0.67 | 2.33 | 7.00 7.00 | 7.00 7.00 | | 0.000 | | | | || | | vgatherdpd (%rbx,%ymm19,8), %zmm20{%k3} #61.36
|
||||
218 | 0.67 | 2.33 | 7.00 7.00 | 7.00 7.00 | | 0.000 | | | | || | | vgatherdpd 16(%rbx,%ymm19,8), %zmm22{%k1} #61.36
|
||||
219 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vsubpd %zmm21, %zmm1, %zmm18 #62.36
|
||||
220 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm20, %zmm2, %zmm17 #61.36
|
||||
221 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm22, %zmm0, %zmm19 #63.36
|
||||
222 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm18, %zmm18, %zmm31 #64.49
|
||||
223 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm17, %zmm17, %zmm31 #64.49
|
||||
224 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm19, %zmm19, %zmm31 #64.63
|
||||
225 | 2.50 | | | | | 0.500 | | | | || 6.0 | | vrcp14pd %zmm31, %zmm30 #75.39
|
||||
226 | | | | | | 1.000 | | | | || | | vcmppd $1, %zmm14, %zmm31, %k6{%k5} #74.22
|
||||
227 | | | | | | 1.000 | | | | || | | vfpclasspd $30, %zmm30, %k0 #75.39
|
||||
228 | 0.50 | | | | | 0.500 | | | | || | | vmovaps %zmm31, %zmm23 #75.39
|
||||
229 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.500 | | | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23 #75.39
|
||||
230 | 1.00 | | | | | | | | | || | | knotw %k0, %k4 #75.39
|
||||
231 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm23, %zmm23, %zmm24 #75.39
|
||||
232 | 0.50 | | | | | 0.500 | | | | || | | vfmadd213pd %zmm30, %zmm23, %zmm30{%k4} #75.39
|
||||
233 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd213pd %zmm30, %zmm24, %zmm30{%k4} #75.39
|
||||
234 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm13, %zmm30, %zmm25 #76.38
|
||||
235 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm12, %zmm30, %zmm27 #77.55
|
||||
236 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm25, %zmm30, %zmm28 #76.44
|
||||
237 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm28, %zmm30, %zmm26 #76.50
|
||||
238 | 0.50 | | | | | 0.500 | | | | || | | vfmsub213pd %zmm5, %zmm28, %zmm30 #77.55
|
||||
239 | 0.25 | | | | | 0.750 | | | | || 4.0 | | vmulpd %zmm27, %zmm26, %zmm29 #77.64
|
||||
240 | 0.00 | | | | | 1.000 | | | | || 4.0 | | vmulpd %zmm30, %zmm29, %zmm23 #77.70
|
||||
241 | 0.00 | | | | | 1.000 | | | | || 4.0 | | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17
|
||||
242 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17
|
||||
243 | 0.00 | | | | | 1.000 | | | | || | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17
|
||||
244 | 0.00 | 0.00 | | | | -0.01 | 1.00 | | | || | | cmpq %r14, %r15 #59.9
|
||||
245 | | | | | | | | | | || | | * jb ..B1.16 # Prob 82% #59.9
|
||||
246 | | | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
18.0 9.98 22.0 22.0 22.0 22.0 18.00 2.00 89 4.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
243 | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17| [243]
|
||||
242 | 4.0 | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17| [242]
|
||||
241 | 4.0 | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17| [241]
|
||||
208 | 1.0 | addq $8, %r15 #59.9| [208]
|
||||
205 | 1.0 | vpaddd %ymm15, %ymm4, %ymm4 #59.9| [205]
|
||||
|
197
static_analysis/jan/analyses/lammps-icx-avx2zen-mca.out
Normal file
197
static_analysis/jan/analyses/lammps-icx-avx2zen-mca.out
Normal file
@@ -0,0 +1,197 @@
|
||||
|
||||
[0] Code Region
|
||||
|
||||
Iterations: 100
|
||||
Instructions: 7000
|
||||
Total Cycles: 3866
|
||||
Total uOps: 7900
|
||||
|
||||
Dispatch Width: 6
|
||||
uOps Per Cycle: 2.04
|
||||
IPC: 1.81
|
||||
Block RThroughput: 21.5
|
||||
|
||||
|
||||
Instruction Info:
|
||||
[1]: #uOps
|
||||
[2]: Latency
|
||||
[3]: RThroughput
|
||||
[4]: MayLoad
|
||||
[5]: MayStore
|
||||
[6]: HasSideEffects (U)
|
||||
|
||||
[1] [2] [3] [4] [5] [6] Instructions:
|
||||
1 8 0.50 * vpbroadcastd .LCPI0_1(%rip), %xmm1
|
||||
1 10 0.50 * vpmulld (%r11,%rbp,4), %xmm1, %xmm11
|
||||
2 4 1.50 vpmovsxdq %xmm11, %ymm1
|
||||
1 1 0.50 vpsllq $3, %ymm1, %ymm1
|
||||
1 1 0.25 vpaddq %ymm1, %ymm3, %ymm1
|
||||
1 1 1.00 vmovq %xmm1, %r14
|
||||
2 1 1.00 vpextrq $1, %xmm1, %r9
|
||||
1 4 1.00 vextracti128 $1, %ymm1, %xmm1
|
||||
1 8 0.50 * vmovsd (%r14), %xmm2
|
||||
1 8 0.50 * vpsubd .LCPI0_5, %xmm11, %xmm6
|
||||
2 4 1.50 vpmovsxdq %xmm6, %ymm6
|
||||
1 1 0.50 vpsllq $3, %ymm6, %ymm6
|
||||
1 1 1.00 vmovq %xmm1, %rdi
|
||||
1 1 0.25 vpaddq %ymm6, %ymm3, %ymm6
|
||||
1 1 1.00 vmovq %xmm6, %rcx
|
||||
2 1 1.00 vpextrq $1, %xmm1, %rbx
|
||||
2 1 1.00 vpextrq $1, %xmm6, %rax
|
||||
1 4 1.00 vextracti128 $1, %ymm6, %xmm1
|
||||
1 8 0.50 * vmovsd (%rdi), %xmm6
|
||||
1 1 1.00 vmovq %xmm1, %rdi
|
||||
2 1 1.00 vpextrq $1, %xmm1, %rsi
|
||||
1 8 0.50 * vmovsd (%rdi), %xmm1
|
||||
1 8 0.50 * vmovsd (%rcx), %xmm7
|
||||
1 8 0.50 * vpbroadcastd .LCPI0_2(%rip), %xmm12
|
||||
1 8 0.50 * vmovhpd (%r9), %xmm2, %xmm2
|
||||
1 1 0.25 vpaddd %xmm12, %xmm11, %xmm4
|
||||
2 4 1.50 vpmovsxdq %xmm4, %ymm4
|
||||
1 8 0.50 * vmovhpd (%rax), %xmm7, %xmm7
|
||||
1 1 0.50 vpsllq $3, %ymm4, %ymm4
|
||||
1 1 0.25 vpaddq %ymm4, %ymm3, %ymm4
|
||||
1 8 0.50 * vmovhpd (%rbx), %xmm6, %xmm6
|
||||
2 1 1.00 vpextrq $1, %xmm4, %rax
|
||||
1 8 0.50 * vmovhpd (%rsi), %xmm1, %xmm1
|
||||
1 1 1.00 vmovq %xmm4, %rcx
|
||||
1 4 1.00 vextracti128 $1, %ymm4, %xmm4
|
||||
1 1 1.00 vmovq %xmm4, %rsi
|
||||
1 2 1.00 vinsertf128 $1, %xmm6, %ymm2, %ymm2
|
||||
2 1 1.00 vpextrq $1, %xmm4, %rdi
|
||||
1 8 0.50 * vmovsd (%rsi), %xmm4
|
||||
1 3 0.50 vsubpd %ymm2, %ymm14, %ymm2
|
||||
1 8 0.50 * vmovhpd (%rdi), %xmm4, %xmm4
|
||||
1 8 0.50 * vmovsd (%rcx), %xmm6
|
||||
1 2 1.00 vinsertf128 $1, %xmm1, %ymm7, %ymm1
|
||||
1 8 0.50 * vmovhpd (%rax), %xmm6, %xmm6
|
||||
1 2 1.00 vinsertf128 $1, %xmm4, %ymm6, %ymm4
|
||||
1 3 0.50 vsubpd %ymm1, %ymm5, %ymm1
|
||||
1 3 0.50 vsubpd %ymm4, %ymm10, %ymm4
|
||||
1 3 0.50 vmulpd %ymm2, %ymm2, %ymm6
|
||||
1 4 1.00 vfmadd231pd %ymm1, %ymm1, %ymm6
|
||||
1 4 1.00 vfmadd231pd %ymm4, %ymm4, %ymm6
|
||||
1 8 0.50 * vbroadcastsd .LCPI0_3(%rip), %ymm7
|
||||
1 13 5.00 vdivpd %ymm6, %ymm7, %ymm7
|
||||
1 3 0.50 vmulpd %ymm7, %ymm7, %ymm11
|
||||
1 3 0.50 vmulpd %ymm9, %ymm11, %ymm11
|
||||
1 8 0.50 * vbroadcastsd .LCPI0_4(%rip), %ymm12
|
||||
1 3 0.50 vmulpd %ymm7, %ymm11, %ymm11
|
||||
1 3 0.50 vaddpd %ymm12, %ymm11, %ymm12
|
||||
1 10 0.50 * vmulpd 128(%rsp), %ymm7, %ymm7
|
||||
1 3 0.50 vmulpd %ymm7, %ymm11, %ymm7
|
||||
1 3 0.50 vmulpd %ymm7, %ymm12, %ymm7
|
||||
1 1 0.50 vcmpltpd %ymm8, %ymm6, %ymm6
|
||||
1 4 1.00 vfmadd213pd %ymm0, %ymm7, %ymm2
|
||||
1 1 0.50 vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
|
||||
1 4 1.00 vfmadd213pd %ymm15, %ymm7, %ymm1
|
||||
1 4 1.00 vfmadd213pd %ymm13, %ymm7, %ymm4
|
||||
1 1 0.50 vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
|
||||
1 1 0.50 vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
|
||||
1 1 0.25 addq $4, %rbp
|
||||
1 1 0.25 cmpq %rdx, %rbp
|
||||
1 1 0.50 jb .LBB0_9
|
||||
|
||||
|
||||
Resources:
|
||||
[0] - Zn3AGU0
|
||||
[1] - Zn3AGU1
|
||||
[2] - Zn3AGU2
|
||||
[3] - Zn3ALU0
|
||||
[4] - Zn3ALU1
|
||||
[5] - Zn3ALU2
|
||||
[6] - Zn3ALU3
|
||||
[7] - Zn3BRU1
|
||||
[8] - Zn3FPP0
|
||||
[9] - Zn3FPP1
|
||||
[10] - Zn3FPP2
|
||||
[11] - Zn3FPP3
|
||||
[12.0] - Zn3FPP45
|
||||
[12.1] - Zn3FPP45
|
||||
[13] - Zn3FPSt
|
||||
[14.0] - Zn3LSU
|
||||
[14.1] - Zn3LSU
|
||||
[14.2] - Zn3LSU
|
||||
[15.0] - Zn3Load
|
||||
[15.1] - Zn3Load
|
||||
[15.2] - Zn3Load
|
||||
[16.0] - Zn3Store
|
||||
[16.1] - Zn3Store
|
||||
|
||||
|
||||
Resource pressure per iteration:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
|
||||
- - - 0.60 0.60 0.60 0.60 0.60 16.84 23.53 16.30 7.33 21.50 21.50 - 6.33 6.33 6.34 6.33 6.33 6.34 - -
|
||||
|
||||
Resource pressure by instruction:
|
||||
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
|
||||
- - - - - - - - - 0.03 0.97 - 0.51 0.49 - 0.34 0.33 0.33 0.34 0.33 0.33 - - vpbroadcastd .LCPI0_1(%rip), %xmm1
|
||||
- - - - - - - - 0.65 - - 0.35 0.34 0.66 - 0.49 0.05 0.46 0.49 0.05 0.46 - - vpmulld (%r11,%rbp,4), %xmm1, %xmm11
|
||||
- - - - - - - - - 0.06 2.94 - - - - - - - - - - - - vpmovsxdq %xmm11, %ymm1
|
||||
- - - - - - - - - 0.65 0.35 - - - - - - - - - - - - vpsllq $3, %ymm1, %ymm1
|
||||
- - - - - - - - - - - 1.00 - - - - - - - - - - - vpaddq %ymm1, %ymm3, %ymm1
|
||||
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vmovq %xmm1, %r14
|
||||
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm1, %r9
|
||||
- - - - - - - - 1.00 - - - - - - - - - - - - - - vextracti128 $1, %ymm1, %xmm1
|
||||
- - - - - - - - - - - - 0.50 0.50 - 0.48 0.35 0.17 0.48 0.35 0.17 - - vmovsd (%r14), %xmm2
|
||||
- - - - - - - - 0.01 0.18 0.17 0.64 0.47 0.53 - 0.34 0.33 0.33 0.34 0.33 0.33 - - vpsubd .LCPI0_5, %xmm11, %xmm6
|
||||
- - - - - - - - - 1.92 1.08 - - - - - - - - - - - - vpmovsxdq %xmm6, %ymm6
|
||||
- - - - - - - - - 0.32 0.68 - - - - - - - - - - - - vpsllq $3, %ymm6, %ymm6
|
||||
- - - - - - - - - - - - 1.30 0.70 - - - - - - - - - vmovq %xmm1, %rdi
|
||||
- - - - - - - - - - 0.32 0.68 - - - - - - - - - - - vpaddq %ymm6, %ymm3, %ymm6
|
||||
- - - - - - - - - - - - 1.34 0.66 - - - - - - - - - vmovq %xmm6, %rcx
|
||||
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm1, %rbx
|
||||
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm6, %rax
|
||||
- - - - - - - - 1.00 - - - - - - - - - - - - - - vextracti128 $1, %ymm6, %xmm1
|
||||
- - - - - - - - - - - - 0.50 0.50 - 0.03 0.65 0.32 0.03 0.65 0.32 - - vmovsd (%rdi), %xmm6
|
||||
- - - - - - - - - - - - 0.36 1.64 - - - - - - - - - vmovq %xmm1, %rdi
|
||||
- - - - - - - - - - - - 1.64 0.36 - - - - - - - - - vpextrq $1, %xmm1, %rsi
|
||||
- - - - - - - - - - - - 0.32 0.68 - 0.51 0.33 0.16 0.51 0.33 0.16 - - vmovsd (%rdi), %xmm1
|
||||
- - - - - - - - - - - - 0.68 0.32 - 0.49 0.01 0.50 0.49 0.01 0.50 - - vmovsd (%rcx), %xmm7
|
||||
- - - - - - - - - 0.48 0.52 - 0.67 0.33 - 0.17 0.62 0.21 0.17 0.62 0.21 - - vpbroadcastd .LCPI0_2(%rip), %xmm12
|
||||
- - - - - - - - - 0.01 0.99 - 0.17 0.83 - 0.02 0.64 0.34 0.02 0.64 0.34 - - vmovhpd (%r9), %xmm2, %xmm2
|
||||
- - - - - - - - 0.01 - - 0.99 - - - - - - - - - - - vpaddd %xmm12, %xmm11, %xmm4
|
||||
- - - - - - - - - 0.57 2.43 - - - - - - - - - - - - vpmovsxdq %xmm4, %ymm4
|
||||
- - - - - - - - - 0.34 0.66 - 0.82 0.18 - 0.49 0.35 0.16 0.49 0.35 0.16 - - vmovhpd (%rax), %xmm7, %xmm7
|
||||
- - - - - - - - - 0.34 0.66 - - - - - - - - - - - - vpsllq $3, %ymm4, %ymm4
|
||||
- - - - - - - - - - 0.01 0.99 - - - - - - - - - - - vpaddq %ymm4, %ymm3, %ymm4
|
||||
- - - - - - - - - 0.51 0.49 - 0.49 0.51 - 0.35 0.16 0.49 0.35 0.16 0.49 - - vmovhpd (%rbx), %xmm6, %xmm6
|
||||
- - - - - - - - - - - - 1.04 0.96 - - - - - - - - - vpextrq $1, %xmm4, %rax
|
||||
- - - - - - - - - 0.49 0.51 - 0.17 0.83 - 0.16 0.49 0.35 0.16 0.49 0.35 - - vmovhpd (%rsi), %xmm1, %xmm1
|
||||
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vmovq %xmm4, %rcx
|
||||
- - - - - - - - 1.00 - - - - - - - - - - - - - - vextracti128 $1, %ymm4, %xmm4
|
||||
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vmovq %xmm4, %rsi
|
||||
- - - - - - - - - 1.00 - - - - - - - - - - - - - vinsertf128 $1, %xmm6, %ymm2, %ymm2
|
||||
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm4, %rdi
|
||||
- - - - - - - - - - - - 0.50 0.50 - 0.49 0.35 0.16 0.49 0.35 0.16 - - vmovsd (%rsi), %xmm4
|
||||
- - - - - - - - - - 0.31 0.69 - - - - - - - - - - - vsubpd %ymm2, %ymm14, %ymm2
|
||||
- - - - - - - - - 0.49 0.51 - 0.48 0.52 - 0.35 0.16 0.49 0.35 0.16 0.49 - - vmovhpd (%rdi), %xmm4, %xmm4
|
||||
- - - - - - - - - - - - 0.52 0.48 - 0.16 0.49 0.35 0.16 0.49 0.35 - - vmovsd (%rcx), %xmm6
|
||||
- - - - - - - - - 1.00 - - - - - - - - - - - - - vinsertf128 $1, %xmm1, %ymm7, %ymm1
|
||||
- - - - - - - - - 0.35 0.65 - 0.50 0.50 - 0.47 0.35 0.18 0.47 0.35 0.18 - - vmovhpd (%rax), %xmm6, %xmm6
|
||||
- - - - - - - - - 1.00 - - - - - - - - - - - - - vinsertf128 $1, %xmm4, %ymm6, %ymm4
|
||||
- - - - - - - - - - 0.33 0.67 - - - - - - - - - - - vsubpd %ymm1, %ymm5, %ymm1
|
||||
- - - - - - - - - - 0.51 0.49 - - - - - - - - - - - vsubpd %ymm4, %ymm10, %ymm4
|
||||
- - - - - - - - 0.52 0.48 - - - - - - - - - - - - - vmulpd %ymm2, %ymm2, %ymm6
|
||||
- - - - - - - - 1.00 1.00 - - - - - - - - - - - - - vfmadd231pd %ymm1, %ymm1, %ymm6
|
||||
- - - - - - - - 1.34 0.66 - - - - - - - - - - - - - vfmadd231pd %ymm4, %ymm4, %ymm6
|
||||
- - - - - - - - - 0.66 0.34 - 0.51 0.49 - 0.19 0.32 0.49 0.19 0.32 0.49 - - vbroadcastsd .LCPI0_3(%rip), %ymm7
|
||||
- - - - - - - - - 5.00 - - - - - - - - - - - - - vdivpd %ymm6, %ymm7, %ymm7
|
||||
- - - - - - - - 0.50 0.50 - - - - - - - - - - - - - vmulpd %ymm7, %ymm7, %ymm11
|
||||
- - - - - - - - 0.99 0.01 - - - - - - - - - - - - - vmulpd %ymm9, %ymm11, %ymm11
|
||||
- - - - - - - - - 0.30 0.70 - 0.49 0.51 - 0.34 0.33 0.33 0.34 0.33 0.33 - - vbroadcastsd .LCPI0_4(%rip), %ymm12
|
||||
- - - - - - - - 0.82 0.18 - - - - - - - - - - - - - vmulpd %ymm7, %ymm11, %ymm11
|
||||
- - - - - - - - - - 0.17 0.83 - - - - - - - - - - - vaddpd %ymm12, %ymm11, %ymm12
|
||||
- - - - - - - - 0.01 0.99 - - 0.18 0.82 - 0.46 0.02 0.52 0.46 0.02 0.52 - - vmulpd 128(%rsp), %ymm7, %ymm7
|
||||
- - - - - - - - 0.99 0.01 - - - - - - - - - - - - - vmulpd %ymm7, %ymm11, %ymm7
|
||||
- - - - - - - - 0.67 0.33 - - - - - - - - - - - - - vmulpd %ymm7, %ymm12, %ymm7
|
||||
- - - - - - - - 1.00 - - - - - - - - - - - - - - vcmpltpd %ymm8, %ymm6, %ymm6
|
||||
- - - - - - - - 1.34 0.66 - - - - - - - - - - - - - vfmadd213pd %ymm0, %ymm7, %ymm2
|
||||
- - - - - - - - 0.66 0.34 - - - - - - - - - - - - - vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
|
||||
- - - - - - - - 0.66 1.34 - - - - - - - - - - - - - vfmadd213pd %ymm15, %ymm7, %ymm1
|
||||
- - - - - - - - 1.34 0.66 - - - - - - - - - - - - - vfmadd213pd %ymm13, %ymm7, %ymm4
|
||||
- - - - - - - - 0.34 0.66 - - - - - - - - - - - - - vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
|
||||
- - - - - - - - 0.99 0.01 - - - - - - - - - - - - - vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
|
||||
- - - - 0.40 0.20 0.40 - - - - - - - - - - - - - - - - addq $4, %rbp
|
||||
- - - 0.20 0.20 0.40 0.20 - - - - - - - - - - - - - - - - cmpq %rdx, %rbp
|
||||
- - - 0.40 - - - 0.60 - - - - - - - - - - - - - - - jb .LBB0_9
|
108
static_analysis/jan/analyses/lammps-icx-avx2zen-osaca.out
Normal file
108
static_analysis/jan/analyses/lammps-icx-avx2zen-osaca.out
Normal file
@@ -0,0 +1,108 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: lammps-icx-avx2zen.s
|
||||
Architecture: ZEN3
|
||||
Timestamp: 2023-02-10 16:31:30
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 | 1 | 2 | 3 | DV0 | DV1 | 4 | 5 | 6 | 7 | 8 - 8DV | 9 | 10 | 11 | 12 | 13 || CP | LCD |
|
||||
--------------------------------------------------------------------------------------------------------------------------------------------
|
||||
175 | | | | | | | | | | | | | | | | || | | # pointer_increment=16 e95035fc9e97f63299dd5188a0872bfc
|
||||
176 | | | | | | | | | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||
177 | | | | | | | | | | | | | | | | || | | .LBB0_9: #
|
||||
178 | | | | | | | | | | | | | | | | || | | # Parent Loop BB0_6 Depth=1
|
||||
179 | | | | | | | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2
|
||||
180 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || 1.0 | | vpbroadcastd .LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
|
||||
181 | 0.00 | | | 1.00 | | | | | | | | | | 0.50 | 0.50 | || 3.0 | | vpmulld (%r11,%rbp,4), %xmm1, %xmm11
|
||||
182 | 0.00 | 0.75 | 0.38 | 0.87 | | | | | | | | | | | | || 4.0 | | vpmovsxdq %xmm11, %ymm1
|
||||
183 | | 0.00 | 1.00 | | | | | | | | | | | | | || 1.0 | | vpsllq $3, %ymm1, %ymm1
|
||||
184 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || 1.0 | | vpaddq %ymm1, %ymm3, %ymm1
|
||||
185 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vmovq %xmm1, %r14
|
||||
186 | 0.12 | 1.88 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm1, %r9
|
||||
187 | | 1.00 | | | | | | | | | | | | | | || 3.0 | | vextracti128 $1, %ymm1, %xmm1
|
||||
188 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%r14), %xmm2 # xmm2 = mem[0],zero
|
||||
189 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vpsubd .LCPI0_5, %xmm11, %xmm6
|
||||
190 | 0.00 | 0.75 | 0.38 | 0.87 | | | | | | | | | | | | || | | vpmovsxdq %xmm6, %ymm6
|
||||
191 | | 0.00 | 1.00 | | | | | | | | | | | | | || | | vpsllq $3, %ymm6, %ymm6
|
||||
192 | 0.00 | 0.00 | 0.49 | 0.51 | | | | | | | | | | | | || | | vmovq %xmm1, %rdi
|
||||
193 | 0.00 | 0.00 | 0.51 | 0.49 | | | | | | | | | | | | || | | vpaddq %ymm6, %ymm3, %ymm6
|
||||
194 | 0.00 | 0.00 | 0.49 | 0.51 | | | | | | | | | | | | || | | vmovq %xmm6, %rcx
|
||||
195 | 0.13 | 1.87 | | | | | | | | | | | | | | || 6.0 | | vpextrq $1, %xmm1, %rbx
|
||||
196 | 0.00 | 2.00 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm6, %rax
|
||||
197 | | 1.00 | | | | | | | | | | | | | | || | | vextracti128 $1, %ymm6, %xmm1
|
||||
198 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rdi), %xmm6 # xmm6 = mem[0],zero
|
||||
199 | 0.00 | 0.00 | 0.00 | 1.00 | | | | | | | | | | | | || | | vmovq %xmm1, %rdi
|
||||
200 | 0.00 | 2.00 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm1, %rsi
|
||||
201 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
|
||||
202 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rcx), %xmm7 # xmm7 = mem[0],zero
|
||||
203 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vpbroadcastd .LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
|
||||
204 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%r9), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0]
|
||||
205 | 0.00 | 0.00 | 0.63 | 0.37 | | | | | | | | | | | | || | | vpaddd %xmm12, %xmm11, %xmm4
|
||||
206 | 0.00 | 0.75 | 0.00 | 1.25 | | | | | | | | | | | | || | | vpmovsxdq %xmm4, %ymm4
|
||||
207 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rax), %xmm7, %xmm7 # xmm7 = xmm7[0],mem[0]
|
||||
208 | | 0.00 | 1.00 | | | | | | | | | | | | | || | | vpsllq $3, %ymm4, %ymm4
|
||||
209 | 0.00 | 0.00 | 0.00 | 1.00 | | | | | | | | | | | | || | | vpaddq %ymm4, %ymm3, %ymm4
|
||||
210 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || 5.0 | | vmovhpd (%rbx), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||
211 | 0.75 | 1.25 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm4, %rax
|
||||
212 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rsi), %xmm1, %xmm1 # xmm1 = xmm1[0],mem[0]
|
||||
213 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vmovq %xmm4, %rcx
|
||||
214 | | 1.00 | | | | | | | | | | | | | | || | | vextracti128 $1, %ymm4, %xmm4
|
||||
215 | 0.00 | 0.00 | 0.00 | 1.00 | | | | | | | | | | | | || | | vmovq %xmm4, %rsi
|
||||
216 | | 1.00 | | | | | | | | | | | | | | || 1.0 | | vinsertf128 $1, %xmm6, %ymm2, %ymm2
|
||||
217 | 1.00 | 1.00 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm4, %rdi
|
||||
218 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rsi), %xmm4 # xmm4 = mem[0],zero
|
||||
219 | | | 0.00 | 1.00 | | | | | | | | | | | | || 3.0 | | vsubpd %ymm2, %ymm14, %ymm2
|
||||
220 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rdi), %xmm4, %xmm4 # xmm4 = xmm4[0],mem[0]
|
||||
221 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rcx), %xmm6 # xmm6 = mem[0],zero
|
||||
222 | | 1.00 | | | | | | | | | | | | | | || | | vinsertf128 $1, %xmm1, %ymm7, %ymm1
|
||||
223 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rax), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||
224 | | 1.00 | | | | | | | | | | | | | | || | | vinsertf128 $1, %xmm4, %ymm6, %ymm4
|
||||
225 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vsubpd %ymm1, %ymm5, %ymm1
|
||||
226 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vsubpd %ymm4, %ymm10, %ymm4
|
||||
227 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm2, %ymm2, %ymm6
|
||||
228 | 1.00 | 0.00 | | | | | | | | | | | | | | || 4.0 | | vfmadd231pd %ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
|
||||
229 | 1.00 | 0.00 | | | | | | | | | | | | | | || 4.0 | | vfmadd231pd %ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
|
||||
230 | 1.00 | | | | | | | | | | | | | 0.50 | 0.50 | || | | vbroadcastsd .LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
|
||||
231 | | | | | 4.50 | 4.50 | | | | | | | | | | || 13.0 | | vdivpd %ymm6, %ymm7, %ymm7
|
||||
232 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm7, %ymm11
|
||||
233 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm9, %ymm11, %ymm11
|
||||
234 | 1.00 | | | | | | | | | | | | | 0.50 | 0.50 | || | | vbroadcastsd .LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
|
||||
235 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm11, %ymm11
|
||||
236 | | | 0.00 | 1.00 | | | | | | | | | | | | || 3.0 | | vaddpd %ymm12, %ymm11, %ymm12
|
||||
237 | 1.00 | 0.00 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
|
||||
238 | 1.00 | 0.00 | | | | | | | | | | | | | | || | | vmulpd %ymm7, %ymm11, %ymm7
|
||||
239 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm12, %ymm7
|
||||
240 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vcmpltpd %ymm8, %ymm6, %ymm6
|
||||
241 | 1.00 | 0.00 | | | | | | | | | | | | | | || 4.0 | | vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
|
||||
242 | 1.00 | 0.00 | | | | | | | | | | | | | | || 1.0 | | vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
|
||||
243 | 1.00 | 0.00 | | | | | | | | | | | | | | || | | vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
|
||||
244 | 1.00 | 0.00 | | | | | | | | | | | | | | || | 4.0 | vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
|
||||
245 | 1.00 | 0.00 | | | | | | | | | | | | | | || | | vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
|
||||
246 | 0.75 | 0.25 | | | | | | | | | | | | | | || | 1.0 | vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
|
||||
247 | | | | | | | | | 0.25 | 0.25 | 0.25 | 0.25 | | | | || | | addq $4, %rbp
|
||||
248 | | | | | | | | | 0.25 | 0.25 | 0.25 | 0.25 | | | | || | | cmpq %rdx, %rbp
|
||||
249 | | | | | | | | | 0.00 | | | | 1.00 | | | || | | jb .LBB0_9
|
||||
250 | | | | | | | | | | | | | | | | || | | # LLVM-MCA-END
|
||||
|
||||
18.8 18.5 15.9 15.9 4.50 4.50 0.50 0.50 0.50 0.50 9.00 9.00 72 5.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
244 | 5.0 | vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13| [244, 246]
|
||||
243 | 5.0 | vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15| [243, 245]
|
||||
241 | 5.0 | vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0| [241, 242]
|
||||
247 | 1.0 | addq $4, %rbp | [247]
|
||||
246 | 1.0 | vblendvpd %ymm6, %ymm4, %ymm13, %ymm13| [246]
|
||||
245 | 1.0 | vblendvpd %ymm6, %ymm1, %ymm15, %ymm15| [245]
|
||||
242 | 1.0 | vblendvpd %ymm6, %ymm2, %ymm0, %ymm0| [242]
|
||||
|
BIN
static_analysis/jan/gromacs-icc-avx512-dp.o
Normal file
BIN
static_analysis/jan/gromacs-icc-avx512-dp.o
Normal file
Binary file not shown.
4334
static_analysis/jan/gromacs-icc-avx512-dp.s
Normal file
4334
static_analysis/jan/gromacs-icc-avx512-dp.s
Normal file
File diff suppressed because it is too large
Load Diff
BIN
static_analysis/jan/gromacs-icc-avx512-sp.o
Normal file
BIN
static_analysis/jan/gromacs-icc-avx512-sp.o
Normal file
Binary file not shown.
4018
static_analysis/jan/gromacs-icc-avx512-sp.s
Normal file
4018
static_analysis/jan/gromacs-icc-avx512-sp.s
Normal file
File diff suppressed because it is too large
Load Diff
BIN
static_analysis/jan/gromacs-icx-avx512-dp.o
Normal file
BIN
static_analysis/jan/gromacs-icx-avx512-dp.o
Normal file
Binary file not shown.
2453
static_analysis/jan/gromacs-icx-avx512-dp.s
Normal file
2453
static_analysis/jan/gromacs-icx-avx512-dp.s
Normal file
File diff suppressed because it is too large
Load Diff
BIN
static_analysis/jan/gromacs-icx-avx512-sp.o
Normal file
BIN
static_analysis/jan/gromacs-icx-avx512-sp.o
Normal file
Binary file not shown.
2013
static_analysis/jan/gromacs-icx-avx512-sp.s
Normal file
2013
static_analysis/jan/gromacs-icx-avx512-sp.s
Normal file
File diff suppressed because it is too large
Load Diff
BIN
static_analysis/jan/lammps-icc-avx2.o
Normal file
BIN
static_analysis/jan/lammps-icc-avx2.o
Normal file
Binary file not shown.
1419
static_analysis/jan/lammps-icc-avx2.s
Normal file
1419
static_analysis/jan/lammps-icc-avx2.s
Normal file
File diff suppressed because it is too large
Load Diff
BIN
static_analysis/jan/lammps-icc-avx512.o
Normal file
BIN
static_analysis/jan/lammps-icc-avx512.o
Normal file
Binary file not shown.
1559
static_analysis/jan/lammps-icc-avx512.s
Normal file
1559
static_analysis/jan/lammps-icc-avx512.s
Normal file
File diff suppressed because it is too large
Load Diff
BIN
static_analysis/jan/lammps-icx-avx2zen.o
Normal file
BIN
static_analysis/jan/lammps-icx-avx2zen.o
Normal file
Binary file not shown.
640
static_analysis/jan/lammps-icx-avx2zen.s
Normal file
640
static_analysis/jan/lammps-icx-avx2zen.s
Normal file
@@ -0,0 +1,640 @@
|
||||
.text
|
||||
.file "force_lj.c"
|
||||
.section .rodata.cst8,"aM",@progbits,8
|
||||
.p2align 3 # -- Begin function computeForceLJFullNeigh_plain_c
|
||||
.LCPI0_0:
|
||||
.quad 4631952216750555136 # 48
|
||||
.LCPI0_3:
|
||||
.quad 4607182418800017408 # 1
|
||||
.LCPI0_4:
|
||||
.quad -4620693217682128896 # -0.5
|
||||
.section .rodata.cst4,"aM",@progbits,4
|
||||
.p2align 2
|
||||
.LCPI0_1:
|
||||
.long 3 # 0x3
|
||||
.LCPI0_2:
|
||||
.long 2 # 0x2
|
||||
.section .rodata.cst16,"aM",@progbits,16
|
||||
.p2align 4
|
||||
.LCPI0_5:
|
||||
.zero 16,255
|
||||
.text
|
||||
.globl computeForceLJFullNeigh_plain_c
|
||||
.p2align 4, 0x90
|
||||
.type computeForceLJFullNeigh_plain_c,@function
|
||||
computeForceLJFullNeigh_plain_c: #
|
||||
.LcomputeForceLJFullNeigh_plain_c$local:
|
||||
.cfi_startproc
|
||||
# %bb.0: #
|
||||
pushq %rbp
|
||||
.cfi_def_cfa_offset 16
|
||||
pushq %r15
|
||||
.cfi_def_cfa_offset 24
|
||||
pushq %r14
|
||||
.cfi_def_cfa_offset 32
|
||||
pushq %r13
|
||||
.cfi_def_cfa_offset 40
|
||||
pushq %r12
|
||||
.cfi_def_cfa_offset 48
|
||||
pushq %rbx
|
||||
.cfi_def_cfa_offset 56
|
||||
subq $264, %rsp # imm = 0x108
|
||||
.cfi_def_cfa_offset 320
|
||||
.cfi_offset %rbx, -56
|
||||
.cfi_offset %r12, -48
|
||||
.cfi_offset %r13, -40
|
||||
.cfi_offset %r14, -32
|
||||
.cfi_offset %r15, -24
|
||||
.cfi_offset %rbp, -16
|
||||
movq %rcx, %rbx
|
||||
movq %rdx, %r15
|
||||
movq %rsi, %r12
|
||||
movl 4(%rsi), %r14d
|
||||
vmovsd 144(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, (%rsp) # 8-byte Spill
|
||||
vmovsd 40(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, 128(%rsp) # 8-byte Spill
|
||||
vmovq 56(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovdqa %xmm0, 80(%rsp) # 16-byte Spill
|
||||
testl %r14d, %r14d
|
||||
jle .LBB0_2
|
||||
# %bb.1: #
|
||||
movq 64(%r12), %rdi
|
||||
leaq (,%r14,8), %rax
|
||||
leaq (%rax,%rax,2), %rdx
|
||||
xorl %esi, %esi
|
||||
callq _intel_fast_memset
|
||||
.LBB0_2: #
|
||||
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
vmovq %xmm0, 32(%rsp) # 8-byte Folded Spill
|
||||
movl $.L.str, %edi
|
||||
callq likwid_markerStartRegion
|
||||
testl %r14d, %r14d
|
||||
jle .LBB0_19
|
||||
# %bb.3: #
|
||||
vmovsd (%rsp), %xmm0 # 8-byte Reload
|
||||
# xmm0 = mem[0],zero
|
||||
vmulsd %xmm0, %xmm0, %xmm13
|
||||
movq 16(%r15), %r11
|
||||
movq 24(%r15), %rsi
|
||||
movslq 8(%r15), %rdi
|
||||
movq 16(%r12), %r15
|
||||
movq 64(%r12), %r8
|
||||
vmovsd 128(%rsp), %xmm0 # 8-byte Reload
|
||||
# xmm0 = mem[0],zero
|
||||
vmulsd .LCPI0_0(%rip), %xmm0, %xmm15
|
||||
movq %rbx, 24(%rsp) # 8-byte Spill
|
||||
vmovdqu (%rbx), %xmm14
|
||||
decq %r14
|
||||
vmovq %r15, %xmm0
|
||||
vpbroadcastq %xmm0, %ymm3
|
||||
vbroadcastsd %xmm13, %ymm2
|
||||
vmovapd 80(%rsp), %xmm12 # 16-byte Reload
|
||||
vbroadcastsd %xmm12, %ymm8
|
||||
vbroadcastsd %xmm15, %ymm9
|
||||
shlq $2, %rdi
|
||||
xorl %r10d, %r10d
|
||||
movq %r14, 56(%rsp) # 8-byte Spill
|
||||
vmovapd %xmm13, 192(%rsp) # 16-byte Spill
|
||||
movq %rsi, 48(%rsp) # 8-byte Spill
|
||||
movq %rdi, 40(%rsp) # 8-byte Spill
|
||||
vmovapd %xmm15, 176(%rsp) # 16-byte Spill
|
||||
vmovupd %ymm2, 224(%rsp) # 32-byte Spill
|
||||
vmovupd %ymm9, 128(%rsp) # 32-byte Spill
|
||||
jmp .LBB0_6
|
||||
.p2align 4, 0x90
|
||||
.LBB0_17: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
movq %r13, %rdx
|
||||
.LBB0_5: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
vaddsd (%r8,%r12,8), %xmm10, %xmm0
|
||||
vmovsd %xmm0, (%r8,%r12,8)
|
||||
vaddsd (%r8,%rbx,8), %xmm11, %xmm0
|
||||
vmovsd %xmm0, (%r8,%rbx,8)
|
||||
vaddsd (%r8,%rbp,8), %xmm5, %xmm0
|
||||
vmovsd %xmm0, (%r8,%rbp,8)
|
||||
leal 3(%r13), %eax
|
||||
addl $6, %r13d
|
||||
testl %eax, %eax
|
||||
cmovnsl %eax, %r13d
|
||||
sarl $2, %r13d
|
||||
movslq %r13d, %rax
|
||||
vmovq %rax, %xmm0
|
||||
vmovq %rdx, %xmm1
|
||||
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
|
||||
vpaddq %xmm0, %xmm14, %xmm14
|
||||
addq %rdi, %r11
|
||||
cmpq %r14, %r10
|
||||
leaq 1(%r10), %r10
|
||||
je .LBB0_18
|
||||
.LBB0_6: #
|
||||
# =>This Loop Header: Depth=1
|
||||
# Child Loop BB0_9 Depth 2
|
||||
# Child Loop BB0_13 Depth 2
|
||||
movl (%rsi,%r10,4), %r13d
|
||||
leal (%r10,%r10,2), %r12d
|
||||
leal (%r10,%r10,2), %ebx
|
||||
incl %ebx
|
||||
leal (%r10,%r10,2), %ebp
|
||||
addl $2, %ebp
|
||||
testl %r13d, %r13d
|
||||
jle .LBB0_4
|
||||
# %bb.7: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
vmovsd (%r15,%r12,8), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd (%r15,%rbx,8), %xmm1 # xmm1 = mem[0],zero
|
||||
vmovsd (%r15,%rbp,8), %xmm2 # xmm2 = mem[0],zero
|
||||
movq %r13, %rdx
|
||||
movl $4294967292, %eax # imm = 0xFFFFFFFC
|
||||
andq %rax, %rdx
|
||||
vmovapd %xmm0, 112(%rsp) # 16-byte Spill
|
||||
vmovapd %xmm1, 96(%rsp) # 16-byte Spill
|
||||
vmovapd %xmm2, (%rsp) # 16-byte Spill
|
||||
je .LBB0_16
|
||||
# %bb.8: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
movq %rbp, 64(%rsp) # 8-byte Spill
|
||||
movq %rbx, 72(%rsp) # 8-byte Spill
|
||||
vmovdqa %xmm14, 208(%rsp) # 16-byte Spill
|
||||
vbroadcastsd %xmm0, %ymm14
|
||||
vbroadcastsd %xmm1, %ymm5
|
||||
vbroadcastsd %xmm2, %ymm10
|
||||
vxorpd %xmm0, %xmm0, %xmm0
|
||||
vxorpd %xmm15, %xmm15, %xmm15
|
||||
vxorpd %xmm13, %xmm13, %xmm13
|
||||
xorl %ebp, %ebp
|
||||
vmovapd %ymm8, %ymm9
|
||||
vmovupd 224(%rsp), %ymm8 # 32-byte Reload
|
||||
.p2align 4, 0x90
|
||||
movl $111, %ebx # OSACA START MARKER
|
||||
.byte 100 # OSACA START MARKER
|
||||
.byte 103 # OSACA START MARKER
|
||||
.byte 144 # OSACA START MARKER
|
||||
# pointer_increment=16 e95035fc9e97f63299dd5188a0872bfc
|
||||
# LLVM-MCA-BEGIN
|
||||
.LBB0_9: #
|
||||
# Parent Loop BB0_6 Depth=1
|
||||
# => This Inner Loop Header: Depth=2
|
||||
vpbroadcastd .LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
|
||||
vpmulld (%r11,%rbp,4), %xmm1, %xmm11
|
||||
vpmovsxdq %xmm11, %ymm1
|
||||
vpsllq $3, %ymm1, %ymm1
|
||||
vpaddq %ymm1, %ymm3, %ymm1
|
||||
vmovq %xmm1, %r14
|
||||
vpextrq $1, %xmm1, %r9
|
||||
vextracti128 $1, %ymm1, %xmm1
|
||||
vmovsd (%r14), %xmm2 # xmm2 = mem[0],zero
|
||||
vpsubd .LCPI0_5, %xmm11, %xmm6
|
||||
vpmovsxdq %xmm6, %ymm6
|
||||
vpsllq $3, %ymm6, %ymm6
|
||||
vmovq %xmm1, %rdi
|
||||
vpaddq %ymm6, %ymm3, %ymm6
|
||||
vmovq %xmm6, %rcx
|
||||
vpextrq $1, %xmm1, %rbx
|
||||
vpextrq $1, %xmm6, %rax
|
||||
vextracti128 $1, %ymm6, %xmm1
|
||||
vmovsd (%rdi), %xmm6 # xmm6 = mem[0],zero
|
||||
vmovq %xmm1, %rdi
|
||||
vpextrq $1, %xmm1, %rsi
|
||||
vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
|
||||
vmovsd (%rcx), %xmm7 # xmm7 = mem[0],zero
|
||||
vpbroadcastd .LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
|
||||
vmovhpd (%r9), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0]
|
||||
vpaddd %xmm12, %xmm11, %xmm4
|
||||
vpmovsxdq %xmm4, %ymm4
|
||||
vmovhpd (%rax), %xmm7, %xmm7 # xmm7 = xmm7[0],mem[0]
|
||||
vpsllq $3, %ymm4, %ymm4
|
||||
vpaddq %ymm4, %ymm3, %ymm4
|
||||
vmovhpd (%rbx), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||
vpextrq $1, %xmm4, %rax
|
||||
vmovhpd (%rsi), %xmm1, %xmm1 # xmm1 = xmm1[0],mem[0]
|
||||
vmovq %xmm4, %rcx
|
||||
vextracti128 $1, %ymm4, %xmm4
|
||||
vmovq %xmm4, %rsi
|
||||
vinsertf128 $1, %xmm6, %ymm2, %ymm2
|
||||
vpextrq $1, %xmm4, %rdi
|
||||
vmovsd (%rsi), %xmm4 # xmm4 = mem[0],zero
|
||||
vsubpd %ymm2, %ymm14, %ymm2
|
||||
vmovhpd (%rdi), %xmm4, %xmm4 # xmm4 = xmm4[0],mem[0]
|
||||
vmovsd (%rcx), %xmm6 # xmm6 = mem[0],zero
|
||||
vinsertf128 $1, %xmm1, %ymm7, %ymm1
|
||||
vmovhpd (%rax), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||
vinsertf128 $1, %xmm4, %ymm6, %ymm4
|
||||
vsubpd %ymm1, %ymm5, %ymm1
|
||||
vsubpd %ymm4, %ymm10, %ymm4
|
||||
vmulpd %ymm2, %ymm2, %ymm6
|
||||
vfmadd231pd %ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
|
||||
vfmadd231pd %ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
|
||||
vbroadcastsd .LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
|
||||
vdivpd %ymm6, %ymm7, %ymm7
|
||||
vmulpd %ymm7, %ymm7, %ymm11
|
||||
vmulpd %ymm9, %ymm11, %ymm11
|
||||
vbroadcastsd .LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
|
||||
vmulpd %ymm7, %ymm11, %ymm11
|
||||
vaddpd %ymm12, %ymm11, %ymm12
|
||||
vmulpd 128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
|
||||
vmulpd %ymm7, %ymm11, %ymm7
|
||||
vmulpd %ymm7, %ymm12, %ymm7
|
||||
vcmpltpd %ymm8, %ymm6, %ymm6
|
||||
vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
|
||||
vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
|
||||
vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
|
||||
vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
|
||||
vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
|
||||
vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
|
||||
addq $4, %rbp
|
||||
cmpq %rdx, %rbp
|
||||
jb .LBB0_9
|
||||
# LLVM-MCA-END
|
||||
movl $222, %ebx # OSACA END MARKER
|
||||
.byte 100 # OSACA END MARKER
|
||||
.byte 103 # OSACA END MARKER
|
||||
.byte 144 # OSACA END MARKER
|
||||
# %bb.10: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
|
||||
vaddsd %xmm1, %xmm0, %xmm1
|
||||
vextractf128 $1, %ymm0, %xmm0
|
||||
vaddsd %xmm0, %xmm1, %xmm1
|
||||
vpermilpd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0]
|
||||
vaddsd %xmm0, %xmm1, %xmm10
|
||||
vpermilpd $1, %xmm15, %xmm1 # xmm1 = xmm15[1,0]
|
||||
vaddsd %xmm1, %xmm15, %xmm1
|
||||
vextractf128 $1, %ymm15, %xmm2
|
||||
vaddsd %xmm2, %xmm1, %xmm1
|
||||
vpermilpd $1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
|
||||
vaddsd %xmm2, %xmm1, %xmm11
|
||||
vpermilpd $1, %xmm13, %xmm1 # xmm1 = xmm13[1,0]
|
||||
vaddsd %xmm1, %xmm13, %xmm1
|
||||
vextractf128 $1, %ymm13, %xmm2
|
||||
vaddsd %xmm2, %xmm1, %xmm1
|
||||
vpermilpd $1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
|
||||
vaddsd %xmm2, %xmm1, %xmm5
|
||||
movq 56(%rsp), %r14 # 8-byte Reload
|
||||
vmovapd 80(%rsp), %xmm12 # 16-byte Reload
|
||||
vmovapd 192(%rsp), %xmm13 # 16-byte Reload
|
||||
movq 48(%rsp), %rsi # 8-byte Reload
|
||||
movq 40(%rsp), %rdi # 8-byte Reload
|
||||
vmovdqa 208(%rsp), %xmm14 # 16-byte Reload
|
||||
vmovapd 176(%rsp), %xmm15 # 16-byte Reload
|
||||
vmovapd %ymm9, %ymm8
|
||||
movq 72(%rsp), %rbx # 8-byte Reload
|
||||
movq 64(%rsp), %rbp # 8-byte Reload
|
||||
vmovapd 112(%rsp), %xmm0 # 16-byte Reload
|
||||
cmpq %r13, %rdx
|
||||
jae .LBB0_17
|
||||
jmp .LBB0_11
|
||||
.p2align 4, 0x90
|
||||
.LBB0_4: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
movslq %r13d, %rdx
|
||||
vxorpd %xmm5, %xmm5, %xmm5
|
||||
vxorpd %xmm11, %xmm11, %xmm11
|
||||
vxorpd %xmm10, %xmm10, %xmm10
|
||||
jmp .LBB0_5
|
||||
.p2align 4, 0x90
|
||||
.LBB0_16: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
vxorpd %xmm10, %xmm10, %xmm10
|
||||
vxorpd %xmm11, %xmm11, %xmm11
|
||||
vxorpd %xmm5, %xmm5, %xmm5
|
||||
cmpq %r13, %rdx
|
||||
jae .LBB0_17
|
||||
.LBB0_11: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
vmovapd 96(%rsp), %xmm4 # 16-byte Reload
|
||||
jmp .LBB0_13
|
||||
.p2align 4, 0x90
|
||||
.LBB0_12: #
|
||||
# in Loop: Header=BB0_13 Depth=2
|
||||
incq %rdx
|
||||
cmpq %rdx, %r13
|
||||
je .LBB0_17
|
||||
.LBB0_13: #
|
||||
# Parent Loop BB0_6 Depth=1
|
||||
# => This Inner Loop Header: Depth=2
|
||||
movl (%r11,%rdx,4), %eax
|
||||
leal (%rax,%rax,2), %ecx
|
||||
movslq %ecx, %rcx
|
||||
vsubsd (%r15,%rcx,8), %xmm0, %xmm6
|
||||
leal (%rax,%rax,2), %ecx
|
||||
incl %ecx
|
||||
movslq %ecx, %rcx
|
||||
vsubsd (%r15,%rcx,8), %xmm4, %xmm2
|
||||
leal 2(%rax,%rax,2), %eax
|
||||
cltq
|
||||
vmovapd (%rsp), %xmm1 # 16-byte Reload
|
||||
vsubsd (%r15,%rax,8), %xmm1, %xmm1
|
||||
vmulsd %xmm6, %xmm6, %xmm7
|
||||
vfmadd231sd %xmm2, %xmm2, %xmm7 # xmm7 = (xmm2 * xmm2) + xmm7
|
||||
vfmadd231sd %xmm1, %xmm1, %xmm7 # xmm7 = (xmm1 * xmm1) + xmm7
|
||||
vucomisd %xmm13, %xmm7
|
||||
jae .LBB0_12
|
||||
# %bb.14: #
|
||||
# in Loop: Header=BB0_13 Depth=2
|
||||
vmovsd .LCPI0_3(%rip), %xmm0 # xmm0 = mem[0],zero
|
||||
vdivsd %xmm7, %xmm0, %xmm7
|
||||
vmulsd %xmm7, %xmm7, %xmm0
|
||||
vmulsd %xmm0, %xmm12, %xmm0
|
||||
vmulsd %xmm7, %xmm0, %xmm0
|
||||
vaddsd .LCPI0_4(%rip), %xmm0, %xmm4
|
||||
vmulsd %xmm7, %xmm15, %xmm7
|
||||
vmulsd %xmm0, %xmm7, %xmm0
|
||||
vmulsd %xmm4, %xmm0, %xmm0
|
||||
vmovapd 96(%rsp), %xmm4 # 16-byte Reload
|
||||
vfmadd231sd %xmm6, %xmm0, %xmm10 # xmm10 = (xmm0 * xmm6) + xmm10
|
||||
vfmadd231sd %xmm2, %xmm0, %xmm11 # xmm11 = (xmm0 * xmm2) + xmm11
|
||||
vfmadd231sd %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5
|
||||
vmovapd 112(%rsp), %xmm0 # 16-byte Reload
|
||||
jmp .LBB0_12
|
||||
.LBB0_18: #
|
||||
movq 24(%rsp), %rax # 8-byte Reload
|
||||
vmovdqu %xmm14, (%rax)
|
||||
.LBB0_19: #
|
||||
movl $.L.str, %edi
|
||||
vzeroupper
|
||||
callq likwid_markerStopRegion
|
||||
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
vsubsd 32(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload
|
||||
addq $264, %rsp # imm = 0x108
|
||||
.cfi_def_cfa_offset 56
|
||||
popq %rbx
|
||||
.cfi_def_cfa_offset 48
|
||||
popq %r12
|
||||
.cfi_def_cfa_offset 40
|
||||
popq %r13
|
||||
.cfi_def_cfa_offset 32
|
||||
popq %r14
|
||||
.cfi_def_cfa_offset 24
|
||||
popq %r15
|
||||
.cfi_def_cfa_offset 16
|
||||
popq %rbp
|
||||
.cfi_def_cfa_offset 8
|
||||
retq
|
||||
.Lfunc_end0:
|
||||
.size computeForceLJFullNeigh_plain_c, .Lfunc_end0-computeForceLJFullNeigh_plain_c
|
||||
.cfi_endproc
|
||||
# -- End function
|
||||
# computeForceLJHalfNeigh(%rdi, %rsi, %rdx, %rcx) -> %xmm0
# Compiler-generated scalar kernel. It reads three doubles from the %rdi object
# (offsets 144, 40, 56), a 32-bit count from 4(%rsi), array pointers from
# 16(%rsi) and 64(%rsi), and a neighbor table from 16(%rdx)/24(%rdx) with a
# row length at 8(%rdx). The 64(%rsi) array is zeroed, then pair contributions
# are accumulated into it inside the "forceLJ-halfneigh" LIKWID region. Two
# 64-bit counters at (%rcx) are updated, and the return value is the
# getTimeStamp() difference measured around the region (the memset is outside).
# NOTE(review): the field meanings (cutoff, epsilon, sigma6, positions, forces)
# are inferred from the function name - confirm against force_lj.c.
.section .rodata.cst8,"aM",@progbits,8
|
||||
.p2align 3 # -- Begin function computeForceLJHalfNeigh
|
||||
.LCPI1_0:
|
||||
.quad 4631952216750555136 # 48
|
||||
.LCPI1_1:
|
||||
.quad 4607182418800017408 # 1
|
||||
.LCPI1_2:
|
||||
.quad -4620693217682128896 # -0.5
|
||||
.text
|
||||
.globl computeForceLJHalfNeigh
|
||||
.p2align 4, 0x90
|
||||
.type computeForceLJHalfNeigh,@function
|
||||
computeForceLJHalfNeigh: #
|
||||
.LcomputeForceLJHalfNeigh$local:
|
||||
.cfi_startproc
|
||||
# %bb.0: #
# Prologue: save the six callee-saved registers and reserve 40 bytes of spill space.
pushq %rbp
|
||||
.cfi_def_cfa_offset 16
|
||||
pushq %r15
|
||||
.cfi_def_cfa_offset 24
|
||||
pushq %r14
|
||||
.cfi_def_cfa_offset 32
|
||||
pushq %r13
|
||||
.cfi_def_cfa_offset 40
|
||||
pushq %r12
|
||||
.cfi_def_cfa_offset 48
|
||||
pushq %rbx
|
||||
.cfi_def_cfa_offset 56
|
||||
subq $40, %rsp
|
||||
.cfi_def_cfa_offset 96
|
||||
.cfi_offset %rbx, -56
|
||||
.cfi_offset %r12, -48
|
||||
.cfi_offset %r13, -40
|
||||
.cfi_offset %r14, -32
|
||||
.cfi_offset %r15, -24
|
||||
.cfi_offset %rbp, -16
|
||||
# Keep the arguments across the calls below: %rcx on the stack, %rdx in %r15, %rsi in %r12.
movq %rcx, 16(%rsp) # 8-byte Spill
|
||||
movq %rdx, %r15
|
||||
movq %rsi, %r12
|
||||
# r13d = 32-bit count at 4(%rsi); it bounds the outer loop and sizes the memset.
movl 4(%rsi), %r13d
|
||||
# Spill the three doubles read from the %rdi object (offsets 144, 40, 56).
vmovsd 144(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, 8(%rsp) # 8-byte Spill
|
||||
vmovsd 40(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, (%rsp) # 8-byte Spill
|
||||
vmovsd 56(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, 32(%rsp) # 8-byte Spill
|
||||
testl %r13d, %r13d
|
||||
jle .LBB1_2
|
||||
# %bb.1: #
# Zero count * 24 bytes (three doubles per entry) of the array at 64(%rsi).
movq 64(%r12), %rdi
|
||||
leaq (,%r13,8), %rax
|
||||
leaq (%rax,%rax,2), %rdx
|
||||
xorl %esi, %esi
|
||||
callq _intel_fast_memset
|
||||
.LBB1_2: #
# Take the start timestamp (kept at 24(%rsp)) and open the LIKWID marker region.
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
vmovsd %xmm0, 24(%rsp) # 8-byte Spill
|
||||
movl $.L.str.1, %edi
|
||||
callq likwid_markerStartRegion
|
||||
testl %r13d, %r13d
|
||||
jle .LBB1_8
|
||||
# %bb.3: #
# Loop-invariant setup:
#   xmm12 = (double at 144(%rdi))^2, the threshold for the squared distance
#   xmm11 = 48 * (double at 40(%rdi))
#   rax   = current row of the index table at 16(%rdx), advanced per outer iteration
#   8(%rsp) = per-entry count array at 24(%rdx); (%rsp) = row stride in bytes (8(%rdx) * 4)
#   rsi / rdi = arrays at 16(%rsi) (read) and 64(%rsi) (accumulated into)
#   xmm10 = the two 64-bit counters loaded from (%rcx)
vmovsd 8(%rsp), %xmm0 # 8-byte Reload
|
||||
# xmm0 = mem[0],zero
|
||||
vmulsd %xmm0, %xmm0, %xmm12
|
||||
movq 16(%r15), %rax
|
||||
movq 24(%r15), %rcx
|
||||
movq %rcx, 8(%rsp) # 8-byte Spill
|
||||
movslq 8(%r15), %rdx
|
||||
movq 16(%r12), %rsi
|
||||
movq 64(%r12), %rdi
|
||||
vmovsd (%rsp), %xmm0 # 8-byte Reload
|
||||
# xmm0 = mem[0],zero
|
||||
vmulsd .LCPI1_0(%rip), %xmm0, %xmm11
|
||||
movq 16(%rsp), %rcx # 8-byte Reload
|
||||
vmovdqu (%rcx), %xmm10
|
||||
shlq $2, %rdx
|
||||
movq %rdx, (%rsp) # 8-byte Spill
|
||||
xorl %r12d, %r12d
|
||||
jmp .LBB1_4
|
||||
.p2align 4, 0x90
|
||||
.LBB1_5: #
|
||||
# in Loop: Header=BB1_4 Depth=1
|
||||
# Entry has no neighbors (count <= 0): use zero accumulators and rdx = count.
vxorpd %xmm13, %xmm13, %xmm13
|
||||
movq %r9, %rdx
|
||||
vxorpd %xmm9, %xmm9, %xmm9
|
||||
vxorpd %xmm14, %xmm14, %xmm14
|
||||
.LBB1_6: #
|
||||
# in Loop: Header=BB1_4 Depth=1
|
||||
# Outer-loop tail: add the three accumulators to entry i's slots (3i, 3i+1, 3i+2).
vaddsd (%rdi,%r15,8), %xmm14, %xmm0
|
||||
vmovsd %xmm0, (%rdi,%r15,8)
|
||||
vaddsd (%rdi,%r10,8), %xmm9, %xmm0
|
||||
vmovsd %xmm0, (%rdi,%r10,8)
|
||||
vaddsd (%rdi,%r11,8), %xmm13, %xmm0
|
||||
vmovsd %xmm0, (%rdi,%r11,8)
|
||||
# r9d = (count + 3) / 4 using signed truncating division (add 3 more if negative, then shift).
leal 3(%r9), %ecx
|
||||
addl $6, %r9d
|
||||
testl %ecx, %ecx
|
||||
cmovnsl %ecx, %r9d
|
||||
sarl $2, %r9d
|
||||
movslq %r9d, %rcx
|
||||
# counters += { rdx (count), (count + 3) / 4 } as two packed 64-bit adds.
vmovq %rcx, %xmm0
|
||||
vmovq %rdx, %xmm1
|
||||
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
|
||||
vpaddq %xmm0, %xmm10, %xmm10
|
||||
# Next entry: i++ and advance the index-table row pointer by one row stride.
incq %r12
|
||||
addq (%rsp), %rax # 8-byte Folded Reload
|
||||
cmpq %r13, %r12
|
||||
je .LBB1_7
|
||||
.LBB1_4: #
|
||||
# =>This Loop Header: Depth=1
|
||||
# Child Loop BB1_10 Depth 2
|
||||
# r9 = neighbor count for entry i; r15d / r10d / r11d = 3i, 3i+1, 3i+2.
movq 8(%rsp), %rcx # 8-byte Reload
|
||||
movslq (%rcx,%r12,4), %r9
|
||||
leaq (%r12,%r12,2), %rcx
|
||||
leal 1(%rcx), %r10d
|
||||
leal 2(%rcx), %r11d
|
||||
movl %ecx, %r15d
|
||||
testq %r9, %r9
|
||||
jle .LBB1_5
|
||||
# %bb.9: #
|
||||
# in Loop: Header=BB1_4 Depth=1
|
||||
# Load entry i's three components (presumably x/y/z) and zero the accumulators.
vmovsd (%rsi,%r15,8), %xmm15 # xmm15 = mem[0],zero
|
||||
vmovsd (%rsi,%r10,8), %xmm4 # xmm4 = mem[0],zero
|
||||
vmovsd (%rsi,%r11,8), %xmm1 # xmm1 = mem[0],zero
|
||||
movl %r9d, %edx
|
||||
vxorpd %xmm14, %xmm14, %xmm14
|
||||
xorl %ecx, %ecx
|
||||
vxorpd %xmm9, %xmm9, %xmm9
|
||||
vxorpd %xmm13, %xmm13, %xmm13
|
||||
jmp .LBB1_10
|
||||
.p2align 4, 0x90
|
||||
.LBB1_13: #
|
||||
# in Loop: Header=BB1_10 Depth=2
|
||||
# Inner-loop latch: k++ and leave once k equals the neighbor count.
incq %rcx
|
||||
cmpq %rcx, %rdx
|
||||
je .LBB1_6
|
||||
.LBB1_10: #
|
||||
# Parent Loop BB1_4 Depth=1
|
||||
# => This Inner Loop Header: Depth=2
|
||||
# r8 = neighbor index j from the current row; r14 = 3j.
movslq (%rax,%rcx,4), %r8
|
||||
leaq (%r8,%r8,2), %r14
|
||||
# Component-wise differences between entry i and entry j: xmm2, xmm5, xmm0.
vsubsd (%rsi,%r14,8), %xmm15, %xmm2
|
||||
movslq %r14d, %rbp
|
||||
vsubsd 8(%rsi,%rbp,8), %xmm4, %xmm5
|
||||
vsubsd 16(%rsi,%rbp,8), %xmm1, %xmm0
|
||||
# xmm6 = squared distance; skip the pair when it is >= the threshold in xmm12.
vmulsd %xmm2, %xmm2, %xmm6
|
||||
vfmadd231sd %xmm5, %xmm5, %xmm6 # xmm6 = (xmm5 * xmm5) + xmm6
|
||||
vfmadd231sd %xmm0, %xmm0, %xmm6 # xmm6 = (xmm0 * xmm0) + xmm6
|
||||
vucomisd %xmm12, %xmm6
|
||||
jae .LBB1_13
|
||||
# %bb.11: #
|
||||
# in Loop: Header=BB1_10 Depth=2
|
||||
# With s = 1 / squared distance and t = s^3 * (double from 56(%rdi)):
#   xmm3 = 48 * (double from 40(%rdi)) * s * t * (t - 0.5)
vmovsd .LCPI1_1(%rip), %xmm3 # xmm3 = mem[0],zero
|
||||
vdivsd %xmm6, %xmm3, %xmm6
|
||||
vmulsd 32(%rsp), %xmm6, %xmm3 # 8-byte Folded Reload
|
||||
vmulsd %xmm6, %xmm6, %xmm8
|
||||
vmulsd %xmm3, %xmm8, %xmm3
|
||||
vaddsd .LCPI1_2(%rip), %xmm3, %xmm7
|
||||
vmulsd %xmm6, %xmm11, %xmm6
|
||||
vmulsd %xmm3, %xmm6, %xmm3
|
||||
vmulsd %xmm7, %xmm3, %xmm3
|
||||
# Scale the differences by that factor and add them to entry i's accumulators.
vmulsd %xmm2, %xmm3, %xmm6
|
||||
vaddsd %xmm6, %xmm14, %xmm14
|
||||
vmulsd %xmm5, %xmm3, %xmm2
|
||||
vaddsd %xmm2, %xmm9, %xmm9
|
||||
vmulsd %xmm0, %xmm3, %xmm0
|
||||
vaddsd %xmm0, %xmm13, %xmm13
|
||||
# Only when j < count from 4(%rsi): subtract the same contribution from entry j.
cmpl %r13d, %r8d
|
||||
jge .LBB1_13
|
||||
# %bb.12: #
|
||||
# in Loop: Header=BB1_10 Depth=2
|
||||
leaq 1(%rbp), %rbx
|
||||
addq $2, %rbp
|
||||
vmovsd (%rdi,%r14,8), %xmm3 # xmm3 = mem[0],zero
|
||||
vsubsd %xmm6, %xmm3, %xmm3
|
||||
vmovsd %xmm3, (%rdi,%r14,8)
|
||||
vmovsd (%rdi,%rbx,8), %xmm3 # xmm3 = mem[0],zero
|
||||
vsubsd %xmm2, %xmm3, %xmm2
|
||||
vmovsd %xmm2, (%rdi,%rbx,8)
|
||||
vmovsd (%rdi,%rbp,8), %xmm2 # xmm2 = mem[0],zero
|
||||
vsubsd %xmm0, %xmm2, %xmm0
|
||||
vmovsd %xmm0, (%rdi,%rbp,8)
|
||||
jmp .LBB1_13
|
||||
.LBB1_7: #
|
||||
# Write the two updated counters back through the saved %rcx pointer.
movq 16(%rsp), %rax # 8-byte Reload
|
||||
vmovdqu %xmm10, (%rax)
|
||||
.LBB1_8: #
|
||||
# Close the marker region and return (end timestamp - start timestamp) in %xmm0.
movl $.L.str.1, %edi
|
||||
callq likwid_markerStopRegion
|
||||
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
vsubsd 24(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload
|
||||
addq $40, %rsp
|
||||
.cfi_def_cfa_offset 56
|
||||
popq %rbx
|
||||
.cfi_def_cfa_offset 48
|
||||
popq %r12
|
||||
.cfi_def_cfa_offset 40
|
||||
popq %r13
|
||||
.cfi_def_cfa_offset 32
|
||||
popq %r14
|
||||
.cfi_def_cfa_offset 24
|
||||
popq %r15
|
||||
.cfi_def_cfa_offset 16
|
||||
popq %rbp
|
||||
.cfi_def_cfa_offset 8
|
||||
retq
|
||||
.Lfunc_end1:
|
||||
.size computeForceLJHalfNeigh, .Lfunc_end1-computeForceLJHalfNeigh
|
||||
.cfi_endproc
|
||||
# -- End function
|
||||
# computeForceLJFullNeigh_simd: stub variant for this build. It zeroes the
# array at 64(%rsi), calls getTimeStamp(), writes the .L.str.2 error message to
# stderr and terminates the process with exit(-1). It never returns, so there
# is no epilogue or ret.
.globl computeForceLJFullNeigh_simd # -- Begin function computeForceLJFullNeigh_simd
|
||||
.p2align 4, 0x90
|
||||
.type computeForceLJFullNeigh_simd,@function
|
||||
computeForceLJFullNeigh_simd: #
|
||||
.LcomputeForceLJFullNeigh_simd$local:
|
||||
.cfi_startproc
|
||||
# %bb.0: #
|
||||
# The push only moves %rsp by 8 (the value is never popped) - presumably to keep
# the stack aligned for the calls below.
pushq %rax
|
||||
.cfi_def_cfa_offset 16
|
||||
# eax = 32-bit count at 4(%rsi); skip the memset when it is <= 0.
movl 4(%rsi), %eax
|
||||
testl %eax, %eax
|
||||
jle .LBB2_2
|
||||
# %bb.1: #
|
||||
# memset(pointer at 64(%rsi), 0, count * 24): three doubles per entry.
movq 64(%rsi), %rdi
|
||||
shlq $3, %rax
|
||||
leaq (%rax,%rax,2), %rdx
|
||||
xorl %esi, %esi
|
||||
callq _intel_fast_memset
|
||||
.LBB2_2: #
|
||||
# The timestamp result is not used afterwards in this function.
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
# fwrite(.L.str.2, 65, 1, stderr): 65 bytes is the message without its NUL terminator.
movq stderr(%rip), %rcx
|
||||
movl $.L.str.2, %edi
|
||||
movl $65, %esi
|
||||
movl $1, %edx
|
||||
callq fwrite
|
||||
# exit(-1)
movl $-1, %edi
|
||||
callq exit
|
||||
.Lfunc_end2:
|
||||
.size computeForceLJFullNeigh_simd, .Lfunc_end2-computeForceLJFullNeigh_simd
|
||||
.cfi_endproc
|
||||
# -- End function
|
||||
# NUL-terminated string constants referenced by the functions in this file.
# .L.str is the marker region name passed to the likwid_marker*Region calls in
# computeForceLJFullNeigh_plain_c.
.type .L.str,@object #
|
||||
.section .rodata.str1.1,"aMS",@progbits,1
|
||||
.L.str:
|
||||
.asciz "force"
|
||||
.size .L.str, 6
|
||||
# .L.str.1 is the marker region name used by computeForceLJHalfNeigh.
.type .L.str.1,@object #
|
||||
.L.str.1:
|
||||
.asciz "forceLJ-halfneigh"
|
||||
.size .L.str.1, 18
|
||||
# .L.str.2 is the message computeForceLJFullNeigh_simd writes to stderr before exit(-1).
.type .L.str.2,@object #
|
||||
.L.str.2:
|
||||
.asciz "Error: SIMD kernel not implemented for specified instruction set!"
|
||||
.size .L.str.2, 66
|
||||
# Compiler identification string recorded in the object file.
.ident "Intel(R) oneAPI DPC++ Compiler 2021.1-beta05 (2020.2.0.0304)"
|
||||
# Empty .note.GNU-stack section: tells the GNU linker no executable stack is needed.
.section ".note.GNU-stack","",@progbits
|
105
static_analysis/lammps-avx2-dp-ICX-osaca.txt
Normal file
105
static_analysis/lammps-avx2-dp-ICX-osaca.txt
Normal file
@@ -0,0 +1,105 @@
|
||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||
Analyzed file: force_lj_icx_avx2_markers.s
|
||||
Architecture: ZEN3
|
||||
Timestamp: 2022-12-12 12:47:07
|
||||
|
||||
|
||||
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||
* - Instruction micro-ops not bound to a port
|
||||
X - No throughput/latency information for this instruction in data file
|
||||
|
||||
|
||||
Combined Analysis Report
|
||||
------------------------
|
||||
Port pressure in cycles
|
||||
| 0 | 1 | 2 | 3 | DV0 | DV1 | 4 | 5 | 6 | 7 | 8 - 8DV | 9 | 10 | 11 | 12 | 13 || CP | LCD |
|
||||
---------------------------------------------------------------------------------------------------------------------------------------------
|
||||
172 | | | | | | | | | | | | | | | | || | | .LBB0_9: #
|
||||
173 | | | | | | | | | | | | | | | | || | | # Parent Loop BB0_6 Depth=1
|
||||
174 | | | | | | | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2
|
||||
175 | | 0.250 | 0.75 | | | | | | | | | | | 0.50 | 0.50 | || 1.0 | | vpbroadcastd .LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
|
||||
176 | 0.00 | | | 1.00 | | | | | | | | | | 0.50 | 0.50 | || 3.0 | | vpmulld (%r11,%rbp,4), %xmm1, %xmm11
|
||||
177 | 0.00 | 1.010 | 0.25 | 0.74 | | | | | | | | | | | | || 4.0 | | vpmovsxdq %xmm11, %ymm1
|
||||
178 | | 0.000 | 1.00 | | | | | | | | | | | | | || 1.0 | | vpsllq $3, %ymm1, %ymm1
|
||||
179 | 0.00 | 0.000 | 0.49 | 0.51 | | | | | | | | | | | | || 1.0 | | vpaddq %ymm1, %ymm3, %ymm1
|
||||
180 | 0.00 | 0.000 | 0.51 | 0.49 | | | | | | | | | | | | || | | vmovq %xmm1, %r14
|
||||
181 | | 1.000 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm1, %r9
|
||||
182 | | 1.000 | | | | | | | | | | | | | | || 3.0 | | vextracti128 $1, %ymm1, %xmm1
|
||||
183 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%r14), %xmm2 # xmm2 = mem[0],zero
|
||||
184 | 0.00 | 0.000 | 0.49 | 0.51 | | | | | | | | | | | | || | | vpsubd .LCPI0_5, %xmm11, %xmm6
|
||||
185 | 0.00 | 0.750 | 0.38 | 0.87 | | | | | | | | | | | | || | | vpmovsxdq %xmm6, %ymm6
|
||||
186 | | 0.000 | 1.00 | | | | | | | | | | | | | || | | vpsllq $3, %ymm6, %ymm6
|
||||
187 | 0.00 | 0.000 | 0.50 | 0.50 | | | | | | | | | | | | || | | vmovq %xmm1, %rdi
|
||||
188 | 0.00 | 0.000 | 0.50 | 0.50 | | | | | | | | | | | | || | | vpaddq %ymm6, %ymm3, %ymm6
|
||||
189 | 0.00 | 0.000 | 0.50 | 0.50 | | | | | | | | | | | | || | | vmovq %xmm6, %rcx
|
||||
190 | | 1.000 | | | | | | | | | | | | | | || 6.0 | | vpextrq $1, %xmm1, %rbx
|
||||
191 | | 1.000 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm6, %rax
|
||||
192 | | 1.000 | | | | | | | | | | | | | | || | | vextracti128 $1, %ymm6, %xmm1
|
||||
193 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rdi), %xmm6 # xmm6 = mem[0],zero
|
||||
194 | 0.00 | 0.000 | 0.00 | 1.00 | | | | | | | | | | | | || | | vmovq %xmm1, %rdi
|
||||
195 | | 1.000 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm1, %rsi
|
||||
196 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
|
||||
197 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rcx), %xmm7 # xmm7 = mem[0],zero
|
||||
198 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vpbroadcastd .LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
|
||||
199 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%r9), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0]
|
||||
200 | 0.00 | 0.000 | 0.62 | 0.38 | | | | | | | | | | | | || | | vpaddd %xmm12, %xmm11, %xmm4
|
||||
201 | 0.00 | 0.750 | 0.00 | 1.25 | | | | | | | | | | | | || | | vpmovsxdq %xmm4, %ymm4
|
||||
202 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rax), %xmm7, %xmm7 # xmm7 = xmm7[0],mem[0]
|
||||
203 | | 0.000 | 1.00 | | | | | | | | | | | | | || | | vpsllq $3, %ymm4, %ymm4
|
||||
204 | 0.00 | 0.000 | 0.00 | 1.00 | | | | | | | | | | | | || | | vpaddq %ymm4, %ymm3, %ymm4
|
||||
205 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || 5.0 | | vmovhpd (%rbx), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||
206 | | 1.000 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm4, %rax
|
||||
207 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rsi), %xmm1, %xmm1 # xmm1 = xmm1[0],mem[0]
|
||||
208 | 0.00 | 0.000 | 0.51 | 0.49 | | | | | | | | | | | | || | | vmovq %xmm4, %rcx
|
||||
209 | | 1.000 | | | | | | | | | | | | | | || | | vextracti128 $1, %ymm4, %xmm4
|
||||
210 | 0.00 | -0.01 | 0.00 | 1.00 | | | | | | | | | | | | || | | vmovq %xmm4, %rsi
|
||||
211 | | 1.000 | | | | | | | | | | | | | | || 1.0 | | vinsertf128 $1, %xmm6, %ymm2, %ymm2
|
||||
212 | | 1.000 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm4, %rdi
|
||||
213 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rsi), %xmm4 # xmm4 = mem[0],zero
|
||||
214 | | | 0.00 | 1.00 | | | | | | | | | | | | || 3.0 | | vsubpd %ymm2, %ymm14, %ymm2
|
||||
215 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rdi), %xmm4, %xmm4 # xmm4 = xmm4[0],mem[0]
|
||||
216 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rcx), %xmm6 # xmm6 = mem[0],zero
|
||||
217 | | 1.000 | | | | | | | | | | | | | | || | | vinsertf128 $1, %xmm1, %ymm7, %ymm1
|
||||
218 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rax), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||
219 | | 1.000 | | | | | | | | | | | | | | || | | vinsertf128 $1, %xmm4, %ymm6, %ymm4
|
||||
220 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vsubpd %ymm1, %ymm5, %ymm1
|
||||
221 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vsubpd %ymm4, %ymm10, %ymm4
|
||||
222 | 1.00 | 0.000 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm2, %ymm2, %ymm6
|
||||
223 | 1.00 | 0.000 | | | | | | | | | | | | | | || 4.0 | | vfmadd231pd %ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
|
||||
224 | 1.00 | 0.000 | | | | | | | | | | | | | | || 4.0 | | vfmadd231pd %ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
|
||||
225 | 1.00 | | | | | | | | | | | | | 0.50 | 0.50 | || | | vbroadcastsd .LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
|
||||
226 | | | | | 4.50 | 4.50 | | | | | | | | | | || 13.0 | | vdivpd %ymm6, %ymm7, %ymm7
|
||||
227 | 1.00 | 0.000 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm7, %ymm11
|
||||
228 | 1.00 | 0.000 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm9, %ymm11, %ymm11
|
||||
229 | 1.00 | | | | | | | | | | | | | 0.50 | 0.50 | || | | vbroadcastsd .LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
|
||||
230 | 1.00 | 0.000 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm11, %ymm11
|
||||
231 | | | 0.00 | 1.00 | | | | | | | | | | | | || 3.0 | | vaddpd %ymm12, %ymm11, %ymm12
|
||||
232 | 1.00 | 0.000 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
|
||||
233 | 1.00 | 0.000 | | | | | | | | | | | | | | || | | vmulpd %ymm7, %ymm11, %ymm7
|
||||
234 | 1.00 | 0.000 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm12, %ymm7
|
||||
235 | | | 0.12 | 0.88 | | | | | | | | | | | | || | | vcmpltpd %ymm8, %ymm6, %ymm6
|
||||
236 | 1.00 | 0.000 | | | | | | | | | | | | | | || 4.0 | | vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
|
||||
237 | 1.00 | 0.000 | | | | | | | | | | | | | | || 1.0 | | vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
|
||||
238 | 1.00 | 0.000 | | | | | | | | | | | | | | || | | vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
|
||||
239 | 1.00 | 0.000 | | | | | | | | | | | | | | || | 4.0 | vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
|
||||
240 | 0.62 | 0.380 | | | | | | | | | | | | | | || | | vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
|
||||
241 | 0.50 | 0.500 | | | | | | | | | | | | | | || | 1.0 | vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
|
||||
242 | | | | | | | | | 0.25 | 0.25 | 0.25 | 0.25 | | | | || | | addq $4, %rbp
|
||||
243 | | | | | | | | | 0.25 | 0.25 | 0.25 | 0.25 | | | | || | | cmpq %rdx, %rbp
|
||||
244 | | | | | | | | | 0.00 | | | | 1.00 | | | || | | jb .LBB0_9
|
||||
|
||||
16.1 15.63 15.6 15.6 4.50 4.50 0.50 0.50 0.50 0.50 9.00 9.00 72 5.0
|
||||
|
||||
|
||||
|
||||
|
||||
Loop-Carried Dependencies Analysis Report
|
||||
-----------------------------------------
|
||||
239 | 5.0 | vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13| [239, 241]
|
||||
238 | 5.0 | vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15| [238, 240]
|
||||
236 | 5.0 | vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0| [236, 237]
|
||||
242 | 1.0 | addq $4, %rbp | [242]
|
||||
241 | 1.0 | vblendvpd %ymm6, %ymm4, %ymm13, %ymm13| [241]
|
||||
240 | 1.0 | vblendvpd %ymm6, %ymm1, %ymm15, %ymm15| [240]
|
||||
237 | 1.0 | vblendvpd %ymm6, %ymm2, %ymm0, %ymm0| [237]
|
||||
|
638
static_analysis/lammps-avx2-dp-ICX.s
Normal file
638
static_analysis/lammps-avx2-dp-ICX.s
Normal file
@@ -0,0 +1,638 @@
|
||||
.text
|
||||
.file "force_lj.c"
|
||||
# computeForceLJFullNeigh_plain_c(%rdi, %rsi, %rdx, %rcx) -> %xmm0
# Compiler-generated AVX2 kernel. It reads three doubles from the %rdi object
# (offsets 144, 40, 56), a 32-bit count from 4(%rsi), array pointers from
# 16(%rsi) and 64(%rsi), and a neighbor table from 16(%rdx)/24(%rdx) with a
# row length at 8(%rdx). The 64(%rsi) array is zeroed, then for each entry the
# neighbors are processed four at a time (.LBB0_9) with a scalar remainder
# loop (.LBB0_13), all inside the "force" LIKWID region. Two 64-bit counters
# at (%rcx) are updated, and the return value is the getTimeStamp() difference
# measured around the region (the memset is outside).
# NOTE(review): the field meanings (cutoff, epsilon, sigma6, positions, forces)
# are inferred from the function name - confirm against force_lj.c.
.section .rodata.cst8,"aM",@progbits,8
|
||||
.p2align 3 # -- Begin function computeForceLJFullNeigh_plain_c
|
||||
.LCPI0_0:
|
||||
.quad 4631952216750555136 # 48
|
||||
.LCPI0_3:
|
||||
.quad 4607182418800017408 # 1
|
||||
.LCPI0_4:
|
||||
.quad -4620693217682128896 # -0.5
|
||||
.section .rodata.cst4,"aM",@progbits,4
|
||||
.p2align 2
|
||||
.LCPI0_1:
|
||||
.long 3 # 0x3
|
||||
.LCPI0_2:
|
||||
.long 2 # 0x2
|
||||
.section .rodata.cst16,"aM",@progbits,16
|
||||
.p2align 4
|
||||
# 16 bytes of 0xFF, i.e. four 32-bit lanes of -1; subtracting it adds 1 per lane.
.LCPI0_5:
|
||||
.zero 16,255
|
||||
.text
|
||||
.globl computeForceLJFullNeigh_plain_c
|
||||
.p2align 4, 0x90
|
||||
.type computeForceLJFullNeigh_plain_c,@function
|
||||
computeForceLJFullNeigh_plain_c: #
|
||||
.LcomputeForceLJFullNeigh_plain_c$local:
|
||||
.cfi_startproc
|
||||
# %bb.0: #
|
||||
# Prologue: save the six callee-saved registers and reserve 264 bytes of spill space.
pushq %rbp
|
||||
.cfi_def_cfa_offset 16
|
||||
pushq %r15
|
||||
.cfi_def_cfa_offset 24
|
||||
pushq %r14
|
||||
.cfi_def_cfa_offset 32
|
||||
pushq %r13
|
||||
.cfi_def_cfa_offset 40
|
||||
pushq %r12
|
||||
.cfi_def_cfa_offset 48
|
||||
pushq %rbx
|
||||
.cfi_def_cfa_offset 56
|
||||
subq $264, %rsp # imm = 0x108
|
||||
.cfi_def_cfa_offset 320
|
||||
.cfi_offset %rbx, -56
|
||||
.cfi_offset %r12, -48
|
||||
.cfi_offset %r13, -40
|
||||
.cfi_offset %r14, -32
|
||||
.cfi_offset %r15, -24
|
||||
.cfi_offset %rbp, -16
|
||||
# Keep the arguments in callee-saved registers: %rcx -> %rbx, %rdx -> %r15, %rsi -> %r12.
movq %rcx, %rbx
|
||||
movq %rdx, %r15
|
||||
movq %rsi, %r12
|
||||
# r14d = 32-bit count at 4(%rsi); it bounds the outer loop and sizes the memset.
movl 4(%rsi), %r14d
|
||||
# Spill the three doubles read from the %rdi object (offsets 144, 40, 56).
vmovsd 144(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, (%rsp) # 8-byte Spill
|
||||
vmovsd 40(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, 128(%rsp) # 8-byte Spill
|
||||
vmovq 56(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovdqa %xmm0, 80(%rsp) # 16-byte Spill
|
||||
testl %r14d, %r14d
|
||||
jle .LBB0_2
|
||||
# %bb.1: #
|
||||
# Zero count * 24 bytes (three doubles per entry) of the array at 64(%rsi).
movq 64(%r12), %rdi
|
||||
leaq (,%r14,8), %rax
|
||||
leaq (%rax,%rax,2), %rdx
|
||||
xorl %esi, %esi
|
||||
callq _intel_fast_memset
|
||||
.LBB0_2: #
|
||||
# Take the start timestamp (kept at 32(%rsp)) and open the LIKWID marker region.
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
vmovq %xmm0, 32(%rsp) # 8-byte Folded Spill
|
||||
movl $.L.str, %edi
|
||||
callq likwid_markerStartRegion
|
||||
testl %r14d, %r14d
|
||||
jle .LBB0_19
|
||||
# %bb.3: #
|
||||
# Loop-invariant setup:
#   xmm13 = (double at 144(%rdi))^2, the threshold for the squared distance
#   xmm15 = 48 * (double at 40(%rdi)); xmm12 = double at 56(%rdi)
#   r11   = current row of the index table at 16(%rdx), advanced per outer iteration
#   rsi   = per-entry count array at 24(%rdx); rdi = row stride in bytes (8(%rdx) * 4)
#   r15 / r8 = arrays at 16(%rsi) (read) and 64(%rsi) (accumulated into)
#   xmm14 = the two 64-bit counters loaded from (%rcx); r14 = count - 1 (last index)
#   ymm3, ymm2, ymm8, ymm9 = broadcasts of r15, xmm13, xmm12 and xmm15
vmovsd (%rsp), %xmm0 # 8-byte Reload
|
||||
# xmm0 = mem[0],zero
|
||||
vmulsd %xmm0, %xmm0, %xmm13
|
||||
movq 16(%r15), %r11
|
||||
movq 24(%r15), %rsi
|
||||
movslq 8(%r15), %rdi
|
||||
movq 16(%r12), %r15
|
||||
movq 64(%r12), %r8
|
||||
vmovsd 128(%rsp), %xmm0 # 8-byte Reload
|
||||
# xmm0 = mem[0],zero
|
||||
vmulsd .LCPI0_0(%rip), %xmm0, %xmm15
|
||||
movq %rbx, 24(%rsp) # 8-byte Spill
|
||||
vmovdqu (%rbx), %xmm14
|
||||
decq %r14
|
||||
vmovq %r15, %xmm0
|
||||
vpbroadcastq %xmm0, %ymm3
|
||||
vbroadcastsd %xmm13, %ymm2
|
||||
vmovapd 80(%rsp), %xmm12 # 16-byte Reload
|
||||
vbroadcastsd %xmm12, %ymm8
|
||||
vbroadcastsd %xmm15, %ymm9
|
||||
shlq $2, %rdi
|
||||
xorl %r10d, %r10d
|
||||
# Spill the invariants; the vector loop reuses these registers and they are
# reloaded in %bb.10.
movq %r14, 56(%rsp) # 8-byte Spill
|
||||
vmovapd %xmm13, 192(%rsp) # 16-byte Spill
|
||||
movq %rsi, 48(%rsp) # 8-byte Spill
|
||||
movq %rdi, 40(%rsp) # 8-byte Spill
|
||||
vmovapd %xmm15, 176(%rsp) # 16-byte Spill
|
||||
vmovupd %ymm2, 224(%rsp) # 32-byte Spill
|
||||
vmovupd %ymm9, 128(%rsp) # 32-byte Spill
|
||||
jmp .LBB0_6
|
||||
.p2align 4, 0x90
|
||||
.LBB0_17: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
# All neighbors of entry i are done: rdx = neighbor count for the counter update.
movq %r13, %rdx
|
||||
.LBB0_5: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
# Outer-loop tail: add the three accumulators to entry i's slots (3i, 3i+1, 3i+2).
vaddsd (%r8,%r12,8), %xmm10, %xmm0
|
||||
vmovsd %xmm0, (%r8,%r12,8)
|
||||
vaddsd (%r8,%rbx,8), %xmm11, %xmm0
|
||||
vmovsd %xmm0, (%r8,%rbx,8)
|
||||
vaddsd (%r8,%rbp,8), %xmm5, %xmm0
|
||||
vmovsd %xmm0, (%r8,%rbp,8)
|
||||
# r13d = (count + 3) / 4 using signed truncating division (add 3 more if negative, then shift).
leal 3(%r13), %eax
|
||||
addl $6, %r13d
|
||||
testl %eax, %eax
|
||||
cmovnsl %eax, %r13d
|
||||
sarl $2, %r13d
|
||||
movslq %r13d, %rax
|
||||
# counters += { rdx (count), (count + 3) / 4 } as two packed 64-bit adds.
vmovq %rax, %xmm0
|
||||
vmovq %rdx, %xmm1
|
||||
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
|
||||
vpaddq %xmm0, %xmm14, %xmm14
|
||||
# Advance the row pointer; compare i with the last index, then i++ (leaq keeps the flags).
addq %rdi, %r11
|
||||
cmpq %r14, %r10
|
||||
leaq 1(%r10), %r10
|
||||
je .LBB0_18
|
||||
.LBB0_6: #
|
||||
# =>This Loop Header: Depth=1
|
||||
# Child Loop BB0_9 Depth 2
|
||||
# Child Loop BB0_13 Depth 2
|
||||
# r13d = neighbor count for entry i; r12d / ebx / ebp = 3i, 3i+1, 3i+2.
movl (%rsi,%r10,4), %r13d
|
||||
leal (%r10,%r10,2), %r12d
|
||||
leal (%r10,%r10,2), %ebx
|
||||
incl %ebx
|
||||
leal (%r10,%r10,2), %ebp
|
||||
addl $2, %ebp
|
||||
testl %r13d, %r13d
|
||||
jle .LBB0_4
|
||||
# %bb.7: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
# Load entry i's three components (presumably x/y/z) and spill them for the scalar loop.
vmovsd (%r15,%r12,8), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd (%r15,%rbx,8), %xmm1 # xmm1 = mem[0],zero
|
||||
vmovsd (%r15,%rbp,8), %xmm2 # xmm2 = mem[0],zero
|
||||
# rdx = count rounded down to a multiple of 4: the trip limit of the vector loop.
movq %r13, %rdx
|
||||
movl $4294967292, %eax # imm = 0xFFFFFFFC
|
||||
andq %rax, %rdx
|
||||
vmovapd %xmm0, 112(%rsp) # 16-byte Spill
|
||||
vmovapd %xmm1, 96(%rsp) # 16-byte Spill
|
||||
vmovapd %xmm2, (%rsp) # 16-byte Spill
|
||||
# The flags still come from the andq above: fewer than 4 neighbors skips the vector loop.
je .LBB0_16
|
||||
# %bb.8: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
# Vector-loop preheader: free rbp, rbx and xmm14, broadcast entry i's components
# into ymm14 / ymm5 / ymm10, and zero the vector accumulators ymm0 / ymm15 / ymm13.
movq %rbp, 64(%rsp) # 8-byte Spill
|
||||
movq %rbx, 72(%rsp) # 8-byte Spill
|
||||
vmovdqa %xmm14, 208(%rsp) # 16-byte Spill
|
||||
vbroadcastsd %xmm0, %ymm14
|
||||
vbroadcastsd %xmm1, %ymm5
|
||||
vbroadcastsd %xmm2, %ymm10
|
||||
vxorpd %xmm0, %xmm0, %xmm0
|
||||
vxorpd %xmm15, %xmm15, %xmm15
|
||||
vxorpd %xmm13, %xmm13, %xmm13
|
||||
xorl %ebp, %ebp
|
||||
# Inside the loop ymm9 holds the broadcast 56(%rdi) value and ymm8 the broadcast threshold.
vmovapd %ymm8, %ymm9
|
||||
vmovupd 224(%rsp), %ymm8 # 32-byte Reload
|
||||
.p2align 4, 0x90
|
||||
# OSACA-BEGIN
|
||||
# LLVM-MCA-BEGIN
|
||||
.LBB0_9: #
|
||||
# Parent Loop BB0_6 Depth=1
|
||||
# => This Inner Loop Header: Depth=2
|
||||
# Handles four neighbors per iteration. xmm11 = 3 * j for four 32-bit neighbor
# indices; each index is widened to 64 bits, scaled by 8 and added to the base
# pointer in ymm3. The resulting addresses are moved to general-purpose
# registers and loaded one double at a time (a gather done with scalar loads).
vpbroadcastd .LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
|
||||
vpmulld (%r11,%rbp,4), %xmm1, %xmm11
|
||||
vpmovsxdq %xmm11, %ymm1
|
||||
vpsllq $3, %ymm1, %ymm1
|
||||
vpaddq %ymm1, %ymm3, %ymm1
|
||||
vmovq %xmm1, %r14
|
||||
vpextrq $1, %xmm1, %r9
|
||||
vextracti128 $1, %ymm1, %xmm1
|
||||
vmovsd (%r14), %xmm2 # xmm2 = mem[0],zero
|
||||
# xmm6 = 3j - (-1) = 3j + 1, the index of the second component.
# NOTE(review): this is the only constant-pool operand without (%rip); the
# absolute reference may not link in a position-independent build - confirm intended.
vpsubd .LCPI0_5, %xmm11, %xmm6
|
||||
vpmovsxdq %xmm6, %ymm6
|
||||
vpsllq $3, %ymm6, %ymm6
|
||||
vmovq %xmm1, %rdi
|
||||
vpaddq %ymm6, %ymm3, %ymm6
|
||||
vmovq %xmm6, %rcx
|
||||
vpextrq $1, %xmm1, %rbx
|
||||
vpextrq $1, %xmm6, %rax
|
||||
vextracti128 $1, %ymm6, %xmm1
|
||||
vmovsd (%rdi), %xmm6 # xmm6 = mem[0],zero
|
||||
vmovq %xmm1, %rdi
|
||||
vpextrq $1, %xmm1, %rsi
|
||||
vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
|
||||
vmovsd (%rcx), %xmm7 # xmm7 = mem[0],zero
|
||||
# xmm4 = 3j + 2, the index of the third component.
vpbroadcastd .LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
|
||||
vmovhpd (%r9), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0]
|
||||
vpaddd %xmm12, %xmm11, %xmm4
|
||||
vpmovsxdq %xmm4, %ymm4
|
||||
vmovhpd (%rax), %xmm7, %xmm7 # xmm7 = xmm7[0],mem[0]
|
||||
vpsllq $3, %ymm4, %ymm4
|
||||
vpaddq %ymm4, %ymm3, %ymm4
|
||||
vmovhpd (%rbx), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||
vpextrq $1, %xmm4, %rax
|
||||
vmovhpd (%rsi), %xmm1, %xmm1 # xmm1 = xmm1[0],mem[0]
|
||||
vmovq %xmm4, %rcx
|
||||
vextracti128 $1, %ymm4, %xmm4
|
||||
vmovq %xmm4, %rsi
|
||||
vinsertf128 $1, %xmm6, %ymm2, %ymm2
|
||||
vpextrq $1, %xmm4, %rdi
|
||||
vmovsd (%rsi), %xmm4 # xmm4 = mem[0],zero
|
||||
# ymm2 / ymm1 / ymm4 = component-wise differences between entry i and its four neighbors.
vsubpd %ymm2, %ymm14, %ymm2
|
||||
vmovhpd (%rdi), %xmm4, %xmm4 # xmm4 = xmm4[0],mem[0]
|
||||
vmovsd (%rcx), %xmm6 # xmm6 = mem[0],zero
|
||||
vinsertf128 $1, %xmm1, %ymm7, %ymm1
|
||||
vmovhpd (%rax), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||
vinsertf128 $1, %xmm4, %ymm6, %ymm4
|
||||
vsubpd %ymm1, %ymm5, %ymm1
|
||||
vsubpd %ymm4, %ymm10, %ymm4
|
||||
# ymm6 = squared distances.
vmulpd %ymm2, %ymm2, %ymm6
|
||||
vfmadd231pd %ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
|
||||
vfmadd231pd %ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
|
||||
# With s = 1 / squared distance and t = s^3 * ymm9:
#   ymm7 = s * (32-byte value at 128(%rsp), the broadcast 48 * 40(%rdi) product) * t * (t - 0.5)
vbroadcastsd .LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
|
||||
vdivpd %ymm6, %ymm7, %ymm7
|
||||
vmulpd %ymm7, %ymm7, %ymm11
|
||||
vmulpd %ymm9, %ymm11, %ymm11
|
||||
vbroadcastsd .LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
|
||||
vmulpd %ymm7, %ymm11, %ymm11
|
||||
vaddpd %ymm12, %ymm11, %ymm12
|
||||
vmulpd 128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
|
||||
vmulpd %ymm7, %ymm11, %ymm7
|
||||
vmulpd %ymm7, %ymm12, %ymm7
|
||||
# ymm6 = per-lane mask (squared distance < threshold). The blends keep the old
# accumulator value in lanes where the mask is clear.
vcmpltpd %ymm8, %ymm6, %ymm6
|
||||
vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
|
||||
vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
|
||||
vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
|
||||
vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
|
||||
vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
|
||||
vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
|
||||
# k += 4; continue while k is below the rounded-down count (unsigned compare).
addq $4, %rbp
|
||||
cmpq %rdx, %rbp
|
||||
jb .LBB0_9
|
||||
# LLVM-MCA-END
|
||||
# OSACA-END
|
||||
# %bb.10: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
# Horizontal sums: reduce the four lanes of ymm0 / ymm15 / ymm13 into the scalar
# accumulators xmm10 / xmm11 / xmm5.
vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
|
||||
vaddsd %xmm1, %xmm0, %xmm1
|
||||
vextractf128 $1, %ymm0, %xmm0
|
||||
vaddsd %xmm0, %xmm1, %xmm1
|
||||
vpermilpd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0]
|
||||
vaddsd %xmm0, %xmm1, %xmm10
|
||||
vpermilpd $1, %xmm15, %xmm1 # xmm1 = xmm15[1,0]
|
||||
vaddsd %xmm1, %xmm15, %xmm1
|
||||
vextractf128 $1, %ymm15, %xmm2
|
||||
vaddsd %xmm2, %xmm1, %xmm1
|
||||
vpermilpd $1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
|
||||
vaddsd %xmm2, %xmm1, %xmm11
|
||||
vpermilpd $1, %xmm13, %xmm1 # xmm1 = xmm13[1,0]
|
||||
vaddsd %xmm1, %xmm13, %xmm1
|
||||
vextractf128 $1, %ymm13, %xmm2
|
||||
vaddsd %xmm2, %xmm1, %xmm1
|
||||
vpermilpd $1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
|
||||
vaddsd %xmm2, %xmm1, %xmm5
|
||||
# Restore the loop invariants that the vector loop overwrote.
movq 56(%rsp), %r14 # 8-byte Reload
|
||||
vmovapd 80(%rsp), %xmm12 # 16-byte Reload
|
||||
vmovapd 192(%rsp), %xmm13 # 16-byte Reload
|
||||
movq 48(%rsp), %rsi # 8-byte Reload
|
||||
movq 40(%rsp), %rdi # 8-byte Reload
|
||||
vmovdqa 208(%rsp), %xmm14 # 16-byte Reload
|
||||
vmovapd 176(%rsp), %xmm15 # 16-byte Reload
|
||||
vmovapd %ymm9, %ymm8
|
||||
movq 72(%rsp), %rbx # 8-byte Reload
|
||||
movq 64(%rsp), %rbp # 8-byte Reload
|
||||
vmovapd 112(%rsp), %xmm0 # 16-byte Reload
|
||||
# No remainder when the rounded-down count already covers all neighbors.
cmpq %r13, %rdx
|
||||
jae .LBB0_17
|
||||
jmp .LBB0_11
|
||||
.p2align 4, 0x90
|
||||
.LBB0_4: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
# Entry has no neighbors (count <= 0): use zero accumulators and rdx = sign-extended count.
movslq %r13d, %rdx
|
||||
vxorpd %xmm5, %xmm5, %xmm5
|
||||
vxorpd %xmm11, %xmm11, %xmm11
|
||||
vxorpd %xmm10, %xmm10, %xmm10
|
||||
jmp .LBB0_5
|
||||
.p2align 4, 0x90
|
||||
.LBB0_16: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
# Vector loop skipped (fewer than 4 neighbors): start the scalar loop from zero accumulators.
vxorpd %xmm10, %xmm10, %xmm10
|
||||
vxorpd %xmm11, %xmm11, %xmm11
|
||||
vxorpd %xmm5, %xmm5, %xmm5
|
||||
cmpq %r13, %rdx
|
||||
jae .LBB0_17
|
||||
.LBB0_11: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
# Scalar remainder preheader: xmm4 = entry i's second component (xmm0 holds the first).
vmovapd 96(%rsp), %xmm4 # 16-byte Reload
|
||||
jmp .LBB0_13
|
||||
.p2align 4, 0x90
|
||||
.LBB0_12: #
|
||||
# in Loop: Header=BB0_13 Depth=2
|
||||
# Remainder-loop latch: k++ and leave once k equals the neighbor count.
incq %rdx
|
||||
cmpq %rdx, %r13
|
||||
je .LBB0_17
|
||||
.LBB0_13: #
|
||||
# Parent Loop BB0_6 Depth=1
|
||||
# => This Inner Loop Header: Depth=2
|
||||
# One neighbor per iteration: eax = j; differences go to xmm6 / xmm2 / xmm1,
# read from indices 3j, 3j+1 and 3j+2.
movl (%r11,%rdx,4), %eax
|
||||
leal (%rax,%rax,2), %ecx
|
||||
movslq %ecx, %rcx
|
||||
vsubsd (%r15,%rcx,8), %xmm0, %xmm6
|
||||
leal (%rax,%rax,2), %ecx
|
||||
incl %ecx
|
||||
movslq %ecx, %rcx
|
||||
vsubsd (%r15,%rcx,8), %xmm4, %xmm2
|
||||
leal 2(%rax,%rax,2), %eax
|
||||
cltq
|
||||
vmovapd (%rsp), %xmm1 # 16-byte Reload
|
||||
vsubsd (%r15,%rax,8), %xmm1, %xmm1
|
||||
# xmm7 = squared distance; skip the pair when it is >= the threshold in xmm13.
vmulsd %xmm6, %xmm6, %xmm7
|
||||
vfmadd231sd %xmm2, %xmm2, %xmm7 # xmm7 = (xmm2 * xmm2) + xmm7
|
||||
vfmadd231sd %xmm1, %xmm1, %xmm7 # xmm7 = (xmm1 * xmm1) + xmm7
|
||||
vucomisd %xmm13, %xmm7
|
||||
jae .LBB0_12
|
||||
# %bb.14: #
|
||||
# in Loop: Header=BB0_13 Depth=2
|
||||
# Same factor as the vector loop, in scalar form: with s = 1 / squared distance
# and t = s^3 * xmm12, xmm0 = xmm15 * s * t * (t - 0.5). This uses xmm0 and xmm4
# as temporaries, so both are reloaded below.
vmovsd .LCPI0_3(%rip), %xmm0 # xmm0 = mem[0],zero
|
||||
vdivsd %xmm7, %xmm0, %xmm7
|
||||
vmulsd %xmm7, %xmm7, %xmm0
|
||||
vmulsd %xmm0, %xmm12, %xmm0
|
||||
vmulsd %xmm7, %xmm0, %xmm0
|
||||
vaddsd .LCPI0_4(%rip), %xmm0, %xmm4
|
||||
vmulsd %xmm7, %xmm15, %xmm7
|
||||
vmulsd %xmm0, %xmm7, %xmm0
|
||||
vmulsd %xmm4, %xmm0, %xmm0
|
||||
vmovapd 96(%rsp), %xmm4 # 16-byte Reload
|
||||
vfmadd231sd %xmm6, %xmm0, %xmm10 # xmm10 = (xmm0 * xmm6) + xmm10
|
||||
vfmadd231sd %xmm2, %xmm0, %xmm11 # xmm11 = (xmm0 * xmm2) + xmm11
|
||||
vfmadd231sd %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5
|
||||
vmovapd 112(%rsp), %xmm0 # 16-byte Reload
|
||||
jmp .LBB0_12
|
||||
.LBB0_18: #
|
||||
# Write the two updated counters back through the saved %rcx pointer.
movq 24(%rsp), %rax # 8-byte Reload
|
||||
vmovdqu %xmm14, (%rax)
|
||||
.LBB0_19: #
|
||||
# Close the marker region (vzeroupper clears the upper ymm halves before the
# calls) and return (end timestamp - start timestamp) in %xmm0.
movl $.L.str, %edi
|
||||
vzeroupper
|
||||
callq likwid_markerStopRegion
|
||||
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
vsubsd 32(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload
|
||||
addq $264, %rsp # imm = 0x108
|
||||
.cfi_def_cfa_offset 56
|
||||
popq %rbx
|
||||
.cfi_def_cfa_offset 48
|
||||
popq %r12
|
||||
.cfi_def_cfa_offset 40
|
||||
popq %r13
|
||||
.cfi_def_cfa_offset 32
|
||||
popq %r14
|
||||
.cfi_def_cfa_offset 24
|
||||
popq %r15
|
||||
.cfi_def_cfa_offset 16
|
||||
popq %rbp
|
||||
.cfi_def_cfa_offset 8
|
||||
retq
|
||||
.Lfunc_end0:
|
||||
.size computeForceLJFullNeigh_plain_c, .Lfunc_end0-computeForceLJFullNeigh_plain_c
|
||||
.cfi_endproc
|
||||
# -- End function
|
||||
.section .rodata.cst8,"aM",@progbits,8
|
||||
.p2align 3 # -- Begin function computeForceLJHalfNeigh
|
||||
.LCPI1_0:
|
||||
.quad 4631952216750555136 # 48
|
||||
.LCPI1_1:
|
||||
.quad 4607182418800017408 # 1
|
||||
.LCPI1_2:
|
||||
.quad -4620693217682128896 # -0.5
|
||||
.text
|
||||
.globl computeForceLJHalfNeigh
|
||||
.p2align 4, 0x90
|
||||
.type computeForceLJHalfNeigh,@function
|
||||
computeForceLJHalfNeigh: #
|
||||
.LcomputeForceLJHalfNeigh$local:
|
||||
.cfi_startproc
|
||||
# %bb.0: #
|
||||
pushq %rbp
|
||||
.cfi_def_cfa_offset 16
|
||||
pushq %r15
|
||||
.cfi_def_cfa_offset 24
|
||||
pushq %r14
|
||||
.cfi_def_cfa_offset 32
|
||||
pushq %r13
|
||||
.cfi_def_cfa_offset 40
|
||||
pushq %r12
|
||||
.cfi_def_cfa_offset 48
|
||||
pushq %rbx
|
||||
.cfi_def_cfa_offset 56
|
||||
subq $40, %rsp
|
||||
.cfi_def_cfa_offset 96
|
||||
.cfi_offset %rbx, -56
|
||||
.cfi_offset %r12, -48
|
||||
.cfi_offset %r13, -40
|
||||
.cfi_offset %r14, -32
|
||||
.cfi_offset %r15, -24
|
||||
.cfi_offset %rbp, -16
|
||||
movq %rcx, 16(%rsp) # 8-byte Spill
|
||||
movq %rdx, %r15
|
||||
movq %rsi, %r12
|
||||
movl 4(%rsi), %r13d
|
||||
vmovsd 144(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, 8(%rsp) # 8-byte Spill
|
||||
vmovsd 40(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, (%rsp) # 8-byte Spill
|
||||
vmovsd 56(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, 32(%rsp) # 8-byte Spill
|
||||
testl %r13d, %r13d
|
||||
jle .LBB1_2
|
||||
# %bb.1: #
|
||||
movq 64(%r12), %rdi
|
||||
leaq (,%r13,8), %rax
|
||||
leaq (%rax,%rax,2), %rdx
|
||||
xorl %esi, %esi
|
||||
callq _intel_fast_memset
|
||||
.LBB1_2: #
|
||||
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
vmovsd %xmm0, 24(%rsp) # 8-byte Spill
|
||||
movl $.L.str.1, %edi
|
||||
callq likwid_markerStartRegion
|
||||
testl %r13d, %r13d
|
||||
jle .LBB1_8
|
||||
# %bb.3: #
|
||||
vmovsd 8(%rsp), %xmm0 # 8-byte Reload
|
||||
# xmm0 = mem[0],zero
|
||||
vmulsd %xmm0, %xmm0, %xmm12
|
||||
movq 16(%r15), %rax
|
||||
movq 24(%r15), %rcx
|
||||
movq %rcx, 8(%rsp) # 8-byte Spill
|
||||
movslq 8(%r15), %rdx
|
||||
movq 16(%r12), %rsi
|
||||
movq 64(%r12), %rdi
|
||||
vmovsd (%rsp), %xmm0 # 8-byte Reload
|
||||
# xmm0 = mem[0],zero
|
||||
vmulsd .LCPI1_0(%rip), %xmm0, %xmm11
|
||||
movq 16(%rsp), %rcx # 8-byte Reload
|
||||
vmovdqu (%rcx), %xmm10
|
||||
shlq $2, %rdx
|
||||
movq %rdx, (%rsp) # 8-byte Spill
|
||||
xorl %r12d, %r12d
|
||||
jmp .LBB1_4
|
||||
.p2align 4, 0x90
|
||||
.LBB1_5: #
|
||||
# in Loop: Header=BB1_4 Depth=1
|
||||
vxorpd %xmm13, %xmm13, %xmm13
|
||||
movq %r9, %rdx
|
||||
vxorpd %xmm9, %xmm9, %xmm9
|
||||
vxorpd %xmm14, %xmm14, %xmm14
|
||||
.LBB1_6: #
|
||||
# in Loop: Header=BB1_4 Depth=1
|
||||
vaddsd (%rdi,%r15,8), %xmm14, %xmm0
|
||||
vmovsd %xmm0, (%rdi,%r15,8)
|
||||
vaddsd (%rdi,%r10,8), %xmm9, %xmm0
|
||||
vmovsd %xmm0, (%rdi,%r10,8)
|
||||
vaddsd (%rdi,%r11,8), %xmm13, %xmm0
|
||||
vmovsd %xmm0, (%rdi,%r11,8)
|
||||
leal 3(%r9), %ecx
|
||||
addl $6, %r9d
|
||||
testl %ecx, %ecx
|
||||
cmovnsl %ecx, %r9d
|
||||
sarl $2, %r9d
|
||||
movslq %r9d, %rcx
|
||||
vmovq %rcx, %xmm0
|
||||
vmovq %rdx, %xmm1
|
||||
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
|
||||
vpaddq %xmm0, %xmm10, %xmm10
|
||||
incq %r12
|
||||
addq (%rsp), %rax # 8-byte Folded Reload
|
||||
cmpq %r13, %r12
|
||||
je .LBB1_7
|
||||
.LBB1_4: #
|
||||
# =>This Loop Header: Depth=1
|
||||
# Child Loop BB1_10 Depth 2
|
||||
movq 8(%rsp), %rcx # 8-byte Reload
|
||||
movslq (%rcx,%r12,4), %r9
|
||||
leaq (%r12,%r12,2), %rcx
|
||||
leal 1(%rcx), %r10d
|
||||
leal 2(%rcx), %r11d
|
||||
movl %ecx, %r15d
|
||||
testq %r9, %r9
|
||||
jle .LBB1_5
|
||||
# %bb.9: #
|
||||
# in Loop: Header=BB1_4 Depth=1
|
||||
vmovsd (%rsi,%r15,8), %xmm15 # xmm15 = mem[0],zero
|
||||
vmovsd (%rsi,%r10,8), %xmm4 # xmm4 = mem[0],zero
|
||||
vmovsd (%rsi,%r11,8), %xmm1 # xmm1 = mem[0],zero
|
||||
movl %r9d, %edx
|
||||
vxorpd %xmm14, %xmm14, %xmm14
|
||||
xorl %ecx, %ecx
|
||||
vxorpd %xmm9, %xmm9, %xmm9
|
||||
vxorpd %xmm13, %xmm13, %xmm13
|
||||
jmp .LBB1_10
|
||||
.p2align 4, 0x90
|
||||
.LBB1_13: #
|
||||
# in Loop: Header=BB1_10 Depth=2
|
||||
incq %rcx
|
||||
cmpq %rcx, %rdx
|
||||
je .LBB1_6
|
||||
.LBB1_10: #
|
||||
# Parent Loop BB1_4 Depth=1
|
||||
# => This Inner Loop Header: Depth=2
|
||||
movslq (%rax,%rcx,4), %r8
|
||||
leaq (%r8,%r8,2), %r14
|
||||
vsubsd (%rsi,%r14,8), %xmm15, %xmm2
|
||||
movslq %r14d, %rbp
|
||||
vsubsd 8(%rsi,%rbp,8), %xmm4, %xmm5
|
||||
vsubsd 16(%rsi,%rbp,8), %xmm1, %xmm0
|
||||
vmulsd %xmm2, %xmm2, %xmm6
|
||||
vfmadd231sd %xmm5, %xmm5, %xmm6 # xmm6 = (xmm5 * xmm5) + xmm6
|
||||
vfmadd231sd %xmm0, %xmm0, %xmm6 # xmm6 = (xmm0 * xmm0) + xmm6
|
||||
vucomisd %xmm12, %xmm6
|
||||
jae .LBB1_13
|
||||
# %bb.11: #
|
||||
# in Loop: Header=BB1_10 Depth=2
|
||||
vmovsd .LCPI1_1(%rip), %xmm3 # xmm3 = mem[0],zero
|
||||
vdivsd %xmm6, %xmm3, %xmm6
|
||||
vmulsd 32(%rsp), %xmm6, %xmm3 # 8-byte Folded Reload
|
||||
vmulsd %xmm6, %xmm6, %xmm8
|
||||
vmulsd %xmm3, %xmm8, %xmm3
|
||||
vaddsd .LCPI1_2(%rip), %xmm3, %xmm7
|
||||
vmulsd %xmm6, %xmm11, %xmm6
|
||||
vmulsd %xmm3, %xmm6, %xmm3
|
||||
vmulsd %xmm7, %xmm3, %xmm3
|
||||
vmulsd %xmm2, %xmm3, %xmm6
|
||||
vaddsd %xmm6, %xmm14, %xmm14
|
||||
vmulsd %xmm5, %xmm3, %xmm2
|
||||
vaddsd %xmm2, %xmm9, %xmm9
|
||||
vmulsd %xmm0, %xmm3, %xmm0
|
||||
vaddsd %xmm0, %xmm13, %xmm13
|
||||
cmpl %r13d, %r8d
|
||||
jge .LBB1_13
|
||||
# %bb.12: #
|
||||
# in Loop: Header=BB1_10 Depth=2
|
||||
leaq 1(%rbp), %rbx
|
||||
addq $2, %rbp
|
||||
vmovsd (%rdi,%r14,8), %xmm3 # xmm3 = mem[0],zero
|
||||
vsubsd %xmm6, %xmm3, %xmm3
|
||||
vmovsd %xmm3, (%rdi,%r14,8)
|
||||
vmovsd (%rdi,%rbx,8), %xmm3 # xmm3 = mem[0],zero
|
||||
vsubsd %xmm2, %xmm3, %xmm2
|
||||
vmovsd %xmm2, (%rdi,%rbx,8)
|
||||
vmovsd (%rdi,%rbp,8), %xmm2 # xmm2 = mem[0],zero
|
||||
vsubsd %xmm0, %xmm2, %xmm0
|
||||
vmovsd %xmm0, (%rdi,%rbp,8)
|
||||
jmp .LBB1_13
|
||||
.LBB1_7: #
|
||||
movq 16(%rsp), %rax # 8-byte Reload
|
||||
vmovdqu %xmm10, (%rax)
|
||||
.LBB1_8: #
|
||||
movl $.L.str.1, %edi
|
||||
callq likwid_markerStopRegion
|
||||
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
vsubsd 24(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload
|
||||
addq $40, %rsp
|
||||
.cfi_def_cfa_offset 56
|
||||
popq %rbx
|
||||
.cfi_def_cfa_offset 48
|
||||
popq %r12
|
||||
.cfi_def_cfa_offset 40
|
||||
popq %r13
|
||||
.cfi_def_cfa_offset 32
|
||||
popq %r14
|
||||
.cfi_def_cfa_offset 24
|
||||
popq %r15
|
||||
.cfi_def_cfa_offset 16
|
||||
popq %rbp
|
||||
.cfi_def_cfa_offset 8
|
||||
retq
|
||||
.Lfunc_end1:
|
||||
.size computeForceLJHalfNeigh, .Lfunc_end1-computeForceLJHalfNeigh
|
||||
.cfi_endproc
|
||||
# -- End function
|
||||
# computeForceLJFullNeigh_simd
# Zeroes a buffer, takes a timestamp, starts the "force" marker region, then
# writes an error message to stderr and calls exit(-1). There is no return
# path: the function ends with the call to exit and the marker region that was
# started is never stopped here.
# NOTE(review): the message text suggests this is the fallback body used when
# no SIMD kernel exists for the selected instruction set — confirm in the C source.
# NOTE(review): %rsi is presumably the second integer argument (System V AMD64
# calling convention); its C type is not visible here — TODO confirm.
.globl computeForceLJFullNeigh_simd # -- Begin function computeForceLJFullNeigh_simd
|
||||
.p2align 4, 0x90
|
||||
.type computeForceLJFullNeigh_simd,@function
|
||||
computeForceLJFullNeigh_simd: #
|
||||
.LcomputeForceLJFullNeigh_simd$local:
|
||||
.cfi_startproc
|
||||
# %bb.0: #
|
||||
# Push 8 bytes (CFA offset becomes 16); the pushed value itself is never read back.
pushq %rax
|
||||
.cfi_def_cfa_offset 16
|
||||
# eax = 32-bit count stored at offset 4 of the object pointed to by %rsi.
movl 4(%rsi), %eax
|
||||
testl %eax, %eax
|
||||
# Skip the clearing step when the count is <= 0.
jle .LBB2_2
|
||||
# %bb.1: #
|
||||
# rdi = pointer stored at offset 64 of the same object (buffer to clear).
movq 64(%rsi), %rdi
|
||||
# rax = count * 8 (the earlier movl zero-extended the count into rax).
shlq $3, %rax
|
||||
# rdx = rax * 3 = count * 24 bytes.
# NOTE(review): presumably three 8-byte values per entry — confirm.
leaq (%rax,%rax,2), %rdx
|
||||
# esi = 0, the fill value.
xorl %esi, %esi
|
||||
# Presumably memset-like arguments (dest, value, length) — confirm.
callq _intel_fast_memset
|
||||
.LBB2_2: #
|
||||
xorl %eax, %eax
|
||||
# The value returned by getTimeStamp is not stored anywhere in this function.
callq getTimeStamp
|
||||
# edi = address of .L.str ("force"), passed to likwid_markerStartRegion.
movl $.L.str, %edi
|
||||
callq likwid_markerStartRegion
|
||||
# Set up fwrite(.L.str.2, 65, 1, stderr): rcx = stream.
movq stderr(%rip), %rcx
|
||||
# edi = address of the error message.
movl $.L.str.2, %edi
|
||||
# esi = 65, the message length without its terminating NUL (.size of .L.str.2 is 66).
movl $65, %esi
|
||||
# edx = 1 item.
movl $1, %edx
|
||||
callq fwrite
|
||||
# exit(-1); nothing follows this call.
movl $-1, %edi
|
||||
callq exit
|
||||
.Lfunc_end2:
|
||||
.size computeForceLJFullNeigh_simd, .Lfunc_end2-computeForceLJFullNeigh_simd
|
||||
.cfi_endproc
|
||||
# -- End function
|
||||
# NUL-terminated string constants referenced by the functions above.
# Each .size value counts the terminating NUL byte.
.type .L.str,@object #
|
||||
.section .rodata.str1.1,"aMS",@progbits,1
|
||||
# Marker region name passed to likwid_markerStartRegion in computeForceLJFullNeigh_simd.
.L.str:
|
||||
.asciz "force"
|
||||
.size .L.str, 6
|
||||
|
||||
.type .L.str.1,@object #
|
||||
# Marker region name passed to likwid_markerStartRegion / likwid_markerStopRegion
# in computeForceLJHalfNeigh.
.L.str.1:
|
||||
.asciz "forceLJ-halfneigh"
|
||||
.size .L.str.1, 18
|
||||
|
||||
.type .L.str.2,@object #
|
||||
# Error text that computeForceLJFullNeigh_simd writes to stderr before calling exit.
.L.str.2:
|
||||
.asciz "Error: SIMD kernel not implemented for specified instruction set!"
|
||||
.size .L.str.2, 66
|
||||
|
||||
.ident "Intel(R) oneAPI DPC++ Compiler 2021.1-beta05 (2020.2.0.0304)"
|
||||
.section ".note.GNU-stack","",@progbits
|
@@ -1,46 +1,112 @@
|
||||
#!/bin/bash
|
||||
|
||||
TAG=ICX
|
||||
OPT_SCHEME=gromacs
|
||||
MDBENCH_BIN=./MDBench-$TAG-$OPT_SCHEME
|
||||
FREQ=2.4
|
||||
NRUNS=3
|
||||
FIXED_PARAMS=--freq $FREQ
|
||||
# Argument sanity checks: the first positional parameter must name an existing
# binary, and a sibling file with the "-stub" suffix must exist as well.
# NOTE(review): a bare `exit` returns the status of the preceding command (the
# successful echo), so these failure paths exit with status 0 — confirm whether
# a non-zero status is wanted.
[[ -z "$1" ]] && echo "Use: $0 <binary> [-c <core>] [-f <freq>] [-n <nruns>] [-l <log>] [-s]" && exit
|
||||
[[ ! -f "$1" ]] && echo "Binary file not found, make sure to use 'make'" && exit
|
||||
[[ ! -f "$1-stub" ]] && echo "Binary file for stubbed case not found, make sure to use 'make VARIANT=stub'" && exit
|
||||
|
||||
if [ "$OPT_SCHEME" = "gromacs" ]; then
|
||||
STUB1_NAME=Stub-33
|
||||
STUB1_PARAMS=-na 4 -nn 33
|
||||
STUB2_NAME=Stub-128
|
||||
STUB2_PARAMS=-na 4 -nn 128
|
||||
# Derive the build configuration from the binary name, which is expected to
# look like <prefix>-$OPT_SCHEME-$TAG-$ISA-$PREC (four dash-separated fields
# after the first dash).
MDBENCH_BIN=$1
|
||||
# Drop everything up to and including the first dash.
BIN_INFO="${MDBENCH_BIN#*-}" # $OPT_SCHEME-$TAG-$ISA-$PREC
|
||||
# First field: text before the first remaining dash.
OPT_SCHEME="${BIN_INFO%%-*}"
|
||||
# Last field: text after the last dash.
PREC="${BIN_INFO##*-}"
|
||||
BIN_INFO="${BIN_INFO#*-}" # $TAG-$ISA-$PREC
|
||||
BIN_INFO="${BIN_INFO%-*}" # $TAG-$ISA
|
||||
TAG="${BIN_INFO%%-*}"
|
||||
ISA="${BIN_INFO##*-}"
|
||||
# Defaults below apply only when the variable is unset or empty, so each
# setting can be pre-seeded from the environment.
CORE="${CORE:-0}"
|
||||
FREQ="${FREQ:-2.4}"
|
||||
NRUNS="${NRUNS:-3}"
|
||||
LOG="${LOG:-latencies_and_cfds.log}"
|
||||
STUB_ONLY="${STUB_ONLY:-false}"
|
||||
SKIP_SET_FREQ="${SKIP_SET_FREQ:-false}"
|
||||
|
||||
# Parse the optional flags. OPTIND=2 makes getopts start at the second
# positional parameter, because the first one is the binary path.
OPTIND=2
|
||||
# c, f, n and l take an argument; s is a plain switch.
while getopts "c:f:n:l:s" flag; do
|
||||
case "${flag}" in
|
||||
c) CORE=${OPTARG};;
|
||||
f) FREQ=${OPTARG};;
|
||||
n) NRUNS=${OPTARG};;
|
||||
l) LOG=${OPTARG};;
|
||||
s) STUB_ONLY=true;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Other useful variables
|
||||
# Rebuild the binary path from the parsed fields, always relative to the
# current directory.
# NOTE(review): any directory component given in $1 is not carried over — confirm
# the script is always meant to run from the directory holding the binaries.
MDBENCH_BIN=./MDBench-$OPT_SCHEME-$TAG-$ISA-$PREC
|
||||
# Options appended to every benchmark command line by run_benchmark.
FIXED_PARAMS="--freq $FREQ"
|
||||
# Third space-separated field of lscpu's "Vendor ID" line, after squeezing
# repeated spaces.
CPU_VENDOR=$(lscpu | grep "Vendor ID" | tr -s ' ' | cut -d ' ' -f3)
|
||||
|
||||
if [ "$CPU_VENDOR" == "GenuineIntel" ]; then
|
||||
ALL_PREFETCHERS="HW_PREFETCHER,CL_PREFETCHER,DCU_PREFETCHER,IP_PREFETCHER"
|
||||
PREFETCHERS=("ALL HW_PREFETCHER CL_PREFETCHER DCU_PREFETCHER IP_PREFETCHER NONE")
|
||||
else
|
||||
STUB1_NAME=Stub-76
|
||||
STUB1_PARAMS=-nn 76
|
||||
STUB2_NAME=Stub-1024
|
||||
STUB2_PARAMS=-nn 1024
|
||||
ALL_PREFETCHERS=""
|
||||
PREFETCHERS=("IGNORE")
|
||||
fi
|
||||
|
||||
# Pick the two stubbed test cases according to the optimization scheme.
# STUBn_NAME is the label used in the result line; STUBn_PARAMS are the extra
# arguments passed to the "-stub" binary.
# NOTE(review): the meaning of -na / -nn is defined by the benchmark binary and
# is not visible here.
if [ "$OPT_SCHEME" == "gromacs" ]; then
|
||||
STUB1_NAME=stub-33
|
||||
STUB1_PARAMS="-na 4 -nn 33"
|
||||
STUB2_NAME=stub-128
|
||||
STUB2_PARAMS="-na 4 -nn 128"
|
||||
else
|
||||
STUB1_NAME=stub-76
|
||||
STUB1_PARAMS="-nn 76"
|
||||
STUB2_NAME=stub-1024
|
||||
STUB2_PARAMS="-nn 1024"
|
||||
fi
|
||||
|
||||
# run_benchmark <command> [args...]
# Runs the given command (with $FIXED_PARAMS appended) $NRUNS times under
# likwid-pin and leaves the smallest reported value in the global BEST.
# The value is the third space-separated field of the output line containing
# "Cycles/SIMD iteration".
# NOTE(review): this listing shows two likwid-pin invocations per iteration —
# the first pins to core 0 and only prints the value, the second pins to $CORE
# and captures it in RES. Confirm that only one of them is meant to remain.
function run_benchmark() {
|
||||
# Initial upper bound; any smaller result replaces it.
BEST=10000000
|
||||
for i in $(seq $NRUNS); do
|
||||
likwid-pin -c 0 "$* $FIXED_PARAMS" 2>&1 | grep "Cycles/SIMD iteration" | cut -d ' ' -f3
|
||||
RES=$(likwid-pin -c $CORE "$* $FIXED_PARAMS" 2>&1 | grep "Cycles/SIMD iteration" | cut -d ' ' -f3)
|
||||
# bc does the floating-point comparison and prints 1 when BEST > RES.
# NOTE(review): RES is empty if the expected output line is missing — confirm handling.
if (( $(echo "$BEST > $RES" | bc -l ) )); then
|
||||
BEST=$RES
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
echo "Tag: $TAG"
|
||||
echo "Optimization scheme: $OPT_SCHEME"
|
||||
echo "Binary: $MDBENCH_BIN(-stub)"
|
||||
echo "Frequency: $FREQ"
|
||||
echo "Number of runs: $NRUNS"
|
||||
echo "Tag: $TAG" | tee -a $LOG
|
||||
echo "Optimization scheme: $OPT_SCHEME" | tee -a $LOG
|
||||
echo "Instruction set: $ISA" | tee -a $LOG
|
||||
echo "Precision: $PREC" | tee -a $LOG
|
||||
echo "Binary: $MDBENCH_BIN(-stub)" | tee -a $LOG
|
||||
echo "Frequency: $FREQ" | tee -a $LOG
|
||||
echo "Number of runs: $NRUNS" | tee -a $LOG
|
||||
echo "Run only stubbed cases: $STUB_ONLY" | tee -a $LOG
|
||||
|
||||
echo "Fixing frequencies..."
|
||||
likwid-setFrequencies -f $FREQ -t 0
|
||||
if [ "$SKIP_SET_FREQ" == "false" ]; then
|
||||
echo "Fixing frequencies..."
|
||||
likwid-setFrequencies -f $FREQ -t 0
|
||||
fi
|
||||
|
||||
echo "Standard"
|
||||
run_benchmark $MDBENCH_BIN
|
||||
echo "Melt"
|
||||
run_benchmark $MDBENCH_BIN -i data/copper_melting/input_lj_cu_one_atomtype_20x20x20.dmp
|
||||
echo "Argon"
|
||||
run_benchmark $MDBENCH_BIN -p data/argon_1000/mdbench_params.conf -i data/argon_1000/tprout.gro
|
||||
echo "$STUB1_NAME"
|
||||
run_benchmark $MDBENCH_BIN-stub $STUB1_PARAMS
|
||||
echo "$STUB2_NAME"
|
||||
run_benchmark $MDBENCH_BIN-stub $STUB2_PARAMS
|
||||
# Sweep over the prefetcher settings and log one result line per setting.
# $PREFETCHERS is expanded unquoted, so its space-separated names are iterated
# one by one.
for p in $PREFETCHERS; do
|
||||
# "IGNORE" means: leave the prefetcher configuration untouched.
if [ "$p" != "IGNORE" ]; then
|
||||
# NOTE(review): presumably -e enables, -d disables and -l lists features —
# confirm against the likwid-features documentation.
if [ "$p" == "ALL" ]; then
|
||||
likwid-features -c $CORE -e $ALL_PREFETCHERS
|
||||
elif [ "$p" == "NONE" ]; then
|
||||
likwid-features -c $CORE -d $ALL_PREFETCHERS
|
||||
else
|
||||
# Single prefetcher: apply -d to the whole set first, then -e to just $p.
likwid-features -c $CORE -d $ALL_PREFETCHERS
|
||||
likwid-features -c $CORE -e $p
|
||||
fi
|
||||
|
||||
echo "Prefetcher settings: $p"
|
||||
likwid-features -c $CORE -l
|
||||
fi
|
||||
|
||||
# Build the result line "<setting>: name=value, ..." step by step; each
# run_benchmark call leaves its result in BEST.
MSG="$p: "
|
||||
# The three full test cases are skipped when only stubbed cases are requested.
if [ "$STUB_ONLY" == "false" ]; then
|
||||
run_benchmark $MDBENCH_BIN
|
||||
MSG+="standard=$BEST, "
|
||||
run_benchmark $MDBENCH_BIN -i data/copper_melting/input_lj_cu_one_atomtype_20x20x20.dmp
|
||||
MSG+="melt=$BEST, "
|
||||
run_benchmark $MDBENCH_BIN -p data/argon_1000/mdbench_params.conf -i data/argon_1000/tprout.gro
|
||||
MSG+="argon=$BEST, "
|
||||
fi
|
||||
|
||||
# The stubbed cases always run, using the "-stub" binary.
run_benchmark $MDBENCH_BIN-stub $STUB1_PARAMS
|
||||
MSG+="$STUB1_NAME=$BEST, "
|
||||
run_benchmark $MDBENCH_BIN-stub $STUB2_PARAMS
|
||||
MSG+="$STUB2_NAME=$BEST"
|
||||
# Print the line and append it to the log file.
echo $MSG | tee -a $LOG
|
||||
done
|
||||
|
Reference in New Issue
Block a user