Compare commits
45 Commits
gromacs_gp
...
gromacs_ma
Author | SHA1 | Date | |
---|---|---|---|
|
59145644e3 | ||
|
4a460b2c88 | ||
|
b15aa2f461 | ||
|
5c000444a4 | ||
|
04ade6bcec | ||
|
85f1484449 | ||
|
965fda3879 | ||
|
a86d214c73 | ||
|
d138f975f6 | ||
|
296a4c4e01 | ||
|
f5fd3e265a | ||
|
1fbf9dbdac | ||
|
89e1b9a9b6 | ||
|
4e99f7a623 | ||
|
4607202752 | ||
|
301274c9b6 | ||
|
95d63334fa | ||
|
d0277765c3 | ||
|
5814a86125 | ||
|
98583cdade | ||
|
cb5598bc91 | ||
|
3b076cdb49 | ||
|
122a23e2b8 | ||
|
32e004944f | ||
|
6126d74aa9 | ||
|
016f07dcaa | ||
|
90f30d26a3 | ||
|
01cc05a5d6 | ||
|
c61cf9a0ac | ||
|
d545ca65d4 | ||
|
5833f00894 | ||
|
8aad7e87a0 | ||
|
ffad9d40f3 | ||
|
99da76d59c | ||
|
cfe888c132 | ||
|
c7b136f629 | ||
|
07f2f74561 | ||
|
fd368609e8 | ||
|
db5f8cf1c6 | ||
|
f467d10ed3 | ||
|
fe86c948a8 | ||
|
ae1cfa2800 | ||
|
e5c233e072 | ||
|
8d5e10f635 | ||
|
56ff0d19af |
23
.gitignore
vendored
23
.gitignore
vendored
@@ -51,14 +51,17 @@ Module.symvers
|
|||||||
Mkfile.old
|
Mkfile.old
|
||||||
dkms.conf
|
dkms.conf
|
||||||
|
|
||||||
|
# Logs
|
||||||
|
*.log
|
||||||
|
|
||||||
|
# TODO list
|
||||||
|
todo.txt
|
||||||
|
|
||||||
# Build directories and executables
|
# Build directories and executables
|
||||||
GCC/
|
#GCC-*/
|
||||||
ICC/
|
#ICC-*/
|
||||||
ICX/
|
#ICX-*/
|
||||||
CLANG/
|
#CLANG-*/
|
||||||
NVCC/
|
#NVCC-*/
|
||||||
MDBench-GCC*
|
build-*/
|
||||||
MDBench-ICC*
|
MDBench-*
|
||||||
MDBench-ICX*
|
|
||||||
MDBench-CLANG*
|
|
||||||
MDBench-NVCC*
|
|
||||||
|
16
Makefile
16
Makefile
@@ -1,6 +1,7 @@
|
|||||||
#CONFIGURE BUILD SYSTEM
|
#CONFIGURE BUILD SYSTEM
|
||||||
TARGET = MDBench-$(TAG)-$(OPT_SCHEME)
|
IDENTIFIER = $(OPT_SCHEME)-$(TAG)-$(ISA)-$(DATA_TYPE)
|
||||||
BUILD_DIR = ./$(TAG)-$(OPT_SCHEME)
|
TARGET = MDBench-$(IDENTIFIER)
|
||||||
|
BUILD_DIR = ./build-$(IDENTIFIER)
|
||||||
SRC_DIR = ./$(OPT_SCHEME)
|
SRC_DIR = ./$(OPT_SCHEME)
|
||||||
ASM_DIR = ./asm
|
ASM_DIR = ./asm
|
||||||
COMMON_DIR = ./common
|
COMMON_DIR = ./common
|
||||||
@@ -97,10 +98,6 @@ ifeq ($(strip $(USE_SIMD_KERNEL)),true)
|
|||||||
DEFINES += -DUSE_SIMD_KERNEL
|
DEFINES += -DUSE_SIMD_KERNEL
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(strip $(USE_SUPER_CLUSTERS)),true)
|
|
||||||
DEFINES += -DUSE_SUPER_CLUSTERS
|
|
||||||
endif
|
|
||||||
|
|
||||||
VPATH = $(SRC_DIR) $(ASM_DIR) $(CUDA_DIR)
|
VPATH = $(SRC_DIR) $(ASM_DIR) $(CUDA_DIR)
|
||||||
ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
|
ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
|
||||||
OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s))
|
OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s))
|
||||||
@@ -155,6 +152,13 @@ $(BUILD_DIR)/%.o: %.s
|
|||||||
clean:
|
clean:
|
||||||
$(info ===> CLEAN)
|
$(info ===> CLEAN)
|
||||||
@rm -rf $(BUILD_DIR)
|
@rm -rf $(BUILD_DIR)
|
||||||
|
@rm -rf MDBench-$(IDENTIFIER)
|
||||||
|
@rm -f tags
|
||||||
|
|
||||||
|
cleanall:
|
||||||
|
$(info ===> CLEAN)
|
||||||
|
@rm -rf build-*
|
||||||
|
@rm -rf MDBench-*
|
||||||
@rm -f tags
|
@rm -f tags
|
||||||
|
|
||||||
distclean: clean
|
distclean: clean
|
||||||
|
1541
asm/unused/force_lj_lammps_avx512_dp_no_newton_raphson.s
Normal file
1541
asm/unused/force_lj_lammps_avx512_dp_no_newton_raphson.s
Normal file
File diff suppressed because it is too large
Load Diff
1421
asm/unused/force_lj_lammps_avx512_sp_no_newton_raphson.s
Normal file
1421
asm/unused/force_lj_lammps_avx512_sp_no_newton_raphson.s
Normal file
File diff suppressed because it is too large
Load Diff
@@ -9,8 +9,10 @@
|
|||||||
|
|
||||||
#if PRECISION == 1
|
#if PRECISION == 1
|
||||||
# define MD_FLOAT float
|
# define MD_FLOAT float
|
||||||
|
# define MD_UINT unsigned int
|
||||||
#else
|
#else
|
||||||
# define MD_FLOAT double
|
# define MD_FLOAT double
|
||||||
|
# define MD_UINT unsigned long long int
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
|
@@ -12,7 +12,10 @@
|
|||||||
#define MD_SIMD_FLOAT __m512d
|
#define MD_SIMD_FLOAT __m512d
|
||||||
#define MD_SIMD_MASK __mmask8
|
#define MD_SIMD_MASK __mmask8
|
||||||
#define MD_SIMD_INT __m256i
|
#define MD_SIMD_INT __m256i
|
||||||
|
#define MD_SIMD_BITMASK MD_SIMD_INT
|
||||||
|
#define MD_SIMD_IBOOL __mmask16
|
||||||
|
|
||||||
|
// Convert an integer-boolean mask (MD_SIMD_IBOOL, __mmask16 in this header)
// into the floating-point lane mask type (MD_SIMD_MASK, __mmask8 in this
// header) by casting to __mmask8.
static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) {
    const MD_SIMD_MASK narrowed = (__mmask8)(a);
    return narrowed;
}
|
||||||
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm512_set1_pd(scalar); }
|
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm512_set1_pd(scalar); }
|
||||||
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_pd(0.0); }
|
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_pd(0.0); }
|
||||||
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_pd(a, b); }
|
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_pd(a, b); }
|
||||||
|
@@ -7,11 +7,30 @@
|
|||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
|
#ifndef NO_ZMM_INTRIN
|
||||||
# include <zmmintrin.h>
|
# include <zmmintrin.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#define MD_SIMD_FLOAT __m512
|
#define MD_SIMD_FLOAT __m512
|
||||||
#define MD_SIMD_MASK __mmask16
|
#define MD_SIMD_MASK __mmask16
|
||||||
|
#define MD_SIMD_INT __m256i
|
||||||
|
#define MD_SIMD_IBOOL __mmask16
|
||||||
|
#define MD_SIMD_INT32 __m512i
|
||||||
|
#define MD_SIMD_BITMASK MD_SIMD_INT32
|
||||||
|
|
||||||
|
static inline MD_SIMD_BITMASK simd_load_bitmask(const int *m) {
|
||||||
|
return _mm512_load_si512(m);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline MD_SIMD_INT32 simd_int32_broadcast(int a) {
|
||||||
|
return _mm512_set1_epi32(a);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build a per-lane mask from the raw bit pattern of `a`: the vector is
// reinterpreted as 32-bit integers and tested against itself, so a lane's
// mask bit is set exactly when that lane has at least one bit set.
static inline MD_SIMD_IBOOL simd_test_bits(MD_SIMD_FLOAT a) {
    const __m512i raw_bits = _mm512_castps_si512(a);
    return _mm512_test_epi32_mask(raw_bits, raw_bits);
}
|
||||||
|
|
||||||
|
// Convert an integer-boolean mask (MD_SIMD_IBOOL) into the floating-point
// lane mask type (MD_SIMD_MASK). Both are __mmask16 in this header, so the
// value is passed through unchanged.
static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) {
    const MD_SIMD_MASK same_width = a;
    return same_width;
}
|
||||||
static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); }
|
static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); }
|
||||||
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_ps(0.0f); }
|
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_ps(0.0f); }
|
||||||
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_ps(a, b); }
|
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_ps(a, b); }
|
||||||
@@ -69,7 +88,7 @@ static inline MD_FLOAT simd_h_dual_incr_reduced_sum(float* m, MD_SIMD_FLOAT v0,
|
|||||||
return _mm_cvtss_f32(t3);
|
return _mm_cvtss_f32(t3);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
|
static inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
|
||||||
__m256 t;
|
__m256 t;
|
||||||
a = _mm512_add_ps(a, _mm512_shuffle_f32x4(a, a, 0xee));
|
a = _mm512_add_ps(a, _mm512_shuffle_f32x4(a, a, 0xee));
|
||||||
t = _mm256_load_ps(m);
|
t = _mm256_load_ps(m);
|
||||||
|
@@ -131,19 +131,19 @@ void readParameter(Parameter *param, const char *filename) {
|
|||||||
void printParameter(Parameter *param) {
|
void printParameter(Parameter *param) {
|
||||||
printf("Parameters:\n");
|
printf("Parameters:\n");
|
||||||
if(param->input_file != NULL) {
|
if(param->input_file != NULL) {
|
||||||
printf("Input file: %s\n", param->input_file);
|
printf("\tInput file: %s\n", param->input_file);
|
||||||
}
|
}
|
||||||
|
|
||||||
if(param->vtk_file != NULL) {
|
if(param->vtk_file != NULL) {
|
||||||
printf("VTK file: %s\n", param->vtk_file);
|
printf("\tVTK file: %s\n", param->vtk_file);
|
||||||
}
|
}
|
||||||
|
|
||||||
if(param->xtc_file != NULL) {
|
if(param->xtc_file != NULL) {
|
||||||
printf("XTC file: %s\n", param->xtc_file);
|
printf("\tXTC file: %s\n", param->xtc_file);
|
||||||
}
|
}
|
||||||
|
|
||||||
if(param->eam_file != NULL) {
|
if(param->eam_file != NULL) {
|
||||||
printf("EAM file: %s\n", param->eam_file);
|
printf("\tEAM file: %s\n", param->eam_file);
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("\tForce field: %s\n", ff2str(param->force_field));
|
printf("\tForce field: %s\n", ff2str(param->force_field));
|
||||||
|
@@ -1,5 +1,5 @@
|
|||||||
# Compiler tag (GCC/CLANG/ICC/ICX/ONEAPI/NVCC)
|
# Compiler tag (GCC/CLANG/ICC/ICX/ONEAPI/NVCC)
|
||||||
TAG ?= NVCC
|
TAG ?= ICC
|
||||||
# Instruction set (SSE/AVX/AVX_FMA/AVX2/AVX512)
|
# Instruction set (SSE/AVX/AVX_FMA/AVX2/AVX512)
|
||||||
ISA ?= AVX512
|
ISA ?= AVX512
|
||||||
# Optimization scheme (lammps/gromacs/clusters_per_bin)
|
# Optimization scheme (lammps/gromacs/clusters_per_bin)
|
||||||
@@ -13,7 +13,7 @@ DATA_LAYOUT ?= AOS
|
|||||||
# Assembly syntax to generate (ATT/INTEL)
|
# Assembly syntax to generate (ATT/INTEL)
|
||||||
ASM_SYNTAX ?= ATT
|
ASM_SYNTAX ?= ATT
|
||||||
# Debug
|
# Debug
|
||||||
DEBUG ?= true
|
DEBUG ?= false
|
||||||
|
|
||||||
# Explicitly store and load atom types (true or false)
|
# Explicitly store and load atom types (true or false)
|
||||||
EXPLICIT_TYPES ?= false
|
EXPLICIT_TYPES ?= false
|
||||||
@@ -41,7 +41,6 @@ HALF_NEIGHBOR_LISTS_CHECK_CJ ?= false
|
|||||||
# Configurations for CUDA
|
# Configurations for CUDA
|
||||||
# Use CUDA host memory to optimize transfers
|
# Use CUDA host memory to optimize transfers
|
||||||
USE_CUDA_HOST_MEMORY ?= false
|
USE_CUDA_HOST_MEMORY ?= false
|
||||||
USE_SUPER_CLUSTERS ?= true
|
|
||||||
|
|
||||||
#Feature options
|
#Feature options
|
||||||
OPTIONS = -DALIGNMENT=64
|
OPTIONS = -DALIGNMENT=64
|
||||||
|
@@ -7,6 +7,6 @@ temp 80
|
|||||||
x_out_freq 500
|
x_out_freq 500
|
||||||
v_out_freq 5
|
v_out_freq 5
|
||||||
cutforce 0.9
|
cutforce 0.9
|
||||||
skin 0.0
|
skin 0.05
|
||||||
reneigh_every 100
|
reneigh_every 100
|
||||||
nstat 125000
|
nstat 125000
|
||||||
|
142
gromacs/atom.c
142
gromacs/atom.c
@@ -37,24 +37,7 @@ void initAtom(Atom *atom) {
|
|||||||
atom->iclusters = NULL;
|
atom->iclusters = NULL;
|
||||||
atom->jclusters = NULL;
|
atom->jclusters = NULL;
|
||||||
atom->icluster_bin = NULL;
|
atom->icluster_bin = NULL;
|
||||||
|
initMasks(atom);
|
||||||
#ifdef USE_SUPER_CLUSTERS
|
|
||||||
atom->scl_x = NULL;
|
|
||||||
atom->scl_v = NULL;
|
|
||||||
atom->scl_f = NULL;
|
|
||||||
|
|
||||||
atom->Nsclusters = 0;
|
|
||||||
atom->Nsclusters_local = 0;
|
|
||||||
atom->Nsclusters_ghost = 0;
|
|
||||||
atom->Nsclusters_max = 0;
|
|
||||||
|
|
||||||
atom->scl_type = NULL;
|
|
||||||
|
|
||||||
atom->siclusters = NULL;
|
|
||||||
atom->icluster_idx = NULL;
|
|
||||||
|
|
||||||
atom->sicluster_bin = NULL;
|
|
||||||
#endif //USE_SUPER_CLUSTERS
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void createAtom(Atom *atom, Parameter *param) {
|
void createAtom(Atom *atom, Parameter *param) {
|
||||||
@@ -68,6 +51,7 @@ void createAtom(Atom *atom, Parameter *param) {
|
|||||||
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||||
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||||
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||||
|
|
||||||
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
||||||
atom->epsilon[i] = param->epsilon;
|
atom->epsilon[i] = param->epsilon;
|
||||||
atom->sigma6[i] = param->sigma6;
|
atom->sigma6[i] = param->sigma6;
|
||||||
@@ -410,6 +394,113 @@ int readAtom_dmp(Atom* atom, Parameter* param) {
|
|||||||
return natoms;
|
return natoms;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Allocate and fill the mask lookup tables stored in `atom`:
//   - exclusion_filter:         one single-bit word per (i, j) slot,
//   - diagonal_4xn_j_minus_i /
//     diagonal_2xnn_j_minus_i:  "j - i - 0.5" style offsets,
//   - masks_{2xnn,4xn}_{hn,fn}: diagonal masks, indexed by cond0 (and by
//                               cond1 when CLUSTER_M != CLUSTER_N).
//
// The "hn" and "fn" tables are built from the same four per-row masks
// (mask0..mask3): the 2xNN tables pack two rows per entry (second row shifted
// left by half the vector width), the 4xN tables store one row per entry.
//
// Fixes relative to the previous version:
//   - In the CLUSTER_M > CLUSTER_N branch all four masks_4xn_fn stores wrote
//     to index "+ 0", leaving entries 1..3 unset; they now go to "+ 0..+ 3"
//     like every other table in this function.
//   - The exclusion filter bit is shifted in MD_UINT width instead of as a
//     32-bit "1U", so a shift count of 32 or more is no longer undefined when
//     MD_UINT is 64 bits wide.
void initMasks(Atom *atom) {
    const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
    unsigned int mask0, mask1, mask2, mask3;

    atom->exclusion_filter = allocate(ALIGNMENT, CLUSTER_M * VECTOR_WIDTH * sizeof(MD_UINT));
    // NOTE(review): the two diagonal tables are sized with sizeof(MD_UINT) but
    // are filled with floating-point values below; this relies on MD_UINT and
    // MD_FLOAT having the same size for the selected PRECISION - confirm.
    atom->diagonal_4xn_j_minus_i = allocate(ALIGNMENT, MAX(CLUSTER_M, VECTOR_WIDTH) * sizeof(MD_UINT));
    atom->diagonal_2xnn_j_minus_i = allocate(ALIGNMENT, VECTOR_WIDTH * sizeof(MD_UINT));

    for(int j = 0; j < MAX(CLUSTER_M, VECTOR_WIDTH); j++) {
        atom->diagonal_4xn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5;
    }

    // The upper half of the 2xNN table repeats the lower half shifted by one.
    for(int j = 0; j < VECTOR_WIDTH / 2; j++) {
        atom->diagonal_2xnn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5;
        atom->diagonal_2xnn_j_minus_i[VECTOR_WIDTH / 2 + j] = (MD_FLOAT)(j - 1) - 0.5;
    }

    // NOTE(review): if CLUSTER_M * VECTOR_WIDTH exceeds the bit width of
    // MD_UINT the shift below is still out of range - verify the supported
    // CLUSTER_M / VECTOR_WIDTH / PRECISION combinations.
    for(int i = 0; i < CLUSTER_M * VECTOR_WIDTH; i++) {
        atom->exclusion_filter[i] = ((MD_UINT)1 << i);
    }

    #if CLUSTER_M == CLUSTER_N
    for(unsigned int cond0 = 0; cond0 < 2; cond0++) {
        // Half neighbor list rows.
        mask0 = (unsigned int)(0xf - 0x1 * cond0);
        mask1 = (unsigned int)(0xf - 0x3 * cond0);
        mask2 = (unsigned int)(0xf - 0x7 * cond0);
        mask3 = (unsigned int)(0xf - 0xf * cond0);
        atom->masks_2xnn_hn[cond0 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
        atom->masks_2xnn_hn[cond0 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
        atom->masks_4xn_hn[cond0 * 4 + 0] = mask0;
        atom->masks_4xn_hn[cond0 * 4 + 1] = mask1;
        atom->masks_4xn_hn[cond0 * 4 + 2] = mask2;
        atom->masks_4xn_hn[cond0 * 4 + 3] = mask3;

        // Full neighbor list rows.
        mask0 = (unsigned int)(0xf - 0x1 * cond0);
        mask1 = (unsigned int)(0xf - 0x2 * cond0);
        mask2 = (unsigned int)(0xf - 0x4 * cond0);
        mask3 = (unsigned int)(0xf - 0x8 * cond0);
        atom->masks_2xnn_fn[cond0 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
        atom->masks_2xnn_fn[cond0 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
        atom->masks_4xn_fn[cond0 * 4 + 0] = mask0;
        atom->masks_4xn_fn[cond0 * 4 + 1] = mask1;
        atom->masks_4xn_fn[cond0 * 4 + 2] = mask2;
        atom->masks_4xn_fn[cond0 * 4 + 3] = mask3;
    }
    #else
    for(unsigned int cond0 = 0; cond0 < 2; cond0++) {
        for(unsigned int cond1 = 0; cond1 < 2; cond1++) {
            // Half neighbor list rows. The subtraction is done in unsigned
            // arithmetic and may wrap when both conditions are set; this
            // matches the previous behavior.
            #if CLUSTER_M < CLUSTER_N
            mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
            mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
            mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
            mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
            #else
            mask0 = (unsigned int)(0x3 - 0x1 * cond0);
            mask1 = (unsigned int)(0x3 - 0x3 * cond0);
            mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1);
            mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1);
            #endif

            atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
            atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 0] = mask0;
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 1] = mask1;
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 2] = mask2;
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 3] = mask3;

            // Full neighbor list rows.
            #if CLUSTER_M < CLUSTER_N
            mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
            mask1 = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
            mask2 = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
            mask3 = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
            #else
            mask0 = (unsigned int)(0x3 - 0x1 * cond0);
            mask1 = (unsigned int)(0x3 - 0x2 * cond0);
            mask2 = (unsigned int)(0x3 - 0x1 * cond1);
            mask3 = (unsigned int)(0x3 - 0x2 * cond1);
            #endif

            atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
            atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 1] = (mask3 << half_mask_bits) | mask2;
            // Each row gets its own slot (previously all four stores in the
            // CLUSTER_M > CLUSTER_N case targeted slot 0).
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = mask0;
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 1] = mask1;
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 2] = mask2;
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 3] = mask3;
        }
    }
    #endif
}
|
||||||
|
|
||||||
void growAtom(Atom *atom) {
|
void growAtom(Atom *atom) {
|
||||||
int nold = atom->Nmax;
|
int nold = atom->Nmax;
|
||||||
atom->Nmax += DELTA;
|
atom->Nmax += DELTA;
|
||||||
@@ -439,18 +530,3 @@ void growClusters(Atom *atom) {
|
|||||||
atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||||
atom->cl_type = (int*) reallocate(atom->cl_type, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * sizeof(int), nold * CLUSTER_M * sizeof(int));
|
atom->cl_type = (int*) reallocate(atom->cl_type, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * sizeof(int), nold * CLUSTER_M * sizeof(int));
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef USE_SUPER_CLUSTERS
|
|
||||||
// Grow the super-cluster capacity of `atom` by DELTA entries and resize every
// per-super-cluster array to the new capacity via reallocate(), passing the
// previous size of each array alongside the new one.
// scl_type is not resized here (its reallocation was commented out in the
// original code).
void growSuperClusters(Atom *atom) {
    const int prev_max = atom->Nsclusters_max;
    atom->Nsclusters_max = prev_max + DELTA;

    // Super-cluster descriptors, cluster index table and bin assignment.
    atom->siclusters = (SuperCluster*) reallocate(
        atom->siclusters, ALIGNMENT,
        atom->Nsclusters_max * sizeof(SuperCluster),
        prev_max * sizeof(SuperCluster));
    atom->icluster_idx = (int*) reallocate(
        atom->icluster_idx, ALIGNMENT,
        atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int),
        prev_max * SCLUSTER_SIZE * sizeof(int));
    atom->sicluster_bin = (int*) reallocate(
        atom->sicluster_bin, ALIGNMENT,
        atom->Nsclusters_max * sizeof(int),
        prev_max * sizeof(int));

    // Position, force and velocity arrays: three components per atom slot.
    atom->scl_x = (MD_FLOAT*) reallocate(
        atom->scl_x, ALIGNMENT,
        atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT),
        prev_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
    atom->scl_f = (MD_FLOAT*) reallocate(
        atom->scl_f, ALIGNMENT,
        atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT),
        prev_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
    atom->scl_v = (MD_FLOAT*) reallocate(
        atom->scl_v, ALIGNMENT,
        atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT),
        prev_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
}
|
|
||||||
#endif //USE_SUPER_CLUSTERS
|
|
||||||
|
@@ -39,29 +39,8 @@ extern "C" {
|
|||||||
MD_FLOAT *cuda_bbminz, *cuda_bbmaxz;
|
MD_FLOAT *cuda_bbminz, *cuda_bbmaxz;
|
||||||
int *cuda_PBCx, *cuda_PBCy, *cuda_PBCz;
|
int *cuda_PBCx, *cuda_PBCy, *cuda_PBCz;
|
||||||
int isReneighboured;
|
int isReneighboured;
|
||||||
|
|
||||||
int *cuda_iclusters;
|
|
||||||
int *cuda_nclusters;
|
|
||||||
|
|
||||||
int cuda_max_scl;
|
|
||||||
MD_FLOAT *cuda_scl_x;
|
|
||||||
MD_FLOAT *cuda_scl_v;
|
|
||||||
MD_FLOAT *cuda_scl_f;
|
|
||||||
|
|
||||||
extern void alignDataToSuperclusters(Atom *atom);
|
|
||||||
extern void alignDataFromSuperclusters(Atom *atom);
|
|
||||||
extern double computeForceLJSup_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __global__ void cudaInitialIntegrateSup_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
|
|
||||||
int *cuda_nclusters,
|
|
||||||
int *cuda_natoms,
|
|
||||||
int Nsclusters_local, MD_FLOAT dtforce, MD_FLOAT dt);
|
|
||||||
|
|
||||||
extern __global__ void cudaFinalIntegrateSup_warp(MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
|
|
||||||
int *cuda_nclusters, int *cuda_natoms,
|
|
||||||
int Nsclusters_local, MD_FLOAT dtforce);
|
|
||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
void initDevice(Atom *atom, Neighbor *neighbor) {
|
void initDevice(Atom *atom, Neighbor *neighbor) {
|
||||||
cuda_assert("cudaDeviceSetup", cudaDeviceReset());
|
cuda_assert("cudaDeviceSetup", cudaDeviceReset());
|
||||||
@@ -80,23 +59,10 @@ void initDevice(Atom *atom, Neighbor *neighbor) {
|
|||||||
natoms = (int *) malloc(atom->Nclusters_max * sizeof(int));
|
natoms = (int *) malloc(atom->Nclusters_max * sizeof(int));
|
||||||
ngatoms = (int *) malloc(atom->Nclusters_max * sizeof(int));
|
ngatoms = (int *) malloc(atom->Nclusters_max * sizeof(int));
|
||||||
isReneighboured = 1;
|
isReneighboured = 1;
|
||||||
|
|
||||||
#ifdef USE_SUPER_CLUSTERS
|
|
||||||
cuda_max_scl = atom->Nsclusters_max;
|
|
||||||
cuda_iclusters = (int *) allocateGPU(atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int));
|
|
||||||
cuda_nclusters = (int *) allocateGPU(atom->Nsclusters_max * sizeof(int));
|
|
||||||
|
|
||||||
cuda_scl_x = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
|
|
||||||
cuda_scl_v = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
|
|
||||||
cuda_scl_f = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
|
|
||||||
|
|
||||||
#endif //USE_SUPER_CLUSTERS
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
void copyDataToCUDADevice(Atom *atom) {
|
void copyDataToCUDADevice(Atom *atom) {
|
||||||
DEBUG_MESSAGE("copyDataToCUDADevice start\r\n");
|
|
||||||
|
|
||||||
memcpyToGPU(cuda_cl_x, atom->cl_x, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
memcpyToGPU(cuda_cl_x, atom->cl_x, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||||
memcpyToGPU(cuda_cl_v, atom->cl_v, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
memcpyToGPU(cuda_cl_v, atom->cl_v, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||||
memcpyToGPU(cuda_cl_f, atom->cl_f, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
memcpyToGPU(cuda_cl_f, atom->cl_f, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||||
@@ -119,49 +85,13 @@ void copyDataToCUDADevice(Atom *atom) {
|
|||||||
memcpyToGPU(cuda_PBCx, atom->PBCx, atom->Nclusters_ghost * sizeof(int));
|
memcpyToGPU(cuda_PBCx, atom->PBCx, atom->Nclusters_ghost * sizeof(int));
|
||||||
memcpyToGPU(cuda_PBCy, atom->PBCy, atom->Nclusters_ghost * sizeof(int));
|
memcpyToGPU(cuda_PBCy, atom->PBCy, atom->Nclusters_ghost * sizeof(int));
|
||||||
memcpyToGPU(cuda_PBCz, atom->PBCz, atom->Nclusters_ghost * sizeof(int));
|
memcpyToGPU(cuda_PBCz, atom->PBCz, atom->Nclusters_ghost * sizeof(int));
|
||||||
|
|
||||||
#ifdef USE_SUPER_CLUSTERS
|
|
||||||
//alignDataToSuperclusters(atom);
|
|
||||||
|
|
||||||
if (cuda_max_scl < atom->Nsclusters_max) {
|
|
||||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_x));
|
|
||||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_v));
|
|
||||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_f));
|
|
||||||
cuda_max_scl = atom->Nsclusters_max;
|
|
||||||
|
|
||||||
cuda_iclusters = (int *) allocateGPU(atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int));
|
|
||||||
cuda_nclusters = (int *) allocateGPU(atom->Nsclusters_max * sizeof(int));
|
|
||||||
|
|
||||||
cuda_scl_x = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
|
|
||||||
cuda_scl_v = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
|
|
||||||
cuda_scl_f = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
|
|
||||||
}
|
|
||||||
memcpyToGPU(cuda_scl_x, atom->scl_x, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
|
|
||||||
memcpyToGPU(cuda_scl_v, atom->scl_v, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
|
|
||||||
memcpyToGPU(cuda_scl_f, atom->scl_f, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
|
|
||||||
#endif //USE_SUPER_CLUSTERS
|
|
||||||
|
|
||||||
DEBUG_MESSAGE("copyDataToCUDADevice stop\r\n");
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
void copyDataFromCUDADevice(Atom *atom) {
|
void copyDataFromCUDADevice(Atom *atom) {
|
||||||
DEBUG_MESSAGE("copyDataFromCUDADevice start\r\n");
|
|
||||||
|
|
||||||
memcpyFromGPU(atom->cl_x, cuda_cl_x, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
memcpyFromGPU(atom->cl_x, cuda_cl_x, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||||
memcpyFromGPU(atom->cl_v, cuda_cl_v, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
memcpyFromGPU(atom->cl_v, cuda_cl_v, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||||
memcpyFromGPU(atom->cl_f, cuda_cl_f, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
memcpyFromGPU(atom->cl_f, cuda_cl_f, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||||
|
|
||||||
#ifdef USE_SUPER_CLUSTERS
|
|
||||||
memcpyFromGPU(atom->scl_x, cuda_scl_x, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
|
|
||||||
memcpyFromGPU(atom->scl_v, cuda_scl_v, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
|
|
||||||
memcpyFromGPU(atom->scl_f, cuda_scl_f, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
|
|
||||||
|
|
||||||
//alignDataFromSuperclusters(atom);
|
|
||||||
#endif //USE_SUPER_CLUSTERS
|
|
||||||
|
|
||||||
DEBUG_MESSAGE("copyDataFromCUDADevice stop\r\n");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
@@ -179,12 +109,6 @@ void cudaDeviceFree() {
|
|||||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_PBCz));
|
cuda_assert("cudaDeviceFree", cudaFree(cuda_PBCz));
|
||||||
free(natoms);
|
free(natoms);
|
||||||
free(ngatoms);
|
free(ngatoms);
|
||||||
|
|
||||||
#ifdef USE_SUPER_CLUSTERS
|
|
||||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_x));
|
|
||||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_v));
|
|
||||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_f));
|
|
||||||
#endif //USE_SUPER_CLUSTERS
|
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ void cudaInitialIntegrate_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
|
__global__ void cudaInitialIntegrate_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
|
||||||
@@ -241,39 +165,6 @@ __global__ void cudaUpdatePbc_warp(MD_FLOAT *cuda_cl_x, int *cuda_border_map,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Kernel: one thread per ghost cluster. Each thread copies the coordinates of
// the cluster named by cuda_border_map[ghost] into its ghost slot, adding the
// per-ghost PBC multipliers times the box lengths.
//
// The ghost slot index is
//   Nsclusters_local / (SCLUSTER_SIZE / CLUSTER_M) + ghost,
// and both source and destination offsets into cuda_cl_x come from
// CJ_VECTOR_BASE_INDEX. Threads whose index is not below Nclusters_ghost
// return immediately.
__global__ void cudaUpdatePbcSup_warp(MD_FLOAT *cuda_cl_x, int *cuda_border_map,
                                      int *cuda_jclusters_natoms,
                                      int *cuda_PBCx,
                                      int *cuda_PBCy,
                                      int *cuda_PBCz,
                                      int Nsclusters_local,
                                      int Nclusters_ghost,
                                      MD_FLOAT param_xprd,
                                      MD_FLOAT param_yprd,
                                      MD_FLOAT param_zprd) {
    const unsigned int ghost = blockDim.x * blockIdx.x + threadIdx.x;
    if (!(ghost < Nclusters_ghost)) {
        return;
    }

    const int clusters_per_super = SCLUSTER_SIZE / CLUSTER_M;
    const int first_ghost = Nsclusters_local / clusters_per_super;
    const int cj = first_ghost + ghost;

    MD_FLOAT *dst = &cuda_cl_x[CJ_VECTOR_BASE_INDEX(cj)];
    MD_FLOAT *src = &cuda_cl_x[CJ_VECTOR_BASE_INDEX(cuda_border_map[ghost])];

    for (int k = 0; k < cuda_jclusters_natoms[ghost]; k++) {
        dst[CL_X_OFFSET + k] = src[CL_X_OFFSET + k] + cuda_PBCx[ghost] * param_xprd;
        dst[CL_Y_OFFSET + k] = src[CL_Y_OFFSET + k] + cuda_PBCy[ghost] * param_yprd;
        dst[CL_Z_OFFSET + k] = src[CL_Z_OFFSET + k] + cuda_PBCz[ghost] * param_zprd;
    }
}
|
|
||||||
|
|
||||||
__global__ void computeForceLJ_cuda_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_f,
|
__global__ void computeForceLJ_cuda_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_f,
|
||||||
int Nclusters_local, int Nclusters_max,
|
int Nclusters_local, int Nclusters_max,
|
||||||
int *cuda_numneigh, int *cuda_neighs, int half_neigh, int maxneighs,
|
int *cuda_numneigh, int *cuda_neighs, int half_neigh, int maxneighs,
|
||||||
@@ -360,17 +251,9 @@ extern "C"
|
|||||||
void cudaInitialIntegrate(Parameter *param, Atom *atom) {
|
void cudaInitialIntegrate(Parameter *param, Atom *atom) {
|
||||||
const int threads_num = 16;
|
const int threads_num = 16;
|
||||||
dim3 block_size = dim3(threads_num, 1, 1);
|
dim3 block_size = dim3(threads_num, 1, 1);
|
||||||
|
|
||||||
#ifdef USE_SUPER_CLUSTERS
|
|
||||||
dim3 grid_size = dim3(atom->Nsclusters_local/(threads_num)+1, 1, 1);
|
|
||||||
cudaInitialIntegrateSup_warp<<<grid_size, block_size>>>(cuda_scl_x, cuda_scl_v, cuda_scl_f,
|
|
||||||
cuda_nclusters,
|
|
||||||
cuda_natoms, atom->Nsclusters_local, param->dtforce, param->dt);
|
|
||||||
#else
|
|
||||||
dim3 grid_size = dim3(atom->Nclusters_local/(threads_num)+1, 1, 1);
|
dim3 grid_size = dim3(atom->Nclusters_local/(threads_num)+1, 1, 1);
|
||||||
cudaInitialIntegrate_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_cl_v, cuda_cl_f,
|
cudaInitialIntegrate_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_cl_v, cuda_cl_f,
|
||||||
cuda_natoms, atom->Nclusters_local, param->dtforce, param->dt);
|
cuda_natoms, atom->Nclusters_local, param->dtforce, param->dt);
|
||||||
#endif //USE_SUPER_CLUSTERS
|
|
||||||
cuda_assert("cudaInitialIntegrate", cudaPeekAtLastError());
|
cuda_assert("cudaInitialIntegrate", cudaPeekAtLastError());
|
||||||
cuda_assert("cudaInitialIntegrate", cudaDeviceSynchronize());
|
cuda_assert("cudaInitialIntegrate", cudaDeviceSynchronize());
|
||||||
}
|
}
|
||||||
@@ -381,19 +264,11 @@ extern "C"
|
|||||||
void cudaUpdatePbc(Atom *atom, Parameter *param) {
|
void cudaUpdatePbc(Atom *atom, Parameter *param) {
|
||||||
const int threads_num = 512;
|
const int threads_num = 512;
|
||||||
dim3 block_size = dim3(threads_num, 1, 1);;
|
dim3 block_size = dim3(threads_num, 1, 1);;
|
||||||
dim3 grid_size = dim3(atom->Nclusters_ghost/(threads_num)+1, 1, 1);
|
dim3 grid_size = dim3(atom->Nclusters_ghost/(threads_num)+1, 1, 1);;
|
||||||
|
|
||||||
#ifdef USE_SUPER_CLUSTERS
|
|
||||||
cudaUpdatePbcSup_warp<<<grid_size, block_size>>>(cuda_scl_x, cuda_border_map,
|
|
||||||
cuda_jclusters_natoms, cuda_PBCx, cuda_PBCy, cuda_PBCz,
|
|
||||||
atom->Nclusters_local, atom->Nclusters_ghost,
|
|
||||||
param->xprd, param->yprd, param->zprd);
|
|
||||||
#else
|
|
||||||
cudaUpdatePbc_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_border_map,
|
cudaUpdatePbc_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_border_map,
|
||||||
cuda_jclusters_natoms, cuda_PBCx, cuda_PBCy, cuda_PBCz,
|
cuda_jclusters_natoms, cuda_PBCx, cuda_PBCy, cuda_PBCz,
|
||||||
atom->Nclusters_local, atom->Nclusters_ghost,
|
atom->Nclusters_local, atom->Nclusters_ghost,
|
||||||
param->xprd, param->yprd, param->zprd);
|
param->xprd, param->yprd, param->zprd);
|
||||||
#endif //USE_SUPER_CLUSTERS
|
|
||||||
cuda_assert("cudaUpdatePbc", cudaPeekAtLastError());
|
cuda_assert("cudaUpdatePbc", cudaPeekAtLastError());
|
||||||
cuda_assert("cudaUpdatePbc", cudaDeviceSynchronize());
|
cuda_assert("cudaUpdatePbc", cudaDeviceSynchronize());
|
||||||
}
|
}
|
||||||
@@ -435,17 +310,8 @@ extern "C"
|
|||||||
void cudaFinalIntegrate(Parameter *param, Atom *atom) {
|
void cudaFinalIntegrate(Parameter *param, Atom *atom) {
|
||||||
const int threads_num = 16;
|
const int threads_num = 16;
|
||||||
dim3 block_size = dim3(threads_num, 1, 1);
|
dim3 block_size = dim3(threads_num, 1, 1);
|
||||||
|
|
||||||
#ifdef USE_SUPER_CLUSTERS
|
|
||||||
dim3 grid_size = dim3(atom->Nsclusters_local/(threads_num)+1, 1, 1);
|
|
||||||
cudaFinalIntegrateSup_warp<<<grid_size, block_size>>>(cuda_scl_v, cuda_scl_f,
|
|
||||||
cuda_nclusters, cuda_natoms,
|
|
||||||
atom->Nsclusters_local, param->dt);
|
|
||||||
#else
|
|
||||||
dim3 grid_size = dim3(atom->Nclusters_local/(threads_num)+1, 1, 1);
|
dim3 grid_size = dim3(atom->Nclusters_local/(threads_num)+1, 1, 1);
|
||||||
cudaFinalIntegrate_warp<<<grid_size, block_size>>>(cuda_cl_v, cuda_cl_f, cuda_natoms,
|
cudaFinalIntegrate_warp<<<grid_size, block_size>>>(cuda_cl_v, cuda_cl_f, cuda_natoms, atom->Nclusters_local, param->dt);
|
||||||
atom->Nclusters_local, param->dt);
|
|
||||||
#endif //USE_SUPER_CLUSTERS
|
|
||||||
cuda_assert("cudaFinalIntegrate", cudaPeekAtLastError());
|
cuda_assert("cudaFinalIntegrate", cudaPeekAtLastError());
|
||||||
cuda_assert("cudaFinalIntegrate", cudaDeviceSynchronize());
|
cuda_assert("cudaFinalIntegrate", cudaDeviceSynchronize());
|
||||||
}
|
}
|
||||||
|
@@ -1,288 +0,0 @@
|
|||||||
|
|
||||||
extern "C" {
|
|
||||||
|
|
||||||
#include <stdio.h>
|
|
||||||
//---
|
|
||||||
#include <cuda.h>
|
|
||||||
#include <driver_types.h>
|
|
||||||
//---
|
|
||||||
#include <likwid-marker.h>
|
|
||||||
//---
|
|
||||||
#include <atom.h>
|
|
||||||
#include <device.h>
|
|
||||||
#include <neighbor.h>
|
|
||||||
#include <parameter.h>
|
|
||||||
#include <stats.h>
|
|
||||||
#include <timing.h>
|
|
||||||
#include <util.h>
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
extern "C" {
|
|
||||||
extern MD_FLOAT *cuda_cl_x;
|
|
||||||
extern MD_FLOAT *cuda_cl_v;
|
|
||||||
extern MD_FLOAT *cuda_cl_f;
|
|
||||||
extern int *cuda_neighbors;
|
|
||||||
extern int *cuda_numneigh;
|
|
||||||
extern int *cuda_natoms;
|
|
||||||
extern int *natoms;
|
|
||||||
extern int *ngatoms;
|
|
||||||
extern int *cuda_border_map;
|
|
||||||
extern int *cuda_jclusters_natoms;
|
|
||||||
extern MD_FLOAT *cuda_bbminx, *cuda_bbmaxx;
|
|
||||||
extern MD_FLOAT *cuda_bbminy, *cuda_bbmaxy;
|
|
||||||
extern MD_FLOAT *cuda_bbminz, *cuda_bbmaxz;
|
|
||||||
extern int *cuda_PBCx, *cuda_PBCy, *cuda_PBCz;
|
|
||||||
extern int isReneighboured;
|
|
||||||
|
|
||||||
extern int *cuda_iclusters;
|
|
||||||
extern int *cuda_nclusters;
|
|
||||||
|
|
||||||
extern MD_FLOAT *cuda_scl_x;
|
|
||||||
extern MD_FLOAT *cuda_scl_v;
|
|
||||||
extern MD_FLOAT *cuda_scl_f;
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef USE_SUPER_CLUSTERS
|
|
||||||
extern "C"
|
|
||||||
void alignDataToSuperclusters(Atom *atom) {
|
|
||||||
for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
|
|
||||||
const unsigned int scl_offset = sci * SCLUSTER_SIZE * 3 * CLUSTER_M;
|
|
||||||
|
|
||||||
for (int ci = 0, scci = scl_offset; ci < atom->siclusters[sci].nclusters; ci++, scci += CLUSTER_M) {
|
|
||||||
|
|
||||||
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
|
|
||||||
MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
|
|
||||||
MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
|
|
||||||
|
|
||||||
/*
|
|
||||||
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
|
|
||||||
MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
|
|
||||||
MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
|
|
||||||
*/
|
|
||||||
|
|
||||||
memcpy(&atom->scl_x[scci], &ci_x[0], CLUSTER_M * sizeof(MD_FLOAT));
|
|
||||||
memcpy(&atom->scl_x[scci + SCLUSTER_SIZE * CLUSTER_M], &ci_x[0 + CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
|
|
||||||
memcpy(&atom->scl_x[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], &ci_x[0 + 2 * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
|
|
||||||
|
|
||||||
memcpy(&atom->scl_v[scci], &ci_v[0], CLUSTER_M * sizeof(MD_FLOAT));
|
|
||||||
memcpy(&atom->scl_v[scci + SCLUSTER_SIZE * CLUSTER_M], &ci_v[0 + CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
|
|
||||||
memcpy(&atom->scl_v[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], &ci_v[0 + 2 * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
|
|
||||||
|
|
||||||
memcpy(&atom->scl_f[scci], &ci_f[0], CLUSTER_M * sizeof(MD_FLOAT));
|
|
||||||
memcpy(&atom->scl_f[scci + SCLUSTER_SIZE * CLUSTER_M], &ci_f[0 + CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
|
|
||||||
memcpy(&atom->scl_f[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], &ci_f[0 + 2 * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
extern "C"
|
|
||||||
void alignDataFromSuperclusters(Atom *atom) {
|
|
||||||
for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
|
|
||||||
const unsigned int scl_offset = sci * SCLUSTER_SIZE * 3 * CLUSTER_M;
|
|
||||||
|
|
||||||
for (int ci = 0, scci = scl_offset; ci < atom->siclusters[sci].nclusters; ci++, scci += CLUSTER_M) {
|
|
||||||
|
|
||||||
|
|
||||||
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
|
|
||||||
MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
|
|
||||||
MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
|
|
||||||
|
|
||||||
/*
|
|
||||||
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
|
|
||||||
MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
|
|
||||||
MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
|
|
||||||
*/
|
|
||||||
|
|
||||||
memcpy(&ci_x[0], &atom->scl_x[scci], CLUSTER_M * sizeof(MD_FLOAT));
|
|
||||||
memcpy(&ci_x[0 + CLUSTER_M], &atom->scl_x[scci + SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
|
|
||||||
memcpy(&ci_x[0 + 2 * CLUSTER_M], &atom->scl_x[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
|
|
||||||
|
|
||||||
memcpy(&ci_v[0], &atom->scl_v[scci], CLUSTER_M * sizeof(MD_FLOAT));
|
|
||||||
memcpy(&ci_v[0 + CLUSTER_M], &atom->scl_v[scci + SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
|
|
||||||
memcpy(&ci_v[0 + 2 * CLUSTER_M], &atom->scl_v[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
|
|
||||||
|
|
||||||
memcpy(&ci_f[0], &atom->scl_f[scci], CLUSTER_M * sizeof(MD_FLOAT));
|
|
||||||
memcpy(&ci_f[0 + CLUSTER_M], &atom->scl_f[scci + SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
|
|
||||||
memcpy(&ci_f[0 + 2 * CLUSTER_M], &atom->scl_f[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
__global__ void cudaInitialIntegrateSup_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
|
|
||||||
int *cuda_nclusters,
|
|
||||||
int *cuda_natoms,
|
|
||||||
int Nsclusters_local, MD_FLOAT dtforce, MD_FLOAT dt) {
|
|
||||||
|
|
||||||
unsigned int sci_pos = blockDim.x * blockIdx.x + threadIdx.x;
|
|
||||||
//unsigned int cii_pos = blockDim.y * blockIdx.y + threadIdx.y;
|
|
||||||
if (sci_pos >= Nsclusters_local) return;
|
|
||||||
|
|
||||||
//unsigned int ci_pos = cii_pos / CLUSTER_M;
|
|
||||||
//unsigned int scii_pos = cii_pos % CLUSTER_M;
|
|
||||||
|
|
||||||
//if (ci_pos >= cuda_nclusters[sci_pos]) return;
|
|
||||||
//if (scii_pos >= cuda_natoms[ci_pos]) return;
|
|
||||||
|
|
||||||
int ci_vec_base = SCI_VECTOR_BASE_INDEX(sci_pos);
|
|
||||||
MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base];
|
|
||||||
MD_FLOAT *ci_v = &cuda_cl_v[ci_vec_base];
|
|
||||||
MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];
|
|
||||||
|
|
||||||
for (int scii_pos = 0; scii_pos < SCLUSTER_M; scii_pos++) {
|
|
||||||
ci_v[SCL_X_OFFSET + scii_pos] += dtforce * ci_f[SCL_X_OFFSET + scii_pos];
|
|
||||||
ci_v[SCL_Y_OFFSET + scii_pos] += dtforce * ci_f[SCL_Y_OFFSET + scii_pos];
|
|
||||||
ci_v[SCL_Z_OFFSET + scii_pos] += dtforce * ci_f[SCL_Z_OFFSET + scii_pos];
|
|
||||||
ci_x[SCL_X_OFFSET + scii_pos] += dt * ci_v[SCL_X_OFFSET + scii_pos];
|
|
||||||
ci_x[SCL_Y_OFFSET + scii_pos] += dt * ci_v[SCL_Y_OFFSET + scii_pos];
|
|
||||||
ci_x[SCL_Z_OFFSET + scii_pos] += dt * ci_v[SCL_Z_OFFSET + scii_pos];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
__global__ void cudaFinalIntegrateSup_warp(MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
|
|
||||||
int *cuda_nclusters, int *cuda_natoms,
|
|
||||||
int Nsclusters_local, MD_FLOAT dtforce) {
|
|
||||||
|
|
||||||
unsigned int sci_pos = blockDim.x * blockIdx.x + threadIdx.x;
|
|
||||||
//unsigned int cii_pos = blockDim.y * blockIdx.y + threadIdx.y;
|
|
||||||
if (sci_pos >= Nsclusters_local) return;
|
|
||||||
|
|
||||||
//unsigned int ci_pos = cii_pos / CLUSTER_M;
|
|
||||||
//unsigned int scii_pos = cii_pos % CLUSTER_M;
|
|
||||||
|
|
||||||
//if (ci_pos >= cuda_nclusters[sci_pos]) return;
|
|
||||||
//if (scii_pos >= cuda_natoms[ci_pos]) return;
|
|
||||||
|
|
||||||
int ci_vec_base = SCI_VECTOR_BASE_INDEX(sci_pos);
|
|
||||||
MD_FLOAT *ci_v = &cuda_cl_v[ci_vec_base];
|
|
||||||
MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];
|
|
||||||
|
|
||||||
for (int scii_pos = 0; scii_pos < SCLUSTER_M; scii_pos++) {
|
|
||||||
ci_v[SCL_X_OFFSET + scii_pos] += dtforce * ci_f[SCL_X_OFFSET + scii_pos];
|
|
||||||
ci_v[SCL_Y_OFFSET + scii_pos] += dtforce * ci_f[SCL_Y_OFFSET + scii_pos];
|
|
||||||
ci_v[SCL_Z_OFFSET + scii_pos] += dtforce * ci_f[SCL_Z_OFFSET + scii_pos];
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
__global__ void computeForceLJSup_cuda_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_f,
|
|
||||||
int *cuda_nclusters, int *cuda_iclusters,
|
|
||||||
int Nsclusters_local,
|
|
||||||
int *cuda_numneigh, int *cuda_neighs, int half_neigh, int maxneighs,
|
|
||||||
MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon) {
|
|
||||||
|
|
||||||
unsigned int sci_pos = blockDim.x * blockIdx.x + threadIdx.x;
|
|
||||||
unsigned int scii_pos = blockDim.y * blockIdx.y + threadIdx.y;
|
|
||||||
unsigned int cjj_pos = blockDim.z * blockIdx.z + threadIdx.z;
|
|
||||||
if ((sci_pos >= Nsclusters_local) || (scii_pos >= SCLUSTER_M) || (cjj_pos >= CLUSTER_N)) return;
|
|
||||||
|
|
||||||
unsigned int ci_pos = scii_pos / CLUSTER_M;
|
|
||||||
unsigned int cii_pos = scii_pos % CLUSTER_M;
|
|
||||||
|
|
||||||
if (ci_pos >= cuda_nclusters[sci_pos]) return;
|
|
||||||
|
|
||||||
int ci_cj0 = CJ0_FROM_CI(ci_pos);
|
|
||||||
int ci_vec_base = SCI_VECTOR_BASE_INDEX(sci_pos);
|
|
||||||
MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base];
|
|
||||||
MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];
|
|
||||||
|
|
||||||
|
|
||||||
//int numneighs = cuda_numneigh[ci_pos];
|
|
||||||
int numneighs = cuda_numneigh[cuda_iclusters[SCLUSTER_SIZE * sci_pos + ci_pos]];
|
|
||||||
|
|
||||||
for(int k = 0; k < numneighs; k++) {
|
|
||||||
int glob_j = (&cuda_neighs[cuda_iclusters[SCLUSTER_SIZE * sci_pos + ci_pos] * maxneighs])[k];
|
|
||||||
int scj = glob_j / SCLUSTER_SIZE;
|
|
||||||
// TODO Make cj accessible from super cluster data alignment (not reachable right now)
|
|
||||||
int cj = SCJ_VECTOR_BASE_INDEX(scj) + CLUSTER_M * (glob_j % SCLUSTER_SIZE);
|
|
||||||
int cj_vec_base = cj;
|
|
||||||
MD_FLOAT *cj_x = &cuda_cl_x[cj_vec_base];
|
|
||||||
MD_FLOAT *cj_f = &cuda_cl_f[cj_vec_base];
|
|
||||||
|
|
||||||
MD_FLOAT xtmp = ci_x[SCL_CL_X_OFFSET(ci_pos) + cii_pos];
|
|
||||||
MD_FLOAT ytmp = ci_x[SCL_CL_Y_OFFSET(ci_pos) + cii_pos];
|
|
||||||
MD_FLOAT ztmp = ci_x[SCL_CL_Z_OFFSET(ci_pos) + cii_pos];
|
|
||||||
MD_FLOAT fix = 0;
|
|
||||||
MD_FLOAT fiy = 0;
|
|
||||||
MD_FLOAT fiz = 0;
|
|
||||||
|
|
||||||
|
|
||||||
//int cond = ci_cj0 != cj || cii_pos != cjj_pos || scj != sci_pos;
|
|
||||||
int cond = (glob_j != cuda_iclusters[SCLUSTER_SIZE * sci_pos + ci_pos] && cii_pos != cjj_pos);
|
|
||||||
|
|
||||||
if(cond) {
|
|
||||||
MD_FLOAT delx = xtmp - cj_x[SCL_CL_X_OFFSET(ci_pos) + cjj_pos];
|
|
||||||
MD_FLOAT dely = ytmp - cj_x[SCL_CL_Y_OFFSET(ci_pos) + cjj_pos];
|
|
||||||
MD_FLOAT delz = ztmp - cj_x[SCL_CL_Z_OFFSET(ci_pos) + cjj_pos];
|
|
||||||
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
|
||||||
if(rsq < cutforcesq) {
|
|
||||||
MD_FLOAT sr2 = 1.0 / rsq;
|
|
||||||
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
|
|
||||||
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
|
|
||||||
|
|
||||||
if(half_neigh) {
|
|
||||||
atomicAdd(&cj_f[SCL_CL_X_OFFSET(ci_pos) + cjj_pos], -delx * force);
|
|
||||||
atomicAdd(&cj_f[SCL_CL_Y_OFFSET(ci_pos) + cjj_pos], -dely * force);
|
|
||||||
atomicAdd(&cj_f[SCL_CL_Z_OFFSET(ci_pos) + cjj_pos], -delz * force);
|
|
||||||
}
|
|
||||||
|
|
||||||
fix += delx * force;
|
|
||||||
fiy += dely * force;
|
|
||||||
fiz += delz * force;
|
|
||||||
|
|
||||||
atomicAdd(&ci_f[SCL_CL_X_OFFSET(ci_pos) + cii_pos], fix);
|
|
||||||
atomicAdd(&ci_f[SCL_CL_Y_OFFSET(ci_pos) + cii_pos], fiy);
|
|
||||||
atomicAdd(&ci_f[SCL_CL_Z_OFFSET(ci_pos) + cii_pos], fiz);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
extern "C"
|
|
||||||
double computeForceLJSup_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
|
||||||
DEBUG_MESSAGE("computeForceLJSup_cuda start\r\n");
|
|
||||||
|
|
||||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
|
||||||
MD_FLOAT sigma6 = param->sigma6;
|
|
||||||
MD_FLOAT epsilon = param->epsilon;
|
|
||||||
|
|
||||||
memsetGPU(cuda_cl_f, 0, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
|
||||||
if (isReneighboured) {
|
|
||||||
|
|
||||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
|
||||||
memcpyToGPU(&cuda_numneigh[ci], &neighbor->numneigh[ci], sizeof(int));
|
|
||||||
memcpyToGPU(&cuda_neighbors[ci * neighbor->maxneighs], &neighbor->neighbors[ci * neighbor->maxneighs], neighbor->numneigh[ci] * sizeof(int));
|
|
||||||
}
|
|
||||||
|
|
||||||
for(int sci = 0; sci < atom->Nsclusters_local; sci++) {
|
|
||||||
memcpyToGPU(&cuda_nclusters[sci], &atom->siclusters[sci].nclusters, sizeof(int));
|
|
||||||
//memcpyToGPU(&cuda_iclusters[sci * SCLUSTER_SIZE], &atom->siclusters[sci].iclusters, sizeof(int) * atom->siclusters[sci].nclusters);
|
|
||||||
}
|
|
||||||
|
|
||||||
memcpyToGPU(cuda_iclusters, atom->icluster_idx, atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int));
|
|
||||||
|
|
||||||
isReneighboured = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
const int threads_num = 1;
|
|
||||||
dim3 block_size = dim3(threads_num, SCLUSTER_M, CLUSTER_N);
|
|
||||||
dim3 grid_size = dim3(atom->Nsclusters_local/threads_num+1, 1, 1);
|
|
||||||
double S = getTimeStamp();
|
|
||||||
LIKWID_MARKER_START("force");
|
|
||||||
computeForceLJSup_cuda_warp<<<grid_size, block_size>>>(cuda_scl_x, cuda_scl_f,
|
|
||||||
cuda_nclusters, cuda_iclusters,
|
|
||||||
atom->Nsclusters_local,
|
|
||||||
cuda_numneigh, cuda_neighbors,
|
|
||||||
neighbor->half_neigh, neighbor->maxneighs, cutforcesq,
|
|
||||||
sigma6, epsilon);
|
|
||||||
cuda_assert("computeForceLJ_cuda", cudaPeekAtLastError());
|
|
||||||
cuda_assert("computeForceLJ_cuda", cudaDeviceSynchronize());
|
|
||||||
LIKWID_MARKER_STOP("force");
|
|
||||||
double E = getTimeStamp();
|
|
||||||
DEBUG_MESSAGE("computeForceLJSup_cuda stop\r\n");
|
|
||||||
return E-S;
|
|
||||||
}
|
|
||||||
#endif //USE_SUPER_CLUSTERS
|
|
@@ -16,10 +16,36 @@
|
|||||||
#include <simd.h>
|
#include <simd.h>
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
static inline void gmx_load_simd_2xnn_interactions(
|
||||||
|
int excl,
|
||||||
|
MD_SIMD_BITMASK filter0, MD_SIMD_BITMASK filter2,
|
||||||
|
MD_SIMD_MASK *interact0, MD_SIMD_MASK *interact2) {
|
||||||
|
|
||||||
|
//SimdInt32 mask_pr_S(excl);
|
||||||
|
MD_SIMD_INT32 mask_pr_S = simd_int32_broadcast(excl);
|
||||||
|
*interact0 = cvtIB2B(simd_test_bits(mask_pr_S & filter0));
|
||||||
|
*interact2 = cvtIB2B(simd_test_bits(mask_pr_S & filter2));
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void gmx_load_simd_4xn_interactions(
|
||||||
|
int excl,
|
||||||
|
MD_SIMD_BITMASK filter0, MD_SIMD_BITMASK filter1, MD_SIMD_BITMASK filter2, MD_SIMD_BITMASK filter3,
|
||||||
|
MD_SIMD_MASK *interact0, MD_SIMD_MASK *interact1, MD_SIMD_MASK *interact2, MD_SIMD_MASK *interact3) {
|
||||||
|
|
||||||
|
//SimdInt32 mask_pr_S(excl);
|
||||||
|
MD_SIMD_INT32 mask_pr_S = simd_int32_broadcast(excl);
|
||||||
|
*interact0 = cvtIB2B(simd_test_bits(mask_pr_S & filter0));
|
||||||
|
*interact1 = cvtIB2B(simd_test_bits(mask_pr_S & filter1));
|
||||||
|
*interact2 = cvtIB2B(simd_test_bits(mask_pr_S & filter2));
|
||||||
|
*interact3 = cvtIB2B(simd_test_bits(mask_pr_S & filter3));
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||||
DEBUG_MESSAGE("computeForceLJ begin\n");
|
DEBUG_MESSAGE("computeForceLJ begin\n");
|
||||||
int Nlocal = atom->Nlocal;
|
int Nlocal = atom->Nlocal;
|
||||||
int* neighs;
|
NeighborCluster* neighs;
|
||||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||||
MD_FLOAT sigma6 = param->sigma6;
|
MD_FLOAT sigma6 = param->sigma6;
|
||||||
MD_FLOAT epsilon = param->epsilon;
|
MD_FLOAT epsilon = param->epsilon;
|
||||||
@@ -35,9 +61,12 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
|||||||
}
|
}
|
||||||
|
|
||||||
double S = getTimeStamp();
|
double S = getTimeStamp();
|
||||||
|
|
||||||
|
#pragma omp parallel
|
||||||
|
{
|
||||||
LIKWID_MARKER_START("force");
|
LIKWID_MARKER_START("force");
|
||||||
|
|
||||||
#pragma omp parallel for
|
#pragma omp for
|
||||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||||
int ci_cj0 = CJ0_FROM_CI(ci);
|
int ci_cj0 = CJ0_FROM_CI(ci);
|
||||||
int ci_cj1 = CJ1_FROM_CI(ci);
|
int ci_cj1 = CJ1_FROM_CI(ci);
|
||||||
@@ -48,7 +77,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
|||||||
int numneighs = neighbor->numneigh[ci];
|
int numneighs = neighbor->numneigh[ci];
|
||||||
|
|
||||||
for(int k = 0; k < numneighs; k++) {
|
for(int k = 0; k < numneighs; k++) {
|
||||||
int cj = neighs[k];
|
int cj = neighs[k].cj;
|
||||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||||
int any = 0;
|
int any = 0;
|
||||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||||
@@ -119,6 +148,8 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
|||||||
}
|
}
|
||||||
|
|
||||||
LIKWID_MARKER_STOP("force");
|
LIKWID_MARKER_STOP("force");
|
||||||
|
}
|
||||||
|
|
||||||
double E = getTimeStamp();
|
double E = getTimeStamp();
|
||||||
DEBUG_MESSAGE("computeForceLJ end\n");
|
DEBUG_MESSAGE("computeForceLJ end\n");
|
||||||
return E-S;
|
return E-S;
|
||||||
@@ -127,7 +158,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
|||||||
double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||||
DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
|
DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
|
||||||
int Nlocal = atom->Nlocal;
|
int Nlocal = atom->Nlocal;
|
||||||
int* neighs;
|
NeighborCluster* neighs;
|
||||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||||
MD_FLOAT sigma6 = param->sigma6;
|
MD_FLOAT sigma6 = param->sigma6;
|
||||||
MD_FLOAT epsilon = param->epsilon;
|
MD_FLOAT epsilon = param->epsilon;
|
||||||
@@ -136,7 +167,6 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
|||||||
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
|
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
|
||||||
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
||||||
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
||||||
const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
|
|
||||||
|
|
||||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||||
@@ -149,9 +179,41 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
|||||||
}
|
}
|
||||||
|
|
||||||
double S = getTimeStamp();
|
double S = getTimeStamp();
|
||||||
|
|
||||||
|
#pragma omp parallel
|
||||||
|
{
|
||||||
LIKWID_MARKER_START("force");
|
LIKWID_MARKER_START("force");
|
||||||
|
|
||||||
#pragma omp parallel for
|
/*
|
||||||
|
MD_SIMD_BITMASK filter0 = simd_load_bitmask((const int *) &atom->exclusion_filter[0 * (VECTOR_WIDTH / UNROLL_J)]);
|
||||||
|
MD_SIMD_BITMASK filter2 = simd_load_bitmask((const int *) &atom->exclusion_filter[2 * (VECTOR_WIDTH / UNROLL_J)]);
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT diagonal_jmi_S = simd_load(atom->diagonal_2xnn_j_minus_i);
|
||||||
|
MD_SIMD_FLOAT zero_S = simd_broadcast(0.0);
|
||||||
|
MD_SIMD_FLOAT one_S = simd_broadcast(1.0);
|
||||||
|
|
||||||
|
#if CLUSTER_M <= CLUSTER_N
|
||||||
|
MD_SIMD_MASK diagonal_mask0, diagonal_mask2;
|
||||||
|
diagonal_mask0 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
|
||||||
|
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||||
|
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||||
|
diagonal_mask2 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
|
||||||
|
#else
|
||||||
|
MD_SIMD_MASK diagonal_mask00, diagonal_mask02, diagonal_mask10, diagonal_mask12;
|
||||||
|
diagonal_mask00 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
|
||||||
|
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||||
|
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||||
|
diagonal_mask02 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
|
||||||
|
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||||
|
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||||
|
diagonal_mask10 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
|
||||||
|
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||||
|
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||||
|
diagonal_mask12 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
|
||||||
|
#endif
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma omp for
|
||||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||||
int ci_cj0 = CJ0_FROM_CI(ci);
|
int ci_cj0 = CJ0_FROM_CI(ci);
|
||||||
#if CLUSTER_M > CLUSTER_N
|
#if CLUSTER_M > CLUSTER_N
|
||||||
@@ -162,6 +224,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
|||||||
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||||
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
||||||
int numneighs = neighbor->numneigh[ci];
|
int numneighs = neighbor->numneigh[ci];
|
||||||
|
int numneighs_masked = neighbor->numneigh_masked[ci];
|
||||||
|
|
||||||
MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]);
|
MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]);
|
||||||
MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]);
|
MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]);
|
||||||
@@ -176,76 +239,138 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
|||||||
MD_SIMD_FLOAT fiy2 = simd_zero();
|
MD_SIMD_FLOAT fiy2 = simd_zero();
|
||||||
MD_SIMD_FLOAT fiz2 = simd_zero();
|
MD_SIMD_FLOAT fiz2 = simd_zero();
|
||||||
|
|
||||||
for(int k = 0; k < numneighs; k++) {
|
for(int k = 0; k < numneighs_masked; k++) {
|
||||||
int cj = neighs[k];
|
int cj = neighs[k].cj;
|
||||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||||
|
//int imask = neighs[k].imask;
|
||||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||||
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
||||||
unsigned int mask0, mask1, mask2, mask3;
|
//MD_SIMD_MASK interact0;
|
||||||
|
//MD_SIMD_MASK interact2;
|
||||||
|
|
||||||
|
//gmx_load_simd_2xnn_interactions((int)imask, filter0, filter2, &interact0, &interact2);
|
||||||
|
|
||||||
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
|
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
|
||||||
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
|
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
|
||||||
MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
|
MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
|
||||||
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
|
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||||
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
|
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||||
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
|
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||||
MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
|
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||||
MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
|
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||||
MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
|
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||||
|
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||||
|
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||||
|
|
||||||
#if CLUSTER_M == CLUSTER_N
|
#if CLUSTER_M == CLUSTER_N
|
||||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||||
mask0 = (unsigned int)(0xf - 0x1 * cond0);
|
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_2xnn_hn[cond0 * 2 + 0]);
|
||||||
mask1 = (unsigned int)(0xf - 0x3 * cond0);
|
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_2xnn_hn[cond0 * 2 + 1]);
|
||||||
mask2 = (unsigned int)(0xf - 0x7 * cond0);
|
#else
|
||||||
mask3 = (unsigned int)(0xf - 0xf * cond0);
|
#if CLUSTER_M < CLUSTER_N
|
||||||
#elif CLUSTER_M < CLUSTER_N
|
|
||||||
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
|
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
|
||||||
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
|
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
|
||||||
mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
|
|
||||||
mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
|
|
||||||
mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
|
|
||||||
mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
|
|
||||||
#else
|
#else
|
||||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||||
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
|
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
|
||||||
mask0 = (unsigned int)(0x3 - 0x1 * cond0);
|
#endif
|
||||||
mask1 = (unsigned int)(0x3 - 0x3 * cond0);
|
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 0]);
|
||||||
mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1);
|
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 1]);
|
||||||
mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1);
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0);
|
MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec);
|
||||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2);
|
MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec);
|
||||||
|
cutoff_mask0 = simd_mask_and(cutoff_mask0, excl_mask0);
|
||||||
|
cutoff_mask2 = simd_mask_and(cutoff_mask2, excl_mask2);
|
||||||
|
|
||||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
|
/*
|
||||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
|
#if CLUSTER_M <= CLUSTER_N
|
||||||
|
if(ci == ci_cj0) {
|
||||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
|
cutoff_mask0 = simd_mask_and(cutoff_mask0, diagonal_mask0);
|
||||||
MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
|
cutoff_mask2 = simd_mask_and(cutoff_mask2, diagonal_mask2);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
if(ci == ci_cj0) {
|
||||||
|
cutoff_mask0 = cutoff_mask0 && diagonal_mask00;
|
||||||
|
cutoff_mask2 = cutoff_mask2 && diagonal_mask02;
|
||||||
|
} else if(ci == ci_cj1) {
|
||||||
|
cutoff_mask0 = cutoff_mask0 && diagonal_mask10;
|
||||||
|
cutoff_mask2 = cutoff_mask2 && diagonal_mask12;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
*/
|
||||||
|
|
||||||
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
||||||
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||||
|
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||||
|
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||||
|
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||||
|
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||||
|
|
||||||
MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
|
MD_SIMD_FLOAT tx0 = select_by_mask(delx0 * force0, cutoff_mask0);
|
||||||
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
|
MD_SIMD_FLOAT ty0 = select_by_mask(dely0 * force0, cutoff_mask0);
|
||||||
|
MD_SIMD_FLOAT tz0 = select_by_mask(delz0 * force0, cutoff_mask0);
|
||||||
|
MD_SIMD_FLOAT tx2 = select_by_mask(delx2 * force2, cutoff_mask2);
|
||||||
|
MD_SIMD_FLOAT ty2 = select_by_mask(dely2 * force2, cutoff_mask2);
|
||||||
|
MD_SIMD_FLOAT tz2 = select_by_mask(delz2 * force2, cutoff_mask2);
|
||||||
|
|
||||||
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
|
fix0 += tx0;
|
||||||
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
|
fiy0 += ty0;
|
||||||
|
fiz0 += tz0;
|
||||||
|
fix2 += tx2;
|
||||||
|
fiy2 += ty2;
|
||||||
|
fiz2 += tz2;
|
||||||
|
|
||||||
MD_SIMD_FLOAT tx0 = select_by_mask(simd_mul(delx0, force0), cutoff_mask0);
|
#ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ
|
||||||
MD_SIMD_FLOAT ty0 = select_by_mask(simd_mul(dely0, force0), cutoff_mask0);
|
if(cj < CJ1_FROM_CI(atom->Nlocal)) {
|
||||||
MD_SIMD_FLOAT tz0 = select_by_mask(simd_mul(delz0, force0), cutoff_mask0);
|
simd_h_decr3(cj_f, tx0 + tx2, ty0 + ty2, tz0 + tz2);
|
||||||
MD_SIMD_FLOAT tx2 = select_by_mask(simd_mul(delx2, force2), cutoff_mask2);
|
}
|
||||||
MD_SIMD_FLOAT ty2 = select_by_mask(simd_mul(dely2, force2), cutoff_mask2);
|
#else
|
||||||
MD_SIMD_FLOAT tz2 = select_by_mask(simd_mul(delz2, force2), cutoff_mask2);
|
simd_h_decr3(cj_f, tx0 + tx2, ty0 + ty2, tz0 + tz2);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
fix0 = simd_add(fix0, tx0);
|
for(int k = numneighs_masked; k < numneighs; k++) {
|
||||||
fiy0 = simd_add(fiy0, ty0);
|
int cj = neighs[k].cj;
|
||||||
fiz0 = simd_add(fiz0, tz0);
|
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||||
fix2 = simd_add(fix2, tx2);
|
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||||
fiy2 = simd_add(fiy2, ty2);
|
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
||||||
fiz2 = simd_add(fiz2, tz2);
|
|
||||||
|
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
|
||||||
|
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
|
||||||
|
MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
|
||||||
|
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||||
|
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||||
|
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||||
|
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||||
|
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||||
|
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||||
|
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||||
|
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||||
|
|
||||||
|
MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec);
|
||||||
|
MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec);
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
||||||
|
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||||
|
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||||
|
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||||
|
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||||
|
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT tx0 = select_by_mask(delx0 * force0, cutoff_mask0);
|
||||||
|
MD_SIMD_FLOAT ty0 = select_by_mask(dely0 * force0, cutoff_mask0);
|
||||||
|
MD_SIMD_FLOAT tz0 = select_by_mask(delz0 * force0, cutoff_mask0);
|
||||||
|
MD_SIMD_FLOAT tx2 = select_by_mask(delx2 * force2, cutoff_mask2);
|
||||||
|
MD_SIMD_FLOAT ty2 = select_by_mask(dely2 * force2, cutoff_mask2);
|
||||||
|
MD_SIMD_FLOAT tz2 = select_by_mask(delz2 * force2, cutoff_mask2);
|
||||||
|
|
||||||
|
fix0 += tx0;
|
||||||
|
fiy0 += ty0;
|
||||||
|
fiz0 += tz0;
|
||||||
|
fix2 += tx2;
|
||||||
|
fiy2 += ty2;
|
||||||
|
fiz2 += tz2;
|
||||||
|
|
||||||
#ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ
|
#ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ
|
||||||
if(cj < CJ1_FROM_CI(atom->Nlocal)) {
|
if(cj < CJ1_FROM_CI(atom->Nlocal)) {
|
||||||
@@ -266,6 +391,8 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
|||||||
}
|
}
|
||||||
|
|
||||||
LIKWID_MARKER_STOP("force");
|
LIKWID_MARKER_STOP("force");
|
||||||
|
}
|
||||||
|
|
||||||
double E = getTimeStamp();
|
double E = getTimeStamp();
|
||||||
DEBUG_MESSAGE("computeForceLJ_2xnn end\n");
|
DEBUG_MESSAGE("computeForceLJ_2xnn end\n");
|
||||||
return E-S;
|
return E-S;
|
||||||
@@ -274,7 +401,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
|||||||
double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||||
DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
|
DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
|
||||||
int Nlocal = atom->Nlocal;
|
int Nlocal = atom->Nlocal;
|
||||||
int* neighs;
|
NeighborCluster* neighs;
|
||||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||||
MD_FLOAT sigma6 = param->sigma6;
|
MD_FLOAT sigma6 = param->sigma6;
|
||||||
MD_FLOAT epsilon = param->epsilon;
|
MD_FLOAT epsilon = param->epsilon;
|
||||||
@@ -283,7 +410,6 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
|
|||||||
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
|
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
|
||||||
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
||||||
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
||||||
const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
|
|
||||||
|
|
||||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||||
@@ -296,9 +422,12 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
|
|||||||
}
|
}
|
||||||
|
|
||||||
double S = getTimeStamp();
|
double S = getTimeStamp();
|
||||||
|
|
||||||
|
#pragma omp parallel
|
||||||
|
{
|
||||||
LIKWID_MARKER_START("force");
|
LIKWID_MARKER_START("force");
|
||||||
|
|
||||||
#pragma omp parallel for
|
#pragma omp for
|
||||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||||
int ci_cj0 = CJ0_FROM_CI(ci);
|
int ci_cj0 = CJ0_FROM_CI(ci);
|
||||||
#if CLUSTER_M > CLUSTER_N
|
#if CLUSTER_M > CLUSTER_N
|
||||||
@@ -309,6 +438,7 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
|
|||||||
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||||
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
||||||
int numneighs = neighbor->numneigh[ci];
|
int numneighs = neighbor->numneigh[ci];
|
||||||
|
int numneighs_masked = neighbor->numneigh_masked[ci];
|
||||||
|
|
||||||
MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]);
|
MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]);
|
||||||
MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]);
|
MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]);
|
||||||
@@ -323,61 +453,85 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
|
|||||||
MD_SIMD_FLOAT fiy2 = simd_zero();
|
MD_SIMD_FLOAT fiy2 = simd_zero();
|
||||||
MD_SIMD_FLOAT fiz2 = simd_zero();
|
MD_SIMD_FLOAT fiz2 = simd_zero();
|
||||||
|
|
||||||
for(int k = 0; k < numneighs; k++) {
|
for(int k = 0; k < numneighs_masked; k++) {
|
||||||
int cj = neighs[k];
|
int cj = neighs[k].cj;
|
||||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||||
|
int imask = neighs[k].imask;
|
||||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||||
unsigned int mask0, mask1, mask2, mask3;
|
unsigned int mask0, mask1, mask2, mask3;
|
||||||
|
|
||||||
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
|
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
|
||||||
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
|
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
|
||||||
MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
|
MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
|
||||||
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
|
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||||
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
|
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||||
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
|
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||||
MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
|
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||||
MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
|
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||||
MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
|
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||||
|
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||||
|
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||||
|
|
||||||
#if CLUSTER_M == CLUSTER_N
|
#if CLUSTER_M == CLUSTER_N
|
||||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||||
mask0 = (unsigned int)(0xf - 0x1 * cond0);
|
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_2xnn_fn[cond0 * 2 + 0]);
|
||||||
mask1 = (unsigned int)(0xf - 0x2 * cond0);
|
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_2xnn_fn[cond0 * 2 + 1]);
|
||||||
mask2 = (unsigned int)(0xf - 0x4 * cond0);
|
#else
|
||||||
mask3 = (unsigned int)(0xf - 0x8 * cond0);
|
#if CLUSTER_M < CLUSTER_N
|
||||||
#elif CLUSTER_M < CLUSTER_N
|
|
||||||
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
|
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
|
||||||
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
|
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
|
||||||
mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
|
|
||||||
mask1 = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
|
|
||||||
mask2 = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
|
|
||||||
mask3 = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
|
|
||||||
#else
|
#else
|
||||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||||
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
|
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
|
||||||
mask0 = (unsigned int)(0x3 - 0x1 * cond0);
|
|
||||||
mask1 = (unsigned int)(0x3 - 0x2 * cond0);
|
|
||||||
mask2 = (unsigned int)(0x3 - 0x1 * cond1);
|
|
||||||
mask3 = (unsigned int)(0x3 - 0x2 * cond1);
|
|
||||||
#endif
|
#endif
|
||||||
|
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 0]);
|
||||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0);
|
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 1]);
|
||||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2);
|
#endif
|
||||||
|
|
||||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
|
|
||||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
|
|
||||||
|
|
||||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
|
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
|
||||||
MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
|
MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
|
||||||
|
|
||||||
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
||||||
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||||
|
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||||
|
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||||
|
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||||
|
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||||
|
|
||||||
MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
|
fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
|
||||||
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
|
fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
|
||||||
|
fiz0 = simd_masked_add(fiz0, simd_mul(delz0, force0), cutoff_mask0);
|
||||||
|
fix2 = simd_masked_add(fix2, simd_mul(delx2, force2), cutoff_mask2);
|
||||||
|
fiy2 = simd_masked_add(fiy2, simd_mul(dely2, force2), cutoff_mask2);
|
||||||
|
fiz2 = simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2);
|
||||||
|
}
|
||||||
|
|
||||||
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
|
for(int k = numneighs_masked; k < numneighs; k++) {
|
||||||
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
|
int cj = neighs[k].cj;
|
||||||
|
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||||
|
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
|
||||||
|
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
|
||||||
|
MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
|
||||||
|
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||||
|
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||||
|
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||||
|
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||||
|
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||||
|
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||||
|
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||||
|
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||||
|
|
||||||
|
MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec);
|
||||||
|
MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec);
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
||||||
|
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||||
|
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||||
|
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||||
|
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||||
|
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||||
|
|
||||||
fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
|
fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
|
||||||
fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
|
fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
|
||||||
@@ -398,6 +552,8 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
|
|||||||
}
|
}
|
||||||
|
|
||||||
LIKWID_MARKER_STOP("force");
|
LIKWID_MARKER_STOP("force");
|
||||||
|
}
|
||||||
|
|
||||||
double E = getTimeStamp();
|
double E = getTimeStamp();
|
||||||
DEBUG_MESSAGE("computeForceLJ_2xnn end\n");
|
DEBUG_MESSAGE("computeForceLJ_2xnn end\n");
|
||||||
return E-S;
|
return E-S;
|
||||||
@@ -414,7 +570,7 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta
|
|||||||
double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||||
DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
|
DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
|
||||||
int Nlocal = atom->Nlocal;
|
int Nlocal = atom->Nlocal;
|
||||||
int* neighs;
|
NeighborCluster* neighs;
|
||||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||||
MD_FLOAT sigma6 = param->sigma6;
|
MD_FLOAT sigma6 = param->sigma6;
|
||||||
MD_FLOAT epsilon = param->epsilon;
|
MD_FLOAT epsilon = param->epsilon;
|
||||||
@@ -423,8 +579,6 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
|||||||
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
|
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
|
||||||
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
||||||
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
||||||
double S = getTimeStamp();
|
|
||||||
LIKWID_MARKER_START("force");
|
|
||||||
|
|
||||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||||
@@ -436,7 +590,13 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#pragma omp parallel for
|
double S = getTimeStamp();
|
||||||
|
|
||||||
|
#pragma omp parallel
|
||||||
|
{
|
||||||
|
LIKWID_MARKER_START("force");
|
||||||
|
|
||||||
|
#pragma omp for
|
||||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||||
int ci_cj0 = CJ0_FROM_CI(ci);
|
int ci_cj0 = CJ0_FROM_CI(ci);
|
||||||
#if CLUSTER_M > CLUSTER_N
|
#if CLUSTER_M > CLUSTER_N
|
||||||
@@ -447,6 +607,7 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
|||||||
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||||
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
||||||
int numneighs = neighbor->numneigh[ci];
|
int numneighs = neighbor->numneigh[ci];
|
||||||
|
int numneighs_masked = neighbor->numneigh_masked[ci];
|
||||||
|
|
||||||
MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]);
|
MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]);
|
||||||
MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]);
|
MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]);
|
||||||
@@ -473,53 +634,52 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
|||||||
MD_SIMD_FLOAT fiy3 = simd_zero();
|
MD_SIMD_FLOAT fiy3 = simd_zero();
|
||||||
MD_SIMD_FLOAT fiz3 = simd_zero();
|
MD_SIMD_FLOAT fiz3 = simd_zero();
|
||||||
|
|
||||||
for(int k = 0; k < numneighs; k++) {
|
for(int k = 0; k < numneighs_masked; k++) {
|
||||||
int cj = neighs[k];
|
int cj = neighs[k].cj;
|
||||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||||
|
int imask = neighs[k].imask;
|
||||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||||
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
||||||
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
|
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
|
||||||
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
|
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
|
||||||
MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
|
MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
|
||||||
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
|
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||||
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
|
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||||
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
|
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||||
MD_SIMD_FLOAT delx1 = simd_sub(xi1_tmp, xj_tmp);
|
MD_SIMD_FLOAT delx1 = xi1_tmp - xj_tmp;
|
||||||
MD_SIMD_FLOAT dely1 = simd_sub(yi1_tmp, yj_tmp);
|
MD_SIMD_FLOAT dely1 = yi1_tmp - yj_tmp;
|
||||||
MD_SIMD_FLOAT delz1 = simd_sub(zi1_tmp, zj_tmp);
|
MD_SIMD_FLOAT delz1 = zi1_tmp - zj_tmp;
|
||||||
MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
|
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||||
MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
|
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||||
MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
|
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||||
MD_SIMD_FLOAT delx3 = simd_sub(xi3_tmp, xj_tmp);
|
MD_SIMD_FLOAT delx3 = xi3_tmp - xj_tmp;
|
||||||
MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp);
|
MD_SIMD_FLOAT dely3 = yi3_tmp - yj_tmp;
|
||||||
MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp);
|
MD_SIMD_FLOAT delz3 = zi3_tmp - zj_tmp;
|
||||||
|
|
||||||
#if CLUSTER_M == CLUSTER_N
|
#if CLUSTER_M == CLUSTER_N
|
||||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
|
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 4 + 0]);
|
||||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x3 * cond0));
|
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 4 + 1]);
|
||||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x7 * cond0));
|
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 4 + 2]);
|
||||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0xf * cond0));
|
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 4 + 3]);
|
||||||
#elif CLUSTER_M < CLUSTER_N
|
#else
|
||||||
|
#if CLUSTER_M < CLUSTER_N
|
||||||
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
|
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
|
||||||
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
|
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
|
||||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1));
|
|
||||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1));
|
|
||||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1));
|
|
||||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0xf * cond0 - 0xff * cond1));
|
|
||||||
#else
|
#else
|
||||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||||
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
|
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
|
||||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond0));
|
#endif
|
||||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0x3 - 0x3 * cond0));
|
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 0]);
|
||||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0x3 - 0x3 * cond0 - 0x1 * cond1));
|
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 1]);
|
||||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0x3 - 0x3 * cond0 - 0x3 * cond1));
|
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 2]);
|
||||||
|
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 3]);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
|
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||||
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, simd_mul(delz1, delz1)));
|
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, delz1 * delz1));
|
||||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
|
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||||
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, simd_mul(delz3, delz3)));
|
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, delz3 * delz3));
|
||||||
|
|
||||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
|
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
|
||||||
MD_SIMD_MASK cutoff_mask1 = simd_mask_and(excl_mask1, simd_mask_cond_lt(rsq1, cutforcesq_vec));
|
MD_SIMD_MASK cutoff_mask1 = simd_mask_and(excl_mask1, simd_mask_cond_lt(rsq1, cutforcesq_vec));
|
||||||
@@ -531,28 +691,114 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
|||||||
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||||
MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
|
MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
|
||||||
|
|
||||||
MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
|
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||||
MD_SIMD_FLOAT sr6_1 = simd_mul(sr2_1, simd_mul(sr2_1, simd_mul(sr2_1, sigma6_vec)));
|
MD_SIMD_FLOAT sr6_1 = sr2_1 * sr2_1 * sr2_1 * sigma6_vec;
|
||||||
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
|
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||||
MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec)));
|
MD_SIMD_FLOAT sr6_3 = sr2_3 * sr2_3 * sr2_3 * sigma6_vec;
|
||||||
|
|
||||||
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
|
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||||
MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, eps_vec))));
|
MD_SIMD_FLOAT force1 = c48_vec * sr6_1 * (sr6_1 - c05_vec) * sr2_1 * eps_vec;
|
||||||
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
|
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||||
MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, eps_vec))));
|
MD_SIMD_FLOAT force3 = c48_vec * sr6_3 * (sr6_3 - c05_vec) * sr2_3 * eps_vec;
|
||||||
|
|
||||||
MD_SIMD_FLOAT tx0 = select_by_mask(simd_mul(delx0, force0), cutoff_mask0);
|
MD_SIMD_FLOAT tx0 = select_by_mask(delx0 * force0, cutoff_mask0);
|
||||||
MD_SIMD_FLOAT ty0 = select_by_mask(simd_mul(dely0, force0), cutoff_mask0);
|
MD_SIMD_FLOAT ty0 = select_by_mask(dely0 * force0, cutoff_mask0);
|
||||||
MD_SIMD_FLOAT tz0 = select_by_mask(simd_mul(delz0, force0), cutoff_mask0);
|
MD_SIMD_FLOAT tz0 = select_by_mask(delz0 * force0, cutoff_mask0);
|
||||||
MD_SIMD_FLOAT tx1 = select_by_mask(simd_mul(delx1, force1), cutoff_mask1);
|
MD_SIMD_FLOAT tx1 = select_by_mask(delx1 * force1, cutoff_mask1);
|
||||||
MD_SIMD_FLOAT ty1 = select_by_mask(simd_mul(dely1, force1), cutoff_mask1);
|
MD_SIMD_FLOAT ty1 = select_by_mask(dely1 * force1, cutoff_mask1);
|
||||||
MD_SIMD_FLOAT tz1 = select_by_mask(simd_mul(delz1, force1), cutoff_mask1);
|
MD_SIMD_FLOAT tz1 = select_by_mask(delz1 * force1, cutoff_mask1);
|
||||||
MD_SIMD_FLOAT tx2 = select_by_mask(simd_mul(delx2, force2), cutoff_mask2);
|
MD_SIMD_FLOAT tx2 = select_by_mask(delx2 * force2, cutoff_mask2);
|
||||||
MD_SIMD_FLOAT ty2 = select_by_mask(simd_mul(dely2, force2), cutoff_mask2);
|
MD_SIMD_FLOAT ty2 = select_by_mask(dely2 * force2, cutoff_mask2);
|
||||||
MD_SIMD_FLOAT tz2 = select_by_mask(simd_mul(delz2, force2), cutoff_mask2);
|
MD_SIMD_FLOAT tz2 = select_by_mask(delz2 * force2, cutoff_mask2);
|
||||||
MD_SIMD_FLOAT tx3 = select_by_mask(simd_mul(delx3, force3), cutoff_mask3);
|
MD_SIMD_FLOAT tx3 = select_by_mask(delx3 * force3, cutoff_mask3);
|
||||||
MD_SIMD_FLOAT ty3 = select_by_mask(simd_mul(dely3, force3), cutoff_mask3);
|
MD_SIMD_FLOAT ty3 = select_by_mask(dely3 * force3, cutoff_mask3);
|
||||||
MD_SIMD_FLOAT tz3 = select_by_mask(simd_mul(delz3, force3), cutoff_mask3);
|
MD_SIMD_FLOAT tz3 = select_by_mask(delz3 * force3, cutoff_mask3);
|
||||||
|
|
||||||
|
fix0 = simd_add(fix0, tx0);
|
||||||
|
fiy0 = simd_add(fiy0, ty0);
|
||||||
|
fiz0 = simd_add(fiz0, tz0);
|
||||||
|
fix1 = simd_add(fix1, tx1);
|
||||||
|
fiy1 = simd_add(fiy1, ty1);
|
||||||
|
fiz1 = simd_add(fiz1, tz1);
|
||||||
|
fix2 = simd_add(fix2, tx2);
|
||||||
|
fiy2 = simd_add(fiy2, ty2);
|
||||||
|
fiz2 = simd_add(fiz2, tz2);
|
||||||
|
fix3 = simd_add(fix3, tx3);
|
||||||
|
fiy3 = simd_add(fiy3, ty3);
|
||||||
|
fiz3 = simd_add(fiz3, tz3);
|
||||||
|
|
||||||
|
#ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ
|
||||||
|
if(cj < CJ1_FROM_CI(atom->Nlocal)) {
|
||||||
|
simd_store(&cj_f[CL_X_OFFSET], simd_load(&cj_f[CL_X_OFFSET]) - (tx0 + tx1 + tx2 + tx3));
|
||||||
|
simd_store(&cj_f[CL_Y_OFFSET], simd_load(&cj_f[CL_Y_OFFSET]) - (ty0 + ty1 + ty2 + ty3));
|
||||||
|
simd_store(&cj_f[CL_Z_OFFSET], simd_load(&cj_f[CL_Z_OFFSET]) - (tz0 + tz1 + tz2 + tz3));
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
simd_store(&cj_f[CL_X_OFFSET], simd_load(&cj_f[CL_X_OFFSET]) - (tx0 + tx1 + tx2 + tx3));
|
||||||
|
simd_store(&cj_f[CL_Y_OFFSET], simd_load(&cj_f[CL_Y_OFFSET]) - (ty0 + ty1 + ty2 + ty3));
|
||||||
|
simd_store(&cj_f[CL_Z_OFFSET], simd_load(&cj_f[CL_Z_OFFSET]) - (tz0 + tz1 + tz2 + tz3));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int k = numneighs_masked; k < numneighs; k++) {
|
||||||
|
int cj = neighs[k].cj;
|
||||||
|
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||||
|
int imask = neighs[k].imask;
|
||||||
|
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||||
|
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
||||||
|
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
|
||||||
|
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
|
||||||
|
MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
|
||||||
|
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||||
|
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||||
|
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||||
|
MD_SIMD_FLOAT delx1 = xi1_tmp - xj_tmp;
|
||||||
|
MD_SIMD_FLOAT dely1 = yi1_tmp - yj_tmp;
|
||||||
|
MD_SIMD_FLOAT delz1 = zi1_tmp - zj_tmp;
|
||||||
|
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||||
|
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||||
|
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||||
|
MD_SIMD_FLOAT delx3 = xi3_tmp - xj_tmp;
|
||||||
|
MD_SIMD_FLOAT dely3 = yi3_tmp - yj_tmp;
|
||||||
|
MD_SIMD_FLOAT delz3 = zi3_tmp - zj_tmp;
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||||
|
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, delz1 * delz1));
|
||||||
|
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||||
|
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, delz3 * delz3));
|
||||||
|
|
||||||
|
MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec);
|
||||||
|
MD_SIMD_MASK cutoff_mask1 = simd_mask_cond_lt(rsq1, cutforcesq_vec);
|
||||||
|
MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec);
|
||||||
|
MD_SIMD_MASK cutoff_mask3 = simd_mask_cond_lt(rsq3, cutforcesq_vec);
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
||||||
|
MD_SIMD_FLOAT sr2_1 = simd_reciprocal(rsq1);
|
||||||
|
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||||
|
MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||||
|
MD_SIMD_FLOAT sr6_1 = sr2_1 * sr2_1 * sr2_1 * sigma6_vec;
|
||||||
|
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||||
|
MD_SIMD_FLOAT sr6_3 = sr2_3 * sr2_3 * sr2_3 * sigma6_vec;
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||||
|
MD_SIMD_FLOAT force1 = c48_vec * sr6_1 * (sr6_1 - c05_vec) * sr2_1 * eps_vec;
|
||||||
|
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||||
|
MD_SIMD_FLOAT force3 = c48_vec * sr6_3 * (sr6_3 - c05_vec) * sr2_3 * eps_vec;
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT tx0 = select_by_mask(delx0 * force0, cutoff_mask0);
|
||||||
|
MD_SIMD_FLOAT ty0 = select_by_mask(dely0 * force0, cutoff_mask0);
|
||||||
|
MD_SIMD_FLOAT tz0 = select_by_mask(delz0 * force0, cutoff_mask0);
|
||||||
|
MD_SIMD_FLOAT tx1 = select_by_mask(delx1 * force1, cutoff_mask1);
|
||||||
|
MD_SIMD_FLOAT ty1 = select_by_mask(dely1 * force1, cutoff_mask1);
|
||||||
|
MD_SIMD_FLOAT tz1 = select_by_mask(delz1 * force1, cutoff_mask1);
|
||||||
|
MD_SIMD_FLOAT tx2 = select_by_mask(delx2 * force2, cutoff_mask2);
|
||||||
|
MD_SIMD_FLOAT ty2 = select_by_mask(dely2 * force2, cutoff_mask2);
|
||||||
|
MD_SIMD_FLOAT tz2 = select_by_mask(delz2 * force2, cutoff_mask2);
|
||||||
|
MD_SIMD_FLOAT tx3 = select_by_mask(delx3 * force3, cutoff_mask3);
|
||||||
|
MD_SIMD_FLOAT ty3 = select_by_mask(dely3 * force3, cutoff_mask3);
|
||||||
|
MD_SIMD_FLOAT tz3 = select_by_mask(delz3 * force3, cutoff_mask3);
|
||||||
|
|
||||||
fix0 = simd_add(fix0, tx0);
|
fix0 = simd_add(fix0, tx0);
|
||||||
fiy0 = simd_add(fiy0, ty0);
|
fiy0 = simd_add(fiy0, ty0);
|
||||||
@@ -590,6 +836,8 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
|||||||
}
|
}
|
||||||
|
|
||||||
LIKWID_MARKER_STOP("force");
|
LIKWID_MARKER_STOP("force");
|
||||||
|
}
|
||||||
|
|
||||||
double E = getTimeStamp();
|
double E = getTimeStamp();
|
||||||
DEBUG_MESSAGE("computeForceLJ_4xn end\n");
|
DEBUG_MESSAGE("computeForceLJ_4xn end\n");
|
||||||
return E-S;
|
return E-S;
|
||||||
@@ -598,7 +846,7 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
|||||||
double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||||
DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
|
DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
|
||||||
int Nlocal = atom->Nlocal;
|
int Nlocal = atom->Nlocal;
|
||||||
int* neighs;
|
NeighborCluster* neighs;
|
||||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||||
MD_FLOAT sigma6 = param->sigma6;
|
MD_FLOAT sigma6 = param->sigma6;
|
||||||
MD_FLOAT epsilon = param->epsilon;
|
MD_FLOAT epsilon = param->epsilon;
|
||||||
@@ -607,8 +855,6 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
|
|||||||
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
|
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
|
||||||
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
||||||
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
||||||
double S = getTimeStamp();
|
|
||||||
LIKWID_MARKER_START("force");
|
|
||||||
|
|
||||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||||
@@ -620,7 +866,13 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#pragma omp parallel for
|
double S = getTimeStamp();
|
||||||
|
|
||||||
|
#pragma omp parallel
|
||||||
|
{
|
||||||
|
LIKWID_MARKER_START("force");
|
||||||
|
|
||||||
|
#pragma omp for
|
||||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||||
int ci_cj0 = CJ0_FROM_CI(ci);
|
int ci_cj0 = CJ0_FROM_CI(ci);
|
||||||
#if CLUSTER_M > CLUSTER_N
|
#if CLUSTER_M > CLUSTER_N
|
||||||
@@ -631,6 +883,7 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
|
|||||||
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||||
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
||||||
int numneighs = neighbor->numneigh[ci];
|
int numneighs = neighbor->numneigh[ci];
|
||||||
|
int numneighs_masked = neighbor->numneigh_masked[ci];
|
||||||
|
|
||||||
MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]);
|
MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]);
|
||||||
MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]);
|
MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]);
|
||||||
@@ -657,52 +910,51 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
|
|||||||
MD_SIMD_FLOAT fiy3 = simd_zero();
|
MD_SIMD_FLOAT fiy3 = simd_zero();
|
||||||
MD_SIMD_FLOAT fiz3 = simd_zero();
|
MD_SIMD_FLOAT fiz3 = simd_zero();
|
||||||
|
|
||||||
for(int k = 0; k < numneighs; k++) {
|
for(int k = 0; k < numneighs_masked; k++) {
|
||||||
int cj = neighs[k];
|
int cj = neighs[k].cj;
|
||||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||||
|
int imask = neighs[k].imask;
|
||||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||||
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
|
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
|
||||||
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
|
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
|
||||||
MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
|
MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
|
||||||
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
|
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||||
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
|
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||||
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
|
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||||
MD_SIMD_FLOAT delx1 = simd_sub(xi1_tmp, xj_tmp);
|
MD_SIMD_FLOAT delx1 = xi1_tmp - xj_tmp;
|
||||||
MD_SIMD_FLOAT dely1 = simd_sub(yi1_tmp, yj_tmp);
|
MD_SIMD_FLOAT dely1 = yi1_tmp - yj_tmp;
|
||||||
MD_SIMD_FLOAT delz1 = simd_sub(zi1_tmp, zj_tmp);
|
MD_SIMD_FLOAT delz1 = zi1_tmp - zj_tmp;
|
||||||
MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
|
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||||
MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
|
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||||
MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
|
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||||
MD_SIMD_FLOAT delx3 = simd_sub(xi3_tmp, xj_tmp);
|
MD_SIMD_FLOAT delx3 = xi3_tmp - xj_tmp;
|
||||||
MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp);
|
MD_SIMD_FLOAT dely3 = yi3_tmp - yj_tmp;
|
||||||
MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp);
|
MD_SIMD_FLOAT delz3 = zi3_tmp - zj_tmp;
|
||||||
|
|
||||||
#if CLUSTER_M == CLUSTER_N
|
#if CLUSTER_M == CLUSTER_N
|
||||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
|
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 4 + 0]);
|
||||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0));
|
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 4 + 1]);
|
||||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0));
|
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 4 + 2]);
|
||||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0));
|
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 4 + 3]);
|
||||||
#elif CLUSTER_M < CLUSTER_N
|
#else
|
||||||
|
#if CLUSTER_M < CLUSTER_N
|
||||||
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
|
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
|
||||||
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
|
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
|
||||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1));
|
|
||||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1));
|
|
||||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1));
|
|
||||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1));
|
|
||||||
#else
|
#else
|
||||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||||
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
|
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
|
||||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond0));
|
#endif
|
||||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond0));
|
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0]);
|
||||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond1));
|
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 1]);
|
||||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond1));
|
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 2]);
|
||||||
|
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 3]);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
|
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||||
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, simd_mul(delz1, delz1)));
|
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, delz1 * delz1));
|
||||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
|
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||||
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, simd_mul(delz3, delz3)));
|
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, delz3 * delz3));
|
||||||
|
|
||||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
|
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
|
||||||
MD_SIMD_MASK cutoff_mask1 = simd_mask_and(excl_mask1, simd_mask_cond_lt(rsq1, cutforcesq_vec));
|
MD_SIMD_MASK cutoff_mask1 = simd_mask_and(excl_mask1, simd_mask_cond_lt(rsq1, cutforcesq_vec));
|
||||||
@@ -714,28 +966,88 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
|
|||||||
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||||
MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
|
MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
|
||||||
|
|
||||||
MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
|
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||||
MD_SIMD_FLOAT sr6_1 = simd_mul(sr2_1, simd_mul(sr2_1, simd_mul(sr2_1, sigma6_vec)));
|
MD_SIMD_FLOAT sr6_1 = sr2_1 * sr2_1 * sr2_1 * sigma6_vec;
|
||||||
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
|
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||||
MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec)));
|
MD_SIMD_FLOAT sr6_3 = sr2_3 * sr2_3 * sr2_3 * sigma6_vec;
|
||||||
|
|
||||||
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
|
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||||
MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, eps_vec))));
|
MD_SIMD_FLOAT force1 = c48_vec * sr6_1 * (sr6_1 - c05_vec) * sr2_1 * eps_vec;
|
||||||
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
|
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||||
MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, eps_vec))));
|
MD_SIMD_FLOAT force3 = c48_vec * sr6_3 * (sr6_3 - c05_vec) * sr2_3 * eps_vec;
|
||||||
|
|
||||||
fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
|
fix0 = simd_masked_add(fix0, delx0 * force0, cutoff_mask0);
|
||||||
fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
|
fiy0 = simd_masked_add(fiy0, dely0 * force0, cutoff_mask0);
|
||||||
fiz0 = simd_masked_add(fiz0, simd_mul(delz0, force0), cutoff_mask0);
|
fiz0 = simd_masked_add(fiz0, delz0 * force0, cutoff_mask0);
|
||||||
fix1 = simd_masked_add(fix1, simd_mul(delx1, force1), cutoff_mask1);
|
fix1 = simd_masked_add(fix1, delx1 * force1, cutoff_mask1);
|
||||||
fiy1 = simd_masked_add(fiy1, simd_mul(dely1, force1), cutoff_mask1);
|
fiy1 = simd_masked_add(fiy1, dely1 * force1, cutoff_mask1);
|
||||||
fiz1 = simd_masked_add(fiz1, simd_mul(delz1, force1), cutoff_mask1);
|
fiz1 = simd_masked_add(fiz1, delz1 * force1, cutoff_mask1);
|
||||||
fix2 = simd_masked_add(fix2, simd_mul(delx2, force2), cutoff_mask2);
|
fix2 = simd_masked_add(fix2, delx2 * force2, cutoff_mask2);
|
||||||
fiy2 = simd_masked_add(fiy2, simd_mul(dely2, force2), cutoff_mask2);
|
fiy2 = simd_masked_add(fiy2, dely2 * force2, cutoff_mask2);
|
||||||
fiz2 = simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2);
|
fiz2 = simd_masked_add(fiz2, delz2 * force2, cutoff_mask2);
|
||||||
fix3 = simd_masked_add(fix3, simd_mul(delx3, force3), cutoff_mask3);
|
fix3 = simd_masked_add(fix3, delx3 * force3, cutoff_mask3);
|
||||||
fiy3 = simd_masked_add(fiy3, simd_mul(dely3, force3), cutoff_mask3);
|
fiy3 = simd_masked_add(fiy3, dely3 * force3, cutoff_mask3);
|
||||||
fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3);
|
fiz3 = simd_masked_add(fiz3, delz3 * force3, cutoff_mask3);
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int k = numneighs_masked; k < numneighs; k++) {
|
||||||
|
int cj = neighs[k].cj;
|
||||||
|
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||||
|
int imask = neighs[k].imask;
|
||||||
|
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||||
|
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
|
||||||
|
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
|
||||||
|
MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
|
||||||
|
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||||
|
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||||
|
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||||
|
MD_SIMD_FLOAT delx1 = xi1_tmp - xj_tmp;
|
||||||
|
MD_SIMD_FLOAT dely1 = yi1_tmp - yj_tmp;
|
||||||
|
MD_SIMD_FLOAT delz1 = zi1_tmp - zj_tmp;
|
||||||
|
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||||
|
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||||
|
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||||
|
MD_SIMD_FLOAT delx3 = xi3_tmp - xj_tmp;
|
||||||
|
MD_SIMD_FLOAT dely3 = yi3_tmp - yj_tmp;
|
||||||
|
MD_SIMD_FLOAT delz3 = zi3_tmp - zj_tmp;
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||||
|
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, delz1 * delz1));
|
||||||
|
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||||
|
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, delz3 * delz3));
|
||||||
|
|
||||||
|
MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec);
|
||||||
|
MD_SIMD_MASK cutoff_mask1 = simd_mask_cond_lt(rsq1, cutforcesq_vec);
|
||||||
|
MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec);
|
||||||
|
MD_SIMD_MASK cutoff_mask3 = simd_mask_cond_lt(rsq3, cutforcesq_vec);
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
||||||
|
MD_SIMD_FLOAT sr2_1 = simd_reciprocal(rsq1);
|
||||||
|
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||||
|
MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||||
|
MD_SIMD_FLOAT sr6_1 = sr2_1 * sr2_1 * sr2_1 * sigma6_vec;
|
||||||
|
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||||
|
MD_SIMD_FLOAT sr6_3 = sr2_3 * sr2_3 * sr2_3 * sigma6_vec;
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||||
|
MD_SIMD_FLOAT force1 = c48_vec * sr6_1 * (sr6_1 - c05_vec) * sr2_1 * eps_vec;
|
||||||
|
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||||
|
MD_SIMD_FLOAT force3 = c48_vec * sr6_3 * (sr6_3 - c05_vec) * sr2_3 * eps_vec;
|
||||||
|
|
||||||
|
fix0 = simd_masked_add(fix0, delx0 * force0, cutoff_mask0);
|
||||||
|
fiy0 = simd_masked_add(fiy0, dely0 * force0, cutoff_mask0);
|
||||||
|
fiz0 = simd_masked_add(fiz0, delz0 * force0, cutoff_mask0);
|
||||||
|
fix1 = simd_masked_add(fix1, delx1 * force1, cutoff_mask1);
|
||||||
|
fiy1 = simd_masked_add(fiy1, dely1 * force1, cutoff_mask1);
|
||||||
|
fiz1 = simd_masked_add(fiz1, delz1 * force1, cutoff_mask1);
|
||||||
|
fix2 = simd_masked_add(fix2, delx2 * force2, cutoff_mask2);
|
||||||
|
fiy2 = simd_masked_add(fiy2, dely2 * force2, cutoff_mask2);
|
||||||
|
fiz2 = simd_masked_add(fiz2, delz2 * force2, cutoff_mask2);
|
||||||
|
fix3 = simd_masked_add(fix3, delx3 * force3, cutoff_mask3);
|
||||||
|
fiy3 = simd_masked_add(fiy3, dely3 * force3, cutoff_mask3);
|
||||||
|
fiz3 = simd_masked_add(fiz3, delz3 * force3, cutoff_mask3);
|
||||||
}
|
}
|
||||||
|
|
||||||
simd_incr_reduced_sum(&ci_f[CL_X_OFFSET], fix0, fix1, fix2, fix3);
|
simd_incr_reduced_sum(&ci_f[CL_X_OFFSET], fix0, fix1, fix2, fix3);
|
||||||
@@ -744,10 +1056,13 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
|
|||||||
|
|
||||||
addStat(stats->calculated_forces, 1);
|
addStat(stats->calculated_forces, 1);
|
||||||
addStat(stats->num_neighs, numneighs);
|
addStat(stats->num_neighs, numneighs);
|
||||||
addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
|
addStat(stats->force_iters, (long long int)((double)numneighs));
|
||||||
|
//addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
|
||||||
}
|
}
|
||||||
|
|
||||||
LIKWID_MARKER_STOP("force");
|
LIKWID_MARKER_STOP("force");
|
||||||
|
}
|
||||||
|
|
||||||
double E = getTimeStamp();
|
double E = getTimeStamp();
|
||||||
DEBUG_MESSAGE("computeForceLJ_4xn end\n");
|
DEBUG_MESSAGE("computeForceLJ_4xn end\n");
|
||||||
return E-S;
|
return E-S;
|
||||||
|
@@ -22,25 +22,8 @@
|
|||||||
# define KERNEL_NAME "CUDA"
|
# define KERNEL_NAME "CUDA"
|
||||||
# define CLUSTER_M 8
|
# define CLUSTER_M 8
|
||||||
# define CLUSTER_N VECTOR_WIDTH
|
# define CLUSTER_N VECTOR_WIDTH
|
||||||
|
# define UNROLL_J 1
|
||||||
#ifdef USE_SUPER_CLUSTERS
|
|
||||||
# define XX 0
|
|
||||||
# define YY 1
|
|
||||||
# define ZZ 2
|
|
||||||
# define SCLUSTER_SIZE_X 2
|
|
||||||
# define SCLUSTER_SIZE_Y 2
|
|
||||||
# define SCLUSTER_SIZE_Z 2
|
|
||||||
# define SCLUSTER_SIZE (SCLUSTER_SIZE_X * SCLUSTER_SIZE_Y * SCLUSTER_SIZE_Z)
|
|
||||||
# define DIM_COORD(dim,coord) ((dim == XX) ? atom_x(coord) : ((dim == YY) ? atom_y(coord) : atom_z(coord)))
|
|
||||||
# define MIN(a,b) ({int _a = (a), _b = (b); _a < _b ? _a : _b; })
|
|
||||||
# define SCLUSTER_M CLUSTER_M * SCLUSTER_SIZE
|
|
||||||
|
|
||||||
# define computeForceLJ computeForceLJSup_cuda
|
|
||||||
#else
|
|
||||||
# define computeForceLJ computeForceLJ_cuda
|
# define computeForceLJ computeForceLJ_cuda
|
||||||
|
|
||||||
#endif //USE_SUPER_CLUSTERS
|
|
||||||
|
|
||||||
# define initialIntegrate cudaInitialIntegrate
|
# define initialIntegrate cudaInitialIntegrate
|
||||||
# define finalIntegrate cudaFinalIntegrate
|
# define finalIntegrate cudaFinalIntegrate
|
||||||
# define updatePbc cudaUpdatePbc
|
# define updatePbc cudaUpdatePbc
|
||||||
@@ -50,11 +33,15 @@
|
|||||||
# if VECTOR_WIDTH > CLUSTER_M * 2
|
# if VECTOR_WIDTH > CLUSTER_M * 2
|
||||||
# define KERNEL_NAME "Simd2xNN"
|
# define KERNEL_NAME "Simd2xNN"
|
||||||
# define CLUSTER_N (VECTOR_WIDTH / 2)
|
# define CLUSTER_N (VECTOR_WIDTH / 2)
|
||||||
|
# define UNROLL_I 4
|
||||||
|
# define UNROLL_J 2
|
||||||
# define computeForceLJ computeForceLJ_2xnn
|
# define computeForceLJ computeForceLJ_2xnn
|
||||||
// Simd4xN
|
// Simd4xN
|
||||||
# else
|
# else
|
||||||
# define KERNEL_NAME "Simd4xN"
|
# define KERNEL_NAME "Simd4xN"
|
||||||
# define CLUSTER_N VECTOR_WIDTH
|
# define CLUSTER_N VECTOR_WIDTH
|
||||||
|
# define UNROLL_I 4
|
||||||
|
# define UNROLL_J 1
|
||||||
# define computeForceLJ computeForceLJ_4xn
|
# define computeForceLJ computeForceLJ_4xn
|
||||||
# endif
|
# endif
|
||||||
# ifdef USE_REFERENCE_VERSION
|
# ifdef USE_REFERENCE_VERSION
|
||||||
@@ -73,29 +60,16 @@
|
|||||||
# define CJ1_FROM_CI(a) (a)
|
# define CJ1_FROM_CI(a) (a)
|
||||||
# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
|
# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
|
||||||
# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
|
# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
|
||||||
#ifdef USE_SUPER_CLUSTERS
|
|
||||||
# define CJ1_FROM_SCI(a) (a)
|
|
||||||
# define SCI_BASE_INDEX(a,b) ((a) * CLUSTER_N * SCLUSTER_SIZE * (b))
|
|
||||||
# define SCJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * SCLUSTER_SIZE * (b))
|
|
||||||
#endif //USE_SUPER_CLUSTERS
|
|
||||||
#elif CLUSTER_M == CLUSTER_N * 2 // M > N
|
#elif CLUSTER_M == CLUSTER_N * 2 // M > N
|
||||||
# define CJ0_FROM_CI(a) ((a) << 1)
|
# define CJ0_FROM_CI(a) ((a) << 1)
|
||||||
# define CJ1_FROM_CI(a) (((a) << 1) | 0x1)
|
# define CJ1_FROM_CI(a) (((a) << 1) | 0x1)
|
||||||
# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_M * (b))
|
# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_M * (b))
|
||||||
# define CJ_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_M * (b) + ((a) & 0x1) * (CLUSTER_M >> 1))
|
# define CJ_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_M * (b) + ((a) & 0x1) * (CLUSTER_M >> 1))
|
||||||
#ifdef USE_SUPER_CLUSTERS
|
|
||||||
# define SCI_BASE_INDEX(a,b) ((a) * CLUSTER_M * SCLUSTER_SIZE * (b))
|
|
||||||
# define SCJ_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_M * SCLUSTER_SIZE * (b) + ((a) & 0x1) * (SCLUSTER_SIZE * CLUSTER_M >> 1))
|
|
||||||
#endif //USE_SUPER_CLUSTERS
|
|
||||||
#elif CLUSTER_M == CLUSTER_N / 2 // M < N
|
#elif CLUSTER_M == CLUSTER_N / 2 // M < N
|
||||||
# define CJ0_FROM_CI(a) ((a) >> 1)
|
# define CJ0_FROM_CI(a) ((a) >> 1)
|
||||||
# define CJ1_FROM_CI(a) ((a) >> 1)
|
# define CJ1_FROM_CI(a) ((a) >> 1)
|
||||||
# define CI_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_N * (b) + ((a) & 0x1) * (CLUSTER_N >> 1))
|
# define CI_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_N * (b) + ((a) & 0x1) * (CLUSTER_N >> 1))
|
||||||
# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
|
# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
|
||||||
#ifdef USE_SUPER_CLUSTERS
|
|
||||||
# define SCI_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_N * SCLUSTER_SIZE * (b) + ((a) & 0x1) * (CLUSTER_N * SCLUSTER_SIZE >> 1))
|
|
||||||
# define SCJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * SCLUSTER_SIZE * (b))
|
|
||||||
#endif //USE_SUPER_CLUSTERS
|
|
||||||
#else
|
#else
|
||||||
# error "Invalid cluster configuration!"
|
# error "Invalid cluster configuration!"
|
||||||
#endif
|
#endif
|
||||||
@@ -109,37 +83,14 @@
|
|||||||
#define CJ_SCALAR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 1))
|
#define CJ_SCALAR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 1))
|
||||||
#define CJ_VECTOR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 3))
|
#define CJ_VECTOR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 3))
|
||||||
|
|
||||||
#ifdef USE_SUPER_CLUSTERS
|
|
||||||
#define SCI_SCALAR_BASE_INDEX(a) (SCI_BASE_INDEX(a, 1))
|
|
||||||
#define SCI_VECTOR_BASE_INDEX(a) (SCI_BASE_INDEX(a, 3))
|
|
||||||
#define SCJ_SCALAR_BASE_INDEX(a) (SCJ_BASE_INDEX(a, 1))
|
|
||||||
#define SCJ_VECTOR_BASE_INDEX(a) (SCJ_BASE_INDEX(a, 3))
|
|
||||||
#endif //USE_SUPER_CLUSTERS
|
|
||||||
|
|
||||||
#if CLUSTER_M >= CLUSTER_N
|
#if CLUSTER_M >= CLUSTER_N
|
||||||
# define CL_X_OFFSET (0 * CLUSTER_M)
|
# define CL_X_OFFSET (0 * CLUSTER_M)
|
||||||
# define CL_Y_OFFSET (1 * CLUSTER_M)
|
# define CL_Y_OFFSET (1 * CLUSTER_M)
|
||||||
# define CL_Z_OFFSET (2 * CLUSTER_M)
|
# define CL_Z_OFFSET (2 * CLUSTER_M)
|
||||||
|
|
||||||
#ifdef USE_SUPER_CLUSTERS
|
|
||||||
# define SCL_CL_X_OFFSET(ci) (ci * CLUSTER_M + 0 * SCLUSTER_M)
|
|
||||||
# define SCL_CL_Y_OFFSET(ci) (ci * CLUSTER_M + 1 * SCLUSTER_M)
|
|
||||||
# define SCL_CL_Z_OFFSET(ci) (ci * CLUSTER_M + 2 * SCLUSTER_M)
|
|
||||||
|
|
||||||
# define SCL_X_OFFSET (0 * SCLUSTER_M)
|
|
||||||
# define SCL_Y_OFFSET (1 * SCLUSTER_M)
|
|
||||||
# define SCL_Z_OFFSET (2 * SCLUSTER_M)
|
|
||||||
#endif //USE_SUPER_CLUSTERS
|
|
||||||
#else
|
#else
|
||||||
# define CL_X_OFFSET (0 * CLUSTER_N)
|
# define CL_X_OFFSET (0 * CLUSTER_N)
|
||||||
# define CL_Y_OFFSET (1 * CLUSTER_N)
|
# define CL_Y_OFFSET (1 * CLUSTER_N)
|
||||||
# define CL_Z_OFFSET (2 * CLUSTER_N)
|
# define CL_Z_OFFSET (2 * CLUSTER_N)
|
||||||
|
|
||||||
#ifdef USE_SUPER_CLUSTERS
|
|
||||||
# define SCL_X_OFFSET (0 * SCLUSTER_SIZE * CLUSTER_N)
|
|
||||||
# define SCL_Y_OFFSET (1 * SCLUSTER_SIZE * CLUSTER_N)
|
|
||||||
# define SCL_Z_OFFSET (2 * SCLUSTER_SIZE * CLUSTER_N)
|
|
||||||
#endif //USE_SUPER_CLUSTERS
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
@@ -149,13 +100,6 @@ typedef struct {
|
|||||||
MD_FLOAT bbminz, bbmaxz;
|
MD_FLOAT bbminz, bbmaxz;
|
||||||
} Cluster;
|
} Cluster;
|
||||||
|
|
||||||
typedef struct {
|
|
||||||
int nclusters;
|
|
||||||
MD_FLOAT bbminx, bbmaxx;
|
|
||||||
MD_FLOAT bbminy, bbmaxy;
|
|
||||||
MD_FLOAT bbminz, bbmaxz;
|
|
||||||
} SuperCluster;
|
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int Natoms, Nlocal, Nghost, Nmax;
|
int Natoms, Nlocal, Nghost, Nmax;
|
||||||
int Nclusters, Nclusters_local, Nclusters_ghost, Nclusters_max;
|
int Nclusters, Nclusters_local, Nclusters_ghost, Nclusters_max;
|
||||||
@@ -177,20 +121,17 @@ typedef struct {
|
|||||||
Cluster *iclusters, *jclusters;
|
Cluster *iclusters, *jclusters;
|
||||||
int *icluster_bin;
|
int *icluster_bin;
|
||||||
int dummy_cj;
|
int dummy_cj;
|
||||||
|
MD_UINT *exclusion_filter;
|
||||||
#ifdef USE_SUPER_CLUSTERS
|
MD_FLOAT *diagonal_4xn_j_minus_i;
|
||||||
int Nsclusters, Nsclusters_local, Nsclusters_ghost, Nsclusters_max;
|
MD_FLOAT *diagonal_2xnn_j_minus_i;
|
||||||
MD_FLOAT *scl_x;
|
unsigned int masks_2xnn_hn[8];
|
||||||
MD_FLOAT *scl_v;
|
unsigned int masks_2xnn_fn[8];
|
||||||
MD_FLOAT *scl_f;
|
unsigned int masks_4xn_hn[16];
|
||||||
int *scl_type;
|
unsigned int masks_4xn_fn[16];
|
||||||
int *icluster_idx;
|
|
||||||
SuperCluster *siclusters;
|
|
||||||
int *sicluster_bin;
|
|
||||||
#endif //USE_SUPER_CLUSTERS
|
|
||||||
} Atom;
|
} Atom;
|
||||||
|
|
||||||
extern void initAtom(Atom*);
|
extern void initAtom(Atom*);
|
||||||
|
extern void initMasks(Atom*);
|
||||||
extern void createAtom(Atom*, Parameter*);
|
extern void createAtom(Atom*, Parameter*);
|
||||||
extern int readAtom(Atom*, Parameter*);
|
extern int readAtom(Atom*, Parameter*);
|
||||||
extern int readAtom_pdb(Atom*, Parameter*);
|
extern int readAtom_pdb(Atom*, Parameter*);
|
||||||
@@ -198,7 +139,6 @@ extern int readAtom_gro(Atom*, Parameter*);
|
|||||||
extern int readAtom_dmp(Atom*, Parameter*);
|
extern int readAtom_dmp(Atom*, Parameter*);
|
||||||
extern void growAtom(Atom*);
|
extern void growAtom(Atom*);
|
||||||
extern void growClusters(Atom*);
|
extern void growClusters(Atom*);
|
||||||
extern void growSuperClusters(Atom*);
|
|
||||||
|
|
||||||
#ifdef AOS
|
#ifdef AOS
|
||||||
# define POS_DATA_LAYOUT "AoS"
|
# define POS_DATA_LAYOUT "AoS"
|
||||||
|
@@ -9,13 +9,35 @@
|
|||||||
|
|
||||||
#ifndef __NEIGHBOR_H_
|
#ifndef __NEIGHBOR_H_
|
||||||
#define __NEIGHBOR_H_
|
#define __NEIGHBOR_H_
|
||||||
|
// Interaction masks from GROMACS, things to remember (maybe these confused just me):
|
||||||
|
// 1. These are not "exclusion" masks as the name suggests in GROMACS, but rather
|
||||||
|
// interaction masks (1 = interaction, 0 = no interaction)
|
||||||
|
// 2. These are inverted (maybe because that is how you use in AVX2/AVX512 masking),
|
||||||
|
// so read them from right to left (least significant to most significant bit)
|
||||||
|
// All interaction mask is the same for all kernels
|
||||||
|
#define NBNXN_INTERACTION_MASK_ALL 0xffffffffU
|
||||||
|
// 4x4 kernel diagonal mask
|
||||||
|
#define NBNXN_INTERACTION_MASK_DIAG 0x08ceU
|
||||||
|
// 4x2 kernel diagonal masks
|
||||||
|
#define NBNXN_INTERACTION_MASK_DIAG_J2_0 0x0002U
|
||||||
|
#define NBNXN_INTERACTION_MASK_DIAG_J2_1 0x002fU
|
||||||
|
// 4x8 kernel diagonal masks
|
||||||
|
#define NBNXN_INTERACTION_MASK_DIAG_J8_0 0xf0f8fcfeU
|
||||||
|
#define NBNXN_INTERACTION_MASK_DIAG_J8_1 0x0080c0e0U
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
int cj;
|
||||||
|
unsigned int imask;
|
||||||
|
} NeighborCluster;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int every;
|
int every;
|
||||||
int ncalls;
|
int ncalls;
|
||||||
int* neighbors;
|
|
||||||
int maxneighs;
|
int maxneighs;
|
||||||
int* numneigh;
|
int* numneigh;
|
||||||
|
int* numneigh_masked;
|
||||||
int half_neigh;
|
int half_neigh;
|
||||||
|
NeighborCluster* neighbors;
|
||||||
} Neighbor;
|
} Neighbor;
|
||||||
|
|
||||||
extern void initNeighbor(Neighbor*, Parameter*);
|
extern void initNeighbor(Neighbor*, Parameter*);
|
||||||
@@ -25,7 +47,6 @@ extern void buildNeighbor(Atom*, Neighbor*);
|
|||||||
extern void pruneNeighbor(Parameter*, Atom*, Neighbor*);
|
extern void pruneNeighbor(Parameter*, Atom*, Neighbor*);
|
||||||
extern void sortAtom(Atom*);
|
extern void sortAtom(Atom*);
|
||||||
extern void buildClusters(Atom*);
|
extern void buildClusters(Atom*);
|
||||||
extern void buildClustersGPU(Atom*);
|
|
||||||
extern void defineJClusters(Atom*);
|
extern void defineJClusters(Atom*);
|
||||||
extern void binClusters(Atom*);
|
extern void binClusters(Atom*);
|
||||||
extern void updateSingleAtoms(Atom*);
|
extern void updateSingleAtoms(Atom*);
|
||||||
|
@@ -16,8 +16,5 @@ extern void setupPbc(Atom*, Parameter*);
|
|||||||
|
|
||||||
#ifdef CUDA_TARGET
|
#ifdef CUDA_TARGET
|
||||||
extern void cudaUpdatePbc(Atom*, Parameter*, int);
|
extern void cudaUpdatePbc(Atom*, Parameter*, int);
|
||||||
#if defined(USE_SUPER_CLUSTERS)
|
|
||||||
extern void setupPbcGPU(Atom*, Parameter*);
|
|
||||||
#endif //defined(USE_SUPER_CLUSTERS)
|
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
@@ -1,19 +0,0 @@
|
|||||||
/*
|
|
||||||
* Temporal functions for debugging, remove before proceeding with pull request
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef MD_BENCH_UTILS_H
|
|
||||||
#define MD_BENCH_UTILS_H
|
|
||||||
|
|
||||||
#include <atom.h>
|
|
||||||
#include <neighbor.h>
|
|
||||||
|
|
||||||
#ifdef USE_SUPER_CLUSTERS
|
|
||||||
void verifyClusters(Atom *atom);
|
|
||||||
void verifyLayout(Atom *atom);
|
|
||||||
void checkAlignment(Atom *atom);
|
|
||||||
void showSuperclusters(Atom *atom);
|
|
||||||
void printNeighs(Atom *atom, Neighbor *neighbor);
|
|
||||||
#endif //USE_SUPER_CLUSTERS
|
|
||||||
|
|
||||||
#endif //MD_BENCH_UTILS_H
|
|
@@ -9,7 +9,6 @@
|
|||||||
#ifndef __VTK_H_
|
#ifndef __VTK_H_
|
||||||
#define __VTK_H_
|
#define __VTK_H_
|
||||||
extern void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep);
|
extern void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep);
|
||||||
extern int write_super_clusters_to_vtk_file(const char* filename, Atom* atom, int timestep);
|
|
||||||
extern int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
|
extern int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
|
||||||
extern int write_ghost_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
|
extern int write_ghost_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
|
||||||
extern int write_local_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep);
|
extern int write_local_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep);
|
||||||
|
@@ -38,16 +38,7 @@ extern double computeForceLJ_cuda(Parameter *param, Atom *atom, Neighbor *neighb
|
|||||||
extern void copyDataToCUDADevice(Atom *atom);
|
extern void copyDataToCUDADevice(Atom *atom);
|
||||||
extern void copyDataFromCUDADevice(Atom *atom);
|
extern void copyDataFromCUDADevice(Atom *atom);
|
||||||
extern void cudaDeviceFree();
|
extern void cudaDeviceFree();
|
||||||
|
#endif
|
||||||
#ifdef USE_SUPER_CLUSTERS
|
|
||||||
#include <utils.h>
|
|
||||||
extern void buildNeighborGPU(Atom *atom, Neighbor *neighbor);
|
|
||||||
extern void pruneNeighborGPU(Parameter *param, Atom *atom, Neighbor *neighbor);
|
|
||||||
extern void alignDataToSuperclusters(Atom *atom);
|
|
||||||
extern void alignDataFromSuperclusters(Atom *atom);
|
|
||||||
extern double computeForceLJSup_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats);
|
|
||||||
#endif //USE_SUPER_CLUSTERS
|
|
||||||
#endif //CUDA_TARGET
|
|
||||||
|
|
||||||
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||||
if(param->force_field == FF_EAM) { initEam(eam, param); }
|
if(param->force_field == FF_EAM) { initEam(eam, param); }
|
||||||
@@ -71,24 +62,11 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *
|
|||||||
setupNeighbor(param, atom);
|
setupNeighbor(param, atom);
|
||||||
setupThermo(param, atom->Natoms);
|
setupThermo(param, atom->Natoms);
|
||||||
if(param->input_file == NULL) { adjustThermo(param, atom); }
|
if(param->input_file == NULL) { adjustThermo(param, atom); }
|
||||||
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
|
|
||||||
buildClustersGPU(atom);
|
|
||||||
#else
|
|
||||||
buildClusters(atom);
|
buildClusters(atom);
|
||||||
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
|
|
||||||
defineJClusters(atom);
|
defineJClusters(atom);
|
||||||
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
|
|
||||||
setupPbcGPU(atom, param);
|
|
||||||
//setupPbc(atom, param);
|
|
||||||
#else
|
|
||||||
setupPbc(atom, param);
|
setupPbc(atom, param);
|
||||||
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
|
|
||||||
binClusters(atom);
|
binClusters(atom);
|
||||||
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
|
|
||||||
buildNeighborGPU(atom, neighbor);
|
|
||||||
#else
|
|
||||||
buildNeighbor(atom, neighbor);
|
buildNeighbor(atom, neighbor);
|
||||||
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
|
|
||||||
initDevice(atom, neighbor);
|
initDevice(atom, neighbor);
|
||||||
E = getTimeStamp();
|
E = getTimeStamp();
|
||||||
return E-S;
|
return E-S;
|
||||||
@@ -100,24 +78,11 @@ double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
|||||||
LIKWID_MARKER_START("reneighbour");
|
LIKWID_MARKER_START("reneighbour");
|
||||||
updateSingleAtoms(atom);
|
updateSingleAtoms(atom);
|
||||||
updateAtomsPbc(atom, param);
|
updateAtomsPbc(atom, param);
|
||||||
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
|
|
||||||
buildClustersGPU(atom);
|
|
||||||
#else
|
|
||||||
buildClusters(atom);
|
buildClusters(atom);
|
||||||
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
|
|
||||||
defineJClusters(atom);
|
defineJClusters(atom);
|
||||||
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
|
|
||||||
//setupPbcGPU(atom, param);
|
|
||||||
setupPbc(atom, param);
|
setupPbc(atom, param);
|
||||||
#else
|
|
||||||
setupPbc(atom, param);
|
|
||||||
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
|
|
||||||
binClusters(atom);
|
binClusters(atom);
|
||||||
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
|
|
||||||
buildNeighborGPU(atom, neighbor);
|
|
||||||
#else
|
|
||||||
buildNeighbor(atom, neighbor);
|
buildNeighbor(atom, neighbor);
|
||||||
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
|
|
||||||
LIKWID_MARKER_STOP("reneighbour");
|
LIKWID_MARKER_STOP("reneighbour");
|
||||||
E = getTimeStamp();
|
E = getTimeStamp();
|
||||||
return E-S;
|
return E-S;
|
||||||
@@ -244,8 +209,6 @@ int main(int argc, char** argv) {
|
|||||||
printParameter(&param);
|
printParameter(&param);
|
||||||
printf(HLINE);
|
printf(HLINE);
|
||||||
|
|
||||||
//verifyNeigh(&atom, &neighbor);
|
|
||||||
|
|
||||||
printf("step\ttemp\t\tpressure\n");
|
printf("step\ttemp\t\tpressure\n");
|
||||||
computeThermo(0, &param, &atom);
|
computeThermo(0, &param, &atom);
|
||||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
|
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
|
||||||
@@ -274,24 +237,14 @@ int main(int argc, char** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for(int n = 0; n < param.ntimes; n++) {
|
for(int n = 0; n < param.ntimes; n++) {
|
||||||
|
|
||||||
//printf("Step:\t%d\r\n", n);
|
|
||||||
|
|
||||||
initialIntegrate(&param, &atom);
|
initialIntegrate(&param, &atom);
|
||||||
|
|
||||||
if((n + 1) % param.reneigh_every) {
|
if((n + 1) % param.reneigh_every) {
|
||||||
if(!((n + 1) % param.prune_every)) {
|
if(!((n + 1) % param.prune_every)) {
|
||||||
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
|
|
||||||
pruneNeighborGPU(&param, &atom, &neighbor);
|
|
||||||
#else
|
|
||||||
pruneNeighbor(&param, &atom, &neighbor);
|
pruneNeighbor(&param, &atom, &neighbor);
|
||||||
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
copyDataFromCUDADevice(&atom);
|
|
||||||
updatePbc(&atom, &param, 0);
|
updatePbc(&atom, &param, 0);
|
||||||
copyDataToCUDADevice(&atom);
|
|
||||||
} else {
|
} else {
|
||||||
#ifdef CUDA_TARGET
|
#ifdef CUDA_TARGET
|
||||||
copyDataFromCUDADevice(&atom);
|
copyDataFromCUDADevice(&atom);
|
||||||
@@ -309,34 +262,14 @@ int main(int argc, char** argv) {
|
|||||||
traceAddresses(&param, &atom, &neighbor, n + 1);
|
traceAddresses(&param, &atom, &neighbor, n + 1);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
printf("%d\t%d\r\n", atom.Nsclusters_local, atom.Nclusters_local);
|
|
||||||
copyDataToCUDADevice(&atom);
|
|
||||||
verifyLayout(&atom);
|
|
||||||
|
|
||||||
//printClusterIndices(&atom);
|
|
||||||
|
|
||||||
*/
|
|
||||||
|
|
||||||
if(param.force_field == FF_EAM) {
|
if(param.force_field == FF_EAM) {
|
||||||
timer[FORCE] += computeForceEam(&eam, &param, &atom, &neighbor, &stats);
|
timer[FORCE] += computeForceEam(&eam, &param, &atom, &neighbor, &stats);
|
||||||
} else {
|
} else {
|
||||||
timer[FORCE] += computeForceLJ(&param, &atom, &neighbor, &stats);
|
timer[FORCE] += computeForceLJ(&param, &atom, &neighbor, &stats);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
copyDataFromCUDADevice(&atom);
|
|
||||||
verifyLayout(&atom);
|
|
||||||
|
|
||||||
getchar();
|
|
||||||
*/
|
|
||||||
|
|
||||||
finalIntegrate(&param, &atom);
|
finalIntegrate(&param, &atom);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
|
if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
|
||||||
computeThermo(n + 1, &param, &atom);
|
computeThermo(n + 1, &param, &atom);
|
||||||
}
|
}
|
||||||
|
@@ -56,6 +56,7 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) {
|
|||||||
neighbor->half_neigh = param->half_neigh;
|
neighbor->half_neigh = param->half_neigh;
|
||||||
neighbor->maxneighs = 100;
|
neighbor->maxneighs = 100;
|
||||||
neighbor->numneigh = NULL;
|
neighbor->numneigh = NULL;
|
||||||
|
neighbor->numneigh_masked = NULL;
|
||||||
neighbor->neighbors = NULL;
|
neighbor->neighbors = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -77,13 +78,8 @@ void setupNeighbor(Parameter *param, Atom *atom) {
|
|||||||
|
|
||||||
MD_FLOAT atom_density = ((MD_FLOAT)(atom->Nlocal)) / ((xhi - xlo) * (yhi - ylo) * (zhi - zlo));
|
MD_FLOAT atom_density = ((MD_FLOAT)(atom->Nlocal)) / ((xhi - xlo) * (yhi - ylo) * (zhi - zlo));
|
||||||
MD_FLOAT atoms_in_cell = MAX(CLUSTER_M, CLUSTER_N);
|
MD_FLOAT atoms_in_cell = MAX(CLUSTER_M, CLUSTER_N);
|
||||||
#ifdef USE_SUPER_CLUSTERS
|
|
||||||
MD_FLOAT targetsizex = cbrt(atoms_in_cell / atom_density) * (MD_FLOAT)SCLUSTER_SIZE_X;
|
|
||||||
MD_FLOAT targetsizey = cbrt(atoms_in_cell / atom_density) * (MD_FLOAT)SCLUSTER_SIZE_Y;
|
|
||||||
#else
|
|
||||||
MD_FLOAT targetsizex = cbrt(atoms_in_cell / atom_density);
|
MD_FLOAT targetsizex = cbrt(atoms_in_cell / atom_density);
|
||||||
MD_FLOAT targetsizey = cbrt(atoms_in_cell / atom_density);
|
MD_FLOAT targetsizey = cbrt(atoms_in_cell / atom_density);
|
||||||
#endif
|
|
||||||
nbinx = MAX(1, (int)ceil((xhi - xlo) / targetsizex));
|
nbinx = MAX(1, (int)ceil((xhi - xlo) / targetsizex));
|
||||||
nbiny = MAX(1, (int)ceil((yhi - ylo) / targetsizey));
|
nbiny = MAX(1, (int)ceil((yhi - ylo) / targetsizey));
|
||||||
binsizex = (xhi - xlo) / nbinx;
|
binsizex = (xhi - xlo) / nbinx;
|
||||||
@@ -189,30 +185,43 @@ int atomDistanceInRange(Atom *atom, int ci, int cj, MD_FLOAT rsq) {
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int atomDistanceInRangeGPU(Atom *atom, int sci, int cj, MD_FLOAT rsq) {
|
/* Returns a diagonal or off-diagonal interaction mask for plain C lists */
|
||||||
for (int ci = 0; ci < atom->siclusters[sci].nclusters; ci++) {
|
static unsigned int get_imask(int rdiag, int ci, int cj) {
|
||||||
const int icluster_idx = atom->icluster_idx[SCLUSTER_SIZE * sci + ci];
|
return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
|
||||||
|
|
||||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(icluster_idx);
|
|
||||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
|
||||||
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
|
||||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
|
||||||
|
|
||||||
for(int cii = 0; cii < atom->iclusters[icluster_idx].natoms; cii++) {
|
|
||||||
for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
|
|
||||||
MD_FLOAT delx = ci_x[CL_X_OFFSET + cii] - cj_x[CL_X_OFFSET + cjj];
|
|
||||||
MD_FLOAT dely = ci_x[CL_Y_OFFSET + cii] - cj_x[CL_Y_OFFSET + cjj];
|
|
||||||
MD_FLOAT delz = ci_x[CL_Z_OFFSET + cii] - cj_x[CL_Z_OFFSET + cjj];
|
|
||||||
if(delx * delx + dely * dely + delz * delz < rsq) {
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
/* Returns a diagonal or off-diagonal interaction mask for cj-size=2 */
|
||||||
|
static unsigned int get_imask_simd_j2(int rdiag, int ci, int cj) {
|
||||||
|
return (rdiag && ci * 2 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_0
|
||||||
|
: (rdiag && ci * 2 + 1 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_1
|
||||||
|
: NBNXN_INTERACTION_MASK_ALL));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Returns a diagonal or off-diagonal interaction mask for cj-size=4 */
|
||||||
|
static unsigned int get_imask_simd_j4(int rdiag, int ci, int cj) {
|
||||||
|
return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Returns a diagonal or off-diagonal interaction mask for cj-size=8 */
|
||||||
|
static unsigned int get_imask_simd_j8(int rdiag, int ci, int cj) {
|
||||||
|
return (rdiag && ci == cj * 2 ? NBNXN_INTERACTION_MASK_DIAG_J8_0
|
||||||
|
: (rdiag && ci == cj * 2 + 1 ? NBNXN_INTERACTION_MASK_DIAG_J8_1
|
||||||
|
: NBNXN_INTERACTION_MASK_ALL));
|
||||||
|
}
|
||||||
|
|
||||||
|
#if VECTOR_WIDTH == 2
|
||||||
|
# define get_imask_simd_4xn get_imask_simd_j2
|
||||||
|
#elif VECTOR_WIDTH== 4
|
||||||
|
# define get_imask_simd_4xn get_imask_simd_j4
|
||||||
|
#elif VECTOR_WIDTH == 8
|
||||||
|
# define get_imask_simd_4xn get_imask_simd_j8
|
||||||
|
# define get_imask_simd_2xnn get_imask_simd_j4
|
||||||
|
#elif VECTOR_WIDTH == 16
|
||||||
|
# define get_imask_simd_2xnn get_imask_simd_j8
|
||||||
|
#else
|
||||||
|
# error "Invalid cluster configuration"
|
||||||
|
#endif
|
||||||
|
|
||||||
void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||||
DEBUG_MESSAGE("buildNeighbor start\n");
|
DEBUG_MESSAGE("buildNeighbor start\n");
|
||||||
|
|
||||||
@@ -222,7 +231,8 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
|||||||
if(neighbor->numneigh) free(neighbor->numneigh);
|
if(neighbor->numneigh) free(neighbor->numneigh);
|
||||||
if(neighbor->neighbors) free(neighbor->neighbors);
|
if(neighbor->neighbors) free(neighbor->neighbors);
|
||||||
neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
|
neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
|
||||||
neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*));
|
neighbor->numneigh_masked = (int*) malloc(nmax * sizeof(int));
|
||||||
|
neighbor->neighbors = (NeighborCluster*) malloc(nmax * neighbor->maxneighs * sizeof(NeighborCluster));
|
||||||
}
|
}
|
||||||
|
|
||||||
MD_FLOAT bbx = 0.5 * (binsizex + binsizex);
|
MD_FLOAT bbx = 0.5 * (binsizex + binsizex);
|
||||||
@@ -238,8 +248,8 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
|||||||
|
|
||||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||||
int ci_cj1 = CJ1_FROM_CI(ci);
|
int ci_cj1 = CJ1_FROM_CI(ci);
|
||||||
int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
|
NeighborCluster *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
|
||||||
int n = 0;
|
int n = 0, nmasked = 0;
|
||||||
int ibin = atom->icluster_bin[ci];
|
int ibin = atom->icluster_bin[ci];
|
||||||
MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx;
|
MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx;
|
||||||
MD_FLOAT ibb_xmax = atom->iclusters[ci].bbmaxx;
|
MD_FLOAT ibb_xmax = atom->iclusters[ci].bbmaxx;
|
||||||
@@ -304,7 +314,28 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
|||||||
|
|
||||||
if(d_bb_sq < cutneighsq) {
|
if(d_bb_sq < cutneighsq) {
|
||||||
if(d_bb_sq < rbb_sq || atomDistanceInRange(atom, ci, cj, cutneighsq)) {
|
if(d_bb_sq < rbb_sq || atomDistanceInRange(atom, ci, cj, cutneighsq)) {
|
||||||
neighptr[n++] = cj;
|
// We use true (1) for rdiag because we only care if there are masks
|
||||||
|
// at all, and when this is set to false (0) the self-exclusions are
|
||||||
|
// not accounted for, which makes the optimized version to not work!
|
||||||
|
unsigned int imask;
|
||||||
|
#if CLUSTER_N == (VECTOR_WIDTH / 2) // 2xnn
|
||||||
|
imask = get_imask_simd_2xnn(1, ci, cj);
|
||||||
|
#else // 4xn
|
||||||
|
imask = get_imask_simd_4xn(1, ci, cj);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if(imask == NBNXN_INTERACTION_MASK_ALL) {
|
||||||
|
neighptr[n].cj = cj;
|
||||||
|
neighptr[n].imask = imask;
|
||||||
|
} else {
|
||||||
|
neighptr[n].cj = neighptr[nmasked].cj;
|
||||||
|
neighptr[n].imask = neighptr[nmasked].imask;
|
||||||
|
neighptr[nmasked].cj = cj;
|
||||||
|
neighptr[nmasked].imask = imask;
|
||||||
|
nmasked++;
|
||||||
|
}
|
||||||
|
|
||||||
|
n++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -326,11 +357,14 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
|||||||
// Fill neighbor list with dummy values to fit vector width
|
// Fill neighbor list with dummy values to fit vector width
|
||||||
if(CLUSTER_N < VECTOR_WIDTH) {
|
if(CLUSTER_N < VECTOR_WIDTH) {
|
||||||
while(n % (VECTOR_WIDTH / CLUSTER_N)) {
|
while(n % (VECTOR_WIDTH / CLUSTER_N)) {
|
||||||
neighptr[n++] = atom->dummy_cj; // Last cluster is always a dummy cluster
|
neighptr[n].cj = atom->dummy_cj; // Last cluster is always a dummy cluster
|
||||||
|
neighptr[n].imask = 0;
|
||||||
|
n++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
neighbor->numneigh[ci] = n;
|
neighbor->numneigh[ci] = n;
|
||||||
|
neighbor->numneigh_masked[ci] = nmasked;
|
||||||
if(n >= neighbor->maxneighs) {
|
if(n >= neighbor->maxneighs) {
|
||||||
resize = 1;
|
resize = 1;
|
||||||
|
|
||||||
@@ -344,7 +378,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
|||||||
fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
|
fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
|
||||||
neighbor->maxneighs = new_maxneighs * 1.2;
|
neighbor->maxneighs = new_maxneighs * 1.2;
|
||||||
free(neighbor->neighbors);
|
free(neighbor->neighbors);
|
||||||
neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
|
neighbor->neighbors = (NeighborCluster*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -393,221 +427,33 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
|||||||
DEBUG_MESSAGE("buildNeighbor end\n");
|
DEBUG_MESSAGE("buildNeighbor end\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef USE_SUPER_CLUSTERS
|
|
||||||
// TODO For future parallelization on GPU
|
|
||||||
void buildNeighborGPU(Atom *atom, Neighbor *neighbor) {
|
|
||||||
DEBUG_MESSAGE("buildNeighborGPU start\n");
|
|
||||||
|
|
||||||
/* extend atom arrays if necessary */
|
|
||||||
if(atom->Nsclusters_local > nmax) {
|
|
||||||
nmax = atom->Nsclusters_local;
|
|
||||||
if(neighbor->numneigh) free(neighbor->numneigh);
|
|
||||||
if(neighbor->neighbors) free(neighbor->neighbors);
|
|
||||||
neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
|
|
||||||
neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*));
|
|
||||||
}
|
|
||||||
|
|
||||||
MD_FLOAT bbx = 0.5 * (binsizex + binsizex);
|
|
||||||
MD_FLOAT bby = 0.5 * (binsizey + binsizey);
|
|
||||||
MD_FLOAT rbb_sq = MAX(0.0, cutneigh - 0.5 * sqrt(bbx * bbx + bby * bby));
|
|
||||||
rbb_sq = rbb_sq * rbb_sq;
|
|
||||||
int resize = 1;
|
|
||||||
|
|
||||||
/* loop over each atom, storing neighbors */
|
|
||||||
while(resize) {
|
|
||||||
int new_maxneighs = neighbor->maxneighs;
|
|
||||||
resize = 0;
|
|
||||||
|
|
||||||
for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
|
|
||||||
int ci_cj1 = CJ1_FROM_SCI(sci);
|
|
||||||
int *neighptr = &(neighbor->neighbors[sci * neighbor->maxneighs]);
|
|
||||||
int n = 0;
|
|
||||||
int ibin = atom->sicluster_bin[sci];
|
|
||||||
MD_FLOAT ibb_xmin = atom->siclusters[sci].bbminx;
|
|
||||||
MD_FLOAT ibb_xmax = atom->siclusters[sci].bbmaxx;
|
|
||||||
MD_FLOAT ibb_ymin = atom->siclusters[sci].bbminy;
|
|
||||||
MD_FLOAT ibb_ymax = atom->siclusters[sci].bbmaxy;
|
|
||||||
MD_FLOAT ibb_zmin = atom->siclusters[sci].bbminz;
|
|
||||||
MD_FLOAT ibb_zmax = atom->siclusters[sci].bbmaxz;
|
|
||||||
|
|
||||||
for(int k = 0; k < nstencil; k++) {
|
|
||||||
int jbin = ibin + stencil[k];
|
|
||||||
int *loc_bin = &bin_clusters[jbin * clusters_per_bin];
|
|
||||||
int cj, m = -1;
|
|
||||||
MD_FLOAT jbb_xmin, jbb_xmax, jbb_ymin, jbb_ymax, jbb_zmin, jbb_zmax;
|
|
||||||
const int c = bin_nclusters[jbin];
|
|
||||||
|
|
||||||
if(c > 0) {
|
|
||||||
MD_FLOAT dl, dh, dm, dm0, d_bb_sq;
|
|
||||||
|
|
||||||
do {
|
|
||||||
m++;
|
|
||||||
cj = loc_bin[m];
|
|
||||||
if(neighbor->half_neigh && ci_cj1 > cj) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
jbb_zmin = atom->jclusters[cj].bbminz;
|
|
||||||
jbb_zmax = atom->jclusters[cj].bbmaxz;
|
|
||||||
dl = ibb_zmin - jbb_zmax;
|
|
||||||
dh = jbb_zmin - ibb_zmax;
|
|
||||||
dm = MAX(dl, dh);
|
|
||||||
dm0 = MAX(dm, 0.0);
|
|
||||||
d_bb_sq = dm0 * dm0;
|
|
||||||
} while(m + 1 < c && d_bb_sq > cutneighsq);
|
|
||||||
|
|
||||||
jbb_xmin = atom->jclusters[cj].bbminx;
|
|
||||||
jbb_xmax = atom->jclusters[cj].bbmaxx;
|
|
||||||
jbb_ymin = atom->jclusters[cj].bbminy;
|
|
||||||
jbb_ymax = atom->jclusters[cj].bbmaxy;
|
|
||||||
|
|
||||||
while(m < c) {
|
|
||||||
if(!neighbor->half_neigh || ci_cj1 <= cj) {
|
|
||||||
dl = ibb_zmin - jbb_zmax;
|
|
||||||
dh = jbb_zmin - ibb_zmax;
|
|
||||||
dm = MAX(dl, dh);
|
|
||||||
dm0 = MAX(dm, 0.0);
|
|
||||||
d_bb_sq = dm0 * dm0;
|
|
||||||
|
|
||||||
/*if(d_bb_sq > cutneighsq) {
|
|
||||||
break;
|
|
||||||
}*/
|
|
||||||
|
|
||||||
dl = ibb_ymin - jbb_ymax;
|
|
||||||
dh = jbb_ymin - ibb_ymax;
|
|
||||||
dm = MAX(dl, dh);
|
|
||||||
dm0 = MAX(dm, 0.0);
|
|
||||||
d_bb_sq += dm0 * dm0;
|
|
||||||
|
|
||||||
dl = ibb_xmin - jbb_xmax;
|
|
||||||
dh = jbb_xmin - ibb_xmax;
|
|
||||||
dm = MAX(dl, dh);
|
|
||||||
dm0 = MAX(dm, 0.0);
|
|
||||||
d_bb_sq += dm0 * dm0;
|
|
||||||
|
|
||||||
if(d_bb_sq < cutneighsq) {
|
|
||||||
if(d_bb_sq < rbb_sq || atomDistanceInRangeGPU(atom, sci, cj, cutneighsq)) {
|
|
||||||
neighptr[n++] = cj;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
m++;
|
|
||||||
if(m < c) {
|
|
||||||
cj = loc_bin[m];
|
|
||||||
jbb_xmin = atom->jclusters[cj].bbminx;
|
|
||||||
jbb_xmax = atom->jclusters[cj].bbmaxx;
|
|
||||||
jbb_ymin = atom->jclusters[cj].bbminy;
|
|
||||||
jbb_ymax = atom->jclusters[cj].bbmaxy;
|
|
||||||
jbb_zmin = atom->jclusters[cj].bbminz;
|
|
||||||
jbb_zmax = atom->jclusters[cj].bbmaxz;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fill neighbor list with dummy values to fit vector width
|
|
||||||
if(CLUSTER_N < VECTOR_WIDTH) {
|
|
||||||
while(n % (VECTOR_WIDTH / CLUSTER_N)) {
|
|
||||||
neighptr[n++] = atom->dummy_cj; // Last cluster is always a dummy cluster
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
neighbor->numneigh[sci] = n;
|
|
||||||
if(n >= neighbor->maxneighs) {
|
|
||||||
resize = 1;
|
|
||||||
|
|
||||||
if(n >= new_maxneighs) {
|
|
||||||
new_maxneighs = n;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
for (int scii = 0; scii < atom->siclusters[sci].nclusters; scii++) {
|
|
||||||
//for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
|
||||||
//const int ci = atom->siclusters[sci].iclusters[scii];
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if(resize) {
|
|
||||||
fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
|
|
||||||
neighbor->maxneighs = new_maxneighs * 1.2;
|
|
||||||
free(neighbor->neighbors);
|
|
||||||
neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
DEBUG_MESSAGE("\ncutneighsq = %f, rbb_sq = %f\n", cutneighsq, rbb_sq);
|
|
||||||
for(int ci = 0; ci < 6; ci++) {
|
|
||||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
|
||||||
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
|
||||||
int* neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
|
|
||||||
|
|
||||||
DEBUG_MESSAGE("Cluster %d, bbx = {%f, %f}, bby = {%f, %f}, bbz = {%f, %f}\n",
|
|
||||||
ci,
|
|
||||||
atom->iclusters[ci].bbminx,
|
|
||||||
atom->iclusters[ci].bbmaxx,
|
|
||||||
atom->iclusters[ci].bbminy,
|
|
||||||
atom->iclusters[ci].bbmaxy,
|
|
||||||
atom->iclusters[ci].bbminz,
|
|
||||||
atom->iclusters[ci].bbmaxz);
|
|
||||||
|
|
||||||
for(int cii = 0; cii < CLUSTER_M; cii++) {
|
|
||||||
DEBUG_MESSAGE("%f, %f, %f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
|
|
||||||
}
|
|
||||||
|
|
||||||
DEBUG_MESSAGE("Neighbors:\n");
|
|
||||||
for(int k = 0; k < neighbor->numneigh[ci]; k++) {
|
|
||||||
int cj = neighptr[k];
|
|
||||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
|
||||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
|
||||||
|
|
||||||
DEBUG_MESSAGE(" Cluster %d, bbx = {%f, %f}, bby = {%f, %f}, bbz = {%f, %f}\n",
|
|
||||||
cj,
|
|
||||||
atom->jclusters[cj].bbminx,
|
|
||||||
atom->jclusters[cj].bbmaxx,
|
|
||||||
atom->jclusters[cj].bbminy,
|
|
||||||
atom->jclusters[cj].bbmaxy,
|
|
||||||
atom->jclusters[cj].bbminz,
|
|
||||||
atom->jclusters[cj].bbmaxz);
|
|
||||||
|
|
||||||
for(int cjj = 0; cjj < CLUSTER_N; cjj++) {
|
|
||||||
DEBUG_MESSAGE(" %f, %f, %f\n", cj_x[CL_X_OFFSET + cjj], cj_x[CL_Y_OFFSET + cjj], cj_x[CL_Z_OFFSET + cjj]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
DEBUG_MESSAGE("buildNeighborGPU end\n");
|
|
||||||
}
|
|
||||||
#endif //USE_SUPER_CLUSTERS
|
|
||||||
|
|
||||||
void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
||||||
DEBUG_MESSAGE("pruneNeighbor start\n");
|
DEBUG_MESSAGE("pruneNeighbor start\n");
|
||||||
//MD_FLOAT cutsq = param->cutforce * param->cutforce;
|
//MD_FLOAT cutsq = param->cutforce * param->cutforce;
|
||||||
MD_FLOAT cutsq = cutneighsq;
|
MD_FLOAT cutsq = cutneighsq;
|
||||||
|
|
||||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||||
int *neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
NeighborCluster *neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
||||||
int numneighs = neighbor->numneigh[ci];
|
int numneighs = neighbor->numneigh[ci];
|
||||||
|
int numneighs_masked = neighbor->numneigh_masked[ci];
|
||||||
int k = 0;
|
int k = 0;
|
||||||
|
|
||||||
// Remove dummy clusters if necessary
|
// Remove dummy clusters if necessary
|
||||||
if(CLUSTER_N < VECTOR_WIDTH) {
|
if(CLUSTER_N < VECTOR_WIDTH) {
|
||||||
while(neighs[numneighs - 1] == atom->dummy_cj) {
|
while(neighs[numneighs - 1].cj == atom->dummy_cj) {
|
||||||
numneighs--;
|
numneighs--;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
while(k < numneighs) {
|
while(k < numneighs) {
|
||||||
int cj = neighs[k];
|
int cj = neighs[k].cj;
|
||||||
if(atomDistanceInRange(atom, ci, cj, cutsq)) {
|
if(atomDistanceInRange(atom, ci, cj, cutsq)) {
|
||||||
k++;
|
k++;
|
||||||
} else {
|
} else {
|
||||||
numneighs--;
|
numneighs--;
|
||||||
|
if(k < numneighs_masked) {
|
||||||
|
numneighs_masked--;
|
||||||
|
}
|
||||||
neighs[k] = neighs[numneighs];
|
neighs[k] = neighs[numneighs];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -615,63 +461,19 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
|||||||
// Readd dummy clusters if necessary
|
// Readd dummy clusters if necessary
|
||||||
if(CLUSTER_N < VECTOR_WIDTH) {
|
if(CLUSTER_N < VECTOR_WIDTH) {
|
||||||
while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) {
|
while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) {
|
||||||
neighs[numneighs++] = atom->dummy_cj; // Last cluster is always a dummy cluster
|
neighs[numneighs].cj = atom->dummy_cj; // Last cluster is always a dummy cluster
|
||||||
|
neighs[numneighs].imask = 0;
|
||||||
|
numneighs++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
neighbor->numneigh[ci] = numneighs;
|
neighbor->numneigh[ci] = numneighs;
|
||||||
|
neighbor->numneigh_masked[ci] = numneighs_masked;
|
||||||
}
|
}
|
||||||
|
|
||||||
DEBUG_MESSAGE("pruneNeighbor end\n");
|
DEBUG_MESSAGE("pruneNeighbor end\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef USE_SUPER_CLUSTERS
|
|
||||||
void pruneNeighborGPU(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
|
||||||
DEBUG_MESSAGE("pruneNeighbor start\n");
|
|
||||||
//MD_FLOAT cutsq = param->cutforce * param->cutforce;
|
|
||||||
MD_FLOAT cutsq = cutneighsq;
|
|
||||||
|
|
||||||
for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
|
|
||||||
for (int scii = 0; scii < atom->siclusters[sci].nclusters; scii++) {
|
|
||||||
//const int ci = atom->siclusters[sci].iclusters[scii];
|
|
||||||
const int ci = atom->icluster_idx[SCLUSTER_SIZE * sci + ci];
|
|
||||||
|
|
||||||
int *neighs = &neighbor->neighbors[sci * neighbor->maxneighs];
|
|
||||||
int numneighs = neighbor->numneigh[sci];
|
|
||||||
int k = 0;
|
|
||||||
|
|
||||||
// Remove dummy clusters if necessary
|
|
||||||
if(CLUSTER_N < VECTOR_WIDTH) {
|
|
||||||
while(neighs[numneighs - 1] == atom->dummy_cj) {
|
|
||||||
numneighs--;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
while(k < numneighs) {
|
|
||||||
int cj = neighs[k];
|
|
||||||
if(atomDistanceInRange(atom, ci, cj, cutsq)) {
|
|
||||||
k++;
|
|
||||||
} else {
|
|
||||||
numneighs--;
|
|
||||||
neighs[k] = neighs[numneighs];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Readd dummy clusters if necessary
|
|
||||||
if(CLUSTER_N < VECTOR_WIDTH) {
|
|
||||||
while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) {
|
|
||||||
neighs[numneighs++] = atom->dummy_cj; // Last cluster is always a dummy cluster
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
neighbor->numneigh[sci] = numneighs;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
DEBUG_MESSAGE("pruneNeighbor end\n");
|
|
||||||
}
|
|
||||||
#endif //USE_SUPER_CLUSTERS
|
|
||||||
|
|
||||||
/* internal subroutines */
|
/* internal subroutines */
|
||||||
MD_FLOAT bindist(int i, int j) {
|
MD_FLOAT bindist(int i, int j) {
|
||||||
MD_FLOAT delx, dely, delz;
|
MD_FLOAT delx, dely, delz;
|
||||||
@@ -797,36 +599,6 @@ void sortAtomsByZCoord(Atom *atom) {
|
|||||||
DEBUG_MESSAGE("sortAtomsByZCoord end\n");
|
DEBUG_MESSAGE("sortAtomsByZCoord end\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef USE_SUPER_CLUSTERS
|
|
||||||
// TODO: Use pigeonhole sorting
|
|
||||||
void sortAtomsByCoord(Atom *atom, int dim, int bin, int start_index, int end_index) {
|
|
||||||
//DEBUG_MESSAGE("sortAtomsByCoord start\n");
|
|
||||||
int *bin_ptr = &bins[bin * atoms_per_bin];
|
|
||||||
|
|
||||||
for(int ac_i = start_index; ac_i <= end_index; ac_i++) {
|
|
||||||
int i = bin_ptr[ac_i];
|
|
||||||
int min_ac = ac_i;
|
|
||||||
int min_idx = i;
|
|
||||||
MD_FLOAT min_coord = DIM_COORD(dim, i);
|
|
||||||
|
|
||||||
for(int ac_j = ac_i + 1; ac_j <= end_index; ac_j++) {
|
|
||||||
int j = bin_ptr[ac_j];
|
|
||||||
MD_FLOAT coordj = DIM_COORD(dim, j);
|
|
||||||
if(coordj < min_coord) {
|
|
||||||
min_ac = ac_j;
|
|
||||||
min_idx = j;
|
|
||||||
min_coord = coordj;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
bin_ptr[ac_i] = min_idx;
|
|
||||||
bin_ptr[min_ac] = i;
|
|
||||||
}
|
|
||||||
|
|
||||||
//DEBUG_MESSAGE("sortAtomsByCoord end\n");
|
|
||||||
}
|
|
||||||
#endif //USE_SUPER_CLUSTERS
|
|
||||||
|
|
||||||
void buildClusters(Atom *atom) {
|
void buildClusters(Atom *atom) {
|
||||||
DEBUG_MESSAGE("buildClusters start\n");
|
DEBUG_MESSAGE("buildClusters start\n");
|
||||||
atom->Nclusters_local = 0;
|
atom->Nclusters_local = 0;
|
||||||
@@ -903,175 +675,6 @@ void buildClusters(Atom *atom) {
|
|||||||
DEBUG_MESSAGE("buildClusters end\n");
|
DEBUG_MESSAGE("buildClusters end\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef USE_SUPER_CLUSTERS
|
|
||||||
void buildClustersGPU(Atom *atom) {
|
|
||||||
DEBUG_MESSAGE("buildClustersGPU start\n");
|
|
||||||
atom->Nclusters_local = 0;
|
|
||||||
|
|
||||||
/* bin local atoms */
|
|
||||||
binAtoms(atom);
|
|
||||||
|
|
||||||
for(int bin = 0; bin < mbins; bin++) {
|
|
||||||
int c = bincount[bin];
|
|
||||||
sortAtomsByCoord(atom, ZZ, bin, 0, c - 1);
|
|
||||||
int ac = 0;
|
|
||||||
int nclusters = ((c + CLUSTER_M - 1) / CLUSTER_M);
|
|
||||||
if(CLUSTER_N > CLUSTER_M && nclusters % 2) { nclusters++; }
|
|
||||||
|
|
||||||
int n_super_clusters_xy = nclusters / (SCLUSTER_SIZE_X * SCLUSTER_SIZE_Y);
|
|
||||||
if (nclusters % (SCLUSTER_SIZE_X * SCLUSTER_SIZE_Y)) n_super_clusters_xy++;
|
|
||||||
int n_super_clusters = n_super_clusters_xy / SCLUSTER_SIZE_Z;
|
|
||||||
if (n_super_clusters_xy % SCLUSTER_SIZE_Z) n_super_clusters++;
|
|
||||||
|
|
||||||
int cl_count = 0;
|
|
||||||
for (int scl = 0; scl < n_super_clusters; scl++) {
|
|
||||||
const int sci = atom->Nsclusters_local;
|
|
||||||
if(sci >= atom->Nsclusters_max) {
|
|
||||||
growSuperClusters(atom);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (cl_count >= nclusters) break;
|
|
||||||
|
|
||||||
int scl_offset = scl * SCLUSTER_SIZE * CLUSTER_M;
|
|
||||||
|
|
||||||
MD_FLOAT sc_bbminx = INFINITY, sc_bbmaxx = -INFINITY;
|
|
||||||
MD_FLOAT sc_bbminy = INFINITY, sc_bbmaxy = -INFINITY;
|
|
||||||
MD_FLOAT sc_bbminz = INFINITY, sc_bbmaxz = -INFINITY;
|
|
||||||
|
|
||||||
for (int scl_z = 0; scl_z < SCLUSTER_SIZE_Z; scl_z++) {
|
|
||||||
|
|
||||||
if (cl_count >= nclusters) break;
|
|
||||||
|
|
||||||
const int atom_scl_z_offset = scl_offset + scl_z * SCLUSTER_SIZE_Y * SCLUSTER_SIZE_X * CLUSTER_M;
|
|
||||||
|
|
||||||
|
|
||||||
const int atom_scl_z_end_idx = MIN(atom_scl_z_offset +
|
|
||||||
SCLUSTER_SIZE_Y * SCLUSTER_SIZE_X * CLUSTER_M - 1, c - 1);
|
|
||||||
|
|
||||||
sortAtomsByCoord(atom, YY, bin, atom_scl_z_offset, atom_scl_z_end_idx);
|
|
||||||
|
|
||||||
for (int scl_y = 0; scl_y < SCLUSTER_SIZE_Y; scl_y++) {
|
|
||||||
|
|
||||||
if (cl_count >= nclusters) break;
|
|
||||||
|
|
||||||
const int atom_scl_y_offset = scl_offset +
|
|
||||||
scl_z * SCLUSTER_SIZE_Y * SCLUSTER_SIZE_X * CLUSTER_M +
|
|
||||||
scl_y * SCLUSTER_SIZE_Y * CLUSTER_M;
|
|
||||||
|
|
||||||
const int atom_scl_y_end_idx = MIN(atom_scl_y_offset +
|
|
||||||
SCLUSTER_SIZE_X * CLUSTER_M - 1, c - 1);
|
|
||||||
|
|
||||||
sortAtomsByCoord(atom, XX, bin, atom_scl_y_offset, atom_scl_y_end_idx);
|
|
||||||
|
|
||||||
for (int scl_x = 0; scl_x < SCLUSTER_SIZE_X; scl_x++) {
|
|
||||||
if (cl_count >= nclusters) break;
|
|
||||||
cl_count++;
|
|
||||||
|
|
||||||
const int cluster_sup_idx = scl_z * SCLUSTER_SIZE_Z * SCLUSTER_SIZE_Y +
|
|
||||||
scl_y * SCLUSTER_SIZE_X + scl_x;
|
|
||||||
|
|
||||||
const int ci = atom->Nclusters_local;
|
|
||||||
if(ci >= atom->Nclusters_max) {
|
|
||||||
growClusters(atom);
|
|
||||||
}
|
|
||||||
|
|
||||||
int ci_sca_base = CI_SCALAR_BASE_INDEX(ci);
|
|
||||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
|
||||||
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
|
||||||
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
|
|
||||||
|
|
||||||
int sci_sca_base = SCI_SCALAR_BASE_INDEX(sci);
|
|
||||||
int sci_vec_base = SCI_VECTOR_BASE_INDEX(sci);
|
|
||||||
MD_FLOAT *sci_x = &atom->scl_x[sci_vec_base];
|
|
||||||
MD_FLOAT *sci_v = &atom->scl_v[sci_vec_base];
|
|
||||||
|
|
||||||
int *ci_type = &atom->cl_type[ci_sca_base];
|
|
||||||
MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
|
|
||||||
MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
|
|
||||||
MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;
|
|
||||||
|
|
||||||
atom->iclusters[ci].natoms = 0;
|
|
||||||
for(int cii = 0; cii < CLUSTER_M; cii++) {
|
|
||||||
if(ac < c) {
|
|
||||||
int i = bins[bin * atoms_per_bin + ac];
|
|
||||||
MD_FLOAT xtmp = atom_x(i);
|
|
||||||
MD_FLOAT ytmp = atom_y(i);
|
|
||||||
MD_FLOAT ztmp = atom_z(i);
|
|
||||||
|
|
||||||
ci_x[CL_X_OFFSET + cii] = xtmp;
|
|
||||||
ci_x[CL_Y_OFFSET + cii] = ytmp;
|
|
||||||
ci_x[CL_Z_OFFSET + cii] = ztmp;
|
|
||||||
ci_v[CL_X_OFFSET + cii] = atom->vx[i];
|
|
||||||
ci_v[CL_Y_OFFSET + cii] = atom->vy[i];
|
|
||||||
ci_v[CL_Z_OFFSET + cii] = atom->vz[i];
|
|
||||||
|
|
||||||
sci_x[SCL_CL_X_OFFSET(atom->siclusters[sci].nclusters) + cii] = xtmp;
|
|
||||||
sci_x[SCL_CL_Y_OFFSET(atom->siclusters[sci].nclusters) + cii] = ytmp;
|
|
||||||
sci_x[SCL_CL_Z_OFFSET(atom->siclusters[sci].nclusters) + cii] = ztmp;
|
|
||||||
|
|
||||||
sci_v[SCL_CL_X_OFFSET(atom->siclusters[sci].nclusters) + cii] = atom->vx[i];
|
|
||||||
sci_v[SCL_CL_Y_OFFSET(atom->siclusters[sci].nclusters) + cii] = atom->vy[i];
|
|
||||||
sci_v[SCL_CL_Z_OFFSET(atom->siclusters[sci].nclusters) + cii] = atom->vz[i];
|
|
||||||
|
|
||||||
// TODO: To create the bounding boxes faster, we can use SIMD operations
|
|
||||||
if(bbminx > xtmp) { bbminx = xtmp; }
|
|
||||||
if(bbmaxx < xtmp) { bbmaxx = xtmp; }
|
|
||||||
if(bbminy > ytmp) { bbminy = ytmp; }
|
|
||||||
if(bbmaxy < ytmp) { bbmaxy = ytmp; }
|
|
||||||
if(bbminz > ztmp) { bbminz = ztmp; }
|
|
||||||
if(bbmaxz < ztmp) { bbmaxz = ztmp; }
|
|
||||||
|
|
||||||
ci_type[cii] = atom->type[i];
|
|
||||||
atom->iclusters[ci].natoms++;
|
|
||||||
} else {
|
|
||||||
ci_x[CL_X_OFFSET + cii] = INFINITY;
|
|
||||||
ci_x[CL_Y_OFFSET + cii] = INFINITY;
|
|
||||||
ci_x[CL_Z_OFFSET + cii] = INFINITY;
|
|
||||||
}
|
|
||||||
|
|
||||||
ac++;
|
|
||||||
}
|
|
||||||
|
|
||||||
atom->icluster_bin[ci] = bin;
|
|
||||||
atom->iclusters[ci].bbminx = bbminx;
|
|
||||||
atom->iclusters[ci].bbmaxx = bbmaxx;
|
|
||||||
atom->iclusters[ci].bbminy = bbminy;
|
|
||||||
atom->iclusters[ci].bbmaxy = bbmaxy;
|
|
||||||
atom->iclusters[ci].bbminz = bbminz;
|
|
||||||
atom->iclusters[ci].bbmaxz = bbmaxz;
|
|
||||||
atom->Nclusters_local++;
|
|
||||||
|
|
||||||
// TODO: To create the bounding boxes faster, we can use SIMD operations
|
|
||||||
if(sc_bbminx > bbminx) { sc_bbminx = bbminx; }
|
|
||||||
if(sc_bbmaxx < bbmaxx) { sc_bbmaxx = bbmaxx; }
|
|
||||||
if(sc_bbminy > bbminy) { sc_bbminy = bbminy; }
|
|
||||||
if(sc_bbmaxy < bbmaxy) { sc_bbmaxy = bbmaxy; }
|
|
||||||
if(sc_bbminz > bbminz) { sc_bbminz = bbminz; }
|
|
||||||
if(sc_bbmaxz < bbmaxz) { sc_bbmaxz = bbmaxz; }
|
|
||||||
|
|
||||||
atom->siclusters[sci].nclusters++;
|
|
||||||
atom->icluster_idx[SCLUSTER_SIZE * sci + cluster_sup_idx] = ci;
|
|
||||||
//atom->siclusters[sci].iclusters[cluster_sup_idx] = ci;
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
atom->sicluster_bin[sci] = bin;
|
|
||||||
atom->siclusters[sci].bbminx = sc_bbminx;
|
|
||||||
atom->siclusters[sci].bbmaxx = sc_bbmaxx;
|
|
||||||
atom->siclusters[sci].bbminy = sc_bbminy;
|
|
||||||
atom->siclusters[sci].bbmaxy = sc_bbmaxy;
|
|
||||||
atom->siclusters[sci].bbminz = sc_bbminz;
|
|
||||||
atom->siclusters[sci].bbmaxz = sc_bbmaxz;
|
|
||||||
atom->Nsclusters_local++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
DEBUG_MESSAGE("buildClustersGPU end\n");
|
|
||||||
}
|
|
||||||
#endif //USE_SUPER_CLUSTERS
|
|
||||||
|
|
||||||
void defineJClusters(Atom *atom) {
|
void defineJClusters(Atom *atom) {
|
||||||
DEBUG_MESSAGE("defineJClusters start\n");
|
DEBUG_MESSAGE("defineJClusters start\n");
|
||||||
|
|
||||||
|
180
gromacs/pbc.c
180
gromacs/pbc.c
@@ -86,98 +86,6 @@ void cpuUpdatePbc(Atom *atom, Parameter *param, int firstUpdate) {
|
|||||||
DEBUG_MESSAGE("updatePbc end\n");
|
DEBUG_MESSAGE("updatePbc end\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
/* update coordinates of ghost atoms */
|
|
||||||
/* uses mapping created in setupPbc */
|
|
||||||
void gpuUpdatePbc(Atom *atom, Parameter *param, int firstUpdate) {
|
|
||||||
DEBUG_MESSAGE("gpuUpdatePbc start\n");
|
|
||||||
int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
|
|
||||||
int ncj = atom->Nclusters_local / jfac;
|
|
||||||
MD_FLOAT xprd = param->xprd;
|
|
||||||
MD_FLOAT yprd = param->yprd;
|
|
||||||
MD_FLOAT zprd = param->zprd;
|
|
||||||
|
|
||||||
for(int cg = 0; cg < atom->Nclusters_ghost; cg++) {
|
|
||||||
const int cj = ncj + cg;
|
|
||||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
|
||||||
|
|
||||||
int scj_vec_base = SCJ_VECTOR_BASE_INDEX(cj);
|
|
||||||
|
|
||||||
int bmap_vec_base = CJ_VECTOR_BASE_INDEX(atom->border_map[cg]);
|
|
||||||
|
|
||||||
int sbmap_vec_base = SCJ_VECTOR_BASE_INDEX(atom->border_map[cg]);
|
|
||||||
|
|
||||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
|
||||||
MD_FLOAT *bmap_x = &atom->cl_x[bmap_vec_base];
|
|
||||||
|
|
||||||
MD_FLOAT *scj_x = &atom->scl_x[scj_vec_base];
|
|
||||||
MD_FLOAT *sbmap_x = &atom->scl_x[sbmap_vec_base];
|
|
||||||
|
|
||||||
MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
|
|
||||||
MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
|
|
||||||
MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;
|
|
||||||
|
|
||||||
MD_FLOAT sbbminx = INFINITY, sbbmaxx = -INFINITY;
|
|
||||||
MD_FLOAT sbbminy = INFINITY, sbbmaxy = -INFINITY;
|
|
||||||
MD_FLOAT sbbminz = INFINITY, sbbmaxz = -INFINITY;
|
|
||||||
|
|
||||||
for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
|
|
||||||
MD_FLOAT xtmp = bmap_x[CL_X_OFFSET + cjj] + atom->PBCx[cg] * xprd;
|
|
||||||
MD_FLOAT ytmp = bmap_x[CL_Y_OFFSET + cjj] + atom->PBCy[cg] * yprd;
|
|
||||||
MD_FLOAT ztmp = bmap_x[CL_Z_OFFSET + cjj] + atom->PBCz[cg] * zprd;
|
|
||||||
|
|
||||||
MD_FLOAT sxtmp = sbmap_x[CL_X_OFFSET + cjj] + atom->PBCx[cg] * xprd;
|
|
||||||
MD_FLOAT sytmp = sbmap_x[CL_Y_OFFSET + cjj] + atom->PBCy[cg] * yprd;
|
|
||||||
MD_FLOAT sztmp = sbmap_x[CL_Z_OFFSET + cjj] + atom->PBCz[cg] * zprd;
|
|
||||||
|
|
||||||
cj_x[CL_X_OFFSET + cjj] = xtmp;
|
|
||||||
cj_x[CL_Y_OFFSET + cjj] = ytmp;
|
|
||||||
cj_x[CL_Z_OFFSET + cjj] = ztmp;
|
|
||||||
|
|
||||||
scj_x[SCL_X_OFFSET + cjj] = sxtmp;
|
|
||||||
scj_x[SCL_Y_OFFSET + cjj] = sytmp;
|
|
||||||
scj_x[SCL_Z_OFFSET + cjj] = sztmp;
|
|
||||||
|
|
||||||
if(firstUpdate) {
|
|
||||||
// TODO: To create the bounding boxes faster, we can use SIMD operations
|
|
||||||
if(bbminx > xtmp) { bbminx = xtmp; }
|
|
||||||
if(bbmaxx < xtmp) { bbmaxx = xtmp; }
|
|
||||||
if(bbminy > ytmp) { bbminy = ytmp; }
|
|
||||||
if(bbmaxy < ytmp) { bbmaxy = ytmp; }
|
|
||||||
if(bbminz > ztmp) { bbminz = ztmp; }
|
|
||||||
if(bbmaxz < ztmp) { bbmaxz = ztmp; }
|
|
||||||
|
|
||||||
if(sbbminx > sxtmp) { sbbminx = sxtmp; }
|
|
||||||
if(sbbmaxx < sxtmp) { sbbmaxx = sxtmp; }
|
|
||||||
if(sbbminy > sytmp) { sbbminy = sytmp; }
|
|
||||||
if(sbbmaxy < sytmp) { sbbmaxy = sytmp; }
|
|
||||||
if(sbbminz > sztmp) { sbbminz = sztmp; }
|
|
||||||
if(sbbmaxz < sztmp) { sbbmaxz = sztmp; }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if(firstUpdate) {
|
|
||||||
for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) {
|
|
||||||
cj_x[CL_X_OFFSET + cjj] = INFINITY;
|
|
||||||
cj_x[CL_Y_OFFSET + cjj] = INFINITY;
|
|
||||||
cj_x[CL_Z_OFFSET + cjj] = INFINITY;
|
|
||||||
|
|
||||||
scj_x[SCL_X_OFFSET + cjj] = INFINITY;
|
|
||||||
scj_x[SCL_Y_OFFSET + cjj] = INFINITY;
|
|
||||||
scj_x[SCL_Z_OFFSET + cjj] = INFINITY;
|
|
||||||
}
|
|
||||||
|
|
||||||
atom->jclusters[cj].bbminx = bbminx;
|
|
||||||
atom->jclusters[cj].bbmaxx = bbmaxx;
|
|
||||||
atom->jclusters[cj].bbminy = bbminy;
|
|
||||||
atom->jclusters[cj].bbmaxy = bbmaxy;
|
|
||||||
atom->jclusters[cj].bbminz = bbminz;
|
|
||||||
atom->jclusters[cj].bbmaxz = bbmaxz;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
DEBUG_MESSAGE("gpuUpdatePbc end\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
/* relocate atoms that have left domain according
|
/* relocate atoms that have left domain according
|
||||||
* to periodic boundary conditions */
|
* to periodic boundary conditions */
|
||||||
void updateAtomsPbc(Atom *atom, Parameter *param) {
|
void updateAtomsPbc(Atom *atom, Parameter *param) {
|
||||||
@@ -321,91 +229,3 @@ void setupPbc(Atom *atom, Parameter *param) {
|
|||||||
cpuUpdatePbc(atom, param, 1);
|
cpuUpdatePbc(atom, param, 1);
|
||||||
DEBUG_MESSAGE("setupPbc end\n");
|
DEBUG_MESSAGE("setupPbc end\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
 * Create the ghost clusters for the super-cluster layout.
 *
 * Every non-empty j-cluster whose bounding box lies within the neighbor
 * cutoff of a box face gets one ghost image per face, corner and edge it
 * touches (via ADDGHOST). Afterwards a dummy cluster filled with INFINITY is
 * appended, the ghost counters in `atom` are set, and the ghost positions are
 * computed by gpuUpdatePbc(atom, param, 1).
 *
 * NOTE(review): ADDGHOST is a macro defined outside this block; it presumably
 * captures the locals Nghost, Nghost_atoms, ncj, cj and atom by name, which
 * is why those names are kept unchanged here - confirm against its definition.
 */
void setupPbcGPU(Atom *atom, Parameter *param) {
    DEBUG_MESSAGE("setupPbcGPU start\n");

    MD_FLOAT xprd = param->xprd;
    MD_FLOAT yprd = param->yprd;
    MD_FLOAT zprd = param->zprd;
    MD_FLOAT Cutneigh = param->cutneigh;
    int jfac = SCLUSTER_M / CLUSTER_M;
    int ncj = atom->Nsclusters_local * jfac;
    int Nghost = -1;        /* index of the last ghost cluster created so far */
    int Nghost_atoms = 0;

    for(int cj = 0; cj < ncj; cj++) {
        /* Empty clusters never produce ghosts. */
        if(atom->jclusters[cj].natoms <= 0) {
            continue;
        }

        /* Make room before any ghost is appended for this cluster. */
        if(atom->Nsclusters_local + (Nghost + (jfac - 1) + 7) / jfac >= atom->Nclusters_max) {
            growClusters(atom);
        }

        if((Nghost + 7) * CLUSTER_M >= NmaxGhost) {
            growPbc(atom);
        }

        /* Read the bounding box only after the arrays may have been grown. */
        MD_FLOAT bbminx = atom->jclusters[cj].bbminx;
        MD_FLOAT bbmaxx = atom->jclusters[cj].bbmaxx;
        MD_FLOAT bbminy = atom->jclusters[cj].bbminy;
        MD_FLOAT bbmaxy = atom->jclusters[cj].bbmaxy;
        MD_FLOAT bbminz = atom->jclusters[cj].bbminz;
        MD_FLOAT bbmaxz = atom->jclusters[cj].bbmaxz;

        /* Which box faces is this cluster close to? */
        const int near_lo_x = bbminx < Cutneigh;
        const int near_hi_x = bbmaxx >= (xprd-Cutneigh);
        const int near_lo_y = bbminy < Cutneigh;
        const int near_hi_y = bbmaxy >= (yprd-Cutneigh);
        const int near_lo_z = bbminz < Cutneigh;
        const int near_hi_z = bbmaxz >= (zprd-Cutneigh);

        /* Setup ghost atoms; the order below fixes the ghost indices. */
        /* 6 planes */
        if (near_lo_x) { ADDGHOST(+1,0,0); }
        if (near_hi_x) { ADDGHOST(-1,0,0); }
        if (near_lo_y) { ADDGHOST(0,+1,0); }
        if (near_hi_y) { ADDGHOST(0,-1,0); }
        if (near_lo_z) { ADDGHOST(0,0,+1); }
        if (near_hi_z) { ADDGHOST(0,0,-1); }
        /* 8 corners */
        if (near_lo_x && near_lo_y && near_lo_z) { ADDGHOST(+1,+1,+1); }
        if (near_lo_x && near_hi_y && near_lo_z) { ADDGHOST(+1,-1,+1); }
        if (near_lo_x && near_lo_y && near_hi_z) { ADDGHOST(+1,+1,-1); }
        if (near_lo_x && near_hi_y && near_hi_z) { ADDGHOST(+1,-1,-1); }
        if (near_hi_x && near_lo_y && near_lo_z) { ADDGHOST(-1,+1,+1); }
        if (near_hi_x && near_hi_y && near_lo_z) { ADDGHOST(-1,-1,+1); }
        if (near_hi_x && near_lo_y && near_hi_z) { ADDGHOST(-1,+1,-1); }
        if (near_hi_x && near_hi_y && near_hi_z) { ADDGHOST(-1,-1,-1); }
        /* 12 edges */
        if (near_lo_x && near_lo_z) { ADDGHOST(+1,0,+1); }
        if (near_lo_x && near_hi_z) { ADDGHOST(+1,0,-1); }
        if (near_hi_x && near_lo_z) { ADDGHOST(-1,0,+1); }
        if (near_hi_x && near_hi_z) { ADDGHOST(-1,0,-1); }
        if (near_lo_y && near_lo_z) { ADDGHOST(0,+1,+1); }
        if (near_lo_y && near_hi_z) { ADDGHOST(0,+1,-1); }
        if (near_hi_y && near_lo_z) { ADDGHOST(0,-1,+1); }
        if (near_hi_y && near_hi_z) { ADDGHOST(0,-1,-1); }
        if (near_lo_y && near_lo_x) { ADDGHOST(+1,+1,0); }
        if (near_lo_y && near_hi_x) { ADDGHOST(-1,+1,0); }
        if (near_hi_y && near_lo_x) { ADDGHOST(+1,-1,0); }
        if (near_hi_y && near_hi_x) { ADDGHOST(-1,-1,0); }
    }

    if(ncj + (Nghost + (jfac - 1) + 1) / jfac >= atom->Nclusters_max) {
        growClusters(atom);
    }

    // Add dummy cluster at the end
    const int dummy_idx = ncj + Nghost + 1;
    MD_FLOAT *dummy_x = &atom->cl_x[CJ_VECTOR_BASE_INDEX(dummy_idx)];
    for(int slot = 0; slot < CLUSTER_N; slot++) {
        dummy_x[CL_X_OFFSET + slot] = INFINITY;
        dummy_x[CL_Y_OFFSET + slot] = INFINITY;
        dummy_x[CL_Z_OFFSET + slot] = INFINITY;
    }

    // Nghost is the last ghost index; add one to make it the ghost cluster count
    atom->dummy_cj = dummy_idx;
    atom->Nghost = Nghost_atoms;
    atom->Nclusters_ghost = Nghost + 1;
    atom->Nclusters = atom->Nclusters_local + Nghost + 1;

    // Update created ghost clusters positions
    gpuUpdatePbc(atom, param, 1);
    DEBUG_MESSAGE("setupPbcGPU end\n");
}
|
|
||||||
|
@@ -13,7 +13,7 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
|
|||||||
MEM_TRACER_INIT;
|
MEM_TRACER_INIT;
|
||||||
INDEX_TRACER_INIT;
|
INDEX_TRACER_INIT;
|
||||||
int Nlocal = atom->Nlocal;
|
int Nlocal = atom->Nlocal;
|
||||||
int* neighs;
|
NeighborCluster* neighs;
|
||||||
//MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
|
//MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
|
||||||
|
|
||||||
INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
|
INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
|
||||||
@@ -34,7 +34,8 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
|
|||||||
DIST_TRACE(neighs, numneighs);
|
DIST_TRACE(neighs, numneighs);
|
||||||
|
|
||||||
for(int k = 0; k < numneighs; k++) {
|
for(int k = 0; k < numneighs; k++) {
|
||||||
MEM_TRACE(neighs[k], 'R');
|
int j = neighs[k].cj;
|
||||||
|
MEM_TRACE(j, 'R');
|
||||||
MEM_TRACE(atom_x(j), 'R');
|
MEM_TRACE(atom_x(j), 'R');
|
||||||
MEM_TRACE(atom_y(j), 'R');
|
MEM_TRACE(atom_y(j), 'R');
|
||||||
MEM_TRACE(atom_z(j), 'R');
|
MEM_TRACE(atom_z(j), 'R');
|
||||||
|
332
gromacs/utils.c
332
gromacs/utils.c
@@ -1,332 +0,0 @@
|
|||||||
|
|
||||||
/*
|
|
||||||
 * Temporary functions for debugging; remove before proceeding with the pull request
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <utils.h>
|
|
||||||
|
|
||||||
extern void alignDataToSuperclusters(Atom *atom);
|
|
||||||
extern void alignDataFromSuperclusters(Atom *atom);
|
|
||||||
|
|
||||||
#ifdef USE_SUPER_CLUSTERS
|
|
||||||
/*
|
|
||||||
void verifyClusters(Atom *atom) {
|
|
||||||
unsigned int count = 0;
|
|
||||||
|
|
||||||
for (int i = 0; i < atom->Nsclusters_local; i++) {
|
|
||||||
for (int j = 0; j < atom->siclusters[i].nclusters; j++) {
|
|
||||||
for(int cii = 0; cii < CLUSTER_M; cii++, count++);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
MD_FLOAT *x = malloc(count * sizeof(MD_FLOAT));
|
|
||||||
MD_FLOAT *y = malloc(count * sizeof(MD_FLOAT));
|
|
||||||
MD_FLOAT *z = malloc(count * sizeof(MD_FLOAT));
|
|
||||||
|
|
||||||
count = 0;
|
|
||||||
unsigned int diffs = 0;
|
|
||||||
|
|
||||||
printf("######### %d #########\r\n", atom->Nsclusters_local);
|
|
||||||
for (int i = 0; i < atom->Nsclusters_local; i++) {
|
|
||||||
printf("######### %d\t #########\r\n", atom->siclusters[i].nclusters);
|
|
||||||
|
|
||||||
for (int j = 0; j < atom->siclusters[i].nclusters; j++) {
|
|
||||||
//printf("%d\t", atom.siclusters[i].iclusters[j]);
|
|
||||||
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->siclusters[i].iclusters[j])];
|
|
||||||
|
|
||||||
if (atom->iclusters[atom->siclusters[i].iclusters[j]].bbminx < atom->siclusters[i].bbminx ||
|
|
||||||
atom->iclusters[atom->siclusters[i].iclusters[j]].bbmaxx > atom->siclusters[i].bbmaxx ||
|
|
||||||
atom->iclusters[atom->siclusters[i].iclusters[j]].bbminy < atom->siclusters[i].bbminy ||
|
|
||||||
atom->iclusters[atom->siclusters[i].iclusters[j]].bbmaxy > atom->siclusters[i].bbmaxy ||
|
|
||||||
atom->iclusters[atom->siclusters[i].iclusters[j]].bbminz < atom->siclusters[i].bbminz ||
|
|
||||||
atom->iclusters[atom->siclusters[i].iclusters[j]].bbmaxz > atom->siclusters[i].bbmaxz) diffs++;
|
|
||||||
|
|
||||||
|
|
||||||
for(int cii = 0; cii < CLUSTER_M; cii++, count++) {
|
|
||||||
x[count] = ci_x[CL_X_OFFSET + cii];
|
|
||||||
y[count] = ci_x[CL_Y_OFFSET + cii];
|
|
||||||
z[count] = ci_x[CL_Z_OFFSET + cii];
|
|
||||||
//printf("x: %f\ty: %f\tz: %f\r\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
printf("######### \t #########\r\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
printf("######### Diffs: %d\t #########\r\n", diffs);
|
|
||||||
|
|
||||||
printf("\r\n");
|
|
||||||
|
|
||||||
count = 0;
|
|
||||||
diffs = 0;
|
|
||||||
|
|
||||||
for (int i = 0; i < atom->Nclusters_local; i++) {
|
|
||||||
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(i)];
|
|
||||||
|
|
||||||
for(int cii = 0; cii < CLUSTER_M; cii++, count++) {
|
|
||||||
if (ci_x[CL_X_OFFSET + cii] != x[count] ||
|
|
||||||
ci_x[CL_Y_OFFSET + cii] != y[count] ||
|
|
||||||
ci_x[CL_Z_OFFSET + cii] != z[count]) diffs++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
printf("######### Diffs: %d\t #########\r\n", diffs);
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
/*
 * Debug helper: dump a fixed window of the super-cluster and cluster arrays
 * scl_f and cl_f (presumably the force arrays - confirm).
 *
 * For super-clusters 4 and 5 every slot of atom->scl_f is printed together
 * with the owning cluster index taken from atom->icluster_idx; afterwards
 * every slot of atom->cl_f is printed for clusters 35 and 36.
 * Both windows are clipped to the number of local (super-)clusters so that
 * small systems no longer read past the populated part of the arrays.
 */
void verifyLayout(Atom *atom) {
    /* Inspection windows [first, end) hard-wired for this debugging session. */
    enum { SCI_FIRST = 4, SCI_END = 6, CI_FIRST = 35, CI_END = 37 };

    printf("verifyLayout start\r\n");

    for (int sci = SCI_FIRST; sci < SCI_END && sci < atom->Nsclusters_local; sci++) {
        MD_FLOAT *sci_f = &atom->scl_f[SCI_VECTOR_BASE_INDEX(sci)];

        for (int cii = 0; cii < SCLUSTER_M; ++cii) {
            /* Split the flat slot index into (cluster in super-cluster, slot in cluster). */
            const int cl_idx = cii / CLUSTER_M;
            const int ciii = cii % CLUSTER_M;

            printf("%d\t%d\t%f\t%f\t%f\r\n",
                   atom->icluster_idx[SCLUSTER_SIZE * sci + cl_idx],
                   cl_idx,
                   sci_f[SCL_CL_X_OFFSET(cl_idx) + ciii],
                   sci_f[SCL_CL_Y_OFFSET(cl_idx) + ciii],
                   sci_f[SCL_CL_Z_OFFSET(cl_idx) + ciii]);
        }

        printf("##########\t##########\r\n");
    }

    printf("\r\n");

    for (int ci = CI_FIRST; ci < CI_END && ci < atom->Nclusters_local; ci++) {
        printf("$$$$$$$$$$\t%d\t%d\t$$$$$$$$$$\r\n", ci, atom->icluster_bin[ci]);
        MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(ci)];

        for(int cii = 0; cii < CLUSTER_M; cii++) {
            printf("%f\t%f\t%f\r\n",
                   ci_f[CL_X_OFFSET + cii],
                   ci_f[CL_Y_OFFSET + cii],
                   ci_f[CL_Z_OFFSET + cii]);
        }

        printf("##########\t##########\r\n");
    }

    printf("verifyLayout end\r\n");
}
|
|
||||||
|
|
||||||
/*
 * Debug helper: call alignDataToSuperclusters(atom) and then dump a fixed
 * window of the position arrays for visual comparison.
 *
 * Prints every slot of atom->scl_x for super-clusters 4 and 5, followed by
 * every slot of atom->cl_x for clusters 35 and 36. Both windows are clipped
 * to the number of local (super-)clusters so that small systems do not read
 * past the populated part of the arrays.
 */
void checkAlignment(Atom *atom) {
    /* Inspection windows [first, end) hard-wired for this debugging session. */
    enum { SCI_FIRST = 4, SCI_END = 6, CI_FIRST = 35, CI_END = 37 };

    alignDataToSuperclusters(atom);

    for (int sci = SCI_FIRST; sci < SCI_END && sci < atom->Nsclusters_local; sci++) {
        MD_FLOAT *sci_x = &atom->scl_x[SCI_VECTOR_BASE_INDEX(sci)];

        for (int cii = 0; cii < SCLUSTER_M; ++cii) {
            /* Split the flat slot index into (cluster in super-cluster, slot in cluster). */
            const int cl_idx = cii / CLUSTER_M;
            const int ciii = cii % CLUSTER_M;

            printf("%d\t%f\t%f\t%f\r\n", cl_idx,
                   sci_x[SCL_CL_X_OFFSET(cl_idx) + ciii],
                   sci_x[SCL_CL_Y_OFFSET(cl_idx) + ciii],
                   sci_x[SCL_CL_Z_OFFSET(cl_idx) + ciii]);
        }
    }

    for (int ci = CI_FIRST; ci < CI_END && ci < atom->Nclusters_local; ci++) {
        printf("$$$$$$$$$$\t%d\t%d\t$$$$$$$$$$\r\n", ci, atom->icluster_bin[ci]);
        MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(ci)];

        for(int cii = 0; cii < CLUSTER_M; cii++) {
            printf("%f\t%f\t%f\r\n",
                   ci_x[CL_X_OFFSET + cii],
                   ci_x[CL_Y_OFFSET + cii],
                   ci_x[CL_Z_OFFSET + cii]);
        }

        printf("##########\t##########\r\n");
    }
}
|
|
||||||
|
|
||||||
/*
 * Debug helper: print every slot of atom->scl_x for super-clusters 4 and 5.
 *
 * Each output line holds the index of the cluster inside the super-cluster
 * followed by the x, y and z entries of one slot. The window is clipped to
 * atom->Nsclusters_local so that small systems do not read past the
 * populated part of the array.
 */
void showSuperclusters(Atom *atom) {
    /* Inspection window [first, end) hard-wired for this debugging session. */
    enum { SCI_FIRST = 4, SCI_END = 6 };

    for (int sci = SCI_FIRST; sci < SCI_END && sci < atom->Nsclusters_local; sci++) {
        MD_FLOAT *sci_x = &atom->scl_x[SCI_VECTOR_BASE_INDEX(sci)];

        for (int cii = 0; cii < SCLUSTER_M; ++cii) {
            /* Split the flat slot index into (cluster in super-cluster, slot in cluster). */
            const int cl_idx = cii / CLUSTER_M;
            const int ciii = cii % CLUSTER_M;

            printf("%d\t%f\t%f\t%f\r\n", cl_idx,
                   sci_x[SCL_CL_X_OFFSET(cl_idx) + ciii],
                   sci_x[SCL_CL_Y_OFFSET(cl_idx) + ciii],
                   sci_x[SCL_CL_Z_OFFSET(cl_idx) + ciii]);
        }
    }
}
|
|
||||||
|
|
||||||
/*
 * Debug helper: print the neighbor list of every local cluster.
 * One output line per cluster; entries are separated by a single space.
 */
void printNeighs(Atom *atom, Neighbor *neighbor) {
    int ci = 0;

    while (ci < atom->Nclusters_local) {
        /* Each cluster owns a row of maxneighs entries in the flat list. */
        const int row_start = ci * neighbor->maxneighs;
        const int count = neighbor->numneigh[ci];

        for (int k = 0; k < count; k++) {
            printf("%d ", neighbor->neighbors[row_start + k]);
        }

        printf("\r\n");
        ++ci;
    }
}
|
|
||||||
|
|
||||||
/*
 * Debug helper: print, for every local super-cluster, the cluster indices
 * recorded for it in atom->icluster_idx.
 * One output line per super-cluster; entries are separated by a single space.
 */
void printClusterIndices(Atom *atom) {
    int sci = 0;

    while (sci < atom->Nsclusters_local) {
        /* Each super-cluster owns SCLUSTER_SIZE consecutive index slots. */
        const int first_slot = SCLUSTER_SIZE * sci;
        const int count = atom->siclusters[sci].nclusters;

        for (int k = 0; k < count; k++) {
            printf("%d ", atom->icluster_idx[k + first_slot]);
        }

        printf("\r\n");
        ++sci;
    }
}
|
|
||||||
|
|
||||||
/*
 * Debug helper: compare the neighbor lists produced by buildNeighbor() with
 * those produced by buildNeighborGPU().
 *
 * The lists from buildNeighbor() are copied aside and cleared in `neighbor`,
 * then buildNeighborGPU() is run and both results are compared. Two counters
 * are printed: the number of clusters whose neighbor count differs, and the
 * number of neighbor entries that differ. An entry of the second list that
 * has no counterpart in the first (the second list is longer) counts as a
 * difference.
 *
 * Changes compared to the previous version: the reference copy is sized in
 * `int` elements (it used sizeof(int*)), allocations are checked, the copy is
 * indexed with the stride it was saved with, uninitialised reference entries
 * are no longer read, and both buffers are released before returning.
 */
void verifyNeigh(Atom *atom, Neighbor *neighbor) {
    buildNeighbor(atom, neighbor);

    /* Shape of the reference lists at the time they are saved. */
    const int ref_clusters = atom->Nclusters_local;
    const int ref_stride = neighbor->maxneighs;
    const size_t n_counts = ref_clusters > 0 ? (size_t) ref_clusters : 1;
    const size_t n_entries = ref_stride > 0 ? n_counts * (size_t) ref_stride : 1;

    int *numneigh = malloc(n_counts * sizeof *numneigh);
    int *neighbors = malloc(n_entries * sizeof *neighbors);

    if (numneigh == NULL || neighbors == NULL) {
        fprintf(stderr, "verifyNeigh: could not allocate reference neighbor lists\n");
        free(numneigh);
        free(neighbors);
        return;
    }

    /* Save the reference lists and clear them in `neighbor`. */
    for (int i = 0; i < ref_clusters; ++i) {
        const int neigh_num = neighbor->numneigh[i];
        numneigh[i] = neigh_num;
        neighbor->numneigh[i] = 0;

        for (int j = 0; j < neigh_num; j++) {
            neighbors[i * ref_stride + j] = neighbor->neighbors[i * neighbor->maxneighs + j];
            neighbor->neighbors[i * neighbor->maxneighs + j] = 0;
        }
    }

    buildNeighborGPU(atom, neighbor);

    unsigned int num_diff = 0;
    unsigned int neigh_diff = 0;

    /* Only clusters that exist in both runs can be compared. */
    for (int i = 0; i < ref_clusters && i < atom->Nclusters_local; ++i) {
        const int neigh_num = neighbor->numneigh[i];

        if (numneigh[i] != neigh_num) {
            num_diff++;
        }

        for (int j = 0; j < neigh_num; j++) {
            if (j >= numneigh[i] ||
                neighbors[i * ref_stride + j] != neighbor->neighbors[i * neighbor->maxneighs + j]) {
                neigh_diff++;
            }
        }
    }

    printf("%u\t%u\r\n", num_diff, neigh_diff);

    free(neighbors);
    free(numneigh);
}
|
|
||||||
|
|
||||||
#endif //USE_SUPER_CLUSTERS
|
|
@@ -15,61 +15,8 @@ void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep) {
|
|||||||
write_ghost_atoms_to_vtk_file(filename, atom, timestep);
|
write_ghost_atoms_to_vtk_file(filename, atom, timestep);
|
||||||
write_local_cluster_edges_to_vtk_file(filename, atom, timestep);
|
write_local_cluster_edges_to_vtk_file(filename, atom, timestep);
|
||||||
write_ghost_cluster_edges_to_vtk_file(filename, atom, timestep);
|
write_ghost_cluster_edges_to_vtk_file(filename, atom, timestep);
|
||||||
#ifdef USE_SUPER_CLUSTERS
|
|
||||||
write_super_clusters_to_vtk_file(filename, atom, timestep);
|
|
||||||
#endif //#ifdef USE_SUPER_CLUSTERS
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef USE_SUPER_CLUSTERS
|
|
||||||
/*
 * Write all slots of the local super-clusters as points of a legacy ASCII
 * VTK unstructured grid to "<filename>_sup_<timestep>.vtk".
 *
 * filename: prefix of the output file name.
 * atom:     provides Nsclusters_local and the super-cluster positions scl_x.
 * timestep: number appended to the file name.
 *
 * Returns 0 on success and -1 if the file could not be opened.
 *
 * The CELLS, CELL_TYPES and POINT_DATA sections now use the same count as the
 * POINTS section (Nsclusters_local * SCLUSTER_M); they used atom->Nlocal
 * before, which made the file inconsistent whenever the two numbers differ.
 */
int write_super_clusters_to_vtk_file(const char* filename, Atom* atom, int timestep) {
    char timestep_filename[128];
    snprintf(timestep_filename, sizeof timestep_filename, "%s_sup_%d.vtk", filename, timestep);
    FILE* fp = fopen(timestep_filename, "wb");

    if(fp == NULL) {
        fprintf(stderr, "Could not open VTK file for writing!\n");
        return -1;
    }

    /* One point is written per slot of every local super-cluster. */
    const int npoints = atom->Nsclusters_local * SCLUSTER_M;

    fprintf(fp, "# vtk DataFile Version 2.0\n");
    fprintf(fp, "Particle data\n");
    fprintf(fp, "ASCII\n");
    fprintf(fp, "DATASET UNSTRUCTURED_GRID\n");
    fprintf(fp, "POINTS %d double\n", npoints);
    for(int ci = 0; ci < atom->Nsclusters_local; ++ci) {
        /* NOTE(review): every super-cluster is scaled by its own random
         * factor, presumably to pull super-clusters apart in the viewer;
         * the written coordinates are therefore not the real positions. */
        int factor = (rand() % 1000) + 1;

        int ci_vec_base = SCI_VECTOR_BASE_INDEX(ci);
        MD_FLOAT *ci_x = &atom->scl_x[ci_vec_base];
        for(int cii = 0; cii < SCLUSTER_M; ++cii) {
            fprintf(fp, "%.4f %.4f %.4f\n", ci_x[SCL_X_OFFSET + cii] * factor, ci_x[SCL_Y_OFFSET + cii] * factor, ci_x[SCL_Z_OFFSET + cii] * factor);
        }
    }
    fprintf(fp, "\n\n");

    /* One single-point cell per point: "1 <point index>" is two integers. */
    fprintf(fp, "CELLS %d %d\n", npoints, npoints * 2);
    for(int i = 0; i < npoints; ++i) {
        fprintf(fp, "1 %d\n", i);
    }
    fprintf(fp, "\n\n");

    /* Cell type 1 for every cell. */
    fprintf(fp, "CELL_TYPES %d\n", npoints);
    for(int i = 0; i < npoints; ++i) {
        fprintf(fp, "1\n");
    }
    fprintf(fp, "\n\n");

    /* The per-point attribute count must equal the number of points. */
    fprintf(fp, "POINT_DATA %d\n", npoints);
    fprintf(fp, "SCALARS mass double\n");
    fprintf(fp, "LOOKUP_TABLE default\n");
    for(int i = 0; i < npoints; i++) {
        fprintf(fp, "1.0\n");
    }
    fprintf(fp, "\n\n");

    fclose(fp);
    return 0;
}
|
|
||||||
#endif //USE_SUPER_CLUSTERS
|
|
||||||
|
|
||||||
int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
|
int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
|
||||||
char timestep_filename[128];
|
char timestep_filename[128];
|
||||||
snprintf(timestep_filename, sizeof timestep_filename, "%s_local_%d.vtk", filename, timestep);
|
snprintf(timestep_filename, sizeof timestep_filename, "%s_local_%d.vtk", filename, timestep);
|
||||||
|
@@ -7,6 +7,7 @@ ANSI_CFLAGS += -pedantic
|
|||||||
ANSI_CFLAGS += -Wextra
|
ANSI_CFLAGS += -Wextra
|
||||||
|
|
||||||
CFLAGS = -Ofast -march=native $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
|
CFLAGS = -Ofast -march=native $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
|
||||||
|
#CFLAGS = -Ofast -march=core-avx2 $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
|
||||||
#CFLAGS = -O3 -march=cascadelake $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
|
#CFLAGS = -O3 -march=cascadelake $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
|
||||||
#CFLAGS = -Ofast $(ANSI_CFLAGS) -g #-Xpreprocessor -fopenmp -g
|
#CFLAGS = -Ofast $(ANSI_CFLAGS) -g #-Xpreprocessor -fopenmp -g
|
||||||
ASFLAGS = -masm=intel
|
ASFLAGS = -masm=intel
|
||||||
|
@@ -6,13 +6,29 @@ ANSI_CFLAGS += -std=c99
|
|||||||
ANSI_CFLAGS += -pedantic
|
ANSI_CFLAGS += -pedantic
|
||||||
ANSI_CFLAGS += -Wextra
|
ANSI_CFLAGS += -Wextra
|
||||||
|
|
||||||
#CFLAGS = -O0 -g -std=c99 -fargument-noalias
|
ifeq ($(ISA),AVX512)
|
||||||
|
CFLAGS = -Ofast -mavx512f -mavx512vl -mavx512bw -mavx512dq -mavx512cd -ffast-math -funroll-loops # -fopenmp
|
||||||
#CFLAGS = -O3 -march=cascadelake -ffast-math -funroll-loops # -fopenmp
|
#CFLAGS = -O3 -march=cascadelake -ffast-math -funroll-loops # -fopenmp
|
||||||
CFLAGS = -Ofast -march=native -ffast-math -funroll-loops # -fopenmp
|
endif
|
||||||
|
|
||||||
|
ifeq ($(ISA),AVX2)
|
||||||
#CFLAGS = -Ofast -march=native -mavx2 -ffast-math -funroll-loops # -fopenmp
|
#CFLAGS = -Ofast -march=native -mavx2 -ffast-math -funroll-loops # -fopenmp
|
||||||
|
#CFLAGS = -O3 -march=znver1 -ffast-math -funroll-loops # -fopenmp
|
||||||
|
#CFLAGS = -Ofast -mavx2 -ffast-math -funroll-loops # -fopenmp
|
||||||
|
CFLAGS = -Ofast -mavx2 -mfma -ffast-math -funroll-loops # -fopenmp
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(ISA),AVX)
|
||||||
|
CFLAGS = -Ofast -mavx -ffast-math -funroll-loops # -fopenmp
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(ISA),SSE)
|
||||||
|
CFLAGS = -Ofast -msse4.2 -ffast-math -funroll-loops # -fopenmp
|
||||||
|
endif
|
||||||
|
|
||||||
|
#CFLAGS = -O0 -g -std=c99 -fargument-noalias
|
||||||
#CFLAGS = -Ofast -march=native -ffast-math -funroll-loops # -fopenmp
|
#CFLAGS = -Ofast -march=native -ffast-math -funroll-loops # -fopenmp
|
||||||
#CFLAGS = -O3 -march=native -ffast-math -funroll-loops # -fopenmp
|
#CFLAGS = -O3 -march=native -ffast-math -funroll-loops # -fopenmp
|
||||||
#CFLAGS = -O3 -march=znver1 -ffast-math -funroll-loops # -fopenmp
|
|
||||||
ASFLAGS = #-masm=intel
|
ASFLAGS = #-masm=intel
|
||||||
LFLAGS =
|
LFLAGS =
|
||||||
DEFINES = -D_GNU_SOURCE -DNO_ZMM_INTRIN
|
DEFINES = -D_GNU_SOURCE -DNO_ZMM_INTRIN
|
||||||
|
@@ -3,11 +3,25 @@ LINKER = $(CC)
|
|||||||
|
|
||||||
OPENMP = #-qopenmp
|
OPENMP = #-qopenmp
|
||||||
PROFILE = #-profile-functions -g -pg
|
PROFILE = #-profile-functions -g -pg
|
||||||
|
|
||||||
|
ifeq ($(ISA),AVX512)
|
||||||
OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
|
OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
|
||||||
#OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
|
endif
|
||||||
#OPTS = -Ofast -xAVX $(PROFILE)
|
|
||||||
|
ifeq ($(ISA),AVX2)
|
||||||
|
OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
|
||||||
#OPTS = -Ofast -xAVX2 $(PROFILE)
|
#OPTS = -Ofast -xAVX2 $(PROFILE)
|
||||||
#OPTS = -Ofast -xSSE4.2 $(PROFILE)
|
#OPTS = -Ofast -march=core-avx2 $(PROFILE)
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(ISA),AVX)
|
||||||
|
OPTS = -Ofast -xAVX $(PROFILE)
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(ISA),SSE)
|
||||||
|
OPTS = -Ofast -xSSE4.2 $(PROFILE)
|
||||||
|
endif
|
||||||
|
|
||||||
#OPTS = -Ofast -no-vec $(PROFILE)
|
#OPTS = -Ofast -no-vec $(PROFILE)
|
||||||
#OPTS = -Ofast -xHost $(PROFILE)
|
#OPTS = -Ofast -xHost $(PROFILE)
|
||||||
CFLAGS = $(PROFILE) -restrict $(OPENMP) $(OPTS)
|
CFLAGS = $(PROFILE) -restrict $(OPENMP) $(OPTS)
|
||||||
|
@@ -3,13 +3,28 @@ LINKER = $(CC)
|
|||||||
|
|
||||||
OPENMP = #-qopenmp
|
OPENMP = #-qopenmp
|
||||||
PROFILE = #-profile-functions -g -pg
|
PROFILE = #-profile-functions -g -pg
|
||||||
#OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
|
|
||||||
#OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
|
ifeq ($(ISA),AVX512)
|
||||||
#OPTS = -Ofast -xAVX $(PROFILE)
|
OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
|
||||||
#OPTS = -Ofast -xAVX2 $(PROFILE)
|
#OPTS = -Ofast -march=cascadelake -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
|
||||||
#OPTS = -Ofast -xSSE4.2 $(PROFILE)
|
endif
|
||||||
|
|
||||||
|
ifeq ($(ISA),AVX2)
|
||||||
|
OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
|
||||||
|
#OPTS = -Ofast -xHost $(PROFILE)
|
||||||
|
#OPTS = -Ofast -march=core-avx2 $(PROFILE)
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(ISA),AVX)
|
||||||
|
OPTS = -Ofast -xAVX $(PROFILE)
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(ISA),SSE)
|
||||||
|
OPTS = -Ofast -xSSE4.2 $(PROFILE)
|
||||||
|
endif
|
||||||
|
|
||||||
#OPTS = -Ofast -no-vec $(PROFILE)
|
#OPTS = -Ofast -no-vec $(PROFILE)
|
||||||
OPTS = -Ofast -xHost $(PROFILE)
|
#OPTS = -Ofast -xHost $(PROFILE)
|
||||||
CFLAGS = $(PROFILE) $(OPENMP) $(OPTS)
|
CFLAGS = $(PROFILE) $(OPENMP) $(OPTS)
|
||||||
ASFLAGS = #-masm=intel
|
ASFLAGS = #-masm=intel
|
||||||
LFLAGS = $(PROFILE) $(OPTS) $(OPENMP)
|
LFLAGS = $(PROFILE) $(OPTS) $(OPENMP)
|
||||||
|
@@ -9,13 +9,15 @@ else ifeq ($(strip $(ISA)), AVX_FMA)
|
|||||||
__ISA_AVX_FMA__=true
|
__ISA_AVX_FMA__=true
|
||||||
__SIMD_WIDTH_DBL__=4
|
__SIMD_WIDTH_DBL__=4
|
||||||
else ifeq ($(strip $(ISA)), AVX2)
|
else ifeq ($(strip $(ISA)), AVX2)
|
||||||
__ISA_AVX2__=true
|
|
||||||
#__SIMD_KERNEL__=true
|
#__SIMD_KERNEL__=true
|
||||||
|
__ISA_AVX2__=true
|
||||||
__SIMD_WIDTH_DBL__=4
|
__SIMD_WIDTH_DBL__=4
|
||||||
else ifeq ($(strip $(ISA)), AVX512)
|
else ifeq ($(strip $(ISA)), AVX512)
|
||||||
__ISA_AVX512__=true
|
__ISA_AVX512__=true
|
||||||
__SIMD_KERNEL__=true
|
|
||||||
__SIMD_WIDTH_DBL__=8
|
__SIMD_WIDTH_DBL__=8
|
||||||
|
ifeq ($(strip $(DATA_TYPE)), DP)
|
||||||
|
__SIMD_KERNEL__=true
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
# SIMD width is specified in double-precision, hence it may
|
# SIMD width is specified in double-precision, hence it may
|
||||||
|
@@ -8,8 +8,7 @@ ANSI_CFLAGS += -Wextra
|
|||||||
|
|
||||||
#
|
#
|
||||||
# A100 + Native
|
# A100 + Native
|
||||||
#CFLAGS = -O3 -arch=sm_80 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
|
CFLAGS = -O3 -arch=sm_80 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
|
||||||
CFLAGS = -O3 -arch=compute_61 -code=sm_61,sm_80,sm_86 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
|
|
||||||
# A40 + Native
|
# A40 + Native
|
||||||
#CFLAGS = -O3 -arch=sm_86 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
|
#CFLAGS = -O3 -arch=sm_86 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
|
||||||
# Cascade Lake
|
# Cascade Lake
|
||||||
|
@@ -31,8 +31,12 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
|
|||||||
int nrho = eam->nrho; int nrho_tot = eam->nrho_tot;
|
int nrho = eam->nrho; int nrho_tot = eam->nrho_tot;
|
||||||
double S = getTimeStamp();
|
double S = getTimeStamp();
|
||||||
|
|
||||||
|
|
||||||
|
#pragma omp parallel
|
||||||
|
{
|
||||||
LIKWID_MARKER_START("force_eam_fp");
|
LIKWID_MARKER_START("force_eam_fp");
|
||||||
#pragma omp parallel for
|
|
||||||
|
#pragma omp for
|
||||||
for(int i = 0; i < Nlocal; i++) {
|
for(int i = 0; i < Nlocal; i++) {
|
||||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||||
int numneighs = neighbor->numneigh[i];
|
int numneighs = neighbor->numneigh[i];
|
||||||
@@ -95,13 +99,19 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
|
|||||||
}
|
}
|
||||||
|
|
||||||
LIKWID_MARKER_STOP("force_eam_fp");
|
LIKWID_MARKER_STOP("force_eam_fp");
|
||||||
|
}
|
||||||
|
|
||||||
// We still need to update fp for PBC atoms
|
// We still need to update fp for PBC atoms
|
||||||
for(int i = 0; i < atom->Nghost; i++) {
|
for(int i = 0; i < atom->Nghost; i++) {
|
||||||
fp[Nlocal + i] = fp[atom->border_map[i]];
|
fp[Nlocal + i] = fp[atom->border_map[i]];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#pragma omp parallel
|
||||||
|
{
|
||||||
LIKWID_MARKER_START("force_eam");
|
LIKWID_MARKER_START("force_eam");
|
||||||
|
|
||||||
|
#pragma omp for
|
||||||
for(int i = 0; i < Nlocal; i++) {
|
for(int i = 0; i < Nlocal; i++) {
|
||||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||||
int numneighs = neighbor->numneigh[i];
|
int numneighs = neighbor->numneigh[i];
|
||||||
@@ -192,6 +202,8 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
|
|||||||
}
|
}
|
||||||
|
|
||||||
LIKWID_MARKER_STOP("force_eam");
|
LIKWID_MARKER_STOP("force_eam");
|
||||||
|
}
|
||||||
|
|
||||||
double E = getTimeStamp();
|
double E = getTimeStamp();
|
||||||
return E-S;
|
return E-S;
|
||||||
}
|
}
|
||||||
|
@@ -26,17 +26,22 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
|
|||||||
MD_FLOAT sigma6 = param->sigma6;
|
MD_FLOAT sigma6 = param->sigma6;
|
||||||
MD_FLOAT epsilon = param->epsilon;
|
MD_FLOAT epsilon = param->epsilon;
|
||||||
#endif
|
#endif
|
||||||
|
const MD_FLOAT num1 = 1.0;
|
||||||
|
const MD_FLOAT num48 = 48.0;
|
||||||
|
const MD_FLOAT num05 = 0.5;
|
||||||
|
|
||||||
for(int i = 0; i < Nlocal; i++) {
|
for(int i = 0; i < Nlocal; i++) {
|
||||||
atom_fx(i) = 0.0;
|
atom_fx(i) = 0.0;
|
||||||
atom_fy(i) = 0.0;
|
atom_fy(i) = 0.0;
|
||||||
atom_fz(i) = 0.0;
|
atom_fz(i) = 0.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
double S = getTimeStamp();
|
double S = getTimeStamp();
|
||||||
|
|
||||||
|
#pragma omp parallel
|
||||||
|
{
|
||||||
LIKWID_MARKER_START("force");
|
LIKWID_MARKER_START("force");
|
||||||
|
|
||||||
#pragma omp parallel for
|
#pragma omp for
|
||||||
for(int i = 0; i < Nlocal; i++) {
|
for(int i = 0; i < Nlocal; i++) {
|
||||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||||
int numneighs = neighbor->numneigh[i];
|
int numneighs = neighbor->numneigh[i];
|
||||||
@@ -67,9 +72,9 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
if(rsq < cutforcesq) {
|
if(rsq < cutforcesq) {
|
||||||
MD_FLOAT sr2 = 1.0 / rsq;
|
MD_FLOAT sr2 = num1 / rsq;
|
||||||
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
|
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
|
||||||
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
|
MD_FLOAT force = num48 * sr6 * (sr6 - num05) * sr2 * epsilon;
|
||||||
fix += delx * force;
|
fix += delx * force;
|
||||||
fiy += dely * force;
|
fiy += dely * force;
|
||||||
fiz += delz * force;
|
fiz += delz * force;
|
||||||
@@ -90,6 +95,8 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
|
|||||||
}
|
}
|
||||||
|
|
||||||
LIKWID_MARKER_STOP("force");
|
LIKWID_MARKER_STOP("force");
|
||||||
|
}
|
||||||
|
|
||||||
double E = getTimeStamp();
|
double E = getTimeStamp();
|
||||||
return E-S;
|
return E-S;
|
||||||
}
|
}
|
||||||
@@ -102,6 +109,9 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
|
|||||||
MD_FLOAT sigma6 = param->sigma6;
|
MD_FLOAT sigma6 = param->sigma6;
|
||||||
MD_FLOAT epsilon = param->epsilon;
|
MD_FLOAT epsilon = param->epsilon;
|
||||||
#endif
|
#endif
|
||||||
|
const MD_FLOAT num1 = 1.0;
|
||||||
|
const MD_FLOAT num48 = 48.0;
|
||||||
|
const MD_FLOAT num05 = 0.5;
|
||||||
|
|
||||||
for(int i = 0; i < Nlocal; i++) {
|
for(int i = 0; i < Nlocal; i++) {
|
||||||
atom_fx(i) = 0.0;
|
atom_fx(i) = 0.0;
|
||||||
@@ -110,8 +120,12 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
|
|||||||
}
|
}
|
||||||
|
|
||||||
double S = getTimeStamp();
|
double S = getTimeStamp();
|
||||||
|
|
||||||
|
#pragma omp parallel
|
||||||
|
{
|
||||||
LIKWID_MARKER_START("forceLJ-halfneigh");
|
LIKWID_MARKER_START("forceLJ-halfneigh");
|
||||||
|
|
||||||
|
#pragma omp for
|
||||||
for(int i = 0; i < Nlocal; i++) {
|
for(int i = 0; i < Nlocal; i++) {
|
||||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||||
int numneighs = neighbor->numneigh[i];
|
int numneighs = neighbor->numneigh[i];
|
||||||
@@ -146,9 +160,9 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
if(rsq < cutforcesq) {
|
if(rsq < cutforcesq) {
|
||||||
MD_FLOAT sr2 = 1.0 / rsq;
|
MD_FLOAT sr2 = num1 / rsq;
|
||||||
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
|
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
|
||||||
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
|
MD_FLOAT force = num48 * sr6 * (sr6 - num05) * sr2 * epsilon;
|
||||||
fix += delx * force;
|
fix += delx * force;
|
||||||
fiy += dely * force;
|
fiy += dely * force;
|
||||||
fiz += delz * force;
|
fiz += delz * force;
|
||||||
@@ -171,6 +185,8 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
|
|||||||
}
|
}
|
||||||
|
|
||||||
LIKWID_MARKER_STOP("forceLJ-halfneigh");
|
LIKWID_MARKER_STOP("forceLJ-halfneigh");
|
||||||
|
}
|
||||||
|
|
||||||
double E = getTimeStamp();
|
double E = getTimeStamp();
|
||||||
return E-S;
|
return E-S;
|
||||||
}
|
}
|
||||||
@@ -189,7 +205,6 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig
|
|||||||
}
|
}
|
||||||
|
|
||||||
double S = getTimeStamp();
|
double S = getTimeStamp();
|
||||||
LIKWID_MARKER_START("force");
|
|
||||||
|
|
||||||
#ifndef __SIMD_KERNEL__
|
#ifndef __SIMD_KERNEL__
|
||||||
fprintf(stderr, "Error: SIMD kernel not implemented for specified instruction set!");
|
fprintf(stderr, "Error: SIMD kernel not implemented for specified instruction set!");
|
||||||
@@ -201,7 +216,12 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig
|
|||||||
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
||||||
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
||||||
|
|
||||||
#pragma omp parallel for
|
|
||||||
|
#pragma omp parallel
|
||||||
|
{
|
||||||
|
LIKWID_MARKER_START("force");
|
||||||
|
|
||||||
|
#pragma omp for
|
||||||
for(int i = 0; i < Nlocal; i++) {
|
for(int i = 0; i < Nlocal; i++) {
|
||||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||||
int numneighs = neighbor->numneigh[i];
|
int numneighs = neighbor->numneigh[i];
|
||||||
@@ -242,9 +262,11 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig
|
|||||||
atom_fy(i) += simd_h_reduce_sum(fiy);
|
atom_fy(i) += simd_h_reduce_sum(fiy);
|
||||||
atom_fz(i) += simd_h_reduce_sum(fiz);
|
atom_fz(i) += simd_h_reduce_sum(fiz);
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
LIKWID_MARKER_STOP("force");
|
LIKWID_MARKER_STOP("force");
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
double E = getTimeStamp();
|
double E = getTimeStamp();
|
||||||
return E-S;
|
return E-S;
|
||||||
}
|
}
|
||||||
|
88
likwid-outputs/csx-lammps-dp-mem_dp-stub.out
Normal file
88
likwid-outputs/csx-lammps-dp-mem_dp-stub.out
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
--------------------------------------------------------------------------------
|
||||||
|
CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
|
||||||
|
CPU type: Intel Cascadelake SP processor
|
||||||
|
CPU clock: 2.49 GHz
|
||||||
|
--------------------------------------------------------------------------------
|
||||||
|
Initializing parameters...
|
||||||
|
Initializing atoms...
|
||||||
|
Creating atoms...
|
||||||
|
Pattern: seq
|
||||||
|
Number of timesteps: 200
|
||||||
|
Number of atoms: 256
|
||||||
|
Number of neighbors per atom: 1024
|
||||||
|
Number of times to replicate neighbor lists: 1
|
||||||
|
Estimated total data volume (kB): 1062.9120
|
||||||
|
Estimated atom data volume (kB): 6.1440
|
||||||
|
Estimated neighborlist data volume (kB): 1050.6240
|
||||||
|
Initializing neighbor lists...
|
||||||
|
Creating neighbor lists...
|
||||||
|
Computing forces...
|
||||||
|
Total time: 0.2735, Mega atom updates/s: 0.1872
|
||||||
|
Cycles per atom: 10682.8568, Cycles per neighbor: 10.4325
|
||||||
|
Statistics:
|
||||||
|
Vector width: 8, Processor frequency: 2.0000 GHz
|
||||||
|
Average neighbors per atom: 1018.9055
|
||||||
|
Average SIMD iterations per atom: 127.3632
|
||||||
|
Total number of computed pair interactions: 52428800
|
||||||
|
Total number of SIMD iterations: 6553600
|
||||||
|
Useful read data volume for force computation: 1.47GB
|
||||||
|
Cycles/SIMD iteration: 83.4598
|
||||||
|
--------------------------------------------------------------------------------
|
||||||
|
Region force, Group 1: MEM_DP
|
||||||
|
+-------------------+------------+
|
||||||
|
| Region Info | HWThread 0 |
|
||||||
|
+-------------------+------------+
|
||||||
|
| RDTSC Runtime [s] | 0.110776 |
|
||||||
|
| call count | 200 |
|
||||||
|
+-------------------+------------+
|
||||||
|
|
||||||
|
+------------------------------------------+---------+------------+
|
||||||
|
| Event | Counter | HWThread 0 |
|
||||||
|
+------------------------------------------+---------+------------+
|
||||||
|
| INSTR_RETIRED_ANY | FIXC0 | 267036300 |
|
||||||
|
| CPU_CLK_UNHALTED_CORE | FIXC1 | 219034500 |
|
||||||
|
| CPU_CLK_UNHALTED_REF | FIXC2 | 273793400 |
|
||||||
|
| PWR_PKG_ENERGY | PWR0 | 10.9296 |
|
||||||
|
| PWR_DRAM_ENERGY | PWR3 | 0 |
|
||||||
|
| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | PMC0 | 0 |
|
||||||
|
| FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | PMC1 | 159400 |
|
||||||
|
| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | PMC2 | 0 |
|
||||||
|
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 197068800 |
|
||||||
|
| CAS_COUNT_RD | MBOX0C0 | 8643 |
|
||||||
|
| CAS_COUNT_WR | MBOX0C1 | 1367 |
|
||||||
|
| CAS_COUNT_RD | MBOX1C0 | 9124 |
|
||||||
|
| CAS_COUNT_WR | MBOX1C1 | 1354 |
|
||||||
|
| CAS_COUNT_RD | MBOX2C0 | 9138 |
|
||||||
|
| CAS_COUNT_WR | MBOX2C1 | 1356 |
|
||||||
|
| CAS_COUNT_RD | MBOX3C0 | 5586 |
|
||||||
|
| CAS_COUNT_WR | MBOX3C1 | 1297 |
|
||||||
|
| CAS_COUNT_RD | MBOX4C0 | 5328 |
|
||||||
|
| CAS_COUNT_WR | MBOX4C1 | 1269 |
|
||||||
|
| CAS_COUNT_RD | MBOX5C0 | 5280 |
|
||||||
|
| CAS_COUNT_WR | MBOX5C1 | 1295 |
|
||||||
|
+------------------------------------------+---------+------------+
|
||||||
|
|
||||||
|
+-----------------------------------+------------+
|
||||||
|
| Metric | HWThread 0 |
|
||||||
|
+-----------------------------------+------------+
|
||||||
|
| Runtime (RDTSC) [s] | 0.1108 |
|
||||||
|
| Runtime unhalted [s] | 0.0878 |
|
||||||
|
| Clock [MHz] | 1995.2564 |
|
||||||
|
| CPI | 0.8202 |
|
||||||
|
| Energy [J] | 10.9296 |
|
||||||
|
| Power [W] | 98.6643 |
|
||||||
|
| Energy DRAM [J] | 0 |
|
||||||
|
| Power DRAM [W] | 0 |
|
||||||
|
| DP [MFLOP/s] | 14233.3287 |
|
||||||
|
| AVX DP [MFLOP/s] | 14231.8898 |
|
||||||
|
| Packed [MUOPS/s] | 1778.9862 |
|
||||||
|
| Scalar [MUOPS/s] | 1.4389 |
|
||||||
|
| Memory read bandwidth [MBytes/s] | 24.9001 |
|
||||||
|
| Memory read data volume [GBytes] | 0.0028 |
|
||||||
|
| Memory write bandwidth [MBytes/s] | 4.5861 |
|
||||||
|
| Memory write data volume [GBytes] | 0.0005 |
|
||||||
|
| Memory bandwidth [MBytes/s] | 29.4863 |
|
||||||
|
| Memory data volume [GBytes] | 0.0033 |
|
||||||
|
| Operational intensity | 482.7104 |
|
||||||
|
+-----------------------------------+------------+
|
||||||
|
|
168
likwid-outputs/csx-lammps-dp-mem_dp.out
Normal file
168
likwid-outputs/csx-lammps-dp-mem_dp.out
Normal file
@@ -0,0 +1,168 @@
|
|||||||
|
--------------------------------------------------------------------------------
|
||||||
|
CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
|
||||||
|
CPU type: Intel Cascadelake SP processor
|
||||||
|
CPU clock: 2.49 GHz
|
||||||
|
--------------------------------------------------------------------------------
|
||||||
|
Parameters:
|
||||||
|
Force field: lj
|
||||||
|
Kernel: plain-C
|
||||||
|
Data layout: AoS
|
||||||
|
Floating-point precision: double
|
||||||
|
Unit cells (nx, ny, nz): 32, 32, 32
|
||||||
|
Domain box sizes (x, y, z): 5.374708e+01, 5.374708e+01, 5.374708e+01
|
||||||
|
Periodic (x, y, z): 1, 1, 1
|
||||||
|
Lattice size: 1.679596e+00
|
||||||
|
Epsilon: 1.000000e+00
|
||||||
|
Sigma: 1.000000e+00
|
||||||
|
Spring constant: 1.000000e+00
|
||||||
|
Damping constant: 1.000000e+00
|
||||||
|
Temperature: 1.440000e+00
|
||||||
|
RHO: 8.442000e-01
|
||||||
|
Mass: 1.000000e+00
|
||||||
|
Number of types: 4
|
||||||
|
Number of timesteps: 200
|
||||||
|
Report stats every (timesteps): 100
|
||||||
|
Reneighbor every (timesteps): 20
|
||||||
|
Prune every (timesteps): 1000
|
||||||
|
Output positions every (timesteps): 20
|
||||||
|
Output velocities every (timesteps): 5
|
||||||
|
Delta time (dt): 5.000000e-03
|
||||||
|
Cutoff radius: 2.500000e+00
|
||||||
|
Skin: 3.000000e-01
|
||||||
|
Half neighbor lists: 0
|
||||||
|
Processor frequency (GHz): 2.0000
|
||||||
|
----------------------------------------------------------------------------
|
||||||
|
step temp pressure
|
||||||
|
0 1.440000e+00 1.215639e+00
|
||||||
|
100 8.200895e-01 6.923143e-01
|
||||||
|
200 7.961495e-01 6.721043e-01
|
||||||
|
----------------------------------------------------------------------------
|
||||||
|
System: 131072 atoms 47265 ghost atoms, Steps: 200
|
||||||
|
TOTAL 11.50s FORCE 5.28s NEIGH 5.91s REST 0.31s
|
||||||
|
----------------------------------------------------------------------------
|
||||||
|
Performance: 2.28 million atom updates per second
|
||||||
|
Statistics:
|
||||||
|
Vector width: 8, Processor frequency: 2.0000 GHz
|
||||||
|
Average neighbors per atom: 76.0352
|
||||||
|
Average SIMD iterations per atom: 9.9181
|
||||||
|
Total number of computed pair interactions: 2003182862
|
||||||
|
Total number of SIMD iterations: 261297661
|
||||||
|
Useful read data volume for force computation: 57.46GB
|
||||||
|
Cycles/SIMD iteration: 40.4432
|
||||||
|
--------------------------------------------------------------------------------
|
||||||
|
Region force, Group 1: MEM_DP
|
||||||
|
+-------------------+------------+
|
||||||
|
| Region Info | HWThread 0 |
|
||||||
|
+-------------------+------------+
|
||||||
|
| RDTSC Runtime [s] | 5.115807 |
|
||||||
|
| call count | 201 |
|
||||||
|
+-------------------+------------+
|
||||||
|
|
||||||
|
+------------------------------------------+---------+-------------+
|
||||||
|
| Event | Counter | HWThread 0 |
|
||||||
|
+------------------------------------------+---------+-------------+
|
||||||
|
| INSTR_RETIRED_ANY | FIXC0 | 12592470000 |
|
||||||
|
| CPU_CLK_UNHALTED_CORE | FIXC1 | 10196910000 |
|
||||||
|
| CPU_CLK_UNHALTED_REF | FIXC2 | 12746120000 |
|
||||||
|
| PWR_PKG_ENERGY | PWR0 | 307.9429 |
|
||||||
|
| PWR_DRAM_ENERGY | PWR3 | 0 |
|
||||||
|
| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | PMC0 | 0 |
|
||||||
|
| FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | PMC1 | 79042240 |
|
||||||
|
| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | PMC2 | 0 |
|
||||||
|
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 8076039000 |
|
||||||
|
| CAS_COUNT_RD | MBOX0C0 | 22734550 |
|
||||||
|
| CAS_COUNT_WR | MBOX0C1 | 1147714 |
|
||||||
|
| CAS_COUNT_RD | MBOX1C0 | 22755180 |
|
||||||
|
| CAS_COUNT_WR | MBOX1C1 | 1144415 |
|
||||||
|
| CAS_COUNT_RD | MBOX2C0 | 22762780 |
|
||||||
|
| CAS_COUNT_WR | MBOX2C1 | 1129051 |
|
||||||
|
| CAS_COUNT_RD | MBOX3C0 | 22905660 |
|
||||||
|
| CAS_COUNT_WR | MBOX3C1 | 1143324 |
|
||||||
|
| CAS_COUNT_RD | MBOX4C0 | 22914860 |
|
||||||
|
| CAS_COUNT_WR | MBOX4C1 | 1169116 |
|
||||||
|
| CAS_COUNT_RD | MBOX5C0 | 22890220 |
|
||||||
|
| CAS_COUNT_WR | MBOX5C1 | 1180739 |
|
||||||
|
+------------------------------------------+---------+-------------+
|
||||||
|
|
||||||
|
+-----------------------------------+------------+
|
||||||
|
| Metric | HWThread 0 |
|
||||||
|
+-----------------------------------+------------+
|
||||||
|
| Runtime (RDTSC) [s] | 5.1158 |
|
||||||
|
| Runtime unhalted [s] | 4.0885 |
|
||||||
|
| Clock [MHz] | 1995.2508 |
|
||||||
|
| CPI | 0.8098 |
|
||||||
|
| Energy [J] | 307.9429 |
|
||||||
|
| Power [W] | 60.1944 |
|
||||||
|
| Energy DRAM [J] | 0 |
|
||||||
|
| Power DRAM [W] | 0 |
|
||||||
|
| DP [MFLOP/s] | 12644.6041 |
|
||||||
|
| AVX DP [MFLOP/s] | 12629.1535 |
|
||||||
|
| Packed [MUOPS/s] | 1578.6442 |
|
||||||
|
| Scalar [MUOPS/s] | 15.4506 |
|
||||||
|
| Memory read bandwidth [MBytes/s] | 1713.4438 |
|
||||||
|
| Memory read data volume [GBytes] | 8.7656 |
|
||||||
|
| Memory write bandwidth [MBytes/s] | 86.5003 |
|
||||||
|
| Memory write data volume [GBytes] | 0.4425 |
|
||||||
|
| Memory bandwidth [MBytes/s] | 1799.9442 |
|
||||||
|
| Memory data volume [GBytes] | 9.2082 |
|
||||||
|
| Operational intensity | 7.0250 |
|
||||||
|
+-----------------------------------+------------+
|
||||||
|
|
||||||
|
Region reneighbour, Group 1: MEM_DP
|
||||||
|
+-------------------+------------+
|
||||||
|
| Region Info | HWThread 0 |
|
||||||
|
+-------------------+------------+
|
||||||
|
| RDTSC Runtime [s] | 5.897385 |
|
||||||
|
| call count | 10 |
|
||||||
|
+-------------------+------------+
|
||||||
|
|
||||||
|
+------------------------------------------+---------+-------------+
|
||||||
|
| Event | Counter | HWThread 0 |
|
||||||
|
+------------------------------------------+---------+-------------+
|
||||||
|
| INSTR_RETIRED_ANY | FIXC0 | 18212540000 |
|
||||||
|
| CPU_CLK_UNHALTED_CORE | FIXC1 | 11728500000 |
|
||||||
|
| CPU_CLK_UNHALTED_REF | FIXC2 | 14660630000 |
|
||||||
|
| PWR_PKG_ENERGY | PWR0 | 338.9000 |
|
||||||
|
| PWR_DRAM_ENERGY | PWR3 | 0 |
|
||||||
|
| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | PMC0 | 0 |
|
||||||
|
| FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | PMC1 | 6240402000 |
|
||||||
|
| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | PMC2 | 0 |
|
||||||
|
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 983040 |
|
||||||
|
| CAS_COUNT_RD | MBOX0C0 | 2086787 |
|
||||||
|
| CAS_COUNT_WR | MBOX0C1 | 1115626 |
|
||||||
|
| CAS_COUNT_RD | MBOX1C0 | 2089964 |
|
||||||
|
| CAS_COUNT_WR | MBOX1C1 | 1117021 |
|
||||||
|
| CAS_COUNT_RD | MBOX2C0 | 2103832 |
|
||||||
|
| CAS_COUNT_WR | MBOX2C1 | 1117965 |
|
||||||
|
| CAS_COUNT_RD | MBOX3C0 | 2086930 |
|
||||||
|
| CAS_COUNT_WR | MBOX3C1 | 1102471 |
|
||||||
|
| CAS_COUNT_RD | MBOX4C0 | 2094688 |
|
||||||
|
| CAS_COUNT_WR | MBOX4C1 | 1103018 |
|
||||||
|
| CAS_COUNT_RD | MBOX5C0 | 2097438 |
|
||||||
|
| CAS_COUNT_WR | MBOX5C1 | 1102525 |
|
||||||
|
+------------------------------------------+---------+-------------+
|
||||||
|
|
||||||
|
+-----------------------------------+------------+
|
||||||
|
| Metric | HWThread 0 |
|
||||||
|
+-----------------------------------+------------+
|
||||||
|
| Runtime (RDTSC) [s] | 5.8974 |
|
||||||
|
| Runtime unhalted [s] | 4.7026 |
|
||||||
|
| Clock [MHz] | 1995.2473 |
|
||||||
|
| CPI | 0.6440 |
|
||||||
|
| Energy [J] | 338.9000 |
|
||||||
|
| Power [W] | 57.4661 |
|
||||||
|
| Energy DRAM [J] | 0 |
|
||||||
|
| Power DRAM [W] | 0 |
|
||||||
|
| DP [MFLOP/s] | 1059.4978 |
|
||||||
|
| AVX DP [MFLOP/s] | 1.3335 |
|
||||||
|
| Packed [MUOPS/s] | 0.1667 |
|
||||||
|
| Scalar [MUOPS/s] | 1058.1643 |
|
||||||
|
| Memory read bandwidth [MBytes/s] | 136.3006 |
|
||||||
|
| Memory read data volume [GBytes] | 0.8038 |
|
||||||
|
| Memory write bandwidth [MBytes/s] | 72.2612 |
|
||||||
|
| Memory write data volume [GBytes] | 0.4262 |
|
||||||
|
| Memory bandwidth [MBytes/s] | 208.5618 |
|
||||||
|
| Memory data volume [GBytes] | 1.2300 |
|
||||||
|
| Operational intensity | 5.0800 |
|
||||||
|
+-----------------------------------+------------+
|
||||||
|
|
88
likwid-outputs/csx-lammps-sp-mem_sp-stub.out
Normal file
88
likwid-outputs/csx-lammps-sp-mem_sp-stub.out
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
--------------------------------------------------------------------------------
|
||||||
|
CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
|
||||||
|
CPU type: Intel Cascadelake SP processor
|
||||||
|
CPU clock: 2.49 GHz
|
||||||
|
--------------------------------------------------------------------------------
|
||||||
|
Initializing parameters...
|
||||||
|
Initializing atoms...
|
||||||
|
Creating atoms...
|
||||||
|
Pattern: seq
|
||||||
|
Number of timesteps: 200
|
||||||
|
Number of atoms: 256
|
||||||
|
Number of neighbors per atom: 1024
|
||||||
|
Number of times to replicate neighbor lists: 1
|
||||||
|
Estimated total data volume (kB): 1056.7680
|
||||||
|
Estimated atom data volume (kB): 3.0720
|
||||||
|
Estimated neighborlist data volume (kB): 1050.6240
|
||||||
|
Initializing neighbor lists...
|
||||||
|
Creating neighbor lists...
|
||||||
|
Computing forces...
|
||||||
|
Total time: 0.2466, Mega atom updates/s: 0.2076
|
||||||
|
Cycles per atom: 9631.9934, Cycles per neighbor: 9.4062
|
||||||
|
Statistics:
|
||||||
|
Vector width: 16, Processor frequency: 2.0000 GHz
|
||||||
|
Average neighbors per atom: 1018.9055
|
||||||
|
Average SIMD iterations per atom: 63.6816
|
||||||
|
Total number of computed pair interactions: 52428800
|
||||||
|
Total number of SIMD iterations: 3276800
|
||||||
|
Useful read data volume for force computation: 0.84GB
|
||||||
|
Cycles/SIMD iteration: 150.4999
|
||||||
|
--------------------------------------------------------------------------------
|
||||||
|
Region force, Group 1: MEM_SP
|
||||||
|
+-------------------+------------+
|
||||||
|
| Region Info | HWThread 0 |
|
||||||
|
+-------------------+------------+
|
||||||
|
| RDTSC Runtime [s] | 0.085843 |
|
||||||
|
| call count | 200 |
|
||||||
|
+-------------------+------------+
|
||||||
|
|
||||||
|
+------------------------------------------+---------+------------+
|
||||||
|
| Event | Counter | HWThread 0 |
|
||||||
|
+------------------------------------------+---------+------------+
|
||||||
|
| INSTR_RETIRED_ANY | FIXC0 | 129769100 |
|
||||||
|
| CPU_CLK_UNHALTED_CORE | FIXC1 | 172300100 |
|
||||||
|
| CPU_CLK_UNHALTED_REF | FIXC2 | 215371300 |
|
||||||
|
| PWR_PKG_ENERGY | PWR0 | 9.2849 |
|
||||||
|
| PWR_DRAM_ENERGY | PWR3 | 0 |
|
||||||
|
| FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE | PMC0 | 0 |
|
||||||
|
| FP_ARITH_INST_RETIRED_SCALAR_SINGLE | PMC1 | 154000 |
|
||||||
|
| FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | PMC2 | 0 |
|
||||||
|
| FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | PMC3 | 89088000 |
|
||||||
|
| CAS_COUNT_RD | MBOX0C0 | 8354 |
|
||||||
|
| CAS_COUNT_WR | MBOX0C1 | 1126 |
|
||||||
|
| CAS_COUNT_RD | MBOX1C0 | 7863 |
|
||||||
|
| CAS_COUNT_WR | MBOX1C1 | 1105 |
|
||||||
|
| CAS_COUNT_RD | MBOX2C0 | 7990 |
|
||||||
|
| CAS_COUNT_WR | MBOX2C1 | 1113 |
|
||||||
|
| CAS_COUNT_RD | MBOX3C0 | 4775 |
|
||||||
|
| CAS_COUNT_WR | MBOX3C1 | 1112 |
|
||||||
|
| CAS_COUNT_RD | MBOX4C0 | 4201 |
|
||||||
|
| CAS_COUNT_WR | MBOX4C1 | 1127 |
|
||||||
|
| CAS_COUNT_RD | MBOX5C0 | 4035 |
|
||||||
|
| CAS_COUNT_WR | MBOX5C1 | 1120 |
|
||||||
|
+------------------------------------------+---------+------------+
|
||||||
|
|
||||||
|
+-----------------------------------+------------+
|
||||||
|
| Metric | HWThread 0 |
|
||||||
|
+-----------------------------------+------------+
|
||||||
|
| Runtime (RDTSC) [s] | 0.0858 |
|
||||||
|
| Runtime unhalted [s] | 0.0691 |
|
||||||
|
| Clock [MHz] | 1995.2787 |
|
||||||
|
| CPI | 1.3277 |
|
||||||
|
| Energy [J] | 9.2849 |
|
||||||
|
| Power [W] | 108.1610 |
|
||||||
|
| Energy DRAM [J] | 0 |
|
||||||
|
| Power DRAM [W] | 0 |
|
||||||
|
| SP [MFLOP/s] | 16606.5397 |
|
||||||
|
| AVX SP [MFLOP/s] | 16604.7458 |
|
||||||
|
| Packed [MUOPS/s] | 1037.7966 |
|
||||||
|
| Scalar [MUOPS/s] | 1.7940 |
|
||||||
|
| Memory read bandwidth [MBytes/s] | 27.7476 |
|
||||||
|
| Memory read data volume [GBytes] | 0.0024 |
|
||||||
|
| Memory write bandwidth [MBytes/s] | 4.9974 |
|
||||||
|
| Memory write data volume [GBytes] | 0.0004 |
|
||||||
|
| Memory bandwidth [MBytes/s] | 32.7450 |
|
||||||
|
| Memory data volume [GBytes] | 0.0028 |
|
||||||
|
| Operational intensity | 507.1471 |
|
||||||
|
+-----------------------------------+------------+
|
||||||
|
|
168
likwid-outputs/csx-lammps-sp-mem_sp.out
Normal file
168
likwid-outputs/csx-lammps-sp-mem_sp.out
Normal file
@@ -0,0 +1,168 @@
|
|||||||
|
--------------------------------------------------------------------------------
|
||||||
|
CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
|
||||||
|
CPU type: Intel Cascadelake SP processor
|
||||||
|
CPU clock: 2.49 GHz
|
||||||
|
--------------------------------------------------------------------------------
|
||||||
|
Parameters:
|
||||||
|
Force field: lj
|
||||||
|
Kernel: plain-C
|
||||||
|
Data layout: AoS
|
||||||
|
Floating-point precision: single
|
||||||
|
Unit cells (nx, ny, nz): 32, 32, 32
|
||||||
|
Domain box sizes (x, y, z): 5.374708e+01, 5.374708e+01, 5.374708e+01
|
||||||
|
Periodic (x, y, z): 1, 1, 1
|
||||||
|
Lattice size: 1.679596e+00
|
||||||
|
Epsilon: 1.000000e+00
|
||||||
|
Sigma: 1.000000e+00
|
||||||
|
Spring constant: 1.000000e+00
|
||||||
|
Damping constant: 1.000000e+00
|
||||||
|
Temperature: 1.440000e+00
|
||||||
|
RHO: 8.442000e-01
|
||||||
|
Mass: 1.000000e+00
|
||||||
|
Number of types: 4
|
||||||
|
Number of timesteps: 200
|
||||||
|
Report stats every (timesteps): 100
|
||||||
|
Reneighbor every (timesteps): 20
|
||||||
|
Prune every (timesteps): 1000
|
||||||
|
Output positions every (timesteps): 20
|
||||||
|
Output velocities every (timesteps): 5
|
||||||
|
Delta time (dt): 5.000000e-03
|
||||||
|
Cutoff radius: 2.500000e+00
|
||||||
|
Skin: 3.000000e-01
|
||||||
|
Half neighbor lists: 0
|
||||||
|
Processor frequency (GHz): 2.0000
|
||||||
|
----------------------------------------------------------------------------
|
||||||
|
step temp pressure
|
||||||
|
0 1.440000e+00 1.215639e+00
|
||||||
|
100 8.200897e-01 6.923144e-01
|
||||||
|
200 7.961481e-01 6.721031e-01
|
||||||
|
----------------------------------------------------------------------------
|
||||||
|
System: 131072 atoms 47265 ghost atoms, Steps: 200
|
||||||
|
TOTAL 10.83s FORCE 4.62s NEIGH 5.94s REST 0.26s
|
||||||
|
----------------------------------------------------------------------------
|
||||||
|
Performance: 2.42 million atom updates per second
|
||||||
|
Statistics:
|
||||||
|
Vector width: 16, Processor frequency: 2.0000 GHz
|
||||||
|
Average neighbors per atom: 76.0351
|
||||||
|
Average SIMD iterations per atom: 5.0875
|
||||||
|
Total number of computed pair interactions: 2003181259
|
||||||
|
Total number of SIMD iterations: 134032075
|
||||||
|
Useful read data volume for force computation: 32.79GB
|
||||||
|
Cycles/SIMD iteration: 68.9511
|
||||||
|
--------------------------------------------------------------------------------
|
||||||
|
Region force, Group 1: MEM_SP
|
||||||
|
+-------------------+------------+
|
||||||
|
| Region Info | HWThread 0 |
|
||||||
|
+-------------------+------------+
|
||||||
|
| RDTSC Runtime [s] | 4.452877 |
|
||||||
|
| call count | 201 |
|
||||||
|
+-------------------+------------+
|
||||||
|
|
||||||
|
+------------------------------------------+---------+-------------+
|
||||||
|
| Event | Counter | HWThread 0 |
|
||||||
|
+------------------------------------------+---------+-------------+
|
||||||
|
| INSTR_RETIRED_ANY | FIXC0 | 7428719000 |
|
||||||
|
| CPU_CLK_UNHALTED_CORE | FIXC1 | 8875251000 |
|
||||||
|
| CPU_CLK_UNHALTED_REF | FIXC2 | 11094050000 |
|
||||||
|
| PWR_PKG_ENERGY | PWR0 | 265.5057 |
|
||||||
|
| PWR_DRAM_ENERGY | PWR3 | 0 |
|
||||||
|
| FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE | PMC0 | 0 |
|
||||||
|
| FP_ARITH_INST_RETIRED_SCALAR_SINGLE | PMC1 | 79036820 |
|
||||||
|
| FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | PMC2 | 0 |
|
||||||
|
| FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | PMC3 | 3935012000 |
|
||||||
|
| CAS_COUNT_RD | MBOX0C0 | 19716700 |
|
||||||
|
| CAS_COUNT_WR | MBOX0C1 | 595747 |
|
||||||
|
| CAS_COUNT_RD | MBOX1C0 | 19734880 |
|
||||||
|
| CAS_COUNT_WR | MBOX1C1 | 597090 |
|
||||||
|
| CAS_COUNT_RD | MBOX2C0 | 19732800 |
|
||||||
|
| CAS_COUNT_WR | MBOX2C1 | 595219 |
|
||||||
|
| CAS_COUNT_RD | MBOX3C0 | 19886430 |
|
||||||
|
| CAS_COUNT_WR | MBOX3C1 | 632443 |
|
||||||
|
| CAS_COUNT_RD | MBOX4C0 | 19887210 |
|
||||||
|
| CAS_COUNT_WR | MBOX4C1 | 633169 |
|
||||||
|
| CAS_COUNT_RD | MBOX5C0 | 19935560 |
|
||||||
|
| CAS_COUNT_WR | MBOX5C1 | 634112 |
|
||||||
|
+------------------------------------------+---------+-------------+
|
||||||
|
|
||||||
|
+-----------------------------------+------------+
|
||||||
|
| Metric | HWThread 0 |
|
||||||
|
+-----------------------------------+------------+
|
||||||
|
| Runtime (RDTSC) [s] | 4.4529 |
|
||||||
|
| Runtime unhalted [s] | 3.5585 |
|
||||||
|
| Clock [MHz] | 1995.2693 |
|
||||||
|
| CPI | 1.1947 |
|
||||||
|
| Energy [J] | 265.5057 |
|
||||||
|
| Power [W] | 59.6257 |
|
||||||
|
| Energy DRAM [J] | 0 |
|
||||||
|
| Power DRAM [W] | 0 |
|
||||||
|
| SP [MFLOP/s] | 14156.9661 |
|
||||||
|
| AVX SP [MFLOP/s] | 14139.2165 |
|
||||||
|
| Packed [MUOPS/s] | 883.7010 |
|
||||||
|
| Scalar [MUOPS/s] | 17.7496 |
|
||||||
|
| Memory read bandwidth [MBytes/s] | 1708.8254 |
|
||||||
|
| Memory read data volume [GBytes] | 7.6092 |
|
||||||
|
| Memory write bandwidth [MBytes/s] | 53.0035 |
|
||||||
|
| Memory write data volume [GBytes] | 0.2360 |
|
||||||
|
| Memory bandwidth [MBytes/s] | 1761.8288 |
|
||||||
|
| Memory data volume [GBytes] | 7.8452 |
|
||||||
|
| Operational intensity | 8.0354 |
|
||||||
|
+-----------------------------------+------------+
|
||||||
|
|
||||||
|
Region reneighbour, Group 1: MEM_SP
|
||||||
|
+-------------------+------------+
|
||||||
|
| Region Info | HWThread 0 |
|
||||||
|
+-------------------+------------+
|
||||||
|
| RDTSC Runtime [s] | 5.935627 |
|
||||||
|
| call count | 10 |
|
||||||
|
+-------------------+------------+
|
||||||
|
|
||||||
|
+------------------------------------------+---------+-------------+
|
||||||
|
| Event | Counter | HWThread 0 |
|
||||||
|
+------------------------------------------+---------+-------------+
|
||||||
|
| INSTR_RETIRED_ANY | FIXC0 | 18208530000 |
|
||||||
|
| CPU_CLK_UNHALTED_CORE | FIXC1 | 11805500000 |
|
||||||
|
| CPU_CLK_UNHALTED_REF | FIXC2 | 14756870000 |
|
||||||
|
| PWR_PKG_ENERGY | PWR0 | 340.7903 |
|
||||||
|
| PWR_DRAM_ENERGY | PWR3 | 0 |
|
||||||
|
| FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE | PMC0 | 0 |
|
||||||
|
| FP_ARITH_INST_RETIRED_SCALAR_SINGLE | PMC1 | 6240406000 |
|
||||||
|
| FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | PMC2 | 0 |
|
||||||
|
| FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | PMC3 | 491520 |
|
||||||
|
| CAS_COUNT_RD | MBOX0C0 | 1772377 |
|
||||||
|
| CAS_COUNT_WR | MBOX0C1 | 975760 |
|
||||||
|
| CAS_COUNT_RD | MBOX1C0 | 1770611 |
|
||||||
|
| CAS_COUNT_WR | MBOX1C1 | 977433 |
|
||||||
|
| CAS_COUNT_RD | MBOX2C0 | 1771722 |
|
||||||
|
| CAS_COUNT_WR | MBOX2C1 | 979122 |
|
||||||
|
| CAS_COUNT_RD | MBOX3C0 | 1782901 |
|
||||||
|
| CAS_COUNT_WR | MBOX3C1 | 967621 |
|
||||||
|
| CAS_COUNT_RD | MBOX4C0 | 1780789 |
|
||||||
|
| CAS_COUNT_WR | MBOX4C1 | 967179 |
|
||||||
|
| CAS_COUNT_RD | MBOX5C0 | 1784733 |
|
||||||
|
| CAS_COUNT_WR | MBOX5C1 | 969349 |
|
||||||
|
+------------------------------------------+---------+-------------+
|
||||||
|
|
||||||
|
+-----------------------------------+------------+
|
||||||
|
| Metric | HWThread 0 |
|
||||||
|
+-----------------------------------+------------+
|
||||||
|
| Runtime (RDTSC) [s] | 5.9356 |
|
||||||
|
| Runtime unhalted [s] | 4.7334 |
|
||||||
|
| Clock [MHz] | 1995.2675 |
|
||||||
|
| CPI | 0.6483 |
|
||||||
|
| Energy [J] | 340.7903 |
|
||||||
|
| Power [W] | 57.4144 |
|
||||||
|
| Energy DRAM [J] | 0 |
|
||||||
|
| Power DRAM [W] | 0 |
|
||||||
|
| SP [MFLOP/s] | 1052.6723 |
|
||||||
|
| AVX SP [MFLOP/s] | 1.3249 |
|
||||||
|
| Packed [MUOPS/s] | 0.0828 |
|
||||||
|
| Scalar [MUOPS/s] | 1051.3474 |
|
||||||
|
| Memory read bandwidth [MBytes/s] | 114.9736 |
|
||||||
|
| Memory read data volume [GBytes] | 0.6824 |
|
||||||
|
| Memory write bandwidth [MBytes/s] | 62.9308 |
|
||||||
|
| Memory write data volume [GBytes] | 0.3735 |
|
||||||
|
| Memory bandwidth [MBytes/s] | 177.9044 |
|
||||||
|
| Memory data volume [GBytes] | 1.0560 |
|
||||||
|
| Operational intensity | 5.9171 |
|
||||||
|
+-----------------------------------+------------+
|
||||||
|
|
148
static_analysis/gromacs-avx512-dp-ICX-iaca.txt
Normal file
148
static_analysis/gromacs-avx512-dp-ICX-iaca.txt
Normal file
@@ -0,0 +1,148 @@
|
|||||||
|
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||||
|
Analyzed File - gromacs-avx512-dp-ICX.o
|
||||||
|
Binary Format - 64Bit
|
||||||
|
Architecture - SKX
|
||||||
|
Analysis Type - Throughput
|
||||||
|
|
||||||
|
Throughput Analysis Report
|
||||||
|
--------------------------
|
||||||
|
Block Throughput: 47.68 Cycles Throughput Bottleneck: Backend
|
||||||
|
Loop Count: 22
|
||||||
|
Port Binding In Cycles Per Iteration:
|
||||||
|
--------------------------------------------------------------------------------------------------
|
||||||
|
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||||
|
--------------------------------------------------------------------------------------------------
|
||||||
|
| Cycles | 42.0 0.0 | 12.5 | 5.0 5.0 | 5.0 5.0 | 0.0 | 42.0 | 12.5 | 0.0 |
|
||||||
|
--------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
DV - Divider pipe (on port 0)
|
||||||
|
D - Data fetch pipe (on ports 2 and 3)
|
||||||
|
F - Macro Fusion with the previous instruction occurred
|
||||||
|
* - instruction micro-ops not bound to a port
|
||||||
|
^ - Micro Fusion occurred
|
||||||
|
# - ESP Tracking sync uop was issued
|
||||||
|
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||||
|
X - instruction not supported, was not accounted in Analysis
|
||||||
|
|
||||||
|
| Num Of | Ports pressure in cycles | |
|
||||||
|
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||||
|
-----------------------------------------------------------------------------------------
|
||||||
|
| 1 | | | 1.0 1.0 | | | | | | movsxd rbx, dword ptr [r12+r14*4]
|
||||||
|
| 1 | | 1.0 | | | | | | | lea rcx, ptr [rbx+rbx*2]
|
||||||
|
| 1 | | | | | | | 1.0 | | shl rcx, 0x6
|
||||||
|
| 1 | | | | 1.0 1.0 | | | | | vmovapd zmm29, zmmword ptr [rsi+rcx*1]
|
||||||
|
| 1 | | | 1.0 1.0 | | | | | | vmovapd zmm30, zmmword ptr [rsi+rcx*1+0x40]
|
||||||
|
| 1 | | | | 1.0 1.0 | | | | | vmovapd zmm31, zmmword ptr [rsi+rcx*1+0x80]
|
||||||
|
| 1 | | | 1.0 1.0 | | | | | | vmovupd zmm3, zmmword ptr [rsp+0x40]
|
||||||
|
| 1 | | | | | | 1.0 | | | vsubpd zmm4, zmm3, zmm29
|
||||||
|
| 1 | | | | 1.0 1.0 | | | | | vmovupd zmm3, zmmword ptr [rsp+0x140]
|
||||||
|
| 1 | 1.0 | | | | | | | | vsubpd zmm3, zmm3, zmm30
|
||||||
|
| 1 | | 1.0 | | | | | | | lea ecx, ptr [rbx+rbx*1]
|
||||||
|
| 1 | | | | | | | 1.0 | | cmp rdi, rcx
|
||||||
|
| 1 | | | | | | | 1.0 | | setnz dl
|
||||||
|
| 1 | | | | | | | 1.0 | | setz cl
|
||||||
|
| 1 | | 1.0 | | | | | | | lea ebx, ptr [rbx+rbx*1+0x1]
|
||||||
|
| 1 | | | | | | 1.0 | | | vsubpd zmm17, zmm25, zmm31
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm17, zmm17
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm18, zmm3, zmm3
|
||||||
|
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm18, zmm4, zmm4
|
||||||
|
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm19, zmm18
|
||||||
|
| 1 | | 1.0 | | | | | | | cmp rdi, rbx
|
||||||
|
| 1 | | | | | | | 1.0 | | setz bl
|
||||||
|
| 1* | | | | | | | | | mov ebp, ebx
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm20, zmm19, zmm22
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm21, zmm19, zmm19
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm20, zmm21, zmm20
|
||||||
|
| 1 | | | 1.0 1.0 | | | | | | vmovupd zmm21, zmmword ptr [rsp+0x80]
|
||||||
|
| 1 | | | | | | 1.0 | | | vsubpd zmm21, zmm21, zmm29
|
||||||
|
| 1 | | | | | | | 1.0 | | shl bpl, 0x4
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm1, zmm19
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm19, zmm20
|
||||||
|
| 1 | 1.0 | | | | | | | | vaddpd zmm20, zmm20, zmm2
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm19, zmm20
|
||||||
|
| 1 | | | | 1.0 1.0 | | | | | vmovupd zmm20, zmmword ptr [rsp+0x100]
|
||||||
|
| 1 | 1.0 | | | | | | | | vsubpd zmm20, zmm20, zmm30
|
||||||
|
| 1 | | 1.0 | | | | | | | not bpl
|
||||||
|
| 1 | | 1.0 | | | | | | | sub bpl, cl
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovd k1, ebp
|
||||||
|
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm18, zmm0, 0x1
|
||||||
|
| 1 | 1.0 | | | | | | | | vsubpd zmm18, zmm26, zmm31
|
||||||
|
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm15{k1}, zmm19, zmm4
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm4, zmm18, zmm18
|
||||||
|
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm4, zmm20, zmm20
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm4, zmm21, zmm21
|
||||||
|
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12{k1}, zmm19, zmm3
|
||||||
|
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm3, zmm4
|
||||||
|
| 1 | | 1.0 | | | | | | | lea ecx, ptr [rdx+rdx*1]
|
||||||
|
| 1* | | | | | | | | | mov eax, ebx
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm8{k1}, zmm19, zmm17
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm17, zmm3, zmm22
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm3, zmm3
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm17, zmm19, zmm17
|
||||||
|
| 1 | | | 1.0 1.0 | | | | | | vmovupd zmm19, zmmword ptr [rsp+0x1c0]
|
||||||
|
| 1 | 1.0 | | | | | | | | vsubpd zmm19, zmm19, zmm29
|
||||||
|
| 1 | | | | | | | 1.0 | | shl al, 0x5
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm1, zmm3
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm3, zmm17
|
||||||
|
| 1 | | | | | | 1.0 | | | vaddpd zmm17, zmm17, zmm2
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm3, zmm17
|
||||||
|
| 1 | | | | | | 1.0 | | | vsubpd zmm17, zmm23, zmm30
|
||||||
|
| 1 | | 0.5 | | | | | 0.5 | | sub cl, al
|
||||||
|
| 1 | | 0.5 | | | | | 0.5 | | add cl, 0xfd
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovd k1, ecx
|
||||||
|
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm4, zmm0, 0x1
|
||||||
|
| 1 | 1.0 | | | | | | | | vsubpd zmm4, zmm27, zmm31
|
||||||
|
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm14{k1}, zmm3, zmm21
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm21, zmm4, zmm4
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm21, zmm17, zmm17
|
||||||
|
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm21, zmm19, zmm19
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm10{k1}, zmm3, zmm20
|
||||||
|
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm20, zmm21
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm6{k1}, zmm3, zmm18
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm20, zmm22
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm20, zmm20
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm18, zmm3
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm1, zmm20
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm18, zmm3
|
||||||
|
| 1 | | | | | | 1.0 | | | vaddpd zmm3, zmm3, zmm2
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm18, zmm3
|
||||||
|
| 1 | | 1.0 | | | | | | | lea eax, ptr [rdx*4]
|
||||||
|
| 1* | | | | | | | | | mov ecx, ebx
|
||||||
|
| 1 | | | | | | | 1.0 | | shl cl, 0x6
|
||||||
|
| 1 | | 0.5 | | | | | 0.5 | | sub al, cl
|
||||||
|
| 1 | | 0.5 | | | | | 0.5 | | add al, 0xfb
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovd k1, eax
|
||||||
|
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm21, zmm0, 0x1
|
||||||
|
| 1 | | | | 1.0 1.0 | | | | | vmovupd zmm18, zmmword ptr [rsp+0x180]
|
||||||
|
| 1 | 1.0 | | | | | | | | vsubpd zmm18, zmm18, zmm29
|
||||||
|
| 1 | 1.0 | | | | | | | | vsubpd zmm20, zmm24, zmm30
|
||||||
|
| 1 | | | | | | 1.0 | | | vsubpd zmm21, zmm28, zmm31
|
||||||
|
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm16{k1}, zmm3, zmm19
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm21, zmm21
|
||||||
|
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm19, zmm20, zmm20
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm19, zmm18, zmm18
|
||||||
|
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm11{k1}, zmm3, zmm17
|
||||||
|
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm17, zmm19
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm7{k1}, zmm3, zmm4
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm17, zmm22
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm4, zmm17, zmm17
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm4, zmm3
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm4, zmm1, zmm17
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm4, zmm4, zmm3
|
||||||
|
| 1 | 1.0 | | | | | | | | vaddpd zmm3, zmm3, zmm2
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm4, zmm3
|
||||||
|
| 1 | | | | | | | 1.0 | | shl dl, 0x3
|
||||||
|
| 1 | | | | | | | 1.0 | | shl bl, 0x7
|
||||||
|
| 1 | | 1.0 | | | | | | | sub dl, bl
|
||||||
|
| 1 | | 1.0 | | | | | | | add dl, 0xf7
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovd k1, edx
|
||||||
|
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm19, zmm0, 0x1
|
||||||
|
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm13{k1}, zmm3, zmm18
|
||||||
|
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm9{k1}, zmm3, zmm20
|
||||||
|
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm5{k1}, zmm3, zmm21
|
||||||
|
| 1 | | 0.5 | | | | | 0.5 | | inc r14
|
||||||
|
| 1* | | | | | | | | | cmp r11, r14
|
||||||
|
| 0*F | | | | | | | | | jnz 0xfffffffffffffd99
|
||||||
|
Total Num Of Uops: 123
|
||||||
|
Analysis Notes:
|
||||||
|
Backend allocation was stalled due to unavailable allocation resources.
|
159
static_analysis/gromacs-avx512-dp-ICX-osaca.txt
Normal file
159
static_analysis/gromacs-avx512-dp-ICX-osaca.txt
Normal file
@@ -0,0 +1,159 @@
|
|||||||
|
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||||
|
Analyzed file: gromacs-avx512-dp-ICX.s
|
||||||
|
Architecture: CSX
|
||||||
|
Timestamp: 2023-01-03 00:07:20
|
||||||
|
|
||||||
|
|
||||||
|
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||||
|
* - Instruction micro-ops not bound to a port
|
||||||
|
X - No throughput/latency information for this instruction in data file
|
||||||
|
|
||||||
|
|
||||||
|
Combined Analysis Report
|
||||||
|
------------------------
|
||||||
|
Port pressure in cycles
|
||||||
|
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||||
|
--------------------------------------------------------------------------------------------------
|
||||||
|
2287 | | | | | | | | || | | .LBB5_11: #
|
||||||
|
2288 | | | | | | | | || | | # Parent Loop BB5_6 Depth=1
|
||||||
|
2289 | | | | | | | | || | | # => This Inner Loop Header: Depth=2
|
||||||
|
2290 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | movslq (%r12,%r14,4), %rbx
|
||||||
|
2291 | | 1.00 | | | | 0.00 | | || 1.0 | | leaq (%rbx,%rbx,2), %rcx
|
||||||
|
2292 | 0.00 | | | | | | 1.00 | || 1.0 | | shlq $6, %rcx
|
||||||
|
2293 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovapd (%rsi,%rcx), %zmm29
|
||||||
|
2294 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovapd 64(%rsi,%rcx), %zmm30
|
||||||
|
2295 | | | 0.50 0.50 | 0.50 0.50 | | | | || 0.0 | | vmovapd 128(%rsi,%rcx), %zmm31
|
||||||
|
2296 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 64(%rsp), %zmm3 # 64-byte Reload
|
||||||
|
2297 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm3, %zmm4
|
||||||
|
2298 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 320(%rsp), %zmm3 # 64-byte Reload
|
||||||
|
2299 | 0.75 | | | | | 0.25 | | || | | vsubpd %zmm30, %zmm3, %zmm3
|
||||||
|
2300 | | 1.00 | | | | 0.00 | | || | | leal (%rbx,%rbx), %ecx
|
||||||
|
2301 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rcx, %rdi
|
||||||
|
2302 | 0.00 | | | | | | 1.00 | || | | setne %dl
|
||||||
|
2303 | 0.00 | | | | | | 1.00 | || | | sete %cl
|
||||||
|
2304 | | 1.00 | | | | | | || | | leal 1(%rbx,%rbx), %ebx
|
||||||
|
2305 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm31, %zmm25, %zmm17
|
||||||
|
2306 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm17, %zmm17, %zmm18
|
||||||
|
2307 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm3, %zmm3, %zmm18 # zmm18 = (zmm3 * zmm3) + zmm18
|
||||||
|
2308 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm4, %zmm4, %zmm18 # zmm18 = (zmm4 * zmm4) + zmm18
|
||||||
|
2309 | 2.75 | | | | | 0.25 | | || 8.0 | | vrcp14pd %zmm18, %zmm19
|
||||||
|
2310 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | cmpq %rbx, %rdi
|
||||||
|
2311 | 0.00 | | | | | | 1.00 | || | | sete %bl
|
||||||
|
2312 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %ebx, %ebp
|
||||||
|
2313 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm22, %zmm19, %zmm20
|
||||||
|
2314 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm19, %zmm19, %zmm21
|
||||||
|
2315 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm21, %zmm20
|
||||||
|
2316 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 128(%rsp), %zmm21 # 64-byte Reload
|
||||||
|
2317 | 0.75 | | | | | 0.25 | | || | | vsubpd %zmm29, %zmm21, %zmm21
|
||||||
|
2318 | 0.00 | | | | | | 1.00 | || | | shlb $4, %bpl
|
||||||
|
2319 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm19, %zmm1, %zmm19
|
||||||
|
2320 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm19, %zmm19
|
||||||
|
2321 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm2, %zmm20, %zmm20
|
||||||
|
2322 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm19, %zmm19
|
||||||
|
2323 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 256(%rsp), %zmm20 # 64-byte Reload
|
||||||
|
2324 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm20, %zmm20
|
||||||
|
2325 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | notb %bpl
|
||||||
|
2326 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | | subb %cl, %bpl
|
||||||
|
2327 | 1.00 | | | | | | | || | | kmovd %ebp, %k1
|
||||||
|
2328 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm18, %k1 {%k1}
|
||||||
|
2329 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm31, %zmm26, %zmm18
|
||||||
|
2330 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm4, %zmm19, %zmm15 {%k1} # zmm15 = (zmm19 * zmm4) + zmm15
|
||||||
|
2331 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm18, %zmm4
|
||||||
|
2332 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm20, %zmm20, %zmm4 # zmm4 = (zmm20 * zmm20) + zmm4
|
||||||
|
2333 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm21, %zmm21, %zmm4 # zmm4 = (zmm21 * zmm21) + zmm4
|
||||||
|
2334 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm3, %zmm19, %zmm12 {%k1} # zmm12 = (zmm19 * zmm3) + zmm12
|
||||||
|
2335 | 2.25 | | | | | 0.75 | | || | | vrcp14pd %zmm4, %zmm3
|
||||||
|
2336 | | 1.00 | | | | 0.00 | | || | | leal (%rdx,%rdx), %ecx
|
||||||
|
2337 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %ebx, %eax
|
||||||
|
2338 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm17, %zmm19, %zmm8 {%k1} # zmm8 = (zmm19 * zmm17) + zmm8
|
||||||
|
2339 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm22, %zmm3, %zmm17
|
||||||
|
2340 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm3, %zmm19
|
||||||
|
2341 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm19, %zmm17
|
||||||
|
2342 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 448(%rsp), %zmm19 # 64-byte Reload
|
||||||
|
2343 | 0.75 | | | | | 0.25 | | || | | vsubpd %zmm29, %zmm19, %zmm19
|
||||||
|
2344 | 0.00 | | | | | | 1.00 | || | | shlb $5, %al
|
||||||
|
2345 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm1, %zmm3
|
||||||
|
2346 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm3, %zmm3
|
||||||
|
2347 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm2, %zmm17, %zmm17
|
||||||
|
2348 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm3, %zmm3
|
||||||
|
2349 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm23, %zmm17
|
||||||
|
2350 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | subb %al, %cl
|
||||||
|
2351 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | | addb $-3, %cl
|
||||||
|
2352 | 1.00 | | | | | | | || | | kmovd %ecx, %k1
|
||||||
|
2353 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm4, %k1 {%k1}
|
||||||
|
2354 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm31, %zmm27, %zmm4
|
||||||
|
2355 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm21, %zmm3, %zmm14 {%k1} # zmm14 = (zmm3 * zmm21) + zmm14
|
||||||
|
2356 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm4, %zmm4, %zmm21
|
||||||
|
2357 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm17, %zmm17, %zmm21 # zmm21 = (zmm17 * zmm17) + zmm21
|
||||||
|
2358 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm19, %zmm19, %zmm21 # zmm21 = (zmm19 * zmm19) + zmm21
|
||||||
|
2359 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm20, %zmm3, %zmm10 {%k1} # zmm10 = (zmm3 * zmm20) + zmm10
|
||||||
|
2360 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm21, %zmm20
|
||||||
|
2361 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm18, %zmm3, %zmm6 {%k1} # zmm6 = (zmm3 * zmm18) + zmm6
|
||||||
|
2362 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm22, %zmm20, %zmm3
|
||||||
|
2363 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm20, %zmm18
|
||||||
|
2364 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm18, %zmm3
|
||||||
|
2365 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm1, %zmm18
|
||||||
|
2366 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm18, %zmm18
|
||||||
|
2367 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm2, %zmm3, %zmm3
|
||||||
|
2368 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm18, %zmm3
|
||||||
|
2369 | | 1.00 | | | | 0.00 | | || | | leal (,%rdx,4), %eax
|
||||||
|
2370 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %ebx, %ecx
|
||||||
|
2371 | 0.00 | | | | | | 1.00 | || | | shlb $6, %cl
|
||||||
|
2372 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | subb %cl, %al
|
||||||
|
2373 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | addb $-5, %al
|
||||||
|
2374 | 1.00 | | | | | | | || | | kmovd %eax, %k1
|
||||||
|
2375 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm21, %k1 {%k1}
|
||||||
|
2376 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 384(%rsp), %zmm18 # 64-byte Reload
|
||||||
|
2377 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm18, %zmm18
|
||||||
|
2378 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm24, %zmm20
|
||||||
|
2379 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm31, %zmm28, %zmm21
|
||||||
|
2380 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm19, %zmm3, %zmm16 {%k1} # zmm16 = (zmm3 * zmm19) + zmm16
|
||||||
|
2381 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm21, %zmm21, %zmm19
|
||||||
|
2382 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm20, %zmm20, %zmm19 # zmm19 = (zmm20 * zmm20) + zmm19
|
||||||
|
2383 | 0.25 | | | | | 0.75 | | || | | vfmadd231pd %zmm18, %zmm18, %zmm19 # zmm19 = (zmm18 * zmm18) + zmm19
|
||||||
|
2384 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm17, %zmm3, %zmm11 {%k1} # zmm11 = (zmm3 * zmm17) + zmm11
|
||||||
|
2385 | 2.00 | | | | | 1.00 | | || | | vrcp14pd %zmm19, %zmm17
|
||||||
|
2386 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm4, %zmm3, %zmm7 {%k1} # zmm7 = (zmm3 * zmm4) + zmm7
|
||||||
|
2387 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm22, %zmm17, %zmm3
|
||||||
|
2388 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm17, %zmm17, %zmm4
|
||||||
|
2389 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm4, %zmm3
|
||||||
|
2390 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm17, %zmm1, %zmm4
|
||||||
|
2391 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm4, %zmm4
|
||||||
|
2392 | 0.00 | | | | | 1.00 | | || | | vaddpd %zmm2, %zmm3, %zmm3
|
||||||
|
2393 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm4, %zmm3
|
||||||
|
2394 | 0.00 | | | | | | 1.00 | || | | shlb $3, %dl
|
||||||
|
2395 | 0.00 | | | | | | 1.00 | || | | shlb $7, %bl
|
||||||
|
2396 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | subb %bl, %dl
|
||||||
|
2397 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | addb $-9, %dl
|
||||||
|
2398 | 1.00 | | | | | | | || | | kmovd %edx, %k1
|
||||||
|
2399 | | | | | | | | || | | X vcmpltpd %zmm0, %zmm19, %k1 {%k1}
|
||||||
|
2400 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm18, %zmm3, %zmm13 {%k1} # zmm13 = (zmm3 * zmm18) + zmm13
|
||||||
|
2401 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm20, %zmm3, %zmm9 {%k1} # zmm9 = (zmm3 * zmm20) + zmm9
|
||||||
|
2402 | 0.00 | | | | | 1.00 | | || | 4.0 | vfmadd231pd %zmm21, %zmm3, %zmm5 {%k1} # zmm5 = (zmm3 * zmm21) + zmm5
|
||||||
|
2403 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | incq %r14
|
||||||
|
2404 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %r14, %r11
|
||||||
|
2405 | | | | | | | | || | | * jne .LBB5_11
|
||||||
|
|
||||||
|
40.0 14.5 5.00 5.00 5.00 5.00 40.0 14.5 50.0 4.0
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Loop-Carried Dependencies Analysis Report
|
||||||
|
-----------------------------------------
|
||||||
|
2402 | 4.0 | vfmadd231pd %zmm21, %zmm3, %zmm5 {%k1} # zmm5 = (zmm3 * zmm21) + zmm5| [2402]
|
||||||
|
2401 | 4.0 | vfmadd231pd %zmm20, %zmm3, %zmm9 {%k1} # zmm9 = (zmm3 * zmm20) + zmm9| [2401]
|
||||||
|
2400 | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm13 {%k1} # zmm13 = (zmm3 * zmm18) + zmm13| [2400]
|
||||||
|
2386 | 4.0 | vfmadd231pd %zmm4, %zmm3, %zmm7 {%k1} # zmm7 = (zmm3 * zmm4) + zmm7| [2386]
|
||||||
|
2384 | 4.0 | vfmadd231pd %zmm17, %zmm3, %zmm11 {%k1} # zmm11 = (zmm3 * zmm17) + zmm11| [2384]
|
||||||
|
2380 | 4.0 | vfmadd231pd %zmm19, %zmm3, %zmm16 {%k1} # zmm16 = (zmm3 * zmm19) + zmm16| [2380]
|
||||||
|
2361 | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm6 {%k1} # zmm6 = (zmm3 * zmm18) + zmm6| [2361]
|
||||||
|
2359 | 4.0 | vfmadd231pd %zmm20, %zmm3, %zmm10 {%k1} # zmm10 = (zmm3 * zmm20) + zmm10| [2359]
|
||||||
|
2355 | 4.0 | vfmadd231pd %zmm21, %zmm3, %zmm14 {%k1} # zmm14 = (zmm3 * zmm21) + zmm14| [2355]
|
||||||
|
2338 | 4.0 | vfmadd231pd %zmm17, %zmm19, %zmm8 {%k1} # zmm8 = (zmm19 * zmm17) + zmm8| [2338]
|
||||||
|
2334 | 4.0 | vfmadd231pd %zmm3, %zmm19, %zmm12 {%k1} # zmm12 = (zmm19 * zmm3) + zmm12| [2334]
|
||||||
|
2330 | 4.0 | vfmadd231pd %zmm4, %zmm19, %zmm15 {%k1} # zmm15 = (zmm19 * zmm4) + zmm15| [2330]
|
||||||
|
2394 | 3.0 | shlb $3, %dl | [2394, 2396, 2397]
|
||||||
|
2318 | 3.0 | shlb $4, %bpl | [2318, 2325, 2326]
|
||||||
|
2403 | 1.0 | incq %r14 | [2403]
|
||||||
|
|
2596
static_analysis/gromacs-avx512-dp-ICX.s
Normal file
2596
static_analysis/gromacs-avx512-dp-ICX.s
Normal file
File diff suppressed because it is too large
Load Diff
198
static_analysis/jan/analyses/gromacs-icc-avx512-dp-iaca.out
Normal file
198
static_analysis/jan/analyses/gromacs-icc-avx512-dp-iaca.out
Normal file
@@ -0,0 +1,198 @@
|
|||||||
|
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||||
|
Analyzed File - gromacs-icc-avx512-dp.o
|
||||||
|
Binary Format - 64Bit
|
||||||
|
Architecture - SKX
|
||||||
|
Analysis Type - Throughput
|
||||||
|
|
||||||
|
Throughput Analysis Report
|
||||||
|
--------------------------
|
||||||
|
Block Throughput: 62.00 Cycles Throughput Bottleneck: Backend
|
||||||
|
Loop Count: 22
|
||||||
|
Port Binding In Cycles Per Iteration:
|
||||||
|
--------------------------------------------------------------------------------------------------
|
||||||
|
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||||
|
--------------------------------------------------------------------------------------------------
|
||||||
|
| Cycles | 58.0 0.0 | 16.0 | 16.0 15.0 | 16.0 15.0 | 2.0 | 58.0 | 16.0 | 0.0 |
|
||||||
|
--------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
DV - Divider pipe (on port 0)
|
||||||
|
D - Data fetch pipe (on ports 2 and 3)
|
||||||
|
F - Macro Fusion with the previous instruction occurred
|
||||||
|
* - instruction micro-ops not bound to a port
|
||||||
|
^ - Micro Fusion occurred
|
||||||
|
# - ESP Tracking sync uop was issued
|
||||||
|
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||||
|
X - instruction not supported, was not accounted in Analysis
|
||||||
|
|
||||||
|
| Num Of | Ports pressure in cycles | |
|
||||||
|
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||||
|
-----------------------------------------------------------------------------------------
|
||||||
|
| 1 | | | 1.0 1.0 | | | | | | mov edx, dword ptr [r10+rsi*4]
|
||||||
|
| 1 | | | | | | | 1.0 | | inc rsi
|
||||||
|
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm20, zmmword ptr [rsp+0x380]
|
||||||
|
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm25, zmmword ptr [rsp+0x340]
|
||||||
|
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm24, zmmword ptr [rsp+0x1c0]
|
||||||
|
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm23, zmmword ptr [rsp+0x2c0]
|
||||||
|
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm16, zmmword ptr [rsp+0x3c0]
|
||||||
|
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm14, zmmword ptr [rsp+0x300]
|
||||||
|
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm15, zmmword ptr [rsp+0x240]
|
||||||
|
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm12, zmmword ptr [rsp+0x180]
|
||||||
|
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm21, zmmword ptr [rsp+0x200]
|
||||||
|
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm18, zmmword ptr [rsp+0x140]
|
||||||
|
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm22, zmmword ptr [rsp+0x100]
|
||||||
|
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm17, zmmword ptr [rsp+0x280]
|
||||||
|
| 1 | | 1.0 | | | | | | | lea r12d, ptr [rdx+rdx*2]
|
||||||
|
| 1 | | | | | | | 1.0 | | shl r12d, 0x3
|
||||||
|
| 1 | | 1.0 | | | | | | | lea r13d, ptr [rdx+rdx*1]
|
||||||
|
| 1 | | | | | | | 1.0 | | movsxd r12, r12d
|
||||||
|
| 1 | | 1.0 | | | | | | | cmp r13d, r11d
|
||||||
|
| 1 | | 1.0 | | | | | | | lea eax, ptr [rdx+rdx*1+0x1]
|
||||||
|
| 1 | | | | | | | 1.0 | | mov edx, 0x0
|
||||||
|
| 1 | | | | | | | 1.0 | | setz dl
|
||||||
|
| 1 | | 1.0 | | | | | | | cmp eax, r11d
|
||||||
|
| 1 | | | | | | | 1.0 | | mov eax, 0x0
|
||||||
|
| 1* | | | | | | | | | mov r13d, edx
|
||||||
|
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm29, zmm20, zmmword ptr [r8+r12*8+0x80]
|
||||||
|
| 1 | | | | | | | 1.0 | | setz al
|
||||||
|
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm26, zmm25, zmmword ptr [r8+r12*8+0x80]
|
||||||
|
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm25, zmm24, zmmword ptr [r8+r12*8]
|
||||||
|
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm24, zmm23, zmmword ptr [r8+r12*8+0x40]
|
||||||
|
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm23, zmm16, zmmword ptr [r8+r12*8+0x80]
|
||||||
|
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm20, zmm14, zmmword ptr [r8+r12*8+0x80]
|
||||||
|
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm27, zmm12, zmmword ptr [r8+r12*8+0x40]
|
||||||
|
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm28, zmm15, zmmword ptr [r8+r12*8]
|
||||||
|
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm30, zmm21, zmmword ptr [r8+r12*8+0x40]
|
||||||
|
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm21, zmm18, zmmword ptr [r8+r12*8+0x40]
|
||||||
|
| 2 | 1.0 | | | 1.0 1.0 | | | | | vsubpd zmm31, zmm22, zmmword ptr [r8+r12*8]
|
||||||
|
| 2 | | | 1.0 1.0 | | | 1.0 | | | vsubpd zmm22, zmm17, zmmword ptr [r8+r12*8]
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm13, zmm29, zmm29
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm15, zmm26, zmm26
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm12, zmm23, zmm23
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm14, zmm20, zmm20
|
||||||
|
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm16, zmmword ptr [rsp+0xc0]
|
||||||
|
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm13, zmm30, zmm30
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm15, zmm27, zmm27
|
||||||
|
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12, zmm24, zmm24
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm14, zmm21, zmm21
|
||||||
|
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm13, zmm31, zmm31
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm15, zmm28, zmm28
|
||||||
|
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12, zmm25, zmm25
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm14, zmm22, zmm22
|
||||||
|
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm19, zmm13
|
||||||
|
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm18, zmm15
|
||||||
|
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm17, zmm12
|
||||||
|
| 1 | | | | | | 1.0 | | | vcmppd k1, zmm13, zmm16, 0x11
|
||||||
|
| 1 | | | | | | 1.0 | | | vcmppd k6, zmm15, zmm16, 0x11
|
||||||
|
| 1 | | | | | | 1.0 | | | vcmppd k7, zmm12, zmm16, 0x11
|
||||||
|
| 1 | | | | | | 1.0 | | | vcmppd k0, zmm14, zmm16, 0x11
|
||||||
|
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm15, zmm14
|
||||||
|
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm16, zmmword ptr [rsp+0x40]
|
||||||
|
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm12, zmmword ptr [rsp+0x80]
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm13, zmm19, zmm16
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm13, zmm19, zmm13
|
||||||
|
| 1 | | 1.0 | | | | | | | neg r13d
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm19, zmm13
|
||||||
|
| 1* | | | | | | | | | mov r12d, eax
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmsub213pd zmm13, zmm19, zmm1
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm19, zmm12
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm13, zmm13, zmm19
|
||||||
|
| 1 | | 1.0 | | | | | | | add r13d, 0xff
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm14, zmm13
|
||||||
|
| 1 | | | | | | | 1.0 | | nop
|
||||||
|
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm13, zmmword ptr [rsp+0x400]
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm10, zmm14
|
||||||
|
| 1 | | | | | | | 1.0 | | shl r12d, 0x4
|
||||||
|
| 1 | | 1.0 | | | | | | | sub r13d, r12d
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovb k5, r13d
|
||||||
|
| 1 | 1.0 | | | | | | | | kmovw r13d, k1
|
||||||
|
| 1 | 1.0 | | | | | | | | kmovb r12d, k5
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovb k5, r12d
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovb k1, r13d
|
||||||
|
| 1* | | | | | | | | | mov r13d, eax
|
||||||
|
| 1 | 1.0 | | | | | | | | kandb k5, k5, k1
|
||||||
|
| 1 | 1.0 | | | | | | | | kmovb r12d, k5
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovw k5, r12d
|
||||||
|
| 1 | | 1.0 | | | | | | | lea r12d, ptr [rdx+rdx*1]
|
||||||
|
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm9{k5}, zmm19, zmm29
|
||||||
|
| 1 | | | | | | | 1.0 | | neg r12d
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm13{k5}, zmm19, zmm31
|
||||||
|
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm31, zmmword ptr [rsp+0x440]
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm29, zmm18, zmm16
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm31{k5}, zmm19, zmm30
|
||||||
|
| 2^ | | | 1.0 | | 1.0 | | | | vmovups zmmword ptr [rsp+0x400], zmm13
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm30, zmm18, zmm29
|
||||||
|
| 2^ | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [rsp+0x440], zmm31
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm13, zmm18, zmm30
|
||||||
|
| 1 | 1.0 | | | | | | | | vfmsub213pd zmm30, zmm18, zmm1
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm18, zmm12
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm30, zmm18
|
||||||
|
| 1 | | 1.0 | | | | | | | add r12d, 0xff
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm13, zmm14
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm29, zmm10, zmm19
|
||||||
|
| 1 | | | | | | | 1.0 | | shl r13d, 0x5
|
||||||
|
| 1 | | 1.0 | | | | | | | sub r12d, r13d
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovb k1, r12d
|
||||||
|
| 1 | 1.0 | | | | | | | | kmovw r12d, k6
|
||||||
|
| 1 | 1.0 | | | | | | | | kmovb r13d, k1
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovb k1, r13d
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovb k6, r12d
|
||||||
|
| 1* | | | | | | | | | mov r12d, eax
|
||||||
|
| 1 | 1.0 | | | | | | | | kandb k1, k1, k6
|
||||||
|
| 1 | 1.0 | | | | | | | | kmovb r13d, k1
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovw k1, r13d
|
||||||
|
| 1 | | 1.0 | | | | | | | lea r13d, ptr [rdx*4]
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm6{k1}, zmm29, zmm26
|
||||||
|
| 1 | | | | | | | 1.0 | | neg r13d
|
||||||
|
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm7{k1}, zmm29, zmm27
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm8{k1}, zmm29, zmm28
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm26, zmm17, zmm16
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm28, zmm17, zmm12
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm15, zmm16
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm12, zmm15, zmm12
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm27, zmm17, zmm26
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm15, zmm16
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm13, zmm17, zmm27
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmsub213pd zmm27, zmm17, zmm1
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm14, zmm27, zmm28
|
||||||
|
| 1 | | | | | | | 1.0 | | add r13d, 0xff
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm17, zmm13, zmm14
|
||||||
|
| 1 | | | | | | | 1.0 | | shl edx, 0x3
|
||||||
|
| 1 | | | | | | | 1.0 | | shl r12d, 0x6
|
||||||
|
| 1 | | 1.0 | | | | | | | neg edx
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm10, zmm17
|
||||||
|
| 1 | | 1.0 | | | | | | | sub r13d, r12d
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovb k6, r13d
|
||||||
|
| 1 | | 1.0 | | | | | | | add edx, 0xff
|
||||||
|
| 1 | | | | | | | 1.0 | | shl eax, 0x7
|
||||||
|
| 1 | | 1.0 | | | | | | | sub edx, eax
|
||||||
|
| 1 | 1.0 | | | | | | | | kmovb eax, k6
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovb k6, eax
|
||||||
|
| 1 | 1.0 | | | | | | | | kmovw eax, k7
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovb k7, eax
|
||||||
|
| 1 | 1.0 | | | | | | | | kandb k7, k6, k7
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovb k6, edx
|
||||||
|
| 1 | 1.0 | | | | | | | | kmovb edx, k7
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovw k7, edx
|
||||||
|
| 1 | 1.0 | | | | | | | | kmovw edx, k0
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm11{k7}, zmm18, zmm23
|
||||||
|
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm4{k7}, zmm18, zmm24
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm5{k7}, zmm18, zmm25
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm23, zmm15, zmm19
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmsub213pd zmm19, zmm15, zmm1
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm15, zmm19, zmm12
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm24, zmm23, zmm15
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm25, zmm10, zmm24
|
||||||
|
| 1 | 1.0 | | | | | | | | kmovb eax, k6
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovb k6, eax
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovb k0, edx
|
||||||
|
| 1 | 1.0 | | | | | | | | kandb k0, k6, k0
|
||||||
|
| 1 | 1.0 | | | | | | | | kmovb r12d, k0
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovw k6, r12d
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm3{k6}, zmm25, zmm22
|
||||||
|
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm2{k6}, zmm25, zmm21
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm0{k6}, zmm25, zmm20
|
||||||
|
| 1* | | | | | | | | | cmp rsi, rdi
|
||||||
|
| 0*F | | | | | | | | | jl 0xfffffffffffffc6f
|
||||||
|
Total Num Of Uops: 187
|
||||||
|
Analysis Notes:
|
||||||
|
Backend allocation was stalled due to unavailable allocation resources.
|
152
static_analysis/jan/analyses/gromacs-icc-avx512-sp-iaca.out
Normal file
152
static_analysis/jan/analyses/gromacs-icc-avx512-sp-iaca.out
Normal file
@@ -0,0 +1,152 @@
|
|||||||
|
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||||
|
Analyzed File - gromacs-icc-avx512-sp.o
|
||||||
|
Binary Format - 64Bit
|
||||||
|
Architecture - SKX
|
||||||
|
Analysis Type - Throughput
|
||||||
|
|
||||||
|
Throughput Analysis Report
|
||||||
|
--------------------------
|
||||||
|
Block Throughput: 51.00 Cycles Throughput Bottleneck: Backend
|
||||||
|
Loop Count: 22
|
||||||
|
Port Binding In Cycles Per Iteration:
|
||||||
|
--------------------------------------------------------------------------------------------------
|
||||||
|
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||||
|
--------------------------------------------------------------------------------------------------
|
||||||
|
| Cycles | 47.5 0.0 | 9.0 | 11.0 11.0 | 11.0 8.0 | 3.0 | 47.5 | 9.0 | 0.0 |
|
||||||
|
--------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
DV - Divider pipe (on port 0)
|
||||||
|
D - Data fetch pipe (on ports 2 and 3)
|
||||||
|
F - Macro Fusion with the previous instruction occurred
|
||||||
|
* - instruction micro-ops not bound to a port
|
||||||
|
^ - Micro Fusion occurred
|
||||||
|
# - ESP Tracking sync uop was issued
|
||||||
|
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||||
|
X - instruction not supported, was not accounted in Analysis
|
||||||
|
|
||||||
|
| Num Of | Ports pressure in cycles | |
|
||||||
|
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||||
|
-----------------------------------------------------------------------------------------
|
||||||
|
| 1 | | | 1.0 1.0 | | | | | | mov edi, dword ptr [rcx+rax*4]
|
||||||
|
| 1* | | | | | | | | | mov r12d, r13d
|
||||||
|
| 1 | | | | | | | 1.0 | | movsxd rdi, edi
|
||||||
|
| 1 | | 1.0 | | | | | | | inc rax
|
||||||
|
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm10, zmmword ptr [rsp+0x140]
|
||||||
|
| 1 | | | | | | | 1.0 | | test edi, 0x7fffffff
|
||||||
|
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm11, zmmword ptr [rsp+0x100]
|
||||||
|
| 1 | | | | 1.0 1.0 | | | | | vmovups zmm9, zmmword ptr [rsp+0xc0]
|
||||||
|
| 1 | | | | | | | 1.0 | | setz r12b
|
||||||
|
| 1 | | 1.0 | | | | | | | lea r14, ptr [rdi+rdi*2]
|
||||||
|
| 1 | | | | | | | 1.0 | | shl r14, 0x5
|
||||||
|
| 1* | | | | | | | | | mov r8d, r12d
|
||||||
|
| 1 | | 1.0 | | | | | | | neg r8d
|
||||||
|
| 1* | | | | | | | | | mov r11d, r12d
|
||||||
|
| 1 | | 1.0 | | | | | | | add r8d, 0xff
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovw k0, r8d
|
||||||
|
| 1 | | 1.0 | | | | | | | lea r9d, ptr [r12+r12*2]
|
||||||
|
| 2 | 1.0 | | 1.0 1.0 | | | | | | vsubps zmm3, zmm13, zmmword ptr [r14+rbx*1+0x40]
|
||||||
|
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm4, zmm10, zmmword ptr [r14+rbx*1]
|
||||||
|
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm10, zmm11, zmmword ptr [r14+rbx*1+0x40]
|
||||||
|
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm5, zmm17, zmmword ptr [r14+rbx*1+0x20]
|
||||||
|
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm27, zmm19, zmmword ptr [r14+rbx*1]
|
||||||
|
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm8, zmm15, zmmword ptr [r14+rbx*1+0x20]
|
||||||
|
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm11, zmm0, zmmword ptr [r14+rbx*1+0x40]
|
||||||
|
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm7, zmm9, zmmword ptr [r14+rbx*1]
|
||||||
|
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm9, zmm14, zmmword ptr [r14+rbx*1+0x20]
|
||||||
|
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm29, zmm12, zmmword ptr [r14+rbx*1+0x40]
|
||||||
|
| 2 | 0.5 | | 1.0 1.0 | | | 0.5 | | | vsubps zmm28, zmm16, zmmword ptr [r14+rbx*1+0x20]
|
||||||
|
| 2 | 0.5 | | | 1.0 1.0 | | 0.5 | | | vsubps zmm25, zmm18, zmmword ptr [r14+rbx*1]
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm2, zmm3, zmm3
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm10, zmm10
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm11, zmm11
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm26, zmm29, zmm29
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm2, zmm5, zmm5
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30, zmm8, zmm8
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm1, zmm9, zmm9
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm26, zmm28, zmm28
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm2, zmm27, zmm27
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30, zmm4, zmm4
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm1, zmm7, zmm7
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm26, zmm25, zmm25
|
||||||
|
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm31, zmm2
|
||||||
|
| 1 | | | | | | 1.0 | | | vcmpps k7, zmm30, zmm24, 0x11
|
||||||
|
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm6, zmm30
|
||||||
|
| 1 | | | | | | 1.0 | | | vcmpps k3, zmm2, zmm24, 0x11
|
||||||
|
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm2, zmm1
|
||||||
|
| 1 | | | | | | 1.0 | | | vcmpps k5, zmm26, zmm24, 0x11
|
||||||
|
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm26, zmm26
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulps zmm30, zmm31, zmm23
|
||||||
|
| 1 | 1.0 | | | | | | | | kandw k2, k0, k3
|
||||||
|
| 1 | | | | | | 1.0 | | | vcmpps k3, zmm1, zmm24, 0x11
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm31, zmm30
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm31, zmm1
|
||||||
|
| 1 | | | | | | | 1.0 | | neg r9d
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm1, zmm31, zmm20
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm31, zmm22
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm1, zmm31
|
||||||
|
| 1 | | | | | | | 1.0 | | add r9d, 0xff
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovw k4, r9d
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulps zmm30, zmm30, zmm31
|
||||||
|
| 1 | 1.0 | | | | | | | | kandw k1, k4, k5
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulps zmm1, zmm21, zmm30
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm26, zmm23
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm26, zmm30
|
||||||
|
| 1 | | 1.0 | | | | | | | lea r10d, ptr [r12*8]
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm26, zmm31
|
||||||
|
| 1 | | | | | | | 1.0 | | neg r10d
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm31, zmm26, zmm20
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm26, zmm26, zmm22
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31, zmm31, zmm26
|
||||||
|
| 1 | | 1.0 | | | | | | | add r10d, r12d
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30, zmm30, zmm31
|
||||||
|
| 1 | | | | | | | 1.0 | | add r10d, 0xff
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovw k6, r10d
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulps zmm26, zmm21, zmm30
|
||||||
|
| 1 | 1.0 | | | | | | | | kandw k4, k6, k7
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulps zmm25{k1}{z}, zmm25, zmm26
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm31{k1}{z}, zmm28, zmm26
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm28, zmm6, zmm23
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm30{k1}{z}, zmm29, zmm26
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm29, zmm2, zmm23
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm25{k2}, zmm27, zmm1
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm31{k2}, zmm5, zmm1
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm27, zmm6, zmm28
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30{k2}, zmm3, zmm1
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm2, zmm29
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm5, zmm6, zmm27
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm27, zmm6, zmm20
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm6, zmm6, zmm22
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm3, zmm2, zmm1
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213ps zmm1, zmm2, zmm20
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm2, zmm2, zmm22
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm26, zmm27, zmm6
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm1, zmm1, zmm2
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm5, zmm5, zmm26
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm3, zmm3, zmm1
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm6, zmm21, zmm5
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulps zmm27, zmm21, zmm3
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm25{k4}, zmm4, zmm6
|
||||||
|
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm4, zmmword ptr [r14+rsi*1]
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm31{k4}, zmm8, zmm6
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30{k4}, zmm10, zmm6
|
||||||
|
| 1 | | | | | | | 1.0 | | shl r11d, 0x4
|
||||||
|
| 1 | | 1.0 | | | | | | | sub r12d, r11d
|
||||||
|
| 1 | | 1.0 | | | | | | | add r12d, 0xff
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovw k0, r12d
|
||||||
|
| 1 | 1.0 | | | | | | | | kandw k5, k0, k3
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm25{k5}, zmm7, zmm27
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm31{k5}, zmm9, zmm27
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231ps zmm30{k5}, zmm11, zmm27
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vsubps zmm7, zmm4, zmm25
|
||||||
|
| 2 | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [r14+rsi*1], zmm7
|
||||||
|
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm8, zmmword ptr [r14+rsi*1+0x20]
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vsubps zmm4, zmm8, zmm31
|
||||||
|
| 2 | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [r14+rsi*1+0x20], zmm4
|
||||||
|
| 1 | | | 1.0 1.0 | | | | | | vmovups zmm1, zmmword ptr [r14+rsi*1+0x40]
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vsubps zmm2, zmm1, zmm30
|
||||||
|
| 2 | | | | 1.0 | 1.0 | | | | vmovups zmmword ptr [r14+rsi*1+0x40], zmm2
|
||||||
|
| 1* | | | | | | | | | cmp rax, rdx
|
||||||
|
| 0*F | | | | | | | | | jb 0xfffffffffffffd30
|
||||||
|
Total Num Of Uops: 142
|
||||||
|
Analysis Notes:
|
||||||
|
Backend allocation was stalled due to unavailable allocation resources.
|
154
static_analysis/jan/analyses/gromacs-icx-avx512-dp-iaca.out
Normal file
154
static_analysis/jan/analyses/gromacs-icx-avx512-dp-iaca.out
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||||
|
Analyzed File - gromacs-icx-avx512-dp.o
|
||||||
|
Binary Format - 64Bit
|
||||||
|
Architecture - SKX
|
||||||
|
Analysis Type - Throughput
|
||||||
|
|
||||||
|
Throughput Analysis Report
|
||||||
|
--------------------------
|
||||||
|
Block Throughput: 49.26 Cycles Throughput Bottleneck: Backend
|
||||||
|
Loop Count: 22
|
||||||
|
Port Binding In Cycles Per Iteration:
|
||||||
|
--------------------------------------------------------------------------------------------------
|
||||||
|
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||||
|
--------------------------------------------------------------------------------------------------
|
||||||
|
| Cycles | 44.0 0.0 | 13.5 | 5.5 5.5 | 5.5 5.5 | 0.0 | 44.0 | 13.5 | 0.0 |
|
||||||
|
--------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
DV - Divider pipe (on port 0)
|
||||||
|
D - Data fetch pipe (on ports 2 and 3)
|
||||||
|
F - Macro Fusion with the previous instruction occurred
|
||||||
|
* - instruction micro-ops not bound to a port
|
||||||
|
^ - Micro Fusion occurred
|
||||||
|
# - ESP Tracking sync uop was issued
|
||||||
|
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||||
|
X - instruction not supported, was not accounted in Analysis
|
||||||
|
|
||||||
|
| Num Of | Ports pressure in cycles | |
|
||||||
|
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||||
|
-----------------------------------------------------------------------------------------
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | movsxd rcx, dword ptr [r10+rbx*4]
|
||||||
|
| 1 | | 1.0 | | | | | | | lea rdx, ptr [rcx+rcx*2]
|
||||||
|
| 1 | | | | | | | 1.0 | | shl rdx, 0x6
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm28, zmmword ptr [rsi+rdx*1]
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm29, zmmword ptr [rsi+rdx*1+0x40]
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm30, zmmword ptr [rsi+rdx*1+0x80]
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm3, zmmword ptr [rsp+0x10]
|
||||||
|
| 1 | | | | | | 1.0 | | | vsubpd zmm3, zmm3, zmm28
|
||||||
|
| 1 | 1.0 | | | | | | | | vsubpd zmm31, zmm24, zmm30
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm16, zmmword ptr [rsp+0x150]
|
||||||
|
| 1 | | | | | | 1.0 | | | vsubpd zmm16, zmm16, zmm29
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm17, zmm31, zmm31
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm17, zmm16, zmm16
|
||||||
|
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm17, zmm3, zmm3
|
||||||
|
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm18, zmm17
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm21, zmm18
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm19, zmm18, zmm19
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm18, zmm19
|
||||||
|
| 1 | | | | | | 1.0 | | | vaddpd zmm20, zmm19, zmm1
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm22, zmm18
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm18, zmm20
|
||||||
|
| 1 | 1.0 | | | | | | | | vsubpd zmm20, zmm25, zmm30
|
||||||
|
| 1 | | 1.0 | | | | | | | lea edx, ptr [rcx+rcx*1]
|
||||||
|
| 1 | | | | | | | 1.0 | | cmp r11, rdx
|
||||||
|
| 1 | | | | | | | 1.0 | | setnz dl
|
||||||
|
| 1 | | | | | | | 1.0 | | setz al
|
||||||
|
| 1 | | 1.0 | | | | | | | add ecx, ecx
|
||||||
|
| 1 | | 1.0 | | | | | | | inc ecx
|
||||||
|
| 1 | | 0.5 | | | | | 0.5 | | cmp r11, rcx
|
||||||
|
| 1 | | | | | | | 1.0 | | setz cl
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm19, zmm18
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm19, zmmword ptr [rsp+0x210]
|
||||||
|
| 1 | 1.0 | | | | | | | | vsubpd zmm19, zmm19, zmm28
|
||||||
|
| 1 | | | | | | | 1.0 | | setnz dil
|
||||||
|
| 1* | | | | | | | | | mov ebp, edi
|
||||||
|
| 1 | | | | | | | 1.0 | | shl bpl, 0x4
|
||||||
|
| 1 | | 1.0 | | | | | | | sub bpl, al
|
||||||
|
| 1 | | 1.0 | | | | | | | add bpl, 0xef
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovd k1, ebp
|
||||||
|
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm17, zmm0, 0x1
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm17, zmmword ptr [rsp+0x110]
|
||||||
|
| 1 | 1.0 | | | | | | | | vsubpd zmm17, zmm17, zmm29
|
||||||
|
| 1 | | 1.0 | | | | | | | lea eax, ptr [rdx+rdx*1]
|
||||||
|
| 1* | | | | | | | | | mov ebp, edi
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm18, zmm2
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm14{k1}, zmm3, zmm18
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm20, zmm20
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm3, zmm17, zmm17
|
||||||
|
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm3, zmm19, zmm19
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm11{k1}, zmm16, zmm18
|
||||||
|
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm16, zmm3
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm7{k1}, zmm31, zmm18
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm21, zmm16
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm18, zmm16, zmm18
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm18, zmm16, zmm18
|
||||||
|
| 1 | | | | | | 1.0 | | | vaddpd zmm31, zmm18, zmm1
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm22, zmm16
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm16, zmm16, zmm31
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm31, zmmword ptr [rsp+0x1d0]
|
||||||
|
| 1 | 1.0 | | | | | | | | vsubpd zmm31, zmm31, zmm28
|
||||||
|
| 1 | | | | | | | 1.0 | | shl bpl, 0x5
|
||||||
|
| 1 | | 1.0 | | | | | | | or bpl, al
|
||||||
|
| 1 | | 0.5 | | | | | 0.5 | | or bpl, 0xdd
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovd k1, ebp
|
||||||
|
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm3, zmm0, 0x1
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm3, zmmword ptr [rsp+0xd0]
|
||||||
|
| 1 | 1.0 | | | | | | | | vsubpd zmm3, zmm3, zmm29
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm18, zmm16
|
||||||
|
| 1 | | | | | | 1.0 | | | vsubpd zmm18, zmm26, zmm30
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm16, zmm2
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm15{k1}, zmm19, zmm16
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm19, zmm18, zmm18
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm19, zmm3, zmm3
|
||||||
|
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm19, zmm31, zmm31
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm10{k1}, zmm17, zmm16
|
||||||
|
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm17, zmm19
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm6{k1}, zmm20, zmm16
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm21, zmm17
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm16, zmm17, zmm16
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm17, zmm16
|
||||||
|
| 1 | | | | | | 1.0 | | | vaddpd zmm20, zmm16, zmm1
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm17, zmm22, zmm17
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm17, zmm17, zmm20
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm16, zmm17
|
||||||
|
| 1 | | 1.0 | | | | | | | lea eax, ptr [rdx*4]
|
||||||
|
| 1 | | | | | | | 1.0 | | shl dil, 0x6
|
||||||
|
| 1 | | 0.5 | | | | | 0.5 | | or dil, al
|
||||||
|
| 1 | | 0.5 | | | | | 0.5 | | or dil, 0xbb
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovd k1, edi
|
||||||
|
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm19, zmm0, 0x1
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovupd zmm17, zmmword ptr [rsp+0x190]
|
||||||
|
| 1 | 1.0 | | | | | | | | vsubpd zmm17, zmm17, zmm28
|
||||||
|
| 1 | 1.0 | | | | | | | | vsubpd zmm19, zmm23, zmm29
|
||||||
|
| 1 | | | | | | 1.0 | | | vsubpd zmm20, zmm27, zmm30
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm16, zmm2
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm13{k1}, zmm31, zmm16
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm28, zmm20, zmm20
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm28, zmm19, zmm19
|
||||||
|
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm28, zmm17, zmm17
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm9{k1}, zmm3, zmm16
|
||||||
|
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm3, zmm28
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm5{k1}, zmm18, zmm16
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm21, zmm3
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm16, zmm3, zmm16
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm16, zmm3, zmm16
|
||||||
|
| 1 | | | | | | 1.0 | | | vaddpd zmm18, zmm16, zmm1
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm22, zmm3
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm3, zmm3, zmm18
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm16, zmm3
|
||||||
|
| 1 | | | | | | | 1.0 | | shl dl, 0x3
|
||||||
|
| 1 | | | | | | | 1.0 | | shl cl, 0x7
|
||||||
|
| 1 | | 1.0 | | | | | | | or cl, dl
|
||||||
|
| 1 | | 1.0 | | | | | | | add cl, 0xf7
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovd k1, ecx
|
||||||
|
| 1 | | | | | | 1.0 | | | vcmppd k1{k1}, zmm28, zmm0, 0x1
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulpd zmm3, zmm3, zmm2
|
||||||
|
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm12{k1}, zmm17, zmm3
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231pd zmm8{k1}, zmm19, zmm3
|
||||||
|
| 1 | 1.0 | | | | | | | | vfmadd231pd zmm4{k1}, zmm20, zmm3
|
||||||
|
| 1 | | 0.5 | | | | | 0.5 | | inc rbx
|
||||||
|
| 1* | | | | | | | | | cmp r9, rbx
|
||||||
|
| 0*F | | | | | | | | | jnz 0xfffffffffffffd5a
|
||||||
|
Total Num Of Uops: 129
|
||||||
|
Analysis Notes:
|
||||||
|
Backend allocation was stalled due to unavailable allocation resources.
|
288
static_analysis/jan/analyses/gromacs-icx-avx512-dp-mca.out
Normal file
288
static_analysis/jan/analyses/gromacs-icx-avx512-dp-mca.out
Normal file
@@ -0,0 +1,288 @@
|
|||||||
|
|
||||||
|
[0] Code Region
|
||||||
|
|
||||||
|
Iterations: 100
|
||||||
|
Instructions: 12200
|
||||||
|
Total Cycles: 4745
|
||||||
|
Total uOps: 14000
|
||||||
|
|
||||||
|
Dispatch Width: 6
|
||||||
|
uOps Per Cycle: 2.95
|
||||||
|
IPC: 2.57
|
||||||
|
Block RThroughput: 34.0
|
||||||
|
|
||||||
|
|
||||||
|
Instruction Info:
|
||||||
|
[1]: #uOps
|
||||||
|
[2]: Latency
|
||||||
|
[3]: RThroughput
|
||||||
|
[4]: MayLoad
|
||||||
|
[5]: MayStore
|
||||||
|
[6]: HasSideEffects (U)
|
||||||
|
|
||||||
|
[1] [2] [3] [4] [5] [6] Instructions:
|
||||||
|
1 5 0.50 * movslq (%r10,%rbx,4), %rcx
|
||||||
|
1 1 0.50 leaq (%rcx,%rcx,2), %rdx
|
||||||
|
1 1 0.50 shlq $6, %rdx
|
||||||
|
2 8 0.50 * vmovupd (%rsi,%rdx), %zmm28
|
||||||
|
2 8 0.50 * vmovupd 64(%rsi,%rdx), %zmm29
|
||||||
|
2 8 0.50 * vmovupd 128(%rsi,%rdx), %zmm30
|
||||||
|
2 8 0.50 * vmovupd 16(%rsp), %zmm3
|
||||||
|
1 4 0.50 vsubpd %zmm28, %zmm3, %zmm3
|
||||||
|
1 4 0.50 vsubpd %zmm30, %zmm24, %zmm31
|
||||||
|
2 8 0.50 * vmovupd 336(%rsp), %zmm16
|
||||||
|
1 4 0.50 vsubpd %zmm29, %zmm16, %zmm16
|
||||||
|
1 4 0.50 vmulpd %zmm31, %zmm31, %zmm17
|
||||||
|
1 4 0.50 vfmadd231pd %zmm16, %zmm16, %zmm17
|
||||||
|
1 4 0.50 vfmadd231pd %zmm3, %zmm3, %zmm17
|
||||||
|
3 4 2.00 vrcp14pd %zmm17, %zmm18
|
||||||
|
1 4 0.50 vmulpd %zmm18, %zmm21, %zmm19
|
||||||
|
1 4 0.50 vmulpd %zmm19, %zmm18, %zmm19
|
||||||
|
1 4 0.50 vmulpd %zmm19, %zmm18, %zmm19
|
||||||
|
1 4 0.50 vaddpd %zmm1, %zmm19, %zmm20
|
||||||
|
1 4 0.50 vmulpd %zmm18, %zmm22, %zmm18
|
||||||
|
1 4 0.50 vmulpd %zmm20, %zmm18, %zmm18
|
||||||
|
1 4 0.50 vsubpd %zmm30, %zmm25, %zmm20
|
||||||
|
1 1 0.50 leal (%rcx,%rcx), %edx
|
||||||
|
1 1 0.25 cmpq %rdx, %r11
|
||||||
|
1 1 0.50 setne %dl
|
||||||
|
1 1 0.50 sete %al
|
||||||
|
1 1 0.25 addl %ecx, %ecx
|
||||||
|
1 1 0.25 incl %ecx
|
||||||
|
1 1 0.25 cmpq %rcx, %r11
|
||||||
|
1 1 0.50 sete %cl
|
||||||
|
1 4 0.50 vmulpd %zmm18, %zmm19, %zmm18
|
||||||
|
2 8 0.50 * vmovupd 528(%rsp), %zmm19
|
||||||
|
1 4 0.50 vsubpd %zmm28, %zmm19, %zmm19
|
||||||
|
1 1 0.50 setne %dil
|
||||||
|
1 1 0.25 movl %edi, %ebp
|
||||||
|
1 1 0.50 shlb $4, %bpl
|
||||||
|
1 1 0.25 subb %al, %bpl
|
||||||
|
1 1 0.25 addb $-17, %bpl
|
||||||
|
1 1 1.00 kmovd %ebp, %k1
|
||||||
|
1 4 1.00 vcmpltpd %zmm0, %zmm17, %k1 {%k1}
|
||||||
|
2 8 0.50 * vmovupd 272(%rsp), %zmm17
|
||||||
|
1 4 0.50 vsubpd %zmm29, %zmm17, %zmm17
|
||||||
|
1 1 0.50 leal (%rdx,%rdx), %eax
|
||||||
|
1 1 0.25 movl %edi, %ebp
|
||||||
|
1 4 0.50 vmulpd %zmm2, %zmm18, %zmm18
|
||||||
|
1 4 0.50 vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1}
|
||||||
|
1 4 0.50 vmulpd %zmm20, %zmm20, %zmm3
|
||||||
|
1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm3
|
||||||
|
1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm3
|
||||||
|
1 4 0.50 vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1}
|
||||||
|
3 4 2.00 vrcp14pd %zmm3, %zmm16
|
||||||
|
1 4 0.50 vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1}
|
||||||
|
1 4 0.50 vmulpd %zmm16, %zmm21, %zmm18
|
||||||
|
1 4 0.50 vmulpd %zmm18, %zmm16, %zmm18
|
||||||
|
1 4 0.50 vmulpd %zmm18, %zmm16, %zmm18
|
||||||
|
1 4 0.50 vaddpd %zmm1, %zmm18, %zmm31
|
||||||
|
1 4 0.50 vmulpd %zmm16, %zmm22, %zmm16
|
||||||
|
1 4 0.50 vmulpd %zmm31, %zmm16, %zmm16
|
||||||
|
2 8 0.50 * vmovupd 464(%rsp), %zmm31
|
||||||
|
1 4 0.50 vsubpd %zmm28, %zmm31, %zmm31
|
||||||
|
1 1 0.50 shlb $5, %bpl
|
||||||
|
1 1 0.25 orb %al, %bpl
|
||||||
|
1 1 0.25 orb $-35, %bpl
|
||||||
|
1 1 1.00 kmovd %ebp, %k1
|
||||||
|
1 4 1.00 vcmpltpd %zmm0, %zmm3, %k1 {%k1}
|
||||||
|
2 8 0.50 * vmovupd 208(%rsp), %zmm3
|
||||||
|
1 4 0.50 vsubpd %zmm29, %zmm3, %zmm3
|
||||||
|
1 4 0.50 vmulpd %zmm16, %zmm18, %zmm16
|
||||||
|
1 4 0.50 vsubpd %zmm30, %zmm26, %zmm18
|
||||||
|
1 4 0.50 vmulpd %zmm2, %zmm16, %zmm16
|
||||||
|
1 4 0.50 vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1}
|
||||||
|
1 4 0.50 vmulpd %zmm18, %zmm18, %zmm19
|
||||||
|
1 4 0.50 vfmadd231pd %zmm3, %zmm3, %zmm19
|
||||||
|
1 4 0.50 vfmadd231pd %zmm31, %zmm31, %zmm19
|
||||||
|
1 4 0.50 vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1}
|
||||||
|
3 4 2.00 vrcp14pd %zmm19, %zmm17
|
||||||
|
1 4 0.50 vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1}
|
||||||
|
1 4 0.50 vmulpd %zmm17, %zmm21, %zmm16
|
||||||
|
1 4 0.50 vmulpd %zmm16, %zmm17, %zmm16
|
||||||
|
1 4 0.50 vmulpd %zmm16, %zmm17, %zmm16
|
||||||
|
1 4 0.50 vaddpd %zmm1, %zmm16, %zmm20
|
||||||
|
1 4 0.50 vmulpd %zmm17, %zmm22, %zmm17
|
||||||
|
1 4 0.50 vmulpd %zmm20, %zmm17, %zmm17
|
||||||
|
1 4 0.50 vmulpd %zmm17, %zmm16, %zmm16
|
||||||
|
1 1 0.50 leal (,%rdx,4), %eax
|
||||||
|
1 1 0.50 shlb $6, %dil
|
||||||
|
1 1 0.25 orb %al, %dil
|
||||||
|
1 1 0.25 orb $-69, %dil
|
||||||
|
1 1 1.00 kmovd %edi, %k1
|
||||||
|
1 4 1.00 vcmpltpd %zmm0, %zmm19, %k1 {%k1}
|
||||||
|
2 8 0.50 * vmovupd 400(%rsp), %zmm17
|
||||||
|
1 4 0.50 vsubpd %zmm28, %zmm17, %zmm17
|
||||||
|
1 4 0.50 vsubpd %zmm29, %zmm23, %zmm19
|
||||||
|
1 4 0.50 vsubpd %zmm30, %zmm27, %zmm20
|
||||||
|
1 4 0.50 vmulpd %zmm2, %zmm16, %zmm16
|
||||||
|
1 4 0.50 vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1}
|
||||||
|
1 4 0.50 vmulpd %zmm20, %zmm20, %zmm28
|
||||||
|
1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm28
|
||||||
|
1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm28
|
||||||
|
1 4 0.50 vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1}
|
||||||
|
3 4 2.00 vrcp14pd %zmm28, %zmm3
|
||||||
|
1 4 0.50 vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1}
|
||||||
|
1 4 0.50 vmulpd %zmm3, %zmm21, %zmm16
|
||||||
|
1 4 0.50 vmulpd %zmm16, %zmm3, %zmm16
|
||||||
|
1 4 0.50 vmulpd %zmm16, %zmm3, %zmm16
|
||||||
|
1 4 0.50 vaddpd %zmm1, %zmm16, %zmm18
|
||||||
|
1 4 0.50 vmulpd %zmm3, %zmm22, %zmm3
|
||||||
|
1 4 0.50 vmulpd %zmm18, %zmm3, %zmm3
|
||||||
|
1 4 0.50 vmulpd %zmm3, %zmm16, %zmm3
|
||||||
|
1 1 0.50 shlb $3, %dl
|
||||||
|
1 1 0.50 shlb $7, %cl
|
||||||
|
1 1 0.25 orb %dl, %cl
|
||||||
|
1 1 0.25 addb $-9, %cl
|
||||||
|
1 1 1.00 kmovd %ecx, %k1
|
||||||
|
1 4 1.00 vcmpltpd %zmm0, %zmm28, %k1 {%k1}
|
||||||
|
1 4 0.50 vmulpd %zmm2, %zmm3, %zmm3
|
||||||
|
1 4 0.50 vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1}
|
||||||
|
1 4 0.50 vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1}
|
||||||
|
1 4 0.50 vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1}
|
||||||
|
1 1 0.25 incq %rbx
|
||||||
|
1 1 0.25 cmpq %rbx, %r9
|
||||||
|
1 1 0.50 jne .LBB5_12
|
||||||
|
|
||||||
|
|
||||||
|
Resources:
|
||||||
|
[0] - SKXDivider
|
||||||
|
[1] - SKXFPDivider
|
||||||
|
[2] - SKXPort0
|
||||||
|
[3] - SKXPort1
|
||||||
|
[4] - SKXPort2
|
||||||
|
[5] - SKXPort3
|
||||||
|
[6] - SKXPort4
|
||||||
|
[7] - SKXPort5
|
||||||
|
[8] - SKXPort6
|
||||||
|
[9] - SKXPort7
|
||||||
|
|
||||||
|
|
||||||
|
Resource pressure per iteration:
|
||||||
|
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
|
||||||
|
- - 45.53 20.45 5.50 5.50 - 44.64 18.38 -
|
||||||
|
|
||||||
|
Resource pressure by instruction:
|
||||||
|
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
|
||||||
|
- - - - 0.50 0.50 - - - - movslq (%r10,%rbx,4), %rcx
|
||||||
|
- - - 0.99 - - - 0.01 - - leaq (%rcx,%rcx,2), %rdx
|
||||||
|
- - 0.01 - - - - - 0.99 - shlq $6, %rdx
|
||||||
|
- - 0.01 0.99 0.49 0.51 - - - - vmovupd (%rsi,%rdx), %zmm28
|
||||||
|
- - 0.01 0.91 0.51 0.49 - 0.08 - - vmovupd 64(%rsi,%rdx), %zmm29
|
||||||
|
- - 0.01 0.56 0.49 0.51 - 0.43 - - vmovupd 128(%rsi,%rdx), %zmm30
|
||||||
|
- - - 0.99 0.50 0.50 - 0.01 - - vmovupd 16(%rsp), %zmm3
|
||||||
|
- - 0.95 - - - - 0.05 - - vsubpd %zmm28, %zmm3, %zmm3
|
||||||
|
- - 0.48 - - - - 0.52 - - vsubpd %zmm30, %zmm24, %zmm31
|
||||||
|
- - - 1.00 0.50 0.50 - - - - vmovupd 336(%rsp), %zmm16
|
||||||
|
- - 0.49 - - - - 0.51 - - vsubpd %zmm29, %zmm16, %zmm16
|
||||||
|
- - 0.48 - - - - 0.52 - - vmulpd %zmm31, %zmm31, %zmm17
|
||||||
|
- - 0.49 - - - - 0.51 - - vfmadd231pd %zmm16, %zmm16, %zmm17
|
||||||
|
- - 0.04 - - - - 0.96 - - vfmadd231pd %zmm3, %zmm3, %zmm17
|
||||||
|
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm17, %zmm18
|
||||||
|
- - 1.00 - - - - - - - vmulpd %zmm18, %zmm21, %zmm19
|
||||||
|
- - 0.51 - - - - 0.49 - - vmulpd %zmm19, %zmm18, %zmm19
|
||||||
|
- - 0.49 - - - - 0.51 - - vmulpd %zmm19, %zmm18, %zmm19
|
||||||
|
- - 0.07 - - - - 0.93 - - vaddpd %zmm1, %zmm19, %zmm20
|
||||||
|
- - - - - - - 1.00 - - vmulpd %zmm18, %zmm22, %zmm18
|
||||||
|
- - 0.95 - - - - 0.05 - - vmulpd %zmm20, %zmm18, %zmm18
|
||||||
|
- - 0.92 - - - - 0.08 - - vsubpd %zmm30, %zmm25, %zmm20
|
||||||
|
- - - 0.94 - - - 0.06 - - leal (%rcx,%rcx), %edx
|
||||||
|
- - - - - - - - 1.00 - cmpq %rdx, %r11
|
||||||
|
- - - - - - - - 1.00 - setne %dl
|
||||||
|
- - 0.44 - - - - - 0.56 - sete %al
|
||||||
|
- - - 0.07 - - - 0.02 0.91 - addl %ecx, %ecx
|
||||||
|
- - - 0.53 - - - 0.46 0.01 - incl %ecx
|
||||||
|
- - - 0.51 - - - 0.46 0.03 - cmpq %rcx, %r11
|
||||||
|
- - 0.02 - - - - - 0.98 - sete %cl
|
||||||
|
- - 0.94 - - - - 0.06 - - vmulpd %zmm18, %zmm19, %zmm18
|
||||||
|
- - 0.01 0.99 0.51 0.49 - - - - vmovupd 528(%rsp), %zmm19
|
||||||
|
- - 0.47 - - - - 0.53 - - vsubpd %zmm28, %zmm19, %zmm19
|
||||||
|
- - 0.04 - - - - - 0.96 - setne %dil
|
||||||
|
- - - 0.95 - - - 0.02 0.03 - movl %edi, %ebp
|
||||||
|
- - 0.01 - - - - - 0.99 - shlb $4, %bpl
|
||||||
|
- - - 0.96 - - - - 0.04 - subb %al, %bpl
|
||||||
|
- - - 0.06 - - - - 0.94 - addb $-17, %bpl
|
||||||
|
- - - - - - - 1.00 - - kmovd %ebp, %k1
|
||||||
|
- - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm17, %k1 {%k1}
|
||||||
|
- - 0.02 0.97 0.50 0.50 - 0.01 - - vmovupd 272(%rsp), %zmm17
|
||||||
|
- - 0.96 - - - - 0.04 - - vsubpd %zmm29, %zmm17, %zmm17
|
||||||
|
- - - 1.00 - - - - - - leal (%rdx,%rdx), %eax
|
||||||
|
- - - 0.05 - - - - 0.95 - movl %edi, %ebp
|
||||||
|
- - 0.51 - - - - 0.49 - - vmulpd %zmm2, %zmm18, %zmm18
|
||||||
|
- - 0.53 - - - - 0.47 - - vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1}
|
||||||
|
- - 0.45 - - - - 0.55 - - vmulpd %zmm20, %zmm20, %zmm3
|
||||||
|
- - 0.94 - - - - 0.06 - - vfmadd231pd %zmm17, %zmm17, %zmm3
|
||||||
|
- - 0.03 - - - - 0.97 - - vfmadd231pd %zmm19, %zmm19, %zmm3
|
||||||
|
- - 0.47 - - - - 0.53 - - vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1}
|
||||||
|
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm3, %zmm16
|
||||||
|
- - 0.53 - - - - 0.47 - - vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1}
|
||||||
|
- - 0.99 - - - - 0.01 - - vmulpd %zmm16, %zmm21, %zmm18
|
||||||
|
- - 0.99 - - - - 0.01 - - vmulpd %zmm18, %zmm16, %zmm18
|
||||||
|
- - 0.97 - - - - 0.03 - - vmulpd %zmm18, %zmm16, %zmm18
|
||||||
|
- - 0.52 - - - - 0.48 - - vaddpd %zmm1, %zmm18, %zmm31
|
||||||
|
- - 0.01 - - - - 0.99 - - vmulpd %zmm16, %zmm22, %zmm16
|
||||||
|
- - 0.52 - - - - 0.48 - - vmulpd %zmm31, %zmm16, %zmm16
|
||||||
|
- - - 0.99 0.50 0.50 - 0.01 - - vmovupd 464(%rsp), %zmm31
|
||||||
|
- - 0.03 - - - - 0.97 - - vsubpd %zmm28, %zmm31, %zmm31
|
||||||
|
- - 0.01 - - - - - 0.99 - shlb $5, %bpl
|
||||||
|
- - - 0.94 - - - - 0.06 - orb %al, %bpl
|
||||||
|
- - - 0.04 - - - - 0.96 - orb $-35, %bpl
|
||||||
|
- - - - - - - 1.00 - - kmovd %ebp, %k1
|
||||||
|
- - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm3, %k1 {%k1}
|
||||||
|
- - - 0.99 0.50 0.50 - 0.01 - - vmovupd 208(%rsp), %zmm3
|
||||||
|
- - 0.95 - - - - 0.05 - - vsubpd %zmm29, %zmm3, %zmm3
|
||||||
|
- - 0.51 - - - - 0.49 - - vmulpd %zmm16, %zmm18, %zmm16
|
||||||
|
- - 0.01 - - - - 0.99 - - vsubpd %zmm30, %zmm26, %zmm18
|
||||||
|
- - 0.52 - - - - 0.48 - - vmulpd %zmm2, %zmm16, %zmm16
|
||||||
|
- - 1.00 - - - - - - - vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1}
|
||||||
|
- - 0.03 - - - - 0.97 - - vmulpd %zmm18, %zmm18, %zmm19
|
||||||
|
- - 0.06 - - - - 0.94 - - vfmadd231pd %zmm3, %zmm3, %zmm19
|
||||||
|
- - 0.50 - - - - 0.50 - - vfmadd231pd %zmm31, %zmm31, %zmm19
|
||||||
|
- - - - - - - 1.00 - - vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1}
|
||||||
|
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm19, %zmm17
|
||||||
|
- - 1.00 - - - - - - - vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1}
|
||||||
|
- - 0.07 - - - - 0.93 - - vmulpd %zmm17, %zmm21, %zmm16
|
||||||
|
- - 0.50 - - - - 0.50 - - vmulpd %zmm16, %zmm17, %zmm16
|
||||||
|
- - 0.09 - - - - 0.91 - - vmulpd %zmm16, %zmm17, %zmm16
|
||||||
|
- - 0.07 - - - - 0.93 - - vaddpd %zmm1, %zmm16, %zmm20
|
||||||
|
- - 0.93 - - - - 0.07 - - vmulpd %zmm17, %zmm22, %zmm17
|
||||||
|
- - 0.94 - - - - 0.06 - - vmulpd %zmm20, %zmm17, %zmm17
|
||||||
|
- - 0.51 - - - - 0.49 - - vmulpd %zmm17, %zmm16, %zmm16
|
||||||
|
- - - 1.00 - - - - - - leal (,%rdx,4), %eax
|
||||||
|
- - - - - - - - 1.00 - shlb $6, %dil
|
||||||
|
- - - 0.02 - - - - 0.98 - orb %al, %dil
|
||||||
|
- - - 0.48 - - - - 0.52 - orb $-69, %dil
|
||||||
|
- - - - - - - 1.00 - - kmovd %edi, %k1
|
||||||
|
- - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm19, %k1 {%k1}
|
||||||
|
- - - 1.00 0.50 0.50 - - - - vmovupd 400(%rsp), %zmm17
|
||||||
|
- - 0.49 - - - - 0.51 - - vsubpd %zmm28, %zmm17, %zmm17
|
||||||
|
- - 0.49 - - - - 0.51 - - vsubpd %zmm29, %zmm23, %zmm19
|
||||||
|
- - 0.02 - - - - 0.98 - - vsubpd %zmm30, %zmm27, %zmm20
|
||||||
|
- - 0.98 - - - - 0.02 - - vmulpd %zmm2, %zmm16, %zmm16
|
||||||
|
- - 0.50 - - - - 0.50 - - vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1}
|
||||||
|
- - 0.94 - - - - 0.06 - - vmulpd %zmm20, %zmm20, %zmm28
|
||||||
|
- - 0.04 - - - - 0.96 - - vfmadd231pd %zmm19, %zmm19, %zmm28
|
||||||
|
- - 0.07 - - - - 0.93 - - vfmadd231pd %zmm17, %zmm17, %zmm28
|
||||||
|
- - 0.50 - - - - 0.50 - - vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1}
|
||||||
|
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm28, %zmm3
|
||||||
|
- - 0.50 - - - - 0.50 - - vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1}
|
||||||
|
- - 1.00 - - - - - - - vmulpd %zmm3, %zmm21, %zmm16
|
||||||
|
- - 0.55 - - - - 0.45 - - vmulpd %zmm16, %zmm3, %zmm16
|
||||||
|
- - 0.99 - - - - 0.01 - - vmulpd %zmm16, %zmm3, %zmm16
|
||||||
|
- - 0.99 - - - - 0.01 - - vaddpd %zmm1, %zmm16, %zmm18
|
||||||
|
- - - - - - - 1.00 - - vmulpd %zmm3, %zmm22, %zmm3
|
||||||
|
- - 0.52 - - - - 0.48 - - vmulpd %zmm18, %zmm3, %zmm3
|
||||||
|
- - 0.99 - - - - 0.01 - - vmulpd %zmm3, %zmm16, %zmm3
|
||||||
|
- - - - - - - - 1.00 - shlb $3, %dl
|
||||||
|
- - - - - - - - 1.00 - shlb $7, %cl
|
||||||
|
- - - 1.00 - - - - - - orb %dl, %cl
|
||||||
|
- - - 0.52 - - - - 0.48 - addb $-9, %cl
|
||||||
|
- - - - - - - 1.00 - - kmovd %ecx, %k1
|
||||||
|
- - - - - - - 1.00 - - vcmpltpd %zmm0, %zmm28, %k1 {%k1}
|
||||||
|
- - 0.98 - - - - 0.02 - - vmulpd %zmm2, %zmm3, %zmm3
|
||||||
|
- - 0.97 - - - - 0.03 - - vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1}
|
||||||
|
- - 0.03 - - - - 0.97 - - vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1}
|
||||||
|
- - 0.97 - - - - 0.03 - - vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1}
|
||||||
|
- - - 0.48 - - - - 0.52 - incq %rbx
|
||||||
|
- - - 0.52 - - - - 0.48 - cmpq %rbx, %r9
|
||||||
|
- - - - - - - - 1.00 - jne .LBB5_12
|
167
static_analysis/jan/analyses/gromacs-icx-avx512-dp-osaca-icx.out
Normal file
167
static_analysis/jan/analyses/gromacs-icx-avx512-dp-osaca-icx.out
Normal file
@@ -0,0 +1,167 @@
|
|||||||
|
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||||
|
Analyzed file: gromacs-icx-avx512-dp.s
|
||||||
|
Architecture: ICX
|
||||||
|
Timestamp: 2023-02-14 12:51:57
|
||||||
|
|
||||||
|
|
||||||
|
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||||
|
* - Instruction micro-ops not bound to a port
|
||||||
|
X - No throughput/latency information for this instruction in data file
|
||||||
|
|
||||||
|
|
||||||
|
Combined Analysis Report
|
||||||
|
------------------------
|
||||||
|
Port pressure in cycles
|
||||||
|
| 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD |
|
||||||
|
------------------------------------------------------------------------------------------------------------------------
|
||||||
|
2241 | | | | | | | | | | || | | # pointer_increment=64 da67166e5736661e6b03ea29ee7bfd67
|
||||||
|
2242 | | | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||||
|
2243 | | | | | | | | | | || | | .LBB5_12: # Parent Loop BB5_7 Depth=1
|
||||||
|
2244 | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2
|
||||||
|
2245 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | movslq (%r10,%rbx,4), %rcx
|
||||||
|
2246 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || 6.0 | | leaq (%rcx,%rcx,2), %rdx
|
||||||
|
2247 | 0.00 | | | | | | 1.00 | | | || 1.0 | | shlq $6, %rdx
|
||||||
|
2248 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd (%rsi,%rdx), %zmm28 # AlignMOV convert to UnAlignMOV
|
||||||
|
2249 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 64(%rsi,%rdx), %zmm29 # AlignMOV convert to UnAlignMOV
|
||||||
|
2250 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovupd 128(%rsi,%rdx), %zmm30 # AlignMOV convert to UnAlignMOV
|
||||||
|
2251 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 16(%rsp), %zmm3 # 64-byte Reload
|
||||||
|
2252 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm28, %zmm3, %zmm3
|
||||||
|
2253 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vsubpd %zmm30, %zmm24, %zmm31
|
||||||
|
2254 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 336(%rsp), %zmm16 # 64-byte Reload
|
||||||
|
2255 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm16, %zmm16
|
||||||
|
2256 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm31, %zmm31, %zmm17
|
||||||
|
2257 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm16, %zmm16, %zmm17 # zmm17 = (zmm16 * zmm16) + zmm17
|
||||||
|
2258 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm3, %zmm3, %zmm17 # zmm17 = (zmm3 * zmm3) + zmm17
|
||||||
|
2259 | 2.50 | | | | | 0.500 | | | | || 6.0 | | vrcp14pd %zmm17, %zmm18
|
||||||
|
2260 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm18, %zmm21, %zmm19
|
||||||
|
2261 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm19, %zmm18, %zmm19
|
||||||
|
2262 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm19, %zmm18, %zmm19
|
||||||
|
2263 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vaddpd %zmm1, %zmm19, %zmm20
|
||||||
|
2264 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm18, %zmm22, %zmm18
|
||||||
|
2265 | 0.75 | | | | | 0.250 | | | | || 4.0 | | vmulpd %zmm20, %zmm18, %zmm18
|
||||||
|
2266 | 1.00 | | | | | 0.000 | | | | || | | vsubpd %zmm30, %zmm25, %zmm20
|
||||||
|
2267 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | leal (%rcx,%rcx), %edx
|
||||||
|
2268 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | cmpq %rdx, %r11
|
||||||
|
2269 | 0.00 | | | | | | 1.00 | | | || | | setne %dl
|
||||||
|
2270 | 0.00 | | | | | | 1.00 | | | || | | sete %al
|
||||||
|
2271 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | addl %ecx, %ecx
|
||||||
|
2272 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | incl %ecx
|
||||||
|
2273 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | cmpq %rcx, %r11
|
||||||
|
2274 | 0.00 | | | | | | 1.00 | | | || | | sete %cl
|
||||||
|
2275 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm18, %zmm19, %zmm18
|
||||||
|
2276 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 528(%rsp), %zmm19 # 64-byte Reload
|
||||||
|
2277 | 1.00 | | | | | 0.000 | | | | || | | vsubpd %zmm28, %zmm19, %zmm19
|
||||||
|
2278 | 0.00 | | | | | | 1.00 | | | || | | setne %dil
|
||||||
|
2279 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | movl %edi, %ebp
|
||||||
|
2280 | 0.00 | | | | | | 1.00 | | | || | 1.0 | shlb $4, %bpl
|
||||||
|
2281 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | 1.0 | subb %al, %bpl
|
||||||
|
2282 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | 1.0 | addb $-17, %bpl
|
||||||
|
2283 | 1.00 | | | | | | | | | || | | kmovd %ebp, %k1
|
||||||
|
2284 | 0.50 | | | | | 0.500 | | | | || | | vcmpltpd %zmm0, %zmm17, %k1 {%k1}
|
||||||
|
2285 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 272(%rsp), %zmm17 # 64-byte Reload
|
||||||
|
2286 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm17, %zmm17
|
||||||
|
2287 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | leal (%rdx,%rdx), %eax
|
||||||
|
2288 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | movl %edi, %ebp
|
||||||
|
2289 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm2, %zmm18, %zmm18
|
||||||
|
2290 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14
|
||||||
|
2291 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm20, %zmm20, %zmm3
|
||||||
|
2292 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm17, %zmm17, %zmm3 # zmm3 = (zmm17 * zmm17) + zmm3
|
||||||
|
2293 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm19, %zmm19, %zmm3 # zmm3 = (zmm19 * zmm19) + zmm3
|
||||||
|
2294 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11
|
||||||
|
2295 | 2.50 | | | | | 0.500 | | | | || | | vrcp14pd %zmm3, %zmm16
|
||||||
|
2296 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7
|
||||||
|
2297 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm21, %zmm18
|
||||||
|
2298 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm18, %zmm16, %zmm18
|
||||||
|
2299 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm18, %zmm16, %zmm18
|
||||||
|
2300 | 0.50 | | | | | 0.500 | | | | || | | vaddpd %zmm1, %zmm18, %zmm31
|
||||||
|
2301 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm22, %zmm16
|
||||||
|
2302 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm31, %zmm16, %zmm16
|
||||||
|
2303 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 464(%rsp), %zmm31 # 64-byte Reload
|
||||||
|
2304 | 0.75 | | | | | 0.250 | | | | || | | vsubpd %zmm28, %zmm31, %zmm31
|
||||||
|
2305 | 0.00 | | | | | | 1.00 | | | || | 1.0 | shlb $5, %bpl
|
||||||
|
2306 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | 1.0 | orb %al, %bpl
|
||||||
|
2307 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | 1.0 | orb $-35, %bpl
|
||||||
|
2308 | 1.00 | | | | | | | | | || | | kmovd %ebp, %k1
|
||||||
|
2309 | 0.50 | | | | | 0.500 | | | | || | | vcmpltpd %zmm0, %zmm3, %k1 {%k1}
|
||||||
|
2310 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 208(%rsp), %zmm3 # 64-byte Reload
|
||||||
|
2311 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm3, %zmm3
|
||||||
|
2312 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm18, %zmm16
|
||||||
|
2313 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm30, %zmm26, %zmm18
|
||||||
|
2314 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm2, %zmm16, %zmm16
|
||||||
|
2315 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15
|
||||||
|
2316 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm18, %zmm18, %zmm19
|
||||||
|
2317 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm3, %zmm3, %zmm19 # zmm19 = (zmm3 * zmm3) + zmm19
|
||||||
|
2318 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm31, %zmm31, %zmm19 # zmm19 = (zmm31 * zmm31) + zmm19
|
||||||
|
2319 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10
|
||||||
|
2320 | 2.50 | | | | | 0.500 | | | | || | | vrcp14pd %zmm19, %zmm17
|
||||||
|
2321 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6
|
||||||
|
2322 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm17, %zmm21, %zmm16
|
||||||
|
2323 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm17, %zmm16
|
||||||
|
2324 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm16, %zmm17, %zmm16
|
||||||
|
2325 | 0.50 | | | | | 0.500 | | | | || | | vaddpd %zmm1, %zmm16, %zmm20
|
||||||
|
2326 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm17, %zmm22, %zmm17
|
||||||
|
2327 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm20, %zmm17, %zmm17
|
||||||
|
2328 | 0.75 | | | | | 0.250 | | | | || | | vmulpd %zmm17, %zmm16, %zmm16
|
||||||
|
2329 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | leal (,%rdx,4), %eax
|
||||||
|
2330 | 0.00 | | | | | | 1.00 | | | || | | shlb $6, %dil
|
||||||
|
2331 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | orb %al, %dil
|
||||||
|
2332 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | orb $-69, %dil
|
||||||
|
2333 | 1.00 | | | | | | | | | || | | kmovd %edi, %k1
|
||||||
|
2334 | 0.50 | | | | | 0.500 | | | | || | | vcmpltpd %zmm0, %zmm19, %k1 {%k1}
|
||||||
|
2335 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd 400(%rsp), %zmm17 # 64-byte Reload
|
||||||
|
2336 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm28, %zmm17, %zmm17
|
||||||
|
2337 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm29, %zmm23, %zmm19
|
||||||
|
2338 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm30, %zmm27, %zmm20
|
||||||
|
2339 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm2, %zmm16, %zmm16
|
||||||
|
2340 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13
|
||||||
|
2341 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm20, %zmm20, %zmm28
|
||||||
|
2342 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231pd %zmm19, %zmm19, %zmm28 # zmm28 = (zmm19 * zmm19) + zmm28
|
||||||
|
2343 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm17, %zmm17, %zmm28 # zmm28 = (zmm17 * zmm17) + zmm28
|
||||||
|
2344 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9
|
||||||
|
2345 | 2.00 | | | | | 1.000 | | | | || | | vrcp14pd %zmm28, %zmm3
|
||||||
|
2346 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5
|
||||||
|
2347 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm3, %zmm21, %zmm16
|
||||||
|
2348 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm16, %zmm3, %zmm16
|
||||||
|
2349 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm16, %zmm3, %zmm16
|
||||||
|
2350 | 0.00 | | | | | 1.000 | | | | || | | vaddpd %zmm1, %zmm16, %zmm18
|
||||||
|
2351 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm3, %zmm22, %zmm3
|
||||||
|
2352 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm18, %zmm3, %zmm3
|
||||||
|
2353 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm3, %zmm16, %zmm3
|
||||||
|
2354 | 0.00 | | | | | | 1.00 | | | || | | shlb $3, %dl
|
||||||
|
2355 | 0.00 | | | | | | 1.00 | | | || | | shlb $7, %cl
|
||||||
|
2356 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | orb %dl, %cl
|
||||||
|
2357 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | addb $-9, %cl
|
||||||
|
2358 | 1.00 | | | | | | | | | || | | kmovd %ecx, %k1
|
||||||
|
2359 | 0.00 | | | | | 1.000 | | | | || | | vcmpltpd %zmm0, %zmm28, %k1 {%k1}
|
||||||
|
2360 | 0.00 | | | | | 1.000 | | | | || | | vmulpd %zmm2, %zmm3, %zmm3
|
||||||
|
2361 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12
|
||||||
|
2362 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8
|
||||||
|
2363 | 0.24 | | | | | 0.760 | | | | || | | vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4
|
||||||
|
2364 | 0.00 | 1.00 | | | | -0.01 | 0.00 | | | || | | incq %rbx
|
||||||
|
2365 | 0.00 | 1.00 | | | | -0.01 | 0.00 | | | || | | cmpq %rbx, %r9
|
||||||
|
2366 | | | | | | | | | | || | | * jne .LBB5_12
|
||||||
|
2367 | | | | | | | | | | || | | # LLVM-MCA-END
|
||||||
|
|
||||||
|
44.0 15.0 5.50 5.50 5.50 5.50 43.99 15.0 71 6.0
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Loop-Carried Dependencies Analysis Report
|
||||||
|
-----------------------------------------
|
||||||
|
2280 | 6.0 | shlb $4, %bpl | [2280, 2281, 2282, 2305, 2306, 2307]
|
||||||
|
2363 | 4.0 | vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4| [2363]
|
||||||
|
2362 | 4.0 | vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8| [2362]
|
||||||
|
2361 | 4.0 | vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12| [2361]
|
||||||
|
2346 | 4.0 | vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5| [2346]
|
||||||
|
2344 | 4.0 | vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9| [2344]
|
||||||
|
2340 | 4.0 | vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13| [2340]
|
||||||
|
2321 | 4.0 | vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6| [2321]
|
||||||
|
2319 | 4.0 | vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10| [2319]
|
||||||
|
2315 | 4.0 | vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15| [2315]
|
||||||
|
2296 | 4.0 | vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7| [2296]
|
||||||
|
2294 | 4.0 | vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11| [2294]
|
||||||
|
2290 | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14| [2290]
|
||||||
|
2330 | 3.0 | shlb $6, %dil | [2330, 2331, 2332]
|
||||||
|
2364 | 1.0 | incq %rbx | [2364]
|
||||||
|
|
167
static_analysis/jan/analyses/gromacs-icx-avx512-dp-osaca.out
Normal file
167
static_analysis/jan/analyses/gromacs-icx-avx512-dp-osaca.out
Normal file
@@ -0,0 +1,167 @@
|
|||||||
|
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||||
|
Analyzed file: gromacs-icx-avx512-dp.s
|
||||||
|
Architecture: CSX
|
||||||
|
Timestamp: 2023-02-10 16:30:53
|
||||||
|
|
||||||
|
|
||||||
|
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||||
|
* - Instruction micro-ops not bound to a port
|
||||||
|
X - No throughput/latency information for this instruction in data file
|
||||||
|
|
||||||
|
|
||||||
|
Combined Analysis Report
|
||||||
|
------------------------
|
||||||
|
Port pressure in cycles
|
||||||
|
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||||
|
--------------------------------------------------------------------------------------------------
|
||||||
|
2241 | | | | | | | | || | | # pointer_increment=64 da67166e5736661e6b03ea29ee7bfd67
|
||||||
|
2242 | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||||
|
2243 | | | | | | | | || | | .LBB5_12: # Parent Loop BB5_7 Depth=1
|
||||||
|
2244 | | | | | | | | || | | # => This Inner Loop Header: Depth=2
|
||||||
|
2245 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | movslq (%r10,%rbx,4), %rcx
|
||||||
|
2246 | | 1.00 | | | | 0.00 | | || 1.0 | | leaq (%rcx,%rcx,2), %rdx
|
||||||
|
2247 | 0.00 | | | | | | 1.00 | || 1.0 | | shlq $6, %rdx
|
||||||
|
2248 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd (%rsi,%rdx), %zmm28 # AlignMOV convert to UnAlignMOV
|
||||||
|
2249 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 64(%rsi,%rdx), %zmm29 # AlignMOV convert to UnAlignMOV
|
||||||
|
2250 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovupd 128(%rsi,%rdx), %zmm30 # AlignMOV convert to UnAlignMOV
|
||||||
|
2251 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 16(%rsp), %zmm3 # 64-byte Reload
|
||||||
|
2252 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm28, %zmm3, %zmm3
|
||||||
|
2253 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm30, %zmm24, %zmm31
|
||||||
|
2254 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 336(%rsp), %zmm16 # 64-byte Reload
|
||||||
|
2255 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm16, %zmm16
|
||||||
|
2256 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm31, %zmm31, %zmm17
|
||||||
|
2257 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm16, %zmm16, %zmm17 # zmm17 = (zmm16 * zmm16) + zmm17
|
||||||
|
2258 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm3, %zmm3, %zmm17 # zmm17 = (zmm3 * zmm3) + zmm17
|
||||||
|
2259 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm17, %zmm18
|
||||||
|
2260 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm18, %zmm21, %zmm19
|
||||||
|
2261 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm19, %zmm18, %zmm19
|
||||||
|
2262 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm19, %zmm18, %zmm19
|
||||||
|
2263 | 0.50 | | | | | 0.50 | | || 4.0 | | vaddpd %zmm1, %zmm19, %zmm20
|
||||||
|
2264 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm22, %zmm18
|
||||||
|
2265 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm20, %zmm18, %zmm18
|
||||||
|
2266 | 1.00 | | | | | 0.00 | | || | | vsubpd %zmm30, %zmm25, %zmm20
|
||||||
|
2267 | | 1.00 | | | | 0.00 | | || | | leal (%rcx,%rcx), %edx
|
||||||
|
2268 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rdx, %r11
|
||||||
|
2269 | 0.00 | | | | | | 1.00 | || | | setne %dl
|
||||||
|
2270 | 0.00 | | | | | | 1.00 | || | | sete %al
|
||||||
|
2271 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | addl %ecx, %ecx
|
||||||
|
2272 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | incl %ecx
|
||||||
|
2273 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rcx, %r11
|
||||||
|
2274 | 0.00 | | | | | | 1.00 | || | | sete %cl
|
||||||
|
2275 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm18, %zmm19, %zmm18
|
||||||
|
2276 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 528(%rsp), %zmm19 # 64-byte Reload
|
||||||
|
2277 | 1.00 | | | | | 0.00 | | || | | vsubpd %zmm28, %zmm19, %zmm19
|
||||||
|
2278 | 0.00 | | | | | | 1.00 | || | | setne %dil
|
||||||
|
2279 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %edi, %ebp
|
||||||
|
2280 | 0.00 | | | | | | 1.00 | || | 1.0 | shlb $4, %bpl
|
||||||
|
2281 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | 1.0 | subb %al, %bpl
|
||||||
|
2282 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | 1.0 | addb $-17, %bpl
|
||||||
|
2283 | 1.00 | | | | | | | || | | kmovd %ebp, %k1
|
||||||
|
2284 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm17, %k1 {%k1}
|
||||||
|
2285 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 272(%rsp), %zmm17 # 64-byte Reload
|
||||||
|
2286 | 0.25 | | | | | 0.75 | | || | | vsubpd %zmm29, %zmm17, %zmm17
|
||||||
|
2287 | | 1.00 | | | | 0.00 | | || | | leal (%rdx,%rdx), %eax
|
||||||
|
2288 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl %edi, %ebp
|
||||||
|
2289 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm2, %zmm18, %zmm18
|
||||||
|
2290 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14
|
||||||
|
2291 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm20, %zmm3
|
||||||
|
2292 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm17, %zmm17, %zmm3 # zmm3 = (zmm17 * zmm17) + zmm3
|
||||||
|
2293 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm19, %zmm19, %zmm3 # zmm3 = (zmm19 * zmm19) + zmm3
|
||||||
|
2294 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11
|
||||||
|
2295 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm3, %zmm16
|
||||||
|
2296 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7
|
||||||
|
2297 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm21, %zmm18
|
||||||
|
2298 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm16, %zmm18
|
||||||
|
2299 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm16, %zmm18
|
||||||
|
2300 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm1, %zmm18, %zmm31
|
||||||
|
2301 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm22, %zmm16
|
||||||
|
2302 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm31, %zmm16, %zmm16
|
||||||
|
2303 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 464(%rsp), %zmm31 # 64-byte Reload
|
||||||
|
2304 | 0.75 | | | | | 0.25 | | || | | vsubpd %zmm28, %zmm31, %zmm31
|
||||||
|
2305 | 0.00 | | | | | | 1.00 | || | 1.0 | shlb $5, %bpl
|
||||||
|
2306 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | 1.0 | orb %al, %bpl
|
||||||
|
2307 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | 1.0 | orb $-35, %bpl
|
||||||
|
2308 | 1.00 | | | | | | | || | | kmovd %ebp, %k1
|
||||||
|
2309 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm3, %k1 {%k1}
|
||||||
|
2310 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 208(%rsp), %zmm3 # 64-byte Reload
|
||||||
|
2311 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm3, %zmm3
|
||||||
|
2312 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm18, %zmm16
|
||||||
|
2313 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm26, %zmm18
|
||||||
|
2314 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm2, %zmm16, %zmm16
|
||||||
|
2315 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15
|
||||||
|
2316 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm18, %zmm18, %zmm19
|
||||||
|
2317 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm3, %zmm3, %zmm19 # zmm19 = (zmm3 * zmm3) + zmm19
|
||||||
|
2318 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm31, %zmm31, %zmm19 # zmm19 = (zmm31 * zmm31) + zmm19
|
||||||
|
2319 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10
|
||||||
|
2320 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm19, %zmm17
|
||||||
|
2321 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6
|
||||||
|
2322 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm21, %zmm16
|
||||||
|
2323 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm17, %zmm16
|
||||||
|
2324 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm16, %zmm17, %zmm16
|
||||||
|
2325 | 0.50 | | | | | 0.50 | | || | | vaddpd %zmm1, %zmm16, %zmm20
|
||||||
|
2326 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm22, %zmm17
|
||||||
|
2327 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm17, %zmm17
|
||||||
|
2328 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm17, %zmm16, %zmm16
|
||||||
|
2329 | | 1.00 | | | | 0.00 | | || | | leal (,%rdx,4), %eax
|
||||||
|
2330 | 0.00 | | | | | | 1.00 | || | | shlb $6, %dil
|
||||||
|
2331 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | orb %al, %dil
|
||||||
|
2332 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | orb $-69, %dil
|
||||||
|
2333 | 1.00 | | | | | | | || | | kmovd %edi, %k1
|
||||||
|
2334 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm19, %k1 {%k1}
|
||||||
|
2335 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovupd 400(%rsp), %zmm17 # 64-byte Reload
|
||||||
|
2336 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm28, %zmm17, %zmm17
|
||||||
|
2337 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm29, %zmm23, %zmm19
|
||||||
|
2338 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm30, %zmm27, %zmm20
|
||||||
|
2339 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm2, %zmm16, %zmm16
|
||||||
|
2340 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13
|
||||||
|
2341 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm20, %zmm20, %zmm28
|
||||||
|
2342 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm19, %zmm19, %zmm28 # zmm28 = (zmm19 * zmm19) + zmm28
|
||||||
|
2343 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm17, %zmm17, %zmm28 # zmm28 = (zmm17 * zmm17) + zmm28
|
||||||
|
2344 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9
|
||||||
|
2345 | 2.50 | | | | | 0.50 | | || | | vrcp14pd %zmm28, %zmm3
|
||||||
|
2346 | 0.50 | | | | | 0.50 | | || | | vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5
|
||||||
|
2347 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm3, %zmm21, %zmm16
|
||||||
|
2348 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm16, %zmm3, %zmm16
|
||||||
|
2349 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm16, %zmm3, %zmm16
|
||||||
|
2350 | 0.00 | | | | | 1.00 | | || | | vaddpd %zmm1, %zmm16, %zmm18
|
||||||
|
2351 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm22, %zmm3
|
||||||
|
2352 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm18, %zmm3, %zmm3
|
||||||
|
2353 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm3, %zmm16, %zmm3
|
||||||
|
2354 | 0.00 | | | | | | 1.00 | || | | shlb $3, %dl
|
||||||
|
2355 | 0.00 | | | | | | 1.00 | || | | shlb $7, %cl
|
||||||
|
2356 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | | orb %dl, %cl
|
||||||
|
2357 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | addb $-9, %cl
|
||||||
|
2358 | 1.00 | | | | | | | || | | kmovd %ecx, %k1
|
||||||
|
2359 | | | | | | 1.00 | | || | | vcmpltpd %zmm0, %zmm28, %k1 {%k1}
|
||||||
|
2360 | 0.00 | | | | | 1.00 | | || | | vmulpd %zmm2, %zmm3, %zmm3
|
||||||
|
2361 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12
|
||||||
|
2362 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8
|
||||||
|
2363 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4
|
||||||
|
2364 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | incq %rbx
|
||||||
|
2365 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rbx, %r9
|
||||||
|
2366 | | | | | | | | || | | * jne .LBB5_12
|
||||||
|
2367 | | | | | | | | || | | # LLVM-MCA-END
|
||||||
|
|
||||||
|
44.0 15.0 5.50 5.50 5.50 5.50 44.0 15.0 66.0 6.0
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Loop-Carried Dependencies Analysis Report
|
||||||
|
-----------------------------------------
|
||||||
|
2280 | 6.0 | shlb $4, %bpl | [2280, 2281, 2282, 2305, 2306, 2307]
|
||||||
|
2363 | 4.0 | vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4| [2363]
|
||||||
|
2362 | 4.0 | vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8| [2362]
|
||||||
|
2361 | 4.0 | vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12| [2361]
|
||||||
|
2346 | 4.0 | vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5| [2346]
|
||||||
|
2344 | 4.0 | vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9| [2344]
|
||||||
|
2340 | 4.0 | vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13| [2340]
|
||||||
|
2321 | 4.0 | vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6| [2321]
|
||||||
|
2319 | 4.0 | vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10| [2319]
|
||||||
|
2315 | 4.0 | vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15| [2315]
|
||||||
|
2296 | 4.0 | vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7| [2296]
|
||||||
|
2294 | 4.0 | vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11| [2294]
|
||||||
|
2290 | 4.0 | vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14| [2290]
|
||||||
|
2330 | 3.0 | shlb $6, %dil | [2330, 2331, 2332]
|
||||||
|
2364 | 1.0 | incq %rbx | [2364]
|
||||||
|
|
162
static_analysis/jan/analyses/gromacs-icx-avx512-sp-iaca.out
Normal file
162
static_analysis/jan/analyses/gromacs-icx-avx512-sp-iaca.out
Normal file
@@ -0,0 +1,162 @@
|
|||||||
|
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||||
|
Analyzed File - gromacs-icx-avx512-sp.o
|
||||||
|
Binary Format - 64Bit
|
||||||
|
Architecture - SKX
|
||||||
|
Analysis Type - Throughput
|
||||||
|
|
||||||
|
Throughput Analysis Report
|
||||||
|
--------------------------
|
||||||
|
Block Throughput: 64.00 Cycles Throughput Bottleneck: Backend
|
||||||
|
Loop Count: 22
|
||||||
|
Port Binding In Cycles Per Iteration:
|
||||||
|
--------------------------------------------------------------------------------------------------
|
||||||
|
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||||
|
--------------------------------------------------------------------------------------------------
|
||||||
|
| Cycles | 50.0 0.0 | 7.0 | 9.5 8.1 | 9.5 7.9 | 3.0 | 50.0 | 7.0 | 0.0 |
|
||||||
|
--------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
DV - Divider pipe (on port 0)
|
||||||
|
D - Data fetch pipe (on ports 2 and 3)
|
||||||
|
F - Macro Fusion with the previous instruction occurred
|
||||||
|
* - instruction micro-ops not bound to a port
|
||||||
|
^ - Micro Fusion occurred
|
||||||
|
# - ESP Tracking sync uop was issued
|
||||||
|
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||||
|
X - instruction not supported, was not accounted in Analysis
|
||||||
|
|
||||||
|
| Num Of | Ports pressure in cycles | |
|
||||||
|
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||||
|
-----------------------------------------------------------------------------------------
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | movsxd rax, dword ptr [r11+rdx*4]
|
||||||
|
| 1* | | | | | | | | | mov rsi, rax
|
||||||
|
| 1 | | | | | | | 1.0 | | shl rsi, 0x5
|
||||||
|
| 1 | | 1.0 | | | | | | | lea rbx, ptr [rsi+rsi*2]
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm15, zmmword ptr [rdi+rbx*1]
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm16, zmmword ptr [rdi+rbx*1+0x20]
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm27, zmmword ptr [rdi+rbx*1+0x40]
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x80]
|
||||||
|
| 1 | | | | | | 1.0 | | | vsubps zmm24, zmm1, zmm15
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x140]
|
||||||
|
| 1 | 1.0 | | | | | | | | vsubps zmm25, zmm1, zmm16
|
||||||
|
| 1 | | | | | | 1.0 | | | vsubps zmm26, zmm9, zmm27
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp]
|
||||||
|
| 1 | 1.0 | | | | | | | | vsubps zmm21, zmm1, zmm15
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x100]
|
||||||
|
| 1 | | | | | | 1.0 | | | vsubps zmm22, zmm1, zmm16
|
||||||
|
| 1 | 1.0 | | | | | | | | vsubps zmm23, zmm10, zmm27
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x1c0]
|
||||||
|
| 1 | | | | | | 1.0 | | | vsubps zmm17, zmm1, zmm15
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0xc0]
|
||||||
|
| 1 | 1.0 | | | | | | | | vsubps zmm19, zmm1, zmm16
|
||||||
|
| 1 | | | | | | 1.0 | | | vsubps zmm20, zmm11, zmm27
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm1, zmmword ptr [rsp+0x180]
|
||||||
|
| 1 | 1.0 | | | | | | | | vsubps zmm18, zmm1, zmm15
|
||||||
|
| 1 | | | | | | 1.0 | | | vsubps zmm16, zmm8, zmm16
|
||||||
|
| 1 | 1.0 | | | | | | | | vsubps zmm15, zmm12, zmm27
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulps zmm27, zmm26, zmm26
|
||||||
|
| 1 | 1.0 | | | | | | | | vfmadd231ps zmm27, zmm25, zmm25
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm27, zmm24, zmm24
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulps zmm28, zmm23, zmm23
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm28, zmm22, zmm22
|
||||||
|
| 1 | 1.0 | | | | | | | | vfmadd231ps zmm28, zmm21, zmm21
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulps zmm29, zmm20, zmm20
|
||||||
|
| 1 | 1.0 | | | | | | | | vfmadd231ps zmm29, zmm19, zmm19
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm29, zmm17, zmm17
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulps zmm30, zmm15, zmm15
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm30, zmm16, zmm16
|
||||||
|
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm31, zmm27
|
||||||
|
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm1, zmm28
|
||||||
|
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm2, zmm29
|
||||||
|
| 1 | | | | | | 1.0 | | | vfmadd231ps zmm30, zmm18, zmm18
|
||||||
|
| 3 | 2.0 | | | | | 1.0 | | | vrcp14ps zmm3, zmm30
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulps zmm4, zmm6, zmm31
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulps zmm4, zmm31, zmm4
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulps zmm4, zmm31, zmm4
|
||||||
|
| 1 | 1.0 | | | | | | | | vaddps zmm5, zmm4, zmm13
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm7, zmm31
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulps zmm5, zmm31, zmm5
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm6, zmm1
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulps zmm31, zmm1, zmm31
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm1, zmm31
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulps zmm4, zmm4, zmm5
|
||||||
|
| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm31, zmm13
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulps zmm1, zmm7, zmm1
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulps zmm1, zmm1, zmm5
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulps zmm5, zmm6, zmm2
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulps zmm5, zmm2, zmm5
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulps zmm5, zmm2, zmm5
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulps zmm1, zmm31, zmm1
|
||||||
|
| 1 | 1.0 | | | | | | | | vaddps zmm31, zmm5, zmm13
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulps zmm2, zmm7, zmm2
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulps zmm2, zmm2, zmm31
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm6, zmm3
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulps zmm31, zmm3, zmm31
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulps zmm31, zmm3, zmm31
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulps zmm2, zmm5, zmm2
|
||||||
|
| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm31, zmm13
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulps zmm3, zmm7, zmm3
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulps zmm3, zmm3, zmm5
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulps zmm3, zmm31, zmm3
|
||||||
|
| 1* | | | | | | | | | xor esi, esi
|
||||||
|
| 1* | | | | | | | | | xor edi, edi
|
||||||
|
| 1 | | 1.0 | | | | | | | test eax, 0x7fffffff
|
||||||
|
| 1 | | | | | | | 1.0 | | setz sil
|
||||||
|
| 1 | | | | | | | 1.0 | | setnz dil
|
||||||
|
| 1 | | 1.0 | | | | | | | mov eax, 0xff
|
||||||
|
| 1 | | | | | | | 1.0 | | cmovz eax, r8d
|
||||||
|
| 1 | | 1.0 | | | | | | | mov ecx, 0xff
|
||||||
|
| 1 | | | | | | | 1.0 | | cmovz ecx, r9d
|
||||||
|
| 1 | | 1.0 | | | | | | | xor esi, 0xff
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovd k1, esi
|
||||||
|
| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm27, zmm0, 0x1
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulps zmm4, zmm4, zmm14
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulps zmm5{k1}{z}, zmm24, zmm4
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulps zmm24{k1}{z}, zmm25, zmm4
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulps zmm4{k1}{z}, zmm26, zmm4
|
||||||
|
| 1 | | 1.0 | | | | | | | lea esi, ptr [rdi+rdi*2]
|
||||||
|
| 1 | | | | | | | 1.0 | | or esi, 0xfc
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovd k1, esi
|
||||||
|
| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm28, zmm0, 0x1
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulps zmm1, zmm1, zmm14
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulps zmm21{k1}{z}, zmm21, zmm1
|
||||||
|
| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm5, zmm21
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulps zmm21{k1}{z}, zmm22, zmm1
|
||||||
|
| 1 | | | | | | 1.0 | | | vaddps zmm21, zmm24, zmm21
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulps zmm1{k1}{z}, zmm23, zmm1
|
||||||
|
| 1 | | | | | | 1.0 | | | vaddps zmm1, zmm4, zmm1
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovd k1, eax
|
||||||
|
| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm29, zmm0, 0x1
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulps zmm2, zmm2, zmm14
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulps zmm4{k1}{z}, zmm17, zmm2
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulps zmm17{k1}{z}, zmm19, zmm2
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulps zmm2{k1}{z}, zmm20, zmm2
|
||||||
|
| 1 | | | | | | 1.0 | | | kmovd k1, ecx
|
||||||
|
| 1 | | | | | | 1.0 | | | vcmpps k1{k1}, zmm30, zmm0, 0x1
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulps zmm3, zmm3, zmm14
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulps zmm18{k1}{z}, zmm18, zmm3
|
||||||
|
| 1 | 1.0 | | | | | | | | vaddps zmm4, zmm4, zmm18
|
||||||
|
| 1 | | | | | | 1.0 | | | vaddps zmm4, zmm5, zmm4
|
||||||
|
| 1 | 1.0 | | | | | | | | vmulps zmm5{k1}{z}, zmm16, zmm3
|
||||||
|
| 1 | | | | | | 1.0 | | | vaddps zmm5, zmm17, zmm5
|
||||||
|
| 1 | 1.0 | | | | | | | | vaddps zmm5, zmm21, zmm5
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulps zmm3{k1}{z}, zmm15, zmm3
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov rax, qword ptr [r15+0xb0]
|
||||||
|
| 1 | 1.0 | | | | | | | | vaddps zmm2, zmm2, zmm3
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm3, zmmword ptr [rax+rbx*1]
|
||||||
|
| 1 | | | | | | 1.0 | | | vsubps zmm3, zmm3, zmm4
|
||||||
|
| 2 | | | 0.5 | 0.5 | 1.0 | | | | vmovups zmmword ptr [rax+rbx*1], zmm3
|
||||||
|
| 1 | 1.0 | | | | | | | | vaddps zmm1, zmm1, zmm2
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm2, zmmword ptr [rax+rbx*1+0x20]
|
||||||
|
| 1 | | | | | | 1.0 | | | vsubps zmm2, zmm2, zmm5
|
||||||
|
| 2 | | | 0.5 | 0.5 | 1.0 | | | | vmovups zmmword ptr [rax+rbx*1+0x20], zmm2
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups zmm2, zmmword ptr [rax+rbx*1+0x40]
|
||||||
|
| 1 | 1.0 | | | | | | | | vsubps zmm1, zmm2, zmm1
|
||||||
|
| 2 | | | 0.5 | 0.5 | 1.0 | | | | vmovups zmmword ptr [rax+rbx*1+0x40], zmm1
|
||||||
|
| 1* | | | | | | | | | cmp r10, rdx
|
||||||
|
| 0*F | | | | | | | | | jz 0x34
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | mov rdi, qword ptr [r15+0xa0]
|
||||||
|
| 1 | | 1.0 | | | | | | | inc rdx
|
||||||
|
| 1 | | | | | | | 1.0 | | jmp 0xfffffffffffffcfc
|
||||||
|
Total Num Of Uops: 140
|
||||||
|
Analysis Notes:
|
||||||
|
Backend allocation was stalled due to unavailable allocation resources.
|
304
static_analysis/jan/analyses/gromacs-icx-avx512-sp-mca.out
Normal file
304
static_analysis/jan/analyses/gromacs-icx-avx512-sp-mca.out
Normal file
@@ -0,0 +1,304 @@
|
|||||||
|
|
||||||
|
[0] Code Region
|
||||||
|
|
||||||
|
Iterations: 100
|
||||||
|
Instructions: 13000
|
||||||
|
Total Cycles: 5640
|
||||||
|
Total uOps: 15400
|
||||||
|
|
||||||
|
Dispatch Width: 6
|
||||||
|
uOps Per Cycle: 2.73
|
||||||
|
IPC: 2.30
|
||||||
|
Block RThroughput: 40.0
|
||||||
|
|
||||||
|
|
||||||
|
Instruction Info:
|
||||||
|
[1]: #uOps
|
||||||
|
[2]: Latency
|
||||||
|
[3]: RThroughput
|
||||||
|
[4]: MayLoad
|
||||||
|
[5]: MayStore
|
||||||
|
[6]: HasSideEffects (U)
|
||||||
|
|
||||||
|
[1] [2] [3] [4] [5] [6] Instructions:
|
||||||
|
1 5 0.50 * movslq (%r11,%rdx,4), %rax
|
||||||
|
1 1 0.25 movq %rax, %rsi
|
||||||
|
1 1 0.50 shlq $5, %rsi
|
||||||
|
1 1 0.50 leaq (%rsi,%rsi,2), %rbx
|
||||||
|
2 8 0.50 * vmovups (%rdi,%rbx), %zmm15
|
||||||
|
2 8 0.50 * vmovups 32(%rdi,%rbx), %zmm16
|
||||||
|
2 8 0.50 * vmovups 64(%rdi,%rbx), %zmm27
|
||||||
|
2 8 0.50 * vmovups 128(%rsp), %zmm1
|
||||||
|
1 4 0.50 vsubps %zmm15, %zmm1, %zmm24
|
||||||
|
2 8 0.50 * vmovups 320(%rsp), %zmm1
|
||||||
|
1 4 0.50 vsubps %zmm16, %zmm1, %zmm25
|
||||||
|
1 4 0.50 vsubps %zmm27, %zmm9, %zmm26
|
||||||
|
2 8 0.50 * vmovups (%rsp), %zmm1
|
||||||
|
1 4 0.50 vsubps %zmm15, %zmm1, %zmm21
|
||||||
|
2 8 0.50 * vmovups 256(%rsp), %zmm1
|
||||||
|
1 4 0.50 vsubps %zmm16, %zmm1, %zmm22
|
||||||
|
1 4 0.50 vsubps %zmm27, %zmm10, %zmm23
|
||||||
|
2 8 0.50 * vmovups 448(%rsp), %zmm1
|
||||||
|
1 4 0.50 vsubps %zmm15, %zmm1, %zmm17
|
||||||
|
2 8 0.50 * vmovups 192(%rsp), %zmm1
|
||||||
|
1 4 0.50 vsubps %zmm16, %zmm1, %zmm19
|
||||||
|
1 4 0.50 vsubps %zmm27, %zmm11, %zmm20
|
||||||
|
2 8 0.50 * vmovups 384(%rsp), %zmm1
|
||||||
|
1 4 0.50 vsubps %zmm15, %zmm1, %zmm18
|
||||||
|
1 4 0.50 vsubps %zmm16, %zmm8, %zmm16
|
||||||
|
1 4 0.50 vsubps %zmm27, %zmm12, %zmm15
|
||||||
|
1 4 0.50 vmulps %zmm26, %zmm26, %zmm27
|
||||||
|
1 4 0.50 vfmadd231ps %zmm25, %zmm25, %zmm27
|
||||||
|
1 4 0.50 vfmadd231ps %zmm24, %zmm24, %zmm27
|
||||||
|
1 4 0.50 vmulps %zmm23, %zmm23, %zmm28
|
||||||
|
1 4 0.50 vfmadd231ps %zmm22, %zmm22, %zmm28
|
||||||
|
1 4 0.50 vfmadd231ps %zmm21, %zmm21, %zmm28
|
||||||
|
1 4 0.50 vmulps %zmm20, %zmm20, %zmm29
|
||||||
|
1 4 0.50 vfmadd231ps %zmm19, %zmm19, %zmm29
|
||||||
|
1 4 0.50 vfmadd231ps %zmm17, %zmm17, %zmm29
|
||||||
|
1 4 0.50 vmulps %zmm15, %zmm15, %zmm30
|
||||||
|
1 4 0.50 vfmadd231ps %zmm16, %zmm16, %zmm30
|
||||||
|
3 4 2.00 vrcp14ps %zmm27, %zmm31
|
||||||
|
3 4 2.00 vrcp14ps %zmm28, %zmm1
|
||||||
|
3 4 2.00 vrcp14ps %zmm29, %zmm2
|
||||||
|
1 4 0.50 vfmadd231ps %zmm18, %zmm18, %zmm30
|
||||||
|
3 4 2.00 vrcp14ps %zmm30, %zmm3
|
||||||
|
1 4 0.50 vmulps %zmm31, %zmm6, %zmm4
|
||||||
|
1 4 0.50 vmulps %zmm4, %zmm31, %zmm4
|
||||||
|
1 4 0.50 vmulps %zmm4, %zmm31, %zmm4
|
||||||
|
1 4 0.50 vaddps %zmm13, %zmm4, %zmm5
|
||||||
|
1 4 0.50 vmulps %zmm31, %zmm7, %zmm31
|
||||||
|
1 4 0.50 vmulps %zmm5, %zmm31, %zmm5
|
||||||
|
1 4 0.50 vmulps %zmm1, %zmm6, %zmm31
|
||||||
|
1 4 0.50 vmulps %zmm31, %zmm1, %zmm31
|
||||||
|
1 4 0.50 vmulps %zmm31, %zmm1, %zmm31
|
||||||
|
1 4 0.50 vmulps %zmm5, %zmm4, %zmm4
|
||||||
|
1 4 0.50 vaddps %zmm13, %zmm31, %zmm5
|
||||||
|
1 4 0.50 vmulps %zmm1, %zmm7, %zmm1
|
||||||
|
1 4 0.50 vmulps %zmm5, %zmm1, %zmm1
|
||||||
|
1 4 0.50 vmulps %zmm2, %zmm6, %zmm5
|
||||||
|
1 4 0.50 vmulps %zmm5, %zmm2, %zmm5
|
||||||
|
1 4 0.50 vmulps %zmm5, %zmm2, %zmm5
|
||||||
|
1 4 0.50 vmulps %zmm1, %zmm31, %zmm1
|
||||||
|
1 4 0.50 vaddps %zmm13, %zmm5, %zmm31
|
||||||
|
1 4 0.50 vmulps %zmm2, %zmm7, %zmm2
|
||||||
|
1 4 0.50 vmulps %zmm31, %zmm2, %zmm2
|
||||||
|
1 4 0.50 vmulps %zmm3, %zmm6, %zmm31
|
||||||
|
1 4 0.50 vmulps %zmm31, %zmm3, %zmm31
|
||||||
|
1 4 0.50 vmulps %zmm31, %zmm3, %zmm31
|
||||||
|
1 4 0.50 vmulps %zmm2, %zmm5, %zmm2
|
||||||
|
1 4 0.50 vaddps %zmm13, %zmm31, %zmm5
|
||||||
|
1 4 0.50 vmulps %zmm3, %zmm7, %zmm3
|
||||||
|
1 4 0.50 vmulps %zmm5, %zmm3, %zmm3
|
||||||
|
1 4 0.50 vmulps %zmm3, %zmm31, %zmm3
|
||||||
|
1 0 0.17 xorl %esi, %esi
|
||||||
|
1 0 0.17 xorl %edi, %edi
|
||||||
|
1 1 0.25 testl $2147483647, %eax
|
||||||
|
1 1 0.50 sete %sil
|
||||||
|
1 1 0.50 setne %dil
|
||||||
|
1 1 0.25 movl $255, %eax
|
||||||
|
1 1 0.50 cmovel %r8d, %eax
|
||||||
|
1 1 0.25 movl $255, %ecx
|
||||||
|
1 1 0.50 cmovel %r9d, %ecx
|
||||||
|
1 1 0.25 xorl $255, %esi
|
||||||
|
1 1 1.00 kmovd %esi, %k1
|
||||||
|
1 4 1.00 vcmpltps %zmm0, %zmm27, %k1 {%k1}
|
||||||
|
1 4 0.50 vmulps %zmm14, %zmm4, %zmm4
|
||||||
|
1 4 0.50 vmulps %zmm4, %zmm24, %zmm5 {%k1} {z}
|
||||||
|
1 4 0.50 vmulps %zmm4, %zmm25, %zmm24 {%k1} {z}
|
||||||
|
1 4 0.50 vmulps %zmm4, %zmm26, %zmm4 {%k1} {z}
|
||||||
|
1 1 0.50 leal (%rdi,%rdi,2), %esi
|
||||||
|
1 1 0.25 orl $252, %esi
|
||||||
|
1 1 1.00 kmovd %esi, %k1
|
||||||
|
1 4 1.00 vcmpltps %zmm0, %zmm28, %k1 {%k1}
|
||||||
|
1 4 0.50 vmulps %zmm14, %zmm1, %zmm1
|
||||||
|
1 4 0.50 vmulps %zmm1, %zmm21, %zmm21 {%k1} {z}
|
||||||
|
1 4 0.50 vaddps %zmm21, %zmm5, %zmm5
|
||||||
|
1 4 0.50 vmulps %zmm1, %zmm22, %zmm21 {%k1} {z}
|
||||||
|
1 4 0.50 vaddps %zmm21, %zmm24, %zmm21
|
||||||
|
1 4 0.50 vmulps %zmm1, %zmm23, %zmm1 {%k1} {z}
|
||||||
|
1 4 0.50 vaddps %zmm1, %zmm4, %zmm1
|
||||||
|
1 1 1.00 kmovd %eax, %k1
|
||||||
|
1 4 1.00 vcmpltps %zmm0, %zmm29, %k1 {%k1}
|
||||||
|
1 4 0.50 vmulps %zmm14, %zmm2, %zmm2
|
||||||
|
1 4 0.50 vmulps %zmm2, %zmm17, %zmm4 {%k1} {z}
|
||||||
|
1 4 0.50 vmulps %zmm2, %zmm19, %zmm17 {%k1} {z}
|
||||||
|
1 4 0.50 vmulps %zmm2, %zmm20, %zmm2 {%k1} {z}
|
||||||
|
1 1 1.00 kmovd %ecx, %k1
|
||||||
|
1 4 1.00 vcmpltps %zmm0, %zmm30, %k1 {%k1}
|
||||||
|
1 4 0.50 vmulps %zmm14, %zmm3, %zmm3
|
||||||
|
1 4 0.50 vmulps %zmm3, %zmm18, %zmm18 {%k1} {z}
|
||||||
|
1 4 0.50 vaddps %zmm18, %zmm4, %zmm4
|
||||||
|
1 4 0.50 vaddps %zmm4, %zmm5, %zmm4
|
||||||
|
1 4 0.50 vmulps %zmm3, %zmm16, %zmm5 {%k1} {z}
|
||||||
|
1 4 0.50 vaddps %zmm5, %zmm17, %zmm5
|
||||||
|
1 4 0.50 vaddps %zmm5, %zmm21, %zmm5
|
||||||
|
1 4 0.50 vmulps %zmm3, %zmm15, %zmm3 {%k1} {z}
|
||||||
|
1 5 0.50 * movq 176(%r15), %rax
|
||||||
|
1 4 0.50 vaddps %zmm3, %zmm2, %zmm2
|
||||||
|
2 8 0.50 * vmovups (%rax,%rbx), %zmm3
|
||||||
|
1 4 0.50 vsubps %zmm4, %zmm3, %zmm3
|
||||||
|
2 1 1.00 * vmovups %zmm3, (%rax,%rbx)
|
||||||
|
1 4 0.50 vaddps %zmm2, %zmm1, %zmm1
|
||||||
|
2 8 0.50 * vmovups 32(%rax,%rbx), %zmm2
|
||||||
|
1 4 0.50 vsubps %zmm5, %zmm2, %zmm2
|
||||||
|
2 1 1.00 * vmovups %zmm2, 32(%rax,%rbx)
|
||||||
|
2 8 0.50 * vmovups 64(%rax,%rbx), %zmm2
|
||||||
|
1 4 0.50 vsubps %zmm1, %zmm2, %zmm1
|
||||||
|
2 1 1.00 * vmovups %zmm1, 64(%rax,%rbx)
|
||||||
|
1 1 0.25 cmpq %rdx, %r10
|
||||||
|
1 1 0.50 je .LBB4_18
|
||||||
|
1 5 0.50 * movq 160(%r15), %rdi
|
||||||
|
1 1 0.25 incq %rdx
|
||||||
|
1 1 0.50 jmp .LBB4_8
|
||||||
|
|
||||||
|
|
||||||
|
Resources:
|
||||||
|
[0] - SKXDivider
|
||||||
|
[1] - SKXFPDivider
|
||||||
|
[2] - SKXPort0
|
||||||
|
[3] - SKXPort1
|
||||||
|
[4] - SKXPort2
|
||||||
|
[5] - SKXPort3
|
||||||
|
[6] - SKXPort4
|
||||||
|
[7] - SKXPort5
|
||||||
|
[8] - SKXPort6
|
||||||
|
[9] - SKXPort7
|
||||||
|
|
||||||
|
|
||||||
|
Resource pressure per iteration:
|
||||||
|
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
|
||||||
|
- - 52.01 14.97 8.49 8.51 3.00 52.02 11.00 2.00
|
||||||
|
|
||||||
|
Resource pressure by instruction:
|
||||||
|
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
|
||||||
|
- - - - 0.49 0.51 - - - - movslq (%r11,%rdx,4), %rax
|
||||||
|
- - - - - - - - 1.00 - movq %rax, %rsi
|
||||||
|
- - - - - - - - 1.00 - shlq $5, %rsi
|
||||||
|
- - - 1.00 - - - - - - leaq (%rsi,%rsi,2), %rbx
|
||||||
|
- - 0.01 0.99 0.50 0.50 - - - - vmovups (%rdi,%rbx), %zmm15
|
||||||
|
- - - - 0.50 0.50 - 1.00 - - vmovups 32(%rdi,%rbx), %zmm16
|
||||||
|
- - - 1.00 0.50 0.50 - - - - vmovups 64(%rdi,%rbx), %zmm27
|
||||||
|
- - - 0.99 0.51 0.49 - 0.01 - - vmovups 128(%rsp), %zmm1
|
||||||
|
- - 0.01 - - - - 0.99 - - vsubps %zmm15, %zmm1, %zmm24
|
||||||
|
- - - 1.00 0.49 0.51 - - - - vmovups 320(%rsp), %zmm1
|
||||||
|
- - 0.99 - - - - 0.01 - - vsubps %zmm16, %zmm1, %zmm25
|
||||||
|
- - 0.01 - - - - 0.99 - - vsubps %zmm27, %zmm9, %zmm26
|
||||||
|
- - 0.01 0.99 0.51 0.49 - - - - vmovups (%rsp), %zmm1
|
||||||
|
- - - - - - - 1.00 - - vsubps %zmm15, %zmm1, %zmm21
|
||||||
|
- - - - 0.49 0.51 - 1.00 - - vmovups 256(%rsp), %zmm1
|
||||||
|
- - 0.01 - - - - 0.99 - - vsubps %zmm16, %zmm1, %zmm22
|
||||||
|
- - 0.99 - - - - 0.01 - - vsubps %zmm27, %zmm10, %zmm23
|
||||||
|
- - - 1.00 0.51 0.49 - - - - vmovups 448(%rsp), %zmm1
|
||||||
|
- - 0.01 - - - - 0.99 - - vsubps %zmm15, %zmm1, %zmm17
|
||||||
|
- - 0.01 - 0.49 0.51 - 0.99 - - vmovups 192(%rsp), %zmm1
|
||||||
|
- - - - - - - 1.00 - - vsubps %zmm16, %zmm1, %zmm19
|
||||||
|
- - 0.01 - - - - 0.99 - - vsubps %zmm27, %zmm11, %zmm20
|
||||||
|
- - 0.99 - 0.50 0.50 - 0.01 - - vmovups 384(%rsp), %zmm1
|
||||||
|
- - - - - - - 1.00 - - vsubps %zmm15, %zmm1, %zmm18
|
||||||
|
- - 0.01 - - - - 0.99 - - vsubps %zmm16, %zmm8, %zmm16
|
||||||
|
- - - - - - - 1.00 - - vsubps %zmm27, %zmm12, %zmm15
|
||||||
|
- - 1.00 - - - - - - - vmulps %zmm26, %zmm26, %zmm27
|
||||||
|
- - 1.00 - - - - - - - vfmadd231ps %zmm25, %zmm25, %zmm27
|
||||||
|
- - 0.99 - - - - 0.01 - - vfmadd231ps %zmm24, %zmm24, %zmm27
|
||||||
|
- - - - - - - 1.00 - - vmulps %zmm23, %zmm23, %zmm28
|
||||||
|
- - - - - - - 1.00 - - vfmadd231ps %zmm22, %zmm22, %zmm28
|
||||||
|
- - 0.01 - - - - 0.99 - - vfmadd231ps %zmm21, %zmm21, %zmm28
|
||||||
|
- - 0.01 - - - - 0.99 - - vmulps %zmm20, %zmm20, %zmm29
|
||||||
|
- - 0.01 - - - - 0.99 - - vfmadd231ps %zmm19, %zmm19, %zmm29
|
||||||
|
- - - - - - - 1.00 - - vfmadd231ps %zmm17, %zmm17, %zmm29
|
||||||
|
- - 0.01 - - - - 0.99 - - vmulps %zmm15, %zmm15, %zmm30
|
||||||
|
- - 0.01 - - - - 0.99 - - vfmadd231ps %zmm16, %zmm16, %zmm30
|
||||||
|
- - 2.00 - - - - 1.00 - - vrcp14ps %zmm27, %zmm31
|
||||||
|
- - 2.00 - - - - 1.00 - - vrcp14ps %zmm28, %zmm1
|
||||||
|
- - 2.00 - - - - 1.00 - - vrcp14ps %zmm29, %zmm2
|
||||||
|
- - 1.00 - - - - - - - vfmadd231ps %zmm18, %zmm18, %zmm30
|
||||||
|
- - 2.00 - - - - 1.00 - - vrcp14ps %zmm30, %zmm3
|
||||||
|
- - - - - - - 1.00 - - vmulps %zmm31, %zmm6, %zmm4
|
||||||
|
- - 1.00 - - - - - - - vmulps %zmm4, %zmm31, %zmm4
|
||||||
|
- - 1.00 - - - - - - - vmulps %zmm4, %zmm31, %zmm4
|
||||||
|
- - 1.00 - - - - - - - vaddps %zmm13, %zmm4, %zmm5
|
||||||
|
- - - - - - - 1.00 - - vmulps %zmm31, %zmm7, %zmm31
|
||||||
|
- - 1.00 - - - - - - - vmulps %zmm5, %zmm31, %zmm5
|
||||||
|
- - 1.00 - - - - - - - vmulps %zmm1, %zmm6, %zmm31
|
||||||
|
- - 1.00 - - - - - - - vmulps %zmm31, %zmm1, %zmm31
|
||||||
|
- - 0.99 - - - - 0.01 - - vmulps %zmm31, %zmm1, %zmm31
|
||||||
|
- - 1.00 - - - - - - - vmulps %zmm5, %zmm4, %zmm4
|
||||||
|
- - - - - - - 1.00 - - vaddps %zmm13, %zmm31, %zmm5
|
||||||
|
- - - - - - - 1.00 - - vmulps %zmm1, %zmm7, %zmm1
|
||||||
|
- - - - - - - 1.00 - - vmulps %zmm5, %zmm1, %zmm1
|
||||||
|
- - - - - - - 1.00 - - vmulps %zmm2, %zmm6, %zmm5
|
||||||
|
- - - - - - - 1.00 - - vmulps %zmm5, %zmm2, %zmm5
|
||||||
|
- - - - - - - 1.00 - - vmulps %zmm5, %zmm2, %zmm5
|
||||||
|
- - 0.99 - - - - 0.01 - - vmulps %zmm1, %zmm31, %zmm1
|
||||||
|
- - - - - - - 1.00 - - vaddps %zmm13, %zmm5, %zmm31
|
||||||
|
- - 1.00 - - - - - - - vmulps %zmm2, %zmm7, %zmm2
|
||||||
|
- - - - - - - 1.00 - - vmulps %zmm31, %zmm2, %zmm2
|
||||||
|
- - - - - - - 1.00 - - vmulps %zmm3, %zmm6, %zmm31
|
||||||
|
- - 1.00 - - - - - - - vmulps %zmm31, %zmm3, %zmm31
|
||||||
|
- - 1.00 - - - - - - - vmulps %zmm31, %zmm3, %zmm31
|
||||||
|
- - 1.00 - - - - - - - vmulps %zmm2, %zmm5, %zmm2
|
||||||
|
- - 1.00 - - - - - - - vaddps %zmm13, %zmm31, %zmm5
|
||||||
|
- - 1.00 - - - - - - - vmulps %zmm3, %zmm7, %zmm3
|
||||||
|
- - 1.00 - - - - - - - vmulps %zmm5, %zmm3, %zmm3
|
||||||
|
- - 1.00 - - - - - - - vmulps %zmm3, %zmm31, %zmm3
|
||||||
|
- - - - - - - - - - xorl %esi, %esi
|
||||||
|
- - - - - - - - - - xorl %edi, %edi
|
||||||
|
- - - - - - - - 1.00 - testl $2147483647, %eax
|
||||||
|
- - - - - - - - 1.00 - sete %sil
|
||||||
|
- - - - - - - - 1.00 - setne %dil
|
||||||
|
- - - 1.00 - - - - - - movl $255, %eax
|
||||||
|
- - - - - - - - 1.00 - cmovel %r8d, %eax
|
||||||
|
- - - 1.00 - - - - - - movl $255, %ecx
|
||||||
|
- - - - - - - - 1.00 - cmovel %r9d, %ecx
|
||||||
|
- - - 1.00 - - - - - - xorl $255, %esi
|
||||||
|
- - - - - - - 1.00 - - kmovd %esi, %k1
|
||||||
|
- - - - - - - 1.00 - - vcmpltps %zmm0, %zmm27, %k1 {%k1}
|
||||||
|
- - - - - - - 1.00 - - vmulps %zmm14, %zmm4, %zmm4
|
||||||
|
- - 1.00 - - - - - - - vmulps %zmm4, %zmm24, %zmm5 {%k1} {z}
|
||||||
|
- - - - - - - 1.00 - - vmulps %zmm4, %zmm25, %zmm24 {%k1} {z}
|
||||||
|
- - 1.00 - - - - - - - vmulps %zmm4, %zmm26, %zmm4 {%k1} {z}
|
||||||
|
- - - 1.00 - - - - - - leal (%rdi,%rdi,2), %esi
|
||||||
|
- - - - - - - - 1.00 - orl $252, %esi
|
||||||
|
- - - - - - - 1.00 - - kmovd %esi, %k1
|
||||||
|
- - - - - - - 1.00 - - vcmpltps %zmm0, %zmm28, %k1 {%k1}
|
||||||
|
- - 0.99 - - - - 0.01 - - vmulps %zmm14, %zmm1, %zmm1
|
||||||
|
- - 0.99 - - - - 0.01 - - vmulps %zmm1, %zmm21, %zmm21 {%k1} {z}
|
||||||
|
- - 1.00 - - - - - - - vaddps %zmm21, %zmm5, %zmm5
|
||||||
|
- - 0.01 - - - - 0.99 - - vmulps %zmm1, %zmm22, %zmm21 {%k1} {z}
|
||||||
|
- - - - - - - 1.00 - - vaddps %zmm21, %zmm24, %zmm21
|
||||||
|
- - 0.99 - - - - 0.01 - - vmulps %zmm1, %zmm23, %zmm1 {%k1} {z}
|
||||||
|
- - 1.00 - - - - - - - vaddps %zmm1, %zmm4, %zmm1
|
||||||
|
- - - - - - - 1.00 - - kmovd %eax, %k1
|
||||||
|
- - - - - - - 1.00 - - vcmpltps %zmm0, %zmm29, %k1 {%k1}
|
||||||
|
- - - - - - - 1.00 - - vmulps %zmm14, %zmm2, %zmm2
|
||||||
|
- - 1.00 - - - - - - - vmulps %zmm2, %zmm17, %zmm4 {%k1} {z}
|
||||||
|
- - - - - - - 1.00 - - vmulps %zmm2, %zmm19, %zmm17 {%k1} {z}
|
||||||
|
- - 1.00 - - - - - - - vmulps %zmm2, %zmm20, %zmm2 {%k1} {z}
|
||||||
|
- - - - - - - 1.00 - - kmovd %ecx, %k1
|
||||||
|
- - - - - - - 1.00 - - vcmpltps %zmm0, %zmm30, %k1 {%k1}
|
||||||
|
- - 1.00 - - - - - - - vmulps %zmm14, %zmm3, %zmm3
|
||||||
|
- - - - - - - 1.00 - - vmulps %zmm3, %zmm18, %zmm18 {%k1} {z}
|
||||||
|
- - 0.99 - - - - 0.01 - - vaddps %zmm18, %zmm4, %zmm4
|
||||||
|
- - 1.00 - - - - - - - vaddps %zmm4, %zmm5, %zmm4
|
||||||
|
- - - - - - - 1.00 - - vmulps %zmm3, %zmm16, %zmm5 {%k1} {z}
|
||||||
|
- - 1.00 - - - - - - - vaddps %zmm5, %zmm17, %zmm5
|
||||||
|
- - 0.99 - - - - 0.01 - - vaddps %zmm5, %zmm21, %zmm5
|
||||||
|
- - 1.00 - - - - - - - vmulps %zmm3, %zmm15, %zmm3 {%k1} {z}
|
||||||
|
- - - - 1.00 - - - - - movq 176(%r15), %rax
|
||||||
|
- - 0.99 - - - - 0.01 - - vaddps %zmm3, %zmm2, %zmm2
|
||||||
|
- - - 1.00 0.50 0.50 - - - - vmovups (%rax,%rbx), %zmm3
|
||||||
|
- - 0.99 - - - - 0.01 - - vsubps %zmm4, %zmm3, %zmm3
|
||||||
|
- - - - - - 1.00 - - 1.00 vmovups %zmm3, (%rax,%rbx)
|
||||||
|
- - 1.00 - - - - - - - vaddps %zmm2, %zmm1, %zmm1
|
||||||
|
- - - 1.00 0.50 0.50 - - - - vmovups 32(%rax,%rbx), %zmm2
|
||||||
|
- - 1.00 - - - - - - - vsubps %zmm5, %zmm2, %zmm2
|
||||||
|
- - - - - - 1.00 - - 1.00 vmovups %zmm2, 32(%rax,%rbx)
|
||||||
|
- - - 1.00 0.50 0.50 - - - - vmovups 64(%rax,%rbx), %zmm2
|
||||||
|
- - 0.99 - - - - 0.01 - - vsubps %zmm1, %zmm2, %zmm1
|
||||||
|
- - - - - 1.00 1.00 - - - vmovups %zmm1, 64(%rax,%rbx)
|
||||||
|
- - - - - - - - 1.00 - cmpq %rdx, %r10
|
||||||
|
- - - - - - - - 1.00 - je .LBB4_18
|
||||||
|
- - - - 0.50 0.50 - - - - movq 160(%r15), %rdi
|
||||||
|
- - - 1.00 - - - - - - incq %rdx
|
||||||
|
- - - - - - - - 1.00 - jmp .LBB4_8
|
116
static_analysis/jan/analyses/gromacs-icx-avx512-sp-osaca-icx.out
Normal file
116
static_analysis/jan/analyses/gromacs-icx-avx512-sp-osaca-icx.out
Normal file
@@ -0,0 +1,116 @@
|
|||||||
|
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||||
|
Analyzed file: gromacs-icx-avx512-sp.s
|
||||||
|
Architecture: ICX
|
||||||
|
Timestamp: 2023-02-14 12:51:43
|
||||||
|
|
||||||
|
|
||||||
|
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||||
|
* - Instruction micro-ops not bound to a port
|
||||||
|
X - No throughput/latency information for this instruction in data file
|
||||||
|
|
||||||
|
|
||||||
|
Combined Analysis Report
|
||||||
|
------------------------
|
||||||
|
Port pressure in cycles
|
||||||
|
| 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD |
|
||||||
|
------------------------------------------------------------------------------------------------------------------------
|
||||||
|
1338 | | | | | | | | | | || | | # pointer_increment=64 0f91ac4f7fe1a70d0c899f7f3e745649
|
||||||
|
1339 | | | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||||
|
1340 | | | | | | | | | | || | | .LBB2_12: # Parent Loop BB2_7 Depth=1
|
||||||
|
1341 | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2
|
||||||
|
1342 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | movslq (%r11,%rax,4), %rcx
|
||||||
|
1343 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || 6.0 | | leaq (%rcx,%rcx,2), %rdx
|
||||||
|
1344 | 0.00 | | | | | | 1.00 | | | || 1.0 | | shlq $5, %rdx
|
||||||
|
1345 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovupd (%rsi,%rdx), %zmm16
|
||||||
|
1346 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vbroadcastf64x4 64(%rsi,%rdx), %zmm20 # zmm20 = mem[0,1,2,3,0,1,2,3]
|
||||||
|
1347 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vbroadcastf64x4 (%rsi,%rdx), %zmm19 # zmm19 = mem[0,1,2,3,0,1,2,3]
|
||||||
|
1348 | | | | | | 1.000 | | | | || | | vshuff64x2 $238, %zmm16, %zmm16, %zmm21 # zmm21 = zmm16[4,5,6,7,4,5,6,7]
|
||||||
|
1349 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm19, %zmm6, %zmm18
|
||||||
|
1350 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm21, %zmm10, %zmm17
|
||||||
|
1351 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vsubps %zmm20, %zmm14, %zmm16
|
||||||
|
1352 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm16, %zmm16, %zmm22
|
||||||
|
1353 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231ps %zmm17, %zmm17, %zmm22 # zmm22 = (zmm17 * zmm17) + zmm22
|
||||||
|
1354 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231ps %zmm18, %zmm18, %zmm22 # zmm22 = (zmm18 * zmm18) + zmm22
|
||||||
|
1355 | 2.50 | | | | | 0.500 | | | | || 6.0 | | vrcp14ps %zmm22, %zmm23
|
||||||
|
1356 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm23, %zmm26, %zmm24
|
||||||
|
1357 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm24, %zmm23, %zmm24
|
||||||
|
1358 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm24, %zmm23, %zmm24
|
||||||
|
1359 | 0.75 | | | | | 0.250 | | | | || 4.0 | | vaddps %zmm1, %zmm24, %zmm25
|
||||||
|
1360 | 1.00 | | | | | 0.000 | | | | || | | vmulps %zmm23, %zmm27, %zmm23
|
||||||
|
1361 | 1.00 | | | | | 0.000 | | | | || 4.0 | | vmulps %zmm25, %zmm23, %zmm23
|
||||||
|
1362 | 1.00 | | | | | 0.000 | | | | || 4.0 | | vmulps %zmm23, %zmm24, %zmm23
|
||||||
|
1363 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | leal (%rcx,%rcx), %edx
|
||||||
|
1364 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | xorl %edi, %edi
|
||||||
|
1365 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | xorl %ebp, %ebp
|
||||||
|
1366 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | cmpq %rdx, %r12
|
||||||
|
1367 | 0.00 | | | | | | 1.00 | | | || | | setne %dil
|
||||||
|
1368 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | leal 1(%rcx,%rcx), %ecx
|
||||||
|
1369 | 0.00 | | | | | | 1.00 | | | || | | sete %bpl
|
||||||
|
1370 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | xorl %edx, %edx
|
||||||
|
1371 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | xorl %ebx, %ebx
|
||||||
|
1372 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | cmpq %rcx, %r12
|
||||||
|
1373 | 0.00 | | | | | | 1.00 | | | || | | sete %dl
|
||||||
|
1374 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | movl $0, %ecx
|
||||||
|
1375 | 0.00 | | | | | | 1.00 | | | || | | setne %bl
|
||||||
|
1376 | 0.00 | | | | | | 1.00 | | | || | | cmovel %r8d, %ecx
|
||||||
|
1377 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | movl %ebx, %r14d
|
||||||
|
1378 | 0.00 | | | | | | 1.00 | | | || | | shll $4, %r14d
|
||||||
|
1379 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | subl %ebp, %r14d
|
||||||
|
1380 | 0.00 | 0.75 | | | | 0.000 | 0.25 | | | || | | leal (%rcx,%rdi,2), %ecx
|
||||||
|
1381 | 0.00 | | | | | | 1.00 | | | || | | shll $8, %ecx
|
||||||
|
1382 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | addl $239, %r14d
|
||||||
|
1383 | 0.00 | 0.50 | | | | 0.000 | 0.50 | | | || | | addl $-768, %ecx # imm = 0xFD00
|
||||||
|
1384 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | orl %r14d, %ecx
|
||||||
|
1385 | 1.00 | | | | | | | | | || | | kmovd %ecx, %k2
|
||||||
|
1386 | 0.50 | | | | | 0.500 | | | | || | | vcmpltps %zmm0, %zmm22, %k2 {%k2}
|
||||||
|
1387 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm21, %zmm11, %zmm21
|
||||||
|
1388 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm20, %zmm15, %zmm20
|
||||||
|
1389 | 0.50 | | | | | 0.500 | | | | || | | vsubps %zmm19, %zmm7, %zmm19
|
||||||
|
1390 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulps %zmm2, %zmm23, %zmm22
|
||||||
|
1391 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231ps %zmm22, %zmm18, %zmm12 {%k2} # zmm12 {%k2} = (zmm18 * zmm22) + zmm12
|
||||||
|
1392 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm20, %zmm20, %zmm18
|
||||||
|
1393 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm21, %zmm21, %zmm18 # zmm18 = (zmm21 * zmm21) + zmm18
|
||||||
|
1394 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm19, %zmm19, %zmm18 # zmm18 = (zmm19 * zmm19) + zmm18
|
||||||
|
1395 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm22, %zmm17, %zmm9 {%k2} # zmm9 {%k2} = (zmm17 * zmm22) + zmm9
|
||||||
|
1396 | 2.50 | | | | | 0.500 | | | | || | | vrcp14ps %zmm18, %zmm17
|
||||||
|
1397 | 0.50 | | | | | 0.500 | | | | || | | vfmadd231ps %zmm22, %zmm16, %zmm5 {%k2} # zmm5 {%k2} = (zmm16 * zmm22) + zmm5
|
||||||
|
1398 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm17, %zmm26, %zmm16
|
||||||
|
1399 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm16, %zmm17, %zmm16
|
||||||
|
1400 | 0.50 | | | | | 0.500 | | | | || | | vmulps %zmm16, %zmm17, %zmm16
|
||||||
|
1401 | 0.00 | | | | | 1.000 | | | | || | | vaddps %zmm1, %zmm16, %zmm22
|
||||||
|
1402 | 0.00 | | | | | 1.000 | | | | || | | vmulps %zmm17, %zmm27, %zmm17
|
||||||
|
1403 | 0.00 | | | | | 1.000 | | | | || | | vmulps %zmm22, %zmm17, %zmm17
|
||||||
|
1404 | 0.00 | | | | | 1.000 | | | | || | | vmulps %zmm17, %zmm16, %zmm16
|
||||||
|
1405 | 0.00 | | | | | | 1.00 | | | || | | shll $6, %ebx
|
||||||
|
1406 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | leal (%rbx,%rdi,4), %ecx
|
||||||
|
1407 | 0.00 | | | | | | 1.00 | | | || | | shll $7, %edx
|
||||||
|
1408 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | leal (%rdx,%rdi,8), %edx
|
||||||
|
1409 | 0.00 | | | | | | 1.00 | | | || | | shll $8, %edx
|
||||||
|
1410 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | addl %edx, %ecx
|
||||||
|
1411 | 0.00 | 1.00 | | | | 0.000 | 0.00 | | | || | | addl $-2117, %ecx # imm = 0xF7BB
|
||||||
|
1412 | 1.00 | | | | | | | | | || | | kmovd %ecx, %k2
|
||||||
|
1413 | 0.00 | | | | | 1.000 | | | | || | | vcmpltps %zmm0, %zmm18, %k2 {%k2}
|
||||||
|
1414 | 0.00 | | | | | 1.000 | | | | || | | vmulps %zmm2, %zmm16, %zmm16
|
||||||
|
1415 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231ps %zmm16, %zmm19, %zmm13 {%k2} # zmm13 {%k2} = (zmm19 * zmm16) + zmm13
|
||||||
|
1416 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231ps %zmm16, %zmm21, %zmm8 {%k2} # zmm8 {%k2} = (zmm21 * zmm16) + zmm8
|
||||||
|
1417 | 0.24 | | | | | 0.760 | | | | || | 4.0 | vfmadd231ps %zmm16, %zmm20, %zmm4 {%k2} # zmm4 {%k2} = (zmm20 * zmm16) + zmm4
|
||||||
|
1418 | 0.00 | 1.00 | | | | -0.01 | 0.00 | | | || | | incq %rax
|
||||||
|
1419 | 0.00 | 1.00 | | | | -0.01 | 0.00 | | | || | | cmpq %rax, %r10
|
||||||
|
1420 | | | | | | | | | | || | | * jne .LBB2_12
|
||||||
|
1421 | | | | | | | | | | || | | # LLVM-MCA-END
|
||||||
|
|
||||||
|
22.5 16.5 2.00 2.00 2.00 2.00 22.49 16.5 71 4.0
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Loop-Carried Dependencies Analysis Report
|
||||||
|
-----------------------------------------
|
||||||
|
1417 | 4.0 | vfmadd231ps %zmm16, %zmm20, %zmm4 {%k2} # zmm4 {%k2} = (zmm20 * zmm16) + zmm4| [1417]
|
||||||
|
1416 | 4.0 | vfmadd231ps %zmm16, %zmm21, %zmm8 {%k2} # zmm8 {%k2} = (zmm21 * zmm16) + zmm8| [1416]
|
||||||
|
1415 | 4.0 | vfmadd231ps %zmm16, %zmm19, %zmm13 {%k2} # zmm13 {%k2} = (zmm19 * zmm16) + zmm13| [1415]
|
||||||
|
1397 | 4.0 | vfmadd231ps %zmm22, %zmm16, %zmm5 {%k2} # zmm5 {%k2} = (zmm16 * zmm22) + zmm5| [1397]
|
||||||
|
1395 | 4.0 | vfmadd231ps %zmm22, %zmm17, %zmm9 {%k2} # zmm9 {%k2} = (zmm17 * zmm22) + zmm9| [1395]
|
||||||
|
1391 | 4.0 | vfmadd231ps %zmm22, %zmm18, %zmm12 {%k2} # zmm12 {%k2} = (zmm18 * zmm22) + zmm12| [1391]
|
||||||
|
1418 | 1.0 | incq %rax | [1418]
|
||||||
|
|
161
static_analysis/jan/analyses/gromacs-icx-avx512-sp-osaca.out
Normal file
161
static_analysis/jan/analyses/gromacs-icx-avx512-sp-osaca.out
Normal file
@@ -0,0 +1,161 @@
|
|||||||
|
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||||
|
Analyzed file: gromacs-icx-avx512-sp.s
|
||||||
|
Architecture: CSX
|
||||||
|
Timestamp: 2023-02-10 16:31:04
|
||||||
|
|
||||||
|
|
||||||
|
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||||
|
* - Instruction micro-ops not bound to a port
|
||||||
|
X - No throughput/latency information for this instruction in data file
|
||||||
|
|
||||||
|
|
||||||
|
Combined Analysis Report
|
||||||
|
------------------------
|
||||||
|
Port pressure in cycles
|
||||||
|
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||||
|
--------------------------------------------------------------------------------------------------
|
||||||
|
1662 | | | | | | | | || | | # pointer_increment=64 0f91ac4f7fe1a70d0c899f7f3e745649
|
||||||
|
1663 | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||||
|
1664 | | | | | | | | || | | .LBB4_8: # =>This Inner Loop Header: Depth=1
|
||||||
|
1665 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | movslq (%r11,%rdx,4), %rax
|
||||||
|
1666 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || 1.0 | | movq %rax, %rsi
|
||||||
|
1667 | 0.00 | | | | | | 1.00 | || 1.0 | | shlq $5, %rsi
|
||||||
|
1668 | | 1.00 | | | | 0.00 | | || 1.0 | | leaq (%rsi,%rsi,2), %rbx
|
||||||
|
1669 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%rdi,%rbx), %zmm15 # AlignMOV convert to UnAlignMOV
|
||||||
|
1670 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 32(%rdi,%rbx), %zmm16 # AlignMOV convert to UnAlignMOV
|
||||||
|
1671 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovups 64(%rdi,%rbx), %zmm27 # AlignMOV convert to UnAlignMOV
|
||||||
|
1672 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 128(%rsp), %zmm1 # 64-byte Reload
|
||||||
|
1673 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm24
|
||||||
|
1674 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 320(%rsp), %zmm1 # 64-byte Reload
|
||||||
|
1675 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm1, %zmm25
|
||||||
|
1676 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubps %zmm27, %zmm9, %zmm26
|
||||||
|
1677 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%rsp), %zmm1 # 64-byte Reload
|
||||||
|
1678 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm21
|
||||||
|
1679 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 256(%rsp), %zmm1 # 64-byte Reload
|
||||||
|
1680 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm1, %zmm22
|
||||||
|
1681 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm27, %zmm10, %zmm23
|
||||||
|
1682 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 448(%rsp), %zmm1 # 64-byte Reload
|
||||||
|
1683 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm17
|
||||||
|
1684 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 192(%rsp), %zmm1 # 64-byte Reload
|
||||||
|
1685 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm1, %zmm19
|
||||||
|
1686 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm27, %zmm11, %zmm20
|
||||||
|
1687 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 384(%rsp), %zmm1 # 64-byte Reload
|
||||||
|
1688 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm15, %zmm1, %zmm18
|
||||||
|
1689 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm16, %zmm8, %zmm16
|
||||||
|
1690 | 0.50 | | | | | 0.50 | | || | | vsubps %zmm27, %zmm12, %zmm15
|
||||||
|
1691 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm26, %zmm26, %zmm27
|
||||||
|
1692 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231ps %zmm25, %zmm25, %zmm27 # zmm27 = (zmm25 * zmm25) + zmm27
|
||||||
|
1693 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231ps %zmm24, %zmm24, %zmm27 # zmm27 = (zmm24 * zmm24) + zmm27
|
||||||
|
1694 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm23, %zmm23, %zmm28
|
||||||
|
1695 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm22, %zmm22, %zmm28 # zmm28 = (zmm22 * zmm22) + zmm28
|
||||||
|
1696 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm21, %zmm21, %zmm28 # zmm28 = (zmm21 * zmm21) + zmm28
|
||||||
|
1697 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm20, %zmm20, %zmm29
|
||||||
|
1698 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm19, %zmm19, %zmm29 # zmm29 = (zmm19 * zmm19) + zmm29
|
||||||
|
1699 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm17, %zmm17, %zmm29 # zmm29 = (zmm17 * zmm17) + zmm29
|
||||||
|
1700 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm15, %zmm15, %zmm30
|
||||||
|
1701 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm16, %zmm16, %zmm30 # zmm30 = (zmm16 * zmm16) + zmm30
|
||||||
|
1702 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14ps %zmm27, %zmm31
|
||||||
|
1703 | 2.50 | | | | | 0.50 | | || | | vrcp14ps %zmm28, %zmm1
|
||||||
|
1704 | 2.50 | | | | | 0.50 | | || | | vrcp14ps %zmm29, %zmm2
|
||||||
|
1705 | 0.50 | | | | | 0.50 | | || | | vfmadd231ps %zmm18, %zmm18, %zmm30 # zmm30 = (zmm18 * zmm18) + zmm30
|
||||||
|
1706 | 2.50 | | | | | 0.50 | | || | | vrcp14ps %zmm30, %zmm3
|
||||||
|
1707 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm31, %zmm6, %zmm4
|
||||||
|
1708 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm4, %zmm31, %zmm4
|
||||||
|
1709 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm4, %zmm31, %zmm4
|
||||||
|
1710 | 0.50 | | | | | 0.50 | | || 4.0 | | vaddps %zmm13, %zmm4, %zmm5
|
||||||
|
1711 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm7, %zmm31
|
||||||
|
1712 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm5, %zmm31, %zmm5
|
||||||
|
1713 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm6, %zmm31
|
||||||
|
1714 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm1, %zmm31
|
||||||
|
1715 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm1, %zmm31
|
||||||
|
1716 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm5, %zmm4, %zmm4
|
||||||
|
1717 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm13, %zmm31, %zmm5
|
||||||
|
1718 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm7, %zmm1
|
||||||
|
1719 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm5, %zmm1, %zmm1
|
||||||
|
1720 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm6, %zmm5
|
||||||
|
1721 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm5, %zmm2, %zmm5
|
||||||
|
1722 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm5, %zmm2, %zmm5
|
||||||
|
1723 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm31, %zmm1
|
||||||
|
1724 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm13, %zmm5, %zmm31
|
||||||
|
1725 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm7, %zmm2
|
||||||
|
1726 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm2, %zmm2
|
||||||
|
1727 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm3, %zmm6, %zmm31
|
||||||
|
1728 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm3, %zmm31
|
||||||
|
1729 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm31, %zmm3, %zmm31
|
||||||
|
1730 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm5, %zmm2
|
||||||
|
1731 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm13, %zmm31, %zmm5
|
||||||
|
1732 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm3, %zmm7, %zmm3
|
||||||
|
1733 | 1.00 | | | | | 0.00 | | || | | vmulps %zmm5, %zmm3, %zmm3
|
||||||
|
1734 | 1.00 | | | | | 0.00 | | || | | vmulps %zmm3, %zmm31, %zmm3
|
||||||
|
1735 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | xorl %esi, %esi
|
||||||
|
1736 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | xorl %edi, %edi
|
||||||
|
1737 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | testl $2147483647, %eax # imm = 0x7FFFFFFF
|
||||||
|
1738 | 0.00 | | | | | | 1.00 | || | | sete %sil
|
||||||
|
1739 | 0.00 | | | | | | 1.00 | || | | setne %dil
|
||||||
|
1740 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl $255, %eax
|
||||||
|
1741 | 0.00 | | | | | | 1.00 | || | | cmovel %r8d, %eax
|
||||||
|
1742 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | movl $255, %ecx
|
||||||
|
1743 | 0.00 | | | | | | 1.00 | || | | cmovel %r9d, %ecx
|
||||||
|
1744 | 0.00 | 0.25 | | | | 0.00 | 0.75 | || | | xorl $255, %esi
|
||||||
|
1745 | 1.00 | | | | | | | || | | kmovd %esi, %k1
|
||||||
|
1746 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm27, %k1 {%k1}
|
||||||
|
1747 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm14, %zmm4, %zmm4
|
||||||
|
1748 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulps %zmm4, %zmm24, %zmm5 {%k1} {z}
|
||||||
|
1749 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm4, %zmm25, %zmm24 {%k1} {z}
|
||||||
|
1750 | 0.25 | | | | | 0.75 | | || | | vmulps %zmm4, %zmm26, %zmm4 {%k1} {z}
|
||||||
|
1751 | | 1.00 | | | | 0.00 | | || | | leal (%rdi,%rdi,2), %esi
|
||||||
|
1752 | 0.00 | 0.75 | | | | 0.00 | 0.25 | || | | orl $252, %esi
|
||||||
|
1753 | 1.00 | | | | | | | || | | kmovd %esi, %k1
|
||||||
|
1754 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm28, %k1 {%k1}
|
||||||
|
1755 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm14, %zmm1, %zmm1
|
||||||
|
1756 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm21, %zmm21 {%k1} {z}
|
||||||
|
1757 | 0.50 | | | | | 0.50 | | || 4.0 | | vaddps %zmm21, %zmm5, %zmm5
|
||||||
|
1758 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm22, %zmm21 {%k1} {z}
|
||||||
|
1759 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm21, %zmm24, %zmm21
|
||||||
|
1760 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm1, %zmm23, %zmm1 {%k1} {z}
|
||||||
|
1761 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm1, %zmm4, %zmm1
|
||||||
|
1762 | 1.00 | | | | | | | || | | kmovd %eax, %k1
|
||||||
|
1763 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm29, %k1 {%k1}
|
||||||
|
1764 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm14, %zmm2, %zmm2
|
||||||
|
1765 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm17, %zmm4 {%k1} {z}
|
||||||
|
1766 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm19, %zmm17 {%k1} {z}
|
||||||
|
1767 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm2, %zmm20, %zmm2 {%k1} {z}
|
||||||
|
1768 | 1.00 | | | | | | | || | | kmovd %ecx, %k1
|
||||||
|
1769 | | | | | | 1.00 | | || | | vcmpltps %zmm0, %zmm30, %k1 {%k1}
|
||||||
|
1770 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm14, %zmm3, %zmm3
|
||||||
|
1771 | 0.50 | | | | | 0.50 | | || | | vmulps %zmm3, %zmm18, %zmm18 {%k1} {z}
|
||||||
|
1772 | 0.50 | | | | | 0.50 | | || | | vaddps %zmm18, %zmm4, %zmm4
|
||||||
|
1773 | 0.25 | | | | | 0.75 | | || 4.0 | | vaddps %zmm4, %zmm5, %zmm4
|
||||||
|
1774 | 0.00 | | | | | 1.00 | | || | | vmulps %zmm3, %zmm16, %zmm5 {%k1} {z}
|
||||||
|
1775 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm5, %zmm17, %zmm5
|
||||||
|
1776 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm5, %zmm21, %zmm5
|
||||||
|
1777 | 0.00 | | | | | 1.00 | | || | | vmulps %zmm3, %zmm15, %zmm3 {%k1} {z}
|
||||||
|
1778 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | movq 176(%r15), %rax
|
||||||
|
1779 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm3, %zmm2, %zmm2
|
||||||
|
1780 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%rax,%rbx), %zmm3 # AlignMOV convert to UnAlignMOV
|
||||||
|
1781 | 0.00 | | | | | 1.00 | | || 4.0 | | vsubps %zmm4, %zmm3, %zmm3
|
||||||
|
1782 | | | 0.50 | 0.50 | 1.00 | | | || 0.0 | | vmovups %zmm3, (%rax,%rbx) # AlignMOV convert to UnAlignMOV
|
||||||
|
1783 | 0.00 | | | | | 1.00 | | || | | vaddps %zmm2, %zmm1, %zmm1
|
||||||
|
1784 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 32(%rax,%rbx), %zmm2 # AlignMOV convert to UnAlignMOV
|
||||||
|
1785 | 0.00 | | | | | 1.00 | | || | | vsubps %zmm5, %zmm2, %zmm2
|
||||||
|
1786 | | | 0.50 | 0.50 | 1.00 | | | || | | vmovups %zmm2, 32(%rax,%rbx) # AlignMOV convert to UnAlignMOV
|
||||||
|
1787 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups 64(%rax,%rbx), %zmm2 # AlignMOV convert to UnAlignMOV
|
||||||
|
1788 | 0.00 | | | | | 1.00 | | || | | vsubps %zmm1, %zmm2, %zmm1
|
||||||
|
1789 | | | 0.50 | 0.50 | 1.00 | | | || | | vmovups %zmm1, 64(%rax,%rbx) # AlignMOV convert to UnAlignMOV
|
||||||
|
1790 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | | cmpq %rdx, %r10
|
||||||
|
1791 | | | | | | | | || | | * je .LBB4_18
|
||||||
|
1792 | | | | | | | | || | | # %bb.9: # in Loop: Header=BB4_8 Depth=1
|
||||||
|
1793 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | movq 160(%r15), %rdi
|
||||||
|
1794 | 0.00 | 1.00 | | | | 0.00 | 0.00 | || | 1.0 | incq %rdx
|
||||||
|
1795 | 0.00 | | | | | | 1.00 | || | | jmp .LBB4_8
|
||||||
|
1796 | | | | | | | | || | | # LLVM-MCA-END
|
||||||
|
|
||||||
|
50.0 9.00 9.50 8.00 9.50 8.00 3.00 50.0 9.00 79.0 1.0
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Loop-Carried Dependencies Analysis Report
|
||||||
|
-----------------------------------------
|
||||||
|
1794 | 1.0 | incq %rdx | [1794]
|
||||||
|
|
88
static_analysis/jan/analyses/lammps-icc-avx2-iaca.out
Normal file
88
static_analysis/jan/analyses/lammps-icc-avx2-iaca.out
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||||
|
Analyzed File - lammps-icc-avx2.o
|
||||||
|
Binary Format - 64Bit
|
||||||
|
Architecture - SKX
|
||||||
|
Analysis Type - Throughput
|
||||||
|
|
||||||
|
Throughput Analysis Report
|
||||||
|
--------------------------
|
||||||
|
Block Throughput: 25.58 Cycles Throughput Bottleneck: Backend
|
||||||
|
Loop Count: 22
|
||||||
|
Port Binding In Cycles Per Iteration:
|
||||||
|
--------------------------------------------------------------------------------------------------
|
||||||
|
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||||
|
--------------------------------------------------------------------------------------------------
|
||||||
|
| Cycles | 13.7 8.0 | 13.6 | 5.5 5.5 | 5.5 5.5 | 0.0 | 13.7 | 7.0 | 0.0 |
|
||||||
|
--------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
DV - Divider pipe (on port 0)
|
||||||
|
D - Data fetch pipe (on ports 2 and 3)
|
||||||
|
F - Macro Fusion with the previous instruction occurred
|
||||||
|
* - instruction micro-ops not bound to a port
|
||||||
|
^ - Micro Fusion occurred
|
||||||
|
# - ESP Tracking sync uop was issued
|
||||||
|
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||||
|
X - instruction not supported, was not accounted in Analysis
|
||||||
|
|
||||||
|
| Num Of | Ports pressure in cycles | |
|
||||||
|
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||||
|
-----------------------------------------------------------------------------------------
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovdqu xmm0, xmmword ptr [rbx+rdx*4]
|
||||||
|
| 1 | 1.0 | | | | | | | | vmovq rcx, xmm0
|
||||||
|
| 1 | | | | | | 1.0 | | | vpunpckhqdq xmm2, xmm0, xmm0
|
||||||
|
| 1 | 1.0 | | | | | | | | vmovq r15, xmm2
|
||||||
|
| 1* | | | | | | | | | mov r8d, ecx
|
||||||
|
| 1 | | | | | | | 1.0 | | shr rcx, 0x20
|
||||||
|
| 1 | | | | | | 1.0 | | | lea r14d, ptr [rcx+rcx*2]
|
||||||
|
| 1 | | | | | | 1.0 | | | lea r8d, ptr [r8+r8*2]
|
||||||
|
| 1 | | | | | | | 1.0 | | movsxd rcx, r8d
|
||||||
|
| 1 | | | | | | | 1.0 | | movsxd r8, r14d
|
||||||
|
| 1* | | | | | | | | | mov r14d, r15d
|
||||||
|
| 1 | | | | | | | 1.0 | | shr r15, 0x20
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups xmm7, xmmword ptr [r11+rcx*8]
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovups xmm6, xmmword ptr [r11+r8*8]
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovq xmm14, qword ptr [r11+rcx*8+0x10]
|
||||||
|
| 1 | | 0.3 | | | | 0.7 | | | lea r14d, ptr [r14+r14*2]
|
||||||
|
| 1 | | | | | | | 1.0 | | movsxd r14, r14d
|
||||||
|
| 1 | | 0.7 | | | | 0.3 | | | lea r15d, ptr [r15+r15*2]
|
||||||
|
| 1 | | | | | | | 1.0 | | movsxd r15, r15d
|
||||||
|
| 2 | | | 0.5 0.5 | 0.5 0.5 | | 1.0 | | | vmovhpd xmm15, xmm14, qword ptr [r11+r8*8+0x10]
|
||||||
|
| 2 | | 1.0 | 0.5 0.5 | 0.5 0.5 | | | | | vinsertf128 ymm1, ymm7, xmmword ptr [r11+r14*8], 0x1
|
||||||
|
| 1 | | | 0.5 0.5 | 0.5 0.5 | | | | | vmovq xmm0, qword ptr [r11+r14*8+0x10]
|
||||||
|
| 2 | | 0.3 | 0.5 0.5 | 0.5 0.5 | | 0.7 | | | vinsertf128 ymm6, ymm6, xmmword ptr [r11+r15*8], 0x1
|
||||||
|
| 2 | | | 0.5 0.5 | 0.5 0.5 | | 1.0 | | | vmovhpd xmm2, xmm0, qword ptr [r11+r15*8+0x10]
|
||||||
|
| 1 | | | | | | 1.0 | | | vunpcklpd ymm14, ymm1, ymm6
|
||||||
|
| 1 | | | | | | 1.0 | | | vunpckhpd ymm1, ymm1, ymm6
|
||||||
|
| 1 | 0.7 | 0.3 | | | | | | | vsubpd ymm6, ymm10, ymm14
|
||||||
|
| 1 | | | | | | 1.0 | | | vinsertf128 ymm7, ymm15, xmm2, 0x1
|
||||||
|
| 1 | 0.3 | 0.7 | | | | | | | vsubpd ymm2, ymm9, ymm1
|
||||||
|
| 1 | 0.7 | 0.3 | | | | | | | vsubpd ymm0, ymm8, ymm7
|
||||||
|
| 1 | 0.3 | 0.7 | | | | | | | vmulpd ymm14, ymm2, ymm2
|
||||||
|
| 1 | 0.7 | 0.3 | | | | | | | vfmadd231pd ymm14, ymm6, ymm6
|
||||||
|
| 1 | 0.3 | 0.7 | | | | | | | vfmadd231pd ymm14, ymm0, ymm0
|
||||||
|
| 1 | 0.7 | 0.3 | | | | | | | vcmppd ymm1, ymm14, ymm5, 0x1
|
||||||
|
| 1 | 0.3 | 0.7 | | | | | | | vpcmpeqd ymm7, ymm7, ymm7
|
||||||
|
| 2 | 1.0 | | | | | 1.0 | | | vptest ymm1, ymm7
|
||||||
|
| 1 | 1.0 8.0 | | | | | | | | vdivpd ymm7, ymm4, ymm14
|
||||||
|
| 2^ | | 1.0 | 0.5 0.5 | 0.5 0.5 | | | | | vmulpd ymm14, ymm7, ymmword ptr [rsp+0x60]
|
||||||
|
| 1 | | 1.0 | | | | | | | vmulpd ymm14, ymm7, ymm14
|
||||||
|
| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm15, ymm7, ymm14
|
||||||
|
| 1 | 0.3 | 0.7 | | | | | | | vfmsub213pd ymm14, ymm7, ymm3
|
||||||
|
| 2^ | 0.7 | 0.3 | 0.5 0.5 | 0.5 0.5 | | | | | vmulpd ymm7, ymm7, ymmword ptr [rsp+0x40]
|
||||||
|
| 1 | 0.3 | 0.7 | | | | | | | vmulpd ymm15, ymm15, ymm7
|
||||||
|
| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm7, ymm15, ymm14
|
||||||
|
| 1 | 0.3 | 0.7 | | | | | | | vmulpd ymm6, ymm6, ymm7
|
||||||
|
| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm2, ymm2, ymm7
|
||||||
|
| 1 | | | | | | 1.0 | | | vandpd ymm6, ymm1, ymm6
|
||||||
|
| 1 | 0.3 | 0.7 | | | | | | | vaddpd ymm13, ymm13, ymm6
|
||||||
|
| 1 | 0.7 | 0.3 | | | | | | | vmulpd ymm6, ymm0, ymm7
|
||||||
|
| 1 | | | | | | 1.0 | | | vandpd ymm0, ymm1, ymm2
|
||||||
|
| 1 | | | | | | 1.0 | | | vandpd ymm1, ymm1, ymm6
|
||||||
|
| 1 | 0.3 | 0.7 | | | | | | | vaddpd ymm12, ymm12, ymm0
|
||||||
|
| 1 | 0.7 | 0.3 | | | | | | | vaddpd ymm11, ymm11, ymm1
|
||||||
|
| 1 | | | | | | | 1.0 | | add rdx, 0x4
|
||||||
|
| 1* | | | | | | | | | cmp rdx, rsi
|
||||||
|
| 0*F | | | | | | | | | jb 0xffffffffffffff02
|
||||||
|
Total Num Of Uops: 62
|
||||||
|
Analysis Notes:
|
||||||
|
Backend allocation was stalled due to unavailable allocation resources.
|
156
static_analysis/jan/analyses/lammps-icc-avx2-mca-csx.out
Normal file
156
static_analysis/jan/analyses/lammps-icc-avx2-mca-csx.out
Normal file
@@ -0,0 +1,156 @@
|
|||||||
|
|
||||||
|
[0] Code Region
|
||||||
|
|
||||||
|
Iterations: 100
|
||||||
|
Instructions: 5600
|
||||||
|
Total Cycles: 2352
|
||||||
|
Total uOps: 6300
|
||||||
|
|
||||||
|
Dispatch Width: 6
|
||||||
|
uOps Per Cycle: 2.68
|
||||||
|
IPC: 2.38
|
||||||
|
Block RThroughput: 10.5
|
||||||
|
|
||||||
|
|
||||||
|
Instruction Info:
|
||||||
|
[1]: #uOps
|
||||||
|
[2]: Latency
|
||||||
|
[3]: RThroughput
|
||||||
|
[4]: MayLoad
|
||||||
|
[5]: MayStore
|
||||||
|
[6]: HasSideEffects (U)
|
||||||
|
|
||||||
|
[1] [2] [3] [4] [5] [6] Instructions:
|
||||||
|
1 6 0.50 * vmovdqu (%rbx,%rdx,4), %xmm0
|
||||||
|
1 2 1.00 vmovq %xmm0, %rcx
|
||||||
|
1 1 1.00 vpunpckhqdq %xmm0, %xmm0, %xmm2
|
||||||
|
1 2 1.00 vmovq %xmm2, %r15
|
||||||
|
1 1 0.25 movl %ecx, %r8d
|
||||||
|
1 1 0.50 shrq $32, %rcx
|
||||||
|
1 1 0.50 leal (%rcx,%rcx,2), %r14d
|
||||||
|
1 1 0.50 leal (%r8,%r8,2), %r8d
|
||||||
|
1 1 0.25 movslq %r8d, %rcx
|
||||||
|
1 1 0.25 movslq %r14d, %r8
|
||||||
|
1 1 0.25 movl %r15d, %r14d
|
||||||
|
1 1 0.50 shrq $32, %r15
|
||||||
|
1 6 0.50 * vmovups (%r11,%rcx,8), %xmm7
|
||||||
|
1 6 0.50 * vmovups (%r11,%r8,8), %xmm6
|
||||||
|
1 5 0.50 * vmovq 16(%r11,%rcx,8), %xmm14
|
||||||
|
1 1 0.50 leal (%r14,%r14,2), %r14d
|
||||||
|
1 1 0.25 movslq %r14d, %r14
|
||||||
|
1 1 0.50 leal (%r15,%r15,2), %r15d
|
||||||
|
1 1 0.25 movslq %r15d, %r15
|
||||||
|
2 6 1.00 * vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15
|
||||||
|
2 7 0.50 * vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1
|
||||||
|
1 5 0.50 * vmovq 16(%r11,%r14,8), %xmm0
|
||||||
|
2 7 0.50 * vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6
|
||||||
|
2 6 1.00 * vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2
|
||||||
|
1 1 1.00 vunpcklpd %ymm6, %ymm1, %ymm14
|
||||||
|
1 1 1.00 vunpckhpd %ymm6, %ymm1, %ymm1
|
||||||
|
1 4 0.50 vsubpd %ymm14, %ymm10, %ymm6
|
||||||
|
1 3 1.00 vinsertf128 $1, %xmm2, %ymm15, %ymm7
|
||||||
|
1 4 0.50 vsubpd %ymm1, %ymm9, %ymm2
|
||||||
|
1 4 0.50 vsubpd %ymm7, %ymm8, %ymm0
|
||||||
|
1 4 0.50 vmulpd %ymm2, %ymm2, %ymm14
|
||||||
|
1 4 0.50 vfmadd231pd %ymm6, %ymm6, %ymm14
|
||||||
|
1 4 0.50 vfmadd231pd %ymm0, %ymm0, %ymm14
|
||||||
|
1 4 0.50 vcmpltpd %ymm5, %ymm14, %ymm1
|
||||||
|
1 1 0.50 vpcmpeqd %ymm7, %ymm7, %ymm7
|
||||||
|
2 3 1.00 vptest %ymm7, %ymm1
|
||||||
|
1 14 5.00 vdivpd %ymm14, %ymm4, %ymm7
|
||||||
|
2 11 0.50 * vmulpd 96(%rsp), %ymm7, %ymm14
|
||||||
|
1 4 0.50 vmulpd %ymm14, %ymm7, %ymm14
|
||||||
|
1 4 0.50 vmulpd %ymm14, %ymm7, %ymm15
|
||||||
|
1 4 0.50 vfmsub213pd %ymm3, %ymm7, %ymm14
|
||||||
|
2 11 0.50 * vmulpd 64(%rsp), %ymm7, %ymm7
|
||||||
|
1 4 0.50 vmulpd %ymm7, %ymm15, %ymm15
|
||||||
|
1 4 0.50 vmulpd %ymm14, %ymm15, %ymm7
|
||||||
|
1 4 0.50 vmulpd %ymm7, %ymm6, %ymm6
|
||||||
|
1 4 0.50 vmulpd %ymm7, %ymm2, %ymm2
|
||||||
|
1 1 0.33 vandpd %ymm6, %ymm1, %ymm6
|
||||||
|
1 4 0.50 vaddpd %ymm6, %ymm13, %ymm13
|
||||||
|
1 4 0.50 vmulpd %ymm7, %ymm0, %ymm6
|
||||||
|
1 1 0.33 vandpd %ymm2, %ymm1, %ymm0
|
||||||
|
1 1 0.33 vandpd %ymm6, %ymm1, %ymm1
|
||||||
|
1 4 0.50 vaddpd %ymm0, %ymm12, %ymm12
|
||||||
|
1 4 0.50 vaddpd %ymm1, %ymm11, %ymm11
|
||||||
|
1 1 0.25 addq $4, %rdx
|
||||||
|
1 1 0.25 cmpq %rsi, %rdx
|
||||||
|
1 1 0.50 jb ..B1.22
|
||||||
|
|
||||||
|
|
||||||
|
Resources:
|
||||||
|
[0] - SKXDivider
|
||||||
|
[1] - SKXFPDivider
|
||||||
|
[2] - SKXPort0
|
||||||
|
[3] - SKXPort1
|
||||||
|
[4] - SKXPort2
|
||||||
|
[5] - SKXPort3
|
||||||
|
[6] - SKXPort4
|
||||||
|
[7] - SKXPort5
|
||||||
|
[8] - SKXPort6
|
||||||
|
[9] - SKXPort7
|
||||||
|
|
||||||
|
|
||||||
|
Resource pressure per iteration:
|
||||||
|
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
|
||||||
|
- 5.00 16.00 14.12 5.50 5.50 - 13.47 8.41 -
|
||||||
|
|
||||||
|
Resource pressure by instruction:
|
||||||
|
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
|
||||||
|
- - - - 0.50 0.50 - - - - vmovdqu (%rbx,%rdx,4), %xmm0
|
||||||
|
- - 1.00 - - - - - - - vmovq %xmm0, %rcx
|
||||||
|
- - - - - - - 1.00 - - vpunpckhqdq %xmm0, %xmm0, %xmm2
|
||||||
|
- - 1.00 - - - - - - - vmovq %xmm2, %r15
|
||||||
|
- - - - - - - - 1.00 - movl %ecx, %r8d
|
||||||
|
- - 0.06 - - - - - 0.94 - shrq $32, %rcx
|
||||||
|
- - - 0.02 - - - 0.98 - - leal (%rcx,%rcx,2), %r14d
|
||||||
|
- - - 0.02 - - - 0.98 - - leal (%r8,%r8,2), %r8d
|
||||||
|
- - 0.47 0.02 - - - - 0.51 - movslq %r8d, %rcx
|
||||||
|
- - 0.46 0.02 - - - 0.01 0.51 - movslq %r14d, %r8
|
||||||
|
- - 0.03 0.01 - - - 0.45 0.51 - movl %r15d, %r14d
|
||||||
|
- - 0.51 - - - - - 0.49 - shrq $32, %r15
|
||||||
|
- - - - 0.49 0.51 - - - - vmovups (%r11,%rcx,8), %xmm7
|
||||||
|
- - - - 0.49 0.51 - - - - vmovups (%r11,%r8,8), %xmm6
|
||||||
|
- - - - 0.52 0.48 - - - - vmovq 16(%r11,%rcx,8), %xmm14
|
||||||
|
- - - 0.02 - - - 0.98 - - leal (%r14,%r14,2), %r14d
|
||||||
|
- - 0.01 0.01 - - - 0.01 0.97 - movslq %r14d, %r14
|
||||||
|
- - - 0.03 - - - 0.97 - - leal (%r15,%r15,2), %r15d
|
||||||
|
- - 0.04 - - - - - 0.96 - movslq %r15d, %r15
|
||||||
|
- - - - 0.07 0.93 - 1.00 - - vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15
|
||||||
|
- - 0.03 0.46 0.49 0.51 - 0.51 - - vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1
|
||||||
|
- - - - 0.51 0.49 - - - - vmovq 16(%r11,%r14,8), %xmm0
|
||||||
|
- - 0.47 0.02 0.93 0.07 - 0.51 - - vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6
|
||||||
|
- - - - 0.50 0.50 - 1.00 - - vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2
|
||||||
|
- - - - - - - 1.00 - - vunpcklpd %ymm6, %ymm1, %ymm14
|
||||||
|
- - - - - - - 1.00 - - vunpckhpd %ymm6, %ymm1, %ymm1
|
||||||
|
- - 0.01 0.99 - - - - - - vsubpd %ymm14, %ymm10, %ymm6
|
||||||
|
- - - - - - - 1.00 - - vinsertf128 $1, %xmm2, %ymm15, %ymm7
|
||||||
|
- - 0.96 0.04 - - - - - - vsubpd %ymm1, %ymm9, %ymm2
|
||||||
|
- - 0.49 0.51 - - - - - - vsubpd %ymm7, %ymm8, %ymm0
|
||||||
|
- - 0.48 0.52 - - - - - - vmulpd %ymm2, %ymm2, %ymm14
|
||||||
|
- - 0.03 0.97 - - - - - - vfmadd231pd %ymm6, %ymm6, %ymm14
|
||||||
|
- - 0.94 0.06 - - - - - - vfmadd231pd %ymm0, %ymm0, %ymm14
|
||||||
|
- - 0.47 0.53 - - - - - - vcmpltpd %ymm5, %ymm14, %ymm1
|
||||||
|
- - 0.96 0.04 - - - - - - vpcmpeqd %ymm7, %ymm7, %ymm7
|
||||||
|
- - 1.00 - - - - 1.00 - - vptest %ymm7, %ymm1
|
||||||
|
- 5.00 1.00 - - - - - - - vdivpd %ymm14, %ymm4, %ymm7
|
||||||
|
- - 0.93 0.07 0.49 0.51 - - - - vmulpd 96(%rsp), %ymm7, %ymm14
|
||||||
|
- - 0.05 0.95 - - - - - - vmulpd %ymm14, %ymm7, %ymm14
|
||||||
|
- - 0.02 0.98 - - - - - - vmulpd %ymm14, %ymm7, %ymm15
|
||||||
|
- - 0.98 0.02 - - - - - - vfmsub213pd %ymm3, %ymm7, %ymm14
|
||||||
|
- - 0.07 0.93 0.51 0.49 - - - - vmulpd 64(%rsp), %ymm7, %ymm7
|
||||||
|
- - 0.01 0.99 - - - - - - vmulpd %ymm7, %ymm15, %ymm15
|
||||||
|
- - 0.01 0.99 - - - - - - vmulpd %ymm14, %ymm15, %ymm7
|
||||||
|
- - 0.03 0.97 - - - - - - vmulpd %ymm7, %ymm6, %ymm6
|
||||||
|
- - 0.97 0.03 - - - - - - vmulpd %ymm7, %ymm2, %ymm2
|
||||||
|
- - 0.03 0.90 - - - 0.07 - - vandpd %ymm6, %ymm1, %ymm6
|
||||||
|
- - 0.06 0.94 - - - - - - vaddpd %ymm6, %ymm13, %ymm13
|
||||||
|
- - 0.03 0.97 - - - - - - vmulpd %ymm7, %ymm0, %ymm6
|
||||||
|
- - 0.46 0.08 - - - 0.46 - - vandpd %ymm2, %ymm1, %ymm0
|
||||||
|
- - 0.47 0.01 - - - 0.52 - - vandpd %ymm6, %ymm1, %ymm1
|
||||||
|
- - 0.48 0.52 - - - - - - vaddpd %ymm0, %ymm12, %ymm12
|
||||||
|
- - 0.52 0.48 - - - - - - vaddpd %ymm1, %ymm11, %ymm11
|
||||||
|
- - 0.01 - - - - - 0.99 - addq $4, %rdx
|
||||||
|
- - - - - - - 0.02 0.98 - cmpq %rsi, %rdx
|
||||||
|
- - 0.45 - - - - - 0.55 - jb ..B1.22
|
158
static_analysis/jan/analyses/lammps-icc-avx2-mca-icx.out
Normal file
158
static_analysis/jan/analyses/lammps-icc-avx2-mca-icx.out
Normal file
@@ -0,0 +1,158 @@
|
|||||||
|
|
||||||
|
[0] Code Region
|
||||||
|
|
||||||
|
Iterations: 100
|
||||||
|
Instructions: 5600
|
||||||
|
Total Cycles: 2306
|
||||||
|
Total uOps: 6300
|
||||||
|
|
||||||
|
Dispatch Width: 6
|
||||||
|
uOps Per Cycle: 2.73
|
||||||
|
IPC: 2.43
|
||||||
|
Block RThroughput: 10.5
|
||||||
|
|
||||||
|
|
||||||
|
Instruction Info:
|
||||||
|
[1]: #uOps
|
||||||
|
[2]: Latency
|
||||||
|
[3]: RThroughput
|
||||||
|
[4]: MayLoad
|
||||||
|
[5]: MayStore
|
||||||
|
[6]: HasSideEffects (U)
|
||||||
|
|
||||||
|
[1] [2] [3] [4] [5] [6] Instructions:
|
||||||
|
1 6 0.50 * vmovdqu (%rbx,%rdx,4), %xmm0
|
||||||
|
1 2 1.00 vmovq %xmm0, %rcx
|
||||||
|
1 1 0.50 vpunpckhqdq %xmm0, %xmm0, %xmm2
|
||||||
|
1 2 1.00 vmovq %xmm2, %r15
|
||||||
|
1 1 0.25 movl %ecx, %r8d
|
||||||
|
1 1 0.50 shrq $32, %rcx
|
||||||
|
1 1 0.50 leal (%rcx,%rcx,2), %r14d
|
||||||
|
1 1 0.50 leal (%r8,%r8,2), %r8d
|
||||||
|
1 1 0.25 movslq %r8d, %rcx
|
||||||
|
1 1 0.25 movslq %r14d, %r8
|
||||||
|
1 1 0.25 movl %r15d, %r14d
|
||||||
|
1 1 0.50 shrq $32, %r15
|
||||||
|
1 6 0.50 * vmovups (%r11,%rcx,8), %xmm7
|
||||||
|
1 6 0.50 * vmovups (%r11,%r8,8), %xmm6
|
||||||
|
1 5 0.50 * vmovq 16(%r11,%rcx,8), %xmm14
|
||||||
|
1 1 0.50 leal (%r14,%r14,2), %r14d
|
||||||
|
1 1 0.25 movslq %r14d, %r14
|
||||||
|
1 1 0.50 leal (%r15,%r15,2), %r15d
|
||||||
|
1 1 0.25 movslq %r15d, %r15
|
||||||
|
2 6 1.00 * vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15
|
||||||
|
2 7 0.50 * vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1
|
||||||
|
1 5 0.50 * vmovq 16(%r11,%r14,8), %xmm0
|
||||||
|
2 7 0.50 * vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6
|
||||||
|
2 6 1.00 * vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2
|
||||||
|
1 1 1.00 vunpcklpd %ymm6, %ymm1, %ymm14
|
||||||
|
1 1 1.00 vunpckhpd %ymm6, %ymm1, %ymm1
|
||||||
|
1 4 0.50 vsubpd %ymm14, %ymm10, %ymm6
|
||||||
|
1 3 1.00 vinsertf128 $1, %xmm2, %ymm15, %ymm7
|
||||||
|
1 4 0.50 vsubpd %ymm1, %ymm9, %ymm2
|
||||||
|
1 4 0.50 vsubpd %ymm7, %ymm8, %ymm0
|
||||||
|
1 4 0.50 vmulpd %ymm2, %ymm2, %ymm14
|
||||||
|
1 4 0.50 vfmadd231pd %ymm6, %ymm6, %ymm14
|
||||||
|
1 4 0.50 vfmadd231pd %ymm0, %ymm0, %ymm14
|
||||||
|
1 4 0.50 vcmpltpd %ymm5, %ymm14, %ymm1
|
||||||
|
1 1 0.50 vpcmpeqd %ymm7, %ymm7, %ymm7
|
||||||
|
2 3 1.00 vptest %ymm7, %ymm1
|
||||||
|
1 14 5.00 vdivpd %ymm14, %ymm4, %ymm7
|
||||||
|
2 11 0.50 * vmulpd 96(%rsp), %ymm7, %ymm14
|
||||||
|
1 4 0.50 vmulpd %ymm14, %ymm7, %ymm14
|
||||||
|
1 4 0.50 vmulpd %ymm14, %ymm7, %ymm15
|
||||||
|
1 4 0.50 vfmsub213pd %ymm3, %ymm7, %ymm14
|
||||||
|
2 11 0.50 * vmulpd 64(%rsp), %ymm7, %ymm7
|
||||||
|
1 4 0.50 vmulpd %ymm7, %ymm15, %ymm15
|
||||||
|
1 4 0.50 vmulpd %ymm14, %ymm15, %ymm7
|
||||||
|
1 4 0.50 vmulpd %ymm7, %ymm6, %ymm6
|
||||||
|
1 4 0.50 vmulpd %ymm7, %ymm2, %ymm2
|
||||||
|
1 1 0.33 vandpd %ymm6, %ymm1, %ymm6
|
||||||
|
1 4 0.50 vaddpd %ymm6, %ymm13, %ymm13
|
||||||
|
1 4 0.50 vmulpd %ymm7, %ymm0, %ymm6
|
||||||
|
1 1 0.33 vandpd %ymm2, %ymm1, %ymm0
|
||||||
|
1 1 0.33 vandpd %ymm6, %ymm1, %ymm1
|
||||||
|
1 4 0.50 vaddpd %ymm0, %ymm12, %ymm12
|
||||||
|
1 4 0.50 vaddpd %ymm1, %ymm11, %ymm11
|
||||||
|
1 1 0.25 addq $4, %rdx
|
||||||
|
1 1 0.25 cmpq %rsi, %rdx
|
||||||
|
1 1 0.50 jb ..B1.22
|
||||||
|
|
||||||
|
|
||||||
|
Resources:
|
||||||
|
[0] - ICXDivider
|
||||||
|
[1] - ICXFPDivider
|
||||||
|
[2] - ICXPort0
|
||||||
|
[3] - ICXPort1
|
||||||
|
[4] - ICXPort2
|
||||||
|
[5] - ICXPort3
|
||||||
|
[6] - ICXPort4
|
||||||
|
[7] - ICXPort5
|
||||||
|
[8] - ICXPort6
|
||||||
|
[9] - ICXPort7
|
||||||
|
[10] - ICXPort8
|
||||||
|
[11] - ICXPort9
|
||||||
|
|
||||||
|
|
||||||
|
Resource pressure per iteration:
|
||||||
|
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11]
|
||||||
|
- 5.00 15.12 15.03 5.50 5.50 - 13.45 8.40 - - -
|
||||||
|
|
||||||
|
Resource pressure by instruction:
|
||||||
|
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions:
|
||||||
|
- - - - 0.50 0.50 - - - - - - vmovdqu (%rbx,%rdx,4), %xmm0
|
||||||
|
- - 1.00 - - - - - - - - - vmovq %xmm0, %rcx
|
||||||
|
- - - 0.46 - - - 0.54 - - - - vpunpckhqdq %xmm0, %xmm0, %xmm2
|
||||||
|
- - 1.00 - - - - - - - - - vmovq %xmm2, %r15
|
||||||
|
- - - - - - - - 1.00 - - - movl %ecx, %r8d
|
||||||
|
- - 0.96 - - - - - 0.04 - - - shrq $32, %rcx
|
||||||
|
- - - 0.01 - - - 0.99 - - - - leal (%rcx,%rcx,2), %r14d
|
||||||
|
- - - 0.03 - - - 0.97 - - - - leal (%r8,%r8,2), %r8d
|
||||||
|
- - 0.48 0.01 - - - - 0.51 - - - movslq %r8d, %rcx
|
||||||
|
- - 0.02 0.02 - - - 0.01 0.95 - - - movslq %r14d, %r8
|
||||||
|
- - 0.02 - - - - - 0.98 - - - movl %r15d, %r14d
|
||||||
|
- - 0.52 - - - - - 0.48 - - - shrq $32, %r15
|
||||||
|
- - - - 0.49 0.51 - - - - - - vmovups (%r11,%rcx,8), %xmm7
|
||||||
|
- - - - 0.49 0.51 - - - - - - vmovups (%r11,%r8,8), %xmm6
|
||||||
|
- - - - 0.52 0.48 - - - - - - vmovq 16(%r11,%rcx,8), %xmm14
|
||||||
|
- - - 0.47 - - - 0.53 - - - - leal (%r14,%r14,2), %r14d
|
||||||
|
- - 0.01 0.01 - - - 0.01 0.97 - - - movslq %r14d, %r14
|
||||||
|
- - - 0.04 - - - 0.96 - - - - leal (%r15,%r15,2), %r15d
|
||||||
|
- - 0.48 - - - - 0.01 0.51 - - - movslq %r15d, %r15
|
||||||
|
- - - - 0.51 0.49 - 1.00 - - - - vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15
|
||||||
|
- - 0.02 0.01 0.95 0.05 - 0.97 - - - - vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1
|
||||||
|
- - - - 0.05 0.95 - - - - - - vmovq 16(%r11,%r14,8), %xmm0
|
||||||
|
- - 0.02 0.49 0.49 0.51 - 0.49 - - - - vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6
|
||||||
|
- - - - 0.50 0.50 - 1.00 - - - - vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2
|
||||||
|
- - - - - - - 1.00 - - - - vunpcklpd %ymm6, %ymm1, %ymm14
|
||||||
|
- - - - - - - 1.00 - - - - vunpckhpd %ymm6, %ymm1, %ymm1
|
||||||
|
- - 0.47 0.53 - - - - - - - - vsubpd %ymm14, %ymm10, %ymm6
|
||||||
|
- - - - - - - 1.00 - - - - vinsertf128 $1, %xmm2, %ymm15, %ymm7
|
||||||
|
- - 0.50 0.50 - - - - - - - - vsubpd %ymm1, %ymm9, %ymm2
|
||||||
|
- - 0.94 0.06 - - - - - - - - vsubpd %ymm7, %ymm8, %ymm0
|
||||||
|
- - 0.06 0.94 - - - - - - - - vmulpd %ymm2, %ymm2, %ymm14
|
||||||
|
- - 0.04 0.96 - - - - - - - - vfmadd231pd %ymm6, %ymm6, %ymm14
|
||||||
|
- - 0.95 0.05 - - - - - - - - vfmadd231pd %ymm0, %ymm0, %ymm14
|
||||||
|
- - 0.02 0.98 - - - - - - - - vcmpltpd %ymm5, %ymm14, %ymm1
|
||||||
|
- - 0.05 0.95 - - - - - - - - vpcmpeqd %ymm7, %ymm7, %ymm7
|
||||||
|
- - 1.00 - - - - 1.00 - - - - vptest %ymm7, %ymm1
|
||||||
|
- 5.00 1.00 - - - - - - - - - vdivpd %ymm14, %ymm4, %ymm7
|
||||||
|
- - 0.51 0.49 0.49 0.51 - - - - - - vmulpd 96(%rsp), %ymm7, %ymm14
|
||||||
|
- - 0.04 0.96 - - - - - - - - vmulpd %ymm14, %ymm7, %ymm14
|
||||||
|
- - 0.01 0.99 - - - - - - - - vmulpd %ymm14, %ymm7, %ymm15
|
||||||
|
- - 0.99 0.01 - - - - - - - - vfmsub213pd %ymm3, %ymm7, %ymm14
|
||||||
|
- - 0.49 0.51 0.51 0.49 - - - - - - vmulpd 64(%rsp), %ymm7, %ymm7
|
||||||
|
- - 0.01 0.99 - - - - - - - - vmulpd %ymm7, %ymm15, %ymm15
|
||||||
|
- - 0.01 0.99 - - - - - - - - vmulpd %ymm14, %ymm15, %ymm7
|
||||||
|
- - 0.48 0.52 - - - - - - - - vmulpd %ymm7, %ymm6, %ymm6
|
||||||
|
- - 0.52 0.48 - - - - - - - - vmulpd %ymm7, %ymm2, %ymm2
|
||||||
|
- - 0.46 0.02 - - - 0.52 - - - - vandpd %ymm6, %ymm1, %ymm6
|
||||||
|
- - 0.49 0.51 - - - - - - - - vaddpd %ymm6, %ymm13, %ymm13
|
||||||
|
- - 0.48 0.52 - - - - - - - - vmulpd %ymm7, %ymm0, %ymm6
|
||||||
|
- - 0.02 0.52 - - - 0.46 - - - - vandpd %ymm2, %ymm1, %ymm0
|
||||||
|
- - 0.02 - - - - 0.98 - - - - vandpd %ymm6, %ymm1, %ymm1
|
||||||
|
- - 0.49 0.51 - - - - - - - - vaddpd %ymm0, %ymm12, %ymm12
|
||||||
|
- - 0.51 0.49 - - - - - - - - vaddpd %ymm1, %ymm11, %ymm11
|
||||||
|
- - 0.01 - - - - - 0.99 - - - addq $4, %rdx
|
||||||
|
- - 0.01 - - - - 0.01 0.98 - - - cmpq %rsi, %rdx
|
||||||
|
- - 0.01 - - - - - 0.99 - - - jb ..B1.22
|
97
static_analysis/jan/analyses/lammps-icc-avx2-osaca-csx.out
Normal file
97
static_analysis/jan/analyses/lammps-icc-avx2-osaca-csx.out
Normal file
@@ -0,0 +1,97 @@
|
|||||||
|
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||||
|
Analyzed file: lammps-icc-avx2.s
|
||||||
|
Architecture: CSX
|
||||||
|
Timestamp: 2023-02-10 16:29:58
|
||||||
|
|
||||||
|
|
||||||
|
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||||
|
* - Instruction micro-ops not bound to a port
|
||||||
|
X - No throughput/latency information for this instruction in data file
|
||||||
|
|
||||||
|
|
||||||
|
Combined Analysis Report
|
||||||
|
------------------------
|
||||||
|
Port pressure in cycles
|
||||||
|
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||||
|
----------------------------------------------------------------------------------------------------
|
||||||
|
256 | | | | | | | | || | | # pointer_increment=32 724d27eafcb27eabca1528ddfdbdba3e
|
||||||
|
257 | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||||
|
258 | | | | | | | | || | | ..B1.22: # Preds ..B1.24 ..B1.21
|
||||||
|
259 | | | | | | | | || | | # Execution count [2.50e+01]
|
||||||
|
260 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovdqu (%rbx,%rdx,4), %xmm0 #60.21
|
||||||
|
261 | 1.00 | | | | | | | || 1.0 | | vmovq %xmm0, %rcx #60.21
|
||||||
|
262 | | | | | | 1.000 | | || | | vpunpckhqdq %xmm0, %xmm0, %xmm2 #60.21
|
||||||
|
263 | 1.00 | | | | | | | || | | vmovq %xmm2, %r15 #60.21
|
||||||
|
264 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || 1.0 | | movl %ecx, %r8d #60.21
|
||||||
|
265 | 0.00 | | | | | | 1.00 | || | | shrq $32, %rcx #60.21
|
||||||
|
266 | | 0.500 | | | | 0.500 | | || | | lea (%rcx,%rcx,2), %r14d #61.36
|
||||||
|
267 | | 0.500 | | | | 0.500 | | || 1.0 | | lea (%r8,%r8,2), %r8d #61.36
|
||||||
|
268 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || 1.0 | | movslq %r8d, %rcx #61.36
|
||||||
|
269 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movslq %r14d, %r8 #61.36
|
||||||
|
270 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movl %r15d, %r14d #60.21
|
||||||
|
271 | 0.00 | | | | | | 1.00 | || | | shrq $32, %r15 #60.21
|
||||||
|
272 | | | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmovups (%r11,%rcx,8), %xmm7 #61.36
|
||||||
|
273 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovups (%r11,%r8,8), %xmm6 #61.36
|
||||||
|
274 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovq 16(%r11,%rcx,8), %xmm14 #61.36
|
||||||
|
275 | | 0.500 | | | | 0.500 | | || | | lea (%r14,%r14,2), %r14d #61.36
|
||||||
|
276 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movslq %r14d, %r14 #61.36
|
||||||
|
277 | | 0.500 | | | | 0.500 | | || | | lea (%r15,%r15,2), %r15d #61.36
|
||||||
|
278 | 0.00 | 0.000 | | | | 0.000 | 1.00 | || | | movslq %r15d, %r15 #61.36
|
||||||
|
279 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || | | vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15 #61.36
|
||||||
|
280 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || 3.0 | | vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1 #61.36
|
||||||
|
281 | | | 0.50 0.50 | 0.50 0.50 | | | | || | | vmovq 16(%r11,%r14,8), %xmm0 #61.36
|
||||||
|
282 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || | | vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6 #61.36
|
||||||
|
283 | | | 0.50 0.50 | 0.50 0.50 | | 1.000 | | || | | vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2 #61.36
|
||||||
|
284 | | | | | | 1.000 | | || | | vunpcklpd %ymm6, %ymm1, %ymm14 #61.36
|
||||||
|
285 | | | | | | 1.000 | | || 1.0 | | vunpckhpd %ymm6, %ymm1, %ymm1 #61.36
|
||||||
|
286 | 0.50 | 0.500 | | | | | | || | | vsubpd %ymm14, %ymm10, %ymm6 #61.36
|
||||||
|
287 | | | | | | 1.000 | | || | | vinsertf128 $1, %xmm2, %ymm15, %ymm7 #61.36
|
||||||
|
288 | 0.50 | 0.500 | | | | | | || 4.0 | | vsubpd %ymm1, %ymm9, %ymm2 #62.36
|
||||||
|
289 | 0.50 | 0.500 | | | | | | || | | vsubpd %ymm7, %ymm8, %ymm0 #63.36
|
||||||
|
290 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm2, %ymm2, %ymm14 #64.49
|
||||||
|
291 | 0.50 | 0.500 | | | | | | || 4.0 | | vfmadd231pd %ymm6, %ymm6, %ymm14 #64.49
|
||||||
|
292 | 0.50 | 0.500 | | | | | | || 4.0 | | vfmadd231pd %ymm0, %ymm0, %ymm14 #64.63
|
||||||
|
293 | | | | | | 1.000 | | || | | vcmpltpd %ymm5, %ymm14, %ymm1 #74.22
|
||||||
|
294 | 0.50 | 0.500 | | | | | | || | | vpcmpeqd %ymm7, %ymm7, %ymm7 #74.22
|
||||||
|
295 | 1.00 | | | | | 1.000 | | || | | vptest %ymm7, %ymm1 #74.22
|
||||||
|
296 | | | | | | | | || | | #je ..B1.24 # Prob 50% #74.22
|
||||||
|
297 | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ymm14
|
||||||
|
298 | | | | | | | | || | | ..B1.23: # Preds ..B1.22
|
||||||
|
299 | | | | | | | | || | | # Execution count [1.25e+01]
|
||||||
|
300 | 1.00 8.00 | | | | | | | || 15.0 | | vdivpd %ymm14, %ymm4, %ymm7 #75.39
|
||||||
|
301 | 0.50 | 0.500 | 0.50 0.50 | 0.50 0.50 | | | | || 4.0 | | vmulpd 96(%rsp), %ymm7, %ymm14 #76.38[spill]
|
||||||
|
302 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm14 #76.44
|
||||||
|
303 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm15 #76.50
|
||||||
|
304 | 0.50 | 0.500 | | | | | | || | | vfmsub213pd %ymm3, %ymm7, %ymm14 #77.55
|
||||||
|
305 | 0.50 | 0.500 | 0.50 0.50 | 0.50 0.50 | | | | || | | vmulpd 64(%rsp), %ymm7, %ymm7 #77.55[spill]
|
||||||
|
306 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm7, %ymm15, %ymm15 #77.64
|
||||||
|
307 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm14, %ymm15, %ymm7 #77.70
|
||||||
|
308 | 0.50 | 0.500 | | | | | | || 4.0 | | vmulpd %ymm7, %ymm6, %ymm6 #78.31
|
||||||
|
309 | 0.50 | 0.500 | | | | | | || | | vmulpd %ymm7, %ymm2, %ymm2 #79.31
|
||||||
|
310 | 0.25 | 0.253 | | | | 0.493 | | || 1.0 | | vandpd %ymm6, %ymm1, %ymm6 #78.31
|
||||||
|
311 | 0.50 | 0.500 | | | | | | || 4.0 | | vaddpd %ymm6, %ymm13, %ymm13 #78.17
|
||||||
|
312 | 0.25 | 0.750 | | | | | | || | | vmulpd %ymm7, %ymm0, %ymm6 #80.31
|
||||||
|
313 | 0.16 | 0.417 | | | | 0.423 | | || | | vandpd %ymm2, %ymm1, %ymm0 #79.31
|
||||||
|
314 | 0.00 | 0.250 | | | | 0.750 | | || | | vandpd %ymm6, %ymm1, %ymm1 #80.31
|
||||||
|
315 | 0.00 | 1.000 | | | | | | || | | vaddpd %ymm0, %ymm12, %ymm12 #79.17
|
||||||
|
316 | 0.50 | 0.500 | | | | | | || | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17
|
||||||
|
317 | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm3 ymm4 ymm5 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13
|
||||||
|
318 | | | | | | | | || | | ..B1.24: # Preds ..B1.23 ..B1.22
|
||||||
|
319 | | | | | | | | || | | # Execution count [2.50e+01]
|
||||||
|
320 | 0.00 | 0.000 | | | | -0.01 | 1.00 | || | | addq $4, %rdx #59.9
|
||||||
|
321 | 0.00 | -0.01 | | | | 0.000 | 1.00 | || | | cmpq %rsi, %rdx #59.9
|
||||||
|
322 | | | | | | | | || | | * jb ..B1.22 # Prob 82% #59.9
|
||||||
|
323 | | | | | | | | || | | # LLVM-MCA-END
|
||||||
|
|
||||||
|
13.7 8.00 13.66 5.50 5.50 5.50 5.50 13.66 10.0 76.0 4.0
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Loop-Carried Dependencies Analysis Report
|
||||||
|
-----------------------------------------
|
||||||
|
316 | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17| [316]
|
||||||
|
315 | 4.0 | vaddpd %ymm0, %ymm12, %ymm12 #79.17| [315]
|
||||||
|
311 | 4.0 | vaddpd %ymm6, %ymm13, %ymm13 #78.17| [311]
|
||||||
|
320 | 1.0 | addq $4, %rdx #59.9| [320]
|
||||||
|
|
97
static_analysis/jan/analyses/lammps-icc-avx2-osaca-icx.out
Normal file
97
static_analysis/jan/analyses/lammps-icc-avx2-osaca-icx.out
Normal file
@@ -0,0 +1,97 @@
|
|||||||
|
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||||
|
Analyzed file: lammps-icc-avx2.s
|
||||||
|
Architecture: ICX
|
||||||
|
Timestamp: 2023-02-10 16:29:48
|
||||||
|
|
||||||
|
|
||||||
|
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||||
|
* - Instruction micro-ops not bound to a port
|
||||||
|
X - No throughput/latency information for this instruction in data file
|
||||||
|
|
||||||
|
|
||||||
|
Combined Analysis Report
|
||||||
|
------------------------
|
||||||
|
Port pressure in cycles
|
||||||
|
| 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD |
|
||||||
|
-----------------------------------------------------------------------------------------------------------------------
|
||||||
|
256 | | | | | | | | | | || | | # pointer_increment=32 724d27eafcb27eabca1528ddfdbdba3e
|
||||||
|
257 | | | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||||
|
258 | | | | | | | | | | || | | ..B1.22: # Preds ..B1.24 ..B1.21
|
||||||
|
259 | | | | | | | | | | || | | # Execution count [2.50e+01]
|
||||||
|
260 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovdqu (%rbx,%rdx,4), %xmm0 #60.21
|
||||||
|
261 | 1.00 | | | | | | | | | || 1.0 | | vmovq %xmm0, %rcx #60.21
|
||||||
|
262 | | 0.50 | | | | 0.50 | | | | || | | vpunpckhqdq %xmm0, %xmm0, %xmm2 #60.21
|
||||||
|
263 | 1.00 | | | | | | | | | || | | vmovq %xmm2, %r15 #60.21
|
||||||
|
264 | 0.37 | 0.00 | | | | 0.25 | 0.38 | | | || 1.0 | | movl %ecx, %r8d #60.21
|
||||||
|
265 | 0.50 | | | | | | 0.50 | | | || | | shrq $32, %rcx #60.21
|
||||||
|
266 | 0.13 | 0.00 | | | | 0.00 | 0.87 | | | || | | lea (%rcx,%rcx,2), %r14d #61.36
|
||||||
|
267 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || 6.0 | | lea (%r8,%r8,2), %r8d #61.36
|
||||||
|
268 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || 1.0 | | movslq %r8d, %rcx #61.36
|
||||||
|
269 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movslq %r14d, %r8 #61.36
|
||||||
|
270 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movl %r15d, %r14d #60.21
|
||||||
|
271 | 0.00 | | | | | | 1.00 | | | || | | shrq $32, %r15 #60.21
|
||||||
|
272 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovups (%r11,%rcx,8), %xmm7 #61.36
|
||||||
|
273 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovups (%r11,%r8,8), %xmm6 #61.36
|
||||||
|
274 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovq 16(%r11,%rcx,8), %xmm14 #61.36
|
||||||
|
275 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | lea (%r14,%r14,2), %r14d #61.36
|
||||||
|
276 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movslq %r14d, %r14 #61.36
|
||||||
|
277 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | lea (%r15,%r15,2), %r15d #61.36
|
||||||
|
278 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | movslq %r15d, %r15 #61.36
|
||||||
|
279 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || | | vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15 #61.36
|
||||||
|
280 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || 3.0 | | vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1 #61.36
|
||||||
|
281 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmovq 16(%r11,%r14,8), %xmm0 #61.36
|
||||||
|
282 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || | | vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6 #61.36
|
||||||
|
283 | | | 0.50 0.50 | 0.50 0.50 | | 1.00 | | | | || | | vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2 #61.36
|
||||||
|
284 | | | | | | 1.00 | | | | || | | vunpcklpd %ymm6, %ymm1, %ymm14 #61.36
|
||||||
|
285 | | | | | | 1.00 | | | | || 1.0 | | vunpckhpd %ymm6, %ymm1, %ymm1 #61.36
|
||||||
|
286 | 0.50 | 0.50 | | | | | | | | || | | vsubpd %ymm14, %ymm10, %ymm6 #61.36
|
||||||
|
287 | | | | | | 1.00 | | | | || | | vinsertf128 $1, %xmm2, %ymm15, %ymm7 #61.36
|
||||||
|
288 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vsubpd %ymm1, %ymm9, %ymm2 #62.36
|
||||||
|
289 | 0.50 | 0.50 | | | | | | | | || | | vsubpd %ymm7, %ymm8, %ymm0 #63.36
|
||||||
|
290 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm2, %ymm2, %ymm14 #64.49
|
||||||
|
291 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vfmadd231pd %ymm6, %ymm6, %ymm14 #64.49
|
||||||
|
292 | 0.75 | 0.25 | | | | | | | | || 4.0 | | vfmadd231pd %ymm0, %ymm0, %ymm14 #64.63
|
||||||
|
293 | 0.00 | | | | | 1.00 | | | | || | | vcmpltpd %ymm5, %ymm14, %ymm1 #74.22
|
||||||
|
294 | 0.50 | 0.50 | | | | | | | | || | | vpcmpeqd %ymm7, %ymm7, %ymm7 #74.22
|
||||||
|
295 | 1.00 | | | | | 1.00 | | | | || | | vptest %ymm7, %ymm1 #74.22
|
||||||
|
296 | | | | | | | | | | || | | #je ..B1.24 # Prob 50% #74.22
|
||||||
|
297 | | | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ymm14
|
||||||
|
298 | | | | | | | | | | || | | ..B1.23: # Preds ..B1.22
|
||||||
|
299 | | | | | | | | | | || | | # Execution count [1.25e+01]
|
||||||
|
300 | 1.00 8.00 | | | | | | | | | || 13.0 | | vdivpd %ymm14, %ymm4, %ymm7 #75.39
|
||||||
|
301 | 0.50 | 0.50 | 0.50 0.50 | 0.50 0.50 | | | | | | || 4.0 | | vmulpd 96(%rsp), %ymm7, %ymm14 #76.38[spill]
|
||||||
|
302 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm14 #76.44
|
||||||
|
303 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm14, %ymm7, %ymm15 #76.50
|
||||||
|
304 | 0.50 | 0.50 | | | | | | | | || | | vfmsub213pd %ymm3, %ymm7, %ymm14 #77.55
|
||||||
|
305 | 0.50 | 0.50 | 0.50 0.50 | 0.50 0.50 | | | | | | || | | vmulpd 64(%rsp), %ymm7, %ymm7 #77.55[spill]
|
||||||
|
306 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm7, %ymm15, %ymm15 #77.64
|
||||||
|
307 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm14, %ymm15, %ymm7 #77.70
|
||||||
|
308 | 0.50 | 0.50 | | | | | | | | || 4.0 | | vmulpd %ymm7, %ymm6, %ymm6 #78.31
|
||||||
|
309 | 0.00 | 1.00 | | | | | | | | || | | vmulpd %ymm7, %ymm2, %ymm2 #79.31
|
||||||
|
310 | 0.00 | 0.00 | | | | 1.00 | | | | || 1.0 | | vandpd %ymm6, %ymm1, %ymm6 #78.31
|
||||||
|
311 | 0.00 | 1.00 | | | | | | | | || 4.0 | | vaddpd %ymm6, %ymm13, %ymm13 #78.17
|
||||||
|
312 | 0.00 | 1.00 | | | | | | | | || | | vmulpd %ymm7, %ymm0, %ymm6 #80.31
|
||||||
|
313 | 0.00 | 0.00 | | | | 1.00 | | | | || | | vandpd %ymm2, %ymm1, %ymm0 #79.31
|
||||||
|
314 | 0.00 | 0.00 | | | | 1.00 | | | | || | | vandpd %ymm6, %ymm1, %ymm1 #80.31
|
||||||
|
315 | 0.00 | 1.00 | | | | | | | | || | | vaddpd %ymm0, %ymm12, %ymm12 #79.17
|
||||||
|
316 | 0.00 | 1.00 | | | | | | | | || | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17
|
||||||
|
317 | | | | | | | | | | || | | # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm3 ymm4 ymm5 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13
|
||||||
|
318 | | | | | | | | | | || | | ..B1.24: # Preds ..B1.23 ..B1.22
|
||||||
|
319 | | | | | | | | | | || | | # Execution count [2.50e+01]
|
||||||
|
320 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | addq $4, %rdx #59.9
|
||||||
|
321 | 0.00 | 0.00 | | | | 0.00 | 1.00 | | | || | | cmpq %rsi, %rdx #59.9
|
||||||
|
322 | | | | | | | | | | || | | * jb ..B1.22 # Prob 82% #59.9
|
||||||
|
323 | | | | | | | | | | || | | # LLVM-MCA-END
|
||||||
|
|
||||||
|
12.8 8.00 12.8 5.50 5.50 5.50 5.50 12.8 12.8 81 4.0
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Loop-Carried Dependencies Analysis Report
|
||||||
|
-----------------------------------------
|
||||||
|
316 | 4.0 | vaddpd %ymm1, %ymm11, %ymm11 #80.17| [316]
|
||||||
|
315 | 4.0 | vaddpd %ymm0, %ymm12, %ymm12 #79.17| [315]
|
||||||
|
311 | 4.0 | vaddpd %ymm6, %ymm13, %ymm13 #78.17| [311]
|
||||||
|
320 | 1.0 | addq $4, %rdx #59.9| [320]
|
||||||
|
|
75
static_analysis/jan/analyses/lammps-icc-avx512-iaca.out
Normal file
75
static_analysis/jan/analyses/lammps-icc-avx512-iaca.out
Normal file
@@ -0,0 +1,75 @@
|
|||||||
|
Intel(R) Architecture Code Analyzer Version - v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
|
||||||
|
Analyzed File - lammps-icc-avx512.o
|
||||||
|
Binary Format - 64Bit
|
||||||
|
Architecture - SKX
|
||||||
|
Analysis Type - Throughput
|
||||||
|
|
||||||
|
Throughput Analysis Report
|
||||||
|
--------------------------
|
||||||
|
Block Throughput: 30.89 Cycles Throughput Bottleneck: Backend
|
||||||
|
Loop Count: 22
|
||||||
|
Port Binding In Cycles Per Iteration:
|
||||||
|
--------------------------------------------------------------------------------------------------
|
||||||
|
| Port | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||||
|
--------------------------------------------------------------------------------------------------
|
||||||
|
| Cycles | 19.0 0.0 | 4.0 | 13.0 13.0 | 13.0 13.0 | 0.0 | 17.0 | 4.0 | 0.0 |
|
||||||
|
--------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
DV - Divider pipe (on port 0)
|
||||||
|
D - Data fetch pipe (on ports 2 and 3)
|
||||||
|
F - Macro Fusion with the previous instruction occurred
|
||||||
|
* - instruction micro-ops not bound to a port
|
||||||
|
^ - Micro Fusion occurred
|
||||||
|
# - ESP Tracking sync uop was issued
|
||||||
|
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
|
||||||
|
X - instruction not supported, was not accounted in Analysis
|
||||||
|
|
||||||
|
| Num Of | Ports pressure in cycles | |
|
||||||
|
| Uops | 0 - DV | 1 | 2 - D | 3 - D | 4 | 5 | 6 | 7 |
|
||||||
|
-----------------------------------------------------------------------------------------
|
||||||
|
| 1 | | | | | | 1.0 | | | vpcmpgtd k5, ymm3, ymm4
|
||||||
|
| 1 | | 1.0 | | | | | | | vpaddd ymm4, ymm4, ymm15
|
||||||
|
| 2 | | 1.0 | 1.0 1.0 | | | | | | vmovdqu32 ymm17{k5}{z}, ymmword ptr [r10+r15*4]
|
||||||
|
| 1 | | 1.0 | | | | | | | vpaddd ymm18, ymm17, ymm17
|
||||||
|
| 1 | | | | | | | 1.0 | | add r15, 0x8
|
||||||
|
| 1 | | 1.0 | | | | | | | vpaddd ymm19, ymm17, ymm18
|
||||||
|
| 1 | 1.0 | | | | | | | | kmovw k2, k5
|
||||||
|
| 1 | 1.0 | | | | | | | | kmovw k3, k5
|
||||||
|
| 1 | 1.0 | | | | | | | | kmovw k1, k5
|
||||||
|
| 1* | | | | | | | | | vpxord zmm21, zmm21, zmm21
|
||||||
|
| 1* | | | | | | | | | vpxord zmm20, zmm20, zmm20
|
||||||
|
| 1* | | | | | | | | | vpxord zmm22, zmm22, zmm22
|
||||||
|
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm21, k2, zmmword ptr [rbx+ymm19*8+0x8]
|
||||||
|
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm20, k3, zmmword ptr [rbx+ymm19*8]
|
||||||
|
| 5^ | 1.0 | | 4.0 4.0 | 4.0 4.0 | | 1.0 | 1.0 | | vgatherdpd zmm22, k1, zmmword ptr [rbx+ymm19*8+0x10]
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm18, zmm1, zmm21
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm17, zmm2, zmm20
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vsubpd zmm19, zmm0, zmm22
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm31, zmm18, zmm18
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm31, zmm17, zmm17
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm31, zmm19, zmm19
|
||||||
|
| 3 | 2.0 | | | | | 1.0 | | | vrcp14pd zmm30, zmm31
|
||||||
|
| 1 | | | | | | 1.0 | | | vcmppd k6{k5}, zmm31, zmm14, 0x1
|
||||||
|
| 1 | | | | | | 1.0 | | | vfpclasspd k0, zmm30, 0x1e
|
||||||
|
| 1* | | | | | | | | | vmovaps zmm23, zmm31
|
||||||
|
| 2^ | 1.0 | | | 1.0 1.0 | | | | | vfnmadd213pd zmm23, zmm30, qword ptr [rip]{1to8}
|
||||||
|
| 1 | 1.0 | | | | | | | | knotw k4, k0
|
||||||
|
| 1 | | | | | | 1.0 | | | vmulpd zmm24, zmm23, zmm23
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm30{k4}, zmm23, zmm30
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vfmadd213pd zmm30{k4}, zmm24, zmm30
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm25, zmm30, zmm13
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm27, zmm30, zmm12
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm28, zmm30, zmm25
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm26, zmm30, zmm28
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vfmsub213pd zmm30, zmm28, zmm5
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm29, zmm26, zmm27
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vmulpd zmm23, zmm29, zmm30
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm10{k6}, zmm23, zmm17
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm9{k6}, zmm23, zmm18
|
||||||
|
| 1 | 0.5 | | | | | 0.5 | | | vfmadd231pd zmm8{k6}, zmm23, zmm19
|
||||||
|
| 1* | | | | | | | | | cmp r15, r14
|
||||||
|
| 0*F | | | | | | | | | jb 0xffffffffffffff0c
|
||||||
|
Total Num Of Uops: 57
|
||||||
|
Analysis Notes:
|
||||||
|
Backend allocation was stalled due to unavailable allocation resources.
|
||||||
|
There were bubbles in the frontend.
|
128
static_analysis/jan/analyses/lammps-icc-avx512-mca-csx.out
Normal file
128
static_analysis/jan/analyses/lammps-icc-avx512-mca-csx.out
Normal file
@@ -0,0 +1,128 @@
|
|||||||
|
|
||||||
|
[0] Code Region
|
||||||
|
|
||||||
|
Iterations: 100
|
||||||
|
Instructions: 4200
|
||||||
|
Total Cycles: 2465
|
||||||
|
Total uOps: 5800
|
||||||
|
|
||||||
|
Dispatch Width: 6
|
||||||
|
uOps Per Cycle: 2.35
|
||||||
|
IPC: 1.70
|
||||||
|
Block RThroughput: 13.0
|
||||||
|
|
||||||
|
|
||||||
|
Instruction Info:
|
||||||
|
[1]: #uOps
|
||||||
|
[2]: Latency
|
||||||
|
[3]: RThroughput
|
||||||
|
[4]: MayLoad
|
||||||
|
[5]: MayStore
|
||||||
|
[6]: HasSideEffects (U)
|
||||||
|
|
||||||
|
[1] [2] [3] [4] [5] [6] Instructions:
|
||||||
|
1 4 1.00 vpcmpgtd %ymm4, %ymm3, %k5
|
||||||
|
1 1 0.33 vpaddd %ymm15, %ymm4, %ymm4
|
||||||
|
2 8 0.50 * vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z}
|
||||||
|
1 1 0.33 vpaddd %ymm17, %ymm17, %ymm18
|
||||||
|
1 1 0.25 addq $8, %r15
|
||||||
|
1 1 0.33 vpaddd %ymm18, %ymm17, %ymm19
|
||||||
|
1 1 1.00 kmovw %k5, %k2
|
||||||
|
1 1 1.00 kmovw %k5, %k3
|
||||||
|
1 1 1.00 kmovw %k5, %k1
|
||||||
|
1 0 0.17 vpxord %zmm21, %zmm21, %zmm21
|
||||||
|
1 0 0.17 vpxord %zmm20, %zmm20, %zmm20
|
||||||
|
1 0 0.17 vpxord %zmm22, %zmm22, %zmm22
|
||||||
|
5 21 4.00 * vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2}
|
||||||
|
5 21 4.00 * vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3}
|
||||||
|
5 21 4.00 * vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1}
|
||||||
|
1 4 0.50 vsubpd %zmm21, %zmm1, %zmm18
|
||||||
|
1 4 0.50 vsubpd %zmm20, %zmm2, %zmm17
|
||||||
|
1 4 0.50 vsubpd %zmm22, %zmm0, %zmm19
|
||||||
|
1 4 0.50 vmulpd %zmm18, %zmm18, %zmm31
|
||||||
|
1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm31
|
||||||
|
1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm31
|
||||||
|
3 4 2.00 vrcp14pd %zmm31, %zmm30
|
||||||
|
1 4 1.00 vcmpltpd %zmm14, %zmm31, %k6 {%k5}
|
||||||
|
1 4 1.00 vfpclasspd $30, %zmm30, %k0
|
||||||
|
1 1 0.50 vmovaps %zmm31, %zmm23
|
||||||
|
2 11 0.50 * vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
|
||||||
|
1 1 1.00 knotw %k0, %k4
|
||||||
|
1 4 0.50 vmulpd %zmm23, %zmm23, %zmm24
|
||||||
|
1 4 0.50 vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4}
|
||||||
|
1 4 0.50 vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4}
|
||||||
|
1 4 0.50 vmulpd %zmm13, %zmm30, %zmm25
|
||||||
|
1 4 0.50 vmulpd %zmm12, %zmm30, %zmm27
|
||||||
|
1 4 0.50 vmulpd %zmm25, %zmm30, %zmm28
|
||||||
|
1 4 0.50 vmulpd %zmm28, %zmm30, %zmm26
|
||||||
|
1 4 0.50 vfmsub213pd %zmm5, %zmm28, %zmm30
|
||||||
|
1 4 0.50 vmulpd %zmm27, %zmm26, %zmm29
|
||||||
|
1 4 0.50 vmulpd %zmm30, %zmm29, %zmm23
|
||||||
|
1 4 0.50 vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6}
|
||||||
|
1 4 0.50 vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6}
|
||||||
|
1 4 0.50 vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6}
|
||||||
|
1 1 0.25 cmpq %r14, %r15
|
||||||
|
1 1 0.50 jb ..B1.16
|
||||||
|
|
||||||
|
|
||||||
|
Resources:
|
||||||
|
[0] - SKXDivider
|
||||||
|
[1] - SKXFPDivider
|
||||||
|
[2] - SKXPort0
|
||||||
|
[3] - SKXPort1
|
||||||
|
[4] - SKXPort2
|
||||||
|
[5] - SKXPort3
|
||||||
|
[6] - SKXPort4
|
||||||
|
[7] - SKXPort5
|
||||||
|
[8] - SKXPort6
|
||||||
|
[9] - SKXPort7
|
||||||
|
|
||||||
|
|
||||||
|
Resource pressure per iteration:
|
||||||
|
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
|
||||||
|
- - 19.02 6.79 12.64 13.36 - 16.03 5.16 -
|
||||||
|
|
||||||
|
Resource pressure by instruction:
|
||||||
|
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
|
||||||
|
- - - - - - - 1.00 - - vpcmpgtd %ymm4, %ymm3, %k5
|
||||||
|
- - 0.28 0.72 - - - - - - vpaddd %ymm15, %ymm4, %ymm4
|
||||||
|
- - 0.14 0.71 0.55 0.45 - 0.15 - - vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z}
|
||||||
|
- - - 0.97 - - - 0.03 - - vpaddd %ymm17, %ymm17, %ymm18
|
||||||
|
- - 0.14 0.41 - - - 0.13 0.32 - addq $8, %r15
|
||||||
|
- - - 0.99 - - - 0.01 - - vpaddd %ymm18, %ymm17, %ymm19
|
||||||
|
- - 1.00 - - - - - - - kmovw %k5, %k2
|
||||||
|
- - 1.00 - - - - - - - kmovw %k5, %k3
|
||||||
|
- - 1.00 - - - - - - - kmovw %k5, %k1
|
||||||
|
- - - - - - - - - - vpxord %zmm21, %zmm21, %zmm21
|
||||||
|
- - - - - - - - - - vpxord %zmm20, %zmm20, %zmm20
|
||||||
|
- - - - - - - - - - vpxord %zmm22, %zmm22, %zmm22
|
||||||
|
- - 1.00 0.99 3.52 4.48 - 0.01 1.00 - vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2}
|
||||||
|
- - 1.00 0.99 4.48 3.52 - 0.01 1.00 - vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3}
|
||||||
|
- - 1.00 1.00 3.52 4.48 - - 1.00 - vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1}
|
||||||
|
- - 0.02 - - - - 0.98 - - vsubpd %zmm21, %zmm1, %zmm18
|
||||||
|
- - 0.17 - - - - 0.83 - - vsubpd %zmm20, %zmm2, %zmm17
|
||||||
|
- - 0.18 - - - - 0.82 - - vsubpd %zmm22, %zmm0, %zmm19
|
||||||
|
- - 0.01 - - - - 0.99 - - vmulpd %zmm18, %zmm18, %zmm31
|
||||||
|
- - 0.69 - - - - 0.31 - - vfmadd231pd %zmm17, %zmm17, %zmm31
|
||||||
|
- - 0.68 - - - - 0.32 - - vfmadd231pd %zmm19, %zmm19, %zmm31
|
||||||
|
- - 2.00 - - - - 1.00 - - vrcp14pd %zmm31, %zmm30
|
||||||
|
- - - - - - - 1.00 - - vcmpltpd %zmm14, %zmm31, %k6 {%k5}
|
||||||
|
- - - - - - - 1.00 - - vfpclasspd $30, %zmm30, %k0
|
||||||
|
- - 0.83 - - - - 0.17 - - vmovaps %zmm31, %zmm23
|
||||||
|
- - 1.00 - 0.57 0.43 - - - - vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
|
||||||
|
- - 1.00 - - - - - - - knotw %k0, %k4
|
||||||
|
- - 0.44 - - - - 0.56 - - vmulpd %zmm23, %zmm23, %zmm24
|
||||||
|
- - 0.56 - - - - 0.44 - - vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4}
|
||||||
|
- - 0.55 - - - - 0.45 - - vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4}
|
||||||
|
- - 0.69 - - - - 0.31 - - vmulpd %zmm13, %zmm30, %zmm25
|
||||||
|
- - 0.31 - - - - 0.69 - - vmulpd %zmm12, %zmm30, %zmm27
|
||||||
|
- - 0.56 - - - - 0.44 - - vmulpd %zmm25, %zmm30, %zmm28
|
||||||
|
- - 0.02 - - - - 0.98 - - vmulpd %zmm28, %zmm30, %zmm26
|
||||||
|
- - 0.98 - - - - 0.02 - - vfmsub213pd %zmm5, %zmm28, %zmm30
|
||||||
|
- - 0.30 - - - - 0.70 - - vmulpd %zmm27, %zmm26, %zmm29
|
||||||
|
- - 0.16 - - - - 0.84 - - vmulpd %zmm30, %zmm29, %zmm23
|
||||||
|
- - 0.17 - - - - 0.83 - - vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6}
|
||||||
|
- - 0.83 - - - - 0.17 - - vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6}
|
||||||
|
- - 0.17 - - - - 0.83 - - vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6}
|
||||||
|
- - - 0.01 - - - 0.01 0.98 - cmpq %r14, %r15
|
||||||
|
- - 0.14 - - - - - 0.86 - jb ..B1.16
|
130
static_analysis/jan/analyses/lammps-icc-avx512-mca-icx.out
Normal file
130
static_analysis/jan/analyses/lammps-icc-avx512-mca-icx.out
Normal file
@@ -0,0 +1,130 @@
|
|||||||
|
|
||||||
|
[0] Code Region
|
||||||
|
|
||||||
|
Iterations: 100
|
||||||
|
Instructions: 4200
|
||||||
|
Total Cycles: 2465
|
||||||
|
Total uOps: 5800
|
||||||
|
|
||||||
|
Dispatch Width: 6
|
||||||
|
uOps Per Cycle: 2.35
|
||||||
|
IPC: 1.70
|
||||||
|
Block RThroughput: 13.0
|
||||||
|
|
||||||
|
|
||||||
|
Instruction Info:
|
||||||
|
[1]: #uOps
|
||||||
|
[2]: Latency
|
||||||
|
[3]: RThroughput
|
||||||
|
[4]: MayLoad
|
||||||
|
[5]: MayStore
|
||||||
|
[6]: HasSideEffects (U)
|
||||||
|
|
||||||
|
[1] [2] [3] [4] [5] [6] Instructions:
|
||||||
|
1 4 1.00 vpcmpgtd %ymm4, %ymm3, %k5
|
||||||
|
1 1 0.33 vpaddd %ymm15, %ymm4, %ymm4
|
||||||
|
2 8 0.50 * vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z}
|
||||||
|
1 1 0.33 vpaddd %ymm17, %ymm17, %ymm18
|
||||||
|
1 1 0.25 addq $8, %r15
|
||||||
|
1 1 0.33 vpaddd %ymm18, %ymm17, %ymm19
|
||||||
|
1 1 1.00 kmovw %k5, %k2
|
||||||
|
1 1 1.00 kmovw %k5, %k3
|
||||||
|
1 1 1.00 kmovw %k5, %k1
|
||||||
|
1 0 0.17 vpxord %zmm21, %zmm21, %zmm21
|
||||||
|
1 0 0.17 vpxord %zmm20, %zmm20, %zmm20
|
||||||
|
1 0 0.17 vpxord %zmm22, %zmm22, %zmm22
|
||||||
|
5 21 4.00 * vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2}
|
||||||
|
5 21 4.00 * vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3}
|
||||||
|
5 21 4.00 * vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1}
|
||||||
|
1 4 0.50 vsubpd %zmm21, %zmm1, %zmm18
|
||||||
|
1 4 0.50 vsubpd %zmm20, %zmm2, %zmm17
|
||||||
|
1 4 0.50 vsubpd %zmm22, %zmm0, %zmm19
|
||||||
|
1 4 0.50 vmulpd %zmm18, %zmm18, %zmm31
|
||||||
|
1 4 0.50 vfmadd231pd %zmm17, %zmm17, %zmm31
|
||||||
|
1 4 0.50 vfmadd231pd %zmm19, %zmm19, %zmm31
|
||||||
|
3 4 2.00 vrcp14pd %zmm31, %zmm30
|
||||||
|
1 4 1.00 vcmpltpd %zmm14, %zmm31, %k6 {%k5}
|
||||||
|
1 4 1.00 vfpclasspd $30, %zmm30, %k0
|
||||||
|
1 1 0.50 vmovaps %zmm31, %zmm23
|
||||||
|
2 11 0.50 * vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
|
||||||
|
1 1 1.00 knotw %k0, %k4
|
||||||
|
1 4 0.50 vmulpd %zmm23, %zmm23, %zmm24
|
||||||
|
1 4 0.50 vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4}
|
||||||
|
1 4 0.50 vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4}
|
||||||
|
1 4 0.50 vmulpd %zmm13, %zmm30, %zmm25
|
||||||
|
1 4 0.50 vmulpd %zmm12, %zmm30, %zmm27
|
||||||
|
1 4 0.50 vmulpd %zmm25, %zmm30, %zmm28
|
||||||
|
1 4 0.50 vmulpd %zmm28, %zmm30, %zmm26
|
||||||
|
1 4 0.50 vfmsub213pd %zmm5, %zmm28, %zmm30
|
||||||
|
1 4 0.50 vmulpd %zmm27, %zmm26, %zmm29
|
||||||
|
1 4 0.50 vmulpd %zmm30, %zmm29, %zmm23
|
||||||
|
1 4 0.50 vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6}
|
||||||
|
1 4 0.50 vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6}
|
||||||
|
1 4 0.50 vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6}
|
||||||
|
1 1 0.25 cmpq %r14, %r15
|
||||||
|
1 1 0.50 jb ..B1.16
|
||||||
|
|
||||||
|
|
||||||
|
Resources:
|
||||||
|
[0] - ICXDivider
|
||||||
|
[1] - ICXFPDivider
|
||||||
|
[2] - ICXPort0
|
||||||
|
[3] - ICXPort1
|
||||||
|
[4] - ICXPort2
|
||||||
|
[5] - ICXPort3
|
||||||
|
[6] - ICXPort4
|
||||||
|
[7] - ICXPort5
|
||||||
|
[8] - ICXPort6
|
||||||
|
[9] - ICXPort7
|
||||||
|
[10] - ICXPort8
|
||||||
|
[11] - ICXPort9
|
||||||
|
|
||||||
|
|
||||||
|
Resource pressure per iteration:
|
||||||
|
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11]
|
||||||
|
- - 19.02 6.79 12.64 13.36 - 16.03 5.16 - - -
|
||||||
|
|
||||||
|
Resource pressure by instruction:
|
||||||
|
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions:
|
||||||
|
- - - - - - - 1.00 - - - - vpcmpgtd %ymm4, %ymm3, %k5
|
||||||
|
- - 0.28 0.72 - - - - - - - - vpaddd %ymm15, %ymm4, %ymm4
|
||||||
|
- - 0.14 0.71 0.55 0.45 - 0.15 - - - - vmovdqu32 (%r10,%r15,4), %ymm17 {%k5} {z}
|
||||||
|
- - - 0.97 - - - 0.03 - - - - vpaddd %ymm17, %ymm17, %ymm18
|
||||||
|
- - 0.14 0.41 - - - 0.13 0.32 - - - addq $8, %r15
|
||||||
|
- - - 0.99 - - - 0.01 - - - - vpaddd %ymm18, %ymm17, %ymm19
|
||||||
|
- - 1.00 - - - - - - - - - kmovw %k5, %k2
|
||||||
|
- - 1.00 - - - - - - - - - kmovw %k5, %k3
|
||||||
|
- - 1.00 - - - - - - - - - kmovw %k5, %k1
|
||||||
|
- - - - - - - - - - - - vpxord %zmm21, %zmm21, %zmm21
|
||||||
|
- - - - - - - - - - - - vpxord %zmm20, %zmm20, %zmm20
|
||||||
|
- - - - - - - - - - - - vpxord %zmm22, %zmm22, %zmm22
|
||||||
|
- - 1.00 0.99 3.52 4.48 - 0.01 1.00 - - - vgatherdpd 8(%rbx,%ymm19,8), %zmm21 {%k2}
|
||||||
|
- - 1.00 0.99 4.48 3.52 - 0.01 1.00 - - - vgatherdpd (%rbx,%ymm19,8), %zmm20 {%k3}
|
||||||
|
- - 1.00 1.00 3.52 4.48 - - 1.00 - - - vgatherdpd 16(%rbx,%ymm19,8), %zmm22 {%k1}
|
||||||
|
- - 0.02 - - - - 0.98 - - - - vsubpd %zmm21, %zmm1, %zmm18
|
||||||
|
- - 0.17 - - - - 0.83 - - - - vsubpd %zmm20, %zmm2, %zmm17
|
||||||
|
- - 0.18 - - - - 0.82 - - - - vsubpd %zmm22, %zmm0, %zmm19
|
||||||
|
- - 0.01 - - - - 0.99 - - - - vmulpd %zmm18, %zmm18, %zmm31
|
||||||
|
- - 0.69 - - - - 0.31 - - - - vfmadd231pd %zmm17, %zmm17, %zmm31
|
||||||
|
- - 0.68 - - - - 0.32 - - - - vfmadd231pd %zmm19, %zmm19, %zmm31
|
||||||
|
- - 2.00 - - - - 1.00 - - - - vrcp14pd %zmm31, %zmm30
|
||||||
|
- - - - - - - 1.00 - - - - vcmpltpd %zmm14, %zmm31, %k6 {%k5}
|
||||||
|
- - - - - - - 1.00 - - - - vfpclasspd $30, %zmm30, %k0
|
||||||
|
- - 0.83 - - - - 0.17 - - - - vmovaps %zmm31, %zmm23
|
||||||
|
- - 1.00 - 0.57 0.43 - - - - - - vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
|
||||||
|
- - 1.00 - - - - - - - - - knotw %k0, %k4
|
||||||
|
- - 0.44 - - - - 0.56 - - - - vmulpd %zmm23, %zmm23, %zmm24
|
||||||
|
- - 0.56 - - - - 0.44 - - - - vfmadd213pd %zmm30, %zmm23, %zmm30 {%k4}
|
||||||
|
- - 0.55 - - - - 0.45 - - - - vfmadd213pd %zmm30, %zmm24, %zmm30 {%k4}
|
||||||
|
- - 0.69 - - - - 0.31 - - - - vmulpd %zmm13, %zmm30, %zmm25
|
||||||
|
- - 0.31 - - - - 0.69 - - - - vmulpd %zmm12, %zmm30, %zmm27
|
||||||
|
- - 0.56 - - - - 0.44 - - - - vmulpd %zmm25, %zmm30, %zmm28
|
||||||
|
- - 0.02 - - - - 0.98 - - - - vmulpd %zmm28, %zmm30, %zmm26
|
||||||
|
- - 0.98 - - - - 0.02 - - - - vfmsub213pd %zmm5, %zmm28, %zmm30
|
||||||
|
- - 0.30 - - - - 0.70 - - - - vmulpd %zmm27, %zmm26, %zmm29
|
||||||
|
- - 0.16 - - - - 0.84 - - - - vmulpd %zmm30, %zmm29, %zmm23
|
||||||
|
- - 0.17 - - - - 0.83 - - - - vfmadd231pd %zmm17, %zmm23, %zmm10 {%k6}
|
||||||
|
- - 0.83 - - - - 0.17 - - - - vfmadd231pd %zmm18, %zmm23, %zmm9 {%k6}
|
||||||
|
- - 0.17 - - - - 0.83 - - - - vfmadd231pd %zmm19, %zmm23, %zmm8 {%k6}
|
||||||
|
- - - 0.01 - - - 0.01 0.98 - - - cmpq %r14, %r15
|
||||||
|
- - 0.14 - - - - - 0.86 - - - jb ..B1.16
|
77
static_analysis/jan/analyses/lammps-icc-avx512-osaca-csx.out
Normal file
77
static_analysis/jan/analyses/lammps-icc-avx512-osaca-csx.out
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||||
|
Analyzed file: lammps-icc-avx512.s
|
||||||
|
Architecture: CSX
|
||||||
|
Timestamp: 2023-02-10 16:30:08
|
||||||
|
|
||||||
|
|
||||||
|
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||||
|
* - Instruction micro-ops not bound to a port
|
||||||
|
X - No throughput/latency information for this instruction in data file
|
||||||
|
|
||||||
|
|
||||||
|
Combined Analysis Report
|
||||||
|
------------------------
|
||||||
|
Port pressure in cycles
|
||||||
|
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||||
|
--------------------------------------------------------------------------------------------------
|
||||||
|
200 | | | | | | | | || | | # pointer_increment=64 1303ca335e79351a96cfc07b8b9ec9d4
|
||||||
|
201 | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||||
|
202 | | | | | | | | || | | ..B1.16: # Preds ..B1.16 ..B1.15
|
||||||
|
203 | | | | | | | | || | | # Execution count [2.50e+01]
|
||||||
|
204 | | | | | | 1.00 | | || | | vpcmpgtd %ymm4, %ymm3, %k5 #59.9
|
||||||
|
205 | 0.00 | 1.00 | | | | 0.00 | | || | | vpaddd %ymm15, %ymm4, %ymm4 #59.9
|
||||||
|
206 | 0.00 | 1.00 | 0.50 0.50 | 0.50 0.50 | | 0.00 | | || 0.0 | | vmovdqu32 (%r10,%r15,4), %ymm17{%k5}{z} #60.21
|
||||||
|
207 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm17, %ymm17, %ymm18 #61.36
|
||||||
|
208 | 0.00 | 0.16 | | | | 0.00 | 0.84 | || | | addq $8, %r15 #59.9
|
||||||
|
209 | 0.00 | 1.00 | | | | 0.00 | | || 1.0 | | vpaddd %ymm18, %ymm17, %ymm19 #61.36
|
||||||
|
210 | 1.00 | | | | | | | || | | kmovw %k5, %k2 #61.36
|
||||||
|
211 | 1.00 | | | | | | | || | | kmovw %k5, %k3 #61.36
|
||||||
|
212 | 1.00 | | | | | | | || | | kmovw %k5, %k1 #61.36
|
||||||
|
213 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm21, %zmm21, %zmm21 #61.36
|
||||||
|
214 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm20, %zmm20, %zmm20 #61.36
|
||||||
|
215 | 0.50 | | | | | 0.50 | | || | | vpxord %zmm22, %zmm22, %zmm22 #61.36
|
||||||
|
216 | 1.25 | 0.75 | 5.00 5.00 | 5.00 5.00 | | 0.25 | 0.75 | || 24.0 | | vgatherdpd 8(%rbx,%ymm19,8), %zmm21{%k2} #61.36
|
||||||
|
217 | 1.25 | 0.25 | 5.00 5.00 | 5.00 5.00 | | 0.25 | 1.25 | || | | vgatherdpd (%rbx,%ymm19,8), %zmm20{%k3} #61.36
|
||||||
|
218 | 1.25 | 0.09 | 5.00 5.00 | 5.00 5.00 | | 0.25 | 1.41 | || | | vgatherdpd 16(%rbx,%ymm19,8), %zmm22{%k1} #61.36
|
||||||
|
219 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm21, %zmm1, %zmm18 #62.36
|
||||||
|
220 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm20, %zmm2, %zmm17 #61.36
|
||||||
|
221 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm22, %zmm0, %zmm19 #63.36
|
||||||
|
222 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm18, %zmm18, %zmm31 #64.49
|
||||||
|
223 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm17, %zmm17, %zmm31 #64.49
|
||||||
|
224 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm19, %zmm19, %zmm31 #64.63
|
||||||
|
225 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm31, %zmm30 #75.39
|
||||||
|
226 | | | | | | 1.00 | | || | | vcmppd $1, %zmm14, %zmm31, %k6{%k5} #74.22
|
||||||
|
227 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm30, %k0 #75.39
|
||||||
|
228 | | | | | | | | || | | * vmovaps %zmm31, %zmm23 #75.39
|
||||||
|
229 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23 #75.39
|
||||||
|
230 | 1.00 | | | | | | | || | | knotw %k0, %k4 #75.39
|
||||||
|
231 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm23, %zmm23, %zmm24 #75.39
|
||||||
|
232 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm30, %zmm23, %zmm30{%k4} #75.39
|
||||||
|
233 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd213pd %zmm30, %zmm24, %zmm30{%k4} #75.39
|
||||||
|
234 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm13, %zmm30, %zmm25 #76.38
|
||||||
|
235 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm12, %zmm30, %zmm27 #77.55
|
||||||
|
236 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm25, %zmm30, %zmm28 #76.44
|
||||||
|
237 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm28, %zmm30, %zmm26 #76.50
|
||||||
|
238 | 0.00 | | | | | 1.00 | | || | | vfmsub213pd %zmm5, %zmm28, %zmm30 #77.55
|
||||||
|
239 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm27, %zmm26, %zmm29 #77.64
|
||||||
|
240 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm30, %zmm29, %zmm23 #77.70
|
||||||
|
241 | 0.00 | | | | | 1.00 | | || 4.0 | | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17
|
||||||
|
242 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17
|
||||||
|
243 | 0.00 | | | | | 1.00 | | || | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17
|
||||||
|
244 | 0.00 | 0.00 | | | | 0.00 | 1.00 | || | | cmpq %r14, %r15 #59.9
|
||||||
|
245 | | | | | | | | || | | * jb ..B1.16 # Prob 82% #59.9
|
||||||
|
246 | | | | | | | | || | | # LLVM-MCA-END
|
||||||
|
|
||||||
|
18.8 5.25 16.0 16.0 16.0 16.0 18.8 5.25 86.0 4.0
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Loop-Carried Dependencies Analysis Report
|
||||||
|
-----------------------------------------
|
||||||
|
243 | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17| [243]
|
||||||
|
242 | 4.0 | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17| [242]
|
||||||
|
241 | 4.0 | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17| [241]
|
||||||
|
208 | 1.0 | addq $8, %r15 #59.9| [208]
|
||||||
|
205 | 1.0 | vpaddd %ymm15, %ymm4, %ymm4 #59.9| [205]
|
||||||
|
|
77
static_analysis/jan/analyses/lammps-icc-avx512-osaca-icx.out
Normal file
77
static_analysis/jan/analyses/lammps-icc-avx512-osaca-icx.out
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||||
|
Analyzed file: lammps-icc-avx512.s
|
||||||
|
Architecture: ICX
|
||||||
|
Timestamp: 2023-02-10 16:29:42
|
||||||
|
|
||||||
|
|
||||||
|
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||||
|
* - Instruction micro-ops not bound to a port
|
||||||
|
X - No throughput/latency information for this instruction in data file
|
||||||
|
|
||||||
|
|
||||||
|
Combined Analysis Report
|
||||||
|
------------------------
|
||||||
|
Port pressure in cycles
|
||||||
|
| 0 - 0DV | 1 - 1DV | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 | 8 | 9 || CP | LCD |
|
||||||
|
------------------------------------------------------------------------------------------------------------------------
|
||||||
|
200 | | | | | | | | | | || | | # pointer_increment=64 1303ca335e79351a96cfc07b8b9ec9d4
|
||||||
|
201 | | | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||||
|
202 | | | | | | | | | | || | | ..B1.16: # Preds ..B1.16 ..B1.15
|
||||||
|
203 | | | | | | | | | | || | | # Execution count [2.50e+01]
|
||||||
|
204 | | | | | | 1.000 | | | | || | | vpcmpgtd %ymm4, %ymm3, %k5 #59.9
|
||||||
|
205 | 0.00 | 1.00 | | | | 0.000 | | | | || | | vpaddd %ymm15, %ymm4, %ymm4 #59.9
|
||||||
|
206 | | | 0.50 0.50 | 0.50 0.50 | | | | | | || 5.0 | | vmovdqu32 (%r10,%r15,4), %ymm17{%k5}{z} #60.21
|
||||||
|
207 | 0.00 | 1.00 | | | | 0.000 | | | | || 1.0 | | vpaddd %ymm17, %ymm17, %ymm18 #61.36
|
||||||
|
208 | 0.00 | 0.00 | | | | 0.000 | 1.00 | | | || | | addq $8, %r15 #59.9
|
||||||
|
209 | 0.00 | 1.00 | | | | 0.000 | | | | || 1.0 | | vpaddd %ymm18, %ymm17, %ymm19 #61.36
|
||||||
|
210 | 1.00 | | | | | | | | | || | | kmovw %k5, %k2 #61.36
|
||||||
|
211 | 1.00 | | | | | | | | | || | | kmovw %k5, %k3 #61.36
|
||||||
|
212 | 1.00 | | | | | | | | | || | | kmovw %k5, %k1 #61.36
|
||||||
|
213 | 0.50 | | | | | 0.500 | | | | || | | vpxord %zmm21, %zmm21, %zmm21 #61.36
|
||||||
|
214 | 0.24 | | | | | 0.760 | | | | || | | vpxord %zmm20, %zmm20, %zmm20 #61.36
|
||||||
|
215 | 0.50 | | | | | 0.500 | | | | || | | vpxord %zmm22, %zmm22, %zmm22 #61.36
|
||||||
|
216 | 0.67 | 2.33 | 7.00 7.00 | 7.00 7.00 | | 0.000 | | | | || 24.0 | | vgatherdpd 8(%rbx,%ymm19,8), %zmm21{%k2} #61.36
|
||||||
|
217 | 0.67 | 2.33 | 7.00 7.00 | 7.00 7.00 | | 0.000 | | | | || | | vgatherdpd (%rbx,%ymm19,8), %zmm20{%k3} #61.36
|
||||||
|
218 | 0.67 | 2.33 | 7.00 7.00 | 7.00 7.00 | | 0.000 | | | | || | | vgatherdpd 16(%rbx,%ymm19,8), %zmm22{%k1} #61.36
|
||||||
|
219 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vsubpd %zmm21, %zmm1, %zmm18 #62.36
|
||||||
|
220 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm20, %zmm2, %zmm17 #61.36
|
||||||
|
221 | 0.50 | | | | | 0.500 | | | | || | | vsubpd %zmm22, %zmm0, %zmm19 #63.36
|
||||||
|
222 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm18, %zmm18, %zmm31 #64.49
|
||||||
|
223 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm17, %zmm17, %zmm31 #64.49
|
||||||
|
224 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd231pd %zmm19, %zmm19, %zmm31 #64.63
|
||||||
|
225 | 2.50 | | | | | 0.500 | | | | || 6.0 | | vrcp14pd %zmm31, %zmm30 #75.39
|
||||||
|
226 | | | | | | 1.000 | | | | || | | vcmppd $1, %zmm14, %zmm31, %k6{%k5} #74.22
|
||||||
|
227 | | | | | | 1.000 | | | | || | | vfpclasspd $30, %zmm30, %k0 #75.39
|
||||||
|
228 | 0.50 | | | | | 0.500 | | | | || | | vmovaps %zmm31, %zmm23 #75.39
|
||||||
|
229 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.500 | | | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23 #75.39
|
||||||
|
230 | 1.00 | | | | | | | | | || | | knotw %k0, %k4 #75.39
|
||||||
|
231 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm23, %zmm23, %zmm24 #75.39
|
||||||
|
232 | 0.50 | | | | | 0.500 | | | | || | | vfmadd213pd %zmm30, %zmm23, %zmm30{%k4} #75.39
|
||||||
|
233 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vfmadd213pd %zmm30, %zmm24, %zmm30{%k4} #75.39
|
||||||
|
234 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm13, %zmm30, %zmm25 #76.38
|
||||||
|
235 | 0.50 | | | | | 0.500 | | | | || | | vmulpd %zmm12, %zmm30, %zmm27 #77.55
|
||||||
|
236 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm25, %zmm30, %zmm28 #76.44
|
||||||
|
237 | 0.50 | | | | | 0.500 | | | | || 4.0 | | vmulpd %zmm28, %zmm30, %zmm26 #76.50
|
||||||
|
238 | 0.50 | | | | | 0.500 | | | | || | | vfmsub213pd %zmm5, %zmm28, %zmm30 #77.55
|
||||||
|
239 | 0.25 | | | | | 0.750 | | | | || 4.0 | | vmulpd %zmm27, %zmm26, %zmm29 #77.64
|
||||||
|
240 | 0.00 | | | | | 1.000 | | | | || 4.0 | | vmulpd %zmm30, %zmm29, %zmm23 #77.70
|
||||||
|
241 | 0.00 | | | | | 1.000 | | | | || 4.0 | | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17
|
||||||
|
242 | 0.00 | | | | | 1.000 | | | | || | | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17
|
||||||
|
243 | 0.00 | | | | | 1.000 | | | | || | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17
|
||||||
|
244 | 0.00 | 0.00 | | | | -0.01 | 1.00 | | | || | | cmpq %r14, %r15 #59.9
|
||||||
|
245 | | | | | | | | | | || | | * jb ..B1.16 # Prob 82% #59.9
|
||||||
|
246 | | | | | | | | | | || | | # LLVM-MCA-END
|
||||||
|
|
||||||
|
18.0 9.98 22.0 22.0 22.0 22.0 18.00 2.00 89 4.0
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Loop-Carried Dependencies Analysis Report
|
||||||
|
-----------------------------------------
|
||||||
|
243 | 4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17| [243]
|
||||||
|
242 | 4.0 | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17| [242]
|
||||||
|
241 | 4.0 | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17| [241]
|
||||||
|
208 | 1.0 | addq $8, %r15 #59.9| [208]
|
||||||
|
205 | 1.0 | vpaddd %ymm15, %ymm4, %ymm4 #59.9| [205]
|
||||||
|
|
197
static_analysis/jan/analyses/lammps-icx-avx2zen-mca.out
Normal file
197
static_analysis/jan/analyses/lammps-icx-avx2zen-mca.out
Normal file
@@ -0,0 +1,197 @@
|
|||||||
|
|
||||||
|
[0] Code Region
|
||||||
|
|
||||||
|
Iterations: 100
|
||||||
|
Instructions: 7000
|
||||||
|
Total Cycles: 3866
|
||||||
|
Total uOps: 7900
|
||||||
|
|
||||||
|
Dispatch Width: 6
|
||||||
|
uOps Per Cycle: 2.04
|
||||||
|
IPC: 1.81
|
||||||
|
Block RThroughput: 21.5
|
||||||
|
|
||||||
|
|
||||||
|
Instruction Info:
|
||||||
|
[1]: #uOps
|
||||||
|
[2]: Latency
|
||||||
|
[3]: RThroughput
|
||||||
|
[4]: MayLoad
|
||||||
|
[5]: MayStore
|
||||||
|
[6]: HasSideEffects (U)
|
||||||
|
|
||||||
|
[1] [2] [3] [4] [5] [6] Instructions:
|
||||||
|
1 8 0.50 * vpbroadcastd .LCPI0_1(%rip), %xmm1
|
||||||
|
1 10 0.50 * vpmulld (%r11,%rbp,4), %xmm1, %xmm11
|
||||||
|
2 4 1.50 vpmovsxdq %xmm11, %ymm1
|
||||||
|
1 1 0.50 vpsllq $3, %ymm1, %ymm1
|
||||||
|
1 1 0.25 vpaddq %ymm1, %ymm3, %ymm1
|
||||||
|
1 1 1.00 vmovq %xmm1, %r14
|
||||||
|
2 1 1.00 vpextrq $1, %xmm1, %r9
|
||||||
|
1 4 1.00 vextracti128 $1, %ymm1, %xmm1
|
||||||
|
1 8 0.50 * vmovsd (%r14), %xmm2
|
||||||
|
1 8 0.50 * vpsubd .LCPI0_5, %xmm11, %xmm6
|
||||||
|
2 4 1.50 vpmovsxdq %xmm6, %ymm6
|
||||||
|
1 1 0.50 vpsllq $3, %ymm6, %ymm6
|
||||||
|
1 1 1.00 vmovq %xmm1, %rdi
|
||||||
|
1 1 0.25 vpaddq %ymm6, %ymm3, %ymm6
|
||||||
|
1 1 1.00 vmovq %xmm6, %rcx
|
||||||
|
2 1 1.00 vpextrq $1, %xmm1, %rbx
|
||||||
|
2 1 1.00 vpextrq $1, %xmm6, %rax
|
||||||
|
1 4 1.00 vextracti128 $1, %ymm6, %xmm1
|
||||||
|
1 8 0.50 * vmovsd (%rdi), %xmm6
|
||||||
|
1 1 1.00 vmovq %xmm1, %rdi
|
||||||
|
2 1 1.00 vpextrq $1, %xmm1, %rsi
|
||||||
|
1 8 0.50 * vmovsd (%rdi), %xmm1
|
||||||
|
1 8 0.50 * vmovsd (%rcx), %xmm7
|
||||||
|
1 8 0.50 * vpbroadcastd .LCPI0_2(%rip), %xmm12
|
||||||
|
1 8 0.50 * vmovhpd (%r9), %xmm2, %xmm2
|
||||||
|
1 1 0.25 vpaddd %xmm12, %xmm11, %xmm4
|
||||||
|
2 4 1.50 vpmovsxdq %xmm4, %ymm4
|
||||||
|
1 8 0.50 * vmovhpd (%rax), %xmm7, %xmm7
|
||||||
|
1 1 0.50 vpsllq $3, %ymm4, %ymm4
|
||||||
|
1 1 0.25 vpaddq %ymm4, %ymm3, %ymm4
|
||||||
|
1 8 0.50 * vmovhpd (%rbx), %xmm6, %xmm6
|
||||||
|
2 1 1.00 vpextrq $1, %xmm4, %rax
|
||||||
|
1 8 0.50 * vmovhpd (%rsi), %xmm1, %xmm1
|
||||||
|
1 1 1.00 vmovq %xmm4, %rcx
|
||||||
|
1 4 1.00 vextracti128 $1, %ymm4, %xmm4
|
||||||
|
1 1 1.00 vmovq %xmm4, %rsi
|
||||||
|
1 2 1.00 vinsertf128 $1, %xmm6, %ymm2, %ymm2
|
||||||
|
2 1 1.00 vpextrq $1, %xmm4, %rdi
|
||||||
|
1 8 0.50 * vmovsd (%rsi), %xmm4
|
||||||
|
1 3 0.50 vsubpd %ymm2, %ymm14, %ymm2
|
||||||
|
1 8 0.50 * vmovhpd (%rdi), %xmm4, %xmm4
|
||||||
|
1 8 0.50 * vmovsd (%rcx), %xmm6
|
||||||
|
1 2 1.00 vinsertf128 $1, %xmm1, %ymm7, %ymm1
|
||||||
|
1 8 0.50 * vmovhpd (%rax), %xmm6, %xmm6
|
||||||
|
1 2 1.00 vinsertf128 $1, %xmm4, %ymm6, %ymm4
|
||||||
|
1 3 0.50 vsubpd %ymm1, %ymm5, %ymm1
|
||||||
|
1 3 0.50 vsubpd %ymm4, %ymm10, %ymm4
|
||||||
|
1 3 0.50 vmulpd %ymm2, %ymm2, %ymm6
|
||||||
|
1 4 1.00 vfmadd231pd %ymm1, %ymm1, %ymm6
|
||||||
|
1 4 1.00 vfmadd231pd %ymm4, %ymm4, %ymm6
|
||||||
|
1 8 0.50 * vbroadcastsd .LCPI0_3(%rip), %ymm7
|
||||||
|
1 13 5.00 vdivpd %ymm6, %ymm7, %ymm7
|
||||||
|
1 3 0.50 vmulpd %ymm7, %ymm7, %ymm11
|
||||||
|
1 3 0.50 vmulpd %ymm9, %ymm11, %ymm11
|
||||||
|
1 8 0.50 * vbroadcastsd .LCPI0_4(%rip), %ymm12
|
||||||
|
1 3 0.50 vmulpd %ymm7, %ymm11, %ymm11
|
||||||
|
1 3 0.50 vaddpd %ymm12, %ymm11, %ymm12
|
||||||
|
1 10 0.50 * vmulpd 128(%rsp), %ymm7, %ymm7
|
||||||
|
1 3 0.50 vmulpd %ymm7, %ymm11, %ymm7
|
||||||
|
1 3 0.50 vmulpd %ymm7, %ymm12, %ymm7
|
||||||
|
1 1 0.50 vcmpltpd %ymm8, %ymm6, %ymm6
|
||||||
|
1 4 1.00 vfmadd213pd %ymm0, %ymm7, %ymm2
|
||||||
|
1 1 0.50 vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
|
||||||
|
1 4 1.00 vfmadd213pd %ymm15, %ymm7, %ymm1
|
||||||
|
1 4 1.00 vfmadd213pd %ymm13, %ymm7, %ymm4
|
||||||
|
1 1 0.50 vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
|
||||||
|
1 1 0.50 vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
|
||||||
|
1 1 0.25 addq $4, %rbp
|
||||||
|
1 1 0.25 cmpq %rdx, %rbp
|
||||||
|
1 1 0.50 jb .LBB0_9
|
||||||
|
|
||||||
|
|
||||||
|
Resources:
|
||||||
|
[0] - Zn3AGU0
|
||||||
|
[1] - Zn3AGU1
|
||||||
|
[2] - Zn3AGU2
|
||||||
|
[3] - Zn3ALU0
|
||||||
|
[4] - Zn3ALU1
|
||||||
|
[5] - Zn3ALU2
|
||||||
|
[6] - Zn3ALU3
|
||||||
|
[7] - Zn3BRU1
|
||||||
|
[8] - Zn3FPP0
|
||||||
|
[9] - Zn3FPP1
|
||||||
|
[10] - Zn3FPP2
|
||||||
|
[11] - Zn3FPP3
|
||||||
|
[12.0] - Zn3FPP45
|
||||||
|
[12.1] - Zn3FPP45
|
||||||
|
[13] - Zn3FPSt
|
||||||
|
[14.0] - Zn3LSU
|
||||||
|
[14.1] - Zn3LSU
|
||||||
|
[14.2] - Zn3LSU
|
||||||
|
[15.0] - Zn3Load
|
||||||
|
[15.1] - Zn3Load
|
||||||
|
[15.2] - Zn3Load
|
||||||
|
[16.0] - Zn3Store
|
||||||
|
[16.1] - Zn3Store
|
||||||
|
|
||||||
|
|
||||||
|
Resource pressure per iteration:
|
||||||
|
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
|
||||||
|
- - - 0.60 0.60 0.60 0.60 0.60 16.84 23.53 16.30 7.33 21.50 21.50 - 6.33 6.33 6.34 6.33 6.33 6.34 - -
|
||||||
|
|
||||||
|
Resource pressure by instruction:
|
||||||
|
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
|
||||||
|
- - - - - - - - - 0.03 0.97 - 0.51 0.49 - 0.34 0.33 0.33 0.34 0.33 0.33 - - vpbroadcastd .LCPI0_1(%rip), %xmm1
|
||||||
|
- - - - - - - - 0.65 - - 0.35 0.34 0.66 - 0.49 0.05 0.46 0.49 0.05 0.46 - - vpmulld (%r11,%rbp,4), %xmm1, %xmm11
|
||||||
|
- - - - - - - - - 0.06 2.94 - - - - - - - - - - - - vpmovsxdq %xmm11, %ymm1
|
||||||
|
- - - - - - - - - 0.65 0.35 - - - - - - - - - - - - vpsllq $3, %ymm1, %ymm1
|
||||||
|
- - - - - - - - - - - 1.00 - - - - - - - - - - - vpaddq %ymm1, %ymm3, %ymm1
|
||||||
|
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vmovq %xmm1, %r14
|
||||||
|
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm1, %r9
|
||||||
|
- - - - - - - - 1.00 - - - - - - - - - - - - - - vextracti128 $1, %ymm1, %xmm1
|
||||||
|
- - - - - - - - - - - - 0.50 0.50 - 0.48 0.35 0.17 0.48 0.35 0.17 - - vmovsd (%r14), %xmm2
|
||||||
|
- - - - - - - - 0.01 0.18 0.17 0.64 0.47 0.53 - 0.34 0.33 0.33 0.34 0.33 0.33 - - vpsubd .LCPI0_5, %xmm11, %xmm6
|
||||||
|
- - - - - - - - - 1.92 1.08 - - - - - - - - - - - - vpmovsxdq %xmm6, %ymm6
|
||||||
|
- - - - - - - - - 0.32 0.68 - - - - - - - - - - - - vpsllq $3, %ymm6, %ymm6
|
||||||
|
- - - - - - - - - - - - 1.30 0.70 - - - - - - - - - vmovq %xmm1, %rdi
|
||||||
|
- - - - - - - - - - 0.32 0.68 - - - - - - - - - - - vpaddq %ymm6, %ymm3, %ymm6
|
||||||
|
- - - - - - - - - - - - 1.34 0.66 - - - - - - - - - vmovq %xmm6, %rcx
|
||||||
|
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm1, %rbx
|
||||||
|
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm6, %rax
|
||||||
|
- - - - - - - - 1.00 - - - - - - - - - - - - - - vextracti128 $1, %ymm6, %xmm1
|
||||||
|
- - - - - - - - - - - - 0.50 0.50 - 0.03 0.65 0.32 0.03 0.65 0.32 - - vmovsd (%rdi), %xmm6
|
||||||
|
- - - - - - - - - - - - 0.36 1.64 - - - - - - - - - vmovq %xmm1, %rdi
|
||||||
|
- - - - - - - - - - - - 1.64 0.36 - - - - - - - - - vpextrq $1, %xmm1, %rsi
|
||||||
|
- - - - - - - - - - - - 0.32 0.68 - 0.51 0.33 0.16 0.51 0.33 0.16 - - vmovsd (%rdi), %xmm1
|
||||||
|
- - - - - - - - - - - - 0.68 0.32 - 0.49 0.01 0.50 0.49 0.01 0.50 - - vmovsd (%rcx), %xmm7
|
||||||
|
- - - - - - - - - 0.48 0.52 - 0.67 0.33 - 0.17 0.62 0.21 0.17 0.62 0.21 - - vpbroadcastd .LCPI0_2(%rip), %xmm12
|
||||||
|
- - - - - - - - - 0.01 0.99 - 0.17 0.83 - 0.02 0.64 0.34 0.02 0.64 0.34 - - vmovhpd (%r9), %xmm2, %xmm2
|
||||||
|
- - - - - - - - 0.01 - - 0.99 - - - - - - - - - - - vpaddd %xmm12, %xmm11, %xmm4
|
||||||
|
- - - - - - - - - 0.57 2.43 - - - - - - - - - - - - vpmovsxdq %xmm4, %ymm4
|
||||||
|
- - - - - - - - - 0.34 0.66 - 0.82 0.18 - 0.49 0.35 0.16 0.49 0.35 0.16 - - vmovhpd (%rax), %xmm7, %xmm7
|
||||||
|
- - - - - - - - - 0.34 0.66 - - - - - - - - - - - - vpsllq $3, %ymm4, %ymm4
|
||||||
|
- - - - - - - - - - 0.01 0.99 - - - - - - - - - - - vpaddq %ymm4, %ymm3, %ymm4
|
||||||
|
- - - - - - - - - 0.51 0.49 - 0.49 0.51 - 0.35 0.16 0.49 0.35 0.16 0.49 - - vmovhpd (%rbx), %xmm6, %xmm6
|
||||||
|
- - - - - - - - - - - - 1.04 0.96 - - - - - - - - - vpextrq $1, %xmm4, %rax
|
||||||
|
- - - - - - - - - 0.49 0.51 - 0.17 0.83 - 0.16 0.49 0.35 0.16 0.49 0.35 - - vmovhpd (%rsi), %xmm1, %xmm1
|
||||||
|
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vmovq %xmm4, %rcx
|
||||||
|
- - - - - - - - 1.00 - - - - - - - - - - - - - - vextracti128 $1, %ymm4, %xmm4
|
||||||
|
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vmovq %xmm4, %rsi
|
||||||
|
- - - - - - - - - 1.00 - - - - - - - - - - - - - vinsertf128 $1, %xmm6, %ymm2, %ymm2
|
||||||
|
- - - - - - - - - - - - 1.00 1.00 - - - - - - - - - vpextrq $1, %xmm4, %rdi
|
||||||
|
- - - - - - - - - - - - 0.50 0.50 - 0.49 0.35 0.16 0.49 0.35 0.16 - - vmovsd (%rsi), %xmm4
|
||||||
|
- - - - - - - - - - 0.31 0.69 - - - - - - - - - - - vsubpd %ymm2, %ymm14, %ymm2
|
||||||
|
- - - - - - - - - 0.49 0.51 - 0.48 0.52 - 0.35 0.16 0.49 0.35 0.16 0.49 - - vmovhpd (%rdi), %xmm4, %xmm4
|
||||||
|
- - - - - - - - - - - - 0.52 0.48 - 0.16 0.49 0.35 0.16 0.49 0.35 - - vmovsd (%rcx), %xmm6
|
||||||
|
- - - - - - - - - 1.00 - - - - - - - - - - - - - vinsertf128 $1, %xmm1, %ymm7, %ymm1
|
||||||
|
- - - - - - - - - 0.35 0.65 - 0.50 0.50 - 0.47 0.35 0.18 0.47 0.35 0.18 - - vmovhpd (%rax), %xmm6, %xmm6
|
||||||
|
- - - - - - - - - 1.00 - - - - - - - - - - - - - vinsertf128 $1, %xmm4, %ymm6, %ymm4
|
||||||
|
- - - - - - - - - - 0.33 0.67 - - - - - - - - - - - vsubpd %ymm1, %ymm5, %ymm1
|
||||||
|
- - - - - - - - - - 0.51 0.49 - - - - - - - - - - - vsubpd %ymm4, %ymm10, %ymm4
|
||||||
|
- - - - - - - - 0.52 0.48 - - - - - - - - - - - - - vmulpd %ymm2, %ymm2, %ymm6
|
||||||
|
- - - - - - - - 1.00 1.00 - - - - - - - - - - - - - vfmadd231pd %ymm1, %ymm1, %ymm6
|
||||||
|
- - - - - - - - 1.34 0.66 - - - - - - - - - - - - - vfmadd231pd %ymm4, %ymm4, %ymm6
|
||||||
|
- - - - - - - - - 0.66 0.34 - 0.51 0.49 - 0.19 0.32 0.49 0.19 0.32 0.49 - - vbroadcastsd .LCPI0_3(%rip), %ymm7
|
||||||
|
- - - - - - - - - 5.00 - - - - - - - - - - - - - vdivpd %ymm6, %ymm7, %ymm7
|
||||||
|
- - - - - - - - 0.50 0.50 - - - - - - - - - - - - - vmulpd %ymm7, %ymm7, %ymm11
|
||||||
|
- - - - - - - - 0.99 0.01 - - - - - - - - - - - - - vmulpd %ymm9, %ymm11, %ymm11
|
||||||
|
- - - - - - - - - 0.30 0.70 - 0.49 0.51 - 0.34 0.33 0.33 0.34 0.33 0.33 - - vbroadcastsd .LCPI0_4(%rip), %ymm12
|
||||||
|
- - - - - - - - 0.82 0.18 - - - - - - - - - - - - - vmulpd %ymm7, %ymm11, %ymm11
|
||||||
|
- - - - - - - - - - 0.17 0.83 - - - - - - - - - - - vaddpd %ymm12, %ymm11, %ymm12
|
||||||
|
- - - - - - - - 0.01 0.99 - - 0.18 0.82 - 0.46 0.02 0.52 0.46 0.02 0.52 - - vmulpd 128(%rsp), %ymm7, %ymm7
|
||||||
|
- - - - - - - - 0.99 0.01 - - - - - - - - - - - - - vmulpd %ymm7, %ymm11, %ymm7
|
||||||
|
- - - - - - - - 0.67 0.33 - - - - - - - - - - - - - vmulpd %ymm7, %ymm12, %ymm7
|
||||||
|
- - - - - - - - 1.00 - - - - - - - - - - - - - - vcmpltpd %ymm8, %ymm6, %ymm6
|
||||||
|
- - - - - - - - 1.34 0.66 - - - - - - - - - - - - - vfmadd213pd %ymm0, %ymm7, %ymm2
|
||||||
|
- - - - - - - - 0.66 0.34 - - - - - - - - - - - - - vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
|
||||||
|
- - - - - - - - 0.66 1.34 - - - - - - - - - - - - - vfmadd213pd %ymm15, %ymm7, %ymm1
|
||||||
|
- - - - - - - - 1.34 0.66 - - - - - - - - - - - - - vfmadd213pd %ymm13, %ymm7, %ymm4
|
||||||
|
- - - - - - - - 0.34 0.66 - - - - - - - - - - - - - vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
|
||||||
|
- - - - - - - - 0.99 0.01 - - - - - - - - - - - - - vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
|
||||||
|
- - - - 0.40 0.20 0.40 - - - - - - - - - - - - - - - - addq $4, %rbp
|
||||||
|
- - - 0.20 0.20 0.40 0.20 - - - - - - - - - - - - - - - - cmpq %rdx, %rbp
|
||||||
|
- - - 0.40 - - - 0.60 - - - - - - - - - - - - - - - jb .LBB0_9
|
108
static_analysis/jan/analyses/lammps-icx-avx2zen-osaca.out
Normal file
108
static_analysis/jan/analyses/lammps-icx-avx2zen-osaca.out
Normal file
@@ -0,0 +1,108 @@
|
|||||||
|
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||||
|
Analyzed file: lammps-icx-avx2zen.s
|
||||||
|
Architecture: ZEN3
|
||||||
|
Timestamp: 2023-02-10 16:31:30
|
||||||
|
|
||||||
|
|
||||||
|
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||||
|
* - Instruction micro-ops not bound to a port
|
||||||
|
X - No throughput/latency information for this instruction in data file
|
||||||
|
|
||||||
|
|
||||||
|
Combined Analysis Report
|
||||||
|
------------------------
|
||||||
|
Port pressure in cycles
|
||||||
|
| 0 | 1 | 2 | 3 | DV0 | DV1 | 4 | 5 | 6 | 7 | 8 - 8DV | 9 | 10 | 11 | 12 | 13 || CP | LCD |
|
||||||
|
--------------------------------------------------------------------------------------------------------------------------------------------
|
||||||
|
175 | | | | | | | | | | | | | | | | || | | # pointer_increment=16 e95035fc9e97f63299dd5188a0872bfc
|
||||||
|
176 | | | | | | | | | | | | | | | | || | | # LLVM-MCA-BEGIN
|
||||||
|
177 | | | | | | | | | | | | | | | | || | | .LBB0_9: #
|
||||||
|
178 | | | | | | | | | | | | | | | | || | | # Parent Loop BB0_6 Depth=1
|
||||||
|
179 | | | | | | | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2
|
||||||
|
180 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || 1.0 | | vpbroadcastd .LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
|
||||||
|
181 | 0.00 | | | 1.00 | | | | | | | | | | 0.50 | 0.50 | || 3.0 | | vpmulld (%r11,%rbp,4), %xmm1, %xmm11
|
||||||
|
182 | 0.00 | 0.75 | 0.38 | 0.87 | | | | | | | | | | | | || 4.0 | | vpmovsxdq %xmm11, %ymm1
|
||||||
|
183 | | 0.00 | 1.00 | | | | | | | | | | | | | || 1.0 | | vpsllq $3, %ymm1, %ymm1
|
||||||
|
184 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || 1.0 | | vpaddq %ymm1, %ymm3, %ymm1
|
||||||
|
185 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vmovq %xmm1, %r14
|
||||||
|
186 | 0.12 | 1.88 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm1, %r9
|
||||||
|
187 | | 1.00 | | | | | | | | | | | | | | || 3.0 | | vextracti128 $1, %ymm1, %xmm1
|
||||||
|
188 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%r14), %xmm2 # xmm2 = mem[0],zero
|
||||||
|
189 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vpsubd .LCPI0_5, %xmm11, %xmm6
|
||||||
|
190 | 0.00 | 0.75 | 0.38 | 0.87 | | | | | | | | | | | | || | | vpmovsxdq %xmm6, %ymm6
|
||||||
|
191 | | 0.00 | 1.00 | | | | | | | | | | | | | || | | vpsllq $3, %ymm6, %ymm6
|
||||||
|
192 | 0.00 | 0.00 | 0.49 | 0.51 | | | | | | | | | | | | || | | vmovq %xmm1, %rdi
|
||||||
|
193 | 0.00 | 0.00 | 0.51 | 0.49 | | | | | | | | | | | | || | | vpaddq %ymm6, %ymm3, %ymm6
|
||||||
|
194 | 0.00 | 0.00 | 0.49 | 0.51 | | | | | | | | | | | | || | | vmovq %xmm6, %rcx
|
||||||
|
195 | 0.13 | 1.87 | | | | | | | | | | | | | | || 6.0 | | vpextrq $1, %xmm1, %rbx
|
||||||
|
196 | 0.00 | 2.00 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm6, %rax
|
||||||
|
197 | | 1.00 | | | | | | | | | | | | | | || | | vextracti128 $1, %ymm6, %xmm1
|
||||||
|
198 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rdi), %xmm6 # xmm6 = mem[0],zero
|
||||||
|
199 | 0.00 | 0.00 | 0.00 | 1.00 | | | | | | | | | | | | || | | vmovq %xmm1, %rdi
|
||||||
|
200 | 0.00 | 2.00 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm1, %rsi
|
||||||
|
201 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
|
||||||
|
202 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rcx), %xmm7 # xmm7 = mem[0],zero
|
||||||
|
203 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vpbroadcastd .LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
|
||||||
|
204 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%r9), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0]
|
||||||
|
205 | 0.00 | 0.00 | 0.63 | 0.37 | | | | | | | | | | | | || | | vpaddd %xmm12, %xmm11, %xmm4
|
||||||
|
206 | 0.00 | 0.75 | 0.00 | 1.25 | | | | | | | | | | | | || | | vpmovsxdq %xmm4, %ymm4
|
||||||
|
207 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rax), %xmm7, %xmm7 # xmm7 = xmm7[0],mem[0]
|
||||||
|
208 | | 0.00 | 1.00 | | | | | | | | | | | | | || | | vpsllq $3, %ymm4, %ymm4
|
||||||
|
209 | 0.00 | 0.00 | 0.00 | 1.00 | | | | | | | | | | | | || | | vpaddq %ymm4, %ymm3, %ymm4
|
||||||
|
210 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || 5.0 | | vmovhpd (%rbx), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||||
|
211 | 0.75 | 1.25 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm4, %rax
|
||||||
|
212 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rsi), %xmm1, %xmm1 # xmm1 = xmm1[0],mem[0]
|
||||||
|
213 | 0.00 | 0.00 | 0.50 | 0.50 | | | | | | | | | | | | || | | vmovq %xmm4, %rcx
|
||||||
|
214 | | 1.00 | | | | | | | | | | | | | | || | | vextracti128 $1, %ymm4, %xmm4
|
||||||
|
215 | 0.00 | 0.00 | 0.00 | 1.00 | | | | | | | | | | | | || | | vmovq %xmm4, %rsi
|
||||||
|
216 | | 1.00 | | | | | | | | | | | | | | || 1.0 | | vinsertf128 $1, %xmm6, %ymm2, %ymm2
|
||||||
|
217 | 1.00 | 1.00 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm4, %rdi
|
||||||
|
218 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rsi), %xmm4 # xmm4 = mem[0],zero
|
||||||
|
219 | | | 0.00 | 1.00 | | | | | | | | | | | | || 3.0 | | vsubpd %ymm2, %ymm14, %ymm2
|
||||||
|
220 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rdi), %xmm4, %xmm4 # xmm4 = xmm4[0],mem[0]
|
||||||
|
221 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rcx), %xmm6 # xmm6 = mem[0],zero
|
||||||
|
222 | | 1.00 | | | | | | | | | | | | | | || | | vinsertf128 $1, %xmm1, %ymm7, %ymm1
|
||||||
|
223 | | 0.00 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rax), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||||
|
224 | | 1.00 | | | | | | | | | | | | | | || | | vinsertf128 $1, %xmm4, %ymm6, %ymm4
|
||||||
|
225 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vsubpd %ymm1, %ymm5, %ymm1
|
||||||
|
226 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vsubpd %ymm4, %ymm10, %ymm4
|
||||||
|
227 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm2, %ymm2, %ymm6
|
||||||
|
228 | 1.00 | 0.00 | | | | | | | | | | | | | | || 4.0 | | vfmadd231pd %ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
|
||||||
|
229 | 1.00 | 0.00 | | | | | | | | | | | | | | || 4.0 | | vfmadd231pd %ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
|
||||||
|
230 | 1.00 | | | | | | | | | | | | | 0.50 | 0.50 | || | | vbroadcastsd .LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
|
||||||
|
231 | | | | | 4.50 | 4.50 | | | | | | | | | | || 13.0 | | vdivpd %ymm6, %ymm7, %ymm7
|
||||||
|
232 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm7, %ymm11
|
||||||
|
233 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm9, %ymm11, %ymm11
|
||||||
|
234 | 1.00 | | | | | | | | | | | | | 0.50 | 0.50 | || | | vbroadcastsd .LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
|
||||||
|
235 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm11, %ymm11
|
||||||
|
236 | | | 0.00 | 1.00 | | | | | | | | | | | | || 3.0 | | vaddpd %ymm12, %ymm11, %ymm12
|
||||||
|
237 | 1.00 | 0.00 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
|
||||||
|
238 | 1.00 | 0.00 | | | | | | | | | | | | | | || | | vmulpd %ymm7, %ymm11, %ymm7
|
||||||
|
239 | 1.00 | 0.00 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm12, %ymm7
|
||||||
|
240 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vcmpltpd %ymm8, %ymm6, %ymm6
|
||||||
|
241 | 1.00 | 0.00 | | | | | | | | | | | | | | || 4.0 | | vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
|
||||||
|
242 | 1.00 | 0.00 | | | | | | | | | | | | | | || 1.0 | | vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
|
||||||
|
243 | 1.00 | 0.00 | | | | | | | | | | | | | | || | | vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
|
||||||
|
244 | 1.00 | 0.00 | | | | | | | | | | | | | | || | 4.0 | vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
|
||||||
|
245 | 1.00 | 0.00 | | | | | | | | | | | | | | || | | vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
|
||||||
|
246 | 0.75 | 0.25 | | | | | | | | | | | | | | || | 1.0 | vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
|
||||||
|
247 | | | | | | | | | 0.25 | 0.25 | 0.25 | 0.25 | | | | || | | addq $4, %rbp
|
||||||
|
248 | | | | | | | | | 0.25 | 0.25 | 0.25 | 0.25 | | | | || | | cmpq %rdx, %rbp
|
||||||
|
249 | | | | | | | | | 0.00 | | | | 1.00 | | | || | | jb .LBB0_9
|
||||||
|
250 | | | | | | | | | | | | | | | | || | | # LLVM-MCA-END
|
||||||
|
|
||||||
|
18.8 18.5 15.9 15.9 4.50 4.50 0.50 0.50 0.50 0.50 9.00 9.00 72 5.0
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Loop-Carried Dependencies Analysis Report
|
||||||
|
-----------------------------------------
|
||||||
|
244 | 5.0 | vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13| [244, 246]
|
||||||
|
243 | 5.0 | vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15| [243, 245]
|
||||||
|
241 | 5.0 | vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0| [241, 242]
|
||||||
|
247 | 1.0 | addq $4, %rbp | [247]
|
||||||
|
246 | 1.0 | vblendvpd %ymm6, %ymm4, %ymm13, %ymm13| [246]
|
||||||
|
245 | 1.0 | vblendvpd %ymm6, %ymm1, %ymm15, %ymm15| [245]
|
||||||
|
242 | 1.0 | vblendvpd %ymm6, %ymm2, %ymm0, %ymm0| [242]
|
||||||
|
|
BIN
static_analysis/jan/gromacs-icc-avx512-dp.o
Normal file
BIN
static_analysis/jan/gromacs-icc-avx512-dp.o
Normal file
Binary file not shown.
4334
static_analysis/jan/gromacs-icc-avx512-dp.s
Normal file
4334
static_analysis/jan/gromacs-icc-avx512-dp.s
Normal file
File diff suppressed because it is too large
Load Diff
BIN
static_analysis/jan/gromacs-icc-avx512-sp.o
Normal file
BIN
static_analysis/jan/gromacs-icc-avx512-sp.o
Normal file
Binary file not shown.
4018
static_analysis/jan/gromacs-icc-avx512-sp.s
Normal file
4018
static_analysis/jan/gromacs-icc-avx512-sp.s
Normal file
File diff suppressed because it is too large
Load Diff
BIN
static_analysis/jan/gromacs-icx-avx512-dp.o
Normal file
BIN
static_analysis/jan/gromacs-icx-avx512-dp.o
Normal file
Binary file not shown.
2453
static_analysis/jan/gromacs-icx-avx512-dp.s
Normal file
2453
static_analysis/jan/gromacs-icx-avx512-dp.s
Normal file
File diff suppressed because it is too large
Load Diff
BIN
static_analysis/jan/gromacs-icx-avx512-sp.o
Normal file
BIN
static_analysis/jan/gromacs-icx-avx512-sp.o
Normal file
Binary file not shown.
2013
static_analysis/jan/gromacs-icx-avx512-sp.s
Normal file
2013
static_analysis/jan/gromacs-icx-avx512-sp.s
Normal file
File diff suppressed because it is too large
Load Diff
BIN
static_analysis/jan/lammps-icc-avx2.o
Normal file
BIN
static_analysis/jan/lammps-icc-avx2.o
Normal file
Binary file not shown.
1419
static_analysis/jan/lammps-icc-avx2.s
Normal file
1419
static_analysis/jan/lammps-icc-avx2.s
Normal file
File diff suppressed because it is too large
Load Diff
BIN
static_analysis/jan/lammps-icc-avx512.o
Normal file
BIN
static_analysis/jan/lammps-icc-avx512.o
Normal file
Binary file not shown.
1559
static_analysis/jan/lammps-icc-avx512.s
Normal file
1559
static_analysis/jan/lammps-icc-avx512.s
Normal file
File diff suppressed because it is too large
Load Diff
BIN
static_analysis/jan/lammps-icx-avx2zen.o
Normal file
BIN
static_analysis/jan/lammps-icx-avx2zen.o
Normal file
Binary file not shown.
640
static_analysis/jan/lammps-icx-avx2zen.s
Normal file
640
static_analysis/jan/lammps-icx-avx2zen.s
Normal file
@@ -0,0 +1,640 @@
|
|||||||
|
.text
|
||||||
|
.file "force_lj.c"
|
||||||
|
.section .rodata.cst8,"aM",@progbits,8
|
||||||
|
.p2align 3 # -- Begin function computeForceLJFullNeigh_plain_c
|
||||||
|
.LCPI0_0:
|
||||||
|
.quad 4631952216750555136 # 48
|
||||||
|
.LCPI0_3:
|
||||||
|
.quad 4607182418800017408 # 1
|
||||||
|
.LCPI0_4:
|
||||||
|
.quad -4620693217682128896 # -0.5
|
||||||
|
.section .rodata.cst4,"aM",@progbits,4
|
||||||
|
.p2align 2
|
||||||
|
.LCPI0_1:
|
||||||
|
.long 3 # 0x3
|
||||||
|
.LCPI0_2:
|
||||||
|
.long 2 # 0x2
|
||||||
|
.section .rodata.cst16,"aM",@progbits,16
|
||||||
|
.p2align 4
|
||||||
|
.LCPI0_5:
|
||||||
|
.zero 16,255
|
||||||
|
.text
|
||||||
|
.globl computeForceLJFullNeigh_plain_c
|
||||||
|
.p2align 4, 0x90
|
||||||
|
.type computeForceLJFullNeigh_plain_c,@function
|
||||||
|
computeForceLJFullNeigh_plain_c: #
|
||||||
|
.LcomputeForceLJFullNeigh_plain_c$local:
|
||||||
|
.cfi_startproc
|
||||||
|
# %bb.0: #
|
||||||
|
pushq %rbp
|
||||||
|
.cfi_def_cfa_offset 16
|
||||||
|
pushq %r15
|
||||||
|
.cfi_def_cfa_offset 24
|
||||||
|
pushq %r14
|
||||||
|
.cfi_def_cfa_offset 32
|
||||||
|
pushq %r13
|
||||||
|
.cfi_def_cfa_offset 40
|
||||||
|
pushq %r12
|
||||||
|
.cfi_def_cfa_offset 48
|
||||||
|
pushq %rbx
|
||||||
|
.cfi_def_cfa_offset 56
|
||||||
|
subq $264, %rsp # imm = 0x108
|
||||||
|
.cfi_def_cfa_offset 320
|
||||||
|
.cfi_offset %rbx, -56
|
||||||
|
.cfi_offset %r12, -48
|
||||||
|
.cfi_offset %r13, -40
|
||||||
|
.cfi_offset %r14, -32
|
||||||
|
.cfi_offset %r15, -24
|
||||||
|
.cfi_offset %rbp, -16
|
||||||
|
movq %rcx, %rbx
|
||||||
|
movq %rdx, %r15
|
||||||
|
movq %rsi, %r12
|
||||||
|
movl 4(%rsi), %r14d
|
||||||
|
vmovsd 144(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||||
|
vmovsd %xmm0, (%rsp) # 8-byte Spill
|
||||||
|
vmovsd 40(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||||
|
vmovsd %xmm0, 128(%rsp) # 8-byte Spill
|
||||||
|
vmovq 56(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||||
|
vmovdqa %xmm0, 80(%rsp) # 16-byte Spill
|
||||||
|
testl %r14d, %r14d
|
||||||
|
jle .LBB0_2
|
||||||
|
# %bb.1: #
|
||||||
|
movq 64(%r12), %rdi
|
||||||
|
leaq (,%r14,8), %rax
|
||||||
|
leaq (%rax,%rax,2), %rdx
|
||||||
|
xorl %esi, %esi
|
||||||
|
callq _intel_fast_memset
|
||||||
|
.LBB0_2: #
|
||||||
|
xorl %eax, %eax
|
||||||
|
callq getTimeStamp
|
||||||
|
vmovq %xmm0, 32(%rsp) # 8-byte Folded Spill
|
||||||
|
movl $.L.str, %edi
|
||||||
|
callq likwid_markerStartRegion
|
||||||
|
testl %r14d, %r14d
|
||||||
|
jle .LBB0_19
|
||||||
|
# %bb.3: #
|
||||||
|
vmovsd (%rsp), %xmm0 # 8-byte Reload
|
||||||
|
# xmm0 = mem[0],zero
|
||||||
|
vmulsd %xmm0, %xmm0, %xmm13
|
||||||
|
movq 16(%r15), %r11
|
||||||
|
movq 24(%r15), %rsi
|
||||||
|
movslq 8(%r15), %rdi
|
||||||
|
movq 16(%r12), %r15
|
||||||
|
movq 64(%r12), %r8
|
||||||
|
vmovsd 128(%rsp), %xmm0 # 8-byte Reload
|
||||||
|
# xmm0 = mem[0],zero
|
||||||
|
vmulsd .LCPI0_0(%rip), %xmm0, %xmm15
|
||||||
|
movq %rbx, 24(%rsp) # 8-byte Spill
|
||||||
|
vmovdqu (%rbx), %xmm14
|
||||||
|
decq %r14
|
||||||
|
vmovq %r15, %xmm0
|
||||||
|
vpbroadcastq %xmm0, %ymm3
|
||||||
|
vbroadcastsd %xmm13, %ymm2
|
||||||
|
vmovapd 80(%rsp), %xmm12 # 16-byte Reload
|
||||||
|
vbroadcastsd %xmm12, %ymm8
|
||||||
|
vbroadcastsd %xmm15, %ymm9
|
||||||
|
shlq $2, %rdi
|
||||||
|
xorl %r10d, %r10d
|
||||||
|
movq %r14, 56(%rsp) # 8-byte Spill
|
||||||
|
vmovapd %xmm13, 192(%rsp) # 16-byte Spill
|
||||||
|
movq %rsi, 48(%rsp) # 8-byte Spill
|
||||||
|
movq %rdi, 40(%rsp) # 8-byte Spill
|
||||||
|
vmovapd %xmm15, 176(%rsp) # 16-byte Spill
|
||||||
|
vmovupd %ymm2, 224(%rsp) # 32-byte Spill
|
||||||
|
vmovupd %ymm9, 128(%rsp) # 32-byte Spill
|
||||||
|
jmp .LBB0_6
|
||||||
|
.p2align 4, 0x90
|
||||||
|
.LBB0_17: #
|
||||||
|
# in Loop: Header=BB0_6 Depth=1
|
||||||
|
movq %r13, %rdx
|
||||||
|
.LBB0_5: #
|
||||||
|
# in Loop: Header=BB0_6 Depth=1
|
||||||
|
vaddsd (%r8,%r12,8), %xmm10, %xmm0
|
||||||
|
vmovsd %xmm0, (%r8,%r12,8)
|
||||||
|
vaddsd (%r8,%rbx,8), %xmm11, %xmm0
|
||||||
|
vmovsd %xmm0, (%r8,%rbx,8)
|
||||||
|
vaddsd (%r8,%rbp,8), %xmm5, %xmm0
|
||||||
|
vmovsd %xmm0, (%r8,%rbp,8)
|
||||||
|
leal 3(%r13), %eax
|
||||||
|
addl $6, %r13d
|
||||||
|
testl %eax, %eax
|
||||||
|
cmovnsl %eax, %r13d
|
||||||
|
sarl $2, %r13d
|
||||||
|
movslq %r13d, %rax
|
||||||
|
vmovq %rax, %xmm0
|
||||||
|
vmovq %rdx, %xmm1
|
||||||
|
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
|
||||||
|
vpaddq %xmm0, %xmm14, %xmm14
|
||||||
|
addq %rdi, %r11
|
||||||
|
cmpq %r14, %r10
|
||||||
|
leaq 1(%r10), %r10
|
||||||
|
je .LBB0_18
|
||||||
|
.LBB0_6: #
|
||||||
|
# =>This Loop Header: Depth=1
|
||||||
|
# Child Loop BB0_9 Depth 2
|
||||||
|
# Child Loop BB0_13 Depth 2
|
||||||
|
movl (%rsi,%r10,4), %r13d
|
||||||
|
leal (%r10,%r10,2), %r12d
|
||||||
|
leal (%r10,%r10,2), %ebx
|
||||||
|
incl %ebx
|
||||||
|
leal (%r10,%r10,2), %ebp
|
||||||
|
addl $2, %ebp
|
||||||
|
testl %r13d, %r13d
|
||||||
|
jle .LBB0_4
|
||||||
|
# %bb.7: #
|
||||||
|
# in Loop: Header=BB0_6 Depth=1
|
||||||
|
vmovsd (%r15,%r12,8), %xmm0 # xmm0 = mem[0],zero
|
||||||
|
vmovsd (%r15,%rbx,8), %xmm1 # xmm1 = mem[0],zero
|
||||||
|
vmovsd (%r15,%rbp,8), %xmm2 # xmm2 = mem[0],zero
|
||||||
|
movq %r13, %rdx
|
||||||
|
movl $4294967292, %eax # imm = 0xFFFFFFFC
|
||||||
|
andq %rax, %rdx
|
||||||
|
vmovapd %xmm0, 112(%rsp) # 16-byte Spill
|
||||||
|
vmovapd %xmm1, 96(%rsp) # 16-byte Spill
|
||||||
|
vmovapd %xmm2, (%rsp) # 16-byte Spill
|
||||||
|
je .LBB0_16
|
||||||
|
# %bb.8: #
|
||||||
|
# in Loop: Header=BB0_6 Depth=1
|
||||||
|
movq %rbp, 64(%rsp) # 8-byte Spill
|
||||||
|
movq %rbx, 72(%rsp) # 8-byte Spill
|
||||||
|
vmovdqa %xmm14, 208(%rsp) # 16-byte Spill
|
||||||
|
vbroadcastsd %xmm0, %ymm14
|
||||||
|
vbroadcastsd %xmm1, %ymm5
|
||||||
|
vbroadcastsd %xmm2, %ymm10
|
||||||
|
vxorpd %xmm0, %xmm0, %xmm0
|
||||||
|
vxorpd %xmm15, %xmm15, %xmm15
|
||||||
|
vxorpd %xmm13, %xmm13, %xmm13
|
||||||
|
xorl %ebp, %ebp
|
||||||
|
vmovapd %ymm8, %ymm9
|
||||||
|
vmovupd 224(%rsp), %ymm8 # 32-byte Reload
|
||||||
|
.p2align 4, 0x90
|
||||||
|
movl $111, %ebx # OSACA START MARKER
|
||||||
|
.byte 100 # OSACA START MARKER
|
||||||
|
.byte 103 # OSACA START MARKER
|
||||||
|
.byte 144 # OSACA START MARKER
|
||||||
|
# pointer_increment=16 e95035fc9e97f63299dd5188a0872bfc
|
||||||
|
# LLVM-MCA-BEGIN
|
||||||
|
.LBB0_9: #
|
||||||
|
# Parent Loop BB0_6 Depth=1
|
||||||
|
# => This Inner Loop Header: Depth=2
|
||||||
|
vpbroadcastd .LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
|
||||||
|
vpmulld (%r11,%rbp,4), %xmm1, %xmm11
|
||||||
|
vpmovsxdq %xmm11, %ymm1
|
||||||
|
vpsllq $3, %ymm1, %ymm1
|
||||||
|
vpaddq %ymm1, %ymm3, %ymm1
|
||||||
|
vmovq %xmm1, %r14
|
||||||
|
vpextrq $1, %xmm1, %r9
|
||||||
|
vextracti128 $1, %ymm1, %xmm1
|
||||||
|
vmovsd (%r14), %xmm2 # xmm2 = mem[0],zero
|
||||||
|
vpsubd .LCPI0_5, %xmm11, %xmm6
|
||||||
|
vpmovsxdq %xmm6, %ymm6
|
||||||
|
vpsllq $3, %ymm6, %ymm6
|
||||||
|
vmovq %xmm1, %rdi
|
||||||
|
vpaddq %ymm6, %ymm3, %ymm6
|
||||||
|
vmovq %xmm6, %rcx
|
||||||
|
vpextrq $1, %xmm1, %rbx
|
||||||
|
vpextrq $1, %xmm6, %rax
|
||||||
|
vextracti128 $1, %ymm6, %xmm1
|
||||||
|
vmovsd (%rdi), %xmm6 # xmm6 = mem[0],zero
|
||||||
|
vmovq %xmm1, %rdi
|
||||||
|
vpextrq $1, %xmm1, %rsi
|
||||||
|
vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
|
||||||
|
vmovsd (%rcx), %xmm7 # xmm7 = mem[0],zero
|
||||||
|
vpbroadcastd .LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
|
||||||
|
vmovhpd (%r9), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0]
|
||||||
|
vpaddd %xmm12, %xmm11, %xmm4
|
||||||
|
vpmovsxdq %xmm4, %ymm4
|
||||||
|
vmovhpd (%rax), %xmm7, %xmm7 # xmm7 = xmm7[0],mem[0]
|
||||||
|
vpsllq $3, %ymm4, %ymm4
|
||||||
|
vpaddq %ymm4, %ymm3, %ymm4
|
||||||
|
vmovhpd (%rbx), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||||
|
vpextrq $1, %xmm4, %rax
|
||||||
|
vmovhpd (%rsi), %xmm1, %xmm1 # xmm1 = xmm1[0],mem[0]
|
||||||
|
vmovq %xmm4, %rcx
|
||||||
|
vextracti128 $1, %ymm4, %xmm4
|
||||||
|
vmovq %xmm4, %rsi
|
||||||
|
vinsertf128 $1, %xmm6, %ymm2, %ymm2
|
||||||
|
vpextrq $1, %xmm4, %rdi
|
||||||
|
vmovsd (%rsi), %xmm4 # xmm4 = mem[0],zero
|
||||||
|
vsubpd %ymm2, %ymm14, %ymm2
|
||||||
|
vmovhpd (%rdi), %xmm4, %xmm4 # xmm4 = xmm4[0],mem[0]
|
||||||
|
vmovsd (%rcx), %xmm6 # xmm6 = mem[0],zero
|
||||||
|
vinsertf128 $1, %xmm1, %ymm7, %ymm1
|
||||||
|
vmovhpd (%rax), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||||
|
vinsertf128 $1, %xmm4, %ymm6, %ymm4
|
||||||
|
vsubpd %ymm1, %ymm5, %ymm1
|
||||||
|
vsubpd %ymm4, %ymm10, %ymm4
|
||||||
|
vmulpd %ymm2, %ymm2, %ymm6
|
||||||
|
vfmadd231pd %ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
|
||||||
|
vfmadd231pd %ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
|
||||||
|
vbroadcastsd .LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
|
||||||
|
vdivpd %ymm6, %ymm7, %ymm7
|
||||||
|
vmulpd %ymm7, %ymm7, %ymm11
|
||||||
|
vmulpd %ymm9, %ymm11, %ymm11
|
||||||
|
vbroadcastsd .LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
|
||||||
|
vmulpd %ymm7, %ymm11, %ymm11
|
||||||
|
vaddpd %ymm12, %ymm11, %ymm12
|
||||||
|
vmulpd 128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
|
||||||
|
vmulpd %ymm7, %ymm11, %ymm7
|
||||||
|
vmulpd %ymm7, %ymm12, %ymm7
|
||||||
|
vcmpltpd %ymm8, %ymm6, %ymm6
|
||||||
|
vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
|
||||||
|
vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
|
||||||
|
vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
|
||||||
|
vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
|
||||||
|
vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
|
||||||
|
vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
|
||||||
|
addq $4, %rbp
|
||||||
|
cmpq %rdx, %rbp
|
||||||
|
jb .LBB0_9
|
||||||
|
# LLVM-MCA-END
|
||||||
|
movl $222, %ebx # OSACA END MARKER
|
||||||
|
.byte 100 # OSACA END MARKER
|
||||||
|
.byte 103 # OSACA END MARKER
|
||||||
|
.byte 144 # OSACA END MARKER
|
||||||
|
# %bb.10: #
|
||||||
|
# in Loop: Header=BB0_6 Depth=1
|
||||||
|
vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
|
||||||
|
vaddsd %xmm1, %xmm0, %xmm1
|
||||||
|
vextractf128 $1, %ymm0, %xmm0
|
||||||
|
vaddsd %xmm0, %xmm1, %xmm1
|
||||||
|
vpermilpd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0]
|
||||||
|
vaddsd %xmm0, %xmm1, %xmm10
|
||||||
|
vpermilpd $1, %xmm15, %xmm1 # xmm1 = xmm15[1,0]
|
||||||
|
vaddsd %xmm1, %xmm15, %xmm1
|
||||||
|
vextractf128 $1, %ymm15, %xmm2
|
||||||
|
vaddsd %xmm2, %xmm1, %xmm1
|
||||||
|
vpermilpd $1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
|
||||||
|
vaddsd %xmm2, %xmm1, %xmm11
|
||||||
|
vpermilpd $1, %xmm13, %xmm1 # xmm1 = xmm13[1,0]
|
||||||
|
vaddsd %xmm1, %xmm13, %xmm1
|
||||||
|
vextractf128 $1, %ymm13, %xmm2
|
||||||
|
vaddsd %xmm2, %xmm1, %xmm1
|
||||||
|
vpermilpd $1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
|
||||||
|
vaddsd %xmm2, %xmm1, %xmm5
|
||||||
|
movq 56(%rsp), %r14 # 8-byte Reload
|
||||||
|
vmovapd 80(%rsp), %xmm12 # 16-byte Reload
|
||||||
|
vmovapd 192(%rsp), %xmm13 # 16-byte Reload
|
||||||
|
movq 48(%rsp), %rsi # 8-byte Reload
|
||||||
|
movq 40(%rsp), %rdi # 8-byte Reload
|
||||||
|
vmovdqa 208(%rsp), %xmm14 # 16-byte Reload
|
||||||
|
vmovapd 176(%rsp), %xmm15 # 16-byte Reload
|
||||||
|
vmovapd %ymm9, %ymm8
|
||||||
|
movq 72(%rsp), %rbx # 8-byte Reload
|
||||||
|
movq 64(%rsp), %rbp # 8-byte Reload
|
||||||
|
vmovapd 112(%rsp), %xmm0 # 16-byte Reload
|
||||||
|
cmpq %r13, %rdx
|
||||||
|
jae .LBB0_17
|
||||||
|
jmp .LBB0_11
|
||||||
|
.p2align 4, 0x90
|
||||||
|
.LBB0_4: #
|
||||||
|
# in Loop: Header=BB0_6 Depth=1
|
||||||
|
movslq %r13d, %rdx
|
||||||
|
vxorpd %xmm5, %xmm5, %xmm5
|
||||||
|
vxorpd %xmm11, %xmm11, %xmm11
|
||||||
|
vxorpd %xmm10, %xmm10, %xmm10
|
||||||
|
jmp .LBB0_5
|
||||||
|
.p2align 4, 0x90
|
||||||
|
.LBB0_16: #
|
||||||
|
# in Loop: Header=BB0_6 Depth=1
|
||||||
|
vxorpd %xmm10, %xmm10, %xmm10
|
||||||
|
vxorpd %xmm11, %xmm11, %xmm11
|
||||||
|
vxorpd %xmm5, %xmm5, %xmm5
|
||||||
|
cmpq %r13, %rdx
|
||||||
|
jae .LBB0_17
|
||||||
|
.LBB0_11: #
|
||||||
|
# in Loop: Header=BB0_6 Depth=1
|
||||||
|
vmovapd 96(%rsp), %xmm4 # 16-byte Reload
|
||||||
|
jmp .LBB0_13
|
||||||
|
.p2align 4, 0x90
|
||||||
|
.LBB0_12: #
|
||||||
|
# in Loop: Header=BB0_13 Depth=2
|
||||||
|
incq %rdx
|
||||||
|
cmpq %rdx, %r13
|
||||||
|
je .LBB0_17
|
||||||
|
.LBB0_13: #
|
||||||
|
# Parent Loop BB0_6 Depth=1
|
||||||
|
# => This Inner Loop Header: Depth=2
|
||||||
|
movl (%r11,%rdx,4), %eax
|
||||||
|
leal (%rax,%rax,2), %ecx
|
||||||
|
movslq %ecx, %rcx
|
||||||
|
vsubsd (%r15,%rcx,8), %xmm0, %xmm6
|
||||||
|
leal (%rax,%rax,2), %ecx
|
||||||
|
incl %ecx
|
||||||
|
movslq %ecx, %rcx
|
||||||
|
vsubsd (%r15,%rcx,8), %xmm4, %xmm2
|
||||||
|
leal 2(%rax,%rax,2), %eax
|
||||||
|
cltq
|
||||||
|
vmovapd (%rsp), %xmm1 # 16-byte Reload
|
||||||
|
vsubsd (%r15,%rax,8), %xmm1, %xmm1
|
||||||
|
vmulsd %xmm6, %xmm6, %xmm7
|
||||||
|
vfmadd231sd %xmm2, %xmm2, %xmm7 # xmm7 = (xmm2 * xmm2) + xmm7
|
||||||
|
vfmadd231sd %xmm1, %xmm1, %xmm7 # xmm7 = (xmm1 * xmm1) + xmm7
|
||||||
|
vucomisd %xmm13, %xmm7
|
||||||
|
jae .LBB0_12
|
||||||
|
# %bb.14: #
|
||||||
|
# in Loop: Header=BB0_13 Depth=2
|
||||||
|
vmovsd .LCPI0_3(%rip), %xmm0 # xmm0 = mem[0],zero
|
||||||
|
vdivsd %xmm7, %xmm0, %xmm7
|
||||||
|
vmulsd %xmm7, %xmm7, %xmm0
|
||||||
|
vmulsd %xmm0, %xmm12, %xmm0
|
||||||
|
vmulsd %xmm7, %xmm0, %xmm0
|
||||||
|
vaddsd .LCPI0_4(%rip), %xmm0, %xmm4
|
||||||
|
vmulsd %xmm7, %xmm15, %xmm7
|
||||||
|
vmulsd %xmm0, %xmm7, %xmm0
|
||||||
|
vmulsd %xmm4, %xmm0, %xmm0
|
||||||
|
vmovapd 96(%rsp), %xmm4 # 16-byte Reload
|
||||||
|
vfmadd231sd %xmm6, %xmm0, %xmm10 # xmm10 = (xmm0 * xmm6) + xmm10
|
||||||
|
vfmadd231sd %xmm2, %xmm0, %xmm11 # xmm11 = (xmm0 * xmm2) + xmm11
|
||||||
|
vfmadd231sd %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5
|
||||||
|
vmovapd 112(%rsp), %xmm0 # 16-byte Reload
|
||||||
|
jmp .LBB0_12
|
||||||
|
.LBB0_18: #
|
||||||
|
movq 24(%rsp), %rax # 8-byte Reload
|
||||||
|
vmovdqu %xmm14, (%rax)
|
||||||
|
.LBB0_19: #
|
||||||
|
movl $.L.str, %edi
|
||||||
|
vzeroupper
|
||||||
|
callq likwid_markerStopRegion
|
||||||
|
xorl %eax, %eax
|
||||||
|
callq getTimeStamp
|
||||||
|
vsubsd 32(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload
|
||||||
|
addq $264, %rsp # imm = 0x108
|
||||||
|
.cfi_def_cfa_offset 56
|
||||||
|
popq %rbx
|
||||||
|
.cfi_def_cfa_offset 48
|
||||||
|
popq %r12
|
||||||
|
.cfi_def_cfa_offset 40
|
||||||
|
popq %r13
|
||||||
|
.cfi_def_cfa_offset 32
|
||||||
|
popq %r14
|
||||||
|
.cfi_def_cfa_offset 24
|
||||||
|
popq %r15
|
||||||
|
.cfi_def_cfa_offset 16
|
||||||
|
popq %rbp
|
||||||
|
.cfi_def_cfa_offset 8
|
||||||
|
retq
|
||||||
|
.Lfunc_end0:
|
||||||
|
.size computeForceLJFullNeigh_plain_c, .Lfunc_end0-computeForceLJFullNeigh_plain_c
|
||||||
|
.cfi_endproc
|
||||||
|
# -- End function
|
||||||
|
.section .rodata.cst8,"aM",@progbits,8
|
||||||
|
.p2align 3 # -- Begin function computeForceLJHalfNeigh
|
||||||
|
.LCPI1_0:
|
||||||
|
.quad 4631952216750555136 # 48
|
||||||
|
.LCPI1_1:
|
||||||
|
.quad 4607182418800017408 # 1
|
||||||
|
.LCPI1_2:
|
||||||
|
.quad -4620693217682128896 # -0.5
|
||||||
|
.text
|
||||||
|
.globl computeForceLJHalfNeigh
|
||||||
|
.p2align 4, 0x90
|
||||||
|
.type computeForceLJHalfNeigh,@function
|
||||||
|
computeForceLJHalfNeigh: #
|
||||||
|
.LcomputeForceLJHalfNeigh$local:
|
||||||
|
.cfi_startproc
|
||||||
|
# %bb.0: #
|
||||||
|
pushq %rbp
|
||||||
|
.cfi_def_cfa_offset 16
|
||||||
|
pushq %r15
|
||||||
|
.cfi_def_cfa_offset 24
|
||||||
|
pushq %r14
|
||||||
|
.cfi_def_cfa_offset 32
|
||||||
|
pushq %r13
|
||||||
|
.cfi_def_cfa_offset 40
|
||||||
|
pushq %r12
|
||||||
|
.cfi_def_cfa_offset 48
|
||||||
|
pushq %rbx
|
||||||
|
.cfi_def_cfa_offset 56
|
||||||
|
subq $40, %rsp
|
||||||
|
.cfi_def_cfa_offset 96
|
||||||
|
.cfi_offset %rbx, -56
|
||||||
|
.cfi_offset %r12, -48
|
||||||
|
.cfi_offset %r13, -40
|
||||||
|
.cfi_offset %r14, -32
|
||||||
|
.cfi_offset %r15, -24
|
||||||
|
.cfi_offset %rbp, -16
|
||||||
|
movq %rcx, 16(%rsp) # 8-byte Spill
|
||||||
|
movq %rdx, %r15
|
||||||
|
movq %rsi, %r12
|
||||||
|
movl 4(%rsi), %r13d
|
||||||
|
vmovsd 144(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||||
|
vmovsd %xmm0, 8(%rsp) # 8-byte Spill
|
||||||
|
vmovsd 40(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||||
|
vmovsd %xmm0, (%rsp) # 8-byte Spill
|
||||||
|
vmovsd 56(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||||
|
vmovsd %xmm0, 32(%rsp) # 8-byte Spill
|
||||||
|
testl %r13d, %r13d
|
||||||
|
jle .LBB1_2
|
||||||
|
# %bb.1: #
|
||||||
|
movq 64(%r12), %rdi
|
||||||
|
leaq (,%r13,8), %rax
|
||||||
|
leaq (%rax,%rax,2), %rdx
|
||||||
|
xorl %esi, %esi
|
||||||
|
callq _intel_fast_memset
|
||||||
|
.LBB1_2: #
|
||||||
|
xorl %eax, %eax
|
||||||
|
callq getTimeStamp
|
||||||
|
vmovsd %xmm0, 24(%rsp) # 8-byte Spill
|
||||||
|
movl $.L.str.1, %edi
|
||||||
|
callq likwid_markerStartRegion
|
||||||
|
testl %r13d, %r13d
|
||||||
|
jle .LBB1_8
|
||||||
|
# %bb.3: #
|
||||||
|
vmovsd 8(%rsp), %xmm0 # 8-byte Reload
|
||||||
|
# xmm0 = mem[0],zero
|
||||||
|
vmulsd %xmm0, %xmm0, %xmm12
|
||||||
|
movq 16(%r15), %rax
|
||||||
|
movq 24(%r15), %rcx
|
||||||
|
movq %rcx, 8(%rsp) # 8-byte Spill
|
||||||
|
movslq 8(%r15), %rdx
|
||||||
|
movq 16(%r12), %rsi
|
||||||
|
movq 64(%r12), %rdi
|
||||||
|
vmovsd (%rsp), %xmm0 # 8-byte Reload
|
||||||
|
# xmm0 = mem[0],zero
|
||||||
|
vmulsd .LCPI1_0(%rip), %xmm0, %xmm11
|
||||||
|
movq 16(%rsp), %rcx # 8-byte Reload
|
||||||
|
vmovdqu (%rcx), %xmm10
|
||||||
|
shlq $2, %rdx
|
||||||
|
movq %rdx, (%rsp) # 8-byte Spill
|
||||||
|
xorl %r12d, %r12d
|
||||||
|
jmp .LBB1_4
|
||||||
|
.p2align 4, 0x90
|
||||||
|
.LBB1_5: #
|
||||||
|
# in Loop: Header=BB1_4 Depth=1
|
||||||
|
vxorpd %xmm13, %xmm13, %xmm13
|
||||||
|
movq %r9, %rdx
|
||||||
|
vxorpd %xmm9, %xmm9, %xmm9
|
||||||
|
vxorpd %xmm14, %xmm14, %xmm14
|
||||||
|
.LBB1_6: #
|
||||||
|
# in Loop: Header=BB1_4 Depth=1
|
||||||
|
vaddsd (%rdi,%r15,8), %xmm14, %xmm0
|
||||||
|
vmovsd %xmm0, (%rdi,%r15,8)
|
||||||
|
vaddsd (%rdi,%r10,8), %xmm9, %xmm0
|
||||||
|
vmovsd %xmm0, (%rdi,%r10,8)
|
||||||
|
vaddsd (%rdi,%r11,8), %xmm13, %xmm0
|
||||||
|
vmovsd %xmm0, (%rdi,%r11,8)
|
||||||
|
leal 3(%r9), %ecx
|
||||||
|
addl $6, %r9d
|
||||||
|
testl %ecx, %ecx
|
||||||
|
cmovnsl %ecx, %r9d
|
||||||
|
sarl $2, %r9d
|
||||||
|
movslq %r9d, %rcx
|
||||||
|
vmovq %rcx, %xmm0
|
||||||
|
vmovq %rdx, %xmm1
|
||||||
|
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
|
||||||
|
vpaddq %xmm0, %xmm10, %xmm10
|
||||||
|
incq %r12
|
||||||
|
addq (%rsp), %rax # 8-byte Folded Reload
|
||||||
|
cmpq %r13, %r12
|
||||||
|
je .LBB1_7
|
||||||
|
.LBB1_4: #
|
||||||
|
# =>This Loop Header: Depth=1
|
||||||
|
# Child Loop BB1_10 Depth 2
|
||||||
|
movq 8(%rsp), %rcx # 8-byte Reload
|
||||||
|
movslq (%rcx,%r12,4), %r9
|
||||||
|
leaq (%r12,%r12,2), %rcx
|
||||||
|
leal 1(%rcx), %r10d
|
||||||
|
leal 2(%rcx), %r11d
|
||||||
|
movl %ecx, %r15d
|
||||||
|
testq %r9, %r9
|
||||||
|
jle .LBB1_5
|
||||||
|
# %bb.9: #
|
||||||
|
# in Loop: Header=BB1_4 Depth=1
|
||||||
|
vmovsd (%rsi,%r15,8), %xmm15 # xmm15 = mem[0],zero
|
||||||
|
vmovsd (%rsi,%r10,8), %xmm4 # xmm4 = mem[0],zero
|
||||||
|
vmovsd (%rsi,%r11,8), %xmm1 # xmm1 = mem[0],zero
|
||||||
|
movl %r9d, %edx
|
||||||
|
vxorpd %xmm14, %xmm14, %xmm14
|
||||||
|
xorl %ecx, %ecx
|
||||||
|
vxorpd %xmm9, %xmm9, %xmm9
|
||||||
|
vxorpd %xmm13, %xmm13, %xmm13
|
||||||
|
jmp .LBB1_10
|
||||||
|
.p2align 4, 0x90
|
||||||
|
.LBB1_13: #
|
||||||
|
# in Loop: Header=BB1_10 Depth=2
|
||||||
|
incq %rcx
|
||||||
|
cmpq %rcx, %rdx
|
||||||
|
je .LBB1_6
|
||||||
|
.LBB1_10: #
|
||||||
|
# Parent Loop BB1_4 Depth=1
|
||||||
|
# => This Inner Loop Header: Depth=2
|
||||||
|
movslq (%rax,%rcx,4), %r8
|
||||||
|
leaq (%r8,%r8,2), %r14
|
||||||
|
vsubsd (%rsi,%r14,8), %xmm15, %xmm2
|
||||||
|
movslq %r14d, %rbp
|
||||||
|
vsubsd 8(%rsi,%rbp,8), %xmm4, %xmm5
|
||||||
|
vsubsd 16(%rsi,%rbp,8), %xmm1, %xmm0
|
||||||
|
vmulsd %xmm2, %xmm2, %xmm6
|
||||||
|
vfmadd231sd %xmm5, %xmm5, %xmm6 # xmm6 = (xmm5 * xmm5) + xmm6
|
||||||
|
vfmadd231sd %xmm0, %xmm0, %xmm6 # xmm6 = (xmm0 * xmm0) + xmm6
|
||||||
|
vucomisd %xmm12, %xmm6
|
||||||
|
jae .LBB1_13
|
||||||
|
# %bb.11: #
|
||||||
|
# in Loop: Header=BB1_10 Depth=2
|
||||||
|
vmovsd .LCPI1_1(%rip), %xmm3 # xmm3 = mem[0],zero
|
||||||
|
vdivsd %xmm6, %xmm3, %xmm6
|
||||||
|
vmulsd 32(%rsp), %xmm6, %xmm3 # 8-byte Folded Reload
|
||||||
|
vmulsd %xmm6, %xmm6, %xmm8
|
||||||
|
vmulsd %xmm3, %xmm8, %xmm3
|
||||||
|
vaddsd .LCPI1_2(%rip), %xmm3, %xmm7
|
||||||
|
vmulsd %xmm6, %xmm11, %xmm6
|
||||||
|
vmulsd %xmm3, %xmm6, %xmm3
|
||||||
|
vmulsd %xmm7, %xmm3, %xmm3
|
||||||
|
vmulsd %xmm2, %xmm3, %xmm6
|
||||||
|
vaddsd %xmm6, %xmm14, %xmm14
|
||||||
|
vmulsd %xmm5, %xmm3, %xmm2
|
||||||
|
vaddsd %xmm2, %xmm9, %xmm9
|
||||||
|
vmulsd %xmm0, %xmm3, %xmm0
|
||||||
|
vaddsd %xmm0, %xmm13, %xmm13
|
||||||
|
cmpl %r13d, %r8d
|
||||||
|
jge .LBB1_13
|
||||||
|
# %bb.12: #
|
||||||
|
# in Loop: Header=BB1_10 Depth=2
|
||||||
|
leaq 1(%rbp), %rbx
|
||||||
|
addq $2, %rbp
|
||||||
|
vmovsd (%rdi,%r14,8), %xmm3 # xmm3 = mem[0],zero
|
||||||
|
vsubsd %xmm6, %xmm3, %xmm3
|
||||||
|
vmovsd %xmm3, (%rdi,%r14,8)
|
||||||
|
vmovsd (%rdi,%rbx,8), %xmm3 # xmm3 = mem[0],zero
|
||||||
|
vsubsd %xmm2, %xmm3, %xmm2
|
||||||
|
vmovsd %xmm2, (%rdi,%rbx,8)
|
||||||
|
vmovsd (%rdi,%rbp,8), %xmm2 # xmm2 = mem[0],zero
|
||||||
|
vsubsd %xmm0, %xmm2, %xmm0
|
||||||
|
vmovsd %xmm0, (%rdi,%rbp,8)
|
||||||
|
jmp .LBB1_13
|
||||||
|
.LBB1_7: #
|
||||||
|
movq 16(%rsp), %rax # 8-byte Reload
|
||||||
|
vmovdqu %xmm10, (%rax)
|
||||||
|
.LBB1_8: #
|
||||||
|
movl $.L.str.1, %edi
|
||||||
|
callq likwid_markerStopRegion
|
||||||
|
xorl %eax, %eax
|
||||||
|
callq getTimeStamp
|
||||||
|
vsubsd 24(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload
|
||||||
|
addq $40, %rsp
|
||||||
|
.cfi_def_cfa_offset 56
|
||||||
|
popq %rbx
|
||||||
|
.cfi_def_cfa_offset 48
|
||||||
|
popq %r12
|
||||||
|
.cfi_def_cfa_offset 40
|
||||||
|
popq %r13
|
||||||
|
.cfi_def_cfa_offset 32
|
||||||
|
popq %r14
|
||||||
|
.cfi_def_cfa_offset 24
|
||||||
|
popq %r15
|
||||||
|
.cfi_def_cfa_offset 16
|
||||||
|
popq %rbp
|
||||||
|
.cfi_def_cfa_offset 8
|
||||||
|
retq
|
||||||
|
.Lfunc_end1:
|
||||||
|
.size computeForceLJHalfNeigh, .Lfunc_end1-computeForceLJHalfNeigh
|
||||||
|
.cfi_endproc
|
||||||
|
# -- End function
|
||||||
|
.globl computeForceLJFullNeigh_simd # -- Begin function computeForceLJFullNeigh_simd
|
||||||
|
.p2align 4, 0x90
|
||||||
|
.type computeForceLJFullNeigh_simd,@function
|
||||||
|
computeForceLJFullNeigh_simd: #
|
||||||
|
.LcomputeForceLJFullNeigh_simd$local:
|
||||||
|
.cfi_startproc
|
||||||
|
# %bb.0: #
|
||||||
|
pushq %rax
|
||||||
|
.cfi_def_cfa_offset 16
|
||||||
|
movl 4(%rsi), %eax
|
||||||
|
testl %eax, %eax
|
||||||
|
jle .LBB2_2
|
||||||
|
# %bb.1: #
|
||||||
|
movq 64(%rsi), %rdi
|
||||||
|
shlq $3, %rax
|
||||||
|
leaq (%rax,%rax,2), %rdx
|
||||||
|
xorl %esi, %esi
|
||||||
|
callq _intel_fast_memset
|
||||||
|
.LBB2_2: #
|
||||||
|
xorl %eax, %eax
|
||||||
|
callq getTimeStamp
|
||||||
|
movq stderr(%rip), %rcx
|
||||||
|
movl $.L.str.2, %edi
|
||||||
|
movl $65, %esi
|
||||||
|
movl $1, %edx
|
||||||
|
callq fwrite
|
||||||
|
movl $-1, %edi
|
||||||
|
callq exit
|
||||||
|
.Lfunc_end2:
|
||||||
|
.size computeForceLJFullNeigh_simd, .Lfunc_end2-computeForceLJFullNeigh_simd
|
||||||
|
.cfi_endproc
|
||||||
|
# -- End function
|
||||||
|
.type .L.str,@object #
|
||||||
|
.section .rodata.str1.1,"aMS",@progbits,1
|
||||||
|
.L.str:
|
||||||
|
.asciz "force"
|
||||||
|
.size .L.str, 6
|
||||||
|
.type .L.str.1,@object #
|
||||||
|
.L.str.1:
|
||||||
|
.asciz "forceLJ-halfneigh"
|
||||||
|
.size .L.str.1, 18
|
||||||
|
.type .L.str.2,@object #
|
||||||
|
.L.str.2:
|
||||||
|
.asciz "Error: SIMD kernel not implemented for specified instruction set!"
|
||||||
|
.size .L.str.2, 66
|
||||||
|
.ident "Intel(R) oneAPI DPC++ Compiler 2021.1-beta05 (2020.2.0.0304)"
|
||||||
|
.section ".note.GNU-stack","",@progbits
|
105
static_analysis/lammps-avx2-dp-ICX-osaca.txt
Normal file
105
static_analysis/lammps-avx2-dp-ICX-osaca.txt
Normal file
@@ -0,0 +1,105 @@
|
|||||||
|
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
|
||||||
|
Analyzed file: force_lj_icx_avx2_markers.s
|
||||||
|
Architecture: ZEN3
|
||||||
|
Timestamp: 2022-12-12 12:47:07
|
||||||
|
|
||||||
|
|
||||||
|
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||||
|
* - Instruction micro-ops not bound to a port
|
||||||
|
X - No throughput/latency information for this instruction in data file
|
||||||
|
|
||||||
|
|
||||||
|
Combined Analysis Report
|
||||||
|
------------------------
|
||||||
|
Port pressure in cycles
|
||||||
|
| 0 | 1 | 2 | 3 | DV0 | DV1 | 4 | 5 | 6 | 7 | 8 - 8DV | 9 | 10 | 11 | 12 | 13 || CP | LCD |
|
||||||
|
---------------------------------------------------------------------------------------------------------------------------------------------
|
||||||
|
172 | | | | | | | | | | | | | | | | || | | .LBB0_9: #
|
||||||
|
173 | | | | | | | | | | | | | | | | || | | # Parent Loop BB0_6 Depth=1
|
||||||
|
174 | | | | | | | | | | | | | | | | || | | # => This Inner Loop Header: Depth=2
|
||||||
|
175 | | 0.250 | 0.75 | | | | | | | | | | | 0.50 | 0.50 | || 1.0 | | vpbroadcastd .LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
|
||||||
|
176 | 0.00 | | | 1.00 | | | | | | | | | | 0.50 | 0.50 | || 3.0 | | vpmulld (%r11,%rbp,4), %xmm1, %xmm11
|
||||||
|
177 | 0.00 | 1.010 | 0.25 | 0.74 | | | | | | | | | | | | || 4.0 | | vpmovsxdq %xmm11, %ymm1
|
||||||
|
178 | | 0.000 | 1.00 | | | | | | | | | | | | | || 1.0 | | vpsllq $3, %ymm1, %ymm1
|
||||||
|
179 | 0.00 | 0.000 | 0.49 | 0.51 | | | | | | | | | | | | || 1.0 | | vpaddq %ymm1, %ymm3, %ymm1
|
||||||
|
180 | 0.00 | 0.000 | 0.51 | 0.49 | | | | | | | | | | | | || | | vmovq %xmm1, %r14
|
||||||
|
181 | | 1.000 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm1, %r9
|
||||||
|
182 | | 1.000 | | | | | | | | | | | | | | || 3.0 | | vextracti128 $1, %ymm1, %xmm1
|
||||||
|
183 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%r14), %xmm2 # xmm2 = mem[0],zero
|
||||||
|
184 | 0.00 | 0.000 | 0.49 | 0.51 | | | | | | | | | | | | || | | vpsubd .LCPI0_5, %xmm11, %xmm6
|
||||||
|
185 | 0.00 | 0.750 | 0.38 | 0.87 | | | | | | | | | | | | || | | vpmovsxdq %xmm6, %ymm6
|
||||||
|
186 | | 0.000 | 1.00 | | | | | | | | | | | | | || | | vpsllq $3, %ymm6, %ymm6
|
||||||
|
187 | 0.00 | 0.000 | 0.50 | 0.50 | | | | | | | | | | | | || | | vmovq %xmm1, %rdi
|
||||||
|
188 | 0.00 | 0.000 | 0.50 | 0.50 | | | | | | | | | | | | || | | vpaddq %ymm6, %ymm3, %ymm6
|
||||||
|
189 | 0.00 | 0.000 | 0.50 | 0.50 | | | | | | | | | | | | || | | vmovq %xmm6, %rcx
|
||||||
|
190 | | 1.000 | | | | | | | | | | | | | | || 6.0 | | vpextrq $1, %xmm1, %rbx
|
||||||
|
191 | | 1.000 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm6, %rax
|
||||||
|
192 | | 1.000 | | | | | | | | | | | | | | || | | vextracti128 $1, %ymm6, %xmm1
|
||||||
|
193 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rdi), %xmm6 # xmm6 = mem[0],zero
|
||||||
|
194 | 0.00 | 0.000 | 0.00 | 1.00 | | | | | | | | | | | | || | | vmovq %xmm1, %rdi
|
||||||
|
195 | | 1.000 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm1, %rsi
|
||||||
|
196 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
|
||||||
|
197 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rcx), %xmm7 # xmm7 = mem[0],zero
|
||||||
|
198 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vpbroadcastd .LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
|
||||||
|
199 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%r9), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0]
|
||||||
|
200 | 0.00 | 0.000 | 0.62 | 0.38 | | | | | | | | | | | | || | | vpaddd %xmm12, %xmm11, %xmm4
|
||||||
|
201 | 0.00 | 0.750 | 0.00 | 1.25 | | | | | | | | | | | | || | | vpmovsxdq %xmm4, %ymm4
|
||||||
|
202 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rax), %xmm7, %xmm7 # xmm7 = xmm7[0],mem[0]
|
||||||
|
203 | | 0.000 | 1.00 | | | | | | | | | | | | | || | | vpsllq $3, %ymm4, %ymm4
|
||||||
|
204 | 0.00 | 0.000 | 0.00 | 1.00 | | | | | | | | | | | | || | | vpaddq %ymm4, %ymm3, %ymm4
|
||||||
|
205 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || 5.0 | | vmovhpd (%rbx), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||||
|
206 | | 1.000 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm4, %rax
|
||||||
|
207 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rsi), %xmm1, %xmm1 # xmm1 = xmm1[0],mem[0]
|
||||||
|
208 | 0.00 | 0.000 | 0.51 | 0.49 | | | | | | | | | | | | || | | vmovq %xmm4, %rcx
|
||||||
|
209 | | 1.000 | | | | | | | | | | | | | | || | | vextracti128 $1, %ymm4, %xmm4
|
||||||
|
210 | 0.00 | -0.01 | 0.00 | 1.00 | | | | | | | | | | | | || | | vmovq %xmm4, %rsi
|
||||||
|
211 | | 1.000 | | | | | | | | | | | | | | || 1.0 | | vinsertf128 $1, %xmm6, %ymm2, %ymm2
|
||||||
|
212 | | 1.000 | | | | | | | | | | | | | | || | | vpextrq $1, %xmm4, %rdi
|
||||||
|
213 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rsi), %xmm4 # xmm4 = mem[0],zero
|
||||||
|
214 | | | 0.00 | 1.00 | | | | | | | | | | | | || 3.0 | | vsubpd %ymm2, %ymm14, %ymm2
|
||||||
|
215 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rdi), %xmm4, %xmm4 # xmm4 = xmm4[0],mem[0]
|
||||||
|
216 | | | | | | | | | | | | | | 0.50 | 0.50 | || | | vmovsd (%rcx), %xmm6 # xmm6 = mem[0],zero
|
||||||
|
217 | | 1.000 | | | | | | | | | | | | | | || | | vinsertf128 $1, %xmm1, %ymm7, %ymm1
|
||||||
|
218 | | 0.000 | 1.00 | | | | | | | | | | | 0.50 | 0.50 | || | | vmovhpd (%rax), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||||
|
219 | | 1.000 | | | | | | | | | | | | | | || | | vinsertf128 $1, %xmm4, %ymm6, %ymm4
|
||||||
|
220 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vsubpd %ymm1, %ymm5, %ymm1
|
||||||
|
221 | | | 0.00 | 1.00 | | | | | | | | | | | | || | | vsubpd %ymm4, %ymm10, %ymm4
|
||||||
|
222 | 1.00 | 0.000 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm2, %ymm2, %ymm6
|
||||||
|
223 | 1.00 | 0.000 | | | | | | | | | | | | | | || 4.0 | | vfmadd231pd %ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
|
||||||
|
224 | 1.00 | 0.000 | | | | | | | | | | | | | | || 4.0 | | vfmadd231pd %ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
|
||||||
|
225 | 1.00 | | | | | | | | | | | | | 0.50 | 0.50 | || | | vbroadcastsd .LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
|
||||||
|
226 | | | | | 4.50 | 4.50 | | | | | | | | | | || 13.0 | | vdivpd %ymm6, %ymm7, %ymm7
|
||||||
|
227 | 1.00 | 0.000 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm7, %ymm11
|
||||||
|
228 | 1.00 | 0.000 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm9, %ymm11, %ymm11
|
||||||
|
229 | 1.00 | | | | | | | | | | | | | 0.50 | 0.50 | || | | vbroadcastsd .LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
|
||||||
|
230 | 1.00 | 0.000 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm11, %ymm11
|
||||||
|
231 | | | 0.00 | 1.00 | | | | | | | | | | | | || 3.0 | | vaddpd %ymm12, %ymm11, %ymm12
|
||||||
|
232 | 1.00 | 0.000 | | | | | | | | | | | | 0.50 | 0.50 | || | | vmulpd 128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
|
||||||
|
233 | 1.00 | 0.000 | | | | | | | | | | | | | | || | | vmulpd %ymm7, %ymm11, %ymm7
|
||||||
|
234 | 1.00 | 0.000 | | | | | | | | | | | | | | || 3.0 | | vmulpd %ymm7, %ymm12, %ymm7
|
||||||
|
235 | | | 0.12 | 0.88 | | | | | | | | | | | | || | | vcmpltpd %ymm8, %ymm6, %ymm6
|
||||||
|
236 | 1.00 | 0.000 | | | | | | | | | | | | | | || 4.0 | | vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
|
||||||
|
237 | 1.00 | 0.000 | | | | | | | | | | | | | | || 1.0 | | vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
|
||||||
|
238 | 1.00 | 0.000 | | | | | | | | | | | | | | || | | vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
|
||||||
|
239 | 1.00 | 0.000 | | | | | | | | | | | | | | || | 4.0 | vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
|
||||||
|
240 | 0.62 | 0.380 | | | | | | | | | | | | | | || | | vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
|
||||||
|
241 | 0.50 | 0.500 | | | | | | | | | | | | | | || | 1.0 | vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
|
||||||
|
242 | | | | | | | | | 0.25 | 0.25 | 0.25 | 0.25 | | | | || | | addq $4, %rbp
|
||||||
|
243 | | | | | | | | | 0.25 | 0.25 | 0.25 | 0.25 | | | | || | | cmpq %rdx, %rbp
|
||||||
|
244 | | | | | | | | | 0.00 | | | | 1.00 | | | || | | jb .LBB0_9
|
||||||
|
|
||||||
|
16.1 15.63 15.6 15.6 4.50 4.50 0.50 0.50 0.50 0.50 9.00 9.00 72 5.0
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Loop-Carried Dependencies Analysis Report
|
||||||
|
-----------------------------------------
|
||||||
|
239 | 5.0 | vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13| [239, 241]
|
||||||
|
238 | 5.0 | vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15| [238, 240]
|
||||||
|
236 | 5.0 | vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0| [236, 237]
|
||||||
|
242 | 1.0 | addq $4, %rbp | [242]
|
||||||
|
241 | 1.0 | vblendvpd %ymm6, %ymm4, %ymm13, %ymm13| [241]
|
||||||
|
240 | 1.0 | vblendvpd %ymm6, %ymm1, %ymm15, %ymm15| [240]
|
||||||
|
237 | 1.0 | vblendvpd %ymm6, %ymm2, %ymm0, %ymm0| [237]
|
||||||
|
|
638
static_analysis/lammps-avx2-dp-ICX.s
Normal file
638
static_analysis/lammps-avx2-dp-ICX.s
Normal file
@@ -0,0 +1,638 @@
|
|||||||
|
.text
|
||||||
|
.file "force_lj.c"
|
||||||
|
.section .rodata.cst8,"aM",@progbits,8
|
||||||
|
.p2align 3 # -- Begin function computeForceLJFullNeigh_plain_c
|
||||||
|
.LCPI0_0:
|
||||||
|
.quad 4631952216750555136 # 48
|
||||||
|
.LCPI0_3:
|
||||||
|
.quad 4607182418800017408 # 1
|
||||||
|
.LCPI0_4:
|
||||||
|
.quad -4620693217682128896 # -0.5
|
||||||
|
.section .rodata.cst4,"aM",@progbits,4
|
||||||
|
.p2align 2
|
||||||
|
.LCPI0_1:
|
||||||
|
.long 3 # 0x3
|
||||||
|
.LCPI0_2:
|
||||||
|
.long 2 # 0x2
|
||||||
|
.section .rodata.cst16,"aM",@progbits,16
|
||||||
|
.p2align 4
|
||||||
|
.LCPI0_5:
|
||||||
|
.zero 16,255
|
||||||
|
.text
|
||||||
|
.globl computeForceLJFullNeigh_plain_c
|
||||||
|
.p2align 4, 0x90
|
||||||
|
.type computeForceLJFullNeigh_plain_c,@function
|
||||||
|
computeForceLJFullNeigh_plain_c: #
|
||||||
|
.LcomputeForceLJFullNeigh_plain_c$local:
|
||||||
|
.cfi_startproc
|
||||||
|
# %bb.0: #
|
||||||
|
pushq %rbp
|
||||||
|
.cfi_def_cfa_offset 16
|
||||||
|
pushq %r15
|
||||||
|
.cfi_def_cfa_offset 24
|
||||||
|
pushq %r14
|
||||||
|
.cfi_def_cfa_offset 32
|
||||||
|
pushq %r13
|
||||||
|
.cfi_def_cfa_offset 40
|
||||||
|
pushq %r12
|
||||||
|
.cfi_def_cfa_offset 48
|
||||||
|
pushq %rbx
|
||||||
|
.cfi_def_cfa_offset 56
|
||||||
|
subq $264, %rsp # imm = 0x108
|
||||||
|
.cfi_def_cfa_offset 320
|
||||||
|
.cfi_offset %rbx, -56
|
||||||
|
.cfi_offset %r12, -48
|
||||||
|
.cfi_offset %r13, -40
|
||||||
|
.cfi_offset %r14, -32
|
||||||
|
.cfi_offset %r15, -24
|
||||||
|
.cfi_offset %rbp, -16
|
||||||
|
movq %rcx, %rbx
|
||||||
|
movq %rdx, %r15
|
||||||
|
movq %rsi, %r12
|
||||||
|
movl 4(%rsi), %r14d
|
||||||
|
vmovsd 144(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||||
|
vmovsd %xmm0, (%rsp) # 8-byte Spill
|
||||||
|
vmovsd 40(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||||
|
vmovsd %xmm0, 128(%rsp) # 8-byte Spill
|
||||||
|
vmovq 56(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||||
|
vmovdqa %xmm0, 80(%rsp) # 16-byte Spill
|
||||||
|
testl %r14d, %r14d
|
||||||
|
jle .LBB0_2
|
||||||
|
# %bb.1: #
|
||||||
|
movq 64(%r12), %rdi
|
||||||
|
leaq (,%r14,8), %rax
|
||||||
|
leaq (%rax,%rax,2), %rdx
|
||||||
|
xorl %esi, %esi
|
||||||
|
callq _intel_fast_memset
|
||||||
|
.LBB0_2: #
|
||||||
|
xorl %eax, %eax
|
||||||
|
callq getTimeStamp
|
||||||
|
vmovq %xmm0, 32(%rsp) # 8-byte Folded Spill
|
||||||
|
movl $.L.str, %edi
|
||||||
|
callq likwid_markerStartRegion
|
||||||
|
testl %r14d, %r14d
|
||||||
|
jle .LBB0_19
|
||||||
|
# %bb.3: #
|
||||||
|
vmovsd (%rsp), %xmm0 # 8-byte Reload
|
||||||
|
# xmm0 = mem[0],zero
|
||||||
|
vmulsd %xmm0, %xmm0, %xmm13
|
||||||
|
movq 16(%r15), %r11
|
||||||
|
movq 24(%r15), %rsi
|
||||||
|
movslq 8(%r15), %rdi
|
||||||
|
movq 16(%r12), %r15
|
||||||
|
movq 64(%r12), %r8
|
||||||
|
vmovsd 128(%rsp), %xmm0 # 8-byte Reload
|
||||||
|
# xmm0 = mem[0],zero
|
||||||
|
vmulsd .LCPI0_0(%rip), %xmm0, %xmm15
|
||||||
|
movq %rbx, 24(%rsp) # 8-byte Spill
|
||||||
|
vmovdqu (%rbx), %xmm14
|
||||||
|
decq %r14
|
||||||
|
vmovq %r15, %xmm0
|
||||||
|
vpbroadcastq %xmm0, %ymm3
|
||||||
|
vbroadcastsd %xmm13, %ymm2
|
||||||
|
vmovapd 80(%rsp), %xmm12 # 16-byte Reload
|
||||||
|
vbroadcastsd %xmm12, %ymm8
|
||||||
|
vbroadcastsd %xmm15, %ymm9
|
||||||
|
shlq $2, %rdi
|
||||||
|
xorl %r10d, %r10d
|
||||||
|
movq %r14, 56(%rsp) # 8-byte Spill
|
||||||
|
vmovapd %xmm13, 192(%rsp) # 16-byte Spill
|
||||||
|
movq %rsi, 48(%rsp) # 8-byte Spill
|
||||||
|
movq %rdi, 40(%rsp) # 8-byte Spill
|
||||||
|
vmovapd %xmm15, 176(%rsp) # 16-byte Spill
|
||||||
|
vmovupd %ymm2, 224(%rsp) # 32-byte Spill
|
||||||
|
vmovupd %ymm9, 128(%rsp) # 32-byte Spill
|
||||||
|
jmp .LBB0_6
|
||||||
|
.p2align 4, 0x90
|
||||||
|
.LBB0_17: #
|
||||||
|
# in Loop: Header=BB0_6 Depth=1
|
||||||
|
movq %r13, %rdx
|
||||||
|
.LBB0_5: #
|
||||||
|
# in Loop: Header=BB0_6 Depth=1
|
||||||
|
vaddsd (%r8,%r12,8), %xmm10, %xmm0
|
||||||
|
vmovsd %xmm0, (%r8,%r12,8)
|
||||||
|
vaddsd (%r8,%rbx,8), %xmm11, %xmm0
|
||||||
|
vmovsd %xmm0, (%r8,%rbx,8)
|
||||||
|
vaddsd (%r8,%rbp,8), %xmm5, %xmm0
|
||||||
|
vmovsd %xmm0, (%r8,%rbp,8)
|
||||||
|
leal 3(%r13), %eax
|
||||||
|
addl $6, %r13d
|
||||||
|
testl %eax, %eax
|
||||||
|
cmovnsl %eax, %r13d
|
||||||
|
sarl $2, %r13d
|
||||||
|
movslq %r13d, %rax
|
||||||
|
vmovq %rax, %xmm0
|
||||||
|
vmovq %rdx, %xmm1
|
||||||
|
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
|
||||||
|
vpaddq %xmm0, %xmm14, %xmm14
|
||||||
|
addq %rdi, %r11
|
||||||
|
cmpq %r14, %r10
|
||||||
|
leaq 1(%r10), %r10
|
||||||
|
je .LBB0_18
|
||||||
|
.LBB0_6: #
|
||||||
|
# =>This Loop Header: Depth=1
|
||||||
|
# Child Loop BB0_9 Depth 2
|
||||||
|
# Child Loop BB0_13 Depth 2
|
||||||
|
movl (%rsi,%r10,4), %r13d
|
||||||
|
leal (%r10,%r10,2), %r12d
|
||||||
|
leal (%r10,%r10,2), %ebx
|
||||||
|
incl %ebx
|
||||||
|
leal (%r10,%r10,2), %ebp
|
||||||
|
addl $2, %ebp
|
||||||
|
testl %r13d, %r13d
|
||||||
|
jle .LBB0_4
|
||||||
|
# %bb.7: #
|
||||||
|
# in Loop: Header=BB0_6 Depth=1
|
||||||
|
vmovsd (%r15,%r12,8), %xmm0 # xmm0 = mem[0],zero
|
||||||
|
vmovsd (%r15,%rbx,8), %xmm1 # xmm1 = mem[0],zero
|
||||||
|
vmovsd (%r15,%rbp,8), %xmm2 # xmm2 = mem[0],zero
|
||||||
|
movq %r13, %rdx
|
||||||
|
movl $4294967292, %eax # imm = 0xFFFFFFFC
|
||||||
|
andq %rax, %rdx
|
||||||
|
vmovapd %xmm0, 112(%rsp) # 16-byte Spill
|
||||||
|
vmovapd %xmm1, 96(%rsp) # 16-byte Spill
|
||||||
|
vmovapd %xmm2, (%rsp) # 16-byte Spill
|
||||||
|
je .LBB0_16
|
||||||
|
# %bb.8: #
|
||||||
|
# in Loop: Header=BB0_6 Depth=1
|
||||||
|
movq %rbp, 64(%rsp) # 8-byte Spill
|
||||||
|
movq %rbx, 72(%rsp) # 8-byte Spill
|
||||||
|
vmovdqa %xmm14, 208(%rsp) # 16-byte Spill
|
||||||
|
vbroadcastsd %xmm0, %ymm14
|
||||||
|
vbroadcastsd %xmm1, %ymm5
|
||||||
|
vbroadcastsd %xmm2, %ymm10
|
||||||
|
vxorpd %xmm0, %xmm0, %xmm0
|
||||||
|
vxorpd %xmm15, %xmm15, %xmm15
|
||||||
|
vxorpd %xmm13, %xmm13, %xmm13
|
||||||
|
xorl %ebp, %ebp
|
||||||
|
vmovapd %ymm8, %ymm9
|
||||||
|
vmovupd 224(%rsp), %ymm8 # 32-byte Reload
|
||||||
|
.p2align 4, 0x90
|
||||||
|
# OSACA-BEGIN
|
||||||
|
# LLVM-MCA-BEGIN
|
||||||
|
.LBB0_9: #
|
||||||
|
# Parent Loop BB0_6 Depth=1
|
||||||
|
# => This Inner Loop Header: Depth=2
|
||||||
|
vpbroadcastd .LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
|
||||||
|
vpmulld (%r11,%rbp,4), %xmm1, %xmm11
|
||||||
|
vpmovsxdq %xmm11, %ymm1
|
||||||
|
vpsllq $3, %ymm1, %ymm1
|
||||||
|
vpaddq %ymm1, %ymm3, %ymm1
|
||||||
|
vmovq %xmm1, %r14
|
||||||
|
vpextrq $1, %xmm1, %r9
|
||||||
|
vextracti128 $1, %ymm1, %xmm1
|
||||||
|
vmovsd (%r14), %xmm2 # xmm2 = mem[0],zero
|
||||||
|
vpsubd .LCPI0_5, %xmm11, %xmm6
|
||||||
|
vpmovsxdq %xmm6, %ymm6
|
||||||
|
vpsllq $3, %ymm6, %ymm6
|
||||||
|
vmovq %xmm1, %rdi
|
||||||
|
vpaddq %ymm6, %ymm3, %ymm6
|
||||||
|
vmovq %xmm6, %rcx
|
||||||
|
vpextrq $1, %xmm1, %rbx
|
||||||
|
vpextrq $1, %xmm6, %rax
|
||||||
|
vextracti128 $1, %ymm6, %xmm1
|
||||||
|
vmovsd (%rdi), %xmm6 # xmm6 = mem[0],zero
|
||||||
|
vmovq %xmm1, %rdi
|
||||||
|
vpextrq $1, %xmm1, %rsi
|
||||||
|
vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
|
||||||
|
vmovsd (%rcx), %xmm7 # xmm7 = mem[0],zero
|
||||||
|
vpbroadcastd .LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
|
||||||
|
vmovhpd (%r9), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0]
|
||||||
|
vpaddd %xmm12, %xmm11, %xmm4
|
||||||
|
vpmovsxdq %xmm4, %ymm4
|
||||||
|
vmovhpd (%rax), %xmm7, %xmm7 # xmm7 = xmm7[0],mem[0]
|
||||||
|
vpsllq $3, %ymm4, %ymm4
|
||||||
|
vpaddq %ymm4, %ymm3, %ymm4
|
||||||
|
vmovhpd (%rbx), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||||
|
vpextrq $1, %xmm4, %rax
|
||||||
|
vmovhpd (%rsi), %xmm1, %xmm1 # xmm1 = xmm1[0],mem[0]
|
||||||
|
vmovq %xmm4, %rcx
|
||||||
|
vextracti128 $1, %ymm4, %xmm4
|
||||||
|
vmovq %xmm4, %rsi
|
||||||
|
vinsertf128 $1, %xmm6, %ymm2, %ymm2
|
||||||
|
vpextrq $1, %xmm4, %rdi
|
||||||
|
vmovsd (%rsi), %xmm4 # xmm4 = mem[0],zero
|
||||||
|
vsubpd %ymm2, %ymm14, %ymm2
|
||||||
|
vmovhpd (%rdi), %xmm4, %xmm4 # xmm4 = xmm4[0],mem[0]
|
||||||
|
vmovsd (%rcx), %xmm6 # xmm6 = mem[0],zero
|
||||||
|
vinsertf128 $1, %xmm1, %ymm7, %ymm1
|
||||||
|
vmovhpd (%rax), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||||
|
vinsertf128 $1, %xmm4, %ymm6, %ymm4
|
||||||
|
vsubpd %ymm1, %ymm5, %ymm1
|
||||||
|
vsubpd %ymm4, %ymm10, %ymm4
|
||||||
|
vmulpd %ymm2, %ymm2, %ymm6
|
||||||
|
vfmadd231pd %ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
|
||||||
|
vfmadd231pd %ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
|
||||||
|
vbroadcastsd .LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
|
||||||
|
vdivpd %ymm6, %ymm7, %ymm7
|
||||||
|
vmulpd %ymm7, %ymm7, %ymm11
|
||||||
|
vmulpd %ymm9, %ymm11, %ymm11
|
||||||
|
vbroadcastsd .LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
|
||||||
|
vmulpd %ymm7, %ymm11, %ymm11
|
||||||
|
vaddpd %ymm12, %ymm11, %ymm12
|
||||||
|
vmulpd 128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
|
||||||
|
vmulpd %ymm7, %ymm11, %ymm7
|
||||||
|
vmulpd %ymm7, %ymm12, %ymm7
|
||||||
|
vcmpltpd %ymm8, %ymm6, %ymm6
|
||||||
|
vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
|
||||||
|
vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
|
||||||
|
vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
|
||||||
|
vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
|
||||||
|
vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
|
||||||
|
vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
|
||||||
|
addq $4, %rbp
|
||||||
|
cmpq %rdx, %rbp
|
||||||
|
jb .LBB0_9
|
||||||
|
# LLVM-MCA-END
|
||||||
|
# OSACA-END
|
||||||
|
# %bb.10: #
|
||||||
|
# in Loop: Header=BB0_6 Depth=1
|
||||||
|
vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
|
||||||
|
vaddsd %xmm1, %xmm0, %xmm1
|
||||||
|
vextractf128 $1, %ymm0, %xmm0
|
||||||
|
vaddsd %xmm0, %xmm1, %xmm1
|
||||||
|
vpermilpd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0]
|
||||||
|
vaddsd %xmm0, %xmm1, %xmm10
|
||||||
|
vpermilpd $1, %xmm15, %xmm1 # xmm1 = xmm15[1,0]
|
||||||
|
vaddsd %xmm1, %xmm15, %xmm1
|
||||||
|
vextractf128 $1, %ymm15, %xmm2
|
||||||
|
vaddsd %xmm2, %xmm1, %xmm1
|
||||||
|
vpermilpd $1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
|
||||||
|
vaddsd %xmm2, %xmm1, %xmm11
|
||||||
|
vpermilpd $1, %xmm13, %xmm1 # xmm1 = xmm13[1,0]
|
||||||
|
vaddsd %xmm1, %xmm13, %xmm1
|
||||||
|
vextractf128 $1, %ymm13, %xmm2
|
||||||
|
vaddsd %xmm2, %xmm1, %xmm1
|
||||||
|
vpermilpd $1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
|
||||||
|
vaddsd %xmm2, %xmm1, %xmm5
|
||||||
|
movq 56(%rsp), %r14 # 8-byte Reload
|
||||||
|
vmovapd 80(%rsp), %xmm12 # 16-byte Reload
|
||||||
|
vmovapd 192(%rsp), %xmm13 # 16-byte Reload
|
||||||
|
movq 48(%rsp), %rsi # 8-byte Reload
|
||||||
|
movq 40(%rsp), %rdi # 8-byte Reload
|
||||||
|
vmovdqa 208(%rsp), %xmm14 # 16-byte Reload
|
||||||
|
vmovapd 176(%rsp), %xmm15 # 16-byte Reload
|
||||||
|
vmovapd %ymm9, %ymm8
|
||||||
|
movq 72(%rsp), %rbx # 8-byte Reload
|
||||||
|
movq 64(%rsp), %rbp # 8-byte Reload
|
||||||
|
vmovapd 112(%rsp), %xmm0 # 16-byte Reload
|
||||||
|
cmpq %r13, %rdx
|
||||||
|
jae .LBB0_17
|
||||||
|
jmp .LBB0_11
|
||||||
|
.p2align 4, 0x90
|
||||||
|
.LBB0_4: #
|
||||||
|
# in Loop: Header=BB0_6 Depth=1
|
||||||
|
movslq %r13d, %rdx
|
||||||
|
vxorpd %xmm5, %xmm5, %xmm5
|
||||||
|
vxorpd %xmm11, %xmm11, %xmm11
|
||||||
|
vxorpd %xmm10, %xmm10, %xmm10
|
||||||
|
jmp .LBB0_5
|
||||||
|
.p2align 4, 0x90
|
||||||
|
.LBB0_16: #
|
||||||
|
# in Loop: Header=BB0_6 Depth=1
|
||||||
|
vxorpd %xmm10, %xmm10, %xmm10
|
||||||
|
vxorpd %xmm11, %xmm11, %xmm11
|
||||||
|
vxorpd %xmm5, %xmm5, %xmm5
|
||||||
|
cmpq %r13, %rdx
|
||||||
|
jae .LBB0_17
|
||||||
|
.LBB0_11: #
|
||||||
|
# in Loop: Header=BB0_6 Depth=1
|
||||||
|
vmovapd 96(%rsp), %xmm4 # 16-byte Reload
|
||||||
|
jmp .LBB0_13
|
||||||
|
.p2align 4, 0x90
|
||||||
|
.LBB0_12: #
|
||||||
|
# in Loop: Header=BB0_13 Depth=2
|
||||||
|
incq %rdx
|
||||||
|
cmpq %rdx, %r13
|
||||||
|
je .LBB0_17
|
||||||
|
.LBB0_13: #
|
||||||
|
# Parent Loop BB0_6 Depth=1
|
||||||
|
# => This Inner Loop Header: Depth=2
|
||||||
|
movl (%r11,%rdx,4), %eax
|
||||||
|
leal (%rax,%rax,2), %ecx
|
||||||
|
movslq %ecx, %rcx
|
||||||
|
vsubsd (%r15,%rcx,8), %xmm0, %xmm6
|
||||||
|
leal (%rax,%rax,2), %ecx
|
||||||
|
incl %ecx
|
||||||
|
movslq %ecx, %rcx
|
||||||
|
vsubsd (%r15,%rcx,8), %xmm4, %xmm2
|
||||||
|
leal 2(%rax,%rax,2), %eax
|
||||||
|
cltq
|
||||||
|
vmovapd (%rsp), %xmm1 # 16-byte Reload
|
||||||
|
vsubsd (%r15,%rax,8), %xmm1, %xmm1
|
||||||
|
vmulsd %xmm6, %xmm6, %xmm7
|
||||||
|
vfmadd231sd %xmm2, %xmm2, %xmm7 # xmm7 = (xmm2 * xmm2) + xmm7
|
||||||
|
vfmadd231sd %xmm1, %xmm1, %xmm7 # xmm7 = (xmm1 * xmm1) + xmm7
|
||||||
|
vucomisd %xmm13, %xmm7
|
||||||
|
jae .LBB0_12
|
||||||
|
# %bb.14: #
|
||||||
|
# in Loop: Header=BB0_13 Depth=2
|
||||||
|
vmovsd .LCPI0_3(%rip), %xmm0 # xmm0 = mem[0],zero
|
||||||
|
vdivsd %xmm7, %xmm0, %xmm7
|
||||||
|
vmulsd %xmm7, %xmm7, %xmm0
|
||||||
|
vmulsd %xmm0, %xmm12, %xmm0
|
||||||
|
vmulsd %xmm7, %xmm0, %xmm0
|
||||||
|
vaddsd .LCPI0_4(%rip), %xmm0, %xmm4
|
||||||
|
vmulsd %xmm7, %xmm15, %xmm7
|
||||||
|
vmulsd %xmm0, %xmm7, %xmm0
|
||||||
|
vmulsd %xmm4, %xmm0, %xmm0
|
||||||
|
vmovapd 96(%rsp), %xmm4 # 16-byte Reload
|
||||||
|
vfmadd231sd %xmm6, %xmm0, %xmm10 # xmm10 = (xmm0 * xmm6) + xmm10
|
||||||
|
vfmadd231sd %xmm2, %xmm0, %xmm11 # xmm11 = (xmm0 * xmm2) + xmm11
|
||||||
|
vfmadd231sd %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5
|
||||||
|
vmovapd 112(%rsp), %xmm0 # 16-byte Reload
|
||||||
|
jmp .LBB0_12
|
||||||
|
.LBB0_18: #
|
||||||
|
movq 24(%rsp), %rax # 8-byte Reload
|
||||||
|
vmovdqu %xmm14, (%rax)
|
||||||
|
.LBB0_19: #
|
||||||
|
movl $.L.str, %edi
|
||||||
|
vzeroupper
|
||||||
|
callq likwid_markerStopRegion
|
||||||
|
xorl %eax, %eax
|
||||||
|
callq getTimeStamp
|
||||||
|
vsubsd 32(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload
|
||||||
|
addq $264, %rsp # imm = 0x108
|
||||||
|
.cfi_def_cfa_offset 56
|
||||||
|
popq %rbx
|
||||||
|
.cfi_def_cfa_offset 48
|
||||||
|
popq %r12
|
||||||
|
.cfi_def_cfa_offset 40
|
||||||
|
popq %r13
|
||||||
|
.cfi_def_cfa_offset 32
|
||||||
|
popq %r14
|
||||||
|
.cfi_def_cfa_offset 24
|
||||||
|
popq %r15
|
||||||
|
.cfi_def_cfa_offset 16
|
||||||
|
popq %rbp
|
||||||
|
.cfi_def_cfa_offset 8
|
||||||
|
retq
|
||||||
|
.Lfunc_end0:
|
||||||
|
.size computeForceLJFullNeigh_plain_c, .Lfunc_end0-computeForceLJFullNeigh_plain_c
|
||||||
|
.cfi_endproc
|
||||||
|
# -- End function
|
||||||
|
.section .rodata.cst8,"aM",@progbits,8
|
||||||
|
.p2align 3 # -- Begin function computeForceLJHalfNeigh
|
||||||
|
.LCPI1_0:
|
||||||
|
.quad 4631952216750555136 # 48
|
||||||
|
.LCPI1_1:
|
||||||
|
.quad 4607182418800017408 # 1
|
||||||
|
.LCPI1_2:
|
||||||
|
.quad -4620693217682128896 # -0.5
|
||||||
|
.text
|
||||||
|
.globl computeForceLJHalfNeigh
|
||||||
|
.p2align 4, 0x90
|
||||||
|
.type computeForceLJHalfNeigh,@function
|
||||||
|
computeForceLJHalfNeigh: #
|
||||||
|
.LcomputeForceLJHalfNeigh$local:
|
||||||
|
.cfi_startproc
|
||||||
|
# %bb.0: #
|
||||||
|
pushq %rbp
|
||||||
|
.cfi_def_cfa_offset 16
|
||||||
|
pushq %r15
|
||||||
|
.cfi_def_cfa_offset 24
|
||||||
|
pushq %r14
|
||||||
|
.cfi_def_cfa_offset 32
|
||||||
|
pushq %r13
|
||||||
|
.cfi_def_cfa_offset 40
|
||||||
|
pushq %r12
|
||||||
|
.cfi_def_cfa_offset 48
|
||||||
|
pushq %rbx
|
||||||
|
.cfi_def_cfa_offset 56
|
||||||
|
subq $40, %rsp
|
||||||
|
.cfi_def_cfa_offset 96
|
||||||
|
.cfi_offset %rbx, -56
|
||||||
|
.cfi_offset %r12, -48
|
||||||
|
.cfi_offset %r13, -40
|
||||||
|
.cfi_offset %r14, -32
|
||||||
|
.cfi_offset %r15, -24
|
||||||
|
.cfi_offset %rbp, -16
|
||||||
|
movq %rcx, 16(%rsp) # 8-byte Spill
|
||||||
|
movq %rdx, %r15
|
||||||
|
movq %rsi, %r12
|
||||||
|
movl 4(%rsi), %r13d
|
||||||
|
vmovsd 144(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||||
|
vmovsd %xmm0, 8(%rsp) # 8-byte Spill
|
||||||
|
vmovsd 40(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||||
|
vmovsd %xmm0, (%rsp) # 8-byte Spill
|
||||||
|
vmovsd 56(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||||
|
vmovsd %xmm0, 32(%rsp) # 8-byte Spill
|
||||||
|
testl %r13d, %r13d
|
||||||
|
jle .LBB1_2
|
||||||
|
# %bb.1: #
|
||||||
|
movq 64(%r12), %rdi
|
||||||
|
leaq (,%r13,8), %rax
|
||||||
|
leaq (%rax,%rax,2), %rdx
|
||||||
|
xorl %esi, %esi
|
||||||
|
callq _intel_fast_memset
|
||||||
|
.LBB1_2: #
|
||||||
|
xorl %eax, %eax
|
||||||
|
callq getTimeStamp
|
||||||
|
vmovsd %xmm0, 24(%rsp) # 8-byte Spill
|
||||||
|
movl $.L.str.1, %edi
|
||||||
|
callq likwid_markerStartRegion
|
||||||
|
testl %r13d, %r13d
|
||||||
|
jle .LBB1_8
|
||||||
|
# %bb.3: #
|
||||||
|
vmovsd 8(%rsp), %xmm0 # 8-byte Reload
|
||||||
|
# xmm0 = mem[0],zero
|
||||||
|
vmulsd %xmm0, %xmm0, %xmm12
|
||||||
|
movq 16(%r15), %rax
|
||||||
|
movq 24(%r15), %rcx
|
||||||
|
movq %rcx, 8(%rsp) # 8-byte Spill
|
||||||
|
movslq 8(%r15), %rdx
|
||||||
|
movq 16(%r12), %rsi
|
||||||
|
movq 64(%r12), %rdi
|
||||||
|
vmovsd (%rsp), %xmm0 # 8-byte Reload
|
||||||
|
# xmm0 = mem[0],zero
|
||||||
|
vmulsd .LCPI1_0(%rip), %xmm0, %xmm11
|
||||||
|
movq 16(%rsp), %rcx # 8-byte Reload
|
||||||
|
vmovdqu (%rcx), %xmm10
|
||||||
|
shlq $2, %rdx
|
||||||
|
movq %rdx, (%rsp) # 8-byte Spill
|
||||||
|
xorl %r12d, %r12d
|
||||||
|
jmp .LBB1_4
|
||||||
|
.p2align 4, 0x90
|
||||||
|
.LBB1_5: #
|
||||||
|
# in Loop: Header=BB1_4 Depth=1
|
||||||
|
vxorpd %xmm13, %xmm13, %xmm13
|
||||||
|
movq %r9, %rdx
|
||||||
|
vxorpd %xmm9, %xmm9, %xmm9
|
||||||
|
vxorpd %xmm14, %xmm14, %xmm14
|
||||||
|
.LBB1_6: #
|
||||||
|
# in Loop: Header=BB1_4 Depth=1
|
||||||
|
vaddsd (%rdi,%r15,8), %xmm14, %xmm0
|
||||||
|
vmovsd %xmm0, (%rdi,%r15,8)
|
||||||
|
vaddsd (%rdi,%r10,8), %xmm9, %xmm0
|
||||||
|
vmovsd %xmm0, (%rdi,%r10,8)
|
||||||
|
vaddsd (%rdi,%r11,8), %xmm13, %xmm0
|
||||||
|
vmovsd %xmm0, (%rdi,%r11,8)
|
||||||
|
leal 3(%r9), %ecx
|
||||||
|
addl $6, %r9d
|
||||||
|
testl %ecx, %ecx
|
||||||
|
cmovnsl %ecx, %r9d
|
||||||
|
sarl $2, %r9d
|
||||||
|
movslq %r9d, %rcx
|
||||||
|
vmovq %rcx, %xmm0
|
||||||
|
vmovq %rdx, %xmm1
|
||||||
|
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
|
||||||
|
vpaddq %xmm0, %xmm10, %xmm10
|
||||||
|
incq %r12
|
||||||
|
addq (%rsp), %rax # 8-byte Folded Reload
|
||||||
|
cmpq %r13, %r12
|
||||||
|
je .LBB1_7
|
||||||
|
.LBB1_4: #
|
||||||
|
# =>This Loop Header: Depth=1
|
||||||
|
# Child Loop BB1_10 Depth 2
|
||||||
|
movq 8(%rsp), %rcx # 8-byte Reload
|
||||||
|
movslq (%rcx,%r12,4), %r9
|
||||||
|
leaq (%r12,%r12,2), %rcx
|
||||||
|
leal 1(%rcx), %r10d
|
||||||
|
leal 2(%rcx), %r11d
|
||||||
|
movl %ecx, %r15d
|
||||||
|
testq %r9, %r9
|
||||||
|
jle .LBB1_5
|
||||||
|
# %bb.9: #
|
||||||
|
# in Loop: Header=BB1_4 Depth=1
|
||||||
|
vmovsd (%rsi,%r15,8), %xmm15 # xmm15 = mem[0],zero
|
||||||
|
vmovsd (%rsi,%r10,8), %xmm4 # xmm4 = mem[0],zero
|
||||||
|
vmovsd (%rsi,%r11,8), %xmm1 # xmm1 = mem[0],zero
|
||||||
|
movl %r9d, %edx
|
||||||
|
vxorpd %xmm14, %xmm14, %xmm14
|
||||||
|
xorl %ecx, %ecx
|
||||||
|
vxorpd %xmm9, %xmm9, %xmm9
|
||||||
|
vxorpd %xmm13, %xmm13, %xmm13
|
||||||
|
jmp .LBB1_10
|
||||||
|
.p2align 4, 0x90
|
||||||
|
.LBB1_13: #
|
||||||
|
# in Loop: Header=BB1_10 Depth=2
|
||||||
|
incq %rcx
|
||||||
|
cmpq %rcx, %rdx
|
||||||
|
je .LBB1_6
|
||||||
|
.LBB1_10: #
|
||||||
|
# Parent Loop BB1_4 Depth=1
|
||||||
|
# => This Inner Loop Header: Depth=2
|
||||||
|
movslq (%rax,%rcx,4), %r8
|
||||||
|
leaq (%r8,%r8,2), %r14
|
||||||
|
vsubsd (%rsi,%r14,8), %xmm15, %xmm2
|
||||||
|
movslq %r14d, %rbp
|
||||||
|
vsubsd 8(%rsi,%rbp,8), %xmm4, %xmm5
|
||||||
|
vsubsd 16(%rsi,%rbp,8), %xmm1, %xmm0
|
||||||
|
vmulsd %xmm2, %xmm2, %xmm6
|
||||||
|
vfmadd231sd %xmm5, %xmm5, %xmm6 # xmm6 = (xmm5 * xmm5) + xmm6
|
||||||
|
vfmadd231sd %xmm0, %xmm0, %xmm6 # xmm6 = (xmm0 * xmm0) + xmm6
|
||||||
|
vucomisd %xmm12, %xmm6
|
||||||
|
jae .LBB1_13
|
||||||
|
# %bb.11: #
|
||||||
|
# in Loop: Header=BB1_10 Depth=2
|
||||||
|
vmovsd .LCPI1_1(%rip), %xmm3 # xmm3 = mem[0],zero
|
||||||
|
vdivsd %xmm6, %xmm3, %xmm6
|
||||||
|
vmulsd 32(%rsp), %xmm6, %xmm3 # 8-byte Folded Reload
|
||||||
|
vmulsd %xmm6, %xmm6, %xmm8
|
||||||
|
vmulsd %xmm3, %xmm8, %xmm3
|
||||||
|
vaddsd .LCPI1_2(%rip), %xmm3, %xmm7
|
||||||
|
vmulsd %xmm6, %xmm11, %xmm6
|
||||||
|
vmulsd %xmm3, %xmm6, %xmm3
|
||||||
|
vmulsd %xmm7, %xmm3, %xmm3
|
||||||
|
vmulsd %xmm2, %xmm3, %xmm6
|
||||||
|
vaddsd %xmm6, %xmm14, %xmm14
|
||||||
|
vmulsd %xmm5, %xmm3, %xmm2
|
||||||
|
vaddsd %xmm2, %xmm9, %xmm9
|
||||||
|
vmulsd %xmm0, %xmm3, %xmm0
|
||||||
|
vaddsd %xmm0, %xmm13, %xmm13
|
||||||
|
cmpl %r13d, %r8d
|
||||||
|
jge .LBB1_13
|
||||||
|
# %bb.12: #
|
||||||
|
# in Loop: Header=BB1_10 Depth=2
|
||||||
|
leaq 1(%rbp), %rbx
|
||||||
|
addq $2, %rbp
|
||||||
|
vmovsd (%rdi,%r14,8), %xmm3 # xmm3 = mem[0],zero
|
||||||
|
vsubsd %xmm6, %xmm3, %xmm3
|
||||||
|
vmovsd %xmm3, (%rdi,%r14,8)
|
||||||
|
vmovsd (%rdi,%rbx,8), %xmm3 # xmm3 = mem[0],zero
|
||||||
|
vsubsd %xmm2, %xmm3, %xmm2
|
||||||
|
vmovsd %xmm2, (%rdi,%rbx,8)
|
||||||
|
vmovsd (%rdi,%rbp,8), %xmm2 # xmm2 = mem[0],zero
|
||||||
|
vsubsd %xmm0, %xmm2, %xmm0
|
||||||
|
vmovsd %xmm0, (%rdi,%rbp,8)
|
||||||
|
jmp .LBB1_13
|
||||||
|
.LBB1_7: #
|
||||||
|
movq 16(%rsp), %rax # 8-byte Reload
|
||||||
|
vmovdqu %xmm10, (%rax)
|
||||||
|
.LBB1_8: #
|
||||||
|
movl $.L.str.1, %edi
|
||||||
|
callq likwid_markerStopRegion
|
||||||
|
xorl %eax, %eax
|
||||||
|
callq getTimeStamp
|
||||||
|
vsubsd 24(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload
|
||||||
|
addq $40, %rsp
|
||||||
|
.cfi_def_cfa_offset 56
|
||||||
|
popq %rbx
|
||||||
|
.cfi_def_cfa_offset 48
|
||||||
|
popq %r12
|
||||||
|
.cfi_def_cfa_offset 40
|
||||||
|
popq %r13
|
||||||
|
.cfi_def_cfa_offset 32
|
||||||
|
popq %r14
|
||||||
|
.cfi_def_cfa_offset 24
|
||||||
|
popq %r15
|
||||||
|
.cfi_def_cfa_offset 16
|
||||||
|
popq %rbp
|
||||||
|
.cfi_def_cfa_offset 8
|
||||||
|
retq
|
||||||
|
.Lfunc_end1:
|
||||||
|
.size computeForceLJHalfNeigh, .Lfunc_end1-computeForceLJHalfNeigh
|
||||||
|
.cfi_endproc
|
||||||
|
# -- End function
|
||||||
|
.globl computeForceLJFullNeigh_simd # -- Begin function computeForceLJFullNeigh_simd
|
||||||
|
.p2align 4, 0x90
|
||||||
|
.type computeForceLJFullNeigh_simd,@function
|
||||||
|
computeForceLJFullNeigh_simd: #
|
||||||
|
.LcomputeForceLJFullNeigh_simd$local:
|
||||||
|
.cfi_startproc
|
||||||
|
# %bb.0: #
|
||||||
|
pushq %rax
|
||||||
|
.cfi_def_cfa_offset 16
|
||||||
|
movl 4(%rsi), %eax
|
||||||
|
testl %eax, %eax
|
||||||
|
jle .LBB2_2
|
||||||
|
# %bb.1: #
|
||||||
|
movq 64(%rsi), %rdi
|
||||||
|
shlq $3, %rax
|
||||||
|
leaq (%rax,%rax,2), %rdx
|
||||||
|
xorl %esi, %esi
|
||||||
|
callq _intel_fast_memset
|
||||||
|
.LBB2_2: #
|
||||||
|
xorl %eax, %eax
|
||||||
|
callq getTimeStamp
|
||||||
|
movl $.L.str, %edi
|
||||||
|
callq likwid_markerStartRegion
|
||||||
|
movq stderr(%rip), %rcx
|
||||||
|
movl $.L.str.2, %edi
|
||||||
|
movl $65, %esi
|
||||||
|
movl $1, %edx
|
||||||
|
callq fwrite
|
||||||
|
movl $-1, %edi
|
||||||
|
callq exit
|
||||||
|
.Lfunc_end2:
|
||||||
|
.size computeForceLJFullNeigh_simd, .Lfunc_end2-computeForceLJFullNeigh_simd
|
||||||
|
.cfi_endproc
|
||||||
|
# -- End function
|
||||||
|
.type .L.str,@object #
|
||||||
|
.section .rodata.str1.1,"aMS",@progbits,1
|
||||||
|
.L.str:
|
||||||
|
.asciz "force"
|
||||||
|
.size .L.str, 6
|
||||||
|
|
||||||
|
.type .L.str.1,@object #
|
||||||
|
.L.str.1:
|
||||||
|
.asciz "forceLJ-halfneigh"
|
||||||
|
.size .L.str.1, 18
|
||||||
|
|
||||||
|
.type .L.str.2,@object #
|
||||||
|
.L.str.2:
|
||||||
|
.asciz "Error: SIMD kernel not implemented for specified instruction set!"
|
||||||
|
.size .L.str.2, 66
|
||||||
|
|
||||||
|
.ident "Intel(R) oneAPI DPC++ Compiler 2021.1-beta05 (2020.2.0.0304)"
|
||||||
|
.section ".note.GNU-stack","",@progbits
|
@@ -1,46 +1,112 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
TAG=ICX
|
[[ -z "$1" ]] && echo "Use: $0 <binary> [-c <core>] [-f <freq>] [-n <nruns>] [-l <log>] [-s]" && exit
|
||||||
OPT_SCHEME=gromacs
|
[[ ! -f "$1" ]] && echo "Binary file not found, make sure to use 'make'" && exit
|
||||||
MDBENCH_BIN=./MDBench-$TAG-$OPT_SCHEME
|
[[ ! -f "$1-stub" ]] && echo "Binary file for stubbed case not found, make sure to use 'make VARIANT=stub'" && exit
|
||||||
FREQ=2.4
|
|
||||||
NRUNS=3
|
|
||||||
FIXED_PARAMS=--freq $FREQ
|
|
||||||
|
|
||||||
if [ "$OPT_SCHEME" = "gromacs" ]; then
|
MDBENCH_BIN=$1
|
||||||
STUB1_NAME=Stub-33
|
BIN_INFO="${MDBENCH_BIN#*-}" # $OPT_SCHEME-$TAG-$ISA-$PREC
|
||||||
STUB1_PARAMS=-na 4 -nn 33
|
OPT_SCHEME="${BIN_INFO%%-*}"
|
||||||
STUB2_NAME=Stub-128
|
PREC="${BIN_INFO##*-}"
|
||||||
STUB2_PARAMS=-na 4 -nn 128
|
BIN_INFO="${BIN_INFO#*-}" # $TAG-$ISA-$PREC
|
||||||
|
BIN_INFO="${BIN_INFO%-*}" # $TAG-$ISA
|
||||||
|
TAG="${BIN_INFO%%-*}"
|
||||||
|
ISA="${BIN_INFO##*-}"
|
||||||
|
CORE="${CORE:-0}"
|
||||||
|
FREQ="${FREQ:-2.4}"
|
||||||
|
NRUNS="${NRUNS:-3}"
|
||||||
|
LOG="${LOG:-latencies_and_cfds.log}"
|
||||||
|
STUB_ONLY="${STUB_ONLY:-false}"
|
||||||
|
SKIP_SET_FREQ="${SKIP_SET_FREQ:-false}"
|
||||||
|
|
||||||
|
OPTIND=2
|
||||||
|
while getopts "c:f:n:l:s" flag; do
|
||||||
|
case "${flag}" in
|
||||||
|
c) CORE=${OPTARG};;
|
||||||
|
f) FREQ=${OPTARG};;
|
||||||
|
n) NRUNS=${OPTARG};;
|
||||||
|
l) LOG=${OPTARG};;
|
||||||
|
s) STUB_ONLY=true;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
|
||||||
|
# Other useful variables
|
||||||
|
MDBENCH_BIN=./MDBench-$OPT_SCHEME-$TAG-$ISA-$PREC
|
||||||
|
FIXED_PARAMS="--freq $FREQ"
|
||||||
|
CPU_VENDOR=$(lscpu | grep "Vendor ID" | tr -s ' ' | cut -d ' ' -f3)
|
||||||
|
|
||||||
|
if [ "$CPU_VENDOR" == "GenuineIntel" ]; then
|
||||||
|
ALL_PREFETCHERS="HW_PREFETCHER,CL_PREFETCHER,DCU_PREFETCHER,IP_PREFETCHER"
|
||||||
|
PREFETCHERS=("ALL HW_PREFETCHER CL_PREFETCHER DCU_PREFETCHER IP_PREFETCHER NONE")
|
||||||
else
|
else
|
||||||
STUB1_NAME=Stub-76
|
ALL_PREFETCHERS=""
|
||||||
STUB1_PARAMS=-nn 76
|
PREFETCHERS=("IGNORE")
|
||||||
STUB2_NAME=Stub-1024
|
fi
|
||||||
STUB2_PARAMS=-nn 1024
|
|
||||||
|
if [ "$OPT_SCHEME" == "gromacs" ]; then
|
||||||
|
STUB1_NAME=stub-33
|
||||||
|
STUB1_PARAMS="-na 4 -nn 33"
|
||||||
|
STUB2_NAME=stub-128
|
||||||
|
STUB2_PARAMS="-na 4 -nn 128"
|
||||||
|
else
|
||||||
|
STUB1_NAME=stub-76
|
||||||
|
STUB1_PARAMS="-nn 76"
|
||||||
|
STUB2_NAME=stub-1024
|
||||||
|
STUB2_PARAMS="-nn 1024"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
function run_benchmark() {
|
function run_benchmark() {
|
||||||
|
BEST=10000000
|
||||||
for i in $(seq $NRUNS); do
|
for i in $(seq $NRUNS); do
|
||||||
likwid-pin -c 0 "$* $FIXED_PARAMS" 2>&1 | grep "Cycles/SIMD iteration" | cut -d ' ' -f3
|
RES=$(likwid-pin -c $CORE "$* $FIXED_PARAMS" 2>&1 | grep "Cycles/SIMD iteration" | cut -d ' ' -f3)
|
||||||
|
if (( $(echo "$BEST > $RES" | bc -l ) )); then
|
||||||
|
BEST=$RES
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
}
|
}
|
||||||
|
|
||||||
echo "Tag: $TAG"
|
echo "Tag: $TAG" | tee -a $LOG
|
||||||
echo "Optimization scheme: $OPT_SCHEME"
|
echo "Optimization scheme: $OPT_SCHEME" | tee -a $LOG
|
||||||
echo "Binary: $MDBENCH_BIN(-stub)"
|
echo "Instruction set: $ISA" | tee -a $LOG
|
||||||
echo "Frequency: $FREQ"
|
echo "Precision: $PREC" | tee -a $LOG
|
||||||
echo "Number of runs: $NRUNS"
|
echo "Binary: $MDBENCH_BIN(-stub)" | tee -a $LOG
|
||||||
|
echo "Frequency: $FREQ" | tee -a $LOG
|
||||||
|
echo "Number of runs: $NRUNS" | tee -a $LOG
|
||||||
|
echo "Run only stubbed cases: $STUB_ONLY" | tee -a $LOG
|
||||||
|
|
||||||
|
if [ "$SKIP_SET_FREQ" == "false" ]; then
|
||||||
echo "Fixing frequencies..."
|
echo "Fixing frequencies..."
|
||||||
likwid-setFrequencies -f $FREQ -t 0
|
likwid-setFrequencies -f $FREQ -t 0
|
||||||
|
fi
|
||||||
|
|
||||||
echo "Standard"
|
for p in $PREFETCHERS; do
|
||||||
|
if [ "$p" != "IGNORE" ]; then
|
||||||
|
if [ "$p" == "ALL" ]; then
|
||||||
|
likwid-features -c $CORE -e $ALL_PREFETCHERS
|
||||||
|
elif [ "$p" == "NONE" ]; then
|
||||||
|
likwid-features -c $CORE -d $ALL_PREFETCHERS
|
||||||
|
else
|
||||||
|
likwid-features -c $CORE -d $ALL_PREFETCHERS
|
||||||
|
likwid-features -c $CORE -e $p
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Prefetcher settings: $p"
|
||||||
|
likwid-features -c $CORE -l
|
||||||
|
fi
|
||||||
|
|
||||||
|
MSG="$p: "
|
||||||
|
if [ "$STUB_ONLY" == "false" ]; then
|
||||||
run_benchmark $MDBENCH_BIN
|
run_benchmark $MDBENCH_BIN
|
||||||
echo "Melt"
|
MSG+="standard=$BEST, "
|
||||||
run_benchmark $MDBENCH_BIN -i data/copper_melting/input_lj_cu_one_atomtype_20x20x20.dmp
|
run_benchmark $MDBENCH_BIN -i data/copper_melting/input_lj_cu_one_atomtype_20x20x20.dmp
|
||||||
echo "Argon"
|
MSG+="melt=$BEST, "
|
||||||
run_benchmark $MDBENCH_BIN -p data/argon_1000/mdbench_params.conf -i data/argon_1000/tprout.gro
|
run_benchmark $MDBENCH_BIN -p data/argon_1000/mdbench_params.conf -i data/argon_1000/tprout.gro
|
||||||
echo "$STUB1_NAME"
|
MSG+="argon=$BEST, "
|
||||||
|
fi
|
||||||
|
|
||||||
run_benchmark $MDBENCH_BIN-stub $STUB1_PARAMS
|
run_benchmark $MDBENCH_BIN-stub $STUB1_PARAMS
|
||||||
echo "$STUB2_NAME"
|
MSG+="$STUB1_NAME=$BEST, "
|
||||||
run_benchmark $MDBENCH_BIN-stub $STUB2_PARAMS
|
run_benchmark $MDBENCH_BIN-stub $STUB2_PARAMS
|
||||||
|
MSG+="$STUB2_NAME=$BEST"
|
||||||
|
echo $MSG | tee -a $LOG
|
||||||
|
done
|
||||||
|
Reference in New Issue
Block a user